Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
198 changes: 198 additions & 0 deletions src/walk_dir.zig
Original file line number Diff line number Diff line change
@@ -0,0 +1,198 @@
const std = @import("std");
const fs = std.fs;
const mem = std.mem;

const testing = std.testing;

/// One step of a compiled ignore pattern.
/// `.anything` is a wildcard step (the parser emits it for `**` components and
/// in front of patterns that do not start with `/`); `.exact` carries the text
/// that one path component is compared against.
/// NOTE(review): the `bool` payload of `.anything` is never read in the visible
/// code, only the tag is inspected — confirm whether it is still needed.
const State = union(enum) { anything: bool, exact: []const u8 };
/// Growable list of `State`s; one list holds the steps of one pattern.
const StateMachine = std.ArrayList(State);
/// Iterator type for splitting a byte string on a separator sequence.
/// NOTE(review): not referenced anywhere in the visible code.
const PathIter = mem.SplitIterator(u8, .sequence);
/// Outcome of checking one rule against one entry: `Ignore`/`Exclude` when the
/// rule matched (the latter for `!` rules), `None` when it did not.
const CheckResult = enum { Ignore, Exclude, None };

/// Reports whether the path components in `paths` are matched, in full, by the
/// pattern steps in `states`.
///
/// * `.exact` consumes exactly one component, which must be byte-equal to its
///   payload.
/// * `.anything` consumes zero or more components, so `[any, b]` matches both
///   `b` and `a/a/b`, and `[a, any, b]` matches `a/b` as well as `a/x/y/b`.
///
/// The match is anchored at both ends: every component has to be consumed and
/// every `.exact` step has to be satisfied.
fn match_iter(states: []const State, paths: []const []const u8) bool {
    // No steps left: succeed only if the whole path has been consumed.
    if (states.len == 0) return paths.len == 0;

    switch (states[0]) {
        .anything => {
            // Zero components: drop the wildcard and keep the path untouched.
            if (match_iter(states[1..], paths)) return true;
            // One or more: consume a component and keep the wildcard active.
            return paths.len > 0 and match_iter(states, paths[1..]);
        },
        .exact => |expect| {
            // An exact step always needs a component to compare against.
            if (paths.len == 0) return false;
            if (!std.mem.eql(u8, expect, paths[0])) return false;
            return match_iter(states[1..], paths[1..]);
        },
    }
}

// Table-driven check of `match_iter`.
// Each case is (pattern steps, "/"-separated path, expected result).
test "match iter" {
    inline for (.{
        .{ &[_]State{.{ .anything = true }}, "aaa", true },
        .{ &[_]State{.{ .anything = true }}, "b", true },
        .{ &[_]State{.{ .anything = true }}, "", true },
        .{ &[_]State{ .{ .anything = true }, .{ .exact = "b" } }, "a/a/b", true },
        .{ &[_]State{ .{ .anything = true }, .{ .exact = "b" } }, "a/a/b/c", false },
        .{ &[_]State{ .{ .anything = true }, .{ .exact = "b" } }, "a/b/a/b", true },
        .{ &[_]State{ .{ .anything = true }, .{ .exact = "b" }, .{ .anything = true } }, "a/a/b/c", true },
    }) |case| {
        const states = case.@"0";
        const input = case.@"1";
        const expected = case.@"2";

        var paths = std.ArrayList([]const u8).init(testing.allocator);
        defer paths.deinit();

        // Split the path into components on "/".
        var path_iter = mem.splitSequence(u8, input, "/");
        while (path_iter.next()) |component| {
            // Propagate allocation failure as a test error instead of panicking.
            try paths.append(component);
        }

        // `expectEqual` takes the expected value first; keeping that order makes
        // the failure message label the two values correctly.
        try testing.expectEqual(expected, match_iter(states, paths.items));
    }
}

/// A single compiled ignore pattern plus the flags parsed from it.
const IgnoreRule = struct {
/// True when the pattern ended with `/` (set by `IgnoreParser.parse`).
is_dir: bool,
/// True when the pattern started with `!` (set by `IgnoreParser.parse`);
/// a match then yields `.Exclude` instead of `.Ignore`.
is_exclude: bool,
/// Pattern steps in match order, filled in through `pushState`.
state_machine: StateMachine,
/// Directory value handed to `init`; `check` removes it from the front of
/// candidate paths before matching.
dir: []const u8,

const Self = @This();

/// Creates a rule with both flags cleared and an empty state list backed by
/// `allocator`. `dir` is stored as given; the slice is not copied.
fn init(allocator: std.mem.Allocator, dir: []const u8) Self {
    const empty_machine = StateMachine.init(allocator);
    return Self{
        .dir = dir,
        .state_machine = empty_machine,
        .is_exclude = false,
        .is_dir = false,
    };
}

/// Releases the memory held by the rule's state list.
fn deinit(self: Self) void {
    const machine = self.state_machine;
    machine.deinit();
}

/// Appends `state` as the next step of the pattern.
/// Returns whatever error the underlying list's `append` reports.
fn pushState(self: *Self, state: State) !void {
    return self.state_machine.append(state);
}

/// Tests one directory entry against this rule.
///
/// `path` is the entry's path; it is expected to start with `self.dir`, which
/// is removed before matching. `file_entry` supplies the entry kind.
///
/// Returns `.Ignore` when the rule matches, `.Exclude` when a `!` rule
/// matches, and `.None` when the rule does not apply to this entry.
/// Fails only if the temporary component list cannot be allocated.
fn check(self: Self, path: []const u8, file_entry: fs.IterableDir.Entry) !CheckResult {
    // A directory-only rule (pattern ended in `/`) says nothing about entries
    // that are not directories, so it must neither ignore nor exclude them.
    // For example the rule `build/` must not affect a file `other_file.txt`.
    if (self.is_dir and file_entry.kind != .directory) {
        return .None;
    }

    // Remove `self.dir` as a *prefix*. `mem.trimLeft(u8, path, self.dir)` would
    // treat it as a set of bytes and keep eating into the relative part
    // (e.g. dir "/tmp" and path "/tmp/tmpfile").
    const below_dir = blk: {
        if (!mem.startsWith(u8, path, self.dir)) break :blk path;
        const rest = path[self.dir.len..];
        // Only accept the prefix when it ends on a component boundary, so that
        // dir "/tmp" does not strip the front of "/tmpfoo/a".
        const on_boundary = rest.len == 0 or rest[0] == '/' or
            self.dir.len == 0 or self.dir[self.dir.len - 1] == '/';
        break :blk if (on_boundary) rest else path;
    };
    // Drop the separator(s) left between the directory and the relative part.
    const relative = mem.trimLeft(u8, below_dir, "/");

    // The list must be initialised with an allocator before use and released
    // afterwards; the rule's own allocator is reused for this scratch buffer.
    var paths = std.ArrayList([]const u8).init(self.state_machine.allocator);
    defer paths.deinit();

    var path_iter = mem.splitSequence(u8, relative, "/");
    while (path_iter.next()) |component| {
        try paths.append(component);
    }

    // `match_iter` takes slices, so hand it `.items` rather than the list.
    if (!match_iter(self.state_machine.items, paths.items)) {
        return .None;
    }
    return if (self.is_exclude) .Exclude else .Ignore;
}

/// Writes the pattern steps to `buf` as `state: [s0, s1, ...]`, using `any`
/// for wildcard steps and the stored text for exact steps.
/// `buf` may be any value that provides a `writeAll` method; its errors are
/// passed through.
fn printState(self: Self, buf: anytype) !void {
    try buf.writeAll("state: [");
    for (self.state_machine.items, 0..) |step, index| {
        // Separator goes before every step except the first.
        if (index != 0) try buf.writeAll(", ");
        const label: []const u8 = switch (step) {
            .anything => "any",
            .exact => |text| text,
        };
        try buf.writeAll(label);
    }
    try buf.writeAll("]");
}
};

/// Turns single ignore-pattern lines into `IgnoreRule`s.
const IgnoreParser = struct {
/// Allocator handed to every rule this parser creates.
allocator: std.mem.Allocator,
/// Directory value passed on to every rule this parser creates.
dir: []const u8,

const Self = @This();
/// Builds a parser that will create rules using `allocator` and tag them with
/// `dir`. Nothing is allocated here and `dir` is not copied.
fn init(allocator: std.mem.Allocator, dir: []const u8) Self {
    return Self{ .dir = dir, .allocator = allocator };
}

/// Compiles one line of an ignore file into an `IgnoreRule`.
///
/// Returns `null` for empty lines and lines starting with `#`. Otherwise:
///   * a leading `!` sets `is_exclude`;
///   * a trailing `/` sets `is_dir`;
///   * a leading `/` anchors the pattern; without it a wildcard step is
///     prepended so the pattern can match at any depth;
///   * a `**` component becomes a wildcard step, any other component an exact
///     step.
///
/// Exact steps are slices of `input`, so `input` must outlive the rule. The
/// caller owns the returned rule and must call `deinit` on it. Fails only if
/// the state list cannot grow.
fn parse(self: Self, input: []const u8) !?IgnoreRule {
    if (std.mem.startsWith(u8, input, "#") or std.mem.eql(u8, input, "")) {
        return null;
    }

    var rule = IgnoreRule.init(self.allocator, self.dir);
    // Do not leak the partially built state list if a later append fails.
    errdefer rule.deinit();

    var start: usize = 0;
    var end: usize = input.len;
    if (std.mem.startsWith(u8, input, "!")) {
        rule.is_exclude = true;
        start = 1;
    }
    if (std.mem.endsWith(u8, input, "/")) {
        rule.is_dir = true;
        end = end - 1;
    }

    var it = std.mem.splitScalar(u8, input[start..end], '/');
    // `first()` returns the first component AND advances the iterator, so the
    // `next()` loop below starts at the second component; nothing is visited
    // twice.
    const first_item = it.first();
    if (!std.mem.eql(u8, "", first_item)) {
        // No leading `/`: the pattern is unanchored.
        try rule.pushState(State{ .anything = true });
        // A leading `**` is already covered by the wildcard just pushed; it
        // must not be stored as the literal text "**".
        if (!std.mem.eql(u8, "**", first_item)) {
            try rule.pushState(State{ .exact = first_item });
        }
    }

    while (it.next()) |item| {
        if (std.mem.eql(u8, "**", item)) {
            try rule.pushState(State{ .anything = true });
        } else {
            try rule.pushState(State{ .exact = item });
        }
    }

    return rule;
}
};

// Table-driven check of `IgnoreParser.parse`: each pattern must produce the
// expected flags and the expected state list as rendered by `printState`.
test "parser rule" {
    const parser = IgnoreParser.init(std.testing.allocator, "/tmp");

    // https://www.atlassian.com/git/tutorials/saving-changes/gitignore#git-ignore-patterns
    // https://git-scm.com/docs/gitignore
    inline for (.{
        // (input, is_dir, is_exclude, state)
        .{ "/a/b/c", false, false, "state: [a, b, c]" },
        .{ "a/b/", true, false, "state: [any, a, b]" },
        .{ "/a/b/", true, false, "state: [a, b]" },
        .{ "!/a/b/", true, true, "state: [a, b]" },
        .{ "!/a/**/b/", true, true, "state: [a, any, b]" },
    }) |case| {
        const input = case.@"0";
        const is_dir = case.@"1";
        const is_exclude = case.@"2";
        const state = case.@"3";

        // Parsing allocates, so failure is a real possibility: report it as a
        // test error rather than asserting it away with `unreachable`.
        const rule = (try parser.parse(input)) orelse return error.TestUnexpectedResult;
        defer rule.deinit();

        try testing.expectEqual(is_dir, rule.is_dir);
        try testing.expectEqual(is_exclude, rule.is_exclude);

        var collector = std.ArrayList(u8).init(std.testing.allocator);
        defer collector.deinit();
        try rule.printState(collector.writer());
        try testing.expectEqualStrings(state, collector.items);
    }
}