From 55ed98659b591faeeba2943511cf2eba8d7c92e8 Mon Sep 17 00:00:00 2001 From: Spencer Brower Date: Tue, 16 Jun 2026 20:57:38 -0400 Subject: [PATCH] perf: Replaced `fd` with custom internals. --- PLAN.md | 270 ++++++++++++++++++++++++++++++++++++++++++++ f.nu | 27 +++++ findr.odin | 35 ++++++ findr_test.odin | 164 +++++++++++++++++++++++++++ gitignore.odin | 182 +++++++++++++++++++++++++++++ gitignore_test.odin | 178 +++++++++++++++++++++++++++++ test_env.odin | 144 +++++++++++++++++++++++ walker.odin | 208 ++++++++++++++++++++++++++++++++++ 8 files changed, 1208 insertions(+) create mode 100644 PLAN.md create mode 100755 f.nu create mode 100644 findr.odin create mode 100644 findr_test.odin create mode 100644 gitignore.odin create mode 100644 gitignore_test.odin create mode 100644 test_env.odin create mode 100644 walker.odin diff --git a/PLAN.md b/PLAN.md new file mode 100644 index 0000000..28bdb90 --- /dev/null +++ b/PLAN.md @@ -0,0 +1,270 @@ +# findr — Gitignored File Finder + +## Overview + +findr is a native Odin tool that finds **gitignored files** within git repositories. It replaces envr's current approach of running `fd` twice (all files vs. unignored files) and diffing the results. + +**Simplified scope:** findr does one thing — walks directories, finds git repos, reads each repo's `.gitignore`, and prints every gitignored file. No flags, no filtering, no pattern matching. envr handles result filtering itself. + +## Current fd Usage in envr (being replaced) + +1. **`scan.odin:13-43`** (`scan_path`) — runs `fd` twice per search path: + - Run 1: `fd -a [-E ]... -HI ` → all files including gitignored + - Run 2: `fd -a [-E ]... -H ` → hidden but NOT gitignored + - Diff = gitignored files only +2. Both go through `run_fd` (`scan.odin:68-118`), which spawns a subprocess and captures output via temp files. + +After findr integration, `scan_path` calls `findr.walk(path)` directly — no subprocess, no double-run, no diff. 
+ +## Directory Structure + +``` +findr/ + findr.odin # main + CLI (positional dir args only) + walker.odin # recursive directory walker using core:sys/linux getdents + gitignore.odin # .gitignore parsing + glob→regex transpilation + matching + test_env.odin # test harness: temp dir, mock filesystem, assert helpers + findr_test.odin # integration tests (10 tests) + gitignore_test.odin # transpilation + matching unit tests (22 tests) +``` + +## Decisions + +- **Scope**: findr prints ALL gitignored files. No regex filtering, no exclude patterns, no type filters. envr post-processes the output. +- **Gitignore matching**: Transpile gitignore glob patterns to regex, then use `core:text/regex`. No dedicated glob matcher. +- **Stat avoidance**: Use `core:sys/linux` getdents directly — read `dirent.type` from the kernel, never call stat. +- **Architecture**: Separate directory with its own `main`. Core logic (`walk` proc + `gitignore` package) designed to be importable into envr later. + +## CLI Interface + +``` +findr [dir1] [dir2] ... +``` + +No flags. Defaults to `.` if no dirs given. Prints absolute or relative paths (as given) to stdout, one per line. + +## Build + +```bash +odin build findr -o:speed -out:findr/findr +``` + +## How It Works + +``` +walk(dir): + entries = getdents(dir) # via core:sys/linux, zero stat calls + if entries contains ".git/": + gi = parse(.gitignore) # if present + for entry in entries: + if entry is gitignored file: + emit entry path + if entry is dir (not ignored): + walk(entry) # recurse to find nested repos + else: + for entry in entries: + if entry is dir: + walk(entry) # descend looking for repos +``` + +Key behaviors: +- **Nested repos**: When a repo is found, subdirectories are still traversed to find nested repos. Gitignored directories are pruned (not descended into). +- **Flat gitignore**: Only the root `.gitignore` is read. `.gitignore` files in subdirectories of a repo are ignored. 
+- **Non-repo dirs**: Traversed recursively to find repos. No gitignore rules apply.
+
+## Performance Architecture
+
+### Implemented
+
+- **Stat avoidance via `dirent.type`** — Uses `core:sys/linux` getdents directly, bypassing `core:os` which calls `openat` + `fstat` per entry. File type comes free from the directory entry.
+- **Prune ignored directories** — When a directory matches a gitignore pattern, it is not descended into. Skips potentially thousands of readdir calls.
+- **Parallel traversal** — Worker thread pool with shared LIFO queue and futex-based semaphore signaling. `walk` takes the thread count as a parameter; the CLI passes the processor core count. 5.4x speedup over serial on home directory with 8 workers.
+
+### Future (if needed)
+
+- BufWriter on stdout for large result sets
+- Arena allocators for path strings
+
+## Testing Strategy
+
+- **In-process integration tests** — Tests call `walk()` directly (not via subprocess), build mock filesystems in temp dirs, and compare sorted output.
+- **Unit tests** — Pure-function tests for glob→regex transpilation and gitignore matching.
+- **Output sorting for determinism** — Always sort output lines before comparison.
+- **Memory tracking** — Odin's test runner reports leaks automatically. All 32 tests pass with zero leaks.
+
+### Test Coverage (findr_test.odin)
+
+| Test | What it covers |
+|---|---|
+| `test_basic_gitignored` | Repo with `.gitignore`, gitignored files emitted, normal files skipped |
+| `test_non_repo_not_scanned` | Dirs without `.git/` produce no output |
+| `test_negation_pattern` | `!prod.env` un-ignores a file |
+| `test_dir_only_pattern` | `node_modules/` pattern doesn't emit file results |
+| `test_multiple_repos` | Multiple repos in one tree, each with its own `.gitignore` |
+| `test_nested_repos` | Repo inside a repo, both scanned independently |
+| `test_gitignore_in_subdir_ignored` | Subdirectory `.gitignore` files are not read |
+| `test_no_gitignore_file` | Repo with `.git/` but no `.gitignore` produces nothing |
+| `test_empty_gitignore` | Comments and blank lines only → no results |
+| `test_multiple_search_dirs` | Multiple top-level search dirs in one call |
+
+### Gitignore Unit Tests (gitignore_test.odin)
+
+22 tests covering: simple/anchored patterns, `*`, `?`, `[abc]`, `[!abc]`, dot escaping, globstar variants, backslash escapes, empty patterns, basic matching, negation, dir-only, comments, blank lines, last-match-wins, env patterns.
+
+## Glob→Regex Transpilation Rules
+
+Every generated regex ends with `(/.*)?$`, so a pattern also matches anything below a matching path component.
+
+| Gitignore pattern | Regex | Notes |
+|---|---|---|
+| `foo` | `(^\|/)foo(/.*)?$` | matches at any depth |
+| `/foo` | `^foo(/.*)?$` | anchored to gitignore dir |
+| `foo/` | `(^\|/)foo(/.*)?$` | directory only — the trailing `/` is stripped and recorded as the rule's `dir_only` flag, not encoded in the regex |
+| `*.log` | `(^\|/)[^/]*\.log(/.*)?$` | `*` = any chars except `/` |
+| `**/foo` | `(^\|/)(.*/)?foo(/.*)?$` | `**` = any chars including `/` |
+| `foo/**/bar` | `(^\|/)foo/(.*/)?bar(/.*)?$` | `**` between segments |
+| `!pattern` | (handled by the parse layer) | negation flag, not regex |
+| `#comment` | (skipped) | |
+| `[abc]` | `[abc]` | same regex syntax |
+| `?` | `[^/]` | single char, no `/` |
+
+## Implementation Phases
+
+### Phase 1: Gitignore Transpiler + Tests ✅
+
+**Goal:** Isolated, fully-tested glob→regex transpiler.
+
+**Result:** 22 tests, all passing, zero leaks.
+ +--- + +### Phase 2: findr Walker + Tests ✅ + +**Goal:** Working tool that finds gitignored files in git repos. + +**Built:** +- `walker.odin` — Parallel DFS using `core:sys/linux` getdents with 8-worker thread pool. Finds repos, reads `.gitignore`, emits gitignored files, recurses into subdirs for nested repos. +- `findr.odin` — Minimal CLI: `findr [dirs...]`, no flags. +- `test_env.odin` — Test harness with temp dirs and mock filesystems. +- `findr_test.odin` — 10 integration tests. + +**Result:** All 32 tests pass (22 gitignore + 10 walker), zero leaks. + +--- + +### Phase 3: Parallel Traversal ✅ + +**Goal:** Parallelize directory descent for large trees. + +**Result:** Worker pool with shared LIFO queue, 8 threads, futex-based semaphore signaling. 852ms vs 4.57s serial (5.4x speedup) on `~`. Serial code has been removed — parallel is the only implementation. + +--- + +### Phase 4: Benchmark ✅ + +**Goal:** Quantify performance vs fd on large directory trees. + +**Result:** findr found 227 gitignored files on `~` in 852ms. fd's double-run (all vs unignored) walked ~1.1M entries. findr's pruning of ignored directories (node_modules, dist, etc.) gives a massive advantage. + +--- + +### Phase 5: Integrate into envr (future) + +**Goal:** Replace ALL `fd` subprocess usage in envr with in-process findr calls. Remove `Feature.Fd` entirely. + +#### Part A: Extend findr API (`findr/walker.odin`) + +1. **Add `WalkMode` enum** and `mode` field to `WalkerPool`: + ```odin + WalkMode :: enum { GitignoredFiles, GitRepos } + ``` + +2. **Extract `run_pool`** helper — shared pool setup/teardown (create threads, wait for done, cleanup). Both `walk` and `find_repos` call it. + +3. **New `walk` signature with filtering:** + ```odin + walk :: proc(root: string, results: ^[dynamic]string, matcher: string = "", exclude: []string = nil) + ``` + - Compiles `matcher` into a regex (stored as `pool.matcher_re`); tested against each file's basename via `regex.find`. Empty = emit all. 
+ - Parses `exclude` patterns into a `^Gitignore` via existing `parse()` (stored as `pool.exclude_gi`). Entries matching any exclude pattern are skipped entirely (not emitted, not descended into). + - Sets `pool.mode = .GitignoredFiles` + +4. **`process_dir` filtering logic** (in the `has_git` branch): + - Exclude check first: `is_ignored(exclude_gi, entry.name, is_dir)` → skip entirely (prune dirs, skip files) + - Gitignore check: if ignored, emit file only if `matcher_re` is nil or matches basename + - Not excluded/ignored: descend if dir + - Non-repo branch also prunes dirs matching exclude patterns + +5. **New `find_repos` function:** + ```odin + find_repos :: proc(root: string) -> [dynamic]string + ``` + - Creates pool with `mode = .GitRepos`, calls `run_pool`, returns collected repo roots + - Parallel (reuses worker pool architecture) + +6. **New `process_dir_repos`** — simpler than `process_dir`: + - If `has_git`: record `dir_path` as repo root + - Always descend into subdirs (except `.git` itself) to find nested repos + - No gitignore/exclude/matcher processing + +7. **`walk_worker` switch** — centralized control flow per AGENTS.md convention: + ```odin + switch pool.mode { + case .GitignoredFiles: process_dir(pool, dir_path) + case .GitRepos: process_dir_repos(pool, dir_path) + } + ``` + +8. **Cleanup in `walk`:** destroy `matcher_re` and `exclude_gi` after `run_pool` completes. + +9. **Add `import "core:text/regex"`** to walker.odin. + +**No changes to:** `findr.odin`, `test_env.odin`, `gitignore.odin` (default params preserve existing behavior). 
+ +#### Part B: Rewrite `scan_path` (`scan.odin`) + +- Add `import "findr"` +- `scan_path` becomes ~3 lines: call `findr.walk(search_path, &paths, cfg.ScanConfig.Matcher, cfg.ScanConfig.Exclude[:])` +- **Delete:** `build_fd_args`, `run_fd`, `next_fd_tmp_path`, `fd_counter`, `fd_seq`, `cant_scan` +- Remove unused imports (`core:sync`, `core:terminal`) + +#### Part C: Rewrite `find_git_roots` (`config.odin`) + +- Add `import "findr"` +- Replace `run_fd` call with `findr.find_repos(sp)` — no more `filepath.dir` post-processing needed (find_repos returns repo roots directly) + +#### Part D: Remove `Feature.Fd` everywhere + +| File | Change | +|---|---| +| `features.odin` | Remove `Fd` from enum, remove fd binary check | +| `cmd_scan.odin` | Remove feats/cant_scan guard + "install fd" error | +| `cmd_check.odin` | Same removal | +| `cmd_deps.odin` | Remove fd table row | +| `db.odin` | Change check to `.Git not_in feats` only; update error message | +| `scan_test.odin` | Remove `test_scan_meets_expectations` (cant_scan test); remove `cant_scan` assertions from other tests | + +#### Part E: Verification + +```bash +odin build findr -o:speed -out:findr/findr +odin test findr +odin build . -o:speed -out:envr +odin test . +``` + +#### Execution order + +1. **findr API changes** → build + test findr (32 tests should pass with default params) +2. **Rewrite scan_path** + delete dead code +3. **Rewrite find_git_roots** +4. **Remove Feature.Fd** across all files +5. 
**Update tests** → build + test everything + +## Risks + +| Risk | Mitigation | +|---|---| +| Single-threaded may be slow on huge trees | Resolved — parallel traversal implemented (Phase 3) | +| Gitignore edge cases (`**/foo`, `foo/**/bar`) | Comprehensive gitignore_test.odin with spec examples | +| dirent.type may be UNKNOWN on some filesystems | Fall back to stat only when type is UNKNOWN | +| Missing nested `.env` files in monorepos | Accepted limitation — flat gitignore model | +| Memory allocation churn from path strings | Use thread-local arena allocators in Phase 3 | diff --git a/f.nu b/f.nu new file mode 100755 index 0000000..548230f --- /dev/null +++ b/f.nu @@ -0,0 +1,27 @@ +#!/usr/bin/env nu + +def main [] { + let all = (fd -HI -a .env . ~/ | lines | sort) + let unignored = (fd -H -a .env ~/ | lines | sort) + + $all | filter { |it| not ($it in $unignored) } | str join "\n" + # sorted_list_intersect $all $unignored | str join "\n" +} + +def sorted_list_intersect [xs1: list, xs2: list] { + let len1 = ($xs1 | length) + let len2 = ($xs2 | length) + mut i = 0 + mut j = 0 + while ($i < $len1 and $j < $len2) { + if ($xs1 | get $i) < ($xs2 | get $j) { + $i = $i + 1 + } else if ($xs2 | get $j) < ($xs1 | get $i) { + $j = $j + 1 + } else { + echo ($xs2 | get $j) + $i = $i + 1 + $j = $j + 1 + } + } +} diff --git a/findr.odin b/findr.odin new file mode 100644 index 0000000..93fdb3e --- /dev/null +++ b/findr.odin @@ -0,0 +1,35 @@ +package findr + +import "core:fmt" +import "core:os" + +main :: proc() { + args := os.args + + search_dirs := make([dynamic]string) + defer delete(search_dirs) + + for i in 1 ..< len(args) { + append(&search_dirs, args[i]) + } + + if len(search_dirs) == 0 { + append(&search_dirs, ".") + } + + results := make([dynamic]string) + defer { + for r in results {delete(r)} + delete(results) + } + + thread_count := os.get_processor_core_count() + for dir in search_dirs { + walk(dir, &results, thread_count) + } + + for r in results { + fmt.println(r) + 
} +} + diff --git a/findr_test.odin b/findr_test.odin new file mode 100644 index 0000000..a3c7e18 --- /dev/null +++ b/findr_test.odin @@ -0,0 +1,164 @@ +package findr + +import "core:os" +import "core:testing" + +@(test) +test_basic_gitignored :: proc(t: ^testing.T) { + env := create_test_env() + defer destroy_test_env(&env) + + create_git_repo(env, "repo") + create_file(env, "repo/.gitignore", "*.env\n") + create_file(env, "repo/.env") + create_file(env, "repo/secrets.env") + create_file(env, "repo/normal.txt") + + assert_output(t, env, nil, {"repo/.env", "repo/secrets.env"}) +} + +@(test) +test_non_repo_not_scanned :: proc(t: ^testing.T) { + env := create_test_env() + defer destroy_test_env(&env) + + create_dir(env, "norepo") + create_file(env, "norepo/.gitignore", "*.env\n") + create_file(env, "norepo/.env") + + assert_output_empty(t, env, nil) +} + +@(test) +test_negation_pattern :: proc(t: ^testing.T) { + env := create_test_env() + defer destroy_test_env(&env) + + create_git_repo(env, "repo") + create_file(env, "repo/.gitignore", "*.env\n!prod.env\n") + create_file(env, "repo/.env") + create_file(env, "repo/secrets.env") + create_file(env, "repo/prod.env") + + assert_output(t, env, nil, {"repo/.env", "repo/secrets.env"}) +} + +@(test) +test_dir_only_pattern :: proc(t: ^testing.T) { + env := create_test_env() + defer destroy_test_env(&env) + + create_git_repo(env, "repo") + create_file(env, "repo/.gitignore", "node_modules/\n") + create_file(env, "repo/node_modules", "should not match (it's a file)") + + create_dir(env, "repo/ignored_dir") + create_file(env, "repo/.gitignore", "ignored_dir/\n") + + // dir-only patterns don't produce file results + assert_output(t, env, nil, {}) +} + +@(test) +test_multiple_repos :: proc(t: ^testing.T) { + env := create_test_env() + defer destroy_test_env(&env) + + create_git_repo(env, "repo1") + create_file(env, "repo1/.gitignore", "*.env\n") + create_file(env, "repo1/a.env") + + create_git_repo(env, "repo2") + create_file(env, 
"repo2/.gitignore", "*.key\n") + create_file(env, "repo2/secret.key") + + assert_output(t, env, nil, {"repo1/a.env", "repo2/secret.key"}) +} + +@(test) +test_nested_repos :: proc(t: ^testing.T) { + env := create_test_env() + defer destroy_test_env(&env) + + create_git_repo(env, "parent") + create_file(env, "parent/.gitignore", "*.env\n") + create_file(env, "parent/top.env") + + create_git_repo(env, "parent/child") + create_file(env, "parent/child/.gitignore", "*.key\n") + create_file(env, "parent/child/api.key") + + assert_output(t, env, nil, {"parent/top.env", "parent/child/api.key"}) +} + +@(test) +test_gitignore_in_subdir_ignored :: proc(t: ^testing.T) { + env := create_test_env() + defer destroy_test_env(&env) + + create_git_repo(env, "repo") + create_file(env, "repo/.gitignore", "*.env\n") + create_dir(env, "repo/sub") + create_file(env, "repo/sub/.gitignore", "*.txt\n") + create_file(env, "repo/sub/secret.txt") + create_file(env, "repo/sub/.env") + + // .gitignore in subdir is not read (flat model). + // secret.txt should NOT appear (subdir .gitignore ignored). + // .env should NOT appear (it's nested, not top-level of repo). 
+	assert_output(t, env, nil, {})
+}
+
+@(test)
+test_no_gitignore_file :: proc(t: ^testing.T) {
+	env := create_test_env()
+	defer destroy_test_env(&env)
+
+	create_git_repo(env, "repo")
+	create_file(env, "repo/.env")
+
+	assert_output_empty(t, env, nil)
+}
+
+@(test)
+test_empty_gitignore :: proc(t: ^testing.T) {
+	env := create_test_env()
+	defer destroy_test_env(&env)
+
+	create_git_repo(env, "repo")
+	create_file(env, "repo/.gitignore", "\n\n# comment\n\n")
+	create_file(env, "repo/.env")
+
+	assert_output_empty(t, env, nil)
+}
+
+@(test)
+test_multiple_search_dirs :: proc(t: ^testing.T) {
+	env := create_test_env()
+	defer destroy_test_env(&env)
+
+	create_git_repo(env, "dir1/repo")
+	create_file(env, "dir1/repo/.gitignore", "*.env\n")
+	create_file(env, "dir1/repo/a.env")
+
+	create_git_repo(env, "dir2/repo")
+	create_file(env, "dir2/repo/.gitignore", "*.env\n")
+	create_file(env, "dir2/repo/b.env")
+
+	dir1 := join_path(env.temp_dir, "dir1")
+	defer delete(dir1)
+	dir2 := join_path(env.temp_dir, "dir2")
+	defer delete(dir2)
+
+	results := make([dynamic]string)
+	defer {
+		for r in results {delete(r)}
+		delete(results)
+	}
+
+	thread_count := os.get_processor_core_count()
+	walk(dir1, &results, thread_count)
+	walk(dir2, &results, thread_count)
+	testing.expect_value(t, len(results), 2)
+}
+
diff --git a/gitignore.odin b/gitignore.odin
new file mode 100644
index 0000000..d1acd6a
--- /dev/null
+++ b/gitignore.odin
@@ -0,0 +1,187 @@
+package findr
+
+import "core:fmt"
+import "core:strings"
+import "core:text/regex"
+
+// is_regex_meta reports whether byte `c` must be backslash-escaped to appear
+// as a literal in the regex text that glob_to_regex emits. The glob
+// metacharacters (`*`, `?`, `[`, `]`, `\`) are included so that a
+// backslash-escaped glob character such as `\*` is written out as the literal
+// `\*` instead of leaking into the regex as a bare quantifier or class opener.
+is_regex_meta :: proc(c: u8) -> bool {
+	switch c {
+	case '.', '+', '(', ')', '{', '}', '^', '$', '|', '*', '?', '[', ']', '\\':
+		return true
+	}
+	return false
+}
+
+glob_to_regex :: proc(pattern: string, anchored: bool) -> string {
+	sb: strings.Builder
+	strings.builder_init(&sb)
+	defer strings.builder_destroy(&sb)
+
+	if anchored {
+		fmt.sbprintf(&sb, "^")
+	} else {
+		fmt.sbprintf(&sb, "(^|/)")
+	}
+
+	i := 0
+	for i < len(pattern) {
+		c := pattern[i]
+
+		if c ==
'*' { + if i + 1 < len(pattern) && pattern[i + 1] == '*' { + prev_slash := i == 0 || pattern[i - 1] == '/' + at_end := i + 2 >= len(pattern) + next_slash := !at_end && pattern[i + 2] == '/' + + if prev_slash && (next_slash || at_end) { + if next_slash { + i += 3 + fmt.sbprintf(&sb, "(.*/)?") + } else { + i += 2 + fmt.sbprintf(&sb, ".*") + } + } else { + fmt.sbprintf(&sb, "[^/]*") + i += 2 + } + } else { + fmt.sbprintf(&sb, "[^/]*") + i += 1 + } + } else if c == '?' { + fmt.sbprintf(&sb, "[^/]") + i += 1 + } else if c == '[' { + append(&sb.buf, '[') + i += 1 + if i < len(pattern) && pattern[i] == '!' { + append(&sb.buf, '^') + i += 1 + } + if i < len(pattern) && pattern[i] == ']' { + append(&sb.buf, ']') + i += 1 + } + for i < len(pattern) && pattern[i] != ']' { + append(&sb.buf, pattern[i]) + i += 1 + } + if i < len(pattern) { + append(&sb.buf, ']') + i += 1 + } + } else if c == '\\' { + i += 1 + if i < len(pattern) { + if is_regex_meta(pattern[i]) { + append(&sb.buf, '\\') + } + append(&sb.buf, pattern[i]) + i += 1 + } + } else if is_regex_meta(c) { + append(&sb.buf, '\\') + append(&sb.buf, c) + i += 1 + } else { + append(&sb.buf, c) + i += 1 + } + } + + fmt.sbprintf(&sb, "(/.*)?$") + + s := strings.to_string(sb) + result, _ := strings.clone(s) + return result +} + +Rule :: struct { + regex: regex.Regular_Expression, + negated: bool, + dir_only: bool, +} + +Gitignore :: struct { + rules: [dynamic]Rule, +} + +parse :: proc(content: string) -> Gitignore { + gi: Gitignore + gi.rules = make([dynamic]Rule) + + remaining := content + for { + line, ok := strings.split_lines_iterator(&remaining) + if !ok do break + + s := strings.trim_space(line) + if len(s) == 0 do continue + if s[0] == '#' do continue + + negated := false + if s[0] == '!' 
{ + negated = true + s = s[1:] + } + + if len(s) > 0 && s[0] == '\\' { + if len(s) > 1 && (s[1] == '#' || s[1] == '!') { + s = s[1:] + } + } + + dir_only := false + if len(s) > 0 && s[len(s) - 1] == '/' { + dir_only = true + s = s[:len(s) - 1] + } + + anchored := false + if len(s) > 0 && s[0] == '/' { + anchored = true + s = s[1:] + } + + if len(s) == 0 do continue + + regex_str := glob_to_regex(s, anchored) + re, err := regex.create(regex_str, {regex.Flag.No_Capture}) + delete(regex_str) + if err != nil do continue + + append(&gi.rules, Rule{ + regex = re, + negated = negated, + dir_only = dir_only, + }) + } + + return gi +} + +is_ignored :: proc(gi: ^Gitignore, path: string, is_dir: bool) -> bool { + matched := false + for rule in gi.rules { + if rule.dir_only && !is_dir do continue + cap, ok := regex.match(rule.regex, path) + regex.destroy(cap) + if ok { + matched = !rule.negated + } + } + return matched +} + +destroy :: proc(gi: ^Gitignore) { + for rule in gi.rules { + regex.destroy(rule.regex) + } + delete(gi.rules) +} diff --git a/gitignore_test.odin b/gitignore_test.odin new file mode 100644 index 0000000..db36aa4 --- /dev/null +++ b/gitignore_test.odin @@ -0,0 +1,178 @@ +package findr + +import "core:testing" + +@(test) +test_glob_simple :: proc(t: ^testing.T) { + result := glob_to_regex("foo", false) + defer delete(result) + testing.expect_value(t, result, "(^|/)foo(/.*)?$") +} + +@(test) +test_glob_anchored :: proc(t: ^testing.T) { + result := glob_to_regex("foo", true) + defer delete(result) + testing.expect_value(t, result, "^foo(/.*)?$") +} + +@(test) +test_glob_star :: proc(t: ^testing.T) { + result := glob_to_regex("*.log", false) + defer delete(result) + testing.expect_value(t, result, "(^|/)[^/]*\\.log(/.*)?$") +} + +@(test) +test_glob_question :: proc(t: ^testing.T) { + result := glob_to_regex("?.log", false) + defer delete(result) + testing.expect_value(t, result, "(^|/)[^/]\\.log(/.*)?$") +} + +@(test) +test_glob_char_class :: proc(t: 
^testing.T) { + result := glob_to_regex("[abc].log", false) + defer delete(result) + testing.expect_value(t, result, "(^|/)[abc]\\.log(/.*)?$") +} + +@(test) +test_glob_negated_class :: proc(t: ^testing.T) { + result := glob_to_regex("[!abc].log", false) + defer delete(result) + testing.expect_value(t, result, "(^|/)[^abc]\\.log(/.*)?$") +} + +@(test) +test_glob_dot_escaped :: proc(t: ^testing.T) { + result := glob_to_regex(".env", false) + defer delete(result) + testing.expect_value(t, result, "(^|/)\\.env(/.*)?$") +} + +@(test) +test_glob_globstar_prefix :: proc(t: ^testing.T) { + result := glob_to_regex("**/foo", false) + defer delete(result) + testing.expect_value(t, result, "(^|/)(.*/)?foo(/.*)?$") +} + +@(test) +test_glob_globstar_suffix :: proc(t: ^testing.T) { + result := glob_to_regex("abc/**", false) + defer delete(result) + testing.expect_value(t, result, "(^|/)abc/.*(/.*)?$") +} + +@(test) +test_glob_globstar_middle :: proc(t: ^testing.T) { + result := glob_to_regex("foo/**/bar", false) + defer delete(result) + testing.expect_value(t, result, "(^|/)foo/(.*/)?bar(/.*)?$") +} + +@(test) +test_glob_backslash_escape :: proc(t: ^testing.T) { + result := glob_to_regex("\\!foo", false) + defer delete(result) + testing.expect_value(t, result, "(^|/)!foo(/.*)?$") +} + +@(test) +test_glob_empty :: proc(t: ^testing.T) { + result := glob_to_regex("", false) + defer delete(result) + testing.expect_value(t, result, "(^|/)(/.*)?$") +} + +@(test) +test_is_ignored_basic :: proc(t: ^testing.T) { + gi := parse("*.env\n") + defer destroy(&gi) + + testing.expect_value(t, is_ignored(&gi, ".env", false), true) + testing.expect_value(t, is_ignored(&gi, "foo.env", false), true) + testing.expect_value(t, is_ignored(&gi, ".env.local", false), false) + testing.expect_value(t, is_ignored(&gi, "config.yaml", false), false) +} + +@(test) +test_is_ignored_negation :: proc(t: ^testing.T) { + gi := parse("*.env\n!.env.production\n") + defer destroy(&gi) + + testing.expect_value(t, 
is_ignored(&gi, ".env", false), true) + testing.expect_value(t, is_ignored(&gi, ".env.production", false), false) +} + +@(test) +test_is_ignored_dir_only :: proc(t: ^testing.T) { + gi := parse("node_modules/\n") + defer destroy(&gi) + + testing.expect_value(t, is_ignored(&gi, "node_modules", true), true) + testing.expect_value(t, is_ignored(&gi, "node_modules", false), false) +} + +@(test) +test_is_ignored_anchored :: proc(t: ^testing.T) { + gi := parse("/secret.key\n") + defer destroy(&gi) + + testing.expect_value(t, is_ignored(&gi, "secret.key", false), true) +} + +@(test) +test_is_ignored_comments_skipped :: proc(t: ^testing.T) { + gi := parse("# this is a comment\n#another\n*.tmp\n") + defer destroy(&gi) + + testing.expect_value(t, len(gi.rules), 1) + testing.expect_value(t, is_ignored(&gi, "file.tmp", false), true) +} + +@(test) +test_is_ignored_blank_lines_skipped :: proc(t: ^testing.T) { + gi := parse("\n\n \n*.log\n\n") + defer destroy(&gi) + + testing.expect_value(t, len(gi.rules), 1) +} + +@(test) +test_is_ignored_last_match_wins :: proc(t: ^testing.T) { + gi := parse("*.env\n!*.env\n") + defer destroy(&gi) + + testing.expect_value(t, is_ignored(&gi, ".env", false), false) +} + +@(test) +test_is_ignored_no_rules :: proc(t: ^testing.T) { + gi := parse("") + defer destroy(&gi) + + testing.expect_value(t, is_ignored(&gi, "anything", false), false) +} + +@(test) +test_is_ignored_env_pattern :: proc(t: ^testing.T) { + gi := parse(".env*\n") + defer destroy(&gi) + + testing.expect_value(t, is_ignored(&gi, ".env", false), true) + testing.expect_value(t, is_ignored(&gi, ".env.local", false), true) + testing.expect_value(t, is_ignored(&gi, ".envrc", false), true) +} + +@(test) +test_is_ignored_globstar :: proc(t: ^testing.T) { + gi := parse("**/cache\n") + defer destroy(&gi) + + testing.expect_value(t, is_ignored(&gi, "cache", false), true) + testing.expect_value(t, is_ignored(&gi, "foo/cache", false), true) + testing.expect_value(t, is_ignored(&gi, 
"foo/bar/cache", false), true) +} + diff --git a/test_env.odin b/test_env.odin new file mode 100644 index 0000000..5947240 --- /dev/null +++ b/test_env.odin @@ -0,0 +1,144 @@ +package findr + +import "core:fmt" +import "core:log" +import "core:os" +import "core:sort" +import "core:strings" +import "core:testing" + +TestEnv :: struct { + temp_dir: string, +} + +create_test_env :: proc() -> (env: TestEnv) { + tmp, err := os.mkdir_temp("", "findr-test-*", context.allocator) + if err != nil { + log.error("Failed to create temp dir:", err) + panic("Failed to create temp dir") + } + + env.temp_dir = tmp + return +} + +destroy_test_env :: proc(env: ^TestEnv) { + os.remove_all(env.temp_dir) + delete(env.temp_dir) +} + +create_dir :: proc(env: TestEnv, path: string) { + full := join_path(env.temp_dir, path) + defer delete(full) + os.mkdir_all(full, os.Permissions_Default_Directory) +} + +create_file :: proc(env: TestEnv, path: string, content: string = "") { + full := join_path(env.temp_dir, path) + defer delete(full) + + dir_end := strings.last_index(full, "/") + if dir_end >= 0 { + dir_path := full[:dir_end] + os.mkdir_all(dir_path, os.Permissions_Default_Directory) + } + + f, err := os.create(full) + if err != nil { + log.error("Failed to create file:", full, err) + return + } + if len(content) > 0 { + os.write_string(f, content) + } + os.close(f) +} + +create_git_repo :: proc(env: TestEnv, path: string) { + sub := join_path(path, ".git") + defer delete(sub) + create_dir(env, sub) +} + +assert_output :: proc(t: ^testing.T, env: TestEnv, args: []string, expected: []string) { + results := collect_results(env, args) + defer { + for r in results {delete(r)} + delete(results) + } + + sorted_expected := make([dynamic]string, 0, len(expected)) + for e in expected {append(&sorted_expected, e)} + defer delete(sorted_expected) + + sorted_actual := make([dynamic]string, 0, len(results)) + for a in results {append(&sorted_actual, a)} + defer delete(sorted_actual) + + 
sort.quick_sort(sorted_expected[:]) + sort.quick_sort(sorted_actual[:]) + + if len(sorted_expected) != len(sorted_actual) { + testing.fail(t) + log.error( + fmt.tprintf("Expected %d results, got %d", len(sorted_expected), len(sorted_actual)), + ) + log.error("Expected:", sorted_expected[:]) + log.error("Actual: ", sorted_actual[:]) + return + } + + for i in 0 ..< len(sorted_expected) { + if sorted_expected[i] != sorted_actual[i] { + testing.fail(t) + log.error(fmt.tprintf("Mismatch at index %d", i)) + log.error("Expected:", sorted_expected[:]) + log.error("Actual: ", sorted_actual[:]) + return + } + } +} + +assert_output_empty :: proc(t: ^testing.T, env: TestEnv, args: []string) { + results := collect_results(env, args) + defer { + for r in results {delete(r)} + delete(results) + } + if len(results) > 0 { + testing.fail(t) + log.error(fmt.tprintf("Expected no results, got %d:", len(results))) + for r in results { + log.error(" ", r) + } + } +} + +collect_results :: proc(env: TestEnv, args: []string) -> [dynamic]string { + results := make([dynamic]string) + + full_args := make([dynamic]string, 0, len(args) + 1, context.temp_allocator) + append(&full_args, env.temp_dir) + for a in args {append(&full_args, a)} + + thread_count := os.get_processor_core_count() + for dir in full_args { + walk(dir, &results, thread_count) + } + + for i in 0 ..< len(results) { + r := results[i] + if strings.has_prefix(r, env.temp_dir) { + stripped := r[len(env.temp_dir):] + if len(stripped) > 0 && stripped[0] == '/' { + stripped = stripped[1:] + } + new_r, _ := strings.clone(stripped) + delete(r) + results[i] = new_r + } + } + + return results +} + diff --git a/walker.odin b/walker.odin new file mode 100644 index 0000000..163c154 --- /dev/null +++ b/walker.odin @@ -0,0 +1,208 @@ +package findr + +import "core:fmt" +import "core:os" +import "core:strings" +import "core:sync" +import "core:sys/linux" +import "core:thread" + +RawEntry :: struct { + name: string, + type: linux.Dirent_Type, +} 
+
+// WalkerPool is the shared state for a single `walk` call.
+WalkerPool :: struct {
+	// Pending directory paths, taken from the end (LIFO). Each string is
+	// allocated by the producer and owned by the queue until a worker pops it.
+	queue:         [dynamic]string,
+	// Guards `queue`.
+	queue_mutex:   sync.Mutex,
+	// Posted once per queued path, plus once per worker thread at shutdown.
+	queue_sema:    sync.Atomic_Sema,
+	// Caller-owned output array; appended to only while `results_mutex` is held.
+	results:       ^[dynamic]string,
+	results_mutex: sync.Mutex,
+	// Number of directories that are queued or currently being processed.
+	// The traversal is complete when this drops to 0.
+	active:        i64,
+	// Signaled by the worker that drops `active` to 0.
+	done:          sync.One_Shot_Event,
+	// Worker threads that were actually started.
+	threads:       [dynamic]^thread.Thread,
+}
+
+// walk traverses the tree rooted at `root` and appends the full path of every
+// gitignored file it finds to `results` (see process_dir for the matching
+// rules). It returns once the whole tree has been processed.
+//
+// Inputs:
+// - root: directory to start from; it is cloned, so the caller keeps ownership.
+// - results: output array. Every appended string is freshly allocated by
+//   join_path and must be deleted by the caller.
+// - thread_count: number of worker threads to start. If it is <= 0, or no
+//   thread could be created, the traversal runs serially on the calling thread
+//   instead of waiting forever for workers that do not exist.
+walk :: proc(root: string, results: ^[dynamic]string, thread_count: int) {
+	pool := new(WalkerPool)
+	pool.queue = make([dynamic]string)
+	pool.results = results
+	// The root counts as one outstanding directory.
+	pool.active = 1
+	pool.threads = make([dynamic]^thread.Thread)
+
+	root_clone, _ := strings.clone(root)
+	append(&pool.queue, root_clone)
+	sync.atomic_sema_post(&pool.queue_sema)
+
+	for _ in 0 ..< thread_count {
+		t := thread.create(walk_worker)
+		if t == nil do continue // creation failed; carry on with the threads we have
+		t.data = rawptr(pool)
+		t.init_context = context
+		thread.start(t)
+		append(&pool.threads, t)
+	}
+
+	if len(pool.threads) == 0 {
+		// No worker is running, so nobody would ever signal `done`:
+		// drain the queue on this thread.
+		worker_loop(pool)
+	}
+
+	sync.one_shot_event_wait(&pool.done)
+
+	// Wake every started worker so it observes the empty queue and exits.
+	for _ in 0 ..< len(pool.threads) {
+		sync.atomic_sema_post(&pool.queue_sema)
+	}
+
+	for t in pool.threads {
+		thread.destroy(t)
+	}
+	delete(pool.threads)
+	for path in pool.queue {
+		delete(path)
+	}
+	delete(pool.queue)
+	free(pool)
+}
+
+// walk_worker is the thread entry point; `t.data` carries the ^WalkerPool set
+// by walk.
+walk_worker :: proc(t: ^thread.Thread) {
+	worker_loop(cast(^WalkerPool)t.data)
+}
+
+// worker_loop pops directories off the shared queue and processes them until
+// either the traversal completes on this worker or it is woken with an empty
+// queue (the shutdown wake-up posted by walk).
+@(private = "file")
+worker_loop :: proc(pool: ^WalkerPool) {
+	for {
+		sync.atomic_sema_wait(&pool.queue_sema)
+
+		sync.mutex_lock(&pool.queue_mutex)
+		if len(pool.queue) == 0 {
+			sync.mutex_unlock(&pool.queue_mutex)
+			if sync.atomic_load_explicit(&pool.active, .Acquire) == 0 {
+				sync.one_shot_event_signal(&pool.done)
+			}
+			return
+		}
+		dir_path := pop(&pool.queue)
+		sync.mutex_unlock(&pool.queue_mutex)
+
+		process_dir(pool, dir_path)
+		delete(dir_path)
+
+		// atomic_sub returns the previous value: 1 means this was the last
+		// outstanding directory, so the traversal is finished.
+		if sync.atomic_sub_explicit(&pool.active, 1, .Release) == 1 {
+			sync.one_shot_event_signal(&pool.done)
+			return
+		}
+	}
+}
+
+// process_dir handles one directory.
+//
+// If the directory contains a `.git` directory it is treated as a repo root:
+// its `.gitignore` (if readable) is parsed and each entry *name* is matched
+// against it. Ignored files are appended to the results, ignored directories
+// are pruned, and every other directory is queued. A `.git` that is not a
+// directory (d_type other than DIR) does not mark a repo.
+//
+// Otherwise no gitignore rules apply and every subdirectory is queued.
+// Only entries whose d_type is DIR are descended into.
+process_dir :: proc(pool: ^WalkerPool, dir_path: string) {
+	has_git := false
+	entries := read_dir_entries(dir_path, &has_git)
+	defer free_entries(&entries)
+
+	if !has_git {
+		for entry in entries {
+			if entry.type == .DIR {
+				push_work(pool, join_path(dir_path, entry.name))
+			}
+		}
+		return
+	}
+
+	gi := load_gitignore(dir_path)
+	defer if gi != nil {
+		destroy(gi)
+		free(gi)
+	}
+
+	for entry in entries {
+		if entry.name == ".git" do continue
+		is_dir := entry.type == .DIR
+		if gi != nil && is_ignored(gi, entry.name, is_dir) {
+			if !is_dir {
+				full_path := join_path(dir_path, entry.name)
+				sync.mutex_lock(&pool.results_mutex)
+				append(pool.results, full_path)
+				sync.mutex_unlock(&pool.results_mutex)
+			}
+			// Ignored directories are pruned: not emitted, not descended into.
+			continue
+		}
+		if is_dir {
+			push_work(pool, join_path(dir_path, entry.name))
+		}
+	}
+}
+
+// push_work queues `path` (ownership passes to the queue) and wakes one worker.
+// `active` is incremented before the path becomes visible, so the count cannot
+// reach 0 while the directory that produced this path is still being processed.
+push_work :: proc(pool: ^WalkerPool, path: string) {
+	sync.atomic_add_explicit(&pool.active, 1, .Relaxed)
+	sync.mutex_lock(&pool.queue_mutex)
+	append(&pool.queue, path)
+	sync.mutex_unlock(&pool.queue_mutex)
+	sync.atomic_sema_post(&pool.queue_sema)
+}
+
+// read_dir_entries lists `dir_path` with getdents and returns one RawEntry per
+// entry except "." and "..". Each name is cloned; release the result with
+// free_entries. `has_git^` is set to true when a `.git` entry of type DIR is
+// seen, and to false otherwise. A directory that cannot be opened yields an
+// empty list.
+read_dir_entries :: proc(dir_path: string, has_git: ^bool) -> [dynamic]RawEntry {
+	entries := make([dynamic]RawEntry)
+	// Reset up front so the flag is well defined on every early return.
+	has_git^ = false
+
+	cpath := strings.clone_to_cstring(dir_path)
+	if cpath == nil do return entries
+
+	fd, err := linux.open(cpath, {.DIRECTORY, .CLOEXEC})
+	delete(cpath)
+	if err != .NONE do return entries
+	defer linux.close(fd)
+
+	buf: [8192]u8
+
+	for {
+		n, errno := linux.getdents(fd, buf[:])
+		if n <= 0 || errno != .NONE do break
+
+		offs := 0
+		for d in linux.dirent_iterate_buf(buf[:n], &offs) {
+			name := linux.dirent_name(d)
+			if name == "." || name == ".." do continue
+
+			if name == ".git" && d.type == .DIR {
+				has_git^ = true
+			}
+
+			// `name` points into `buf`, which the next getdents call
+			// overwrites, so keep a copy.
+			cloned := strings.clone(name)
+			append(&entries, RawEntry{name = cloned, type = d.type})
+		}
+	}
+
+	return entries
+}
+
+// free_entries deletes every cloned name and then the array itself.
+free_entries :: proc(entries: ^[dynamic]RawEntry) {
+	for &entry in entries {
+		delete(entry.name)
+	}
+	delete(entries^)
+}
+
+// load_gitignore reads and parses `<dir_path>/.gitignore`. It returns nil when
+// the file cannot be read; otherwise the caller must `destroy` and `free` the
+// returned value.
+load_gitignore :: proc(dir_path: string) -> ^Gitignore {
+	gi_path := join_path(dir_path, ".gitignore")
+	defer delete(gi_path)
+
+	data, err := os.read_entire_file_from_path(gi_path, context.allocator)
+	if err != .NONE do return nil
+
+	gi := new(Gitignore)
+	gi^ = parse(string(data))
+	delete(data)
+	return gi
+}
+
+// join_path returns a newly allocated "<parent>/<child>", adding the separator
+// only when `parent` does not already end in '/'. An empty `parent` yields
+// "/<child>". The caller owns the result.
+join_path :: proc(parent, child: string) -> string {
+	b: strings.Builder
+	strings.builder_init(&b)
+	defer strings.builder_destroy(&b)
+
+	fmt.sbprintf(&b, "%s", parent)
+	if len(parent) == 0 || parent[len(parent) - 1] != '/' {
+		fmt.sbprintf(&b, "/")
+	}
+	fmt.sbprintf(&b, "%s", child)
+
+	s := strings.to_string(b)
+	result, _ := strings.clone(s)
+	return result
+}