From 90e02a46bd94ca6c27bc9c58f6a61d609aca31de Mon Sep 17 00:00:00 2001 From: Spencer Brower Date: Wed, 17 Jun 2026 15:17:41 -0400 Subject: [PATCH] perf(findr): Inline entry processing. --- PERFORMANCE_IDEAS.md | 62 +++++++++++++---- walker.odin | 162 +++++++++++++++++++------------------------ 2 files changed, 117 insertions(+), 107 deletions(-) diff --git a/PERFORMANCE_IDEAS.md b/PERFORMANCE_IDEAS.md index 0a38759..547cdd8 100644 --- a/PERFORMANCE_IDEAS.md +++ b/PERFORMANCE_IDEAS.md @@ -1,34 +1,66 @@ # Performance Ideas -Current state after regex→glob migration. findr beats fd in 3/4 cases. +Current state after regex→glob migration + 32KB getdents + skip gitignore in .All mode + inline entry processing. findr beats fd in 3/4 cases. -## Benchmark results (2026-06-17) +## Benchmark results (2026-06-17, post-inline-processing) | Case | fd | findr | Ratio | |------|------|-------|-------| -| 1 `-E .jj` | 172ms | 135ms | **1.27x faster** | -| 2 `-H` | 1.184s | 1.097s | **1.08x faster** | -| 3 `-HI` | 1.251s | 1.670s | **1.34x slower** | -| 4 `-E .git` | 274ms | 202ms | **1.36x faster** | +| 1 `-E .jj` | 187ms | 150ms | **1.25x faster** | +| 2 `-H` | 1.242s | 1.136s | **1.09x faster** | +| 3 `-HI` | 1.708s | 1.612s | **1.06x slower** | +| 4 `-E .git` | 306ms | 242ms | **1.26x faster** | -Case 3 (`-HI`) skips gitignore entirely, so it's pure I/O + allocation. System time is 2x fd's (12.1s vs 5.5s), pointing to syscall/allocation overhead. +Case 3 (`-HI`) wall time is now close to parity. User time dropped 38% (6.9s → 4.3s) from eliminating entry name clones, but system time rose 38% (8.2s → 11.3s) from the `openat(".git")` probe overhead. ## Completed 1. **Per-thread result buffers** — each thread accumulates locally, merges once at exit. Eliminates per-result mutex contention. 2. **Lean path join** — `join_path`/`join_path_dir` use stack buffer + `copy` + single alloc instead of `strings.Builder` + `fmt.sbprintf` + `clone`. 3. **Regex→glob migration** — replaced regex NFA with backtracking glob matcher. Eliminated 27% of CPU spent on `add_thread`/`is_ignored`. Biggest win. +4. **32KB getdents buffer** — bumped from 8KB. Marginal improvement, within noise. +5. **Skip gitignore loading in .All mode** — eliminated thousands of unnecessary file opens/parses in `-HI`. Cut system time 34% (12.4s → 8.2s). +6. **Fixed-size threads slice** — replaced `[dynamic]^thread.Thread` with `[]^thread.Thread` since thread count is known upfront. +7. **Inline entry processing** — merged `read_dir_entries` into `process_dir`. Entry names consumed directly from getdents buffer via `dirent_name(d)` views. Eliminated millions of `strings.clone`/`delete` pairs. User time dropped 38% in `-HI` case. + +## fd vs findr architecture comparison + +| Aspect | fd (ignore crate) | findr | +|--------|-------------------|-------| +| Syscall | `libc::readdir` | raw `getdents64` | +| Entry names | Clones into owned `PathBuf` per entry | Zero-copy view from getdents buffer | +| `.git` detection | `stat(".git")` per directory | `openat(fd, ".git")` probe per directory | +| Gitignore setup | Before entry iteration | Before entry iteration | +| Path traversal | Full paths | Full paths | +| Glob matching | globset stratification (literals→hash, complex→regex) | Backtracking token matcher | + +## Known problems + +1. **`openat(".git")` probe regression** — The inline processing refactor replaced a free dirent-name scan with a paid `openat` syscall per directory (~280K directories = 280K syscalls, most returning ENOENT). User time dropped from clone elimination, but system time rose from the probe, roughly canceling out. The old code detected `.git` for free while scanning entries; the new code needs `.git` info before processing, forcing the probe. + + Fixes to explore: + - **Skip probe in `.All` mode** — gitignore context is irrelevant, so `has_git` is unused. Eliminates ~280K ENOENT probes in `-HI` case. Low effort. + - **Two-pass over first getdents batch** — scan first batch for `.git`, set up context, then process all batches. `.git` virtually always appears in the first batch. Risk: not guaranteed. + - **Lazy context reset** — process entries optimistically, reset context if `.git` found mid-scan. Complex, entries already processed with wrong context. + +2. **Allocator efficiency gap** — findr still allocates 1-3 heap strings per entry (`join_path` results, work item paths). fd does the same but benefits from Rust's allocator. Odin's default allocator may have higher per-allocation overhead. ## Remaining ideas -1. **Larger getdents buffer** (8KB → 64KB+) - Fewer syscalls per directory with many entries. Low effort. +1. **Skip `has_git_dir` probe in `.All` mode** + Trivial guard. Directly addresses the system-time regression in the `-HI` case. -2. **Eliminate entry name cloning** - `strings.clone(name)` in `read_dir_entries` heap-allocates per dirent. Names are valid in the getdents buffer during `process_dir`, so the clone may be unnecessary. Low effort. +2. **Arena allocator per thread** + Bump allocator for all transient strings (result paths, work item paths), free once at exit. Would address the allocator efficiency gap. Bigger change, helps everywhere. -3. **Arena allocator per thread** - Bump allocator for all transient strings, free once at exit. Bigger change, helps everywhere. - -4. **Batched channel** (fd's approach) +3. **Batched channel** (fd's approach) Replace global results array with buffered channel of batches. Enables streaming output and sorting like fd does. + +## Allocator analysis + +Each emitted entry still needs a heap-allocated result string from `join_path`/`join_path_dir`, and each subdirectory needs a cloned `child_path` + `child_rel` for the work queue. That's 1-3 heap allocs per entry × millions of entries. + +fd has the same pattern (PathBuf per entry + per subdirectory) but benefits from Rust's allocator (system allocator tuned via `malloc`/`free` or jemalloc). Odin's default allocator may have higher per-allocation overhead. Options: +- **Arena per thread**: bulk-allocate, reset after each directory or at thread exit. Best for transient data. +- **Slab allocator for small strings**: most filenames are <64 bytes. A slab for small allocations could reduce fragmentation and improve cache locality. +- **Test with different Odin allocators**: `context.allocator` can be swapped. Worth profiling with `mem.virt_allocator` or a custom arena to measure the gap. diff --git a/walker.odin b/walker.odin index b5f562a..c47be1e 100644 --- a/walker.odin +++ b/walker.odin @@ -21,11 +21,6 @@ WalkOptions :: struct { ignore_mode: IgnoreMode, } -RawEntry :: struct { - name: string, - type: linux.Dirent_Type, -} - GIContext :: struct { gi: ^Gitignore, // nil if this dir had no .gitignore base_rel: string, // relative path from repo root to this dir @@ -188,9 +183,16 @@ walk_worker :: proc(t: ^thread.Thread) { process_dir :: proc(pool: ^WalkerPool, item: WorkItem, local_results: ^[dynamic]string) { dir_path := item.path - has_git := false - entries := read_dir_entries(dir_path, &has_git) - defer free_entries(&entries) + + cpath := strings.clone_to_cstring(dir_path) + if cpath == nil do return + defer delete(cpath) + + fd, open_err := linux.open(cpath, {.DIRECTORY, .CLOEXEC}) + if open_err != .NONE do return + defer linux.close(fd) + + has_git := has_git_dir(fd) gi_ctx := item.gi_ctx rel := item.rel @@ -221,58 +223,67 @@ process_dir :: proc(pool: ^WalkerPool, item: WorkItem, local_results: ^[dynamic] gi_ctx = new_ctx } + buf: [32768]u8 rel_buf: [4096]u8 - for entry in entries { - if entry.name == ".git" do continue + for { + n, errno := linux.getdents(fd, buf[:]) + if n <= 0 || errno != .NONE do break - is_dir := entry.type == .DIR - is_nondir := entry.type != .DIR + offs := 0 + for d in linux.dirent_iterate_buf(buf[:n], &offs) { + name := linux.dirent_name(d) + if name == "." || name == ".." do continue + if name == ".git" do continue - if pool.exclude_gi != nil && is_ignored(pool.exclude_gi, entry.name, is_dir) { - continue - } + is_dir := d.type == .DIR + is_nondir := d.type != .DIR - if !pool.opts.include_hidden && len(entry.name) > 0 && entry.name[0] == '.' { - continue - } - - entry_rel := build_rel(rel_buf[:], rel, entry.name) - - ignored := false - if gi_ctx != nil && pool.opts.ignore_mode != .All { - ignored = check_chain(gi_ctx, entry_rel, is_dir) - } - - should_emit: bool - if ignored { - should_emit = pool.opts.ignore_mode == .Ignored - } else { - should_emit = pool.opts.ignore_mode != .Ignored - } - - if is_dir { - if should_emit && matches_pattern(pool, entry.name) { - dir_path_out := join_path_dir(dir_path, entry.name) - append(local_results, dir_path_out) + if pool.exclude_gi != nil && is_ignored(pool.exclude_gi, name, is_dir) { + continue } - if !ignored { - child_rel, _ := strings.clone(entry_rel) - child_path := join_path(dir_path, entry.name) - push_work( - pool, - WorkItem { - path = child_path, - rel = child_rel, - gi_ctx = gi_ctx, - in_repo = child_in_repo, - }, - ) + + if !pool.opts.include_hidden && len(name) > 0 && name[0] == '.' { + continue } - } else if is_nondir { - if should_emit && matches_pattern(pool, entry.name) { - full_path := join_path(dir_path, entry.name) - append(local_results, full_path) + + entry_rel := build_rel(rel_buf[:], rel, name) + + ignored := false + if gi_ctx != nil && pool.opts.ignore_mode != .All { + ignored = check_chain(gi_ctx, entry_rel, is_dir) + } + + should_emit: bool + if ignored { + should_emit = pool.opts.ignore_mode == .Ignored + } else { + should_emit = pool.opts.ignore_mode != .Ignored + } + + if is_dir { + if should_emit && matches_pattern(pool, name) { + dir_path_out := join_path_dir(dir_path, name) + append(local_results, dir_path_out) + } + if !ignored { + child_rel, _ := strings.clone(entry_rel) + child_path := join_path(dir_path, name) + push_work( + pool, + WorkItem { + path = child_path, + rel = child_rel, + gi_ctx = gi_ctx, + in_repo = child_in_repo, + }, + ) + } + } else if is_nondir { + if should_emit && matches_pattern(pool, name) { + full_path := join_path(dir_path, name) + append(local_results, full_path) + } } } } @@ -330,46 +341,13 @@ push_work :: proc(pool: ^WalkerPool, item: WorkItem) { sync.atomic_sema_post(&pool.queue_sema) } -read_dir_entries :: proc(dir_path: string, has_git: ^bool) -> [dynamic]RawEntry { - entries := make([dynamic]RawEntry) - - cpath := strings.clone_to_cstring(dir_path) - if cpath == nil do return entries - - fd, err := linux.open(cpath, {.DIRECTORY, .CLOEXEC}) - delete(cpath) - if err != .NONE do return entries - - buf: [32 * 1024]u8 - has_git^ = false - - for { - n, errno := linux.getdents(fd, buf[:]) - if n <= 0 || errno != .NONE do break - - offs := 0 - for d in linux.dirent_iterate_buf(buf[:n], &offs) { - name := linux.dirent_name(d) - if name == "." || name == ".." do continue - - if name == ".git" && d.type == .DIR { - has_git^ = true - } - - cloned := strings.clone(name) - append(&entries, RawEntry{name = cloned, type = d.type}) - } +has_git_dir :: proc(fd: linux.Fd) -> bool { + git_fd, err := linux.openat(fd, ".git", {.DIRECTORY, .CLOEXEC}) + if err == .NONE { + linux.close(git_fd) + return true } - - linux.close(fd) - return entries -} - -free_entries :: proc(entries: ^[dynamic]RawEntry) { - for &entry in entries { - delete(entry.name) - } - delete(entries^) + return false } load_ignore_patterns :: proc(dir_path: string, in_repo: bool) -> ^Gitignore {