perf(findr): Switched to using channels.

This commit is contained in:
2026-06-17 16:01:01 -04:00
parent 2eecf7c348
commit 26a6028ae6
3 changed files with 156 additions and 67 deletions

View File

@@ -1,17 +1,17 @@
# Performance Ideas # Performance Ideas
Current state after regex→glob migration + 32KB getdents + skip gitignore in .All mode + inline entry processing. findr beats fd in 3/4 cases. Current state after regex→glob migration + inline entry processing + skip gitignore in .All mode + channel-based streaming output. findr beats fd in 3/4 cases.
## Benchmark results (2026-06-17, post-inline-processing) ## Benchmark results (2026-06-17, post-channels)
| Case | fd | findr | Ratio | | Case | fd | findr | Ratio |
|------|------|-------|-------| |------|------|-------|-------|
| 1 `-E .jj` | 187ms | 150ms | **1.25x faster** | | 1 `-E .jj` | 159ms | 112ms | **1.42x faster** |
| 2 `-H` | 1.242s | 1.136s | **1.09x faster** | | 2 `-H` | 1.202s | 710ms | **1.69x faster** |
| 3 `-HI` | 1.708s | 1.612s | **1.06x slower** | | 3 `-HI` | 1.080s | 1.212s | **1.12x slower** |
| 4 `-E .git` | 306ms | 242ms | **1.26x faster** | | 4 `-E .git` | 298ms | 222ms | **1.34x faster** |
Case 3 (`-HI`) wall time is now close to parity. User time dropped 38% (6.9s → 4.3s) from eliminating entry name clones, but system time rose 38% (8.2s → 11.3s) from the `openat(".git")` probe overhead. Channels gave the biggest single improvement since the project started. Cases 1, 2, and 4 got dramatically faster because output I/O now overlaps with directory walking. Case 3 improved from 1.18x slower to 1.12x slower.
## Completed ## Completed
@@ -19,9 +19,11 @@ Case 3 (`-HI`) wall time is now close to parity. User time dropped 38% (6.9s →
2. **Lean path join** — `join_path`/`join_path_dir` use stack buffer + `copy` + single alloc instead of `strings.Builder` + `fmt.sbprintf` + `clone`. 2. **Lean path join** — `join_path`/`join_path_dir` use stack buffer + `copy` + single alloc instead of `strings.Builder` + `fmt.sbprintf` + `clone`.
3. **Regex→glob migration** — replaced regex NFA with backtracking glob matcher. Eliminated 27% of CPU spent on `add_thread`/`is_ignored`. Biggest win. 3. **Regex→glob migration** — replaced regex NFA with backtracking glob matcher. Eliminated 27% of CPU spent on `add_thread`/`is_ignored`. Biggest win.
4. **32KB getdents buffer** — bumped from 8KB. Marginal improvement, within noise. 4. **32KB getdents buffer** — bumped from 8KB. Marginal improvement, within noise.
5. **Skip gitignore loading in .All mode** — eliminated thousands of unnecessary file opens/parses in `-HI`. Cut system time 34% (12.4s → 8.2s). 5. **Skip gitignore loading in `.All` mode** — eliminated thousands of unnecessary file opens/parses in `-HI`. Cut system time 34% (12.4s → 8.2s).
6. **Fixed-size threads slice** — replaced `[dynamic]^thread.Thread` with `[]^thread.Thread` since thread count is known upfront. 6. **Fixed-size threads slice** — replaced `[dynamic]^thread.Thread` with `[]^thread.Thread` since thread count is known upfront.
7. **Inline entry processing** — merged `read_dir_entries` into `process_dir`. Entry names consumed directly from getdents buffer via `dirent_name(d)` views. Eliminated millions of `strings.clone`/`delete` pairs. User time dropped 38% in `-HI` case. 7. **Inline entry processing** — merged `read_dir_entries` into `process_dir`. Entry names consumed directly from getdents buffer via `dirent_name(d)` views. Eliminated millions of `strings.clone`/`delete` pairs. User time dropped 38% in `-HI` case.
8. **Skip `has_git_dir` probe in `.All` mode** — guarded `has_git_dir(fd)` with `ignore_mode != .All`. Eliminated ~280K wasted `openat` ENOENT probes in `-HI` case. System time dropped 33% (11.3s → 7.6s).
9. **Channel-based streaming output** — replaced global results array + mutex with `chan.Chan([]string)`, cap `2 * thread_count`. Workers flush 256-result batches through the channel; a consumer thread drains to stdout. Matches fd's architecture (`crossbeam_channel::bounded(2*threads)`, batch size `0x100`). Eliminates the collect-then-write barrier. Cases 1/2/4 went from 1.1-1.3x faster to 1.3-1.7x faster.
## fd vs findr architecture comparison ## fd vs findr architecture comparison
@@ -33,34 +35,39 @@ Case 3 (`-HI`) wall time is now close to parity. User time dropped 38% (6.9s →
| Gitignore setup | Before entry iteration | Before entry iteration | | Gitignore setup | Before entry iteration | Before entry iteration |
| Path traversal | Full paths | Full paths | | Path traversal | Full paths | Full paths |
| Glob matching | globset stratification (literals→hash, complex→regex) | Backtracking token matcher | | Glob matching | globset stratification (literals→hash, complex→regex) | Backtracking token matcher |
| Result transport | `crossbeam_channel::bounded(2*threads)` (lock-free MPMC) | `core:sync/chan` (single-mutex ring buffer) |
| Batching | `Arc<Mutex<Option<Vec>>>` shared buffer, flush on first item | Detach backing array as `[]string`, flush when full (256) |
| Output mode | Hybrid: buffer 1000 items / 100ms → sort → stream | Direct streaming (no buffer/sort mode yet) |
## Known problems ## Known problems
1. **`openat(".git")` probe regression** — The inline processing refactor replaced a free dirent-name scan with a paid `openat` syscall per directory (~280K directories = 280K syscalls, most returning ENOENT). User time dropped from clone elimination, but system time rose from the probe, roughly canceling out. The old code detected `.git` for free while scanning entries; the new code needs `.git` info before processing, forcing the probe. 1. **Allocator efficiency gap** — findr still allocates 1-3 heap strings per entry (`join_path` results, work item paths). fd does the same but benefits from Rust's allocator. Odin's default allocator may have higher per-allocation overhead.
Fixes to explore: 2. **Channel mutex contention (unconfirmed)** — Odin's `core:sync/chan` uses a single mutex for the entire ring buffer. With 16 senders + 1 receiver hitting the same lock, every `chan.send`/`chan.recv` is a potential futex contention point. fd uses `crossbeam_channel::bounded` which is lock-free MPMC. **Note**: early spall profiles showed 11.8% futex_wait, but this was likely a profiling artifact — the channel ops generate more instrumentation events, causing the 1GB spall cap to be hit over a longer wall-time window (3.5s vs 1s), skewing the profile. Needs a fair comparison (smaller tree or larger cap) to confirm whether this is real.
- **Skip probe in `.All` mode** — gitignore context is irrelevant, so `has_git` is unused. Eliminates ~280K ENOENT probes in `-HI` case. Low effort.
- **Two-pass over first getdents batch** — scan first batch for `.git`, set up context, then process all batches. `.git` virtually always appears in the first batch. Risk: not guaranteed.
- **Lazy context reset** — process entries optimistically, reset context if `.git` found mid-scan. Complex, entries already processed with wrong context.
2. **Allocator efficiency gap** — findr still allocates 1-3 heap strings per entry (`join_path` results, work item paths). fd does the same but benefits from Rust's allocator. Odin's default allocator may have higher per-allocation overhead.
## Remaining ideas ## Remaining ideas
1. **Skip `has_git_dir` probe in `.All` mode** 1. **Lock-free MPMC queue**
Trivial guard. Directly addresses the system-time regression in the `-HI` case. Replace Odin's mutex-based channel with a custom multi-producer-single-consumer ring buffer using atomics. Eliminates all futex syscalls on the result-transport hot path.
**Design**:
- Fixed-capacity ring buffer of `[]string` slots (cap = `2 * thread_count`, same as now)
- Producer side: each worker atomic-CASes a `head` counter forward to claim a slot index, writes its batch, then sets a `ready` flag on the slot
- Consumer side: atomic-load `head`, drains all ready slots up to `head`, writes to stdout, frees batches
- Backpressure: if `head - tail >= cap`, producer spins/waits (yields via `sched_yield` or `futex` with private flag)
- Close: atomic flag set by `walk_stream` after all workers joined; consumer drains remaining then exits
**Alternative**: Use a per-producer SPSC queue (one ring per worker thread). Consumer round-robins across all N queues. No CAS on producer side — each worker writes to its own queue with only a `store` + fence. Consumer reads from each with a `load`. Trades simplicity for zero contention.
**Risk**: Low. The API surface is small (`send`, `recv`, `close`). Can be swapped behind the existing `flush_batch` interface without touching `walk_worker` or `output_writer`. fd's `crossbeam_channel` proves lock-free MPMC is achievable.
**Effort**: Medium. ~100-150 lines for the queue + a few tests. No changes to walker or main.
2. **Arena allocator per thread** 2. **Arena allocator per thread**
Bump allocator for all transient strings (result paths, work item paths), free once at exit. Would address the allocator efficiency gap. Bigger change, helps everywhere. Bump allocator for all transient strings (result paths, work item paths), free once at exit. Would address the allocator efficiency gap. Bigger change, helps everywhere.
3. **Batched channel** (fd's approach) 3. **Buffer/sort output mode** (fd's approach)
Replace global results array with buffered channel of batches. Enables streaming output and sorting like fd does. Buffer up to 1000 results (or 100ms deadline), sort them, then switch to streaming. Gives sorted output for small searches without sacrificing throughput on large ones. fd's `ReceiverMode::Buffering → Streaming` pattern.
## Allocator analysis 4. **Git index parsing**
Parse `.git/index` binary format to show tracked dotfiles. Closes the 84-file correctness delta in cases 1/4. Last correctness gap.
Each emitted entry still needs a heap-allocated result string from `join_path`/`join_path_dir`, and each subdirectory needs a cloned `child_path` + `child_rel` for the work queue. That's 1-3 heap allocs per entry × millions of entries.
fd has the same pattern (PathBuf per entry + per subdirectory) but benefits from Rust's allocator (system allocator tuned via `malloc`/`free` or jemalloc). Odin's default allocator may have higher per-allocation overhead. Options:
- **Arena per thread**: bulk-allocate, reset after each directory or at thread exit. Best for transient data.
- **Slab allocator for small strings**: most filenames are <64 bytes. A slab for small allocations could reduce fragmentation and improve cache locality.
- **Test with different Odin allocators**: `context.allocator` can be swapped. Worth profiling with `mem.virt_allocator` or a custom arena to measure the gap.

View File

@@ -3,6 +3,32 @@ package findr
import "core:bufio" import "core:bufio"
import "core:os" import "core:os"
import "core:strings" import "core:strings"
import "core:sync/chan"
import "core:thread"
// Writer_Data is the payload passed to the `output_writer` thread through
// `thread.Thread.data`.
Writer_Data :: struct {
// Channel of result batches. `output_writer` receives each batch, prints its
// strings one per line, and frees both the strings and the batch slice.
ch: chan.Chan([]string),
}
// output_writer is the thread procedure that drains result batches to stdout.
//
// `t.data` must point at a `Writer_Data`. The procedure keeps receiving
// batches from the channel until `chan.recv` reports failure, writes every
// string followed by a newline through a buffered writer, and frees each
// string and each batch slice once it has been written. The buffered writer
// is flushed after the receive loop ends.
output_writer :: proc(t: ^thread.Thread) {
	wd := cast(^Writer_Data)t.data

	out: bufio.Writer
	bufio.writer_init(&out, os.to_stream(os.stdout), 1 << 13)
	defer bufio.writer_destroy(&out)

	drain: for {
		batch, open := chan.recv(wd.ch)
		if !open {
			break drain
		}

		for line in batch {
			bufio.writer_write_string(&out, line)
			bufio.writer_write_byte(&out, '\n')
			// The writer owns each received string and releases it after writing.
			delete(line)
		}
		delete(batch)
	}

	bufio.writer_flush(&out)
}
main :: proc() { main :: proc() {
prof_init() prof_init()
@@ -69,23 +95,24 @@ main :: proc() {
append(&paths, ".") append(&paths, ".")
} }
results := make([dynamic]string)
defer {
for r in results {delete(r)}
delete(results)
}
thread_count := os.get_processor_core_count() thread_count := os.get_processor_core_count()
walk(paths[:], &results, opts, thread_count)
w: bufio.Writer ch, _ := chan.create(chan.Chan([]string), max(2 * thread_count, 2), context.allocator)
bufio.writer_init(&w, os.to_stream(os.stdout), 1 << 13) defer chan.destroy(ch)
defer bufio.writer_destroy(&w)
for r in results { wdata := new(Writer_Data)
bufio.writer_write_string(&w, r) wdata.ch = ch
bufio.writer_write_byte(&w, '\n') defer free(wdata)
}
bufio.writer_flush(&w) writer := thread.create(output_writer)
writer.data = rawptr(wdata)
writer.init_context = context
thread.start(writer)
walk_stream(paths[:], ch, opts, thread_count)
chan.close(ch)
thread.join(writer)
thread.destroy(writer)
} }

View File

@@ -4,10 +4,13 @@ import "core:fmt"
import "core:os" import "core:os"
import "core:strings" import "core:strings"
import "core:sync" import "core:sync"
import "core:sync/chan"
import "core:sys/linux" import "core:sys/linux"
import "core:text/regex" import "core:text/regex"
import "core:thread" import "core:thread"
BATCH_SIZE :: 256
IgnoreMode :: enum { IgnoreMode :: enum {
Respected, // skip gitignored, prune ignored dirs (fd -H default) Respected, // skip gitignored, prune ignored dirs (fd -H default)
All, // ignore .gitignore entirely, descend everywhere (fd -HI) All, // ignore .gitignore entirely, descend everywhere (fd -HI)
@@ -38,8 +41,7 @@ WalkerPool :: struct {
queue: [dynamic]WorkItem, queue: [dynamic]WorkItem,
queue_mutex: sync.Mutex, queue_mutex: sync.Mutex,
queue_sema: sync.Atomic_Sema, queue_sema: sync.Atomic_Sema,
results: ^[dynamic]string, result_chan: chan.Chan([]string),
results_mutex: sync.Mutex,
active: i64, active: i64,
done: sync.One_Shot_Event, done: sync.One_Shot_Event,
threads: []^thread.Thread, threads: []^thread.Thread,
@@ -51,12 +53,24 @@ WalkerPool :: struct {
contexts_lock: sync.Mutex, contexts_lock: sync.Mutex,
} }
walk :: proc(roots: []string, results: ^[dynamic]string, opts: WalkOptions, thread_count: int) { flush_batch :: proc(ch: chan.Chan([]string), local: ^[dynamic]string) {
if len(local) == 0 do return
batch := local[:]
local^ = make([dynamic]string, 0, BATCH_SIZE)
chan.send(ch, batch)
}
walk_stream :: proc(
roots: []string,
result_chan: chan.Chan([]string),
opts: WalkOptions,
thread_count: int,
) {
if len(roots) == 0 do return if len(roots) == 0 do return
pool := new(WalkerPool) pool := new(WalkerPool)
pool.queue = make([dynamic]WorkItem) pool.queue = make([dynamic]WorkItem)
pool.results = results pool.result_chan = result_chan
pool.active = i64(len(roots)) pool.active = i64(len(roots))
pool.threads = make([]^thread.Thread, thread_count) pool.threads = make([]^thread.Thread, thread_count)
pool.all_contexts = make([dynamic]^GIContext) pool.all_contexts = make([dynamic]^GIContext)
@@ -137,14 +151,59 @@ walk :: proc(roots: []string, results: ^[dynamic]string, opts: WalkOptions, thre
free(pool) free(pool)
} }
// Collector_Data is the payload passed to the `collect_worker` thread through
// `thread.Thread.data`.
Collector_Data :: struct {
// Channel from which result batches are received.
ch: chan.Chan([]string),
// Destination array; `collect_worker` appends every received string to it.
results: ^[dynamic]string,
}
// collect_worker is the thread procedure that gathers streamed batches into a
// single dynamic array.
//
// `t.data` must point at a `Collector_Data`. Every string of every received
// batch is appended to `results`; the strings themselves are not freed here,
// only the batch slice that carried them. The loop ends when `chan.recv`
// reports failure.
collect_worker :: proc(t: ^thread.Thread) {
	cd := cast(^Collector_Data)t.data

	for {
		batch, open := chan.recv(cd.ch)
		if !open {
			break
		}

		// Move the whole batch over in one call, then release the carrier slice.
		append(cd.results, ..batch)
		delete(batch)
	}
}
// walk is the collect-everything wrapper around `walk_stream`.
//
// It creates a result channel with capacity `max(2 * thread_count, 2)`, starts
// a `collect_worker` thread that appends every streamed string to `results`,
// runs `walk_stream` on the calling thread, then closes the channel and joins
// the collector so `results` is complete when this procedure returns.
//
// Parameters:
//   roots        - starting paths; an empty slice makes this a no-op.
//   results      - destination array that receives every result string.
//   opts         - walk options forwarded unchanged to `walk_stream`.
//   thread_count - forwarded to `walk_stream`; also sizes the channel.
//
// If the channel or the collector thread cannot be created, the procedure
// returns without walking and leaves `results` untouched.
walk :: proc(roots: []string, results: ^[dynamic]string, opts: WalkOptions, thread_count: int) {
	if len(roots) == 0 do return

	// Without a channel there is nowhere to deliver results, so do not start
	// the walk at all when allocation fails.
	ch, err := chan.create(chan.Chan([]string), max(2 * thread_count, 2), context.allocator)
	if err != .None do return
	defer chan.destroy(ch)

	data := new(Collector_Data)
	// Deferred so that every exit path below releases the payload.
	defer free(data)
	data.ch = ch
	data.results = results

	collector := thread.create(collect_worker)
	// Without a consumer nothing would drain the bounded channel.
	if collector == nil do return
	collector.data = rawptr(data)
	collector.init_context = context
	thread.start(collector)

	walk_stream(roots, ch, opts, thread_count)

	// Closing the channel is what lets the collector's receive loop end.
	chan.close(ch)
	thread.join(collector)
	thread.destroy(collector)
}
walk_worker :: proc(t: ^thread.Thread) { walk_worker :: proc(t: ^thread.Thread) {
pool := cast(^WalkerPool)t.data pool := cast(^WalkerPool)t.data
prof_thread_init("walker") prof_thread_init("walker")
defer prof_thread_destroy() defer prof_thread_destroy()
local_results := make([dynamic]string, 0, 256) local_results := make([dynamic]string, 0, BATCH_SIZE)
defer delete(local_results) defer {
if len(local_results) > 0 {
flush_batch(pool.result_chan, &local_results)
}
delete(local_results)
}
for { for {
sync.atomic_sema_wait(&pool.queue_sema) sync.atomic_sema_wait(&pool.queue_sema)
@@ -166,19 +225,15 @@ walk_worker :: proc(t: ^thread.Thread) {
delete(item.path) delete(item.path)
if len(item.rel) > 0 {delete(item.rel)} if len(item.rel) > 0 {delete(item.rel)}
if len(local_results) >= BATCH_SIZE {
flush_batch(pool.result_chan, &local_results)
}
old := sync.atomic_sub_explicit(&pool.active, 1, .Release) old := sync.atomic_sub_explicit(&pool.active, 1, .Release)
if old == 1 { if old == 1 {
sync.one_shot_event_signal(&pool.done) sync.one_shot_event_signal(&pool.done)
} }
} }
if len(local_results) > 0 {
sync.mutex_lock(&pool.results_mutex)
for res in local_results {
append(pool.results, res)
}
sync.mutex_unlock(&pool.results_mutex)
}
} }
process_dir :: proc(pool: ^WalkerPool, item: WorkItem, local_results: ^[dynamic]string) { process_dir :: proc(pool: ^WalkerPool, item: WorkItem, local_results: ^[dynamic]string) {