From 3f6eb17aad6bf5407dc359873e644dc2e13dc285 Mon Sep 17 00:00:00 2001 From: Spencer Brower Date: Wed, 17 Jun 2026 13:07:29 -0400 Subject: [PATCH] perf(findr): Each thread gets its own buffer. --- PERFORMANCE_IDEAS.md | 25 +++++++++++++++++-------- walker.odin | 23 +++++++++++++++-------- 2 files changed, 32 insertions(+), 16 deletions(-) diff --git a/PERFORMANCE_IDEAS.md b/PERFORMANCE_IDEAS.md index 863f048..8fea388 100644 --- a/PERFORMANCE_IDEAS.md +++ b/PERFORMANCE_IDEAS.md @@ -1,13 +1,22 @@ -findr is 4.5x slower than fd (case 1: 658ms vs 146ms). Opportunities: -- Per-thread result buffers (eliminate mutex contention) -- Arena allocator for path strings -- Larger getdents buffer (8KB → 64KB+) -- Buffered stdout output +findr is ~2.3x slower than fd (case 1: 547ms vs 241ms). Opportunities: -- Write while walking rather than waiting until the end? +1. Per-thread result buffers (DONE) +Each thread accumulates results locally, then merges once at exit. Eliminates per-result mutex contention. -1. Per-thread result buffers (biggest win) -Every result append currently takes results_mutex. With millions of files, that's millions of lock/unlock cycles. Fix: each thread accumulates results locally, then merges once when done. +2. Batched channel (fd's approach) +Replace global results array + merge with a buffered channel of batches. Each worker fills a local batch (~256 items), sends it to a `chan.Chan([]string)` (capacity = 2 × threads). A receiver thread drains batches and collects/prints. Provides backpressure, streaming output, and per-batch (not global) synchronization. Enables sorting like fd does (buffer first 1000 results or 100ms, then stream). + +3. Path allocation waste (join_path/join_path_dir) +Every path construction spins up a strings.Builder, does fmt.sbprintf, to_string, clone, then builder_destroy — 2 heap allocs + 2 frees per path. Could be a simple memcpy into a stack buffer with a single alloc. + +4. Larger getdents buffer +Currently 8KB. Increasing to 64KB+ means fewer syscalls per directory with many entries. + +5. Eliminate entry name cloning +strings.clone(name) in read_dir_entries heap-allocates per dirent. Names are valid in the getdents buffer during process_dir, so the clone may be unnecessary. + +6. Arena allocator per thread +Replace the default allocator for transient strings with a bump allocator — allocate in bulk, free all at once. 2. Path allocation waste (join_path/join_path_dir) Every path construction spins up a strings.Builder, does fmt.sbprintf, to_string, clone, then builder_destroy — 2 heap allocs + 2 frees per path. Could be a simple memcpy into a stack buffer with a single alloc. 3. Larger getdents buffer diff --git a/walker.odin b/walker.odin index 9e40968..3cad90a 100644 --- a/walker.odin +++ b/walker.odin @@ -145,6 +145,9 @@ walk :: proc(roots: []string, results: ^[dynamic]string, opts: WalkOptions, thre walk_worker :: proc(t: ^thread.Thread) { pool := cast(^WalkerPool)t.data + local_results := make([dynamic]string, 0, 256) + defer delete(local_results) + for { sync.atomic_sema_wait(&pool.queue_sema) @@ -161,7 +164,7 @@ walk_worker :: proc(t: ^thread.Thread) { ordered_remove(&pool.queue, last) sync.mutex_unlock(&pool.queue_mutex) - process_dir(pool, item) + process_dir(pool, item, &local_results) delete(item.path) if len(item.rel) > 0 { delete(item.rel) } @@ -170,9 +173,17 @@ walk_worker :: proc(t: ^thread.Thread) { sync.one_shot_event_signal(&pool.done) } } + + if len(local_results) > 0 { + sync.mutex_lock(&pool.results_mutex) + for res in local_results { + append(pool.results, res) + } + sync.mutex_unlock(&pool.results_mutex) + } } -process_dir :: proc(pool: ^WalkerPool, item: WorkItem) { +process_dir :: proc(pool: ^WalkerPool, item: WorkItem, local_results: ^[dynamic]string) { dir_path := item.path has_git := false entries := read_dir_entries(dir_path, &has_git) @@ -237,9 +248,7 @@ process_dir :: proc(pool: ^WalkerPool, item: WorkItem) { if is_dir { if should_emit && matches_pattern(pool, entry.name) { dir_path_out := join_path_dir(dir_path, entry.name) - sync.mutex_lock(&pool.results_mutex) - append(pool.results, dir_path_out) - sync.mutex_unlock(&pool.results_mutex) + append(local_results, dir_path_out) } if !ignored { child_rel, _ := strings.clone(entry_rel) @@ -249,9 +258,7 @@ process_dir :: proc(pool: ^WalkerPool, item: WorkItem) { } else if is_nondir { if should_emit && matches_pattern(pool, entry.name) { full_path := join_path(dir_path, entry.name) - sync.mutex_lock(&pool.results_mutex) - append(pool.results, full_path) - sync.mutex_unlock(&pool.results_mutex) + append(local_results, full_path) } } }