perf(findr): Each thread gets its own buffer.
This commit is contained in:
@@ -1,13 +1,22 @@
|
|||||||
findr is 4.5x slower than fd (case 1: 658ms vs 146ms). Opportunities:
|
findr is ~2.3x slower than fd (case 1: 547ms vs 241ms). Opportunities:
|
||||||
- Per-thread result buffers (eliminate mutex contention)
|
|
||||||
- Arena allocator for path strings
|
|
||||||
- Larger getdents buffer (8KB → 64KB+)
|
|
||||||
- Buffered stdout output
|
|
||||||
|
|
||||||
- Write while walking rather than waiting until the end?
|
1. Per-thread result buffers (DONE)
|
||||||
|
Each thread accumulates results locally, then merges once at exit. Eliminates per-result mutex contention.
|
||||||
|
|
||||||
1. Per-thread result buffers (biggest win)
|
2. Batched channel (fd's approach)
|
||||||
Every result append currently takes results_mutex. With millions of files, that's millions of lock/unlock cycles. Fix: each thread accumulates results locally, then merges once when done.
|
Replace the global results array and final merge with a buffered channel of batches. Each worker fills a local batch (~256 items) and sends it to a `chan.Chan([]string)` (capacity = 2 × threads). A receiver thread drains the batches and collects/prints them. This provides backpressure, streaming output, and per-batch (not global) synchronization, and it enables sorting the way fd does (buffer the first 1000 results or 100ms, then stream).
|
||||||
|
|
||||||
|
3. Path allocation waste (join_path/join_path_dir)
|
||||||
|
Every path construction spins up a strings.Builder, does fmt.sbprintf, to_string, clone, then builder_destroy — 2 heap allocs + 2 frees per path. Could be a simple memcpy into a stack buffer with a single alloc.
|
||||||
|
|
||||||
|
4. Larger getdents buffer
|
||||||
|
Currently 8KB. Increasing to 64KB+ means fewer syscalls per directory with many entries.
|
||||||
|
|
||||||
|
5. Eliminate entry name cloning
|
||||||
|
strings.clone(name) in read_dir_entries heap-allocates per dirent. Names are valid in the getdents buffer during process_dir, so the clone may be unnecessary.
|
||||||
|
|
||||||
|
6. Arena allocator per thread
|
||||||
|
Replace the default allocator for transient strings with a bump allocator — allocate in bulk, free all at once.
|
||||||
2. Path allocation waste (join_path/join_path_dir)
|
2. Path allocation waste (join_path/join_path_dir)
|
||||||
Every path construction spins up a strings.Builder, does fmt.sbprintf, to_string, clone, then builder_destroy — 2 heap allocs + 2 frees per path. Could be a simple memcpy into a stack buffer with a single alloc.
|
Every path construction spins up a strings.Builder, does fmt.sbprintf, to_string, clone, then builder_destroy — 2 heap allocs + 2 frees per path. Could be a simple memcpy into a stack buffer with a single alloc.
|
||||||
3. Larger getdents buffer
|
3. Larger getdents buffer
|
||||||
|
|||||||
23
walker.odin
23
walker.odin
@@ -145,6 +145,9 @@ walk :: proc(roots: []string, results: ^[dynamic]string, opts: WalkOptions, thre
|
|||||||
walk_worker :: proc(t: ^thread.Thread) {
|
walk_worker :: proc(t: ^thread.Thread) {
|
||||||
pool := cast(^WalkerPool)t.data
|
pool := cast(^WalkerPool)t.data
|
||||||
|
|
||||||
|
local_results := make([dynamic]string, 0, 256)
|
||||||
|
defer delete(local_results)
|
||||||
|
|
||||||
for {
|
for {
|
||||||
sync.atomic_sema_wait(&pool.queue_sema)
|
sync.atomic_sema_wait(&pool.queue_sema)
|
||||||
|
|
||||||
@@ -161,7 +164,7 @@ walk_worker :: proc(t: ^thread.Thread) {
|
|||||||
ordered_remove(&pool.queue, last)
|
ordered_remove(&pool.queue, last)
|
||||||
sync.mutex_unlock(&pool.queue_mutex)
|
sync.mutex_unlock(&pool.queue_mutex)
|
||||||
|
|
||||||
process_dir(pool, item)
|
process_dir(pool, item, &local_results)
|
||||||
delete(item.path)
|
delete(item.path)
|
||||||
if len(item.rel) > 0 { delete(item.rel) }
|
if len(item.rel) > 0 { delete(item.rel) }
|
||||||
|
|
||||||
@@ -170,9 +173,17 @@ walk_worker :: proc(t: ^thread.Thread) {
|
|||||||
sync.one_shot_event_signal(&pool.done)
|
sync.one_shot_event_signal(&pool.done)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if len(local_results) > 0 {
|
||||||
|
sync.mutex_lock(&pool.results_mutex)
|
||||||
|
for res in local_results {
|
||||||
|
append(pool.results, res)
|
||||||
|
}
|
||||||
|
sync.mutex_unlock(&pool.results_mutex)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
process_dir :: proc(pool: ^WalkerPool, item: WorkItem) {
|
process_dir :: proc(pool: ^WalkerPool, item: WorkItem, local_results: ^[dynamic]string) {
|
||||||
dir_path := item.path
|
dir_path := item.path
|
||||||
has_git := false
|
has_git := false
|
||||||
entries := read_dir_entries(dir_path, &has_git)
|
entries := read_dir_entries(dir_path, &has_git)
|
||||||
@@ -237,9 +248,7 @@ process_dir :: proc(pool: ^WalkerPool, item: WorkItem) {
|
|||||||
if is_dir {
|
if is_dir {
|
||||||
if should_emit && matches_pattern(pool, entry.name) {
|
if should_emit && matches_pattern(pool, entry.name) {
|
||||||
dir_path_out := join_path_dir(dir_path, entry.name)
|
dir_path_out := join_path_dir(dir_path, entry.name)
|
||||||
sync.mutex_lock(&pool.results_mutex)
|
append(local_results, dir_path_out)
|
||||||
append(pool.results, dir_path_out)
|
|
||||||
sync.mutex_unlock(&pool.results_mutex)
|
|
||||||
}
|
}
|
||||||
if !ignored {
|
if !ignored {
|
||||||
child_rel, _ := strings.clone(entry_rel)
|
child_rel, _ := strings.clone(entry_rel)
|
||||||
@@ -249,9 +258,7 @@ process_dir :: proc(pool: ^WalkerPool, item: WorkItem) {
|
|||||||
} else if is_nondir {
|
} else if is_nondir {
|
||||||
if should_emit && matches_pattern(pool, entry.name) {
|
if should_emit && matches_pattern(pool, entry.name) {
|
||||||
full_path := join_path(dir_path, entry.name)
|
full_path := join_path(dir_path, entry.name)
|
||||||
sync.mutex_lock(&pool.results_mutex)
|
append(local_results, full_path)
|
||||||
append(pool.results, full_path)
|
|
||||||
sync.mutex_unlock(&pool.results_mutex)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user