diff --git a/PERFORMANCE_IDEAS.md b/PERFORMANCE_IDEAS.md
index f943484..863f048 100644
--- a/PERFORMANCE_IDEAS.md
+++ b/PERFORMANCE_IDEAS.md
@@ -3,3 +3,16 @@ findr is 4.5x slower than fd (case 1: 658ms vs 146ms). Opportunities:
 - Arena allocator for path strings
 - Larger getdents buffer (8KB → 64KB+)
 - Buffered stdout output
+
+- Stream results to stdout while walking, rather than collecting everything and printing at the end?
+
+1. **Per-thread result buffers (biggest win).**
+   Every result append currently takes `results_mutex`. With millions of files, that's millions of lock/unlock cycles. Fix: each thread accumulates results locally, then merges once when done.
+2. **Path allocation waste (`join_path`/`join_path_dir`).**
+   Every path construction spins up a `strings.Builder`, then calls `fmt.sbprintf`, `to_string`, `clone`, and `builder_destroy` — 2 heap allocs + 2 frees per path. Could be a simple memcpy into a stack buffer with a single alloc.
+3. **Larger getdents buffer.**
+   Currently 8KB. Increasing to 64KB+ means fewer syscalls per directory with many entries.
+4. **Eliminate entry name cloning.**
+   `strings.clone(name)` in `read_dir_entries` heap-allocates per dirent. Names remain valid in the getdents buffer for the duration of `process_dir`, so the clone may be unnecessary.
+5. **Arena allocator per thread.**
+   Replace the default allocator for transient strings with a bump allocator — allocate in bulk, free all at once.
diff --git a/findr.odin b/findr.odin
index ce89474..cd5db19 100644
--- a/findr.odin
+++ b/findr.odin
@@ -1,5 +1,6 @@
 package findr
 
+import "core:bufio"
 import "core:fmt"
 import "core:os"
 import "core:strings"
@@ -75,7 +76,13 @@ main :: proc() {
 	thread_count := os.get_processor_core_count()
 	walk(paths[:], &results, opts, thread_count)
 
+	w: bufio.Writer
+	bufio.writer_init(&w, os.to_stream(os.stdout), 1 << 13)
+	defer bufio.writer_destroy(&w)
+
 	for r in results {
-		fmt.println(r)
+		bufio.writer_write_string(&w, r)
+		bufio.writer_write_byte(&w, '\n')
 	}
+	bufio.writer_flush(&w)
 }