// infotheory/search.rs

1use crate::rosaplus::RosaPlus;
2use crate::{InfotheoryCtx, RateBackend, cross_entropy_bytes, marginal_entropy_bytes};
3use rayon::prelude::*;
4use std::collections::hash_map::DefaultHasher;
5use std::fs;
6use std::hash::{Hash, Hasher};
7use std::path::{Path, PathBuf};
8
/// One scored retrieval unit returned by code search.
#[derive(Debug, Clone)]
pub struct Snippet {
    /// Source file this candidate was extracted from.
    pub path: PathBuf,
    /// 1-based inclusive first line, for `sed`-style display.
    pub start_line: usize,
    /// 1-based inclusive last line, for `sed`-style display.
    pub end_line: usize,
    /// Raw candidate bytes fed to the entropy / rerank scorers.
    pub content: Vec<u8>,
    /// Final ranking score; larger means a better match.
    pub score: f64,
}
23
24fn stage0_prefilter(
25    query_bytes: &[u8],
26    mut candidates: Vec<Snippet>,
27    opts: &SearchOptions,
28    debug: bool,
29) -> Vec<Snippet> {
30    let n = candidates.len();
31    if n == 0 {
32        return candidates;
33    }
34
35    let frac = opts.stage0_keep_frac.clamp(0.0, 1.0);
36    if frac >= 1.0 {
37        return candidates;
38    }
39
40    // Option A: Unigram (i.i.d.) likelihood-gain proxy.
41    // score0(x) = H0(Q) - H0(Q|X)
42    // where H0(Q|X) is computed as cross-entropy of Q under X's unigram model.
43    let h0_q = marginal_entropy_bytes(query_bytes);
44    candidates.par_iter_mut().for_each(|s| {
45        let h0_q_x = cross_entropy_bytes(query_bytes, &s.content, 0);
46        s.score = h0_q - h0_q_x;
47    });
48
49    let mut keep = ((n as f64) * frac).ceil() as usize;
50    keep = keep.max(opts.top_k).min(n);
51    if keep < n {
52        let nth = keep.saturating_sub(1);
53        candidates.select_nth_unstable_by(nth, |a, b| {
54            b.score
55                .partial_cmp(&a.score)
56                .unwrap_or(std::cmp::Ordering::Equal)
57        });
58        candidates.truncate(keep);
59    }
60
61    if debug {
62        println!(
63            "Stage-0 prefilter kept {}/{} candidates (frac={:.4})",
64            candidates.len(),
65            n,
66            frac
67        );
68    }
69
70    candidates
71}
72
/// Candidate unit granularity for stage-0/1 collection.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub enum SearchGranularity {
    /// Carve each file into overlapping line windows/snippets.
    Snippet,
    /// Treat an entire file as one candidate.
    File,
}
81
/// Stage-2 prior handling strategy for KMI reranking.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub enum Stage2PriorMode {
    /// Prefix compression metrics with the (full or summarized) universal prior.
    Use,
    /// Skip the universal prior in Stage 2 (pure NCD/KMI rerank of the
    /// Stage-1-filtered set).
    Disable,
    /// Replace the universal prior with the result of an inner prior-less
    /// search over the prior corpus.
    Summarize,
}
92
93#[derive(Clone)]
94/// Tunables for the three-stage information-theoretic search pipeline.
95pub struct SearchOptions {
96    /// Candidate granularity at collection time.
97    pub granularity: SearchGranularity,
98    /// Universal prior corpus path (file or directory). If set:
99    /// - Stage 1 always uses it.
100    /// - Stage 2 uses it by default (unless Stage2PriorMode::Disable).
101    pub universal_prior: Option<String>,
102    /// Whether/how Stage-2 reranking uses universal prior context.
103    pub stage2_prior_mode: Stage2PriorMode,
104    /// Maximum model order used by entropy-rate estimators.
105    pub max_order: i64,
106    /// Number of final results to keep.
107    pub top_k: usize,
108    /// Fraction of candidates retained by the unigram prefilter.
109    pub stage0_keep_frac: f64,
110    /// Fully configured information-theory context/backend bundle.
111    pub ctx: InfotheoryCtx,
112}
113
114impl Default for SearchOptions {
115    fn default() -> Self {
116        Self {
117            granularity: SearchGranularity::Snippet,
118            universal_prior: None,
119            stage2_prior_mode: Stage2PriorMode::Use,
120            max_order: 8,
121            top_k: 50,
122            stage0_keep_frac: 0.2,
123            ctx: InfotheoryCtx::with_zpaq("5"),
124        }
125    }
126}
127
128/// Run search with default options and print top shell extraction commands.
129pub fn run_search(query: &str, target_path: &str) {
130    run_search_with_options(query, target_path, &SearchOptions::default());
131}
132
133/// Run search with explicit options and print top shell extraction commands.
134pub fn run_search_with_options(query: &str, target_path: &str, opts: &SearchOptions) {
135    let debug = std::env::var("DEBUG_SEARCH").is_ok();
136    let results = search_with_options(query, target_path, opts);
137    for (i, snippet) in results.iter().take(5).enumerate() {
138        if debug {
139            println!(
140                "Rank {}: Score={:.6}, Path={}",
141                i + 1,
142                snippet.score,
143                snippet.path.display()
144            );
145        }
146        println!(
147            "sed -n '{},{}p' {}",
148            snippet.start_line,
149            snippet.end_line,
150            snippet.path.display()
151        );
152    }
153}
154
155/// Run the full 3-stage search pipeline and return ranked results.
156///
157/// The returned `Vec<Snippet>` is sorted by descending score, truncated
158/// to `opts.top_k` entries.  Each snippet carries its file path, line
159/// range, content bytes, and final KMI-reranked score.
160pub fn search_with_options(query: &str, target_path: &str, opts: &SearchOptions) -> Vec<Snippet> {
161    let debug = std::env::var("DEBUG_SEARCH").is_ok();
162    let query_bytes = resolve_query_bytes(query);
163    if query_bytes.is_empty() {
164        eprintln!("Error: Query is empty.");
165        return Vec::new();
166    }
167
168    if debug {
169        println!(
170            "Scanning target: {} (granularity={:?}, prior={}, stage2_prior_mode={:?})",
171            target_path,
172            opts.granularity,
173            opts.universal_prior.as_deref().unwrap_or("<none>"),
174            opts.stage2_prior_mode
175        );
176    }
177
178    let candidates = collect_candidates(target_path, opts.granularity);
179    if candidates.is_empty() {
180        eprintln!("No accessible files found in target '{}'.", target_path);
181        return Vec::new();
182    }
183
184    let candidates = stage0_prefilter(query_bytes.as_slice(), candidates, opts, debug);
185    if candidates.is_empty() {
186        eprintln!("No candidates remain after Stage-0 prefilter.");
187        return Vec::new();
188    }
189    if debug {
190        println!("Found {} candidates. Filtering...", candidates.len());
191    }
192
193    // Stage 1: Filter
194    let mut scored_candidates = if let Some(prior_path) = opts.universal_prior.as_deref() {
195        stage1_filter_with_universal_prior(&query_bytes, prior_path, candidates, opts)
196    } else {
197        stage1_filter_no_prior(&query_bytes, candidates, opts)
198    };
199
200    let top_k_size = opts.top_k.min(scored_candidates.len());
201    if top_k_size < scored_candidates.len() {
202        let nth = top_k_size.saturating_sub(1);
203        scored_candidates.select_nth_unstable_by(nth, |a, b| {
204            b.score
205                .partial_cmp(&a.score)
206                .unwrap_or(std::cmp::Ordering::Equal)
207        });
208        scored_candidates.truncate(top_k_size);
209    }
210
211    scored_candidates.sort_by(|a, b| {
212        b.score
213            .partial_cmp(&a.score)
214            .unwrap_or(std::cmp::Ordering::Equal)
215    });
216
217    let top_candidates = &mut scored_candidates[..top_k_size];
218    if debug {
219        println!(
220            "Reranking top {} candidates with Kolmogorov Mutual Information...",
221            top_k_size
222        );
223    }
224
225    // Stage 2: Rerank
226    stage2_rerank_kmi(&query_bytes, top_candidates, opts);
227    top_candidates.sort_by(|a, b| {
228        b.score
229            .partial_cmp(&a.score)
230            .unwrap_or(std::cmp::Ordering::Equal)
231    });
232
233    scored_candidates
234}
235
/// Interpret `query` as a path when it names an existing regular file and
/// return that file's bytes; otherwise fall back to the literal query text.
fn resolve_query_bytes(query: &str) -> Vec<u8> {
    let p = Path::new(query);
    // A single metadata() call both checks existence and distinguishes files
    // from directories; the previous exists() probe was a redundant stat and
    // widened the check/read race window.
    let is_file = fs::metadata(p).map(|m| m.is_file()).unwrap_or(false);
    if is_file {
        // If the read still races with deletion, degrade to the query text.
        fs::read(p).unwrap_or_else(|_| query.as_bytes().to_vec())
    } else {
        query.as_bytes().to_vec()
    }
}
244
245fn stage1_filter_no_prior(
246    query_bytes: &[u8],
247    candidates: Vec<Snippet>,
248    opts: &SearchOptions,
249) -> Vec<Snippet> {
250    let h_q = opts.ctx.entropy_rate_bytes(query_bytes, opts.max_order);
251
252    let scored: Vec<Snippet> = candidates
253        .into_par_iter()
254        .map(|mut snippet| {
255            let h_q_x =
256                opts.ctx
257                    .cross_entropy_rate_bytes(query_bytes, &snippet.content, opts.max_order);
258            snippet.score = h_q - h_q_x;
259            snippet
260        })
261        .collect();
262
263    // Keep equivalence with old behavior by not clamping.
264    scored
265}
266
/// Stage-1 filter conditioned on a universal prior.
///
/// score(x) = H_U(q) - H_{U+x}(q): how much does adding candidate `x` to the
/// prior `U` improve prediction of the query?  Dispatches on backend:
/// 1. RWKV (feature-gated): snapshot the prior-primed runtime, absorb x, measure.
/// 2. Non-RosaPlus backends: conditional cross-entropy over explicit byte chains.
/// 3. RosaPlus: reversible train/rollback transactions against a cached model.
fn stage1_filter_with_universal_prior(
    query_bytes: &[u8],
    prior_path: &str,
    candidates: Vec<Snippet>,
    opts: &SearchOptions,
) -> Vec<Snippet> {
    #[cfg(feature = "backend-rwkv")]
    if let Some((mut base, prior_snapshot)) = rwkv_prior_snapshot(opts, prior_path) {
        // H_U(q): cross-entropy of the query under the prior-primed runtime.
        let h_u_q = {
            base.restore_runtime(&prior_snapshot);
            base.cross_entropy_from_current(query_bytes).unwrap_or(0.0)
        };
        return candidates
            .into_par_iter()
            .map_init(
                // One compressor clone per Rayon worker.
                || base.clone(),
                |m: &mut crate::rwkvzip::Compressor, mut snippet| {
                    // Rewind to the prior snapshot, then absorb only this candidate.
                    m.restore_runtime(&prior_snapshot);
                    let _ = m.absorb_chain(&[snippet.content.as_slice()]);
                    let h_ux_q = m.cross_entropy_from_current(query_bytes).unwrap_or(0.0);
                    snippet.score = h_u_q - h_ux_q;
                    snippet
                },
            )
            .collect();
    }

    if !matches!(opts.ctx.rate_backend, RateBackend::RosaPlus) {
        // Generic backends: condition on the raw prior bytes as a chain prefix.
        let prior_prefix = corpus_bytes(prior_path, SearchGranularity::File);
        let h_u_q = opts
            .ctx
            .cross_entropy_conditional_chain(&[prior_prefix.as_slice()], query_bytes);
        return candidates
            .into_par_iter()
            .map(|mut snippet| {
                let h_ux_q = opts.ctx.cross_entropy_conditional_chain(
                    &[prior_prefix.as_slice(), snippet.content.as_slice()],
                    query_bytes,
                );
                snippet.score = h_u_q - h_ux_q;
                snippet
            })
            .collect();
    }

    // PERFORMANCE NOTE:
    // Training the prior using snippet-level windows would duplicate overlapping content
    // and explode runtime. We *always* train/load the prior at file granularity.
    let mut base = load_or_train_prior_model(prior_path, opts);
    // For true conditional updates we require the fixed 256-byte alphabet LM.
    // This ensures symbol indices remain stable across incremental updates.
    base.ensure_lm_built_no_finalize_endpos();
    // Reduce the cost of cloning `base` per worker.
    base.shrink_aux_buffers();

    // Precompute query codepoints once (cross_entropy() would allocate this per call).
    let query_cps: Vec<u32> = query_bytes.iter().map(|&b| b as u32).collect();
    let h_u_q = base.cross_entropy_cps(&query_cps);

    // True conditional update:
    // score(x) = H_U(q) - H_{U+x}(q)
    // by applying a reversible candidate update to the *full* prior model.
    //
    // MEMORY NOTE:
    // `map_init(|| base.clone(), ...)` clones the model once per Rayon worker.
    // For large priors this can blow up RSS. We cap worker count based on an estimate
    // of model bytes and best-effort available memory (Linux).
    let model_bytes = base.estimated_size_bytes().max(1);
    let threads = memory_aware_threads(model_bytes);
    let pool = rayon::ThreadPoolBuilder::new()
        .num_threads(threads)
        .build()
        .expect("failed to build rayon pool");

    pool.install(|| {
        candidates
            .into_par_iter()
            .map_init(
                || base.clone(),
                |m, mut snippet| {
                    // Reversible transaction: train on the candidate, measure,
                    // then roll back so the worker's model stays at U.
                    let mut tx = m.begin_tx();
                    m.train_example_tx(&mut tx, &snippet.content);
                    let h_ux_q = m.cross_entropy_cps(&query_cps);
                    m.rollback_tx(tx);
                    snippet.score = h_u_q - h_ux_q;
                    snippet
                },
            )
            .collect()
    })
}
358
/// Build an RWKV compressor primed on the prior corpus, plus a runtime
/// snapshot workers can repeatedly rewind to.  Returns `None` when the
/// configured rate backend is not RWKV-based.
#[cfg(feature = "backend-rwkv")]
fn rwkv_prior_snapshot(
    opts: &SearchOptions,
    prior_path: &str,
) -> Option<(crate::rwkvzip::Compressor, crate::rwkvzip::RuntimeSnapshot)> {
    let mut compressor = match &opts.ctx.rate_backend {
        RateBackend::Rwkv7 { model } => crate::rwkvzip::Compressor::new_from_model(model.clone()),
        RateBackend::Rwkv7Method { method } => {
            crate::rwkvzip::Compressor::new_from_method(method).ok()?
        }
        // Any non-RWKV backend: caller falls through to other strategies.
        _ => return None,
    };

    // Prime on the whole prior at file granularity, then capture the state.
    let prior_prefix = corpus_bytes(prior_path, SearchGranularity::File);
    compressor.reset_and_prime();
    let _ = compressor.absorb_chain(&[prior_prefix.as_slice()]);
    let snapshot = compressor.snapshot_runtime();
    Some((compressor, snapshot))
}
378
379fn memory_aware_threads(model_bytes: usize) -> usize {
380    let hw = num_cpus::get().max(1);
381    let avail = linux_mem_available_bytes().unwrap_or(0);
382    if avail == 0 {
383        return hw;
384    }
385
386    // Heuristic: allow up to 25% of available memory for (worker clones + overhead).
387    let budget = (avail / 4).max(model_bytes as u64);
388    let max_by_mem = (budget / (model_bytes as u64)).max(1) as usize;
389    hw.min(max_by_mem).max(1)
390}
391
/// Best-effort `MemAvailable` from `/proc/meminfo`, in bytes (Linux only).
///
/// Returns `None` when the file is missing, the line is absent, or parsing
/// fails; callers treat that as "memory unconstrained".
fn linux_mem_available_bytes() -> Option<u64> {
    let meminfo = std::fs::read_to_string("/proc/meminfo").ok()?;
    // Line format: "MemAvailable:   12345678 kB".
    let rest = meminfo
        .lines()
        .find_map(|line| line.strip_prefix("MemAvailable:"))?;
    // First whitespace-separated token is the kB count; avoid collecting the
    // whole split into a Vec just to read one element.
    let kb: u64 = rest.split_whitespace().next()?.parse().ok()?;
    Some(kb.saturating_mul(1024))
}
407
408fn stage2_rerank_kmi(query_bytes: &[u8], top_candidates: &mut [Snippet], opts: &SearchOptions) {
409    let prior_prefix: Option<Vec<u8>> =
410        match (opts.universal_prior.as_deref(), opts.stage2_prior_mode) {
411            (None, _) => None,
412            (Some(_), Stage2PriorMode::Disable) => None,
413            (Some(prior_path), Stage2PriorMode::Use) => {
414                Some(corpus_bytes(prior_path, SearchGranularity::File))
415            }
416            (Some(prior_path), Stage2PriorMode::Summarize) => {
417                Some(summarize_prior_for_query(query_bytes, prior_path, opts))
418            }
419        };
420
421    let cq = if let Some(prefix) = prior_prefix.as_deref() {
422        opts.ctx.compress_size_chain(&[prefix, query_bytes])
423    } else {
424        opts.ctx.compress_size_chain(&[query_bytes])
425    };
426
427    top_candidates.par_iter_mut().for_each(|snippet| {
428        let cx = if let Some(prefix) = prior_prefix.as_deref() {
429            opts.ctx
430                .compress_size_chain(&[prefix, snippet.content.as_slice()])
431        } else {
432            opts.ctx.compress_size_chain(&[snippet.content.as_slice()])
433        };
434
435        let c1 = if let Some(prefix) = prior_prefix.as_deref() {
436            opts.ctx
437                .compress_size_chain(&[prefix, snippet.content.as_slice(), query_bytes])
438        } else {
439            opts.ctx
440                .compress_size_chain(&[snippet.content.as_slice(), query_bytes])
441        };
442
443        let c2 = if let Some(prefix) = prior_prefix.as_deref() {
444            opts.ctx
445                .compress_size_chain(&[prefix, query_bytes, snippet.content.as_slice()])
446        } else {
447            opts.ctx
448                .compress_size_chain(&[query_bytes, snippet.content.as_slice()])
449        };
450
451        let c_joint = c1.min(c2);
452        snippet.score = if c_joint == u64::MAX {
453            0.0
454        } else {
455            (cq as f64 + cx as f64 - c_joint as f64).max(0.0)
456        };
457    });
458}
459
460fn summarize_prior_for_query(
461    query_bytes: &[u8],
462    prior_path: &str,
463    opts: &SearchOptions,
464) -> Vec<u8> {
465    // Prior-less search inside the prior corpus itself.
466    // We approximate K(q|x) via conditional compression: min(C(xq),C(qx)) - C(x), and select the MIN.
467    let candidates = collect_candidates(prior_path, opts.granularity);
468    if candidates.is_empty() {
469        return Vec::new();
470    }
471
472    let cq = opts.ctx.compress_size_chain(&[query_bytes]);
473
474    let mut best: Option<(f64, Vec<u8>)> = None;
475    for c in candidates {
476        let cx = opts.ctx.compress_size_chain(&[c.content.as_slice()]);
477
478        let cxq = opts
479            .ctx
480            .compress_size_chain(&[c.content.as_slice(), query_bytes]);
481        let cqx = opts
482            .ctx
483            .compress_size_chain(&[query_bytes, c.content.as_slice()]);
484        let c_joint = cxq.min(cqx);
485        if c_joint == u64::MAX {
486            continue;
487        }
488        // Conditional complexity proxy.
489        let k_q_given_x = (c_joint as f64 - cx as f64).max(0.0);
490        // Tie-breaker: if equal, prefer smaller candidate.
491        let candidate_key = (k_q_given_x, cx as f64, cq as f64);
492        let is_better = match &best {
493            None => true,
494            Some((best_k, best_bytes)) => {
495                let best_cx = opts.ctx.compress_size(best_bytes) as f64;
496                (candidate_key.0, candidate_key.1) < (*best_k, best_cx)
497            }
498        };
499        if is_better {
500            best = Some((k_q_given_x, c.content));
501        }
502    }
503
504    best.map(|(_, b)| b).unwrap_or_default()
505}
506
507fn train_rosa_on_corpus(m: &mut RosaPlus, corpus_path: &str, granularity: SearchGranularity) {
508    // Train incrementally on each candidate to avoid giant concatenations.
509    for c in collect_candidates(corpus_path, granularity) {
510        if !c.content.is_empty() {
511            m.train_example(&c.content);
512        }
513    }
514}
515
/// On-disk location of the cached prior model for (`prior_path`, `max_order`),
/// or `None` when neither XDG_CACHE_HOME nor HOME is available.
fn prior_cache_path(prior_path: &str, max_order: i64) -> Option<PathBuf> {
    let cache_home = std::env::var("XDG_CACHE_HOME")
        .ok()
        .or_else(|| std::env::var("HOME").ok().map(|h| format!("{}/.cache", h)))?;
    let cache_root = PathBuf::from(cache_home).join("infotheory").join("rosa_prior");

    let mut hasher = DefaultHasher::new();
    // Cache format/version (bump when training or serialization semantics change).
    4u32.hash(&mut hasher);
    prior_path.hash(&mut hasher);
    max_order.hash(&mut hasher);
    // File granularity is baked into the key (priors always train at file level).
    "file".hash(&mut hasher);
    Some(cache_root.join(format!("prior_{:016x}.rosa", hasher.finish())))
}
535
536fn load_or_train_prior_model(prior_path: &str, opts: &SearchOptions) -> RosaPlus {
537    // Load cached prior model if present.
538    if let Some(cache_path) = prior_cache_path(prior_path, opts.max_order) {
539        if let Some(parent) = cache_path.parent() {
540            let _ = fs::create_dir_all(parent);
541        }
542        if cache_path.exists()
543            && let Ok(mut m) = RosaPlus::load(cache_path.to_string_lossy().as_ref())
544        {
545            // Ensure fixed 256-byte alphabet LM for incremental conditional updates.
546            if m.lm_alpha_n() != 256 {
547                m.build_lm_full_bytes_no_finalize_endpos();
548                let _ = m.save(cache_path.to_string_lossy().as_ref());
549            }
550            return m;
551        }
552
553        // Train + save.
554        let mut m = RosaPlus::new(opts.max_order, false, 0, 42);
555        train_rosa_on_corpus(&mut m, prior_path, SearchGranularity::File);
556        // Build a fixed-byte alphabet LM once so the saved model is the full state.
557        m.build_lm_full_bytes_no_finalize_endpos();
558        let _ = m.save(cache_path.to_string_lossy().as_ref());
559        return m;
560    }
561
562    // Fallback: no cache location available.
563    let mut m = RosaPlus::new(opts.max_order, false, 0, 42);
564    train_rosa_on_corpus(&mut m, prior_path, SearchGranularity::File);
565    m
566}
567
568fn corpus_bytes(corpus_path: &str, granularity: SearchGranularity) -> Vec<u8> {
569    // Compression prior prefix requires a concrete byte buffer.
570    // We join candidates with a simple delimiter to preserve boundaries.
571    let mut out = Vec::new();
572    for c in collect_candidates(corpus_path, granularity) {
573        if c.content.is_empty() {
574            continue;
575        }
576        out.extend_from_slice(&c.content);
577        out.extend_from_slice(b"\n\n");
578    }
579    out
580}
581
582fn collect_candidates(target: &str, granularity: SearchGranularity) -> Vec<Snippet> {
583    let mut snippets = Vec::new();
584    let path = Path::new(target);
585
586    if path.exists() {
587        if path.is_file() {
588            snippets.extend(file_to_candidates(path, granularity));
589        } else if path.is_dir() {
590            visit_dirs(path, &mut snippets, granularity);
591        }
592    }
593
594    snippets
595}
596
597fn visit_dirs(dir: &Path, snippets: &mut Vec<Snippet>, granularity: SearchGranularity) {
598    if let Ok(entries) = fs::read_dir(dir) {
599        for entry in entries.flatten() {
600            let path = entry.path();
601            if path.is_dir() {
602                if let Some(name_str) = path.file_name().and_then(|n| n.to_str())
603                    && !name_str.starts_with('.')
604                {
605                    visit_dirs(&path, snippets, granularity);
606                }
607            } else {
608                snippets.extend(file_to_candidates(&path, granularity));
609            }
610        }
611    }
612}
613
614fn file_to_candidates(path: &Path, granularity: SearchGranularity) -> Vec<Snippet> {
615    let mut snippets = Vec::new();
616
617    // Only process text files
618    if let Some(ext) = path.extension() {
619        let ext_str = ext.to_string_lossy();
620        if matches!(
621            ext_str.as_ref(),
622            "o" | "a" | "so" | "dll" | "exe" | "bin" | "png" | "jpg" | "zip" | "gz"
623        ) {
624            return snippets;
625        }
626    }
627
628    match granularity {
629        SearchGranularity::File => {
630            if let Ok(bytes) = fs::read(path)
631                && !bytes.is_empty()
632            {
633                // Best-effort line count for `sed` output.
634                let lines = bytes.iter().filter(|&&b| b == b'\n').count() + 1;
635                snippets.push(Snippet {
636                    path: path.to_path_buf(),
637                    start_line: 1,
638                    end_line: lines.max(1),
639                    content: bytes,
640                    score: 0.0,
641                });
642            }
643        }
644        SearchGranularity::Snippet => {
645            if let Ok(bytes) = fs::read(path) {
646                if bytes.is_empty() {
647                    return snippets;
648                }
649
650                let window = 50usize;
651                let stride = 20usize;
652
653                let mut line_starts: Vec<usize> = Vec::new();
654                line_starts.push(0);
655                for (i, &b) in bytes.iter().enumerate() {
656                    if b == b'\n' {
657                        let next = i + 1;
658                        if next < bytes.len() {
659                            line_starts.push(next);
660                        }
661                    }
662                }
663
664                if line_starts.is_empty() {
665                    return snippets;
666                }
667
668                let mut i = 0usize;
669                while i < line_starts.len() {
670                    let end = (i + window).min(line_starts.len());
671                    let start_b = line_starts[i];
672                    let end_b = if end >= line_starts.len() {
673                        bytes.len()
674                    } else {
675                        line_starts[end]
676                    };
677
678                    if end_b > start_b {
679                        let content = bytes[start_b..end_b].to_vec();
680                        if content.len() > 50 {
681                            snippets.push(Snippet {
682                                path: path.to_path_buf(),
683                                start_line: i + 1,
684                                end_line: end,
685                                content,
686                                score: 0.0,
687                            });
688                        }
689                    }
690
691                    if end == line_starts.len() {
692                        break;
693                    }
694                    i += stride;
695                }
696            }
697        }
698    }
699    snippets
700}
701
#[cfg(test)]
mod tests {
    use super::*;
    use std::time::{SystemTime, UNIX_EPOCH};

    /// Unique-ish temp path per test invocation (nanosecond-timestamp suffix
    /// avoids collisions between parallel test runs).
    fn temp_path(prefix: &str) -> PathBuf {
        let nanos = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .expect("clock before epoch")
            .as_nanos();
        std::env::temp_dir().join(format!("infotheory-search-{prefix}-{nanos}"))
    }

    #[test]
    fn resolve_query_bytes_prefers_file_contents() {
        // When the query string names an existing file, its bytes win over
        // the literal query text.
        let path = temp_path("query");
        fs::write(&path, b"query-from-file").expect("write query file");
        let got = resolve_query_bytes(path.to_string_lossy().as_ref());
        assert_eq!(got, b"query-from-file");
        let _ = fs::remove_file(path);
    }

    #[test]
    fn file_to_candidates_skips_binary_extensions() {
        // Extension alone decides: content is irrelevant for the skip list.
        let path = temp_path("binary").with_extension("png");
        fs::write(&path, b"not-actually-image").expect("write pseudo-binary");
        let out = file_to_candidates(&path, SearchGranularity::File);
        assert!(out.is_empty(), "binary extension should be skipped");
        let _ = fs::remove_file(path);
    }

    #[test]
    fn file_to_candidates_generates_snippets() {
        // 120 lines > one 50-line window, so windowing must emit candidates
        // with sane (start <= end) line ranges.
        let path = temp_path("snippet").with_extension("txt");
        let mut text = String::new();
        for i in 0..120 {
            text.push_str(&format!("line-{i:03}\n"));
        }
        fs::write(&path, text.as_bytes()).expect("write snippet file");
        let out = file_to_candidates(&path, SearchGranularity::Snippet);
        assert!(!out.is_empty(), "expected snippet candidates");
        assert!(out.iter().all(|s| s.end_line >= s.start_line));
        let _ = fs::remove_file(path);
    }

    #[test]
    fn collect_candidates_skips_hidden_directories() {
        // The walker prunes dot-directories, so only visible/public.txt is seen.
        let root = temp_path("tree");
        let hidden = root.join(".hidden");
        let visible = root.join("visible");
        fs::create_dir_all(&hidden).expect("create hidden dir");
        fs::create_dir_all(&visible).expect("create visible dir");
        fs::write(hidden.join("secret.txt"), b"hidden").expect("write hidden file");
        fs::write(visible.join("public.txt"), b"visible\ntext\n").expect("write visible file");

        let out = collect_candidates(root.to_string_lossy().as_ref(), SearchGranularity::File);
        assert_eq!(out.len(), 1, "only visible file should be collected");
        assert!(
            out[0].path.to_string_lossy().contains("public.txt"),
            "unexpected collected file path: {}",
            out[0].path.display()
        );

        let _ = fs::remove_dir_all(root);
    }

    #[test]
    fn stage0_prefilter_respects_topk_floor() {
        // With frac=0.1 over 10 candidates, the raw keep count would be 1;
        // the top_k=4 floor must win.
        let mut candidates = Vec::new();
        for i in 0..10 {
            candidates.push(Snippet {
                path: PathBuf::from(format!("f{i}.txt")),
                start_line: 1,
                end_line: 1,
                content: format!("candidate-{i}").into_bytes(),
                score: 0.0,
            });
        }
        let opts = SearchOptions {
            top_k: 4,
            stage0_keep_frac: 0.1,
            ..SearchOptions::default()
        };
        let kept = stage0_prefilter(b"candidate", candidates, &opts, false);
        assert!(
            kept.len() >= 4,
            "stage0 must keep at least top_k candidates, got {}",
            kept.len()
        );
    }
}