• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

jzombie / rust-triplets / 23559372400

25 Mar 2026 07:13PM UTC coverage: 93.193% (-1.6%) from 94.794%
23559372400

Pull #40

github

web-flow
Merge 5e64594c1 into 65addee9d
Pull Request #40: Refactor BM25 integration

2490 of 2828 new or added lines in 6 files covered. (88.05%)

26 existing lines in 1 file now uncovered.

15142 of 16248 relevant lines covered (93.19%)

135830.15 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

96.34
/src/example_apps.rs
1
// TODO: Consider extracting to a debug crate
2

3
use std::collections::HashMap;
4
use std::error::Error;
5
use std::path::PathBuf;
6
use std::sync::Arc;
7
use std::sync::Once;
8
use std::time::Instant;
9

10
use cache_manager::CacheRoot;
11
use clap::{Parser, ValueEnum, error::ErrorKind};
12

13
use crate::config::{ChunkingStrategy, SamplerConfig, TripletRecipe};
14
use crate::constants::cache::{MULTI_SOURCE_DEMO_GROUP, MULTI_SOURCE_DEMO_STORE_FILENAME};
15
use crate::data::ChunkView;
16
use crate::heuristics::{
17
    CapacityTotals, EFFECTIVE_NEGATIVES_PER_ANCHOR, EFFECTIVE_POSITIVES_PER_ANCHOR,
18
    estimate_source_split_capacity_from_counts, format_replay_factor, format_u128_with_commas,
19
    resolve_text_recipes_for_source, split_counts_for_total,
20
};
21
use crate::metrics::source_skew;
22
use crate::sampler::chunk_weight;
23
use crate::source::DataSource;
24
use crate::splits::{FileSplitStore, SplitLabel, SplitRatios, SplitStore};
25
use crate::{
26
    RecordChunk, SampleBatch, Sampler, SamplerError, SourceId, TextBatch, TextRecipe, TripletBatch,
27
    TripletSampler,
28
};
29

30
type DynSource = Box<dyn DataSource + 'static>;
31

32
fn managed_demo_split_store_path() -> Result<PathBuf, String> {
2✔
33
    let cache_root = CacheRoot::from_discovery()
2✔
34
        .map_err(|err| format!("failed discovering managed cache root: {err}"))?;
2✔
35
    let group = PathBuf::from(MULTI_SOURCE_DEMO_GROUP);
2✔
36
    let dir = cache_root.ensure_group(&group).map_err(|err| {
2✔
37
        format!(
×
38
            "failed creating managed demo cache group '{}': {err}",
39
            group.display()
×
40
        )
41
    })?;
×
42
    Ok(dir.join(MULTI_SOURCE_DEMO_STORE_FILENAME))
2✔
43
}
2✔
44

45
fn init_example_tracing() {
21✔
46
    static INIT: Once = Once::new();
47
    INIT.call_once(|| {
21✔
48
        let env_filter = tracing_subscriber::EnvFilter::try_from_default_env()
1✔
49
            .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("triplets=info"));
1✔
50
        let _ = tracing_subscriber::fmt()
1✔
51
            .with_env_filter(env_filter)
1✔
52
            .try_init();
1✔
53
    });
1✔
54
}
21✔
55

56
#[derive(Debug, Clone, Copy, ValueEnum)]
/// CLI split selector mapped onto `SplitLabel`.
// Type of the optional `--split` flag on `MultiSourceDemoCli`. Each variant is
// converted to the identically named `SplitLabel` variant by
// `From<SplitArg> for SplitLabel`.
// NOTE(review): variant notes use plain `//` comments on purpose; clap's derive
// can surface `///` comments as help text, which would change CLI output.
enum SplitArg {
    // Maps to `SplitLabel::Train`.
    Train,
    // Maps to `SplitLabel::Validation`.
    Validation,
    // Maps to `SplitLabel::Test`.
    Test,
}
63

64
impl From<SplitArg> for SplitLabel {
65
    fn from(value: SplitArg) -> Self {
6✔
66
        match value {
6✔
67
            SplitArg::Train => SplitLabel::Train,
1✔
68
            SplitArg::Validation => SplitLabel::Validation,
4✔
69
            SplitArg::Test => SplitLabel::Test,
1✔
70
        }
71
    }
6✔
72
}
73

74
#[derive(Debug, Parser)]
#[command(
    name = "estimate_capacity",
    disable_help_subcommand = true,
    about = "Metadata-only capacity estimation",
    long_about = "Estimate record, pair, triplet, and text-sample capacity using source-reported counts only (no data refresh).",
    after_help = "Source roots are optional and resolved in order by explicit arg, environment variables, then project defaults."
)]
/// CLI arguments for metadata-only capacity estimation.
// Parsed by `run_estimate_capacity`, which prepends the program name
// "estimate_capacity" to the caller-supplied argument iterator.
// NOTE(review): field notes below are plain `//` comments so they cannot leak
// into clap-generated help; the user-facing text lives in the `help = ...` attributes.
struct EstimateCapacityCli {
    // Copied into `SamplerConfig::seed` and echoed in the report as "split seed".
    #[arg(
        long,
        default_value_t = 99,
        help = "Deterministic seed used for split allocation"
    )]
    seed: u64,
    // Parsed by `parse_split_ratios_arg`; copied into `SamplerConfig::split` and
    // passed to `split_counts_for_total` for every source.
    #[arg(
        long = "split-ratios",
        value_name = "TRAIN,VALIDATION,TEST",
        value_parser = parse_split_ratios_arg,
        default_value = "0.8,0.1,0.1",
        help = "Comma-separated split ratios that must sum to 1.0"
    )]
    split: SplitRatios,
    // Handed unchanged to the injected `resolve_roots` callback.
    #[arg(
        long = "source-root",
        value_name = "PATH",
        help = "Optional source root override, repeat as needed in source order"
    )]
    source_roots: Vec<String>,
}
105

106
#[derive(Debug, Parser)]
#[command(
    name = "multi_source_demo",
    disable_help_subcommand = true,
    about = "Run sampled batches from multiple sources",
    long_about = "Sample triplet, pair, or text batches from multiple sources and persist split/epoch state.",
    after_help = "Source roots are optional and resolved in order by explicit arg, environment variables, then project defaults."
)]
/// CLI for `multi_source_demo`.
///
/// Common usage:
/// - Use managed cache-group default path (no flag)
/// - Set an explicit file path: `--split-store-path /tmp/split_store.bin`
/// - Repeat `--source-root <PATH>` to override source roots in order
// Mode flags are not declared mutually exclusive. `run_multi_source_demo` checks
// them in this order and runs only the first match:
//   1. `--pair-batch`  2. `--text-recipes`  3. `--list-text-recipes`
//   4. `--batches N`   5. (none) a single triplet batch.
// NOTE(review): field notes below are plain `//` comments so they cannot leak
// into clap-generated help; the user-facing text lives in the `help = ...` attributes.
struct MultiSourceDemoCli {
    // Second in mode precedence: sample one text batch.
    #[arg(
        long = "text-recipes",
        help = "Emit a text batch instead of a triplet batch"
    )]
    show_text_samples: bool,
    // First in mode precedence: sample one pair batch.
    #[arg(
        long = "pair-batch",
        help = "Emit a pair batch instead of a triplet batch"
    )]
    show_pair_samples: bool,
    // Third in mode precedence: print the sampler's text recipes, no sampling.
    #[arg(
        long = "list-text-recipes",
        help = "Print registered text recipes and exit"
    )]
    list_text_recipes: bool,
    // Validated by `parse_positive_usize`; copied into `SamplerConfig::batch_size`.
    #[arg(
        long = "batch-size",
        default_value_t = 4,
        value_parser = parse_positive_usize,
        help = "Batch size used for sampling"
    )]
    batch_size: usize,
    // When absent, the seed from `SamplerConfig::default()` is kept.
    #[arg(long, help = "Optional deterministic seed override")]
    seed: Option<u64>,
    // When absent, `SplitLabel::Train` is sampled.
    #[arg(long, value_enum, help = "Target split to sample from")]
    split: Option<SplitArg>,
    // Handed unchanged to the injected `resolve_roots` callback.
    #[arg(
        long = "source-root",
        value_name = "PATH",
        help = "Optional source root override, repeat as needed in source order"
    )]
    source_roots: Vec<String>,
    // When absent, the path comes from `managed_demo_split_store_path()`.
    #[arg(
        long = "split-store-path",
        value_name = "SPLIT_STORE_PATH",
        help = "Optional explicit path for persisted split/epoch state file"
    )]
    split_store_path: Option<PathBuf>,
    // Applied before the split store is opened.
    #[arg(
        long = "reset",
        help = "Delete the persisted split/epoch state before sampling, restarting from epoch 0"
    )]
    reset: bool,
    // Fourth in mode precedence; validated by `parse_positive_usize`.
    #[arg(
        long = "batches",
        value_name = "N",
        value_parser = parse_positive_usize,
        help = "Run N triplet batches in succession, printing a timing line per batch and (with --features extended-metrics) a per-source similarity summary at the end"
    )]
    batches: Option<usize>,
}
172

173
#[derive(Debug, Clone)]
/// Source-level inventory used by capacity estimation output.
///
/// One value is built per source in `run_estimate_capacity` before any split
/// or capacity arithmetic is done.
struct SourceInventory {
    /// The source's `id()` rendered as an owned string; also used as part of
    /// the lookup key for the per-source/per-split maps.
    source_id: String,
    /// Value returned by the source's `reported_record_count(&config)`.
    reported_records: u128,
    /// The sampler config's recipes when that list is non-empty, otherwise the
    /// source's `default_triplet_recipes()`.
    triplet_recipes: Vec<TripletRecipe>,
}
180

181
/// Run the capacity-estimation CLI with injectable root resolution/source builders.
///
/// `build_sources` is construction-only; sampler configuration is applied
/// centrally by this function before any source calls.
///
/// Flow: parse `EstimateCapacityCli` from `args_iter` (the program name
/// "estimate_capacity" is prepended here), resolve roots, build sources, ask
/// each source for its reported record count, derive per-split record counts
/// and capacity totals, then print the report to stdout.
///
/// # Errors
/// Propagates errors from CLI parsing and `resolve_roots`, and fails with a
/// formatted message when any source cannot report its record count.
pub fn run_estimate_capacity<R, Resolve, Build, I>(
    args_iter: I,
    resolve_roots: Resolve,
    build_sources: Build,
) -> Result<(), Box<dyn Error>>
where
    Resolve: FnOnce(Vec<String>) -> Result<R, Box<dyn Error>>,
    Build: FnOnce(&R) -> Vec<DynSource>,
    I: Iterator<Item = String>,
{
    init_example_tracing();

    // `None` from `parse_cli` ends the run successfully with no report
    // (presumably help/version output was already handled — confirm in `parse_cli`).
    let Some(cli) = parse_cli::<EstimateCapacityCli, _>(
        std::iter::once("estimate_capacity".to_string()).chain(args_iter),
    )?
    else {
        return Ok(());
    };

    let roots = resolve_roots(cli.source_roots)?;

    // Only seed and split ratios come from the CLI; everything else is default.
    let config = SamplerConfig {
        seed: cli.seed,
        split: cli.split,
        ..SamplerConfig::default()
    };

    let sources = build_sources(&roots);

    // Collect one inventory row per source; the first failing source aborts the run.
    let mut inventories = Vec::new();
    for source in &sources {
        let recipes = if config.recipes.is_empty() {
            source.default_triplet_recipes()
        } else {
            config.recipes.clone()
        };
        let reported_records = source.reported_record_count(&config).map_err(|err| {
            format!(
                "source '{}' failed to report exact record count: {err}",
                source.id()
            )
        })?;
        inventories.push(SourceInventory {
            source_id: source.id().to_string(),
            reported_records,
            triplet_recipes: recipes,
        });
    }

    // Record counts keyed by (source id, split), plus the per-split sum over sources.
    let mut per_source_split_counts: HashMap<(String, SplitLabel), u128> = HashMap::new();
    let mut split_record_counts: HashMap<SplitLabel, u128> = HashMap::new();

    for source in &inventories {
        let counts = split_counts_for_total(source.reported_records, cli.split);
        for (label, count) in counts {
            per_source_split_counts.insert((source.source_id.clone(), label), count);
            *split_record_counts.entry(label).or_insert(0) += count;
        }
    }

    // Capacity totals per split (summed over sources) and per (source id, split).
    let mut totals_by_split: HashMap<SplitLabel, CapacityTotals> = HashMap::new();
    let mut totals_by_source_and_split: HashMap<(String, SplitLabel), CapacityTotals> =
        HashMap::new();

    for split_label in [SplitLabel::Train, SplitLabel::Validation, SplitLabel::Test] {
        let mut totals = CapacityTotals::default();

        for source in &inventories {
            let source_split_records = per_source_split_counts
                .get(&(source.source_id.clone(), split_label))
                .copied()
                .unwrap_or(0);

            let triplet_recipes = &source.triplet_recipes;
            let text_recipes = resolve_text_recipes_for_source(&config, triplet_recipes);

            let capacity = estimate_source_split_capacity_from_counts(
                source_split_records,
                triplet_recipes,
                &text_recipes,
            );

            totals_by_source_and_split.insert((source.source_id.clone(), split_label), capacity);

            totals.triplets += capacity.triplets;
            totals.effective_triplets += capacity.effective_triplets;
            totals.pairs += capacity.pairs;
            totals.text_samples += capacity.text_samples;
        }

        totals_by_split.insert(split_label, totals);
    }

    // Smallest non-zero per-source record count in each split (0 when every
    // source is empty there); feeds the "small-source-boost" weight.
    let min_nonzero_records_by_split: HashMap<SplitLabel, u128> =
        [SplitLabel::Train, SplitLabel::Validation, SplitLabel::Test]
            .into_iter()
            .map(|split_label| {
                let min_nonzero = inventories
                    .iter()
                    .filter_map(|source| {
                        per_source_split_counts
                            .get(&(source.source_id.clone(), split_label))
                            .copied()
                    })
                    .filter(|&records| records > 0)
                    .min()
                    .unwrap_or(0);
                (split_label, min_nonzero)
            })
            .collect();

    // Same idea across all splits, using each source's total reported records.
    let min_nonzero_records_all_splits = inventories
        .iter()
        .map(|source| source.reported_records)
        .filter(|&records| records > 0)
        .min()
        .unwrap_or(0);

    // ---- Report header ----
    println!("=== capacity estimate (length-only) ===");
    println!("mode: metadata-only (no source.refresh calls)");
    println!("classification: heuristic approximation (not exact)");
    println!("split seed: {}", cli.seed);
    println!(
        "split ratios: train={:.4}, validation={:.4}, test={:.4}",
        cli.split.train, cli.split.validation, cli.split.test
    );
    println!();

    // ---- Reported record count per source ----
    println!("[SOURCES]");
    for source in &inventories {
        println!(
            "  {} => reported records: {}",
            source.source_id,
            format_u128_with_commas(source.reported_records)
        );
    }
    println!();

    // ---- Per-source section: one sub-block per split, then the source's grand total ----
    println!("[PER SOURCE BREAKDOWN]");
    for source in &inventories {
        println!("  {}", source.source_id);
        let mut source_grand = CapacityTotals::default();
        let mut source_total_records = 0u128;
        for split_label in [SplitLabel::Train, SplitLabel::Validation, SplitLabel::Test] {
            let split_records = per_source_split_counts
                .get(&(source.source_id.clone(), split_label))
                .copied()
                .unwrap_or(0);
            source_total_records = source_total_records.saturating_add(split_records);
            // Largest record count any source has in this split; baseline for
            // the replay factor and the proportional-size weight.
            let split_longest_records = inventories
                .iter()
                .map(|candidate| {
                    per_source_split_counts
                        .get(&(candidate.source_id.clone(), split_label))
                        .copied()
                        .unwrap_or(0)
                })
                .max()
                .unwrap_or(0);
            let totals = totals_by_source_and_split
                .get(&(source.source_id.clone(), split_label))
                .copied()
                .unwrap_or_default();
            source_grand.triplets += totals.triplets;
            source_grand.effective_triplets += totals.effective_triplets;
            source_grand.pairs += totals.pairs;
            source_grand.text_samples += totals.text_samples;
            println!("    [{:?}]", split_label);
            println!("      records: {}", format_u128_with_commas(split_records));
            println!(
                "      triplet combinations: {}",
                format_u128_with_commas(totals.triplets)
            );
            println!(
                "      effective sampled triplets (p={}, k={}): {}",
                EFFECTIVE_POSITIVES_PER_ANCHOR,
                EFFECTIVE_NEGATIVES_PER_ANCHOR,
                format_u128_with_commas(totals.effective_triplets)
            );
            println!(
                "      pair combinations:    {}",
                format_u128_with_commas(totals.pairs)
            );
            println!(
                "      text samples:         {}",
                format_u128_with_commas(totals.text_samples)
            );
            println!(
                "      replay factor vs longest source: {}",
                format_replay_factor(split_longest_records, split_records)
            );
            println!(
                "      suggested proportional-size batch weight (0-1): {:.4}",
                suggested_balancing_weight(split_longest_records, split_records)
            );
            let split_smallest_nonzero = min_nonzero_records_by_split
                .get(&split_label)
                .copied()
                .unwrap_or(0);
            println!(
                "      suggested small-source-boost batch weight (0-1): {:.4}",
                suggested_oversampling_weight(split_smallest_nonzero, split_records)
            );
            println!();
        }
        // Largest total reported record count among all sources.
        let longest_source_total = inventories
            .iter()
            .map(|candidate| candidate.reported_records)
            .max()
            .unwrap_or(0);
        println!("    [ALL SPLITS FOR SOURCE]");
        println!(
            "      triplet combinations: {}",
            format_u128_with_commas(source_grand.triplets)
        );
        println!(
            "      effective sampled triplets (p={}, k={}): {}",
            EFFECTIVE_POSITIVES_PER_ANCHOR,
            EFFECTIVE_NEGATIVES_PER_ANCHOR,
            format_u128_with_commas(source_grand.effective_triplets)
        );
        println!(
            "      pair combinations:    {}",
            format_u128_with_commas(source_grand.pairs)
        );
        println!(
            "      text samples:         {}",
            format_u128_with_commas(source_grand.text_samples)
        );
        println!(
            "      replay factor vs longest source: {}",
            format_replay_factor(longest_source_total, source_total_records)
        );
        println!(
            "      suggested proportional-size batch weight (0-1): {:.4}",
            suggested_balancing_weight(longest_source_total, source_total_records)
        );
        println!(
            "      suggested small-source-boost batch weight (0-1): {:.4}",
            suggested_oversampling_weight(min_nonzero_records_all_splits, source_total_records)
        );
        println!();
    }

    // ---- Per-split totals across all sources, accumulated into `grand` ----
    let mut grand = CapacityTotals::default();
    for split_label in [SplitLabel::Train, SplitLabel::Validation, SplitLabel::Test] {
        let record_count = split_record_counts.get(&split_label).copied().unwrap_or(0);
        let totals = totals_by_split
            .get(&split_label)
            .copied()
            .unwrap_or_default();

        grand.triplets += totals.triplets;
        grand.effective_triplets += totals.effective_triplets;
        grand.pairs += totals.pairs;
        grand.text_samples += totals.text_samples;

        println!("[{:?}]", split_label);
        println!("  records: {}", format_u128_with_commas(record_count));
        println!(
            "  triplet combinations: {}",
            format_u128_with_commas(totals.triplets)
        );
        println!(
            "  effective sampled triplets (p={}, k={}): {}",
            EFFECTIVE_POSITIVES_PER_ANCHOR,
            EFFECTIVE_NEGATIVES_PER_ANCHOR,
            format_u128_with_commas(totals.effective_triplets)
        );
        println!(
            "  pair combinations:    {}",
            format_u128_with_commas(totals.pairs)
        );
        println!(
            "  text samples:         {}",
            format_u128_with_commas(totals.text_samples)
        );
        println!();
    }

    // ---- Grand totals and explanatory footnotes ----
    println!("[ALL SPLITS TOTAL]");
    println!(
        "  triplet combinations: {}",
        format_u128_with_commas(grand.triplets)
    );
    println!(
        "  effective sampled triplets (p={}, k={}): {}",
        EFFECTIVE_POSITIVES_PER_ANCHOR,
        EFFECTIVE_NEGATIVES_PER_ANCHOR,
        format_u128_with_commas(grand.effective_triplets)
    );
    println!(
        "  pair combinations:    {}",
        format_u128_with_commas(grand.pairs)
    );
    println!(
        "  text samples:         {}",
        format_u128_with_commas(grand.text_samples)
    );
    println!();
    println!(
        "Note: counts are heuristic, length-based estimates from source-reported totals and recipe structure. They are approximate, not exact, and assume anchor-positive pairs=records (one positive per anchor by default), negatives=source_records_in_split-1 (anchor excluded as its own negative), and at most one chunk/window realization per sample. In real-world chunked sampling, practical combinations are often higher, so treat this as a floor-like baseline."
    );
    println!();
    println!(
        "Effective sampled triplets apply a bounded training assumption: effective_triplets = records * p * k per triplet recipe, with defaults p={} positives per anchor and k={} negatives per anchor.",
        EFFECTIVE_POSITIVES_PER_ANCHOR, EFFECTIVE_NEGATIVES_PER_ANCHOR
    );
    println!();
    println!(
        "Oversample loops are not inferred from this static report. To measure true oversampling (how many times sampling loops through the combination space), use observed sampled draw counts from an actual run."
    );
    println!();
    println!(
        "Suggested proportional-size batch weight (0-1) is source/max_source by record count: 1.0 for the largest source in scope, smaller values for smaller sources."
    );
    println!();
    println!(
        "Suggested small-source-boost batch weight (0-1) is min_nonzero_source/source by record count: 1.0 for the smallest non-zero source in scope, smaller values for larger sources."
    );
    println!();
    println!(
        "When passed to next_*_batch_with_weights, higher weight means that source is sampled more often relative to lower-weight sources."
    );

    Ok(())
}
4✔
512

513
/// Run the multi-source demo CLI with injectable root resolution/source builders.
514
///
515
/// `build_sources` is construction-only. Source sampler configuration is owned
516
/// by sampler registration (`TripletSampler::register_source`).
517
pub fn run_multi_source_demo<R, Resolve, Build, I>(
17✔
518
    args_iter: I,
17✔
519
    resolve_roots: Resolve,
17✔
520
    build_sources: Build,
17✔
521
) -> Result<(), Box<dyn Error>>
17✔
522
where
17✔
523
    Resolve: FnOnce(Vec<String>) -> Result<R, Box<dyn Error>>,
17✔
524
    Build: FnOnce(&R) -> Vec<DynSource>,
17✔
525
    I: Iterator<Item = String>,
17✔
526
{
527
    init_example_tracing();
17✔
528

529
    let Some(cli) = parse_cli::<MultiSourceDemoCli, _>(
17✔
530
        std::iter::once("multi_source_demo".to_string()).chain(args_iter),
17✔
UNCOV
531
    )?
×
532
    else {
533
        return Ok(());
1✔
534
    };
535

536
    let roots = resolve_roots(cli.source_roots)?;
16✔
537

538
    let mut config = SamplerConfig::default();
15✔
539
    config.seed = cli.seed.unwrap_or(config.seed);
15✔
540
    config.batch_size = cli.batch_size;
15✔
541
    config.chunking = Default::default();
15✔
542
    let selected_split = cli.split.map(Into::into).unwrap_or(SplitLabel::Train);
15✔
543
    config.split = SplitRatios::default();
15✔
544
    config.allowed_splits = vec![selected_split];
15✔
545
    let chunking = config.chunking.clone();
15✔
546
    let config_snapshot = MultiSourceDemoConfigSnapshot {
15✔
547
        seed: config.seed,
15✔
548
        batch_size: config.batch_size,
15✔
549
        ingestion_max_records: config.ingestion_max_records,
15✔
550
        split: selected_split,
15✔
551
        split_ratios: config.split,
15✔
552
        max_window_tokens: config.chunking.max_window_tokens,
15✔
553
        overlap_tokens: config.chunking.overlap_tokens.clone(),
15✔
554
        summary_fallback_tokens: config.chunking.summary_fallback_tokens,
15✔
555
    };
15✔
556

557
    let split_store_path = if let Some(path) = cli.split_store_path {
15✔
558
        path
14✔
559
    } else {
560
        managed_demo_split_store_path().map_err(|err| {
1✔
561
            Box::<dyn Error>::from(format!("failed to resolve demo split-store path: {err}"))
×
562
        })?
×
563
    };
564

565
    if cli.reset && split_store_path.exists() {
15✔
566
        std::fs::remove_file(&split_store_path).map_err(|err| {
2✔
567
            Box::<dyn Error>::from(format!(
1✔
568
                "failed to remove split store '{}': {err}",
1✔
569
                split_store_path.display()
1✔
570
            ))
1✔
571
        })?;
1✔
572
        println!("Reset: removed {}", split_store_path.display());
1✔
573
    }
13✔
574
    println!(
14✔
575
        "Persisting split assignments and epoch state to {}",
576
        split_store_path.display()
14✔
577
    );
578
    let sources = build_sources(&roots);
14✔
579
    let split_store = Arc::new(FileSplitStore::open(&split_store_path, config.split, 99)?);
14✔
580
    let sampler = TripletSampler::new(config, split_store.clone());
14✔
581
    for source in sources {
14✔
582
        sampler.register_source(source);
14✔
583
    }
14✔
584

585
    if cli.show_pair_samples {
14✔
586
        match sampler.next_pair_batch(selected_split) {
4✔
587
            Ok(pair_batch) => {
1✔
588
                if pair_batch.pairs.is_empty() {
1✔
UNCOV
589
                    println!("Pair sampling produced no results.");
×
590
                } else {
1✔
591
                    print_pair_batch(&chunking, &pair_batch, split_store.as_ref());
1✔
592
                }
1✔
593
                sampler.save_sampler_state(None)?;
1✔
594
            }
595
            Err(SamplerError::Exhausted(name)) => {
3✔
596
                eprintln!(
3✔
597
                    "Pair sampler exhausted recipe '{}'. Ensure both positive and negative examples exist.",
3✔
598
                    name
3✔
599
                );
3✔
600
            }
3✔
601
            Err(err) => return Err(err.into()),
×
602
        }
603
    } else if cli.show_text_samples {
10✔
604
        match sampler.next_text_batch(selected_split) {
3✔
605
            Ok(text_batch) => {
1✔
606
                if text_batch.samples.is_empty() {
1✔
UNCOV
607
                    println!(
×
UNCOV
608
                        "Text sampling produced no results. Ensure each source has eligible sections."
×
609
                    );
×
610
                } else {
1✔
611
                    print_text_batch(&chunking, &text_batch, split_store.as_ref());
1✔
612
                }
1✔
613
                sampler.save_sampler_state(None)?;
1✔
614
            }
615
            Err(SamplerError::Exhausted(name)) => {
2✔
616
                eprintln!(
2✔
617
                    "Text sampler exhausted selector '{}'. Ensure matching sections exist.",
2✔
618
                    name
2✔
619
                );
2✔
620
            }
2✔
UNCOV
621
            Err(err) => return Err(err.into()),
×
622
        }
623
    } else if cli.list_text_recipes {
7✔
624
        let recipes = sampler.text_recipes();
3✔
625
        if recipes.is_empty() {
3✔
626
            println!(
1✔
627
                "No text recipes registered. Ensure your sources expose triplet selectors or configure text_recipes explicitly."
1✔
628
            );
1✔
629
        } else {
2✔
630
            print_text_recipes(&recipes);
2✔
631
        }
2✔
632
    } else if let Some(batch_count) = cli.batches {
4✔
633
        print_demo_config(&config_snapshot);
1✔
634
        println!("=== benchmark: {} triplet batches ===", batch_count);
1✔
635

636
        // source_id -> Vec<(pos_jaccard, pos_byte_cosine, neg_jaccard, neg_byte_cosine)>
637
        #[cfg(feature = "extended-metrics")]
638
        let mut source_metrics: HashMap<String, Vec<(f32, f32, f32, f32)>> = HashMap::new();
1✔
639

640
        for i in 0..batch_count {
2✔
641
            let t0 = Instant::now();
2✔
642
            match sampler.next_triplet_batch(selected_split) {
2✔
643
                Ok(batch) => {
2✔
644
                    let elapsed = t0.elapsed();
2✔
645
                    let n = batch.triplets.len();
2✔
646
                    println!(
2✔
647
                        "batch {:>4}  triplets={:<4}  elapsed={:>8.2}ms  per_triplet={:.2}ms",
648
                        i + 1,
2✔
649
                        n,
650
                        elapsed.as_secs_f64() * 1000.0,
2✔
651
                        if n > 0 {
2✔
652
                            elapsed.as_secs_f64() * 1000.0 / n as f64
2✔
653
                        } else {
NEW
654
                            0.0
×
655
                        },
656
                    );
657
                    #[cfg(feature = "extended-metrics")]
658
                    {
659
                        use crate::metrics::lexical_similarity_scores;
660
                        for triplet in &batch.triplets {
8✔
661
                            let (pj, pc) = lexical_similarity_scores(
8✔
662
                                &triplet.anchor.text,
8✔
663
                                &triplet.positive.text,
8✔
664
                            );
8✔
665
                            let (nj, nc) = lexical_similarity_scores(
8✔
666
                                &triplet.anchor.text,
8✔
667
                                &triplet.negative.text,
8✔
668
                            );
8✔
669
                            let source = extract_source(&triplet.anchor.record_id);
8✔
670
                            source_metrics
8✔
671
                                .entry(source)
8✔
672
                                .or_default()
8✔
673
                                .push((pj, pc, nj, nc));
8✔
674
                        }
8✔
675
                    }
676
                }
NEW
677
                Err(SamplerError::Exhausted(name)) => {
×
NEW
678
                    println!(
×
679
                        "batch {:>4}  exhausted recipe '{}' — stopping early",
NEW
680
                        i + 1,
×
681
                        name
682
                    );
NEW
683
                    break;
×
684
                }
NEW
685
                Err(err) => return Err(err.into()),
×
686
            }
687
        }
688

689
        sampler.save_sampler_state(None)?;
1✔
690

691
        #[cfg(feature = "extended-metrics")]
692
        if !source_metrics.is_empty() {
1✔
693
            println!();
1✔
694
            print_metric_summary(&source_metrics);
1✔
695
        }
1✔
696

697
        #[cfg(all(feature = "extended-metrics", feature = "bm25-mining"))]
698
        {
699
            let (fallback, total) = sampler.bm25_fallback_stats();
1✔
700
            if total > 0 {
1✔
701
                let pct = fallback as f64 / total as f64 * 100.0;
1✔
702
                println!("bm25 fallback rate : {}/{} ({:.1}%)", fallback, total, pct);
1✔
703
            }
1✔
704
        }
705
    } else {
706
        match sampler.next_triplet_batch(selected_split) {
3✔
707
            Ok(triplet_batch) => {
×
708
                if triplet_batch.triplets.is_empty() {
×
709
                    println!(
×
710
                        "Triplet sampling produced no results. Ensure multiple records per source exist."
×
711
                    );
×
712
                } else {
×
UNCOV
713
                    print_triplet_batch(&chunking, &triplet_batch, split_store.as_ref());
×
UNCOV
714
                }
×
715
                sampler.save_sampler_state(None)?;
×
716
                #[cfg(all(feature = "extended-metrics", feature = "bm25-mining"))]
717
                {
NEW
718
                    let (fallback, total) = sampler.bm25_fallback_stats();
×
NEW
719
                    if total > 0 {
×
NEW
720
                        let pct = fallback as f64 / total as f64 * 100.0;
×
NEW
721
                        println!("bm25 fallback rate : {}/{} ({:.1}%)", fallback, total, pct);
×
NEW
722
                    }
×
723
                }
724
            }
725
            Err(SamplerError::Exhausted(name)) => {
3✔
726
                eprintln!(
3✔
727
                    "Triplet sampler exhausted recipe '{}'. Ensure both positive and negative examples exist.",
3✔
728
                    name
3✔
729
                );
3✔
730
            }
3✔
731
            Err(err) => return Err(err.into()),
×
732
        }
733
    }
734

735
    Ok(())
14✔
736
}
17✔
737

738
struct MultiSourceDemoConfigSnapshot {
739
    seed: u64,
740
    batch_size: usize,
741
    ingestion_max_records: usize,
742
    split: SplitLabel,
743
    split_ratios: SplitRatios,
744
    max_window_tokens: usize,
745
    overlap_tokens: Vec<usize>,
746
    summary_fallback_tokens: usize,
747
}
748

749
fn print_demo_config(cfg: &MultiSourceDemoConfigSnapshot) {
1✔
750
    let overlaps: Vec<String> = cfg.overlap_tokens.iter().map(|t| t.to_string()).collect();
1✔
751
    println!("=== sampler config ===");
1✔
752
    println!("seed                 : {}", cfg.seed);
1✔
753
    println!("batch_size           : {}", cfg.batch_size);
1✔
754
    println!("ingestion_max_records: {}", cfg.ingestion_max_records);
1✔
755
    println!("split                : {:?}", cfg.split);
1✔
756
    println!(
1✔
757
        "split_ratios         : train={:.2} val={:.2} test={:.2}",
758
        cfg.split_ratios.train, cfg.split_ratios.validation, cfg.split_ratios.test
759
    );
760
    println!("max_window_tokens    : {}", cfg.max_window_tokens);
1✔
761
    println!("overlap_tokens       : [{}]", overlaps.join(", "));
1✔
762
    println!(
1✔
763
        "summary_fallback     : {} tokens (0 = disabled)",
764
        cfg.summary_fallback_tokens
765
    );
766
    println!();
1✔
767
}
1✔
768

769
/// Parses `raw` as a strictly positive `usize`.
///
/// The error messages refer to the `--batch-size` option.
///
/// # Errors
/// Returns a message when `raw` is not a valid `usize`, or when it parses to
/// zero.
fn parse_positive_usize(raw: &str) -> Result<usize, String> {
    let parsed = raw.parse::<usize>().map_err(|_| {
        format!(
            "Could not parse --batch-size value '{}' as a positive integer",
            raw
        )
    })?;
    if parsed == 0 {
        return Err("--batch-size must be greater than zero".to_string());
    }
    Ok(parsed)
}
22✔
781

782
/// Returns `source_baseline / max_baseline`, clamped to `[0.0, 1.0]`.
///
/// Returns `0.0` when either baseline is zero, which also avoids a division
/// by zero.
fn suggested_balancing_weight(max_baseline: u128, source_baseline: u128) -> f32 {
    if max_baseline == 0 || source_baseline == 0 {
        return 0.0;
    }
    // The ratio is computed in f64 and only narrowed to f32 after clamping.
    (source_baseline as f64 / max_baseline as f64).clamp(0.0, 1.0) as f32
}
13✔
788

789
/// Returns `min_nonzero_baseline / source_baseline`, clamped to `[0.0, 1.0]`.
///
/// Returns `0.0` when either baseline is zero, which also avoids a division
/// by zero.
fn suggested_oversampling_weight(min_nonzero_baseline: u128, source_baseline: u128) -> f32 {
    if min_nonzero_baseline == 0 || source_baseline == 0 {
        return 0.0;
    }
    // The ratio is computed in f64 and only narrowed to f32 after clamping.
    (min_nonzero_baseline as f64 / source_baseline as f64).clamp(0.0, 1.0) as f32
}
13✔
795

796
fn parse_cli<T, I>(args: I) -> Result<Option<T>, Box<dyn Error>>
27✔
797
where
27✔
798
    T: Parser,
27✔
799
    I: IntoIterator,
27✔
800
    I::Item: Into<std::ffi::OsString> + Clone,
27✔
801
{
802
    match T::try_parse_from(args) {
27✔
803
        Ok(cli) => Ok(Some(cli)),
21✔
804
        Err(err) => match err.kind() {
6✔
805
            ErrorKind::DisplayHelp | ErrorKind::DisplayVersion => {
806
                err.print()?;
4✔
807
                Ok(None)
4✔
808
            }
809
            _ => Err(err.into()),
2✔
810
        },
811
    }
812
}
27✔
813

814
fn parse_split_ratios_arg(raw: &str) -> Result<SplitRatios, String> {
11✔
815
    let parts: Vec<&str> = raw.split(',').collect();
11✔
816
    if parts.len() != 3 {
11✔
817
        return Err("--split-ratios expects exactly 3 comma-separated values".to_string());
1✔
818
    }
10✔
819
    let train = parts[0]
10✔
820
        .trim()
10✔
821
        .parse::<f32>()
10✔
822
        .map_err(|_| format!("invalid train ratio '{}': must be a float", parts[0].trim()))?;
10✔
823
    let validation = parts[1].trim().parse::<f32>().map_err(|_| {
9✔
824
        format!(
1✔
825
            "invalid validation ratio '{}': must be a float",
826
            parts[1].trim()
1✔
827
        )
828
    })?;
1✔
829
    let test = parts[2]
8✔
830
        .trim()
8✔
831
        .parse::<f32>()
8✔
832
        .map_err(|_| format!("invalid test ratio '{}': must be a float", parts[2].trim()))?;
8✔
833
    let ratios = SplitRatios {
7✔
834
        train,
7✔
835
        validation,
7✔
836
        test,
7✔
837
    };
7✔
838
    let sum = ratios.train + ratios.validation + ratios.test;
7✔
839
    if (sum - 1.0).abs() > 1e-5 {
7✔
840
        return Err(format!(
1✔
841
            "split ratios must sum to 1.0, got {:.6} (train={}, validation={}, test={})",
1✔
842
            sum, ratios.train, ratios.validation, ratios.test
1✔
843
        ));
1✔
844
    }
6✔
845
    if ratios.train < 0.0 || ratios.validation < 0.0 || ratios.test < 0.0 {
6✔
846
        return Err("split ratios must be non-negative".to_string());
1✔
847
    }
5✔
848
    Ok(ratios)
5✔
849
}
11✔
850

851
fn print_triplet_batch(
1✔
852
    strategy: &ChunkingStrategy,
1✔
853
    batch: &TripletBatch,
1✔
854
    split_store: &impl SplitStore,
1✔
855
) {
1✔
856
    println!("=== triplet batch ===");
1✔
857
    for (idx, triplet) in batch.triplets.iter().enumerate() {
1✔
858
        println!("--- triplet #{} ---", idx);
1✔
859
        println!("recipe       : {}", triplet.recipe);
1✔
860
        println!("sample_weight: {:.4}", triplet.weight);
1✔
861
        if let Some(instr) = &triplet.instruction {
1✔
862
            println!("instruction shown to model:\n{}\n", instr);
1✔
863
        }
1✔
864
        #[cfg(feature = "extended-metrics")]
865
        let (pos_sim, neg_sim) = {
1✔
866
            use crate::metrics::lexical_similarity_scores;
867
            (
1✔
868
                Some(lexical_similarity_scores(
1✔
869
                    &triplet.anchor.text,
1✔
870
                    &triplet.positive.text,
1✔
871
                )),
1✔
872
                Some(lexical_similarity_scores(
1✔
873
                    &triplet.anchor.text,
1✔
874
                    &triplet.negative.text,
1✔
875
                )),
1✔
876
            )
1✔
877
        };
878
        #[cfg(not(feature = "extended-metrics"))]
879
        let (pos_sim, neg_sim): (Option<(f32, f32)>, Option<(f32, f32)>) = (None, None);
880
        print_chunk_block("ANCHOR", &triplet.anchor, strategy, split_store, None);
1✔
881
        print_chunk_block(
1✔
882
            "POSITIVE",
1✔
883
            &triplet.positive,
1✔
884
            strategy,
1✔
885
            split_store,
1✔
886
            pos_sim,
1✔
887
        );
888
        print_chunk_block(
1✔
889
            "NEGATIVE",
1✔
890
            &triplet.negative,
1✔
891
            strategy,
1✔
892
            split_store,
1✔
893
            neg_sim,
1✔
894
        );
895
    }
896
    print_source_summary(
1✔
897
        "triplet anchors",
1✔
898
        batch
1✔
899
            .triplets
1✔
900
            .iter()
1✔
901
            .map(|triplet| triplet.anchor.record_id.as_str()),
1✔
902
    );
903
    print_recipe_context_by_source(
1✔
904
        "triplet recipes by source",
1✔
905
        batch
1✔
906
            .triplets
1✔
907
            .iter()
1✔
908
            .map(|triplet| (triplet.anchor.record_id.as_str(), triplet.recipe.as_str())),
1✔
909
    );
910
}
1✔
911

912
fn print_text_batch(strategy: &ChunkingStrategy, batch: &TextBatch, split_store: &impl SplitStore) {
2✔
913
    println!("=== text batch ===");
2✔
914
    for (idx, sample) in batch.samples.iter().enumerate() {
5✔
915
        println!("--- sample #{} ---", idx);
5✔
916
        println!("recipe       : {}", sample.recipe);
5✔
917
        println!("sample_weight: {:.4}", sample.weight);
5✔
918
        if let Some(instr) = &sample.instruction {
5✔
919
            println!("instruction shown to model:\n{}\n", instr);
1✔
920
        }
4✔
921
        print_chunk_block("TEXT", &sample.chunk, strategy, split_store, None);
5✔
922
    }
923
    print_source_summary(
2✔
924
        "text samples",
2✔
925
        batch
2✔
926
            .samples
2✔
927
            .iter()
2✔
928
            .map(|sample| sample.chunk.record_id.as_str()),
5✔
929
    );
930
    print_recipe_context_by_source(
2✔
931
        "text recipes by source",
2✔
932
        batch
2✔
933
            .samples
2✔
934
            .iter()
2✔
935
            .map(|sample| (sample.chunk.record_id.as_str(), sample.recipe.as_str())),
5✔
936
    );
937
}
2✔
938

939
fn print_pair_batch(
2✔
940
    strategy: &ChunkingStrategy,
2✔
941
    batch: &SampleBatch,
2✔
942
    split_store: &impl SplitStore,
2✔
943
) {
2✔
944
    println!("=== pair batch ===");
2✔
945
    for (idx, pair) in batch.pairs.iter().enumerate() {
5✔
946
        println!("--- pair #{} ---", idx);
5✔
947
        println!("recipe       : {}", pair.recipe);
5✔
948
        println!("label        : {:?}", pair.label);
5✔
949
        if let Some(reason) = &pair.reason {
5✔
950
            println!("reason       : {}", reason);
3✔
951
        }
3✔
952
        print_chunk_block("ANCHOR", &pair.anchor, strategy, split_store, None);
5✔
953
        print_chunk_block("OTHER", &pair.positive, strategy, split_store, None);
5✔
954
    }
955
    print_source_summary(
2✔
956
        "pair anchors",
2✔
957
        batch
2✔
958
            .pairs
2✔
959
            .iter()
2✔
960
            .map(|pair| pair.anchor.record_id.as_str()),
5✔
961
    );
962
    print_recipe_context_by_source(
2✔
963
        "pair recipes by source",
2✔
964
        batch
2✔
965
            .pairs
2✔
966
            .iter()
2✔
967
            .map(|pair| (pair.anchor.record_id.as_str(), pair.recipe.as_str())),
5✔
968
    );
969
}
2✔
970

971
fn print_text_recipes(recipes: &[TextRecipe]) {
3✔
972
    println!("=== available text recipes ===");
3✔
973
    for recipe in recipes {
7✔
974
        println!(
7✔
975
            "- {} (weight: {:.3}) selector={:?}",
976
            recipe.name, recipe.weight, recipe.selector
977
        );
978
        if let Some(instr) = &recipe.instruction {
7✔
979
            println!("  instruction: {}", instr);
1✔
980
        }
6✔
981
    }
982
}
3✔
983

984
#[cfg(feature = "extended-metrics")]
985
/// Returns `(mean, median)` of `vals`, sorting `vals` in place as a side
/// effect.
///
/// For an even number of values the median is the average of the two middle
/// values. An empty slice has no mean or median, so `(NaN, NaN)` is returned
/// instead of panicking on the `len / 2 - 1` index.
fn metric_mean_median(vals: &mut [f32]) -> (f32, f32) {
    if vals.is_empty() {
        return (f32::NAN, f32::NAN);
    }
    let mean = vals.iter().sum::<f32>() / vals.len() as f32;
    // `total_cmp` is a genuine total order even when NaN is present, which
    // the slice sort requires of its comparator.
    vals.sort_by(f32::total_cmp);
    let mid = vals.len() / 2;
    let median = if vals.len() % 2 == 1 {
        vals[mid]
    } else {
        (vals[mid - 1] + vals[mid]) / 2.0
    };
    (mean, median)
}
17✔
995

996
/// Prints the extended-metrics summary tables to stdout.
///
/// `source_data` maps a source name to its per-triplet metric tuples. As
/// used below, tuple fields 0 and 2 are the positive and negative values for
/// the "jaccard" table, and fields 1 and 3 are the positive and negative
/// values for the "byte-cos" table. Each table lists mean / median for
/// positive, negative, and their gap per source (sorted by name), plus an
/// "ALL" row when there is more than one source.
#[cfg(feature = "extended-metrics")]
fn print_metric_summary(source_data: &HashMap<String, Vec<(f32, f32, f32, f32)>>) {
    let total: usize = source_data.values().map(Vec::len).sum();
    let n_sources = source_data.len();
    println!(
        "=== extended metrics summary ({} triplets, {} {}) ===",
        total,
        n_sources,
        if n_sources == 1 { "source" } else { "sources" }
    );

    // Returns [pos, neg] as (mean, median) pairs for one metric across entries.
    fn metric_pair(
        entries: &[(f32, f32, f32, f32)],
        pos_idx: usize,
        neg_idx: usize,
    ) -> [(f32, f32); 2] {
        // Projects one tuple field out of every entry; any index above 2
        // selects the last field.
        let extract = |idx: usize| -> Vec<f32> {
            entries
                .iter()
                .map(|e| match idx {
                    0 => e.0,
                    1 => e.1,
                    2 => e.2,
                    _ => e.3,
                })
                .collect()
        };
        let mut pos_vals = extract(pos_idx);
        let mut neg_vals = extract(neg_idx);
        [
            metric_mean_median(&mut pos_vals),
            metric_mean_median(&mut neg_vals),
        ]
    }

    // Prints one labelled table: a row per source, then an "ALL" row when
    // more than one source is present.
    fn print_metric_section(
        label: &str,
        sources: &[&String],
        source_data: &HashMap<String, Vec<(f32, f32, f32, f32)>>,
        pos_idx: usize,
        neg_idx: usize,
        total: usize,
        n_sources: usize,
    ) {
        const SEP: usize = 83;
        println!();
        println!("[{}]", label);
        println!(
            "{:<24} {:>5}  {:<16} {:<16} {:<16}",
            "source", "n", "positive", "negative", "gap (pos\u{2212}neg)"
        );
        println!(
            "{:<24} {:>5}  {:<16} {:<16} {:<16}",
            "", "", "mean / median", "mean / median", "mean / median"
        );
        println!("{}", "-".repeat(SEP));
        for source in sources {
            let entries = &source_data[*source];
            let [pos, neg] = metric_pair(entries, pos_idx, neg_idx);
            let gap_mean = pos.0 - neg.0;
            let gap_med = pos.1 - neg.1;
            println!(
                "{:<24} {:>5}  {:.3} / {:.3}     {:.3} / {:.3}     {:+.3} / {:+.3}",
                source,
                entries.len(),
                pos.0,
                pos.1,
                neg.0,
                neg.1,
                gap_mean,
                gap_med,
            );
        }
        if n_sources > 1 {
            // Pool every source's entries for the aggregate row.
            let all: Vec<(f32, f32, f32, f32)> = source_data.values().flatten().copied().collect();
            let [pos, neg] = metric_pair(&all, pos_idx, neg_idx);
            let gap_mean = pos.0 - neg.0;
            let gap_med = pos.1 - neg.1;
            println!("{}", "-".repeat(SEP));
            println!(
                "{:<24} {:>5}  {:.3} / {:.3}     {:.3} / {:.3}     {:+.3} / {:+.3}",
                "ALL", total, pos.0, pos.1, neg.0, neg.1, gap_mean, gap_med,
            );
        }
    }

    // Sort source names so the row order does not depend on map iteration.
    let mut sources: Vec<&String> = source_data.keys().collect();
    sources.sort();

    print_metric_section(
        "jaccard \u{2194} anchor",
        &sources,
        source_data,
        0,
        2,
        total,
        n_sources,
    );
    print_metric_section(
        "byte-cos \u{2194} anchor",
        &sources,
        source_data,
        1,
        3,
        total,
        n_sources,
    );
    println!();
}
2✔
1106

1107
/// Human-readable description of a chunk's view for the demo printers.
trait ChunkDebug {
    /// Returns a one-line description of the view this chunk was cut from.
    fn view_name(&self) -> String;
}
1110

1111
impl ChunkDebug for RecordChunk {
1112
    fn view_name(&self) -> String {
18✔
1113
        match &self.view {
18✔
1114
            ChunkView::Window {
1115
                index,
16✔
1116
                span,
16✔
1117
                overlap,
16✔
1118
                start_ratio,
16✔
1119
            } => format!(
16✔
1120
                "window#index={} span={} overlap={} start_ratio={:.3} tokens={}",
1121
                index, span, overlap, start_ratio, self.tokens_estimate
1122
            ),
1123
            ChunkView::SummaryFallback { strategy, .. } => {
2✔
1124
                format!("summary:{} tokens={}", strategy, self.tokens_estimate)
2✔
1125
            }
1126
        }
1127
    }
18✔
1128
}
1129

1130
fn print_chunk_block(
18✔
1131
    title: &str,
18✔
1132
    chunk: &RecordChunk,
18✔
1133
    strategy: &ChunkingStrategy,
18✔
1134
    split_store: &impl SplitStore,
18✔
1135
    anchor_sim: Option<(f32, f32)>,
18✔
1136
) {
18✔
1137
    let chunk_weight = chunk_weight(strategy, chunk);
18✔
1138
    let split = split_store
18✔
1139
        .label_for(&chunk.record_id)
18✔
1140
        .map(|label| format!("{:?}", label))
18✔
1141
        .unwrap_or_else(|| "Unknown".to_string());
18✔
1142
    println!("--- {} ---", title);
18✔
1143
    println!("split        : {}", split);
18✔
1144
    println!("view         : {}", chunk.view_name());
18✔
1145
    println!("chunk_weight : {:.4}", chunk_weight);
18✔
1146
    println!("record_id    : {}", chunk.record_id);
18✔
1147
    println!("section_idx  : {}", chunk.section_idx);
18✔
1148
    println!("token_est    : {}", chunk.tokens_estimate);
18✔
1149
    if let Some((j, c)) = anchor_sim {
18✔
1150
        println!("jaccard(↔a)  : {:.4}  byte-cos(↔a): {:.4}", j, c);
2✔
1151
    }
16✔
1152
    println!("model_input (exact text sent to the model):");
18✔
1153
    println!(
18✔
1154
        "<<< BEGIN MODEL TEXT >>>\n{}\n<<< END MODEL TEXT >>>\n",
1155
        chunk.text
1156
    );
1157
}
18✔
1158

1159
fn print_source_summary<'a, I>(label: &str, ids: I)
6✔
1160
where
6✔
1161
    I: Iterator<Item = &'a str>,
6✔
1162
{
1163
    let mut counts: HashMap<SourceId, usize> = HashMap::new();
6✔
1164
    for id in ids {
11✔
1165
        let source = extract_source(id);
11✔
1166
        *counts.entry(source).or_insert(0) += 1;
11✔
1167
    }
11✔
1168
    if counts.is_empty() {
6✔
1169
        return;
1✔
1170
    }
5✔
1171
    let skew = source_skew(&counts);
5✔
1172
    let mut entries: Vec<(String, usize)> = counts.into_iter().collect();
5✔
1173
    entries.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
5✔
1174
    println!("--- {} by source ---", label);
5✔
1175
    if let Some(skew) = skew {
5✔
1176
        for entry in &skew.per_source {
5✔
1177
            println!(
5✔
1178
                "{}: count={} share={:.2}",
5✔
1179
                entry.source, entry.count, entry.share
5✔
1180
            );
5✔
1181
        }
5✔
1182
        println!(
5✔
1183
            "skew: sources={} total={} min={} max={} mean={:.2} ratio={:.2}",
1184
            skew.sources, skew.total, skew.min, skew.max, skew.mean, skew.ratio
1185
        );
1186
    } else {
UNCOV
1187
        for (source, count) in &entries {
×
UNCOV
1188
            println!("{source}: count={count}");
×
UNCOV
1189
        }
×
1190
    }
1191
}
6✔
1192

1193
fn print_recipe_context_by_source<'a, I>(label: &str, entries: I)
6✔
1194
where
6✔
1195
    I: Iterator<Item = (&'a str, &'a str)>,
6✔
1196
{
1197
    let mut counts: HashMap<SourceId, HashMap<String, usize>> = HashMap::new();
6✔
1198
    for (record_id, recipe) in entries {
11✔
1199
        let source = extract_source(record_id);
11✔
1200
        let entry = counts
11✔
1201
            .entry(source)
11✔
1202
            .or_default()
11✔
1203
            .entry(recipe.to_string())
11✔
1204
            .or_insert(0);
11✔
1205
        *entry += 1;
11✔
1206
    }
11✔
1207
    if counts.is_empty() {
6✔
1208
        return;
1✔
1209
    }
5✔
1210
    let mut sources: Vec<(SourceId, HashMap<String, usize>)> = counts.into_iter().collect();
5✔
1211
    sources.sort_by(|a, b| a.0.cmp(&b.0));
5✔
1212
    println!("--- {} ---", label);
5✔
1213
    for (source, recipes) in sources {
5✔
1214
        println!("{source}");
5✔
1215
        let mut entries: Vec<(String, usize)> = recipes.into_iter().collect();
5✔
1216
        entries.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
5✔
1217
        for (recipe, count) in entries {
6✔
1218
            println!("  - {recipe}={count}");
6✔
1219
        }
6✔
1220
    }
1221
}
6✔
1222

1223
fn extract_source(record_id: &str) -> SourceId {
32✔
1224
    record_id
32✔
1225
        .split_once("::")
32✔
1226
        .map(|(source, _)| source.to_string())
32✔
1227
        .unwrap_or_else(|| "unknown".to_string())
32✔
1228
}
32✔
1229

1230
#[cfg(test)]
1231
mod tests {
1232
    use super::*;
1233
    use crate::DataRecord;
1234
    use crate::DeterministicSplitStore;
1235
    use crate::data::{QualityScore, RecordSection, SectionRole};
1236
    use crate::source::{SourceCursor, SourceSnapshot};
1237
    use crate::utils::make_section;
1238
    use chrono::{TimeZone, Utc};
1239
    use tempfile::tempdir;
1240

1241
    /// Minimal in-memory `DataSource` test double for example app tests.
1242
    struct TestSource {
1243
        id: String,
1244
        count: Option<u128>,
1245
        recipes: Vec<TripletRecipe>,
1246
    }
1247

1248
    impl DataSource for TestSource {
1249
        fn id(&self) -> &str {
131✔
1250
            &self.id
131✔
1251
        }
131✔
1252

1253
        fn refresh(
30✔
1254
            &self,
30✔
1255
            _config: &SamplerConfig,
30✔
1256
            _cursor: Option<&SourceCursor>,
30✔
1257
            _limit: Option<usize>,
30✔
1258
        ) -> Result<SourceSnapshot, SamplerError> {
30✔
1259
            Ok(SourceSnapshot {
30✔
1260
                records: Vec::new(),
30✔
1261
                cursor: SourceCursor {
30✔
1262
                    last_seen: Utc::now(),
30✔
1263
                    revision: 0,
30✔
1264
                },
30✔
1265
            })
30✔
1266
        }
30✔
1267

1268
        fn reported_record_count(&self, _config: &SamplerConfig) -> Result<u128, SamplerError> {
2✔
1269
            self.count.ok_or_else(|| SamplerError::SourceInconsistent {
2✔
1270
                source_id: self.id.clone(),
1✔
1271
                details: "test source has no configured exact count".to_string(),
1✔
1272
            })
1✔
1273
        }
2✔
1274

1275
        fn default_triplet_recipes(&self) -> Vec<TripletRecipe> {
11✔
1276
            self.recipes.clone()
11✔
1277
        }
11✔
1278
    }
1279

1280
    struct ConfigRequiredSource {
1281
        id: String,
1282
        expected_seed: u64,
1283
    }
1284

1285
    impl DataSource for ConfigRequiredSource {
1286
        fn id(&self) -> &str {
1✔
1287
            &self.id
1✔
1288
        }
1✔
1289

1290
        fn refresh(
1✔
1291
            &self,
1✔
1292
            _config: &SamplerConfig,
1✔
1293
            _cursor: Option<&SourceCursor>,
1✔
1294
            _limit: Option<usize>,
1✔
1295
        ) -> Result<SourceSnapshot, SamplerError> {
1✔
1296
            Ok(SourceSnapshot {
1✔
1297
                records: Vec::new(),
1✔
1298
                cursor: SourceCursor {
1✔
1299
                    last_seen: Utc::now(),
1✔
1300
                    revision: 0,
1✔
1301
                },
1✔
1302
            })
1✔
1303
        }
1✔
1304

1305
        fn reported_record_count(&self, config: &SamplerConfig) -> Result<u128, SamplerError> {
2✔
1306
            if config.seed == self.expected_seed {
2✔
1307
                Ok(1)
1✔
1308
            } else {
1309
                Err(SamplerError::SourceInconsistent {
1✔
1310
                    source_id: self.id.clone(),
1✔
1311
                    details: format!(
1✔
1312
                        "expected sampler seed {} but got {}",
1✔
1313
                        self.expected_seed, config.seed
1✔
1314
                    ),
1✔
1315
                })
1✔
1316
            }
1317
        }
2✔
1318

1319
        fn default_triplet_recipes(&self) -> Vec<TripletRecipe> {
2✔
1320
            Vec::new()
2✔
1321
        }
2✔
1322
    }
1323

1324
    struct FixtureSource {
1325
        id: String,
1326
        records: Vec<DataRecord>,
1327
        recipes: Vec<TripletRecipe>,
1328
    }
1329

1330
    impl DataSource for FixtureSource {
1331
        fn id(&self) -> &str {
16✔
1332
            &self.id
16✔
1333
        }
16✔
1334

1335
        fn refresh(
3✔
1336
            &self,
3✔
1337
            _config: &SamplerConfig,
3✔
1338
            _cursor: Option<&SourceCursor>,
3✔
1339
            _limit: Option<usize>,
3✔
1340
        ) -> Result<SourceSnapshot, SamplerError> {
3✔
1341
            Ok(SourceSnapshot {
3✔
1342
                records: self.records.clone(),
3✔
1343
                cursor: SourceCursor {
3✔
1344
                    last_seen: Utc::now(),
3✔
1345
                    revision: 0,
3✔
1346
                },
3✔
1347
            })
3✔
1348
        }
3✔
1349

NEW
UNCOV
1350
        fn reported_record_count(&self, _config: &SamplerConfig) -> Result<u128, SamplerError> {
×
NEW
UNCOV
1351
            Ok(self.records.len() as u128)
×
NEW
UNCOV
1352
        }
×
1353

1354
        fn default_triplet_recipes(&self) -> Vec<TripletRecipe> {
2✔
1355
            self.recipes.clone()
2✔
1356
        }
2✔
1357
    }
1358

1359
    fn fixture_record(
11✔
1360
        source: &str,
11✔
1361
        id_suffix: &str,
11✔
1362
        day: u32,
11✔
1363
        title: &str,
11✔
1364
        body: &str,
11✔
1365
    ) -> DataRecord {
11✔
1366
        let now = Utc.with_ymd_and_hms(2025, 1, day, 12, 0, 0).unwrap();
11✔
1367
        DataRecord {
11✔
1368
            id: format!("{source}::{id_suffix}"),
11✔
1369
            source: source.to_string(),
11✔
1370
            created_at: now,
11✔
1371
            updated_at: now,
11✔
1372
            quality: QualityScore { trust: 1.0 },
11✔
1373
            taxonomy: Vec::new(),
11✔
1374
            sections: vec![
11✔
1375
                make_section(SectionRole::Anchor, Some("title"), title),
11✔
1376
                make_section(SectionRole::Context, Some("body"), body),
11✔
1377
            ],
11✔
1378
            meta_prefix: None,
11✔
1379
        }
11✔
1380
    }
11✔
1381

1382
    fn default_recipe(name: &str) -> TripletRecipe {
12✔
1383
        TripletRecipe {
12✔
1384
            name: name.to_string().into(),
12✔
1385
            anchor: crate::config::Selector::Role(SectionRole::Anchor),
12✔
1386
            positive_selector: crate::config::Selector::Role(SectionRole::Context),
12✔
1387
            negative_selector: crate::config::Selector::Role(SectionRole::Context),
12✔
1388
            negative_strategy: crate::config::NegativeStrategy::WrongArticle,
12✔
1389
            weight: 1.0,
12✔
1390
            instruction: None,
12✔
1391
            allow_same_anchor_positive: false,
12✔
1392
        }
12✔
1393
    }
12✔
1394

1395
    #[test]
1396
    fn parse_helpers_validate_inputs() {
1✔
1397
        assert_eq!(parse_positive_usize("2").unwrap(), 2);
1✔
1398
        assert!(parse_positive_usize("0").is_err());
1✔
1399
        assert!(parse_positive_usize("abc").is_err());
1✔
1400

1401
        let split = parse_split_ratios_arg("0.8,0.1,0.1").unwrap();
1✔
1402
        assert!((split.train - 0.8).abs() < 1e-6);
1✔
1403
        assert!(parse_split_ratios_arg("0.8,0.1").is_err());
1✔
1404
        assert!(parse_split_ratios_arg("1.0,0.0,0.1").is_err());
1✔
1405
        assert!(parse_split_ratios_arg("-0.1,0.6,0.5").is_err());
1✔
1406
    }
1✔
1407

1408
    #[test]
1409
    fn suggested_balancing_weight_is_longest_normalized_and_bounded() {
1✔
1410
        assert!((suggested_balancing_weight(100, 100) - 1.0).abs() < 1e-6);
1✔
1411
        assert!((suggested_balancing_weight(400, 100) - 0.25).abs() < 1e-6);
1✔
1412
        assert!((suggested_balancing_weight(400, 400) - 1.0).abs() < 1e-6);
1✔
1413
        assert_eq!(suggested_balancing_weight(0, 100), 0.0);
1✔
1414
        assert_eq!(suggested_balancing_weight(100, 0), 0.0);
1✔
1415
    }
1✔
1416

1417
    #[test]
1418
    fn suggested_oversampling_weight_is_inverse_in_unit_interval() {
1✔
1419
        assert!((suggested_oversampling_weight(100, 100) - 1.0).abs() < 1e-6);
1✔
1420
        assert!((suggested_oversampling_weight(100, 400) - 0.25).abs() < 1e-6);
1✔
1421
        assert!((suggested_oversampling_weight(100, 1000) - 0.1).abs() < 1e-6);
1✔
1422
        assert_eq!(suggested_oversampling_weight(0, 100), 0.0);
1✔
1423
        assert_eq!(suggested_oversampling_weight(100, 0), 0.0);
1✔
1424
    }
1✔
1425

1426
    #[test]
1427
    fn parse_cli_handles_help_and_invalid_args() {
1✔
1428
        let help = parse_cli::<EstimateCapacityCli, _>(["estimate_capacity", "--help"]).unwrap();
1✔
1429
        assert!(help.is_none());
1✔
1430

1431
        let err = parse_cli::<EstimateCapacityCli, _>(["estimate_capacity", "--unknown"]);
1✔
1432
        assert!(err.is_err());
1✔
1433
    }
1✔
1434

1435
    #[test]
1436
    fn run_estimate_capacity_succeeds_with_reported_counts() {
1✔
1437
        let result = run_estimate_capacity(
1✔
1438
            std::iter::empty::<String>(),
1✔
1439
            |roots| {
1✔
1440
                assert!(roots.is_empty());
1✔
1441
                Ok(())
1✔
1442
            },
1✔
1443
            |_| {
1✔
1444
                vec![Box::new(TestSource {
1✔
1445
                    id: "source_a".into(),
1✔
1446
                    count: Some(12),
1✔
1447
                    recipes: vec![default_recipe("r1")],
1✔
1448
                }) as DynSource]
1✔
1449
            },
1✔
1450
        );
1451

1452
        assert!(result.is_ok());
1✔
1453
    }
1✔
1454

1455
    #[test]
1456
    fn run_estimate_capacity_errors_when_source_count_missing() {
1✔
1457
        let result = run_estimate_capacity(
1✔
1458
            std::iter::empty::<String>(),
1✔
1459
            |_| Ok(()),
1✔
1460
            |_| {
1✔
1461
                vec![Box::new(TestSource {
1✔
1462
                    id: "source_missing".into(),
1✔
1463
                    count: None,
1✔
1464
                    recipes: vec![default_recipe("r1")],
1✔
1465
                }) as DynSource]
1✔
1466
            },
1✔
1467
        );
1468

1469
        let err = result.unwrap_err().to_string();
1✔
1470
        assert!(err.contains("failed to report exact record count"));
1✔
1471
    }
1✔
1472

1473
    #[test]
1474
    fn run_estimate_capacity_propagates_root_resolution_error() {
1✔
1475
        let result = run_estimate_capacity(
1✔
1476
            std::iter::empty::<String>(),
1✔
1477
            |_| Err("root resolution failed".into()),
1✔
1478
            |_: &()| Vec::<DynSource>::new(),
×
1479
        );
1480

1481
        let err = result.unwrap_err().to_string();
1✔
1482
        assert!(err.contains("root resolution failed"));
1✔
1483
    }
1✔
1484

1485
    #[test]
1486
    fn run_estimate_capacity_configures_sources_centrally_before_counting() {
1✔
1487
        let result = run_estimate_capacity(
1✔
1488
            std::iter::empty::<String>(),
1✔
1489
            |_| Ok(()),
1✔
1490
            |_| {
1✔
1491
                vec![Box::new(ConfigRequiredSource {
1✔
1492
                    id: "requires_config".into(),
1✔
1493
                    expected_seed: 99,
1✔
1494
                }) as DynSource]
1✔
1495
            },
1✔
1496
        );
1497

1498
        assert!(result.is_ok());
1✔
1499
    }
1✔
1500

1501
    #[test]
1502
    fn config_required_source_refresh_and_seed_mismatch_are_exercised() {
1✔
1503
        let source = ConfigRequiredSource {
1✔
1504
            id: "cfg-source".to_string(),
1✔
1505
            expected_seed: 42,
1✔
1506
        };
1✔
1507

1508
        let refreshed = source
1✔
1509
            .refresh(&SamplerConfig::default(), None, None)
1✔
1510
            .unwrap();
1✔
1511
        assert!(refreshed.records.is_empty());
1✔
1512

1513
        let mismatched = source.reported_record_count(&SamplerConfig {
1✔
1514
            seed: 7,
1✔
1515
            ..SamplerConfig::default()
1✔
1516
        });
1✔
1517
        assert!(matches!(
1✔
1518
            mismatched,
1✔
1519
            Err(SamplerError::SourceInconsistent { .. })
1520
        ));
1521

1522
        assert!(source.default_triplet_recipes().is_empty());
1✔
1523
    }
1✔
1524

1525
    /// Runs the demo against a source that serves exactly one record, once for
    /// each sampling mode (`--pair-batch`, `--text-recipes`, and no mode flag),
    /// and expects every invocation to return `Ok`.
    #[test]
    fn run_multi_source_demo_exhausted_paths_return_ok() {
        // Stub source whose every refresh yields the same single record.
        struct OneRecordSource;

        impl DataSource for OneRecordSource {
            fn id(&self) -> &str {
                "one_record"
            }

            fn refresh(
                &self,
                _config: &SamplerConfig,
                _cursor: Option<&SourceCursor>,
                _limit: Option<usize>,
            ) -> Result<SourceSnapshot, SamplerError> {
                // A single timestamp is shared by the record and the cursor.
                let stamp = Utc::now();

                let anchor_section = RecordSection {
                    role: SectionRole::Anchor,
                    heading: Some(String::from("title")),
                    text: String::from("anchor"),
                    sentences: vec![String::from("anchor")],
                };
                let context_section = RecordSection {
                    role: SectionRole::Context,
                    heading: Some(String::from("body")),
                    text: String::from("context"),
                    sentences: vec![String::from("context")],
                };

                let only_record = DataRecord {
                    id: String::from("one_record::r1"),
                    source: String::from("one_record"),
                    created_at: stamp,
                    updated_at: stamp,
                    quality: QualityScore { trust: 1.0 },
                    taxonomy: vec![],
                    sections: vec![anchor_section, context_section],
                    meta_prefix: None,
                };

                Ok(SourceSnapshot {
                    records: vec![only_record],
                    cursor: SourceCursor {
                        last_seen: stamp,
                        revision: 0,
                    },
                })
            }

            fn reported_record_count(&self, _config: &SamplerConfig) -> Result<u128, SamplerError> {
                Ok(1)
            }

            fn default_triplet_recipes(&self) -> Vec<TripletRecipe> {
                vec![default_recipe("single_record_recipe")]
            }
        }

        // `None` stands for the default mode, where no extra flag is passed.
        for mode_flag in [Some("--pair-batch"), Some("--text-recipes"), None] {
            let scratch = tempdir().unwrap();
            let store_file = scratch.path().join("split_store.bin");
            let cli_args: Vec<String> = [
                String::from("--split-store-path"),
                store_file.to_string_lossy().into_owned(),
            ]
            .into_iter()
            .chain(mode_flag.map(String::from))
            .collect();

            let outcome = run_multi_source_demo(
                cli_args.into_iter(),
                |_| Ok(()),
                |_| vec![Box::new(OneRecordSource) as DynSource],
            );
            assert!(outcome.is_ok());
        }
    }
1✔
1600

1601
    /// Checks three `parse_cli` outcomes for `MultiSourceDemoCli`: `--help`
    /// parses to `Ok(None)`, `--batch-size 0` is an error, and a bare invocation
    /// parses successfully.
    #[test]
    fn parse_multi_source_cli_handles_help_and_batch_size_validation() {
        let help_outcome = parse_cli::<MultiSourceDemoCli, _>(["multi_source_demo", "--help"]);
        assert!(help_outcome.unwrap().is_none());

        let zero_batch_outcome =
            parse_cli::<MultiSourceDemoCli, _>(["multi_source_demo", "--batch-size", "0"]);
        assert!(zero_batch_outcome.is_err());

        let bare_outcome = parse_cli::<MultiSourceDemoCli, _>(["multi_source_demo"]);
        assert!(bare_outcome.is_ok());
    }
1✔
1612

1613
    /// Verifies that `parse_cli` turns a `--version` invocation into `Ok(None)`
    /// for a CLI that declares a version.
    #[test]
    fn parse_cli_handles_display_version_path() {
        // Throwaway CLI with no arguments; only the declared version matters.
        #[derive(Debug, Parser)]
        #[command(name = "version_test", version = "1.0.0")]
        struct VersionCli {}

        let version_outcome = parse_cli::<VersionCli, _>(["version_test", "--version"]);
        assert!(version_outcome.unwrap().is_none());
    }
1✔
1622

1623
    /// Runs the demo with `--list-text-recipes` and an explicit split-store path
    /// against a `TestSource` carrying one default recipe, and expects `Ok`.
    #[test]
    fn run_multi_source_demo_list_text_recipes_path_succeeds() {
        let dir = tempdir().unwrap();
        let split_store_path = dir.path().join("recipes_split_store.bin");
        let args = vec![
            "--list-text-recipes".to_string(),
            "--split-store-path".to_string(),
            split_store_path.to_string_lossy().to_string(),
        ];

        // Hand the arguments over by value, as the sibling tests do; the vector
        // is not needed afterwards, so no mutable binding or `drain` is required.
        let result = run_multi_source_demo(
            args.into_iter(),
            |_| Ok(()),
            |_| {
                vec![Box::new(TestSource {
                    id: "source_for_recipes".into(),
                    count: Some(10),
                    recipes: vec![default_recipe("recipe_a")],
                }) as DynSource]
            },
        );

        assert!(result.is_ok());
    }
1✔
1646

1647
    /// Lists text recipes with an explicit `--split-store-path` for a
    /// `TestSource` built with an empty `recipes` list, and expects `Ok`.
    #[test]
    fn run_multi_source_demo_list_text_recipes_uses_explicit_split_store_path() {
        let scratch = tempdir().unwrap();
        let store_file = scratch.path().join("custom_split_store.bin");
        let cli_args = vec![
            String::from("--list-text-recipes"),
            String::from("--split-store-path"),
            store_file.to_string_lossy().into_owned(),
        ];

        let outcome = run_multi_source_demo(
            cli_args.into_iter(),
            |_| Ok(()),
            |_| {
                vec![Box::new(TestSource {
                    id: "source_without_text_recipes".into(),
                    count: Some(1),
                    recipes: vec![],
                }) as DynSource]
            },
        );

        assert!(outcome.is_ok());
    }
1✔
1671

1672
    /// Runs each sampling mode (`--pair-batch`, `--text-recipes`, default) on the
    /// `validation` split against a `TestSource` reporting a count of zero, and
    /// expects every run to return `Ok`.
    #[test]
    fn run_multi_source_demo_sampling_modes_handle_empty_sources() {
        let mode_flag_sets = [
            vec![String::from("--pair-batch")],
            vec![String::from("--text-recipes")],
            Vec::new(),
        ];

        for mode_flags in mode_flag_sets {
            let scratch = tempdir().unwrap();
            let store_file = scratch.path().join("empty_sources_split_store.bin");

            // Mode flag (if any) first, followed by the shared options.
            let mut cli_args = mode_flags;
            cli_args.extend([
                String::from("--split-store-path"),
                store_file.to_string_lossy().into_owned(),
                String::from("--split"),
                String::from("validation"),
            ]);

            let outcome = run_multi_source_demo(
                cli_args.into_iter(),
                |_| Ok(()),
                |_| {
                    vec![Box::new(TestSource {
                        id: "source_empty".into(),
                        count: Some(0),
                        recipes: vec![default_recipe("recipe_empty")],
                    }) as DynSource]
                },
            );

            assert!(outcome.is_ok());
        }
    }
1✔
1702

1703
    /// Makes the root-resolution callback fail and checks that the error text
    /// surfaces in the error returned by `run_multi_source_demo`.
    #[test]
    fn run_multi_source_demo_propagates_root_resolution_error() {
        let scratch = tempdir().unwrap();
        let store_file = scratch.path().join("root_resolution_error_store.bin");

        let outcome = run_multi_source_demo(
            [
                String::from("--split-store-path"),
                store_file.to_string_lossy().into_owned(),
            ]
            .into_iter(),
            |_| Err("demo root resolution failed".into()),
            // The root type is spelled out because the failing resolver above
            // gives the compiler nothing to infer it from.
            |_: &()| Vec::<DynSource>::new(),
        );

        let message = outcome.unwrap_err().to_string();
        assert!(message.contains("demo root resolution failed"));
    }
1✔
1720

1721
    /// Smoke-tests the batch/recipe printing helpers with hand-built triplet,
    /// pair and text batches, then checks `extract_source` for an id with and
    /// without the `::` delimiter.
    #[test]
    fn print_helpers_and_extract_source_cover_paths() {
        let ratios = SplitRatios::default();
        let split_store = DeterministicSplitStore::new(ratios, 42).unwrap();
        let chunking = ChunkingStrategy::default();

        // Three chunks covering both `ChunkView` variants used by the printers.
        let anchor_chunk = RecordChunk {
            record_id: String::from("source_a::rec1"),
            section_idx: 0,
            view: ChunkView::Window {
                index: 1,
                overlap: 2,
                span: 12,
                start_ratio: 0.25,
            },
            text: String::from("anchor text"),
            tokens_estimate: 8,
            quality: crate::data::QualityScore { trust: 0.9 },
        };
        let positive_chunk = RecordChunk {
            record_id: String::from("source_a::rec2"),
            section_idx: 1,
            view: ChunkView::SummaryFallback {
                strategy: String::from("summary"),
                weight: 0.7,
            },
            text: String::from("positive text"),
            tokens_estimate: 6,
            quality: crate::data::QualityScore { trust: 0.8 },
        };
        let negative_chunk = RecordChunk {
            record_id: String::from("source_b::rec3"),
            section_idx: 2,
            view: ChunkView::Window {
                index: 0,
                overlap: 0,
                span: 16,
                start_ratio: 0.0,
            },
            text: String::from("negative text"),
            tokens_estimate: 7,
            quality: crate::data::QualityScore { trust: 0.5 },
        };

        let triplet = crate::SampleTriplet {
            recipe: String::from("triplet_recipe"),
            anchor: anchor_chunk.clone(),
            positive: positive_chunk.clone(),
            negative: negative_chunk.clone(),
            weight: 1.0,
            instruction: Some(String::from("triplet instruction")),
        };
        let triplets = TripletBatch {
            triplets: vec![triplet],
        };
        print_triplet_batch(&chunking, &triplets, &split_store);

        let pair = crate::SamplePair {
            recipe: String::from("pair_recipe"),
            anchor: anchor_chunk.clone(),
            positive: positive_chunk.clone(),
            weight: 1.0,
            instruction: None,
            label: crate::PairLabel::Positive,
            reason: Some(String::from("same topic")),
        };
        let pairs = SampleBatch { pairs: vec![pair] };
        print_pair_batch(&chunking, &pairs, &split_store);

        // The negative chunk is no longer needed elsewhere, so it is moved here.
        let text_sample = crate::TextSample {
            recipe: String::from("text_recipe"),
            chunk: negative_chunk,
            weight: 0.8,
            instruction: Some(String::from("text instruction")),
        };
        let texts = TextBatch {
            samples: vec![text_sample],
        };
        print_text_batch(&chunking, &texts, &split_store);

        let text_recipes = vec![TextRecipe {
            name: "recipe_name".into(),
            selector: crate::config::Selector::Role(SectionRole::Context),
            instruction: Some("instruction".into()),
            weight: 1.0,
        }];
        print_text_recipes(&text_recipes);

        assert_eq!(extract_source("source_a::record"), "source_a");
        assert_eq!(extract_source("record-without-delimiter"), "unknown");
    }
1✔
1811

1812
    /// Confirms that each `SplitArg` variant converts to the `SplitLabel`
    /// variant of the same name.
    #[test]
    fn split_arg_conversion_and_version_parse_paths_are_covered() {
        let train: SplitLabel = SplitArg::Train.into();
        assert!(matches!(train, SplitLabel::Train));

        let validation: SplitLabel = SplitArg::Validation.into();
        assert!(matches!(validation, SplitLabel::Validation));

        let test: SplitLabel = SplitArg::Test.into();
        assert!(matches!(test, SplitLabel::Test));
    }
1✔
1824

1825
    /// Feeds `parse_split_ratios_arg` one unparsable value per position and
    /// checks that the error message names the offending field.
    #[test]
    fn parse_split_ratios_reports_per_field_parse_errors() {
        // (raw argument, fragment expected in the error message)
        let cases = [
            ("x,0.1,0.9", "invalid train ratio"),
            ("0.1,y,0.8", "invalid validation ratio"),
            ("0.1,0.2,z", "invalid test ratio"),
        ];

        for (raw, expected_fragment) in cases {
            let message = parse_split_ratios_arg(raw).unwrap_err();
            assert!(message.contains(expected_fragment));
        }
    }
1✔
1843

1844
    /// Runs each sampling mode (`--pair-batch`, `--text-recipes`, default)
    /// against a `TestSource` that reports one record and has an empty `recipes`
    /// list, and expects every run to return `Ok`.
    #[test]
    fn run_multi_source_demo_exhausted_paths_are_handled() {
        let mode_flag_sets = [
            vec![String::from("--pair-batch")],
            vec![String::from("--text-recipes")],
            vec![],
        ];

        for mode_flags in mode_flag_sets {
            let scratch = tempdir().unwrap();
            let store_file = scratch.path().join("exhausted_split_store.bin");

            let mut cli_args = mode_flags;
            cli_args.extend([
                String::from("--split-store-path"),
                store_file.to_string_lossy().into_owned(),
            ]);

            let outcome = run_multi_source_demo(
                cli_args.into_iter(),
                |_| Ok(()),
                |_| {
                    vec![Box::new(TestSource {
                        id: "source_without_recipes".into(),
                        count: Some(1),
                        recipes: vec![],
                    }) as DynSource]
                },
            );

            assert!(outcome.is_ok());
        }
    }
1✔
1872

1873
    /// Seeds the split-store path with stale bytes, runs the demo with `--reset`
    /// and `--pair-batch` over eight fixture records, and checks that the run
    /// succeeds and leaves a non-empty store file whose contents are no longer
    /// the stale payload.
    #[test]
    fn run_multi_source_demo_reset_recreates_split_store_and_samples() {
        // Placeholder contents that a real reset must not leave behind.
        const STALE_CONTENTS: &[u8] = b"stale-data";

        let dir = tempdir().unwrap();
        let split_store_path = dir.path().join("reset_split_store.bin");
        std::fs::write(&split_store_path, STALE_CONTENTS).unwrap();

        let args = vec![
            "--reset".to_string(),
            "--pair-batch".to_string(),
            "--split-store-path".to_string(),
            split_store_path.to_string_lossy().to_string(),
        ];

        let result = run_multi_source_demo(
            args.into_iter(),
            |_| Ok(()),
            |_| {
                let recipes = vec![default_recipe("fixture_recipe")];
                let records: Vec<DataRecord> = (1..=8)
                    .map(|day| {
                        fixture_record(
                            "fixture_source",
                            &format!("r{day}"),
                            day,
                            &format!("Fixture headline {day}"),
                            &format!("Fixture body content for day {day}."),
                        )
                    })
                    .collect();
                vec![Box::new(FixtureSource {
                    id: "fixture_source".into(),
                    records,
                    recipes,
                }) as DynSource]
            },
        );

        assert!(result.is_ok());
        assert!(split_store_path.exists());

        // A size check alone would also pass on the untouched stale file, so
        // compare the contents to confirm the store was actually rewritten.
        let rewritten = std::fs::read(&split_store_path).unwrap();
        assert!(!rewritten.is_empty());
        assert_ne!(rewritten.as_slice(), STALE_CONTENTS);
    }
1✔
1915

1916
    /// Runs the demo with `--batches 2` over a three-record fixture source and
    /// checks that the run succeeds and the split-store file exists afterwards.
    #[test]
    fn run_multi_source_demo_batches_mode_executes_multiple_batches() {
        let scratch = tempdir().unwrap();
        let store_file = scratch.path().join("batches_split_store.bin");
        let cli_args = vec![
            String::from("--batches"),
            String::from("2"),
            String::from("--split-store-path"),
            store_file.to_string_lossy().into_owned(),
        ];

        let outcome = run_multi_source_demo(
            cli_args.into_iter(),
            |_| Ok(()),
            |_| {
                let recipes = vec![default_recipe("batch_recipe")];

                // (record id, day, headline, body) for each fixture record.
                let records: Vec<_> = [
                    (
                        "r1",
                        3,
                        "Inflation cools in latest report",
                        "Core inflation moderated compared with prior quarter.",
                    ),
                    (
                        "r2",
                        4,
                        "Labor market remains resilient",
                        "Job openings remain elevated despite slower growth.",
                    ),
                    (
                        "r3",
                        5,
                        "Manufacturing sentiment stabilizes",
                        "Survey data suggests output expectations are improving.",
                    ),
                ]
                .into_iter()
                .map(|(record_id, day, headline, body)| {
                    fixture_record("batch_source", record_id, day, headline, body)
                })
                .collect();

                vec![Box::new(FixtureSource {
                    id: "batch_source".into(),
                    records,
                    recipes,
                }) as DynSource]
            },
        );

        assert!(outcome.is_ok());
        assert!(store_file.exists());
    }
1✔
1965

1966
    /// Checks the shape of the managed split-store path: it ends with the demo
    /// store filename and its parent directory ends with the demo cache group.
    #[test]
    fn managed_demo_split_store_path_resolves_under_cache_group() {
        let managed_path = managed_demo_split_store_path().unwrap();
        assert!(managed_path.ends_with(MULTI_SOURCE_DEMO_STORE_FILENAME));

        let group_dir = managed_path
            .parent()
            .expect("managed split-store path should have a parent");
        let expected_group = PathBuf::from(MULTI_SOURCE_DEMO_GROUP);
        assert!(group_dir.ends_with(expected_group));
    }
1✔
1975

1976
    /// Passes `--help` and expects `Ok`; both callbacks panic if invoked, so the
    /// test also fails if the help path resolves roots or builds sources.
    #[test]
    fn run_multi_source_demo_help_returns_ok_without_work() {
        let outcome = run_multi_source_demo(
            [String::from("--help")].into_iter(),
            |_| -> Result<(), Box<dyn Error>> {
                panic!("help path should return before resolving roots")
            },
            |_: &()| -> Vec<DynSource> {
                panic!("help path should return before building sources")
            },
        );

        assert!(outcome.is_ok());
    }
1✔
1990

1991
    /// Runs `--list-text-recipes` without any `--split-store-path` argument and
    /// expects the demo to succeed.
    #[test]
    fn run_multi_source_demo_uses_managed_split_store_path_when_not_provided() {
        let cli_args = [String::from("--list-text-recipes")];

        let outcome = run_multi_source_demo(
            cli_args.into_iter(),
            |_| Ok(()),
            |_| {
                vec![Box::new(TestSource {
                    id: "managed_path_source".into(),
                    count: Some(2),
                    recipes: vec![default_recipe("managed_recipe")],
                }) as DynSource]
            },
        );

        assert!(outcome.is_ok());
    }
1✔
2007

2008
    /// Points `--split-store-path` at a directory and passes `--reset`; the run
    /// must fail with an error mentioning "failed to remove split store".
    #[test]
    fn run_multi_source_demo_reset_errors_when_target_is_directory() {
        let scratch = tempdir().unwrap();
        // A directory sits where the store file would normally be.
        let store_dir = scratch.path().join("split_store_dir");
        std::fs::create_dir(&store_dir).unwrap();

        let cli_args = [
            String::from("--reset"),
            String::from("--split-store-path"),
            store_dir.to_string_lossy().into_owned(),
        ];
        let outcome = run_multi_source_demo(
            cli_args.into_iter(),
            |_| Ok(()),
            |_| Vec::<DynSource>::new(),
        );

        let message = outcome.unwrap_err().to_string();
        assert!(message.contains("failed to remove split store"));
    }
1✔
2028

2029
    /// Calls both summary printers with empty iterators; the test passes as
    /// long as neither call panics.
    #[test]
    fn print_summary_helpers_accept_empty_iterators() {
        let no_sources = std::iter::empty::<&str>();
        print_source_summary("empty summary", no_sources);

        let no_recipe_pairs = std::iter::empty::<(&str, &str)>();
        print_recipe_context_by_source("empty recipe context", no_recipe_pairs);
    }
1✔
2034

2035
    /// With four unsorted values 1..=4, both the mean and the median returned by
    /// `metric_mean_median` must be 2.5 (within 1e-6).
    #[cfg(feature = "extended-metrics")]
    #[test]
    fn metric_mean_median_handles_even_length_inputs() {
        let mut samples = [1.0, 4.0, 2.0, 3.0];
        let stats = metric_mean_median(&mut samples);

        let mean_error = (stats.0 - 2.5).abs();
        let median_error = (stats.1 - 2.5).abs();
        assert!(mean_error < 1e-6);
        assert!(median_error < 1e-6);
    }
1✔
2043

2044
    /// Smoke-tests `print_metric_summary` with metric tuples for two sources;
    /// the test passes as long as the call does not panic.
    #[cfg(feature = "extended-metrics")]
    #[test]
    fn print_metric_summary_includes_multi_source_aggregate() {
        let mut per_source_metrics = HashMap::new();
        per_source_metrics.insert(
            String::from("source_a"),
            vec![(0.9, 0.8, 0.2, 0.1), (0.8, 0.7, 0.3, 0.2)],
        );
        per_source_metrics.insert(
            String::from("source_b"),
            vec![(0.7, 0.6, 0.4, 0.3), (0.6, 0.5, 0.5, 0.4)],
        );

        print_metric_summary(&per_source_metrics);
    }
1✔
2060
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc