• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

jzombie / rust-triplets / 23565355220

25 Mar 2026 09:35PM UTC coverage: 94.941% (+0.1%) from 94.794%
23565355220

Pull #40

github

web-flow
Merge 455532c40 into 65addee9d
Pull Request #40: Refactor BM25 integration

3007 of 3092 new or added lines in 7 files covered. (97.25%)

2 existing lines in 1 file now uncovered.

15670 of 16505 relevant lines covered (94.94%)

133718.45 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

97.43
/src/example_apps.rs
1
// TODO: Consider extracting to a debug crate
2

3
use std::collections::HashMap;
4
use std::error::Error;
5
use std::path::PathBuf;
6
use std::sync::Arc;
7
use std::sync::Once;
8
use std::time::Instant;
9

10
use cache_manager::CacheRoot;
11
use clap::{Parser, ValueEnum, error::ErrorKind};
12

13
use crate::config::{ChunkingStrategy, SamplerConfig, TripletRecipe};
14
use crate::constants::cache::{MULTI_SOURCE_DEMO_GROUP, MULTI_SOURCE_DEMO_STORE_FILENAME};
15
use crate::data::ChunkView;
16
use crate::heuristics::{
17
    CapacityTotals, EFFECTIVE_NEGATIVES_PER_ANCHOR, EFFECTIVE_POSITIVES_PER_ANCHOR,
18
    estimate_source_split_capacity_from_counts, format_replay_factor, format_u128_with_commas,
19
    resolve_text_recipes_for_source, split_counts_for_total,
20
};
21
use crate::metrics::source_skew;
22
use crate::sampler::chunk_weight;
23
use crate::source::DataSource;
24
use crate::splits::{FileSplitStore, SplitLabel, SplitRatios, SplitStore};
25
use crate::{
26
    RecordChunk, SampleBatch, Sampler, SamplerError, SourceId, TextBatch, TextRecipe, TripletBatch,
27
    TripletSampler,
28
};
29

30
type DynSource = Box<dyn DataSource + 'static>;
31

32
fn managed_demo_split_store_path() -> Result<PathBuf, String> {
2✔
33
    let cache_root = CacheRoot::from_discovery()
2✔
34
        .map_err(|err| format!("failed discovering managed cache root: {err}"))?;
2✔
35
    let group = PathBuf::from(MULTI_SOURCE_DEMO_GROUP);
2✔
36
    let dir = cache_root.ensure_group(&group).map_err(|err| {
2✔
37
        format!(
×
38
            "failed creating managed demo cache group '{}': {err}",
39
            group.display()
×
40
        )
41
    })?;
×
42
    Ok(dir.join(MULTI_SOURCE_DEMO_STORE_FILENAME))
2✔
43
}
2✔
44

45
fn init_example_tracing() {
27✔
46
    static INIT: Once = Once::new();
47
    INIT.call_once(|| {
27✔
48
        let env_filter = tracing_subscriber::EnvFilter::try_from_default_env()
1✔
49
            .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("triplets=info"));
1✔
50
        let _ = tracing_subscriber::fmt()
1✔
51
            .with_env_filter(env_filter)
1✔
52
            .try_init();
1✔
53
    });
1✔
54
}
27✔
55

56
#[derive(Debug, Clone, Copy, ValueEnum)]
/// CLI split selector mapped onto `SplitLabel`.
// Used as the value type of `--split` on `MultiSourceDemoCli`; converted to
// `SplitLabel` through the `From<SplitArg>` impl below.
// NOTE(review): variant notes are plain `//` comments on purpose; clap's derive
// may surface `///` comments in generated help text, so confirm before converting.
enum SplitArg {
    // Maps to `SplitLabel::Train`.
    Train,
    // Maps to `SplitLabel::Validation`.
    Validation,
    // Maps to `SplitLabel::Test`.
    Test,
}
63

64
impl From<SplitArg> for SplitLabel {
65
    fn from(value: SplitArg) -> Self {
6✔
66
        match value {
6✔
67
            SplitArg::Train => SplitLabel::Train,
1✔
68
            SplitArg::Validation => SplitLabel::Validation,
4✔
69
            SplitArg::Test => SplitLabel::Test,
1✔
70
        }
71
    }
6✔
72
}
73

74
#[derive(Debug, Parser)]
#[command(
    name = "estimate_capacity",
    disable_help_subcommand = true,
    about = "Metadata-only capacity estimation",
    long_about = "Estimate record, pair, triplet, and text-sample capacity using source-reported counts only (no data refresh).",
    after_help = "Source roots are optional and resolved in order by explicit arg, environment variables, then project defaults."
)]
/// CLI arguments for metadata-only capacity estimation.
// Parsed by `run_estimate_capacity`, which prepends the program name
// "estimate_capacity" to the caller-supplied argument iterator.
// NOTE(review): field notes are plain `//` comments so they cannot leak into
// clap's generated help; the user-facing text lives in the `help = ...` values.
struct EstimateCapacityCli {
    // Copied into `SamplerConfig::seed` and echoed in the report header.
    #[arg(
        long,
        default_value_t = 99,
        help = "Deterministic seed used for split allocation"
    )]
    seed: u64,
    // Parsed by `parse_split_ratios_arg`; used both as `SamplerConfig::split`
    // and to divide each source's reported record count across splits.
    #[arg(
        long = "split-ratios",
        value_name = "TRAIN,VALIDATION,TEST",
        value_parser = parse_split_ratios_arg,
        default_value = "0.8,0.1,0.1",
        help = "Comma-separated split ratios that must sum to 1.0"
    )]
    split: SplitRatios,
    // Passed unchanged to the injected `resolve_roots` callback.
    #[arg(
        long = "source-root",
        value_name = "PATH",
        help = "Optional source root override, repeat as needed in source order"
    )]
    source_roots: Vec<String>,
}
105

106
#[derive(Debug, Parser)]
#[command(
    name = "multi_source_demo",
    disable_help_subcommand = true,
    about = "Run sampled batches from multiple sources",
    long_about = "Sample triplet, pair, or text batches from multiple sources and persist split/epoch state.",
    after_help = "Source roots are optional and resolved in order by explicit arg, environment variables, then project defaults."
)]
/// CLI for `multi_source_demo`.
///
/// Common usage:
/// - Use managed cache-group default path (no flag)
/// - Set an explicit file path: `--split-store-path /tmp/split_store.bin`
/// - Repeat `--source-root <PATH>` to override source roots in order
// Mode flags are checked by `run_multi_source_demo` in this priority order:
// `--pair-batch`, `--text-recipes`, `--list-text-recipes`, `--batches`; when
// none is set a single triplet batch is sampled. Lower-priority mode flags are
// ignored when a higher-priority one is present.
// NOTE(review): field notes are plain `//` comments so they cannot leak into
// clap's generated help; the user-facing text lives in the `help = ...` values.
struct MultiSourceDemoCli {
    // Selects the text-batch mode (`next_text_batch`).
    #[arg(
        long = "text-recipes",
        help = "Emit a text batch instead of a triplet batch"
    )]
    show_text_samples: bool,
    // Selects the pair-batch mode (`next_pair_batch`); highest-priority mode.
    #[arg(
        long = "pair-batch",
        help = "Emit a pair batch instead of a triplet batch"
    )]
    show_pair_samples: bool,
    // Prints the sampler's registered text recipes instead of sampling.
    #[arg(
        long = "list-text-recipes",
        help = "Print registered text recipes and exit"
    )]
    list_text_recipes: bool,
    // Copied into `SamplerConfig::batch_size`; validated by `parse_batch_size`.
    #[arg(
        long = "batch-size",
        default_value_t = 4,
        value_parser = parse_batch_size,
        help = "Batch size used for sampling"
    )]
    batch_size: usize,
    // Copied into `SamplerConfig::ingestion_max_records`; the default comes from
    // `SamplerConfig::default()` via `default_ingestion_max_records`.
    #[arg(
        long = "ingestion-max-records",
        default_value_t = default_ingestion_max_records(),
        value_parser = parse_ingestion_max_records,
        help = "Per-source ingestion buffer target used while refreshing records"
    )]
    ingestion_max_records: usize,
    // When absent, the seed from `SamplerConfig::default()` is kept.
    #[arg(long, help = "Optional deterministic seed override")]
    seed: Option<u64>,
    // When absent, `SplitLabel::Train` is sampled.
    #[arg(long, value_enum, help = "Target split to sample from")]
    split: Option<SplitArg>,
    // Passed unchanged to the injected `resolve_roots` callback.
    #[arg(
        long = "source-root",
        value_name = "PATH",
        help = "Optional source root override, repeat as needed in source order"
    )]
    source_roots: Vec<String>,
    // When absent, the path is resolved by `managed_demo_split_store_path`.
    #[arg(
        long = "split-store-path",
        value_name = "SPLIT_STORE_PATH",
        help = "Optional explicit path for persisted split/epoch state file"
    )]
    split_store_path: Option<PathBuf>,
    // The store file is removed only if it already exists at the resolved path.
    #[arg(
        long = "reset",
        help = "Delete the persisted split/epoch state before sampling, restarting from epoch 0"
    )]
    reset: bool,
    // Selects the benchmark mode; validated by `parse_batch_count`.
    #[arg(
        long = "batches",
        value_name = "N",
        value_parser = parse_batch_count,
        help = "Run N triplet batches in succession, printing a timing line per batch and (with --features extended-metrics) a per-source similarity summary at the end"
    )]
    batches: Option<usize>,
}
179

180
#[derive(Debug, Clone)]
/// Source-level inventory used by capacity estimation output.
///
/// One value is built per source in `run_estimate_capacity` before any
/// capacity math is performed.
struct SourceInventory {
    /// String form of the source's `id()`; used as the key for per-source lookups.
    source_id: String,
    /// Record count returned by the source's `reported_record_count`.
    reported_records: u128,
    /// Recipes used for estimation: the sampler config's recipes when that list
    /// is non-empty, otherwise the source's `default_triplet_recipes()`.
    triplet_recipes: Vec<TripletRecipe>,
}
187

188
/// Run the capacity-estimation CLI with injectable root resolution/source builders.
189
///
190
/// `build_sources` is construction-only; sampler configuration is applied
191
/// centrally by this function before any source calls.
192
pub fn run_estimate_capacity<R, Resolve, Build, I>(
6✔
193
    args_iter: I,
6✔
194
    resolve_roots: Resolve,
6✔
195
    build_sources: Build,
6✔
196
) -> Result<(), Box<dyn Error>>
6✔
197
where
6✔
198
    Resolve: FnOnce(Vec<String>) -> Result<R, Box<dyn Error>>,
6✔
199
    Build: FnOnce(&R) -> Vec<DynSource>,
6✔
200
    I: Iterator<Item = String>,
6✔
201
{
202
    init_example_tracing();
6✔
203

204
    let Some(cli) = parse_cli::<EstimateCapacityCli, _>(
6✔
205
        std::iter::once("estimate_capacity".to_string()).chain(args_iter),
6✔
206
    )?
×
207
    else {
208
        return Ok(());
1✔
209
    };
210

211
    let roots = resolve_roots(cli.source_roots)?;
5✔
212

213
    let config = SamplerConfig {
4✔
214
        seed: cli.seed,
4✔
215
        split: cli.split,
4✔
216
        ..SamplerConfig::default()
4✔
217
    };
4✔
218

219
    let sources = build_sources(&roots);
4✔
220

221
    let mut inventories = Vec::new();
4✔
222
    for source in &sources {
4✔
223
        let recipes = if config.recipes.is_empty() {
3✔
224
            source.default_triplet_recipes()
3✔
225
        } else {
226
            config.recipes.clone()
×
227
        };
228
        let reported_records = source.reported_record_count(&config).map_err(|err| {
3✔
229
            format!(
1✔
230
                "source '{}' failed to report exact record count: {err}",
231
                source.id()
1✔
232
            )
233
        })?;
1✔
234
        inventories.push(SourceInventory {
2✔
235
            source_id: source.id().to_string(),
2✔
236
            reported_records,
2✔
237
            triplet_recipes: recipes,
2✔
238
        });
2✔
239
    }
240

241
    let mut per_source_split_counts: HashMap<(String, SplitLabel), u128> = HashMap::new();
3✔
242
    let mut split_record_counts: HashMap<SplitLabel, u128> = HashMap::new();
3✔
243

244
    for source in &inventories {
3✔
245
        let counts = split_counts_for_total(source.reported_records, cli.split);
2✔
246
        for (label, count) in counts {
6✔
247
            per_source_split_counts.insert((source.source_id.clone(), label), count);
6✔
248
            *split_record_counts.entry(label).or_insert(0) += count;
6✔
249
        }
6✔
250
    }
251

252
    let mut totals_by_split: HashMap<SplitLabel, CapacityTotals> = HashMap::new();
3✔
253
    let mut totals_by_source_and_split: HashMap<(String, SplitLabel), CapacityTotals> =
3✔
254
        HashMap::new();
3✔
255

256
    for split_label in [SplitLabel::Train, SplitLabel::Validation, SplitLabel::Test] {
9✔
257
        let mut totals = CapacityTotals::default();
9✔
258

259
        for source in &inventories {
9✔
260
            let source_split_records = per_source_split_counts
6✔
261
                .get(&(source.source_id.clone(), split_label))
6✔
262
                .copied()
6✔
263
                .unwrap_or(0);
6✔
264

6✔
265
            let triplet_recipes = &source.triplet_recipes;
6✔
266
            let text_recipes = resolve_text_recipes_for_source(&config, triplet_recipes);
6✔
267

6✔
268
            let capacity = estimate_source_split_capacity_from_counts(
6✔
269
                source_split_records,
6✔
270
                triplet_recipes,
6✔
271
                &text_recipes,
6✔
272
            );
6✔
273

6✔
274
            totals_by_source_and_split.insert((source.source_id.clone(), split_label), capacity);
6✔
275

6✔
276
            totals.triplets += capacity.triplets;
6✔
277
            totals.effective_triplets += capacity.effective_triplets;
6✔
278
            totals.pairs += capacity.pairs;
6✔
279
            totals.text_samples += capacity.text_samples;
6✔
280
        }
6✔
281

282
        totals_by_split.insert(split_label, totals);
9✔
283
    }
284

285
    let min_nonzero_records_by_split: HashMap<SplitLabel, u128> =
3✔
286
        [SplitLabel::Train, SplitLabel::Validation, SplitLabel::Test]
3✔
287
            .into_iter()
3✔
288
            .map(|split_label| {
9✔
289
                let min_nonzero = inventories
9✔
290
                    .iter()
9✔
291
                    .filter_map(|source| {
9✔
292
                        per_source_split_counts
6✔
293
                            .get(&(source.source_id.clone(), split_label))
6✔
294
                            .copied()
6✔
295
                    })
6✔
296
                    .filter(|&records| records > 0)
9✔
297
                    .min()
9✔
298
                    .unwrap_or(0);
9✔
299
                (split_label, min_nonzero)
9✔
300
            })
9✔
301
            .collect();
3✔
302

303
    let min_nonzero_records_all_splits = inventories
3✔
304
        .iter()
3✔
305
        .map(|source| source.reported_records)
3✔
306
        .filter(|&records| records > 0)
3✔
307
        .min()
3✔
308
        .unwrap_or(0);
3✔
309

310
    println!("=== capacity estimate (length-only) ===");
3✔
311
    println!("mode: metadata-only (no source.refresh calls)");
3✔
312
    println!("classification: heuristic approximation (not exact)");
3✔
313
    println!("split seed: {}", cli.seed);
3✔
314
    println!(
3✔
315
        "split ratios: train={:.4}, validation={:.4}, test={:.4}",
316
        cli.split.train, cli.split.validation, cli.split.test
317
    );
318
    println!();
3✔
319

320
    println!("[SOURCES]");
3✔
321
    for source in &inventories {
3✔
322
        println!(
2✔
323
            "  {} => reported records: {}",
2✔
324
            source.source_id,
2✔
325
            format_u128_with_commas(source.reported_records)
2✔
326
        );
2✔
327
    }
2✔
328
    println!();
3✔
329

330
    println!("[PER SOURCE BREAKDOWN]");
3✔
331
    for source in &inventories {
3✔
332
        println!("  {}", source.source_id);
2✔
333
        let mut source_grand = CapacityTotals::default();
2✔
334
        let mut source_total_records = 0u128;
2✔
335
        for split_label in [SplitLabel::Train, SplitLabel::Validation, SplitLabel::Test] {
6✔
336
            let split_records = per_source_split_counts
6✔
337
                .get(&(source.source_id.clone(), split_label))
6✔
338
                .copied()
6✔
339
                .unwrap_or(0);
6✔
340
            source_total_records = source_total_records.saturating_add(split_records);
6✔
341
            let split_longest_records = inventories
6✔
342
                .iter()
6✔
343
                .map(|candidate| {
6✔
344
                    per_source_split_counts
6✔
345
                        .get(&(candidate.source_id.clone(), split_label))
6✔
346
                        .copied()
6✔
347
                        .unwrap_or(0)
6✔
348
                })
6✔
349
                .max()
6✔
350
                .unwrap_or(0);
6✔
351
            let totals = totals_by_source_and_split
6✔
352
                .get(&(source.source_id.clone(), split_label))
6✔
353
                .copied()
6✔
354
                .unwrap_or_default();
6✔
355
            source_grand.triplets += totals.triplets;
6✔
356
            source_grand.effective_triplets += totals.effective_triplets;
6✔
357
            source_grand.pairs += totals.pairs;
6✔
358
            source_grand.text_samples += totals.text_samples;
6✔
359
            println!("    [{:?}]", split_label);
6✔
360
            println!("      records: {}", format_u128_with_commas(split_records));
6✔
361
            println!(
6✔
362
                "      triplet combinations: {}",
363
                format_u128_with_commas(totals.triplets)
6✔
364
            );
365
            println!(
6✔
366
                "      effective sampled triplets (p={}, k={}): {}",
367
                EFFECTIVE_POSITIVES_PER_ANCHOR,
368
                EFFECTIVE_NEGATIVES_PER_ANCHOR,
369
                format_u128_with_commas(totals.effective_triplets)
6✔
370
            );
371
            println!(
6✔
372
                "      pair combinations:    {}",
373
                format_u128_with_commas(totals.pairs)
6✔
374
            );
375
            println!(
6✔
376
                "      text samples:         {}",
377
                format_u128_with_commas(totals.text_samples)
6✔
378
            );
379
            println!(
6✔
380
                "      replay factor vs longest source: {}",
381
                format_replay_factor(split_longest_records, split_records)
6✔
382
            );
383
            println!(
6✔
384
                "      suggested proportional-size batch weight (0-1): {:.4}",
385
                suggested_balancing_weight(split_longest_records, split_records)
6✔
386
            );
387
            let split_smallest_nonzero = min_nonzero_records_by_split
6✔
388
                .get(&split_label)
6✔
389
                .copied()
6✔
390
                .unwrap_or(0);
6✔
391
            println!(
6✔
392
                "      suggested small-source-boost batch weight (0-1): {:.4}",
393
                suggested_oversampling_weight(split_smallest_nonzero, split_records)
6✔
394
            );
395
            println!();
6✔
396
        }
397
        let longest_source_total = inventories
2✔
398
            .iter()
2✔
399
            .map(|candidate| candidate.reported_records)
2✔
400
            .max()
2✔
401
            .unwrap_or(0);
2✔
402
        println!("    [ALL SPLITS FOR SOURCE]");
2✔
403
        println!(
2✔
404
            "      triplet combinations: {}",
405
            format_u128_with_commas(source_grand.triplets)
2✔
406
        );
407
        println!(
2✔
408
            "      effective sampled triplets (p={}, k={}): {}",
409
            EFFECTIVE_POSITIVES_PER_ANCHOR,
410
            EFFECTIVE_NEGATIVES_PER_ANCHOR,
411
            format_u128_with_commas(source_grand.effective_triplets)
2✔
412
        );
413
        println!(
2✔
414
            "      pair combinations:    {}",
415
            format_u128_with_commas(source_grand.pairs)
2✔
416
        );
417
        println!(
2✔
418
            "      text samples:         {}",
419
            format_u128_with_commas(source_grand.text_samples)
2✔
420
        );
421
        println!(
2✔
422
            "      replay factor vs longest source: {}",
423
            format_replay_factor(longest_source_total, source_total_records)
2✔
424
        );
425
        println!(
2✔
426
            "      suggested proportional-size batch weight (0-1): {:.4}",
427
            suggested_balancing_weight(longest_source_total, source_total_records)
2✔
428
        );
429
        println!(
2✔
430
            "      suggested small-source-boost batch weight (0-1): {:.4}",
431
            suggested_oversampling_weight(min_nonzero_records_all_splits, source_total_records)
2✔
432
        );
433
        println!();
2✔
434
    }
435

436
    let mut grand = CapacityTotals::default();
3✔
437
    for split_label in [SplitLabel::Train, SplitLabel::Validation, SplitLabel::Test] {
9✔
438
        let record_count = split_record_counts.get(&split_label).copied().unwrap_or(0);
9✔
439
        let totals = totals_by_split
9✔
440
            .get(&split_label)
9✔
441
            .copied()
9✔
442
            .unwrap_or_default();
9✔
443

9✔
444
        grand.triplets += totals.triplets;
9✔
445
        grand.effective_triplets += totals.effective_triplets;
9✔
446
        grand.pairs += totals.pairs;
9✔
447
        grand.text_samples += totals.text_samples;
9✔
448

9✔
449
        println!("[{:?}]", split_label);
9✔
450
        println!("  records: {}", format_u128_with_commas(record_count));
9✔
451
        println!(
9✔
452
            "  triplet combinations: {}",
9✔
453
            format_u128_with_commas(totals.triplets)
9✔
454
        );
9✔
455
        println!(
9✔
456
            "  effective sampled triplets (p={}, k={}): {}",
9✔
457
            EFFECTIVE_POSITIVES_PER_ANCHOR,
9✔
458
            EFFECTIVE_NEGATIVES_PER_ANCHOR,
9✔
459
            format_u128_with_commas(totals.effective_triplets)
9✔
460
        );
9✔
461
        println!(
9✔
462
            "  pair combinations:    {}",
9✔
463
            format_u128_with_commas(totals.pairs)
9✔
464
        );
9✔
465
        println!(
9✔
466
            "  text samples:         {}",
9✔
467
            format_u128_with_commas(totals.text_samples)
9✔
468
        );
9✔
469
        println!();
9✔
470
    }
9✔
471

472
    println!("[ALL SPLITS TOTAL]");
3✔
473
    println!(
3✔
474
        "  triplet combinations: {}",
475
        format_u128_with_commas(grand.triplets)
3✔
476
    );
477
    println!(
3✔
478
        "  effective sampled triplets (p={}, k={}): {}",
479
        EFFECTIVE_POSITIVES_PER_ANCHOR,
480
        EFFECTIVE_NEGATIVES_PER_ANCHOR,
481
        format_u128_with_commas(grand.effective_triplets)
3✔
482
    );
483
    println!(
3✔
484
        "  pair combinations:    {}",
485
        format_u128_with_commas(grand.pairs)
3✔
486
    );
487
    println!(
3✔
488
        "  text samples:         {}",
489
        format_u128_with_commas(grand.text_samples)
3✔
490
    );
491
    println!();
3✔
492
    println!(
3✔
493
        "Note: counts are heuristic, length-based estimates from source-reported totals and recipe structure. They are approximate, not exact, and assume anchor-positive pairs=records (one positive per anchor by default), negatives=source_records_in_split-1 (anchor excluded as its own negative), and at most one chunk/window realization per sample. In real-world chunked sampling, practical combinations are often higher, so treat this as a floor-like baseline."
494
    );
495
    println!();
3✔
496
    println!(
3✔
497
        "Effective sampled triplets apply a bounded training assumption: effective_triplets = records * p * k per triplet recipe, with defaults p={} positives per anchor and k={} negatives per anchor.",
498
        EFFECTIVE_POSITIVES_PER_ANCHOR, EFFECTIVE_NEGATIVES_PER_ANCHOR
499
    );
500
    println!();
3✔
501
    println!(
3✔
502
        "Oversample loops are not inferred from this static report. To measure true oversampling (how many times sampling loops through the combination space), use observed sampled draw counts from an actual run."
503
    );
504
    println!();
3✔
505
    println!(
3✔
506
        "Suggested proportional-size batch weight (0-1) is source/max_source by record count: 1.0 for the largest source in scope, smaller values for smaller sources."
507
    );
508
    println!();
3✔
509
    println!(
3✔
510
        "Suggested small-source-boost batch weight (0-1) is min_nonzero_source/source by record count: 1.0 for the smallest non-zero source in scope, smaller values for larger sources."
511
    );
512
    println!();
3✔
513
    println!(
3✔
514
        "When passed to next_*_batch_with_weights, higher weight means that source is sampled more often relative to lower-weight sources."
515
    );
516

517
    Ok(())
3✔
518
}
6✔
519

520
/// Run the multi-source demo CLI with injectable root resolution/source builders.
521
///
522
/// `build_sources` is construction-only. Source sampler configuration is owned
523
/// by sampler registration (`TripletSampler::register_source`).
524
pub fn run_multi_source_demo<R, Resolve, Build, I>(
21✔
525
    args_iter: I,
21✔
526
    resolve_roots: Resolve,
21✔
527
    build_sources: Build,
21✔
528
) -> Result<(), Box<dyn Error>>
21✔
529
where
21✔
530
    Resolve: FnOnce(Vec<String>) -> Result<R, Box<dyn Error>>,
21✔
531
    Build: FnOnce(&R) -> Vec<DynSource>,
21✔
532
    I: Iterator<Item = String>,
21✔
533
{
534
    init_example_tracing();
21✔
535

536
    let Some(cli) = parse_cli::<MultiSourceDemoCli, _>(
21✔
537
        std::iter::once("multi_source_demo".to_string()).chain(args_iter),
21✔
538
    )?
×
539
    else {
540
        return Ok(());
1✔
541
    };
542

543
    let roots = resolve_roots(cli.source_roots)?;
20✔
544

545
    let mut config = SamplerConfig::default();
18✔
546
    config.seed = cli.seed.unwrap_or(config.seed);
18✔
547
    config.batch_size = cli.batch_size;
18✔
548
    config.ingestion_max_records = cli.ingestion_max_records;
18✔
549
    config.chunking = Default::default();
18✔
550
    let selected_split = cli.split.map(Into::into).unwrap_or(SplitLabel::Train);
18✔
551
    config.split = SplitRatios::default();
18✔
552
    config.allowed_splits = vec![selected_split];
18✔
553
    let chunking = config.chunking.clone();
18✔
554
    let config_snapshot = MultiSourceDemoConfigSnapshot {
18✔
555
        seed: config.seed,
18✔
556
        batch_size: config.batch_size,
18✔
557
        ingestion_max_records: config.ingestion_max_records,
18✔
558
        split: selected_split,
18✔
559
        split_ratios: config.split,
18✔
560
        max_window_tokens: config.chunking.max_window_tokens,
18✔
561
        overlap_tokens: config.chunking.overlap_tokens.clone(),
18✔
562
        summary_fallback_tokens: config.chunking.summary_fallback_tokens,
18✔
563
    };
18✔
564

565
    let split_store_path = if let Some(path) = cli.split_store_path {
18✔
566
        path
17✔
567
    } else {
568
        managed_demo_split_store_path().map_err(|err| {
1✔
569
            Box::<dyn Error>::from(format!("failed to resolve demo split-store path: {err}"))
×
570
        })?
×
571
    };
572

573
    if cli.reset && split_store_path.exists() {
18✔
574
        std::fs::remove_file(&split_store_path).map_err(|err| {
2✔
575
            Box::<dyn Error>::from(format!(
1✔
576
                "failed to remove split store '{}': {err}",
1✔
577
                split_store_path.display()
1✔
578
            ))
1✔
579
        })?;
1✔
580
        println!("Reset: removed {}", split_store_path.display());
1✔
581
    }
16✔
582
    println!(
17✔
583
        "Persisting split assignments and epoch state to {}",
584
        split_store_path.display()
17✔
585
    );
586
    let sources = build_sources(&roots);
17✔
587
    let split_store = Arc::new(FileSplitStore::open(&split_store_path, config.split, 99)?);
17✔
588
    let sampler = TripletSampler::new(config, split_store.clone());
17✔
589
    for source in sources {
17✔
590
        sampler.register_source(source);
16✔
591
    }
16✔
592

593
    if cli.show_pair_samples {
17✔
594
        match sampler.next_pair_batch(selected_split) {
6✔
595
            Ok(pair_batch) => {
2✔
596
                if pair_batch.pairs.is_empty() {
2✔
597
                    println!("Pair sampling produced no results.");
×
598
                } else {
2✔
599
                    print_pair_batch(&chunking, &pair_batch, split_store.as_ref());
2✔
600
                }
2✔
601
                sampler.save_sampler_state(None)?;
2✔
602
            }
603
            Err(SamplerError::Exhausted(name)) => {
4✔
604
                eprintln!(
4✔
605
                    "Pair sampler exhausted recipe '{}'. Ensure both positive and negative examples exist.",
4✔
606
                    name
4✔
607
                );
4✔
608
            }
4✔
609
            Err(err) => return Err(err.into()),
×
610
        }
611
    } else if cli.show_text_samples {
11✔
612
        match sampler.next_text_batch(selected_split) {
3✔
613
            Ok(text_batch) => {
1✔
614
                if text_batch.samples.is_empty() {
1✔
615
                    println!(
×
616
                        "Text sampling produced no results. Ensure each source has eligible sections."
×
617
                    );
×
618
                } else {
1✔
619
                    print_text_batch(&chunking, &text_batch, split_store.as_ref());
1✔
620
                }
1✔
621
                sampler.save_sampler_state(None)?;
1✔
622
            }
623
            Err(SamplerError::Exhausted(name)) => {
2✔
624
                eprintln!(
2✔
625
                    "Text sampler exhausted selector '{}'. Ensure matching sections exist.",
2✔
626
                    name
2✔
627
                );
2✔
628
            }
2✔
629
            Err(err) => return Err(err.into()),
×
630
        }
631
    } else if cli.list_text_recipes {
8✔
632
        let recipes = sampler.text_recipes();
4✔
633
        if recipes.is_empty() {
4✔
634
            println!(
2✔
635
                "No text recipes registered. Ensure your sources expose triplet selectors or configure text_recipes explicitly."
2✔
636
            );
2✔
637
        } else {
2✔
638
            print_text_recipes(&recipes);
2✔
639
        }
2✔
640
    } else if let Some(batch_count) = cli.batches {
4✔
641
        print_demo_config(&config_snapshot);
1✔
642
        println!("=== benchmark: {} triplet batches ===", batch_count);
1✔
643

644
        // source_id -> Vec<(pos_jaccard, pos_byte_cosine, neg_jaccard, neg_byte_cosine)>
645
        #[cfg(feature = "extended-metrics")]
646
        let mut source_metrics: HashMap<String, Vec<(f32, f32, f32, f32)>> = HashMap::new();
1✔
647

648
        for i in 0..batch_count {
2✔
649
            let t0 = Instant::now();
2✔
650
            match sampler.next_triplet_batch(selected_split) {
2✔
651
                Ok(batch) => {
2✔
652
                    let elapsed = t0.elapsed();
2✔
653
                    let n = batch.triplets.len();
2✔
654
                    println!(
2✔
655
                        "batch {:>4}  triplets={:<4}  elapsed={:>8.2}ms  per_triplet={:.2}ms",
656
                        i + 1,
2✔
657
                        n,
658
                        elapsed.as_secs_f64() * 1000.0,
2✔
659
                        if n > 0 {
2✔
660
                            elapsed.as_secs_f64() * 1000.0 / n as f64
2✔
661
                        } else {
NEW
662
                            0.0
×
663
                        },
664
                    );
665
                    #[cfg(feature = "extended-metrics")]
666
                    {
667
                        use crate::metrics::lexical_similarity_scores;
668
                        for triplet in &batch.triplets {
8✔
669
                            let (pj, pc) = lexical_similarity_scores(
8✔
670
                                &triplet.anchor.text,
8✔
671
                                &triplet.positive.text,
8✔
672
                            );
8✔
673
                            let (nj, nc) = lexical_similarity_scores(
8✔
674
                                &triplet.anchor.text,
8✔
675
                                &triplet.negative.text,
8✔
676
                            );
8✔
677
                            let source = extract_source(&triplet.anchor.record_id);
8✔
678
                            source_metrics
8✔
679
                                .entry(source)
8✔
680
                                .or_default()
8✔
681
                                .push((pj, pc, nj, nc));
8✔
682
                        }
8✔
683
                    }
684
                }
NEW
685
                Err(SamplerError::Exhausted(name)) => {
×
NEW
686
                    println!(
×
687
                        "batch {:>4}  exhausted recipe '{}' — stopping early",
NEW
688
                        i + 1,
×
689
                        name
690
                    );
NEW
691
                    break;
×
692
                }
NEW
693
                Err(err) => return Err(err.into()),
×
694
            }
695
        }
696

697
        sampler.save_sampler_state(None)?;
1✔
698

699
        #[cfg(feature = "extended-metrics")]
700
        if !source_metrics.is_empty() {
1✔
701
            println!();
1✔
702
            print_metric_summary(&source_metrics);
1✔
703
        }
1✔
704

705
        #[cfg(all(feature = "extended-metrics", feature = "bm25-mining"))]
706
        {
707
            let (fallback, total) = sampler.bm25_fallback_stats();
1✔
708
            if total > 0 {
1✔
709
                let pct = fallback as f64 / total as f64 * 100.0;
1✔
710
                println!("bm25 fallback rate : {}/{} ({:.1}%)", fallback, total, pct);
1✔
711
            }
1✔
712
        }
713
    } else {
714
        match sampler.next_triplet_batch(selected_split) {
3✔
715
            Ok(triplet_batch) => {
×
716
                if triplet_batch.triplets.is_empty() {
×
717
                    println!(
×
718
                        "Triplet sampling produced no results. Ensure multiple records per source exist."
×
719
                    );
×
720
                } else {
×
721
                    print_triplet_batch(&chunking, &triplet_batch, split_store.as_ref());
×
722
                }
×
723
                sampler.save_sampler_state(None)?;
×
724
                #[cfg(all(feature = "extended-metrics", feature = "bm25-mining"))]
725
                {
NEW
726
                    let (fallback, total) = sampler.bm25_fallback_stats();
×
NEW
727
                    if total > 0 {
×
NEW
728
                        let pct = fallback as f64 / total as f64 * 100.0;
×
NEW
729
                        println!("bm25 fallback rate : {}/{} ({:.1}%)", fallback, total, pct);
×
NEW
730
                    }
×
731
                }
732
            }
733
            Err(SamplerError::Exhausted(name)) => {
3✔
734
                eprintln!(
3✔
735
                    "Triplet sampler exhausted recipe '{}'. Ensure both positive and negative examples exist.",
3✔
736
                    name
3✔
737
                );
3✔
738
            }
3✔
739
            Err(err) => return Err(err.into()),
×
740
        }
741
    }
742

743
    Ok(())
17✔
744
}
21✔
745

746
struct MultiSourceDemoConfigSnapshot {
747
    seed: u64,
748
    batch_size: usize,
749
    ingestion_max_records: usize,
750
    split: SplitLabel,
751
    split_ratios: SplitRatios,
752
    max_window_tokens: usize,
753
    overlap_tokens: Vec<usize>,
754
    summary_fallback_tokens: usize,
755
}
756

757
fn print_demo_config(cfg: &MultiSourceDemoConfigSnapshot) {
1✔
758
    let overlaps: Vec<String> = cfg.overlap_tokens.iter().map(|t| t.to_string()).collect();
1✔
759
    println!("=== sampler config ===");
1✔
760
    println!("seed                 : {}", cfg.seed);
1✔
761
    println!("batch_size           : {}", cfg.batch_size);
1✔
762
    println!("ingestion_max_records: {}", cfg.ingestion_max_records);
1✔
763
    println!("split                : {:?}", cfg.split);
1✔
764
    println!(
1✔
765
        "split_ratios         : train={:.2} val={:.2} test={:.2}",
766
        cfg.split_ratios.train, cfg.split_ratios.validation, cfg.split_ratios.test
767
    );
768
    println!("max_window_tokens    : {}", cfg.max_window_tokens);
1✔
769
    println!("overlap_tokens       : [{}]", overlaps.join(", "));
1✔
770
    println!(
1✔
771
        "summary_fallback     : {} tokens (0 = disabled)",
772
        cfg.summary_fallback_tokens
773
    );
774
    println!();
1✔
775
}
1✔
776

777
fn default_ingestion_max_records() -> usize {
1✔
778
    SamplerConfig::default().ingestion_max_records
1✔
779
}
1✔
780

781
/// Parses `raw` as a strictly positive `usize`.
///
/// `flag` is the flag name used in the error text.
///
/// # Errors
/// Returns a message when `raw` is not a valid `usize` or when it is zero.
fn parse_positive_usize_flag(raw: &str, flag: &str) -> Result<usize, String> {
    match raw.parse::<usize>() {
        Ok(0) => Err(format!("{} must be greater than zero", flag)),
        Ok(value) => Ok(value),
        Err(_) => Err(format!(
            "Could not parse {} value '{}' as a positive integer",
            flag, raw
        )),
    }
}
51✔
793

794
fn parse_batch_size(raw: &str) -> Result<usize, String> {
25✔
795
    parse_positive_usize_flag(raw, "--batch-size")
25✔
796
}
25✔
797

798
fn parse_ingestion_max_records(raw: &str) -> Result<usize, String> {
24✔
799
    parse_positive_usize_flag(raw, "--ingestion-max-records")
24✔
800
}
24✔
801

802
fn parse_batch_count(raw: &str) -> Result<usize, String> {
2✔
803
    parse_positive_usize_flag(raw, "--batches")
2✔
804
}
2✔
805

806
/// Returns `source_baseline / max_baseline`, clamped to `[0.0, 1.0]`.
///
/// Yields `0.0` when either baseline is zero.
fn suggested_balancing_weight(max_baseline: u128, source_baseline: u128) -> f32 {
    match (max_baseline, source_baseline) {
        (0, _) | (_, 0) => 0.0,
        (largest, own) => {
            let ratio = own as f64 / largest as f64;
            ratio.clamp(0.0, 1.0) as f32
        }
    }
}
13✔
812

813
/// Returns `min_nonzero_baseline / source_baseline`, clamped to `[0.0, 1.0]`.
///
/// Yields `0.0` when either baseline is zero.
fn suggested_oversampling_weight(min_nonzero_baseline: u128, source_baseline: u128) -> f32 {
    match (min_nonzero_baseline, source_baseline) {
        (0, _) | (_, 0) => 0.0,
        (smallest, own) => {
            let ratio = smallest as f64 / own as f64;
            ratio.clamp(0.0, 1.0) as f32
        }
    }
}
13✔
819

820
fn parse_cli<T, I>(args: I) -> Result<Option<T>, Box<dyn Error>>
34✔
821
where
34✔
822
    T: Parser,
34✔
823
    I: IntoIterator,
34✔
824
    I::Item: Into<std::ffi::OsString> + Clone,
34✔
825
{
826
    match T::try_parse_from(args) {
34✔
827
        Ok(cli) => Ok(Some(cli)),
26✔
828
        Err(err) => match err.kind() {
8✔
829
            ErrorKind::DisplayHelp | ErrorKind::DisplayVersion => {
830
                err.print()?;
5✔
831
                Ok(None)
5✔
832
            }
833
            _ => Err(err.into()),
3✔
834
        },
835
    }
836
}
34✔
837

838
fn parse_split_ratios_arg(raw: &str) -> Result<SplitRatios, String> {
12✔
839
    let parts: Vec<&str> = raw.split(',').collect();
12✔
840
    if parts.len() != 3 {
12✔
841
        return Err("--split-ratios expects exactly 3 comma-separated values".to_string());
1✔
842
    }
11✔
843
    let train = parts[0]
11✔
844
        .trim()
11✔
845
        .parse::<f32>()
11✔
846
        .map_err(|_| format!("invalid train ratio '{}': must be a float", parts[0].trim()))?;
11✔
847
    let validation = parts[1].trim().parse::<f32>().map_err(|_| {
10✔
848
        format!(
1✔
849
            "invalid validation ratio '{}': must be a float",
850
            parts[1].trim()
1✔
851
        )
852
    })?;
1✔
853
    let test = parts[2]
9✔
854
        .trim()
9✔
855
        .parse::<f32>()
9✔
856
        .map_err(|_| format!("invalid test ratio '{}': must be a float", parts[2].trim()))?;
9✔
857
    let ratios = SplitRatios {
8✔
858
        train,
8✔
859
        validation,
8✔
860
        test,
8✔
861
    };
8✔
862
    let sum = ratios.train + ratios.validation + ratios.test;
8✔
863
    if (sum - 1.0).abs() > 1e-5 {
8✔
864
        return Err(format!(
1✔
865
            "split ratios must sum to 1.0, got {:.6} (train={}, validation={}, test={})",
1✔
866
            sum, ratios.train, ratios.validation, ratios.test
1✔
867
        ));
1✔
868
    }
7✔
869
    if ratios.train < 0.0 || ratios.validation < 0.0 || ratios.test < 0.0 {
7✔
870
        return Err("split ratios must be non-negative".to_string());
1✔
871
    }
6✔
872
    Ok(ratios)
6✔
873
}
12✔
874

875
fn print_triplet_batch(
1✔
876
    strategy: &ChunkingStrategy,
1✔
877
    batch: &TripletBatch,
1✔
878
    split_store: &impl SplitStore,
1✔
879
) {
1✔
880
    println!("=== triplet batch ===");
1✔
881
    for (idx, triplet) in batch.triplets.iter().enumerate() {
1✔
882
        println!("--- triplet #{} ---", idx);
1✔
883
        println!("recipe       : {}", triplet.recipe);
1✔
884
        println!("sample_weight: {:.4}", triplet.weight);
1✔
885
        if let Some(instr) = &triplet.instruction {
1✔
886
            println!("instruction shown to model:\n{}\n", instr);
1✔
887
        }
1✔
888
        #[cfg(feature = "extended-metrics")]
889
        let (pos_sim, neg_sim) = {
1✔
890
            use crate::metrics::lexical_similarity_scores;
891
            (
1✔
892
                Some(lexical_similarity_scores(
1✔
893
                    &triplet.anchor.text,
1✔
894
                    &triplet.positive.text,
1✔
895
                )),
1✔
896
                Some(lexical_similarity_scores(
1✔
897
                    &triplet.anchor.text,
1✔
898
                    &triplet.negative.text,
1✔
899
                )),
1✔
900
            )
1✔
901
        };
902
        #[cfg(not(feature = "extended-metrics"))]
903
        let (pos_sim, neg_sim): (Option<(f32, f32)>, Option<(f32, f32)>) = (None, None);
904
        print_chunk_block("ANCHOR", &triplet.anchor, strategy, split_store, None);
1✔
905
        print_chunk_block(
1✔
906
            "POSITIVE",
1✔
907
            &triplet.positive,
1✔
908
            strategy,
1✔
909
            split_store,
1✔
910
            pos_sim,
1✔
911
        );
912
        print_chunk_block(
1✔
913
            "NEGATIVE",
1✔
914
            &triplet.negative,
1✔
915
            strategy,
1✔
916
            split_store,
1✔
917
            neg_sim,
1✔
918
        );
919
    }
920
    print_source_summary(
1✔
921
        "triplet anchors",
1✔
922
        batch
1✔
923
            .triplets
1✔
924
            .iter()
1✔
925
            .map(|triplet| triplet.anchor.record_id.as_str()),
1✔
926
    );
927
    print_recipe_context_by_source(
1✔
928
        "triplet recipes by source",
1✔
929
        batch
1✔
930
            .triplets
1✔
931
            .iter()
1✔
932
            .map(|triplet| (triplet.anchor.record_id.as_str(), triplet.recipe.as_str())),
1✔
933
    );
934
}
1✔
935

936
fn print_text_batch(strategy: &ChunkingStrategy, batch: &TextBatch, split_store: &impl SplitStore) {
2✔
937
    println!("=== text batch ===");
2✔
938
    for (idx, sample) in batch.samples.iter().enumerate() {
5✔
939
        println!("--- sample #{} ---", idx);
5✔
940
        println!("recipe       : {}", sample.recipe);
5✔
941
        println!("sample_weight: {:.4}", sample.weight);
5✔
942
        if let Some(instr) = &sample.instruction {
5✔
943
            println!("instruction shown to model:\n{}\n", instr);
1✔
944
        }
4✔
945
        print_chunk_block("TEXT", &sample.chunk, strategy, split_store, None);
5✔
946
    }
947
    print_source_summary(
2✔
948
        "text samples",
2✔
949
        batch
2✔
950
            .samples
2✔
951
            .iter()
2✔
952
            .map(|sample| sample.chunk.record_id.as_str()),
5✔
953
    );
954
    print_recipe_context_by_source(
2✔
955
        "text recipes by source",
2✔
956
        batch
2✔
957
            .samples
2✔
958
            .iter()
2✔
959
            .map(|sample| (sample.chunk.record_id.as_str(), sample.recipe.as_str())),
5✔
960
    );
961
}
2✔
962

963
fn print_pair_batch(
3✔
964
    strategy: &ChunkingStrategy,
3✔
965
    batch: &SampleBatch,
3✔
966
    split_store: &impl SplitStore,
3✔
967
) {
3✔
968
    println!("=== pair batch ===");
3✔
969
    for (idx, pair) in batch.pairs.iter().enumerate() {
9✔
970
        println!("--- pair #{} ---", idx);
9✔
971
        println!("recipe       : {}", pair.recipe);
9✔
972
        println!("label        : {:?}", pair.label);
9✔
973
        if let Some(reason) = &pair.reason {
9✔
974
            println!("reason       : {}", reason);
5✔
975
        }
5✔
976
        print_chunk_block("ANCHOR", &pair.anchor, strategy, split_store, None);
9✔
977
        print_chunk_block("OTHER", &pair.positive, strategy, split_store, None);
9✔
978
    }
979
    print_source_summary(
3✔
980
        "pair anchors",
3✔
981
        batch
3✔
982
            .pairs
3✔
983
            .iter()
3✔
984
            .map(|pair| pair.anchor.record_id.as_str()),
9✔
985
    );
986
    print_recipe_context_by_source(
3✔
987
        "pair recipes by source",
3✔
988
        batch
3✔
989
            .pairs
3✔
990
            .iter()
3✔
991
            .map(|pair| (pair.anchor.record_id.as_str(), pair.recipe.as_str())),
9✔
992
    );
993
}
3✔
994

995
fn print_text_recipes(recipes: &[TextRecipe]) {
3✔
996
    println!("=== available text recipes ===");
3✔
997
    for recipe in recipes {
7✔
998
        println!(
7✔
999
            "- {} (weight: {:.3}) selector={:?}",
1000
            recipe.name, recipe.weight, recipe.selector
1001
        );
1002
        if let Some(instr) = &recipe.instruction {
7✔
1003
            println!("  instruction: {}", instr);
1✔
1004
        }
6✔
1005
    }
1006
}
3✔
1007

1008
#[cfg(feature = "extended-metrics")]
1009
/// Computes `(mean, median)` of `vals`.
///
/// `vals` is sorted in place as a side effect. For an even number of values
/// the median is the average of the two middle values.
///
/// An empty slice yields `(NaN, NaN)`: the mean of nothing is `0.0 / 0.0`,
/// and the median is reported the same way instead of panicking on the
/// `len / 2 - 1` index underflow.
fn metric_mean_median(vals: &mut [f32]) -> (f32, f32) {
    if vals.is_empty() {
        return (f32::NAN, f32::NAN);
    }

    let mean = vals.iter().sum::<f32>() / vals.len() as f32;

    // `total_cmp` is a total order, so the sort stays well-defined even if a
    // NaN score is present.
    vals.sort_unstable_by(f32::total_cmp);

    let mid = vals.len() / 2;
    let median = if vals.len() % 2 == 1 {
        vals[mid]
    } else {
        (vals[mid - 1] + vals[mid]) / 2.0
    };
    (mean, median)
}
18✔
1019

1020
#[cfg(feature = "extended-metrics")]
1021
fn print_metric_summary(source_data: &HashMap<String, Vec<(f32, f32, f32, f32)>>) {
2✔
1022
    let total: usize = source_data.values().map(|v| v.len()).sum();
3✔
1023
    let n_sources = source_data.len();
2✔
1024
    println!(
2✔
1025
        "=== extended metrics summary ({} triplets, {} {}) ===",
1026
        total,
1027
        n_sources,
1028
        if n_sources == 1 { "source" } else { "sources" }
2✔
1029
    );
1030

1031
    // Returns [pos, neg] as (mean, median) pairs for one metric across entries.
1032
    fn metric_pair(
8✔
1033
        entries: &[(f32, f32, f32, f32)],
8✔
1034
        pos_idx: usize,
8✔
1035
        neg_idx: usize,
8✔
1036
    ) -> [(f32, f32); 2] {
8✔
1037
        let extract = |idx: usize| -> Vec<f32> {
16✔
1038
            entries
16✔
1039
                .iter()
16✔
1040
                .map(|e| match idx {
64✔
1041
                    0 => e.0,
16✔
1042
                    1 => e.1,
16✔
1043
                    2 => e.2,
16✔
1044
                    _ => e.3,
16✔
1045
                })
64✔
1046
                .collect()
16✔
1047
        };
16✔
1048
        let mut pos_vals = extract(pos_idx);
8✔
1049
        let mut neg_vals = extract(neg_idx);
8✔
1050
        [
8✔
1051
            metric_mean_median(&mut pos_vals),
8✔
1052
            metric_mean_median(&mut neg_vals),
8✔
1053
        ]
8✔
1054
    }
8✔
1055

1056
    fn print_metric_section(
4✔
1057
        label: &str,
4✔
1058
        sources: &[&String],
4✔
1059
        source_data: &HashMap<String, Vec<(f32, f32, f32, f32)>>,
4✔
1060
        pos_idx: usize,
4✔
1061
        neg_idx: usize,
4✔
1062
        total: usize,
4✔
1063
        n_sources: usize,
4✔
1064
    ) {
4✔
1065
        const SEP: usize = 83;
1066
        println!();
4✔
1067
        println!("[{}]", label);
4✔
1068
        println!(
4✔
1069
            "{:<24} {:>5}  {:<16} {:<16} {:<16}",
1070
            "source", "n", "positive", "negative", "gap (pos\u{2212}neg)"
1071
        );
1072
        println!(
4✔
1073
            "{:<24} {:>5}  {:<16} {:<16} {:<16}",
1074
            "", "", "mean / median", "mean / median", "mean / median"
1075
        );
1076
        println!("{}", "-".repeat(SEP));
4✔
1077
        for source in sources {
6✔
1078
            let entries = &source_data[*source];
6✔
1079
            let [pos, neg] = metric_pair(entries, pos_idx, neg_idx);
6✔
1080
            let gap_mean = pos.0 - neg.0;
6✔
1081
            let gap_med = pos.1 - neg.1;
6✔
1082
            println!(
6✔
1083
                "{:<24} {:>5}  {:.3} / {:.3}     {:.3} / {:.3}     {:+.3} / {:+.3}",
6✔
1084
                source,
6✔
1085
                entries.len(),
6✔
1086
                pos.0,
6✔
1087
                pos.1,
6✔
1088
                neg.0,
6✔
1089
                neg.1,
6✔
1090
                gap_mean,
6✔
1091
                gap_med,
6✔
1092
            );
6✔
1093
        }
6✔
1094
        if n_sources > 1 {
4✔
1095
            let all: Vec<(f32, f32, f32, f32)> = source_data.values().flatten().copied().collect();
2✔
1096
            let [pos, neg] = metric_pair(&all, pos_idx, neg_idx);
2✔
1097
            let gap_mean = pos.0 - neg.0;
2✔
1098
            let gap_med = pos.1 - neg.1;
2✔
1099
            println!("{}", "-".repeat(SEP));
2✔
1100
            println!(
2✔
1101
                "{:<24} {:>5}  {:.3} / {:.3}     {:.3} / {:.3}     {:+.3} / {:+.3}",
2✔
1102
                "ALL", total, pos.0, pos.1, neg.0, neg.1, gap_mean, gap_med,
2✔
1103
            );
2✔
1104
        }
2✔
1105
    }
4✔
1106

1107
    let mut sources: Vec<&String> = source_data.keys().collect();
2✔
1108
    sources.sort();
2✔
1109

1110
    print_metric_section(
2✔
1111
        "jaccard \u{2194} anchor",
2✔
1112
        &sources,
2✔
1113
        source_data,
2✔
1114
        0,
1115
        2,
1116
        total,
2✔
1117
        n_sources,
2✔
1118
    );
1119
    print_metric_section(
2✔
1120
        "byte-cos \u{2194} anchor",
2✔
1121
        &sources,
2✔
1122
        source_data,
2✔
1123
        1,
1124
        3,
1125
        total,
2✔
1126
        n_sources,
2✔
1127
    );
1128
    println!();
2✔
1129
}
2✔
1130

1131
/// Helper trait for describing a chunk's view in printed output.
trait ChunkDebug {
    /// Returns a one-line textual description of the chunk's view.
    fn view_name(&self) -> String;
}
1134

1135
impl ChunkDebug for RecordChunk {
1136
    fn view_name(&self) -> String {
26✔
1137
        match &self.view {
26✔
1138
            ChunkView::Window {
1139
                index,
24✔
1140
                span,
24✔
1141
                overlap,
24✔
1142
                start_ratio,
24✔
1143
            } => format!(
24✔
1144
                "window#index={} span={} overlap={} start_ratio={:.3} tokens={}",
1145
                index, span, overlap, start_ratio, self.tokens_estimate
1146
            ),
1147
            ChunkView::SummaryFallback { strategy, .. } => {
2✔
1148
                format!("summary:{} tokens={}", strategy, self.tokens_estimate)
2✔
1149
            }
1150
        }
1151
    }
26✔
1152
}
1153

1154
fn print_chunk_block(
26✔
1155
    title: &str,
26✔
1156
    chunk: &RecordChunk,
26✔
1157
    strategy: &ChunkingStrategy,
26✔
1158
    split_store: &impl SplitStore,
26✔
1159
    anchor_sim: Option<(f32, f32)>,
26✔
1160
) {
26✔
1161
    let chunk_weight = chunk_weight(strategy, chunk);
26✔
1162
    let split = split_store
26✔
1163
        .label_for(&chunk.record_id)
26✔
1164
        .map(|label| format!("{:?}", label))
26✔
1165
        .unwrap_or_else(|| "Unknown".to_string());
26✔
1166
    println!("--- {} ---", title);
26✔
1167
    println!("split        : {}", split);
26✔
1168
    println!("view         : {}", chunk.view_name());
26✔
1169
    println!("chunk_weight : {:.4}", chunk_weight);
26✔
1170
    println!("record_id    : {}", chunk.record_id);
26✔
1171
    println!("section_idx  : {}", chunk.section_idx);
26✔
1172
    println!("token_est    : {}", chunk.tokens_estimate);
26✔
1173
    if let Some((j, c)) = anchor_sim {
26✔
1174
        println!("jaccard(↔a)  : {:.4}  byte-cos(↔a): {:.4}", j, c);
2✔
1175
    }
24✔
1176
    println!("model_input (exact text sent to the model):");
26✔
1177
    println!(
26✔
1178
        "<<< BEGIN MODEL TEXT >>>\n{}\n<<< END MODEL TEXT >>>\n",
1179
        chunk.text
1180
    );
1181
}
26✔
1182

1183
fn print_source_summary<'a, I>(label: &str, ids: I)
7✔
1184
where
7✔
1185
    I: Iterator<Item = &'a str>,
7✔
1186
{
1187
    let mut counts: HashMap<SourceId, usize> = HashMap::new();
7✔
1188
    for id in ids {
15✔
1189
        let source = extract_source(id);
15✔
1190
        *counts.entry(source).or_insert(0) += 1;
15✔
1191
    }
15✔
1192
    if counts.is_empty() {
7✔
1193
        return;
1✔
1194
    }
6✔
1195
    let skew = source_skew(&counts);
6✔
1196
    let mut entries: Vec<(String, usize)> = counts.into_iter().collect();
6✔
1197
    entries.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
6✔
1198
    println!("--- {} by source ---", label);
6✔
1199
    if let Some(skew) = skew {
6✔
1200
        for entry in &skew.per_source {
6✔
1201
            println!(
6✔
1202
                "{}: count={} share={:.2}",
6✔
1203
                entry.source, entry.count, entry.share
6✔
1204
            );
6✔
1205
        }
6✔
1206
        println!(
6✔
1207
            "skew: sources={} total={} min={} max={} mean={:.2} ratio={:.2}",
1208
            skew.sources, skew.total, skew.min, skew.max, skew.mean, skew.ratio
1209
        );
1210
    } else {
1211
        for (source, count) in &entries {
×
UNCOV
1212
            println!("{source}: count={count}");
×
UNCOV
1213
        }
×
1214
    }
1215
}
7✔
1216

1217
fn print_recipe_context_by_source<'a, I>(label: &str, entries: I)
7✔
1218
where
7✔
1219
    I: Iterator<Item = (&'a str, &'a str)>,
7✔
1220
{
1221
    let mut counts: HashMap<SourceId, HashMap<String, usize>> = HashMap::new();
7✔
1222
    for (record_id, recipe) in entries {
15✔
1223
        let source = extract_source(record_id);
15✔
1224
        let entry = counts
15✔
1225
            .entry(source)
15✔
1226
            .or_default()
15✔
1227
            .entry(recipe.to_string())
15✔
1228
            .or_insert(0);
15✔
1229
        *entry += 1;
15✔
1230
    }
15✔
1231
    if counts.is_empty() {
7✔
1232
        return;
1✔
1233
    }
6✔
1234
    let mut sources: Vec<(SourceId, HashMap<String, usize>)> = counts.into_iter().collect();
6✔
1235
    sources.sort_by(|a, b| a.0.cmp(&b.0));
6✔
1236
    println!("--- {} ---", label);
6✔
1237
    for (source, recipes) in sources {
6✔
1238
        println!("{source}");
6✔
1239
        let mut entries: Vec<(String, usize)> = recipes.into_iter().collect();
6✔
1240
        entries.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
6✔
1241
        for (recipe, count) in entries {
7✔
1242
            println!("  - {recipe}={count}");
7✔
1243
        }
7✔
1244
    }
1245
}
7✔
1246

1247
fn extract_source(record_id: &str) -> SourceId {
40✔
1248
    record_id
40✔
1249
        .split_once("::")
40✔
1250
        .map(|(source, _)| source.to_string())
40✔
1251
        .unwrap_or_else(|| "unknown".to_string())
40✔
1252
}
40✔
1253

1254
#[cfg(test)]
1255
mod tests {
1256
    use super::*;
1257
    use crate::DataRecord;
1258
    use crate::DeterministicSplitStore;
1259
    use crate::data::{QualityScore, RecordSection, SectionRole};
1260
    use crate::source::{SourceCursor, SourceSnapshot};
1261
    use crate::utils::make_section;
1262
    use chrono::{TimeZone, Utc};
1263
    use tempfile::tempdir;
1264

1265
    /// Source builder stub that ignores its argument and returns no sources.
    fn empty_dyn_sources(_: &()) -> Vec<DynSource> {
        Vec::<DynSource>::new()
    }
2✔
1268

NEW
1269
    /// Root-resolution stub that ignores its input and always succeeds.
    fn ok_unit_roots(_: Vec<String>) -> Result<(), Box<dyn Error>> {
        let outcome: Result<(), Box<dyn Error>> = Ok(());
        outcome
    }
×
1272

1273
    /// Root-resolution stub that always fails with "root-resolution-error".
    fn error_unit_roots(_: Vec<String>) -> Result<(), Box<dyn Error>> {
        let failure: Box<dyn Error> = "root-resolution-error".into();
        Err(failure)
    }
1✔
1276

1277
    /// Minimal in-memory `DataSource` test double for example app tests.
1278
    struct TestSource {
1279
        id: String,
1280
        count: Option<u128>,
1281
        recipes: Vec<TripletRecipe>,
1282
    }
1283

1284
    impl DataSource for TestSource {
1285
        fn id(&self) -> &str {
131✔
1286
            &self.id
131✔
1287
        }
131✔
1288

1289
        fn refresh(
30✔
1290
            &self,
30✔
1291
            _config: &SamplerConfig,
30✔
1292
            _cursor: Option<&SourceCursor>,
30✔
1293
            _limit: Option<usize>,
30✔
1294
        ) -> Result<SourceSnapshot, SamplerError> {
30✔
1295
            Ok(SourceSnapshot {
30✔
1296
                records: Vec::new(),
30✔
1297
                cursor: SourceCursor {
30✔
1298
                    last_seen: Utc::now(),
30✔
1299
                    revision: 0,
30✔
1300
                },
30✔
1301
            })
30✔
1302
        }
30✔
1303

1304
        fn reported_record_count(&self, _config: &SamplerConfig) -> Result<u128, SamplerError> {
2✔
1305
            self.count.ok_or_else(|| SamplerError::SourceInconsistent {
2✔
1306
                source_id: self.id.clone(),
1✔
1307
                details: "test source has no configured exact count".to_string(),
1✔
1308
            })
1✔
1309
        }
2✔
1310

1311
        fn default_triplet_recipes(&self) -> Vec<TripletRecipe> {
11✔
1312
            self.recipes.clone()
11✔
1313
        }
11✔
1314
    }
1315

1316
    struct ConfigRequiredSource {
1317
        id: String,
1318
        expected_seed: u64,
1319
    }
1320

1321
    impl DataSource for ConfigRequiredSource {
1322
        fn id(&self) -> &str {
1✔
1323
            &self.id
1✔
1324
        }
1✔
1325

1326
        fn refresh(
1✔
1327
            &self,
1✔
1328
            _config: &SamplerConfig,
1✔
1329
            _cursor: Option<&SourceCursor>,
1✔
1330
            _limit: Option<usize>,
1✔
1331
        ) -> Result<SourceSnapshot, SamplerError> {
1✔
1332
            Ok(SourceSnapshot {
1✔
1333
                records: Vec::new(),
1✔
1334
                cursor: SourceCursor {
1✔
1335
                    last_seen: Utc::now(),
1✔
1336
                    revision: 0,
1✔
1337
                },
1✔
1338
            })
1✔
1339
        }
1✔
1340

1341
        fn reported_record_count(&self, config: &SamplerConfig) -> Result<u128, SamplerError> {
2✔
1342
            if config.seed == self.expected_seed {
2✔
1343
                Ok(1)
1✔
1344
            } else {
1345
                Err(SamplerError::SourceInconsistent {
1✔
1346
                    source_id: self.id.clone(),
1✔
1347
                    details: format!(
1✔
1348
                        "expected sampler seed {} but got {}",
1✔
1349
                        self.expected_seed, config.seed
1✔
1350
                    ),
1✔
1351
                })
1✔
1352
            }
1353
        }
2✔
1354

1355
        fn default_triplet_recipes(&self) -> Vec<TripletRecipe> {
2✔
1356
            Vec::new()
2✔
1357
        }
2✔
1358
    }
1359

1360
    struct FixtureSource {
1361
        id: String,
1362
        records: Vec<DataRecord>,
1363
        recipes: Vec<TripletRecipe>,
1364
    }
1365

1366
    impl DataSource for FixtureSource {
1367
        fn id(&self) -> &str {
37✔
1368
            &self.id
37✔
1369
        }
37✔
1370

1371
        fn refresh(
9✔
1372
            &self,
9✔
1373
            _config: &SamplerConfig,
9✔
1374
            _cursor: Option<&SourceCursor>,
9✔
1375
            _limit: Option<usize>,
9✔
1376
        ) -> Result<SourceSnapshot, SamplerError> {
9✔
1377
            Ok(SourceSnapshot {
9✔
1378
                records: self.records.clone(),
9✔
1379
                cursor: SourceCursor {
9✔
1380
                    last_seen: Utc::now(),
9✔
1381
                    revision: 0,
9✔
1382
                },
9✔
1383
            })
9✔
1384
        }
9✔
1385

1386
        fn reported_record_count(&self, _config: &SamplerConfig) -> Result<u128, SamplerError> {
1✔
1387
            Ok(self.records.len() as u128)
1✔
1388
        }
1✔
1389

1390
        fn default_triplet_recipes(&self) -> Vec<TripletRecipe> {
4✔
1391
            self.recipes.clone()
4✔
1392
        }
4✔
1393
    }
1394

1395
    struct IngestionConfigSource {
1396
        expected_ingestion_max_records: usize,
1397
        records: Vec<DataRecord>,
1398
    }
1399

1400
    impl DataSource for IngestionConfigSource {
1401
        fn id(&self) -> &str {
7✔
1402
            "ingestion_config_source"
7✔
1403
        }
7✔
1404

1405
        fn refresh(
3✔
1406
            &self,
3✔
1407
            config: &SamplerConfig,
3✔
1408
            _cursor: Option<&SourceCursor>,
3✔
1409
            _limit: Option<usize>,
3✔
1410
        ) -> Result<SourceSnapshot, SamplerError> {
3✔
1411
            if config.ingestion_max_records != self.expected_ingestion_max_records {
3✔
1412
                return Err(SamplerError::SourceInconsistent {
1✔
1413
                    source_id: self.id().to_string(),
1✔
1414
                    details: format!(
1✔
1415
                        "expected ingestion_max_records {} but got {}",
1✔
1416
                        self.expected_ingestion_max_records, config.ingestion_max_records
1✔
1417
                    ),
1✔
1418
                });
1✔
1419
            }
2✔
1420
            Ok(SourceSnapshot {
2✔
1421
                records: self.records.clone(),
2✔
1422
                cursor: SourceCursor {
2✔
1423
                    last_seen: Utc::now(),
2✔
1424
                    revision: 0,
2✔
1425
                },
2✔
1426
            })
2✔
1427
        }
3✔
1428

1429
        fn reported_record_count(&self, _config: &SamplerConfig) -> Result<u128, SamplerError> {
1✔
1430
            Ok(self.records.len() as u128)
1✔
1431
        }
1✔
1432

1433
        fn default_triplet_recipes(&self) -> Vec<TripletRecipe> {
2✔
1434
            vec![default_recipe("ingestion_config_recipe")]
2✔
1435
        }
2✔
1436
    }
1437

1438
    fn fixture_record(
21✔
1439
        source: &str,
21✔
1440
        id_suffix: &str,
21✔
1441
        day: u32,
21✔
1442
        title: &str,
21✔
1443
        body: &str,
21✔
1444
    ) -> DataRecord {
21✔
1445
        let now = Utc.with_ymd_and_hms(2025, 1, day, 12, 0, 0).unwrap();
21✔
1446
        DataRecord {
21✔
1447
            id: format!("{source}::{id_suffix}"),
21✔
1448
            source: source.to_string(),
21✔
1449
            created_at: now,
21✔
1450
            updated_at: now,
21✔
1451
            quality: QualityScore { trust: 1.0 },
21✔
1452
            taxonomy: Vec::new(),
21✔
1453
            sections: vec![
21✔
1454
                make_section(SectionRole::Anchor, Some("title"), title),
21✔
1455
                make_section(SectionRole::Context, Some("body"), body),
21✔
1456
            ],
21✔
1457
            meta_prefix: None,
21✔
1458
        }
21✔
1459
    }
21✔
1460

1461
    fn default_recipe(name: &str) -> TripletRecipe {
17✔
1462
        TripletRecipe {
17✔
1463
            name: name.to_string().into(),
17✔
1464
            anchor: crate::config::Selector::Role(SectionRole::Anchor),
17✔
1465
            positive_selector: crate::config::Selector::Role(SectionRole::Context),
17✔
1466
            negative_selector: crate::config::Selector::Role(SectionRole::Context),
17✔
1467
            negative_strategy: crate::config::NegativeStrategy::WrongArticle,
17✔
1468
            weight: 1.0,
17✔
1469
            instruction: None,
17✔
1470
            allow_same_anchor_positive: false,
17✔
1471
        }
17✔
1472
    }
17✔
1473

1474
    #[test]
1475
    fn parse_helpers_validate_inputs() {
1✔
1476
        assert_eq!(parse_batch_size("2").unwrap(), 2);
1✔
1477
        assert!(parse_batch_size("0").is_err());
1✔
1478
        assert!(parse_batch_size("abc").is_err());
1✔
1479
        assert_eq!(parse_ingestion_max_records("16").unwrap(), 16);
1✔
1480
        assert!(parse_ingestion_max_records("0").is_err());
1✔
1481
        assert!(parse_batch_count("0").is_err());
1✔
1482

1483
        let split = parse_split_ratios_arg("0.8,0.1,0.1").unwrap();
1✔
1484
        assert!((split.train - 0.8).abs() < 1e-6);
1✔
1485
        assert!(parse_split_ratios_arg("0.8,0.1").is_err());
1✔
1486
        assert!(parse_split_ratios_arg("1.0,0.0,0.1").is_err());
1✔
1487
        assert!(parse_split_ratios_arg("-0.1,0.6,0.5").is_err());
1✔
1488
    }
1✔
1489

1490
    #[test]
1491
    fn fixture_and_ingestion_sources_trait_methods_cover_paths() {
1✔
1492
        let records = vec![fixture_record("fixture_source", "r1", 1, "Title", "Body")];
1✔
1493
        let recipes = vec![default_recipe("fixture_recipe")];
1✔
1494
        let fixture = FixtureSource {
1✔
1495
            id: "fixture_source".into(),
1✔
1496
            records: records.clone(),
1✔
1497
            recipes: recipes.clone(),
1✔
1498
        };
1✔
1499

1500
        let snapshot = fixture
1✔
1501
            .refresh(&SamplerConfig::default(), None, None)
1✔
1502
            .expect("fixture refresh should succeed");
1✔
1503
        assert_eq!(snapshot.records.len(), 1);
1✔
1504
        assert_eq!(
1✔
1505
            fixture
1✔
1506
                .reported_record_count(&SamplerConfig::default())
1✔
1507
                .unwrap(),
1✔
1508
            1
1509
        );
1510
        assert_eq!(fixture.default_triplet_recipes().len(), 1);
1✔
1511

1512
        let source = IngestionConfigSource {
1✔
1513
            expected_ingestion_max_records: 7,
1✔
1514
            records,
1✔
1515
        };
1✔
1516
        let ok_cfg = SamplerConfig {
1✔
1517
            ingestion_max_records: 7,
1✔
1518
            ..SamplerConfig::default()
1✔
1519
        };
1✔
1520
        assert!(source.refresh(&ok_cfg, None, None).is_ok());
1✔
1521
        assert_eq!(source.reported_record_count(&ok_cfg).unwrap(), 1);
1✔
1522
        assert_eq!(source.default_triplet_recipes().len(), 1);
1✔
1523

1524
        let bad_cfg = SamplerConfig {
1✔
1525
            ingestion_max_records: 8,
1✔
1526
            ..SamplerConfig::default()
1✔
1527
        };
1✔
1528
        let err = source.refresh(&bad_cfg, None, None).unwrap_err();
1✔
1529
        assert!(matches!(err, SamplerError::SourceInconsistent { .. }));
1✔
1530
    }
1✔
1531

1532
    #[test]
1533
    fn suggested_balancing_weight_is_longest_normalized_and_bounded() {
1✔
1534
        assert!((suggested_balancing_weight(100, 100) - 1.0).abs() < 1e-6);
1✔
1535
        assert!((suggested_balancing_weight(400, 100) - 0.25).abs() < 1e-6);
1✔
1536
        assert!((suggested_balancing_weight(400, 400) - 1.0).abs() < 1e-6);
1✔
1537
        assert_eq!(suggested_balancing_weight(0, 100), 0.0);
1✔
1538
        assert_eq!(suggested_balancing_weight(100, 0), 0.0);
1✔
1539
    }
1✔
1540

1541
    #[test]
1542
    fn suggested_oversampling_weight_is_inverse_in_unit_interval() {
1✔
1543
        assert!((suggested_oversampling_weight(100, 100) - 1.0).abs() < 1e-6);
1✔
1544
        assert!((suggested_oversampling_weight(100, 400) - 0.25).abs() < 1e-6);
1✔
1545
        assert!((suggested_oversampling_weight(100, 1000) - 0.1).abs() < 1e-6);
1✔
1546
        assert_eq!(suggested_oversampling_weight(0, 100), 0.0);
1✔
1547
        assert_eq!(suggested_oversampling_weight(100, 0), 0.0);
1✔
1548
    }
1✔
1549

1550
    /// Asserts that `--help` parses to `Ok(None)` and that an unknown flag is an error.
    #[test]
    fn parse_cli_handles_help_and_invalid_args() {
        let help_outcome = parse_cli::<EstimateCapacityCli, _>(["estimate_capacity", "--help"]);
        assert!(help_outcome.unwrap().is_none());

        let rejected = parse_cli::<EstimateCapacityCli, _>(["estimate_capacity", "--unknown"]);
        assert!(rejected.is_err());
    }
1✔
1558

1559
    /// Runs `run_estimate_capacity` with no CLI arguments and one source reporting 12 records.
    #[test]
    fn run_estimate_capacity_succeeds_with_reported_counts() {
        let outcome = run_estimate_capacity(
            std::iter::empty::<String>(),
            |roots| {
                // With an empty argument list the resolver is expected to see no roots.
                assert!(roots.is_empty());
                Ok(())
            },
            |_| {
                let counted = TestSource {
                    id: "source_a".into(),
                    count: Some(12),
                    recipes: vec![default_recipe("r1")],
                };
                vec![Box::new(counted) as DynSource]
            },
        );

        assert!(outcome.is_ok());
    }
1✔
1578

1579
    /// Asserts that a source whose `count` is `None` makes `run_estimate_capacity` fail
    /// with a message mentioning the missing exact record count.
    #[test]
    fn run_estimate_capacity_errors_when_source_count_missing() {
        let outcome = run_estimate_capacity(
            std::iter::empty::<String>(),
            |_| Ok(()),
            |_| {
                let uncounted = TestSource {
                    id: "source_missing".into(),
                    count: None,
                    recipes: vec![default_recipe("r1")],
                };
                vec![Box::new(uncounted) as DynSource]
            },
        );

        let message = outcome.unwrap_err().to_string();
        assert!(message.contains("failed to report exact record count"));
    }
1✔
1596

1597
    /// Asserts that an error returned by the root resolver surfaces in the final error text.
    #[test]
    fn run_estimate_capacity_propagates_root_resolution_error() {
        let outcome = run_estimate_capacity(
            std::iter::empty::<String>(),
            |_| Err("root resolution failed".into()),
            empty_dyn_sources,
        );

        let failure = outcome.unwrap_err();
        assert!(failure.to_string().contains("root resolution failed"));
    }
1✔
1608

1609
    /// Asserts that `run_estimate_capacity` succeeds when the source builder is `empty_dyn_sources`.
    #[test]
    fn run_estimate_capacity_allows_empty_source_list() {
        let no_args = std::iter::empty::<String>();
        let outcome = run_estimate_capacity(no_args, |_| Ok(()), empty_dyn_sources);

        assert!(outcome.is_ok());
    }
1✔
1616

1617
    /// Runs `run_estimate_capacity` with a `ConfigRequiredSource` that expects seed 99.
    ///
    /// NOTE(review): presumably the run only succeeds if the config handed to the source
    /// carries that seed; the test itself asserts overall success only.
    #[test]
    fn run_estimate_capacity_configures_sources_centrally_before_counting() {
        let outcome = run_estimate_capacity(
            std::iter::empty::<String>(),
            |_| Ok(()),
            |_| {
                let seeded = ConfigRequiredSource {
                    id: "requires_config".into(),
                    expected_seed: 99,
                };
                vec![Box::new(seeded) as DynSource]
            },
        );

        assert!(outcome.is_ok());
    }
1✔
1632

1633
    /// Exercises `ConfigRequiredSource` directly: refresh, seed-mismatch reporting, and recipes.
    #[test]
    fn config_required_source_refresh_and_seed_mismatch_are_exercised() {
        let source = ConfigRequiredSource {
            id: "cfg-source".to_string(),
            expected_seed: 42,
        };

        // Refreshing with the default config yields a snapshot without records.
        let default_cfg = SamplerConfig::default();
        let snapshot = source.refresh(&default_cfg, None, None).unwrap();
        assert!(snapshot.records.is_empty());

        // Seed 7 differs from the expected 42 and must be reported as an inconsistency.
        let wrong_seed_cfg = SamplerConfig {
            seed: 7,
            ..SamplerConfig::default()
        };
        let count_outcome = source.reported_record_count(&wrong_seed_cfg);
        assert!(matches!(
            count_outcome,
            Err(SamplerError::SourceInconsistent { .. })
        ));

        assert!(source.default_triplet_recipes().is_empty());
    }
1✔
1656

1657
    /// Runs the demo in pair, text, and default modes against a source that yields one record.
    #[test]
    fn run_multi_source_demo_exhausted_paths_return_ok() {
        /// Source whose every refresh returns the same single two-section record.
        struct OneRecordSource;

        impl DataSource for OneRecordSource {
            fn id(&self) -> &str {
                "one_record"
            }

            fn refresh(
                &self,
                _config: &SamplerConfig,
                _cursor: Option<&SourceCursor>,
                _limit: Option<usize>,
            ) -> Result<SourceSnapshot, SamplerError> {
                // One timestamp is shared by the record and the cursor.
                let timestamp = Utc::now();
                let anchor_section = RecordSection {
                    role: SectionRole::Anchor,
                    heading: Some(String::from("title")),
                    text: String::from("anchor"),
                    sentences: vec![String::from("anchor")],
                };
                let context_section = RecordSection {
                    role: SectionRole::Context,
                    heading: Some(String::from("body")),
                    text: String::from("context"),
                    sentences: vec![String::from("context")],
                };
                let record = DataRecord {
                    id: String::from("one_record::r1"),
                    source: String::from("one_record"),
                    created_at: timestamp,
                    updated_at: timestamp,
                    quality: QualityScore { trust: 1.0 },
                    taxonomy: Vec::new(),
                    sections: vec![anchor_section, context_section],
                    meta_prefix: None,
                };
                let cursor = SourceCursor {
                    last_seen: timestamp,
                    revision: 0,
                };
                Ok(SourceSnapshot {
                    records: vec![record],
                    cursor,
                })
            }

            fn reported_record_count(&self, _config: &SamplerConfig) -> Result<u128, SamplerError> {
                Ok(1)
            }

            fn default_triplet_recipes(&self) -> Vec<TripletRecipe> {
                vec![default_recipe("single_record_recipe")]
            }
        }

        // Call the trait methods directly so they are exercised regardless of the demo path.
        let probe = OneRecordSource;
        let reported = probe.reported_record_count(&SamplerConfig::default());
        assert_eq!(reported.unwrap(), 1);
        assert_eq!(probe.default_triplet_recipes().len(), 1);

        // `None` stands for the default mode, which adds no extra flag.
        for mode in [Some("--pair-batch"), Some("--text-recipes"), None] {
            let dir = tempdir().unwrap();
            let split_store_path = dir.path().join("split_store.bin");
            let mut args = vec![
                String::from("--split-store-path"),
                split_store_path.to_string_lossy().into_owned(),
            ];
            args.extend(mode.map(String::from));

            let outcome = run_multi_source_demo(
                args.into_iter(),
                |_| Ok(()),
                |_| vec![Box::new(OneRecordSource) as DynSource],
            );
            assert!(outcome.is_ok());
        }
    }
1✔
1740

1741
    /// Covers `--help`, zero-valued numeric options, and the bare invocation of the demo CLI.
    #[test]
    fn parse_multi_source_cli_handles_help_and_batch_size_validation() {
        let help_outcome = parse_cli::<MultiSourceDemoCli, _>(["multi_source_demo", "--help"]);
        assert!(help_outcome.unwrap().is_none());

        // A value of zero must be rejected for both numeric options.
        for flag in ["--batch-size", "--ingestion-max-records"] {
            let rejected = parse_cli::<MultiSourceDemoCli, _>(["multi_source_demo", flag, "0"]);
            assert!(rejected.is_err());
        }

        let bare = parse_cli::<MultiSourceDemoCli, _>(["multi_source_demo"]);
        assert!(bare.is_ok());
    }
1✔
1759

1760
    /// Passes `--ingestion-max-records 7` and uses an `IngestionConfigSource` expecting that value.
    #[test]
    fn run_multi_source_demo_passes_ingestion_max_records_to_sources() {
        let dir = tempdir().unwrap();
        let split_store_path = dir.path().join("ingestion_config_split_store.bin");
        let max_records = 7;

        let args = [
            String::from("--pair-batch"),
            String::from("--ingestion-max-records"),
            max_records.to_string(),
            String::from("--split-store-path"),
            split_store_path.to_string_lossy().into_owned(),
        ];

        let outcome = run_multi_source_demo(
            args.into_iter(),
            |_| Ok(()),
            |_| {
                let source = IngestionConfigSource {
                    expected_ingestion_max_records: max_records,
                    // Eight fixture records, one per day 1..=8.
                    records: (1..=8)
                        .map(|day| {
                            let record_id = format!("r{day}");
                            let headline = format!("Config headline {day}");
                            let body = format!("Config body {day}");
                            fixture_record(
                                "ingestion_config_source",
                                &record_id,
                                day,
                                &headline,
                                &body,
                            )
                        })
                        .collect(),
                };
                vec![Box::new(source) as DynSource]
            },
        );

        assert!(outcome.is_ok());
    }
1✔
1796

1797
    /// Asserts that `--version` parses to `Ok(None)` for a CLI that declares a version.
    #[test]
    fn parse_cli_handles_display_version_path() {
        /// Minimal CLI that exists only to declare a version string.
        #[derive(Debug, Parser)]
        #[command(name = "version_test", version = "1.0.0")]
        struct VersionCli {}

        let outcome = parse_cli::<VersionCli, _>(["version_test", "--version"]);
        assert!(outcome.unwrap().is_none());
    }
1✔
1806

1807
    /// Asserts that `--list-text-recipes` succeeds for a source that exposes one recipe.
    #[test]
    fn run_multi_source_demo_list_text_recipes_path_succeeds() {
        let dir = tempdir().unwrap();
        let split_store_path = dir.path().join("recipes_split_store.bin");
        // The vector is consumed whole, so it needs neither `mut` nor `drain(..)`;
        // `into_iter()` matches how the sibling tests hand over their arguments.
        let args = vec![
            "--list-text-recipes".to_string(),
            "--split-store-path".to_string(),
            split_store_path.to_string_lossy().to_string(),
        ];
        let result = run_multi_source_demo(
            args.into_iter(),
            |_| Ok(()),
            |_| {
                vec![Box::new(TestSource {
                    id: "source_for_recipes".into(),
                    count: Some(10),
                    recipes: vec![default_recipe("recipe_a")],
                }) as DynSource]
            },
        );

        assert!(result.is_ok());
    }
1✔
1830

1831
    /// Runs `--list-text-recipes` with an explicit `--split-store-path` and a recipe-less source.
    #[test]
    fn run_multi_source_demo_list_text_recipes_uses_explicit_split_store_path() {
        let dir = tempdir().unwrap();
        let split_store_path = dir.path().join("custom_split_store.bin");
        let store_arg = split_store_path.to_string_lossy().into_owned();
        let args = vec![
            String::from("--list-text-recipes"),
            String::from("--split-store-path"),
            store_arg,
        ];

        let outcome = run_multi_source_demo(
            args.into_iter(),
            |_| Ok(()),
            |_| {
                let source = TestSource {
                    id: "source_without_text_recipes".into(),
                    count: Some(1),
                    recipes: Vec::new(),
                };
                vec![Box::new(source) as DynSource]
            },
        );

        assert!(outcome.is_ok());
    }
1✔
1855

1856
    /// Runs pair, text, and default modes on the validation split with a source reporting 0 records.
    #[test]
    fn run_multi_source_demo_sampling_modes_handle_empty_sources() {
        // `None` stands for the default mode, which adds no leading flag.
        for mode in [Some("--pair-batch"), Some("--text-recipes"), None] {
            let dir = tempdir().unwrap();
            let split_store_path = dir.path().join("empty_sources_split_store.bin");

            // The mode flag (if any) comes first, followed by the shared options.
            let mut args: Vec<String> = mode.into_iter().map(String::from).collect();
            args.extend([
                String::from("--split-store-path"),
                split_store_path.to_string_lossy().into_owned(),
                String::from("--split"),
                String::from("validation"),
            ]);

            let outcome = run_multi_source_demo(
                args.into_iter(),
                |_| Ok(()),
                |_| {
                    let source = TestSource {
                        id: "source_empty".into(),
                        count: Some(0),
                        recipes: vec![default_recipe("recipe_empty")],
                    };
                    vec![Box::new(source) as DynSource]
                },
            );

            assert!(outcome.is_ok());
        }
    }
1✔
1886

1887
    /// Asserts that a failing root resolver's message surfaces in the demo's error.
    #[test]
    fn run_multi_source_demo_propagates_root_resolution_error() {
        let dir = tempdir().unwrap();
        let split_store_path = dir.path().join("root_resolution_error_store.bin");
        let args = [
            String::from("--split-store-path"),
            split_store_path.to_string_lossy().into_owned(),
        ];

        let outcome = run_multi_source_demo(
            args.into_iter(),
            |_| Err("demo root resolution failed".into()),
            empty_dyn_sources,
        );

        let failure = outcome.unwrap_err();
        assert!(failure.to_string().contains("demo root resolution failed"));
    }
1✔
1904

1905
    /// Asserts that `--list-text-recipes` succeeds when the source builder is `empty_dyn_sources`.
    #[test]
    fn run_multi_source_demo_list_text_recipes_allows_empty_sources() {
        let dir = tempdir().unwrap();
        let split_store_path = dir.path().join("empty_source_list_recipes.bin");
        let args = [
            String::from("--list-text-recipes"),
            String::from("--split-store-path"),
            split_store_path.to_string_lossy().into_owned(),
        ];

        let outcome = run_multi_source_demo(args.into_iter(), |_| Ok(()), empty_dyn_sources);

        assert!(outcome.is_ok());
    }
1✔
1922

1923
    /// Feeds hand-built batches through the print helpers and checks `extract_source`.
    ///
    /// The print calls are only executed, not inspected; the sole value assertions
    /// are the two `extract_source` checks at the end.
    #[test]
    fn print_helpers_and_extract_source_cover_paths() {
        let ratios = SplitRatios::default();
        let split_store = DeterministicSplitStore::new(ratios, 42).unwrap();
        let chunking = ChunkingStrategy::default();

        // Three chunks: two `Window` views and one `SummaryFallback` view.
        let anchor_chunk = RecordChunk {
            record_id: String::from("source_a::rec1"),
            section_idx: 0,
            view: ChunkView::Window {
                index: 1,
                overlap: 2,
                span: 12,
                start_ratio: 0.25,
            },
            text: String::from("anchor text"),
            tokens_estimate: 8,
            quality: crate::data::QualityScore { trust: 0.9 },
        };
        let positive_chunk = RecordChunk {
            record_id: String::from("source_a::rec2"),
            section_idx: 1,
            view: ChunkView::SummaryFallback {
                strategy: String::from("summary"),
                weight: 0.7,
            },
            text: String::from("positive text"),
            tokens_estimate: 6,
            quality: crate::data::QualityScore { trust: 0.8 },
        };
        let negative_chunk = RecordChunk {
            record_id: String::from("source_b::rec3"),
            section_idx: 2,
            view: ChunkView::Window {
                index: 0,
                overlap: 0,
                span: 16,
                start_ratio: 0.0,
            },
            text: String::from("negative text"),
            tokens_estimate: 7,
            quality: crate::data::QualityScore { trust: 0.5 },
        };

        let triplet = crate::SampleTriplet {
            recipe: String::from("triplet_recipe"),
            anchor: anchor_chunk.clone(),
            positive: positive_chunk.clone(),
            negative: negative_chunk.clone(),
            weight: 1.0,
            instruction: Some(String::from("triplet instruction")),
        };
        let triplets = TripletBatch {
            triplets: vec![triplet],
        };
        print_triplet_batch(&chunking, &triplets, &split_store);

        let pair = crate::SamplePair {
            recipe: String::from("pair_recipe"),
            anchor: anchor_chunk.clone(),
            positive: positive_chunk.clone(),
            weight: 1.0,
            instruction: None,
            label: crate::PairLabel::Positive,
            reason: Some(String::from("same topic")),
        };
        let pairs = SampleBatch { pairs: vec![pair] };
        print_pair_batch(&chunking, &pairs, &split_store);

        let text_sample = crate::TextSample {
            recipe: String::from("text_recipe"),
            chunk: negative_chunk,
            weight: 0.8,
            instruction: Some(String::from("text instruction")),
        };
        let texts = TextBatch {
            samples: vec![text_sample],
        };
        print_text_batch(&chunking, &texts, &split_store);

        let text_recipes = vec![TextRecipe {
            name: "recipe_name".into(),
            selector: crate::config::Selector::Role(SectionRole::Context),
            instruction: Some("instruction".into()),
            weight: 1.0,
        }];
        print_text_recipes(&text_recipes);

        // An id with the `::` delimiter yields its prefix; one without yields "unknown".
        assert_eq!(extract_source("source_a::record"), "source_a");
        assert_eq!(extract_source("record-without-delimiter"), "unknown");
    }
1✔
2013

2014
    /// Asserts that each `SplitArg` variant converts to the same-named `SplitLabel` variant.
    ///
    /// NOTE(review): despite the name, this body exercises no version parsing.
    #[test]
    fn split_arg_conversion_and_version_parse_paths_are_covered() {
        let train = SplitLabel::from(SplitArg::Train);
        assert!(matches!(train, SplitLabel::Train));

        let validation = SplitLabel::from(SplitArg::Validation);
        assert!(matches!(validation, SplitLabel::Validation));

        let test = SplitLabel::from(SplitArg::Test);
        assert!(matches!(test, SplitLabel::Test));
    }
1✔
2026

2027
    /// Asserts that an unparsable field in each position produces a field-specific error message.
    #[test]
    fn parse_split_ratios_reports_per_field_parse_errors() {
        // Each case corrupts exactly one of the three comma-separated fields.
        let cases = [
            ("x,0.1,0.9", "invalid train ratio"),
            ("0.1,y,0.8", "invalid validation ratio"),
            ("0.1,0.2,z", "invalid test ratio"),
        ];

        for (input, expected_fragment) in cases {
            let message = parse_split_ratios_arg(input).unwrap_err();
            assert!(message.contains(expected_fragment));
        }
    }
1✔
2045

2046
    /// Runs pair, text, and default modes with a source that reports one record and no recipes.
    #[test]
    fn run_multi_source_demo_exhausted_paths_are_handled() {
        // `None` stands for the default mode, which adds no leading flag.
        for mode in [Some("--pair-batch"), Some("--text-recipes"), None] {
            let dir = tempdir().unwrap();
            let split_store_path = dir.path().join("exhausted_split_store.bin");

            // The mode flag (if any) comes first, followed by the split-store option.
            let mut args: Vec<String> = mode.into_iter().map(String::from).collect();
            args.extend([
                String::from("--split-store-path"),
                split_store_path.to_string_lossy().into_owned(),
            ]);

            let outcome = run_multi_source_demo(
                args.into_iter(),
                |_| Ok(()),
                |_| {
                    let source = TestSource {
                        id: "source_without_recipes".into(),
                        count: Some(1),
                        recipes: Vec::new(),
                    };
                    vec![Box::new(source) as DynSource]
                },
            );

            assert!(outcome.is_ok());
        }
    }
1✔
2074

2075
    /// Seeds the split-store path with stale bytes, runs with `--reset`, and checks that a
    /// non-empty file exists at that path afterwards.
    #[test]
    fn run_multi_source_demo_reset_recreates_split_store_and_samples() {
        let dir = tempdir().unwrap();
        let split_store_path = dir.path().join("reset_split_store.bin");
        std::fs::write(&split_store_path, b"stale-data").unwrap();

        let args = [
            String::from("--reset"),
            String::from("--pair-batch"),
            String::from("--split-store-path"),
            split_store_path.to_string_lossy().into_owned(),
        ];

        let outcome = run_multi_source_demo(
            args.into_iter(),
            |_| Ok(()),
            |_| {
                let recipes = vec![default_recipe("fixture_recipe")];
                // Eight fixture records, one per day 1..=8.
                let records: Vec<DataRecord> = (1..=8)
                    .map(|day| {
                        let record_id = format!("r{day}");
                        let headline = format!("Fixture headline {day}");
                        let body = format!("Fixture body content for day {day}.");
                        fixture_record("fixture_source", &record_id, day, &headline, &body)
                    })
                    .collect();
                let source = FixtureSource {
                    id: "fixture_source".into(),
                    records,
                    recipes,
                };
                vec![Box::new(source) as DynSource]
            },
        );

        assert!(outcome.is_ok());
        assert!(split_store_path.exists());
        let store_len = std::fs::metadata(&split_store_path).unwrap().len();
        assert!(store_len > 0);
    }
1✔
2117

2118
    /// Runs the demo with `--batches 2` over a three-record fixture source and checks that
    /// the split store file exists afterwards.
    #[test]
    fn run_multi_source_demo_batches_mode_executes_multiple_batches() {
        let dir = tempdir().unwrap();
        let split_store_path = dir.path().join("batches_split_store.bin");
        let args = [
            String::from("--batches"),
            String::from("2"),
            String::from("--split-store-path"),
            split_store_path.to_string_lossy().into_owned(),
        ];

        let outcome = run_multi_source_demo(
            args.into_iter(),
            |_| Ok(()),
            |_| {
                let recipes = vec![default_recipe("batch_recipe")];
                // (record id, day, headline, body) for each fixture record.
                let records: Vec<_> = [
                    (
                        "r1",
                        3,
                        "Inflation cools in latest report",
                        "Core inflation moderated compared with prior quarter.",
                    ),
                    (
                        "r2",
                        4,
                        "Labor market remains resilient",
                        "Job openings remain elevated despite slower growth.",
                    ),
                    (
                        "r3",
                        5,
                        "Manufacturing sentiment stabilizes",
                        "Survey data suggests output expectations are improving.",
                    ),
                ]
                .into_iter()
                .map(|(record_id, day, headline, body)| {
                    fixture_record("batch_source", record_id, day, headline, body)
                })
                .collect();

                let source = FixtureSource {
                    id: "batch_source".into(),
                    records,
                    recipes,
                };
                vec![Box::new(source) as DynSource]
            },
        );

        assert!(outcome.is_ok());
        assert!(split_store_path.exists());
    }
1✔
2167

2168
    /// Asserts that the managed path ends with the store filename and that its parent
    /// directory ends with the demo cache group.
    #[test]
    fn managed_demo_split_store_path_resolves_under_cache_group() {
        let store_path = managed_demo_split_store_path().unwrap();
        assert!(store_path.ends_with(MULTI_SOURCE_DEMO_STORE_FILENAME));

        let group_dir = store_path
            .parent()
            .expect("managed split-store path should have a parent");
        assert!(group_dir.ends_with(MULTI_SOURCE_DEMO_GROUP));
    }
1✔
2177

2178
    /// Contrasts the non-help path (the root resolver's error surfaces) with `--help`
    /// (the run returns `Ok`).
    #[test]
    fn run_multi_source_demo_help_returns_ok_without_work() {
        // Without `--help`, the failing resolver's error is expected to surface.
        let without_help = run_multi_source_demo(
            std::iter::empty::<String>(),
            error_unit_roots,
            empty_dyn_sources,
        );
        let message = without_help
            .expect_err("non-help path should attempt to resolve roots")
            .to_string();
        assert!(message.contains("root-resolution-error"));

        let with_help = run_multi_source_demo(
            [String::from("--help")].into_iter(),
            ok_unit_roots,
            empty_dyn_sources,
        );
        assert!(with_help.is_ok());
    }
1✔
2200

2201
    /// Asserts that `run_estimate_capacity` returns `Ok` when invoked with `--help`.
    #[test]
    fn run_estimate_capacity_help_returns_ok_without_work() {
        let help_args = [String::from("--help")];
        let outcome = run_estimate_capacity(help_args.into_iter(), ok_unit_roots, empty_dyn_sources);

        assert!(outcome.is_ok());
    }
1✔
2211

2212
    /// Runs `--pair-batch` against a fixture source holding a single record and expects `Ok`.
    #[test]
    fn run_multi_source_demo_pair_exhausted_branch_returns_ok() {
        let dir = tempdir().unwrap();
        let split_store_path = dir.path().join("pair_exhausted_split_store.bin");
        let args = [
            String::from("--pair-batch"),
            String::from("--split-store-path"),
            split_store_path.to_string_lossy().into_owned(),
        ];

        let outcome = run_multi_source_demo(
            args.into_iter(),
            |_| Ok(()),
            |_| {
                let only_record = fixture_record(
                    "pair_exhausted_source",
                    "r1",
                    1,
                    "Single record title",
                    "Single record body",
                );
                let source = FixtureSource {
                    id: "pair_exhausted_source".into(),
                    records: vec![only_record],
                    recipes: vec![default_recipe("pair_exhausted_recipe")],
                };
                vec![Box::new(source) as DynSource]
            },
        );

        assert!(outcome.is_ok());
    }
1✔
2242

2243
    /// Runs `--list-text-recipes` without `--split-store-path` and expects `Ok`.
    ///
    /// NOTE(review): presumably the managed cache path is used as the fallback here;
    /// the test asserts success only and does not inspect which path was chosen.
    #[test]
    fn run_multi_source_demo_uses_managed_split_store_path_when_not_provided() {
        let args = [String::from("--list-text-recipes")];

        let outcome = run_multi_source_demo(
            args.into_iter(),
            |_| Ok(()),
            |_| {
                let source = TestSource {
                    id: "managed_path_source".into(),
                    count: Some(2),
                    recipes: vec![default_recipe("managed_recipe")],
                };
                vec![Box::new(source) as DynSource]
            },
        );

        assert!(outcome.is_ok());
    }
1✔
2259

2260
    /// Asserts that `--reset` reports a removal failure when the split-store path is a directory.
    #[test]
    fn run_multi_source_demo_reset_errors_when_target_is_directory() {
        let dir = tempdir().unwrap();
        let split_store_path = dir.path().join("split_store_dir");
        // Put a directory where the split store file is expected.
        std::fs::create_dir(&split_store_path).unwrap();

        let result = run_multi_source_demo(
            [
                "--reset".to_string(),
                "--split-store-path".to_string(),
                split_store_path.to_string_lossy().to_string(),
            ]
            .into_iter(),
            |_| Ok(()),
            // The run fails before sources are built, so an inline closure here would never
            // execute; reuse the shared helper, as the other error-path tests do.
            empty_dyn_sources,
        );

        let err = result.unwrap_err().to_string();
        assert!(err.contains("failed to remove split store"));
    }
1✔
2280

2281
    /// Calls both summary printers with empty iterators; passing means neither call panicked.
    #[test]
    fn print_summary_helpers_accept_empty_iterators() {
        let no_sources = std::iter::empty::<&str>();
        print_source_summary("empty summary", no_sources);

        let no_recipe_contexts = std::iter::empty::<(&str, &str)>();
        print_recipe_context_by_source("empty recipe context", no_recipe_contexts);
    }
1✔
2286

2287
    /// Four unsorted values: both the mean and the median are expected to be 2.5.
    #[cfg(feature = "extended-metrics")]
    #[test]
    fn metric_mean_median_handles_even_length_inputs() {
        let mut samples = [1.0, 4.0, 2.0, 3.0];
        let stats = metric_mean_median(&mut samples);

        let (mean, median) = stats;
        assert!((mean - 2.5).abs() < 1e-6);
        assert!((median - 2.5).abs() < 1e-6);
    }
1✔
2295

2296
    /// Three unsorted values: both the mean and the median are expected to be 2.0.
    #[cfg(feature = "extended-metrics")]
    #[test]
    fn metric_mean_median_handles_odd_length_inputs() {
        let mut samples = [3.0, 1.0, 2.0];
        let stats = metric_mean_median(&mut samples);

        let (mean, median) = stats;
        assert!((mean - 2.0).abs() < 1e-6);
        assert!((median - 2.0).abs() < 1e-6);
    }
1✔
2304

2305
    /// Prints a metric summary for two sources with two samples each; passing means no panic.
    #[cfg(feature = "extended-metrics")]
    #[test]
    fn print_metric_summary_includes_multi_source_aggregate() {
        let mut per_source = HashMap::new();
        per_source.insert(
            String::from("source_a"),
            vec![(0.9, 0.8, 0.2, 0.1), (0.8, 0.7, 0.3, 0.2)],
        );
        per_source.insert(
            String::from("source_b"),
            vec![(0.7, 0.6, 0.4, 0.3), (0.6, 0.5, 0.5, 0.4)],
        );

        print_metric_summary(&per_source);
    }
1✔
2321
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc