• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

jzombie / rust-triplets / 23568005407

25 Mar 2026 10:46PM UTC coverage: 95.114% (+0.3%) from 94.794%
23568005407

Pull #40

github

web-flow
Merge 17a2a7f6a into 65addee9d
Pull Request #40: Refactor BM25 integration

3156 of 3228 new or added lines in 7 files covered. (97.77%)

2 existing lines in 2 files now uncovered.

15826 of 16639 relevant lines covered (95.11%)

132645.2 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

98.93
/src/example_apps.rs
1
// TODO: Consider extracting to a debug crate
2

3
use std::collections::HashMap;
4
use std::error::Error;
5
use std::path::PathBuf;
6
use std::sync::Arc;
7
use std::sync::Once;
8
use std::time::Instant;
9

10
use cache_manager::CacheRoot;
11
use clap::{Parser, ValueEnum, error::ErrorKind};
12

13
use crate::config::{ChunkingStrategy, SamplerConfig, TripletRecipe};
14
use crate::constants::cache::{MULTI_SOURCE_DEMO_GROUP, MULTI_SOURCE_DEMO_STORE_FILENAME};
15
use crate::data::ChunkView;
16
use crate::heuristics::{
17
    CapacityTotals, EFFECTIVE_NEGATIVES_PER_ANCHOR, EFFECTIVE_POSITIVES_PER_ANCHOR,
18
    estimate_source_split_capacity_from_counts, format_replay_factor, format_u128_with_commas,
19
    resolve_text_recipes_for_source, split_counts_for_total,
20
};
21
use crate::metrics::source_skew;
22
use crate::sampler::chunk_weight;
23
use crate::source::DataSource;
24
use crate::splits::{FileSplitStore, SplitLabel, SplitRatios, SplitStore};
25
use crate::{
26
    RecordChunk, SampleBatch, Sampler, SamplerError, SourceId, TextBatch, TextRecipe, TripletBatch,
27
    TripletSampler,
28
};
29

30
/// Boxed, type-erased data source; the element type of the `Vec` returned by
/// the injected `build_sources` callbacks in this module.
type DynSource = Box<dyn DataSource + 'static>;
31

32
fn managed_demo_split_store_path() -> Result<PathBuf, String> {
2✔
33
    let cache_root = CacheRoot::from_discovery()
2✔
34
        .map_err(|err| format!("failed discovering managed cache root: {err}"))?;
2✔
35
    let group = PathBuf::from(MULTI_SOURCE_DEMO_GROUP);
2✔
36
    let dir = cache_root.ensure_group(&group).map_err(|err| {
2✔
37
        format!(
×
38
            "failed creating managed demo cache group '{}': {err}",
39
            group.display()
×
40
        )
41
    })?;
×
42
    Ok(dir.join(MULTI_SOURCE_DEMO_STORE_FILENAME))
2✔
43
}
2✔
44

45
fn init_example_tracing() {
35✔
46
    static INIT: Once = Once::new();
47
    INIT.call_once(|| {
35✔
48
        let env_filter = tracing_subscriber::EnvFilter::try_from_default_env()
1✔
49
            .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("triplets=info"));
1✔
50
        let _ = tracing_subscriber::fmt()
1✔
51
            .with_env_filter(env_filter)
1✔
52
            .try_init();
1✔
53
    });
1✔
54
}
35✔
55

56
#[derive(Debug, Clone, Copy, ValueEnum)]
57
/// CLI split selector mapped onto `SplitLabel`.
// Each variant converts to the `SplitLabel` variant of the same name through
// the `From<SplitArg> for SplitLabel` impl in this file.
58
enum SplitArg {
59
    Train,
60
    Validation,
61
    Test,
62
}
63

64
impl From<SplitArg> for SplitLabel {
65
    fn from(value: SplitArg) -> Self {
6✔
66
        match value {
6✔
67
            SplitArg::Train => SplitLabel::Train,
1✔
68
            SplitArg::Validation => SplitLabel::Validation,
4✔
69
            SplitArg::Test => SplitLabel::Test,
1✔
70
        }
71
    }
6✔
72
}
73

74
#[derive(Debug, Parser)]
75
#[command(
76
    name = "estimate_capacity",
77
    disable_help_subcommand = true,
78
    about = "Metadata-only capacity estimation",
79
    long_about = "Estimate record, pair, triplet, and text-sample capacity using source-reported counts only (no data refresh).",
80
    after_help = "Source roots are optional and resolved in order by explicit arg, environment variables, then project defaults."
81
)]
82
/// CLI arguments for metadata-only capacity estimation.
// Parsed by `run_estimate_capacity`, which copies `seed` and `split` into a
// `SamplerConfig` and hands `source_roots` to the injected root resolver.
83
struct EstimateCapacityCli {
84
    // `--seed`: defaults to 99 when the flag is omitted.
    #[arg(
85
        long,
86
        default_value_t = 99,
87
        help = "Deterministic seed used for split allocation"
88
    )]
89
    seed: u64,
90
    // `--split-ratios`: the raw string is converted by `parse_split_ratios_arg`
    // (defined outside this view); defaults to "0.8,0.1,0.1".
    #[arg(
91
        long = "split-ratios",
92
        value_name = "TRAIN,VALIDATION,TEST",
93
        value_parser = parse_split_ratios_arg,
94
        default_value = "0.8,0.1,0.1",
95
        help = "Comma-separated split ratios that must sum to 1.0"
96
    )]
97
    split: SplitRatios,
98
    // `--source-root`: collected into a `Vec`, one entry per occurrence.
    #[arg(
99
        long = "source-root",
100
        value_name = "PATH",
101
        help = "Optional source root override, repeat as needed in source order"
102
    )]
103
    source_roots: Vec<String>,
104
}
105

106
#[derive(Debug, Parser)]
107
#[command(
108
    name = "multi_source_demo",
109
    disable_help_subcommand = true,
110
    about = "Run sampled batches from multiple sources",
111
    long_about = "Sample triplet, pair, or text batches from multiple sources and persist split/epoch state.",
112
    after_help = "Source roots are optional and resolved in order by explicit arg, environment variables, then project defaults."
113
)]
114
/// CLI for `multi_source_demo`.
115
///
116
/// Common usage:
117
/// - Use managed cache-group default path (no flag)
118
/// - Set an explicit file path: `--split-store-path /tmp/split_store.bin`
119
/// - Repeat `--source-root <PATH>` to override source roots in order
// Mode selection in `run_multi_source_demo` checks the flags in this order:
// `--pair-batch`, `--text-recipes`, `--list-text-recipes`, `--batches`, and
// otherwise samples a single triplet batch.
120
struct MultiSourceDemoCli {
121
    // Selects the text-batch mode.
    #[arg(
122
        long = "text-recipes",
123
        help = "Emit a text batch instead of a triplet batch"
124
    )]
125
    show_text_samples: bool,
126
    // Selects the pair-batch mode; checked before the other mode flags.
    #[arg(
127
        long = "pair-batch",
128
        help = "Emit a pair batch instead of a triplet batch"
129
    )]
130
    show_pair_samples: bool,
131
    // Selects the recipe-listing mode (no batch is sampled in that branch).
    #[arg(
132
        long = "list-text-recipes",
133
        help = "Print registered text recipes and exit"
134
    )]
135
    list_text_recipes: bool,
136
    // Validated by `parse_batch_size`; defaults to 4.
    #[arg(
137
        long = "batch-size",
138
        default_value_t = 4,
139
        value_parser = parse_batch_size,
140
        help = "Batch size used for sampling"
141
    )]
142
    batch_size: usize,
143
    // Default is taken from `SamplerConfig::default()` through
    // `default_ingestion_max_records()`.
    #[arg(
144
        long = "ingestion-max-records",
145
        default_value_t = default_ingestion_max_records(),
146
        value_parser = parse_ingestion_max_records,
147
        help = "Per-source ingestion buffer target used while refreshing records"
148
    )]
149
    ingestion_max_records: usize,
150
    // When absent, the seed from `SamplerConfig::default()` is kept.
    #[arg(long, help = "Optional deterministic seed override")]
151
    seed: Option<u64>,
152
    // When absent, `run_multi_source_demo` samples from `SplitLabel::Train`.
    #[arg(long, value_enum, help = "Target split to sample from")]
153
    split: Option<SplitArg>,
154
    #[arg(
155
        long = "source-root",
156
        value_name = "PATH",
157
        help = "Optional source root override, repeat as needed in source order"
158
    )]
159
    source_roots: Vec<String>,
160
    // When absent, the path comes from `managed_demo_split_store_path()`.
    #[arg(
161
        long = "split-store-path",
162
        value_name = "SPLIT_STORE_PATH",
163
        help = "Optional explicit path for persisted split/epoch state file"
164
    )]
165
    split_store_path: Option<PathBuf>,
166
    // The store file is only removed when it exists at the resolved path.
    #[arg(
167
        long = "reset",
168
        help = "Delete the persisted split/epoch state before sampling, restarting from epoch 0"
169
    )]
170
    reset: bool,
171
    // Validated by `parse_batch_count` (defined outside this view).
    #[arg(
172
        long = "batches",
173
        value_name = "N",
174
        value_parser = parse_batch_count,
175
        help = "Run N triplet batches in succession, printing a timing line per batch and (with --features extended-metrics) a per-source similarity summary at the end"
176
    )]
177
    batches: Option<usize>,
178
}
179

180
#[derive(Debug, Clone)]
181
/// Source-level inventory used by capacity estimation output.
182
struct SourceInventory {
183
    /// Identifier copied from the source's `id()`.
    source_id: String,
184
    /// Record count returned by the source's `reported_record_count`.
    reported_records: u128,
185
    /// Recipes used for this source: the configured recipes when non-empty,
    /// otherwise the source's `default_triplet_recipes()`.
    triplet_recipes: Vec<TripletRecipe>,
186
}
187

188
/// Run the capacity-estimation CLI with injectable root resolution/source builders.
189
///
190
/// `build_sources` is construction-only; sampler configuration is applied
191
/// centrally by this function before any source calls.
192
pub fn run_estimate_capacity<R, Resolve, Build, I>(
7✔
193
    args_iter: I,
7✔
194
    resolve_roots: Resolve,
7✔
195
    build_sources: Build,
7✔
196
) -> Result<(), Box<dyn Error>>
7✔
197
where
7✔
198
    Resolve: FnOnce(Vec<String>) -> Result<R, Box<dyn Error>>,
7✔
199
    Build: FnOnce(&R) -> Vec<DynSource>,
7✔
200
    I: Iterator<Item = String>,
7✔
201
{
202
    init_example_tracing();
7✔
203

204
    let Some(cli) = parse_cli::<EstimateCapacityCli, _>(
7✔
205
        std::iter::once("estimate_capacity".to_string()).chain(args_iter),
7✔
206
    )?
1✔
207
    else {
208
        return Ok(());
1✔
209
    };
210

211
    let roots = resolve_roots(cli.source_roots)?;
5✔
212

213
    let config = SamplerConfig {
4✔
214
        seed: cli.seed,
4✔
215
        split: cli.split,
4✔
216
        ..SamplerConfig::default()
4✔
217
    };
4✔
218

219
    let sources = build_sources(&roots);
4✔
220

221
    let mut inventories = Vec::new();
4✔
222
    for source in &sources {
4✔
223
        let recipes = if config.recipes.is_empty() {
3✔
224
            source.default_triplet_recipes()
3✔
225
        } else {
226
            config.recipes.clone()
×
227
        };
228
        let reported_records = source.reported_record_count(&config).map_err(|err| {
3✔
229
            format!(
1✔
230
                "source '{}' failed to report exact record count: {err}",
231
                source.id()
1✔
232
            )
233
        })?;
1✔
234
        inventories.push(SourceInventory {
2✔
235
            source_id: source.id().to_string(),
2✔
236
            reported_records,
2✔
237
            triplet_recipes: recipes,
2✔
238
        });
2✔
239
    }
240

241
    let mut per_source_split_counts: HashMap<(String, SplitLabel), u128> = HashMap::new();
3✔
242
    let mut split_record_counts: HashMap<SplitLabel, u128> = HashMap::new();
3✔
243

244
    for source in &inventories {
3✔
245
        let counts = split_counts_for_total(source.reported_records, cli.split);
2✔
246
        for (label, count) in counts {
6✔
247
            per_source_split_counts.insert((source.source_id.clone(), label), count);
6✔
248
            *split_record_counts.entry(label).or_insert(0) += count;
6✔
249
        }
6✔
250
    }
251

252
    let mut totals_by_split: HashMap<SplitLabel, CapacityTotals> = HashMap::new();
3✔
253
    let mut totals_by_source_and_split: HashMap<(String, SplitLabel), CapacityTotals> =
3✔
254
        HashMap::new();
3✔
255

256
    for split_label in [SplitLabel::Train, SplitLabel::Validation, SplitLabel::Test] {
9✔
257
        let mut totals = CapacityTotals::default();
9✔
258

259
        for source in &inventories {
9✔
260
            let source_split_records = per_source_split_counts
6✔
261
                .get(&(source.source_id.clone(), split_label))
6✔
262
                .copied()
6✔
263
                .unwrap_or(0);
6✔
264

6✔
265
            let triplet_recipes = &source.triplet_recipes;
6✔
266
            let text_recipes = resolve_text_recipes_for_source(&config, triplet_recipes);
6✔
267

6✔
268
            let capacity = estimate_source_split_capacity_from_counts(
6✔
269
                source_split_records,
6✔
270
                triplet_recipes,
6✔
271
                &text_recipes,
6✔
272
            );
6✔
273

6✔
274
            totals_by_source_and_split.insert((source.source_id.clone(), split_label), capacity);
6✔
275

6✔
276
            totals.triplets += capacity.triplets;
6✔
277
            totals.effective_triplets += capacity.effective_triplets;
6✔
278
            totals.pairs += capacity.pairs;
6✔
279
            totals.text_samples += capacity.text_samples;
6✔
280
        }
6✔
281

282
        totals_by_split.insert(split_label, totals);
9✔
283
    }
284

285
    let min_nonzero_records_by_split: HashMap<SplitLabel, u128> =
3✔
286
        [SplitLabel::Train, SplitLabel::Validation, SplitLabel::Test]
3✔
287
            .into_iter()
3✔
288
            .map(|split_label| {
9✔
289
                let min_nonzero = inventories
9✔
290
                    .iter()
9✔
291
                    .filter_map(|source| {
9✔
292
                        per_source_split_counts
6✔
293
                            .get(&(source.source_id.clone(), split_label))
6✔
294
                            .copied()
6✔
295
                    })
6✔
296
                    .filter(|&records| records > 0)
9✔
297
                    .min()
9✔
298
                    .unwrap_or(0);
9✔
299
                (split_label, min_nonzero)
9✔
300
            })
9✔
301
            .collect();
3✔
302

303
    let min_nonzero_records_all_splits = inventories
3✔
304
        .iter()
3✔
305
        .map(|source| source.reported_records)
3✔
306
        .filter(|&records| records > 0)
3✔
307
        .min()
3✔
308
        .unwrap_or(0);
3✔
309

310
    println!("=== capacity estimate (length-only) ===");
3✔
311
    println!("mode: metadata-only (no source.refresh calls)");
3✔
312
    println!("classification: heuristic approximation (not exact)");
3✔
313
    println!("split seed: {}", cli.seed);
3✔
314
    println!(
3✔
315
        "split ratios: train={:.4}, validation={:.4}, test={:.4}",
316
        cli.split.train, cli.split.validation, cli.split.test
317
    );
318
    println!();
3✔
319

320
    println!("[SOURCES]");
3✔
321
    for source in &inventories {
3✔
322
        println!(
2✔
323
            "  {} => reported records: {}",
2✔
324
            source.source_id,
2✔
325
            format_u128_with_commas(source.reported_records)
2✔
326
        );
2✔
327
    }
2✔
328
    println!();
3✔
329

330
    println!("[PER SOURCE BREAKDOWN]");
3✔
331
    for source in &inventories {
3✔
332
        println!("  {}", source.source_id);
2✔
333
        let mut source_grand = CapacityTotals::default();
2✔
334
        let mut source_total_records = 0u128;
2✔
335
        for split_label in [SplitLabel::Train, SplitLabel::Validation, SplitLabel::Test] {
6✔
336
            let split_records = per_source_split_counts
6✔
337
                .get(&(source.source_id.clone(), split_label))
6✔
338
                .copied()
6✔
339
                .unwrap_or(0);
6✔
340
            source_total_records = source_total_records.saturating_add(split_records);
6✔
341
            let split_longest_records = inventories
6✔
342
                .iter()
6✔
343
                .map(|candidate| {
6✔
344
                    per_source_split_counts
6✔
345
                        .get(&(candidate.source_id.clone(), split_label))
6✔
346
                        .copied()
6✔
347
                        .unwrap_or(0)
6✔
348
                })
6✔
349
                .max()
6✔
350
                .unwrap_or(0);
6✔
351
            let totals = totals_by_source_and_split
6✔
352
                .get(&(source.source_id.clone(), split_label))
6✔
353
                .copied()
6✔
354
                .unwrap_or_default();
6✔
355
            source_grand.triplets += totals.triplets;
6✔
356
            source_grand.effective_triplets += totals.effective_triplets;
6✔
357
            source_grand.pairs += totals.pairs;
6✔
358
            source_grand.text_samples += totals.text_samples;
6✔
359
            println!("    [{:?}]", split_label);
6✔
360
            println!("      records: {}", format_u128_with_commas(split_records));
6✔
361
            println!(
6✔
362
                "      triplet combinations: {}",
363
                format_u128_with_commas(totals.triplets)
6✔
364
            );
365
            println!(
6✔
366
                "      effective sampled triplets (p={}, k={}): {}",
367
                EFFECTIVE_POSITIVES_PER_ANCHOR,
368
                EFFECTIVE_NEGATIVES_PER_ANCHOR,
369
                format_u128_with_commas(totals.effective_triplets)
6✔
370
            );
371
            println!(
6✔
372
                "      pair combinations:    {}",
373
                format_u128_with_commas(totals.pairs)
6✔
374
            );
375
            println!(
6✔
376
                "      text samples:         {}",
377
                format_u128_with_commas(totals.text_samples)
6✔
378
            );
379
            println!(
6✔
380
                "      replay factor vs longest source: {}",
381
                format_replay_factor(split_longest_records, split_records)
6✔
382
            );
383
            println!(
6✔
384
                "      suggested proportional-size batch weight (0-1): {:.4}",
385
                suggested_balancing_weight(split_longest_records, split_records)
6✔
386
            );
387
            let split_smallest_nonzero = min_nonzero_records_by_split
6✔
388
                .get(&split_label)
6✔
389
                .copied()
6✔
390
                .unwrap_or(0);
6✔
391
            println!(
6✔
392
                "      suggested small-source-boost batch weight (0-1): {:.4}",
393
                suggested_oversampling_weight(split_smallest_nonzero, split_records)
6✔
394
            );
395
            println!();
6✔
396
        }
397
        let longest_source_total = inventories
2✔
398
            .iter()
2✔
399
            .map(|candidate| candidate.reported_records)
2✔
400
            .max()
2✔
401
            .unwrap_or(0);
2✔
402
        println!("    [ALL SPLITS FOR SOURCE]");
2✔
403
        println!(
2✔
404
            "      triplet combinations: {}",
405
            format_u128_with_commas(source_grand.triplets)
2✔
406
        );
407
        println!(
2✔
408
            "      effective sampled triplets (p={}, k={}): {}",
409
            EFFECTIVE_POSITIVES_PER_ANCHOR,
410
            EFFECTIVE_NEGATIVES_PER_ANCHOR,
411
            format_u128_with_commas(source_grand.effective_triplets)
2✔
412
        );
413
        println!(
2✔
414
            "      pair combinations:    {}",
415
            format_u128_with_commas(source_grand.pairs)
2✔
416
        );
417
        println!(
2✔
418
            "      text samples:         {}",
419
            format_u128_with_commas(source_grand.text_samples)
2✔
420
        );
421
        println!(
2✔
422
            "      replay factor vs longest source: {}",
423
            format_replay_factor(longest_source_total, source_total_records)
2✔
424
        );
425
        println!(
2✔
426
            "      suggested proportional-size batch weight (0-1): {:.4}",
427
            suggested_balancing_weight(longest_source_total, source_total_records)
2✔
428
        );
429
        println!(
2✔
430
            "      suggested small-source-boost batch weight (0-1): {:.4}",
431
            suggested_oversampling_weight(min_nonzero_records_all_splits, source_total_records)
2✔
432
        );
433
        println!();
2✔
434
    }
435

436
    let mut grand = CapacityTotals::default();
3✔
437
    for split_label in [SplitLabel::Train, SplitLabel::Validation, SplitLabel::Test] {
9✔
438
        let record_count = split_record_counts.get(&split_label).copied().unwrap_or(0);
9✔
439
        let totals = totals_by_split
9✔
440
            .get(&split_label)
9✔
441
            .copied()
9✔
442
            .unwrap_or_default();
9✔
443

9✔
444
        grand.triplets += totals.triplets;
9✔
445
        grand.effective_triplets += totals.effective_triplets;
9✔
446
        grand.pairs += totals.pairs;
9✔
447
        grand.text_samples += totals.text_samples;
9✔
448

9✔
449
        println!("[{:?}]", split_label);
9✔
450
        println!("  records: {}", format_u128_with_commas(record_count));
9✔
451
        println!(
9✔
452
            "  triplet combinations: {}",
9✔
453
            format_u128_with_commas(totals.triplets)
9✔
454
        );
9✔
455
        println!(
9✔
456
            "  effective sampled triplets (p={}, k={}): {}",
9✔
457
            EFFECTIVE_POSITIVES_PER_ANCHOR,
9✔
458
            EFFECTIVE_NEGATIVES_PER_ANCHOR,
9✔
459
            format_u128_with_commas(totals.effective_triplets)
9✔
460
        );
9✔
461
        println!(
9✔
462
            "  pair combinations:    {}",
9✔
463
            format_u128_with_commas(totals.pairs)
9✔
464
        );
9✔
465
        println!(
9✔
466
            "  text samples:         {}",
9✔
467
            format_u128_with_commas(totals.text_samples)
9✔
468
        );
9✔
469
        println!();
9✔
470
    }
9✔
471

472
    println!("[ALL SPLITS TOTAL]");
3✔
473
    println!(
3✔
474
        "  triplet combinations: {}",
475
        format_u128_with_commas(grand.triplets)
3✔
476
    );
477
    println!(
3✔
478
        "  effective sampled triplets (p={}, k={}): {}",
479
        EFFECTIVE_POSITIVES_PER_ANCHOR,
480
        EFFECTIVE_NEGATIVES_PER_ANCHOR,
481
        format_u128_with_commas(grand.effective_triplets)
3✔
482
    );
483
    println!(
3✔
484
        "  pair combinations:    {}",
485
        format_u128_with_commas(grand.pairs)
3✔
486
    );
487
    println!(
3✔
488
        "  text samples:         {}",
489
        format_u128_with_commas(grand.text_samples)
3✔
490
    );
491
    println!();
3✔
492
    println!(
3✔
493
        "Note: counts are heuristic, length-based estimates from source-reported totals and recipe structure. They are approximate, not exact, and assume anchor-positive pairs=records (one positive per anchor by default), negatives=source_records_in_split-1 (anchor excluded as its own negative), and at most one chunk/window realization per sample. In real-world chunked sampling, practical combinations are often higher, so treat this as a floor-like baseline."
494
    );
495
    println!();
3✔
496
    println!(
3✔
497
        "Effective sampled triplets apply a bounded training assumption: effective_triplets = records * p * k per triplet recipe, with defaults p={} positives per anchor and k={} negatives per anchor.",
498
        EFFECTIVE_POSITIVES_PER_ANCHOR, EFFECTIVE_NEGATIVES_PER_ANCHOR
499
    );
500
    println!();
3✔
501
    println!(
3✔
502
        "Oversample loops are not inferred from this static report. To measure true oversampling (how many times sampling loops through the combination space), use observed sampled draw counts from an actual run."
503
    );
504
    println!();
3✔
505
    println!(
3✔
506
        "Suggested proportional-size batch weight (0-1) is source/max_source by record count: 1.0 for the largest source in scope, smaller values for smaller sources."
507
    );
508
    println!();
3✔
509
    println!(
3✔
510
        "Suggested small-source-boost batch weight (0-1) is min_nonzero_source/source by record count: 1.0 for the smallest non-zero source in scope, smaller values for larger sources."
511
    );
512
    println!();
3✔
513
    println!(
3✔
514
        "When passed to next_*_batch_with_weights, higher weight means that source is sampled more often relative to lower-weight sources."
515
    );
516

517
    Ok(())
3✔
518
}
7✔
519

520
/// Run the multi-source demo CLI with injectable root resolution/source builders.
521
///
522
/// `build_sources` is construction-only. Source sampler configuration is owned
523
/// by sampler registration (`TripletSampler::register_source`).
524
pub fn run_multi_source_demo<R, Resolve, Build, I>(
28✔
525
    args_iter: I,
28✔
526
    resolve_roots: Resolve,
28✔
527
    build_sources: Build,
28✔
528
) -> Result<(), Box<dyn Error>>
28✔
529
where
28✔
530
    Resolve: FnOnce(Vec<String>) -> Result<R, Box<dyn Error>>,
28✔
531
    Build: FnOnce(&R) -> Vec<DynSource>,
28✔
532
    I: Iterator<Item = String>,
28✔
533
{
534
    init_example_tracing();
28✔
535

536
    let Some(cli) = parse_cli::<MultiSourceDemoCli, _>(
28✔
537
        std::iter::once("multi_source_demo".to_string()).chain(args_iter),
28✔
538
    )?
1✔
539
    else {
540
        return Ok(());
1✔
541
    };
542

543
    let roots = resolve_roots(cli.source_roots)?;
26✔
544

545
    let mut config = SamplerConfig::default();
24✔
546
    config.seed = cli.seed.unwrap_or(config.seed);
24✔
547
    config.batch_size = cli.batch_size;
24✔
548
    config.ingestion_max_records = cli.ingestion_max_records;
24✔
549
    config.chunking = Default::default();
24✔
550
    let selected_split = cli.split.map(Into::into).unwrap_or(SplitLabel::Train);
24✔
551
    config.split = SplitRatios::default();
24✔
552
    config.allowed_splits = vec![selected_split];
24✔
553
    let chunking = config.chunking.clone();
24✔
554
    let config_snapshot = MultiSourceDemoConfigSnapshot {
24✔
555
        seed: config.seed,
24✔
556
        batch_size: config.batch_size,
24✔
557
        ingestion_max_records: config.ingestion_max_records,
24✔
558
        split: selected_split,
24✔
559
        split_ratios: config.split,
24✔
560
        max_window_tokens: config.chunking.max_window_tokens,
24✔
561
        overlap_tokens: config.chunking.overlap_tokens.clone(),
24✔
562
        summary_fallback_tokens: config.chunking.summary_fallback_tokens,
24✔
563
    };
24✔
564

565
    let split_store_path = if let Some(path) = cli.split_store_path {
24✔
566
        path
23✔
567
    } else {
568
        managed_demo_split_store_path().map_err(|err| {
1✔
569
            Box::<dyn Error>::from(format!("failed to resolve demo split-store path: {err}"))
×
570
        })?
×
571
    };
572

573
    if cli.reset && split_store_path.exists() {
24✔
574
        std::fs::remove_file(&split_store_path).map_err(|err| {
2✔
575
            Box::<dyn Error>::from(format!(
1✔
576
                "failed to remove split store '{}': {err}",
1✔
577
                split_store_path.display()
1✔
578
            ))
1✔
579
        })?;
1✔
580
        println!("Reset: removed {}", split_store_path.display());
1✔
581
    }
22✔
582
    println!(
23✔
583
        "Persisting split assignments and epoch state to {}",
584
        split_store_path.display()
23✔
585
    );
586
    let sources = build_sources(&roots);
23✔
587
    let split_store = Arc::new(FileSplitStore::open(&split_store_path, config.split, 99)?);
23✔
588
    let sampler = TripletSampler::new(config, split_store.clone());
23✔
589
    for source in sources {
23✔
590
        sampler.register_source(source);
22✔
591
    }
22✔
592

593
    if cli.show_pair_samples {
23✔
594
        match sampler.next_pair_batch(selected_split) {
7✔
595
            Ok(pair_batch) => {
2✔
596
                if pair_batch.pairs.is_empty() {
2✔
597
                    println!("Pair sampling produced no results.");
×
598
                } else {
2✔
599
                    print_pair_batch(&chunking, &pair_batch, split_store.as_ref());
2✔
600
                }
2✔
601
                sampler.save_sampler_state(None)?;
2✔
602
            }
603
            Err(SamplerError::Exhausted(name)) => {
5✔
604
                eprintln!(
5✔
605
                    "Pair sampler exhausted recipe '{}'. Ensure both positive and negative examples exist.",
5✔
606
                    name
5✔
607
                );
5✔
608
            }
5✔
609
            Err(err) => return Err(err.into()),
×
610
        }
611
    } else if cli.show_text_samples {
16✔
612
        match sampler.next_text_batch(selected_split) {
4✔
613
            Ok(text_batch) => {
1✔
614
                if text_batch.samples.is_empty() {
1✔
615
                    println!(
×
616
                        "Text sampling produced no results. Ensure each source has eligible sections."
×
617
                    );
×
618
                } else {
1✔
619
                    print_text_batch(&chunking, &text_batch, split_store.as_ref());
1✔
620
                }
1✔
621
                sampler.save_sampler_state(None)?;
1✔
622
            }
623
            Err(SamplerError::Exhausted(name)) => {
3✔
624
                eprintln!(
3✔
625
                    "Text sampler exhausted selector '{}'. Ensure matching sections exist.",
3✔
626
                    name
3✔
627
                );
3✔
628
            }
3✔
629
            Err(err) => return Err(err.into()),
×
630
        }
631
    } else if cli.list_text_recipes {
12✔
632
        let recipes = sampler.text_recipes();
4✔
633
        if recipes.is_empty() {
4✔
634
            println!(
2✔
635
                "No text recipes registered. Ensure your sources expose triplet selectors or configure text_recipes explicitly."
2✔
636
            );
2✔
637
        } else {
2✔
638
            print_text_recipes(&recipes);
2✔
639
        }
2✔
640
    } else if let Some(batch_count) = cli.batches {
8✔
641
        print_demo_config(&config_snapshot);
3✔
642
        println!("=== benchmark: {} triplet batches ===", batch_count);
3✔
643

644
        // source_id -> Vec<(pos_jaccard, pos_byte_cosine, neg_jaccard, neg_byte_cosine)>
645
        #[cfg(feature = "extended-metrics")]
646
        let mut source_metrics: HashMap<String, Vec<(f32, f32, f32, f32)>> = HashMap::new();
3✔
647

648
        for i in 0..batch_count {
4✔
649
            let t0 = Instant::now();
4✔
650
            match sampler.next_triplet_batch(selected_split) {
4✔
651
                Ok(batch) => {
2✔
652
                    let elapsed = t0.elapsed();
2✔
653
                    let n = batch.triplets.len();
2✔
654
                    println!(
2✔
655
                        "batch {:>4}  triplets={:<4}  elapsed={:>8.2}ms  per_triplet={:.2}ms",
656
                        i + 1,
2✔
657
                        n,
658
                        elapsed.as_secs_f64() * 1000.0,
2✔
659
                        if n > 0 {
2✔
660
                            elapsed.as_secs_f64() * 1000.0 / n as f64
2✔
661
                        } else {
NEW
662
                            0.0
×
663
                        },
664
                    );
665
                    #[cfg(feature = "extended-metrics")]
666
                    {
667
                        use crate::metrics::lexical_similarity_scores;
668
                        for triplet in &batch.triplets {
8✔
669
                            let (pj, pc) = lexical_similarity_scores(
8✔
670
                                &triplet.anchor.text,
8✔
671
                                &triplet.positive.text,
8✔
672
                            );
8✔
673
                            let (nj, nc) = lexical_similarity_scores(
8✔
674
                                &triplet.anchor.text,
8✔
675
                                &triplet.negative.text,
8✔
676
                            );
8✔
677
                            let source = extract_source(&triplet.anchor.record_id);
8✔
678
                            source_metrics
8✔
679
                                .entry(source)
8✔
680
                                .or_default()
8✔
681
                                .push((pj, pc, nj, nc));
8✔
682
                        }
8✔
683
                    }
684
                }
685
                Err(SamplerError::Exhausted(name)) => {
2✔
686
                    println!(
2✔
687
                        "batch {:>4}  exhausted recipe '{}' — stopping early",
688
                        i + 1,
2✔
689
                        name
690
                    );
691
                    break;
2✔
692
                }
NEW
693
                Err(err) => return Err(err.into()),
×
694
            }
695
        }
696

697
        sampler.save_sampler_state(None)?;
3✔
698

699
        #[cfg(feature = "extended-metrics")]
700
        if !source_metrics.is_empty() {
3✔
701
            println!();
1✔
702
            print_metric_summary(&source_metrics);
1✔
703
        }
2✔
704

705
        #[cfg(all(feature = "extended-metrics", feature = "bm25-mining"))]
706
        {
707
            let (fallback, total) = sampler.bm25_fallback_stats();
3✔
708
            if total > 0 {
3✔
709
                let pct = fallback as f64 / total as f64 * 100.0;
1✔
710
                println!("bm25 fallback rate : {}/{} ({:.1}%)", fallback, total, pct);
1✔
711
            }
2✔
712
        }
713
    } else {
714
        match sampler.next_triplet_batch(selected_split) {
5✔
715
            Ok(triplet_batch) => {
1✔
716
                if triplet_batch.triplets.is_empty() {
1✔
717
                    println!(
×
718
                        "Triplet sampling produced no results. Ensure multiple records per source exist."
×
719
                    );
×
720
                } else {
1✔
721
                    print_triplet_batch(&chunking, &triplet_batch, split_store.as_ref());
1✔
722
                }
1✔
723
                sampler.save_sampler_state(None)?;
1✔
724
                #[cfg(all(feature = "extended-metrics", feature = "bm25-mining"))]
725
                {
726
                    let (fallback, total) = sampler.bm25_fallback_stats();
1✔
727
                    if total > 0 {
1✔
728
                        let pct = fallback as f64 / total as f64 * 100.0;
1✔
729
                        println!("bm25 fallback rate : {}/{} ({:.1}%)", fallback, total, pct);
1✔
730
                    }
1✔
731
                }
732
            }
733
            Err(SamplerError::Exhausted(name)) => {
4✔
734
                eprintln!(
4✔
735
                    "Triplet sampler exhausted recipe '{}'. Ensure both positive and negative examples exist.",
4✔
736
                    name
4✔
737
                );
4✔
738
            }
4✔
739
            Err(err) => return Err(err.into()),
×
740
        }
741
    }
742

743
    Ok(())
23✔
744
}
28✔
745

746
struct MultiSourceDemoConfigSnapshot {
747
    seed: u64,
748
    batch_size: usize,
749
    ingestion_max_records: usize,
750
    split: SplitLabel,
751
    split_ratios: SplitRatios,
752
    max_window_tokens: usize,
753
    overlap_tokens: Vec<usize>,
754
    summary_fallback_tokens: usize,
755
}
756

757
fn print_demo_config(cfg: &MultiSourceDemoConfigSnapshot) {
3✔
758
    let overlaps: Vec<String> = cfg.overlap_tokens.iter().map(|t| t.to_string()).collect();
3✔
759
    println!("=== sampler config ===");
3✔
760
    println!("seed                 : {}", cfg.seed);
3✔
761
    println!("batch_size           : {}", cfg.batch_size);
3✔
762
    println!("ingestion_max_records: {}", cfg.ingestion_max_records);
3✔
763
    println!("split                : {:?}", cfg.split);
3✔
764
    println!(
3✔
765
        "split_ratios         : train={:.2} val={:.2} test={:.2}",
766
        cfg.split_ratios.train, cfg.split_ratios.validation, cfg.split_ratios.test
767
    );
768
    println!("max_window_tokens    : {}", cfg.max_window_tokens);
3✔
769
    println!("overlap_tokens       : [{}]", overlaps.join(", "));
3✔
770
    println!(
3✔
771
        "summary_fallback     : {} tokens (0 = disabled)",
772
        cfg.summary_fallback_tokens
773
    );
774
    println!();
3✔
775
}
3✔
776

777
fn default_ingestion_max_records() -> usize {
1✔
778
    SamplerConfig::default().ingestion_max_records
1✔
779
}
1✔
780

781
/// Parses `raw` as a strictly positive `usize`.
///
/// `flag` is only used to name the offending option in the error text.
///
/// # Errors
///
/// Returns a message when `raw` is not a valid `usize`, or when it parses to
/// zero.
fn parse_positive_usize_flag(raw: &str, flag: &str) -> Result<usize, String> {
    match raw.parse::<usize>() {
        Err(_) => Err(format!(
            "Could not parse {} value '{}' as a positive integer",
            flag, raw
        )),
        Ok(0) => Err(format!("{} must be greater than zero", flag)),
        Ok(value) => Ok(value),
    }
}
65✔
793

794
fn parse_batch_size(raw: &str) -> Result<usize, String> {
31✔
795
    parse_positive_usize_flag(raw, "--batch-size")
31✔
796
}
31✔
797

798
fn parse_ingestion_max_records(raw: &str) -> Result<usize, String> {
30✔
799
    parse_positive_usize_flag(raw, "--ingestion-max-records")
30✔
800
}
30✔
801

802
fn parse_batch_count(raw: &str) -> Result<usize, String> {
4✔
803
    parse_positive_usize_flag(raw, "--batches")
4✔
804
}
4✔
805

806
/// Returns `source_baseline / max_baseline`, clamped to `[0.0, 1.0]`.
///
/// Yields `0.0` when either baseline is zero.
fn suggested_balancing_weight(max_baseline: u128, source_baseline: u128) -> f32 {
    match (max_baseline, source_baseline) {
        (0, _) | (_, 0) => 0.0,
        (largest, own) => {
            let ratio = own as f64 / largest as f64;
            ratio.clamp(0.0, 1.0) as f32
        }
    }
}
13✔
812

813
/// Returns `min_nonzero_baseline / source_baseline`, clamped to `[0.0, 1.0]`.
///
/// Yields `0.0` when either baseline is zero.
fn suggested_oversampling_weight(min_nonzero_baseline: u128, source_baseline: u128) -> f32 {
    match (min_nonzero_baseline, source_baseline) {
        (0, _) | (_, 0) => 0.0,
        (smallest, own) => {
            let ratio = smallest as f64 / own as f64;
            ratio.clamp(0.0, 1.0) as f32
        }
    }
}
13✔
819

820
fn parse_cli<T, I>(args: I) -> Result<Option<T>, Box<dyn Error>>
42✔
821
where
42✔
822
    T: Parser,
42✔
823
    I: IntoIterator,
42✔
824
    I::Item: Into<std::ffi::OsString> + Clone,
42✔
825
{
826
    match T::try_parse_from(args) {
42✔
827
        Ok(cli) => Ok(Some(cli)),
32✔
828
        Err(err) => match err.kind() {
10✔
829
            ErrorKind::DisplayHelp | ErrorKind::DisplayVersion => {
830
                err.print()?;
5✔
831
                Ok(None)
5✔
832
            }
833
            _ => Err(err.into()),
5✔
834
        },
835
    }
836
}
42✔
837

838
fn parse_split_ratios_arg(raw: &str) -> Result<SplitRatios, String> {
12✔
839
    let parts: Vec<&str> = raw.split(',').collect();
12✔
840
    if parts.len() != 3 {
12✔
841
        return Err("--split-ratios expects exactly 3 comma-separated values".to_string());
1✔
842
    }
11✔
843
    let train = parts[0]
11✔
844
        .trim()
11✔
845
        .parse::<f32>()
11✔
846
        .map_err(|_| format!("invalid train ratio '{}': must be a float", parts[0].trim()))?;
11✔
847
    let validation = parts[1].trim().parse::<f32>().map_err(|_| {
10✔
848
        format!(
1✔
849
            "invalid validation ratio '{}': must be a float",
850
            parts[1].trim()
1✔
851
        )
852
    })?;
1✔
853
    let test = parts[2]
9✔
854
        .trim()
9✔
855
        .parse::<f32>()
9✔
856
        .map_err(|_| format!("invalid test ratio '{}': must be a float", parts[2].trim()))?;
9✔
857
    let ratios = SplitRatios {
8✔
858
        train,
8✔
859
        validation,
8✔
860
        test,
8✔
861
    };
8✔
862
    let sum = ratios.train + ratios.validation + ratios.test;
8✔
863
    if (sum - 1.0).abs() > 1e-5 {
8✔
864
        return Err(format!(
1✔
865
            "split ratios must sum to 1.0, got {:.6} (train={}, validation={}, test={})",
1✔
866
            sum, ratios.train, ratios.validation, ratios.test
1✔
867
        ));
1✔
868
    }
7✔
869
    if ratios.train < 0.0 || ratios.validation < 0.0 || ratios.test < 0.0 {
7✔
870
        return Err("split ratios must be non-negative".to_string());
1✔
871
    }
6✔
872
    Ok(ratios)
6✔
873
}
12✔
874

875
fn print_triplet_batch(
2✔
876
    strategy: &ChunkingStrategy,
2✔
877
    batch: &TripletBatch,
2✔
878
    split_store: &impl SplitStore,
2✔
879
) {
2✔
880
    println!("=== triplet batch ===");
2✔
881
    for (idx, triplet) in batch.triplets.iter().enumerate() {
5✔
882
        println!("--- triplet #{} ---", idx);
5✔
883
        println!("recipe       : {}", triplet.recipe);
5✔
884
        println!("sample_weight: {:.4}", triplet.weight);
5✔
885
        if let Some(instr) = &triplet.instruction {
5✔
886
            println!("instruction shown to model:\n{}\n", instr);
1✔
887
        }
4✔
888
        #[cfg(feature = "extended-metrics")]
889
        let (pos_sim, neg_sim) = {
5✔
890
            use crate::metrics::lexical_similarity_scores;
891
            (
5✔
892
                Some(lexical_similarity_scores(
5✔
893
                    &triplet.anchor.text,
5✔
894
                    &triplet.positive.text,
5✔
895
                )),
5✔
896
                Some(lexical_similarity_scores(
5✔
897
                    &triplet.anchor.text,
5✔
898
                    &triplet.negative.text,
5✔
899
                )),
5✔
900
            )
5✔
901
        };
902
        #[cfg(not(feature = "extended-metrics"))]
903
        let (pos_sim, neg_sim): (Option<(f32, f32)>, Option<(f32, f32)>) = (None, None);
904
        print_chunk_block("ANCHOR", &triplet.anchor, strategy, split_store, None);
5✔
905
        print_chunk_block(
5✔
906
            "POSITIVE",
5✔
907
            &triplet.positive,
5✔
908
            strategy,
5✔
909
            split_store,
5✔
910
            pos_sim,
5✔
911
        );
912
        print_chunk_block(
5✔
913
            "NEGATIVE",
5✔
914
            &triplet.negative,
5✔
915
            strategy,
5✔
916
            split_store,
5✔
917
            neg_sim,
5✔
918
        );
919
    }
920
    print_source_summary(
2✔
921
        "triplet anchors",
2✔
922
        batch
2✔
923
            .triplets
2✔
924
            .iter()
2✔
925
            .map(|triplet| triplet.anchor.record_id.as_str()),
5✔
926
    );
927
    print_recipe_context_by_source(
2✔
928
        "triplet recipes by source",
2✔
929
        batch
2✔
930
            .triplets
2✔
931
            .iter()
2✔
932
            .map(|triplet| (triplet.anchor.record_id.as_str(), triplet.recipe.as_str())),
5✔
933
    );
934
}
2✔
935

936
fn print_text_batch(strategy: &ChunkingStrategy, batch: &TextBatch, split_store: &impl SplitStore) {
2✔
937
    println!("=== text batch ===");
2✔
938
    for (idx, sample) in batch.samples.iter().enumerate() {
5✔
939
        println!("--- sample #{} ---", idx);
5✔
940
        println!("recipe       : {}", sample.recipe);
5✔
941
        println!("sample_weight: {:.4}", sample.weight);
5✔
942
        if let Some(instr) = &sample.instruction {
5✔
943
            println!("instruction shown to model:\n{}\n", instr);
1✔
944
        }
4✔
945
        print_chunk_block("TEXT", &sample.chunk, strategy, split_store, None);
5✔
946
    }
947
    print_source_summary(
2✔
948
        "text samples",
2✔
949
        batch
2✔
950
            .samples
2✔
951
            .iter()
2✔
952
            .map(|sample| sample.chunk.record_id.as_str()),
5✔
953
    );
954
    print_recipe_context_by_source(
2✔
955
        "text recipes by source",
2✔
956
        batch
2✔
957
            .samples
2✔
958
            .iter()
2✔
959
            .map(|sample| (sample.chunk.record_id.as_str(), sample.recipe.as_str())),
5✔
960
    );
961
}
2✔
962

963
fn print_pair_batch(
3✔
964
    strategy: &ChunkingStrategy,
3✔
965
    batch: &SampleBatch,
3✔
966
    split_store: &impl SplitStore,
3✔
967
) {
3✔
968
    println!("=== pair batch ===");
3✔
969
    for (idx, pair) in batch.pairs.iter().enumerate() {
9✔
970
        println!("--- pair #{} ---", idx);
9✔
971
        println!("recipe       : {}", pair.recipe);
9✔
972
        println!("label        : {:?}", pair.label);
9✔
973
        if let Some(reason) = &pair.reason {
9✔
974
            println!("reason       : {}", reason);
5✔
975
        }
5✔
976
        print_chunk_block("ANCHOR", &pair.anchor, strategy, split_store, None);
9✔
977
        print_chunk_block("OTHER", &pair.positive, strategy, split_store, None);
9✔
978
    }
979
    print_source_summary(
3✔
980
        "pair anchors",
3✔
981
        batch
3✔
982
            .pairs
3✔
983
            .iter()
3✔
984
            .map(|pair| pair.anchor.record_id.as_str()),
9✔
985
    );
986
    print_recipe_context_by_source(
3✔
987
        "pair recipes by source",
3✔
988
        batch
3✔
989
            .pairs
3✔
990
            .iter()
3✔
991
            .map(|pair| (pair.anchor.record_id.as_str(), pair.recipe.as_str())),
9✔
992
    );
993
}
3✔
994

995
fn print_text_recipes(recipes: &[TextRecipe]) {
3✔
996
    println!("=== available text recipes ===");
3✔
997
    for recipe in recipes {
7✔
998
        println!(
7✔
999
            "- {} (weight: {:.3}) selector={:?}",
1000
            recipe.name, recipe.weight, recipe.selector
1001
        );
1002
        if let Some(instr) = &recipe.instruction {
7✔
1003
            println!("  instruction: {}", instr);
1✔
1004
        }
6✔
1005
    }
1006
}
3✔
1007

1008
#[cfg(feature = "extended-metrics")]
1009
/// Computes `(mean, median)` of `vals`, sorting the slice in place.
///
/// An empty slice yields `(NaN, NaN)` rather than panicking on the median
/// index computation. Sorting uses `f32::total_cmp`, so a NaN element cannot
/// break the comparator's total-order requirement.
fn metric_mean_median(vals: &mut [f32]) -> (f32, f32) {
    if vals.is_empty() {
        // No samples: both statistics are undefined.
        return (f32::NAN, f32::NAN);
    }

    let mean = vals.iter().sum::<f32>() / vals.len() as f32;

    // Values that compare equal under `total_cmp` are bit-identical, so an
    // unstable sort cannot change the result.
    vals.sort_unstable_by(f32::total_cmp);

    let mid = vals.len() / 2;
    let median = if vals.len() % 2 == 1 {
        vals[mid]
    } else {
        // Even length: average the two central values. `mid >= 1` here
        // because the slice is non-empty.
        (vals[mid - 1] + vals[mid]) / 2.0
    };
    (mean, median)
}
18✔
1019

1020
#[cfg(feature = "extended-metrics")]
1021
fn print_metric_summary(source_data: &HashMap<String, Vec<(f32, f32, f32, f32)>>) {
2✔
1022
    let total: usize = source_data.values().map(|v| v.len()).sum();
3✔
1023
    let n_sources = source_data.len();
2✔
1024
    println!(
2✔
1025
        "=== extended metrics summary ({} triplets, {} {}) ===",
1026
        total,
1027
        n_sources,
1028
        if n_sources == 1 { "source" } else { "sources" }
2✔
1029
    );
1030

1031
    // Returns [pos, neg] as (mean, median) pairs for one metric across entries.
1032
    fn metric_pair(
8✔
1033
        entries: &[(f32, f32, f32, f32)],
8✔
1034
        pos_idx: usize,
8✔
1035
        neg_idx: usize,
8✔
1036
    ) -> [(f32, f32); 2] {
8✔
1037
        let extract = |idx: usize| -> Vec<f32> {
16✔
1038
            entries
16✔
1039
                .iter()
16✔
1040
                .map(|e| match idx {
64✔
1041
                    0 => e.0,
16✔
1042
                    1 => e.1,
16✔
1043
                    2 => e.2,
16✔
1044
                    _ => e.3,
16✔
1045
                })
64✔
1046
                .collect()
16✔
1047
        };
16✔
1048
        let mut pos_vals = extract(pos_idx);
8✔
1049
        let mut neg_vals = extract(neg_idx);
8✔
1050
        [
8✔
1051
            metric_mean_median(&mut pos_vals),
8✔
1052
            metric_mean_median(&mut neg_vals),
8✔
1053
        ]
8✔
1054
    }
8✔
1055

1056
    fn print_metric_section(
4✔
1057
        label: &str,
4✔
1058
        sources: &[&String],
4✔
1059
        source_data: &HashMap<String, Vec<(f32, f32, f32, f32)>>,
4✔
1060
        pos_idx: usize,
4✔
1061
        neg_idx: usize,
4✔
1062
        total: usize,
4✔
1063
        n_sources: usize,
4✔
1064
    ) {
4✔
1065
        const SEP: usize = 83;
1066
        println!();
4✔
1067
        println!("[{}]", label);
4✔
1068
        println!(
4✔
1069
            "{:<24} {:>5}  {:<16} {:<16} {:<16}",
1070
            "source", "n", "positive", "negative", "gap (pos\u{2212}neg)"
1071
        );
1072
        println!(
4✔
1073
            "{:<24} {:>5}  {:<16} {:<16} {:<16}",
1074
            "", "", "mean / median", "mean / median", "mean / median"
1075
        );
1076
        println!("{}", "-".repeat(SEP));
4✔
1077
        for source in sources {
6✔
1078
            let entries = &source_data[*source];
6✔
1079
            let [pos, neg] = metric_pair(entries, pos_idx, neg_idx);
6✔
1080
            let gap_mean = pos.0 - neg.0;
6✔
1081
            let gap_med = pos.1 - neg.1;
6✔
1082
            println!(
6✔
1083
                "{:<24} {:>5}  {:.3} / {:.3}     {:.3} / {:.3}     {:+.3} / {:+.3}",
6✔
1084
                source,
6✔
1085
                entries.len(),
6✔
1086
                pos.0,
6✔
1087
                pos.1,
6✔
1088
                neg.0,
6✔
1089
                neg.1,
6✔
1090
                gap_mean,
6✔
1091
                gap_med,
6✔
1092
            );
6✔
1093
        }
6✔
1094
        if n_sources > 1 {
4✔
1095
            let all: Vec<(f32, f32, f32, f32)> = source_data.values().flatten().copied().collect();
2✔
1096
            let [pos, neg] = metric_pair(&all, pos_idx, neg_idx);
2✔
1097
            let gap_mean = pos.0 - neg.0;
2✔
1098
            let gap_med = pos.1 - neg.1;
2✔
1099
            println!("{}", "-".repeat(SEP));
2✔
1100
            println!(
2✔
1101
                "{:<24} {:>5}  {:.3} / {:.3}     {:.3} / {:.3}     {:+.3} / {:+.3}",
2✔
1102
                "ALL", total, pos.0, pos.1, neg.0, neg.1, gap_mean, gap_med,
2✔
1103
            );
2✔
1104
        }
2✔
1105
    }
4✔
1106

1107
    let mut sources: Vec<&String> = source_data.keys().collect();
2✔
1108
    sources.sort();
2✔
1109

1110
    print_metric_section(
2✔
1111
        "jaccard \u{2194} anchor",
2✔
1112
        &sources,
2✔
1113
        source_data,
2✔
1114
        0,
1115
        2,
1116
        total,
2✔
1117
        n_sources,
2✔
1118
    );
1119
    print_metric_section(
2✔
1120
        "byte-cos \u{2194} anchor",
2✔
1121
        &sources,
2✔
1122
        source_data,
2✔
1123
        1,
1124
        3,
1125
        total,
2✔
1126
        n_sources,
2✔
1127
    );
1128
    println!();
2✔
1129
}
2✔
1130

1131
trait ChunkDebug {
1132
    fn view_name(&self) -> String;
1133
}
1134

1135
impl ChunkDebug for RecordChunk {
1136
    fn view_name(&self) -> String {
38✔
1137
        match &self.view {
38✔
1138
            ChunkView::Window {
1139
                index,
36✔
1140
                span,
36✔
1141
                overlap,
36✔
1142
                start_ratio,
36✔
1143
            } => format!(
36✔
1144
                "window#index={} span={} overlap={} start_ratio={:.3} tokens={}",
1145
                index, span, overlap, start_ratio, self.tokens_estimate
1146
            ),
1147
            ChunkView::SummaryFallback { strategy, .. } => {
2✔
1148
                format!("summary:{} tokens={}", strategy, self.tokens_estimate)
2✔
1149
            }
1150
        }
1151
    }
38✔
1152
}
1153

1154
fn print_chunk_block(
38✔
1155
    title: &str,
38✔
1156
    chunk: &RecordChunk,
38✔
1157
    strategy: &ChunkingStrategy,
38✔
1158
    split_store: &impl SplitStore,
38✔
1159
    anchor_sim: Option<(f32, f32)>,
38✔
1160
) {
38✔
1161
    let chunk_weight = chunk_weight(strategy, chunk);
38✔
1162
    let split = split_store
38✔
1163
        .label_for(&chunk.record_id)
38✔
1164
        .map(|label| format!("{:?}", label))
38✔
1165
        .unwrap_or_else(|| "Unknown".to_string());
38✔
1166
    println!("--- {} ---", title);
38✔
1167
    println!("split        : {}", split);
38✔
1168
    println!("view         : {}", chunk.view_name());
38✔
1169
    println!("chunk_weight : {:.4}", chunk_weight);
38✔
1170
    println!("record_id    : {}", chunk.record_id);
38✔
1171
    println!("section_idx  : {}", chunk.section_idx);
38✔
1172
    println!("token_est    : {}", chunk.tokens_estimate);
38✔
1173
    if let Some((j, c)) = anchor_sim {
38✔
1174
        println!("jaccard(↔a)  : {:.4}  byte-cos(↔a): {:.4}", j, c);
10✔
1175
    }
28✔
1176
    println!("model_input (exact text sent to the model):");
38✔
1177
    println!(
38✔
1178
        "<<< BEGIN MODEL TEXT >>>\n{}\n<<< END MODEL TEXT >>>\n",
1179
        chunk.text
1180
    );
1181
}
38✔
1182

1183
fn print_source_summary<'a, I>(label: &str, ids: I)
9✔
1184
where
9✔
1185
    I: Iterator<Item = &'a str>,
9✔
1186
{
1187
    let mut counts: HashMap<SourceId, usize> = HashMap::new();
9✔
1188
    for id in ids {
23✔
1189
        let source = extract_source(id);
23✔
1190
        *counts.entry(source).or_insert(0) += 1;
23✔
1191
    }
23✔
1192
    if counts.is_empty() {
9✔
1193
        return;
1✔
1194
    }
8✔
1195
    let skew = source_skew(&counts);
8✔
1196
    let mut entries: Vec<(String, usize)> = counts.into_iter().collect();
8✔
1197
    entries.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
8✔
1198
    println!("--- {} by source ---", label);
8✔
1199
    if let Some(skew) = skew {
8✔
1200
        for entry in &skew.per_source {
10✔
1201
            println!(
10✔
1202
                "{}: count={} share={:.2}",
10✔
1203
                entry.source, entry.count, entry.share
10✔
1204
            );
10✔
1205
        }
10✔
1206
        println!(
8✔
1207
            "skew: sources={} total={} min={} max={} mean={:.2} ratio={:.2}",
1208
            skew.sources, skew.total, skew.min, skew.max, skew.mean, skew.ratio
1209
        );
UNCOV
1210
    }
×
1211
}
9✔
1212

1213
fn print_recipe_context_by_source<'a, I>(label: &str, entries: I)
8✔
1214
where
8✔
1215
    I: Iterator<Item = (&'a str, &'a str)>,
8✔
1216
{
1217
    let mut counts: HashMap<SourceId, HashMap<String, usize>> = HashMap::new();
8✔
1218
    for (record_id, recipe) in entries {
19✔
1219
        let source = extract_source(record_id);
19✔
1220
        let entry = counts
19✔
1221
            .entry(source)
19✔
1222
            .or_default()
19✔
1223
            .entry(recipe.to_string())
19✔
1224
            .or_insert(0);
19✔
1225
        *entry += 1;
19✔
1226
    }
19✔
1227
    if counts.is_empty() {
8✔
1228
        return;
1✔
1229
    }
7✔
1230
    let mut sources: Vec<(SourceId, HashMap<String, usize>)> = counts.into_iter().collect();
7✔
1231
    sources.sort_by(|a, b| a.0.cmp(&b.0));
7✔
1232
    println!("--- {} ---", label);
7✔
1233
    for (source, recipes) in sources {
7✔
1234
        println!("{source}");
7✔
1235
        let mut entries: Vec<(String, usize)> = recipes.into_iter().collect();
7✔
1236
        entries.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
7✔
1237
        for (recipe, count) in entries {
8✔
1238
            println!("  - {recipe}={count}");
8✔
1239
        }
8✔
1240
    }
1241
}
8✔
1242

1243
fn extract_source(record_id: &str) -> SourceId {
52✔
1244
    record_id
52✔
1245
        .split_once("::")
52✔
1246
        .map(|(source, _)| source.to_string())
52✔
1247
        .unwrap_or_else(|| "unknown".to_string())
52✔
1248
}
52✔
1249

1250
#[cfg(test)]
1251
mod tests {
1252
    use super::*;
1253
    use crate::DataRecord;
1254
    use crate::DeterministicSplitStore;
1255
    use crate::data::{QualityScore, RecordSection, SectionRole};
1256
    use crate::source::{SourceCursor, SourceSnapshot};
1257
    use crate::utils::make_section;
1258
    use chrono::{TimeZone, Utc};
1259
    use tempfile::tempdir;
1260

1261
    /// Source builder for tests that need no sources at all.
    fn empty_dyn_sources(_roots: &()) -> Vec<DynSource> {
        Vec::<DynSource>::new()
    }
2✔
1264

1265
    /// Root resolver for tests that always succeeds with `()`.
    fn ok_unit_roots(_roots: Vec<String>) -> Result<(), Box<dyn Error>> {
        let resolved = ();
        Ok(resolved)
    }
1✔
1268

1269
    /// Root resolver for tests that always fails with a fixed error message.
    fn error_unit_roots(_roots: Vec<String>) -> Result<(), Box<dyn Error>> {
        let failure: Box<dyn Error> = "root-resolution-error".into();
        Err(failure)
    }
2✔
1272

1273
    struct ErrorRefreshSource {
1274
        id: String,
1275
    }
1276

1277
    impl DataSource for ErrorRefreshSource {
1278
        fn id(&self) -> &str {
84✔
1279
            &self.id
84✔
1280
        }
84✔
1281

1282
        fn refresh(
20✔
1283
            &self,
20✔
1284
            _config: &SamplerConfig,
20✔
1285
            _cursor: Option<&SourceCursor>,
20✔
1286
            _limit: Option<usize>,
20✔
1287
        ) -> Result<SourceSnapshot, SamplerError> {
20✔
1288
            Err(SamplerError::SourceUnavailable {
20✔
1289
                source_id: self.id.clone(),
20✔
1290
                reason: "simulated refresh failure".to_string(),
20✔
1291
            })
20✔
1292
        }
20✔
1293

1294
        fn reported_record_count(&self, _config: &SamplerConfig) -> Result<u128, SamplerError> {
1✔
1295
            Ok(1)
1✔
1296
        }
1✔
1297

1298
        fn default_triplet_recipes(&self) -> Vec<TripletRecipe> {
5✔
1299
            vec![default_recipe("error_refresh_recipe")]
5✔
1300
        }
5✔
1301
    }
1302

1303
    /// Minimal in-memory `DataSource` test double for example app tests.
1304
    struct TestSource {
1305
        id: String,
1306
        count: Option<u128>,
1307
        recipes: Vec<TripletRecipe>,
1308
    }
1309

1310
    impl DataSource for TestSource {
1311
        fn id(&self) -> &str {
131✔
1312
            &self.id
131✔
1313
        }
131✔
1314

1315
        fn refresh(
30✔
1316
            &self,
30✔
1317
            _config: &SamplerConfig,
30✔
1318
            _cursor: Option<&SourceCursor>,
30✔
1319
            _limit: Option<usize>,
30✔
1320
        ) -> Result<SourceSnapshot, SamplerError> {
30✔
1321
            Ok(SourceSnapshot {
30✔
1322
                records: Vec::new(),
30✔
1323
                cursor: SourceCursor {
30✔
1324
                    last_seen: Utc::now(),
30✔
1325
                    revision: 0,
30✔
1326
                },
30✔
1327
            })
30✔
1328
        }
30✔
1329

1330
        fn reported_record_count(&self, _config: &SamplerConfig) -> Result<u128, SamplerError> {
2✔
1331
            self.count.ok_or_else(|| SamplerError::SourceInconsistent {
2✔
1332
                source_id: self.id.clone(),
1✔
1333
                details: "test source has no configured exact count".to_string(),
1✔
1334
            })
1✔
1335
        }
2✔
1336

1337
        fn default_triplet_recipes(&self) -> Vec<TripletRecipe> {
11✔
1338
            self.recipes.clone()
11✔
1339
        }
11✔
1340
    }
1341

1342
    struct ConfigRequiredSource {
1343
        id: String,
1344
        expected_seed: u64,
1345
    }
1346

1347
    impl DataSource for ConfigRequiredSource {
1348
        fn id(&self) -> &str {
1✔
1349
            &self.id
1✔
1350
        }
1✔
1351

1352
        fn refresh(
1✔
1353
            &self,
1✔
1354
            _config: &SamplerConfig,
1✔
1355
            _cursor: Option<&SourceCursor>,
1✔
1356
            _limit: Option<usize>,
1✔
1357
        ) -> Result<SourceSnapshot, SamplerError> {
1✔
1358
            Ok(SourceSnapshot {
1✔
1359
                records: Vec::new(),
1✔
1360
                cursor: SourceCursor {
1✔
1361
                    last_seen: Utc::now(),
1✔
1362
                    revision: 0,
1✔
1363
                },
1✔
1364
            })
1✔
1365
        }
1✔
1366

1367
        fn reported_record_count(&self, config: &SamplerConfig) -> Result<u128, SamplerError> {
2✔
1368
            if config.seed == self.expected_seed {
2✔
1369
                Ok(1)
1✔
1370
            } else {
1371
                Err(SamplerError::SourceInconsistent {
1✔
1372
                    source_id: self.id.clone(),
1✔
1373
                    details: format!(
1✔
1374
                        "expected sampler seed {} but got {}",
1✔
1375
                        self.expected_seed, config.seed
1✔
1376
                    ),
1✔
1377
                })
1✔
1378
            }
1379
        }
2✔
1380

1381
        fn default_triplet_recipes(&self) -> Vec<TripletRecipe> {
2✔
1382
            Vec::new()
2✔
1383
        }
2✔
1384
    }
1385

1386
    struct FixtureSource {
1387
        id: String,
1388
        records: Vec<DataRecord>,
1389
        recipes: Vec<TripletRecipe>,
1390
    }
1391

1392
    impl DataSource for FixtureSource {
1393
        fn id(&self) -> &str {
65✔
1394
            &self.id
65✔
1395
        }
65✔
1396

1397
        fn refresh(
15✔
1398
            &self,
15✔
1399
            _config: &SamplerConfig,
15✔
1400
            _cursor: Option<&SourceCursor>,
15✔
1401
            _limit: Option<usize>,
15✔
1402
        ) -> Result<SourceSnapshot, SamplerError> {
15✔
1403
            Ok(SourceSnapshot {
15✔
1404
                records: self.records.clone(),
15✔
1405
                cursor: SourceCursor {
15✔
1406
                    last_seen: Utc::now(),
15✔
1407
                    revision: 0,
15✔
1408
                },
15✔
1409
            })
15✔
1410
        }
15✔
1411

1412
        fn reported_record_count(&self, _config: &SamplerConfig) -> Result<u128, SamplerError> {
1✔
1413
            Ok(self.records.len() as u128)
1✔
1414
        }
1✔
1415

1416
        fn default_triplet_recipes(&self) -> Vec<TripletRecipe> {
6✔
1417
            self.recipes.clone()
6✔
1418
        }
6✔
1419
    }
1420

1421
    struct IngestionConfigSource {
1422
        expected_ingestion_max_records: usize,
1423
        records: Vec<DataRecord>,
1424
    }
1425

1426
    impl DataSource for IngestionConfigSource {
1427
        fn id(&self) -> &str {
7✔
1428
            "ingestion_config_source"
7✔
1429
        }
7✔
1430

1431
        fn refresh(
3✔
1432
            &self,
3✔
1433
            config: &SamplerConfig,
3✔
1434
            _cursor: Option<&SourceCursor>,
3✔
1435
            _limit: Option<usize>,
3✔
1436
        ) -> Result<SourceSnapshot, SamplerError> {
3✔
1437
            if config.ingestion_max_records != self.expected_ingestion_max_records {
3✔
1438
                return Err(SamplerError::SourceInconsistent {
1✔
1439
                    source_id: self.id().to_string(),
1✔
1440
                    details: format!(
1✔
1441
                        "expected ingestion_max_records {} but got {}",
1✔
1442
                        self.expected_ingestion_max_records, config.ingestion_max_records
1✔
1443
                    ),
1✔
1444
                });
1✔
1445
            }
2✔
1446
            Ok(SourceSnapshot {
2✔
1447
                records: self.records.clone(),
2✔
1448
                cursor: SourceCursor {
2✔
1449
                    last_seen: Utc::now(),
2✔
1450
                    revision: 0,
2✔
1451
                },
2✔
1452
            })
2✔
1453
        }
3✔
1454

1455
        fn reported_record_count(&self, _config: &SamplerConfig) -> Result<u128, SamplerError> {
1✔
1456
            Ok(self.records.len() as u128)
1✔
1457
        }
1✔
1458

1459
        fn default_triplet_recipes(&self) -> Vec<TripletRecipe> {
2✔
1460
            vec![default_recipe("ingestion_config_recipe")]
2✔
1461
        }
2✔
1462
    }
1463

1464
    fn fixture_record(
25✔
1465
        source: &str,
25✔
1466
        id_suffix: &str,
25✔
1467
        day: u32,
25✔
1468
        title: &str,
25✔
1469
        body: &str,
25✔
1470
    ) -> DataRecord {
25✔
1471
        let now = Utc.with_ymd_and_hms(2025, 1, day, 12, 0, 0).unwrap();
25✔
1472
        DataRecord {
25✔
1473
            id: format!("{source}::{id_suffix}"),
25✔
1474
            source: source.to_string(),
25✔
1475
            created_at: now,
25✔
1476
            updated_at: now,
25✔
1477
            quality: QualityScore { trust: 1.0 },
25✔
1478
            taxonomy: Vec::new(),
25✔
1479
            sections: vec![
25✔
1480
                make_section(SectionRole::Anchor, Some("title"), title),
25✔
1481
                make_section(SectionRole::Context, Some("body"), body),
25✔
1482
            ],
25✔
1483
            meta_prefix: None,
25✔
1484
        }
25✔
1485
    }
25✔
1486

1487
    fn default_recipe(name: &str) -> TripletRecipe {
24✔
1488
        TripletRecipe {
24✔
1489
            name: name.to_string().into(),
24✔
1490
            anchor: crate::config::Selector::Role(SectionRole::Anchor),
24✔
1491
            positive_selector: crate::config::Selector::Role(SectionRole::Context),
24✔
1492
            negative_selector: crate::config::Selector::Role(SectionRole::Context),
24✔
1493
            negative_strategy: crate::config::NegativeStrategy::WrongArticle,
24✔
1494
            weight: 1.0,
24✔
1495
            instruction: None,
24✔
1496
            allow_same_anchor_positive: false,
24✔
1497
        }
24✔
1498
    }
24✔
1499

1500
    /// The numeric flag parsers accept positive integers and reject zero or
    /// garbage; the split-ratio parser enforces arity, sum, and sign.
    #[test]
    fn parse_helpers_validate_inputs() {
        // Positive-integer flags.
        assert_eq!(parse_batch_size("2"), Ok(2));
        for rejected in ["0", "abc"] {
            assert!(parse_batch_size(rejected).is_err());
        }
        assert_eq!(parse_ingestion_max_records("16"), Ok(16));
        assert!(parse_ingestion_max_records("0").is_err());
        assert!(parse_batch_count("0").is_err());

        // Split ratios.
        let split = parse_split_ratios_arg("0.8,0.1,0.1").unwrap();
        assert!((split.train - 0.8).abs() < 1e-6);
        for rejected in ["0.8,0.1", "1.0,0.0,0.1", "-0.1,0.6,0.5"] {
            assert!(parse_split_ratios_arg(rejected).is_err());
        }
    }
1✔
1515

1516
    /// Exercises every `DataSource` method on `FixtureSource` and
    /// `IngestionConfigSource`, including the config-mismatch failure.
    #[test]
    fn fixture_and_ingestion_sources_trait_methods_cover_paths() {
        let default_cfg = SamplerConfig::default();
        let records = vec![fixture_record("fixture_source", "r1", 1, "Title", "Body")];

        let fixture = FixtureSource {
            id: String::from("fixture_source"),
            records: records.clone(),
            recipes: vec![default_recipe("fixture_recipe")],
        };
        let snapshot = fixture
            .refresh(&default_cfg, None, None)
            .expect("fixture refresh should succeed");
        assert_eq!(snapshot.records.len(), 1);
        let reported = fixture.reported_record_count(&default_cfg).unwrap();
        assert_eq!(reported, 1);
        assert_eq!(fixture.default_triplet_recipes().len(), 1);

        let source = IngestionConfigSource {
            expected_ingestion_max_records: 7,
            records,
        };
        let config_with_limit = |limit: usize| SamplerConfig {
            ingestion_max_records: limit,
            ..SamplerConfig::default()
        };

        // Matching limit: every method succeeds.
        let ok_cfg = config_with_limit(7);
        assert!(source.refresh(&ok_cfg, None, None).is_ok());
        assert_eq!(source.reported_record_count(&ok_cfg).unwrap(), 1);
        assert_eq!(source.default_triplet_recipes().len(), 1);

        // Mismatched limit: refresh is rejected.
        let bad_cfg = config_with_limit(8);
        let err = source.refresh(&bad_cfg, None, None).unwrap_err();
        assert!(matches!(err, SamplerError::SourceInconsistent { .. }));
    }
1✔
1557

1558
    /// The balancing weight is the source baseline divided by the largest
    /// baseline, and is zero when either input is zero.
    #[test]
    fn suggested_balancing_weight_is_longest_normalized_and_bounded() {
        let cases: [(u128, u128, f32); 3] = [(100, 100, 1.0), (400, 100, 0.25), (400, 400, 1.0)];
        for (max_baseline, source_baseline, expected) in cases {
            let weight = suggested_balancing_weight(max_baseline, source_baseline);
            assert!((weight - expected).abs() < 1e-6);
        }

        assert_eq!(suggested_balancing_weight(0, 100), 0.0);
        assert_eq!(suggested_balancing_weight(100, 0), 0.0);
    }
1✔
1566

1567
    /// Checks `suggested_oversampling_weight` on equal, unequal, and zero-valued argument pairs.
    #[test]
    fn suggested_oversampling_weight_is_inverse_in_unit_interval() {
        let tolerance = 1e-6;

        // Equal arguments give a weight of 1.0.
        let equal = suggested_oversampling_weight(100, 100);
        assert!((equal - 1.0).abs() < tolerance);

        // 100 against 400 gives 0.25; 100 against 1000 gives 0.1.
        let quarter = suggested_oversampling_weight(100, 400);
        assert!((quarter - 0.25).abs() < tolerance);
        let tenth = suggested_oversampling_weight(100, 1000);
        assert!((tenth - 0.1).abs() < tolerance);

        // A zero on either side gives exactly 0.0.
        let zero_first = suggested_oversampling_weight(0, 100);
        assert_eq!(zero_first, 0.0);
        let zero_second = suggested_oversampling_weight(100, 0);
        assert_eq!(zero_second, 0.0);
    }
1✔
1575

1576
    /// `parse_cli` yields `Ok(None)` for `--help` and an error for an unrecognized flag.
    #[test]
    fn parse_cli_handles_help_and_invalid_args() {
        let help_args = ["estimate_capacity", "--help"];
        let parsed = parse_cli::<EstimateCapacityCli, _>(help_args).unwrap();
        assert!(parsed.is_none());

        let unknown_args = ["estimate_capacity", "--unknown"];
        let rejected = parse_cli::<EstimateCapacityCli, _>(unknown_args);
        assert!(rejected.is_err());
    }
1✔
1584

1585
    /// Runs the estimator with no CLI args over one source reporting 12 records and expects `Ok`.
    #[test]
    fn run_estimate_capacity_succeeds_with_reported_counts() {
        let no_args = std::iter::empty::<String>();

        let outcome = run_estimate_capacity(
            no_args,
            |roots| {
                // With no CLI args the root resolver is handed an empty collection.
                assert!(roots.is_empty());
                Ok(())
            },
            |_| {
                let source = TestSource {
                    id: "source_a".into(),
                    count: Some(12),
                    recipes: vec![default_recipe("r1")],
                };
                vec![Box::new(source) as DynSource]
            },
        );

        assert!(outcome.is_ok());
    }
1✔
1604

1605
    /// A source built with `count: None` makes the estimator fail with a record-count message.
    #[test]
    fn run_estimate_capacity_errors_when_source_count_missing() {
        let no_args = std::iter::empty::<String>();

        let outcome = run_estimate_capacity(
            no_args,
            |_| Ok(()),
            |_| {
                let source = TestSource {
                    id: "source_missing".into(),
                    count: None,
                    recipes: vec![default_recipe("r1")],
                };
                vec![Box::new(source) as DynSource]
            },
        );

        let message = outcome.unwrap_err().to_string();
        assert!(message.contains("failed to report exact record count"));
    }
1✔
1622

1623
    /// An error returned by the root resolver shows up in the estimator's error text.
    #[test]
    fn run_estimate_capacity_propagates_root_resolution_error() {
        let no_args = std::iter::empty::<String>();

        let outcome = run_estimate_capacity(
            no_args,
            |_| Err("root resolution failed".into()),
            empty_dyn_sources,
        );

        let message = outcome.unwrap_err().to_string();
        assert!(message.contains("root resolution failed"));
    }
1✔
1634

1635
    /// The estimator returns `Ok` when the source builder is `empty_dyn_sources`.
    #[test]
    fn run_estimate_capacity_allows_empty_source_list() {
        let no_args = std::iter::empty::<String>();
        let outcome = run_estimate_capacity(no_args, |_| Ok(()), empty_dyn_sources);

        assert!(outcome.is_ok());
    }
1✔
1642

1643
    /// Runs the estimator over a `ConfigRequiredSource` built with `expected_seed: 99`.
    ///
    /// NOTE(review): success presumably requires the runner to hand the source a config whose
    /// seed matches; confirm against `ConfigRequiredSource::reported_record_count`.
    #[test]
    fn run_estimate_capacity_configures_sources_centrally_before_counting() {
        let no_args = std::iter::empty::<String>();

        let outcome = run_estimate_capacity(
            no_args,
            |_| Ok(()),
            |_| {
                let source = ConfigRequiredSource {
                    id: "requires_config".into(),
                    expected_seed: 99,
                };
                vec![Box::new(source) as DynSource]
            },
        );

        assert!(outcome.is_ok());
    }
1✔
1658

1659
    /// Exercises `ConfigRequiredSource` directly: refresh, a mismatched-seed count, and recipes.
    #[test]
    fn config_required_source_refresh_and_seed_mismatch_are_exercised() {
        let source = ConfigRequiredSource {
            id: String::from("cfg-source"),
            expected_seed: 42,
        };

        // Refreshing under the default config succeeds and yields no records.
        let default_cfg = SamplerConfig::default();
        let snapshot = source.refresh(&default_cfg, None, None).unwrap();
        assert!(snapshot.records.is_empty());

        // A seed of 7 differs from the expected 42 and is reported as an inconsistency.
        let wrong_seed_cfg = SamplerConfig {
            seed: 7,
            ..SamplerConfig::default()
        };
        let count = source.reported_record_count(&wrong_seed_cfg);
        assert!(matches!(
            count,
            Err(SamplerError::SourceInconsistent { .. })
        ));

        let recipes = source.default_triplet_recipes();
        assert!(recipes.is_empty());
    }
1✔
1682

1683
    /// Runs the demo in pair, text, and default modes over a source that always yields one
    /// record, expecting `Ok` each time.
    #[test]
    fn run_multi_source_demo_exhausted_paths_return_ok() {
        /// Source whose every refresh returns the same single two-section record.
        struct OneRecordSource;

        impl DataSource for OneRecordSource {
            fn id(&self) -> &str {
                "one_record"
            }

            fn refresh(
                &self,
                _config: &SamplerConfig,
                _cursor: Option<&SourceCursor>,
                _limit: Option<usize>,
            ) -> Result<SourceSnapshot, SamplerError> {
                let timestamp = Utc::now();
                // Each section carries its text once as the body and once as the only sentence.
                let section = |role: SectionRole, heading: &str, text: &str| RecordSection {
                    role,
                    heading: Some(heading.to_string()),
                    text: text.to_string(),
                    sentences: vec![text.to_string()],
                };

                let record = DataRecord {
                    id: "one_record::r1".to_string(),
                    source: "one_record".to_string(),
                    created_at: timestamp,
                    updated_at: timestamp,
                    quality: QualityScore { trust: 1.0 },
                    taxonomy: Vec::new(),
                    sections: vec![
                        section(SectionRole::Anchor, "title", "anchor"),
                        section(SectionRole::Context, "body", "context"),
                    ],
                    meta_prefix: None,
                };
                let cursor = SourceCursor {
                    last_seen: timestamp,
                    revision: 0,
                };

                Ok(SourceSnapshot {
                    records: vec![record],
                    cursor,
                })
            }

            fn reported_record_count(&self, _config: &SamplerConfig) -> Result<u128, SamplerError> {
                Ok(1)
            }

            fn default_triplet_recipes(&self) -> Vec<TripletRecipe> {
                vec![default_recipe("single_record_recipe")]
            }
        }

        // Sanity-check the fixture itself before driving the demo with it.
        let fixture = OneRecordSource;
        let reported = fixture
            .reported_record_count(&SamplerConfig::default())
            .unwrap();
        assert_eq!(reported, 1);
        assert_eq!(fixture.default_triplet_recipes().len(), 1);

        // `None` stands for the default mode, which takes no extra flag.
        for mode in [Some("--pair-batch"), Some("--text-recipes"), None] {
            let dir = tempdir().unwrap();
            let store_file = dir.path().join("split_store.bin");
            let mut args = vec![
                String::from("--split-store-path"),
                store_file.to_string_lossy().into_owned(),
            ];
            args.extend(mode.map(String::from));

            let outcome = run_multi_source_demo(
                args.into_iter(),
                |_| Ok(()),
                |_| vec![Box::new(OneRecordSource) as DynSource],
            );
            assert!(outcome.is_ok());
        }
    }
1✔
1766

1767
    /// `MultiSourceDemoCli` parsing: `--help` yields `Ok(None)`, zero-valued `--batch-size` and
    /// `--ingestion-max-records` are rejected, and a bare invocation parses.
    #[test]
    fn parse_multi_source_cli_handles_help_and_batch_size_validation() {
        const BIN: &str = "multi_source_demo";

        let help = parse_cli::<MultiSourceDemoCli, _>([BIN, "--help"]).unwrap();
        assert!(help.is_none());

        let zero_batch = parse_cli::<MultiSourceDemoCli, _>([BIN, "--batch-size", "0"]);
        assert!(zero_batch.is_err());

        let zero_ingestion_args = [BIN, "--ingestion-max-records", "0"];
        let zero_ingestion = parse_cli::<MultiSourceDemoCli, _>(zero_ingestion_args);
        assert!(zero_ingestion.is_err());

        let bare = parse_cli::<MultiSourceDemoCli, _>([BIN]);
        assert!(bare.is_ok());
    }
1✔
1785

1786
    /// Both example-app runners return an error when given only an unknown flag.
    #[test]
    fn run_example_apps_invalid_cli_args_return_errors() {
        // Fresh single-element argument iterator for each runner.
        let unknown_flag = || [String::from("--unknown")].into_iter();

        let estimate = run_estimate_capacity(unknown_flag(), ok_unit_roots, empty_dyn_sources);
        assert!(estimate.is_err());

        let demo = run_multi_source_demo(unknown_flag(), ok_unit_roots, empty_dyn_sources);
        assert!(demo.is_err());
    }
1✔
1802

1803
    /// Calls the root-resolver helpers and the non-refresh methods of `ErrorRefreshSource`.
    #[test]
    fn helper_and_error_refresh_source_methods_are_exercised() {
        let ok_roots = ok_unit_roots(Vec::new());
        assert!(ok_roots.is_ok());
        let failing_roots = error_unit_roots(Vec::new());
        assert!(failing_roots.is_err());

        let source = ErrorRefreshSource {
            id: String::from("error_refresh_source"),
        };
        let reported = source
            .reported_record_count(&SamplerConfig::default())
            .unwrap();
        assert_eq!(reported, 1);

        let recipes = source.default_triplet_recipes();
        assert_eq!(recipes.len(), 1);
    }
1✔
1819

1820
    /// Smoke test: `print_source_summary` accepts ids both with and without a `::` delimiter.
    #[test]
    fn print_source_summary_handles_non_empty_ids() {
        print_source_summary(
            "non-empty summary",
            [
                "source_a::r1",
                "source_a::r2",
                "source_b::r1",
                "source_without_delimiter",
            ]
            .into_iter(),
        );
    }
1✔
1830

1831
    /// Runs the demo over an `ErrorRefreshSource` in pair, text, `--batches 1`, and default
    /// modes; each run is expected to return `Ok`.
    #[test]
    fn run_multi_source_demo_refresh_failures_degrade_to_exhausted_paths() {
        let mode_flags: [&[&str]; 4] = [
            &["--pair-batch"],
            &["--text-recipes"],
            &["--batches", "1"],
            &[],
        ];

        for flags in mode_flags {
            let dir = tempdir().unwrap();
            let store_file = dir.path().join("error_modes_split_store.bin");

            // Mode flags come first, followed by the per-run split-store location.
            let mut args: Vec<String> = flags.iter().copied().map(String::from).collect();
            args.extend([
                String::from("--split-store-path"),
                store_file.to_string_lossy().into_owned(),
            ]);

            let outcome = run_multi_source_demo(
                args.into_iter(),
                |_| Ok(()),
                |_| {
                    let source = ErrorRefreshSource {
                        id: "error_refresh_source".to_string(),
                    };
                    vec![Box::new(source) as DynSource]
                },
            );

            assert!(outcome.is_ok());
        }
    }
1✔
1858

1859
    /// Requests three batches from a fixture holding a single record and expects `Ok`.
    #[test]
    fn run_multi_source_demo_batches_exhausted_path_returns_ok() {
        let dir = tempdir().unwrap();
        let store_file = dir.path().join("batches_exhausted_split_store.bin");
        let args: Vec<String> = vec![
            String::from("--batches"),
            String::from("3"),
            String::from("--split-store-path"),
            store_file.to_string_lossy().into_owned(),
        ];

        let outcome = run_multi_source_demo(
            args.into_iter(),
            |_| Ok(()),
            |_| {
                let only_record = fixture_record(
                    "batches_exhausted_source",
                    "r1",
                    1,
                    "Only one record",
                    "Single record body",
                );
                let source = FixtureSource {
                    id: "batches_exhausted_source".into(),
                    records: vec![only_record],
                    recipes: vec![default_recipe("batches_exhausted_recipe")],
                };
                vec![Box::new(source) as DynSource]
            },
        );

        assert!(outcome.is_ok());
    }
1✔
1890

1891
    /// Runs the demo in its default mode over a three-record fixture and expects `Ok`.
    #[test]
    fn run_multi_source_demo_default_triplet_success_path_returns_ok() {
        let dir = tempdir().unwrap();
        let store_file = dir.path().join("default_triplet_success_split_store.bin");
        let args: Vec<String> = vec![
            String::from("--split-store-path"),
            store_file.to_string_lossy().into_owned(),
        ];

        let outcome = run_multi_source_demo(
            args.into_iter(),
            |_| Ok(()),
            |_| {
                const SOURCE_ID: &str = "default_triplet_success_source";

                // (record id, day, title, body) for each fixture record.
                let records: Vec<DataRecord> = [
                    ("r1", 1, "Title one", "Body one"),
                    ("r2", 2, "Title two", "Body two"),
                    ("r3", 3, "Title three", "Body three"),
                ]
                .into_iter()
                .map(|(id, day, title, body)| fixture_record(SOURCE_ID, id, day, title, body))
                .collect();

                let source = FixtureSource {
                    id: SOURCE_ID.into(),
                    records,
                    recipes: vec![default_recipe("default_triplet_success_recipe")],
                };
                vec![Box::new(source) as DynSource]
            },
        );

        assert!(outcome.is_ok());
    }
1✔
1936

1937
    /// Passes `--ingestion-max-records 7` on the command line and runs the demo over an
    /// `IngestionConfigSource` built to expect that same value.
    #[test]
    fn run_multi_source_demo_passes_ingestion_max_records_to_sources() {
        let dir = tempdir().unwrap();
        let store_file = dir.path().join("ingestion_config_split_store.bin");
        let max_records = 7;

        let args = vec![
            String::from("--pair-batch"),
            String::from("--ingestion-max-records"),
            max_records.to_string(),
            String::from("--split-store-path"),
            store_file.to_string_lossy().into_owned(),
        ];

        let outcome = run_multi_source_demo(
            args.into_iter(),
            |_| Ok(()),
            |_| {
                // Eight records, one per day 1..=8.
                let records: Vec<DataRecord> = (1..=8)
                    .map(|day| {
                        let id = format!("r{day}");
                        let headline = format!("Config headline {day}");
                        let body = format!("Config body {day}");
                        fixture_record("ingestion_config_source", &id, day, &headline, &body)
                    })
                    .collect();

                let source = IngestionConfigSource {
                    expected_ingestion_max_records: max_records,
                    records,
                };
                vec![Box::new(source) as DynSource]
            },
        );

        assert!(outcome.is_ok());
    }
1✔
1973

1974
    /// `parse_cli` yields `Ok(None)` when a versioned command is invoked with `--version`.
    #[test]
    fn parse_cli_handles_display_version_path() {
        /// Minimal CLI that declares a version so `--version` is accepted.
        #[derive(Debug, Parser)]
        #[command(name = "version_test", version = "1.0.0")]
        struct VersionCli {}

        let version_args = ["version_test", "--version"];
        let outcome = parse_cli::<VersionCli, _>(version_args).unwrap();
        assert!(outcome.is_none());
    }
1✔
1983

1984
    /// `--list-text-recipes` returns `Ok` for a source that has one default recipe.
    #[test]
    fn run_multi_source_demo_list_text_recipes_path_succeeds() {
        let dir = tempdir().unwrap();
        let store_file = dir.path().join("recipes_split_store.bin");

        let mut args: Vec<String> = Vec::with_capacity(3);
        args.push(String::from("--list-text-recipes"));
        args.push(String::from("--split-store-path"));
        args.push(store_file.to_string_lossy().into_owned());

        // `drain(..)` hands the runner a borrowing iterator instead of consuming the Vec.
        let outcome = run_multi_source_demo(
            args.drain(..),
            |_| Ok(()),
            |_| {
                let source = TestSource {
                    id: "source_for_recipes".into(),
                    count: Some(10),
                    recipes: vec![default_recipe("recipe_a")],
                };
                vec![Box::new(source) as DynSource]
            },
        );

        assert!(outcome.is_ok());
    }
1✔
2007

2008
    /// `--list-text-recipes` with an explicit `--split-store-path` returns `Ok` even when the
    /// source has no recipes.
    #[test]
    fn run_multi_source_demo_list_text_recipes_uses_explicit_split_store_path() {
        let dir = tempdir().unwrap();
        let store_file = dir.path().join("custom_split_store.bin");
        let args: Vec<String> = vec![
            String::from("--list-text-recipes"),
            String::from("--split-store-path"),
            store_file.to_string_lossy().into_owned(),
        ];

        let outcome = run_multi_source_demo(
            args.into_iter(),
            |_| Ok(()),
            |_| {
                let source = TestSource {
                    id: "source_without_text_recipes".into(),
                    count: Some(1),
                    recipes: Vec::new(),
                };
                vec![Box::new(source) as DynSource]
            },
        );

        assert!(outcome.is_ok());
    }
1✔
2032

2033
    /// Runs pair, text, and default modes on the validation split over a source reporting zero
    /// records; each run is expected to return `Ok`.
    #[test]
    fn run_multi_source_demo_sampling_modes_handle_empty_sources() {
        let mode_flags: [&[&str]; 3] = [&["--pair-batch"], &["--text-recipes"], &[]];

        for flags in mode_flags {
            let dir = tempdir().unwrap();
            let store_file = dir.path().join("empty_sources_split_store.bin");

            // Mode flags first, then the store location, then the split selection.
            let mut args: Vec<String> = flags.iter().copied().map(String::from).collect();
            args.extend([
                String::from("--split-store-path"),
                store_file.to_string_lossy().into_owned(),
                String::from("--split"),
                String::from("validation"),
            ]);

            let outcome = run_multi_source_demo(
                args.into_iter(),
                |_| Ok(()),
                |_| {
                    let source = TestSource {
                        id: "source_empty".into(),
                        count: Some(0),
                        recipes: vec![default_recipe("recipe_empty")],
                    };
                    vec![Box::new(source) as DynSource]
                },
            );

            assert!(outcome.is_ok());
        }
    }
1✔
2063

2064
    /// An error returned by the root resolver shows up in the demo runner's error text.
    #[test]
    fn run_multi_source_demo_propagates_root_resolution_error() {
        let dir = tempdir().unwrap();
        let store_file = dir.path().join("root_resolution_error_store.bin");
        let args = [
            String::from("--split-store-path"),
            store_file.to_string_lossy().into_owned(),
        ];

        let outcome = run_multi_source_demo(
            args.into_iter(),
            |_| Err("demo root resolution failed".into()),
            empty_dyn_sources,
        );

        let message = outcome.unwrap_err().to_string();
        assert!(message.contains("demo root resolution failed"));
    }
1✔
2081

2082
    /// `--list-text-recipes` returns `Ok` when the source builder is `empty_dyn_sources`.
    #[test]
    fn run_multi_source_demo_list_text_recipes_allows_empty_sources() {
        let dir = tempdir().unwrap();
        let store_file = dir.path().join("empty_source_list_recipes.bin");
        let args = [
            String::from("--list-text-recipes"),
            String::from("--split-store-path"),
            store_file.to_string_lossy().into_owned(),
        ];

        let outcome = run_multi_source_demo(args.into_iter(), |_| Ok(()), empty_dyn_sources);

        assert!(outcome.is_ok());
    }
1✔
2099

2100
    /// Feeds hand-built triplet, pair, and text batches plus a recipe list through the
    /// `print_*` helpers, then checks `extract_source` on ids with and without `::`.
    #[test]
    fn print_helpers_and_extract_source_cover_paths() {
        let store = DeterministicSplitStore::new(SplitRatios::default(), 42).unwrap();
        let strategy = ChunkingStrategy::default();

        // Three chunks covering both `ChunkView` variants used here.
        let anchor_chunk = RecordChunk {
            record_id: String::from("source_a::rec1"),
            section_idx: 0,
            view: ChunkView::Window {
                index: 1,
                overlap: 2,
                span: 12,
                start_ratio: 0.25,
            },
            text: String::from("anchor text"),
            tokens_estimate: 8,
            quality: crate::data::QualityScore { trust: 0.9 },
        };
        let positive_chunk = RecordChunk {
            record_id: String::from("source_a::rec2"),
            section_idx: 1,
            view: ChunkView::SummaryFallback {
                strategy: String::from("summary"),
                weight: 0.7,
            },
            text: String::from("positive text"),
            tokens_estimate: 6,
            quality: crate::data::QualityScore { trust: 0.8 },
        };
        let negative_chunk = RecordChunk {
            record_id: String::from("source_b::rec3"),
            section_idx: 2,
            view: ChunkView::Window {
                index: 0,
                overlap: 0,
                span: 16,
                start_ratio: 0.0,
            },
            text: String::from("negative text"),
            tokens_estimate: 7,
            quality: crate::data::QualityScore { trust: 0.5 },
        };

        let triplet = crate::SampleTriplet {
            recipe: String::from("triplet_recipe"),
            anchor: anchor_chunk.clone(),
            positive: positive_chunk.clone(),
            negative: negative_chunk.clone(),
            weight: 1.0,
            instruction: Some(String::from("triplet instruction")),
        };
        let triplets = TripletBatch {
            triplets: vec![triplet],
        };
        print_triplet_batch(&strategy, &triplets, &store);

        // Last use of the anchor and positive chunks, so they are moved rather than cloned.
        let pair = crate::SamplePair {
            recipe: String::from("pair_recipe"),
            anchor: anchor_chunk,
            positive: positive_chunk,
            weight: 1.0,
            instruction: None,
            label: crate::PairLabel::Positive,
            reason: Some(String::from("same topic")),
        };
        let pairs = SampleBatch { pairs: vec![pair] };
        print_pair_batch(&strategy, &pairs, &store);

        let sample = crate::TextSample {
            recipe: String::from("text_recipe"),
            chunk: negative_chunk,
            weight: 0.8,
            instruction: Some(String::from("text instruction")),
        };
        let texts = TextBatch {
            samples: vec![sample],
        };
        print_text_batch(&strategy, &texts, &store);

        let recipe = TextRecipe {
            name: "recipe_name".into(),
            selector: crate::config::Selector::Role(SectionRole::Context),
            instruction: Some("instruction".into()),
            weight: 1.0,
        };
        print_text_recipes(&vec![recipe]);

        // The source is the prefix before `::`; ids without a delimiter map to "unknown".
        let with_delimiter = extract_source("source_a::record");
        assert_eq!(with_delimiter, "source_a");
        let without_delimiter = extract_source("record-without-delimiter");
        assert_eq!(without_delimiter, "unknown");
    }
1✔
2190

2191
    /// Each `SplitArg` variant converts to the `SplitLabel` variant of the same name.
    #[test]
    fn split_arg_conversion_and_version_parse_paths_are_covered() {
        let train = SplitLabel::from(SplitArg::Train);
        assert!(matches!(train, SplitLabel::Train));

        let validation = SplitLabel::from(SplitArg::Validation);
        assert!(matches!(validation, SplitLabel::Validation));

        let test = SplitLabel::from(SplitArg::Test);
        assert!(matches!(test, SplitLabel::Test));
    }
1✔
2203

2204
    /// A non-numeric value in each position yields an error naming that position's ratio.
    #[test]
    fn parse_split_ratios_reports_per_field_parse_errors() {
        let train_error = parse_split_ratios_arg("x,0.1,0.9").unwrap_err();
        assert!(train_error.contains("invalid train ratio"));

        let validation_error = parse_split_ratios_arg("0.1,y,0.8").unwrap_err();
        assert!(validation_error.contains("invalid validation ratio"));

        let test_error = parse_split_ratios_arg("0.1,0.2,z").unwrap_err();
        assert!(test_error.contains("invalid test ratio"));
    }
1✔
2222

2223
    /// Runs pair, text, and default modes over a source with no recipes; each run is expected
    /// to return `Ok`.
    #[test]
    fn run_multi_source_demo_exhausted_paths_are_handled() {
        let mode_flags: [&[&str]; 3] = [&["--pair-batch"], &["--text-recipes"], &[]];

        for flags in mode_flags {
            let dir = tempdir().unwrap();
            let store_file = dir.path().join("exhausted_split_store.bin");

            // Mode flags come first, followed by the per-run split-store location.
            let mut args: Vec<String> = flags.iter().copied().map(String::from).collect();
            args.extend([
                String::from("--split-store-path"),
                store_file.to_string_lossy().into_owned(),
            ]);

            let outcome = run_multi_source_demo(
                args.into_iter(),
                |_| Ok(()),
                |_| {
                    let source = TestSource {
                        id: "source_without_recipes".into(),
                        count: Some(1),
                        recipes: Vec::new(),
                    };
                    vec![Box::new(source) as DynSource]
                },
            );

            assert!(outcome.is_ok());
        }
    }
1✔
2251

2252
    /// Seeds the split-store path with junk bytes, runs the demo with `--reset --pair-batch`
    /// over an eight-record fixture, and checks that the store file was replaced.
    ///
    /// The original assertions only required a non-empty file, which the seeded junk already
    /// satisfied; the contents are now also required to differ from the seeded bytes so that
    /// a reset that leaves the file untouched is caught.
    #[test]
    fn run_multi_source_demo_reset_recreates_split_store_and_samples() {
        const STALE_CONTENTS: &[u8] = b"stale-data";

        let dir = tempdir().unwrap();
        let split_store_path = dir.path().join("reset_split_store.bin");
        std::fs::write(&split_store_path, STALE_CONTENTS).unwrap();

        let args = vec![
            "--reset".to_string(),
            "--pair-batch".to_string(),
            "--split-store-path".to_string(),
            split_store_path.to_string_lossy().to_string(),
        ];

        let result = run_multi_source_demo(
            args.into_iter(),
            |_| Ok(()),
            |_| {
                let recipes = vec![default_recipe("fixture_recipe")];
                // Eight records, one per day 1..=8.
                let records: Vec<DataRecord> = (1..=8)
                    .map(|day| {
                        fixture_record(
                            "fixture_source",
                            &format!("r{day}"),
                            day,
                            &format!("Fixture headline {day}"),
                            &format!("Fixture body content for day {day}."),
                        )
                    })
                    .collect();
                vec![Box::new(FixtureSource {
                    id: "fixture_source".into(),
                    records,
                    recipes,
                }) as DynSource]
            },
        );

        assert!(result.is_ok());
        assert!(split_store_path.exists());
        let metadata = std::fs::metadata(&split_store_path).unwrap();
        assert!(metadata.len() > 0);

        // The file must no longer be the junk that was written before the run.
        let contents = std::fs::read(&split_store_path).unwrap();
        assert_ne!(contents.as_slice(), STALE_CONTENTS);
    }
1✔
2294

2295
    /// Requests two batches from a three-record fixture, expects `Ok`, and checks that the
    /// split-store file exists afterwards.
    #[test]
    fn run_multi_source_demo_batches_mode_executes_multiple_batches() {
        let dir = tempdir().unwrap();
        let store_file = dir.path().join("batches_split_store.bin");
        let args: Vec<String> = vec![
            String::from("--batches"),
            String::from("2"),
            String::from("--split-store-path"),
            store_file.to_string_lossy().into_owned(),
        ];

        let outcome = run_multi_source_demo(
            args.into_iter(),
            |_| Ok(()),
            |_| {
                const SOURCE_ID: &str = "batch_source";

                let recipes = vec![default_recipe("batch_recipe")];
                // (record id, day, headline, body) for each fixture record.
                let records: Vec<DataRecord> = [
                    (
                        "r1",
                        3,
                        "Inflation cools in latest report",
                        "Core inflation moderated compared with prior quarter.",
                    ),
                    (
                        "r2",
                        4,
                        "Labor market remains resilient",
                        "Job openings remain elevated despite slower growth.",
                    ),
                    (
                        "r3",
                        5,
                        "Manufacturing sentiment stabilizes",
                        "Survey data suggests output expectations are improving.",
                    ),
                ]
                .into_iter()
                .map(|(id, day, headline, body)| {
                    fixture_record(SOURCE_ID, id, day, headline, body)
                })
                .collect();

                let source = FixtureSource {
                    id: SOURCE_ID.into(),
                    records,
                    recipes,
                };
                vec![Box::new(source) as DynSource]
            },
        );

        assert!(outcome.is_ok());
        assert!(store_file.exists());
    }
1✔
2344

2345
    /// Checks the shape of the managed split-store path: it must end with the
    /// demo store filename, and its parent directory must end with the demo
    /// cache group.
    #[test]
    fn managed_demo_split_store_path_resolves_under_cache_group() {
        let resolved = managed_demo_split_store_path().unwrap();
        assert!(resolved.ends_with(MULTI_SOURCE_DEMO_STORE_FILENAME));

        let Some(group_dir) = resolved.parent() else {
            panic!("managed split-store path should have a parent");
        };
        let expected_group = PathBuf::from(MULTI_SOURCE_DEMO_GROUP);
        assert!(group_dir.ends_with(expected_group));
    }
1✔
2354

2355
    /// Contrasts two invocations of the demo entry point:
    ///
    /// * with no arguments and the `error_unit_roots` resolver, the call must
    ///   fail with a message containing `root-resolution-error`;
    /// * with `--help`, the call must return `Ok`.
    #[test]
    fn run_multi_source_demo_help_returns_ok_without_work() {
        let without_help = run_multi_source_demo(
            std::iter::empty::<String>(),
            error_unit_roots,
            empty_dyn_sources,
        );
        let failure = without_help.expect_err("non-help path should attempt to resolve roots");
        let message = failure.to_string();
        assert!(message.contains("root-resolution-error"));

        let help_args = [String::from("--help")];
        let with_help =
            run_multi_source_demo(help_args.into_iter(), ok_unit_roots, empty_dyn_sources);

        assert!(with_help.is_ok());
    }
1✔
2377

2378
    /// Invokes the capacity estimator with only `--help` and asserts that the
    /// call returns `Ok`.
    #[test]
    fn run_estimate_capacity_help_returns_ok_without_work() {
        let help_args = [String::from("--help")];
        let outcome =
            run_estimate_capacity(help_args.into_iter(), ok_unit_roots, empty_dyn_sources);

        assert!(outcome.is_ok());
    }
1✔
2388

2389
    /// Runs the demo with `--pair-batch` over a fixture source that holds a
    /// single record and asserts that the call still returns `Ok`.
    #[test]
    fn run_multi_source_demo_pair_exhausted_branch_returns_ok() {
        let workdir = tempdir().unwrap();
        let store_file = workdir.path().join("pair_exhausted_split_store.bin");
        let cli = vec![
            String::from("--pair-batch"),
            String::from("--split-store-path"),
            store_file.to_string_lossy().into_owned(),
        ];

        let outcome = run_multi_source_demo(
            cli.into_iter(),
            |_| Ok(()),
            |_| {
                // The source deliberately contains exactly one record.
                let only_record = fixture_record(
                    "pair_exhausted_source",
                    "r1",
                    1,
                    "Single record title",
                    "Single record body",
                );
                let source = FixtureSource {
                    id: "pair_exhausted_source".into(),
                    records: vec![only_record],
                    recipes: vec![default_recipe("pair_exhausted_recipe")],
                };
                vec![Box::new(source) as DynSource]
            },
        );

        assert!(outcome.is_ok());
    }
1✔
2419

2420
    /// Calls the demo with `--list-text-recipes` and no `--split-store-path`
    /// argument, and asserts that the call returns `Ok`.
    #[test]
    fn run_multi_source_demo_uses_managed_split_store_path_when_not_provided() {
        let cli = [String::from("--list-text-recipes")];

        let outcome = run_multi_source_demo(
            cli.into_iter(),
            |_| Ok(()),
            |_| {
                let source = TestSource {
                    id: "managed_path_source".into(),
                    count: Some(2),
                    recipes: vec![default_recipe("managed_recipe")],
                };
                vec![Box::new(source) as DynSource]
            },
        );

        assert!(outcome.is_ok());
    }
1✔
2436

2437
    /// Creates a directory at the split-store path, runs the demo with
    /// `--reset`, and expects an error whose message contains
    /// `failed to remove split store`.
    #[test]
    fn run_multi_source_demo_reset_errors_when_target_is_directory() {
        let workdir = tempdir().unwrap();
        let blocking_dir = workdir.path().join("split_store_dir");
        std::fs::create_dir(&blocking_dir).unwrap();

        let cli = [
            String::from("--reset"),
            String::from("--split-store-path"),
            blocking_dir.to_string_lossy().into_owned(),
        ];
        let outcome = run_multi_source_demo(cli.into_iter(), |_| Ok(()), empty_dyn_sources);

        let message = outcome.unwrap_err().to_string();
        assert!(message.contains("failed to remove split store"));
    }
1✔
2457

2458
    /// Smoke test: calls both summary printers with empty iterators. There are
    /// no assertions, so the test passes as long as neither call panics.
    #[test]
    fn print_summary_helpers_accept_empty_iterators() {
        let no_sources = std::iter::empty::<&str>();
        print_source_summary("empty summary", no_sources);

        let no_recipe_pairs = std::iter::empty::<(&str, &str)>();
        print_recipe_context_by_source("empty recipe context", no_recipe_pairs);
    }
1✔
2463

2464
    /// For the four unsorted values 1, 4, 2, 3, both the reported mean and the
    /// reported median must be within 1e-6 of 2.5.
    #[cfg(feature = "extended-metrics")]
    #[test]
    fn metric_mean_median_handles_even_length_inputs() {
        let mut samples = [1.0, 4.0, 2.0, 3.0];
        let (mean, median) = metric_mean_median(&mut samples);

        let mean_gap = (mean - 2.5).abs();
        let median_gap = (median - 2.5).abs();
        assert!(mean_gap < 1e-6);
        assert!(median_gap < 1e-6);
    }
1✔
2472

2473
    /// For the three unsorted values 3, 1, 2, both the reported mean and the
    /// reported median must be within 1e-6 of 2.0.
    #[cfg(feature = "extended-metrics")]
    #[test]
    fn metric_mean_median_handles_odd_length_inputs() {
        let mut samples = [3.0, 1.0, 2.0];
        let (mean, median) = metric_mean_median(&mut samples);

        let mean_gap = (mean - 2.0).abs();
        let median_gap = (median - 2.0).abs();
        assert!(mean_gap < 1e-6);
        assert!(median_gap < 1e-6);
    }
1✔
2481

2482
    /// Smoke test: passes a two-source metric map to `print_metric_summary`.
    /// There are no assertions, so the test passes as long as the call does
    /// not panic.
    #[cfg(feature = "extended-metrics")]
    #[test]
    fn print_metric_summary_includes_multi_source_aggregate() {
        let mut per_source = HashMap::new();
        per_source.insert(
            String::from("source_a"),
            vec![(0.9, 0.8, 0.2, 0.1), (0.8, 0.7, 0.3, 0.2)],
        );
        per_source.insert(
            String::from("source_b"),
            vec![(0.7, 0.6, 0.4, 0.3), (0.6, 0.5, 0.5, 0.4)],
        );

        print_metric_summary(&per_source);
    }
1✔
2498
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc