• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

jzombie / rust-triplets / 23533755847

25 Mar 2026 09:21AM UTC coverage: 92.101% (-2.7%) from 94.794%
23533755847

Pull #40

github

web-flow
Merge 497e6bc79 into 65addee9d
Pull Request #40: Refactor BM25 integration

2138 of 2620 new or added lines in 6 files covered. (81.6%)

55 existing lines in 1 file now uncovered.

14773 of 16040 relevant lines covered (92.1%)

137583.71 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

83.19
/src/example_apps.rs
1
// TODO: Consider extracting to a debug crate
2

3
use std::collections::HashMap;
4
use std::error::Error;
5
use std::path::PathBuf;
6
use std::sync::Arc;
7
use std::sync::Once;
8
use std::time::Instant;
9

10
use cache_manager::CacheRoot;
11
use clap::{Parser, ValueEnum, error::ErrorKind};
12

13
use crate::config::{ChunkingStrategy, SamplerConfig, TripletRecipe};
14
use crate::constants::cache::{MULTI_SOURCE_DEMO_GROUP, MULTI_SOURCE_DEMO_STORE_FILENAME};
15
use crate::data::ChunkView;
16
use crate::heuristics::{
17
    CapacityTotals, EFFECTIVE_NEGATIVES_PER_ANCHOR, EFFECTIVE_POSITIVES_PER_ANCHOR,
18
    estimate_source_split_capacity_from_counts, format_replay_factor, format_u128_with_commas,
19
    resolve_text_recipes_for_source, split_counts_for_total,
20
};
21
use crate::metrics::source_skew;
22
use crate::sampler::chunk_weight;
23
use crate::source::DataSource;
24
use crate::splits::{FileSplitStore, SplitLabel, SplitRatios, SplitStore};
25
use crate::{
26
    RecordChunk, SampleBatch, Sampler, SamplerError, SourceId, TextBatch, TextRecipe, TripletBatch,
27
    TripletSampler,
28
};
29

30
type DynSource = Box<dyn DataSource + 'static>;
31

32
fn managed_demo_split_store_path() -> Result<PathBuf, String> {
×
33
    let cache_root = CacheRoot::from_discovery()
×
34
        .map_err(|err| format!("failed discovering managed cache root: {err}"))?;
×
35
    let group = PathBuf::from(MULTI_SOURCE_DEMO_GROUP);
×
36
    let dir = cache_root.ensure_group(&group).map_err(|err| {
×
37
        format!(
×
38
            "failed creating managed demo cache group '{}': {err}",
39
            group.display()
×
40
        )
41
    })?;
×
42
    Ok(dir.join(MULTI_SOURCE_DEMO_STORE_FILENAME))
×
43
}
×
44

45
fn init_example_tracing() {
16✔
46
    static INIT: Once = Once::new();
47
    INIT.call_once(|| {
16✔
48
        let env_filter = tracing_subscriber::EnvFilter::try_from_default_env()
1✔
49
            .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("triplets=info"));
1✔
50
        let _ = tracing_subscriber::fmt()
1✔
51
            .with_env_filter(env_filter)
1✔
52
            .try_init();
1✔
53
    });
1✔
54
}
16✔
55

56
#[derive(Debug, Clone, Copy, ValueEnum)]
57
/// CLI split selector mapped onto `SplitLabel`.
// Used as the value type of `MultiSourceDemoCli::split`. Each variant converts
// one-to-one into the `SplitLabel` variant of the same name through the
// `From<SplitArg> for SplitLabel` impl in this file.
// Plain `//` comments are used on purpose: this is a derive-macro input, so
// doc-comment text is left untouched.
58
enum SplitArg {
59
    // Converts to `SplitLabel::Train`.
    Train,
60
    // Converts to `SplitLabel::Validation`.
    Validation,
61
    // Converts to `SplitLabel::Test`.
    Test,
62
}
63

64
impl From<SplitArg> for SplitLabel {
65
    fn from(value: SplitArg) -> Self {
6✔
66
        match value {
6✔
67
            SplitArg::Train => SplitLabel::Train,
1✔
68
            SplitArg::Validation => SplitLabel::Validation,
4✔
69
            SplitArg::Test => SplitLabel::Test,
1✔
70
        }
71
    }
6✔
72
}
73

74
#[derive(Debug, Parser)]
75
#[command(
76
    name = "estimate_capacity",
77
    disable_help_subcommand = true,
78
    about = "Metadata-only capacity estimation",
79
    long_about = "Estimate record, pair, triplet, and text-sample capacity using source-reported counts only (no data refresh).",
80
    after_help = "Source roots are optional and resolved in order by explicit arg, environment variables, then project defaults."
81
)]
82
/// CLI arguments for metadata-only capacity estimation.
// Parsed by `run_estimate_capacity`, which copies `seed` and `split` into a
// `SamplerConfig` and hands `source_roots` to the injected root resolver.
// Field notes below are plain `//` comments so the text fed to the derive
// macro stays exactly as written.
83
struct EstimateCapacityCli {
84
    // `--seed`: defaults to 99; copied into `SamplerConfig::seed` and echoed in
    // the report as "split seed".
    #[arg(
85
        long,
86
        default_value_t = 99,
87
        help = "Deterministic seed used for split allocation"
88
    )]
89
    seed: u64,
90
    // `--split-ratios`: parsed by `parse_split_ratios_arg`; defaults to
    // "0.8,0.1,0.1". Drives the per-split record counts in the report.
    #[arg(
91
        long = "split-ratios",
92
        value_name = "TRAIN,VALIDATION,TEST",
93
        value_parser = parse_split_ratios_arg,
94
        default_value = "0.8,0.1,0.1",
95
        help = "Comma-separated split ratios that must sum to 1.0"
96
    )]
97
    split: SplitRatios,
98
    // `--source-root`: repeatable; the collected values are passed, in order,
    // to the `resolve_roots` callback of `run_estimate_capacity`.
    #[arg(
99
        long = "source-root",
100
        value_name = "PATH",
101
        help = "Optional source root override, repeat as needed in source order"
102
    )]
103
    source_roots: Vec<String>,
104
}
105

106
#[derive(Debug, Parser)]
107
#[command(
108
    name = "multi_source_demo",
109
    disable_help_subcommand = true,
110
    about = "Run sampled batches from multiple sources",
111
    long_about = "Sample triplet, pair, or text batches from multiple sources and persist split/epoch state.",
112
    after_help = "Source roots are optional and resolved in order by explicit arg, environment variables, then project defaults."
113
)]
114
/// CLI for `multi_source_demo`.
115
///
116
/// Common usage:
117
/// - Use managed cache-group default path (no flag)
118
/// - Set an explicit file path: `--split-store-path /tmp/split_store.bin`
119
/// - Repeat `--source-root <PATH>` to override source roots in order
// Mode selection in `run_multi_source_demo` is an if/else-if chain checked in
// this order: `--pair-batch`, `--text-recipes`, `--list-text-recipes`,
// `--batches`; with none of them set a single triplet batch is sampled. When
// several mode flags are given, only the first match in that order runs.
// Field notes are plain `//` comments so the text fed to the derive macro
// stays exactly as written.
120
struct MultiSourceDemoCli {
121
    // `--text-recipes`: sample one text batch (checked after `--pair-batch`).
    #[arg(
122
        long = "text-recipes",
123
        help = "Emit a text batch instead of a triplet batch"
124
    )]
125
    show_text_samples: bool,
126
    // `--pair-batch`: sample one pair batch; checked first among the mode flags.
    #[arg(
127
        long = "pair-batch",
128
        help = "Emit a pair batch instead of a triplet batch"
129
    )]
130
    show_pair_samples: bool,
131
    // `--list-text-recipes`: print the sampler's text recipes; no batch is
    // sampled in this mode.
    #[arg(
132
        long = "list-text-recipes",
133
        help = "Print registered text recipes and exit"
134
    )]
135
    list_text_recipes: bool,
136
    // `--batch-size`: defaults to 4; validated by `parse_positive_usize`, which
    // rejects zero and non-numeric input.
    #[arg(
137
        long = "batch-size",
138
        default_value_t = 4,
139
        value_parser = parse_positive_usize,
140
        help = "Batch size used for sampling"
141
    )]
142
    batch_size: usize,
143
    // `--seed`: when absent, the seed from `SamplerConfig::default()` is kept.
    #[arg(long, help = "Optional deterministic seed override")]
144
    seed: Option<u64>,
145
    // `--split`: when absent, `run_multi_source_demo` samples `SplitLabel::Train`.
    #[arg(long, value_enum, help = "Target split to sample from")]
146
    split: Option<SplitArg>,
147
    // `--source-root`: repeatable; passed, in order, to the `resolve_roots`
    // callback of `run_multi_source_demo`.
    #[arg(
148
        long = "source-root",
149
        value_name = "PATH",
150
        help = "Optional source root override, repeat as needed in source order"
151
    )]
152
    source_roots: Vec<String>,
153
    // `--split-store-path`: when absent, the path comes from
    // `managed_demo_split_store_path()`.
    #[arg(
154
        long = "split-store-path",
155
        value_name = "SPLIT_STORE_PATH",
156
        help = "Optional explicit path for persisted split/epoch state file"
157
    )]
158
    split_store_path: Option<PathBuf>,
159
    // `--reset`: the store file is removed only if it already exists.
    #[arg(
160
        long = "reset",
161
        help = "Delete the persisted split/epoch state before sampling, restarting from epoch 0"
162
    )]
163
    reset: bool,
164
    // `--batches`: shares `parse_positive_usize` with `--batch-size`, so zero is
    // rejected here as well.
    #[arg(
165
        long = "batches",
166
        value_name = "N",
167
        value_parser = parse_positive_usize,
168
        help = "Run N triplet batches in succession, printing a timing line per batch and (with --features extended-metrics) a per-source similarity summary at the end"
169
    )]
170
    batches: Option<usize>,
171
}
172

173
#[derive(Debug, Clone)]
174
/// Source-level inventory used by capacity estimation output.
///
/// One value is built per source in `run_estimate_capacity` before any report
/// line is printed.
175
struct SourceInventory {
176
    /// Source identifier, taken from `source.id().to_string()`.
    source_id: String,
177
    /// Record count returned by `source.reported_record_count(&config)`.
    reported_records: u128,
178
    /// Recipes from the sampler config when it has any, otherwise the source's
    /// `default_triplet_recipes()`.
    triplet_recipes: Vec<TripletRecipe>,
179
}
180

181
/// Run the capacity-estimation CLI with injectable root resolution/source builders.
182
///
183
/// `build_sources` is construction-only; sampler configuration is applied
184
/// centrally by this function before any source calls.
///
/// `args_iter` supplies the CLI arguments without the program name; this
/// function prepends `"estimate_capacity"` before parsing. `resolve_roots`
/// receives the collected `--source-root` values, and its result is passed by
/// reference to `build_sources`.
///
/// On success the report is printed to stdout in this order: header,
/// `[SOURCES]`, `[PER SOURCE BREAKDOWN]`, one section per split,
/// `[ALL SPLITS TOTAL]`, then explanatory notes.
///
/// # Errors
///
/// Returns an error when CLI parsing fails, when `resolve_roots` fails, or
/// when any source fails to report its record count. Nothing of the report is
/// printed in those cases, because all source queries happen before the first
/// `println!`.
185
pub fn run_estimate_capacity<R, Resolve, Build, I>(
4✔
186
    args_iter: I,
4✔
187
    resolve_roots: Resolve,
4✔
188
    build_sources: Build,
4✔
189
) -> Result<(), Box<dyn Error>>
4✔
190
where
4✔
191
    Resolve: FnOnce(Vec<String>) -> Result<R, Box<dyn Error>>,
4✔
192
    Build: FnOnce(&R) -> Vec<DynSource>,
4✔
193
    I: Iterator<Item = String>,
4✔
194
{
195
    init_example_tracing();
4✔
196

197
    // `parse_cli` yielding `Ok(None)` ends the run successfully with no report
    // (presumably the help/version path; confirm against `parse_cli`).
    let Some(cli) = parse_cli::<EstimateCapacityCli, _>(
4✔
198
        std::iter::once("estimate_capacity".to_string()).chain(args_iter),
4✔
UNCOV
199
    )?
×
200
    else {
UNCOV
201
        return Ok(());
×
202
    };
203

204
    let roots = resolve_roots(cli.source_roots)?;
4✔
205

206
    // Only `seed` and `split` come from the CLI; every other setting keeps its
    // `SamplerConfig::default()` value.
    let config = SamplerConfig {
3✔
207
        seed: cli.seed,
3✔
208
        split: cli.split,
3✔
209
        ..SamplerConfig::default()
3✔
210
    };
3✔
211

212
    let sources = build_sources(&roots);
3✔
213

214
    // Phase 1: collect one inventory entry per source. The first source that
    // cannot report a count aborts the whole run.
    let mut inventories = Vec::new();
3✔
215
    for source in &sources {
3✔
216
        // Recipes configured on the sampler take precedence over the source's
        // own defaults.
        let recipes = if config.recipes.is_empty() {
3✔
217
            source.default_triplet_recipes()
3✔
218
        } else {
UNCOV
219
            config.recipes.clone()
×
220
        };
221
        let reported_records = source.reported_record_count(&config).map_err(|err| {
3✔
222
            format!(
1✔
223
                "source '{}' failed to report exact record count: {err}",
224
                source.id()
1✔
225
            )
226
        })?;
1✔
227
        inventories.push(SourceInventory {
2✔
228
            source_id: source.id().to_string(),
2✔
229
            reported_records,
2✔
230
            triplet_recipes: recipes,
2✔
231
        });
2✔
232
    }
233

234
    // Phase 2: divide each source's record count across the splits, keeping both
    // the per-(source, split) counts and the per-split sums over all sources.
    let mut per_source_split_counts: HashMap<(String, SplitLabel), u128> = HashMap::new();
2✔
235
    let mut split_record_counts: HashMap<SplitLabel, u128> = HashMap::new();
2✔
236

237
    for source in &inventories {
2✔
238
        let counts = split_counts_for_total(source.reported_records, cli.split);
2✔
239
        for (label, count) in counts {
6✔
240
            per_source_split_counts.insert((source.source_id.clone(), label), count);
6✔
241
            *split_record_counts.entry(label).or_insert(0) += count;
6✔
242
        }
6✔
243
    }
244

245
    // Phase 3: estimate capacity for every (source, split) pair and accumulate
    // the per-split totals.
    let mut totals_by_split: HashMap<SplitLabel, CapacityTotals> = HashMap::new();
2✔
246
    let mut totals_by_source_and_split: HashMap<(String, SplitLabel), CapacityTotals> =
2✔
247
        HashMap::new();
2✔
248

249
    for split_label in [SplitLabel::Train, SplitLabel::Validation, SplitLabel::Test] {
6✔
250
        let mut totals = CapacityTotals::default();
6✔
251

252
        for source in &inventories {
6✔
253
            let source_split_records = per_source_split_counts
6✔
254
                .get(&(source.source_id.clone(), split_label))
6✔
255
                .copied()
6✔
256
                .unwrap_or(0);
6✔
257

6✔
258
            let triplet_recipes = &source.triplet_recipes;
6✔
259
            let text_recipes = resolve_text_recipes_for_source(&config, triplet_recipes);
6✔
260

6✔
261
            let capacity = estimate_source_split_capacity_from_counts(
6✔
262
                source_split_records,
6✔
263
                triplet_recipes,
6✔
264
                &text_recipes,
6✔
265
            );
6✔
266

6✔
267
            totals_by_source_and_split.insert((source.source_id.clone(), split_label), capacity);
6✔
268

6✔
269
            totals.triplets += capacity.triplets;
6✔
270
            totals.effective_triplets += capacity.effective_triplets;
6✔
271
            totals.pairs += capacity.pairs;
6✔
272
            totals.text_samples += capacity.text_samples;
6✔
273
        }
6✔
274

275
        totals_by_split.insert(split_label, totals);
6✔
276
    }
277

278
    // Smallest non-zero record count per split across all sources (0 when no
    // source has records in that split); feeds the small-source-boost weight.
    let min_nonzero_records_by_split: HashMap<SplitLabel, u128> =
2✔
279
        [SplitLabel::Train, SplitLabel::Validation, SplitLabel::Test]
2✔
280
            .into_iter()
2✔
281
            .map(|split_label| {
6✔
282
                let min_nonzero = inventories
6✔
283
                    .iter()
6✔
284
                    .filter_map(|source| {
6✔
285
                        per_source_split_counts
6✔
286
                            .get(&(source.source_id.clone(), split_label))
6✔
287
                            .copied()
6✔
288
                    })
6✔
289
                    .filter(|&records| records > 0)
6✔
290
                    .min()
6✔
291
                    .unwrap_or(0);
6✔
292
                (split_label, min_nonzero)
6✔
293
            })
6✔
294
            .collect();
2✔
295

296
    // Same idea over whole-source totals rather than per-split counts.
    let min_nonzero_records_all_splits = inventories
2✔
297
        .iter()
2✔
298
        .map(|source| source.reported_records)
2✔
299
        .filter(|&records| records > 0)
2✔
300
        .min()
2✔
301
        .unwrap_or(0);
2✔
302

303
    // Phase 4: report. Everything below only prints values computed above.
    println!("=== capacity estimate (length-only) ===");
2✔
304
    println!("mode: metadata-only (no source.refresh calls)");
2✔
305
    println!("classification: heuristic approximation (not exact)");
2✔
306
    println!("split seed: {}", cli.seed);
2✔
307
    println!(
2✔
308
        "split ratios: train={:.4}, validation={:.4}, test={:.4}",
309
        cli.split.train, cli.split.validation, cli.split.test
310
    );
311
    println!();
2✔
312

313
    println!("[SOURCES]");
2✔
314
    for source in &inventories {
2✔
315
        println!(
2✔
316
            "  {} => reported records: {}",
2✔
317
            source.source_id,
2✔
318
            format_u128_with_commas(source.reported_records)
2✔
319
        );
2✔
320
    }
2✔
321
    println!();
2✔
322

323
    println!("[PER SOURCE BREAKDOWN]");
2✔
324
    for source in &inventories {
2✔
325
        println!("  {}", source.source_id);
2✔
326
        let mut source_grand = CapacityTotals::default();
2✔
327
        // Sum of this source's per-split counts (saturating), not
        // `reported_records` itself.
        let mut source_total_records = 0u128;
2✔
328
        for split_label in [SplitLabel::Train, SplitLabel::Validation, SplitLabel::Test] {
6✔
329
            let split_records = per_source_split_counts
6✔
330
                .get(&(source.source_id.clone(), split_label))
6✔
331
                .copied()
6✔
332
                .unwrap_or(0);
6✔
333
            source_total_records = source_total_records.saturating_add(split_records);
6✔
334
            // Largest record count any source has in this split; the baseline
            // for the replay factor and the proportional-size weight.
            let split_longest_records = inventories
6✔
335
                .iter()
6✔
336
                .map(|candidate| {
6✔
337
                    per_source_split_counts
6✔
338
                        .get(&(candidate.source_id.clone(), split_label))
6✔
339
                        .copied()
6✔
340
                        .unwrap_or(0)
6✔
341
                })
6✔
342
                .max()
6✔
343
                .unwrap_or(0);
6✔
344
            let totals = totals_by_source_and_split
6✔
345
                .get(&(source.source_id.clone(), split_label))
6✔
346
                .copied()
6✔
347
                .unwrap_or_default();
6✔
348
            source_grand.triplets += totals.triplets;
6✔
349
            source_grand.effective_triplets += totals.effective_triplets;
6✔
350
            source_grand.pairs += totals.pairs;
6✔
351
            source_grand.text_samples += totals.text_samples;
6✔
352
            println!("    [{:?}]", split_label);
6✔
353
            println!("      records: {}", format_u128_with_commas(split_records));
6✔
354
            println!(
6✔
355
                "      triplet combinations: {}",
356
                format_u128_with_commas(totals.triplets)
6✔
357
            );
358
            println!(
6✔
359
                "      effective sampled triplets (p={}, k={}): {}",
360
                EFFECTIVE_POSITIVES_PER_ANCHOR,
361
                EFFECTIVE_NEGATIVES_PER_ANCHOR,
362
                format_u128_with_commas(totals.effective_triplets)
6✔
363
            );
364
            println!(
6✔
365
                "      pair combinations:    {}",
366
                format_u128_with_commas(totals.pairs)
6✔
367
            );
368
            println!(
6✔
369
                "      text samples:         {}",
370
                format_u128_with_commas(totals.text_samples)
6✔
371
            );
372
            println!(
6✔
373
                "      replay factor vs longest source: {}",
374
                format_replay_factor(split_longest_records, split_records)
6✔
375
            );
376
            println!(
6✔
377
                "      suggested proportional-size batch weight (0-1): {:.4}",
378
                suggested_balancing_weight(split_longest_records, split_records)
6✔
379
            );
380
            let split_smallest_nonzero = min_nonzero_records_by_split
6✔
381
                .get(&split_label)
6✔
382
                .copied()
6✔
383
                .unwrap_or(0);
6✔
384
            println!(
6✔
385
                "      suggested small-source-boost batch weight (0-1): {:.4}",
386
                suggested_oversampling_weight(split_smallest_nonzero, split_records)
6✔
387
            );
388
            println!();
6✔
389
        }
390
        // Largest whole-source record count; the value does not depend on the
        // current `source`, it is simply recomputed on each iteration.
        let longest_source_total = inventories
2✔
391
            .iter()
2✔
392
            .map(|candidate| candidate.reported_records)
2✔
393
            .max()
2✔
394
            .unwrap_or(0);
2✔
395
        println!("    [ALL SPLITS FOR SOURCE]");
2✔
396
        println!(
2✔
397
            "      triplet combinations: {}",
398
            format_u128_with_commas(source_grand.triplets)
2✔
399
        );
400
        println!(
2✔
401
            "      effective sampled triplets (p={}, k={}): {}",
402
            EFFECTIVE_POSITIVES_PER_ANCHOR,
403
            EFFECTIVE_NEGATIVES_PER_ANCHOR,
404
            format_u128_with_commas(source_grand.effective_triplets)
2✔
405
        );
406
        println!(
2✔
407
            "      pair combinations:    {}",
408
            format_u128_with_commas(source_grand.pairs)
2✔
409
        );
410
        println!(
2✔
411
            "      text samples:         {}",
412
            format_u128_with_commas(source_grand.text_samples)
2✔
413
        );
414
        println!(
2✔
415
            "      replay factor vs longest source: {}",
416
            format_replay_factor(longest_source_total, source_total_records)
2✔
417
        );
418
        println!(
2✔
419
            "      suggested proportional-size batch weight (0-1): {:.4}",
420
            suggested_balancing_weight(longest_source_total, source_total_records)
2✔
421
        );
422
        println!(
2✔
423
            "      suggested small-source-boost batch weight (0-1): {:.4}",
424
            suggested_oversampling_weight(min_nonzero_records_all_splits, source_total_records)
2✔
425
        );
426
        println!();
2✔
427
    }
428

429
    // Per-split sections across all sources, accumulating the grand total as
    // each split is printed.
    let mut grand = CapacityTotals::default();
2✔
430
    for split_label in [SplitLabel::Train, SplitLabel::Validation, SplitLabel::Test] {
6✔
431
        let record_count = split_record_counts.get(&split_label).copied().unwrap_or(0);
6✔
432
        let totals = totals_by_split
6✔
433
            .get(&split_label)
6✔
434
            .copied()
6✔
435
            .unwrap_or_default();
6✔
436

6✔
437
        grand.triplets += totals.triplets;
6✔
438
        grand.effective_triplets += totals.effective_triplets;
6✔
439
        grand.pairs += totals.pairs;
6✔
440
        grand.text_samples += totals.text_samples;
6✔
441

6✔
442
        println!("[{:?}]", split_label);
6✔
443
        println!("  records: {}", format_u128_with_commas(record_count));
6✔
444
        println!(
6✔
445
            "  triplet combinations: {}",
6✔
446
            format_u128_with_commas(totals.triplets)
6✔
447
        );
6✔
448
        println!(
6✔
449
            "  effective sampled triplets (p={}, k={}): {}",
6✔
450
            EFFECTIVE_POSITIVES_PER_ANCHOR,
6✔
451
            EFFECTIVE_NEGATIVES_PER_ANCHOR,
6✔
452
            format_u128_with_commas(totals.effective_triplets)
6✔
453
        );
6✔
454
        println!(
6✔
455
            "  pair combinations:    {}",
6✔
456
            format_u128_with_commas(totals.pairs)
6✔
457
        );
6✔
458
        println!(
6✔
459
            "  text samples:         {}",
6✔
460
            format_u128_with_commas(totals.text_samples)
6✔
461
        );
6✔
462
        println!();
6✔
463
    }
6✔
464

465
    println!("[ALL SPLITS TOTAL]");
2✔
466
    println!(
2✔
467
        "  triplet combinations: {}",
468
        format_u128_with_commas(grand.triplets)
2✔
469
    );
470
    println!(
2✔
471
        "  effective sampled triplets (p={}, k={}): {}",
472
        EFFECTIVE_POSITIVES_PER_ANCHOR,
473
        EFFECTIVE_NEGATIVES_PER_ANCHOR,
474
        format_u128_with_commas(grand.effective_triplets)
2✔
475
    );
476
    println!(
2✔
477
        "  pair combinations:    {}",
478
        format_u128_with_commas(grand.pairs)
2✔
479
    );
480
    println!(
2✔
481
        "  text samples:         {}",
482
        format_u128_with_commas(grand.text_samples)
2✔
483
    );
484
    println!();
2✔
485
    // Fixed explanatory footer; the wording is part of the CLI output.
    println!(
2✔
486
        "Note: counts are heuristic, length-based estimates from source-reported totals and recipe structure. They are approximate, not exact, and assume anchor-positive pairs=records (one positive per anchor by default), negatives=source_records_in_split-1 (anchor excluded as its own negative), and at most one chunk/window realization per sample. In real-world chunked sampling, practical combinations are often higher, so treat this as a floor-like baseline."
487
    );
488
    println!();
2✔
489
    println!(
2✔
490
        "Effective sampled triplets apply a bounded training assumption: effective_triplets = records * p * k per triplet recipe, with defaults p={} positives per anchor and k={} negatives per anchor.",
491
        EFFECTIVE_POSITIVES_PER_ANCHOR, EFFECTIVE_NEGATIVES_PER_ANCHOR
492
    );
493
    println!();
2✔
494
    println!(
2✔
495
        "Oversample loops are not inferred from this static report. To measure true oversampling (how many times sampling loops through the combination space), use observed sampled draw counts from an actual run."
496
    );
497
    println!();
2✔
498
    println!(
2✔
499
        "Suggested proportional-size batch weight (0-1) is source/max_source by record count: 1.0 for the largest source in scope, smaller values for smaller sources."
500
    );
501
    println!();
2✔
502
    println!(
2✔
503
        "Suggested small-source-boost batch weight (0-1) is min_nonzero_source/source by record count: 1.0 for the smallest non-zero source in scope, smaller values for larger sources."
504
    );
505
    println!();
2✔
506
    println!(
2✔
507
        "When passed to next_*_batch_with_weights, higher weight means that source is sampled more often relative to lower-weight sources."
508
    );
509

510
    Ok(())
2✔
511
}
4✔
512

513
/// Run the multi-source demo CLI with injectable root resolution/source builders.
514
///
515
/// `build_sources` is construction-only. Source sampler configuration is owned
516
/// by sampler registration (`TripletSampler::register_source`).
///
/// `args_iter` supplies the CLI arguments without the program name; this
/// function prepends `"multi_source_demo"` before parsing. Exactly one mode
/// runs per call, chosen in this order: pair batch, text batch, list text
/// recipes, multi-batch benchmark (`--batches`), single triplet batch.
///
/// # Errors
///
/// Returns an error when CLI parsing, root resolution, split-store path
/// resolution, `--reset` file removal, opening the split store, saving
/// sampler state, or sampling (other than `SamplerError::Exhausted`) fails.
/// `SamplerError::Exhausted` is reported on the console and the function still
/// returns `Ok(())`.
517
pub fn run_multi_source_demo<R, Resolve, Build, I>(
12✔
518
    args_iter: I,
12✔
519
    resolve_roots: Resolve,
12✔
520
    build_sources: Build,
12✔
521
) -> Result<(), Box<dyn Error>>
12✔
522
where
12✔
523
    Resolve: FnOnce(Vec<String>) -> Result<R, Box<dyn Error>>,
12✔
524
    Build: FnOnce(&R) -> Vec<DynSource>,
12✔
525
    I: Iterator<Item = String>,
12✔
526
{
527
    init_example_tracing();
12✔
528

529
    // `parse_cli` yielding `Ok(None)` ends the run successfully without sampling
    // (presumably the help/version path; confirm against `parse_cli`).
    let Some(cli) = parse_cli::<MultiSourceDemoCli, _>(
12✔
530
        std::iter::once("multi_source_demo".to_string()).chain(args_iter),
12✔
UNCOV
531
    )?
×
532
    else {
UNCOV
533
        return Ok(());
×
534
    };
535

536
    let roots = resolve_roots(cli.source_roots)?;
12✔
537

538
    // Start from the default sampler config and overlay the CLI choices. The
    // sampler is restricted to the single selected split (Train when `--split`
    // is not given).
    let mut config = SamplerConfig::default();
11✔
539
    config.seed = cli.seed.unwrap_or(config.seed);
11✔
540
    config.batch_size = cli.batch_size;
11✔
541
    config.chunking = Default::default();
11✔
542
    let selected_split = cli.split.map(Into::into).unwrap_or(SplitLabel::Train);
11✔
543
    config.split = SplitRatios::default();
11✔
544
    config.allowed_splits = vec![selected_split];
11✔
545
    // `config` is moved into the sampler below, so the pieces still needed for
    // printing are copied out first.
    let chunking = config.chunking.clone();
11✔
546
    let config_snapshot = MultiSourceDemoConfigSnapshot {
11✔
547
        seed: config.seed,
11✔
548
        batch_size: config.batch_size,
11✔
549
        ingestion_max_records: config.ingestion_max_records,
11✔
550
        split: selected_split,
11✔
551
        split_ratios: config.split,
11✔
552
        max_window_tokens: config.chunking.max_window_tokens,
11✔
553
        overlap_tokens: config.chunking.overlap_tokens.clone(),
11✔
554
        summary_fallback_tokens: config.chunking.summary_fallback_tokens,
11✔
555
    };
11✔
556

557
    // An explicit `--split-store-path` wins; otherwise fall back to the managed
    // cache location.
    let split_store_path = if let Some(path) = cli.split_store_path {
11✔
558
        path
11✔
559
    } else {
560
        managed_demo_split_store_path().map_err(|err| {
×
561
            Box::<dyn Error>::from(format!("failed to resolve demo split-store path: {err}"))
×
562
        })?
×
563
    };
564

565
    // `--reset` removes the store file only when it already exists; a missing
    // file is not an error.
    if cli.reset && split_store_path.exists() {
11✔
NEW
UNCOV
566
        std::fs::remove_file(&split_store_path).map_err(|err| {
×
NEW
UNCOV
567
            Box::<dyn Error>::from(format!(
×
NEW
568
                "failed to remove split store '{}': {err}",
×
NEW
569
                split_store_path.display()
×
NEW
570
            ))
×
NEW
UNCOV
571
        })?;
×
NEW
UNCOV
572
        println!("Reset: removed {}", split_store_path.display());
×
573
    }
11✔
574
    println!(
11✔
575
        "Persisting split assignments and epoch state to {}",
576
        split_store_path.display()
11✔
577
    );
578
    let sources = build_sources(&roots);
11✔
579
    // NOTE(review): the literal 99 is passed as the third argument regardless of
    // `config.seed` / `--seed`; confirm against `FileSplitStore::open` that a
    // fixed value is intended here.
    let split_store = Arc::new(FileSplitStore::open(&split_store_path, config.split, 99)?);
11✔
580
    let sampler = TripletSampler::new(config, split_store.clone());
11✔
581
    for source in sources {
11✔
582
        sampler.register_source(source);
11✔
583
    }
11✔
584

585
    // Mode dispatch. Sampler state is saved only after a batch was obtained
    // (the `Ok` arms); an exhausted sampler prints a hint and skips the save.
    if cli.show_pair_samples {
11✔
586
        match sampler.next_pair_batch(selected_split) {
3✔
UNCOV
587
            Ok(pair_batch) => {
×
UNCOV
588
                if pair_batch.pairs.is_empty() {
×
UNCOV
589
                    println!("Pair sampling produced no results.");
×
UNCOV
590
                } else {
×
UNCOV
591
                    print_pair_batch(&chunking, &pair_batch, split_store.as_ref());
×
UNCOV
592
                }
×
UNCOV
593
                sampler.save_sampler_state(None)?;
×
594
            }
595
            Err(SamplerError::Exhausted(name)) => {
3✔
596
                eprintln!(
3✔
597
                    "Pair sampler exhausted recipe '{}'. Ensure both positive and negative examples exist.",
3✔
598
                    name
3✔
599
                );
3✔
600
            }
3✔
601
            Err(err) => return Err(err.into()),
×
602
        }
603
    } else if cli.show_text_samples {
8✔
604
        match sampler.next_text_batch(selected_split) {
3✔
605
            Ok(text_batch) => {
1✔
606
                if text_batch.samples.is_empty() {
1✔
UNCOV
607
                    println!(
×
UNCOV
608
                        "Text sampling produced no results. Ensure each source has eligible sections."
×
609
                    );
×
610
                } else {
1✔
611
                    print_text_batch(&chunking, &text_batch, split_store.as_ref());
1✔
612
                }
1✔
613
                sampler.save_sampler_state(None)?;
1✔
614
            }
615
            Err(SamplerError::Exhausted(name)) => {
2✔
616
                eprintln!(
2✔
617
                    "Text sampler exhausted selector '{}'. Ensure matching sections exist.",
2✔
618
                    name
2✔
619
                );
2✔
620
            }
2✔
UNCOV
621
            Err(err) => return Err(err.into()),
×
622
        }
623
    } else if cli.list_text_recipes {
5✔
624
        // Listing mode: no batch is sampled and no sampler state is saved.
        let recipes = sampler.text_recipes();
2✔
625
        if recipes.is_empty() {
2✔
626
            println!(
1✔
627
                "No text recipes registered. Ensure your sources expose triplet selectors or configure text_recipes explicitly."
1✔
628
            );
1✔
629
        } else {
1✔
630
            print_text_recipes(&recipes);
1✔
631
        }
1✔
632
    } else if let Some(batch_count) = cli.batches {
3✔
NEW
UNCOV
633
        // Benchmark mode: prints the config, then one timing line per batch.
        print_demo_config(&config_snapshot);
×
NEW
UNCOV
634
        println!("=== benchmark: {} triplet batches ===", batch_count);
×
635

636
        // source_id -> Vec<(pos_jaccard, pos_cosine, neg_jaccard, neg_cosine)>
637
        #[cfg(feature = "extended-metrics")]
NEW
UNCOV
638
        let mut source_metrics: HashMap<String, Vec<(f32, f32, f32, f32)>> = HashMap::new();
×
639

NEW
640
        for i in 0..batch_count {
×
NEW
641
            // The timer covers only the `next_triplet_batch` call, not printing
            // or metric collection.
            let t0 = Instant::now();
×
NEW
642
            match sampler.next_triplet_batch(selected_split) {
×
NEW
643
                Ok(batch) => {
×
NEW
644
                    let elapsed = t0.elapsed();
×
NEW
645
                    let n = batch.triplets.len();
×
NEW
646
                    println!(
×
647
                        "batch {:>4}  triplets={:<4}  elapsed={:>8.2}ms  per_triplet={:.2}ms",
NEW
648
                        i + 1,
×
649
                        n,
NEW
650
                        elapsed.as_secs_f64() * 1000.0,
×
NEW
651
                        // Report 0.0 for an empty batch instead of dividing by zero.
                        if n > 0 {
×
NEW
652
                            elapsed.as_secs_f64() * 1000.0 / n as f64
×
653
                        } else {
NEW
654
                            0.0
×
655
                        },
656
                    );
657
                    #[cfg(feature = "extended-metrics")]
658
                    {
659
                        use crate::metrics::lexical_similarity_scores;
NEW
660
                        for triplet in &batch.triplets {
×
NEW
661
                            let (pj, pc) = lexical_similarity_scores(
×
NEW
662
                                &triplet.anchor.text,
×
NEW
663
                                &triplet.positive.text,
×
NEW
664
                            );
×
NEW
665
                            let (nj, nc) = lexical_similarity_scores(
×
NEW
666
                                &triplet.anchor.text,
×
NEW
667
                                &triplet.negative.text,
×
NEW
668
                            );
×
NEW
669
                            // Scores are grouped by the source derived from the
                            // anchor's record id.
                            let source = extract_source(&triplet.anchor.record_id);
×
NEW
670
                            source_metrics
×
NEW
671
                                .entry(source)
×
NEW
672
                                .or_default()
×
NEW
673
                                .push((pj, pc, nj, nc));
×
NEW
674
                        }
×
675
                    }
676
                }
NEW
677
                // Exhaustion ends the benchmark loop early; state is still
                // saved after the loop.
                Err(SamplerError::Exhausted(name)) => {
×
NEW
678
                    println!(
×
679
                        "batch {:>4}  exhausted recipe '{}' — stopping early",
NEW
680
                        i + 1,
×
681
                        name
682
                    );
NEW
683
                    break;
×
684
                }
NEW
685
                Err(err) => return Err(err.into()),
×
686
            }
687
        }
688

NEW
689
        sampler.save_sampler_state(None)?;
×
690

691
        #[cfg(feature = "extended-metrics")]
NEW
692
        if !source_metrics.is_empty() {
×
NEW
693
            println!();
×
NEW
694
            print_metric_summary(&source_metrics);
×
NEW
695
        }
×
696

697
        #[cfg(all(feature = "extended-metrics", feature = "bm25-mining"))]
698
        {
NEW
699
            let (fallback, total) = sampler.bm25_fallback_stats();
×
NEW
700
            // Skip the line when nothing was counted, which also avoids a
            // division by zero.
            if total > 0 {
×
NEW
701
                let pct = fallback as f64 / total as f64 * 100.0;
×
NEW
702
                println!("bm25 fallback rate : {}/{} ({:.1}%)", fallback, total, pct);
×
NEW
703
            }
×
704
        }
705
    } else {
706
        // Default mode: a single triplet batch.
        match sampler.next_triplet_batch(selected_split) {
3✔
707
            Ok(triplet_batch) => {
×
708
                if triplet_batch.triplets.is_empty() {
×
709
                    println!(
×
710
                        "Triplet sampling produced no results. Ensure multiple records per source exist."
×
711
                    );
×
712
                } else {
×
UNCOV
713
                    print_triplet_batch(&chunking, &triplet_batch, split_store.as_ref());
×
UNCOV
714
                }
×
715
                sampler.save_sampler_state(None)?;
×
716
            }
717
            Err(SamplerError::Exhausted(name)) => {
3✔
718
                eprintln!(
3✔
719
                    "Triplet sampler exhausted recipe '{}'. Ensure both positive and negative examples exist.",
3✔
720
                    name
3✔
721
                );
3✔
722
            }
3✔
723
            Err(err) => return Err(err.into()),
×
724
        }
725
    }
726

727
    Ok(())
11✔
728
}
12✔
729

730
struct MultiSourceDemoConfigSnapshot {
731
    seed: u64,
732
    batch_size: usize,
733
    ingestion_max_records: usize,
734
    split: SplitLabel,
735
    split_ratios: SplitRatios,
736
    max_window_tokens: usize,
737
    overlap_tokens: Vec<usize>,
738
    summary_fallback_tokens: usize,
739
}
740

NEW
UNCOV
741
fn print_demo_config(cfg: &MultiSourceDemoConfigSnapshot) {
×
NEW
UNCOV
742
    let overlaps: Vec<String> = cfg.overlap_tokens.iter().map(|t| t.to_string()).collect();
×
NEW
UNCOV
743
    println!("=== sampler config ===");
×
NEW
UNCOV
744
    println!("seed                 : {}", cfg.seed);
×
NEW
UNCOV
745
    println!("batch_size           : {}", cfg.batch_size);
×
NEW
746
    println!("ingestion_max_records: {}", cfg.ingestion_max_records);
×
NEW
747
    println!("split                : {:?}", cfg.split);
×
NEW
748
    println!(
×
749
        "split_ratios         : train={:.2} val={:.2} test={:.2}",
750
        cfg.split_ratios.train, cfg.split_ratios.validation, cfg.split_ratios.test
751
    );
NEW
752
    println!("max_window_tokens    : {}", cfg.max_window_tokens);
×
NEW
753
    println!("overlap_tokens       : [{}]", overlaps.join(", "));
×
NEW
754
    println!(
×
755
        "summary_fallback     : {} tokens (0 = disabled)",
756
        cfg.summary_fallback_tokens
757
    );
NEW
758
    println!();
×
NEW
759
}
×
760

761
/// Parses `raw` as a strictly positive `usize`.
///
/// # Errors
///
/// Returns a `--batch-size` flavoured message when `raw` is not a valid `usize`
/// or when it parses to zero.
fn parse_positive_usize(raw: &str) -> Result<usize, String> {
    match raw.parse::<usize>() {
        Ok(0) => Err("--batch-size must be greater than zero".to_string()),
        Ok(value) => Ok(value),
        Err(_) => Err(format!(
            "Could not parse --batch-size value '{}' as a positive integer",
            raw
        )),
    }
}
17✔
773

774
/// Returns `source_baseline / max_baseline`, clamped to `[0.0, 1.0]`.
///
/// Yields `0.0` when either baseline is zero.
fn suggested_balancing_weight(max_baseline: u128, source_baseline: u128) -> f32 {
    match (max_baseline, source_baseline) {
        (0, _) | (_, 0) => 0.0,
        (largest, own) => {
            // Divide in f64 and narrow only after clamping.
            let share = own as f64 / largest as f64;
            share.clamp(0.0, 1.0) as f32
        }
    }
}
13✔
780

781
/// Returns `min_nonzero_baseline / source_baseline`, clamped to `[0.0, 1.0]`.
///
/// Yields `0.0` when either baseline is zero.
fn suggested_oversampling_weight(min_nonzero_baseline: u128, source_baseline: u128) -> f32 {
    match (min_nonzero_baseline, source_baseline) {
        (0, _) | (_, 0) => 0.0,
        (smallest, own) => {
            // Divide in f64 and narrow only after clamping.
            let inverse_share = smallest as f64 / own as f64;
            inverse_share.clamp(0.0, 1.0) as f32
        }
    }
}
13✔
787

788
fn parse_cli<T, I>(args: I) -> Result<Option<T>, Box<dyn Error>>
22✔
789
where
22✔
790
    T: Parser,
22✔
791
    I: IntoIterator,
22✔
792
    I::Item: Into<std::ffi::OsString> + Clone,
22✔
793
{
794
    match T::try_parse_from(args) {
22✔
795
        Ok(cli) => Ok(Some(cli)),
17✔
796
        Err(err) => match err.kind() {
5✔
797
            ErrorKind::DisplayHelp | ErrorKind::DisplayVersion => {
798
                err.print()?;
3✔
799
                Ok(None)
3✔
800
            }
801
            _ => Err(err.into()),
2✔
802
        },
803
    }
804
}
22✔
805

806
fn parse_split_ratios_arg(raw: &str) -> Result<SplitRatios, String> {
11✔
807
    let parts: Vec<&str> = raw.split(',').collect();
11✔
808
    if parts.len() != 3 {
11✔
809
        return Err("--split-ratios expects exactly 3 comma-separated values".to_string());
1✔
810
    }
10✔
811
    let train = parts[0]
10✔
812
        .trim()
10✔
813
        .parse::<f32>()
10✔
814
        .map_err(|_| format!("invalid train ratio '{}': must be a float", parts[0].trim()))?;
10✔
815
    let validation = parts[1].trim().parse::<f32>().map_err(|_| {
9✔
816
        format!(
1✔
817
            "invalid validation ratio '{}': must be a float",
818
            parts[1].trim()
1✔
819
        )
820
    })?;
1✔
821
    let test = parts[2]
8✔
822
        .trim()
8✔
823
        .parse::<f32>()
8✔
824
        .map_err(|_| format!("invalid test ratio '{}': must be a float", parts[2].trim()))?;
8✔
825
    let ratios = SplitRatios {
7✔
826
        train,
7✔
827
        validation,
7✔
828
        test,
7✔
829
    };
7✔
830
    let sum = ratios.train + ratios.validation + ratios.test;
7✔
831
    if (sum - 1.0).abs() > 1e-5 {
7✔
832
        return Err(format!(
1✔
833
            "split ratios must sum to 1.0, got {:.6} (train={}, validation={}, test={})",
1✔
834
            sum, ratios.train, ratios.validation, ratios.test
1✔
835
        ));
1✔
836
    }
6✔
837
    if ratios.train < 0.0 || ratios.validation < 0.0 || ratios.test < 0.0 {
6✔
838
        return Err("split ratios must be non-negative".to_string());
1✔
839
    }
5✔
840
    Ok(ratios)
5✔
841
}
11✔
842

843
fn print_triplet_batch(
1✔
844
    strategy: &ChunkingStrategy,
1✔
845
    batch: &TripletBatch,
1✔
846
    split_store: &impl SplitStore,
1✔
847
) {
1✔
848
    println!("=== triplet batch ===");
1✔
849
    for (idx, triplet) in batch.triplets.iter().enumerate() {
1✔
850
        println!("--- triplet #{} ---", idx);
1✔
851
        println!("recipe       : {}", triplet.recipe);
1✔
852
        println!("sample_weight: {:.4}", triplet.weight);
1✔
853
        if let Some(instr) = &triplet.instruction {
1✔
854
            println!("instruction shown to model:\n{}\n", instr);
1✔
855
        }
1✔
856
        #[cfg(feature = "extended-metrics")]
857
        let (pos_sim, neg_sim) = {
1✔
858
            use crate::metrics::lexical_similarity_scores;
859
            (
1✔
860
                Some(lexical_similarity_scores(
1✔
861
                    &triplet.anchor.text,
1✔
862
                    &triplet.positive.text,
1✔
863
                )),
1✔
864
                Some(lexical_similarity_scores(
1✔
865
                    &triplet.anchor.text,
1✔
866
                    &triplet.negative.text,
1✔
867
                )),
1✔
868
            )
1✔
869
        };
870
        #[cfg(not(feature = "extended-metrics"))]
871
        let (pos_sim, neg_sim): (Option<(f32, f32)>, Option<(f32, f32)>) = (None, None);
872
        print_chunk_block("ANCHOR", &triplet.anchor, strategy, split_store, None);
1✔
873
        print_chunk_block(
1✔
874
            "POSITIVE",
1✔
875
            &triplet.positive,
1✔
876
            strategy,
1✔
877
            split_store,
1✔
878
            pos_sim,
1✔
879
        );
880
        print_chunk_block(
1✔
881
            "NEGATIVE",
1✔
882
            &triplet.negative,
1✔
883
            strategy,
1✔
884
            split_store,
1✔
885
            neg_sim,
1✔
886
        );
887
    }
888
    print_source_summary(
1✔
889
        "triplet anchors",
1✔
890
        batch
1✔
891
            .triplets
1✔
892
            .iter()
1✔
893
            .map(|triplet| triplet.anchor.record_id.as_str()),
1✔
894
    );
895
    print_recipe_context_by_source(
1✔
896
        "triplet recipes by source",
1✔
897
        batch
1✔
898
            .triplets
1✔
899
            .iter()
1✔
900
            .map(|triplet| (triplet.anchor.record_id.as_str(), triplet.recipe.as_str())),
1✔
901
    );
902
}
1✔
903

904
fn print_text_batch(strategy: &ChunkingStrategy, batch: &TextBatch, split_store: &impl SplitStore) {
2✔
905
    println!("=== text batch ===");
2✔
906
    for (idx, sample) in batch.samples.iter().enumerate() {
5✔
907
        println!("--- sample #{} ---", idx);
5✔
908
        println!("recipe       : {}", sample.recipe);
5✔
909
        println!("sample_weight: {:.4}", sample.weight);
5✔
910
        if let Some(instr) = &sample.instruction {
5✔
911
            println!("instruction shown to model:\n{}\n", instr);
1✔
912
        }
4✔
913
        print_chunk_block("TEXT", &sample.chunk, strategy, split_store, None);
5✔
914
    }
915
    print_source_summary(
2✔
916
        "text samples",
2✔
917
        batch
2✔
918
            .samples
2✔
919
            .iter()
2✔
920
            .map(|sample| sample.chunk.record_id.as_str()),
5✔
921
    );
922
    print_recipe_context_by_source(
2✔
923
        "text recipes by source",
2✔
924
        batch
2✔
925
            .samples
2✔
926
            .iter()
2✔
927
            .map(|sample| (sample.chunk.record_id.as_str(), sample.recipe.as_str())),
5✔
928
    );
929
}
2✔
930

931
fn print_pair_batch(
1✔
932
    strategy: &ChunkingStrategy,
1✔
933
    batch: &SampleBatch,
1✔
934
    split_store: &impl SplitStore,
1✔
935
) {
1✔
936
    println!("=== pair batch ===");
1✔
937
    for (idx, pair) in batch.pairs.iter().enumerate() {
1✔
938
        println!("--- pair #{} ---", idx);
1✔
939
        println!("recipe       : {}", pair.recipe);
1✔
940
        println!("label        : {:?}", pair.label);
1✔
941
        if let Some(reason) = &pair.reason {
1✔
942
            println!("reason       : {}", reason);
1✔
943
        }
1✔
944
        print_chunk_block("ANCHOR", &pair.anchor, strategy, split_store, None);
1✔
945
        print_chunk_block("OTHER", &pair.positive, strategy, split_store, None);
1✔
946
    }
947
    print_source_summary(
1✔
948
        "pair anchors",
1✔
949
        batch
1✔
950
            .pairs
1✔
951
            .iter()
1✔
952
            .map(|pair| pair.anchor.record_id.as_str()),
1✔
953
    );
954
    print_recipe_context_by_source(
1✔
955
        "pair recipes by source",
1✔
956
        batch
1✔
957
            .pairs
1✔
958
            .iter()
1✔
959
            .map(|pair| (pair.anchor.record_id.as_str(), pair.recipe.as_str())),
1✔
960
    );
961
}
1✔
962

963
fn print_text_recipes(recipes: &[TextRecipe]) {
2✔
964
    println!("=== available text recipes ===");
2✔
965
    for recipe in recipes {
4✔
966
        println!(
4✔
967
            "- {} (weight: {:.3}) selector={:?}",
968
            recipe.name, recipe.weight, recipe.selector
969
        );
970
        if let Some(instr) = &recipe.instruction {
4✔
971
            println!("  instruction: {}", instr);
1✔
972
        }
3✔
973
    }
974
}
2✔
975

976
/// Computes `(mean, median)` of `vals`, sorting the slice in place as a side effect.
///
/// For an even number of values the median is the average of the two middle
/// elements. An empty slice yields `(NaN, NaN)` instead of panicking, so callers
/// can still print a row for a source without samples.
///
/// Values are ordered with `f32::total_cmp`, which is a total order even when
/// NaN is present; a comparator that is not a total order gives unspecified
/// ordering and may make the sort panic.
#[cfg(feature = "extended-metrics")]
fn metric_mean_median(vals: &mut [f32]) -> (f32, f32) {
    if vals.is_empty() {
        // 0/0 already made the mean NaN; return the same for the median rather
        // than underflowing the index computation below.
        return (f32::NAN, f32::NAN);
    }

    let mean = vals.iter().sum::<f32>() / vals.len() as f32;

    vals.sort_unstable_by(f32::total_cmp);
    let mid = vals.len() / 2;
    let median = if vals.len() % 2 == 1 {
        vals[mid]
    } else {
        (vals[mid - 1] + vals[mid]) / 2.0
    };

    (mean, median)
}
×
987

988
/// Prints the extended-metrics summary tables to stdout.
///
/// `source_data` maps a source name to its per-triplet entries. Each entry is a
/// 4-tuple; components 0 and 2 feed the "jaccard" table (positive / negative)
/// and components 1 and 3 feed the "cosine" table. Every table lists one row
/// per source in sorted order with mean / median for positive, negative and
/// their gap, plus an `ALL` row over every entry when there is more than one source.
#[cfg(feature = "extended-metrics")]
fn print_metric_summary(source_data: &HashMap<String, Vec<(f32, f32, f32, f32)>>) {
    /// Width of the dashed rule under the header and above the `ALL` row.
    const RULE_WIDTH: usize = 83;

    /// Selects one component of an entry; any index above 2 selects the last one.
    fn component(entry: &(f32, f32, f32, f32), idx: usize) -> f32 {
        match idx {
            0 => entry.0,
            1 => entry.1,
            2 => entry.2,
            _ => entry.3,
        }
    }

    /// Returns `(mean, median)` of the `idx`-th component across `entries`.
    fn column_stats(entries: &[(f32, f32, f32, f32)], idx: usize) -> (f32, f32) {
        let mut column: Vec<f32> = entries
            .iter()
            .map(|entry| component(entry, idx))
            .collect();
        metric_mean_median(&mut column)
    }

    /// Prints one table row: label, count, positive, negative and gap as mean / median.
    fn print_row(label: &str, count: usize, pos: (f32, f32), neg: (f32, f32)) {
        println!(
            "{:<24} {:>5}  {:.3} / {:.3}     {:.3} / {:.3}     {:+.3} / {:+.3}",
            label,
            count,
            pos.0,
            pos.1,
            neg.0,
            neg.1,
            pos.0 - neg.0,
            pos.1 - neg.1,
        );
    }

    let n_sources = source_data.len();
    let total = source_data.values().map(Vec::len).sum::<usize>();
    let noun = if n_sources == 1 { "source" } else { "sources" };
    println!(
        "=== extended metrics summary ({} triplets, {} {}) ===",
        total, n_sources, noun
    );

    let mut names: Vec<&String> = source_data.keys().collect();
    names.sort();

    // (section label, positive component index, negative component index)
    let sections = [
        ("jaccard \u{2194} anchor", 0, 2),
        ("cosine  \u{2194} anchor", 1, 3),
    ];

    for (label, pos_idx, neg_idx) in sections {
        println!();
        println!("[{}]", label);
        println!(
            "{:<24} {:>5}  {:<16} {:<16} {:<16}",
            "source", "n", "positive", "negative", "gap (pos\u{2212}neg)"
        );
        println!(
            "{:<24} {:>5}  {:<16} {:<16} {:<16}",
            "", "", "mean / median", "mean / median", "mean / median"
        );
        println!("{}", "-".repeat(RULE_WIDTH));

        for name in &names {
            let entries = &source_data[*name];
            let pos = column_stats(entries, pos_idx);
            let neg = column_stats(entries, neg_idx);
            print_row(name.as_str(), entries.len(), pos, neg);
        }

        // The aggregate row only adds information when several sources are listed.
        if n_sources > 1 {
            let all: Vec<(f32, f32, f32, f32)> =
                source_data.values().flatten().copied().collect();
            let pos = column_stats(&all, pos_idx);
            let neg = column_stats(&all, neg_idx);
            println!("{}", "-".repeat(RULE_WIDTH));
            print_row("ALL", total, pos, neg);
        }
    }

    println!();
}
×
1098

1099
/// Human-readable description of a chunk's view, used by the demo printers.
trait ChunkDebug {
    /// Returns a one-line label describing how the chunk was produced.
    fn view_name(&self) -> String;
}
1102

1103
impl ChunkDebug for RecordChunk {
1104
    fn view_name(&self) -> String {
10✔
1105
        match &self.view {
10✔
1106
            ChunkView::Window {
1107
                index,
8✔
1108
                span,
8✔
1109
                overlap,
8✔
1110
                start_ratio,
8✔
1111
            } => format!(
8✔
1112
                "window#index={} span={} overlap={} start_ratio={:.3} tokens={}",
1113
                index, span, overlap, start_ratio, self.tokens_estimate
1114
            ),
1115
            ChunkView::SummaryFallback { strategy, .. } => {
2✔
1116
                format!("summary:{} tokens={}", strategy, self.tokens_estimate)
2✔
1117
            }
1118
        }
1119
    }
10✔
1120
}
1121

1122
fn print_chunk_block(
10✔
1123
    title: &str,
10✔
1124
    chunk: &RecordChunk,
10✔
1125
    strategy: &ChunkingStrategy,
10✔
1126
    split_store: &impl SplitStore,
10✔
1127
    anchor_sim: Option<(f32, f32)>,
10✔
1128
) {
10✔
1129
    let chunk_weight = chunk_weight(strategy, chunk);
10✔
1130
    let split = split_store
10✔
1131
        .label_for(&chunk.record_id)
10✔
1132
        .map(|label| format!("{:?}", label))
10✔
1133
        .unwrap_or_else(|| "Unknown".to_string());
10✔
1134
    println!("--- {} ---", title);
10✔
1135
    println!("split        : {}", split);
10✔
1136
    println!("view         : {}", chunk.view_name());
10✔
1137
    println!("chunk_weight : {:.4}", chunk_weight);
10✔
1138
    println!("record_id    : {}", chunk.record_id);
10✔
1139
    println!("section_idx  : {}", chunk.section_idx);
10✔
1140
    println!("token_est    : {}", chunk.tokens_estimate);
10✔
1141
    if let Some((j, c)) = anchor_sim {
10✔
1142
        println!("jaccard(↔a)  : {:.4}  cosine(↔a)  : {:.4}", j, c);
2✔
1143
    }
8✔
1144
    println!("model_input (exact text sent to the model):");
10✔
1145
    println!(
10✔
1146
        "<<< BEGIN MODEL TEXT >>>\n{}\n<<< END MODEL TEXT >>>\n",
1147
        chunk.text
1148
    );
1149
}
10✔
1150

1151
fn print_source_summary<'a, I>(label: &str, ids: I)
4✔
1152
where
4✔
1153
    I: Iterator<Item = &'a str>,
4✔
1154
{
1155
    let mut counts: HashMap<SourceId, usize> = HashMap::new();
4✔
1156
    for id in ids {
7✔
1157
        let source = extract_source(id);
7✔
1158
        *counts.entry(source).or_insert(0) += 1;
7✔
1159
    }
7✔
1160
    if counts.is_empty() {
4✔
UNCOV
1161
        return;
×
1162
    }
4✔
1163
    let skew = source_skew(&counts);
4✔
1164
    let mut entries: Vec<(String, usize)> = counts.into_iter().collect();
4✔
1165
    entries.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
4✔
1166
    println!("--- {} by source ---", label);
4✔
1167
    if let Some(skew) = skew {
4✔
1168
        for entry in &skew.per_source {
4✔
1169
            println!(
4✔
1170
                "{}: count={} share={:.2}",
4✔
1171
                entry.source, entry.count, entry.share
4✔
1172
            );
4✔
1173
        }
4✔
1174
        println!(
4✔
1175
            "skew: sources={} total={} min={} max={} mean={:.2} ratio={:.2}",
1176
            skew.sources, skew.total, skew.min, skew.max, skew.mean, skew.ratio
1177
        );
1178
    } else {
UNCOV
1179
        for (source, count) in &entries {
×
UNCOV
1180
            println!("{source}: count={count}");
×
UNCOV
1181
        }
×
1182
    }
1183
}
4✔
1184

1185
fn print_recipe_context_by_source<'a, I>(label: &str, entries: I)
4✔
1186
where
4✔
1187
    I: Iterator<Item = (&'a str, &'a str)>,
4✔
1188
{
1189
    let mut counts: HashMap<SourceId, HashMap<String, usize>> = HashMap::new();
4✔
1190
    for (record_id, recipe) in entries {
7✔
1191
        let source = extract_source(record_id);
7✔
1192
        let entry = counts
7✔
1193
            .entry(source)
7✔
1194
            .or_default()
7✔
1195
            .entry(recipe.to_string())
7✔
1196
            .or_insert(0);
7✔
1197
        *entry += 1;
7✔
1198
    }
7✔
1199
    if counts.is_empty() {
4✔
UNCOV
1200
        return;
×
1201
    }
4✔
1202
    let mut sources: Vec<(SourceId, HashMap<String, usize>)> = counts.into_iter().collect();
4✔
1203
    sources.sort_by(|a, b| a.0.cmp(&b.0));
4✔
1204
    println!("--- {} ---", label);
4✔
1205
    for (source, recipes) in sources {
4✔
1206
        println!("{source}");
4✔
1207
        let mut entries: Vec<(String, usize)> = recipes.into_iter().collect();
4✔
1208
        entries.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
4✔
1209
        for (recipe, count) in entries {
5✔
1210
            println!("  - {recipe}={count}");
5✔
1211
        }
5✔
1212
    }
1213
}
4✔
1214

1215
fn extract_source(record_id: &str) -> SourceId {
16✔
1216
    record_id
16✔
1217
        .split_once("::")
16✔
1218
        .map(|(source, _)| source.to_string())
16✔
1219
        .unwrap_or_else(|| "unknown".to_string())
16✔
1220
}
16✔
1221

1222
#[cfg(test)]
1223
mod tests {
1224
    use super::*;
1225
    use crate::DataRecord;
1226
    use crate::DeterministicSplitStore;
1227
    use crate::data::{QualityScore, RecordSection, SectionRole};
1228
    use crate::source::{SourceCursor, SourceSnapshot};
1229
    use chrono::Utc;
1230
    use tempfile::tempdir;
1231

1232
    /// Minimal in-memory `DataSource` test double for example app tests.
1233
    struct TestSource {
1234
        id: String,
1235
        count: Option<u128>,
1236
        recipes: Vec<TripletRecipe>,
1237
    }
1238

1239
    impl DataSource for TestSource {
1240
        fn id(&self) -> &str {
130✔
1241
            &self.id
130✔
1242
        }
130✔
1243

1244
        fn refresh(
30✔
1245
            &self,
30✔
1246
            _config: &SamplerConfig,
30✔
1247
            _cursor: Option<&SourceCursor>,
30✔
1248
            _limit: Option<usize>,
30✔
1249
        ) -> Result<SourceSnapshot, SamplerError> {
30✔
1250
            Ok(SourceSnapshot {
30✔
1251
                records: Vec::new(),
30✔
1252
                cursor: SourceCursor {
30✔
1253
                    last_seen: Utc::now(),
30✔
1254
                    revision: 0,
30✔
1255
                },
30✔
1256
            })
30✔
1257
        }
30✔
1258

1259
        fn reported_record_count(&self, _config: &SamplerConfig) -> Result<u128, SamplerError> {
2✔
1260
            self.count.ok_or_else(|| SamplerError::SourceInconsistent {
2✔
1261
                source_id: self.id.clone(),
1✔
1262
                details: "test source has no configured exact count".to_string(),
1✔
1263
            })
1✔
1264
        }
2✔
1265

1266
        fn default_triplet_recipes(&self) -> Vec<TripletRecipe> {
10✔
1267
            self.recipes.clone()
10✔
1268
        }
10✔
1269
    }
1270

1271
    struct ConfigRequiredSource {
1272
        id: String,
1273
        expected_seed: u64,
1274
    }
1275

1276
    impl DataSource for ConfigRequiredSource {
1277
        fn id(&self) -> &str {
1✔
1278
            &self.id
1✔
1279
        }
1✔
1280

1281
        fn refresh(
1✔
1282
            &self,
1✔
1283
            _config: &SamplerConfig,
1✔
1284
            _cursor: Option<&SourceCursor>,
1✔
1285
            _limit: Option<usize>,
1✔
1286
        ) -> Result<SourceSnapshot, SamplerError> {
1✔
1287
            Ok(SourceSnapshot {
1✔
1288
                records: Vec::new(),
1✔
1289
                cursor: SourceCursor {
1✔
1290
                    last_seen: Utc::now(),
1✔
1291
                    revision: 0,
1✔
1292
                },
1✔
1293
            })
1✔
1294
        }
1✔
1295

1296
        fn reported_record_count(&self, config: &SamplerConfig) -> Result<u128, SamplerError> {
2✔
1297
            if config.seed == self.expected_seed {
2✔
1298
                Ok(1)
1✔
1299
            } else {
1300
                Err(SamplerError::SourceInconsistent {
1✔
1301
                    source_id: self.id.clone(),
1✔
1302
                    details: format!(
1✔
1303
                        "expected sampler seed {} but got {}",
1✔
1304
                        self.expected_seed, config.seed
1✔
1305
                    ),
1✔
1306
                })
1✔
1307
            }
1308
        }
2✔
1309

1310
        fn default_triplet_recipes(&self) -> Vec<TripletRecipe> {
2✔
1311
            Vec::new()
2✔
1312
        }
2✔
1313
    }
1314

1315
    fn default_recipe(name: &str) -> TripletRecipe {
9✔
1316
        TripletRecipe {
9✔
1317
            name: name.to_string().into(),
9✔
1318
            anchor: crate::config::Selector::Role(SectionRole::Anchor),
9✔
1319
            positive_selector: crate::config::Selector::Role(SectionRole::Context),
9✔
1320
            negative_selector: crate::config::Selector::Role(SectionRole::Context),
9✔
1321
            negative_strategy: crate::config::NegativeStrategy::WrongArticle,
9✔
1322
            weight: 1.0,
9✔
1323
            instruction: None,
9✔
1324
            allow_same_anchor_positive: false,
9✔
1325
        }
9✔
1326
    }
9✔
1327

1328
    #[test]
1329
    fn parse_helpers_validate_inputs() {
1✔
1330
        assert_eq!(parse_positive_usize("2").unwrap(), 2);
1✔
1331
        assert!(parse_positive_usize("0").is_err());
1✔
1332
        assert!(parse_positive_usize("abc").is_err());
1✔
1333

1334
        let split = parse_split_ratios_arg("0.8,0.1,0.1").unwrap();
1✔
1335
        assert!((split.train - 0.8).abs() < 1e-6);
1✔
1336
        assert!(parse_split_ratios_arg("0.8,0.1").is_err());
1✔
1337
        assert!(parse_split_ratios_arg("1.0,0.0,0.1").is_err());
1✔
1338
        assert!(parse_split_ratios_arg("-0.1,0.6,0.5").is_err());
1✔
1339
    }
1✔
1340

1341
    #[test]
1342
    fn suggested_balancing_weight_is_longest_normalized_and_bounded() {
1✔
1343
        assert!((suggested_balancing_weight(100, 100) - 1.0).abs() < 1e-6);
1✔
1344
        assert!((suggested_balancing_weight(400, 100) - 0.25).abs() < 1e-6);
1✔
1345
        assert!((suggested_balancing_weight(400, 400) - 1.0).abs() < 1e-6);
1✔
1346
        assert_eq!(suggested_balancing_weight(0, 100), 0.0);
1✔
1347
        assert_eq!(suggested_balancing_weight(100, 0), 0.0);
1✔
1348
    }
1✔
1349

1350
    #[test]
1351
    fn suggested_oversampling_weight_is_inverse_in_unit_interval() {
1✔
1352
        assert!((suggested_oversampling_weight(100, 100) - 1.0).abs() < 1e-6);
1✔
1353
        assert!((suggested_oversampling_weight(100, 400) - 0.25).abs() < 1e-6);
1✔
1354
        assert!((suggested_oversampling_weight(100, 1000) - 0.1).abs() < 1e-6);
1✔
1355
        assert_eq!(suggested_oversampling_weight(0, 100), 0.0);
1✔
1356
        assert_eq!(suggested_oversampling_weight(100, 0), 0.0);
1✔
1357
    }
1✔
1358

1359
    #[test]
1360
    fn parse_cli_handles_help_and_invalid_args() {
1✔
1361
        let help = parse_cli::<EstimateCapacityCli, _>(["estimate_capacity", "--help"]).unwrap();
1✔
1362
        assert!(help.is_none());
1✔
1363

1364
        let err = parse_cli::<EstimateCapacityCli, _>(["estimate_capacity", "--unknown"]);
1✔
1365
        assert!(err.is_err());
1✔
1366
    }
1✔
1367

1368
    #[test]
1369
    fn run_estimate_capacity_succeeds_with_reported_counts() {
1✔
1370
        let result = run_estimate_capacity(
1✔
1371
            std::iter::empty::<String>(),
1✔
1372
            |roots| {
1✔
1373
                assert!(roots.is_empty());
1✔
1374
                Ok(())
1✔
1375
            },
1✔
1376
            |_| {
1✔
1377
                vec![Box::new(TestSource {
1✔
1378
                    id: "source_a".into(),
1✔
1379
                    count: Some(12),
1✔
1380
                    recipes: vec![default_recipe("r1")],
1✔
1381
                }) as DynSource]
1✔
1382
            },
1✔
1383
        );
1384

1385
        assert!(result.is_ok());
1✔
1386
    }
1✔
1387

1388
    #[test]
1389
    fn run_estimate_capacity_errors_when_source_count_missing() {
1✔
1390
        let result = run_estimate_capacity(
1✔
1391
            std::iter::empty::<String>(),
1✔
1392
            |_| Ok(()),
1✔
1393
            |_| {
1✔
1394
                vec![Box::new(TestSource {
1✔
1395
                    id: "source_missing".into(),
1✔
1396
                    count: None,
1✔
1397
                    recipes: vec![default_recipe("r1")],
1✔
1398
                }) as DynSource]
1✔
1399
            },
1✔
1400
        );
1401

1402
        let err = result.unwrap_err().to_string();
1✔
1403
        assert!(err.contains("failed to report exact record count"));
1✔
1404
    }
1✔
1405

1406
    #[test]
1407
    fn run_estimate_capacity_propagates_root_resolution_error() {
1✔
1408
        let result = run_estimate_capacity(
1✔
1409
            std::iter::empty::<String>(),
1✔
1410
            |_| Err("root resolution failed".into()),
1✔
1411
            |_: &()| Vec::<DynSource>::new(),
×
1412
        );
1413

1414
        let err = result.unwrap_err().to_string();
1✔
1415
        assert!(err.contains("root resolution failed"));
1✔
1416
    }
1✔
1417

1418
    #[test]
1419
    fn run_estimate_capacity_configures_sources_centrally_before_counting() {
1✔
1420
        let result = run_estimate_capacity(
1✔
1421
            std::iter::empty::<String>(),
1✔
1422
            |_| Ok(()),
1✔
1423
            |_| {
1✔
1424
                vec![Box::new(ConfigRequiredSource {
1✔
1425
                    id: "requires_config".into(),
1✔
1426
                    expected_seed: 99,
1✔
1427
                }) as DynSource]
1✔
1428
            },
1✔
1429
        );
1430

1431
        assert!(result.is_ok());
1✔
1432
    }
1✔
1433

1434
    #[test]
1435
    fn config_required_source_refresh_and_seed_mismatch_are_exercised() {
1✔
1436
        let source = ConfigRequiredSource {
1✔
1437
            id: "cfg-source".to_string(),
1✔
1438
            expected_seed: 42,
1✔
1439
        };
1✔
1440

1441
        let refreshed = source
1✔
1442
            .refresh(&SamplerConfig::default(), None, None)
1✔
1443
            .unwrap();
1✔
1444
        assert!(refreshed.records.is_empty());
1✔
1445

1446
        let mismatched = source.reported_record_count(&SamplerConfig {
1✔
1447
            seed: 7,
1✔
1448
            ..SamplerConfig::default()
1✔
1449
        });
1✔
1450
        assert!(matches!(
1✔
1451
            mismatched,
1✔
1452
            Err(SamplerError::SourceInconsistent { .. })
1453
        ));
1454

1455
        assert!(source.default_triplet_recipes().is_empty());
1✔
1456
    }
1✔
1457

1458
    #[test]
1459
    fn run_multi_source_demo_exhausted_paths_return_ok() {
1✔
1460
        struct OneRecordSource;
1461

1462
        impl DataSource for OneRecordSource {
1463
            fn id(&self) -> &str {
48✔
1464
                "one_record"
48✔
1465
            }
48✔
1466

1467
            fn refresh(
11✔
1468
                &self,
11✔
1469
                _config: &SamplerConfig,
11✔
1470
                _cursor: Option<&SourceCursor>,
11✔
1471
                _limit: Option<usize>,
11✔
1472
            ) -> Result<SourceSnapshot, SamplerError> {
11✔
1473
                let now = Utc::now();
11✔
1474
                Ok(SourceSnapshot {
11✔
1475
                    records: vec![DataRecord {
11✔
1476
                        id: "one_record::r1".to_string(),
11✔
1477
                        source: "one_record".to_string(),
11✔
1478
                        created_at: now,
11✔
1479
                        updated_at: now,
11✔
1480
                        quality: QualityScore { trust: 1.0 },
11✔
1481
                        taxonomy: Vec::new(),
11✔
1482
                        sections: vec![
11✔
1483
                            RecordSection {
11✔
1484
                                role: SectionRole::Anchor,
11✔
1485
                                heading: Some("title".to_string()),
11✔
1486
                                text: "anchor".to_string(),
11✔
1487
                                sentences: vec!["anchor".to_string()],
11✔
1488
                            },
11✔
1489
                            RecordSection {
11✔
1490
                                role: SectionRole::Context,
11✔
1491
                                heading: Some("body".to_string()),
11✔
1492
                                text: "context".to_string(),
11✔
1493
                                sentences: vec!["context".to_string()],
11✔
1494
                            },
11✔
1495
                        ],
11✔
1496
                        meta_prefix: None,
11✔
1497
                    }],
11✔
1498
                    cursor: SourceCursor {
11✔
1499
                        last_seen: now,
11✔
1500
                        revision: 0,
11✔
1501
                    },
11✔
1502
                })
11✔
1503
            }
11✔
1504

1505
            fn reported_record_count(&self, _config: &SamplerConfig) -> Result<u128, SamplerError> {
×
1506
                Ok(1)
×
1507
            }
×
1508

1509
            fn default_triplet_recipes(&self) -> Vec<TripletRecipe> {
3✔
1510
                vec![default_recipe("single_record_recipe")]
3✔
1511
            }
3✔
1512
        }
1513

1514
        for mode in ["--pair-batch", "--text-recipes", ""] {
3✔
1515
            let dir = tempdir().unwrap();
3✔
1516
            let split_store_path = dir.path().join("split_store.bin");
3✔
1517
            let mut args = vec![
3✔
1518
                "--split-store-path".to_string(),
3✔
1519
                split_store_path.to_string_lossy().to_string(),
3✔
1520
            ];
1521
            if !mode.is_empty() {
3✔
1522
                args.push(mode.to_string());
2✔
1523
            }
2✔
1524

1525
            let result = run_multi_source_demo(
3✔
1526
                args.into_iter(),
3✔
1527
                |_| Ok(()),
3✔
1528
                |_| vec![Box::new(OneRecordSource) as DynSource],
3✔
1529
            );
1530
            assert!(result.is_ok());
3✔
1531
        }
1532
    }
1✔
1533

1534
    /// Checks three `parse_cli` outcomes for `MultiSourceDemoCli`: `--help` parses to
    /// `Ok(None)`, `--batch-size 0` is an error, and a bare invocation is accepted.
    #[test]
    fn parse_multi_source_cli_handles_help_and_batch_size_validation() {
        const BIN: &str = "multi_source_demo";

        // Help is expected to be a successful parse that yields no CLI value.
        let help_outcome = parse_cli::<MultiSourceDemoCli, _>([BIN, "--help"]).unwrap();
        assert!(help_outcome.is_none());

        let zero_batch = parse_cli::<MultiSourceDemoCli, _>([BIN, "--batch-size", "0"]);
        assert!(zero_batch.is_err());

        let bare_invocation = parse_cli::<MultiSourceDemoCli, _>([BIN]);
        assert!(bare_invocation.is_ok());
    }
1✔
1545

1546
    /// `--version` on a CLI that declares a version is expected to parse to `Ok(None)`.
    #[test]
    fn parse_cli_handles_display_version_path() {
        // Minimal throwaway CLI whose only purpose is to carry a `version` attribute.
        #[derive(Debug, Parser)]
        #[command(name = "version_test", version = "1.0.0")]
        struct VersionCli {}

        let argv = ["version_test", "--version"];
        let outcome = parse_cli::<VersionCli, _>(argv).unwrap();
        assert!(outcome.is_none());
    }
1✔
1555

1556
    /// Runs the demo with `--list-text-recipes` against a `TestSource` that carries one
    /// triplet recipe and asserts the run returns `Ok`.
    #[test]
    fn run_multi_source_demo_list_text_recipes_path_succeeds() {
        let scratch = tempdir().unwrap();
        let store_file = scratch.path().join("recipes_split_store.bin");
        let mut cli_args = vec![
            String::from("--list-text-recipes"),
            String::from("--split-store-path"),
            store_file.to_string_lossy().into_owned(),
        ];

        let outcome = run_multi_source_demo(
            cli_args.drain(..),
            |_| Ok(()),
            |_| {
                let source = TestSource {
                    id: "source_for_recipes".into(),
                    count: Some(10),
                    recipes: vec![default_recipe("recipe_a")],
                };
                vec![Box::new(source) as DynSource]
            },
        );

        assert!(outcome.is_ok());
    }
1✔
1579

1580
    /// Runs the demo with `--list-text-recipes` and an explicit `--split-store-path`
    /// against a `TestSource` that has no recipes, and asserts the run returns `Ok`.
    #[test]
    fn run_multi_source_demo_list_text_recipes_uses_explicit_split_store_path() {
        let scratch = tempdir().unwrap();
        let store_file = scratch.path().join("custom_split_store.bin");
        let cli_args = vec![
            String::from("--list-text-recipes"),
            String::from("--split-store-path"),
            store_file.to_string_lossy().into_owned(),
        ];

        let outcome = run_multi_source_demo(
            cli_args.into_iter(),
            |_| Ok(()),
            |_| {
                let source = TestSource {
                    id: "source_without_text_recipes".into(),
                    count: Some(1),
                    recipes: Vec::new(),
                };
                vec![Box::new(source) as DynSource]
            },
        );

        assert!(outcome.is_ok());
    }
1✔
1604

1605
    /// For each sampling mode (`--pair-batch`, `--text-recipes`, and no mode flag) runs the
    /// demo on the validation split with a `TestSource` built with `count: Some(0)` and
    /// asserts the run returns `Ok`.
    #[test]
    fn run_multi_source_demo_sampling_modes_handle_empty_sources() {
        for mode_flag in [Some("--pair-batch"), Some("--text-recipes"), None] {
            let scratch = tempdir().unwrap();
            let store_file = scratch.path().join("empty_sources_split_store.bin");

            // The optional mode flag comes first, followed by the fixed store/split arguments.
            let cli_args: Vec<String> = mode_flag
                .into_iter()
                .map(String::from)
                .chain([
                    String::from("--split-store-path"),
                    store_file.to_string_lossy().into_owned(),
                    String::from("--split"),
                    String::from("validation"),
                ])
                .collect();

            let outcome = run_multi_source_demo(
                cli_args.into_iter(),
                |_| Ok(()),
                |_| {
                    let source = TestSource {
                        id: "source_empty".into(),
                        count: Some(0),
                        recipes: vec![default_recipe("recipe_empty")],
                    };
                    vec![Box::new(source) as DynSource]
                },
            );

            assert!(outcome.is_ok());
        }
    }
1✔
1635

1636
    /// When the root-resolution callback fails, the demo's returned error must carry the
    /// callback's message.
    #[test]
    fn run_multi_source_demo_propagates_root_resolution_error() {
        const FAILURE: &str = "demo root resolution failed";

        let scratch = tempdir().unwrap();
        let store_file = scratch.path().join("root_resolution_error_store.bin");
        let cli_args = [
            String::from("--split-store-path"),
            store_file.to_string_lossy().into_owned(),
        ];

        let outcome = run_multi_source_demo(
            cli_args.into_iter(),
            |_| Err(FAILURE.into()),
            // The root type cannot be inferred from an `Err`-only resolver, so it is pinned here.
            |_: &()| Vec::<DynSource>::new(),
        );

        let message = outcome.unwrap_err().to_string();
        assert!(message.contains(FAILURE));
    }
1✔
1653

1654
    /// Smoke-tests the triplet/pair/text batch printers and the text-recipe printer with
    /// hand-built chunks, then checks `extract_source` on an id containing `::` and on one
    /// without it.
    #[test]
    fn print_helpers_and_extract_source_cover_paths() {
        let ratios = SplitRatios::default();
        let split_store = DeterministicSplitStore::new(ratios, 42).unwrap();
        let chunking = ChunkingStrategy::default();

        // Anchor and negative use `ChunkView::Window`; the positive uses
        // `ChunkView::SummaryFallback`, so two different view variants get printed.
        let anchor_chunk = RecordChunk {
            record_id: String::from("source_a::rec1"),
            section_idx: 0,
            view: ChunkView::Window {
                index: 1,
                overlap: 2,
                span: 12,
                start_ratio: 0.25,
            },
            text: String::from("anchor text"),
            tokens_estimate: 8,
            quality: crate::data::QualityScore { trust: 0.9 },
        };
        let positive_chunk = RecordChunk {
            record_id: String::from("source_a::rec2"),
            section_idx: 1,
            view: ChunkView::SummaryFallback {
                strategy: String::from("summary"),
                weight: 0.7,
            },
            text: String::from("positive text"),
            tokens_estimate: 6,
            quality: crate::data::QualityScore { trust: 0.8 },
        };
        let negative_chunk = RecordChunk {
            record_id: String::from("source_b::rec3"),
            section_idx: 2,
            view: ChunkView::Window {
                index: 0,
                overlap: 0,
                span: 16,
                start_ratio: 0.0,
            },
            text: String::from("negative text"),
            tokens_estimate: 7,
            quality: crate::data::QualityScore { trust: 0.5 },
        };

        // Triplet printer: all three chunks are cloned because they are reused below.
        let triplets = TripletBatch {
            triplets: vec![crate::SampleTriplet {
                recipe: String::from("triplet_recipe"),
                anchor: anchor_chunk.clone(),
                positive: positive_chunk.clone(),
                negative: negative_chunk.clone(),
                weight: 1.0,
                instruction: Some(String::from("triplet instruction")),
            }],
        };
        print_triplet_batch(&chunking, &triplets, &split_store);

        // Pair printer: last use of the anchor and positive chunks, so they are moved in.
        let pairs = SampleBatch {
            pairs: vec![crate::SamplePair {
                recipe: String::from("pair_recipe"),
                anchor: anchor_chunk,
                positive: positive_chunk,
                weight: 1.0,
                instruction: None,
                label: crate::PairLabel::Positive,
                reason: Some(String::from("same topic")),
            }],
        };
        print_pair_batch(&chunking, &pairs, &split_store);

        // Text printer: consumes the negative chunk.
        let texts = TextBatch {
            samples: vec![crate::TextSample {
                recipe: String::from("text_recipe"),
                chunk: negative_chunk,
                weight: 0.8,
                instruction: Some(String::from("text instruction")),
            }],
        };
        print_text_batch(&chunking, &texts, &split_store);

        let text_recipes = vec![TextRecipe {
            name: "recipe_name".into(),
            selector: crate::config::Selector::Role(SectionRole::Context),
            instruction: Some("instruction".into()),
            weight: 1.0,
        }];
        print_text_recipes(&text_recipes);

        assert_eq!(extract_source("source_a::record"), "source_a");
        assert_eq!(extract_source("record-without-delimiter"), "unknown");
    }
1✔
1744

1745
    /// Each `SplitArg` variant converts to the `SplitLabel` variant of the same name.
    ///
    /// Despite the test name, only the split conversions are asserted in this body.
    #[test]
    fn split_arg_conversion_and_version_parse_paths_are_covered() {
        let train_label = SplitLabel::from(SplitArg::Train);
        assert!(matches!(train_label, SplitLabel::Train));

        let validation_label = SplitLabel::from(SplitArg::Validation);
        assert!(matches!(validation_label, SplitLabel::Validation));

        let test_label = SplitLabel::from(SplitArg::Test);
        assert!(matches!(test_label, SplitLabel::Test));
    }
1✔
1757

1758
    /// A non-numeric value in the train, validation, or test position must produce an error
    /// message naming that specific field.
    #[test]
    fn parse_split_ratios_reports_per_field_parse_errors() {
        // (raw argument, fragment the error message is expected to contain)
        let cases = [
            ("x,0.1,0.9", "invalid train ratio"),
            ("0.1,y,0.8", "invalid validation ratio"),
            ("0.1,0.2,z", "invalid test ratio"),
        ];

        for (raw, expected_fragment) in cases {
            let message = parse_split_ratios_arg(raw).unwrap_err();
            assert!(message.contains(expected_fragment));
        }
    }
1✔
1776

1777
    /// For each sampling mode (`--pair-batch`, `--text-recipes`, and no mode flag) runs the
    /// demo with a `TestSource` that has no recipes and asserts the run returns `Ok`.
    #[test]
    fn run_multi_source_demo_exhausted_paths_are_handled() {
        for mode_flag in [Some("--pair-batch"), Some("--text-recipes"), None] {
            let scratch = tempdir().unwrap();
            let store_file = scratch.path().join("exhausted_split_store.bin");

            // The optional mode flag comes first, followed by the split-store location.
            let cli_args: Vec<String> = mode_flag
                .into_iter()
                .map(String::from)
                .chain([
                    String::from("--split-store-path"),
                    store_file.to_string_lossy().into_owned(),
                ])
                .collect();

            let outcome = run_multi_source_demo(
                cli_args.into_iter(),
                |_| Ok(()),
                |_| {
                    let source = TestSource {
                        id: "source_without_recipes".into(),
                        count: Some(1),
                        recipes: Vec::new(),
                    };
                    vec![Box::new(source) as DynSource]
                },
            );

            assert!(outcome.is_ok());
        }
    }
1✔
1805
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc