• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

jzombie / rust-triplets / 23534687259

25 Mar 2026 09:45AM UTC coverage: 92.077% (-2.7%) from 94.794%
23534687259

Pull #40

github

web-flow
Merge 5e5e9a856 into 65addee9d
Pull Request #40: Refactor BM25 integration

2148 of 2635 new or added lines in 6 files covered. (81.52%)

46 existing lines in 1 file now uncovered.

14783 of 16055 relevant lines covered (92.08%)

137458.06 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

82.85
/src/example_apps.rs
1
// TODO: Consider extracting to a debug crate
2

3
use std::collections::HashMap;
4
use std::error::Error;
5
use std::path::PathBuf;
6
use std::sync::Arc;
7
use std::sync::Once;
8
use std::time::Instant;
9

10
use cache_manager::CacheRoot;
11
use clap::{Parser, ValueEnum, error::ErrorKind};
12

13
use crate::config::{ChunkingStrategy, SamplerConfig, TripletRecipe};
14
use crate::constants::cache::{MULTI_SOURCE_DEMO_GROUP, MULTI_SOURCE_DEMO_STORE_FILENAME};
15
use crate::data::ChunkView;
16
use crate::heuristics::{
17
    CapacityTotals, EFFECTIVE_NEGATIVES_PER_ANCHOR, EFFECTIVE_POSITIVES_PER_ANCHOR,
18
    estimate_source_split_capacity_from_counts, format_replay_factor, format_u128_with_commas,
19
    resolve_text_recipes_for_source, split_counts_for_total,
20
};
21
use crate::metrics::source_skew;
22
use crate::sampler::chunk_weight;
23
use crate::source::DataSource;
24
use crate::splits::{FileSplitStore, SplitLabel, SplitRatios, SplitStore};
25
use crate::{
26
    RecordChunk, SampleBatch, Sampler, SamplerError, SourceId, TextBatch, TextRecipe, TripletBatch,
27
    TripletSampler,
28
};
29

30
/// Boxed, dynamically dispatched data source; the element type returned by the
/// injected `build_sources` callbacks of the example runners in this file.
type DynSource = Box<dyn DataSource + 'static>;
31

32
fn managed_demo_split_store_path() -> Result<PathBuf, String> {
×
33
    let cache_root = CacheRoot::from_discovery()
×
34
        .map_err(|err| format!("failed discovering managed cache root: {err}"))?;
×
35
    let group = PathBuf::from(MULTI_SOURCE_DEMO_GROUP);
×
36
    let dir = cache_root.ensure_group(&group).map_err(|err| {
×
37
        format!(
×
38
            "failed creating managed demo cache group '{}': {err}",
39
            group.display()
×
40
        )
41
    })?;
×
42
    Ok(dir.join(MULTI_SOURCE_DEMO_STORE_FILENAME))
×
43
}
×
44

45
fn init_example_tracing() {
16✔
46
    static INIT: Once = Once::new();
47
    INIT.call_once(|| {
16✔
48
        let env_filter = tracing_subscriber::EnvFilter::try_from_default_env()
1✔
49
            .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("triplets=info"));
1✔
50
        let _ = tracing_subscriber::fmt()
1✔
51
            .with_env_filter(env_filter)
1✔
52
            .try_init();
1✔
53
    });
1✔
54
}
16✔
55

56
#[derive(Debug, Clone, Copy, ValueEnum)]
/// CLI split selector mapped onto `SplitLabel`.
// Used as the value type of `--split` in `MultiSourceDemoCli`; converted to
// `SplitLabel` through the `From<SplitArg>` impl.
// NOTE(review): accepted spellings are presumably `train`, `validation`, `test`
// under clap's default `ValueEnum` naming — confirm against the clap version in use.
enum SplitArg {
    Train,
    Validation,
    Test,
}
63

64
impl From<SplitArg> for SplitLabel {
65
    fn from(value: SplitArg) -> Self {
6✔
66
        match value {
6✔
67
            SplitArg::Train => SplitLabel::Train,
1✔
68
            SplitArg::Validation => SplitLabel::Validation,
4✔
69
            SplitArg::Test => SplitLabel::Test,
1✔
70
        }
71
    }
6✔
72
}
73

74
#[derive(Debug, Parser)]
#[command(
    name = "estimate_capacity",
    disable_help_subcommand = true,
    about = "Metadata-only capacity estimation",
    long_about = "Estimate record, pair, triplet, and text-sample capacity using source-reported counts only (no data refresh).",
    after_help = "Source roots are optional and resolved in order by explicit arg, environment variables, then project defaults."
)]
/// CLI arguments for metadata-only capacity estimation.
// Field notes use `//` rather than `///` so they cannot leak into clap's
// generated help text.
struct EstimateCapacityCli {
    // Copied into `SamplerConfig::seed` by `run_estimate_capacity` and echoed in
    // the report header; defaults to 99 when the flag is omitted.
    #[arg(
        long,
        default_value_t = 99,
        help = "Deterministic seed used for split allocation"
    )]
    seed: u64,
    // Parsed by `parse_split_ratios_arg` (defined outside this excerpt); used both
    // for `SamplerConfig::split` and for dividing each source's reported total.
    #[arg(
        long = "split-ratios",
        value_name = "TRAIN,VALIDATION,TEST",
        value_parser = parse_split_ratios_arg,
        default_value = "0.8,0.1,0.1",
        help = "Comma-separated split ratios that must sum to 1.0"
    )]
    split: SplitRatios,
    // Passed unchanged, in command-line order, to the injected `resolve_roots`
    // callback.
    #[arg(
        long = "source-root",
        value_name = "PATH",
        help = "Optional source root override, repeat as needed in source order"
    )]
    source_roots: Vec<String>,
}
105

106
#[derive(Debug, Parser)]
#[command(
    name = "multi_source_demo",
    disable_help_subcommand = true,
    about = "Run sampled batches from multiple sources",
    long_about = "Sample triplet, pair, or text batches from multiple sources and persist split/epoch state.",
    after_help = "Source roots are optional and resolved in order by explicit arg, environment variables, then project defaults."
)]
/// CLI for `multi_source_demo`.
///
/// Common usage:
/// - Use managed cache-group default path (no flag)
/// - Set an explicit file path: `--split-store-path /tmp/split_store.bin`
/// - Repeat `--source-root <PATH>` to override source roots in order
// Field notes use `//` rather than `///` so they cannot leak into clap's
// generated help text.
//
// Mode precedence in `run_multi_source_demo` when several mode flags are given:
// `--pair-batch`, then `--text-recipes`, then `--list-text-recipes`, then
// `--batches`; with none of them a single triplet batch is sampled.
struct MultiSourceDemoCli {
    #[arg(
        long = "text-recipes",
        help = "Emit a text batch instead of a triplet batch"
    )]
    show_text_samples: bool,
    #[arg(
        long = "pair-batch",
        help = "Emit a pair batch instead of a triplet batch"
    )]
    show_pair_samples: bool,
    #[arg(
        long = "list-text-recipes",
        help = "Print registered text recipes and exit"
    )]
    list_text_recipes: bool,
    // Validated by `parse_positive_usize`, so zero is rejected at parse time.
    #[arg(
        long = "batch-size",
        default_value_t = 4,
        value_parser = parse_positive_usize,
        help = "Batch size used for sampling"
    )]
    batch_size: usize,
    // When absent, the seed from `SamplerConfig::default()` is kept.
    #[arg(long, help = "Optional deterministic seed override")]
    seed: Option<u64>,
    // When absent, the runner samples from `SplitLabel::Train`.
    #[arg(long, value_enum, help = "Target split to sample from")]
    split: Option<SplitArg>,
    // Passed unchanged, in command-line order, to the injected `resolve_roots`
    // callback.
    #[arg(
        long = "source-root",
        value_name = "PATH",
        help = "Optional source root override, repeat as needed in source order"
    )]
    source_roots: Vec<String>,
    // When absent, the runner falls back to `managed_demo_split_store_path()`.
    #[arg(
        long = "split-store-path",
        value_name = "SPLIT_STORE_PATH",
        help = "Optional explicit path for persisted split/epoch state file"
    )]
    split_store_path: Option<PathBuf>,
    // Only acts when the resolved split-store file already exists.
    #[arg(
        long = "reset",
        help = "Delete the persisted split/epoch state before sampling, restarting from epoch 0"
    )]
    reset: bool,
    // Shares `parse_positive_usize` with `--batch-size`, so zero is rejected here too.
    #[arg(
        long = "batches",
        value_name = "N",
        value_parser = parse_positive_usize,
        help = "Run N triplet batches in succession, printing a timing line per batch and (with --features extended-metrics) a per-source similarity summary at the end"
    )]
    batches: Option<usize>,
}
172

173
#[derive(Debug, Clone)]
/// Source-level inventory used by capacity estimation output.
///
/// One entry is built per source in `run_estimate_capacity` before any split
/// arithmetic is performed.
struct SourceInventory {
    /// Identifier taken from `source.id()`; also used as part of the
    /// `(source_id, split)` map keys in the report.
    source_id: String,
    /// Record total returned by `source.reported_record_count(&config)`.
    reported_records: u128,
    /// The config's recipes when it has any, otherwise the source's
    /// `default_triplet_recipes()`.
    triplet_recipes: Vec<TripletRecipe>,
}
180

181
/// Run the capacity-estimation CLI with injectable root resolution/source builders.
///
/// `build_sources` is construction-only; sampler configuration is applied
/// centrally by this function before any source calls.
///
/// The report is written to stdout in this order: header, `[SOURCES]`,
/// `[PER SOURCE BREAKDOWN]`, one section per split, `[ALL SPLITS TOTAL]`, and a
/// set of explanatory notes.
///
/// # Parameters
/// - `args_iter`: command-line arguments without the program name; the name
///   `estimate_capacity` is prepended here before parsing.
/// - `resolve_roots`: turns the repeated `--source-root` values into the root
///   description `R` consumed by `build_sources`.
/// - `build_sources`: constructs the sources to inventory from `&R`.
///
/// # Errors
/// Propagates errors from CLI parsing and `resolve_roots`, and returns a
/// formatted message when any source fails to report its record count.
pub fn run_estimate_capacity<R, Resolve, Build, I>(
    args_iter: I,
    resolve_roots: Resolve,
    build_sources: Build,
) -> Result<(), Box<dyn Error>>
where
    Resolve: FnOnce(Vec<String>) -> Result<R, Box<dyn Error>>,
    Build: FnOnce(&R) -> Vec<DynSource>,
    I: Iterator<Item = String>,
{
    init_example_tracing();

    // `parse_cli` is defined outside this excerpt; a `None` result ends the run
    // successfully without producing a report (presumably help/version output
    // was already handled — confirm against `parse_cli`).
    let Some(cli) = parse_cli::<EstimateCapacityCli, _>(
        std::iter::once("estimate_capacity".to_string()).chain(args_iter),
    )?
    else {
        return Ok(());
    };

    let roots = resolve_roots(cli.source_roots)?;

    // Only seed and split ratios come from the CLI; everything else is default.
    let config = SamplerConfig {
        seed: cli.seed,
        split: cli.split,
        ..SamplerConfig::default()
    };

    let sources = build_sources(&roots);

    // Phase 1: collect one inventory entry per source. The first source that
    // cannot report a record count aborts the whole run.
    let mut inventories = Vec::new();
    for source in &sources {
        let recipes = if config.recipes.is_empty() {
            source.default_triplet_recipes()
        } else {
            config.recipes.clone()
        };
        let reported_records = source.reported_record_count(&config).map_err(|err| {
            format!(
                "source '{}' failed to report exact record count: {err}",
                source.id()
            )
        })?;
        inventories.push(SourceInventory {
            source_id: source.id().to_string(),
            reported_records,
            triplet_recipes: recipes,
        });
    }

    // Phase 2: divide each source's total across splits, tracking both the
    // per-(source, split) count and the per-split sum over all sources.
    let mut per_source_split_counts: HashMap<(String, SplitLabel), u128> = HashMap::new();
    let mut split_record_counts: HashMap<SplitLabel, u128> = HashMap::new();

    for source in &inventories {
        let counts = split_counts_for_total(source.reported_records, cli.split);
        for (label, count) in counts {
            per_source_split_counts.insert((source.source_id.clone(), label), count);
            *split_record_counts.entry(label).or_insert(0) += count;
        }
    }

    // Phase 3: estimate capacity for every (source, split) pair and accumulate
    // the per-split totals.
    let mut totals_by_split: HashMap<SplitLabel, CapacityTotals> = HashMap::new();
    let mut totals_by_source_and_split: HashMap<(String, SplitLabel), CapacityTotals> =
        HashMap::new();

    for split_label in [SplitLabel::Train, SplitLabel::Validation, SplitLabel::Test] {
        let mut totals = CapacityTotals::default();

        for source in &inventories {
            let source_split_records = per_source_split_counts
                .get(&(source.source_id.clone(), split_label))
                .copied()
                .unwrap_or(0);

            let triplet_recipes = &source.triplet_recipes;
            let text_recipes = resolve_text_recipes_for_source(&config, triplet_recipes);

            let capacity = estimate_source_split_capacity_from_counts(
                source_split_records,
                triplet_recipes,
                &text_recipes,
            );

            totals_by_source_and_split.insert((source.source_id.clone(), split_label), capacity);

            totals.triplets += capacity.triplets;
            totals.effective_triplets += capacity.effective_triplets;
            totals.pairs += capacity.pairs;
            totals.text_samples += capacity.text_samples;
        }

        totals_by_split.insert(split_label, totals);
    }

    // Smallest non-zero per-source record count within each split (0 when every
    // source is empty in that split); feeds the small-source-boost weight.
    let min_nonzero_records_by_split: HashMap<SplitLabel, u128> =
        [SplitLabel::Train, SplitLabel::Validation, SplitLabel::Test]
            .into_iter()
            .map(|split_label| {
                let min_nonzero = inventories
                    .iter()
                    .filter_map(|source| {
                        per_source_split_counts
                            .get(&(source.source_id.clone(), split_label))
                            .copied()
                    })
                    .filter(|&records| records > 0)
                    .min()
                    .unwrap_or(0);
                (split_label, min_nonzero)
            })
            .collect();

    // Same idea across all splits, based on each source's full reported total.
    let min_nonzero_records_all_splits = inventories
        .iter()
        .map(|source| source.reported_records)
        .filter(|&records| records > 0)
        .min()
        .unwrap_or(0);

    // Report header.
    println!("=== capacity estimate (length-only) ===");
    println!("mode: metadata-only (no source.refresh calls)");
    println!("classification: heuristic approximation (not exact)");
    println!("split seed: {}", cli.seed);
    println!(
        "split ratios: train={:.4}, validation={:.4}, test={:.4}",
        cli.split.train, cli.split.validation, cli.split.test
    );
    println!();

    println!("[SOURCES]");
    for source in &inventories {
        println!(
            "  {} => reported records: {}",
            source.source_id,
            format_u128_with_commas(source.reported_records)
        );
    }
    println!();

    println!("[PER SOURCE BREAKDOWN]");
    for source in &inventories {
        println!("  {}", source.source_id);
        let mut source_grand = CapacityTotals::default();
        let mut source_total_records = 0u128;
        for split_label in [SplitLabel::Train, SplitLabel::Validation, SplitLabel::Test] {
            let split_records = per_source_split_counts
                .get(&(source.source_id.clone(), split_label))
                .copied()
                .unwrap_or(0);
            source_total_records = source_total_records.saturating_add(split_records);
            // Largest record count any source has in this split; the baseline
            // for the replay factor and proportional weight below.
            let split_longest_records = inventories
                .iter()
                .map(|candidate| {
                    per_source_split_counts
                        .get(&(candidate.source_id.clone(), split_label))
                        .copied()
                        .unwrap_or(0)
                })
                .max()
                .unwrap_or(0);
            let totals = totals_by_source_and_split
                .get(&(source.source_id.clone(), split_label))
                .copied()
                .unwrap_or_default();
            source_grand.triplets += totals.triplets;
            source_grand.effective_triplets += totals.effective_triplets;
            source_grand.pairs += totals.pairs;
            source_grand.text_samples += totals.text_samples;
            println!("    [{:?}]", split_label);
            println!("      records: {}", format_u128_with_commas(split_records));
            println!(
                "      triplet combinations: {}",
                format_u128_with_commas(totals.triplets)
            );
            println!(
                "      effective sampled triplets (p={}, k={}): {}",
                EFFECTIVE_POSITIVES_PER_ANCHOR,
                EFFECTIVE_NEGATIVES_PER_ANCHOR,
                format_u128_with_commas(totals.effective_triplets)
            );
            println!(
                "      pair combinations:    {}",
                format_u128_with_commas(totals.pairs)
            );
            println!(
                "      text samples:         {}",
                format_u128_with_commas(totals.text_samples)
            );
            println!(
                "      replay factor vs longest source: {}",
                format_replay_factor(split_longest_records, split_records)
            );
            println!(
                "      suggested proportional-size batch weight (0-1): {:.4}",
                suggested_balancing_weight(split_longest_records, split_records)
            );
            let split_smallest_nonzero = min_nonzero_records_by_split
                .get(&split_label)
                .copied()
                .unwrap_or(0);
            println!(
                "      suggested small-source-boost batch weight (0-1): {:.4}",
                suggested_oversampling_weight(split_smallest_nonzero, split_records)
            );
            println!();
        }
        // Source-level summary: the baseline is the largest reported total of
        // any source, compared against this source's summed split counts.
        let longest_source_total = inventories
            .iter()
            .map(|candidate| candidate.reported_records)
            .max()
            .unwrap_or(0);
        println!("    [ALL SPLITS FOR SOURCE]");
        println!(
            "      triplet combinations: {}",
            format_u128_with_commas(source_grand.triplets)
        );
        println!(
            "      effective sampled triplets (p={}, k={}): {}",
            EFFECTIVE_POSITIVES_PER_ANCHOR,
            EFFECTIVE_NEGATIVES_PER_ANCHOR,
            format_u128_with_commas(source_grand.effective_triplets)
        );
        println!(
            "      pair combinations:    {}",
            format_u128_with_commas(source_grand.pairs)
        );
        println!(
            "      text samples:         {}",
            format_u128_with_commas(source_grand.text_samples)
        );
        println!(
            "      replay factor vs longest source: {}",
            format_replay_factor(longest_source_total, source_total_records)
        );
        println!(
            "      suggested proportional-size batch weight (0-1): {:.4}",
            suggested_balancing_weight(longest_source_total, source_total_records)
        );
        println!(
            "      suggested small-source-boost batch weight (0-1): {:.4}",
            suggested_oversampling_weight(min_nonzero_records_all_splits, source_total_records)
        );
        println!();
    }

    // Per-split sections aggregated over all sources, accumulating the grand total.
    let mut grand = CapacityTotals::default();
    for split_label in [SplitLabel::Train, SplitLabel::Validation, SplitLabel::Test] {
        let record_count = split_record_counts.get(&split_label).copied().unwrap_or(0);
        let totals = totals_by_split
            .get(&split_label)
            .copied()
            .unwrap_or_default();

        grand.triplets += totals.triplets;
        grand.effective_triplets += totals.effective_triplets;
        grand.pairs += totals.pairs;
        grand.text_samples += totals.text_samples;

        println!("[{:?}]", split_label);
        println!("  records: {}", format_u128_with_commas(record_count));
        println!(
            "  triplet combinations: {}",
            format_u128_with_commas(totals.triplets)
        );
        println!(
            "  effective sampled triplets (p={}, k={}): {}",
            EFFECTIVE_POSITIVES_PER_ANCHOR,
            EFFECTIVE_NEGATIVES_PER_ANCHOR,
            format_u128_with_commas(totals.effective_triplets)
        );
        println!(
            "  pair combinations:    {}",
            format_u128_with_commas(totals.pairs)
        );
        println!(
            "  text samples:         {}",
            format_u128_with_commas(totals.text_samples)
        );
        println!();
    }

    println!("[ALL SPLITS TOTAL]");
    println!(
        "  triplet combinations: {}",
        format_u128_with_commas(grand.triplets)
    );
    println!(
        "  effective sampled triplets (p={}, k={}): {}",
        EFFECTIVE_POSITIVES_PER_ANCHOR,
        EFFECTIVE_NEGATIVES_PER_ANCHOR,
        format_u128_with_commas(grand.effective_triplets)
    );
    println!(
        "  pair combinations:    {}",
        format_u128_with_commas(grand.pairs)
    );
    println!(
        "  text samples:         {}",
        format_u128_with_commas(grand.text_samples)
    );
    println!();
    // Explanatory footnotes describing how to read the numbers above.
    println!(
        "Note: counts are heuristic, length-based estimates from source-reported totals and recipe structure. They are approximate, not exact, and assume anchor-positive pairs=records (one positive per anchor by default), negatives=source_records_in_split-1 (anchor excluded as its own negative), and at most one chunk/window realization per sample. In real-world chunked sampling, practical combinations are often higher, so treat this as a floor-like baseline."
    );
    println!();
    println!(
        "Effective sampled triplets apply a bounded training assumption: effective_triplets = records * p * k per triplet recipe, with defaults p={} positives per anchor and k={} negatives per anchor.",
        EFFECTIVE_POSITIVES_PER_ANCHOR, EFFECTIVE_NEGATIVES_PER_ANCHOR
    );
    println!();
    println!(
        "Oversample loops are not inferred from this static report. To measure true oversampling (how many times sampling loops through the combination space), use observed sampled draw counts from an actual run."
    );
    println!();
    println!(
        "Suggested proportional-size batch weight (0-1) is source/max_source by record count: 1.0 for the largest source in scope, smaller values for smaller sources."
    );
    println!();
    println!(
        "Suggested small-source-boost batch weight (0-1) is min_nonzero_source/source by record count: 1.0 for the smallest non-zero source in scope, smaller values for larger sources."
    );
    println!();
    println!(
        "When passed to next_*_batch_with_weights, higher weight means that source is sampled more often relative to lower-weight sources."
    );

    Ok(())
}
4✔
512

513
/// Run the multi-source demo CLI with injectable root resolution/source builders.
///
/// `build_sources` is construction-only. Source sampler configuration is owned
/// by sampler registration (`TripletSampler::register_source`).
///
/// Exactly one mode runs per invocation, chosen in this order: pair batch,
/// text batch, list text recipes, multi-batch benchmark (`--batches`), and
/// otherwise a single triplet batch.
///
/// # Parameters
/// - `args_iter`: command-line arguments without the program name; the name
///   `multi_source_demo` is prepended here before parsing.
/// - `resolve_roots`: turns the repeated `--source-root` values into the root
///   description `R` consumed by `build_sources`.
/// - `build_sources`: constructs the sources that get registered on the sampler.
///
/// # Errors
/// Propagates failures from CLI parsing, root resolution, split-store path
/// resolution/reset/opening, saving sampler state, and any sampler error other
/// than `SamplerError::Exhausted`. Exhaustion is reported as a message and the
/// function still returns `Ok(())`.
pub fn run_multi_source_demo<R, Resolve, Build, I>(
    args_iter: I,
    resolve_roots: Resolve,
    build_sources: Build,
) -> Result<(), Box<dyn Error>>
where
    Resolve: FnOnce(Vec<String>) -> Result<R, Box<dyn Error>>,
    Build: FnOnce(&R) -> Vec<DynSource>,
    I: Iterator<Item = String>,
{
    init_example_tracing();

    // `parse_cli` is defined outside this excerpt; a `None` result ends the run
    // successfully without sampling (presumably help/version output was already
    // handled — confirm against `parse_cli`).
    let Some(cli) = parse_cli::<MultiSourceDemoCli, _>(
        std::iter::once("multi_source_demo".to_string()).chain(args_iter),
    )?
    else {
        return Ok(());
    };

    let roots = resolve_roots(cli.source_roots)?;

    // Start from defaults and overlay the CLI choices; sampling is restricted
    // to the single selected split (Train when `--split` is omitted).
    let mut config = SamplerConfig::default();
    config.seed = cli.seed.unwrap_or(config.seed);
    config.batch_size = cli.batch_size;
    config.chunking = Default::default();
    let selected_split = cli.split.map(Into::into).unwrap_or(SplitLabel::Train);
    config.split = SplitRatios::default();
    config.allowed_splits = vec![selected_split];
    // `config` is moved into the sampler below, so keep copies of everything
    // the printing helpers need afterwards.
    let chunking = config.chunking.clone();
    let config_snapshot = MultiSourceDemoConfigSnapshot {
        seed: config.seed,
        batch_size: config.batch_size,
        ingestion_max_records: config.ingestion_max_records,
        split: selected_split,
        split_ratios: config.split,
        max_window_tokens: config.chunking.max_window_tokens,
        overlap_tokens: config.chunking.overlap_tokens.clone(),
        summary_fallback_tokens: config.chunking.summary_fallback_tokens,
    };

    // An explicit `--split-store-path` wins; otherwise use the managed cache location.
    let split_store_path = if let Some(path) = cli.split_store_path {
        path
    } else {
        managed_demo_split_store_path().map_err(|err| {
            Box::<dyn Error>::from(format!("failed to resolve demo split-store path: {err}"))
        })?
    };

    // `--reset` only acts when the store file already exists.
    if cli.reset && split_store_path.exists() {
        std::fs::remove_file(&split_store_path).map_err(|err| {
            Box::<dyn Error>::from(format!(
                "failed to remove split store '{}': {err}",
                split_store_path.display()
            ))
        })?;
        println!("Reset: removed {}", split_store_path.display());
    }
    println!(
        "Persisting split assignments and epoch state to {}",
        split_store_path.display()
    );
    let sources = build_sources(&roots);
    // NOTE(review): the third argument is a fixed 99 rather than `config.seed`,
    // so `--seed` does not reach the split store — confirm this is intentional.
    let split_store = Arc::new(FileSplitStore::open(&split_store_path, config.split, 99)?);
    let sampler = TripletSampler::new(config, split_store.clone());
    for source in sources {
        sampler.register_source(source);
    }

    if cli.show_pair_samples {
        // Mode: one pair batch. Sampler state is saved only on success.
        match sampler.next_pair_batch(selected_split) {
            Ok(pair_batch) => {
                if pair_batch.pairs.is_empty() {
                    println!("Pair sampling produced no results.");
                } else {
                    print_pair_batch(&chunking, &pair_batch, split_store.as_ref());
                }
                sampler.save_sampler_state(None)?;
            }
            Err(SamplerError::Exhausted(name)) => {
                eprintln!(
                    "Pair sampler exhausted recipe '{}'. Ensure both positive and negative examples exist.",
                    name
                );
            }
            Err(err) => return Err(err.into()),
        }
    } else if cli.show_text_samples {
        // Mode: one text batch. Sampler state is saved only on success.
        match sampler.next_text_batch(selected_split) {
            Ok(text_batch) => {
                if text_batch.samples.is_empty() {
                    println!(
                        "Text sampling produced no results. Ensure each source has eligible sections."
                    );
                } else {
                    print_text_batch(&chunking, &text_batch, split_store.as_ref());
                }
                sampler.save_sampler_state(None)?;
            }
            Err(SamplerError::Exhausted(name)) => {
                eprintln!(
                    "Text sampler exhausted selector '{}'. Ensure matching sections exist.",
                    name
                );
            }
            Err(err) => return Err(err.into()),
        }
    } else if cli.list_text_recipes {
        // Mode: list registered text recipes; no sampling, no state save.
        let recipes = sampler.text_recipes();
        if recipes.is_empty() {
            println!(
                "No text recipes registered. Ensure your sources expose triplet selectors or configure text_recipes explicitly."
            );
        } else {
            print_text_recipes(&recipes);
        }
    } else if let Some(batch_count) = cli.batches {
        // Mode: benchmark. Prints the config, then one timing line per batch.
        print_demo_config(&config_snapshot);
        println!("=== benchmark: {} triplet batches ===", batch_count);

        // source_id -> Vec<(pos_jaccard, pos_byte_cosine, neg_jaccard, neg_byte_cosine)>
        #[cfg(feature = "extended-metrics")]
        let mut source_metrics: HashMap<String, Vec<(f32, f32, f32, f32)>> = HashMap::new();

        for i in 0..batch_count {
            let t0 = Instant::now();
            match sampler.next_triplet_batch(selected_split) {
                Ok(batch) => {
                    // Timing covers only the `next_triplet_batch` call.
                    let elapsed = t0.elapsed();
                    let n = batch.triplets.len();
                    println!(
                        "batch {:>4}  triplets={:<4}  elapsed={:>8.2}ms  per_triplet={:.2}ms",
                        i + 1,
                        n,
                        elapsed.as_secs_f64() * 1000.0,
                        // Guard against dividing by zero for an empty batch.
                        if n > 0 {
                            elapsed.as_secs_f64() * 1000.0 / n as f64
                        } else {
                            0.0
                        },
                    );
                    #[cfg(feature = "extended-metrics")]
                    {
                        use crate::metrics::lexical_similarity_scores;
                        // Score anchor-vs-positive and anchor-vs-negative text,
                        // grouped by the source derived from the anchor record id.
                        for triplet in &batch.triplets {
                            let (pj, pc) = lexical_similarity_scores(
                                &triplet.anchor.text,
                                &triplet.positive.text,
                            );
                            let (nj, nc) = lexical_similarity_scores(
                                &triplet.anchor.text,
                                &triplet.negative.text,
                            );
                            let source = extract_source(&triplet.anchor.record_id);
                            source_metrics
                                .entry(source)
                                .or_default()
                                .push((pj, pc, nj, nc));
                        }
                    }
                }
                Err(SamplerError::Exhausted(name)) => {
                    // Unlike the single-batch modes, this message goes to stdout
                    // and the loop ends early; state is still saved below.
                    println!(
                        "batch {:>4}  exhausted recipe '{}' — stopping early",
                        i + 1,
                        name
                    );
                    break;
                }
                Err(err) => return Err(err.into()),
            }
        }

        sampler.save_sampler_state(None)?;

        #[cfg(feature = "extended-metrics")]
        if !source_metrics.is_empty() {
            println!();
            print_metric_summary(&source_metrics);
        }

        #[cfg(all(feature = "extended-metrics", feature = "bm25-mining"))]
        {
            let (fallback, total) = sampler.bm25_fallback_stats();
            // Skip the line entirely when nothing was counted (avoids 0/0).
            if total > 0 {
                let pct = fallback as f64 / total as f64 * 100.0;
                println!("bm25 fallback rate : {}/{} ({:.1}%)", fallback, total, pct);
            }
        }
    } else {
        // Default mode: one triplet batch. Sampler state is saved only on success.
        match sampler.next_triplet_batch(selected_split) {
            Ok(triplet_batch) => {
                if triplet_batch.triplets.is_empty() {
                    println!(
                        "Triplet sampling produced no results. Ensure multiple records per source exist."
                    );
                } else {
                    print_triplet_batch(&chunking, &triplet_batch, split_store.as_ref());
                }
                sampler.save_sampler_state(None)?;
                #[cfg(all(feature = "extended-metrics", feature = "bm25-mining"))]
                {
                    let (fallback, total) = sampler.bm25_fallback_stats();
                    // Skip the line entirely when nothing was counted (avoids 0/0).
                    if total > 0 {
                        let pct = fallback as f64 / total as f64 * 100.0;
                        println!("bm25 fallback rate : {}/{} ({:.1}%)", fallback, total, pct);
                    }
                }
            }
            Err(SamplerError::Exhausted(name)) => {
                eprintln!(
                    "Triplet sampler exhausted recipe '{}'. Ensure both positive and negative examples exist.",
                    name
                );
            }
            Err(err) => return Err(err.into()),
        }
    }

    Ok(())
}
12✔
737

738
struct MultiSourceDemoConfigSnapshot {
739
    seed: u64,
740
    batch_size: usize,
741
    ingestion_max_records: usize,
742
    split: SplitLabel,
743
    split_ratios: SplitRatios,
744
    max_window_tokens: usize,
745
    overlap_tokens: Vec<usize>,
746
    summary_fallback_tokens: usize,
747
}
748

NEW
749
fn print_demo_config(cfg: &MultiSourceDemoConfigSnapshot) {
×
NEW
750
    let overlaps: Vec<String> = cfg.overlap_tokens.iter().map(|t| t.to_string()).collect();
×
NEW
751
    println!("=== sampler config ===");
×
NEW
752
    println!("seed                 : {}", cfg.seed);
×
NEW
753
    println!("batch_size           : {}", cfg.batch_size);
×
NEW
754
    println!("ingestion_max_records: {}", cfg.ingestion_max_records);
×
NEW
755
    println!("split                : {:?}", cfg.split);
×
NEW
756
    println!(
×
757
        "split_ratios         : train={:.2} val={:.2} test={:.2}",
758
        cfg.split_ratios.train, cfg.split_ratios.validation, cfg.split_ratios.test
759
    );
NEW
760
    println!("max_window_tokens    : {}", cfg.max_window_tokens);
×
NEW
761
    println!("overlap_tokens       : [{}]", overlaps.join(", "));
×
NEW
762
    println!(
×
763
        "summary_fallback     : {} tokens (0 = disabled)",
764
        cfg.summary_fallback_tokens
765
    );
NEW
766
    println!();
×
NEW
767
}
×
768

769
/// Parses `raw` as a strictly positive `usize`.
///
/// Returns an error message mentioning `--batch-size` when `raw` is not a valid unsigned
/// integer or when it parses to zero.
fn parse_positive_usize(raw: &str) -> Result<usize, String> {
    match raw.parse::<usize>() {
        Ok(0) => Err("--batch-size must be greater than zero".to_string()),
        Ok(value) => Ok(value),
        Err(_) => Err(format!(
            "Could not parse --batch-size value '{}' as a positive integer",
            raw
        )),
    }
}
17✔
781

782
/// Returns `source_baseline / max_baseline` clamped to `[0.0, 1.0]`.
///
/// Yields `0.0` when either baseline is zero.
fn suggested_balancing_weight(max_baseline: u128, source_baseline: u128) -> f32 {
    match (max_baseline, source_baseline) {
        (0, _) | (_, 0) => 0.0,
        (largest, own) => {
            let ratio = own as f64 / largest as f64;
            ratio.clamp(0.0, 1.0) as f32
        }
    }
}
13✔
788

789
/// Returns `min_nonzero_baseline / source_baseline` clamped to `[0.0, 1.0]`.
///
/// Yields `0.0` when either baseline is zero.
fn suggested_oversampling_weight(min_nonzero_baseline: u128, source_baseline: u128) -> f32 {
    match (min_nonzero_baseline, source_baseline) {
        (0, _) | (_, 0) => 0.0,
        (smallest, own) => {
            let ratio = smallest as f64 / own as f64;
            ratio.clamp(0.0, 1.0) as f32
        }
    }
}
13✔
795

796
fn parse_cli<T, I>(args: I) -> Result<Option<T>, Box<dyn Error>>
22✔
797
where
22✔
798
    T: Parser,
22✔
799
    I: IntoIterator,
22✔
800
    I::Item: Into<std::ffi::OsString> + Clone,
22✔
801
{
802
    match T::try_parse_from(args) {
22✔
803
        Ok(cli) => Ok(Some(cli)),
17✔
804
        Err(err) => match err.kind() {
5✔
805
            ErrorKind::DisplayHelp | ErrorKind::DisplayVersion => {
806
                err.print()?;
3✔
807
                Ok(None)
3✔
808
            }
809
            _ => Err(err.into()),
2✔
810
        },
811
    }
812
}
22✔
813

814
fn parse_split_ratios_arg(raw: &str) -> Result<SplitRatios, String> {
11✔
815
    let parts: Vec<&str> = raw.split(',').collect();
11✔
816
    if parts.len() != 3 {
11✔
817
        return Err("--split-ratios expects exactly 3 comma-separated values".to_string());
1✔
818
    }
10✔
819
    let train = parts[0]
10✔
820
        .trim()
10✔
821
        .parse::<f32>()
10✔
822
        .map_err(|_| format!("invalid train ratio '{}': must be a float", parts[0].trim()))?;
10✔
823
    let validation = parts[1].trim().parse::<f32>().map_err(|_| {
9✔
824
        format!(
1✔
825
            "invalid validation ratio '{}': must be a float",
826
            parts[1].trim()
1✔
827
        )
828
    })?;
1✔
829
    let test = parts[2]
8✔
830
        .trim()
8✔
831
        .parse::<f32>()
8✔
832
        .map_err(|_| format!("invalid test ratio '{}': must be a float", parts[2].trim()))?;
8✔
833
    let ratios = SplitRatios {
7✔
834
        train,
7✔
835
        validation,
7✔
836
        test,
7✔
837
    };
7✔
838
    let sum = ratios.train + ratios.validation + ratios.test;
7✔
839
    if (sum - 1.0).abs() > 1e-5 {
7✔
840
        return Err(format!(
1✔
841
            "split ratios must sum to 1.0, got {:.6} (train={}, validation={}, test={})",
1✔
842
            sum, ratios.train, ratios.validation, ratios.test
1✔
843
        ));
1✔
844
    }
6✔
845
    if ratios.train < 0.0 || ratios.validation < 0.0 || ratios.test < 0.0 {
6✔
846
        return Err("split ratios must be non-negative".to_string());
1✔
847
    }
5✔
848
    Ok(ratios)
5✔
849
}
11✔
850

851
fn print_triplet_batch(
1✔
852
    strategy: &ChunkingStrategy,
1✔
853
    batch: &TripletBatch,
1✔
854
    split_store: &impl SplitStore,
1✔
855
) {
1✔
856
    println!("=== triplet batch ===");
1✔
857
    for (idx, triplet) in batch.triplets.iter().enumerate() {
1✔
858
        println!("--- triplet #{} ---", idx);
1✔
859
        println!("recipe       : {}", triplet.recipe);
1✔
860
        println!("sample_weight: {:.4}", triplet.weight);
1✔
861
        if let Some(instr) = &triplet.instruction {
1✔
862
            println!("instruction shown to model:\n{}\n", instr);
1✔
863
        }
1✔
864
        #[cfg(feature = "extended-metrics")]
865
        let (pos_sim, neg_sim) = {
1✔
866
            use crate::metrics::lexical_similarity_scores;
867
            (
1✔
868
                Some(lexical_similarity_scores(
1✔
869
                    &triplet.anchor.text,
1✔
870
                    &triplet.positive.text,
1✔
871
                )),
1✔
872
                Some(lexical_similarity_scores(
1✔
873
                    &triplet.anchor.text,
1✔
874
                    &triplet.negative.text,
1✔
875
                )),
1✔
876
            )
1✔
877
        };
878
        #[cfg(not(feature = "extended-metrics"))]
879
        let (pos_sim, neg_sim): (Option<(f32, f32)>, Option<(f32, f32)>) = (None, None);
880
        print_chunk_block("ANCHOR", &triplet.anchor, strategy, split_store, None);
1✔
881
        print_chunk_block(
1✔
882
            "POSITIVE",
1✔
883
            &triplet.positive,
1✔
884
            strategy,
1✔
885
            split_store,
1✔
886
            pos_sim,
1✔
887
        );
888
        print_chunk_block(
1✔
889
            "NEGATIVE",
1✔
890
            &triplet.negative,
1✔
891
            strategy,
1✔
892
            split_store,
1✔
893
            neg_sim,
1✔
894
        );
895
    }
896
    print_source_summary(
1✔
897
        "triplet anchors",
1✔
898
        batch
1✔
899
            .triplets
1✔
900
            .iter()
1✔
901
            .map(|triplet| triplet.anchor.record_id.as_str()),
1✔
902
    );
903
    print_recipe_context_by_source(
1✔
904
        "triplet recipes by source",
1✔
905
        batch
1✔
906
            .triplets
1✔
907
            .iter()
1✔
908
            .map(|triplet| (triplet.anchor.record_id.as_str(), triplet.recipe.as_str())),
1✔
909
    );
910
}
1✔
911

912
fn print_text_batch(strategy: &ChunkingStrategy, batch: &TextBatch, split_store: &impl SplitStore) {
2✔
913
    println!("=== text batch ===");
2✔
914
    for (idx, sample) in batch.samples.iter().enumerate() {
5✔
915
        println!("--- sample #{} ---", idx);
5✔
916
        println!("recipe       : {}", sample.recipe);
5✔
917
        println!("sample_weight: {:.4}", sample.weight);
5✔
918
        if let Some(instr) = &sample.instruction {
5✔
919
            println!("instruction shown to model:\n{}\n", instr);
1✔
920
        }
4✔
921
        print_chunk_block("TEXT", &sample.chunk, strategy, split_store, None);
5✔
922
    }
923
    print_source_summary(
2✔
924
        "text samples",
2✔
925
        batch
2✔
926
            .samples
2✔
927
            .iter()
2✔
928
            .map(|sample| sample.chunk.record_id.as_str()),
5✔
929
    );
930
    print_recipe_context_by_source(
2✔
931
        "text recipes by source",
2✔
932
        batch
2✔
933
            .samples
2✔
934
            .iter()
2✔
935
            .map(|sample| (sample.chunk.record_id.as_str(), sample.recipe.as_str())),
5✔
936
    );
937
}
2✔
938

939
fn print_pair_batch(
1✔
940
    strategy: &ChunkingStrategy,
1✔
941
    batch: &SampleBatch,
1✔
942
    split_store: &impl SplitStore,
1✔
943
) {
1✔
944
    println!("=== pair batch ===");
1✔
945
    for (idx, pair) in batch.pairs.iter().enumerate() {
1✔
946
        println!("--- pair #{} ---", idx);
1✔
947
        println!("recipe       : {}", pair.recipe);
1✔
948
        println!("label        : {:?}", pair.label);
1✔
949
        if let Some(reason) = &pair.reason {
1✔
950
            println!("reason       : {}", reason);
1✔
951
        }
1✔
952
        print_chunk_block("ANCHOR", &pair.anchor, strategy, split_store, None);
1✔
953
        print_chunk_block("OTHER", &pair.positive, strategy, split_store, None);
1✔
954
    }
955
    print_source_summary(
1✔
956
        "pair anchors",
1✔
957
        batch
1✔
958
            .pairs
1✔
959
            .iter()
1✔
960
            .map(|pair| pair.anchor.record_id.as_str()),
1✔
961
    );
962
    print_recipe_context_by_source(
1✔
963
        "pair recipes by source",
1✔
964
        batch
1✔
965
            .pairs
1✔
966
            .iter()
1✔
967
            .map(|pair| (pair.anchor.record_id.as_str(), pair.recipe.as_str())),
1✔
968
    );
969
}
1✔
970

971
fn print_text_recipes(recipes: &[TextRecipe]) {
2✔
972
    println!("=== available text recipes ===");
2✔
973
    for recipe in recipes {
4✔
974
        println!(
4✔
975
            "- {} (weight: {:.3}) selector={:?}",
976
            recipe.name, recipe.weight, recipe.selector
977
        );
978
        if let Some(instr) = &recipe.instruction {
4✔
979
            println!("  instruction: {}", instr);
1✔
980
        }
3✔
981
    }
982
}
2✔
983

984
#[cfg(feature = "extended-metrics")]
NEW
UNCOV
985
/// Computes the arithmetic mean and the median of `vals`.
///
/// `vals` is sorted in place (ascending, by `f32::total_cmp`) as a side effect. For an
/// even number of values the median is the average of the two middle elements.
///
/// Returns `(0.0, 0.0)` for an empty slice instead of panicking on the middle-element
/// lookup.
fn metric_mean_median(vals: &mut [f32]) -> (f32, f32) {
    let len = vals.len();
    if len == 0 {
        return (0.0, 0.0);
    }

    let mean = vals.iter().sum::<f32>() / len as f32;

    // `total_cmp` is a genuine total order even when NaN is present, which the slice
    // sort routines require of their comparator.
    vals.sort_unstable_by(f32::total_cmp);

    let mid = len / 2;
    let median = if len % 2 == 1 {
        vals[mid]
    } else {
        (vals[mid - 1] + vals[mid]) / 2.0
    };
    (mean, median)
}
×
995

996
#[cfg(feature = "extended-metrics")]
NEW
UNCOV
997
fn print_metric_summary(source_data: &HashMap<String, Vec<(f32, f32, f32, f32)>>) {
×
NEW
UNCOV
998
    let total: usize = source_data.values().map(|v| v.len()).sum();
×
NEW
UNCOV
999
    let n_sources = source_data.len();
×
NEW
UNCOV
1000
    println!(
×
1001
        "=== extended metrics summary ({} triplets, {} {}) ===",
1002
        total,
1003
        n_sources,
NEW
UNCOV
1004
        if n_sources == 1 { "source" } else { "sources" }
×
1005
    );
1006

1007
    // Returns [pos, neg] as (mean, median) pairs for one metric across entries.
NEW
1008
    fn metric_pair(
×
NEW
1009
        entries: &[(f32, f32, f32, f32)],
×
NEW
1010
        pos_idx: usize,
×
NEW
1011
        neg_idx: usize,
×
NEW
1012
    ) -> [(f32, f32); 2] {
×
NEW
1013
        let extract = |idx: usize| -> Vec<f32> {
×
NEW
1014
            entries
×
NEW
1015
                .iter()
×
NEW
1016
                .map(|e| match idx {
×
NEW
1017
                    0 => e.0,
×
NEW
1018
                    1 => e.1,
×
NEW
1019
                    2 => e.2,
×
NEW
1020
                    _ => e.3,
×
NEW
1021
                })
×
NEW
1022
                .collect()
×
NEW
1023
        };
×
NEW
1024
        let mut pos_vals = extract(pos_idx);
×
NEW
1025
        let mut neg_vals = extract(neg_idx);
×
NEW
1026
        [
×
NEW
1027
            metric_mean_median(&mut pos_vals),
×
NEW
1028
            metric_mean_median(&mut neg_vals),
×
NEW
1029
        ]
×
NEW
1030
    }
×
1031

NEW
1032
    fn print_metric_section(
×
NEW
1033
        label: &str,
×
NEW
1034
        sources: &[&String],
×
NEW
1035
        source_data: &HashMap<String, Vec<(f32, f32, f32, f32)>>,
×
NEW
1036
        pos_idx: usize,
×
NEW
1037
        neg_idx: usize,
×
NEW
1038
        total: usize,
×
NEW
1039
        n_sources: usize,
×
NEW
1040
    ) {
×
1041
        const SEP: usize = 83;
NEW
1042
        println!();
×
NEW
1043
        println!("[{}]", label);
×
NEW
1044
        println!(
×
1045
            "{:<24} {:>5}  {:<16} {:<16} {:<16}",
1046
            "source", "n", "positive", "negative", "gap (pos\u{2212}neg)"
1047
        );
NEW
1048
        println!(
×
1049
            "{:<24} {:>5}  {:<16} {:<16} {:<16}",
1050
            "", "", "mean / median", "mean / median", "mean / median"
1051
        );
NEW
1052
        println!("{}", "-".repeat(SEP));
×
NEW
1053
        for source in sources {
×
NEW
1054
            let entries = &source_data[*source];
×
NEW
1055
            let [pos, neg] = metric_pair(entries, pos_idx, neg_idx);
×
NEW
1056
            let gap_mean = pos.0 - neg.0;
×
NEW
1057
            let gap_med = pos.1 - neg.1;
×
NEW
1058
            println!(
×
NEW
1059
                "{:<24} {:>5}  {:.3} / {:.3}     {:.3} / {:.3}     {:+.3} / {:+.3}",
×
NEW
1060
                source,
×
NEW
1061
                entries.len(),
×
NEW
1062
                pos.0,
×
NEW
1063
                pos.1,
×
NEW
1064
                neg.0,
×
NEW
1065
                neg.1,
×
NEW
1066
                gap_mean,
×
NEW
1067
                gap_med,
×
NEW
1068
            );
×
NEW
1069
        }
×
NEW
1070
        if n_sources > 1 {
×
NEW
1071
            let all: Vec<(f32, f32, f32, f32)> = source_data.values().flatten().copied().collect();
×
NEW
1072
            let [pos, neg] = metric_pair(&all, pos_idx, neg_idx);
×
NEW
1073
            let gap_mean = pos.0 - neg.0;
×
NEW
1074
            let gap_med = pos.1 - neg.1;
×
NEW
1075
            println!("{}", "-".repeat(SEP));
×
NEW
1076
            println!(
×
NEW
1077
                "{:<24} {:>5}  {:.3} / {:.3}     {:.3} / {:.3}     {:+.3} / {:+.3}",
×
NEW
1078
                "ALL", total, pos.0, pos.1, neg.0, neg.1, gap_mean, gap_med,
×
NEW
1079
            );
×
NEW
1080
        }
×
NEW
1081
    }
×
1082

NEW
1083
    let mut sources: Vec<&String> = source_data.keys().collect();
×
NEW
1084
    sources.sort();
×
1085

NEW
1086
    print_metric_section(
×
NEW
1087
        "jaccard \u{2194} anchor",
×
NEW
1088
        &sources,
×
NEW
1089
        source_data,
×
1090
        0,
1091
        2,
NEW
1092
        total,
×
NEW
1093
        n_sources,
×
1094
    );
NEW
1095
    print_metric_section(
×
NEW
1096
        "byte-cos \u{2194} anchor",
×
NEW
1097
        &sources,
×
NEW
1098
        source_data,
×
1099
        1,
1100
        3,
NEW
1101
        total,
×
NEW
1102
        n_sources,
×
1103
    );
NEW
1104
    println!();
×
NEW
1105
}
×
1106

1107
trait ChunkDebug {
1108
    fn view_name(&self) -> String;
1109
}
1110

1111
impl ChunkDebug for RecordChunk {
1112
    fn view_name(&self) -> String {
10✔
1113
        match &self.view {
10✔
1114
            ChunkView::Window {
1115
                index,
8✔
1116
                span,
8✔
1117
                overlap,
8✔
1118
                start_ratio,
8✔
1119
            } => format!(
8✔
1120
                "window#index={} span={} overlap={} start_ratio={:.3} tokens={}",
1121
                index, span, overlap, start_ratio, self.tokens_estimate
1122
            ),
1123
            ChunkView::SummaryFallback { strategy, .. } => {
2✔
1124
                format!("summary:{} tokens={}", strategy, self.tokens_estimate)
2✔
1125
            }
1126
        }
1127
    }
10✔
1128
}
1129

1130
fn print_chunk_block(
10✔
1131
    title: &str,
10✔
1132
    chunk: &RecordChunk,
10✔
1133
    strategy: &ChunkingStrategy,
10✔
1134
    split_store: &impl SplitStore,
10✔
1135
    anchor_sim: Option<(f32, f32)>,
10✔
1136
) {
10✔
1137
    let chunk_weight = chunk_weight(strategy, chunk);
10✔
1138
    let split = split_store
10✔
1139
        .label_for(&chunk.record_id)
10✔
1140
        .map(|label| format!("{:?}", label))
10✔
1141
        .unwrap_or_else(|| "Unknown".to_string());
10✔
1142
    println!("--- {} ---", title);
10✔
1143
    println!("split        : {}", split);
10✔
1144
    println!("view         : {}", chunk.view_name());
10✔
1145
    println!("chunk_weight : {:.4}", chunk_weight);
10✔
1146
    println!("record_id    : {}", chunk.record_id);
10✔
1147
    println!("section_idx  : {}", chunk.section_idx);
10✔
1148
    println!("token_est    : {}", chunk.tokens_estimate);
10✔
1149
    if let Some((j, c)) = anchor_sim {
10✔
1150
        println!("jaccard(↔a)  : {:.4}  byte-cos(↔a): {:.4}", j, c);
2✔
1151
    }
8✔
1152
    println!("model_input (exact text sent to the model):");
10✔
1153
    println!(
10✔
1154
        "<<< BEGIN MODEL TEXT >>>\n{}\n<<< END MODEL TEXT >>>\n",
1155
        chunk.text
1156
    );
1157
}
10✔
1158

1159
fn print_source_summary<'a, I>(label: &str, ids: I)
4✔
1160
where
4✔
1161
    I: Iterator<Item = &'a str>,
4✔
1162
{
1163
    let mut counts: HashMap<SourceId, usize> = HashMap::new();
4✔
1164
    for id in ids {
7✔
1165
        let source = extract_source(id);
7✔
1166
        *counts.entry(source).or_insert(0) += 1;
7✔
1167
    }
7✔
1168
    if counts.is_empty() {
4✔
UNCOV
1169
        return;
×
1170
    }
4✔
1171
    let skew = source_skew(&counts);
4✔
1172
    let mut entries: Vec<(String, usize)> = counts.into_iter().collect();
4✔
1173
    entries.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
4✔
1174
    println!("--- {} by source ---", label);
4✔
1175
    if let Some(skew) = skew {
4✔
1176
        for entry in &skew.per_source {
4✔
1177
            println!(
4✔
1178
                "{}: count={} share={:.2}",
4✔
1179
                entry.source, entry.count, entry.share
4✔
1180
            );
4✔
1181
        }
4✔
1182
        println!(
4✔
1183
            "skew: sources={} total={} min={} max={} mean={:.2} ratio={:.2}",
1184
            skew.sources, skew.total, skew.min, skew.max, skew.mean, skew.ratio
1185
        );
1186
    } else {
UNCOV
1187
        for (source, count) in &entries {
×
UNCOV
1188
            println!("{source}: count={count}");
×
UNCOV
1189
        }
×
1190
    }
1191
}
4✔
1192

1193
fn print_recipe_context_by_source<'a, I>(label: &str, entries: I)
4✔
1194
where
4✔
1195
    I: Iterator<Item = (&'a str, &'a str)>,
4✔
1196
{
1197
    let mut counts: HashMap<SourceId, HashMap<String, usize>> = HashMap::new();
4✔
1198
    for (record_id, recipe) in entries {
7✔
1199
        let source = extract_source(record_id);
7✔
1200
        let entry = counts
7✔
1201
            .entry(source)
7✔
1202
            .or_default()
7✔
1203
            .entry(recipe.to_string())
7✔
1204
            .or_insert(0);
7✔
1205
        *entry += 1;
7✔
1206
    }
7✔
1207
    if counts.is_empty() {
4✔
UNCOV
1208
        return;
×
1209
    }
4✔
1210
    let mut sources: Vec<(SourceId, HashMap<String, usize>)> = counts.into_iter().collect();
4✔
1211
    sources.sort_by(|a, b| a.0.cmp(&b.0));
4✔
1212
    println!("--- {} ---", label);
4✔
1213
    for (source, recipes) in sources {
4✔
1214
        println!("{source}");
4✔
1215
        let mut entries: Vec<(String, usize)> = recipes.into_iter().collect();
4✔
1216
        entries.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
4✔
1217
        for (recipe, count) in entries {
5✔
1218
            println!("  - {recipe}={count}");
5✔
1219
        }
5✔
1220
    }
1221
}
4✔
1222

1223
fn extract_source(record_id: &str) -> SourceId {
16✔
1224
    record_id
16✔
1225
        .split_once("::")
16✔
1226
        .map(|(source, _)| source.to_string())
16✔
1227
        .unwrap_or_else(|| "unknown".to_string())
16✔
1228
}
16✔
1229

1230
#[cfg(test)]
1231
mod tests {
1232
    use super::*;
1233
    use crate::DataRecord;
1234
    use crate::DeterministicSplitStore;
1235
    use crate::data::{QualityScore, RecordSection, SectionRole};
1236
    use crate::source::{SourceCursor, SourceSnapshot};
1237
    use chrono::Utc;
1238
    use tempfile::tempdir;
1239

1240
    /// Minimal in-memory `DataSource` test double for example app tests.
1241
    struct TestSource {
1242
        id: String,
1243
        count: Option<u128>,
1244
        recipes: Vec<TripletRecipe>,
1245
    }
1246

1247
    impl DataSource for TestSource {
1248
        fn id(&self) -> &str {
130✔
1249
            &self.id
130✔
1250
        }
130✔
1251

1252
        fn refresh(
30✔
1253
            &self,
30✔
1254
            _config: &SamplerConfig,
30✔
1255
            _cursor: Option<&SourceCursor>,
30✔
1256
            _limit: Option<usize>,
30✔
1257
        ) -> Result<SourceSnapshot, SamplerError> {
30✔
1258
            Ok(SourceSnapshot {
30✔
1259
                records: Vec::new(),
30✔
1260
                cursor: SourceCursor {
30✔
1261
                    last_seen: Utc::now(),
30✔
1262
                    revision: 0,
30✔
1263
                },
30✔
1264
            })
30✔
1265
        }
30✔
1266

1267
        fn reported_record_count(&self, _config: &SamplerConfig) -> Result<u128, SamplerError> {
2✔
1268
            self.count.ok_or_else(|| SamplerError::SourceInconsistent {
2✔
1269
                source_id: self.id.clone(),
1✔
1270
                details: "test source has no configured exact count".to_string(),
1✔
1271
            })
1✔
1272
        }
2✔
1273

1274
        fn default_triplet_recipes(&self) -> Vec<TripletRecipe> {
10✔
1275
            self.recipes.clone()
10✔
1276
        }
10✔
1277
    }
1278

1279
    struct ConfigRequiredSource {
1280
        id: String,
1281
        expected_seed: u64,
1282
    }
1283

1284
    impl DataSource for ConfigRequiredSource {
1285
        fn id(&self) -> &str {
1✔
1286
            &self.id
1✔
1287
        }
1✔
1288

1289
        fn refresh(
1✔
1290
            &self,
1✔
1291
            _config: &SamplerConfig,
1✔
1292
            _cursor: Option<&SourceCursor>,
1✔
1293
            _limit: Option<usize>,
1✔
1294
        ) -> Result<SourceSnapshot, SamplerError> {
1✔
1295
            Ok(SourceSnapshot {
1✔
1296
                records: Vec::new(),
1✔
1297
                cursor: SourceCursor {
1✔
1298
                    last_seen: Utc::now(),
1✔
1299
                    revision: 0,
1✔
1300
                },
1✔
1301
            })
1✔
1302
        }
1✔
1303

1304
        fn reported_record_count(&self, config: &SamplerConfig) -> Result<u128, SamplerError> {
2✔
1305
            if config.seed == self.expected_seed {
2✔
1306
                Ok(1)
1✔
1307
            } else {
1308
                Err(SamplerError::SourceInconsistent {
1✔
1309
                    source_id: self.id.clone(),
1✔
1310
                    details: format!(
1✔
1311
                        "expected sampler seed {} but got {}",
1✔
1312
                        self.expected_seed, config.seed
1✔
1313
                    ),
1✔
1314
                })
1✔
1315
            }
1316
        }
2✔
1317

1318
        fn default_triplet_recipes(&self) -> Vec<TripletRecipe> {
2✔
1319
            Vec::new()
2✔
1320
        }
2✔
1321
    }
1322

1323
    fn default_recipe(name: &str) -> TripletRecipe {
9✔
1324
        TripletRecipe {
9✔
1325
            name: name.to_string().into(),
9✔
1326
            anchor: crate::config::Selector::Role(SectionRole::Anchor),
9✔
1327
            positive_selector: crate::config::Selector::Role(SectionRole::Context),
9✔
1328
            negative_selector: crate::config::Selector::Role(SectionRole::Context),
9✔
1329
            negative_strategy: crate::config::NegativeStrategy::WrongArticle,
9✔
1330
            weight: 1.0,
9✔
1331
            instruction: None,
9✔
1332
            allow_same_anchor_positive: false,
9✔
1333
        }
9✔
1334
    }
9✔
1335

1336
    #[test]
1337
    fn parse_helpers_validate_inputs() {
1✔
1338
        assert_eq!(parse_positive_usize("2").unwrap(), 2);
1✔
1339
        assert!(parse_positive_usize("0").is_err());
1✔
1340
        assert!(parse_positive_usize("abc").is_err());
1✔
1341

1342
        let split = parse_split_ratios_arg("0.8,0.1,0.1").unwrap();
1✔
1343
        assert!((split.train - 0.8).abs() < 1e-6);
1✔
1344
        assert!(parse_split_ratios_arg("0.8,0.1").is_err());
1✔
1345
        assert!(parse_split_ratios_arg("1.0,0.0,0.1").is_err());
1✔
1346
        assert!(parse_split_ratios_arg("-0.1,0.6,0.5").is_err());
1✔
1347
    }
1✔
1348

1349
    #[test]
1350
    fn suggested_balancing_weight_is_longest_normalized_and_bounded() {
1✔
1351
        assert!((suggested_balancing_weight(100, 100) - 1.0).abs() < 1e-6);
1✔
1352
        assert!((suggested_balancing_weight(400, 100) - 0.25).abs() < 1e-6);
1✔
1353
        assert!((suggested_balancing_weight(400, 400) - 1.0).abs() < 1e-6);
1✔
1354
        assert_eq!(suggested_balancing_weight(0, 100), 0.0);
1✔
1355
        assert_eq!(suggested_balancing_weight(100, 0), 0.0);
1✔
1356
    }
1✔
1357

1358
    #[test]
1359
    fn suggested_oversampling_weight_is_inverse_in_unit_interval() {
1✔
1360
        assert!((suggested_oversampling_weight(100, 100) - 1.0).abs() < 1e-6);
1✔
1361
        assert!((suggested_oversampling_weight(100, 400) - 0.25).abs() < 1e-6);
1✔
1362
        assert!((suggested_oversampling_weight(100, 1000) - 0.1).abs() < 1e-6);
1✔
1363
        assert_eq!(suggested_oversampling_weight(0, 100), 0.0);
1✔
1364
        assert_eq!(suggested_oversampling_weight(100, 0), 0.0);
1✔
1365
    }
1✔
1366

1367
    #[test]
1368
    fn parse_cli_handles_help_and_invalid_args() {
1✔
1369
        let help = parse_cli::<EstimateCapacityCli, _>(["estimate_capacity", "--help"]).unwrap();
1✔
1370
        assert!(help.is_none());
1✔
1371

1372
        let err = parse_cli::<EstimateCapacityCli, _>(["estimate_capacity", "--unknown"]);
1✔
1373
        assert!(err.is_err());
1✔
1374
    }
1✔
1375

1376
    #[test]
1377
    fn run_estimate_capacity_succeeds_with_reported_counts() {
1✔
1378
        let result = run_estimate_capacity(
1✔
1379
            std::iter::empty::<String>(),
1✔
1380
            |roots| {
1✔
1381
                assert!(roots.is_empty());
1✔
1382
                Ok(())
1✔
1383
            },
1✔
1384
            |_| {
1✔
1385
                vec![Box::new(TestSource {
1✔
1386
                    id: "source_a".into(),
1✔
1387
                    count: Some(12),
1✔
1388
                    recipes: vec![default_recipe("r1")],
1✔
1389
                }) as DynSource]
1✔
1390
            },
1✔
1391
        );
1392

1393
        assert!(result.is_ok());
1✔
1394
    }
1✔
1395

1396
    #[test]
1397
    fn run_estimate_capacity_errors_when_source_count_missing() {
1✔
1398
        let result = run_estimate_capacity(
1✔
1399
            std::iter::empty::<String>(),
1✔
1400
            |_| Ok(()),
1✔
1401
            |_| {
1✔
1402
                vec![Box::new(TestSource {
1✔
1403
                    id: "source_missing".into(),
1✔
1404
                    count: None,
1✔
1405
                    recipes: vec![default_recipe("r1")],
1✔
1406
                }) as DynSource]
1✔
1407
            },
1✔
1408
        );
1409

1410
        let err = result.unwrap_err().to_string();
1✔
1411
        assert!(err.contains("failed to report exact record count"));
1✔
1412
    }
1✔
1413

1414
    #[test]
1415
    fn run_estimate_capacity_propagates_root_resolution_error() {
1✔
1416
        let result = run_estimate_capacity(
1✔
1417
            std::iter::empty::<String>(),
1✔
1418
            |_| Err("root resolution failed".into()),
1✔
1419
            |_: &()| Vec::<DynSource>::new(),
×
1420
        );
1421

1422
        let err = result.unwrap_err().to_string();
1✔
1423
        assert!(err.contains("root resolution failed"));
1✔
1424
    }
1✔
1425

1426
    #[test]
1427
    fn run_estimate_capacity_configures_sources_centrally_before_counting() {
1✔
1428
        let result = run_estimate_capacity(
1✔
1429
            std::iter::empty::<String>(),
1✔
1430
            |_| Ok(()),
1✔
1431
            |_| {
1✔
1432
                vec![Box::new(ConfigRequiredSource {
1✔
1433
                    id: "requires_config".into(),
1✔
1434
                    expected_seed: 99,
1✔
1435
                }) as DynSource]
1✔
1436
            },
1✔
1437
        );
1438

1439
        assert!(result.is_ok());
1✔
1440
    }
1✔
1441

1442
    #[test]
1443
    fn config_required_source_refresh_and_seed_mismatch_are_exercised() {
1✔
1444
        let source = ConfigRequiredSource {
1✔
1445
            id: "cfg-source".to_string(),
1✔
1446
            expected_seed: 42,
1✔
1447
        };
1✔
1448

1449
        let refreshed = source
1✔
1450
            .refresh(&SamplerConfig::default(), None, None)
1✔
1451
            .unwrap();
1✔
1452
        assert!(refreshed.records.is_empty());
1✔
1453

1454
        let mismatched = source.reported_record_count(&SamplerConfig {
1✔
1455
            seed: 7,
1✔
1456
            ..SamplerConfig::default()
1✔
1457
        });
1✔
1458
        assert!(matches!(
1✔
1459
            mismatched,
1✔
1460
            Err(SamplerError::SourceInconsistent { .. })
1461
        ));
1462

1463
        assert!(source.default_triplet_recipes().is_empty());
1✔
1464
    }
1✔
1465

1466
    #[test]
1467
    fn run_multi_source_demo_exhausted_paths_return_ok() {
1✔
1468
        struct OneRecordSource;
1469

1470
        impl DataSource for OneRecordSource {
1471
            fn id(&self) -> &str {
48✔
1472
                "one_record"
48✔
1473
            }
48✔
1474

1475
            fn refresh(
11✔
1476
                &self,
11✔
1477
                _config: &SamplerConfig,
11✔
1478
                _cursor: Option<&SourceCursor>,
11✔
1479
                _limit: Option<usize>,
11✔
1480
            ) -> Result<SourceSnapshot, SamplerError> {
11✔
1481
                let now = Utc::now();
11✔
1482
                Ok(SourceSnapshot {
11✔
1483
                    records: vec![DataRecord {
11✔
1484
                        id: "one_record::r1".to_string(),
11✔
1485
                        source: "one_record".to_string(),
11✔
1486
                        created_at: now,
11✔
1487
                        updated_at: now,
11✔
1488
                        quality: QualityScore { trust: 1.0 },
11✔
1489
                        taxonomy: Vec::new(),
11✔
1490
                        sections: vec![
11✔
1491
                            RecordSection {
11✔
1492
                                role: SectionRole::Anchor,
11✔
1493
                                heading: Some("title".to_string()),
11✔
1494
                                text: "anchor".to_string(),
11✔
1495
                                sentences: vec!["anchor".to_string()],
11✔
1496
                            },
11✔
1497
                            RecordSection {
11✔
1498
                                role: SectionRole::Context,
11✔
1499
                                heading: Some("body".to_string()),
11✔
1500
                                text: "context".to_string(),
11✔
1501
                                sentences: vec!["context".to_string()],
11✔
1502
                            },
11✔
1503
                        ],
11✔
1504
                        meta_prefix: None,
11✔
1505
                    }],
11✔
1506
                    cursor: SourceCursor {
11✔
1507
                        last_seen: now,
11✔
1508
                        revision: 0,
11✔
1509
                    },
11✔
1510
                })
11✔
1511
            }
11✔
1512

UNCOV
1513
            fn reported_record_count(&self, _config: &SamplerConfig) -> Result<u128, SamplerError> {
×
UNCOV
1514
                Ok(1)
×
UNCOV
1515
            }
×
1516

1517
            fn default_triplet_recipes(&self) -> Vec<TripletRecipe> {
3✔
1518
                vec![default_recipe("single_record_recipe")]
3✔
1519
            }
3✔
1520
        }
1521

1522
        for mode in ["--pair-batch", "--text-recipes", ""] {
3✔
1523
            let dir = tempdir().unwrap();
3✔
1524
            let split_store_path = dir.path().join("split_store.bin");
3✔
1525
            let mut args = vec![
3✔
1526
                "--split-store-path".to_string(),
3✔
1527
                split_store_path.to_string_lossy().to_string(),
3✔
1528
            ];
1529
            if !mode.is_empty() {
3✔
1530
                args.push(mode.to_string());
2✔
1531
            }
2✔
1532

1533
            let result = run_multi_source_demo(
3✔
1534
                args.into_iter(),
3✔
1535
                |_| Ok(()),
3✔
1536
                |_| vec![Box::new(OneRecordSource) as DynSource],
3✔
1537
            );
1538
            assert!(result.is_ok());
3✔
1539
        }
1540
    }
1✔
1541

1542
    /// Checks three CLI parsing outcomes for the multi-source demo:
    /// `--help` yields `Ok(None)`, `--batch-size 0` is an error, and the bare
    /// program name parses successfully.
    #[test]
    fn parse_multi_source_cli_handles_help_and_batch_size_validation() {
        const PROGRAM: &str = "multi_source_demo";

        // Help is reported as "nothing to run" rather than as a failure.
        let help_outcome = parse_cli::<MultiSourceDemoCli, _>([PROGRAM, "--help"]).unwrap();
        assert!(help_outcome.is_none());

        // A zero batch size must be rejected.
        let zero_batch = parse_cli::<MultiSourceDemoCli, _>([PROGRAM, "--batch-size", "0"]);
        assert!(zero_batch.is_err());

        // No flags at all is a valid invocation.
        let bare_invocation = parse_cli::<MultiSourceDemoCli, _>([PROGRAM]);
        assert!(bare_invocation.is_ok());
    }
1✔
1553

1554
    /// `--version` on a CLI that declares a version resolves to `Ok(None)`.
    #[test]
    fn parse_cli_handles_display_version_path() {
        // Minimal clap parser that only exists to expose a `--version` flag.
        #[derive(Debug, Parser)]
        #[command(name = "version_test", version = "1.0.0")]
        struct VersionCli {}

        let argv = ["version_test", "--version"];
        let outcome = parse_cli::<VersionCli, _>(argv).unwrap();

        assert!(outcome.is_none());
    }
1✔
1563

1564
    /// Runs the demo with `--list-text-recipes` against a source configured with
    /// one recipe (`recipe_a`) and expects the run to succeed.
    ///
    /// The argument vector is consumed with `into_iter()`: it is never reused
    /// after the call, so it needs neither `mut` nor `drain(..)`. This matches
    /// how the sibling demo tests hand over their arguments.
    #[test]
    fn run_multi_source_demo_list_text_recipes_path_succeeds() {
        // Keep the temp dir guard alive for the whole test so the path stays valid.
        let dir = tempdir().unwrap();
        let split_store_path = dir.path().join("recipes_split_store.bin");
        let args = vec![
            "--list-text-recipes".to_string(),
            "--split-store-path".to_string(),
            split_store_path.to_string_lossy().into_owned(),
        ];

        let result = run_multi_source_demo(
            args.into_iter(),
            |_| Ok(()),
            |_| {
                vec![Box::new(TestSource {
                    id: "source_for_recipes".into(),
                    count: Some(10),
                    recipes: vec![default_recipe("recipe_a")],
                }) as DynSource]
            },
        );

        assert!(result.is_ok());
    }
1✔
1587

1588
    /// Runs `--list-text-recipes` with an explicit `--split-store-path` against a
    /// source built with an empty `recipes` list and expects success.
    #[test]
    fn run_multi_source_demo_list_text_recipes_uses_explicit_split_store_path() {
        let scratch = tempdir().unwrap();
        let store_file = scratch.path().join("custom_split_store.bin");

        let argv = [
            String::from("--list-text-recipes"),
            String::from("--split-store-path"),
            store_file.to_string_lossy().into_owned(),
        ];

        let outcome = run_multi_source_demo(
            argv.into_iter(),
            |_| Ok(()),
            |_| {
                let source: DynSource = Box::new(TestSource {
                    id: "source_without_text_recipes".into(),
                    count: Some(1),
                    recipes: vec![],
                });
                vec![source]
            },
        );

        assert!(outcome.is_ok());
    }
1✔
1612

1613
    /// For each sampling mode (`--pair-batch`, `--text-recipes`, and no mode flag),
    /// runs the demo on the `validation` split with a source whose `count` is
    /// `Some(0)` and expects the run to succeed.
    #[test]
    fn run_multi_source_demo_sampling_modes_handle_empty_sources() {
        for mode_flag in [Some("--pair-batch"), Some("--text-recipes"), None] {
            let scratch = tempdir().unwrap();
            let store_file = scratch.path().join("empty_sources_split_store.bin");

            // The optional mode flag comes first, followed by the shared arguments.
            let mut argv: Vec<String> = mode_flag.into_iter().map(String::from).collect();
            argv.extend([
                String::from("--split-store-path"),
                store_file.to_string_lossy().into_owned(),
                String::from("--split"),
                String::from("validation"),
            ]);

            let outcome = run_multi_source_demo(
                argv.into_iter(),
                |_| Ok(()),
                |_| {
                    let source: DynSource = Box::new(TestSource {
                        id: "source_empty".into(),
                        count: Some(0),
                        recipes: vec![default_recipe("recipe_empty")],
                    });
                    vec![source]
                },
            );

            assert!(outcome.is_ok());
        }
    }
1✔
1643

1644
    /// An error returned by the root-resolution callback must surface in the
    /// error returned by `run_multi_source_demo`.
    #[test]
    fn run_multi_source_demo_propagates_root_resolution_error() {
        let scratch = tempdir().unwrap();
        let store_file = scratch.path().join("root_resolution_error_store.bin");

        let argv = [
            String::from("--split-store-path"),
            store_file.to_string_lossy().into_owned(),
        ];

        let outcome = run_multi_source_demo(
            argv.into_iter(),
            |_| Err("demo root resolution failed".into()),
            // The explicit `&()` parameter pins the root type, since the first
            // callback never produces an `Ok` value to infer it from.
            |_: &()| -> Vec<DynSource> { Vec::new() },
        );

        let message = outcome.unwrap_err().to_string();
        assert!(message.contains("demo root resolution failed"));
    }
1✔
1661

1662
    /// Smoke-tests the triplet, pair, text-batch and text-recipe printers on
    /// hand-built samples, then checks `extract_source` on an id that contains
    /// `::` and on one that does not.
    #[test]
    fn print_helpers_and_extract_source_cover_paths() {
        let ratios = SplitRatios::default();
        let split_store = DeterministicSplitStore::new(ratios, 42).unwrap();
        let chunking = ChunkingStrategy::default();

        // Three chunks: two windowed views and one summary-fallback view.
        let anchor_chunk = RecordChunk {
            record_id: "source_a::rec1".to_owned(),
            section_idx: 0,
            view: ChunkView::Window {
                index: 1,
                overlap: 2,
                span: 12,
                start_ratio: 0.25,
            },
            text: "anchor text".to_owned(),
            tokens_estimate: 8,
            quality: crate::data::QualityScore { trust: 0.9 },
        };
        let positive_chunk = RecordChunk {
            record_id: "source_a::rec2".to_owned(),
            section_idx: 1,
            view: ChunkView::SummaryFallback {
                strategy: "summary".to_owned(),
                weight: 0.7,
            },
            text: "positive text".to_owned(),
            tokens_estimate: 6,
            quality: crate::data::QualityScore { trust: 0.8 },
        };
        let negative_chunk = RecordChunk {
            record_id: "source_b::rec3".to_owned(),
            section_idx: 2,
            view: ChunkView::Window {
                index: 0,
                overlap: 0,
                span: 16,
                start_ratio: 0.0,
            },
            text: "negative text".to_owned(),
            tokens_estimate: 7,
            quality: crate::data::QualityScore { trust: 0.5 },
        };

        // Triplet printer: needs copies because the chunks are reused below.
        let triplets = TripletBatch {
            triplets: vec![crate::SampleTriplet {
                recipe: "triplet_recipe".to_owned(),
                anchor: anchor_chunk.clone(),
                positive: positive_chunk.clone(),
                negative: negative_chunk.clone(),
                weight: 1.0,
                instruction: Some("triplet instruction".to_owned()),
            }],
        };
        print_triplet_batch(&chunking, &triplets, &split_store);

        // Pair printer: last use of the anchor/positive chunks, so they are moved in.
        let pairs = SampleBatch {
            pairs: vec![crate::SamplePair {
                recipe: "pair_recipe".to_owned(),
                anchor: anchor_chunk,
                positive: positive_chunk,
                weight: 1.0,
                instruction: None,
                label: crate::PairLabel::Positive,
                reason: Some("same topic".to_owned()),
            }],
        };
        print_pair_batch(&chunking, &pairs, &split_store);

        // Text printer: takes ownership of the remaining chunk.
        let texts = TextBatch {
            samples: vec![crate::TextSample {
                recipe: "text_recipe".to_owned(),
                chunk: negative_chunk,
                weight: 0.8,
                instruction: Some("text instruction".to_owned()),
            }],
        };
        print_text_batch(&chunking, &texts, &split_store);

        let text_recipes = vec![TextRecipe {
            name: "recipe_name".into(),
            selector: crate::config::Selector::Role(SectionRole::Context),
            instruction: Some("instruction".into()),
            weight: 1.0,
        }];
        print_text_recipes(&text_recipes);

        // With a `::` in the id the leading part is returned; without one the
        // result is "unknown".
        assert_eq!(extract_source("source_a::record"), "source_a");
        assert_eq!(extract_source("record-without-delimiter"), "unknown");
    }
1✔
1752

1753
    /// Every `SplitArg` variant converts to the `SplitLabel` variant of the same name.
    ///
    /// Despite the test name, only the conversion is exercised here; nothing in
    /// this body parses a version flag.
    #[test]
    fn split_arg_conversion_and_version_parse_paths_are_covered() {
        let train: SplitLabel = SplitArg::Train.into();
        assert!(matches!(train, SplitLabel::Train));

        let validation: SplitLabel = SplitArg::Validation.into();
        assert!(matches!(validation, SplitLabel::Validation));

        let test: SplitLabel = SplitArg::Test.into();
        assert!(matches!(test, SplitLabel::Test));
    }
1✔
1765

1766
    /// A non-numeric value in each of the three comma-separated positions
    /// produces an error message naming the corresponding ratio.
    #[test]
    fn parse_split_ratios_reports_per_field_parse_errors() {
        // First field is not a number.
        let train_err = parse_split_ratios_arg("x,0.1,0.9").unwrap_err();
        assert!(train_err.contains("invalid train ratio"));

        // Second field is not a number.
        let validation_err = parse_split_ratios_arg("0.1,y,0.8").unwrap_err();
        assert!(validation_err.contains("invalid validation ratio"));

        // Third field is not a number.
        let test_err = parse_split_ratios_arg("0.1,0.2,z").unwrap_err();
        assert!(test_err.contains("invalid test ratio"));
    }
1✔
1784

1785
    /// For each sampling mode (`--pair-batch`, `--text-recipes`, and no mode flag),
    /// runs the demo against a single-record source built with an empty `recipes`
    /// list and expects the run to succeed.
    #[test]
    fn run_multi_source_demo_exhausted_paths_are_handled() {
        let mode_flags: [&[&str]; 3] = [&["--pair-batch"], &["--text-recipes"], &[]];

        for flags in mode_flags {
            let scratch = tempdir().unwrap();
            let store_file = scratch.path().join("exhausted_split_store.bin");

            // Mode flag (if any) first, then the split-store location.
            let argv: Vec<String> = flags
                .iter()
                .map(|flag| flag.to_string())
                .chain([
                    String::from("--split-store-path"),
                    store_file.to_string_lossy().into_owned(),
                ])
                .collect();

            let outcome = run_multi_source_demo(
                argv.into_iter(),
                |_| Ok(()),
                |_| {
                    let source: DynSource = Box::new(TestSource {
                        id: "source_without_recipes".into(),
                        count: Some(1),
                        recipes: vec![],
                    });
                    vec![source]
                },
            );

            assert!(outcome.is_ok());
        }
    }
1✔
1813
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc