• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

jzombie / rust-triplets / 22361174421

24 Feb 2026 04:57PM UTC coverage: 93.296% (+0.6%) from 92.675%
22361174421

push

github

web-flow
Add HF source (#7)

5314 of 5790 new or added lines in 8 files covered. (91.78%)

1 existing line in 1 file now uncovered.

14751 of 15811 relevant lines covered (93.3%)

2502.34 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

93.02
/src/example_apps.rs
1
use std::collections::HashMap;
2
use std::error::Error;
3
use std::path::PathBuf;
4
use std::sync::Arc;
5
use std::sync::Once;
6

7
use clap::{Parser, ValueEnum, error::ErrorKind};
8

9
use crate::config::{ChunkingStrategy, SamplerConfig, TripletRecipe};
10
use crate::data::ChunkView;
11
use crate::heuristics::{
12
    CapacityTotals, EFFECTIVE_NEGATIVES_PER_ANCHOR, EFFECTIVE_POSITIVES_PER_ANCHOR,
13
    estimate_source_split_capacity_from_counts, format_replay_factor, format_u128_with_commas,
14
    resolve_text_recipes_for_source, split_counts_for_total,
15
};
16
use crate::metrics::source_skew;
17
use crate::sampler::chunk_weight;
18
use crate::source::DataSource;
19
use crate::splits::{FileSplitStore, SplitLabel, SplitRatios, SplitStore};
20
use crate::{
21
    RecordChunk, SampleBatch, Sampler, SamplerError, SourceId, TextBatch, TextRecipe, TripletBatch,
22
    TripletSampler,
23
};
24

25
type DynSource = Box<dyn DataSource + 'static>;
26

27
fn init_example_tracing() {
13✔
28
    static INIT: Once = Once::new();
29
    INIT.call_once(|| {
13✔
30
        let env_filter = tracing_subscriber::EnvFilter::try_from_default_env()
1✔
31
            .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("triplets=info"));
1✔
32
        let _ = tracing_subscriber::fmt()
1✔
33
            .with_env_filter(env_filter)
1✔
34
            .try_init();
1✔
35
    });
1✔
36
}
13✔
37

38
#[derive(Debug, Clone, Copy, ValueEnum)]
39
/// CLI split selector mapped onto `SplitLabel`.
40
enum SplitArg {
41
    Train,
42
    Validation,
43
    Test,
44
}
45

46
impl From<SplitArg> for SplitLabel {
47
    fn from(value: SplitArg) -> Self {
6✔
48
        match value {
6✔
49
            SplitArg::Train => SplitLabel::Train,
1✔
50
            SplitArg::Validation => SplitLabel::Validation,
4✔
51
            SplitArg::Test => SplitLabel::Test,
1✔
52
        }
53
    }
6✔
54
}
55

56
#[derive(Debug, Parser)]
57
#[command(
58
    name = "estimate_capacity",
59
    disable_help_subcommand = true,
60
    about = "Metadata-only capacity estimation",
61
    long_about = "Estimate record, pair, triplet, and text-sample capacity using source-reported counts only (no data refresh).",
62
    after_help = "Source roots are optional and resolved in order by explicit arg, environment variables, then project defaults."
63
)]
64
/// CLI arguments for metadata-only capacity estimation.
65
struct EstimateCapacityCli {
66
    #[arg(
67
        long,
68
        default_value_t = 99,
69
        help = "Deterministic seed used for split allocation"
70
    )]
71
    seed: u64,
72
    #[arg(
73
        long = "split-ratios",
74
        value_name = "TRAIN,VALIDATION,TEST",
75
        value_parser = parse_split_ratios_arg,
76
        default_value = "0.8,0.1,0.1",
77
        help = "Comma-separated split ratios that must sum to 1.0"
78
    )]
79
    split: SplitRatios,
80
    #[arg(
81
        long = "source-root",
82
        value_name = "PATH",
83
        help = "Optional source root override, repeat as needed in source order"
84
    )]
85
    source_roots: Vec<String>,
86
}
87

88
#[derive(Debug, Parser)]
89
#[command(
90
    name = "multi_source_demo",
91
    disable_help_subcommand = true,
92
    about = "Run sampled batches from multiple sources",
93
    long_about = "Sample triplet, pair, or text batches from multiple sources and persist split/epoch state.",
94
    after_help = "Source roots are optional and resolved in order by explicit arg, environment variables, then project defaults."
95
)]
96
/// CLI for `multi_source_demo`.
97
///
98
/// Common usage:
99
/// - Keep default persistence file location: `.sampler_store/split_store.bin`
100
/// - Set an explicit file path: `--split-store-path /tmp/split_store.bin`
101
/// - Set a custom directory and keep default filename: `--split-store-dir /tmp/sampler_store`
102
/// - Repeat `--source-root <PATH>` to override source roots in order
103
struct MultiSourceDemoCli {
104
    #[arg(
105
        long = "text-recipes",
106
        help = "Emit a text batch instead of a triplet batch"
107
    )]
108
    show_text_samples: bool,
109
    #[arg(
110
        long = "pair-batch",
111
        help = "Emit a pair batch instead of a triplet batch"
112
    )]
113
    show_pair_samples: bool,
114
    #[arg(
115
        long = "list-text-recipes",
116
        help = "Print registered text recipes and exit"
117
    )]
118
    list_text_recipes: bool,
119
    #[arg(
120
        long = "batch-size",
121
        default_value_t = 4,
122
        value_parser = parse_positive_usize,
123
        help = "Batch size used for sampling"
124
    )]
125
    batch_size: usize,
126
    #[arg(long, help = "Optional deterministic seed override")]
127
    seed: Option<u64>,
128
    #[arg(long, value_enum, help = "Target split to sample from")]
129
    split: Option<SplitArg>,
130
    #[arg(
131
        long = "source-root",
132
        value_name = "PATH",
133
        help = "Optional source root override, repeat as needed in source order"
134
    )]
135
    source_roots: Vec<String>,
136
    #[arg(
137
        long = "split-store-path",
138
        value_name = "SPLIT_STORE_PATH",
139
        help = "Optional path for persisted split/epoch state file"
140
    )]
141
    split_store_path: Option<PathBuf>,
142
    #[arg(
143
        long = "split-store-dir",
144
        value_name = "DIR",
145
        conflicts_with = "split_store_path",
146
        help = "Optional directory for persisted split/epoch state file (uses split_store.bin filename)"
147
    )]
148
    split_store_dir: Option<PathBuf>,
149
}
150

151
#[derive(Debug, Clone)]
152
/// Source-level inventory used by capacity estimation output.
153
struct SourceInventory {
154
    source_id: String,
155
    reported_records: u128,
156
    triplet_recipes: Vec<TripletRecipe>,
157
}
158

159
/// Run the capacity-estimation CLI with injectable root resolution/source builders.
160
///
161
/// `build_sources` is construction-only; sampler configuration is applied
162
/// centrally by this function before any source calls.
163
pub fn run_estimate_capacity<R, Resolve, Build, I>(
4✔
164
    args_iter: I,
4✔
165
    resolve_roots: Resolve,
4✔
166
    build_sources: Build,
4✔
167
) -> Result<(), Box<dyn Error>>
4✔
168
where
4✔
169
    Resolve: FnOnce(Vec<String>) -> Result<R, Box<dyn Error>>,
4✔
170
    Build: FnOnce(&R) -> Vec<DynSource>,
4✔
171
    I: Iterator<Item = String>,
4✔
172
{
173
    init_example_tracing();
4✔
174

175
    let Some(cli) = parse_cli::<EstimateCapacityCli, _>(
4✔
176
        std::iter::once("estimate_capacity".to_string()).chain(args_iter),
4✔
177
    )?
×
178
    else {
179
        return Ok(());
×
180
    };
181

182
    let roots = resolve_roots(cli.source_roots)?;
4✔
183

184
    let config = SamplerConfig {
3✔
185
        seed: cli.seed,
3✔
186
        split: cli.split,
3✔
187
        ..SamplerConfig::default()
3✔
188
    };
3✔
189

190
    let sources = build_sources(&roots);
3✔
191

192
    let mut inventories = Vec::new();
3✔
193
    for source in &sources {
3✔
194
        let recipes = if config.recipes.is_empty() {
3✔
195
            source.default_triplet_recipes()
3✔
196
        } else {
197
            config.recipes.clone()
×
198
        };
199
        let reported_records = source.reported_record_count(&config).map_err(|err| {
3✔
200
            format!(
1✔
201
                "source '{}' failed to report exact record count: {err}",
202
                source.id()
1✔
203
            )
204
        })?;
1✔
205
        inventories.push(SourceInventory {
2✔
206
            source_id: source.id().to_string(),
2✔
207
            reported_records,
2✔
208
            triplet_recipes: recipes,
2✔
209
        });
2✔
210
    }
211

212
    let mut per_source_split_counts: HashMap<(String, SplitLabel), u128> = HashMap::new();
2✔
213
    let mut split_record_counts: HashMap<SplitLabel, u128> = HashMap::new();
2✔
214

215
    for source in &inventories {
2✔
216
        let counts = split_counts_for_total(source.reported_records, cli.split);
2✔
217
        for (label, count) in counts {
6✔
218
            per_source_split_counts.insert((source.source_id.clone(), label), count);
6✔
219
            *split_record_counts.entry(label).or_insert(0) += count;
6✔
220
        }
6✔
221
    }
222

223
    let mut totals_by_split: HashMap<SplitLabel, CapacityTotals> = HashMap::new();
2✔
224
    let mut totals_by_source_and_split: HashMap<(String, SplitLabel), CapacityTotals> =
2✔
225
        HashMap::new();
2✔
226

227
    for split_label in [SplitLabel::Train, SplitLabel::Validation, SplitLabel::Test] {
6✔
228
        let mut totals = CapacityTotals::default();
6✔
229

230
        for source in &inventories {
6✔
231
            let source_split_records = per_source_split_counts
6✔
232
                .get(&(source.source_id.clone(), split_label))
6✔
233
                .copied()
6✔
234
                .unwrap_or(0);
6✔
235

6✔
236
            let triplet_recipes = &source.triplet_recipes;
6✔
237
            let text_recipes = resolve_text_recipes_for_source(&config, triplet_recipes);
6✔
238

6✔
239
            let capacity = estimate_source_split_capacity_from_counts(
6✔
240
                source_split_records,
6✔
241
                triplet_recipes,
6✔
242
                &text_recipes,
6✔
243
            );
6✔
244

6✔
245
            totals_by_source_and_split.insert((source.source_id.clone(), split_label), capacity);
6✔
246

6✔
247
            totals.triplets += capacity.triplets;
6✔
248
            totals.effective_triplets += capacity.effective_triplets;
6✔
249
            totals.pairs += capacity.pairs;
6✔
250
            totals.text_samples += capacity.text_samples;
6✔
251
        }
6✔
252

253
        totals_by_split.insert(split_label, totals);
6✔
254
    }
255

256
    println!("=== capacity estimate (length-only) ===");
2✔
257
    println!("mode: metadata-only (no source.refresh calls)");
2✔
258
    println!("classification: heuristic approximation (not exact)");
2✔
259
    println!("split seed: {}", cli.seed);
2✔
260
    println!(
2✔
261
        "split ratios: train={:.4}, validation={:.4}, test={:.4}",
262
        cli.split.train, cli.split.validation, cli.split.test
263
    );
264
    println!();
2✔
265

266
    println!("[SOURCES]");
2✔
267
    for source in &inventories {
2✔
268
        println!(
2✔
269
            "  {} => reported records: {}",
2✔
270
            source.source_id,
2✔
271
            format_u128_with_commas(source.reported_records)
2✔
272
        );
2✔
273
    }
2✔
274
    println!();
2✔
275

276
    println!("[PER SOURCE BREAKDOWN]");
2✔
277
    for source in &inventories {
2✔
278
        println!("  {}", source.source_id);
2✔
279
        let mut source_grand = CapacityTotals::default();
2✔
280
        let mut source_total_records = 0u128;
2✔
281
        for split_label in [SplitLabel::Train, SplitLabel::Validation, SplitLabel::Test] {
6✔
282
            let split_records = per_source_split_counts
6✔
283
                .get(&(source.source_id.clone(), split_label))
6✔
284
                .copied()
6✔
285
                .unwrap_or(0);
6✔
286
            source_total_records = source_total_records.saturating_add(split_records);
6✔
287
            let split_longest_records = inventories
6✔
288
                .iter()
6✔
289
                .map(|candidate| {
6✔
290
                    per_source_split_counts
6✔
291
                        .get(&(candidate.source_id.clone(), split_label))
6✔
292
                        .copied()
6✔
293
                        .unwrap_or(0)
6✔
294
                })
6✔
295
                .max()
6✔
296
                .unwrap_or(0);
6✔
297
            let totals = totals_by_source_and_split
6✔
298
                .get(&(source.source_id.clone(), split_label))
6✔
299
                .copied()
6✔
300
                .unwrap_or_default();
6✔
301
            source_grand.triplets += totals.triplets;
6✔
302
            source_grand.effective_triplets += totals.effective_triplets;
6✔
303
            source_grand.pairs += totals.pairs;
6✔
304
            source_grand.text_samples += totals.text_samples;
6✔
305
            println!("    [{:?}]", split_label);
6✔
306
            println!("      records: {}", format_u128_with_commas(split_records));
6✔
307
            println!(
6✔
308
                "      triplet combinations: {}",
309
                format_u128_with_commas(totals.triplets)
6✔
310
            );
311
            println!(
6✔
312
                "      effective sampled triplets (p={}, k={}): {}",
313
                EFFECTIVE_POSITIVES_PER_ANCHOR,
314
                EFFECTIVE_NEGATIVES_PER_ANCHOR,
315
                format_u128_with_commas(totals.effective_triplets)
6✔
316
            );
317
            println!(
6✔
318
                "      pair combinations:    {}",
319
                format_u128_with_commas(totals.pairs)
6✔
320
            );
321
            println!(
6✔
322
                "      text samples:         {}",
323
                format_u128_with_commas(totals.text_samples)
6✔
324
            );
325
            println!(
6✔
326
                "      replay factor vs longest source: {}",
327
                format_replay_factor(split_longest_records, split_records)
6✔
328
            );
329
        }
330
        let longest_source_total = inventories
2✔
331
            .iter()
2✔
332
            .map(|candidate| candidate.reported_records)
2✔
333
            .max()
2✔
334
            .unwrap_or(0);
2✔
335
        println!("    [ALL SPLITS FOR SOURCE]");
2✔
336
        println!(
2✔
337
            "      triplet combinations: {}",
338
            format_u128_with_commas(source_grand.triplets)
2✔
339
        );
340
        println!(
2✔
341
            "      effective sampled triplets (p={}, k={}): {}",
342
            EFFECTIVE_POSITIVES_PER_ANCHOR,
343
            EFFECTIVE_NEGATIVES_PER_ANCHOR,
344
            format_u128_with_commas(source_grand.effective_triplets)
2✔
345
        );
346
        println!(
2✔
347
            "      pair combinations:    {}",
348
            format_u128_with_commas(source_grand.pairs)
2✔
349
        );
350
        println!(
2✔
351
            "      text samples:         {}",
352
            format_u128_with_commas(source_grand.text_samples)
2✔
353
        );
354
        println!(
2✔
355
            "      replay factor vs longest source: {}",
356
            format_replay_factor(longest_source_total, source_total_records)
2✔
357
        );
358
        println!();
2✔
359
    }
360

361
    let mut grand = CapacityTotals::default();
2✔
362
    for split_label in [SplitLabel::Train, SplitLabel::Validation, SplitLabel::Test] {
6✔
363
        let record_count = split_record_counts.get(&split_label).copied().unwrap_or(0);
6✔
364
        let totals = totals_by_split
6✔
365
            .get(&split_label)
6✔
366
            .copied()
6✔
367
            .unwrap_or_default();
6✔
368

6✔
369
        grand.triplets += totals.triplets;
6✔
370
        grand.effective_triplets += totals.effective_triplets;
6✔
371
        grand.pairs += totals.pairs;
6✔
372
        grand.text_samples += totals.text_samples;
6✔
373

6✔
374
        println!("[{:?}]", split_label);
6✔
375
        println!("  records: {}", format_u128_with_commas(record_count));
6✔
376
        println!(
6✔
377
            "  triplet combinations: {}",
6✔
378
            format_u128_with_commas(totals.triplets)
6✔
379
        );
6✔
380
        println!(
6✔
381
            "  effective sampled triplets (p={}, k={}): {}",
6✔
382
            EFFECTIVE_POSITIVES_PER_ANCHOR,
6✔
383
            EFFECTIVE_NEGATIVES_PER_ANCHOR,
6✔
384
            format_u128_with_commas(totals.effective_triplets)
6✔
385
        );
6✔
386
        println!(
6✔
387
            "  pair combinations:    {}",
6✔
388
            format_u128_with_commas(totals.pairs)
6✔
389
        );
6✔
390
        println!(
6✔
391
            "  text samples:         {}",
6✔
392
            format_u128_with_commas(totals.text_samples)
6✔
393
        );
6✔
394
        println!();
6✔
395
    }
6✔
396

397
    println!("[ALL SPLITS TOTAL]");
2✔
398
    println!(
2✔
399
        "  triplet combinations: {}",
400
        format_u128_with_commas(grand.triplets)
2✔
401
    );
402
    println!(
2✔
403
        "  effective sampled triplets (p={}, k={}): {}",
404
        EFFECTIVE_POSITIVES_PER_ANCHOR,
405
        EFFECTIVE_NEGATIVES_PER_ANCHOR,
406
        format_u128_with_commas(grand.effective_triplets)
2✔
407
    );
408
    println!(
2✔
409
        "  pair combinations:    {}",
410
        format_u128_with_commas(grand.pairs)
2✔
411
    );
412
    println!(
2✔
413
        "  text samples:         {}",
414
        format_u128_with_commas(grand.text_samples)
2✔
415
    );
416
    println!();
2✔
417
    println!(
2✔
418
        "Note: counts are heuristic, length-based estimates from source-reported totals and recipe structure. They are approximate, not exact, and assume anchor-positive pairs=records (one positive per anchor by default), negatives=source_records_in_split-1 (anchor excluded as its own negative), and at most one chunk/window realization per sample. In real-world chunked sampling, practical combinations are often higher, so treat this as a floor-like baseline."
419
    );
420
    println!(
2✔
421
        "Effective sampled triplets apply a bounded training assumption: effective_triplets = records * p * k per triplet recipe, with defaults p={} positives per anchor and k={} negatives per anchor.",
422
        EFFECTIVE_POSITIVES_PER_ANCHOR, EFFECTIVE_NEGATIVES_PER_ANCHOR
423
    );
424
    println!(
2✔
425
        "Oversample loops are not inferred from this static report. To measure true oversampling (how many times sampling loops through the combination space), use observed sampled draw counts from an actual run."
426
    );
427

428
    Ok(())
2✔
429
}
4✔
430

431
/// Run the multi-source demo CLI with injectable root resolution/source builders.
432
///
433
/// `build_sources` is construction-only. Source sampler configuration is owned
434
/// by sampler registration (`TripletSampler::register_source`).
435
pub fn run_multi_source_demo<R, Resolve, Build, I>(
9✔
436
    args_iter: I,
9✔
437
    resolve_roots: Resolve,
9✔
438
    build_sources: Build,
9✔
439
) -> Result<(), Box<dyn Error>>
9✔
440
where
9✔
441
    Resolve: FnOnce(Vec<String>) -> Result<R, Box<dyn Error>>,
9✔
442
    Build: FnOnce(&R) -> Vec<DynSource>,
9✔
443
    I: Iterator<Item = String>,
9✔
444
{
445
    init_example_tracing();
9✔
446

447
    let Some(cli) = parse_cli::<MultiSourceDemoCli, _>(
9✔
448
        std::iter::once("multi_source_demo".to_string()).chain(args_iter),
9✔
449
    )?
×
450
    else {
451
        return Ok(());
×
452
    };
453

454
    let roots = resolve_roots(cli.source_roots)?;
9✔
455

456
    let mut config = SamplerConfig::default();
8✔
457
    config.seed = cli.seed.unwrap_or(config.seed);
8✔
458
    config.batch_size = cli.batch_size;
8✔
459
    config.chunking = Default::default();
8✔
460
    let selected_split = cli.split.map(Into::into).unwrap_or(SplitLabel::Train);
8✔
461
    config.split = SplitRatios::default();
8✔
462
    config.allowed_splits = vec![selected_split];
8✔
463
    let chunking = config.chunking.clone();
8✔
464

465
    let split_store_path = if let Some(path) = cli.split_store_path {
8✔
466
        path
1✔
467
    } else if let Some(dir) = cli.split_store_dir {
7✔
468
        FileSplitStore::default_path_in_dir(dir)
7✔
469
    } else {
470
        FileSplitStore::default_path()
×
471
    };
472

473
    println!(
8✔
474
        "Persisting split assignments and epoch state to {}",
475
        split_store_path.display()
8✔
476
    );
477
    let sources = build_sources(&roots);
8✔
478
    let split_store = Arc::new(FileSplitStore::open(&split_store_path, config.split, 99)?);
8✔
479
    let sampler = TripletSampler::new(config, split_store.clone());
8✔
480
    for source in sources {
8✔
481
        sampler.register_source(source);
8✔
482
    }
8✔
483

484
    if cli.show_pair_samples {
8✔
485
        match sampler.next_pair_batch(selected_split) {
2✔
486
            Ok(pair_batch) => {
×
487
                if pair_batch.pairs.is_empty() {
×
488
                    println!("Pair sampling produced no results.");
×
489
                } else {
×
490
                    print_pair_batch(&chunking, &pair_batch, split_store.as_ref());
×
491
                }
×
492
                sampler.persist_state()?;
×
493
            }
494
            Err(SamplerError::Exhausted(name)) => {
2✔
495
                eprintln!(
2✔
496
                    "Pair sampler exhausted recipe '{}'. Ensure both positive and negative examples exist.",
2✔
497
                    name
2✔
498
                );
2✔
499
            }
2✔
500
            Err(err) => return Err(err.into()),
×
501
        }
502
    } else if cli.show_text_samples {
6✔
503
        match sampler.next_text_batch(selected_split) {
2✔
504
            Ok(text_batch) => {
×
505
                if text_batch.samples.is_empty() {
×
506
                    println!(
×
507
                        "Text sampling produced no results. Ensure each source has eligible sections."
×
508
                    );
×
509
                } else {
×
510
                    print_text_batch(&chunking, &text_batch, split_store.as_ref());
×
511
                }
×
512
                sampler.persist_state()?;
×
513
            }
514
            Err(SamplerError::Exhausted(name)) => {
2✔
515
                eprintln!(
2✔
516
                    "Text sampler exhausted selector '{}'. Ensure matching sections exist.",
2✔
517
                    name
2✔
518
                );
2✔
519
            }
2✔
520
            Err(err) => return Err(err.into()),
×
521
        }
522
    } else if cli.list_text_recipes {
4✔
523
        let recipes = sampler.text_recipes();
2✔
524
        if recipes.is_empty() {
2✔
525
            println!(
1✔
526
                "No text recipes registered. Ensure your sources expose triplet selectors or configure text_recipes explicitly."
1✔
527
            );
1✔
528
        } else {
1✔
529
            print_text_recipes(&recipes);
1✔
530
        }
1✔
531
    } else {
532
        match sampler.next_triplet_batch(selected_split) {
2✔
533
            Ok(triplet_batch) => {
×
534
                if triplet_batch.triplets.is_empty() {
×
535
                    println!(
×
536
                        "Triplet sampling produced no results. Ensure multiple records per source exist."
×
537
                    );
×
538
                } else {
×
539
                    print_triplet_batch(&chunking, &triplet_batch, split_store.as_ref());
×
540
                }
×
541
                sampler.persist_state()?;
×
542
            }
543
            Err(SamplerError::Exhausted(name)) => {
2✔
544
                eprintln!(
2✔
545
                    "Triplet sampler exhausted recipe '{}'. Ensure both positive and negative examples exist.",
2✔
546
                    name
2✔
547
                );
2✔
548
            }
2✔
549
            Err(err) => return Err(err.into()),
×
550
        }
551
    }
552

553
    Ok(())
8✔
554
}
9✔
555

556
/// clap value parser for `--batch-size`: accepts only integers greater than zero.
///
/// Returns the parsed value, or a message naming the flag when `raw` is not a
/// `usize` or is zero.
fn parse_positive_usize(raw: &str) -> Result<usize, String> {
    match raw.parse::<usize>() {
        Ok(0) => Err("--batch-size must be greater than zero".to_string()),
        Ok(value) => Ok(value),
        Err(_) => Err(format!(
            "Could not parse --batch-size value '{}' as a positive integer",
            raw
        )),
    }
}
14✔
568

569
fn parse_cli<T, I>(args: I) -> Result<Option<T>, Box<dyn Error>>
19✔
570
where
19✔
571
    T: Parser,
19✔
572
    I: IntoIterator,
19✔
573
    I::Item: Into<std::ffi::OsString> + Clone,
19✔
574
{
575
    match T::try_parse_from(args) {
19✔
576
        Ok(cli) => Ok(Some(cli)),
13✔
577
        Err(err) => match err.kind() {
6✔
578
            ErrorKind::DisplayHelp | ErrorKind::DisplayVersion => {
579
                err.print()?;
3✔
580
                Ok(None)
3✔
581
            }
582
            _ => Err(err.into()),
3✔
583
        },
584
    }
585
}
19✔
586

587
fn parse_split_ratios_arg(raw: &str) -> Result<SplitRatios, String> {
11✔
588
    let parts: Vec<&str> = raw.split(',').collect();
11✔
589
    if parts.len() != 3 {
11✔
590
        return Err("--split-ratios expects exactly 3 comma-separated values".to_string());
1✔
591
    }
10✔
592
    let train = parts[0]
10✔
593
        .trim()
10✔
594
        .parse::<f32>()
10✔
595
        .map_err(|_| format!("invalid train ratio '{}': must be a float", parts[0].trim()))?;
10✔
596
    let validation = parts[1].trim().parse::<f32>().map_err(|_| {
9✔
597
        format!(
1✔
598
            "invalid validation ratio '{}': must be a float",
599
            parts[1].trim()
1✔
600
        )
601
    })?;
1✔
602
    let test = parts[2]
8✔
603
        .trim()
8✔
604
        .parse::<f32>()
8✔
605
        .map_err(|_| format!("invalid test ratio '{}': must be a float", parts[2].trim()))?;
8✔
606
    let ratios = SplitRatios {
7✔
607
        train,
7✔
608
        validation,
7✔
609
        test,
7✔
610
    };
7✔
611
    let sum = ratios.train + ratios.validation + ratios.test;
7✔
612
    if (sum - 1.0).abs() > 1e-5 {
7✔
613
        return Err(format!(
1✔
614
            "split ratios must sum to 1.0, got {:.6} (train={}, validation={}, test={})",
1✔
615
            sum, ratios.train, ratios.validation, ratios.test
1✔
616
        ));
1✔
617
    }
6✔
618
    if ratios.train < 0.0 || ratios.validation < 0.0 || ratios.test < 0.0 {
6✔
619
        return Err("split ratios must be non-negative".to_string());
1✔
620
    }
5✔
621
    Ok(ratios)
5✔
622
}
11✔
623

624
fn print_triplet_batch(
1✔
625
    strategy: &ChunkingStrategy,
1✔
626
    batch: &TripletBatch,
1✔
627
    split_store: &impl SplitStore,
1✔
628
) {
1✔
629
    println!("=== triplet batch ===");
1✔
630
    for (idx, triplet) in batch.triplets.iter().enumerate() {
1✔
631
        println!("--- triplet #{} ---", idx);
1✔
632
        println!("recipe       : {}", triplet.recipe);
1✔
633
        println!("sample_weight: {:.4}", triplet.weight);
1✔
634
        if let Some(instr) = &triplet.instruction {
1✔
635
            println!("instruction shown to model:\n{}\n", instr);
1✔
636
        }
1✔
637
        print_chunk_block("ANCHOR", &triplet.anchor, strategy, split_store);
1✔
638
        print_chunk_block("POSITIVE", &triplet.positive, strategy, split_store);
1✔
639
        print_chunk_block("NEGATIVE", &triplet.negative, strategy, split_store);
1✔
640
    }
641
    print_source_summary(
1✔
642
        "triplet anchors",
1✔
643
        batch
1✔
644
            .triplets
1✔
645
            .iter()
1✔
646
            .map(|triplet| triplet.anchor.record_id.as_str()),
1✔
647
    );
648
    print_recipe_summary_by_source(
1✔
649
        "triplet recipes by source",
1✔
650
        batch
1✔
651
            .triplets
1✔
652
            .iter()
1✔
653
            .map(|triplet| (triplet.anchor.record_id.as_str(), triplet.recipe.as_str())),
1✔
654
    );
655
}
1✔
656

657
fn print_text_batch(strategy: &ChunkingStrategy, batch: &TextBatch, split_store: &impl SplitStore) {
1✔
658
    println!("=== text batch ===");
1✔
659
    for (idx, sample) in batch.samples.iter().enumerate() {
1✔
660
        println!("--- sample #{} ---", idx);
1✔
661
        println!("recipe       : {}", sample.recipe);
1✔
662
        println!("sample_weight: {:.4}", sample.weight);
1✔
663
        if let Some(instr) = &sample.instruction {
1✔
664
            println!("instruction shown to model:\n{}\n", instr);
1✔
665
        }
1✔
666
        print_chunk_block("TEXT", &sample.chunk, strategy, split_store);
1✔
667
    }
668
    print_source_summary(
1✔
669
        "text samples",
1✔
670
        batch
1✔
671
            .samples
1✔
672
            .iter()
1✔
673
            .map(|sample| sample.chunk.record_id.as_str()),
1✔
674
    );
675
    print_recipe_summary_by_source(
1✔
676
        "text recipes by source",
1✔
677
        batch
1✔
678
            .samples
1✔
679
            .iter()
1✔
680
            .map(|sample| (sample.chunk.record_id.as_str(), sample.recipe.as_str())),
1✔
681
    );
682
}
1✔
683

684
fn print_pair_batch(
1✔
685
    strategy: &ChunkingStrategy,
1✔
686
    batch: &SampleBatch,
1✔
687
    split_store: &impl SplitStore,
1✔
688
) {
1✔
689
    println!("=== pair batch ===");
1✔
690
    for (idx, pair) in batch.pairs.iter().enumerate() {
1✔
691
        println!("--- pair #{} ---", idx);
1✔
692
        println!("recipe       : {}", pair.recipe);
1✔
693
        println!("label        : {:?}", pair.label);
1✔
694
        if let Some(reason) = &pair.reason {
1✔
695
            println!("reason       : {}", reason);
1✔
696
        }
1✔
697
        print_chunk_block("ANCHOR", &pair.anchor, strategy, split_store);
1✔
698
        print_chunk_block("OTHER", &pair.positive, strategy, split_store);
1✔
699
    }
700
    print_source_summary(
1✔
701
        "pair anchors",
1✔
702
        batch
1✔
703
            .pairs
1✔
704
            .iter()
1✔
705
            .map(|pair| pair.anchor.record_id.as_str()),
1✔
706
    );
707
    print_recipe_summary_by_source(
1✔
708
        "pair recipes by source",
1✔
709
        batch
1✔
710
            .pairs
1✔
711
            .iter()
1✔
712
            .map(|pair| (pair.anchor.record_id.as_str(), pair.recipe.as_str())),
1✔
713
    );
714
}
1✔
715

716
fn print_text_recipes(recipes: &[TextRecipe]) {
2✔
717
    println!("=== available text recipes ===");
2✔
718
    for recipe in recipes {
4✔
719
        println!(
4✔
720
            "- {} (weight: {:.3}) selector={:?}",
721
            recipe.name, recipe.weight, recipe.selector
722
        );
723
        if let Some(instr) = &recipe.instruction {
4✔
724
            println!("  instruction: {}", instr);
1✔
725
        }
3✔
726
    }
727
}
2✔
728

729
trait ChunkDebug {
730
    fn view_name(&self) -> String;
731
}
732

733
impl ChunkDebug for RecordChunk {
734
    fn view_name(&self) -> String {
6✔
735
        match &self.view {
6✔
736
            ChunkView::Window {
737
                index,
4✔
738
                span,
4✔
739
                overlap,
4✔
740
                start_ratio,
4✔
741
            } => format!(
4✔
742
                "window#index={} span={} overlap={} start_ratio={:.3} tokens={}",
743
                index, span, overlap, start_ratio, self.tokens_estimate
744
            ),
745
            ChunkView::SummaryFallback { strategy, .. } => {
2✔
746
                format!("summary:{} tokens={}", strategy, self.tokens_estimate)
2✔
747
            }
748
        }
749
    }
6✔
750
}
751

752
fn print_chunk_block(
6✔
753
    title: &str,
6✔
754
    chunk: &RecordChunk,
6✔
755
    strategy: &ChunkingStrategy,
6✔
756
    split_store: &impl SplitStore,
6✔
757
) {
6✔
758
    let chunk_weight = chunk_weight(strategy, chunk);
6✔
759
    let split = split_store
6✔
760
        .label_for(&chunk.record_id)
6✔
761
        .map(|label| format!("{:?}", label))
6✔
762
        .unwrap_or_else(|| "Unknown".to_string());
6✔
763
    println!("--- {} ---", title);
6✔
764
    println!("split        : {}", split);
6✔
765
    println!("view         : {}", chunk.view_name());
6✔
766
    println!("chunk_weight : {:.4}", chunk_weight);
6✔
767
    println!("record_id    : {}", chunk.record_id);
6✔
768
    println!("section_idx  : {}", chunk.section_idx);
6✔
769
    println!("token_est    : {}", chunk.tokens_estimate);
6✔
770
    println!("model_input (exact text sent to the model):");
6✔
771
    println!(
6✔
772
        "<<< BEGIN MODEL TEXT >>>\n{}\n<<< END MODEL TEXT >>>\n",
773
        chunk.text
774
    );
775
}
6✔
776

777
fn print_source_summary<'a, I>(label: &str, ids: I)
3✔
778
where
3✔
779
    I: Iterator<Item = &'a str>,
3✔
780
{
781
    let mut counts: HashMap<SourceId, usize> = HashMap::new();
3✔
782
    for id in ids {
3✔
783
        let source = extract_source(id);
3✔
784
        *counts.entry(source).or_insert(0) += 1;
3✔
785
    }
3✔
786
    if counts.is_empty() {
3✔
787
        return;
×
788
    }
3✔
789
    let skew = source_skew(&counts);
3✔
790
    let mut entries: Vec<(String, usize)> = counts.into_iter().collect();
3✔
791
    entries.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
3✔
792
    println!("--- {} by source ---", label);
3✔
793
    if let Some(skew) = skew {
3✔
794
        for entry in &skew.per_source {
3✔
795
            println!(
3✔
796
                "{}: count={} share={:.2}",
3✔
797
                entry.source, entry.count, entry.share
3✔
798
            );
3✔
799
        }
3✔
800
        println!(
3✔
801
            "skew: sources={} total={} min={} max={} mean={:.2} ratio={:.2}",
802
            skew.sources, skew.total, skew.min, skew.max, skew.mean, skew.ratio
803
        );
804
    } else {
805
        for (source, count) in &entries {
×
806
            println!("{source}: count={count}");
×
807
        }
×
808
    }
809
}
3✔
810

811
fn print_recipe_summary_by_source<'a, I>(label: &str, entries: I)
3✔
812
where
3✔
813
    I: Iterator<Item = (&'a str, &'a str)>,
3✔
814
{
815
    let mut counts: HashMap<SourceId, HashMap<String, usize>> = HashMap::new();
3✔
816
    for (record_id, recipe) in entries {
3✔
817
        let source = extract_source(record_id);
3✔
818
        let entry = counts
3✔
819
            .entry(source)
3✔
820
            .or_default()
3✔
821
            .entry(recipe.to_string())
3✔
822
            .or_insert(0);
3✔
823
        *entry += 1;
3✔
824
    }
3✔
825
    if counts.is_empty() {
3✔
826
        return;
×
827
    }
3✔
828
    let mut sources: Vec<(SourceId, HashMap<String, usize>)> = counts.into_iter().collect();
3✔
829
    sources.sort_by(|a, b| a.0.cmp(&b.0));
3✔
830
    println!("--- {} ---", label);
3✔
831
    for (source, recipes) in sources {
3✔
832
        println!("{source}");
3✔
833
        let mut entries: Vec<(String, usize)> = recipes.into_iter().collect();
3✔
834
        entries.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
3✔
835
        for (recipe, count) in entries {
3✔
836
            println!("  - {recipe}={count}");
3✔
837
        }
3✔
838
    }
839
}
3✔
840

841
fn extract_source(record_id: &str) -> SourceId {
8✔
842
    record_id
8✔
843
        .split_once("::")
8✔
844
        .map(|(source, _)| source.to_string())
8✔
845
        .unwrap_or_else(|| "unknown".to_string())
8✔
846
}
8✔
847

848
#[cfg(test)]
849
mod tests {
850
    use super::*;
851
    use crate::DeterministicSplitStore;
852
    use crate::data::SectionRole;
853
    use crate::source::{SourceCursor, SourceSnapshot};
854
    use chrono::Utc;
855
    use tempfile::tempdir;
856

857
    /// Minimal in-memory `DataSource` test double for example app tests.
858
    struct TestSource {
859
        id: String,
860
        count: Option<u128>,
861
        recipes: Vec<TripletRecipe>,
862
    }
863

864
    impl DataSource for TestSource {
865
        fn id(&self) -> &str {
70✔
866
            &self.id
70✔
867
        }
70✔
868

869
        fn refresh(
30✔
870
            &self,
30✔
871
            _config: &SamplerConfig,
30✔
872
            _cursor: Option<&SourceCursor>,
30✔
873
            _limit: Option<usize>,
30✔
874
        ) -> Result<SourceSnapshot, SamplerError> {
30✔
875
            Ok(SourceSnapshot {
30✔
876
                records: Vec::new(),
30✔
877
                cursor: SourceCursor {
30✔
878
                    last_seen: Utc::now(),
30✔
879
                    revision: 0,
30✔
880
                },
30✔
881
            })
30✔
882
        }
30✔
883

884
        fn reported_record_count(&self, _config: &SamplerConfig) -> Result<u128, SamplerError> {
2✔
885
            self.count.ok_or_else(|| SamplerError::SourceInconsistent {
2✔
886
                source_id: self.id.clone(),
1✔
887
                details: "test source has no configured exact count".to_string(),
1✔
888
            })
1✔
889
        }
2✔
890

891
        fn default_triplet_recipes(&self) -> Vec<TripletRecipe> {
10✔
892
            self.recipes.clone()
10✔
893
        }
10✔
894
    }
895

896
    struct ConfigRequiredSource {
897
        id: String,
898
        expected_seed: u64,
899
    }
900

901
    impl DataSource for ConfigRequiredSource {
902
        fn id(&self) -> &str {
1✔
903
            &self.id
1✔
904
        }
1✔
905

NEW
906
        fn refresh(
×
NEW
907
            &self,
×
NEW
908
            _config: &SamplerConfig,
×
NEW
909
            _cursor: Option<&SourceCursor>,
×
NEW
910
            _limit: Option<usize>,
×
NEW
911
        ) -> Result<SourceSnapshot, SamplerError> {
×
NEW
912
            Ok(SourceSnapshot {
×
NEW
913
                records: Vec::new(),
×
NEW
914
                cursor: SourceCursor {
×
NEW
915
                    last_seen: Utc::now(),
×
NEW
916
                    revision: 0,
×
NEW
917
                },
×
NEW
918
            })
×
NEW
919
        }
×
920

921
        fn reported_record_count(&self, config: &SamplerConfig) -> Result<u128, SamplerError> {
1✔
922
            if config.seed == self.expected_seed {
1✔
923
                Ok(1)
1✔
924
            } else {
NEW
925
                Err(SamplerError::SourceInconsistent {
×
NEW
926
                    source_id: self.id.clone(),
×
NEW
927
                    details: format!(
×
NEW
928
                        "expected sampler seed {} but got {}",
×
NEW
929
                        self.expected_seed, config.seed
×
NEW
930
                    ),
×
NEW
931
                })
×
932
            }
933
        }
1✔
934

935
        fn default_triplet_recipes(&self) -> Vec<TripletRecipe> {
1✔
936
            Vec::new()
1✔
937
        }
1✔
938
    }
939

940
    fn default_recipe(name: &str) -> TripletRecipe {
6✔
941
        TripletRecipe {
6✔
942
            name: name.to_string().into(),
6✔
943
            anchor: crate::config::Selector::Role(SectionRole::Anchor),
6✔
944
            positive_selector: crate::config::Selector::Role(SectionRole::Context),
6✔
945
            negative_selector: crate::config::Selector::Role(SectionRole::Context),
6✔
946
            negative_strategy: crate::config::NegativeStrategy::WrongArticle,
6✔
947
            weight: 1.0,
6✔
948
            instruction: None,
6✔
949
        }
6✔
950
    }
6✔
951

952
    /// Checks the accept/reject behaviour of the positive-integer and
    /// split-ratio argument parsers.
    #[test]
    fn parse_helpers_validate_inputs() {
        assert_eq!(parse_positive_usize("2").unwrap(), 2);
        for rejected in ["0", "abc"] {
            assert!(
                parse_positive_usize(rejected).is_err(),
                "expected {rejected:?} to be rejected"
            );
        }

        let ratios = parse_split_ratios_arg("0.8,0.1,0.1").unwrap();
        assert!((ratios.train - 0.8).abs() < 1e-6);

        for rejected in ["0.8,0.1", "1.0,0.0,0.1", "-0.1,0.6,0.5"] {
            assert!(
                parse_split_ratios_arg(rejected).is_err(),
                "expected {rejected:?} to be rejected"
            );
        }
    }
1✔
964

965
    /// `--help` parses to `Ok(None)`, while an unknown flag is an error.
    #[test]
    fn parse_cli_handles_help_and_invalid_args() {
        let help_outcome = parse_cli::<EstimateCapacityCli, _>(["estimate_capacity", "--help"]);
        assert!(matches!(help_outcome, Ok(None)));

        let unknown_flag_outcome =
            parse_cli::<EstimateCapacityCli, _>(["estimate_capacity", "--unknown"]);
        assert!(unknown_flag_outcome.is_err());
    }
1✔
973

974
    /// With no CLI arguments and a source that reports an exact count, the
    /// capacity estimate completes successfully.
    #[test]
    fn run_estimate_capacity_succeeds_with_reported_counts() {
        let outcome = run_estimate_capacity(
            std::iter::empty::<String>(),
            |roots| {
                assert!(roots.is_empty());
                Ok(())
            },
            |_| {
                let source: DynSource = Box::new(TestSource {
                    id: String::from("source_a"),
                    count: Some(12),
                    recipes: vec![default_recipe("r1")],
                });
                vec![source]
            },
        );

        assert!(outcome.is_ok());
    }
1✔
993

994
    /// A source without a configured count makes the estimate fail with a
    /// message about the missing exact record count.
    #[test]
    fn run_estimate_capacity_errors_when_source_count_missing() {
        let outcome = run_estimate_capacity(
            std::iter::empty::<String>(),
            |_| Ok(()),
            |_| {
                let source: DynSource = Box::new(TestSource {
                    id: String::from("source_missing"),
                    count: None,
                    recipes: vec![default_recipe("r1")],
                });
                vec![source]
            },
        );

        let message = outcome.unwrap_err().to_string();
        assert!(message.contains("failed to report exact record count"));
    }
1✔
1011

1012
    /// An error returned by the root-resolution callback surfaces as the
    /// overall error of the estimate run.
    #[test]
    fn run_estimate_capacity_propagates_root_resolution_error() {
        let outcome = run_estimate_capacity(
            std::iter::empty::<String>(),
            |_| Err("root resolution failed".into()),
            |_: &()| Vec::<DynSource>::new(),
        );

        let message = outcome.unwrap_err().to_string();
        assert!(message.contains("root resolution failed"));
    }
1✔
1023

1024
    /// The estimate succeeds for a source that only reports its count when the
    /// config it receives carries seed 99.
    #[test]
    fn run_estimate_capacity_configures_sources_centrally_before_counting() {
        let outcome = run_estimate_capacity(
            std::iter::empty::<String>(),
            |_| Ok(()),
            |_| {
                let source: DynSource = Box::new(ConfigRequiredSource {
                    id: String::from("requires_config"),
                    expected_seed: 99,
                });
                vec![source]
            },
        );

        assert!(outcome.is_ok());
    }
1✔
1039

1040
    /// Covers three demo CLI parse outcomes: `--help` yields `Ok(None)`, a
    /// zero batch size is rejected, and passing both split-store options at
    /// once is rejected.
    #[test]
    fn parse_multi_source_cli_handles_help_and_batch_size_validation() {
        let help_outcome = parse_cli::<MultiSourceDemoCli, _>(["multi_source_demo", "--help"]);
        assert!(matches!(help_outcome, Ok(None)));

        let zero_batch_outcome =
            parse_cli::<MultiSourceDemoCli, _>(["multi_source_demo", "--batch-size", "0"]);
        assert!(zero_batch_outcome.is_err());

        let conflicting_store_args = [
            "multi_source_demo",
            "--split-store-dir",
            "./a",
            "--split-store-path",
            "./b.bin",
        ];
        let conflict_outcome = parse_cli::<MultiSourceDemoCli, _>(conflicting_store_args);
        assert!(conflict_outcome.is_err());
    }
1✔
1057

1058
    /// `--version` on a CLI that declares a version parses to `Ok(None)`.
    #[test]
    fn parse_cli_handles_display_version_path() {
        // Throwaway CLI whose only feature is a declared version.
        #[derive(Debug, Parser)]
        #[command(name = "version_test", version = "1.0.0")]
        struct VersionCli {}

        let outcome = parse_cli::<VersionCli, _>(["version_test", "--version"]);
        assert!(matches!(outcome, Ok(None)));
    }
1✔
1067

1068
    /// `--list-text-recipes` with a split-store directory completes
    /// successfully for a source that has one default recipe.
    #[test]
    fn run_multi_source_demo_list_text_recipes_path_succeeds() {
        let dir = tempdir().unwrap();
        let args = vec![
            "--list-text-recipes".to_string(),
            "--split-store-dir".to_string(),
            dir.path().to_string_lossy().to_string(),
        ];
        // Pass the arguments by value like the sibling tests do; the earlier
        // `let mut` + `drain(..)` borrowed the vector for no benefit.
        let result = run_multi_source_demo(
            args.into_iter(),
            |_| Ok(()),
            |_| {
                vec![Box::new(TestSource {
                    id: "source_for_recipes".into(),
                    count: Some(10),
                    recipes: vec![default_recipe("recipe_a")],
                }) as DynSource]
            },
        );

        assert!(result.is_ok());
    }
1✔
1090

1091
    /// `--list-text-recipes` with an explicit `--split-store-path` completes
    /// successfully even when the source offers no recipes.
    #[test]
    fn run_multi_source_demo_list_text_recipes_uses_explicit_split_store_path() {
        let dir = tempdir().unwrap();
        let store_file = dir.path().join("custom_split_store.bin");
        let cli_args = vec![
            String::from("--list-text-recipes"),
            String::from("--split-store-path"),
            store_file.to_string_lossy().into_owned(),
        ];

        let outcome = run_multi_source_demo(
            cli_args.into_iter(),
            |_| Ok(()),
            |_| {
                let source: DynSource = Box::new(TestSource {
                    id: String::from("source_without_text_recipes"),
                    count: Some(1),
                    recipes: Vec::new(),
                });
                vec![source]
            },
        );

        assert!(outcome.is_ok());
    }
1✔
1115

1116
    /// Runs the demo against the validation split once per mode (pair batch,
    /// text recipes, and no mode flag) with a source that yields no records,
    /// and expects each run to succeed.
    #[test]
    fn run_multi_source_demo_sampling_modes_handle_empty_sources() {
        for mode_flag in [Some("--pair-batch"), Some("--text-recipes"), None] {
            let dir = tempdir().unwrap();
            let mut cli_args: Vec<String> = mode_flag.into_iter().map(String::from).collect();
            cli_args.extend([
                String::from("--split-store-dir"),
                dir.path().to_string_lossy().into_owned(),
                String::from("--split"),
                String::from("validation"),
            ]);

            let outcome = run_multi_source_demo(
                cli_args.into_iter(),
                |_| Ok(()),
                |_| {
                    let source: DynSource = Box::new(TestSource {
                        id: String::from("source_empty"),
                        count: Some(0),
                        recipes: vec![default_recipe("recipe_empty")],
                    });
                    vec![source]
                },
            );

            assert!(outcome.is_ok());
        }
    }
1✔
1145

1146
    /// An error returned by the root-resolution callback surfaces as the
    /// overall error of the demo run.
    #[test]
    fn run_multi_source_demo_propagates_root_resolution_error() {
        let outcome = run_multi_source_demo(
            std::iter::empty::<String>(),
            |_| Err("demo root resolution failed".into()),
            |_: &()| Vec::<DynSource>::new(),
        );

        let message = outcome.unwrap_err().to_string();
        assert!(message.contains("demo root resolution failed"));
    }
1✔
1157

1158
    /// Smoke-tests the triplet, pair, text-batch and recipe printers with
    /// hand-built chunks, then checks `extract_source` with and without the
    /// `::` delimiter.
    #[test]
    fn print_helpers_and_extract_source_cover_paths() {
        let store = DeterministicSplitStore::new(SplitRatios::default(), 42).unwrap();
        let strategy = ChunkingStrategy::default();

        // One window chunk, one summary-fallback chunk, and one window chunk
        // from a second source.
        let anchor_chunk = RecordChunk {
            record_id: String::from("source_a::rec1"),
            section_idx: 0,
            view: ChunkView::Window {
                index: 1,
                overlap: 2,
                span: 12,
                start_ratio: 0.25,
            },
            text: String::from("anchor text"),
            tokens_estimate: 8,
            quality: crate::data::QualityScore { trust: 0.9 },
        };
        let positive_chunk = RecordChunk {
            record_id: String::from("source_a::rec2"),
            section_idx: 1,
            view: ChunkView::SummaryFallback {
                strategy: String::from("summary"),
                weight: 0.7,
            },
            text: String::from("positive text"),
            tokens_estimate: 6,
            quality: crate::data::QualityScore { trust: 0.8 },
        };
        let negative_chunk = RecordChunk {
            record_id: String::from("source_b::rec3"),
            section_idx: 2,
            view: ChunkView::Window {
                index: 0,
                overlap: 0,
                span: 16,
                start_ratio: 0.0,
            },
            text: String::from("negative text"),
            tokens_estimate: 7,
            quality: crate::data::QualityScore { trust: 0.5 },
        };

        let triplets = TripletBatch {
            triplets: vec![crate::SampleTriplet {
                recipe: String::from("triplet_recipe"),
                anchor: anchor_chunk.clone(),
                positive: positive_chunk.clone(),
                negative: negative_chunk.clone(),
                weight: 1.0,
                instruction: Some(String::from("triplet instruction")),
            }],
        };
        print_triplet_batch(&strategy, &triplets, &store);

        // Last use of the anchor and positive chunks, so they are moved in.
        let pairs = SampleBatch {
            pairs: vec![crate::SamplePair {
                recipe: String::from("pair_recipe"),
                anchor: anchor_chunk,
                positive: positive_chunk,
                weight: 1.0,
                instruction: None,
                label: crate::PairLabel::Positive,
                reason: Some(String::from("same topic")),
            }],
        };
        print_pair_batch(&strategy, &pairs, &store);

        let texts = TextBatch {
            samples: vec![crate::TextSample {
                recipe: String::from("text_recipe"),
                chunk: negative_chunk,
                weight: 0.8,
                instruction: Some(String::from("text instruction")),
            }],
        };
        print_text_batch(&strategy, &texts, &store);

        let text_recipes = vec![TextRecipe {
            name: "recipe_name".into(),
            selector: crate::config::Selector::Role(SectionRole::Context),
            instruction: Some("instruction".into()),
            weight: 1.0,
        }];
        print_text_recipes(&text_recipes);

        assert_eq!(extract_source("source_a::record"), "source_a");
        assert_eq!(extract_source("record-without-delimiter"), "unknown");
    }
1✔
1248

1249
    /// Each `SplitArg` variant converts to the `SplitLabel` variant of the same name.
    #[test]
    fn split_arg_conversion_and_version_parse_paths_are_covered() {
        let train: SplitLabel = SplitArg::Train.into();
        assert!(matches!(train, SplitLabel::Train));

        let validation: SplitLabel = SplitArg::Validation.into();
        assert!(matches!(validation, SplitLabel::Validation));

        let test: SplitLabel = SplitArg::Test.into();
        assert!(matches!(test, SplitLabel::Test));
    }
1✔
1261

1262
    /// A non-numeric value in any of the three ratio positions produces an
    /// error message that names the offending field.
    #[test]
    fn parse_split_ratios_reports_per_field_parse_errors() {
        let cases = [
            ("x,0.1,0.9", "invalid train ratio"),
            ("0.1,y,0.8", "invalid validation ratio"),
            ("0.1,0.2,z", "invalid test ratio"),
        ];
        for (input, expected_fragment) in cases {
            let message = parse_split_ratios_arg(input).unwrap_err();
            assert!(
                message.contains(expected_fragment),
                "input {input:?} produced {message:?}"
            );
        }
    }
1✔
1280

1281
    /// Runs the demo once per mode (pair batch, text recipes, and no mode
    /// flag) with a source that offers no recipes, and expects each run to
    /// succeed.
    #[test]
    fn run_multi_source_demo_exhausted_paths_are_handled() {
        for mode_flag in [Some("--pair-batch"), Some("--text-recipes"), None] {
            let dir = tempdir().unwrap();
            let mut cli_args: Vec<String> = mode_flag.into_iter().map(String::from).collect();
            cli_args.extend([
                String::from("--split-store-dir"),
                dir.path().to_string_lossy().into_owned(),
            ]);

            let outcome = run_multi_source_demo(
                cli_args.into_iter(),
                |_| Ok(()),
                |_| {
                    let source: DynSource = Box::new(TestSource {
                        id: String::from("source_without_recipes"),
                        count: Some(1),
                        recipes: Vec::new(),
                    });
                    vec![source]
                },
            );

            assert!(outcome.is_ok());
        }
    }
1✔
1308
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc