• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

jzombie / rust-triplets / 22358031159

24 Feb 2026 03:38PM UTC coverage: 92.488% (-0.2%) from 92.675%
22358031159

Pull #7

github

web-flow
Merge 843abfd29 into 980559192
Pull Request #7: Add HF source

4634 of 5195 new or added lines in 8 files covered. (89.2%)

1 existing line in 1 file now uncovered.

14073 of 15216 relevant lines covered (92.49%)

2599.58 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

92.39
/src/example_apps.rs
1
use std::collections::HashMap;
2
use std::error::Error;
3
use std::path::PathBuf;
4
use std::sync::Arc;
5
use std::sync::Once;
6

7
use clap::{Parser, ValueEnum, error::ErrorKind};
8

9
use crate::config::{ChunkingStrategy, SamplerConfig, TripletRecipe};
10
use crate::data::ChunkView;
11
use crate::heuristics::{
12
    CapacityTotals, EFFECTIVE_NEGATIVES_PER_ANCHOR, EFFECTIVE_POSITIVES_PER_ANCHOR,
13
    estimate_source_split_capacity_from_counts, format_replay_factor, format_u128_with_commas,
14
    resolve_text_recipes_for_source, split_counts_for_total,
15
};
16
use crate::metrics::source_skew;
17
use crate::sampler::chunk_weight;
18
use crate::source::DataSource;
19
use crate::splits::{FileSplitStore, SplitLabel, SplitRatios, SplitStore};
20
use crate::{
21
    RecordChunk, SampleBatch, Sampler, SamplerError, SourceId, TextBatch, TextRecipe, TripletBatch,
22
    TripletSampler,
23
};
24

25
type DynSource = Box<dyn DataSource + 'static>;
26

27
fn init_example_tracing() {
10✔
28
    static INIT: Once = Once::new();
29
    INIT.call_once(|| {
10✔
30
        let env_filter = tracing_subscriber::EnvFilter::try_from_default_env()
1✔
31
            .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("triplets=debug"));
1✔
32
        let _ = tracing_subscriber::fmt()
1✔
33
            .with_env_filter(env_filter)
1✔
34
            .try_init();
1✔
35
    });
1✔
36
}
10✔
37

38
#[derive(Debug, Clone, Copy, ValueEnum)]
39
/// CLI split selector mapped onto `SplitLabel`.
40
enum SplitArg {
41
    Train,
42
    Validation,
43
    Test,
44
}
45

46
impl From<SplitArg> for SplitLabel {
47
    fn from(value: SplitArg) -> Self {
6✔
48
        match value {
6✔
49
            SplitArg::Train => SplitLabel::Train,
1✔
50
            SplitArg::Validation => SplitLabel::Validation,
4✔
51
            SplitArg::Test => SplitLabel::Test,
1✔
52
        }
53
    }
6✔
54
}
55

56
#[derive(Debug, Parser)]
57
#[command(
58
    name = "estimate_capacity",
59
    disable_help_subcommand = true,
60
    about = "Metadata-only capacity estimation",
61
    long_about = "Estimate record, pair, triplet, and text-sample capacity using source-reported counts only (no data refresh).",
62
    after_help = "Source roots are optional and resolved in order by explicit arg, environment variables, then project defaults."
63
)]
64
/// CLI arguments for metadata-only capacity estimation.
65
struct EstimateCapacityCli {
66
    #[arg(
67
        long,
68
        default_value_t = 99,
69
        help = "Deterministic seed used for split allocation"
70
    )]
71
    seed: u64,
72
    #[arg(
73
        long = "split-ratios",
74
        value_name = "TRAIN,VALIDATION,TEST",
75
        value_parser = parse_split_ratios_arg,
76
        default_value = "0.8,0.1,0.1",
77
        help = "Comma-separated split ratios that must sum to 1.0"
78
    )]
79
    split: SplitRatios,
80
    #[arg(
81
        long = "source-root",
82
        value_name = "PATH",
83
        help = "Optional source root override, repeat as needed in source order"
84
    )]
85
    source_roots: Vec<String>,
86
}
87

88
#[derive(Debug, Parser)]
89
#[command(
90
    name = "multi_source_demo",
91
    disable_help_subcommand = true,
92
    about = "Run sampled batches from multiple sources",
93
    long_about = "Sample triplet, pair, or text batches from multiple sources and persist split/epoch state.",
94
    after_help = "Source roots are optional and resolved in order by explicit arg, environment variables, then project defaults."
95
)]
96
/// CLI for `multi_source_demo`.
97
///
98
/// Common usage:
99
/// - Keep default persistence file location: `.sampler_store/split_store.bin`
100
/// - Set an explicit file path: `--split-store-path /tmp/split_store.bin`
101
/// - Set a custom directory and keep default filename: `--split-store-dir /tmp/sampler_store`
102
/// - Repeat `--source-root <PATH>` to override source roots in order
103
struct MultiSourceDemoCli {
104
    #[arg(
105
        long = "text-recipes",
106
        help = "Emit a text batch instead of a triplet batch"
107
    )]
108
    show_text_samples: bool,
109
    #[arg(
110
        long = "pair-batch",
111
        help = "Emit a pair batch instead of a triplet batch"
112
    )]
113
    show_pair_samples: bool,
114
    #[arg(
115
        long = "list-text-recipes",
116
        help = "Print registered text recipes and exit"
117
    )]
118
    list_text_recipes: bool,
119
    #[arg(
120
        long = "batch-size",
121
        default_value_t = 4,
122
        value_parser = parse_positive_usize,
123
        help = "Batch size used for sampling"
124
    )]
125
    batch_size: usize,
126
    #[arg(long, help = "Optional deterministic seed override")]
127
    seed: Option<u64>,
128
    #[arg(long, value_enum, help = "Target split to sample from")]
129
    split: Option<SplitArg>,
130
    #[arg(
131
        long = "source-root",
132
        value_name = "PATH",
133
        help = "Optional source root override, repeat as needed in source order"
134
    )]
135
    source_roots: Vec<String>,
136
    #[arg(
137
        long = "split-store-path",
138
        value_name = "SPLIT_STORE_PATH",
139
        help = "Optional path for persisted split/epoch state file"
140
    )]
141
    split_store_path: Option<PathBuf>,
142
    #[arg(
143
        long = "split-store-dir",
144
        value_name = "DIR",
145
        conflicts_with = "split_store_path",
146
        help = "Optional directory for persisted split/epoch state file (uses split_store.bin filename)"
147
    )]
148
    split_store_dir: Option<PathBuf>,
149
}
150

151
#[derive(Debug, Clone)]
152
/// Source-level inventory used by capacity estimation output.
153
struct SourceInventory {
154
    source_id: String,
155
    reported_records: u128,
156
    triplet_recipes: Vec<TripletRecipe>,
157
}
158

159
/// Run the capacity-estimation CLI with injectable root resolution/source builders.
160
///
161
/// `build_sources` is construction-only; sampler configuration is applied
162
/// centrally by this function before any source calls.
163
pub fn run_estimate_capacity<R, Resolve, Build, I>(
3✔
164
    args_iter: I,
3✔
165
    resolve_roots: Resolve,
3✔
166
    build_sources: Build,
3✔
167
) -> Result<(), Box<dyn Error>>
3✔
168
where
3✔
169
    Resolve: FnOnce(Vec<String>) -> Result<R, Box<dyn Error>>,
3✔
170
    Build: FnOnce(&R) -> Vec<DynSource>,
3✔
171
    I: Iterator<Item = String>,
3✔
172
{
173
    init_example_tracing();
3✔
174

175
    let Some(cli) = parse_cli::<EstimateCapacityCli, _>(
3✔
176
        std::iter::once("estimate_capacity".to_string()).chain(args_iter),
3✔
177
    )?
×
178
    else {
179
        return Ok(());
×
180
    };
181

182
    let roots = resolve_roots(cli.source_roots)?;
3✔
183

184
    let config = SamplerConfig {
3✔
185
        seed: cli.seed,
3✔
186
        split: cli.split,
3✔
187
        ..SamplerConfig::default()
3✔
188
    };
3✔
189

190
    let sources = build_sources(&roots);
3✔
191

192
    let mut inventories = Vec::new();
3✔
193
    for source in &sources {
3✔
194
        let recipes = if config.recipes.is_empty() {
3✔
195
            source.default_triplet_recipes()
3✔
196
        } else {
197
            config.recipes.clone()
×
198
        };
199
        let reported_records = source.reported_record_count(&config).map_err(|err| {
3✔
200
            format!(
1✔
201
                "source '{}' failed to report exact record count: {err}",
202
                source.id()
1✔
203
            )
204
        })?;
1✔
205
        inventories.push(SourceInventory {
2✔
206
            source_id: source.id().to_string(),
2✔
207
            reported_records,
2✔
208
            triplet_recipes: recipes,
2✔
209
        });
2✔
210
    }
211

212
    let mut per_source_split_counts: HashMap<(String, SplitLabel), u128> = HashMap::new();
2✔
213
    let mut split_record_counts: HashMap<SplitLabel, u128> = HashMap::new();
2✔
214

215
    for source in &inventories {
2✔
216
        let counts = split_counts_for_total(source.reported_records, cli.split);
2✔
217
        for (label, count) in counts {
6✔
218
            per_source_split_counts.insert((source.source_id.clone(), label), count);
6✔
219
            *split_record_counts.entry(label).or_insert(0) += count;
6✔
220
        }
6✔
221
    }
222

223
    let mut totals_by_split: HashMap<SplitLabel, CapacityTotals> = HashMap::new();
2✔
224
    let mut totals_by_source_and_split: HashMap<(String, SplitLabel), CapacityTotals> =
2✔
225
        HashMap::new();
2✔
226

227
    for split_label in [SplitLabel::Train, SplitLabel::Validation, SplitLabel::Test] {
6✔
228
        let mut totals = CapacityTotals::default();
6✔
229

230
        for source in &inventories {
6✔
231
            let source_split_records = per_source_split_counts
6✔
232
                .get(&(source.source_id.clone(), split_label))
6✔
233
                .copied()
6✔
234
                .unwrap_or(0);
6✔
235

6✔
236
            let triplet_recipes = &source.triplet_recipes;
6✔
237
            let text_recipes = resolve_text_recipes_for_source(&config, triplet_recipes);
6✔
238

6✔
239
            let capacity = estimate_source_split_capacity_from_counts(
6✔
240
                source_split_records,
6✔
241
                triplet_recipes,
6✔
242
                &text_recipes,
6✔
243
            );
6✔
244

6✔
245
            totals_by_source_and_split.insert((source.source_id.clone(), split_label), capacity);
6✔
246

6✔
247
            totals.triplets += capacity.triplets;
6✔
248
            totals.effective_triplets += capacity.effective_triplets;
6✔
249
            totals.pairs += capacity.pairs;
6✔
250
            totals.text_samples += capacity.text_samples;
6✔
251
        }
6✔
252

253
        totals_by_split.insert(split_label, totals);
6✔
254
    }
255

256
    println!("=== capacity estimate (length-only) ===");
2✔
257
    println!("mode: metadata-only (no source.refresh calls)");
2✔
258
    println!("classification: heuristic approximation (not exact)");
2✔
259
    println!("split seed: {}", cli.seed);
2✔
260
    println!(
2✔
261
        "split ratios: train={:.4}, validation={:.4}, test={:.4}",
262
        cli.split.train, cli.split.validation, cli.split.test
263
    );
264
    println!();
2✔
265

266
    println!("[SOURCES]");
2✔
267
    for source in &inventories {
2✔
268
        println!(
2✔
269
            "  {} => reported records: {}",
2✔
270
            source.source_id,
2✔
271
            format_u128_with_commas(source.reported_records)
2✔
272
        );
2✔
273
    }
2✔
274
    println!();
2✔
275

276
    println!("[PER SOURCE BREAKDOWN]");
2✔
277
    for source in &inventories {
2✔
278
        println!("  {}", source.source_id);
2✔
279
        let mut source_grand = CapacityTotals::default();
2✔
280
        let mut source_total_records = 0u128;
2✔
281
        for split_label in [SplitLabel::Train, SplitLabel::Validation, SplitLabel::Test] {
6✔
282
            let split_records = per_source_split_counts
6✔
283
                .get(&(source.source_id.clone(), split_label))
6✔
284
                .copied()
6✔
285
                .unwrap_or(0);
6✔
286
            source_total_records = source_total_records.saturating_add(split_records);
6✔
287
            let split_longest_records = inventories
6✔
288
                .iter()
6✔
289
                .map(|candidate| {
6✔
290
                    per_source_split_counts
6✔
291
                        .get(&(candidate.source_id.clone(), split_label))
6✔
292
                        .copied()
6✔
293
                        .unwrap_or(0)
6✔
294
                })
6✔
295
                .max()
6✔
296
                .unwrap_or(0);
6✔
297
            let totals = totals_by_source_and_split
6✔
298
                .get(&(source.source_id.clone(), split_label))
6✔
299
                .copied()
6✔
300
                .unwrap_or_default();
6✔
301
            source_grand.triplets += totals.triplets;
6✔
302
            source_grand.effective_triplets += totals.effective_triplets;
6✔
303
            source_grand.pairs += totals.pairs;
6✔
304
            source_grand.text_samples += totals.text_samples;
6✔
305
            println!("    [{:?}]", split_label);
6✔
306
            println!("      records: {}", format_u128_with_commas(split_records));
6✔
307
            println!(
6✔
308
                "      triplet combinations: {}",
309
                format_u128_with_commas(totals.triplets)
6✔
310
            );
311
            println!(
6✔
312
                "      effective sampled triplets (p={}, k={}): {}",
313
                EFFECTIVE_POSITIVES_PER_ANCHOR,
314
                EFFECTIVE_NEGATIVES_PER_ANCHOR,
315
                format_u128_with_commas(totals.effective_triplets)
6✔
316
            );
317
            println!(
6✔
318
                "      pair combinations:    {}",
319
                format_u128_with_commas(totals.pairs)
6✔
320
            );
321
            println!(
6✔
322
                "      text samples:         {}",
323
                format_u128_with_commas(totals.text_samples)
6✔
324
            );
325
            println!(
6✔
326
                "      replay factor vs longest source: {}",
327
                format_replay_factor(split_longest_records, split_records)
6✔
328
            );
329
        }
330
        let longest_source_total = inventories
2✔
331
            .iter()
2✔
332
            .map(|candidate| candidate.reported_records)
2✔
333
            .max()
2✔
334
            .unwrap_or(0);
2✔
335
        println!("    [ALL SPLITS FOR SOURCE]");
2✔
336
        println!(
2✔
337
            "      triplet combinations: {}",
338
            format_u128_with_commas(source_grand.triplets)
2✔
339
        );
340
        println!(
2✔
341
            "      effective sampled triplets (p={}, k={}): {}",
342
            EFFECTIVE_POSITIVES_PER_ANCHOR,
343
            EFFECTIVE_NEGATIVES_PER_ANCHOR,
344
            format_u128_with_commas(source_grand.effective_triplets)
2✔
345
        );
346
        println!(
2✔
347
            "      pair combinations:    {}",
348
            format_u128_with_commas(source_grand.pairs)
2✔
349
        );
350
        println!(
2✔
351
            "      text samples:         {}",
352
            format_u128_with_commas(source_grand.text_samples)
2✔
353
        );
354
        println!(
2✔
355
            "      replay factor vs longest source: {}",
356
            format_replay_factor(longest_source_total, source_total_records)
2✔
357
        );
358
        println!();
2✔
359
    }
360

361
    let mut grand = CapacityTotals::default();
2✔
362
    for split_label in [SplitLabel::Train, SplitLabel::Validation, SplitLabel::Test] {
6✔
363
        let record_count = split_record_counts.get(&split_label).copied().unwrap_or(0);
6✔
364
        let totals = totals_by_split
6✔
365
            .get(&split_label)
6✔
366
            .copied()
6✔
367
            .unwrap_or_default();
6✔
368

6✔
369
        grand.triplets += totals.triplets;
6✔
370
        grand.effective_triplets += totals.effective_triplets;
6✔
371
        grand.pairs += totals.pairs;
6✔
372
        grand.text_samples += totals.text_samples;
6✔
373

6✔
374
        println!("[{:?}]", split_label);
6✔
375
        println!("  records: {}", format_u128_with_commas(record_count));
6✔
376
        println!(
6✔
377
            "  triplet combinations: {}",
6✔
378
            format_u128_with_commas(totals.triplets)
6✔
379
        );
6✔
380
        println!(
6✔
381
            "  effective sampled triplets (p={}, k={}): {}",
6✔
382
            EFFECTIVE_POSITIVES_PER_ANCHOR,
6✔
383
            EFFECTIVE_NEGATIVES_PER_ANCHOR,
6✔
384
            format_u128_with_commas(totals.effective_triplets)
6✔
385
        );
6✔
386
        println!(
6✔
387
            "  pair combinations:    {}",
6✔
388
            format_u128_with_commas(totals.pairs)
6✔
389
        );
6✔
390
        println!(
6✔
391
            "  text samples:         {}",
6✔
392
            format_u128_with_commas(totals.text_samples)
6✔
393
        );
6✔
394
        println!();
6✔
395
    }
6✔
396

397
    println!("[ALL SPLITS TOTAL]");
2✔
398
    println!(
2✔
399
        "  triplet combinations: {}",
400
        format_u128_with_commas(grand.triplets)
2✔
401
    );
402
    println!(
2✔
403
        "  effective sampled triplets (p={}, k={}): {}",
404
        EFFECTIVE_POSITIVES_PER_ANCHOR,
405
        EFFECTIVE_NEGATIVES_PER_ANCHOR,
406
        format_u128_with_commas(grand.effective_triplets)
2✔
407
    );
408
    println!(
2✔
409
        "  pair combinations:    {}",
410
        format_u128_with_commas(grand.pairs)
2✔
411
    );
412
    println!(
2✔
413
        "  text samples:         {}",
414
        format_u128_with_commas(grand.text_samples)
2✔
415
    );
416
    println!();
2✔
417
    println!(
2✔
418
        "Note: counts are heuristic, length-based estimates from source-reported totals and recipe structure. They are approximate, not exact, and assume anchor-positive pairs=records (one positive per anchor by default), negatives=source_records_in_split-1 (anchor excluded as its own negative), and at most one chunk/window realization per sample. In real-world chunked sampling, practical combinations are often higher, so treat this as a floor-like baseline."
419
    );
420
    println!(
2✔
421
        "Effective sampled triplets apply a bounded training assumption: effective_triplets = records * p * k per triplet recipe, with defaults p={} positives per anchor and k={} negatives per anchor.",
422
        EFFECTIVE_POSITIVES_PER_ANCHOR, EFFECTIVE_NEGATIVES_PER_ANCHOR
423
    );
424
    println!(
2✔
425
        "Oversample loops are not inferred from this static report. To measure true oversampling (how many times sampling loops through the combination space), use observed sampled draw counts from an actual run."
426
    );
427

428
    Ok(())
2✔
429
}
3✔
430

431
/// Run the multi-source demo CLI with injectable root resolution/source builders.
432
///
433
/// `build_sources` is construction-only. Source sampler configuration is owned
434
/// by sampler registration (`TripletSampler::register_source`).
435
pub fn run_multi_source_demo<R, Resolve, Build, I>(
7✔
436
    args_iter: I,
7✔
437
    resolve_roots: Resolve,
7✔
438
    build_sources: Build,
7✔
439
) -> Result<(), Box<dyn Error>>
7✔
440
where
7✔
441
    Resolve: FnOnce(Vec<String>) -> Result<R, Box<dyn Error>>,
7✔
442
    Build: FnOnce(&R) -> Vec<DynSource>,
7✔
443
    I: Iterator<Item = String>,
7✔
444
{
445
    init_example_tracing();
7✔
446

447
    let Some(cli) = parse_cli::<MultiSourceDemoCli, _>(
7✔
448
        std::iter::once("multi_source_demo".to_string()).chain(args_iter),
7✔
449
    )?
×
450
    else {
451
        return Ok(());
×
452
    };
453

454
    let roots = resolve_roots(cli.source_roots)?;
7✔
455

456
    let mut config = SamplerConfig::default();
7✔
457
    config.seed = cli.seed.unwrap_or(config.seed);
7✔
458
    config.batch_size = cli.batch_size;
7✔
459
    config.chunking = Default::default();
7✔
460
    let selected_split = cli.split.map(Into::into).unwrap_or(SplitLabel::Train);
7✔
461
    config.split = SplitRatios::default();
7✔
462
    config.allowed_splits = vec![selected_split];
7✔
463
    let chunking = config.chunking.clone();
7✔
464

465
    let split_store_path = if let Some(path) = cli.split_store_path {
7✔
466
        path
×
467
    } else if let Some(dir) = cli.split_store_dir {
7✔
468
        FileSplitStore::default_path_in_dir(dir)
7✔
469
    } else {
470
        FileSplitStore::default_path()
×
471
    };
472

473
    println!(
7✔
474
        "Persisting split assignments and epoch state to {}",
475
        split_store_path.display()
7✔
476
    );
477
    let sources = build_sources(&roots);
7✔
478
    let split_store = Arc::new(FileSplitStore::open(&split_store_path, config.split, 99)?);
7✔
479
    let sampler = TripletSampler::new(config, split_store.clone());
7✔
480
    for source in sources {
7✔
481
        sampler.register_source(source);
7✔
482
    }
7✔
483

484
    if cli.show_pair_samples {
7✔
485
        match sampler.next_pair_batch(selected_split) {
2✔
486
            Ok(pair_batch) => {
×
487
                if pair_batch.pairs.is_empty() {
×
488
                    println!("Pair sampling produced no results.");
×
489
                } else {
×
490
                    print_pair_batch(&chunking, &pair_batch, split_store.as_ref());
×
491
                }
×
492
                sampler.persist_state()?;
×
493
            }
494
            Err(SamplerError::Exhausted(name)) => {
2✔
495
                eprintln!(
2✔
496
                    "Pair sampler exhausted recipe '{}'. Ensure both positive and negative examples exist.",
2✔
497
                    name
2✔
498
                );
2✔
499
            }
2✔
500
            Err(err) => return Err(err.into()),
×
501
        }
502
    } else if cli.show_text_samples {
5✔
503
        match sampler.next_text_batch(selected_split) {
2✔
504
            Ok(text_batch) => {
×
505
                if text_batch.samples.is_empty() {
×
506
                    println!(
×
507
                        "Text sampling produced no results. Ensure each source has eligible sections."
×
508
                    );
×
509
                } else {
×
510
                    print_text_batch(&chunking, &text_batch, split_store.as_ref());
×
511
                }
×
512
                sampler.persist_state()?;
×
513
            }
514
            Err(SamplerError::Exhausted(name)) => {
2✔
515
                eprintln!(
2✔
516
                    "Text sampler exhausted selector '{}'. Ensure matching sections exist.",
2✔
517
                    name
2✔
518
                );
2✔
519
            }
2✔
520
            Err(err) => return Err(err.into()),
×
521
        }
522
    } else if cli.list_text_recipes {
3✔
523
        let recipes = sampler.text_recipes();
1✔
524
        if recipes.is_empty() {
1✔
525
            println!(
×
526
                "No text recipes registered. Ensure your sources expose triplet selectors or configure text_recipes explicitly."
×
527
            );
×
528
        } else {
1✔
529
            print_text_recipes(&recipes);
1✔
530
        }
1✔
531
    } else {
532
        match sampler.next_triplet_batch(selected_split) {
2✔
533
            Ok(triplet_batch) => {
×
534
                if triplet_batch.triplets.is_empty() {
×
535
                    println!(
×
536
                        "Triplet sampling produced no results. Ensure multiple records per source exist."
×
537
                    );
×
538
                } else {
×
539
                    print_triplet_batch(&chunking, &triplet_batch, split_store.as_ref());
×
540
                }
×
541
                sampler.persist_state()?;
×
542
            }
543
            Err(SamplerError::Exhausted(name)) => {
2✔
544
                eprintln!(
2✔
545
                    "Triplet sampler exhausted recipe '{}'. Ensure both positive and negative examples exist.",
2✔
546
                    name
2✔
547
                );
2✔
548
            }
2✔
549
            Err(err) => return Err(err.into()),
×
550
        }
551
    }
552

553
    Ok(())
7✔
554
}
7✔
555

556
/// Parses the `--batch-size` value as a strictly positive `usize`.
///
/// Surrounding whitespace is ignored, in line with how `parse_split_ratios_arg`
/// trims each of its components.
///
/// # Errors
///
/// Returns a human-readable message when the value is not an unsigned integer
/// or when it is zero.
fn parse_positive_usize(raw: &str) -> Result<usize, String> {
    match raw.trim().parse::<usize>() {
        Ok(0) => Err("--batch-size must be greater than zero".to_string()),
        Ok(parsed) => Ok(parsed),
        // The message echoes the value exactly as the user typed it.
        Err(_) => Err(format!(
            "Could not parse --batch-size value '{}' as a positive integer",
            raw
        )),
    }
}
11✔
568

569
fn parse_cli<T, I>(args: I) -> Result<Option<T>, Box<dyn Error>>
14✔
570
where
14✔
571
    T: Parser,
14✔
572
    I: IntoIterator,
14✔
573
    I::Item: Into<std::ffi::OsString> + Clone,
14✔
574
{
575
    match T::try_parse_from(args) {
14✔
576
        Ok(cli) => Ok(Some(cli)),
10✔
577
        Err(err) => match err.kind() {
4✔
578
            ErrorKind::DisplayHelp | ErrorKind::DisplayVersion => {
579
                err.print()?;
2✔
580
                Ok(None)
2✔
581
            }
582
            _ => Err(err.into()),
2✔
583
        },
584
    }
585
}
14✔
586

587
fn parse_split_ratios_arg(raw: &str) -> Result<SplitRatios, String> {
10✔
588
    let parts: Vec<&str> = raw.split(',').collect();
10✔
589
    if parts.len() != 3 {
10✔
590
        return Err("--split-ratios expects exactly 3 comma-separated values".to_string());
1✔
591
    }
9✔
592
    let train = parts[0]
9✔
593
        .trim()
9✔
594
        .parse::<f32>()
9✔
595
        .map_err(|_| format!("invalid train ratio '{}': must be a float", parts[0].trim()))?;
9✔
596
    let validation = parts[1].trim().parse::<f32>().map_err(|_| {
8✔
597
        format!(
1✔
598
            "invalid validation ratio '{}': must be a float",
599
            parts[1].trim()
1✔
600
        )
601
    })?;
1✔
602
    let test = parts[2]
7✔
603
        .trim()
7✔
604
        .parse::<f32>()
7✔
605
        .map_err(|_| format!("invalid test ratio '{}': must be a float", parts[2].trim()))?;
7✔
606
    let ratios = SplitRatios {
6✔
607
        train,
6✔
608
        validation,
6✔
609
        test,
6✔
610
    };
6✔
611
    let sum = ratios.train + ratios.validation + ratios.test;
6✔
612
    if (sum - 1.0).abs() > 1e-5 {
6✔
613
        return Err(format!(
1✔
614
            "split ratios must sum to 1.0, got {:.6} (train={}, validation={}, test={})",
1✔
615
            sum, ratios.train, ratios.validation, ratios.test
1✔
616
        ));
1✔
617
    }
5✔
618
    if ratios.train < 0.0 || ratios.validation < 0.0 || ratios.test < 0.0 {
5✔
619
        return Err("split ratios must be non-negative".to_string());
1✔
620
    }
4✔
621
    Ok(ratios)
4✔
622
}
10✔
623

624
fn print_triplet_batch(
1✔
625
    strategy: &ChunkingStrategy,
1✔
626
    batch: &TripletBatch,
1✔
627
    split_store: &impl SplitStore,
1✔
628
) {
1✔
629
    println!("=== triplet batch ===");
1✔
630
    for (idx, triplet) in batch.triplets.iter().enumerate() {
1✔
631
        println!("--- triplet #{} ---", idx);
1✔
632
        println!("recipe       : {}", triplet.recipe);
1✔
633
        println!("sample_weight: {:.4}", triplet.weight);
1✔
634
        if let Some(instr) = &triplet.instruction {
1✔
635
            println!("instruction shown to model:\n{}\n", instr);
1✔
636
        }
1✔
637
        print_chunk_block("ANCHOR", &triplet.anchor, strategy, split_store);
1✔
638
        print_chunk_block("POSITIVE", &triplet.positive, strategy, split_store);
1✔
639
        print_chunk_block("NEGATIVE", &triplet.negative, strategy, split_store);
1✔
640
    }
641
    print_source_summary(
1✔
642
        "triplet anchors",
1✔
643
        batch
1✔
644
            .triplets
1✔
645
            .iter()
1✔
646
            .map(|triplet| triplet.anchor.record_id.as_str()),
1✔
647
    );
648
    print_recipe_summary_by_source(
1✔
649
        "triplet recipes by source",
1✔
650
        batch
1✔
651
            .triplets
1✔
652
            .iter()
1✔
653
            .map(|triplet| (triplet.anchor.record_id.as_str(), triplet.recipe.as_str())),
1✔
654
    );
655
}
1✔
656

657
fn print_text_batch(strategy: &ChunkingStrategy, batch: &TextBatch, split_store: &impl SplitStore) {
1✔
658
    println!("=== text batch ===");
1✔
659
    for (idx, sample) in batch.samples.iter().enumerate() {
1✔
660
        println!("--- sample #{} ---", idx);
1✔
661
        println!("recipe       : {}", sample.recipe);
1✔
662
        println!("sample_weight: {:.4}", sample.weight);
1✔
663
        if let Some(instr) = &sample.instruction {
1✔
664
            println!("instruction shown to model:\n{}\n", instr);
1✔
665
        }
1✔
666
        print_chunk_block("TEXT", &sample.chunk, strategy, split_store);
1✔
667
    }
668
    print_source_summary(
1✔
669
        "text samples",
1✔
670
        batch
1✔
671
            .samples
1✔
672
            .iter()
1✔
673
            .map(|sample| sample.chunk.record_id.as_str()),
1✔
674
    );
675
    print_recipe_summary_by_source(
1✔
676
        "text recipes by source",
1✔
677
        batch
1✔
678
            .samples
1✔
679
            .iter()
1✔
680
            .map(|sample| (sample.chunk.record_id.as_str(), sample.recipe.as_str())),
1✔
681
    );
682
}
1✔
683

684
fn print_pair_batch(
1✔
685
    strategy: &ChunkingStrategy,
1✔
686
    batch: &SampleBatch,
1✔
687
    split_store: &impl SplitStore,
1✔
688
) {
1✔
689
    println!("=== pair batch ===");
1✔
690
    for (idx, pair) in batch.pairs.iter().enumerate() {
1✔
691
        println!("--- pair #{} ---", idx);
1✔
692
        println!("recipe       : {}", pair.recipe);
1✔
693
        println!("label        : {:?}", pair.label);
1✔
694
        if let Some(reason) = &pair.reason {
1✔
695
            println!("reason       : {}", reason);
1✔
696
        }
1✔
697
        print_chunk_block("ANCHOR", &pair.anchor, strategy, split_store);
1✔
698
        print_chunk_block("OTHER", &pair.positive, strategy, split_store);
1✔
699
    }
700
    print_source_summary(
1✔
701
        "pair anchors",
1✔
702
        batch
1✔
703
            .pairs
1✔
704
            .iter()
1✔
705
            .map(|pair| pair.anchor.record_id.as_str()),
1✔
706
    );
707
    print_recipe_summary_by_source(
1✔
708
        "pair recipes by source",
1✔
709
        batch
1✔
710
            .pairs
1✔
711
            .iter()
1✔
712
            .map(|pair| (pair.anchor.record_id.as_str(), pair.recipe.as_str())),
1✔
713
    );
714
}
1✔
715

716
fn print_text_recipes(recipes: &[TextRecipe]) {
2✔
717
    println!("=== available text recipes ===");
2✔
718
    for recipe in recipes {
4✔
719
        println!(
4✔
720
            "- {} (weight: {:.3}) selector={:?}",
721
            recipe.name, recipe.weight, recipe.selector
722
        );
723
        if let Some(instr) = &recipe.instruction {
4✔
724
            println!("  instruction: {}", instr);
1✔
725
        }
3✔
726
    }
727
}
2✔
728

729
trait ChunkDebug {
730
    fn view_name(&self) -> String;
731
}
732

733
impl ChunkDebug for RecordChunk {
734
    fn view_name(&self) -> String {
6✔
735
        match &self.view {
6✔
736
            ChunkView::Window {
737
                index,
4✔
738
                span,
4✔
739
                overlap,
4✔
740
                start_ratio,
4✔
741
            } => format!(
4✔
742
                "window#index={} span={} overlap={} start_ratio={:.3} tokens={}",
743
                index, span, overlap, start_ratio, self.tokens_estimate
744
            ),
745
            ChunkView::SummaryFallback { strategy, .. } => {
2✔
746
                format!("summary:{} tokens={}", strategy, self.tokens_estimate)
2✔
747
            }
748
        }
749
    }
6✔
750
}
751

752
fn print_chunk_block(
6✔
753
    title: &str,
6✔
754
    chunk: &RecordChunk,
6✔
755
    strategy: &ChunkingStrategy,
6✔
756
    split_store: &impl SplitStore,
6✔
757
) {
6✔
758
    let chunk_weight = chunk_weight(strategy, chunk);
6✔
759
    let split = split_store
6✔
760
        .label_for(&chunk.record_id)
6✔
761
        .map(|label| format!("{:?}", label))
6✔
762
        .unwrap_or_else(|| "Unknown".to_string());
6✔
763
    println!("--- {} ---", title);
6✔
764
    println!("split        : {}", split);
6✔
765
    println!("view         : {}", chunk.view_name());
6✔
766
    println!("chunk_weight : {:.4}", chunk_weight);
6✔
767
    println!("record_id    : {}", chunk.record_id);
6✔
768
    println!("section_idx  : {}", chunk.section_idx);
6✔
769
    println!("token_est    : {}", chunk.tokens_estimate);
6✔
770
    println!("model_input (exact text sent to the model):");
6✔
771
    println!(
6✔
772
        "<<< BEGIN MODEL TEXT >>>\n{}\n<<< END MODEL TEXT >>>\n",
773
        chunk.text
774
    );
775
}
6✔
776

777
fn print_source_summary<'a, I>(label: &str, ids: I)
3✔
778
where
3✔
779
    I: Iterator<Item = &'a str>,
3✔
780
{
781
    let mut counts: HashMap<SourceId, usize> = HashMap::new();
3✔
782
    for id in ids {
3✔
783
        let source = extract_source(id);
3✔
784
        *counts.entry(source).or_insert(0) += 1;
3✔
785
    }
3✔
786
    if counts.is_empty() {
3✔
787
        return;
×
788
    }
3✔
789
    let skew = source_skew(&counts);
3✔
790
    let mut entries: Vec<(String, usize)> = counts.into_iter().collect();
3✔
791
    entries.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
3✔
792
    println!("--- {} by source ---", label);
3✔
793
    if let Some(skew) = skew {
3✔
794
        for entry in &skew.per_source {
3✔
795
            println!(
3✔
796
                "{}: count={} share={:.2}",
3✔
797
                entry.source, entry.count, entry.share
3✔
798
            );
3✔
799
        }
3✔
800
        println!(
3✔
801
            "skew: sources={} total={} min={} max={} mean={:.2} ratio={:.2}",
802
            skew.sources, skew.total, skew.min, skew.max, skew.mean, skew.ratio
803
        );
804
    } else {
805
        for (source, count) in &entries {
×
806
            println!("{source}: count={count}");
×
807
        }
×
808
    }
809
}
3✔
810

811
fn print_recipe_summary_by_source<'a, I>(label: &str, entries: I)
3✔
812
where
3✔
813
    I: Iterator<Item = (&'a str, &'a str)>,
3✔
814
{
815
    let mut counts: HashMap<SourceId, HashMap<String, usize>> = HashMap::new();
3✔
816
    for (record_id, recipe) in entries {
3✔
817
        let source = extract_source(record_id);
3✔
818
        let entry = counts
3✔
819
            .entry(source)
3✔
820
            .or_default()
3✔
821
            .entry(recipe.to_string())
3✔
822
            .or_insert(0);
3✔
823
        *entry += 1;
3✔
824
    }
3✔
825
    if counts.is_empty() {
3✔
826
        return;
×
827
    }
3✔
828
    let mut sources: Vec<(SourceId, HashMap<String, usize>)> = counts.into_iter().collect();
3✔
829
    sources.sort_by(|a, b| a.0.cmp(&b.0));
3✔
830
    println!("--- {} ---", label);
3✔
831
    for (source, recipes) in sources {
3✔
832
        println!("{source}");
3✔
833
        let mut entries: Vec<(String, usize)> = recipes.into_iter().collect();
3✔
834
        entries.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
3✔
835
        for (recipe, count) in entries {
3✔
836
            println!("  - {recipe}={count}");
3✔
837
        }
3✔
838
    }
839
}
3✔
840

841
fn extract_source(record_id: &str) -> SourceId {
8✔
842
    record_id
8✔
843
        .split_once("::")
8✔
844
        .map(|(source, _)| source.to_string())
8✔
845
        .unwrap_or_else(|| "unknown".to_string())
8✔
846
}
8✔
847

848
#[cfg(test)]
849
mod tests {
850
    use super::*;
851
    use crate::DeterministicSplitStore;
852
    use crate::data::SectionRole;
853
    use crate::source::{SourceCursor, SourceSnapshot};
854
    use chrono::Utc;
855
    use tempfile::tempdir;
856

857
    /// Minimal in-memory `DataSource` test double for example app tests.
858
    struct TestSource {
859
        id: String,
860
        count: Option<u128>,
861
        recipes: Vec<TripletRecipe>,
862
    }
863

864
    impl DataSource for TestSource {
865
        fn id(&self) -> &str {
129✔
866
            &self.id
129✔
867
        }
129✔
868

869
        fn refresh(
30✔
870
            &self,
30✔
871
            _config: &SamplerConfig,
30✔
872
            _cursor: Option<&SourceCursor>,
30✔
873
            _limit: Option<usize>,
30✔
874
        ) -> Result<SourceSnapshot, SamplerError> {
30✔
875
            Ok(SourceSnapshot {
30✔
876
                records: Vec::new(),
30✔
877
                cursor: SourceCursor {
30✔
878
                    last_seen: Utc::now(),
30✔
879
                    revision: 0,
30✔
880
                },
30✔
881
            })
30✔
882
        }
30✔
883

884
        fn reported_record_count(&self, _config: &SamplerConfig) -> Result<u128, SamplerError> {
2✔
885
            self.count.ok_or_else(|| SamplerError::SourceInconsistent {
2✔
886
                source_id: self.id.clone(),
1✔
887
                details: "test source has no configured exact count".to_string(),
1✔
888
            })
1✔
889
        }
2✔
890

891
        fn default_triplet_recipes(&self) -> Vec<TripletRecipe> {
9✔
892
            self.recipes.clone()
9✔
893
        }
9✔
894
    }
895

896
    struct ConfigRequiredSource {
897
        id: String,
898
        expected_seed: u64,
899
    }
900

901
    impl DataSource for ConfigRequiredSource {
902
        fn id(&self) -> &str {
1✔
903
            &self.id
1✔
904
        }
1✔
905

NEW
906
        fn refresh(
×
NEW
907
            &self,
×
NEW
908
            _config: &SamplerConfig,
×
NEW
909
            _cursor: Option<&SourceCursor>,
×
NEW
910
            _limit: Option<usize>,
×
NEW
911
        ) -> Result<SourceSnapshot, SamplerError> {
×
NEW
912
            Ok(SourceSnapshot {
×
NEW
913
                records: Vec::new(),
×
NEW
914
                cursor: SourceCursor {
×
NEW
915
                    last_seen: Utc::now(),
×
NEW
916
                    revision: 0,
×
NEW
917
                },
×
NEW
918
            })
×
NEW
919
        }
×
920

921
        fn reported_record_count(&self, config: &SamplerConfig) -> Result<u128, SamplerError> {
1✔
922
            if config.seed == self.expected_seed {
1✔
923
                Ok(1)
1✔
924
            } else {
NEW
925
                Err(SamplerError::SourceInconsistent {
×
NEW
926
                    source_id: self.id.clone(),
×
NEW
927
                    details: format!(
×
NEW
928
                        "expected sampler seed {} but got {}",
×
NEW
929
                        self.expected_seed, config.seed
×
NEW
930
                    ),
×
NEW
931
                })
×
932
            }
933
        }
1✔
934

935
        fn default_triplet_recipes(&self) -> Vec<TripletRecipe> {
1✔
936
            Vec::new()
1✔
937
        }
1✔
938
    }
939

940
    fn default_recipe(name: &str) -> TripletRecipe {
6✔
941
        TripletRecipe {
6✔
942
            name: name.to_string().into(),
6✔
943
            anchor: crate::config::Selector::Role(SectionRole::Anchor),
6✔
944
            positive_selector: crate::config::Selector::Role(SectionRole::Context),
6✔
945
            negative_selector: crate::config::Selector::Role(SectionRole::Context),
6✔
946
            negative_strategy: crate::config::NegativeStrategy::WrongArticle,
6✔
947
            weight: 1.0,
6✔
948
            instruction: None,
6✔
949
        }
6✔
950
    }
6✔
951

952
    /// Valid inputs parse; zero, non-numeric, short, over-unit and negative
    /// inputs are rejected.
    #[test]
    fn parse_helpers_validate_inputs() {
        assert_eq!(parse_positive_usize("2").unwrap(), 2);
        for rejected in ["0", "abc"] {
            assert!(parse_positive_usize(rejected).is_err());
        }

        let ratios = parse_split_ratios_arg("0.8,0.1,0.1").unwrap();
        assert!((ratios.train - 0.8).abs() < 1e-6);
        for rejected in ["0.8,0.1", "1.0,0.0,0.1", "-0.1,0.6,0.5"] {
            assert!(parse_split_ratios_arg(rejected).is_err());
        }
    }
1✔
964

965
    /// `--help` parses to `Ok(None)`, while an unknown flag is an error.
    #[test]
    fn parse_cli_handles_help_and_invalid_args() {
        let help_outcome = parse_cli::<EstimateCapacityCli, _>(["estimate_capacity", "--help"]);
        assert!(help_outcome.unwrap().is_none());

        let unknown_flag = parse_cli::<EstimateCapacityCli, _>(["estimate_capacity", "--unknown"]);
        assert!(unknown_flag.is_err());
    }
1✔
973

974
    /// With no CLI arguments and a source reporting a count, the run succeeds
    /// and the roots passed to the first callback are empty.
    #[test]
    fn run_estimate_capacity_succeeds_with_reported_counts() {
        let outcome = run_estimate_capacity(
            std::iter::empty::<String>(),
            |roots| {
                assert!(roots.is_empty());
                Ok(())
            },
            |_| {
                let source: DynSource = Box::new(TestSource {
                    id: "source_a".into(),
                    count: Some(12),
                    recipes: vec![default_recipe("r1")],
                });
                vec![source]
            },
        );

        assert!(outcome.is_ok());
    }
1✔
993

994
    /// A source without a configured count makes the run fail with a message
    /// mentioning the missing exact record count.
    #[test]
    fn run_estimate_capacity_errors_when_source_count_missing() {
        let outcome = run_estimate_capacity(
            std::iter::empty::<String>(),
            |_| Ok(()),
            |_| {
                let source: DynSource = Box::new(TestSource {
                    id: "source_missing".into(),
                    count: None,
                    recipes: vec![default_recipe("r1")],
                });
                vec![source]
            },
        );

        let message = outcome.unwrap_err().to_string();
        assert!(message.contains("failed to report exact record count"));
    }
1✔
1011

1012
    /// A source that only reports a count for sampler seed 99 still lets the
    /// run succeed.
    #[test]
    fn run_estimate_capacity_configures_sources_centrally_before_counting() {
        let outcome = run_estimate_capacity(
            std::iter::empty::<String>(),
            |_| Ok(()),
            |_| {
                let source: DynSource = Box::new(ConfigRequiredSource {
                    id: "requires_config".into(),
                    expected_seed: 99,
                });
                vec![source]
            },
        );

        assert!(outcome.is_ok());
    }
1✔
1027

1028
    /// `--help` parses to `Ok(None)`, while `--batch-size 0` is rejected.
    #[test]
    fn parse_multi_source_cli_handles_help_and_batch_size_validation() {
        let help_outcome = parse_cli::<MultiSourceDemoCli, _>(["multi_source_demo", "--help"]);
        assert!(help_outcome.unwrap().is_none());

        let zero_batch =
            parse_cli::<MultiSourceDemoCli, _>(["multi_source_demo", "--batch-size", "0"]);
        assert!(zero_batch.is_err());
    }
1✔
1036

1037
    /// Listing text recipes against a temporary split-store directory succeeds.
    #[test]
    fn run_multi_source_demo_list_text_recipes_path_succeeds() {
        let dir = tempdir().unwrap();
        let args = vec![
            "--list-text-recipes".to_string(),
            "--split-store-dir".to_string(),
            dir.path().to_string_lossy().to_string(),
        ];
        let outcome = run_multi_source_demo(
            args.into_iter(),
            |_| Ok(()),
            |_| {
                let source: DynSource = Box::new(TestSource {
                    id: "source_for_recipes".into(),
                    count: Some(10),
                    recipes: vec![default_recipe("recipe_a")],
                });
                vec![source]
            },
        );

        assert!(outcome.is_ok());
    }
1✔
1059

1060
    /// Pair, text and default sampling modes all succeed on the validation
    /// split when the only source yields no records.
    #[test]
    fn run_multi_source_demo_sampling_modes_handle_empty_sources() {
        let modes: [Vec<String>; 3] = [
            vec!["--pair-batch".to_string()],
            vec!["--text-recipes".to_string()],
            Vec::new(),
        ];
        for mut args in modes {
            let dir = tempdir().unwrap();
            args.extend([
                "--split-store-dir".to_string(),
                dir.path().to_string_lossy().to_string(),
                "--split".to_string(),
                "validation".to_string(),
            ]);

            let outcome = run_multi_source_demo(
                args.into_iter(),
                |_| Ok(()),
                |_| {
                    let source: DynSource = Box::new(TestSource {
                        id: "source_empty".into(),
                        count: Some(0),
                        recipes: vec![default_recipe("recipe_empty")],
                    });
                    vec![source]
                },
            );

            assert!(outcome.is_ok());
        }
    }
1✔
1089

1090
    /// Drives the triplet, pair, text-batch and text-recipe printers with
    /// hand-built data, then checks `extract_source` with and without the
    /// `::` delimiter.
    #[test]
    fn print_helpers_and_extract_source_cover_paths() {
        use crate::data::QualityScore;

        let store = DeterministicSplitStore::new(SplitRatios::default(), 42).unwrap();
        let strategy = ChunkingStrategy::default();

        let anchor = RecordChunk {
            record_id: "source_a::rec1".to_string(),
            section_idx: 0,
            view: ChunkView::Window {
                index: 1,
                overlap: 2,
                span: 12,
                start_ratio: 0.25,
            },
            text: "anchor text".to_string(),
            tokens_estimate: 8,
            quality: QualityScore { trust: 0.9 },
        };
        let positive = RecordChunk {
            record_id: "source_a::rec2".to_string(),
            section_idx: 1,
            view: ChunkView::SummaryFallback {
                strategy: "summary".to_string(),
                weight: 0.7,
            },
            text: "positive text".to_string(),
            tokens_estimate: 6,
            quality: QualityScore { trust: 0.8 },
        };
        let negative = RecordChunk {
            record_id: "source_b::rec3".to_string(),
            section_idx: 2,
            view: ChunkView::Window {
                index: 0,
                overlap: 0,
                span: 16,
                start_ratio: 0.0,
            },
            text: "negative text".to_string(),
            tokens_estimate: 7,
            quality: QualityScore { trust: 0.5 },
        };

        let triplet = crate::SampleTriplet {
            recipe: "triplet_recipe".to_string(),
            anchor: anchor.clone(),
            positive: positive.clone(),
            negative: negative.clone(),
            weight: 1.0,
            instruction: Some("triplet instruction".to_string()),
        };
        let triplet_batch = TripletBatch {
            triplets: vec![triplet],
        };
        print_triplet_batch(&strategy, &triplet_batch, &store);

        // Last use of `anchor` and `positive`, so they are moved rather than cloned.
        let pair = crate::SamplePair {
            recipe: "pair_recipe".to_string(),
            anchor,
            positive,
            weight: 1.0,
            instruction: None,
            label: crate::PairLabel::Positive,
            reason: Some("same topic".to_string()),
        };
        let pair_batch = SampleBatch { pairs: vec![pair] };
        print_pair_batch(&strategy, &pair_batch, &store);

        let sample = crate::TextSample {
            recipe: "text_recipe".to_string(),
            chunk: negative,
            weight: 0.8,
            instruction: Some("text instruction".to_string()),
        };
        let text_batch = TextBatch {
            samples: vec![sample],
        };
        print_text_batch(&strategy, &text_batch, &store);

        let recipes = vec![TextRecipe {
            name: "recipe_name".into(),
            selector: crate::config::Selector::Role(SectionRole::Context),
            instruction: Some("instruction".into()),
            weight: 1.0,
        }];
        print_text_recipes(&recipes);

        assert_eq!(extract_source("source_a::record"), "source_a");
        assert_eq!(extract_source("record-without-delimiter"), "unknown");
    }
1✔
1180

1181
    /// Every `SplitArg` variant converts to the `SplitLabel` of the same name.
    #[test]
    fn split_arg_conversion_and_version_parse_paths_are_covered() {
        let train = SplitLabel::from(SplitArg::Train);
        assert!(matches!(train, SplitLabel::Train));

        let validation = SplitLabel::from(SplitArg::Validation);
        assert!(matches!(validation, SplitLabel::Validation));

        let test = SplitLabel::from(SplitArg::Test);
        assert!(matches!(test, SplitLabel::Test));
    }
1✔
1193

1194
    /// A non-numeric field produces an error message naming that field.
    #[test]
    fn parse_split_ratios_reports_per_field_parse_errors() {
        let cases = [
            ("x,0.1,0.9", "invalid train ratio"),
            ("0.1,y,0.8", "invalid validation ratio"),
            ("0.1,0.2,z", "invalid test ratio"),
        ];
        for (input, expected_fragment) in cases {
            let message = parse_split_ratios_arg(input).unwrap_err();
            assert!(message.contains(expected_fragment));
        }
    }
1✔
1212

1213
    /// Pair, text and default sampling modes all succeed when the only source
    /// provides no default recipes.
    #[test]
    fn run_multi_source_demo_exhausted_paths_are_handled() {
        let modes: [Vec<String>; 3] = [
            vec!["--pair-batch".to_string()],
            vec!["--text-recipes".to_string()],
            Vec::new(),
        ];
        for mut args in modes {
            let dir = tempdir().unwrap();
            args.extend([
                "--split-store-dir".to_string(),
                dir.path().to_string_lossy().to_string(),
            ]);

            let outcome = run_multi_source_demo(
                args.into_iter(),
                |_| Ok(()),
                |_| {
                    let source: DynSource = Box::new(TestSource {
                        id: "source_without_recipes".into(),
                        count: Some(1),
                        recipes: Vec::new(),
                    });
                    vec![source]
                },
            );

            assert!(outcome.is_ok());
        }
    }
1✔
1240
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc