• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

jzombie / rust-triplets / 22355546657

24 Feb 2026 02:38PM UTC coverage: 91.328% (-1.3%) from 92.675%
22355546657

Pull #7

github

web-flow
Merge 3a0e16a81 into 980559192
Pull Request #7: Add HF source

3853 of 4411 new or added lines in 6 files covered. (87.35%)

64 existing lines in 2 files now uncovered.

13238 of 14495 relevant lines covered (91.33%)

2725.83 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

91.81
/src/example_apps.rs
1
use std::collections::HashMap;
2
use std::error::Error;
3
use std::path::PathBuf;
4
use std::sync::Arc;
5
use std::sync::Once;
6

7
use clap::{Parser, ValueEnum, error::ErrorKind};
8

9
use crate::config::{ChunkingStrategy, SamplerConfig, TripletRecipe};
10
use crate::data::ChunkView;
11
use crate::heuristics::{
12
    CapacityTotals, EFFECTIVE_NEGATIVES_PER_ANCHOR, EFFECTIVE_POSITIVES_PER_ANCHOR,
13
    estimate_source_split_capacity_from_counts, format_replay_factor, format_u128_with_commas,
14
    resolve_text_recipes_for_source, split_counts_for_total,
15
};
16
use crate::metrics::source_skew;
17
use crate::sampler::chunk_weight;
18
use crate::source::{DataSource, configure_sources_for_sampler};
19
use crate::splits::{FileSplitStore, SplitLabel, SplitRatios, SplitStore};
20
use crate::{
21
    PairSampler, RecordChunk, SampleBatch, Sampler, SamplerError, SourceId, TextBatch, TextRecipe,
22
    TripletBatch,
23
};
24

25
/// Boxed, owned [`DataSource`] trait object; the element type of the `Vec`
/// returned by the `build_sources` callbacks accepted by the runners in this file.
type DynSource = Box<dyn DataSource + 'static>;
26

27
fn init_example_tracing() {
7✔
28
    static INIT: Once = Once::new();
29
    INIT.call_once(|| {
7✔
30
        let env_filter = tracing_subscriber::EnvFilter::try_from_default_env()
1✔
31
            .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("triplets=debug"));
1✔
32
        let _ = tracing_subscriber::fmt()
1✔
33
            .with_env_filter(env_filter)
1✔
34
            .try_init();
1✔
35
    });
1✔
36
}
7✔
37

38
#[derive(Debug, Clone, Copy, ValueEnum)]
39
/// CLI split selector mapped onto `SplitLabel`.
40
enum SplitArg {
41
    Train,
42
    Validation,
43
    Test,
44
}
45

46
impl From<SplitArg> for SplitLabel {
47
    fn from(value: SplitArg) -> Self {
3✔
48
        match value {
3✔
49
            SplitArg::Train => SplitLabel::Train,
×
50
            SplitArg::Validation => SplitLabel::Validation,
3✔
51
            SplitArg::Test => SplitLabel::Test,
×
52
        }
53
    }
3✔
54
}
55

56
#[derive(Debug, Parser)]
57
#[command(
58
    name = "estimate_capacity",
59
    disable_help_subcommand = true,
60
    about = "Metadata-only capacity estimation",
61
    long_about = "Estimate record, pair, triplet, and text-sample capacity using source-reported counts only (no data refresh).",
62
    after_help = "Source roots are optional and resolved in order by explicit arg, environment variables, then project defaults."
63
)]
64
/// CLI arguments for metadata-only capacity estimation.
65
struct EstimateCapacityCli {
66
    #[arg(
67
        long,
68
        default_value_t = 99,
69
        help = "Deterministic seed used for split allocation"
70
    )]
71
    seed: u64,
72
    #[arg(
73
        long = "split-ratios",
74
        value_name = "TRAIN,VALIDATION,TEST",
75
        value_parser = parse_split_ratios_arg,
76
        default_value = "0.8,0.1,0.1",
77
        help = "Comma-separated split ratios that must sum to 1.0"
78
    )]
79
    split: SplitRatios,
80
    #[arg(
81
        long = "source-root",
82
        value_name = "PATH",
83
        help = "Optional source root override, repeat as needed in source order"
84
    )]
85
    source_roots: Vec<String>,
86
}
87

88
#[derive(Debug, Parser)]
89
#[command(
90
    name = "multi_source_demo",
91
    disable_help_subcommand = true,
92
    about = "Run sampled batches from multiple sources",
93
    long_about = "Sample triplet, pair, or text batches from multiple sources and persist split/epoch state.",
94
    after_help = "Source roots are optional and resolved in order by explicit arg, environment variables, then project defaults."
95
)]
96
/// CLI for `multi_source_demo`.
97
///
98
/// Common usage:
99
/// - Keep default persistence file location: `.sampler_store/split_store.bin`
100
/// - Set an explicit file path: `--split-store-path /tmp/split_store.bin`
101
/// - Set a custom directory and keep default filename: `--split-store-dir /tmp/sampler_store`
102
/// - Repeat `--source-root <PATH>` to override source roots in order
103
struct MultiSourceDemoCli {
104
    #[arg(
105
        long = "text-recipes",
106
        help = "Emit a text batch instead of a triplet batch"
107
    )]
108
    show_text_samples: bool,
109
    #[arg(
110
        long = "pair-batch",
111
        help = "Emit a pair batch instead of a triplet batch"
112
    )]
113
    show_pair_samples: bool,
114
    #[arg(
115
        long = "list-text-recipes",
116
        help = "Print registered text recipes and exit"
117
    )]
118
    list_text_recipes: bool,
119
    #[arg(
120
        long = "batch-size",
121
        default_value_t = 4,
122
        value_parser = parse_positive_usize,
123
        help = "Batch size used for sampling"
124
    )]
125
    batch_size: usize,
126
    #[arg(long, help = "Optional deterministic seed override")]
127
    seed: Option<u64>,
128
    #[arg(long, value_enum, help = "Target split to sample from")]
129
    split: Option<SplitArg>,
130
    #[arg(
131
        long = "source-root",
132
        value_name = "PATH",
133
        help = "Optional source root override, repeat as needed in source order"
134
    )]
135
    source_roots: Vec<String>,
136
    #[arg(
137
        long = "split-store-path",
138
        value_name = "SPLIT_STORE_PATH",
139
        help = "Optional path for persisted split/epoch state file"
140
    )]
141
    split_store_path: Option<PathBuf>,
142
    #[arg(
143
        long = "split-store-dir",
144
        value_name = "DIR",
145
        conflicts_with = "split_store_path",
146
        help = "Optional directory for persisted split/epoch state file (uses split_store.bin filename)"
147
    )]
148
    split_store_dir: Option<PathBuf>,
149
}
150

151
#[derive(Debug, Clone)]
152
/// Source-level inventory used by capacity estimation output.
153
struct SourceInventory {
154
    source_id: String,
155
    reported_records: u128,
156
    triplet_recipes: Vec<TripletRecipe>,
157
}
158

159
/// Run the capacity-estimation CLI with injectable root resolution/source builders.
160
///
161
/// `build_sources` is construction-only; sampler configuration is applied
162
/// centrally by this function before any source calls.
163
pub fn run_estimate_capacity<R, Resolve, Build, I>(
3✔
164
    args_iter: I,
3✔
165
    resolve_roots: Resolve,
3✔
166
    build_sources: Build,
3✔
167
) -> Result<(), Box<dyn Error>>
3✔
168
where
3✔
169
    Resolve: FnOnce(Vec<String>) -> Result<R, Box<dyn Error>>,
3✔
170
    Build: FnOnce(&R) -> Vec<DynSource>,
3✔
171
    I: Iterator<Item = String>,
3✔
172
{
173
    init_example_tracing();
3✔
174

175
    let Some(cli) = parse_cli::<EstimateCapacityCli, _>(
3✔
176
        std::iter::once("estimate_capacity".to_string()).chain(args_iter),
3✔
177
    )?
×
178
    else {
179
        return Ok(());
×
180
    };
181

182
    let roots = resolve_roots(cli.source_roots)?;
3✔
183

184
    let config = SamplerConfig {
3✔
185
        seed: cli.seed,
3✔
186
        split: cli.split,
3✔
187
        ..SamplerConfig::default()
3✔
188
    };
3✔
189

190
    let sources = configure_sources_for_sampler(build_sources(&roots), &config);
3✔
191

192
    let mut inventories = Vec::new();
3✔
193
    for source in &sources {
3✔
194
        let recipes = if config.recipes.is_empty() {
3✔
195
            source.default_triplet_recipes()
3✔
196
        } else {
197
            config.recipes.clone()
×
198
        };
199
        let reported_records = source.reported_record_count().map_err(|err| {
3✔
200
            format!(
1✔
201
                "source '{}' failed to report exact record count: {err}",
202
                source.id()
1✔
203
            )
204
        })?;
1✔
205
        inventories.push(SourceInventory {
2✔
206
            source_id: source.id().to_string(),
2✔
207
            reported_records,
2✔
208
            triplet_recipes: recipes,
2✔
209
        });
2✔
210
    }
211

212
    let mut per_source_split_counts: HashMap<(String, SplitLabel), u128> = HashMap::new();
2✔
213
    let mut split_record_counts: HashMap<SplitLabel, u128> = HashMap::new();
2✔
214

215
    for source in &inventories {
2✔
216
        let counts = split_counts_for_total(source.reported_records, cli.split);
2✔
217
        for (label, count) in counts {
6✔
218
            per_source_split_counts.insert((source.source_id.clone(), label), count);
6✔
219
            *split_record_counts.entry(label).or_insert(0) += count;
6✔
220
        }
6✔
221
    }
222

223
    let mut totals_by_split: HashMap<SplitLabel, CapacityTotals> = HashMap::new();
2✔
224
    let mut totals_by_source_and_split: HashMap<(String, SplitLabel), CapacityTotals> =
2✔
225
        HashMap::new();
2✔
226

227
    for split_label in [SplitLabel::Train, SplitLabel::Validation, SplitLabel::Test] {
6✔
228
        let mut totals = CapacityTotals::default();
6✔
229

230
        for source in &inventories {
6✔
231
            let source_split_records = per_source_split_counts
6✔
232
                .get(&(source.source_id.clone(), split_label))
6✔
233
                .copied()
6✔
234
                .unwrap_or(0);
6✔
235

6✔
236
            let triplet_recipes = &source.triplet_recipes;
6✔
237
            let text_recipes = resolve_text_recipes_for_source(&config, triplet_recipes);
6✔
238

6✔
239
            let capacity = estimate_source_split_capacity_from_counts(
6✔
240
                source_split_records,
6✔
241
                triplet_recipes,
6✔
242
                &text_recipes,
6✔
243
            );
6✔
244

6✔
245
            totals_by_source_and_split.insert((source.source_id.clone(), split_label), capacity);
6✔
246

6✔
247
            totals.triplets += capacity.triplets;
6✔
248
            totals.effective_triplets += capacity.effective_triplets;
6✔
249
            totals.pairs += capacity.pairs;
6✔
250
            totals.text_samples += capacity.text_samples;
6✔
251
        }
6✔
252

253
        totals_by_split.insert(split_label, totals);
6✔
254
    }
255

256
    println!("=== capacity estimate (length-only) ===");
2✔
257
    println!("mode: metadata-only (no source.refresh calls)");
2✔
258
    println!("classification: heuristic approximation (not exact)");
2✔
259
    println!("split seed: {}", cli.seed);
2✔
260
    println!(
2✔
261
        "split ratios: train={:.4}, validation={:.4}, test={:.4}",
262
        cli.split.train, cli.split.validation, cli.split.test
263
    );
264
    println!();
2✔
265

266
    println!("[SOURCES]");
2✔
267
    for source in &inventories {
2✔
268
        println!(
2✔
269
            "  {} => reported records: {}",
2✔
270
            source.source_id,
2✔
271
            format_u128_with_commas(source.reported_records)
2✔
272
        );
2✔
273
    }
2✔
274
    println!();
2✔
275

276
    println!("[PER SOURCE BREAKDOWN]");
2✔
277
    for source in &inventories {
2✔
278
        println!("  {}", source.source_id);
2✔
279
        let mut source_grand = CapacityTotals::default();
2✔
280
        let mut source_total_records = 0u128;
2✔
281
        for split_label in [SplitLabel::Train, SplitLabel::Validation, SplitLabel::Test] {
6✔
282
            let split_records = per_source_split_counts
6✔
283
                .get(&(source.source_id.clone(), split_label))
6✔
284
                .copied()
6✔
285
                .unwrap_or(0);
6✔
286
            source_total_records = source_total_records.saturating_add(split_records);
6✔
287
            let split_longest_records = inventories
6✔
288
                .iter()
6✔
289
                .map(|candidate| {
6✔
290
                    per_source_split_counts
6✔
291
                        .get(&(candidate.source_id.clone(), split_label))
6✔
292
                        .copied()
6✔
293
                        .unwrap_or(0)
6✔
294
                })
6✔
295
                .max()
6✔
296
                .unwrap_or(0);
6✔
297
            let totals = totals_by_source_and_split
6✔
298
                .get(&(source.source_id.clone(), split_label))
6✔
299
                .copied()
6✔
300
                .unwrap_or_default();
6✔
301
            source_grand.triplets += totals.triplets;
6✔
302
            source_grand.effective_triplets += totals.effective_triplets;
6✔
303
            source_grand.pairs += totals.pairs;
6✔
304
            source_grand.text_samples += totals.text_samples;
6✔
305
            println!("    [{:?}]", split_label);
6✔
306
            println!("      records: {}", format_u128_with_commas(split_records));
6✔
307
            println!(
6✔
308
                "      triplet combinations: {}",
309
                format_u128_with_commas(totals.triplets)
6✔
310
            );
311
            println!(
6✔
312
                "      effective sampled triplets (p={}, k={}): {}",
313
                EFFECTIVE_POSITIVES_PER_ANCHOR,
314
                EFFECTIVE_NEGATIVES_PER_ANCHOR,
315
                format_u128_with_commas(totals.effective_triplets)
6✔
316
            );
317
            println!(
6✔
318
                "      pair combinations:    {}",
319
                format_u128_with_commas(totals.pairs)
6✔
320
            );
321
            println!(
6✔
322
                "      text samples:         {}",
323
                format_u128_with_commas(totals.text_samples)
6✔
324
            );
325
            println!(
6✔
326
                "      replay factor vs longest source: {}",
327
                format_replay_factor(split_longest_records, split_records)
6✔
328
            );
329
        }
330
        let longest_source_total = inventories
2✔
331
            .iter()
2✔
332
            .map(|candidate| candidate.reported_records)
2✔
333
            .max()
2✔
334
            .unwrap_or(0);
2✔
335
        println!("    [ALL SPLITS FOR SOURCE]");
2✔
336
        println!(
2✔
337
            "      triplet combinations: {}",
338
            format_u128_with_commas(source_grand.triplets)
2✔
339
        );
340
        println!(
2✔
341
            "      effective sampled triplets (p={}, k={}): {}",
342
            EFFECTIVE_POSITIVES_PER_ANCHOR,
343
            EFFECTIVE_NEGATIVES_PER_ANCHOR,
344
            format_u128_with_commas(source_grand.effective_triplets)
2✔
345
        );
346
        println!(
2✔
347
            "      pair combinations:    {}",
348
            format_u128_with_commas(source_grand.pairs)
2✔
349
        );
350
        println!(
2✔
351
            "      text samples:         {}",
352
            format_u128_with_commas(source_grand.text_samples)
2✔
353
        );
354
        println!(
2✔
355
            "      replay factor vs longest source: {}",
356
            format_replay_factor(longest_source_total, source_total_records)
2✔
357
        );
358
        println!();
2✔
359
    }
360

361
    let mut grand = CapacityTotals::default();
2✔
362
    for split_label in [SplitLabel::Train, SplitLabel::Validation, SplitLabel::Test] {
6✔
363
        let record_count = split_record_counts.get(&split_label).copied().unwrap_or(0);
6✔
364
        let totals = totals_by_split
6✔
365
            .get(&split_label)
6✔
366
            .copied()
6✔
367
            .unwrap_or_default();
6✔
368

6✔
369
        grand.triplets += totals.triplets;
6✔
370
        grand.effective_triplets += totals.effective_triplets;
6✔
371
        grand.pairs += totals.pairs;
6✔
372
        grand.text_samples += totals.text_samples;
6✔
373

6✔
374
        println!("[{:?}]", split_label);
6✔
375
        println!("  records: {}", format_u128_with_commas(record_count));
6✔
376
        println!(
6✔
377
            "  triplet combinations: {}",
6✔
378
            format_u128_with_commas(totals.triplets)
6✔
379
        );
6✔
380
        println!(
6✔
381
            "  effective sampled triplets (p={}, k={}): {}",
6✔
382
            EFFECTIVE_POSITIVES_PER_ANCHOR,
6✔
383
            EFFECTIVE_NEGATIVES_PER_ANCHOR,
6✔
384
            format_u128_with_commas(totals.effective_triplets)
6✔
385
        );
6✔
386
        println!(
6✔
387
            "  pair combinations:    {}",
6✔
388
            format_u128_with_commas(totals.pairs)
6✔
389
        );
6✔
390
        println!(
6✔
391
            "  text samples:         {}",
6✔
392
            format_u128_with_commas(totals.text_samples)
6✔
393
        );
6✔
394
        println!();
6✔
395
    }
6✔
396

397
    println!("[ALL SPLITS TOTAL]");
2✔
398
    println!(
2✔
399
        "  triplet combinations: {}",
400
        format_u128_with_commas(grand.triplets)
2✔
401
    );
402
    println!(
2✔
403
        "  effective sampled triplets (p={}, k={}): {}",
404
        EFFECTIVE_POSITIVES_PER_ANCHOR,
405
        EFFECTIVE_NEGATIVES_PER_ANCHOR,
406
        format_u128_with_commas(grand.effective_triplets)
2✔
407
    );
408
    println!(
2✔
409
        "  pair combinations:    {}",
410
        format_u128_with_commas(grand.pairs)
2✔
411
    );
412
    println!(
2✔
413
        "  text samples:         {}",
414
        format_u128_with_commas(grand.text_samples)
2✔
415
    );
416
    println!();
2✔
417
    println!(
2✔
418
        "Note: counts are heuristic, length-based estimates from source-reported totals and recipe structure. They are approximate, not exact, and assume anchor-positive pairs=records (one positive per anchor by default), negatives=source_records_in_split-1 (anchor excluded as its own negative), and at most one chunk/window realization per sample. In real-world chunked sampling, practical combinations are often higher, so treat this as a floor-like baseline."
419
    );
420
    println!(
2✔
421
        "Effective sampled triplets apply a bounded training assumption: effective_triplets = records * p * k per triplet recipe, with defaults p={} positives per anchor and k={} negatives per anchor.",
422
        EFFECTIVE_POSITIVES_PER_ANCHOR, EFFECTIVE_NEGATIVES_PER_ANCHOR
423
    );
424
    println!(
2✔
425
        "Oversample loops are not inferred from this static report. To measure true oversampling (how many times sampling loops through the combination space), use observed sampled draw counts from an actual run."
426
    );
427

428
    Ok(())
2✔
429
}
3✔
430

431
/// Run the multi-source demo CLI with injectable root resolution/source builders.
432
///
433
/// `build_sources` is construction-only. Source sampler configuration is owned
434
/// by sampler registration (`PairSampler::register_source`).
435
pub fn run_multi_source_demo<R, Resolve, Build, I>(
4✔
436
    args_iter: I,
4✔
437
    resolve_roots: Resolve,
4✔
438
    build_sources: Build,
4✔
439
) -> Result<(), Box<dyn Error>>
4✔
440
where
4✔
441
    Resolve: FnOnce(Vec<String>) -> Result<R, Box<dyn Error>>,
4✔
442
    Build: FnOnce(&R) -> Vec<DynSource>,
4✔
443
    I: Iterator<Item = String>,
4✔
444
{
445
    init_example_tracing();
4✔
446

447
    let Some(cli) = parse_cli::<MultiSourceDemoCli, _>(
4✔
448
        std::iter::once("multi_source_demo".to_string()).chain(args_iter),
4✔
449
    )?
×
450
    else {
451
        return Ok(());
×
452
    };
453

454
    let roots = resolve_roots(cli.source_roots)?;
4✔
455

456
    let mut config = SamplerConfig::default();
4✔
457
    config.seed = cli.seed.unwrap_or(config.seed);
4✔
458
    config.batch_size = cli.batch_size;
4✔
459
    config.chunking = Default::default();
4✔
460
    let selected_split = cli.split.map(Into::into).unwrap_or(SplitLabel::Train);
4✔
461
    config.split = SplitRatios::default();
4✔
462
    config.allowed_splits = vec![selected_split];
4✔
463
    let chunking = config.chunking.clone();
4✔
464

465
    let split_store_path = if let Some(path) = cli.split_store_path {
4✔
466
        path
×
467
    } else if let Some(dir) = cli.split_store_dir {
4✔
468
        FileSplitStore::default_path_in_dir(dir)
4✔
469
    } else {
470
        FileSplitStore::default_path()
×
471
    };
472

473
    println!(
4✔
474
        "Persisting split assignments and epoch state to {}",
475
        split_store_path.display()
4✔
476
    );
477
    let sources = build_sources(&roots);
4✔
478
    let split_store = Arc::new(FileSplitStore::open(&split_store_path, config.split, 99)?);
4✔
479
    let sampler = PairSampler::new(config, split_store.clone());
4✔
480
    for source in sources {
4✔
481
        sampler.register_source(source);
4✔
482
    }
4✔
483

484
    if cli.show_pair_samples {
4✔
485
        match sampler.next_pair_batch(selected_split) {
1✔
486
            Ok(pair_batch) => {
×
487
                if pair_batch.pairs.is_empty() {
×
488
                    println!("Pair sampling produced no results.");
×
489
                } else {
×
490
                    print_pair_batch(&chunking, &pair_batch, split_store.as_ref());
×
491
                }
×
492
                sampler.persist_state()?;
×
493
            }
494
            Err(SamplerError::Exhausted(name)) => {
1✔
495
                eprintln!(
1✔
496
                    "Pair sampler exhausted recipe '{}'. Ensure both positive and negative examples exist.",
1✔
497
                    name
1✔
498
                );
1✔
499
            }
1✔
500
            Err(err) => return Err(err.into()),
×
501
        }
502
    } else if cli.show_text_samples {
3✔
503
        match sampler.next_text_batch(selected_split) {
1✔
504
            Ok(text_batch) => {
×
505
                if text_batch.samples.is_empty() {
×
506
                    println!(
×
507
                        "Text sampling produced no results. Ensure each source has eligible sections."
×
508
                    );
×
509
                } else {
×
510
                    print_text_batch(&chunking, &text_batch, split_store.as_ref());
×
511
                }
×
512
                sampler.persist_state()?;
×
513
            }
514
            Err(SamplerError::Exhausted(name)) => {
1✔
515
                eprintln!(
1✔
516
                    "Text sampler exhausted selector '{}'. Ensure matching sections exist.",
1✔
517
                    name
1✔
518
                );
1✔
519
            }
1✔
520
            Err(err) => return Err(err.into()),
×
521
        }
522
    } else if cli.list_text_recipes {
2✔
523
        let recipes = sampler.text_recipes();
1✔
524
        if recipes.is_empty() {
1✔
525
            println!(
×
526
                "No text recipes registered. Ensure your sources expose triplet selectors or configure text_recipes explicitly."
×
527
            );
×
528
        } else {
1✔
529
            print_text_recipes(&recipes);
1✔
530
        }
1✔
531
    } else {
532
        match sampler.next_triplet_batch(selected_split) {
1✔
533
            Ok(triplet_batch) => {
×
534
                if triplet_batch.triplets.is_empty() {
×
535
                    println!(
×
536
                        "Triplet sampling produced no results. Ensure multiple records per source exist."
×
537
                    );
×
538
                } else {
×
539
                    print_triplet_batch(&chunking, &triplet_batch, split_store.as_ref());
×
540
                }
×
541
                sampler.persist_state()?;
×
542
            }
543
            Err(SamplerError::Exhausted(name)) => {
1✔
544
                eprintln!(
1✔
545
                    "Triplet sampler exhausted recipe '{}'. Ensure both positive and negative examples exist.",
1✔
546
                    name
1✔
547
                );
1✔
548
            }
1✔
549
            Err(err) => return Err(err.into()),
×
550
        }
551
    }
552

553
    Ok(())
4✔
554
}
4✔
555

556
/// Clap value parser for `--batch-size`: accepts any `usize` greater than zero.
///
/// Returns the parsed value, or a human-readable message when `raw` is not a
/// valid `usize` or is zero.
fn parse_positive_usize(raw: &str) -> Result<usize, String> {
    match raw.parse::<usize>() {
        Ok(0) => Err("--batch-size must be greater than zero".to_string()),
        Ok(value) => Ok(value),
        Err(_) => Err(format!(
            "Could not parse --batch-size value '{}' as a positive integer",
            raw
        )),
    }
}
8✔
568

569
fn parse_cli<T, I>(args: I) -> Result<Option<T>, Box<dyn Error>>
11✔
570
where
11✔
571
    T: Parser,
11✔
572
    I: IntoIterator,
11✔
573
    I::Item: Into<std::ffi::OsString> + Clone,
11✔
574
{
575
    match T::try_parse_from(args) {
11✔
576
        Ok(cli) => Ok(Some(cli)),
7✔
577
        Err(err) => match err.kind() {
4✔
578
            ErrorKind::DisplayHelp | ErrorKind::DisplayVersion => {
579
                err.print()?;
2✔
580
                Ok(None)
2✔
581
            }
582
            _ => Err(err.into()),
2✔
583
        },
584
    }
585
}
11✔
586

587
fn parse_split_ratios_arg(raw: &str) -> Result<SplitRatios, String> {
7✔
588
    let parts: Vec<&str> = raw.split(',').collect();
7✔
589
    if parts.len() != 3 {
7✔
590
        return Err("--split-ratios expects exactly 3 comma-separated values".to_string());
1✔
591
    }
6✔
592
    let train = parts[0]
6✔
593
        .trim()
6✔
594
        .parse::<f32>()
6✔
595
        .map_err(|_| format!("invalid train ratio '{}': must be a float", parts[0].trim()))?;
6✔
596
    let validation = parts[1].trim().parse::<f32>().map_err(|_| {
6✔
597
        format!(
×
598
            "invalid validation ratio '{}': must be a float",
599
            parts[1].trim()
×
600
        )
601
    })?;
×
602
    let test = parts[2]
6✔
603
        .trim()
6✔
604
        .parse::<f32>()
6✔
605
        .map_err(|_| format!("invalid test ratio '{}': must be a float", parts[2].trim()))?;
6✔
606
    let ratios = SplitRatios {
6✔
607
        train,
6✔
608
        validation,
6✔
609
        test,
6✔
610
    };
6✔
611
    let sum = ratios.train + ratios.validation + ratios.test;
6✔
612
    if (sum - 1.0).abs() > 1e-5 {
6✔
613
        return Err(format!(
1✔
614
            "split ratios must sum to 1.0, got {:.6} (train={}, validation={}, test={})",
1✔
615
            sum, ratios.train, ratios.validation, ratios.test
1✔
616
        ));
1✔
617
    }
5✔
618
    if ratios.train < 0.0 || ratios.validation < 0.0 || ratios.test < 0.0 {
5✔
619
        return Err("split ratios must be non-negative".to_string());
1✔
620
    }
4✔
621
    Ok(ratios)
4✔
622
}
7✔
623

624
fn print_triplet_batch(
1✔
625
    strategy: &ChunkingStrategy,
1✔
626
    batch: &TripletBatch,
1✔
627
    split_store: &impl SplitStore,
1✔
628
) {
1✔
629
    println!("=== triplet batch ===");
1✔
630
    for (idx, triplet) in batch.triplets.iter().enumerate() {
1✔
631
        println!("--- triplet #{} ---", idx);
1✔
632
        println!("recipe       : {}", triplet.recipe);
1✔
633
        println!("sample_weight: {:.4}", triplet.weight);
1✔
634
        if let Some(instr) = &triplet.instruction {
1✔
635
            println!("instruction shown to model:\n{}\n", instr);
1✔
636
        }
1✔
637
        print_chunk_block("ANCHOR", &triplet.anchor, strategy, split_store);
1✔
638
        print_chunk_block("POSITIVE", &triplet.positive, strategy, split_store);
1✔
639
        print_chunk_block("NEGATIVE", &triplet.negative, strategy, split_store);
1✔
640
    }
641
    print_source_summary(
1✔
642
        "triplet anchors",
1✔
643
        batch
1✔
644
            .triplets
1✔
645
            .iter()
1✔
646
            .map(|triplet| triplet.anchor.record_id.as_str()),
1✔
647
    );
648
    print_recipe_summary_by_source(
1✔
649
        "triplet recipes by source",
1✔
650
        batch
1✔
651
            .triplets
1✔
652
            .iter()
1✔
653
            .map(|triplet| (triplet.anchor.record_id.as_str(), triplet.recipe.as_str())),
1✔
654
    );
655
}
1✔
656

657
fn print_text_batch(strategy: &ChunkingStrategy, batch: &TextBatch, split_store: &impl SplitStore) {
1✔
658
    println!("=== text batch ===");
1✔
659
    for (idx, sample) in batch.samples.iter().enumerate() {
1✔
660
        println!("--- sample #{} ---", idx);
1✔
661
        println!("recipe       : {}", sample.recipe);
1✔
662
        println!("sample_weight: {:.4}", sample.weight);
1✔
663
        if let Some(instr) = &sample.instruction {
1✔
664
            println!("instruction shown to model:\n{}\n", instr);
1✔
665
        }
1✔
666
        print_chunk_block("TEXT", &sample.chunk, strategy, split_store);
1✔
667
    }
668
    print_source_summary(
1✔
669
        "text samples",
1✔
670
        batch
1✔
671
            .samples
1✔
672
            .iter()
1✔
673
            .map(|sample| sample.chunk.record_id.as_str()),
1✔
674
    );
675
    print_recipe_summary_by_source(
1✔
676
        "text recipes by source",
1✔
677
        batch
1✔
678
            .samples
1✔
679
            .iter()
1✔
680
            .map(|sample| (sample.chunk.record_id.as_str(), sample.recipe.as_str())),
1✔
681
    );
682
}
1✔
683

684
fn print_pair_batch(
1✔
685
    strategy: &ChunkingStrategy,
1✔
686
    batch: &SampleBatch,
1✔
687
    split_store: &impl SplitStore,
1✔
688
) {
1✔
689
    println!("=== pair batch ===");
1✔
690
    for (idx, pair) in batch.pairs.iter().enumerate() {
1✔
691
        println!("--- pair #{} ---", idx);
1✔
692
        println!("recipe       : {}", pair.recipe);
1✔
693
        println!("label        : {:?}", pair.label);
1✔
694
        if let Some(reason) = &pair.reason {
1✔
695
            println!("reason       : {}", reason);
1✔
696
        }
1✔
697
        print_chunk_block("ANCHOR", &pair.anchor, strategy, split_store);
1✔
698
        print_chunk_block("OTHER", &pair.positive, strategy, split_store);
1✔
699
    }
700
    print_source_summary(
1✔
701
        "pair anchors",
1✔
702
        batch
1✔
703
            .pairs
1✔
704
            .iter()
1✔
705
            .map(|pair| pair.anchor.record_id.as_str()),
1✔
706
    );
707
    print_recipe_summary_by_source(
1✔
708
        "pair recipes by source",
1✔
709
        batch
1✔
710
            .pairs
1✔
711
            .iter()
1✔
712
            .map(|pair| (pair.anchor.record_id.as_str(), pair.recipe.as_str())),
1✔
713
    );
714
}
1✔
715

716
fn print_text_recipes(recipes: &[TextRecipe]) {
2✔
717
    println!("=== available text recipes ===");
2✔
718
    for recipe in recipes {
4✔
719
        println!(
4✔
720
            "- {} (weight: {:.3}) selector={:?}",
721
            recipe.name, recipe.weight, recipe.selector
722
        );
723
        if let Some(instr) = &recipe.instruction {
4✔
724
            println!("  instruction: {}", instr);
1✔
725
        }
3✔
726
    }
727
}
2✔
728

729
trait ChunkDebug {
730
    fn view_name(&self) -> String;
731
}
732

733
impl ChunkDebug for RecordChunk {
734
    fn view_name(&self) -> String {
6✔
735
        match &self.view {
6✔
736
            ChunkView::Window {
737
                index,
4✔
738
                span,
4✔
739
                overlap,
4✔
740
                start_ratio,
4✔
741
            } => format!(
4✔
742
                "window#index={} span={} overlap={} start_ratio={:.3} tokens={}",
743
                index, span, overlap, start_ratio, self.tokens_estimate
744
            ),
745
            ChunkView::SummaryFallback { strategy, .. } => {
2✔
746
                format!("summary:{} tokens={}", strategy, self.tokens_estimate)
2✔
747
            }
748
        }
749
    }
6✔
750
}
751

752
fn print_chunk_block(
6✔
753
    title: &str,
6✔
754
    chunk: &RecordChunk,
6✔
755
    strategy: &ChunkingStrategy,
6✔
756
    split_store: &impl SplitStore,
6✔
757
) {
6✔
758
    let chunk_weight = chunk_weight(strategy, chunk);
6✔
759
    let split = split_store
6✔
760
        .label_for(&chunk.record_id)
6✔
761
        .map(|label| format!("{:?}", label))
6✔
762
        .unwrap_or_else(|| "Unknown".to_string());
6✔
763
    println!("--- {} ---", title);
6✔
764
    println!("split        : {}", split);
6✔
765
    println!("view         : {}", chunk.view_name());
6✔
766
    println!("chunk_weight : {:.4}", chunk_weight);
6✔
767
    println!("record_id    : {}", chunk.record_id);
6✔
768
    println!("section_idx  : {}", chunk.section_idx);
6✔
769
    println!("token_est    : {}", chunk.tokens_estimate);
6✔
770
    println!("model_input (exact text sent to the model):");
6✔
771
    println!(
6✔
772
        "<<< BEGIN MODEL TEXT >>>\n{}\n<<< END MODEL TEXT >>>\n",
773
        chunk.text
774
    );
775
}
6✔
776

777
fn print_source_summary<'a, I>(label: &str, ids: I)
3✔
778
where
3✔
779
    I: Iterator<Item = &'a str>,
3✔
780
{
781
    let mut counts: HashMap<SourceId, usize> = HashMap::new();
3✔
782
    for id in ids {
3✔
783
        let source = extract_source(id);
3✔
784
        *counts.entry(source).or_insert(0) += 1;
3✔
785
    }
3✔
786
    if counts.is_empty() {
3✔
787
        return;
×
788
    }
3✔
789
    let skew = source_skew(&counts);
3✔
790
    let mut entries: Vec<(String, usize)> = counts.into_iter().collect();
3✔
791
    entries.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
3✔
792
    println!("--- {} by source ---", label);
3✔
793
    if let Some(skew) = skew {
3✔
794
        for entry in &skew.per_source {
3✔
795
            println!(
3✔
796
                "{}: count={} share={:.2}",
3✔
797
                entry.source, entry.count, entry.share
3✔
798
            );
3✔
799
        }
3✔
800
        println!(
3✔
801
            "skew: sources={} total={} min={} max={} mean={:.2} ratio={:.2}",
802
            skew.sources, skew.total, skew.min, skew.max, skew.mean, skew.ratio
803
        );
804
    } else {
805
        for (source, count) in &entries {
×
806
            println!("{source}: count={count}");
×
807
        }
×
808
    }
809
}
3✔
810

811
fn print_recipe_summary_by_source<'a, I>(label: &str, entries: I)
3✔
812
where
3✔
813
    I: Iterator<Item = (&'a str, &'a str)>,
3✔
814
{
815
    let mut counts: HashMap<SourceId, HashMap<String, usize>> = HashMap::new();
3✔
816
    for (record_id, recipe) in entries {
3✔
817
        let source = extract_source(record_id);
3✔
818
        let entry = counts
3✔
819
            .entry(source)
3✔
820
            .or_default()
3✔
821
            .entry(recipe.to_string())
3✔
822
            .or_insert(0);
3✔
823
        *entry += 1;
3✔
824
    }
3✔
825
    if counts.is_empty() {
3✔
826
        return;
×
827
    }
3✔
828
    let mut sources: Vec<(SourceId, HashMap<String, usize>)> = counts.into_iter().collect();
3✔
829
    sources.sort_by(|a, b| a.0.cmp(&b.0));
3✔
830
    println!("--- {} ---", label);
3✔
831
    for (source, recipes) in sources {
3✔
832
        println!("{source}");
3✔
833
        let mut entries: Vec<(String, usize)> = recipes.into_iter().collect();
3✔
834
        entries.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
3✔
835
        for (recipe, count) in entries {
3✔
836
            println!("  - {recipe}={count}");
3✔
837
        }
3✔
838
    }
839
}
3✔
840

841
fn extract_source(record_id: &str) -> SourceId {
8✔
842
    record_id
8✔
843
        .split_once("::")
8✔
844
        .map(|(source, _)| source.to_string())
8✔
845
        .unwrap_or_else(|| "unknown".to_string())
8✔
846
}
8✔
847

848
#[cfg(test)]
849
mod tests {
850
    use super::*;
851
    use crate::DeterministicSplitStore;
852
    use crate::data::SectionRole;
853
    use crate::source::{SourceCursor, SourceSnapshot};
854
    use chrono::Utc;
855
    use std::sync::atomic::{AtomicBool, Ordering};
856
    use tempfile::tempdir;
857

858
    /// Minimal in-memory `DataSource` test double for example app tests.
859
    struct TestSource {
860
        id: String,
861
        count: Option<u128>,
862
        recipes: Vec<TripletRecipe>,
863
    }
864

865
    impl DataSource for TestSource {
866
        fn id(&self) -> &str {
66✔
867
            &self.id
66✔
868
        }
66✔
869

870
        fn refresh(
15✔
871
            &self,
15✔
872
            _cursor: Option<&SourceCursor>,
15✔
873
            _limit: Option<usize>,
15✔
874
        ) -> Result<SourceSnapshot, SamplerError> {
15✔
875
            Ok(SourceSnapshot {
15✔
876
                records: Vec::new(),
15✔
877
                cursor: SourceCursor {
15✔
878
                    last_seen: Utc::now(),
15✔
879
                    revision: 0,
15✔
880
                },
15✔
881
            })
15✔
882
        }
15✔
883

884
        fn reported_record_count(&self) -> Result<u128, SamplerError> {
2✔
885
            self.count.ok_or_else(|| SamplerError::SourceInconsistent {
2✔
886
                source_id: self.id.clone(),
1✔
887
                details: "test source has no configured exact count".to_string(),
1✔
888
            })
1✔
889
        }
2✔
890

891
        fn configure_sampler(&self, _config: &SamplerConfig) {}
6✔
892

893
        fn default_triplet_recipes(&self) -> Vec<TripletRecipe> {
6✔
894
            self.recipes.clone()
6✔
895
        }
6✔
896
    }
897

898
    /// Test source whose record count is only available after `configure_sampler`.
    struct ConfigRequiredSource {
        /// Identifier returned from `id()`.
        id: String,
        /// Set to `true` once `configure_sampler` has been called.
        configured: AtomicBool,
    }
902

903
    impl DataSource for ConfigRequiredSource {
904
        fn id(&self) -> &str {
1✔
905
            &self.id
1✔
906
        }
1✔
907

NEW
908
        fn refresh(
×
NEW
909
            &self,
×
NEW
910
            _cursor: Option<&SourceCursor>,
×
NEW
911
            _limit: Option<usize>,
×
NEW
912
        ) -> Result<SourceSnapshot, SamplerError> {
×
NEW
913
            Ok(SourceSnapshot {
×
NEW
914
                records: Vec::new(),
×
NEW
915
                cursor: SourceCursor {
×
NEW
916
                    last_seen: Utc::now(),
×
NEW
917
                    revision: 0,
×
NEW
918
                },
×
NEW
919
            })
×
NEW
920
        }
×
921

922
        fn reported_record_count(&self) -> Result<u128, SamplerError> {
1✔
923
            if self.configured.load(Ordering::SeqCst) {
1✔
924
                Ok(1)
1✔
925
            } else {
NEW
926
                Err(SamplerError::SourceInconsistent {
×
NEW
927
                    source_id: self.id.clone(),
×
NEW
928
                    details: "sampler configuration not provided".to_string(),
×
NEW
929
                })
×
930
            }
931
        }
1✔
932

933
        fn configure_sampler(&self, _config: &SamplerConfig) {
1✔
934
            self.configured.store(true, Ordering::SeqCst);
1✔
935
        }
1✔
936
    }
937

938
    fn default_recipe(name: &str) -> TripletRecipe {
6✔
939
        TripletRecipe {
6✔
940
            name: name.to_string().into(),
6✔
941
            anchor: crate::config::Selector::Role(SectionRole::Anchor),
6✔
942
            positive_selector: crate::config::Selector::Role(SectionRole::Context),
6✔
943
            negative_selector: crate::config::Selector::Role(SectionRole::Context),
6✔
944
            negative_strategy: crate::config::NegativeStrategy::WrongArticle,
6✔
945
            weight: 1.0,
6✔
946
            instruction: None,
6✔
947
        }
6✔
948
    }
6✔
949

950
    /// The CLI value parsers accept well-formed input and reject everything else.
    #[test]
    fn parse_helpers_validate_inputs() {
        assert_eq!(parse_positive_usize("2").unwrap(), 2);
        for rejected in ["0", "abc"] {
            assert!(parse_positive_usize(rejected).is_err());
        }

        let ratios = parse_split_ratios_arg("0.8,0.1,0.1").unwrap();
        assert!((ratios.train - 0.8).abs() < 1e-6);
        for rejected in ["0.8,0.1", "1.0,0.0,0.1", "-0.1,0.6,0.5"] {
            assert!(parse_split_ratios_arg(rejected).is_err());
        }
    }
1✔
962

963
    /// `--help` parses to `None`; an unrecognised flag is reported as an error.
    #[test]
    fn parse_cli_handles_help_and_invalid_args() {
        let help_outcome =
            parse_cli::<EstimateCapacityCli, _>(["estimate_capacity", "--help"]).unwrap();
        assert!(help_outcome.is_none());

        let unknown_flag =
            parse_cli::<EstimateCapacityCli, _>(["estimate_capacity", "--unknown"]);
        assert!(unknown_flag.is_err());
    }
1✔
971

972
    /// A source that reports an exact count lets the estimate run to completion.
    #[test]
    fn run_estimate_capacity_succeeds_with_reported_counts() {
        let outcome = run_estimate_capacity(
            std::iter::empty::<String>(),
            |roots| {
                assert!(roots.is_empty());
                Ok(())
            },
            |_| {
                let source = TestSource {
                    id: "source_a".into(),
                    count: Some(12),
                    recipes: vec![default_recipe("r1")],
                };
                vec![Box::new(source) as DynSource]
            },
        );

        assert!(outcome.is_ok());
    }
1✔
991

992
    /// A source without an exact count surfaces a descriptive error.
    #[test]
    fn run_estimate_capacity_errors_when_source_count_missing() {
        let outcome = run_estimate_capacity(
            std::iter::empty::<String>(),
            |_| Ok(()),
            |_| {
                let source = TestSource {
                    id: "source_missing".into(),
                    count: None,
                    recipes: vec![default_recipe("r1")],
                };
                vec![Box::new(source) as DynSource]
            },
        );

        let message = outcome.unwrap_err().to_string();
        assert!(message.contains("failed to report exact record count"));
    }
1✔
1009

1010
    /// The source only reports a count after `configure_sampler`, so success here
    /// shows that configuration happens before counting.
    #[test]
    fn run_estimate_capacity_configures_sources_centrally_before_counting() {
        let outcome = run_estimate_capacity(
            std::iter::empty::<String>(),
            |_| Ok(()),
            |_| {
                let source = ConfigRequiredSource {
                    id: "requires_config".into(),
                    configured: AtomicBool::new(false),
                };
                vec![Box::new(source) as DynSource]
            },
        );

        assert!(outcome.is_ok());
    }
1✔
1025

1026
    /// `--help` parses to `None`; a zero batch size is rejected.
    #[test]
    fn parse_multi_source_cli_handles_help_and_batch_size_validation() {
        let help_outcome =
            parse_cli::<MultiSourceDemoCli, _>(["multi_source_demo", "--help"]).unwrap();
        assert!(help_outcome.is_none());

        let zero_batch =
            parse_cli::<MultiSourceDemoCli, _>(["multi_source_demo", "--batch-size", "0"]);
        assert!(zero_batch.is_err());
    }
1✔
1034

1035
    /// `--list-text-recipes` completes successfully against a temporary split store.
    #[test]
    fn run_multi_source_demo_list_text_recipes_path_succeeds() {
        // Keep the temp dir alive until the demo has finished using its path.
        let dir = tempdir().unwrap();
        let args = vec![
            "--list-text-recipes".to_string(),
            "--split-store-dir".to_string(),
            dir.path().to_string_lossy().to_string(),
        ];
        // The vector is not reused afterwards, so consume it outright rather than
        // draining a mutable binding (same as the sampling-modes test).
        let result = run_multi_source_demo(
            args.into_iter(),
            |_| Ok(()),
            |_| {
                vec![Box::new(TestSource {
                    id: "source_for_recipes".into(),
                    count: Some(10),
                    recipes: vec![default_recipe("recipe_a")],
                }) as DynSource]
            },
        );

        assert!(result.is_ok());
    }
1✔
1057

1058
    /// Pair, text and default (triplet) sampling all succeed on an empty source.
    #[test]
    fn run_multi_source_demo_sampling_modes_handle_empty_sources() {
        for mode_flag in [Some("--pair-batch"), Some("--text-recipes"), None] {
            let dir = tempdir().unwrap();
            let mut args: Vec<String> = mode_flag.map(String::from).into_iter().collect();
            args.extend([
                "--split-store-dir".to_string(),
                dir.path().to_string_lossy().to_string(),
                "--split".to_string(),
                "validation".to_string(),
            ]);

            let outcome = run_multi_source_demo(
                args.into_iter(),
                |_| Ok(()),
                |_| {
                    let source = TestSource {
                        id: "source_empty".into(),
                        count: Some(0),
                        recipes: vec![default_recipe("recipe_empty")],
                    };
                    vec![Box::new(source) as DynSource]
                },
            );

            assert!(outcome.is_ok());
        }
    }
1✔
1087

1088
    /// Exercises every print helper with hand-built batches, then checks
    /// `extract_source` with and without the `::` delimiter.
    #[test]
    fn print_helpers_and_extract_source_cover_paths() {
        let split = SplitRatios::default();
        let store = DeterministicSplitStore::new(split, 42).unwrap();
        let strategy = ChunkingStrategy::default();

        let anchor = RecordChunk {
            record_id: "source_a::rec1".to_string(),
            section_idx: 0,
            view: ChunkView::Window {
                index: 1,
                overlap: 2,
                span: 12,
                start_ratio: 0.25,
            },
            text: "anchor text".to_string(),
            tokens_estimate: 8,
            quality: crate::data::QualityScore { trust: 0.9 },
        };
        let positive = RecordChunk {
            record_id: "source_a::rec2".to_string(),
            section_idx: 1,
            view: ChunkView::SummaryFallback {
                strategy: "summary".to_string(),
                weight: 0.7,
            },
            text: "positive text".to_string(),
            tokens_estimate: 6,
            quality: crate::data::QualityScore { trust: 0.8 },
        };
        let negative = RecordChunk {
            record_id: "source_b::rec3".to_string(),
            section_idx: 2,
            view: ChunkView::Window {
                index: 0,
                overlap: 0,
                span: 16,
                start_ratio: 0.0,
            },
            text: "negative text".to_string(),
            tokens_estimate: 7,
            quality: crate::data::QualityScore { trust: 0.5 },
        };

        // The chunks are reused below, so this first batch takes copies.
        let triplet_batch = TripletBatch {
            triplets: vec![crate::SampleTriplet {
                recipe: "triplet_recipe".to_string(),
                anchor: anchor.clone(),
                positive: positive.clone(),
                negative: negative.clone(),
                weight: 1.0,
                instruction: Some("triplet instruction".to_string()),
            }],
        };
        print_triplet_batch(&strategy, &triplet_batch, &store);

        // Last use of `anchor` and `positive`: move them instead of cloning.
        let pair_batch = SampleBatch {
            pairs: vec![crate::SamplePair {
                recipe: "pair_recipe".to_string(),
                anchor,
                positive,
                weight: 1.0,
                instruction: None,
                label: crate::PairLabel::Positive,
                reason: Some("same topic".to_string()),
            }],
        };
        print_pair_batch(&strategy, &pair_batch, &store);

        let text_batch = TextBatch {
            samples: vec![crate::TextSample {
                recipe: "text_recipe".to_string(),
                chunk: negative,
                weight: 0.8,
                instruction: Some("text instruction".to_string()),
            }],
        };
        print_text_batch(&strategy, &text_batch, &store);

        let recipes = vec![TextRecipe {
            name: "recipe_name".into(),
            selector: crate::config::Selector::Role(SectionRole::Context),
            instruction: Some("instruction".into()),
            weight: 1.0,
        }];
        print_text_recipes(&recipes);

        assert_eq!(extract_source("source_a::record"), "source_a");
        assert_eq!(extract_source("record-without-delimiter"), "unknown");
    }
1✔
1178
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc