• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

jzombie / rust-triplets / 22358031159

24 Feb 2026 03:38PM UTC coverage: 92.488% (-0.2%) from 92.675%
22358031159

Pull #7

github

web-flow
Merge 843abfd29 into 980559192
Pull Request #7: Add HF source

4634 of 5195 new or added lines in 8 files covered. (89.2%)

1 existing line in 1 file now uncovered.

14073 of 15216 relevant lines covered (92.49%)

2599.58 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

96.6
/src/source/backends/file_source.rs
1
use std::collections::HashMap;
2
use std::path::{Path, PathBuf};
3
use std::sync::Arc;
4

5
use crate::config::{SamplerConfig, TripletRecipe};
6
use crate::data::{DataRecord, QualityScore, RecordSection, SectionRole};
7
use crate::errors::SamplerError;
8
use crate::source::indexing::file_corpus::FileCorpusIndex;
9
use crate::source::{DataSource, SourceCursor, SourceSnapshot};
10
use crate::transport::fs::{file_times, is_text_file};
11
use crate::types::{CategoryId, SourceId, TaxonomyValue};
12
use crate::utils::{make_section, normalize_inline_whitespace};
13

14
/// Builds taxonomy values from a root path and file path.
15
pub type TaxonomyBuilder =
16
    Arc<dyn Fn(&Path, &Path, &SourceId) -> Vec<TaxonomyValue> + Send + Sync + 'static>;
17

18
/// Builds record sections from a normalized title and body.
19
pub type SectionBuilder = Arc<dyn Fn(&str, &str) -> Vec<RecordSection> + Send + Sync + 'static>;
20

21
/// Configuration for a generic filesystem-backed data source.
22
#[derive(Clone)]
23
pub struct FileSourceConfig {
24
    /// Stable source identifier used in records and persistence keys.
25
    pub source_id: SourceId,
26
    /// Root directory containing source files.
27
    pub root: PathBuf,
28
    /// Default quality trust score applied to generated records.
29
    pub trust: f32,
30
    /// Optional trust overrides keyed by taxonomy segment.
31
    pub category_trust: HashMap<CategoryId, f32>,
32
    /// Whether to follow symlinks during index walking.
33
    pub follow_links: bool,
34
    /// Whether indexing should include only text files.
35
    pub text_files_only: bool,
36
    /// Whether deterministic directory grouping is enabled.
37
    pub group_by_directory: bool,
38
    /// Whether title extraction should replace underscores with spaces.
39
    pub title_replace_underscores: bool,
40
    /// Optional default recipes returned by this source.
41
    pub default_triplet_recipes: Vec<TripletRecipe>,
42
    /// Taxonomy builder invoked per file.
43
    pub taxonomy_builder: TaxonomyBuilder,
44
    /// Section builder invoked per file.
45
    pub section_builder: SectionBuilder,
46
}
47

48
impl FileSourceConfig {
49
    /// Create a config for a filesystem source with explicit id and root.
50
    pub fn new(source_id: impl Into<SourceId>, root: impl Into<PathBuf>) -> Self {
10✔
51
        Self {
10✔
52
            source_id: source_id.into(),
10✔
53
            root: root.into(),
10✔
54
            trust: 0.85,
10✔
55
            category_trust: HashMap::new(),
10✔
56
            follow_links: true,
10✔
57
            text_files_only: false,
10✔
58
            group_by_directory: true,
10✔
59
            title_replace_underscores: true,
10✔
60
            default_triplet_recipes: Vec::new(),
10✔
61
            taxonomy_builder: Arc::new(taxonomy_from_path),
10✔
62
            section_builder: Arc::new(anchor_context_sections),
10✔
63
        }
10✔
64
    }
10✔
65

66
    /// Override default trust score.
67
    pub fn with_trust(mut self, trust: f32) -> Self {
1✔
68
        self.trust = trust;
1✔
69
        self
1✔
70
    }
1✔
71

72
    /// Add a taxonomy-segment trust override.
73
    pub fn with_category_trust(mut self, category: impl Into<String>, trust: f32) -> Self {
3✔
74
        self.category_trust
3✔
75
            .insert(category.into().to_lowercase(), trust);
3✔
76
        self
3✔
77
    }
3✔
78

79
    /// Override whether symlinks are followed during index walk.
NEW
80
    pub fn with_follow_links(mut self, follow_links: bool) -> Self {
×
NEW
81
        self.follow_links = follow_links;
×
NEW
82
        self
×
NEW
83
    }
×
84

85
    /// Override whether index walk includes only text files.
86
    pub fn with_text_files_only(mut self, text_files_only: bool) -> Self {
1✔
87
        self.text_files_only = text_files_only;
1✔
88
        self
1✔
89
    }
1✔
90

91
    /// Enable or disable deterministic directory grouping.
NEW
92
    pub fn with_directory_grouping(mut self, group_by_directory: bool) -> Self {
×
NEW
93
        self.group_by_directory = group_by_directory;
×
NEW
94
        self
×
NEW
95
    }
×
96

97
    /// Set whether title extraction replaces underscores with spaces.
98
    pub fn with_title_replace_underscores(mut self, replace_underscores: bool) -> Self {
1✔
99
        self.title_replace_underscores = replace_underscores;
1✔
100
        self
1✔
101
    }
1✔
102

103
    /// Set source-provided default triplet recipes.
104
    pub fn with_default_triplet_recipes(mut self, recipes: Vec<TripletRecipe>) -> Self {
1✔
105
        self.default_triplet_recipes = recipes;
1✔
106
        self
1✔
107
    }
1✔
108

109
    /// Set a custom taxonomy builder.
110
    pub fn with_taxonomy_builder(mut self, taxonomy_builder: TaxonomyBuilder) -> Self {
1✔
111
        self.taxonomy_builder = taxonomy_builder;
1✔
112
        self
1✔
113
    }
1✔
114

115
    /// Set a custom section builder.
116
    pub fn with_section_builder(mut self, section_builder: SectionBuilder) -> Self {
1✔
117
        self.section_builder = section_builder;
1✔
118
        self
1✔
119
    }
1✔
120
}
121

122
/// Generic filesystem-backed source with configurable taxonomy and section mapping.
123
pub struct FileSource {
124
    config: FileSourceConfig,
125
}
126

127
impl FileSource {
128
    /// Create a generic file source from configuration.
129
    pub fn new(config: FileSourceConfig) -> Self {
10✔
130
        Self { config }
10✔
131
    }
10✔
132

133
    fn file_corpus_index(&self, sampler_seed: u64) -> FileCorpusIndex {
11✔
134
        FileCorpusIndex::new(&self.config.root, &self.config.source_id)
11✔
135
            .with_sampler_seed(sampler_seed)
11✔
136
            .with_follow_links(self.config.follow_links)
11✔
137
            .with_text_files_only(self.config.text_files_only)
11✔
138
            .with_directory_grouping(self.config.group_by_directory)
11✔
139
    }
11✔
140

141
    fn trust_for_taxonomy(&self, taxonomy: &[String]) -> f32 {
32✔
142
        for segment in taxonomy.iter().skip(1) {
32✔
143
            if let Some(weight) = self.config.category_trust.get(&segment.to_lowercase()) {
4✔
144
                return *weight;
2✔
145
            }
2✔
146
        }
147
        self.config.trust
30✔
148
    }
32✔
149

150
    fn build_record(&self, path: &Path) -> Result<Option<DataRecord>, SamplerError> {
33✔
151
        if !is_text_file(path) {
33✔
152
            return Ok(None);
1✔
153
        }
32✔
154
        let title = FileCorpusIndex::normalized_title_from_stem(
32✔
155
            path,
32✔
156
            &self.config.source_id,
32✔
157
            self.config.title_replace_underscores,
32✔
NEW
158
        )?;
×
159
        if title.is_empty() {
32✔
NEW
160
            return Ok(None);
×
161
        }
32✔
162

163
        let body_raw = std::fs::read_to_string(path)?;
32✔
164
        let body = normalize_inline_whitespace(body_raw);
32✔
165
        if body.is_empty() {
32✔
NEW
166
            return Ok(None);
×
167
        }
32✔
168

169
        let taxonomy =
32✔
170
            (self.config.taxonomy_builder)(&self.config.root, path, &self.config.source_id);
32✔
171
        let sections = (self.config.section_builder)(&title, &body);
32✔
172
        let trust = self.trust_for_taxonomy(&taxonomy);
32✔
173
        let (created_at, updated_at) = file_times(path);
32✔
174

175
        Ok(Some(DataRecord {
32✔
176
            id: FileCorpusIndex::source_scoped_record_id(
32✔
177
                &self.config.source_id,
32✔
178
                &self.config.root,
32✔
179
                path,
32✔
180
            ),
32✔
181
            source: self.config.source_id.clone(),
32✔
182
            created_at,
32✔
183
            updated_at,
32✔
184
            quality: QualityScore { trust },
32✔
185
            taxonomy,
32✔
186
            sections,
32✔
187
            meta_prefix: None,
32✔
188
        }))
32✔
189
    }
33✔
190
}
191

192
impl DataSource for FileSource {
193
    fn id(&self) -> &str {
1✔
194
        &self.config.source_id
1✔
195
    }
1✔
196

197
    fn refresh(
10✔
198
        &self,
10✔
199
        config: &SamplerConfig,
10✔
200
        cursor: Option<&SourceCursor>,
10✔
201
        limit: Option<usize>,
10✔
202
    ) -> Result<SourceSnapshot, SamplerError> {
10✔
203
        self.file_corpus_index(config.seed)
10✔
204
            .refresh_indexable(cursor, limit, |path| self.build_record(path))
33✔
205
    }
10✔
206

207
    fn reported_record_count(&self, config: &SamplerConfig) -> Result<u128, SamplerError> {
1✔
208
        self.file_corpus_index(config.seed)
1✔
209
            .indexed_record_count()
1✔
210
            .map(|count| count as u128)
1✔
211
    }
1✔
212

213
    fn default_triplet_recipes(&self) -> Vec<TripletRecipe> {
1✔
214
        self.config.default_triplet_recipes.clone()
1✔
215
    }
1✔
216
}
217

218
/// Build default taxonomy from the file path relative to `root`.
219
///
220
/// Output shape is `[source_id, <parent segments...>]`.
221
pub fn taxonomy_from_path(root: &Path, path: &Path, source_id: &SourceId) -> Vec<TaxonomyValue> {
33✔
222
    let mut taxonomy = vec![source_id.to_string()];
33✔
223
    if let Ok(rel) = path.strip_prefix(root)
33✔
224
        && let Some(parent) = rel.parent()
32✔
225
    {
226
        for segment in parent.iter() {
32✔
227
            taxonomy.push(segment.to_string_lossy().to_string());
5✔
228
        }
5✔
229
    }
1✔
230
    taxonomy
33✔
231
}
33✔
232

233
/// Build a default two-section payload of title anchor and body context.
234
pub fn anchor_context_sections(title: &str, body: &str) -> Vec<RecordSection> {
32✔
235
    vec![
32✔
236
        make_section(SectionRole::Anchor, None, title),
32✔
237
        make_section(SectionRole::Context, None, body),
32✔
238
    ]
239
}
32✔
240

241
#[cfg(test)]
mod tests {
    use super::*;
    use crate::config::{NegativeStrategy, Selector};
    use tempfile::tempdir;

    /// Default sampler configuration with only the seed overridden.
    fn sampler_config(seed: u64) -> SamplerConfig {
        SamplerConfig {
            seed,
            ..SamplerConfig::default()
        }
    }

    /// Write `body` to `dir/name`, creating `dir` first if needed.
    fn write_doc(dir: &Path, name: &str, body: &str) {
        std::fs::create_dir_all(dir).unwrap();
        std::fs::write(dir.join(name), body).unwrap();
    }

    /// Refresh `source` with no cursor and no limit.
    fn refresh_all(source: &FileSource, seed: u64) -> SourceSnapshot {
        source.refresh(&sampler_config(seed), None, None).unwrap()
    }

    /// Record ids from a limited refresh, in the order the source returned them.
    fn refreshed_ids(source: &FileSource, seed: u64, limit: usize) -> Vec<String> {
        source
            .refresh(&sampler_config(seed), None, Some(limit))
            .unwrap()
            .records
            .into_iter()
            .map(|record| record.id)
            .collect()
    }

    #[test]
    fn reads_records_without_default_source_id() {
        let temp = tempdir().unwrap();
        write_doc(
            &temp.path().join("factual"),
            "What_is_alpha.txt",
            "Alpha measures risk-adjusted outperformance.",
        );

        let source = FileSource::new(FileSourceConfig::new("qa_custom", temp.path()));
        let records = refresh_all(&source, 101).records;

        assert_eq!(records.len(), 1);
        assert_eq!(records[0].source, "qa_custom");
    }

    #[test]
    fn applies_category_trust_overrides() {
        let temp = tempdir().unwrap();
        write_doc(
            &temp.path().join("factual"),
            "What_is_beta.txt",
            "Beta compares volatility.",
        );
        write_doc(
            &temp.path().join("opinionated"),
            "Will_rates_fall.txt",
            "Probably not this year.",
        );

        let config = FileSourceConfig::new("qa_weighted", temp.path())
            .with_category_trust("factual", 0.95)
            .with_category_trust("opinionated", 0.6);
        let source = FileSource::new(config);
        let snapshot = refresh_all(&source, 101);

        // Trust of the first record whose taxonomy contains `category`.
        let trust_of = |category: &str| {
            snapshot
                .records
                .iter()
                .find(|record| record.taxonomy.iter().any(|value| value == category))
                .unwrap()
                .quality
                .trust
        };

        assert_eq!(trust_of("factual"), 0.95);
        assert_eq!(trust_of("opinionated"), 0.6);
    }

    #[test]
    fn supports_custom_sections_and_default_recipes() {
        let temp = tempdir().unwrap();
        write_doc(
            temp.path(),
            "What_is_gamma.txt",
            "Gamma measures convexity.",
        );

        let sections: SectionBuilder = Arc::new(|question, answer| {
            vec![
                make_section(SectionRole::Anchor, Some("Question"), question),
                make_section(SectionRole::Context, Some("Answer"), answer),
            ]
        });

        let recipes = vec![TripletRecipe {
            name: "question_answer".into(),
            anchor: Selector::Role(SectionRole::Anchor),
            positive_selector: Selector::Role(SectionRole::Context),
            negative_selector: Selector::Role(SectionRole::Context),
            negative_strategy: NegativeStrategy::QuestionAnswerMismatch,
            weight: 1.0,
            instruction: None,
        }];
        let expected_recipe_count = recipes.len();

        let config = FileSourceConfig::new("qa_sections", temp.path())
            .with_section_builder(sections)
            .with_default_triplet_recipes(recipes);
        let source = FileSource::new(config);

        let snapshot = refresh_all(&source, 101);
        assert_eq!(snapshot.records.len(), 1);
        assert_eq!(snapshot.records[0].sections.len(), 2);
        assert_eq!(
            source.default_triplet_recipes().len(),
            expected_recipe_count
        );
    }

    #[test]
    fn taxonomy_from_path_handles_nested_and_non_descendant_paths() {
        let temp = tempdir().unwrap();
        let root = temp.path().join("root");
        std::fs::create_dir_all(root.join("topic/subtopic")).unwrap();
        let source_id = "qa_tax".to_string();

        let inside = root.join("topic/subtopic/doc.txt");
        assert_eq!(
            taxonomy_from_path(&root, &inside, &source_id),
            vec!["qa_tax", "topic", "subtopic"]
        );

        let outside = temp.path().join("outside.txt");
        assert_eq!(
            taxonomy_from_path(&root, &outside, &source_id),
            vec!["qa_tax"]
        );
    }

    #[test]
    fn anchor_context_sections_build_expected_roles_and_text() {
        let sections = anchor_context_sections("What is delta", "Delta is change over time.");
        assert_eq!(sections.len(), 2);

        let (anchor, context) = (&sections[0], &sections[1]);
        assert_eq!(anchor.role, SectionRole::Anchor);
        assert_eq!(anchor.text, "What is delta");
        assert_eq!(context.role, SectionRole::Context);
        assert_eq!(context.text, "Delta is change over time.");
    }

    #[test]
    fn title_replace_underscores_toggle_changes_anchor_title_text() {
        let temp = tempdir().unwrap();
        write_doc(
            temp.path(),
            "What_is_delta.txt",
            "Delta captures directional change.",
        );

        // Refresh a single record and return the text of its first section.
        let anchor_text = |config: FileSourceConfig| {
            let snapshot = FileSource::new(config)
                .refresh(&sampler_config(101), None, Some(1))
                .unwrap();
            assert_eq!(snapshot.records.len(), 1);
            snapshot.records[0].sections[0].text.clone()
        };

        let replaced = anchor_text(FileSourceConfig::new("qa_title_default", temp.path()));
        assert_eq!(replaced, "What is delta");

        let preserved = anchor_text(
            FileSourceConfig::new("qa_title_preserve", temp.path())
                .with_title_replace_underscores(false),
        );
        assert_eq!(preserved, "What_is_delta");
    }

    #[test]
    fn refresh_skips_non_txt_files_even_when_text_only_disabled() {
        let temp = tempdir().unwrap();
        write_doc(temp.path(), "notes.md", "markdown should be skipped");
        write_doc(temp.path(), "doc.txt", "plain text should be indexed");

        let config = FileSourceConfig::new("qa_filtering", temp.path()).with_text_files_only(false);
        let source = FileSource::new(config);

        let records = refresh_all(&source, 101).records;
        assert_eq!(records.len(), 1);
        assert!(records[0].id.contains("doc.txt"));
    }

    #[test]
    fn trust_falls_back_to_default_and_count_and_id_are_exposed() {
        let temp = tempdir().unwrap();
        write_doc(&temp.path().join("docs"), "alpha.txt", "Alpha body.");

        // The custom taxonomy never contains "factual", so the override must not apply.
        let unmatched_taxonomy: TaxonomyBuilder =
            Arc::new(|_, _, source_id| vec![source_id.clone(), "UNMATCHED".to_string()]);
        let config = FileSourceConfig::new("qa_count", temp.path())
            .with_trust(0.42)
            .with_category_trust("factual", 0.95)
            .with_taxonomy_builder(unmatched_taxonomy);
        let source = FileSource::new(config);

        let seed_101 = sampler_config(101);
        let snapshot = source.refresh(&seed_101, None, None).unwrap();

        assert_eq!(snapshot.records.len(), 1);
        assert_eq!(snapshot.records[0].quality.trust, 0.42);
        assert_eq!(source.id(), "qa_count");
        assert_eq!(source.reported_record_count(&seed_101).unwrap(), 1);
    }

    #[test]
    fn sampler_seed_controls_file_source_refresh_order() {
        let temp = tempdir().unwrap();
        for idx in 0..12 {
            write_doc(
                temp.path(),
                &format!("doc_{idx:02}.txt"),
                &format!("Body text for {idx}"),
            );
        }

        // All three sources share one id so that record ids are directly comparable.
        let make_source = || FileSource::new(FileSourceConfig::new("seeded_a", temp.path()));
        let source_a = make_source();
        let source_b = make_source();
        let source_c = make_source();

        let ids_a = refreshed_ids(&source_a, 11, 8);
        let ids_b = refreshed_ids(&source_b, 11, 8);
        let ids_c = refreshed_ids(&source_c, 29, 8);

        assert_eq!(ids_a, ids_b);
        assert_ne!(ids_a, ids_c);
    }
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc