• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

jzombie / rust-triplets / 22355015974

24 Feb 2026 02:24PM UTC coverage: 91.416% (-1.3%) from 92.675%
22355015974

Pull #7

github

web-flow
Merge 3e9bcacec into 980559192
Pull Request #7: Add HF source

3819 of 4360 new or added lines in 6 files covered. (87.59%)

93 existing lines in 3 files now uncovered.

13206 of 14446 relevant lines covered (91.42%)

2735.09 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

94.96
/src/source/backends/file_source.rs
1
use std::collections::HashMap;
2
use std::path::{Path, PathBuf};
3
use std::sync::{Arc, Mutex};
4

5
use crate::config::{SamplerConfig, TripletRecipe};
6
use crate::data::{DataRecord, QualityScore, RecordSection, SectionRole};
7
use crate::errors::SamplerError;
8
use crate::source::indexing::file_corpus::FileCorpusIndex;
9
use crate::source::{DataSource, SourceCursor, SourceSnapshot};
10
use crate::transport::fs::{file_times, is_text_file};
11
use crate::types::{CategoryId, SourceId, TaxonomyValue};
12
use crate::utils::{make_section, normalize_inline_whitespace};
13

14
/// Builds taxonomy values from a root path and file path.
15
pub type TaxonomyBuilder =
16
    Arc<dyn Fn(&Path, &Path, &SourceId) -> Vec<TaxonomyValue> + Send + Sync + 'static>;
17

18
/// Builds record sections from a normalized title and body.
19
pub type SectionBuilder = Arc<dyn Fn(&str, &str) -> Vec<RecordSection> + Send + Sync + 'static>;
20

21
/// Configuration for a generic filesystem-backed data source.
22
#[derive(Clone)]
23
pub struct FileSourceConfig {
24
    /// Stable source identifier used in records and persistence keys.
25
    pub source_id: SourceId,
26
    /// Root directory containing source files.
27
    pub root: PathBuf,
28
    /// Default quality trust score applied to generated records.
29
    pub trust: f32,
30
    /// Optional trust overrides keyed by taxonomy segment.
31
    pub category_trust: HashMap<CategoryId, f32>,
32
    /// Whether to follow symlinks during index walking.
33
    pub follow_links: bool,
34
    /// Whether indexing should include only text files.
35
    pub text_files_only: bool,
36
    /// Whether deterministic directory grouping is enabled.
37
    pub group_by_directory: bool,
38
    /// Whether title extraction should replace underscores with spaces.
39
    pub title_replace_underscores: bool,
40
    /// Optional default recipes returned by this source.
41
    pub default_triplet_recipes: Vec<TripletRecipe>,
42
    /// Taxonomy builder invoked per file.
43
    pub taxonomy_builder: TaxonomyBuilder,
44
    /// Section builder invoked per file.
45
    pub section_builder: SectionBuilder,
46
}
47

48
impl FileSourceConfig {
49
    /// Create a config for a filesystem source with explicit id and root.
50
    pub fn new(source_id: impl Into<SourceId>, root: impl Into<PathBuf>) -> Self {
10✔
51
        Self {
10✔
52
            source_id: source_id.into(),
10✔
53
            root: root.into(),
10✔
54
            trust: 0.85,
10✔
55
            category_trust: HashMap::new(),
10✔
56
            follow_links: true,
10✔
57
            text_files_only: false,
10✔
58
            group_by_directory: true,
10✔
59
            title_replace_underscores: true,
10✔
60
            default_triplet_recipes: Vec::new(),
10✔
61
            taxonomy_builder: Arc::new(taxonomy_from_path),
10✔
62
            section_builder: Arc::new(anchor_context_sections),
10✔
63
        }
10✔
64
    }
10✔
65

66
    /// Override default trust score.
67
    pub fn with_trust(mut self, trust: f32) -> Self {
1✔
68
        self.trust = trust;
1✔
69
        self
1✔
70
    }
1✔
71

72
    /// Add a taxonomy-segment trust override.
73
    pub fn with_category_trust(mut self, category: impl Into<String>, trust: f32) -> Self {
3✔
74
        self.category_trust
3✔
75
            .insert(category.into().to_lowercase(), trust);
3✔
76
        self
3✔
77
    }
3✔
78

79
    /// Override whether symlinks are followed during index walk.
NEW
80
    pub fn with_follow_links(mut self, follow_links: bool) -> Self {
×
NEW
81
        self.follow_links = follow_links;
×
NEW
82
        self
×
NEW
83
    }
×
84

85
    /// Override whether index walk includes only text files.
86
    pub fn with_text_files_only(mut self, text_files_only: bool) -> Self {
1✔
87
        self.text_files_only = text_files_only;
1✔
88
        self
1✔
89
    }
1✔
90

91
    /// Enable or disable deterministic directory grouping.
NEW
92
    pub fn with_directory_grouping(mut self, group_by_directory: bool) -> Self {
×
NEW
93
        self.group_by_directory = group_by_directory;
×
NEW
94
        self
×
NEW
95
    }
×
96

97
    /// Set whether title extraction replaces underscores with spaces.
98
    pub fn with_title_replace_underscores(mut self, replace_underscores: bool) -> Self {
1✔
99
        self.title_replace_underscores = replace_underscores;
1✔
100
        self
1✔
101
    }
1✔
102

103
    /// Set source-provided default triplet recipes.
104
    pub fn with_default_triplet_recipes(mut self, recipes: Vec<TripletRecipe>) -> Self {
1✔
105
        self.default_triplet_recipes = recipes;
1✔
106
        self
1✔
107
    }
1✔
108

109
    /// Set a custom taxonomy builder.
110
    pub fn with_taxonomy_builder(mut self, taxonomy_builder: TaxonomyBuilder) -> Self {
1✔
111
        self.taxonomy_builder = taxonomy_builder;
1✔
112
        self
1✔
113
    }
1✔
114

115
    /// Set a custom section builder.
116
    pub fn with_section_builder(mut self, section_builder: SectionBuilder) -> Self {
1✔
117
        self.section_builder = section_builder;
1✔
118
        self
1✔
119
    }
1✔
120
}
121

122
/// Generic filesystem-backed source with configurable taxonomy and section mapping.
123
pub struct FileSource {
124
    config: FileSourceConfig,
125
    sampler_seed: Mutex<Option<u64>>,
126
}
127

128
impl FileSource {
129
    /// Create a generic file source from configuration.
130
    pub fn new(config: FileSourceConfig) -> Self {
10✔
131
        Self {
10✔
132
            config,
10✔
133
            sampler_seed: Mutex::new(None),
10✔
134
        }
10✔
135
    }
10✔
136

137
    fn configured_sampler_seed(&self) -> Result<u64, SamplerError> {
11✔
138
        self.sampler_seed
11✔
139
            .lock()
11✔
140
            .map_err(|_| SamplerError::SourceUnavailable {
11✔
NEW
141
                source_id: self.config.source_id.clone(),
×
NEW
142
                reason: "file source sampler-seed lock poisoned".to_string(),
×
NEW
143
            })?
×
144
            .ok_or_else(|| SamplerError::SourceInconsistent {
11✔
NEW
145
                source_id: self.config.source_id.clone(),
×
NEW
146
                details: "file source sampler configuration not provided".to_string(),
×
NEW
147
            })
×
148
    }
11✔
149

150
    fn file_corpus_index(&self, sampler_seed: u64) -> FileCorpusIndex {
11✔
151
        FileCorpusIndex::new(&self.config.root, &self.config.source_id)
11✔
152
            .with_sampler_seed(sampler_seed)
11✔
153
            .with_follow_links(self.config.follow_links)
11✔
154
            .with_text_files_only(self.config.text_files_only)
11✔
155
            .with_directory_grouping(self.config.group_by_directory)
11✔
156
    }
11✔
157

158
    fn trust_for_taxonomy(&self, taxonomy: &[String]) -> f32 {
32✔
159
        for segment in taxonomy.iter().skip(1) {
32✔
160
            if let Some(weight) = self.config.category_trust.get(&segment.to_lowercase()) {
4✔
161
                return *weight;
2✔
162
            }
2✔
163
        }
164
        self.config.trust
30✔
165
    }
32✔
166

167
    fn build_record(&self, path: &Path) -> Result<Option<DataRecord>, SamplerError> {
33✔
168
        if !is_text_file(path) {
33✔
169
            return Ok(None);
1✔
170
        }
32✔
171
        let title = FileCorpusIndex::normalized_title_from_stem(
32✔
172
            path,
32✔
173
            &self.config.source_id,
32✔
174
            self.config.title_replace_underscores,
32✔
NEW
175
        )?;
×
176
        if title.is_empty() {
32✔
NEW
177
            return Ok(None);
×
178
        }
32✔
179

180
        let body_raw = std::fs::read_to_string(path)?;
32✔
181
        let body = normalize_inline_whitespace(body_raw);
32✔
182
        if body.is_empty() {
32✔
NEW
183
            return Ok(None);
×
184
        }
32✔
185

186
        let taxonomy =
32✔
187
            (self.config.taxonomy_builder)(&self.config.root, path, &self.config.source_id);
32✔
188
        let sections = (self.config.section_builder)(&title, &body);
32✔
189
        let trust = self.trust_for_taxonomy(&taxonomy);
32✔
190
        let (created_at, updated_at) = file_times(path);
32✔
191

192
        Ok(Some(DataRecord {
32✔
193
            id: FileCorpusIndex::source_scoped_record_id(
32✔
194
                &self.config.source_id,
32✔
195
                &self.config.root,
32✔
196
                path,
32✔
197
            ),
32✔
198
            source: self.config.source_id.clone(),
32✔
199
            created_at,
32✔
200
            updated_at,
32✔
201
            quality: QualityScore { trust },
32✔
202
            taxonomy,
32✔
203
            sections,
32✔
204
            meta_prefix: None,
32✔
205
        }))
32✔
206
    }
33✔
207
}
208

209
impl DataSource for FileSource {
210
    fn id(&self) -> &str {
1✔
211
        &self.config.source_id
1✔
212
    }
1✔
213

214
    fn refresh(
10✔
215
        &self,
10✔
216
        cursor: Option<&SourceCursor>,
10✔
217
        limit: Option<usize>,
10✔
218
    ) -> Result<SourceSnapshot, SamplerError> {
10✔
219
        let sampler_seed = self.configured_sampler_seed()?;
10✔
220
        self.file_corpus_index(sampler_seed)
10✔
221
            .refresh_indexable(cursor, limit, |path| self.build_record(path))
33✔
222
    }
10✔
223

224
    fn reported_record_count(&self) -> Result<u128, SamplerError> {
1✔
225
        let sampler_seed = self.configured_sampler_seed()?;
1✔
226
        self.file_corpus_index(sampler_seed)
1✔
227
            .indexed_record_count()
1✔
228
            .map(|count| count as u128)
1✔
229
    }
1✔
230

231
    fn configure_sampler(&self, config: &SamplerConfig) {
10✔
232
        if let Ok(mut slot) = self.sampler_seed.lock() {
10✔
233
            *slot = Some(config.seed);
10✔
234
        }
10✔
235
    }
10✔
236

237
    fn default_triplet_recipes(&self) -> Vec<TripletRecipe> {
1✔
238
        self.config.default_triplet_recipes.clone()
1✔
239
    }
1✔
240
}
241

242
/// Build default taxonomy from the file path relative to `root`.
243
///
244
/// Output shape is `[source_id, <parent segments...>]`.
245
pub fn taxonomy_from_path(root: &Path, path: &Path, source_id: &SourceId) -> Vec<TaxonomyValue> {
33✔
246
    let mut taxonomy = vec![source_id.to_string()];
33✔
247
    if let Ok(rel) = path.strip_prefix(root)
33✔
248
        && let Some(parent) = rel.parent()
32✔
249
    {
250
        for segment in parent.iter() {
32✔
251
            taxonomy.push(segment.to_string_lossy().to_string());
5✔
252
        }
5✔
253
    }
1✔
254
    taxonomy
33✔
255
}
33✔
256

257
/// Build a default two-section payload of title anchor and body context.
258
pub fn anchor_context_sections(title: &str, body: &str) -> Vec<RecordSection> {
32✔
259
    vec![
32✔
260
        make_section(SectionRole::Anchor, None, title),
32✔
261
        make_section(SectionRole::Context, None, body),
32✔
262
    ]
263
}
32✔
264

265
#[cfg(test)]
mod tests {
    use super::*;
    use crate::config::{NegativeStrategy, Selector};
    use crate::source::configured_source_with_seed;
    use tempfile::tempdir;

    /// Construct a `FileSource` for `config` and apply `seed` through
    /// `configured_source_with_seed`.
    fn seeded_source(config: FileSourceConfig, seed: u64) -> FileSource {
        let source = FileSource::new(config);
        configured_source_with_seed(source, seed)
    }

    /// Refresh `source` from the start with the given limit and return the
    /// record ids in the order they were produced.
    fn refreshed_ids(source: &FileSource, limit: usize) -> Vec<String> {
        let snapshot = source.refresh(None, Some(limit)).unwrap();
        snapshot.records.into_iter().map(|record| record.id).collect()
    }

    #[test]
    fn reads_records_without_default_source_id() {
        let workspace = tempdir().unwrap();
        let factual_dir = workspace.path().join("factual");
        std::fs::create_dir_all(&factual_dir).unwrap();
        let question_file = factual_dir.join("What_is_alpha.txt");
        std::fs::write(question_file, "Alpha measures risk-adjusted outperformance.").unwrap();

        let config = FileSourceConfig::new("qa_custom", workspace.path());
        let source = seeded_source(config, 101);
        let snapshot = source.refresh(None, None).unwrap();

        assert_eq!(snapshot.records.len(), 1);
        assert_eq!(snapshot.records[0].source, "qa_custom");
    }

    #[test]
    fn applies_category_trust_overrides() {
        let workspace = tempdir().unwrap();
        let factual_dir = workspace.path().join("factual");
        let opinion_dir = workspace.path().join("opinionated");
        std::fs::create_dir_all(&factual_dir).unwrap();
        std::fs::create_dir_all(&opinion_dir).unwrap();
        let factual_file = factual_dir.join("What_is_beta.txt");
        std::fs::write(factual_file, "Beta compares volatility.").unwrap();
        let opinion_file = opinion_dir.join("Will_rates_fall.txt");
        std::fs::write(opinion_file, "Probably not this year.").unwrap();

        let config = FileSourceConfig::new("qa_weighted", workspace.path())
            .with_category_trust("factual", 0.95)
            .with_category_trust("opinionated", 0.6);
        let source = seeded_source(config, 101);
        let snapshot = source.refresh(None, None).unwrap();

        let factual_record = snapshot
            .records
            .iter()
            .find(|candidate| candidate.taxonomy.iter().any(|segment| segment == "factual"))
            .unwrap();
        let opinion_record = snapshot
            .records
            .iter()
            .find(|candidate| {
                candidate
                    .taxonomy
                    .iter()
                    .any(|segment| segment == "opinionated")
            })
            .unwrap();

        assert_eq!(factual_record.quality.trust, 0.95);
        assert_eq!(opinion_record.quality.trust, 0.6);
    }

    #[test]
    fn supports_custom_sections_and_default_recipes() {
        let workspace = tempdir().unwrap();
        let question_file = workspace.path().join("What_is_gamma.txt");
        std::fs::write(question_file, "Gamma measures convexity.").unwrap();

        let labeled_sections: SectionBuilder = Arc::new(|question, answer| {
            let anchor = make_section(SectionRole::Anchor, Some("Question"), question);
            let context = make_section(SectionRole::Context, Some("Answer"), answer);
            vec![anchor, context]
        });

        let question_answer = TripletRecipe {
            name: "question_answer".into(),
            anchor: Selector::Role(SectionRole::Anchor),
            positive_selector: Selector::Role(SectionRole::Context),
            negative_selector: Selector::Role(SectionRole::Context),
            negative_strategy: NegativeStrategy::QuestionAnswerMismatch,
            weight: 1.0,
            instruction: None,
        };
        let recipes = vec![question_answer];

        let config = FileSourceConfig::new("qa_sections", workspace.path())
            .with_section_builder(labeled_sections)
            .with_default_triplet_recipes(recipes.clone());
        let source = seeded_source(config, 101);

        let snapshot = source.refresh(None, None).unwrap();
        assert_eq!(snapshot.records.len(), 1);
        assert_eq!(snapshot.records[0].sections.len(), 2);
        assert_eq!(source.default_triplet_recipes().len(), recipes.len());
    }

    #[test]
    fn taxonomy_from_path_handles_nested_and_non_descendant_paths() {
        let workspace = tempdir().unwrap();
        let root = workspace.path().join("root");
        std::fs::create_dir_all(root.join("topic/subtopic")).unwrap();
        let source_id = "qa_tax".to_string();

        // A file below `root` contributes one segment per parent directory.
        let nested = root.join("topic/subtopic/doc.txt");
        let taxonomy = taxonomy_from_path(&root, &nested, &source_id);
        assert_eq!(taxonomy, vec!["qa_tax", "topic", "subtopic"]);

        // A file outside `root` yields only the source id.
        let outside = workspace.path().join("outside.txt");
        let outside_taxonomy = taxonomy_from_path(&root, &outside, &source_id);
        assert_eq!(outside_taxonomy, vec!["qa_tax"]);
    }

    #[test]
    fn anchor_context_sections_build_expected_roles_and_text() {
        let sections = anchor_context_sections("What is delta", "Delta is change over time.");
        assert_eq!(sections.len(), 2);

        let anchor = &sections[0];
        assert_eq!(anchor.role, SectionRole::Anchor);
        assert_eq!(anchor.text, "What is delta");

        let context = &sections[1];
        assert_eq!(context.role, SectionRole::Context);
        assert_eq!(context.text, "Delta is change over time.");
    }

    #[test]
    fn title_replace_underscores_toggle_changes_anchor_title_text() {
        let workspace = tempdir().unwrap();
        let question_file = workspace.path().join("What_is_delta.txt");
        std::fs::write(question_file, "Delta captures directional change.").unwrap();

        // Default config: underscores in the file stem become spaces.
        let default_config = FileSourceConfig::new("qa_title_default", workspace.path());
        let source_default = seeded_source(default_config, 101);
        let default_snapshot = source_default.refresh(None, Some(1)).unwrap();
        assert_eq!(default_snapshot.records.len(), 1);
        assert_eq!(
            default_snapshot.records[0].sections[0].text,
            "What is delta"
        );

        // Replacement disabled: the stem is kept as written.
        let preserve_config = FileSourceConfig::new("qa_title_preserve", workspace.path())
            .with_title_replace_underscores(false);
        let source_preserve = seeded_source(preserve_config, 101);
        let preserve_snapshot = source_preserve.refresh(None, Some(1)).unwrap();
        assert_eq!(preserve_snapshot.records.len(), 1);
        assert_eq!(
            preserve_snapshot.records[0].sections[0].text,
            "What_is_delta"
        );
    }

    #[test]
    fn refresh_skips_non_txt_files_even_when_text_only_disabled() {
        let workspace = tempdir().unwrap();
        let markdown_file = workspace.path().join("notes.md");
        let text_file = workspace.path().join("doc.txt");
        std::fs::write(markdown_file, "markdown should be skipped").unwrap();
        std::fs::write(text_file, "plain text should be indexed").unwrap();

        let config =
            FileSourceConfig::new("qa_filtering", workspace.path()).with_text_files_only(false);
        let source = seeded_source(config, 101);
        let snapshot = source.refresh(None, None).unwrap();

        assert_eq!(snapshot.records.len(), 1);
        assert!(snapshot.records[0].id.contains("doc.txt"));
    }

    #[test]
    fn trust_falls_back_to_default_and_count_and_id_are_exposed() {
        let workspace = tempdir().unwrap();
        let docs_dir = workspace.path().join("docs");
        std::fs::create_dir_all(&docs_dir).unwrap();
        std::fs::write(docs_dir.join("alpha.txt"), "Alpha body.").unwrap();

        // The custom taxonomy never contains "factual", so the override must
        // not apply and the default trust is expected.
        let unmatched_taxonomy: TaxonomyBuilder =
            Arc::new(|_, _, source_id| vec![source_id.clone(), "UNMATCHED".to_string()]);
        let config = FileSourceConfig::new("qa_count", workspace.path())
            .with_trust(0.42)
            .with_category_trust("factual", 0.95)
            .with_taxonomy_builder(unmatched_taxonomy);
        let source = seeded_source(config, 101);

        let snapshot = source.refresh(None, None).unwrap();
        assert_eq!(snapshot.records.len(), 1);
        assert_eq!(snapshot.records[0].quality.trust, 0.42);
        assert_eq!(source.id(), "qa_count");
        assert_eq!(source.reported_record_count().unwrap(), 1);
    }

    #[test]
    fn sampler_seed_controls_file_source_refresh_order() {
        let workspace = tempdir().unwrap();
        for idx in 0..12 {
            let file = workspace.path().join(format!("doc_{idx:02}.txt"));
            std::fs::write(file, format!("Body text for {idx}")).unwrap();
        }

        let source_a = seeded_source(FileSourceConfig::new("seeded_a", workspace.path()), 11);
        let source_b = seeded_source(FileSourceConfig::new("seeded_a", workspace.path()), 11);
        let source_c = seeded_source(FileSourceConfig::new("seeded_a", workspace.path()), 29);

        let ids_a = refreshed_ids(&source_a, 8);
        let ids_b = refreshed_ids(&source_b, 8);
        let ids_c = refreshed_ids(&source_c, 8);

        // Same seed gives the same order; a different seed gives a different one.
        assert_eq!(ids_a, ids_b);
        assert_ne!(ids_a, ids_c);
    }
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc