• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

jzombie / rust-triplets / 24927961805

25 Apr 2026 09:37AM UTC coverage: 95.473% (-0.06%) from 95.528%
24927961805

push

github

web-flow
Add optional denoise (#81)

* Bump rustls-webpki in the cargo group across 1 directory

Bumps the cargo group with 1 update in the / directory: [rustls-webpki](https://github.com/rustls/webpki).


Updates `rustls-webpki` from 0.103.12 to 0.103.13
- [Release notes](https://github.com/rustls/webpki/releases)
- [Commits](https://github.com/rustls/webpki/compare/v/0.103.12...v/0.103.13)

---
updated-dependencies:
- dependency-name: rustls-webpki
  dependency-version: 0.103.13
  dependency-type: indirect
  dependency-group: cargo
...

Signed-off-by: dependabot[bot] <support@github.com>

* Prototype denoiser implementation

Co-authored-by: Copilot <copilot@github.com>

* Add more tests

Co-authored-by: Copilot <copilot@github.com>

* Add support for linearized data

Co-authored-by: Copilot <copilot@github.com>

* cargo fmt --all

* Remove user-configurable line-level control

Co-authored-by: Copilot <copilot@github.com>

* Draft README update

* Use single-line assertion

* Draft tagline and description

Co-authored-by: Copilot <copilot@github.com>

* Add line break

* Make markdown stripping optional

Co-authored-by: Copilot <copilot@github.com>

* Add link to GFM

Co-authored-by: Copilot <copilot@github.com>

* Fix doc comment

Co-authored-by: Copilot <copilot@github.com>

* Debug flaky CI tests

* Prepare for 0.19.0-alpha

* Add denoiser example to README

Co-authored-by: Copilot <copilot@github.com>

* Improve InMemory source implementation

Co-authored-by: Copilot <copilot@github.com>

* Include links to individual sources

Co-authored-by: Copilot <copilot@github.com>

* Reorder sources

* Update default source verbiage

* Add ability to construct DataRecord from text

Co-authored-by: Copilot <copilot@github.com>

* Prototype iterative wave expansion

Co-authored-by: Copilot <copilot@github.com>

* cargo fmt --all

* Migrate to preprocessor architecture

Co-authored-by: Copilot <copilot@github.com>

* Add more tests

Co-authored-by: Copilot <copilot@gi... (continued)

1028 of 1083 new or added lines in 6 files covered. (94.92%)

1 existing line in 1 file now uncovered.

18708 of 19595 relevant lines covered (95.47%)

3961.16 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

76.56
/src/data.rs
1
use chrono::{DateTime, Utc};
2
use serde::{Deserialize, Serialize};
3
use std::collections::HashMap;
4

5
use crate::kvp::KvpPrefixSampler;
6

7
pub use crate::types::{RecordId, Sentence, SourceId, TaxonomyValue};
8

9
/// Trust/quality metadata for a record.
10
#[derive(Clone, Copy, Debug, Serialize, Deserialize)]
11
pub struct QualityScore {
12
    /// Normalized 0-1 trust measure combining provenance, recency, and manual reviews.
13
    pub trust: f32,
14
}
15

16
impl Default for QualityScore {
17
    fn default() -> Self {
403✔
18
        Self {
403✔
19
            // Assume medium trust by default, allowing recipes to upweight or downweight based on other signals.
403✔
20
            trust: 0.5,
403✔
21
        }
403✔
22
    }
403✔
23
}
24

25
/// Canonical record payload produced by a DataSource.
26
#[derive(Clone, Debug, Serialize, Deserialize)]
27
pub struct DataRecord {
28
    /// Stable record identifier (used for splits and determinism).
29
    pub id: RecordId,
30
    /// Source identifier that produced this record.
31
    pub source: SourceId,
32
    /// Canonical creation time for the record (used for ordering/metadata).
33
    pub created_at: DateTime<Utc>,
34
    /// Last update time for the record (used for refresh decisions).
35
    pub updated_at: DateTime<Utc>,
36
    /// Trust/quality score used to weight sampling.
37
    pub quality: QualityScore,
38
    /// Free-form tags (e.g., source id, year, date) used for filtering/recipes.
39
    pub taxonomy: Vec<TaxonomyValue>,
40
    /// Structured content sections used by sampling recipes.
41
    pub sections: Vec<RecordSection>,
42
    /// Optional metadata prefix policy for KVP sampling (key-value headers injected into text).
43
    #[serde(default, skip_serializing_if = "Option::is_none")]
44
    pub meta_prefix: Option<KvpPrefixSampler>,
45
}
46

47
impl DataRecord {
48
    /// Create a record with a single [`SectionRole::Context`] section from a plain text string.
49
    ///
50
    /// The `id` and `source` are set to the same value. Use [`DataRecord::from_text_with_role`]
51
    /// to assign a different role, or construct the struct directly for full control.
52
    ///
53
    /// # Example
54
    ///
55
    /// ```
56
    /// use triplets::DataRecord;
57
    ///
58
    /// let record = DataRecord::from_text("doc-0", "my_corpus", "The quick brown fox.");
59
    /// assert_eq!(record.id.as_str(), "doc-0");
60
    /// assert_eq!(record.sections[0].text, "The quick brown fox.");
61
    /// ```
NEW
62
    pub fn from_text(
×
NEW
63
        id: impl Into<crate::types::RecordId>,
×
NEW
64
        source: impl Into<crate::types::SourceId>,
×
NEW
65
        text: impl Into<String>,
×
NEW
66
    ) -> Self {
×
NEW
67
        Self::from_text_with_role(id, source, text, SectionRole::Context)
×
NEW
68
    }
×
69

70
    /// Create a record with a single section of the given role from a plain text string.
71
    ///
72
    /// # Example
73
    ///
74
    /// ```
75
    /// use triplets::{DataRecord, SectionRole};
76
    ///
77
    /// let record = DataRecord::from_text_with_role(
78
    ///     "doc-0", "my_corpus", "What is the capital of France?", SectionRole::Anchor,
79
    /// );
80
    /// assert_eq!(record.sections[0].role, SectionRole::Anchor);
81
    /// ```
NEW
82
    pub fn from_text_with_role(
×
NEW
83
        id: impl Into<crate::types::RecordId>,
×
NEW
84
        source: impl Into<crate::types::SourceId>,
×
NEW
85
        text: impl Into<String>,
×
NEW
86
        role: SectionRole,
×
NEW
87
    ) -> Self {
×
NEW
88
        let now = chrono::Utc::now();
×
NEW
89
        Self {
×
NEW
90
            id: id.into(),
×
NEW
91
            source: source.into(),
×
NEW
92
            created_at: now,
×
NEW
93
            updated_at: now,
×
NEW
94
            quality: QualityScore::default(),
×
NEW
95
            taxonomy: vec![],
×
NEW
96
            sections: vec![RecordSection {
×
NEW
97
                role,
×
NEW
98
                heading: None,
×
NEW
99
                text: text.into(),
×
NEW
100
                sentences: vec![],
×
NEW
101
            }],
×
NEW
102
            meta_prefix: None,
×
NEW
103
        }
×
NEW
104
    }
×
105
}
106

107
/// A structured section within a record.
108
#[derive(Clone, Debug, Serialize, Deserialize)]
109
pub struct RecordSection {
110
    /// Semantic role used by selectors (for example, anchor vs context text).
111
    pub role: SectionRole,
112
    /// Optional short heading/title for this section.
113
    pub heading: Option<String>,
114
    /// Full section text.
115
    pub text: String,
116
    /// Sentence-level segmentation of `text` used by chunking strategies.
117
    pub sentences: Vec<Sentence>,
118
}
119

120
/// Role label for a section.
121
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)]
122
pub enum SectionRole {
123
    /// Primary section typically used as an anchor candidate.
124
    Anchor,
125
    /// Supporting/context section used for positives, negatives, or text samples.
126
    Context,
127
}
128

129
/// A chunked view over a section.
130
#[derive(Clone, Debug, Serialize, Deserialize)]
131
pub struct RecordChunk {
132
    /// Parent record id this chunk belongs to.
133
    pub record_id: RecordId,
134
    /// Index of the source section in `DataRecord.sections`.
135
    pub section_idx: usize,
136
    /// Chunk view metadata (window position or summary fallback).
137
    pub view: ChunkView,
138
    /// Rendered chunk text (possibly with metadata prefix decoration).
139
    pub text: String,
140
    /// Approximate token count for scheduling/weighting heuristics.
141
    pub tokens_estimate: usize,
142
    /// Trust/quality inherited from the parent record.
143
    pub quality: QualityScore,
144
    /// All KVP metadata defined on the source record's `meta_prefix`, exposed for
145
    /// downstream inspection and debugging. Contains every key with all its possible
146
    /// values across all variants — unaffected by presence probability, dropout, or
147
    /// which variant was sampled into this chunk's text.
148
    ///
149
    /// Populated unconditionally by the sampler during chunk decoration. Empty when the
150
    /// record has no `meta_prefix` configured.
151
    #[serde(default, skip_serializing_if = "HashMap::is_empty")]
152
    pub kvp_meta: HashMap<String, Vec<String>>,
153
}
154

155
/// Chunk view metadata (window or summary).
156
#[derive(Clone, Debug, Serialize, Deserialize)]
157
pub enum ChunkView {
158
    /// Sliding-window chunk extracted directly from section text.
159
    Window {
160
        /// Zero-based window index within the section.
161
        index: usize,
162
        /// Overlap (in tokens) with the previous window.
163
        overlap: usize,
164
        /// Nominal window span in tokens.
165
        span: usize,
166
    },
167
    /// Summary fallback chunk used when window extraction is unavailable.
168
    SummaryFallback {
169
        /// Name of summary strategy that produced this fallback chunk.
170
        strategy: String,
171
        /// Precomputed base weight for summary-fallback chunks before trust/floor are applied.
172
        weight: f32,
173
    },
174
}
175

176
/// Sample pair (positive/negative) derived from a triplet.
177
#[derive(Clone, Debug, Serialize, Deserialize)]
178
pub struct SamplePair {
179
    /// Recipe name used to generate this pair.
180
    pub recipe: String,
181
    /// Anchor chunk used to build this supervised pair.
182
    pub anchor: RecordChunk,
183
    /// Candidate chunk paired with the anchor.
184
    pub positive: RecordChunk,
185
    /// Training weight for this pair.
186
    pub weight: f32,
187
    /// Optional instruction/prompt hint for this sample.
188
    pub instruction: Option<String>,
189
    /// Supervision label (positive or negative).
190
    pub label: PairLabel,
191
    /// Optional reason/annotation describing the label.
192
    pub reason: Option<String>,
193
}
194

195
/// Sample triplet (anchor/positive/negative).
196
#[derive(Clone, Debug, Serialize, Deserialize)]
197
pub struct SampleTriplet {
198
    /// Recipe name used to generate this triplet.
199
    pub recipe: String,
200
    /// Anchor chunk.
201
    pub anchor: RecordChunk,
202
    /// Positive chunk.
203
    pub positive: RecordChunk,
204
    /// Negative chunk.
205
    pub negative: RecordChunk,
206
    /// Training weight for this triplet.
207
    pub weight: f32,
208
    /// Optional instruction/prompt hint for this sample.
209
    pub instruction: Option<String>,
210
}
211

212
/// Pair label for supervised pair batches.
213
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)]
214
pub enum PairLabel {
215
    /// Anchor and candidate are semantically aligned.
216
    Positive,
217
    /// Anchor and candidate are semantically mismatched.
218
    Negative,
219
}
220

221
/// Batch of pairs.
222
#[derive(Clone, Debug, Serialize, Deserialize)]
223
pub struct SampleBatch {
224
    /// Pair samples contained in this batch.
225
    pub pairs: Vec<SamplePair>,
226
}
227

228
impl SampleBatch {
229
    /// Returns `true` when the batch has no pairs.
230
    pub fn is_empty(&self) -> bool {
4✔
231
        self.pairs.is_empty()
4✔
232
    }
4✔
233
}
234

235
/// Batch of triplets.
236
#[derive(Clone, Debug, Serialize, Deserialize)]
237
pub struct TripletBatch {
238
    /// Triplet samples contained in this batch.
239
    pub triplets: Vec<SampleTriplet>,
240
}
241

242
impl TripletBatch {
243
    /// Returns `true` when the batch has no triplets.
244
    pub fn is_empty(&self) -> bool {
3✔
245
        self.triplets.is_empty()
3✔
246
    }
3✔
247
}
248

249
/// A single text sample (chunk + weight).
250
#[derive(Clone, Debug, Serialize, Deserialize)]
251
pub struct TextSample {
252
    /// Recipe name used to generate this sample.
253
    pub recipe: String,
254
    /// Chunk payload used for this text sample.
255
    pub chunk: RecordChunk,
256
    /// Training weight for this sample.
257
    pub weight: f32,
258
    /// Optional instruction/prompt hint for this sample.
259
    pub instruction: Option<String>,
260
}
261

262
/// Batch of text samples.
263
#[derive(Clone, Debug, Serialize, Deserialize)]
264
pub struct TextBatch {
265
    /// Text samples contained in this batch.
266
    pub samples: Vec<TextSample>,
267
}
268

269
impl TextBatch {
270
    /// Returns `true` when the batch has no text samples.
271
    pub fn is_empty(&self) -> bool {
5✔
272
        self.samples.is_empty()
5✔
273
    }
5✔
274
}
275

276
#[cfg(test)]
mod tests {
    use super::*;
    use chrono::{TimeZone, Utc};

    /// Builds a minimal summary-fallback chunk whose `record_id` is `id`.
    fn sample_chunk(id: &str) -> RecordChunk {
        RecordChunk {
            record_id: id.to_string(),
            section_idx: 0,
            view: ChunkView::SummaryFallback {
                strategy: "test".to_string(),
                weight: 1.0,
            },
            text: "text".to_string(),
            tokens_estimate: 4,
            quality: QualityScore::default(),
            kvp_meta: Default::default(),
        }
    }

    #[test]
    fn quality_score_defaults_to_medium_trust() {
        let quality = QualityScore::default();
        assert!((quality.trust - 0.5).abs() < f32::EPSILON);
    }

    #[test]
    fn batch_is_empty_helpers_match_contents() {
        let empty_pairs = SampleBatch { pairs: Vec::new() };
        assert!(empty_pairs.is_empty());

        let non_empty_pairs = SampleBatch {
            pairs: vec![SamplePair {
                recipe: "r".to_string(),
                anchor: sample_chunk("a"),
                positive: sample_chunk("b"),
                weight: 1.0,
                instruction: None,
                label: PairLabel::Positive,
                reason: Some("test".to_string()),
            }],
        };
        assert!(!non_empty_pairs.is_empty());

        let empty_triplets = TripletBatch {
            triplets: Vec::new(),
        };
        assert!(empty_triplets.is_empty());

        let non_empty_triplets = TripletBatch {
            triplets: vec![SampleTriplet {
                recipe: "r".to_string(),
                anchor: sample_chunk("a"),
                positive: sample_chunk("b"),
                negative: sample_chunk("c"),
                weight: 1.0,
                instruction: Some("hint".to_string()),
            }],
        };
        assert!(!non_empty_triplets.is_empty());

        let empty_text = TextBatch {
            samples: Vec::new(),
        };
        assert!(empty_text.is_empty());

        let non_empty_text = TextBatch {
            samples: vec![TextSample {
                recipe: "r".to_string(),
                chunk: sample_chunk("t"),
                weight: 1.0,
                instruction: None,
            }],
        };
        assert!(!non_empty_text.is_empty());
    }

    #[test]
    fn data_record_roundtrip_basics_are_constructible() {
        let now = Utc.with_ymd_and_hms(2025, 1, 1, 0, 0, 0).unwrap();
        let record = DataRecord {
            id: "source_a::1".to_string(),
            source: "source_a".to_string(),
            created_at: now,
            updated_at: now,
            quality: QualityScore { trust: 0.9 },
            taxonomy: vec!["topic:news".to_string()],
            sections: vec![RecordSection {
                role: SectionRole::Anchor,
                heading: Some("headline".to_string()),
                text: "body".to_string(),
                sentences: vec!["body".to_string()],
            }],
            meta_prefix: None,
        };

        assert_eq!(record.source, "source_a");
        assert_eq!(record.sections.len(), 1);
        assert!(matches!(record.sections[0].role, SectionRole::Anchor));
    }

    // The doc examples exercise the two constructors below, but doctests are not counted
    // by the coverage run, so cover them with unit tests as well.

    #[test]
    fn from_text_builds_single_context_section_with_defaults() {
        let record = DataRecord::from_text("doc-0", "my_corpus", "The quick brown fox.");

        // `id` and `source` are stored independently.
        assert_eq!(record.id, "doc-0");
        assert_eq!(record.source, "my_corpus");

        assert_eq!(record.created_at, record.updated_at);
        assert!((record.quality.trust - QualityScore::default().trust).abs() < f32::EPSILON);
        assert!(record.taxonomy.is_empty());
        assert!(record.meta_prefix.is_none());

        assert_eq!(record.sections.len(), 1);
        let section = &record.sections[0];
        assert_eq!(section.role, SectionRole::Context);
        assert!(section.heading.is_none());
        assert_eq!(section.text, "The quick brown fox.");
        assert!(section.sentences.is_empty());
    }

    #[test]
    fn from_text_with_role_uses_requested_role() {
        let record = DataRecord::from_text_with_role(
            "doc-1",
            "my_corpus",
            "What is the capital of France?",
            SectionRole::Anchor,
        );

        assert_eq!(record.id, "doc-1");
        assert_eq!(record.source, "my_corpus");
        assert_eq!(record.sections.len(), 1);
        assert_eq!(record.sections[0].role, SectionRole::Anchor);
        assert_eq!(record.sections[0].text, "What is the capital of France?");
    }
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc