• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

vigna / webgraph-rs / 22098292023

17 Feb 2026 12:22PM UTC coverage: 72.228% (+0.04%) from 72.192%
22098292023

push

github

vigna
Documentation for LoadConfig

6052 of 8379 relevant lines covered (72.23%)

49854014.42 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

80.77
/webgraph/src/utils/batch_codec/gaps.rs
1
/*
2
 * SPDX-FileCopyrightText: 2023 Inria
3
 * SPDX-FileCopyrightText: 2023 Sebastiano Vigna
4
 * SPDX-FileCopyrightText: 2025 Tommaso Fontana
5
 *
6
 * SPDX-License-Identifier: Apache-2.0 OR LGPL-2.1-or-later
7
 */
8

9
use super::{BitReader, BitWriter};
10
use crate::traits::SortedIterator;
11
use crate::utils::{ArcMmapHelper, MmapHelper, Triple};
12
use crate::{
13
    traits::{BitDeserializer, BitSerializer},
14
    utils::{BatchCodec, humanize},
15
};
16

17
use std::sync::Arc;
18

19
use anyhow::{Context, Result};
20
use dsi_bitstream::prelude::*;
21
use mmap_rs::MmapFlags;
22
use rdst::*;
23

24
#[derive(Clone, Debug)]
25
/// A codec for encoding and decoding batches of triples using gap compression.
26
///
27
/// This codec encodes triples of the form `(src, dst, label)` by encoding the
28
/// gaps between consecutive sources and destinations using a specified code.
29
///
30
/// # Type Parameters
31
///
32
/// - `S`: Serializer for the labels, implementing [`BitSerializer`] for the label type.
33
/// - `D`: Deserializer for the labels, implementing [`BitDeserializer`] for the label type.
34
/// - `SRC_CODE`: Code used for encoding source gaps (default: gamma).
35
/// - `DST_CODE`: Code used for encoding destination gaps (default: gamma).
36
///
37
/// # Encoding Format
38
///
39
/// 1. The batch length is written using delta coding.
40
/// 2. For each group of triples with the same source:
41
///     - The gap from the previous source is encoded.
42
///     - The gap from the previous destination is encoded.
43
///     - The label is serialized.
44
///
45
/// The bit deserializer must be [`Clone`] because we need one for each
46
/// [`GapsIter`], and there are possible scenarios in which the
47
/// deserializer might be stateful.
48
pub struct GapsCodec<
49
    E: Endianness = NE,
50
    S: BitSerializer<E, BitWriter<E>> = (),
51
    D: BitDeserializer<E, BitReader<E>, DeserType = S::SerType> + Clone = (),
52
    const SRC_CODE: usize = { dsi_bitstream::dispatch::code_consts::GAMMA },
53
    const DST_CODE: usize = { dsi_bitstream::dispatch::code_consts::DELTA },
54
> where
55
    BitReader<E>: BitRead<E> + CodesRead<E>,
56
    BitWriter<E>: BitWrite<E> + CodesWrite<E>,
57
{
58
    /// Serializer for the labels
59
    pub serializer: S,
60
    /// Deserializer for the labels
61
    pub deserializer: D,
62
    /// Marker for the endianness
63
    pub _marker: std::marker::PhantomData<E>,
64
}
65

66
impl<E, S, D, const SRC_CODE: usize, const DST_CODE: usize> GapsCodec<E, S, D, SRC_CODE, DST_CODE>
67
where
68
    E: Endianness,
69
    S: BitSerializer<E, BitWriter<E>> + Send + Sync,
70
    D: BitDeserializer<E, BitReader<E>, DeserType = S::SerType> + Send + Sync + Clone,
71
    BitReader<E>: BitRead<E> + CodesRead<E>,
72
    BitWriter<E>: BitWrite<E> + CodesWrite<E>,
73
{
74
    /// Creates a new `GapsCodec` with the given serializer and deserializer.
75
    pub fn new(serializer: S, deserializer: D) -> Self {
7✔
76
        Self {
77
            serializer,
78
            deserializer,
79
            _marker: std::marker::PhantomData,
80
        }
81
    }
82
}
83

84
impl<E, S: Default, D: Default, const SRC_CODE: usize, const DST_CODE: usize> core::default::Default
85
    for GapsCodec<E, S, D, SRC_CODE, DST_CODE>
86
where
87
    E: Endianness,
88
    S: BitSerializer<E, BitWriter<E>> + Send + Sync,
89
    D: BitDeserializer<E, BitReader<E>, DeserType = S::SerType> + Send + Sync + Clone,
90
    BitReader<E>: BitRead<E> + CodesRead<E>,
91
    BitWriter<E>: BitWrite<E> + CodesWrite<E>,
92
{
93
    fn default() -> Self {
5✔
94
        Self::new(Default::default(), Default::default())
15✔
95
    }
96
}
97

98
#[derive(Debug, Clone, Copy)]
99
/// Statistics about the encoding performed by [`GapsCodec`].
100
pub struct GapsStats {
101
    /// Total number of triples encoded
102
    pub total_triples: usize,
103
    /// Number of bits used for source gaps
104
    pub src_bits: usize,
105
    /// Number of bits used for destination gaps
106
    pub dst_bits: usize,
107
    /// Number of bits used for labels
108
    pub labels_bits: usize,
109
}
110

111
impl core::fmt::Display for GapsStats {
112
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
×
113
        let total_bits = self.src_bits + self.dst_bits + self.labels_bits;
×
114
        write!(
×
115
            f,
×
116
            "src: {}B ({:.3} bits / arc), dst: {}B ({:.3} bits / arc), labels: {}B ({:.3} bits / arc), total: {}B ({:.3} bits / arc)",
117
            humanize(self.src_bits as f64 / 8.0),
×
118
            self.src_bits as f64 / self.total_triples as f64,
×
119
            humanize(self.dst_bits as f64 / 8.0),
×
120
            self.dst_bits as f64 / self.total_triples as f64,
×
121
            humanize(self.labels_bits as f64 / 8.0),
×
122
            self.labels_bits as f64 / self.total_triples as f64,
×
123
            humanize(total_bits as f64 / 8.0),
×
124
            total_bits as f64 / self.total_triples as f64,
×
125
        )
126
    }
127
}
128

129
impl<E, S, D, const SRC_CODE: usize, const DST_CODE: usize> BatchCodec
130
    for GapsCodec<E, S, D, SRC_CODE, DST_CODE>
131
where
132
    E: Endianness,
133
    S: BitSerializer<E, BitWriter<E>> + Send + Sync,
134
    D: BitDeserializer<E, BitReader<E>, DeserType = S::SerType> + Send + Sync + Clone,
135
    S::SerType: Send + Sync + Copy + 'static + core::fmt::Debug, // needed by radix sort
136
    BitReader<E>: BitRead<E> + CodesRead<E>,
137
    BitWriter<E>: BitWrite<E> + CodesWrite<E>,
138
{
139
    type Label = S::SerType;
140
    type DecodedBatch = GapsIter<E, D, SRC_CODE, DST_CODE>;
141
    type EncodedBatchStats = GapsStats;
142

143
    fn encode_batch(
11✔
144
        &self,
145
        path: impl AsRef<std::path::Path>,
146
        batch: &mut [((usize, usize), Self::Label)],
147
    ) -> Result<(usize, Self::EncodedBatchStats)> {
148
        let start = std::time::Instant::now();
22✔
149
        Triple::cast_batch_mut(batch).radix_sort_unstable();
22✔
150
        log::debug!("Sorted {} arcs in {:?}", batch.len(), start.elapsed());
11✔
151
        self.encode_sorted_batch(path, batch)
44✔
152
    }
153

154
    fn encode_sorted_batch(
12✔
155
        &self,
156
        path: impl AsRef<std::path::Path>,
157
        batch: &[((usize, usize), Self::Label)],
158
    ) -> Result<(usize, Self::EncodedBatchStats)> {
159
        debug_assert!(Triple::cast_batch(batch).is_sorted());
36✔
160
        // create a bitstream to write to the file
161
        let file_path = path.as_ref();
36✔
162
        let mut stream = buf_bit_writer::from_path::<E, usize>(file_path).with_context(|| {
48✔
163
            format!(
×
164
                "Could not create BatchIterator temporary file {}",
×
165
                file_path.display()
×
166
            )
167
        })?;
168

169
        // prefix the stream with the length of the batch
170
        // we use a delta code since it'll be a big number most of the time
171
        stream
12✔
172
            .write_delta(batch.len() as u64)
24✔
173
            .context("Could not write length")?;
174

175
        let mut stats = GapsStats {
176
            total_triples: batch.len(),
12✔
177
            src_bits: 0,
178
            dst_bits: 0,
179
            labels_bits: 0,
180
        };
181
        // Dump the triples to the bitstream
182
        let (mut prev_src, mut prev_dst) = (0, 0);
36✔
183
        for ((src, dst), label) in batch.iter() {
180✔
184
            // write the source gap as gamma
185
            stats.src_bits += ConstCode::<SRC_CODE>
52✔
186
                .write(&mut stream, (src - prev_src) as u64)
156✔
187
                .with_context(|| format!("Could not write {src} after {prev_src}"))?;
52✔
188
            if *src != prev_src {
94✔
189
                // Reset prev_y
190
                prev_dst = 0;
42✔
191
            }
192
            // write the destination gap as gamma
193
            stats.dst_bits += ConstCode::<DST_CODE>
52✔
194
                .write(&mut stream, (dst - prev_dst) as u64)
156✔
195
                .with_context(|| format!("Could not write {dst} after {prev_dst}"))?;
52✔
196
            // write the label
197
            stats.labels_bits += self
52✔
198
                .serializer
52✔
199
                .serialize(label, &mut stream)
156✔
200
                .context("Could not serialize label")?;
52✔
201
            (prev_src, prev_dst) = (*src, *dst);
156✔
202
        }
203
        // flush the stream and reset the buffer
204
        stream.flush().context("Could not flush stream")?;
36✔
205

206
        let total_bits = stats.src_bits + stats.dst_bits + stats.labels_bits;
24✔
207
        Ok((total_bits, stats))
12✔
208
    }
209

210
    fn decode_batch(&self, path: impl AsRef<std::path::Path>) -> Result<Self::DecodedBatch> {
12✔
211
        // open the file
212
        let mut stream = <BufBitReader<E, _>>::new(MemWordReader::new(ArcMmapHelper(Arc::new(
48✔
213
            MmapHelper::mmap(
12✔
214
                path.as_ref(),
24✔
215
                MmapFlags::TRANSPARENT_HUGE_PAGES | MmapFlags::SEQUENTIAL,
12✔
216
            )
217
            .with_context(|| format!("Could not mmap {}", path.as_ref().display()))?,
12✔
218
        ))));
219

220
        // read the length of the batch (first value in the stream)
221
        let len = stream.read_delta().context("Could not read length")? as usize;
48✔
222

223
        // create the iterator
224
        Ok(GapsIter {
12✔
225
            deserializer: self.deserializer.clone(),
36✔
226
            stream,
12✔
227
            len,
12✔
228
            current: 0,
12✔
229
            prev_src: 0,
12✔
230
            prev_dst: 0,
12✔
231
        })
232
    }
233
}
234

235
#[derive(Clone, Debug)]
236
/// An iterator over triples encoded with gaps, this is returned by [`GapsCodec`].
237
pub struct GapsIter<
238
    E: Endianness = NE,
239
    D: BitDeserializer<E, BitReader<E>> = (),
240
    const SRC_CODE: usize = { dsi_bitstream::dispatch::code_consts::GAMMA },
241
    const DST_CODE: usize = { dsi_bitstream::dispatch::code_consts::GAMMA },
242
> where
243
    BitReader<E>: BitRead<E> + CodesRead<E>,
244
    BitWriter<E>: BitWrite<E> + CodesWrite<E>,
245
{
246
    /// Deserializer for the labels
247
    deserializer: D,
248
    /// Bitstream to read from
249
    stream: BitReader<E>,
250
    /// Length of the iterator (number of triples)
251
    len: usize,
252
    /// Current position in the iterator
253
    current: usize,
254
    /// Previous source node
255
    prev_src: usize,
256
    /// Previous destination node
257
    prev_dst: usize,
258
}
259

260
unsafe impl<
261
    E: Endianness,
262
    D: BitDeserializer<E, BitReader<E>>,
263
    const SRC_CODE: usize,
264
    const DST_CODE: usize,
265
> SortedIterator for GapsIter<E, D, SRC_CODE, DST_CODE>
266
where
267
    BitReader<E>: BitRead<E> + CodesRead<E>,
268
    BitWriter<E>: BitWrite<E> + CodesWrite<E>,
269
{
270
}
271

272
impl<
273
    E: Endianness,
274
    D: BitDeserializer<E, BitReader<E>>,
275
    const SRC_CODE: usize,
276
    const DST_CODE: usize,
277
> Iterator for GapsIter<E, D, SRC_CODE, DST_CODE>
278
where
279
    BitReader<E>: BitRead<E> + CodesRead<E>,
280
    BitWriter<E>: BitWrite<E> + CodesWrite<E>,
281
{
282
    type Item = ((usize, usize), D::DeserType);
283

284
    fn next(&mut self) -> Option<Self::Item> {
85✔
285
        if self.current >= self.len {
85✔
286
            return None;
14✔
287
        }
288
        let src_gap = ConstCode::<SRC_CODE>.read(&mut self.stream).ok()?;
355✔
289
        let dst_gap = ConstCode::<DST_CODE>.read(&mut self.stream).ok()?;
355✔
290
        let label = self.deserializer.deserialize(&mut self.stream).ok()?;
355✔
291
        self.prev_src += src_gap as usize;
71✔
292
        if src_gap != 0 {
133✔
293
            self.prev_dst = 0;
62✔
294
        }
295
        self.prev_dst += dst_gap as usize;
71✔
296
        self.current += 1;
71✔
297
        Some(((self.prev_src, self.prev_dst), label))
71✔
298
    }
299

300
    fn size_hint(&self) -> (usize, Option<usize>) {
2✔
301
        (self.len(), Some(self.len()))
6✔
302
    }
303
}
304

305
impl<
306
    E: Endianness,
307
    D: BitDeserializer<E, BitReader<E>>,
308
    const SRC_CODE: usize,
309
    const DST_CODE: usize,
310
> ExactSizeIterator for GapsIter<E, D, SRC_CODE, DST_CODE>
311
where
312
    BitReader<E>: BitRead<E> + CodesRead<E>,
313
    BitWriter<E>: BitWrite<E> + CodesWrite<E>,
314
{
315
    fn len(&self) -> usize {
5✔
316
        self.len - self.current
5✔
317
    }
318
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc