• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

vigna / webgraph-rs / 18429665353

11 Oct 2025 12:54PM UTC coverage: 47.997% (+0.2%) from 47.835%
18429665353

push

github

zommiommy
Introduce BatchCodec to substitute BatchIterator

104 of 226 new or added lines in 9 files covered. (46.02%)

7 existing lines in 3 files now uncovered.

4001 of 8336 relevant lines covered (48.0%)

21929200.14 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

57.81
/webgraph/src/utils/batch_codec/gaps.rs
1
/*
2
 * SPDX-FileCopyrightText: 2023 Inria
3
 * SPDX-FileCopyrightText: 2023 Sebastiano Vigna
4
 * SPDX-FileCopyrightText: 2025 Tommaso Fontana
5
 *
6
 * SPDX-License-Identifier: Apache-2.0 OR LGPL-2.1-or-later
7
 */
8

9
use super::{BitReader, BitWriter};
10
use crate::traits::SortedIterator;
11
use crate::utils::{ArcMmapHelper, MmapHelper, Triple};
12
use crate::{
13
    traits::{BitDeserializer, BitSerializer},
14
    utils::BatchCodec,
15
};
16

17
use std::sync::Arc;
18

19
use anyhow::{Context, Result};
20
use dsi_bitstream::prelude::*;
21
use mmap_rs::MmapFlags;
22
use rdst::*;
23

24
#[derive(Clone, Debug, Default)]
25
/// A codec for encoding and decoding batches of triples using gap compression.
26
///
27
/// This codec encodes triples of the form `(src, dst, label)` by encoding the
28
/// gaps between consecutive sources and destinations using a specified code.
29
///
30
/// ## Type Parameters
31
/// - `S`: Serializer for the labels, implementing [`BitSerializer`] for the label type.
32
/// - `D`: Deserializer for the labels, implementing [`BitDeserializer`] for the label type.
33
/// - `SRC_CODE`: Code used for encoding source gaps (default: gamma).
34
/// - `DST_CODE`: Code used for encoding destination gaps (default: gamma).
35
///
36
/// ## Fields
37
/// - `serializer`: The label serializer.
38
/// - `deserializer`: The label deserializer.
39
///
40
/// ## Encoding Format
41
/// 1. The batch length is written using delta coding.
42
/// 2. For each group of triples with the same source:
43
///     - The gap from the previous source is encoded.
44
///     - The gap from the previous destination is encoded.
45
///     - The label is serialized.
46
///
47
/// The bit deserializer must be [`Clone`] because we need one for each
48
/// [`GapsIterator`], and there are possible scenarios in which the
49
/// deserializer might be stateful.
50
///
51
/// ## Choosing the codes
52
///
53
/// These are the top 10 codes for src and dst gaps when transposing `enwiki-2024`.
54
/// ```ignore
55
/// Src codes:
56
///   Code: Unary        Size: 179553432
57
///   Code: Golomb(1)    Size: 179553432
58
///   Code: Rice(0)      Size: 179553432
59
///   Code: Gamma        Size: 185374984
60
///   Code: Zeta(1)      Size: 185374984
61
///   Code: ExpGolomb(0) Size: 185374984
62
///   Code: Omega        Size: 185439656
63
///   Code: Delta        Size: 191544794
64
///   Code: Golomb(2)    Size: 345986198
65
///   Code: Rice(1)      Size: 345986198
66
/// Dst codes:
67
///   Code: Pi(2)   Size: 2063880685
68
///   Code: Pi(3)   Size: 2074138948
69
///   Code: Zeta(3) Size: 2122730298
70
///   Code: Zeta(4) Size: 2123948774
71
///   Code: Zeta(5) Size: 2169131998
72
///   Code: Pi(4)   Size: 2176097847
73
///   Code: Zeta(2) Size: 2226573622
74
///   Code: Zeta(6) Size: 2237680403
75
///   Code: Delta   Size: 2272691460
76
///   Code: Zeta(7) Size: 2305354857
77
/// ```
78
///
79
/// So the best combination is `Unary` for src gaps and `Pi(2)` for dst gaps.
80
/// But, `Unary` can behave poorly if the distribution of your data changes,
81
/// therefore the recommended default is `Gamma` for src gaps and `Delta` for
82
/// dst gaps as they are universal codes.
83
pub struct GapsCodec<
84
    S: BitSerializer<NE, BitWriter> = (),
85
    D: BitDeserializer<NE, BitReader, DeserType = S::SerType> + Clone = (),
86
    const SRC_CODE: usize = { dsi_bitstream::dispatch::code_consts::GAMMA },
87
    const DST_CODE: usize = { dsi_bitstream::dispatch::code_consts::DELTA },
88
> {
89
    /// Serializer for the labels
90
    pub serializer: S,
91
    /// Deserializer for the labels
92
    pub deserializer: D,
93
}
94

95
impl<S, D, const SRC_CODE: usize, const DST_CODE: usize> BatchCodec
96
    for GapsCodec<S, D, SRC_CODE, DST_CODE>
97
where
98
    S: BitSerializer<NE, BitWriter> + Send + Sync,
99
    D: BitDeserializer<NE, BitReader, DeserType = S::SerType> + Send + Sync + Clone,
100
    S::SerType: Send + Sync + Copy + 'static + core::fmt::Debug, // needed by radix sort
101
{
102
    type Label = S::SerType;
103
    type DecodedBatch = GapsIterator<D, SRC_CODE, DST_CODE>;
104

105
    fn encode_batch(
8✔
106
        &self,
107
        path: impl AsRef<std::path::Path>,
108
        batch: &mut [(usize, usize, Self::Label)],
109
    ) -> Result<usize> {
110
        let start = std::time::Instant::now();
16✔
111
        Triple::cast_batch_mut(batch).radix_sort_unstable();
16✔
112
        log::debug!("Sorted {} arcs in {:?}", batch.len(), start.elapsed());
8✔
113
        self.encode_sorted_batch(path, batch)
32✔
114
    }
115

116
    fn encode_sorted_batch(
8✔
117
        &self,
118
        path: impl AsRef<std::path::Path>,
119
        batch: &[(usize, usize, Self::Label)],
120
    ) -> Result<usize> {
121
        debug_assert!(Triple::cast_batch(batch).is_sorted());
24✔
122
        // create a batch file where to dump
123
        let file_path = path.as_ref();
24✔
124
        let file = std::io::BufWriter::with_capacity(
125
            1 << 16,
8✔
126
            std::fs::File::create(file_path).with_context(|| {
24✔
NEW
127
                format!(
×
NEW
128
                    "Could not create BatchIterator temporary file {}",
×
NEW
129
                    file_path.display()
×
130
                )
131
            })?,
132
        );
133
        // create a bitstream to write to the file
NEW
134
        let mut stream = <BufBitWriter<NE, _>>::new(<WordAdapter<usize, _>>::new(file));
×
135

136
        // prefix the stream with the length of the batch
137
        // we use a delta code since it'll be a big number most of the time
NEW
138
        stream
×
NEW
139
            .write_delta(batch.len() as u64)
×
140
            .context("Could not write length")?;
141

142
        // dump the triples to the bitstream
143
        let (mut prev_src, mut prev_dst) = (0, 0);
8✔
NEW
144
        let mut written_bits = 0;
×
145
        for (src, dst, label) in batch.iter() {
120✔
146
            // write the source gap as gamma
147
            written_bits += ConstCode::<SRC_CODE>
40✔
148
                .write(&mut stream, (src - prev_src) as u64)
120✔
149
                .with_context(|| format!("Could not write {src} after {prev_src}"))?;
40✔
150
            if *src != prev_src {
74✔
151
                // Reset prev_y
152
                prev_dst = 0;
34✔
153
            }
154
            // write the destination gap as gamma
NEW
155
            written_bits += ConstCode::<DST_CODE>
×
NEW
156
                .write(&mut stream, (dst - prev_dst) as u64)
×
NEW
157
                .with_context(|| format!("Could not write {dst} after {prev_dst}"))?;
×
158
            // write the label
159
            written_bits += self
40✔
160
                .serializer
40✔
NEW
161
                .serialize(label, &mut stream)
×
NEW
162
                .context("Could not serialize label")?;
×
163
            (prev_src, prev_dst) = (*src, *dst);
40✔
164
        }
165
        // flush the stream and reset the buffer
166
        written_bits += stream.flush().context("Could not flush stream")?;
8✔
167

168
        Ok(written_bits)
8✔
169
    }
170

171
    fn decode_batch(&self, path: impl AsRef<std::path::Path>) -> Result<Self::DecodedBatch> {
8✔
172
        // open the file
173
        let mut stream = <BufBitReader<NE, _>>::new(MemWordReader::new(ArcMmapHelper(Arc::new(
8✔
174
            MmapHelper::mmap(
8✔
175
                path.as_ref(),
16✔
176
                MmapFlags::TRANSPARENT_HUGE_PAGES | MmapFlags::SEQUENTIAL,
8✔
177
            )
178
            .with_context(|| format!("Could not mmap {}", path.as_ref().display()))?,
8✔
179
        ))));
180

181
        // read the length of the batch (first value in the stream)
182
        let len = stream.read_delta().context("Could not read length")? as usize;
8✔
183

184
        // create the iterator
NEW
185
        Ok(GapsIterator {
×
NEW
186
            deserializer: self.deserializer.clone(),
×
NEW
187
            stream,
×
NEW
188
            len,
×
NEW
189
            current: 0,
×
NEW
190
            prev_src: 0,
×
NEW
191
            prev_dst: 0,
×
192
        })
193
    }
194
}
195

196
#[derive(Clone, Debug)]
197
/// An iterator over triples encoded with gaps, this is returned by [`GapsCodec`].
198
pub struct GapsIterator<
199
    D: BitDeserializer<NE, BitReader> = (),
200
    const SRC_CODE: usize = { dsi_bitstream::dispatch::code_consts::GAMMA },
201
    const DST_CODE: usize = { dsi_bitstream::dispatch::code_consts::GAMMA },
202
> {
203
    /// Deserializer for the labels
204
    deserializer: D,
205
    /// Bitstream to read from
206
    stream: BitReader,
207
    /// Length of the iterator (number of triples)
208
    len: usize,
209
    /// Current position in the iterator
210
    current: usize,
211
    /// Previous source node
212
    prev_src: usize,
213
    /// Previous destination node
214
    prev_dst: usize,
215
}
216

217
unsafe impl<D: BitDeserializer<NE, BitReader>, const SRC_CODE: usize, const DST_CODE: usize>
218
    SortedIterator for GapsIterator<D, SRC_CODE, DST_CODE>
219
{
220
}
221

222
impl<D: BitDeserializer<NE, BitReader>, const SRC_CODE: usize, const DST_CODE: usize> Iterator
223
    for GapsIterator<D, SRC_CODE, DST_CODE>
224
{
225
    type Item = (usize, usize, D::DeserType);
226

227
    fn next(&mut self) -> Option<Self::Item> {
73✔
228
        if self.current >= self.len {
73✔
229
            return None;
11✔
230
        }
231
        let src_gap = ConstCode::<SRC_CODE>.read(&mut self.stream).ok()?;
62✔
232
        let dst_gap = ConstCode::<DST_CODE>.read(&mut self.stream).ok()?;
62✔
233
        let label = self.deserializer.deserialize(&mut self.stream).ok()?;
62✔
NEW
234
        self.prev_src += src_gap as usize;
×
235
        if src_gap != 0 {
56✔
236
            self.prev_dst = 0;
56✔
237
        }
NEW
238
        self.prev_dst += dst_gap as usize;
×
NEW
239
        self.current += 1;
×
NEW
240
        Some((self.prev_src, self.prev_dst, label))
×
241
    }
242

NEW
243
    fn size_hint(&self) -> (usize, Option<usize>) {
×
NEW
244
        (self.len(), Some(self.len()))
×
245
    }
246
}
247

248
impl<D: BitDeserializer<NE, BitReader>, const SRC_CODE: usize, const DST_CODE: usize>
249
    ExactSizeIterator for GapsIterator<D, SRC_CODE, DST_CODE>
250
{
NEW
251
    fn len(&self) -> usize {
×
NEW
252
        self.len - self.current
×
253
    }
254
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc