• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

vigna / webgraph-rs / 14130263389

28 Mar 2025 01:40PM UTC coverage: 49.654% (-0.1%) from 49.798%
14130263389

push

github

zommiommy
fixed llp wrong combine types

22 of 41 new or added lines in 4 files covered. (53.66%)

1019 existing lines in 41 files now uncovered.

2437 of 4908 relevant lines covered (49.65%)

18919274.03 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

45.71
/src/cli/mod.rs
1
/*
2
 * SPDX-FileCopyrightText: 2023 Inria
3
 * SPDX-FileCopyrightText: 2023 Tommaso Fontana
4
 *
5
 * SPDX-License-Identifier: Apache-2.0 OR LGPL-2.1-or-later
6
 */
7

8
//! Command-line interface structs, functions, and methods.
9
//!
10
//! Each module corresponds to a group of commands, and each command is
11
//! implemented as a submodule.
12

13
use crate::prelude::CompFlags;
14
use crate::{build_info, utils::Granularity};
15
use anyhow::{anyhow, bail, ensure, Context, Result};
16
use clap::{Arg, Args, Command, ValueEnum};
17
use common_traits::UnsignedInt;
18
use dsi_bitstream::dispatch::Codes;
19
use jiff::fmt::friendly::{Designator, Spacing, SpanPrinter};
20
use jiff::SpanRound;
21
use std::io::Write;
22
use std::path::{Path, PathBuf};
23
use std::time::Duration;
24
use std::time::SystemTime;
25
use sysinfo::System;
26

27
pub mod analyze;
28
pub mod bench;
29
pub mod build;
30
pub mod check;
31
pub mod from;
32
pub mod perm;
33
pub mod run;
34
pub mod to;
35
pub mod transform;
36

37
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, ValueEnum)]
38
/// Enum for instantaneous codes.
39
///
40
/// It is used to implement [`ValueEnum`] here instead of in [`dsi_bitstream`].
41
pub enum PrivCode {
42
    Unary,
43
    Gamma,
44
    Delta,
45
    Zeta1,
46
    Zeta2,
47
    Zeta3,
48
    Zeta4,
49
    Zeta5,
50
    Zeta6,
51
    Zeta7,
52
}
53

54
impl From<PrivCode> for Codes {
55
    fn from(value: PrivCode) -> Self {
20✔
56
        match value {
20✔
57
            PrivCode::Unary => Codes::Unary,
4✔
58
            PrivCode::Gamma => Codes::Gamma,
12✔
UNCOV
59
            PrivCode::Delta => Codes::Delta,
×
60
            PrivCode::Zeta1 => Codes::Zeta { k: 1 },
61
            PrivCode::Zeta2 => Codes::Zeta { k: 2 },
62
            PrivCode::Zeta3 => Codes::Zeta { k: 3 },
63
            PrivCode::Zeta4 => Codes::Zeta { k: 4 },
64
            PrivCode::Zeta5 => Codes::Zeta { k: 5 },
65
            PrivCode::Zeta6 => Codes::Zeta { k: 6 },
66
            PrivCode::Zeta7 => Codes::Zeta { k: 7 },
67
        }
68
    }
69
}
70

71
#[derive(Args, Debug)]
72
/// Shared CLI arguments for reading files containing arcs.
73
pub struct ArcsArgs {
74
    #[arg(long, default_value_t = '#')]
75
    /// Ignore lines that start with this symbol.
76
    pub line_comment_symbol: char,
77

78
    #[arg(long, default_value_t = 0)]
79
    /// How many lines to skip, ignoring comment lines.
80
    pub lines_to_skip: usize,
81

82
    #[arg(long)]
83
    /// How many lines to parse, after skipping the first lines_to_skip and
84
    /// ignoring comment lines.
85
    pub max_arcs: Option<usize>,
86

87
    #[arg(long, default_value_t = '\t')]
88
    /// The column separator.
89
    pub separator: char,
90

91
    #[arg(long, default_value_t = 0)]
92
    /// The index of the column containing the source node of an arc.
93
    pub source_column: usize,
94

95
    #[arg(long, default_value_t = 1)]
96
    /// The index of the column containing the target node of an arc.
97
    pub target_column: usize,
98

99
    #[arg(long, default_value_t = false)]
100
    /// Source and destinations are node identifiers.
101
    pub exact: bool,
102
}
103

104
/// Shared CLI arguments for commands that specify a number of threads.
105
#[derive(Args, Debug)]
106
pub struct NumThreadsArg {
107
    #[arg(short = 'j', long, default_value_t = rayon::current_num_threads().max(1))]
108
    /// The number of threads to use
109
    pub num_threads: usize,
110
}
111

112
/// Shared CLI arguments for commands that specify a granularity.
113
#[derive(Args, Debug)]
114
pub struct GranularityArgs {
115
    #[arg(long, conflicts_with("node_granularity"))]
116
    /// The tentative number of arcs used define the size of a parallel job
117
    /// (advanced option).
118
    pub arc_granularity: Option<u64>,
119

120
    #[arg(long, conflicts_with("arc_granularity"))]
121
    /// The tentative number of nodes used define the size of a parallel job
122
    /// (advanced option).
123
    pub node_granularity: Option<usize>,
124
}
125

126
impl GranularityArgs {
127
    pub fn into_granularity(&self) -> Granularity {
2✔
128
        match (self.arc_granularity, self.node_granularity) {
2✔
129
            (Some(_), Some(_)) => unreachable!(),
UNCOV
130
            (Some(arc_granularity), None) => Granularity::Arcs(arc_granularity),
×
UNCOV
131
            (None, Some(node_granularity)) => Granularity::Nodes(node_granularity),
×
132
            (None, None) => Granularity::default(),
2✔
133
        }
134
    }
135
}
136

137
/// Shared CLI arguments for commands that specify a batch size.
138
#[derive(Args, Debug)]
139
pub struct BatchSizeArg {
140
    #[clap(short = 'b', long, value_parser = batch_size, default_value = "50%")]
141
    /// The number of pairs to be used in batches. Two times this number of
142
    /// `usize` will be allocated to sort pairs. You can use the SI and NIST
143
    /// multipliers k, M, G, T, P, ki, Mi, Gi, Ti, and Pi. You can also use a
144
    /// percentage of the available memory by appending a `%` to the number.
145
    pub batch_size: usize,
146
}
147

148
/// Parses a batch size.
149
///
150
/// This function accepts either a number (possibly followed by a
151
/// SI or NIST multiplier k, M, G, T, P, ki, Mi, Gi, Ti, or Pi), or a percentage
152
/// (followed by a `%`) that is interpreted as a percentage of the core
153
/// memory. The function returns the number of pairs to be used for batches.
154
pub fn batch_size(arg: &str) -> anyhow::Result<usize> {
4✔
155
    const PREF_SYMS: [(&str, u64); 10] = [
156
        ("k", 1E3 as u64),
157
        ("m", 1E6 as u64),
158
        ("g", 1E9 as u64),
159
        ("t", 1E12 as u64),
160
        ("p", 1E15 as u64),
161
        ("ki", 1 << 10),
162
        ("mi", 1 << 20),
163
        ("gi", 1 << 30),
164
        ("ti", 1 << 40),
165
        ("pi", 1 << 50),
166
    ];
167
    let arg = arg.trim().to_ascii_lowercase();
4✔
168
    ensure!(!arg.is_empty(), "empty string");
4✔
169

170
    if arg.ends_with('%') {
4✔
171
        let perc = arg[..arg.len() - 1].parse::<f64>()?;
8✔
UNCOV
172
        ensure!(perc >= 0.0 || perc <= 100.0, "percentage out of range");
×
173
        let mut system = System::new();
4✔
174
        system.refresh_memory();
4✔
175
        let num_pairs: usize = (((system.total_memory() as f64) * (perc / 100.0)
4✔
UNCOV
176
            / (std::mem::size_of::<(usize, usize)>() as f64))
×
UNCOV
177
            as u64)
×
178
            .try_into()?;
179
        // TODO: try_align_to when available
UNCOV
180
        return Ok(num_pairs.align_to(1 << 20)); // Round up to MiBs
×
181
    }
182

UNCOV
183
    arg.chars().position(|c| c.is_alphabetic()).map_or_else(
×
UNCOV
184
        || Ok(arg.parse::<usize>()?),
×
UNCOV
185
        |pos| {
×
UNCOV
186
            let (num, pref_sym) = arg.split_at(pos);
×
UNCOV
187
            let multiplier = PREF_SYMS
×
UNCOV
188
                .iter()
×
UNCOV
189
                .find(|(x, _)| *x == pref_sym)
×
UNCOV
190
                .map(|(_, m)| m)
×
UNCOV
191
                .ok_or(anyhow!("invalid prefix symbol"))?;
×
192

UNCOV
193
            Ok((num.parse::<u64>()? * multiplier).try_into()?)
×
194
        },
195
    )
196
}
197

198
#[derive(Args, Debug)]
199
/// Shared CLI arguments for compression.
200
pub struct CompressArgs {
201
    /// The endianness of the graph to write
202
    #[clap(short = 'E', long)]
203
    pub endianness: Option<String>,
204

205
    /// The compression windows
206
    #[clap(short = 'w', long, default_value_t = 7)]
207
    pub compression_window: usize,
208
    /// The minimum interval length
209
    #[clap(short = 'i', long, default_value_t = 4)]
210
    pub min_interval_length: usize,
211
    /// The maximum recursion depth for references (-1 for infinite recursion depth)
212
    #[clap(short = 'r', long, default_value_t = 3)]
213
    pub max_ref_count: isize,
214

215
    #[arg(value_enum)]
216
    #[clap(long, default_value = "gamma")]
217
    /// The code to use for the outdegree
218
    pub outdegrees: PrivCode,
219

220
    #[arg(value_enum)]
221
    #[clap(long, default_value = "unary")]
222
    /// The code to use for the reference offsets
223
    pub references: PrivCode,
224

225
    #[arg(value_enum)]
226
    #[clap(long, default_value = "gamma")]
227
    /// The code to use for the blocks
228
    pub blocks: PrivCode,
229

230
    #[arg(value_enum)]
231
    #[clap(long, default_value = "zeta3")]
232
    /// The code to use for the residuals
233
    pub residuals: PrivCode,
234
}
235

236
impl From<CompressArgs> for CompFlags {
237
    fn from(value: CompressArgs) -> Self {
4✔
238
        CompFlags {
239
            outdegrees: value.outdegrees.into(),
4✔
240
            references: value.references.into(),
4✔
241
            blocks: value.blocks.into(),
4✔
242
            intervals: PrivCode::Gamma.into(),
4✔
243
            residuals: value.residuals.into(),
4✔
244
            min_interval_length: value.min_interval_length,
4✔
245
            compression_window: value.compression_window,
4✔
246
            max_ref_count: match value.max_ref_count {
4✔
247
                -1 => usize::MAX,
248
                _ => value.max_ref_count as usize,
249
            },
250
        }
251
    }
252
}
253

254
/// Creates a threadpool with the given number of threads
255
pub fn get_thread_pool(num_threads: usize) -> rayon::ThreadPool {
6✔
256
    rayon::ThreadPoolBuilder::new()
6✔
257
        .num_threads(num_threads)
6✔
258
        .build()
259
        .expect("Failed to create thread pool")
260
}
261

262
/// Appends a string to the filename of a path.
///
/// The parent directory, if any, is preserved: only the last component is
/// extended, so `dir/graph` with suffix `-t` becomes `dir/graph-t`.
///
/// # Panics
/// * Will panic if there is no filename.
/// * Will panic in debug builds if the path has an extension.
pub fn append(path: impl AsRef<Path>, s: impl AsRef<str>) -> PathBuf {
    let path = path.as_ref();
    debug_assert!(path.extension().is_none());
    let mut filename = path
        .file_name()
        .expect("the path has no filename")
        .to_owned();
    filename.push(s.as_ref());
    // Replace the last component rather than pushing a new one, which would
    // yield `dir/graph/graph-t` instead of `dir/graph-t`.
    path.with_file_name(filename)
}
275

276
/// Creates all parent directories of the given file path.
277
pub fn create_parent_dir(file_path: impl AsRef<Path>) -> Result<()> {
10✔
278
    // ensure that the dst directory exists
279
    if let Some(parent_dir) = file_path.as_ref().parent() {
20✔
UNCOV
280
        std::fs::create_dir_all(parent_dir).with_context(|| {
×
UNCOV
281
            format!(
×
UNCOV
282
                "Failed to create the directory {:?}",
×
UNCOV
283
                parent_dir.to_string_lossy()
×
284
            )
285
        })?;
286
    }
287
    Ok(())
10✔
288
}
289

290
/// Parse a duration from a string.
291
/// For compatibility with Java, if no suffix is given, it is assumed to be in milliseconds.
292
/// You can use suffixes, the available ones are:
293
/// - `s` for seconds
294
/// - `m` for minutes
295
/// - `h` for hours
296
/// - `d` for days
297
///
298
/// Example: `1d2h3m4s567` this is parsed as: 1 day, 2 hours, 3 minutes, 4 seconds, and 567 milliseconds.
299
fn parse_duration(value: &str) -> Result<Duration> {
×
300
    if value.is_empty() {
×
301
        bail!("Empty duration string, if you want every 0 milliseconds use `0`.");
×
302
    }
303
    let mut duration = Duration::from_secs(0);
×
304
    let mut acc = String::new();
×
305
    for c in value.chars() {
×
UNCOV
306
        if c.is_ascii_digit() {
×
307
            acc.push(c);
×
308
        } else if c.is_whitespace() {
×
309
            continue;
×
310
        } else {
311
            let dur = acc.parse::<u64>()?;
×
312
            match c {
×
313
                's' => duration += Duration::from_secs(dur),
×
314
                'm' => duration += Duration::from_secs(dur * 60),
×
315
                'h' => duration += Duration::from_secs(dur * 60 * 60),
×
316
                'd' => duration += Duration::from_secs(dur * 60 * 60 * 24),
×
UNCOV
317
                _ => return Err(anyhow!("Invalid duration suffix: {}", c)),
×
318
            }
319
            acc.clear();
×
320
        }
321
    }
UNCOV
322
    if !acc.is_empty() {
×
UNCOV
323
        let dur = acc.parse::<u64>()?;
×
UNCOV
324
        duration += Duration::from_millis(dur);
×
325
    }
UNCOV
326
    Ok(duration)
×
327
}
328

329
pub fn init_envlogger() -> Result<()> {
2✔
330
    let mut builder =
2✔
331
        env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("info"));
2✔
332

333
    let start = std::time::Instant::now();
2✔
334
    let printer = SpanPrinter::new()
2✔
335
        .spacing(Spacing::None)
2✔
336
        .designator(Designator::Compact);
2✔
337
    let span_round = SpanRound::new()
2✔
338
        .largest(jiff::Unit::Day)
2✔
339
        .smallest(jiff::Unit::Millisecond)
2✔
340
        .days_are_24_hours();
341

342
    builder.format(move |buf, record| {
1,866✔
343
        let Ok(ts) = jiff::Timestamp::try_from(SystemTime::now()) else {
3,728✔
NEW
344
            return Err(std::io::Error::other("Failed to get timestamp"));
×
345
        };
NEW
346
        let style = buf.default_level_style(record.level());
×
NEW
347
        let elapsed = start.elapsed();
×
NEW
348
        let span = jiff::Span::new()
×
NEW
349
            .seconds(elapsed.as_secs() as i64)
×
NEW
350
            .milliseconds(elapsed.subsec_millis() as i64);
×
NEW
351
        let span = span.round(span_round).expect("Failed to round span");
×
NEW
352
        writeln!(
×
NEW
353
            buf,
×
NEW
354
            "{} {} {style}{}{style:#} [{:?}] {} - {}",
×
NEW
355
            ts.strftime("%F %T%.3f"),
×
NEW
356
            printer.span_to_string(&span),
×
NEW
357
            record.level(),
×
NEW
358
            std::thread::current().id(),
×
NEW
359
            record.target(),
×
NEW
360
            record.args()
×
361
        )
362
    });
363
    builder.init();
2✔
364
    Ok(())
2✔
365
}
366

367
/// The entry point of the command-line interface.
368
pub fn main<I, T>(args: I) -> Result<()>
9✔
369
where
370
    I: IntoIterator<Item = T>,
371
    T: Into<std::ffi::OsString> + Clone,
372
{
373
    let start = std::time::Instant::now();
9✔
374

375
    let command = Command::new("webgraph")
9✔
376
        .about("Webgraph tools to build, convert, modify, and analyze webgraph files.")
377
        .version(build_info::version_string())
9✔
378
        .subcommand_required(true)
379
        .arg_required_else_help(true)
380
        .arg(
381
            Arg::new("log-interval")
9✔
382
                .short('l')
9✔
383
                .long("log-interval")
9✔
384
                .value_parser(parse_duration)
9✔
385
                .help(
9✔
386
                    "How often to log progress. Default is 10s. You can use the suffixes `s` for seconds, `m` for minutes, `h` for hours, and `d` for days. If no suffix is provided it is assumed to be in milliseconds. Example: `1d2h3m4s567` is parsed as 1 day + 2 hours + 3 minutes + 4 seconds + 567 milliseconds = 93784567 milliseconds.",)
9✔
387
                .global(true),
9✔
388
        )
389
        .after_help(
390
            "Environment (noteworthy environment variables used):
391
RUST_MIN_STACK: minimum thread stack size (in bytes)
392
    we suggest RUST_MIN_STACK=8388608 which is the maximum allowed by linux.
393
TMPDIR: where to store temporary files (potentially very large ones)
394
RUST_LOG: configuration for env_logger, pass `info` to see the progress of the
395
  compression, `debug` to see the progress of the decompression, and `trace` to see all the
396
  details. You can also use `RUST_LOG=webgraph=debug` to see only the webgraph logs.
397
",
398
        );
399

400
    macro_rules! impl_dispatch {
×
UNCOV
401
        ($command:expr, $($module:ident),*) => {{
×
UNCOV
402
            let command = build::cli($command);
×
UNCOV
403
            $(
×
UNCOV
404
                let command = $module::cli(command);
×
405
            )*
×
406
            let command = command.display_order(0); // sort args alphabetically
UNCOV
407
            let mut completion_command = command.clone();
×
UNCOV
408
            let matches = command.get_matches_from(args);
×
409

UNCOV
410
            let subcommand = matches.subcommand();
×
411
            // if no command is specified, print the help message
UNCOV
412
            if subcommand.is_none() {
×
UNCOV
413
                completion_command.print_help().unwrap();
×
UNCOV
414
                return Ok(());
×
415
            }
UNCOV
416
            match subcommand.unwrap() {
×
UNCOV
417
                (build::COMMAND_NAME, sub_m) => build::main(sub_m, &mut completion_command),
×
UNCOV
418
                $(
×
UNCOV
419
                    ($module::COMMAND_NAME, sub_m) => $module::main(sub_m),
×
UNCOV
420
                )*
×
UNCOV
421
                (command_name, _) => {
×
422
                    // this shouldn't happen as clap should catch this
UNCOV
423
                    eprintln!("Unknown command: {:?}", command_name);
×
UNCOV
424
                    completion_command.print_help().unwrap();
×
UNCOV
425
                    std::process::exit(1);
×
426
                }
427
            }
428
        }};
429
    }
430

431
    impl_dispatch!(command, analyze, bench, check, from, perm, run, to, transform)?;
9✔
432

433
    log::info!(
9✔
434
        "The command took {}",
9✔
435
        pretty_print_elapsed(start.elapsed().as_secs_f64())
9✔
436
    );
437

UNCOV
438
    Ok(())
×
439
}
440

441
/// Pretty prints seconds in a humanly readable format.
///
/// The output lists the non-zero weeks, days, hours, and minutes (with the
/// proper singular or plural form), followed by the seconds with millisecond
/// precision and by the original value in parentheses, e.g.,
/// `1 minute 1.500 seconds (61.5s)`.
///
/// The value is rounded to the nearest millisecond before being split into
/// units, so the seconds field is always less than 60: rounding carries over
/// to the larger units (e.g., 59.9996 is printed as `1 minute 0.000 seconds`).
/// Negative and NaN values are treated as zero for the purpose of the split.
fn pretty_print_elapsed(elapsed: f64) -> String {
    const MINUTE: u64 = 60;
    const HOUR: u64 = 60 * MINUTE;
    const DAY: u64 = 24 * HOUR;
    const WEEK: u64 = 7 * DAY;

    // Work on integer milliseconds so that rounding happens exactly once.
    let total_millis = (elapsed * 1000.0).round() as u64;
    let whole_seconds = total_millis / 1000;

    let units = [
        (whole_seconds / WEEK, "week"),
        (whole_seconds % WEEK / DAY, "day"),
        (whole_seconds % DAY / HOUR, "hour"),
        (whole_seconds % HOUR / MINUTE, "minute"),
    ];

    let mut result = String::new();
    for &(count, name) in units.iter() {
        match count {
            0 => {}
            1 => result.push_str(&format!("1 {} ", name)),
            _ => result.push_str(&format!("{} {}s ", count, name)),
        }
    }

    result.push_str(&format!(
        "{}.{:03} seconds ({}s)",
        whole_seconds % MINUTE,
        total_millis % 1000,
        elapsed
    ));
    result
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc