• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

NVIDIA / nvrc / 20381075488

19 Dec 2025 07:59PM UTC coverage: 89.348% (+8.9%) from 80.415%
20381075488

Pull #85

github

web-flow
Merge 40e9e3e58 into 2295d6b0d
Pull Request #85: Update coverage.yaml to use cargo-llvm-cov

1233 of 1380 relevant lines covered (89.35%)

4.08 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

93.43
/src/kernel_params.rs
1
use anyhow::{Context, Result};
2
use log::{debug, warn};
3
use std::fs;
4

5
use crate::nvrc::NVRC;
6

7
/// Kernel parameters use various boolean representations (on/off, true/false, 1/0, yes/no).
8
/// Normalize them to a single bool to simplify downstream logic.
9
fn parse_boolean(s: &str) -> bool {
28✔
10
    match s.to_ascii_lowercase().as_str() {
28✔
11
        "on" | "true" | "1" | "yes" => true,
28✔
12
        "off" | "false" | "0" | "no" => false,
11✔
13
        _ => {
14
            warn!("unrecognized boolean '{}', defaulting to false", s);
3✔
15
            false
3✔
16
        }
17
    }
18
}
28✔
19

20
impl NVRC {
21
    /// Parse kernel command line parameters to configure NVRC behavior.
22
    /// Using kernel params allows configuration without userspace tools—critical
23
    /// for a minimal init where no config files or environment variables exist.
24
    pub fn process_kernel_params(&mut self, cmdline: Option<&str>) -> Result<()> {
10✔
25
        let content = match cmdline {
10✔
26
            Some(c) => c.to_owned(),
9✔
27
            None => fs::read_to_string("/proc/cmdline").context("read /proc/cmdline")?,
1✔
28
        };
29

30
        for (k, v) in content.split_whitespace().filter_map(|p| p.split_once('=')) {
31✔
31
            match k {
30✔
32
                "nvrc.log" => nvrc_log(v, self)?,
30✔
33
                "nvrc.uvm.persistence.mode" => uvm_persistenced_mode(v, self)?,
24✔
34
                "nvrc.dcgm" => nvrc_dcgm(v, self)?,
23✔
35
                "nvrc.fabricmanager" => nvrc_fabricmanager(v, self)?,
22✔
36
                "nvrc.smi.srs" => nvidia_smi_srs(v, self)?,
21✔
37
                "nvrc.smi.lgc" => nvidia_smi_lgc(v, self)?,
20✔
38
                "nvrc.smi.lmcd" => nvidia_smi_lmcd(v, self)?,
18✔
39
                "nvrc.smi.pl" => nvidia_smi_pl(v, self)?,
17✔
40
                _ => {}
15✔
41
            }
42
        }
43
        Ok(())
10✔
44
    }
10✔
45
}
46

47
/// DCGM (Data Center GPU Manager) provides telemetry and health monitoring.
48
/// Off by default—only enable when observability infrastructure expects it.
49
fn nvrc_dcgm(value: &str, ctx: &mut NVRC) -> Result<()> {
8✔
50
    let dcgm = parse_boolean(value);
8✔
51
    ctx.dcgm_enabled = Some(dcgm);
8✔
52
    debug!("nvrc.dcgm: {dcgm}");
8✔
53
    Ok(())
8✔
54
}
8✔
55

56
/// Fabric Manager enables NVLink/NVSwitch multi-GPU communication.
57
/// Only needed for multi-GPU systems with NVLink topology.
58
fn nvrc_fabricmanager(value: &str, ctx: &mut NVRC) -> Result<()> {
3✔
59
    let fabricmanager = parse_boolean(value);
3✔
60
    ctx.fabricmanager_enabled = Some(fabricmanager);
3✔
61
    debug!("nvrc.fabricmanager: {fabricmanager}");
3✔
62
    Ok(())
3✔
63
}
3✔
64

65
/// Control log verbosity at runtime. Defaults to off to minimize noise.
66
/// Enabling devkmsg allows kernel log output even in minimal init environments.
67
fn nvrc_log(value: &str, _ctx: &mut NVRC) -> Result<()> {
7✔
68
    let lvl = match value.to_ascii_lowercase().as_str() {
7✔
69
        "off" | "0" | "" => log::LevelFilter::Off,
7✔
70
        "error" => log::LevelFilter::Error,
5✔
71
        "warn" => log::LevelFilter::Warn,
5✔
72
        "info" => log::LevelFilter::Info,
5✔
73
        "debug" => log::LevelFilter::Debug,
4✔
74
        "trace" => log::LevelFilter::Trace,
2✔
75
        _ => log::LevelFilter::Off,
1✔
76
    };
77

78
    log::set_max_level(lvl);
7✔
79
    debug!("nvrc.log: {}", log::max_level());
7✔
80
    fs::write("/proc/sys/kernel/printk_devkmsg", b"on\n").context("printk_devkmsg")?;
7✔
81

82
    Ok(())
7✔
83
}
7✔
84

85
/// Secure Randomization Seed for GPU memory. Passed directly to nvidia-smi.
86
fn nvidia_smi_srs(value: &str, ctx: &mut NVRC) -> Result<()> {
3✔
87
    ctx.nvidia_smi_srs = Some(value.to_owned());
3✔
88
    debug!("nvidia_smi_srs: {value}");
3✔
89
    Ok(())
3✔
90
}
3✔
91

92
/// Lock GPU core clocks to a fixed frequency (MHz) for consistent performance.
93
/// Eliminates thermal/power throttling variance in benchmarks and latency-sensitive workloads.
94
fn nvidia_smi_lgc(value: &str, ctx: &mut NVRC) -> Result<()> {
5✔
95
    let mhz: u32 = value.parse().context("nvrc.smi.lgc: invalid frequency")?;
5✔
96
    debug!("nvrc.smi.lgc: {} MHz (all GPUs)", mhz);
4✔
97
    ctx.nvidia_smi_lgc = Some(mhz);
4✔
98
    Ok(())
4✔
99
}
5✔
100

101
/// Lock memory clocks to a fixed frequency (MHz). Requires driver reload to take effect.
102
/// Used alongside lgc for fully deterministic GPU behavior.
103
fn nvidia_smi_lmcd(value: &str, ctx: &mut NVRC) -> Result<()> {
4✔
104
    let mhz: u32 = value.parse().context("nvrc.smi.lmcd: invalid frequency")?;
4✔
105
    debug!("nvrc.smi.lmcd: {} MHz (all GPUs)", mhz);
3✔
106
    ctx.nvidia_smi_lmcd = Some(mhz);
3✔
107
    Ok(())
3✔
108
}
4✔
109

110
/// Set GPU power limit (Watts). Lower limits reduce heat/power, higher allows peak perf.
111
/// Useful for power-constrained environments or thermal management.
112
fn nvidia_smi_pl(value: &str, ctx: &mut NVRC) -> Result<()> {
5✔
113
    let watts: u32 = value.parse().context("nvrc.smi.pl: invalid wattage")?;
5✔
114
    debug!("nvrc.smi.pl: {} W (all GPUs)", watts);
4✔
115
    ctx.nvidia_smi_pl = Some(watts);
4✔
116
    Ok(())
4✔
117
}
5✔
118

119
/// UVM persistence mode keeps unified memory state across CUDA context teardowns.
120
/// Reduces initialization overhead for short-lived CUDA applications.
121
fn uvm_persistenced_mode(value: &str, ctx: &mut NVRC) -> Result<()> {
4✔
122
    let enabled = parse_boolean(value);
4✔
123
    ctx.uvm_persistence_mode = Some(enabled);
4✔
124
    debug!("nvrc.uvm.persistence.mode: {enabled}");
4✔
125
    Ok(())
4✔
126
}
4✔
127

128
#[cfg(test)]
mod tests {
    use super::*;
    use nix::unistd::Uid;
    use serial_test::serial;
    use std::env;
    use std::process::Command;
    use std::sync::Once;

    // `Once::new()` is a const fn, so the guard can live in a plain static.
    static LOG: Once = Once::new();

    /// Installs the `kernlog` logger at most once per test process.
    fn log_setup() {
        LOG.call_once(|| {
            kernlog::init().unwrap();
        });
    }

    /// Re-executes the current test binary (same arguments) under `sudo`.
    ///
    /// Panics if `sudo` cannot be spawned or if the re-run exits unsuccessfully.
    fn rerun_with_sudo() {
        let args: Vec<String> = env::args().collect();
        let status = Command::new("sudo").args(&args).status();

        match status {
            Ok(s) => {
                if s.success() {
                    println!("running with sudo")
                } else {
                    panic!("not running with sudo")
                }
            }
            Err(e) => {
                panic!("Failed to escalate privileges: {e:?}")
            }
        }
    }

    // The `nvrc.log` tests below are `#[serial]` because they change the global
    // max log level, and they need root because `nvrc_log` writes to
    // `/proc/sys/kernel/printk_devkmsg`.
    //
    // Several fixtures use `nvidia.smi.lgc=1500`. That key is not one the parser
    // matches (the recognized key is `nvrc.smi.lgc`), so it only acts as ignored
    // filler around the `nvrc.log` parameter.

    #[test]
    #[serial]
    fn test_nvrc_log_debug() {
        if !Uid::effective().is_root() {
            return rerun_with_sudo();
        }

        log_setup();
        let mut c = NVRC::default();

        nvrc_log("debug", &mut c).unwrap();
        assert!(log_enabled!(log::Level::Debug));
    }

    #[test]
    #[serial]
    fn test_process_kernel_params_nvrc_log_debug() {
        if !Uid::effective().is_root() {
            return rerun_with_sudo();
        }

        log_setup();
        let mut init = NVRC::default();

        init.process_kernel_params(Some(
            "nvidia.smi.lgc=1500 nvrc.log=debug nvidia.smi.lgc=1500",
        ))
        .unwrap();

        assert_eq!(log::max_level(), log::LevelFilter::Debug);
        assert!(!log_enabled!(log::Level::Trace));
    }

    #[test]
    #[serial]
    fn test_process_kernel_params_nvrc_log_info() {
        if !Uid::effective().is_root() {
            return rerun_with_sudo();
        }

        log_setup();
        let mut init = NVRC::default();

        init.process_kernel_params(Some(
            "nvidia.smi.lgc=1500 nvrc.log=info nvidia.smi.lgc=1500",
        ))
        .unwrap();

        assert_eq!(log::max_level(), log::LevelFilter::Info);
        assert!(!log_enabled!(log::Level::Debug));
    }

    #[test]
    #[serial]
    fn test_process_kernel_params_nvrc_log_0() {
        if !Uid::effective().is_root() {
            return rerun_with_sudo();
        }

        log_setup();
        let mut init = NVRC::default();

        init.process_kernel_params(Some("nvidia.smi.lgc=1500 nvrc.log=0 nvidia.smi.lgc=1500"))
            .unwrap();
        assert_eq!(log::max_level(), log::LevelFilter::Off);
    }

    #[test]
    #[serial]
    fn test_process_kernel_params_nvrc_log_none() {
        if !Uid::effective().is_root() {
            return rerun_with_sudo();
        }

        log_setup();
        let mut init = NVRC::default();

        // `nvrc.log=` carries an empty value, which maps to Off.
        init.process_kernel_params(Some("nvidia.smi.lgc=1500 nvrc.log= "))
            .unwrap();
        assert_eq!(log::max_level(), log::LevelFilter::Off);
    }

    #[test]
    #[serial]
    fn test_process_kernel_params_nvrc_log_trace() {
        if !Uid::effective().is_root() {
            return rerun_with_sudo();
        }

        log_setup();
        let mut init = NVRC::default();

        init.process_kernel_params(Some("nvrc.log=trace")).unwrap();
        assert_eq!(log::max_level(), log::LevelFilter::Trace);
    }

    #[test]
    #[serial]
    fn test_process_kernel_params_nvrc_log_unknown() {
        if !Uid::effective().is_root() {
            return rerun_with_sudo();
        }

        log_setup();
        let mut init = NVRC::default();

        // Unknown log level should default to Off
        init.process_kernel_params(Some("nvrc.log=garbage"))
            .unwrap();
        assert_eq!(log::max_level(), log::LevelFilter::Off);
    }

    #[test]
    fn test_nvrc_dcgm_parameter_handling() {
        let mut c = NVRC::default();

        // Test various "on" values
        nvrc_dcgm("on", &mut c).unwrap();
        assert_eq!(c.dcgm_enabled, Some(true));

        nvrc_dcgm("true", &mut c).unwrap();
        assert_eq!(c.dcgm_enabled, Some(true));

        nvrc_dcgm("1", &mut c).unwrap();
        assert_eq!(c.dcgm_enabled, Some(true));

        nvrc_dcgm("yes", &mut c).unwrap();
        assert_eq!(c.dcgm_enabled, Some(true));

        // Test "off" values
        nvrc_dcgm("off", &mut c).unwrap();
        assert_eq!(c.dcgm_enabled, Some(false));

        nvrc_dcgm("false", &mut c).unwrap();
        assert_eq!(c.dcgm_enabled, Some(false));

        // Unrecognized input falls back to false
        nvrc_dcgm("invalid", &mut c).unwrap();
        assert_eq!(c.dcgm_enabled, Some(false));
    }

    #[test]
    fn test_nvrc_fabricmanager() {
        let mut c = NVRC::default();

        nvrc_fabricmanager("on", &mut c).unwrap();
        assert_eq!(c.fabricmanager_enabled, Some(true));

        nvrc_fabricmanager("off", &mut c).unwrap();
        assert_eq!(c.fabricmanager_enabled, Some(false));
    }

    #[test]
    fn test_nvidia_smi_srs() {
        let mut c = NVRC::default();

        nvidia_smi_srs("enabled", &mut c).unwrap();
        assert_eq!(c.nvidia_smi_srs, Some("enabled".to_owned()));

        nvidia_smi_srs("disabled", &mut c).unwrap();
        assert_eq!(c.nvidia_smi_srs, Some("disabled".to_owned()));
    }

    #[test]
    fn test_uvm_persistenced_mode() {
        let mut c = NVRC::default();

        uvm_persistenced_mode("on", &mut c).unwrap();
        assert_eq!(c.uvm_persistence_mode, Some(true));

        uvm_persistenced_mode("OFF", &mut c).unwrap();
        assert_eq!(c.uvm_persistence_mode, Some(false));

        uvm_persistenced_mode("True", &mut c).unwrap();
        assert_eq!(c.uvm_persistence_mode, Some(true));
    }

    #[test]
    fn test_parse_boolean() {
        assert!(parse_boolean("on"));
        assert!(parse_boolean("true"));
        assert!(parse_boolean("1"));
        assert!(parse_boolean("yes"));
        assert!(parse_boolean("ON"));
        assert!(parse_boolean("True"));
        assert!(parse_boolean("YES"));

        assert!(!parse_boolean("off"));
        assert!(!parse_boolean("false"));
        assert!(!parse_boolean("0"));
        assert!(!parse_boolean("no"));
        assert!(!parse_boolean("invalid"));
        assert!(!parse_boolean(""));
    }

    #[test]
    fn test_nvidia_smi_lgc() {
        let mut c = NVRC::default();

        nvidia_smi_lgc("1500", &mut c).unwrap();
        assert_eq!(c.nvidia_smi_lgc, Some(1500));

        nvidia_smi_lgc("2100", &mut c).unwrap();
        assert_eq!(c.nvidia_smi_lgc, Some(2100));

        // Invalid value should error
        assert!(nvidia_smi_lgc("invalid", &mut c).is_err());
    }

    #[test]
    fn test_nvidia_smi_lmcd() {
        let mut c = NVRC::default();

        nvidia_smi_lmcd("5001", &mut c).unwrap();
        assert_eq!(c.nvidia_smi_lmcd, Some(5001));

        nvidia_smi_lmcd("6000", &mut c).unwrap();
        assert_eq!(c.nvidia_smi_lmcd, Some(6000));

        // Invalid value should error
        assert!(nvidia_smi_lmcd("not_a_number", &mut c).is_err());
    }

    #[test]
    fn test_nvidia_smi_pl() {
        let mut c = NVRC::default();

        nvidia_smi_pl("300", &mut c).unwrap();
        assert_eq!(c.nvidia_smi_pl, Some(300));

        nvidia_smi_pl("450", &mut c).unwrap();
        assert_eq!(c.nvidia_smi_pl, Some(450));

        // Invalid value should error
        assert!(nvidia_smi_pl("abc", &mut c).is_err());
    }

    #[test]
    fn test_process_kernel_params_gpu_settings() {
        let mut c = NVRC::default();

        c.process_kernel_params(Some("nvrc.smi.lgc=1500 nvrc.smi.lmcd=5001 nvrc.smi.pl=300"))
            .unwrap();

        assert_eq!(c.nvidia_smi_lgc, Some(1500));
        assert_eq!(c.nvidia_smi_lmcd, Some(5001));
        assert_eq!(c.nvidia_smi_pl, Some(300));
    }

    #[test]
    fn test_process_kernel_params_combined() {
        let mut c = NVRC::default();

        // `nvrc.uvm.options` is not a recognized key and its value contains `=`;
        // it must be skipped without disturbing the parameters around it.
        c.process_kernel_params(Some(
            "nvrc.smi.lgc=2100 nvrc.uvm.options=opt1=1,opt2=2 nvrc.dcgm=on nvrc.smi.pl=400",
        ))
        .unwrap();

        assert_eq!(c.nvidia_smi_lgc, Some(2100));
        assert_eq!(c.nvidia_smi_pl, Some(400));
        assert_eq!(c.dcgm_enabled, Some(true));
    }

    #[test]
    fn test_process_kernel_params_from_proc_cmdline() {
        // Exercise the None path which reads /proc/cmdline.
        // We can't control the contents but can verify it doesn't error.
        let mut c = NVRC::default();
        let result = c.process_kernel_params(None);
        assert!(result.is_ok());
    }

    #[test]
    fn test_process_kernel_params_with_fabricmanager_and_uvm() {
        let mut c = NVRC::default();

        c.process_kernel_params(Some(
            "nvrc.fabricmanager=on nvrc.uvm.persistence.mode=true nvrc.smi.srs=enabled",
        ))
        .unwrap();

        assert_eq!(c.fabricmanager_enabled, Some(true));
        assert_eq!(c.uvm_persistence_mode, Some(true));
        assert_eq!(c.nvidia_smi_srs, Some("enabled".to_owned()));
    }
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc