• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

NVIDIA / nvrc / 20381075488

19 Dec 2025 07:59PM UTC coverage: 89.348% (+8.9%) from 80.415%
20381075488

Pull #85

github

web-flow
Merge 40e9e3e58 into 2295d6b0d
Pull Request #85: Update coverage.yaml to use cargo-llvm-cov

1233 of 1380 relevant lines covered (89.35%)

4.08 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

97.83
/src/daemon.rs
1
// SPDX-License-Identifier: Apache-2.0
2
// Copyright (c) NVIDIA CORPORATION
3

4
use anyhow::{Context, Result};
5

6
use crate::execute::background;
7
use crate::nvrc::NVRC;
8
use std::fs;
9

10
/// Builds the command-line flags passed to `nvidia-persistenced`.
///
/// `--verbose` is always present; `--uvm-persistence-mode` is appended only
/// when `uvm_enabled` is true. UVM persistence mode keeps unified memory
/// mappings alive between kernel launches, avoiding expensive page migrations.
fn persistenced_args(uvm_enabled: bool) -> Vec<&'static str> {
    // Start from the flag common to both modes, then add the optional one.
    let mut args = vec!["--verbose"];
    if uvm_enabled {
        args.push("--uvm-persistence-mode");
    }
    args
}
20✔
19

20
/// Returns the fixed flag list passed to `nv-hostengine`.
///
/// Hostengine needs a service account to avoid running as root, and /tmp as
/// home because the rootfs is read-only after init completes.
fn hostengine_args() -> &'static [&'static str] {
    const ARGS: &[&str] = &["--service-account", "nvidia-dcgm", "--home-dir", "/tmp"];
    ARGS
}
8✔
25

26
/// Returns the fixed flag list passed to `dcgm-exporter`.
///
/// Kubernetes mode (`-k`) disables the standalone HTTP server (we're behind
/// kata-agent), and `-f` points at the standard counters config shipped with
/// the container image.
fn dcgm_exporter_args() -> &'static [&'static str] {
    const ARGS: &[&str] = &["-k", "-f", "/etc/dcgm-exporter/default-counters.csv"];
    ARGS
}
8✔
31

32
/// Returns the fixed flag list passed to `nv-fabricmanager`.
///
/// Fabricmanager needs an explicit config path (`-c`) because it doesn't
/// search standard locations when running as a subprocess of init.
fn fabricmanager_args() -> &'static [&'static str] {
    const ARGS: &[&str] = &["-c", "/usr/share/nvidia/nvswitch/fabricmanager.cfg"];
    ARGS
}
8✔
37

38
/// Configurable path parameters allow testing with /bin/true instead of real
39
/// NVIDIA binaries that don't exist in the test environment.
40
impl NVRC {
41
    /// nvidia-persistenced keeps GPU state warm between container invocations,
42
    /// reducing cold-start latency. UVM persistence mode enables unified memory
43
    /// optimizations. Enabled by default since most workloads benefit from it.
44
    pub fn nvidia_persistenced(&mut self) -> Result<()> {
×
45
        self.spawn_persistenced("/var/run/nvidia-persistenced", "/bin/nvidia-persistenced")
×
46
    }
×
47

48
    fn spawn_persistenced(&mut self, run_dir: &str, bin: &str) -> Result<()> {
12✔
49
        fs::create_dir_all(run_dir).with_context(|| format!("create_dir_all {}", run_dir))?;
12✔
50
        let uvm_enabled = self.uvm_persistence_mode.unwrap_or(true);
12✔
51
        let args = persistenced_args(uvm_enabled);
12✔
52
        let child = background(bin, &args)?;
12✔
53
        self.track_daemon("nvidia-persistenced", child);
8✔
54
        Ok(())
8✔
55
    }
12✔
56

57
    /// nv-hostengine is the DCGM backend daemon. Only started when DCGM monitoring
58
    /// is explicitly requested - not needed for basic GPU workloads.
59
    pub fn nv_hostengine(&mut self) -> Result<()> {
4✔
60
        self.spawn_hostengine("/bin/nv-hostengine")
4✔
61
    }
4✔
62

63
    fn spawn_hostengine(&mut self, bin: &str) -> Result<()> {
8✔
64
        if !self.dcgm_enabled.unwrap_or(false) {
8✔
65
            return Ok(());
4✔
66
        }
4✔
67
        let child = background(bin, hostengine_args())?;
4✔
68
        self.track_daemon("nv-hostengine", child);
4✔
69
        Ok(())
4✔
70
    }
8✔
71

72
    /// dcgm-exporter exposes GPU metrics for Prometheus. Only started when DCGM
73
    /// is enabled - adds overhead so disabled by default.
74
    pub fn dcgm_exporter(&mut self) -> Result<()> {
4✔
75
        self.spawn_dcgm_exporter("/bin/dcgm-exporter")
4✔
76
    }
4✔
77

78
    fn spawn_dcgm_exporter(&mut self, bin: &str) -> Result<()> {
8✔
79
        if !self.dcgm_enabled.unwrap_or(false) {
8✔
80
            return Ok(());
4✔
81
        }
4✔
82
        let child = background(bin, dcgm_exporter_args())?;
4✔
83
        self.track_daemon("dcgm-exporter", child);
4✔
84
        Ok(())
4✔
85
    }
8✔
86

87
    /// NVSwitch fabric manager is only needed for multi-GPU NVLink topologies.
88
    /// Disabled by default since most VMs have single GPUs.
89
    pub fn nv_fabricmanager(&mut self) -> Result<()> {
4✔
90
        self.spawn_fabricmanager("/bin/nv-fabricmanager")
4✔
91
    }
4✔
92

93
    fn spawn_fabricmanager(&mut self, bin: &str) -> Result<()> {
8✔
94
        if !self.fabricmanager_enabled.unwrap_or(false) {
8✔
95
            return Ok(());
4✔
96
        }
4✔
97
        let child = background(bin, fabricmanager_args())?;
4✔
98
        self.track_daemon("nv-fabricmanager", child);
4✔
99
        Ok(())
4✔
100
    }
8✔
101
}
102

103
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    // === Args builder tests ===

    /// With UVM enabled both flags are emitted, in order.
    #[test]
    fn test_persistenced_args_with_uvm() {
        let args = persistenced_args(true);
        assert_eq!(args, vec!["--verbose", "--uvm-persistence-mode"]);
    }

    /// With UVM disabled only the verbose flag remains.
    #[test]
    fn test_persistenced_args_without_uvm() {
        let args = persistenced_args(false);
        assert_eq!(args, vec!["--verbose"]);
    }

    #[test]
    fn test_hostengine_args() {
        let args = hostengine_args();
        assert_eq!(
            args,
            &["--service-account", "nvidia-dcgm", "--home-dir", "/tmp"]
        );
    }

    #[test]
    fn test_dcgm_exporter_args() {
        let args = dcgm_exporter_args();
        assert_eq!(
            args,
            &["-k", "-f", "/etc/dcgm-exporter/default-counters.csv"]
        );
    }

    #[test]
    fn test_fabricmanager_args() {
        let args = fabricmanager_args();
        assert_eq!(
            args,
            &["-c", "/usr/share/nvidia/nvswitch/fabricmanager.cfg"]
        );
    }

    // === Skip path tests ===

    #[test]
    fn test_nv_hostengine_skipped_by_default() {
        // DCGM disabled by default - should be a no-op, no daemon spawned
        let mut nvrc = NVRC::default();
        assert!(nvrc.nv_hostengine().is_ok());
        assert!(nvrc.check_daemons().is_ok());
    }

    #[test]
    fn test_dcgm_exporter_skipped_by_default() {
        let mut nvrc = NVRC::default();
        assert!(nvrc.dcgm_exporter().is_ok());
    }

    #[test]
    fn test_nv_fabricmanager_skipped_by_default() {
        let mut nvrc = NVRC::default();
        assert!(nvrc.nv_fabricmanager().is_ok());
    }

    // === Spawn path tests (use /bin/true in place of the real binaries) ===

    #[test]
    fn test_spawn_persistenced_success() {
        let tmpdir = TempDir::new().unwrap();
        let run_dir = tmpdir.path().join("nvidia-persistenced");

        let mut nvrc = NVRC::default();
        let result = nvrc.spawn_persistenced(run_dir.to_str().unwrap(), "/bin/true");
        assert!(result.is_ok());

        // Directory should be created
        assert!(run_dir.exists());

        // Daemon should be tracked and exit cleanly
        assert!(nvrc.check_daemons().is_ok());
    }

    #[test]
    fn test_spawn_persistenced_uvm_disabled() {
        let tmpdir = TempDir::new().unwrap();
        let run_dir = tmpdir.path().join("nvidia-persistenced");

        let mut nvrc = NVRC::default();
        nvrc.uvm_persistence_mode = Some(false); // Tests the else branch for args
        let result = nvrc.spawn_persistenced(run_dir.to_str().unwrap(), "/bin/true");
        assert!(result.is_ok());
    }

    #[test]
    fn test_spawn_hostengine_success() {
        let mut nvrc = NVRC::default();
        nvrc.dcgm_enabled = Some(true);
        let result = nvrc.spawn_hostengine("/bin/true");
        assert!(result.is_ok());
        assert!(nvrc.check_daemons().is_ok());
    }

    #[test]
    fn test_spawn_dcgm_exporter_success() {
        let mut nvrc = NVRC::default();
        nvrc.dcgm_enabled = Some(true);
        let result = nvrc.spawn_dcgm_exporter("/bin/true");
        assert!(result.is_ok());
    }

    #[test]
    fn test_spawn_fabricmanager_success() {
        let mut nvrc = NVRC::default();
        nvrc.fabricmanager_enabled = Some(true);
        let result = nvrc.spawn_fabricmanager("/bin/true");
        assert!(result.is_ok());
    }

    // === Failure and empty-state tests ===

    #[test]
    fn test_spawn_persistenced_binary_not_found() {
        let tmpdir = TempDir::new().unwrap();
        let run_dir = tmpdir.path().join("nvidia-persistenced");

        let mut nvrc = NVRC::default();
        let result = nvrc.spawn_persistenced(run_dir.to_str().unwrap(), "/nonexistent/binary");
        assert!(result.is_err());
    }

    #[test]
    fn test_check_daemons_empty() {
        let mut nvrc = NVRC::default();
        assert!(nvrc.check_daemons().is_ok());
    }
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc