• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

systemd / systemd / 23825567702

31 Mar 2026 12:42PM UTC coverage: 72.404% (+0.006%) from 72.398%
23825567702

push

github

daandemeyer
terminal-util: fix boot hang from ANSI terminal size queries

Since v257, terminal_fix_size() is called during early boot via
console_setup() → reset_dev_console_fd() to query terminal dimensions
via ANSI escape sequences. This has caused intermittent boot hangs
where the system gets stuck with a blinking cursor and requires a
keypress to continue (see systemd/systemd#35499).

The function tries CSI 18 first, then falls back to DSR if that fails.
Previously, each method independently opened a non-blocking fd, disabled
echo/icanon, ran its query, restored termios, and closed its fd. This
created two problems:

1. Echo window between CSI 18 and DSR fallback: After CSI 18 times out
   and restores termios (re-enabling ECHO and ICANON), there is a brief
   window before DSR disables them again. If the terminal's CSI 18
   response arrives during this window, it is echoed back to the
   terminal — where the terminal interprets \e[8;rows;cols t as a
   "resize text area" command — and the response bytes land in the
   canonical line buffer as stale input that can confuse the DSR
   response parser.

2. Cursor left at bottom-right on DSR timeout: The DSR method worked by
   sending two DSR queries — one to save the cursor position, then
   moving the cursor to (32766,32766) and sending another to read the
   clamped position. If neither response was received (timeout), the
   cursor restore was skipped (conditional on saved_row > 0), leaving
   the cursor at the bottom-right corner of the terminal. The
   subsequent terminal_reset_ansi_seq() then moved it to the beginning
   of the last line via \e[1G, making boot output appear at the bottom
   of the screen — giving the appearance of a hang even when the system
   was still booting.

This commit fixes both issues:

- terminal_fix_size() now opens the non-blocking fd and configures
  termios once for both query methods, so echo stays disabled for the
  entire CSI 18 → DSR fallback sequence with no gap. tcflu... (continued)

22 of 57 new or added lines in 3 files covered. (38.6%)

834 existing lines in 52 files now uncovered.

318485 of 439872 relevant lines covered (72.4%)

1162379.76 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

82.77
/src/basic/process-util.c
1
/* SPDX-License-Identifier: LGPL-2.1-or-later */
2

3
#include <linux/oom.h>
4
#include <pthread.h>
5
#include <spawn.h>
6
#include <stdio.h>
7
#include <sys/mman.h>
8
#include <sys/mount.h>
9
#include <sys/personality.h>
10
#include <sys/prctl.h>
11
#include <sys/wait.h>
12
#include <syslog.h>
13
#include <threads.h>
14
#include <unistd.h>
15
#if HAVE_VALGRIND_VALGRIND_H
16
#include <valgrind/valgrind.h>
17
#endif
18

19
#include "sd-messages.h"
20

21
#include "alloc-util.h"
22
#include "architecture.h"
23
#include "argv-util.h"
24
#include "capability-util.h"
25
#include "cgroup-util.h"
26
#include "dirent-util.h"
27
#include "dlfcn-util.h"
28
#include "errno-util.h"
29
#include "escape.h"
30
#include "fd-util.h"
31
#include "fileio.h"
32
#include "fs-util.h"
33
#include "io-util.h"
34
#include "iovec-util.h"
35
#include "locale-util.h"
36
#include "log.h"
37
#include "memory-util.h"
38
#include "mountpoint-util.h"
39
#include "namespace-util.h"
40
#include "nulstr-util.h"
41
#include "parse-util.h"
42
#include "path-util.h"
43
#include "pidfd-util.h"
44
#include "pidref.h"
45
#include "process-util.h"
46
#include "raw-clone.h"
47
#include "rlimit-util.h"
48
#include "signal-util.h"
49
#include "socket-util.h"
50
#include "stat-util.h"
51
#include "stdio-util.h"
52
#include "string-table.h"
53
#include "string-util.h"
54
#include "strv.h"
55
#include "time-util.h"
56
#include "user-util.h"
57

58
/* The kernel limits userspace processes to TASK_COMM_LEN (16 bytes), but allows higher values for its own
 * workers, e.g. "kworker/u9:3-kcryptd/253:0". Let's pick a fixed smallish limit that will work for the kernel.
 */
61
#define COMM_MAX_LEN 128
62

63
static int get_process_state(pid_t pid) {
13,967✔
64
        _cleanup_free_ char *line = NULL;
13,967✔
65
        const char *p;
13,967✔
66
        char state;
13,967✔
67
        int r;
13,967✔
68

69
        assert(pid >= 0);
13,967✔
70

71
        /* Shortcut: if we are enquired about our own state, we are obviously running */
72
        if (pid == 0 || pid == getpid_cached())
13,967✔
73
                return (unsigned char) 'R';
×
74

75
        p = procfs_file_alloca(pid, "stat");
13,967✔
76

77
        r = read_one_line_file(p, &line);
13,967✔
78
        if (r == -ENOENT)
13,967✔
79
                return -ESRCH;
80
        if (r < 0)
10,833✔
81
                return r;
82

83
        p = strrchr(line, ')');
10,829✔
84
        if (!p)
10,829✔
85
                return -EIO;
86

87
        p++;
10,829✔
88

89
        if (sscanf(p, " %c", &state) != 1)
10,829✔
90
                return -EIO;
91

92
        return (unsigned char) state;
10,829✔
93
}
94

95
int pid_get_comm(pid_t pid, char **ret) {
44,971✔
96
        _cleanup_free_ char *escaped = NULL, *comm = NULL;
44,971✔
97
        int r;
44,971✔
98

99
        assert(pid >= 0);
44,971✔
100
        assert(ret);
44,971✔
101

102
        if (pid == 0 || pid == getpid_cached()) {
44,971✔
103
                comm = new0(char, TASK_COMM_LEN + 1); /* Must fit in 16 byte according to prctl(2) */
23,776✔
104
                if (!comm)
23,776✔
105
                        return -ENOMEM;
106

107
                if (prctl(PR_GET_NAME, comm) < 0)
23,776✔
108
                        return -errno;
×
109
        } else {
110
                const char *p;
21,195✔
111

112
                p = procfs_file_alloca(pid, "comm");
21,195✔
113

114
                /* Note that process names of kernel threads can be much longer than TASK_COMM_LEN */
115
                r = read_one_line_file(p, &comm);
21,195✔
116
                if (r == -ENOENT)
21,195✔
117
                        return -ESRCH;
118
                if (r < 0)
16,650✔
119
                        return r;
120
        }
121

122
        escaped = new(char, COMM_MAX_LEN);
40,425✔
123
        if (!escaped)
40,425✔
124
                return -ENOMEM;
125

126
        /* Escape unprintable characters, just in case, but don't grow the string beyond the underlying size */
127
        cellescape(escaped, COMM_MAX_LEN, comm);
40,425✔
128

129
        *ret = TAKE_PTR(escaped);
40,425✔
130
        return 0;
40,425✔
131
}
132

133
int pidref_get_comm(const PidRef *pid, char **ret) {
193✔
134
        _cleanup_free_ char *comm = NULL;
193✔
135
        int r;
193✔
136

137
        if (!pidref_is_set(pid))
193✔
138
                return -ESRCH;
139

140
        if (pidref_is_remote(pid))
386✔
141
                return -EREMOTE;
142

143
        r = pid_get_comm(pid->pid, &comm);
193✔
144
        if (r < 0)
193✔
145
                return r;
146

147
        r = pidref_verify(pid);
193✔
148
        if (r < 0)
193✔
149
                return r;
150

151
        if (ret)
193✔
152
                *ret = TAKE_PTR(comm);
193✔
153
        return 0;
154
}
155

156
static int pid_get_cmdline_nulstr(
19,801✔
157
                pid_t pid,
158
                size_t max_size,
159
                ProcessCmdlineFlags flags,
160
                char **ret,
161
                size_t *ret_size) {
162

163
        _cleanup_free_ char *t = NULL;
19,801✔
164
        const char *p;
19,801✔
165
        size_t k;
19,801✔
166
        int r;
19,801✔
167

168
        /* Retrieves a process' command line as a "sized nulstr", i.e. possibly without the last NUL, but
169
         * with a specified size.
170
         *
171
         * If PROCESS_CMDLINE_COMM_FALLBACK is specified in flags and the process has no command line set
172
         * (the case for kernel threads), or has a command line that resolves to the empty string, will
173
         * return the "comm" name of the process instead. This will use at most _SC_ARG_MAX bytes of input
174
         * data.
175
         *
176
         * Returns an error, 0 if output was read but is truncated, 1 otherwise.
177
         */
178

179
        p = procfs_file_alloca(pid, "cmdline");
20,017✔
180
        r = read_virtual_file(p, max_size, &t, &k); /* Let's assume that each input byte results in >= 1
19,801✔
181
                                                     * columns of output. We ignore zero-width codepoints. */
182
        if (r == -ENOENT)
19,801✔
183
                return -ESRCH;
184
        if (r < 0)
15,167✔
185
                return r;
186

187
        if (k == 0) {
15,166✔
188
                if (!(flags & PROCESS_CMDLINE_COMM_FALLBACK))
597✔
189
                        return -ENOENT;
578✔
190

191
                /* Kernel threads have no argv[] */
192
                _cleanup_free_ char *comm = NULL;
19✔
193

194
                r = pid_get_comm(pid, &comm);
19✔
195
                if (r < 0)
19✔
196
                        return r;
197

198
                free(t);
19✔
199
                t = strjoin("[", comm, "]");
19✔
200
                if (!t)
19✔
201
                        return -ENOMEM;
202

203
                k = strlen(t);
19✔
204
                r = k <= max_size;
19✔
205
                if (r == 0) /* truncation */
19✔
206
                        t[max_size] = '\0';
12✔
207
        }
208

209
        if (ret)
14,588✔
210
                *ret = TAKE_PTR(t);
14,588✔
211
        if (ret_size)
14,588✔
212
                *ret_size = k;
14,588✔
213

214
        return r;
215
}
216

217
/* Retrieves the command line of a process and formats it for display, in one of two modes:
 *
 * - with PROCESS_CMDLINE_QUOTE or PROCESS_CMDLINE_QUOTE_POSIX, the arguments are quoted in C/Python
 *   style. If no shell special characters are present, this output can be copy-pasted into the terminal
 *   to execute. UTF-8 output is assumed.
 *
 * - otherwise, a compact non-roundtrippable form is returned. Non-UTF8 bytes are replaced by the Unicode
 *   replacement character. The returned string is of the specified console width at most, abbreviated
 *   with an ellipsis.
 *
 * Returns -ESRCH if the process doesn't exist, and -ENOENT if the process has no command line (and
 * PROCESS_CMDLINE_COMM_FALLBACK is not specified). Returns 0 and sets *ret otherwise. */
int pid_get_cmdline(pid_t pid, size_t max_columns, ProcessCmdlineFlags flags, char **ret) {
        _cleanup_free_ char *raw = NULL;
        size_t raw_size;

        assert(pid >= 0);
        assert(ret);

        int full = pid_get_cmdline_nulstr(pid, max_columns, flags, &raw, &raw_size);
        if (full < 0)
                return full;

        if (!(flags & (PROCESS_CMDLINE_QUOTE | PROCESS_CMDLINE_QUOTE_POSIX))) {
                /* Compact mode. Arguments are separated by NULs. Let's replace those with spaces. */
                for (size_t i = 0; i < raw_size - 1; i++)
                        if (raw[i] == '\0')
                                raw[i] = ' ';

                delete_trailing_chars(raw, WHITESPACE);

                bool eight_bit = (flags & PROCESS_CMDLINE_USE_LOCALE) && !is_locale_utf8();

                /* If the retrieval was truncated (full == 0), insist on an ellipsis in the output. */
                char *compact = escape_non_printable_full(
                                raw,
                                max_columns,
                                eight_bit * XESCAPE_8_BIT | !full * XESCAPE_FORCE_ELLIPSIS);
                if (!compact)
                        return -ENOMEM;

                *ret = str_realloc(compact);
                return 0;
        }

        /* Quoted mode. */
        assert(!(flags & PROCESS_CMDLINE_USE_LOCALE));

        ShellEscapeFlags shflags =
                SHELL_ESCAPE_EMPTY |
                FLAGS_SET(flags, PROCESS_CMDLINE_QUOTE_POSIX) * SHELL_ESCAPE_POSIX;

        /* Drop trailing NULs, otherwise strv_parse_nulstr() adds additional empty strings at the end.
         * See also issue #21186. */
        _cleanup_strv_free_ char **args = strv_parse_nulstr_full(raw, raw_size, /* drop_trailing_nuls= */ true);
        if (!args)
                return -ENOMEM;

        char *quoted = quote_command_line(args, shflags);
        if (!quoted)
                return -ENOMEM;

        *ret = quoted;
        return 0;
}
281

282
/* PidRef flavour of pid_get_cmdline(): formats the command line of the referenced process and verifies
 * the reference afterwards, so that the result is only handed out if it still belongs to that process.
 *
 * Returns 0 on success (storing the string in *ret if ret is non-NULL), -ESRCH if the reference is
 * unset, -EREMOTE if it refers to a remote process, or the error of the lookup/verification. */
int pidref_get_cmdline(const PidRef *pid, size_t max_columns, ProcessCmdlineFlags flags, char **ret) {
        if (!pidref_is_set(pid))
                return -ESRCH;
        if (pidref_is_remote(pid))
                return -EREMOTE;

        _cleanup_free_ char *cmdline = NULL;
        int r = pid_get_cmdline(pid->pid, max_columns, flags, &cmdline);
        if (r < 0)
                return r;

        r = pidref_verify(pid);
        if (r < 0)
                return r;

        if (!ret)
                return 0;

        *ret = TAKE_PTR(cmdline);
        return 0;
}
304

305
/* Retrieves the complete command line of a process as a string vector, one entry per argument.
 *
 * The only flag accepted is PROCESS_CMDLINE_COMM_FALLBACK. Returns 0 and stores the newly allocated
 * vector in *ret on success, -ENOMEM on allocation failure, or the error from reading the command
 * line. */
int pid_get_cmdline_strv(pid_t pid, ProcessCmdlineFlags flags, char ***ret) {
        _cleanup_free_ char *raw = NULL;
        size_t raw_size;

        assert(pid >= 0);
        assert((flags & ~PROCESS_CMDLINE_COMM_FALLBACK) == 0);
        assert(ret);

        /* No size limit here: we want every argument. */
        int r = pid_get_cmdline_nulstr(pid, SIZE_MAX, flags, &raw, &raw_size);
        if (r < 0)
                return r;

        char **l = strv_parse_nulstr_full(raw, raw_size, /* drop_trailing_nuls= */ true);
        if (!l)
                return -ENOMEM;

        *ret = l;
        return 0;
}
326

327
/* PidRef flavour of pid_get_cmdline_strv(): acquires the argument vector of the referenced process and
 * verifies the reference afterwards.
 *
 * Returns 0 on success (storing the vector in *ret if ret is non-NULL), -ESRCH if the reference is
 * unset, -EREMOTE if it refers to a remote process, or the error of the lookup/verification. */
int pidref_get_cmdline_strv(const PidRef *pid, ProcessCmdlineFlags flags, char ***ret) {
        if (!pidref_is_set(pid))
                return -ESRCH;
        if (pidref_is_remote(pid))
                return -EREMOTE;

        _cleanup_strv_free_ char **l = NULL;
        int r = pid_get_cmdline_strv(pid->pid, flags, &l);
        if (r < 0)
                return r;

        r = pidref_verify(pid);
        if (r < 0)
                return r;

        if (!ret)
                return 0;

        *ret = TAKE_PTR(l);
        return 0;
}
350

351
int pid_is_kernel_thread(pid_t pid) {
4,009✔
352
        int r;
4,009✔
353

354
        if (IN_SET(pid, 0, 1) || pid == getpid_cached()) /* pid 1, and we ourselves certainly aren't a kernel thread */
4,009✔
355
                return 0;
4,009✔
356
        if (!pid_is_valid(pid))
3,984✔
357
                return -EINVAL;
358

359
        const char *p = procfs_file_alloca(pid, "stat");
3,984✔
360
        _cleanup_free_ char *line = NULL;
3,984✔
361
        r = read_one_line_file(p, &line);
3,984✔
362
        if (r == -ENOENT)
3,984✔
363
                return -ESRCH;
364
        if (r < 0)
3,984✔
365
                return r;
366

367
        /* Skip past the comm field */
368
        char *q = strrchr(line, ')');
3,984✔
369
        if (!q)
3,984✔
370
                return -EINVAL;
371
        q++;
3,984✔
372

373
        /* Skip 6 fields to reach the flags field */
374
        for (size_t i = 0; i < 6; i++) {
27,888✔
375
                size_t l = strspn(q, WHITESPACE);
23,904✔
376
                if (l < 1)
23,904✔
377
                        return -EINVAL;
378
                q += l;
23,904✔
379

380
                l = strcspn(q, WHITESPACE);
23,904✔
381
                if (l < 1)
23,904✔
382
                        return -EINVAL;
383
                q += l;
23,904✔
384
        }
385

386
        /* Skip preceding whitespace */
387
        size_t l = strspn(q, WHITESPACE);
3,984✔
388
        if (l < 1)
3,984✔
389
                return -EINVAL;
390
        q += l;
3,984✔
391

392
        /* Truncate the rest */
393
        l = strcspn(q, WHITESPACE);
3,984✔
394
        if (l < 1)
3,984✔
395
                return -EINVAL;
396
        q[l] = 0;
3,984✔
397

398
        unsigned long long flags;
3,984✔
399
        r = safe_atollu(q, &flags);
3,984✔
400
        if (r < 0)
3,984✔
401
                return r;
402

403
        return !!(flags & PF_KTHREAD);
3,984✔
404
}
405

406
int pidref_is_kernel_thread(const PidRef *pid) {
1,723✔
407
        int result, r;
1,723✔
408

409
        if (!pidref_is_set(pid))
1,723✔
410
                return -ESRCH;
411

412
        if (pidref_is_remote(pid))
1,723✔
413
                return -EREMOTE;
414

415
        result = pid_is_kernel_thread(pid->pid);
1,723✔
416
        if (result < 0)
1,723✔
417
                return result;
418

419
        r = pidref_verify(pid); /* Verify that the PID wasn't reused since */
1,723✔
420
        if (r < 0)
1,723✔
421
                return r;
×
422

423
        return result;
424
}
425

426
/* Reads the target of the symlink named proc_file in the procfs directory of the specified process, via
 * readlink_malloc().
 *
 * Returns the result of readlink_malloc(), except that -ENOENT is turned into -ESRCH when procfs is
 * known to be mounted. */
static int get_process_link_contents(pid_t pid, const char *proc_file, char **ret) {
        assert(proc_file);

        const char *path = procfs_file_alloca(pid, proc_file);

        int r = readlink_malloc(path, ret);
        if (r != -ENOENT)
                return r;

        /* The link is missing. Only if procfs is actually mounted may we conclude that the process is
         * gone; otherwise pass the original error on unmodified. */
        if (proc_mounted() > 0)
                return -ESRCH;

        return r;
}
437

438
/* Resolves the "exe" link of a process, i.e. the path of the binary it is executing. If the link target
 * carries a " (deleted)" suffix, the suffix is chopped off.
 *
 * Returns 0 on success, or the negative error of the link lookup. */
int get_process_exe(pid_t pid, char **ret) {
        assert(pid >= 0);

        int r = get_process_link_contents(pid, "exe", ret);
        if (r < 0)
                return r;

        if (!ret)
                return 0;

        char *suffix = endswith(*ret, " (deleted)");
        if (suffix)
                *suffix = '\0';

        return 0;
}
456

457
int pid_get_uid(pid_t pid, uid_t *ret) {
4,128✔
458
        int r;
4,128✔
459

460
        assert(pid >= 0);
4,128✔
461
        assert(ret);
4,128✔
462

463
        if (pid == 0 || pid == getpid_cached()) {
4,128✔
464
                *ret = getuid();
1✔
465
                return 0;
4,128✔
466
        }
467

468
        _cleanup_free_ char *v = NULL;
4,127✔
469
        r = procfs_file_get_field(pid, "status", "Uid", &v);
4,127✔
470
        if (r == -ENOENT)
4,127✔
471
                return -ESRCH;
472
        if (r < 0)
173✔
473
                return r;
474

475
        return parse_uid(v, ret);
173✔
476
}
477

478
int pidref_get_uid(const PidRef *pid, uid_t *ret) {
72✔
479
        int r;
72✔
480

481
        if (!pidref_is_set(pid))
72✔
482
                return -ESRCH;
72✔
483

484
        if (pidref_is_remote(pid))
72✔
485
                return -EREMOTE;
486

487
        if (pid->fd >= 0) {
72✔
488
                r = pidfd_get_uid(pid->fd, ret);
72✔
489
                if (!ERRNO_IS_NEG_NOT_SUPPORTED(r))
72✔
490
                        return r;
491
        }
492

493
        uid_t uid;
×
494
        r = pid_get_uid(pid->pid, &uid);
×
495
        if (r < 0)
×
496
                return r;
497

498
        r = pidref_verify(pid);
×
499
        if (r < 0)
×
500
                return r;
501

502
        if (ret)
×
503
                *ret = uid;
×
504
        return 0;
505
}
506

507
int get_process_gid(pid_t pid, gid_t *ret) {
4,128✔
508
        int r;
4,128✔
509

510
        assert(pid >= 0);
4,128✔
511
        assert(ret);
4,128✔
512

513
        if (pid == 0 || pid == getpid_cached()) {
4,128✔
514
                *ret = getgid();
1✔
515
                return 0;
4,128✔
516
        }
517

518
        _cleanup_free_ char *v = NULL;
4,127✔
519
        r = procfs_file_get_field(pid, "status", "Gid", &v);
4,127✔
520
        if (r == -ENOENT)
4,127✔
521
                return -ESRCH;
522
        if (r < 0)
173✔
523
                return r;
524

525
        return parse_gid(v, ret);
4,127✔
526
}
527

528
/* Determines the current working directory of a process: via safe_getcwd() for the calling process
 * (pid == 0 or our own PID), via the "cwd" procfs link for any other process. Returns whatever the
 * respective helper returns. */
int get_process_cwd(pid_t pid, char **ret) {
        assert(pid >= 0);

        bool is_self = pid == 0 || pid == getpid_cached();

        return is_self ? safe_getcwd(ret) : get_process_link_contents(pid, "cwd", ret);
}
536

537
/* Determines the root directory of a process by resolving its "root" procfs link. Returns whatever the
 * link lookup returns. */
int get_process_root(pid_t pid, char **ret) {
        int r;

        assert(pid >= 0);

        r = get_process_link_contents(pid, "root", ret);
        return r;
}
541

542
#define ENVIRONMENT_BLOCK_MAX (5U*1024U*1024U)
543

544
int get_process_environ(pid_t pid, char **ret) {
15✔
545
        _cleanup_fclose_ FILE *f = NULL;
15✔
546
        _cleanup_free_ char *outcome = NULL;
15✔
547
        size_t sz = 0;
15✔
548
        const char *p;
15✔
549
        int r;
15✔
550

551
        assert(pid >= 0);
15✔
552
        assert(ret);
15✔
553

554
        p = procfs_file_alloca(pid, "environ");
15✔
555

556
        r = fopen_unlocked(p, "re", &f);
15✔
557
        if (r == -ENOENT)
15✔
558
                return -ESRCH;
559
        if (r < 0)
15✔
560
                return r;
561

562
        for (;;) {
6,608✔
563
                char c;
6,623✔
564

565
                if (sz >= ENVIRONMENT_BLOCK_MAX)
6,623✔
566
                        return -ENOBUFS;
×
567

568
                if (!GREEDY_REALLOC(outcome, sz + 5))
6,623✔
569
                        return -ENOMEM;
570

571
                r = safe_fgetc(f, &c);
6,623✔
572
                if (r < 0)
6,623✔
573
                        return r;
574
                if (r == 0)
6,623✔
575
                        break;
576

577
                if (c == '\0')
6,608✔
578
                        outcome[sz++] = '\n';
235✔
579
                else
580
                        sz += cescape_char(c, outcome + sz);
6,373✔
581
        }
582

583
        outcome[sz] = '\0';
15✔
584
        *ret = TAKE_PTR(outcome);
15✔
585

586
        return 0;
15✔
587
}
588

589
int pid_get_ppid(pid_t pid, pid_t *ret) {
6✔
590
        _cleanup_free_ char *line = NULL;
6✔
591
        unsigned long ppid;
6✔
592
        const char *p;
6✔
593
        int r;
6✔
594

595
        assert(pid >= 0);
6✔
596

597
        if (pid == 0)
6✔
598
                pid = getpid_cached();
1✔
599
        if (pid == 1) /* PID 1 has no parent, shortcut this case */
6✔
600
                return -EADDRNOTAVAIL;
601

602
        if (pid == getpid_cached()) {
3✔
603
                if (ret)
2✔
604
                        *ret = getppid();
2✔
605
                return 0;
2✔
606
        }
607

608
        p = procfs_file_alloca(pid, "stat");
1✔
609
        r = read_one_line_file(p, &line);
1✔
610
        if (r == -ENOENT)
1✔
611
                return -ESRCH;
612
        if (r < 0)
×
613
                return r;
614

615
        /* Let's skip the pid and comm fields. The latter is enclosed in () but does not escape any () in its
616
         * value, so let's skip over it manually */
617

618
        p = strrchr(line, ')');
×
619
        if (!p)
×
620
                return -EIO;
621
        p++;
×
622

623
        if (sscanf(p, " "
×
624
                   "%*c "  /* state */
625
                   "%lu ", /* ppid */
626
                   &ppid) != 1)
627
                return -EIO;
628

629
        /* If ppid is zero the process has no parent. Which might be the case for PID 1 (caught above)
630
         * but also for processes originating in other namespaces that are inserted into a pidns.
631
         * Return a recognizable error in this case. */
632
        if (ppid == 0)
×
633
                return -EADDRNOTAVAIL;
634

635
        if ((pid_t) ppid < 0 || (unsigned long) (pid_t) ppid != ppid)
×
636
                return -ERANGE;
637

638
        if (ret)
×
639
                *ret = (pid_t) ppid;
×
640

641
        return 0;
642
}
643

644
int pidref_get_ppid(const PidRef *pidref, pid_t *ret) {
2,705✔
645
        int r;
2,705✔
646

647
        if (!pidref_is_set(pidref))
2,705✔
648
                return -ESRCH;
2,705✔
649

650
        if (pidref_is_remote(pidref))
2,705✔
651
                return -EREMOTE;
652

653
        if (pidref->fd >= 0) {
2,705✔
654
                r = pidfd_get_ppid(pidref->fd, ret);
2,705✔
655
                if (!ERRNO_IS_NEG_NOT_SUPPORTED(r))
2,705✔
656
                        return r;
657
        }
658

659
        pid_t ppid;
×
660
        r = pid_get_ppid(pidref->pid, ret ? &ppid : NULL);
×
661
        if (r < 0)
×
662
                return r;
663

664
        r = pidref_verify(pidref);
×
665
        if (r < 0)
×
666
                return r;
667

668
        if (ret)
×
669
                *ret = ppid;
×
670
        return 0;
671
}
672

673
/* Acquires a PidRef for the parent of the referenced process.
 *
 * When the original reference carries a pidfd, the parent PID is queried again after pinning the
 * candidate; if it changed in the meantime the candidate is dropped and the procedure is retried, up to
 * 16 times in total.
 *
 * Returns 0 and stores the parent reference in *ret on success, -ENOTRECOVERABLE if all attempts were
 * used up, or the error of pidref_get_ppid()/pidref_set_pid(). */
int pidref_get_ppid_as_pidref(const PidRef *pidref, PidRef *ret) {
        pid_t ppid;
        int r;

        assert(ret);

        r = pidref_get_ppid(pidref, &ppid);
        if (r < 0)
                return r;

        for (unsigned tries_left = 16; tries_left > 0; tries_left--) {
                _cleanup_(pidref_done) PidRef candidate = PIDREF_NULL;
                bool still_parent = true;

                r = pidref_set_pid(&candidate, ppid);
                if (r < 0)
                        return r;

                /* If we have a pidfd of the original PID, let's verify that the process we acquired
                 * really is the parent still */
                if (pidref->fd >= 0) {
                        r = pidref_get_ppid(pidref, &ppid);
                        if (r < 0)
                                return r;

                        /* Did the PPID change since we queried it? if so we might have pinned the wrong
                         * process, if its PID got reused by now. Let's try again */
                        still_parent = candidate.pid == ppid;
                }

                if (still_parent) {
                        *ret = TAKE_PIDREF(candidate);
                        return 0;
                }
        }

        /* Give up after 16 tries */
        return -ENOTRECOVERABLE;
}
710

711
/* Determines the start time of a process by parsing the starttime field of the process' procfs "stat"
 * file and converting it with jiffies_to_usec().
 *
 * Returns 0 on success (storing the time in *ret if ret is non-NULL), -ESRCH if the stat file does not
 * exist, -EIO if the line cannot be parsed, or another negative errno-style error.
 *
 * NOTE(review): the field is parsed as "unsigned long" here, while the proc documentation lists
 * starttime as %llu. Whether that matters on 32-bit builds depends on jiffies_to_usec()'s parameter
 * type, which is not visible from here. Please confirm. */
int pid_get_start_time(pid_t pid, usec_t *ret) {
        assert(pid >= 0);

        const char *path = procfs_file_alloca(pid, "stat");
        _cleanup_free_ char *stat_line = NULL;
        int r = read_one_line_file(path, &stat_line);
        if (r < 0)
                return r == -ENOENT ? -ESRCH : r;

        /* Let's skip the pid and comm fields. The latter is enclosed in () but does not escape any () in
         * its value, so let's skip over it manually */
        const char *close_paren = strrchr(stat_line, ')');
        if (!close_paren)
                return -EIO;

        unsigned long ticks;
        int n_parsed = sscanf(close_paren + 1,
                              " "
                              "%*c " /* state */
                              "%*u " /* ppid */
                              "%*u " /* pgrp */
                              "%*u " /* session */
                              "%*u " /* tty_nr */
                              "%*u " /* tpgid */
                              "%*u " /* flags */
                              "%*u " /* minflt */
                              "%*u " /* cminflt */
                              "%*u " /* majflt */
                              "%*u " /* cmajflt */
                              "%*u " /* utime */
                              "%*u " /* stime */
                              "%*u " /* cutime */
                              "%*u " /* cstime */
                              "%*i " /* priority */
                              "%*i " /* nice */
                              "%*u " /* num_threads */
                              "%*u " /* itrealvalue */
                              "%lu ", /* starttime */
                              &ticks);
        if (n_parsed != 1)
                return -EIO;

        if (ret)
                *ret = jiffies_to_usec(ticks); /* CLOCK_BOOTTIME */

        return 0;
}
764

765
/* PidRef flavour of pid_get_start_time(): looks up the start time by numeric PID and verifies the
 * reference afterwards.
 *
 * Returns 0 on success (storing the time in *ret if ret is non-NULL), -ESRCH if the reference is unset,
 * -EREMOTE if it refers to a remote process, or the error of the lookup/verification. */
int pidref_get_start_time(const PidRef *pid, usec_t *ret) {
        if (!pidref_is_set(pid))
                return -ESRCH;
        if (pidref_is_remote(pid))
                return -EREMOTE;

        /* Only written by pid_get_start_time() if the caller actually wants the value */
        usec_t started;
        int r = pid_get_start_time(pid->pid, ret ? &started : NULL);
        if (r < 0)
                return r;

        r = pidref_verify(pid);
        if (r < 0)
                return r;

        if (!ret)
                return 0;

        *ret = started;
        return 0;
}
788

789
int get_process_umask(pid_t pid, mode_t *ret) {
26,703✔
790
        _cleanup_free_ char *m = NULL;
26,703✔
791
        int r;
26,703✔
792

793
        assert(pid >= 0);
26,703✔
794
        assert(ret);
26,703✔
795

796
        r = procfs_file_get_field(pid, "status", "Umask", &m);
26,703✔
797
        if (r == -ENOENT)
26,703✔
798
                return -ESRCH;
799
        if (r < 0)
26,703✔
800
                return r;
801

802
        return parse_mode(m, ret);
26,703✔
803
}
804

805
/*
806
 * Return values:
807
 * < 0 : pidref_wait_for_terminate() failed to get the state of the
808
 *       process, the process was terminated by a signal, or
809
 *       failed for an unknown reason.
810
 * >=0 : The process terminated normally, and its exit code is
811
 *       returned.
812
 *
813
 * That is, success is indicated by a return value of zero, and an
814
 * error is indicated by a non-zero value.
815
 *
816
 * A warning is emitted if the process terminates abnormally,
817
 * and also if it returns non-zero unless check_exit_code is true.
818
 */
819
int pidref_wait_for_terminate_and_check(const char *name, PidRef *pidref, WaitFlags flags) {
7,892✔
820
        int r;
7,892✔
821

822
        if (!pidref_is_set(pidref))
7,892✔
823
                return -ESRCH;
7,892✔
824
        if (pidref_is_remote(pidref))
15,784✔
825
                return -EREMOTE;
826
        if (pidref->pid == 1 || pidref_is_self(pidref))
7,892✔
827
                return -ECHILD;
×
828

829
        _cleanup_free_ char *buffer = NULL;
7,892✔
830
        if (!name) {
7,892✔
831
                r = pidref_get_comm(pidref, &buffer);
2✔
832
                if (r < 0)
2✔
833
                        log_debug_errno(r, "Failed to acquire process name of " PID_FMT ", ignoring: %m", pidref->pid);
×
834
                else
835
                        name = buffer;
2✔
836
        }
837

838
        int prio = flags & WAIT_LOG_ABNORMAL ? LOG_ERR : LOG_DEBUG;
7,892✔
839

840
        siginfo_t status;
7,892✔
841
        r = pidref_wait_for_terminate(pidref, &status);
7,892✔
842
        if (r < 0)
7,892✔
843
                return log_full_errno(prio, r, "Failed to wait for '%s': %m", strna(name));
×
844

845
        if (status.si_code == CLD_EXITED) {
7,892✔
846
                if (status.si_status != EXIT_SUCCESS)
7,892✔
847
                        log_full(flags & WAIT_LOG_NON_ZERO_EXIT_STATUS ? LOG_ERR : LOG_DEBUG,
72✔
848
                                 "'%s' failed with exit status %i.", strna(name), status.si_status);
849
                else
850
                        log_debug("'%s' succeeded.", name);
7,820✔
851

852
                return status.si_status;
7,892✔
853

UNCOV
854
        } else if (IN_SET(status.si_code, CLD_KILLED, CLD_DUMPED))
×
UNCOV
855
                return log_full_errno(prio, SYNTHETIC_ERRNO(EPROTO),
×
856
                                      "'%s' terminated by signal %s.", strna(name), signal_to_string(status.si_status));
857

858
        return log_full_errno(prio, SYNTHETIC_ERRNO(EPROTO),
×
859
                              "'%s' failed due to unknown reason.", strna(name));
860
}
861

862
/* Sends signal 'sig' to 'pid' and, if that worked, follows up with SIGCONT. Returns the result of the
 * first kill() converted via RET_NERRNO(); the result of the follow-up SIGCONT is ignored. */
int kill_and_sigcont(pid_t pid, int sig) {
        int r;

        r = RET_NERRNO(kill(pid, sig));

        /* If this worked, also send SIGCONT, unless we already just sent a SIGCONT, or SIGKILL was sent which isn't
         * affected by a process being suspended anyway. */
        if (r >= 0 && !IN_SET(sig, SIGCONT, SIGKILL))
                (void) kill(pid, SIGCONT);

        return r;
}
874

875
int getenv_for_pid(pid_t pid, const char *field, char **ret) {
5,413✔
876
        _cleanup_fclose_ FILE *f = NULL;
5,413✔
877
        const char *path;
5,413✔
878
        size_t sum = 0;
5,413✔
879
        int r;
5,413✔
880

881
        assert(pid >= 0);
5,413✔
882
        assert(field);
5,413✔
883
        assert(ret);
5,413✔
884

885
        if (pid == 0 || pid == getpid_cached())
5,413✔
886
                return strdup_to_full(ret, getenv(field));
14✔
887

888
        if (!pid_is_valid(pid))
5,399✔
889
                return -EINVAL;
890

891
        path = procfs_file_alloca(pid, "environ");
5,399✔
892

893
        r = fopen_unlocked(path, "re", &f);
5,399✔
894
        if (r == -ENOENT)
5,399✔
895
                return -ESRCH;
896
        if (r < 0)
4,989✔
897
                return r;
898

899
        for (;;) {
59,457✔
900
                _cleanup_free_ char *line = NULL;
27,966✔
901
                const char *match;
31,509✔
902

903
                if (sum > ENVIRONMENT_BLOCK_MAX) /* Give up searching eventually */
31,509✔
904
                        return -ENOBUFS;
905

906
                r = read_nul_string(f, LONG_LINE_MAX, &line);
31,509✔
907
                if (r < 0)
31,509✔
908
                        return r;
909
                if (r == 0)  /* EOF */
31,509✔
910
                        break;
911

912
                sum += r;
27,966✔
913

914
                match = startswith(line, field);
27,966✔
915
                if (match && *match == '=')
27,966✔
916
                        return strdup_to_full(ret, match + 1);
18✔
917
        }
918

919
        *ret = NULL;
3,543✔
920
        return 0;
3,543✔
921
}
922

923
/* Checks whether the process referenced by 'pid' is a direct child of the calling process.
 *
 * Returns -ESRCH if the reference is not set, -EREMOTE if it is remote, another negative value if the
 * parent PID cannot be determined, false if it is PID 1, ourselves, outside of our PID namespace or has a
 * different parent, and true if its parent PID equals our own PID. */
int pidref_is_my_child(PidRef *pid) {
        int r;

        if (!pidref_is_set(pid))
                return -ESRCH;

        if (pidref_is_remote(pid))
                return -EREMOTE;

        if (pid->pid == 1 || pidref_is_self(pid))
                return false;

        pid_t ppid;
        r = pidref_get_ppid(pid, &ppid);
        if (r == -EADDRNOTAVAIL) /* if this process is outside of our pidns, it is definitely not our child */
                return false;
        if (r < 0)
                return r;

        return ppid == getpid_cached();
}
944

945
int pid_is_my_child(pid_t pid) {
×
946

947
        if (pid == 0)
×
948
                return false;
×
949

950
        return pidref_is_my_child(&PIDREF_MAKE_FROM_PID(pid));
×
951
}
952

953
/* Checks whether the referenced process still exists in any form. Returns -ESRCH if the reference is not
 * set, -EREMOTE if it is remote, false if sending signal 0 reports -ESRCH, another negative value if
 * sending signal 0 fails otherwise, and true if the process is PID 1, ourselves, or signal 0 could be
 * delivered. */
int pidref_is_unwaited(PidRef *pid) {
        int r;

        /* Checks whether a PID is still valid at all, including a zombie */

        if (!pidref_is_set(pid))
                return -ESRCH;

        if (pidref_is_remote(pid))
                return -EREMOTE;

        if (pid->pid == 1 || pidref_is_self(pid))
                return true;

        /* Signal 0 delivers nothing, but still performs the existence check */
        r = pidref_kill(pid, 0);
        if (r == -ESRCH)
                return false;
        if (r < 0)
                return r;

        return true;
}
975

976
int pid_is_unwaited(pid_t pid) {
7,897✔
977

978
        if (pid == 0)
7,897✔
979
                return true;
7,897✔
980

981
        return pidref_is_unwaited(&PIDREF_MAKE_FROM_PID(pid));
7,897✔
982
}
983

984
/* Checks whether 'pid' exists and is not a zombie. Returns -ESRCH for negative PIDs, true for PID 0, PID 1
 * and our own PID, false if get_process_state() reports -ESRCH or the state 'Z', a negative value on
 * other lookup errors, and true otherwise. */
int pid_is_alive(pid_t pid) {
        int r;

        /* Checks whether a PID is still valid and not a zombie */

        if (pid < 0)
                return -ESRCH;

        if (pid <= 1) /* If we or PID 1 would be a zombie, this code would not be running */
                return true;

        if (pid == getpid_cached())
                return true;

        r = get_process_state(pid);
        if (r == -ESRCH)
                return false;
        if (r < 0)
                return r;

        return r != 'Z';
}
1006

1007
int pidref_is_alive(const PidRef *pidref) {
13,966✔
1008
        int r, result;
13,966✔
1009

1010
        if (!pidref_is_set(pidref))
13,966✔
1011
                return -ESRCH;
1012

1013
        if (pidref_is_remote(pidref))
13,964✔
1014
                return -EREMOTE;
1015

1016
        result = pid_is_alive(pidref->pid);
13,964✔
1017
        if (result < 0) {
13,964✔
1018
                assert(result != -ESRCH);
×
1019
                return result;
1020
        }
1021

1022
        r = pidref_verify(pidref);
13,964✔
1023
        if (r == -ESRCH)
13,964✔
1024
                return false;
1025
        if (r < 0)
10,825✔
1026
                return r;
×
1027

1028
        return result;
1029
}
1030

1031
/* Compares the root directories of two processes by checking whether their procfs "root" entries refer
 * to the same inode.
 *
 * Returns:
 *   -ESRCH    if either reference is not set, or a "root" entry is missing while procfs is available
 *   -EREMOTE  if both references are remote
 *   -ENOSYS   if a "root" entry is missing and proc_mounted() returns 0
 *   < 0       on other errors (pidref_set_self(), inode_same(), pidref_verify())
 *   false     if exactly one reference is remote, or the inodes differ
 *   true      if both references are equal, or the inodes match */
int pidref_from_same_root_fs(PidRef *a, PidRef *b) {
        _cleanup_(pidref_done) PidRef self = PIDREF_NULL;
        int r;

        /* Checks if the two specified processes have the same root fs. Either can be specified as NULL in
         * which case we'll check against ourselves. */

        if (!a || !b) {
                r = pidref_set_self(&self);
                if (r < 0)
                        return r;
                if (!a)
                        a = &self;
                if (!b)
                        b = &self;
        }

        if (!pidref_is_set(a) || !pidref_is_set(b))
                return -ESRCH;

        /* If only one of the two processes is remote they cannot have the same root fs, but if both of
         * them are we don't know */
        if (pidref_is_remote(a) && pidref_is_remote(b))
                return -EREMOTE;
        if (pidref_is_remote(a) || pidref_is_remote(b))
                return false;

        if (pidref_equal(a, b))
                return true;

        const char *roota = procfs_file_alloca(a->pid, "root");
        const char *rootb = procfs_file_alloca(b->pid, "root");

        int result = inode_same(roota, rootb, 0);
        if (result == -ENOENT)
                return proc_mounted() == 0 ? -ENOSYS : -ESRCH;
        if (result < 0)
                return result;

        /* Verify both references only after the comparison, before the result is returned */
        r = pidref_verify(a);
        if (r < 0)
                return r;
        r = pidref_verify(b);
        if (r < 0)
                return r;

        return result;
}
1079

1080
bool is_main_thread(void) {
7,827,883✔
1081
        static thread_local int cached = -1;
7,827,883✔
1082

1083
        if (cached < 0)
7,827,883✔
1084
                cached = getpid_cached() == gettid();
57,975✔
1085

1086
        return cached;
7,827,883✔
1087
}
1088

1089
unsigned long personality_from_string(const char *s) {
9✔
1090
        Architecture architecture;
9✔
1091

1092
        if (!s)
9✔
1093
                return PERSONALITY_INVALID;
1094

1095
        /* Parse a personality specifier. We use our own identifiers that indicate specific ABIs, rather than just
1096
         * hints regarding the register size, since we want to keep things open for multiple locally supported ABIs for
1097
         * the same register size. */
1098

1099
        architecture = architecture_from_string(s);
8✔
1100
        if (architecture < 0)
8✔
1101
                return PERSONALITY_INVALID;
1102

1103
        if (architecture == native_architecture())
6✔
1104
                return PER_LINUX;
1105
#ifdef ARCHITECTURE_SECONDARY
1106
        if (architecture == ARCHITECTURE_SECONDARY)
3✔
1107
                return PER_LINUX32;
2✔
1108
#endif
1109

1110
        return PERSONALITY_INVALID;
1111
}
1112

1113
const char* personality_to_string(unsigned long p) {
3,048✔
1114
        Architecture architecture = _ARCHITECTURE_INVALID;
3,048✔
1115

1116
        if (p == PER_LINUX)
3,048✔
1117
                architecture = native_architecture();
1118
#ifdef ARCHITECTURE_SECONDARY
1119
        else if (p == PER_LINUX32)
3,043✔
1120
                architecture = ARCHITECTURE_SECONDARY;
1121
#endif
1122

1123
        if (architecture < 0)
1124
                return NULL;
1125

1126
        return architecture_to_string(architecture);
7✔
1127
}
1128

1129
/* Wrapper around personality() with uniform error reporting. On success the non-negative return value
 * of personality() is passed through. On failure a negative errno-style value is returned and errno
 * holds the matching positive error number. */
int safe_personality(unsigned long p) {
        int ret;

        /* So here's the deal, personality() is weirdly defined by glibc. In some cases it returns a failure via errno,
         * and in others as negative return value containing an errno-like value. Let's work around this: this is a
         * wrapper that uses errno if it is set, and uses the return value otherwise. And then it sets both errno and
         * the return value indicating the same issue, so that we are definitely on the safe side.
         *
         * See https://github.com/systemd/systemd/issues/6737 */

        errno = 0;
        ret = personality(p);
        if (ret < 0) {
                if (errno != 0)
                        return -errno;

                /* Failure was reported via the return value only, mirror it into errno */
                errno = -ret;
        }

        return ret;
}
1150

1151
int opinionated_personality(unsigned long *ret) {
1,481✔
1152
        int current;
1,481✔
1153

1154
        /* Returns the current personality, or PERSONALITY_INVALID if we can't determine it. This function is a bit
1155
         * opinionated though, and ignores all the finer-grained bits and exotic personalities, only distinguishing the
1156
         * two most relevant personalities: PER_LINUX and PER_LINUX32. */
1157

1158
        current = safe_personality(PERSONALITY_INVALID);
1,481✔
1159
        if (current < 0)
1,481✔
1160
                return current;
1161

1162
        if (((unsigned long) current & OPINIONATED_PERSONALITY_MASK) == PER_LINUX32)
1,481✔
1163
                *ret = PER_LINUX32;
×
1164
        else
1165
                *ret = PER_LINUX;
1,481✔
1166

1167
        return 0;
1168
}
1169

1170
/* When built with valgrind support and running as PID 1 under valgrind, forks off a helper process that
 * exits immediately, and waits for it. Does nothing in all other cases. */
void valgrind_summary_hack(void) {
#if HAVE_VALGRIND_VALGRIND_H
        if (getpid_cached() == 1 && RUNNING_ON_VALGRIND) {
                pid_t pid;
                pid = raw_clone(SIGCHLD);
                if (pid < 0)
                        log_struct_errno(
                                LOG_EMERG, errno,
                                LOG_MESSAGE_ID(SD_MESSAGE_VALGRIND_HELPER_FORK_STR),
                                LOG_MESSAGE("Failed to fork off valgrind helper: %m"));
                else if (pid == 0)
                        /* Helper child: nothing to do but exit */
                        exit(EXIT_SUCCESS);
                else {
                        log_info("Spawned valgrind helper as PID "PID_FMT".", pid);
                        /* Both calls are best-effort, their results are deliberately ignored */
                        _cleanup_(pidref_done) PidRef pidref = PIDREF_MAKE_FROM_PID(pid);
                        (void) pidref_set_pid(&pidref, pid);
                        (void) pidref_wait_for_terminate(&pidref, NULL);
                }
        }
#endif
}
39✔
1191

1192
/* Three-way comparison of two PIDs via CMP(). */
int pid_compare_func(const pid_t *a, const pid_t *b) {
        /* Suitable for usage in qsort() */
        return CMP(*a, *b);
}
1196

1197
/* Returns true if 'n' is within the range PRIO_MIN <= n < PRIO_MAX. */
bool nice_is_valid(int n) {
        return n >= PRIO_MIN && n < PRIO_MAX;
}
1200

1201
bool sched_policy_is_valid(int policy) {
×
1202
        return IN_SET(policy, SCHED_OTHER, SCHED_BATCH, SCHED_IDLE, SCHED_FIFO, SCHED_RR, SCHED_EXT);
×
1203
}
1204

1205
/* Returns true if sched_get_priority_min() succeeds for 'policy', i.e. returns a non-negative value. */
bool sched_policy_supported(int policy) {
        return sched_get_priority_min(policy) >= 0;
}
1208

1209
/* Wrappers around sched_get_priority_{min,max}() that gracefully handle missing SCHED_EXT support in the kernel */
1210
/* Returns the result of sched_get_priority_min() for 'policy' if it is non-negative, and 0 if the call
 * fails. */
int sched_get_priority_min_safe(int policy) {
        int r;

        r = sched_get_priority_min(policy);
        if (r >= 0)
                return r;

        /* Fallback priority */
        return 0;
}
1220

1221
/* Returns the result of sched_get_priority_max() for 'policy' if it is non-negative, and 0 if the call
 * fails. */
int sched_get_priority_max_safe(int policy) {
        int r;

        r = sched_get_priority_max(policy);
        if (r >= 0)
                return r;

        /* Fallback priority */
        return 0;
}
1230

1231
/* The cached PID, possible values:
 *
 *     == UNSET [0]  → cache not initialized yet
 *     == BUSY [-1]  → some thread is initializing it at the moment
 *     any other     → the cached PID
 */

#define CACHED_PID_UNSET ((pid_t) 0)
#define CACHED_PID_BUSY ((pid_t) -1)

static pid_t cached_pid = CACHED_PID_UNSET;
1242

1243
void reset_cached_pid(void) {
1,801✔
1244
        /* Invoked in the child after a fork(), i.e. at the first moment the PID changed */
1245
        cached_pid = CACHED_PID_UNSET;
1,801✔
1246
}
1,801✔
1247

1248
pid_t getpid_cached(void) {
151,551,486✔
1249
        static bool installed = false;
151,551,486✔
1250
        pid_t current_value = CACHED_PID_UNSET;
151,551,486✔
1251

1252
        /* getpid_cached() is much like getpid(), but caches the value in local memory, to avoid having to invoke a
1253
         * system call each time. This restores glibc behaviour from before 2.24, when getpid() was unconditionally
1254
         * cached. Starting with 2.24 getpid() started to become prohibitively expensive when used for detecting when
1255
         * objects were used across fork()s. With this caching the old behaviour is somewhat restored.
1256
         *
1257
         * https://bugzilla.redhat.com/show_bug.cgi?id=1443976
1258
         * https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=c579f48edba88380635ab98cb612030e3ed8691e
1259
         */
1260

1261
        (void) __atomic_compare_exchange_n(
151,551,486✔
1262
                        &cached_pid,
1263
                        &current_value,
1264
                        CACHED_PID_BUSY,
1265
                        false,
1266
                        __ATOMIC_SEQ_CST,
1267
                        __ATOMIC_SEQ_CST);
1268

1269
        switch (current_value) {
151,551,486✔
1270

1271
        case CACHED_PID_UNSET: { /* Not initialized yet, then do so now */
107,172✔
1272
                pid_t new_pid;
107,172✔
1273

1274
                new_pid = getpid();
107,172✔
1275

1276
                if (!installed) {
107,172✔
1277
                        /* __register_atfork() either returns 0 or -ENOMEM, in its glibc implementation. Since it's
1278
                         * only half-documented (glibc doesn't document it but LSB does — though only superficially)
1279
                         * we'll check for errors only in the most generic fashion possible. */
1280

1281
                        if (pthread_atfork(NULL, NULL, reset_cached_pid) != 0) {
74,407✔
1282
                                /* OOM? Let's try again later */
1283
                                cached_pid = CACHED_PID_UNSET;
×
1284
                                return new_pid;
×
1285
                        }
1286

1287
                        installed = true;
74,407✔
1288
                }
1289

1290
                cached_pid = new_pid;
107,172✔
1291
                return new_pid;
107,172✔
1292
        }
1293

1294
        case CACHED_PID_BUSY: /* Somebody else is currently initializing */
×
1295
                return getpid();
×
1296

1297
        default: /* Properly initialized */
1298
                return current_value;
1299
        }
1300
}
1301

1302
/* Returns 0 if the effective UID is 0. Otherwise logs "Need to be root." at error level with a synthetic
 * EPERM and returns the result of that log call. */
int must_be_root(void) {

        if (geteuid() == 0)
                return 0;

        return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Need to be root.");
}
1309

1310
pid_t clone_with_nested_stack(int (*fn)(void *), int flags, void *userdata) {
3,707✔
1311
        size_t ps;
3,707✔
1312
        pid_t pid;
3,707✔
1313
        void *mystack;
3,707✔
1314

1315
        /* A wrapper around glibc's clone() call that automatically sets up a "nested" stack. Only supports
1316
         * invocations without CLONE_VM, so that we can continue to use the parent's stack mapping.
1317
         *
1318
         * Note: glibc's clone() wrapper does not synchronize malloc() locks. This means that if the parent
1319
         * is threaded these locks will be in an undefined state in the child, and hence memory allocations
1320
         * are likely going to run into deadlocks. Hence: if you use this function make sure your parent is
1321
         * strictly single-threaded or your child never calls malloc(). */
1322

1323
        assert((flags & (CLONE_VM|CLONE_PARENT_SETTID|CLONE_CHILD_SETTID|
3,707✔
1324
                         CLONE_CHILD_CLEARTID|CLONE_SETTLS)) == 0);
1325

1326
        /* We allocate some space on the stack to use as the stack for the child (hence "nested"). Note that
1327
         * the net effect is that the child will have the start of its stack inside the stack of the parent,
1328
         * but since they are a CoW copy of each other that's fine. We allocate one page-aligned page. But
1329
         * since we don't want to deal with differences between systems where the stack grows backwards or
1330
         * forwards we'll allocate one more and place the stack address in the middle. Except that we also
1331
         * want it page aligned, hence we'll allocate one page more. Makes 3. */
1332

1333
        ps = page_size();
3,707✔
1334
        mystack = alloca(ps*3);
3,707✔
1335
        mystack = (uint8_t*) mystack + ps; /* move pointer one page ahead since stacks usually grow backwards */
3,707✔
1336
        mystack = (void*) ALIGN_TO((uintptr_t) mystack, ps); /* align to page size (moving things further ahead) */
3,707✔
1337

1338
#if HAVE_CLONE
1339
        pid = clone(fn, mystack, flags, userdata);
3,707✔
1340
#else
1341
        pid = __clone2(fn, mystack, ps, flags, userdata);
1342
#endif
1343
        if (pid < 0)
3,707✔
1344
                return -errno;
×
1345

1346
        return pid;
1347
}
1348

1349
/* Maps the FORK_DEATHSIG_* bits in 'flags' to a signal number. FORK_DEATHSIG_SIGTERM takes precedence
 * over FORK_DEATHSIG_SIGINT; if neither is set the result is SIGKILL. */
static int fork_flags_to_signal(ForkFlags flags) {
        return (flags & FORK_DEATHSIG_SIGTERM) ? SIGTERM :
                (flags & FORK_DEATHSIG_SIGINT) ? SIGINT :
                                                 SIGKILL;
}
1354

1355
int pidref_safe_fork_full(
30,172✔
1356
                const char *name,
1357
                const int stdio_fds[3],
1358
                int except_fds[],
1359
                size_t n_except_fds,
1360
                ForkFlags flags,
1361
                PidRef *ret) {
1362

1363
        pid_t original_pid, pid;
30,172✔
1364
        sigset_t saved_ss, ss;
30,172✔
1365
        _unused_ _cleanup_(block_signals_reset) sigset_t *saved_ssp = NULL;
×
1366
        bool block_signals = false, block_all = false, intermediary = false;
30,172✔
1367
        _cleanup_close_pair_ int pidref_transport_fds[2] = EBADF_PAIR;
62,108✔
1368
        int prio, r;
30,172✔
1369

1370
        assert(!FLAGS_SET(flags, FORK_WAIT|FORK_FREEZE));
30,172✔
1371
        assert(!FLAGS_SET(flags, FORK_DETACH) ||
30,172✔
1372
               (flags & (FORK_WAIT|FORK_DEATHSIG_SIGTERM|FORK_DEATHSIG_SIGINT|FORK_DEATHSIG_SIGKILL)) == 0);
1373

1374
        /* A wrapper around fork(), that does a couple of important initializations in addition to mere
1375
         * forking. If provided, ret is initialized in both the parent and the child process, both times
1376
         * referencing the child process. Returns == 0 in the child and > 0 in the parent. */
1377

1378
        prio = flags & FORK_LOG ? LOG_ERR : LOG_DEBUG;
30,172✔
1379

1380
        original_pid = getpid_cached();
30,172✔
1381

1382
        if (flags & FORK_FLUSH_STDIO) {
30,172✔
1383
                fflush(stdout);
5✔
1384
                fflush(stderr); /* This one shouldn't be necessary, stderr should be unbuffered anyway, but let's better be safe than sorry */
5✔
1385
        }
1386

1387
        if (flags & (FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGTERM|FORK_DEATHSIG_SIGINT)) {
30,172✔
1388
                /* We temporarily block all signals, so that the new child has them blocked initially. This
1389
                 * way, we can be sure that SIGTERMs are not lost we might send to the child. (Note that for
1390
                 * FORK_DEATHSIG_SIGKILL we don't bother, since it cannot be blocked anyway.) */
1391

1392
                assert_se(sigfillset(&ss) >= 0);
25,677✔
1393
                block_signals = block_all = true;
1394

1395
        } else if (flags & FORK_WAIT) {
4,495✔
1396
                /* Let's block SIGCHLD at least, so that we can safely watch for the child process */
1397

1398
                assert_se(sigemptyset(&ss) >= 0);
164✔
1399
                assert_se(sigaddset(&ss, SIGCHLD) >= 0);
164✔
1400
                block_signals = true;
1401
        }
1402

1403
        if (block_signals) {
1404
                if (sigprocmask(SIG_BLOCK, &ss, &saved_ss) < 0)
25,841✔
1405
                        return log_full_errno(prio, errno, "Failed to block signal mask: %m");
×
1406
                saved_ssp = &saved_ss;
25,841✔
1407
        }
1408

1409
        if (FLAGS_SET(flags, FORK_DETACH)) {
30,172✔
1410
                /* Fork off intermediary child if needed */
1411

1412
                r = is_reaper_process();
80✔
1413
                if (r < 0)
80✔
1414
                        return log_full_errno(prio, r, "Failed to determine if we are a reaper process: %m");
×
1415

1416
                if (!r) {
80✔
1417
                        /* Not a reaper process, hence do a double fork() so we are reparented to one */
1418

1419
                        if (ret && socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, pidref_transport_fds) < 0)
11✔
1420
                                return log_full_errno(prio, errno, "Failed to allocate pidref socket: %m");
×
1421

1422
                        pid = fork();
11✔
1423
                        if (pid < 0)
28✔
1424
                                return log_full_errno(prio, errno, "Failed to fork off '%s': %m", strna(name));
×
1425
                        if (pid > 0) {
28✔
1426
                                log_debug("Successfully forked off intermediary '%s' as PID " PID_FMT ".", strna(name), pid);
11✔
1427

1428
                                pidref_transport_fds[1] = safe_close(pidref_transport_fds[1]);
11✔
1429

1430
                                if (pidref_transport_fds[0] >= 0) {
11✔
1431
                                        /* Wait for the intermediary child to exit so the caller can be
1432
                                         * certain the actual child process has been reparented by the time
1433
                                         * this function returns. */
1434
                                        r = pidref_wait_for_terminate_and_check(
10✔
1435
                                                        name,
1436
                                                        &PIDREF_MAKE_FROM_PID(pid),
10✔
1437
                                                        FLAGS_SET(flags, FORK_LOG) ? WAIT_LOG : 0);
1438
                                        if (r < 0)
10✔
1439
                                                return log_full_errno(prio, r, "Failed to wait for intermediary process: %m");
×
1440
                                        if (r != EXIT_SUCCESS) /* exit status > 0 should be treated as failure, too */
10✔
1441
                                                return -EPROTO;
1442

1443
                                        int pidfd;
10✔
1444
                                        ssize_t n = receive_one_fd_iov(
20✔
1445
                                                        pidref_transport_fds[0],
1446
                                                        &IOVEC_MAKE(&pid, sizeof(pid)),
10✔
1447
                                                        /* iovlen= */ 1,
1448
                                                        /* flags= */ 0,
1449
                                                        &pidfd);
1450
                                        if (n < 0)
10✔
1451
                                                return log_full_errno(prio, n, "Failed to receive child pidref: %m");
×
1452

1453
                                        *ret = (PidRef) { .pid = pid, .fd = pidfd };
10✔
1454
                                }
1455

1456
                                return 1; /* return in the parent */
11✔
1457
                        }
1458

1459
                        pidref_transport_fds[0] = safe_close(pidref_transport_fds[0]);
17✔
1460
                        intermediary = true;
17✔
1461
                }
1462
        }
1463

1464
        if ((flags & (FORK_NEW_MOUNTNS|FORK_NEW_USERNS|FORK_NEW_NETNS|FORK_NEW_PIDNS)) != 0)
30,178✔
1465
                pid = raw_clone(SIGCHLD|
6,402✔
1466
                                (FLAGS_SET(flags, FORK_NEW_MOUNTNS) ? CLONE_NEWNS : 0) |
6,402✔
1467
                                (FLAGS_SET(flags, FORK_NEW_USERNS) ? CLONE_NEWUSER : 0) |
6,402✔
1468
                                (FLAGS_SET(flags, FORK_NEW_NETNS) ? CLONE_NEWNET : 0) |
6,402✔
1469
                                (FLAGS_SET(flags, FORK_NEW_PIDNS) ? CLONE_NEWPID : 0));
6,402✔
1470
        else
1471
                pid = fork();
23,776✔
1472
        if (pid < 0)
62,108✔
1473
                return log_full_errno(prio, errno, "Failed to fork off '%s': %m", strna(name));
2✔
1474
        if (pid > 0) {
62,107✔
1475

1476
                /* If we are in the intermediary process, exit now */
1477
                if (intermediary) {
29,720✔
1478
                        if (pidref_transport_fds[1] >= 0) {
11✔
1479
                                _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
10✔
1480

1481
                                r = pidref_set_pid(&pidref, pid);
10✔
1482
                                if (r < 0) {
10✔
1483
                                        log_full_errno(prio, r, "Failed to open reference to PID "PID_FMT": %m", pid);
×
1484
                                        _exit(EXIT_FAILURE);
×
1485
                                }
1486

1487
                                r = send_one_fd_iov(
10✔
1488
                                                pidref_transport_fds[1],
1489
                                                pidref.fd,
1490
                                                &IOVEC_MAKE(&pidref.pid, sizeof(pidref.pid)),
1491
                                                /* iovlen= */ 1,
1492
                                                /* flags= */ 0);
1493
                                if (r < 0) {
10✔
1494
                                        log_full_errno(prio, r, "Failed to send child pidref: %m");
×
1495
                                        _exit(EXIT_FAILURE);
×
1496
                                }
1497
                        }
1498

1499
                        _exit(EXIT_SUCCESS);
11✔
1500
                }
1501

1502
                /* We are in the parent process */
1503
                log_debug("Successfully forked off '%s' as PID " PID_FMT ".", strna(name), pid);
29,709✔
1504

1505
                if (flags & FORK_WAIT) {
29,709✔
1506
                        if (block_all) {
966✔
1507
                                /* undo everything except SIGCHLD */
1508
                                ss = saved_ss;
802✔
1509
                                assert_se(sigaddset(&ss, SIGCHLD) >= 0);
802✔
1510
                                (void) sigprocmask(SIG_SETMASK, &ss, NULL);
802✔
1511
                        }
1512

1513
                        r = pidref_wait_for_terminate_and_check(
966✔
1514
                                        name,
1515
                                        &PIDREF_MAKE_FROM_PID(pid),
966✔
1516
                                        FLAGS_SET(flags, FORK_LOG) ? WAIT_LOG : 0);
1517
                        if (r < 0)
966✔
1518
                                return r;
966✔
1519
                        if (r != EXIT_SUCCESS) /* exit status > 0 should be treated as failure, too */
966✔
1520
                                return -EPROTO;
1521

1522
                        /* If we are in the parent and successfully waited, then the process doesn't exist anymore. */
1523
                        if (ret)
958✔
1524
                                *ret = PIDREF_NULL;
28✔
1525

1526
                        return 1;
958✔
1527
                }
1528

1529
                if (ret) {
28,743✔
1530
                        r = pidref_set_pid(ret, pid);
27,485✔
1531
                        if (r < 0) /* Let's not fail for this, no matter what, the process exists after all, and that's key */
27,485✔
1532
                                *ret = PIDREF_MAKE_FROM_PID(pid);
×
1533
                }
1534

1535
                return 1;
28,743✔
1536
        }
1537

1538
        /* We are in the child process */
1539

1540
        pidref_transport_fds[1] = safe_close(pidref_transport_fds[1]);
32,387✔
1541

1542
        /* Restore signal mask manually */
1543
        saved_ssp = NULL;
32,387✔
1544

1545
        if (flags & FORK_REOPEN_LOG) {
32,387✔
1546
                /* Close the logs if requested, before we log anything. And make sure we reopen it if needed. */
1547
                log_close();
8,327✔
1548
                log_set_open_when_needed(true);
8,327✔
1549
                log_settle_target();
8,327✔
1550
        }
1551

1552
        if (name) {
32,387✔
1553
                r = rename_process(name);
32,387✔
1554
                if (r < 0)
32,387✔
1555
                        log_full_errno(flags & FORK_LOG ? LOG_WARNING : LOG_DEBUG,
×
1556
                                       r, "Failed to rename process, ignoring: %m");
1557
        }
1558

1559
        /* let's disable dlopen() in the child, as a paranoia safety precaution: children should not live for
1560
         * long and only do minimal work before exiting or exec()ing. Doing dlopen() is not either. If people
1561
         * want dlopen() they should do it before forking. This is a safety precaution in particular for
1562
         * cases where the child does namespace shenanigans: we should never end up loading a module from a
1563
         * foreign environment. Note that this has no effect on NSS! (i.e. it only has effect on uses of our
1564
         * dlopen_safe(), which we use comprehensively in our codebase, but glibc NSS doesn't bother, of
1565
         * course.) */
1566
        if (!FLAGS_SET(flags, FORK_ALLOW_DLOPEN))
32,387✔
1567
                block_dlopen();
32,348✔
1568

1569
        if (flags & (FORK_DEATHSIG_SIGTERM|FORK_DEATHSIG_SIGINT|FORK_DEATHSIG_SIGKILL))
32,387✔
1570
                if (prctl(PR_SET_PDEATHSIG, fork_flags_to_signal(flags)) < 0) {
31,031✔
1571
                        log_full_errno(prio, errno, "Failed to set death signal: %m");
×
1572
                        _exit(EXIT_FAILURE);
×
1573
                }
1574

1575
        if (flags & FORK_RESET_SIGNALS) {
32,387✔
1576
                r = reset_all_signal_handlers();
26,894✔
1577
                if (r < 0) {
26,894✔
1578
                        log_full_errno(prio, r, "Failed to reset signal handlers: %m");
×
1579
                        _exit(EXIT_FAILURE);
×
1580
                }
1581

1582
                /* This implicitly undoes the signal mask stuff we did before the fork()ing above */
1583
                r = reset_signal_mask();
26,894✔
1584
                if (r < 0) {
26,894✔
1585
                        log_full_errno(prio, r, "Failed to reset signal mask: %m");
×
1586
                        _exit(EXIT_FAILURE);
×
1587
                }
1588
        } else if (block_signals) { /* undo what we did above */
5,493✔
1589
                if (sigprocmask(SIG_SETMASK, &saved_ss, NULL) < 0) {
5,007✔
1590
                        log_full_errno(prio, errno, "Failed to restore signal mask: %m");
×
1591
                        _exit(EXIT_FAILURE);
×
1592
                }
1593
        }
1594

1595
        if (flags & (FORK_DEATHSIG_SIGTERM|FORK_DEATHSIG_SIGKILL|FORK_DEATHSIG_SIGINT)) {
32,387✔
1596
                pid_t ppid;
31,031✔
1597
                /* Let's see if the parent PID is still the one we started from? If not, then the parent
1598
                 * already died by the time we set PR_SET_PDEATHSIG, hence let's emulate the effect */
1599

1600
                ppid = getppid();
31,031✔
1601
                if (ppid == 0)
31,031✔
1602
                        /* Parent is in a different PID namespace. */;
1603
                else if (ppid != original_pid) {
30,993✔
1604
                        int sig = fork_flags_to_signal(flags);
×
1605
                        log_debug("Parent died early, raising %s.", signal_to_string(sig));
×
1606
                        (void) raise(sig);
×
1607
                        _exit(EXIT_FAILURE);
×
1608
                }
1609
        }
1610

1611
        if (FLAGS_SET(flags, FORK_NEW_MOUNTNS | FORK_MOUNTNS_SLAVE)) {
32,387✔
1612
                /* Optionally, make sure we never propagate mounts to the host. */
1613
                if (mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL) < 0) {
195✔
1614
                        log_full_errno(prio, errno, "Failed to remount root directory as MS_SLAVE: %m");
×
1615
                        _exit(EXIT_FAILURE);
×
1616
                }
1617
        }
1618

1619
        if (FLAGS_SET(flags, FORK_PRIVATE_TMP)) {
32,387✔
1620
                assert(FLAGS_SET(flags, FORK_NEW_MOUNTNS));
×
1621

1622
                /* Optionally, overmount new tmpfs instance on /tmp/. */
1623
                r = mount_nofollow("tmpfs", "/tmp", "tmpfs",
×
1624
                                   MS_NOSUID|MS_NODEV,
1625
                                   "mode=01777" TMPFS_LIMITS_RUN);
1626
                if (r < 0) {
×
1627
                        log_full_errno(prio, r, "Failed to overmount /tmp/: %m");
×
1628
                        _exit(EXIT_FAILURE);
×
1629
                }
1630
        }
1631

1632
        if (flags & FORK_REARRANGE_STDIO) {
32,387✔
1633
                if (stdio_fds) {
16,177✔
1634
                        r = rearrange_stdio(stdio_fds[0], stdio_fds[1], stdio_fds[2]);
16,161✔
1635
                        if (r < 0) {
16,161✔
1636
                                log_full_errno(prio, r, "Failed to rearrange stdio fds: %m");
×
1637
                                _exit(EXIT_FAILURE);
×
1638
                        }
1639

1640
                        /* Turn off O_NONBLOCK on the fdio fds, in case it was left on */
1641
                        stdio_disable_nonblock();
16,161✔
1642
                } else {
1643
                        r = make_null_stdio();
16✔
1644
                        if (r < 0) {
16✔
1645
                                log_full_errno(prio, r, "Failed to connect stdin/stdout to /dev/null: %m");
×
1646
                                _exit(EXIT_FAILURE);
×
1647
                        }
1648
                }
1649
        } else if (flags & FORK_STDOUT_TO_STDERR) {
16,210✔
1650
                if (dup2(STDERR_FILENO, STDOUT_FILENO) < 0) {
2✔
1651
                        log_full_errno(prio, errno, "Failed to connect stdout to stderr: %m");
×
1652
                        _exit(EXIT_FAILURE);
×
1653
                }
1654
        }
1655

1656
        if (flags & FORK_CLOSE_ALL_FDS) {
32,387✔
1657
                /* Close the logs here in case it got reopened above, as close_all_fds() would close them for us */
1658
                log_close();
25,578✔
1659

1660
                r = close_all_fds(except_fds, n_except_fds);
25,578✔
1661
                if (r < 0) {
25,578✔
1662
                        log_full_errno(prio, r, "Failed to close all file descriptors: %m");
×
1663
                        _exit(EXIT_FAILURE);
×
1664
                }
1665
        }
1666

1667
        if (flags & FORK_PACK_FDS) {
32,387✔
1668
                /* FORK_CLOSE_ALL_FDS ensures that except_fds are the only FDs >= 3 that are
1669
                 * open, this is including the log. This is required by pack_fds, which will
1670
                 * get stuck in an infinite loop of any FDs other than except_fds are open. */
1671
                assert(FLAGS_SET(flags, FORK_CLOSE_ALL_FDS));
128✔
1672

1673
                r = pack_fds(except_fds, n_except_fds);
128✔
1674
                if (r < 0) {
128✔
1675
                        log_full_errno(prio, r, "Failed to pack file descriptors: %m");
×
1676
                        _exit(EXIT_FAILURE);
×
1677
                }
1678
        }
1679

1680
        if (flags & FORK_CLOEXEC_OFF) {
32,387✔
1681
                r = fd_cloexec_many(except_fds, n_except_fds, false);
143✔
1682
                if (r < 0) {
143✔
1683
                        log_full_errno(prio, r, "Failed to turn off O_CLOEXEC on file descriptors: %m");
×
1684
                        _exit(EXIT_FAILURE);
×
1685
                }
1686
        }
1687

1688
        /* When we were asked to reopen the logs, do so again now */
1689
        if (flags & FORK_REOPEN_LOG) {
32,387✔
1690
                log_open();
8,327✔
1691
                log_set_open_when_needed(false);
8,327✔
1692
        }
1693

1694
        if (flags & FORK_RLIMIT_NOFILE_SAFE) {
32,387✔
1695
                r = rlimit_nofile_safe();
17,035✔
1696
                if (r < 0) {
17,035✔
1697
                        log_full_errno(prio, r, "Failed to lower RLIMIT_NOFILE's soft limit to 1K: %m");
×
1698
                        _exit(EXIT_FAILURE);
×
1699
                }
1700
        }
1701

1702
        if (!FLAGS_SET(flags, FORK_KEEP_NOTIFY_SOCKET)) {
32,387✔
1703
                r = RET_NERRNO(unsetenv("NOTIFY_SOCKET"));
32,387✔
1704
                if (r < 0) {
×
1705
                        log_full_errno(prio, r, "Failed to unset $NOTIFY_SOCKET: %m");
×
1706
                        _exit(EXIT_FAILURE);
×
1707
                }
1708
        }
1709

1710
        if (FLAGS_SET(flags, FORK_FREEZE))
32,387✔
1711
                freeze();
×
1712

1713
        if (ret) {
32,387✔
1714
                r = pidref_set_self(ret);
30,176✔
1715
                if (r < 0) {
30,176✔
1716
                        log_full_errno(prio, r, "Failed to acquire PID reference on ourselves: %m");
×
1717
                        _exit(EXIT_FAILURE);
×
1718
                }
1719
        }
1720

1721
        return 0;
1722
}
1723

1724
int namespace_fork_full(
140✔
1725
                const char *outer_name,
1726
                const char *inner_name,
1727
                int except_fds[],
1728
                size_t n_except_fds,
1729
                ForkFlags flags,
1730
                int pidns_fd,
1731
                int mntns_fd,
1732
                int netns_fd,
1733
                int userns_fd,
1734
                int root_fd,
1735
                PidRef *ret) {
1736

1737
        _cleanup_(pidref_done_sigkill_wait) PidRef pidref_outer = PIDREF_NULL;
×
1738
        _cleanup_close_pair_ int errno_pipe_fd[2] = EBADF_PAIR;
237✔
1739
        int r, prio = FLAGS_SET(flags, FORK_LOG) ? LOG_ERR : LOG_DEBUG;
140✔
1740

1741
        /* This is much like safe_fork(), but forks twice, and joins the specified namespaces in the middle
1742
         * process. This ensures that we are fully a member of the destination namespace, with pidns an all, so that
1743
         * /proc/self/fd works correctly.
1744
         *
1745
         * TODO: once we can rely on PIDFD_INFO_EXIT, do not keep the middle process around and instead
1746
         * return the pidfd of the inner process for direct tracking. */
1747

1748
        /* Insist on PDEATHSIG being enabled, as the pid returned is the one of the middle man, and otherwise
1749
         * killing of it won't be propagated to the inner child. */
1750
        assert((flags & (FORK_DEATHSIG_SIGKILL|FORK_DEATHSIG_SIGTERM|FORK_DEATHSIG_SIGINT)) != 0);
140✔
1751
        assert((flags & (FORK_DETACH|FORK_FREEZE)) == 0);
140✔
1752
        assert(!FLAGS_SET(flags, FORK_ALLOW_DLOPEN)); /* never allow loading shared library from another ns */
140✔
1753

1754
        /* We want read() to block as a synchronization point */
1755
        assert_cc(sizeof(int) <= PIPE_BUF);
140✔
1756
        if (pipe2(errno_pipe_fd, O_CLOEXEC) < 0)
140✔
1757
                return log_full_errno(prio, errno, "Failed to create pipe: %m");
×
1758

1759
        r = pidref_safe_fork_full(
376✔
1760
                        outer_name,
1761
                        /* stdio_fds= */ NULL, /* except_fds= */ NULL, /* n_except_fds= */ 0,
1762
                        (flags|FORK_DEATHSIG_SIGKILL) & ~(FORK_DEATHSIG_SIGTERM|FORK_DEATHSIG_SIGINT|FORK_REOPEN_LOG|FORK_NEW_MOUNTNS|FORK_MOUNTNS_SLAVE|FORK_NEW_USERNS|FORK_NEW_NETNS|FORK_NEW_PIDNS|FORK_CLOSE_ALL_FDS|FORK_PACK_FDS|FORK_CLOEXEC_OFF|FORK_RLIMIT_NOFILE_SAFE),
140✔
1763
                        &pidref_outer);
1764
        if (r == -EPROTO && FLAGS_SET(flags, FORK_WAIT)) {
236✔
1765
                errno_pipe_fd[1] = safe_close(errno_pipe_fd[1]);
8✔
1766

1767
                int k = read_errno(errno_pipe_fd[0]);
8✔
1768
                if (k < 0 && k != -EIO)
8✔
1769
                        return k;
1770
        }
1771
        if (r < 0)
236✔
1772
                return r;
1773
        if (r == 0) {
228✔
1774
                _cleanup_(pidref_done) PidRef pidref_inner = PIDREF_NULL;
×
1775

1776
                /* Child */
1777

1778
                errno_pipe_fd[0] = safe_close(errno_pipe_fd[0]);
96✔
1779

1780
                r = namespace_enter(pidns_fd, mntns_fd, netns_fd, userns_fd, root_fd);
96✔
1781
                if (r < 0) {
96✔
1782
                        log_full_errno(prio, r, "Failed to join namespace: %m");
×
1783
                        report_errno_and_exit(errno_pipe_fd[1], r);
×
1784
                }
1785

1786
                /* We mask a few flags here that either make no sense for the grandchild, or that we don't have to do again */
1787
                r = pidref_safe_fork_full(
289✔
1788
                                inner_name,
1789
                                NULL,
1790
                                except_fds, n_except_fds,
1791
                                flags & ~(FORK_WAIT|FORK_RESET_SIGNALS|FORK_REARRANGE_STDIO|FORK_FLUSH_STDIO|FORK_STDOUT_TO_STDERR),
96✔
1792
                                &pidref_inner);
1793
                if (r < 0)
193✔
1794
                        report_errno_and_exit(errno_pipe_fd[1], r);
×
1795
                if (r == 0) {
193✔
1796
                        /* Child */
1797

1798
                        if (!FLAGS_SET(flags, FORK_CLOSE_ALL_FDS)) {
97✔
1799
                                errno_pipe_fd[1] = safe_close(errno_pipe_fd[1]);
94✔
1800
                                pidref_done(&pidref_outer);
94✔
1801
                        } else {
1802
                                errno_pipe_fd[1] = -EBADF;
3✔
1803
                                pidref_outer = PIDREF_NULL;
3✔
1804
                        }
1805

1806
                        if (ret)
97✔
1807
                                *ret = TAKE_PIDREF(pidref_inner);
97✔
1808
                        return 0;
97✔
1809
                }
1810

1811
                log_forget_fds();
96✔
1812
                log_set_open_when_needed(true);
96✔
1813

1814
                (void) close_all_fds(&pidref_inner.fd, 1);
96✔
1815

1816
                r = pidref_wait_for_terminate_and_check(
192✔
1817
                                inner_name,
1818
                                &pidref_inner,
1819
                                FLAGS_SET(flags, FORK_LOG) ? WAIT_LOG : 0);
1820
                if (r < 0)
96✔
1821
                        _exit(EXIT_FAILURE);
×
1822

1823
                _exit(r);
96✔
1824
        }
1825

1826
        errno_pipe_fd[1] = safe_close(errno_pipe_fd[1]);
132✔
1827

1828
        r = read_errno(errno_pipe_fd[0]);
132✔
1829
        if (r < 0)
132✔
1830
                return r; /* the child logs about failures on its own, no need to duplicate here */
1831

1832
        if (ret)
132✔
1833
                *ret = TAKE_PIDREF(pidref_outer);
106✔
1834
        else
1835
                pidref_done(&pidref_outer); /* disarm sigkill_wait */
26✔
1836

1837
        return 1;
1838
}
1839

1840
/* Returns true if 'oa' lies within the inclusive range OOM_SCORE_ADJ_MIN…OOM_SCORE_ADJ_MAX. */
bool oom_score_adjust_is_valid(int oa) {
        return oa >= OOM_SCORE_ADJ_MIN && oa <= OOM_SCORE_ADJ_MAX;
}
1843

1844
int set_oom_score_adjust(int value) {
3,748✔
1845
        char t[DECIMAL_STR_MAX(int)];
3,748✔
1846

1847
        if (!oom_score_adjust_is_valid(value))
3,748✔
1848
                return -EINVAL;
3,748✔
1849

1850
        xsprintf(t, "%i", value);
3,748✔
1851

1852
        return write_string_file("/proc/self/oom_score_adj", t,
3,748✔
1853
                                 WRITE_STRING_FILE_VERIFY_ON_FAILURE|WRITE_STRING_FILE_DISABLE_BUFFER);
1854
}
1855

1856
int get_oom_score_adjust(int *ret) {
2,519✔
1857
        _cleanup_free_ char *t = NULL;
2,519✔
1858
        int r, a;
2,519✔
1859

1860
        r = read_virtual_file("/proc/self/oom_score_adj", SIZE_MAX, &t, NULL);
2,519✔
1861
        if (r < 0)
2,519✔
1862
                return r;
1863

1864
        delete_trailing_chars(t, WHITESPACE);
2,519✔
1865

1866
        r = safe_atoi(t, &a);
2,519✔
1867
        if (r < 0)
2,519✔
1868
                return r;
1869

1870
        if (!oom_score_adjust_is_valid(a))
2,519✔
1871
                return -ENODATA;
1872

1873
        if (ret)
2,519✔
1874
                *ret = a;
2,519✔
1875

1876
        return 0;
1877
}
1878

1879
/* Maps a resource limit value to a nice level: values <= 1 map to PRIO_MAX-1, values >= PRIO_MAX-PRIO_MIN
 * map to PRIO_MIN, and everything in between maps to PRIO_MAX - limit. */
static int rlimit_to_nice(rlim_t limit) {
        if (limit <= 1)
                return PRIO_MAX-1; /* i.e. 19 */

        if (limit >= -PRIO_MIN + PRIO_MAX)
                return PRIO_MIN; /* i.e. -20 */

        /* Safe to cast: at this point 1 < limit < PRIO_MAX - PRIO_MIN */
        return PRIO_MAX - (int) limit;
}
1888

1889
/* Tries to set the nice level of the calling process to 'priority'. If that is refused for lack of
 * privileges, falls back to the closest level permitted by RLIMIT_NICE.
 *
 * Returns 1 if the requested level is in effect, 0 if only a next-best level could be applied, and a
 * negative errno-style error otherwise. */
int setpriority_closest(int priority) {
        struct rlimit highest;
        int r, current, limit;

        /* Try to set requested nice level */
        r = RET_NERRNO(setpriority(PRIO_PROCESS, 0, priority));
        if (r >= 0)
                return 1;
        if (!ERRNO_IS_NEG_PRIVILEGE(r))
                return r;

        /* getpriority() may legitimately return -1, hence errors are detected via errno only */
        errno = 0;
        current = getpriority(PRIO_PROCESS, 0);
        if (errno != 0)
                return -errno;

        if (priority == current)
                return 1;

       /* Hmm, we'd expect that raising the nice level from our status quo would always work. If it doesn't,
        * then the whole setpriority() system call is blocked to us, hence let's propagate the error
        * right-away */
        if (priority > current)
                return r;

        if (getrlimit(RLIMIT_NICE, &highest) < 0)
                return -errno;

        limit = rlimit_to_nice(highest.rlim_cur);

        /* Push to the allowed limit if we're higher than that. Note that we could also be less nice than
         * limit allows us, but still higher than what's requested. In that case our current value is
         * the best choice. */
        if (current > limit)
                if (setpriority(PRIO_PROCESS, 0, limit) < 0)
                        return -errno;

        log_debug("Cannot set requested nice level (%i), using next best (%i).", priority, MIN(current, limit));
        return 0;
}
1929

1930
_noreturn_ void freeze(void) {
×
1931
        log_close();
×
1932

1933
        /* Make sure nobody waits for us (i.e. on one of our sockets) anymore. Note that we use
1934
         * close_all_fds_without_malloc() instead of plain close_all_fds() here, since we want this function
1935
         * to be compatible with being called from signal handlers. */
1936
        (void) close_all_fds_without_malloc(NULL, 0);
×
1937

1938
        /* Let's not freeze right away, but keep reaping zombies. */
1939
        for (;;) {
×
1940
                siginfo_t si = {};
×
1941

1942
                if (waitid(P_ALL, 0, &si, WEXITED) < 0 && errno != EINTR)
×
1943
                        break;
1944
        }
1945

1946
        /* waitid() failed with an ECHLD error (because there are no left-over child processes) or any other
1947
         * (unexpected) error. Freeze for good now! */
1948
        for (;;)
×
1949
                pause();
×
1950
}
1951

1952
int get_process_threads(pid_t pid) {
7✔
1953
        _cleanup_free_ char *t = NULL;
7✔
1954
        int n, r;
7✔
1955

1956
        if (pid < 0)
7✔
1957
                return -EINVAL;
1958

1959
        r = procfs_file_get_field(pid, "status", "Threads", &t);
7✔
1960
        if (r == -ENOENT)
7✔
1961
                return -ESRCH;
1962
        if (r < 0)
7✔
1963
                return r;
1964

1965
        r = safe_atoi(t, &n);
7✔
1966
        if (r < 0)
7✔
1967
                return r;
1968
        if (n < 0)
7✔
1969
                return -EINVAL;
×
1970

1971
        return n;
1972
}
1973

1974
/* Returns > 0 if we are a reaper process, 0 if not, and a negative errno-style error if the
 * PR_GET_CHILD_SUBREAPER query fails. */
int is_reaper_process(void) {
        int b = 0;

        /* Checks if we are running in a reaper process, i.e. if we are expected to deal with processes
         * reparented to us. This simply checks if we are PID 1 or if PR_SET_CHILD_SUBREAPER was called. */

        if (getpid_cached() == 1)
                return true;

        /* The kernel stores the current subreaper setting in the int the second argument points to */
        if (prctl(PR_GET_CHILD_SUBREAPER, (unsigned long) &b, 0UL, 0UL, 0UL) < 0)
                return -errno;

        return b != 0;
}
1988

1989
/* Turns the child subreaper setting of the calling process on or off, according to 'b'.
 *
 * When running as PID 1 nothing is changed: enabling is a no-op that returns 0, disabling is refused
 * with -EINVAL. Otherwise returns 0 on success or a negative errno-style error if prctl() fails. */
int make_reaper_process(bool b) {

        if (getpid_cached() == 1) {

                if (!b)
                        return -EINVAL;

                return 0;
        }

        /* Some prctl()s insist that all 5 arguments are specified, others do not. Let's always specify all,
         * to avoid any ambiguities */
        if (prctl(PR_SET_CHILD_SUBREAPER, (unsigned long) b, 0UL, 0UL, 0UL) < 0)
                return -errno;

        return 0;
}
2006

2007
DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(posix_spawnattr_t*, posix_spawnattr_destroy, NULL);
×
2008

2009
int posix_spawn_wrapper(
2,530✔
2010
                const char *path,
2011
                char * const *argv,
2012
                char * const *envp,
2013
                const char *cgroup,
2014
                PidRef *ret_pidref) {
2015

2016
        short flags = POSIX_SPAWN_SETSIGMASK;
2,530✔
2017
        posix_spawnattr_t attr;
2,530✔
2018
        sigset_t mask;
2,530✔
2019
        int r;
2,530✔
2020

2021
        /* Forks and invokes 'path' with 'argv' and 'envp' using CLONE_VM and CLONE_VFORK, which means the
2022
         * caller will be blocked until the child either exits or exec's. The memory of the child will be
2023
         * fully shared with the memory of the parent, so that there are no copy-on-write or memory.max
2024
         * issues.
2025
         *
2026
         * Also, move the newly-created process into 'cgroup' through POSIX_SPAWN_SETCGROUP (clone3())
2027
         * if available.
2028
         * returns 1: We're already in the right cgroup
2029
         *         0: 'cgroup' not specified or POSIX_SPAWN_SETCGROUP is not supported. The caller
2030
         *            needs to call 'cg_attach' on their own */
2031

2032
        assert(path);
2,530✔
2033
        assert(argv);
2,530✔
2034
        assert(ret_pidref);
2,530✔
2035

2036
        assert_se(sigfillset(&mask) >= 0);
2,530✔
2037

2038
        r = posix_spawnattr_init(&attr);
2,530✔
2039
        if (r != 0)
2,530✔
2040
                return -r; /* These functions return a positive errno on failure */
2,530✔
2041

2042
        /* Initialization needs to succeed before we can set up a destructor. */
2043
        _unused_ _cleanup_(posix_spawnattr_destroyp) posix_spawnattr_t *attr_destructor = &attr;
5,060✔
2044

2045
#if HAVE_PIDFD_SPAWN
2046
        static bool have_clone_into_cgroup = true; /* kernel 5.7+ */
2,530✔
2047
        _cleanup_close_ int cgroup_fd = -EBADF;
2,530✔
2048

2049
        if (cgroup && have_clone_into_cgroup) {
2,530✔
2050
                _cleanup_free_ char *resolved_cgroup = NULL;
2,530✔
2051

2052
                r = cg_get_path(cgroup, /* suffix= */ NULL, &resolved_cgroup);
2,530✔
2053
                if (r < 0)
2,530✔
2054
                        return r;
2055

2056
                cgroup_fd = open(resolved_cgroup, O_PATH|O_DIRECTORY|O_CLOEXEC);
2,530✔
2057
                if (cgroup_fd < 0)
2,530✔
2058
                        return -errno;
×
2059

2060
                r = posix_spawnattr_setcgroup_np(&attr, cgroup_fd);
2,530✔
2061
                if (r != 0)
2,530✔
2062
                        return -r;
×
2063

2064
                flags |= POSIX_SPAWN_SETCGROUP;
2,530✔
2065
        }
2066
#endif
2067

2068
        r = posix_spawnattr_setflags(&attr, flags);
2,530✔
2069
        if (r != 0)
2,530✔
2070
                return -r;
×
2071
        r = posix_spawnattr_setsigmask(&attr, &mask);
2,530✔
2072
        if (r != 0)
2,530✔
2073
                return -r;
×
2074

2075
#if HAVE_PIDFD_SPAWN
2076
        _cleanup_close_ int pidfd = -EBADF;
2,530✔
2077

2078
        r = pidfd_spawn(&pidfd, path, NULL, &attr, argv, envp);
2,530✔
2079
        if (ERRNO_IS_NOT_SUPPORTED(r) && FLAGS_SET(flags, POSIX_SPAWN_SETCGROUP) && cg_is_threaded(cgroup) > 0)
2,530✔
2080
                return -EUCLEAN; /* clone3() could also return EOPNOTSUPP if the target cgroup is in threaded mode,
2081
                                    turn that into something recognizable */
2082
        if ((ERRNO_IS_NOT_SUPPORTED(r) || ERRNO_IS_PRIVILEGE(r)) &&
2,530✔
2083
            FLAGS_SET(flags, POSIX_SPAWN_SETCGROUP)) {
2084
                /* Compiled on a newer host, or seccomp&friends blocking clone3()? Fallback, but
2085
                 * need to disable POSIX_SPAWN_SETCGROUP, which is what redirects to clone3().
2086
                 * CLONE_INTO_CGROUP definitely won't work, hence remember the fact so that we don't
2087
                 * retry every time.
2088
                 * Note, CLONE_INTO_CGROUP is supported since kernel v5.7, but some architectures still
2089
                 * do not support clone3(). Hence, we need to keep the fallback logic for a while. */
2090
                have_clone_into_cgroup = false;
×
2091

2092
                flags &= ~POSIX_SPAWN_SETCGROUP;
×
2093
                r = posix_spawnattr_setflags(&attr, flags);
×
2094
                if (r != 0)
×
2095
                        return -r;
×
2096

2097
                r = pidfd_spawn(&pidfd, path, NULL, &attr, argv, envp);
×
2098
        }
2099
        if (r != 0)
2,530✔
2100
                return -r;
×
2101

2102
        r = pidref_set_pidfd_consume(ret_pidref, TAKE_FD(pidfd));
2,530✔
2103
        if (r < 0)
2,530✔
2104
                return r;
2105

2106
        return FLAGS_SET(flags, POSIX_SPAWN_SETCGROUP);
2,530✔
2107
#else
2108
        pid_t pid;
2109

2110
        r = posix_spawn(&pid, path, NULL, &attr, argv, envp);
2111
        if (r != 0)
2112
                return -r;
2113

2114
        r = pidref_set_pid(ret_pidref, pid);
2115
        if (r < 0)
2116
                return r;
2117

2118
        return 0; /* We did not use CLONE_INTO_CGROUP so return 0, the caller will have to move the child */
2119
#endif
2120
}
2121

2122
/* Opens /proc as a directory stream. On success returns 0 and stores the stream in '*ret'; on failure
 * returns the negative errno of opendir(). */
int proc_dir_open(DIR **ret) {
        DIR *d;

        assert(ret);

        d = opendir("/proc");
        if (!d)
                return -errno;

        *ret = d;
        return 0;
}
2134

2135
int proc_dir_read(DIR *d, pid_t *ret) {
1,165✔
2136
        assert(d);
1,165✔
2137

2138
        for (;;) {
1,949✔
2139
                struct dirent *de;
1,949✔
2140

2141
                errno = 0;
1,949✔
2142
                de = readdir_no_dot(d);
1,949✔
2143
                if (!de) {
1,949✔
2144
                        if (errno != 0)
13✔
2145
                                return -errno;
×
2146

2147
                        break;
13✔
2148
                }
2149

2150
                if (!IN_SET(de->d_type, DT_DIR, DT_UNKNOWN))
1,936✔
2151
                        continue;
641✔
2152

2153
                if (parse_pid(de->d_name, ret) >= 0)
1,295✔
2154
                        return 1;
2155
        }
2156

2157
        if (ret)
13✔
2158
                *ret = 0;
13✔
2159
        return 0;
2160
}
2161

2162
/* Like proc_dir_read(), but initializes a PidRef for the PID found via pidref_set_pid().
 *
 * Returns 1 if a process was found, 0 at the end of the directory (in which case '*ret' is set to
 * PIDREF_NULL if 'ret' is non-NULL), and a negative errno-style error on failure. */
int proc_dir_read_pidref(DIR *d, PidRef *ret) {
        int r;

        assert(d);

        for (;;) {
                pid_t pid;

                r = proc_dir_read(d, &pid);
                if (r < 0)
                        return r;
                if (r == 0)
                        break;

                r = pidref_set_pid(ret, pid);
                if (r == -ESRCH) /* gone by now? skip it */
                        continue;
                if (r < 0)
                        return r;

                return 1;
        }

        if (ret)
                *ret = PIDREF_NULL;
        return 0;
}
2189

2190
int safe_mlockall(int flags) {
162✔
2191
        int r;
162✔
2192

2193
        /* When dealing with sensitive data, let's lock ourselves into memory. We do this only when
2194
         * privileged however, as otherwise the amount of lockable memory that RLIMIT_MEMLOCK grants us is
2195
         * frequently too low to make this work. The resource limit has no effect on CAP_IPC_LOCK processes,
2196
         * hence that's the capability we check for. */
2197
        r = have_effective_cap(CAP_IPC_LOCK);
162✔
2198
        if (r < 0)
162✔
2199
                return log_debug_errno(r, "Failed to determine if we have CAP_IPC_LOCK: %m");
×
2200
        if (r == 0)
162✔
2201
                return log_debug_errno(SYNTHETIC_ERRNO(EPERM), "Lacking CAP_IPC_LOCK, skipping mlockall().");
×
2202

2203
        if (mlockall(flags) < 0)
162✔
2204
                return log_debug_errno(errno, "Failed to call mlockall(): %m");
×
2205

2206
        log_debug("Successfully called mlockall().");
162✔
2207
        return 0;
2208
}
2209

2210
static const char *const sigchld_code_table[] = {
2211
        [CLD_EXITED] = "exited",
2212
        [CLD_KILLED] = "killed",
2213
        [CLD_DUMPED] = "dumped",
2214
        [CLD_TRAPPED] = "trapped",
2215
        [CLD_STOPPED] = "stopped",
2216
        [CLD_CONTINUED] = "continued",
2217
};
2218

2219
DEFINE_STRING_TABLE_LOOKUP(sigchld_code, int);
9,303✔
2220

2221
static const char* const sched_policy_table[] = {
2222
        [SCHED_OTHER] = "other",
2223
        [SCHED_BATCH] = "batch",
2224
        [SCHED_IDLE]  = "idle",
2225
        [SCHED_FIFO]  = "fifo",
2226
        [SCHED_EXT]   = "ext",
2227
        [SCHED_RR]    = "rr",
2228
};
2229

2230
DEFINE_STRING_TABLE_LOOKUP_WITH_FALLBACK(sched_policy, int, INT_MAX);
59✔
2231

2232
_noreturn_ void report_errno_and_exit(int errno_fd, int error) {
196✔
2233
        int r;
196✔
2234

2235
        if (error >= 0)
196✔
2236
                _exit(EXIT_SUCCESS);
195✔
2237

2238
        assert(errno_fd >= 0);
1✔
2239

2240
        r = loop_write(errno_fd, &error, sizeof(error));
1✔
2241
        if (r < 0)
1✔
2242
                log_debug_errno(r, "Failed to write errno to errno_fd=%d: %m", errno_fd);
×
2243

2244
        _exit(EXIT_FAILURE);
1✔
2245
}
2246

2247
/* Reads an int-sized error code from 'errno_fd'.
 *
 * Returns 0 if EOF is hit without any data, or if the value read is 0. Returns the (negative) value read
 * if the other side reported an error. Returns -EIO if reading fails, if an unexpected number of bytes
 * is received, or if a positive value is received. */
int read_errno(int errno_fd) {
        int r;

        assert(errno_fd >= 0);

        /* The issue here is that it's impossible to distinguish between an error code returned by the child
         * and an IO error that arose when reading it. So, the function logs errors and returns EIO for the
         * latter case. */

        ssize_t n = loop_read(errno_fd, &r, sizeof(r), /* do_poll= */ false);
        if (n < 0) {
                log_debug_errno(n, "Failed to read errno: %m");
                return -EIO;
        }
        if (n == 0) /* the process exited without reporting an error, assuming success */
                return 0;
        if (n != sizeof(r))
                return log_debug_errno(SYNTHETIC_ERRNO(EIO), "Received unexpected amount of bytes (%zi) while reading errno.", n);

        if (r == 0)
                return 0;
        if (r < 0) /* child process reported an error, return it */
                return log_debug_errno(r, "Child process failed with errno: %m");

        return log_debug_errno(SYNTHETIC_ERRNO(EIO), "Received positive errno from child, refusing: %d", r);
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc