• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

systemd / systemd / 14815796853

02 May 2025 11:41AM UTC coverage: 72.24% (-0.003%) from 72.243%
14815796853

push

github

web-flow
Various changes to prepare for running IWYU on the repository (#37319)

These are various commits that were required to get things compiling
after running IWYU. I think all of them make sense on their own, hence
this split PR to merge them ahead of time.

81 of 96 new or added lines in 48 files covered. (84.38%)

209 existing lines in 39 files now uncovered.

297219 of 411432 relevant lines covered (72.24%)

693693.2 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

72.29
/src/core/exec-invoke.c
1
/* SPDX-License-Identifier: LGPL-2.1-or-later */
2

3
#include <linux/prctl.h>
4
#include <linux/sched.h>
5
#include <linux/securebits.h>
6
#include <sys/eventfd.h>
7
#include <sys/ioctl.h>
8
#include <sys/mount.h>
9
#include <sys/prctl.h>
10

11
#if HAVE_PAM
12
#include <security/pam_appl.h>
13
#include <security/pam_misc.h>
14
#endif
15

16
#include "sd-messages.h"
17

18
#include "apparmor-util.h"
19
#include "argv-util.h"
20
#include "ask-password-api.h"
21
#include "barrier.h"
22
#include "bitfield.h"
23
#include "bpf-dlopen.h"
24
#include "bpf-restrict-fs.h"
25
#include "btrfs-util.h"
26
#include "capability-util.h"
27
#include "cgroup.h"
28
#include "cgroup-setup.h"
29
#include "chase.h"
30
#include "chattr-util.h"
31
#include "chown-recursive.h"
32
#include "copy.h"
33
#include "dynamic-user.h"
34
#include "env-util.h"
35
#include "escape.h"
36
#include "exec-credential.h"
37
#include "exec-invoke.h"
38
#include "execute.h"
39
#include "exit-status.h"
40
#include "fd-util.h"
41
#include "hexdecoct.h"
42
#include "hostname-setup.h"
43
#include "image-policy.h"
44
#include "io-util.h"
45
#include "ioprio-util.h"
46
#include "iovec-util.h"
47
#include "journal-send.h"
48
#include "manager.h"
49
#include "memfd-util.h"
50
#include "missing_sched.h"
51
#include "missing_syscall.h"
52
#include "mkdir-label.h"
53
#include "mount-util.h"
54
#include "osc-context.h"
55
#include "proc-cmdline.h"
56
#include "process-util.h"
57
#include "psi-util.h"
58
#include "rlimit-util.h"
59
#include "seccomp-util.h"
60
#include "selinux-util.h"
61
#include "signal-util.h"
62
#include "smack-util.h"
63
#include "socket-util.h"
64
#include "string-table.h"
65
#include "strv.h"
66
#include "terminal-util.h"
67
#include "utmp-wtmp.h"
68
#include "vpick.h"
69

70
#define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
71
#define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
72

73
#define SNDBUF_SIZE (8*1024*1024)
74

75
/* Adjusts the file flags of the fds that are about to be passed on: FD_CLOEXEC is dropped from each of
 * the n_fds entries, while O_NONBLOCK is set or dropped (according to 'nonblock') only on the first
 * n_socket_fds entries, since O_NONBLOCK only applies to socket activation.
 *
 * Returns 0 on success, or the first negative error reported by fd_nonblock()/fd_cloexec(). */
static int flag_fds(
                const int fds[],
                size_t n_socket_fds,
                size_t n_fds,
                bool nonblock) {

        assert(fds || n_fds == 0);

        for (size_t idx = 0; idx < n_fds; idx++) {
                int k;

                /* Only the leading socket fds get their O_NONBLOCK flag adjusted */
                if (idx < n_socket_fds) {
                        k = fd_nonblock(fds[idx], nonblock);
                        if (k < 0)
                                return k;
                }

                /* FD_CLOEXEC is dropped unconditionally, since after all we want to pass these fds to
                 * our children */
                k = fd_cloexec(fds[idx], false);
                if (k < 0)
                        return k;
        }

        return 0;
}
107

108
/* Returns true if 'i' is one of the three EXEC_INPUT_TTY* settings. */
static bool is_terminal_input(ExecInput i) {
        switch (i) {

        case EXEC_INPUT_TTY:
        case EXEC_INPUT_TTY_FORCE:
        case EXEC_INPUT_TTY_FAIL:
                return true;

        default:
                return false;
        }
}
114

115
/* Returns true if 'o' is the TTY output setting or one of the two "...AND_CONSOLE" settings. */
static bool is_terminal_output(ExecOutput o) {
        switch (o) {

        case EXEC_OUTPUT_TTY:
        case EXEC_OUTPUT_KMSG_AND_CONSOLE:
        case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
                return true;

        default:
                return false;
        }
}
121

122
/* Returns true if 'o' is one of the two kmsg output settings (with or without console). */
static bool is_kmsg_output(ExecOutput o) {
        return o == EXEC_OUTPUT_KMSG ||
                o == EXEC_OUTPUT_KMSG_AND_CONSOLE;
}
127

128
static bool exec_context_needs_term(const ExecContext *c) {
9,523✔
129
        assert(c);
9,523✔
130

131
        /* Return true if the execution context suggests we should set $TERM to something useful. */
132

133
        if (is_terminal_input(c->std_input))
9,523✔
134
                return true;
135

136
        if (is_terminal_output(c->std_output))
9,355✔
137
                return true;
138

139
        if (is_terminal_output(c->std_error))
9,096✔
140
                return true;
141

142
        return !!c->tty_path;
9,095✔
143
}
144

145
static int open_null_as(int flags, int nfd) {
10,846✔
146
        int fd;
10,846✔
147

148
        assert(nfd >= 0);
10,846✔
149

150
        fd = open("/dev/null", flags|O_NOCTTY);
10,846✔
151
        if (fd < 0)
10,846✔
152
                return -errno;
×
153

154
        return move_fd(fd, nfd, false);
10,846✔
155
}
156

157
static int connect_journal_socket(
10,285✔
158
                int fd,
159
                const char *log_namespace,
160
                uid_t uid,
161
                gid_t gid) {
162

163
        uid_t olduid = UID_INVALID;
10,285✔
164
        gid_t oldgid = GID_INVALID;
10,285✔
165
        const char *j;
10,285✔
166
        int r;
10,285✔
167

168
        assert(fd >= 0);
10,285✔
169

170
        j = journal_stream_path(log_namespace);
10,297✔
171
        if (!j)
2✔
172
                return -EINVAL;
×
173

174
        if (gid_is_valid(gid)) {
10,285✔
175
                oldgid = getgid();
2,332✔
176

177
                if (setegid(gid) < 0)
2,332✔
178
                        return -errno;
×
179
        }
180

181
        if (uid_is_valid(uid)) {
10,285✔
182
                olduid = getuid();
2,329✔
183

184
                if (seteuid(uid) < 0) {
2,329✔
185
                        r = -errno;
×
186
                        goto restore_gid;
×
187
                }
188
        }
189

190
        r = connect_unix_path(fd, AT_FDCWD, j);
10,285✔
191

192
        /* If we fail to restore the uid or gid, things will likely fail later on. This should only happen if
193
           an LSM interferes. */
194

195
        if (uid_is_valid(uid))
10,285✔
196
                (void) seteuid(olduid);
2,329✔
197

198
 restore_gid:
7,956✔
199
        if (gid_is_valid(gid))
10,285✔
200
                (void) setegid(oldgid);
2,332✔
201

202
        return r;
203
}
204

205
static int connect_logger_as(
10,285✔
206
                const ExecContext *context,
207
                const ExecParameters *params,
208
                ExecOutput output,
209
                const char *ident,
210
                int nfd,
211
                uid_t uid,
212
                gid_t gid) {
213

214
        _cleanup_close_ int fd = -EBADF;
10,285✔
215
        int r;
10,285✔
216

217
        assert(context);
10,285✔
218
        assert(params);
10,285✔
219
        assert(output < _EXEC_OUTPUT_MAX);
10,285✔
220
        assert(ident);
10,285✔
221
        assert(nfd >= 0);
10,285✔
222

223
        fd = socket(AF_UNIX, SOCK_STREAM, 0);
10,285✔
224
        if (fd < 0)
10,285✔
225
                return -errno;
×
226

227
        r = connect_journal_socket(fd, context->log_namespace, uid, gid);
10,285✔
228
        if (r < 0)
10,285✔
229
                return r;
230

231
        if (shutdown(fd, SHUT_RD) < 0)
10,285✔
232
                return -errno;
×
233

234
        (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
10,285✔
235

236
        if (dprintf(fd,
19,827✔
237
                "%s\n"
238
                "%s\n"
239
                "%i\n"
240
                "%i\n"
241
                "%i\n"
242
                "%i\n"
243
                "%i\n",
244
                context->syslog_identifier ?: ident,
10,285✔
245
                params->flags & EXEC_PASS_LOG_UNIT ? params->unit_id : "",
10,285✔
246
                context->syslog_priority,
10,285✔
247
                !!context->syslog_level_prefix,
10,285✔
248
                false,
249
                is_kmsg_output(output),
10,285✔
250
                is_terminal_output(output)) < 0)
10,285✔
251
                return -errno;
×
252

253
        return move_fd(TAKE_FD(fd), nfd, false);
10,285✔
254
}
255

256
static int open_terminal_as(const char *path, int flags, int nfd) {
32✔
257
        int fd;
32✔
258

259
        assert(path);
32✔
260
        assert(nfd >= 0);
32✔
261

262
        fd = open_terminal(path, flags | O_NOCTTY);
32✔
263
        if (fd < 0)
32✔
264
                return fd;
265

266
        return move_fd(fd, nfd, false);
32✔
267
}
268

269
static int acquire_path(const char *path, int flags, mode_t mode) {
11✔
270
        _cleanup_close_ int fd = -EBADF;
11✔
271
        int r;
11✔
272

273
        assert(path);
11✔
274

275
        if (IN_SET(flags & O_ACCMODE_STRICT, O_WRONLY, O_RDWR))
11✔
276
                flags |= O_CREAT;
11✔
277

278
        fd = open(path, flags|O_NOCTTY, mode);
11✔
279
        if (fd >= 0)
11✔
280
                return TAKE_FD(fd);
11✔
281

282
        if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
×
283
                return -errno;
×
284

285
        /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
286

287
        fd = socket(AF_UNIX, SOCK_STREAM, 0);
×
288
        if (fd < 0)
×
289
                return -errno;
×
290

291
        r = connect_unix_path(fd, AT_FDCWD, path);
×
292
        if (IN_SET(r, -ENOTSOCK, -EINVAL))
×
293
                /* Propagate initial error if we get ENOTSOCK or EINVAL, i.e. we have indication that this
294
                 * wasn't an AF_UNIX socket after all */
295
                return -ENXIO;
296
        if (r < 0)
×
297
                return r;
298

299
        if ((flags & O_ACCMODE_STRICT) == O_RDONLY)
×
300
                r = shutdown(fd, SHUT_WR);
×
301
        else if ((flags & O_ACCMODE_STRICT) == O_WRONLY)
×
302
                r = shutdown(fd, SHUT_RD);
×
303
        else
304
                r = 0;
305
        if (r < 0)
×
306
                return -errno;
×
307

308
        return TAKE_FD(fd);
309
}
310

311
static int fixup_input(
32,832✔
312
                const ExecContext *context,
313
                int socket_fd,
314
                bool apply_tty_stdin) {
315

316
        ExecInput std_input;
32,832✔
317

318
        assert(context);
32,832✔
319

320
        std_input = context->std_input;
32,832✔
321

322
        if (is_terminal_input(std_input) && !apply_tty_stdin)
32,832✔
323
                return EXEC_INPUT_NULL;
324

325
        if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
32,832✔
326
                return EXEC_INPUT_NULL;
327

328
        if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
32,832✔
329
                return EXEC_INPUT_NULL;
×
330

331
        return std_input;
332
}
333

334
/* Returns the output setting that shall actually be applied: socket output without an available
 * socket fd becomes EXEC_OUTPUT_INHERIT, everything else is returned unchanged. */
static int fixup_output(ExecOutput output, int socket_fd) {
        bool socket_missing = output == EXEC_OUTPUT_SOCKET && socket_fd < 0;

        return socket_missing ? EXEC_OUTPUT_INHERIT : output;
}
341

342
static int setup_input(
11,471✔
343
                const ExecContext *context,
344
                const ExecParameters *params,
345
                int socket_fd,
346
                const int named_iofds[static 3]) {
347

348
        ExecInput i;
11,471✔
349
        int r;
11,471✔
350

351
        assert(context);
11,471✔
352
        assert(params);
11,471✔
353
        assert(named_iofds);
11,471✔
354

355
        if (params->stdin_fd >= 0) {
11,471✔
356
                if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
527✔
357
                        return -errno;
×
358

359
                /* Try to make this our controlling tty, if it is a tty */
360
                if (isatty_safe(STDIN_FILENO) && ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE) < 0)
527✔
361
                        log_debug_errno(errno, "Failed to make standard input TTY our controlling terminal: %m");
2✔
362

363
                return STDIN_FILENO;
527✔
364
        }
365

366
        i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
10,944✔
367

368
        switch (i) {
10,944✔
369

370
        case EXEC_INPUT_NULL:
10,586✔
371
                return open_null_as(O_RDONLY, STDIN_FILENO);
10,586✔
372

373
        case EXEC_INPUT_TTY:
346✔
374
        case EXEC_INPUT_TTY_FORCE:
375
        case EXEC_INPUT_TTY_FAIL: {
376
                _cleanup_close_ int tty_fd = -EBADF;
346✔
377
                _cleanup_free_ char *resolved = NULL;
346✔
378
                const char *tty_path;
346✔
379

380
                tty_path = ASSERT_PTR(exec_context_tty_path(context));
346✔
381

382
                if (tty_is_console(tty_path)) {
346✔
383
                        r = resolve_dev_console(&resolved);
263✔
384
                        if (r < 0)
263✔
385
                                log_debug_errno(r, "Failed to resolve /dev/console, ignoring: %m");
×
386
                        else {
387
                                log_debug("Resolved /dev/console to %s", resolved);
263✔
388
                                tty_path = resolved;
263✔
389
                        }
390
                }
391

392
                tty_fd = acquire_terminal(tty_path,
692✔
393
                                          i == EXEC_INPUT_TTY_FAIL  ? ACQUIRE_TERMINAL_TRY :
346✔
394
                                          i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
395
                                                                      ACQUIRE_TERMINAL_WAIT,
396
                                          USEC_INFINITY);
397
                if (tty_fd < 0)
346✔
398
                        return tty_fd;
399

400
                r = move_fd(tty_fd, STDIN_FILENO, /* cloexec= */ false);
346✔
401
                if (r < 0)
346✔
402
                        return r;
×
403

404
                TAKE_FD(tty_fd);
405
                return r;
406
        }
407

408
        case EXEC_INPUT_SOCKET:
11✔
409
                assert(socket_fd >= 0);
11✔
410

411
                return RET_NERRNO(dup2(socket_fd, STDIN_FILENO));
11✔
412

413
        case EXEC_INPUT_NAMED_FD:
×
414
                assert(named_iofds[STDIN_FILENO] >= 0);
×
415

416
                (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
×
417
                return RET_NERRNO(dup2(named_iofds[STDIN_FILENO], STDIN_FILENO));
11,471✔
418

419
        case EXEC_INPUT_DATA: {
1✔
420
                int fd;
1✔
421

422
                fd = memfd_new_and_seal("exec-input", context->stdin_data, context->stdin_data_size);
1✔
423
                if (fd < 0)
1✔
424
                        return fd;
425

426
                return move_fd(fd, STDIN_FILENO, false);
1✔
427
        }
428

429
        case EXEC_INPUT_FILE: {
×
430
                bool rw;
×
431
                int fd;
×
432

433
                assert(context->stdio_file[STDIN_FILENO]);
×
434

435
                rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
×
436
                        (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
×
437

438
                fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
×
439
                if (fd < 0)
×
440
                        return fd;
441

442
                return move_fd(fd, STDIN_FILENO, false);
×
443
        }
444

445
        default:
×
446
                assert_not_reached();
×
447
        }
448
}
449

450
static bool can_inherit_stderr_from_stdout(
10,944✔
451
                const ExecContext *context,
452
                ExecOutput o,
453
                ExecOutput e) {
454

455
        assert(context);
10,944✔
456

457
        /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
458
         * stderr fd */
459

460
        if (e == EXEC_OUTPUT_INHERIT)
10,944✔
461
                return true;
462
        if (e != o)
409✔
463
                return false;
464

465
        if (e == EXEC_OUTPUT_NAMED_FD)
406✔
466
                return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
×
467

468
        if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND, EXEC_OUTPUT_FILE_TRUNCATE))
406✔
469
                return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
4✔
470

471
        return true;
472
}
473

474
static int setup_output(
22,942✔
475
                const ExecContext *context,
476
                const ExecParameters *params,
477
                int fileno,
478
                int socket_fd,
479
                const int named_iofds[static 3],
480
                const char *ident,
481
                uid_t uid,
482
                gid_t gid,
483
                dev_t *journal_stream_dev,
484
                ino_t *journal_stream_ino) {
485

486
        ExecOutput o;
22,942✔
487
        ExecInput i;
22,942✔
488
        int r;
22,942✔
489

490
        assert(context);
22,942✔
491
        assert(params);
22,942✔
492
        assert(ident);
22,942✔
493
        assert(journal_stream_dev);
22,942✔
494
        assert(journal_stream_ino);
22,942✔
495

496
        if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
22,942✔
497

498
                if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
527✔
499
                        return -errno;
×
500

501
                return STDOUT_FILENO;
502
        }
503

504
        if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
22,415✔
505
                if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
527✔
506
                        return -errno;
×
507

508
                return STDERR_FILENO;
509
        }
510

511
        i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
21,888✔
512
        o = fixup_output(context->std_output, socket_fd);
21,888✔
513

514
        // FIXME: we probably should spend some time here to verify that if we inherit an fd from stdin
515
        // (possibly indirect via inheritance from stdout) it is actually opened for write!
516

517
        if (fileno == STDERR_FILENO) {
21,888✔
518
                ExecOutput e;
10,944✔
519
                e = fixup_output(context->std_error, socket_fd);
10,944✔
520

521
                /* This expects the input and output are already set up */
522

523
                /* Don't change the stderr file descriptor if we inherit all
524
                 * the way and are not on a tty */
525
                if (e == EXEC_OUTPUT_INHERIT &&
10,944✔
526
                    o == EXEC_OUTPUT_INHERIT &&
8✔
527
                    i == EXEC_INPUT_NULL &&
×
528
                    !is_terminal_input(context->std_input) &&
×
529
                    getppid() != 1)
×
530
                        return fileno;
531

532
                /* Duplicate from stdout if possible */
533
                if (can_inherit_stderr_from_stdout(context, o, e))
10,944✔
534
                        return RET_NERRNO(dup2(STDOUT_FILENO, fileno));
10,937✔
535

536
                o = e;
537

538
        } else if (o == EXEC_OUTPUT_INHERIT) {
10,944✔
539
                /* If input got downgraded, inherit the original value */
540
                if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
8✔
541
                        return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
×
542

543
                /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
544
                if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
8✔
545
                        return RET_NERRNO(dup2(STDIN_FILENO, fileno));
8✔
546

547
                /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
548
                if (getppid() != 1)
×
549
                        return fileno;
550

551
                /* We need to open /dev/null here anew, to get the right access mode. */
552
                return open_null_as(O_WRONLY, fileno);
×
553
        }
554

555
        switch (o) {
10,943✔
556

557
        case EXEC_OUTPUT_NULL:
260✔
558
                return open_null_as(O_WRONLY, fileno);
260✔
559

560
        case EXEC_OUTPUT_TTY:
378✔
561
                if (is_terminal_input(i))
378✔
562
                        return RET_NERRNO(dup2(STDIN_FILENO, fileno));
346✔
563

564
                return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
32✔
565

566
        case EXEC_OUTPUT_KMSG:
10,285✔
567
        case EXEC_OUTPUT_KMSG_AND_CONSOLE:
568
        case EXEC_OUTPUT_JOURNAL:
569
        case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
570
                r = connect_logger_as(context, params, o, ident, fileno, uid, gid);
10,285✔
571
                if (r < 0) {
10,285✔
572
                        log_warning_errno(r, "Failed to connect %s to the journal socket, ignoring: %m",
×
573
                                          fileno == STDOUT_FILENO ? "stdout" : "stderr");
574
                        r = open_null_as(O_WRONLY, fileno);
×
575
                } else {
576
                        struct stat st;
10,285✔
577

578
                        /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
579
                         * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
580
                         * services to detect whether they are connected to the journal or not.
581
                         *
582
                         * If both stdout and stderr are connected to a stream then let's make sure to store the data
583
                         * about STDERR as that's usually the best way to do logging. */
584

585
                        if (fstat(fileno, &st) >= 0 &&
10,285✔
586
                            (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
10,285✔
587
                                *journal_stream_dev = st.st_dev;
10,285✔
588
                                *journal_stream_ino = st.st_ino;
10,285✔
589
                        }
590
                }
591
                return r;
592

593
        case EXEC_OUTPUT_SOCKET:
9✔
594
                assert(socket_fd >= 0);
9✔
595

596
                return RET_NERRNO(dup2(socket_fd, fileno));
9✔
597

598
        case EXEC_OUTPUT_NAMED_FD:
×
599
                assert(named_iofds[fileno] >= 0);
×
600

601
                (void) fd_nonblock(named_iofds[fileno], false);
×
602
                return RET_NERRNO(dup2(named_iofds[fileno], fileno));
×
603

604
        case EXEC_OUTPUT_FILE:
11✔
605
        case EXEC_OUTPUT_FILE_APPEND:
606
        case EXEC_OUTPUT_FILE_TRUNCATE: {
607
                bool rw;
11✔
608
                int fd, flags;
11✔
609

610
                assert(context->stdio_file[fileno]);
11✔
611

612
                rw = context->std_input == EXEC_INPUT_FILE &&
11✔
613
                        streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
×
614

615
                if (rw)
11✔
616
                        return RET_NERRNO(dup2(STDIN_FILENO, fileno));
×
617

618
                flags = O_WRONLY;
11✔
619
                if (o == EXEC_OUTPUT_FILE_APPEND)
11✔
620
                        flags |= O_APPEND;
621
                else if (o == EXEC_OUTPUT_FILE_TRUNCATE)
9✔
622
                        flags |= O_TRUNC;
3✔
623

624
                fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
11✔
625
                if (fd < 0)
11✔
626
                        return fd;
627

628
                return move_fd(fd, fileno, 0);
11✔
629
        }
630

631
        default:
×
632
                assert_not_reached();
×
633
        }
634
}
635

636
static int chown_terminal(int fd, uid_t uid) {
2,639✔
637
        int r;
2,639✔
638

639
        assert(fd >= 0);
2,639✔
640

641
        /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
642
        if (!isatty_safe(fd))
2,639✔
643
                return 0;
644

645
        /* This might fail. What matters are the results. */
646
        r = fchmod_and_chown(fd, TTY_MODE, uid, GID_INVALID);
7✔
647
        if (r < 0)
7✔
648
                return r;
×
649

650
        return 1;
651
}
652

653
static int setup_confirm_stdio(
×
654
                const ExecContext *context,
655
                const char *vc,
656
                int *ret_saved_stdin,
657
                int *ret_saved_stdout) {
658

659
        _cleanup_close_ int fd = -EBADF, saved_stdin = -EBADF, saved_stdout = -EBADF;
×
660
        int r;
×
661

662
        assert(context);
×
663
        assert(ret_saved_stdin);
×
664
        assert(ret_saved_stdout);
×
665

666
        saved_stdin = fcntl(STDIN_FILENO, F_DUPFD_CLOEXEC, 3);
×
667
        if (saved_stdin < 0)
×
668
                return -errno;
×
669

670
        saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD_CLOEXEC, 3);
×
671
        if (saved_stdout < 0)
×
672
                return -errno;
×
673

674
        fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
×
675
        if (fd < 0)
×
676
                return fd;
677

678
        _cleanup_close_ int lock_fd = lock_dev_console();
×
679
        if (lock_fd < 0)
×
680
                log_debug_errno(lock_fd, "Failed to lock /dev/console, ignoring: %m");
×
681

682
        r = chown_terminal(fd, getuid());
×
683
        if (r < 0)
×
684
                return r;
685

686
        r = terminal_reset_defensive(fd, TERMINAL_RESET_SWITCH_TO_TEXT);
×
687
        if (r < 0)
×
688
                return r;
689

690
        r = exec_context_apply_tty_size(context, fd, fd, vc);
×
691
        if (r < 0)
×
692
                return r;
693

694
        r = rearrange_stdio(fd, fd, STDERR_FILENO); /* Invalidates 'fd' also on failure */
×
695
        TAKE_FD(fd);
×
696
        if (r < 0)
×
697
                return r;
698

699
        *ret_saved_stdin = TAKE_FD(saved_stdin);
×
700
        *ret_saved_stdout = TAKE_FD(saved_stdout);
×
701
        return 0;
×
702
}
703

704
/* Writes a message to 'fd' which explains that the confirmation question for 'unit_id' could not be
 * answered and that a positive response is assumed. 'err' is an errno-style error of either sign; a
 * timeout gets a dedicated message, every other error is appended via %m. */
static void write_confirm_error_fd(int err, int fd, const char *unit_id) {
        assert(err != 0);
        assert(fd >= 0);
        assert(unit_id);

        /* %m below picks the error up from errno */
        errno = abs(err);

        if (errno == ETIMEDOUT) {
                dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", unit_id);
                return;
        }

        dprintf(fd, "Couldn't ask confirmation for %s, assuming positive response: %m\n", unit_id);
}
×
716

717
static void write_confirm_error(int err, const char *vc, const char *unit_id) {
×
718
        _cleanup_close_ int fd = -EBADF;
×
719

720
        assert(vc);
×
721

722
        fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
×
723
        if (fd < 0)
×
724
                return;
×
725

726
        write_confirm_error_fd(err, fd, unit_id);
×
727
}
728

729
/* Releases the terminal and puts the saved stdin/stdout fds back in place. Both saved fds are closed
 * and the variables they are stored in are reset, regardless of whether restoring worked.
 *
 * Returns 0 on success, otherwise -errno of the last dup2() that failed. */
static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
        int ret = 0;

        assert(saved_stdin);
        assert(saved_stdout);

        release_terminal();

        if (*saved_stdin >= 0 && dup2(*saved_stdin, STDIN_FILENO) < 0)
                ret = -errno;

        if (*saved_stdout >= 0 && dup2(*saved_stdout, STDOUT_FILENO) < 0)
                ret = -errno;

        *saved_stdin = safe_close(*saved_stdin);
        *saved_stdout = safe_close(*saved_stdout);

        return ret;
}
750

751
/* Possible results of ask_for_confirmation() */
enum {
        CONFIRM_PRETEND_FAILURE = -1,  /* 'f': don't execute the command and pretend it failed */
        CONFIRM_PRETEND_SUCCESS =  0,  /* 's': don't execute the command and pretend it succeeded */
        CONFIRM_EXECUTE = 1,           /* 'y': execute the command */
};
756

757
/* Returns true if the flag file /run/systemd/confirm_spawn_disabled exists. */
static bool confirm_spawn_disabled(void) {
        int k;

        k = access("/run/systemd/confirm_spawn_disabled", F_OK);

        return k >= 0;
}
760

761
static int ask_for_confirmation(const ExecContext *context, const ExecParameters *params, const char *cmdline) {
×
762
        int saved_stdout = -EBADF, saved_stdin = -EBADF, r;
×
763
        _cleanup_free_ char *e = NULL;
×
764
        char c;
×
765

766
        assert(context);
×
767
        assert(params);
×
768

769
        /* For any internal errors, assume a positive response. */
770
        r = setup_confirm_stdio(context, params->confirm_spawn, &saved_stdin, &saved_stdout);
×
771
        if (r < 0) {
×
772
                write_confirm_error(r, params->confirm_spawn, params->unit_id);
×
773
                return CONFIRM_EXECUTE;
774
        }
775

776
        /* confirm_spawn might have been disabled while we were sleeping. */
777
        if (!params->confirm_spawn || confirm_spawn_disabled()) {
×
778
                r = 1;
×
779
                goto restore_stdio;
×
780
        }
781

782
        e = ellipsize(cmdline, 60, 100);
×
783
        if (!e) {
×
784
                log_oom();
×
785
                r = CONFIRM_EXECUTE;
×
786
                goto restore_stdio;
×
787
        }
788

789
        for (;;) {
×
790
                r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
×
791
                if (r < 0) {
×
792
                        write_confirm_error_fd(r, STDOUT_FILENO, params->unit_id);
×
793
                        r = CONFIRM_EXECUTE;
×
794
                        goto restore_stdio;
×
795
                }
796

797
                switch (c) {
×
798
                case 'c':
×
799
                        printf("Resuming normal execution.\n");
×
800
                        manager_disable_confirm_spawn();
×
801
                        r = 1;
802
                        break;
803
                case 'D':
×
804
                        printf("  Unit: %s\n",
×
805
                               params->unit_id);
×
806
                        exec_context_dump(context, stdout, "  ");
×
807
                        exec_params_dump(params, stdout, "  ");
×
808
                        continue; /* ask again */
×
809
                case 'f':
×
810
                        printf("Failing execution.\n");
×
811
                        r = CONFIRM_PRETEND_FAILURE;
812
                        break;
813
                case 'h':
×
814
                        printf("  c - continue, proceed without asking anymore\n"
×
815
                               "  D - dump, show the state of the unit\n"
816
                               "  f - fail, don't execute the command and pretend it failed\n"
817
                               "  h - help\n"
818
                               "  i - info, show a short summary of the unit\n"
819
                               "  j - jobs, show jobs that are in progress\n"
820
                               "  s - skip, don't execute the command and pretend it succeeded\n"
821
                               "  y - yes, execute the command\n");
822
                        continue; /* ask again */
×
823
                case 'i':
×
824
                        printf("  Unit:        %s\n"
×
825
                               "  Command:     %s\n",
826
                               params->unit_id, cmdline);
×
827
                        continue; /* ask again */
×
828
                case 'j':
×
829
                        if (sigqueue(getppid(),
×
830
                                     SIGRTMIN+18,
×
831
                                     (const union sigval) { .sival_int = MANAGER_SIGNAL_COMMAND_DUMP_JOBS }) < 0)
×
832
                                return -errno;
×
833

834
                        continue; /* ask again */
×
835
                case 'n':
×
836
                        /* 'n' was removed in favor of 'f'. */
837
                        printf("Didn't understand 'n', did you mean 'f'?\n");
×
838
                        continue; /* ask again */
×
839
                case 's':
×
840
                        printf("Skipping execution.\n");
×
841
                        r = CONFIRM_PRETEND_SUCCESS;
842
                        break;
843
                case 'y':
844
                        r = CONFIRM_EXECUTE;
845
                        break;
846
                default:
×
847
                        assert_not_reached();
×
848
                }
849
                break;
850
        }
851

852
restore_stdio:
×
853
        restore_confirm_stdio(&saved_stdin, &saved_stdout);
×
854
        return r;
855
}
856

857
static int get_fixed_user(
9,324✔
858
                const char *user_or_uid,
859
                bool prefer_nss,
860
                const char **ret_username,
861
                uid_t *ret_uid,
862
                gid_t *ret_gid,
863
                const char **ret_home,
864
                const char **ret_shell) {
865

866
        int r;
9,324✔
867

868
        assert(user_or_uid);
9,324✔
869
        assert(ret_username);
9,324✔
870

871
        r = get_user_creds(&user_or_uid, ret_uid, ret_gid, ret_home, ret_shell,
18,214✔
872
                           USER_CREDS_CLEAN|(prefer_nss ? USER_CREDS_PREFER_NSS : 0));
873
        if (r < 0)
9,324✔
874
                return r;
875

876
        /* user_or_uid is normalized by get_user_creds to username */
877
        *ret_username = user_or_uid;
9,322✔
878

879
        return 0;
9,322✔
880
}
881

882
/* Resolves a group given as name or numeric GID string via get_group_creds() (no flags).
 *
 * On success returns 0, stores the GID in *ret_gid and the name pointer as rewritten by
 * get_group_creds() in *ret_groupname. On failure returns the negative error of get_group_creds(). */
static int get_fixed_group(
                const char *group_or_gid,
                const char **ret_groupname,
                gid_t *ret_gid) {

        assert(group_or_gid);
        assert(ret_groupname);

        /* get_group_creds() normalizes the string it is handed to the group name */
        const char *name = group_or_gid;

        int r = get_group_creds(&name, ret_gid, /* flags = */ 0);
        if (r < 0)
                return r;

        *ret_groupname = name;
        return 0;
}
901

902
static int get_supplementary_groups(
11,471✔
903
                const ExecContext *c,
904
                const char *user,
905
                gid_t gid,
906
                gid_t **ret_gids) {
907

908
        int r;
11,471✔
909

910
        assert(c);
11,471✔
911
        assert(ret_gids);
11,471✔
912

913
        /*
914
         * If user is given, then lookup GID and supplementary groups list.
915
         * We avoid NSS lookups for gid=0. Also we have to initialize groups
916
         * here and as early as possible so we keep the list of supplementary
917
         * groups of the caller.
918
         */
919
        bool keep_groups = false;
11,471✔
920
        if (user && gid_is_valid(gid) && gid != 0) {
14,110✔
921
                /* First step, initialize groups from /etc/groups */
922
                if (initgroups(user, gid) < 0)
2,503✔
923
                        return -errno;
11,471✔
924

925
                keep_groups = true;
926
        }
927

928
        if (strv_isempty(c->supplementary_groups)) {
11,471✔
929
                *ret_gids = NULL;
11,462✔
930
                return 0;
11,462✔
931
        }
932

933
        /*
934
         * If SupplementaryGroups= was passed then NGROUPS_MAX has to
935
         * be positive, otherwise fail.
936
         */
937
        errno = 0;
9✔
938
        int ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
9✔
939
        if (ngroups_max <= 0)
9✔
940
                return errno_or_else(EOPNOTSUPP);
×
941

942
        _cleanup_free_ gid_t *l_gids = new(gid_t, ngroups_max);
18✔
943
        if (!l_gids)
9✔
944
                return -ENOMEM;
945

946
        int k = 0;
9✔
947
        if (keep_groups) {
9✔
948
                /*
949
                 * Lookup the list of groups that the user belongs to, we
950
                 * avoid NSS lookups here too for gid=0.
951
                 */
952
                k = ngroups_max;
9✔
953
                if (getgrouplist(user, gid, l_gids, &k) < 0)
9✔
954
                        return -EINVAL;
955
        }
956

957
        STRV_FOREACH(i, c->supplementary_groups) {
18✔
958
                if (k >= ngroups_max)
9✔
959
                        return -E2BIG;
×
960

961
                const char *g = *i;
9✔
962
                r = get_group_creds(&g, l_gids + k, /* flags = */ 0);
9✔
963
                if (r < 0)
9✔
964
                        return r;
965

966
                k++;
9✔
967
        }
968

969
        if (k == 0) {
9✔
970
                *ret_gids = NULL;
×
971
                return 0;
×
972
        }
973

974
        /* Otherwise get the final list of supplementary groups */
975
        gid_t *groups = newdup(gid_t, l_gids, k);
9✔
976
        if (!groups)
9✔
977
                return -ENOMEM;
978

979
        *ret_gids = groups;
9✔
980
        return k;
9✔
981
}
982

983
/* Applies the supplementary group list (if ngids > 0) via maybe_setgroups() and then, if gid is valid,
 * sets the real, effective and saved GID to it. Returns 0 on success, negative errno-style on failure. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {

        /* Handle SupplementaryGroups= if it is not empty */
        if (ngids > 0) {
                int r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        if (!gid_is_valid(gid))
                return 0;

        /* Then set our gids */
        if (setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
1001

1002
/* Updates the securebits of the calling process: the bits in 'mask' are cleared, then the bits in 'bits'
 * are set. Returns 0 if the resulting value equals the current one (nothing is written), 1 if a new
 * value was written, and -errno if either prctl() call fails. */
static int set_securebits(unsigned bits, unsigned mask) {
        int queried = prctl(PR_GET_SECUREBITS);
        if (queried < 0)
                return -errno;

        unsigned before = (unsigned) queried;

        /* Clear all securebits defined in mask and set bits */
        unsigned after = (before & ~mask) | bits;
        if (after == before)
                return 0;

        if (prctl(PR_SET_SECUREBITS, after) < 0)
                return -errno;

        return 1;
}
1020

1021
static int enforce_user(
1,989✔
1022
                const ExecContext *context,
1023
                uid_t uid,
1024
                uint64_t capability_ambient_set) {
1025

1026
        int r;
1,989✔
1027

1028
        assert(context);
1,989✔
1029

1030
        if (!uid_is_valid(uid))
1,989✔
1031
                return 0;
1032

1033
        /* Sets (but doesn't look up) the UIS and makes sure we keep the capabilities while doing so. For
1034
         * setting secure bits the capability CAP_SETPCAP is required, so we also need keep-caps in this
1035
         * case. */
1036

1037
        if ((capability_ambient_set != 0 || context->secure_bits != 0) && uid != 0) {
1,989✔
1038

1039
                /* First step: If we need to keep capabilities but drop privileges we need to make sure we
1040
                 * keep our caps, while we drop privileges. Add KEEP_CAPS to the securebits */
1041
                r = set_securebits(1U << SECURE_KEEP_CAPS, 0);
740✔
1042
                if (r < 0)
740✔
1043
                        return r;
1044
        }
1045

1046
        /* Second step: actually set the uids */
1047
        if (setresuid(uid, uid, uid) < 0)
1,989✔
1048
                return -errno;
×
1049

1050
        /* At this point we should have all necessary capabilities but are otherwise a normal user. However,
1051
         * the caps might got corrupted due to the setresuid() so we need clean them up later. This is done
1052
         * outside of this call. */
1053
        return 0;
1054
}
1055

1056
#if HAVE_PAM
1057

1058
static void pam_response_free_array(struct pam_response *responses, size_t n_responses) {
×
1059
        assert(responses || n_responses == 0);
×
1060

1061
        FOREACH_ARRAY(resp, responses, n_responses)
×
1062
                erase_and_free(resp->resp);
×
1063

1064
        free(responses);
×
1065
}
×
1066

1067
typedef struct AskPasswordConvData {
1068
        const ExecContext *context;
1069
        const ExecParameters *params;
1070
} AskPasswordConvData;
1071

1072
static int ask_password_conv(
5✔
1073
                int num_msg,
1074
                const struct pam_message *msg[],
1075
                struct pam_response **ret,
1076
                void *userdata) {
1077

1078
        AskPasswordConvData *data = ASSERT_PTR(userdata);
5✔
1079
        bool set_credential_env_var = false;
5✔
1080
        int r;
5✔
1081

1082
        assert(num_msg >= 0);
5✔
1083
        assert(msg);
5✔
1084
        assert(data->context);
5✔
1085
        assert(data->params);
5✔
1086

1087
        size_t n = num_msg;
5✔
1088
        struct pam_response *responses = new0(struct pam_response, n);
5✔
1089
        if (!responses)
5✔
1090
                return PAM_BUF_ERR;
5✔
1091
        CLEANUP_ARRAY(responses, n, pam_response_free_array);
5✔
1092

1093
        for (size_t i = 0; i < n; i++) {
10✔
1094
                const struct pam_message *mi = *msg + i;
5✔
1095

1096
                switch (mi->msg_style) {
5✔
1097

1098
                case PAM_PROMPT_ECHO_ON:
2✔
1099
                case PAM_PROMPT_ECHO_OFF: {
1100

1101
                        /* Locally set the $CREDENTIALS_DIRECTORY to the credentials directory we just populated */
1102
                        if (!set_credential_env_var) {
2✔
1103
                                _cleanup_free_ char *creds_dir = NULL;
2✔
1104
                                r = exec_context_get_credential_directory(data->context, data->params, data->params->unit_id, &creds_dir);
2✔
1105
                                if (r < 0)
2✔
1106
                                        return log_error_errno(r, "Failed to determine credentials directory: %m");
×
1107

1108
                                if (creds_dir) {
2✔
1109
                                        if (setenv("CREDENTIALS_DIRECTORY", creds_dir, /* overwrite= */ true) < 0)
2✔
1110
                                                return log_error_errno(r, "Failed to set $CREDENTIALS_DIRECTORY: %m");
×
1111
                                } else
1112
                                        (void) unsetenv("CREDENTIALS_DIRECTORY");
×
1113

1114
                                set_credential_env_var = true;
2✔
1115
                        }
1116

1117
                        _cleanup_free_ char *credential_name = strjoin("pam.authtok.", data->context->pam_name);
4✔
1118
                        if (!credential_name)
2✔
1119
                                return log_oom();
×
1120

1121
                        AskPasswordRequest req = {
4✔
1122
                                .message = mi->msg,
2✔
1123
                                .credential = credential_name,
1124
                                .tty_fd = -EBADF,
1125
                                .hup_fd = -EBADF,
1126
                                .until = usec_add(now(CLOCK_MONOTONIC), 15 * USEC_PER_SEC),
2✔
1127
                        };
1128

1129
                        _cleanup_strv_free_erase_ char **acquired = NULL;
×
1130
                        r = ask_password_auto(
2✔
1131
                                        &req,
1132
                                        ASK_PASSWORD_ACCEPT_CACHED|
1133
                                        ASK_PASSWORD_NO_TTY|
1134
                                        (mi->msg_style == PAM_PROMPT_ECHO_ON ? ASK_PASSWORD_ECHO : 0),
2✔
1135
                                        &acquired);
1136
                        if (r < 0) {
2✔
1137
                                log_error_errno(r, "Failed to query for password: %m");
×
1138
                                return PAM_CONV_ERR;
×
1139
                        }
1140

1141
                        responses[i].resp = strdup(ASSERT_PTR(acquired[0]));
2✔
1142
                        if (!responses[i].resp) {
2✔
1143
                                log_oom();
×
1144
                                return PAM_BUF_ERR;
1145
                        }
1146
                        break;
2✔
1147
                }
1148

1149
                case PAM_ERROR_MSG:
1150
                        log_error("PAM: %s", mi->msg);
×
1151
                        break;
1152

1153
                case PAM_TEXT_INFO:
1154
                        log_info("PAM: %s", mi->msg);
3✔
1155
                        break;
1156

1157
                default:
1158
                        return PAM_CONV_ERR;
1159
                }
1160
        }
1161

1162
        *ret = TAKE_PTR(responses);
5✔
1163
        n = 0;
5✔
1164

1165
        return PAM_SUCCESS;
5✔
1166
}
1167

1168
/* Closes the PAM session and then deletes the PAM credentials. Both steps are always attempted and a
 * failure of either is logged at debug level. Returns the result of pam_close_session() if that failed,
 * otherwise the result of pam_setcred(PAM_DELETE_CRED). */
static int pam_close_session_and_delete_credentials(pam_handle_t *handle, int flags) {
        assert(handle);

        int session_code = pam_close_session(handle, flags);
        if (session_code != PAM_SUCCESS)
                log_debug("pam_close_session() failed: %s", pam_strerror(handle, session_code));

        int cred_code = pam_setcred(handle, PAM_DELETE_CRED | flags);
        if (cred_code != PAM_SUCCESS)
                log_debug("pam_setcred(PAM_DELETE_CRED) failed: %s", pam_strerror(handle, cred_code));

        /* The session error takes precedence over the credential error */
        if (session_code != PAM_SUCCESS)
                return session_code;

        return cred_code;
}
1183
#endif
1184

1185
static int setup_pam(
386✔
1186
                const ExecContext *context,
1187
                ExecParameters *params,
1188
                const char *user,
1189
                uid_t uid,
1190
                gid_t gid,
1191
                char ***env, /* updated on success */
1192
                const int fds[], size_t n_fds,
1193
                int exec_fd) {
1194

1195
#if HAVE_PAM
1196
        AskPasswordConvData conv_data = {
386✔
1197
                .context = context,
1198
                .params = params,
1199
        };
1200

1201
        const struct pam_conv conv = {
386✔
1202
                .conv = ask_password_conv,
1203
                .appdata_ptr = &conv_data,
1204
        };
1205

1206
        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
386✔
1207
        _cleanup_strv_free_ char **e = NULL;
×
1208
        _cleanup_free_ char *tty = NULL;
386✔
1209
        pam_handle_t *handle = NULL;
386✔
1210
        sigset_t old_ss;
386✔
1211
        int pam_code = PAM_SUCCESS, r;
386✔
1212
        bool close_session = false;
386✔
1213
        pid_t parent_pid;
386✔
1214
        int flags = 0;
386✔
1215

1216
        assert(context);
386✔
1217
        assert(params);
386✔
1218
        assert(user);
386✔
1219
        assert(uid_is_valid(uid));
386✔
1220
        assert(gid_is_valid(gid));
386✔
1221
        assert(fds || n_fds == 0);
386✔
1222
        assert(env);
386✔
1223

1224
        /* We set up PAM in the parent process, then fork. The child
1225
         * will then stay around until killed via PR_GET_PDEATHSIG or
1226
         * systemd via the cgroup logic. It will then remove the PAM
1227
         * session again. The parent process will exec() the actual
1228
         * daemon. We do things this way to ensure that the main PID
1229
         * of the daemon is the one we initially fork()ed. */
1230

1231
        r = barrier_create(&barrier);
386✔
1232
        if (r < 0)
386✔
1233
                goto fail;
×
1234

1235
        if (log_get_max_level() < LOG_DEBUG)
386✔
1236
                flags |= PAM_SILENT;
3✔
1237

1238
        pam_code = pam_start(context->pam_name, user, &conv, &handle);
386✔
1239
        if (pam_code != PAM_SUCCESS) {
386✔
1240
                handle = NULL;
×
1241
                goto fail;
×
1242
        }
1243

1244
        if (getttyname_malloc(STDIN_FILENO, &tty) >= 0) {
386✔
1245
                _cleanup_free_ char *q = path_join("/dev", tty);
6✔
1246
                if (!q) {
6✔
1247
                        r = -ENOMEM;
×
1248
                        goto fail;
×
1249
                }
1250

1251
                free_and_replace(tty, q);
6✔
1252
        }
1253

1254
        if (tty) {
386✔
1255
                pam_code = pam_set_item(handle, PAM_TTY, tty);
6✔
1256
                if (pam_code != PAM_SUCCESS)
6✔
1257
                        goto fail;
×
1258
        }
1259

1260
        STRV_FOREACH(nv, *env) {
5,450✔
1261
                pam_code = pam_putenv(handle, *nv);
5,064✔
1262
                if (pam_code != PAM_SUCCESS)
5,064✔
1263
                        goto fail;
×
1264
        }
1265

1266
        pam_code = pam_acct_mgmt(handle, flags);
386✔
1267
        if (pam_code != PAM_SUCCESS)
386✔
1268
                goto fail;
×
1269

1270
        pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
386✔
1271
        if (pam_code != PAM_SUCCESS)
386✔
1272
                log_debug("pam_setcred(PAM_ESTABLISH_CRED) failed, ignoring: %s", pam_strerror(handle, pam_code));
317✔
1273

1274
        pam_code = pam_open_session(handle, flags);
386✔
1275
        if (pam_code != PAM_SUCCESS)
386✔
1276
                goto fail;
×
1277

1278
        close_session = true;
386✔
1279

1280
        e = pam_getenvlist(handle);
386✔
1281
        if (!e) {
386✔
1282
                pam_code = PAM_BUF_ERR;
×
1283
                goto fail;
×
1284
        }
1285

1286
        /* Block SIGTERM, so that we know that it won't get lost in the child */
1287

1288
        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM) >= 0);
386✔
1289

1290
        parent_pid = getpid_cached();
386✔
1291

1292
        r = safe_fork("(sd-pam)", 0, NULL);
386✔
1293
        if (r < 0)
594✔
1294
                goto fail;
×
1295
        if (r == 0) {
594✔
1296
                int ret = EXIT_PAM;
208✔
1297

1298
                /* The child's job is to reset the PAM session on termination */
1299
                barrier_set_role(&barrier, BARRIER_CHILD);
208✔
1300

1301
                /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only
1302
                 * those fds are open here that have been opened by PAM. */
1303
                (void) close_many(fds, n_fds);
208✔
1304

1305
                /* Also close the 'exec_fd' in the child, since the service manager waits for the EOF induced
1306
                 * by the execve() to wait for completion, and if we'd keep the fd open here in the child
1307
                 * we'd never signal completion. */
1308
                exec_fd = safe_close(exec_fd);
208✔
1309

1310
                /* Drop privileges - we don't need any to pam_close_session and this will make
1311
                 * PR_SET_PDEATHSIG work in most cases.  If this fails, ignore the error - but expect sd-pam
1312
                 * threads to fail to exit normally */
1313

1314
                r = fully_set_uid_gid(uid, gid, /* supplementary_gids= */ NULL, /* n_supplementary_gids= */ 0);
208✔
1315
                if (r < 0)
208✔
1316
                        log_warning_errno(r, "Failed to drop privileges in sd-pam: %m");
×
1317

1318
                (void) ignore_signals(SIGPIPE);
208✔
1319

1320
                /* Wait until our parent died. This will only work if the above setresuid() succeeds,
1321
                 * otherwise the kernel will not allow unprivileged parents kill their privileged children
1322
                 * this way. We rely on the control groups kill logic to do the rest for us. */
1323
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
208✔
1324
                        goto child_finish;
×
1325

1326
                /* Tell the parent that our setup is done. This is especially important regarding dropping
1327
                 * privileges. Otherwise, unit setup might race against our setresuid(2) call.
1328
                 *
1329
                 * If the parent aborted, we'll detect this below, hence ignore return failure here. */
1330
                (void) barrier_place(&barrier);
208✔
1331

1332
                /* Check if our parent process might already have died? */
1333
                if (getppid() == parent_pid) {
208✔
1334
                        sigset_t ss;
208✔
1335
                        int sig;
208✔
1336

1337
                        assert_se(sigemptyset(&ss) >= 0);
208✔
1338
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);
208✔
1339

1340
                        assert_se(sigwait(&ss, &sig) == 0);
208✔
1341
                        assert(sig == SIGTERM);
208✔
1342
                }
1343

1344
                /* If our parent died we'll end the session */
1345
                if (getppid() != parent_pid) {
208✔
1346
                        pam_code = pam_close_session_and_delete_credentials(handle, flags);
208✔
1347
                        if (pam_code != PAM_SUCCESS)
208✔
1348
                                goto child_finish;
143✔
1349
                }
1350

1351
                ret = 0;
1352

1353
        child_finish:
208✔
1354
                /* NB: pam_end() when called in child processes should set PAM_DATA_SILENT to let the module
1355
                 * know about this. See pam_end(3) */
1356
                (void) pam_end(handle, pam_code | flags | PAM_DATA_SILENT);
208✔
1357
                _exit(ret);
208✔
1358
        }
1359

1360
        barrier_set_role(&barrier, BARRIER_PARENT);
386✔
1361

1362
        /* If the child was forked off successfully it will do all the cleanups, so forget about the handle
1363
         * here. */
1364
        handle = NULL;
386✔
1365

1366
        /* Unblock SIGTERM again in the parent */
1367
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);
386✔
1368

1369
        /* We close the log explicitly here, since the PAM modules might have opened it, but we don't want
1370
         * this fd around. */
1371
        closelog();
386✔
1372

1373
        /* Synchronously wait for the child to initialize. We don't care for errors as we cannot
1374
         * recover. However, warn loudly if it happens. */
1375
        if (!barrier_place_and_sync(&barrier))
772✔
1376
                log_error("PAM initialization failed");
×
1377

1378
        return strv_free_and_replace(*env, e);
386✔
1379

1380
fail:
×
1381
        if (pam_code != PAM_SUCCESS) {
×
1382
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
×
1383
                r = -EPERM;  /* PAM errors do not map to errno */
1384
        } else
1385
                log_error_errno(r, "PAM failed: %m");
×
1386

1387
        if (handle) {
×
1388
                if (close_session)
×
1389
                        pam_code = pam_close_session_and_delete_credentials(handle, flags);
×
1390

1391
                (void) pam_end(handle, pam_code | flags);
×
1392
        }
1393

1394
        closelog();
×
1395
        return r;
1396
#else
1397
        return 0;
1398
#endif
1399
}
1400

1401
static void rename_process_from_path(const char *path) {
11,474✔
1402
        _cleanup_free_ char *buf = NULL;
11,474✔
1403
        const char *p;
11,474✔
1404

1405
        assert(path);
11,474✔
1406

1407
        /* This resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty in
1408
         * /bin/ps */
1409

1410
        if (path_extract_filename(path, &buf) < 0) {
11,474✔
1411
                rename_process("(...)");
×
1412
                return;
×
1413
        }
1414

1415
        size_t l = strlen(buf);
11,474✔
1416
        if (l > 8) {
11,474✔
1417
                /* The end of the process name is usually more interesting, since the first bit might just be
1418
                 * "systemd-" */
1419
                p = buf + l - 8;
7,950✔
1420
                l = 8;
7,950✔
1421
        } else
1422
                p = buf;
1423

1424
        char process_name[11];
11,474✔
1425
        process_name[0] = '(';
11,474✔
1426
        memcpy(process_name+1, p, l);
11,474✔
1427
        process_name[1+l] = ')';
11,474✔
1428
        process_name[1+l+1] = 0;
11,474✔
1429

1430
        (void) rename_process(process_name);
11,474✔
1431
}
1432

1433
static bool context_has_address_families(const ExecContext *c) {
12,379✔
1434
        assert(c);
12,379✔
1435

1436
        return c->address_families_allow_list ||
12,379✔
1437
                !set_isempty(c->address_families);
10,890✔
1438
}
1439

1440
static bool context_has_syscall_filters(const ExecContext *c) {
12,343✔
1441
        assert(c);
12,343✔
1442

1443
        return c->syscall_allow_list ||
12,343✔
1444
                !hashmap_isempty(c->syscall_filter);
10,869✔
1445
}
1446

1447
static bool context_has_syscall_logs(const ExecContext *c) {
12,343✔
1448
        assert(c);
12,343✔
1449

1450
        return c->syscall_log_allow_list ||
12,343✔
1451
                !hashmap_isempty(c->syscall_log);
12,343✔
1452
}
1453

1454
static bool context_has_seccomp(const ExecContext *c) {
3,593✔
1455
        assert(c);
3,593✔
1456

1457
        /* We need NNP if we have any form of seccomp and are unprivileged */
1458
        return c->lock_personality ||
6,474✔
1459
                c->memory_deny_write_execute ||
2,881✔
1460
                c->private_devices ||
2,881✔
1461
                c->protect_clock ||
2,881✔
1462
                c->protect_hostname == PROTECT_HOSTNAME_YES ||
2,881✔
1463
                c->protect_kernel_tunables ||
2,881✔
1464
                c->protect_kernel_modules ||
2,881✔
1465
                c->protect_kernel_logs ||
5,762✔
1466
                context_has_address_families(c) ||
5,762✔
1467
                exec_context_restrict_namespaces_set(c) ||
2,881✔
1468
                c->restrict_realtime ||
2,881✔
1469
                c->restrict_suid_sgid ||
2,881✔
1470
                !set_isempty(c->syscall_archs) ||
5,690✔
1471
                context_has_syscall_filters(c) ||
9,283✔
1472
                context_has_syscall_logs(c);
2,845✔
1473
}
1474

1475
static bool context_has_no_new_privileges(const ExecContext *c) {
9,498✔
1476
        assert(c);
9,498✔
1477

1478
        if (c->no_new_privileges)
9,498✔
1479
                return true;
1480

1481
        if (have_effective_cap(CAP_SYS_ADMIN) > 0) /* if we are privileged, we don't need NNP */
8,085✔
1482
                return false;
1483

1484
        return context_has_seccomp(c);
1,604✔
1485
}
1486

1487
#if HAVE_SECCOMP
1488

1489
static bool seccomp_allows_drop_privileges(const ExecContext *c) {
748✔
1490
        void *id, *val;
748✔
1491
        bool have_capget = false, have_capset = false, have_prctl = false;
748✔
1492

1493
        assert(c);
748✔
1494

1495
        /* No syscall filter, we are allowed to drop privileges */
1496
        if (hashmap_isempty(c->syscall_filter))
748✔
1497
                return true;
748✔
1498

1499
        HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) {
271,095✔
1500
                _cleanup_free_ char *name = NULL;
270,399✔
1501

1502
                name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
270,399✔
1503

1504
                if (streq(name, "capget"))
270,399✔
1505
                        have_capget = true;
1506
                else if (streq(name, "capset"))
269,703✔
1507
                        have_capset = true;
1508
                else if (streq(name, "prctl"))
269,007✔
1509
                        have_prctl = true;
696✔
1510
        }
1511

1512
        if (c->syscall_allow_list)
696✔
1513
                return have_capget && have_capset && have_prctl;
696✔
1514
        else
1515
                return !(have_capget || have_capset || have_prctl);
×
1516
}
1517

1518
/* Returns false if seccomp is available. Otherwise logs at debug level that the setting named by 'msg'
 * is skipped, and returns true. */
static bool skip_seccomp_unavailable(const char *msg) {
        assert(msg);

        bool unavailable = !is_seccomp_available();
        if (unavailable)
                log_debug("SECCOMP features not detected in the kernel, skipping %s", msg);

        return unavailable;
}
1527

1528
static int apply_syscall_filter(const ExecContext *c, const ExecParameters *p) {
9,498✔
1529
        uint32_t negative_action, default_action, action;
9,498✔
1530
        int r;
9,498✔
1531

1532
        assert(c);
9,498✔
1533
        assert(p);
9,498✔
1534

1535
        if (!context_has_syscall_filters(c))
9,498✔
1536
                return 0;
1537

1538
        if (skip_seccomp_unavailable("SystemCallFilter="))
1,475✔
1539
                return 0;
1540

1541
        negative_action = c->syscall_errno == SECCOMP_ERROR_NUMBER_KILL ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
1,475✔
1542

1543
        if (c->syscall_allow_list) {
1,475✔
1544
                default_action = negative_action;
1545
                action = SCMP_ACT_ALLOW;
1546
        } else {
1547
                default_action = SCMP_ACT_ALLOW;
1✔
1548
                action = negative_action;
1✔
1549
        }
1550

1551
        /* Sending over exec_fd or handoff_timestamp_fd requires write() syscall. */
1552
        if (p->exec_fd >= 0 || p->handoff_timestamp_fd >= 0) {
1,475✔
1553
                r = seccomp_filter_set_add_by_name(c->syscall_filter, c->syscall_allow_list, "write");
1,475✔
1554
                if (r < 0)
1,475✔
1555
                        return r;
1556
        }
1557

1558
        return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
1,475✔
1559
}
1560

1561
/* Installs the system call logging rules configured in the context. Does nothing and returns 0 if no
 * logging is configured, if seccomp is unavailable, or if SCMP_ACT_LOG is not defined at build time. */
static int apply_syscall_log(const ExecContext *c, const ExecParameters *p) {
        assert(c);
        assert(p);

        if (!context_has_syscall_logs(c))
                return 0;

#ifdef SCMP_ACT_LOG
        if (skip_seccomp_unavailable("SystemCallLog="))
                return 0;

        /* Allow list: log nothing but the ones listed. Otherwise: log everything but the ones listed. */
        uint32_t listed_action = c->syscall_log_allow_list ? SCMP_ACT_LOG : SCMP_ACT_ALLOW;
        uint32_t unlisted_action = c->syscall_log_allow_list ? SCMP_ACT_ALLOW : SCMP_ACT_LOG;

        return seccomp_load_syscall_filter_set_raw(unlisted_action, c->syscall_log, listed_action, false);
#else
        /* old libseccomp */
        log_debug("SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
        return 0;
#endif
}
1593

1594
static int apply_syscall_archs(const ExecContext *c, const ExecParameters *p) {
9,498✔
1595
        assert(c);
9,498✔
1596
        assert(p);
9,498✔
1597

1598
        if (set_isempty(c->syscall_archs))
9,498✔
1599
                return 0;
1600

1601
        if (skip_seccomp_unavailable("SystemCallArchitectures="))
1,490✔
1602
                return 0;
1603

1604
        return seccomp_restrict_archs(c->syscall_archs);
1,490✔
1605
}
1606

1607
static int apply_address_families(const ExecContext *c, const ExecParameters *p) {
9,498✔
1608
        assert(c);
9,498✔
1609
        assert(p);
9,498✔
1610

1611
        if (!context_has_address_families(c))
9,498✔
1612
                return 0;
1613

1614
        if (skip_seccomp_unavailable("RestrictAddressFamilies="))
1,489✔
1615
                return 0;
1616

1617
        return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list);
1,489✔
1618
}
1619

1620
static int apply_memory_deny_write_execute(const ExecContext *c, const ExecParameters *p) {
9,498✔
1621
        int r;
9,498✔
1622

1623
        assert(c);
9,498✔
1624
        assert(p);
9,498✔
1625

1626
        if (!c->memory_deny_write_execute)
9,498✔
1627
                return 0;
1628

1629
        /* use prctl() if kernel supports it (6.3) */
1630
        r = prctl(PR_SET_MDWE, PR_MDWE_REFUSE_EXEC_GAIN, 0, 0, 0);
1,489✔
1631
        if (r == 0) {
1,489✔
1632
                log_debug("Enabled MemoryDenyWriteExecute= with PR_SET_MDWE");
1,489✔
1633
                return 0;
1,489✔
1634
        }
1635
        if (r < 0 && errno != EINVAL)
×
1636
                return log_debug_errno(errno, "Failed to enable MemoryDenyWriteExecute= with PR_SET_MDWE: %m");
×
1637
        /* else use seccomp */
1638
        log_debug("Kernel doesn't support PR_SET_MDWE: falling back to seccomp");
×
1639

1640
        if (skip_seccomp_unavailable("MemoryDenyWriteExecute="))
×
1641
                return 0;
1642

1643
        return seccomp_memory_deny_write_execute();
×
1644
}
1645

1646
static int apply_restrict_realtime(const ExecContext *c, const ExecParameters *p) {
9,498✔
1647
        assert(c);
9,498✔
1648
        assert(p);
9,498✔
1649

1650
        if (!c->restrict_realtime)
9,498✔
1651
                return 0;
1652

1653
        if (skip_seccomp_unavailable("RestrictRealtime="))
1,489✔
1654
                return 0;
1655

1656
        return seccomp_restrict_realtime();
1,489✔
1657
}
1658

1659
static int apply_restrict_suid_sgid(const ExecContext *c, const ExecParameters *p) {
9,498✔
1660
        assert(c);
9,498✔
1661
        assert(p);
9,498✔
1662

1663
        if (!c->restrict_suid_sgid)
9,498✔
1664
                return 0;
1665

1666
        if (skip_seccomp_unavailable("RestrictSUIDSGID="))
1,410✔
1667
                return 0;
1668

1669
        return seccomp_restrict_suid_sgid();
1,410✔
1670
}
1671

1672
static int apply_protect_sysctl(const ExecContext *c, const ExecParameters *p) {
9,498✔
1673
        assert(c);
9,498✔
1674
        assert(p);
9,498✔
1675

1676
        /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1677
         * let's protect even those systems where this is left on in the kernel. */
1678

1679
        if (!c->protect_kernel_tunables)
9,498✔
1680
                return 0;
1681

1682
        if (skip_seccomp_unavailable("ProtectKernelTunables="))
362✔
1683
                return 0;
1684

1685
        return seccomp_protect_sysctl();
362✔
1686
}
1687

1688
static int apply_protect_kernel_modules(const ExecContext *c, const ExecParameters *p) {
9,498✔
1689
        assert(c);
9,498✔
1690
        assert(p);
9,498✔
1691

1692
        /* Turn off module syscalls on ProtectKernelModules=yes */
1693

1694
        if (!c->protect_kernel_modules)
9,498✔
1695
                return 0;
1696

1697
        if (skip_seccomp_unavailable("ProtectKernelModules="))
1,127✔
1698
                return 0;
1699

1700
        return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
1,127✔
1701
}
1702

1703
static int apply_protect_kernel_logs(const ExecContext *c, const ExecParameters *p) {
9,498✔
1704
        assert(c);
9,498✔
1705
        assert(p);
9,498✔
1706

1707
        if (!c->protect_kernel_logs)
9,498✔
1708
                return 0;
1709

1710
        if (skip_seccomp_unavailable("ProtectKernelLogs="))
1,127✔
1711
                return 0;
1712

1713
        return seccomp_protect_syslog();
1,127✔
1714
}
1715

1716
static int apply_protect_clock(const ExecContext *c, const ExecParameters *p) {
9,498✔
1717
        assert(c);
9,498✔
1718
        assert(p);
9,498✔
1719

1720
        if (!c->protect_clock)
9,498✔
1721
                return 0;
1722

1723
        if (skip_seccomp_unavailable("ProtectClock="))
839✔
1724
                return 0;
1725

1726
        return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
839✔
1727
}
1728

1729
static int apply_private_devices(const ExecContext *c, const ExecParameters *p) {
9,498✔
1730
        assert(c);
9,498✔
1731
        assert(p);
9,498✔
1732

1733
        /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1734

1735
        if (!c->private_devices)
9,498✔
1736
                return 0;
1737

1738
        if (skip_seccomp_unavailable("PrivateDevices="))
676✔
1739
                return 0;
1740

1741
        return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
676✔
1742
}
1743

1744
static int apply_restrict_namespaces(const ExecContext *c, const ExecParameters *p) {
9,498✔
1745
        assert(c);
9,498✔
1746
        assert(p);
9,498✔
1747

1748
        if (!exec_context_restrict_namespaces_set(c))
9,498✔
1749
                return 0;
1750

1751
        if (skip_seccomp_unavailable("RestrictNamespaces="))
1,235✔
1752
                return 0;
1753

1754
        return seccomp_restrict_namespaces(c->restrict_namespaces);
1,235✔
1755
}
1756

1757
static int apply_lock_personality(const ExecContext *c, const ExecParameters *p) {
9,498✔
1758
        unsigned long personality;
9,498✔
1759
        int r;
9,498✔
1760

1761
        assert(c);
9,498✔
1762
        assert(p);
9,498✔
1763

1764
        if (!c->lock_personality)
9,498✔
1765
                return 0;
9,498✔
1766

1767
        if (skip_seccomp_unavailable("LockPersonality="))
1,489✔
1768
                return 0;
1769

1770
        personality = c->personality;
1,489✔
1771

1772
        /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1773
        if (personality == PERSONALITY_INVALID) {
1,489✔
1774

1775
                r = opinionated_personality(&personality);
1,489✔
1776
                if (r < 0)
1,489✔
1777
                        return r;
1778
        }
1779

1780
        return seccomp_lock_personality(personality);
1,489✔
1781
}
1782

1783
#endif
1784

1785
#if HAVE_LIBBPF
1786
static int apply_restrict_filesystems(const ExecContext *c, const ExecParameters *p) {
9,498✔
1787
        int r;
9,498✔
1788

1789
        assert(c);
9,498✔
1790
        assert(p);
9,498✔
1791

1792
        if (!exec_context_restrict_filesystems_set(c))
9,498✔
1793
                return 0;
1794

1795
        if (p->bpf_restrict_fs_map_fd < 0) {
×
1796
                /* LSM BPF is unsupported or lsm_bpf_setup failed */
1797
                log_debug("LSM BPF not supported, skipping RestrictFileSystems=");
×
1798
                return 0;
×
1799
        }
1800

1801
        /* We are in a new binary, so dl-open again */
1802
        r = dlopen_bpf();
×
1803
        if (r < 0)
×
1804
                return r;
1805

1806
        return bpf_restrict_fs_update(c->restrict_filesystems, p->cgroup_id, p->bpf_restrict_fs_map_fd, c->restrict_filesystems_allow_list);
×
1807
}
1808
#endif
1809

1810
static int apply_protect_hostname(const ExecContext *c, const ExecParameters *p, int *ret_exit_status) {
9,501✔
1811
        int r;
9,501✔
1812

1813
        assert(c);
9,501✔
1814
        assert(p);
9,501✔
1815
        assert(ret_exit_status);
9,501✔
1816

1817
        if (c->protect_hostname == PROTECT_HOSTNAME_NO)
9,501✔
1818
                return 0;
1819

1820
        if (namespace_type_supported(NAMESPACE_UTS)) {
665✔
1821
                if (unshare(CLONE_NEWUTS) < 0) {
665✔
1822
                        if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
×
1823
                                *ret_exit_status = EXIT_NAMESPACE;
×
1824
                                return log_error_errno(errno, "Failed to set up UTS namespacing: %m");
×
1825
                        }
1826

1827
                        log_warning("ProtectHostname=%s is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.",
×
1828
                                    protect_hostname_to_string(c->protect_hostname));
1829

1830
                } else if (c->private_hostname) {
665✔
1831
                        r = sethostname_idempotent(c->private_hostname);
4✔
1832
                        if (r < 0) {
4✔
1833
                                *ret_exit_status = EXIT_NAMESPACE;
×
1834
                                return log_error_errno(r, "Failed to set private hostname '%s': %m", c->private_hostname);
×
1835
                        }
1836
                }
1837
        } else
1838
                log_warning("ProtectHostname=%s is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.",
×
1839
                            protect_hostname_to_string(c->protect_hostname));
1840

1841
#if HAVE_SECCOMP
1842
        if (c->protect_hostname == PROTECT_HOSTNAME_YES) {
665✔
1843
                if (skip_seccomp_unavailable("ProtectHostname="))
659✔
1844
                        return 0;
1845

1846
                r = seccomp_protect_hostname();
659✔
1847
                if (r < 0) {
659✔
1848
                        *ret_exit_status = EXIT_SECCOMP;
×
1849
                        return log_error_errno(r, "Failed to apply hostname restrictions: %m");
×
1850
                }
1851
        }
1852
#endif
1853

1854
        return 1;
1855
}
1856

1857
static void do_idle_pipe_dance(int idle_pipe[static 4]) {
148✔
1858
        assert(idle_pipe);
148✔
1859

1860
        idle_pipe[1] = safe_close(idle_pipe[1]);
148✔
1861
        idle_pipe[2] = safe_close(idle_pipe[2]);
148✔
1862

1863
        if (idle_pipe[0] >= 0) {
148✔
1864
                int r;
148✔
1865

1866
                r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
148✔
1867

1868
                if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
148✔
1869
                        ssize_t n;
110✔
1870

1871
                        /* Signal systemd that we are bored and want to continue. */
1872
                        n = write(idle_pipe[3], "x", 1);
110✔
1873
                        if (n > 0)
110✔
1874
                                /* Wait for systemd to react to the signal above. */
1875
                                (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
110✔
1876
                }
1877

1878
                idle_pipe[0] = safe_close(idle_pipe[0]);
148✔
1879

1880
        }
1881

1882
        idle_pipe[3] = safe_close(idle_pipe[3]);
148✔
1883
}
148✔
1884

1885
static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1886

1887
/* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
1888
 * the service payload in. */
1889
static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
1890
        [EXEC_DIRECTORY_RUNTIME]       = "RUNTIME_DIRECTORY",
1891
        [EXEC_DIRECTORY_STATE]         = "STATE_DIRECTORY",
1892
        [EXEC_DIRECTORY_CACHE]         = "CACHE_DIRECTORY",
1893
        [EXEC_DIRECTORY_LOGS]          = "LOGS_DIRECTORY",
1894
        [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
1895
};
1896

1897
DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
2,491✔
1898

1899
static int build_environment(
9,523✔
1900
                const ExecContext *c,
1901
                const ExecParameters *p,
1902
                const CGroupContext *cgroup_context,
1903
                size_t n_fds,
1904
                const char *home,
1905
                const char *username,
1906
                const char *shell,
1907
                dev_t journal_stream_dev,
1908
                ino_t journal_stream_ino,
1909
                const char *memory_pressure_path,
1910
                bool needs_sandboxing,
1911
                char ***ret) {
1912

1913
        _cleanup_strv_free_ char **our_env = NULL;
9,523✔
1914
        size_t n_env = 0;
9,523✔
1915
        char *x;
9,523✔
1916
        int r;
9,523✔
1917

1918
        assert(c);
9,523✔
1919
        assert(p);
9,523✔
1920
        assert(cgroup_context);
9,523✔
1921
        assert(ret);
9,523✔
1922

1923
#define N_ENV_VARS 21
1924
        our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
9,523✔
1925
        if (!our_env)
9,523✔
1926
                return -ENOMEM;
1927

1928
        if (n_fds > 0) {
9,523✔
1929
                _cleanup_free_ char *joined = NULL;
1,532✔
1930

1931
                if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1,532✔
1932
                        return -ENOMEM;
1933
                our_env[n_env++] = x;
1,532✔
1934

1935
                if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
1,532✔
1936
                        return -ENOMEM;
1937
                our_env[n_env++] = x;
1,532✔
1938

1939
                joined = strv_join(p->fd_names, ":");
1,532✔
1940
                if (!joined)
1,532✔
1941
                        return -ENOMEM;
1942

1943
                x = strjoin("LISTEN_FDNAMES=", joined);
1,532✔
1944
                if (!x)
1,532✔
1945
                        return -ENOMEM;
1946
                our_env[n_env++] = x;
1,532✔
1947
        }
1948

1949
        if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
9,523✔
1950
                if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1,483✔
1951
                        return -ENOMEM;
1952
                our_env[n_env++] = x;
1,483✔
1953

1954
                if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1,483✔
1955
                        return -ENOMEM;
1956
                our_env[n_env++] = x;
1,483✔
1957
        }
1958

1959
        /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use blocking
1960
         * Varlink calls back to us for look up dynamic users in PID 1. Break the deadlock between D-Bus and
1961
         * PID 1 by disabling use of PID1' NSS interface for looking up dynamic users. */
1962
        if (p->flags & EXEC_NSS_DYNAMIC_BYPASS) {
9,523✔
1963
                x = strdup("SYSTEMD_NSS_DYNAMIC_BYPASS=1");
121✔
1964
                if (!x)
121✔
1965
                        return -ENOMEM;
1966
                our_env[n_env++] = x;
121✔
1967
        }
1968

1969
        /* We query "root" if this is a system unit and User= is not specified. $USER is always set. $HOME
1970
         * could cause problem for e.g. getty, since login doesn't override $HOME, and $LOGNAME and $SHELL don't
1971
         * really make much sense since we're not logged in. Hence we conditionalize the three based on
1972
         * SetLoginEnvironment= switch. */
1973
        if (!username && !c->dynamic_user && p->runtime_scope == RUNTIME_SCOPE_SYSTEM) {
9,523✔
1974
                assert(!c->user);
6,745✔
1975

1976
                r = get_fixed_user("root", /* prefer_nss = */ false, &username, NULL, NULL, &home, &shell);
6,745✔
1977
                if (r < 0)
6,745✔
1978
                        return log_debug_errno(r, "Failed to determine user credentials for root: %m");
×
1979
        }
1980

1981
        bool set_user_login_env = exec_context_get_set_login_environment(c);
9,523✔
1982

1983
        if (username) {
9,523✔
1984
                x = strjoin("USER=", username);
8,737✔
1985
                if (!x)
8,737✔
1986
                        return -ENOMEM;
1987
                our_env[n_env++] = x;
8,737✔
1988

1989
                if (set_user_login_env) {
8,737✔
1990
                        x = strjoin("LOGNAME=", username);
1,992✔
1991
                        if (!x)
1,992✔
1992
                                return -ENOMEM;
1993
                        our_env[n_env++] = x;
1,992✔
1994
                }
1995
        }
1996

1997
        /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
1998
         * (i.e. are "/" or "/bin/nologin"). */
1999

2000
        if (home && set_user_login_env && !empty_or_root(home)) {
9,523✔
2001
                x = strjoin("HOME=", home);
399✔
2002
                if (!x)
399✔
2003
                        return -ENOMEM;
2004

2005
                path_simplify(x + 5);
399✔
2006
                our_env[n_env++] = x;
399✔
2007
        }
2008

2009
        if (shell && set_user_login_env && !shell_is_placeholder(shell)) {
9,523✔
2010
                x = strjoin("SHELL=", shell);
401✔
2011
                if (!x)
401✔
2012
                        return -ENOMEM;
2013

2014
                path_simplify(x + 6);
401✔
2015
                our_env[n_env++] = x;
401✔
2016
        }
2017

2018
        if (!sd_id128_is_null(p->invocation_id)) {
9,523✔
2019
                assert(p->invocation_id_string);
9,523✔
2020

2021
                x = strjoin("INVOCATION_ID=", p->invocation_id_string);
9,523✔
2022
                if (!x)
9,523✔
2023
                        return -ENOMEM;
2024

2025
                our_env[n_env++] = x;
9,523✔
2026
        }
2027

2028
        if (exec_context_needs_term(c)) {
9,523✔
2029
                _cleanup_free_ char *cmdline = NULL;
444✔
2030
                const char *tty_path, *term = NULL;
444✔
2031

2032
                tty_path = exec_context_tty_path(c);
444✔
2033

2034
                /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
2035
                 * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
2036
                 * container manager passes to PID 1 ends up all the way in the console login shown. */
2037

2038
                if (path_equal(tty_path, "/dev/console") && getppid() == 1)
444✔
2039
                        term = getenv("TERM");
388✔
2040
                else if (tty_path && in_charset(skip_dev_prefix(tty_path), ALPHANUMERICAL)) {
56✔
2041
                        _cleanup_free_ char *key = NULL;
40✔
2042

2043
                        key = strjoin("systemd.tty.term.", skip_dev_prefix(tty_path));
40✔
2044
                        if (!key)
40✔
2045
                                return -ENOMEM;
×
2046

2047
                        r = proc_cmdline_get_key(key, 0, &cmdline);
40✔
2048
                        if (r < 0)
40✔
2049
                                log_debug_errno(r, "Failed to read %s from kernel cmdline, ignoring: %m", key);
40✔
2050
                        else if (r > 0)
40✔
2051
                                term = cmdline;
×
2052
                }
2053

2054
                if (!term) {
428✔
2055
                        /* If no precise $TERM is known and we pick a fallback default, then let's also set
2056
                         * $COLORTERM=truecolor. That's because our fallback default is vt220, which is
2057
                         * generally a safe bet (as it supports PageUp/PageDown unlike vt100, and is quite
2058
                         * universally available in terminfo/termcap), except for the fact that real DEC
2059
                         * vt220 gear never actually supported color. Most tools these days generate color on
2060
                         * vt220 anyway, ignoring the physical capabilities of the real hardware, but some
2061
                         * tools actually believe in the historical truth. Which is unfortunate since *we*
2062
                         * *don't* care about the historical truth, we just want sane defaults if nothing
2063
                         * better is explicitly configured. It's 2025 after all, at the time of writing,
2064
                         * pretty much all terminal emulators actually *do* support color, hence if we don't
2065
                         * know any better let's explicitly claim color support via $COLORTERM. Or in other
2066
                         * words: we now explicitly claim to be connected to a franken-vt220 with true color
2067
                         * support. */
2068
                        x = strdup("COLORTERM=truecolor");
56✔
2069
                        if (!x)
56✔
2070
                                return -ENOMEM;
2071

2072
                        our_env[n_env++] = x;
56✔
2073

2074
                        term = default_term_for_tty(tty_path);
56✔
2075
                }
2076

2077
                x = strjoin("TERM=", term);
444✔
2078
                if (!x)
444✔
2079
                        return -ENOMEM;
2080
                our_env[n_env++] = x;
444✔
2081
        }
2082

2083
        if (journal_stream_dev != 0 && journal_stream_ino != 0) {
9,523✔
2084
                if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
8,757✔
2085
                        return -ENOMEM;
2086

2087
                our_env[n_env++] = x;
8,757✔
2088
        }
2089

2090
        if (c->log_namespace) {
9,523✔
2091
                x = strjoin("LOG_NAMESPACE=", c->log_namespace);
2✔
2092
                if (!x)
2✔
2093
                        return -ENOMEM;
2094

2095
                our_env[n_env++] = x;
2✔
2096
        }
2097

2098
        for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
57,138✔
2099
                _cleanup_free_ char *joined = NULL;
47,615✔
2100
                const char *n;
47,615✔
2101

2102
                if (!p->prefix[t])
47,615✔
2103
                        continue;
×
2104

2105
                if (c->directories[t].n_items == 0)
47,615✔
2106
                        continue;
45,124✔
2107

2108
                n = exec_directory_env_name_to_string(t);
2,491✔
2109
                if (!n)
2,491✔
2110
                        continue;
×
2111

2112
                for (size_t i = 0; i < c->directories[t].n_items; i++) {
5,478✔
2113
                        _cleanup_free_ char *prefixed = NULL;
2,987✔
2114

2115
                        prefixed = path_join(p->prefix[t], c->directories[t].items[i].path);
2,987✔
2116
                        if (!prefixed)
2,987✔
2117
                                return -ENOMEM;
2118

2119
                        if (!strextend_with_separator(&joined, ":", prefixed))
2,987✔
2120
                                return -ENOMEM;
2121
                }
2122

2123
                x = strjoin(n, "=", joined);
2,491✔
2124
                if (!x)
2,491✔
2125
                        return -ENOMEM;
2126

2127
                our_env[n_env++] = x;
2,491✔
2128
        }
2129

2130
        _cleanup_free_ char *creds_dir = NULL;
9,523✔
2131
        r = exec_context_get_credential_directory(c, p, p->unit_id, &creds_dir);
9,523✔
2132
        if (r < 0)
9,523✔
2133
                return r;
2134
        if (r > 0) {
9,523✔
2135
                x = strjoin("CREDENTIALS_DIRECTORY=", creds_dir);
1,936✔
2136
                if (!x)
1,936✔
2137
                        return -ENOMEM;
2138

2139
                our_env[n_env++] = x;
1,936✔
2140
        }
2141

2142
        if (asprintf(&x, "SYSTEMD_EXEC_PID=" PID_FMT, getpid_cached()) < 0)
9,523✔
2143
                return -ENOMEM;
2144

2145
        our_env[n_env++] = x;
9,523✔
2146

2147
        if (memory_pressure_path) {
9,523✔
2148
                x = strjoin("MEMORY_PRESSURE_WATCH=", memory_pressure_path);
9,126✔
2149
                if (!x)
9,126✔
2150
                        return -ENOMEM;
2151

2152
                our_env[n_env++] = x;
9,126✔
2153

2154
                if (!path_equal(memory_pressure_path, "/dev/null")) {
9,126✔
2155
                        _cleanup_free_ char *b = NULL, *e = NULL;
9,126✔
2156

2157
                        if (asprintf(&b, "%s " USEC_FMT " " USEC_FMT,
9,126✔
2158
                                     MEMORY_PRESSURE_DEFAULT_TYPE,
2159
                                     cgroup_context->memory_pressure_threshold_usec == USEC_INFINITY ? MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC :
9,126✔
2160
                                     CLAMP(cgroup_context->memory_pressure_threshold_usec, 1U, MEMORY_PRESSURE_DEFAULT_WINDOW_USEC),
9,126✔
2161
                                     MEMORY_PRESSURE_DEFAULT_WINDOW_USEC) < 0)
2162
                                return -ENOMEM;
2163

2164
                        if (base64mem(b, strlen(b) + 1, &e) < 0)
9,126✔
2165
                                return -ENOMEM;
2166

2167
                        x = strjoin("MEMORY_PRESSURE_WRITE=", e);
9,126✔
2168
                        if (!x)
9,126✔
2169
                                return -ENOMEM;
2170

2171
                        our_env[n_env++] = x;
9,126✔
2172
                }
2173
        }
2174

2175
        if (p->notify_socket) {
9,523✔
2176
                x = strjoin("NOTIFY_SOCKET=", exec_get_private_notify_socket_path(c, p, needs_sandboxing) ?: p->notify_socket);
1,884✔
2177
                if (!x)
1,884✔
2178
                        return -ENOMEM;
2179

2180
                our_env[n_env++] = x;
1,884✔
2181
        }
2182

2183
        assert(n_env < N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
9,523✔
2184
#undef N_ENV_VARS
2185

2186
        *ret = TAKE_PTR(our_env);
9,523✔
2187

2188
        return 0;
9,523✔
2189
}
2190

2191
static int build_pass_environment(const ExecContext *c, char ***ret) {
9,523✔
2192
        _cleanup_strv_free_ char **pass_env = NULL;
9,523✔
2193
        size_t n_env = 0;
9,523✔
2194

2195
        assert(c);
9,523✔
2196
        assert(ret);
9,523✔
2197

2198
        STRV_FOREACH(i, c->pass_environment) {
9,824✔
2199
                _cleanup_free_ char *x = NULL;
×
2200
                char *v;
301✔
2201

2202
                v = getenv(*i);
301✔
2203
                if (!v)
301✔
2204
                        continue;
×
2205
                x = strjoin(*i, "=", v);
301✔
2206
                if (!x)
301✔
2207
                        return -ENOMEM;
2208

2209
                if (!GREEDY_REALLOC(pass_env, n_env + 2))
301✔
2210
                        return -ENOMEM;
2211

2212
                pass_env[n_env++] = TAKE_PTR(x);
301✔
2213
                pass_env[n_env] = NULL;
301✔
2214
        }
2215

2216
        *ret = TAKE_PTR(pass_env);
9,523✔
2217
        return 0;
9,523✔
2218
}
2219

2220
static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogid, uid_t uid, gid_t gid, bool allow_setgroups) {
9,507✔
2221
        _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
9,507✔
2222
        _cleanup_close_pair_ int errno_pipe[2] = EBADF_PAIR;
9,507✔
2223
        _cleanup_close_ int unshare_ready_fd = -EBADF;
9,507✔
2224
        _cleanup_(sigkill_waitp) pid_t pid = 0;
9,507✔
2225
        uint64_t c = 1;
9,507✔
2226
        ssize_t n;
9,507✔
2227
        int r;
9,507✔
2228

2229
        /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
2230
         * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
2231
         * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
2232
         * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
2233
         * which waits for the parent to create the new user namespace while staying in the original namespace. The
2234
         * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
2235
         * continues execution normally.
2236
         * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
2237
         * does not need CAP_SETUID to write the single line mapping to itself. */
2238

2239
        if (private_users == PRIVATE_USERS_NO)
9,507✔
2240
                return 0;
2241

2242
        if (private_users == PRIVATE_USERS_IDENTITY) {
50✔
2243
                uid_map = strdup("0 0 65536\n");
4✔
2244
                if (!uid_map)
4✔
2245
                        return -ENOMEM;
2246
        } else if (private_users == PRIVATE_USERS_FULL) {
46✔
2247
                /* Map all UID/GID from original to new user namespace. We can't use `0 0 UINT32_MAX` because
2248
                 * this is the same UID/GID map as the init user namespace and systemd's running_in_userns()
2249
                 * checks whether its in a user namespace by comparing uid_map/gid_map to `0 0 UINT32_MAX`.
2250
                 * Thus, we still map all UIDs/GIDs but do it using two extents to differentiate the new user
2251
                 * namespace from the init namespace:
2252
                 *   0 0 1
2253
                 *   1 1 UINT32_MAX - 1
2254
                 *
2255
                 * systemd will remove the heuristic in running_in_userns() and use namespace inodes in version 258
2256
                 * (PR #35382). But some users may be running a container image with older systemd < 258 so we keep
2257
                 * this uid_map/gid_map hack until version 259 for version N-1 compatibility.
2258
                 *
2259
                 * TODO: Switch to `0 0 UINT32_MAX` in systemd v259.
2260
                 *
2261
                 * Note the kernel defines the UID range between 0 and UINT32_MAX so we map all UIDs even though
2262
                 * the UID range beyond INT32_MAX (e.g. i.e. the range above the signed 32-bit range) is
2263
                 * icky. For example, setfsuid() returns the old UID as signed integer. But units can decide to
2264
                 * use these UIDs/GIDs so we need to map them. */
2265
                r = asprintf(&uid_map, "0 0 1\n"
3✔
2266
                                       "1 1 " UID_FMT "\n", (uid_t) (UINT32_MAX - 1));
2267
                if (r < 0)
3✔
2268
                        return -ENOMEM;
2269
        /* Can only set up multiple mappings with CAP_SETUID. */
2270
        } else if (have_effective_cap(CAP_SETUID) > 0 && uid != ouid && uid_is_valid(uid)) {
43✔
2271
                r = asprintf(&uid_map,
2✔
2272
                             UID_FMT " " UID_FMT " 1\n"     /* Map $OUID → $OUID */
2273
                             UID_FMT " " UID_FMT " 1\n",    /* Map $UID → $UID */
2274
                             ouid, ouid, uid, uid);
2275
                if (r < 0)
2✔
2276
                        return -ENOMEM;
2277
        } else {
2278
                r = asprintf(&uid_map,
41✔
2279
                             UID_FMT " " UID_FMT " 1\n",    /* Map $OUID → $OUID */
2280
                             ouid, ouid);
2281
                if (r < 0)
41✔
2282
                        return -ENOMEM;
2283
        }
2284

2285
        if (private_users == PRIVATE_USERS_IDENTITY) {
50✔
2286
                gid_map = strdup("0 0 65536\n");
4✔
2287
                if (!gid_map)
4✔
2288
                        return -ENOMEM;
2289
        } else if (private_users == PRIVATE_USERS_FULL) {
46✔
2290
                r = asprintf(&gid_map, "0 0 1\n"
3✔
2291
                                       "1 1 " GID_FMT "\n", (gid_t) (UINT32_MAX - 1));
2292
                if (r < 0)
3✔
2293
                        return -ENOMEM;
2294
        /* Can only set up multiple mappings with CAP_SETGID. */
2295
        } else if (have_effective_cap(CAP_SETGID) > 0 && gid != ogid && gid_is_valid(gid)) {
59✔
2296
                r = asprintf(&gid_map,
2✔
2297
                             GID_FMT " " GID_FMT " 1\n"     /* Map $OGID → $OGID */
2298
                             GID_FMT " " GID_FMT " 1\n",    /* Map $GID → $GID */
2299
                             ogid, ogid, gid, gid);
2300
                if (r < 0)
2✔
2301
                        return -ENOMEM;
2302
        } else {
2303
                r = asprintf(&gid_map,
41✔
2304
                             GID_FMT " " GID_FMT " 1\n",    /* Map $OGID -> $OGID */
2305
                             ogid, ogid);
2306
                if (r < 0)
41✔
2307
                        return -ENOMEM;
2308
        }
2309

2310
        /* Create a communication channel so that the parent can tell the child when it finished creating the user
2311
         * namespace. */
2312
        unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
50✔
2313
        if (unshare_ready_fd < 0)
50✔
2314
                return -errno;
×
2315

2316
        /* Create a communication channel so that the child can tell the parent a proper error code in case it
2317
         * failed. */
2318
        if (pipe2(errno_pipe, O_CLOEXEC) < 0)
50✔
2319
                return -errno;
×
2320

2321
        r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGKILL, &pid);
50✔
2322
        if (r < 0)
100✔
2323
                return r;
2324
        if (r == 0) {
100✔
2325
                _cleanup_close_ int fd = -EBADF;
×
2326
                const char *a;
50✔
2327
                pid_t ppid;
50✔
2328

2329
                /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
2330
                 * here, after the parent opened its own user namespace. */
2331

2332
                ppid = getppid();
50✔
2333
                errno_pipe[0] = safe_close(errno_pipe[0]);
50✔
2334

2335
                /* Wait until the parent unshared the user namespace */
2336
                if (read(unshare_ready_fd, &c, sizeof(c)) < 0)
50✔
2337
                        report_errno_and_exit(errno_pipe[1], -errno);
×
2338

2339
                /* Disable the setgroups() system call in the child user namespace, for good, unless PrivateUsers=full
2340
                 * and using the system service manager. */
2341
                a = procfs_file_alloca(ppid, "setgroups");
50✔
2342
                fd = open(a, O_WRONLY|O_CLOEXEC);
50✔
2343
                if (fd < 0) {
50✔
2344
                        if (errno != ENOENT) {
×
2345
                                r = log_debug_errno(errno, "Failed to open %s: %m", a);
×
2346
                                report_errno_and_exit(errno_pipe[1], r);
×
2347
                        }
2348

2349
                        /* If the file is missing the kernel is too old, let's continue anyway. */
2350
                } else {
2351
                        const char *setgroups = allow_setgroups ? "allow\n" : "deny\n";
50✔
2352
                        if (write(fd, setgroups, strlen(setgroups)) < 0) {
50✔
2353
                                r = log_debug_errno(errno, "Failed to write '%s' to %s: %m", setgroups, a);
×
2354
                                report_errno_and_exit(errno_pipe[1], r);
×
2355
                        }
2356

2357
                        fd = safe_close(fd);
50✔
2358
                }
2359

2360
                /* First write the GID map */
2361
                a = procfs_file_alloca(ppid, "gid_map");
50✔
2362
                fd = open(a, O_WRONLY|O_CLOEXEC);
50✔
2363
                if (fd < 0) {
50✔
2364
                        r = log_debug_errno(errno, "Failed to open %s: %m", a);
×
2365
                        report_errno_and_exit(errno_pipe[1], r);
×
2366
                }
2367

2368
                if (write(fd, gid_map, strlen(gid_map)) < 0) {
50✔
2369
                        r = log_debug_errno(errno, "Failed to write GID map to %s: %m", a);
×
2370
                        report_errno_and_exit(errno_pipe[1], r);
×
2371
                }
2372

2373
                fd = safe_close(fd);
50✔
2374

2375
                /* Then write the UID map */
2376
                a = procfs_file_alloca(ppid, "uid_map");
50✔
2377
                fd = open(a, O_WRONLY|O_CLOEXEC);
50✔
2378
                if (fd < 0) {
50✔
2379
                        r = log_debug_errno(errno, "Failed to open %s: %m", a);
×
2380
                        report_errno_and_exit(errno_pipe[1], r);
×
2381
                }
2382

2383
                if (write(fd, uid_map, strlen(uid_map)) < 0) {
50✔
2384
                        r = log_debug_errno(errno, "Failed to write UID map to %s: %m", a);
×
2385
                        report_errno_and_exit(errno_pipe[1], r);
×
2386
                }
2387

2388
                _exit(EXIT_SUCCESS);
50✔
2389
        }
2390

2391
        errno_pipe[1] = safe_close(errno_pipe[1]);
50✔
2392

2393
        if (unshare(CLONE_NEWUSER) < 0)
50✔
2394
                return log_debug_errno(errno, "Failed to unshare user namespace: %m");
×
2395

2396
        /* Let the child know that the namespace is ready now */
2397
        if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
50✔
2398
                return -errno;
×
2399

2400
        /* Try to read an error code from the child */
2401
        n = read(errno_pipe[0], &r, sizeof(r));
50✔
2402
        if (n < 0)
50✔
2403
                return -errno;
×
2404
        if (n == sizeof(r)) { /* an error code was sent to us */
50✔
2405
                if (r < 0)
×
2406
                        return r;
2407
                return -EIO;
×
2408
        }
2409
        if (n != 0) /* on success we should have read 0 bytes */
50✔
2410
                return -EIO;
2411

2412
        r = wait_for_terminate_and_check("(sd-userns)", TAKE_PID(pid), 0);
50✔
2413
        if (r < 0)
50✔
2414
                return r;
2415
        if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
50✔
2416
                return -EIO;
×
2417

2418
        return 1;
2419
}
2420

2421
static int can_mount_proc(void) {
8✔
2422
        _cleanup_close_pair_ int errno_pipe[2] = EBADF_PAIR;
4✔
2423
        _cleanup_(sigkill_waitp) pid_t pid = 0;
×
2424
        ssize_t n;
8✔
2425
        int r;
8✔
2426

2427
        /* If running via unprivileged user manager and /proc/ is masked (e.g. /proc/kmsg is over-mounted with tmpfs
2428
         * like systemd-nspawn does), then mounting /proc/ will fail with EPERM. This is due to a kernel restriction
2429
         * where unprivileged user namespaces cannot mount a less restrictive instance of /proc. */
2430

2431
        /* Create a communication channel so that the child can tell the parent a proper error code in case it
2432
         * failed. */
2433
        if (pipe2(errno_pipe, O_CLOEXEC) < 0)
8✔
2434
                return log_debug_errno(errno, "Failed to create pipe for communicating with child process (sd-proc-check): %m");
×
2435

2436
        /* Fork a child process into its own mount and PID namespace. Note safe_fork() already remounts / as SLAVE
2437
         * with FORK_MOUNTNS_SLAVE. */
2438
        r = safe_fork("(sd-proc-check)",
8✔
2439
                      FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGKILL|FORK_NEW_MOUNTNS|FORK_MOUNTNS_SLAVE|FORK_NEW_PIDNS, &pid);
2440
        if (r < 0)
8✔
2441
                return log_debug_errno(r, "Failed to fork child process (sd-proc-check): %m");
×
2442
        if (r == 0) {
8✔
2443
                errno_pipe[0] = safe_close(errno_pipe[0]);
4✔
2444

2445
                /* Try mounting /proc on /dev/shm/. No need to clean up the mount since the mount
2446
                 * namespace will be cleaned up once the process exits. */
2447
                r = mount_follow_verbose(LOG_DEBUG, "proc", "/dev/shm/", "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
4✔
2448
                if (r < 0) {
4✔
2449
                        (void) write(errno_pipe[1], &r, sizeof(r));
1✔
2450
                        _exit(EXIT_FAILURE);
1✔
2451
                }
2452

2453
                _exit(EXIT_SUCCESS);
3✔
2454
        }
2455

2456
        errno_pipe[1] = safe_close(errno_pipe[1]);
4✔
2457

2458
        /* Try to read an error code from the child */
2459
        n = read(errno_pipe[0], &r, sizeof(r));
4✔
2460
        if (n < 0)
4✔
2461
                return log_debug_errno(errno, "Failed to read errno from pipe with child process (sd-proc-check): %m");
×
2462
        if (n == sizeof(r)) { /* an error code was sent to us */
4✔
2463
                /* This is the expected case where proc cannot be mounted due to permissions. */
2464
                if (ERRNO_IS_NEG_PRIVILEGE(r))
4✔
2465
                        return 0;
2466
                if (r < 0)
×
2467
                        return r;
2468

2469
                return -EIO;
×
2470
        }
2471
        if (n != 0) /* on success we should have read 0 bytes */
3✔
2472
                return -EIO;
2473

2474
        r = wait_for_terminate_and_check("(sd-proc-check)", TAKE_PID(pid), 0 /* flags= */);
3✔
2475
        if (r < 0)
3✔
2476
                return log_debug_errno(r, "Failed to wait for (sd-proc-check) child process to terminate: %m");
×
2477
        if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
3✔
2478
                return log_debug_errno(SYNTHETIC_ERRNO(EIO), "Child process (sd-proc-check) exited with unexpected exit status '%d'.", r);
×
2479

2480
        return 1;
2481
}
2482

2483
/* Forks the calling process into a new PID namespace and hands the child's pidref to the manager.
 *
 * The first process created after unsharing a PID namespace becomes PID 1 of that namespace, hence a
 * fork is required. The forking (parent) side sends the child's pidref over p->pidref_transport_fd,
 * tells the child how that went through a pipe and then exits. Only the child ever returns from this
 * function; it goes on with the rest of exec_invoke() and eventually executes the payload.
 *
 * Returns a non-negative value in the child on success, a negative errno-style value on failure.
 * Closes p->pidref_transport_fd in the child. */
static int setup_private_pids(const ExecContext *c, ExecParameters *p) {
        _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
        _cleanup_close_pair_ int errno_pipe[2] = EBADF_PAIR;
        int fork_result, send_result, parent_result;
        ssize_t n;

        assert(c);
        assert(p);
        assert(p->pidref_transport_fd >= 0);

        /* Through this pipe the parent reports to the child whether passing the child's pidref on to
         * the manager worked. */
        if (pipe2(errno_pipe, O_CLOEXEC) < 0)
                return log_debug_errno(errno, "Failed to create pipe for communicating with parent process: %m");

        /* FORK_DETACH immediately re-parents the child to the invoking manager process. */
        fork_result = pidref_safe_fork("(sd-pidns-child)", FORK_NEW_PIDNS|FORK_DETACH, &pidref);
        if (fork_result < 0)
                return log_debug_errno(fork_result, "Failed to fork child into new pid namespace: %m");

        if (fork_result > 0) {
                /* Parent side: pass the child pidref to the manager, then terminate. Without PIDFD
                 * support only the child PID is transmitted, which the server then uses to set the
                 * new exec main process. */
                errno_pipe[0] = safe_close(errno_pipe[0]);

                send_result = send_one_fd_iov(
                                p->pidref_transport_fd,
                                pidref.fd,
                                &IOVEC_MAKE(&pidref.pid, sizeof(pidref.pid)),
                                /*iovlen=*/ 1,
                                /*flags=*/ 0);

                /* Let the child know how it went. */
                (void) write(errno_pipe[1], &send_result, sizeof(send_result));

                /* Terminate right here so that the destructors of exec_invoke run exactly once, in
                 * the child, since some of them have external effects. */
                if (send_result < 0)
                        _exit(EXIT_FAILURE);
                _exit(EXIT_SUCCESS);
        }

        /* Child side from here on. */
        errno_pipe[1] = safe_close(errno_pipe[1]);
        p->pidref_transport_fd = safe_close(p->pidref_transport_fd);

        /* A child cannot wait for its parent, which is why the parent always sends a result code,
         * even when everything went fine. */
        n = read(errno_pipe[0], &parent_result, sizeof(parent_result));
        if (n < 0)
                return log_debug_errno(errno, "Failed to read errno from pipe with parent process: %m");
        if (n != sizeof(parent_result))
                return log_debug_errno(SYNTHETIC_ERRNO(EIO), "Failed to read enough bytes from pipe with parent process");
        if (parent_result < 0)
                return log_debug_errno(parent_result, "Failed to send child pidref to manager: %m");

        /* NOTE! Only the child process gets here. */
        return parent_result;
}
2542

2543
static int create_many_symlinks(const char *root, const char *source, char **symlinks) {
1,526✔
2544
        _cleanup_free_ char *src_abs = NULL;
1,526✔
2545
        int r;
1,526✔
2546

2547
        assert(source);
1,526✔
2548

2549
        src_abs = path_join(root, source);
1,526✔
2550
        if (!src_abs)
1,526✔
2551
                return -ENOMEM;
2552

2553
        STRV_FOREACH(dst, symlinks) {
1,539✔
2554
                _cleanup_free_ char *dst_abs = NULL;
13✔
2555

2556
                dst_abs = path_join(root, *dst);
13✔
2557
                if (!dst_abs)
13✔
2558
                        return -ENOMEM;
2559

2560
                r = mkdir_parents_label(dst_abs, 0755);
13✔
2561
                if (r < 0)
13✔
2562
                        return r;
2563

2564
                r = symlink_idempotent(src_abs, dst_abs, true);
13✔
2565
                if (r < 0)
13✔
2566
                        return r;
2567
        }
2568

2569
        return 0;
2570
}
2571

2572
/* Sets up all directories of the given type that are listed in context->directories[type], below
 * params->prefix[type].
 *
 * For each configured entry this:
 *   - creates the parent directories (mode 0755);
 *   - in user scope, for state and logs directories, creates a compatibility symlink to a matching
 *     pre-existing directory under the configuration prefix instead of a new directory;
 *   - if exec_directory_is_private() says so, creates the directory below a root-owned 0700
 *     "private" subdirectory (migrating a pre-existing public directory there) and symlinks it from
 *     the public location unless EXEC_DIRECTORY_ONLY_CREATE is set for the entry;
 *   - otherwise migrates a directory back out of "private" if the public path is a symlink pointing
 *     there, and creates the directory at the public location;
 *   - applies the configured access mode and, in system scope, adjusts ownership recursively,
 *     recording in the entry's 'idmapped' field whether 'nobody' ownership is in use.
 *
 * If 'needs_mount_namespace' is false, the per-entry symlinks are created as well.
 *
 * Returns 0 on success (also when no prefix is configured for 'type'). On failure returns a negative
 * errno-style value and stores the exit status matching 'type' in *exit_status. */
static int setup_exec_directory(
57,341✔
2573
                const ExecContext *context,
2574
                const ExecParameters *params,
2575
                uid_t uid,
2576
                gid_t gid,
2577
                ExecDirectoryType type,
2578
                bool needs_mount_namespace,
2579
                int *exit_status) {
2580

2581
        /* Exit status reported via *exit_status on failure, indexed by directory type. */
        static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
57,341✔
2582
                [EXEC_DIRECTORY_RUNTIME]       = EXIT_RUNTIME_DIRECTORY,
2583
                [EXEC_DIRECTORY_STATE]         = EXIT_STATE_DIRECTORY,
2584
                [EXEC_DIRECTORY_CACHE]         = EXIT_CACHE_DIRECTORY,
2585
                [EXEC_DIRECTORY_LOGS]          = EXIT_LOGS_DIRECTORY,
2586
                [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2587
        };
2588
        int r;
57,341✔
2589

2590
        assert(context);
57,341✔
2591
        assert(params);
57,341✔
2592
        assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
57,341✔
2593
        assert(exit_status);
57,341✔
2594

2595
        /* No prefix configured for this directory type: nothing to do. */
        if (!params->prefix[type])
57,341✔
2596
                return 0;
2597

2598
        /* When directories shall be chowned, an invalid UID/GID falls back to 0. */
        if (params->flags & EXEC_CHOWN_DIRECTORIES) {
57,341✔
2599
                if (!uid_is_valid(uid))
53,406✔
2600
                        uid = 0;
40,211✔
2601
                if (!gid_is_valid(gid))
53,406✔
2602
                        gid = 0;
40,191✔
2603
        }
2604

2605
        FOREACH_ARRAY(i, context->directories[type].items, context->directories[type].n_items) {
61,040✔
2606
                /* p is the public path of the directory, pp (if set) its location below "private". */
                _cleanup_free_ char *p = NULL, *pp = NULL;
3,700✔
2607

2608
                p = path_join(params->prefix[type], i->path);
3,700✔
2609
                if (!p) {
3,700✔
2610
                        r = -ENOMEM;
×
2611
                        goto fail;
×
2612
                }
2613

2614
                r = mkdir_parents_label(p, 0755);
3,700✔
2615
                if (r < 0)
3,700✔
2616
                        goto fail;
×
2617

2618
                if (IN_SET(type, EXEC_DIRECTORY_STATE, EXEC_DIRECTORY_LOGS) && params->runtime_scope == RUNTIME_SCOPE_USER) {
3,700✔
2619

2620
                        /* If we are in user mode, and a configuration directory exists but a state directory
2621
                         * doesn't exist, then we likely are upgrading from an older systemd version that
2622
                         * didn't know the more recent addition to the xdg-basedir spec: the $XDG_STATE_HOME
2623
                         * directory. In older systemd versions EXEC_DIRECTORY_STATE was aliased to
2624
                         * EXEC_DIRECTORY_CONFIGURATION, with the advent of $XDG_STATE_HOME it is now
2625
                         * separated. If a service has both dirs configured but only the configuration dir
2626
                         * exists and the state dir does not, we assume we are looking at an update
2627
                         * situation. Hence, create a compatibility symlink, so that all expectations are
2628
                         * met.
2629
                         *
2630
                         * (We also do something similar with the log directory, which still doesn't exist in
2631
                         * the xdg basedir spec. We'll make it a subdir of the state dir.) */
2632

2633
                        /* this assumes the state dir is always created before the configuration dir */
2634
                        assert_cc(EXEC_DIRECTORY_STATE < EXEC_DIRECTORY_LOGS);
7✔
2635
                        assert_cc(EXEC_DIRECTORY_LOGS < EXEC_DIRECTORY_CONFIGURATION);
7✔
2636

2637
                        r = access_nofollow(p, F_OK);
7✔
2638
                        if (r == -ENOENT) {
3✔
2639
                                _cleanup_free_ char *q = NULL;
3✔
2640

2641
                                /* OK, we know that the state dir does not exist. Let's see if the dir exists
2642
                                 * under the configuration hierarchy. */
2643

2644
                                if (type == EXEC_DIRECTORY_STATE)
3✔
2645
                                        q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], i->path);
3✔
2646
                                else if (type == EXEC_DIRECTORY_LOGS)
×
2647
                                        q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], "log", i->path);
×
2648
                                else
2649
                                        assert_not_reached();
×
2650
                                if (!q) {
3✔
2651
                                        r = -ENOMEM;
×
2652
                                        goto fail;
×
2653
                                }
2654

2655
                                r = access_nofollow(q, F_OK);
3✔
2656
                                if (r >= 0) {
2✔
2657
                                        /* It does exist! This hence looks like an update. Symlink the
2658
                                         * configuration directory into the state directory. */
2659

2660
                                        r = symlink_idempotent(q, p, /* make_relative= */ true);
1✔
2661
                                        if (r < 0)
1✔
2662
                                                goto fail;
×
2663

2664
                                        log_notice("Unit state directory %s missing but matching configuration directory %s exists, assuming update from systemd 253 or older, creating compatibility symlink.", p, q);
1✔
2665
                                        /* The symlink stands in for the directory, skip the rest of the setup for this entry. */
                                        continue;
1✔
2666
                                } else if (r != -ENOENT)
2✔
2667
                                        log_warning_errno(r, "Unable to detect whether unit configuration directory '%s' exists, assuming not: %m", q);
2✔
2668

2669
                        } else if (r < 0)
4✔
2670
                                log_warning_errno(r, "Unable to detect whether unit state directory '%s' is missing, assuming it is: %m", p);
×
2671
                }
2672

2673
                if (exec_directory_is_private(context, type)) {
3,699✔
2674
                        /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2675
                         * case we want to avoid leaving a directory around fully accessible that is owned by
2676
                         * a dynamic user whose UID is later on reused. To lock this down we use the same
2677
                         * trick used by container managers to prohibit host users to get access to files of
2678
                         * the same UID in containers: we place everything inside a directory that has an
2679
                         * access mode of 0700 and is owned root:root, so that it acts as security boundary
2680
                         * for unprivileged host code. We then use fs namespacing to make this directory
2681
                         * permeable for the service itself.
2682
                         *
2683
                         * Specifically: for a service which wants a special directory "foo/" we first create
2684
                         * a directory "private/" with access mode 0700 owned by root:root. Then we place
2685
                         * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2686
                         * "private/foo". This way, privileged host users can access "foo/" as usual, but
2687
                         * unprivileged host users can't look into it. Inside of the namespace of the unit
2688
                         * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2689
                         * "private/foo/" is mounted under the same name, thus disabling the access boundary
2690
                         * for the service and making sure it only gets access to the dirs it needs but no
2691
                         * others. Tricky? Yes, absolutely, but it works!
2692
                         *
2693
                         * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2694
                         * to be owned by the service itself.
2695
                         *
2696
                         * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2697
                         * for sharing files or sockets with other services. */
2698

2699
                        pp = path_join(params->prefix[type], "private");
13✔
2700
                        if (!pp) {
13✔
2701
                                r = -ENOMEM;
×
2702
                                goto fail;
×
2703
                        }
2704

2705
                        /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2706
                        r = mkdir_safe_label(pp, 0700, 0, 0, MKDIR_WARN_MODE);
13✔
2707
                        if (r < 0)
13✔
2708
                                goto fail;
×
2709

2710
                        if (!path_extend(&pp, i->path)) {
13✔
2711
                                r = -ENOMEM;
×
2712
                                goto fail;
×
2713
                        }
2714

2715
                        /* Create all directories between the configured directory and this private root, and mark them 0755 */
2716
                        r = mkdir_parents_label(pp, 0755);
13✔
2717
                        if (r < 0)
13✔
2718
                                goto fail;
×
2719

2720
                        if (is_dir(p, false) > 0 &&
13✔
2721
                            (access_nofollow(pp, F_OK) == -ENOENT)) {
×
2722

2723
                                /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2724
                                 * it over. Most likely the service has been upgraded from one that didn't use
2725
                                 * DynamicUser=1, to one that does. */
2726

2727
                                log_info("Found pre-existing public %s= directory %s, migrating to %s.\n"
×
2728
                                         "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2729
                                         exec_directory_type_to_string(type), p, pp);
2730

2731
                                r = RET_NERRNO(rename(p, pp));
×
2732
                                if (r < 0)
×
2733
                                        goto fail;
×
2734
                        } else {
2735
                                /* Otherwise, create the actual directory for the service */
2736

2737
                                r = mkdir_label(pp, context->directories[type].mode);
13✔
2738
                                if (r < 0 && r != -EEXIST)
13✔
2739
                                        goto fail;
×
2740
                        }
2741

2742
                        if (!FLAGS_SET(i->flags, EXEC_DIRECTORY_ONLY_CREATE)) {
13✔
2743
                                /* And link it up from the original place.
2744
                                 * Notes
2745
                                 * 1) If a mount namespace is going to be used, then this symlink remains on
2746
                                 *    the host, and a new one for the child namespace will be created later.
2747
                                 * 2) It is not necessary to create this symlink when one of its parent
2748
                                 *    directories is specified and already created. E.g.
2749
                                 *        StateDirectory=foo foo/bar
2750
                                 *    In that case, the inode points to pp and p for "foo/bar" are the same:
2751
                                 *        pp = "/var/lib/private/foo/bar"
2752
                                 *        p = "/var/lib/foo/bar"
2753
                                 *    and, /var/lib/foo is a symlink to /var/lib/private/foo. So, not only
2754
                                 *    we do not need to create the symlink, but we cannot create the symlink.
2755
                                 *    See issue #24783. */
2756
                                r = symlink_idempotent(pp, p, true);
13✔
2757
                                if (r < 0)
13✔
2758
                                        goto fail;
×
2759
                        }
2760

2761
                } else {
2762
                        _cleanup_free_ char *target = NULL;
3,686✔
2763

2764
                        if (EXEC_DIRECTORY_TYPE_SHALL_CHOWN(type) &&
7,332✔
2765
                            readlink_and_make_absolute(p, &target) >= 0) {
3,646✔
2766
                                _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;
11✔
2767

2768
                                /* This already exists and is a symlink? Interesting. Maybe it's one created
2769
                                 * by DynamicUser=1 (see above)?
2770
                                 *
2771
                                 * We do this for all directory types except for ConfigurationDirectory=,
2772
                                 * since they all support the private/ symlink logic at least in some
2773
                                 * configurations, see above. */
2774

2775
                                r = chase(target, NULL, 0, &target_resolved, NULL);
11✔
2776
                                if (r < 0)
11✔
2777
                                        goto fail;
×
2778

2779
                                q = path_join(params->prefix[type], "private", i->path);
11✔
2780
                                if (!q) {
11✔
2781
                                        r = -ENOMEM;
×
2782
                                        goto fail;
×
2783
                                }
2784

2785
                                /* /var/lib or friends may be symlinks. So, let's chase them also. */
2786
                                r = chase(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
11✔
2787
                                if (r < 0)
11✔
2788
                                        goto fail;
×
2789

2790
                                if (path_equal(q_resolved, target_resolved)) {
11✔
2791

2792
                                        /* Hmm, apparently DynamicUser= was once turned on for this service,
2793
                                         * but is no longer. Let's move the directory back up. */
2794

2795
                                        log_info("Found pre-existing private %s= directory %s, migrating to %s.\n"
8✔
2796
                                                 "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2797
                                                 exec_directory_type_to_string(type), q, p);
2798

2799
                                        r = RET_NERRNO(unlink(p));
8✔
2800
                                        if (r < 0)
×
2801
                                                goto fail;
×
2802

2803
                                        r = RET_NERRNO(rename(q, p));
11✔
2804
                                        if (r < 0)
×
2805
                                                goto fail;
×
2806
                                }
2807
                        }
2808

2809
                        r = mkdir_label(p, context->directories[type].mode);
3,686✔
2810
                        if (r < 0) {
3,686✔
2811
                                if (r != -EEXIST)
2,610✔
2812
                                        goto fail;
×
2813

2814
                                if (!EXEC_DIRECTORY_TYPE_SHALL_CHOWN(type)) {
2,610✔
2815
                                        struct stat st;
27✔
2816

2817
                                        /* Don't change the owner/access mode of the configuration directory,
2818
                                         * as in the common case it is not written to by a service, and shall
2819
                                         * not be writable. */
2820

2821
                                        r = RET_NERRNO(stat(p, &st));
27✔
2822
                                        if (r < 0)
×
2823
                                                goto fail;
×
2824

2825
                                        /* Still complain if the access mode doesn't match */
2826
                                        if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
27✔
2827
                                                log_warning("%s \'%s\' already exists but the mode is different. "
×
2828
                                                            "(File system: %o %sMode: %o)",
2829
                                                            exec_directory_type_to_string(type), i->path,
2830
                                                            st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
2831

2832
                                        continue;
27✔
2833
                                }
2834
                        }
2835
                }
2836

2837
                /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
2838
                 * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
2839
                 * current UID/GID ownership.) */
2840
                /* Operate on the private location if one was set up above, otherwise on the public path. */
                const char *target_dir = pp ?: p;
3,672✔
2841
                r = chmod_and_chown(target_dir, context->directories[type].mode, UID_INVALID, GID_INVALID);
3,672✔
2842
                if (r < 0)
3,672✔
2843
                        goto fail;
×
2844

2845
                /* Skip the rest (which deals with ownership) in user mode, since ownership changes are not
2846
                 * available to user code anyway */
2847
                if (params->runtime_scope != RUNTIME_SCOPE_SYSTEM)
3,672✔
2848
                        continue;
9✔
2849

2850
                int idmapping_supported = is_idmapping_supported(target_dir);
3,663✔
2851
                if (idmapping_supported < 0) {
3,663✔
2852
                        r = log_debug_errno(idmapping_supported, "Unable to determine if ID mapping is supported on mount '%s': %m", target_dir);
×
2853
                        goto fail;
×
2854
                }
2855

2856
                log_debug("ID-mapping is%ssupported for exec directory %s", idmapping_supported ? " " : " not ", target_dir);
3,669✔
2857

2858
                /* Change the ownership of the whole tree, if necessary. When dynamic users are used we
2859
                 * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
2860
                 * assignments to exist. */
2861
                uid_t chown_uid = uid;
3,663✔
2862
                gid_t chown_gid = gid;
3,663✔
2863
                bool do_chown = false;
3,663✔
2864

2865
                if (uid == 0 || gid == 0 || !idmapping_supported) {
3,663✔
2866
                        do_chown = true;
1,428✔
2867
                        i->idmapped = false;
1,428✔
2868
                } else {
2869
                        /* Use 'nobody' uid/gid for exec directories if ID-mapping is supported. For backward compatibility,
2870
                         * continue doing chmod/chown if the directory was chmod/chowned before (if uid/gid is not 'nobody') */
2871
                        struct stat st;
2,235✔
2872
                        r = RET_NERRNO(stat(target_dir, &st));
2,235✔
2873
                        if (r < 0)
×
2874
                                goto fail;
×
2875

2876
                        if (st.st_uid == UID_NOBODY && st.st_gid == GID_NOBODY) {
2,235✔
2877
                                do_chown = false;
7✔
2878
                                i->idmapped = true;
7✔
2879
                       } else if (exec_directory_is_private(context, type) && st.st_uid == 0 && st.st_gid == 0) {
2,228✔
2880
                                chown_uid = UID_NOBODY;
6✔
2881
                                chown_gid = GID_NOBODY;
6✔
2882
                                do_chown = true;
6✔
2883
                                i->idmapped = true;
6✔
2884
                        } else {
2885
                                do_chown = true;
2,222✔
2886
                                i->idmapped = false;
2,222✔
2887
                        }
2888
                }
2889

2890
                if (do_chown) {
3,663✔
2891
                        r = path_chown_recursive(target_dir, chown_uid, chown_gid, context->dynamic_user ? 01777 : 07777, AT_SYMLINK_FOLLOW);
7,303✔
2892
                        if (r < 0)
3,656✔
2893
                                goto fail;
1✔
2894
                }
2895
        }
2896

2897
        /* If we are not going to run in a namespace, set up the symlinks - otherwise
2898
         * they are set up later, to allow configuring empty var/run/etc. */
2899
        if (!needs_mount_namespace)
57,340✔
2900
                FOREACH_ARRAY(i, context->directories[type].items, context->directories[type].n_items) {
45,851✔
2901
                        r = create_many_symlinks(params->prefix[type], i->path, i->symlinks);
1,526✔
2902
                        if (r < 0)
1,526✔
2903
                                goto fail;
×
2904
                }
2905

2906
        return 0;
2907

2908
/* Common error exit: report the exit status matching the directory type, pass the error up. */
fail:
1✔
2909
        *exit_status = exit_status_table[type];
1✔
2910
        return r;
1✔
2911
}
2912

2913
#if ENABLE_SMACK
2914
static int setup_smack(
×
2915
                const ExecContext *context,
2916
                const ExecParameters *params,
2917
                int executable_fd) {
2918
        int r;
×
2919

2920
        assert(context);
×
2921
        assert(params);
×
2922
        assert(executable_fd >= 0);
×
2923

2924
        if (context->smack_process_label) {
×
2925
                r = mac_smack_apply_pid(0, context->smack_process_label);
×
2926
                if (r < 0)
×
2927
                        return r;
×
2928
        } else if (params->fallback_smack_process_label) {
×
2929
                _cleanup_free_ char *exec_label = NULL;
×
2930

2931
                r = mac_smack_read_fd(executable_fd, SMACK_ATTR_EXEC, &exec_label);
×
2932
                if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r))
×
2933
                        return r;
2934

2935
                r = mac_smack_apply_pid(0, exec_label ?: params->fallback_smack_process_label);
×
2936
                if (r < 0)
×
2937
                        return r;
2938
        }
2939

2940
        return 0;
2941
}
2942
#endif
2943

2944
static int compile_bind_mounts(
1,987✔
2945
                const ExecContext *context,
2946
                const ExecParameters *params,
2947
                uid_t exec_directory_uid, /* only used for id-mapped mounts Exec directories */
2948
                gid_t exec_directory_gid, /* only used for id-mapped mounts Exec directories */
2949
                BindMount **ret_bind_mounts,
2950
                size_t *ret_n_bind_mounts,
2951
                char ***ret_empty_directories) {
2952

2953
        _cleanup_strv_free_ char **empty_directories = NULL;
1,987✔
2954
        BindMount *bind_mounts = NULL;
1,987✔
2955
        size_t n, h = 0;
1,987✔
2956
        int r;
1,987✔
2957

2958
        assert(context);
1,987✔
2959
        assert(params);
1,987✔
2960
        assert(ret_bind_mounts);
1,987✔
2961
        assert(ret_n_bind_mounts);
1,987✔
2962
        assert(ret_empty_directories);
1,987✔
2963

2964
        CLEANUP_ARRAY(bind_mounts, h, bind_mount_free_many);
1,987✔
2965

2966
        n = context->n_bind_mounts;
1,987✔
2967
        for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
11,922✔
2968
                if (!params->prefix[t])
9,935✔
2969
                        continue;
×
2970

2971
                FOREACH_ARRAY(i, context->directories[t].items, context->directories[t].n_items)
11,520✔
2972
                        n += !FLAGS_SET(i->flags, EXEC_DIRECTORY_ONLY_CREATE) || FLAGS_SET(i->flags, EXEC_DIRECTORY_READ_ONLY);
1,585✔
2973
        }
2974

2975
        if (n <= 0) {
1,987✔
2976
                *ret_bind_mounts = NULL;
1,068✔
2977
                *ret_n_bind_mounts = 0;
1,068✔
2978
                *ret_empty_directories = NULL;
1,068✔
2979
                return 0;
1,068✔
2980
        }
2981

2982
        bind_mounts = new(BindMount, n);
919✔
2983
        if (!bind_mounts)
919✔
2984
                return -ENOMEM;
2985

2986
        FOREACH_ARRAY(item, context->bind_mounts, context->n_bind_mounts) {
939✔
2987
                r = bind_mount_add(&bind_mounts, &h, item);
20✔
2988
                if (r < 0)
20✔
2989
                        return r;
2990
        }
2991

2992
        for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
5,514✔
2993
                if (!params->prefix[t])
4,595✔
2994
                        continue;
×
2995

2996
                if (context->directories[t].n_items == 0)
4,595✔
2997
                        continue;
3,466✔
2998

2999
                if (exec_directory_is_private(context, t) &&
1,142✔
3000
                    !exec_context_with_rootfs(context)) {
13✔
3001
                        char *private_root;
13✔
3002

3003
                        /* So this is for a dynamic user, and we need to make sure the process can access its own
3004
                         * directory. For that we overmount the usually inaccessible "private" subdirectory with a
3005
                         * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
3006

3007
                        private_root = path_join(params->prefix[t], "private");
13✔
3008
                        if (!private_root)
13✔
3009
                                return -ENOMEM;
3010

3011
                        r = strv_consume(&empty_directories, private_root);
13✔
3012
                        if (r < 0)
13✔
3013
                                return r;
3014
                }
3015

3016
                FOREACH_ARRAY(i, context->directories[t].items, context->directories[t].n_items) {
2,714✔
3017
                        _cleanup_free_ char *s = NULL, *d = NULL;
1,585✔
3018

3019
                        /* When one of the parent directories is in the list, we cannot create the symlink
3020
                         * for the child directory. See also the comments in setup_exec_directory().
3021
                         * But if it needs to be read only, then we have to create a bind mount anyway to
3022
                         * make it so. */
3023
                        if (FLAGS_SET(i->flags, EXEC_DIRECTORY_ONLY_CREATE) && !FLAGS_SET(i->flags, EXEC_DIRECTORY_READ_ONLY))
1,585✔
3024
                                continue;
×
3025

3026
                        if (exec_directory_is_private(context, t))
1,585✔
3027
                                s = path_join(params->prefix[t], "private", i->path);
13✔
3028
                        else
3029
                                s = path_join(params->prefix[t], i->path);
1,572✔
3030
                        if (!s)
1,585✔
3031
                                return -ENOMEM;
3032

3033
                        if (exec_directory_is_private(context, t) &&
1,598✔
3034
                            exec_context_with_rootfs(context))
13✔
3035
                                /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
3036
                                 * directory is not created on the root directory. So, let's bind-mount the directory
3037
                                 * on the 'non-private' place. */
3038
                                d = path_join(params->prefix[t], i->path);
×
3039
                        else
3040
                                d = strdup(s);
1,585✔
3041
                        if (!d)
1,585✔
3042
                                return -ENOMEM;
3043

3044
                        bind_mounts[h++] = (BindMount) {
1,585✔
3045
                                .source = TAKE_PTR(s),
1,585✔
3046
                                .destination = TAKE_PTR(d),
1,585✔
3047
                                .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
1,585✔
3048
                                .recursive = true,
3049
                                .read_only = FLAGS_SET(i->flags, EXEC_DIRECTORY_READ_ONLY),
1,585✔
3050
                                .idmapped = i->idmapped,
1,585✔
3051
                                .uid = exec_directory_uid,
3052
                                .gid = exec_directory_gid,
3053
                        };
3054
                }
3055
        }
3056

3057
        assert(h == n);
919✔
3058

3059
        *ret_bind_mounts = TAKE_PTR(bind_mounts);
919✔
3060
        *ret_n_bind_mounts = n;
919✔
3061
        *ret_empty_directories = TAKE_PTR(empty_directories);
919✔
3062

3063
        return (int) n;
919✔
3064
}
3065

3066
/* ret_symlinks will contain a list of pairs src:dest that describes
3067
 * the symlinks to create later on. For example, the symlinks needed
3068
 * to safely give private directories to DynamicUser=1 users. */
3069
static int compile_symlinks(
1,987✔
3070
                const ExecContext *context,
3071
                const ExecParameters *params,
3072
                bool setup_os_release_symlink,
3073
                char ***ret_symlinks) {
3074

3075
        _cleanup_strv_free_ char **symlinks = NULL;
1,987✔
3076
        int r;
1,987✔
3077

3078
        assert(context);
1,987✔
3079
        assert(params);
1,987✔
3080
        assert(ret_symlinks);
1,987✔
3081

3082
        for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++)
11,922✔
3083
                FOREACH_ARRAY(i, context->directories[dt].items, context->directories[dt].n_items) {
11,520✔
3084
                        _cleanup_free_ char *private_path = NULL, *path = NULL;
1,572✔
3085

3086
                        STRV_FOREACH(symlink, i->symlinks) {
1,711✔
3087
                                _cleanup_free_ char *src_abs = NULL, *dst_abs = NULL;
126✔
3088

3089
                                src_abs = path_join(params->prefix[dt], i->path);
126✔
3090
                                dst_abs = path_join(params->prefix[dt], *symlink);
126✔
3091
                                if (!src_abs || !dst_abs)
126✔
3092
                                        return -ENOMEM;
3093

3094
                                r = strv_consume_pair(&symlinks, TAKE_PTR(src_abs), TAKE_PTR(dst_abs));
126✔
3095
                                if (r < 0)
126✔
3096
                                        return r;
3097
                        }
3098

3099
                        if (!exec_directory_is_private(context, dt) ||
1,598✔
3100
                            exec_context_with_rootfs(context) ||
13✔
3101
                            FLAGS_SET(i->flags, EXEC_DIRECTORY_ONLY_CREATE))
13✔
3102
                                continue;
1,572✔
3103

3104
                        private_path = path_join(params->prefix[dt], "private", i->path);
13✔
3105
                        if (!private_path)
13✔
3106
                                return -ENOMEM;
3107

3108
                        path = path_join(params->prefix[dt], i->path);
13✔
3109
                        if (!path)
13✔
3110
                                return -ENOMEM;
3111

3112
                        r = strv_consume_pair(&symlinks, TAKE_PTR(private_path), TAKE_PTR(path));
13✔
3113
                        if (r < 0)
13✔
3114
                                return r;
3115
                }
3116

3117
        /* We make the host's os-release available via a symlink, so that we can copy it atomically
3118
         * and readers will never get a half-written version. Note that, while the paths specified here are
3119
         * absolute, when they are processed in namespace.c they will be made relative automatically, i.e.:
3120
         * 'os-release -> .os-release-stage/os-release' is what will be created. */
3121
        if (setup_os_release_symlink) {
1,987✔
3122
                r = strv_extend_many(
7✔
3123
                                &symlinks,
3124
                                "/run/host/.os-release-stage/os-release",
3125
                                "/run/host/os-release");
3126
                if (r < 0)
7✔
3127
                        return r;
3128
        }
3129

3130
        *ret_symlinks = TAKE_PTR(symlinks);
1,987✔
3131

3132
        return 0;
1,987✔
3133
}
3134

3135
static bool insist_on_sandboxing(
×
3136
                const ExecContext *context,
3137
                const char *root_dir,
3138
                const char *root_image,
3139
                const BindMount *bind_mounts,
3140
                size_t n_bind_mounts) {
3141

3142
        assert(context);
×
3143
        assert(n_bind_mounts == 0 || bind_mounts);
×
3144

3145
        /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
3146
         * would alter the view on the file system beyond making things read-only or invisible, i.e. would
3147
         * rearrange stuff in a way we cannot ignore gracefully. */
3148

3149
        if (context->n_temporary_filesystems > 0)
×
3150
                return true;
3151

3152
        if (root_dir || root_image)
×
3153
                return true;
3154

3155
        if (context->n_mount_images > 0)
×
3156
                return true;
3157

3158
        if (context->dynamic_user)
×
3159
                return true;
3160

3161
        if (context->n_extension_images > 0 || !strv_isempty(context->extension_directories))
×
3162
                return true;
3163

3164
        /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
3165
         * essential. */
3166
        FOREACH_ARRAY(i, bind_mounts, n_bind_mounts)
×
3167
                if (!path_equal(i->source, i->destination))
×
3168
                        return true;
3169

3170
        if (context->log_namespace)
×
3171
                return true;
×
3172

3173
        return false;
3174
}
3175

3176
static int setup_ephemeral(
1,987✔
3177
                const ExecContext *context,
3178
                ExecRuntime *runtime,
3179
                char **root_image,            /* both input and output! modified if ephemeral logic enabled */
3180
                char **root_directory,        /* ditto */
3181
                char **reterr_path) {
3182

3183
        _cleanup_close_ int fd = -EBADF;
1,987✔
3184
        _cleanup_free_ char *new_root = NULL;
1,987✔
3185
        int r;
1,987✔
3186

3187
        assert(context);
1,987✔
3188
        assert(runtime);
1,987✔
3189
        assert(root_image);
1,987✔
3190
        assert(root_directory);
1,987✔
3191

3192
        if (!*root_image && !*root_directory)
1,987✔
3193
                return 0;
3194

3195
        if (!runtime->ephemeral_copy)
8✔
3196
                return 0;
3197

3198
        assert(runtime->ephemeral_storage_socket[0] >= 0);
×
3199
        assert(runtime->ephemeral_storage_socket[1] >= 0);
×
3200

3201
        new_root = strdup(runtime->ephemeral_copy);
×
3202
        if (!new_root)
×
3203
                return log_oom_debug();
×
3204

3205
        r = posix_lock(runtime->ephemeral_storage_socket[0], LOCK_EX);
×
3206
        if (r < 0)
×
3207
                return log_debug_errno(r, "Failed to lock ephemeral storage socket: %m");
×
3208

3209
        CLEANUP_POSIX_UNLOCK(runtime->ephemeral_storage_socket[0]);
×
3210

3211
        fd = receive_one_fd(runtime->ephemeral_storage_socket[0], MSG_PEEK|MSG_DONTWAIT);
×
3212
        if (fd >= 0)
×
3213
                /* We got an fd! That means ephemeral has already been set up, so nothing to do here. */
3214
                return 0;
3215
        if (fd != -EAGAIN)
×
3216
                return log_debug_errno(fd, "Failed to receive file descriptor queued on ephemeral storage socket: %m");
×
3217

3218
        if (*root_image) {
×
3219
                log_debug("Making ephemeral copy of %s to %s", *root_image, new_root);
×
3220

3221
                fd = copy_file(*root_image, new_root, O_EXCL, 0600,
×
3222
                               COPY_LOCK_BSD|COPY_REFLINK|COPY_CRTIME|COPY_NOCOW_AFTER);
3223
                if (fd < 0) {
×
3224
                        *reterr_path = strdup(*root_image);
×
3225
                        return log_debug_errno(fd, "Failed to copy image %s to %s: %m",
×
3226
                                               *root_image, new_root);
3227
                }
3228
        } else {
3229
                assert(*root_directory);
×
3230

3231
                log_debug("Making ephemeral snapshot of %s to %s", *root_directory, new_root);
×
3232

3233
                fd = btrfs_subvol_snapshot_at(
×
3234
                                AT_FDCWD, *root_directory,
3235
                                AT_FDCWD, new_root,
3236
                                BTRFS_SNAPSHOT_FALLBACK_COPY |
3237
                                BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
3238
                                BTRFS_SNAPSHOT_RECURSIVE |
3239
                                BTRFS_SNAPSHOT_LOCK_BSD);
3240
                if (fd < 0) {
×
3241
                        *reterr_path = strdup(*root_directory);
×
3242
                        return log_debug_errno(fd, "Failed to snapshot directory %s to %s: %m",
×
3243
                                               *root_directory, new_root);
3244
                }
3245
        }
3246

3247
        r = send_one_fd(runtime->ephemeral_storage_socket[1], fd, MSG_DONTWAIT);
×
3248
        if (r < 0)
×
3249
                return log_debug_errno(r, "Failed to queue file descriptor on ephemeral storage socket: %m");
×
3250

3251
        if (*root_image)
×
3252
                free_and_replace(*root_image, new_root);
×
3253
        else {
3254
                assert(*root_directory);
×
3255
                free_and_replace(*root_directory, new_root);
×
3256
        }
3257

3258
        return 1;
3259
}
3260

3261
static int verity_settings_prepare(
7✔
3262
                VeritySettings *verity,
3263
                const char *root_image,
3264
                const void *root_hash,
3265
                size_t root_hash_size,
3266
                const char *root_hash_path,
3267
                const void *root_hash_sig,
3268
                size_t root_hash_sig_size,
3269
                const char *root_hash_sig_path,
3270
                const char *verity_data_path) {
3271

3272
        int r;
7✔
3273

3274
        assert(verity);
7✔
3275

3276
        if (root_hash) {
7✔
3277
                void *d;
4✔
3278

3279
                d = memdup(root_hash, root_hash_size);
4✔
3280
                if (!d)
4✔
3281
                        return -ENOMEM;
7✔
3282

3283
                free_and_replace(verity->root_hash, d);
4✔
3284
                verity->root_hash_size = root_hash_size;
4✔
3285
                verity->designator = PARTITION_ROOT;
4✔
3286
        }
3287

3288
        if (root_hash_sig) {
7✔
3289
                void *d;
×
3290

3291
                d = memdup(root_hash_sig, root_hash_sig_size);
×
3292
                if (!d)
×
3293
                        return -ENOMEM;
7✔
3294

3295
                free_and_replace(verity->root_hash_sig, d);
×
3296
                verity->root_hash_sig_size = root_hash_sig_size;
×
3297
                verity->designator = PARTITION_ROOT;
×
3298
        }
3299

3300
        if (verity_data_path) {
7✔
3301
                r = free_and_strdup(&verity->data_path, verity_data_path);
×
3302
                if (r < 0)
×
3303
                        return r;
3304
        }
3305

3306
        r = verity_settings_load(
7✔
3307
                        verity,
3308
                        root_image,
3309
                        root_hash_path,
3310
                        root_hash_sig_path);
3311
        if (r < 0)
7✔
3312
                return log_debug_errno(r, "Failed to load root hash: %m");
×
3313

3314
        return 0;
3315
}
3316

3317
static int pick_versions(
1,989✔
3318
                const ExecContext *context,
3319
                const ExecParameters *params,
3320
                char **ret_root_image,
3321
                char **ret_root_directory,
3322
                char **reterr_path) {
3323

3324
        int r;
1,989✔
3325

3326
        assert(context);
1,989✔
3327
        assert(params);
1,989✔
3328
        assert(ret_root_image);
1,989✔
3329
        assert(ret_root_directory);
1,989✔
3330

3331
        if (context->root_image) {
1,989✔
3332
                _cleanup_(pick_result_done) PickResult result = PICK_RESULT_NULL;
8✔
3333

3334
                r = path_pick(/* toplevel_path= */ NULL,
16✔
3335
                              /* toplevel_fd= */ AT_FDCWD,
3336
                              context->root_image,
8✔
3337
                              &pick_filter_image_raw,
3338
                              PICK_ARCHITECTURE|PICK_TRIES|PICK_RESOLVE,
3339
                              &result);
3340
                if (r < 0) {
8✔
3341
                        *reterr_path = strdup(context->root_image);
1✔
3342
                        return r;
1✔
3343
                }
3344

3345
                if (!result.path) {
7✔
3346
                        *reterr_path = strdup(context->root_image);
×
3347
                        return log_debug_errno(SYNTHETIC_ERRNO(ENOENT), "No matching entry in .v/ directory %s found.", context->root_image);
×
3348
                }
3349

3350
                *ret_root_image = TAKE_PTR(result.path);
7✔
3351
                *ret_root_directory = NULL;
7✔
3352
                return r;
7✔
3353
        }
3354

3355
        if (context->root_directory) {
1,981✔
3356
                _cleanup_(pick_result_done) PickResult result = PICK_RESULT_NULL;
2✔
3357

3358
                r = path_pick(/* toplevel_path= */ NULL,
4✔
3359
                              /* toplevel_fd= */ AT_FDCWD,
3360
                              context->root_directory,
2✔
3361
                              &pick_filter_image_dir,
3362
                              PICK_ARCHITECTURE|PICK_TRIES|PICK_RESOLVE,
3363
                              &result);
3364
                if (r < 0) {
2✔
3365
                        *reterr_path = strdup(context->root_directory);
×
3366
                        return r;
×
3367
                }
3368

3369
                if (!result.path) {
2✔
3370
                        *reterr_path = strdup(context->root_directory);
1✔
3371
                        return log_debug_errno(SYNTHETIC_ERRNO(ENOENT), "No matching entry in .v/ directory %s found.", context->root_directory);
1✔
3372
                }
3373

3374
                *ret_root_image = NULL;
1✔
3375
                *ret_root_directory = TAKE_PTR(result.path);
1✔
3376
                return r;
1✔
3377
        }
3378

3379
        *ret_root_image = *ret_root_directory = NULL;
1,979✔
3380
        return 0;
1,979✔
3381
}
3382

3383
static int apply_mount_namespace(
1,989✔
3384
                ExecCommandFlags command_flags,
3385
                const ExecContext *context,
3386
                const ExecParameters *params,
3387
                ExecRuntime *runtime,
3388
                const char *memory_pressure_path,
3389
                bool needs_sandboxing,
3390
                char **reterr_path,
3391
                uid_t exec_directory_uid,
3392
                gid_t exec_directory_gid) {
3393

3394
        _cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT;
1,989✔
3395
        _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL,
1,989✔
3396
                        **read_write_paths_cleanup = NULL;
×
3397
        _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL,
×
3398
                *private_namespace_dir = NULL, *host_os_release_stage = NULL, *root_image = NULL, *root_dir = NULL;
1,989✔
3399
        const char *tmp_dir = NULL, *var_tmp_dir = NULL;
1,989✔
3400
        char **read_write_paths;
1,989✔
3401
        bool setup_os_release_symlink;
1,989✔
3402
        BindMount *bind_mounts = NULL;
1,989✔
3403
        size_t n_bind_mounts = 0;
1,989✔
3404
        int r;
1,989✔
3405

3406
        assert(context);
1,989✔
3407
        assert(params);
1,989✔
3408
        assert(runtime);
1,989✔
3409

3410
        CLEANUP_ARRAY(bind_mounts, n_bind_mounts, bind_mount_free_many);
1,989✔
3411

3412
        if (params->flags & EXEC_APPLY_CHROOT) {
1,989✔
3413
                r = pick_versions(
1,989✔
3414
                                context,
3415
                                params,
3416
                                &root_image,
3417
                                &root_dir,
3418
                                reterr_path);
3419
                if (r < 0)
1,989✔
3420
                        return r;
3421

3422
                r = setup_ephemeral(
1,987✔
3423
                                context,
3424
                                runtime,
3425
                                &root_image,
3426
                                &root_dir,
3427
                                reterr_path);
3428
                if (r < 0)
1,987✔
3429
                        return r;
3430
        }
3431

3432
        r = compile_bind_mounts(context, params, exec_directory_uid, exec_directory_gid, &bind_mounts, &n_bind_mounts, &empty_directories);
1,987✔
3433
        if (r < 0)
1,987✔
3434
                return r;
3435

3436
        /* We need to make the pressure path writable even if /sys/fs/cgroups is made read-only, as the
3437
         * service will need to write to it in order to start the notifications. */
3438
        if (exec_is_cgroup_mount_read_only(context) && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) {
1,987✔
3439
                read_write_paths_cleanup = strv_copy(context->read_write_paths);
1,134✔
3440
                if (!read_write_paths_cleanup)
1,134✔
3441
                        return -ENOMEM;
3442

3443
                r = strv_extend(&read_write_paths_cleanup, memory_pressure_path);
1,134✔
3444
                if (r < 0)
1,134✔
3445
                        return r;
3446

3447
                read_write_paths = read_write_paths_cleanup;
1,134✔
3448
        } else
3449
                read_write_paths = context->read_write_paths;
853✔
3450

3451
        if (needs_sandboxing) {
1,987✔
3452
                /* The runtime struct only contains the parent of the private /tmp, which is non-accessible
3453
                 * to world users. Inside of it there's a /tmp that is sticky, and that's the one we want to
3454
                 * use here.  This does not apply when we are using /run/systemd/empty as fallback. */
3455

3456
                if (context->private_tmp == PRIVATE_TMP_CONNECTED && runtime->shared) {
1,987✔
3457
                        if (streq_ptr(runtime->shared->tmp_dir, RUN_SYSTEMD_EMPTY))
323✔
3458
                                tmp_dir = runtime->shared->tmp_dir;
3459
                        else if (runtime->shared->tmp_dir)
323✔
3460
                                tmp_dir = strjoina(runtime->shared->tmp_dir, "/tmp");
1,615✔
3461

3462
                        if (streq_ptr(runtime->shared->var_tmp_dir, RUN_SYSTEMD_EMPTY))
323✔
3463
                                var_tmp_dir = runtime->shared->var_tmp_dir;
3464
                        else if (runtime->shared->var_tmp_dir)
323✔
3465
                                var_tmp_dir = strjoina(runtime->shared->var_tmp_dir, "/tmp");
1,615✔
3466
                }
3467
        }
3468

3469
        /* Symlinks (exec dirs, os-release) are set up after other mounts, before they are made read-only. */
3470
        setup_os_release_symlink = needs_sandboxing && exec_context_get_effective_mount_apivfs(context) && (root_dir || root_image);
1,987✔
3471
        r = compile_symlinks(context, params, setup_os_release_symlink, &symlinks);
1,987✔
3472
        if (r < 0)
1,987✔
3473
                return r;
3474

3475
        if (context->mount_propagation_flag == MS_SHARED)
1,987✔
3476
                log_debug("shared mount propagation hidden by other fs namespacing unit settings: ignoring");
×
3477

3478
        r = exec_context_get_credential_directory(context, params, params->unit_id, &creds_path);
1,987✔
3479
        if (r < 0)
1,987✔
3480
                return r;
3481

3482
        if (params->runtime_scope == RUNTIME_SCOPE_SYSTEM) {
1,987✔
3483
                propagate_dir = path_join("/run/systemd/propagate/", params->unit_id);
1,960✔
3484
                if (!propagate_dir)
1,960✔
3485
                        return -ENOMEM;
3486

3487
                incoming_dir = strdup("/run/systemd/incoming");
1,960✔
3488
                if (!incoming_dir)
1,960✔
3489
                        return -ENOMEM;
3490

3491
                private_namespace_dir = strdup("/run/systemd");
1,960✔
3492
                if (!private_namespace_dir)
1,960✔
3493
                        return -ENOMEM;
3494

3495
                /* If running under a different root filesystem, propagate the host's os-release. We make a
3496
                 * copy rather than just bind mounting it, so that it can be updated on soft-reboot. */
3497
                if (setup_os_release_symlink) {
1,960✔
3498
                        host_os_release_stage = strdup("/run/systemd/propagate/.os-release-stage");
7✔
3499
                        if (!host_os_release_stage)
7✔
3500
                                return -ENOMEM;
3501
                }
3502
        } else {
3503
                assert(params->runtime_scope == RUNTIME_SCOPE_USER);
27✔
3504

3505
                if (asprintf(&private_namespace_dir, "/run/user/" UID_FMT "/systemd", geteuid()) < 0)
27✔
3506
                        return -ENOMEM;
3507

3508
                if (setup_os_release_symlink) {
27✔
3509
                        if (asprintf(&host_os_release_stage,
×
3510
                                     "/run/user/" UID_FMT "/systemd/propagate/.os-release-stage",
3511
                                     geteuid()) < 0)
3512
                                return -ENOMEM;
3513
                }
3514
        }
3515

3516
        if (root_image) {
1,987✔
3517
                r = verity_settings_prepare(
14✔
3518
                        &verity,
3519
                        root_image,
3520
                        context->root_hash, context->root_hash_size, context->root_hash_path,
7✔
3521
                        context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
7✔
3522
                        context->root_verity);
7✔
3523
                if (r < 0)
7✔
3524
                        return r;
3525
        }
3526

UNCOV
3527
        NamespaceParameters parameters = {
×
3528
                .runtime_scope = params->runtime_scope,
1,987✔
3529

3530
                .root_directory = root_dir,
3531
                .root_image = root_image,
3532
                .root_image_options = context->root_image_options,
1,987✔
3533
                .root_image_policy = context->root_image_policy ?: &image_policy_service,
1,987✔
3534

3535
                .read_write_paths = read_write_paths,
3536
                .read_only_paths = needs_sandboxing ? context->read_only_paths : NULL,
1,987✔
3537
                .inaccessible_paths = needs_sandboxing ? context->inaccessible_paths : NULL,
1,987✔
3538

3539
                .exec_paths = needs_sandboxing ? context->exec_paths : NULL,
1,987✔
3540
                .no_exec_paths = needs_sandboxing ? context->no_exec_paths : NULL,
1,987✔
3541

3542
                .empty_directories = empty_directories,
3543
                .symlinks = symlinks,
3544

3545
                .bind_mounts = bind_mounts,
3546
                .n_bind_mounts = n_bind_mounts,
3547

3548
                .temporary_filesystems = context->temporary_filesystems,
1,987✔
3549
                .n_temporary_filesystems = context->n_temporary_filesystems,
1,987✔
3550

3551
                .mount_images = context->mount_images,
1,987✔
3552
                .n_mount_images = context->n_mount_images,
1,987✔
3553
                .mount_image_policy = context->mount_image_policy ?: &image_policy_service,
1,987✔
3554

3555
                .tmp_dir = tmp_dir,
3556
                .var_tmp_dir = var_tmp_dir,
3557

3558
                .creds_path = creds_path,
3559
                .log_namespace = context->log_namespace,
1,987✔
3560
                .mount_propagation_flag = context->mount_propagation_flag,
1,987✔
3561

3562
                .verity = &verity,
3563

3564
                .extension_images = context->extension_images,
1,987✔
3565
                .n_extension_images = context->n_extension_images,
1,987✔
3566
                .extension_image_policy = context->extension_image_policy ?: &image_policy_sysext,
1,987✔
3567
                .extension_directories = context->extension_directories,
1,987✔
3568

3569
                .propagate_dir = propagate_dir,
3570
                .incoming_dir = incoming_dir,
3571
                .private_namespace_dir = private_namespace_dir,
3572
                .host_notify_socket = params->notify_socket,
1,987✔
3573
                .notify_socket_path = exec_get_private_notify_socket_path(context, params, needs_sandboxing),
1,987✔
3574
                .host_os_release_stage = host_os_release_stage,
3575

3576
                /* If DynamicUser=no and RootDirectory= is set then lets pass a relaxed sandbox info,
3577
                 * otherwise enforce it, don't ignore protected paths and fail if we are unable to apply the
3578
                 * sandbox inside the mount namespace. */
3579
                .ignore_protect_paths = !needs_sandboxing && !context->dynamic_user && root_dir,
1,987✔
3580

3581
                .protect_control_groups = needs_sandboxing ? exec_get_protect_control_groups(context) : PROTECT_CONTROL_GROUPS_NO,
1,987✔
3582
                .protect_kernel_tunables = needs_sandboxing && context->protect_kernel_tunables,
1,987✔
3583
                .protect_kernel_modules = needs_sandboxing && context->protect_kernel_modules,
1,987✔
3584
                .protect_kernel_logs = needs_sandboxing && context->protect_kernel_logs,
1,987✔
3585

3586
                .private_dev = needs_sandboxing && context->private_devices,
1,987✔
3587
                .private_network = needs_sandboxing && exec_needs_network_namespace(context),
1,987✔
3588
                .private_ipc = needs_sandboxing && exec_needs_ipc_namespace(context),
1,987✔
3589
                .private_pids = needs_sandboxing && exec_needs_pid_namespace(context) ? context->private_pids : PRIVATE_PIDS_NO,
1,987✔
3590
                .private_tmp = needs_sandboxing ? context->private_tmp : PRIVATE_TMP_NO,
1,987✔
3591

3592
                .mount_apivfs = needs_sandboxing && exec_context_get_effective_mount_apivfs(context),
1,987✔
3593
                .bind_log_sockets = needs_sandboxing && exec_context_get_effective_bind_log_sockets(context),
1,987✔
3594

3595
                /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
3596
                .mount_nosuid = needs_sandboxing && context->no_new_privileges && !mac_selinux_use(),
1,987✔
3597

3598
                .protect_home = needs_sandboxing ? context->protect_home : PROTECT_HOME_NO,
1,987✔
3599
                .protect_hostname = needs_sandboxing ? context->protect_hostname : PROTECT_HOSTNAME_NO,
1,987✔
3600
                .protect_system = needs_sandboxing ? context->protect_system : PROTECT_SYSTEM_NO,
1,987✔
3601
                .protect_proc = needs_sandboxing ? context->protect_proc : PROTECT_PROC_DEFAULT,
1,987✔
3602
                .proc_subset = needs_sandboxing ? context->proc_subset : PROC_SUBSET_ALL,
1,987✔
3603
        };
3604

3605
        r = setup_namespace(&parameters, reterr_path);
1,987✔
3606
        /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
3607
         * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
3608
         * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
3609
         * completely different execution environment. */
3610
        if (r == -ENOANO) {
1,987✔
3611
                if (insist_on_sandboxing(
×
3612
                                    context,
3613
                                    root_dir, root_image,
3614
                                    bind_mounts,
3615
                                    n_bind_mounts))
3616
                        return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
×
3617
                                               "Failed to set up namespace, and refusing to continue since "
3618
                                               "the selected namespacing options alter mount environment non-trivially.\n"
3619
                                               "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
3620
                                               n_bind_mounts,
3621
                                               context->n_temporary_filesystems,
3622
                                               yes_no(root_dir),
3623
                                               yes_no(root_image),
3624
                                               yes_no(context->dynamic_user));
3625

3626
                log_debug("Failed to set up namespace, assuming containerized execution and ignoring.");
×
3627
                return 0;
×
3628
        }
3629

3630
        return r;
3631
}
3632

3633
static int apply_working_directory(
9,499✔
3634
                const ExecContext *context,
3635
                const ExecParameters *params,
3636
                ExecRuntime *runtime,
3637
                const char *pwent_home,
3638
                char * const *env) {
3639

3640
        const char *wd;
9,499✔
3641
        int r;
9,499✔
3642

3643
        assert(context);
9,499✔
3644
        assert(params);
9,499✔
3645
        assert(runtime);
9,499✔
3646

3647
        if (context->working_directory_home) {
9,499✔
3648
                /* Preferably use the data from $HOME, in case it was updated by a PAM module */
3649
                wd = strv_env_get(env, "HOME");
93✔
3650
                if (!wd) {
93✔
3651
                        /* If that's not available, use the data from the struct passwd entry: */
3652
                        if (!pwent_home)
1✔
3653
                                return -ENXIO;
3654

3655
                        wd = pwent_home;
3656
                }
3657
        } else
3658
                wd = empty_to_root(context->working_directory);
9,406✔
3659

3660
        if (params->flags & EXEC_APPLY_CHROOT)
9,499✔
3661
                r = RET_NERRNO(chdir(wd));
9,499✔
3662
        else {
3663
                _cleanup_close_ int dfd = -EBADF;
×
3664

3665
                r = chase(wd,
×
3666
                          runtime->ephemeral_copy ?: context->root_directory,
×
3667
                          CHASE_PREFIX_ROOT|CHASE_AT_RESOLVE_IN_ROOT,
3668
                          /* ret_path= */ NULL,
3669
                          &dfd);
3670
                if (r >= 0)
×
3671
                        r = RET_NERRNO(fchdir(dfd));
×
3672
        }
3673
        return context->working_directory_missing_ok ? 0 : r;
9,499✔
3674
}
3675

3676
static int apply_root_directory(
9,499✔
3677
                const ExecContext *context,
3678
                const ExecParameters *params,
3679
                ExecRuntime *runtime,
3680
                const bool needs_mount_ns,
3681
                int *exit_status) {
3682

3683
        assert(context);
9,499✔
3684
        assert(params);
9,499✔
3685
        assert(runtime);
9,499✔
3686
        assert(exit_status);
9,499✔
3687

3688
        if (params->flags & EXEC_APPLY_CHROOT)
9,499✔
3689
                if (!needs_mount_ns && context->root_directory)
9,499✔
3690
                        if (chroot(runtime->ephemeral_copy ?: context->root_directory) < 0) {
×
3691
                                *exit_status = EXIT_CHROOT;
×
3692
                                return -errno;
×
3693
                        }
3694

3695
        return 0;
3696
}
3697

3698
static int setup_keyring(
9,523✔
3699
                const ExecContext *context,
3700
                const ExecParameters *p,
3701
                uid_t uid,
3702
                gid_t gid) {
3703

3704
        key_serial_t keyring;
9,523✔
3705
        int r = 0;
9,523✔
3706
        uid_t saved_uid;
9,523✔
3707
        gid_t saved_gid;
9,523✔
3708

3709
        assert(context);
9,523✔
3710
        assert(p);
9,523✔
3711

3712
        /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
3713
         * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
3714
         * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
3715
         * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
3716
         * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
3717
         * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
3718

3719
        if (context->keyring_mode == EXEC_KEYRING_INHERIT)
9,523✔
3720
                return 0;
3721

3722
        /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
3723
         * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
3724
         * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
3725
         * & group is just as nasty as acquiring a reference to the user keyring. */
3726

3727
        saved_uid = getuid();
8,565✔
3728
        saved_gid = getgid();
8,565✔
3729

3730
        if (gid_is_valid(gid) && gid != saved_gid) {
8,565✔
3731
                if (setregid(gid, -1) < 0)
1,746✔
3732
                        return log_error_errno(errno, "Failed to change GID for user keyring: %m");
×
3733
        }
3734

3735
        if (uid_is_valid(uid) && uid != saved_uid) {
8,565✔
3736
                if (setreuid(uid, -1) < 0) {
1,743✔
3737
                        r = log_error_errno(errno, "Failed to change UID for user keyring: %m");
×
3738
                        goto out;
×
3739
                }
3740
        }
3741

3742
        keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
8,565✔
3743
        if (keyring == -1) {
8,565✔
3744
                if (errno == ENOSYS)
×
3745
                        log_debug_errno(errno, "Kernel keyring not supported, ignoring.");
×
3746
                else if (ERRNO_IS_PRIVILEGE(errno))
×
3747
                        log_debug_errno(errno, "Kernel keyring access prohibited, ignoring.");
×
3748
                else if (errno == EDQUOT)
×
3749
                        log_debug_errno(errno, "Out of kernel keyrings to allocate, ignoring.");
×
3750
                else
3751
                        r = log_error_errno(errno, "Setting up kernel keyring failed: %m");
×
3752

3753
                goto out;
×
3754
        }
3755

3756
        /* When requested link the user keyring into the session keyring. */
3757
        if (context->keyring_mode == EXEC_KEYRING_SHARED) {
8,565✔
3758

3759
                if (keyctl(KEYCTL_LINK,
935✔
3760
                           KEY_SPEC_USER_KEYRING,
3761
                           KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
3762
                        r = log_error_errno(errno, "Failed to link user keyring into session keyring: %m");
×
3763
                        goto out;
×
3764
                }
3765
        }
3766

3767
        /* Restore uid/gid back */
3768
        if (uid_is_valid(uid) && uid != saved_uid) {
8,565✔
3769
                if (setreuid(saved_uid, -1) < 0) {
1,743✔
3770
                        r = log_error_errno(errno, "Failed to change UID back for user keyring: %m");
×
3771
                        goto out;
×
3772
                }
3773
        }
3774

3775
        if (gid_is_valid(gid) && gid != saved_gid) {
8,565✔
3776
                if (setregid(saved_gid, -1) < 0)
1,746✔
3777
                        return log_error_errno(errno, "Failed to change GID back for user keyring: %m");
×
3778
        }
3779

3780
        /* Populate they keyring with the invocation ID by default, as original saved_uid. */
3781
        if (!sd_id128_is_null(p->invocation_id)) {
8,565✔
3782
                key_serial_t key;
8,565✔
3783

3784
                key = add_key("user",
17,130✔
3785
                              "invocation_id",
3786
                              &p->invocation_id,
8,565✔
3787
                              sizeof(p->invocation_id),
3788
                              KEY_SPEC_SESSION_KEYRING);
3789
                if (key == -1)
8,565✔
3790
                        log_debug_errno(errno, "Failed to add invocation ID to keyring, ignoring: %m");
×
3791
                else {
3792
                        if (keyctl(KEYCTL_SETPERM, key,
8,565✔
3793
                                   KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
3794
                                   KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
3795
                                r = log_error_errno(errno, "Failed to restrict invocation ID permission: %m");
×
3796
                }
3797
        }
3798

3799
out:
8,565✔
3800
        /* Revert back uid & gid for the last time, and exit */
3801
        /* no extra logging, as only the first already reported error matters */
3802
        if (getuid() != saved_uid)
8,565✔
3803
                (void) setreuid(saved_uid, -1);
×
3804

3805
        if (getgid() != saved_gid)
8,565✔
3806
                (void) setregid(saved_gid, -1);
×
3807

3808
        return r;
3809
}
3810

3811
/* Appends each non-negative fd of 'pair' to 'array', advancing *n for every element added. The caller
 * must make sure 'array' has room for up to two more entries. */
static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
        assert(array);
        assert(n);
        assert(pair);

        for (size_t i = 0; i < 2; i++)
                if (pair[i] >= 0)
                        array[(*n)++] = pair[i];
}
34,543✔
3821

3822
static int close_remaining_fds(
11,473✔
3823
                const ExecParameters *params,
3824
                const ExecRuntime *runtime,
3825
                int socket_fd,
3826
                const int *fds,
3827
                size_t n_fds) {
11,473✔
3828

3829
        size_t n_dont_close = 0;
11,473✔
3830
        int dont_close[n_fds + 17];
11,473✔
3831

3832
        assert(params);
11,473✔
3833
        assert(runtime);
11,473✔
3834

3835
        if (params->stdin_fd >= 0)
11,473✔
3836
                dont_close[n_dont_close++] = params->stdin_fd;
527✔
3837
        if (params->stdout_fd >= 0)
11,473✔
3838
                dont_close[n_dont_close++] = params->stdout_fd;
527✔
3839
        if (params->stderr_fd >= 0)
11,473✔
3840
                dont_close[n_dont_close++] = params->stderr_fd;
527✔
3841

3842
        if (socket_fd >= 0)
11,473✔
3843
                dont_close[n_dont_close++] = socket_fd;
17✔
3844
        if (n_fds > 0) {
11,473✔
3845
                memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
11,473✔
3846
                n_dont_close += n_fds;
11,473✔
3847
        }
3848

3849
        append_socket_pair(dont_close, &n_dont_close, runtime->ephemeral_storage_socket);
11,473✔
3850

3851
        if (runtime->shared) {
11,473✔
3852
                append_socket_pair(dont_close, &n_dont_close, runtime->shared->netns_storage_socket);
11,473✔
3853
                append_socket_pair(dont_close, &n_dont_close, runtime->shared->ipcns_storage_socket);
11,473✔
3854
        }
3855

3856
        if (runtime->dynamic_creds) {
11,473✔
3857
                if (runtime->dynamic_creds->user)
11,473✔
3858
                        append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->user->storage_socket);
62✔
3859
                if (runtime->dynamic_creds->group)
11,473✔
3860
                        append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->group->storage_socket);
62✔
3861
        }
3862

3863
        if (params->user_lookup_fd >= 0)
11,473✔
3864
                dont_close[n_dont_close++] = params->user_lookup_fd;
11,473✔
3865

3866
        if (params->handoff_timestamp_fd >= 0)
11,473✔
3867
                dont_close[n_dont_close++] = params->handoff_timestamp_fd;
11,473✔
3868

3869
        if (params->pidref_transport_fd >= 0)
11,473✔
3870
                dont_close[n_dont_close++] = params->pidref_transport_fd;
10,422✔
3871

3872
        assert(n_dont_close <= ELEMENTSOF(dont_close));
11,473✔
3873

3874
        return close_all_fds(dont_close, n_dont_close);
11,473✔
3875
}
3876

3877
/* Sends the resolved UID/GID to PID 1 after we learnt it, as a single datagram containing the UID/GID
 * data as well as the unit name.
 *
 * Nothing is sent (and 0 is returned) if 'user_lookup_fd' is negative or if neither 'uid' nor 'gid'
 * is valid, i.e. no user/group to resolve was specified. Returns 0 on success, -errno if writev()
 * fails. */
static int send_user_lookup(
                const char *unit_id,
                int user_lookup_fd,
                uid_t uid,
                gid_t gid) {

        assert(unit_id);

        if (user_lookup_fd < 0 || (!uid_is_valid(uid) && !gid_is_valid(gid)))
                return 0;

        struct iovec iov[] = {
                IOVEC_MAKE(&uid, sizeof(uid)),
                IOVEC_MAKE(&gid, sizeof(gid)),
                IOVEC_MAKE_STRING(unit_id),
        };

        if (writev(user_lookup_fd, iov, 3) < 0)
                return -errno;

        return 0;
}
3904

3905
static int acquire_home(const ExecContext *c, const char **home, char **ret_buf) {
11,471✔
3906
        int r;
11,471✔
3907

3908
        assert(c);
11,471✔
3909
        assert(home);
11,471✔
3910
        assert(ret_buf);
11,471✔
3911

3912
        /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
3913

3914
        if (*home) /* Already acquired from get_fixed_user()? */
11,471✔
3915
                return 0;
3916

3917
        if (!c->working_directory_home)
8,894✔
3918
                return 0;
3919

3920
        if (c->dynamic_user || (c->user && is_this_me(c->user) <= 0))
×
3921
                return -EADDRNOTAVAIL;
×
3922

3923
        r = get_home_dir(ret_buf);
×
3924
        if (r < 0)
×
3925
                return r;
3926

3927
        *home = *ret_buf;
×
3928
        return 1;
×
3929
}
3930

3931
static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
62✔
3932
        _cleanup_strv_free_ char ** list = NULL;
62✔
3933
        int r;
62✔
3934

3935
        assert(c);
62✔
3936
        assert(p);
62✔
3937
        assert(ret);
62✔
3938

3939
        assert(c->dynamic_user);
62✔
3940

3941
        /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
3942
         * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
3943
         * directories. */
3944

3945
        for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
372✔
3946

3947
                if (!EXEC_DIRECTORY_TYPE_SHALL_CHOWN(t))
310✔
3948
                        continue;
62✔
3949

3950
                if (!p->prefix[t])
248✔
3951
                        continue;
×
3952

3953
                for (size_t i = 0; i < c->directories[t].n_items; i++) {
263✔
3954
                        char *e;
15✔
3955

3956
                        if (exec_directory_is_private(c, t))
15✔
3957
                                e = path_join(p->prefix[t], "private", c->directories[t].items[i].path);
13✔
3958
                        else
3959
                                e = path_join(p->prefix[t], c->directories[t].items[i].path);
2✔
3960
                        if (!e)
15✔
3961
                                return -ENOMEM;
3962

3963
                        r = strv_consume(&list, e);
15✔
3964
                        if (r < 0)
15✔
3965
                                return r;
3966
                }
3967
        }
3968

3969
        *ret = TAKE_PTR(list);
62✔
3970

3971
        return 0;
62✔
3972
}
3973

3974
/* Derives a CPU set from the NUMA policy of 'c' via numa_to_cpu_set() and stores it in 'ret'.
 *
 * If no NUMA node mask is set, 'ret' is left untouched and 0 is returned. Otherwise 'ret' is reset
 * and refilled, and the result of cpu_set_add_all() is returned. Errors of numa_to_cpu_set() are
 * passed on. */
static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
        _cleanup_(cpu_set_reset) CPUSet cpus = {};
        int r;

        assert(c);
        assert(ret);

        if (!c->numa_policy.nodes.set) {
                log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
                return 0;
        }

        r = numa_to_cpu_set(&c->numa_policy, &cpus);
        if (r < 0)
                return r;

        /* Drop whatever the caller's set contained before copying in the derived CPUs. */
        cpu_set_reset(ret);

        return cpu_set_add_all(ret, &cpus);
}
3994

3995
static int add_shifted_fd(int *fds, size_t fds_size, size_t *n_fds, int *fd) {
43,918✔
3996
        int r;
43,918✔
3997

3998
        assert(fds);
43,918✔
3999
        assert(n_fds);
43,918✔
4000
        assert(*n_fds < fds_size);
43,918✔
4001
        assert(fd);
43,918✔
4002

4003
        if (*fd < 0)
43,918✔
4004
               return 0;
43,918✔
4005

4006
        if (*fd < 3 + (int) *n_fds) {
21,354✔
4007
                /* Let's move the fd up, so that it's outside of the fd range we will use to store
4008
                 * the fds we pass to the process (or which are closed only during execve). */
4009

4010
                r = fcntl(*fd, F_DUPFD_CLOEXEC, 3 + (int) *n_fds);
9,499✔
4011
                if (r < 0)
9,499✔
4012
                        return -errno;
×
4013

4014
                close_and_replace(*fd, r);
9,499✔
4015
        }
4016

4017
        fds[(*n_fds)++] = *fd;
21,354✔
4018
        return 1;
21,354✔
4019
}
4020

4021
static int connect_unix_harder(const OpenFile *of, int ofd) {
1✔
4022
        static const int socket_types[] = { SOCK_DGRAM, SOCK_STREAM, SOCK_SEQPACKET };
1✔
4023

4024
        union sockaddr_union addr = {
1✔
4025
                .un.sun_family = AF_UNIX,
4026
        };
4027
        socklen_t sa_len;
1✔
4028
        int r;
1✔
4029

4030
        assert(of);
1✔
4031
        assert(ofd >= 0);
1✔
4032

4033
        r = sockaddr_un_set_path(&addr.un, FORMAT_PROC_FD_PATH(ofd));
1✔
4034
        if (r < 0)
1✔
4035
                return log_debug_errno(r, "Failed to set sockaddr for '%s': %m", of->path);
×
4036
        sa_len = r;
1✔
4037

4038
        FOREACH_ELEMENT(i, socket_types) {
2✔
4039
                _cleanup_close_ int fd = -EBADF;
2✔
4040

4041
                fd = socket(AF_UNIX, *i|SOCK_CLOEXEC, 0);
2✔
4042
                if (fd < 0)
2✔
4043
                        return log_debug_errno(errno, "Failed to create socket for '%s': %m", of->path);
×
4044

4045
                r = RET_NERRNO(connect(fd, &addr.sa, sa_len));
2✔
4046
                if (r >= 0)
1✔
4047
                        return TAKE_FD(fd);
1✔
4048
                if (r != -EPROTOTYPE)
1✔
4049
                        return log_debug_errno(r, "Failed to connect to socket for '%s': %m", of->path);
×
4050
        }
4051

4052
        return log_debug_errno(SYNTHETIC_ERRNO(EPROTOTYPE), "No suitable socket type to connect to socket '%s'.", of->path);
×
4053
}
4054

4055
static int get_open_file_fd(const OpenFile *of) {
5✔
4056
        _cleanup_close_ int fd = -EBADF, ofd = -EBADF;
5✔
4057
        struct stat st;
5✔
4058

4059
        assert(of);
5✔
4060

4061
        ofd = open(of->path, O_PATH | O_CLOEXEC);
5✔
4062
        if (ofd < 0)
5✔
4063
                return log_debug_errno(errno, "Failed to open '%s' as O_PATH: %m", of->path);
2✔
4064

4065
        if (fstat(ofd, &st) < 0)
3✔
4066
                return log_debug_errno( errno, "Failed to stat '%s': %m", of->path);
×
4067

4068
        if (S_ISSOCK(st.st_mode)) {
3✔
4069
                fd = connect_unix_harder(of, ofd);
1✔
4070
                if (fd < 0)
1✔
4071
                        return fd;
4072

4073
                if (FLAGS_SET(of->flags, OPENFILE_READ_ONLY) && shutdown(fd, SHUT_WR) < 0)
1✔
4074
                        return log_debug_errno(errno, "Failed to shutdown send for socket '%s': %m", of->path);
×
4075

4076
                log_debug("Opened socket '%s' as fd %d.", of->path, fd);
1✔
4077
        } else {
4078
                int flags = FLAGS_SET(of->flags, OPENFILE_READ_ONLY) ? O_RDONLY : O_RDWR;
2✔
4079
                if (FLAGS_SET(of->flags, OPENFILE_APPEND))
2✔
4080
                        flags |= O_APPEND;
×
4081
                else if (FLAGS_SET(of->flags, OPENFILE_TRUNCATE))
2✔
4082
                        flags |= O_TRUNC;
×
4083

4084
                fd = fd_reopen(ofd, flags|O_NOCTTY|O_CLOEXEC);
2✔
4085
                if (fd < 0)
2✔
4086
                        return log_debug_errno(fd, "Failed to reopen file '%s': %m", of->path);
×
4087

4088
                log_debug("Opened file '%s' as fd %d.", of->path, fd);
2✔
4089
        }
4090

4091
        return TAKE_FD(fd);
4092
}
4093

4094
/* Opens every entry of p->open_files and appends the resulting fds to p->fds (and their names to
 * p->fd_names), advancing '*n_fds' for each fd added.
 *
 * Entries that cannot be opened are skipped with a log message if OPENFILE_GRACEFUL is set for them;
 * otherwise the error is logged and returned. Returns 0 on success, a negative errno-style value on
 * failure (including out-of-memory). */
static int collect_open_file_fds(ExecParameters *p, size_t *n_fds) {
        assert(p);
        assert(n_fds);

        LIST_FOREACH(open_files, of, p->open_files) {
                _cleanup_close_ int fd = get_open_file_fd(of);

                if (fd < 0) {
                        if (!FLAGS_SET(of->flags, OPENFILE_GRACEFUL))
                                return log_error_errno(fd, "Failed to get OpenFile= file descriptor for '%s': %m", of->path);

                        /* Missing files and privilege errors are only logged at debug level. */
                        log_full_errno(fd == -ENOENT || ERRNO_IS_NEG_PRIVILEGE(fd) ? LOG_DEBUG : LOG_WARNING,
                                       fd,
                                       "Failed to get OpenFile= file descriptor for '%s', ignoring: %m",
                                       of->path);
                        continue;
                }

                if (!GREEDY_REALLOC(p->fds, *n_fds + 1))
                        return log_oom();

                if (strv_extend(&p->fd_names, of->fdname) < 0)
                        return log_oom();

                /* The array owns the fd from here on. */
                p->fds[(*n_fds)++] = TAKE_FD(fd);
        }

        return 0;
}
4125

4126
static void log_command_line(
9,498✔
4127
                const ExecContext *context,
4128
                const ExecParameters *params,
4129
                const char *msg,
4130
                const char *executable,
4131
                char **argv) {
4132

4133
        assert(context);
9,498✔
4134
        assert(params);
9,498✔
4135
        assert(msg);
9,498✔
4136
        assert(executable);
9,498✔
4137

4138
        if (!DEBUG_LOGGING)
9,498✔
4139
                return;
9,498✔
4140

4141
        _cleanup_free_ char *cmdline = quote_command_line(argv, SHELL_ESCAPE_EMPTY);
18,342✔
4142

4143
        log_struct(LOG_DEBUG,
17,561✔
4144
                   LOG_ITEM("EXECUTABLE=%s", executable),
4145
                   LOG_EXEC_MESSAGE(params, "%s: %s", msg, strnull(cmdline)),
4146
                   LOG_EXEC_INVOCATION_ID(params));
4147
}
4148

4149
static bool exec_context_needs_cap_sys_admin(const ExecContext *context) {
1,629✔
4150
        assert(context);
1,629✔
4151

4152
        return context->private_users != PRIVATE_USERS_NO ||
3,245✔
4153
               context->private_tmp != PRIVATE_TMP_NO ||
1,616✔
4154
               context->private_devices ||
1,601✔
4155
               context->private_network ||
1,594✔
4156
               context->network_namespace_path ||
1,587✔
4157
               context->private_ipc ||
1,587✔
4158
               context->ipc_namespace_path ||
1,587✔
4159
               context->private_mounts > 0 ||
1,587✔
4160
               context->mount_apivfs > 0 ||
1,577✔
4161
               context->bind_log_sockets > 0 ||
1,577✔
4162
               context->n_bind_mounts > 0 ||
1,577✔
4163
               context->n_temporary_filesystems > 0 ||
1,572✔
4164
               context->root_directory ||
1,572✔
4165
               !strv_isempty(context->extension_directories) ||
1,572✔
4166
               context->protect_system != PROTECT_SYSTEM_NO ||
1,572✔
4167
               context->protect_home != PROTECT_HOME_NO ||
3,129✔
4168
               exec_needs_pid_namespace(context) ||
1,557✔
4169
               context->protect_kernel_tunables ||
1,535✔
4170
               context->protect_kernel_modules ||
1,530✔
4171
               context->protect_kernel_logs ||
3,050✔
4172
               exec_needs_cgroup_mount(context) ||
1,525✔
4173
               context->protect_clock ||
1,525✔
4174
               context->protect_hostname != PROTECT_HOSTNAME_NO ||
1,520✔
4175
               !strv_isempty(context->read_write_paths) ||
1,515✔
4176
               !strv_isempty(context->read_only_paths) ||
1,500✔
4177
               !strv_isempty(context->inaccessible_paths) ||
1,500✔
4178
               !strv_isempty(context->exec_paths) ||
1,500✔
4179
               !strv_isempty(context->no_exec_paths) ||
3,129✔
4180
               context->delegate_namespaces != NAMESPACE_FLAGS_INITIAL;
1,500✔
4181
}
4182

4183
static PrivateUsers exec_context_get_effective_private_users(
9,507✔
4184
                const ExecContext *context,
4185
                const ExecParameters *params) {
4186

4187
        assert(context);
9,507✔
4188
        assert(params);
9,507✔
4189

4190
        if (context->private_users != PRIVATE_USERS_NO)
9,507✔
4191
                return context->private_users;
4192

4193
        /* If any namespace is delegated with DelegateNamespaces=, always set up a user namespace. */
4194
        if (context->delegate_namespaces != NAMESPACE_FLAGS_INITIAL)
9,483✔
4195
                return PRIVATE_USERS_SELF;
3✔
4196

4197
        return PRIVATE_USERS_NO;
4198
}
4199

4200
static bool exec_namespace_is_delegated(
23,131✔
4201
                const ExecContext *context,
4202
                const ExecParameters *params,
4203
                bool have_cap_sys_admin,
4204
                unsigned long namespace) {
4205

4206
        assert(context);
23,131✔
4207
        assert(params);
23,131✔
4208
        assert(namespace != CLONE_NEWUSER);
23,131✔
4209

4210
        /* If we need unprivileged private users, we've already unshared a user namespace by the time we call
4211
         * setup_delegated_namespaces() for the first time so let's make sure we do all other namespace
4212
         * unsharing in the first call to setup_delegated_namespaces() by returning false here. */
4213
        if (!have_cap_sys_admin && exec_context_needs_cap_sys_admin(context))
23,131✔
4214
                return false;
4215

4216
        if (context->delegate_namespaces == NAMESPACE_FLAGS_INITIAL)
23,029✔
4217
                return params->runtime_scope == RUNTIME_SCOPE_USER;
22,981✔
4218

4219
        if (FLAGS_SET(context->delegate_namespaces, namespace))
48✔
4220
                return true;
4221

4222
        /* Various namespaces imply mountns for private procfs/sysfs/cgroupfs instances, which means when
4223
         * those are delegated mountns must be deferred too.
4224
         *
4225
         * The list should stay in sync with exec_needs_mount_namespace(). */
4226
        if (namespace == CLONE_NEWNS)
16✔
4227
                return context->delegate_namespaces & (CLONE_NEWPID|CLONE_NEWCGROUP|CLONE_NEWNET);
4✔
4228

4229
        return false;
4230
}
4231

4232
static int setup_delegated_namespaces(
19,025✔
4233
                const ExecContext *context,
4234
                ExecParameters *params,
4235
                ExecRuntime *runtime,
4236
                bool delegate,
4237
                const char *memory_pressure_path,
4238
                uid_t uid,
4239
                uid_t gid,
4240
                const ExecCommand *command,
4241
                bool needs_sandboxing,
4242
                bool have_cap_sys_admin,
4243
                int *reterr_exit_status) {
4244

4245
        int r;
19,025✔
4246

4247
        /* This function is called twice, once before unsharing the user namespace, and once after unsharing
4248
         * the user namespace. When called before unsharing the user namespace, "delegate" is set to "false".
4249
         * When called after unsharing the user namespace, "delegate" is set to "true". The net effect is
4250
         * that all namespaces that should not be delegated are unshared when this function is called the
4251
         * first time and all namespaces that should be delegated are unshared when this function is called
4252
         * the second time. */
4253

4254
        assert(context);
19,025✔
4255
        assert(params);
19,025✔
4256
        assert(runtime);
19,025✔
4257
        assert(reterr_exit_status);
19,025✔
4258

4259
        if (exec_needs_network_namespace(context) &&
19,140✔
4260
            exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWNET) == delegate &&
115✔
4261
            runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
58✔
4262

4263
                /* Try to enable network namespacing if network namespacing is available and we have
4264
                 * CAP_NET_ADMIN in the current user namespace (either the system manager one or the unit's
4265
                 * own user namespace). We need CAP_NET_ADMIN to be able to configure the loopback device in
4266
                 * the new network namespace. And if we don't have that, then we could only create a network
4267
                 * namespace without the ability to set up "lo". Hence gracefully skip things then. */
4268
                if (namespace_type_supported(NAMESPACE_NET) && have_effective_cap(CAP_NET_ADMIN) > 0) {
58✔
4269
                        r = setup_shareable_ns(runtime->shared->netns_storage_socket, CLONE_NEWNET);
58✔
4270
                        if (ERRNO_IS_NEG_PRIVILEGE(r))
58✔
4271
                                log_notice_errno(r, "PrivateNetwork=yes is configured, but network namespace setup not permitted, proceeding without: %m");
×
4272
                        else if (r < 0) {
58✔
4273
                                *reterr_exit_status = EXIT_NETWORK;
×
4274
                                return log_error_errno(r, "Failed to set up network namespacing: %m");
×
4275
                        } else
4276
                                log_debug("Set up %snetwork namespace", delegate ? "delegated " : "");
113✔
4277
                } else if (context->network_namespace_path) {
×
4278
                        *reterr_exit_status = EXIT_NETWORK;
×
4279
                        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "NetworkNamespacePath= is not supported, refusing.");
×
4280
                } else
4281
                        log_notice("PrivateNetwork=yes is configured, but the kernel does not support or we lack privileges for network namespace, proceeding without.");
×
4282
        }
4283

4284
        if (exec_needs_ipc_namespace(context) &&
19,032✔
4285
            exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWIPC) == delegate &&
7✔
4286
            runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
4✔
4287

4288
                if (namespace_type_supported(NAMESPACE_IPC)) {
4✔
4289
                        r = setup_shareable_ns(runtime->shared->ipcns_storage_socket, CLONE_NEWIPC);
4✔
4290
                        if (ERRNO_IS_NEG_PRIVILEGE(r))
4✔
4291
                                log_warning_errno(r, "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
×
4292
                        else if (r < 0) {
4✔
4293
                                *reterr_exit_status = EXIT_NAMESPACE;
×
4294
                                return log_error_errno(r, "Failed to set up IPC namespacing: %m");
×
4295
                        } else
4296
                                log_debug("Set up %sIPC namespace", delegate ? "delegated " : "");
6✔
4297
                } else if (context->ipc_namespace_path) {
×
4298
                        *reterr_exit_status = EXIT_NAMESPACE;
×
4299
                        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "IPCNamespacePath= is not supported, refusing.");
×
4300
                } else
4301
                        log_warning("PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
×
4302
        }
4303

4304
        if (needs_sandboxing && exec_needs_cgroup_namespace(context) &&
19,046✔
4305
            exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWCGROUP) == delegate) {
21✔
4306
                if (unshare(CLONE_NEWCGROUP) < 0) {
11✔
4307
                        *reterr_exit_status = EXIT_NAMESPACE;
×
4308
                        return log_error_errno(errno, "Failed to set up cgroup namespacing: %m");
×
4309
                }
4310

4311
                log_debug("Set up %scgroup namespace", delegate ? "delegated " : "");
20✔
4312
        }
4313

4314
        /* Unshare a new PID namespace before setting up mounts to ensure /proc/ is mounted with only processes in PID namespace visible.
4315
         * Note PrivatePIDs=yes implies MountAPIVFS=yes so we'll always ensure procfs is remounted. */
4316
        if (needs_sandboxing && exec_needs_pid_namespace(context) &&
19,046✔
4317
            exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWPID) == delegate) {
21✔
4318
                if (params->pidref_transport_fd < 0) {
13✔
4319
                        *reterr_exit_status = EXIT_NAMESPACE;
×
4320
                        return log_error_errno(SYNTHETIC_ERRNO(ENOTCONN), "PidRef socket is not set up: %m");
×
4321
                }
4322

4323
                /* If we had CAP_SYS_ADMIN prior to joining the user namespace, then we are privileged and don't need
4324
                 * to check if we can mount /proc/.
4325
                 *
4326
                 * We need to check prior to entering the user namespace because if we're running unprivileged or in a
4327
                 * system without CAP_SYS_ADMIN, then we can have CAP_SYS_ADMIN in the current user namespace but not
4328
                 * once we unshare a mount namespace. */
4329
                if (!have_cap_sys_admin || delegate) {
13✔
4330
                        r = can_mount_proc();
8✔
4331
                        if (r < 0) {
4✔
4332
                                *reterr_exit_status = EXIT_NAMESPACE;
×
4333
                                return log_error_errno(r, "Failed to detect if /proc/ can be remounted: %m");
×
4334
                        }
4335
                        if (r == 0) {
4✔
4336
                                *reterr_exit_status = EXIT_NAMESPACE;
1✔
4337
                                return log_error_errno(SYNTHETIC_ERRNO(EPERM),
1✔
4338
                                                       "PrivatePIDs=yes is configured, but /proc/ cannot be re-mounted due to lack of privileges, refusing.");
4339
                        }
4340
                }
4341

4342
                r = setup_private_pids(context, params);
8✔
4343
                if (r < 0) {
6✔
4344
                        *reterr_exit_status = EXIT_NAMESPACE;
×
4345
                        return log_error_errno(r, "Failed to set up pid namespace: %m");
×
4346
                }
4347

4348
                log_debug("Set up %spid namespace", delegate ? "delegated " : "");
12✔
4349
        }
4350

4351
        /* If PrivatePIDs= yes is configured, we're now running as pid 1 in a pid namespace! */
4352

4353
        if (exec_needs_mount_namespace(context, params, runtime) &&
22,982✔
4354
            exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWNS) == delegate) {
3,964✔
4355
                _cleanup_free_ char *error_path = NULL;
1,989✔
4356

4357
                r = apply_mount_namespace(command->flags,
1,989✔
4358
                                          context,
4359
                                          params,
4360
                                          runtime,
4361
                                          memory_pressure_path,
4362
                                          needs_sandboxing,
4363
                                          &error_path,
4364
                                          uid,
4365
                                          gid);
4366
                if (r < 0) {
1,989✔
4367
                        *reterr_exit_status = EXIT_NAMESPACE;
15✔
4368
                        return log_error_errno(r, "Failed to set up mount namespacing%s%s: %m",
29✔
4369
                                               error_path ? ": " : "", strempty(error_path));
4370
                }
4371

4372
                log_debug("Set up %smount namespace", delegate ? "delegated " : "");
3,920✔
4373
        }
4374

4375
        if (needs_sandboxing &&
38,006✔
4376
            exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWUTS) == delegate) {
19,003✔
4377
                r = apply_protect_hostname(context, params, reterr_exit_status);
9,501✔
4378
                if (r < 0)
9,501✔
4379
                        return r;
4380
                if (r > 0)
9,501✔
4381
                        log_debug("Set up %sUTS namespace", delegate ? "delegated " : "");
1,325✔
4382
        }
4383

4384
        return 0;
4385
}
4386

4387
static bool exec_context_shall_confirm_spawn(const ExecContext *context) {
×
4388
        assert(context);
×
4389

4390
        if (confirm_spawn_disabled())
×
4391
                return false;
4392

4393
        /* For some reasons units remaining in the same process group
4394
         * as PID 1 fail to acquire the console even if it's not used
4395
         * by any process. So skip the confirmation question for them. */
4396
        return !context->same_pgrp;
×
4397
}
4398

4399
static int exec_context_named_iofds(
11,474✔
4400
                const ExecContext *c,
4401
                const ExecParameters *p,
4402
                int named_iofds[static 3]) {
4403

4404
        size_t targets;
11,474✔
4405
        const char* stdio_fdname[3];
11,474✔
4406
        size_t n_fds;
11,474✔
4407

4408
        assert(c);
11,474✔
4409
        assert(p);
11,474✔
4410
        assert(named_iofds);
11,474✔
4411

4412
        targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
11,474✔
4413
                  (c->std_output == EXEC_OUTPUT_NAMED_FD) +
11,474✔
4414
                  (c->std_error == EXEC_OUTPUT_NAMED_FD);
11,474✔
4415

4416
        for (size_t i = 0; i < 3; i++)
45,896✔
4417
                stdio_fdname[i] = exec_context_fdname(c, i);
34,422✔
4418

4419
        n_fds = p->n_storage_fds + p->n_socket_fds + p->n_extra_fds;
11,474✔
4420

4421
        for (size_t i = 0; i < n_fds  && targets > 0; i++)
11,474✔
4422
                if (named_iofds[STDIN_FILENO] < 0 &&
×
4423
                    c->std_input == EXEC_INPUT_NAMED_FD &&
×
4424
                    stdio_fdname[STDIN_FILENO] &&
×
4425
                    streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
×
4426

4427
                        named_iofds[STDIN_FILENO] = p->fds[i];
×
4428
                        targets--;
×
4429

4430
                } else if (named_iofds[STDOUT_FILENO] < 0 &&
×
4431
                           c->std_output == EXEC_OUTPUT_NAMED_FD &&
×
4432
                           stdio_fdname[STDOUT_FILENO] &&
×
4433
                           streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
×
4434

4435
                        named_iofds[STDOUT_FILENO] = p->fds[i];
×
4436
                        targets--;
×
4437

4438
                } else if (named_iofds[STDERR_FILENO] < 0 &&
×
4439
                           c->std_error == EXEC_OUTPUT_NAMED_FD &&
×
4440
                           stdio_fdname[STDERR_FILENO] &&
×
4441
                           streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
×
4442

4443
                        named_iofds[STDERR_FILENO] = p->fds[i];
×
4444
                        targets--;
×
4445
                }
4446

4447
        return targets == 0 ? 0 : -ENOENT;
11,474✔
4448
}
4449

4450
/* Passes the network and IPC namespace storage socket pairs of 'shared' to safe_close_pair().
 * A NULL 'shared' is accepted and ignored. */
static void exec_shared_runtime_close(ExecSharedRuntime *shared) {
        if (shared) {
                safe_close_pair(shared->netns_storage_socket);
                safe_close_pair(shared->ipcns_storage_socket);
        }
}
4457

4458
/* Releases the fds referenced by the runtime object: the ephemeral storage socket pair, whatever
 * exec_shared_runtime_close() handles for the shared part, and whatever dynamic_creds_close()
 * handles for the dynamic credentials. A NULL 'rt' is accepted and ignored. */
static void exec_runtime_close(ExecRuntime *rt) {
        if (rt) {
                safe_close_pair(rt->ephemeral_storage_socket);

                exec_shared_runtime_close(rt->shared);
                dynamic_creds_close(rt->dynamic_creds);
        }
}
4467

4468
/* Runs safe_close() on the stdin/stdout/stderr fds stored in the parameters and stores the returned
 * value back into each field. A NULL 'p' is accepted and ignored. */
static void exec_params_close(ExecParameters *p) {
        if (p) {
                p->stdin_fd = safe_close(p->stdin_fd);
                p->stdout_fd = safe_close(p->stdout_fd);
                p->stderr_fd = safe_close(p->stderr_fd);
        }
}
4476

4477
static int exec_fd_mark_hot(
9,501✔
4478
                const ExecContext *c,
4479
                ExecParameters *p,
4480
                bool hot,
4481
                int *reterr_exit_status) {
4482

4483
        assert(c);
9,501✔
4484
        assert(p);
9,501✔
4485

4486
        if (p->exec_fd < 0)
9,501✔
4487
                return 0;
9,501✔
4488

4489
        uint8_t x = hot;
275✔
4490

4491
        if (write(p->exec_fd, &x, sizeof(x)) < 0) {
275✔
4492
                if (reterr_exit_status)
×
4493
                        *reterr_exit_status = EXIT_EXEC;
×
4494
                return log_error_errno(errno, "Failed to mark exec_fd as %s: %m", hot ? "hot" : "cold");
×
4495
        }
4496

4497
        return 1;
4498
}
4499

4500
static int send_handoff_timestamp(
9,498✔
4501
                const ExecContext *c,
4502
                ExecParameters *p,
4503
                int *reterr_exit_status) {
4504

4505
        assert(c);
9,498✔
4506
        assert(p);
9,498✔
4507

4508
        if (p->handoff_timestamp_fd < 0)
9,498✔
4509
                return 0;
9,498✔
4510

4511
        dual_timestamp dt;
9,498✔
4512
        dual_timestamp_now(&dt);
9,498✔
4513

4514
        if (write(p->handoff_timestamp_fd, (const usec_t[2]) { dt.realtime, dt.monotonic }, sizeof(usec_t) * 2) < 0) {
9,498✔
4515
                if (reterr_exit_status)
×
4516
                        *reterr_exit_status = EXIT_EXEC;
×
4517
                return log_error_errno(errno, "Failed to send handoff timestamp: %m");
×
4518
        }
4519

4520
        return 1;
9,498✔
4521
}
4522

4523
static void prepare_terminal(
11,471✔
4524
                const ExecContext *context,
4525
                ExecParameters *p) {
4526

4527
        _cleanup_close_ int lock_fd = -EBADF;
11,471✔
4528

4529
        /* This is the "constructive" reset, i.e. is about preparing things for our invocation rather than
4530
         * cleaning up things from older invocations. */
4531

4532
        assert(context);
11,471✔
4533
        assert(p);
11,471✔
4534

4535
        /* We only try to reset things if we there's the chance our stdout points to a TTY */
4536
        if (!(is_terminal_output(context->std_output) ||
11,471✔
4537
              (context->std_output == EXEC_OUTPUT_INHERIT && is_terminal_input(context->std_input)) ||
10,866✔
4538
              context->std_output == EXEC_OUTPUT_NAMED_FD ||
4539
              p->stdout_fd >= 0))
10,866✔
4540
                return;
10,339✔
4541

4542
        /* Let's explicitly determine whether to reset via ANSI sequences or not, taking our ExecContext
4543
         * information into account */
4544
        bool use_ansi = exec_context_shall_ansi_seq_reset(context);
1,132✔
4545

4546
        if (context->tty_reset) {
1,132✔
4547
                /* When we are resetting the TTY, then let's create a lock first, to synchronize access. This
4548
                 * in particular matters as concurrent resets and the TTY size ANSI DSR logic done by the
4549
                 * exec_context_apply_tty_size() below might interfere */
4550
                lock_fd = lock_dev_console();
151✔
4551
                if (lock_fd < 0)
151✔
4552
                        log_debug_errno(lock_fd, "Failed to lock /dev/console, ignoring: %m");
×
4553

4554
                /* We explicitly control whether to send ansi sequences or not here, since we want to consult
4555
                 * the env vars explicitly configured in the ExecContext, rather than our own environment
4556
                 * block. */
4557
                (void) terminal_reset_defensive(STDOUT_FILENO, use_ansi ? TERMINAL_RESET_FORCE_ANSI_SEQ : TERMINAL_RESET_AVOID_ANSI_SEQ);
154✔
4558
        }
4559

4560
        (void) exec_context_apply_tty_size(context, STDIN_FILENO, STDOUT_FILENO, /* tty_path= */ NULL);
1,132✔
4561

4562
        if (use_ansi)
1,132✔
4563
                (void) osc_context_open_service(p->unit_id, p->invocation_id, /* ret_seq= */ NULL);
148✔
4564
}
4565

4566
int exec_invoke(
11,474✔
4567
                const ExecCommand *command,
4568
                const ExecContext *context,
4569
                ExecParameters *params,
4570
                ExecRuntime *runtime,
4571
                const CGroupContext *cgroup_context,
4572
                int *exit_status) {
11,474✔
4573

4574
        _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **joined_exec_search_path = NULL, **accum_env = NULL, **replaced_argv = NULL;
28✔
4575
        int r;
11,474✔
4576
        const char *username = NULL, *groupname = NULL;
11,474✔
4577
        _cleanup_free_ char *home_buffer = NULL, *memory_pressure_path = NULL, *own_user = NULL;
×
4578
        const char *pwent_home = NULL, *shell = NULL;
11,474✔
4579
        char **final_argv = NULL;
11,474✔
4580
        dev_t journal_stream_dev = 0;
11,474✔
4581
        ino_t journal_stream_ino = 0;
11,474✔
4582
        bool needs_sandboxing,          /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
11,474✔
4583
                needs_setuid,           /* Do we need to do the actual setresuid()/setresgid() calls? */
4584
                needs_mount_namespace,  /* Do we need to set up a mount namespace for this kernel? */
4585
                have_cap_sys_admin,
4586
                userns_set_up = false,
11,474✔
4587
                keep_seccomp_privileges = false;
11,474✔
4588
#if HAVE_SELINUX
4589
        _cleanup_free_ char *mac_selinux_context_net = NULL;
4590
        bool use_selinux = false;
4591
#endif
4592
#if ENABLE_SMACK
4593
        bool use_smack = false;
11,474✔
4594
#endif
4595
#if HAVE_APPARMOR
4596
        bool use_apparmor = false;
4597
#endif
4598
#if HAVE_SECCOMP
4599
        uint64_t saved_bset = 0;
11,474✔
4600
#endif
4601
        uid_t saved_uid = getuid();
11,474✔
4602
        gid_t saved_gid = getgid();
11,474✔
4603
        uid_t uid = UID_INVALID;
11,474✔
4604
        gid_t gid = GID_INVALID;
11,474✔
4605
        size_t n_fds, /* fds to pass to the child */
11,474✔
4606
               n_keep_fds; /* total number of fds not to close */
4607
        int secure_bits;
11,474✔
4608
        _cleanup_free_ gid_t *gids = NULL, *gids_after_pam = NULL;
28✔
4609
        int ngids = 0, ngids_after_pam = 0;
11,474✔
4610
        int socket_fd = -EBADF, named_iofds[3] = EBADF_TRIPLET;
11,474✔
4611
        size_t n_storage_fds, n_socket_fds, n_extra_fds;
11,474✔
4612

4613
        assert(command);
11,474✔
4614
        assert(context);
11,474✔
4615
        assert(params);
11,474✔
4616
        assert(runtime);
11,474✔
4617
        assert(cgroup_context);
11,474✔
4618
        assert(exit_status);
11,474✔
4619

4620
        LOG_CONTEXT_PUSH_EXEC(context, params);
32,876✔
4621

4622
        /* Explicitly test for CVE-2021-4034 inspired invocations */
4623
        if (!command->path || strv_isempty(command->argv)) {
11,474✔
4624
                *exit_status = EXIT_EXEC;
×
4625
                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid command line arguments.");
×
4626
        }
4627

4628
        if (context->std_input == EXEC_INPUT_SOCKET ||
11,474✔
4629
            context->std_output == EXEC_OUTPUT_SOCKET ||
11,463✔
4630
            context->std_error == EXEC_OUTPUT_SOCKET) {
11,457✔
4631

4632
                if (params->n_socket_fds > 1)
17✔
4633
                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Got more than one socket.");
×
4634

4635
                if (params->n_socket_fds == 0)
17✔
4636
                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Got no socket.");
×
4637

4638
                socket_fd = params->fds[0];
17✔
4639
                n_storage_fds = n_socket_fds = n_extra_fds = 0;
17✔
4640
        } else {
4641
                n_socket_fds = params->n_socket_fds;
11,457✔
4642
                n_storage_fds = params->n_storage_fds;
11,457✔
4643
                n_extra_fds = params->n_extra_fds;
11,457✔
4644
        }
4645
        n_fds = n_socket_fds + n_storage_fds + n_extra_fds;
11,474✔
4646

4647
        r = exec_context_named_iofds(context, params, named_iofds);
11,474✔
4648
        if (r < 0)
11,474✔
4649
                return log_error_errno(r, "Failed to load a named file descriptor: %m");
×
4650

4651
        rename_process_from_path(command->path);
11,474✔
4652

4653
        /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main
4654
         * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially,
4655
         * both of which will be demoted to SIG_DFL. */
4656
        (void) default_signals(SIGNALS_CRASH_HANDLER,
11,474✔
4657
                               SIGNALS_IGNORE);
4658

4659
        if (context->ignore_sigpipe)
11,474✔
4660
                (void) ignore_signals(SIGPIPE);
11,323✔
4661

4662
        r = reset_signal_mask();
11,474✔
4663
        if (r < 0) {
11,474✔
4664
                *exit_status = EXIT_SIGNAL_MASK;
×
4665
                return log_error_errno(r, "Failed to set process signal mask: %m");
×
4666
        }
4667

4668
        if (params->idle_pipe)
11,474✔
4669
                do_idle_pipe_dance(params->idle_pipe);
148✔
4670

4671
        /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
4672
         * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
4673
         * any fds open we don't really want open during the transition. In order to make logging work, we switch the
4674
         * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
4675

4676
        log_forget_fds();
11,474✔
4677
        log_set_open_when_needed(true);
11,474✔
4678
        log_settle_target();
11,474✔
4679

4680
        /* In case anything used libc syslog(), close this here, too */
4681
        closelog();
11,474✔
4682

4683
        r = collect_open_file_fds(params, &n_fds);
11,474✔
4684
        if (r < 0) {
11,474✔
4685
                *exit_status = EXIT_FDS;
1✔
4686
                return log_error_errno(r, "Failed to get OpenFile= file descriptors: %m");
1✔
4687
        }
4688

4689
        int keep_fds[n_fds + 4];
11,473✔
4690
        memcpy_safe(keep_fds, params->fds, n_fds * sizeof(int));
11,473✔
4691
        n_keep_fds = n_fds;
11,473✔
4692

4693
        r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, &params->exec_fd);
11,473✔
4694
        if (r < 0) {
11,473✔
4695
                *exit_status = EXIT_FDS;
×
4696
                return log_error_errno(r, "Failed to collect shifted fd: %m");
×
4697
        }
4698

4699
        r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, &params->handoff_timestamp_fd);
11,473✔
4700
        if (r < 0) {
11,473✔
4701
                *exit_status = EXIT_FDS;
×
4702
                return log_error_errno(r, "Failed to collect shifted fd: %m");
×
4703
        }
4704

4705
#if HAVE_LIBBPF
4706
        r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, &params->bpf_restrict_fs_map_fd);
11,473✔
4707
        if (r < 0) {
11,473✔
4708
                *exit_status = EXIT_FDS;
×
4709
                return log_error_errno(r, "Failed to collect shifted fd: %m");
×
4710
        }
4711
#endif
4712

4713
        r = close_remaining_fds(params, runtime, socket_fd, keep_fds, n_keep_fds);
11,473✔
4714
        if (r < 0) {
11,473✔
4715
                *exit_status = EXIT_FDS;
×
4716
                return log_error_errno(r, "Failed to close unwanted file descriptors: %m");
×
4717
        }
4718

4719
        if (!context->same_pgrp &&
22,077✔
4720
            setsid() < 0) {
10,604✔
4721
                *exit_status = EXIT_SETSID;
×
4722
                return log_error_errno(errno, "Failed to create new process session: %m");
×
4723
        }
4724

4725
        /* Now, reset the TTY associated to this service "destructively" (i.e. possibly even hang up or
4726
         * disallocate the VT), to get rid of any prior uses of the device. Note that we do not keep any fd
4727
         * open here, hence some of the settings made here might vanish again, depending on the TTY driver
4728
         * used. A 2nd ("constructive") initialization after we opened the input/output fds we actually want
4729
         * will fix this. Note that we pass a NULL invocation ID here – as exec_context_tty_reset() expects
4730
         * the invocation ID associated with the OSC 3008 context ID to close. But we don't want to close any
4731
         * OSC 3008 context here, and opening a fresh OSC 3008 context happens a bit further down. */
4732
        exec_context_tty_reset(context, params, /* invocation_id= */ SD_ID128_NULL);
11,473✔
4733

4734
        if (params->shall_confirm_spawn && exec_context_shall_confirm_spawn(context)) {
11,473✔
4735
                _cleanup_free_ char *cmdline = NULL;
×
4736

4737
                cmdline = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY);
×
4738
                if (!cmdline) {
×
4739
                        *exit_status = EXIT_MEMORY;
×
4740
                        return log_oom();
×
4741
                }
4742

4743
                r = ask_for_confirmation(context, params, cmdline);
×
4744
                if (r != CONFIRM_EXECUTE) {
×
4745
                        if (r == CONFIRM_PRETEND_SUCCESS) {
×
4746
                                *exit_status = EXIT_SUCCESS;
×
4747
                                return 0;
×
4748
                        }
4749

4750
                        *exit_status = EXIT_CONFIRM;
×
4751
                        return log_error_errno(SYNTHETIC_ERRNO(ECANCELED), "Execution cancelled by the user.");
×
4752
                }
4753
        }
4754

4755
        /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
4756
         * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
4757
         * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
4758
         * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
4759
         * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
4760
        if (setenv("SYSTEMD_ACTIVATION_UNIT", params->unit_id, true) != 0 ||
22,946✔
4761
            setenv("SYSTEMD_ACTIVATION_SCOPE", runtime_scope_to_string(params->runtime_scope), true) != 0) {
11,473✔
4762
                *exit_status = EXIT_MEMORY;
×
4763
                return log_error_errno(errno, "Failed to update environment: %m");
×
4764
        }
4765

4766
        if (context->dynamic_user && runtime->dynamic_creds) {
11,535✔
4767
                _cleanup_strv_free_ char **suggested_paths = NULL;
62✔
4768

4769
                /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
4770
                 * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */
4771
                if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
62✔
4772
                        *exit_status = EXIT_USER;
×
4773
                        return log_error_errno(errno, "Failed to update environment: %m");
×
4774
                }
4775

4776
                r = compile_suggested_paths(context, params, &suggested_paths);
62✔
4777
                if (r < 0) {
62✔
4778
                        *exit_status = EXIT_MEMORY;
×
4779
                        return log_oom();
×
4780
                }
4781

4782
                r = dynamic_creds_realize(runtime->dynamic_creds, suggested_paths, &uid, &gid);
62✔
4783
                if (r < 0) {
62✔
4784
                        *exit_status = EXIT_USER;
×
4785
                        if (r == -EILSEQ)
×
4786
                                return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
×
4787
                                                       "Failed to update dynamic user credentials: User or group with specified name already exists.");
4788
                        return log_error_errno(r, "Failed to update dynamic user credentials: %m");
×
4789
                }
4790

4791
                if (!uid_is_valid(uid)) {
62✔
4792
                        *exit_status = EXIT_USER;
×
4793
                        return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "UID validation failed for \""UID_FMT"\".", uid);
×
4794
                }
4795

4796
                if (!gid_is_valid(gid)) {
62✔
4797
                        *exit_status = EXIT_USER;
×
4798
                        return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "GID validation failed for \""GID_FMT"\".", gid);
×
4799
                }
4800

4801
                if (runtime->dynamic_creds->user)
62✔
4802
                        username = runtime->dynamic_creds->user->name;
62✔
4803

4804
        } else {
4805
                const char *u;
11,411✔
4806

4807
                if (context->user)
11,411✔
4808
                        u = context->user;
4809
                else if (context->pam_name) {
8,891✔
4810
                        /* If PAM is enabled but no user name is explicitly selected, then use our own one. */
4811
                        own_user = getusername_malloc();
59✔
4812
                        if (!own_user) {
59✔
4813
                                *exit_status = EXIT_USER;
×
4814
                                return log_error_errno(r, "Failed to determine my own user ID: %m");
×
4815
                        }
4816
                        u = own_user;
4817
                } else
4818
                        u = NULL;
4819

4820
                if (u) {
4821
                        /* We can't use nss unconditionally for root without risking deadlocks if some IPC services
4822
                         * will be started by pid1 and are ordered after us. But if SetLoginEnvironment= is
4823
                         * enabled *explicitly* (i.e. no exec_context_get_set_login_environment() here),
4824
                         * or PAM shall be invoked, let's consult NSS even for root, so that the user
4825
                         * gets accurate $SHELL in session(-like) contexts. */
4826
                        r = get_fixed_user(u,
2,579✔
4827
                                           /* prefer_nss = */ context->set_login_environment > 0 || context->pam_name,
2,579✔
4828
                                           &username, &uid, &gid, &pwent_home, &shell);
4829
                        if (r < 0) {
2,579✔
4830
                                *exit_status = EXIT_USER;
2✔
4831
                                return log_error_errno(r, "Failed to determine user credentials: %m");
2✔
4832
                        }
4833
                }
4834

4835
                if (context->group) {
11,409✔
4836
                        r = get_fixed_group(context->group, &groupname, &gid);
11✔
4837
                        if (r < 0) {
11✔
4838
                                *exit_status = EXIT_GROUP;
×
4839
                                return log_error_errno(r, "Failed to determine group credentials: %m");
×
4840
                        }
4841
                }
4842
        }
4843

4844
        /* Initialize user supplementary groups and get SupplementaryGroups= ones */
4845
        ngids = get_supplementary_groups(context, username, gid, &gids);
11,471✔
4846
        if (ngids < 0) {
11,471✔
4847
                *exit_status = EXIT_GROUP;
×
4848
                return log_error_errno(ngids, "Failed to determine supplementary groups: %m");
×
4849
        }
4850

4851
        r = send_user_lookup(params->unit_id, params->user_lookup_fd, uid, gid);
11,471✔
4852
        if (r < 0) {
11,471✔
4853
                *exit_status = EXIT_USER;
×
4854
                return log_error_errno(r, "Failed to send user credentials to PID1: %m");
×
4855
        }
4856

4857
        params->user_lookup_fd = safe_close(params->user_lookup_fd);
11,471✔
4858

4859
        r = acquire_home(context, &pwent_home, &home_buffer);
11,471✔
4860
        if (r < 0) {
11,471✔
4861
                *exit_status = EXIT_CHDIR;
×
4862
                return log_error_errno(r, "Failed to determine $HOME for the invoking user: %m");
×
4863
        }
4864

4865
        /* If a socket is connected to STDIN/STDOUT/STDERR, we must drop O_NONBLOCK */
4866
        if (socket_fd >= 0)
11,471✔
4867
                (void) fd_nonblock(socket_fd, false);
17✔
4868

4869
        /* Journald will try to look up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
4870
         * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
4871
        if (params->cgroup_path) {
11,471✔
4872
                _cleanup_free_ char *p = NULL;
11,471✔
4873

4874
                r = exec_params_get_cgroup_path(params, cgroup_context, &p);
11,471✔
4875
                if (r < 0) {
11,471✔
4876
                        *exit_status = EXIT_CGROUP;
×
4877
                        return log_error_errno(r, "Failed to acquire cgroup path: %m");
×
4878
                }
4879

4880
                r = cg_attach(p, 0);
11,471✔
4881
                if (r == -EUCLEAN) {
11,471✔
4882
                        *exit_status = EXIT_CGROUP;
×
4883
                        return log_error_errno(r,
×
4884
                                               "Failed to attach process to cgroup '%s', "
4885
                                               "because the cgroup or one of its parents or "
4886
                                               "siblings is in the threaded mode.", p);
4887
                }
4888
                if (r < 0) {
11,471✔
4889
                        *exit_status = EXIT_CGROUP;
×
4890
                        return log_error_errno(r, "Failed to attach to cgroup %s: %m", p);
×
4891
                }
4892
        }
4893

4894
        if (context->network_namespace_path && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
11,471✔
4895
                r = open_shareable_ns_path(runtime->shared->netns_storage_socket, context->network_namespace_path, CLONE_NEWNET);
×
4896
                if (r < 0) {
×
4897
                        *exit_status = EXIT_NETWORK;
×
4898
                        return log_error_errno(r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
×
4899
                }
4900
        }
4901

4902
        if (context->ipc_namespace_path && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
11,471✔
4903
                r = open_shareable_ns_path(runtime->shared->ipcns_storage_socket, context->ipc_namespace_path, CLONE_NEWIPC);
×
4904
                if (r < 0) {
×
4905
                        *exit_status = EXIT_NAMESPACE;
×
4906
                        return log_error_errno(r, "Failed to open IPC namespace path %s: %m", context->ipc_namespace_path);
×
4907
                }
4908
        }
4909

4910
        r = setup_input(context, params, socket_fd, named_iofds);
11,471✔
4911
        if (r < 0) {
11,471✔
4912
                *exit_status = EXIT_STDIN;
×
4913
                return log_error_errno(r, "Failed to set up standard input: %m");
×
4914
        }
4915

4916
        _cleanup_free_ char *fname = NULL;
25✔
4917
        r = path_extract_filename(command->path, &fname);
11,471✔
4918
        if (r < 0) {
11,471✔
4919
                *exit_status = EXIT_STDOUT;
×
4920
                return log_error_errno(r, "Failed to extract filename from path %s: %m", command->path);
×
4921
        }
4922

4923
        r = setup_output(context, params, STDOUT_FILENO, socket_fd, named_iofds, fname, uid, gid, &journal_stream_dev, &journal_stream_ino);
11,471✔
4924
        if (r < 0) {
11,471✔
4925
                *exit_status = EXIT_STDOUT;
×
4926
                return log_error_errno(r, "Failed to set up standard output: %m");
×
4927
        }
4928

4929
        r = setup_output(context, params, STDERR_FILENO, socket_fd, named_iofds, fname, uid, gid, &journal_stream_dev, &journal_stream_ino);
11,471✔
4930
        if (r < 0) {
11,471✔
4931
                *exit_status = EXIT_STDERR;
×
4932
                return log_error_errno(r, "Failed to set up standard error output: %m");
×
4933
        }
4934

4935
        /* Now that stdin/stdout are definitely opened, properly initialize it with our desired
4936
         * settings. Note: this is a "constructive" reset, it prepares things for us to use. This is
4937
         * different from the "destructive" TTY reset further up. Also note: we apply this on stdin/stdout in
4938
         * case this is a tty, regardless if we opened it ourselves or got it passed in pre-opened. */
4939
        prepare_terminal(context, params);
11,471✔
4940

4941
        if (context->oom_score_adjust_set) {
11,471✔
4942
                /* When we can't make this change due to EPERM, then let's silently skip over it. User
4943
                 * namespaces prohibit write access to this file, and we shouldn't trip up over that. */
4944
                r = set_oom_score_adjust(context->oom_score_adjust);
1,297✔
4945
                if (ERRNO_IS_NEG_PRIVILEGE(r))
1,297✔
4946
                        log_debug_errno(r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
×
4947
                else if (r < 0) {
1,297✔
4948
                        *exit_status = EXIT_OOM_ADJUST;
×
4949
                        return log_error_errno(r, "Failed to adjust OOM setting: %m");
×
4950
                }
4951
        }
4952

4953
        if (context->coredump_filter_set) {
11,471✔
4954
                r = set_coredump_filter(context->coredump_filter);
2✔
4955
                if (ERRNO_IS_NEG_PRIVILEGE(r))
2✔
4956
                        log_debug_errno(r, "Failed to adjust coredump_filter, ignoring: %m");
×
4957
                else if (r < 0) {
2✔
4958
                        *exit_status = EXIT_LIMITS;
×
4959
                        return log_error_errno(r, "Failed to adjust coredump_filter: %m");
×
4960
                }
4961
        }
4962

4963
        if (context->cpu_sched_set) {
11,471✔
4964
                struct sched_attr attr = {
×
4965
                        .size = sizeof(attr),
4966
                        .sched_policy = context->cpu_sched_policy,
×
4967
                        .sched_priority = context->cpu_sched_priority,
×
4968
                        .sched_flags = context->cpu_sched_reset_on_fork ? SCHED_FLAG_RESET_ON_FORK : 0,
×
4969
                };
4970

4971
                r = sched_setattr(/* pid= */ 0, &attr, /* flags= */ 0);
×
4972
                if (r < 0) {
×
4973
                        *exit_status = EXIT_SETSCHEDULER;
×
4974
                        return log_error_errno(errno, "Failed to set up CPU scheduling: %m");
×
4975
                }
4976
        }
4977

4978
        /*
4979
         * Set nice value _after_ the call to sched_setattr() because struct sched_attr includes sched_nice
4980
         * which we do not set, thus it will clobber any previously set nice value. Scheduling policy might
4981
         * be reasonably set together with nice value e.g. in case of SCHED_BATCH (see sched(7)).
4982
         * It would be ideal to set both with the same call, but we cannot easily do so because of all the
4983
         * extra logic in setpriority_closest().
4984
         */
4985
        if (context->nice_set) {
11,471✔
4986
                r = setpriority_closest(context->nice);
15✔
4987
                if (r < 0) {
15✔
4988
                        *exit_status = EXIT_NICE;
×
4989
                        return log_error_errno(r, "Failed to set up process scheduling priority (nice level): %m");
×
4990
                }
4991
        }
4992

4993
        if (context->cpu_affinity_from_numa || context->cpu_set.set) {
11,471✔
4994
                _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
2✔
4995
                const CPUSet *cpu_set;
2✔
4996

4997
                if (context->cpu_affinity_from_numa) {
2✔
4998
                        r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
2✔
4999
                        if (r < 0) {
2✔
5000
                                *exit_status = EXIT_CPUAFFINITY;
×
5001
                                return log_error_errno(r, "Failed to derive CPU affinity mask from NUMA mask: %m");
×
5002
                        }
5003

5004
                        cpu_set = &converted_cpu_set;
5005
                } else
5006
                        cpu_set = &context->cpu_set;
×
5007

5008
                if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
2✔
5009
                        *exit_status = EXIT_CPUAFFINITY;
×
5010
                        return log_error_errno(errno, "Failed to set up CPU affinity: %m");
×
5011
                }
5012
        }
5013

5014
        if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
11,471✔
5015
                r = apply_numa_policy(&context->numa_policy);
19✔
5016
                if (ERRNO_IS_NEG_NOT_SUPPORTED(r))
19✔
5017
                        log_debug_errno(r, "NUMA support not available, ignoring.");
×
5018
                else if (r < 0) {
19✔
5019
                        *exit_status = EXIT_NUMA_POLICY;
2✔
5020
                        return log_error_errno(r, "Failed to set NUMA memory policy: %m");
2✔
5021
                }
5022
        }
5023

5024
        if (context->ioprio_set)
11,469✔
5025
                if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
9✔
5026
                        *exit_status = EXIT_IOPRIO;
×
5027
                        return log_error_errno(errno, "Failed to set up IO scheduling priority: %m");
×
5028
                }
5029

5030
        if (context->timer_slack_nsec != NSEC_INFINITY)
11,469✔
5031
                if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
×
5032
                        *exit_status = EXIT_TIMERSLACK;
×
5033
                        return log_error_errno(errno, "Failed to set up timer slack: %m");
×
5034
                }
5035

5036
        if (context->personality != PERSONALITY_INVALID) {
11,469✔
5037
                r = safe_personality(context->personality);
×
5038
                if (r < 0) {
×
5039
                        *exit_status = EXIT_PERSONALITY;
×
5040
                        return log_error_errno(r, "Failed to set up execution domain (personality): %m");
×
5041
                }
5042
        }
5043

5044
        if (context->memory_ksm >= 0)
11,469✔
5045
                if (prctl(PR_SET_MEMORY_MERGE, context->memory_ksm, 0, 0, 0) < 0) {
×
5046
                        if (ERRNO_IS_NOT_SUPPORTED(errno))
×
5047
                                log_debug_errno(errno, "KSM support not available, ignoring.");
×
5048
                        else {
5049
                                *exit_status = EXIT_KSM;
×
5050
                                return log_error_errno(errno, "Failed to set KSM: %m");
×
5051
                        }
5052
                }
5053

5054
#if ENABLE_UTMP
5055
        if (context->utmp_id) {
11,469✔
5056
                _cleanup_free_ char *username_alloc = NULL;
153✔
5057

5058
                if (!username && context->utmp_mode == EXEC_UTMP_USER) {
153✔
5059
                        username_alloc = uid_to_name(uid_is_valid(uid) ? uid : saved_uid);
1✔
5060
                        if (!username_alloc) {
1✔
5061
                                *exit_status = EXIT_USER;
×
5062
                                return log_oom();
×
5063
                        }
5064
                }
5065

5066
                const char *line = context->tty_path ?
×
5067
                        (path_startswith(context->tty_path, "/dev/") ?: context->tty_path) :
153✔
5068
                        NULL;
5069
                utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
153✔
5070
                                      line,
5071
                                      context->utmp_mode == EXEC_UTMP_INIT  ? INIT_PROCESS :
153✔
5072
                                      context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
7✔
5073
                                      USER_PROCESS,
5074
                                      username ?: username_alloc);
153✔
5075
        }
5076
#endif
5077

5078
        if (uid_is_valid(uid)) {
11,469✔
5079
                r = chown_terminal(STDIN_FILENO, uid);
2,639✔
5080
                if (r < 0) {
2,639✔
5081
                        *exit_status = EXIT_STDIN;
×
5082
                        return log_error_errno(r, "Failed to change ownership of terminal: %m");
×
5083
                }
5084
        }
5085

5086
        /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
5087
         * from it. */
5088
        needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
11,469✔
5089

5090
        if (params->cgroup_path) {
11,469✔
5091
                /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
5092
                 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
5093
                 * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
5094
                 * touch a single hierarchy too. */
5095

5096
                if (params->flags & EXEC_CGROUP_DELEGATE) {
11,469✔
5097
                        _cleanup_free_ char *p = NULL;
663✔
5098

5099
                        r = cg_set_access(params->cgroup_path, uid, gid);
663✔
5100
                        if (r < 0) {
663✔
5101
                                *exit_status = EXIT_CGROUP;
×
5102
                                return log_error_errno(r, "Failed to adjust control group access: %m");
×
5103
                        }
5104

5105
                        r = exec_params_get_cgroup_path(params, cgroup_context, &p);
663✔
5106
                        if (r < 0) {
663✔
5107
                                *exit_status = EXIT_CGROUP;
×
5108
                                return log_error_errno(r, "Failed to acquire cgroup path: %m");
×
5109
                        }
5110
                        if (r > 0) {
663✔
5111
                                r = cg_set_access_recursive(p, uid, gid);
323✔
5112
                                if (r < 0) {
323✔
5113
                                        *exit_status = EXIT_CGROUP;
×
5114
                                        return log_error_errno(r, "Failed to adjust control subgroup access: %m");
×
5115
                                }
5116
                        }
5117
                }
5118

5119
                if (is_pressure_supported() > 0) {
11,469✔
5120
                        if (cgroup_context_want_memory_pressure(cgroup_context)) {
11,469✔
5121
                                r = cg_get_path("memory", params->cgroup_path, "memory.pressure", &memory_pressure_path);
11,073✔
5122
                                if (r < 0) {
11,073✔
5123
                                        *exit_status = EXIT_MEMORY;
×
5124
                                        return log_oom();
×
5125
                                }
5126

5127
                                r = chmod_and_chown(memory_pressure_path, 0644, uid, gid);
11,073✔
5128
                                if (r < 0) {
11,073✔
5129
                                        log_full_errno(r == -ENOENT || ERRNO_IS_PRIVILEGE(r) ? LOG_DEBUG : LOG_WARNING, r,
2✔
5130
                                                       "Failed to adjust ownership of '%s', ignoring: %m", memory_pressure_path);
5131
                                        memory_pressure_path = mfree(memory_pressure_path);
1✔
5132
                                }
5133
                                /* First we use the current cgroup path to chmod and chown the memory pressure path, then pass the path relative
5134
                                 * to the cgroup namespace to environment variables and mounts. If chown/chmod fails, we should not pass memory
5135
                                 * pressure path environment variable or read-write mount to the unit. This is why we check if
5136
                                 * memory_pressure_path != NULL in the conditional below. */
5137
                                if (memory_pressure_path && needs_sandboxing && exec_needs_cgroup_namespace(context)) {
11,073✔
5138
                                        memory_pressure_path = mfree(memory_pressure_path);
11✔
5139
                                        r = cg_get_path("memory", "", "memory.pressure", &memory_pressure_path);
11✔
5140
                                        if (r < 0) {
11✔
5141
                                                *exit_status = EXIT_MEMORY;
×
5142
                                                return log_oom();
×
5143
                                        }
5144
                                }
5145
                        } else if (cgroup_context->memory_pressure_watch == CGROUP_PRESSURE_WATCH_NO) {
396✔
5146
                                memory_pressure_path = strdup("/dev/null"); /* /dev/null is explicit indicator for turning of memory pressure watch */
×
5147
                                if (!memory_pressure_path) {
×
5148
                                        *exit_status = EXIT_MEMORY;
×
5149
                                        return log_oom();
×
5150
                                }
5151
                        }
5152
                }
5153
        }
5154

5155
        needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
11,469✔
5156

5157
        for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
68,809✔
5158
                r = setup_exec_directory(context, params, uid, gid, dt, needs_mount_namespace, exit_status);
57,341✔
5159
                if (r < 0)
57,341✔
5160
                        return log_error_errno(r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
1✔
5161
        }
5162

5163
        r = exec_setup_credentials(context, params, params->unit_id, uid, gid);
11,468✔
5164
        if (r < 0) {
9,523✔
5165
                *exit_status = EXIT_CREDENTIALS;
×
5166
                return log_error_errno(r, "Failed to set up credentials: %m");
×
5167
        }
5168

5169
        r = build_environment(
9,523✔
5170
                        context,
5171
                        params,
5172
                        cgroup_context,
5173
                        n_fds,
5174
                        pwent_home,
5175
                        username,
5176
                        shell,
5177
                        journal_stream_dev,
5178
                        journal_stream_ino,
5179
                        memory_pressure_path,
5180
                        needs_sandboxing,
5181
                        &our_env);
5182
        if (r < 0) {
9,523✔
5183
                *exit_status = EXIT_MEMORY;
×
5184
                return log_oom();
×
5185
        }
5186

5187
        r = build_pass_environment(context, &pass_env);
9,523✔
5188
        if (r < 0) {
9,523✔
5189
                *exit_status = EXIT_MEMORY;
×
5190
                return log_oom();
×
5191
        }
5192

5193
        /* The $PATH variable is set to the default path in params->environment. However, this is overridden
5194
         * if user-specified fields have $PATH set. The intention is to also override $PATH if the unit does
5195
         * not specify PATH but the unit has ExecSearchPath. */
5196
        if (!strv_isempty(context->exec_search_path)) {
9,523✔
5197
                _cleanup_free_ char *joined = NULL;
×
5198

5199
                joined = strv_join(context->exec_search_path, ":");
×
5200
                if (!joined) {
×
5201
                        *exit_status = EXIT_MEMORY;
×
5202
                        return log_oom();
×
5203
                }
5204

5205
                r = strv_env_assign(&joined_exec_search_path, "PATH", joined);
×
5206
                if (r < 0) {
×
5207
                        *exit_status = EXIT_MEMORY;
×
5208
                        return log_oom();
×
5209
                }
5210
        }
5211

5212
        accum_env = strv_env_merge(params->environment,
9,523✔
5213
                                   our_env,
5214
                                   joined_exec_search_path,
5215
                                   pass_env,
5216
                                   context->environment,
5217
                                   params->files_env);
5218
        if (!accum_env) {
9,523✔
5219
                *exit_status = EXIT_MEMORY;
×
5220
                return log_oom();
×
5221
        }
5222
        accum_env = strv_env_clean(accum_env);
9,523✔
5223

5224
        (void) umask(context->umask);
9,523✔
5225

5226
        r = setup_keyring(context, params, uid, gid);
9,523✔
5227
        if (r < 0) {
9,523✔
5228
                *exit_status = EXIT_KEYRING;
×
5229
                return log_error_errno(r, "Failed to set up kernel keyring: %m");
×
5230
        }
5231

5232
        /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly
5233
         * excepted from either whole sandboxing or just setresuid() itself. */
5234
        needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
9,523✔
5235

5236
        uint64_t capability_ambient_set = context->capability_ambient_set;
9,523✔
5237

5238
        /* Check CAP_SYS_ADMIN before we enter user namespace to see if we can mount /proc even though it's masked. */
5239
        have_cap_sys_admin = have_effective_cap(CAP_SYS_ADMIN) > 0;
9,523✔
5240

5241
        if (needs_sandboxing) {
9,523✔
5242
                /* MAC enablement checks need to be done before a new mount ns is created, as they rely on
5243
                 * /sys being present. The actual MAC context application will happen later, as late as
5244
                 * possible, to avoid impacting our own code paths. */
5245

5246
#if HAVE_SELINUX
5247
                use_selinux = mac_selinux_use();
5248
#endif
5249
#if ENABLE_SMACK
5250
                use_smack = mac_smack_use();
9,523✔
5251
#endif
5252
#if HAVE_APPARMOR
5253
                if (mac_apparmor_use()) {
5254
                        r = dlopen_libapparmor();
5255
                        if (r < 0 && !ERRNO_IS_NEG_NOT_SUPPORTED(r))
5256
                                log_warning_errno(r, "Failed to load libapparmor, ignoring: %m");
5257
                        use_apparmor = r >= 0;
5258
                }
5259
#endif
5260
        }
5261

5262
        if (needs_sandboxing) {
9,523✔
5263
                int which_failed;
9,523✔
5264

5265
                /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
5266
                 * is set here. (See below.) */
5267

5268
                r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
9,523✔
5269
                if (r < 0) {
9,523✔
5270
                        *exit_status = EXIT_LIMITS;
×
5271
                        return log_error_errno(r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
×
5272
                }
5273
        }
5274

5275
        if (needs_setuid && context->pam_name && username) {
9,523✔
5276
                /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
5277
                 * wins here. (See above.) */
5278

5279
                /* All fds passed in the fds array will be closed in the pam child process. */
5280
                r = setup_pam(context, params, username, uid, gid, &accum_env, params->fds, n_fds, params->exec_fd);
386✔
5281
                if (r < 0) {
386✔
5282
                        *exit_status = EXIT_PAM;
×
5283
                        return log_error_errno(r, "Failed to set up PAM session: %m");
×
5284
                }
5285

5286
                /* PAM modules might have set some ambient caps. Query them here and merge them into
5287
                 * the caps we want to set in the end, so that we don't end up unsetting them. */
5288
                uint64_t ambient_after_pam;
386✔
5289
                r = capability_get_ambient(&ambient_after_pam);
386✔
5290
                if (r < 0) {
386✔
5291
                        *exit_status = EXIT_CAPABILITIES;
×
5292
                        return log_error_errno(r, "Failed to query ambient caps: %m");
×
5293
                }
5294

5295
                capability_ambient_set |= ambient_after_pam;
386✔
5296

5297
                ngids_after_pam = getgroups_alloc(&gids_after_pam);
386✔
5298
                if (ngids_after_pam < 0) {
386✔
5299
                        *exit_status = EXIT_GROUP;
×
5300
                        return log_error_errno(ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
×
5301
                }
5302
        }
5303

5304
        if (needs_sandboxing && !have_cap_sys_admin && exec_context_needs_cap_sys_admin(context)) {
9,523✔
5305
                /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
5306
                 * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
5307
                 * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
5308
                PrivateUsers pu = exec_context_get_effective_private_users(context, params);
27✔
5309
                if (pu == PRIVATE_USERS_NO)
27✔
5310
                        pu = PRIVATE_USERS_SELF;
23✔
5311

5312
                /* The kernel requires /proc/pid/setgroups be set to "deny" prior to writing /proc/pid/gid_map in
5313
                 * unprivileged user namespaces. */
5314
                r = setup_private_users(pu, saved_uid, saved_gid, uid, gid, /* allow_setgroups= */ false);
27✔
5315
                /* If it was requested explicitly and we can't set it up, fail early. Otherwise, continue and let
5316
                 * the actual requested operations fail (or silently continue). */
5317
                if (r < 0 && context->private_users != PRIVATE_USERS_NO) {
27✔
5318
                        *exit_status = EXIT_USER;
×
5319
                        return log_error_errno(r, "Failed to set up user namespacing for unprivileged user: %m");
×
5320
                }
5321
                if (r < 0)
×
5322
                        log_info_errno(r, "Failed to set up user namespacing for unprivileged user, ignoring: %m");
×
5323
                else {
5324
                        assert(r > 0);
27✔
5325
                        userns_set_up = true;
27✔
5326
                        log_debug("Set up unprivileged user namespace");
27✔
5327
                }
5328
        }
5329

5330
        /* Call setup_delegated_namespaces() the first time to unshare all non-delegated namespaces. */
5331
        r = setup_delegated_namespaces(
9,523✔
5332
                        context,
5333
                        params,
5334
                        runtime,
5335
                        /* delegate= */ false,
5336
                        memory_pressure_path,
5337
                        uid,
5338
                        gid,
5339
                        command,
5340
                        needs_sandboxing,
5341
                        have_cap_sys_admin,
5342
                        exit_status);
5343
        if (r < 0)
9,519✔
5344
                return r;
5345

5346
        /* Drop groups as early as possible.
5347
         * This needs to be done after PrivateDevices=yes setup as device nodes should be owned by the host's root.
5348
         * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
5349
        if (needs_setuid) {
9,503✔
5350
                _cleanup_free_ gid_t *gids_to_enforce = NULL;
9,503✔
5351
                int ngids_to_enforce;
9,503✔
5352

5353
                ngids_to_enforce = merge_gid_lists(gids,
9,503✔
5354
                                                   ngids,
5355
                                                   gids_after_pam,
5356
                                                   ngids_after_pam,
5357
                                                   &gids_to_enforce);
5358
                if (ngids_to_enforce < 0) {
9,503✔
5359
                        *exit_status = EXIT_GROUP;
×
5360
                        return log_error_errno(ngids_to_enforce, "Failed to merge group lists. Group membership might be incorrect: %m");
×
5361
                }
5362

5363
                r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
9,503✔
5364
                if (r < 0) {
9,503✔
5365
                        *exit_status = EXIT_GROUP;
1✔
5366
                        return log_error_errno(r, "Changing group credentials failed: %m");
1✔
5367
                }
5368
        }
5369

5370
        /* If the user namespace was not set up above, try to do it now.
5371
         * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
5372
         * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
5373
         * case of mount namespaces being less privileged when the mount point list is copied from a
5374
         * different user namespace). */
5375

5376
        if (needs_sandboxing && !userns_set_up) {
9,502✔
5377
                PrivateUsers pu = exec_context_get_effective_private_users(context, params);
9,480✔
5378

5379
                r = setup_private_users(pu, saved_uid, saved_gid, uid, gid,
9,480✔
5380
                                        /* allow_setgroups= */ pu == PRIVATE_USERS_FULL);
5381
                if (r < 0) {
9,480✔
5382
                        *exit_status = EXIT_USER;
×
5383
                        return log_error_errno(r, "Failed to set up user namespacing: %m");
×
5384
                }
5385
                if (r > 0)
9,480✔
5386
                        log_debug("Set up privileged user namespace");
23✔
5387
        }
5388

5389
        /* Call setup_delegated_namespaces() the second time to unshare all delegated namespaces. */
5390
        r = setup_delegated_namespaces(
9,502✔
5391
                        context,
5392
                        params,
5393
                        runtime,
5394
                        /* delegate= */ true,
5395
                        memory_pressure_path,
5396
                        uid,
5397
                        gid,
5398
                        command,
5399
                        needs_sandboxing,
5400
                        have_cap_sys_admin,
5401
                        exit_status);
5402
        if (r < 0)
9,500✔
5403
                return r;
5404

5405
        /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
5406
         * shall execute. */
5407

5408
        _cleanup_free_ char *executable = NULL;
5✔
5409
        _cleanup_close_ int executable_fd = -EBADF;
5✔
5410
        r = find_executable_full(command->path, /* root= */ NULL, context->exec_search_path, false, &executable, &executable_fd);
9,500✔
5411
        if (r < 0) {
9,500✔
5412
                *exit_status = EXIT_EXEC;
1✔
5413
                log_struct_errno(LOG_NOTICE, r,
1✔
5414
                                 LOG_MESSAGE_ID(SD_MESSAGE_SPAWN_FAILED_STR),
5415
                                 LOG_EXEC_MESSAGE(params,
5416
                                                  "Unable to locate executable '%s': %m",
5417
                                                  command->path),
5418
                                 LOG_ITEM("EXECUTABLE=%s", command->path));
5419
                /* If the error will be ignored by manager, tune down the log level here. Missing executable
5420
                 * is very much expected in this case. */
5421
                return r != -ENOMEM && FLAGS_SET(command->flags, EXEC_COMMAND_IGNORE_FAILURE) ? 1 : r;
1✔
5422
        }
5423

5424
        r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, &executable_fd);
9,499✔
5425
        if (r < 0) {
9,499✔
5426
                *exit_status = EXIT_FDS;
×
5427
                return log_error_errno(r, "Failed to collect shifted fd: %m");
×
5428
        }
5429

5430
#if HAVE_SELINUX
5431
        if (needs_sandboxing && use_selinux && params->selinux_context_net) {
5432
                int fd = -EBADF;
5433

5434
                if (socket_fd >= 0)
5435
                        fd = socket_fd;
5436
                else if (params->n_socket_fds == 1)
5437
                        /* If stdin is not connected to a socket but we are triggered by exactly one socket unit then we
5438
                         * use context from that fd to compute the label. */
5439
                        fd = params->fds[0];
5440

5441
                if (fd >= 0) {
5442
                        r = mac_selinux_get_child_mls_label(fd, executable, context->selinux_context, &mac_selinux_context_net);
5443
                        if (r < 0) {
5444
                                if (!context->selinux_context_ignore) {
5445
                                        *exit_status = EXIT_SELINUX_CONTEXT;
5446
                                        return log_error_errno(r, "Failed to determine SELinux context: %m");
5447
                                }
5448
                                log_debug_errno(r, "Failed to determine SELinux context, ignoring: %m");
5449
                        }
5450
                }
5451
        }
5452
#endif
5453

5454
        /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that
5455
         * we are more aggressive this time, since we don't need socket_fd and the netns and ipcns fds any
5456
         * more. We do keep exec_fd and handoff_timestamp_fd however, if we have it, since we need to keep
5457
         * them open until the final execve(). But first, close the remaining sockets in the context
5458
         * objects. */
5459

5460
        exec_runtime_close(runtime);
9,499✔
5461
        exec_params_close(params);
9,499✔
5462

5463
        r = close_all_fds(keep_fds, n_keep_fds);
9,499✔
5464
        if (r >= 0)
9,499✔
5465
                r = pack_fds(params->fds, n_fds);
9,499✔
5466
        if (r >= 0)
9,499✔
5467
                r = flag_fds(params->fds, n_socket_fds, n_fds, context->non_blocking);
9,499✔
5468
        if (r < 0) {
9,499✔
5469
                *exit_status = EXIT_FDS;
×
5470
                return log_error_errno(r, "Failed to adjust passed file descriptors: %m");
×
5471
        }
5472

5473
        /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
5474
         * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
5475
         * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
5476
         * came this far. */
5477

5478
        secure_bits = context->secure_bits;
9,499✔
5479

5480
        if (needs_sandboxing) {
9,499✔
5481
                uint64_t bset;
9,499✔
5482

5483
                /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested.
5484
                 * (Note this is placed after the general resource limit initialization, see above, in order
5485
                 * to take precedence.) */
5486
                if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
9,499✔
5487
                        if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
1,489✔
5488
                                *exit_status = EXIT_LIMITS;
×
5489
                                return log_error_errno(errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
×
5490
                        }
5491
                }
5492

5493
#if ENABLE_SMACK
5494
                /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
5495
                 * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
5496
                if (use_smack) {
9,499✔
5497
                        r = setup_smack(context, params, executable_fd);
×
5498
                        if (r < 0 && !context->smack_process_label_ignore) {
×
5499
                                *exit_status = EXIT_SMACK_PROCESS_LABEL;
×
5500
                                return log_error_errno(r, "Failed to set SMACK process label: %m");
×
5501
                        }
5502
                }
5503
#endif
5504

5505
                bset = context->capability_bounding_set;
9,499✔
5506

5507
#if HAVE_SECCOMP
5508
                /* If the service has any form of a seccomp filter and it allows dropping privileges, we'll
5509
                 * keep the needed privileges to apply it even if we're not root. */
5510
                if (needs_setuid &&
18,998✔
5511
                    uid_is_valid(uid) &&
11,488✔
5512
                    context_has_seccomp(context) &&
2,737✔
5513
                    seccomp_allows_drop_privileges(context)) {
748✔
5514
                        keep_seccomp_privileges = true;
748✔
5515

5516
                        if (prctl(PR_SET_KEEPCAPS, 1) < 0) {
748✔
5517
                                *exit_status = EXIT_USER;
×
5518
                                return log_error_errno(errno, "Failed to enable keep capabilities flag: %m");
×
5519
                        }
5520

5521
                        /* Save the current bounding set so we can restore it after applying the seccomp
5522
                         * filter */
5523
                        saved_bset = bset;
748✔
5524
                        bset |= (UINT64_C(1) << CAP_SYS_ADMIN) |
748✔
5525
                                (UINT64_C(1) << CAP_SETPCAP);
5526
                }
5527
#endif
5528

5529
                if (!cap_test_all(bset)) {
9,499✔
5530
                        r = capability_bounding_set_drop(bset, /* right_now= */ false);
1,615✔
5531
                        if (r < 0) {
1,615✔
5532
                                *exit_status = EXIT_CAPABILITIES;
×
5533
                                return log_error_errno(r, "Failed to drop capabilities: %m");
×
5534
                        }
5535
                }
5536

5537
                /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
5538
                 * keep-caps set.
5539
                 *
5540
                 * To be able to raise the ambient capabilities after setresuid() they have to be added to
5541
                 * the inherited set and keep caps has to be set (done in enforce_user()).  After setresuid()
5542
                 * the ambient capabilities can be raised as they are present in the permitted and
5543
                 * inheritable set. However it is possible that someone wants to set ambient capabilities
5544
                 * without changing the user, so we also set the ambient capabilities here.
5545
                 *
5546
                 * The requested ambient capabilities are raised in the inheritable set if the second
5547
                 * argument is true. */
5548
                if (capability_ambient_set != 0) {
9,499✔
5549
                        r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ true);
743✔
5550
                        if (r < 0) {
743✔
5551
                                *exit_status = EXIT_CAPABILITIES;
×
5552
                                return log_error_errno(r, "Failed to apply ambient capabilities (before UID change): %m");
×
5553
                        }
5554
                }
5555
        }
5556

5557
        /* chroot to root directory first, before we lose the ability to chroot */
5558
        r = apply_root_directory(context, params, runtime, needs_mount_namespace, exit_status);
9,499✔
5559
        if (r < 0)
9,499✔
5560
                return log_error_errno(r, "Chrooting to the requested root directory failed: %m");
×
5561

5562
        if (needs_setuid) {
9,499✔
5563
                if (uid_is_valid(uid)) {
9,499✔
5564
                        r = enforce_user(context, uid, capability_ambient_set);
1,989✔
5565
                        if (r < 0) {
1,989✔
5566
                                *exit_status = EXIT_USER;
×
5567
                                return log_error_errno(r, "Failed to change UID to " UID_FMT ": %m", uid);
×
5568
                        }
5569

5570
                        if (keep_seccomp_privileges) {
1,989✔
5571
                                if (!BIT_SET(capability_ambient_set, CAP_SETUID)) {
748✔
5572
                                        r = drop_capability(CAP_SETUID);
748✔
5573
                                        if (r < 0) {
748✔
5574
                                                *exit_status = EXIT_USER;
×
5575
                                                return log_error_errno(r, "Failed to drop CAP_SETUID: %m");
×
5576
                                        }
5577
                                }
5578

5579
                                r = keep_capability(CAP_SYS_ADMIN);
748✔
5580
                                if (r < 0) {
748✔
5581
                                        *exit_status = EXIT_USER;
×
5582
                                        return log_error_errno(r, "Failed to keep CAP_SYS_ADMIN: %m");
×
5583
                                }
5584

5585
                                r = keep_capability(CAP_SETPCAP);
748✔
5586
                                if (r < 0) {
748✔
5587
                                        *exit_status = EXIT_USER;
×
5588
                                        return log_error_errno(r, "Failed to keep CAP_SETPCAP: %m");
×
5589
                                }
5590
                        }
5591

5592
                        if (capability_ambient_set != 0) {
1,989✔
5593

5594
                                /* Raise the ambient capabilities after user change. */
5595
                                r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ false);
740✔
5596
                                if (r < 0) {
740✔
5597
                                        *exit_status = EXIT_CAPABILITIES;
×
5598
                                        return log_error_errno(r, "Failed to apply ambient capabilities (after UID change): %m");
×
5599
                                }
5600
                        }
5601
                }
5602
        }
5603

5604
        /* Apply working directory here, because the working directory might be on NFS and only the user
5605
         * running this service might have the correct privilege to change to the working directory. Also, it
5606
         * is absolutely 💣 crucial 💣 we applied all mount namespacing rearrangements before this, so that
5607
         * the cwd cannot be used to pin directories outside of the sandbox. */
5608
        r = apply_working_directory(context, params, runtime, pwent_home, accum_env);
9,499✔
5609
        if (r < 0) {
9,499✔
5610
                *exit_status = EXIT_CHDIR;
1✔
5611
                return log_error_errno(r, "Changing to the requested working directory failed: %m");
1✔
5612
        }
5613

5614
        if (needs_sandboxing) {
9,498✔
5615
                /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
5616
                 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
5617
                 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
5618
                 * are restricted. */
5619

5620
#if HAVE_SELINUX
5621
                if (use_selinux) {
5622
                        char *exec_context = mac_selinux_context_net ?: context->selinux_context;
5623

5624
                        if (exec_context) {
5625
                                r = setexeccon(exec_context);
5626
                                if (r < 0) {
5627
                                        if (!context->selinux_context_ignore) {
5628
                                                *exit_status = EXIT_SELINUX_CONTEXT;
5629
                                                return log_error_errno(r, "Failed to change SELinux context to %s: %m", exec_context);
5630
                                        }
5631
                                        log_debug_errno(r, "Failed to change SELinux context to %s, ignoring: %m", exec_context);
5632
                                }
5633
                        }
5634
                }
5635
#endif
5636

5637
#if HAVE_APPARMOR
5638
                if (use_apparmor && context->apparmor_profile) {
5639
                        r = ASSERT_PTR(sym_aa_change_onexec)(context->apparmor_profile);
5640
                        if (r < 0 && !context->apparmor_profile_ignore) {
5641
                                *exit_status = EXIT_APPARMOR_PROFILE;
5642
                                return log_error_errno(errno, "Failed to prepare AppArmor profile change to %s: %m",
5643
                                                       context->apparmor_profile);
5644
                        }
5645
                }
5646
#endif
5647

5648
                /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential
5649
                 * EPERMs we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits
5650
                 * requires CAP_SETPCAP. */
5651
                if (prctl(PR_GET_SECUREBITS) != secure_bits) {
9,498✔
5652
                        /* CAP_SETPCAP is required to set securebits. This capability is raised into the
5653
                         * effective set here.
5654
                         *
5655
                         * The effective set is overwritten during execve() with the following values:
5656
                         *
5657
                         * - ambient set (for non-root processes)
5658
                         *
5659
                         * - (inheritable | bounding) set (for root processes)
5660
                         *
5661
                         * Hence there is no security impact to raise it in the effective set before execve
5662
                         */
5663
                        r = capability_gain_cap_setpcap(/* ret_before_caps = */ NULL);
800✔
5664
                        if (r < 0) {
800✔
5665
                                *exit_status = EXIT_CAPABILITIES;
×
5666
                                return log_error_errno(r, "Failed to gain CAP_SETPCAP for setting secure bits");
×
5667
                        }
5668
                        if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
800✔
5669
                                *exit_status = EXIT_SECUREBITS;
×
5670
                                return log_error_errno(errno, "Failed to set process secure bits: %m");
×
5671
                        }
5672
                }
5673

5674
                if (context_has_no_new_privileges(context))
9,498✔
5675
                        if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
1,413✔
5676
                                *exit_status = EXIT_NO_NEW_PRIVILEGES;
×
5677
                                return log_error_errno(errno, "Failed to disable new privileges: %m");
×
5678
                        }
5679

5680
#if HAVE_SECCOMP
5681
                r = apply_address_families(context, params);
9,498✔
5682
                if (r < 0) {
9,498✔
5683
                        *exit_status = EXIT_ADDRESS_FAMILIES;
×
5684
                        return log_error_errno(r, "Failed to restrict address families: %m");
×
5685
                }
5686

5687
                r = apply_memory_deny_write_execute(context, params);
9,498✔
5688
                if (r < 0) {
9,498✔
5689
                        *exit_status = EXIT_SECCOMP;
×
5690
                        return log_error_errno(r, "Failed to disable writing to executable memory: %m");
×
5691
                }
5692

5693
                r = apply_restrict_realtime(context, params);
9,498✔
5694
                if (r < 0) {
9,498✔
5695
                        *exit_status = EXIT_SECCOMP;
×
5696
                        return log_error_errno(r, "Failed to apply realtime restrictions: %m");
×
5697
                }
5698

5699
                r = apply_restrict_suid_sgid(context, params);
9,498✔
5700
                if (r < 0) {
9,498✔
5701
                        *exit_status = EXIT_SECCOMP;
×
5702
                        return log_error_errno(r, "Failed to apply SUID/SGID restrictions: %m");
×
5703
                }
5704

5705
                r = apply_restrict_namespaces(context, params);
9,498✔
5706
                if (r < 0) {
9,498✔
5707
                        *exit_status = EXIT_SECCOMP;
×
5708
                        return log_error_errno(r, "Failed to apply namespace restrictions: %m");
×
5709
                }
5710

5711
                r = apply_protect_sysctl(context, params);
9,498✔
5712
                if (r < 0) {
9,498✔
5713
                        *exit_status = EXIT_SECCOMP;
×
5714
                        return log_error_errno(r, "Failed to apply sysctl restrictions: %m");
×
5715
                }
5716

5717
                r = apply_protect_kernel_modules(context, params);
9,498✔
5718
                if (r < 0) {
9,498✔
5719
                        *exit_status = EXIT_SECCOMP;
×
5720
                        return log_error_errno(r, "Failed to apply module loading restrictions: %m");
×
5721
                }
5722

5723
                r = apply_protect_kernel_logs(context, params);
9,498✔
5724
                if (r < 0) {
9,498✔
5725
                        *exit_status = EXIT_SECCOMP;
×
5726
                        return log_error_errno(r, "Failed to apply kernel log restrictions: %m");
×
5727
                }
5728

5729
                r = apply_protect_clock(context, params);
9,498✔
5730
                if (r < 0) {
9,498✔
5731
                        *exit_status = EXIT_SECCOMP;
×
5732
                        return log_error_errno(r, "Failed to apply clock restrictions: %m");
×
5733
                }
5734

5735
                r = apply_private_devices(context, params);
9,498✔
5736
                if (r < 0) {
9,498✔
5737
                        *exit_status = EXIT_SECCOMP;
×
5738
                        return log_error_errno(r, "Failed to set up private devices: %m");
×
5739
                }
5740

5741
                r = apply_syscall_archs(context, params);
9,498✔
5742
                if (r < 0) {
9,498✔
5743
                        *exit_status = EXIT_SECCOMP;
×
5744
                        return log_error_errno(r, "Failed to apply syscall architecture restrictions: %m");
×
5745
                }
5746

5747
                r = apply_lock_personality(context, params);
9,498✔
5748
                if (r < 0) {
9,498✔
5749
                        *exit_status = EXIT_SECCOMP;
×
5750
                        return log_error_errno(r, "Failed to lock personalities: %m");
×
5751
                }
5752

5753
                r = apply_syscall_log(context, params);
9,498✔
5754
                if (r < 0) {
9,498✔
5755
                        *exit_status = EXIT_SECCOMP;
×
5756
                        return log_error_errno(r, "Failed to apply system call log filters: %m");
×
5757
                }
5758
#endif
5759

5760
#if HAVE_LIBBPF
5761
                r = apply_restrict_filesystems(context, params);
9,498✔
5762
                if (r < 0) {
9,498✔
5763
                        *exit_status = EXIT_BPF;
×
5764
                        return log_error_errno(r, "Failed to restrict filesystems: %m");
×
5765
                }
5766
#endif
5767

5768
#if HAVE_SECCOMP
5769
                /* This really should remain as close to the execve() as possible, to make sure our own code is affected
5770
                 * by the filter as little as possible. */
5771
                r = apply_syscall_filter(context, params);
9,498✔
5772
                if (r < 0) {
9,498✔
5773
                        *exit_status = EXIT_SECCOMP;
×
5774
                        return log_error_errno(r, "Failed to apply system call filters: %m");
×
5775
                }
5776

5777
                if (keep_seccomp_privileges) {
9,498✔
5778
                        /* Restore the capability bounding set with what's expected from the service + the
5779
                         * ambient capabilities hack */
5780
                        if (!cap_test_all(saved_bset)) {
747✔
5781
                                r = capability_bounding_set_drop(saved_bset, /* right_now= */ false);
712✔
5782
                                if (r < 0) {
712✔
5783
                                        *exit_status = EXIT_CAPABILITIES;
×
5784
                                        return log_error_errno(r, "Failed to drop bset capabilities: %m");
×
5785
                                }
5786
                        }
5787

5788
                        /* Only drop CAP_SYS_ADMIN if it's not in the bounding set, otherwise we'll break
5789
                         * applications that use it. */
5790
                        if (!BIT_SET(saved_bset, CAP_SYS_ADMIN)) {
747✔
5791
                                r = drop_capability(CAP_SYS_ADMIN);
279✔
5792
                                if (r < 0) {
279✔
5793
                                        *exit_status = EXIT_USER;
×
5794
                                        return log_error_errno(r, "Failed to drop CAP_SYS_ADMIN: %m");
×
5795
                                }
5796
                        }
5797

5798
                        /* Only drop CAP_SETPCAP if it's not in the bounding set, otherwise we'll break
5799
                         * applications that use it. */
5800
                        if (!BIT_SET(saved_bset, CAP_SETPCAP)) {
747✔
5801
                                r = drop_capability(CAP_SETPCAP);
531✔
5802
                                if (r < 0) {
531✔
5803
                                        *exit_status = EXIT_USER;
×
5804
                                        return log_error_errno(r, "Failed to drop CAP_SETPCAP: %m");
×
5805
                                }
5806
                        }
5807

5808
                        if (prctl(PR_SET_KEEPCAPS, 0) < 0) {
747✔
5809
                                *exit_status = EXIT_USER;
×
5810
                                return log_error_errno(errno, "Failed to drop keep capabilities flag: %m");
×
5811
                        }
5812
                }
5813
#endif
5814

5815
        }
5816

5817
        if (!strv_isempty(context->unset_environment)) {
9,498✔
5818
                char **ee = NULL;
268✔
5819

5820
                ee = strv_env_delete(accum_env, 1, context->unset_environment);
268✔
5821
                if (!ee) {
268✔
5822
                        *exit_status = EXIT_MEMORY;
×
5823
                        return log_oom();
5✔
5824
                }
5825

5826
                strv_free_and_replace(accum_env, ee);
268✔
5827
        }
5828

5829
        if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
9,498✔
5830
                _cleanup_strv_free_ char **unset_variables = NULL, **bad_variables = NULL;
9,338✔
5831

5832
                r = replace_env_argv(command->argv, accum_env, &replaced_argv, &unset_variables, &bad_variables);
9,338✔
5833
                if (r < 0) {
9,338✔
5834
                        *exit_status = EXIT_MEMORY;
×
5835
                        return log_error_errno(r, "Failed to replace environment variables: %m");
×
5836
                }
5837
                final_argv = replaced_argv;
9,338✔
5838

5839
                if (!strv_isempty(unset_variables)) {
9,338✔
5840
                        _cleanup_free_ char *ju = strv_join(unset_variables, ", ");
10✔
5841
                        log_warning("Referenced but unset environment variable evaluates to an empty string: %s", strna(ju));
5✔
5842
                }
5843

5844
                if (!strv_isempty(bad_variables)) {
9,338✔
5845
                        _cleanup_free_ char *jb = strv_join(bad_variables, ", ");
×
5846
                        log_warning("Invalid environment variable name evaluates to an empty string: %s", strna(jb));
×
5847
                }
5848
        } else
5849
                final_argv = command->argv;
160✔
5850

5851
        log_command_line(context, params, "Executing", executable, final_argv);
9,498✔
5852

5853
        /* We have finished with all our initializations. Let's now let the manager know that. From this
5854
         * point on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
5855

5856
        r = exec_fd_mark_hot(context, params, /* hot= */ true, exit_status);
9,498✔
5857
        if (r < 0)
9,498✔
5858
                return r;
5859

5860
        /* As the last thing before the execve(), let's send the handoff timestamp */
5861
        r = send_handoff_timestamp(context, params, exit_status);
9,498✔
5862
        if (r < 0) {
9,498✔
5863
                /* If this handoff timestamp failed, let's undo the marking as hot */
5864
                (void) exec_fd_mark_hot(context, params, /* hot= */ false, /* reterr_exit_status= */ NULL);
×
5865
                return r;
5866
        }
5867

5868
        /* NB: we leave executable_fd, exec_fd, handoff_timestamp_fd open here. This is safe, because they
5869
         * have O_CLOEXEC set, and the execve() below will thus automatically close them. In fact, for
5870
         * exec_fd this is pretty much the whole raison d'etre. */
5871

5872
        r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
9,498✔
5873

5874
        /* The execve() failed, let's undo the marking as hot */
5875
        (void) exec_fd_mark_hot(context, params, /* hot= */ false, /* reterr_exit_status= */ NULL);
3✔
5876

5877
        *exit_status = EXIT_EXEC;
3✔
5878
        return log_error_errno(r, "Failed to execute %s: %m", executable);
3✔
5879
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc