• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

systemd / systemd / 15889795658

25 Jun 2025 05:44PM UTC coverage: 72.107% (+0.03%) from 72.081%
15889795658

push

github

yuwata
journal-gatewayd: fix handling of num_skip pointing beyond the last entry

When `num_skip` is supplied in the `Range` header, journal-gatewayd
always returns the very last record even though it should have been
skipped. This is because `sd_journal_next_skip` always returns a
non-zero value on the first call, leading to one iteration of
`request_reader_entries` returning the last record.

To avoid this unexpected behavior, check that the number of lines we
have actually skipped is not lower than the requested skip value. If it
is, there are lines which should not be returned yet: decrement the
n_skip counter and return from the function, closing the stream if the
follow flag is not set.

Fixes #37954

6 of 8 new or added lines in 1 file covered. (75.0%)

168 existing lines in 33 files now uncovered.

300532 of 416788 relevant lines covered (72.11%)

712881.46 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

72.38
/src/core/exec-invoke.c
1
/* SPDX-License-Identifier: LGPL-2.1-or-later */
2

3
#include <grp.h>
4
#include <linux/ioprio.h>
5
#include <linux/prctl.h>
6
#include <linux/sched.h>
7
#include <linux/securebits.h>
8
#include <poll.h>
9
#include <sys/eventfd.h>
10
#include <sys/ioctl.h>
11
#include <sys/mount.h>
12
#include <sys/prctl.h>
13

14
#if HAVE_PAM
15
#include <security/pam_appl.h>
16
#endif
17

18
#include "sd-messages.h"
19

20
#include "apparmor-util.h"
21
#include "argv-util.h"
22
#include "ask-password-api.h"
23
#include "barrier.h"
24
#include "bitfield.h"
25
#include "bpf-dlopen.h"
26
#include "bpf-restrict-fs.h"
27
#include "btrfs-util.h"
28
#include "capability-util.h"
29
#include "cgroup-setup.h"
30
#include "cgroup.h"
31
#include "chase.h"
32
#include "chown-recursive.h"
33
#include "constants.h"
34
#include "copy.h"
35
#include "coredump-util.h"
36
#include "dissect-image.h"
37
#include "dynamic-user.h"
38
#include "env-util.h"
39
#include "escape.h"
40
#include "exec-credential.h"
41
#include "exec-invoke.h"
42
#include "execute.h"
43
#include "exit-status.h"
44
#include "fd-util.h"
45
#include "fs-util.h"
46
#include "hexdecoct.h"
47
#include "hostname-setup.h"
48
#include "image-policy.h"
49
#include "io-util.h"
50
#include "iovec-util.h"
51
#include "journal-send.h"
52
#include "manager.h"
53
#include "memfd-util.h"
54
#include "missing_sched.h"
55
#include "missing_syscall.h"
56
#include "mkdir-label.h"
57
#include "mount-util.h"
58
#include "namespace-util.h"
59
#include "nsflags.h"
60
#include "open-file.h"
61
#include "osc-context.h"
62
#include "path-util.h"
63
#include "pidref.h"
64
#include "proc-cmdline.h"
65
#include "process-util.h"
66
#include "psi-util.h"
67
#include "rlimit-util.h"
68
#include "seccomp-util.h"
69
#include "selinux-util.h"
70
#include "set.h"
71
#include "signal-util.h"
72
#include "smack-util.h"
73
#include "socket-util.h"
74
#include "stat-util.h"
75
#include "string-table.h"
76
#include "strv.h"
77
#include "terminal-util.h"
78
#include "user-util.h"
79
#include "utmp-wtmp.h"
80
#include "vpick.h"
81

82
#define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
83
#define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
84

85
#define SNDBUF_SIZE (8*1024*1024)
86

87
/* Adjusts the file flags of the fds in 'fds': the first n_socket_fds entries get O_NONBLOCK set or
 * cleared according to 'nonblock', and FD_CLOEXEC is dropped from all n_fds entries.
 *
 * Returns 0 on success, or the first negative error returned by a helper. */
static int flag_fds(
                const int fds[],
                size_t n_socket_fds,
                size_t n_fds,
                bool nonblock) {

        assert(fds || n_fds == 0);

        for (size_t idx = 0; idx < n_fds; idx++) {
                int fd = fds[idx], r;

                /* O_NONBLOCK only applies to socket activation fds, i.e. the leading entries. */
                if (idx < n_socket_fds) {
                        r = fd_nonblock(fd, nonblock);
                        if (r < 0)
                                return r;
                }

                /* FD_CLOEXEC is dropped unconditionally: these fds are meant to be passed on to
                 * our children. */
                r = fd_cloexec(fd, false);
                if (r < 0)
                        return r;
        }

        return 0;
}
119

120
/* Returns true if the specified input setting is one of the three TTY variants. */
static bool is_terminal_input(ExecInput i) {
        return i == EXEC_INPUT_TTY ||
               i == EXEC_INPUT_TTY_FORCE ||
               i == EXEC_INPUT_TTY_FAIL;
}
126

127
/* Returns true if the specified output setting is the TTY itself or one of the "...and console"
 * variants. */
static bool is_terminal_output(ExecOutput o) {
        return o == EXEC_OUTPUT_TTY ||
               o == EXEC_OUTPUT_KMSG_AND_CONSOLE ||
               o == EXEC_OUTPUT_JOURNAL_AND_CONSOLE;
}
133

134
/* Returns true if the specified output setting is one of the two kmsg variants. */
static bool is_kmsg_output(ExecOutput o) {
        return o == EXEC_OUTPUT_KMSG ||
               o == EXEC_OUTPUT_KMSG_AND_CONSOLE;
}
139

140
/* Opens /dev/null with the given open(2) flags (plus O_NOCTTY) and moves the resulting fd to fd
 * number 'nfd'. Returns the result of move_fd(), or a negative errno if open() fails. */
static int open_null_as(int flags, int nfd) {
        assert(nfd >= 0);

        int null_fd = open("/dev/null", flags|O_NOCTTY);
        if (null_fd < 0)
                return -errno;

        return move_fd(null_fd, nfd, false);
}
151

152
static int connect_journal_socket(
10,628✔
153
                int fd,
154
                const char *log_namespace,
155
                uid_t uid,
156
                gid_t gid) {
157

158
        uid_t olduid = UID_INVALID;
10,628✔
159
        gid_t oldgid = GID_INVALID;
10,628✔
160
        const char *j;
10,628✔
161
        int r;
10,628✔
162

163
        assert(fd >= 0);
10,628✔
164

165
        j = journal_stream_path(log_namespace);
10,640✔
166
        if (!j)
2✔
167
                return -EINVAL;
×
168

169
        if (gid_is_valid(gid)) {
10,628✔
170
                oldgid = getgid();
2,419✔
171

172
                if (setegid(gid) < 0)
2,419✔
173
                        return -errno;
×
174
        }
175

176
        if (uid_is_valid(uid)) {
10,628✔
177
                olduid = getuid();
2,416✔
178

179
                if (seteuid(uid) < 0) {
2,416✔
180
                        r = -errno;
×
181
                        goto restore_gid;
×
182
                }
183
        }
184

185
        r = connect_unix_path(fd, AT_FDCWD, j);
10,628✔
186

187
        /* If we fail to restore the uid or gid, things will likely fail later on. This should only happen if
188
           an LSM interferes. */
189

190
        if (uid_is_valid(uid))
10,628✔
191
                (void) seteuid(olduid);
2,416✔
192

193
 restore_gid:
8,212✔
194
        if (gid_is_valid(gid))
10,628✔
195
                (void) setegid(oldgid);
2,419✔
196

197
        return r;
198
}
199

200
static int connect_logger_as(
10,628✔
201
                const ExecContext *context,
202
                const ExecParameters *params,
203
                ExecOutput output,
204
                const char *ident,
205
                int nfd,
206
                uid_t uid,
207
                gid_t gid) {
208

209
        _cleanup_close_ int fd = -EBADF;
10,628✔
210
        int r;
10,628✔
211

212
        assert(context);
10,628✔
213
        assert(params);
10,628✔
214
        assert(output < _EXEC_OUTPUT_MAX);
10,628✔
215
        assert(ident);
10,628✔
216
        assert(nfd >= 0);
10,628✔
217

218
        fd = socket(AF_UNIX, SOCK_STREAM, 0);
10,628✔
219
        if (fd < 0)
10,628✔
220
                return -errno;
×
221

222
        r = connect_journal_socket(fd, context->log_namespace, uid, gid);
10,628✔
223
        if (r < 0)
10,628✔
224
                return r;
225

226
        if (shutdown(fd, SHUT_RD) < 0)
10,628✔
227
                return -errno;
×
228

229
        (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
10,628✔
230

231
        if (dprintf(fd,
20,504✔
232
                "%s\n"
233
                "%s\n"
234
                "%i\n"
235
                "%i\n"
236
                "%i\n"
237
                "%i\n"
238
                "%i\n",
239
                context->syslog_identifier ?: ident,
10,628✔
240
                params->flags & EXEC_PASS_LOG_UNIT ? params->unit_id : "",
10,628✔
241
                context->syslog_priority,
10,628✔
242
                !!context->syslog_level_prefix,
10,628✔
243
                false,
244
                is_kmsg_output(output),
10,628✔
245
                is_terminal_output(output)) < 0)
10,628✔
246
                return -errno;
×
247

248
        return move_fd(TAKE_FD(fd), nfd, false);
10,628✔
249
}
250

251
/* Opens the terminal at 'path' with the given flags (plus O_NOCTTY) and moves the resulting fd to
 * fd number 'nfd'. Returns the result of move_fd(), or the negative error from open_terminal(). */
static int open_terminal_as(const char *path, int flags, int nfd) {
        assert(path);
        assert(nfd >= 0);

        int tty_fd = open_terminal(path, flags | O_NOCTTY);
        if (tty_fd < 0)
                return tty_fd;

        return move_fd(tty_fd, nfd, false);
}
263

264
static int acquire_path(const char *path, int flags, mode_t mode) {
11✔
265
        _cleanup_close_ int fd = -EBADF;
11✔
266
        int r;
11✔
267

268
        assert(path);
11✔
269

270
        if (IN_SET(flags & O_ACCMODE_STRICT, O_WRONLY, O_RDWR))
11✔
271
                flags |= O_CREAT;
11✔
272

273
        fd = open(path, flags|O_NOCTTY, mode);
11✔
274
        if (fd >= 0)
11✔
275
                return TAKE_FD(fd);
11✔
276

277
        if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
×
278
                return -errno;
×
279

280
        /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
281

282
        fd = socket(AF_UNIX, SOCK_STREAM, 0);
×
283
        if (fd < 0)
×
284
                return -errno;
×
285

286
        r = connect_unix_path(fd, AT_FDCWD, path);
×
287
        if (IN_SET(r, -ENOTSOCK, -EINVAL))
×
288
                /* Propagate initial error if we get ENOTSOCK or EINVAL, i.e. we have indication that this
289
                 * wasn't an AF_UNIX socket after all */
290
                return -ENXIO;
291
        if (r < 0)
×
292
                return r;
293

294
        if ((flags & O_ACCMODE_STRICT) == O_RDONLY)
×
295
                r = shutdown(fd, SHUT_WR);
×
296
        else if ((flags & O_ACCMODE_STRICT) == O_WRONLY)
×
297
                r = shutdown(fd, SHUT_RD);
×
298
        else
299
                r = 0;
300
        if (r < 0)
×
301
                return -errno;
×
302

303
        return TAKE_FD(fd);
304
}
305

306
static int fixup_input(
33,909✔
307
                const ExecContext *context,
308
                int socket_fd,
309
                bool apply_tty_stdin) {
310

311
        ExecInput std_input;
33,909✔
312

313
        assert(context);
33,909✔
314

315
        std_input = context->std_input;
33,909✔
316

317
        if (is_terminal_input(std_input) && !apply_tty_stdin)
33,909✔
318
                return EXEC_INPUT_NULL;
319

320
        if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
33,909✔
321
                return EXEC_INPUT_NULL;
322

323
        if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
33,909✔
324
                return EXEC_INPUT_NULL;
×
325

326
        return std_input;
327
}
328

329
/* Returns the effective output setting: EXEC_OUTPUT_SOCKET is turned into EXEC_OUTPUT_INHERIT if no
 * socket fd is available, everything else is passed through unmodified. */
static int fixup_output(ExecOutput output, int socket_fd) {
        bool socket_missing = output == EXEC_OUTPUT_SOCKET && socket_fd < 0;

        return socket_missing ? EXEC_OUTPUT_INHERIT : output;
}
336

337
static int setup_input(
11,853✔
338
                const ExecContext *context,
339
                const ExecParameters *params,
340
                int socket_fd,
341
                const int named_iofds[static 3]) {
342

343
        ExecInput i;
11,853✔
344
        int r;
11,853✔
345

346
        assert(context);
11,853✔
347
        assert(params);
11,853✔
348
        assert(named_iofds);
11,853✔
349

350
        if (params->stdin_fd >= 0) {
11,853✔
351
                if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
550✔
352
                        return -errno;
×
353

354
                /* Try to make this our controlling tty, if it is a tty */
355
                if (isatty_safe(STDIN_FILENO) && ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE) < 0)
550✔
356
                        log_debug_errno(errno, "Failed to make standard input TTY our controlling terminal: %m");
2✔
357

358
                return STDIN_FILENO;
550✔
359
        }
360

361
        i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
11,303✔
362

363
        switch (i) {
11,303✔
364

365
        case EXEC_INPUT_NULL:
10,935✔
366
                return open_null_as(O_RDONLY, STDIN_FILENO);
10,935✔
367

368
        case EXEC_INPUT_TTY:
356✔
369
        case EXEC_INPUT_TTY_FORCE:
370
        case EXEC_INPUT_TTY_FAIL: {
371
                _cleanup_close_ int tty_fd = -EBADF;
356✔
372
                _cleanup_free_ char *resolved = NULL;
356✔
373
                const char *tty_path;
356✔
374

375
                tty_path = ASSERT_PTR(exec_context_tty_path(context));
356✔
376

377
                if (tty_is_console(tty_path)) {
356✔
378
                        r = resolve_dev_console(&resolved);
270✔
379
                        if (r < 0)
270✔
380
                                log_debug_errno(r, "Failed to resolve /dev/console, ignoring: %m");
×
381
                        else {
382
                                log_debug("Resolved /dev/console to %s", resolved);
270✔
383
                                tty_path = resolved;
270✔
384
                        }
385
                }
386

387
                tty_fd = acquire_terminal(tty_path,
712✔
388
                                          i == EXEC_INPUT_TTY_FAIL  ? ACQUIRE_TERMINAL_TRY :
356✔
389
                                          i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
390
                                                                      ACQUIRE_TERMINAL_WAIT,
391
                                          USEC_INFINITY);
392
                if (tty_fd < 0)
356✔
393
                        return tty_fd;
394

395
                r = move_fd(tty_fd, STDIN_FILENO, /* cloexec= */ false);
356✔
396
                if (r < 0)
356✔
397
                        return r;
×
398

399
                TAKE_FD(tty_fd);
400
                return r;
401
        }
402

403
        case EXEC_INPUT_SOCKET:
11✔
404
                assert(socket_fd >= 0);
11✔
405

406
                return RET_NERRNO(dup2(socket_fd, STDIN_FILENO));
11✔
407

408
        case EXEC_INPUT_NAMED_FD:
×
409
                assert(named_iofds[STDIN_FILENO] >= 0);
×
410

411
                (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
×
412
                return RET_NERRNO(dup2(named_iofds[STDIN_FILENO], STDIN_FILENO));
11,853✔
413

414
        case EXEC_INPUT_DATA: {
1✔
415
                int fd;
1✔
416

417
                fd = memfd_new_and_seal("exec-input", context->stdin_data, context->stdin_data_size);
1✔
418
                if (fd < 0)
1✔
419
                        return fd;
420

421
                return move_fd(fd, STDIN_FILENO, false);
1✔
422
        }
423

424
        case EXEC_INPUT_FILE: {
×
425
                bool rw;
×
426
                int fd;
×
427

428
                assert(context->stdio_file[STDIN_FILENO]);
×
429

430
                rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
×
431
                        (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
×
432

433
                fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
×
434
                if (fd < 0)
×
435
                        return fd;
436

437
                return move_fd(fd, STDIN_FILENO, false);
×
438
        }
439

440
        default:
×
441
                assert_not_reached();
×
442
        }
443
}
444

445
static bool can_inherit_stderr_from_stdout(
11,303✔
446
                const ExecContext *context,
447
                ExecOutput o,
448
                ExecOutput e) {
449

450
        assert(context);
11,303✔
451

452
        /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
453
         * stderr fd */
454

455
        if (e == EXEC_OUTPUT_INHERIT)
11,303✔
456
                return true;
457
        if (e != o)
415✔
458
                return false;
459

460
        if (e == EXEC_OUTPUT_NAMED_FD)
412✔
461
                return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
×
462

463
        if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND, EXEC_OUTPUT_FILE_TRUNCATE))
412✔
464
                return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
4✔
465

466
        return true;
467
}
468

469
static int setup_output(
23,706✔
470
                const ExecContext *context,
471
                const ExecParameters *params,
472
                int fileno,
473
                int socket_fd,
474
                const int named_iofds[static 3],
475
                const char *ident,
476
                uid_t uid,
477
                gid_t gid,
478
                dev_t *journal_stream_dev,
479
                ino_t *journal_stream_ino) {
480

481
        ExecOutput o;
23,706✔
482
        ExecInput i;
23,706✔
483
        int r;
23,706✔
484

485
        assert(context);
23,706✔
486
        assert(params);
23,706✔
487
        assert(ident);
23,706✔
488
        assert(journal_stream_dev);
23,706✔
489
        assert(journal_stream_ino);
23,706✔
490

491
        if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
23,706✔
492

493
                if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
550✔
494
                        return -errno;
×
495

496
                return STDOUT_FILENO;
497
        }
498

499
        if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
23,156✔
500
                if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
550✔
501
                        return -errno;
×
502

503
                return STDERR_FILENO;
504
        }
505

506
        i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
22,606✔
507
        o = fixup_output(context->std_output, socket_fd);
22,606✔
508

509
        // FIXME: we probably should spend some time here to verify that if we inherit an fd from stdin
510
        // (possibly indirect via inheritance from stdout) it is actually opened for write!
511

512
        if (fileno == STDERR_FILENO) {
22,606✔
513
                ExecOutput e;
11,303✔
514
                e = fixup_output(context->std_error, socket_fd);
11,303✔
515

516
                /* This expects the input and output are already set up */
517

518
                /* Don't change the stderr file descriptor if we inherit all
519
                 * the way and are not on a tty */
520
                if (e == EXEC_OUTPUT_INHERIT &&
11,303✔
521
                    o == EXEC_OUTPUT_INHERIT &&
8✔
522
                    i == EXEC_INPUT_NULL &&
×
523
                    !is_terminal_input(context->std_input) &&
×
524
                    getppid() != 1)
×
525
                        return fileno;
526

527
                /* Duplicate from stdout if possible */
528
                if (can_inherit_stderr_from_stdout(context, o, e))
11,303✔
529
                        return RET_NERRNO(dup2(STDOUT_FILENO, fileno));
11,296✔
530

531
                o = e;
532

533
        } else if (o == EXEC_OUTPUT_INHERIT) {
11,303✔
534
                /* If input got downgraded, inherit the original value */
535
                if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
8✔
536
                        return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
×
537

538
                /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
539
                if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
8✔
540
                        return RET_NERRNO(dup2(STDIN_FILENO, fileno));
8✔
541

542
                /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
543
                if (getppid() != 1)
×
544
                        return fileno;
545

546
                /* We need to open /dev/null here anew, to get the right access mode. */
547
                return open_null_as(O_WRONLY, fileno);
×
548
        }
549

550
        switch (o) {
11,302✔
551

552
        case EXEC_OUTPUT_NULL:
266✔
553
                return open_null_as(O_WRONLY, fileno);
266✔
554

555
        case EXEC_OUTPUT_TTY:
388✔
556
                if (is_terminal_input(i))
388✔
557
                        return RET_NERRNO(dup2(STDIN_FILENO, fileno));
356✔
558

559
                return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
32✔
560

561
        case EXEC_OUTPUT_KMSG:
10,628✔
562
        case EXEC_OUTPUT_KMSG_AND_CONSOLE:
563
        case EXEC_OUTPUT_JOURNAL:
564
        case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
565
                r = connect_logger_as(context, params, o, ident, fileno, uid, gid);
10,628✔
566
                if (r < 0) {
10,628✔
567
                        log_warning_errno(r, "Failed to connect %s to the journal socket, ignoring: %m",
×
568
                                          fileno == STDOUT_FILENO ? "stdout" : "stderr");
569
                        r = open_null_as(O_WRONLY, fileno);
×
570
                } else {
571
                        struct stat st;
10,628✔
572

573
                        /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
574
                         * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
575
                         * services to detect whether they are connected to the journal or not.
576
                         *
577
                         * If both stdout and stderr are connected to a stream then let's make sure to store the data
578
                         * about STDERR as that's usually the best way to do logging. */
579

580
                        if (fstat(fileno, &st) >= 0 &&
10,628✔
581
                            (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
10,628✔
582
                                *journal_stream_dev = st.st_dev;
10,628✔
583
                                *journal_stream_ino = st.st_ino;
10,628✔
584
                        }
585
                }
586
                return r;
587

588
        case EXEC_OUTPUT_SOCKET:
9✔
589
                assert(socket_fd >= 0);
9✔
590

591
                return RET_NERRNO(dup2(socket_fd, fileno));
9✔
592

593
        case EXEC_OUTPUT_NAMED_FD:
×
594
                assert(named_iofds[fileno] >= 0);
×
595

596
                (void) fd_nonblock(named_iofds[fileno], false);
×
597
                return RET_NERRNO(dup2(named_iofds[fileno], fileno));
×
598

599
        case EXEC_OUTPUT_FILE:
11✔
600
        case EXEC_OUTPUT_FILE_APPEND:
601
        case EXEC_OUTPUT_FILE_TRUNCATE: {
602
                bool rw;
11✔
603
                int fd, flags;
11✔
604

605
                assert(context->stdio_file[fileno]);
11✔
606

607
                rw = context->std_input == EXEC_INPUT_FILE &&
11✔
608
                        streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
×
609

610
                if (rw)
11✔
611
                        return RET_NERRNO(dup2(STDIN_FILENO, fileno));
×
612

613
                flags = O_WRONLY;
11✔
614
                if (o == EXEC_OUTPUT_FILE_APPEND)
11✔
615
                        flags |= O_APPEND;
616
                else if (o == EXEC_OUTPUT_FILE_TRUNCATE)
9✔
617
                        flags |= O_TRUNC;
3✔
618

619
                fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
11✔
620
                if (fd < 0)
11✔
621
                        return fd;
622

623
                return move_fd(fd, fileno, 0);
11✔
624
        }
625

626
        default:
×
627
                assert_not_reached();
×
628
        }
629
}
630

631
static int chown_terminal(int fd, uid_t uid) {
2,743✔
632
        int r;
2,743✔
633

634
        assert(fd >= 0);
2,743✔
635

636
        /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
637
        if (!isatty_safe(fd))
2,743✔
638
                return 0;
639

640
        /* This might fail. What matters are the results. */
641
        r = fchmod_and_chown(fd, TTY_MODE, uid, GID_INVALID);
7✔
642
        if (r < 0)
7✔
643
                return r;
×
644

645
        return 1;
646
}
647

648
static int setup_confirm_stdio(
×
649
                const ExecContext *context,
650
                const char *vc,
651
                int *ret_saved_stdin,
652
                int *ret_saved_stdout) {
653

654
        _cleanup_close_ int fd = -EBADF, saved_stdin = -EBADF, saved_stdout = -EBADF;
×
655
        int r;
×
656

657
        assert(context);
×
658
        assert(ret_saved_stdin);
×
659
        assert(ret_saved_stdout);
×
660

661
        saved_stdin = fcntl(STDIN_FILENO, F_DUPFD_CLOEXEC, 3);
×
662
        if (saved_stdin < 0)
×
663
                return -errno;
×
664

665
        saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD_CLOEXEC, 3);
×
666
        if (saved_stdout < 0)
×
667
                return -errno;
×
668

669
        fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
×
670
        if (fd < 0)
×
671
                return fd;
672

673
        _cleanup_close_ int lock_fd = lock_dev_console();
×
674
        if (lock_fd < 0)
×
675
                log_debug_errno(lock_fd, "Failed to lock /dev/console, ignoring: %m");
×
676

677
        r = chown_terminal(fd, getuid());
×
678
        if (r < 0)
×
679
                return r;
680

681
        r = terminal_reset_defensive(fd, TERMINAL_RESET_SWITCH_TO_TEXT);
×
682
        if (r < 0)
×
683
                return r;
684

685
        r = exec_context_apply_tty_size(context, fd, fd, vc);
×
686
        if (r < 0)
×
687
                return r;
688

689
        r = rearrange_stdio(fd, fd, STDERR_FILENO); /* Invalidates 'fd' also on failure */
×
690
        TAKE_FD(fd);
×
691
        if (r < 0)
×
692
                return r;
693

694
        *ret_saved_stdin = TAKE_FD(saved_stdin);
×
695
        *ret_saved_stdout = TAKE_FD(saved_stdout);
×
696
        return 0;
×
697
}
698

699
/* Writes a message to 'fd' explaining that the confirmation question for unit 'unit_id' could not
 * be asked because of error 'err' (positive or negative errno-style value), and that a positive
 * response is assumed. */
static void write_confirm_error_fd(int err, int fd, const char *unit_id) {
        assert(err != 0);
        assert(fd >= 0);
        assert(unit_id);

        /* Store the error in errno so that %m below expands to its description. */
        errno = abs(err);

        if (errno != ETIMEDOUT) {
                dprintf(fd, "Couldn't ask confirmation for %s, assuming positive response: %m\n", unit_id);
                return;
        }

        dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", unit_id);
}
×
711

712
static void write_confirm_error(int err, const char *vc, const char *unit_id) {
×
713
        _cleanup_close_ int fd = -EBADF;
×
714

715
        assert(vc);
×
716

717
        fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
×
718
        if (fd < 0)
×
719
                return;
×
720

721
        write_confirm_error_fd(err, fd, unit_id);
×
722
}
723

724
/* Releases the terminal and puts the previously saved stdin/stdout duplicates back in place. The
 * saved fds are closed and *saved_stdin/*saved_stdout are updated with the result of safe_close().
 *
 * Returns 0 on success, or the negative errno of the last dup2() that failed. */
static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
        int ret = 0;

        assert(saved_stdin);
        assert(saved_stdout);

        release_terminal();

        if (*saved_stdin >= 0 && dup2(*saved_stdin, STDIN_FILENO) < 0)
                ret = -errno;

        if (*saved_stdout >= 0 && dup2(*saved_stdout, STDOUT_FILENO) < 0)
                ret = -errno;

        *saved_stdin = safe_close(*saved_stdin);
        *saved_stdout = safe_close(*saved_stdout);

        return ret;
}
745

746
/* Possible outcomes of asking the user for confirmation before executing a command. */
enum {
        CONFIRM_PRETEND_FAILURE = -1, /* don't execute the command and pretend it failed */
        CONFIRM_PRETEND_SUCCESS,      /* (0) don't execute the command and pretend it succeeded */
        CONFIRM_EXECUTE,              /* (1) execute the command */
};
751

752
/* Returns true if the flag file /run/systemd/confirm_spawn_disabled exists (more precisely: if
 * access() with F_OK succeeds on it). */
static bool confirm_spawn_disabled(void) {
        if (access("/run/systemd/confirm_spawn_disabled", F_OK) < 0)
                return false;

        return true;
}
755

756
static int ask_for_confirmation(const ExecContext *context, const ExecParameters *params, const char *cmdline) {
×
757
        int saved_stdout = -EBADF, saved_stdin = -EBADF, r;
×
758
        _cleanup_free_ char *e = NULL;
×
759
        char c;
×
760

761
        assert(context);
×
762
        assert(params);
×
763

764
        /* For any internal errors, assume a positive response. */
765
        r = setup_confirm_stdio(context, params->confirm_spawn, &saved_stdin, &saved_stdout);
×
766
        if (r < 0) {
×
767
                write_confirm_error(r, params->confirm_spawn, params->unit_id);
×
768
                return CONFIRM_EXECUTE;
769
        }
770

771
        /* confirm_spawn might have been disabled while we were sleeping. */
772
        if (!params->confirm_spawn || confirm_spawn_disabled()) {
×
773
                r = 1;
×
774
                goto restore_stdio;
×
775
        }
776

777
        e = ellipsize(cmdline, 60, 100);
×
778
        if (!e) {
×
779
                log_oom();
×
780
                r = CONFIRM_EXECUTE;
×
781
                goto restore_stdio;
×
782
        }
783

784
        for (;;) {
×
785
                r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
×
786
                if (r < 0) {
×
787
                        write_confirm_error_fd(r, STDOUT_FILENO, params->unit_id);
×
788
                        r = CONFIRM_EXECUTE;
×
789
                        goto restore_stdio;
×
790
                }
791

792
                switch (c) {
×
793
                case 'c':
×
794
                        printf("Resuming normal execution.\n");
×
795
                        manager_disable_confirm_spawn();
×
796
                        r = 1;
797
                        break;
798
                case 'D':
×
799
                        printf("  Unit: %s\n",
×
800
                               params->unit_id);
×
801
                        exec_context_dump(context, stdout, "  ");
×
802
                        exec_params_dump(params, stdout, "  ");
×
803
                        continue; /* ask again */
×
804
                case 'f':
×
805
                        printf("Failing execution.\n");
×
806
                        r = CONFIRM_PRETEND_FAILURE;
807
                        break;
808
                case 'h':
×
809
                        printf("  c - continue, proceed without asking anymore\n"
×
810
                               "  D - dump, show the state of the unit\n"
811
                               "  f - fail, don't execute the command and pretend it failed\n"
812
                               "  h - help\n"
813
                               "  i - info, show a short summary of the unit\n"
814
                               "  j - jobs, show jobs that are in progress\n"
815
                               "  s - skip, don't execute the command and pretend it succeeded\n"
816
                               "  y - yes, execute the command\n");
817
                        continue; /* ask again */
×
818
                case 'i':
×
819
                        printf("  Unit:        %s\n"
×
820
                               "  Command:     %s\n",
821
                               params->unit_id, cmdline);
×
822
                        continue; /* ask again */
×
823
                case 'j':
×
824
                        if (sigqueue(getppid(),
×
825
                                     SIGRTMIN+18,
×
826
                                     (const union sigval) { .sival_int = MANAGER_SIGNAL_COMMAND_DUMP_JOBS }) < 0)
×
827
                                return -errno;
×
828

829
                        continue; /* ask again */
×
830
                case 'n':
×
831
                        /* 'n' was removed in favor of 'f'. */
832
                        printf("Didn't understand 'n', did you mean 'f'?\n");
×
833
                        continue; /* ask again */
×
834
                case 's':
×
835
                        printf("Skipping execution.\n");
×
836
                        r = CONFIRM_PRETEND_SUCCESS;
837
                        break;
838
                case 'y':
839
                        r = CONFIRM_EXECUTE;
840
                        break;
841
                default:
×
842
                        assert_not_reached();
×
843
                }
844
                break;
845
        }
846

847
restore_stdio:
×
848
        restore_confirm_stdio(&saved_stdin, &saved_stdout);
×
849
        return r;
850
}
851

852
static int get_fixed_user(
9,666✔
853
                const char *user_or_uid,
854
                bool prefer_nss,
855
                const char **ret_username,
856
                uid_t *ret_uid,
857
                gid_t *ret_gid,
858
                const char **ret_home,
859
                const char **ret_shell) {
860

861
        int r;
9,666✔
862

863
        assert(user_or_uid);
9,666✔
864
        assert(ret_username);
9,666✔
865

866
        r = get_user_creds(&user_or_uid, ret_uid, ret_gid, ret_home, ret_shell,
18,879✔
867
                           USER_CREDS_CLEAN|(prefer_nss ? USER_CREDS_PREFER_NSS : 0));
868
        if (r < 0)
9,666✔
869
                return r;
870

871
        /* user_or_uid is normalized by get_user_creds to username */
872
        *ret_username = user_or_uid;
9,664✔
873

874
        return 0;
9,664✔
875
}
876

877
static int get_fixed_group(
11✔
878
                const char *group_or_gid,
879
                const char **ret_groupname,
880
                gid_t *ret_gid) {
881

882
        int r;
11✔
883

884
        assert(group_or_gid);
11✔
885
        assert(ret_groupname);
11✔
886

887
        r = get_group_creds(&group_or_gid, ret_gid, /* flags = */ 0);
11✔
888
        if (r < 0)
11✔
889
                return r;
890

891
        /* group_or_gid is normalized by get_group_creds to groupname */
892
        *ret_groupname = group_or_gid;
11✔
893

894
        return 0;
11✔
895
}
896

897
static int get_supplementary_groups(
11,853✔
898
                const ExecContext *c,
899
                const char *user,
900
                gid_t gid,
901
                gid_t **ret_gids) {
902

903
        int r;
11,853✔
904

905
        assert(c);
11,853✔
906
        assert(ret_gids);
11,853✔
907

908
        /*
909
         * If user is given, then lookup GID and supplementary groups list.
910
         * We avoid NSS lookups for gid=0. Also we have to initialize groups
911
         * here and as early as possible so we keep the list of supplementary
912
         * groups of the caller.
913
         */
914
        bool keep_groups = false;
11,853✔
915
        if (user && gid_is_valid(gid) && gid != 0) {
14,596✔
916
                /* First step, initialize groups from /etc/group */
917
                if (initgroups(user, gid) < 0)
2,595✔
918
                        return -errno;
11,853✔
919

920
                keep_groups = true;
921
        }
922

923
        if (strv_isempty(c->supplementary_groups)) {
11,853✔
924
                *ret_gids = NULL;
11,844✔
925
                return 0;
11,844✔
926
        }
927

928
        /*
929
         * If SupplementaryGroups= was passed then NGROUPS_MAX has to
930
         * be positive, otherwise fail.
931
         */
932
        errno = 0;
9✔
933
        int ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
9✔
934
        if (ngroups_max <= 0)
9✔
935
                return errno_or_else(EOPNOTSUPP);
×
936

937
        _cleanup_free_ gid_t *l_gids = new(gid_t, ngroups_max);
18✔
938
        if (!l_gids)
9✔
939
                return -ENOMEM;
940

941
        int k = 0;
9✔
942
        if (keep_groups) {
9✔
943
                /*
944
                 * Lookup the list of groups that the user belongs to, we
945
                 * avoid NSS lookups here too for gid=0.
946
                 */
947
                k = ngroups_max;
9✔
948
                if (getgrouplist(user, gid, l_gids, &k) < 0)
9✔
949
                        return -EINVAL;
950
        }
951

952
        STRV_FOREACH(i, c->supplementary_groups) {
18✔
953
                if (k >= ngroups_max)
9✔
954
                        return -E2BIG;
×
955

956
                const char *g = *i;
9✔
957
                r = get_group_creds(&g, l_gids + k, /* flags = */ 0);
9✔
958
                if (r < 0)
9✔
959
                        return r;
960

961
                k++;
9✔
962
        }
963

964
        if (k == 0) {
9✔
965
                *ret_gids = NULL;
×
966
                return 0;
×
967
        }
968

969
        /* Otherwise get the final list of supplementary groups */
970
        gid_t *groups = newdup(gid_t, l_gids, k);
9✔
971
        if (!groups)
9✔
972
                return -ENOMEM;
973

974
        *ret_gids = groups;
9✔
975
        return k;
9✔
976
}
977

978
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {
9,837✔
979
        int r;
9,837✔
980

981
        /* Handle SupplementaryGroups= if it is not empty */
982
        if (ngids > 0) {
9,837✔
983
                r = maybe_setgroups(ngids, supplementary_gids);
273✔
984
                if (r < 0)
273✔
985
                        return r;
986
        }
987

988
        if (gid_is_valid(gid)) {
9,837✔
989
                /* Then set our gids */
990
                if (setresgid(gid, gid, gid) < 0)
2,079✔
991
                        return -errno;
1✔
992
        }
993

994
        return 0;
995
}
996

997
static int set_securebits(unsigned bits, unsigned mask) {
763✔
998
        unsigned applied;
763✔
999
        int current;
763✔
1000

1001
        current = prctl(PR_GET_SECUREBITS);
763✔
1002
        if (current < 0)
763✔
1003
                return -errno;
×
1004

1005
        /* Clear all securebits defined in mask and set bits */
1006
        applied = ((unsigned) current & ~mask) | bits;
763✔
1007
        if ((unsigned) current == applied)
763✔
1008
                return 0;
1009

1010
        if (prctl(PR_SET_SECUREBITS, applied) < 0)
53✔
1011
                return -errno;
×
1012

1013
        return 1;
1014
}
1015

1016
static int enforce_user(
2,072✔
1017
                const ExecContext *context,
1018
                uid_t uid,
1019
                uint64_t capability_ambient_set) {
1020

1021
        int r;
2,072✔
1022

1023
        assert(context);
2,072✔
1024

1025
        if (!uid_is_valid(uid))
2,072✔
1026
                return 0;
1027

1028
        /* Sets (but doesn't look up) the UID and makes sure we keep the capabilities while doing so. For
1029
         * setting secure bits the capability CAP_SETPCAP is required, so we also need keep-caps in this
1030
         * case. */
1031

1032
        if ((capability_ambient_set != 0 || context->secure_bits != 0) && uid != 0) {
2,072✔
1033

1034
                /* First step: If we need to keep capabilities but drop privileges we need to make sure we
1035
                 * keep our caps, while we drop privileges. Add KEEP_CAPS to the securebits */
1036
                r = set_securebits(1U << SECURE_KEEP_CAPS, 0);
763✔
1037
                if (r < 0)
763✔
1038
                        return r;
1039
        }
1040

1041
        /* Second step: actually set the uids */
1042
        if (setresuid(uid, uid, uid) < 0)
2,072✔
1043
                return -errno;
×
1044

1045
        /* At this point we should have all necessary capabilities but are otherwise a normal user. However,
1046
         * the caps might have gotten corrupted due to the setresuid(), so we need to clean them up later. This is done
1047
         * outside of this call. */
1048
        return 0;
1049
}
1050

1051
#if HAVE_PAM
1052

1053
static void pam_response_free_array(struct pam_response *responses, size_t n_responses) {
×
1054
        assert(responses || n_responses == 0);
×
1055

1056
        FOREACH_ARRAY(resp, responses, n_responses)
×
1057
                erase_and_free(resp->resp);
×
1058

1059
        free(responses);
×
1060
}
×
1061

1062
typedef struct AskPasswordConvData {
1063
        const ExecContext *context;
1064
        const ExecParameters *params;
1065
} AskPasswordConvData;
1066

1067
static int ask_password_conv(
5✔
1068
                int num_msg,
1069
                const struct pam_message *msg[],
1070
                struct pam_response **ret,
1071
                void *userdata) {
1072

1073
        AskPasswordConvData *data = ASSERT_PTR(userdata);
5✔
1074
        bool set_credential_env_var = false;
5✔
1075
        int r;
5✔
1076

1077
        assert(num_msg >= 0);
5✔
1078
        assert(msg);
5✔
1079
        assert(data->context);
5✔
1080
        assert(data->params);
5✔
1081

1082
        size_t n = num_msg;
5✔
1083
        struct pam_response *responses = new0(struct pam_response, n);
5✔
1084
        if (!responses)
5✔
1085
                return PAM_BUF_ERR;
5✔
1086
        CLEANUP_ARRAY(responses, n, pam_response_free_array);
5✔
1087

1088
        for (size_t i = 0; i < n; i++) {
10✔
1089
                const struct pam_message *mi = *msg + i;
5✔
1090

1091
                switch (mi->msg_style) {
5✔
1092

1093
                case PAM_PROMPT_ECHO_ON:
2✔
1094
                case PAM_PROMPT_ECHO_OFF: {
1095

1096
                        /* Locally set the $CREDENTIALS_DIRECTORY to the credentials directory we just populated */
1097
                        if (!set_credential_env_var) {
2✔
1098
                                _cleanup_free_ char *creds_dir = NULL;
2✔
1099
                                r = exec_context_get_credential_directory(data->context, data->params, data->params->unit_id, &creds_dir);
2✔
1100
                                if (r < 0)
2✔
1101
                                        return log_error_errno(r, "Failed to determine credentials directory: %m");
×
1102

1103
                                if (creds_dir) {
2✔
1104
                                        if (setenv("CREDENTIALS_DIRECTORY", creds_dir, /* overwrite= */ true) < 0)
2✔
1105
                                                /* setenv() reports failure via errno; 'r' still holds the earlier success value. */
                                                return log_error_errno(errno, "Failed to set $CREDENTIALS_DIRECTORY: %m");
×
1106
                                } else
1107
                                        (void) unsetenv("CREDENTIALS_DIRECTORY");
×
1108

1109
                                set_credential_env_var = true;
2✔
1110
                        }
1111

1112
                        _cleanup_free_ char *credential_name = strjoin("pam.authtok.", data->context->pam_name);
4✔
1113
                        if (!credential_name)
2✔
1114
                                return log_oom();
×
1115

1116
                        AskPasswordRequest req = {
4✔
1117
                                .message = mi->msg,
2✔
1118
                                .credential = credential_name,
1119
                                .tty_fd = -EBADF,
1120
                                .hup_fd = -EBADF,
1121
                                .until = usec_add(now(CLOCK_MONOTONIC), 15 * USEC_PER_SEC),
2✔
1122
                        };
1123

1124
                        _cleanup_strv_free_erase_ char **acquired = NULL;
×
1125
                        r = ask_password_auto(
2✔
1126
                                        &req,
1127
                                        ASK_PASSWORD_ACCEPT_CACHED|
1128
                                        ASK_PASSWORD_NO_TTY|
1129
                                        (mi->msg_style == PAM_PROMPT_ECHO_ON ? ASK_PASSWORD_ECHO : 0),
2✔
1130
                                        &acquired);
1131
                        if (r < 0) {
2✔
1132
                                log_error_errno(r, "Failed to query for password: %m");
×
1133
                                return PAM_CONV_ERR;
×
1134
                        }
1135

1136
                        responses[i].resp = strdup(ASSERT_PTR(acquired[0]));
2✔
1137
                        if (!responses[i].resp) {
2✔
1138
                                log_oom();
×
1139
                                return PAM_BUF_ERR;
1140
                        }
1141
                        break;
2✔
1142
                }
1143

1144
                case PAM_ERROR_MSG:
1145
                        log_error("PAM: %s", mi->msg);
×
1146
                        break;
1147

1148
                case PAM_TEXT_INFO:
1149
                        log_info("PAM: %s", mi->msg);
3✔
1150
                        break;
1151

1152
                default:
1153
                        return PAM_CONV_ERR;
1154
                }
1155
        }
1156

1157
        *ret = TAKE_PTR(responses);
5✔
1158
        n = 0;
5✔
1159

1160
        return PAM_SUCCESS;
5✔
1161
}
1162

1163
static int pam_close_session_and_delete_credentials(pam_handle_t *handle, int flags) {
224✔
1164
        int r, s;
224✔
1165

1166
        assert(handle);
224✔
1167

1168
        r = pam_close_session(handle, flags);
224✔
1169
        if (r != PAM_SUCCESS)
224✔
1170
                log_debug("pam_close_session() failed: %s", pam_strerror(handle, r));
49✔
1171

1172
        s = pam_setcred(handle, PAM_DELETE_CRED | flags);
224✔
1173
        if (s != PAM_SUCCESS)
224✔
1174
                log_debug("pam_setcred(PAM_DELETE_CRED) failed: %s", pam_strerror(handle, s));
155✔
1175

1176
        return r != PAM_SUCCESS ? r : s;
224✔
1177
}
1178
#endif
1179

1180
static int attach_to_subcgroup(
13✔
1181
                const ExecContext *context,
1182
                const CGroupContext *cgroup_context,
1183
                const ExecParameters *params,
1184
                const char *prefix) {
1185

1186
        _cleanup_free_ char *subgroup = NULL;
13✔
1187
        int r;
13✔
1188

1189
        assert(context);
13✔
1190
        assert(cgroup_context);
13✔
1191
        assert(params);
13✔
1192

1193
        /* If we're a control process that needs a subgroup, we've already been spawned into it as otherwise
1194
         * we'd violate the "no inner processes" rule, so no need to do anything. */
1195
        if (exec_params_needs_control_subcgroup(params))
13✔
1196
                return 0;
1197

1198
        r = exec_params_get_cgroup_path(params, cgroup_context, prefix, &subgroup);
12✔
1199
        if (r < 0)
12✔
1200
                return log_error_errno(r, "Failed to acquire cgroup path: %m");
×
1201
        /* No subgroup required? Then there's nothing to do. */
1202
        if (r == 0)
12✔
1203
                return 0;
1204

1205
        r = cg_attach(subgroup, 0);
4✔
1206
        if (r == -EUCLEAN)
4✔
1207
                return log_error_errno(r,
×
1208
                                "Failed to attach process " PID_FMT " to cgroup '%s', "
1209
                                "because the cgroup or one of its parents or "
1210
                                "siblings is in the threaded mode.",
1211
                                getpid_cached(), subgroup);
1212
        if (r < 0)
4✔
1213
                return log_error_errno(r,
×
1214
                                "Failed to attach process " PID_FMT " to cgroup %s: %m",
1215
                                getpid_cached(), subgroup);
1216

1217
        return 0;
1218
}
1219

1220
static int setup_pam(
405✔
1221
                const ExecContext *context,
1222
                const CGroupContext *cgroup_context,
1223
                ExecParameters *params,
1224
                const char *user,
1225
                uid_t uid,
1226
                gid_t gid,
1227
                char ***env, /* updated on success */
1228
                const int fds[], size_t n_fds,
1229
                bool needs_sandboxing,
1230
                int exec_fd) {
1231

1232
#if HAVE_PAM
1233
        AskPasswordConvData conv_data = {
405✔
1234
                .context = context,
1235
                .params = params,
1236
        };
1237

1238
        const struct pam_conv conv = {
405✔
1239
                .conv = ask_password_conv,
1240
                .appdata_ptr = &conv_data,
1241
        };
1242

1243
        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
405✔
1244
        _cleanup_strv_free_ char **e = NULL;
×
1245
        _cleanup_free_ char *tty = NULL;
405✔
1246
        pam_handle_t *handle = NULL;
405✔
1247
        sigset_t old_ss;
405✔
1248
        int pam_code = PAM_SUCCESS, r;
405✔
1249
        bool close_session = false;
405✔
1250
        pid_t parent_pid;
405✔
1251
        int flags = 0;
405✔
1252

1253
        assert(context);
405✔
1254
        assert(params);
405✔
1255
        assert(user);
405✔
1256
        assert(uid_is_valid(uid));
405✔
1257
        assert(gid_is_valid(gid));
405✔
1258
        assert(fds || n_fds == 0);
405✔
1259
        assert(env);
405✔
1260

1261
        /* We set up PAM in the parent process, then fork. The child
1262
         * will then stay around until killed via PR_SET_PDEATHSIG or
1263
         * systemd via the cgroup logic. It will then remove the PAM
1264
         * session again. The parent process will exec() the actual
1265
         * daemon. We do things this way to ensure that the main PID
1266
         * of the daemon is the one we initially fork()ed. */
1267

1268
        r = barrier_create(&barrier);
405✔
1269
        if (r < 0)
405✔
1270
                goto fail;
×
1271

1272
        if (log_get_max_level() < LOG_DEBUG)
405✔
1273
                flags |= PAM_SILENT;
3✔
1274

1275
        pam_code = pam_start(context->pam_name, user, &conv, &handle);
405✔
1276
        if (pam_code != PAM_SUCCESS) {
405✔
1277
                handle = NULL;
×
1278
                goto fail;
×
1279
        }
1280

1281
        if (getttyname_malloc(STDIN_FILENO, &tty) >= 0) {
405✔
1282
                _cleanup_free_ char *q = path_join("/dev", tty);
6✔
1283
                if (!q) {
6✔
1284
                        r = -ENOMEM;
×
1285
                        goto fail;
×
1286
                }
1287

1288
                free_and_replace(tty, q);
6✔
1289
        }
1290

1291
        if (tty) {
405✔
1292
                pam_code = pam_set_item(handle, PAM_TTY, tty);
6✔
1293
                if (pam_code != PAM_SUCCESS)
6✔
1294
                        goto fail;
×
1295
        }
1296

1297
        STRV_FOREACH(nv, *env) {
5,729✔
1298
                pam_code = pam_putenv(handle, *nv);
5,324✔
1299
                if (pam_code != PAM_SUCCESS)
5,324✔
1300
                        goto fail;
×
1301
        }
1302

1303
        pam_code = pam_acct_mgmt(handle, flags);
405✔
1304
        if (pam_code != PAM_SUCCESS)
405✔
1305
                goto fail;
×
1306

1307
        pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
405✔
1308
        if (pam_code != PAM_SUCCESS)
405✔
1309
                log_debug("pam_setcred(PAM_ESTABLISH_CRED) failed, ignoring: %s", pam_strerror(handle, pam_code));
332✔
1310

1311
        pam_code = pam_open_session(handle, flags);
405✔
1312
        if (pam_code != PAM_SUCCESS)
405✔
1313
                goto fail;
×
1314

1315
        close_session = true;
405✔
1316

1317
        e = pam_getenvlist(handle);
405✔
1318
        if (!e) {
405✔
1319
                pam_code = PAM_BUF_ERR;
×
1320
                goto fail;
×
1321
        }
1322

1323
        /* Block SIGTERM, so that we know that it won't get lost in the child */
1324

1325
        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM) >= 0);
405✔
1326

1327
        parent_pid = getpid_cached();
405✔
1328

1329
        r = safe_fork("(sd-pam)", 0, NULL);
405✔
1330
        if (r < 0)
629✔
1331
                goto fail;
×
1332
        if (r == 0) {
629✔
1333
                int ret = EXIT_PAM;
224✔
1334

1335
                if (needs_sandboxing && exec_needs_cgroup_namespace(context) && params->cgroup_path) {
224✔
1336
                        /* Move PAM process into subgroup immediately if the main process hasn't been moved
1337
                         * into the subgroup yet (when cgroup namespacing is enabled) and a subgroup is
1338
                         * configured. */
1339
                        r = attach_to_subcgroup(context, cgroup_context, params, params->cgroup_path);
1✔
1340
                        if (r < 0)
1✔
1341
                                return r;
1342
                }
1343

1344
                /* The child's job is to reset the PAM session on termination */
1345
                barrier_set_role(&barrier, BARRIER_CHILD);
224✔
1346

1347
                /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only
1348
                 * those fds are open here that have been opened by PAM. */
1349
                (void) close_many(fds, n_fds);
224✔
1350

1351
                /* Also close the 'exec_fd' in the child, since the service manager waits for the EOF induced
1352
                 * by the execve() to wait for completion, and if we'd keep the fd open here in the child
1353
                 * we'd never signal completion. */
1354
                exec_fd = safe_close(exec_fd);
224✔
1355

1356
                /* Drop privileges - we don't need any to pam_close_session and this will make
1357
                 * PR_SET_PDEATHSIG work in most cases.  If this fails, ignore the error - but expect sd-pam
1358
                 * threads to fail to exit normally */
1359

1360
                r = fully_set_uid_gid(uid, gid, /* supplementary_gids= */ NULL, /* n_supplementary_gids= */ 0);
224✔
1361
                if (r < 0)
224✔
1362
                        log_warning_errno(r, "Failed to drop privileges in sd-pam: %m");
×
1363

1364
                (void) ignore_signals(SIGPIPE);
224✔
1365

1366
                /* Wait until our parent died. This will only work if the above setresuid() succeeds,
1367
                 * otherwise the kernel will not allow unprivileged parents to kill their privileged children
1368
                 * this way. We rely on the control groups kill logic to do the rest for us. */
1369
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
224✔
1370
                        goto child_finish;
×
1371

1372
                /* Tell the parent that our setup is done. This is especially important regarding dropping
1373
                 * privileges. Otherwise, unit setup might race against our setresuid(2) call.
1374
                 *
1375
                 * If the parent aborted, we'll detect this below, hence ignore return failure here. */
1376
                (void) barrier_place(&barrier);
224✔
1377

1378
                /* Check if our parent process might already have died? */
1379
                if (getppid() == parent_pid) {
224✔
1380
                        sigset_t ss;
224✔
1381
                        int sig;
224✔
1382

1383
                        assert_se(sigemptyset(&ss) >= 0);
224✔
1384
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);
224✔
1385

1386
                        assert_se(sigwait(&ss, &sig) == 0);
224✔
1387
                        assert(sig == SIGTERM);
224✔
1388
                }
1389

1390
                /* If our parent died we'll end the session */
1391
                if (getppid() != parent_pid) {
224✔
1392
                        pam_code = pam_close_session_and_delete_credentials(handle, flags);
224✔
1393
                        if (pam_code != PAM_SUCCESS)
224✔
1394
                                goto child_finish;
155✔
1395
                }
1396

1397
                ret = 0;
1398

1399
        child_finish:
224✔
1400
                /* NB: pam_end() when called in child processes should set PAM_DATA_SILENT to let the module
1401
                 * know about this. See pam_end(3) */
1402
                (void) pam_end(handle, pam_code | flags | PAM_DATA_SILENT);
224✔
1403
                _exit(ret);
224✔
1404
        }
1405

1406
        barrier_set_role(&barrier, BARRIER_PARENT);
405✔
1407

1408
        /* If the child was forked off successfully it will do all the cleanups, so forget about the handle
1409
         * here. */
1410
        handle = NULL;
405✔
1411

1412
        /* Unblock SIGTERM again in the parent */
1413
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);
405✔
1414

1415
        /* We close the log explicitly here, since the PAM modules might have opened it, but we don't want
1416
         * this fd around. */
1417
        closelog();
405✔
1418

1419
        /* Synchronously wait for the child to initialize. We don't care for errors as we cannot
1420
         * recover. However, warn loudly if it happens. */
1421
        if (!barrier_place_and_sync(&barrier))
810✔
1422
                log_error("PAM initialization failed");
×
1423

1424
        return strv_free_and_replace(*env, e);
405✔
1425

1426
fail:
×
1427
        if (pam_code != PAM_SUCCESS) {
×
1428
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
×
1429
                r = -EPERM;  /* PAM errors do not map to errno */
1430
        } else
1431
                log_error_errno(r, "PAM failed: %m");
×
1432

1433
        if (handle) {
×
1434
                if (close_session)
×
1435
                        pam_code = pam_close_session_and_delete_credentials(handle, flags);
×
1436

1437
                (void) pam_end(handle, pam_code | flags);
×
1438
        }
1439

1440
        closelog();
×
1441
        return r;
1442
#else
1443
        return 0;
1444
#endif
1445
}
1446

1447
static void rename_process_from_path(const char *path) {
11,856✔
1448
        _cleanup_free_ char *buf = NULL;
11,856✔
1449
        const char *p;
11,856✔
1450

1451
        assert(path);
11,856✔
1452

1453
        /* This resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty in
1454
         * /bin/ps */
1455

1456
        if (path_extract_filename(path, &buf) < 0) {
11,856✔
1457
                rename_process("(...)");
×
1458
                return;
×
1459
        }
1460

1461
        size_t l = strlen(buf);
11,856✔
1462
        if (l > 8) {
11,856✔
1463
                /* The end of the process name is usually more interesting, since the first bit might just be
1464
                 * "systemd-" */
1465
                p = buf + l - 8;
8,158✔
1466
                l = 8;
8,158✔
1467
        } else
1468
                p = buf;
1469

1470
        char process_name[11];
11,856✔
1471
        process_name[0] = '(';
11,856✔
1472
        memcpy(process_name+1, p, l);
11,856✔
1473
        process_name[1+l] = ')';
11,856✔
1474
        process_name[1+l+1] = 0;
11,856✔
1475

1476
        (void) rename_process(process_name);
11,856✔
1477
}
1478

1479
static bool context_has_address_families(const ExecContext *c) {
12,831✔
1480
        assert(c);
12,831✔
1481

1482
        return c->address_families_allow_list ||
12,831✔
1483
                !set_isempty(c->address_families);
11,298✔
1484
}
1485

1486
static bool context_has_syscall_filters(const ExecContext *c) {
12,795✔
1487
        assert(c);
12,795✔
1488

1489
        return c->syscall_allow_list ||
12,795✔
1490
                !hashmap_isempty(c->syscall_filter);
11,277✔
1491
}
1492

1493
static bool context_has_syscall_logs(const ExecContext *c) {
12,795✔
1494
        assert(c);
12,795✔
1495

1496
        return c->syscall_log_allow_list ||
12,795✔
1497
                !hashmap_isempty(c->syscall_log);
12,795✔
1498
}
1499

1500
static bool context_has_seccomp(const ExecContext *c) {
3,736✔
1501
        assert(c);
3,736✔
1502

1503
        /* We need NNP if we have any form of seccomp and are unprivileged */
1504
        return c->lock_personality ||
6,737✔
1505
                c->memory_deny_write_execute ||
3,001✔
1506
                c->private_devices ||
3,001✔
1507
                c->protect_clock ||
3,001✔
1508
                c->protect_hostname == PROTECT_HOSTNAME_YES ||
3,001✔
1509
                c->protect_kernel_tunables ||
3,001✔
1510
                c->protect_kernel_modules ||
3,001✔
1511
                c->protect_kernel_logs ||
6,002✔
1512
                context_has_address_families(c) ||
6,002✔
1513
                exec_context_restrict_namespaces_set(c) ||
3,001✔
1514
                c->restrict_realtime ||
3,001✔
1515
                c->restrict_suid_sgid ||
3,001✔
1516
                !set_isempty(c->syscall_archs) ||
5,930✔
1517
                context_has_syscall_filters(c) ||
9,666✔
1518
                context_has_syscall_logs(c);
2,965✔
1519
}
1520

1521
static bool context_has_no_new_privileges(const ExecContext *c) {
9,830✔
1522
        assert(c);
9,830✔
1523

1524
        if (c->no_new_privileges)
9,830✔
1525
                return true;
1526

1527
        if (have_effective_cap(CAP_SYS_ADMIN) > 0) /* if we are privileged, we don't need NNP */
8,375✔
1528
                return false;
1529

1530
        return context_has_seccomp(c);
1,664✔
1531
}
1532

1533
#if HAVE_SECCOMP
1534

1535
static bool seccomp_allows_drop_privileges(const ExecContext *c) {
771✔
1536
        void *id, *val;
771✔
1537
        bool have_capget = false, have_capset = false, have_prctl = false;
771✔
1538

1539
        assert(c);
771✔
1540

1541
        /* No syscall filter, we are allowed to drop privileges */
1542
        if (hashmap_isempty(c->syscall_filter))
771✔
1543
                return true;
771✔
1544

1545
        HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) {
280,048✔
1546
                _cleanup_free_ char *name = NULL;
279,329✔
1547

1548
                name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
279,329✔
1549

1550
                /* seccomp_syscall_resolve_num_arch() returns NULL if the number cannot be resolved;
                 * streq() is not NULL-safe, so skip such entries. */
                if (!name)
                        continue;

                if (streq(name, "capget"))
279,329✔
1551
                        have_capget = true;
1552
                else if (streq(name, "capset"))
278,610✔
1553
                        have_capset = true;
1554
                else if (streq(name, "prctl"))
277,891✔
1555
                        have_prctl = true;
719✔
1556
        }
1557

1558
        if (c->syscall_allow_list)
719✔
1559
                return have_capget && have_capset && have_prctl;
719✔
1560
        else
1561
                return !(have_capget || have_capset || have_prctl);
×
1562
}
1563

1564
static bool skip_seccomp_unavailable(const char *msg) {
15,196✔
1565
        assert(msg);
15,196✔
1566

1567
        if (is_seccomp_available())
15,196✔
1568
                return false;
1569

1570
        log_debug("SECCOMP features not detected in the kernel, skipping %s", msg);
×
1571
        return true;
1572
}
1573

1574
static int apply_syscall_filter(const ExecContext *c, const ExecParameters *p) {
9,830✔
1575
        uint32_t negative_action, default_action, action;
9,830✔
1576
        int r;
9,830✔
1577

1578
        assert(c);
9,830✔
1579
        assert(p);
9,830✔
1580

1581
        if (!context_has_syscall_filters(c))
9,830✔
1582
                return 0;
1583

1584
        if (skip_seccomp_unavailable("SystemCallFilter="))
1,519✔
1585
                return 0;
1586

1587
        negative_action = c->syscall_errno == SECCOMP_ERROR_NUMBER_KILL ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
1,519✔
1588

1589
        if (c->syscall_allow_list) {
1,519✔
1590
                default_action = negative_action;
1591
                action = SCMP_ACT_ALLOW;
1592
        } else {
1593
                default_action = SCMP_ACT_ALLOW;
1✔
1594
                action = negative_action;
1✔
1595
        }
1596

1597
        /* Sending over exec_fd or handoff_timestamp_fd requires write() syscall. */
1598
        if (p->exec_fd >= 0 || p->handoff_timestamp_fd >= 0) {
1,519✔
1599
                r = seccomp_filter_set_add_by_name(c->syscall_filter, c->syscall_allow_list, "write");
1,519✔
1600
                if (r < 0)
1,519✔
1601
                        return r;
1602
        }
1603

1604
        return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
1,519✔
1605
}
1606

1607
/* Installs the SystemCallLog= seccomp filter, if one is configured. Returns 0 when there is nothing to
 * do or seccomp/SCMP_ACT_LOG is unavailable, otherwise the result of loading the filter. */
static int apply_syscall_log(const ExecContext *c, const ExecParameters *p) {
        assert(c);
        assert(p);

        if (!context_has_syscall_logs(c))
                return 0;

#ifdef SCMP_ACT_LOG
        if (skip_seccomp_unavailable("SystemCallLog="))
                return 0;

        /* Allow-list mode: log only the listed syscalls. Otherwise: log everything except the listed
         * ones. */
        uint32_t unlisted_action = c->syscall_log_allow_list ? SCMP_ACT_ALLOW : SCMP_ACT_LOG;
        uint32_t listed_action = c->syscall_log_allow_list ? SCMP_ACT_LOG : SCMP_ACT_ALLOW;

        return seccomp_load_syscall_filter_set_raw(unlisted_action, c->syscall_log, listed_action, false);
#else
        /* The libseccomp we were built against predates SCMP_ACT_LOG. */
        log_debug("SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
        return 0;
#endif
}
1639

1640
static int apply_syscall_archs(const ExecContext *c, const ExecParameters *p) {
9,830✔
1641
        assert(c);
9,830✔
1642
        assert(p);
9,830✔
1643

1644
        if (set_isempty(c->syscall_archs))
9,830✔
1645
                return 0;
1646

1647
        if (skip_seccomp_unavailable("SystemCallArchitectures="))
1,536✔
1648
                return 0;
1649

1650
        return seccomp_restrict_archs(c->syscall_archs);
1,536✔
1651
}
1652

1653
static int apply_address_families(const ExecContext *c, const ExecParameters *p) {
9,830✔
1654
        assert(c);
9,830✔
1655
        assert(p);
9,830✔
1656

1657
        if (!context_has_address_families(c))
9,830✔
1658
                return 0;
1659

1660
        if (skip_seccomp_unavailable("RestrictAddressFamilies="))
1,533✔
1661
                return 0;
1662

1663
        return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list);
1,533✔
1664
}
1665

1666
static int apply_memory_deny_write_execute(const ExecContext *c, const ExecParameters *p) {
9,830✔
1667
        int r;
9,830✔
1668

1669
        assert(c);
9,830✔
1670
        assert(p);
9,830✔
1671

1672
        if (!c->memory_deny_write_execute)
9,830✔
1673
                return 0;
1674

1675
        /* use prctl() if kernel supports it (6.3) */
1676
        r = prctl(PR_SET_MDWE, PR_MDWE_REFUSE_EXEC_GAIN, 0, 0, 0);
1,533✔
1677
        if (r == 0) {
1,533✔
1678
                log_debug("Enabled MemoryDenyWriteExecute= with PR_SET_MDWE");
1,533✔
1679
                return 0;
1,533✔
1680
        }
1681
        if (r < 0 && errno != EINVAL)
×
1682
                return log_debug_errno(errno, "Failed to enable MemoryDenyWriteExecute= with PR_SET_MDWE: %m");
×
1683
        /* else use seccomp */
1684
        log_debug("Kernel doesn't support PR_SET_MDWE: falling back to seccomp");
×
1685

1686
        if (skip_seccomp_unavailable("MemoryDenyWriteExecute="))
×
1687
                return 0;
1688

1689
        return seccomp_memory_deny_write_execute();
×
1690
}
1691

1692
static int apply_restrict_realtime(const ExecContext *c, const ExecParameters *p) {
9,830✔
1693
        assert(c);
9,830✔
1694
        assert(p);
9,830✔
1695

1696
        if (!c->restrict_realtime)
9,830✔
1697
                return 0;
1698

1699
        if (skip_seccomp_unavailable("RestrictRealtime="))
1,533✔
1700
                return 0;
1701

1702
        return seccomp_restrict_realtime();
1,533✔
1703
}
1704

1705
static int apply_restrict_suid_sgid(const ExecContext *c, const ExecParameters *p) {
9,830✔
1706
        assert(c);
9,830✔
1707
        assert(p);
9,830✔
1708

1709
        if (!c->restrict_suid_sgid)
9,830✔
1710
                return 0;
1711

1712
        if (skip_seccomp_unavailable("RestrictSUIDSGID="))
1,453✔
1713
                return 0;
1714

1715
        return seccomp_restrict_suid_sgid();
1,453✔
1716
}
1717

1718
static int apply_protect_sysctl(const ExecContext *c, const ExecParameters *p) {
9,830✔
1719
        assert(c);
9,830✔
1720
        assert(p);
9,830✔
1721

1722
        /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1723
         * let's protect even those systems where this is left on in the kernel. */
1724

1725
        if (!c->protect_kernel_tunables)
9,830✔
1726
                return 0;
1727

1728
        if (skip_seccomp_unavailable("ProtectKernelTunables="))
376✔
1729
                return 0;
1730

1731
        return seccomp_protect_sysctl();
376✔
1732
}
1733

1734
static int apply_protect_kernel_modules(const ExecContext *c, const ExecParameters *p) {
9,830✔
1735
        assert(c);
9,830✔
1736
        assert(p);
9,830✔
1737

1738
        /* Turn off module syscalls on ProtectKernelModules=yes */
1739

1740
        if (!c->protect_kernel_modules)
9,830✔
1741
                return 0;
1742

1743
        if (skip_seccomp_unavailable("ProtectKernelModules="))
1,165✔
1744
                return 0;
1745

1746
        return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
1,165✔
1747
}
1748

1749
static int apply_protect_kernel_logs(const ExecContext *c, const ExecParameters *p) {
9,830✔
1750
        assert(c);
9,830✔
1751
        assert(p);
9,830✔
1752

1753
        if (!c->protect_kernel_logs)
9,830✔
1754
                return 0;
1755

1756
        if (skip_seccomp_unavailable("ProtectKernelLogs="))
1,165✔
1757
                return 0;
1758

1759
        return seccomp_protect_syslog();
1,165✔
1760
}
1761

1762
static int apply_protect_clock(const ExecContext *c, const ExecParameters *p) {
9,830✔
1763
        assert(c);
9,830✔
1764
        assert(p);
9,830✔
1765

1766
        if (!c->protect_clock)
9,830✔
1767
                return 0;
1768

1769
        if (skip_seccomp_unavailable("ProtectClock="))
864✔
1770
                return 0;
1771

1772
        return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
864✔
1773
}
1774

1775
static int apply_private_devices(const ExecContext *c, const ExecParameters *p) {
9,830✔
1776
        assert(c);
9,830✔
1777
        assert(p);
9,830✔
1778

1779
        /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1780

1781
        if (!c->private_devices)
9,830✔
1782
                return 0;
1783

1784
        if (skip_seccomp_unavailable("PrivateDevices="))
569✔
1785
                return 0;
1786

1787
        return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
569✔
1788
}
1789

1790
static int apply_restrict_namespaces(const ExecContext *c, const ExecParameters *p) {
9,830✔
1791
        assert(c);
9,830✔
1792
        assert(p);
9,830✔
1793

1794
        if (!exec_context_restrict_namespaces_set(c))
9,830✔
1795
                return 0;
1796

1797
        if (skip_seccomp_unavailable("RestrictNamespaces="))
1,274✔
1798
                return 0;
1799

1800
        return seccomp_restrict_namespaces(c->restrict_namespaces);
1,274✔
1801
}
1802

1803
static int apply_lock_personality(const ExecContext *c, const ExecParameters *p) {
9,830✔
1804
        unsigned long personality;
9,830✔
1805
        int r;
9,830✔
1806

1807
        assert(c);
9,830✔
1808
        assert(p);
9,830✔
1809

1810
        if (!c->lock_personality)
9,830✔
1811
                return 0;
9,830✔
1812

1813
        if (skip_seccomp_unavailable("LockPersonality="))
1,533✔
1814
                return 0;
1815

1816
        personality = c->personality;
1,533✔
1817

1818
        /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1819
        if (personality == PERSONALITY_INVALID) {
1,533✔
1820

1821
                r = opinionated_personality(&personality);
1,533✔
1822
                if (r < 0)
1,533✔
1823
                        return r;
1824
        }
1825

1826
        return seccomp_lock_personality(personality);
1,533✔
1827
}
1828

1829
#endif
1830

1831
#if HAVE_LIBBPF
1832
static int apply_restrict_filesystems(const ExecContext *c, const ExecParameters *p) {
9,830✔
1833
        int r;
9,830✔
1834

1835
        assert(c);
9,830✔
1836
        assert(p);
9,830✔
1837

1838
        if (!exec_context_restrict_filesystems_set(c))
9,830✔
1839
                return 0;
1840

1841
        if (p->bpf_restrict_fs_map_fd < 0) {
×
1842
                /* LSM BPF is unsupported or lsm_bpf_setup failed */
1843
                log_debug("LSM BPF not supported, skipping RestrictFileSystems=");
×
1844
                return 0;
×
1845
        }
1846

1847
        /* We are in a new binary, so dl-open again */
1848
        r = dlopen_bpf();
×
1849
        if (r < 0)
×
1850
                return r;
1851

1852
        return bpf_restrict_fs_update(c->restrict_filesystems, p->cgroup_id, p->bpf_restrict_fs_map_fd, c->restrict_filesystems_allow_list);
×
1853
}
1854
#endif
1855

1856
static int apply_protect_hostname(const ExecContext *c, const ExecParameters *p, int *ret_exit_status) {
9,833✔
1857
        int r;
9,833✔
1858

1859
        assert(c);
9,833✔
1860
        assert(p);
9,833✔
1861
        assert(ret_exit_status);
9,833✔
1862

1863
        if (c->protect_hostname == PROTECT_HOSTNAME_NO)
9,833✔
1864
                return 0;
1865

1866
        if (namespace_type_supported(NAMESPACE_UTS)) {
682✔
1867
                if (unshare(CLONE_NEWUTS) < 0) {
682✔
1868
                        if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
×
1869
                                *ret_exit_status = EXIT_NAMESPACE;
×
1870
                                return log_error_errno(errno, "Failed to set up UTS namespacing: %m");
×
1871
                        }
1872

1873
                        log_warning("ProtectHostname=%s is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.",
×
1874
                                    protect_hostname_to_string(c->protect_hostname));
1875

1876
                } else if (c->private_hostname) {
682✔
1877
                        r = sethostname_idempotent(c->private_hostname);
4✔
1878
                        if (r < 0) {
4✔
1879
                                *ret_exit_status = EXIT_NAMESPACE;
×
1880
                                return log_error_errno(r, "Failed to set private hostname '%s': %m", c->private_hostname);
×
1881
                        }
1882
                }
1883
        } else
1884
                log_warning("ProtectHostname=%s is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.",
×
1885
                            protect_hostname_to_string(c->protect_hostname));
1886

1887
#if HAVE_SECCOMP
1888
        if (c->protect_hostname == PROTECT_HOSTNAME_YES) {
682✔
1889
                if (skip_seccomp_unavailable("ProtectHostname="))
676✔
1890
                        return 0;
1891

1892
                r = seccomp_protect_hostname();
676✔
1893
                if (r < 0) {
676✔
1894
                        *ret_exit_status = EXIT_SECCOMP;
×
1895
                        return log_error_errno(r, "Failed to apply hostname restrictions: %m");
×
1896
                }
1897
        }
1898
#endif
1899

1900
        return 1;
1901
}
1902

1903
static void do_idle_pipe_dance(int idle_pipe[static 4]) {
154✔
1904
        assert(idle_pipe);
154✔
1905

1906
        idle_pipe[1] = safe_close(idle_pipe[1]);
154✔
1907
        idle_pipe[2] = safe_close(idle_pipe[2]);
154✔
1908

1909
        if (idle_pipe[0] >= 0) {
154✔
1910
                int r;
154✔
1911

1912
                r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
154✔
1913

1914
                if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
154✔
1915
                        ssize_t n;
110✔
1916

1917
                        /* Signal systemd that we are bored and want to continue. */
1918
                        n = write(idle_pipe[3], "x", 1);
110✔
1919
                        if (n > 0)
110✔
1920
                                /* Wait for systemd to react to the signal above. */
1921
                                (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
110✔
1922
                }
1923

1924
                idle_pipe[0] = safe_close(idle_pipe[0]);
154✔
1925

1926
        }
1927

1928
        idle_pipe[3] = safe_close(idle_pipe[3]);
154✔
1929
}
154✔
1930

1931
static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1932

1933
/* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
1934
 * the service payload in. */
1935
static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
1936
        [EXEC_DIRECTORY_RUNTIME]       = "RUNTIME_DIRECTORY",
1937
        [EXEC_DIRECTORY_STATE]         = "STATE_DIRECTORY",
1938
        [EXEC_DIRECTORY_CACHE]         = "CACHE_DIRECTORY",
1939
        [EXEC_DIRECTORY_LOGS]          = "LOGS_DIRECTORY",
1940
        [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
1941
};
1942

1943
DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
2,557✔
1944

1945
static int build_environment(
9,856✔
1946
                const ExecContext *c,
1947
                const ExecParameters *p,
1948
                const CGroupContext *cgroup_context,
1949
                size_t n_fds,
1950
                const char *home,
1951
                const char *username,
1952
                const char *shell,
1953
                dev_t journal_stream_dev,
1954
                ino_t journal_stream_ino,
1955
                const char *memory_pressure_path,
1956
                bool needs_sandboxing,
1957
                char ***ret) {
1958

1959
        _cleanup_strv_free_ char **our_env = NULL;
9,856✔
1960
        size_t n_env = 0;
9,856✔
1961
        char *x;
9,856✔
1962
        int r;
9,856✔
1963

1964
        assert(c);
9,856✔
1965
        assert(p);
9,856✔
1966
        assert(cgroup_context);
9,856✔
1967
        assert(ret);
9,856✔
1968

1969
#define N_ENV_VARS 19
1970
        our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX + 1);
9,856✔
1971
        if (!our_env)
9,856✔
1972
                return -ENOMEM;
1973

1974
        if (n_fds > 0) {
9,856✔
1975
                _cleanup_free_ char *joined = NULL;
1,574✔
1976

1977
                if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1,574✔
1978
                        return -ENOMEM;
1979
                our_env[n_env++] = x;
1,574✔
1980

1981
                if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
1,574✔
1982
                        return -ENOMEM;
1983
                our_env[n_env++] = x;
1,574✔
1984

1985
                joined = strv_join(p->fd_names, ":");
1,574✔
1986
                if (!joined)
1,574✔
1987
                        return -ENOMEM;
1988

1989
                x = strjoin("LISTEN_FDNAMES=", joined);
1,574✔
1990
                if (!x)
1,574✔
1991
                        return -ENOMEM;
1992
                our_env[n_env++] = x;
1,574✔
1993
        }
1994

1995
        if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
9,856✔
1996
                if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1,527✔
1997
                        return -ENOMEM;
1998
                our_env[n_env++] = x;
1,527✔
1999

2000
                if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1,527✔
2001
                        return -ENOMEM;
2002
                our_env[n_env++] = x;
1,527✔
2003
        }
2004

2005
        /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use blocking
2006
         * Varlink calls back to us for look up dynamic users in PID 1. Break the deadlock between D-Bus and
2007
         * PID 1 by disabling use of PID1' NSS interface for looking up dynamic users. */
2008
        if (p->flags & EXEC_NSS_DYNAMIC_BYPASS) {
9,856✔
2009
                x = strdup("SYSTEMD_NSS_DYNAMIC_BYPASS=1");
123✔
2010
                if (!x)
123✔
2011
                        return -ENOMEM;
2012
                our_env[n_env++] = x;
123✔
2013
        }
2014

2015
        /* We query "root" if this is a system unit and User= is not specified. $USER is always set. $HOME
2016
         * could cause problem for e.g. getty, since login doesn't override $HOME, and $LOGNAME and $SHELL don't
2017
         * really make much sense since we're not logged in. Hence we conditionalize the three based on
2018
         * SetLoginEnvironment= switch. */
2019
        if (!username && !c->dynamic_user && p->runtime_scope == RUNTIME_SCOPE_SYSTEM) {
9,856✔
2020
                assert(!c->user);
6,983✔
2021

2022
                r = get_fixed_user("root", /* prefer_nss = */ false, &username, NULL, NULL, &home, &shell);
6,983✔
2023
                if (r < 0)
6,983✔
2024
                        return log_debug_errno(r, "Failed to determine user credentials for root: %m");
×
2025
        }
2026

2027
        bool set_user_login_env = exec_context_get_set_login_environment(c);
9,856✔
2028

2029
        if (username) {
9,856✔
2030
                x = strjoin("USER=", username);
9,058✔
2031
                if (!x)
9,058✔
2032
                        return -ENOMEM;
2033
                our_env[n_env++] = x;
9,058✔
2034

2035
                if (set_user_login_env) {
9,058✔
2036
                        x = strjoin("LOGNAME=", username);
2,071✔
2037
                        if (!x)
2,071✔
2038
                                return -ENOMEM;
2039
                        our_env[n_env++] = x;
2,071✔
2040
                }
2041
        }
2042

2043
        /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
2044
         * (i.e. are "/" or "/bin/nologin"). */
2045

2046
        if (home && set_user_login_env && !empty_or_root(home)) {
9,856✔
2047
                x = strjoin("HOME=", home);
418✔
2048
                if (!x)
418✔
2049
                        return -ENOMEM;
2050

2051
                path_simplify(x + 5);
418✔
2052
                our_env[n_env++] = x;
418✔
2053
        }
2054

2055
        if (shell && set_user_login_env && !shell_is_placeholder(shell)) {
9,856✔
2056
                x = strjoin("SHELL=", shell);
420✔
2057
                if (!x)
420✔
2058
                        return -ENOMEM;
2059

2060
                path_simplify(x + 6);
420✔
2061
                our_env[n_env++] = x;
420✔
2062
        }
2063

2064
        if (!sd_id128_is_null(p->invocation_id)) {
9,856✔
2065
                assert(p->invocation_id_string);
9,856✔
2066

2067
                x = strjoin("INVOCATION_ID=", p->invocation_id_string);
9,856✔
2068
                if (!x)
9,856✔
2069
                        return -ENOMEM;
2070

2071
                our_env[n_env++] = x;
9,856✔
2072
        }
2073

2074
        if (journal_stream_dev != 0 && journal_stream_ino != 0) {
9,856✔
2075
                if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
9,057✔
2076
                        return -ENOMEM;
2077

2078
                our_env[n_env++] = x;
9,057✔
2079
        }
2080

2081
        if (c->log_namespace) {
9,856✔
2082
                x = strjoin("LOG_NAMESPACE=", c->log_namespace);
2✔
2083
                if (!x)
2✔
2084
                        return -ENOMEM;
2085

2086
                our_env[n_env++] = x;
2✔
2087
        }
2088

2089
        for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
59,136✔
2090
                _cleanup_free_ char *joined = NULL;
49,280✔
2091
                const char *n;
49,280✔
2092

2093
                if (!p->prefix[t])
49,280✔
2094
                        continue;
×
2095

2096
                if (c->directories[t].n_items == 0)
49,280✔
2097
                        continue;
46,723✔
2098

2099
                n = exec_directory_env_name_to_string(t);
2,557✔
2100
                if (!n)
2,557✔
2101
                        continue;
×
2102

2103
                for (size_t i = 0; i < c->directories[t].n_items; i++) {
5,614✔
2104
                        _cleanup_free_ char *prefixed = NULL;
3,057✔
2105

2106
                        prefixed = path_join(p->prefix[t], c->directories[t].items[i].path);
3,057✔
2107
                        if (!prefixed)
3,057✔
2108
                                return -ENOMEM;
2109

2110
                        if (!strextend_with_separator(&joined, ":", prefixed))
3,057✔
2111
                                return -ENOMEM;
2112
                }
2113

2114
                x = strjoin(n, "=", joined);
2,557✔
2115
                if (!x)
2,557✔
2116
                        return -ENOMEM;
2117

2118
                our_env[n_env++] = x;
2,557✔
2119
        }
2120

2121
        _cleanup_free_ char *creds_dir = NULL;
9,856✔
2122
        r = exec_context_get_credential_directory(c, p, p->unit_id, &creds_dir);
9,856✔
2123
        if (r < 0)
9,856✔
2124
                return r;
2125
        if (r > 0) {
9,856✔
2126
                x = strjoin("CREDENTIALS_DIRECTORY=", creds_dir);
1,989✔
2127
                if (!x)
1,989✔
2128
                        return -ENOMEM;
2129

2130
                our_env[n_env++] = x;
1,989✔
2131
        }
2132

2133
        if (asprintf(&x, "SYSTEMD_EXEC_PID=" PID_FMT, getpid_cached()) < 0)
9,856✔
2134
                return -ENOMEM;
2135

2136
        our_env[n_env++] = x;
9,856✔
2137

2138
        if (memory_pressure_path) {
9,856✔
2139
                x = strjoin("MEMORY_PRESSURE_WATCH=", memory_pressure_path);
9,451✔
2140
                if (!x)
9,451✔
2141
                        return -ENOMEM;
2142

2143
                our_env[n_env++] = x;
9,451✔
2144

2145
                if (!path_equal(memory_pressure_path, "/dev/null")) {
9,451✔
2146
                        _cleanup_free_ char *b = NULL, *e = NULL;
9,451✔
2147

2148
                        if (asprintf(&b, "%s " USEC_FMT " " USEC_FMT,
9,451✔
2149
                                     MEMORY_PRESSURE_DEFAULT_TYPE,
2150
                                     cgroup_context->memory_pressure_threshold_usec == USEC_INFINITY ? MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC :
9,451✔
2151
                                     CLAMP(cgroup_context->memory_pressure_threshold_usec, 1U, MEMORY_PRESSURE_DEFAULT_WINDOW_USEC),
9,451✔
2152
                                     MEMORY_PRESSURE_DEFAULT_WINDOW_USEC) < 0)
2153
                                return -ENOMEM;
2154

2155
                        if (base64mem(b, strlen(b) + 1, &e) < 0)
9,451✔
2156
                                return -ENOMEM;
2157

2158
                        x = strjoin("MEMORY_PRESSURE_WRITE=", e);
9,451✔
2159
                        if (!x)
9,451✔
2160
                                return -ENOMEM;
2161

2162
                        our_env[n_env++] = x;
9,451✔
2163
                }
2164
        }
2165

2166
        if (p->notify_socket) {
9,856✔
2167
                x = strjoin("NOTIFY_SOCKET=", exec_get_private_notify_socket_path(c, p, needs_sandboxing) ?: p->notify_socket);
1,939✔
2168
                if (!x)
1,939✔
2169
                        return -ENOMEM;
2170

2171
                our_env[n_env++] = x;
1,939✔
2172
        }
2173

2174
        assert(c->private_var_tmp >= 0 && c->private_var_tmp < _PRIVATE_TMP_MAX);
9,856✔
2175
        if (needs_sandboxing && c->private_tmp != c->private_var_tmp) {
9,856✔
2176
                assert(c->private_tmp == PRIVATE_TMP_DISCONNECTED);
289✔
2177
                assert(c->private_var_tmp == PRIVATE_TMP_NO);
289✔
2178

2179
                /* When private tmpfs is enabled only on /tmp/, then explicitly set $TMPDIR to suggest the
2180
                 * service to use /tmp/. */
2181

2182
                x = strdup("TMPDIR=/tmp");
289✔
2183
                if (!x)
289✔
2184
                        return -ENOMEM;
2185

2186
                our_env[n_env++] = x;
289✔
2187
        }
2188

2189
        assert(n_env <= N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
9,856✔
2190
#undef N_ENV_VARS
2191

2192
        *ret = TAKE_PTR(our_env);
9,856✔
2193

2194
        return 0;
9,856✔
2195
}
2196

2197
static int build_pass_environment(const ExecContext *c, char ***ret) {
9,856✔
2198
        _cleanup_strv_free_ char **pass_env = NULL;
9,856✔
2199
        size_t n_env = 0;
9,856✔
2200

2201
        assert(c);
9,856✔
2202
        assert(ret);
9,856✔
2203

2204
        STRV_FOREACH(i, c->pass_environment) {
10,162✔
2205
                _cleanup_free_ char *x = NULL;
×
2206
                char *v;
306✔
2207

2208
                v = getenv(*i);
306✔
2209
                if (!v)
306✔
2210
                        continue;
×
2211
                x = strjoin(*i, "=", v);
306✔
2212
                if (!x)
306✔
2213
                        return -ENOMEM;
2214

2215
                if (!GREEDY_REALLOC(pass_env, n_env + 2))
306✔
2216
                        return -ENOMEM;
2217

2218
                pass_env[n_env++] = TAKE_PTR(x);
306✔
2219
                pass_env[n_env] = NULL;
306✔
2220
        }
2221

2222
        *ret = TAKE_PTR(pass_env);
9,856✔
2223
        return 0;
9,856✔
2224
}
2225

2226
/* Implements PrivateUsers=: moves the calling process into a new user namespace and has a helper
 * child, which stays behind in the original namespace, write the setgroups policy and the GID/UID maps.
 *
 * Returns 0 if PrivateUsers= is off, 1 once the namespace is set up, and a negative errno-style value
 * on failure. */
static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogid, uid_t uid, gid_t gid, bool allow_setgroups) {
        _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
        _cleanup_close_pair_ int errno_pipe[2] = EBADF_PAIR;
        _cleanup_close_ int unshare_ready_fd = -EBADF;
        _cleanup_(sigkill_waitp) pid_t pid = 0;
        uint64_t token = 1;
        ssize_t n_read;
        int r;

        /* Open a user namespace in which the original UID/GID (the IDs from before any user or group
         * change, i.e. those of the user or system manager) map to themselves, the selected UID/GID
         * map to themselves, and everything else maps to nobody. Writing such a mapping requires
         * CAP_SETUID in the original user namespace, which is gone as soon as the new namespace has
         * been opened. Hence a temporary child is forked off which remains in the original namespace,
         * waits until the parent has created the new one, and then writes the maps with full
         * privileges. The parent waits for the child to finish and then carries on as usual.
         *
         * Unprivileged users (i.e. those without capabilities) get no root-to-root mapping, so that no
         * CAP_SETUID is needed for writing the single line that maps the user to itself. */

        if (private_users == PRIVATE_USERS_NO)
                return 0;

        if (private_users == PRIVATE_USERS_IDENTITY) {
                uid_map = strdup("0 0 65536\n");
                if (!uid_map)
                        return -ENOMEM;

                gid_map = strdup("0 0 65536\n");
                if (!gid_map)
                        return -ENOMEM;

        } else if (private_users == PRIVATE_USERS_FULL) {
                /* Map every UID/GID of the original namespace into the new one. A plain
                 * `0 0 UINT32_MAX` cannot be used, since that is exactly the map of the init user
                 * namespace, and systemd's running_in_userns() detects user namespaces by comparing
                 * uid_map/gid_map against `0 0 UINT32_MAX`. All IDs are hence still mapped, but split
                 * over two extents so that the new namespace is distinguishable from the init one:
                 *   0 0 1
                 *   1 1 UINT32_MAX - 1
                 *
                 * systemd drops that heuristic in running_in_userns() in favour of namespace inodes
                 * with version 258 (PR #35382). Container images may still ship an older systemd
                 * < 258 though, hence this uid_map/gid_map hack stays until version 259, for version
                 * N-1 compatibility.
                 *
                 * TODO: Switch to `0 0 UINT32_MAX` in systemd v259.
                 *
                 * The kernel defines the UID range as 0…UINT32_MAX, hence everything is mapped, even
                 * though the range beyond INT32_MAX (i.e. above the signed 32-bit range) is icky:
                 * setfsuid() for example returns the old UID as a signed integer. Units may still
                 * decide to use such UIDs/GIDs, so they need to be mapped. */
                r = asprintf(&uid_map, "0 0 1\n"
                                       "1 1 " UID_FMT "\n", (uid_t) (UINT32_MAX - 1));
                if (r < 0)
                        return -ENOMEM;

                r = asprintf(&gid_map, "0 0 1\n"
                                       "1 1 " GID_FMT "\n", (gid_t) (UINT32_MAX - 1));
                if (r < 0)
                        return -ENOMEM;

        } else {
                /* More than one mapping line can only be set up with CAP_SETUID. */
                if (have_effective_cap(CAP_SETUID) > 0 && uid != ouid && uid_is_valid(uid))
                        r = asprintf(&uid_map,
                                     UID_FMT " " UID_FMT " 1\n"     /* Map $OUID → $OUID */
                                     UID_FMT " " UID_FMT " 1\n",    /* Map $UID → $UID */
                                     ouid, ouid, uid, uid);
                else
                        r = asprintf(&uid_map,
                                     UID_FMT " " UID_FMT " 1\n",    /* Map $OUID → $OUID */
                                     ouid, ouid);
                if (r < 0)
                        return -ENOMEM;

                /* More than one mapping line can only be set up with CAP_SETGID. */
                if (have_effective_cap(CAP_SETGID) > 0 && gid != ogid && gid_is_valid(gid))
                        r = asprintf(&gid_map,
                                     GID_FMT " " GID_FMT " 1\n"     /* Map $OGID → $OGID */
                                     GID_FMT " " GID_FMT " 1\n",    /* Map $GID → $GID */
                                     ogid, ogid, gid, gid);
                else
                        r = asprintf(&gid_map,
                                     GID_FMT " " GID_FMT " 1\n",    /* Map $OGID -> $OGID */
                                     ogid, ogid);
                if (r < 0)
                        return -ENOMEM;
        }

        /* Channel through which the parent tells the child that the user namespace exists. */
        unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
        if (unshare_ready_fd < 0)
                return -errno;

        /* Channel through which the child reports a proper error code back to the parent. */
        if (pipe2(errno_pipe, O_CLOEXEC) < 0)
                return -errno;

        r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGKILL, &pid);
        if (r < 0)
                return r;
        if (r == 0) {
                _cleanup_close_ int fd = -EBADF;
                const char *path;
                pid_t ppid;

                /* Child, still in the original user namespace: the parent's UID/GID maps are written
                 * from here once the parent has opened its own user namespace. */

                ppid = getppid();
                errno_pipe[0] = safe_close(errno_pipe[0]);

                /* Block until the parent has unshared its user namespace. */
                if (read(unshare_ready_fd, &token, sizeof(token)) < 0)
                        report_errno_and_exit(errno_pipe[1], -errno);

                /* Disable setgroups() in the child user namespace for good, unless PrivateUsers=full
                 * is used with the system service manager. */
                path = procfs_file_alloca(ppid, "setgroups");
                fd = open(path, O_WRONLY|O_CLOEXEC);
                if (fd >= 0) {
                        const char *policy = allow_setgroups ? "allow\n" : "deny\n";

                        if (write(fd, policy, strlen(policy)) < 0) {
                                r = log_debug_errno(errno, "Failed to write '%s' to %s: %m", policy, path);
                                report_errno_and_exit(errno_pipe[1], r);
                        }

                        fd = safe_close(fd);
                } else if (errno != ENOENT) {
                        r = log_debug_errno(errno, "Failed to open %s: %m", path);
                        report_errno_and_exit(errno_pipe[1], r);
                }
                /* else: the file is missing because the kernel is too old; carry on regardless. */

                /* The GID map goes first ... */
                path = procfs_file_alloca(ppid, "gid_map");
                fd = open(path, O_WRONLY|O_CLOEXEC);
                if (fd < 0) {
                        r = log_debug_errno(errno, "Failed to open %s: %m", path);
                        report_errno_and_exit(errno_pipe[1], r);
                }

                if (write(fd, gid_map, strlen(gid_map)) < 0) {
                        r = log_debug_errno(errno, "Failed to write GID map to %s: %m", path);
                        report_errno_and_exit(errno_pipe[1], r);
                }

                fd = safe_close(fd);

                /* ... then the UID map. */
                path = procfs_file_alloca(ppid, "uid_map");
                fd = open(path, O_WRONLY|O_CLOEXEC);
                if (fd < 0) {
                        r = log_debug_errno(errno, "Failed to open %s: %m", path);
                        report_errno_and_exit(errno_pipe[1], r);
                }

                if (write(fd, uid_map, strlen(uid_map)) < 0) {
                        r = log_debug_errno(errno, "Failed to write UID map to %s: %m", path);
                        report_errno_and_exit(errno_pipe[1], r);
                }

                _exit(EXIT_SUCCESS);
        }

        /* Parent: only the read end of the error pipe is needed from here on. */
        errno_pipe[1] = safe_close(errno_pipe[1]);

        if (unshare(CLONE_NEWUSER) < 0)
                return log_debug_errno(errno, "Failed to unshare user namespace: %m");

        /* Tell the child that the namespace is ready. */
        if (write(unshare_ready_fd, &token, sizeof(token)) < 0)
                return -errno;

        /* See whether the child sent an error code. */
        n_read = read(errno_pipe[0], &r, sizeof(r));
        if (n_read < 0)
                return -errno;
        if (n_read == sizeof(r)) /* an error code arrived; a non-negative one makes no sense */
                return r < 0 ? r : -EIO;
        if (n_read != 0) /* on success nothing is written, i.e. we must have hit EOF right away */
                return -EIO;

        r = wait_for_terminate_and_check("(sd-userns)", TAKE_PID(pid), 0);
        if (r < 0)
                return r;
        if (r != EXIT_SUCCESS) /* anything odd happening to the child is treated as fatal, too */
                return -EIO;

        return 1;
}
2426

2427
static int can_mount_proc(void) {
9✔
2428
        _cleanup_close_pair_ int errno_pipe[2] = EBADF_PAIR;
5✔
2429
        _cleanup_(sigkill_waitp) pid_t pid = 0;
×
2430
        ssize_t n;
9✔
2431
        int r;
9✔
2432

2433
        /* If running via unprivileged user manager and /proc/ is masked (e.g. /proc/kmsg is over-mounted with tmpfs
2434
         * like systemd-nspawn does), then mounting /proc/ will fail with EPERM. This is due to a kernel restriction
2435
         * where unprivileged user namespaces cannot mount a less restrictive instance of /proc. */
2436

2437
        /* Create a communication channel so that the child can tell the parent a proper error code in case it
2438
         * failed. */
2439
        if (pipe2(errno_pipe, O_CLOEXEC) < 0)
9✔
2440
                return log_debug_errno(errno, "Failed to create pipe for communicating with child process (sd-proc-check): %m");
×
2441

2442
        /* Fork a child process into its own mount and PID namespace. Note safe_fork() already remounts / as SLAVE
2443
         * with FORK_MOUNTNS_SLAVE. */
2444
        r = safe_fork("(sd-proc-check)",
9✔
2445
                      FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGKILL|FORK_NEW_MOUNTNS|FORK_MOUNTNS_SLAVE|FORK_NEW_PIDNS, &pid);
2446
        if (r < 0)
9✔
2447
                return log_debug_errno(r, "Failed to fork child process (sd-proc-check): %m");
×
2448
        if (r == 0) {
9✔
2449
                errno_pipe[0] = safe_close(errno_pipe[0]);
4✔
2450

2451
                /* Try mounting /proc on /dev/shm/. No need to clean up the mount since the mount
2452
                 * namespace will be cleaned up once the process exits. */
2453
                r = mount_follow_verbose(LOG_DEBUG, "proc", "/dev/shm/", "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
4✔
2454
                if (r < 0) {
4✔
UNCOV
2455
                        (void) write(errno_pipe[1], &r, sizeof(r));
×
UNCOV
2456
                        _exit(EXIT_FAILURE);
×
2457
                }
2458

2459
                _exit(EXIT_SUCCESS);
4✔
2460
        }
2461

2462
        errno_pipe[1] = safe_close(errno_pipe[1]);
5✔
2463

2464
        /* Try to read an error code from the child */
2465
        n = read(errno_pipe[0], &r, sizeof(r));
5✔
2466
        if (n < 0)
5✔
2467
                return log_debug_errno(errno, "Failed to read errno from pipe with child process (sd-proc-check): %m");
×
2468
        if (n == sizeof(r)) { /* an error code was sent to us */
5✔
2469
                /* This is the expected case where proc cannot be mounted due to permissions. */
2470
                if (ERRNO_IS_NEG_PRIVILEGE(r))
5✔
2471
                        return 0;
2472
                if (r < 0)
×
2473
                        return r;
2474

2475
                return -EIO;
×
2476
        }
2477
        if (n != 0) /* on success we should have read 0 bytes */
4✔
2478
                return -EIO;
2479

2480
        r = wait_for_terminate_and_check("(sd-proc-check)", TAKE_PID(pid), 0 /* flags= */);
4✔
2481
        if (r < 0)
4✔
2482
                return log_debug_errno(r, "Failed to wait for (sd-proc-check) child process to terminate: %m");
×
2483
        if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
4✔
2484
                return log_debug_errno(SYNTHETIC_ERRNO(EIO), "Child process (sd-proc-check) exited with unexpected exit status '%d'.", r);
×
2485

2486
        return 1;
2487
}
2488

2489
/* Forks a child into a new PID namespace and hands the child's pidref over to the manager via
 * p->pidref_transport_fd.
 *
 * The forking (parent) process never returns from this function: it sends the pidref, reports the result to
 * the child via a pipe and exits. The child returns the non-negative result on success, or a negative
 * errno-style error if forking, the pipe communication or the pidref transfer failed. */
static int setup_private_pids(const ExecContext *c, ExecParameters *p) {
        _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
        _cleanup_close_pair_ int errno_pipe[2] = EBADF_PAIR;
        int parent_error = 0, r;
        ssize_t l;

        assert(c);
        assert(p);
        assert(p->pidref_transport_fd >= 0);

        /* After a PID namespace is unshared, the first process created becomes PID 1 of that namespace.
         * Hence we need to fork after unsharing in order to become PID 1. The parent passes the child's
         * pidref to the manager and then exits, while the child carries on with the remainder of
         * exec_invoke() and eventually executes the actual payload. */

        /* Set up a pipe over which the parent can tell the child a proper error code, in case sending the
         * child pidref to the manager failed. */
        if (pipe2(errno_pipe, O_CLOEXEC) < 0)
                return log_debug_errno(errno, "Failed to create pipe for communicating with parent process: %m");

        /* FORK_DETACH makes sure the child is immediately re-parented to the invoking manager process. */
        r = pidref_safe_fork("(sd-pidns-child)", FORK_NEW_PIDNS|FORK_DETACH, &pidref);
        if (r < 0)
                return log_debug_errno(r, "Failed to fork child into new pid namespace: %m");
        if (r > 0) {
                int sent;

                errno_pipe[0] = safe_close(errno_pipe[0]);

                /* We are the parent: pass the child pidref to the manager, then exit. If PIDFD is not
                 * supported only the child PID is transferred, and the server uses that PID to set the new
                 * exec main process. */
                sent = send_one_fd_iov(
                                p->pidref_transport_fd,
                                pidref.fd,
                                &IOVEC_MAKE(&pidref.pid, sizeof(pidref.pid)),
                                /*iovlen=*/ 1,
                                /*flags=*/ 0);

                /* Let the child know how that went. */
                (void) write(errno_pipe[1], &sent, sizeof(sent));

                /* Exit right here, so that the destructors in exec_invoke run only once, i.e. in the child,
                 * as some of them have external effects. The main codepaths continue in the child. */
                _exit(sent < 0 ? EXIT_FAILURE : EXIT_SUCCESS);
        }

        errno_pipe[1] = safe_close(errno_pipe[1]);
        p->pidref_transport_fd = safe_close(p->pidref_transport_fd);

        /* Read the result from the parent. A child cannot wait for its parent, hence the parent always
         * sends us a value, even on success. */
        l = read(errno_pipe[0], &parent_error, sizeof(parent_error));
        if (l < 0)
                return log_debug_errno(errno, "Failed to read errno from pipe with parent process: %m");
        if (l != sizeof(parent_error))
                return log_debug_errno(SYNTHETIC_ERRNO(EIO), "Failed to read enough bytes from pipe with parent process");
        if (parent_error < 0)
                return log_debug_errno(parent_error, "Failed to send child pidref to manager: %m");

        /* NOTE! We get here in the child process only. */
        return parent_error;
}
2548

2549
static int create_many_symlinks(const char *root, const char *source, char **symlinks) {
1,568✔
2550
        _cleanup_free_ char *src_abs = NULL;
1,568✔
2551
        int r;
1,568✔
2552

2553
        assert(source);
1,568✔
2554

2555
        src_abs = path_join(root, source);
1,568✔
2556
        if (!src_abs)
1,568✔
2557
                return -ENOMEM;
2558

2559
        STRV_FOREACH(dst, symlinks) {
1,581✔
2560
                _cleanup_free_ char *dst_abs = NULL;
13✔
2561

2562
                dst_abs = path_join(root, *dst);
13✔
2563
                if (!dst_abs)
13✔
2564
                        return -ENOMEM;
2565

2566
                r = mkdir_parents_label(dst_abs, 0755);
13✔
2567
                if (r < 0)
13✔
2568
                        return r;
2569

2570
                r = symlink_idempotent(src_abs, dst_abs, true);
13✔
2571
                if (r < 0)
13✔
2572
                        return r;
2573
        }
2574

2575
        return 0;
2576
}
2577

2578
static int setup_exec_directory(
59,251✔
2579
                const ExecContext *context,
2580
                const ExecParameters *params,
2581
                uid_t uid,
2582
                gid_t gid,
2583
                ExecDirectoryType type,
2584
                bool needs_mount_namespace,
2585
                int *exit_status) {
2586

2587
        static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
59,251✔
2588
                [EXEC_DIRECTORY_RUNTIME]       = EXIT_RUNTIME_DIRECTORY,
2589
                [EXEC_DIRECTORY_STATE]         = EXIT_STATE_DIRECTORY,
2590
                [EXEC_DIRECTORY_CACHE]         = EXIT_CACHE_DIRECTORY,
2591
                [EXEC_DIRECTORY_LOGS]          = EXIT_LOGS_DIRECTORY,
2592
                [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2593
        };
2594
        int r;
59,251✔
2595

2596
        assert(context);
59,251✔
2597
        assert(params);
59,251✔
2598
        assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
59,251✔
2599
        assert(exit_status);
59,251✔
2600

2601
        if (!params->prefix[type])
59,251✔
2602
                return 0;
2603

2604
        if (params->flags & EXEC_CHOWN_DIRECTORIES) {
59,251✔
2605
                if (!uid_is_valid(uid))
55,256✔
2606
                        uid = 0;
41,541✔
2607
                if (!gid_is_valid(gid))
55,256✔
2608
                        gid = 0;
41,521✔
2609
        }
2610

2611
        FOREACH_ARRAY(i, context->directories[type].items, context->directories[type].n_items) {
63,044✔
2612
                _cleanup_free_ char *p = NULL, *pp = NULL;
3,794✔
2613

2614
                p = path_join(params->prefix[type], i->path);
3,794✔
2615
                if (!p) {
3,794✔
2616
                        r = -ENOMEM;
×
2617
                        goto fail;
×
2618
                }
2619

2620
                r = mkdir_parents_label(p, 0755);
3,794✔
2621
                if (r < 0)
3,794✔
2622
                        goto fail;
×
2623

2624
                if (IN_SET(type, EXEC_DIRECTORY_STATE, EXEC_DIRECTORY_LOGS) && params->runtime_scope == RUNTIME_SCOPE_USER) {
3,794✔
2625

2626
                        /* If we are in user mode, and a configuration directory exists but a state directory
2627
                         * doesn't exist, then we likely are upgrading from an older systemd version that
2628
                         * didn't know the more recent addition to the xdg-basedir spec: the $XDG_STATE_HOME
2629
                         * directory. In older systemd versions EXEC_DIRECTORY_STATE was aliased to
2630
                         * EXEC_DIRECTORY_CONFIGURATION, with the advent of $XDG_STATE_HOME it is now
2631
                         * separated. If a service has both dirs configured but only the configuration dir
2632
                         * exists and the state dir does not, we assume we are looking at an update
2633
                         * situation. Hence, create a compatibility symlink, so that all expectations are
2634
                         * met.
2635
                         *
2636
                         * (We also do something similar with the log directory, which still doesn't exist in
2637
                         * the xdg basedir spec. We'll make it a subdir of the state dir.) */
2638

2639
                        /* this assumes the state dir is always created before the configuration dir */
2640
                        assert_cc(EXEC_DIRECTORY_STATE < EXEC_DIRECTORY_LOGS);
7✔
2641
                        assert_cc(EXEC_DIRECTORY_LOGS < EXEC_DIRECTORY_CONFIGURATION);
7✔
2642

2643
                        r = access_nofollow(p, F_OK);
7✔
2644
                        if (r == -ENOENT) {
7✔
2645
                                _cleanup_free_ char *q = NULL;
3✔
2646

2647
                                /* OK, we know that the state dir does not exist. Let's see if the dir exists
2648
                                 * under the configuration hierarchy. */
2649

2650
                                if (type == EXEC_DIRECTORY_STATE)
3✔
2651
                                        q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], i->path);
3✔
2652
                                else if (type == EXEC_DIRECTORY_LOGS)
×
2653
                                        q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], "log", i->path);
×
2654
                                else
2655
                                        assert_not_reached();
×
2656
                                if (!q) {
3✔
2657
                                        r = -ENOMEM;
×
2658
                                        goto fail;
×
2659
                                }
2660

2661
                                r = access_nofollow(q, F_OK);
3✔
2662
                                if (r >= 0) {
3✔
2663
                                        /* It does exist! This hence looks like an update. Symlink the
2664
                                         * configuration directory into the state directory. */
2665

2666
                                        r = symlink_idempotent(q, p, /* make_relative= */ true);
1✔
2667
                                        if (r < 0)
1✔
2668
                                                goto fail;
×
2669

2670
                                        log_notice("Unit state directory %s missing but matching configuration directory %s exists, assuming update from systemd 253 or older, creating compatibility symlink.", p, q);
1✔
2671
                                        continue;
1✔
2672
                                } else if (r != -ENOENT)
2✔
2673
                                        log_warning_errno(r, "Unable to detect whether unit configuration directory '%s' exists, assuming not: %m", q);
2✔
2674

2675
                        } else if (r < 0)
4✔
2676
                                log_warning_errno(r, "Unable to detect whether unit state directory '%s' is missing, assuming it is: %m", p);
×
2677
                }
2678

2679
                if (exec_directory_is_private(context, type)) {
3,793✔
2680
                        /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2681
                         * case we want to avoid leaving a directory around fully accessible that is owned by
2682
                         * a dynamic user whose UID is later on reused. To lock this down we use the same
2683
                         * trick used by container managers to prohibit host users to get access to files of
2684
                         * the same UID in containers: we place everything inside a directory that has an
2685
                         * access mode of 0700 and is owned root:root, so that it acts as security boundary
2686
                         * for unprivileged host code. We then use fs namespacing to make this directory
2687
                         * permeable for the service itself.
2688
                         *
2689
                         * Specifically: for a service which wants a special directory "foo/" we first create
2690
                         * a directory "private/" with access mode 0700 owned by root:root. Then we place
2691
                         * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2692
                         * "private/foo". This way, privileged host users can access "foo/" as usual, but
2693
                         * unprivileged host users can't look into it. Inside of the namespace of the unit
2694
                         * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2695
                         * "private/foo/" is mounted under the same name, thus disabling the access boundary
2696
                         * for the service and making sure it only gets access to the dirs it needs but no
2697
                         * others. Tricky? Yes, absolutely, but it works!
2698
                         *
2699
                         * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2700
                         * to be owned by the service itself.
2701
                         *
2702
                         * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2703
                         * for sharing files or sockets with other services. */
2704

2705
                        pp = path_join(params->prefix[type], "private");
13✔
2706
                        if (!pp) {
13✔
2707
                                r = -ENOMEM;
×
2708
                                goto fail;
×
2709
                        }
2710

2711
                        /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2712
                        r = mkdir_safe_label(pp, 0700, 0, 0, MKDIR_WARN_MODE);
13✔
2713
                        if (r < 0)
13✔
2714
                                goto fail;
×
2715

2716
                        if (!path_extend(&pp, i->path)) {
13✔
2717
                                r = -ENOMEM;
×
2718
                                goto fail;
×
2719
                        }
2720

2721
                        /* Create all directories between the configured directory and this private root, and mark them 0755 */
2722
                        r = mkdir_parents_label(pp, 0755);
13✔
2723
                        if (r < 0)
13✔
2724
                                goto fail;
×
2725

2726
                        if (is_dir(p, false) > 0 &&
13✔
2727
                            (access_nofollow(pp, F_OK) == -ENOENT)) {
×
2728

2729
                                /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2730
                                 * it over. Most likely the service has been upgraded from one that didn't use
2731
                                 * DynamicUser=1, to one that does. */
2732

2733
                                log_info("Found pre-existing public %s= directory %s, migrating to %s.\n"
×
2734
                                         "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2735
                                         exec_directory_type_to_string(type), p, pp);
2736

2737
                                r = RET_NERRNO(rename(p, pp));
×
2738
                                if (r < 0)
×
2739
                                        goto fail;
×
2740
                        } else {
2741
                                /* Otherwise, create the actual directory for the service */
2742

2743
                                r = mkdir_label(pp, context->directories[type].mode);
13✔
2744
                                if (r < 0 && r != -EEXIST)
13✔
2745
                                        goto fail;
×
2746
                        }
2747

2748
                        if (!FLAGS_SET(i->flags, EXEC_DIRECTORY_ONLY_CREATE)) {
13✔
2749
                                /* And link it up from the original place.
2750
                                 * Notes
2751
                                 * 1) If a mount namespace is going to be used, then this symlink remains on
2752
                                 *    the host, and a new one for the child namespace will be created later.
2753
                                 * 2) It is not necessary to create this symlink when one of its parent
2754
                                 *    directories is specified and already created. E.g.
2755
                                 *        StateDirectory=foo foo/bar
2756
                                 *    In that case, the inode points to pp and p for "foo/bar" are the same:
2757
                                 *        pp = "/var/lib/private/foo/bar"
2758
                                 *        p = "/var/lib/foo/bar"
2759
                                 *    and, /var/lib/foo is a symlink to /var/lib/private/foo. So, not only
2760
                                 *    we do not need to create the symlink, but we cannot create the symlink.
2761
                                 *    See issue #24783. */
2762
                                r = symlink_idempotent(pp, p, true);
13✔
2763
                                if (r < 0)
13✔
2764
                                        goto fail;
×
2765
                        }
2766

2767
                } else {
2768
                        _cleanup_free_ char *target = NULL;
3,780✔
2769

2770
                        if (EXEC_DIRECTORY_TYPE_SHALL_CHOWN(type) &&
7,520✔
2771
                            readlink_and_make_absolute(p, &target) >= 0) {
3,740✔
2772
                                _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;
11✔
2773

2774
                                /* This already exists and is a symlink? Interesting. Maybe it's one created
2775
                                 * by DynamicUser=1 (see above)?
2776
                                 *
2777
                                 * We do this for all directory types except for ConfigurationDirectory=,
2778
                                 * since they all support the private/ symlink logic at least in some
2779
                                 * configurations, see above. */
2780

2781
                                r = chase(target, NULL, 0, &target_resolved, NULL);
11✔
2782
                                if (r < 0)
11✔
2783
                                        goto fail;
×
2784

2785
                                q = path_join(params->prefix[type], "private", i->path);
11✔
2786
                                if (!q) {
11✔
2787
                                        r = -ENOMEM;
×
2788
                                        goto fail;
×
2789
                                }
2790

2791
                                /* /var/lib or friends may be symlinks. So, let's chase them also. */
2792
                                r = chase(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
11✔
2793
                                if (r < 0)
11✔
2794
                                        goto fail;
×
2795

2796
                                if (path_equal(q_resolved, target_resolved)) {
11✔
2797

2798
                                        /* Hmm, apparently DynamicUser= was once turned on for this service,
2799
                                         * but is no longer. Let's move the directory back up. */
2800

2801
                                        log_info("Found pre-existing private %s= directory %s, migrating to %s.\n"
8✔
2802
                                                 "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2803
                                                 exec_directory_type_to_string(type), q, p);
2804

2805
                                        r = RET_NERRNO(unlink(p));
8✔
2806
                                        if (r < 0)
×
2807
                                                goto fail;
×
2808

2809
                                        r = RET_NERRNO(rename(q, p));
11✔
2810
                                        if (r < 0)
×
2811
                                                goto fail;
×
2812
                                }
2813
                        }
2814

2815
                        r = mkdir_label(p, context->directories[type].mode);
3,780✔
2816
                        if (r < 0) {
3,780✔
2817
                                if (r != -EEXIST)
2,708✔
2818
                                        goto fail;
×
2819

2820
                                if (!EXEC_DIRECTORY_TYPE_SHALL_CHOWN(type)) {
2,708✔
2821
                                        struct stat st;
27✔
2822

2823
                                        /* Don't change the owner/access mode of the configuration directory,
2824
                                         * as in the common case it is not written to by a service, and shall
2825
                                         * not be writable. */
2826

2827
                                        r = RET_NERRNO(stat(p, &st));
27✔
2828
                                        if (r < 0)
×
2829
                                                goto fail;
×
2830

2831
                                        /* Still complain if the access mode doesn't match */
2832
                                        if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
27✔
2833
                                                log_warning("%s \'%s\' already exists but the mode is different. "
×
2834
                                                            "(File system: %o %sMode: %o)",
2835
                                                            exec_directory_type_to_string(type), i->path,
2836
                                                            st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
2837

2838
                                        continue;
27✔
2839
                                }
2840
                        }
2841
                }
2842

2843
                /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
2844
                 * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
2845
                 * current UID/GID ownership.) */
2846
                const char *target_dir = pp ?: p;
3,766✔
2847
                r = chmod_and_chown(target_dir, context->directories[type].mode, UID_INVALID, GID_INVALID);
3,766✔
2848
                if (r < 0)
3,766✔
2849
                        goto fail;
×
2850

2851
                /* Skip the rest (which deals with ownership) in user mode, since ownership changes are not
2852
                 * available to user code anyway */
2853
                if (params->runtime_scope != RUNTIME_SCOPE_SYSTEM)
3,766✔
2854
                        continue;
9✔
2855

2856
                int idmapping_supported = is_idmapping_supported(target_dir);
3,757✔
2857
                if (idmapping_supported < 0) {
3,757✔
2858
                        r = log_debug_errno(idmapping_supported, "Unable to determine if ID mapping is supported on mount '%s': %m", target_dir);
×
2859
                        goto fail;
×
2860
                }
2861

2862
                log_debug("ID-mapping is%ssupported for exec directory %s", idmapping_supported ? " " : " not ", target_dir);
3,763✔
2863

2864
                /* Change the ownership of the whole tree, if necessary. When dynamic users are used we
2865
                 * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
2866
                 * assignments to exist. */
2867
                uid_t chown_uid = uid;
3,757✔
2868
                gid_t chown_gid = gid;
3,757✔
2869
                bool do_chown = false;
3,757✔
2870

2871
                if (uid == 0 || gid == 0 || !idmapping_supported) {
3,757✔
2872
                        do_chown = true;
1,442✔
2873
                        i->idmapped = false;
1,442✔
2874
                } else {
2875
                        /* Use 'nobody' uid/gid for exec directories if ID-mapping is supported. For backward compatibility,
2876
                         * continue doing chmod/chown if the directory was chmod/chowned before (if uid/gid is not 'nobody') */
2877
                        struct stat st;
2,315✔
2878
                        r = RET_NERRNO(stat(target_dir, &st));
2,315✔
2879
                        if (r < 0)
×
2880
                                goto fail;
×
2881

2882
                        if (st.st_uid == UID_NOBODY && st.st_gid == GID_NOBODY) {
2,315✔
2883
                                do_chown = false;
7✔
2884
                                i->idmapped = true;
7✔
2885
                       } else if (exec_directory_is_private(context, type) && st.st_uid == 0 && st.st_gid == 0) {
2,308✔
2886
                                chown_uid = UID_NOBODY;
6✔
2887
                                chown_gid = GID_NOBODY;
6✔
2888
                                do_chown = true;
6✔
2889
                                i->idmapped = true;
6✔
2890
                        } else {
2891
                                do_chown = true;
2,302✔
2892
                                i->idmapped = false;
2,302✔
2893
                        }
2894
                }
2895

2896
                if (do_chown) {
3,757✔
2897
                        r = path_chown_recursive(target_dir, chown_uid, chown_gid, context->dynamic_user ? 01777 : 07777, AT_SYMLINK_FOLLOW);
7,491✔
2898
                        if (r < 0)
3,750✔
2899
                                goto fail;
1✔
2900
                }
2901
        }
2902

2903
        /* If we are not going to run in a namespace, set up the symlinks - otherwise
2904
         * they are set up later, to allow configuring empty var/run/etc. */
2905
        if (!needs_mount_namespace)
59,250✔
2906
                FOREACH_ARRAY(i, context->directories[type].items, context->directories[type].n_items) {
47,353✔
2907
                        r = create_many_symlinks(params->prefix[type], i->path, i->symlinks);
1,568✔
2908
                        if (r < 0)
1,568✔
2909
                                goto fail;
×
2910
                }
2911

2912
        return 0;
2913

2914
fail:
1✔
2915
        *exit_status = exit_status_table[type];
1✔
2916
        return r;
1✔
2917
}
2918

2919
#if ENABLE_SMACK
2920
static int setup_smack(
×
2921
                const ExecContext *context,
2922
                const ExecParameters *params,
2923
                int executable_fd) {
2924
        int r;
×
2925

2926
        assert(context);
×
2927
        assert(params);
×
2928
        assert(executable_fd >= 0);
×
2929

2930
        if (context->smack_process_label) {
×
2931
                r = mac_smack_apply_pid(0, context->smack_process_label);
×
2932
                if (r < 0)
×
2933
                        return r;
×
2934
        } else if (params->fallback_smack_process_label) {
×
2935
                _cleanup_free_ char *exec_label = NULL;
×
2936

2937
                r = mac_smack_read_fd(executable_fd, SMACK_ATTR_EXEC, &exec_label);
×
2938
                if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r))
×
2939
                        return r;
2940

2941
                r = mac_smack_apply_pid(0, exec_label ?: params->fallback_smack_process_label);
×
2942
                if (r < 0)
×
2943
                        return r;
2944
        }
2945

2946
        return 0;
2947
}
2948
#endif
2949

2950
static int compile_bind_mounts(
2,055✔
2951
                const ExecContext *context,
2952
                const ExecParameters *params,
2953
                uid_t exec_directory_uid, /* only used for id-mapped mounts Exec directories */
2954
                gid_t exec_directory_gid, /* only used for id-mapped mounts Exec directories */
2955
                BindMount **ret_bind_mounts,
2956
                size_t *ret_n_bind_mounts,
2957
                char ***ret_empty_directories) {
2958

2959
        _cleanup_strv_free_ char **empty_directories = NULL;
2,055✔
2960
        BindMount *bind_mounts = NULL;
2,055✔
2961
        size_t n, h = 0;
2,055✔
2962
        int r;
2,055✔
2963

2964
        assert(context);
2,055✔
2965
        assert(params);
2,055✔
2966
        assert(ret_bind_mounts);
2,055✔
2967
        assert(ret_n_bind_mounts);
2,055✔
2968
        assert(ret_empty_directories);
2,055✔
2969

2970
        CLEANUP_ARRAY(bind_mounts, h, bind_mount_free_many);
2,055✔
2971

2972
        n = context->n_bind_mounts;
2,055✔
2973
        for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
12,330✔
2974
                if (!params->prefix[t])
10,275✔
2975
                        continue;
×
2976

2977
                FOREACH_ARRAY(i, context->directories[t].items, context->directories[t].n_items)
11,891✔
2978
                        n += !FLAGS_SET(i->flags, EXEC_DIRECTORY_ONLY_CREATE) || FLAGS_SET(i->flags, EXEC_DIRECTORY_READ_ONLY);
1,616✔
2979
        }
2980

2981
        if (n <= 0) {
2,055✔
2982
                *ret_bind_mounts = NULL;
1,110✔
2983
                *ret_n_bind_mounts = 0;
1,110✔
2984
                *ret_empty_directories = NULL;
1,110✔
2985
                return 0;
1,110✔
2986
        }
2987

2988
        bind_mounts = new(BindMount, n);
945✔
2989
        if (!bind_mounts)
945✔
2990
                return -ENOMEM;
2991

2992
        FOREACH_ARRAY(item, context->bind_mounts, context->n_bind_mounts) {
967✔
2993
                r = bind_mount_add(&bind_mounts, &h, item);
22✔
2994
                if (r < 0)
22✔
2995
                        return r;
2996
        }
2997

2998
        for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
5,670✔
2999
                if (!params->prefix[t])
4,725✔
3000
                        continue;
×
3001

3002
                if (context->directories[t].n_items == 0)
4,725✔
3003
                        continue;
3,569✔
3004

3005
                if (exec_directory_is_private(context, t) &&
1,169✔
3006
                    !exec_context_with_rootfs(context)) {
13✔
3007
                        char *private_root;
13✔
3008

3009
                        /* So this is for a dynamic user, and we need to make sure the process can access its own
3010
                         * directory. For that we overmount the usually inaccessible "private" subdirectory with a
3011
                         * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
3012

3013
                        private_root = path_join(params->prefix[t], "private");
13✔
3014
                        if (!private_root)
13✔
3015
                                return -ENOMEM;
3016

3017
                        r = strv_consume(&empty_directories, private_root);
13✔
3018
                        if (r < 0)
13✔
3019
                                return r;
3020
                }
3021

3022
                FOREACH_ARRAY(i, context->directories[t].items, context->directories[t].n_items) {
2,772✔
3023
                        _cleanup_free_ char *s = NULL, *d = NULL;
1,616✔
3024

3025
                        /* When one of the parent directories is in the list, we cannot create the symlink
3026
                         * for the child directory. See also the comments in setup_exec_directory().
3027
                         * But if it needs to be read only, then we have to create a bind mount anyway to
3028
                         * make it so. */
3029
                        if (FLAGS_SET(i->flags, EXEC_DIRECTORY_ONLY_CREATE) && !FLAGS_SET(i->flags, EXEC_DIRECTORY_READ_ONLY))
1,616✔
3030
                                continue;
×
3031

3032
                        if (exec_directory_is_private(context, t))
1,616✔
3033
                                s = path_join(params->prefix[t], "private", i->path);
13✔
3034
                        else
3035
                                s = path_join(params->prefix[t], i->path);
1,603✔
3036
                        if (!s)
1,616✔
3037
                                return -ENOMEM;
3038

3039
                        if (exec_directory_is_private(context, t) &&
1,629✔
3040
                            exec_context_with_rootfs(context))
13✔
3041
                                /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
3042
                                 * directory is not created on the root directory. So, let's bind-mount the directory
3043
                                 * on the 'non-private' place. */
3044
                                d = path_join(params->prefix[t], i->path);
×
3045
                        else
3046
                                d = strdup(s);
1,616✔
3047
                        if (!d)
1,616✔
3048
                                return -ENOMEM;
3049

3050
                        bind_mounts[h++] = (BindMount) {
1,616✔
3051
                                .source = TAKE_PTR(s),
1,616✔
3052
                                .destination = TAKE_PTR(d),
1,616✔
3053
                                .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
1,616✔
3054
                                .recursive = true,
3055
                                .read_only = FLAGS_SET(i->flags, EXEC_DIRECTORY_READ_ONLY),
1,616✔
3056
                                .idmapped = i->idmapped,
1,616✔
3057
                                .uid = exec_directory_uid,
3058
                                .gid = exec_directory_gid,
3059
                        };
3060
                }
3061
        }
3062

3063
        assert(h == n);
945✔
3064

3065
        *ret_bind_mounts = TAKE_PTR(bind_mounts);
945✔
3066
        *ret_n_bind_mounts = n;
945✔
3067
        *ret_empty_directories = TAKE_PTR(empty_directories);
945✔
3068

3069
        return (int) n;
945✔
3070
}
3071

3072
/* ret_symlinks will contain a list of pairs src:dest that describes
3073
 * the symlinks to create later on. For example, the symlinks needed
3074
 * to safely give private directories to DynamicUser=1 users. */
3075
static int compile_symlinks(
2,055✔
3076
                const ExecContext *context,
3077
                const ExecParameters *params,
3078
                bool setup_os_release_symlink,
3079
                char ***ret_symlinks) {
3080

3081
        _cleanup_strv_free_ char **symlinks = NULL;
2,055✔
3082
        int r;
2,055✔
3083

3084
        assert(context);
2,055✔
3085
        assert(params);
2,055✔
3086
        assert(ret_symlinks);
2,055✔
3087

3088
        for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++)
12,330✔
3089
                FOREACH_ARRAY(i, context->directories[dt].items, context->directories[dt].n_items) {
11,891✔
3090
                        _cleanup_free_ char *private_path = NULL, *path = NULL;
1,603✔
3091

3092
                        STRV_FOREACH(symlink, i->symlinks) {
1,742✔
3093
                                _cleanup_free_ char *src_abs = NULL, *dst_abs = NULL;
126✔
3094

3095
                                src_abs = path_join(params->prefix[dt], i->path);
126✔
3096
                                dst_abs = path_join(params->prefix[dt], *symlink);
126✔
3097
                                if (!src_abs || !dst_abs)
126✔
3098
                                        return -ENOMEM;
3099

3100
                                r = strv_consume_pair(&symlinks, TAKE_PTR(src_abs), TAKE_PTR(dst_abs));
126✔
3101
                                if (r < 0)
126✔
3102
                                        return r;
3103
                        }
3104

3105
                        if (!exec_directory_is_private(context, dt) ||
1,629✔
3106
                            exec_context_with_rootfs(context) ||
13✔
3107
                            FLAGS_SET(i->flags, EXEC_DIRECTORY_ONLY_CREATE))
13✔
3108
                                continue;
1,603✔
3109

3110
                        private_path = path_join(params->prefix[dt], "private", i->path);
13✔
3111
                        if (!private_path)
13✔
3112
                                return -ENOMEM;
3113

3114
                        path = path_join(params->prefix[dt], i->path);
13✔
3115
                        if (!path)
13✔
3116
                                return -ENOMEM;
3117

3118
                        r = strv_consume_pair(&symlinks, TAKE_PTR(private_path), TAKE_PTR(path));
13✔
3119
                        if (r < 0)
13✔
3120
                                return r;
3121
                }
3122

3123
        /* We make the host's os-release available via a symlink, so that we can copy it atomically
3124
         * and readers will never get a half-written version. Note that, while the paths specified here are
3125
         * absolute, when they are processed in namespace.c they will be made relative automatically, i.e.:
3126
         * 'os-release -> .os-release-stage/os-release' is what will be created. */
3127
        if (setup_os_release_symlink) {
2,055✔
3128
                r = strv_extend_many(
7✔
3129
                                &symlinks,
3130
                                "/run/host/.os-release-stage/os-release",
3131
                                "/run/host/os-release");
3132
                if (r < 0)
7✔
3133
                        return r;
3134
        }
3135

3136
        *ret_symlinks = TAKE_PTR(symlinks);
2,055✔
3137

3138
        return 0;
2,055✔
3139
}
3140

3141
static bool insist_on_sandboxing(
×
3142
                const ExecContext *context,
3143
                const char *root_dir,
3144
                const char *root_image,
3145
                const BindMount *bind_mounts,
3146
                size_t n_bind_mounts) {
3147

3148
        assert(context);
×
3149
        assert(n_bind_mounts == 0 || bind_mounts);
×
3150

3151
        /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
3152
         * would alter the view on the file system beyond making things read-only or invisible, i.e. would
3153
         * rearrange stuff in a way we cannot ignore gracefully. */
3154

3155
        if (context->n_temporary_filesystems > 0)
×
3156
                return true;
3157

3158
        if (root_dir || root_image)
×
3159
                return true;
3160

3161
        if (context->n_mount_images > 0)
×
3162
                return true;
3163

3164
        if (context->dynamic_user)
×
3165
                return true;
3166

3167
        if (context->n_extension_images > 0 || !strv_isempty(context->extension_directories))
×
3168
                return true;
3169

3170
        /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
3171
         * essential. */
3172
        FOREACH_ARRAY(i, bind_mounts, n_bind_mounts)
×
3173
                if (!path_equal(i->source, i->destination))
×
3174
                        return true;
3175

3176
        if (context->log_namespace)
×
3177
                return true;
×
3178

3179
        return false;
3180
}
3181

3182
static int setup_ephemeral(
2,055✔
3183
                const ExecContext *context,
3184
                ExecRuntime *runtime,
3185
                char **root_image,            /* both input and output! modified if ephemeral logic enabled */
3186
                char **root_directory,        /* ditto */
3187
                char **reterr_path) {
3188

3189
        _cleanup_close_ int fd = -EBADF;
2,055✔
3190
        _cleanup_free_ char *new_root = NULL;
2,055✔
3191
        int r;
2,055✔
3192

3193
        assert(context);
2,055✔
3194
        assert(runtime);
2,055✔
3195
        assert(root_image);
2,055✔
3196
        assert(root_directory);
2,055✔
3197

3198
        if (!*root_image && !*root_directory)
2,055✔
3199
                return 0;
3200

3201
        if (!runtime->ephemeral_copy)
8✔
3202
                return 0;
3203

3204
        assert(runtime->ephemeral_storage_socket[0] >= 0);
×
3205
        assert(runtime->ephemeral_storage_socket[1] >= 0);
×
3206

3207
        new_root = strdup(runtime->ephemeral_copy);
×
3208
        if (!new_root)
×
3209
                return log_oom_debug();
×
3210

3211
        r = posix_lock(runtime->ephemeral_storage_socket[0], LOCK_EX);
×
3212
        if (r < 0)
×
3213
                return log_debug_errno(r, "Failed to lock ephemeral storage socket: %m");
×
3214

3215
        CLEANUP_POSIX_UNLOCK(runtime->ephemeral_storage_socket[0]);
×
3216

3217
        fd = receive_one_fd(runtime->ephemeral_storage_socket[0], MSG_PEEK|MSG_DONTWAIT);
×
3218
        if (fd >= 0)
×
3219
                /* We got an fd! That means ephemeral has already been set up, so nothing to do here. */
3220
                return 0;
3221
        if (fd != -EAGAIN)
×
3222
                return log_debug_errno(fd, "Failed to receive file descriptor queued on ephemeral storage socket: %m");
×
3223

3224
        if (*root_image) {
×
3225
                log_debug("Making ephemeral copy of %s to %s", *root_image, new_root);
×
3226

3227
                fd = copy_file(*root_image, new_root, O_EXCL, 0600,
×
3228
                               COPY_LOCK_BSD|COPY_REFLINK|COPY_CRTIME|COPY_NOCOW_AFTER);
3229
                if (fd < 0) {
×
3230
                        *reterr_path = strdup(*root_image);
×
3231
                        return log_debug_errno(fd, "Failed to copy image %s to %s: %m",
×
3232
                                               *root_image, new_root);
3233
                }
3234
        } else {
3235
                assert(*root_directory);
×
3236

3237
                log_debug("Making ephemeral snapshot of %s to %s", *root_directory, new_root);
×
3238

3239
                fd = btrfs_subvol_snapshot_at(
×
3240
                                AT_FDCWD, *root_directory,
3241
                                AT_FDCWD, new_root,
3242
                                BTRFS_SNAPSHOT_FALLBACK_COPY |
3243
                                BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
3244
                                BTRFS_SNAPSHOT_RECURSIVE |
3245
                                BTRFS_SNAPSHOT_LOCK_BSD);
3246
                if (fd < 0) {
×
3247
                        *reterr_path = strdup(*root_directory);
×
3248
                        return log_debug_errno(fd, "Failed to snapshot directory %s to %s: %m",
×
3249
                                               *root_directory, new_root);
3250
                }
3251
        }
3252

3253
        r = send_one_fd(runtime->ephemeral_storage_socket[1], fd, MSG_DONTWAIT);
×
3254
        if (r < 0)
×
3255
                return log_debug_errno(r, "Failed to queue file descriptor on ephemeral storage socket: %m");
×
3256

3257
        if (*root_image)
×
3258
                free_and_replace(*root_image, new_root);
×
3259
        else {
3260
                assert(*root_directory);
×
3261
                free_and_replace(*root_directory, new_root);
×
3262
        }
3263

3264
        return 1;
3265
}
3266

3267
static int verity_settings_prepare(
7✔
3268
                VeritySettings *verity,
3269
                const char *root_image,
3270
                const void *root_hash,
3271
                size_t root_hash_size,
3272
                const char *root_hash_path,
3273
                const void *root_hash_sig,
3274
                size_t root_hash_sig_size,
3275
                const char *root_hash_sig_path,
3276
                const char *verity_data_path) {
3277

3278
        int r;
7✔
3279

3280
        assert(verity);
7✔
3281

3282
        if (root_hash) {
7✔
3283
                void *d;
4✔
3284

3285
                d = memdup(root_hash, root_hash_size);
4✔
3286
                if (!d)
4✔
3287
                        return -ENOMEM;
7✔
3288

3289
                free_and_replace(verity->root_hash, d);
4✔
3290
                verity->root_hash_size = root_hash_size;
4✔
3291
                verity->designator = PARTITION_ROOT;
4✔
3292
        }
3293

3294
        if (root_hash_sig) {
7✔
3295
                void *d;
×
3296

3297
                d = memdup(root_hash_sig, root_hash_sig_size);
×
3298
                if (!d)
×
3299
                        return -ENOMEM;
7✔
3300

3301
                free_and_replace(verity->root_hash_sig, d);
×
3302
                verity->root_hash_sig_size = root_hash_sig_size;
×
3303
                verity->designator = PARTITION_ROOT;
×
3304
        }
3305

3306
        if (verity_data_path) {
7✔
3307
                r = free_and_strdup(&verity->data_path, verity_data_path);
×
3308
                if (r < 0)
×
3309
                        return r;
3310
        }
3311

3312
        r = verity_settings_load(
7✔
3313
                        verity,
3314
                        root_image,
3315
                        root_hash_path,
3316
                        root_hash_sig_path);
3317
        if (r < 0)
7✔
3318
                return log_debug_errno(r, "Failed to load root hash: %m");
×
3319

3320
        return 0;
3321
}
3322

3323
static int pick_versions(
2,057✔
3324
                const ExecContext *context,
3325
                const ExecParameters *params,
3326
                char **ret_root_image,
3327
                char **ret_root_directory,
3328
                char **reterr_path) {
3329

3330
        int r;
2,057✔
3331

3332
        assert(context);
2,057✔
3333
        assert(params);
2,057✔
3334
        assert(ret_root_image);
2,057✔
3335
        assert(ret_root_directory);
2,057✔
3336

3337
        if (context->root_image) {
2,057✔
3338
                _cleanup_(pick_result_done) PickResult result = PICK_RESULT_NULL;
8✔
3339

3340
                r = path_pick(/* toplevel_path= */ NULL,
16✔
3341
                              /* toplevel_fd= */ AT_FDCWD,
3342
                              context->root_image,
8✔
3343
                              &pick_filter_image_raw,
3344
                              PICK_ARCHITECTURE|PICK_TRIES|PICK_RESOLVE,
3345
                              &result);
3346
                if (r < 0) {
8✔
3347
                        *reterr_path = strdup(context->root_image);
1✔
3348
                        return r;
1✔
3349
                }
3350

3351
                if (!result.path) {
7✔
3352
                        *reterr_path = strdup(context->root_image);
×
3353
                        return log_debug_errno(SYNTHETIC_ERRNO(ENOENT), "No matching entry in .v/ directory %s found.", context->root_image);
×
3354
                }
3355

3356
                *ret_root_image = TAKE_PTR(result.path);
7✔
3357
                *ret_root_directory = NULL;
7✔
3358
                return r;
7✔
3359
        }
3360

3361
        if (context->root_directory) {
2,049✔
3362
                _cleanup_(pick_result_done) PickResult result = PICK_RESULT_NULL;
2✔
3363

3364
                r = path_pick(/* toplevel_path= */ NULL,
4✔
3365
                              /* toplevel_fd= */ AT_FDCWD,
3366
                              context->root_directory,
2✔
3367
                              &pick_filter_image_dir,
3368
                              PICK_ARCHITECTURE|PICK_TRIES|PICK_RESOLVE,
3369
                              &result);
3370
                if (r < 0) {
2✔
3371
                        *reterr_path = strdup(context->root_directory);
×
3372
                        return r;
×
3373
                }
3374

3375
                if (!result.path) {
2✔
3376
                        *reterr_path = strdup(context->root_directory);
1✔
3377
                        return log_debug_errno(SYNTHETIC_ERRNO(ENOENT), "No matching entry in .v/ directory %s found.", context->root_directory);
1✔
3378
                }
3379

3380
                *ret_root_image = NULL;
1✔
3381
                *ret_root_directory = TAKE_PTR(result.path);
1✔
3382
                return r;
1✔
3383
        }
3384

3385
        *ret_root_image = *ret_root_directory = NULL;
2,047✔
3386
        return 0;
2,047✔
3387
}
3388

3389
static int apply_mount_namespace(
2,057✔
3390
                ExecCommandFlags command_flags,
3391
                const ExecContext *context,
3392
                const ExecParameters *params,
3393
                ExecRuntime *runtime,
3394
                const char *memory_pressure_path,
3395
                bool needs_sandboxing,
3396
                char **reterr_path,
3397
                uid_t exec_directory_uid,
3398
                gid_t exec_directory_gid) {
3399

3400
        _cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT;
2,057✔
3401
        _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL,
2,057✔
3402
                        **read_write_paths_cleanup = NULL;
×
3403
        _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL,
×
3404
                *private_namespace_dir = NULL, *host_os_release_stage = NULL, *root_image = NULL, *root_dir = NULL;
2,057✔
3405
        const char *tmp_dir = NULL, *var_tmp_dir = NULL;
2,057✔
3406
        char **read_write_paths;
2,057✔
3407
        bool setup_os_release_symlink;
2,057✔
3408
        BindMount *bind_mounts = NULL;
2,057✔
3409
        size_t n_bind_mounts = 0;
2,057✔
3410
        int r;
2,057✔
3411

3412
        assert(context);
2,057✔
3413
        assert(params);
2,057✔
3414
        assert(runtime);
2,057✔
3415

3416
        CLEANUP_ARRAY(bind_mounts, n_bind_mounts, bind_mount_free_many);
2,057✔
3417

3418
        if (params->flags & EXEC_APPLY_CHROOT) {
2,057✔
3419
                r = pick_versions(
2,057✔
3420
                                context,
3421
                                params,
3422
                                &root_image,
3423
                                &root_dir,
3424
                                reterr_path);
3425
                if (r < 0)
2,057✔
3426
                        return r;
3427

3428
                r = setup_ephemeral(
2,055✔
3429
                                context,
3430
                                runtime,
3431
                                &root_image,
3432
                                &root_dir,
3433
                                reterr_path);
3434
                if (r < 0)
2,055✔
3435
                        return r;
3436
        }
3437

3438
        r = compile_bind_mounts(context, params, exec_directory_uid, exec_directory_gid, &bind_mounts, &n_bind_mounts, &empty_directories);
2,055✔
3439
        if (r < 0)
2,055✔
3440
                return r;
3441

3442
        /* We need to make the pressure path writable even if /sys/fs/cgroups is made read-only, as the
3443
         * service will need to write to it in order to start the notifications. */
3444
        if (exec_is_cgroup_mount_read_only(context) && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) {
2,055✔
3445
                read_write_paths_cleanup = strv_copy(context->read_write_paths);
1,172✔
3446
                if (!read_write_paths_cleanup)
1,172✔
3447
                        return -ENOMEM;
3448

3449
                r = strv_extend(&read_write_paths_cleanup, memory_pressure_path);
1,172✔
3450
                if (r < 0)
1,172✔
3451
                        return r;
3452

3453
                read_write_paths = read_write_paths_cleanup;
1,172✔
3454
        } else
3455
                read_write_paths = context->read_write_paths;
883✔
3456

3457
        if (needs_sandboxing) {
2,055✔
3458
                /* The runtime struct only contains the parent of the private /tmp, which is non-accessible
3459
                 * to world users. Inside of it there's a /tmp that is sticky, and that's the one we want to
3460
                 * use here.  This does not apply when we are using /run/systemd/empty as fallback. */
3461

3462
                if (context->private_tmp == PRIVATE_TMP_CONNECTED && runtime->shared) {
2,055✔
3463
                        if (streq_ptr(runtime->shared->tmp_dir, RUN_SYSTEMD_EMPTY))
334✔
3464
                                tmp_dir = runtime->shared->tmp_dir;
3465
                        else if (runtime->shared->tmp_dir)
334✔
3466
                                tmp_dir = strjoina(runtime->shared->tmp_dir, "/tmp");
1,670✔
3467

3468
                        if (streq_ptr(runtime->shared->var_tmp_dir, RUN_SYSTEMD_EMPTY))
334✔
3469
                                var_tmp_dir = runtime->shared->var_tmp_dir;
3470
                        else if (runtime->shared->var_tmp_dir)
334✔
3471
                                var_tmp_dir = strjoina(runtime->shared->var_tmp_dir, "/tmp");
1,670✔
3472
                }
3473
        }
3474

3475
        /* Symlinks (exec dirs, os-release) are set up after other mounts, before they are made read-only. */
3476
        setup_os_release_symlink = needs_sandboxing && exec_context_get_effective_mount_apivfs(context) && (root_dir || root_image);
2,055✔
3477
        r = compile_symlinks(context, params, setup_os_release_symlink, &symlinks);
2,055✔
3478
        if (r < 0)
2,055✔
3479
                return r;
3480

3481
        if (context->mount_propagation_flag == MS_SHARED)
2,055✔
3482
                log_debug("shared mount propagation hidden by other fs namespacing unit settings: ignoring");
×
3483

3484
        r = exec_context_get_credential_directory(context, params, params->unit_id, &creds_path);
2,055✔
3485
        if (r < 0)
2,055✔
3486
                return r;
3487

3488
        if (params->runtime_scope == RUNTIME_SCOPE_SYSTEM) {
2,055✔
3489
                propagate_dir = path_join("/run/systemd/propagate/", params->unit_id);
2,028✔
3490
                if (!propagate_dir)
2,028✔
3491
                        return -ENOMEM;
3492

3493
                incoming_dir = strdup("/run/systemd/incoming");
2,028✔
3494
                if (!incoming_dir)
2,028✔
3495
                        return -ENOMEM;
3496

3497
                private_namespace_dir = strdup("/run/systemd");
2,028✔
3498
                if (!private_namespace_dir)
2,028✔
3499
                        return -ENOMEM;
3500

3501
                /* If running under a different root filesystem, propagate the host's os-release. We make a
3502
                 * copy rather than just bind mounting it, so that it can be updated on soft-reboot. */
3503
                if (setup_os_release_symlink) {
2,028✔
3504
                        host_os_release_stage = strdup("/run/systemd/propagate/.os-release-stage");
7✔
3505
                        if (!host_os_release_stage)
7✔
3506
                                return -ENOMEM;
3507
                }
3508
        } else {
3509
                assert(params->runtime_scope == RUNTIME_SCOPE_USER);
27✔
3510

3511
                if (asprintf(&private_namespace_dir, "/run/user/" UID_FMT "/systemd", geteuid()) < 0)
27✔
3512
                        return -ENOMEM;
3513

3514
                if (setup_os_release_symlink) {
27✔
3515
                        if (asprintf(&host_os_release_stage,
×
3516
                                     "/run/user/" UID_FMT "/systemd/propagate/.os-release-stage",
3517
                                     geteuid()) < 0)
3518
                                return -ENOMEM;
3519
                }
3520
        }
3521

3522
        if (root_image) {
2,055✔
3523
                r = verity_settings_prepare(
14✔
3524
                        &verity,
3525
                        root_image,
3526
                        context->root_hash, context->root_hash_size, context->root_hash_path,
7✔
3527
                        context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
7✔
3528
                        context->root_verity);
7✔
3529
                if (r < 0)
7✔
3530
                        return r;
3531
        }
3532

UNCOV
3533
        NamespaceParameters parameters = {
×
3534
                .runtime_scope = params->runtime_scope,
2,055✔
3535

3536
                .root_directory = root_dir,
3537
                .root_image = root_image,
3538
                .root_image_options = context->root_image_options,
2,055✔
3539
                .root_image_policy = context->root_image_policy ?: &image_policy_service,
2,055✔
3540

3541
                .read_write_paths = read_write_paths,
3542
                .read_only_paths = needs_sandboxing ? context->read_only_paths : NULL,
2,055✔
3543
                .inaccessible_paths = needs_sandboxing ? context->inaccessible_paths : NULL,
2,055✔
3544

3545
                .exec_paths = needs_sandboxing ? context->exec_paths : NULL,
2,055✔
3546
                .no_exec_paths = needs_sandboxing ? context->no_exec_paths : NULL,
2,055✔
3547

3548
                .empty_directories = empty_directories,
3549
                .symlinks = symlinks,
3550

3551
                .bind_mounts = bind_mounts,
3552
                .n_bind_mounts = n_bind_mounts,
3553

3554
                .temporary_filesystems = context->temporary_filesystems,
2,055✔
3555
                .n_temporary_filesystems = context->n_temporary_filesystems,
2,055✔
3556

3557
                .mount_images = context->mount_images,
2,055✔
3558
                .n_mount_images = context->n_mount_images,
2,055✔
3559
                .mount_image_policy = context->mount_image_policy ?: &image_policy_service,
2,055✔
3560

3561
                .tmp_dir = tmp_dir,
3562
                .var_tmp_dir = var_tmp_dir,
3563

3564
                .creds_path = creds_path,
3565
                .log_namespace = context->log_namespace,
2,055✔
3566
                .mount_propagation_flag = context->mount_propagation_flag,
2,055✔
3567

3568
                .verity = &verity,
3569

3570
                .extension_images = context->extension_images,
2,055✔
3571
                .n_extension_images = context->n_extension_images,
2,055✔
3572
                .extension_image_policy = context->extension_image_policy ?: &image_policy_sysext,
2,055✔
3573
                .extension_directories = context->extension_directories,
2,055✔
3574

3575
                .propagate_dir = propagate_dir,
3576
                .incoming_dir = incoming_dir,
3577
                .private_namespace_dir = private_namespace_dir,
3578
                .host_notify_socket = params->notify_socket,
2,055✔
3579
                .notify_socket_path = exec_get_private_notify_socket_path(context, params, needs_sandboxing),
2,055✔
3580
                .host_os_release_stage = host_os_release_stage,
3581

3582
                /* If DynamicUser=no and RootDirectory= is set then lets pass a relaxed sandbox info,
3583
                 * otherwise enforce it, don't ignore protected paths and fail if we are enable to apply the
3584
                 * sandbox inside the mount namespace. */
3585
                .ignore_protect_paths = !needs_sandboxing && !context->dynamic_user && root_dir,
2,055✔
3586

3587
                .protect_control_groups = needs_sandboxing ? exec_get_protect_control_groups(context) : PROTECT_CONTROL_GROUPS_NO,
2,055✔
3588
                .protect_kernel_tunables = needs_sandboxing && context->protect_kernel_tunables,
2,055✔
3589
                .protect_kernel_modules = needs_sandboxing && context->protect_kernel_modules,
2,055✔
3590
                .protect_kernel_logs = needs_sandboxing && context->protect_kernel_logs,
2,055✔
3591

3592
                .private_dev = needs_sandboxing && context->private_devices,
2,055✔
3593
                .private_network = needs_sandboxing && exec_needs_network_namespace(context),
2,055✔
3594
                .private_ipc = needs_sandboxing && exec_needs_ipc_namespace(context),
2,055✔
3595
                .private_pids = needs_sandboxing && exec_needs_pid_namespace(context, params) ? context->private_pids : PRIVATE_PIDS_NO,
2,055✔
3596
                .private_tmp = needs_sandboxing ? context->private_tmp : PRIVATE_TMP_NO,
2,055✔
3597
                .private_var_tmp = needs_sandboxing ? context->private_var_tmp : PRIVATE_TMP_NO,
2,055✔
3598

3599
                .mount_apivfs = needs_sandboxing && exec_context_get_effective_mount_apivfs(context),
2,055✔
3600
                .bind_log_sockets = needs_sandboxing && exec_context_get_effective_bind_log_sockets(context),
2,055✔
3601

3602
                /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
3603
                .mount_nosuid = needs_sandboxing && context->no_new_privileges && !mac_selinux_use(),
2,055✔
3604

3605
                .protect_home = needs_sandboxing ? context->protect_home : PROTECT_HOME_NO,
2,055✔
3606
                .protect_hostname = needs_sandboxing ? context->protect_hostname : PROTECT_HOSTNAME_NO,
2,055✔
3607
                .protect_system = needs_sandboxing ? context->protect_system : PROTECT_SYSTEM_NO,
2,055✔
3608
                .protect_proc = needs_sandboxing ? context->protect_proc : PROTECT_PROC_DEFAULT,
2,055✔
3609
                .proc_subset = needs_sandboxing ? context->proc_subset : PROC_SUBSET_ALL,
2,055✔
3610
        };
3611

3612
        r = setup_namespace(&parameters, reterr_path);
2,055✔
3613
        /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
3614
         * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
3615
         * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
3616
         * completely different execution environment. */
3617
        if (r == -ENOANO) {
2,055✔
3618
                if (insist_on_sandboxing(
×
3619
                                    context,
3620
                                    root_dir, root_image,
3621
                                    bind_mounts,
3622
                                    n_bind_mounts))
3623
                        return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
×
3624
                                               "Failed to set up namespace, and refusing to continue since "
3625
                                               "the selected namespacing options alter mount environment non-trivially.\n"
3626
                                               "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
3627
                                               n_bind_mounts,
3628
                                               context->n_temporary_filesystems,
3629
                                               yes_no(root_dir),
3630
                                               yes_no(root_image),
3631
                                               yes_no(context->dynamic_user));
3632

3633
                log_debug("Failed to set up namespace, assuming containerized execution and ignoring.");
×
3634
                return 0;
×
3635
        }
3636

3637
        return r;
3638
}
3639

3640
static int apply_working_directory(
9,831✔
3641
                const ExecContext *context,
3642
                const ExecParameters *params,
3643
                ExecRuntime *runtime,
3644
                const char *pwent_home,
3645
                char * const *env) {
3646

3647
        const char *wd;
9,831✔
3648
        int r;
9,831✔
3649

3650
        assert(context);
9,831✔
3651
        assert(params);
9,831✔
3652
        assert(runtime);
9,831✔
3653

3654
        if (context->working_directory_home) {
9,831✔
3655
                /* Preferably use the data from $HOME, in case it was updated by a PAM module */
3656
                wd = strv_env_get(env, "HOME");
103✔
3657
                if (!wd) {
103✔
3658
                        /* If that's not available, use the data from the struct passwd entry: */
3659
                        if (!pwent_home)
1✔
3660
                                return -ENXIO;
3661

3662
                        wd = pwent_home;
3663
                }
3664
        } else
3665
                wd = empty_to_root(context->working_directory);
9,728✔
3666

3667
        if (params->flags & EXEC_APPLY_CHROOT)
9,831✔
3668
                r = RET_NERRNO(chdir(wd));
9,831✔
3669
        else {
3670
                _cleanup_close_ int dfd = -EBADF;
×
3671

3672
                r = chase(wd,
×
3673
                          runtime->ephemeral_copy ?: context->root_directory,
×
3674
                          CHASE_PREFIX_ROOT|CHASE_AT_RESOLVE_IN_ROOT,
3675
                          /* ret_path= */ NULL,
3676
                          &dfd);
3677
                if (r >= 0)
×
3678
                        r = RET_NERRNO(fchdir(dfd));
×
3679
        }
3680
        return context->working_directory_missing_ok ? 0 : r;
9,831✔
3681
}
3682

3683
static int apply_root_directory(
9,831✔
3684
                const ExecContext *context,
3685
                const ExecParameters *params,
3686
                ExecRuntime *runtime,
3687
                const bool needs_mount_ns,
3688
                int *exit_status) {
3689

3690
        assert(context);
9,831✔
3691
        assert(params);
9,831✔
3692
        assert(runtime);
9,831✔
3693
        assert(exit_status);
9,831✔
3694

3695
        if (params->flags & EXEC_APPLY_CHROOT)
9,831✔
3696
                if (!needs_mount_ns && context->root_directory)
9,831✔
3697
                        if (chroot(runtime->ephemeral_copy ?: context->root_directory) < 0) {
×
3698
                                *exit_status = EXIT_CHROOT;
×
3699
                                return -errno;
×
3700
                        }
3701

3702
        return 0;
3703
}
3704

3705
static int setup_keyring(
9,856✔
3706
                const ExecContext *context,
3707
                const ExecParameters *p,
3708
                uid_t uid,
3709
                gid_t gid) {
3710

3711
        key_serial_t keyring;
9,856✔
3712
        int r = 0;
9,856✔
3713
        uid_t saved_uid;
9,856✔
3714
        gid_t saved_gid;
9,856✔
3715

3716
        assert(context);
9,856✔
3717
        assert(p);
9,856✔
3718

3719
        /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
3720
         * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
3721
         * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
3722
         * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
3723
         * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
3724
         * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
3725

3726
        if (context->keyring_mode == EXEC_KEYRING_INHERIT)
9,856✔
3727
                return 0;
3728

3729
        /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
3730
         * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
3731
         * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
3732
         * & group is just as nasty as acquiring a reference to the user keyring. */
3733

3734
        saved_uid = getuid();
8,884✔
3735
        saved_gid = getgid();
8,884✔
3736

3737
        if (gid_is_valid(gid) && gid != saved_gid) {
8,884✔
3738
                if (setregid(gid, -1) < 0)
1,815✔
3739
                        return log_error_errno(errno, "Failed to change GID for user keyring: %m");
×
3740
        }
3741

3742
        if (uid_is_valid(uid) && uid != saved_uid) {
8,884✔
3743
                if (setreuid(uid, -1) < 0) {
1,812✔
3744
                        r = log_error_errno(errno, "Failed to change UID for user keyring: %m");
×
3745
                        goto out;
×
3746
                }
3747
        }
3748

3749
        keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
8,884✔
3750
        if (keyring == -1) {
8,884✔
3751
                if (errno == ENOSYS)
×
3752
                        log_debug_errno(errno, "Kernel keyring not supported, ignoring.");
×
3753
                else if (ERRNO_IS_PRIVILEGE(errno))
×
3754
                        log_debug_errno(errno, "Kernel keyring access prohibited, ignoring.");
×
3755
                else if (errno == EDQUOT)
×
3756
                        log_debug_errno(errno, "Out of kernel keyrings to allocate, ignoring.");
×
3757
                else
3758
                        r = log_error_errno(errno, "Setting up kernel keyring failed: %m");
×
3759

3760
                goto out;
×
3761
        }
3762

3763
        /* When requested link the user keyring into the session keyring. */
3764
        if (context->keyring_mode == EXEC_KEYRING_SHARED) {
8,884✔
3765

3766
                if (keyctl(KEYCTL_LINK,
952✔
3767
                           KEY_SPEC_USER_KEYRING,
3768
                           KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
3769
                        r = log_error_errno(errno, "Failed to link user keyring into session keyring: %m");
×
3770
                        goto out;
×
3771
                }
3772
        }
3773

3774
        /* Restore uid/gid back */
3775
        if (uid_is_valid(uid) && uid != saved_uid) {
8,884✔
3776
                if (setreuid(saved_uid, -1) < 0) {
1,812✔
3777
                        r = log_error_errno(errno, "Failed to change UID back for user keyring: %m");
×
3778
                        goto out;
×
3779
                }
3780
        }
3781

3782
        if (gid_is_valid(gid) && gid != saved_gid) {
8,884✔
3783
                if (setregid(saved_gid, -1) < 0)
1,815✔
3784
                        return log_error_errno(errno, "Failed to change GID back for user keyring: %m");
×
3785
        }
3786

3787
        /* Populate they keyring with the invocation ID by default, as original saved_uid. */
3788
        if (!sd_id128_is_null(p->invocation_id)) {
8,884✔
3789
                key_serial_t key;
8,884✔
3790

3791
                key = add_key("user",
17,768✔
3792
                              "invocation_id",
3793
                              &p->invocation_id,
8,884✔
3794
                              sizeof(p->invocation_id),
3795
                              KEY_SPEC_SESSION_KEYRING);
3796
                if (key == -1)
8,884✔
3797
                        log_debug_errno(errno, "Failed to add invocation ID to keyring, ignoring: %m");
×
3798
                else {
3799
                        if (keyctl(KEYCTL_SETPERM, key,
8,884✔
3800
                                   KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
3801
                                   KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
3802
                                r = log_error_errno(errno, "Failed to restrict invocation ID permission: %m");
×
3803
                }
3804
        }
3805

3806
out:
8,884✔
3807
        /* Revert back uid & gid for the last time, and exit */
3808
        /* no extra logging, as only the first already reported error matters */
3809
        if (getuid() != saved_uid)
8,884✔
3810
                (void) setreuid(saved_uid, -1);
×
3811

3812
        if (getgid() != saved_gid)
8,884✔
3813
                (void) setregid(saved_gid, -1);
×
3814

3815
        return r;
3816
}
3817

3818
static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
35,689✔
3819
        assert(array);
35,689✔
3820
        assert(n);
35,689✔
3821
        assert(pair);
35,689✔
3822

3823
        if (pair[0] >= 0)
35,689✔
3824
                array[(*n)++] = pair[0];
194✔
3825
        if (pair[1] >= 0)
35,689✔
3826
                array[(*n)++] = pair[1];
194✔
3827
}
35,689✔
3828

3829
static int close_remaining_fds(
11,855✔
3830
                const ExecParameters *params,
3831
                const ExecRuntime *runtime,
3832
                int socket_fd,
3833
                const int *fds,
3834
                size_t n_fds) {
11,855✔
3835

3836
        size_t n_dont_close = 0;
11,855✔
3837
        int dont_close[n_fds + 17];
11,855✔
3838

3839
        assert(params);
11,855✔
3840
        assert(runtime);
11,855✔
3841

3842
        if (params->stdin_fd >= 0)
11,855✔
3843
                dont_close[n_dont_close++] = params->stdin_fd;
550✔
3844
        if (params->stdout_fd >= 0)
11,855✔
3845
                dont_close[n_dont_close++] = params->stdout_fd;
550✔
3846
        if (params->stderr_fd >= 0)
11,855✔
3847
                dont_close[n_dont_close++] = params->stderr_fd;
550✔
3848

3849
        if (socket_fd >= 0)
11,855✔
3850
                dont_close[n_dont_close++] = socket_fd;
17✔
3851
        if (n_fds > 0) {
11,855✔
3852
                memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
11,855✔
3853
                n_dont_close += n_fds;
11,855✔
3854
        }
3855

3856
        append_socket_pair(dont_close, &n_dont_close, runtime->ephemeral_storage_socket);
11,855✔
3857

3858
        if (runtime->shared) {
11,855✔
3859
                append_socket_pair(dont_close, &n_dont_close, runtime->shared->netns_storage_socket);
11,855✔
3860
                append_socket_pair(dont_close, &n_dont_close, runtime->shared->ipcns_storage_socket);
11,855✔
3861
        }
3862

3863
        if (runtime->dynamic_creds) {
11,855✔
3864
                if (runtime->dynamic_creds->user)
11,855✔
3865
                        append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->user->storage_socket);
62✔
3866
                if (runtime->dynamic_creds->group)
11,855✔
3867
                        append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->group->storage_socket);
62✔
3868
        }
3869

3870
        if (params->user_lookup_fd >= 0)
11,855✔
3871
                dont_close[n_dont_close++] = params->user_lookup_fd;
11,855✔
3872

3873
        if (params->handoff_timestamp_fd >= 0)
11,855✔
3874
                dont_close[n_dont_close++] = params->handoff_timestamp_fd;
11,855✔
3875

3876
        if (params->pidref_transport_fd >= 0)
11,855✔
3877
                dont_close[n_dont_close++] = params->pidref_transport_fd;
10,785✔
3878

3879
        assert(n_dont_close <= ELEMENTSOF(dont_close));
11,855✔
3880

3881
        return close_all_fds(dont_close, n_dont_close);
11,855✔
3882
}
3883

3884
static int send_user_lookup(
                const char *unit_id,
                int user_lookup_fd,
                uid_t uid,
                gid_t gid) {

        /* Reports the resolved UID/GID to PID 1 once we know it, as a single datagram that carries the
         * UID, the GID and the unit name, in that order. Nothing is sent if there is no fd to write to,
         * or if neither a valid UID nor a valid GID is available.
         *
         * Returns 0 on success or when nothing was sent, -errno if writev() fails. */

        assert(unit_id);

        if (user_lookup_fd < 0)
                return 0;

        if (!uid_is_valid(uid) && !gid_is_valid(gid))
                return 0;

        struct iovec iov[] = {
                IOVEC_MAKE(&uid, sizeof(uid)),
                IOVEC_MAKE(&gid, sizeof(gid)),
                IOVEC_MAKE_STRING(unit_id),
        };

        if (writev(user_lookup_fd, iov, 3) < 0)
                return -errno;

        return 0;
}
3911

3912
static int acquire_home(const ExecContext *c, const char **home, char **ret_buf) {
11,853✔
3913
        int r;
11,853✔
3914

3915
        assert(c);
11,853✔
3916
        assert(home);
11,853✔
3917
        assert(ret_buf);
11,853✔
3918

3919
        /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
3920

3921
        if (*home) /* Already acquired from get_fixed_user()? */
11,853✔
3922
                return 0;
3923

3924
        if (!c->working_directory_home)
9,172✔
3925
                return 0;
3926

3927
        if (c->dynamic_user || (c->user && is_this_me(c->user) <= 0))
×
3928
                return -EADDRNOTAVAIL;
×
3929

3930
        r = get_home_dir(ret_buf);
×
3931
        if (r < 0)
×
3932
                return r;
3933

3934
        *home = *ret_buf;
×
3935
        return 1;
×
3936
}
3937

3938
static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
62✔
3939
        _cleanup_strv_free_ char ** list = NULL;
62✔
3940
        int r;
62✔
3941

3942
        assert(c);
62✔
3943
        assert(p);
62✔
3944
        assert(ret);
62✔
3945

3946
        assert(c->dynamic_user);
62✔
3947

3948
        /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
3949
         * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
3950
         * directories. */
3951

3952
        for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
372✔
3953

3954
                if (!EXEC_DIRECTORY_TYPE_SHALL_CHOWN(t))
310✔
3955
                        continue;
62✔
3956

3957
                if (!p->prefix[t])
248✔
3958
                        continue;
×
3959

3960
                for (size_t i = 0; i < c->directories[t].n_items; i++) {
263✔
3961
                        char *e;
15✔
3962

3963
                        if (exec_directory_is_private(c, t))
15✔
3964
                                e = path_join(p->prefix[t], "private", c->directories[t].items[i].path);
13✔
3965
                        else
3966
                                e = path_join(p->prefix[t], c->directories[t].items[i].path);
2✔
3967
                        if (!e)
15✔
3968
                                return -ENOMEM;
3969

3970
                        r = strv_consume(&list, e);
15✔
3971
                        if (r < 0)
15✔
3972
                                return r;
3973
                }
3974
        }
3975

3976
        *ret = TAKE_PTR(list);
62✔
3977

3978
        return 0;
62✔
3979
}
3980

3981
static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
        /* Derives a CPU set from the NUMA policy of the context via numa_to_cpu_set() and stores it in
         * *ret.
         *
         * Returns 0 and leaves *ret untouched if no NUMA node mask is set. Otherwise returns the error
         * from numa_to_cpu_set(), or the result of cpu_set_add_all() after *ret was reset. */

        _cleanup_(cpu_set_reset) CPUSet derived = {};
        int r;

        assert(c);
        assert(ret);

        if (c->numa_policy.nodes.set) {
                r = numa_to_cpu_set(&c->numa_policy, &derived);
                if (r < 0)
                        return r;

                cpu_set_reset(ret);

                return cpu_set_add_all(ret, &derived);
        }

        log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
        return 0;
}
4001

4002
static int add_shifted_fd(int *fds, size_t fds_size, size_t *n_fds, int *fd) {
45,396✔
4003
        int r;
45,396✔
4004

4005
        assert(fds);
45,396✔
4006
        assert(n_fds);
45,396✔
4007
        assert(*n_fds < fds_size);
45,396✔
4008
        assert(fd);
45,396✔
4009

4010
        if (*fd < 0)
45,396✔
4011
               return 0;
45,396✔
4012

4013
        if (*fd < 3 + (int) *n_fds) {
22,082✔
4014
                /* Let's move the fd up, so that it's outside of the fd range we will use to store
4015
                 * the fds we pass to the process (or which are closed only during execve). */
4016

4017
                r = fcntl(*fd, F_DUPFD_CLOEXEC, 3 + (int) *n_fds);
9,831✔
4018
                if (r < 0)
9,831✔
4019
                        return -errno;
×
4020

4021
                close_and_replace(*fd, r);
9,831✔
4022
        }
4023

4024
        fds[(*n_fds)++] = *fd;
22,082✔
4025
        return 1;
22,082✔
4026
}
4027

4028
static int connect_unix_harder(const OpenFile *of, int ofd) {
1✔
4029
        static const int socket_types[] = { SOCK_DGRAM, SOCK_STREAM, SOCK_SEQPACKET };
1✔
4030

4031
        union sockaddr_union addr = {
1✔
4032
                .un.sun_family = AF_UNIX,
4033
        };
4034
        socklen_t sa_len;
1✔
4035
        int r;
1✔
4036

4037
        assert(of);
1✔
4038
        assert(ofd >= 0);
1✔
4039

4040
        r = sockaddr_un_set_path(&addr.un, FORMAT_PROC_FD_PATH(ofd));
1✔
4041
        if (r < 0)
1✔
4042
                return log_debug_errno(r, "Failed to set sockaddr for '%s': %m", of->path);
×
4043
        sa_len = r;
1✔
4044

4045
        FOREACH_ELEMENT(i, socket_types) {
2✔
4046
                _cleanup_close_ int fd = -EBADF;
2✔
4047

4048
                fd = socket(AF_UNIX, *i|SOCK_CLOEXEC, 0);
2✔
4049
                if (fd < 0)
2✔
4050
                        return log_debug_errno(errno, "Failed to create socket for '%s': %m", of->path);
×
4051

4052
                r = RET_NERRNO(connect(fd, &addr.sa, sa_len));
2✔
4053
                if (r >= 0)
1✔
4054
                        return TAKE_FD(fd);
1✔
4055
                if (r != -EPROTOTYPE)
1✔
4056
                        return log_debug_errno(r, "Failed to connect to socket for '%s': %m", of->path);
×
4057
        }
4058

4059
        return log_debug_errno(SYNTHETIC_ERRNO(EPROTOTYPE), "No suitable socket type to connect to socket '%s'.", of->path);
×
4060
}
4061

4062
static int get_open_file_fd(const OpenFile *of) {
5✔
4063
        _cleanup_close_ int fd = -EBADF, ofd = -EBADF;
5✔
4064
        struct stat st;
5✔
4065

4066
        assert(of);
5✔
4067

4068
        ofd = open(of->path, O_PATH | O_CLOEXEC);
5✔
4069
        if (ofd < 0)
5✔
4070
                return log_debug_errno(errno, "Failed to open '%s' as O_PATH: %m", of->path);
2✔
4071

4072
        if (fstat(ofd, &st) < 0)
3✔
4073
                return log_debug_errno( errno, "Failed to stat '%s': %m", of->path);
×
4074

4075
        if (S_ISSOCK(st.st_mode)) {
3✔
4076
                fd = connect_unix_harder(of, ofd);
1✔
4077
                if (fd < 0)
1✔
4078
                        return fd;
4079

4080
                if (FLAGS_SET(of->flags, OPENFILE_READ_ONLY) && shutdown(fd, SHUT_WR) < 0)
1✔
4081
                        return log_debug_errno(errno, "Failed to shutdown send for socket '%s': %m", of->path);
×
4082

4083
                log_debug("Opened socket '%s' as fd %d.", of->path, fd);
1✔
4084
        } else {
4085
                int flags = FLAGS_SET(of->flags, OPENFILE_READ_ONLY) ? O_RDONLY : O_RDWR;
2✔
4086
                if (FLAGS_SET(of->flags, OPENFILE_APPEND))
2✔
4087
                        flags |= O_APPEND;
×
4088
                else if (FLAGS_SET(of->flags, OPENFILE_TRUNCATE))
2✔
4089
                        flags |= O_TRUNC;
×
4090

4091
                fd = fd_reopen(ofd, flags|O_NOCTTY|O_CLOEXEC);
2✔
4092
                if (fd < 0)
2✔
4093
                        return log_debug_errno(fd, "Failed to reopen file '%s': %m", of->path);
×
4094

4095
                log_debug("Opened file '%s' as fd %d.", of->path, fd);
2✔
4096
        }
4097

4098
        return TAKE_FD(fd);
4099
}
4100

4101
static int collect_open_file_fds(ExecParameters *p, size_t *n_fds) {
        /* Opens every entry of p->open_files and appends the resulting fd to p->fds (at index *n_fds,
         * which is incremented) and its name to p->fd_names.
         *
         * Entries flagged OPENFILE_GRACEFUL that fail to open are logged and skipped. Any other failure
         * to open, and any allocation failure, aborts with a negative errno-style value. Returns 0 on
         * success. */

        assert(p);
        assert(n_fds);

        LIST_FOREACH(open_files, of, p->open_files) {
                _cleanup_close_ int fd = -EBADF;

                fd = get_open_file_fd(of);
                if (fd < 0) {
                        if (!FLAGS_SET(of->flags, OPENFILE_GRACEFUL))
                                return log_error_errno(fd, "Failed to get OpenFile= file descriptor for '%s': %m", of->path);

                        /* A missing file or a privilege error is only logged at debug level; everything
                         * else gets a warning. */
                        log_full_errno(fd == -ENOENT || ERRNO_IS_NEG_PRIVILEGE(fd) ? LOG_DEBUG : LOG_WARNING,
                                       fd,
                                       "Failed to get OpenFile= file descriptor for '%s', ignoring: %m",
                                       of->path);
                        continue;
                }

                if (!GREEDY_REALLOC(p->fds, *n_fds + 1))
                        return log_oom();

                if (strv_extend(&p->fd_names, of->fdname) < 0)
                        return log_oom();

                /* The array now owns the fd, hence disarm the cleanup handler. */
                p->fds[*n_fds] = TAKE_FD(fd);
                (*n_fds)++;
        }

        return 0;
}
4132

4133
static void log_command_line(
9,830✔
4134
                const ExecContext *context,
4135
                const ExecParameters *params,
4136
                const char *msg,
4137
                const char *executable,
4138
                char **argv) {
4139

4140
        assert(context);
9,830✔
4141
        assert(params);
9,830✔
4142
        assert(msg);
9,830✔
4143
        assert(executable);
9,830✔
4144

4145
        if (!DEBUG_LOGGING)
9,830✔
4146
                return;
9,830✔
4147

4148
        _cleanup_free_ char *cmdline = quote_command_line(argv, SHELL_ESCAPE_EMPTY);
19,014✔
4149

4150
        log_struct(LOG_DEBUG,
18,220✔
4151
                   LOG_ITEM("EXECUTABLE=%s", executable),
4152
                   LOG_EXEC_MESSAGE(params, "%s: %s", msg, strnull(cmdline)),
4153
                   LOG_EXEC_INVOCATION_ID(params));
4154
}
4155

4156
static bool exec_needs_cap_sys_admin(const ExecContext *context, const ExecParameters *params) {
1,663✔
4157
        assert(context);
1,663✔
4158

4159
        return context->private_users != PRIVATE_USERS_NO ||
3,313✔
4160
               context->private_tmp != PRIVATE_TMP_NO ||
1,650✔
4161
               context->private_devices ||
1,635✔
4162
               context->private_network ||
1,628✔
4163
               context->network_namespace_path ||
1,621✔
4164
               context->private_ipc ||
1,621✔
4165
               context->ipc_namespace_path ||
1,621✔
4166
               context->private_mounts > 0 ||
1,621✔
4167
               context->mount_apivfs > 0 ||
1,611✔
4168
               context->bind_log_sockets > 0 ||
1,611✔
4169
               context->n_bind_mounts > 0 ||
1,611✔
4170
               context->n_temporary_filesystems > 0 ||
1,606✔
4171
               context->root_directory ||
1,606✔
4172
               !strv_isempty(context->extension_directories) ||
1,606✔
4173
               context->protect_system != PROTECT_SYSTEM_NO ||
1,606✔
4174
               context->protect_home != PROTECT_HOME_NO ||
3,197✔
4175
               exec_needs_pid_namespace(context, params) ||
1,591✔
4176
               context->protect_kernel_tunables ||
1,571✔
4177
               context->protect_kernel_modules ||
1,566✔
4178
               context->protect_kernel_logs ||
3,122✔
4179
               exec_needs_cgroup_mount(context) ||
1,561✔
4180
               context->protect_clock ||
1,561✔
4181
               context->protect_hostname != PROTECT_HOSTNAME_NO ||
1,556✔
4182
               !strv_isempty(context->read_write_paths) ||
1,551✔
4183
               !strv_isempty(context->read_only_paths) ||
1,536✔
4184
               !strv_isempty(context->inaccessible_paths) ||
1,536✔
4185
               !strv_isempty(context->exec_paths) ||
1,536✔
4186
               !strv_isempty(context->no_exec_paths) ||
3,199✔
4187
               context->delegate_namespaces != NAMESPACE_FLAGS_INITIAL;
1,536✔
4188
}
4189

4190
static PrivateUsers exec_context_get_effective_private_users(
9,840✔
4191
                const ExecContext *context,
4192
                const ExecParameters *params) {
4193

4194
        assert(context);
9,840✔
4195
        assert(params);
9,840✔
4196

4197
        if (context->private_users != PRIVATE_USERS_NO)
9,840✔
4198
                return context->private_users;
4199

4200
        /* If any namespace is delegated with DelegateNamespaces=, always set up a user namespace. */
4201
        if (context->delegate_namespaces != NAMESPACE_FLAGS_INITIAL)
9,814✔
4202
                return PRIVATE_USERS_SELF;
3✔
4203

4204
        return PRIVATE_USERS_NO;
4205
}
4206

4207
static bool exec_namespace_is_delegated(
23,966✔
4208
                const ExecContext *context,
4209
                const ExecParameters *params,
4210
                bool have_cap_sys_admin,
4211
                unsigned long namespace) {
4212

4213
        assert(context);
23,966✔
4214
        assert(params);
23,966✔
4215
        assert(namespace != CLONE_NEWUSER);
23,966✔
4216

4217
        /* If we need unprivileged private users, we've already unshared a user namespace by the time we call
4218
         * setup_delegated_namespaces() for the first time so let's make sure we do all other namespace
4219
         * unsharing in the first call to setup_delegated_namespaces() by returning false here. */
4220
        if (!have_cap_sys_admin && exec_needs_cap_sys_admin(context, params))
23,966✔
4221
                return false;
4222

4223
        if (context->delegate_namespaces == NAMESPACE_FLAGS_INITIAL)
23,865✔
4224
                return params->runtime_scope == RUNTIME_SCOPE_USER;
23,797✔
4225

4226
        if (FLAGS_SET(context->delegate_namespaces, namespace))
68✔
4227
                return true;
4228

4229
        /* Various namespaces imply mountns for private procfs/sysfs/cgroupfs instances, which means when
4230
         * those are delegated mountns must be deferred too.
4231
         *
4232
         * The list should stay in sync with exec_needs_mount_namespace(). */
4233
        if (namespace == CLONE_NEWNS)
16✔
4234
                return context->delegate_namespaces & (CLONE_NEWPID|CLONE_NEWCGROUP|CLONE_NEWNET);
4✔
4235

4236
        return false;
4237
}
4238

4239
static int setup_delegated_namespaces(
19,692✔
4240
                const ExecContext *context,
4241
                ExecParameters *params,
4242
                ExecRuntime *runtime,
4243
                bool delegate,
4244
                const char *memory_pressure_path,
4245
                uid_t uid,
4246
                uid_t gid,
4247
                const ExecCommand *command,
4248
                bool needs_sandboxing,
4249
                bool have_cap_sys_admin,
4250
                int *reterr_exit_status) {
4251

4252
        int r;
19,692✔
4253

4254
        /* This function is called twice, once before unsharing the user namespace, and once after unsharing
4255
         * the user namespace. When called before unsharing the user namespace, "delegate" is set to "false".
4256
         * When called after unsharing the user namespace, "delegate" is set to "true". The net effect is
4257
         * that all namespaces that should not be delegated are unshared when this function is called the
4258
         * first time and all namespaces that should be delegated are unshared when this function is called
4259
         * the second time. */
4260

4261
        assert(context);
19,692✔
4262
        assert(params);
19,692✔
4263
        assert(runtime);
19,692✔
4264
        assert(reterr_exit_status);
19,692✔
4265

4266
        if (exec_needs_network_namespace(context) &&
19,819✔
4267
            exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWNET) == delegate &&
127✔
4268
            runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
64✔
4269

4270
                /* Try to enable network namespacing if network namespacing is available and we have
4271
                 * CAP_NET_ADMIN in the current user namespace (either the system manager one or the unit's
4272
                 * own user namespace). We need CAP_NET_ADMIN to be able to configure the loopback device in
4273
                 * the new network namespace. And if we don't have that, then we could only create a network
4274
                 * namespace without the ability to set up "lo". Hence gracefully skip things then. */
4275
                if (namespace_type_supported(NAMESPACE_NET) && have_effective_cap(CAP_NET_ADMIN) > 0) {
64✔
4276
                        r = setup_shareable_ns(runtime->shared->netns_storage_socket, CLONE_NEWNET);
64✔
4277
                        if (ERRNO_IS_NEG_PRIVILEGE(r))
64✔
4278
                                log_notice_errno(r, "PrivateNetwork=yes is configured, but network namespace setup not permitted, proceeding without: %m");
×
4279
                        else if (r < 0) {
64✔
4280
                                *reterr_exit_status = EXIT_NETWORK;
×
4281
                                return log_error_errno(r, "Failed to set up network namespacing: %m");
×
4282
                        } else
4283
                                log_debug("Set up %snetwork namespace", delegate ? "delegated " : "");
123✔
4284
                } else if (context->network_namespace_path) {
×
4285
                        *reterr_exit_status = EXIT_NETWORK;
×
4286
                        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "NetworkNamespacePath= is not supported, refusing.");
×
4287
                } else
4288
                        log_notice("PrivateNetwork=yes is configured, but the kernel does not support or we lack privileges for network namespace, proceeding without.");
×
4289
        }
4290

4291
        if (exec_needs_ipc_namespace(context) &&
19,703✔
4292
            exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWIPC) == delegate &&
11✔
4293
            runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
6✔
4294

4295
                if (namespace_type_supported(NAMESPACE_IPC)) {
6✔
4296
                        r = setup_shareable_ns(runtime->shared->ipcns_storage_socket, CLONE_NEWIPC);
6✔
4297
                        if (ERRNO_IS_NEG_PRIVILEGE(r))
6✔
4298
                                log_warning_errno(r, "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
×
4299
                        else if (r < 0) {
6✔
4300
                                *reterr_exit_status = EXIT_NAMESPACE;
×
4301
                                return log_error_errno(r, "Failed to set up IPC namespacing: %m");
×
4302
                        } else
4303
                                log_debug("Set up %sIPC namespace", delegate ? "delegated " : "");
8✔
4304
                } else if (context->ipc_namespace_path) {
×
4305
                        *reterr_exit_status = EXIT_NAMESPACE;
×
4306
                        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "IPCNamespacePath= is not supported, refusing.");
×
4307
                } else
4308
                        log_warning("PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
×
4309
        }
4310

4311
        if (needs_sandboxing && exec_needs_cgroup_namespace(context) &&
19,725✔
4312
            exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWCGROUP) == delegate) {
33✔
4313
                if (unshare(CLONE_NEWCGROUP) < 0) {
17✔
4314
                        *reterr_exit_status = EXIT_NAMESPACE;
×
4315
                        return log_error_errno(errno, "Failed to set up cgroup namespacing: %m");
×
4316
                }
4317

4318
                log_debug("Set up %scgroup namespace", delegate ? "delegated " : "");
30✔
4319
        }
4320

4321
        /* Unshare a new PID namespace before setting up mounts to ensure /proc/ is mounted with only processes in PID namespace visible.
4322
         * Note PrivatePIDs=yes implies MountAPIVFS=yes so we'll always ensure procfs is remounted. */
4323
        if (needs_sandboxing && exec_needs_pid_namespace(context, params) &&
19,716✔
4324
            exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWPID) == delegate) {
24✔
4325
                if (params->pidref_transport_fd < 0) {
14✔
4326
                        *reterr_exit_status = EXIT_NAMESPACE;
×
4327
                        return log_error_errno(SYNTHETIC_ERRNO(ENOTCONN), "PidRef socket is not set up: %m");
×
4328
                }
4329

4330
                /* If we had CAP_SYS_ADMIN prior to joining the user namespace, then we are privileged and don't need
4331
                 * to check if we can mount /proc/.
4332
                 *
4333
                 * We need to check prior to entering the user namespace because if we're running unprivileged or in a
4334
                 * system without CAP_SYS_ADMIN, then we can have CAP_SYS_ADMIN in the current user namespace but not
4335
                 * once we unshare a mount namespace. */
4336
                if (!have_cap_sys_admin || delegate) {
14✔
4337
                        r = can_mount_proc();
9✔
4338
                        if (r < 0) {
5✔
4339
                                *reterr_exit_status = EXIT_NAMESPACE;
×
4340
                                return log_error_errno(r, "Failed to detect if /proc/ can be remounted: %m");
×
4341
                        }
4342
                        if (r == 0) {
5✔
4343
                                *reterr_exit_status = EXIT_NAMESPACE;
1✔
4344
                                return log_error_errno(SYNTHETIC_ERRNO(EPERM),
1✔
4345
                                                       "PrivatePIDs=yes is configured, but /proc/ cannot be re-mounted due to lack of privileges, refusing.");
4346
                        }
4347
                }
4348

4349
                r = setup_private_pids(context, params);
9✔
4350
                if (r < 0) {
6✔
4351
                        *reterr_exit_status = EXIT_NAMESPACE;
×
4352
                        return log_error_errno(r, "Failed to set up pid namespace: %m");
×
4353
                }
4354

4355
                log_debug("Set up %spid namespace", delegate ? "delegated " : "");
12✔
4356
        }
4357

4358
        /* If PrivatePIDs=yes is configured, we're now running as pid 1 in a pid namespace! */
4359

4360
        if (exec_needs_mount_namespace(context, params, runtime) &&
23,786✔
4361
            exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWNS) == delegate) {
4,102✔
4362
                _cleanup_free_ char *error_path = NULL;
2,057✔
4363

4364
                r = apply_mount_namespace(command->flags,
2,057✔
4365
                                          context,
4366
                                          params,
4367
                                          runtime,
4368
                                          memory_pressure_path,
4369
                                          needs_sandboxing,
4370
                                          &error_path,
4371
                                          uid,
4372
                                          gid);
4373
                if (r < 0) {
2,057✔
4374
                        *reterr_exit_status = EXIT_NAMESPACE;
15✔
4375
                        return log_error_errno(r, "Failed to set up mount namespacing%s%s: %m",
29✔
4376
                                               error_path ? ": " : "", strempty(error_path));
4377
                }
4378

4379
                log_debug("Set up %smount namespace", delegate ? "delegated " : "");
4,056✔
4380
        }
4381

4382
        if (needs_sandboxing &&
39,338✔
4383
            exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWUTS) == delegate) {
19,669✔
4384
                r = apply_protect_hostname(context, params, reterr_exit_status);
9,833✔
4385
                if (r < 0)
9,833✔
4386
                        return r;
4387
                if (r > 0)
9,833✔
4388
                        log_debug("Set up %sUTS namespace", delegate ? "delegated " : "");
1,359✔
4389
        }
4390

4391
        return 0;
4392
}
4393

4394
static bool exec_context_shall_confirm_spawn(const ExecContext *context) {
×
4395
        assert(context);
×
4396

4397
        if (confirm_spawn_disabled())
×
4398
                return false;
4399

4400
        /* For some reasons units remaining in the same process group
4401
         * as PID 1 fail to acquire the console even if it's not used
4402
         * by any process. So skip the confirmation question for them. */
4403
        return !context->same_pgrp;
×
4404
}
4405

4406
static int exec_context_named_iofds(
11,856✔
4407
                const ExecContext *c,
4408
                const ExecParameters *p,
4409
                int named_iofds[static 3]) {
4410

4411
        size_t targets;
11,856✔
4412
        const char* stdio_fdname[3];
11,856✔
4413
        size_t n_fds;
11,856✔
4414

4415
        assert(c);
11,856✔
4416
        assert(p);
11,856✔
4417
        assert(named_iofds);
11,856✔
4418

4419
        targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
11,856✔
4420
                  (c->std_output == EXEC_OUTPUT_NAMED_FD) +
11,856✔
4421
                  (c->std_error == EXEC_OUTPUT_NAMED_FD);
11,856✔
4422

4423
        for (size_t i = 0; i < 3; i++)
47,424✔
4424
                stdio_fdname[i] = exec_context_fdname(c, i);
35,568✔
4425

4426
        n_fds = p->n_storage_fds + p->n_socket_fds + p->n_extra_fds;
11,856✔
4427

4428
        for (size_t i = 0; i < n_fds  && targets > 0; i++)
11,856✔
4429
                if (named_iofds[STDIN_FILENO] < 0 &&
×
4430
                    c->std_input == EXEC_INPUT_NAMED_FD &&
×
4431
                    stdio_fdname[STDIN_FILENO] &&
×
4432
                    streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
×
4433

4434
                        named_iofds[STDIN_FILENO] = p->fds[i];
×
4435
                        targets--;
×
4436

4437
                } else if (named_iofds[STDOUT_FILENO] < 0 &&
×
4438
                           c->std_output == EXEC_OUTPUT_NAMED_FD &&
×
4439
                           stdio_fdname[STDOUT_FILENO] &&
×
4440
                           streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
×
4441

4442
                        named_iofds[STDOUT_FILENO] = p->fds[i];
×
4443
                        targets--;
×
4444

4445
                } else if (named_iofds[STDERR_FILENO] < 0 &&
×
4446
                           c->std_error == EXEC_OUTPUT_NAMED_FD &&
×
4447
                           stdio_fdname[STDERR_FILENO] &&
×
4448
                           streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
×
4449

4450
                        named_iofds[STDERR_FILENO] = p->fds[i];
×
4451
                        targets--;
×
4452
                }
4453

4454
        return targets == 0 ? 0 : -ENOENT;
11,856✔
4455
}
4456

4457
static void exec_shared_runtime_close(ExecSharedRuntime *shared) {
        /* Closes both ends of the network and IPC namespace storage socket pairs of the shared runtime.
         * A NULL pointer is accepted and ignored. */
        if (shared) {
                safe_close_pair(shared->netns_storage_socket);
                safe_close_pair(shared->ipcns_storage_socket);
        }
}
4464

4465
static void exec_runtime_close(ExecRuntime *rt) {
        /* Closes the ephemeral storage socket pair of the runtime object, then hands the shared runtime
         * and the dynamic credentials over to their respective close helpers. A NULL pointer is accepted
         * and ignored. */
        if (rt) {
                safe_close_pair(rt->ephemeral_storage_socket);

                exec_shared_runtime_close(rt->shared);
                dynamic_creds_close(rt->dynamic_creds);
        }
}
4474

4475
static void exec_params_close(ExecParameters *p) {
        /* Closes the stdin/stdout/stderr fds carried in the parameters and stores whatever safe_close()
         * returns back into the respective field. A NULL pointer is accepted and ignored. */
        if (!p)
                return;

        int *stdio_fds[] = { &p->stdin_fd, &p->stdout_fd, &p->stderr_fd };

        for (size_t k = 0; k < ELEMENTSOF(stdio_fds); k++)
                *stdio_fds[k] = safe_close(*stdio_fds[k]);
}
4483

4484
static int exec_fd_mark_hot(
9,833✔
4485
                const ExecContext *c,
4486
                ExecParameters *p,
4487
                bool hot,
4488
                int *reterr_exit_status) {
4489

4490
        assert(c);
9,833✔
4491
        assert(p);
9,833✔
4492

4493
        if (p->exec_fd < 0)
9,833✔
4494
                return 0;
9,833✔
4495

4496
        uint8_t x = hot;
287✔
4497

4498
        if (write(p->exec_fd, &x, sizeof(x)) < 0) {
287✔
4499
                if (reterr_exit_status)
×
4500
                        *reterr_exit_status = EXIT_EXEC;
×
4501
                return log_error_errno(errno, "Failed to mark exec_fd as %s: %m", hot ? "hot" : "cold");
×
4502
        }
4503

4504
        return 1;
4505
}
4506

4507
static int send_handoff_timestamp(
9,830✔
4508
                const ExecContext *c,
4509
                ExecParameters *p,
4510
                int *reterr_exit_status) {
4511

4512
        assert(c);
9,830✔
4513
        assert(p);
9,830✔
4514

4515
        if (p->handoff_timestamp_fd < 0)
9,830✔
4516
                return 0;
9,830✔
4517

4518
        dual_timestamp dt;
9,830✔
4519
        dual_timestamp_now(&dt);
9,830✔
4520

4521
        if (write(p->handoff_timestamp_fd, (const usec_t[2]) { dt.realtime, dt.monotonic }, sizeof(usec_t) * 2) < 0) {
9,830✔
4522
                if (reterr_exit_status)
×
4523
                        *reterr_exit_status = EXIT_EXEC;
×
4524
                return log_error_errno(errno, "Failed to send handoff timestamp: %m");
×
4525
        }
4526

4527
        return 1;
9,830✔
4528
}
4529

4530
static void prepare_terminal(
11,853✔
4531
                const ExecContext *context,
4532
                ExecParameters *p) {
4533

4534
        _cleanup_close_ int lock_fd = -EBADF;
11,853✔
4535

4536
        /* This is the "constructive" reset, i.e. is about preparing things for our invocation rather than
4537
         * cleaning up things from older invocations. */
4538

4539
        assert(context);
11,853✔
4540
        assert(p);
11,853✔
4541

4542
        /* We only try to reset things if we there's the chance our stdout points to a TTY */
4543
        if (!(is_terminal_output(context->std_output) ||
11,853✔
4544
              (context->std_output == EXEC_OUTPUT_INHERIT && is_terminal_input(context->std_input)) ||
11,232✔
4545
              context->std_output == EXEC_OUTPUT_NAMED_FD ||
4546
              p->stdout_fd >= 0))
11,232✔
4547
                return;
10,682✔
4548

4549
        /* Let's explicitly determine whether to reset via ANSI sequences or not, taking our ExecContext
4550
         * information into account */
4551
        bool use_ansi = exec_context_shall_ansi_seq_reset(context);
1,171✔
4552

4553
        if (context->tty_reset) {
1,171✔
4554
                /* When we are resetting the TTY, then let's create a lock first, to synchronize access. This
4555
                 * in particular matters as concurrent resets and the TTY size ANSI DSR logic done by the
4556
                 * exec_context_apply_tty_size() below might interfere */
4557
                lock_fd = lock_dev_console();
157✔
4558
                if (lock_fd < 0)
157✔
4559
                        log_debug_errno(lock_fd, "Failed to lock /dev/console, ignoring: %m");
×
4560

4561
                /* We explicitly control whether to send ansi sequences or not here, since we want to consult
4562
                 * the env vars explicitly configured in the ExecContext, rather than our own environment
4563
                 * block. */
4564
                (void) terminal_reset_defensive(STDOUT_FILENO, use_ansi ? TERMINAL_RESET_FORCE_ANSI_SEQ : TERMINAL_RESET_AVOID_ANSI_SEQ);
160✔
4565
        }
4566

4567
        (void) exec_context_apply_tty_size(context, STDIN_FILENO, STDOUT_FILENO, /* tty_path= */ NULL);
1,171✔
4568

4569
        if (use_ansi)
1,171✔
4570
                (void) osc_context_open_service(p->unit_id, p->invocation_id, /* ret_seq= */ NULL);
154✔
4571
}
4572

4573
static int setup_term_environment(const ExecContext *context, char ***env) {
9,856✔
4574
        int r;
9,856✔
4575

4576
        assert(context);
9,856✔
4577
        assert(env);
9,856✔
4578

4579
        /* Already specified by user? */
4580
        if (strv_env_get(*env, "TERM"))
9,856✔
4581
                return 0;
4582

4583
        /* Do we need $TERM at all? */
4584
        if (!is_terminal_input(context->std_input) &&
9,709✔
4585
            !is_terminal_output(context->std_output) &&
9,534✔
4586
            !is_terminal_output(context->std_error) &&
9,269✔
4587
            !context->tty_path)
9,268✔
4588
                return 0;
4589

4590
        const char *tty_path = exec_context_tty_path(context);
443✔
4591
        if (tty_path) {
443✔
4592
                /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
4593
                 * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
4594
                 * container manager passes to PID 1 ends up all the way in the console login shown.
4595
                 *
4596
                 * Note that if this doesn't work out we won't bother with querying systemd.tty.term.console
4597
                 * kernel cmdline option or DCS anymore either, because pid1 also imports $TERM based on those
4598
                 * and it should have showed up as our $TERM if there were anything. */
4599
                if (tty_is_console(tty_path) && getppid() == 1) {
441✔
4600
                        const char *term = strv_find_prefix(environ, "TERM=");
398✔
4601
                        if (term) {
398✔
4602
                                r = strv_env_replace_strdup(env, term);
398✔
4603
                                if (r < 0)
398✔
4604
                                        return r;
4605

4606
                                FOREACH_STRING(i, "COLORTERM=", "NO_COLOR=") {
1,194✔
4607
                                        const char *s = strv_find_prefix(environ, i);
796✔
4608
                                        if (!s)
796✔
4609
                                                continue;
796✔
4610

4611
                                        r = strv_env_replace_strdup(env, s);
×
4612
                                        if (r < 0)
×
4613
                                                return r;
×
4614
                                }
4615

4616
                                return 1;
398✔
4617
                        }
4618

4619
                } else {
4620
                        if (in_charset(skip_dev_prefix(tty_path), ALPHANUMERICAL)) {
43✔
4621
                                _cleanup_free_ char *key = NULL, *cmdline = NULL;
43✔
4622

4623
                                key = strjoin("systemd.tty.term.", skip_dev_prefix(tty_path));
43✔
4624
                                if (!key)
43✔
4625
                                        return -ENOMEM;
4626

4627
                                r = proc_cmdline_get_key(key, /* flags = */ 0, &cmdline);
43✔
4628
                                if (r > 0)
43✔
4629
                                        return strv_env_assign(env, "TERM", cmdline);
×
4630
                                if (r < 0)
43✔
4631
                                        log_debug_errno(r, "Failed to read '%s' from kernel cmdline, ignoring: %m", key);
43✔
4632
                        }
4633

4634
                        /* This handles real virtual terminals (returning "linux") and
4635
                         * any terminals which support the DCS +q query sequence. */
4636
                        _cleanup_free_ char *dcs_term = NULL;
43✔
4637
                        r = query_term_for_tty(tty_path, &dcs_term);
43✔
4638
                        if (r >= 0)
43✔
4639
                                return strv_env_assign(env, "TERM", dcs_term);
43✔
4640
                }
4641
        }
4642

4643
        /* If $TERM is not known and we pick a fallback default, then let's also set
4644
         * $COLORTERM=truecolor. That's because our fallback default is vt220, which is
4645
         * generally a safe bet (as it supports PageUp/PageDown unlike vt100, and is quite
4646
         * universally available in terminfo/termcap), except for the fact that real DEC
4647
         * vt220 gear never actually supported color. Most tools these days generate color on
4648
         * vt220 anyway, ignoring the physical capabilities of the real hardware, but some
4649
         * tools actually believe in the historical truth. Which is unfortunate since *we*
4650
         * *don't* care about the historical truth, we just want sane defaults if nothing
4651
         * better is explicitly configured. It's 2025 after all, at the time of writing,
4652
         * pretty much all terminal emulators actually *do* support color, hence if we don't
4653
         * know any better let's explicitly claim color support via $COLORTERM. Or in other
4654
         * words: we now explicitly claim to be connected to a franken-vt220 with true color
4655
         * support. */
4656
        r = strv_env_replace_strdup(env, "COLORTERM=truecolor");
2✔
4657
        if (r < 0)
2✔
4658
                return r;
4659

4660
        return strv_env_replace_strdup(env, "TERM=" FALLBACK_TERM);
2✔
4661
}
4662

4663
int exec_invoke(
11,856✔
4664
                const ExecCommand *command,
4665
                const ExecContext *context,
4666
                ExecParameters *params,
4667
                ExecRuntime *runtime,
4668
                const CGroupContext *cgroup_context,
4669
                int *exit_status) {
11,856✔
4670

4671
        _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **joined_exec_search_path = NULL, **accum_env = NULL;
28✔
4672
        int r;
11,856✔
4673
        const char *username = NULL, *groupname = NULL;
11,856✔
4674
        _cleanup_free_ char *home_buffer = NULL, *memory_pressure_path = NULL, *own_user = NULL;
×
4675
        const char *pwent_home = NULL, *shell = NULL;
11,856✔
4676
        dev_t journal_stream_dev = 0;
11,856✔
4677
        ino_t journal_stream_ino = 0;
11,856✔
4678
        bool needs_sandboxing,          /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
11,856✔
4679
                needs_setuid,           /* Do we need to do the actual setresuid()/setresgid() calls? */
4680
                needs_mount_namespace,  /* Do we need to set up a mount namespace for this kernel? */
4681
                have_cap_sys_admin,
4682
                userns_set_up = false,
11,856✔
4683
                keep_seccomp_privileges = false;
11,856✔
4684
#if HAVE_SELINUX
4685
        _cleanup_free_ char *mac_selinux_context_net = NULL;
4686
        bool use_selinux = false;
4687
#endif
4688
#if ENABLE_SMACK
4689
        bool use_smack = false;
11,856✔
4690
#endif
4691
#if HAVE_APPARMOR
4692
        bool use_apparmor = false;
4693
#endif
4694
#if HAVE_SECCOMP
4695
        uint64_t saved_bset = 0;
11,856✔
4696
#endif
4697
        uid_t saved_uid = getuid();
11,856✔
4698
        gid_t saved_gid = getgid();
11,856✔
4699
        uid_t uid = UID_INVALID;
11,856✔
4700
        gid_t gid = GID_INVALID;
11,856✔
4701
        size_t n_fds, /* fds to pass to the child */
11,856✔
4702
               n_keep_fds; /* total number of fds not to close */
4703
        int secure_bits;
11,856✔
4704
        _cleanup_free_ gid_t *gids = NULL, *gids_after_pam = NULL;
28✔
4705
        int ngids = 0, ngids_after_pam = 0;
11,856✔
4706
        int socket_fd = -EBADF, named_iofds[3] = EBADF_TRIPLET;
11,856✔
4707
        size_t n_storage_fds, n_socket_fds, n_extra_fds;
11,856✔
4708

4709
        assert(command);
11,856✔
4710
        assert(context);
11,856✔
4711
        assert(params);
11,856✔
4712
        assert(runtime);
11,856✔
4713
        assert(cgroup_context);
11,856✔
4714
        assert(exit_status);
11,856✔
4715

4716
        LOG_CONTEXT_PUSH_EXEC(context, params);
33,998✔
4717

4718
        /* Explicitly test for CVE-2021-4034 inspired invocations */
4719
        if (!command->path || strv_isempty(command->argv)) {
11,856✔
4720
                *exit_status = EXIT_EXEC;
×
4721
                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid command line arguments.");
×
4722
        }
4723

4724
        if (context->std_input == EXEC_INPUT_SOCKET ||
11,856✔
4725
            context->std_output == EXEC_OUTPUT_SOCKET ||
11,845✔
4726
            context->std_error == EXEC_OUTPUT_SOCKET) {
11,839✔
4727

4728
                if (params->n_socket_fds > 1)
17✔
4729
                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Got more than one socket.");
×
4730

4731
                if (params->n_socket_fds == 0)
17✔
4732
                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Got no socket.");
×
4733

4734
                socket_fd = params->fds[0];
17✔
4735
                n_storage_fds = n_socket_fds = n_extra_fds = 0;
17✔
4736
        } else {
4737
                n_socket_fds = params->n_socket_fds;
11,839✔
4738
                n_storage_fds = params->n_storage_fds;
11,839✔
4739
                n_extra_fds = params->n_extra_fds;
11,839✔
4740
        }
4741
        n_fds = n_socket_fds + n_storage_fds + n_extra_fds;
11,856✔
4742

4743
        r = exec_context_named_iofds(context, params, named_iofds);
11,856✔
4744
        if (r < 0)
11,856✔
4745
                return log_error_errno(r, "Failed to load a named file descriptor: %m");
×
4746

4747
        rename_process_from_path(command->path);
11,856✔
4748

4749
        /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main
4750
         * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially,
4751
         * both of which will be demoted to SIG_DFL. */
4752
        (void) default_signals(SIGNALS_CRASH_HANDLER,
11,856✔
4753
                               SIGNALS_IGNORE);
4754

4755
        if (context->ignore_sigpipe)
11,856✔
4756
                (void) ignore_signals(SIGPIPE);
11,493✔
4757

4758
        r = reset_signal_mask();
11,856✔
4759
        if (r < 0) {
11,856✔
4760
                *exit_status = EXIT_SIGNAL_MASK;
×
4761
                return log_error_errno(r, "Failed to set process signal mask: %m");
×
4762
        }
4763

4764
        if (params->idle_pipe)
11,856✔
4765
                do_idle_pipe_dance(params->idle_pipe);
154✔
4766

4767
        /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
4768
         * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
4769
         * any fds open we don't really want open during the transition. In order to make logging work, we switch the
4770
         * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
4771

4772
        log_forget_fds();
11,856✔
4773
        log_set_open_when_needed(true);
11,856✔
4774
        log_settle_target();
11,856✔
4775

4776
        /* In case anything used libc syslog(), close this here, too */
4777
        closelog();
11,856✔
4778

4779
        r = collect_open_file_fds(params, &n_fds);
11,856✔
4780
        if (r < 0) {
11,856✔
4781
                *exit_status = EXIT_FDS;
1✔
4782
                return log_error_errno(r, "Failed to get OpenFile= file descriptors: %m");
1✔
4783
        }
4784

4785
        int keep_fds[n_fds + 4];
11,855✔
4786
        memcpy_safe(keep_fds, params->fds, n_fds * sizeof(int));
11,855✔
4787
        n_keep_fds = n_fds;
11,855✔
4788

4789
        r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, &params->exec_fd);
11,855✔
4790
        if (r < 0) {
11,855✔
4791
                *exit_status = EXIT_FDS;
×
4792
                return log_error_errno(r, "Failed to collect shifted fd: %m");
×
4793
        }
4794

4795
        r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, &params->handoff_timestamp_fd);
11,855✔
4796
        if (r < 0) {
11,855✔
4797
                *exit_status = EXIT_FDS;
×
4798
                return log_error_errno(r, "Failed to collect shifted fd: %m");
×
4799
        }
4800

4801
#if HAVE_LIBBPF
4802
        r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, &params->bpf_restrict_fs_map_fd);
11,855✔
4803
        if (r < 0) {
11,855✔
4804
                *exit_status = EXIT_FDS;
×
4805
                return log_error_errno(r, "Failed to collect shifted fd: %m");
×
4806
        }
4807
#endif
4808

4809
        r = close_remaining_fds(params, runtime, socket_fd, keep_fds, n_keep_fds);
11,855✔
4810
        if (r < 0) {
11,855✔
4811
                *exit_status = EXIT_FDS;
×
4812
                return log_error_errno(r, "Failed to close unwanted file descriptors: %m");
×
4813
        }
4814

4815
        if (!context->same_pgrp &&
22,824✔
4816
            setsid() < 0) {
10,969✔
4817
                *exit_status = EXIT_SETSID;
×
4818
                return log_error_errno(errno, "Failed to create new process session: %m");
×
4819
        }
4820

4821
        /* Now, reset the TTY associated to this service "destructively" (i.e. possibly even hang up or
4822
         * disallocate the VT), to get rid of any prior uses of the device. Note that we do not keep any fd
4823
         * open here, hence some of the settings made here might vanish again, depending on the TTY driver
4824
         * used. A 2nd ("constructive") initialization after we opened the input/output fds we actually want
4825
         * will fix this. Note that we pass a NULL invocation ID here – as exec_context_tty_reset() expects
4826
         * the invocation ID associated with the OSC 3008 context ID to close. But we don't want to close any
4827
         * OSC 3008 context here, and opening a fresh OSC 3008 context happens a bit further down. */
4828
        exec_context_tty_reset(context, params, /* invocation_id= */ SD_ID128_NULL);
11,855✔
4829

4830
        if (params->shall_confirm_spawn && exec_context_shall_confirm_spawn(context)) {
11,855✔
4831
                _cleanup_free_ char *cmdline = NULL;
×
4832

4833
                cmdline = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY);
×
4834
                if (!cmdline) {
×
4835
                        *exit_status = EXIT_MEMORY;
×
4836
                        return log_oom();
×
4837
                }
4838

4839
                r = ask_for_confirmation(context, params, cmdline);
×
4840
                if (r != CONFIRM_EXECUTE) {
×
4841
                        if (r == CONFIRM_PRETEND_SUCCESS) {
×
4842
                                *exit_status = EXIT_SUCCESS;
×
4843
                                return 0;
×
4844
                        }
4845

4846
                        *exit_status = EXIT_CONFIRM;
×
4847
                        return log_error_errno(SYNTHETIC_ERRNO(ECANCELED), "Execution cancelled by the user.");
×
4848
                }
4849
        }
4850

4851
        /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
4852
         * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
4853
         * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
4854
         * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
4855
         * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
4856
        if (setenv("SYSTEMD_ACTIVATION_UNIT", params->unit_id, true) != 0 ||
23,710✔
4857
            setenv("SYSTEMD_ACTIVATION_SCOPE", runtime_scope_to_string(params->runtime_scope), true) != 0) {
11,855✔
4858
                *exit_status = EXIT_MEMORY;
×
4859
                return log_error_errno(errno, "Failed to update environment: %m");
×
4860
        }
4861

4862
        if (context->dynamic_user && runtime->dynamic_creds) {
11,917✔
4863
                _cleanup_strv_free_ char **suggested_paths = NULL;
62✔
4864

4865
                /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
4866
                 * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */
4867
                if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
62✔
4868
                        *exit_status = EXIT_USER;
×
4869
                        return log_error_errno(errno, "Failed to update environment: %m");
×
4870
                }
4871

4872
                r = compile_suggested_paths(context, params, &suggested_paths);
62✔
4873
                if (r < 0) {
62✔
4874
                        *exit_status = EXIT_MEMORY;
×
4875
                        return log_oom();
×
4876
                }
4877

4878
                r = dynamic_creds_realize(runtime->dynamic_creds, suggested_paths, &uid, &gid);
62✔
4879
                if (r < 0) {
62✔
4880
                        *exit_status = EXIT_USER;
×
4881
                        if (r == -EILSEQ)
×
4882
                                return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
×
4883
                                                       "Failed to update dynamic user credentials: User or group with specified name already exists.");
4884
                        return log_error_errno(r, "Failed to update dynamic user credentials: %m");
×
4885
                }
4886

4887
                if (!uid_is_valid(uid)) {
62✔
4888
                        *exit_status = EXIT_USER;
×
4889
                        return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "UID validation failed for \""UID_FMT"\".", uid);
×
4890
                }
4891

4892
                if (!gid_is_valid(gid)) {
62✔
4893
                        *exit_status = EXIT_USER;
×
4894
                        return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "GID validation failed for \""GID_FMT"\".", gid);
×
4895
                }
4896

4897
                if (runtime->dynamic_creds->user)
62✔
4898
                        username = runtime->dynamic_creds->user->name;
62✔
4899

4900
        } else {
4901
                const char *u;
11,793✔
4902

4903
                if (context->user)
11,793✔
4904
                        u = context->user;
4905
                else if (context->pam_name || FLAGS_SET(command->flags, EXEC_COMMAND_VIA_SHELL)) {
9,175✔
4906
                        /* If PAM is enabled or the command shall be invoked via a shell, but no user name is explicitly selected, then use our own one. */
4907
                        own_user = getusername_malloc();
65✔
4908
                        if (!own_user) {
65✔
4909
                                *exit_status = EXIT_USER;
×
4910
                                return log_error_errno(r, "Failed to determine my own user ID: %m");
×
4911
                        }
4912
                        u = own_user;
4913
                } else
4914
                        u = NULL;
4915

4916
                if (u) {
4917
                        /* We can't use nss unconditionally for root without risking deadlocks if some IPC services
4918
                         * will be started by pid1 and are ordered after us. But if SetLoginEnvironment= is
4919
                         * enabled *explicitly* (i.e. no exec_context_get_set_login_environment() here),
4920
                         * or PAM shall be invoked, let's consult NSS even for root, so that the user
4921
                         * gets accurate $SHELL in session(-like) contexts. */
4922
                        r = get_fixed_user(u,
2,683✔
4923
                                           /* prefer_nss = */ context->set_login_environment > 0 || context->pam_name,
2,683✔
4924
                                           &username, &uid, &gid, &pwent_home, &shell);
4925
                        if (r < 0) {
2,683✔
4926
                                *exit_status = EXIT_USER;
2✔
4927
                                return log_error_errno(r, "Failed to determine user credentials: %m");
2✔
4928
                        }
4929
                }
4930

4931
                if (context->group) {
11,791✔
4932
                        r = get_fixed_group(context->group, &groupname, &gid);
11✔
4933
                        if (r < 0) {
11✔
4934
                                *exit_status = EXIT_GROUP;
×
4935
                                return log_error_errno(r, "Failed to determine group credentials: %m");
×
4936
                        }
4937
                }
4938
        }
4939

4940
        /* Initialize user supplementary groups and get SupplementaryGroups= ones */
4941
        ngids = get_supplementary_groups(context, username, gid, &gids);
11,853✔
4942
        if (ngids < 0) {
11,853✔
4943
                *exit_status = EXIT_GROUP;
×
4944
                return log_error_errno(ngids, "Failed to determine supplementary groups: %m");
×
4945
        }
4946

4947
        r = send_user_lookup(params->unit_id, params->user_lookup_fd, uid, gid);
11,853✔
4948
        if (r < 0) {
11,853✔
4949
                *exit_status = EXIT_USER;
×
4950
                return log_error_errno(r, "Failed to send user credentials to PID1: %m");
×
4951
        }
4952

4953
        params->user_lookup_fd = safe_close(params->user_lookup_fd);
11,853✔
4954

4955
        r = acquire_home(context, &pwent_home, &home_buffer);
11,853✔
4956
        if (r < 0) {
11,853✔
4957
                *exit_status = EXIT_CHDIR;
×
4958
                return log_error_errno(r, "Failed to determine $HOME for the invoking user: %m");
×
4959
        }
4960

4961
        /* If a socket is connected to STDIN/STDOUT/STDERR, we must drop O_NONBLOCK */
4962
        if (socket_fd >= 0)
11,853✔
4963
                (void) fd_nonblock(socket_fd, false);
17✔
4964

4965
        /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
4966
         * from it. */
4967
        needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
11,853✔
4968

4969
        /* Journald will try to look up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
4970
         * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
4971
        if (params->cgroup_path) {
11,853✔
4972
                _cleanup_free_ char *subcgroup = NULL;
11,853✔
4973

4974
                r = exec_params_get_cgroup_path(params, cgroup_context, params->cgroup_path, &subcgroup);
11,853✔
4975
                if (r < 0) {
11,853✔
4976
                        *exit_status = EXIT_CGROUP;
×
4977
                        return log_error_errno(r, "Failed to acquire cgroup path: %m");
×
4978
                }
4979
                if (r > 0) {
11,853✔
4980
                        /* If there is a subcgroup required, let's make sure to create it now. */
4981
                        r = cg_create(subcgroup);
332✔
4982
                        if (r < 0)
332✔
4983
                                return log_error_errno(r, "Failed to create subcgroup '%s': %m", subcgroup);
×
4984
                }
4985

4986
                /* If we need a cgroup namespace, we cannot yet move the service to its configured subgroup,
4987
                 * as unsharing the cgroup namespace later on makes the current cgroup the root of the
4988
                 * namespace and we want the root of the namespace to be the main service cgroup and not the
4989
                 * subgroup. One edge case is if we're a control process that needs to be spawned in a
4990
                 * subgroup, in this case, we have no choice as moving into the main service cgroup might
4991
                 * violate the no inner processes rule of cgroupv2. */
4992
                const char *cgtarget = needs_sandboxing && exec_needs_cgroup_namespace(context) &&
11,870✔
4993
                                                           !exec_params_needs_control_subcgroup(params)
17✔
4994
                                                           ? params->cgroup_path : subcgroup;
4995

4996
                r = cg_attach(cgtarget, 0);
11,853✔
4997
                if (r == -EUCLEAN) {
11,853✔
4998
                        *exit_status = EXIT_CGROUP;
×
4999
                        return log_error_errno(r,
×
5000
                                               "Failed to attach process to cgroup '%s', "
5001
                                               "because the cgroup or one of its parents or "
5002
                                               "siblings is in the threaded mode.", cgtarget);
5003
                }
5004
                if (r < 0) {
11,853✔
5005
                        *exit_status = EXIT_CGROUP;
×
5006
                        return log_error_errno(r, "Failed to attach to cgroup %s: %m", cgtarget);
×
5007
                }
5008
        }
5009

5010
        if (context->network_namespace_path && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
11,853✔
5011
                r = open_shareable_ns_path(runtime->shared->netns_storage_socket, context->network_namespace_path, CLONE_NEWNET);
×
5012
                if (r < 0) {
×
5013
                        *exit_status = EXIT_NETWORK;
×
5014
                        return log_error_errno(r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
×
5015
                }
5016
        }
5017

5018
        if (context->ipc_namespace_path && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
11,853✔
5019
                r = open_shareable_ns_path(runtime->shared->ipcns_storage_socket, context->ipc_namespace_path, CLONE_NEWIPC);
×
5020
                if (r < 0) {
×
5021
                        *exit_status = EXIT_NAMESPACE;
×
5022
                        return log_error_errno(r, "Failed to open IPC namespace path %s: %m", context->ipc_namespace_path);
×
5023
                }
5024
        }
5025

5026
        r = setup_input(context, params, socket_fd, named_iofds);
11,853✔
5027
        if (r < 0) {
11,853✔
UNCOV
5028
                *exit_status = EXIT_STDIN;
×
UNCOV
5029
                return log_error_errno(r, "Failed to set up standard input: %m");
×
5030
        }
5031

5032
        _cleanup_free_ char *fname = NULL;
25✔
5033
        r = path_extract_filename(command->path, &fname);
11,853✔
5034
        if (r < 0) {
11,853✔
5035
                *exit_status = EXIT_STDOUT;
×
5036
                return log_error_errno(r, "Failed to extract filename from path %s: %m", command->path);
×
5037
        }
5038

5039
        r = setup_output(context, params, STDOUT_FILENO, socket_fd, named_iofds, fname, uid, gid, &journal_stream_dev, &journal_stream_ino);
11,853✔
5040
        if (r < 0) {
11,853✔
5041
                *exit_status = EXIT_STDOUT;
×
5042
                return log_error_errno(r, "Failed to set up standard output: %m");
×
5043
        }
5044

5045
        r = setup_output(context, params, STDERR_FILENO, socket_fd, named_iofds, fname, uid, gid, &journal_stream_dev, &journal_stream_ino);
11,853✔
5046
        if (r < 0) {
11,853✔
5047
                *exit_status = EXIT_STDERR;
×
5048
                return log_error_errno(r, "Failed to set up standard error output: %m");
×
5049
        }
5050

5051
        /* Now that stdin/stdout are definitely opened, properly initialize them with our desired
5052
         * settings. Note: this is a "constructive" reset, it prepares things for us to use. This is
5053
         * different from the "destructive" TTY reset further up. Also note: we apply this on stdin/stdout in
5054
         * case this is a tty, regardless if we opened it ourselves or got it passed in pre-opened. */
5055
        prepare_terminal(context, params);
11,853✔
5056

5057
        if (context->oom_score_adjust_set) {
11,853✔
5058
                /* When we can't make this change due to EPERM, then let's silently skip over it. User
5059
                 * namespaces prohibit write access to this file, and we shouldn't trip up over that. */
5060
                r = set_oom_score_adjust(context->oom_score_adjust);
1,320✔
5061
                if (ERRNO_IS_NEG_PRIVILEGE(r))
1,320✔
5062
                        log_debug_errno(r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
×
5063
                else if (r < 0) {
1,320✔
5064
                        *exit_status = EXIT_OOM_ADJUST;
×
5065
                        return log_error_errno(r, "Failed to adjust OOM setting: %m");
×
5066
                }
5067
        }
5068

5069
        if (context->coredump_filter_set) {
11,853✔
5070
                r = set_coredump_filter(context->coredump_filter);
2✔
5071
                if (ERRNO_IS_NEG_PRIVILEGE(r))
2✔
5072
                        log_debug_errno(r, "Failed to adjust coredump_filter, ignoring: %m");
×
5073
                else if (r < 0) {
2✔
5074
                        *exit_status = EXIT_LIMITS;
×
5075
                        return log_error_errno(r, "Failed to adjust coredump_filter: %m");
×
5076
                }
5077
        }
5078

5079
        if (context->cpu_sched_set) {
11,853✔
5080
                struct sched_attr attr = {
×
5081
                        .size = sizeof(attr),
5082
                        .sched_policy = context->cpu_sched_policy,
×
5083
                        .sched_priority = context->cpu_sched_priority,
×
5084
                        .sched_flags = context->cpu_sched_reset_on_fork ? SCHED_FLAG_RESET_ON_FORK : 0,
×
5085
                };
5086

5087
                r = sched_setattr(/* pid= */ 0, &attr, /* flags= */ 0);
×
5088
                if (r < 0) {
×
5089
                        *exit_status = EXIT_SETSCHEDULER;
×
5090
                        return log_error_errno(errno, "Failed to set up CPU scheduling: %m");
×
5091
                }
5092
        }
5093

5094
        /*
5095
         * Set nice value _after_ the call to sched_setattr() because struct sched_attr includes sched_nice
5096
         * which we do not set, thus it will clobber any previously set nice value. Scheduling policy might
5097
         * be reasonably set together with nice value e.g. in case of SCHED_BATCH (see sched(7)).
5098
         * It would be ideal to set both with the same call, but we cannot easily do so because of all the
5099
         * extra logic in setpriority_closest().
5100
         */
5101
        if (context->nice_set) {
11,853✔
5102
                r = setpriority_closest(context->nice);
15✔
5103
                if (r < 0) {
15✔
5104
                        *exit_status = EXIT_NICE;
×
5105
                        return log_error_errno(r, "Failed to set up process scheduling priority (nice level): %m");
×
5106
                }
5107
        }
5108

5109
        if (context->cpu_affinity_from_numa || context->cpu_set.set) {
11,853✔
5110
                _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
2✔
5111
                const CPUSet *cpu_set;
2✔
5112

5113
                if (context->cpu_affinity_from_numa) {
2✔
5114
                        r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
2✔
5115
                        if (r < 0) {
2✔
5116
                                *exit_status = EXIT_CPUAFFINITY;
×
5117
                                return log_error_errno(r, "Failed to derive CPU affinity mask from NUMA mask: %m");
×
5118
                        }
5119

5120
                        cpu_set = &converted_cpu_set;
5121
                } else
5122
                        cpu_set = &context->cpu_set;
×
5123

5124
                if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
2✔
5125
                        *exit_status = EXIT_CPUAFFINITY;
×
5126
                        return log_error_errno(errno, "Failed to set up CPU affinity: %m");
×
5127
                }
5128
        }
5129

5130
        if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
11,853✔
5131
                r = apply_numa_policy(&context->numa_policy);
19✔
5132
                if (ERRNO_IS_NEG_NOT_SUPPORTED(r))
19✔
5133
                        log_debug_errno(r, "NUMA support not available, ignoring.");
×
5134
                else if (r < 0) {
19✔
5135
                        *exit_status = EXIT_NUMA_POLICY;
2✔
5136
                        return log_error_errno(r, "Failed to set NUMA memory policy: %m");
2✔
5137
                }
5138
        }
5139

5140
        if (context->ioprio_set)
11,851✔
5141
                if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
7✔
5142
                        *exit_status = EXIT_IOPRIO;
×
5143
                        return log_error_errno(errno, "Failed to set up IO scheduling priority: %m");
×
5144
                }
5145

5146
        if (context->timer_slack_nsec != NSEC_INFINITY)
11,851✔
5147
                if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
×
5148
                        *exit_status = EXIT_TIMERSLACK;
×
5149
                        return log_error_errno(errno, "Failed to set up timer slack: %m");
×
5150
                }
5151

5152
        if (context->personality != PERSONALITY_INVALID) {
11,851✔
5153
                r = safe_personality(context->personality);
×
5154
                if (r < 0) {
×
5155
                        *exit_status = EXIT_PERSONALITY;
×
5156
                        return log_error_errno(r, "Failed to set up execution domain (personality): %m");
×
5157
                }
5158
        }
5159

5160
        if (context->memory_ksm >= 0)
11,851✔
5161
                if (prctl(PR_SET_MEMORY_MERGE, context->memory_ksm, 0, 0, 0) < 0) {
×
5162
                        if (ERRNO_IS_NOT_SUPPORTED(errno))
×
5163
                                log_debug_errno(errno, "KSM support not available, ignoring.");
×
5164
                        else {
5165
                                *exit_status = EXIT_KSM;
×
5166
                                return log_error_errno(errno, "Failed to set KSM: %m");
×
5167
                        }
5168
                }
5169

5170
#if ENABLE_UTMP
5171
        if (context->utmp_id) {
11,851✔
5172
                _cleanup_free_ char *username_alloc = NULL;
159✔
5173

5174
                if (!username && context->utmp_mode == EXEC_UTMP_USER) {
159✔
5175
                        username_alloc = uid_to_name(uid_is_valid(uid) ? uid : saved_uid);
1✔
5176
                        if (!username_alloc) {
1✔
5177
                                *exit_status = EXIT_USER;
×
5178
                                return log_oom();
×
5179
                        }
5180
                }
5181

5182
                const char *line = context->tty_path ?
×
5183
                        (path_startswith(context->tty_path, "/dev/") ?: context->tty_path) :
159✔
5184
                        NULL;
5185
                utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
159✔
5186
                                      line,
5187
                                      context->utmp_mode == EXEC_UTMP_INIT  ? INIT_PROCESS :
159✔
5188
                                      context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
7✔
5189
                                      USER_PROCESS,
5190
                                      username ?: username_alloc);
159✔
5191
        }
5192
#endif
5193

5194
        if (uid_is_valid(uid)) {
11,851✔
5195
                r = chown_terminal(STDIN_FILENO, uid);
2,743✔
5196
                if (r < 0) {
2,743✔
5197
                        *exit_status = EXIT_STDIN;
×
5198
                        return log_error_errno(r, "Failed to change ownership of terminal: %m");
×
5199
                }
5200
        }
5201

5202
        if (params->cgroup_path) {
11,851✔
5203
                /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
5204
                 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
5205
                 * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
5206
                 * touch a single hierarchy too. */
5207

5208
                if (params->flags & EXEC_CGROUP_DELEGATE) {
11,851✔
5209
                        _cleanup_free_ char *p = NULL;
673✔
5210

5211
                        r = cg_set_access(params->cgroup_path, uid, gid);
673✔
5212
                        if (r < 0) {
673✔
5213
                                *exit_status = EXIT_CGROUP;
×
5214
                                return log_error_errno(r, "Failed to adjust control group access: %m");
×
5215
                        }
5216

5217
                        r = exec_params_get_cgroup_path(params, cgroup_context, params->cgroup_path, &p);
673✔
5218
                        if (r < 0) {
673✔
5219
                                *exit_status = EXIT_CGROUP;
×
5220
                                return log_error_errno(r, "Failed to acquire cgroup path: %m");
×
5221
                        }
5222
                        if (r > 0) {
673✔
5223
                                r = cg_set_access_recursive(p, uid, gid);
332✔
5224
                                if (r < 0) {
332✔
5225
                                        *exit_status = EXIT_CGROUP;
×
5226
                                        return log_error_errno(r, "Failed to adjust control subgroup access: %m");
×
5227
                                }
5228
                        }
5229
                }
5230

5231
                if (is_pressure_supported() > 0) {
11,851✔
5232
                        if (cgroup_context_want_memory_pressure(cgroup_context)) {
11,851✔
5233
                                r = cg_get_path("memory", params->cgroup_path, "memory.pressure", &memory_pressure_path);
11,447✔
5234
                                if (r < 0) {
11,447✔
5235
                                        *exit_status = EXIT_MEMORY;
×
5236
                                        return log_oom();
×
5237
                                }
5238

5239
                                r = chmod_and_chown(memory_pressure_path, 0644, uid, gid);
11,447✔
5240
                                if (r < 0) {
11,447✔
5241
                                        log_full_errno(r == -ENOENT || ERRNO_IS_PRIVILEGE(r) ? LOG_DEBUG : LOG_WARNING, r,
2✔
5242
                                                       "Failed to adjust ownership of '%s', ignoring: %m", memory_pressure_path);
5243
                                        memory_pressure_path = mfree(memory_pressure_path);
1✔
5244
                                }
5245
                                /* First we use the current cgroup path to chmod and chown the memory pressure path, then pass the path relative
5246
                                 * to the cgroup namespace to environment variables and mounts. If chown/chmod fails, we should not pass memory
5247
                                 * pressure path environment variable or read-write mount to the unit. This is why we check if
5248
                                 * memory_pressure_path != NULL in the conditional below. */
5249
                                if (memory_pressure_path && needs_sandboxing && exec_needs_cgroup_namespace(context)) {
11,447✔
5250
                                        memory_pressure_path = mfree(memory_pressure_path);
17✔
5251
                                        r = cg_get_path("memory", "", "memory.pressure", &memory_pressure_path);
17✔
5252
                                        if (r < 0) {
17✔
5253
                                                *exit_status = EXIT_MEMORY;
×
5254
                                                return log_oom();
×
5255
                                        }
5256
                                }
5257
                        } else if (cgroup_context->memory_pressure_watch == CGROUP_PRESSURE_WATCH_NO) {
404✔
5258
                                memory_pressure_path = strdup("/dev/null"); /* /dev/null is explicit indicator for turning off memory pressure watch */
×
5259
                                if (!memory_pressure_path) {
×
5260
                                        *exit_status = EXIT_MEMORY;
×
5261
                                        return log_oom();
×
5262
                                }
5263
                        }
5264
                }
5265
        }
5266

5267
        needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
11,851✔
5268

5269
        for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
71,101✔
5270
                r = setup_exec_directory(context, params, uid, gid, dt, needs_mount_namespace, exit_status);
59,251✔
5271
                if (r < 0)
59,251✔
5272
                        return log_error_errno(r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
1✔
5273
        }
5274

5275
        r = exec_setup_credentials(context, cgroup_context, params, params->unit_id, uid, gid);
11,850✔
5276
        if (r < 0) {
9,856✔
5277
                *exit_status = EXIT_CREDENTIALS;
×
5278
                return log_error_errno(r, "Failed to set up credentials: %m");
×
5279
        }
5280

5281
        r = build_environment(
9,856✔
5282
                        context,
5283
                        params,
5284
                        cgroup_context,
5285
                        n_fds,
5286
                        pwent_home,
5287
                        username,
5288
                        shell,
5289
                        journal_stream_dev,
5290
                        journal_stream_ino,
5291
                        memory_pressure_path,
5292
                        needs_sandboxing,
5293
                        &our_env);
5294
        if (r < 0) {
9,856✔
5295
                *exit_status = EXIT_MEMORY;
×
5296
                return log_oom();
×
5297
        }
5298

5299
        r = build_pass_environment(context, &pass_env);
9,856✔
5300
        if (r < 0) {
9,856✔
5301
                *exit_status = EXIT_MEMORY;
×
5302
                return log_oom();
×
5303
        }
5304

5305
        /* The $PATH variable is set to the default path in params->environment. However, this is overridden
5306
         * if user-specified fields have $PATH set. The intention is to also override $PATH if the unit does
5307
         * not specify PATH but the unit has ExecSearchPath. */
5308
        if (!strv_isempty(context->exec_search_path)) {
9,856✔
5309
                _cleanup_free_ char *joined = NULL;
×
5310

5311
                joined = strv_join(context->exec_search_path, ":");
×
5312
                if (!joined) {
×
5313
                        *exit_status = EXIT_MEMORY;
×
5314
                        return log_oom();
×
5315
                }
5316

5317
                r = strv_env_assign(&joined_exec_search_path, "PATH", joined);
×
5318
                if (r < 0) {
×
5319
                        *exit_status = EXIT_MEMORY;
×
5320
                        return log_oom();
×
5321
                }
5322
        }
5323

5324
        accum_env = strv_env_merge(params->environment,
9,856✔
5325
                                   our_env,
5326
                                   joined_exec_search_path,
5327
                                   pass_env,
5328
                                   context->environment,
5329
                                   params->files_env);
5330
        if (!accum_env) {
9,856✔
5331
                *exit_status = EXIT_MEMORY;
×
5332
                return log_oom();
×
5333
        }
5334
        strv_env_clean(accum_env);
9,856✔
5335

5336
        (void) umask(context->umask);
9,856✔
5337

5338
        r = setup_term_environment(context, &accum_env);
9,856✔
5339
        if (r < 0) {
9,856✔
5340
                *exit_status = EXIT_MEMORY;
×
5341
                return log_error_errno(r, "Failed to construct $TERM: %m");
×
5342
        }
5343

5344
        r = setup_keyring(context, params, uid, gid);
9,856✔
5345
        if (r < 0) {
9,856✔
5346
                *exit_status = EXIT_KEYRING;
×
5347
                return log_error_errno(r, "Failed to set up kernel keyring: %m");
×
5348
        }
5349

5350
        /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly
5351
         * excepted from either whole sandboxing or just setresuid() itself. */
5352
        needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
9,856✔
5353

5354
        uint64_t capability_ambient_set = context->capability_ambient_set;
9,856✔
5355

5356
        /* Check CAP_SYS_ADMIN before we enter user namespace to see if we can mount /proc even though it's masked. */
5357
        have_cap_sys_admin = have_effective_cap(CAP_SYS_ADMIN) > 0;
9,856✔
5358

5359
        if (needs_sandboxing) {
9,856✔
5360
                /* MAC enablement checks need to be done before a new mount ns is created, as they rely on
5361
                 * /sys being present. The actual MAC context application will happen later, as late as
5362
                 * possible, to avoid impacting our own code paths. */
5363

5364
#if HAVE_SELINUX
5365
                use_selinux = mac_selinux_use();
5366
#endif
5367
#if ENABLE_SMACK
5368
                use_smack = mac_smack_use();
9,856✔
5369
#endif
5370
#if HAVE_APPARMOR
5371
                if (mac_apparmor_use()) {
5372
                        r = dlopen_libapparmor();
5373
                        if (r < 0 && !ERRNO_IS_NEG_NOT_SUPPORTED(r))
5374
                                log_warning_errno(r, "Failed to load libapparmor, ignoring: %m");
5375
                        use_apparmor = r >= 0;
5376
                }
5377
#endif
5378
        }
5379

5380
        if (needs_sandboxing) {
9,856✔
5381
                int which_failed;
9,856✔
5382

5383
                /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
5384
                 * is set here. (See below.) */
5385

5386
                r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
9,856✔
5387
                if (r < 0) {
9,856✔
5388
                        *exit_status = EXIT_LIMITS;
×
5389
                        return log_error_errno(r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
×
5390
                }
5391
        }
5392

5393
        if (needs_setuid && context->pam_name && username) {
9,856✔
5394
                /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
5395
                 * wins here. (See above.) */
5396

5397
                /* All fds passed in the fds array will be closed in the pam child process. */
5398
                r = setup_pam(context, cgroup_context, params, username, uid, gid, &accum_env,
810✔
5399
                              params->fds, n_fds, needs_sandboxing, params->exec_fd);
405✔
5400
                if (r < 0) {
405✔
5401
                        *exit_status = EXIT_PAM;
×
5402
                        return log_error_errno(r, "Failed to set up PAM session: %m");
×
5403
                }
5404

5405
                /* PAM modules might have set some ambient caps. Query them here and merge them into
5406
                 * the caps we want to set in the end, so that we don't end up unsetting them. */
5407
                uint64_t ambient_after_pam;
405✔
5408
                r = capability_get_ambient(&ambient_after_pam);
405✔
5409
                if (r < 0) {
405✔
5410
                        *exit_status = EXIT_CAPABILITIES;
×
5411
                        return log_error_errno(r, "Failed to query ambient caps: %m");
×
5412
                }
5413

5414
                capability_ambient_set |= ambient_after_pam;
405✔
5415

5416
                ngids_after_pam = getgroups_alloc(&gids_after_pam);
405✔
5417
                if (ngids_after_pam < 0) {
405✔
5418
                        *exit_status = EXIT_GROUP;
×
5419
                        return log_error_errno(ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
×
5420
                }
5421
        }
5422

5423
        if (needs_sandboxing && !have_cap_sys_admin && exec_needs_cap_sys_admin(context, params)) {
9,856✔
5424
                /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
5425
                 * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
5426
                 * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
5427
                PrivateUsers pu = exec_context_get_effective_private_users(context, params);
26✔
5428
                if (pu == PRIVATE_USERS_NO)
26✔
5429
                        pu = PRIVATE_USERS_SELF;
22✔
5430

5431
                /* The kernel requires /proc/pid/setgroups be set to "deny" prior to writing /proc/pid/gid_map in
5432
                 * unprivileged user namespaces. */
5433
                r = setup_private_users(pu, saved_uid, saved_gid, uid, gid, /* allow_setgroups= */ false);
26✔
5434
                /* If it was requested explicitly and we can't set it up, fail early. Otherwise, continue and let
5435
                 * the actual requested operations fail (or silently continue). */
5436
                if (r < 0 && context->private_users != PRIVATE_USERS_NO) {
26✔
5437
                        *exit_status = EXIT_USER;
×
5438
                        return log_error_errno(r, "Failed to set up user namespacing for unprivileged user: %m");
×
5439
                }
5440
                if (r < 0)
×
5441
                        log_info_errno(r, "Failed to set up user namespacing for unprivileged user, ignoring: %m");
×
5442
                else {
5443
                        assert(r > 0);
26✔
5444
                        userns_set_up = true;
26✔
5445
                        log_debug("Set up unprivileged user namespace");
26✔
5446
                }
5447
        }
5448

5449
        /* Call setup_delegated_namespaces() the first time to unshare all non-delegated namespaces. */
5450
        r = setup_delegated_namespaces(
9,856✔
5451
                        context,
5452
                        params,
5453
                        runtime,
5454
                        /* delegate= */ false,
5455
                        memory_pressure_path,
5456
                        uid,
5457
                        gid,
5458
                        command,
5459
                        needs_sandboxing,
5460
                        have_cap_sys_admin,
5461
                        exit_status);
5462
        if (r < 0)
9,853✔
5463
                return r;
5464

5465
        /* Drop groups as early as possible.
5466
         * This needs to be done after PrivateDevices=yes setup as device nodes should be owned by the host's root.
5467
         * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
5468
        if (needs_setuid) {
9,837✔
5469
                _cleanup_free_ gid_t *gids_to_enforce = NULL;
9,837✔
5470
                int ngids_to_enforce;
9,837✔
5471

5472
                ngids_to_enforce = merge_gid_lists(gids,
9,837✔
5473
                                                   ngids,
5474
                                                   gids_after_pam,
5475
                                                   ngids_after_pam,
5476
                                                   &gids_to_enforce);
5477
                if (ngids_to_enforce < 0) {
9,837✔
5478
                        *exit_status = EXIT_GROUP;
×
5479
                        return log_error_errno(ngids_to_enforce, "Failed to merge group lists. Group membership might be incorrect: %m");
×
5480
                }
5481

5482
                r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
9,837✔
5483
                if (r < 0) {
9,837✔
5484
                        *exit_status = EXIT_GROUP;
1✔
5485
                        return log_error_errno(r, "Changing group credentials failed: %m");
1✔
5486
                }
5487
        }
5488

5489
        /* If the user namespace was not set up above, try to do it now.
5490
         * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
5491
         * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
5492
         * case of mount namespaces being less privileged when the mount point list is copied from a
5493
         * different user namespace). */
5494

5495
        if (needs_sandboxing && !userns_set_up) {
9,836✔
5496
                PrivateUsers pu = exec_context_get_effective_private_users(context, params);
9,814✔
5497

5498
                r = setup_private_users(pu, saved_uid, saved_gid, uid, gid,
9,814✔
5499
                                        /* allow_setgroups= */ pu == PRIVATE_USERS_FULL);
5500
                if (r < 0) {
9,814✔
5501
                        *exit_status = EXIT_USER;
×
5502
                        return log_error_errno(r, "Failed to set up user namespacing: %m");
×
5503
                }
5504
                if (r > 0)
9,814✔
5505
                        log_debug("Set up privileged user namespace");
25✔
5506
        }
5507

5508
        /* Call setup_delegated_namespaces() the second time to unshare all delegated namespaces. */
5509
        r = setup_delegated_namespaces(
9,836✔
5510
                        context,
5511
                        params,
5512
                        runtime,
5513
                        /* delegate= */ true,
5514
                        memory_pressure_path,
5515
                        uid,
5516
                        gid,
5517
                        command,
5518
                        needs_sandboxing,
5519
                        have_cap_sys_admin,
5520
                        exit_status);
5521
        if (r < 0)
9,832✔
5522
                return r;
5523

5524
        if (needs_sandboxing && exec_needs_cgroup_namespace(context) && params->cgroup_path) {
9,832✔
5525
                /* Move ourselves into the subcgroup now *after* we've unshared the cgroup namespace, which
5526
                 * ensures the root of the cgroup namespace is the top level service cgroup and not the
5527
                 * subcgroup. Adjust the prefix accordingly since we're in a cgroup namespace now. */
5528
                r = attach_to_subcgroup(context, cgroup_context, params, /* prefix= */ NULL);
12✔
5529
                if (r < 0) {
12✔
5530
                        *exit_status = EXIT_CGROUP;
×
5531
                        return r;
×
5532
                }
5533
        }
5534

5535
        /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
5536
         * shall execute. */
5537

5538
        const char *path = command->path;
9,832✔
5539

5540
        if (FLAGS_SET(command->flags, EXEC_COMMAND_VIA_SHELL)) {
9,832✔
5541
                if (shell_is_placeholder(shell)) {
13✔
5542
                        log_debug("Shell prefixing requested for user without default shell, using /bin/sh: %s",
2✔
5543
                                  strna(username));
5544
                        assert(streq(path, _PATH_BSHELL));
2✔
5545
                } else
5546
                        path = shell;
5547
        }
5548

5549
        _cleanup_free_ char *executable = NULL;
5✔
5550
        _cleanup_close_ int executable_fd = -EBADF;
5✔
5551
        r = find_executable_full(path, /* root= */ NULL, context->exec_search_path, false, &executable, &executable_fd);
9,832✔
5552
        if (r < 0) {
9,832✔
5553
                *exit_status = EXIT_EXEC;
1✔
5554
                log_struct_errno(LOG_NOTICE, r,
1✔
5555
                                 LOG_MESSAGE_ID(SD_MESSAGE_SPAWN_FAILED_STR),
5556
                                 LOG_EXEC_MESSAGE(params, "Unable to locate executable '%s': %m", path),
5557
                                 LOG_ITEM("EXECUTABLE=%s", path));
5558
                /* If the error will be ignored by manager, tune down the log level here. Missing executable
5559
                 * is very much expected in this case. */
5560
                return r != -ENOMEM && FLAGS_SET(command->flags, EXEC_COMMAND_IGNORE_FAILURE) ? 1 : r;
1✔
5561
        }
5562

5563
        r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, &executable_fd);
9,831✔
5564
        if (r < 0) {
9,831✔
5565
                *exit_status = EXIT_FDS;
×
5566
                return log_error_errno(r, "Failed to collect shifted fd: %m");
×
5567
        }
5568

5569
#if HAVE_SELINUX
5570
        if (needs_sandboxing && use_selinux && params->selinux_context_net) {
5571
                int fd = -EBADF;
5572

5573
                if (socket_fd >= 0)
5574
                        fd = socket_fd;
5575
                else if (params->n_socket_fds == 1)
5576
                        /* If stdin is not connected to a socket but we are triggered by exactly one socket unit then we
5577
                         * use context from that fd to compute the label. */
5578
                        fd = params->fds[0];
5579

5580
                if (fd >= 0) {
5581
                        r = mac_selinux_get_child_mls_label(fd, executable, context->selinux_context, &mac_selinux_context_net);
5582
                        if (r < 0) {
5583
                                if (!context->selinux_context_ignore) {
5584
                                        *exit_status = EXIT_SELINUX_CONTEXT;
5585
                                        return log_error_errno(r, "Failed to determine SELinux context: %m");
5586
                                }
5587
                                log_debug_errno(r, "Failed to determine SELinux context, ignoring: %m");
5588
                        }
5589
                }
5590
        }
5591
#endif
5592

5593
        /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that
5594
         * we are more aggressive this time, since we don't need socket_fd and the netns and ipcns fds any
5595
         * more. We do keep exec_fd and handoff_timestamp_fd however, if we have it, since we need to keep
5596
         * them open until the final execve(). But first, close the remaining sockets in the context
5597
         * objects. */
5598

5599
        exec_runtime_close(runtime);
9,831✔
5600
        exec_params_close(params);
9,831✔
5601

5602
        r = close_all_fds(keep_fds, n_keep_fds);
9,831✔
5603
        if (r >= 0)
9,831✔
5604
                r = pack_fds(params->fds, n_fds);
9,831✔
5605
        if (r >= 0)
9,831✔
5606
                r = flag_fds(params->fds, n_socket_fds, n_fds, context->non_blocking);
9,831✔
5607
        if (r < 0) {
9,831✔
5608
                *exit_status = EXIT_FDS;
×
5609
                return log_error_errno(r, "Failed to adjust passed file descriptors: %m");
×
5610
        }
5611

5612
        /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
5613
         * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
5614
         * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
5615
         * came this far. */
5616

5617
        secure_bits = context->secure_bits;
9,831✔
5618

5619
        if (needs_sandboxing) {
9,831✔
5620
                uint64_t bset;
9,831✔
5621

5622
                /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested.
5623
                 * (Note this is placed after the general resource limit initialization, see above, in order
5624
                 * to take precedence.) */
5625
                if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
9,831✔
5626
                        if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
1,533✔
5627
                                *exit_status = EXIT_LIMITS;
×
5628
                                return log_error_errno(errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
×
5629
                        }
5630
                }
5631

5632
#if ENABLE_SMACK
5633
                /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
5634
                 * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
5635
                if (use_smack) {
9,831✔
5636
                        r = setup_smack(context, params, executable_fd);
×
5637
                        if (r < 0 && !context->smack_process_label_ignore) {
×
5638
                                *exit_status = EXIT_SMACK_PROCESS_LABEL;
×
5639
                                return log_error_errno(r, "Failed to set SMACK process label: %m");
×
5640
                        }
5641
                }
5642
#endif
5643

5644
                bset = context->capability_bounding_set;
9,831✔
5645

5646
#if HAVE_SECCOMP
5647
                /* If the service has any form of a seccomp filter and it allows dropping privileges, we'll
5648
                 * keep the needed privileges to apply it even if we're not root. */
5649
                if (needs_setuid &&
19,662✔
5650
                    uid_is_valid(uid) &&
11,903✔
5651
                    context_has_seccomp(context) &&
2,843✔
5652
                    seccomp_allows_drop_privileges(context)) {
771✔
5653
                        keep_seccomp_privileges = true;
771✔
5654

5655
                        if (prctl(PR_SET_KEEPCAPS, 1) < 0) {
771✔
5656
                                *exit_status = EXIT_USER;
×
5657
                                return log_error_errno(errno, "Failed to enable keep capabilities flag: %m");
×
5658
                        }
5659

5660
                        /* Save the current bounding set so we can restore it after applying the seccomp
5661
                         * filter */
5662
                        saved_bset = bset;
771✔
5663
                        bset |= (UINT64_C(1) << CAP_SYS_ADMIN) |
771✔
5664
                                (UINT64_C(1) << CAP_SETPCAP);
5665
                }
5666
#endif
5667

5668
                if (!cap_test_all(bset)) {
9,831✔
5669
                        r = capability_bounding_set_drop(bset, /* right_now= */ false);
1,661✔
5670
                        if (r < 0) {
1,661✔
5671
                                *exit_status = EXIT_CAPABILITIES;
×
5672
                                return log_error_errno(r, "Failed to drop capabilities: %m");
×
5673
                        }
5674
                }
5675

5676
                /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
5677
                 * keep-caps set.
5678
                 *
5679
                 * To be able to raise the ambient capabilities after setresuid() they have to be added to
5680
                 * the inherited set and keep caps has to be set (done in enforce_user()).  After setresuid()
5681
                 * the ambient capabilities can be raised as they are present in the permitted and
5682
                 * inheritable set. However it is possible that someone wants to set ambient capabilities
5683
                 * without changing the user, so we also set the ambient capabilities here.
5684
                 *
5685
                 * The requested ambient capabilities are raised in the inheritable set if the second
5686
                 * argument is true. */
5687
                if (capability_ambient_set != 0) {
9,831✔
5688
                        r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ true);
766✔
5689
                        if (r < 0) {
766✔
5690
                                *exit_status = EXIT_CAPABILITIES;
×
5691
                                return log_error_errno(r, "Failed to apply ambient capabilities (before UID change): %m");
×
5692
                        }
5693
                }
5694
        }
5695

5696
        /* chroot to root directory first, before we lose the ability to chroot */
5697
        r = apply_root_directory(context, params, runtime, needs_mount_namespace, exit_status);
9,831✔
5698
        if (r < 0)
9,831✔
5699
                return log_error_errno(r, "Chrooting to the requested root directory failed: %m");
×
5700

5701
        if (needs_setuid) {
9,831✔
5702
                if (uid_is_valid(uid)) {
9,831✔
5703
                        r = enforce_user(context, uid, capability_ambient_set);
2,072✔
5704
                        if (r < 0) {
2,072✔
5705
                                *exit_status = EXIT_USER;
×
5706
                                return log_error_errno(r, "Failed to change UID to " UID_FMT ": %m", uid);
×
5707
                        }
5708

5709
                        if (keep_seccomp_privileges) {
2,072✔
5710
                                if (!BIT_SET(capability_ambient_set, CAP_SETUID)) {
771✔
5711
                                        r = drop_capability(CAP_SETUID);
771✔
5712
                                        if (r < 0) {
771✔
5713
                                                *exit_status = EXIT_USER;
×
5714
                                                return log_error_errno(r, "Failed to drop CAP_SETUID: %m");
×
5715
                                        }
5716
                                }
5717

5718
                                r = keep_capability(CAP_SYS_ADMIN);
771✔
5719
                                if (r < 0) {
771✔
5720
                                        *exit_status = EXIT_USER;
×
5721
                                        return log_error_errno(r, "Failed to keep CAP_SYS_ADMIN: %m");
×
5722
                                }
5723

5724
                                r = keep_capability(CAP_SETPCAP);
771✔
5725
                                if (r < 0) {
771✔
5726
                                        *exit_status = EXIT_USER;
×
5727
                                        return log_error_errno(r, "Failed to keep CAP_SETPCAP: %m");
×
5728
                                }
5729
                        }
5730

5731
                        if (capability_ambient_set != 0) {
2,072✔
5732

5733
                                /* Raise the ambient capabilities after user change. */
5734
                                r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ false);
763✔
5735
                                if (r < 0) {
763✔
5736
                                        *exit_status = EXIT_CAPABILITIES;
×
5737
                                        return log_error_errno(r, "Failed to apply ambient capabilities (after UID change): %m");
×
5738
                                }
5739
                        }
5740
                }
5741
        }
5742

5743
        /* Apply working directory here, because the working directory might be on NFS and only the user
5744
         * running this service might have the correct privilege to change to the working directory. Also, it
5745
         * is absolutely 💣 crucial 💣 we applied all mount namespacing rearrangements before this, so that
5746
         * the cwd cannot be used to pin directories outside of the sandbox. */
5747
        r = apply_working_directory(context, params, runtime, pwent_home, accum_env);
9,831✔
5748
        if (r < 0) {
9,831✔
5749
                *exit_status = EXIT_CHDIR;
1✔
5750
                return log_error_errno(r, "Changing to the requested working directory failed: %m");
1✔
5751
        }
5752

5753
        if (needs_sandboxing) {
9,830✔
5754
                /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
5755
                 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
5756
                 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
5757
                 * are restricted. */
5758

5759
#if HAVE_SELINUX
5760
                if (use_selinux) {
5761
                        char *exec_context = mac_selinux_context_net ?: context->selinux_context;
5762

5763
                        if (exec_context) {
5764
                                r = setexeccon(exec_context);
5765
                                if (r < 0) {
5766
                                        if (!context->selinux_context_ignore) {
5767
                                                *exit_status = EXIT_SELINUX_CONTEXT;
5768
                                                return log_error_errno(r, "Failed to change SELinux context to %s: %m", exec_context);
5769
                                        }
5770
                                        log_debug_errno(r, "Failed to change SELinux context to %s, ignoring: %m", exec_context);
5771
                                }
5772
                        }
5773
                }
5774
#endif
5775

5776
#if HAVE_APPARMOR
5777
                if (use_apparmor && context->apparmor_profile) {
5778
                        r = ASSERT_PTR(sym_aa_change_onexec)(context->apparmor_profile);
5779
                        if (r < 0 && !context->apparmor_profile_ignore) {
5780
                                *exit_status = EXIT_APPARMOR_PROFILE;
5781
                                return log_error_errno(errno, "Failed to prepare AppArmor profile change to %s: %m",
5782
                                                       context->apparmor_profile);
5783
                        }
5784
                }
5785
#endif
5786

5787
                /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential
5788
                 * EPERMs we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits
5789
                 * requires CAP_SETPCAP. */
5790
                if (prctl(PR_GET_SECUREBITS) != secure_bits) {
9,830✔
5791
                        /* CAP_SETPCAP is required to set securebits. This capability is raised into the
5792
                         * effective set here.
5793
                         *
5794
                         * The effective set is overwritten during execve() with the following values:
5795
                         *
5796
                         * - ambient set (for non-root processes)
5797
                         *
5798
                         * - (inheritable | bounding) set (for root processes)
5799
                         *
5800
                         * Hence there is no security impact to raise it in the effective set before execve
5801
                         */
5802
                        r = capability_gain_cap_setpcap(/* ret_before_caps = */ NULL);
823✔
5803
                        if (r < 0) {
823✔
5804
                                *exit_status = EXIT_CAPABILITIES;
×
5805
                                return log_error_errno(r, "Failed to gain CAP_SETPCAP for setting secure bits");
×
5806
                        }
5807
                        if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
823✔
5808
                                *exit_status = EXIT_SECUREBITS;
×
5809
                                return log_error_errno(errno, "Failed to set process secure bits: %m");
×
5810
                        }
5811
                }
5812

5813
                if (context_has_no_new_privileges(context))
9,830✔
5814
                        if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
1,455✔
5815
                                *exit_status = EXIT_NO_NEW_PRIVILEGES;
×
5816
                                return log_error_errno(errno, "Failed to disable new privileges: %m");
×
5817
                        }
5818

5819
#if HAVE_SECCOMP
5820
                r = apply_address_families(context, params);
9,830✔
5821
                if (r < 0) {
9,830✔
5822
                        *exit_status = EXIT_ADDRESS_FAMILIES;
×
5823
                        return log_error_errno(r, "Failed to restrict address families: %m");
×
5824
                }
5825

5826
                r = apply_memory_deny_write_execute(context, params);
9,830✔
5827
                if (r < 0) {
9,830✔
5828
                        *exit_status = EXIT_SECCOMP;
×
5829
                        return log_error_errno(r, "Failed to disable writing to executable memory: %m");
×
5830
                }
5831

5832
                r = apply_restrict_realtime(context, params);
9,830✔
5833
                if (r < 0) {
9,830✔
5834
                        *exit_status = EXIT_SECCOMP;
×
5835
                        return log_error_errno(r, "Failed to apply realtime restrictions: %m");
×
5836
                }
5837

5838
                r = apply_restrict_suid_sgid(context, params);
9,830✔
5839
                if (r < 0) {
9,830✔
5840
                        *exit_status = EXIT_SECCOMP;
×
5841
                        return log_error_errno(r, "Failed to apply SUID/SGID restrictions: %m");
×
5842
                }
5843

5844
                r = apply_restrict_namespaces(context, params);
9,830✔
5845
                if (r < 0) {
9,830✔
5846
                        *exit_status = EXIT_SECCOMP;
×
5847
                        return log_error_errno(r, "Failed to apply namespace restrictions: %m");
×
5848
                }
5849

5850
                r = apply_protect_sysctl(context, params);
9,830✔
5851
                if (r < 0) {
9,830✔
5852
                        *exit_status = EXIT_SECCOMP;
×
5853
                        return log_error_errno(r, "Failed to apply sysctl restrictions: %m");
×
5854
                }
5855

5856
                r = apply_protect_kernel_modules(context, params);
9,830✔
5857
                if (r < 0) {
9,830✔
5858
                        *exit_status = EXIT_SECCOMP;
×
5859
                        return log_error_errno(r, "Failed to apply module loading restrictions: %m");
×
5860
                }
5861

5862
                r = apply_protect_kernel_logs(context, params);
9,830✔
5863
                if (r < 0) {
9,830✔
5864
                        *exit_status = EXIT_SECCOMP;
×
5865
                        return log_error_errno(r, "Failed to apply kernel log restrictions: %m");
×
5866
                }
5867

5868
                r = apply_protect_clock(context, params);
9,830✔
5869
                if (r < 0) {
9,830✔
5870
                        *exit_status = EXIT_SECCOMP;
×
5871
                        return log_error_errno(r, "Failed to apply clock restrictions: %m");
×
5872
                }
5873

5874
                r = apply_private_devices(context, params);
9,830✔
5875
                if (r < 0) {
9,830✔
5876
                        *exit_status = EXIT_SECCOMP;
×
5877
                        return log_error_errno(r, "Failed to set up private devices: %m");
×
5878
                }
5879

5880
                r = apply_syscall_archs(context, params);
9,830✔
5881
                if (r < 0) {
9,830✔
5882
                        *exit_status = EXIT_SECCOMP;
×
5883
                        return log_error_errno(r, "Failed to apply syscall architecture restrictions: %m");
×
5884
                }
5885

5886
                r = apply_lock_personality(context, params);
9,830✔
5887
                if (r < 0) {
9,830✔
5888
                        *exit_status = EXIT_SECCOMP;
×
5889
                        return log_error_errno(r, "Failed to lock personalities: %m");
×
5890
                }
5891

5892
                r = apply_syscall_log(context, params);
9,830✔
5893
                if (r < 0) {
9,830✔
5894
                        *exit_status = EXIT_SECCOMP;
×
5895
                        return log_error_errno(r, "Failed to apply system call log filters: %m");
×
5896
                }
5897
#endif
5898

5899
#if HAVE_LIBBPF
5900
                r = apply_restrict_filesystems(context, params);
9,830✔
5901
                if (r < 0) {
9,830✔
5902
                        *exit_status = EXIT_BPF;
×
5903
                        return log_error_errno(r, "Failed to restrict filesystems: %m");
×
5904
                }
5905
#endif
5906

5907
#if HAVE_SECCOMP
5908
                /* This really should remain as close to the execve() as possible, to make sure our own code is affected
5909
                 * by the filter as little as possible. */
5910
                r = apply_syscall_filter(context, params);
9,830✔
5911
                if (r < 0) {
9,830✔
5912
                        *exit_status = EXIT_SECCOMP;
×
5913
                        return log_error_errno(r, "Failed to apply system call filters: %m");
×
5914
                }
5915

5916
                if (keep_seccomp_privileges) {
9,830✔
5917
                        /* Restore the capability bounding set with what's expected from the service + the
5918
                         * ambient capabilities hack */
5919
                        if (!cap_test_all(saved_bset)) {
770✔
5920
                                r = capability_bounding_set_drop(saved_bset, /* right_now= */ false);
735✔
5921
                                if (r < 0) {
735✔
5922
                                        *exit_status = EXIT_CAPABILITIES;
×
5923
                                        return log_error_errno(r, "Failed to drop bset capabilities: %m");
×
5924
                                }
5925
                        }
5926

5927
                        /* Only drop CAP_SYS_ADMIN if it's not in the bounding set, otherwise we'll break
5928
                         * applications that use it. */
5929
                        if (!BIT_SET(saved_bset, CAP_SYS_ADMIN)) {
770✔
5930
                                r = drop_capability(CAP_SYS_ADMIN);
285✔
5931
                                if (r < 0) {
285✔
5932
                                        *exit_status = EXIT_USER;
×
5933
                                        return log_error_errno(r, "Failed to drop CAP_SYS_ADMIN: %m");
×
5934
                                }
5935
                        }
5936

5937
                        /* Only drop CAP_SETPCAP if it's not in the bounding set, otherwise we'll break
5938
                         * applications that use it. */
5939
                        if (!BIT_SET(saved_bset, CAP_SETPCAP)) {
770✔
5940
                                r = drop_capability(CAP_SETPCAP);
550✔
5941
                                if (r < 0) {
550✔
5942
                                        *exit_status = EXIT_USER;
×
5943
                                        return log_error_errno(r, "Failed to drop CAP_SETPCAP: %m");
×
5944
                                }
5945
                        }
5946

5947
                        if (prctl(PR_SET_KEEPCAPS, 0) < 0) {
770✔
5948
                                *exit_status = EXIT_USER;
×
5949
                                return log_error_errno(errno, "Failed to drop keep capabilities flag: %m");
×
5950
                        }
5951
                }
5952
#endif
5953

5954
        }
5955

5956
        if (!strv_isempty(context->unset_environment)) {
9,830✔
5957
                char **ee = NULL;
277✔
5958

5959
                ee = strv_env_delete(accum_env, 1, context->unset_environment);
277✔
5960
                if (!ee) {
277✔
5961
                        *exit_status = EXIT_MEMORY;
×
5962
                        return log_oom();
5✔
5963
                }
5964

5965
                strv_free_and_replace(accum_env, ee);
277✔
5966
        }
5967

5968
        _cleanup_strv_free_ char **replaced_argv = NULL, **argv_via_shell = NULL;
3✔
5969
        char **final_argv = FLAGS_SET(command->flags, EXEC_COMMAND_VIA_SHELL) ? strv_skip(command->argv, 1) : command->argv;
9,830✔
5970

5971
        if (final_argv && !FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
9,830✔
5972
                _cleanup_strv_free_ char **unset_variables = NULL, **bad_variables = NULL;
9,657✔
5973

5974
                r = replace_env_argv(final_argv, accum_env, &replaced_argv, &unset_variables, &bad_variables);
9,657✔
5975
                if (r < 0) {
9,657✔
5976
                        *exit_status = EXIT_MEMORY;
×
5977
                        return log_error_errno(r, "Failed to replace environment variables: %m");
×
5978
                }
5979
                final_argv = replaced_argv;
9,657✔
5980

5981
                if (!strv_isempty(unset_variables)) {
9,657✔
5982
                        _cleanup_free_ char *ju = strv_join(unset_variables, ", ");
10✔
5983
                        log_warning("Referenced but unset environment variable evaluates to an empty string: %s", strna(ju));
5✔
5984
                }
5985

5986
                if (!strv_isempty(bad_variables)) {
9,657✔
5987
                        _cleanup_free_ char *jb = strv_join(bad_variables, ", ");
×
5988
                        log_warning("Invalid environment variable name evaluates to an empty string: %s", strna(jb));
×
5989
                }
5990
        }
5991

5992
        if (FLAGS_SET(command->flags, EXEC_COMMAND_VIA_SHELL)) {
9,830✔
5993
                r = strv_extendf(&argv_via_shell, "%s%s", command->argv[0][0] == '-' ? "-" : "", path);
17✔
5994
                if (r < 0) {
13✔
5995
                        *exit_status = EXIT_MEMORY;
×
5996
                        return log_oom();
×
5997
                }
5998

5999
                if (!strv_isempty(final_argv)) {
13✔
6000
                        _cleanup_free_ char *cmdline_joined = NULL;
13✔
6001

6002
                        cmdline_joined = strv_join(final_argv, " ");
13✔
6003
                        if (!cmdline_joined) {
13✔
6004
                                *exit_status = EXIT_MEMORY;
×
6005
                                return log_oom();
×
6006
                        }
6007

6008
                        r = strv_extend_many(&argv_via_shell, "-c", cmdline_joined);
13✔
6009
                        if (r < 0) {
13✔
6010
                                *exit_status = EXIT_MEMORY;
×
6011
                                return log_oom();
×
6012
                        }
6013
                }
6014

6015
                final_argv = argv_via_shell;
13✔
6016
        }
6017

6018
        log_command_line(context, params, "Executing", executable, final_argv);
9,830✔
6019

6020
        /* We have finished with all our initializations. Let's now let the manager know that. From this
6021
         * point on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
6022

6023
        r = exec_fd_mark_hot(context, params, /* hot= */ true, exit_status);
9,830✔
6024
        if (r < 0)
9,830✔
6025
                return r;
6026

6027
        /* As last thing before the execve(), let's send the handoff timestamp */
6028
        r = send_handoff_timestamp(context, params, exit_status);
9,830✔
6029
        if (r < 0) {
9,830✔
6030
                /* If this handoff timestamp failed, let's undo the marking as hot */
6031
                (void) exec_fd_mark_hot(context, params, /* hot= */ false, /* reterr_exit_status= */ NULL);
×
6032
                return r;
6033
        }
6034

6035
        /* NB: we leave executable_fd, exec_fd, handoff_timestamp_fd open here. This is safe, because they
6036
         * have O_CLOEXEC set, and the execve() below will thus automatically close them. In fact, for
6037
         * exec_fd this is pretty much the whole raison d'etre. */
6038

6039
        r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
9,830✔
6040

6041
        /* The execve() failed, let's undo the marking as hot */
6042
        (void) exec_fd_mark_hot(context, params, /* hot= */ false, /* reterr_exit_status= */ NULL);
3✔
6043

6044
        *exit_status = EXIT_EXEC;
3✔
6045
        return log_error_errno(r, "Failed to execute %s: %m", executable);
3✔
6046
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc