• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

systemd / systemd / 16082515961

04 Jul 2025 08:23PM UTC coverage: 72.095% (-0.1%) from 72.193%
16082515961

push

github

poettering
seccomp-util: allowlist open_tree() as part of @file-system

Now that we make use of open_tree() in places we previously used
openat() with O_PATH, it makes sense to move it from @mount to
@file-system. Without the OPEN_TREE_CLONE flag open_tree() is after all
unprivileged.

Note that I left open_tree_attr() in @mount, since its purpose is
really to set mount options when cloning, and that's clearly a
mount-related thing, not so much something you could use unprivileged.

Follow-up for: c5de7b14a

This addresses an issue tracked down by Antonio Feijoo: since the commit
that started to use open_tree() various apps started to crash because
they used seccomp filters and sd-device started to use open_tree()
internally.

300842 of 417287 relevant lines covered (72.09%)

715300.57 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

78.04
/src/basic/namespace-util.c
1
/* SPDX-License-Identifier: LGPL-2.1-or-later */
2

3
#include <fcntl.h>
4
#include <sys/ioctl.h>
5
#include <sys/mount.h>
6
#include <unistd.h>
7

8
#include "errno-util.h"
9
#include "fd-util.h"
10
#include "fileio.h"
11
#include "log.h"
12
#include "missing_magic.h"
13
#include "missing_namespace.h"
14
#include "mountpoint-util.h"
15
#include "namespace-util.h"
16
#include "parse-util.h"
17
#include "pidfd-util.h"
18
#include "pidref.h"
19
#include "process-util.h"
20
#include "stat-util.h"
21
#include "stdio-util.h"
22
#include "uid-range.h"
23
#include "user-util.h"
24

25
const struct namespace_info namespace_info[_NAMESPACE_TYPE_MAX + 1] = {
26
        [NAMESPACE_CGROUP] =  { "cgroup", "ns/cgroup", CLONE_NEWCGROUP, PIDFD_GET_CGROUP_NAMESPACE, PROC_CGROUP_INIT_INO },
27
        [NAMESPACE_IPC]    =  { "ipc",    "ns/ipc",    CLONE_NEWIPC,    PIDFD_GET_IPC_NAMESPACE,    PROC_IPC_INIT_INO    },
28
        [NAMESPACE_NET]    =  { "net",    "ns/net",    CLONE_NEWNET,    PIDFD_GET_NET_NAMESPACE,    0                    },
29
        /* So, the mount namespace flag is called CLONE_NEWNS for historical
30
         * reasons. Let's expose it here under a more explanatory name: "mnt".
31
         * This is in-line with how the kernel exposes namespaces in /proc/$PID/ns. */
32
        [NAMESPACE_MOUNT]  =  { "mnt",    "ns/mnt",    CLONE_NEWNS,     PIDFD_GET_MNT_NAMESPACE,    0                    },
33
        [NAMESPACE_PID]    =  { "pid",    "ns/pid",    CLONE_NEWPID,    PIDFD_GET_PID_NAMESPACE,    PROC_PID_INIT_INO    },
34
        [NAMESPACE_USER]   =  { "user",   "ns/user",   CLONE_NEWUSER,   PIDFD_GET_USER_NAMESPACE,   PROC_USER_INIT_INO   },
35
        [NAMESPACE_UTS]    =  { "uts",    "ns/uts",    CLONE_NEWUTS,    PIDFD_GET_UTS_NAMESPACE,    PROC_UTS_INIT_INO    },
36
        [NAMESPACE_TIME]   =  { "time",   "ns/time",   CLONE_NEWTIME,   PIDFD_GET_TIME_NAMESPACE,   PROC_TIME_INIT_INO   },
37
        {}, /* Allow callers to iterate over the array without using _NAMESPACE_TYPE_MAX. */
38
};
39

40
/* Expands to procfs_file_alloca() for the given PID and the proc_path of the given namespace type.
 * NOTE(review): the "_alloca" suffix suggests the result lives on the caller's stack frame — confirm
 * before using this inside loops or returning the pointer. */
#define pid_namespace_path(pid, type) procfs_file_alloca(pid, namespace_info[type].proc_path)
41

42
NamespaceType clone_flag_to_namespace_type(unsigned long clone_flag) {
43
        for (NamespaceType t = 0; t < _NAMESPACE_TYPE_MAX; t++)
618✔
44
                if (((namespace_info[t].clone_flag ^ clone_flag) & (CLONE_NEWCGROUP|CLONE_NEWIPC|CLONE_NEWNET|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUSER|CLONE_NEWUTS|CLONE_NEWTIME)) == 0)
618✔
45
                        return t;
46

47
        return _NAMESPACE_TYPE_INVALID;
48
}
49

50
/* Returns true if the namespace file of the given type exists below /proc/ for PID 0 (presumably
 * meaning the calling process — confirm against procfs_file_alloca()), false otherwise. */
bool namespace_type_supported(NamespaceType type) {
        assert(type >= 0);
        assert(type < _NAMESPACE_TYPE_MAX);

        const char *ns_path = pid_namespace_path(0, type);

        /* Any access() failure is treated as "not supported". */
        if (access(ns_path, F_OK) < 0)
                return false;

        return true;
}
56

57
/* Opens the namespace of the given type of the process referenced by pidref and returns an fd to it.
 *
 * If the PidRef carries a pidfd, the namespace is first requested through pidfd_get_namespace(); only
 * if that reports "not supported" (or there is no pidfd) do we fall back to opening the namespace file
 * below /proc/.
 *
 * need_verify: if non-NULL it is set to true once the /proc/ fallback is taken, and the caller is
 * expected to call pidref_verify() afterwards. If NULL, pidref_verify() is called here.
 *
 * Returns the fd on success, or a negative errno-style error: -EREMOTE for remote PidRefs, -ENOSYS if
 * /proc/ is not available, -ENOPKG if /proc/ is around but the namespace file is missing. */
static int pidref_namespace_open_by_type_internal(const PidRef *pidref, NamespaceType type, bool *need_verify) {
        assert(pidref_is_set(pidref));
        assert(type >= 0 && type < _NAMESPACE_TYPE_MAX);

        if (pidref_is_remote(pidref))
                return -EREMOTE;

        if (pidref->fd >= 0) {
                int fd = pidfd_get_namespace(pidref->fd, namespace_info[type].pidfd_get_ns_ioctl_cmd);
                if (!ERRNO_IS_NEG_NOT_SUPPORTED(fd))
                        return fd; /* Success, or an error other than "not supported" */
        }

        /* From here on we go via /proc/, hence verification is needed: let the caller know. */
        if (need_verify)
                *need_verify = true;

        const char *ns_path = pid_namespace_path(pidref->pid, type);

        _cleanup_close_ int nsfd = RET_NERRNO(open(ns_path, O_RDONLY|O_NOCTTY|O_CLOEXEC));
        if (nsfd < 0) {
                if (nsfd == -ENOENT) {
                        int mounted = proc_mounted();
                        if (mounted == 0)
                                return -ENOSYS;  /* /proc/ is not available or not set up properly, most
                                                  * likely we run in some chroot environment. */
                        if (mounted > 0)
                                return -ENOPKG;  /* /proc/ is definitely there, hence this namespace type
                                                  * is not supported. */

                        /* Could not determine the state of /proc/: report the original error. */
                }

                return nsfd;
        }

        if (!need_verify) { /* Nobody else will verify, hence do it ourselves */
                int r = pidref_verify(pidref);
                if (r < 0)
                        return r;
        }

        return TAKE_FD(nsfd);
}
101

102
/* Opens the namespace of the given type of the referenced process. Passes no need_verify flag to the
 * internal helper, which therefore runs pidref_verify() itself when it has to go through /proc/. */
int pidref_namespace_open_by_type(const PidRef *pidref, NamespaceType type) {
        int nsfd = pidref_namespace_open_by_type_internal(pidref, type, /* need_verify= */ NULL);

        return nsfd;
}
105

106
/* Opens the namespace of the given type of the calling process: sets up a PidRef via pidref_set_self()
 * and hands it to pidref_namespace_open_by_type(). Returns an fd or a negative error. */
int namespace_open_by_type(NamespaceType type) {
        assert(type >= 0);
        assert(type < _NAMESPACE_TYPE_MAX);

        _cleanup_(pidref_done) PidRef myself = PIDREF_NULL;

        int r = pidref_set_self(&myself);
        if (r < 0)
                return r;

        return pidref_namespace_open_by_type(&myself, type);
}
118

119
int pidref_namespace_open(
120
                const PidRef *pidref,
121
                int *ret_pidns_fd,
122
                int *ret_mntns_fd,
123
                int *ret_netns_fd,
124
                int *ret_userns_fd,
125
                int *ret_root_fd) {
126

127
        _cleanup_close_ int pidns_fd = -EBADF, mntns_fd = -EBADF, netns_fd = -EBADF,
376✔
128
                userns_fd = -EBADF, root_fd = -EBADF;
376✔
129
        bool need_verify = false;
188✔
130
        int r;
188✔
131

132
        assert(pidref_is_set(pidref));
188✔
133

134
        if (pidref_is_remote(pidref))
376✔
135
                return -EREMOTE;
136

137
        if (ret_pidns_fd) {
188✔
138
                pidns_fd = pidref_namespace_open_by_type_internal(pidref, NAMESPACE_PID, &need_verify);
82✔
139
                if (pidns_fd < 0)
82✔
140
                        return pidns_fd;
141
        }
142

143
        if (ret_mntns_fd) {
188✔
144
                mntns_fd = pidref_namespace_open_by_type_internal(pidref, NAMESPACE_MOUNT, &need_verify);
83✔
145
                if (mntns_fd < 0)
83✔
146
                        return mntns_fd;
147
        }
148

149
        if (ret_netns_fd) {
188✔
150
                netns_fd = pidref_namespace_open_by_type_internal(pidref, NAMESPACE_NET, &need_verify);
108✔
151
                if (netns_fd < 0)
108✔
152
                        return netns_fd;
153
        }
154

155
        if (ret_userns_fd) {
188✔
156
                userns_fd = pidref_namespace_open_by_type_internal(pidref, NAMESPACE_USER, &need_verify);
13✔
157
                if (userns_fd < 0 && userns_fd != -ENOPKG)
13✔
158
                        return userns_fd;
159
        }
160

161
        if (ret_root_fd) {
188✔
162
                const char *root;
83✔
163

164
                root = procfs_file_alloca(pidref->pid, "root");
83✔
165
                root_fd = RET_NERRNO(open(root, O_CLOEXEC|O_DIRECTORY));
83✔
166
                if (root_fd == -ENOENT && proc_mounted() == 0)
×
167
                        return -ENOSYS;
168
                if (root_fd < 0)
83✔
169
                        return root_fd;
170

171
                need_verify = true;
83✔
172
        }
173

174
        if (need_verify) {
188✔
175
                r = pidref_verify(pidref);
188✔
176
                if (r < 0)
188✔
177
                        return r;
178
        }
179

180
        if (ret_pidns_fd)
188✔
181
                *ret_pidns_fd = TAKE_FD(pidns_fd);
82✔
182

183
        if (ret_mntns_fd)
188✔
184
                *ret_mntns_fd = TAKE_FD(mntns_fd);
83✔
185

186
        if (ret_netns_fd)
188✔
187
                *ret_netns_fd = TAKE_FD(netns_fd);
108✔
188

189
        if (ret_userns_fd)
188✔
190
                *ret_userns_fd = TAKE_FD(userns_fd);
13✔
191

192
        if (ret_root_fd)
188✔
193
                *ret_root_fd = TAKE_FD(root_fd);
83✔
194

195
        return 0;
196
}
197

198
int namespace_open(
199
                pid_t pid,
200
                int *ret_pidns_fd,
201
                int *ret_mntns_fd,
202
                int *ret_netns_fd,
203
                int *ret_userns_fd,
204
                int *ret_root_fd) {
205

206
        _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
11✔
207
        int r;
11✔
208

209
        r = pidref_set_pid(&pidref, pid);
11✔
210
        if (r < 0)
11✔
211
                return r;
212

213
        return pidref_namespace_open(&pidref, ret_pidns_fd, ret_mntns_fd, ret_netns_fd, ret_userns_fd, ret_root_fd);
11✔
214
}
215

216
int namespace_enter(int pidns_fd, int mntns_fd, int netns_fd, int userns_fd, int root_fd) {
217
        int r;
407✔
218

219
        if (userns_fd >= 0) {
407✔
220
                /* Can't setns to your own userns, since then you could escalate from non-root to root in
221
                 * your own namespace, so check if namespaces are equal before attempting to enter. */
222

223
                r = is_our_namespace(userns_fd, NAMESPACE_USER);
2✔
224
                if (r < 0)
2✔
225
                        return r;
226
                if (r > 0)
2✔
227
                        userns_fd = -EBADF;
2✔
228
        }
229

230
        if (pidns_fd >= 0)
407✔
231
                if (setns(pidns_fd, CLONE_NEWPID) < 0)
60✔
232
                        return -errno;
×
233

234
        if (mntns_fd >= 0)
407✔
235
                if (setns(mntns_fd, CLONE_NEWNS) < 0)
285✔
236
                        return -errno;
×
237

238
        if (netns_fd >= 0)
407✔
239
                if (setns(netns_fd, CLONE_NEWNET) < 0)
123✔
240
                        return -errno;
×
241

242
        if (userns_fd >= 0)
407✔
243
                if (setns(userns_fd, CLONE_NEWUSER) < 0)
×
244
                        return -errno;
×
245

246
        if (root_fd >= 0) {
407✔
247
                if (fchdir(root_fd) < 0)
61✔
248
                        return -errno;
×
249

250
                if (chroot(".") < 0)
61✔
251
                        return -errno;
×
252
        }
253

254
        if (userns_fd >= 0)
407✔
255
                return reset_uid_gid();
×
256

257
        return 0;
258
}
259

260
/* Checks whether the specified fd refers to a namespace, and — unless type is negative, i.e.
 * _NAMESPACE_TYPE_INVALID — whether it is a namespace of the specified type.
 *
 * Returns > 0 if so, 0 if not, and a negative error on failure (-EBADF if the kernel reports a
 * namespace type we do not know). */
int fd_is_namespace(int fd, NamespaceType type) {
        assert(fd >= 0);
        assert(type < _NAMESPACE_TYPE_MAX);

        int on_nsfs = fd_is_fs_type(fd, NSFS_MAGIC);
        if (on_nsfs <= 0)
                return on_nsfs;

        /* No specific type requested? Then being on nsfs is all we check. */
        if (type < 0)
                return true;

        int nstype = ioctl(fd, NS_GET_NSTYPE);
        if (nstype < 0)
                return -errno;

        NamespaceType actual = clone_flag_to_namespace_type(nstype);
        if (actual < 0)
                return -EBADF; /* Hmm, a namespace type we have never heard of? */

        return actual == type;
}
285

286
/* Checks whether the namespace fd refers to the namespace of the given type the calling process is
 * in, by comparing it via fd_inode_same() with a freshly opened fd of our own namespace.
 *
 * Returns the result of fd_inode_same(), -EUCLEAN if fd is not a namespace or not of the right type,
 * or another negative error.
 *
 * NOTE(review): a negative type passes the assertions here, but namespace_open_by_type() asserts
 * type >= 0 — callers presumably always pass a concrete type; confirm. */
int is_our_namespace(int fd, NamespaceType type) {
        assert(fd >= 0);
        assert(type < _NAMESPACE_TYPE_MAX);

        int matches = fd_is_namespace(fd, type);
        if (matches < 0)
                return matches;
        if (matches == 0) /* Not a namespace at all, or one of a different type */
                return -EUCLEAN;

        _cleanup_close_ int own_ns = namespace_open_by_type(type);
        if (own_ns < 0)
                return own_ns;

        return fd_inode_same(fd, own_ns);
}
304

305
/* Checks whether we are in the initial namespace of the given type, by comparing the inode number of
 * our namespace file with the root_inode recorded in namespace_info[].
 *
 * Returns > 0 if so, 0 if not, -EBADR if no inode number is recorded for this type (i.e. the question
 * cannot be answered), -ENOSYS if the namespace file is missing and /proc/ is not mounted, or another
 * negative error. */
int namespace_is_init(NamespaceType type) {
        assert(type >= 0);
        assert(type < _NAMESPACE_TYPE_MAX);

        if (namespace_info[type].root_inode == 0)
                return -EBADR;

        const char *ns_path = pid_namespace_path(0, type);
        struct stat st;

        int r = RET_NERRNO(stat(ns_path, &st));
        if (r >= 0)
                return st.st_ino == namespace_info[type].root_inode;
        if (r != -ENOENT)
                return r;

        /* The namespace file does not exist. If /proc/ is mounted nonetheless, then this namespace type
         * is turned off in the kernel, and hence we must be in the initial namespace. */
        int mounted = proc_mounted();
        if (mounted < 0)
                return -ENOENT; /* Cannot tell whether /proc/ is mounted: report the original error */
        if (mounted == 0)
                return -ENOSYS;

        return true;
}
331

332
/* Checks whether two processes are in the same namespace of the given type. Either PidRef may be
 * NULL, which refers to the calling process.
 *
 * Returns true right away if pidref_equal() considers both references equal, otherwise the result of
 * fd_inode_same() on the two namespace fds, or a negative error. */
int pidref_in_same_namespace(PidRef *pid1, PidRef *pid2, NamespaceType type) {
        assert(!pid1 || pidref_is_set(pid1));
        assert(!pid2 || pidref_is_set(pid2));
        assert(type >= 0 && type < _NAMESPACE_TYPE_MAX);

        if (pidref_equal(pid1, pid2))
                return true;

        _cleanup_close_ int first_ns =
                pid1 ? pidref_namespace_open_by_type(pid1, type) : namespace_open_by_type(type);
        if (first_ns < 0)
                return first_ns;

        _cleanup_close_ int second_ns =
                pid2 ? pidref_namespace_open_by_type(pid2, type) : namespace_open_by_type(type);
        if (second_ns < 0)
                return second_ns;

        return fd_inode_same(first_ns, second_ns);
}
360

361
/* Same as pidref_in_same_namespace(), but takes plain PIDs. A PID of 0 is passed on as NULL, i.e.
 * refers to the calling process. */
int in_same_namespace(pid_t pid1, pid_t pid2, NamespaceType type) {
        assert(pid1 >= 0);
        assert(pid2 >= 0);

        PidRef *first = pid1 == 0 ? NULL : &PIDREF_MAKE_FROM_PID(pid1);
        PidRef *second = pid2 == 0 ? NULL : &PIDREF_MAKE_FROM_PID(pid2);

        return pidref_in_same_namespace(first, second, type);
}
368

369
/* Walks up the process tree starting at pidref until a process is found whose parent lives in a
 * different namespace of the given type; that process is returned in *ret.
 *
 * Returns 0 on success, a negative error otherwise. */
int namespace_get_leader(PidRef *pidref, NamespaceType type, PidRef *ret) {
        int r;

        /* pidref_is_set()/pidref_is_remote() are not checked here, since pidref_get_ppid_as_pidref() —
         * the first thing we call — does that anyway. */

        assert(type >= 0 && type < _NAMESPACE_TYPE_MAX);
        assert(ret);

        /* "owned" holds the process we currently look at once we moved past the caller's pidref, so
         * that it is released automatically. "cursor" points to either the caller's pidref or "owned". */
        _cleanup_(pidref_done) PidRef owned = PIDREF_NULL;
        PidRef *cursor = pidref;

        for (;;) {
                _cleanup_(pidref_done) PidRef parent = PIDREF_NULL;

                r = pidref_get_ppid_as_pidref(cursor, &parent);
                if (r < 0)
                        return r;

                r = pidref_in_same_namespace(cursor, &parent, type);
                if (r < 0)
                        return r;
                if (r > 0) {
                        /* Same namespace as the parent: not the leader yet, move one level up. */
                        pidref_done(&owned);
                        owned = TAKE_PIDREF(parent);
                        cursor = &owned;
                        continue;
                }

                /* Parent and child are in different namespaces, hence the child is the leader. */
                if (pidref_is_set(&owned)) {
                        *ret = TAKE_PIDREF(owned);
                        return 0;
                }

                /* Still at the caller's pidref, which we do not own: hand out a copy. */
                r = pidref_copy(cursor, ret);
                if (r < 0)
                        return r;

                return 0;
        }
}
411

412
int detach_mount_namespace(void) {
413
        /* Detaches the mount namespace, disabling propagation from our namespace to the host. Sets
414
         * propagation first to MS_SLAVE for all mounts (disabling propagation), and then back to MS_SHARED
415
         * (so that we create a new peer group).  */
416

417
        if (unshare(CLONE_NEWNS) < 0)
197✔
418
                return log_debug_errno(errno, "Failed to acquire mount namespace: %m");
×
419

420
        if (mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL) < 0)
197✔
421
                return log_debug_errno(errno, "Failed to set mount propagation to MS_SLAVE for all mounts: %m");
×
422

423
        if (mount(NULL, "/", NULL, MS_SHARED | MS_REC, NULL) < 0)
197✔
424
                return log_debug_errno(errno, "Failed to set mount propagation back to MS_SHARED for all mounts: %m");
×
425

426
        return 0;
427
}
428

429
/* Tries detach_mount_namespace() first. If that fails with -EPERM, opens up an unprivileged user
 * namespace that maps the originating UID/GID to the specified target UID/GID, and then tries
 * detach_mount_namespace() once more.
 *
 * In other words: tries much harder to get a mount namespace, making use of unprivileged user
 * namespaces if need be.
 *
 * Note that after this function completed:
 *
 *    → if we had privileges, UIDs/GIDs on files and processes are as before
 *
 *    → if we had no privileges, our own ID and all our files show up as owned by
 *      target_uid/target_gid, and everything else as owned by nobody.
 *
 * Yes, that is quite a difference.
 *
 * Returns -EINVAL if either target ID is invalid, 0 on success, or another negative error. */
int detach_mount_namespace_harder(uid_t target_uid, gid_t target_gid) {
        int r;

        if (!uid_is_valid(target_uid) || !gid_is_valid(target_gid))
                return -EINVAL;

        r = detach_mount_namespace();
        if (r != -EPERM) /* Success, or a failure a user namespace would not help with */
                return r;

        /* Collect our IDs before unsharing, they are what gets mapped below. */
        const uid_t from_uid = getuid();
        const gid_t from_gid = getgid();

        if (unshare(CLONE_NEWUSER) < 0)
                return log_debug_errno(errno, "Failed to acquire user namespace: %m");

        r = write_string_filef("/proc/self/uid_map", 0,
                               UID_FMT " " UID_FMT " 1\n", target_uid, from_uid);
        if (r < 0)
                return log_debug_errno(r, "Failed to write uid map: %m");

        /* Note the order: "deny" goes into setgroups before the GID map is written. */
        r = write_string_file("/proc/self/setgroups", "deny", 0);
        if (r < 0)
                return log_debug_errno(r, "Failed to write setgroups file: %m");

        r = write_string_filef("/proc/self/gid_map", 0,
                               GID_FMT " " GID_FMT " 1\n", target_gid, from_gid);
        if (r < 0)
                return log_debug_errno(r, "Failed to write gid map: %m");

        return detach_mount_namespace();
}
481

482
int detach_mount_namespace_userns(int userns_fd) {
483
        int r;
2✔
484

485
        assert(userns_fd >= 0);
2✔
486

487
        if (setns(userns_fd, CLONE_NEWUSER) < 0)
2✔
488
                return log_debug_errno(errno, "Failed to join user namespace: %m");
×
489

490
        r = reset_uid_gid();
2✔
491
        if (r < 0)
2✔
492
                return log_debug_errno(r, "Failed to become root in user namespace: %m");
×
493

494
        return detach_mount_namespace();
2✔
495
}
496

497
int parse_userns_uid_range(const char *s, uid_t *ret_uid_shift, uid_t *ret_uid_range) {
498
        _cleanup_free_ char *buffer = NULL;
2✔
499
        const char *range, *shift;
2✔
500
        int r;
2✔
501
        uid_t uid_shift, uid_range = 65536;
2✔
502

503
        assert(s);
2✔
504

505
        range = strchr(s, ':');
2✔
506
        if (range) {
2✔
507
                buffer = strndup(s, range - s);
×
508
                if (!buffer)
×
509
                        return log_oom();
×
510
                shift = buffer;
×
511

512
                range++;
×
513
                r = safe_atou32(range, &uid_range);
×
514
                if (r < 0)
×
515
                        return log_error_errno(r, "Failed to parse UID range \"%s\": %m", range);
×
516
        } else
517
                shift = s;
518

519
        r = parse_uid(shift, &uid_shift);
2✔
520
        if (r < 0)
2✔
521
                return log_error_errno(r, "Failed to parse UID \"%s\": %m", s);
2✔
522

523
        if (!userns_shift_range_valid(uid_shift, uid_range))
×
524
                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "UID range cannot be empty or go beyond " UID_FMT ".", UID_INVALID);
×
525

526
        if (ret_uid_shift)
×
527
                *ret_uid_shift = uid_shift;
×
528

529
        if (ret_uid_range)
×
530
                *ret_uid_range = uid_range;
×
531

532
        return 0;
533
}
534

535
int userns_acquire_empty(void) {
536
        _cleanup_(pidref_done_sigkill_wait) PidRef pid = PIDREF_NULL;
22✔
537
        int r;
22✔
538

539
        r = pidref_safe_fork("(sd-mkuserns)", FORK_CLOSE_ALL_FDS|FORK_DEATHSIG_SIGKILL|FORK_NEW_USERNS|FORK_FREEZE, &pid);
22✔
540
        if (r < 0)
22✔
541
                return r;
542
        assert(r > 0);
22✔
543

544
        return pidref_namespace_open_by_type(&pid, NAMESPACE_USER);
22✔
545
}
546

547
int userns_acquire(const char *uid_map, const char *gid_map, bool setgroups_deny) {
548
        char path[STRLEN("/proc//setgroups") + DECIMAL_STR_MAX(pid_t) + 1];
3,891✔
549
        _cleanup_(pidref_done_sigkill_wait) PidRef pid = PIDREF_NULL;
3,891✔
550
        int r;
3,891✔
551

552
        assert(uid_map);
3,891✔
553
        assert(gid_map);
3,891✔
554

555
        /* Forks off a process in a new userns, configures the specified uidmap/gidmap, acquires an fd to it,
556
         * and then kills the process again. This way we have a userns fd that is not bound to any
557
         * process. We can use that for file system mounts and similar. */
558

559
        r = pidref_safe_fork("(sd-mkuserns)", FORK_CLOSE_ALL_FDS|FORK_DEATHSIG_SIGKILL|FORK_NEW_USERNS|FORK_FREEZE, &pid);
3,891✔
560
        if (r < 0)
3,891✔
561
                return r;
562
        assert(r > 0);
3,891✔
563

564
        xsprintf(path, "/proc/" PID_FMT "/uid_map", pid.pid);
3,891✔
565
        r = write_string_file(path, uid_map, WRITE_STRING_FILE_DISABLE_BUFFER);
3,891✔
566
        if (r < 0)
3,891✔
567
                return log_debug_errno(r, "Failed to write UID map: %m");
×
568

569
        if (setgroups_deny) {
3,891✔
570
                xsprintf(path, "/proc/" PID_FMT "/setgroups", pid.pid);
3,891✔
571
                r = write_string_file(path, "deny", WRITE_STRING_FILE_DISABLE_BUFFER);
3,891✔
572
                if (r < 0)
3,891✔
573
                        return log_debug_errno(r, "Failed to write setgroups file: %m");
×
574
        }
575

576
        xsprintf(path, "/proc/" PID_FMT "/gid_map", pid.pid);
3,891✔
577
        r = write_string_file(path, gid_map, WRITE_STRING_FILE_DISABLE_BUFFER);
3,891✔
578
        if (r < 0)
3,891✔
579
                return log_debug_errno(r, "Failed to write GID map: %m");
×
580

581
        return pidref_namespace_open_by_type(&pid, NAMESPACE_USER);
3,891✔
582
}
583

584
int userns_acquire_self_root(void) {
585

586
        /* Returns a user namespace with only our own uid/gid mapped to root, and everything else unmapped.
587
         *
588
         * Note: this can be acquired unprivileged! */
589

590
        _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
3,747✔
591
        if (asprintf(&uid_map, "0 " UID_FMT " 1", getuid()) < 0)
3,747✔
592
                return -ENOMEM;
593
        if (asprintf(&gid_map, "0 " GID_FMT " 1", getgid()) < 0)
3,747✔
594
                return -ENOMEM;
595

596
        return userns_acquire(uid_map, gid_map, /* setgroups_deny= */ true);
3,747✔
597
}
598

599
int userns_enter_and_pin(int userns_fd, pid_t *ret_pid) {
600
        _cleanup_close_pair_ int pfd[2] = EBADF_PAIR;
44✔
601
        _cleanup_(sigkill_waitp) pid_t pid = 0;
44✔
602
        ssize_t n;
44✔
603
        char x;
44✔
604
        int r;
44✔
605

606
        assert(userns_fd >= 0);
44✔
607
        assert(ret_pid);
44✔
608

609
        if (pipe2(pfd, O_CLOEXEC) < 0)
44✔
610
                return -errno;
×
611

612
        r = safe_fork_full(
88✔
613
                        "(sd-pinuserns)",
614
                        /* stdio_fds= */ NULL,
615
                        (int[]) { pfd[1], userns_fd }, 2,
44✔
616
                        FORK_CLOSE_ALL_FDS|FORK_DEATHSIG_SIGKILL,
617
                        &pid);
618
        if (r < 0)
44✔
619
                return r;
620
        if (r == 0) {
44✔
621
                /* Child. */
622

623
                if (setns(userns_fd, CLONE_NEWUSER) < 0) {
×
624
                        log_debug_errno(errno, "Failed to join userns: %m");
×
625
                        _exit(EXIT_FAILURE);
×
626
                }
627

628
                userns_fd = safe_close(userns_fd);
×
629

630
                n = write(pfd[1], &(const char) { 'x' }, 1);
×
631
                if (n < 0) {
×
632
                        log_debug_errno(errno, "Failed to write to pipe: %m");
×
633
                        _exit(EXIT_FAILURE);
×
634
                }
635
                assert(n == 1);
×
636

637
                freeze();
×
638
        }
639

640
        pfd[1] = safe_close(pfd[1]);
44✔
641

642
        n = read(pfd[0], &x, 1);
44✔
643
        if (n < 0)
44✔
644
                return -errno;
×
645
        if (n == 0)
44✔
646
                return -EPROTO;
647
        assert(n == 1);
44✔
648
        assert(x == 'x');
44✔
649

650
        *ret_pid = TAKE_PID(pid);
44✔
651
        return 0;
44✔
652
}
653

654
/* Returns true if /proc/self/uid_map exists, which is what we take as indication that user
 * namespaces are supported. Any access() failure counts as "not supported". */
bool userns_supported(void) {
        if (access("/proc/self/uid_map", F_OK) < 0)
                return false;

        return true;
}
657

658
int userns_get_base_uid(int userns_fd, uid_t *ret_uid, gid_t *ret_gid) {
659
        _cleanup_(sigkill_waitp) pid_t pid = 0;
24✔
660
        int r;
24✔
661

662
        assert(userns_fd >= 0);
24✔
663

664
        r = userns_enter_and_pin(userns_fd, &pid);
24✔
665
        if (r < 0)
24✔
666
                return r;
667

668
        uid_t uid;
24✔
669
        r = uid_map_search_root(pid, UID_RANGE_USERNS_OUTSIDE, &uid);
24✔
670
        if (r < 0)
24✔
671
                return r;
672

673
        gid_t gid;
21✔
674
        r = uid_map_search_root(pid, GID_RANGE_USERNS_OUTSIDE, &gid);
21✔
675
        if (r < 0)
21✔
676
                return r;
677

678
        if (!ret_gid && uid != gid)
21✔
679
                return -EUCLEAN;
680

681
        if (ret_uid)
18✔
682
                *ret_uid = uid;
18✔
683
        if (ret_gid)
18✔
684
                *ret_gid = gid;
3✔
685

686
        return 0;
687
}
688

689
int process_is_owned_by_uid(const PidRef *pidref, uid_t uid) {
690
        int r;
9✔
691

692
        /* Checks if the specified process either is owned directly by the specified user, or if it is inside
693
         * a user namespace owned by it. */
694

695
        assert(uid_is_valid(uid));
9✔
696

697
        uid_t process_uid;
9✔
698
        r = pidref_get_uid(pidref, &process_uid);
9✔
699
        if (r < 0)
9✔
700
                return r;
9✔
701
        if (process_uid == uid)
9✔
702
                return true;
703

704
        _cleanup_close_ int userns_fd = -EBADF;
9✔
705
        userns_fd = pidref_namespace_open_by_type(pidref, NAMESPACE_USER);
6✔
706
        if (userns_fd == -ENOPKG) /* If userns is not supported, then they don't matter for ownership */
6✔
707
                return false;
708
        if (userns_fd < 0)
6✔
709
                return userns_fd;
710

711
        for (unsigned iteration = 0;; iteration++) {
×
712
                uid_t ns_uid;
6✔
713

714
                /* This process is in our own userns? Then we are done, in our own userns only the UIDs
715
                 * themselves matter. */
716
                r = is_our_namespace(userns_fd, NAMESPACE_USER);
6✔
717
                if (r < 0)
6✔
718
                        return r;
6✔
719
                if (r > 0)
6✔
720
                        return false;
721

722
                if (ioctl(userns_fd, NS_GET_OWNER_UID, &ns_uid) < 0)
3✔
723
                        return -errno;
×
724
                if (ns_uid == uid)
3✔
725
                        return true;
726

727
                /* Paranoia check */
728
                if (iteration > 16)
×
729
                        return log_debug_errno(SYNTHETIC_ERRNO(ELOOP), "Giving up while tracing parents of user namespaces after %u steps.", iteration);
×
730

731
                /* Go up the tree */
732
                _cleanup_close_ int parent_fd = ioctl(userns_fd, NS_GET_USERNS);
6✔
733
                if (parent_fd < 0) {
×
734
                        if (errno == EPERM) /* EPERM means we left our own userns */
×
735
                                return false;
736

737
                        return -errno;
×
738
                }
739

740
                close_and_replace(userns_fd, parent_fd);
×
741
        }
742
}
743

744
int is_idmapping_supported(const char *path) {
745
        _cleanup_close_ int mount_fd = -EBADF, userns_fd = -EBADF, dir_fd = -EBADF;
7,492✔
746
        int r;
3,746✔
747

748
        assert(path);
3,746✔
749

750
        if (!mount_new_api_supported())
3,746✔
751
                return false;
752

753
        userns_fd = r = userns_acquire_self_root();
3,746✔
754
        if (ERRNO_IS_NEG_NOT_SUPPORTED(r) || ERRNO_IS_NEG_PRIVILEGE(r) || r == -EINVAL)
3,746✔
755
                return false;
756
        if (r == -ENOSPC) {
3,746✔
757
                log_debug_errno(r, "Failed to acquire new user namespace, user.max_user_namespaces seems to be exhausted or maybe even zero, assuming ID-mapping is not supported: %m");
×
758
                return false;
×
759
        }
760
        if (r < 0)
3,746✔
761
                return log_debug_errno(r, "Failed to acquire new user namespace for checking if '%s' supports ID-mapping: %m", path);
×
762

763
        dir_fd = r = RET_NERRNO(open(path, O_RDONLY | O_CLOEXEC | O_NOFOLLOW));
3,746✔
764
        if (ERRNO_IS_NEG_NOT_SUPPORTED(r))
7,492✔
765
                return false;
766
        if (r < 0)
3,746✔
767
                return log_debug_errno(r, "Failed to open '%s', cannot determine if ID-mapping is supported: %m", path);
×
768

769
        mount_fd = r = RET_NERRNO(open_tree(dir_fd, "", AT_EMPTY_PATH | OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC));
3,746✔
770
        if (ERRNO_IS_NEG_NOT_SUPPORTED(r) || ERRNO_IS_NEG_PRIVILEGE(r) || r == -EINVAL)
3,746✔
771
                return false;
×
772
        if (r < 0)
3,746✔
773
                return log_debug_errno(r, "Failed to open mount tree '%s', cannot determine if ID-mapping is supported: %m", path);
×
774

775
        r = RET_NERRNO(mount_setattr(mount_fd, "", AT_EMPTY_PATH,
3,746✔
776
                       &(struct mount_attr) {
3,746✔
777
                                .attr_set = MOUNT_ATTR_IDMAP | MOUNT_ATTR_NOSUID | MOUNT_ATTR_NOEXEC | MOUNT_ATTR_RDONLY | MOUNT_ATTR_NODEV,
778
                                .userns_fd = userns_fd,
779
                        }, sizeof(struct mount_attr)));
780
        if (ERRNO_IS_NEG_NOT_SUPPORTED(r) || ERRNO_IS_NEG_PRIVILEGE(r) || r == -EINVAL)
3,746✔
781
                return false;
×
782
        if (r < 0)
3,746✔
783
                return log_debug_errno(r, "Failed to set mount attribute to '%s', cannot determine if ID-mapping is supported: %m", path);
×
784

785
        return true;
786
}
787

788
int netns_acquire(void) {
789
        _cleanup_(pidref_done_sigkill_wait) PidRef pid = PIDREF_NULL;
7✔
790
        int r;
7✔
791

792
        /* Forks off a process in a new network namespace, acquires a network namespace fd, and then kills
793
         * the process again. This way we have a netns fd that is not bound to any process. */
794

795
        r = pidref_safe_fork("(sd-mknetns)", FORK_CLOSE_ALL_FDS|FORK_DEATHSIG_SIGKILL|FORK_NEW_NETNS|FORK_FREEZE, &pid);
7✔
796
        if (r < 0)
7✔
797
                return log_debug_errno(r, "Failed to fork process into new netns: %m");
×
798
        assert(r > 0);
7✔
799

800
        return pidref_namespace_open_by_type(&pid, NAMESPACE_NET);
7✔
801
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc