• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

systemd / systemd / 14630481637

23 Apr 2025 07:04PM UTC coverage: 72.178% (-0.002%) from 72.18%
14630481637

push

github

DaanDeMeyer
mkosi: Run clangd within the tools tree instead of the build container

Running within the build sandbox has a number of disadvantages:
- We have a separate clangd cache for each distribution/release combo
- It requires building the full image before clangd can be used
- It breaks every time the image becomes out of date and requires a
  rebuild
- We can't look at system headers as we don't have the knowledge to map
  them from inside the build sandbox to the corresponding path on the host

Instead, let's have mkosi.clangd run clangd within the tools tree. We
already require building systemd for both the host and the target anyway,
and all the dependencies to build systemd are installed in the tools tree
already for that, as well as clangd since it's installed together with the
other clang tooling we install in the tools tree. Unlike the previous approach,
this approach only requires the mkosi tools tree to be built upfront, which has
a much higher chance of not invalidating its cache. We can also trivially map
system header lookups from within the sandbox to the path within mkosi.tools
on the host so that starts working as well.

297054 of 411557 relevant lines covered (72.18%)

686269.58 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

72.79
/src/shared/mount-util.c
1
/* SPDX-License-Identifier: LGPL-2.1-or-later */
2

3
#include <errno.h>
4
#include <stdlib.h>
5
#include <sys/mount.h>
6
#include <sys/stat.h>
7
#include <sys/statvfs.h>
8
#include <unistd.h>
9
#include <linux/loop.h>
10

11
#include "alloc-util.h"
12
#include "chase.h"
13
#include "dissect-image.h"
14
#include "exec-util.h"
15
#include "extract-word.h"
16
#include "fd-util.h"
17
#include "fileio.h"
18
#include "fs-util.h"
19
#include "fstab-util.h"
20
#include "glyph-util.h"
21
#include "hashmap.h"
22
#include "initrd-util.h"
23
#include "label-util.h"
24
#include "libmount-util.h"
25
#include "log.h"
26
#include "missing_syscall.h"
27
#include "mkdir-label.h"
28
#include "mount-util.h"
29
#include "mountpoint-util.h"
30
#include "namespace-util.h"
31
#include "parse-util.h"
32
#include "path-util.h"
33
#include "process-util.h"
34
#include "set.h"
35
#include "sort-util.h"
36
#include "stat-util.h"
37
#include "stdio-util.h"
38
#include "string-table.h"
39
#include "string-util.h"
40
#include "strv.h"
41
#include "tmpfile-util.h"
42
#include "user-util.h"
43

44
int umount_recursive_full(const char *prefix, int flags, char **keep) {
8,737✔
45
        _cleanup_fclose_ FILE *f = NULL;
8,737✔
46
        int n = 0, r;
8,737✔
47

48
        /* Try to umount everything recursively below a directory. Also, take care of stacked mounts, and
49
         * keep unmounting them until they are gone. */
50

51
        f = fopen("/proc/self/mountinfo", "re"); /* Pin the file, in case we unmount /proc/ as part of the logic here */
8,737✔
52
        if (!f)
8,737✔
53
                return log_debug_errno(errno, "Failed to open /proc/self/mountinfo: %m");
×
54

55
        for (;;) {
45,081✔
56
                _cleanup_(mnt_free_tablep) struct libmnt_table *table = NULL;
18,172✔
57
                _cleanup_(mnt_free_iterp) struct libmnt_iter *iter = NULL;
26,909✔
58
                bool again = false;
26,909✔
59

60
                r = libmount_parse_mountinfo(f, &table, &iter);
26,909✔
61
                if (r < 0)
26,909✔
62
                        return log_debug_errno(r, "Failed to parse /proc/self/mountinfo: %m");
×
63

64
                for (;;) {
2,849,853✔
65
                        bool shall_keep = false;
1,438,381✔
66
                        struct libmnt_fs *fs;
1,438,381✔
67
                        const char *path;
1,438,381✔
68

69
                        r = mnt_table_next_fs(table, iter, &fs);
1,438,381✔
70
                        if (r == 1)
1,438,381✔
71
                                break;
72
                        if (r < 0)
1,429,644✔
73
                                return log_debug_errno(r, "Failed to get next entry from /proc/self/mountinfo: %m");
×
74

75
                        path = mnt_fs_get_target(fs);
1,429,644✔
76
                        if (!path)
1,429,644✔
77
                                continue;
1,411,472✔
78

79
                        if (prefix && !path_startswith(path, prefix)) {
2,858,957✔
80
                                // FIXME: This is extremely noisy, we're probably doing something very wrong
81
                                // to trigger this so often, needs more investigation.
82
                                // log_trace("Not unmounting %s, outside of prefix: %s", path, prefix);
83
                                continue;
1,402,075✔
84
                        }
85

86
                        STRV_FOREACH(k, keep)
27,639✔
87
                                /* Match against anything in the path to the dirs to keep, or below the dirs to keep */
88
                                if (path_startswith(path, *k) || path_startswith(*k, path)) {
346✔
89
                                        shall_keep = true;
276✔
90
                                        break;
276✔
91
                                }
92
                        if (shall_keep) {
27,845✔
93
                                log_debug("Not unmounting %s, referenced by keep list.", path);
276✔
94
                                continue;
276✔
95
                        }
96

97
                        if (umount2(path, flags | UMOUNT_NOFOLLOW) < 0) {
27,293✔
98
                                log_debug_errno(errno, "Failed to umount %s, ignoring: %m", path);
9,121✔
99
                                continue;
9,121✔
100
                        }
101

102
                        log_trace("Successfully unmounted %s", path);
18,172✔
103

104
                        again = true;
18,172✔
105
                        n++;
18,172✔
106

107
                        break;
18,172✔
108
                }
109

110
                if (!again)
8,737✔
111
                        break;
112

113
                rewind(f);
18,172✔
114
        }
115

116
        return n;
8,737✔
117
}
118

119
#define MS_CONVERTIBLE_FLAGS (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_NOSYMFOLLOW)
120

121
static uint64_t ms_flags_to_mount_attr(unsigned long a) {
39,654✔
122
        uint64_t f = 0;
39,654✔
123

124
        if (FLAGS_SET(a, MS_RDONLY))
39,654✔
125
                f |= MOUNT_ATTR_RDONLY;
1,529✔
126

127
        if (FLAGS_SET(a, MS_NOSUID))
39,654✔
128
                f |= MOUNT_ATTR_NOSUID;
18,300✔
129

130
        if (FLAGS_SET(a, MS_NODEV))
39,654✔
131
                f |= MOUNT_ATTR_NODEV;
2✔
132

133
        if (FLAGS_SET(a, MS_NOEXEC))
39,654✔
134
                f |= MOUNT_ATTR_NOEXEC;
2✔
135

136
        if (FLAGS_SET(a, MS_NOSYMFOLLOW))
39,654✔
137
                f |= MOUNT_ATTR_NOSYMFOLLOW;
×
138

139
        return f;
39,654✔
140
}
141

142
static bool skip_mount_set_attr = false;
143

144
/* Use this function only if you do not have direct access to /proc/self/mountinfo but the caller can open it
145
 * for you. This is the case when /proc is masked or not mounted. Otherwise, use bind_remount_recursive. */
146
int bind_remount_recursive_with_mountinfo(
37,271✔
147
                const char *prefix,
148
                unsigned long new_flags,
149
                unsigned long flags_mask,
150
                char **deny_list,
151
                FILE *proc_self_mountinfo) {
152

153
        _cleanup_fclose_ FILE *proc_self_mountinfo_opened = NULL;
37,271✔
154
        _cleanup_set_free_ Set *done = NULL;
37,271✔
155
        unsigned n_tries = 0;
37,271✔
156
        int r;
37,271✔
157

158
        assert(prefix);
37,271✔
159

160
        if ((flags_mask & ~MS_CONVERTIBLE_FLAGS) == 0 && strv_isempty(deny_list) && !skip_mount_set_attr) {
54,792✔
161
                /* Let's take a shortcut for all the flags we know how to convert into mount_setattr() flags */
162

163
                if (mount_setattr(AT_FDCWD, prefix, AT_SYMLINK_NOFOLLOW|AT_RECURSIVE,
17,521✔
164
                                  &(struct mount_attr) {
17,521✔
165
                                          .attr_set = ms_flags_to_mount_attr(new_flags & flags_mask),
17,521✔
166
                                          .attr_clr = ms_flags_to_mount_attr(~new_flags & flags_mask),
17,521✔
167
                                  }, MOUNT_ATTR_SIZE_VER0) < 0) {
168

169
                        log_debug_errno(errno, "mount_setattr() failed, falling back to classic remounting: %m");
2✔
170

171
                        /* We fall through to classic behaviour if not supported (i.e. kernel < 5.12). We
172
                         * also do this for all other kinds of errors since they are so many different, and
173
                         * mount_setattr() has no graceful mode where it continues despite seeing errors one
174
                         * some mounts, but we want that. Moreover mount_setattr() only works on the mount
175
                         * point inode itself, not a non-mount point inode, and we want to support arbitrary
176
                         * prefixes here. */
177

178
                        if (ERRNO_IS_NOT_SUPPORTED(errno)) /* if not supported, then don't bother at all anymore */
2✔
179
                                skip_mount_set_attr = true;
×
180
                } else
181
                        return 0; /* Nice, this worked! */
17,519✔
182
        }
183

184
        if (!proc_self_mountinfo) {
19,752✔
185
                r = fopen_unlocked("/proc/self/mountinfo", "re", &proc_self_mountinfo_opened);
3✔
186
                if (r < 0)
3✔
187
                        return r;
188

189
                proc_self_mountinfo = proc_self_mountinfo_opened;
3✔
190
        }
191

192
        /* Recursively remount a directory (and all its submounts) with desired flags (MS_READONLY,
193
         * MS_NOSUID, MS_NOEXEC). If the directory is already mounted, we reuse the mount and simply mark it
194
         * MS_BIND|MS_RDONLY (or remove the MS_RDONLY for read-write operation), ditto for other flags. If it
195
         * isn't we first make it one. Afterwards we apply (or remove) the flags to all submounts we can
196
         * access, too. When mounts are stacked on the same mount point we only care for each individual
197
         * "top-level" mount on each point, as we cannot influence/access the underlying mounts anyway. We do
198
         * not have any effect on future submounts that might get propagated, they might be writable
199
         * etc. This includes future submounts that have been triggered via autofs. Also note that we can't
200
         * operate atomically here. Mounts established while we process the tree might or might not get
201
         * noticed and thus might or might not be covered.
202
         *
203
         * If the "deny_list" parameter is specified it may contain a list of subtrees to exclude from the
204
         * remount operation. Note that we'll ignore the deny list for the top-level path. */
205

206
        for (;;) {
39,508✔
207
                _cleanup_(mnt_free_tablep) struct libmnt_table *table = NULL;
39,508✔
208
                _cleanup_(mnt_free_iterp) struct libmnt_iter *iter = NULL;
39,508✔
209
                _cleanup_hashmap_free_ Hashmap *todo = NULL;
39,506✔
210
                bool top_autofs = false;
39,508✔
211

212
                if (n_tries++ >= 32) /* Let's not retry this loop forever */
39,508✔
213
                        return -EBUSY;
214

215
                rewind(proc_self_mountinfo);
39,508✔
216

217
                r = libmount_parse_mountinfo(proc_self_mountinfo, &table, &iter);
39,508✔
218
                if (r < 0)
39,508✔
219
                        return log_debug_errno(r, "Failed to parse /proc/self/mountinfo: %m");
×
220

221
                for (;;) {
2,788,800✔
222
                        _cleanup_free_ char *d = NULL;
2,749,292✔
223
                        const char *path, *type, *opts;
2,788,800✔
224
                        unsigned long flags = 0;
2,788,800✔
225
                        struct libmnt_fs *fs;
2,788,800✔
226

227
                        r = mnt_table_next_fs(table, iter, &fs);
2,788,800✔
228
                        if (r == 1) /* EOF */
2,788,800✔
229
                                break;
230
                        if (r < 0)
2,749,292✔
231
                                return log_debug_errno(r, "Failed to get next entry from /proc/self/mountinfo: %m");
×
232

233
                        path = mnt_fs_get_target(fs);
2,749,292✔
234
                        if (!path)
2,749,292✔
235
                                continue;
×
236

237
                        if (!path_startswith(path, prefix))
2,749,292✔
238
                                continue;
2,681,406✔
239

240
                        type = mnt_fs_get_fstype(fs);
67,886✔
241
                        if (!type)
67,886✔
242
                                continue;
×
243

244
                        /* Let's ignore autofs mounts. If they aren't triggered yet, we want to avoid
245
                         * triggering them, as we don't make any guarantees for future submounts anyway. If
246
                         * they are already triggered, then we will find another entry for this. */
247
                        if (streq(type, "autofs")) {
67,886✔
248
                                top_autofs = top_autofs || path_equal(path, prefix);
6,360✔
249
                                continue;
3,180✔
250
                        }
251

252
                        if (set_contains(done, path))
64,706✔
253
                                continue;
23,370✔
254

255
                        /* Ignore this mount if it is deny-listed, but only if it isn't the top-level mount
256
                         * we shall operate on. */
257
                        if (!path_equal(path, prefix)) {
41,336✔
258
                                bool deny_listed = false;
281,562✔
259

260
                                STRV_FOREACH(i, deny_list) {
281,562✔
261
                                        if (path_equal(*i, prefix))
277,948✔
262
                                                continue;
21,571✔
263

264
                                        if (!path_startswith(*i, prefix))
256,377✔
265
                                                continue;
138,496✔
266

267
                                        if (path_startswith(path, *i)) {
117,881✔
268
                                                deny_listed = true;
269
                                                log_trace("Not remounting %s deny-listed by %s, called for %s", path, *i, prefix);
270
                                                break;
271
                                        }
272
                                }
273

274
                                if (deny_listed)
21,582✔
275
                                        continue;
17,968✔
276
                        }
277

278
                        opts = mnt_fs_get_vfs_options(fs);
23,368✔
279
                        if (opts) {
23,368✔
280
                                r = mnt_optstr_get_flags(opts, &flags, mnt_get_builtin_optmap(MNT_LINUX_MAP));
23,368✔
281
                                if (r < 0)
23,368✔
282
                                        log_debug_errno(r, "Could not get flags for '%s', ignoring: %m", path);
×
283
                        }
284

285
                        d = strdup(path);
23,368✔
286
                        if (!d)
23,368✔
287
                                return -ENOMEM;
288

289
                        r = hashmap_ensure_put(&todo, &path_hash_ops_free, d, ULONG_TO_PTR(flags));
23,368✔
290
                        if (r == -EEXIST)
23,368✔
291
                                /* If the same path was recorded, but with different mount flags, update it:
292
                                 * it means a mount point is overmounted, and libmount returns the "bottom" (or
293
                                 * older one) first, but we want to reapply the flags from the "top" (or newer
294
                                 * one). See: https://github.com/systemd/systemd/issues/20032
295
                                 * Note that this shouldn't really fail, as we were just told that the key
296
                                 * exists, and it's an update so we want 'd' to be freed immediately. */
297
                                r = hashmap_update(todo, d, ULONG_TO_PTR(flags));
8✔
298
                        if (r < 0)
23,368✔
299
                                return r;
300
                        if (r > 0)
23,368✔
301
                                TAKE_PTR(d);
23,326✔
302
                }
303

304
                /* Check if the top-level directory was among what we have seen so far. For that check both
305
                 * 'done' and 'todo'. Also check 'top_autofs' because if the top-level dir is an autofs we'll
306
                 * not include it in either set but will set this bool. */
307
                if (!set_contains(done, prefix) &&
39,508✔
308
                    !(top_autofs || hashmap_contains(todo, prefix))) {
19,754✔
309

310
                        /* The prefix directory itself is not yet a mount, make it one. */
311
                        r = mount_nofollow(prefix, prefix, NULL, MS_BIND|MS_REC, NULL);
2✔
312
                        if (r < 0)
2✔
313
                                return r;
314

315
                        /* Immediately rescan, so that we pick up the new mount's flags */
316
                        continue;
2✔
317
                }
318

319
                /* If we have no submounts to process anymore, we are done */
320
                if (hashmap_isempty(todo))
39,506✔
321
                        return 0;
322

323
                for (;;) {
43,079✔
324
                        unsigned long flags;
43,079✔
325
                        char *x = NULL;
43,079✔
326

327
                        /* Take the first mount from our list of mounts to still process */
328
                        flags = PTR_TO_ULONG(hashmap_steal_first_key_and_value(todo, (void**) &x));
43,079✔
329
                        if (!x)
43,079✔
330
                                break;
331

332
                        r = set_ensure_consume(&done, &path_hash_ops_free, x);
23,325✔
333
                        if (IN_SET(r, 0, -EEXIST))
23,325✔
334
                                continue; /* Already done */
232✔
335
                        if (r < 0)
23,325✔
336
                                return r;
×
337

338
                        /* Now, remount this with the new flags set, but exclude MS_RELATIME from it. (It's
339
                         * the default anyway, thus redundant, and in userns we'll get an error if we try to
340
                         * explicitly enable it) */
341
                        r = mount_nofollow(NULL, x, NULL, ((flags & ~flags_mask)|MS_BIND|MS_REMOUNT|new_flags) & ~MS_RELATIME, NULL);
23,325✔
342
                        if (r < 0) {
23,325✔
343
                                int q;
232✔
344

345
                                /* OK, so the remount of this entry failed. We'll ultimately ignore this in
346
                                 * almost all cases (there are simply so many reasons why this can fail,
347
                                 * think autofs, NFS, FUSE, …), but let's generate useful debug messages at
348
                                 * the very least. */
349

350
                                q = path_is_mount_point(x);
232✔
351
                                if (IN_SET(q, 0, -ENOENT)) {
232✔
352
                                        /* Hmm, whaaaa? The mount point is not actually a mount point? Then
353
                                         * it is either obstructed by a later mount or somebody has been
354
                                         * racing against us and removed it. Either way the mount point
355
                                         * doesn't matter to us, let's ignore it hence. */
356
                                        log_debug_errno(r, "Mount point '%s' to remount is not a mount point anymore, ignoring remount failure: %m", x);
229✔
357
                                        continue;
229✔
358
                                }
359
                                if (q < 0) /* Any other error on this? Just log and continue */
3✔
360
                                        log_debug_errno(q, "Failed to determine whether '%s' is a mount point or not, ignoring: %m", x);
×
361

362
                                if (((flags ^ new_flags) & flags_mask & ~MS_RELATIME) == 0) { /* ignore MS_RELATIME while comparing */
3✔
363
                                        log_debug_errno(r, "Couldn't remount '%s', but the flags already match what we want, hence ignoring: %m", x);
×
364
                                        continue;
×
365
                                }
366

367
                                /* Make this fatal if this is the top-level mount */
368
                                if (path_equal(x, prefix))
3✔
369
                                        return r;
370

371
                                /* If this is not the top-level mount, then handle this gracefully: log but
372
                                 * otherwise ignore. With NFS, FUSE, autofs there are just too many reasons
373
                                 * this might fail without a chance for us to do anything about it, let's
374
                                 * hence be strict on the top-level mount and lenient on the inner ones. */
375
                                log_debug_errno(r, "Couldn't remount submount '%s' for unexpected reason, ignoring: %m", x);
3✔
376
                                continue;
3✔
377
                        }
378

379
                        log_trace("Remounted %s.", x);
23,093✔
380
                }
381
        }
382
}
383

384
int bind_remount_one_with_mountinfo(
2,305✔
385
                const char *path,
386
                unsigned long new_flags,
387
                unsigned long flags_mask,
388
                FILE *proc_self_mountinfo) {
389

390
        _cleanup_(mnt_free_tablep) struct libmnt_table *table = NULL;
2,305✔
391
        unsigned long flags = 0;
2,305✔
392
        struct libmnt_fs *fs;
2,305✔
393
        const char *opts;
2,305✔
394
        int r;
2,305✔
395

396
        assert(path);
2,305✔
397
        assert(proc_self_mountinfo);
2,305✔
398

399
        if ((flags_mask & ~MS_CONVERTIBLE_FLAGS) == 0 && !skip_mount_set_attr) {
2,305✔
400
                /* Let's take a shortcut for all the flags we know how to convert into mount_setattr() flags */
401

402
                if (mount_setattr(AT_FDCWD, path, AT_SYMLINK_NOFOLLOW,
2,305✔
403
                                  &(struct mount_attr) {
2,305✔
404
                                          .attr_set = ms_flags_to_mount_attr(new_flags & flags_mask),
2,305✔
405
                                          .attr_clr = ms_flags_to_mount_attr(~new_flags & flags_mask),
2,305✔
406
                                  }, MOUNT_ATTR_SIZE_VER0) < 0) {
407

408
                        log_debug_errno(errno, "mount_setattr() didn't work, falling back to classic remounting: %m");
4✔
409

410
                        if (ERRNO_IS_NOT_SUPPORTED(errno)) /* if not supported, then don't bother at all anymore */
4✔
411
                                skip_mount_set_attr = true;
×
412
                } else
413
                        return 0; /* Nice, this worked! */
2,301✔
414
        }
415

416
        rewind(proc_self_mountinfo);
4✔
417

418
        table = mnt_new_table();
4✔
419
        if (!table)
4✔
420
                return -ENOMEM;
421

422
        r = mnt_table_parse_stream(table, proc_self_mountinfo, "/proc/self/mountinfo");
4✔
423
        if (r < 0)
4✔
424
                return r;
425

426
        fs = mnt_table_find_target(table, path, MNT_ITER_FORWARD);
4✔
427
        if (!fs) {
4✔
428
                r = access_nofollow(path, F_OK); /* Hmm, it's not in the mount table, but does it exist at all? */
4✔
429
                if (r < 0)
2✔
430
                        return r;
431

432
                return -EINVAL; /* Not a mount point we recognize */
2✔
433
        }
434

435
        opts = mnt_fs_get_vfs_options(fs);
×
436
        if (opts) {
×
437
                r = mnt_optstr_get_flags(opts, &flags, mnt_get_builtin_optmap(MNT_LINUX_MAP));
×
438
                if (r < 0)
×
439
                        log_debug_errno(r, "Could not get flags for '%s', ignoring: %m", path);
×
440
        }
441

442
        r = mount_nofollow(NULL, path, NULL, ((flags & ~flags_mask)|MS_BIND|MS_REMOUNT|new_flags) & ~MS_RELATIME, NULL);
×
443
        if (r < 0) {
×
444
                if (((flags ^ new_flags) & flags_mask & ~MS_RELATIME) != 0) /* Ignore MS_RELATIME again,
×
445
                                                                             * since kernel adds it in
446
                                                                             * everywhere, because it's the
447
                                                                             * default. */
448
                        return r;
449

450
                /* Let's handle redundant remounts gracefully */
451
                log_debug_errno(r, "Failed to remount '%s' but flags already match what we want, ignoring: %m", path);
4✔
452
        }
453

454
        return 0;
455
}
456

457
int bind_remount_one(const char *path, unsigned long new_flags, unsigned long flags_mask) {
53✔
458
        _cleanup_fclose_ FILE *proc_self_mountinfo = NULL;
53✔
459

460
        proc_self_mountinfo = fopen("/proc/self/mountinfo", "re");
53✔
461
        if (!proc_self_mountinfo)
53✔
462
                return log_debug_errno(errno, "Failed to open /proc/self/mountinfo: %m");
×
463

464
        return bind_remount_one_with_mountinfo(path, new_flags, flags_mask, proc_self_mountinfo);
53✔
465
}
466

467
/* Switches root via pivot_root(".", ".") followed by a lazy unmount of ".". The new root is whatever
 * the current working directory is at the time of the call; 'path' is only used for log messages.
 * Returns 0 on success, or a negative error. */
static int mount_switch_root_pivot(int fd_newroot, const char *path) {
        int k;

        assert(fd_newroot >= 0);
        assert(path);

        /* Let the kernel tuck the new root under the old one. */
        k = pivot_root(".", ".");
        if (k < 0)
                return log_debug_errno(errno, "Failed to pivot root to new rootfs '%s': %m", path);

        /* Get rid of the old root and reveal our brand new root. (This always operates on the top-most
         * mount on our cwd, regardless of what our current directory actually points to.) */
        k = umount2(".", MNT_DETACH);
        if (k < 0)
                return log_debug_errno(errno, "Failed to unmount old rootfs: %m");

        return 0;
}
482

483
/* Switches root by moving the mount at the current working directory onto "/" with MS_MOVE and then
 * chroot()ing into it. 'path' is only used for log messages. Returns 0 on success, or a negative
 * error. */
static int mount_switch_root_move(int fd_newroot, const char *path) {
        int k;

        assert(fd_newroot >= 0);
        assert(path);

        /* Move the new root fs */
        k = mount(".", "/", NULL, MS_MOVE, NULL);
        if (k < 0)
                return log_debug_errno(errno, "Failed to move new rootfs '%s': %m", path);

        /* Also change root dir */
        k = chroot(".");
        if (k < 0)
                return log_debug_errno(errno, "Failed to chroot to new rootfs '%s': %m", path);

        return 0;
}
497

498
int mount_switch_root_full(const char *path, unsigned long mount_propagation_flag, bool force_ms_move) {
2,207✔
499
        _cleanup_close_ int fd_newroot = -EBADF;
2,207✔
500
        int r, is_current_root;
2,207✔
501

502
        assert(path);
2,207✔
503
        assert(mount_propagation_flag_is_valid(mount_propagation_flag));
2,207✔
504

505
        fd_newroot = open(path, O_PATH|O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW);
2,207✔
506
        if (fd_newroot < 0)
2,207✔
507
                return log_debug_errno(errno, "Failed to open new rootfs '%s': %m", path);
×
508

509
        is_current_root = path_is_root_at(fd_newroot, NULL);
2,207✔
510
        if (is_current_root < 0)
2,207✔
511
                return log_debug_errno(is_current_root, "Failed to determine if target dir is our root already: %m");
×
512

513
        /* Change into the new rootfs. */
514
        if (fchdir(fd_newroot) < 0)
2,207✔
515
                return log_debug_errno(errno, "Failed to chdir into new rootfs '%s': %m", path);
×
516

517
        /* Make this a NOP if we are supposed to switch to our current root fs. After all, both pivot_root()
518
         * and MS_MOVE don't like that. */
519
        if (!is_current_root) {
2,207✔
520
                if (!force_ms_move) {
2,205✔
521
                        r = mount_switch_root_pivot(fd_newroot, path);
2,205✔
522
                        if (r < 0) {
2,205✔
523
                                log_debug_errno(r, "Failed to pivot into new rootfs '%s', will try to use MS_MOVE instead: %m", path);
36✔
524
                                force_ms_move = true;
525
                        }
526
                }
527
                if (force_ms_move) {
528
                        /* Failed to pivot_root() fallback to MS_MOVE. For example, this may happen if the rootfs is
529
                         * an initramfs in which case pivot_root() isn't supported. */
530
                        r = mount_switch_root_move(fd_newroot, path);
36✔
531
                        if (r < 0)
36✔
532
                                return log_debug_errno(r, "Failed to switch to new rootfs '%s' with MS_MOVE: %m", path);
×
533
                }
534
        }
535

536
        log_debug("Successfully switched root to '%s'.", path);
2,207✔
537

538
        /* Finally, let's establish the requested propagation flags. */
539
        if (mount_propagation_flag == 0)
2,207✔
540
                return 0;
541

542
        if (mount(NULL, ".", NULL, mount_propagation_flag | MS_REC, NULL) < 0)
211✔
543
                return log_debug_errno(errno, "Failed to turn new rootfs '%s' into %s mount: %m",
×
544
                                       mount_propagation_flag_to_string(mount_propagation_flag), path);
545

546
        return 0;
547
}
548

549
int repeat_unmount(const char *path, int flags) {
15✔
550
        bool done = false;
15✔
551

552
        assert(path);
15✔
553

554
        /* If there are multiple mounts on a mount point, this
555
         * removes them all */
556

557
        for (;;) {
30✔
558
                if (umount2(path, flags) < 0) {
30✔
559

560
                        if (errno == EINVAL)
15✔
561
                                return done;
15✔
562

563
                        return -errno;
×
564
                }
565

566
                done = true;
567
        }
568
}
569

570
int mode_to_inaccessible_node(
4,801✔
571
                const char *runtime_dir,
572
                mode_t mode,
573
                char **ret) {
574

575
        /* This function maps a node type to a corresponding inaccessible file node. These nodes are created
576
         * during early boot by PID 1. In some cases we lacked the privs to create the character and block
577
         * devices (maybe because we run in an userns environment, or miss CAP_SYS_MKNOD, or run with a
578
         * devices policy that excludes device nodes with major and minor of 0), but that's fine, in that
579
         * case we use an AF_UNIX file node instead, which is not the same, but close enough for most
580
         * uses. And most importantly, the kernel allows bind mounts from socket nodes to any non-directory
581
         * file nodes, and that's the most important thing that matters.
582
         *
583
         * Note that the runtime directory argument shall be the top-level runtime directory, i.e. /run/ if
584
         * we operate in system context and $XDG_RUNTIME_DIR if we operate in user context. */
585

586
        _cleanup_free_ char *d = NULL;
4,801✔
587
        const char *node;
4,801✔
588

589
        assert(ret);
4,801✔
590

591
        if (!runtime_dir)
4,801✔
592
                runtime_dir = "/run";
4✔
593

594
        if (S_ISLNK(mode))
4,801✔
595
                return -EINVAL;
596

597
        node = inode_type_to_string(mode);
4,801✔
598
        if (!node)
4,801✔
599
                return -EINVAL;
600

601
        d = path_join(runtime_dir, "systemd/inaccessible", node);
4,801✔
602
        if (!d)
4,801✔
603
                return -ENOMEM;
604

605
        /* On new kernels unprivileged users are permitted to create 0:0 char device nodes (because they also
606
         * act as whiteout inode for overlayfs), but no other char or block device nodes. On old kernels no
607
         * device node whatsoever may be created by unprivileged processes. Hence, if the caller asks for the
608
         * inaccessible block device node let's see if the block device node actually exists, and if not,
609
         * fall back to the character device node. From there fall back to the socket device node. This means
610
         * in the best case we'll get the right device node type — but if not we'll hopefully at least get a
611
         * device node at all. */
612

613
        if (S_ISBLK(mode) &&
4,801✔
614
            access(d, F_OK) < 0 && errno == ENOENT) {
×
615
                free(d);
×
616
                d = path_join(runtime_dir, "/systemd/inaccessible/chr");
×
617
                if (!d)
×
618
                        return -ENOMEM;
619
        }
620

621
        if (IN_SET(mode & S_IFMT, S_IFBLK, S_IFCHR) &&
5,249✔
622
            access(d, F_OK) < 0 && errno == ENOENT) {
448✔
623
                free(d);
×
624
                d = path_join(runtime_dir, "/systemd/inaccessible/sock");
×
625
                if (!d)
×
626
                        return -ENOMEM;
627
        }
628

629
        *ret = TAKE_PTR(d);
4,801✔
630
        return 0;
4,801✔
631
}
632

633
int mount_flags_to_string(unsigned long flags, char **ret) {
48,966✔
634
        static const struct {
48,966✔
635
                unsigned long flag;
636
                const char *name;
637
        } map[] = {
638
                { .flag = MS_RDONLY,      .name = "MS_RDONLY",      },
639
                { .flag = MS_NOSUID,      .name = "MS_NOSUID",      },
640
                { .flag = MS_NODEV,       .name = "MS_NODEV",       },
641
                { .flag = MS_NOEXEC,      .name = "MS_NOEXEC",      },
642
                { .flag = MS_SYNCHRONOUS, .name = "MS_SYNCHRONOUS", },
643
                { .flag = MS_REMOUNT,     .name = "MS_REMOUNT",     },
644
                { .flag = MS_MANDLOCK,    .name = "MS_MANDLOCK",    },
645
                { .flag = MS_DIRSYNC,     .name = "MS_DIRSYNC",     },
646
                { .flag = MS_NOSYMFOLLOW, .name = "MS_NOSYMFOLLOW", },
647
                { .flag = MS_NOATIME,     .name = "MS_NOATIME",     },
648
                { .flag = MS_NODIRATIME,  .name = "MS_NODIRATIME",  },
649
                { .flag = MS_BIND,        .name = "MS_BIND",        },
650
                { .flag = MS_MOVE,        .name = "MS_MOVE",        },
651
                { .flag = MS_REC,         .name = "MS_REC",         },
652
                { .flag = MS_SILENT,      .name = "MS_SILENT",      },
653
                { .flag = MS_POSIXACL,    .name = "MS_POSIXACL",    },
654
                { .flag = MS_UNBINDABLE,  .name = "MS_UNBINDABLE",  },
655
                { .flag = MS_PRIVATE,     .name = "MS_PRIVATE",     },
656
                { .flag = MS_SLAVE,       .name = "MS_SLAVE",       },
657
                { .flag = MS_SHARED,      .name = "MS_SHARED",      },
658
                { .flag = MS_RELATIME,    .name = "MS_RELATIME",    },
659
                { .flag = MS_KERNMOUNT,   .name = "MS_KERNMOUNT",   },
660
                { .flag = MS_I_VERSION,   .name = "MS_I_VERSION",   },
661
                { .flag = MS_STRICTATIME, .name = "MS_STRICTATIME", },
662
                { .flag = MS_LAZYTIME,    .name = "MS_LAZYTIME",    },
663
        };
664
        _cleanup_free_ char *str = NULL;
48,966✔
665

666
        assert(ret);
48,966✔
667

668
        FOREACH_ELEMENT(entry, map)
1,273,116✔
669
                if (flags & entry->flag) {
1,224,150✔
670
                        if (!strextend_with_separator(&str, "|", entry->name))
114,500✔
671
                                return -ENOMEM;
672
                        flags &= ~entry->flag;
114,500✔
673
                }
674

675
        if (!str || flags != 0)
48,966✔
676
                if (strextendf_with_separator(&str, "|", "%lx", flags) < 0)
153✔
677
                        return -ENOMEM;
678

679
        *ret = TAKE_PTR(str);
48,966✔
680
        return 0;
48,966✔
681
}
682

683
int mount_verbose_full(
48,936✔
684
                int error_log_level,
685
                const char *what,
686
                const char *where,
687
                const char *type,
688
                unsigned long flags,
689
                const char *options,
690
                bool follow_symlink) {
691

692
        _cleanup_free_ char *fl = NULL, *o = NULL;
48,936✔
693
        unsigned long f;
48,936✔
694
        int r;
48,936✔
695

696
        r = mount_option_mangle(options, flags, &f, &o);
48,936✔
697
        if (r < 0)
48,936✔
698
                return log_full_errno(error_log_level, r,
×
699
                                      "Failed to mangle mount options %s: %m",
700
                                      strempty(options));
701

702
        (void) mount_flags_to_string(f, &fl);
48,936✔
703

704
        if (FLAGS_SET(f, MS_REMOUNT|MS_BIND))
48,936✔
705
                log_debug("Changing mount flags %s (%s \"%s\")...",
10,165✔
706
                          where, strnull(fl), strempty(o));
707
        else if (f & MS_REMOUNT)
43,853✔
708
                log_debug("Remounting superblock %s (%s \"%s\")...",
4✔
709
                          where, strnull(fl), strempty(o));
710
        else if (f & (MS_SHARED|MS_PRIVATE|MS_SLAVE|MS_UNBINDABLE))
43,849✔
711
                log_debug("Changing mount propagation %s (%s \"%s\")",
6,226✔
712
                          where, strnull(fl), strempty(o));
713
        else if (f & MS_BIND)
40,736✔
714
                log_debug("Bind-mounting %s on %s (%s \"%s\")...",
57,605✔
715
                          what, where, strnull(fl), strempty(o));
716
        else if (f & MS_MOVE)
11,860✔
717
                log_debug("Moving mount %s %s %s (%s \"%s\")...",
7,698✔
718
                          what, glyph(GLYPH_ARROW_RIGHT), where, strnull(fl), strempty(o));
719
        else
720
                log_debug("Mounting %s (%s) on %s (%s \"%s\")...",
9,253✔
721
                          strna(what), strna(type), where, strnull(fl), strempty(o));
722

723
        if (follow_symlink)
48,936✔
724
                r = RET_NERRNO(mount(what, where, type, f, o));
49,303✔
725
        else
726
                r = mount_nofollow(what, where, type, f, o);
44,080✔
727
        if (r < 0)
44,447✔
728
                return log_full_errno(error_log_level, r,
8,414✔
729
                                      "Failed to mount %s (type %s) on %s (%s \"%s\"): %m",
730
                                      strna(what), strna(type), where, strnull(fl), strempty(o));
731
        return 0;
732
}
733

734
/* Wrapper around umount2() that announces the operation at debug level and logs a failure at
 * 'error_log_level'. Returns 0 on success; on failure the value returned by log_full_errno() for
 * errno. */
int umount_verbose(
                int error_log_level,
                const char *where,
                int flags) {

        assert(where);

        log_debug("Unmounting '%s'...", where);

        if (umount2(where, flags) >= 0)
                return 0;

        return log_full_errno(error_log_level, errno, "Failed to unmount '%s': %m", where);
}
748

749
int umountat_detach_verbose(
220✔
750
                int error_log_level,
751
                int fd,
752
                const char *where) {
753

754
        /* Similar to umountat_verbose(), but goes by fd + path. This implies MNT_DETACH, since to do this we
755
         * must pin the inode in question via an fd. */
756

757
        assert(fd >= 0 || fd == AT_FDCWD);
220✔
758

759
        /* If neither fd nor path are specified take this as reference to the cwd */
760
        if (fd == AT_FDCWD && isempty(where))
220✔
761
                return umount_verbose(error_log_level, ".", MNT_DETACH|UMOUNT_NOFOLLOW);
220✔
762

763
        /* If we don't actually take the fd into consideration for this operation shortcut things, so that we
764
         * don't have to open the inode */
765
        if (fd == AT_FDCWD || path_is_absolute(where))
220✔
766
                return umount_verbose(error_log_level, where, MNT_DETACH|UMOUNT_NOFOLLOW);
×
767

768
        _cleanup_free_ char *prefix = NULL;
440✔
769
        const char *p;
220✔
770
        if (fd_get_path(fd, &prefix) < 0)
220✔
771
                p = "<fd>"; /* if we can't get the path, return something vaguely useful */
772
        else
773
                p = prefix;
220✔
774
        _cleanup_free_ char *joined = isempty(where) ? strdup(p) : path_join(p, where);
559✔
775

776
        log_debug("Unmounting '%s'...", strna(joined));
220✔
777

778
        _cleanup_close_ int inode_fd = -EBADF;
220✔
779
        int mnt_fd;
220✔
780
        if (isempty(where))
220✔
781
                mnt_fd = fd;
782
        else {
783
                inode_fd = openat(fd, where, O_PATH|O_CLOEXEC|O_NOFOLLOW);
119✔
784
                if (inode_fd < 0)
119✔
785
                        return log_full_errno(error_log_level, errno, "Failed to pin '%s': %m", strna(joined));
×
786

787
                mnt_fd = inode_fd;
788
        }
789

790
        if (umount2(FORMAT_PROC_FD_PATH(mnt_fd), MNT_DETACH) < 0)
220✔
791
                return log_full_errno(error_log_level, errno, "Failed to unmount '%s': %m", strna(joined));
9✔
792

793
        return 0;
211✔
794
}
795

796
int mount_exchange_graceful(int fsmount_fd, const char *dest, bool mount_beneath) {
16✔
797
        int r;
16✔
798

799
        assert(fsmount_fd >= 0);
16✔
800
        assert(dest);
16✔
801

802
        /* First, try to mount beneath an existing mount point, and if that works, umount the old mount,
803
         * which is now at the top. This will ensure we can atomically replace a mount. Note that this works
804
         * also in the case where there are submounts down the tree. Mount propagation is allowed but
805
         * restricted to layouts that don't end up propagation the new mount on top of the mount stack.  If
806
         * this is not supported (minimum kernel v6.5), or if there is no mount on the mountpoint, we get
807
         * -EINVAL and then we fallback to normal mounting. */
808

809
        r = RET_NERRNO(move_mount(fsmount_fd, /* from_path = */ "",
26✔
810
                                  /* to_fd = */ -EBADF, dest,
811
                                  MOVE_MOUNT_F_EMPTY_PATH | (mount_beneath ? MOVE_MOUNT_BENEATH : 0)));
812
        if (mount_beneath) {
16✔
813
                if (r >= 0) /* Mounting beneath worked! Now unmount the upper mount. */
6✔
814
                        return umount_verbose(LOG_DEBUG, dest, UMOUNT_NOFOLLOW|MNT_DETACH);
2✔
815

816
                if (r == -EINVAL) { /* Fallback if mount_beneath is not supported */
4✔
817
                        log_debug_errno(r,
4✔
818
                                        "Cannot mount beneath '%s', falling back to overmount: %m",
819
                                        dest);
820
                        return mount_exchange_graceful(fsmount_fd, dest, /* mount_beneath = */ false);
4✔
821
                }
822
        }
823

824
        return r;
825
}
826

827
int mount_option_mangle(
49,008✔
828
                const char *options,
829
                unsigned long mount_flags,
830
                unsigned long *ret_mount_flags,
831
                char **ret_remaining_options) {
832

833
        const struct libmnt_optmap *map;
49,008✔
834
        _cleanup_free_ char *ret = NULL;
49,008✔
835
        int r;
49,008✔
836

837
        /* This extracts mount flags from the mount options, and stores
838
         * non-mount-flag options to '*ret_remaining_options'.
839
         * E.g.,
840
         * "rw,nosuid,nodev,relatime,size=1630748k,mode=0700,uid=1000,gid=1000"
841
         * is split to MS_NOSUID|MS_NODEV|MS_RELATIME and
842
         * "size=1630748k,mode=0700,uid=1000,gid=1000".
843
         * See more examples in test-mount-util.c.
844
         *
845
         * If 'options' does not contain any non-mount-flag options,
846
         * then '*ret_remaining_options' is set to NULL instead of empty string.
847
         * The validity of options stored in '*ret_remaining_options' is not checked.
848
         * If 'options' is NULL, this just copies 'mount_flags' to *ret_mount_flags. */
849

850
        assert(ret_mount_flags);
49,008✔
851
        assert(ret_remaining_options);
49,008✔
852

853
        map = mnt_get_builtin_optmap(MNT_LINUX_MAP);
49,008✔
854
        if (!map)
49,008✔
855
                return -EINVAL;
856

857
        for (const char *p = options;;) {
49,008✔
858
                _cleanup_free_ char *word = NULL;
23,067✔
859
                const struct libmnt_optmap *ent;
72,074✔
860

861
                r = extract_first_word(&p, &word, ",", EXTRACT_KEEP_QUOTE);
72,074✔
862
                if (r < 0)
72,074✔
863
                        return r;
864
                if (r == 0)
72,073✔
865
                        break;
866

867
                for (ent = map; ent->name; ent++) {
966,189✔
868
                        /* All entries in MNT_LINUX_MAP do not take any argument.
869
                         * Thus, ent->name does not contain "=" or "[=]". */
870
                        if (!streq(word, ent->name))
943,214✔
871
                                continue;
943,123✔
872

873
                        if (!(ent->mask & MNT_INVERT))
91✔
874
                                mount_flags |= ent->id;
82✔
875
                        else
876
                                mount_flags &= ~ent->id;
9✔
877

878
                        break;
879
                }
880

881
                /* If 'word' is not a mount flag, then store it in '*ret_remaining_options'. */
882
                if (!ent->name &&
46,041✔
883
                    !startswith_no_case(word, "x-") &&
45,948✔
884
                    !strextend_with_separator(&ret, ",", word))
22,973✔
885
                        return -ENOMEM;
886
        }
887

888
        *ret_mount_flags = mount_flags;
49,007✔
889
        *ret_remaining_options = TAKE_PTR(ret);
49,007✔
890

891
        return 0;
49,007✔
892
}
893

894
static int mount_in_namespace_legacy(
×
895
                const char *chased_src_path,
896
                int chased_src_fd,
897
                struct stat *chased_src_st,
898
                const char *propagate_path,
899
                const char *incoming_path,
900
                const char *dest,
901
                int pidns_fd,
902
                int mntns_fd,
903
                int root_fd,
904
                MountInNamespaceFlags flags,
905
                const MountOptions *options,
906
                const ImagePolicy *image_policy) {
907

908
        _cleanup_close_pair_ int errno_pipe_fd[2] = EBADF_PAIR;
×
909
        char mount_slave[] = "/tmp/propagate.XXXXXX", *mount_tmp, *mount_outside, *p;
×
910
        bool mount_slave_created = false, mount_slave_mounted = false,
×
911
                mount_tmp_created = false, mount_tmp_mounted = false,
×
912
                mount_outside_created = false, mount_outside_mounted = false;
×
913
        pid_t child;
×
914
        int r;
×
915

916
        assert(chased_src_path);
×
917
        assert(chased_src_fd >= 0);
×
918
        assert(chased_src_st);
×
919
        assert(propagate_path);
×
920
        assert(incoming_path);
×
921
        assert(dest);
×
922
        assert(pidns_fd >= 0);
×
923
        assert(mntns_fd >= 0);
×
924
        assert(root_fd >= 0);
×
925
        assert(!options || (flags & MOUNT_IN_NAMESPACE_IS_IMAGE));
×
926

927
        p = strjoina(propagate_path, "/");
×
928
        r = access_nofollow(p, F_OK);
×
929
        if (r < 0)
×
930
                return log_debug_errno(r == -ENOENT ? SYNTHETIC_ERRNO(EOPNOTSUPP) : r, "Target does not allow propagation of mount points");
×
931

932
        /* Our goal is to install a new bind mount into the container,
933
           possibly read-only. This is irritatingly complex
934
           unfortunately, currently.
935

936
           First, we start by creating a private playground in /tmp,
937
           that we can mount MS_SLAVE. (Which is necessary, since
938
           MS_MOVE cannot be applied to mounts with MS_SHARED parent
939
           mounts.) */
940

941
        if (!mkdtemp(mount_slave))
×
942
                return log_debug_errno(errno, "Failed to create playground %s: %m", mount_slave);
×
943

944
        mount_slave_created = true;
×
945

946
        r = mount_nofollow_verbose(LOG_DEBUG, mount_slave, mount_slave, NULL, MS_BIND, NULL);
×
947
        if (r < 0)
×
948
                goto finish;
×
949

950
        mount_slave_mounted = true;
×
951

952
        r = mount_nofollow_verbose(LOG_DEBUG, NULL, mount_slave, NULL, MS_SLAVE, NULL);
×
953
        if (r < 0)
×
954
                goto finish;
×
955

956
        /* Second, we mount the source file or directory to a directory inside of our MS_SLAVE playground. */
957
        mount_tmp = strjoina(mount_slave, "/mount");
×
958
        r = make_mount_point_inode_from_mode(AT_FDCWD, mount_tmp, (flags & MOUNT_IN_NAMESPACE_IS_IMAGE) ? S_IFDIR : chased_src_st->st_mode, 0700);
×
959
        if (r < 0) {
×
960
                log_debug_errno(r, "Failed to create temporary mount point %s: %m", mount_tmp);
×
961
                goto finish;
×
962
        }
963

964
        mount_tmp_created = true;
×
965

966
        if (flags & MOUNT_IN_NAMESPACE_IS_IMAGE)
×
967
                r = verity_dissect_and_mount(
×
968
                                chased_src_fd,
969
                                chased_src_path,
970
                                mount_tmp,
971
                                options,
972
                                image_policy,
973
                                /* image_filter= */ NULL,
974
                                /* extension_release_data= */ NULL,
975
                                /* verity= */ NULL,
976
                                /* ret_image= */ NULL);
977
        else
978
                r = mount_follow_verbose(LOG_DEBUG, FORMAT_PROC_FD_PATH(chased_src_fd), mount_tmp, NULL, MS_BIND, NULL);
×
979
        if (r < 0)
×
980
                goto finish;
×
981

982
        mount_tmp_mounted = true;
×
983

984
        /* Third, we remount the new bind mount read-only if requested. */
985
        if (flags & MOUNT_IN_NAMESPACE_READ_ONLY) {
×
986
                r = mount_nofollow_verbose(LOG_DEBUG, NULL, mount_tmp, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
×
987
                if (r < 0)
×
988
                        goto finish;
×
989
        }
990

991
        /* Fourth, we move the new bind mount into the propagation directory. This way it will appear there read-only
992
         * right-away. */
993

994
        mount_outside = strjoina(propagate_path, "/XXXXXX");
×
995
        if ((flags & MOUNT_IN_NAMESPACE_IS_IMAGE) || S_ISDIR(chased_src_st->st_mode))
×
996
                r = mkdtemp(mount_outside) ? 0 : -errno;
×
997
        else {
998
                r = mkostemp_safe(mount_outside);
×
999
                safe_close(r);
×
1000
        }
1001
        if (r < 0) {
×
1002
                log_debug_errno(r, "Cannot create propagation file or directory %s: %m", mount_outside);
×
1003
                goto finish;
×
1004
        }
1005

1006
        mount_outside_created = true;
×
1007

1008
        r = mount_nofollow_verbose(LOG_DEBUG, mount_tmp, mount_outside, NULL, MS_MOVE, NULL);
×
1009
        if (r < 0)
×
1010
                goto finish;
×
1011

1012
        mount_outside_mounted = true;
×
1013
        mount_tmp_mounted = false;
×
1014

1015
        if ((flags & MOUNT_IN_NAMESPACE_IS_IMAGE) || S_ISDIR(chased_src_st->st_mode))
×
1016
                (void) rmdir(mount_tmp);
×
1017
        else
1018
                (void) unlink(mount_tmp);
×
1019
        mount_tmp_created = false;
×
1020

1021
        (void) umount_verbose(LOG_DEBUG, mount_slave, UMOUNT_NOFOLLOW);
×
1022
        mount_slave_mounted = false;
×
1023

1024
        (void) rmdir(mount_slave);
×
1025
        mount_slave_created = false;
×
1026

1027
        if (pipe2(errno_pipe_fd, O_CLOEXEC|O_NONBLOCK) < 0) {
×
1028
                log_debug_errno(errno, "Failed to create pipe: %m");
×
1029
                goto finish;
×
1030
        }
1031

1032
        r = namespace_fork(
×
1033
                        "(sd-bindmnt)",
1034
                        "(sd-bindmnt-inner)",
1035
                        /* except_fds= */ NULL,
1036
                        /* n_except_fds= */ 0,
1037
                        FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGTERM,
1038
                        pidns_fd,
1039
                        mntns_fd,
1040
                        /* netns_fd= */ -EBADF,
1041
                        /* userns_fd= */ -EBADF,
1042
                        root_fd,
1043
                        &child);
1044
        if (r < 0)
×
1045
                goto finish;
×
1046
        if (r == 0) {
×
1047
                _cleanup_free_ char *mount_outside_fn = NULL, *mount_inside = NULL;
×
1048

1049
                errno_pipe_fd[0] = safe_close(errno_pipe_fd[0]);
×
1050

1051
                _cleanup_close_ int dest_fd = -EBADF;
×
1052
                _cleanup_free_ char *dest_fn = NULL;
×
1053
                r = chase(dest, /* root= */ NULL, CHASE_PARENT|CHASE_EXTRACT_FILENAME|((flags & MOUNT_IN_NAMESPACE_MAKE_FILE_OR_DIRECTORY) ? CHASE_MKDIR_0755 : 0), &dest_fn, &dest_fd);
×
1054
                if (r < 0)
×
1055
                        log_debug_errno(r, "Failed to pin parent directory of mount '%s', ignoring: %m", dest);
×
1056
                else if (flags & MOUNT_IN_NAMESPACE_MAKE_FILE_OR_DIRECTORY) {
×
1057
                        r = make_mount_point_inode_from_mode(dest_fd, dest_fn, (flags & MOUNT_IN_NAMESPACE_IS_IMAGE) ? S_IFDIR : chased_src_st->st_mode, 0700);
×
1058
                        if (r < 0)
×
1059
                                log_debug_errno(r, "Failed to make mount point inode of mount '%s', ignoring: %m", dest);
×
1060
                }
1061

1062
                /* Fifth, move the mount to the right place inside */
1063
                r = path_extract_filename(mount_outside, &mount_outside_fn);
×
1064
                if (r < 0) {
×
1065
                        log_debug_errno(r, "Failed to extract filename from propagation file or directory '%s': %m", mount_outside);
×
1066
                        report_errno_and_exit(errno_pipe_fd[1], r);
×
1067
                }
1068

1069
                mount_inside = path_join(incoming_path, mount_outside_fn);
×
1070
                if (!mount_inside)
×
1071
                        report_errno_and_exit(errno_pipe_fd[1], log_oom_debug());
×
1072

1073
                r = mount_nofollow_verbose(LOG_DEBUG, mount_inside, dest_fd >= 0 ? FORMAT_PROC_FD_PATH(dest_fd) : dest, /* fstype= */ NULL, MS_MOVE, /* options= */ NULL);
×
1074
                if (r < 0)
×
1075
                        report_errno_and_exit(errno_pipe_fd[1], r);
×
1076

1077
                _exit(EXIT_SUCCESS);
×
1078
        }
1079

1080
        errno_pipe_fd[1] = safe_close(errno_pipe_fd[1]);
×
1081

1082
        r = wait_for_terminate_and_check("(sd-bindmnt)", child, 0);
×
1083
        if (r < 0) {
×
1084
                log_debug_errno(r, "Failed to wait for child: %m");
×
1085
                goto finish;
×
1086
        }
1087
        if (r != EXIT_SUCCESS) {
×
1088
                if (read(errno_pipe_fd[0], &r, sizeof(r)) == sizeof(r))
×
1089
                        log_debug_errno(r, "Failed to mount: %m");
×
1090
                else
1091
                        log_debug("Child failed.");
×
1092
                goto finish;
×
1093
        }
1094

1095
finish:
×
1096
        if (mount_outside_mounted)
×
1097
                (void) umount_verbose(LOG_DEBUG, mount_outside, UMOUNT_NOFOLLOW);
×
1098
        if (mount_outside_created) {
×
1099
                if ((flags & MOUNT_IN_NAMESPACE_IS_IMAGE) || S_ISDIR(chased_src_st->st_mode))
×
1100
                        (void) rmdir(mount_outside);
×
1101
                else
1102
                        (void) unlink(mount_outside);
×
1103
        }
1104

1105
        if (mount_tmp_mounted)
×
1106
                (void) umount_verbose(LOG_DEBUG, mount_tmp, UMOUNT_NOFOLLOW);
×
1107
        if (mount_tmp_created) {
×
1108
                if ((flags & MOUNT_IN_NAMESPACE_IS_IMAGE) || S_ISDIR(chased_src_st->st_mode))
×
1109
                        (void) rmdir(mount_tmp);
×
1110
                else
1111
                        (void) unlink(mount_tmp);
×
1112
        }
1113

1114
        if (mount_slave_mounted)
×
1115
                (void) umount_verbose(LOG_DEBUG, mount_slave, UMOUNT_NOFOLLOW);
×
1116
        if (mount_slave_created)
×
1117
                (void) rmdir(mount_slave);
×
1118

1119
        return r;
×
1120
}
1121

1122
static int mount_in_namespace(
6✔
1123
                const PidRef *target,
1124
                const char *propagate_path,
1125
                const char *incoming_path,
1126
                const char *src,
1127
                const char *dest,
1128
                MountInNamespaceFlags flags,
1129
                const MountOptions *options,
1130
                const ImagePolicy *image_policy) {
1131

1132
        _cleanup_close_ int mntns_fd = -EBADF, root_fd = -EBADF, pidns_fd = -EBADF, chased_src_fd = -EBADF;
18✔
1133
        _cleanup_free_ char *chased_src_path = NULL;
6✔
1134
        struct stat st;
6✔
1135
        int r;
6✔
1136

1137
        assert(propagate_path);
6✔
1138
        assert(incoming_path);
6✔
1139
        assert(src);
6✔
1140
        assert(dest);
6✔
1141
        assert((flags & MOUNT_IN_NAMESPACE_IS_IMAGE) || (!options && !image_policy));
6✔
1142

1143
        if (!pidref_is_set(target))
12✔
1144
                return -ESRCH;
1145

1146
        r = pidref_namespace_open(target, &pidns_fd, &mntns_fd, /* ret_netns_fd = */ NULL, /* ret_userns_fd = */ NULL, &root_fd);
6✔
1147
        if (r < 0)
6✔
1148
                return log_debug_errno(r, "Failed to retrieve FDs of the target process' namespace: %m");
×
1149

1150
        r = is_our_namespace(mntns_fd, NAMESPACE_MOUNT);
6✔
1151
        if (r < 0)
6✔
1152
                return log_debug_errno(r, "Failed to determine if mount namespaces are equal: %m");
×
1153
        /* We can't add new mounts at runtime if the process wasn't started in a namespace */
1154
        if (r > 0)
6✔
1155
                return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to activate bind mount in target, not running in a mount namespace.");
×
1156

1157
        r = chase(src, NULL, 0, &chased_src_path, &chased_src_fd);
6✔
1158
        if (r < 0)
6✔
1159
                return log_debug_errno(r, "Failed to resolve source path '%s': %m", src);
×
1160
        log_debug("Chased source path '%s': %s", src, chased_src_path);
6✔
1161

1162
        if (fstat(chased_src_fd, &st) < 0)
6✔
1163
                return log_debug_errno(errno, "Failed to stat() resolved source path '%s': %m", src);
×
1164
        if (S_ISLNK(st.st_mode)) /* This shouldn't really happen, given that we just chased the symlinks above, but let's better be safe… */
6✔
1165
                return log_debug_errno(SYNTHETIC_ERRNO(ELOOP), "Source path '%s' can't be a symbolic link.", src);
×
1166

1167
        if (!mount_new_api_supported()) /* Fallback if we can't use the new mount API */
6✔
1168
                return mount_in_namespace_legacy(
×
1169
                                chased_src_path,
1170
                                chased_src_fd,
1171
                                &st,
1172
                                propagate_path,
1173
                                incoming_path,
1174
                                dest,
1175
                                pidns_fd,
1176
                                mntns_fd,
1177
                                root_fd,
1178
                                flags,
1179
                                options,
1180
                                image_policy);
1181

1182
        _cleanup_(dissected_image_unrefp) DissectedImage *img = NULL;
×
1183
        _cleanup_close_ int new_mount_fd = -EBADF;
6✔
1184
        _cleanup_close_pair_ int errno_pipe_fd[2] = EBADF_PAIR;
6✔
1185
        pid_t child;
6✔
1186

1187
        if (flags & MOUNT_IN_NAMESPACE_IS_IMAGE) {
6✔
1188
                r = verity_dissect_and_mount(
2✔
1189
                                chased_src_fd,
1190
                                chased_src_path,
1191
                                /* dest= */ NULL,
1192
                                options,
1193
                                image_policy,
1194
                                /* image_filter= */ NULL,
1195
                                /* extension_release_data= */ NULL,
1196
                                /* verity= */ NULL,
1197
                                &img);
1198
                if (r < 0)
2✔
1199
                        return log_debug_errno(r,
×
1200
                                               "Failed to dissect and mount image '%s': %m",
1201
                                               chased_src_path);
1202
        } else {
1203
                new_mount_fd = open_tree(
4✔
1204
                                chased_src_fd,
1205
                                "",
1206
                                OPEN_TREE_CLONE|OPEN_TREE_CLOEXEC|AT_SYMLINK_NOFOLLOW|AT_EMPTY_PATH);
1207
                if (new_mount_fd < 0)
4✔
1208
                        return log_debug_errno(
×
1209
                                        errno,
1210
                                        "Failed to open mount source '%s': %m",
1211
                                        chased_src_path);
1212

1213
                if ((flags & MOUNT_IN_NAMESPACE_READ_ONLY) && mount_setattr(new_mount_fd, "", AT_EMPTY_PATH,
4✔
1214
                                               &(struct mount_attr) {
×
1215
                                                       .attr_set = MOUNT_ATTR_RDONLY,
1216
                                               }, MOUNT_ATTR_SIZE_VER0) < 0)
1217
                        return log_debug_errno(errno,
×
1218
                                               "Failed to set mount for '%s' to read only: %m",
1219
                                               chased_src_path);
1220
        }
1221

1222
        if (pipe2(errno_pipe_fd, O_CLOEXEC|O_NONBLOCK) < 0)
6✔
1223
                return log_debug_errno(errno, "Failed to create pipe: %m");
×
1224

1225
        r = namespace_fork("(sd-bindmnt)",
6✔
1226
                           "(sd-bindmnt-inner)",
1227
                           /* except_fds= */ NULL,
1228
                           /* n_except_fds= */ 0,
1229
                           FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGTERM,
1230
                           pidns_fd,
1231
                           mntns_fd,
1232
                           /* netns_fd= */ -EBADF,
1233
                           /* userns_fd= */ -EBADF,
1234
                           root_fd,
1235
                           &child);
1236
        if (r < 0)
12✔
1237
                return log_debug_errno(r, "Failed to fork off mount helper into namespace: %m");
×
1238
        if (r == 0) {
12✔
1239
                errno_pipe_fd[0] = safe_close(errno_pipe_fd[0]);
6✔
1240

1241
                _cleanup_close_ int dest_fd = -EBADF;
×
1242
                _cleanup_free_ char *dest_fn = NULL;
×
1243
                r = chase(dest, /* root= */ NULL, CHASE_PARENT|CHASE_EXTRACT_FILENAME|((flags & MOUNT_IN_NAMESPACE_MAKE_FILE_OR_DIRECTORY) ? CHASE_MKDIR_0755 : 0), &dest_fn, &dest_fd);
6✔
1244
                if (r < 0)
6✔
1245
                        report_errno_and_exit(errno_pipe_fd[1], r);
×
1246

1247
                if (flags & MOUNT_IN_NAMESPACE_MAKE_FILE_OR_DIRECTORY)
6✔
1248
                        (void) make_mount_point_inode_from_mode(dest_fd, dest_fn, img ? S_IFDIR : st.st_mode, 0700);
6✔
1249

1250
                if (img) {
6✔
1251
                        DissectImageFlags f =
2✔
1252
                                DISSECT_IMAGE_TRY_ATOMIC_MOUNT_EXCHANGE |
1253
                                DISSECT_IMAGE_ALLOW_USERSPACE_VERITY;
1254

1255
                        if (flags & MOUNT_IN_NAMESPACE_MAKE_FILE_OR_DIRECTORY)
2✔
1256
                                f |= DISSECT_IMAGE_MKDIR;
2✔
1257

1258
                        if (flags & MOUNT_IN_NAMESPACE_READ_ONLY)
2✔
1259
                                f |= DISSECT_IMAGE_READ_ONLY;
×
1260

1261
                        r = dissected_image_mount(
2✔
1262
                                        img,
1263
                                        dest,
1264
                                        /* uid_shift= */ UID_INVALID,
1265
                                        /* uid_range= */ UID_INVALID,
1266
                                        /* userns_fd= */ -EBADF,
1267
                                        f);
1268
                } else
1269
                        r = mount_exchange_graceful(new_mount_fd, dest, /* mount_beneath= */ true);
4✔
1270

1271
                report_errno_and_exit(errno_pipe_fd[1], r);
6✔
1272
        }
1273

1274
        errno_pipe_fd[1] = safe_close(errno_pipe_fd[1]);
6✔
1275

1276
        r = wait_for_terminate_and_check("(sd-bindmnt)", child, 0);
6✔
1277
        if (r < 0)
6✔
1278
                return log_debug_errno(r, "Failed to wait for child: %m");
×
1279
        if (r != EXIT_SUCCESS) {
6✔
1280
                if (read(errno_pipe_fd[0], &r, sizeof(r)) == sizeof(r))
×
1281
                        return log_debug_errno(r, "Failed to mount into namespace: %m");
×
1282

1283
                return log_debug_errno(SYNTHETIC_ERRNO(EPROTO), "Child failed.");
×
1284
        }
1285

1286
        return 0;
1287
}
1288

1289
int bind_mount_in_namespace(
4✔
1290
                const PidRef *target,
1291
                const char *propagate_path,
1292
                const char *incoming_path,
1293
                const char *src,
1294
                const char *dest,
1295
                MountInNamespaceFlags flags) {
1296

1297
        return mount_in_namespace(target,
8✔
1298
                                  propagate_path,
1299
                                  incoming_path,
1300
                                  src,
1301
                                  dest,
1302
                                  flags & ~MOUNT_IN_NAMESPACE_IS_IMAGE,
4✔
1303
                                  /* options = */ NULL,
1304
                                  /* image_policy = */ NULL);
1305
}
1306

1307
int mount_image_in_namespace(
2✔
1308
                const PidRef *target,
1309
                const char *propagate_path,
1310
                const char *incoming_path,
1311
                const char *src,
1312
                const char *dest,
1313
                MountInNamespaceFlags flags,
1314
                const MountOptions *options,
1315
                const ImagePolicy *image_policy) {
1316

1317
        return mount_in_namespace(target,
4✔
1318
                                  propagate_path,
1319
                                  incoming_path,
1320
                                  src,
1321
                                  dest,
1322
                                  flags | MOUNT_IN_NAMESPACE_IS_IMAGE,
2✔
1323
                                  options,
1324
                                  image_policy);
1325
}
1326

1327
int make_mount_point(const char *path) {
23✔
1328
        int r;
23✔
1329

1330
        assert(path);
23✔
1331

1332
        /* If 'path' is already a mount point, does nothing and returns 0. If it is not it makes it one, and returns 1. */
1333

1334
        r = path_is_mount_point(path);
23✔
1335
        if (r < 0)
23✔
1336
                return log_debug_errno(r, "Failed to determine whether '%s' is a mount point: %m", path);
×
1337
        if (r > 0)
23✔
1338
                return 0;
1339

1340
        r = mount_nofollow_verbose(LOG_DEBUG, path, path, NULL, MS_BIND|MS_REC, NULL);
9✔
1341
        if (r < 0)
9✔
1342
                return r;
×
1343

1344
        return 1;
1345
}
1346

1347
int fd_make_mount_point(int fd) {
11✔
1348
        int r;
11✔
1349

1350
        assert(fd >= 0);
11✔
1351

1352
        r = is_mount_point_at(fd, NULL, 0);
11✔
1353
        if (r < 0)
11✔
1354
                return log_debug_errno(r, "Failed to determine whether file descriptor is a mount point: %m");
×
1355
        if (r > 0)
11✔
1356
                return 0;
1357

1358
        r = mount_follow_verbose(LOG_DEBUG, FORMAT_PROC_FD_PATH(fd), FORMAT_PROC_FD_PATH(fd), NULL, MS_BIND|MS_REC, NULL);
1✔
1359
        if (r < 0)
1✔
1360
                return r;
×
1361

1362
        return 1;
1363
}
1364

1365
int make_userns(uid_t uid_shift,
76✔
1366
                uid_t uid_range,
1367
                uid_t source_owner,
1368
                uid_t dest_owner,
1369
                RemountIdmapping idmapping) {
1370

1371
        _cleanup_close_ int userns_fd = -EBADF;
76✔
1372
        _cleanup_free_ char *line = NULL;
76✔
1373
        uid_t source_base = 0;
76✔
1374

1375
        /* Allocates a userns file descriptor with the mapping we need. For this we'll fork off a child
1376
         * process whose only purpose is to give us a new user namespace. It's killed when we got it. */
1377

1378
        if (!userns_shift_range_valid(uid_shift, uid_range))
76✔
1379
                return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid UID range for user namespace.");
×
1380

1381
        switch (idmapping) {
76✔
1382

1383
        case REMOUNT_IDMAPPING_FOREIGN_WITH_HOST_ROOT:
2✔
1384
                source_base = FOREIGN_UID_BASE;
2✔
1385
                _fallthrough_;
74✔
1386

1387
        case REMOUNT_IDMAPPING_NONE:
74✔
1388
        case REMOUNT_IDMAPPING_HOST_ROOT:
1389

1390
                if (asprintf(&line,
74✔
1391
                             UID_FMT " " UID_FMT " " UID_FMT "\n",
1392
                             source_base, uid_shift, uid_range) < 0)
1393
                        return log_oom_debug();
×
1394

1395
                /* If requested we'll include an entry in the mapping so that the host root user can make
1396
                 * changes to the uidmapped mount like it normally would. Specifically, we'll map the user
1397
                 * with UID_MAPPED_ROOT on the backing fs to UID 0. This is useful, since nspawn code wants
1398
                 * to create various missing inodes in the OS tree before booting into it, and this becomes
1399
                 * very easy and straightforward to do if it can just do it under its own regular UID. Note
1400
                 * that in that case the container's runtime uidmap (i.e. the one the container payload
1401
                 * processes run in) will leave this UID unmapped, i.e. if we accidentally leave files owned
1402
                 * by host root in the already uidmapped tree around they'll show up as owned by 'nobody',
1403
                 * which is safe. (Of course, we shouldn't leave such inodes around, but always chown() them
1404
                 * to the container's own UID range, but it's good to have a safety net, in case we
1405
                 * forget it.) */
1406
                if (idmapping == REMOUNT_IDMAPPING_HOST_ROOT)
74✔
1407
                        if (strextendf(&line,
72✔
1408
                                       UID_FMT " " UID_FMT " " UID_FMT "\n",
1409
                                       UID_MAPPED_ROOT, (uid_t) 0u, (uid_t) 1u) < 0)
1410
                                return log_oom_debug();
×
1411

1412
                break;
1413

1414
        case REMOUNT_IDMAPPING_HOST_OWNER:
×
1415
                /* Remap the owner of the bind mounted directory to the root user within the container. This
1416
                 * way every file written by root within the container to the bind-mounted directory will
1417
                 * be owned by the original user from the host. All other users will remain unmapped. */
1418
                if (asprintf(&line,
×
1419
                             UID_FMT " " UID_FMT " " UID_FMT "\n",
1420
                             source_owner, uid_shift, (uid_t) 1u) < 0)
1421
                        return log_oom_debug();
×
1422
                break;
1423

1424
        case REMOUNT_IDMAPPING_HOST_OWNER_TO_TARGET_OWNER:
2✔
1425
                /* Remap the owner of the bind mounted directory to the owner of the target directory
1426
                 * within the container. This way every file written by target directory owner within the
1427
                 * container to the bind-mounted directory will be owned by the original host user.
1428
                 * All other users will remain unmapped. */
1429
                if (asprintf(&line,
2✔
1430
                             UID_FMT " " UID_FMT " " UID_FMT "\n",
1431
                             source_owner, dest_owner, (uid_t) 1u) < 0)
1432
                        return log_oom_debug();
×
1433
                break;
1434

1435
        default:
×
1436
                assert_not_reached();
×
1437
        }
1438

1439
        /* We always assign the same UID and GID ranges */
1440
        userns_fd = userns_acquire(line, line, /* setgroups_deny= */ true);
76✔
1441
        if (userns_fd < 0)
76✔
1442
                return log_debug_errno(userns_fd, "Failed to acquire new userns: %m");
×
1443

1444
        return TAKE_FD(userns_fd);
1445
}
1446

1447
int remount_idmap_fd(
87✔
1448
                char **paths,
1449
                int userns_fd,
1450
                uint64_t extra_mount_attr_set) {
1451

1452
        int r;
87✔
1453

1454
        assert(userns_fd >= 0);
87✔
1455

1456
        /* This remounts all specified paths with the specified userns as idmap. It will do so in the
1457
         * order specified in the strv: the expectation is that the top-level directories are at the
1458
         * beginning, and nested directories in the right, so that the tree can be built correctly from left
1459
         * to right. */
1460

1461
        size_t n = strv_length(paths);
87✔
1462
        if (n == 0) /* Nothing to do? */
87✔
1463
                return 0;
87✔
1464

1465
        int *mount_fds = NULL;
87✔
1466
        size_t n_mounts_fds = 0;
87✔
1467

1468
        mount_fds = new(int, n);
87✔
1469
        if (!mount_fds)
87✔
1470
                return log_oom_debug();
×
1471

1472
        CLEANUP_ARRAY(mount_fds, n_mounts_fds, close_many_and_free);
87✔
1473

1474
        for (size_t i = 0; i < n; i++) {
172✔
1475
                int mntfd;
87✔
1476

1477
                /* Clone the mount point */
1478
                mntfd = mount_fds[n_mounts_fds] = open_tree(-EBADF, paths[i], OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC);
87✔
1479
                if (mount_fds[n_mounts_fds] < 0)
87✔
1480
                        return log_debug_errno(errno, "Failed to open tree of mounted filesystem '%s': %m", paths[i]);
×
1481

1482
                n_mounts_fds++;
87✔
1483

1484
                /* Set the user namespace mapping attribute on the cloned mount point */
1485
                if (mount_setattr(mntfd, "", AT_EMPTY_PATH,
87✔
1486
                                  &(struct mount_attr) {
87✔
1487
                                          .attr_set = MOUNT_ATTR_IDMAP | extra_mount_attr_set,
87✔
1488
                                          .userns_fd = userns_fd,
1489
                                  }, sizeof(struct mount_attr)) < 0)
1490
                        return log_debug_errno(errno, "Failed to change bind mount attributes for clone of '%s': %m", paths[i]);
2✔
1491
        }
1492

1493
        for (size_t i = n; i > 0; i--) { /* Unmount the paths right-to-left */
170✔
1494
                /* Remove the old mount points now that we have a idmapped mounts as replacement for all of them */
1495
                r = umount_verbose(LOG_DEBUG, paths[i-1], UMOUNT_NOFOLLOW);
85✔
1496
                if (r < 0)
85✔
1497
                        return r;
1498
        }
1499

1500
        for (size_t i = 0; i < n; i++) { /* Mount the replacement mounts left-to-right */
170✔
1501
                /* And place the cloned version in its place */
1502
                log_debug("Mounting idmapped fs to '%s'", paths[i]);
85✔
1503
                if (move_mount(mount_fds[i], "", -EBADF, paths[i], MOVE_MOUNT_F_EMPTY_PATH) < 0)
85✔
1504
                        return log_debug_errno(errno, "Failed to attach UID mapped mount to '%s': %m", paths[i]);
×
1505
        }
1506

1507
        return 0;
1508
}
1509

1510
int remount_idmap(
74✔
1511
                char **p,
1512
                uid_t uid_shift,
1513
                uid_t uid_range,
1514
                uid_t source_owner,
1515
                uid_t dest_owner,
1516
                RemountIdmapping idmapping) {
1517

1518
        _cleanup_close_ int userns_fd = -EBADF;
74✔
1519

1520
        userns_fd = make_userns(uid_shift, uid_range, source_owner, dest_owner, idmapping);
74✔
1521
        if (userns_fd < 0)
74✔
1522
                return userns_fd;
1523

1524
        return remount_idmap_fd(p, userns_fd, /* extra_mount_attr_set= */ 0);
74✔
1525
}
1526

1527
static void sub_mount_clear(SubMount *s) {
        assert(s);

        /* Releases what the entry holds (fd and path string) and resets both fields, so that the entry
         * may safely be cleared again later. */
        s->mount_fd = safe_close(s->mount_fd);
        s->path = mfree(s->path);
}
5,273✔
1533

1534
void sub_mount_array_free(SubMount *s, size_t n) {
        assert(s || n == 0);

        /* Clears each of the 'n' entries, then frees the array itself */
        FOREACH_ARRAY(entry, s, n)
                sub_mount_clear(entry);

        free(s);
}
1,238✔
1542

1543
static int sub_mount_compare(const SubMount *a, const SubMount *b) {
6,555✔
1544
        assert(a);
6,555✔
1545
        assert(b);
6,555✔
1546
        assert(a->path);
6,555✔
1547
        assert(b->path);
6,555✔
1548

1549
        return path_compare(a->path, b->path);
6,555✔
1550
}
1551

1552
static void sub_mount_drop(SubMount *s, size_t n) {
        size_t kept = 0; /* Index of the most recent entry we decided to keep */

        assert(s || n == 0);

        /* Walks the array and clears every entry whose path has the path of the most recently kept
         * entry as prefix. Cleared entries remain in the array, with their path reset. */

        for (size_t i = 1; i < n; i++) {
                if (!path_startswith(s[i].path, s[kept].path)) {
                        kept = i;
                        continue;
                }

                sub_mount_clear(&s[i]);
        }
}
1,780✔
1562

1563
int get_sub_mounts(const char *prefix, SubMount **ret_mounts, size_t *ret_n_mounts) {

        _cleanup_(mnt_free_tablep) struct libmnt_table *table = NULL;
        _cleanup_(mnt_free_iterp) struct libmnt_iter *iter = NULL;
        SubMount *found = NULL;
        size_t n_found = 0;
        int r;

        assert(prefix);
        assert(ret_mounts);
        assert(ret_n_mounts);

        /* Collects the mounts located strictly below 'prefix' (the prefix itself is not included). For
         * each one the path and an O_PATH fd are stored. The result is sorted by path, and entries
         * nested below another returned entry are cleared (they stay in the array with an empty
         * path). On success the array and its size are returned in ret_mounts/ret_n_mounts and the
         * caller takes ownership. */

        CLEANUP_ARRAY(found, n_found, sub_mount_array_free);

        r = libmount_parse_mountinfo(/* source = */ NULL, &table, &iter);
        if (r < 0)
                return log_debug_errno(r, "Failed to parse /proc/self/mountinfo: %m");

        for (;;) {
                struct libmnt_fs *fs;
                int table_id, current_id;

                r = mnt_table_next_fs(table, iter, &fs);
                if (r < 0)
                        return log_debug_errno(r, "Failed to get next entry from /proc/self/mountinfo: %m");
                if (r == 1)
                        break; /* EOF */

                const char *path = mnt_fs_get_target(fs);
                if (!path)
                        continue;

                /* Skip everything that is not strictly below the prefix */
                if (isempty(path_startswith(path, prefix)))
                        continue;

                /* Compare the mount ID listed in the table with the one the path currently resolves to */
                table_id = mnt_fs_get_id(fs);
                r = path_get_mnt_id(path, &current_id);
                if (r < 0) {
                        log_debug_errno(r, "Failed to get mount ID of '%s', ignoring: %m", path);
                        continue;
                }
                if (table_id != current_id) {
                        /* The path may be hidden by another over-mount or already remounted. */
                        log_debug("The mount IDs of '%s' obtained by libmount and path_get_mnt_id() are different (%i vs %i), ignoring.",
                                  path, table_id, current_id);
                        continue;
                }

                _cleanup_close_ int mount_fd = open(path, O_CLOEXEC|O_PATH);
                if (mount_fd < 0) {
                        if (errno == ENOENT) /* The path may be hidden by another over-mount or already unmounted. */
                                continue;

                        return log_debug_errno(errno, "Failed to open subtree of mounted filesystem '%s': %m", path);
                }

                _cleanup_free_ char *path_copy = strdup(path);
                if (!path_copy)
                        return log_oom_debug();

                if (!GREEDY_REALLOC(found, n_found + 1))
                        return log_oom_debug();

                /* The array entry takes over both the string and the fd */
                found[n_found++] = (SubMount) {
                        .path = TAKE_PTR(path_copy),
                        .mount_fd = TAKE_FD(mount_fd),
                };
        }

        typesafe_qsort(found, n_found, sub_mount_compare);
        sub_mount_drop(found, n_found);

        *ret_mounts = TAKE_PTR(found);
        *ret_n_mounts = n_found;
        return 0;
}
1642

1643
int bind_mount_submounts(
1,242✔
1644
                const char *source,
1645
                const char *target) {
1646

1647
        SubMount *mounts = NULL;
1,242✔
1648
        size_t n = 0;
1,242✔
1649
        int ret = 0, r;
1,242✔
1650

1651
        /* Bind mounts all child mounts of 'source' to 'target'. Useful when setting up a new procfs instance
1652
         * with new mount options to copy the original submounts over. */
1653

1654
        assert(source);
1,242✔
1655
        assert(target);
1,242✔
1656

1657
        CLEANUP_ARRAY(mounts, n, sub_mount_array_free);
1,242✔
1658

1659
        r = get_sub_mounts(source, &mounts, &n);
1,242✔
1660
        if (r < 0)
1,242✔
1661
                return r;
1662

1663
        FOREACH_ARRAY(m, mounts, n) {
5,770✔
1664
                _cleanup_free_ char *t = NULL;
4,528✔
1665
                const char *suffix;
4,528✔
1666

1667
                if (isempty(m->path))
4,528✔
1668
                        continue;
741✔
1669

1670
                assert_se(suffix = path_startswith(m->path, source));
3,787✔
1671

1672
                t = path_join(target, suffix);
3,787✔
1673
                if (!t)
3,787✔
1674
                        return -ENOMEM;
×
1675

1676
                r = path_is_mount_point(t);
3,787✔
1677
                if (r < 0) {
3,787✔
1678
                        log_debug_errno(r, "Failed to detect if '%s' already is a mount point, ignoring: %m", t);
9✔
1679
                        continue;
9✔
1680
                }
1681
                if (r > 0) {
3,778✔
1682
                        log_debug("Not bind mounting '%s' from '%s' to '%s', since there's already a mountpoint.", suffix, source, target);
×
1683
                        continue;
×
1684
                }
1685

1686
                r = mount_follow_verbose(LOG_DEBUG, FORMAT_PROC_FD_PATH(m->mount_fd), t, NULL, MS_BIND|MS_REC, NULL);
3,778✔
1687
                if (r < 0 && ret == 0)
3,778✔
1688
                        ret = r;
367✔
1689
        }
1690

1691
        return ret;
1692
}
1693

1694
int make_mount_point_inode_from_mode(int dir_fd, const char *dest, mode_t source_mode, mode_t target_mode) {
982✔
1695
        assert(dir_fd >= 0 || dir_fd == AT_FDCWD);
982✔
1696
        assert(dest);
982✔
1697

1698
        if (S_ISDIR(source_mode))
982✔
1699
                return mkdirat_label(dir_fd, dest, target_mode & 07777);
965✔
1700
        else
1701
                return RET_NERRNO(mknodat(dir_fd, dest, S_IFREG|(target_mode & 07666), 0)); /* Mask off X bit */
18✔
1702
}
1703

1704
int make_mount_point_inode_from_path(const char *source, const char *dest, mode_t access_mode) {
836✔
1705
        struct stat st;
836✔
1706

1707
        assert(source);
836✔
1708
        assert(dest);
836✔
1709

1710
        if (stat(source, &st) < 0)
836✔
1711
                return -errno;
×
1712

1713
        return make_mount_point_inode_from_mode(AT_FDCWD, dest, st.st_mode, access_mode);
836✔
1714
}
1715

1716
int trigger_automount_at(int dir_fd, const char *path) {
380✔
1717
        _cleanup_free_ char *nested = NULL;
760✔
1718

1719
        assert(dir_fd >= 0 || dir_fd == AT_FDCWD);
380✔
1720

1721
        nested = path_join(path, "a");
380✔
1722
        if (!nested)
380✔
1723
                return -ENOMEM;
1724

1725
        (void) faccessat(dir_fd, nested, F_OK, 0);
380✔
1726

1727
        return 0;
380✔
1728
}
1729

1730
unsigned long credentials_fs_mount_flags(bool ro) {
        /* A tight set of mount flags for credentials mounts: no devices, no execution, no SUID, plus
         * whatever ms_nosymfollow_supported() reports, and optionally read-only. */
        unsigned long flags = MS_NODEV|MS_NOEXEC|MS_NOSUID|ms_nosymfollow_supported();

        if (ro)
                flags |= MS_RDONLY;

        return flags;
}
1734

1735
int mount_credentials_fs(const char *path, size_t size, bool ro) {
1,950✔
1736
        _cleanup_free_ char *opts = NULL;
1,950✔
1737
        int r, noswap_supported;
1,950✔
1738

1739
        /* Mounts a file system we can place credentials in, i.e. with tight access modes right from the
1740
         * beginning, and ideally swapping turned off. In order of preference:
1741
         *
1742
         *      1. tmpfs if it supports "noswap"
1743
         *      2. ramfs
1744
         *      3. tmpfs if it doesn't support "noswap"
1745
         */
1746

1747
        noswap_supported = mount_option_supported("tmpfs", "noswap", NULL); /* Check explicitly to avoid kmsg noise */
1,950✔
1748
        if (noswap_supported > 0) {
1,950✔
1749
                _cleanup_free_ char *noswap_opts = NULL;
1,949✔
1750

1751
                if (asprintf(&noswap_opts, "mode=0700,nr_inodes=1024,size=%zu,noswap", size) < 0)
1,949✔
1752
                        return -ENOMEM;
1753

1754
                /* Best case: tmpfs with noswap (needs kernel >= 6.3) */
1755

1756
                r = mount_nofollow_verbose(
1,949✔
1757
                                LOG_DEBUG,
1758
                                "tmpfs",
1759
                                path,
1760
                                "tmpfs",
1761
                                credentials_fs_mount_flags(ro),
1762
                                noswap_opts);
1763
                if (r >= 0)
1,949✔
1764
                        return r;
1765
        }
1766

1767
        r = mount_nofollow_verbose(
1✔
1768
                        LOG_DEBUG,
1769
                        "ramfs",
1770
                        path,
1771
                        "ramfs",
1772
                        credentials_fs_mount_flags(ro),
1773
                        "mode=0700");
1774
        if (r >= 0)
1✔
1775
                return r;
1776

1777
        if (asprintf(&opts, "mode=0700,nr_inodes=1024,size=%zu", size) < 0)
1✔
1778
                return -ENOMEM;
1779

1780
        return mount_nofollow_verbose(
1,950✔
1781
                        LOG_DEBUG,
1782
                        "tmpfs",
1783
                        path,
1784
                        "tmpfs",
1785
                        credentials_fs_mount_flags(ro),
1786
                        opts);
1787
}
1788

1789
int make_fsmount(
2✔
1790
                int error_log_level,
1791
                const char *what,
1792
                const char *type,
1793
                unsigned long flags,
1794
                const char *options,
1795
                int userns_fd) {
1796

1797
        _cleanup_close_ int fs_fd = -EBADF, mnt_fd = -EBADF;
2✔
1798
        _cleanup_free_ char *o = NULL;
2✔
1799
        unsigned long f;
2✔
1800
        int r;
2✔
1801

1802
        assert(type);
2✔
1803
        assert(what);
2✔
1804

1805
        r = mount_option_mangle(options, flags, &f, &o);
2✔
1806
        if (r < 0)
2✔
1807
                return log_full_errno(
×
1808
                                error_log_level, r, "Failed to mangle mount options %s: %m",
1809
                                strempty(options));
1810

1811
        if (DEBUG_LOGGING) {
2✔
1812
                _cleanup_free_ char *fl = NULL;
2✔
1813
                (void) mount_flags_to_string(f, &fl);
2✔
1814

1815
                log_debug("Creating mount fd for %s (%s) (%s \"%s\")...",
4✔
1816
                        strna(what), strna(type), strnull(fl), strempty(o));
1817
        }
1818

1819
        fs_fd = fsopen(type, FSOPEN_CLOEXEC);
2✔
1820
        if (fs_fd < 0)
2✔
1821
                return log_full_errno(error_log_level, errno, "Failed to open superblock for \"%s\": %m", type);
×
1822

1823
        if (fsconfig(fs_fd, FSCONFIG_SET_STRING, "source", what, 0) < 0)
2✔
1824
                return log_full_errno(error_log_level, errno, "Failed to set mount source for \"%s\" to \"%s\": %m", type, what);
×
1825

1826
        if (FLAGS_SET(f, MS_RDONLY))
2✔
1827
                if (fsconfig(fs_fd, FSCONFIG_SET_FLAG, "ro", NULL, 0) < 0)
2✔
1828
                        return log_full_errno(error_log_level, errno, "Failed to set read only mount flag for \"%s\": %m", type);
×
1829

1830
        for (const char *p = o;;) {
2✔
1831
                _cleanup_free_ char *word = NULL;
×
1832
                char *eq;
2✔
1833

1834
                r = extract_first_word(&p, &word, ",", EXTRACT_KEEP_QUOTE);
2✔
1835
                if (r < 0)
2✔
1836
                        return log_full_errno(error_log_level, r, "Failed to parse mount option string \"%s\": %m", o);
×
1837
                if (r == 0)
2✔
1838
                        break;
1839

1840
                eq = strchr(word, '=');
×
1841
                if (eq) {
×
1842
                        *eq = 0;
×
1843
                        eq++;
×
1844

1845
                        if (fsconfig(fs_fd, FSCONFIG_SET_STRING, word, eq, 0) < 0)
×
1846
                                return log_full_errno(error_log_level, errno, "Failed to set mount option \"%s=%s\" for \"%s\": %m", word, eq, type);
×
1847
                } else {
1848
                        if (fsconfig(fs_fd, FSCONFIG_SET_FLAG, word, NULL, 0) < 0)
×
1849
                                return log_full_errno(error_log_level, errno, "Failed to set mount flag \"%s\" for \"%s\": %m", word, type);
×
1850
                }
1851
        }
1852

1853
        if (fsconfig(fs_fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0)
2✔
1854
                return log_full_errno(error_log_level, errno, "Failed to realize fs fd for \"%s\" (\"%s\"): %m", what, type);
×
1855

1856
        mnt_fd = fsmount(fs_fd, FSMOUNT_CLOEXEC, 0);
2✔
1857
        if (mnt_fd < 0)
2✔
1858
                return log_full_errno(error_log_level, errno, "Failed to create mount fd for \"%s\" (\"%s\"): %m", what, type);
×
1859

1860
        if (mount_setattr(mnt_fd, "", AT_EMPTY_PATH|AT_RECURSIVE,
2✔
1861
                          &(struct mount_attr) {
4✔
1862
                                  .attr_set = ms_flags_to_mount_attr(f) | (userns_fd >= 0 ? MOUNT_ATTR_IDMAP : 0),
4✔
1863
                                  .userns_fd = userns_fd,
1864
                          }, MOUNT_ATTR_SIZE_VER0) < 0)
1865
                return log_full_errno(error_log_level,
×
1866
                                      errno,
1867
                                      "Failed to set mount flags for \"%s\" (\"%s\"): %m",
1868
                                      what,
1869
                                      type);
1870

1871
        return TAKE_FD(mnt_fd);
2✔
1872
}
1873

1874
char* umount_and_unlink_and_free(char *p) {
1✔
1875
        if (!p)
1✔
1876
                return NULL;
1✔
1877

1878
        PROTECT_ERRNO;
2✔
1879
        (void) umount2(p, 0);
1✔
1880
        (void) unlink(p);
1✔
1881
        return mfree(p);
1✔
1882
}
1883

1884
static int path_get_mount_info_at(
617✔
1885
                int dir_fd,
1886
                const char *path,
1887
                char **ret_fstype,
1888
                char **ret_options) {
1889

1890
        _cleanup_(mnt_free_tablep) struct libmnt_table *table = NULL;
617✔
1891
        _cleanup_(mnt_free_iterp) struct libmnt_iter *iter = NULL;
617✔
1892
        int r, mnt_id;
617✔
1893

1894
        assert(dir_fd >= 0 || dir_fd == AT_FDCWD);
617✔
1895

1896
        r = path_get_mnt_id_at(dir_fd, path, &mnt_id);
617✔
1897
        if (r < 0)
617✔
1898
                return log_debug_errno(r, "Failed to get mount ID: %m");
×
1899

1900
        /* When getting options is requested, we also need to parse utab, otherwise userspace options like
1901
         * "_netdev" will be lost. */
1902
        if (ret_options)
617✔
1903
                r = libmount_parse_with_utab(&table, &iter);
617✔
1904
        else
1905
                r = libmount_parse_mountinfo(/* source = */ NULL, &table, &iter);
×
1906
        if (r < 0)
617✔
1907
                return log_debug_errno(r, "Failed to parse /proc/self/mountinfo: %m");
×
1908

1909
        for (;;) {
2,829✔
1910
                struct libmnt_fs *fs;
1,723✔
1911

1912
                r = mnt_table_next_fs(table, iter, &fs);
1,723✔
1913
                if (r == 1)
1,723✔
1914
                        break; /* EOF */
1915
                if (r < 0)
1,723✔
1916
                        return log_debug_errno(r, "Failed to get next entry from /proc/self/mountinfo: %m");
617✔
1917

1918
                if (mnt_fs_get_id(fs) != mnt_id)
1,723✔
1919
                        continue;
1,106✔
1920

1921
                _cleanup_free_ char *fstype = NULL, *options = NULL;
617✔
1922

1923
                if (ret_fstype) {
617✔
1924
                        fstype = strdup(strempty(mnt_fs_get_fstype(fs)));
617✔
1925
                        if (!fstype)
617✔
1926
                                return log_oom_debug();
×
1927
                }
1928

1929
                if (ret_options) {
617✔
1930
                        options = strdup(strempty(mnt_fs_get_options(fs)));
617✔
1931
                        if (!options)
617✔
1932
                                return log_oom_debug();
×
1933
                }
1934

1935
                if (ret_fstype)
617✔
1936
                        *ret_fstype = TAKE_PTR(fstype);
617✔
1937
                if (ret_options)
617✔
1938
                        *ret_options = TAKE_PTR(options);
617✔
1939

1940
                return 0;
1941
        }
1942

1943
        return log_debug_errno(SYNTHETIC_ERRNO(ESTALE), "Cannot find mount ID %i from /proc/self/mountinfo.", mnt_id);
617✔
1944
}
1945

1946
int path_is_network_fs_harder_at(int dir_fd, const char *path) {
635✔
1947
        _cleanup_close_ int fd = -EBADF;
635✔
1948
        int r;
635✔
1949

1950
        assert(dir_fd >= 0 || dir_fd == AT_FDCWD);
635✔
1951

1952
        fd = xopenat(dir_fd, path, O_PATH | O_CLOEXEC | O_NOFOLLOW);
635✔
1953
        if (fd < 0)
635✔
1954
                return fd;
1955

1956
        r = fd_is_network_fs(fd);
617✔
1957
        if (r != 0)
617✔
1958
                return r;
1959

1960
        _cleanup_free_ char *fstype = NULL, *options = NULL;
617✔
1961
        r = path_get_mount_info_at(fd, /* path = */ NULL, &fstype, &options);
617✔
1962
        if (r < 0)
617✔
1963
                return r;
1964

1965
        if (fstype_is_network(fstype))
617✔
1966
                return true;
1967

1968
        if (fstab_test_option(options, "_netdev\0"))
617✔
1969
                return true;
×
1970

1971
        return false;
1972
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc