• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

systemd / systemd / 15150396955

20 May 2025 10:32PM UTC coverage: 72.047% (-0.2%) from 72.25%
15150396955

push

github

web-flow
resolved: add new "DNS Delegate" concepts (#34368)

Various long standing issues (at least: #5573 #14159 #20485 #21260
#24532 #32022 #18056) have been asking for a way to delegate DNS
resolution of specific domains to very specific DNS servers.

This PR takes a major step towards that goal by adding a new concept,
"DNS Delegate", which allows configuring just that. Basically, this adds
a third kind of DNS scope to resolved's logic: besides the per-link and
global DNS scopes there are now also "delegate" scopes, which can be
created by dropping in a new file /etc/systemd/dns-delegate/*.conf. They
carry DNS= and Domains= lines, just like the global setting or the
per-link configuration can.

They are consulted the same way as link DNS scopes are, following the
same routing rules.

This allows these DNS delegates to be configured statically via drop-in
files, as mentioned, and only adds the most basic functionality. Later on
we might want to extend this:

1. Allow dynamic creation of DNS delegates via IPC, with their lifecycle
bound to the IPC client (use case: installing a DNS delegate that routes
traffic to some DNS-over-TLS server once basic setup is complete).
2. Allow configuration of protocol details per delegate the same way
this is currently allowed per-link.
3. Instead of strictly using DNS as delegation protocol, support an
alternative varlink based protocol (without retransmission problems and
so on) that systemd-machined and similar can implement.

This PR is not complete yet: it lacks docs and tests. It seems to work
fine in my local tests, however.

Fixes: #5573
Fixes: #18056
Fixes: #20485

470 of 586 new or added lines in 14 files covered. (80.2%)

3358 existing lines in 54 files now uncovered.

299091 of 415134 relevant lines covered (72.05%)

703065.7 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

72.86
/src/shared/mount-util.c
1
/* SPDX-License-Identifier: LGPL-2.1-or-later */
2

3
#include <errno.h>
4
#include <linux/loop.h>
5
#include <stdlib.h>
6
#include <sys/mount.h>
7
#include <sys/stat.h>
8
#include <sys/statvfs.h>
9
#include <unistd.h>
10

11
#include "alloc-util.h"
12
#include "chase.h"
13
#include "dissect-image.h"
14
#include "exec-util.h"
15
#include "extract-word.h"
16
#include "fd-util.h"
17
#include "fileio.h"
18
#include "fs-util.h"
19
#include "fstab-util.h"
20
#include "glyph-util.h"
21
#include "hashmap.h"
22
#include "initrd-util.h"
23
#include "label-util.h"
24
#include "libmount-util.h"
25
#include "log.h"
26
#include "missing_syscall.h"
27
#include "mkdir-label.h"
28
#include "mount-util.h"
29
#include "mountpoint-util.h"
30
#include "namespace-util.h"
31
#include "parse-util.h"
32
#include "path-util.h"
33
#include "process-util.h"
34
#include "set.h"
35
#include "sort-util.h"
36
#include "stat-util.h"
37
#include "stdio-util.h"
38
#include "string-table.h"
39
#include "string-util.h"
40
#include "strv.h"
41
#include "tmpfile-util.h"
42
#include "user-util.h"
43

44
int umount_recursive_full(const char *prefix, int flags, char **keep) {
8,800✔
45
        _cleanup_fclose_ FILE *f = NULL;
8,800✔
46
        int n = 0, r;
8,800✔
47

48
        /* Try to umount everything recursively below a directory. Also, take care of stacked mounts, and
49
         * keep unmounting them until they are gone. */
50

51
        f = fopen("/proc/self/mountinfo", "re"); /* Pin the file, in case we unmount /proc/ as part of the logic here */
8,800✔
52
        if (!f)
8,800✔
53
                return log_debug_errno(errno, "Failed to open /proc/self/mountinfo: %m");
×
54

55
        for (;;) {
46,094✔
56
                _cleanup_(mnt_free_tablep) struct libmnt_table *table = NULL;
18,647✔
57
                _cleanup_(mnt_free_iterp) struct libmnt_iter *iter = NULL;
27,447✔
58
                bool again = false;
27,447✔
59

60
                r = libmount_parse_mountinfo(f, &table, &iter);
27,447✔
61
                if (r < 0)
27,447✔
62
                        return log_debug_errno(r, "Failed to parse /proc/self/mountinfo: %m");
×
63

64
                for (;;) {
2,901,259✔
65
                        bool shall_keep = false;
1,464,353✔
66
                        struct libmnt_fs *fs;
1,464,353✔
67
                        const char *path;
1,464,353✔
68

69
                        r = mnt_table_next_fs(table, iter, &fs);
1,464,353✔
70
                        if (r == 1)
1,464,353✔
71
                                break;
72
                        if (r < 0)
1,455,553✔
73
                                return log_debug_errno(r, "Failed to get next entry from /proc/self/mountinfo: %m");
×
74

75
                        path = mnt_fs_get_target(fs);
1,455,553✔
76
                        if (!path)
1,455,553✔
77
                                continue;
1,436,906✔
78

79
                        if (prefix && !path_startswith(path, prefix)) {
2,910,775✔
80
                                // FIXME: This is extremely noisy, we're probably doing something very wrong
81
                                // to trigger this so often, needs more investigation.
82
                                // log_trace("Not unmounting %s, outside of prefix: %s", path, prefix);
83
                                continue;
1,427,404✔
84
                        }
85

86
                        STRV_FOREACH(k, keep)
28,219✔
87
                                /* Match against anything in the path to the dirs to keep, or below the dirs to keep */
88
                                if (path_startswith(path, *k) || path_startswith(*k, path)) {
346✔
89
                                        shall_keep = true;
276✔
90
                                        break;
276✔
91
                                }
92
                        if (shall_keep) {
28,425✔
93
                                log_debug("Not unmounting %s, referenced by keep list.", path);
276✔
94
                                continue;
276✔
95
                        }
96

97
                        if (umount2(path, flags | UMOUNT_NOFOLLOW) < 0) {
27,873✔
98
                                log_debug_errno(errno, "Failed to umount %s, ignoring: %m", path);
9,226✔
99
                                continue;
9,226✔
100
                        }
101

102
                        log_trace("Successfully unmounted %s", path);
18,647✔
103

104
                        again = true;
18,647✔
105
                        n++;
18,647✔
106

107
                        break;
18,647✔
108
                }
109

110
                if (!again)
8,800✔
111
                        break;
112

113
                rewind(f);
18,647✔
114
        }
115

116
        return n;
8,800✔
117
}
118

119
#define MS_CONVERTIBLE_FLAGS (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_NOSYMFOLLOW)
120

121
static uint64_t ms_flags_to_mount_attr(unsigned long a) {
38,702✔
122
        uint64_t f = 0;
38,702✔
123

124
        if (FLAGS_SET(a, MS_RDONLY))
38,702✔
125
                f |= MOUNT_ATTR_RDONLY;
1,541✔
126

127
        if (FLAGS_SET(a, MS_NOSUID))
38,702✔
128
                f |= MOUNT_ATTR_NOSUID;
17,812✔
129

130
        if (FLAGS_SET(a, MS_NODEV))
38,702✔
131
                f |= MOUNT_ATTR_NODEV;
2✔
132

133
        if (FLAGS_SET(a, MS_NOEXEC))
38,702✔
134
                f |= MOUNT_ATTR_NOEXEC;
2✔
135

136
        if (FLAGS_SET(a, MS_NOSYMFOLLOW))
38,702✔
137
                f |= MOUNT_ATTR_NOSYMFOLLOW;
×
138

139
        return f;
38,702✔
140
}
141

142
static bool skip_mount_set_attr = false;
143

144
/* Use this function only if you do not have direct access to /proc/self/mountinfo but the caller can open it
145
 * for you. This is the case when /proc is masked or not mounted. Otherwise, use bind_remount_recursive. */
146
int bind_remount_recursive_with_mountinfo(
36,844✔
147
                const char *prefix,
148
                unsigned long new_flags,
149
                unsigned long flags_mask,
150
                char **deny_list,
151
                FILE *proc_self_mountinfo) {
152

153
        _cleanup_fclose_ FILE *proc_self_mountinfo_opened = NULL;
36,844✔
154
        _cleanup_set_free_ Set *done = NULL;
36,844✔
155
        unsigned n_tries = 0;
36,844✔
156
        int r;
36,844✔
157

158
        assert(prefix);
36,844✔
159

160
        if ((flags_mask & ~MS_CONVERTIBLE_FLAGS) == 0 && strv_isempty(deny_list) && !skip_mount_set_attr) {
53,865✔
161
                /* Let's take a shortcut for all the flags we know how to convert into mount_setattr() flags */
162

163
                if (mount_setattr(AT_FDCWD, prefix, AT_SYMLINK_NOFOLLOW|AT_RECURSIVE,
17,021✔
164
                                  &(struct mount_attr) {
17,021✔
165
                                          .attr_set = ms_flags_to_mount_attr(new_flags & flags_mask),
17,021✔
166
                                          .attr_clr = ms_flags_to_mount_attr(~new_flags & flags_mask),
17,021✔
167
                                  }, MOUNT_ATTR_SIZE_VER0) < 0) {
168

169
                        log_debug_errno(errno, "mount_setattr() failed, falling back to classic remounting: %m");
2✔
170

171
                        /* We fall through to classic behaviour if not supported (i.e. kernel < 5.12). We
172
                         * also do this for all other kinds of errors since they are so many different, and
173
                         * mount_setattr() has no graceful mode where it continues despite seeing errors one
174
                         * some mounts, but we want that. Moreover mount_setattr() only works on the mount
175
                         * point inode itself, not a non-mount point inode, and we want to support arbitrary
176
                         * prefixes here. */
177

178
                        if (ERRNO_IS_NOT_SUPPORTED(errno)) /* if not supported, then don't bother at all anymore */
2✔
179
                                skip_mount_set_attr = true;
×
180
                } else
181
                        return 0; /* Nice, this worked! */
17,019✔
182
        }
183

184
        if (!proc_self_mountinfo) {
19,825✔
185
                r = fopen_unlocked("/proc/self/mountinfo", "re", &proc_self_mountinfo_opened);
3✔
186
                if (r < 0)
3✔
187
                        return r;
188

189
                proc_self_mountinfo = proc_self_mountinfo_opened;
3✔
190
        }
191

192
        /* Recursively remount a directory (and all its submounts) with desired flags (MS_READONLY,
193
         * MS_NOSUID, MS_NOEXEC). If the directory is already mounted, we reuse the mount and simply mark it
194
         * MS_BIND|MS_RDONLY (or remove the MS_RDONLY for read-write operation), ditto for other flags. If it
195
         * isn't we first make it one. Afterwards we apply (or remove) the flags to all submounts we can
196
         * access, too. When mounts are stacked on the same mount point we only care for each individual
197
         * "top-level" mount on each point, as we cannot influence/access the underlying mounts anyway. We do
198
         * not have any effect on future submounts that might get propagated, they might be writable
199
         * etc. This includes future submounts that have been triggered via autofs. Also note that we can't
200
         * operate atomically here. Mounts established while we process the tree might or might not get
201
         * noticed and thus might or might not be covered.
202
         *
203
         * If the "deny_list" parameter is specified it may contain a list of subtrees to exclude from the
204
         * remount operation. Note that we'll ignore the deny list for the top-level path. */
205

206
        for (;;) {
39,652✔
207
                _cleanup_(mnt_free_tablep) struct libmnt_table *table = NULL;
39,652✔
208
                _cleanup_(mnt_free_iterp) struct libmnt_iter *iter = NULL;
39,652✔
209
                _cleanup_hashmap_free_ Hashmap *todo = NULL;
39,650✔
210
                bool top_autofs = false;
39,652✔
211

212
                if (n_tries++ >= 32) /* Let's not retry this loop forever */
39,652✔
213
                        return -EBUSY;
214

215
                rewind(proc_self_mountinfo);
39,652✔
216

217
                r = libmount_parse_mountinfo(proc_self_mountinfo, &table, &iter);
39,652✔
218
                if (r < 0)
39,652✔
219
                        return log_debug_errno(r, "Failed to parse /proc/self/mountinfo: %m");
×
220

221
                for (;;) {
2,763,359✔
222
                        _cleanup_free_ char *d = NULL;
2,723,707✔
223
                        const char *path, *type, *opts;
2,763,359✔
224
                        unsigned long flags = 0;
2,763,359✔
225
                        struct libmnt_fs *fs;
2,763,359✔
226

227
                        r = mnt_table_next_fs(table, iter, &fs);
2,763,359✔
228
                        if (r == 1) /* EOF */
2,763,359✔
229
                                break;
230
                        if (r < 0)
2,723,707✔
231
                                return log_debug_errno(r, "Failed to get next entry from /proc/self/mountinfo: %m");
×
232

233
                        path = mnt_fs_get_target(fs);
2,723,707✔
234
                        if (!path)
2,723,707✔
235
                                continue;
×
236

237
                        if (!path_startswith(path, prefix))
2,723,707✔
238
                                continue;
2,655,564✔
239

240
                        type = mnt_fs_get_fstype(fs);
68,143✔
241
                        if (!type)
68,143✔
242
                                continue;
×
243

244
                        /* Let's ignore autofs mounts. If they aren't triggered yet, we want to avoid
245
                         * triggering them, as we don't make any guarantees for future submounts anyway. If
246
                         * they are already triggered, then we will find another entry for this. */
247
                        if (streq(type, "autofs")) {
68,143✔
248
                                top_autofs = top_autofs || path_equal(path, prefix);
6,304✔
249
                                continue;
3,152✔
250
                        }
251

252
                        if (set_contains(done, path))
64,991✔
253
                                continue;
23,455✔
254

255
                        /* Ignore this mount if it is deny-listed, but only if it isn't the top-level mount
256
                         * we shall operate on. */
257
                        if (!path_equal(path, prefix)) {
41,536✔
258
                                bool deny_listed = false;
283,517✔
259

260
                                STRV_FOREACH(i, deny_list) {
283,517✔
261
                                        if (path_equal(*i, prefix))
279,888✔
262
                                                continue;
21,698✔
263

264
                                        if (!path_startswith(*i, prefix))
258,190✔
265
                                                continue;
139,345✔
266

267
                                        if (path_startswith(path, *i)) {
118,845✔
268
                                                deny_listed = true;
269
                                                log_trace("Not remounting %s deny-listed by %s, called for %s", path, *i, prefix);
270
                                                break;
271
                                        }
272
                                }
273

274
                                if (deny_listed)
21,709✔
275
                                        continue;
18,080✔
276
                        }
277

278
                        opts = mnt_fs_get_vfs_options(fs);
23,456✔
279
                        if (opts) {
23,456✔
280
                                r = mnt_optstr_get_flags(opts, &flags, mnt_get_builtin_optmap(MNT_LINUX_MAP));
23,456✔
281
                                if (r < 0)
23,456✔
282
                                        log_debug_errno(r, "Could not get flags for '%s', ignoring: %m", path);
×
283
                        }
284

285
                        d = strdup(path);
23,456✔
286
                        if (!d)
23,456✔
287
                                return -ENOMEM;
288

289
                        r = hashmap_ensure_put(&todo, &path_hash_ops_free, d, ULONG_TO_PTR(flags));
23,456✔
290
                        if (r == -EEXIST)
23,456✔
291
                                /* If the same path was recorded, but with different mount flags, update it:
292
                                 * it means a mount point is overmounted, and libmount returns the "bottom" (or
293
                                 * older one) first, but we want to reapply the flags from the "top" (or newer
294
                                 * one). See: https://github.com/systemd/systemd/issues/20032
295
                                 * Note that this shouldn't really fail, as we were just told that the key
296
                                 * exists, and it's an update so we want 'd' to be freed immediately. */
297
                                r = hashmap_update(todo, d, ULONG_TO_PTR(flags));
8✔
298
                        if (r < 0)
23,456✔
299
                                return r;
300
                        if (r > 0)
23,456✔
301
                                TAKE_PTR(d);
23,414✔
302
                }
303

304
                /* Check if the top-level directory was among what we have seen so far. For that check both
305
                 * 'done' and 'todo'. Also check 'top_autofs' because if the top-level dir is an autofs we'll
306
                 * not include it in either set but will set this bool. */
307
                if (!set_contains(done, prefix) &&
39,652✔
308
                    !(top_autofs || hashmap_contains(todo, prefix))) {
19,827✔
309

310
                        /* The prefix directory itself is not yet a mount, make it one. */
311
                        r = mount_nofollow(prefix, prefix, NULL, MS_BIND|MS_REC, NULL);
2✔
312
                        if (r < 0)
2✔
313
                                return r;
314

315
                        /* Immediately rescan, so that we pick up the new mount's flags */
316
                        continue;
2✔
317
                }
318

319
                /* If we have no submounts to process anymore, we are done */
320
                if (hashmap_isempty(todo))
39,650✔
321
                        return 0;
322

323
                for (;;) {
43,238✔
324
                        unsigned long flags;
43,238✔
325
                        char *x = NULL;
43,238✔
326

327
                        /* Take the first mount from our list of mounts to still process */
328
                        flags = PTR_TO_ULONG(hashmap_steal_first_key_and_value(todo, (void**) &x));
43,238✔
329
                        if (!x)
43,238✔
330
                                break;
331

332
                        r = set_ensure_consume(&done, &path_hash_ops_free, x);
23,413✔
333
                        if (IN_SET(r, 0, -EEXIST))
23,413✔
334
                                continue; /* Already done */
219✔
335
                        if (r < 0)
23,413✔
336
                                return r;
×
337

338
                        /* Now, remount this with the new flags set, but exclude MS_RELATIME from it. (It's
339
                         * the default anyway, thus redundant, and in userns we'll get an error if we try to
340
                         * explicitly enable it) */
341
                        r = mount_nofollow(NULL, x, NULL, ((flags & ~flags_mask)|MS_BIND|MS_REMOUNT|new_flags) & ~MS_RELATIME, NULL);
23,413✔
342
                        if (r < 0) {
23,413✔
343
                                int q;
219✔
344

345
                                /* OK, so the remount of this entry failed. We'll ultimately ignore this in
346
                                 * almost all cases (there are simply so many reasons why this can fail,
347
                                 * think autofs, NFS, FUSE, …), but let's generate useful debug messages at
348
                                 * the very least. */
349

350
                                q = path_is_mount_point(x);
219✔
351
                                if (IN_SET(q, 0, -ENOENT)) {
219✔
352
                                        /* Hmm, whaaaa? The mount point is not actually a mount point? Then
353
                                         * it is either obstructed by a later mount or somebody has been
354
                                         * racing against us and removed it. Either way the mount point
355
                                         * doesn't matter to us, let's ignore it hence. */
356
                                        log_debug_errno(r, "Mount point '%s' to remount is not a mount point anymore, ignoring remount failure: %m", x);
216✔
357
                                        continue;
216✔
358
                                }
359
                                if (q < 0) /* Any other error on this? Just log and continue */
3✔
360
                                        log_debug_errno(q, "Failed to determine whether '%s' is a mount point or not, ignoring: %m", x);
×
361

362
                                if (((flags ^ new_flags) & flags_mask & ~MS_RELATIME) == 0) { /* ignore MS_RELATIME while comparing */
3✔
363
                                        log_debug_errno(r, "Couldn't remount '%s', but the flags already match what we want, hence ignoring: %m", x);
×
364
                                        continue;
×
365
                                }
366

367
                                /* Make this fatal if this is the top-level mount */
368
                                if (path_equal(x, prefix))
3✔
369
                                        return r;
370

371
                                /* If this is not the top-level mount, then handle this gracefully: log but
372
                                 * otherwise ignore. With NFS, FUSE, autofs there are just too many reasons
373
                                 * this might fail without a chance for us to do anything about it, let's
374
                                 * hence be strict on the top-level mount and lenient on the inner ones. */
375
                                log_debug_errno(r, "Couldn't remount submount '%s' for unexpected reason, ignoring: %m", x);
3✔
376
                                continue;
3✔
377
                        }
378

379
                        log_trace("Remounted %s.", x);
23,194✔
380
                }
381
        }
382
}
383

384
int bind_remount_one_with_mountinfo(
2,329✔
385
                const char *path,
386
                unsigned long new_flags,
387
                unsigned long flags_mask,
388
                FILE *proc_self_mountinfo) {
389

390
        _cleanup_(mnt_free_tablep) struct libmnt_table *table = NULL;
2,329✔
391
        unsigned long flags = 0;
2,329✔
392
        struct libmnt_fs *fs;
2,329✔
393
        const char *opts;
2,329✔
394
        int r;
2,329✔
395

396
        assert(path);
2,329✔
397
        assert(proc_self_mountinfo);
2,329✔
398

399
        if ((flags_mask & ~MS_CONVERTIBLE_FLAGS) == 0 && !skip_mount_set_attr) {
2,329✔
400
                /* Let's take a shortcut for all the flags we know how to convert into mount_setattr() flags */
401

402
                if (mount_setattr(AT_FDCWD, path, AT_SYMLINK_NOFOLLOW,
2,329✔
403
                                  &(struct mount_attr) {
2,329✔
404
                                          .attr_set = ms_flags_to_mount_attr(new_flags & flags_mask),
2,329✔
405
                                          .attr_clr = ms_flags_to_mount_attr(~new_flags & flags_mask),
2,329✔
406
                                  }, MOUNT_ATTR_SIZE_VER0) < 0) {
407

408
                        log_debug_errno(errno, "mount_setattr() didn't work, falling back to classic remounting: %m");
4✔
409

410
                        if (ERRNO_IS_NOT_SUPPORTED(errno)) /* if not supported, then don't bother at all anymore */
4✔
411
                                skip_mount_set_attr = true;
×
412
                } else
413
                        return 0; /* Nice, this worked! */
2,325✔
414
        }
415

416
        rewind(proc_self_mountinfo);
4✔
417

418
        table = mnt_new_table();
4✔
419
        if (!table)
4✔
420
                return -ENOMEM;
421

422
        r = mnt_table_parse_stream(table, proc_self_mountinfo, "/proc/self/mountinfo");
4✔
423
        if (r < 0)
4✔
424
                return r;
425

426
        fs = mnt_table_find_target(table, path, MNT_ITER_FORWARD);
4✔
427
        if (!fs) {
4✔
428
                r = access_nofollow(path, F_OK); /* Hmm, it's not in the mount table, but does it exist at all? */
4✔
429
                if (r < 0)
4✔
430
                        return r;
431

432
                return -EINVAL; /* Not a mount point we recognize */
2✔
433
        }
434

435
        opts = mnt_fs_get_vfs_options(fs);
×
436
        if (opts) {
×
437
                r = mnt_optstr_get_flags(opts, &flags, mnt_get_builtin_optmap(MNT_LINUX_MAP));
×
438
                if (r < 0)
×
439
                        log_debug_errno(r, "Could not get flags for '%s', ignoring: %m", path);
×
440
        }
441

442
        r = mount_nofollow(NULL, path, NULL, ((flags & ~flags_mask)|MS_BIND|MS_REMOUNT|new_flags) & ~MS_RELATIME, NULL);
×
443
        if (r < 0) {
×
444
                if (((flags ^ new_flags) & flags_mask & ~MS_RELATIME) != 0) /* Ignore MS_RELATIME again,
×
445
                                                                             * since kernel adds it in
446
                                                                             * everywhere, because it's the
447
                                                                             * default. */
448
                        return r;
449

450
                /* Let's handle redundant remounts gracefully */
451
                log_debug_errno(r, "Failed to remount '%s' but flags already match what we want, ignoring: %m", path);
4✔
452
        }
453

454
        return 0;
455
}
456

457
int bind_remount_one(const char *path, unsigned long new_flags, unsigned long flags_mask) {
53✔
458
        _cleanup_fclose_ FILE *proc_self_mountinfo = NULL;
53✔
459

460
        proc_self_mountinfo = fopen("/proc/self/mountinfo", "re");
53✔
461
        if (!proc_self_mountinfo)
53✔
462
                return log_debug_errno(errno, "Failed to open /proc/self/mountinfo: %m");
×
463

464
        return bind_remount_one_with_mountinfo(path, new_flags, flags_mask, proc_self_mountinfo);
53✔
465
}
466

467
/* Switches the root file system with pivot_root(). Operates on the current working directory, which the
 * caller is expected to have changed to the new root already; 'fd_newroot' is only sanity-checked and
 * 'path' is only used for log messages. Returns 0 on success, negative on failure. */
static int mount_switch_root_pivot(int fd_newroot, const char *path) {
        int r;

        assert(fd_newroot >= 0);
        assert(path);

        /* Let the kernel tuck the new root under the old one. */
        r = pivot_root(".", ".");
        if (r < 0)
                return log_debug_errno(errno, "Failed to pivot root to new rootfs '%s': %m", path);

        /* Detach the old root, which reveals our brand new root. (This always operates on the top-most
         * mount on our cwd, regardless of what our current directory actually points to.) */
        r = umount2(".", MNT_DETACH);
        if (r < 0)
                return log_debug_errno(errno, "Failed to unmount old rootfs: %m");

        return 0;
}
482

483
/* Switches the root file system by moving the mount at the current working directory onto "/" with MS_MOVE
 * and then chroot()ing into it. 'fd_newroot' is only sanity-checked and 'path' is only used for log
 * messages. Returns 0 on success, negative on failure. */
static int mount_switch_root_move(int fd_newroot, const char *path) {
        int r;

        assert(fd_newroot >= 0);
        assert(path);

        /* Move the new root fs on top of the current root */
        r = mount(".", "/", NULL, MS_MOVE, NULL);
        if (r < 0)
                return log_debug_errno(errno, "Failed to move new rootfs '%s': %m", path);

        /* Also make it our root directory */
        r = chroot(".");
        if (r < 0)
                return log_debug_errno(errno, "Failed to chroot to new rootfs '%s': %m", path);

        return 0;
}
497

498
int mount_switch_root_full(const char *path, unsigned long mount_propagation_flag, bool force_ms_move) {
2,217✔
499
        _cleanup_close_ int fd_newroot = -EBADF;
2,217✔
500
        int r, is_current_root;
2,217✔
501

502
        assert(path);
2,217✔
503
        assert(mount_propagation_flag_is_valid(mount_propagation_flag));
2,217✔
504

505
        fd_newroot = open(path, O_PATH|O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW);
2,217✔
506
        if (fd_newroot < 0)
2,217✔
507
                return log_debug_errno(errno, "Failed to open new rootfs '%s': %m", path);
×
508

509
        is_current_root = path_is_root_at(fd_newroot, NULL);
2,217✔
510
        if (is_current_root < 0)
2,217✔
511
                return log_debug_errno(is_current_root, "Failed to determine if target dir is our root already: %m");
×
512

513
        /* Change into the new rootfs. */
514
        if (fchdir(fd_newroot) < 0)
2,217✔
515
                return log_debug_errno(errno, "Failed to chdir into new rootfs '%s': %m", path);
×
516

517
        /* Make this a NOP if we are supposed to switch to our current root fs. After all, both pivot_root()
518
         * and MS_MOVE don't like that. */
519
        if (!is_current_root) {
2,217✔
520
                if (!force_ms_move) {
2,215✔
521
                        r = mount_switch_root_pivot(fd_newroot, path);
2,215✔
522
                        if (r < 0) {
2,215✔
523
                                log_debug_errno(r, "Failed to pivot into new rootfs '%s', will try to use MS_MOVE instead: %m", path);
36✔
524
                                force_ms_move = true;
525
                        }
526
                }
527
                if (force_ms_move) {
528
                        /* Failed to pivot_root() fallback to MS_MOVE. For example, this may happen if the rootfs is
529
                         * an initramfs in which case pivot_root() isn't supported. */
530
                        r = mount_switch_root_move(fd_newroot, path);
36✔
531
                        if (r < 0)
36✔
532
                                return log_debug_errno(r, "Failed to switch to new rootfs '%s' with MS_MOVE: %m", path);
×
533
                }
534
        }
535

536
        log_debug("Successfully switched root to '%s'.", path);
2,217✔
537

538
        /* Finally, let's establish the requested propagation flags. */
539
        if (mount_propagation_flag == 0)
2,217✔
540
                return 0;
541

542
        if (mount(NULL, ".", NULL, mount_propagation_flag | MS_REC, NULL) < 0)
213✔
543
                return log_debug_errno(errno, "Failed to turn new rootfs '%s' into %s mount: %m",
×
544
                                       mount_propagation_flag_to_string(mount_propagation_flag), path);
545

546
        return 0;
547
}
548

549
int repeat_unmount(const char *path, int flags) {
15✔
550
        bool done = false;
15✔
551

552
        assert(path);
15✔
553

554
        /* If there are multiple mounts on a mount point, this
555
         * removes them all */
556

557
        for (;;) {
30✔
558
                if (umount2(path, flags) < 0) {
30✔
559

560
                        if (errno == EINVAL)
15✔
561
                                return done;
15✔
562

563
                        return -errno;
×
564
                }
565

566
                done = true;
567
        }
568
}
569

570
int mode_to_inaccessible_node(
4,813✔
571
                const char *runtime_dir,
572
                mode_t mode,
573
                char **ret) {
574

575
        /* This function maps a node type to a corresponding inaccessible file node. These nodes are created
576
         * during early boot by PID 1. In some cases we lacked the privs to create the character and block
577
         * devices (maybe because we run in an userns environment, or miss CAP_SYS_MKNOD, or run with a
578
         * devices policy that excludes device nodes with major and minor of 0), but that's fine, in that
579
         * case we use an AF_UNIX file node instead, which is not the same, but close enough for most
580
         * uses. And most importantly, the kernel allows bind mounts from socket nodes to any non-directory
581
         * file nodes, and that's the most important thing that matters.
582
         *
583
         * Note that the runtime directory argument shall be the top-level runtime directory, i.e. /run/ if
584
         * we operate in system context and $XDG_RUNTIME_DIR if we operate in user context. */
585

586
        _cleanup_free_ char *d = NULL;
4,813✔
587
        const char *node;
4,813✔
588

589
        assert(ret);
4,813✔
590

591
        if (!runtime_dir)
4,813✔
592
                runtime_dir = "/run";
4✔
593

594
        if (S_ISLNK(mode))
4,813✔
595
                return -EINVAL;
596

597
        node = inode_type_to_string(mode);
4,813✔
598
        if (!node)
4,813✔
599
                return -EINVAL;
600

601
        d = path_join(runtime_dir, "systemd/inaccessible", node);
4,813✔
602
        if (!d)
4,813✔
603
                return -ENOMEM;
604

605
        /* On new kernels unprivileged users are permitted to create 0:0 char device nodes (because they also
606
         * act as whiteout inode for overlayfs), but no other char or block device nodes. On old kernels no
607
         * device node whatsoever may be created by unprivileged processes. Hence, if the caller asks for the
608
         * inaccessible block device node let's see if the block device node actually exists, and if not,
609
         * fall back to the character device node. From there fall back to the socket device node. This means
610
         * in the best case we'll get the right device node type — but if not we'll hopefully at least get a
611
         * device node at all. */
612

613
        if (S_ISBLK(mode) &&
4,813✔
614
            access(d, F_OK) < 0 && errno == ENOENT) {
×
615
                free(d);
×
616
                d = path_join(runtime_dir, "/systemd/inaccessible/chr");
×
617
                if (!d)
×
618
                        return -ENOMEM;
619
        }
620

621
        if (IN_SET(mode & S_IFMT, S_IFBLK, S_IFCHR) &&
5,259✔
622
            access(d, F_OK) < 0 && errno == ENOENT) {
446✔
623
                free(d);
×
624
                d = path_join(runtime_dir, "/systemd/inaccessible/sock");
×
625
                if (!d)
×
626
                        return -ENOMEM;
627
        }
628

629
        *ret = TAKE_PTR(d);
4,813✔
630
        return 0;
4,813✔
631
}
632

633
int mount_flags_to_string(unsigned long flags, char **ret) {
48,713✔
634
        static const struct {
48,713✔
635
                unsigned long flag;
636
                const char *name;
637
        } map[] = {
638
                { .flag = MS_RDONLY,      .name = "MS_RDONLY",      },
639
                { .flag = MS_NOSUID,      .name = "MS_NOSUID",      },
640
                { .flag = MS_NODEV,       .name = "MS_NODEV",       },
641
                { .flag = MS_NOEXEC,      .name = "MS_NOEXEC",      },
642
                { .flag = MS_SYNCHRONOUS, .name = "MS_SYNCHRONOUS", },
643
                { .flag = MS_REMOUNT,     .name = "MS_REMOUNT",     },
644
                { .flag = MS_MANDLOCK,    .name = "MS_MANDLOCK",    },
645
                { .flag = MS_DIRSYNC,     .name = "MS_DIRSYNC",     },
646
                { .flag = MS_NOSYMFOLLOW, .name = "MS_NOSYMFOLLOW", },
647
                { .flag = MS_NOATIME,     .name = "MS_NOATIME",     },
648
                { .flag = MS_NODIRATIME,  .name = "MS_NODIRATIME",  },
649
                { .flag = MS_BIND,        .name = "MS_BIND",        },
650
                { .flag = MS_MOVE,        .name = "MS_MOVE",        },
651
                { .flag = MS_REC,         .name = "MS_REC",         },
652
                { .flag = MS_SILENT,      .name = "MS_SILENT",      },
653
                { .flag = MS_POSIXACL,    .name = "MS_POSIXACL",    },
654
                { .flag = MS_UNBINDABLE,  .name = "MS_UNBINDABLE",  },
655
                { .flag = MS_PRIVATE,     .name = "MS_PRIVATE",     },
656
                { .flag = MS_SLAVE,       .name = "MS_SLAVE",       },
657
                { .flag = MS_SHARED,      .name = "MS_SHARED",      },
658
                { .flag = MS_RELATIME,    .name = "MS_RELATIME",    },
659
                { .flag = MS_KERNMOUNT,   .name = "MS_KERNMOUNT",   },
660
                { .flag = MS_I_VERSION,   .name = "MS_I_VERSION",   },
661
                { .flag = MS_STRICTATIME, .name = "MS_STRICTATIME", },
662
                { .flag = MS_LAZYTIME,    .name = "MS_LAZYTIME",    },
663
        };
664
        _cleanup_free_ char *str = NULL;
48,713✔
665

666
        assert(ret);
48,713✔
667

668
        FOREACH_ELEMENT(entry, map)
1,266,538✔
669
                if (flags & entry->flag) {
1,217,825✔
670
                        if (!strextend_with_separator(&str, "|", entry->name))
114,651✔
671
                                return -ENOMEM;
672
                        flags &= ~entry->flag;
114,651✔
673
                }
674

675
        if (!str || flags != 0)
48,713✔
676
                if (strextendf_with_separator(&str, "|", "%lx", flags) < 0)
153✔
677
                        return -ENOMEM;
678

679
        *ret = TAKE_PTR(str);
48,713✔
680
        return 0;
48,713✔
681
}
682

683
int mount_verbose_full(
48,683✔
684
                int error_log_level,
685
                const char *what,
686
                const char *where,
687
                const char *type,
688
                unsigned long flags,
689
                const char *options,
690
                bool follow_symlink) {
691

692
        _cleanup_free_ char *fl = NULL, *o = NULL;
48,683✔
693
        unsigned long f;
48,683✔
694
        int r;
48,683✔
695

696
        r = mount_option_mangle(options, flags, &f, &o);
48,683✔
697
        if (r < 0)
48,683✔
698
                return log_full_errno(error_log_level, r,
×
699
                                      "Failed to mangle mount options %s: %m",
700
                                      strempty(options));
701

702
        (void) mount_flags_to_string(f, &fl);
48,683✔
703

704
        if (FLAGS_SET(f, MS_REMOUNT|MS_BIND))
48,683✔
705
                log_debug("Changing mount flags %s (%s \"%s\")...",
10,237✔
706
                          where, strnull(fl), strempty(o));
707
        else if (f & MS_REMOUNT)
43,564✔
708
                log_debug("Remounting superblock %s (%s \"%s\")...",
4✔
709
                          where, strnull(fl), strempty(o));
710
        else if (f & (MS_SHARED|MS_PRIVATE|MS_SLAVE|MS_UNBINDABLE))
43,560✔
711
                log_debug("Changing mount propagation %s (%s \"%s\")",
6,262✔
712
                          where, strnull(fl), strempty(o));
713
        else if (f & MS_BIND)
40,429✔
714
                log_debug("Bind-mounting %s on %s (%s \"%s\")...",
56,811✔
715
                          what, where, strnull(fl), strempty(o));
716
        else if (f & MS_MOVE)
11,950✔
717
                log_debug("Moving mount %s %s %s (%s \"%s\")...",
7,740✔
718
                          what, glyph(GLYPH_ARROW_RIGHT), where, strnull(fl), strempty(o));
719
        else
720
                log_debug("Mounting %s (%s) on %s (%s \"%s\")...",
9,342✔
721
                          strna(what), strna(type), where, strnull(fl), strempty(o));
722

723
        if (follow_symlink)
48,683✔
724
                r = RET_NERRNO(mount(what, where, type, f, o));
49,056✔
725
        else
726
                r = mount_nofollow(what, where, type, f, o);
43,773✔
727
        if (r < 0)
44,146✔
728
                return log_full_errno(error_log_level, r,
8,508✔
729
                                      "Failed to mount %s (type %s) on %s (%s \"%s\"): %m",
730
                                      strna(what), strna(type), where, strnull(fl), strempty(o));
731
        return 0;
732
}
733

734
int umount_verbose(
                int error_log_level,
                const char *where,
                int flags) {

        /* umount2() wrapper: announces the unmount at debug level and logs a failure at
         * 'error_log_level'. Returns 0 on success, otherwise whatever log_full_errno() returns for the
         * errno of the failed call. */

        assert(where);

        log_debug("Unmounting '%s'...", where);

        if (umount2(where, flags) >= 0)
                return 0;

        return log_full_errno(error_log_level, errno, "Failed to unmount '%s': %m", where);
}
748

749
int umountat_detach_verbose(
220✔
750
                int error_log_level,
751
                int fd,
752
                const char *where) {
753

754
        /* Similar to umountat_verbose(), but goes by fd + path. This implies MNT_DETACH, since to do this we
755
         * must pin the inode in question via an fd. */
756

757
        assert(fd >= 0 || fd == AT_FDCWD);
220✔
758

759
        /* If neither fd nor path are specified take this as reference to the cwd */
760
        if (fd == AT_FDCWD && isempty(where))
220✔
761
                return umount_verbose(error_log_level, ".", MNT_DETACH|UMOUNT_NOFOLLOW);
220✔
762

763
        /* If we don't actually take the fd into consideration for this operation shortcut things, so that we
764
         * don't have to open the inode */
765
        if (fd == AT_FDCWD || path_is_absolute(where))
220✔
766
                return umount_verbose(error_log_level, where, MNT_DETACH|UMOUNT_NOFOLLOW);
×
767

768
        _cleanup_free_ char *prefix = NULL;
440✔
769
        const char *p;
220✔
770
        if (fd_get_path(fd, &prefix) < 0)
220✔
771
                p = "<fd>"; /* if we can't get the path, return something vaguely useful */
772
        else
773
                p = prefix;
220✔
774
        _cleanup_free_ char *joined = isempty(where) ? strdup(p) : path_join(p, where);
559✔
775

776
        log_debug("Unmounting '%s'...", strna(joined));
220✔
777

778
        _cleanup_close_ int inode_fd = -EBADF;
220✔
779
        int mnt_fd;
220✔
780
        if (isempty(where))
220✔
781
                mnt_fd = fd;
782
        else {
783
                inode_fd = openat(fd, where, O_PATH|O_CLOEXEC|O_NOFOLLOW);
119✔
784
                if (inode_fd < 0)
119✔
785
                        return log_full_errno(error_log_level, errno, "Failed to pin '%s': %m", strna(joined));
×
786

787
                mnt_fd = inode_fd;
788
        }
789

790
        if (umount2(FORMAT_PROC_FD_PATH(mnt_fd), MNT_DETACH) < 0)
220✔
791
                return log_full_errno(error_log_level, errno, "Failed to unmount '%s': %m", strna(joined));
9✔
792

793
        return 0;
211✔
794
}
795

796
int mount_exchange_graceful(int fsmount_fd, const char *dest, bool mount_beneath) {
25✔
797
        int r;
25✔
798

799
        assert(fsmount_fd >= 0);
25✔
800
        assert(dest);
25✔
801

802
        /* First, try to mount beneath an existing mount point, and if that works, umount the old mount,
803
         * which is now at the top. This will ensure we can atomically replace a mount. Note that this works
804
         * also in the case where there are submounts down the tree. Mount propagation is allowed but
805
         * restricted to layouts that don't end up propagation the new mount on top of the mount stack.  If
806
         * this is not supported (minimum kernel v6.5), or if there is no mount on the mountpoint, we get
807
         * -EINVAL and then we fallback to normal mounting. */
808

809
        r = RET_NERRNO(move_mount(fsmount_fd, /* from_path = */ "",
35✔
810
                                  /* to_fd = */ -EBADF, dest,
811
                                  MOVE_MOUNT_F_EMPTY_PATH | (mount_beneath ? MOVE_MOUNT_BENEATH : 0)));
812
        if (mount_beneath) {
25✔
813
                if (r >= 0) /* Mounting beneath worked! Now unmount the upper mount. */
15✔
814
                        return umount_verbose(LOG_DEBUG, dest, UMOUNT_NOFOLLOW|MNT_DETACH);
11✔
815

816
                if (r == -EINVAL) { /* Fallback if mount_beneath is not supported */
4✔
817
                        log_debug_errno(r,
4✔
818
                                        "Cannot mount beneath '%s', falling back to overmount: %m",
819
                                        dest);
820
                        return mount_exchange_graceful(fsmount_fd, dest, /* mount_beneath = */ false);
4✔
821
                }
822
        }
823

824
        return r;
825
}
826

827
int mount_option_mangle(
48,755✔
828
                const char *options,
829
                unsigned long mount_flags,
830
                unsigned long *ret_mount_flags,
831
                char **ret_remaining_options) {
832

833
        const struct libmnt_optmap *map;
48,755✔
834
        _cleanup_free_ char *ret = NULL;
48,755✔
835
        int r;
48,755✔
836

837
        /* This extracts mount flags from the mount options, and stores
838
         * non-mount-flag options to '*ret_remaining_options'.
839
         * E.g.,
840
         * "rw,nosuid,nodev,relatime,size=1630748k,mode=0700,uid=1000,gid=1000"
841
         * is split to MS_NOSUID|MS_NODEV|MS_RELATIME and
842
         * "size=1630748k,mode=0700,uid=1000,gid=1000".
843
         * See more examples in test-mount-util.c.
844
         *
845
         * If 'options' does not contain any non-mount-flag options,
846
         * then '*ret_remaining_options' is set to NULL instead of empty string.
847
         * The validity of options stored in '*ret_remaining_options' is not checked.
848
         * If 'options' is NULL, this just copies 'mount_flags' to *ret_mount_flags. */
849

850
        assert(ret_mount_flags);
48,755✔
851
        assert(ret_remaining_options);
48,755✔
852

853
        map = mnt_get_builtin_optmap(MNT_LINUX_MAP);
48,755✔
854
        if (!map)
48,755✔
855
                return -EINVAL;
856

857
        for (const char *p = options;;) {
48,755✔
858
                _cleanup_free_ char *word = NULL;
23,299✔
859
                const struct libmnt_optmap *ent;
72,053✔
860

861
                r = extract_first_word(&p, &word, ",", EXTRACT_KEEP_QUOTE);
72,053✔
862
                if (r < 0)
72,053✔
863
                        return r;
864
                if (r == 0)
72,052✔
865
                        break;
866

867
                for (ent = map; ent->name; ent++) {
975,933✔
868
                        /* All entries in MNT_LINUX_MAP do not take any argument.
869
                         * Thus, ent->name does not contain "=" or "[=]". */
870
                        if (!streq(word, ent->name))
952,726✔
871
                                continue;
952,635✔
872

873
                        if (!(ent->mask & MNT_INVERT))
91✔
874
                                mount_flags |= ent->id;
82✔
875
                        else
876
                                mount_flags &= ~ent->id;
9✔
877

878
                        break;
879
                }
880

881
                /* If 'word' is not a mount flag, then store it in '*ret_remaining_options'. */
882
                if (!ent->name &&
46,505✔
883
                    !startswith_no_case(word, "x-") &&
46,412✔
884
                    !strextend_with_separator(&ret, ",", word))
23,205✔
885
                        return -ENOMEM;
886
        }
887

888
        *ret_mount_flags = mount_flags;
48,754✔
889
        *ret_remaining_options = TAKE_PTR(ret);
48,754✔
890

891
        return 0;
48,754✔
892
}
893

894
static int mount_in_namespace_legacy(
×
895
                const char *chased_src_path,
896
                int chased_src_fd,
897
                struct stat *chased_src_st,
898
                const char *propagate_path,
899
                const char *incoming_path,
900
                const char *dest,
901
                int pidns_fd,
902
                int mntns_fd,
903
                int root_fd,
904
                MountInNamespaceFlags flags,
905
                const MountOptions *options,
906
                const ImagePolicy *image_policy) {
907

908
        _cleanup_close_pair_ int errno_pipe_fd[2] = EBADF_PAIR;
×
909
        char mount_slave[] = "/tmp/propagate.XXXXXX", *mount_tmp, *mount_outside, *p;
×
910
        bool mount_slave_created = false, mount_slave_mounted = false,
×
911
                mount_tmp_created = false, mount_tmp_mounted = false,
×
912
                mount_outside_created = false, mount_outside_mounted = false;
×
913
        pid_t child;
×
914
        int r;
×
915

916
        assert(chased_src_path);
×
917
        assert(chased_src_fd >= 0);
×
918
        assert(chased_src_st);
×
919
        assert(propagate_path);
×
920
        assert(incoming_path);
×
921
        assert(dest);
×
922
        assert(pidns_fd >= 0);
×
923
        assert(mntns_fd >= 0);
×
924
        assert(root_fd >= 0);
×
925
        assert(!options || (flags & MOUNT_IN_NAMESPACE_IS_IMAGE));
×
926

927
        p = strjoina(propagate_path, "/");
×
928
        r = access_nofollow(p, F_OK);
×
929
        if (r < 0)
×
930
                return log_debug_errno(r == -ENOENT ? SYNTHETIC_ERRNO(EOPNOTSUPP) : r, "Target does not allow propagation of mount points");
×
931

932
        /* Our goal is to install a new bind mount into the container,
933
           possibly read-only. This is irritatingly complex
934
           unfortunately, currently.
935

936
           First, we start by creating a private playground in /tmp,
937
           that we can mount MS_SLAVE. (Which is necessary, since
938
           MS_MOVE cannot be applied to mounts with MS_SHARED parent
939
           mounts.) */
940

941
        if (!mkdtemp(mount_slave))
×
942
                return log_debug_errno(errno, "Failed to create playground %s: %m", mount_slave);
×
943

944
        mount_slave_created = true;
×
945

946
        r = mount_nofollow_verbose(LOG_DEBUG, mount_slave, mount_slave, NULL, MS_BIND, NULL);
×
947
        if (r < 0)
×
948
                goto finish;
×
949

950
        mount_slave_mounted = true;
×
951

952
        r = mount_nofollow_verbose(LOG_DEBUG, NULL, mount_slave, NULL, MS_SLAVE, NULL);
×
953
        if (r < 0)
×
954
                goto finish;
×
955

956
        /* Second, we mount the source file or directory to a directory inside of our MS_SLAVE playground. */
957
        mount_tmp = strjoina(mount_slave, "/mount");
×
958
        r = make_mount_point_inode_from_mode(AT_FDCWD, mount_tmp, (flags & MOUNT_IN_NAMESPACE_IS_IMAGE) ? S_IFDIR : chased_src_st->st_mode, 0700);
×
959
        if (r < 0) {
×
960
                log_debug_errno(r, "Failed to create temporary mount point %s: %m", mount_tmp);
×
961
                goto finish;
×
962
        }
963

964
        mount_tmp_created = true;
×
965

966
        if (flags & MOUNT_IN_NAMESPACE_IS_IMAGE)
×
967
                r = verity_dissect_and_mount(
×
968
                                chased_src_fd,
969
                                chased_src_path,
970
                                mount_tmp,
971
                                options,
972
                                image_policy,
973
                                /* image_filter= */ NULL,
974
                                /* extension_release_data= */ NULL,
975
                                /* required_class= */ _IMAGE_CLASS_INVALID,
976
                                /* verity= */ NULL,
977
                                /* ret_image= */ NULL);
978
        else
979
                r = mount_follow_verbose(LOG_DEBUG, FORMAT_PROC_FD_PATH(chased_src_fd), mount_tmp, NULL, MS_BIND, NULL);
×
980
        if (r < 0)
×
UNCOV
981
                goto finish;
×
982

UNCOV
983
        mount_tmp_mounted = true;
×
984

985
        /* Third, we remount the new bind mount read-only if requested. */
986
        if (flags & MOUNT_IN_NAMESPACE_READ_ONLY) {
×
987
                r = mount_nofollow_verbose(LOG_DEBUG, NULL, mount_tmp, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
×
988
                if (r < 0)
×
UNCOV
989
                        goto finish;
×
990
        }
991

992
        /* Fourth, we move the new bind mount into the propagation directory. This way it will appear there read-only
993
         * right-away. */
994

995
        mount_outside = strjoina(propagate_path, "/XXXXXX");
×
996
        if ((flags & MOUNT_IN_NAMESPACE_IS_IMAGE) || S_ISDIR(chased_src_st->st_mode))
×
UNCOV
997
                r = mkdtemp(mount_outside) ? 0 : -errno;
×
998
        else {
999
                r = mkostemp_safe(mount_outside);
×
UNCOV
1000
                safe_close(r);
×
1001
        }
1002
        if (r < 0) {
×
1003
                log_debug_errno(r, "Cannot create propagation file or directory %s: %m", mount_outside);
×
UNCOV
1004
                goto finish;
×
1005
        }
1006

UNCOV
1007
        mount_outside_created = true;
×
1008

1009
        r = mount_nofollow_verbose(LOG_DEBUG, mount_tmp, mount_outside, NULL, MS_MOVE, NULL);
×
1010
        if (r < 0)
×
UNCOV
1011
                goto finish;
×
1012

1013
        mount_outside_mounted = true;
×
UNCOV
1014
        mount_tmp_mounted = false;
×
1015

1016
        if ((flags & MOUNT_IN_NAMESPACE_IS_IMAGE) || S_ISDIR(chased_src_st->st_mode))
×
UNCOV
1017
                (void) rmdir(mount_tmp);
×
1018
        else
1019
                (void) unlink(mount_tmp);
×
UNCOV
1020
        mount_tmp_created = false;
×
1021

1022
        (void) umount_verbose(LOG_DEBUG, mount_slave, UMOUNT_NOFOLLOW);
×
UNCOV
1023
        mount_slave_mounted = false;
×
1024

1025
        (void) rmdir(mount_slave);
×
UNCOV
1026
        mount_slave_created = false;
×
1027

1028
        if (pipe2(errno_pipe_fd, O_CLOEXEC|O_NONBLOCK) < 0) {
×
1029
                log_debug_errno(errno, "Failed to create pipe: %m");
×
UNCOV
1030
                goto finish;
×
1031
        }
1032

UNCOV
1033
        r = namespace_fork(
×
1034
                        "(sd-bindmnt)",
1035
                        "(sd-bindmnt-inner)",
1036
                        /* except_fds= */ NULL,
1037
                        /* n_except_fds= */ 0,
1038
                        FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGTERM,
1039
                        pidns_fd,
1040
                        mntns_fd,
1041
                        /* netns_fd= */ -EBADF,
1042
                        /* userns_fd= */ -EBADF,
1043
                        root_fd,
1044
                        &child);
1045
        if (r < 0)
×
1046
                goto finish;
×
1047
        if (r == 0) {
×
UNCOV
1048
                _cleanup_free_ char *mount_outside_fn = NULL, *mount_inside = NULL;
×
1049

UNCOV
1050
                errno_pipe_fd[0] = safe_close(errno_pipe_fd[0]);
×
1051

1052
                _cleanup_close_ int dest_fd = -EBADF;
×
1053
                _cleanup_free_ char *dest_fn = NULL;
×
1054
                r = chase(dest, /* root= */ NULL, CHASE_PARENT|CHASE_EXTRACT_FILENAME|((flags & MOUNT_IN_NAMESPACE_MAKE_FILE_OR_DIRECTORY) ? CHASE_MKDIR_0755 : 0), &dest_fn, &dest_fd);
×
1055
                if (r < 0)
×
1056
                        log_debug_errno(r, "Failed to pin parent directory of mount '%s', ignoring: %m", dest);
×
1057
                else if (flags & MOUNT_IN_NAMESPACE_MAKE_FILE_OR_DIRECTORY) {
×
1058
                        r = make_mount_point_inode_from_mode(dest_fd, dest_fn, (flags & MOUNT_IN_NAMESPACE_IS_IMAGE) ? S_IFDIR : chased_src_st->st_mode, 0700);
×
1059
                        if (r < 0)
×
UNCOV
1060
                                log_debug_errno(r, "Failed to make mount point inode of mount '%s', ignoring: %m", dest);
×
1061
                }
1062

1063
                /* Fifth, move the mount to the right place inside */
1064
                r = path_extract_filename(mount_outside, &mount_outside_fn);
×
1065
                if (r < 0) {
×
1066
                        log_debug_errno(r, "Failed to extract filename from propagation file or directory '%s': %m", mount_outside);
×
UNCOV
1067
                        report_errno_and_exit(errno_pipe_fd[1], r);
×
1068
                }
1069

1070
                mount_inside = path_join(incoming_path, mount_outside_fn);
×
1071
                if (!mount_inside)
×
UNCOV
1072
                        report_errno_and_exit(errno_pipe_fd[1], log_oom_debug());
×
1073

1074
                r = mount_nofollow_verbose(LOG_DEBUG, mount_inside, dest_fd >= 0 ? FORMAT_PROC_FD_PATH(dest_fd) : dest, /* fstype= */ NULL, MS_MOVE, /* options= */ NULL);
×
1075
                if (r < 0)
×
UNCOV
1076
                        report_errno_and_exit(errno_pipe_fd[1], r);
×
1077

UNCOV
1078
                _exit(EXIT_SUCCESS);
×
1079
        }
1080

UNCOV
1081
        errno_pipe_fd[1] = safe_close(errno_pipe_fd[1]);
×
1082

1083
        r = wait_for_terminate_and_check("(sd-bindmnt)", child, 0);
×
1084
        if (r < 0) {
×
1085
                log_debug_errno(r, "Failed to wait for child: %m");
×
UNCOV
1086
                goto finish;
×
1087
        }
1088
        if (r != EXIT_SUCCESS) {
×
1089
                if (read(errno_pipe_fd[0], &r, sizeof(r)) == sizeof(r))
×
UNCOV
1090
                        log_debug_errno(r, "Failed to mount: %m");
×
1091
                else
1092
                        log_debug("Child failed.");
×
UNCOV
1093
                goto finish;
×
1094
        }
1095

1096
finish:
×
1097
        if (mount_outside_mounted)
×
1098
                (void) umount_verbose(LOG_DEBUG, mount_outside, UMOUNT_NOFOLLOW);
×
1099
        if (mount_outside_created) {
×
1100
                if ((flags & MOUNT_IN_NAMESPACE_IS_IMAGE) || S_ISDIR(chased_src_st->st_mode))
×
UNCOV
1101
                        (void) rmdir(mount_outside);
×
1102
                else
UNCOV
1103
                        (void) unlink(mount_outside);
×
1104
        }
1105

1106
        if (mount_tmp_mounted)
×
1107
                (void) umount_verbose(LOG_DEBUG, mount_tmp, UMOUNT_NOFOLLOW);
×
1108
        if (mount_tmp_created) {
×
1109
                if ((flags & MOUNT_IN_NAMESPACE_IS_IMAGE) || S_ISDIR(chased_src_st->st_mode))
×
UNCOV
1110
                        (void) rmdir(mount_tmp);
×
1111
                else
UNCOV
1112
                        (void) unlink(mount_tmp);
×
1113
        }
1114

1115
        if (mount_slave_mounted)
×
1116
                (void) umount_verbose(LOG_DEBUG, mount_slave, UMOUNT_NOFOLLOW);
×
1117
        if (mount_slave_created)
×
UNCOV
1118
                (void) rmdir(mount_slave);
×
1119

UNCOV
1120
        return r;
×
1121
}
1122

1123
static int mount_in_namespace(
6✔
1124
                const PidRef *target,
1125
                const char *propagate_path,
1126
                const char *incoming_path,
1127
                const char *src,
1128
                const char *dest,
1129
                MountInNamespaceFlags flags,
1130
                const MountOptions *options,
1131
                const ImagePolicy *image_policy) {
1132

1133
        _cleanup_close_ int mntns_fd = -EBADF, root_fd = -EBADF, pidns_fd = -EBADF, chased_src_fd = -EBADF;
18✔
1134
        _cleanup_free_ char *chased_src_path = NULL;
6✔
1135
        struct stat st;
6✔
1136
        int r;
6✔
1137

1138
        assert(propagate_path);
6✔
1139
        assert(incoming_path);
6✔
1140
        assert(src);
6✔
1141
        assert(dest);
6✔
1142
        assert((flags & MOUNT_IN_NAMESPACE_IS_IMAGE) || (!options && !image_policy));
6✔
1143

1144
        if (!pidref_is_set(target))
12✔
1145
                return -ESRCH;
1146

1147
        r = pidref_namespace_open(target, &pidns_fd, &mntns_fd, /* ret_netns_fd = */ NULL, /* ret_userns_fd = */ NULL, &root_fd);
6✔
1148
        if (r < 0)
6✔
UNCOV
1149
                return log_debug_errno(r, "Failed to retrieve FDs of the target process' namespace: %m");
×
1150

1151
        r = is_our_namespace(mntns_fd, NAMESPACE_MOUNT);
6✔
1152
        if (r < 0)
6✔
UNCOV
1153
                return log_debug_errno(r, "Failed to determine if mount namespaces are equal: %m");
×
1154
        /* We can't add new mounts at runtime if the process wasn't started in a namespace */
1155
        if (r > 0)
6✔
UNCOV
1156
                return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to activate bind mount in target, not running in a mount namespace.");
×
1157

1158
        r = chase(src, NULL, 0, &chased_src_path, &chased_src_fd);
6✔
1159
        if (r < 0)
6✔
UNCOV
1160
                return log_debug_errno(r, "Failed to resolve source path '%s': %m", src);
×
1161
        log_debug("Chased source path '%s': %s", src, chased_src_path);
6✔
1162

1163
        if (fstat(chased_src_fd, &st) < 0)
6✔
UNCOV
1164
                return log_debug_errno(errno, "Failed to stat() resolved source path '%s': %m", src);
×
1165
        if (S_ISLNK(st.st_mode)) /* This shouldn't really happen, given that we just chased the symlinks above, but let's better be safe… */
6✔
UNCOV
1166
                return log_debug_errno(SYNTHETIC_ERRNO(ELOOP), "Source path '%s' can't be a symbolic link.", src);
×
1167

1168
        if (!mount_new_api_supported()) /* Fallback if we can't use the new mount API */
6✔
UNCOV
1169
                return mount_in_namespace_legacy(
×
1170
                                chased_src_path,
1171
                                chased_src_fd,
1172
                                &st,
1173
                                propagate_path,
1174
                                incoming_path,
1175
                                dest,
1176
                                pidns_fd,
1177
                                mntns_fd,
1178
                                root_fd,
1179
                                flags,
1180
                                options,
1181
                                image_policy);
1182

UNCOV
1183
        _cleanup_(dissected_image_unrefp) DissectedImage *img = NULL;
×
1184
        _cleanup_close_ int new_mount_fd = -EBADF;
6✔
1185
        _cleanup_close_pair_ int errno_pipe_fd[2] = EBADF_PAIR;
6✔
1186
        pid_t child;
6✔
1187

1188
        if (flags & MOUNT_IN_NAMESPACE_IS_IMAGE) {
6✔
1189
                r = verity_dissect_and_mount(
2✔
1190
                                chased_src_fd,
1191
                                chased_src_path,
1192
                                /* dest= */ NULL,
1193
                                options,
1194
                                image_policy,
1195
                                /* image_filter= */ NULL,
1196
                                /* extension_release_data= */ NULL,
1197
                                /* required_class= */ _IMAGE_CLASS_INVALID,
1198
                                /* verity= */ NULL,
1199
                                &img);
1200
                if (r < 0)
2✔
UNCOV
1201
                        return log_debug_errno(r,
×
1202
                                               "Failed to dissect and mount image '%s': %m",
1203
                                               chased_src_path);
1204
        } else {
1205
                new_mount_fd = open_tree(
4✔
1206
                                chased_src_fd,
1207
                                "",
1208
                                OPEN_TREE_CLONE|OPEN_TREE_CLOEXEC|AT_SYMLINK_NOFOLLOW|AT_EMPTY_PATH);
1209
                if (new_mount_fd < 0)
4✔
UNCOV
1210
                        return log_debug_errno(
×
1211
                                        errno,
1212
                                        "Failed to open mount source '%s': %m",
1213
                                        chased_src_path);
1214

1215
                if ((flags & MOUNT_IN_NAMESPACE_READ_ONLY) && mount_setattr(new_mount_fd, "", AT_EMPTY_PATH,
4✔
UNCOV
1216
                                               &(struct mount_attr) {
×
1217
                                                       .attr_set = MOUNT_ATTR_RDONLY,
1218
                                               }, MOUNT_ATTR_SIZE_VER0) < 0)
UNCOV
1219
                        return log_debug_errno(errno,
×
1220
                                               "Failed to set mount for '%s' to read only: %m",
1221
                                               chased_src_path);
1222
        }
1223

1224
        if (pipe2(errno_pipe_fd, O_CLOEXEC|O_NONBLOCK) < 0)
6✔
UNCOV
1225
                return log_debug_errno(errno, "Failed to create pipe: %m");
×
1226

1227
        r = namespace_fork("(sd-bindmnt)",
6✔
1228
                           "(sd-bindmnt-inner)",
1229
                           /* except_fds= */ NULL,
1230
                           /* n_except_fds= */ 0,
1231
                           FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGTERM,
1232
                           pidns_fd,
1233
                           mntns_fd,
1234
                           /* netns_fd= */ -EBADF,
1235
                           /* userns_fd= */ -EBADF,
1236
                           root_fd,
1237
                           &child);
1238
        if (r < 0)
12✔
UNCOV
1239
                return log_debug_errno(r, "Failed to fork off mount helper into namespace: %m");
×
1240
        if (r == 0) {
12✔
1241
                errno_pipe_fd[0] = safe_close(errno_pipe_fd[0]);
6✔
1242

UNCOV
1243
                _cleanup_close_ int dest_fd = -EBADF;
×
UNCOV
1244
                _cleanup_free_ char *dest_fn = NULL;
×
1245
                r = chase(dest, /* root= */ NULL, CHASE_PARENT|CHASE_EXTRACT_FILENAME|((flags & MOUNT_IN_NAMESPACE_MAKE_FILE_OR_DIRECTORY) ? CHASE_MKDIR_0755 : 0), &dest_fn, &dest_fd);
6✔
1246
                if (r < 0)
6✔
UNCOV
1247
                        report_errno_and_exit(errno_pipe_fd[1], r);
×
1248

1249
                if (flags & MOUNT_IN_NAMESPACE_MAKE_FILE_OR_DIRECTORY)
6✔
1250
                        (void) make_mount_point_inode_from_mode(dest_fd, dest_fn, img ? S_IFDIR : st.st_mode, 0700);
6✔
1251

1252
                if (img) {
6✔
1253
                        DissectImageFlags f =
2✔
1254
                                DISSECT_IMAGE_TRY_ATOMIC_MOUNT_EXCHANGE |
1255
                                DISSECT_IMAGE_ALLOW_USERSPACE_VERITY;
1256

1257
                        if (flags & MOUNT_IN_NAMESPACE_MAKE_FILE_OR_DIRECTORY)
2✔
1258
                                f |= DISSECT_IMAGE_MKDIR;
2✔
1259

1260
                        if (flags & MOUNT_IN_NAMESPACE_READ_ONLY)
2✔
UNCOV
1261
                                f |= DISSECT_IMAGE_READ_ONLY;
×
1262

1263
                        r = dissected_image_mount(
2✔
1264
                                        img,
1265
                                        dest,
1266
                                        /* uid_shift= */ UID_INVALID,
1267
                                        /* uid_range= */ UID_INVALID,
1268
                                        /* userns_fd= */ -EBADF,
1269
                                        f);
1270
                } else
1271
                        r = mount_exchange_graceful(new_mount_fd, dest, /* mount_beneath= */ true);
4✔
1272

1273
                report_errno_and_exit(errno_pipe_fd[1], r);
6✔
1274
        }
1275

1276
        errno_pipe_fd[1] = safe_close(errno_pipe_fd[1]);
6✔
1277

1278
        r = wait_for_terminate_and_check("(sd-bindmnt)", child, 0);
6✔
1279
        if (r < 0)
6✔
1280
                return log_debug_errno(r, "Failed to wait for child: %m");
×
1281
        if (r != EXIT_SUCCESS) {
6✔
UNCOV
1282
                if (read(errno_pipe_fd[0], &r, sizeof(r)) == sizeof(r))
×
1283
                        return log_debug_errno(r, "Failed to mount into namespace: %m");
×
1284

UNCOV
1285
                return log_debug_errno(SYNTHETIC_ERRNO(EPROTO), "Child failed.");
×
1286
        }
1287

1288
        return 0;
1289
}
1290

1291
int bind_mount_in_namespace(
4✔
1292
                const PidRef *target,
1293
                const char *propagate_path,
1294
                const char *incoming_path,
1295
                const char *src,
1296
                const char *dest,
1297
                MountInNamespaceFlags flags) {
1298

1299
        return mount_in_namespace(target,
8✔
1300
                                  propagate_path,
1301
                                  incoming_path,
1302
                                  src,
1303
                                  dest,
1304
                                  flags & ~MOUNT_IN_NAMESPACE_IS_IMAGE,
4✔
1305
                                  /* options = */ NULL,
1306
                                  /* image_policy = */ NULL);
1307
}
1308

1309
int mount_image_in_namespace(
2✔
1310
                const PidRef *target,
1311
                const char *propagate_path,
1312
                const char *incoming_path,
1313
                const char *src,
1314
                const char *dest,
1315
                MountInNamespaceFlags flags,
1316
                const MountOptions *options,
1317
                const ImagePolicy *image_policy) {
1318

1319
        return mount_in_namespace(target,
4✔
1320
                                  propagate_path,
1321
                                  incoming_path,
1322
                                  src,
1323
                                  dest,
1324
                                  flags | MOUNT_IN_NAMESPACE_IS_IMAGE,
2✔
1325
                                  options,
1326
                                  image_policy);
1327
}
1328

1329
int make_mount_point(const char *path) {
23✔
1330
        int r;
23✔
1331

1332
        assert(path);
23✔
1333

1334
        /* If 'path' is already a mount point, does nothing and returns 0. If it is not it makes it one, and returns 1. */
1335

1336
        r = path_is_mount_point(path);
23✔
1337
        if (r < 0)
23✔
UNCOV
1338
                return log_debug_errno(r, "Failed to determine whether '%s' is a mount point: %m", path);
×
1339
        if (r > 0)
23✔
1340
                return 0;
1341

1342
        r = mount_nofollow_verbose(LOG_DEBUG, path, path, NULL, MS_BIND|MS_REC, NULL);
9✔
1343
        if (r < 0)
9✔
UNCOV
1344
                return r;
×
1345

1346
        return 1;
1347
}
1348

1349
int fd_make_mount_point(int fd) {
11✔
1350
        int r;
11✔
1351

1352
        assert(fd >= 0);
11✔
1353

1354
        r = is_mount_point_at(fd, NULL, 0);
11✔
1355
        if (r < 0)
11✔
UNCOV
1356
                return log_debug_errno(r, "Failed to determine whether file descriptor is a mount point: %m");
×
1357
        if (r > 0)
11✔
1358
                return 0;
1359

1360
        r = mount_follow_verbose(LOG_DEBUG, FORMAT_PROC_FD_PATH(fd), FORMAT_PROC_FD_PATH(fd), NULL, MS_BIND|MS_REC, NULL);
1✔
1361
        if (r < 0)
1✔
UNCOV
1362
                return r;
×
1363

1364
        return 1;
1365
}
1366

1367
int make_userns(uid_t uid_shift,
76✔
1368
                uid_t uid_range,
1369
                uid_t source_owner,
1370
                uid_t dest_owner,
1371
                RemountIdmapping idmapping) {
1372

1373
        _cleanup_close_ int userns_fd = -EBADF;
76✔
1374
        _cleanup_free_ char *line = NULL;
76✔
1375
        uid_t source_base = 0;
76✔
1376

1377
        /* Allocates a userns file descriptor with the mapping we need. For this we'll fork off a child
1378
         * process whose only purpose is to give us a new user namespace. It's killed when we got it. */
1379

1380
        if (!userns_shift_range_valid(uid_shift, uid_range))
76✔
UNCOV
1381
                return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid UID range for user namespace.");
×
1382

1383
        switch (idmapping) {
76✔
1384

1385
        case REMOUNT_IDMAPPING_FOREIGN_WITH_HOST_ROOT:
2✔
1386
                source_base = FOREIGN_UID_BASE;
2✔
1387
                _fallthrough_;
74✔
1388

1389
        case REMOUNT_IDMAPPING_NONE:
74✔
1390
        case REMOUNT_IDMAPPING_HOST_ROOT:
1391

1392
                if (asprintf(&line,
74✔
1393
                             UID_FMT " " UID_FMT " " UID_FMT "\n",
1394
                             source_base, uid_shift, uid_range) < 0)
UNCOV
1395
                        return log_oom_debug();
×
1396

1397
                /* If requested we'll include an entry in the mapping so that the host root user can make
1398
                 * changes to the uidmapped mount like it normally would. Specifically, we'll map the user
1399
                 * with UID_MAPPED_ROOT on the backing fs to UID 0. This is useful, since nspawn code wants
1400
                 * to create various missing inodes in the OS tree before booting into it, and this becomes
1401
                 * very easy and straightforward to do if it can just do it under its own regular UID. Note
1402
                 * that in that case the container's runtime uidmap (i.e. the one the container payload
1403
                 * processes run in) will leave this UID unmapped, i.e. if we accidentally leave files owned
1404
                 * by host root in the already uidmapped tree around they'll show up as owned by 'nobody',
1405
                 * which is safe. (Of course, we shouldn't leave such inodes around, but always chown() them
1406
                 * to the container's own UID range, but it's good to have a safety net, in case we
1407
                 * forget it.) */
1408
                if (idmapping == REMOUNT_IDMAPPING_HOST_ROOT)
74✔
1409
                        if (strextendf(&line,
72✔
1410
                                       UID_FMT " " UID_FMT " " UID_FMT "\n",
1411
                                       UID_MAPPED_ROOT, (uid_t) 0u, (uid_t) 1u) < 0)
UNCOV
1412
                                return log_oom_debug();
×
1413

1414
                break;
1415

UNCOV
1416
        case REMOUNT_IDMAPPING_HOST_OWNER:
×
1417
                /* Remap the owner of the bind mounted directory to the root user within the container. This
1418
                 * way every file written by root within the container to the bind-mounted directory will
1419
                 * be owned by the original user from the host. All other users will remain unmapped. */
UNCOV
1420
                if (asprintf(&line,
×
1421
                             UID_FMT " " UID_FMT " " UID_FMT "\n",
1422
                             source_owner, uid_shift, (uid_t) 1u) < 0)
UNCOV
1423
                        return log_oom_debug();
×
1424
                break;
1425

1426
        case REMOUNT_IDMAPPING_HOST_OWNER_TO_TARGET_OWNER:
2✔
1427
                /* Remap the owner of the bind mounted directory to the owner of the target directory
1428
                 * within the container. This way every file written by target directory owner within the
1429
                 * container to the bind-mounted directory will be owned by the original host user.
1430
                 * All other users will remain unmapped. */
1431
                if (asprintf(&line,
2✔
1432
                             UID_FMT " " UID_FMT " " UID_FMT "\n",
1433
                             source_owner, dest_owner, (uid_t) 1u) < 0)
UNCOV
1434
                        return log_oom_debug();
×
1435
                break;
1436

UNCOV
1437
        default:
×
UNCOV
1438
                assert_not_reached();
×
1439
        }
1440

1441
        /* We always assign the same UID and GID ranges */
1442
        userns_fd = userns_acquire(line, line, /* setgroups_deny= */ true);
76✔
1443
        if (userns_fd < 0)
76✔
UNCOV
1444
                return log_debug_errno(userns_fd, "Failed to acquire new userns: %m");
×
1445

1446
        return TAKE_FD(userns_fd);
1447
}
1448

1449
int remount_idmap_fd(
87✔
1450
                char **paths,
1451
                int userns_fd,
1452
                uint64_t extra_mount_attr_set) {
1453

1454
        int r;
87✔
1455

1456
        assert(userns_fd >= 0);
87✔
1457

1458
        /* This remounts all specified paths with the specified userns as idmap. It will do so in the
1459
         * order specified in the strv: the expectation is that the top-level directories are at the
1460
         * beginning, and nested directories in the right, so that the tree can be built correctly from left
1461
         * to right. */
1462

1463
        size_t n = strv_length(paths);
87✔
1464
        if (n == 0) /* Nothing to do? */
87✔
1465
                return 0;
87✔
1466

1467
        int *mount_fds = NULL;
87✔
1468
        size_t n_mounts_fds = 0;
87✔
1469

1470
        mount_fds = new(int, n);
87✔
1471
        if (!mount_fds)
87✔
UNCOV
1472
                return log_oom_debug();
×
1473

1474
        CLEANUP_ARRAY(mount_fds, n_mounts_fds, close_many_and_free);
87✔
1475

1476
        for (size_t i = 0; i < n; i++) {
172✔
1477
                int mntfd;
87✔
1478

1479
                /* Clone the mount point */
1480
                mntfd = mount_fds[n_mounts_fds] = open_tree(-EBADF, paths[i], OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC);
87✔
1481
                if (mount_fds[n_mounts_fds] < 0)
87✔
UNCOV
1482
                        return log_debug_errno(errno, "Failed to open tree of mounted filesystem '%s': %m", paths[i]);
×
1483

1484
                n_mounts_fds++;
87✔
1485

1486
                /* Set the user namespace mapping attribute on the cloned mount point */
1487
                if (mount_setattr(mntfd, "", AT_EMPTY_PATH,
87✔
1488
                                  &(struct mount_attr) {
87✔
1489
                                          .attr_set = MOUNT_ATTR_IDMAP | extra_mount_attr_set,
87✔
1490
                                          .userns_fd = userns_fd,
1491
                                  }, sizeof(struct mount_attr)) < 0)
1492
                        return log_debug_errno(errno, "Failed to change bind mount attributes for clone of '%s': %m", paths[i]);
2✔
1493
        }
1494

1495
        for (size_t i = n; i > 0; i--) { /* Unmount the paths right-to-left */
170✔
1496
                /* Remove the old mount points now that we have a idmapped mounts as replacement for all of them */
1497
                r = umount_verbose(LOG_DEBUG, paths[i-1], UMOUNT_NOFOLLOW);
85✔
1498
                if (r < 0)
85✔
1499
                        return r;
1500
        }
1501

1502
        for (size_t i = 0; i < n; i++) { /* Mount the replacement mounts left-to-right */
170✔
1503
                /* And place the cloned version in its place */
1504
                log_debug("Mounting idmapped fs to '%s'", paths[i]);
85✔
1505
                if (move_mount(mount_fds[i], "", -EBADF, paths[i], MOVE_MOUNT_F_EMPTY_PATH) < 0)
85✔
UNCOV
1506
                        return log_debug_errno(errno, "Failed to attach UID mapped mount to '%s': %m", paths[i]);
×
1507
        }
1508

1509
        return 0;
1510
}
1511

1512
int remount_idmap(
74✔
1513
                char **p,
1514
                uid_t uid_shift,
1515
                uid_t uid_range,
1516
                uid_t source_owner,
1517
                uid_t dest_owner,
1518
                RemountIdmapping idmapping) {
1519

1520
        _cleanup_close_ int userns_fd = -EBADF;
74✔
1521

1522
        userns_fd = make_userns(uid_shift, uid_range, source_owner, dest_owner, idmapping);
74✔
1523
        if (userns_fd < 0)
74✔
1524
                return userns_fd;
1525

1526
        return remount_idmap_fd(p, userns_fd, /* extra_mount_attr_set= */ 0);
74✔
1527
}
1528

1529
/* Releases what a single SubMount entry references: closes its mount fd and frees its path. Both fields are
 * reset to the values returned by safe_close() and mfree() respectively. The entry itself is not freed. */
static void sub_mount_clear(SubMount *s) {
        assert(s);

        s->mount_fd = safe_close(s->mount_fd);
        s->path = mfree(s->path);
}
5,343✔
1535

1536
/* Destroys an array of 'n' SubMount entries: every entry is cleared, then the array itself is freed.
 * 's' may be NULL only if 'n' is zero. */
void sub_mount_array_free(SubMount *s, size_t n) {
        assert(s || n == 0);

        for (SubMount *entry = s; n > 0; entry++, n--)
                sub_mount_clear(entry);

        free(s);
}
1,243✔
1544

1545
static int sub_mount_compare(const SubMount *a, const SubMount *b) {
6,652✔
1546
        assert(a);
6,652✔
1547
        assert(b);
6,652✔
1548
        assert(a->path);
6,652✔
1549
        assert(b->path);
6,652✔
1550

1551
        return path_compare(a->path, b->path);
6,652✔
1552
}
1553

1554
/* Walks the array 's' of 'n' entries front to back and clears every entry whose path has the path of the
 * most recent entry that was kept as prefix (according to path_startswith()). The first entry is always
 * kept. Cleared entries stay in the array, the array is not compacted. */
static void sub_mount_drop(SubMount *s, size_t n) {
        assert(s || n == 0);

        if (n < 2) /* With fewer than two entries there's nothing to compare */
                return;

        SubMount *kept = s;

        for (SubMount *cur = s + 1; cur < s + n; cur++) {
                if (!path_startswith(cur->path, kept->path)) {
                        /* Not below the entry we kept last, hence keep this one too and compare
                         * the following entries against it. */
                        kept = cur;
                        continue;
                }

                sub_mount_clear(cur);
        }
}
1,785✔
1564

1565
/* Builds an array of the mounts below 'prefix', as listed in the mount table parsed by
 * libmount_parse_mountinfo(). Table entries are skipped if path_startswith(path, prefix) yields NULL or an
 * empty string, if their mount ID cannot be determined or differs from the one libmount reports, or if
 * opening them fails with ENOENT. For each remaining entry a copy of the path and an O_PATH fd are stored.
 *
 * The array is sorted with sub_mount_compare() and then passed through sub_mount_drop(), which clears (but
 * does not remove) entries located below another entry, hence callers have to expect entries without path.
 *
 * On success returns 0 and passes ownership of the array to the caller via 'ret_mounts'/'ret_n_mounts'. On
 * failure returns a negative errno-style value and leaves the return parameters untouched. */
int get_sub_mounts(const char *prefix, SubMount **ret_mounts, size_t *ret_n_mounts) {

        _cleanup_(mnt_free_tablep) struct libmnt_table *table = NULL;
        _cleanup_(mnt_free_iterp) struct libmnt_iter *iter = NULL;
        SubMount *found = NULL;
        size_t n_found = 0;
        int r;

        CLEANUP_ARRAY(found, n_found, sub_mount_array_free);

        assert(prefix);
        assert(ret_mounts);
        assert(ret_n_mounts);

        r = libmount_parse_mountinfo(/* source = */ NULL, &table, &iter);
        if (r < 0)
                return log_debug_errno(r, "Failed to parse /proc/self/mountinfo: %m");

        for (;;) {
                _cleanup_close_ int fd = -EBADF;
                _cleanup_free_ char *copy = NULL;
                struct libmnt_fs *fs;
                const char *where;
                int libmount_id, actual_id;

                r = mnt_table_next_fs(table, iter, &fs);
                if (r < 0)
                        return log_debug_errno(r, "Failed to get next entry from /proc/self/mountinfo: %m");
                if (r == 1)
                        break; /* EOF */

                where = mnt_fs_get_target(fs);
                if (!where)
                        continue;

                /* Not below the prefix (or no remainder left after the prefix)? Then skip. */
                if (isempty(path_startswith(where, prefix)))
                        continue;

                libmount_id = mnt_fs_get_id(fs);

                r = path_get_mnt_id(where, &actual_id);
                if (r < 0) {
                        log_debug_errno(r, "Failed to get mount ID of '%s', ignoring: %m", where);
                        continue;
                }

                if (libmount_id != actual_id) {
                        /* The path may be hidden by another over-mount or already remounted. */
                        log_debug("The mount IDs of '%s' obtained by libmount and path_get_mnt_id() are different (%i vs %i), ignoring.",
                                  where, libmount_id, actual_id);
                        continue;
                }

                fd = open(where, O_CLOEXEC|O_PATH);
                if (fd < 0) {
                        /* The path may be hidden by another over-mount or already unmounted. */
                        if (errno == ENOENT)
                                continue;

                        return log_debug_errno(errno, "Failed to open subtree of mounted filesystem '%s': %m", where);
                }

                copy = strdup(where);
                if (!copy)
                        return log_oom_debug();

                if (!GREEDY_REALLOC(found, n_found + 1))
                        return log_oom_debug();

                /* The array takes over both the string and the fd */
                found[n_found++] = (SubMount) {
                        .path = TAKE_PTR(copy),
                        .mount_fd = TAKE_FD(fd),
                };
        }

        typesafe_qsort(found, n_found, sub_mount_compare);
        sub_mount_drop(found, n_found);

        *ret_mounts = TAKE_PTR(found);
        *ret_n_mounts = n_found;
        return 0;
}
1644

1645
int bind_mount_submounts(
1,247✔
1646
                const char *source,
1647
                const char *target) {
1648

1649
        SubMount *mounts = NULL;
1,247✔
1650
        size_t n = 0;
1,247✔
1651
        int ret = 0, r;
1,247✔
1652

1653
        /* Bind mounts all child mounts of 'source' to 'target'. Useful when setting up a new procfs instance
1654
         * with new mount options to copy the original submounts over. */
1655

1656
        assert(source);
1,247✔
1657
        assert(target);
1,247✔
1658

1659
        CLEANUP_ARRAY(mounts, n, sub_mount_array_free);
1,247✔
1660

1661
        r = get_sub_mounts(source, &mounts, &n);
1,247✔
1662
        if (r < 0)
1,247✔
1663
                return r;
1664

1665
        FOREACH_ARRAY(m, mounts, n) {
5,833✔
1666
                _cleanup_free_ char *t = NULL;
4,586✔
1667
                const char *suffix;
4,586✔
1668

1669
                if (isempty(m->path))
4,586✔
1670
                        continue;
753✔
1671

1672
                assert_se(suffix = path_startswith(m->path, source));
3,833✔
1673

1674
                t = path_join(target, suffix);
3,833✔
1675
                if (!t)
3,833✔
UNCOV
1676
                        return -ENOMEM;
×
1677

1678
                r = path_is_mount_point(t);
3,833✔
1679
                if (r < 0) {
3,833✔
1680
                        log_debug_errno(r, "Failed to detect if '%s' already is a mount point, ignoring: %m", t);
9✔
1681
                        continue;
9✔
1682
                }
1683
                if (r > 0) {
3,824✔
UNCOV
1684
                        log_debug("Not bind mounting '%s' from '%s' to '%s', since there's already a mountpoint.", suffix, source, target);
×
UNCOV
1685
                        continue;
×
1686
                }
1687

1688
                r = mount_follow_verbose(LOG_DEBUG, FORMAT_PROC_FD_PATH(m->mount_fd), t, NULL, MS_BIND|MS_REC, NULL);
3,824✔
1689
                if (r < 0 && ret == 0)
3,824✔
1690
                        ret = r;
373✔
1691
        }
1692

1693
        return ret;
1694
}
1695

1696
int make_mount_point_inode_from_mode(int dir_fd, const char *dest, mode_t source_mode, mode_t target_mode) {
991✔
1697
        assert(dir_fd >= 0 || dir_fd == AT_FDCWD);
991✔
1698
        assert(dest);
991✔
1699

1700
        if (S_ISDIR(source_mode))
991✔
1701
                return mkdirat_label(dir_fd, dest, target_mode & 07777);
974✔
1702
        else
1703
                return RET_NERRNO(mknodat(dir_fd, dest, S_IFREG|(target_mode & 07666), 0)); /* Mask off X bit */
18✔
1704
}
1705

1706
int make_mount_point_inode_from_path(const char *source, const char *dest, mode_t access_mode) {
845✔
1707
        struct stat st;
845✔
1708

1709
        assert(source);
845✔
1710
        assert(dest);
845✔
1711

1712
        if (stat(source, &st) < 0)
845✔
UNCOV
1713
                return -errno;
×
1714

1715
        return make_mount_point_inode_from_mode(AT_FDCWD, dest, st.st_mode, access_mode);
845✔
1716
}
1717

1718
int trigger_automount_at(int dir_fd, const char *path) {
380✔
1719
        _cleanup_free_ char *nested = NULL;
760✔
1720

1721
        assert(dir_fd >= 0 || dir_fd == AT_FDCWD);
380✔
1722

1723
        nested = path_join(path, "a");
380✔
1724
        if (!nested)
380✔
1725
                return -ENOMEM;
1726

1727
        (void) faccessat(dir_fd, nested, F_OK, 0);
380✔
1728

1729
        return 0;
380✔
1730
}
1731

1732
/* Returns the tight set of mount flags used for credentials mounts: no device nodes, no executables, no
 * SUID, plus whatever ms_nosymfollow_supported() reports, plus MS_RDONLY if 'ro' is true. */
unsigned long credentials_fs_mount_flags(bool ro) {
        unsigned long flags = MS_NODEV|MS_NOEXEC|MS_NOSUID|ms_nosymfollow_supported();

        if (ro)
                flags |= MS_RDONLY;

        return flags;
}
1736

1737
int mount_credentials_fs(const char *path, size_t size, bool ro) {
1,960✔
1738
        _cleanup_free_ char *opts = NULL;
1,960✔
1739
        int r, noswap_supported;
1,960✔
1740

1741
        /* Mounts a file system we can place credentials in, i.e. with tight access modes right from the
1742
         * beginning, and ideally swapping turned off. In order of preference:
1743
         *
1744
         *      1. tmpfs if it supports "noswap"
1745
         *      2. ramfs
1746
         *      3. tmpfs if it doesn't support "noswap"
1747
         */
1748

1749
        noswap_supported = mount_option_supported("tmpfs", "noswap", NULL); /* Check explicitly to avoid kmsg noise */
1,960✔
1750
        if (noswap_supported > 0) {
1,960✔
1751
                _cleanup_free_ char *noswap_opts = NULL;
1,959✔
1752

1753
                if (asprintf(&noswap_opts, "mode=0700,nr_inodes=1024,size=%zu,noswap", size) < 0)
1,959✔
1754
                        return -ENOMEM;
1755

1756
                /* Best case: tmpfs with noswap (needs kernel >= 6.3) */
1757

1758
                r = mount_nofollow_verbose(
1,959✔
1759
                                LOG_DEBUG,
1760
                                "tmpfs",
1761
                                path,
1762
                                "tmpfs",
1763
                                credentials_fs_mount_flags(ro),
1764
                                noswap_opts);
1765
                if (r >= 0)
1,959✔
1766
                        return r;
1767
        }
1768

1769
        r = mount_nofollow_verbose(
1✔
1770
                        LOG_DEBUG,
1771
                        "ramfs",
1772
                        path,
1773
                        "ramfs",
1774
                        credentials_fs_mount_flags(ro),
1775
                        "mode=0700");
1776
        if (r >= 0)
1✔
1777
                return r;
1778

1779
        if (asprintf(&opts, "mode=0700,nr_inodes=1024,size=%zu", size) < 0)
1✔
1780
                return -ENOMEM;
1781

1782
        return mount_nofollow_verbose(
1,960✔
1783
                        LOG_DEBUG,
1784
                        "tmpfs",
1785
                        path,
1786
                        "tmpfs",
1787
                        credentials_fs_mount_flags(ro),
1788
                        opts);
1789
}
1790

1791
int make_fsmount(
2✔
1792
                int error_log_level,
1793
                const char *what,
1794
                const char *type,
1795
                unsigned long flags,
1796
                const char *options,
1797
                int userns_fd) {
1798

1799
        _cleanup_close_ int fs_fd = -EBADF, mnt_fd = -EBADF;
2✔
1800
        _cleanup_free_ char *o = NULL;
2✔
1801
        unsigned long f;
2✔
1802
        int r;
2✔
1803

1804
        assert(type);
2✔
1805
        assert(what);
2✔
1806

1807
        r = mount_option_mangle(options, flags, &f, &o);
2✔
1808
        if (r < 0)
2✔
UNCOV
1809
                return log_full_errno(
×
1810
                                error_log_level, r, "Failed to mangle mount options %s: %m",
1811
                                strempty(options));
1812

1813
        if (DEBUG_LOGGING) {
2✔
1814
                _cleanup_free_ char *fl = NULL;
2✔
1815
                (void) mount_flags_to_string(f, &fl);
2✔
1816

1817
                log_debug("Creating mount fd for %s (%s) (%s \"%s\")...",
4✔
1818
                        strna(what), strna(type), strnull(fl), strempty(o));
1819
        }
1820

1821
        fs_fd = fsopen(type, FSOPEN_CLOEXEC);
2✔
1822
        if (fs_fd < 0)
2✔
UNCOV
1823
                return log_full_errno(error_log_level, errno, "Failed to open superblock for \"%s\": %m", type);
×
1824

1825
        if (fsconfig(fs_fd, FSCONFIG_SET_STRING, "source", what, 0) < 0)
2✔
UNCOV
1826
                return log_full_errno(error_log_level, errno, "Failed to set mount source for \"%s\" to \"%s\": %m", type, what);
×
1827

1828
        if (FLAGS_SET(f, MS_RDONLY))
2✔
1829
                if (fsconfig(fs_fd, FSCONFIG_SET_FLAG, "ro", NULL, 0) < 0)
2✔
UNCOV
1830
                        return log_full_errno(error_log_level, errno, "Failed to set read only mount flag for \"%s\": %m", type);
×
1831

1832
        for (const char *p = o;;) {
2✔
UNCOV
1833
                _cleanup_free_ char *word = NULL;
×
1834
                char *eq;
2✔
1835

1836
                r = extract_first_word(&p, &word, ",", EXTRACT_KEEP_QUOTE);
2✔
1837
                if (r < 0)
2✔
UNCOV
1838
                        return log_full_errno(error_log_level, r, "Failed to parse mount option string \"%s\": %m", o);
×
1839
                if (r == 0)
2✔
1840
                        break;
1841

1842
                eq = strchr(word, '=');
×
1843
                if (eq) {
×
UNCOV
1844
                        *eq = 0;
×
1845
                        eq++;
×
1846

UNCOV
1847
                        if (fsconfig(fs_fd, FSCONFIG_SET_STRING, word, eq, 0) < 0)
×
1848
                                return log_full_errno(error_log_level, errno, "Failed to set mount option \"%s=%s\" for \"%s\": %m", word, eq, type);
×
1849
                } else {
UNCOV
1850
                        if (fsconfig(fs_fd, FSCONFIG_SET_FLAG, word, NULL, 0) < 0)
×
UNCOV
1851
                                return log_full_errno(error_log_level, errno, "Failed to set mount flag \"%s\" for \"%s\": %m", word, type);
×
1852
                }
1853
        }
1854

1855
        if (fsconfig(fs_fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0)
2✔
UNCOV
1856
                return log_full_errno(error_log_level, errno, "Failed to realize fs fd for \"%s\" (\"%s\"): %m", what, type);
×
1857

1858
        mnt_fd = fsmount(fs_fd, FSMOUNT_CLOEXEC, 0);
2✔
1859
        if (mnt_fd < 0)
2✔
UNCOV
1860
                return log_full_errno(error_log_level, errno, "Failed to create mount fd for \"%s\" (\"%s\"): %m", what, type);
×
1861

1862
        if (mount_setattr(mnt_fd, "", AT_EMPTY_PATH|AT_RECURSIVE,
2✔
1863
                          &(struct mount_attr) {
4✔
1864
                                  .attr_set = ms_flags_to_mount_attr(f) | (userns_fd >= 0 ? MOUNT_ATTR_IDMAP : 0),
4✔
1865
                                  .userns_fd = userns_fd,
1866
                          }, MOUNT_ATTR_SIZE_VER0) < 0)
UNCOV
1867
                return log_full_errno(error_log_level,
×
1868
                                      errno,
1869
                                      "Failed to set mount flags for \"%s\" (\"%s\"): %m",
1870
                                      what,
1871
                                      type);
1872

1873
        return TAKE_FD(mnt_fd);
2✔
1874
}
1875

1876
char* umount_and_unlink_and_free(char *p) {
1✔
1877
        if (!p)
1✔
1878
                return NULL;
1✔
1879

1880
        PROTECT_ERRNO;
2✔
1881
        (void) umount2(p, 0);
1✔
1882
        (void) unlink(p);
1✔
1883
        return mfree(p);
1✔
1884
}
1885

1886
int path_get_mount_info_at(
629✔
1887
                int dir_fd,
1888
                const char *path,
1889
                char **ret_fstype,
1890
                char **ret_options,
1891
                char **ret_source) {
1892

1893
        _cleanup_(mnt_free_tablep) struct libmnt_table *table = NULL;
629✔
1894
        _cleanup_(mnt_free_iterp) struct libmnt_iter *iter = NULL;
629✔
1895
        int r, mnt_id;
629✔
1896

1897
        assert(dir_fd >= 0 || dir_fd == AT_FDCWD);
629✔
1898

1899
        r = path_get_mnt_id_at(dir_fd, path, &mnt_id);
629✔
1900
        if (r < 0)
629✔
UNCOV
1901
                return log_debug_errno(r, "Failed to get mount ID: %m");
×
1902

1903
        /* When getting options is requested, we also need to parse utab, otherwise userspace options like
1904
         * "_netdev" will be lost. */
1905
        if (ret_options)
629✔
1906
                r = libmount_parse_with_utab(&table, &iter);
619✔
1907
        else
1908
                r = libmount_parse_mountinfo(/* source = */ NULL, &table, &iter);
10✔
1909
        if (r < 0)
629✔
UNCOV
1910
                return log_debug_errno(r, "Failed to parse /proc/self/mountinfo: %m");
×
1911

1912
        for (;;) {
3,437✔
1913
                struct libmnt_fs *fs;
2,033✔
1914

1915
                r = mnt_table_next_fs(table, iter, &fs);
2,033✔
1916
                if (r == 1)
2,033✔
1917
                        break; /* EOF */
1918
                if (r < 0)
2,033✔
1919
                        return log_debug_errno(r, "Failed to get next entry from /proc/self/mountinfo: %m");
629✔
1920

1921
                if (mnt_fs_get_id(fs) != mnt_id)
2,033✔
1922
                        continue;
1,404✔
1923

1924
                _cleanup_free_ char *fstype = NULL, *options = NULL, *source = NULL;
629✔
1925

1926
                if (ret_fstype) {
629✔
1927
                        fstype = strdup(strempty(mnt_fs_get_fstype(fs)));
619✔
1928
                        if (!fstype)
619✔
UNCOV
1929
                                return log_oom_debug();
×
1930
                }
1931

1932
                if (ret_options) {
629✔
1933
                        options = strdup(strempty(mnt_fs_get_options(fs)));
619✔
1934
                        if (!options)
619✔
UNCOV
1935
                                return log_oom_debug();
×
1936
                }
1937

1938
                if (ret_source) {
629✔
1939
                        source = strdup(strempty(mnt_fs_get_source(fs)));
10✔
1940
                        if (!source)
10✔
UNCOV
1941
                                return log_oom_debug();
×
1942
                }
1943

1944
                if (ret_fstype)
629✔
1945
                        *ret_fstype = TAKE_PTR(fstype);
619✔
1946
                if (ret_options)
629✔
1947
                        *ret_options = TAKE_PTR(options);
619✔
1948
                if (ret_source)
629✔
1949
                        *ret_source = TAKE_PTR(source);
10✔
1950

1951
                return 0;
1952
        }
1953

UNCOV
1954
        return log_debug_errno(SYNTHETIC_ERRNO(ESTALE), "Cannot find mount ID %i from /proc/self/mountinfo.", mnt_id);
×
1955
}
1956

1957
int path_is_network_fs_harder_at(int dir_fd, const char *path) {
637✔
1958
        _cleanup_close_ int fd = -EBADF;
637✔
1959
        int r;
637✔
1960

1961
        assert(dir_fd >= 0 || dir_fd == AT_FDCWD);
637✔
1962

1963
        fd = xopenat(dir_fd, path, O_PATH | O_CLOEXEC | O_NOFOLLOW);
637✔
1964
        if (fd < 0)
637✔
1965
                return fd;
1966

1967
        r = fd_is_network_fs(fd);
619✔
1968
        if (r != 0)
619✔
1969
                return r;
1970

1971
        _cleanup_free_ char *fstype = NULL, *options = NULL;
619✔
1972
        r = path_get_mount_info_at(fd, /* path = */ NULL, &fstype, &options, /* source = */ NULL);
619✔
1973
        if (r < 0)
619✔
1974
                return r;
1975

1976
        if (fstype_is_network(fstype))
619✔
1977
                return true;
1978

1979
        if (fstab_test_option(options, "_netdev\0"))
619✔
UNCOV
1980
                return true;
×
1981

1982
        return false;
1983
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc