• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

systemd / systemd / 20151578145

11 Dec 2025 05:38AM UTC coverage: 72.698% (-0.02%) from 72.713%
20151578145

push

github

web-flow
core: gracefully skip unknown policy designators in RootImagePolicy et al (#40060)

Usually we gracefully ignore unknown configuration parameters, so that
service files can be written by upstreams and used across a variegated
range of distributions with various versions of systemd, to avoid
forcing users to the minimum common denominator and only adding settings
that are supported by the oldest distro supported.

Image policies do not behave like this: any unknown partition or
policy designator causes the whole unit to fail to parse, resulting in a
hard error.

Change it so that parsing RootImagePolicy and friends via unit file or
D-Bus logs but otherwise ignores unknown specifiers, like other options
do.

This allows us to add new specifiers in the future, and users to adopt
them immediately.

Follow-up for d452335aa

44 of 49 new or added lines in 7 files covered. (89.8%)

297 existing lines in 37 files now uncovered.

309479 of 425707 relevant lines covered (72.7%)

1150153.67 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0
/src/mountfsd/mountwork.c
1
/* SPDX-License-Identifier: LGPL-2.1-or-later */
2

3
#include <linux/loop.h>
4
#include <poll.h>
5
#include <stdlib.h>
6
#include <sys/mount.h>
7
#include <unistd.h>
8

9
#include "sd-daemon.h"
10
#include "sd-event.h"
11
#include "sd-varlink.h"
12

13
#include "argv-util.h"
14
#include "bus-polkit.h"
15
#include "chase.h"
16
#include "discover-image.h"
17
#include "dissect-image.h"
18
#include "env-util.h"
19
#include "errno-util.h"
20
#include "fd-util.h"
21
#include "fs-util.h"
22
#include "format-util.h"
23
#include "hashmap.h"
24
#include "image-policy.h"
25
#include "io-util.h"
26
#include "iovec-util.h"
27
#include "json-util.h"
28
#include "loop-util.h"
29
#include "main-func.h"
30
#include "memory-util.h"
31
#include "mount-util.h"
32
#include "namespace-util.h"
33
#include "nsresource.h"
34
#include "nulstr-util.h"
35
#include "os-util.h"
36
#include "path-util.h"
37
#include "pidref.h"
38
#include "stat-util.h"
39
#include "string-table.h"
40
#include "string-util.h"
41
#include "strv.h"
42
#include "tmpfile-util.h"
43
#include "time-util.h"
44
#include "uid-classification.h"
45
#include "uid-range.h"
46
#include "user-util.h"
47
#include "varlink-io.systemd.MountFileSystem.h"
48
#include "varlink-util.h"
49

50
/* Worker tuning constants. The code that consumes them is outside this excerpt, so the notes below are
 * inferred from the names and units only — TODO confirm against the users further down the file. */
/* NOTE(review): presumably an upper bound on how many requests one worker handles — confirm. */
#define ITERATIONS_MAX 64U
51
/* NOTE(review): presumably the maximum lifetime of a worker; the value is five minutes in usec. */
#define RUNTIME_MAX_USEC (5 * USEC_PER_MINUTE)
52
/* NOTE(review): presumably a back-off delay under pressure; the value is 50ms in usec. */
#define PRESSURE_SLEEP_TIME_USEC (50 * USEC_PER_MSEC)
53
/* NOTE(review): presumably how long to wait idle for a connection; the value is 90s in usec. */
#define LISTEN_IDLE_USEC (90 * USEC_PER_SEC)
54

55
/* Built-in image policy used by determine_image_policy() for images that are not considered trusted,
 * unless overridden via $SYSTEMD_MOUNTFSD_IMAGE_POLICY_UNTRUSTED. It lists two explicit entries (root
 * and usr, each flagged SIGNED|ABSENT); every other designator falls back to the IGNORE default flag. */
static const ImagePolicy image_policy_untrusted = {
56
        .n_policies = 2, /* must match the number of entries in .policies below */
57
        .policies = {
58
                { PARTITION_ROOT,     PARTITION_POLICY_SIGNED|PARTITION_POLICY_ABSENT },
59
                { PARTITION_USR,      PARTITION_POLICY_SIGNED|PARTITION_POLICY_ABSENT },
60
        },
61
        .default_flags = PARTITION_POLICY_IGNORE,
62
};
63

64
/* JSON dispatch callback for image policy fields.
 *
 * userdata must point to an ImagePolicy* slot. A JSON null releases whatever the slot currently holds
 * and resets it. A JSON string is parsed with image_policy_from_string() in non-graceful mode, and on
 * success the previous policy in the slot is freed and replaced by the parsed one.
 *
 * Returns 0 on success. A non-string, non-null value or an unparsable string is reported through
 * json_log() and its return value is propagated. */
static int json_dispatch_image_policy(const char *name, sd_json_variant *variant, sd_json_dispatch_flags_t flags, void *userdata) {
        ImagePolicy **slot = ASSERT_PTR(userdata);
        _cleanup_(image_policy_freep) ImagePolicy *parsed = NULL;
        int r;

        assert(slot);

        /* Null means "unset": drop the stored policy, if any. */
        if (sd_json_variant_is_null(variant)) {
                *slot = image_policy_free(*slot);
                return 0;
        }

        if (!sd_json_variant_is_string(variant))
                return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a string.", strna(name));

        r = image_policy_from_string(sd_json_variant_string(variant), /* graceful= */ false, &parsed);
        if (r < 0)
                return json_log(variant, flags, r, "JSON field '%s' is not a valid image policy.", strna(name));

        /* Swap in the new policy only after parsing has succeeded. */
        image_policy_free(*slot);
        *slot = TAKE_PTR(parsed);

        return 0;
}
87

88
/* Parameters of an image mount request, filled in by the JSON dispatch table of
 * vl_method_mount_image() and released with mount_image_parameters_done(). */
typedef struct MountImageParameters {
89
        /* Index of the image fd among the fds passed along with the call; UINT_MAX if unset */
        unsigned image_fd_idx;
90
        /* Index of the user namespace fd among the passed fds; UINT_MAX if unset */
        unsigned userns_fd_idx;
91
        /* Tristate: initialized to -1 (unset); forced to true by validate_image_fd() for O_RDONLY fds */
        int read_only;
92
        /* Tristate: initialized to -1 (unset) */
        int growfs;
93
        /* Owned string; wiped and freed via erase_and_free() on cleanup */
        char *password;
94
        /* Owned, client-supplied image policy, or NULL */
        ImagePolicy *image_policy;
95
        bool verity_sharing;
96
        /* Decoded from the hex string in "verityRootHash" */
        struct iovec verity_root_hash;
97
        /* Decoded from the base64 string in "verityRootHashSignature" */
        struct iovec verity_root_hash_sig;
98
        /* Index of the verity data fd among the passed fds; UINT_MAX if unset */
        unsigned verity_data_fd_idx;
99
} MountImageParameters;
100

101
/* Releases all heap resources owned by a MountImageParameters structure (used as a _cleanup_ handler).
 * The structure itself is not freed. */
static void mount_image_parameters_done(MountImageParameters *p) {
        assert(p);

        /* The two verity buffers */
        iovec_done(&p->verity_root_hash_sig);
        iovec_done(&p->verity_root_hash);

        /* The client-supplied policy */
        p->image_policy = image_policy_free(p->image_policy);

        /* The password is sensitive, hence erase it before releasing the memory */
        p->password = erase_and_free(p->password);
}
×
109

110
/* Checks that the image file descriptor passed by the client is acceptable.
 *
 * Only block devices and regular files are accepted, and the fd's open flags must pass
 * fd_verify_safe_flags(). If the fd was opened O_RDONLY, p->read_only is forced to true. O_RDWR fds
 * leave p->read_only untouched, and any other access mode is refused with -EBADF.
 *
 * Returns 0 on success, negative errno on failure. */
static int validate_image_fd(int fd, MountImageParameters *p) {
        struct stat st;
        int access_mode, fl, r;

        assert(fd >= 0);
        assert(p);

        if (fstat(fd, &st) < 0)
                return -errno;

        /* Block devices pass as they are. For anything else insist on a regular file, relying on
         * stat_verify_regular() for the nice error numbers it generates. */
        if (!S_ISBLK(st.st_mode)) {
                r = stat_verify_regular(&st);
                if (r < 0)
                        return r;
        }

        fl = fd_verify_safe_flags(fd);
        if (fl < 0)
                return log_debug_errno(fl, "Image file descriptor has unsafe flags set: %m");

        access_mode = fl & O_ACCMODE_STRICT;

        if (access_mode == O_RDWR)
                return 0;

        if (access_mode != O_RDONLY)
                return -EBADF;

        /* A read-only fd can only ever be mounted read-only */
        p->read_only = true;
        return 0;
}
146

147
static int verify_trusted_image_fd_by_path(int fd) {
×
148
        int r;
×
149

150
        assert(fd >= 0);
×
151

152
        r = secure_getenv_bool("SYSTEMD_MOUNTFSD_TRUSTED_DIRECTORIES");
×
153
        if (r == -ENXIO)  {
×
154
                if (!DEFAULT_MOUNTFSD_TRUSTED_DIRECTORIES) {
×
155
                        log_debug("Trusted directory mechanism disabled at compile time.");
×
156
                        return false;
×
157
                }
158
        } else if (r < 0) {
×
159
                log_debug_errno(r, "Failed to parse $SYSTEMD_MOUNTFSD_TRUSTED_DIRECTORIES environment variable, not trusting any image.");
×
160
                return false;
×
161
        } else if (!r) {
×
162
                log_debug("Trusted directory mechanism disabled via $SYSTEMD_MOUNTFSD_TRUSTED_DIRECTORIES environment variable.");
×
163
                return false;
×
164
        }
165

166
        _cleanup_free_ char *p = NULL;
×
167
        r = fd_get_path(fd, &p);
×
168
        if (r < 0)
×
169
                return log_debug_errno(r, "Failed to get path of passed image file descriptor: %m");
×
170

171
        struct stat sta;
×
172
        if (fstat(fd, &sta) < 0)
×
173
                return log_debug_errno(errno, "Failed to stat() passed image file descriptor: %m");
×
174
        if (!S_ISREG(sta.st_mode)) {
×
175
                log_debug("Image '%s' is not a regular file, hence skipping trusted directory check.", p);
×
176
                return false;
×
177
        }
178

179
        log_debug("Checking if image '%s' is in trusted directories.", p);
×
180

181
        for (ImageClass c = 0; c < _IMAGE_CLASS_MAX; c++)
×
182
                NULSTR_FOREACH(s, image_search_path[c]) {
×
183
                        _cleanup_close_ int dir_fd = -EBADF, inode_fd = -EBADF;
×
184
                        _cleanup_free_ char *q = NULL;
×
185
                        struct stat stb;
×
186
                        const char *e;
×
187

188
                        r = chase(s, NULL, CHASE_SAFE|CHASE_TRIGGER_AUTOFS, &q, &dir_fd);
×
189
                        if (r == -ENOENT)
×
190
                                continue;
×
191
                        if (r < 0) {
×
192
                                log_warning_errno(r, "Failed to resolve search path '%s', ignoring: %m", s);
×
193
                                continue;
×
194
                        }
195

196
                        /* Check that the inode refers to a file immediately inside the image directory,
197
                         * i.e. not the image directory itself, and nothing further down the tree */
198
                        e = path_startswith(p, q);
×
199
                        if (isempty(e))
×
200
                                continue;
×
201

202
                        e += strspn(e, "/");
×
203
                        if (!filename_is_valid(e))
×
204
                                continue;
×
205

206
                        r = chaseat(dir_fd, e, CHASE_SAFE|CHASE_TRIGGER_AUTOFS, NULL, &inode_fd);
×
207
                        if (r < 0)
×
208
                                return log_error_errno(r, "Couldn't verify that specified image '%s' is in search path '%s': %m", p, s);
×
209

210
                        if (fstat(inode_fd, &stb) < 0)
×
211
                                return log_error_errno(errno, "Failed to stat image file '%s/%s': %m", q, e);
×
212

213
                        if (stat_inode_same(&sta, &stb)) {
×
214
                                log_debug("Image '%s' is *in* trusted directories.", p);
×
215
                                return true; /* Yay */
×
216
                        }
217
                }
218

219
        log_debug("Image '%s' is *not* in trusted directories.", p);
×
220
        return false;
221
}
222

223
static int determine_image_policy(
×
224
                int image_fd,
225
                bool trusted,
226
                ImagePolicy *client_policy,
227
                ImagePolicy **ret) {
228

229
        _cleanup_(image_policy_freep) ImagePolicy *envvar_policy = NULL;
×
230
        const ImagePolicy *default_policy;
×
231
        const char *envvar, *e;
×
232
        int r;
×
233

234
        assert(image_fd >= 0);
×
235
        assert(ret);
×
236

237
        if (trusted) {
×
238
                envvar = "SYSTEMD_MOUNTFSD_IMAGE_POLICY_TRUSTED";
239
                default_policy = &image_policy_allow;
240
        } else {
241
                envvar = "SYSTEMD_MOUNTFSD_IMAGE_POLICY_UNTRUSTED";
×
242
                default_policy = &image_policy_untrusted;
×
243
        }
244

245
        e = secure_getenv(envvar);
×
246
        if (e) {
×
NEW
247
                r = image_policy_from_string(e, /* graceful= */ false, &envvar_policy);
×
248
                if (r < 0)
×
249
                        return log_error_errno(r, "Failed to parse image policy supplied via $%s: %m", envvar);
×
250

251
                default_policy = envvar_policy;
×
252
        }
253

254
        return image_policy_intersect(default_policy, client_policy, ret);
×
255
}
256

257
/* Validates the user namespace fd supplied by the client, if any.
 *
 * If *userns_fd is negative, no namespace was supplied and there is nothing to check. Otherwise the fd
 * must carry safe open flags and refer to a user namespace. If it turns out to be our own user
 * namespace, the fd is closed and *userns_fd is invalidated, so that the caller treats the request as
 * if no namespace had been specified.
 *
 * Returns 0 if the caller shall proceed and a negative errno on failure. If the fd is not a user
 * namespace, the return value of sd_varlink_error_invalid_parameter_name() is propagated. */
static int validate_userns(sd_varlink *link, int *userns_fd) {
        int r;

        assert(link);
        assert(userns_fd);

        if (*userns_fd < 0)
                return 0;

        r = fd_verify_safe_flags(*userns_fd);
        if (r < 0)
                return log_debug_errno(r, "User namespace file descriptor has unsafe flags set: %m");

        r = fd_is_namespace(*userns_fd, NAMESPACE_USER);
        if (r < 0)
                return r;
        if (r == 0)
                return sd_varlink_error_invalid_parameter_name(link, "userNamespaceFileDescriptor");

        /* Our own host user namespace? Then close the fd, and handle it as if none was specified. */
        r = is_our_namespace(*userns_fd, NAMESPACE_USER);
        if (r < 0)
                /* Include the error reason via %m, like all other failure messages in this file */
                return log_debug_errno(r, "Failed to determine if user namespace provided by client is our own: %m");
        if (r > 0) {
                log_debug("User namespace provided by client is our own.");
                *userns_fd = safe_close(*userns_fd);
        }

        return 0;
}
287

288
static int vl_method_mount_image(
×
289
                sd_varlink *link,
290
                sd_json_variant *parameters,
291
                sd_varlink_method_flags_t flags,
292
                void *userdata) {
293

294
        static const sd_json_dispatch_field dispatch_table[] = {
×
295
                { "imageFileDescriptor",         SD_JSON_VARIANT_UNSIGNED, sd_json_dispatch_uint,        offsetof(MountImageParameters, image_fd_idx),         SD_JSON_MANDATORY },
296
                { "userNamespaceFileDescriptor", SD_JSON_VARIANT_UNSIGNED, sd_json_dispatch_uint,        offsetof(MountImageParameters, userns_fd_idx),        0 },
297
                { "readOnly",                    SD_JSON_VARIANT_BOOLEAN,  sd_json_dispatch_tristate,    offsetof(MountImageParameters, read_only),            0 },
298
                { "growFileSystems",             SD_JSON_VARIANT_BOOLEAN,  sd_json_dispatch_tristate,    offsetof(MountImageParameters, growfs),               0 },
299
                { "password",                    SD_JSON_VARIANT_STRING,   sd_json_dispatch_string,      offsetof(MountImageParameters, password),             0 },
300
                { "imagePolicy",                 SD_JSON_VARIANT_STRING,   json_dispatch_image_policy,   offsetof(MountImageParameters, image_policy),         0 },
301
                { "veritySharing",               SD_JSON_VARIANT_BOOLEAN,  sd_json_dispatch_stdbool,     offsetof(MountImageParameters, verity_sharing),       0 },
302
                { "verityDataFileDescriptor",    SD_JSON_VARIANT_UNSIGNED, sd_json_dispatch_uint,        offsetof(MountImageParameters, verity_data_fd_idx),   0 },
303
                { "verityRootHash",              SD_JSON_VARIANT_STRING,   json_dispatch_unhex_iovec,    offsetof(MountImageParameters, verity_root_hash),     0 },
304
                { "verityRootHashSignature",     SD_JSON_VARIANT_STRING,   json_dispatch_unbase64_iovec, offsetof(MountImageParameters, verity_root_hash_sig), 0 },
305
                VARLINK_DISPATCH_POLKIT_FIELD,
306
                {}
307
        };
308

309
        _cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT;
×
310
        _cleanup_(mount_image_parameters_done) MountImageParameters p = {
×
311
                .image_fd_idx = UINT_MAX,
312
                .userns_fd_idx = UINT_MAX,
313
                .verity_data_fd_idx = UINT_MAX,
314
                .read_only = -1,
315
                .growfs = -1,
316
        };
317
        _cleanup_(dissected_image_unrefp) DissectedImage *di = NULL;
×
318
        _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
×
319
        _cleanup_(sd_json_variant_unrefp) sd_json_variant *aj = NULL;
×
320
        _cleanup_close_ int image_fd = -EBADF, userns_fd = -EBADF, verity_data_fd = -EBADF;
×
321
        _cleanup_(image_policy_freep) ImagePolicy *use_policy = NULL;
×
322
        Hashmap **polkit_registry = ASSERT_PTR(userdata);
×
323
        _cleanup_free_ char *ps = NULL;
×
324
        bool image_is_trusted = false;
×
325
        int r;
×
326

327
        assert(link);
×
328
        assert(parameters);
×
329

330
        sd_json_variant_sensitive(parameters); /* might contain passwords */
×
331

332
        r = sd_varlink_dispatch(link, parameters, dispatch_table, &p);
×
333
        if (r != 0)
×
334
                return r;
335

336
        /* Verity data and roothash have to be either both set, or both unset. The sig can be set only if
337
         * the roothash is set. */
338
        if ((p.verity_data_fd_idx != UINT_MAX) != (p.verity_root_hash.iov_len > 0))
×
339
                return sd_varlink_error_invalid_parameter_name(link, "verityDataFileDescriptor");
×
340
        if (p.verity_root_hash_sig.iov_len > 0 && p.verity_root_hash.iov_len == 0)
×
341
                return sd_varlink_error_invalid_parameter_name(link, "verityRootHashSignature");
×
342

343
        if (p.image_fd_idx != UINT_MAX) {
×
344
                image_fd = sd_varlink_peek_dup_fd(link, p.image_fd_idx);
×
345
                if (image_fd < 0)
×
346
                        return log_debug_errno(image_fd, "Failed to peek image fd from client: %m");
×
347
        }
348

349
        if (p.userns_fd_idx != UINT_MAX) {
×
350
                userns_fd = sd_varlink_peek_dup_fd(link, p.userns_fd_idx);
×
351
                if (userns_fd < 0)
×
352
                        return log_debug_errno(userns_fd, "Failed to peek user namespace fd from client: %m");
×
353
        }
354

355
        r = validate_image_fd(image_fd, &p);
×
356
        if (r < 0)
×
357
                return r;
358

359
        r = validate_userns(link, &userns_fd);
×
360
        if (r != 0)
×
361
                return r;
362

363
        r = verify_trusted_image_fd_by_path(image_fd);
×
364
        if (r < 0)
×
365
                return r;
366
        image_is_trusted = r;
×
367

368
        if (p.verity_data_fd_idx != UINT_MAX) {
×
369
                verity_data_fd = sd_varlink_peek_dup_fd(link, p.verity_data_fd_idx);
×
370
                if (verity_data_fd < 0)
×
371
                        return log_debug_errno(verity_data_fd, "Failed to peek verity data fd from client: %m");
×
372

373
                r = fd_verify_safe_flags(verity_data_fd);
×
374
                if (r < 0)
×
375
                        return log_debug_errno(r, "Verity data file descriptor has unsafe flags set: %m");
×
376

377
                verity.data_path = strdup(FORMAT_PROC_FD_PATH(verity_data_fd));
×
378
                if (!verity.data_path)
×
379
                        return -ENOMEM;
380

381
                verity.designator = PARTITION_ROOT;
×
382
                verity.root_hash = TAKE_STRUCT(p.verity_root_hash);
×
383
                verity.root_hash_sig = TAKE_STRUCT(p.verity_root_hash_sig);
×
384
        }
385

386
        const char *polkit_details[] = {
×
387
                "read_only", one_zero(p.read_only > 0),
×
388
                NULL,
389
        };
390

391
        const char *polkit_action, *polkit_untrusted_action;
×
392
        PolkitFlags polkit_flags;
×
393
        if (userns_fd < 0) {
×
394
                /* Mount into the host user namespace */
395
                polkit_action = "io.systemd.mount-file-system.mount-image";
396
                polkit_untrusted_action = "io.systemd.mount-file-system.mount-untrusted-image";
397
                polkit_flags = 0;
398
        } else {
399
                /* Mount into a private user namespace */
400
                polkit_action = "io.systemd.mount-file-system.mount-image-privately";
×
401
                polkit_untrusted_action = "io.systemd.mount-file-system.mount-untrusted-image-privately";
×
402

403
                /* If polkit is not around, let's allow mounting authenticated images by default */
404
                polkit_flags = POLKIT_DEFAULT_ALLOW;
×
405
        }
406

407
        /* Let's definitely acquire the regular action privilege, for mounting properly signed images */
408
        r = varlink_verify_polkit_async_full(
×
409
                        link,
410
                        /* bus= */ NULL,
411
                        polkit_action,
412
                        polkit_details,
413
                        /* good_user= */ UID_INVALID,
414
                        polkit_flags,
415
                        polkit_registry);
416
        if (r <= 0)
×
417
                return r;
418

419
        /* Generate the common dissection directory here. We are not going to use it, but the clients might,
420
         * and they likely are unprivileged, hence cannot create it themselves. Hence let's just create it
421
         * here, if it is missing. */
422
        r = get_common_dissect_directory(NULL);
×
423
        if (r < 0)
×
424
                return r;
425

426
        r = loop_device_make(
×
427
                        image_fd,
428
                        p.read_only == 0 ? O_RDONLY : O_RDWR,
×
429
                        0,
430
                        UINT64_MAX,
431
                        UINT32_MAX,
432
                        LO_FLAGS_PARTSCAN,
433
                        LOCK_EX,
434
                        &loop);
435
        if (r < 0)
×
436
                return r;
437

438
        DissectImageFlags dissect_flags =
×
439
                (p.read_only == 0 ? DISSECT_IMAGE_READ_ONLY : 0) |
×
440
                (p.growfs != 0 ? DISSECT_IMAGE_GROWFS : 0) |
×
441
                DISSECT_IMAGE_DISCARD_ANY |
442
                DISSECT_IMAGE_FSCK |
443
                DISSECT_IMAGE_ADD_PARTITION_DEVICES |
×
444
                DISSECT_IMAGE_PIN_PARTITION_DEVICES |
×
445
                (p.verity_sharing ? DISSECT_IMAGE_VERITY_SHARE : 0) |
×
446
                /* Maybe the image is a bare filesystem. Note that this requires privileges, as it is
447
                 * classified by the policy as an 'unprotected' image and will be refused otherwise. */
448
                DISSECT_IMAGE_NO_PARTITION_TABLE |
×
449
                DISSECT_IMAGE_ALLOW_USERSPACE_VERITY;
450

451
        /* Let's see if we have acquired the privilege to mount untrusted images already */
452
        bool polkit_have_untrusted_action =
×
453
                varlink_has_polkit_action(link, polkit_untrusted_action, polkit_details, polkit_registry);
×
454

455
        for (;;) {
×
456
                use_policy = image_policy_free(use_policy);
×
457
                ps = mfree(ps);
×
458

459
                /* We use the image policy for trusted images if either the path is below a trusted
460
                 * directory, or if we have already acquired a PK authentication that tells us that untrusted
461
                 * images are OK */
462
                bool use_trusted_policy =
×
463
                        image_is_trusted ||
464
                        polkit_have_untrusted_action;
465

466
                r = determine_image_policy(
×
467
                                image_fd,
468
                                use_trusted_policy,
469
                                p.image_policy,
470
                                &use_policy);
471
                if (r < 0)
×
472
                        return r;
473

474
                r = image_policy_to_string(use_policy, /* simplify= */ true, &ps);
×
475
                if (r < 0)
×
476
                        return r;
477

478
                log_debug("Using image policy: %s", ps);
×
479

480
                r = dissect_loop_device(
×
481
                                loop,
482
                                &verity,
483
                                /* mount_options= */ NULL,
484
                                use_policy,
485
                                /* image_filter= */ NULL,
486
                                dissect_flags,
487
                                &di);
488
                if (r == -ENOPKG)
×
489
                        return sd_varlink_error(link, "io.systemd.MountFileSystem.IncompatibleImage", NULL);
×
490
                if (r == -ENOTUNIQ)
×
491
                        return sd_varlink_error(link, "io.systemd.MountFileSystem.MultipleRootPartitionsFound", NULL);
×
492
                if (r == -ENXIO)
×
493
                        return sd_varlink_error(link, "io.systemd.MountFileSystem.RootPartitionNotFound", NULL);
×
494
                if (r == -ERFKILL) {
×
495
                        /* The image policy refused this, let's retry after trying to get PolicyKit */
496

497
                        if (!polkit_have_untrusted_action) {
×
498
                                log_debug("Denied by image policy. Trying a stronger polkit authentication before continuing.");
×
499
                                r = varlink_verify_polkit_async_full(
×
500
                                                link,
501
                                                /* bus= */ NULL,
502
                                                polkit_untrusted_action,
503
                                                polkit_details,
504
                                                /* good_user= */ UID_INVALID,
505
                                                /* flags= */ 0,                   /* NB: the image cannot be authenticated, hence unless PK is around to allow this anyway, fail! */
506
                                                polkit_registry);
507
                                if (r <= 0 && !ERRNO_IS_NEG_PRIVILEGE(r))
×
508
                                        return r;
509
                                if (r > 0) {
×
510
                                        /* Try again, now that we know the client has enough privileges. */
511
                                        log_debug("Denied by image policy, retrying after polkit authentication.");
×
512
                                        polkit_have_untrusted_action = true;
×
513
                                        continue;
×
514
                                }
515
                        }
516

517
                        return sd_varlink_error(link, "io.systemd.MountFileSystem.DeniedByImagePolicy", NULL);
×
518
                }
519
                if (r < 0)
×
520
                        return r;
521

522
                /* Success */
523
                break;
×
524
        }
525

526
        r = dissected_image_load_verity_sig_partition(
×
527
                        di,
528
                        loop->fd,
×
529
                        &verity);
530
        if (r < 0)
×
531
                return r;
532

533
        r = dissected_image_guess_verity_roothash(
×
534
                        di,
535
                        &verity);
536
        if (r < 0)
×
537
                return r;
538

539
        r = dissected_image_decrypt(
×
540
                        di,
541
                        p.password,
×
542
                        &verity,
543
                        use_policy,
544
                        dissect_flags);
545
        if (r == -ENOKEY) /* new dm-verity userspace returns ENOKEY if the dm-verity signature key is not in
×
546
                           * key chain. That's great. */
547
                return sd_varlink_error(link, "io.systemd.MountFileSystem.KeyNotFound", NULL);
×
548
        if (r == -EBUSY) /* DM kernel subsystem is shit with returning useful errors hence we keep retrying
×
549
                          * under the assumption that some errors are transitional. Which the errors might
550
                          * not actually be. After all retries failed we return EBUSY. Let's turn that into a
551
                          * generic Verity error. It's not very helpful, could mean anything, but at least it
552
                          * gives client a clear idea that this has to do with Verity. */
553
                return sd_varlink_error(link, "io.systemd.MountFileSystem.VerityFailure", NULL);
×
554
        if (r < 0)
×
555
                return r;
556

557
        r = dissected_image_mount(
×
558
                        di,
559
                        /* where= */ NULL,
560
                        /* uid_shift= */ UID_INVALID,
561
                        /* uid_range= */ UID_INVALID,
562
                        userns_fd,
563
                        dissect_flags);
564
        if (r < 0)
×
565
                return r;
566

567
        for (PartitionDesignator d = 0; d < _PARTITION_DESIGNATOR_MAX; d++) {
×
568
                DissectedPartition *pp = di->partitions + d;
×
569
                int fd_idx;
×
570

571
                if (!pp->found)
×
572
                        continue;
×
573

574
                if (pp->fsmount_fd < 0)
×
575
                        continue;
×
576

577
                if (userns_fd >= 0) {
×
578
                        r = nsresource_add_mount(userns_fd, pp->fsmount_fd);
×
579
                        if (r < 0)
×
580
                                return r;
×
581
                }
582

583
                fd_idx = sd_varlink_push_fd(link, pp->fsmount_fd);
×
584
                if (fd_idx < 0)
×
585
                        return fd_idx;
586

587
                TAKE_FD(pp->fsmount_fd);
×
588

589
                const char *m = partition_mountpoint_to_string(d);
×
590
                _cleanup_strv_free_ char **l = NULL;
×
591
                if (!isempty(m)) {
×
592
                        l = strv_split_nulstr(m);
×
593
                        if (!l)
×
594
                                return log_oom_debug();
×
595
                }
596

597
                r = sd_json_variant_append_arraybo(
×
598
                                &aj,
599
                                SD_JSON_BUILD_PAIR_STRING("designator", partition_designator_to_string(d)),
600
                                SD_JSON_BUILD_PAIR_BOOLEAN("writable", pp->rw),
601
                                SD_JSON_BUILD_PAIR_BOOLEAN("growFileSystem", pp->growfs),
602
                                SD_JSON_BUILD_PAIR_CONDITION(pp->partno > 0, "partitionNumber", SD_JSON_BUILD_INTEGER(pp->partno)),
603
                                SD_JSON_BUILD_PAIR_CONDITION(pp->architecture > 0, "architecture", SD_JSON_BUILD_STRING(architecture_to_string(pp->architecture))),
604
                                SD_JSON_BUILD_PAIR_CONDITION(!sd_id128_is_null(pp->uuid), "partitionUuid", SD_JSON_BUILD_UUID(pp->uuid)),
605
                                SD_JSON_BUILD_PAIR_STRING("fileSystemType", dissected_partition_fstype(pp)),
606
                                SD_JSON_BUILD_PAIR_CONDITION(!!pp->label, "partitionLabel", SD_JSON_BUILD_STRING(pp->label)),
607
                                SD_JSON_BUILD_PAIR_UNSIGNED("size", pp->size),
608
                                SD_JSON_BUILD_PAIR_UNSIGNED("offset", pp->offset),
609
                                SD_JSON_BUILD_PAIR_INTEGER("mountFileDescriptor", fd_idx),
610
                                JSON_BUILD_PAIR_STRV_NON_EMPTY("mountPoint", l));
611
                if (r < 0)
×
612
                        return r;
613
        }
614

615
        loop_device_relinquish(loop);
×
616

617
        return sd_varlink_replybo(
×
618
                        link,
619
                        SD_JSON_BUILD_PAIR_VARIANT("partitions", aj),
620
                        SD_JSON_BUILD_PAIR_STRING("imagePolicy", ps),
621
                        SD_JSON_BUILD_PAIR_UNSIGNED("imageSize", di->image_size),
622
                        SD_JSON_BUILD_PAIR_UNSIGNED("sectorSize", di->sector_size),
623
                        SD_JSON_BUILD_PAIR_CONDITION(!sd_id128_is_null(di->image_uuid), "imageUuid", SD_JSON_BUILD_UUID(di->image_uuid)));
624
}
625

626
/* UID mapping mode for directory mounts. The string forms are listed in mount_map_mode_table[];
 * default_mount_map_mode() derives a mode from the ownership of the directory. */
typedef enum MountMapMode {
627
        MOUNT_MAP_AUTO = 0,     /* determine automatically from image and caller */
628
        MOUNT_MAP_ROOT,         /* map caller's UID to root in namespace (map 1 UID only) */
629
        MOUNT_MAP_FOREIGN,      /* map foreign UID range to base in namespace (map 64K) */
630
        MOUNT_MAP_IDENTITY,     /* apply identity mapping (map 64K) */
631
        _MOUNT_MAP_MODE_MAX,    /* number of valid modes; sizes mount_map_mode_table[] */
632
        _MOUNT_MAP_MODE_INVALID = -EINVAL, /* returned by default_mount_map_mode() for unknown ownership values */
633
} MountMapMode;
634

635
/* String names of the MountMapMode values, indexed by enum value. Consumed by the lookup helpers
 * generated through DEFINE_PRIVATE_STRING_TABLE_LOOKUP(mount_map_mode, MountMapMode). */
static const char *const mount_map_mode_table[_MOUNT_MAP_MODE_MAX] = {
636
        [MOUNT_MAP_AUTO]     = "auto",
637
        [MOUNT_MAP_ROOT]     = "root",
638
        [MOUNT_MAP_FOREIGN]  = "foreign",
639
        [MOUNT_MAP_IDENTITY] = "identity",
640
};
641

642
/* Generates the file-local string lookup helpers for MountMapMode from mount_map_mode_table[]. Of these,
 * mount_map_mode_from_string() is referenced by the JSON enum dispatcher defined in this file.
 * NOTE(review): presumably a matching mount_map_mode_to_string() is generated too — confirm in
 * string-table.h. */
DEFINE_PRIVATE_STRING_TABLE_LOOKUP(mount_map_mode, MountMapMode);
×
643

644
/* Parameters of a directory mount request.
 * NOTE(review): the code that fills in and consumes this structure is outside this excerpt; the field
 * notes below are modelled on MountImageParameters and should be confirmed against that code. */
typedef struct MountDirectoryParameters {
645
        /* Requested UID mapping mode */
        MountMapMode mode;
646
        /* presumably the index of the directory fd among the fds passed by the client — TODO confirm */
        unsigned directory_fd_idx;
647
        /* presumably the index of the user namespace fd among the passed fds — TODO confirm */
        unsigned userns_fd_idx;
648
        /* presumably a tristate like MountImageParameters.read_only — TODO confirm */
        int read_only;
649
} MountDirectoryParameters;
650

651
/* Classification of who owns a directory relative to the peer's UID. This is the return type of
 * validate_directory_fd() and the input to default_mount_map_mode(). The range is extended below so
 * that the same type can also carry negative errno values. */
typedef enum DirectoryOwnership {
652
        DIRECTORY_IS_ROOT_PEER_OWNED,  /* This is returned if the directory is owned by the root user and the peer is root */
653
        DIRECTORY_IS_ROOT_OWNED,       /* This is returned if the directory is owned by the root user (and the peer user is not root) */
654
        DIRECTORY_IS_PEER_OWNED,       /* This is returned if the directory is owned by the peer user (who is not root) */
655
        DIRECTORY_IS_FOREIGN_OWNED,    /* This is returned if the directory is owned by the foreign UID range */
656
        DIRECTORY_IS_OTHERWISE_OWNED,  /* This is returned if the directory is owned by something else */
657
        _DIRECTORY_OWNERSHIP_MAX,
658
        _DIRECTORY_OWNERSHIP_ERRNO_MAX = -ERRNO_MAX, /* Guarantee the whole negative errno range fits */
659
} DirectoryOwnership;
660

661
/* Picks the mapping mode to use when the client did not request one explicitly, based on the
 * ownership of the base tree. Returns _MOUNT_MAP_MODE_INVALID for values that are not a known
 * ownership class. */
static MountMapMode default_mount_map_mode(DirectoryOwnership ownership) {

        /* Peer-owned tree: map the peer's UID to root in the container */
        if (ownership == DIRECTORY_IS_PEER_OWNED)
                return MOUNT_MAP_ROOT;

        /* Foreign-owned tree: map the foreign UID range to the container's UID range */
        if (ownership == DIRECTORY_IS_FOREIGN_OWNED)
                return MOUNT_MAP_FOREIGN;

        /* Everything else that is a valid classification: don't map */
        if (ownership == DIRECTORY_IS_ROOT_PEER_OWNED ||
            ownership == DIRECTORY_IS_ROOT_OWNED ||
            ownership == DIRECTORY_IS_OTHERWISE_OWNED)
                return MOUNT_MAP_IDENTITY;

        return _MOUNT_MAP_MODE_INVALID;
}
680

681
/* Defines dispatch_mount_directory_mode(), the JSON dispatch callback referenced for the "mode" field
 * of MountDirectory(); it is parameterized with mount_map_mode_from_string() for string parsing. */
static JSON_DISPATCH_ENUM_DEFINE(dispatch_mount_directory_mode, MountMapMode, mount_map_mode_from_string);
×
682

683
static DirectoryOwnership validate_directory_fd(
×
684
                int fd,
685
                uid_t peer_uid,
686
                uid_t *ret_current_owner_uid) {
687

688
        int r, fl;
×
689

690
        assert(fd >= 0);
×
691
        assert(uid_is_valid(peer_uid));
×
692
        assert(ret_current_owner_uid);
×
693

694
        /* Checks if the specified directory fd looks sane. Returns a DirectoryOwnership that categorizes the
695
         * ownership situation in comparison to the peer's UID.
696
         *
697
         * Note one key difference to image validation (as implemented above): for regular files if the
698
         * client provided us with an open fd it implies the client has access, as well as what kind of
699
         * access (i.e. ro or rw). But for directories this doesn't work the same way, as directories are
700
         * always opened read-only only. Hence we use a different mechanism to validate access to them: we
701
         * check if the directory is owned by the peer UID or by the foreign UID range (in the latter case
702
         * one of the parent directories must be owned by the peer though). */
703

704
        struct stat st;
×
705
        if (fstat(fd, &st) < 0)
×
706
                return log_debug_errno(errno, "Failed to stat() directory fd: %m");
×
707

708
        r = stat_verify_directory(&st);
×
709
        if (r < 0)
×
710
                return r;
711

712
        fl = fd_verify_safe_flags_full(fd, O_DIRECTORY|O_PATH);
×
713
        if (fl < 0)
×
714
                return log_debug_errno(fl, "Directory file descriptor has unsafe flags set: %m");
×
715

716
        if (st.st_uid == 0) {
×
717
                *ret_current_owner_uid = st.st_uid;
×
718
                if (peer_uid == 0) {
×
719
                        log_debug("Directory file descriptor points to root owned directory, who is also the peer.");
×
720
                        return DIRECTORY_IS_ROOT_PEER_OWNED;
×
721
                }
722
                log_debug("Directory file descriptor points to root owned directory.");
×
723
                return DIRECTORY_IS_ROOT_OWNED;
×
724
        }
725
        if (st.st_uid == peer_uid) {
×
726
                log_debug("Directory file descriptor points to peer owned directory.");
×
727
                *ret_current_owner_uid = st.st_uid;
×
728
                return DIRECTORY_IS_PEER_OWNED;
×
729
        }
730

731
        /* For bind mounted directories we check if they are either owned by the client's UID, or by the
732
         * foreign UID set, but in that case the parent directory must be owned by the client's UID, or some
733
         * directory iteratively up the chain */
734

735
        _cleanup_close_ int parent_fd = -EBADF;
×
736
        unsigned n_level;
737
        for (n_level = 0; n_level < 16; n_level++) {
×
738
                /* Stop iteration if we find a directory up the tree that is neither owned by the user, nor is from the foreign UID range */
739
                if (!uid_is_foreign(st.st_uid) || !gid_is_foreign(st.st_gid)) {
×
740
                        log_debug("Directory file descriptor points to directory which itself or its parents is neither owned by foreign UID range nor by the user.");
×
741
                        *ret_current_owner_uid = st.st_uid;
×
742
                        return DIRECTORY_IS_OTHERWISE_OWNED;
×
743
                }
744

745
                /* If the peer is root, then it doesn't matter if we find a parent owned by root, let's shortcut things. */
746
                if (peer_uid == 0) {
×
747
                        log_debug("Directory file descriptor is owned by foreign UID range, and peer is root.");
×
748
                        *ret_current_owner_uid = st.st_uid;
×
749
                        return DIRECTORY_IS_FOREIGN_OWNED;
×
750
                }
751

752
                /* Go one level up */
753
                _cleanup_close_ int new_parent_fd = openat(fd, "..", O_DIRECTORY|O_PATH|O_CLOEXEC);
×
754
                if (new_parent_fd < 0)
×
755
                        return log_debug_errno(errno, "Failed to open parent directory of directory file descriptor: %m");
×
756

757
                struct stat new_st;
×
758
                if (fstat(new_parent_fd, &new_st) < 0)
×
759
                        return log_debug_errno(errno, "Failed to stat parent directory of directory file descriptor: %m");
×
760

761
                /* Safety check to see if we hit the root dir */
762
                if (stat_inode_same(&st, &new_st)) {
×
763
                        log_debug("Directory file descriptor is owned by foreign UID range, but didn't find parent directory that is owned by peer among ancestors.");
×
764
                        *ret_current_owner_uid = st.st_uid;
×
765
                        return DIRECTORY_IS_OTHERWISE_OWNED;
×
766
                }
767

768
                if (new_st.st_uid == peer_uid) { /* Parent inode is owned by the peer. That's good! Everything's fine. */
×
769
                        log_debug("Directory file descriptor is owned by foreign UID range, and ancestor is owned by peer.");
×
770
                        *ret_current_owner_uid = st.st_uid;
×
771
                        return DIRECTORY_IS_FOREIGN_OWNED;
×
772
                }
773

774
                close_and_replace(parent_fd, new_parent_fd);
×
775
                st = new_st;
×
776
        }
777

778
        log_debug("Failed to find peer owned parent directory after %u levels, refusing.", n_level);
×
779
        *ret_current_owner_uid = st.st_uid;
×
780
        return DIRECTORY_IS_OTHERWISE_OWNED;
×
781
}
782

783
static int vl_method_mount_directory(
×
784
                sd_varlink *link,
785
                sd_json_variant *parameters,
786
                sd_varlink_method_flags_t flags,
787
                void *userdata) {
788

789
        static const sd_json_dispatch_field dispatch_table[] = {
×
790
                { "mode",                        SD_JSON_VARIANT_STRING,   dispatch_mount_directory_mode, offsetof(MountDirectoryParameters, mode),             0                 },
791
                { "directoryFileDescriptor",     SD_JSON_VARIANT_UNSIGNED, sd_json_dispatch_uint,         offsetof(MountDirectoryParameters, directory_fd_idx), SD_JSON_MANDATORY },
792
                { "userNamespaceFileDescriptor", SD_JSON_VARIANT_UNSIGNED, sd_json_dispatch_uint,         offsetof(MountDirectoryParameters, userns_fd_idx),    0                 },
793
                { "readOnly",                    SD_JSON_VARIANT_BOOLEAN,  sd_json_dispatch_tristate,     offsetof(MountDirectoryParameters, read_only),        0                 },
794
                VARLINK_DISPATCH_POLKIT_FIELD,
795
                {}
796
        };
797

798
        MountDirectoryParameters p = {
×
799
                .mode = MOUNT_MAP_AUTO,
800
                .directory_fd_idx = UINT_MAX,
801
                .userns_fd_idx = UINT_MAX,
802
                .read_only = -1,
803
        };
804
        _cleanup_close_ int directory_fd = -EBADF, userns_fd = -EBADF;
×
805
        Hashmap **polkit_registry = ASSERT_PTR(userdata);
×
806
        int r;
×
807

808
        r = sd_varlink_dispatch(link, parameters, dispatch_table, &p);
×
809
        if (r != 0)
×
810
                return r;
811

812
        if (p.directory_fd_idx == UINT_MAX)
×
813
                return sd_varlink_error_invalid_parameter_name(link, "directoryFileDescriptor");
×
814

815
        directory_fd = sd_varlink_peek_dup_fd(link, p.directory_fd_idx);
×
816
        if (directory_fd < 0)
×
817
                return log_debug_errno(directory_fd, "Failed to peek directory fd from client: %m");
×
818

819
        if (p.userns_fd_idx != UINT_MAX) {
×
820
                userns_fd = sd_varlink_peek_dup_fd(link, p.userns_fd_idx);
×
821
                if (userns_fd < 0)
×
822
                        return log_debug_errno(userns_fd, "Failed to peek user namespace fd from client: %m");
×
823
        }
824

825
        uid_t peer_uid;
×
826
        r = sd_varlink_get_peer_uid(link, &peer_uid);
×
827
        if (r < 0)
×
828
                return log_debug_errno(r, "Failed to get client UID: %m");
×
829

830
        uid_t current_owner_uid;
×
831
        DirectoryOwnership owned_by = validate_directory_fd(directory_fd, peer_uid, &current_owner_uid);
×
832
        if (owned_by == -EREMOTEIO)
×
833
                return sd_varlink_errorbo(link, "io.systemd.MountFileSystem.BadFileDescriptorFlags", SD_JSON_BUILD_PAIR_STRING("parameter", "directoryFileDescriptor"));
×
834
        if (owned_by < 0)
×
835
                return owned_by;
836

837
        r = validate_userns(link, &userns_fd);
×
838
        if (r != 0)
×
839
                return r;
840

841
        /* If no mode is specified, pick sensible default */
842
        if (p.mode <= 0) {
×
843
                p.mode = default_mount_map_mode(owned_by);
×
844
                assert(p.mode > 0);
×
845
        }
846

847
        _cleanup_free_ char *directory_path = NULL;
×
848
        (void) fd_get_path(directory_fd, &directory_path);
×
849

850
        log_debug("Mounting '%s' with mapping mode: %s", strna(directory_path), mount_map_mode_to_string(p.mode));
×
851

852
        const char *polkit_details[] = {
×
853
                "read_only", one_zero(p.read_only > 0),
×
854
                "directory", strna(directory_path),
×
855
                NULL,
856
        };
857

858
        const char *polkit_action, *polkit_untrusted_action;
×
859
        PolkitFlags polkit_flags;
×
860
        if (userns_fd < 0) {
×
861
                /* Mount into the host user namespace */
862
                polkit_action = "io.systemd.mount-file-system.mount-directory";
863
                polkit_untrusted_action = "io.systemd.mount-file-system.mount-untrusted-directory";
864
                polkit_flags = 0;
865
        } else {
866
                /* Mount into a private user namespace */
867
                polkit_action = "io.systemd.mount-file-system.mount-directory-privately";
×
868
                polkit_untrusted_action = "io.systemd.mount-file-system.mount-untrusted-directory-privately";
×
869

870
                /* If polkit is not around, let's allow mounting authenticated images by default */
871
                polkit_flags = POLKIT_DEFAULT_ALLOW;
×
872
        }
873

874
        /* We consider a directory "trusted" if it is owned by the peer or the foreign UID range */
875
        bool trusted_directory = IN_SET(owned_by, DIRECTORY_IS_ROOT_PEER_OWNED, DIRECTORY_IS_PEER_OWNED, DIRECTORY_IS_FOREIGN_OWNED);
×
876

877
        /* Let's definitely acquire the regular action privilege, for mounting properly signed images */
878
        r = varlink_verify_polkit_async_full(
×
879
                        link,
880
                        /* bus= */ NULL,
881
                        trusted_directory ? polkit_action : polkit_untrusted_action,
882
                        polkit_details,
883
                        /* good_user= */ UID_INVALID,
884
                        trusted_directory ? polkit_flags : 0,
885
                        polkit_registry);
886
        if (r <= 0)
×
887
                return r;
888

889
        /* Generate the common dissection directory here. We are not going to use it, but the clients might,
890
         * and they likely are unprivileged, hence cannot create it themselves. Hence let's just create it
891
         * here, if it is missing. */
892
        r = get_common_dissect_directory(NULL);
×
893
        if (r < 0)
×
894
                return r;
895

896
        _cleanup_close_ int mount_fd = open_tree_try_drop_idmap(
×
897
                        directory_fd,
898
                        "",
899
                        OPEN_TREE_CLONE|OPEN_TREE_CLOEXEC|AT_SYMLINK_NOFOLLOW|AT_EMPTY_PATH);
900
        if (mount_fd < 0)
×
901
                return log_debug_errno(errno, "Failed to issue open_tree() of provided directory '%s': %m", strna(directory_path));
×
902

903
        /* MOUNT_ATTR_IDMAP has possibly been cleared. Let's verify that the underlying data matches our expectations. */
904
        struct stat unmapped_st;
×
905
        if (fstat(mount_fd, &unmapped_st) < 0)
×
906
                return log_debug_errno(errno, "Failed to stat unmapped inode: %m");
×
907

908
        r = stat_verify_directory(&unmapped_st);
×
909
        if (r < 0)
×
910
                return r;
911

912
        /* For now, let's simply refuse things if dropping the idmapping changed anything. For now that
913
         * should be good enough, because the primary usecase for this (homed) will mount the foreign UID
914
         * range 1:1. */
915
        if (unmapped_st.st_uid != current_owner_uid)
×
916
                return log_debug_errno(SYNTHETIC_ERRNO(EPERM), "Owner UID of mount after clearing ID mapping not the same anymore, refusing.");
×
917

918
        if (p.read_only > 0 && mount_setattr(
×
919
                            mount_fd, "", AT_EMPTY_PATH,
920
                            &(struct mount_attr) {
×
921
                                    .attr_set = MOUNT_ATTR_RDONLY,
922
                            }, MOUNT_ATTR_SIZE_VER0) < 0)
923
                return log_debug_errno(errno, "Failed to enable read-only mode: %m");
×
924

925
        if (p.mode != MOUNT_MAP_IDENTITY) {
×
926
                uid_t start;
×
927

928
                if (userns_fd >= 0) {
×
929
                        _cleanup_(uid_range_freep) UIDRange *uid_range_outside = NULL, *uid_range_inside = NULL, *gid_range_outside = NULL, *gid_range_inside = NULL;
×
930
                        r = uid_range_load_userns_by_fd(userns_fd, UID_RANGE_USERNS_OUTSIDE, &uid_range_outside);
×
931
                        if (r < 0)
×
932
                                return log_debug_errno(r, "Failed to load outside UID range of provided userns: %m");
×
933
                        r = uid_range_load_userns_by_fd(userns_fd, UID_RANGE_USERNS_INSIDE, &uid_range_inside);
×
934
                        if (r < 0)
×
935
                                return log_debug_errno(r, "Failed to load inside UID range of provided userns: %m");
×
936
                        r = uid_range_load_userns_by_fd(userns_fd, GID_RANGE_USERNS_OUTSIDE, &gid_range_outside);
×
937
                        if (r < 0)
×
938
                                return log_debug_errno(r, "Failed to load outside GID range of provided userns: %m");
×
939
                        r = uid_range_load_userns_by_fd(userns_fd, GID_RANGE_USERNS_INSIDE, &gid_range_inside);
×
940
                        if (r < 0)
×
941
                                return log_debug_errno(r, "Failed to load inside GID range of provided userns: %m");
×
942

943
                        /* Be very strict for now */
944
                        if (!uid_range_equal(uid_range_outside, gid_range_outside) ||
×
945
                            !uid_range_equal(uid_range_inside, gid_range_inside) ||
×
946
                            uid_range_outside->n_entries != 1 ||
×
947
                            uid_range_outside->entries[0].nr != 0x10000 ||
×
948
                            uid_range_inside->n_entries != 1 ||
×
949
                            uid_range_inside->entries[0].start != 0 ||
×
950
                            uid_range_inside->entries[0].nr != 0x10000)
×
951
                                return sd_varlink_error_invalid_parameter_name(link, "userNamespaceFileDescriptor");
×
952

953
                        start = uid_range_outside->entries[0].start;
×
954
                } else
955
                        start = 0;
956

957
                _cleanup_free_ char *new_uid_map = NULL;
×
958
                switch (p.mode) {
×
959
                case MOUNT_MAP_ROOT:
×
960
                        r = strextendf(&new_uid_map, UID_FMT " " UID_FMT " " UID_FMT,
×
961
                                       peer_uid, start, (uid_t) 1);
962
                        break;
963
                case MOUNT_MAP_FOREIGN:
×
964
                        r = strextendf(&new_uid_map, UID_FMT " " UID_FMT " " UID_FMT,
×
965
                                       (uid_t) FOREIGN_UID_MIN, start, (uid_t) 0x10000);
966
                        break;
967
                default:
×
968
                        assert_not_reached();
×
969
                }
970
                if (r < 0)
×
971
                        return r;
972

973
                _cleanup_close_ int idmap_userns_fd = userns_acquire(new_uid_map, new_uid_map, /* setgroups_deny= */ true);
×
974
                if (idmap_userns_fd < 0)
×
975
                        return log_debug_errno(idmap_userns_fd, "Failed to acquire user namespace for id mapping: %m");
×
976

977
                if (mount_setattr(mount_fd, "", AT_EMPTY_PATH,
×
978
                                  &(struct mount_attr) {
×
979
                                          .attr_set = MOUNT_ATTR_IDMAP,
980
                                          .userns_fd = idmap_userns_fd,
981
                                          .propagation = MS_PRIVATE,
982
                                  }, MOUNT_ATTR_SIZE_VER0) < 0)
983
                        return log_debug_errno(errno, "Failed to enable id mapping: %m");
×
984
        }
985

986
        if (userns_fd >= 0) {
×
987
                r = nsresource_add_mount(userns_fd, mount_fd);
×
988
                if (r < 0)
×
989
                        return r;
990
        }
991

992
        int fd_idx = sd_varlink_push_fd(link, mount_fd);
×
993
        if (fd_idx < 0)
×
994
                return fd_idx;
995

996
        TAKE_FD(mount_fd);
×
997

998
        return sd_varlink_replybo(
×
999
                        link,
1000
                        SD_JSON_BUILD_PAIR_INTEGER("mountFileDescriptor", fd_idx));
1001
}
1002

1003
/* Parsed parameters of the MakeDirectory() method call. */
typedef struct MakeDirectoryParameters {
        /* Index of the parent directory fd among the passed fds ("parentFileDescriptor" field) */
        unsigned parent_fd_idx;

        /* File name of the directory to create inside the parent ("name" field) */
        const char *name;
} MakeDirectoryParameters;
1007

1008
static int vl_method_make_directory(
×
1009
                sd_varlink *link,
1010
                sd_json_variant *parameters,
1011
                sd_varlink_method_flags_t flags,
1012
                void *userdata) {
1013

1014
        static const sd_json_dispatch_field dispatch_table[] = {
×
1015
                { "parentFileDescriptor", SD_JSON_VARIANT_UNSIGNED, sd_json_dispatch_uint,        offsetof(MakeDirectoryParameters, parent_fd_idx), SD_JSON_MANDATORY },
1016
                { "name",                 SD_JSON_VARIANT_STRING,   json_dispatch_const_filename, offsetof(MakeDirectoryParameters, name),          SD_JSON_MANDATORY },
1017
                VARLINK_DISPATCH_POLKIT_FIELD,
1018
                {}
1019
        };
1020

1021
        MakeDirectoryParameters p = {
×
1022
                .parent_fd_idx = UINT_MAX,
1023
        };
1024
        Hashmap **polkit_registry = ASSERT_PTR(userdata);
×
1025
        int r;
×
1026

1027
        r = sd_varlink_dispatch(link, parameters, dispatch_table, &p);
×
1028
        if (r != 0)
×
1029
                return r;
×
1030

1031
        if (p.parent_fd_idx == UINT_MAX)
×
1032
                return sd_varlink_error_invalid_parameter_name(link, "parentFileDescriptor");
×
1033

1034
        _cleanup_close_ int parent_fd = sd_varlink_peek_dup_fd(link, p.parent_fd_idx);
×
1035
        if (parent_fd < 0)
×
1036
                return log_debug_errno(parent_fd, "Failed to peek parent directory fd from client: %m");
×
1037

1038
        uid_t peer_uid;
×
1039
        r = sd_varlink_get_peer_uid(link, &peer_uid);
×
1040
        if (r < 0)
×
1041
                return log_debug_errno(r, "Failed to get client UID: %m");
×
1042

1043
        struct stat parent_stat;
×
1044
        if (fstat(parent_fd, &parent_stat) < 0)
×
1045
                return r;
1046

1047
        r = stat_verify_directory(&parent_stat);
×
1048
        if (r < 0)
×
1049
                return r;
1050

1051
        int fl = fd_verify_safe_flags_full(parent_fd, O_DIRECTORY);
×
1052
        if (fl < 0)
×
1053
                return log_debug_errno(fl, "Directory file descriptor has unsafe flags set: %m");
×
1054

1055
        _cleanup_free_ char *parent_path = NULL;
×
1056
        (void) fd_get_path(parent_fd, &parent_path);
×
1057

1058
        _cleanup_free_ char *new_path = parent_path ? path_join(parent_path, p.name) : NULL;
×
1059
        log_debug("Asked to make directory: %s", strna(new_path));
×
1060

1061
        const char *polkit_details[] = {
×
1062
                "directory", strna(new_path),
×
1063
                NULL,
1064
        };
1065

1066
        const char *polkit_action;
×
1067
        PolkitFlags polkit_flags;
×
1068
        if (parent_stat.st_uid != peer_uid) {
×
1069
                polkit_action = "io.systemd.mount-file-system.make-directory-untrusted";
1070
                polkit_flags = 0;
1071
        } else {
1072
                polkit_action = "io.systemd.mount-file-system.make-directory";
×
1073
                polkit_flags = POLKIT_DEFAULT_ALLOW;
×
1074
        }
1075

1076
        r = varlink_verify_polkit_async_full(
×
1077
                        link,
1078
                        /* bus= */ NULL,
1079
                        polkit_action,
1080
                        polkit_details,
1081
                        /* good_user= */ UID_INVALID,
1082
                        polkit_flags,
1083
                        polkit_registry);
1084
        if (r <= 0)
×
1085
                return r;
1086

1087
        _cleanup_free_ char *t = NULL;
×
1088
        r = tempfn_random(p.name, "mountfsd", &t);
×
1089
        if (r < 0)
×
1090
                return r;
1091

1092
        _cleanup_close_ int fd = open_mkdir_at(parent_fd, t, O_CLOEXEC, 0700);
×
1093
        if (fd < 0)
×
1094
                return fd;
1095

1096
        r = RET_NERRNO(fchmod(fd, 0700)); /* Set mode explicitly, as paranoia regarding umask games */
×
1097
        if (r < 0)
×
1098
                goto fail;
×
1099

1100
        r = RET_NERRNO(fchown(fd, FOREIGN_UID_BASE, FOREIGN_UID_BASE));
×
1101
        if (r < 0)
×
1102
                goto fail;
×
1103

1104
        r = rename_noreplace(parent_fd, t, parent_fd, p.name);
×
1105
        if (r < 0)
×
1106
                goto fail;
×
1107

1108
        t = mfree(t); /* temporary filename no longer exists */
×
1109

1110
        int fd_idx = sd_varlink_push_fd(link, fd);
×
1111
        if (fd_idx < 0) {
×
1112
                r = fd_idx;
×
1113
                goto fail;
×
1114
        }
1115

1116
        TAKE_FD(fd);
×
1117

1118
        return sd_varlink_replybo(
×
1119
                        link,
1120
                        SD_JSON_BUILD_PAIR_INTEGER("directoryFileDescriptor", fd_idx));
1121

1122
fail:
×
1123
        (void) unlinkat(parent_fd, t ?: p.name, AT_REMOVEDIR);
×
1124
        return r;
×
1125
}
1126

1127
/* Processes a single accepted connection: sets up a fresh event loop, attaches the Varlink server
 * to it, adds the connection and runs the loop until it finishes.
 *
 * server - the Varlink server the connection shall be added to
 * _fd    - the accepted connection socket; ownership is always taken, also on failure
 *
 * Returns 0 on success, a negative errno-style value on failure. */
static int process_connection(sd_varlink_server *server, int _fd) {
        _cleanup_close_ int fd = TAKE_FD(_fd); /* always take possession */
        _cleanup_(sd_varlink_close_unrefp) sd_varlink *vl = NULL;
        _cleanup_(sd_event_unrefp) sd_event *event = NULL;
        int r;

        r = sd_event_new(&event);
        if (r < 0)
                return r;

        r = sd_varlink_server_attach_event(server, event, 0);
        if (r < 0)
                return log_error_errno(r, "Failed to attach Varlink server to event loop: %m");

        /* From here on the server is bound to our event loop. Make sure to undo that on every exit path:
         * the server outlives this function and is attached to a new loop for the next connection, which
         * cannot work while it is still attached to this one. */

        r = sd_varlink_server_add_connection(server, fd, &vl);
        if (r < 0) {
                (void) sd_varlink_server_detach_event(server);
                return log_error_errno(r, "Failed to add connection: %m");
        }

        /* The connection object owns the socket now; keep our own reference to it until we are done */
        TAKE_FD(fd);
        vl = sd_varlink_ref(vl);

        r = sd_event_loop(event);
        if (r < 0) {
                (void) sd_varlink_server_detach_event(server);
                return log_error_errno(r, "Failed to run event loop: %m");
        }

        r = sd_varlink_server_detach_event(server);
        if (r < 0)
                return log_error_errno(r, "Failed to detach Varlink server from event loop: %m");

        return 0;
}
1158

1159
static int run(int argc, char *argv[]) {
×
1160
        usec_t start_time, listen_idle_usec, last_busy_usec = USEC_INFINITY;
×
1161
        _cleanup_(sd_varlink_server_unrefp) sd_varlink_server *server = NULL;
×
1162
        _cleanup_hashmap_free_ Hashmap *polkit_registry = NULL;
×
1163
        _cleanup_(pidref_done) PidRef parent = PIDREF_NULL;
×
1164
        unsigned n_iterations = 0;
×
1165
        int m, listen_fd, r;
×
1166

1167
        log_setup();
×
1168

1169
        m = sd_listen_fds(false);
×
1170
        if (m < 0)
×
1171
                return log_error_errno(m, "Failed to determine number of listening fds: %m");
×
1172
        if (m == 0)
×
1173
                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "No socket to listen on received.");
×
1174
        if (m > 1)
×
1175
                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Worker can only listen on a single socket at a time.");
×
1176

1177
        listen_fd = SD_LISTEN_FDS_START;
×
1178

1179
        r = fd_nonblock(listen_fd, false);
×
1180
        if (r < 0)
×
1181
                return log_error_errno(r, "Failed to turn off non-blocking mode for listening socket: %m");
×
1182

1183
        r = varlink_server_new(&server,
×
1184
                               SD_VARLINK_SERVER_INHERIT_USERDATA|
1185
                               SD_VARLINK_SERVER_ALLOW_FD_PASSING_INPUT|SD_VARLINK_SERVER_ALLOW_FD_PASSING_OUTPUT,
1186
                               &polkit_registry);
1187
        if (r < 0)
×
1188
                return log_error_errno(r, "Failed to allocate server: %m");
×
1189

1190
        r = sd_varlink_server_add_interface(server, &vl_interface_io_systemd_MountFileSystem);
×
1191
        if (r < 0)
×
1192
                return log_error_errno(r, "Failed to add MountFileSystem interface to varlink server: %m");
×
1193

1194
        r = sd_varlink_server_bind_method_many(
×
1195
                        server,
1196
                        "io.systemd.MountFileSystem.MountImage",     vl_method_mount_image,
1197
                        "io.systemd.MountFileSystem.MountDirectory", vl_method_mount_directory,
1198
                        "io.systemd.MountFileSystem.MakeDirectory",  vl_method_make_directory);
1199
        if (r < 0)
×
1200
                return log_error_errno(r, "Failed to bind methods: %m");
×
1201

1202
        r = sd_varlink_server_set_exit_on_idle(server, true);
×
1203
        if (r < 0)
×
1204
                return log_error_errno(r, "Failed to enable exit-on-idle mode: %m");
×
1205

1206
        r = getenv_bool("MOUNTFS_FIXED_WORKER");
×
1207
        if (r < 0)
×
1208
                return log_error_errno(r, "Failed to parse MOUNTFSD_FIXED_WORKER: %m");
×
1209
        listen_idle_usec = r ? USEC_INFINITY : LISTEN_IDLE_USEC;
×
1210

1211
        r = pidref_set_parent(&parent);
×
1212
        if (r < 0)
×
1213
                return log_error_errno(r, "Failed to acquire pidfd of parent process: %m");
×
1214

1215
        start_time = now(CLOCK_MONOTONIC);
×
1216

1217
        for (;;) {
×
1218
                _cleanup_close_ int fd = -EBADF;
×
1219
                usec_t n;
×
1220

1221
                /* Exit the worker in regular intervals, to flush out all memory use */
1222
                if (n_iterations++ > ITERATIONS_MAX) {
×
1223
                        log_debug("Exiting worker, processed %u iterations, that's enough.", n_iterations);
×
1224
                        break;
1225
                }
1226

1227
                n = now(CLOCK_MONOTONIC);
×
1228
                if (n >= usec_add(start_time, RUNTIME_MAX_USEC)) {
×
1229
                        log_debug("Exiting worker, ran for %s, that's enough.",
×
1230
                                  FORMAT_TIMESPAN(usec_sub_unsigned(n, start_time), 0));
1231
                        break;
×
1232
                }
1233

1234
                if (last_busy_usec == USEC_INFINITY)
×
1235
                        last_busy_usec = n;
1236
                else if (listen_idle_usec != USEC_INFINITY && n >= usec_add(last_busy_usec, listen_idle_usec)) {
×
1237
                        log_debug("Exiting worker, been idle for %s.",
×
1238
                                  FORMAT_TIMESPAN(usec_sub_unsigned(n, last_busy_usec), 0));
1239
                        break;
×
1240
                }
1241

1242
                (void) rename_process("systemd-mountwork: waiting...");
×
1243
                fd = RET_NERRNO(accept4(listen_fd, NULL, NULL, SOCK_NONBLOCK|SOCK_CLOEXEC));
×
1244
                (void) rename_process("systemd-mountwork: processing...");
×
1245

1246
                if (fd == -EAGAIN)
×
1247
                        continue; /* The listening socket has SO_RECVTIMEO set, hence a timeout is expected
×
1248
                                   * after a while, let's check if it's time to exit though. */
1249
                if (fd == -EINTR)
×
1250
                        continue; /* Might be that somebody attached via strace, let's just continue in that
×
1251
                                   * case */
1252
                if (fd < 0)
×
1253
                        return log_error_errno(fd, "Failed to accept() from listening socket: %m");
×
1254

1255
                if (now(CLOCK_MONOTONIC) <= usec_add(n, PRESSURE_SLEEP_TIME_USEC)) {
×
1256
                        /* We only slept a very short time? If so, let's see if there are more sockets
1257
                         * pending, and if so, let's ask our parent for more workers */
1258

1259
                        r = fd_wait_for_event(listen_fd, POLLIN, 0);
×
1260
                        if (r < 0)
×
1261
                                return log_error_errno(r, "Failed to test for POLLIN on listening socket: %m");
×
1262

1263
                        if (FLAGS_SET(r, POLLIN)) {
×
1264
                                r = pidref_kill(&parent, SIGUSR2);
×
1265
                                if (r == -ESRCH)
×
1266
                                        return log_error_errno(r, "Parent already died?");
×
1267
                                if (r < 0)
×
1268
                                        return log_error_errno(r, "Failed to send SIGUSR2 signal to parent: %m");
×
1269
                        }
1270
                }
1271

1272
                (void) process_connection(server, TAKE_FD(fd));
×
1273
                last_busy_usec = USEC_INFINITY;
×
1274
        }
1275

1276
        return 0;
1277
}
1278

1279
/* Presumably generates the program's main() as a wrapper around run() — see main-func.h. */
DEFINE_MAIN_FUNCTION(run);
×
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc