• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

systemd / systemd / 25196166722

30 Apr 2026 07:30PM UTC coverage: 72.134% (+0.3%) from 71.849%
25196166722

push

github

bluca
mkosi: update debian commit reference to 1302f123d

* 1302f123d9 Restrict wildcard for new files
* a6d0098d10 Install new files for upstream build
* ce07fd7616 d/t/boot-and-services: use coreutils tunable in apparmor test (LP: #2125614)

324804 of 450278 relevant lines covered (72.13%)

1187716.32 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0
/src/mountfsd/mountwork.c
1
/* SPDX-License-Identifier: LGPL-2.1-or-later */
2

3
#include <linux/loop.h>
4
#include <poll.h>
5
#include <stdlib.h>
6
#include <sys/mount.h>
7
#include <unistd.h>
8

9
#include "sd-daemon.h"
10
#include "sd-event.h"
11
#include "sd-varlink.h"
12

13
#include "argv-util.h"
14
#include "bus-polkit.h"
15
#include "chase.h"
16
#include "discover-image.h"
17
#include "dissect-image.h"
18
#include "env-util.h"
19
#include "errno-util.h"
20
#include "escape.h"
21
#include "fd-util.h"
22
#include "fs-util.h"
23
#include "format-util.h"
24
#include "hashmap.h"
25
#include "image-policy.h"
26
#include "io-util.h"
27
#include "iovec-util.h"
28
#include "json-util.h"
29
#include "loop-util.h"
30
#include "main-func.h"
31
#include "memory-util.h"
32
#include "mount-util.h"
33
#include "namespace-util.h"
34
#include "nsresource.h"
35
#include "nulstr-util.h"
36
#include "os-util.h"
37
#include "path-util.h"
38
#include "pidref.h"
39
#include "process-util.h"
40
#include "socket-util.h"
41
#include "stat-util.h"
42
#include "string-table.h"
43
#include "string-util.h"
44
#include "strv.h"
45
#include "tmpfile-util.h"
46
#include "time-util.h"
47
#include "uid-classification.h"
48
#include "uid-range.h"
49
#include "user-util.h"
50
#include "varlink-io.systemd.MountFileSystem.h"
51
#include "varlink-util.h"
52

53
/* Limits and timeouts of this worker. The code using them lies outside this part of the file, so the
 * per-macro notes are derived from the names only — TODO confirm against the users. */
#define ITERATIONS_MAX 64U                             /* presumably: requests handled per worker */
#define RUNTIME_MAX_USEC (5 * USEC_PER_MINUTE)         /* presumably: upper bound on worker runtime */
#define PRESSURE_SLEEP_TIME_USEC (50 * USEC_PER_MSEC)  /* presumably: pause applied under pressure */
#define LISTEN_IDLE_USEC (90 * USEC_PER_SEC)           /* presumably: idle time while listening */
57

58
static const ImagePolicy image_policy_untrusted = {
59
        .n_policies = 2,
60
        .policies = {
61
                { PARTITION_ROOT,     PARTITION_POLICY_SIGNED|PARTITION_POLICY_ABSENT },
62
                { PARTITION_USR,      PARTITION_POLICY_SIGNED|PARTITION_POLICY_ABSENT },
63
        },
64
        .default_flags = PARTITION_POLICY_IGNORE,
65
};
66

67
/* JSON dispatch callback that turns an object mapping partition designator names to mount option strings
 * into a MountOptions structure.
 *
 * userdata must point to the MountOptions* to update. A JSON null frees the current value and resets it.
 * Otherwise the previous value is freed and replaced by the freshly parsed one, which stays NULL if the
 * object has no entries. Returns 0 on success; on a malformed value or allocation failure the result of
 * json_log()/json_log_oom() is returned and the target is left untouched. */
static int json_dispatch_image_options(const char *name, sd_json_variant *variant, sd_json_dispatch_flags_t flags, void *userdata) {
        MountOptions **target = ASSERT_PTR(userdata);

        if (sd_json_variant_is_null(variant)) {
                *target = mount_options_free_all(*target);
                return 0;
        }

        if (!sd_json_variant_is_object(variant))
                return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not an object.", strna(name));

        /* Built up separately, so that a failure half-way through does not clobber the old value */
        _cleanup_(mount_options_free_allp) MountOptions *parsed = NULL;
        const char *key;
        sd_json_variant *value;
        JSON_VARIANT_OBJECT_FOREACH(key, value, variant) {
                PartitionDesignator designator = partition_designator_from_string(key);
                if (designator < 0)
                        return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "Invalid partition designator '%s'.", strna(key));

                if (!sd_json_variant_is_string(value))
                        return json_log(value, flags, SYNTHETIC_ERRNO(EINVAL), "Mount option for partition '%s' is not a string.", strna(key));

                /* Allocate the result lazily, once the first valid entry shows up */
                if (!parsed) {
                        parsed = new0(MountOptions, 1);
                        if (!parsed)
                                return json_log_oom(variant, flags);
                }

                if (free_and_strdup(&parsed->options[designator], sd_json_variant_string(value)) < 0)
                        return json_log_oom(variant, flags);
        }

        /* Everything parsed fine, now swap out the old value */
        mount_options_free_all(*target);
        *target = TAKE_PTR(parsed);
        return 0;
}
105

106
/* JSON dispatch callback that parses an image policy given as a string.
 *
 * userdata must point to the ImagePolicy* to update. A JSON null frees the current policy and resets it.
 * Otherwise the string is parsed with image_policy_from_string() (non-graceful) and, on success, replaces
 * the previous policy, which is freed. Returns 0 on success, or the result of json_log() if the value is
 * not a string or not a valid policy; in that case the target is left untouched. */
static int json_dispatch_image_policy(const char *name, sd_json_variant *variant, sd_json_dispatch_flags_t flags, void *userdata) {
        _cleanup_(image_policy_freep) ImagePolicy *parsed = NULL;
        /* ASSERT_PTR() already checks for NULL, no separate assert() needed */
        ImagePolicy **target = ASSERT_PTR(userdata);
        int r;

        if (sd_json_variant_is_null(variant)) {
                *target = image_policy_free(*target);
                return 0;
        }

        if (!sd_json_variant_is_string(variant))
                return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a string.", strna(name));

        r = image_policy_from_string(sd_json_variant_string(variant), /* graceful= */ false, &parsed);
        if (r < 0)
                return json_log(variant, flags, r, "JSON field '%s' is not a valid image policy.", strna(name));

        /* Only replace the old policy once the new one is known to be valid */
        image_policy_free(*target);
        *target = TAKE_PTR(parsed);
        return 0;
}
129

130
typedef struct MountImageParameters {
131
        unsigned image_fd_idx;
132
        unsigned userns_fd_idx;
133
        int read_only;
134
        int growfs;
135
        char *password;
136
        ImagePolicy *image_policy;
137
        MountOptions *options;
138
        bool relax_extension_release_check;
139
        bool verity_sharing;
140
        struct iovec verity_root_hash;
141
        struct iovec verity_root_hash_sig;
142
        unsigned verity_data_fd_idx;
143
} MountImageParameters;
144

145
/* Releases everything owned by a MountImageParameters structure and resets the pointer fields to the
 * values returned by the respective free helpers. The structure itself is not freed; it is used as a
 * _cleanup_() handler for a stack-allocated instance. */
static void mount_image_parameters_done(MountImageParameters *p) {
        assert(p);

        /* The two Verity blobs */
        iovec_done(&p->verity_root_hash_sig);
        iovec_done(&p->verity_root_hash);

        /* Client-supplied policy and mount options */
        p->options = mount_options_free_all(p->options);
        p->image_policy = image_policy_free(p->image_policy);

        /* The password goes through erase_and_free() rather than a plain free() */
        p->password = erase_and_free(p->password);
}
×
154

155
/* Checks that the image file descriptor passed by the client is usable.
 *
 * The fd must pass fd_verify_regular_or_block() and fd_verify_safe_flags_full() (called with O_NONBLOCK
 * as second argument). If the fd was opened O_RDONLY, p->read_only is forced to true regardless of what
 * the client requested; O_RDWR leaves it alone. Any other access mode is refused with -EBADF.
 *
 * Returns 0 on success, or a negative errno-style error. */
static int validate_image_fd(int fd, MountImageParameters *p) {
        int r, fl, access_mode;

        assert(fd >= 0);
        assert(p);

        /* Only support regular files and block devices. */
        r = fd_verify_regular_or_block(fd);
        if (r < 0)
                return r;

        fl = fd_verify_safe_flags_full(fd, O_NONBLOCK);
        if (fl < 0)
                return log_debug_errno(fl, "Image file descriptor has unsafe flags set: %m");

        access_mode = fl & O_ACCMODE_STRICT;

        if (access_mode == O_RDWR)
                return 0;

        if (access_mode != O_RDONLY)
                return -EBADF;

        /* A read-only fd cannot back a writable mount */
        p->read_only = true;
        return 0;
}
185

186
static int verify_trusted_image_fd_by_path(int fd) {
×
187
        int r;
×
188

189
        assert(fd >= 0);
×
190

191
        r = secure_getenv_bool("SYSTEMD_MOUNTFSD_TRUSTED_DIRECTORIES");
×
192
        if (r == -ENXIO)  {
×
193
                if (!DEFAULT_MOUNTFSD_TRUSTED_DIRECTORIES) {
×
194
                        log_debug("Trusted directory mechanism disabled at compile time.");
×
195
                        return false;
×
196
                }
197
        } else if (r < 0) {
×
198
                log_debug_errno(r, "Failed to parse $SYSTEMD_MOUNTFSD_TRUSTED_DIRECTORIES environment variable, not trusting any image.");
×
199
                return false;
×
200
        } else if (!r) {
×
201
                log_debug("Trusted directory mechanism disabled via $SYSTEMD_MOUNTFSD_TRUSTED_DIRECTORIES environment variable.");
×
202
                return false;
×
203
        }
204

205
        _cleanup_free_ char *p = NULL;
×
206
        r = fd_get_path(fd, &p);
×
207
        if (r < 0)
×
208
                return log_debug_errno(r, "Failed to get path of passed image file descriptor: %m");
×
209

210
        struct stat sta;
×
211
        if (fstat(fd, &sta) < 0)
×
212
                return log_debug_errno(errno, "Failed to stat() passed image file descriptor: %m");
×
213
        if (!S_ISREG(sta.st_mode)) {
×
214
                log_debug("Image '%s' is not a regular file, hence skipping trusted directory check.", p);
×
215
                return false;
×
216
        }
217

218
        log_debug("Checking if image '%s' is in trusted directories.", p);
×
219

220
        for (ImageClass c = 0; c < _IMAGE_CLASS_MAX; c++)
×
221
                NULSTR_FOREACH(s, image_search_path[c]) {
×
222
                        _cleanup_close_ int dir_fd = -EBADF, inode_fd = -EBADF;
×
223
                        _cleanup_free_ char *q = NULL;
×
224
                        struct stat stb;
×
225
                        const char *e;
×
226

227
                        r = chase(s, NULL, CHASE_SAFE|CHASE_TRIGGER_AUTOFS, &q, &dir_fd);
×
228
                        if (r == -ENOENT)
×
229
                                continue;
×
230
                        if (r < 0) {
×
231
                                log_warning_errno(r, "Failed to resolve search path '%s', ignoring: %m", s);
×
232
                                continue;
×
233
                        }
234

235
                        /* Check that the inode refers to a file immediately inside the image directory,
236
                         * i.e. not the image directory itself, and nothing further down the tree */
237
                        e = path_startswith(p, q);
×
238
                        if (isempty(e))
×
239
                                continue;
×
240

241
                        e += strspn(e, "/");
×
242
                        if (!filename_is_valid(e))
×
243
                                continue;
×
244

245
                        r = chaseat(XAT_FDROOT, dir_fd, e, CHASE_SAFE|CHASE_TRIGGER_AUTOFS, NULL, &inode_fd);
×
246
                        if (r < 0)
×
247
                                return log_error_errno(r, "Couldn't verify that specified image '%s' is in search path '%s': %m", p, s);
×
248

249
                        if (fstat(inode_fd, &stb) < 0)
×
250
                                return log_error_errno(errno, "Failed to stat image file '%s/%s': %m", q, e);
×
251

252
                        if (stat_inode_same(&sta, &stb)) {
×
253
                                log_debug("Image '%s' is *in* trusted directories.", p);
×
254
                                return true; /* Yay */
×
255
                        }
256
                }
257

258
        log_debug("Image '%s' is *not* in trusted directories.", p);
×
259
        return false;
260
}
261

262
static int determine_image_policy(
×
263
                int image_fd,
264
                bool trusted,
265
                ImagePolicy *client_policy,
266
                ImagePolicy **ret) {
267

268
        _cleanup_(image_policy_freep) ImagePolicy *envvar_policy = NULL;
×
269
        const ImagePolicy *default_policy;
×
270
        const char *envvar, *e;
×
271
        int r;
×
272

273
        assert(image_fd >= 0);
×
274
        assert(ret);
×
275

276
        if (trusted) {
×
277
                envvar = "SYSTEMD_MOUNTFSD_IMAGE_POLICY_TRUSTED";
278
                default_policy = &image_policy_allow;
279
        } else {
280
                envvar = "SYSTEMD_MOUNTFSD_IMAGE_POLICY_UNTRUSTED";
×
281
                default_policy = &image_policy_untrusted;
×
282
        }
283

284
        e = secure_getenv(envvar);
×
285
        if (e) {
×
286
                r = image_policy_from_string(e, /* graceful= */ false, &envvar_policy);
×
287
                if (r < 0)
×
288
                        return log_error_errno(r, "Failed to parse image policy supplied via $%s: %m", envvar);
×
289

290
                default_policy = envvar_policy;
×
291
        }
292

293
        return image_policy_intersect(default_policy, client_policy, ret);
×
294
}
295

296
/* Validates the user namespace file descriptor passed by the client, if there is one.
 *
 * *userns_fd may be negative, in which case there is nothing to check. Otherwise the fd must pass
 * fd_verify_safe_flags() and must refer to a user namespace. If it turns out to be our own user
 * namespace, the fd is closed and *userns_fd is reset, so that the caller treats the request as if no
 * namespace had been specified.
 *
 * Returns 0 if processing can continue, a negative errno-style error on failure, or the result of
 * sd_varlink_error_invalid_parameter_name() if the fd is not a user namespace. */
static int validate_userns(sd_varlink *link, int *userns_fd) {
        int r;

        assert(link);
        assert(userns_fd);

        if (*userns_fd < 0)
                return 0;

        r = fd_verify_safe_flags(*userns_fd);
        if (r < 0)
                return log_debug_errno(r, "User namespace file descriptor has unsafe flags set: %m");

        r = fd_is_namespace(*userns_fd, NAMESPACE_USER);
        if (r < 0)
                return r;
        if (r == 0)
                return sd_varlink_error_invalid_parameter_name(link, "userNamespaceFileDescriptor");

        /* Our own host user namespace? Then close the fd, and handle it as if none was specified. */
        r = is_our_namespace(*userns_fd, NAMESPACE_USER);
        if (r < 0)
                /* Include %m, so that the reason for the failure shows up in the log */
                return log_debug_errno(r, "Failed to determine if user namespace provided by client is our own: %m");
        if (r > 0) {
                log_debug("User namespace provided by client is our own.");
                *userns_fd = safe_close(*userns_fd);
        }

        return 0;
}
326

327
static int mount_options_to_polkit_details(const MountOptions *options, char **ret_mount_options_concat) {
×
328
        _cleanup_free_ char *mount_options_concat = NULL;
×
329
        int r;
×
330

331
        assert(ret_mount_options_concat);
×
332

333
        if (!options) {
×
334
                *ret_mount_options_concat = NULL;
×
335
                return 0;
×
336
        }
337

338
        for (PartitionDesignator i = 0; i < _PARTITION_DESIGNATOR_MAX; i++) {
×
339
                _cleanup_free_ char *escaped = NULL;
×
340

341
                if (isempty(options->options[i]))
×
342
                        continue;
×
343

344
                escaped = shell_escape(options->options[i], ":");
×
345
                if (!escaped)
×
346
                        return log_oom_debug();
×
347

348
                r = strextendf_with_separator(
×
349
                                &mount_options_concat,
350
                                ",",
351
                                "%s:%s",
352
                                partition_designator_to_string(i),
353
                                escaped);
354
                if (r < 0)
×
355
                        return r;
356
        }
357

358
        *ret_mount_options_concat = TAKE_PTR(mount_options_concat);
×
359
        return 0;
×
360
}
361

362
static int vl_method_mount_image(
×
363
                sd_varlink *link,
364
                sd_json_variant *parameters,
365
                sd_varlink_method_flags_t flags,
366
                void *userdata) {
367

368
        static const sd_json_dispatch_field dispatch_table[] = {
×
369
                { "imageFileDescriptor",         SD_JSON_VARIANT_UNSIGNED, sd_json_dispatch_uint,        offsetof(MountImageParameters, image_fd_idx),                  SD_JSON_MANDATORY },
370
                { "userNamespaceFileDescriptor", SD_JSON_VARIANT_UNSIGNED, sd_json_dispatch_uint,        offsetof(MountImageParameters, userns_fd_idx),                 0 },
371
                { "readOnly",                    SD_JSON_VARIANT_BOOLEAN,  sd_json_dispatch_tristate,    offsetof(MountImageParameters, read_only),                     0 },
372
                { "growFileSystems",             SD_JSON_VARIANT_BOOLEAN,  sd_json_dispatch_tristate,    offsetof(MountImageParameters, growfs),                        0 },
373
                { "password",                    SD_JSON_VARIANT_STRING,   sd_json_dispatch_string,      offsetof(MountImageParameters, password),                      0 },
374
                { "imagePolicy",                 SD_JSON_VARIANT_STRING,   json_dispatch_image_policy,   offsetof(MountImageParameters, image_policy),                  0 },
375
                { "mountOptions",                SD_JSON_VARIANT_OBJECT,   json_dispatch_image_options,  offsetof(MountImageParameters, options),                       0 },
376
                { "relaxExtensionReleaseChecks", SD_JSON_VARIANT_BOOLEAN,  sd_json_dispatch_stdbool,     offsetof(MountImageParameters, relax_extension_release_check), 0 },
377
                { "veritySharing",               SD_JSON_VARIANT_BOOLEAN,  sd_json_dispatch_stdbool,     offsetof(MountImageParameters, verity_sharing),                0 },
378
                { "verityDataFileDescriptor",    SD_JSON_VARIANT_UNSIGNED, sd_json_dispatch_uint,        offsetof(MountImageParameters, verity_data_fd_idx),            0 },
379
                { "verityRootHash",              SD_JSON_VARIANT_STRING,   json_dispatch_unhex_iovec,    offsetof(MountImageParameters, verity_root_hash),              0 },
380
                { "verityRootHashSignature",     SD_JSON_VARIANT_STRING,   json_dispatch_unbase64_iovec, offsetof(MountImageParameters, verity_root_hash_sig),          0 },
381
                VARLINK_DISPATCH_POLKIT_FIELD,
382
                {}
383
        };
384

385
        _cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT;
×
386
        _cleanup_(mount_image_parameters_done) MountImageParameters p = {
×
387
                .image_fd_idx = UINT_MAX,
388
                .userns_fd_idx = UINT_MAX,
389
                .verity_data_fd_idx = UINT_MAX,
390
                .read_only = -1,
391
                .growfs = -1,
392
        };
393
        _cleanup_(dissected_image_unrefp) DissectedImage *di = NULL;
×
394
        _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
×
395
        _cleanup_(sd_json_variant_unrefp) sd_json_variant *aj = NULL;
×
396
        _cleanup_close_ int image_fd = -EBADF, userns_fd = -EBADF, verity_data_fd = -EBADF;
×
397
        _cleanup_(image_policy_freep) ImagePolicy *use_policy = NULL;
×
398
        Hashmap **polkit_registry = ASSERT_PTR(userdata);
×
399
        _cleanup_free_ char *ps = NULL;
×
400
        bool image_is_trusted = false;
×
401
        int r;
×
402

403
        assert(link);
×
404
        assert(parameters);
×
405

406
        sd_json_variant_sensitive(parameters); /* might contain passwords */
×
407

408
        r = sd_varlink_dispatch(link, parameters, dispatch_table, &p);
×
409
        if (r != 0)
×
410
                return r;
411

412
        /* Verity data and roothash have to be either both set, or both unset. The sig can be set only if
413
         * the roothash is set. */
414
        if ((p.verity_data_fd_idx != UINT_MAX) != (p.verity_root_hash.iov_len > 0))
×
415
                return sd_varlink_error_invalid_parameter_name(link, "verityDataFileDescriptor");
×
416
        if (p.verity_root_hash_sig.iov_len > 0 && p.verity_root_hash.iov_len == 0)
×
417
                return sd_varlink_error_invalid_parameter_name(link, "verityRootHashSignature");
×
418

419
        if (p.image_fd_idx != UINT_MAX) {
×
420
                image_fd = sd_varlink_peek_dup_fd(link, p.image_fd_idx);
×
421
                if (image_fd < 0)
×
422
                        return log_debug_errno(image_fd, "Failed to peek image fd from client: %m");
×
423
        }
424

425
        if (p.userns_fd_idx != UINT_MAX) {
×
426
                userns_fd = sd_varlink_peek_dup_fd(link, p.userns_fd_idx);
×
427
                if (userns_fd < 0)
×
428
                        return log_debug_errno(userns_fd, "Failed to peek user namespace fd from client: %m");
×
429
        }
430

431
        r = validate_image_fd(image_fd, &p);
×
432
        if (r == -EREMOTEIO)
×
433
                return sd_varlink_errorbo(link, "io.systemd.MountFileSystem.BadFileDescriptorFlags", SD_JSON_BUILD_PAIR_STRING("parameter", "imageFileDescriptor"));
×
434
        if (r < 0)
×
435
                return r;
436

437
        r = validate_userns(link, &userns_fd);
×
438
        if (r != 0)
×
439
                return r;
440

441
        /* Mount options could be used to thwart security measures such as ACLs or SELinux so if they are
442
         * specified don't mark the image as trusted so that it requires additional privileges to use. */
443
        if (!p.options) {
×
444
                r = verify_trusted_image_fd_by_path(image_fd);
×
445
                if (r < 0)
×
446
                        return r;
447
                image_is_trusted = r;
×
448
        }
449

450
        if (p.verity_data_fd_idx != UINT_MAX) {
×
451
                verity_data_fd = sd_varlink_peek_dup_fd(link, p.verity_data_fd_idx);
×
452
                if (verity_data_fd < 0)
×
453
                        return log_debug_errno(verity_data_fd, "Failed to peek verity data fd from client: %m");
×
454

455
                r = fd_verify_safe_flags(verity_data_fd);
×
456
                if (r < 0)
×
457
                        return log_debug_errno(r, "Verity data file descriptor has unsafe flags set: %m");
×
458

459
                verity.data_path = strdup(FORMAT_PROC_FD_PATH(verity_data_fd));
×
460
                if (!verity.data_path)
×
461
                        return -ENOMEM;
462

463
                verity.designator = PARTITION_ROOT;
×
464
                verity.root_hash = TAKE_STRUCT(p.verity_root_hash);
×
465
                verity.root_hash_sig = TAKE_STRUCT(p.verity_root_hash_sig);
×
466
        }
467

468
        /* Let the polkit rule know what mount options the caller tries to use, so that rules can decide
469
         * whether to allow or deny the operation based on what the options are. */
470
        _cleanup_free_ char *mount_options_concat = NULL;
×
471
        r = mount_options_to_polkit_details(p.options, &mount_options_concat);
×
472
        if (r < 0)
×
473
                return r;
474

475
        const char *polkit_details[] = {
×
476
                "read_only", one_zero(p.read_only > 0),
×
477
                !isempty(mount_options_concat) ? "mount_options" : NULL, mount_options_concat,
×
478
                NULL,
479
        };
480

481
        const char *polkit_action, *polkit_untrusted_action;
×
482
        PolkitFlags polkit_flags;
×
483
        if (userns_fd < 0) {
×
484
                /* Mount into the host user namespace */
485
                polkit_action = "io.systemd.mount-file-system.mount-image";
486
                polkit_untrusted_action = "io.systemd.mount-file-system.mount-untrusted-image";
487
                polkit_flags = 0;
488
        } else {
489
                /* Mount into a private user namespace */
490
                polkit_action = "io.systemd.mount-file-system.mount-image-privately";
×
491
                polkit_untrusted_action = "io.systemd.mount-file-system.mount-untrusted-image-privately";
×
492

493
                /* If polkit is not around, let's allow mounting authenticated images by default */
494
                polkit_flags = POLKIT_DEFAULT_ALLOW;
×
495
        }
496

497
        /* Let's definitely acquire the regular action privilege, for mounting properly signed images */
498
        r = varlink_verify_polkit_async_full(
×
499
                        link,
500
                        /* bus= */ NULL,
501
                        p.options ? polkit_untrusted_action : polkit_action, /* Using mount options requires higher privs */
×
502
                        polkit_details,
503
                        /* good_user= */ UID_INVALID,
504
                        polkit_flags,
505
                        polkit_registry);
506
        if (r <= 0)
×
507
                return r;
508

509
        /* Generate the common dissection directory here. We are not going to use it, but the clients might,
510
         * and they likely are unprivileged, hence cannot create it themselves. Hence let's just create it
511
         * here, if it is missing. */
512
        r = get_common_dissect_directory(NULL);
×
513
        if (r < 0)
×
514
                return r;
515

516
        r = loop_device_make(
×
517
                        image_fd,
518
                        p.read_only > 0 ? O_RDONLY : -1,
×
519
                        0,
520
                        UINT64_MAX,
521
                        UINT32_MAX,
522
                        LO_FLAGS_PARTSCAN,
523
                        LOCK_EX,
524
                        &loop);
525
        if (r < 0)
×
526
                return r;
527

528
        DissectImageFlags dissect_flags =
×
529
                (p.read_only > 0 ? DISSECT_IMAGE_READ_ONLY : 0) |
×
530
                (p.growfs != 0 ? DISSECT_IMAGE_GROWFS : 0) |
×
531
                DISSECT_IMAGE_DISCARD_ANY |
532
                DISSECT_IMAGE_FSCK |
533
                DISSECT_IMAGE_ADD_PARTITION_DEVICES |
×
534
                DISSECT_IMAGE_PIN_PARTITION_DEVICES |
×
535
                (p.verity_sharing ? DISSECT_IMAGE_VERITY_SHARE : 0) |
×
536
                /* Maybe the image is a bare filesystem. Note that this requires privileges, as it is
537
                 * classified by the policy as an 'unprotected' image and will be refused otherwise. */
538
                DISSECT_IMAGE_NO_PARTITION_TABLE |
×
539
                DISSECT_IMAGE_ALLOW_USERSPACE_VERITY |
×
540
                (p.relax_extension_release_check ? DISSECT_IMAGE_RELAX_EXTENSION_CHECK : 0);
×
541

542
        /* Let's see if we have acquired the privilege to mount untrusted images already */
543
        bool polkit_have_untrusted_action =
×
544
                varlink_has_polkit_action(link, polkit_untrusted_action, polkit_details, polkit_registry);
×
545

546
        for (;;) {
×
547
                use_policy = image_policy_free(use_policy);
×
548
                ps = mfree(ps);
×
549

550
                /* We use the image policy for trusted images if either the path is below a trusted
551
                 * directory, or if we have already acquired a PK authentication that tells us that untrusted
552
                 * images are OK */
553
                bool use_trusted_policy =
×
554
                        image_is_trusted ||
555
                        polkit_have_untrusted_action;
556

557
                r = determine_image_policy(
×
558
                                image_fd,
559
                                use_trusted_policy,
560
                                p.image_policy,
561
                                &use_policy);
562
                if (r < 0)
×
563
                        return r;
564

565
                r = image_policy_to_string(use_policy, /* simplify= */ true, &ps);
×
566
                if (r < 0)
×
567
                        return r;
568

569
                log_debug("Using image policy: %s", ps);
×
570

571
                r = dissect_loop_device(
×
572
                                loop,
573
                                &verity,
574
                                p.options,
×
575
                                use_policy,
576
                                /* image_filter= */ NULL,
577
                                dissect_flags,
578
                                &di);
579
                if (r == -ENOPKG)
×
580
                        return sd_varlink_error(link, "io.systemd.MountFileSystem.IncompatibleImage", NULL);
×
581
                if (r == -ENOTUNIQ)
×
582
                        return sd_varlink_error(link, "io.systemd.MountFileSystem.MultipleRootPartitionsFound", NULL);
×
583
                if (r == -ENXIO)
×
584
                        return sd_varlink_error(link, "io.systemd.MountFileSystem.RootPartitionNotFound", NULL);
×
585
                if (r == -ERFKILL) {
×
586
                        /* The image policy refused this, let's retry after trying to get PolicyKit */
587

588
                        if (!polkit_have_untrusted_action) {
×
589
                                log_debug("Denied by image policy. Trying a stronger polkit authentication before continuing.");
×
590
                                r = varlink_verify_polkit_async_full(
×
591
                                                link,
592
                                                /* bus= */ NULL,
593
                                                polkit_untrusted_action,
594
                                                polkit_details,
595
                                                /* good_user= */ UID_INVALID,
596
                                                /* flags= */ 0,                   /* NB: the image cannot be authenticated, hence unless PK is around to allow this anyway, fail! */
597
                                                polkit_registry);
598
                                if (r <= 0 && !ERRNO_IS_NEG_PRIVILEGE(r))
×
599
                                        return r;
600
                                if (r > 0) {
×
601
                                        /* Try again, now that we know the client has enough privileges. */
602
                                        log_debug("Denied by image policy, retrying after polkit authentication.");
×
603
                                        polkit_have_untrusted_action = true;
×
604
                                        continue;
×
605
                                }
606
                        }
607

608
                        return sd_varlink_error(link, "io.systemd.MountFileSystem.DeniedByImagePolicy", NULL);
×
609
                }
610
                if (r < 0)
×
611
                        return r;
612

613
                /* Success */
614
                break;
×
615
        }
616

617
        r = dissected_image_load_verity_sig_partition(
×
618
                        di,
619
                        loop->fd,
×
620
                        &verity);
621
        if (r < 0)
×
622
                return r;
623

624
        r = dissected_image_guess_verity_roothash(
×
625
                        di,
626
                        &verity);
627
        if (r < 0)
×
628
                return r;
629

630
        for (;;) {
×
631
                use_policy = image_policy_free(use_policy);
×
632
                ps = mfree(ps);
×
633

634
                /* We use the image policy for trusted images if either the path is below a trusted
635
                 * directory, or if we have already acquired a PK authentication that tells us that untrusted
636
                 * images are OK */
637
                bool use_trusted_policy =
×
638
                        image_is_trusted ||
639
                        polkit_have_untrusted_action;
640

641
                r = determine_image_policy(
×
642
                                image_fd,
643
                                use_trusted_policy,
644
                                p.image_policy,
645
                                &use_policy);
646
                if (r < 0)
×
647
                        return r;
648

649
                r = image_policy_to_string(use_policy, /* simplify= */ true, &ps);
×
650
                if (r < 0)
×
651
                        return r;
652

653
                log_debug("Using image policy: %s", ps);
×
654

655
                r = dissected_image_decrypt(
×
656
                                di,
657
                                /* root= */ NULL,
658
                                p.password,
×
659
                                &verity,
660
                                use_policy,
661
                                dissect_flags);
662
                if (r == -EDESTADDRREQ) {
×
663
                        /* new dm-verity userspace returns ENOKEY if the dm-verity signature key is not in
664
                         * key chain which we mangle to EDESTADDRREQ. That's great. */
665

666
                        if (!polkit_have_untrusted_action) {
×
667
                                 log_debug("Missing verity key in kernel and userspace. Trying a stronger polkit authentication before continuing.");
×
668
                                 r = varlink_verify_polkit_async_full(
×
669
                                                 link,
670
                                                 /* bus= */ NULL,
671
                                                 polkit_untrusted_action,
672
                                                 polkit_details,
673
                                                 /* good_user= */ UID_INVALID,
674
                                                 /* flags= */ 0,                   /* NB: the image cannot be authenticated, hence unless PK is around to allow this anyway, fail! */
675
                                                 polkit_registry);
676
                                 if (r <= 0 && !ERRNO_IS_NEG_PRIVILEGE(r))
×
677
                                         return r;
678
                                 if (r > 0) {
×
679
                                         /* Try again, now that we know the client has enough privileges. */
680
                                         log_debug("Missing verity key in kernel and userspace, retrying after polkit authentication.");
×
681
                                         polkit_have_untrusted_action = true;
×
682
                                         continue;
×
683
                                 }
684
                         }
685

686
                        return sd_varlink_error(link, "io.systemd.MountFileSystem.KeyNotFound", NULL);
×
687
                }
688
                if (r == -EBUSY) /* DM kernel subsystem is bad at returning useful errors hence we keep retrying
×
689
                                  * under the assumption that some errors are transitional. Which the errors might
690
                                  * not actually be. After all retries failed we return EBUSY. Let's turn that into a
691
                                  * generic Verity error. It's not very helpful, could mean anything, but at least it
692
                                  * gives client a clear idea that this has to do with Verity. */
693
                        return sd_varlink_error(link, "io.systemd.MountFileSystem.VerityFailure", NULL);
×
694
                if (r < 0)
×
695
                        return r;
696

697
                /* Success */
698
                break;
×
699
        }
700

701
        r = dissected_image_mount(
×
702
                        di,
703
                        /* where= */ NULL,
704
                        /* uid_shift= */ UID_INVALID,
705
                        /* uid_range= */ UID_INVALID,
706
                        userns_fd,
707
                        dissect_flags);
708
        if (r < 0)
×
709
                return r;
710

711
        _cleanup_(sd_varlink_unrefp) sd_varlink *nsresource_link = NULL;
×
712
        for (PartitionDesignator d = 0; d < _PARTITION_DESIGNATOR_MAX; d++) {
×
713
                DissectedPartition *pp = di->partitions + d;
×
714
                int fd_idx;
×
715

716
                if (!pp->found)
×
717
                        continue;
×
718

719
                if (pp->fsmount_fd < 0)
×
720
                        continue;
×
721

722
                if (userns_fd >= 0) {
×
723

724
                        if (!nsresource_link) {
×
725
                                r = nsresource_connect(&nsresource_link);
×
726
                                if (r < 0)
×
727
                                        return r;
×
728
                        }
729

730
                        r = nsresource_add_mount(nsresource_link, userns_fd, pp->fsmount_fd);
×
731
                        if (r < 0)
×
732
                                return r;
733
                }
734

735
                fd_idx = sd_varlink_push_fd(link, pp->fsmount_fd);
×
736
                if (fd_idx < 0)
×
737
                        return fd_idx;
738

739
                TAKE_FD(pp->fsmount_fd);
×
740

741
                const char *m = partition_mountpoint_to_string(d);
×
742
                _cleanup_strv_free_ char **l = NULL;
×
743
                if (!isempty(m)) {
×
744
                        l = strv_split_nulstr(m);
×
745
                        if (!l)
×
746
                                return log_oom_debug();
×
747
                }
748

749
                r = sd_json_variant_append_arraybo(
×
750
                                &aj,
751
                                JSON_BUILD_PAIR_ENUM("designator", partition_designator_to_string(d)),
752
                                SD_JSON_BUILD_PAIR_BOOLEAN("writable", pp->rw),
753
                                SD_JSON_BUILD_PAIR_BOOLEAN("growFileSystem", pp->growfs),
754
                                SD_JSON_BUILD_PAIR_CONDITION(pp->partno > 0, "partitionNumber", SD_JSON_BUILD_INTEGER(pp->partno)),
755
                                SD_JSON_BUILD_PAIR_CONDITION(pp->architecture > 0, "architecture", SD_JSON_BUILD_STRING(architecture_to_string(pp->architecture))),
756
                                SD_JSON_BUILD_PAIR_CONDITION(!sd_id128_is_null(pp->uuid), "partitionUuid", SD_JSON_BUILD_UUID(pp->uuid)),
757
                                SD_JSON_BUILD_PAIR_STRING("fileSystemType", dissected_partition_fstype(pp)),
758
                                SD_JSON_BUILD_PAIR_CONDITION(!!pp->label, "partitionLabel", SD_JSON_BUILD_STRING(pp->label)),
759
                                SD_JSON_BUILD_PAIR_UNSIGNED("size", pp->size),
760
                                SD_JSON_BUILD_PAIR_UNSIGNED("offset", pp->offset),
761
                                SD_JSON_BUILD_PAIR_INTEGER("mountFileDescriptor", fd_idx),
762
                                JSON_BUILD_PAIR_STRV_NON_EMPTY("mountPoint", l));
763
                if (r < 0)
×
764
                        return r;
765
        }
766

767
        loop_device_relinquish(loop);
×
768

769
        return sd_varlink_replybo(
×
770
                        link,
771
                        SD_JSON_BUILD_PAIR_VARIANT("partitions", aj),
772
                        SD_JSON_BUILD_PAIR_BOOLEAN("singleFileSystem", di->single_file_system),
773
                        SD_JSON_BUILD_PAIR_STRING("imagePolicy", ps),
774
                        SD_JSON_BUILD_PAIR_UNSIGNED("imageSize", di->image_size),
775
                        SD_JSON_BUILD_PAIR_UNSIGNED("sectorSize", di->sector_size),
776
                        JSON_BUILD_PAIR_STRING_NON_EMPTY("imageName", di->image_name),
777
                        SD_JSON_BUILD_PAIR_CONDITION(!sd_id128_is_null(di->image_uuid), "imageUuid", SD_JSON_BUILD_UUID(di->image_uuid)));
778
}
779

780
/* Selects how ownership of a mounted directory is mapped into the target user namespace. */
typedef enum MountMapMode {
        /* Determine automatically from image and caller */
        MOUNT_MAP_AUTO = 0,

        /* Map caller's UID to root in namespace (map 1 UID only) */
        MOUNT_MAP_ROOT,

        /* Map foreign UID range to base in namespace (map 64K) */
        MOUNT_MAP_FOREIGN,

        /* Apply identity mapping (map 64K) */
        MOUNT_MAP_IDENTITY,

        /* Number of valid modes; also sizes the string table */
        _MOUNT_MAP_MODE_MAX,

        /* Marker for "no valid mode" */
        _MOUNT_MAP_MODE_INVALID = -EINVAL,
} MountMapMode;
788

789
/* Names of the mapping modes, indexed by MountMapMode. */
static const char *const mount_map_mode_table[_MOUNT_MAP_MODE_MAX] = {
        [MOUNT_MAP_AUTO] = "auto",
        [MOUNT_MAP_ROOT] = "root",
        [MOUNT_MAP_FOREIGN] = "foreign",
        [MOUNT_MAP_IDENTITY] = "identity",
};
795

796
/* Presumably expands to the file-local mount_map_mode_to_string() and mount_map_mode_from_string()
 * helpers (both are referenced further down in this file), backed by mount_map_mode_table. */
DEFINE_PRIVATE_STRING_TABLE_LOOKUP(mount_map_mode, MountMapMode);
×
797

798
typedef struct MountDirectoryParameters {
799
        MountMapMode mode;
800
        unsigned directory_fd_idx;
801
        unsigned userns_fd_idx;
802
        int read_only;
803
} MountDirectoryParameters;
804

805
typedef enum DirectoryOwnership {
806
        DIRECTORY_IS_ROOT_PEER_OWNED,  /* This is returned if the directory is owned by the root user and the peer is root */
807
        DIRECTORY_IS_ROOT_OWNED,       /* This is returned if the directory is owned by the root user (and the peer user is not root) */
808
        DIRECTORY_IS_PEER_OWNED,       /* This is returned if the directory is owned by the peer user (who is not root) */
809
        DIRECTORY_IS_FOREIGN_OWNED,    /* This is returned if the directory is owned by the foreign UID range */
810
        DIRECTORY_IS_OTHERWISE_OWNED,  /* This is returned if the directory is owned by something else */
811
        _DIRECTORY_OWNERSHIP_MAX,
812
        _DIRECTORY_OWNERSHIP_ERRNO_MAX = -ERRNO_MAX, /* Guarantee the whole negative errno range fits */
813
} DirectoryOwnership;
814

815
/* Picks the mapping mode to use when the client did not request one, based on who owns the base tree.
 * Returns _MOUNT_MAP_MODE_INVALID for anything that is not a known DirectoryOwnership value. */
static MountMapMode default_mount_map_mode(DirectoryOwnership ownership) {

        /* Peer owned: map the peer's UID to root in the container */
        if (ownership == DIRECTORY_IS_PEER_OWNED)
                return MOUNT_MAP_ROOT;

        /* Foreign owned: map the foreign UID range to the container's UID range */
        if (ownership == DIRECTORY_IS_FOREIGN_OWNED)
                return MOUNT_MAP_FOREIGN;

        /* Root owned or owned by something else: don't map */
        if (ownership == DIRECTORY_IS_ROOT_PEER_OWNED ||
            ownership == DIRECTORY_IS_ROOT_OWNED ||
            ownership == DIRECTORY_IS_OTHERWISE_OWNED)
                return MOUNT_MAP_IDENTITY;

        return _MOUNT_MAP_MODE_INVALID;
}
834

835
/* Defines dispatch_mount_directory_mode(), the JSON dispatch callback used for the "mode" field of
 * MountDirectory(); it is parameterized with mount_map_mode_from_string() to translate the string
 * into a MountMapMode. */
static JSON_DISPATCH_ENUM_DEFINE(dispatch_mount_directory_mode, MountMapMode, mount_map_mode_from_string);
×
836

837
static DirectoryOwnership validate_directory_fd(
×
838
                int fd,
839
                const char *path, /* purely for logging purposes */
840
                uid_t peer_uid,
841
                uid_t *ret_current_owner_uid) {
842

843
        int r, fl;
×
844

845
        assert(fd >= 0);
×
846
        assert(uid_is_valid(peer_uid));
×
847
        assert(ret_current_owner_uid);
×
848

849
        /* Checks if the specified directory fd looks sane. Returns a DirectoryOwnership that categorizes the
850
         * ownership situation in comparison to the peer's UID.
851
         *
852
         * Note one key difference to image validation (as implemented above): for regular files if the
853
         * client provided us with an open fd it implies the client has access, as well as what kind of
854
         * access (i.e. ro or rw). But for directories this doesn't work the same way, as directories are
855
         * always opened read-only only. Hence we use a different mechanism to validate access to them: we
856
         * check if the directory is owned by the peer UID or by the foreign UID range (in the latter case
857
         * one of the parent directories must be owned by the peer though). */
858

859
        struct statx stx;
×
860
        r = xstatx_full(fd,
×
861
                        /* path= */ NULL,
862
                        AT_EMPTY_PATH,
863
                        /* xstatx_flags= */ XSTATX_MNT_ID_BEST,
864
                        /* mandatory_mask= */ STATX_TYPE|STATX_UID|STATX_INO,
865
                        /* optional_mask= */ 0,
866
                        /* mandatory_attributes= */ STATX_ATTR_MOUNT_ROOT,
867
                        &stx);
868
        if (r < 0)
×
869
                return log_debug_errno(r, "Failed to statx() directory fd: %m");
×
870

871
        r = statx_verify_directory(&stx);
×
872
        if (r < 0)
×
873
                return r;
874

875
        fl = fd_verify_safe_flags_full(fd, O_DIRECTORY|O_PATH);
×
876
        if (fl < 0)
×
877
                return log_debug_errno(fl, "Directory file descriptor has unsafe flags set: %m");
×
878

879
        if (stx.stx_uid == 0) {
×
880
                *ret_current_owner_uid = stx.stx_uid;
×
881
                if (peer_uid == 0) {
×
882
                        log_debug("Directory file descriptor points to root owned directory (%s), who is also the peer.", strna(path));
×
883
                        return DIRECTORY_IS_ROOT_PEER_OWNED;
×
884
                }
885
                log_debug("Directory file descriptor points to root owned directory (%s).", strna(path));
×
886
                return DIRECTORY_IS_ROOT_OWNED;
×
887
        }
888
        if (stx.stx_uid == peer_uid) {
×
889
                log_debug("Directory file descriptor points to peer owned directory (%s).", strna(path));
×
890
                *ret_current_owner_uid = stx.stx_uid;
×
891
                return DIRECTORY_IS_PEER_OWNED;
×
892
        }
893

894
        /* For bind mounted directories we check if they are either owned by the client's UID, or by the
895
         * foreign UID set, but in that case the parent directory must be owned by the client's UID, or some
896
         * directory iteratively up the chain */
897

898
        _cleanup_close_ int parent_fd = -EBADF;
×
899
        unsigned n_level;
900
        for (n_level = 0; n_level < 16; n_level++) {
×
901
                /* Do not go above bind mounts */
902
                if (FLAGS_SET(stx.stx_attributes, STATX_ATTR_MOUNT_ROOT)) {
×
903
                        log_debug("Directory is a mount point, not checking for parent's ownership.");
×
904
                        *ret_current_owner_uid = stx.stx_uid;
×
905
                        return DIRECTORY_IS_OTHERWISE_OWNED;
×
906
                }
907

908
                /* Stop iteration if we find a directory up the tree that is neither owned by the user, nor is from the foreign UID range */
909
                if (!uid_is_foreign(stx.stx_uid) || !gid_is_foreign(stx.stx_gid)) {
×
910
                        log_debug("Directory file descriptor points to directory which itself or its parents is neither owned by foreign UID range nor by the user.");
×
911
                        *ret_current_owner_uid = stx.stx_uid;
×
912
                        return DIRECTORY_IS_OTHERWISE_OWNED;
×
913
                }
914

915
                /* If the peer is root, then it doesn't matter if we find a parent owned by root, let's shortcut things. */
916
                if (peer_uid == 0) {
×
917
                        log_debug("Directory referenced by file descriptor is owned by foreign UID range, and peer is root.");
×
918
                        *ret_current_owner_uid = stx.stx_uid;
×
919
                        return DIRECTORY_IS_FOREIGN_OWNED;
×
920
                }
921

922
                /* Go one level up */
923
                _cleanup_close_ int new_parent_fd = openat(fd, "..", O_DIRECTORY|O_PATH|O_CLOEXEC);
×
924
                if (new_parent_fd < 0)
×
925
                        return log_debug_errno(errno, "Failed to open parent directory of directory file descriptor: %m");
×
926

927
                struct statx new_stx;
×
928
                r = xstatx_full(new_parent_fd,
×
929
                                /* path= */ NULL,
930
                                AT_EMPTY_PATH,
931
                                /* xstatx_flags= */ XSTATX_MNT_ID_BEST,
932
                                /* mandatory_mask= */ STATX_UID|STATX_INO,
933
                                /* optional_mask= */ 0,
934
                                /* mandatory_attributes= */ STATX_ATTR_MOUNT_ROOT,
935
                                &new_stx);
936
                if (r < 0)
×
937
                        return log_debug_errno(r, "Failed to statx() parent directory of directory file descriptor: %m");
×
938

939
                /* Safety check to see if we hit the root dir */
940
                if (statx_inode_same(&stx, &new_stx)) {
×
941
                        log_debug("Directory file descriptor is owned by foreign UID range, but didn't find parent directory that is owned by peer among ancestors.");
×
942
                        *ret_current_owner_uid = stx.stx_uid;
×
943
                        return DIRECTORY_IS_OTHERWISE_OWNED;
×
944
                }
945

946
                r = statx_mount_same(&stx, &new_stx);
×
947
                if (r < 0)
×
948
                        return log_debug_errno(r, "Failed to compare mount IDs: %m");
×
949
                if (!r) {
×
950
                        /* NB, this check is probably redundant, given we also check
951
                         * STATX_ATTR_MOUNT_ROOT. The only reason we have it here is to provide extra safety
952
                         * in case the mount tree is rearranged concurrently with our traversal, so that
953
                         * STATX_ATTR_MOUNT_ROOT might be out of date. */
954
                        log_debug("Won't cross mount boundaries, not checking for parent's ownership.");
×
955
                        *ret_current_owner_uid = stx.stx_uid;
×
956
                        return DIRECTORY_IS_OTHERWISE_OWNED;
×
957
                }
958

959
                if (new_stx.stx_uid == peer_uid) { /* Parent inode is owned by the peer. That's good! Everything's fine. */
×
960
                        log_debug("Directory file descriptor is owned by foreign UID range, and ancestor is owned by peer.");
×
961
                        *ret_current_owner_uid = stx.stx_uid;
×
962
                        return DIRECTORY_IS_FOREIGN_OWNED;
×
963
                }
964

965
                close_and_replace(parent_fd, new_parent_fd);
×
966
                stx = new_stx;
×
967
        }
968

969
        log_debug("Failed to find peer owned parent directory after %u levels, refusing.", n_level);
×
970
        *ret_current_owner_uid = stx.stx_uid;
×
971
        return DIRECTORY_IS_OTHERWISE_OWNED;
×
972
}
973

974
/* Clones the mount tree of 'directory_fd' via open_tree_try_drop_idmap(). If that fails with EINVAL
 * and the varlink peer lives in a different mount namespace than we do, the attempt is repeated from a
 * helper process that has joined the peer's mount namespace, and the resulting fd is passed back to us
 * over a socket pair.
 *
 * 'directory_path' is only used for log messages. Returns the new mount fd on success (the caller
 * takes ownership), or a negative errno-style value on failure. */
static int open_tree_try_drop_idmap_harder(sd_varlink *link, int directory_fd, const char *directory_path) {
        int r;

        /* First attempt: directly from our own context */
        _cleanup_close_ int mount_fd = open_tree_try_drop_idmap(
                        directory_fd,
                        "",
                        OPEN_TREE_CLONE|OPEN_TREE_CLOEXEC|AT_SYMLINK_NOFOLLOW|AT_EMPTY_PATH);
        if (mount_fd >= 0)
                return TAKE_FD(mount_fd);
        if (mount_fd != -EINVAL)
                return log_debug_errno(mount_fd, "Failed to issue open_tree() of provided directory '%s': %m", strna(directory_path));

        /* We got EINVAL. Find out whether the peer is in another mount namespace. */
        _cleanup_(pidref_done) PidRef peer = PIDREF_NULL;
        r = varlink_get_peer_pidref(link, &peer);
        if (r < 0)
                return r;

        _cleanup_close_ int peer_mntns_fd = pidref_namespace_open_by_type(&peer, NAMESPACE_MOUNT);
        if (peer_mntns_fd < 0)
                return log_debug_errno(peer_mntns_fd, "Failed to open mount namespace of peer: %m");

        r = is_our_namespace(peer_mntns_fd, NAMESPACE_MOUNT);
        if (r < 0)
                return log_debug_errno(r, "Failed to check if peer is in same mount namespace: %m");
        if (r > 0) /* Same namespace, so the namespace is not the reason: report the original EINVAL */
                return log_debug_errno(mount_fd, "Failed to issue open_tree() of provided directory '%s': %m", strna(directory_path));

        /* The peer is in a different mount namespace. open_tree() will fail with EINVAL on directory fds
         * from a different mount namespace, so we need to fork off a child process that joins the peer's
         * mount namespace and calls open_tree() there. */

        _cleanup_close_pair_ int error_pipe[2] = EBADF_PAIR;
        _cleanup_close_pair_ int fd_channel[2] = EBADF_PAIR;

        if (pipe2(error_pipe, O_CLOEXEC) < 0)
                return log_debug_errno(errno, "Failed to create pipe: %m");

        if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, fd_channel) < 0)
                return log_debug_errno(errno, "Failed to create socket pair: %m");

        _cleanup_(pidref_done) PidRef helper = PIDREF_NULL;
        r = namespace_fork(
                        "(sd-opentreens)",
                        "(sd-opentree)",
                        FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGKILL,
                        /* pidns_fd= */ -EBADF,
                        peer_mntns_fd,
                        /* netns_fd= */ -EBADF,
                        /* userns_fd= */ -EBADF,
                        /* root_fd= */ -EBADF,
                        &helper);
        if (r < 0)
                return log_debug_errno(r, "Failed to fork into peer's mount namespace: %m");
        if (r == 0) {
                /* Child: keep only the write ends, retry open_tree(), then hand over the result */
                error_pipe[0] = safe_close(error_pipe[0]);
                fd_channel[0] = safe_close(fd_channel[0]);

                mount_fd = open_tree_try_drop_idmap(
                                directory_fd,
                                "",
                                OPEN_TREE_CLONE|OPEN_TREE_CLOEXEC|AT_SYMLINK_NOFOLLOW|AT_EMPTY_PATH);
                if (mount_fd < 0) {
                        log_debug_errno(mount_fd, "Failed to issue open_tree() of provided directory '%s': %m", strna(directory_path));
                        report_errno_and_exit(error_pipe[1], mount_fd);
                }

                r = send_one_fd(fd_channel[1], mount_fd, /* flags= */ 0);
                if (r < 0) {
                        log_debug_errno(r, "Failed to send mount fd: %m");
                        report_errno_and_exit(error_pipe[1], r);
                }

                _exit(EXIT_SUCCESS);
        }

        /* Parent: keep only the read ends */
        error_pipe[1] = safe_close(error_pipe[1]);
        fd_channel[1] = safe_close(fd_channel[1]);

        r = pidref_wait_for_terminate_and_check("(sd-opentreens)", &helper, /* flags= */ 0);
        if (r < 0)
                return log_debug_errno(r, "Failed to wait for child: %m");

        /* Pick up any error the child reported before looking for the fd */
        r = read_errno(error_pipe[0]);
        if (r < 0)
                return r;

        /* The child has already exited, hence whatever it sent is queued by now: don't block */
        mount_fd = receive_one_fd(fd_channel[0], MSG_DONTWAIT);
        if (mount_fd < 0)
                return log_debug_errno(mount_fd, "Failed to receive mount fd from child: %m");

        return TAKE_FD(mount_fd);
}
1066

1067
static int vl_method_mount_directory(
×
1068
                sd_varlink *link,
1069
                sd_json_variant *parameters,
1070
                sd_varlink_method_flags_t flags,
1071
                void *userdata) {
1072

1073
        static const sd_json_dispatch_field dispatch_table[] = {
×
1074
                { "mode",                        SD_JSON_VARIANT_STRING,   dispatch_mount_directory_mode, offsetof(MountDirectoryParameters, mode),             0                 },
1075
                { "directoryFileDescriptor",     SD_JSON_VARIANT_UNSIGNED, sd_json_dispatch_uint,         offsetof(MountDirectoryParameters, directory_fd_idx), SD_JSON_MANDATORY },
1076
                { "userNamespaceFileDescriptor", SD_JSON_VARIANT_UNSIGNED, sd_json_dispatch_uint,         offsetof(MountDirectoryParameters, userns_fd_idx),    0                 },
1077
                { "readOnly",                    SD_JSON_VARIANT_BOOLEAN,  sd_json_dispatch_tristate,     offsetof(MountDirectoryParameters, read_only),        0                 },
1078
                VARLINK_DISPATCH_POLKIT_FIELD,
1079
                {}
1080
        };
1081

1082
        MountDirectoryParameters p = {
×
1083
                .mode = MOUNT_MAP_AUTO,
1084
                .directory_fd_idx = UINT_MAX,
1085
                .userns_fd_idx = UINT_MAX,
1086
                .read_only = -1,
1087
        };
1088
        _cleanup_close_ int directory_fd = -EBADF, userns_fd = -EBADF;
×
1089
        Hashmap **polkit_registry = ASSERT_PTR(userdata);
×
1090
        int r;
×
1091

1092
        r = sd_varlink_dispatch(link, parameters, dispatch_table, &p);
×
1093
        if (r != 0)
×
1094
                return r;
1095

1096
        if (p.directory_fd_idx == UINT_MAX)
×
1097
                return sd_varlink_error_invalid_parameter_name(link, "directoryFileDescriptor");
×
1098

1099
        directory_fd = sd_varlink_peek_dup_fd(link, p.directory_fd_idx);
×
1100
        if (directory_fd < 0)
×
1101
                return log_debug_errno(directory_fd, "Failed to peek directory fd from client: %m");
×
1102

1103
        if (p.userns_fd_idx != UINT_MAX) {
×
1104
                userns_fd = sd_varlink_peek_dup_fd(link, p.userns_fd_idx);
×
1105
                if (userns_fd < 0)
×
1106
                        return log_debug_errno(userns_fd, "Failed to peek user namespace fd from client: %m");
×
1107
        }
1108

1109
        uid_t peer_uid;
×
1110
        r = sd_varlink_get_peer_uid(link, &peer_uid);
×
1111
        if (r < 0)
×
1112
                return log_debug_errno(r, "Failed to get client UID: %m");
×
1113

1114
        /* Get path of the fd, to improve logging */
1115
        _cleanup_free_ char *directory_path = NULL;
×
1116
        (void) fd_get_path(directory_fd, &directory_path);
×
1117

1118
        uid_t current_owner_uid;
×
1119
        DirectoryOwnership owned_by = validate_directory_fd(directory_fd, directory_path, peer_uid, &current_owner_uid);
×
1120
        if (owned_by == -EREMOTEIO)
×
1121
                return sd_varlink_errorbo(link, "io.systemd.MountFileSystem.BadFileDescriptorFlags", SD_JSON_BUILD_PAIR_STRING("parameter", "directoryFileDescriptor"));
×
1122
        if (owned_by < 0)
×
1123
                return owned_by;
1124

1125
        r = validate_userns(link, &userns_fd);
×
1126
        if (r != 0)
×
1127
                return r;
1128

1129
        /* If no mode is specified, pick sensible default */
1130
        if (p.mode <= 0) {
×
1131
                p.mode = default_mount_map_mode(owned_by);
×
1132
                assert(p.mode > 0);
×
1133
        }
1134

1135
        log_debug("Mounting '%s' with mapping mode: %s", strna(directory_path), mount_map_mode_to_string(p.mode));
×
1136

1137
        const char *polkit_details[] = {
×
1138
                "read_only", one_zero(p.read_only > 0),
×
1139
                "directory", strna(directory_path),
×
1140
                NULL,
1141
        };
1142

1143
        const char *polkit_action, *polkit_untrusted_action;
×
1144
        PolkitFlags polkit_flags;
×
1145
        if (userns_fd < 0) {
×
1146
                /* Mount into the host user namespace */
1147
                polkit_action = "io.systemd.mount-file-system.mount-directory";
1148
                polkit_untrusted_action = "io.systemd.mount-file-system.mount-untrusted-directory";
1149
                polkit_flags = 0;
1150
        } else {
1151
                /* Mount into a private user namespace */
1152
                polkit_action = "io.systemd.mount-file-system.mount-directory-privately";
×
1153
                polkit_untrusted_action = "io.systemd.mount-file-system.mount-untrusted-directory-privately";
×
1154

1155
                /* If polkit is not around, let's allow mounting authenticated images by default */
1156
                polkit_flags = POLKIT_DEFAULT_ALLOW;
×
1157
        }
1158

1159
        /* We consider a directory "trusted" if it is owned by the peer or the foreign UID range */
1160
        bool trusted_directory = IN_SET(owned_by, DIRECTORY_IS_ROOT_PEER_OWNED, DIRECTORY_IS_PEER_OWNED, DIRECTORY_IS_FOREIGN_OWNED);
×
1161

1162
        /* Let's definitely acquire the regular action privilege, for mounting properly signed images */
1163
        r = varlink_verify_polkit_async_full(
×
1164
                        link,
1165
                        /* bus= */ NULL,
1166
                        trusted_directory ? polkit_action : polkit_untrusted_action,
1167
                        polkit_details,
1168
                        /* good_user= */ UID_INVALID,
1169
                        trusted_directory ? polkit_flags : 0,
1170
                        polkit_registry);
1171
        if (r <= 0)
×
1172
                return r;
1173

1174
        /* Generate the common dissection directory here. We are not going to use it, but the clients might,
1175
         * and they likely are unprivileged, hence cannot create it themselves. Hence let's just create it
1176
         * here, if it is missing. */
1177
        r = get_common_dissect_directory(NULL);
×
1178
        if (r < 0)
×
1179
                return r;
1180

1181
        _cleanup_close_ int mount_fd = open_tree_try_drop_idmap_harder(link, directory_fd, directory_path);
×
1182
        if (mount_fd < 0)
×
1183
                return mount_fd;
1184

1185
        /* MOUNT_ATTR_IDMAP has possibly been cleared. Let's verify that the underlying data matches our expectations. */
1186
        struct stat unmapped_st;
×
1187
        if (fstat(mount_fd, &unmapped_st) < 0)
×
1188
                return log_debug_errno(errno, "Failed to stat unmapped inode: %m");
×
1189

1190
        r = stat_verify_directory(&unmapped_st);
×
1191
        if (r < 0)
×
1192
                return r;
1193

1194
        /* For now, let's simply refuse things if dropping the idmapping changed anything. For now that
1195
         * should be good enough, because the primary usecase for this (homed) will mount the foreign UID
1196
         * range 1:1. */
1197
        if (unmapped_st.st_uid != current_owner_uid)
×
1198
                return log_debug_errno(SYNTHETIC_ERRNO(EPERM), "Owner UID of mount after clearing ID mapping not the same anymore, refusing.");
×
1199

1200
        if (p.read_only > 0 && mount_setattr(
×
1201
                            mount_fd, "", AT_EMPTY_PATH,
1202
                            &(struct mount_attr) {
×
1203
                                    .attr_set = MOUNT_ATTR_RDONLY,
1204
                            }, MOUNT_ATTR_SIZE_VER0) < 0)
1205
                return log_debug_errno(errno, "Failed to enable read-only mode: %m");
×
1206

1207
        if (p.mode != MOUNT_MAP_IDENTITY) {
×
1208
                uid_t start;
×
1209

1210
                if (userns_fd >= 0) {
×
1211
                        /* Load ranges without coalescing to preserve the 1:1 correspondence
1212
                         * between inside and outside entries */
1213
                        _cleanup_(uid_range_freep) UIDRange *uid_range_outside = NULL, *uid_range_inside = NULL, *gid_range_outside = NULL, *gid_range_inside = NULL;
×
1214
                        r = uid_range_load_userns_by_fd_full(userns_fd, UID_RANGE_USERNS_OUTSIDE, /* coalesce= */ false, &uid_range_outside);
×
1215
                        if (r < 0)
×
1216
                                return log_debug_errno(r, "Failed to load outside UID range of provided userns: %m");
×
1217

1218
                        r = uid_range_load_userns_by_fd_full(userns_fd, UID_RANGE_USERNS_INSIDE, /* coalesce= */ false, &uid_range_inside);
×
1219
                        if (r < 0)
×
1220
                                return log_debug_errno(r, "Failed to load inside UID range of provided userns: %m");
×
1221

1222
                        r = uid_range_load_userns_by_fd_full(userns_fd, GID_RANGE_USERNS_OUTSIDE, /* coalesce= */ false, &gid_range_outside);
×
1223
                        if (r < 0)
×
1224
                                return log_debug_errno(r, "Failed to load outside GID range of provided userns: %m");
×
1225

1226
                        r = uid_range_load_userns_by_fd_full(userns_fd, GID_RANGE_USERNS_INSIDE, /* coalesce= */ false, &gid_range_inside);
×
1227
                        if (r < 0)
×
1228
                                return log_debug_errno(r, "Failed to load inside GID range of provided userns: %m");
×
1229

1230
                        /* UID and GID mappings must match */
1231
                        if (!uid_range_equal(uid_range_outside, gid_range_outside) ||
×
1232
                            !uid_range_equal(uid_range_inside, gid_range_inside))
×
1233
                                return sd_varlink_error_invalid_parameter_name(link, "userNamespaceFileDescriptor");
×
1234

1235
                        /* Must have at least one entry, and inside/outside must have matching entry counts */
1236
                        if (uid_range_is_empty(uid_range_outside) ||
×
1237
                            uid_range_outside->n_entries != uid_range_inside->n_entries)
×
1238
                                return sd_varlink_error_invalid_parameter_name(link, "userNamespaceFileDescriptor");
×
1239

1240
                        /* The first range must be a root UID in the transient range (i.e. aligned
1241
                         * to a 64K boundary) and mapped to 0 inside the user namespace (size 65536) */
1242
                        if (!uid_is_transient(uid_range_outside->entries[0].start) ||
×
1243
                            (uid_range_outside->entries[0].start & 0xFFFFU) != 0 ||
×
1244
                            uid_range_outside->entries[0].nr != NSRESOURCE_UIDS_64K ||
×
1245
                            uid_range_inside->entries[0].start != 0 ||
×
1246
                            uid_range_inside->entries[0].nr != NSRESOURCE_UIDS_64K)
×
1247
                                return sd_varlink_error_invalid_parameter_name(link, "userNamespaceFileDescriptor");
×
1248

1249
                        /* All remaining entries must also be root UIDs in the transient range and
1250
                         * mapped 1:1, which identifies them as delegated ranges. The last entry
1251
                         * may also be the root UID in the foreign UID range. */
1252
                        for (size_t i = 1; i < uid_range_outside->n_entries; i++) {
×
1253
                                bool is_last = i + 1 == uid_range_outside->n_entries;
×
1254
                                uid_t entry_start = uid_range_outside->entries[i].start;
×
1255

1256
                                if (!(uid_is_transient(entry_start) ||
×
1257
                                      (is_last && uid_is_foreign(entry_start))) ||
×
1258
                                    (entry_start & 0xFFFFU) != 0 ||
×
1259
                                    uid_range_outside->entries[i].nr != NSRESOURCE_UIDS_64K ||
×
1260
                                    uid_range_outside->entries[i].start != uid_range_inside->entries[i].start ||
×
1261
                                    uid_range_outside->entries[i].nr != uid_range_inside->entries[i].nr)
×
1262
                                        return sd_varlink_error_invalid_parameter_name(link, "userNamespaceFileDescriptor");
×
1263
                        }
1264

1265
                        start = uid_range_outside->entries[0].start;
×
1266
                } else
1267
                        start = 0;
1268

1269
                _cleanup_free_ char *new_uid_map = NULL;
×
1270
                switch (p.mode) {
×
1271
                case MOUNT_MAP_ROOT:
×
1272
                        r = strextendf(&new_uid_map, UID_FMT " " UID_FMT " " UID_FMT,
×
1273
                                       peer_uid, start, (uid_t) 1);
1274
                        break;
1275
                case MOUNT_MAP_FOREIGN:
×
1276
                        r = strextendf(&new_uid_map, UID_FMT " " UID_FMT " " UID_FMT,
×
1277
                                       (uid_t) FOREIGN_UID_MIN, start, (uid_t) 0x10000);
1278
                        break;
1279
                default:
×
1280
                        assert_not_reached();
×
1281
                }
1282
                if (r < 0)
×
1283
                        return r;
1284

1285
                _cleanup_close_ int idmap_userns_fd = userns_acquire(new_uid_map, new_uid_map, /* setgroups_deny= */ true);
×
1286
                if (idmap_userns_fd < 0)
×
1287
                        return log_debug_errno(idmap_userns_fd, "Failed to acquire user namespace for id mapping: %m");
×
1288

1289
                if (mount_setattr(mount_fd, "", AT_EMPTY_PATH,
×
1290
                                  &(struct mount_attr) {
×
1291
                                          .attr_set = MOUNT_ATTR_IDMAP,
1292
                                          .userns_fd = idmap_userns_fd,
1293
                                          .propagation = MS_PRIVATE,
1294
                                  }, MOUNT_ATTR_SIZE_VER0) < 0)
1295
                        return log_debug_errno(errno, "Failed to enable id mapping: %m");
×
1296
        }
1297

1298
        if (userns_fd >= 0) {
×
1299
                r = nsresource_add_mount(/* vl= */ NULL, userns_fd, mount_fd);
×
1300
                if (r < 0)
×
1301
                        return r;
1302
        }
1303

1304
        int fd_idx = sd_varlink_push_fd(link, mount_fd);
×
1305
        if (fd_idx < 0)
×
1306
                return fd_idx;
1307

1308
        TAKE_FD(mount_fd);
×
1309

1310
        return sd_varlink_replybo(
×
1311
                        link,
1312
                        SD_JSON_BUILD_PAIR_INTEGER("mountFileDescriptor", fd_idx));
1313
}
1314

1315
/* Parsed parameters of the io.systemd.MountFileSystem.MakeDirectory method call.
 * Filled in by the JSON dispatch table of vl_method_make_directory(). */
typedef struct MakeDirectoryParameters {
        unsigned parent_fd_idx; /* index of the parent directory fd among the fds passed by the client */
        const char *name;       /* file name of the directory to create below the parent */
        mode_t mode;            /* requested access mode */
} MakeDirectoryParameters;
1320

1321
static int vl_method_make_directory(
×
1322
                sd_varlink *link,
1323
                sd_json_variant *parameters,
1324
                sd_varlink_method_flags_t flags,
1325
                void *userdata) {
1326

1327
        static const sd_json_dispatch_field dispatch_table[] = {
×
1328
                { "parentFileDescriptor", _SD_JSON_VARIANT_TYPE_INVALID, sd_json_dispatch_uint,        offsetof(MakeDirectoryParameters, parent_fd_idx), SD_JSON_MANDATORY },
1329
                { "name",                 SD_JSON_VARIANT_STRING,        json_dispatch_const_filename, offsetof(MakeDirectoryParameters, name),          SD_JSON_MANDATORY },
1330
                { "mode",                 _SD_JSON_VARIANT_TYPE_INVALID, json_dispatch_access_mode,    offsetof(MakeDirectoryParameters, mode),          SD_JSON_STRICT    },
1331
                VARLINK_DISPATCH_POLKIT_FIELD,
1332
                {}
1333
        };
1334

1335
        MakeDirectoryParameters p = {
×
1336
                .parent_fd_idx = UINT_MAX,
1337
                .mode = MODE_INVALID,
1338
        };
1339
        Hashmap **polkit_registry = ASSERT_PTR(userdata);
×
1340
        int r;
×
1341

1342
        r = sd_varlink_dispatch(link, parameters, dispatch_table, &p);
×
1343
        if (r != 0)
×
1344
                return r;
×
1345

1346
        if (p.mode == MODE_INVALID)
×
1347
                p.mode = 0700;
×
1348
        else
1349
                p.mode &= 0775; /* refuse generating world writable dirs */
×
1350

1351
        if (p.parent_fd_idx == UINT_MAX)
×
1352
                return sd_varlink_error_invalid_parameter_name(link, "parentFileDescriptor");
×
1353

1354
        _cleanup_close_ int parent_fd = sd_varlink_peek_dup_fd(link, p.parent_fd_idx);
×
1355
        if (parent_fd < 0)
×
1356
                return log_debug_errno(parent_fd, "Failed to peek parent directory fd from client: %m");
×
1357

1358
        uid_t peer_uid;
×
1359
        r = sd_varlink_get_peer_uid(link, &peer_uid);
×
1360
        if (r < 0)
×
1361
                return log_debug_errno(r, "Failed to get client UID: %m");
×
1362

1363
        struct stat parent_stat;
×
1364
        if (fstat(parent_fd, &parent_stat) < 0)
×
1365
                return log_debug_errno(errno, "Failed to fstat parent directory fd: %m");
×
1366

1367
        r = stat_verify_directory(&parent_stat);
×
1368
        if (r < 0)
×
1369
                return r;
1370

1371
        int fl = fd_verify_safe_flags_full(parent_fd, O_DIRECTORY);
×
1372
        if (fl < 0)
×
1373
                return log_debug_errno(fl, "Directory file descriptor has unsafe flags set: %m");
×
1374

1375
        _cleanup_free_ char *parent_path = NULL;
×
1376
        (void) fd_get_path(parent_fd, &parent_path);
×
1377

1378
        _cleanup_free_ char *new_path = parent_path ? path_join(parent_path, p.name) : NULL;
×
1379
        log_debug("Asked to make directory: %s", strna(new_path));
×
1380

1381
        const char *polkit_details[] = {
×
1382
                "directory", strna(new_path),
×
1383
                NULL,
1384
        };
1385

1386
        const char *polkit_action;
×
1387
        PolkitFlags polkit_flags;
×
1388
        if (parent_stat.st_uid != peer_uid) {
×
1389
                polkit_action = "io.systemd.mount-file-system.make-directory-untrusted";
1390
                polkit_flags = 0;
1391
        } else {
1392
                polkit_action = "io.systemd.mount-file-system.make-directory";
×
1393
                polkit_flags = POLKIT_DEFAULT_ALLOW;
×
1394
        }
1395

1396
        r = varlink_verify_polkit_async_full(
×
1397
                        link,
1398
                        /* bus= */ NULL,
1399
                        polkit_action,
1400
                        polkit_details,
1401
                        /* good_user= */ UID_INVALID,
1402
                        polkit_flags,
1403
                        polkit_registry);
1404
        if (r <= 0)
×
1405
                return r;
1406

1407
        _cleanup_free_ char *t = NULL;
×
1408
        r = tempfn_random(p.name, "mountfsd", &t);
×
1409
        if (r < 0)
×
1410
                return r;
1411

1412
        _cleanup_close_ int fd = open_mkdir_at(parent_fd, t, O_CLOEXEC, p.mode);
×
1413
        if (fd < 0)
×
1414
                return fd;
1415

1416
        r = RET_NERRNO(fchmod(fd, p.mode)); /* Set mode explicitly, as paranoia regarding umask games */
×
1417
        if (r < 0)
×
1418
                goto fail;
×
1419

1420
        r = RET_NERRNO(fchown(fd, FOREIGN_UID_BASE, FOREIGN_UID_BASE));
×
1421
        if (r < 0)
×
1422
                goto fail;
×
1423

1424
        r = rename_noreplace(parent_fd, t, parent_fd, p.name);
×
1425
        if (r < 0)
×
1426
                goto fail;
×
1427

1428
        t = mfree(t); /* temporary filename no longer exists */
×
1429

1430
        int fd_idx = sd_varlink_push_fd(link, fd);
×
1431
        if (fd_idx < 0) {
×
1432
                r = fd_idx;
×
1433
                goto fail;
×
1434
        }
1435

1436
        TAKE_FD(fd);
×
1437

1438
        return sd_varlink_replybo(
×
1439
                        link,
1440
                        SD_JSON_BUILD_PAIR_INTEGER("directoryFileDescriptor", fd_idx));
1441

1442
fail:
×
1443
        (void) unlinkat(parent_fd, t ?: p.name, AT_REMOVEDIR);
×
1444
        return r;
×
1445
}
1446

1447
/* Serves a single accepted connection to completion.
 *
 * Takes possession of _fd in all cases. A fresh event loop is allocated, the Varlink server is
 * attached to it, the connection is added, and the loop is run until it finishes. Afterwards the
 * server is detached from the event loop again.
 *
 * Once the server has been attached it is detached again on every exit path, including the
 * failure paths, so that the server is never left attached to the event loop this function is
 * about to release.
 *
 * Returns 0 on success, a negative errno-style value on failure. */
static int process_connection(sd_varlink_server *server, int _fd) {
        _cleanup_close_ int fd = TAKE_FD(_fd); /* always take possession */
        _cleanup_(sd_varlink_close_unrefp) sd_varlink *vl = NULL;
        _cleanup_(sd_event_unrefp) sd_event *event = NULL;
        int r;

        r = sd_event_new(&event);
        if (r < 0)
                return r;

        r = sd_varlink_server_attach_event(server, event, 0);
        if (r < 0)
                return log_error_errno(r, "Failed to attach Varlink server to event loop: %m");

        r = sd_varlink_server_add_connection(server, fd, &vl);
        if (r < 0) {
                /* Undo the attachment, our event loop goes away when we return */
                (void) sd_varlink_server_detach_event(server);
                return log_error_errno(r, "Failed to add connection: %m");
        }

        /* The connection object owns the fd now. Take an extra reference on it, which the cleanup
         * handler of 'vl' drops again when we return. */
        TAKE_FD(fd);
        vl = sd_varlink_ref(vl);

        r = sd_event_loop(event);
        if (r < 0) {
                (void) sd_varlink_server_detach_event(server);
                return log_error_errno(r, "Failed to run event loop: %m");
        }

        r = sd_varlink_server_detach_event(server);
        if (r < 0)
                return log_error_errno(r, "Failed to detach Varlink server from event loop: %m");

        return 0;
}
1478

1479
static int run(int argc, char *argv[]) {
×
1480
        usec_t start_time, listen_idle_usec, last_busy_usec = USEC_INFINITY;
×
1481
        _cleanup_(sd_varlink_server_unrefp) sd_varlink_server *server = NULL;
×
1482
        _cleanup_hashmap_free_ Hashmap *polkit_registry = NULL;
×
1483
        _cleanup_(pidref_done) PidRef parent = PIDREF_NULL;
×
1484
        unsigned n_iterations = 0;
×
1485
        int m, listen_fd, r;
×
1486

1487
        log_setup();
×
1488

1489
        m = sd_listen_fds(false);
×
1490
        if (m < 0)
×
1491
                return log_error_errno(m, "Failed to determine number of listening fds: %m");
×
1492
        if (m == 0)
×
1493
                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "No socket to listen on received.");
×
1494
        if (m > 1)
×
1495
                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Worker can only listen on a single socket at a time.");
×
1496

1497
        listen_fd = SD_LISTEN_FDS_START;
×
1498

1499
        r = fd_nonblock(listen_fd, false);
×
1500
        if (r < 0)
×
1501
                return log_error_errno(r, "Failed to turn off non-blocking mode for listening socket: %m");
×
1502

1503
        r = varlink_server_new(&server,
×
1504
                               SD_VARLINK_SERVER_INHERIT_USERDATA|
1505
                               SD_VARLINK_SERVER_ALLOW_FD_PASSING_INPUT|SD_VARLINK_SERVER_ALLOW_FD_PASSING_OUTPUT,
1506
                               &polkit_registry);
1507
        if (r < 0)
×
1508
                return log_error_errno(r, "Failed to allocate server: %m");
×
1509

1510
        r = sd_varlink_server_add_interface(server, &vl_interface_io_systemd_MountFileSystem);
×
1511
        if (r < 0)
×
1512
                return log_error_errno(r, "Failed to add MountFileSystem interface to varlink server: %m");
×
1513

1514
        r = sd_varlink_server_bind_method_many(
×
1515
                        server,
1516
                        "io.systemd.MountFileSystem.MountImage",     vl_method_mount_image,
1517
                        "io.systemd.MountFileSystem.MountDirectory", vl_method_mount_directory,
1518
                        "io.systemd.MountFileSystem.MakeDirectory",  vl_method_make_directory);
1519
        if (r < 0)
×
1520
                return log_error_errno(r, "Failed to bind methods: %m");
×
1521

1522
        r = sd_varlink_server_set_exit_on_idle(server, true);
×
1523
        if (r < 0)
×
1524
                return log_error_errno(r, "Failed to enable exit-on-idle mode: %m");
×
1525

1526
        r = getenv_bool("MOUNTFS_FIXED_WORKER");
×
1527
        if (r < 0)
×
1528
                return log_error_errno(r, "Failed to parse MOUNTFSD_FIXED_WORKER: %m");
×
1529
        listen_idle_usec = r ? USEC_INFINITY : LISTEN_IDLE_USEC;
×
1530

1531
        r = pidref_set_parent(&parent);
×
1532
        if (r < 0)
×
1533
                return log_error_errno(r, "Failed to acquire pidfd of parent process: %m");
×
1534

1535
        start_time = now(CLOCK_MONOTONIC);
×
1536

1537
        for (;;) {
×
1538
                _cleanup_close_ int fd = -EBADF;
×
1539
                usec_t n;
×
1540

1541
                /* Exit the worker in regular intervals, to flush out all memory use */
1542
                if (n_iterations++ > ITERATIONS_MAX) {
×
1543
                        log_debug("Exiting worker, processed %u iterations, that's enough.", n_iterations);
×
1544
                        break;
1545
                }
1546

1547
                n = now(CLOCK_MONOTONIC);
×
1548
                if (n >= usec_add(start_time, RUNTIME_MAX_USEC)) {
×
1549
                        log_debug("Exiting worker, ran for %s, that's enough.",
×
1550
                                  FORMAT_TIMESPAN(usec_sub_unsigned(n, start_time), 0));
1551
                        break;
×
1552
                }
1553

1554
                if (last_busy_usec == USEC_INFINITY)
×
1555
                        last_busy_usec = n;
1556
                else if (listen_idle_usec != USEC_INFINITY && n >= usec_add(last_busy_usec, listen_idle_usec)) {
×
1557
                        log_debug("Exiting worker, been idle for %s.",
×
1558
                                  FORMAT_TIMESPAN(usec_sub_unsigned(n, last_busy_usec), 0));
1559
                        break;
×
1560
                }
1561

1562
                (void) rename_process("systemd-mountwork: waiting...");
×
1563
                fd = RET_NERRNO(accept4(listen_fd, NULL, NULL, SOCK_NONBLOCK|SOCK_CLOEXEC));
×
1564
                (void) rename_process("systemd-mountwork: processing...");
×
1565

1566
                if (fd == -EAGAIN)
×
1567
                        continue; /* The listening socket has SO_RECVTIMEO set, hence a timeout is expected
×
1568
                                   * after a while, let's check if it's time to exit though. */
1569
                if (fd == -EINTR)
×
1570
                        continue; /* Might be that somebody attached via strace, let's just continue in that
×
1571
                                   * case */
1572
                if (fd < 0)
×
1573
                        return log_error_errno(fd, "Failed to accept() from listening socket: %m");
×
1574

1575
                if (now(CLOCK_MONOTONIC) <= usec_add(n, PRESSURE_SLEEP_TIME_USEC)) {
×
1576
                        /* We only slept a very short time? If so, let's see if there are more sockets
1577
                         * pending, and if so, let's ask our parent for more workers */
1578

1579
                        r = fd_wait_for_event(listen_fd, POLLIN, 0);
×
1580
                        if (r < 0)
×
1581
                                return log_error_errno(r, "Failed to test for POLLIN on listening socket: %m");
×
1582

1583
                        if (FLAGS_SET(r, POLLIN)) {
×
1584
                                r = pidref_kill(&parent, SIGUSR2);
×
1585
                                if (r == -ESRCH)
×
1586
                                        return log_error_errno(r, "Parent already died?");
×
1587
                                if (r < 0)
×
1588
                                        return log_error_errno(r, "Failed to send SIGUSR2 signal to parent: %m");
×
1589
                        }
1590
                }
1591

1592
                (void) process_connection(server, TAKE_FD(fd));
×
1593
                last_busy_usec = USEC_INFINITY;
×
1594
        }
1595

1596
        return 0;
1597
}
1598

1599
/* Program entry point: passes run() to DEFINE_MAIN_FUNCTION(). NOTE(review): the macro is not
 * defined in this file (presumably it comes from main-func.h and generates main()) — confirm. */
DEFINE_MAIN_FUNCTION(run);
×
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc