• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

uc-cdis / fence / 24903485277

24 Apr 2026 05:42PM UTC coverage: 75.08%. Remained the same
24903485277

push

github

web-flow
Add public key authentication for dbgap sftp sync (#1334)

8469 of 11280 relevant lines covered (75.08%)

0.75 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

82.04
fence/sync/sync_users.py
1
import paramiko.ssh_exception
1✔
2
import backoff
1✔
3
import glob
1✔
4

5
import httpx
1✔
6
import jwt
1✔
7
import os
1✔
8
import re
1✔
9
import subprocess as sp
1✔
10
import yaml
1✔
11
import copy
1✔
12
import datetime
1✔
13
import uuid
1✔
14
import collections
1✔
15
import hashlib
1✔
16

17
from contextlib import contextmanager
1✔
18
from collections import defaultdict
1✔
19
from csv import DictReader
1✔
20
from io import StringIO
1✔
21
from stat import S_ISDIR
1✔
22

23
import paramiko
1✔
24
from cdislogging import get_logger
1✔
25
from email_validator import validate_email, EmailNotValidError
1✔
26
from gen3authz.client.arborist.errors import ArboristError, ArboristTimeoutError
1✔
27
from gen3users.validation import validate_user_yaml
1✔
28
from paramiko.proxy import ProxyCommand
1✔
29
from sqlalchemy.exc import IntegrityError
1✔
30
from sqlalchemy import func
1✔
31

32
from fence.config import config
1✔
33
from fence.models import (
1✔
34
    AccessPrivilege,
35
    AuthorizationProvider,
36
    Project,
37
    Tag,
38
    User,
39
    query_for_user,
40
    Client,
41
    IdentityProvider,
42
    get_project_to_authz_mapping,
43
)
44
from fence.resources.google.utils import get_or_create_proxy_group_id
1✔
45
from fence.resources.storage import StorageManager
1✔
46
from fence.resources.google.access_utils import update_google_groups_for_users
1✔
47
from fence.resources.google.access_utils import GoogleUpdateException
1✔
48
from fence.sync import utils
1✔
49
from fence.sync.passport_sync.ras_sync import RASVisa
1✔
50
from fence.utils import get_SQLAlchemyDriver, DEFAULT_BACKOFF_SETTINGS
1✔
51

52

53
def _format_policy_id(path, privilege):
1✔
54
    resource = ".".join(name for name in path.split("/") if name)
1✔
55
    return "{}-{}".format(resource, privilege)
1✔
56

57

58
def download_dir(sftp, remote_dir, local_dir):
    """
    Recursively download the contents of remote_dir into local_dir.

    Args:
        sftp: client object providing ``listdir_attr(path)`` and
            ``get(remote_path, local_path)`` (presumably a paramiko SFTP
            client; verify against caller)
        remote_dir(str): remote directory to copy from
        local_dir(str): local directory to copy into; created if missing

    Returns: None
    """
    # Remote sub-directories do not exist locally yet; without this the first
    # sftp.get() into a nested directory fails because its parent is missing.
    os.makedirs(local_dir, exist_ok=True)

    for item in sftp.listdir_attr(remote_dir):
        remote_path = remote_dir + "/" + item.filename
        local_path = os.path.join(local_dir, item.filename)
        if S_ISDIR(item.st_mode):
            download_dir(sftp, remote_path, local_path)
        else:
            sftp.get(remote_path, local_path)
×
75

76

77
def arborist_role_for_permission(permission):
    """
    Build the arborist role that wraps a single permission.

    For the programs/projects in the existing fence access control model, a
    policy is generated for each combination of program/project and privilege
    so that arborist can be used for permission checks. Each role therefore
    holds exactly one permission, named after the privilege, whose action
    applies to every service ("*") with the privilege as its method.

    Args:
        permission: privilege name, used as role id, permission id and method

    Returns:
        dict: role with keys "id" and "permissions"
    """
    single_permission = {
        "id": permission,
        "action": {"service": "*", "method": permission},
    }
    return {"id": permission, "permissions": [single_permission]}
90

91

92
@contextmanager
1✔
93
def _read_file(filepath, encrypted=True, key=None, logger=None):
1✔
94
    """
95
    Context manager for reading and optionally decrypting file it only
96
    decrypts files encrypted by unix 'crypt' tool which is used by dbGaP.
97

98
    Args:
99
        filepath (str): path to the file
100
        encrypted (bool): whether the file is encrypted
101

102
    Returns:
103
        Generator[file-like class]: file like object for the file
104
    """
105
    if encrypted:
1✔
106
        p = sp.Popen(
×
107
            [
108
                "ccdecrypt",
109
                "-u",
110
                "-K",
111
                key,
112
                filepath,
113
            ],
114
            stdout=sp.PIPE,
115
            stderr=open(os.devnull, "w"),
116
            universal_newlines=True,
117
        )
118
        try:
×
119
            yield StringIO(p.communicate()[0])
×
120
        except UnicodeDecodeError:
×
121
            logger.error("Could not decode file. Check the decryption key.")
×
122
    else:
123
        f = open(filepath, "r")
1✔
124
        yield f
1✔
125
        f.close()
1✔
126

127

128
class UserYAML(object):
    """
    Representation of the information in a YAML file describing user, project, and ABAC
    information for access control.
    """

    def __init__(
        self,
        projects=None,
        user_info=None,
        policies=None,
        clients=None,
        authz=None,
        project_to_resource=None,
        logger=None,
        user_abac=None,
    ):
        """
        Store the parsed sections; every omitted (or falsy) section becomes
        an empty dict.
        """
        self.projects = projects or {}
        self.user_info = user_info or {}
        self.user_abac = user_abac or {}
        self.policies = policies or {}
        self.clients = clients or {}
        self.authz = authz or {}
        self.project_to_resource = project_to_resource or {}
        self.logger = logger

    @classmethod
    def from_file(cls, filepath, encrypted=True, key=None, logger=None):
        """
        Parse a user.yaml file into a UserYAML instance.

        Add access by "auth_id" to "self.projects" to update the Fence DB.
        Add access by "resource" to "self.user_abac" to update Arborist.

        Args:
            filepath (str): path to the YAML file; if falsy, nothing is read
                and an empty UserYAML is returned
            encrypted (bool): whether the file is encrypted
            key (str): decryption key, used when ``encrypted`` is true
            logger: optional logger

        Returns:
            UserYAML: the parsed representation

        Raises:
            EnvironmentError: if a username occurs more than once
        """
        data = {}
        if filepath:
            with _read_file(filepath, encrypted=encrypted, key=key, logger=logger) as f:
                file_contents = f.read()
                validate_user_yaml(file_contents)  # run user.yaml validation tests
                # an empty or comment-only document loads as None
                data = yaml.safe_load(file_contents) or {}
        elif logger:
            logger.info("Did not sync a user.yaml, no file path provided.")

        projects = dict()
        user_info = dict()
        policies = dict()

        # resources should be the resource tree to construct in arborist
        user_abac = dict()

        # Fall back on rbac block if no authz. Remove when rbac in useryaml fully deprecated.
        if not data.get("authz") and data.get("rbac"):
            if logger:
                logger.info(
                    "No authz block found but rbac block present. Using rbac block"
                )
            data["authz"] = data["rbac"]

        # `or {}` also covers an explicit null `authz:` key
        authz = data.get("authz") or {}

        # get user project mapping to arborist resources if it exists
        project_to_resource = authz.get("user_project_to_resource", dict())

        # read projects and privileges for each user
        users = data.get("users") or {}
        for username, details in users.items():
            # users should occur only once each; skip if already processed
            if username in projects:
                msg = "invalid yaml file: user `{}` occurs multiple times".format(
                    username
                )
                if logger:
                    logger.error(msg)
                raise EnvironmentError(msg)

            privileges = {}
            resource_permissions = dict()
            for project in details.get("projects", {}):
                try:
                    privileges[project["auth_id"]] = set(project["privilege"])
                except KeyError as e:
                    if logger:
                        logger.error("project {} missing field: {}".format(project, e))
                    continue

                # project may not have `resource` field.
                # prefer resource field;
                # if no resource or mapping, assume auth_id is resource.
                resource = project.get("resource", project["auth_id"])

                if project["auth_id"] not in project_to_resource:
                    project_to_resource[project["auth_id"]] = resource
                resource_permissions[resource] = set(project["privilege"])

            user_info[username] = {
                "email": details.get("email", ""),
                "display_name": details.get("display_name", ""),
                "phone_number": details.get("phone_number", ""),
                "tags": details.get("tags", {}),
                "admin": details.get("admin", False),
            }
            if not details.get("email"):
                # no explicit email: use the username itself when it is a
                # valid email address, otherwise leave the email empty
                try:
                    valid = validate_email(
                        username, allow_smtputf8=False, check_deliverability=False
                    )
                    user_info[username]["email"] = valid.email
                except EmailNotValidError:
                    pass
            projects[username] = privileges
            user_abac[username] = resource_permissions

            # list of policies we want to grant to this user, which get sent to arborist
            # to check if they're allowed to do certain things
            policies[username] = details.get("policies", [])

        if logger:
            logger.info(
                "Got user project to arborist resource mapping:\n{}".format(
                    str(project_to_resource)
                )
            )

        if not authz:
            # older version: resources in root, no `authz` section or `rbac` section
            if logger:
                logger.warning(
                    "access control YAML file is using old format (missing `authz`/`rbac`"
                    " section in the root); assuming that if it exists `resources` will"
                    " be on the root level, and continuing"
                )
            # we're going to throw it into the `authz` dictionary anyways, so the rest of
            # the code can pretend it's in the normal place that we expect
            resources = data.get("resources", [])
            # keep authz empty dict if resources is not specified
            if resources:
                authz["resources"] = resources

        clients = data.get("clients", {})

        return cls(
            projects=projects,
            user_info=user_info,
            user_abac=user_abac,
            policies=policies,
            clients=clients,
            authz=authz,
            project_to_resource=project_to_resource,
            logger=logger,
        )

    def persist_project_to_resource(self, db_session):
        """
        Store the mappings from Project.auth_id to authorization resource (Project.authz)

        The mapping comes from an external source, this function persists what was parsed
        into memory into the database for future use.

        Existing projects get their ``authz`` updated; missing ones are
        created with ``name`` equal to the auth_id. A single commit is issued
        after all mappings are processed.

        Args:
            db_session: database session used to query, add and commit
        """
        for auth_id, authz_resource in self.project_to_resource.items():
            project = (
                db_session.query(Project).filter(Project.auth_id == auth_id).first()
            )
            if project:
                project.authz = authz_resource
            else:
                project = Project(name=auth_id, auth_id=auth_id, authz=authz_resource)
                db_session.add(project)
        db_session.commit()
1✔
296

297

298
class UserSyncer(object):
1✔
299
    def __init__(
        self,
        dbGaP,
        DB,
        project_mapping,
        storage_credentials=None,
        db_session=None,
        is_sync_from_dbgap_server=False,
        sync_from_local_csv_dir=None,
        sync_from_local_yaml_file=None,
        arborist=None,
        folder=None,
    ):
        """
        Syncs ACL files from dbGap to auth database and storage backends

        Args:
            dbGaP: a list of dict containing creds to access dbgap sftp; each
                entry may carry a "parent_to_child_studies_mapping" dict,
                all of which are merged into one mapping
            DB: database connection string
            project_mapping: a dict containing how dbgap ids map to projects
            storage_credentials: a dict containing creds for storage backends;
                a StorageManager is only created when this is provided
            db_session: stored as ``self.session``
            is_sync_from_dbgap_server: stored as-is (presumably toggles
                syncing from the dbGaP server; verify against callers)
            sync_from_local_csv_dir: stored as-is (presumably a local dir to
                sync CSV files from instead of dbGaP; verify against callers)
            sync_from_local_yaml_file: stored as-is (presumably the path of a
                local user.yaml; verify against callers)
            arborist:
                ArboristClient instance if the syncer should also create
                resources in arborist
            folder: a local folder where dbgap telemetry files will sync to
        """
        # sync sources and targets, stored exactly as given
        self.sync_from_local_csv_dir = sync_from_local_csv_dir
        self.sync_from_local_yaml_file = sync_from_local_yaml_file
        self.is_sync_from_dbgap_server = is_sync_from_dbgap_server
        self.dbGaP = dbGaP
        self.session = db_session
        self.driver = get_SQLAlchemyDriver(DB)
        self.project_mapping = project_mapping or {}

        # bookkeeping filled in while syncing
        self._projects = {}
        self._created_roles = set()
        self._created_policies = set()
        self._dbgap_study_to_resources = {}

        log_level = "debug" if config["DEBUG"] is True else "info"
        self.logger = get_logger("user_syncer", log_level=log_level)
        self.arborist_client = arborist
        self.folder = folder

        # auth_source used for logging. username : [source1, source2]
        self.auth_source = defaultdict(set)
        self.visa_types = config.get("USERSYNC", {}).get("visa_types", {})

        # later dbGaP configs override earlier ones on duplicate parents
        merged_child_studies = {}
        for dbgap_config in dbGaP:
            merged_child_studies.update(
                dbgap_config.get("parent_to_child_studies_mapping", {})
            )
        self.parent_to_child_studies_mapping = merged_child_studies

        if storage_credentials:
            self.storage_manager = StorageManager(
                storage_credentials, logger=self.logger
            )
        self.id_patterns = []
1✔
356

357
    @staticmethod
1✔
358
    def _match_pattern(filepath, id_patterns, encrypted=True):
1✔
359
        """
360
        Check if the filename matches dbgap access control file pattern
361

362
        Args:
363
            filepath (str): path to file
364
            encrypted (bool): whether the file is encrypted
365

366
        Returns:
367
            bool: whether the pattern matches
368
        """
369
        id_patterns.append(r"authentication_file_phs(\d{6}).(csv|txt)")
1✔
370
        for pattern in id_patterns:
1✔
371
            if encrypted:
1✔
372
                pattern += r".enc"
×
373
            pattern += r"$"
1✔
374
            # when converting the YAML from fence-config,
375
            # python reads it as Python string literal. So "\" turns into "\\"
376
            # which messes with the regex match
377
            pattern.replace("\\\\", "\\")
1✔
378
            if re.match(pattern, os.path.basename(filepath)):
1✔
379
                return True
1✔
380
        return False
1✔
381

382
    def _get_from_sftp_with_proxy(self, server, path):
        """
        Download all data from sftp sever to a local dir

        Unknown host keys are rejected: host keys are loaded from
        ``~/.ssh/known_hosts`` when that file exists. Authentication uses the
        private key named by ``private_key_filename`` when configured,
        otherwise the configured password. SSH errors are logged and not
        re-raised.

        Args:
            server (dict) : dictionary containing info to access sftp server
                (keys read here: host, port, username, password,
                private_key_filename, proxy, proxy_user)
            path (str): path to local directory

        Returns:
            None
        """
        proxy = None
        if server.get("proxy", "") != "":
            command = "ssh -oHostKeyAlgorithms=+ssh-rsa -i ~/.ssh/id_rsa {user}@{proxy} nc {host} {port}".format(
                user=server.get("proxy_user", ""),
                proxy=server.get("proxy", ""),
                host=server.get("host", ""),
                port=server.get("port", 22),
            )
            self.logger.info("SSH proxy command: {}".format(command))

            proxy = ProxyCommand(command)

        try:
            with paramiko.SSHClient() as client:
                client.set_log_channel(self.logger.name)

                # Patch paramiko to use sha256 instead of md5 for enhanced security and fips compliance
                # NOTE: this replaces the method on the paramiko.PKey class
                # itself, so it affects every key object in the process.
                paramiko.PKey.get_fingerprint = lambda self: hashlib.sha256(
                    self.asbytes()
                ).digest()

                # Load known host keys
                known_hosts_path = os.path.expanduser("~/.ssh/known_hosts")
                if os.path.exists(known_hosts_path):
                    client.load_host_keys(known_hosts_path)
                else:
                    self.logger.error(
                        "No known_hosts file found — rejecting unknown hosts - make sure the SFTP host key is present in known_hosts before attempting connection."
                    )

                client.set_missing_host_key_policy(paramiko.RejectPolicy())
                parameters = {
                    "hostname": str(server.get("host", "")),
                    "username": str(server.get("username", "")),
                    "port": int(server.get("port", 22)),
                }
                # public key authentication takes precedence over password
                if server.get("private_key_filename"):
                    parameters["key_filename"] = str(server.get("private_key_filename"))
                else:
                    parameters["password"] = str(server.get("password", ""))
                if proxy:
                    parameters["sock"] = proxy

                self.logger.info(
                    "SSH connection hostname:port {}:{}".format(
                        parameters.get("hostname", "unknown"),
                        parameters.get("port", "unknown"),
                    )
                )
                try:
                    self._connect_with_ssh(ssh_client=client, parameters=parameters)

                    with client.open_sftp() as sftp:
                        download_dir(sftp, "./", path)
                except paramiko.ssh_exception.SSHException as e:
                    self.logger.error(f"SSH connection failed, error: {e}")
        finally:
            # close the proxy even when a non-SSH exception propagates
            if proxy:
                proxy.close()
×
451

452
    @backoff.on_exception(backoff.expo, Exception, **DEFAULT_BACKOFF_SETTINGS)
    def _connect_with_ssh(self, ssh_client, parameters):
        """
        Open the SSH connection, retrying with exponential backoff.

        Any exception raised by ``connect`` triggers a retry according to
        DEFAULT_BACKOFF_SETTINGS.

        Args:
            ssh_client: client object whose ``connect`` is called
            parameters (dict): keyword arguments forwarded to ``connect``
        """
        ssh_client.connect(**parameters)
1✔
455

456
    def _get_from_ftp_with_proxy(self, server, path):
        """
        Download data from ftp sever to a local dir

        Runs ``lftp`` to mirror the remote current directory into ``path``
        through an HTTP proxy. The exit status of lftp is not checked.

        Args:
            server (dict): dictionary containing information for accessing server
                (keys read here: username, password, host, proxy)
            path(str): path to local files

        Returns:
            None
        """
        lftp_script = "set ftp:proxy http://{}; mirror . {}; exit".format(
            server.get("proxy", ""), path
        )
        # Pass an argument list instead of a shell string so that credentials
        # containing spaces, quotes or other shell metacharacters reach lftp
        # unchanged and cannot be interpreted by a shell.
        command = [
            "lftp",
            "-u",
            "{},{}".format(server.get("username", ""), server.get("password", "")),
            str(server.get("host", "")),
            "-e",
            lftp_script,
        ]
        try:
            sp.run(command, check=False)
        except OSError as e:
            # os.system never raised when lftp could not be started; keep
            # this best-effort and only report the failure
            self.logger.error("Could not run lftp: {}".format(e))
×
477

478
    def _get_parse_consent_code(self, dbgap_config={}):
1✔
479
        return dbgap_config.get(
1✔
480
            "parse_consent_code", True
481
        )  # Should this really be true?
482

483
    def _parse_csv(self, file_dict, sess, dbgap_config=None, encrypted=True):
        """
        parse csv files to python dict

        Empty files and files whose name does not match an allowed pattern
        are skipped with a warning. Rows without a "login" value, and rows
        whose project id matches none of the allowed project id patterns, are
        skipped as well.

        Args:
            file_dict: a dictionary with key(file path) and value(privileges)
            sess: sqlalchemy session
            dbgap_config: a dictionary containing information about the dbGaP sftp server
                (comes from fence config); None is treated as empty
            encrypted: boolean indicating whether those files are encrypted


        Return:
            Tuple[[dict, dict]]:
                (user_project, user_info) where user_project is a mapping from
                usernames to project permissions and user_info is a mapping
                from usernames to user details, such as email

        Example:

            (
                {
                    username: {
                        'project1': {'read-storage','write-storage'},
                        'project2': {'read-storage'},
                    }
                },
                {
                    username: {
                        'email': 'email@mail.com',
                        'display_name': 'display name',
                        'phone_number': '123-456-789',
                        'tags': {'dbgap_role': 'PI'}
                    }
                },
            )

        """
        # None default instead of a shared mutable `{}` default
        if dbgap_config is None:
            dbgap_config = {}

        user_projects = dict()
        user_info = defaultdict(dict)

        # parse dbGaP sftp server information
        dbgap_key = dbgap_config.get("decrypt_key", None)
        allow_non_dbgap_whitelist = dbgap_config.get(
            "allow_non_dbGaP_whitelist", False
        )

        if allow_non_dbgap_whitelist:
            for item in dbgap_config.get("allowed_whitelist_patterns", []):
                whitelist_pattern = item.replace("\\\\", "\\")
                # this method runs once per dbGaP config; do not pile up
                # duplicates of the same pattern on the instance
                if whitelist_pattern not in self.id_patterns:
                    self.id_patterns.append(whitelist_pattern)

        enable_common_exchange_area_access = dbgap_config.get(
            "enable_common_exchange_area_access", False
        )
        study_common_exchange_areas = dbgap_config.get(
            "study_common_exchange_areas", {}
        )
        parse_consent_code = self._get_parse_consent_code(dbgap_config)

        if parse_consent_code and enable_common_exchange_area_access:
            self.logger.info(
                f"using study to common exchange area mapping: {study_common_exchange_areas}"
            )

        project_id_patterns = [r"phs(\d{6})"]
        if "additional_allowed_project_id_patterns" in dbgap_config:
            patterns = dbgap_config.get("additional_allowed_project_id_patterns")
            # when converting the YAML from fence-config, python reads it as
            # Python string literal. So "\" turns into "\\" which messes with
            # the regex match
            project_id_patterns += [
                pattern.replace("\\\\", "\\") for pattern in patterns
            ]

        self.logger.info(f"Using these file paths: {file_dict.items()}")
        for filepath, privileges in file_dict.items():
            self.logger.info("Reading file {}".format(filepath))
            if os.stat(filepath).st_size == 0:
                self.logger.warning("Empty file {}".format(filepath))
                continue
            if not self._match_pattern(
                filepath, id_patterns=self.id_patterns, encrypted=encrypted
            ):
                self.logger.warning(
                    "Filename {} does not match dbgap access control filename pattern;"
                    " this could mean that the filename has an invalid format, or has"
                    " an unexpected .enc extension, or lacks the .enc extension where"
                    " expected. This file is NOT being processed by usersync!".format(
                        filepath
                    )
                )
                continue

            with _read_file(
                filepath, encrypted=encrypted, key=dbgap_key, logger=self.logger
            ) as f:
                reader = DictReader(f, quotechar='"', skipinitialspace=True)

                for row in reader:
                    username = row.get("login") or ""
                    if username == "":
                        continue

                    if allow_non_dbgap_whitelist:
                        phsid = (
                            row.get("phsid") or (row.get("project_id") or "")
                        ).split(".")
                    else:
                        phsid = (row.get("phsid") or "").split(".")

                    dbgap_project = phsid[0]
                    # There are issues where dbgap has a wrong entry in their
                    # whitelist. Since we do a bulk arborist request, there are
                    # wrong entries in it that invalidates the whole request
                    # causing other correct entries not to be added
                    for pattern in project_id_patterns:
                        self.logger.debug(
                            "Checking pattern:{} with project_id:{}".format(
                                pattern, dbgap_project
                            )
                        )
                        if re.match(pattern, dbgap_project):
                            break
                    else:
                        # no allowed project id pattern matched this row
                        self.logger.warning(
                            "Skip processing from file {}, user {} with project {}".format(
                                filepath,
                                username,
                                dbgap_project,
                            )
                        )
                        continue

                    if len(phsid) > 1 and parse_consent_code:
                        consent_code = phsid[-1]

                        # c999 indicates full access to all consents and access
                        # to a study-specific exchange area
                        # access to at least one study-specific exchange area implies access
                        # to the parent study's common exchange area
                        #
                        # NOTE: Handling giving access to all consents is done at
                        #       a later time, when we have full information about possible
                        #       consents
                        self.logger.debug(
                            f"got consent code {consent_code} from dbGaP project "
                            f"{dbgap_project}"
                        )
                        if (
                            consent_code == "c999"
                            and enable_common_exchange_area_access
                            and dbgap_project in study_common_exchange_areas
                        ):
                            self.logger.info(
                                "found study with consent c999 and Fence "
                                "is configured to parse exchange area data. Giving user "
                                f"{username} {privileges} privileges in project: "
                                f"{study_common_exchange_areas[dbgap_project]}."
                            )
                            self._add_dbgap_project_for_user(
                                study_common_exchange_areas[dbgap_project],
                                privileges,
                                username,
                                sess,
                                user_projects,
                                dbgap_config,
                            )

                        dbgap_project += "." + consent_code

                    self._add_children_for_dbgap_project(
                        dbgap_project,
                        privileges,
                        username,
                        sess,
                        user_projects,
                        dbgap_config,
                    )

                    display_name = row.get("user name") or ""
                    tags = {"dbgap_role": row.get("role") or ""}

                    # some dbgap telemetry files have information about a researchers PI
                    if "downloader for" in row:
                        tags["pi"] = row["downloader for"]

                    # prefer name over previous "downloader for" if it exists
                    if "downloader for names" in row:
                        tags["pi"] = row["downloader for names"]

                    # keep a previously seen email / phone number when this
                    # row does not provide one
                    user_info[username] = {
                        "email": row.get("email")
                        or user_info[username].get("email")
                        or "",
                        "display_name": display_name,
                        "phone_number": row.get("phone")
                        or user_info[username].get("phone_number")
                        or "",
                        "tags": tags,
                    }

                    self._process_dbgap_project(
                        dbgap_project,
                        privileges,
                        username,
                        sess,
                        user_projects,
                        dbgap_config,
                    )

        return user_projects, user_info
1✔
694

695
    def _get_children(self, dbgap_project):
1✔
696
        return self.parent_to_child_studies_mapping.get(dbgap_project.split(".")[0])
1✔
697

698
    def _add_children_for_dbgap_project(
1✔
699
        self, dbgap_project, privileges, username, sess, user_projects, dbgap_config
700
    ):
701
        """
702
        Adds the configured child studies for the given dbgap_project, adding it to the provided user_projects. If
703
        parse_consent_code is true, then the consents granted in the provided dbgap_project will also be granted to the
704
        child studies.
705
        """
706
        parent_phsid = dbgap_project
1✔
707
        parse_consent_code = self._get_parse_consent_code(dbgap_config)
1✔
708
        child_suffix = ""
1✔
709
        if parse_consent_code and re.match(
1✔
710
            config["DBGAP_ACCESSION_WITH_CONSENT_REGEX"], dbgap_project
711
        ):
712
            parent_phsid_parts = dbgap_project.split(".")
1✔
713
            parent_phsid = parent_phsid_parts[0]
1✔
714
            child_suffix = "." + parent_phsid_parts[1]
1✔
715

716
        if parent_phsid not in self.parent_to_child_studies_mapping:
1✔
717
            return
1✔
718

719
        self.logger.info(
1✔
720
            f"found parent study {parent_phsid} and Fence "
721
            "is configured to provide additional access to child studies. Giving user "
722
            f"{username} {privileges} privileges in projects: "
723
            f"{{k + child_suffix: v + child_suffix for k, v in self.parent_to_child_studies_mapping.items()}}."
724
        )
725
        child_studies = self.parent_to_child_studies_mapping.get(parent_phsid, [])
1✔
726
        for child_study in child_studies:
1✔
727
            self._add_dbgap_project_for_user(
1✔
728
                child_study + child_suffix,
729
                privileges,
730
                username,
731
                sess,
732
                user_projects,
733
                dbgap_config,
734
            )
735

736
    def _add_dbgap_project_for_user(
1✔
737
        self, dbgap_project, privileges, username, sess, user_projects, dbgap_config
738
    ):
739
        """
740
        Helper function for csv parsing that adds a given dbgap project to Fence/Arborist
741
        and then updates the dictionary containing all user's project access
742
        """
743
        if dbgap_project not in self._projects:
1✔
744
            self.logger.debug(
1✔
745
                "creating Project in fence for dbGaP study: {}".format(dbgap_project)
746
            )
747

748
            project = self._get_or_create(sess, Project, auth_id=dbgap_project)
1✔
749

750
            # need to add dbgap project to arborist
751
            if self.arborist_client:
1✔
752
                self._determine_arborist_resource(dbgap_project, dbgap_config)
1✔
753

754
            if project.name is None:
1✔
755
                project.name = dbgap_project
1✔
756
            self._projects[dbgap_project] = project
1✔
757
        phsid_privileges = {dbgap_project: set(privileges)}
1✔
758
        if username in user_projects:
1✔
759
            user_projects[username].update(phsid_privileges)
1✔
760
        else:
761
            user_projects[username] = phsid_privileges
1✔
762

763
    @staticmethod
1✔
764
    def sync_two_user_info_dict(user_info1, user_info2):
1✔
765
        """
766
        Merge user_info1 into user_info2. Values in user_info2 are overriden
767
        by values in user_info1. user_info2 ends up containing the merged dict.
768

769
        Args:
770
            user_info1 (dict): nested dict
771
            user_info2 (dict): nested dict
772

773
            Example:
774
            {username: {'email': 'abc@email.com'}}
775

776
        Returns:
777
            None
778
        """
779
        user_info2.update(user_info1)
1✔
780

781
    def sync_two_phsids_dict(
        self,
        phsids1,
        phsids2,
        source1=None,
        source2=None,
        phsids2_overrides_phsids1=True,
    ):
        """
        Merge phsids1 into phsids2, in place. phsids2 ends up containing the
        merged dict. `source1` and `source2` are recorded in `self.auth_source`
        for the users they contributed access for.

        Args:
            phsids1, phsids2: nested dicts mapping phsids to sets of permissions

            source1, source2: source of authz information (eg. dbgap, user_yaml, visas)

            phsids2_overrides_phsids1 (bool): controls what happens for a user
                present in both dicts (see below)

            Example:
            {
                username: {
                    phsid1: {'read-storage','write-storage'},
                    phsid2: {'read-storage'},
                }
            }

        Return:
            None

        Explanation:
            For each user in phsids1:

            - user missing from (or empty in) phsids2: the user's projects
              dict from phsids1 is stored in phsids2 as-is (same object, not a
              copy) and source1 is recorded.

            - user in both and `phsids2_overrides_phsids1` is true: both
              sources are recorded and the privileges are combined per phsid:

                {user1: {phsid1: privilege1}} + {user1: {phsid2: privilege2}}
                    -> {user1: {phsid1: privilege1, phsid2: privilege2}}

                {user1: {phsid1: privilege1}} + {user1: {phsid1: privilege2}}
                    -> {user1: {phsid1: union(privilege1, privilege2)}}

            - user in both and `phsids2_overrides_phsids1` is false: the
              user's entry in phsids2 is kept unchanged and only source2 is
              recorded.
        """
        for user, incoming_projects in phsids1.items():
            current_projects = phsids2.get(user)

            if not current_projects:
                if source1:
                    self.auth_source[user].add(source1)
                # NOTE: stores the phsids1 dict itself, so later merges into
                # phsids2 also mutate the caller's phsids1 entry.
                phsids2[user] = incoming_projects
                continue

            if not phsids2_overrides_phsids1:
                if source2:
                    self.auth_source[user].add(source2)
                continue

            for source in (source1, source2):
                if source:
                    self.auth_source[user].add(source)
            for phsid, privileges in incoming_projects.items():
                current_projects.setdefault(phsid, set()).update(privileges)
×
845

846
    def sync_to_db_and_storage_backend(
        self,
        user_project,
        user_info,
        sess,
        do_not_revoke_from_db_and_storage=False,
        expires=None,
    ):
        """
        sync user access control to database and storage backend

        The (username, project) pairs currently in the database are compared
        with the pairs being synced; pairs only in the database are revoked
        (unless revocation is disabled), pairs only in the sync input are
        granted, and pairs in both are re-granted/updated.

        Args:
            user_project (dict): a dictionary of

                {
                    username: {
                        'project1': {'read-storage','write-storage'},
                        'project2': {'read-storage'}
                    }
                }

            user_info (dict): a dictionary of {username: user_info{}}
            sess: a sqlalchemy session
            do_not_revoke_from_db_and_storage (bool): when true, skip the
                storage/db revocation steps and the admin validation step
            expires: forwarded to the storage grant calls

        Return:
            None
        """
        google_bulk_mapping = {} if config["GOOGLE_BULK_UPDATES"] else None

        self._init_projects(user_project, sess)

        # index 0 is the dbGaP provider, index 1 the fence provider
        auth_provider_list = [
            self._get_or_create(sess, AuthorizationProvider, name=provider_name)
            for provider_name in ("dbGaP", "fence")
        ]

        cur_db_user_project_list = {
            (ua.user.username.lower(), ua.project.auth_id)
            for ua in sess.query(AccessPrivilege).all()
        }

        # we need to compare db -> whitelist case-insensitively for username.
        # db stores case-sensitively, but we need to query case-insensitively
        user_project_lowercase = {
            username.lower(): projects for username, projects in user_project.items()
        }
        syncing_user_project_list = {
            (username.lower(), project)
            for username, projects in user_project.items()
            for project in projects
        }
        user_info_lowercase = {
            username.lower(): info for username, info in user_info.items()
        }

        to_delete = cur_db_user_project_list - syncing_user_project_list
        to_add = syncing_user_project_list - cur_db_user_project_list
        to_update = cur_db_user_project_list & syncing_user_project_list

        # when updating users we want to maintain case sensitivity in the username so
        # pass the original, non-lowered user_info dict
        self._upsert_userinfo(sess, user_info)

        revoke_enabled = not do_not_revoke_from_db_and_storage

        if revoke_enabled:
            self._revoke_from_storage(
                to_delete, sess, google_bulk_mapping=google_bulk_mapping
            )
            self._revoke_from_db(sess, to_delete)

        self._grant_from_storage(
            to_add,
            user_project_lowercase,
            sess,
            google_bulk_mapping=google_bulk_mapping,
            expires=expires,
        )
        self._grant_from_db(
            sess,
            to_add,
            user_info_lowercase,
            user_project_lowercase,
            auth_provider_list,
        )

        # re-grant
        self._grant_from_storage(
            to_update,
            user_project_lowercase,
            sess,
            google_bulk_mapping=google_bulk_mapping,
            expires=expires,
        )
        self._update_from_db(sess, to_update, user_project_lowercase)

        if revoke_enabled:
            self._validate_and_update_user_admin(sess, user_info_lowercase)

        sess.commit()

        if config["GOOGLE_BULK_UPDATES"]:
            self.logger.info("Doing bulk Google update...")
            update_google_groups_for_users(google_bulk_mapping)
            self.logger.info("Bulk Google update done!")

        sess.commit()
1✔
955

956
    def sync_to_storage_backend(
        self, user_project, user_info, sess, expires, skip_google_updates=False
    ):
        """
        sync user access control to storage backend with given expiration

        Args:
            user_project (dict): a dictionary of

                {
                    username: {
                        'project1': {'read-storage','write-storage'},
                        'project2': {'read-storage'}
                    }
                }

            user_info (dict): a dictionary of attributes for a user.
            sess: a sqlalchemy session
            expires (int): time at which synced Arborist policies and
                   inclusion in any GBAG are set to expire
            skip_google_updates (bool): True if google group updates should be skipped. False if otherwise.
        Return:
            None

        Raises:
            Exception: when ``expires`` is falsy
        """
        if not expires:
            raise Exception(
                f"sync to storage backend requires an expiration. you provided: {expires}"
            )

        google_group_user_mapping = None
        if config["GOOGLE_BULK_UPDATES"]:
            google_group_user_mapping = {}
            get_or_create_proxy_group_id(
                expires=expires,
                user_id=user_info["user_id"],
                username=user_info["username"],
                session=sess,
                storage_manager=self.storage_manager,
            )

        # TODO: eventually it'd be nice to remove this step but it's required
        #       so that grant_from_storage can determine what storage backends
        #       are needed for a project.
        self._init_projects(user_project, sess)

        # usernames are matched case-insensitively, so key everything by the
        # lowered username
        user_project_lowercase = {
            username.lower(): projects for username, projects in user_project.items()
        }
        to_add = {
            (username.lower(), project)
            for username, projects in user_project.items()
            for project in projects
        }

        # the user's record is upserted under the lowered username
        lowered_username = user_info["username"].lower()
        self._upsert_userinfo(sess, {lowered_username: user_info})

        if not skip_google_updates:
            self._grant_from_storage(
                to_add,
                user_project_lowercase,
                sess,
                google_bulk_mapping=google_group_user_mapping,
                expires=expires,
            )

            if config["GOOGLE_BULK_UPDATES"]:
                self.logger.info("Updating user's google groups ...")
                update_google_groups_for_users(google_group_user_mapping)
                self.logger.info("Google groups update done!!")

        sess.commit()
1✔
1030

1031
    def _revoke_from_db(self, sess, to_delete):
        """
        Delete the AccessPrivilege rows for the given user/project pairs.

        Usernames are matched against the lowered database username.

        Args:
            sess: sqlalchemy session
            to_delete: a set of (username, project.auth_id) to be revoked from db
        Return:
            None
        """
        for username, project_auth_id in to_delete:
            privileges_query = (
                sess.query(AccessPrivilege)
                .filter(AccessPrivilege.project.has(auth_id=project_auth_id))
                .join(AccessPrivilege.user)
                .filter(func.lower(User.username) == username)
            )
            for access in privileges_query.all():
                self.logger.info(
                    f"revoke {username} access to {project_auth_id} in db"
                )
                sess.delete(access)
1✔
1054

1055
    def _validate_and_update_user_admin(self, sess, user_info):
        """
        Clear the admin flag of every database admin whose lowered username is
        not a key of ``user_info``.

        Args:
            sess: sqlalchemy session
            user_info: a dict of
            {
                username: {
                    'email': email,
                    'display_name': display_name,
                    'phone_number': phonenum,
                    'tags': {'k1':'v1', 'k2': 'v2'}
                    'admin': is_admin
                }
            }
        Returns:
            None
        """
        current_admins = sess.query(User).filter_by(is_admin=True).all()
        for admin_user in current_admins:
            if admin_user.username.lower() in user_info:
                continue

            admin_user.is_admin = False
            sess.add(admin_user)
            lowered_username = admin_user.username.lower()
            self.logger.info(f"remove admin access from {lowered_username} in db")
1083

1084
    def _update_from_db(self, sess, to_update, user_project):
        """
        Overwrite the stored privilege of existing AccessPrivilege rows with
        the privileges found in ``user_project``.

        Args:
            sess: sqlalchemy session
            to_update:
                a set of (username, project.auth_id) to be updated from db
            user_project:
                a dictionary of {username: {project auth_id: privileges}}

        Return:
            None
        """
        for username, project_auth_id in to_update:
            privileges_query = (
                sess.query(AccessPrivilege)
                .filter(AccessPrivilege.project.has(auth_id=project_auth_id))
                .join(AccessPrivilege.user)
                .filter(func.lower(User.username) == username)
            )
            for access in privileges_query.all():
                access.privilege = user_project[username][project_auth_id]
                self.logger.info(
                    f"update {username} with {access.privilege} access to {project_auth_id} in db"
                )
1112

1113
    def _grant_from_db(self, sess, to_add, user_info, user_project, auth_provider_list):
        """
        Create an AccessPrivilege row for every user/project pair in to_add.

        Args:
            sess: sqlalchemy session
            to_add: a set of (username, project.auth_id) to be granted
            user_info: a dictionary of {username: {..., 'tags': {...}}}
            user_project:
                a dictionary of {username: {project: {'read','write'}}
            auth_provider_list: two authorization providers; the first is
                used for users that have a "dbgap_role" tag, the second for
                everyone else
        Return:
            None
        """
        for username, project_auth_id in to_add:
            db_user = query_for_user(session=sess, username=username)

            provider = auth_provider_list[0]
            has_dbgap_role = "dbgap_role" in user_info[username]["tags"]
            if not has_dbgap_role:
                provider = auth_provider_list[1]

            granted_privileges = list(user_project[username][project_auth_id])
            user_access = AccessPrivilege(
                user=db_user,
                project=self._projects[project_auth_id],
                privilege=granted_privileges,
                auth_provider=provider,
            )
            self.logger.info(
                f"grant user {username} to {user_access.project} with access {user_access.privilege}"
            )
            sess.add(user_access)
1✔
1142

1143
    def _upsert_userinfo(self, sess, user_info):
        """
        Create or update the database user (and its tags) for every entry in
        user_info.

        For each username: the user is created when missing, registered in
        Arborist when an Arborist client is configured, and its email, display
        name, phone number and admin flag are overwritten from user_info
        (missing values fall back to "" / False). An identity provider is only
        attached when the user has none yet. Tags are synced only when
        user_info provides a non-empty "tags" mapping.

        Args:
            sess: sqlalchemy session
            user_info:
                a dict of {username: {display_name, phone_number, tags, admin}

        Return:
            None
        """
        for username in user_info:
            u = query_for_user(session=sess, username=username)

            if u is None:
                self.logger.info("create user {}".format(username))
                u = User(username=username)
                sess.add(u)

            if self.arborist_client:
                self.arborist_client.create_user({"name": username})

            info = user_info[username]
            u.email = info.get("email", "")
            u.display_name = info.get("display_name", "")
            u.phone_number = info.get("phone_number", "")
            u.is_admin = info.get("admin", False)

            idp_name = info.get("idp_name", "")
            if idp_name and not u.identity_provider:
                idp = (
                    sess.query(IdentityProvider)
                    .filter(IdentityProvider.name == idp_name)
                    .first()
                )
                if not idp:
                    idp = IdentityProvider(name=idp_name)
                u.identity_provider = idp

            # do not update if there is no tag
            new_tags = info.get("tags")
            if not new_tags:
                continue

            # remove user db tags if they are not shown in new tags.
            # Iterate over a snapshot: removing from u.tags while iterating it
            # directly skips the element that follows each removed tag.
            for tag in list(u.tags):
                if tag.key not in new_tags:
                    u.tags.remove(tag)

            # sync
            for k, v in new_tags.items():
                found = False
                for tag in u.tags:
                    if tag.key == k:
                        found = True
                        tag.value = v
                # create new tag if not found
                if not found:
                    u.tags.append(Tag(key=k, value=v))
1✔
1203

1204
    def _revoke_from_storage(self, to_delete, sess, google_bulk_mapping=None):
        """
        For every storage access attached to a project, revoke the user's
        access in that storage provider.

        When this object has no ``storage_manager`` attribute, an error is
        logged for each storage access and nothing is revoked.

        Args:
            to_delete: a set of (username, project.auth_id) to be revoked
            sess: sqlalchemy session
            google_bulk_mapping: forwarded to the storage manager

        Return:
            None
        """
        for username, project_auth_id in to_delete:
            project = (
                sess.query(Project).filter(Project.auth_id == project_auth_id).first()
            )
            for sa in project.storage_access:
                if hasattr(self, "storage_manager"):
                    self.logger.info(
                        f"revoke {username} access to {project_auth_id} in {sa.provider.name}"
                    )
                    self.storage_manager.revoke_access(
                        provider=sa.provider.name,
                        username=username,
                        project=project,
                        session=sess,
                        google_bulk_mapping=google_bulk_mapping,
                    )
                else:
                    self.logger.error(
                        (
                            "CANNOT revoke {} access to {} in {} because there is NO "
                            "configured storage accesses at all. See configuration. "
                            "Continuing anyway..."
                        ).format(username, project_auth_id, sa.provider.name)
                    )
1242

1243
    def _grant_from_storage(
1✔
1244
        self, to_add, user_project, sess, google_bulk_mapping=None, expires=None
1245
    ):
1246
        """
1247
        If a project have storage backend, grant user's access to buckets in
1248
        the storage backend.
1249

1250
        Args:
1251
            to_add: a set of (username, project.auth_id)  to be granted
1252
            user_project: a dictionary like:
1253

1254
                    {username: {phsid: {'read-storage','write-storage'}}}
1255

1256
        Return:
1257
            dict of the users' storage usernames to their user_projects and the respective storage access.
1258
        """
1259
        storage_user_to_sa_and_user_project = defaultdict()
1✔
1260
        for username, project_auth_id in to_add:
1✔
1261
            project = self._projects[project_auth_id]
1✔
1262
            for sa in project.storage_access:
1✔
1263
                access = list(user_project[username][project_auth_id])
1✔
1264
                if not hasattr(self, "storage_manager"):
1✔
1265
                    self.logger.error(
×
1266
                        (
1267
                            "CANNOT grant {} access {} to {} in {} because there is NO "
1268
                            "configured storage accesses at all. See configuration. "
1269
                            "Continuing anyway..."
1270
                        ).format(username, access, project_auth_id, sa.provider.name)
1271
                    )
1272
                    continue
×
1273

1274
                self.logger.info(
1✔
1275
                    "grant {} access {} to {} in {}".format(
1276
                        username, access, project_auth_id, sa.provider.name
1277
                    )
1278
                )
1279
                storage_username = self.storage_manager.grant_access(
1✔
1280
                    provider=sa.provider.name,
1281
                    username=username,
1282
                    project=project,
1283
                    access=access,
1284
                    session=sess,
1285
                    google_bulk_mapping=google_bulk_mapping,
1286
                    expires=expires,
1287
                )
1288

1289
                storage_user_to_sa_and_user_project[storage_username] = (sa, project)
1✔
1290
        return storage_user_to_sa_and_user_project
1✔
1291

1292
    def _init_projects(self, user_project, sess):
        """
        Populate ``self._projects`` (auth_id -> Project).

        Projects described in ``self.project_mapping`` are fetched or created
        first and always stored. Then every auth_id referenced in
        ``user_project`` is looked up (and created with name == auth_id when
        missing) and stored unless that auth_id is already cached.

        Raises:
            Exception: when creating a missing project hits an IntegrityError
        """
        if self.project_mapping:
            for mapped_projects in list(self.project_mapping.values()):
                for project_info in mapped_projects:
                    self.logger.debug(
                        f"creating Project with info from project_mapping: {project_info}"
                    )
                    project = self._get_or_create(sess, Project, **project_info)
                    self._projects[project_info["auth_id"]] = project

        for projects in user_project.values():
            for auth_id in list(projects):
                project = sess.query(Project).filter(Project.auth_id == auth_id).first()
                if not project:
                    try:
                        project = self._get_or_create(
                            sess, Project, name=auth_id, auth_id=auth_id
                        )
                    except IntegrityError as e:
                        sess.rollback()
                        self.logger.error(
                            f"Project {auth_id} already exists. Detail {str(e)}"
                        )
                        raise Exception(
                            f"Project {auth_id} already exists. Detail {str(e)}. Please contact your system administrator."
                        )
                if auth_id not in self._projects:
                    self._projects[auth_id] = project
1✔
1324

1325
    @staticmethod
1✔
1326
    def _get_or_create(sess, model, **kwargs):
1✔
1327
        instance = sess.query(model).filter_by(**kwargs).first()
1✔
1328
        if not instance:
1✔
1329
            instance = model(**kwargs)
1✔
1330
            sess.add(instance)
1✔
1331
        return instance
1✔
1332

1333
    def _process_dbgap_files(self, dbgap_config, sess):
1✔
1334
        """
1335
        Args:
1336
            dbgap_config : a dictionary containing information about a single
1337
                           dbgap sftp server (from fence config)
1338
            sess: database session
1339

1340
        Return:
1341
            user_projects (dict)
1342
            user_info (dict)
1343
        """
1344
        dbgap_file_list = []
1✔
1345
        hostname = dbgap_config["info"]["host"]
1✔
1346
        username = dbgap_config["info"]["username"]
1✔
1347
        encrypted = dbgap_config["info"].get("encrypted", True)
1✔
1348
        folderdir = os.path.join(str(self.folder), str(hostname), str(username))
1✔
1349

1350
        try:
1✔
1351
            if os.path.exists(folderdir):
1✔
1352
                dbgap_file_list = glob.glob(
×
1353
                    os.path.join(folderdir, "*")
1354
                )  # get lists of file from folder
1355
            else:
1356
                self.logger.info("Downloading files from: {}".format(hostname))
1✔
1357
                dbgap_file_list = self._download(dbgap_config)
1✔
1358
        except Exception as e:
1✔
1359
            self.logger.error(e)
1✔
1360
            exit(1)
1✔
1361
        self.logger.info("dbgap files: {}".format(dbgap_file_list))
×
1362
        user_projects, user_info = self._get_user_permissions_from_csv_list(
×
1363
            dbgap_file_list,
1364
            encrypted=encrypted,
1365
            session=sess,
1366
            dbgap_config=dbgap_config,
1367
        )
1368

1369
        user_projects = self.parse_projects(user_projects)
×
1370
        return user_projects, user_info
×
1371

1372
    def _get_user_permissions_from_csv_list(
1✔
1373
        self, file_list, encrypted, session, dbgap_config={}
1374
    ):
1375
        """
1376
        Args:
1377
            file_list: list of files (represented as strings)
1378
            encrypted: boolean indicating whether those files are encrypted
1379
            session: sqlalchemy session
1380
            dbgap_config: a dictionary containing information about the dbGaP sftp server
1381
                    (comes from fence config)
1382

1383
        Return:
1384
            user_projects (dict)
1385
            user_info (dict)
1386
        """
1387
        permissions = [{"read-storage", "read"} for _ in file_list]
1✔
1388
        user_projects, user_info = self._parse_csv(
1✔
1389
            dict(list(zip(file_list, permissions))),
1390
            sess=session,
1391
            dbgap_config=dbgap_config,
1392
            encrypted=encrypted,
1393
        )
1394
        return user_projects, user_info
1✔
1395

1396
    def _merge_multiple_local_csv_files(
1✔
1397
        self, dbgap_file_list, encrypted, dbgap_configs, session
1398
    ):
1399
        """
1400
        Args:
1401
            dbgap_file_list (list): a list of whitelist file locations stored locally
1402
            encrypted (bool): whether the file is encrypted (comes from fence config)
1403
            dbgap_configs (list): list of dictionaries containing information about the dbgap server (comes from fence config)
1404
            session (sqlalchemy.Session): database session
1405

1406
        Return:
1407
            merged_user_projects (dict)
1408
            merged_user_info (dict)
1409
        """
1410
        merged_user_projects = {}
1✔
1411
        merged_user_info = {}
1✔
1412

1413
        for dbgap_config in dbgap_configs:
1✔
1414
            user_projects, user_info = self._get_user_permissions_from_csv_list(
1✔
1415
                dbgap_file_list,
1416
                encrypted,
1417
                session=session,
1418
                dbgap_config=dbgap_config,
1419
            )
1420
            self.sync_two_user_info_dict(user_info, merged_user_info)
1✔
1421
            self.sync_two_phsids_dict(user_projects, merged_user_projects)
1✔
1422
        return merged_user_projects, merged_user_info
1✔
1423

1424
    def _merge_multiple_dbgap_sftp(self, dbgap_servers, sess):
1✔
1425
        """
1426
        Args:
1427
            dbgap_servers : a list of dictionaries each containging config on
1428
                           dbgap sftp server (comes from fence config)
1429
            sess: database session
1430

1431
        Return:
1432
            merged_user_projects (dict)
1433
            merged_user_info (dict)
1434
        """
1435
        merged_user_projects = {}
1✔
1436
        merged_user_info = {}
1✔
1437
        for dbgap in dbgap_servers:
1✔
1438
            user_projects, user_info = self._process_dbgap_files(dbgap, sess)
1✔
1439
            # merge into merged_user_info
1440
            # user_info overrides original info in merged_user_info
1441
            self.sync_two_user_info_dict(user_info, merged_user_info)
1✔
1442

1443
            # merge all access info dicts into "merged_user_projects".
1444
            # the access info is combined - if the user_projects access is
1445
            # ["read"] and the merged_user_projects is ["read-storage"], the
1446
            # resulting access is ["read", "read-storage"].
1447
            self.sync_two_phsids_dict(user_projects, merged_user_projects)
1✔
1448
        return merged_user_projects, merged_user_info
1✔
1449

1450
    def parse_projects(self, user_projects):
        """
        Return a copy of user_projects whose keys are lowercased. When several
        keys lower to the same string, the last one wins.
        """
        lowered = {}
        for username, projects in user_projects.items():
            lowered[username.lower()] = projects
        return lowered
1✔
1455

1456
    def _process_dbgap_project(
1✔
1457
        self, dbgap_project, privileges, username, sess, user_projects, dbgap_config
1458
    ):
1459
        if dbgap_project not in self.project_mapping:
1✔
1460
            self._add_dbgap_project_for_user(
1✔
1461
                dbgap_project,
1462
                privileges,
1463
                username,
1464
                sess,
1465
                user_projects,
1466
                dbgap_config,
1467
            )
1468

1469
        for element_dict in self.project_mapping.get(dbgap_project, []):
1✔
1470
            try:
1✔
1471
                phsid_privileges = {element_dict["auth_id"]: set(privileges)}
1✔
1472

1473
                # need to add dbgap project to arborist
1474
                if self.arborist_client:
1✔
1475
                    self._determine_arborist_resource(
1✔
1476
                        element_dict["auth_id"], dbgap_config
1477
                    )
1478

1479
                if username not in user_projects:
1✔
1480
                    user_projects[username] = {}
1✔
1481
                user_projects[username].update(phsid_privileges)
1✔
1482

1483
            except ValueError as e:
×
1484
                self.logger.info(e)
×
1485

1486
    def _process_user_projects(
1✔
1487
        self,
1488
        user_projects,
1489
        enable_common_exchange_area_access,
1490
        study_common_exchange_areas,
1491
        dbgap_config,
1492
        sess,
1493
    ):
1494
        user_projects_to_modify = copy.deepcopy(user_projects)
1✔
1495
        for username in user_projects.keys():
1✔
1496
            for project in user_projects[username].keys():
1✔
1497
                phsid = project.split(".")
1✔
1498
                dbgap_project = phsid[0]
1✔
1499
                privileges = user_projects[username][project]
1✔
1500
                if len(phsid) > 1 and self._get_parse_consent_code(dbgap_config):
1✔
1501
                    consent_code = phsid[-1]
1✔
1502

1503
                    # c999 indicates full access to all consents and access
1504
                    # to a study-specific exchange area
1505
                    # access to at least one study-specific exchange area implies access
1506
                    # to the parent study's common exchange area
1507
                    #
1508
                    # NOTE: Handling giving access to all consents is done at
1509
                    #       a later time, when we have full information about possible
1510
                    #       consents
1511
                    self.logger.debug(
1✔
1512
                        f"got consent code {consent_code} from dbGaP project "
1513
                        f"{dbgap_project}"
1514
                    )
1515
                    if (
1✔
1516
                        consent_code == "c999"
1517
                        and enable_common_exchange_area_access
1518
                        and dbgap_project in study_common_exchange_areas
1519
                    ):
1520
                        self.logger.info(
1✔
1521
                            "found study with consent c999 and Fence "
1522
                            "is configured to parse exchange area data. Giving user "
1523
                            f"{username} {privileges} privileges in project: "
1524
                            f"{study_common_exchange_areas[dbgap_project]}."
1525
                        )
1526
                        self._add_dbgap_project_for_user(
1✔
1527
                            study_common_exchange_areas[dbgap_project],
1528
                            privileges,
1529
                            username,
1530
                            sess,
1531
                            user_projects_to_modify,
1532
                            dbgap_config,
1533
                        )
1534

1535
                    dbgap_project += "." + consent_code
1✔
1536

1537
                self._add_children_for_dbgap_project(
1✔
1538
                    dbgap_project,
1539
                    privileges,
1540
                    username,
1541
                    sess,
1542
                    user_projects_to_modify,
1543
                    dbgap_config,
1544
                )
1545

1546
                self._process_dbgap_project(
1✔
1547
                    dbgap_project,
1548
                    privileges,
1549
                    username,
1550
                    sess,
1551
                    user_projects_to_modify,
1552
                    dbgap_config,
1553
                )
1554
        for user in user_projects_to_modify.keys():
1✔
1555
            user_projects[user] = user_projects_to_modify[user]
1✔
1556

1557
    def sync(self):
        """
        Run ``_sync`` with ``self.session`` when it is set; otherwise run it
        with a session obtained from the ``self.driver.session`` context
        manager.
        """
        if not self.session:
            with self.driver.session as managed_session:
                self._sync(managed_session)
            return
        self._sync(self.session)
×
1563

1564
    def download(self):
        """
        Call ``_download`` once for each server configuration in
        ``self.dbGaP``, in order. The values returned by ``_download`` are
        discarded.
        """
        for server_config in self.dbGaP:
            self._download(server_config)
×
1567

1568
    def _download(self, dbgap_config):
1✔
1569
        """
1570
        Download files from dbgap server.
1571
        """
1572
        server = dbgap_config["info"]
1✔
1573
        protocol = dbgap_config["protocol"]
1✔
1574
        hostname = server["host"]
1✔
1575
        username = server["username"]
1✔
1576
        folderdir = os.path.join(str(self.folder), str(hostname), str(username))
1✔
1577

1578
        if not os.path.exists(folderdir):
1✔
1579
            os.makedirs(folderdir)
1✔
1580

1581
        self.logger.info("Download from server")
1✔
1582
        try:
1✔
1583
            if protocol == "sftp":
1✔
1584
                self._get_from_sftp_with_proxy(server, folderdir)
1✔
1585
            else:
1586
                self._get_from_ftp_with_proxy(server, folderdir)
×
1587
            dbgap_files = glob.glob(os.path.join(folderdir, "*"))
×
1588
            return dbgap_files
×
1589
        except Exception as e:
1✔
1590
            self.logger.error(e)
1✔
1591
            raise
1✔
1592

1593
    def _sync(self, sess):
        """
        Collect files from dbgap server(s), sync csv and yaml files to storage
        backend and fence DB

        Access information is gathered from up to three sources (dbGaP
        server(s), local CSV files, the user.yaml file), merged into one
        ``user_projects`` / ``user_info`` pair, and then pushed to the Fence DB
        and storage backend and to Arborist.

        Args:
            sess: database session used for every step of the sync

        Raises:
            EnvironmentError: the user.yaml could not be loaded, or it has an
                ``authz`` section while no arborist client is configured
            AssertionError: re-raised from loading the user.yaml
            SystemExit: with code 1 when an Arborist sync step reports failure
            GoogleUpdateException: re-raised at the very end, after all other
                syncing has finished
        """
        # get all dbgap files
        user_projects = {}
        user_info = {}
        if self.is_sync_from_dbgap_server:
            self.logger.debug(
                "Pulling telemetry files from {} dbgap sftp servers".format(
                    len(self.dbGaP)
                )
            )
            user_projects, user_info = self._merge_multiple_dbgap_sftp(self.dbGaP, sess)

        local_csv_file_list = []
        if self.sync_from_local_csv_dir:
            local_csv_file_list = glob.glob(
                os.path.join(self.sync_from_local_csv_dir, "*")
            )
            # Sort the list so the order of files is consistent across platforms
            local_csv_file_list.sort()

        user_projects_csv, user_info_csv = self._merge_multiple_local_csv_files(
            local_csv_file_list,
            encrypted=False,
            session=sess,
            dbgap_configs=self.dbGaP,
        )

        try:
            user_yaml = UserYAML.from_file(
                self.sync_from_local_yaml_file, encrypted=False, logger=self.logger
            )
        except (EnvironmentError, AssertionError) as e:
            self.logger.error(str(e))
            self.logger.error("aborting early")
            raise

        # parse all projects
        user_projects_csv = self.parse_projects(user_projects_csv)
        user_projects = self.parse_projects(user_projects)
        user_yaml.projects = self.parse_projects(user_yaml.projects)

        # merge all user info dicts into "user_info".
        # the user info (such as email) in the user.yaml files
        # overrides the user info from the CSV files.
        self.sync_two_user_info_dict(user_info_csv, user_info)
        self.sync_two_user_info_dict(user_yaml.user_info, user_info)

        # merge all access info dicts into "user_projects".
        # the access info is combined - if the user.yaml access is
        # ["read"] and the CSV file access is ["read-storage"], the
        # resulting access is ["read", "read-storage"].
        self.sync_two_phsids_dict(
            user_projects_csv, user_projects, source1="local_csv", source2="dbgap"
        )
        self.sync_two_phsids_dict(
            user_yaml.projects, user_projects, source1="user_yaml", source2="dbgap"
        )

        # Note: every configured dbgap server is checked; the c999 expansion
        # runs once for each server config that has consent code parsing enabled
        for dbgap_config in self.dbGaP:
            if self._get_parse_consent_code(dbgap_config):
                self._grant_all_consents_to_c999_users(
                    user_projects, user_yaml.project_to_resource
                )

        google_update_ex = None

        try:
            # update the Fence DB
            if user_projects:
                self.logger.info("Sync to db and storage backend")
                self.sync_to_db_and_storage_backend(user_projects, user_info, sess)
                self.logger.info("Finish syncing to db and storage backend")
            else:
                self.logger.info("No users for syncing")
        except GoogleUpdateException as ex:
            # save this to reraise later after all non-Google syncing has finished
            # this way, any issues with Google only affect Google data access and don't
            # cascade problems into non-Google AWS or Azure access
            google_update_ex = ex

        # update the Arborist DB (resources, roles, policies, groups)
        if user_yaml.authz:
            if not self.arborist_client:
                raise EnvironmentError(
                    "yaml file contains authz section but sync is not configured with"
                    " arborist client--did you run sync with --arborist <arborist client> arg?"
                )
            self.logger.info("Synchronizing arborist...")
            success = self._update_arborist(user_yaml)
            if success:
                self.logger.info("Finished synchronizing arborist")
            else:
                self.logger.error("Could not synchronize successfully")
                # SystemExit is raised directly: the `exit` helper is injected
                # by the `site` module and is not guaranteed to be available
                raise SystemExit(1)
        else:
            self.logger.info("No `authz` section; skipping arborist sync")

        # update the Arborist DB (user access)
        if self.arborist_client:
            self.logger.info("Synchronizing arborist with authorization info...")
            success = self._update_authz_in_arborist(sess, user_projects, user_yaml)
            if success:
                self.logger.info(
                    "Finished synchronizing authorization info to arborist"
                )
            else:
                self.logger.error(
                    "Could not synchronize authorization info successfully to arborist"
                )
                raise SystemExit(1)
        else:
            self.logger.error("No arborist client set; skipping arborist sync")

        # Logging authz source
        for u, s in self.auth_source.items():
            self.logger.info("Access for user {} from {}".format(u, s))

        self.logger.info(
            f"Persisting authz mapping to database: {user_yaml.project_to_resource}"
        )
        user_yaml.persist_project_to_resource(db_session=sess)
        if google_update_ex is not None:
            # deferred from the Fence DB / storage backend step above
            raise google_update_ex
1✔
1723

1724
    def _grant_all_consents_to_c999_users(
        self, user_projects, user_yaml_project_to_resources
    ):
        """
        Give every user holding a ``c999`` consent on a study access to all
        known accessions-with-consent of that study.

        The known accessions are the keys of ``self._projects`` and of
        ``user_yaml_project_to_resources`` that match
        ``config["DBGAP_ACCESSION_WITH_CONSENT_REGEX"]``. Each consent found on
        a study is also registered for the studies returned by
        ``self._get_children`` for it.

        Args:
            user_projects (dict): {username: {project: privileges}}, updated in
                place; granted accessions are set to {"read-storage", "read"}
            user_yaml_project_to_resources (dict): project -> resource mapping
                from the user.yaml; only its keys are used
        """
        access_number_matcher = re.compile(config["DBGAP_ACCESSION_WITH_CONSENT_REGEX"])
        # combine dbgap/user.yaml projects into one big set (in case not all consents
        # are in either)
        all_projects = {
            *self._projects.keys(),
            *user_yaml_project_to_resources.keys(),
        }

        self.logger.debug(f"all projects: {all_projects}")

        # construct a mapping from phsid (without consent) to all accessions with consent
        consent_mapping = {}
        for project in all_projects:
            phs_match = access_number_matcher.match(project)
            if not phs_match:
                continue
            accession_number = phs_match.groupdict()
            phsid = accession_number["phsid"]
            consent = accession_number["consent"]

            # TODO: This is not handling the .v1.p1 at all
            consent_mapping.setdefault(phsid, set()).add(".".join([phsid, consent]))
            children = self._get_children(phsid)
            if children:
                for child_phs in children:
                    # Assign parent consent to child study
                    consent_mapping.setdefault(child_phs, set()).add(
                        ".".join([child_phs, consent])
                    )

        self.logger.debug(f"consent mapping: {consent_mapping}")

        # go through existing access and find any c999's and make sure to give access to
        # all accessions with consent for that phsid
        for username, user_project_info in user_projects.items():
            # the per-user dict grows below, so iterate a snapshot of its keys
            # (cheaper than deep-copying the whole user_projects structure)
            for project in list(user_project_info):
                phs_match = access_number_matcher.match(project)
                if not phs_match:
                    continue
                matched = phs_match.groupdict()
                if matched["consent"] != "c999":
                    continue
                # give access to all consents
                all_phsids_with_consent = consent_mapping.get(matched["phsid"], [])
                self.logger.info(
                    f"user {username} has c999 consent group for: {project}. "
                    f"Granting access to all consents: {all_phsids_with_consent}"
                )
                # NOTE: read and read-storage are granted; any privileges
                #       already recorded for the accession are replaced
                for phsid_with_consent in all_phsids_with_consent:
                    user_project_info[phsid_with_consent] = {"read-storage", "read"}
1778

1779
    def _update_arborist(self, user_yaml):
1✔
1780
        """
1781
        Create roles, resources, policies, groups in arborist from the information in
1782
        ``user_yaml``.
1783

1784
        The projects are sent to arborist as resources with paths like
1785
        ``/projects/{project}``. Roles are created with just the original names
1786
        for the privileges like ``"read-storage", "read"`` etc.
1787

1788
        Args:
1789
            session (sqlalchemy.Session)
1790
            user_yaml (UserYAML)
1791

1792
        Return:
1793
            bool: success
1794
        """
1795
        healthy = self._is_arborist_healthy()
1✔
1796
        if not healthy:
1✔
1797
            return False
×
1798

1799
        # Set up the resource tree in arborist by combining provided resources with any
1800
        # dbgap resources that were created before this.
1801
        #
1802
        # Why add dbgap resources if they've already been created?
1803
        #   B/C Arborist's PUT update will override existing subresources. So if a dbgap
1804
        #   resources was created under `/programs/phs000178` anything provided in
1805
        #   user.yaml under `/programs` would completely wipe it out.
1806
        resources = user_yaml.authz.get("resources", [])
1✔
1807

1808
        dbgap_resource_paths = []
1✔
1809
        for path_list in self._dbgap_study_to_resources.values():
1✔
1810
            dbgap_resource_paths.extend(path_list)
1✔
1811

1812
        self.logger.debug("user_yaml resources: {}".format(resources))
1✔
1813
        self.logger.debug("dbgap resource paths: {}".format(dbgap_resource_paths))
1✔
1814

1815
        combined_resources = utils.combine_provided_and_dbgap_resources(
1✔
1816
            resources, dbgap_resource_paths
1817
        )
1818

1819
        for resource in combined_resources:
1✔
1820
            try:
1✔
1821
                self.logger.debug(
1✔
1822
                    "attempting to update arborist resource: {}".format(resource)
1823
                )
1824
                self.arborist_client.update_resource("/", resource, merge=True)
1✔
1825
            except ArboristError as e:
×
1826
                self.logger.error(e)
×
1827
                # keep going; maybe just some conflicts from things existing already
1828

1829
        # update roles
1830
        roles = user_yaml.authz.get("roles", [])
1✔
1831
        for role in roles:
1✔
1832
            try:
1✔
1833
                response = self.arborist_client.update_role(role["id"], role)
1✔
1834
                if response:
1✔
1835
                    self._created_roles.add(role["id"])
1✔
1836
            except ArboristError as e:
×
1837
                self.logger.info(
×
1838
                    "couldn't update role '{}', creating instead".format(str(e))
1839
                )
1840
                try:
×
1841
                    response = self.arborist_client.create_role(role)
×
1842
                    if response:
×
1843
                        self._created_roles.add(role["id"])
×
1844
                except ArboristError as e:
×
1845
                    self.logger.error(e)
×
1846
                    # keep going; maybe just some conflicts from things existing already
1847

1848
        # update policies
1849
        policies = user_yaml.authz.get("policies", [])
1✔
1850
        for policy in policies:
1✔
1851
            policy_id = policy.pop("id")
1✔
1852
            try:
1✔
1853
                self.logger.debug(
1✔
1854
                    "Trying to upsert policy with id {}".format(policy_id)
1855
                )
1856
                response = self.arborist_client.update_policy(
1✔
1857
                    policy_id, policy, create_if_not_exist=True
1858
                )
1859
            except ArboristError as e:
×
1860
                self.logger.error(e)
×
1861
                # keep going; maybe just some conflicts from things existing already
1862
            else:
1863
                if response:
1✔
1864
                    self.logger.debug("Upserted policy with id {}".format(policy_id))
1✔
1865
                    self._created_policies.add(policy_id)
1✔
1866

1867
        # update groups
1868
        groups = user_yaml.authz.get("groups", [])
1✔
1869

1870
        # delete from arborist the groups that have been deleted
1871
        # from the user.yaml
1872
        arborist_groups = set(
1✔
1873
            g["name"] for g in self.arborist_client.list_groups().get("groups", [])
1874
        )
1875
        useryaml_groups = set(g["name"] for g in groups)
1✔
1876
        for deleted_group in arborist_groups.difference(useryaml_groups):
1✔
1877
            # do not try to delete built in groups
1878
            if deleted_group not in ["anonymous", "logged-in"]:
×
1879
                self.arborist_client.delete_group(deleted_group)
×
1880

1881
        # create/update the groups defined in the user.yaml
1882
        for group in groups:
1✔
1883
            missing = {"name", "users", "policies"}.difference(set(group.keys()))
×
1884
            if missing:
×
1885
                name = group.get("name", "{MISSING NAME}")
×
1886
                self.logger.error(
×
1887
                    "group {} missing required field(s): {}".format(name, list(missing))
1888
                )
1889
                continue
×
1890
            try:
×
1891
                response = self.arborist_client.put_group(
×
1892
                    group["name"],
1893
                    # Arborist doesn't handle group descriptions yet
1894
                    # description=group.get("description", ""),
1895
                    users=group["users"],
1896
                    policies=group["policies"],
1897
                )
1898
            except ArboristError as e:
×
1899
                self.logger.info("couldn't put group: {}".format(str(e)))
×
1900

1901
        # Update policies for built-in (`anonymous` and `logged-in`) groups
1902

1903
        # First recreate these groups in order to clear out old, possibly deleted policies
1904
        for builtin_group in ["anonymous", "logged-in"]:
1✔
1905
            try:
1✔
1906
                response = self.arborist_client.put_group(builtin_group)
1✔
1907
            except ArboristError as e:
×
1908
                self.logger.info("couldn't put group: {}".format(str(e)))
×
1909

1910
        # Now add back policies that are in the user.yaml
1911
        for policy in user_yaml.authz.get("anonymous_policies", []):
1✔
1912
            self.arborist_client.grant_group_policy("anonymous", policy)
×
1913

1914
        for policy in user_yaml.authz.get("all_users_policies", []):
1✔
1915
            self.arborist_client.grant_group_policy("logged-in", policy)
×
1916

1917
        return True
1✔
1918

1919
    def _grant_arborist_policies(
1✔
1920
        self,
1921
        username,
1922
        incoming_policies,
1923
        user_yaml,
1924
        expires=None,
1925
        remove_users_with_no_policies=True,
1926
    ):
1927
        """
1928
        Find the difference between the existing policies for a user and the incoming policies,
1929
        and decide whether to add, remove, or keep policies.
1930

1931
        Args:
1932
            username (str): the username of the user
1933
            incoming_policies (set): set of policies to be applied to the user
1934
            user_yaml (UserYAML): UserYAML object containing authz information
1935
            expires (int): time at which authz info in Arborist should expire
1936
            remove_users_with_no_policies (bool): whether to delete users with no access from
1937
                the Arborist database
1938
        """
1939
        user_existing_policies = set()
1✔
1940
        to_add = set()
1✔
1941
        to_remove = set()
1✔
1942
        is_revoke_all = False
1✔
1943

1944
        try:
1✔
1945
            user_existing_policies = set(
1✔
1946
                policy["policy"]
1947
                for policy in self.arborist_client.get_user(username)["policies"]
1948
            )
1949
            self.logger.info(
1✔
1950
                f"Fetched user {username} existing policies: {user_existing_policies}"
1951
            )
1952
        except ArboristError as e:
1✔
1953
            self.logger.error(
1✔
1954
                f"Could not get user {username} policies from Arborist: {e}. Revoking all policies..."
1955
            )
1956
            # if getting existing policies fails, revoke all policies and re-apply
1957
            is_revoke_all = True
1✔
1958

1959
        if user_yaml:
1✔
1960
            anonymous_policies = set(
1✔
1961
                user_yaml.authz.get("anonymous_policies", [])
1962
                + user_yaml.authz.get("all_users_policies", [])
1963
            )
1964
            user_existing_policies = user_existing_policies - anonymous_policies
1✔
1965

1966
        if is_revoke_all is False and len(incoming_policies) > 0:
1✔
1967
            to_add = incoming_policies - user_existing_policies
1✔
1968
            to_remove = user_existing_policies - incoming_policies
1✔
1969
        else:
1970
            # if incoming_policies is empty, we revoke all policies
1971
            is_revoke_all = True
1✔
1972

1973
        if not is_revoke_all:
1✔
1974
            success = not to_remove
1✔
1975
            try:
1✔
1976
                if to_remove:
1✔
1977
                    for policy in to_remove:
1✔
1978
                        self.logger.info(
1✔
1979
                            f"Revoking policy {policy} for user {username}."
1980
                        )
1981
                        success = self.arborist_client.revoke_user_policy(
1✔
1982
                            username, policy
1983
                        )
1984
            except ArboristError as e:
×
1985
                self.logger.error(
×
1986
                    f"Could not revoke user {username} policy {policy}: {e}"
1987
                )
1988
            if not success:
1✔
1989
                # `revoke_user_policy` returns None in case of error
1990
                self.logger.error(
×
1991
                    f"Could not revoke user {username} policy. Revoking all instead."
1992
                )
1993
                is_revoke_all = True
×
1994

1995
        if is_revoke_all:
1✔
1996
            if (
1✔
1997
                remove_users_with_no_policies
1998
                and not incoming_policies
1999
                and not user_existing_policies
2000
            ):
2001
                # user without any access (other than anonymous and logged-in groups).
2002
                # cleanup: remove from the arborist DB so we do not check their access again every
2003
                # time this code runs.
2004
                self.logger.info(
1✔
2005
                    f"Deleting user {username} from Arborist (since they have no policies)."
2006
                )
2007
                self.arborist_client.delete_user(username)
1✔
2008
                return
1✔
2009
            success = False
1✔
2010
            try:
1✔
2011
                # Note: If a user only has group policies, we call `revoke_all_policies_for_user`
2012
                # for nothing. Could be fixed by adding a flag to the arborist "get user" endpoint
2013
                # to get the list of policies _excluding_ group policies, or by manually checking
2014
                # which policies are group policies (not worth it atm).
2015
                self.logger.info(f"Revoking all policies for user {username}.")
1✔
2016
                success = self.arborist_client.revoke_all_policies_for_user(username)
1✔
2017
            except ArboristError as e:
×
2018
                self.logger.error(
×
2019
                    f"Could not revoke all policies for user {username}. Error: {e}"
2020
                )
2021
            if not success:
1✔
2022
                # `revoke_all_policies_for_user` returns None in case of error
2023
                raise Exception(f"Could not revoke all policies for user {username}")
×
2024
            to_add = incoming_policies  # if we revoke all, we need to add all incoming policies
1✔
2025

2026
        if (
1✔
2027
            "mfa_policy" not in incoming_policies
2028
            and "mfa_policy" in user_existing_policies
2029
        ):
2030
            to_add.add("mfa_policy")
×
2031

2032
        if to_add:
1✔
2033
            self.logger.info(f"Bulk granting user {username} policies {to_add}.")
1✔
2034
            self._grant_bulk_user_policies(username, to_add, expires)
1✔
2035

2036
    def _update_authz_in_arborist(
1✔
2037
        self,
2038
        session,
2039
        user_projects,
2040
        user_yaml=None,
2041
        single_user_sync=False,
2042
        expires=None,
2043
    ):
2044
        """
2045
        Assign users policies in arborist from the information in
2046
        ``user_projects`` and optionally a ``user_yaml``.
2047

2048
        The projects are sent to arborist as resources with paths like
2049
        ``/projects/{project}``. Roles are created with just the original names
2050
        for the privileges like ``"read-storage", "read"`` etc.
2051

2052
        Args:
2053
            user_projects (dict)
2054
            user_yaml (UserYAML) optional, if there are policies for users in a user.yaml
2055
            single_user_sync (bool) whether authz update is for a single user
2056
            expires (int) time at which authz info in Arborist should expire
2057

2058
        Return:
2059
            bool: success
2060
        """
2061
        healthy = self._is_arborist_healthy()
1✔
2062
        if not healthy:
1✔
2063
            return False
×
2064

2065
        self.logger.debug("user_projects: {}".format(user_projects))
1✔
2066

2067
        if user_yaml:
1✔
2068
            self.logger.debug(
1✔
2069
                "useryaml abac before lowering usernames: {}".format(
2070
                    user_yaml.user_abac
2071
                )
2072
            )
2073
            user_yaml.user_abac = {
1✔
2074
                key.lower(): value for key, value in user_yaml.user_abac.items()
2075
            }
2076
            # update the project info with `projects` specified in user.yaml
2077
            self.sync_two_phsids_dict(user_yaml.user_abac, user_projects)
1✔
2078

2079
        # get list of users from arborist to make sure users that are completely removed
2080
        # from authorization sources get policies revoked
2081

2082
        arborist_user_projects = {}
1✔
2083
        if not single_user_sync:
1✔
2084

2085
            try:
1✔
2086
                arborist_users = self.arborist_client.get_users().json["users"]
1✔
2087

2088
                # construct user information, NOTE the lowering of the username. when adding/
2089
                # removing access, the case in the Fence db is used. For combining access, it is
2090
                # case-insensitive, so we lower
2091
                arborist_user_projects = {
1✔
2092
                    user["name"].lower(): {} for user in arborist_users
2093
                }
2094
            except (ArboristError, KeyError, AttributeError) as error:
×
2095
                # TODO usersync should probably exit with non-zero exit code at the end,
2096
                #      but sync should continue from this point so there are no partial
2097
                #      updates
2098
                self.logger.warning(
×
2099
                    "Could not get list of users in Arborist, continuing anyway. "
2100
                    "WARNING: this sync will NOT remove access for users no longer in "
2101
                    f"authorization sources. Error: {error}"
2102
                )
2103

2104
            # update the project info with users from arborist
2105
            self.sync_two_phsids_dict(arborist_user_projects, user_projects)
1✔
2106

2107
        # prefer in-memory if available from user_yaml, if not, get from database
2108
        if user_yaml and user_yaml.project_to_resource:
1✔
2109
            project_to_authz_mapping = user_yaml.project_to_resource
1✔
2110
            self.logger.debug(
1✔
2111
                f"using in-memory project to authz resource mapping from "
2112
                f"user.yaml (instead of database): {project_to_authz_mapping}"
2113
            )
2114
        else:
2115
            project_to_authz_mapping = get_project_to_authz_mapping(session)
1✔
2116
            self.logger.debug(
1✔
2117
                f"using persisted project to authz resource mapping from database "
2118
                f"(instead of user.yaml - as it may not be available): {project_to_authz_mapping}"
2119
            )
2120

2121
        self.logger.debug(
1✔
2122
            f"_dbgap_study_to_resources: {self._dbgap_study_to_resources}"
2123
        )
2124
        all_resources = [
1✔
2125
            r
2126
            for resources in self._dbgap_study_to_resources.values()
2127
            for r in resources
2128
        ]
2129
        all_resources.extend(r for r in project_to_authz_mapping.values())
1✔
2130
        self._create_arborist_resources(all_resources)
1✔
2131

2132
        for username, user_project_info in user_projects.items():
1✔
2133
            self.logger.info("processing user `{}`".format(username))
1✔
2134
            user = query_for_user(session=session, username=username)
1✔
2135
            idp = None
1✔
2136
            if user:
1✔
2137
                username = user.username
1✔
2138
                idp = user.identity_provider.name if user.identity_provider else None
1✔
2139

2140
            self.arborist_client.create_user_if_not_exist(username)
1✔
2141

2142
            # as of 2/11/2022, for single_user_sync, as RAS visa parsing has
2143
            # previously mapped each project to the same set of privileges
2144
            # (i.e.{'read', 'read-storage'}), unique_policies will just be a
2145
            # single policy with ('read', 'read-storage') being the single
2146
            # key
2147
            unique_policies = self._determine_unique_policies(
1✔
2148
                user_project_info, project_to_authz_mapping
2149
            )
2150
            for roles in unique_policies.keys():
1✔
2151
                for role in roles:
1✔
2152
                    self._create_arborist_role(role)
1✔
2153

2154
            incoming_policies = set()  # set of policies for current user.
1✔
2155

2156
            if single_user_sync:
1✔
2157
                for ordered_roles, ordered_resources in unique_policies.items():
1✔
2158
                    policy_hash = self._hash_policy_contents(
1✔
2159
                        ordered_roles, ordered_resources
2160
                    )
2161
                    self._create_arborist_policy(
1✔
2162
                        policy_hash,
2163
                        ordered_roles,
2164
                        ordered_resources,
2165
                        skip_if_exists=True,
2166
                    )
2167
                    # return here as it is not expected single_user_sync
2168
                    # will need any of the remaining user_yaml operations
2169
                    # left in _update_authz_in_arborist
2170
                    return self._grant_arborist_policy(
1✔
2171
                        username, policy_hash, expires=expires
2172
                    )
2173
            else:
2174
                policy_ids_to_grant = set()
1✔
2175
                for roles, resources in unique_policies.items():
1✔
2176
                    for role in roles:
1✔
2177
                        for resource in resources:
1✔
2178
                            # grant a policy to this user which is a single
2179
                            # role on a single resource
2180

2181
                            # format project '/x/y/z' -> 'x.y.z'
2182
                            # so the policy id will be something like 'x.y.z-create'
2183
                            policy_id = _format_policy_id(resource, role)
1✔
2184
                            incoming_policies.add(policy_id)
1✔
2185
                            if policy_id not in self._created_policies:
1✔
2186
                                try:
1✔
2187
                                    self.arborist_client.update_policy(
1✔
2188
                                        policy_id,
2189
                                        {
2190
                                            "description": "policy created by fence sync",
2191
                                            "role_ids": [role],
2192
                                            "resource_paths": [resource],
2193
                                        },
2194
                                        create_if_not_exist=True,
2195
                                    )
2196
                                except ArboristError as e:
×
2197
                                    self.logger.info(
×
2198
                                        "not creating policy in arborist; {}".format(
2199
                                            str(e)
2200
                                        )
2201
                                    )
2202
                                self._created_policies.add(policy_id)
1✔
2203
                            policy_ids_to_grant.add(policy_id)
1✔
2204
                self._grant_arborist_policies(
1✔
2205
                    username,
2206
                    policy_ids_to_grant,
2207
                    user_yaml=None,
2208
                    expires=expires,
2209
                    remove_users_with_no_policies=False,
2210
                )
2211

2212
            if user_yaml:
1✔
2213
                user_yaml_policies = set(user_yaml.policies.get(username, []))
1✔
2214
                incoming_policies = (
1✔
2215
                    incoming_policies | user_yaml_policies
2216
                )  # add policies from whitelist and useryaml
2217

2218
            self._grant_arborist_policies(
1✔
2219
                username,
2220
                incoming_policies,
2221
                user_yaml,
2222
                expires=expires,
2223
                remove_users_with_no_policies=True,
2224
            )
2225

2226
        if user_yaml:
1✔
2227
            for client_name, client_details in user_yaml.clients.items():
1✔
2228
                client_policies = client_details.get("policies", [])
×
2229
                clients = session.query(Client).filter_by(name=client_name).all()
×
2230
                # update existing clients, do not create new ones
2231
                if not clients:
×
2232
                    self.logger.warning(
×
2233
                        "client to update (`{}`) does not exist in fence: skipping".format(
2234
                            client_name
2235
                        )
2236
                    )
2237
                    continue
×
2238
                self.logger.debug(
×
2239
                    "updating client `{}` (found {} client IDs)".format(
2240
                        client_name, len(clients)
2241
                    )
2242
                )
2243
                # there may be more than 1 client with this name if credentials are being rotated,
2244
                # so we grant access to each client ID
2245
                for client in clients:
×
2246
                    try:
×
2247
                        self.arborist_client.update_client(
×
2248
                            client.client_id, client_policies
2249
                        )
2250
                    except ArboristError as e:
×
2251
                        self.logger.info(
×
2252
                            "not granting policies {} to client `{}` (`{}`); {}".format(
2253
                                client_policies, client_name, client.client_id, str(e)
2254
                            )
2255
                        )
2256

2257
        return True
1✔
2258

2259
    def _determine_unique_policies(self, user_project_info, project_to_authz_mapping):
1✔
2260
        """
2261
        Determine and return a dictionary of unique policies.
2262

2263
        Args (examples):
2264
            user_project_info (dict):
2265
            {
2266
                'phs000002.c1': { 'read-storage', 'read' },
2267
                'phs000001.c1': { 'read', 'read-storage' },
2268
                'phs000004.c1': { 'write', 'read' },
2269
                'phs000003.c1': { 'read', 'write' },
2270
                'phs000006.c1': { 'write-storage', 'write', 'read-storage', 'read' }
2271
                'phs000005.c1': { 'read', 'read-storage', 'write', 'write-storage' },
2272
            }
2273
            project_to_authz_mapping (dict):
2274
            {
2275
                'phs000001.c1': '/programs/DEV/projects/phs000001.c1'
2276
            }
2277

2278
        Return (for examples):
2279
            dict:
2280
            {
2281
                ('read', 'read-storage'): ('phs000001.c1', 'phs000002.c1'),
2282
                ('read', 'write'): ('phs000003.c1', 'phs000004.c1'),
2283
                ('read', 'read-storage', 'write', 'write-storage'): ('phs000005.c1', 'phs000006.c1'),
2284
            }
2285
        """
2286
        roles_to_resources = collections.defaultdict(list)
1✔
2287
        for study, roles in user_project_info.items():
1✔
2288
            ordered_roles = tuple(sorted(roles))
1✔
2289
            study_authz_paths = self._dbgap_study_to_resources.get(study, [study])
1✔
2290
            if study in project_to_authz_mapping:
1✔
2291
                study_authz_paths = [project_to_authz_mapping[study]]
1✔
2292
            roles_to_resources[ordered_roles].extend(study_authz_paths)
1✔
2293

2294
        policies = {}
1✔
2295
        for ordered_roles, unordered_resources in roles_to_resources.items():
1✔
2296
            policies[ordered_roles] = tuple(sorted(unordered_resources))
1✔
2297
        return policies
1✔
2298

2299
    def _create_arborist_role(self, role):
1✔
2300
        """
2301
        Wrapper around gen3authz's create_role with additional logging
2302

2303
        Args:
2304
            role (str): what the Arborist identity should be of the created role
2305

2306
        Return:
2307
            bool: True if the role was created successfully or it already
2308
                  exists. False otherwise
2309
        """
2310
        if role in self._created_roles:
1✔
2311
            return True
1✔
2312
        try:
1✔
2313
            response_json = self.arborist_client.create_role(
1✔
2314
                arborist_role_for_permission(role)
2315
            )
2316
        except ArboristError as e:
×
2317
            self.logger.error(
×
2318
                "could not create `{}` role in Arborist: {}".format(role, e)
2319
            )
2320
            return False
×
2321
        self._created_roles.add(role)
1✔
2322

2323
        if response_json is None:
1✔
2324
            self.logger.info("role `{}` already exists in Arborist".format(role))
×
2325
        else:
2326
            self.logger.info("created role `{}` in Arborist".format(role))
1✔
2327
        return True
1✔
2328

2329
    def _create_arborist_resources(self, resources):
        """
        Create resources in Arborist

        The flat list of paths is converted into nested resource trees by
        utils.combine_provided_and_dbgap_resources({}, resources); each
        top-level tree is then merged under "/" with its own update_resource
        request.

        Args:
            resources (list): a list of full Arborist resource paths to create
            [
                "/programs/DEV/projects/phs000001.c1",
                "/programs/DEV/projects/phs000002.c1",
                "/programs/DEV/projects/phs000003.c1"
            ]

        Return:
            bool: True if the resources were successfully created, False as
                  soon as one request raises an ArboristError (remaining
                  trees are then not sent)

        Number of network requests (as of 2/11/2022):
            * For the paths above the combined result is a single tree
              rooted at `programs`:
                [
                    { 'name': 'programs', 'subresources': [
                        { 'name': 'DEV', 'subresources': [
                            { 'name': 'projects', 'subresources': [
                                { 'name': 'phs000001.c1', 'subresources': []},
                                { 'name': 'phs000002.c1', 'subresources': []},
                                { 'name': 'phs000003.c1', 'subresources': []}
                            ]}
                        ]}
                    ]}
                ]
              so only one request is sent to Arborist.
            * For ["/phs000001.c1", "/phs000002.c1", "/phs000003.c1"] the
              combined result has three top-level entries:
                [
                    {'name': 'phs000001.c1', 'subresources': []},
                    {'name': 'phs000002.c1', 'subresources': []},
                    {'name': 'phs000003.c1', 'subresources': []}
                ]
              so three requests are sent.

        As a practical matter, for sync_single_user_visas, studies should be
        nested under the `/programs` resource as in the first case (i.e.
        only one network request gets made).

        TODO for the sake of simplicity, it would be nice if only one network
        request was made no matter the input.
        """
        resource_trees = utils.combine_provided_and_dbgap_resources({}, resources)
        for request_body in resource_trees:
            try:
                self.arborist_client.update_resource("/", request_body, merge=True)
            except ArboristError as e:
                error_message = "could not create Arborist resources using request body `{}`. error: {}".format(
                    request_body, e
                )
                self.logger.error(error_message)
                return False

        self.logger.debug(
            "created {} resource(s) in Arborist: `{}`".format(len(resources), resources)
        )
        return True
1✔
2394

2395
    def _create_arborist_policy(
1✔
2396
        self, policy_id, roles, resources, skip_if_exists=False
2397
    ):
2398
        """
2399
        Wrapper around gen3authz's create_policy with additional logging
2400

2401
        Args:
2402
            policy_id (str): what the Arborist identity should be of the created policy
2403
            roles (iterable): what roles the create policy should have
2404
            resources (iterable): what resources the created policy should have
2405
            skip_if_exists (bool): if True, this function will not treat an already
2406
                                   existent policy as an error
2407

2408
        Return:
2409
            bool: True if policy creation was successful. False otherwise
2410
        """
2411
        try:
1✔
2412
            response_json = self.arborist_client.create_policy(
1✔
2413
                {
2414
                    "id": policy_id,
2415
                    "role_ids": roles,
2416
                    "resource_paths": resources,
2417
                },
2418
                skip_if_exists=skip_if_exists,
2419
            )
2420
        except ArboristError as e:
×
2421
            self.logger.error(
×
2422
                "could not create policy `{}` in Arborist: {}".format(policy_id, e)
2423
            )
2424
            return False
×
2425

2426
        if response_json is None:
1✔
2427
            self.logger.info("policy `{}` already exists in Arborist".format(policy_id))
×
2428
        else:
2429
            self.logger.info("created policy `{}` in Arborist".format(policy_id))
1✔
2430
        return True
1✔
2431

2432
    def _hash_policy_contents(self, ordered_roles, ordered_resources):
1✔
2433
        """
2434
        Generate a sha256 hexdigest representing ordered_roles and ordered_resources.
2435

2436
        Args:
2437
            ordered_roles (iterable): policy roles in sorted order
2438
            ordered_resources (iterable): policy resources in sorted order
2439

2440
        Return:
2441
            str: SHA256 hex digest
2442
        """
2443

2444
        def escape(s):
1✔
2445
            return s.replace(",", "\\,")
1✔
2446

2447
        canonical_roles = ",".join(escape(r) for r in ordered_roles)
1✔
2448
        canonical_resources = ",".join(escape(r) for r in ordered_resources)
1✔
2449
        canonical_policy = f"{canonical_roles},,f{canonical_resources}"
1✔
2450
        policy_hash = hashlib.sha256(canonical_policy.encode("utf-8")).hexdigest()
1✔
2451

2452
        return policy_hash
1✔
2453

2454
    def _grant_arborist_policy(self, username, policy_id, expires=None):
1✔
2455
        """
2456
        Wrapper around gen3authz's grant_user_policy with additional logging
2457

2458
        Args:
2459
            username (str): username of user in Arborist who policy should be
2460
                            granted to
2461
            policy_id (str): Arborist policy id
2462
            expires (int): POSIX timestamp for when policy should expire
2463

2464
        Return:
2465
            bool: True if granting of policy was successful, False otherwise
2466
        """
2467
        try:
1✔
2468
            resp = self.arborist_client.grant_user_policy(
1✔
2469
                username,
2470
                policy_id,
2471
                expires_at=expires,
2472
            )
2473
            if not resp:
1✔
2474
                self.logger.error(
1✔
2475
                    "could not grant policy `{}` to user `{}`".format(
2476
                        policy_id, username
2477
                    )
2478
                )
2479
                return False
1✔
2480
        except ArboristError as e:
×
2481
            self.logger.error(
×
2482
                "could not grant policy `{}` to user `{}`: {}".format(
2483
                    policy_id, username, e
2484
                )
2485
            )
2486
            return False
×
2487

2488
        self.logger.debug(
1✔
2489
            "granted policy `{}` to user `{}`".format(policy_id, username)
2490
        )
2491
        return True
1✔
2492

2493
    def _grant_bulk_user_policies(self, username, policy_ids, expires=None):
1✔
2494
        """
2495
        Wrapper around gen3authz's grant_user_policies with additional logging
2496

2497
        Args:
2498
            username (str): username of user in Arborist who policy should be
2499
                            granted to
2500
            policy_ids (set[str]): Arborist policy ids
2501

2502
        Return:
2503
            bool: True if granting of policies was successful, False otherwise
2504
        """
2505
        try:
1✔
2506
            resp = self.arborist_client.grant_bulk_user_policy(
1✔
2507
                username, policy_ids, expires
2508
            )
2509
            if not resp:
1✔
2510
                self.logger.error(
×
2511
                    "could not grant bulk policies to user `{}`".format(username)
2512
                )
2513
                return False
×
2514
        except ArboristError as e:
×
2515
            self.logger.error(
×
2516
                "could not grant bulk policies to user `{}`: {}".format(username, e)
2517
            )
2518
            return False
×
2519
        except ArboristTimeoutError as e:
×
2520
            self.logger.error(
×
2521
                f"Timeout waiting for response to grant bulk policies  to user `{username}`: {e}"
2522
                "This user will be skipped and usersync will continue."
2523
                "As long as the timeout is not a pool/connection timeout, then "
2524
            )
2525
            return False
×
2526
        return True
1✔
2527

2528
    def _determine_arborist_resource(self, dbgap_study, dbgap_config):
1✔
2529
        """
2530
        Determine the arborist resource path and add it to
2531
        _self._dbgap_study_to_resources
2532

2533
        Args:
2534
            dbgap_study (str): study phs identifier
2535
            dbgap_config (dict): dictionary of config for dbgap server
2536

2537
        """
2538
        default_namespaces = dbgap_config.get("study_to_resource_namespaces", {}).get(
1✔
2539
            "_default", ["/"]
2540
        )
2541
        namespaces = dbgap_config.get("study_to_resource_namespaces", {}).get(
1✔
2542
            dbgap_study, default_namespaces
2543
        )
2544

2545
        self.logger.debug(f"dbgap study namespaces: {namespaces}")
1✔
2546

2547
        arborist_resource_namespaces = [
1✔
2548
            namespace.rstrip("/") + "/programs/" for namespace in namespaces
2549
        ]
2550

2551
        for resource_namespace in arborist_resource_namespaces:
1✔
2552
            full_resource_path = resource_namespace + dbgap_study
1✔
2553
            if dbgap_study not in self._dbgap_study_to_resources:
1✔
2554
                self._dbgap_study_to_resources[dbgap_study] = []
1✔
2555
            self._dbgap_study_to_resources[dbgap_study].append(full_resource_path)
1✔
2556
        return arborist_resource_namespaces
1✔
2557

2558
    def _is_arborist_healthy(self):
1✔
2559
        if not self.arborist_client:
1✔
2560
            self.logger.warning("no arborist client set; skipping arborist dbgap sync")
×
2561
            return False
×
2562
        if not self.arborist_client.healthy():
1✔
2563
            # TODO (rudyardrichter, 2019-01-07): add backoff/retry here
2564
            self.logger.error(
×
2565
                "arborist service is unavailable; skipping main arborist dbgap sync"
2566
            )
2567
            return False
×
2568
        return True
1✔
2569

2570
    def _pick_sync_type(self, visa):
1✔
2571
        """
2572
        Pick type of visa to parse according to the visa provider
2573
        """
2574
        sync_client = None
1✔
2575
        if visa.type in self.visa_types["ras"]:
1✔
2576
            sync_client = self.ras_sync_client
1✔
2577
        else:
2578
            raise Exception(
×
2579
                "Visa type {} not recognized. Configure in fence-config".format(
2580
                    visa.type
2581
                )
2582
            )
2583
        if not sync_client:
1✔
2584
            raise Exception("Sync client for {} not configured".format(visa.type))
×
2585

2586
        return sync_client
1✔
2587

2588
    def sync_single_user_visas(
        self, user, ga4gh_visas, sess=None, expires=None, skip_google_updates=False
    ):
        """
        Sync a single user's visas during login or DRS/data access

        IMPORTANT NOTE: THIS DOES NOT VALIDATE THE VISA. ENSURE THIS IS DONE
                        BEFORE THIS.

        Steps, in order:
            1. parse every visa; visas that fail to parse are logged and
               skipped
            2. merge the parsed projects and run them through
               parse_projects / _process_user_projects (and the c999 consent
               expansion when consent codes are parsed)
            3. push the result to the storage backend
            4. push the result to Arborist, if a client is set

        Args:
            user (userdatamodel.user.User): Fence user whose visas'
                                            authz info is being synced
            ga4gh_visas (list): a list of fence.models.GA4GHVisaV1 objects
                                that are ALREADY VALIDATED
            sess (sqlalchemy.orm.session.Session): database session
            expires (int): time at which synced Arborist policies and
                           inclusion in any GBAG are set to expire
            skip_google_updates (bool): True if google group updates should be skipped. False if otherwise.

        Return:
            list of successfully parsed visas

        Raises:
            EnvironmentError, AssertionError: re-raised (after logging) when
                the local user yaml file cannot be loaded
        """
        self.ras_sync_client = RASVisa(logger=self.logger)

        # settings are read from the first dbGaP config entry only
        dbgap_config = self.dbGaP[0]
        parse_consent_code = self._get_parse_consent_code(dbgap_config)
        enable_common_exchange_area_access = dbgap_config.get(
            "enable_common_exchange_area_access", False
        )
        study_common_exchange_areas = dbgap_config.get(
            "study_common_exchange_areas", {}
        )

        try:
            user_yaml = UserYAML.from_file(
                self.sync_from_local_yaml_file, encrypted=False, logger=self.logger
            )
        except (EnvironmentError, AssertionError) as e:
            self.logger.error(str(e))
            self.logger.error("aborting early")
            raise

        merged_projects = {}
        info = {}
        parsed_visas = []

        for visa in ga4gh_visas:
            sync_client = self._pick_sync_type(visa)
            encoded_visa = visa.ga4gh_visa

            try:
                visa_projects, info = sync_client._parse_single_visa(
                    user,
                    encoded_visa,
                    visa.expires,
                    parse_consent_code,
                )
            except Exception:
                # best effort: one bad visa must not block the others
                self.logger.warning(
                    f"ignoring unsuccessfully parsed or expired visa: {encoded_visa}"
                )
                continue

            # later visas override earlier ones on duplicate project keys
            merged_projects.update(visa_projects)
            parsed_visas.append(visa)

        info["user_id"] = user.id
        info["username"] = user.username

        user_projects = self.parse_projects({user.username: merged_projects})

        if parse_consent_code and enable_common_exchange_area_access:
            self.logger.info(
                f"using study to common exchange area mapping: {study_common_exchange_areas}"
            )

        self._process_user_projects(
            user_projects,
            enable_common_exchange_area_access,
            study_common_exchange_areas,
            dbgap_config,
            sess,
        )

        if parse_consent_code:
            self._grant_all_consents_to_c999_users(
                user_projects, user_yaml.project_to_resource
            )

        if not user_projects:
            self.logger.info("No users for syncing")
        else:
            self.sync_to_storage_backend(
                user_projects,
                info,
                sess,
                expires=expires,
                skip_google_updates=skip_google_updates,
            )

        # update arborist db (user access)
        if not self.arborist_client:
            self.logger.error("No arborist client set; skipping arborist sync")
            return parsed_visas

        self.logger.info("Synchronizing arborist with authorization info...")
        success = self._update_authz_in_arborist(
            sess,
            user_projects,
            user_yaml=user_yaml,
            single_user_sync=True,
            expires=expires,
        )
        if success:
            self.logger.info("Finished synchronizing authorization info to arborist")
        else:
            self.logger.error(
                "Could not synchronize authorization info successfully to arborist"
            )

        return parsed_visas
1✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc