• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

4dn-dcic / Submit4DN / 9781395274

03 Jul 2024 04:17PM UTC coverage: 83.187%. Remained the same
9781395274

Pull #177

github

web-flow
Merge 7f68368eb into 189704511
Pull Request #177: Support for Python 3.12

1039 of 1249 relevant lines covered (83.19%)

3.33 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

81.97
/wranglertools/import_data.py
1
#!/usr/bin/env python3
2
# -*- coding: latin-1 -*-
3
"""See the epilog for detailed information."""
4
import argparse
4✔
5
import pathlib as pp
4✔
6
import hashlib
4✔
7
from wranglertools.get_field_info import (
4✔
8
    sheet_order, FDN_Key, FDN_Connection,
9
    create_common_arg_parser, _remove_all_from_types)
10
from dcicutils import ff_utils
4✔
11
import openpyxl
4✔
12
import warnings  # to suppress openpxl warning about headers
4✔
13
from openpyxl.utils.exceptions import InvalidFileException
4✔
14
import datetime
4✔
15
import sys
4✔
16
import mimetypes
4✔
17
import requests
4✔
18
from base64 import b64encode
4✔
19
import magic  # install me with 'pip install python-magic'
4✔
20
# https://github.com/ahupp/python-magic
21
# this is the site for python-magic in case we need it
22
import ast
4✔
23
import os
4✔
24
import time
4✔
25
import subprocess
4✔
26
import shutil
4✔
27
import re
4✔
28
from collections import OrderedDict, Counter
4✔
29
from urllib import request as urllib2
4✔
30
from contextlib import closing
4✔
31

32

33
EPILOG = '''
4✔
34
This script takes in an Excel file with the data
35
This is a dryrun-default script, run with --update, --patchall or both (--update --patchall)
36
to actually submit data to the portal
37

38
By DEFAULT:
39
If there is a uuid, @id, accession, or previously submitted alias in the document:
40
Use '--patchall' if you want to patch ALL objects in your document and ignore that message
41

42
If you want to upload new items(no existing object identifiers are found),
43
in the document you need to use '--update' for POSTing to occur
44

45
Defining Object type:
46
    Each "sheet" of the excel file is named after the object type you are uploading,
47
    with the format used on http://data.4dnucleome.org//profiles/
48
Ex: ExperimentHiC, Biosample, Document, BioFeature
49

50
If you only want to submit a subset of sheets in a workbook use the --type option with the
51
sheet name Ex: %(prog)s mydata.xsls --type ExperimentHiC
52

53
The name of each sheet should be the names of the object type.
54
Ex: Award, Lab, BioFeature, etc.
55

56
The first row of the sheets should be the field names
57
Ex: aliases, experiment_type, etc.
58

59
To upload objects with attachments, use the column titled "attachment"
60
containing the full path to the file you wish to attach
61

62
To delete a field, use the keyword "*delete*" as the value.
63

64
For more details:
65
please see README.rst
66
'''
67

68

69
def getArgs():  # pragma: no cover
70
    parser = argparse.ArgumentParser(
71
        parents=[create_common_arg_parser()],
72
        description=__doc__, epilog=EPILOG,
73
        formatter_class=argparse.RawDescriptionHelpFormatter,
74
    )
75

76
    parser.add_argument('infile',
77
                        help="the datafile containing object data to import")
78
    parser.add_argument('--update',
79
                        default=False,
80
                        action='store_true',
81
                        help="Let the script PATCH the data.  Default is False")
82
    parser.add_argument('--patchall',
83
                        default=False,
84
                        action='store_true',
85
                        help="PATCH existing objects.  Default is False \
86
                        and will only PATCH with user override")
87
    parser.add_argument('--remote',
88
                        default=False,
89
                        action='store_true',
90
                        help="will skip attribution prompt \
91
                        needed for automated submissions")
92
    parser.add_argument('--lab',
93
                        help="When using --remote can pass in a valid lab identifier \
94
                        eg. uuid or @id to add attribution - must be able to submit for lab and \
95
                        not needed if only submit for a single lab.")
96
    parser.add_argument('--award',
97
                        help="When using --remote if you are submitting for a lab with multiple awards \
98
                        can pass a valid award identifier eg. uuid or @id to add attribution \
99
                        not needed if there is only one award associated with the submitting lab.")
100
    parser.add_argument('--novalidate',
101
                        default=False,
102
                        action='store_true',
103
                        help="Will skip pre-validation of workbook")
104
    args = parser.parse_args()
105
    _remove_all_from_types(args)
106
    return args
107

108

109
# list of [sheet, [fields]] that need to be patched as a second step
110
# should be in sync with loadxl.py in fourfront
111
list_of_loadxl_fields = [
4✔
112
    ['Document', ['references']],
113
    ['User', ['lab', 'submits_for']],
114
    ['ExperimentType', ['sop', 'reference_pubs']],
115
    ['Biosample', ['biosample_relation']],
116
    ['Experiment', ['experiment_relation']],
117
    ['ExperimentMic', ['experiment_relation']],
118
    ['ExperimentHiC', ['experiment_relation']],
119
    ['ExperimentSeq', ['experiment_relation']],
120
    ['ExperimentTsaseq', ['experiment_relation']],
121
    ['ExperimentDamid', ['experiment_relation']],
122
    ['ExperimentChiapet', ['experiment_relation']],
123
    ['ExperimentAtacseq', ['experiment_relation']],
124
    ['ExperimentCaptureC', ['experiment_relation']],
125
    ['ExperimentRepliseq', ['experiment_relation']],
126
    ['FileFastq', ['related_files']],
127
    ['FileReference', ['related_files']],
128
    ['FileCalibration', ['related_files']],
129
    ['FileMicroscopy', ['related_files']],
130
    ['FileProcessed', ['related_files', 'produced_from']],
131
    ['Individual', ['individual_relation']],
132
    ['IndividualChicken', ['individual_relation']],
133
    ['IndividualFly', ['individual_relation']],
134
    ['IndividualHuman', ['individual_relation']],
135
    ['IndividualMouse', ['individual_relation']],
136
    ['IndividualPrimate', ['individual_relation']],
137
    ['IndividualZebrafish', ['individual_relation']],
138
    ['Publication', ['exp_sets_prod_in_pub', 'exp_sets_used_in_pub']]
139
]
140

141

142
def md5(path_string):
4✔
143
    path = pp.Path(path_string).expanduser()
4✔
144
    md5sum = hashlib.md5()
4✔
145
    with open(path, 'rb') as f:
4✔
146
        for chunk in iter(lambda: f.read(1024*1024), b''):
4✔
147
            md5sum.update(chunk)
4✔
148
    return md5sum.hexdigest()
4✔
149

150

151
class WebFetchException(Exception):
4✔
152
    """
153
    custom exception to raise if ftp or http fetch fails
154
    """
155
    pass
4✔
156

157

158
def attachment(path):
4✔
159
    """Create an attachment upload object from a filename and embed the attachment as a data url.
160
       NOTE: a url or ftp can be used but path must end in filename with extension that will match
161
       the magic detected MIME type of that file and be one of the allowed mime types
162
    """
163
    ALLOWED_MIMES = (
4✔
164
        'application/pdf',
165
        'application/zip',
166
        'text/plain',
167
        'text/tab-separated-values',
168
        'text/html',
169
        'application/msword',
170
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
171
        'application/vnd.ms-excel',
172
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
173
        'image/png',
174
        'image/jpeg',
175
        'image/gif',
176
        'image/tiff',
177
    )
178
    ftp_attach = False
4✔
179
    if path.startswith('~'):
4✔
180
        path = str(pp.Path(path).expanduser())
4✔
181
    if not pp.Path(path).is_file():
4✔
182
        # if the path does not exist, check if it works as a URL
183
        if path.startswith("ftp://"):  # grab the file from ftp
4✔
184
            print("\nINFO: Attempting to download file from this url %s" % path)
×
185
            try:
×
186
                with closing(urllib2.urlopen(path)) as r:
×
187
                    file_name = path.split("/")[-1]
×
188
                    with open(file_name, 'wb') as f:
×
189
                        shutil.copyfileobj(r, f)
×
190
                        path = file_name
×
191
                        ftp_attach = True
×
192
            except urllib2.URLError as e:
×
193
                raise WebFetchException("\nERROR : FTP fetch for 'attachment' failed - {}".format(e))
×
194
        else:
195
            try:
4✔
196
                r = requests.get(path)
4✔
197
            except Exception:
4✔
198
                raise WebFetchException(
4✔
199
                    "\nERROR : The 'attachment' field has INVALID FILE PATH or URL ({})\n".format(path))
200
            else:
201
                # if it works as a URL, but does not return 200
202
                if r.status_code != 200:  # pragma: no cover
203
                    raise Exception("\nERROR : The 'attachment' field has INVALID URL ({})\n".format(path))
204
            # parse response
205
            path = path.split("/")[-1]
4✔
206
            try:
4✔
207
                with open(path, "wb") as outfile:
4✔
208
                    outfile.write(r.content)
4✔
209
                    ftp_attach = True
4✔
210
            except Exception as e:
×
211
                raise Exception("\nERROR : Cannot write a tmp file to disk - {}".format(e))
×
212

213
    attach = {}
4✔
214
    filename = pp.PurePath(path).name
4✔
215
    guessed_mime = mimetypes.guess_type(path)[0]
4✔
216
    detected_mime = magic.from_file(path, mime=True)
4✔
217
    # NOTE: this whole guessing and detecting bit falls apart for zip files which seems a bit dodgy
218
    # some .zip files are detected as generic application/octet-stream but don't see a good way to verify
219
    # basically relying on extension with a little verification by magic for most file types
220
    if guessed_mime not in ALLOWED_MIMES:
4✔
221
        raise ValueError("Unallowed file type for %s" % filename)
4✔
222
    if detected_mime != guessed_mime and guessed_mime != 'application/zip':
4✔
223
        raise ValueError('Wrong extension for %s: %s' % (detected_mime, filename))
4✔
224

225
    with open(path, 'rb') as stream:
4✔
226
        attach = {
4✔
227
            'download': filename,
228
            'type': guessed_mime,
229
            'href': 'data:%s;base64,%s' % (guessed_mime, b64encode(stream.read()).decode('ascii'))
230
        }
231
    if ftp_attach:
4✔
232
        pp.Path(path).unlink()
4✔
233
    return attach
4✔
234

235

236
def digest_xlsx(filename):
4✔
237
    try:
4✔
238
        with warnings.catch_warnings():
4✔
239
            warnings.simplefilter("ignore")
4✔
240
            book = openpyxl.load_workbook(filename)
4✔
241
    except InvalidFileException as e:
×
242
        if filename.endswith('.xls'):
×
243
            print("WARNING - Old xls format not supported - please save your workbook as xlsx")
×
244
        else:
245
            print("ERROR - ", e)
×
246
        sys.exit(1)
×
247
    sheets = book.sheetnames
4✔
248
    return book, sheets
4✔
249

250

251
def reader(workbook, sheetname=None):
4✔
252
    """Read named sheet or first and only sheet from xlsx file."""
253
    if sheetname is None:
4✔
254
        sheet = workbook.worksheets[0]
4✔
255
    else:
256
        try:
4✔
257
            sheet = workbook[sheetname]
4✔
258
        except Exception as e:
4✔
259
            print(e)
4✔
260
            print(sheetname)
4✔
261
            print("ERROR: Can not find the collection sheet in excel file (openpyxl error)")
4✔
262
            return
4✔
263
    # Generator that gets rows from excel sheet
264
    # NB we have a lot of empty no formatting rows added (can we get rid of that)
265
    # or do we need to be careful to check for the first totally emptyvalue row?
266
    return row_generator(sheet)
4✔
267

268

269
def row_generator(sheet):
4✔
270
    """Generator that gets rows from excel sheet
271
    Note that this currently checks to see if a row is empty and if so stops
272
    This is needed as plain text formatting of cells is recognized as data
273
    """
274
    for row in sheet.rows:
4✔
275
        vals = [cell_value(cell) for cell in row]
4✔
276
        if not any([v for v in vals]):
4✔
277
            return
4✔
278
        else:
279
            yield vals
4✔
280

281

282
def cell_value(cell):
4✔
283
    """Get cell value from excel. [From Submit4DN]"""
284
    ctype = cell.data_type
4✔
285
    value = cell.value
4✔
286
    if ctype == openpyxl.cell.cell.TYPE_ERROR:  # pragma: no cover
287
        raise ValueError('Cell %s contains a cell error' % str(cell.coordinate))
288
    elif value is None:
4✔
289
        return ''
4✔
290
    elif ctype == openpyxl.cell.cell.TYPE_BOOL:
4✔
291
        boolstr = str(value).strip()
4✔
292
        if boolstr == 'TRUE':
4✔
293
            return True
×
294
        elif boolstr == 'FALSE':
4✔
295
            return False
×
296
        else:
297
            return value
4✔
298
    elif ctype in (openpyxl.cell.cell.TYPE_NUMERIC, openpyxl.cell.cell.TYPE_NULL):
4✔
299
        if isinstance(value, float):
4✔
300
            if value.is_integer():
4✔
301
                value = int(value)
×
302
        if not value:
4✔
303
            return ''
×
304
        return value
4✔
305
    elif isinstance(value, openpyxl.cell.cell.TIME_TYPES):
4✔
306
        if isinstance(value, datetime.datetime):
4✔
307
            if value.time() == datetime.time(0, 0, 0):
4✔
308
                return value.date().isoformat()
4✔
309
            else:  # pragma: no cover
310
                return value.isoformat()
311
        else:
312
            return value.isoformat()
×
313
    elif ctype in (openpyxl.cell.cell.TYPE_STRING, openpyxl.cell.cell.TYPE_INLINE):
4✔
314
        return value.strip()
4✔
315
    raise ValueError(
316
        'Cell %s is not an acceptable cell type' % str(cell.coordinate)
317
    )  # pragma: no cover
318

319

320
def data_formatter(value, val_type, field=None):
4✔
321
    """Return formatted data."""
322
    # If val_type is int/num, but the value is not
323
    # this function will just return the string
324
    # schema validation will report the error
325
    try:
4✔
326
        if val_type in ["int", "integer"]:
4✔
327
            return int(value)
4✔
328
        elif val_type in ["num", "number"]:
4✔
329
            return float(value)
4✔
330
        elif val_type in ["list", "array"]:
4✔
331
            data_list = value.strip("[\']").split(",")
4✔
332
            return [data.strip() for data in data_list]
4✔
333
        elif val_type == 'boolean':
4✔
334
            return value
×
335
        else:
336
            # default assumed to be string
337
            return str(value).strip()
4✔
338
    except ValueError:  # pragma: no cover
339
        return str(value).strip()
340

341

342
def get_field_name(field_name):
4✔
343
    """handle type at end, plus embedded objets."""
344
    field = field_name.replace('*', '')
4✔
345
    field = field.split(':')[0]
4✔
346
    return field.split(".")[0]
4✔
347

348

349
def get_sub_field(field_name):
4✔
350
    """Construct embeded field names."""
351
    try:
4✔
352
        return field_name.split(".")[1].rstrip('-0123456789')
4✔
353
    except Exception:  # pragma: no cover
354
        return ''
355

356

357
def get_field_type(field_name):
4✔
358
    """Grab old style (ENCODE) data field type."""
359
    try:
4✔
360
        return field_name.split(":")[1]
4✔
361
    except IndexError:
4✔
362
        return "string"
4✔
363

364

365
def is_embedded_field(field_name):
4✔
366
    """See if field is embedded."""
367
    return '.' in field_name
4✔
368

369

370
def get_sub_field_number(field_name):
4✔
371
    """Name clearing for multiple objects."""
372
    field_name = field_name.replace('*', '')
4✔
373
    field = field_name.split(":")[0]
4✔
374
    try:
4✔
375
        return int(field.split("-")[1])
4✔
376
    except Exception:
4✔
377
        return 0
4✔
378

379

380
def build_field(field, field_data, field_type):
4✔
381
    if field_data is False:
4✔
382
        pass
×
383
    elif not field_data or not field:
4✔
384
        return None
4✔
385
    patch_field_name = get_field_name(field)
4✔
386
    if not field_type:
4✔
387
        field_type = get_field_type(field)
4✔
388
    if ',' in field_type:
4✔
389
        field_type, subfield_type = [s.strip() for s in field_type.split(",")]
4✔
390
    if 'array' in field_type:
4✔
391
        field_type = 'array'
4✔
392
    if is_embedded_field(field):
4✔
393
        sub_field = get_sub_field(field)
4✔
394
        return build_field(sub_field, field_data, subfield_type)
4✔
395
    else:
396
        patch_field_data = data_formatter(field_data, field_type, field)
4✔
397
    return {patch_field_name: patch_field_data}
4✔
398

399

400
def fix_attribution(sheet, post_json, connection):
4✔
401
    if sheet.lower() not in ['lab', 'award', 'user', 'organism', 'ontologyterm']:
4✔
402
        if not post_json.get('lab'):
4✔
403
            post_json['lab'] = connection.lab
4✔
404
        if not post_json.get('award'):
4✔
405
            post_json['award'] = connection.award
4✔
406
    return post_json
4✔
407

408

409
def parse_exception(e):
4✔
410
    """ff_utils functions raise an exception when the expected code is not returned.
411
    This response is a pre-formatted text, and this function will get the resonse json
412
    out of it."""
413
    try:
4✔
414
        # try parsing the exception
415
        text = e.args[0]
4✔
416
        index = text.index('Reason: ')
4✔
417
        resp_text = text[index + 8:]
4✔
418
        resp_dict = ast.literal_eval(resp_text)
4✔
419
        return resp_dict
4✔
420
    # if not re-raise
421
    except Exception:  # pragma: no cover
422
        raise e
423

424

425
def get_existing(post_json, connection):
4✔
426
    """Get the entry that will be patched from the server."""
427
    # get all possible identifier from the json
428
    all_ids = []
4✔
429
    for identifier in ["uuid", "accession", "@id"]:
4✔
430
        if post_json.get(identifier):
4✔
431
            all_ids.append(post_json[identifier])
4✔
432
    # also look for all aliases
433
    if post_json.get("aliases"):
4✔
434
        # weird precaution in case there are 2 aliases, 1 exisitng , 1 new
435
        all_ids.extend(post_json['aliases'])
4✔
436
    # look if post_json has these 3 identifier
437
    temp = {}
4✔
438
    uuids = []
4✔
439
    for an_id in all_ids:
4✔
440
        try:
4✔
441
            temp = ff_utils.get_metadata(an_id, key=connection.key, add_on="frame=object")
4✔
442
        except Exception as e:
×
443
            exc = parse_exception(e)
×
444
            # if the item does not exist get_metadata will raise an exceptions
445
            # see if the exception message has 404, then continue, if not throw that exception
446
            if exc['code'] == 404:
×
447
                temp = {}
×
448
            else:
449
                raise e
×
450
        if temp.get("uuid"):
4✔
451
            uuids.append(temp.get("uuid"))
4✔
452

453
    # check if all existing identifiers point to the same object
454
    unique_uuids = list(set(uuids))
4✔
455
    # if no existing information
456
    if len(unique_uuids) == 0:
4✔
457
        return {}
×
458
    # if everything is as expected
459
    elif len(unique_uuids) == 1:
4✔
460
        temp = ff_utils.get_metadata(unique_uuids[0], key=connection.key, add_on="frame=object")
4✔
461
        return temp
4✔
462
    # funky business not allowed, if identifiers point to different objects
463
    else:  # pragma: no cover
464
        print("ERROR - Personality disorder - ERROR")
465
        print("Used identifiers (aliases, uuid, accession, @id) point to following different existing items")
466
        print(unique_uuids)
467
        return
468

469

470
def get_f_type(field, fields2types):
4✔
471
    return fields2types.get(field, None)
4✔
472

473

474
def add_to_mistype_message(item='', itype='', ftype='', msg=''):
4✔
475
    toadd = "ERROR: '%s' is " % item
4✔
476
    if 'HTTPNotFound' in itype:
4✔
477
        toadd += 'NOT FOUND '
4✔
478
    else:
479
        toadd += 'TYPE %s ' % itype
4✔
480
    return msg + toadd + '- THE REQUIRED TYPE IS %s\n' % ftype
4✔
481

482

483
def validate_item(itemlist, typeinfield, alias_dict, connection):
4✔
484
    msg = ''
4✔
485
    pattern = re.compile(r"/[\w-]+/\w")
4✔
486
    for item in itemlist:
4✔
487
        if item in alias_dict:
4✔
488
            itemtype = alias_dict[item]
4✔
489
            if typeinfield not in itemtype:
4✔
490
                # need special cases for FileSet and ExperimentSet?
491
                msg = add_to_mistype_message(item, itemtype, typeinfield, msg)
4✔
492
        else:
493
            # check for fully qualified path i.e. /labs/4dn-dcic-lab/
494
            match = pattern.match(item)
4✔
495
            if not item.startswith('/'):
4✔
496
                item = '/' + item
4✔
497
            match = pattern.match(item)
4✔
498
            if match is None:
4✔
499
                item = '/' + typeinfield + item
4✔
500
            try:
4✔
501
                res = ff_utils.get_metadata(item, key=connection.key, add_on="frame=object")
4✔
502
            except Exception as problem:
×
503
                res = parse_exception(problem)
×
504
            itemtypes = res.get('@type')
4✔
505
            if itemtypes:
4✔
506
                if typeinfield not in itemtypes:
4✔
507
                    msg = add_to_mistype_message(item, itemtypes[0], typeinfield, msg)
4✔
508
    return msg.rstrip()
4✔
509

510

511
def validate_string(strings, alias_dict):
4✔
512
    """check if the string value is in the aliases list."""
513
    msg = ''
4✔
514
    for s in strings:
4✔
515
        if alias_dict.get(s, None) is not None:
4✔
516
            msg = msg + "WARNING: ALIAS %s USED IN string Field\n" % s
4✔
517
    return msg.rstrip()
4✔
518

519

520
def _convert_to_array(s, is_array):
4✔
521
    if is_array:
4✔
522
        return [i.strip() for i in s.split(',')]
4✔
523
    return [s.strip()]
4✔
524

525

526
def validate_field(field_data, field_type, aliases_by_type, connection):
4✔
527
    to_trim = 'array of embedded objects, '
4✔
528
    is_array = False
4✔
529
    msg = None
4✔
530
    field_data = data_formatter(field_data, field_type)
4✔
531
    if field_type.startswith(to_trim):
4✔
532
        field_type = field_type.replace(to_trim, '')
4✔
533
    if 'array' in field_type:
4✔
534
        is_array = True
4✔
535
    if 'Item:' in field_type:
4✔
536
        _, itemtype = field_type.rsplit(':', 1)
4✔
537
        items = _convert_to_array(field_data, is_array)
4✔
538
        msg = validate_item(items, itemtype, aliases_by_type, connection)
4✔
539
    elif 'string' in field_type:
4✔
540
        strings = _convert_to_array(field_data, is_array)
4✔
541
        msg = validate_string(strings, aliases_by_type)
4✔
542
    elif 'boolean' in field_type:
×
543
        pass  # for now
544
    return msg
4✔
545

546

547
def pre_validate_json(post_json, fields2types, aliases_by_type, connection):
4✔
548
    report = []
4✔
549
    for field, field_data in post_json.items():
4✔
550
        # ignore commented out fields
551
        if field.startswith('#'):
4✔
552
            continue
4✔
553
        # ignore empty fields
554
        if not field_data:
4✔
555
            continue
4✔
556
        # ignore certain fields - aliases validated before
557
        # source_experiments and produced_from hold strings of aliases by design
558
        if field in ['aliases', 'produced_from', 'source_experiments']:
4✔
559
            continue
4✔
560
        field_type = get_f_type(field, fields2types)
4✔
561
        msg = validate_field(field_data, field_type, aliases_by_type, connection)
4✔
562
        if msg:
4✔
563
            report.append(msg)
×
564
    return report
4✔
565

566

567
def build_patch_json(fields, fields2types):
4✔
568
    """Create the data entry dictionary from the fields."""
569
    patch_data = {}
4✔
570
    for field, field_data in fields.items():
4✔
571
        # ignore commented out rows
572
        if field.startswith('#'):
4✔
573
            continue
4✔
574
        field_type = get_f_type(field, fields2types)
4✔
575

576
        patch_field = build_field(field, field_data, field_type)
4✔
577
        if patch_field is not None:
4✔
578
            if is_embedded_field(field):
4✔
579
                top_field = get_field_name(field)
4✔
580
                if patch_data.get(top_field, None) is None:
4✔
581
                    # initially create an empty list for embedded field
582
                    patch_data[top_field] = []
4✔
583
                # we can have multiple embedded objects (they are numbered in excel)
584
                subobject_num = get_sub_field_number(field)
4✔
585
                if subobject_num >= len(patch_data[top_field]):
4✔
586
                    # add a new row to the list
587
                    patch_data[top_field].append(patch_field)
4✔
588
                else:
589
                    # update existing object in the list
590
                    patch_data[top_field][subobject_num].update(patch_field)
4✔
591
            else:
592
                # normal case, just update the dictionary
593
                patch_data.update(patch_field)
4✔
594
    return patch_data
4✔
595

596

597
def get_just_filename(path):
4✔
598
    return pp.Path(path).name
4✔
599

600

601
def check_extra_file_meta(ef_info, seen_formats, existing_formats):
4✔
602
    try:
4✔
603
        ef_format = ef_info.get('file_format')
4✔
604
    except AttributeError:
4✔
605
        print('WARNING! -- Malformed extrafile field formatting', ef_info)
4✔
606
        return None, seen_formats
4✔
607
    else:
608
        if not ef_format:
4✔
609
            return ef_info, seen_formats
4✔
610

611
    # convert format to @id
612
    ef_format = '/file-formats/' + ef_format + '/'
4✔
613
    ef_info['file_format'] = ef_format
4✔
614
    if ef_format in existing_formats:
4✔
615
        print("An extrafile with %s format exists - will attempt to patch" % ef_format)
4✔
616

617
    filepath = ef_info.get('filename')
4✔
618
    if filepath is not None:
4✔
619
        sfilename = get_just_filename(filepath)
4✔
620
        ef_info['submitted_filename'] = sfilename
4✔
621
        if not ef_info.get('md5sum'):
4✔
622
            ef_info['md5sum'] = md5(filepath)
4✔
623
        if not ef_info.get('filesize'):
4✔
624
            ef_info['filesize'] = pp.Path(filepath).stat().st_size
4✔
625
    seen_formats.append(ef_format)
4✔
626
    return ef_info, seen_formats
4✔
627

628

629
def populate_post_json(post_json, connection, sheet, attach_fields):  # , existing_data):
4✔
630
    """Get existing, add attachment, check for file and fix attribution."""
631
    # add attachments
632
    for af in attach_fields:
4✔
633
        if post_json.get(af):
4✔
634
            attach = attachment(post_json[af])
4✔
635
            post_json[af] = attach
4✔
636
    existing_data = get_existing(post_json, connection)
4✔
637
    # Combine aliases
638
    if post_json.get('aliases') != ['*delete*']:
4✔
639
        if post_json.get('aliases') and existing_data.get('aliases'):
4✔
640
            aliases_to_post = list(set(filter(None, post_json.get('aliases') + existing_data.get('aliases'))))
×
641
            post_json["aliases"] = aliases_to_post
×
642
    # Combine tags
643
    if post_json.get('tags') != ['*delete*']:
4✔
644
        if post_json.get('tags') and existing_data.get('tags'):
4✔
645
            tags_to_post = list(set(filter(None, post_json.get('tags') + existing_data.get('tags'))))
×
646
            post_json["tags"] = tags_to_post
×
647
    # delete calculated property
648
    if post_json.get('@id'):
4✔
649
        del post_json['@id']
×
650
    # should I upload files as well?
651
    file_to_upload = False
4✔
652
    filename_to_post = post_json.get('filename')
4✔
653
    if filename_to_post:
4✔
654
        # remove full path from filename
655
        just_filename = get_just_filename(filename_to_post)
4✔
656
        # if new file
657
        if not existing_data.get('uuid'):
4✔
658
            post_json['filename'] = just_filename
4✔
659
            file_to_upload = True
4✔
660
        # if there is an existing file metadata, the status should be uploading to upload a new one
661
        elif existing_data.get('status') in ['uploading', 'upload failed']:
4✔
662
            post_json['filename'] = just_filename
4✔
663
            file_to_upload = True
4✔
664
        else:
665
            # if not uploading a file, do not post the filename
666
            del post_json['filename']
×
667

668
    # deal with extrafiles
669
    extrafiles = post_json.get('extra_files')
4✔
670
    extrafiles2upload = {}
4✔
671
    if extrafiles:
4✔
672
        # in sheet these will be file paths need to both poopulate the extrafiles properties
673
        # in post or patch as well as upload the file if not already there
674
        existing_formats = []
4✔
675
        existing_extrafiles = []
4✔
676
        extrafile_metadata = []
4✔
677
        if existing_data:
4✔
678
            if existing_data.get('extra_files'):
4✔
679
                existing_extrafiles = existing_data.get('extra_files')  # to include existing
4✔
680
                existing_formats = [ef.get('file_format') for ef in existing_data.get('extra_files')]
4✔
681
        seen_formats = []
4✔
682
        for extrafile in extrafiles:
4✔
683
            extrafile_meta, seen_formats = check_extra_file_meta(extrafile, seen_formats, existing_formats)
4✔
684
            if extrafile_meta:
4✔
685
                if extrafile_meta.get('file_format'):
4✔
686
                    if extrafile_meta.get('filename'):
4✔
687
                        extrafiles2upload[extrafile_meta['file_format']] = extrafile_meta['filename']
4✔
688
                        del extrafile_meta['filename']
4✔
689
                    for ix, eef in enumerate(existing_extrafiles):
4✔
690
                        if eef['file_format'] == extrafile_meta['file_format']:
4✔
691
                            # we are patching so want to remove existing entry from existing_extrafiles
692
                            del existing_extrafiles[ix]
4✔
693
                            break
4✔
694
                extrafile_metadata.append(extrafile_meta)
4✔
695

696
        if extrafile_metadata:
4✔
697
            # we have data to update
698
            post_json['extra_files'] = extrafile_metadata + existing_extrafiles
4✔
699
        else:
700
            del post_json['extra_files']
4✔
701

702
    # if no existing data (new item), add missing award/lab information from submitter
703
    if not existing_data.get("award"):
4✔
704
        post_json = fix_attribution(sheet, post_json, connection)
4✔
705
    return post_json, existing_data, file_to_upload, extrafiles2upload
4✔
706

707

708
def filter_set_from_exps(post_json):
4✔
709
    """Experiments set information is taken from experiments and submitted to experiment_set."""
710
    rep_set_info = []
×
711
    exp_set_info = []
×
712
    # Part I - Replicate Sets
713
    # store the values in a list and delete them from post_json
714
    if post_json.get('replicate_set'):
×
715
        for replicate_field in ['replicate_set', 'bio_rep_no', 'tec_rep_no']:
×
716
            rep_set_info.append(post_json[replicate_field])
×
717
            post_json.pop(replicate_field)
×
718
    # Part II - Experiment Sets
719
    if post_json.get('experiment_set'):
×
720
        exp_set_info.append(post_json['experiment_set'])
×
721
        post_json.pop('experiment_set')
×
722
    return post_json, rep_set_info, exp_set_info
×
723

724

725
def filter_loadxl_fields(post_json, sheet):
4✔
726
    """All fields from the list_of_loadxl_fields are taken out of post_json and accumulated in dictionary."""
727
    patch_loadxl_item = {}
4✔
728
    for sheet_loadxl, fields_loadxl in list_of_loadxl_fields:
4✔
729
        if sheet == sheet_loadxl:
4✔
730
            for field_loadxl in fields_loadxl:
4✔
731
                if post_json.get(field_loadxl):
4✔
732
                    patch_loadxl_item[field_loadxl] = post_json[field_loadxl]
×
733
                    del post_json[field_loadxl]
×
734
    return post_json, patch_loadxl_item
4✔
735

736

737
def combine_set(post_json, existing_data, sheet, accumulate_dict):
4✔
738
    """Combine experiment related information form dictionaries with existing information."""
739
    # find all identifiers from exisiting set item to match the one used in experiments sheet
740
    identifiers = []
4✔
741
    for identifier in ['accession', 'uuid', 'aliases', '@id']:
4✔
742
        ex_item_id = existing_data.get(identifier, '')
4✔
743
        item_id = post_json.get(identifier, ex_item_id)
4✔
744
        # to extract alias from list
745
        if isinstance(item_id, list) and item_id:
4✔
746
            item_id = item_id[0]
4✔
747
        if item_id:
4✔
748
            identifiers.append(item_id)
4✔
749
    # search dictionary for the existing item id
750
    for identifier in identifiers:
4✔
751
        if accumulate_dict.get(identifier):
4✔
752
            add_to_post = accumulate_dict.get(identifier)
4✔
753
            # Combination for experimentsets
754
            if sheet == "ExperimentSet":
4✔
755
                if existing_data.get('experiments_in_set'):
4✔
756
                    existing_exps = existing_data.get('experiments_in_set')
4✔
757
                    post_json['experiments_in_set'] = list(set(add_to_post + existing_exps))
4✔
758
                else:
759
                    post_json['experiments_in_set'] = add_to_post
4✔
760
            # Combination for replicate sets
761
            if sheet == "ExperimentSetReplicate":
4✔
762
                if existing_data.get('replicate_exps'):
4✔
763
                    existing_sets = existing_data.get('replicate_exps')
4✔
764
                    new_exps = [i['replicate_exp'] for i in add_to_post]
4✔
765
                    existing_sets = [i for i in existing_sets if i['replicate_exp'] not in new_exps]
4✔
766
                    post_json['replicate_exps'] = add_to_post + existing_sets
4✔
767
                else:
768
                    post_json['replicate_exps'] = add_to_post
4✔
769
            # remove found item from the accumulate_dict
770
            accumulate_dict.pop(identifier)
4✔
771
    return post_json, accumulate_dict
4✔
772

773

774
def error_report(error_dic, sheet, all_aliases, connection, error_id=''):
4✔
775
    """From the validation error report, forms a readable statement."""
776
    # This dictionary is the common elements in the error dictionary I see so far
777
    # I want to catch anything that does not follow this to catch different cases
778
    error_header = {'@type': ['ValidationFailure', 'Error'], 'code': 422, 'status': 'error',
4✔
779
                    'title': 'Unprocessable Entity', 'description': 'Failed validation'}
780
    report = []
4✔
781
    if all(item in error_dic.items() for item in error_header.items()):
4✔
782
        # deal with Validation errors
783
        for err in error_dic.get('errors'):
4✔
784
            error_description = err.get('description')
4✔
785
            # this may no longer ever happen?
786
            if 'name' not in err or not err.get('name'):
4✔
787
                report.append("{sheet:<30}{des}"
4✔
788
                              .format(des=error_description, sheet="ERROR " + sheet.lower()))
789
            else:
790
                # deal with errors about linked objects not in db - checking for those with
791
                # aliases present in the workbook that should be ignored
792
                utrl_txt = 'Unable to resolve link:'
4✔
793
                nf_txt = 'not found'
4✔
794
                not_found = None
4✔
795
                alias_bit = None
4✔
796
                if utrl_txt in error_description:
4✔
797
                    alias_bit = error_description.replace(utrl_txt, '')
4✔
798
                elif error_description.endswith(nf_txt):
4✔
799
                    alias_bit = error_description.replace(nf_txt, '').replace("'", '')
4✔
800
                if alias_bit:
4✔
801
                    alias_bit = alias_bit.split('/')[-1]
4✔
802
                    not_found = alias_bit.strip()
4✔
803
                # ignore ones about existing aliases
804
                if not_found and not_found in all_aliases:
4✔
805
                    continue
4✔
806
                error_field = err['name']
4✔
807
                report.append("{sheet:<30}{eid} Field '{er}': {des}"
4✔
808
                              .format(er=error_field, des=error_description, eid=error_id, sheet="ERROR " + sheet.lower()))
809
    # if there is a an access forbidden error
810
    elif error_dic.get('title') == 'Forbidden':
4✔
811
        error_description = error_dic['description']
4✔
812
        try:
4✔
813
            report.append("{sheet:<30}{eid}: {des}"
4✔
814
                          .format(des=error_description, eid=error_id, sheet="ERROR " + sheet.lower()))
815
        except Exception:
×
816
            return error_dic
×
817
    # if there is a conflict
818
    elif error_dic.get('title') == "Conflict":
4✔
819
        try:
4✔
820
            report.extend(conflict_error_report(error_dic, sheet, connection))
4✔
821
        except Exception:
×
822
            return error_dic
×
823
    # if nothing works, give the full error, we should add that case to our reporting
824
    else:
825
        return error_dic
×
826
    if report:
4✔
827
        report_print = '\n'.join(report)
4✔
828
        return report_print
4✔
829
    else:
830
        # if report is empty, return False
831
        return
×
832

833

834
def conflict_error_report(error_dic, sheet, connection):
4✔
835
    # I am not sure of the complete case of HTTPConflicts
836
    # To make sure we get all cases reported, I put a try/except
837
    all_conflicts = []
4✔
838
    try:
4✔
839
        # list is reported as string, turned into list again
840
        conflict_str = error_dic.get('detail').replace("Keys conflict:", "").strip()
4✔
841
        conflict_list = ast.literal_eval(conflict_str)
4✔
842
        for conflict in conflict_list:
4✔
843
            error_field = conflict[0].split(":")[1]
4✔
844
            error_value = conflict[1]
4✔
845
            try:
4✔
846
                # let's see if the user has access to conflicting item
847
                search = "search/?type={sheet}&{field}={value}".format(sheet=sheet,
4✔
848
                                                                       field=error_field,
849
                                                                       value=error_value)
850
                existing_item = ff_utils.search_metadata(search, key=connection.key)
4✔
851
                at_id = existing_item.get('@id')
×
852
                add_text = "please use " + at_id
×
853
            except Exception:
4✔
854
                # if there is a conflicting item, but it is not viewable by the user,
855
                # we should release the item to the project/public
856
                add_text = "please contact DCIC"
4✔
857
            conflict_rep = ("{sheet:<30}Field '{er}': '{des}' already exists, {at}"
4✔
858
                            .format(er=error_field, des=error_value, sheet="ERROR " + sheet.lower(), at=add_text))
859
        all_conflicts.append(conflict_rep)
4✔
860
        return all_conflicts
4✔
861
    except Exception:
×
862
        return
×
863

864

865
def update_item(verb, file_to_upload, post_json, filename_to_post, extrafiles, connection, identifier):
4✔
866
    # if FTP, grab the file from ftp
867
    ftp_download = False
4✔
868
    if file_to_upload and filename_to_post.startswith("ftp://"):
4✔
869
        ftp_download = True
4✔
870
        file_to_upload, post_json, filename_to_post = ftp_copy(filename_to_post, post_json)
4✔
871
    # add the md5
872
    if file_to_upload and not post_json.get('md5sum'):
4✔
873
        print(f"calculating md5 sum for file {filename_to_post} ")
4✔
874
        post_json['md5sum'] = md5(filename_to_post)
4✔
875
    try:
4✔
876
        if verb == 'PATCH':
4✔
877
            e = ff_utils.patch_metadata(post_json, identifier, key=connection.key)
4✔
878
        elif verb == 'POST':
4✔
879
            e = ff_utils.post_metadata(post_json, identifier, key=connection.key)
4✔
880
        else:
881
            raise ValueError('Unrecognized verb - must be POST or PATCH')
4✔
882
    except Exception as problem:
4✔
883
        e = parse_exception(problem)
4✔
884
    if e.get('status') == 'error':
4✔
885
        return e
4✔
886
    if file_to_upload:
4✔
887
        # get s3 credentials
888
        if verb == 'PATCH':
4✔
889
            creds = get_upload_creds(e['@graph'][0]['accession'], connection)
4✔
890
            e['@graph'][0]['upload_credentials'] = creds
4✔
891
        # upload
892
        upload_file_item(e, filename_to_post)
4✔
893
        if ftp_download:
4✔
894
            pp.Path(filename_to_post).unlink()
4✔
895
    if extrafiles:
4✔
896
        extcreds = e['@graph'][0].get('extra_file_creds')
4✔
897
        if not extcreds:
4✔
898
            time.sleep(5)
4✔
899
            extcreds = get_upload_creds(e['@graph'][0]['accession'], connection, extfilecreds=True)
4✔
900
        for fformat, filepath in extrafiles.items():
4✔
901
            try:
4✔
902
                file_format = ff_utils.get_metadata(fformat, key=connection.key)
4✔
903
                ff_uuid = file_format.get('uuid')
4✔
904
            except Exception:
×
905
                raise "Can't find file_format item for %s" % fformat
×
906
            for ecred in extcreds:
4✔
907
                if ff_uuid == ecred.get('file_format'):
4✔
908
                    upload_creds = ecred.get('upload_credentials')
×
909
                    upload_extra_file(upload_creds, filepath)
×
910
    return e
4✔
911

912

913
def patch_item(file_to_upload, post_json, filename_to_post, extrafiles, connection, existing_data):
4✔
914
    return update_item('PATCH', file_to_upload, post_json, filename_to_post, extrafiles,
4✔
915
                       connection, existing_data.get('uuid'))
916

917

918
def post_item(file_to_upload, post_json, filename_to_post, extrafiles, connection, sheet):
4✔
919
    return update_item('POST', file_to_upload, post_json, filename_to_post, extrafiles, connection, sheet)
4✔
920

921

922
def ftp_copy(filename_to_post, post_json):
4✔
923
    """Downloads the file from the server, and reformats post_json."""
924
    if not post_json.get("md5sum"):
4✔
925
        # if the file is from the server, the md5 should be supplied by the user.
926
        print("\nWARNING: File not uploaded")
4✔
927
        print("Please add original md5 values of the files")
4✔
928
        return False, post_json, ""
4✔
929
    try:
×
930
        # download the file from the server
931
        # return new file location to upload from
932
        print("\nINFO: Attempting to download file from this url to your computer before upload %s" % filename_to_post)
×
933
        with closing(urllib2.urlopen(filename_to_post)) as r:
×
934
            new_file = post_json['filename']
×
935
            with open(new_file, 'wb') as f:
×
936
                shutil.copyfileobj(r, f)
×
937
        return True, post_json, new_file
×
938
    except Exception:
×
939
        # if download did not work, delete the filename from the post json
940
        print("WARNING: Download failed")
×
941
        post_json.pop('filename')
×
942
        return False, post_json, ""
×
943

944

945
def delete_fields(post_json, connection, existing_data):
4✔
946
    """Deletes fields with the value '*delete*'."""
947
    # find fields to be removed
948
    fields_to_be_removed = []
4✔
949
    for key, value in post_json.items():
4✔
950
        if value in ['*delete*', ['*delete*']]:
4✔
951
            fields_to_be_removed.append(key)
×
952
    # if there are no delete fields, move along sir
953
    if not fields_to_be_removed:
4✔
954
        return post_json
4✔
955
    # Use the url argument delete_fields for deletion
956
    del_add_on = 'delete_fields=' + ','.join(fields_to_be_removed)
×
957
    ff_utils.patch_metadata({}, existing_data["uuid"], key=connection.key, add_on=del_add_on)
×
958
    # Remove them also from the post_json
959
    for rm_key in fields_to_be_removed:
×
960
        del post_json[rm_key]
×
961
    return post_json
×
962

963

964
def remove_deleted(post_json):
4✔
965
    """Removes fields that have *delete* keyword,
966
       used for Post and Validation."""
967
    fields_to_be_removed = []
4✔
968
    for key, value in post_json.items():
4✔
969
        if value in ['*delete*', ['*delete*']]:
4✔
970
            fields_to_be_removed.append(key)
×
971
    for rm_key in fields_to_be_removed:
4✔
972
        del post_json[rm_key]
×
973
    return post_json
4✔
974

975

976
def _add_e_to_edict(alias, err, errors):
4✔
977
    if alias in errors:
4✔
978
        if err not in errors[alias]:
4✔
979
            errors[alias].append(err)
4✔
980
    else:
981
        errors[alias] = [err]
4✔
982
    return errors
4✔
983

984

985
def _pairing_consistency_check(files, errors):
4✔
986
    """checks the datastructure for consistency"""
987
    file_list = sorted([f for f in files if not files[f].get('symlink')])
4✔
988
    pair_list = []
4✔
989
    for f, info in files.items():
4✔
990
        # skip links for secondary aliases
991
        if info.get('symlink'):
4✔
992
            continue
4✔
993
        pair = info.get('pair')
4✔
994
        if not pair:
4✔
995
            err = 'no paired file but paired_end = ' + info.get('end')
4✔
996
            errors = _add_e_to_edict(f, err, errors)
4✔
997
        else:
998
            pair_list.append(pair)
4✔
999
    paircnts = Counter(pair_list)
4✔
1000
    # filelist without symlinks should have the same size as paircnts
1001
    if len(file_list) != len(paircnts):
4✔
1002
        err = str(len(file_list)) + " FILES paired with " + str(len(paircnts))
4✔
1003
        errors = _add_e_to_edict('MISMATCH', err, errors)
4✔
1004
    return errors
4✔
1005

1006

1007
def check_file_pairing(fastq_row):
4✔
1008
    """checks consistency between file pair info within sheet"""
1009
    fields = next(fastq_row)
4✔
1010
    fields.pop(0)
4✔
1011
    # make sure we have the aliases field
1012
    if 'aliases' not in fields:
4✔
1013
        return {'NO GO': 'Can only check file pairing by aliases'}
4✔
1014
    # find alias and paired_end column indexes
1015
    alias_idx = fields.index("aliases")
4✔
1016
    pair_idx = None
4✔
1017
    if 'paired_end' in fields:
4✔
1018
        pair_idx = fields.index("paired_end")
4✔
1019
    files = {}
4✔
1020
    errors = {}
4✔
1021
    for row in fastq_row:
4✔
1022
        if row[0].startswith("#"):
4✔
1023
            continue
4✔
1024
        row.pop(0)  # to make indexes same
4✔
1025
        alias = row[alias_idx]
4✔
1026
        if not alias:
4✔
1027
            err = "alias missing - can't check file pairing"
4✔
1028
            errors = _add_e_to_edict('unaliased', err, errors)
4✔
1029
            continue
4✔
1030
        # look for multiple aliases, treat first alias as the main one, and others as secondary
1031
        aliases = [x.strip() for x in alias.split(",")]
4✔
1032
        aliases = list(filter(None, aliases))
4✔
1033
        paired_end = row[pair_idx] if pair_idx else None
4✔
1034
        saw_pair = False
4✔
1035
        for i, fld in enumerate(row):
4✔
1036
            if isinstance(fld, str) and fld.strip() == 'paired with':
4✔
1037
                if saw_pair:
4✔
1038
                    err = 'single row with multiple paired_with values'
4✔
1039
                    errors = _add_e_to_edict(aliases[0], err, errors)
4✔
1040
                    continue
4✔
1041
                else:
1042
                    pfile = row[i + 1]
4✔
1043
                    saw_pair = True
4✔
1044
                    if not paired_end:
4✔
1045
                        err = 'missing paired_end number'
4✔
1046
                        errors = _add_e_to_edict(aliases[0], err, errors)
4✔
1047
                    main = True
4✔
1048
                    # if there are multiple aliases, create symlinks with secondary aliases in the files dictionary
1049
                    for an_alias in aliases:
4✔
1050
                        # if this is the first alias, put all info in the dict
1051
                        if main:
4✔
1052
                            files[an_alias] = {'end': paired_end, 'pair': pfile}
4✔
1053
                            main = False
4✔
1054
                        else:
1055
                            files[an_alias] = {'symlink': aliases[0]}
4✔
1056
        # If there are rows without the pair link (expecting link in the other file, FF mirrors the links after post)
1057
        if not saw_pair and paired_end:
4✔
1058
            main = True
4✔
1059
            for an_alias in aliases:
4✔
1060
                # if this is the first alias, put all info in the dict
1061
                if main:
4✔
1062
                    files[an_alias] = {'end': paired_end}
4✔
1063
                    main = False
4✔
1064
                else:
1065
                    files[an_alias] = {'symlink': aliases[0]}
4✔
1066
    for f, info in sorted(files.items()):  # sorted purely for testing
4✔
1067
        # skip the aliases that are secondary
1068
        if info.get('symlink'):
4✔
1069
            continue
4✔
1070
        if info.get('pair'):
4✔
1071
            fp = info.get('pair')
4✔
1072
            if fp not in files:
4✔
1073
                err = "paired with not found %s" % fp
4✔
1074
                errors = _add_e_to_edict(f, err, errors)
4✔
1075
            else:
1076
                # if the linked one is an symlink, go the the main one
1077
                if files[fp].get('symlink'):
4✔
1078
                    fp = files[fp]['symlink']
4✔
1079
                    files[f]['pair'] = fp
4✔
1080
                # Paired file might not have the mirroring pair info, FF creates that automatically
1081
                if not files[fp].get('pair'):
4✔
1082
                    files[fp]['pair'] = f
4✔
1083
                # if there is pairing info, check that if linking is mutual
1084
                else:
1085
                    mirrored_pair = files[fp]['pair']
4✔
1086
                    # convert the symlink to the main id
1087
                    if files[mirrored_pair].get('symlink'):
4✔
1088
                        mirrored_pair = files[mirrored_pair]['symlink']
4✔
1089
                        # correct the record in files
1090
                        files[fp]['pair'] = mirrored_pair
4✔
1091
                    if mirrored_pair != f:
4✔
1092
                        err = 'attempting to alter existing pair %s\t%s' % (fp, files[fp]['pair'])
4✔
1093
                        errors = _add_e_to_edict(f, err, errors)
4✔
1094
    return _pairing_consistency_check(files, errors)
4✔
1095

1096

1097
def workbook_reader(workbook, sheet, update, connection, patchall, aliases_by_type,
4✔
1098
                    dict_patch_loadxl, dict_replicates, dict_exp_sets, novalidate, attach_fields):
1099
    """takes an openpyxl workbook object and posts, patches or does a dry run on the data depending
1100
    on the options passed in.
1101
    """
1102
    # determine right from the top if dry run
1103
    dryrun = not (update or patchall)
4✔
1104
    all_aliases = [k for k in aliases_by_type]
4✔
1105
    # dict for acumulating cycle patch data
1106
    patch_loadxl = []
4✔
1107
    row = reader(workbook, sheetname=sheet)
4✔
1108
    skip_dryrun = False
4✔
1109
    if sheet == "ExperimentMic_Path":
4✔
1110
        skip_dryrun = True
×
1111
        sheet = "ExperimentMic"
×
1112
    keys = next(row)  # grab the first row of headers
4✔
1113
    types = next(row)  # grab second row with type info
4✔
1114
    # remove title column
1115
    keys.pop(0)
4✔
1116
    types.pop(0)
4✔
1117
    fields2types = dict(zip(keys, types))
4✔
1118
    # set counters to 0
1119
    total = 0
4✔
1120
    error = 0
4✔
1121
    post = 0
4✔
1122
    patch = 0
4✔
1123
    not_patched = 0
4✔
1124
    not_posted = 0
4✔
1125
    pre_validate_errors = []
4✔
1126
    invalid = False
4✔
1127

1128
    if sheet == "FileFastq" and not novalidate:
4✔
1129
        # check for consistent file pairing of fastqs in the sheet
1130
        pair_errs = check_file_pairing(reader(workbook, sheetname=sheet))
×
1131
        for f, err in sorted(pair_errs.items()):
×
1132
            for e in err:
×
1133
                print('WARNING: ', f, '\t', e)
×
1134

1135
    # iterate over the rows
1136
    for values in row:
4✔
1137
        # Rows that start with # are skipped
1138
        if values[0].startswith("#"):
4✔
1139
            continue
4✔
1140
        # Get rid of the first empty cell
1141
        values.pop(0)
4✔
1142
        total += 1
4✔
1143
        clean_values = []
4✔
1144
        for item in values:
4✔
1145
            try:
4✔
1146
                # strip trailing commas and spaces if a str
1147
                clean_values.append(item.strip(', '))
4✔
1148
            except AttributeError:
×
1149
                clean_values.append(item)
×
1150
        # build post_json and get existing if available
1151
        post_json = OrderedDict(zip(keys, clean_values))
4✔
1152
        # Get existing data if available
1153
        # existing_data = get_existing(post_json, connection)
1154

1155
        # pre-validate the row by fields and data_types
1156
        if not novalidate:
4✔
1157
            row_errors = pre_validate_json(post_json, fields2types, aliases_by_type, connection)
×
1158
            if row_errors:
×
1159
                error += 1
×
1160
                pre_validate_errors.extend(row_errors)
×
1161
                invalid = True
×
1162
                continue
×
1163

1164
        # if we get this far continue to build the json
1165
        post_json = build_patch_json(post_json, fields2types)
4✔
1166

1167
        # # validate the row by fields and data_types
1168
        # if not novalidate:
1169
        #     row_errors = pre_validate_json(post_json, fields2types, aliases_by_type, connection)
1170
        #     if row_errors:
1171
        #         error += 1
1172
        #         pre_validate_errors.extend(row_errors)
1173
        #         invalid = True
1174
        #         continue
1175
        filename_to_post = post_json.get('filename')
4✔
1176
        post_json, existing_data, file_to_upload, extrafiles = populate_post_json(
4✔
1177
            post_json, connection, sheet, attach_fields)
1178
        # Filter loadxl fields
1179
        post_json, patch_loadxl_item = filter_loadxl_fields(post_json, sheet)
4✔
1180
        # Filter experiment set related fields from experiment
1181
        if sheet.startswith('Experiment') and not sheet.startswith('ExperimentSet'):
4✔
1182
            post_json, rep_set_info, exp_set_info = filter_set_from_exps(post_json)
×
1183
        # Combine set items with stored dictionaries
1184
        # Adds things to the existing items, will be a problem at some point
1185
        # We need a way to delete some from the parent object
1186
        if sheet == 'ExperimentSet':
4✔
1187
            post_json, dict_exp_sets = combine_set(post_json, existing_data, sheet, dict_exp_sets)
4✔
1188
        if sheet == 'ExperimentSetReplicate':
4✔
1189
            post_json, dict_replicates = combine_set(post_json, existing_data, sheet, dict_replicates)
4✔
1190

1191
        # Run update or patchall
1192
        e = {}
4✔
1193
        # if there is an existing item, try patching
1194
        if existing_data.get("uuid"):
4✔
1195
            if patchall:
4✔
1196
                # First check for fields to be deleted, and do put
1197
                post_json = delete_fields(post_json, connection, existing_data)
4✔
1198
                # Do the patch
1199
                e = patch_item(file_to_upload, post_json, filename_to_post, extrafiles, connection, existing_data)
4✔
1200
            else:
1201
                not_patched += 1
4✔
1202
        # if there is no existing item try posting
1203
        else:
1204
            if update:
4✔
1205
                # If there are some fields with delete keyword,just ignore them
1206
                post_json = remove_deleted(post_json)
4✔
1207
                # Do the post
1208
                e = post_item(file_to_upload, post_json, filename_to_post, extrafiles, connection, sheet)
4✔
1209
            else:
1210
                not_posted += 1
4✔
1211

1212
        # add to success/error counters
1213
        if e.get("status") == "error":  # pragma: no cover
1214
            # display the used alias with the error
1215
            e_id = ""
1216
            if post_json.get('aliases'):
1217
                e_id = post_json['aliases'][0]
1218
            error_rep = error_report(e, sheet, all_aliases, connection, e_id)
1219
            error += 1
1220
            if error_rep:
1221
                # TODO: move this report formatting to error_report
1222
                if e.get('detail') and e.get('detail').startswith("Keys conflict: [('alias', 'md5:"):
1223
                    print("Upload failure - md5 of file matches another item in database.")
1224
                print(error_rep)
1225
            # if error is a weird one
1226
            else:
1227
                print(e)
1228
        elif e.get("status") == "success":
4✔
1229
            if existing_data.get("uuid"):
4✔
1230
                patch += 1
4✔
1231
            else:
1232
                post += 1
4✔
1233

1234
        # dryrun option
1235
        if dryrun:
4✔
1236
            if skip_dryrun:
4✔
1237
                continue
×
1238
            # simulate patch/post
1239
            if existing_data.get("uuid"):
4✔
1240
                post_json = remove_deleted(post_json)
4✔
1241
                try:
4✔
1242
                    e = ff_utils.patch_metadata(post_json, existing_data["uuid"], key=connection.key,
4✔
1243
                                                add_on="check_only=True")
1244
                except Exception as problem:
×
1245
                    e = parse_exception(problem)
×
1246
            else:
1247
                post_json = remove_deleted(post_json)
4✔
1248
                try:
4✔
1249
                    e = ff_utils.post_metadata(post_json, sheet, key=connection.key, add_on="check_only=True")
4✔
1250
                except Exception as problem:
×
1251
                    e = parse_exception(problem)
×
1252
            # check simulation status
1253
            if e['status'] == 'success':
4✔
1254
                pass
4✔
1255
            else:
1256
                # display the used alias with the error
1257
                e_id = ""
×
1258
                if post_json.get('aliases'):
×
1259
                    e_id = post_json['aliases'][0]
×
1260
                error_rep = error_report(e, sheet, all_aliases, connection, e_id)
×
1261
                if error_rep:
×
1262
                    error += 1
×
1263
                    print(error_rep)
×
1264
            continue
3✔
1265

1266
        # check status and if success fill transient storage dictionaries
1267
        if e.get("status") == "success":
4✔
1268
            # uuid of the posted/patched item
1269
            item_uuid = e['@graph'][0]['uuid']
4✔
1270
            item_id = e['@graph'][0]['@id']
4✔
1271
            # if post/patch successful, append uuid to patch_loadxl_item if full
1272
            if patch_loadxl_item != {}:
4✔
1273
                patch_loadxl_item['uuid'] = item_uuid
×
1274
                patch_loadxl.append(patch_loadxl_item)
×
1275
            # if post/patch successful, add the replicate/set information to the accumulate lists
1276
            if sheet.startswith('Experiment') and not sheet.startswith('ExperimentSet'):
4✔
1277
                # Part-I Replicates
1278
                if rep_set_info:
×
1279
                    rep_id = rep_set_info[0]
×
1280
                    saveitem = {'replicate_exp': item_id, 'bio_rep_no': rep_set_info[1], 'tec_rep_no': rep_set_info[2]}
×
1281
                    if dict_replicates.get(rep_id):
×
1282
                        dict_replicates[rep_id].append(saveitem)
×
1283
                    else:
1284
                        dict_replicates[rep_id] = [saveitem, ]
×
1285
                    # Part-II Experiment Sets
1286
                if exp_set_info:
×
1287
                    for exp_set in exp_set_info:
×
1288
                        if dict_exp_sets.get(exp_set):
×
1289
                            dict_exp_sets[exp_set].append(item_id)
×
1290
                        else:
1291
                            dict_exp_sets[exp_set] = [item_id, ]
×
1292

1293
    # add all object loadxl patches to dictionary
1294
    if patch_loadxl and not invalid:
4✔
1295
        dict_patch_loadxl[sheet] = patch_loadxl
×
1296

1297
    if pre_validate_errors:
4✔
1298
        for le in pre_validate_errors:
×
1299
            print(le)
×
1300
    # dryrun report
1301
    if dryrun:
4✔
1302
        if skip_dryrun:
4✔
1303
            print("{sheet:<27}: PATH connections are not tested in DRYRUN - Skipping"
×
1304
                  .format(sheet=sheet.upper()+"("+str(total)+")"))
1305
        else:
1306
            print("{sheet:<27}: {post:>2} posted /{not_posted:>2} not posted  \
4✔
1307
        {patch:>2} patched /{not_patched:>2} not patched,{error:>2} errors"
1308
                  .format(sheet=sheet.upper()+"("+str(total)+")", post=post, not_posted=not_posted,
1309
                          error=error, patch=patch, not_patched=not_patched))
1310
    # submission report
1311
    else:
1312
        # print final report, and if there are not patched entries, add to report
1313
        print("{sheet:<27}: {post:>2} posted /{not_posted:>2} not posted  \
4✔
1314
    {patch:>2} patched /{not_patched:>2} not patched,{error:>2} errors"
1315
              .format(sheet=sheet.upper()+"("+str(total)+")", post=post, not_posted=not_posted,
1316
                      error=error, patch=patch, not_patched=not_patched))
1317

1318

1319
def format_file(param, files, connection):
4✔
1320

1321
    template = {"bucket_name": "",
×
1322
                "workflow_argument_name": param.split('--')[-1]}
1323
    # find bucket
1324
    health_page = ff_utils.get_metadata('health', key=connection.key)
×
1325
    bucket_main = health_page.get('file_upload_bucket')
×
1326
    resp = {}
×
1327
    # if it is a list of files, uuid and object key are list objects
1328
    if isinstance(files, list):
×
1329
        object_key = []
×
1330
        uuid = []
×
1331
        for a_file in files:
×
1332
            resp = ff_utils.get_metadata(a_file, key=connection.key, add_on="frame=object")
×
1333
            object_key.append(resp['display_title'])
×
1334
            uuid.append(resp['uuid'])
×
1335
        template['object_key'] = object_key
×
1336
        template['uuid'] = uuid
×
1337
    # if it is not a list of files
1338
    else:
1339
        resp = ff_utils.get_metadata(files, key=connection.key, add_on="frame=object")
×
1340
        template['object_key'] = resp['display_title']
×
1341
        template['uuid'] = resp['uuid']
×
1342
    # find the bucket from the last used response
1343
    if 'FileProcessed' in resp.get('@type'):
×
1344
        template['bucket_name'] = bucket_main.replace('-files', '-wfoutput')
×
1345
    else:
1346
        template['bucket_name'] = bucket_main
×
1347
    return template
×
1348

1349

1350
def build_tibanna_json(keys, types, values, connection):
4✔
1351
    post_json = OrderedDict(zip(keys, values))
4✔
1352
    fields2types = dict(zip(keys, types))
4✔
1353
    post_json = build_patch_json(post_json, fields2types)
4✔
1354
    # if not assigned in the excel for some service reason, add lab award and submitter
1355
    if not post_json.get('lab'):
4✔
1356
        post_json['lab'] = connection.lab
4✔
1357
    if not post_json.get('award'):
4✔
1358
        post_json['award'] = connection.award
4✔
1359
    if not post_json.get('submitted_by'):
4✔
1360
        post_json['submitted_by'] = connection.user
4✔
1361
    template = {
4✔
1362
        "config": {},
1363
        "args": {},
1364
        "parameters": {},
1365
        "wfr_meta": {},
1366
        "input_files": [],
1367
        "metadata_only": True,
1368
        "output_files": []
1369
    }
1370
    # sorting only needed for the mock lists in tests to work - not cool
1371
    for param in sorted(post_json.keys()):
4✔
1372
        # insert wf uuid and app_name
1373
        if param == 'workflow_uuid':
4✔
1374
            template['workflow_uuid'] = post_json['workflow_uuid']
4✔
1375
            workflow_resp = ff_utils.get_metadata(post_json['workflow_uuid'], key=connection.key, add_on="frame=object")
4✔
1376
            template['app_name'] = workflow_resp.get('app_name')
4✔
1377
        elif param.startswith('input--'):
4✔
1378
            template["input_files"].append(format_file(param, post_json[param], connection))
4✔
1379
        elif param.startswith('output--'):
4✔
1380
            template["output_files"].append(format_file(param, post_json[param], connection))
4✔
1381
        else:
1382
            template["wfr_meta"][param] = post_json[param]
4✔
1383
    return template
4✔
1384

1385

1386
def user_workflow_reader(workbook, sheet, connection):
4✔
1387
    """takes the user workflow runsheet and ony post it to fourfront endpoint."""
1388
    row = reader(workbook, sheetname=sheet)
4✔
1389
    keys = next(row)  # grab the first row of headers
4✔
1390
    types = next(row)  # grab second row with type info
4✔
1391
    # remove title column
1392
    keys.pop(0)
4✔
1393
    types.pop(0)
4✔
1394
    # set counters to 0
1395
    total = 0
4✔
1396
    error = 0
4✔
1397
    post = 0
4✔
1398
    not_posted = 0
4✔
1399
    # iterate over the rows
1400
    for values in row:
4✔
1401
        # Rows that start with # are skipped
1402
        if values[0].startswith("#"):
4✔
1403
            continue
4✔
1404
        # Get rid of the first empty cell
1405
        values.pop(0)
4✔
1406
        total += 1
4✔
1407
        # build post_json and get existing if available
1408
        post_json = build_tibanna_json(keys, types, values, connection)
4✔
1409
        existing_data = get_existing(post_json['wfr_meta'], connection)
4✔
1410
        if existing_data:
4✔
1411
            print('this workflow_run is already posted {}'.format(post_json['wfr_meta']['aliases'][0]))
×
1412
            error += 1
×
1413
            continue
×
1414
        if post_json:
4✔
1415
            # do the magic
1416
            try:
4✔
1417
                e = ff_utils.post_metadata(post_json, '/WorkflowRun/pseudo-run', key=connection.key)
4✔
1418
            except Exception as problem:
×
1419
                e = parse_exception(problem)
×
1420
            if e.get("status") == "SUCCEEDED":
4✔
1421
                post += 1
4✔
1422
            else:
1423
                print('can not post the workflow run {}'.format(post_json['wfr_meta']['aliases'][0]))
×
1424
                print(e)  # to give a little more info even if not that informative
×
1425
                error += 1
×
1426
        else:
1427
            error += 1
×
1428
    # print final report
1429
    print("{sheet:<27}: {post:>2} posted /{not_posted:>2} not posted  \
4✔
1430
    {patch:>2} patched /{not_patched:>2} not patched,{error:>2} errors"
1431
          .format(sheet=sheet.upper()+"("+str(total)+")", post=post, not_posted=not_posted,
1432
                  error=error, patch="-", not_patched="-"))
1433

1434

1435
def get_upload_creds(file_id, connection, extfilecreds=False):  # pragma: no cover
1436
    creds2return = 'upload_credentials'
1437
    url = f"{file_id}/upload/"
1438
    if extfilecreds:
1439
        creds2return = 'extra_files_creds'
1440
        req = ff_utils.authorized_request(f"{connection.key.get('server')}/{url}", auth=ff_utils.get_authentication_with_server(connection.key)).json()
1441
    else:
1442
        req = ff_utils.post_metadata({}, url, key=connection.key)
1443
    return req['@graph'][0][creds2return]
1444

1445

1446
def upload_file_item(metadata_post_response, path):
4✔
1447
    try:
×
1448
        item = metadata_post_response['@graph'][0]
×
1449
        creds = item['upload_credentials']
×
1450
    except Exception as e:
×
1451
        print(e)
×
1452
        return
×
1453
    upload_file(creds, path)
×
1454

1455

1456
def upload_extra_file(ecreds, path):
4✔
1457
    upload_file(ecreds, path)
×
1458

1459

1460
def upload_file(creds, path):  # pragma: no cover
1461

1462
    ####################
1463
    # POST file to S3
1464
    env = os.environ.copy()  # pragma: no cover
1465
    try:
1466
        env.update({
1467
            'AWS_ACCESS_KEY_ID': creds['AccessKeyId'],
1468
            'AWS_SECRET_ACCESS_KEY': creds['SecretAccessKey'],
1469
            'AWS_SECURITY_TOKEN': creds['SessionToken'],
1470
        })
1471
    except Exception as e:
1472
        raise Exception(f"Didn't get back s3 access keys from file/upload endpoint.  Error was {e}")
1473
    # ~10s/GB from Stanford - AWS Oregon
1474
    # ~12-15s/GB from AWS Ireland - AWS Oregon
1475
    print("Uploading file.")
1476
    start = time.time()
1477
    path_object = pp.Path(path).expanduser()
1478
    try:
1479
        source = path_object
1480
        target = creds['upload_url']
1481
        print("Going to upload {} to {}.".format(source, target))
1482
        command = ['aws', 's3', 'cp']
1483
        command = command + ['--only-show-errors', source, target]
1484
        options = {}
1485
        if running_on_windows_native():
1486
            options = {"shell": True}
1487
        subprocess.check_call(command, env=env, **options)
1488
    except subprocess.CalledProcessError as e:
1489
        raise RuntimeError("Upload failed with exit code %d" % e.returncode)
1490
    else:
1491
        end = time.time()
1492
        duration = end - start
1493
        print("Uploaded in %.2f seconds" % duration)
1494

1495

1496
def running_on_windows_native():
4✔
1497
    return os.name == 'nt'
×
1498

1499

1500
# the order to try to upload / update the items
1501
# used to avoid dependencies... i.e. biosample needs the biosource to exist
1502
def order_sorter(list_of_names):
4✔
1503
    ret_list = []
4✔
1504
    for i in sheet_order:
4✔
1505
        if i in list_of_names:
4✔
1506
            ret_list.append(i)
4✔
1507
    # we add the list of user supplied workflows at the end
1508
    # expected list if multiple; ['user_workflow_1', 'user_workflow_2']
1509
    user_workflows = sorted([sh for sh in list_of_names if sh.startswith('user_workflow')])
4✔
1510
    ret_list.extend(user_workflows)
4✔
1511
    missing = set(list_of_names) - set(ret_list)
4✔
1512
    if missing:
4✔
1513
        missing_items = ", ".join(missing)
4✔
1514
        print("WARNING!", missing_items, "sheet(s) are not loaded")
4✔
1515
        print("WARNING! Check the sheet names and the reference list \"sheet_order\"")
4✔
1516
    return ret_list
4✔
1517

1518

1519
def loadxl_cycle(patch_list, connection, alias_dict):
4✔
1520
    for n in patch_list.keys():
4✔
1521
        total = 0
4✔
1522
        for entry in patch_list[n]:
4✔
1523
            entry = delete_fields(entry, connection, entry)
4✔
1524
            if entry != {}:
4✔
1525
                total = total + 1
4✔
1526
                try:
4✔
1527
                    e = ff_utils.patch_metadata(entry, entry["uuid"], key=connection.key)
4✔
1528
                except Exception as problem:
×
1529
                    e = parse_exception(problem)
×
1530
                if e.get("status") == "error":  # pragma: no cover
1531
                    error_rep = error_report(e, n.upper(), [k for k in alias_dict], connection)
1532
                    if error_rep:
1533
                        print(error_rep)
1534
                    else:
1535
                        # if error is a weird one
1536
                        print(e)
1537
        print("{sheet}(phase2): {total} items patched.".format(sheet=n.upper(), total=total))
4✔
1538

1539

1540
def _verify_and_return_item(item, connection):
4✔
1541
    try:
4✔
1542
        res = ff_utils.get_metadata(item, key=connection.key, add_on='frame=object')
4✔
1543
        assert '@id' in res
4✔
1544
    except (AssertionError, TypeError):
4✔
1545
        return None
4✔
1546
    return res
4✔
1547

1548

1549
def cabin_cross_check(connection, patchall, update, infile, remote, lab=None, award=None):
4✔
1550
    """Set of check for connection, file, dryrun, and prompt."""
1551
    print("Running on:       {server}".format(server=connection.key['server']))
4✔
1552
    # check input file (xlsx)
1553
    if not pp.Path(infile).is_file():
4✔
1554
        print(f"File {infile} not found!")
×
1555
        sys.exit(1)
×
1556

1557
    # check for multi labs and awards and reset connection appropriately
1558
    # if lab and/or award options used modify connection accordingly and check for conflict later
1559
    if lab or award:
4✔
1560
        if lab is not None:
4✔
1561
            connection.lab = lab
4✔
1562
            if not award:
4✔
1563
                connection.set_award(lab, remote)
4✔
1564
        if award is not None:
4✔
1565
            connection.award = award
4✔
1566
    if not remote:
4✔
1567
        connection.prompt_for_lab_award(lab, award)
4✔
1568
    else:
1569
        if not lab:  # did not provide lab option
4✔
1570
            if len(connection.labs) > 1:  # lab may be provided as an option or is None
4✔
1571
                connection.lab = None
4✔
1572
        if award is None:  # has not been passed in as option
4✔
1573
            # lab may be None and then so will award
1574
            # or lab may have 1 award so use it
1575
            # or lab may have multiple awards so award set to None
1576
            connection.set_award(connection.lab, True)
4✔
1577

1578
    # check to be sure that lab and award exist and if both that the award is linked to lab
1579
    submit_lab = connection.lab
4✔
1580
    submit_award = connection.award
4✔
1581
    lab_json = _verify_and_return_item(submit_lab, connection)
4✔
1582
    if not lab_json:
4✔
1583
        print("Submitting Lab NOT FOUND: {}".format(submit_lab))
4✔
1584
        connection.lab = None
4✔
1585
    award_json = _verify_and_return_item(submit_award, connection)
4✔
1586
    if not award_json:
4✔
1587
        print("Submitting award NOT FOUND: {}".format(submit_award))
4✔
1588
        connection.award = None
4✔
1589
    else:  # make sure award is linked to lab
1590
        if lab_json is not None:
4✔
1591
            labawards = lab_json.get('awards', [])
4✔
1592
            if award_json.get('@id') not in labawards:
4✔
1593
                print("Award {} not associated with lab {} - exiting!".format(submit_award, submit_lab))
4✔
1594
                sys.exit(1)
4✔
1595

1596
    print("Submitting User:  {}".format(connection.email))
4✔
1597
    missing = []
4✔
1598
    if connection.lab is None:
4✔
1599
        missing.append('Lab')
4✔
1600
    if connection.award is None:
4✔
1601
        missing.append('Award')
4✔
1602
    if missing:
4✔
1603
        whatis = ' and '.join(missing)
4✔
1604
        print("WARNING: Submitting {} Unspecified".format(whatis))
4✔
1605
        print("{} info must be included for all items or submission will fail".format(whatis))
4✔
1606

1607
    print("Submitting Lab:   {}".format(connection.lab))
4✔
1608
    print("Submitting Award: {}".format(connection.award))
4✔
1609

1610
    # if dry-run, message explaining the test, and skipping user input
1611
    if not patchall and not update:
4✔
1612
        print("\n##############   DRY-RUN MODE   ################")
4✔
1613
        print("Since there are no '--update' and/or '--patchall' arguments, you are running the DRY-RUN validation")
4✔
1614
        print("The validation will only check for schema rules, but not for object relations")
4✔
1615
        print("##############   DRY-RUN MODE   ################\n")
4✔
1616
    else:
1617
        if not remote:
×
1618
            response = input("Do you want to continue with these credentials? (Y/N): ") or "N"
×
1619
            if response.lower() not in ["y", "yes"]:
×
1620
                sys.exit(1)
×
1621

1622

1623
def get_profiles(connection):
4✔
1624
    return ff_utils.get_metadata("/profiles/", key=connection.key, add_on="frame=object")
4✔
1625

1626

1627
def get_attachment_fields(profiles):
4✔
1628
    attach_field = []
4✔
1629
    for _, profile in profiles.items():
4✔
1630
        if profile.get('properties'):
4✔
1631
            attach_field.extend([f for f, val in profile.get('properties').items() if (
4✔
1632
                val.get('type') == 'object' and val.get('attachment') and f not in attach_field)])
1633
    return attach_field
4✔
1634

1635

1636
def get_collections(profiles):
4✔
1637
    """Get a list of all the data_types in the system."""
1638
    supported_collections = list(profiles.keys())
4✔
1639
    supported_collections = [s.lower() for s in list(profiles.keys())]
4✔
1640
    return supported_collections
4✔
1641

1642

1643
def get_all_aliases(workbook, sheets):
4✔
1644
    """Extracts all aliases existing in the workbook to later check object connections
1645
       Checks for same aliases that are used for different items and gives warning."""
1646
    aliases_by_type = {}
4✔
1647
    for sheet in sheets:
4✔
1648
        if sheet == 'ExperimentMic_Path':
4✔
1649
            continue
×
1650
        alias_col = ""
4✔
1651
        rows = reader(workbook, sheetname=sheet)
4✔
1652
        keys = next(rows)  # grab the first row of headers
4✔
1653
        try:
4✔
1654
            alias_col = keys.index("aliases")
4✔
1655
        except Exception:
×
1656
            continue
×
1657
        for row in rows:
4✔
1658
            my_aliases = []
4✔
1659
            if row[0].startswith('#'):
4✔
1660
                continue
4✔
1661
            my_alias = row[alias_col]
4✔
1662
            my_aliases = [x.strip() for x in my_alias.split(",")]
4✔
1663
            my_aliases = list(filter(None, my_aliases))
4✔
1664
            if my_aliases:
4✔
1665
                for a in my_aliases:
4✔
1666
                    if aliases_by_type.get(a):
4✔
1667
                        print("WARNING! NON-UNIQUE ALIAS: ", a)
×
1668
                        print("\tused for TYPE ", aliases_by_type[a], "and ", sheet)
×
1669
                    else:
1670
                        aliases_by_type[a] = sheet
4✔
1671
    return aliases_by_type
4✔
1672

1673

1674
def main():  # pragma: no cover
1675
    args = getArgs()
1676
    key = FDN_Key(args.keyfile, args.key)
1677
    # check if key has error
1678
    if key.error:
1679
        sys.exit(1)
1680
    # establish connection and run checks
1681
    connection = FDN_Connection(key)
1682
    cabin_cross_check(connection, args.patchall, args.update, args.infile,
1683
                      args.remote, args.lab, args.award)
1684
    # support for xlsx only - adjust if allowing different
1685
    workbook, sheetnames = digest_xlsx(args.infile)
1686

1687
    # This is not in our documentation, but if single sheet is used, file name can be the collection
1688
    if args.type and 'all' not in args.type:
1689
        names = args.type
1690
    else:
1691
        names = sheetnames
1692
    # get me a list of all the data_types in the system
1693
    profiles = get_profiles(connection)
1694
    supported_collections = get_collections(profiles)
1695
    attachment_fields = get_attachment_fields(profiles)
1696
    # we want to read through names in proper upload order
1697
    sorted_names = order_sorter(names)
1698
    # get all aliases from all sheets for dryrun object connections tests
1699
    aliases_by_type = get_all_aliases(workbook, sorted_names)
1700
    # all_aliases = list(aliases_by_type.keys())
1701
    # dictionaries that accumulate information during submission
1702
    dict_loadxl = {}
1703
    dict_replicates = {}
1704
    dict_exp_sets = {}
1705
    # Todo combine accumulate dicts to one
1706
    # accumulate = {dict_loadxl: {}, dict_replicates: {}, dict_exp_sets: {}}
1707
    for n in sorted_names:
1708
        if n.lower() in supported_collections:
1709
            workbook_reader(workbook, n, args.update, connection, args.patchall, aliases_by_type,
1710
                            dict_loadxl, dict_replicates, dict_exp_sets, args.novalidate, attachment_fields)
1711
        elif n.lower() == "experimentmic_path":
1712
            workbook_reader(workbook, "ExperimentMic_Path", args.update, connection, args.patchall, aliases_by_type,
1713
                            dict_loadxl, dict_replicates, dict_exp_sets, args.novalidate, attachment_fields)
1714
        elif n.lower().startswith('user_workflow'):
1715
            if args.update:
1716
                user_workflow_reader(workbook, n, connection)
1717
            else:
1718
                print('user workflow sheets will only be processed with the --update argument')
1719
        else:
1720
            print("Sheet name '{name}' not part of supported object types!".format(name=n))
1721
    loadxl_cycle(dict_loadxl, connection, aliases_by_type)
1722
    # if any item left in the following dictionaries
1723
    # it means that this items are not posted/patched
1724
    # because they are not on the exp_set file_set sheets
1725
    for dict_store, dict_sheet in [[dict_replicates, "ExperimentSetReplicate"],
1726
                                   [dict_exp_sets, "ExperimentSet"]]:
1727
        if dict_store:
1728
            remains = ', '.join(dict_store.keys())
1729
            print('Following items are not posted')
1730
            print('make sure they are on {} sheet'.format(dict_sheet))
1731
            print(remains)
1732

1733

1734
if __name__ == '__main__':
4✔
1735
    main()
×
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc