
4dn-dcic / dcicwrangling / build 16969763498

14 Aug 2025 03:36PM UTC · coverage: 33.8% (remained the same)

Pull Request #121: bug fixes for GEO notebook (github / web-flow)
Merge 98bf8b7db into 7aee3bab8

1256 of 3716 relevant lines covered (33.8%) · 1.35 hits per line

Source File: /scripts/parse_damid_pf.py (0.0% of lines covered)

#!/usr/bin/env python3
import sys
import argparse
import re
from dcicutils.ff_utils import patch_metadata, get_metadata
from dcicwrangling.functions import script_utils as scu
from dcicwrangling.functions.notebook_functions import digest_xlsx, reader
'''
Parse a DamID processed-file worksheet to generate the various bins of
"other processed files", using information from the linked_dataset column.
The linked_dataset is expected to be an identifier for the experiment or
replicate set to which the file should be linked.

Because the bin used for 'official' processed files (as opposed to
supplementary files) can change depending on the target or experiment type
(DamID vs. pA-DamID), PF_BIN specifies which bin to use.

NOTE: there are variations that can deal with incomplete information, i.e.
not specifying the experiments but just using replicate set ids and then
info in aliases to match up replicates - see Andy if needed.
'''

PF_BIN = '5kb bin'  # the bin for which a subset of files should be considered processed_files rather than opfs
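# Illustration (hypothetical descriptions): files whose description starts
# with PF_BIN and that pass the checks in is_processed_bin() below are
# promoted to processed_files; everything else is grouped into
# other_processed_files keyed by its bin, e.g.
#   '5kb bin normalized counts' (bw)   -> processed_files
#   '10kb bin normalized counts' (bw)  -> other_processed_files['10kb bin']
#   'mapped reads (bam)'               -> processed_files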


def extract_rows(infile):
    ''' Read the FileProcessed worksheet into a list of dicts keyed by header field. '''
    book, sheets = digest_xlsx(infile)
    data = []
    row = reader(book, sheetname='FileProcessed')
    fields = next(row)
    fields = [f.replace('*', '') for f in fields]
    types = next(row)  # the types row is consumed only to skip past it
    fields.pop(0)  # drop the leading comment column
    types.pop(0)
    for values in row:
        if values[0].startswith('#'):  # skip commented-out rows
            continue
        values.pop(0)
        meta = dict(zip(fields, values))
        data.append(meta)
    return data
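# Sketch of the expected shape of extract_rows() output, assuming a
# hypothetical 'FileProcessed' sheet (aliases and ids are illustrative):
#   [{'aliases': 'lab:damid_f1', 'description': '5kb bin normalized counts',
#     'file_type': 'normalized counts', 'file_format': 'bw',
#     '#linked datasets': 'lab:eset_1'},
#    ...]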


def is_processed_bin(desc, meta):
    ''' Put bam files, and selected files in the bin specified by PF_BIN, into
        the processed file bin; this is a specific check for those attributes.
    '''
    if 'mapped reads' in desc:
        return True
    if desc.startswith(PF_BIN):
        if (meta.get('file_type') == 'normalized counts' and meta.get('file_format') == 'bw') or (
                meta.get('file_type') == 'LADs' and meta.get('file_format') == 'bed'):
            return True
    return False
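# For example (hypothetical descriptions and metadata):
#   is_processed_bin('mapped reads (bam)', {})  -> True
#   is_processed_bin('5kb bin normalized counts',
#                    {'file_type': 'normalized counts', 'file_format': 'bw'})  -> True
#   is_processed_bin('10kb bin normalized counts',
#                    {'file_type': 'normalized counts', 'file_format': 'bw'})  -> False (not PF_BIN)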


def create_patch(item, label, rep=None):
    ''' Build the patch of processed_files and other_processed_files for an item. '''
    patch = {}
    if rep:
        label = label + ' ' + rep
    item_pfs = item.get('processed_files')
    item_opfs = item.get('other_processed_files')
    if not (item_pfs or item_opfs):
        print("NO FILES FOR {}".format(item.get('uuid')))
        return
    if item_pfs:
        patch['processed_files'] = item_pfs
    if item_opfs:
        for bin, opfs in item_opfs.items():
            if bin == 'Other':
                opftitle = 'Other files - non-binned'
                opfdesc = 'Non-bin specific files for {}'.format(label)
            elif bin == PF_BIN:
                opftitle = 'Additional {}ned files'.format(bin)
                opfdesc = 'Additional files associated with the {} size processing of data for {}'.format(bin, label)
            else:
                opftitle = '{}ned files'.format(bin)
                opfdesc = 'The files associated with the {} size processing of data for {}'.format(bin, label)
            patch.setdefault('other_processed_files', []).append(
                {'title': opftitle, 'description': opfdesc, 'type': 'supplementary', 'files': opfs})
    return patch
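# Sketch of the patch produced for a hypothetical item:
#   create_patch({'uuid': 'abc', 'processed_files': ['lab:f1'],
#                 'other_processed_files': {'10kb bin': ['lab:f2']}}, 'WT DamID')
#   -> {'processed_files': ['lab:f1'],
#       'other_processed_files': [{'title': '10kb binned files',
#                                  'description': 'The files associated with the 10kb bin '
#                                                 'size processing of data for WT DamID',
#                                  'type': 'supplementary', 'files': ['lab:f2']}]}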


def get_args(args):
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        parents=[scu.create_ff_arg_parser(), scu.create_input_arg_parser()],
    )
    args = parser.parse_args(args)  # parse the passed-in args rather than re-reading sys.argv
    return args


def main():  # pragma: no cover
    # initial set up
    args = get_args(sys.argv[1:])
    auth = scu.authenticate(key=args.key, keyfile=args.keyfile, env=args.env)

    # regexes for replicate and bin parsing; only binre is used below -
    # repre and erepnore support the alias-matching variations mentioned
    # in the module docstring
    repre = re.compile(r'_r\d+_')
    binre = re.compile(r'^\S+ bin')
    erepnore = re.compile(r'replicate\s\d+')

    # this is for parsing excel but could use a fourfront query
    infile = args.input[0]
    query = None  # optional query - parsed but not yet used
    if len(args.input) > 1:
        query = args.input[1]

    metadata = extract_rows(infile)
    patch_items = {}
    seen_esets = {}  # when dealing with an experiment we want to use the dataset_label
    # and condition of the replicate set to create the label
    # going row by row to add each file to the correct spot
    for meta in metadata:
        # checking if we have linked dataset info in the sheet - should be either
        # an experiment set or an experiment
        linked_dataset_id = meta.get('#linked datasets')
        if not linked_dataset_id:
            print("Can not get dataset_id for {}".format(meta))
            continue
        file_alias = meta.get('aliases')

        # build basic ds for the set
        if linked_dataset_id not in patch_items:
            item = get_metadata(linked_dataset_id, auth)  # either experiment or eset
            euuid = item.get('uuid')
            if not euuid:
                print("Can't get uuid for {} - skipping".format(linked_dataset_id))
                continue
            if 'experiments_in_set' in item:  # we've got an experiment set
                label = item.get('dataset_label') + ' ' + item.get('condition')
            else:  # we've got an experiment
                esets = item.get('experiment_sets')
                if len(esets) != 1:  # some sort of unusual situation
                    raise Exception('experiment linked to multiple experiment sets -- abort!')
                esetid = esets[0].get('uuid')
                if esetid not in seen_esets:
                    eset = get_metadata(esetid, auth)
                    label = eset.get('dataset_label') + ' ' + eset.get('condition')
                    seen_esets[esetid] = label
                else:
                    label = seen_esets[esetid]

            patch_items[linked_dataset_id] = {'uuid': euuid, 'label': label, 'processed_files': [],
                                              'other_processed_files': {}, 'experiments': {}}
        # use the description to get the replicate number (if any) and bin size (if any)
        desc = meta.get('description')

        bin = binre.match(desc)

        if is_processed_bin(desc, meta):
            patch_items[linked_dataset_id]['processed_files'].append(file_alias)
        else:
            if bin:
                bin = bin.group()
            else:
                bin = 'Other'
            patch_items[linked_dataset_id]['other_processed_files'].setdefault(bin, []).append(file_alias)

    patch_data = {}
    for e in patch_items.values():
        label = e.get('label')
        patch = create_patch(e, label)
        if patch:
            # merge in any files already on the existing item so they are not clobbered
            euid = e.get('uuid')
            existing_item = get_metadata(euid, auth)
            ipf = existing_item.get('processed_files')
            if ipf:
                if 'processed_files' in patch:
                    patch['processed_files'].extend(ipf)
            opf = existing_item.get('other_processed_files')
            if opf:
                if 'other_processed_files' in patch:
                    patch['other_processed_files'].extend(opf)

            patch_data[e.get('uuid')] = patch

    if patch_data:
        for puuid, pdata in patch_data.items():
            print(puuid, '\n', pdata, '\n\n')
            if args.dbupdate:
                try:
                    res = patch_metadata(pdata, puuid, auth)
                    print(res.get('status'))
                except Exception:
                    print("Can't patch {iid} with\n\t{p}".format(iid=puuid, p=pdata))


if __name__ == '__main__':
    main()
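# Example invocation (hypothetical workbook name; --env and --dbupdate are
# assumed to come from the shared scu argument parsers, which this script
# reads back as args.env, args.key/args.keyfile, and args.dbupdate):
#   python scripts/parse_damid_pf.py damid_pf_sheet.xlsx --env data
#   python scripts/parse_damid_pf.py damid_pf_sheet.xlsx --env data --dbupdate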