
4dn-dcic / dcicwrangling / build 16969763498

14 Aug 2025 03:36PM UTC · coverage: 33.8% (remained the same)

Pull Request #121: bug fixes for GEO notebook (github / web-flow)
Merge 98bf8b7db into 7aee3bab8

1256 of 3716 relevant lines covered (33.8%) · 1.35 hits per line

Source File: /scripts/parse_damid_pf.py (0.0% of lines covered)

#!/usr/bin/env python3
import sys
import argparse
import re
from dcicutils.ff_utils import patch_metadata, get_metadata
from dcicwrangling.functions import script_utils as scu
from dcicwrangling.functions.notebook_functions import digest_xlsx, reader
'''
Parse a DamID processed-file worksheet to generate the various bins of
"other processed files", using information from the linked_dataset column.
The linked_dataset is expected to be an identifier for the experiment or
replicate set to which the file should be linked.

Because the bin used for 'official' processed files (as opposed to
supplementary files) can change depending on the target or experiment type
(DamID vs. pA-DamID), PF_BIN specifies which bin to use.

NOTE: there are variations that can deal with incomplete information, i.e.
not specifying the experiments but just using replicate set ids and then
info in aliases to match up replicates - see Andy if needed.
'''

PF_BIN = '5kb bin'  # the bin for which a subset of files should be considered processed_files rather than opfs
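# Illustration (hypothetical descriptions): files whose description starts
# with PF_BIN and that pass the checks in is_processed_bin() below are
# promoted to processed_files; everything else is grouped into
# other_processed_files keyed by its bin, e.g.
#   '5kb bin normalized counts' (bw)   -> processed_files
#   '10kb bin normalized counts' (bw)  -> other_processed_files['10kb bin']
#   'mapped reads (bam)'               -> processed_files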


def extract_rows(infile):
    ''' Read the FileProcessed worksheet into a list of dicts keyed by header field. '''
    book, sheets = digest_xlsx(infile)
    data = []
    row = reader(book, sheetname='FileProcessed')
    fields = next(row)
    fields = [f.replace('*', '') for f in fields]
    types = next(row)  # the types row is consumed only to skip past it
    fields.pop(0)  # drop the leading comment column
    types.pop(0)
    for values in row:
        if values[0].startswith('#'):  # skip commented-out rows
            continue
        values.pop(0)
        meta = dict(zip(fields, values))
        data.append(meta)
    return data
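# Sketch of the expected shape of extract_rows() output, assuming a
# hypothetical 'FileProcessed' sheet (aliases and ids are illustrative):
#   [{'aliases': 'lab:damid_f1', 'description': '5kb bin normalized counts',
#     'file_type': 'normalized counts', 'file_format': 'bw',
#     '#linked datasets': 'lab:eset_1'},
#    ...]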


def is_processed_bin(desc, meta):
    ''' Put bam files, and selected files in the bin specified by PF_BIN, into
        the processed file bin; this is a specific check for those attributes.
    '''
    if 'mapped reads' in desc:
        return True
    if desc.startswith(PF_BIN):
        if (meta.get('file_type') == 'normalized counts' and meta.get('file_format') == 'bw') or (
                meta.get('file_type') == 'LADs' and meta.get('file_format') == 'bed'):
            return True
    return False
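# For example (hypothetical descriptions and metadata):
#   is_processed_bin('mapped reads (bam)', {})  -> True
#   is_processed_bin('5kb bin normalized counts',
#                    {'file_type': 'normalized counts', 'file_format': 'bw'})  -> True
#   is_processed_bin('10kb bin normalized counts',
#                    {'file_type': 'normalized counts', 'file_format': 'bw'})  -> False (not PF_BIN)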


def create_patch(item, label, rep=None):
    ''' Build the patch of processed_files and other_processed_files for an item. '''
    patch = {}
    if rep:
        label = label + ' ' + rep
    item_pfs = item.get('processed_files')
    item_opfs = item.get('other_processed_files')
    if not (item_pfs or item_opfs):
        print("NO FILES FOR {}".format(item.get('uuid')))
        return
    if item_pfs:
        patch['processed_files'] = item_pfs
    if item_opfs:
        for bin, opfs in item_opfs.items():
            if bin == 'Other':
                opftitle = 'Other files - non-binned'
                opfdesc = 'Non-bin specific files for {}'.format(label)
            elif bin == PF_BIN:
                opftitle = 'Additional {}ned files'.format(bin)
                opfdesc = 'Additional files associated with the {} size processing of data for {}'.format(bin, label)
            else:
                opftitle = '{}ned files'.format(bin)
                opfdesc = 'The files associated with the {} size processing of data for {}'.format(bin, label)
            patch.setdefault('other_processed_files', []).append(
                {'title': opftitle, 'description': opfdesc, 'type': 'supplementary', 'files': opfs})
    return patch
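# Sketch of the patch produced for a hypothetical item:
#   create_patch({'uuid': 'abc', 'processed_files': ['lab:f1'],
#                 'other_processed_files': {'10kb bin': ['lab:f2']}}, 'WT DamID')
#   -> {'processed_files': ['lab:f1'],
#       'other_processed_files': [{'title': '10kb binned files',
#                                  'description': 'The files associated with the 10kb bin '
#                                                 'size processing of data for WT DamID',
#                                  'type': 'supplementary', 'files': ['lab:f2']}]}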


def get_args(args):
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        parents=[scu.create_ff_arg_parser(), scu.create_input_arg_parser()],
    )
    args = parser.parse_args(args)  # parse the passed-in args rather than re-reading sys.argv
    return args


def main():  # pragma: no cover
    # initial set up
    args = get_args(sys.argv[1:])
    auth = scu.authenticate(key=args.key, keyfile=args.keyfile, env=args.env)

    # regexes for replicate and bin parsing; only binre is used below -
    # repre and erepnore support the alias-matching variations mentioned
    # in the module docstring
    repre = re.compile(r'_r\d+_')
    binre = re.compile(r'^\S+ bin')
    erepnore = re.compile(r'replicate\s\d+')

    # this is for parsing excel but could use a fourfront query
    infile = args.input[0]
    query = None  # optional query - parsed but not yet used
    if len(args.input) > 1:
        query = args.input[1]

    metadata = extract_rows(infile)
    patch_items = {}
    seen_esets = {}  # when dealing with an experiment we want to use the dataset_label
    # and condition of the replicate set to create the label
    # going row by row to add each file to the correct spot
    for meta in metadata:
        # checking if we have linked dataset info in the sheet - should be either
        # an experiment set or an experiment
        linked_dataset_id = meta.get('#linked datasets')
        if not linked_dataset_id:
            print("Can not get dataset_id for {}".format(meta))
            continue
        file_alias = meta.get('aliases')

        # build basic ds for the set
        if linked_dataset_id not in patch_items:
            item = get_metadata(linked_dataset_id, auth)  # either experiment or eset
            euuid = item.get('uuid')
            if not euuid:
                print("Can't get uuid for {} - skipping".format(linked_dataset_id))
                continue
            if 'experiments_in_set' in item:  # we've got an experiment set
                label = item.get('dataset_label') + ' ' + item.get('condition')
            else:  # we've got an experiment
                esets = item.get('experiment_sets')
                if len(esets) != 1:  # some sort of unusual situation
                    raise Exception('experiment linked to multiple experiment sets -- abort!')
                esetid = esets[0].get('uuid')
                if esetid not in seen_esets:
                    eset = get_metadata(esetid, auth)
                    label = eset.get('dataset_label') + ' ' + eset.get('condition')
                    seen_esets[esetid] = label
                else:
                    label = seen_esets[esetid]

            patch_items[linked_dataset_id] = {'uuid': euuid, 'label': label, 'processed_files': [],
                                              'other_processed_files': {}, 'experiments': {}}
        # use the description to get the replicate number (if any) and bin size (if any)
        desc = meta.get('description')

        bin = binre.match(desc)

        if is_processed_bin(desc, meta):
            patch_items[linked_dataset_id]['processed_files'].append(file_alias)
        else:
            if bin:
                bin = bin.group()
            else:
                bin = 'Other'
            patch_items[linked_dataset_id]['other_processed_files'].setdefault(bin, []).append(file_alias)

    patch_data = {}
    for e in patch_items.values():
        label = e.get('label')
        patch = create_patch(e, label)
        if patch:
            # merge in any files already on the existing item so they are not clobbered
            euid = e.get('uuid')
            existing_item = get_metadata(euid, auth)
            ipf = existing_item.get('processed_files')
            if ipf:
                if 'processed_files' in patch:
                    patch['processed_files'].extend(ipf)
            opf = existing_item.get('other_processed_files')
            if opf:
                if 'other_processed_files' in patch:
                    patch['other_processed_files'].extend(opf)

            patch_data[e.get('uuid')] = patch

    if patch_data:
        for puuid, pdata in patch_data.items():
            print(puuid, '\n', pdata, '\n\n')
            if args.dbupdate:
                try:
                    res = patch_metadata(pdata, puuid, auth)
                    print(res.get('status'))
                except Exception:
                    print("Can't patch {iid} with\n\t{p}".format(iid=puuid, p=pdata))


if __name__ == '__main__':
    main()
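# Example invocation (hypothetical workbook name; --env and --dbupdate are
# assumed to come from the shared scu argument parsers, which this script
# reads back as args.env, args.key/args.keyfile, and args.dbupdate):
#   python scripts/parse_damid_pf.py damid_pf_sheet.xlsx --env data
#   python scripts/parse_damid_pf.py damid_pf_sheet.xlsx --env data --dbupdate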