• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

cnr-ibba / SMARTER-database / 9173461010

21 May 2024 10:53AM UTC coverage: 94.311% (+0.003%) from 94.308%
9173461010

Pull #116

github

web-flow
Merge 7ac8fa63c into bd0c5e637
Pull Request #116: :sparkles: convert genotypes from top to forward

45 of 47 new or added lines in 4 files covered. (95.74%)

18 existing lines in 2 files now uncovered.

2984 of 3164 relevant lines covered (94.31%)

0.94 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

91.43
/src/data/import_from_plink.py
1
#!/usr/bin/env python3
2
# -*- coding: utf-8 -*-
3
"""
1✔
4
Created on Tue Mar 16 10:36:56 2021
5

6
@author: Paolo Cozzi <paolo.cozzi@ibba.cnr.it>
7

8
This script will upload data into smarter database starting from a couple of
9
MAP/PED (plink) files and the archive file used for the dataset upload in the
10
smarter database. Dataset country and species are mandatory and need to be
11
correctly defined, breed also must be loaded in database in order to define
12
the full smarter id like CO(untry)SP(ecies)-BREED-ID
13
The default format is illumina top, but its possible to convert into it
14
from an illumina forward format
15
"""
16

17
import click
1✔
18
import logging
1✔
19
import subprocess
1✔
20

21
from pathlib import Path
1✔
22
from click_option_group import (
1✔
23
    optgroup, RequiredMutuallyExclusiveOptionGroup,
24
    MutuallyExclusiveOptionGroup)
25

26
from src.features.plinkio import (
1✔
27
    TextPlinkIO, BinaryPlinkIO, plink_binary_exists)
28
from src.features.smarterdb import Dataset, global_connection, SupportedChip
1✔
29
from src.data.common import WORKING_ASSEMBLIES, PLINK_SPECIES_OPT, AssemblyConf
1✔
30

31
logger = logging.getLogger(__name__)
1✔
32

33

34
def get_output_files(prefix: str, working_dir: Path, assembly: str):
1✔
35
    # create output directory
36
    output_dir = working_dir / assembly
1✔
37
    output_dir.mkdir(exist_ok=True)
1✔
38

39
    # determine map outputfile. get the basename of the prefix
40
    prefix = Path(prefix)
1✔
41

42
    # sanitize names
43
    output_map = f"{prefix.name}_updated.map".replace(" ", "_")
1✔
44
    output_map = output_dir / output_map
1✔
45

46
    # determine ped outputfile
47
    output_ped = f"{prefix.name}_updated.ped".replace(" ", "_")
1✔
48
    output_ped = output_dir / output_ped
1✔
49

50
    return output_dir, output_map, output_ped
1✔
51

52

53
def deal_with_text_plink(file_: str, dataset: Dataset, assembly: str):
1✔
54
    mapfile = file_ + ".map"
1✔
55
    pedfile = file_ + ".ped"
1✔
56

57
    # check files are in dataset
58
    if mapfile not in dataset.contents or pedfile not in dataset.contents:
1✔
59
        raise Exception(
×
60
            "Couldn't find files in dataset: check for both "
61
            f"'{mapfile}' and '{pedfile}' in '{dataset}'")
62

63
    # check for working directory
64
    working_dir = dataset.working_dir
1✔
65

66
    if not working_dir.exists():
1✔
67
        raise Exception(f"Could find dataset directory {working_dir}")
×
68

69
    # determine full file paths
70
    mappath = working_dir / mapfile
1✔
71
    pedpath = working_dir / pedfile
1✔
72

73
    # instantiating a TextPlinkIO object
74
    plinkio = TextPlinkIO(
1✔
75
        mapfile=str(mappath),
76
        pedfile=str(pedpath),
77
        species=dataset.species
78
    )
79

80
    # determine output files
81
    output_dir, output_map, output_ped = get_output_files(
1✔
82
        file_, working_dir, assembly)
83

84
    return plinkio, output_dir, output_map, output_ped
1✔
85

86

87
def deal_with_binary_plink(bfile: str, dataset: Dataset, assembly: str):
1✔
88
    bedfile = bfile + ".bed"
1✔
89
    bimfile = bfile + ".bim"
1✔
90
    famfile = bfile + ".fam"
1✔
91

92
    all_files = set([bedfile, bimfile, famfile])
1✔
93

94
    if not all_files.issubset(set(dataset.contents)):
1✔
95
        raise Exception(
×
96
            "Couldn't find files in dataset: check for "
97
            f"'{all_files}' in '{dataset}'")
98

99
    # check for working directory
100
    working_dir = dataset.working_dir
1✔
101

102
    if not working_dir.exists():
1✔
103
        raise Exception(f"Could find dataset directory {working_dir}")
×
104

105
    # determine full file paths
106
    bfilepath = working_dir / bfile
1✔
107

108
    # instantiating a BinaryPlinkIO object
109
    plinkio = BinaryPlinkIO(
1✔
110
        prefix=str(bfilepath),
111
        species=dataset.species
112
    )
113

114
    # determine output files
115
    output_dir, output_map, output_ped = get_output_files(
1✔
116
        bfile, working_dir, assembly)
117

118
    return plinkio, output_dir, output_map, output_ped
1✔
119

120

121
@click.command()
1✔
122
@optgroup.group(
1✔
123
    'Plink input parameters',
124
    cls=RequiredMutuallyExclusiveOptionGroup
125
)
126
@optgroup.option(
1✔
127
    '--file',
128
    'file_',
129
    type=str,
130
    help="PLINK text file prefix")
131
@optgroup.option(
1✔
132
    '--bfile',
133
    type=str,
134
    help="PLINK binary file prefix")
135
@click.option(
1✔
136
    '--dataset', type=str, required=True,
137
    help="The raw dataset file name (zip archive)"
138
)
139
@click.option(
1✔
140
    '--src_coding',
141
    type=click.Choice(
142
        ['top', 'forward', 'ab', 'affymetrix', 'illumina'],
143
        case_sensitive=False),
144
    default="top", show_default=True,
145
    help="Genotype source coding format"
146
)
147
@click.option(
1✔
148
    '--chip_name',
149
    type=str,
150
    required=True,
151
    help="The SMARTER SupportedChip name")
152
@click.option(
1✔
153
    '--assembly',
154
    type=str,
155
    required=True,
156
    help="Destination assembly of the converted genotypes")
157
@click.option(
1✔
158
    '--create_samples',
159
    is_flag=True,
160
    help="Create a new SampleSheep or SampleGoat object if doesn't exist")
161
@click.option(
1✔
162
    '--sample_field',
163
    type=str,
164
    default="original_id",
165
    help="Search samples using this attribute"
166
)
167
@optgroup.group(
1✔
168
    'Variant Search Type',
169
    cls=MutuallyExclusiveOptionGroup
170
)
171
@optgroup.option(
1✔
172
    '--search_field',
173
    type=str,
174
    default="name",
175
    show_default=True,
176
    help='search variants using this field')
177
@optgroup.option(
1✔
178
    '--search_by_positions',
179
    is_flag=True,
180
    help='search variants using their positions')
181
@click.option(
1✔
182
    '--src_version',
183
    type=str,
184
    help="Source assembly version")
185
@click.option(
1✔
186
    '--src_imported_from',
187
    type=str,
188
    help="Source assembly imported_from")
189
@click.option(
1✔
190
    '--ignore_coding_errors',
191
    is_flag=True,
192
    help=(
193
        'set SNP as missing when there are coding errors '
194
        '(no more CodingException)'))
195
def main(file_, bfile, dataset, src_coding, chip_name, assembly, create_samples,
1✔
196
         sample_field, search_field, search_by_positions, src_version,
197
         src_imported_from, ignore_coding_errors):
198
    """
199
    Read genotype data from a PLINK file (text or binary) and convert it
200
    to the desired assembly version using Illumina TOP coding
201
    """
202

203
    logger.info(f"{Path(__file__).name} started")
1✔
204

205
    # find assembly configuration
206
    if assembly not in WORKING_ASSEMBLIES:
1✔
UNCOV
207
        raise Exception(f"assembly {assembly} not managed by smarter")
×
208

209
    src_assembly, dst_assembly = None, None
1✔
210

211
    if src_version and src_imported_from:
1✔
212
        src_assembly = AssemblyConf(src_version, src_imported_from)
1✔
213
        logger.info(f"Got '{src_assembly} as source assembly'")
1✔
214
        dst_assembly = WORKING_ASSEMBLIES[assembly]
1✔
215

216
    else:
217
        src_assembly = WORKING_ASSEMBLIES[assembly]
1✔
218

219
    # get the dataset object
220
    dataset = Dataset.objects(file=dataset).get()
1✔
221

222
    logger.debug(f"Found {dataset}")
1✔
223

224
    if file_:
1✔
225
        plinkio, output_dir, output_map, output_ped = deal_with_text_plink(
1✔
226
            file_, dataset, assembly)
227

228
    elif bfile:
1✔
229
        plinkio, output_dir, output_map, output_ped = deal_with_binary_plink(
1✔
230
            bfile, dataset, assembly)
231

232
    # check chip_name
233
    illumina_chip = SupportedChip.objects(name=chip_name).get()
1✔
234

235
    # set chip name for this sample
236
    plinkio.chip_name = illumina_chip.name
1✔
237

238
    # test if I have already run this analysis
239

240
    # ok check for results dir
241
    results_dir = dataset.result_dir
1✔
242
    results_dir = results_dir / assembly
1✔
243
    results_dir.mkdir(parents=True, exist_ok=True)
1✔
244

245
    # define final filename
246
    final_prefix = results_dir / output_ped.stem
1✔
247

248
    # test for processed files existance
249
    if plink_binary_exists(final_prefix):
1✔
250
        logger.warning(f"Skipping {dataset} processing: {final_prefix} exists")
1✔
251
        logger.info(f"{Path(__file__).name} ended")
1✔
252
        return
1✔
253

254
    # if I arrive here, I can create output files
255

256
    # read mapdata and read updated coordinates from db
257
    plinkio.read_mapfile()
1✔
258

259
    if search_by_positions:
1✔
260
        # fetch variants relying positions
261
        plinkio.fetch_coordinates_by_positions(
1✔
262
            src_assembly=src_assembly,
263
            dst_assembly=dst_assembly
264
        )
265

266
    else:
267
        # fetch coordinates relying assembly configuration
268
        plinkio.fetch_coordinates(
1✔
269
            src_assembly=src_assembly,
270
            dst_assembly=dst_assembly,
271
            search_field=search_field,
272
            chip_name=illumina_chip.name
273
        )
274

275
    logger.info("Writing a new map file with updated coordinates")
1✔
276
    plinkio.update_mapfile(str(output_map))
1✔
277

278
    logger.info("Writing a new ped file with fixed genotype")
1✔
279
    plinkio.update_pedfile(
1✔
280
        outputfile=output_ped,
281
        dataset=dataset,
282
        src_coding=src_coding,
283
        create_samples=create_samples,
284
        sample_field=sample_field,
285
        ignore_coding_errors=ignore_coding_errors
286
    )
287

288
    # ok time to convert data in plink binary format
289
    cmd = ["plink"] + PLINK_SPECIES_OPT[dataset.species] + [
1✔
290
        "--file",
291
        f"{output_dir / output_ped.stem}",
292
        "--make-bed",
293
        "--out",
294
        f"{final_prefix}"
295
    ]
296

297
    # debug
298
    logger.info("Executing: " + " ".join(cmd))
1✔
299

300
    subprocess.run(cmd, check=True)
1✔
301

302
    logger.info(f"{Path(__file__).name} ended")
1✔
303

304

305
if __name__ == '__main__':
1✔
UNCOV
306
    log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
×
UNCOV
307
    logging.basicConfig(level=logging.INFO, format=log_fmt)
×
308

309
    # connect to database
UNCOV
310
    global_connection()
×
311

312
    main()
×
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc