• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

cnr-ibba / SMARTER-database / 9209232766

23 May 2024 01:49PM UTC coverage: 94.429% (+0.1%) from 94.311%
9209232766

Pull #131

github

web-flow
Merge 870cd4265 into fcdb3ce6a
Pull Request #131: :bookmark: release 0.4.10

20 of 20 new or added lines in 7 files covered. (100.0%)

29 existing lines in 5 files now uncovered.

3068 of 3249 relevant lines covered (94.43%)

0.94 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

95.29
/src/data/import_samples.py
1
#!/usr/bin/env python3
2
# -*- coding: utf-8 -*-
3
"""
1✔
4
Created on Tue May  4 15:16:09 2021
5

6
@author: Paolo Cozzi <paolo.cozzi@ibba.cnr.it>
7

8
First attempt to create samples BEFORE reading genotype file. This is necessary
9
if we have breeds in different countries, since we can't read country from
10
breed aliases relying only on FID. Moreover, this could also manage the
11
relationship problems (in order to have all samples already in the database when
12
processing genotypes)
13
"""
14

15
import click
1✔
16
import logging
1✔
17
import functools
1✔
18

19
from pathlib import Path
1✔
20
from click_option_group import (
1✔
21
    optgroup, RequiredMutuallyExclusiveOptionGroup,
22
    MutuallyExclusiveOptionGroup)
23
from mongoengine.errors import DoesNotExist
1✔
24

25
import pycountry
1✔
26
from pandas.core.series import Series
1✔
27

28
from src.data.common import (
1✔
29
    deal_with_datasets, pandas_open, get_sample_species,
30
    deal_with_sex_and_alias)
31
from src.features.smarterdb import (
1✔
32
    global_connection, Breed, get_or_create_sample, get_sample_type,
33
    SmarterDBException)
34
from src.features.utils import UnknownCountry
1✔
35

36
# module-level logger, named after this module through __name__
logger = logging.getLogger(__name__)
1✔
37

38

39
@functools.lru_cache
def find_country(country: str):
    """Resolve a free-text country name through a pycountry fuzzy search

    Results are memoized with ``functools.lru_cache``, so calling this
    function again with the same string doesn't repeat the search.

    Args:
        country (str): the fuzzy country name. Underscores are treated as
            spaces

    Returns:
        pycountry.db.Country: the first result of the fuzzy search, or an
        ``UnknownCountry`` instance when the name is "unknown" (in any case)
    """
    # pycountry can't deal with underscores in country names: use spaces
    country = country.replace("_", " ")

    # the "unknown" placeholder never reaches pycountry
    if country.lower() == "unknown":
        return UnknownCountry()

    # keep only the first result of the fuzzy search
    candidates = pycountry.countries.search_fuzzy(country)
    fuzzy = candidates[0]

    logger.info(f"Found {fuzzy} for {country}")

    return fuzzy
1✔
61

62

63
def deal_with_breeds(
        code: str, code_column: str, dst_dataset: str, row: Series):
    """
    Determine the breed and the breed code of a sample, either by applying
    the same code to every sample or by reading it from the metadata row

    Parameters
    ----------
    code : str
        Search for a :py:class:`Breed` object using this code and the
        dataset species. When empty, the code is read from ``code_column``
    code_column : str
        The column label to be searched in ``row`` when ``code`` is not
        provided.
    dst_dataset : Dataset
        The destination dataset. NOTE: the signature annotates this as
        ``str``, but the code reads its ``species`` attribute and uses the
        object itself to match breed aliases.
    row : Series
        A row of metadata file.

    Returns
    -------
    breed : Breed
        A breed instance.
    code : str
        A breed code to be applied to sample.

    Raises
    ------
    SmarterDBException
        If the code read from ``code_column`` doesn't match any breed alias
        (``fid``) defined for ``dst_dataset``. The original
        ``DoesNotExist`` is attached as the cause.
    """

    # code_column has a default value, so a provided code is checked first
    if code:
        # search by breed code and species. Exceptions raised by this
        # lookup propagate unchanged
        breed = Breed.objects(
            code=code,
            species=dst_dataset.species
        ).get()

    else:
        # the value is forced to a string before being matched against the
        # 'fid' of breed aliases
        code = str(row.get(code_column))

        logger.debug(f"search for fid: {code}, dataset: {dst_dataset}")

        # search by alias: the same fid could be used in different datasets
        try:
            breed = Breed.objects(
                aliases__match={'fid': code, 'dataset': dst_dataset}).get()

        except DoesNotExist as exc:
            logger.debug(exc)

            # chain explicitly, so the traceback reports the failed lookup
            # as the direct cause and not as an error in this handler
            raise SmarterDBException(
                f"Couldn't find fid: {code}, dataset: {dst_dataset}"
            ) from exc

    logger.debug(f"found breed '{breed}'")

    return breed, code
1✔
115

116

117
def deal_with_countries(country: str, country_column: str, row: Series):
    """
    Determine the country of a sample from an input value or from the
    metadata row

    Parameters
    ----------
    country : str
        Apply this country to sample. When empty, the country is read from
        ``country_column``
    country_column : str
        The column label to be searched in ``row`` when ``country`` is not
        provided.
    row : Series
        A row of metadata file.

    Returns
    -------
    object
        Whatever :py:func:`find_country` returns for the selected country
        name.
    """

    logger.debug(f"Got: {country}, {country_column}")

    # country_column has a default value, so a provided country is checked
    # first and the column is read only as a fallback
    selected = country if country else row.get(country_column)

    # find_country does a fuzzy search and is decorated with lru_cache, so
    # repeated names are resolved only once
    return find_country(selected)
1✔
147

148

149
@click.command()
@click.option(
    '--src_dataset', type=str, required=True,
    help="The raw dataset file name (zip archive) in which search datafile"
)
@click.option(
    '--dst_dataset', type=str, required=False,
    help=("The raw dataset file name (zip archive) in which define samples"
          " (def. the 'src_dataset')")
)
@click.option(
    '--datafile',
    type=str,
    required=True,
    help="The metadata file in which search for information")
@optgroup.group(
    'Codes',
    cls=RequiredMutuallyExclusiveOptionGroup
)
@optgroup.option(
    '--code_column',
    type=str,
    default="code",
    help="Code column in src datafile (ie FID)"
)
@optgroup.option(
    '--code_all',
    type=str,
    help="Code applied to all items in datafile"
)
@optgroup.group(
    'Countries',
    cls=RequiredMutuallyExclusiveOptionGroup
)
@optgroup.option(
    '--country_column',
    type=str,
    default="country",
    help="Country column in src datafile"
)
@optgroup.option(
    '--country_all',
    type=str,
    help="Country applied to all items in datafile"
)
@optgroup.group(
    'Species',
    cls=MutuallyExclusiveOptionGroup
)
@optgroup.option(
    '--species_column',
    type=str,
    help="Species column in src datafile"
)
@optgroup.option(
    '--species_all',
    type=str,
    help="Species applied to all items in datafile"
)
@click.option('--id_column', type=str, required=True,
              help="The 'original_id' column to place in smarter database")
@click.option(
    '--sex_column',
    type=str,
    help="Sex column in src datafile")
@click.option(
    '--chip_name',
    type=str,
    required=True,
    help="The SMARTER SupportedChip name")
@click.option(
    '--alias_column',
    type=str,
    help="An alias for original_id")
@click.option(
    '--skip_missing_alias',
    is_flag=True,
    help="Don't import samples with no alias")
def main(
        src_dataset, dst_dataset, datafile, code_column, code_all,
        country_column, country_all, species_column, species_all,
        id_column, sex_column, chip_name, alias_column, skip_missing_alias):
    """Generate samples from a metadata file"""

    # NOTE(review): the docstring above is presumably what click shows as
    # the command help, so it is left unchanged and the details live in
    # comments. Every parameter is filled by the option with the same name.
    # Each row of the datafile becomes a sample in the destination dataset,
    # unless 'skip_missing_alias' is set and the row has no alias.

    logger.info(f"{Path(__file__).name} started")

    # from here on src_dataset and dst_dataset are the objects returned by
    # deal_with_datasets, not the strings received from the command line
    src_dataset, dst_dataset, datapath = deal_with_datasets(
        src_dataset, dst_dataset, datafile)

    # mind dataset species
    SampleSpecie = get_sample_species(dst_dataset.species)

    # get sample type
    type_ = get_sample_type(dst_dataset)

    # read datafile
    data = pandas_open(datapath)

    logger.info(f"Got columns: {data.columns.to_list()}")

    # the row label is not used: samples are identified through id_column
    for _, row in data.iterrows():
        logger.debug(f"Got: {row.to_list()}")

        # this will be the original_id
        original_id = str(row.get(id_column))

        # determine breeds and code relying on parameters
        breed, code = deal_with_breeds(code_all, code_column, dst_dataset, row)

        # determine country
        country = deal_with_countries(country_all, country_column, row)

        # assign species from datasource column or from parameter
        species = row.get(species_column) if species_column else species_all

        sex, alias = deal_with_sex_and_alias(
            sex_column, alias_column, row)

        # the same description is shared by the two messages below
        details = (
            f"code: {code}, country: {country}, breed: {breed}, "
            f"original_id: {original_id}, sex: {sex}, alias: {alias}"
        )

        logger.debug(f"Got {details}")

        if skip_missing_alias and not alias:
            logger.warning(f"Ignoring {details}")
            continue

        # get or create a new Sample Obj
        sample, created = get_or_create_sample(
            SampleSpecies=SampleSpecie,
            original_id=original_id,
            dataset=dst_dataset,
            type_=type_,
            breed=breed,
            country=country.name,
            species=species,
            chip_name=chip_name,
            sex=sex,
            alias=alias)

        if created:
            logger.info(f"Sample '{sample}' added to database")

    logger.info(f"{Path(__file__).name} ended")
1✔
300

301

302
if __name__ == '__main__':
    # log timestamp, logger name, level and message, from INFO level up
    logging.basicConfig(
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        level=logging.INFO)

    # connect to database before running the command
    global_connection()

    # arguments are collected from the command line by click
    main()
×
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc