cnr-ibba / SMARTER-database / build 9226859311
24 May 2024 04:08PM UTC. Coverage: 94.434% (+0.1%) from 94.311%

Pull Request #131 (github): :bookmark: release 0.4.10
Author: bunop
Commit: :bug: fix issues with countries in MERINO_INIA_UY dataset
(setting Australia to imported animals)

29 of 29 new or added lines in 8 files covered (100.0%)
1 existing line in 1 file is now uncovered
3071 of 3252 relevant lines covered (94.43%)
0.94 hits per line

Source file: /src/data/import_datasets.py (92.42% covered)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Feb 22 18:32:54 2021

@author: Paolo Cozzi <paolo.cozzi@ibba.cnr.it>
"""

import csv
import sys
import click
import logging
import zipfile
import collections

from pathlib import Path

from src.features.smarterdb import global_connection, Dataset
from src.features.utils import sanitize, get_raw_dir

logger = logging.getLogger(__name__)


@click.command()
@click.argument('input_filepath', type=click.Path(exists=True))
@click.option(
    '--types', nargs=2, type=str, required=True,
    help=(
        '2 argument types (ex. genotypes background, phenotypes foreground,'
        ' etc)'))
def main(input_filepath, types):
    """
    Import a dataset stored in the ``data/raw`` folder into the *smarter*
    database and unpack its file contents into a ``data/interim`` subfolder

    INPUT_FILEPATH: the CSV dataset description file
    """

    logger.info(f"{Path(__file__).name} started")

    # where to find raw data in the SMARTER-database project
    raw_dir = get_raw_dir()

    with open(input_filepath) as handle:
        reader = csv.reader(handle, delimiter=";")

        header = next(reader)

        # remove header id
        del header[0]

        # sanitize columns
        header = [sanitize(col) for col in header]

        logger.debug("Got '%s' as header" % header)

        # define a datatype for my data
        Record = collections.namedtuple("Record", header)

        for line in reader:
            # remove id from record
            del line[0]

            # remove empty values
            line = [col if col != '' else None for col in line]

            record = Record._make(line)
            logger.debug(record)

            # search for the archive file
            try:
                archive = next(raw_dir.rglob(record.file))
                logger.info(f"Found '{archive}' dataset")

            except StopIteration:
                logger.critical(f"Cannot find '{record.file}' in '{raw_dir}'")
                sys.exit(f"'{record.file}' does not exist")

            archive = zipfile.ZipFile(archive)

            logger.debug("Get file contents")
            contents = archive.namelist()
            logger.debug(contents)

            # create or update a dataset (file is a unique key)
            qs = Dataset.objects(file=record.file)

            if qs.count() == 0:
                # create a new object
                dataset = Dataset(
                    **record._asdict(),
                    type_=types,
                    contents=contents)

                logger.info(f"Create new dataset '{dataset}'")

            elif qs.count() == 1:
                # update the object
                dataset = qs.get()

                for k, v in record._asdict().items():
                    setattr(dataset, k, v)

                dataset.type_ = types
                dataset.contents = contents

                logger.debug(f"Dataset '{dataset}' updated")

            dataset.save()

            # ok, extract contents to the working directory
            # TODO: doesn't work with plain text files; try to work with
            # compressed data
            working_dir = dataset.working_dir
            working_dir.mkdir(exist_ok=True)

            for member in contents:
                test = working_dir / member
                if not test.exists():
                    logger.info(f"Extract '{member}': in '{working_dir}'")
                    archive.extract(member, working_dir)

                else:
                    # this branch is the one existing line now uncovered:
                    # the test suite never re-imports an extracted member
                    logger.debug(f"Skipping '{member}': already extracted")

    logger.info("Data written into database")

    logger.info(f"{Path(__file__).name} ended")


if __name__ == '__main__':
    # the lines below run only as a script and are not covered by tests
    log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    logging.basicConfig(level=logging.INFO, format=log_fmt)

    # connect to database
    global_connection()

    main()
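For reference, main() expects a semicolon-separated description file whose first column is a row id (dropped on read) and whose remaining, sanitized column names are passed straight to the Dataset constructor; the only column the script itself relies on is 'file', the archive name searched for under data/raw. A minimal sketch of such a file and one possible invocation follows; the extra column names (country, species) and the file names are hypothetical illustrations, not taken from this build:

    id;file;country;species
    1;MERINO_INIA_UY.zip;Uruguay;Sheep

    # hypothetical invocation from the project root; -m keeps the
    # src.features imports resolvable
    python -m src.data.import_datasets \
        --types genotypes background \
        data/raw/description.csv

The --types option takes exactly two values (nargs=2 in the click declaration), for example 'genotypes background', while the positional argument is the description file path.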