cnr-ibba / SMARTER-database / build 9226859311
24 May 2024 04:08PM UTC. Coverage: 94.434% (+0.1%) from 94.311%

Pull Request #131 (github): :bookmark: release 0.4.10
Author: bunop
Commit: :bug: fix issues with countries in MERINO_INIA_UY dataset
(setting Australia to imported animals)

29 of 29 new or added lines in 8 files covered (100.0%)
1 existing line in 1 file is now uncovered
3071 of 3252 relevant lines covered (94.43%)
0.94 hits per line

Source file: /src/data/import_datasets.py (92.42% covered)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Feb 22 18:32:54 2021

@author: Paolo Cozzi <paolo.cozzi@ibba.cnr.it>
"""

import csv
import sys
import click
import logging
import zipfile
import collections

from pathlib import Path

from src.features.smarterdb import global_connection, Dataset
from src.features.utils import sanitize, get_raw_dir

logger = logging.getLogger(__name__)


@click.command()
@click.argument('input_filepath', type=click.Path(exists=True))
@click.option(
    '--types', nargs=2, type=str, required=True,
    help=(
        '2 argument types (ex. genotypes background, phenotypes foreground,'
        ' etc)'))
def main(input_filepath, types):
    """
    Import a dataset stored in the ``data/raw`` folder into the *smarter*
    database and unpack its file contents into a ``data/interim`` subfolder

    INPUT_FILEPATH: the CSV dataset description file
    """

    logger.info(f"{Path(__file__).name} started")

    # where to find raw data in the SMARTER-database project
    raw_dir = get_raw_dir()

    with open(input_filepath) as handle:
        reader = csv.reader(handle, delimiter=";")

        header = next(reader)

        # remove header id
        del header[0]

        # sanitize columns
        header = [sanitize(col) for col in header]

        logger.debug("Got '%s' as header" % header)

        # define a datatype for my data
        Record = collections.namedtuple("Record", header)

        for line in reader:
            # remove id from record
            del line[0]

            # remove empty values
            line = [col if col != '' else None for col in line]

            record = Record._make(line)
            logger.debug(record)

            # search for the archive file
            try:
                archive = next(raw_dir.rglob(record.file))
                logger.info(f"Found '{archive}' dataset")

            except StopIteration:
                logger.critical(f"Cannot find '{record.file}' in '{raw_dir}'")
                sys.exit(f"'{record.file}' does not exist")

            archive = zipfile.ZipFile(archive)

            logger.debug("Get file contents")
            contents = archive.namelist()
            logger.debug(contents)

            # create or update a dataset (file is a unique key)
            qs = Dataset.objects(file=record.file)

            if qs.count() == 0:
                # create a new object
                dataset = Dataset(
                    **record._asdict(),
                    type_=types,
                    contents=contents)

                logger.info(f"Create new dataset '{dataset}'")

            elif qs.count() == 1:
                # update the object
                dataset = qs.get()

                for k, v in record._asdict().items():
                    setattr(dataset, k, v)

                dataset.type_ = types
                dataset.contents = contents

                logger.debug(f"Dataset '{dataset}' updated")

            dataset.save()

            # ok, extract contents to the working directory
            # TODO: doesn't work with plain text files; try to work with
            # compressed data
            working_dir = dataset.working_dir
            working_dir.mkdir(exist_ok=True)

            for member in contents:
                test = working_dir / member
                if not test.exists():
                    logger.info(f"Extract '{member}': in '{working_dir}'")
                    archive.extract(member, working_dir)

                else:
                    # this branch is the one existing line now uncovered:
                    # the test suite never re-imports an extracted member
                    logger.debug(f"Skipping '{member}': already extracted")

    logger.info("Data written into database")

    logger.info(f"{Path(__file__).name} ended")


if __name__ == '__main__':
    # the lines below run only as a script and are not covered by tests
    log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    logging.basicConfig(level=logging.INFO, format=log_fmt)

    # connect to database
    global_connection()

    main()
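For reference, main() expects a semicolon-separated description file whose first column is a row id (dropped on read) and whose remaining, sanitized column names are passed straight to the Dataset constructor; the only column the script itself relies on is 'file', the archive name searched for under data/raw. A minimal sketch of such a file and one possible invocation follows; the extra column names (country, species) and the file names are hypothetical illustrations, not taken from this build:

    id;file;country;species
    1;MERINO_INIA_UY.zip;Uruguay;Sheep

    # hypothetical invocation from the project root; -m keeps the
    # src.features imports resolvable
    python -m src.data.import_datasets \
        --types genotypes background \
        data/raw/description.csv

The --types option takes exactly two values (nargs=2 in the click declaration), for example 'genotypes background', while the positional argument is the description file path.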