9265419949

Committed 27 May 2024 09:15AM CUT coverage: 94.434%. Remained the same

Build # 9265419949

Build Type

push

github

Committed by

bunop

Commit Message

:bookmark: Bump version: 0.4.10.dev0 → 0.4.10

Run Details

1 of 1 new or added line in 1 file covered. (100.0%)

3071 of 3252 relevant lines covered (94.43%)

0.94 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

88.16

/src/features/utils.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 15 14:13:51 2021

@author: Paolo Cozzi <paolo.cozzi@ibba.cnr.it>
"""

import io
import re
import gzip
import logging
import pathlib
import collections
from typing import Tuple, List

from pycountry import countries

# Module-level logger, named after this module
logger = logging.getLogger(__name__)

# Manage custom countries: register an additional entry in the pycountry
# `countries` database when this module is imported, so that the english
# name "Turkey" is defined for the TR / TUR codes.
# NOTE(review): presumably required because the bundled pycountry data lists
# this country under a different name -- confirm against the installed version
countries.add_entry(
    alpha_2="TR", alpha_3="TUR", name="Turkey", numeric="792",
    official_name='Republic of Türkiye')


def sanitize(
        word: str,
        chars=['.', ",", "-", "/", "#"],
        check_mongoengine=True) -> str:
    """Sanitize a word by removing unwanted characters and lowercase it.

    Every character listed in ``chars`` is replaced by a space, then each run
    of whitespace is collapsed into a single underscore and the result is
    lowercased. A single leading underscore, if any, is dropped.

    Args:
        word (str): the word to sanitize
        chars (list): a list of characters to remove. Characters are treated
            literally (regex metacharacters such as ``^``, ``-`` or ``]``
            have no special meaning). An empty list removes nothing. The
            default list is only read, never modified.
        check_mongoengine (bool): true to add '_' after a mongoengine reserved
            word ('size' or 'type')

    Returns:
        str: the sanitized word
    """

    tmp = word

    # an empty character class is not a valid regex: skip the substitution
    if chars:
        # escape each character so that it can't alter the meaning of the
        # character class (e.g. a leading '^' would negate it, and a '-'
        # between two characters would define a range)
        pattern = "".join(re.escape(char) for char in chars)

        # remove unwanted characters from word by putting spaces
        tmp = re.sub(r'[%s]' % (pattern), ' ', word)

    # remove spaces from column name and lowercase all
    sanitized = re.sub(r"\s+", "_", tmp).lower()

    if check_mongoengine:
        if sanitized in ['size', 'type']:
            sanitized += "_"

    # remove starting sanitized char (can't be used with namedtuple)
    if sanitized.startswith("_"):
        sanitized = sanitized[1:]

    return sanitized


def camelCase(string: str) -> str:
    """Convert a string into camel case

    Runs of ``_``, ``-`` and ``.`` are treated as word separators (like
    spaces): each word is title-cased, words are joined together and the
    first character of the result is lowercased. Since ``str.title()`` is
    applied, characters following the first one of each word are lowercased.

    Args:
        string (str): the string to convert

    Returns:
        str: the camel case version of the string. An empty string is
            returned when the input is empty or made only by separators
    """

    joined = re.sub(r"(_|-|\.)+", " ", string).title().replace(" ", "")

    # slicing (instead of indexing) doesn't fail when nothing is left
    return joined[:1].lower() + joined[1:]


class TqdmToLogger(io.StringIO):
    """
        File-like stream for TQDM: messages written to it are sent to a
        logger instead of the StdOut.

        Only the last written message is kept, and it is emitted each time
        the stream is flushed.
    """
    # class-level defaults, replaced for each instance in __init__
    logger = None
    level = None
    # last message received through write()
    buf = ''

    def __init__(self, logger, level=None):
        """Store the target logger and the level used to emit messages
        (logging.INFO when no level is given)"""
        super().__init__()
        self.logger = logger
        self.level = level if level else logging.INFO

    def write(self, buf):
        """Remember the received message, without the surrounding carriage
        returns, newlines, tabs and spaces"""
        self.buf = buf.strip('\r\n\t ')

    def flush(self):
        """Send the last remembered message to the logger"""
        self.logger.log(self.level, self.buf)


def get_project_dir() -> pathlib.PosixPath:
    """Locate the smarter project directory, defined as the third ancestor
    of the file in which this function is stored

    Returns:
        pathlib.PosixPath: the smarter project base dir
    """
    module_path = pathlib.Path(__file__)

    # parents[0] is the folder containing this file, parents[2] is two
    # levels above it
    ancestors = module_path.parents
    return ancestors[2]


def get_raw_dir() -> pathlib.PosixPath:
    """Build the path of the ``data/raw`` folder inside the smarter
    project dir

    Returns:
        pathlib.PosixPath: the smarter data raw directory
    """

    base_dir = get_project_dir()
    return base_dir.joinpath("data/raw")


def get_interim_dir() -> pathlib.PosixPath:
    """Build the path of the ``data/interim`` folder (temporary data) inside
    the smarter project dir

    Returns:
        pathlib.PosixPath: the smarter data temporary dir
    """

    base_dir = get_project_dir()
    return base_dir.joinpath("data/interim")


def get_processed_dir() -> pathlib.PosixPath:
    """Build the path of the ``data/processed`` folder (final processed
    data) inside the smarter project dir

    Returns:
        pathlib.PosixPath: the smarter data final processed dir
    """

    base_dir = get_project_dir()
    return base_dir.joinpath("data/processed")


def text_or_gzip_open(path: str, mode: str = None) -> io.TextIOWrapper:
    """Open a plain or a gzip compressed file and return its handle.

    A file is considered compressed when its last suffix is ``.gz``.

    Args:
        path (str): the path of the file to open
        mode (str): the opening mode. When not provided, files are opened
            for reading in text mode ('rt' for compressed files, 'r'
            otherwise). A provided mode is passed as it is to the opening
            function

    Returns:
        io.TextIOWrapper: the open file handle
    """

    is_gzipped = pathlib.Path(path).suffix == '.gz'

    # plain file: rely on the builtin open
    if not is_gzipped:
        return open(path, mode=mode if mode else 'r')

    # compressed file: text mode needs to be requested explicitly
    gzip_mode = mode if mode else 'rt'

    logger.debug(f"Gzip detected for {path}")
    return gzip.open(path, mode=gzip_mode)


def find_duplicates(header: list) -> list:
    """Find the positions of repeated columns in a list, ignoring the first
    occurrence of each column.

    Indexes are grouped by column, following the order in which columns
    first appear in ``header``.

    Args:
        header (list): a list like the header read from a CSV file

    Returns:
        list: a list of index (numeric) to remove
    """

    # collect, for each column, every position in which it is found
    positions = {}

    for idx, column in enumerate(header):
        positions.setdefault(column, []).append(idx)

    # the first position of each column is kept: a column found only once
    # doesn't contribute at all
    return [idx for found in positions.values() for idx in found[1:]]


def skip_comments(
        handle: io.TextIOWrapper, comment_char="#") -> Tuple[int, List[str]]:
    """
    Ignore comments lines from a open file handle. Return the stream position
    immediately after the comments and all the comment lines in a list.

    Lines are read (and stripped) until the first one which doesn't start
    with ``comment_char``: an empty line, or the end of file, stops the
    search. The first non-comment line is consumed from the handle while
    searching: seek to the returned position to read it again.

    Parameters
    ----------
    handle : io.TextIOWrapper
        An open file handle.
    comment_char : str, optional
        The comment character used in file. The default is "#".

    Returns
    -------
    Tuple[int, List[str]]
        The stream position after the comments and the ignored lines as a
        list. When there are no comments, the position is the one the handle
        had when this function was called and the list is empty.
    """

    # track skipped lines
    skipped = list()

    # without any comment, the position after the comments is the current one
    position = handle.tell()

    # read first line
    line = handle.readline().strip()

    # search for comments in file (an empty line can't be a comment)
    while line and line.startswith(comment_char):
        logger.debug(f"Skipping: {line}")
        skipped.append(line)

        # the position after the last comment line read so far
        position = handle.tell()

        # read another line
        line = handle.readline().strip()

    # the handle is now one line past the returned position, since the
    # first non-comment line has been read
    return position, skipped


class UnknownCountry:
    """Placeholder describing a country which is not known.

    Each instance has the ``name``, ``alpha_2``, ``alpha_3`` and ``numeric``
    attributes.
    NOTE(review): presumably these mirror the attributes of a pycountry
    country record -- confirm against callers
    """

    def __init__(self):
        # there's no numeric code for an unknown country
        self.numeric = None
        self.name, self.alpha_2, self.alpha_3 = "Unknown", "UN", "UNK"

1	#!/usr/bin/env python3
2	# -*- coding: utf-8 -*-
3	"""	1✔
4	Created on Mon Mar 15 14:13:51 2021
5
6	@author: Paolo Cozzi <paolo.cozzi@ibba.cnr.it>
7	"""
8
9	import io	1✔
10	import re	1✔
11	import gzip	1✔
12	import logging	1✔
13	import pathlib	1✔
14	import collections	1✔
15	from typing import Tuple, List	1✔
16
17	from pycountry import countries	1✔
18
19	# Get an instance of a logger
20	logger = logging.getLogger(__name__)	1✔
21
22	# manage custom countries
23	# english name for turkey
24	countries.add_entry(	1✔
25	alpha_2="TR", alpha_3="TUR", name="Turkey", numeric="792",
26	official_name='Republic of Türkiye')
27
28
29	def sanitize(	1✔
30	word: str,
31	chars=['.', ",", "-", "/", "#"],
32	check_mongoengine=True) -> str:
33	"""Sanitize a word by removing unwanted characters and lowercase it.
34
35	Args:
36	word (str): the word to sanitize
37	chars (list): a list of characters to remove
38	check_mongoengine (bool): true to add '_' after a mongoengine reserved
39	word
40
41	Returns:
42	str: the sanitized word
43	"""
44
45	# remove unwanted characters from word by putting spaces
46	pattern = "".join(chars)	1✔
47	tmp = re.sub(r'[%s]' % (pattern), ' ', word)	1✔
48
49	# remove spaces from column name and lowercase all
50	sanitized = re.sub(r"\s+", "_", tmp).lower()	1✔
51
52	if check_mongoengine:	1✔
53	if sanitized in ['size', 'type']:	1✔
54	sanitized += "_"	1✔
55
56	# remove starting sanitized char (can't be used with namedtuple)
57	if sanitized.startswith("_"):	1✔
58	sanitized = sanitized[1:]	1✔
59
60	return sanitized	1✔
61
62
63	def camelCase(string: str) -> str:	1✔
64	"""Convert a string into camel case
65
66	Args:
67	string (str): the string to convert
68
69	Returns:
70	str: the camel case version of the string
71	"""
72
73	string = re.sub(r"(_|-|\.)+", " ", string).title().replace(" ", "")	×
74	return string[0].lower() + string[1:]	×
75
76
77	class TqdmToLogger(io.StringIO):	1✔
78	"""
79	Output stream for TQDM which will output to logger module instead of
80	the StdOut.
81	"""
82	logger = None	1✔
83	level = None	1✔
84	buf = ''	1✔
85
86	def __init__(self, logger, level=None):	1✔
87	super(TqdmToLogger, self).__init__()	1✔
88	self.logger = logger	1✔
89	self.level = level or logging.INFO	1✔
90
91	def write(self, buf):	1✔
92	self.buf = buf.strip('\r\n\t ')	1✔
93
94	def flush(self):	1✔
95	self.logger.log(self.level, self.buf)	1✔
96
97
98	def get_project_dir() -> pathlib.PosixPath:	1✔
99	"""Return smarter project dir (which are three levels upper from the
100	module in which this function is stored)
101
102	Returns:
103	pathlib.PosixPath: the smarter project base dir
104	"""
105	return pathlib.Path(__file__).parents[2]	1✔
106
107
108	def get_raw_dir() -> pathlib.PosixPath:	1✔
109	"""Return smarter data raw dir
110
111	Returns:
112	pathlib.PosixPath: the smarter data raw directory
113	"""
114
115	return get_project_dir() / "data/raw"	×
116
117
118	def get_interim_dir() -> pathlib.PosixPath:	1✔
119	"""Return smarter data temporary dir
120
121	Returns:
122	pathlib.PosixPath: the smarter data temporary dir
123	"""
124
125	return get_project_dir() / "data/interim"	×
126
127
128	def get_processed_dir() -> pathlib.PosixPath:	1✔
129	"""Return smarter data processed dir (final processed data)
130
131	Returns:
132	pathlib.PosixPath: the smarter data final processed dir
133	"""
134
135	return get_project_dir() / "data/processed"	×
136
137
138	def text_or_gzip_open(path: str, mode: str = None) -> io.TextIOWrapper:	1✔
139	"""Open a file which can be compressed or not. Returns file handle"""
140
141	if pathlib.Path(path).suffix == '.gz':	1✔
142	if not mode:	×
143	mode = 'rt'	×
144
145	logger.debug(f"Gzip detected for {path}")	×
146	return gzip.open(path, mode=mode)	×
147
148	else:
149	if not mode:	1✔
150	mode = 'r'	1✔
151
152	return open(path, mode=mode)	1✔
153
154
155	def find_duplicates(header: list) -> list:	1✔
156	"""Find duplicate columns in list. Returns index to remove after the first
157	occurence
158
159	Args:
160	header (list): a list like the header read from a CSV file
161
162	Returns:
163	list: a list of index (numeric)
164	"""
165
166	to_remove = []	1✔
167
168	# count columns and find duplicates
169	counts = collections.Counter(header)	1✔
170	duplicated_cols = [key for key, value in counts.items() if value > 1]	1✔
171
172	# now iterate and get duplicates indexes
173	for duplicated in duplicated_cols:	1✔
174	# get all duplicated index
175	tmp = [i for i, col in enumerate(header) if col == duplicated]	1✔
176
177	# track only from the 2nd occurrence
178	to_remove += tmp[1:]	1✔
179
180	return to_remove	1✔
181
182
183	def skip_comments(	1✔
184	handle: io.TextIOWrapper, comment_char="#") -> Tuple[int, List[str]]:
185	"""
186	Ignore comments lines from a open file handle. Return the stream position
187	immediately after the comments and all the comment lines in a list.
188
189	Parameters
190	----------
191	handle : io.TextIOWrapper
192	An open file handle.
193	comment_char : TYPE, optional
194	The comment character used in file. The default is "#".
195
196	Returns
197	-------
198	Tuple[int, List[str]]
199	The stream position after the comments and the ignored lines as a list.
200	"""
201
202	# track skipped lines
203	skipped = list()	1✔
204
205	# read first line
206	line = handle.readline().strip()	1✔
207
208	# search for comments in file
209	while line[0] == "#":	1✔
210	logger.debug(f"Skipping: {line}")	1✔
211	skipped.append(line)	1✔
212	position = handle.tell()	1✔
213
214	# read another line
215	line = handle.readline().strip()	1✔
216
217	# the position returned is the one before the one I want
218	return position, skipped	1✔
219
220
221	class UnknownCountry():	1✔
222	"""Deal with unknown country"""
223
224	def __init__(self):	1✔
225	self.name = "Unknown"	1✔
226	self.alpha_2 = "UN"	1✔
227	self.alpha_3 = "UNK"	1✔
228	self.numeric = None	1✔

cnr-ibba / SMARTER-database / 9265419949

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous