9189630163

Committed 22 May 2024 10:26AM CUT coverage: 94.311%. Remained the same

Build # 9189630163

Build Type

Pull #121

github

Committed by

web-flow

Commit Message

Merge 389cf0ae7 into fcdb3ce6a

Pull Request #121: ⬆️ Bump jinja2 from 3.1.2 to 3.1.4

Run Details

2984 of 3164 relevant lines covered (94.31%)

0.94 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

87.67

/src/features/utils.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 15 14:13:51 2021

@author: Paolo Cozzi <paolo.cozzi@ibba.cnr.it>
"""

import io
import re
import gzip
import logging
import pathlib
import collections

# Get an instance of a logger
logger = logging.getLogger(__name__)


def sanitize(
        word: str,
        chars=('.', ",", "-", "/", "#"),
        check_mongoengine=True) -> str:
    """Sanitize a word by removing unwanted characters and lowercase it.

    Every character listed in ``chars`` is turned into a space, then each
    run of whitespace is collapsed into a single underscore and the result
    is lowercased. A single leading underscore, if any, is dropped.

    Args:
        word (str): the word to sanitize
        chars (iterable of str): the characters to remove. They are always
            matched literally, even when they have a special meaning inside
            a regular expression character class (``-``, ``^``, ``]``, ...)
        check_mongoengine (bool): true to add '_' after a mongoengine reserved
            word ('size' or 'type')

    Returns:
        str: the sanitized word
    """

    # escape every character: an unescaped "-", "^" or "]" would change the
    # meaning of the character class (range, negation, early termination)
    pattern = "".join(re.escape(char) for char in chars)

    # remove unwanted characters from word by putting spaces. An empty
    # character class is not a valid regex, so skip this step without chars
    tmp = re.sub(r'[%s]' % pattern, ' ', word) if pattern else word

    # remove spaces from column name and lowercase all
    sanitized = re.sub(r"\s+", "_", tmp).lower()

    if check_mongoengine and sanitized in ('size', 'type'):
        sanitized += "_"

    # remove starting sanitized char (can't be used with namedtuple)
    if sanitized.startswith("_"):
        sanitized = sanitized[1:]

    return sanitized


def camelCase(string: str) -> str:
    """Convert a string into camel case

    Runs of '_', '-' and '.' are treated as word separators: every word is
    title-cased, the words are joined together and the first character of
    the result is lowercased.

    Args:
        string (str): the string to convert

    Returns:
        str: the camel case version of the string. An empty string is
        returned when the input is empty or made only of separators
    """

    string = re.sub(r"(_|-|\.)+", " ", string).title().replace(" ", "")

    # slicing (instead of indexing) is safe when nothing is left to convert
    return string[:1].lower() + string[1:]


class TqdmToLogger(io.StringIO):
    """
        Stream-like object, meant for TQDM, which sends what is written to
        it to a logger instead of the StdOut.

        ``write`` only remembers the last (stripped) message, ``flush``
        emits that message through the logger at the configured level.
    """
    # class-level defaults, replaced on each instance by __init__ and write
    logger = None
    level = None
    buf = ''

    def __init__(self, logger, level=None):
        """Bind the stream to ``logger``; a falsy ``level`` means INFO."""
        super().__init__()
        self.logger = logger
        self.level = level if level else logging.INFO

    def write(self, buf):
        """Keep the received text, without surrounding blanks/newlines."""
        cleaned = buf.strip('\r\n\t ')
        self.buf = cleaned

    def flush(self):
        """Log the last text received by ``write``."""
        self.logger.log(self.level, self.buf)


def get_project_dir() -> pathlib.PosixPath:
    """Return the smarter project base dir.

    The directory is found by walking up from the file of this module: it
    is the third ancestor of the module path (``parents[2]``).

    Returns:
        pathlib.PosixPath: the smarter project base dir
    """
    module_path = pathlib.Path(__file__)
    ancestors = module_path.parents

    # index 0 is the module folder, index 2 is two more levels up
    return ancestors[2]


def get_raw_dir() -> pathlib.PosixPath:
    """Return the smarter raw data dir ("data/raw" under the project dir)

    Returns:
        pathlib.PosixPath: the smarter data raw directory
    """

    project_dir = get_project_dir()
    return project_dir.joinpath("data/raw")


def get_interim_dir() -> pathlib.PosixPath:
    """Return the smarter temporary data dir ("data/interim" under the
    project dir)

    Returns:
        pathlib.PosixPath: the smarter data temporary dir
    """

    project_dir = get_project_dir()
    return project_dir.joinpath("data/interim")


def get_processed_dir() -> pathlib.PosixPath:
    """Return the smarter final processed data dir ("data/processed" under
    the project dir)

    Returns:
        pathlib.PosixPath: the smarter data final processed dir
    """

    project_dir = get_project_dir()
    return project_dir.joinpath("data/processed")


def text_or_gzip_open(path: str, mode: str = None) -> io.TextIOWrapper:
    """Open a file which can be compressed or not. Returns file handle

    A path whose suffix is '.gz' is opened with ``gzip.open``, any other
    path with the builtin ``open``. When ``mode`` is not given (or is
    empty) text read mode is used: 'rt' for gzip files, 'r' otherwise.
    An explicit ``mode`` is passed through unchanged.
    """

    is_gzipped = pathlib.Path(path).suffix == '.gz'

    # plain file: nothing special to do
    if not is_gzipped:
        return open(path, mode=mode or 'r')

    # gzip.open defaults to binary, so text mode has to be asked explicitly
    gzip_mode = mode or 'rt'

    logger.debug(f"Gzip detected for {path}")
    return gzip.open(path, mode=gzip_mode)


def find_duplicates(header: list) -> list:
    """Find duplicate columns in list. Returns index to remove after the first
    occurrence

    The indexes are grouped by column, following the order in which each
    duplicated column is first met in ``header``; inside a group they are
    in ascending order.

    Args:
        header (list): a list like the header read from a CSV file

    Returns:
        list: a list of index (numeric)
    """

    # collect the positions of every column in a single pass, instead of
    # scanning the whole header again for each duplicated column
    positions = collections.defaultdict(list)

    for index, column in enumerate(header):
        positions[column].append(index)

    to_remove = []

    for indexes in positions.values():
        # track only from the 2nd occurrence (nothing for unique columns)
        to_remove += indexes[1:]

    return to_remove


def skip_comments(handle: io.TextIOWrapper, comment_char="#") -> (int, list):
    """
    Ignore comments lines from a open file handle. Return the stream position
    immediately after the comments and all the comment lines in a list.

    Lines are read from the current position of the handle and stripped:
    reading stops at the first line which doesn't start with ``comment_char``
    (a blank line or the end of the file also stop the search). That line
    has already been consumed from the handle when this function returns,
    so seek to the returned position to read it again.

    Parameters
    ----------
    handle : io.TextIOWrapper
        An open file handle.
    comment_char : str, optional
        The comment character used in file. The default is "#".

    Returns
    -------
    (int, list)
        The stream position after the comments and the ignored lines (stripped)
        as a list. Without comments, the position is the one the handle had
        when the function was called and the list is empty.
    """

    # track skipped lines
    skipped = []

    # without any comment, data starts exactly where the handle is now
    position = handle.tell()

    # read first line
    line = handle.readline().strip()

    # search for comments in file. An empty string means blank line or end
    # of file: in both cases there are no more comments to skip
    while line and line.startswith(comment_char):
        logger.debug(f"Skipping: {line}")
        skipped.append(line)

        # remember where the line following this comment begins
        position = handle.tell()

        # read another line
        line = handle.readline().strip()

    return position, skipped


class UnknownCountry:
    """Placeholder object to deal with an unknown country.

    Every instance carries the same fixed values in its ``name``,
    ``alpha_2``, ``alpha_3`` and ``numeric`` attributes.
    """

    def __init__(self):
        # no numeric code is set for the placeholder
        self.numeric = None

        # codes, from the longest to the shortest
        self.alpha_3 = "UNK"
        self.alpha_2 = "UN"

        self.name = "Unknown"

1	#!/usr/bin/env python3
2	# -*- coding: utf-8 -*-
3	"""	1✔
4	Created on Mon Mar 15 14:13:51 2021
5
6	@author: Paolo Cozzi <paolo.cozzi@ibba.cnr.it>
7	"""
8
9	import io	1✔
10	import re	1✔
11	import gzip	1✔
12	import logging	1✔
13	import pathlib	1✔
14	import collections	1✔
15
16	# Get an instance of a logger
17	logger = logging.getLogger(__name__)	1✔
18
19
20	def sanitize(	1✔
21	word: str,
22	chars=['.', ",", "-", "/", "#"],
23	check_mongoengine=True) -> str:
24	"""Sanitize a word by removing unwanted characters and lowercase it.
25
26	Args:
27	word (str): the word to sanitize
28	chars (list): a list of characters to remove
29	check_mongoengine (bool): true to add '_' after a mongoengine reserved
30	word
31
32	Returns:
33	str: the sanitized word
34	"""
35
36	# remove unwanted characters from word by putting spaces
37	pattern = "".join(chars)	1✔
38	tmp = re.sub(r'[%s]' % (pattern), ' ', word)	1✔
39
40	# remove spaces from column name and lowercase all
41	sanitized = re.sub(r"\s+", "_", tmp).lower()	1✔
42
43	if check_mongoengine:	1✔
44	if sanitized in ['size', 'type']:	1✔
45	sanitized += "_"	1✔
46
47	# remove starting sanitized char (can't be used with namedtuple)
48	if sanitized.startswith("_"):	1✔
49	sanitized = sanitized[1:]	1✔
50
51	return sanitized	1✔
52
53
54	def camelCase(string: str) -> str:	1✔
55	"""Convert a string into camel case
56
57	Args:
58	string (str): the string to convert
59
60	Returns:
61	str: the camel case version of the string
62	"""
63
64	string = re.sub(r"(_|-|\.)+", " ", string).title().replace(" ", "")	×
65	return string[0].lower() + string[1:]	×
66
67
68	class TqdmToLogger(io.StringIO):	1✔
69	"""
70	Output stream for TQDM which will output to logger module instead of
71	the StdOut.
72	"""
73	logger = None	1✔
74	level = None	1✔
75	buf = ''	1✔
76
77	def __init__(self, logger, level=None):	1✔
78	super(TqdmToLogger, self).__init__()	1✔
79	self.logger = logger	1✔
80	self.level = level or logging.INFO	1✔
81
82	def write(self, buf):	1✔
83	self.buf = buf.strip('\r\n\t ')	1✔
84
85	def flush(self):	1✔
86	self.logger.log(self.level, self.buf)	1✔
87
88
89	def get_project_dir() -> pathlib.PosixPath:	1✔
90	"""Return smarter project dir (which are three levels upper from the
91	module in which this function is stored)
92
93	Returns:
94	pathlib.PosixPath: the smarter project base dir
95	"""
96	return pathlib.Path(__file__).parents[2]	1✔
97
98
99	def get_raw_dir() -> pathlib.PosixPath:	1✔
100	"""Return smarter data raw dir
101
102	Returns:
103	pathlib.PosixPath: the smarter data raw directory
104	"""
105
106	return get_project_dir() / "data/raw"	×
107
108
109	def get_interim_dir() -> pathlib.PosixPath:	1✔
110	"""Return smarter data temporary dir
111
112	Returns:
113	pathlib.PosixPath: the smarter data temporary dir
114	"""
115
116	return get_project_dir() / "data/interim"	×
117
118
119	def get_processed_dir() -> pathlib.PosixPath:	1✔
120	"""Return smarter data processed dir (final processed data)
121
122	Returns:
123	pathlib.PosixPath: the smarter data final processed dir
124	"""
125
126	return get_project_dir() / "data/processed"	×
127
128
129	def text_or_gzip_open(path: str, mode: str = None) -> io.TextIOWrapper:	1✔
130	"""Open a file which can be compressed or not. Returns file handle"""
131
132	if pathlib.Path(path).suffix == '.gz':	1✔
133	if not mode:	×
134	mode = 'rt'	×
135
136	logger.debug(f"Gzip detected for {path}")	×
137	return gzip.open(path, mode=mode)	×
138
139	else:
140	if not mode:	1✔
141	mode = 'r'	1✔
142
143	return open(path, mode=mode)	1✔
144
145
146	def find_duplicates(header: list) -> list:	1✔
147	"""Find duplicate columns in list. Returns index to remove after the first
148	occurence
149
150	Args:
151	header (list): a list like the header read from a CSV file
152
153	Returns:
154	list: a list of index (numeric)
155	"""
156
157	to_remove = []	1✔
158
159	# count columns and find duplicates
160	counts = collections.Counter(header)	1✔
161	duplicated_cols = [key for key, value in counts.items() if value > 1]	1✔
162
163	# now iterate and get duplicates indexes
164	for duplicated in duplicated_cols:	1✔
165	# get all duplicated index
166	tmp = [i for i, col in enumerate(header) if col == duplicated]	1✔
167
168	# track only from the 2nd occurrence
169	to_remove += tmp[1:]	1✔
170
171	return to_remove	1✔
172
173
174	def skip_comments(handle: io.TextIOWrapper, comment_char="#") -> (int, list):	1✔
175	"""
176	Ignore comments lines from a open file handle. Return the stream position
177	immediately after the comments and all the comment lines in a list.
178
179	Parameters
180	----------
181	handle : io.TextIOWrapper
182	An open file handle.
183	comment_char : TYPE, optional
184	The comment character used in file. The default is "#".
185
186	Returns
187	-------
188	(int, list)
189	The stream position after the comments and the ignored lines as a list.
190	"""
191
192	# track skipped lines
193	skipped = list()	1✔
194
195	# read first line
196	line = handle.readline().strip()	1✔
197
198	# search for comments in file
199	while line[0] == "#":	1✔
200	logger.debug(f"Skipping: {line}")	1✔
201	skipped.append(line)	1✔
202	position = handle.tell()	1✔
203
204	# read another line
205	line = handle.readline().strip()	1✔
206
207	# the position returned is the one before the one I want
208	return position, skipped	1✔
209
210
211	class UnknownCountry():	1✔
212	"""Deal with unknown country"""
213
214	def __init__(self):	1✔
215	self.name = "Unknown"	1✔
216	self.alpha_2 = "UN"	1✔
217	self.alpha_3 = "UNK"	1✔
218	self.numeric = None	1✔

cnr-ibba / SMARTER-database / 9189630163

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous