• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

cnr-ibba / SMARTER-database / 9265419949

27 May 2024 09:15AM CUT coverage: 94.434%. Remained the same
9265419949

push

github

bunop
:bookmark: Bump version: 0.4.10.dev0 → 0.4.10

1 of 1 new or added line in 1 file covered. (100.0%)

3071 of 3252 relevant lines covered (94.43%)

0.94 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

88.16
/src/features/utils.py
1
#!/usr/bin/env python3
2
# -*- coding: utf-8 -*-
3
"""
1✔
4
Created on Mon Mar 15 14:13:51 2021
5

6
@author: Paolo Cozzi <paolo.cozzi@ibba.cnr.it>
7
"""
8

9
import io
1✔
10
import re
1✔
11
import gzip
1✔
12
import logging
1✔
13
import pathlib
1✔
14
import collections
1✔
15
from typing import Tuple, List
1✔
16

17
from pycountry import countries
1✔
18

19
# Get an instance of a logger
20
logger = logging.getLogger(__name__)
1✔
21

22
# manage custom countries
23
# register Turkey under its English name (official name: Republic of Türkiye)
24
countries.add_entry(
1✔
25
    alpha_2="TR", alpha_3="TUR", name="Turkey", numeric="792",
26
    official_name='Republic of Türkiye')
27

28

29
def sanitize(
        word: str,
        chars=('.', ",", "-", "/", "#"),
        check_mongoengine=True) -> str:
    """Sanitize a word by removing unwanted characters and lowercase it.

    Every character listed in ``chars`` is replaced by a space, then each run
    of whitespace is collapsed into a single underscore and the result is
    lowercased. One leading underscore, if present, is dropped.

    Args:
        word (str): the word to sanitize
        chars (iterable of str): the characters to remove. They are always
            matched literally, even when they have a meaning in a regular
            expression (like '-', '^' or ']')
        check_mongoengine (bool): true to add '_' after a mongoengine reserved
            word

    Returns:
        str: the sanitized word
    """

    # escape every character: joined verbatim inside a regex character class,
    # "-" would define a range and "^" or "]" would change the class meaning
    pattern = "".join(re.escape(char) for char in chars)

    # remove unwanted characters from word by putting spaces. An empty
    # character class is not a valid regex, so skip it when chars is empty
    tmp = re.sub(r'[%s]' % (pattern), ' ', word) if pattern else word

    # remove spaces from column name and lowercase all
    sanitized = re.sub(r"\s+", "_", tmp).lower()

    if check_mongoengine and sanitized in ('size', 'type'):
        sanitized += "_"

    # remove starting sanitized char (can't be used with namedtuple)
    if sanitized.startswith("_"):
        sanitized = sanitized[1:]

    return sanitized
61

62

63
def camelCase(string: str) -> str:
    """Convert a string into camel case

    Runs of '_', '-' and '.' are treated as word separators: every word is
    capitalized (with ``str.title``), the words are joined together and the
    first character of the result is lowercased.

    Args:
        string (str): the string to convert

    Returns:
        str: the camel case version of the string (an empty string when the
        input is empty or made only by separators)
    """

    string = re.sub(r"(_|-|\.)+", " ", string).title().replace(" ", "")

    # slicing (instead of indexing) doesn't fail when nothing is left
    return string[:1].lower() + string[1:]
75

76

77
class TqdmToLogger(io.StringIO):
    """
        Output stream for TQDM which will output to logger module instead of
        the StdOut.

        Only the most recent chunk received by ``write`` is kept (stripped
        from surrounding whitespaces): nothing is stored in the underlying
        StringIO buffer. ``flush`` sends that chunk to the logger.
    """
    # class level defaults, replaced on each instance by __init__ / write
    logger = None
    level = None
    buf = ''

    def __init__(self, logger, level=None):
        """
        Args:
            logger: the logger object which will receive the messages
            level: the logging level to use (logging.INFO when not set)
        """
        super().__init__()
        self.logger = logger
        self.level = level or logging.INFO

    def write(self, buf):
        """Track the last received chunk, dropping surrounding whitespaces.

        Returns:
            int: the number of characters received, as expected from the
            ``write`` method of a text stream
        """
        self.buf = buf.strip('\r\n\t ')
        return len(buf)

    def flush(self):
        """Log the last tracked chunk using the configured level"""
        self.logger.log(self.level, self.buf)
96

97

98
def get_project_dir() -> pathlib.Path:
    """Return smarter project dir (which is the third ancestor directory of
    the module in which this function is stored)

    Returns:
        pathlib.Path: the smarter project base dir (the concrete class
        depends on the platform, e.g. PosixPath or WindowsPath)
    """
    return pathlib.Path(__file__).parents[2]
106

107

108
def get_raw_dir() -> pathlib.Path:
    """Return smarter data raw dir

    The path is built relative to the project dir; no check is made on its
    existence.

    Returns:
        pathlib.Path: the smarter data raw directory
    """

    return get_project_dir() / "data" / "raw"
116

117

118
def get_interim_dir() -> pathlib.Path:
    """Return smarter data temporary dir

    The path is built relative to the project dir; no check is made on its
    existence.

    Returns:
        pathlib.Path: the smarter data temporary dir
    """

    return get_project_dir() / "data" / "interim"
126

127

128
def get_processed_dir() -> pathlib.Path:
    """Return smarter data processed dir (final processed data)

    The path is built relative to the project dir; no check is made on its
    existence.

    Returns:
        pathlib.Path: the smarter data final processed dir
    """

    return get_project_dir() / "data" / "processed"
136

137

138
def text_or_gzip_open(path: str, mode: str = None) -> io.TextIOWrapper:
    """Open a file which can be compressed or not. Returns file handle

    A file is considered compressed when its name ends with '.gz' (case
    insensitive). Files are opened for reading in text mode unless a
    different mode is provided.

    Args:
        path (str): the path of the file to open
        mode (str): the mode used to open the file. For compressed files, a
            mode which is neither text ('t') nor binary ('b') is opened in
            text mode, as the ``open`` builtin does with regular files

    Returns:
        io.TextIOWrapper: an open file handle (a binary stream when a binary
        mode is explicitly requested)
    """

    if pathlib.Path(path).suffix.lower() == '.gz':
        if not mode:
            mode = 'rt'

        elif 'b' not in mode and 't' not in mode:
            # gzip.open treats modes like 'r' or 'w' as binary, while open
            # treats them as text: be explicit to get the same stream type
            mode += 't'

        logger.debug(f"Gzip detected for {path}")
        return gzip.open(path, mode=mode)

    if not mode:
        mode = 'r'

    return open(path, mode=mode)
153

154

155
def find_duplicates(header: list) -> list:
    """Find duplicate columns in list. Returns index to remove after the first
    occurrence

    Indexes are grouped by column, following the order in which each
    duplicated column is first met in header: for example
    ``["a", "b", "b", "a"]`` returns ``[3, 2]``.

    Args:
        header (list): a list like the header read from a CSV file

    Returns:
        list: a list of index (numeric)
    """

    # collect the positions of each column in a single pass (dictionaries
    # keep the insertion order, so columns are sorted by first occurrence)
    positions = collections.defaultdict(list)

    for i, col in enumerate(header):
        positions[col].append(i)

    to_remove = []

    for indexes in positions.values():
        # track only from the 2nd occurrence (nothing for unique columns)
        to_remove += indexes[1:]

    return to_remove
181

182

183
def skip_comments(
        handle: io.TextIOWrapper, comment_char="#") -> Tuple[int, List[str]]:
    """
    Ignore comments lines from a open file handle. Return the stream position
    immediately after the comments and all the comment lines in a list.

    Lines are read (and stripped) from the current stream position until the
    first line which is empty or doesn't start with the comment character.
    That line is consumed from the handle: seek to the returned position in
    order to read it again.

    Parameters
    ----------
    handle : io.TextIOWrapper
        An open file handle.
    comment_char : str, optional
        The comment character used in file. The default is "#".

    Returns
    -------
    Tuple[int, List[str]]
        The stream position after the comments and the ignored lines as a list.
        When there are no comments, the position is the one the handle had
        when calling this function.
    """

    # track skipped lines
    skipped = list()

    # the position to return when no comment line is found
    position = handle.tell()

    # read first line
    line = handle.readline().strip()

    # search for comments in file. An empty line (a blank line or the end of
    # file) is not a comment and ends the search
    while line and line.startswith(comment_char):
        logger.debug(f"Skipping: {line}")
        skipped.append(line)

        # the stream is now immediately after the last comment read
        position = handle.tell()

        # read another line
        line = handle.readline().strip()

    return position, skipped
219

220

221
class UnknownCountry:
    """Placeholder object describing a country which is not known.

    Instances expose the ``name``, ``alpha_2``, ``alpha_3`` and ``numeric``
    attributes, filled with fixed values.
    """

    def __init__(self):
        # fixed placeholder values; there is no numeric code for this country
        self.name, self.alpha_2, self.alpha_3, self.numeric = (
            "Unknown", "UN", "UNK", None)
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc