• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

nielstron / quantulum3 / 879

pending completion
879

cron

travis-ci-com

web-flow
Merge pull request #216 from nielstron/dev

Bump Version

42 of 42 new or added lines in 6 files covered. (100.0%)

1415 of 1452 relevant lines covered (97.45%)

4.87 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

99.21
/quantulum3/regex.py
1
#!/usr/bin/env python
2
# -*- coding: utf-8 -*-
3
"""
5✔
4
:mod:`Quantulum` regex functions.
5
"""
6

7
import re
5✔
8

9
from . import language, load
5✔
10
from .load import cached
5✔
11

12

13
###############################################################################
14
@cached
def _get_regex(lang="en_US"):
    """
    Resolve the language-specific regex module.

    :param lang: language code handed to ``language.get``; defaults to
        ``"en_US"``
    :return: whatever ``language.get("regex", lang)`` yields for that language
    """
    regex_module = language.get("regex", lang)
    return regex_module
5✔
22

23

24
###############################################################################
25
def units(lang="en_US"):
    """
    Return the ``UNITS`` definition of the regex module for *lang*.

    :param lang: language code, defaults to ``"en_US"``
    :return: the module's ``UNITS`` attribute, unchanged
    """
    regex_module = _get_regex(lang)
    return regex_module.UNITS
5✔
27

28

29
def tens(lang="en_US"):
    """
    Return the ``TENS`` definition of the regex module for *lang*.

    :param lang: language code, defaults to ``"en_US"``
    :return: the module's ``TENS`` attribute, unchanged
    """
    regex_module = _get_regex(lang)
    return regex_module.TENS
5✔
31

32

33
def scales(lang="en_US"):
    """
    Return the ``SCALES`` definition of the regex module for *lang*.

    :param lang: language code, defaults to ``"en_US"``
    :return: the module's ``SCALES`` attribute, unchanged
    """
    regex_module = _get_regex(lang)
    return regex_module.SCALES
5✔
35

36

37
def decimals(lang="en_US"):
    """
    Return the ``DECIMALS`` definition of the regex module for *lang*.

    :param lang: language code, defaults to ``"en_US"``
    :return: the module's ``DECIMALS`` attribute, unchanged
    """
    regex_module = _get_regex(lang)
    return regex_module.DECIMALS
5✔
39

40

41
def miscnum(lang="en_US"):
    """
    Return the ``MISCNUM`` definition of the regex module for *lang*.

    :param lang: language code, defaults to ``"en_US"``
    :return: the module's ``MISCNUM`` attribute, unchanged
    """
    regex_module = _get_regex(lang)
    return regex_module.MISCNUM
5✔
43

44

45
def powers(lang="en_US"):
    """
    Return the ``POWERS`` definition of the regex module for *lang*.

    :param lang: language code, defaults to ``"en_US"``
    :return: the module's ``POWERS`` attribute, unchanged
    """
    regex_module = _get_regex(lang)
    return regex_module.POWERS
5✔
47

48

49
def negatives(lang="en_US"):
    """
    Return the ``NEGATIVES`` definition of the regex module for *lang*.

    :param lang: language code, defaults to ``"en_US"``
    :return: the module's ``NEGATIVES`` attribute, unchanged
    """
    regex_module = _get_regex(lang)
    return regex_module.NEGATIVES
5✔
51

52

53
def exponents_regex(lang="en_US"):
    """
    Return the ``EXPONENTS_REGEX`` definition of the regex module for *lang*.

    :param lang: language code, defaults to ``"en_US"``
    :return: the module's ``EXPONENTS_REGEX`` attribute, unchanged
    """
    regex_module = _get_regex(lang)
    return regex_module.EXPONENTS_REGEX
5✔
55

56

57
@cached
def ranges(lang="en_US"):
    """
    Collect the range separators for *lang*.

    :param lang: language code, defaults to ``"en_US"``
    :return: a new set holding ``"-"`` plus every entry of the language
        module's ``RANGES``
    """
    return {"-"}.union(_get_regex(lang).RANGES)
5✔
62

63

64
@cached
def uncertainties(lang="en_US"):
    """
    Collect the uncertainty markers for *lang*.

    :param lang: language code, defaults to ``"en_US"``
    :return: a new set holding the two built-in plus/minus spellings plus
        every entry of the language module's ``UNCERTAINTIES``
    """
    builtin_markers = {r"\+/-", r"±"}
    return builtin_markers.union(_get_regex(lang).UNCERTAINTIES)
5✔
69

70

71
###############################################################################
72
@cached
def numberwords(lang="en_US"):
    """
    Build the lookup table of number words for *lang*.

    The table starts from ``miscnum(lang)`` and is then extended, in this
    order, with the negatives, units, tens, scales and decimals of the
    language; later entries overwrite earlier ones for the same word.
    Every value written here is a 2-tuple (presumably a scale factor and an
    increment -- confirm against the code that consumes the table).

    :param lang: language code, defaults to ``"en_US"``
    :return: dict mapping each word to its 2-tuple
    """
    table = dict(miscnum(lang))

    table.update({word: (-1, 0) for word in negatives(lang)})
    # The position of a word in the list is the number it stands for.
    table.update({word: (1, position) for position, word in enumerate(units(lang))})
    table.update(
        {word: (1, position * 10) for position, word in enumerate(tens(lang))}
    )
    # Position 0 yields 10 ** 2; every later position yields 10 ** (3 * position).
    table.update(
        {
            word: (10 ** (position * 3 or 2), 0)
            for position, word in enumerate(scales(lang))
        }
    )
    # Decimal words are registered in singular and pluralized form.
    for word, factor in decimals(lang).items():
        entry = (factor, 0)
        table[word] = entry
        table[load.pluralize(word, lang=lang)] = entry

    return table
5✔
95

96

97
@cached
def numberwords_regex(lang="en_US"):
    """
    Build an alternation that matches any number word known for *lang*.

    Each non-empty key of ``numberwords(lang)`` becomes one alternative,
    wrapped in lookarounds so that it only matches when delimited by a
    non-word character or by the start/end of the text.

    The words are passed through ``re.escape`` so they are always matched
    literally: the result is embedded in a pattern that ``text_pattern_reg``
    compiles with ``re.VERBOSE``, where an unescaped space would be ignored
    and an unescaped ``#`` would start a comment.

    :param lang: language code, defaults to ``"en_US"``
    :return: the alternation as a pattern string (not compiled)
    """
    return r"|".join(
        r"((?<=\W)|^)%s((?=\W)|$)" % re.escape(word)
        for word in numberwords(lang)
        if word  # empty placeholder entries must not become alternatives
    )
5✔
103

104

105
###############################################################################
106
def suffixes(lang="en_US"):
    """
    Return the ``SUFFIXES`` definition of the regex module for *lang*.

    :param lang: language code, defaults to ``"en_US"``
    :return: the module's ``SUFFIXES`` attribute, unchanged
    """
    regex_module = _get_regex(lang)
    return regex_module.SUFFIXES
5✔
108

109

110
def unicode_superscript():
    """
    Map every Unicode superscript digit to its plain ASCII digit.

    :return: a freshly built dict, ordered from the superscript one to the
        superscript nine, followed by the superscript zero
    """
    superscript_digits = "¹²³⁴⁵⁶⁷⁸⁹⁰"
    plain_digits = "1234567890"
    return dict(zip(superscript_digits, plain_digits))
5✔
124

125

126
def unicode_superscript_regex():
    """
    Return all superscript digits as one escaped string.

    The characters come from the keys of ``unicode_superscript()`` and are
    concatenated in that order before being passed through ``re.escape``.

    :return: escaped string of superscript digit characters
    """
    superscript_chars = "".join(unicode_superscript())
    return re.escape(superscript_chars)
5✔
128

129

130
def unicode_fractions():
    """
    Map every supported Unicode vulgar fraction to its ``"n/d"`` spelling.

    :return: a freshly built dict from fraction character to ASCII fraction
    """
    fraction_table = (
        ("¼", "1/4"),
        ("½", "1/2"),
        ("¾", "3/4"),
        ("⅐", "1/7"),
        ("⅑", "1/9"),
        ("⅒", "1/10"),
        ("⅓", "1/3"),
        ("⅔", "2/3"),
        ("⅕", "1/5"),
        ("⅖", "2/5"),
        ("⅗", "3/5"),
        ("⅘", "4/5"),
        ("⅙", "1/6"),
        ("⅚", "5/6"),
        ("⅛", "1/8"),
        ("⅜", "3/8"),
        ("⅝", "5/8"),
        ("⅞", "7/8"),
    )
    return dict(fraction_table)
5✔
152

153

154
def unicode_fractions_regex():
    """
    Return all Unicode fraction characters as one escaped string.

    The characters come from the keys of ``unicode_fractions()`` and are
    concatenated in that order before being passed through ``re.escape``.

    :return: escaped string of fraction characters
    """
    fraction_chars = "".join(unicode_fractions())
    return re.escape(fraction_chars)
5✔
156

157

158
@cached
def multiplication_operators(lang="en_US"):
    """
    Collect the multiplication operators for *lang*.

    :param lang: language code, defaults to ``"en_US"``
    :return: a new set holding the four built-in operators plus every entry
        of the language module's ``MULTIPLICATION_OPERATORS``
    """
    builtin_ops = {"*", " ", "·", "x"}
    return builtin_ops.union(_get_regex(lang).MULTIPLICATION_OPERATORS)
5✔
163

164

165
@cached
def multiplication_operators_regex(lang="en_US"):
    """
    Build an alternation of all multiplication operators for *lang*.

    Every operator is escaped with ``re.escape`` so it is matched literally.

    :param lang: language code, defaults to ``"en_US"``
    :return: the escaped operators joined with ``|``
    """
    escaped_ops = map(re.escape, multiplication_operators(lang))
    return r"|".join(escaped_ops)
5✔
168

169

170
@cached
def division_operators(lang="en_US"):
    """
    Collect the division operators for *lang*.

    :param lang: language code, defaults to ``"en_US"``
    :return: a new set holding ``"/"`` plus every entry of the language
        module's ``DIVISION_OPERATORS``
    """
    return {"/"}.union(_get_regex(lang).DIVISION_OPERATORS)
5✔
175

176

177
@cached
def grouping_operators(lang="en_US"):
    """
    Collect the digit-grouping separators for *lang*.

    :param lang: language code, defaults to ``"en_US"``
    :return: a new set holding a single space plus every entry of the
        language module's ``GROUPING_OPERATORS``
    """
    return {" "}.union(_get_regex(lang).GROUPING_OPERATORS)
5✔
182

183

184
def grouping_operators_regex(lang="en_US"):
    """
    Concatenate the grouping separators of *lang* into one string.

    The result is substituted for ``{grouping}`` inside a ``[...]``
    character class of ``NUM_PATTERN``; the separators are joined as-is,
    without escaping.

    :param lang: language code, defaults to ``"en_US"``
    :return: all grouping separators joined without a delimiter
    """
    separators = grouping_operators(lang)
    return "".join(separators)
5✔
186

187

188
@cached
def decimal_operators(lang="en_US"):
    """
    Return the ``DECIMAL_OPERATORS`` definition of the regex module for *lang*.

    :param lang: language code, defaults to ``"en_US"``
    :return: the module's ``DECIMAL_OPERATORS`` attribute, unchanged
    """
    regex_module = _get_regex(lang)
    return regex_module.DECIMAL_OPERATORS
5✔
191

192

193
@cached
def decimal_operators_regex(lang="en_US"):
    """
    Concatenate the decimal separators of *lang* into one string.

    The result is substituted for ``{decimal_operators}`` inside a ``[...]``
    character class of ``NUM_PATTERN``; the separators are joined as-is,
    without escaping.

    :param lang: language code, defaults to ``"en_US"``
    :return: all decimal separators joined without a delimiter
    """
    separators = decimal_operators(lang)
    return "".join(separators)
5✔
196

197

198
@cached
def operators(lang="en_US"):
    """
    Collect every unit operator known for *lang*.

    :param lang: language code, defaults to ``"en_US"``
    :return: a new set combining the multiplication and the division
        operators
    """
    all_ops = set(multiplication_operators(lang))
    return all_ops.union(division_operators(lang))
5✔
204

205

206
# Pattern for extracting a digit-based number
207
NUM_PATTERN = r"""
5✔
208
    (?{number}              # required number
209
        [+-]?                  #   optional sign
210
        (\.?\d+|[{unicode_fract}])     #   required digits or unicode fraction
211
        (?:[{grouping}]\d{{3}})*         #   allowed grouping
212
        (?{decimals}[{decimal_operators}]\d+)?    #   optional decimals
213
    )
214
    (?{scale}               # optional exponent
215
        (?:{multipliers})?                #   multiplicative operators
216
        (?{base}(E|e|\d+)\^?)    #   required exponent prefix
217
        (?{exponent}[+-]?\d+|[{superscript}]) # required exponent, superscript
218
                                              # or normal
219
    )?
220
    (?{fraction}             # optional fraction
221
        \ \d+/\d+|\ ?[{unicode_fract}]|/\d+
222
    )?
223

224
"""
225

226

227
# Pattern for extracting a digit-based number
228
def number_pattern():
    """
    Return the raw ``NUM_PATTERN`` template.

    The template is handed back unformatted, i.e. its ``{...}`` placeholders
    are still present.

    :return: the ``NUM_PATTERN`` string
    """
    raw_template = NUM_PATTERN
    return raw_template
×
230

231

232
@cached
def number_pattern_no_groups(lang="en_US"):
    """
    Format ``NUM_PATTERN`` for *lang* without named groups.

    All six group placeholders are filled with ``":"``, which turns the
    corresponding ``(?`` openers into non-capturing groups.

    :param lang: language code, defaults to ``"en_US"``
    :return: the formatted number pattern as a string (not compiled)
    """
    substitutions = {
        # group openers
        "number": ":",
        "decimals": ":",
        "scale": ":",
        "base": ":",
        "exponent": ":",
        "fraction": ":",
        # language- and unicode-dependent fragments
        "grouping": grouping_operators_regex(lang),
        "multipliers": multiplication_operators_regex(lang),
        "superscript": unicode_superscript_regex(),
        "unicode_fract": unicode_fractions_regex(),
        "decimal_operators": decimal_operators_regex(lang),
    }
    return NUM_PATTERN.format(**substitutions)
247

248

249
@cached
def number_pattern_groups(lang="en_US"):
    """
    Format ``NUM_PATTERN`` for *lang* with named groups.

    The six group placeholders are filled with ``P<name>``, so the resulting
    pattern exposes the named groups ``number``, ``decimals``, ``scale``,
    ``base``, ``exponent`` and ``fraction``.

    :param lang: language code, defaults to ``"en_US"``
    :return: the formatted number pattern as a string (not compiled)
    """
    substitutions = {
        # group openers
        "number": "P<number>",
        "decimals": "P<decimals>",
        "scale": "P<scale>",
        "base": "P<base>",
        "exponent": "P<exponent>",
        "fraction": "P<fraction>",
        # language- and unicode-dependent fragments
        "grouping": grouping_operators_regex(lang),
        "multipliers": multiplication_operators_regex(lang),
        "superscript": unicode_superscript_regex(),
        "unicode_fract": unicode_fractions_regex(),
        "decimal_operators": decimal_operators_regex(lang),
    }
    return NUM_PATTERN.format(**substitutions)
264

265

266
@cached
def range_pattern(lang="en_US"):
    """
    Build the pattern for a number that may be followed by a second number.

    The first number must not be preceded by a letter, digit, ``+``, ``.``
    or ``-``. The optional second number is introduced by one of the range
    separators or uncertainty markers of *lang*.

    :param lang: language code, defaults to ``"en_US"``
    :return: the verbose-mode pattern as a string (not compiled)
    """
    single_number = number_pattern_no_groups(lang)
    range_alternatives = "|".join(ranges(lang))
    uncertainty_alternatives = "|".join(uncertainties(lang))

    template = r"""                        # Pattern for a range of numbers

    (?:                                    # First number
        (?<![a-zA-Z0-9+.-])                # lookbehind, avoid "Area51"
        %s
    )
    (?:                                    # Second number
        \ ?(?:(?:-\ )?(?:%s|%s))\ ?  # Group for ranges or uncertainties
    %s)?

    """
    return template % (
        single_number,
        range_alternatives,
        uncertainty_alternatives,
        single_number,
    )
285

286

287
@cached
def text_pattern_reg(lang="en_US"):
    """
    Compile the language's ``TEXT_PATTERN`` template.

    The template is filled with the group-free number pattern and the number
    word alternation, then compiled verbose and case-insensitive.

    :param lang: language code, defaults to ``"en_US"``
    :return: the compiled regular expression object
    """
    template = _get_regex(lang).TEXT_PATTERN
    filled_pattern = template.format(
        number_pattern_no_groups=number_pattern_no_groups(lang),
        numberwords_regex=numberwords_regex(lang),
    )
    return re.compile(filled_pattern, re.VERBOSE | re.IGNORECASE)
5✔
295

296

297
###############################################################################
298
@cached
def units_regex(lang="en_US"):
    """
    Build the compiled regex that extracts a value with up to four units.

    Read a match through its named groups:

        prefix:     prefixed symbol (currencies, mainly)
        value:      numerical value (a single number or a range)
        operator1:  first operator
        unit1:      first unit
        operator2:  second operator
        unit2:      second unit
        operator3:  third operator
        unit3:      third unit
        operator4:  fourth operator
        unit4:      fourth unit

    Numeric indices are only dependable for group 0 (whole surface),
    group 1 (``prefix``) and group 2 (``value``). The number pattern and the
    lookahead behind every operator contain unnamed capturing groups, which
    shift the positions of all later groups.

    Example, 'I want $20/h'

        whole match: $20/h
        prefix: $
        value: 20
        operator1: /
        unit1: h
        operator2 .. unit4: None

    :param lang: language code, defaults to ``"en_US"``
    :return: the compiled, verbose and case-insensitive regex object
    """

    loaded_units = load.units(lang)

    # Longest keys first, so that in the alternations below a short key
    # cannot win over a longer key starting with the same characters.
    op_keys = sorted(operators(lang), key=len, reverse=True)
    unit_keys = sorted(
        list(loaded_units.surfaces.keys()) + list(loaded_units.symbols.keys()),
        key=len,
        reverse=True,
    )
    symbol_keys = sorted(loaded_units.prefix_symbols.keys(), key=len, reverse=True)

    exponent = exponents_regex(lang).format(superscripts=unicode_superscript_regex())

    all_ops = "|".join(re.escape(key) for key in op_keys)
    all_units = "|".join(re.escape(key) for key in unit_keys)
    all_symbols = "|".join(re.escape(key) for key in symbol_keys)

    # Every "Operator + Unit" line consumes five values: the operator, the
    # unit and exponent for its lookahead, and the unit and exponent again.
    pattern = r"""
        (?<!\w)                                     # "begin" of word
        (?P<prefix>(?:%s)(?![a-zA-Z]))?         # Currencies, mainly
        (?P<value>%s)-?                           # Number
        (?:(?P<operator1>%s(?=(%s)%s))?(?P<unit1>(?:%s)%s)?)    # Operator + Unit (1)
        (?:(?P<operator2>%s(?=(%s)%s))?(?P<unit2>(?:%s)%s)?)    # Operator + Unit (2)
        (?:(?P<operator3>%s(?=(%s)%s))?(?P<unit3>(?:%s)%s)?)    # Operator + Unit (3)
        (?:(?P<operator4>%s(?=(%s)%s))?(?P<unit4>(?:%s)%s)?)    # Operator + Unit (4)
        (?!\w)                                      # "end" of word
    """ % tuple(
        [all_symbols, range_pattern(lang)]
        + 4 * [all_ops, all_units, exponent, all_units, exponent]
    )

    return re.compile(pattern, re.VERBOSE | re.IGNORECASE)
5✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc