nielstron / quantulum3 / 879

pending completion

Build # 879

Build Type

cron

travis-ci-com

Committed by

web-flow

Commit Message

Merge pull request #216 from nielstron/dev

Bump Version

Run Details

42 of 42 new or added lines in 6 files covered. (100.0%)

1415 of 1452 relevant lines covered (97.45%)

4.87 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

99.21

/quantulum3/regex.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
:mod:`Quantulum` regex functions.
"""

import re

from . import language, load
from .load import cached


###############################################################################
@cached
def _get_regex(lang="en_US"):
    """
    Return the regex definitions of a language.

    :param lang: language code handed to ``language.get``
    :return: whatever ``language.get("regex", lang)`` yields; the rest of this
        file reads constants such as ``UNITS`` or ``TENS`` from it
    """
    regex_definitions = language.get("regex", lang)
    return regex_definitions


###############################################################################
def units(lang="en_US"):
    """Return the ``UNITS`` constant of the regex definitions for ``lang``."""
    definitions = _get_regex(lang)
    return definitions.UNITS


def tens(lang="en_US"):
    """Return the ``TENS`` constant of the regex definitions for ``lang``."""
    definitions = _get_regex(lang)
    return definitions.TENS


def scales(lang="en_US"):
    """Return the ``SCALES`` constant of the regex definitions for ``lang``."""
    definitions = _get_regex(lang)
    return definitions.SCALES


def decimals(lang="en_US"):
    """Return the ``DECIMALS`` constant of the regex definitions for ``lang``."""
    definitions = _get_regex(lang)
    return definitions.DECIMALS


def miscnum(lang="en_US"):
    """Return the ``MISCNUM`` constant of the regex definitions for ``lang``."""
    definitions = _get_regex(lang)
    return definitions.MISCNUM


def powers(lang="en_US"):
    """Return the ``POWERS`` constant of the regex definitions for ``lang``."""
    definitions = _get_regex(lang)
    return definitions.POWERS


def negatives(lang="en_US"):
    """Return the ``NEGATIVES`` constant of the regex definitions for ``lang``."""
    definitions = _get_regex(lang)
    return definitions.NEGATIVES


def exponents_regex(lang="en_US"):
    """
    Return the ``EXPONENTS_REGEX`` constant of the regex definitions for
    ``lang``.

    ``units_regex`` treats the value as a ``str.format`` template with a
    ``superscripts`` placeholder.
    """
    definitions = _get_regex(lang)
    return definitions.EXPONENTS_REGEX


@cached
def ranges(lang="en_US"):
    """
    Return a set holding ``"-"`` plus every entry of the language's ``RANGES``.

    ``range_pattern`` joins the entries with ``|`` without escaping them, so
    they are used as regex fragments.
    """
    markers = {"-"}
    language_markers = _get_regex(lang).RANGES
    markers.update(language_markers)
    return markers


@cached
def uncertainties(lang="en_US"):
    """
    Return a set holding the two built-in uncertainty markers plus every entry
    of the language's ``UNCERTAINTIES``.

    The built-in entries are written as regex fragments (note the escaped plus
    sign); ``range_pattern`` joins the set with ``|`` without escaping.
    """
    markers = {r"\+/-", r"±"}
    language_markers = _get_regex(lang).UNCERTAINTIES
    markers.update(language_markers)
    return markers


###############################################################################
@cached
def numberwords(lang="en_US"):
    """
    Build the number-word lookup table for ``lang``.

    The table starts as a copy of ``miscnum(lang)`` and is then extended, in
    this order (later entries overwrite earlier ones with the same key):

    * every word in ``negatives`` maps to ``(-1, 0)``
    * every word in ``units`` maps to ``(1, index)``
    * every word in ``tens`` maps to ``(1, index * 10)``
    * every word in ``scales`` maps to ``(10 ** exponent, 0)``, where the
      exponent is 2 for index 0 and ``index * 3`` otherwise
    * every word in ``decimals`` and its plural map to ``(factor, 0)``

    :return: dict mapping a word to a 2-tuple (presumably scale and increment
        -- confirm against the parser that consumes it)
    """

    table = dict(miscnum(lang))

    table.update((word, (-1, 0)) for word in negatives(lang))
    table.update((word, (1, value)) for value, word in enumerate(units(lang)))
    table.update(
        (word, (1, position * 10)) for position, word in enumerate(tens(lang))
    )
    table.update(
        # "rank * 3 or 2": the first scale word gets exponent 2, later ones 3, 6, ...
        (word, (10 ** (rank * 3 or 2), 0))
        for rank, word in enumerate(scales(lang))
    )

    # Singular and plural are inserted back to back to keep the key order.
    for word, factor in decimals(lang).items():
        entry = (factor, 0)
        table[word] = entry
        table[load.pluralize(word, lang=lang)] = entry

    return table


@cached
def numberwords_regex(lang="en_US"):
    """
    Return a regex alternation over all non-empty number words of ``lang``.

    Each word is wrapped so that it only matches when a non-word character or
    the string boundary sits on both sides. Words are inserted verbatim (not
    escaped), in the key order of ``numberwords(lang)``.
    """
    bounded = r"((?<=\W)|^)%s((?=\W)|$)"
    alternatives = [bounded % word for word in numberwords(lang) if word]
    return r"|".join(alternatives)


###############################################################################
def suffixes(lang="en_US"):
    """Return the ``SUFFIXES`` constant of the regex definitions for ``lang``."""
    definitions = _get_regex(lang)
    return definitions.SUFFIXES


def unicode_superscript():
    """
    Return a fresh dict mapping each Unicode superscript digit to the matching
    ASCII digit, keyed in the order 1-9 followed by 0.
    """
    superscript_digits = "¹²³⁴⁵⁶⁷⁸⁹⁰"
    ascii_digits = "1234567890"
    return dict(zip(superscript_digits, ascii_digits))


def unicode_superscript_regex():
    """
    Return all superscript digits concatenated into one ``re.escape``-d string.

    The callers in this file place the result inside a ``[...]`` character
    class or hand it to a format template.
    """
    superscript_chars = "".join(unicode_superscript())
    return re.escape(superscript_chars)


def unicode_fractions():
    """
    Return a fresh dict mapping each Unicode vulgar-fraction character to its
    ``"numerator/denominator"`` spelling.
    """
    glyph_to_text = (
        ("¼", "1/4"),
        ("½", "1/2"),
        ("¾", "3/4"),
        ("⅐", "1/7"),
        ("⅑", "1/9"),
        ("⅒", "1/10"),
        ("⅓", "1/3"),
        ("⅔", "2/3"),
        ("⅕", "1/5"),
        ("⅖", "2/5"),
        ("⅗", "3/5"),
        ("⅘", "4/5"),
        ("⅙", "1/6"),
        ("⅚", "5/6"),
        ("⅛", "1/8"),
        ("⅜", "3/8"),
        ("⅝", "5/8"),
        ("⅞", "7/8"),
    )
    return dict(glyph_to_text)


def unicode_fractions_regex():
    """
    Return all Unicode fraction characters concatenated into one
    ``re.escape``-d string.

    ``NUM_PATTERN`` places the result inside ``[...]`` character classes.
    """
    fraction_chars = "".join(unicode_fractions())
    return re.escape(fraction_chars)


@cached
def multiplication_operators(lang="en_US"):
    """
    Return a set holding ``"*"``, a space, ``"·"`` and ``"x"`` plus every entry
    of the language's ``MULTIPLICATION_OPERATORS``.
    """
    operators_ = {"*", " ", "·", "x"}
    language_operators = _get_regex(lang).MULTIPLICATION_OPERATORS
    operators_.update(language_operators)
    return operators_


@cached
def multiplication_operators_regex(lang="en_US"):
    """
    Return a regex alternation matching any multiplication operator of
    ``lang``.

    Every operator is passed through ``re.escape`` and the results are joined
    with ``|``.

    :param lang: language code forwarded to ``multiplication_operators``
    :return: alternation string such as ``a|b|c`` (without enclosing group)
    """
    # The operators come from a set of strings, whose iteration order is not
    # stable between interpreter runs. Sort them so the generated pattern is
    # reproducible, longest first so that an operator is never shadowed by a
    # shorter operator it starts with (units_regex orders operators the same
    # way), and alphabetically within one length to break ties.
    ordered = sorted(multiplication_operators(lang), key=lambda op: (-len(op), op))
    return r"|".join(re.escape(op) for op in ordered)


@cached
def division_operators(lang="en_US"):
    """
    Return a set holding ``"/"`` plus every entry of the language's
    ``DIVISION_OPERATORS``.
    """
    operators_ = {"/"}
    language_operators = _get_regex(lang).DIVISION_OPERATORS
    operators_.update(language_operators)
    return operators_


@cached
def grouping_operators(lang="en_US"):
    """
    Return a set holding a space plus every entry of the language's
    ``GROUPING_OPERATORS``.
    """
    separators = {" "}
    language_separators = _get_regex(lang).GROUPING_OPERATORS
    separators.update(language_separators)
    return separators


def grouping_operators_regex(lang="en_US"):
    """
    Return the grouping operators of ``lang`` concatenated into one string.

    The entries are joined verbatim (no escaping); ``NUM_PATTERN`` places the
    result inside a ``[...]`` character class.
    """
    separators = grouping_operators(lang)
    return "".join(separators)


@cached
def decimal_operators(lang="en_US"):
    """
    Return the ``DECIMAL_OPERATORS`` constant of the regex definitions for
    ``lang``.
    """
    definitions = _get_regex(lang)
    return definitions.DECIMAL_OPERATORS


@cached
def decimal_operators_regex(lang="en_US"):
    """
    Return the decimal operators of ``lang`` concatenated into one string.

    The entries are joined verbatim (no escaping); ``NUM_PATTERN`` places the
    result inside a ``[...]`` character class.
    """
    separators = decimal_operators(lang)
    return "".join(separators)


@cached
def operators(lang="en_US"):
    """
    Return a new set combining the multiplication and the division operators
    of ``lang``.
    """
    combined = set()
    for operator_source in (multiplication_operators, division_operators):
        combined.update(operator_source(lang))
    return combined


# Pattern for extracting a digit-based number.
#
# This is a ``str.format`` template, not a finished regex:
#   * {number}, {decimals}, {scale}, {base}, {exponent} and {fraction} follow
#     a "(?" and receive either ":" (non-capturing group, see
#     number_pattern_no_groups) or "P<name>" (named group, see
#     number_pattern_groups).
#   * {unicode_fract}, {grouping}, {decimal_operators} and {superscript} sit
#     inside [...] character classes; {multipliers} is an alternation.
#   * "{{3}}" is the escaped form of the literal regex quantifier "{3}".
# The layout and the inline "#" comments rely on verbose-mode regex syntax;
# the compile calls visible in this file all pass re.VERBOSE.
# NOTE(review): the two plain parenthesised groups "(\.?\d+|...)" and
# "(E|e|\d+)" stay capturing regardless of how the placeholders are filled.
NUM_PATTERN = r"""
    (?{number}              # required number
        [+-]?                  #   optional sign
        (\.?\d+|[{unicode_fract}])     #   required digits or unicode fraction
        (?:[{grouping}]\d{{3}})*         #   allowed grouping
        (?{decimals}[{decimal_operators}]\d+)?    #   optional decimals
    )
    (?{scale}               # optional exponent
        (?:{multipliers})?                #   multiplicative operators
        (?{base}(E|e|\d+)\^?)    #   required exponent prefix
        (?{exponent}[+-]?\d+|[{superscript}]) # required exponent, superscript
                                              # or normal
    )?
    (?{fraction}             # optional fraction
        \ \d+/\d+|\ ?[{unicode_fract}]|/\d+
    )?

"""


# Accessor for the raw template used to extract a digit-based number
def number_pattern():
    """
    Return ``NUM_PATTERN`` unchanged, i.e. with its format placeholders still
    unfilled.
    """
    raw_template = NUM_PATTERN
    return raw_template


@cached
def number_pattern_no_groups(lang="en_US"):
    """
    Return ``NUM_PATTERN`` formatted for ``lang`` with every group placeholder
    set to ``":"``, so those groups become non-capturing ``(?:...)`` groups.
    """
    group_placeholders = (
        "number",
        "decimals",
        "scale",
        "base",
        "exponent",
        "fraction",
    )
    fields = dict.fromkeys(group_placeholders, ":")
    fields.update(
        grouping=grouping_operators_regex(lang),
        multipliers=multiplication_operators_regex(lang),
        superscript=unicode_superscript_regex(),
        unicode_fract=unicode_fractions_regex(),
        decimal_operators=decimal_operators_regex(lang),
    )
    return NUM_PATTERN.format(**fields)


@cached
def number_pattern_groups(lang="en_US"):
    """
    Return ``NUM_PATTERN`` formatted for ``lang`` with every group placeholder
    set to ``P<name>``, producing the named groups ``number``, ``decimals``,
    ``scale``, ``base``, ``exponent`` and ``fraction``.
    """
    group_names = ("number", "decimals", "scale", "base", "exponent", "fraction")
    fields = {name: "P<%s>" % name for name in group_names}
    fields.update(
        grouping=grouping_operators_regex(lang),
        multipliers=multiplication_operators_regex(lang),
        superscript=unicode_superscript_regex(),
        unicode_fract=unicode_fractions_regex(),
        decimal_operators=decimal_operators_regex(lang),
    )
    return NUM_PATTERN.format(**fields)


@cached
def range_pattern(lang="en_US"):
    """
    Return the verbose-mode regex source for a number that may be followed by
    a second number, joined by a range or uncertainty marker.

    The first number must not directly follow a letter, a digit, ``+``, ``.``
    or ``-``. The second part is optional as a whole.
    """
    single_number = number_pattern_no_groups(lang)
    range_markers = "|".join(ranges(lang))
    uncertainty_markers = "|".join(uncertainties(lang))

    template = r"""                        # Pattern for a range of numbers

    (?:                                    # First number
        (?<![a-zA-Z0-9+.-])                # lookbehind, avoid "Area51"
        %s
    )
    (?:                                    # Second number
        \ ?(?:(?:-\ )?(?:%s|%s))\ ?  # Group for ranges or uncertainties
    %s)?

    """
    return template % (
        single_number,
        range_markers,
        uncertainty_markers,
        single_number,
    )


@cached
def text_pattern_reg(lang="en_US"):
    """
    Compile the language's ``TEXT_PATTERN`` template.

    The template receives the non-capturing number pattern and the number-word
    alternation of ``lang``, and is compiled with ``re.VERBOSE`` and
    ``re.IGNORECASE``.

    :return: compiled regular expression object
    """
    template = _get_regex(lang).TEXT_PATTERN
    substitutions = {
        "number_pattern_no_groups": number_pattern_no_groups(lang),
        "numberwords_regex": numberwords_regex(lang),
    }
    flags = re.VERBOSE | re.IGNORECASE
    return re.compile(template.format(**substitutions), flags)


###############################################################################
@cached
def units_regex(lang="en_US"):
    """
    Build the compiled regex matching a value followed by up to four
    operator/unit pairs.

    Read the match through its *named* groups:

        prefix:     prefixed symbol (currencies, mainly)
        value:      numerical value (a single number or a range)
        operator1:  first operator
        unit1:      first unit
        operator2:  second operator
        unit2:      second unit
        operator3:  third operator
        unit3:      third unit
        operator4:  fourth operator
        unit4:      fourth unit

    Group 0 is the whole surface. Do not rely on other numeric indices: each
    operator lookahead contains an unnamed capturing group (and the embedded
    sub-patterns may contribute more), so the named groups are not numbered
    consecutively.

    Example, 'I want $20/h'

        0 (whole surface): $20/h
        prefix: $
        value: 20
        operator1: /
        unit1: h
        operator2 .. unit4: None

    :param lang: language code used to load operators, units and exponents
    :return: pattern compiled with ``re.VERBOSE | re.IGNORECASE``
    """

    loaded_units = load.units(lang)

    # Longest alternatives first, so a key is never shadowed by a shorter key
    # that it starts with.
    op_keys = sorted(operators(lang), key=len, reverse=True)
    unit_keys = sorted(
        list(loaded_units.surfaces.keys()) + list(loaded_units.symbols.keys()),
        key=len,
        reverse=True,
    )
    symbol_keys = sorted(loaded_units.prefix_symbols.keys(), key=len, reverse=True)

    exponent = exponents_regex(lang).format(superscripts=unicode_superscript_regex())

    all_ops = "|".join(re.escape(i) for i in op_keys)
    all_units = "|".join(re.escape(i) for i in unit_keys)
    all_symbols = "|".join(re.escape(i) for i in symbol_keys)

    # Each operator is only accepted when a unit (plus optional exponent)
    # follows it, which is what the lookahead after every operator checks.
    pattern = r"""
        (?<!\w)                                     # "begin" of word
        (?P<prefix>(?:%s)(?![a-zA-Z]))?         # Currencies, mainly
        (?P<value>%s)-?                           # Number
        (?:(?P<operator1>%s(?=(%s)%s))?(?P<unit1>(?:%s)%s)?)    # Operator + Unit (1)
        (?:(?P<operator2>%s(?=(%s)%s))?(?P<unit2>(?:%s)%s)?)    # Operator + Unit (2)
        (?:(?P<operator3>%s(?=(%s)%s))?(?P<unit3>(?:%s)%s)?)    # Operator + Unit (3)
        (?:(?P<operator4>%s(?=(%s)%s))?(?P<unit4>(?:%s)%s)?)    # Operator + Unit (4)
        (?!\w)                                      # "end" of word
    """ % tuple(
        [all_symbols, range_pattern(lang)]
        + 4 * [all_ops, all_units, exponent, all_units, exponent]
    )
    regex = re.compile(pattern, re.VERBOSE | re.IGNORECASE)

    return regex

1	#!/usr/bin/env python
2	# -- coding: utf-8 --
3	"""	5✔
4	:mod:`Quantulum` regex functions.
5	"""
6
7	import re	5✔
8
9	from . import language, load	5✔
10	from .load import cached	5✔
11
12
13	###############################################################################
14	@cached	5✔
15	def _get_regex(lang="en_US"):	5✔
16	"""
17	Get regex module for given language
18	:param lang:
19	:return:
20	"""
21	return language.get("regex", lang)	5✔
22
23
24	###############################################################################
25	def units(lang="en_US"):	5✔
26	return _get_regex(lang).UNITS	5✔
27
28
29	def tens(lang="en_US"):	5✔
30	return _get_regex(lang).TENS	5✔
31
32
33	def scales(lang="en_US"):	5✔
34	return _get_regex(lang).SCALES	5✔
35
36
37	def decimals(lang="en_US"):	5✔
38	return _get_regex(lang).DECIMALS	5✔
39
40
41	def miscnum(lang="en_US"):	5✔
42	return _get_regex(lang).MISCNUM	5✔
43
44
45	def powers(lang="en_US"):	5✔
46	return _get_regex(lang).POWERS	5✔
47
48
49	def negatives(lang="en_US"):	5✔
50	return _get_regex(lang).NEGATIVES	5✔
51
52
53	def exponents_regex(lang="en_US"):	5✔
54	return _get_regex(lang).EXPONENTS_REGEX	5✔
55
56
57	@cached	5✔
58	def ranges(lang="en_US"):	5✔
59	ranges_ = {"-"}	5✔
60	ranges_.update(_get_regex(lang).RANGES)	5✔
61	return ranges_	5✔
62
63
64	@cached	5✔
65	def uncertainties(lang="en_US"):	5✔
66	uncertainties_ = {r"\+/-", r"±"}	5✔
67	uncertainties_.update(_get_regex(lang).UNCERTAINTIES)	5✔
68	return uncertainties_	5✔
69
70
71	###############################################################################
72	@cached	5✔
73	def numberwords(lang="en_US"):	5✔
74	"""
75	Convert number words to integers in a given text.
76	"""
77
78	numwords = {}	5✔
79
80	numwords.update(miscnum(lang))	5✔
81
82	for word in negatives(lang):	5✔
83	numwords[word] = (-1, 0)	5✔
84	for idx, word in enumerate(units(lang)):	5✔
85	numwords[word] = (1, idx)	5✔
86	for idx, word in enumerate(tens(lang)):	5✔
87	numwords[word] = (1, idx * 10)	5✔
88	for idx, word in enumerate(scales(lang)):	5✔
89	numwords[word] = (10 ** (idx * 3 or 2), 0)	5✔
90	for word, factor in decimals(lang).items():	5✔
91	numwords[word] = (factor, 0)	5✔
92	numwords[load.pluralize(word, lang=lang)] = (factor, 0)	5✔
93
94	return numwords	5✔
95
96
97	@cached	5✔
98	def numberwords_regex(lang="en_US"):	5✔
99	all_numbers = r"\|".join(	5✔
100	r"((?<=\W)\|^)%s((?=\W)\|$)" % i for i in list(numberwords(lang).keys()) if i
101	)
102	return all_numbers	5✔
103
104
105	###############################################################################
106	def suffixes(lang="en_US"):	5✔
107	return _get_regex(lang).SUFFIXES	5✔
108
109
110	def unicode_superscript():	5✔
111	uni_super = {	5✔
112	"¹": "1",
113	"²": "2",
114	"³": "3",
115	"⁴": "4",
116	"⁵": "5",
117	"⁶": "6",
118	"⁷": "7",
119	"⁸": "8",
120	"⁹": "9",
121	"⁰": "0",
122	}
123	return uni_super	5✔
124
125
126	def unicode_superscript_regex():	5✔
127	return re.escape("".join(list(unicode_superscript().keys())))	5✔
128
129
130	def unicode_fractions():	5✔
131	uni_frac = {	5✔
132	"¼": "1/4",
133	"½": "1/2",
134	"¾": "3/4",
135	"⅐": "1/7",
136	"⅑": "1/9",
137	"⅒": "1/10",
138	"⅓": "1/3",
139	"⅔": "2/3",
140	"⅕": "1/5",
141	"⅖": "2/5",
142	"⅗": "3/5",
143	"⅘": "4/5",
144	"⅙": "1/6",
145	"⅚": "5/6",
146	"⅛": "1/8",
147	"⅜": "3/8",
148	"⅝": "5/8",
149	"⅞": "7/8",
150	}
151	return uni_frac	5✔
152
153
154	def unicode_fractions_regex():	5✔
155	return re.escape("".join(list(unicode_fractions().keys())))	5✔
156
157
158	@cached	5✔
159	def multiplication_operators(lang="en_US"):	5✔
160	mul = {"*", " ", "·", "x"}	5✔
161	mul.update(_get_regex(lang).MULTIPLICATION_OPERATORS)	5✔
162	return mul	5✔
163
164
165	@cached	5✔
166	def multiplication_operators_regex(lang="en_US"):	5✔
167	return r"\|".join(r"%s" % re.escape(i) for i in multiplication_operators(lang))	5✔
168
169
170	@cached	5✔
171	def division_operators(lang="en_US"):	5✔
172	div = {"/"}	5✔
173	div.update(_get_regex(lang).DIVISION_OPERATORS)	5✔
174	return div	5✔
175
176
177	@cached	5✔
178	def grouping_operators(lang="en_US"):	5✔
179	grouping_ops = {" "}	5✔
180	grouping_ops.update(_get_regex(lang).GROUPING_OPERATORS)	5✔
181	return grouping_ops	5✔
182
183
184	def grouping_operators_regex(lang="en_US"):	5✔
185	return "".join(grouping_operators(lang))	5✔
186
187
188	@cached	5✔
189	def decimal_operators(lang="en_US"):	5✔
190	return _get_regex(lang).DECIMAL_OPERATORS	5✔
191
192
193	@cached	5✔
194	def decimal_operators_regex(lang="en_US"):	5✔
195	return "".join(decimal_operators(lang))	5✔
196
197
198	@cached	5✔
199	def operators(lang="en_US"):	5✔
200	ops = set()	5✔
201	ops.update(multiplication_operators(lang))	5✔
202	ops.update(division_operators(lang))	5✔
203	return ops	5✔
204
205
206	# Pattern for extracting a digit-based number
207	NUM_PATTERN = r"""	5✔
208	(?{number} # required number
209	[+-]? # optional sign
210	(\.?\d+\|[{unicode_fract}]) # required digits or unicode fraction
211	(?:[{grouping}]\d{{3}})* # allowed grouping
212	(?{decimals}[{decimal_operators}]\d+)? # optional decimals
213	)
214	(?{scale} # optional exponent
215	(?:{multipliers})? # multiplicative operators
216	(?{base}(E\|e\|\d+)\^?) # required exponent prefix
217	(?{exponent}[+-]?\d+\|[{superscript}]) # required exponent, superscript
218	# or normal
219	)?
220	(?{fraction} # optional fraction
221	\ \d+/\d+\|\ ?[{unicode_fract}]\|/\d+
222	)?
223
224	"""
225
226
227	# Pattern for extracting a digit-based number
228	def number_pattern():	5✔
229	return NUM_PATTERN	×
230
231
232	@cached	5✔
233	def number_pattern_no_groups(lang="en_US"):	5✔
234	return NUM_PATTERN.format(	5✔
235	number=":",
236	decimals=":",
237	scale=":",
238	base=":",
239	exponent=":",
240	fraction=":",
241	grouping=grouping_operators_regex(lang),
242	multipliers=multiplication_operators_regex(lang),
243	superscript=unicode_superscript_regex(),
244	unicode_fract=unicode_fractions_regex(),
245	decimal_operators=decimal_operators_regex(lang),
246	)
247
248
249	@cached	5✔
250	def number_pattern_groups(lang="en_US"):	5✔
251	return NUM_PATTERN.format(	5✔
252	number="P<number>",
253	decimals="P<decimals>",
254	scale="P<scale>",
255	base="P<base>",
256	exponent="P<exponent>",
257	fraction="P<fraction>",
258	grouping=grouping_operators_regex(lang),
259	multipliers=multiplication_operators_regex(lang),
260	superscript=unicode_superscript_regex(),
261	unicode_fract=unicode_fractions_regex(),
262	decimal_operators=decimal_operators_regex(lang),
263	)
264
265
266	@cached	5✔
267	def range_pattern(lang="en_US"):	5✔
268	num_pattern_no_groups = number_pattern_no_groups(lang)	5✔
269	return r""" # Pattern for a range of numbers	5✔
270
271	(?: # First number
272	(?<![a-zA-Z0-9+.-]) # lookbehind, avoid "Area51"
273	%s
274	)
275	(?: # Second number
276	\ ?(?:(?:-\ )?(?:%s\|%s))\ ? # Group for ranges or uncertainties
277	%s)?
278
279	""" % (
280	num_pattern_no_groups,
281	"\|".join(ranges(lang)),
282	"\|".join(uncertainties(lang)),
283	num_pattern_no_groups,
284	)
285
286
287	@cached	5✔
288	def text_pattern_reg(lang="en_US"):	5✔
289	txt_pattern = _get_regex(lang).TEXT_PATTERN.format(	5✔
290	number_pattern_no_groups=number_pattern_no_groups(lang),
291	numberwords_regex=numberwords_regex(lang),
292	)
293	reg_txt = re.compile(txt_pattern, re.VERBOSE \| re.IGNORECASE)	5✔
294	return reg_txt	5✔
295
296
297	###############################################################################
298	@cached	5✔
299	def units_regex(lang="en_US"):	5✔
300	"""
301	Build a compiled regex object. Groups of the extracted items, with 4
302	repetitions, are:
303
304	0: whole surface
305	1: prefixed symbol
306	2: numerical value
307	3: first operator
308	4: first unit
309	5: second operator
310	6: second unit
311	7: third operator
312	8: third unit
313	9: fourth operator
314	10: fourth unit
315
316	Example, 'I want $20/h'
317
318	0: $20/h
319	1: $
320	2: 20
321	3: /
322	4: h
323	5: None
324	6: None
325	7: None
326	8: None
327	9: None
328	10: None
329
330	"""
331
332	op_keys = sorted(list(operators(lang)), key=len, reverse=True)	5✔
333	unit_keys = sorted(	5✔
334	list(load.units(lang).surfaces.keys()) + list(load.units(lang).symbols.keys()),
335	key=len,
336	reverse=True,
337	)
338	symbol_keys = sorted(	5✔
339	list(load.units(lang).prefix_symbols.keys()), key=len, reverse=True
340	)
341
342	exponent = exponents_regex(lang).format(superscripts=unicode_superscript_regex())	5✔
343
344	all_ops = "\|".join([r"{}".format(re.escape(i)) for i in op_keys])	5✔
345	all_units = "\|".join([r"{}".format(re.escape(i)) for i in unit_keys])	5✔
346	all_symbols = "\|".join([r"{}".format(re.escape(i)) for i in symbol_keys])	5✔
347
348	pattern = r"""	5✔
349	(?<!\w) # "begin" of word
350	(?P<prefix>(?:%s)(?![a-zA-Z]))? # Currencies, mainly
351	(?P<value>%s)-? # Number
352	(?:(?P<operator1>%s(?=(%s)%s))?(?P<unit1>(?:%s)%s)?) # Operator + Unit (1)
353	(?:(?P<operator2>%s(?=(%s)%s))?(?P<unit2>(?:%s)%s)?) # Operator + Unit (2)
354	(?:(?P<operator3>%s(?=(%s)%s))?(?P<unit3>(?:%s)%s)?) # Operator + Unit (3)
355	(?:(?P<operator4>%s(?=(%s)%s))?(?P<unit4>(?:%s)%s)?) # Operator + Unit (4)
356	(?!\w) # "end" of word
357	""" % tuple(
358	[all_symbols, range_pattern(lang)]
359	+ 4 * [all_ops, all_units, exponent, all_units, exponent]
360	)
361	regex = re.compile(pattern, re.VERBOSE \| re.IGNORECASE)	5✔
362
363	return regex	5✔

nielstron / quantulum3 / 879

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous