• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

nielstron / quantulum3 / 946

pending completion
946

cron

travis-ci-com

nielstron
Merge branch 'dev'

467 of 467 new or added lines in 14 files covered. (100.0%)

1812 of 1847 relevant lines covered (98.11%)

4.89 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

98.13
/quantulum3/parser.py
1
#!/usr/bin/env python
2
# -*- coding: utf-8 -*-
3
"""
5✔
4
:mod:`Quantulum` parser.
5
"""
6

7
import logging
5✔
8
import re
5✔
9
from collections import defaultdict
5✔
10
from fractions import Fraction
5✔
11
from typing import List
5✔
12

13
from . import classes as cls
5✔
14
from . import disambiguate as dis
5✔
15
from . import language, load
5✔
16
from . import regex as reg
5✔
17

18
_LOGGER = logging.getLogger(__name__)
5✔
19

20

21
def _get_parser(lang="en_US"):
    """
    Look up the language-specific parser module.

    :param lang: language code, e.g. "en_US"
    :return: the object returned by ``language.get("parser", lang)``
    """
    parser_module = language.get("parser", lang)
    return parser_module
5✔
28

29

30
###############################################################################
31
def extract_spellout_values(text, lang="en_US"):
    """
    Find spelled out numbers in ``text`` using the parser of ``lang``.

    The result is whatever the language parser's ``extract_spellout_values``
    returns; it is later consumed by ``substitute_values``.
    """
    parser = _get_parser(lang)
    return parser.extract_spellout_values(text)
5✔
36

37

38
###############################################################################
39
def substitute_values(text, values):
    """
    Replace the given spans of ``text`` by their new surfaces.

    Each entry of ``values`` is indexed with the keys ``old_span``,
    ``old_surface`` and ``new_surface``. The spans refer to the original text,
    so the accumulated length difference is applied before every replacement.

    :return: tuple of the rewritten text and a ``defaultdict(int)`` mapping
        character positions of the rewritten text to the accumulated shift
    """
    shifts = defaultdict(int)
    final_text = text
    offset = 0

    for entry in values:
        replacement = entry["new_surface"]
        start = entry["old_span"][0] + offset
        stop = entry["old_span"][1] + offset
        final_text = "".join((final_text[0:start], replacement, final_text[stop:]))
        offset += len(replacement) - len(entry["old_surface"])
        # Every position after the start of this replacement is now displaced
        # by the accumulated offset
        shifts.update(dict.fromkeys(range(start + 1, len(final_text)), offset))

    _LOGGER.debug('Text after numeric conversion: "%s"', final_text)

    return final_text, shifts
5✔
56

57

58
###############################################################################
59
def words_before_span(text, span, k):
    """
    Return the last ``k`` whitespace-separated words before ``span``, lowercased.

    An empty list is returned when the span starts at the beginning of the text.
    """
    start = span[0]
    if start == 0:
        return []
    preceding_words = text[:start].split()
    return [word.strip().lower() for word in preceding_words[-k:]]
5✔
63

64

65
###############################################################################
66
def is_coordinated(quantity1, quantity2, context, lang="en_US"):
    """
    Delegate the coordination check of two quantities to the parser of ``lang``.

    Returns whatever the language parser's ``is_coordinated`` returns.
    """
    parser = _get_parser(lang)
    return parser.is_coordinated(quantity1, quantity2, context)
5✔
68

69

70
def is_ranged(quantity1, quantity2, context, lang="en_US"):
    """
    Delegate the range check of two quantities to the parser of ``lang``.

    Returns whatever the language parser's ``is_ranged`` returns; the caller in
    this module treats a truthy result as a ``(start, end)`` span.
    """
    parser = _get_parser(lang)
    return parser.is_ranged(quantity1, quantity2, context)
5✔
72

73

74
###############################################################################
75
def split_range(value, range_seperator):
    """
    Split a range expression into its stripped parts.

    When the separator is a dash, an empty part means the same symbol was used
    as a negative sign: the dash is prepended to the following part and the
    empty part is removed. A trailing empty part (the value ends with the
    separator) has no following part to negate and is simply dropped instead
    of raising ``IndexError``.

    :param value: string holding the range, e.g. ``"-1--2"``
    :param range_seperator: separator string to split on
    :return: list of the stripped parts
    """
    values = [v.strip() for v in value.split(range_seperator)]

    if range_seperator in ("-", "–", "—"):
        # Stop one short of the end so that ii + 1 is always a valid index
        for ii in range(len(values) - 1):
            if values[ii] == "":
                values[ii + 1] = range_seperator + values[ii + 1]
                values[ii] = None
        # str.split always yields at least one element
        if values[-1] == "":
            values[-1] = None

    return [v for v in values if v is not None]
5✔
92

93

94
###############################################################################
95
def get_values(item, lang="en_US"):
    """
    Extract the numeric value and its uncertainty from a regex hit.

    :param item: match object exposing a ``value`` group
    :param lang: language code forwarded to the ``reg`` helpers and to
        ``resolve_exponents``
    :return: tuple ``(uncertainty, values)``; ``uncertainty`` is ``None`` unless
        the hit is a range or carries an explicit uncertainty, ``values`` is a
        one-element list
    :raises ValueError: if a range is descending, a fraction has a zero
        denominator, or a part cannot be converted to ``float``
    """
    unicode_fractions = reg.unicode_fractions()

    def callback(pattern):
        return " %s" % (unicode_fractions[pattern.group(0)])

    fracs = r"|".join(unicode_fractions)

    value = item.group("value")
    # Remove grouping operators
    value = re.sub(
        r"(?<=\d)[%s](?=\d{3})" % reg.grouping_operators_regex(lang), "", value
    )
    # Replace unusual exponents by e (including e)
    value = re.sub(
        r"(?<=\d)(%s)(e|E|10)\^?" % reg.multiplication_operators_regex(lang), "e", value
    )
    # calculate other exponents
    value, factors = resolve_exponents(value, lang)
    _LOGGER.debug("After exponent resolution: %s", value)

    # The flag must be passed by keyword: the fourth positional argument of
    # re.sub is ``count``, which would cap the number of substitutions
    value = re.sub(fracs, callback, value, flags=re.IGNORECASE)

    range_separator = re.findall(
        r"\d+ ?((?:-\ )?(?:%s)) ?\d" % "|".join(reg.ranges(lang)), value
    )

    uncer_separator = re.findall(
        r"\d+ ?(%s) ?\d" % "|".join(reg.uncertainties(lang)), value
    )
    fract_separator = re.findall(r"\d+/\d+", value)

    value = re.sub(" +", " ", value)
    uncertainty = None
    if range_separator:
        # A range just describes an uncertain quantity
        values = split_range(value, range_separator[0])
        values = [
            float(re.sub(r"-$", "", v)) * factors[i] for i, v in enumerate(values)
        ]
        if values[1] < values[0]:
            raise ValueError(
                "Invalid range, with second item being smaller than the first item"
            )
        mean = sum(values) / len(values)
        uncertainty = mean - min(values)
        values = [mean]
    elif uncer_separator:
        values = [float(i) for i in value.split(uncer_separator[0])]
        uncertainty = values[1] * factors[1]
        values = [values[0] * factors[0]]
    elif fract_separator:
        values = value.split()
        try:
            if len(values) > 1:
                # Integer part followed by a fraction, e.g. "1 1/2"
                values = [float(values[0]) * factors[0] + float(Fraction(values[1]))]
            else:
                values = [float(Fraction(values[0]))]
        except ZeroDivisionError as e:
            raise ValueError("{} is not a number".format(values[0])) from e
    else:
        values = [float(re.sub(r"-$", "", value)) * factors[0]]

    _LOGGER.debug("\tUncertainty: %s", uncertainty)
    _LOGGER.debug("\tValues: %s", values)

    return uncertainty, values
5✔
164

165

166
###############################################################################
167
def resolve_exponents(value, lang="en_US"):
    """Resolve unusual exponents (like 2^4) and return substituted string and
       factors

    Params:
        value: str, string with only one value
        lang: str, language code forwarded to the number pattern
    Returns:
        str, string with basis and exponent removed
        list of numbers, one multiplication factor per number match
        (1 where nothing had to be resolved)

    """
    factors = []
    superscripts = reg.unicode_superscript()
    matches = re.finditer(
        reg.number_pattern_groups(lang), value, re.IGNORECASE | re.VERBOSE
    )
    for item in matches:
        base = item.group("base")
        exp = item.group("exponent")
        if not (base and exp):
            factors.append(1)
            continue
        if base in ("e", "E"):
            # already handled by float
            factors.append(1)
            continue
        # Expect that in a pure decimal base,
        # either ^ or superscript notation is used
        if re.match(r"\d+\^?", base) and not (
            "^" in base or re.match(r"[%s]" % reg.unicode_superscript_regex(), exp)
        ):
            factors.append(1)
            continue
        # str.replace returns a new string, so the result has to be kept
        for superscript, substitute in superscripts.items():
            exp = exp.replace(superscript, substitute)
        exp = float(exp)
        base = float(base.replace("^", ""))
        factor = base**exp
        value = str(value).replace(item.group("scale"), "")
        factors.append(factor)
        _LOGGER.debug("Replaced %s by factor %s", item.group("scale"), factor)
    return value, factors
5✔
215

216

217
###############################################################################
218
def build_unit_name(dimensions, lang="en_US"):
    """
    Derive a unit name from ``dimensions`` via the parser of ``lang``.

    The inferred name is logged at debug level before it is returned.
    """
    parser = _get_parser(lang)
    inferred = parser.name_from_dimensions(dimensions)
    _LOGGER.debug("\tUnit inferred name: %s", inferred)
    return inferred
5✔
227

228

229
###############################################################################
230
def get_unit_from_dimensions(dimensions, text, lang="en_US", classifier_path=None):
    """
    Find the unit matching ``dimensions``, building a new one if none is known.

    The key computed from the dimensions is looked up in the derived units of
    ``lang``. On a ``KeyError`` a new ``cls.Unit`` is created with an inferred
    name and entity. In both cases the given dimensions are stored on the unit
    as ``original_dimensions``.
    """
    key = load.get_key_from_dimensions(dimensions)

    try:
        unit = load.units(lang).derived[key]
    except KeyError:
        _LOGGER.debug("\tCould not find unit for: %s", key)
        inferred_name = build_unit_name(dimensions, lang)
        inferred_entity = get_entity_from_dimensions(
            dimensions, text, lang, classifier_path
        )
        unit = cls.Unit(
            name=inferred_name,
            dimensions=dimensions,
            entity=inferred_entity,
        )

    # Carry on original composition
    unit.original_dimensions = dimensions
    return unit
5✔
250

251

252
def name_from_dimensions(dimensions, lang="en_US"):
    """
    Derive the name of a unit from its dimensions.
    Param:
        dimensions: List of dimensions
        lang: language code selecting the parser that builds the name
    """
    parser = _get_parser(lang)
    return parser.name_from_dimensions(dimensions)
5✔
259

260

261
def infer_name(unit):
    """
    Infer the name of ``unit`` from its dimensions
    :return: the inferred name, or None if the unit has no dimensions
    """
    if not unit.dimensions:
        return None
    return name_from_dimensions(unit.dimensions)
×
268

269

270
###############################################################################
271
def get_entity_from_dimensions(dimensions, text, lang="en_US", classifier_path=None):
    """
    Determine the entity of a unit (e.g. "volume" for "m^3") from its
    dimensionality.

    Every base unit is replaced by the name of its entity, the result is sorted
    by that name and turned into a key that is passed to the disambiguation.
    If no entity is found, an entity named "unknown" is returned.
    """
    new_derived = []
    for dimension in dimensions:
        entity_name = load.units(lang).names[dimension["base"]].entity.name
        new_derived.append({"base": entity_name, "power": dimension["power"]})

    final_derived = list(new_derived)
    final_derived.sort(key=lambda entry: entry["base"])
    key = load.get_key_from_dimensions(final_derived)

    ent = dis.disambiguate_entity(key, text, lang, classifier_path)
    if ent is not None:
        return ent

    _LOGGER.debug("\tCould not find entity for: %s", key)
    return cls.Entity(name="unknown", dimensions=new_derived)
5✔
291

292

293
###############################################################################
294
def parse_unit(item, unit, slash, lang="en_US"):
    """
    Delegate parsing of a unit text to the parser of ``lang``.

    The caller in this module unpacks the result as ``(surface, power)``.
    """
    parser = _get_parser(lang)
    return parser.parse_unit(item, unit, slash)
5✔
299

300

301
###############################################################################
302
def get_unit(item, text, lang="en_US", classifier_path=None):
    """
    Build the unit described by a regex hit.

    :param item: match object exposing the groups ``prefix``, ``unit1`` to
        ``unit4`` and ``operator1`` to ``operator4``
    :param text: text the hit was found in, used for disambiguation
    :param lang: language code
    :param classifier_path: forwarded to the disambiguation helpers
    :return: tuple of the unit and the number of characters at the end of the
        hit that were cut off because of inconsistent operators
    """
    unit_groups = ("prefix", "unit1", "unit2", "unit3", "unit4")
    operator_groups = ("operator1", "operator2", "operator3", "operator4")
    # How much of the end is removed because of an "incorrect" regex match
    unit_shortening = 0

    if not any(item.group(name) for name in unit_groups):
        unit = load.units(lang).names["dimensionless"]
    else:
        derived = []
        slash = False
        multiplication_operator = False
        for index, group_name in enumerate(unit_groups):
            unit = item.group(group_name)
            if index < 1:
                # The prefix group has no preceding operator
                operator_index, operator = None, None
            else:
                operator_index = operator_groups[index - 1]
                operator = item.group(operator_index)

            # disallow spaces as operators in units expressed in their symbols
            # Enforce consistency among multiplication and division operators
            # Single exceptions are colloquial number abbreviations (5k miles)
            multiplied = operator in reg.multiplication_operators(lang) or (
                operator is None
                and unit
                and not (index == 1 and unit in reg.suffixes(lang))
            )
            if (
                multiplied
                and multiplication_operator != operator
                and not (index == 1 and str(operator).isspace())
            ):
                if multiplication_operator is not False:
                    # Cut if inconsistent multiplication operator
                    # treat the None operator differently - remove the
                    # whole word of it
                    if operator is None:
                        # For this, use the last consistent operator
                        # (before the current) with a space
                        # which should always be the preceding operator
                        derived.pop()
                        operator_index = operator_groups[index - 2]
                    # Remove (original length - new end) characters
                    unit_shortening = item.end() - item.start(operator_index)
                    _LOGGER.debug(
                        "Because operator inconsistency, cut from "
                        "operator: '{}', new surface: {}".format(
                            operator,
                            text[item.start() : item.end() - unit_shortening],
                        )
                    )
                    break
                # First multiplication operator seen: remember it
                multiplication_operator = operator

            # Determine whether a negative power has to be applied to following
            # units
            if operator and not slash:
                slash = any(i in operator for i in reg.division_operators(lang))
            # Determine which unit follows
            if unit:
                unit_surface, power = parse_unit(item, unit, slash, lang)
                base = dis.disambiguate_unit(unit_surface, text, lang, classifier_path)
                derived.append({"base": base, "power": power, "surface": unit_surface})

        unit = get_unit_from_dimensions(derived, text, lang, classifier_path)

    _LOGGER.debug("\tUnit: %s", unit)
    _LOGGER.debug("\tEntity: %s", unit.entity)

    return unit, unit_shortening
5✔
374

375

376
###############################################################################
377
def get_surface(shifts, orig_text, item, text, unit_shortening=0):
    """
    Map a regex hit on the processed text back to the original text.

    :param shifts: mapping from positions in ``text`` to the offset towards
        ``orig_text``
    :param orig_text: text the returned surface is cut from
    :param item: match object found on ``text``
    :param text: processed text the hit was found in
    :param unit_shortening: number of characters to cut from the end of the hit
    :return: tuple of the surface string and its ``(start, end)`` span in
        ``orig_text``
    """
    # handle cut end
    start = item.start()
    end = item.end() - unit_shortening
    # extend with as many spaces as are possible (this is to handle cleaned text)
    while end < len(text) and text[end] == " ":
        end += 1
    span = (start, end)

    _LOGGER.debug('\tInitial span: %s ("%s")', span, text[span[0] : span[1]])

    real_start = start - shifts[start]
    real_end = end - shifts[end - 1]
    real_span = (real_start, real_end)
    surface = orig_text[real_start:real_end]
    _LOGGER.debug('\tShifted span: %s ("%s")', real_span, surface)

    # Trailing spaces and dashes do not belong to the surface
    trimmed = surface.rstrip(" -")
    real_end -= len(surface) - len(trimmed)
    surface = trimmed

    # Neither do leading spaces
    trimmed = surface.lstrip(" ")
    real_start += len(surface) - len(trimmed)
    surface = trimmed

    real_span = (real_start, real_end)
    _LOGGER.debug('\tFinal span: %s ("%s")', real_span, surface)
    return surface, real_span
5✔
406

407

408
###############################################################################
409
def is_quote_artifact(orig_text, span):
    """
    Check whether a quoted passage ends inside ``span``.

    :return: the match object of the first quoted passage of ``orig_text``
        whose end lies within ``span`` (bounds included), otherwise ``False``
    """
    for quoted in re.finditer(r'["\'][^ .,:;?!()*+-].*?["\']', orig_text):
        if span[0] <= quoted.end() <= span[1]:
            return quoted
    return False
5✔
423

424

425
###############################################################################
426
def build_quantity(
    orig_text,
    text,
    item,
    values,
    unit,
    surface,
    span,
    uncert,
    lang="en_US",
    classifier_path=None,
):
    """
    Assemble Quantity objects from the extracted information.

    All work is delegated to the ``build_quantity`` function of the parser of
    ``lang``; its result is returned unchanged.
    """
    parser = _get_parser(lang)
    return parser.build_quantity(
        orig_text,
        text,
        item,
        values,
        unit,
        surface,
        span,
        uncert,
        classifier_path,
    )
445

446

447
###############################################################################
448
def clean_text(text, lang="en_US"):
    """
    Normalise ``text`` before it is parsed.

    A few unicode characters are replaced by an ASCII equivalent, then the
    language specific cleaning of the parser of ``lang`` is applied.
    """
    # Replace a few nasty unicode characters with their ASCII equivalent
    ascii_table = str.maketrans({"×": "x", "–": "-", "−": "-"})
    text = text.translate(ascii_table)

    # Language specific cleaning
    parser = _get_parser(lang)
    text = parser.clean_text(text)

    _LOGGER.debug('Clean text: "%s"', text)

    return text
5✔
464

465

466
###############################################################################
467
def extract_range_ands(text, lang="en_US"):
    """
    Delegate to the ``extract_range_ands`` function of the parser of ``lang``.
    """
    parser = _get_parser(lang)
    return parser.extract_range_ands(text)
×
469

470

471
###############################################################################
472
def handle_consecutive_quantities(quantities, context):
    """
    Merge or complete neighbouring quantities:

    [45] and/or [50 mg] --> add unit to first [45 mg] [50 mg]
    between [44 mg] and [50 mg] --> range [47+/-3 mg]
    [44 mg] to [50 mg] --> range [47+/-3 mg]

    NOTE(review): no language is forwarded to ``is_ranged`` and
    ``is_coordinated``, so their default language is used here.
    """
    if not quantities:
        return quantities

    results = []
    skip_next = False
    for current, following in zip(quantities, quantities[1:]):
        if skip_next:
            # The current quantity was merged into the previous range
            skip_next = False
            continue
        range_span = is_ranged(current, following, context)
        if range_span:
            if (
                (
                    current.unit.name == following.unit.name
                    or current.unit.name == "dimensionless"
                )
                and current.uncertainty is None
                and following.uncertainty is None
                and current.value != following.value
            ):
                if following.value > current.value:
                    lower, upper = current, following
                else:
                    lower, upper = following, current
                value = (lower.value + upper.value) / 2.0
                current = current.with_vals(
                    uncertainty=upper.value - value,
                    value=value,
                    unit=following.unit,
                    span=range_span,
                    surface=context[range_span[0] : range_span[1]],
                )
                skip_next = True
        elif is_coordinated(current, following, context):
            if current.unit.name == "dimensionless":
                current = current.with_vals(unit=following.unit)
        results.append(current)

    # The last quantity never appears as first element of a pair
    if not skip_next:
        results.append(quantities[-1])
    return results
5✔
514

515

516
###############################################################################
517
def parse(
    text, lang="en_US", verbose=False, classifier_path=None
) -> List[cls.Quantity]:
    """
    Extract all quantities from unstructured text.

    Parameters
    ----------
    text : str
        Text to parse.
    lang : str
        Language of the text. Default is "en_US".
    verbose : bool
        If True, print debug information. Default is False.
    classifier_path : str
        Path to the classifier model. Default is None, which uses the default
        model for the given language.

    Returns
    -------
    quantities : List[Quantity]
        List of quantities found in the text.

    Notes
    -----
    Regex hits that raise ``ValueError`` while being processed are logged at
    debug level and skipped. In verbose mode the root logger is switched to
    DEBUG for the extraction and restored afterwards, even if an exception
    propagates.
    """

    log_format = "%(asctime)s --- %(message)s"
    logging.basicConfig(format=log_format)

    if verbose:  # pragma: no cover
        prev_level = logging.root.getEffectiveLevel()
        logging.root.setLevel(logging.DEBUG)
        _LOGGER.debug("Verbose mode")

    try:
        orig_text = text
        _LOGGER.debug('Original text: "%s"', orig_text)

        text = clean_text(text, lang)
        spellout_values = extract_spellout_values(text, lang)
        text, shifts = substitute_values(text, spellout_values)

        quantities = []
        for item in reg.units_regex(lang).finditer(text):
            groups = {
                name: hit
                for name, hit in item.groupdict().items()
                if hit and hit.strip()
            }
            _LOGGER.debug("Quantity found: %s", groups)

            try:
                uncert, values = get_values(item, lang)

                unit, unit_shortening = get_unit(item, text, lang, classifier_path)
                surface, span = get_surface(
                    shifts, orig_text, item, text, unit_shortening
                )
                objs = build_quantity(
                    orig_text,
                    text,
                    item,
                    values,
                    unit,
                    surface,
                    span,
                    uncert,
                    lang,
                    classifier_path,
                )
                if objs is not None:
                    quantities += objs
            except ValueError as err:
                _LOGGER.debug("Could not parse quantity: %s", err)
    finally:
        # Restore the log level even when an unexpected exception propagates
        if verbose:  # pragma: no cover
            logging.root.setLevel(prev_level)

    quantities = handle_consecutive_quantities(quantities, text)
    return quantities
5✔
588

589

590
###############################################################################
591
def inline_parse(text, verbose=False, *, lang="en_US"):  # pragma: no cover
    """
    Insert every parsed quantity into the text, right after its span.

    :param text: text to parse
    :param verbose: forwarded to ``parse``
    :param lang: language of the text, forwarded to ``parse``; keyword-only so
        that existing positional calls keep their meaning
    :return: the text with `` {<quantity>}`` inserted after each quantity
    """

    parsed = parse(text, lang=lang, verbose=verbose)

    # Spans refer to the original text, so track how much has been inserted
    shift = 0
    for quantity in parsed:
        index = quantity.span[1] + shift
        to_add = " {" + str(quantity) + "}"
        text = text[0:index] + to_add + text[index:]
        shift += len(to_add)

    return text
606

607

608
###############################################################################
609
def inline_parse_and_replace(text, lang="en_US", verbose=False):  # pragma: no cover
    """
    Parse text and substitute every quantity by its string representation
    """

    # Spans refer to the original text, so track the accumulated length change
    offset = 0
    for quantity in parse(text, lang=lang, verbose=verbose):
        start, end = quantity.span[0], quantity.span[1]
        replacement = str(quantity)
        text = text[0 : start + offset] + replacement + text[end + offset :]
        offset += len(replacement) - (end - start)

    return text
625

626

627
###############################################################################
628
def inline_parse_and_expand(text, lang="en_US", verbose=False):
    """
    Parse text and substitute every quantity by its speakable version
    """
    # Spans refer to the original text, so track the accumulated length change
    offset = 0
    for quantity in parse(text, lang=lang, verbose=verbose):
        start, end = quantity.span[0], quantity.span[1]
        spoken = quantity.to_spoken()
        text = text[0 : start + offset] + spoken + text[end + offset :]
        offset += len(spoken) - (end - start)

    return text
5✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc