nielstron / quantulum3 / 946

pending completion

Build # 946

Build Type

cron

travis-ci-com

Committed by

nielstron

Commit Message

Merge branch 'dev'

Run Details

467 of 467 new or added lines in 14 files covered. (100.0%)

1812 of 1847 relevant lines covered (98.11%)

4.89 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

98.13

/quantulum3/parser.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
:mod:`Quantulum` parser.
"""

import logging
import re
from collections import defaultdict
from fractions import Fraction
from typing import List

from . import classes as cls
from . import disambiguate as dis
from . import language, load
from . import regex as reg

_LOGGER = logging.getLogger(__name__)


def _get_parser(lang="en_US"):
    """
    Look up the language specific parser module.

    :param lang: language code forwarded to ``language.get``
    :return: the object ``language.get("parser", lang)`` resolves to
    """
    parser_module = language.get("parser", lang)
    return parser_module


###############################################################################
def extract_spellout_values(text, lang="en_US"):
    """
    Find spelled out numbers in ``text`` using the parser of ``lang``.

    The work is delegated to the language specific parser; ``parse`` hands the
    result on to ``substitute_values``.
    """
    lang_parser = _get_parser(lang)
    return lang_parser.extract_spellout_values(text)


###############################################################################
def substitute_values(text, values):
    """
    Replace every span listed in ``values`` by its new surface.

    :param text: text to rewrite
    :param values: iterable of mappings with the keys "old_span" (start, end),
        "old_surface" and "new_surface"; the spans are offset by the length
        changes of the replacements applied before them
    :return: tuple ``(final_text, shifts)`` where ``shifts`` is a
        ``defaultdict(int)`` mapping character positions of the rewritten text
        to the accumulated length difference recorded for them
    """
    result = text
    offset = 0
    shifts = defaultdict(int)

    for entry in values:
        start = entry["old_span"][0] + offset
        end = entry["old_span"][1] + offset
        replacement = entry["new_surface"]
        result = "".join((result[:start], replacement, result[end:]))
        offset += len(replacement) - len(entry["old_surface"])
        # Everything behind the start of this replacement now carries the
        # accumulated offset; later replacements overwrite these entries.
        shifts.update(dict.fromkeys(range(start + 1, len(result)), offset))

    _LOGGER.debug('Text after numeric conversion: "%s"', result)

    return result, shifts


###############################################################################
def words_before_span(text, span, k):
    """
    Return the last ``k`` whitespace separated words in front of ``span``.

    :param text: text the span refers to
    :param span: (start, end) indices; only the start is used
    :param k: number of words to take from the end of the preceding text
    :return: list of stripped, lowercased words (empty if the span starts at 0)
    """
    start = span[0]
    if start == 0:
        return []
    preceding_words = text[:start].split()
    return [word.strip().lower() for word in preceding_words[-k:]]


###############################################################################
def is_coordinated(quantity1, quantity2, context, lang="en_US"):
    """
    Ask the parser of ``lang`` whether the two quantities are coordinated
    within ``context``; the parser's answer is returned unchanged.
    """
    lang_parser = _get_parser(lang)
    return lang_parser.is_coordinated(quantity1, quantity2, context)


def is_ranged(quantity1, quantity2, context, lang="en_US"):
    """
    Ask the parser of ``lang`` whether the two quantities form a range within
    ``context``; the parser's answer is returned unchanged.

    ``handle_consecutive_quantities`` treats a truthy answer as the
    (start, end) span of the range inside ``context``.
    """
    lang_parser = _get_parser(lang)
    return lang_parser.is_ranged(quantity1, quantity2, context)


###############################################################################
def split_range(value, range_seperator):
    """
    Split a range expression into its stripped parts.

    When the separator is a dash, an empty part means the same symbol was used
    both as range separator and as negative sign (e.g. ``"-5--3"``); the sign
    is then moved onto the following part and the empty part is dropped.

    :param value: string holding the range, e.g. ``"10-20"``
    :param range_seperator: separator string the range is split on
    :return: list of the stripped parts
    """
    values = [part.strip() for part in value.split(range_seperator)]

    if range_seperator in ["-", "–", "—"]:
        last_index = len(values) - 1
        for ii in range(len(values)):
            if values[ii] == "":
                # A trailing empty part has no following value to negate;
                # it is dropped instead of indexing past the end of the list.
                if ii < last_index:
                    values[ii + 1] = range_seperator + values[ii + 1]
                values[ii] = None

    return [part for part in values if part is not None]


###############################################################################
def get_values(item, lang="en_US"):
    """
    Extract the numeric value and the uncertainty from a regex hit.

    The "value" group of ``item`` is normalised (grouping operators removed,
    exponents resolved, unicode fractions expanded) and then interpreted as,
    in this order of precedence: a range, a value with an uncertainty, a
    (mixed) fraction or a plain number.

    :param item: regex match providing the named group "value"
    :param lang: language code used to select the language specific regexes
    :return: tuple ``(uncertainty, values)``; ``uncertainty`` is ``None``
        unless a range or an explicit uncertainty was found, ``values`` is a
        list holding the single resolved float
    :raises ValueError: if a range is descending, a fraction has a zero
        denominator or a part cannot be converted to a number
    """

    def callback(pattern):
        return " %s" % (reg.unicode_fractions()[pattern.group(0)])

    fracs = r"|".join(reg.unicode_fractions())

    value = item.group("value")
    # Remove grouping operators
    value = re.sub(
        r"(?<=\d)[%s](?=\d{3})" % reg.grouping_operators_regex(lang), "", value
    )
    # Replace unusual exponents by e (including e)
    value = re.sub(
        r"(?<=\d)(%s)(e|E|10)\^?" % reg.multiplication_operators_regex(lang), "e", value
    )
    # Calculate other exponents, using the regexes of the requested language
    value, factors = resolve_exponents(value, lang)
    _LOGGER.debug("After exponent resolution: %s", value)

    # The fourth positional argument of re.sub is ``count``; the flag has to
    # be passed by keyword, otherwise only the first two hits are replaced.
    value = re.sub(fracs, callback, value, flags=re.IGNORECASE)

    range_separator = re.findall(
        r"\d+ ?((?:-\ )?(?:%s)) ?\d" % "|".join(reg.ranges(lang)), value
    )

    uncer_separator = re.findall(
        r"\d+ ?(%s) ?\d" % "|".join(reg.uncertainties(lang)), value
    )
    fract_separator = re.findall(r"\d+/\d+", value)

    value = re.sub(" +", " ", value)
    uncertainty = None
    if range_separator:
        # A range just describes an uncertain quantity
        values = split_range(value, range_separator[0])
        values = [
            float(re.sub(r"-$", "", v)) * factors[i] for i, v in enumerate(values)
        ]
        if values[1] < values[0]:
            raise ValueError(
                "Invalid range, with second item being smaller than the first item"
            )
        mean = sum(values) / len(values)
        uncertainty = mean - min(values)
        values = [mean]
    elif uncer_separator:
        values = [float(i) for i in value.split(uncer_separator[0])]
        uncertainty = values[1] * factors[1]
        values = [values[0] * factors[0]]
    elif fract_separator:
        values = value.split()
        try:
            if len(values) > 1:
                # Mixed fraction: whole part followed by the fraction
                values = [float(values[0]) * factors[0] + float(Fraction(values[1]))]
            else:
                values = [float(Fraction(values[0]))]
        except ZeroDivisionError as e:
            # ``values`` still holds the split strings at this point
            raise ValueError("{} is not a number".format(values[0])) from e
    else:
        values = [float(re.sub(r"-$", "", value)) * factors[0]]

    _LOGGER.debug("\tUncertainty: %s", uncertainty)
    _LOGGER.debug("\tValues: %s", values)

    return uncertainty, values


###############################################################################
def resolve_exponents(value, lang="en_US"):
    """Resolve unusual exponents (like 2^4) and return substituted string and
       factor

    Params:
        value: str, string with only one value
        lang: str, language code used to select the number pattern
    Returns:
        str, string with basis and exponent removed
        array of float, factors for multiplication (one entry per number
        matched in ``value``; 1 where nothing had to be resolved)

    """
    factors = []
    matches = re.finditer(
        reg.number_pattern_groups(lang), value, re.IGNORECASE | re.VERBOSE
    )
    for item in matches:
        base = item.group("base")
        exp = item.group("exponent")
        if not (base and exp):
            factors.append(1)
            continue
        if base in ["e", "E"]:
            # already handled by float
            factors.append(1)
            continue
        # Expect that in a pure decimal base,
        # either ^ or superscript notation is used
        if re.match(r"\d+\^?", base):
            if not (
                "^" in base
                or re.match(r"[%s]" % reg.unicode_superscript_regex(), exp)
            ):
                factors.append(1)
                continue
        # str.replace returns a new string, so the result has to be kept;
        # otherwise superscript exponents reach float() untranslated.
        for superscript, substitute in reg.unicode_superscript().items():
            exp = exp.replace(superscript, substitute)
        exp = float(exp)
        base = float(base.replace("^", ""))
        factor = base**exp
        value = str(value).replace(item.group("scale"), "")
        factors.append(factor)
        _LOGGER.debug("Replaced %s by factor %s", item.group("scale"), factor)
    return value, factors


###############################################################################
def build_unit_name(dimensions, lang="en_US"):
    """
    Derive a unit name from ``dimensions`` via the parser of ``lang`` and log
    the result at debug level.

    :param dimensions: dimensions handed to the parser's
        ``name_from_dimensions``
    :param lang: language code selecting the parser
    :return: the name produced by the language specific parser
    """
    lang_parser = _get_parser(lang)
    unit_name = lang_parser.name_from_dimensions(dimensions)
    _LOGGER.debug("\tUnit inferred name: %s", unit_name)
    return unit_name


###############################################################################
def get_unit_from_dimensions(dimensions, text, lang="en_US", classifier_path=None):
    """
    Find the unit matching ``dimensions`` or build a new one.

    The derived units loaded for ``lang`` are searched by the key computed
    from ``dimensions``. On a miss, a fresh ``Unit`` is created whose name and
    entity are inferred from the dimensions.

    :param dimensions: list of dimension dicts describing the unit
    :param text: surrounding text, passed on for entity disambiguation
    :param lang: language code
    :param classifier_path: passed on to the entity lookup
    :return: the unit, with ``original_dimensions`` set to ``dimensions``
    """
    lookup_key = load.get_key_from_dimensions(dimensions)

    try:
        resolved = load.units(lang).derived[lookup_key]
    except KeyError:
        _LOGGER.debug("\tCould not find unit for: %s", lookup_key)
        inferred_name = build_unit_name(dimensions, lang)
        inferred_entity = get_entity_from_dimensions(
            dimensions, text, lang, classifier_path
        )
        resolved = cls.Unit(
            name=inferred_name,
            dimensions=dimensions,
            entity=inferred_entity,
        )

    # Remember the composition the unit was originally written with
    resolved.original_dimensions = dimensions
    return resolved


def name_from_dimensions(dimensions, lang="en_US"):
    """
    Build the name of a unit from its dimensions.

    Param:
        dimensions: List of dimensions
        lang: language code selecting the parser that builds the name
    """
    lang_parser = _get_parser(lang)
    return lang_parser.name_from_dimensions(dimensions)


def infer_name(unit):
    """
    Infer the name of ``unit`` from its dimensions.

    :param unit: object exposing a ``dimensions`` attribute
    :return: name built by ``name_from_dimensions`` (default language), or
        ``None`` if the unit has no dimensions
    """
    if not unit.dimensions:
        return None
    return name_from_dimensions(unit.dimensions)


###############################################################################
def get_entity_from_dimensions(dimensions, text, lang="en_US", classifier_path=None):
    """
    Infer the underlying entity of a unit (e.g. "volume" for "m^3") based on
    its dimensionality.

    Every base unit is replaced by the name of its entity, the result is
    sorted by that name and turned into a key which is handed to the entity
    disambiguation. If that yields nothing, an "unknown" entity carrying the
    (unsorted) entity dimensions is returned.
    """
    entity_dimensions = []
    for dimension in dimensions:
        base_entity = load.units(lang).names[dimension["base"]].entity.name
        entity_dimensions.append({"base": base_entity, "power": dimension["power"]})

    ordered = sorted(entity_dimensions, key=lambda dim: dim["base"])
    lookup_key = load.get_key_from_dimensions(ordered)

    entity = dis.disambiguate_entity(lookup_key, text, lang, classifier_path)
    if entity is not None:
        return entity

    _LOGGER.debug("\tCould not find entity for: %s", lookup_key)
    return cls.Entity(name="unknown", dimensions=entity_dimensions)


###############################################################################
def parse_unit(item, unit, slash, lang="en_US"):
    """
    Let the parser of ``lang`` split a unit text into surface and power.

    ``get_unit`` unpacks the result as ``(unit_surface, power)``.
    """
    lang_parser = _get_parser(lang)
    return lang_parser.parse_unit(item, unit, slash)


###############################################################################
def get_unit(item, text, lang="en_US", classifier_path=None):
    """
    Extract unit from regex hit.

    Reads the named groups "prefix", "unit1".."unit4" and
    "operator1".."operator4" of ``item``, disambiguates every unit that is
    present and combines them into one (possibly derived) unit.

    :param item: regex match providing the groups listed above
    :param text: text the match was found in; used for disambiguation and
        for the debug output
    :param lang: language code
    :param classifier_path: passed on to the disambiguation functions
    :return: tuple ``(unit, unit_shortening)`` where ``unit_shortening`` is
        the number of characters at the end of the match that were discarded
        because of an inconsistent multiplication operator (0 if none)
    """

    group_units = ["prefix", "unit1", "unit2", "unit3", "unit4"]
    group_operators = ["operator1", "operator2", "operator3", "operator4"]
    # How much of the end is removed because of an "incorrect" regex match
    unit_shortening = 0

    # Only the unit groups that actually matched something
    item_units = [item.group(i) for i in group_units if item.group(i)]

    if len(item_units) == 0:
        # No unit at all: the quantity is a plain number
        unit = load.units(lang).names["dimensionless"]
    else:
        derived, slash = [], False
        # False means "no multiplication operator seen yet"; this is distinct
        # from None, which stands for "unit without a preceding operator"
        multiplication_operator = False
        for index in range(0, 5):
            unit = item.group(group_units[index])
            # The prefix (index 0) has no operator in front of it;
            # unitN is preceded by operatorN
            operator_index = None if index < 1 else group_operators[index - 1]
            operator = None if index < 1 else item.group(operator_index)

            # disallow spaces as operators in units expressed in their symbols
            # Enforce consistency among multiplication and division operators
            # Single exceptions are colloquial number abbreviations (5k miles)
            if operator in reg.multiplication_operators(lang) or (
                operator is None
                and unit
                and not (index == 1 and unit in reg.suffixes(lang))
            ):
                # A whitespace operator directly in front of unit1 is exempt
                # from the consistency check
                if multiplication_operator != operator and not (
                    index == 1 and str(operator).isspace()
                ):
                    if multiplication_operator is False:
                        # First operator seen: it becomes the reference
                        multiplication_operator = operator
                    else:
                        # Cut if inconsistent multiplication operator
                        # treat the None operator differently - remove the
                        # whole word of it
                        if operator is None:
                            # For this, use the last consistent operator
                            # (before the current) with a space
                            # which should always be the preceding operator
                            derived.pop()
                            operator_index = group_operators[index - 2]
                        # Remove (original length - new end) characters
                        unit_shortening = item.end() - item.start(operator_index)
                        _LOGGER.debug(
                            "Because operator inconsistency, cut from "
                            "operator: '{}', new surface: {}".format(
                                operator,
                                text[item.start() : item.end() - unit_shortening],
                            )
                        )
                        # Everything from the cut onwards is ignored
                        break

            # Determine whether a negative power has to be applied to following
            # units (once a division operator was seen, the flag stays set)
            if operator and not slash:
                slash = any(i in operator for i in reg.division_operators(lang))
            # Determine which unit follows
            if unit:
                unit_surface, power = parse_unit(item, unit, slash, lang)
                base = dis.disambiguate_unit(unit_surface, text, lang, classifier_path)
                derived += [{"base": base, "power": power, "surface": unit_surface}]

        # Reconcile the collected dimensions into a single unit
        unit = get_unit_from_dimensions(derived, text, lang, classifier_path)

    _LOGGER.debug("\tUnit: %s", unit)
    _LOGGER.debug("\tEntity: %s", unit.entity)

    return unit, unit_shortening


###############################################################################
def get_surface(shifts, orig_text, item, text, unit_shortening=0):
    """
    Determine surface and span of a regex hit within the original text.

    :param shifts: mapping from positions in ``text`` to the offset that has
        to be subtracted to get the position in ``orig_text``
    :param orig_text: text before cleaning and number substitution
    :param item: regex match found in ``text``
    :param text: text the regex was run on
    :param unit_shortening: number of characters to drop from the match end
    :return: tuple ``(surface, real_span)`` with leading spaces and trailing
        spaces/dashes removed from the surface and the span adjusted to it
    """
    start = item.start()
    # handle cut end
    end = item.end() - unit_shortening
    # swallow all directly following spaces (this is to handle cleaned text)
    while end < len(text) and text[end] == " ":
        end += 1

    _LOGGER.debug('\tInitial span: %s ("%s")', (start, end), text[start:end])

    # Translate the positions back into the original text
    real_start = start - shifts[start]
    real_end = end - shifts[end - 1]
    surface = orig_text[real_start:real_end]
    _LOGGER.debug('\tShifted span: %s ("%s")', (real_start, real_end), surface)

    while surface.endswith((" ", "-")):
        surface = surface[:-1]
        real_end -= 1

    while surface.startswith(" "):
        surface = surface[1:]
        real_start += 1

    real_span = (real_start, real_end)
    _LOGGER.debug('\tFinal span: %s ("%s")', real_span, surface)
    return surface, real_span


###############################################################################
def is_quote_artifact(orig_text, span):
    """
    Distinguish between quotes and units.

    Looks for quoted passages in ``orig_text`` and returns the first one whose
    closing end lies within ``span`` (bounds included).

    :param orig_text: text to search for quoted passages
    :param span: (start, end) indices of the candidate quantity
    :return: the regex match of the quoted passage, or ``False`` if none ends
        inside the span
    """
    quoted_passages = re.finditer(r'["\'][^ .,:;?!()*+-].*?["\']', orig_text)
    return next(
        (hit for hit in quoted_passages if span[0] <= hit.span()[1] <= span[1]),
        False,
    )


###############################################################################
def build_quantity(
    orig_text,
    text,
    item,
    values,
    unit,
    surface,
    span,
    uncert,
    lang="en_US",
    classifier_path=None,
):
    """
    Build Quantity objects out of the extracted information.

    All arguments except ``lang`` are forwarded, in order, to the
    ``build_quantity`` function of the parser selected by ``lang``, whose
    result is returned unchanged. ``parse`` expects either ``None`` or a list
    of quantities back.
    """
    lang_parser = _get_parser(lang)
    return lang_parser.build_quantity(
        orig_text,
        text,
        item,
        values,
        unit,
        surface,
        span,
        uncert,
        classifier_path,
    )


###############################################################################
def clean_text(text, lang="en_US"):
    """
    Clean text before parsing.

    First a handful of unicode characters is mapped to an ASCII equivalent,
    then the language specific cleaning of the parser for ``lang`` is applied.

    :param text: raw text
    :param lang: language code selecting the parser
    :return: cleaned text
    """
    # Single character substitutions, applied in one pass
    ascii_equivalents = str.maketrans({"×": "x", "–": "-", "−": "-"})
    cleaned = text.translate(ascii_equivalents)

    # Language specific cleaning
    cleaned = _get_parser(lang).clean_text(cleaned)

    _LOGGER.debug('Clean text: "%s"', cleaned)
    return cleaned


###############################################################################
def extract_range_ands(text, lang="en_US"):
    """
    Forward ``text`` to ``extract_range_ands`` of the parser for ``lang`` and
    return its result unchanged.
    """
    lang_parser = _get_parser(lang)
    return lang_parser.extract_range_ands(text)


###############################################################################
def handle_consecutive_quantities(quantities, context):
    """
    Merge or complete neighbouring quantities.

    [45] and/or [50 mg] --> add unit to first [45 mg] [50 mg]
    between [44 mg] and [50 mg] --> range [47+/-3 mg]
    [44 mg] to [50 mg] --> range [47+/-3 mg]

    :param quantities: quantities in the order they appear in ``context``
    :param context: text the quantities were extracted from
    :return: list of quantities with ranges merged into a single quantity and
        dimensionless coordinated quantities given the unit of their neighbour
    """
    # NOTE(review): is_ranged / is_coordinated are called with their default
    # language here, whatever language parse() was asked for - confirm intent.
    if len(quantities) == 0:
        return quantities

    merged = []
    consumed = False  # True if the current left quantity was merged already
    for left, right in zip(quantities, quantities[1:]):
        if consumed:
            consumed = False
            continue

        range_span = is_ranged(left, right, context)
        if range_span:
            mergeable = (
                (
                    left.unit.name == right.unit.name
                    or left.unit.name == "dimensionless"
                )
                and left.uncertainty is None
                and right.uncertainty is None
                and left.value != right.value
            )
            if mergeable:
                if right.value > left.value:
                    lower, upper = left, right
                else:
                    lower, upper = right, left
                centre = (lower.value + upper.value) / 2.0
                left = left.with_vals(
                    uncertainty=upper.value - centre,
                    value=centre,
                    unit=right.unit,
                    span=range_span,
                    surface=context[range_span[0] : range_span[1]],
                )
                consumed = True
        elif is_coordinated(left, right, context):
            if left.unit.name == "dimensionless":
                left = left.with_vals(unit=right.unit)
        merged.append(left)

    # The last quantity never shows up as left element of a pair
    if not consumed:
        merged.append(quantities[-1])
    return merged


###############################################################################
def parse(
    text, lang="en_US", verbose=False, classifier_path=None
) -> List[cls.Quantity]:
    """
    Extract all quantities from unstructured text.

    Parameters
    ----------
    text : str
        Text to parse.
    lang : str
        Language of the text. Default is "en_US".
    verbose : bool
        If True, print debug information. Default is False.
    classifier_path : str
        Path to the classifier model. Default is None, which uses the default
        model for the given language.

    Returns
    -------
    quantities : List[Quantity]
        List of quantities found in the text. Regex hits that raise a
        ValueError while being processed are logged and skipped.
    """

    log_format = "%(asctime)s --- %(message)s"
    logging.basicConfig(format=log_format)

    if verbose:  # pragma: no cover
        prev_level = logging.root.getEffectiveLevel()
        logging.root.setLevel(logging.DEBUG)
        _LOGGER.debug("Verbose mode")

    try:
        orig_text = text
        _LOGGER.debug('Original text: "%s"', orig_text)

        text = clean_text(text, lang)
        spellout_values = extract_spellout_values(text, lang)
        text, shifts = substitute_values(text, spellout_values)

        quantities = []
        for item in reg.units_regex(lang).finditer(text):
            groups = {
                name: hit
                for name, hit in item.groupdict().items()
                if hit and hit.strip()
            }
            _LOGGER.debug("Quantity found: %s", groups)

            try:
                uncert, values = get_values(item, lang)

                unit, unit_shortening = get_unit(item, text, lang, classifier_path)
                surface, span = get_surface(
                    shifts, orig_text, item, text, unit_shortening
                )
                objs = build_quantity(
                    orig_text,
                    text,
                    item,
                    values,
                    unit,
                    surface,
                    span,
                    uncert,
                    lang,
                    classifier_path,
                )
                if objs is not None:
                    quantities += objs
            except ValueError as err:
                _LOGGER.debug("Could not parse quantity: %s", err)
    finally:
        # Restore the root log level even if an unexpected error escapes,
        # so a failed verbose call does not leave DEBUG logging switched on.
        if verbose:  # pragma: no cover
            logging.root.setLevel(prev_level)

    quantities = handle_consecutive_quantities(quantities, text)
    return quantities


###############################################################################
def inline_parse(text, verbose=False, lang="en_US"):  # pragma: no cover
    """
    Extract all quantities from unstructured text and append each parsed
    quantity, wrapped in curly braces, right behind its surface.

    :param text: text to parse
    :param verbose: passed on to ``parse``
    :param lang: language of the text; added as trailing keyword so existing
        positional calls keep working
    :return: text with `` {<quantity>}`` inserted after every quantity
    """

    parsed = parse(text, lang=lang, verbose=verbose)

    # Insertions move everything behind them; track the accumulated offset
    shift = 0
    for quantity in parsed:
        index = quantity.span[1] + shift
        to_add = " {" + str(quantity) + "}"
        text = text[0:index] + to_add + text[index:]
        shift += len(to_add)

    return text


###############################################################################
def inline_parse_and_replace(text, lang="en_US", verbose=False):  # pragma: no cover
    """
    Parse text and replace every quantity by its standardised string form.

    :param text: text to parse
    :param lang: language of the text
    :param verbose: passed on to ``parse``
    :return: text with each quantity surface replaced by ``str(quantity)``
    """
    offset = 0  # accumulated length change caused by earlier replacements
    for quantity in parse(text, lang=lang, verbose=verbose):
        start, end = quantity.span[0], quantity.span[1]
        replacement = str(quantity)
        text = "".join((text[: start + offset], replacement, text[end + offset :]))
        offset += len(replacement) - (end - start)

    return text


###############################################################################
def inline_parse_and_expand(text, lang="en_US", verbose=False):
    """
    Parse text and replace quantities with their speakable version.

    :param text: text to parse
    :param lang: language of the text
    :param verbose: passed on to ``parse``
    :return: text with each quantity surface replaced by
        ``quantity.to_spoken()``
    """
    offset = 0  # accumulated length change caused by earlier replacements
    for quantity in parse(text, lang=lang, verbose=verbose):
        start, end = quantity.span[0], quantity.span[1]
        spoken = quantity.to_spoken()
        text = "".join((text[: start + offset], spoken, text[end + offset :]))
        offset += len(spoken) - (end - start)

    return text

1	#!/usr/bin/env python
2	# -- coding: utf-8 --
3	"""	5✔
4	:mod:`Quantulum` parser.
5	"""
6
7	import logging	5✔
8	import re	5✔
9	from collections import defaultdict	5✔
10	from fractions import Fraction	5✔
11	from typing import List	5✔
12
13	from . import classes as cls	5✔
14	from . import disambiguate as dis	5✔
15	from . import language, load	5✔
16	from . import regex as reg	5✔
17
18	_LOGGER = logging.getLogger(__name__)	5✔
19
20
21	def _get_parser(lang="en_US"):	5✔
22	"""
23	Get parser module for given language
24	:param lang:
25	:return:
26	"""
27	return language.get("parser", lang)	5✔
28
29
30	###############################################################################
31	def extract_spellout_values(text, lang="en_US"):	5✔
32	"""
33	Convert spelled out numbers in a given text to digits.
34	"""
35	return _get_parser(lang).extract_spellout_values(text)	5✔
36
37
38	###############################################################################
39	def substitute_values(text, values):	5✔
40	"""
41	Convert spelled out numbers in a given text to digits.
42	"""
43
44	shift, final_text, shifts = 0, text, defaultdict(int)	5✔
45	for value in values:	5✔
46	first = value["old_span"][0] + shift	5✔
47	second = value["old_span"][1] + shift	5✔
48	final_text = final_text[0:first] + value["new_surface"] + final_text[second:]	5✔
49	shift += len(value["new_surface"]) - len(value["old_surface"])	5✔
50	for char in range(first + 1, len(final_text)):	5✔
51	shifts[char] = shift	5✔
52
53	_LOGGER.debug('Text after numeric conversion: "%s"', final_text)	5✔
54
55	return final_text, shifts	5✔
56
57
58	###############################################################################
59	def words_before_span(text, span, k):	5✔
60	if span[0] == 0:	5✔
61	return []	5✔
62	return [w.strip().lower() for w in text[: span[0]].split()[-k:]]	5✔
63
64
65	###############################################################################
66	def is_coordinated(quantity1, quantity2, context, lang="en_US"):	5✔
67	return _get_parser(lang).is_coordinated(quantity1, quantity2, context)	5✔
68
69
70	def is_ranged(quantity1, quantity2, context, lang="en_US"):	5✔
71	return _get_parser(lang).is_ranged(quantity1, quantity2, context)	5✔
72
73
74	###############################################################################
75	def split_range(value, range_seperator):	5✔
76	values = value.split(range_seperator)	5✔
77	values = [v.strip() for v in values]	5✔
78
79	if range_seperator in ["-", "–", "—"]:	5✔
80	# if we have an empty string, this indicates we have a range which is using the
81	# same symbol to seperate the range and the negative sign
82	# add the negative sign to the front of the next value
83	# remove the empty string
84	for ii in range(len(values)):	5✔
85	if values[ii] == "":	5✔
86	values[ii + 1] = range_seperator + values[ii + 1]	5✔
87	values[ii] = None	5✔
88
89	values = [v for v in values if v is not None]	5✔
90
91	return values	5✔
92
93
94	###############################################################################
95	def get_values(item, lang="en_US"):	5✔
96	"""
97	Extract value from regex hit. context is the enclosing text on which the regex hit.
98	"""
99
100	def callback(pattern):	5✔
101	return " %s" % (reg.unicode_fractions()[pattern.group(0)])	5✔
102
103	fracs = r"\|".join(reg.unicode_fractions())	5✔
104
105	value = item.group("value")	5✔
106	# Remove grouping operators
107	value = re.sub(	5✔
108	r"(?<=\d)[%s](?=\d{3})" % reg.grouping_operators_regex(lang), "", value
109	)
110	# Replace unusual exponents by e (including e)
111	value = re.sub(	5✔
112	r"(?<=\d)(%s)(e\|E\|10)\^?" % reg.multiplication_operators_regex(lang), "e", value
113	)
114	# calculate other exponents
115	value, factors = resolve_exponents(value)	5✔
116	_LOGGER.debug("After exponent resolution: {}".format(value))	5✔
117
118	value = re.sub(fracs, callback, value, re.IGNORECASE)	5✔
119
120	range_separator = re.findall(	5✔
121	r"\d+ ?((?:-\ )?(?:%s)) ?\d" % "\|".join(reg.ranges(lang)), value
122	)
123
124	uncer_separator = re.findall(	5✔
125	r"\d+ ?(%s) ?\d" % "\|".join(reg.uncertainties(lang)), value
126	)
127	fract_separator = re.findall(r"\d+/\d+", value)	5✔
128
129	value = re.sub(" +", " ", value)	5✔
130	uncertainty = None	5✔
131	if range_separator:	5✔
132	# A range just describes an uncertain quantity
133	values = split_range(value, range_separator[0])	5✔
134	values = [	5✔
135	float(re.sub(r"-$", "", v)) * factors[i] for i, v in enumerate(values)
136	]
137	if values[1] < values[0]:	5✔
138	raise ValueError(	5✔
139	"Invalid range, with second item being smaller than the first item"
140	)
141	mean = sum(values) / len(values)	5✔
142	uncertainty = mean - min(values)	5✔
143	values = [mean]	5✔
144	elif uncer_separator:	5✔
145	values = [float(i) for i in value.split(uncer_separator[0])]	5✔
146	uncertainty = values[1] * factors[1]	5✔
147	values = [values[0] * factors[0]]	5✔
148	elif fract_separator:	5✔
149	values = value.split()	5✔
150	try:	5✔
151	if len(values) > 1:	5✔
152	values = [float(values[0]) * factors[0] + float(Fraction(values[1]))]	5✔
153	else:
154	values = [float(Fraction(values[0]))]	5✔
155	except ZeroDivisionError as e:	5✔
156	raise ValueError("{} is not a number".format(values[0]), e)	5✔
157	else:
158	values = [float(re.sub(r"-$", "", value)) * factors[0]]	5✔
159
160	_LOGGER.debug("\tUncertainty: %s", uncertainty)	5✔
161	_LOGGER.debug("\tValues: %s", values)	5✔
162
163	return uncertainty, values	5✔
164
165
166	###############################################################################
167	def resolve_exponents(value, lang="en_US"):	5✔
168	"""Resolve unusual exponents (like 2^4) and return substituted string and
169	factor
170
171	Params:
172	value: str, string with only one value
173	Returns:
174	str, string with basis and exponent removed
175	array of float, factors for multiplication
176
177	"""
178	factors = []	5✔
179	matches = re.finditer(	5✔
180	reg.number_pattern_groups(lang), value, re.IGNORECASE \| re.VERBOSE
181	)
182	for item in matches:	5✔
183	if item.group("base") and item.group("exponent"):	5✔
184	base = item.group("base")	5✔
185	exp = item.group("exponent")	5✔
186	if base in ["e", "E"]:	5✔
187	# already handled by float
188	factors.append(1)	5✔
189	continue	5✔
190	# exp = '10'
191	# Expect that in a pure decimal base,
192	# either ^ or superscript notation is used
193	if re.match(r"\d+\^?", base):	5✔
194	if not (	5✔
195	"^" in base
196	or re.match(r"[%s]" % reg.unicode_superscript_regex(), exp)
197	):
198	factors.append(1)	5✔
199	continue	5✔
200	for superscript, substitute in reg.unicode_superscript().items():	5✔
201	exp.replace(superscript, substitute)	5✔
202	exp = float(exp)	5✔
203	base = float(base.replace("^", ""))	5✔
204	factor = base**exp	5✔
205	stripped = str(value).replace(item.group("scale"), "")	5✔
206	value = stripped	5✔
207	factors.append(factor)	5✔
208	_LOGGER.debug(	5✔
209	"Replaced {} by factor {}".format(item.group("scale"), factor)
210	)
211	else:
212	factors.append(1)	5✔
213	continue	5✔
214	return value, factors	5✔
215
216
217	###############################################################################
218	def build_unit_name(dimensions, lang="en_US"):	5✔
219	"""
220	Build the name of the unit from its dimensions.
221	"""
222	name = _get_parser(lang).name_from_dimensions(dimensions)	5✔
223
224	_LOGGER.debug("\tUnit inferred name: %s", name)	5✔
225
226	return name	5✔
227
228
229	###############################################################################
230	def get_unit_from_dimensions(dimensions, text, lang="en_US", classifier_path=None):	5✔
231	"""
232	Reconcile a unit based on its dimensionality.
233	"""
234
235	key = load.get_key_from_dimensions(dimensions)	5✔
236
237	try:	5✔
238	unit = load.units(lang).derived[key]	5✔
239	except KeyError:	5✔
240	_LOGGER.debug("\tCould not find unit for: %s", key)	5✔
241	unit = cls.Unit(	5✔
242	name=build_unit_name(dimensions, lang),
243	dimensions=dimensions,
244	entity=get_entity_from_dimensions(dimensions, text, lang, classifier_path),
245	)
246
247	# Carry on original composition
248	unit.original_dimensions = dimensions	5✔
249	return unit	5✔
250
251
252	def name_from_dimensions(dimensions, lang="en_US"):	5✔
253	"""
254	Build the name of a unit from its dimensions.
255	Param:
256	dimensions: List of dimensions
257	"""
258	return _get_parser(lang).name_from_dimensions(dimensions)	5✔
259
260
261	def infer_name(unit):	5✔
262	"""
263	Return unit name based on dimensions
264	:return: new name of this unit
265	"""
266	name = name_from_dimensions(unit.dimensions) if unit.dimensions else None	×
267	return name	×
268
269
270	###############################################################################
271	def get_entity_from_dimensions(dimensions, text, lang="en_US", classifier_path=None):	5✔
272	"""
273	Infer the underlying entity of a unit (e.g. "volume" for "m^3") based on
274	its dimensionality.
275	"""
276
277	new_derived = [	5✔
278	{"base": load.units(lang).names[i["base"]].entity.name, "power": i["power"]}
279	for i in dimensions
280	]
281
282	final_derived = sorted(new_derived, key=lambda x: x["base"])	5✔
283	key = load.get_key_from_dimensions(final_derived)	5✔
284
285	ent = dis.disambiguate_entity(key, text, lang, classifier_path)	5✔
286	if ent is None:	5✔
287	_LOGGER.debug("\tCould not find entity for: %s", key)	5✔
288	ent = cls.Entity(name="unknown", dimensions=new_derived)	5✔
289
290	return ent	5✔
291
292
293	###############################################################################
294	def parse_unit(item, unit, slash, lang="en_US"):	5✔
295	"""
296	Parse surface and power from unit text.
297	"""
298	return _get_parser(lang).parse_unit(item, unit, slash)	5✔
299
300
301	###############################################################################
def get_unit(item, text, lang="en_US", classifier_path=None):
    """
    Extract unit from regex hit.

    :param item: regex match exposing the named groups ``prefix``,
        ``unit1``..``unit4`` and ``operator1``..``operator4``
    :param text: the text the match was found in (used for disambiguation
        and for the debug output of the shortened surface)
    :param lang: language code used to select units and operators
    :param classifier_path: forwarded to the disambiguation helpers
    :return: tuple ``(unit, unit_shortening)`` where ``unit_shortening`` is
        the number of characters to drop from the end of the match because
        an inconsistent operator cut the unit short (0 if nothing was cut)
    """

    group_units = ["prefix", "unit1", "unit2", "unit3", "unit4"]
    group_operators = ["operator1", "operator2", "operator3", "operator4"]
    # How much of the end is removed because of an "incorrect" regex match
    unit_shortening = 0

    item_units = [item.group(i) for i in group_units if item.group(i)]

    if not item_units:
        unit = load.units(lang).names["dimensionless"]
    else:
        derived, slash = [], False
        # ``False`` means "no multiplication operator seen yet"; ``None`` is a
        # valid operator value (units directly following each other)
        multiplication_operator = False
        for index, group_name in enumerate(group_units):
            unit = item.group(group_name)
            # The prefix group (index 0) has no preceding operator group
            operator_index = None if index < 1 else group_operators[index - 1]
            operator = None if index < 1 else item.group(operator_index)

            # disallow spaces as operators in units expressed in their symbols
            # Enforce consistency among multiplication and division operators
            # Single exceptions are colloquial number abbreviations (5k miles)
            if operator in reg.multiplication_operators(lang) or (
                operator is None
                and unit
                and not (index == 1 and unit in reg.suffixes(lang))
            ):
                if multiplication_operator != operator and not (
                    index == 1 and str(operator).isspace()
                ):
                    if multiplication_operator is False:
                        multiplication_operator = operator
                    else:
                        # Cut if inconsistent multiplication operator
                        # treat the None operator differently - remove the
                        # whole word of it
                        if operator is None:
                            # For this, use the last consistent operator
                            # (before the current) with a space
                            # which should always be the preceding operator
                            derived.pop()
                            operator_index = group_operators[index - 2]
                        # Remove (original length - new end) characters
                        unit_shortening = item.end() - item.start(operator_index)
                        # Lazy %-style arguments: the message is only
                        # rendered when DEBUG logging is actually enabled
                        _LOGGER.debug(
                            "Because operator inconsistency, cut from "
                            "operator: '%s', new surface: %s",
                            operator,
                            text[item.start() : item.end() - unit_shortening],
                        )
                        break

            # Determine whether a negative power has to be applied to following
            # units
            if operator and not slash:
                slash = any(i in operator for i in reg.division_operators(lang))
            # Determine which unit follows
            if unit:
                unit_surface, power = parse_unit(item, unit, slash, lang)
                base = dis.disambiguate_unit(unit_surface, text, lang, classifier_path)
                derived += [{"base": base, "power": power, "surface": unit_surface}]

        unit = get_unit_from_dimensions(derived, text, lang, classifier_path)

    _LOGGER.debug("\tUnit: %s", unit)
    _LOGGER.debug("\tEntity: %s", unit.entity)

    return unit, unit_shortening
374
375
376	###############################################################################
def get_surface(shifts, orig_text, item, text, unit_shortening=0):
    """
    Extract surface from regex hit.

    :param shifts: mapping from a position in ``text`` to the offset that has
        to be subtracted to obtain the position in ``orig_text``
    :param orig_text: text the returned surface and span refer to
    :param item: regex match found in ``text``
    :param text: text the match was found in
    :param unit_shortening: number of characters to drop from the match end
    :return: tuple ``(surface, real_span)``; the surface has trailing spaces
        and hyphens as well as leading spaces removed and the span is
        narrowed accordingly
    """

    start = item.start()
    # handle cut end
    end = item.end() - unit_shortening
    # extend with as many spaces as are possible (this is to handle cleaned text)
    while end < len(text) and text[end] == " ":
        end += 1
    span = (start, end)

    _LOGGER.debug('\tInitial span: %s ("%s")', span, text[start:end])

    # Map the span back onto the original text
    real_start = start - shifts[start]
    real_end = end - shifts[end - 1]
    surface = orig_text[real_start:real_end]
    _LOGGER.debug('\tShifted span: %s ("%s")', (real_start, real_end), surface)

    # Drop trailing spaces/hyphens, then leading spaces, moving the span
    # boundaries by the number of characters removed on each side
    without_tail = surface.rstrip(" -")
    real_end -= len(surface) - len(without_tail)
    surface = without_tail.lstrip(" ")
    real_start += len(without_tail) - len(surface)

    real_span = (real_start, real_end)
    _LOGGER.debug('\tFinal span: %s ("%s")', real_span, surface)
    return surface, real_span
406
407
408	###############################################################################
def is_quote_artifact(orig_text, span):
    """
    Distinguish between quotes and units.

    Scans ``orig_text`` for one or two characters enclosed in quote marks
    (single or double) and returns the first such match whose end position
    lies within ``span`` (both bounds inclusive). Returns ``False`` when no
    match qualifies.
    """

    quoted = re.finditer(r'["\'][^ .,:;?!()+-].?["\']', orig_text)
    return next((hit for hit in quoted if span[0] <= hit.end() <= span[1]), False)
423
424
425	###############################################################################
def build_quantity(
    orig_text,
    text,
    item,
    values,
    unit,
    surface,
    span,
    uncert,
    lang="en_US",
    classifier_path=None,
):
    """
    Build a Quantity object out of extracted information.
    Takes care of caveats and common errors

    The actual construction is delegated to the ``build_quantity`` function
    of the parser module selected for ``lang``; ``lang`` itself is not
    forwarded, all other arguments are passed on in order.
    """
    language_parser = _get_parser(lang)
    return language_parser.build_quantity(
        orig_text, text, item, values, unit, surface, span, uncert, classifier_path
    )
445
446
447	###############################################################################
def clean_text(text, lang="en_US"):
    """
    Clean text before parsing.

    First maps a few unicode characters onto ASCII look-alikes, then applies
    the ``clean_text`` function of the parser module selected for ``lang``.
    """

    # Replace a few nasty unicode characters with their ASCII equivalent
    ascii_equivalents = str.maketrans({"×": "x", "–": "-", "−": "-"})
    text = text.translate(ascii_equivalents)

    # Language specific cleaning
    text = _get_parser(lang).clean_text(text)

    _LOGGER.debug('Clean text: "%s"', text)

    return text
464
465
466	###############################################################################
def extract_range_ands(text, lang="en_US"):
    """
    Run the ``extract_range_ands`` function of the parser module selected
    for ``lang`` on ``text`` and return its result unchanged.
    """
    language_parser = _get_parser(lang)
    return language_parser.extract_range_ands(text)
469
470
471	###############################################################################
def handle_consecutive_quantities(quantities, context):
    """
    Merge or complete neighbouring quantities.

    [45] and/or [50 mg] --> add unit to first [45 mg] [50 mg]
    between [44 mg] and [50 mg] --> range [47+/-3 mg]
    [44 mg] to [50 mg] --> range [47+/-3 mg]

    :param quantities: quantities in the order they were found
    :param context: text that ``is_ranged`` / ``is_coordinated`` inspect and
        from which the surface of a merged range is sliced
    :return: new list of quantities (the input itself if it is empty)
    """
    if not quantities:
        return quantities

    merged = []
    consumed_by_range = False
    for first, second in zip(quantities, quantities[1:]):
        if consumed_by_range:
            # ``first`` was already folded into the range built one step ago
            consumed_by_range = False
            continue

        range_span = is_ranged(first, second, context)
        if not range_span:
            if (
                is_coordinated(first, second, context)
                and first.unit.name == "dimensionless"
            ):
                # Coordinated pair: the unit-less first value inherits the unit
                first = first.with_vals(unit=second.unit)
        elif (
            (first.unit.name == second.unit.name or first.unit.name == "dimensionless")
            and first.uncertainty is None
            and second.uncertainty is None
            and first.value != second.value
        ):
            if second.value > first.value:
                low, high = first, second
            else:
                low, high = second, first
            # Represent the range as midpoint +/- half width
            midpoint = (low.value + high.value) / 2.0
            first = first.with_vals(
                uncertainty=high.value - midpoint,
                value=midpoint,
                unit=second.unit,
                span=range_span,
                surface=context[range_span[0] : range_span[1]],
            )
            consumed_by_range = True
        merged.append(first)

    # The last quantity never appears as the first element of a pair
    if not consumed_by_range:
        merged.append(quantities[-1])
    return merged
514
515
516	###############################################################################
def parse(
    text, lang="en_US", verbose=False, classifier_path=None
) -> List[cls.Quantity]:
    """
    Extract all quantities from unstructured text.

    Parameters
    ----------
    text : str
        Text to parse.
    lang : str
        Language of the text. Default is "en_US".
    verbose : bool
        If True, print debug information. Default is False.
    classifier_path : str
        Path to the classifier model. Default is None, which uses the default
        model for the given language.

    Returns
    -------
    quantities : List[Quantity]
        List of quantities found in the text.
    """

    log_format = "%(asctime)s --- %(message)s"
    logging.basicConfig(format=log_format)

    prev_level = None
    if verbose:  # pragma: no cover
        prev_level = logging.root.getEffectiveLevel()
        logging.root.setLevel(logging.DEBUG)
        _LOGGER.debug("Verbose mode")

    quantities = []
    try:
        orig_text = text
        _LOGGER.debug('Original text: "%s"', orig_text)

        text = clean_text(text, lang)
        values = extract_spellout_values(text, lang)
        text, shifts = substitute_values(text, values)

        for item in reg.units_regex(lang).finditer(text):
            # Only the non-blank named groups are of interest for debugging
            groups = {
                name: value
                for name, value in item.groupdict().items()
                if value and value.strip()
            }
            _LOGGER.debug("Quantity found: %s", groups)

            try:
                uncert, values = get_values(item, lang)

                unit, unit_shortening = get_unit(item, text, lang, classifier_path)
                surface, span = get_surface(
                    shifts, orig_text, item, text, unit_shortening
                )
                objs = build_quantity(
                    orig_text,
                    text,
                    item,
                    values,
                    unit,
                    surface,
                    span,
                    uncert,
                    lang,
                    classifier_path,
                )
                if objs is not None:
                    quantities += objs
            except ValueError as err:
                # A match that cannot be turned into a quantity is skipped
                _LOGGER.debug("Could not parse quantity: %s", err)
    finally:
        # Restore the root log level even if parsing raised, so that verbose
        # mode never leaks DEBUG logging into the caller's process
        if verbose:  # pragma: no cover
            logging.root.setLevel(prev_level)

    # NOTE(review): the spans returned by get_surface index into the original
    # text, while the cleaned/substituted text is passed as context here -
    # confirm this is intended when spelled-out numbers changed the length.
    quantities = handle_consecutive_quantities(quantities, text)
    return quantities
588
589
590	###############################################################################
def inline_parse(text, verbose=False, *, lang="en_US"):  # pragma: no cover
    """
    Extract all quantities from unstructured text and annotate them inline.

    After the span of every parsed quantity, `` {<quantity>}`` (the string
    form of the quantity in braces) is inserted into the text.

    :param text: text to parse and annotate
    :param verbose: forwarded to ``parse``
    :param lang: language forwarded to ``parse``; keyword-only so that
        existing positional calls keep their meaning
    :return: the annotated text
    """

    parsed = parse(text, lang=lang, verbose=verbose)

    # Number of characters inserted so far; spans refer to the original text
    shift = 0
    for quantity in parsed:
        index = quantity.span[1] + shift
        to_add = " {" + str(quantity) + "}"
        text = text[0:index] + to_add + text[index:]
        shift += len(to_add)

    return text
606
607
608	###############################################################################
def inline_parse_and_replace(text, lang="en_US", verbose=False):  # pragma: no cover
    """
    Parse text and replace with the standardised quantities as string

    Every parsed quantity's span in ``text`` is substituted by
    ``str(quantity)``; the modified text is returned.
    """

    # Net length change caused by the replacements made so far; quantity
    # spans refer to the unmodified text
    offset = 0
    for quantity in parse(text, lang=lang, verbose=verbose):
        start, end = quantity.span[0], quantity.span[1]
        replacement = str(quantity)
        text = text[: start + offset] + replacement + text[end + offset :]
        offset += len(replacement) - (end - start)

    return text
625
626
627	###############################################################################
def inline_parse_and_expand(text, lang="en_US", verbose=False):
    """
    Parse text and replace quantities with speakable version

    Every parsed quantity's span in ``text`` is substituted by the result of
    its ``to_spoken()`` method; the modified text is returned.
    """
    # Net length change caused by the replacements made so far; quantity
    # spans refer to the unmodified text
    offset = 0
    for quantity in parse(text, lang=lang, verbose=verbose):
        start, end = quantity.span[0], quantity.span[1]
        spoken = quantity.to_spoken()
        text = text[: start + offset] + spoken + text[end + offset :]
        offset += len(spoken) - (end - start)

    return text

nielstron / quantulum3 / 946

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous