4241413648

Build Type

push

github

Committed by Kevin Brubeck Unhammer

Commit Message

pipenv install --dev

Run Details

361 of 913 branches covered (39.54%)

Branch coverage included in aggregate %.

1251 of 2281 relevant lines covered (54.84%)

0.55 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

48.44

/apertium_apy/handlers/identify_lang.py

from datetime import timedelta
import re

from tornado import gen

try:
    import fasttext
except ImportError:
    fasttext = None
try:
    import cld2full as cld2  # type: ignore
except ImportError:
    cld2 = None

from apertium_apy.handlers.base import BaseHandler
from apertium_apy.utils import get_coverages, to_alpha3_code


def fasttext_strip_prefix(s):
    """Return the fastText label *s* without its leading ``__label__`` marker.

    The marker is only removed when it is actually present; a label that
    does not start with ``__label__`` is returned unchanged instead of
    losing its first nine characters.

    :param s: a label string, e.g. ``'__label__nob'``
    :returns: the label with the ``__label__`` prefix removed, e.g. ``'nob'``
    """
    prefix = '__label__'
    if s.startswith(prefix):
        return s[len(prefix):]
    return s


# Upper bound on how much of the input text is handed to the fastText model:
# fasttext_identify slices the text to this many characters before cleaning.
fasttext_max_input = 2048

# The re module has no POSIX [:punct:] class, so the most common punctuation
# characters are listed explicitly here. Each run of one or more of them is
# captured as group 1, which lets fasttext_clean re-insert the run padded
# with spaces. The hyphen stays last in the class so it is matched literally.
fasttext_punct_class = re.compile(r'([`~!@#$%^&*()_=+\[\]{}\\\|;:\"\'<>.,/?—–-]+)')


def fasttext_clean(s):
    """Lower-case *s* and surround each punctuation run with single spaces.

    Meant to mirror the cleaning done by ft-train/clean, so that text is
    normalised the same way at prediction time.

    :param s: the raw text to normalise
    :returns: the lower-cased text with every match of
        ``fasttext_punct_class`` padded by one space on each side
    """
    lowered = s.lower()
    # \1 is the matched punctuation run itself; only the padding is added.
    return fasttext_punct_class.sub(r' \1 ', lowered)


def fasttext_identify(model, text):
    """Guess the language of *text* with a fastText model.

    Only the first ``fasttext_max_input`` characters of *text* are cleaned
    and passed to the model.

    :param model: object exposing ``predict(text, k=..., threshold=...)``;
        its result is indexed as ``[0]`` for labels and ``[1]`` for the
        matching scores
    :param text: the text to identify
    :returns: dict mapping ``to_alpha3_code`` of each predicted label to its
        score, or ``{'nob': 1.0}`` when the model returns no labels
    """
    snippet = text[:fasttext_max_input]
    # Ask for plenty of candidates: the model may currently predict
    # languages outside possible_langs, and prediction is still fast.
    prediction = model.predict(fasttext_clean(snippet), k=200, threshold=0.001)

    labels = prediction[0]
    if not labels:
        return {'nob': 1.0}  # TODO: better default

    scores = prediction[1]
    return {to_alpha3_code(fasttext_strip_prefix(label)): score
            for label, score in zip(labels, scores)}


def cld_identify(text):
    """Guess the language of *text* with the cld2 detector.

    :param text: the text to identify
    :returns: dict mapping ``to_alpha3_code`` of each candidate's code
        (element 1) to that candidate's element 2, skipping candidates whose
        code is ``'un'``; ``{'nob': 1.0}`` when element 0 of the detection
        result is falsy
    """
    detection = cld2.detect(text)
    if not detection[0]:
        return {'nob': 1.0}  # TODO: better default

    identified = {}
    for candidate in detection[2]:
        code = candidate[1]
        # 'un' entries are dropped (presumably cld2's "unknown" marker).
        if code != 'un':
            identified[to_alpha3_code(code)] = candidate[2]
    return identified


class IdentifyLangHandler(BaseHandler):
    """Request handler that identifies the language of the ``q`` argument.

    Identification backends are tried in order: the fastText model stored on
    the class (if any), then cld2 (if it could be imported), and finally
    analyser coverage via ``get_coverages``.
    """

    # fastText model used for identification; None disables that backend.
    fasttext = None

    @gen.coroutine
    def get(self):
        """Respond with the language scores for the ``q`` request argument.

        Sends a 400 error when ``q`` is empty, and a 408 error when the
        coverage-based fallback does not finish within ``self.timeout``
        seconds.
        """
        query = self.get_argument('q')
        if not query:
            return self.send_error(400, explanation='Missing q argument')

        model = self.fasttext
        if model is not None:
            self.send_response(fasttext_identify(model, query))
            return

        if cld2:
            self.send_response(cld_identify(query))
            return

        # Neither fastText nor cld2 is available: fall back to coverages.
        deadline = timedelta(seconds=self.timeout)
        try:
            coverages = yield gen.with_timeout(
                deadline,
                get_coverages(query, self.analyzers, penalize=True),
            )
            self.send_response(coverages)
        except gen.TimeoutError:
            self.send_error(408, explanation='Request timed out')

1	from datetime import timedelta	1✔
2	import re	1✔
3
4	from tornado import gen	1✔
5
6	try:	1✔
7	import fasttext	1✔
8	except ImportError:	×
9	fasttext = None	×
10	try:	1✔
11	import cld2full as cld2 # type: ignore	1✔
12	except ImportError:	1✔
13	cld2 = None	1✔
14
15	from apertium_apy.handlers.base import BaseHandler	1✔
16	from apertium_apy.utils import get_coverages, to_alpha3_code	1✔
17
18
19	def fasttext_strip_prefix(s):	1✔
20	"""Remove the initial __label__ prefix"""
21	return s[9:]	×
22
23
24	fasttext_max_input = 2048	1✔
25
26	# there's no [:punct:] class in re module, include the most common here:
27	fasttext_punct_class = re.compile(r'([`~!@#$%^&*()_=+\[\]{}\\\\|;:\"\'<>.,/?—–-]+)')	1✔
28
29
30	def fasttext_clean(s):	1✔
31	"""Should clean as ft-train/clean does"""
32	return re.sub(fasttext_punct_class, r' \1 ', s.lower())	×
33
34
35	def fasttext_identify(model, text):	1✔
36	cleaned = fasttext_clean(text[:fasttext_max_input])	×
37	# Grab a bunch of results since currently the model might predict stuff outside possible_langs – it's still fast:
38	results = model.predict(cleaned, k=200, threshold=0.001)	×
39	if results[0]:	×
40	possible_langs = zip(map(fasttext_strip_prefix, results[0]),	×
41	results[1])
42	return {to_alpha3_code(possible_lang[0]): possible_lang[1]	×
43	for possible_lang in possible_langs}
44	else:
45	return {'nob': 1.0} # TODO: better default	×
46
47
48	def cld_identify(text):	1✔
49	cld_results = cld2.detect(text)	×
50	if cld_results[0]:	×
51	possible_langs = filter(lambda x: x[1] != 'un', cld_results[2])	×
52	return {to_alpha3_code(possible_lang[1]): possible_lang[2]	×
53	for possible_lang in possible_langs}
54	else:
55	return {'nob': 1.0} # TODO: better default	×
56
57
58	class IdentifyLangHandler(BaseHandler):	1✔
59	fasttext = None	1✔
60
61	@gen.coroutine	1✔
62	def get(self):	1✔
63	text = self.get_argument('q')	1✔
64	if not text:	1!
65	return self.send_error(400, explanation='Missing q argument')	×
66
67	if self.fasttext is not None:	1!
68	self.send_response(fasttext_identify(self.fasttext, text))	×
69	elif cld2:	1!
70	self.send_response(cld_identify(text))	×
71	else:
72	try:	1✔
73	coverages = yield gen.with_timeout(	1✔
74	timedelta(seconds=self.timeout),
75	get_coverages(text, self.analyzers, penalize=True),
76	)
77	self.send_response(coverages)	1✔
78
79	except gen.TimeoutError:	×
80	self.send_error(408, explanation='Request timed out')	×

apertium / apertium-apy / 4241413648

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous