4435493307

Build Type

push

github

Committed by Kevin Brubeck Unhammer

Commit Message

identifyLang: don't fail on newlines in input :)

Run Details

361 of 913 branches covered (39.54%)

Branch coverage included in aggregate %.

1 of 1 new or added line in 1 file covered. (100.0%)

1251 of 2281 relevant lines covered (54.84%)

0.55 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

48.44

/apertium_apy/handlers/identify_lang.py

from datetime import timedelta
import re

from tornado import gen

try:
    import fasttext
except ImportError:
    fasttext = None
try:
    import cld2full as cld2  # type: ignore
except ImportError:
    cld2 = None

from apertium_apy.handlers.base import BaseHandler
from apertium_apy.utils import get_coverages, to_alpha3_code


def fasttext_strip_prefix(s):
    """Remove the initial __label__ prefix, if present.

    Strings that do not start with ``__label__`` are returned unchanged
    instead of losing their first nine characters.
    """
    prefix = '__label__'
    if s.startswith(prefix):
        return s[len(prefix):]
    return s


# Upper bound on how many characters of the input fasttext_identify keeps
# (it slices the text to this length before cleaning it).
fasttext_max_input = 2048

# There's no [:punct:] class in the re module, so the most common punctuation
# characters are listed explicitly here. The pattern matches a run of one or
# more of them and captures the run as group 1.
fasttext_punct_class = re.compile(r'([`~!@#$%^&*()_=+\[\]{}\\\|;:\"\'<>.,/?—–-]+)')


def fasttext_clean(s):
    """Lowercase s, flatten it to one line and space-pad punctuation runs.

    Should clean as ft-train/clean does; newlines become spaces so the
    result is a single line.
    """
    single_line = s.lower().replace("\n", " ")
    # Surround every run of punctuation with a space on each side.
    return fasttext_punct_class.sub(r' \1 ', single_line)


def fasttext_identify(model, text):
    """Identify the language of text with the given fastText model.

    Only the first fasttext_max_input characters of text are used, after
    being passed through fasttext_clean. Returns a dict keyed by
    to_alpha3_code() of each predicted label (with its __label__ prefix
    stripped), whose values are the matching entries of the second element
    returned by model.predict (presumably the prediction scores).
    Falls back to {'nob': 1.0} when the model returns no labels.
    """
    truncated = text[:fasttext_max_input]
    # Grab a bunch of results since currently the model might predict stuff
    # outside possible_langs – it's still fast:
    results = model.predict(fasttext_clean(truncated), k=200, threshold=0.001)

    labels = results[0]
    if not labels:
        return {'nob': 1.0}  # TODO: better default

    return {
        to_alpha3_code(fasttext_strip_prefix(label)): score
        for label, score in zip(labels, results[1])
    }


def cld_identify(text):
    """Identify the language of text using cld2.

    When the first element of cld2.detect()'s result is truthy, returns a
    dict built from the entries of its third element: the key is
    to_alpha3_code() of each entry's item 1 and the value is the entry's
    item 2 (presumably language code and percentage – confirm against the
    cld2 bindings). Entries whose item 1 is 'un' are skipped. Otherwise
    falls back to {'nob': 1.0}.
    """
    detection = cld2.detect(text)
    if not detection[0]:
        return {'nob': 1.0}  # TODO: better default

    return {
        to_alpha3_code(entry[1]): entry[2]
        for entry in detection[2]
        if entry[1] != 'un'
    }


class IdentifyLangHandler(BaseHandler):
    """Request handler that guesses the language of the ``q`` argument.

    Three strategies are tried in order of preference: the fastText model
    stored on the handler, the cld2 module if it was importable, and
    finally analyser coverage computed by get_coverages().
    """

    # fastText model used for identification; None means "not available".
    # NOTE(review): presumably assigned elsewhere at startup – confirm.
    fasttext = None

    @gen.coroutine
    def get(self):
        """Respond with the identification result for the ``q`` argument.

        Sends a 400 error when ``q`` is empty, and a 408 error when the
        coverage-based fallback does not finish within self.timeout seconds.
        """
        query = self.get_argument('q')
        if not query:
            return self.send_error(400, explanation='Missing q argument')

        if self.fasttext is not None:
            self.send_response(fasttext_identify(self.fasttext, query))
            return

        if cld2:
            self.send_response(cld_identify(query))
            return

        # Neither fastText nor cld2 is usable: fall back to coverage.
        time_limit = timedelta(seconds=self.timeout)
        try:
            coverages = yield gen.with_timeout(
                time_limit,
                get_coverages(query, self.analyzers, penalize=True),
            )
            self.send_response(coverages)
        except gen.TimeoutError:
            self.send_error(408, explanation='Request timed out')

1	from datetime import timedelta	1✔
2	import re	1✔
3
4	from tornado import gen	1✔
5
6	try:	1✔
7	import fasttext	1✔
8	except ImportError:	×
9	fasttext = None	×
10	try:	1✔
11	import cld2full as cld2 # type: ignore	1✔
12	except ImportError:	1✔
13	cld2 = None	1✔
14
15	from apertium_apy.handlers.base import BaseHandler	1✔
16	from apertium_apy.utils import get_coverages, to_alpha3_code	1✔
17
18
19	def fasttext_strip_prefix(s):	1✔
20	"""Remove the initial __label__ prefix"""
21	return s[9:]	×
22
23
24	fasttext_max_input = 2048	1✔
25
26	# there's no [:punct:] class in re module, include the most common here:
27	fasttext_punct_class = re.compile(r'([`~!@#$%^&*()_=+\[\]{}\\\\|;:\"\'<>.,/?—–-]+)')	1✔
28
29
30	def fasttext_clean(s):	1✔
31	"Should clean as ft-train/clean does; also keep input to one line."
32	return re.sub(fasttext_punct_class, r' \1 ',	×
33	s.lower().replace("\n", " "))
34
35
36	def fasttext_identify(model, text):	1✔
37	cleaned = fasttext_clean(text[:fasttext_max_input])	×
38	# Grab a bunch of results since currently the model might predict stuff outside possible_langs – it's still fast:
39	results = model.predict(cleaned, k=200, threshold=0.001)	×
40	if results[0]:	×
41	possible_langs = zip(map(fasttext_strip_prefix, results[0]),	×
42	results[1])
43	return {to_alpha3_code(possible_lang[0]): possible_lang[1]	×
44	for possible_lang in possible_langs}
45	else:
46	return {'nob': 1.0} # TODO: better default	×
47
48
49	def cld_identify(text):	1✔
50	cld_results = cld2.detect(text)	×
51	if cld_results[0]:	×
52	possible_langs = filter(lambda x: x[1] != 'un', cld_results[2])	×
53	return {to_alpha3_code(possible_lang[1]): possible_lang[2]	×
54	for possible_lang in possible_langs}
55	else:
56	return {'nob': 1.0} # TODO: better default	×
57
58
59	class IdentifyLangHandler(BaseHandler):	1✔
60	fasttext = None	1✔
61
62	@gen.coroutine	1✔
63	def get(self):	1✔
64	text = self.get_argument('q')	1✔
65	if not text:	1!
66	return self.send_error(400, explanation='Missing q argument')	×
67
68	if self.fasttext is not None:	1!
69	self.send_response(fasttext_identify(self.fasttext, text))	×
70	elif cld2:	1!
71	self.send_response(cld_identify(text))	×
72	else:
73	try:	1✔
74	coverages = yield gen.with_timeout(	1✔
75	timedelta(seconds=self.timeout),
76	get_coverages(text, self.analyzers, penalize=True),
77	)
78	self.send_response(coverages)	1✔
79
80	except gen.TimeoutError:	×
81	self.send_error(408, explanation='Request timed out')	×

apertium / apertium-apy / 4435493307

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous