• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

apertium / apertium-apy / 4435493307

pending completion
4435493307

push

github

Kevin Brubeck Unhammer
identifyLang: don't fail on newlines in input :)

361 of 913 branches covered (39.54%)

Branch coverage included in aggregate %.

1 of 1 new or added line in 1 file covered. (100.0%)

1251 of 2281 relevant lines covered (54.84%)

0.55 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

48.44
/apertium_apy/handlers/identify_lang.py
1
from datetime import timedelta
1✔
2
import re
1✔
3

4
from tornado import gen
1✔
5

6
try:
1✔
7
    import fasttext
1✔
8
except ImportError:
×
9
    fasttext = None
×
10
try:
1✔
11
    import cld2full as cld2  # type: ignore
1✔
12
except ImportError:
1✔
13
    cld2 = None
1✔
14

15
from apertium_apy.handlers.base import BaseHandler
1✔
16
from apertium_apy.utils import get_coverages, to_alpha3_code
1✔
17

18

19
def fasttext_strip_prefix(s, prefix='__label__'):
    """Remove the initial label prefix from a fastText label.

    Args:
        s: A label string as reported by the model, e.g. ``'__label__nob'``.
        prefix: The label prefix to strip; defaults to ``'__label__'``.

    Returns:
        ``s`` without the leading ``prefix``. A label that does not start
        with ``prefix`` is returned unchanged rather than being truncated.
    """
    if s.startswith(prefix):
        return s[len(prefix):]
    return s
×
22

23

24
# fasttext_identify only looks at this many leading characters of the input text.
fasttext_max_input = 2048
1✔
25

26
# The re module has no [:punct:] class, so the most common punctuation is listed here:
fasttext_punct_class = re.compile(r'([`~!@#$%^&*()_=+\[\]{}\\\|;:\"\'<>.,/?—–-]+)')


def fasttext_clean(s):
    """Lower-case ``s``, put it on one line and space out punctuation.

    Intended to clean the text the same way ft-train/clean does. Newlines
    become spaces, and every run of characters matched by
    ``fasttext_punct_class`` is surrounded by a space on each side.
    """
    single_line = s.lower().replace("\n", " ")
    return fasttext_punct_class.sub(r' \1 ', single_line)
34

35

36
def fasttext_identify(model, text):
    """Guess the language of ``text`` using a fastText ``model``.

    Only the first ``fasttext_max_input`` characters of ``text`` are used,
    after being passed through ``fasttext_clean``.

    Returns:
        A dict mapping ``to_alpha3_code(label)`` to the value the model
        reported for that label, or ``{'nob': 1.0}`` when the model
        returned no labels.
    """
    snippet = fasttext_clean(text[:fasttext_max_input])
    # Ask for many candidates, since the model might currently predict
    # things outside possible_langs – prediction is still fast.
    prediction = model.predict(snippet, k=200, threshold=0.001)
    if not prediction[0]:
        return {'nob': 1.0}  # TODO: better default

    scores = {}
    for label, score in zip(prediction[0], prediction[1]):
        scores[to_alpha3_code(fasttext_strip_prefix(label))] = score
    return scores
×
47

48

49
def cld_identify(text):
    """Guess the language of ``text`` using cld2.

    Returns:
        ``{'nob': 1.0}`` when the first element of the ``cld2.detect``
        result is falsy. Otherwise a dict built from the third element of
        the result: entries whose item 1 is ``'un'`` are skipped, and each
        remaining entry maps ``to_alpha3_code(item 1)`` to item 2.
    """
    # NOTE(review): item 1 is presumably a language code and item 2 a
    # percentage – confirm against the cld2 bindings.
    detection = cld2.detect(text)
    if not detection[0]:
        return {'nob': 1.0}  # TODO: better default

    return {
        to_alpha3_code(candidate[1]): candidate[2]
        for candidate in detection[2]
        if candidate[1] != 'un'
    }
×
57

58

59
class IdentifyLangHandler(BaseHandler):
    """Handler that answers with language guesses for the ``q`` argument.

    Backends are tried in this order: the ``fasttext`` attribute when it is
    not None, then cld2 when that module could be imported, and otherwise
    ``get_coverages`` over ``self.analyzers``.
    """

    # Model handed to fasttext_identify; None means that backend is skipped.
    fasttext = None

    @gen.coroutine
    def get(self):
        """Identify the language of the ``q`` argument and send the result.

        Sends a 400 error when ``q`` is empty, and a 408 error when the
        coverage-based fallback does not finish within ``self.timeout``
        seconds.
        """
        text = self.get_argument('q')
        if not text:
            return self.send_error(400, explanation='Missing q argument')

        if self.fasttext is not None:
            self.send_response(fasttext_identify(self.fasttext, text))
            return

        if cld2:
            self.send_response(cld_identify(text))
            return

        # Neither fastText nor cld2 is available: fall back to coverages.
        try:
            deadline = timedelta(seconds=self.timeout)
            pending = get_coverages(text, self.analyzers, penalize=True)
            coverages = yield gen.with_timeout(deadline, pending)
            self.send_response(coverages)
        except gen.TimeoutError:
            self.send_error(408, explanation='Request timed out')
×
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc