• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

apertium / apertium-apy / 4241413648

pending completion
4241413648

push

github

Kevin Brubeck Unhammer
pipenv install --dev

361 of 913 branches covered (39.54%)

Branch coverage included in aggregate %.

1251 of 2281 relevant lines covered (54.84%)

0.55 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

48.44
/apertium_apy/handlers/identify_lang.py
1
from datetime import timedelta
1✔
2
import re
1✔
3

4
from tornado import gen
1✔
5

6
try:
1✔
7
    import fasttext
1✔
8
except ImportError:
×
9
    fasttext = None
×
10
try:
1✔
11
    import cld2full as cld2  # type: ignore
1✔
12
except ImportError:
1✔
13
    cld2 = None
1✔
14

15
from apertium_apy.handlers.base import BaseHandler
1✔
16
from apertium_apy.utils import get_coverages, to_alpha3_code
1✔
17

18

19
def fasttext_strip_prefix(s):
    """Remove the initial ``__label__`` prefix from a fastText label.

    :param s: label string, normally of the form ``__label__<lang>``.
    :return: the label without the leading ``__label__``; if ``s`` does not
        start with that prefix it is returned unchanged.
    """
    prefix = '__label__'
    # Only strip when the prefix is really there: a blind ``s[9:]`` would
    # silently eat the first nine characters of an unprefixed (or
    # differently prefixed) label and yield a bogus language code.
    if s.startswith(prefix):
        return s[len(prefix):]
    return s
×
22

23

24
# Upper bound on how many characters of the input text are handed to the
# fastText model; fasttext_identify truncates longer text to this length.
fasttext_max_input = 2048
1✔
25

26
# there's no [:punct:] class in re module, include the most common here:
27
fasttext_punct_class = re.compile(r'([`~!@#$%^&*()_=+\[\]{}\\\|;:\"\'<>.,/?—–-]+)')
1✔
28

29

30
def fasttext_clean(s):
    """Lower-case ``s`` and surround each punctuation run with spaces.

    Per the original note, this should clean the same way ft-train/clean
    does, so that input matches what the model was trained on.

    :param s: raw input text.
    :return: the lower-cased text in which every run of characters matched
        by ``fasttext_punct_class`` has a space added on both sides.
    """
    lowered = s.lower()
    return fasttext_punct_class.sub(r' \1 ', lowered)
×
33

34

35
def fasttext_identify(model, text):
    """Identify the language of ``text`` with a fastText model.

    The text is truncated to ``fasttext_max_input`` characters and passed
    through ``fasttext_clean`` before prediction.

    :param model: object with a ``predict(text, k=..., threshold=...)``
        method; presumably a loaded fastText model whose result is a
        ``(labels, scores)`` pair -- confirm against the fastText docs.
    :param text: the text to identify.
    :return: dict mapping ``to_alpha3_code(<label without prefix>)`` to the
        corresponding score, or ``{'nob': 1.0}`` when no labels come back.
    """
    snippet = fasttext_clean(text[:fasttext_max_input])
    # Ask for many candidates: the model may currently predict languages
    # outside possible_langs, and requesting more is still fast.
    prediction = model.predict(snippet, k=200, threshold=0.001)
    labels = prediction[0]
    if not labels:
        return {'nob': 1.0}  # TODO: better default

    scores = prediction[1]
    return {
        to_alpha3_code(fasttext_strip_prefix(label)): score
        for label, score in zip(labels, scores)
    }
×
46

47

48
def cld_identify(text):
    """Identify the language of ``text`` using the ``cld2`` module.

    :param text: the text to identify.
    :return: dict mapping ``to_alpha3_code(entry[1])`` to ``entry[2]`` for
        every entry of the third element of the ``cld2.detect`` result
        whose ``entry[1]`` is not ``'un'``; ``{'nob': 1.0}`` when the first
        element of the result is falsy.

    NOTE(review): presumably ``entry[1]`` is the language code, ``entry[2]``
    the percentage, and the first element a reliability flag -- confirm
    against the cld2 binding's documentation.
    """
    detection = cld2.detect(text)
    if not detection[0]:
        return {'nob': 1.0}  # TODO: better default

    return {
        to_alpha3_code(candidate[1]): candidate[2]
        for candidate in detection[2]
        if candidate[1] != 'un'
    }
×
56

57

58
class IdentifyLangHandler(BaseHandler):
    """Request handler that identifies the language of the ``q`` argument.

    Backends are tried in order: the fastText model stored on the class (if
    any), then ``cld2`` (if it could be imported), and finally
    ``get_coverages`` over ``self.analyzers``.
    """

    # fastText model used for identification; None disables that backend.
    # Presumably assigned from outside this class at start-up -- TODO confirm.
    fasttext = None

    @gen.coroutine
    def get(self):
        """Respond with the identification result for the ``q`` argument.

        Sends a 400 error when ``q`` is empty and a 408 error when the
        coverage-based fallback exceeds ``self.timeout`` seconds.
        """
        text = self.get_argument('q')
        if not text:
            return self.send_error(400, explanation='Missing q argument')

        if self.fasttext is not None:
            self.send_response(fasttext_identify(self.fasttext, text))
            return

        if cld2:
            self.send_response(cld_identify(text))
            return

        # No identification library available: fall back to coverages.
        try:
            deadline = timedelta(seconds=self.timeout)
            pending = get_coverages(text, self.analyzers, penalize=True)
            coverages = yield gen.with_timeout(deadline, pending)
            self.send_response(coverages)
        except gen.TimeoutError:
            self.send_error(408, explanation='Request timed out')
×
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc