• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

dynobo / normcap / 3857244254

pending completion
3857244254

Pull #341

github

GitHub
Merge 204223951 into a035bf0e5
Pull Request #341: Feature/UI for language download

49 of 163 new or added lines in 9 files covered. (30.06%)

22 existing lines in 2 files now uncovered.

1711 of 2183 relevant lines covered (78.38%)

2.23 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

87.1
/src/normcap/ocr/utils.py
1
import functools
3✔
2
import logging
3✔
3
import os
3✔
4
import traceback
3✔
5
from typing import Optional
3✔
6

7
from pytesseract import pytesseract
3✔
8

9
from normcap.version import Version
3✔
10

11
logger = logging.getLogger(__name__)
3✔
12

13

14
def tsv_to_list_of_dicts(tsv_data: dict) -> list[dict]:
3✔
15
    """Transpose tsv dict from k:list[v] to list[Dict[k:v]]."""
16
    words: list[dict] = [{} for _ in tsv_data["level"]]
3✔
17
    for k, values in tsv_data.items():
3✔
18
        for idx, v in enumerate(values):
3✔
19
            words[idx][k] = v
3✔
20

21
    # Filter empty words
22
    return [w for w in words if w["text"].strip()]
3✔
23

24

25
def get_tesseract_config(tessdata_path: Optional[os.PathLike]) -> str:
3✔
26
    """Get string with cli args to be passed into tesseract api."""
27
    return f'--tessdata-dir "{tessdata_path}"' if tessdata_path else ""
3✔
28

29

30
def get_tesseract_languages(
3✔
31
    tesseract_cmd: os.PathLike, tessdata_path: Optional[os.PathLike]
32
) -> list[str]:
33
    """Get info about tesseract setup."""
34
    pytesseract.tesseract_cmd = str(tesseract_cmd)
3✔
35

36
    try:
3✔
37
        languages = sorted(
3✔
38
            pytesseract.get_languages(config=get_tesseract_config(tessdata_path))
39
        )
40
    except RuntimeError as e:
×
41
        traceback.print_tb(e.__traceback__)
×
42
        raise RuntimeError(
×
43
            "Couldn't determine Tesseract information. If you pip installed NormCap "
44
            + "make sure Tesseract is installed and configured correctly."
45
        ) from e
46

47
    if not languages:
3✔
UNCOV
48
        raise ValueError(
×
49
            "Could not load any languages for tesseract. "
50
            + "On Windows, make sure that TESSDATA_PREFIX environment variable is set. "
51
            + "On Linux/macOS see if 'tesseract --list-langs' works in the command line."
52
        )
53

54
    return languages
3✔
55

56

57
@functools.lru_cache
3✔
58
def get_tesseract_version(tesseract_cmd: os.PathLike) -> Version:
3✔
59
    """Get info about tesseract setup."""
60
    pytesseract.tesseract_cmd = str(tesseract_cmd)
3✔
61
    tesseract_version = str(pytesseract.get_tesseract_version()).splitlines()[0]
3✔
62
    return Version(tesseract_version)
3✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc