• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

barseghyanartur / faker-file / 5305826025

pending completion
5305826025

push

github

barseghyanartur
Prepare 0.15.5

1793 of 1795 relevant lines covered (99.89%)

5.09 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

100.0
/src/faker_file/providers/augment_file_from_dir/extractors/tika_extractor.py
1
from pathlib import Path
4✔
2
from typing import Union
4✔
3

4
import tika
4✔
5
from tika import parser
4✔
6

7
from ...base.text_extractor import BaseTextExtractor
4✔
8

9
__author__ = "Artur Barseghyan <artur.barseghyan@gmail.com>"
4✔
10
__copyright__ = "2022-2023 Artur Barseghyan"
4✔
11
__license__ = "MIT"
4✔
12
__all__ = ("TikaTextExtractor",)
4✔
13

14

15
class TikaTextExtractor(BaseTextExtractor):
    """Text extractor based on `Apache Tika` and `tika-python`.

    Text is obtained by handing the source file to ``tika.parser.from_file``
    and returning the ``"content"`` entry of the result.

    Usage example:

        from faker import Faker
        from faker_file.providers.augment_file_from_dir import (
            AugmentFileFromDirProvider,
        )
        from faker_file.providers.augment_file_from_dir.extractors import (
            tika_extractor,
        )

        FAKER = Faker()

        file = AugmentFileFromDirProvider(FAKER).augment_file_from_dir(
            text_extractor_cls=tika_extractor.TikaTextExtractor
        )
    """

    def handle_kwargs(self: "TikaTextExtractor", **kwargs) -> None:
        """Handle kwargs.

        This extractor reads no options: any keyword arguments are accepted
        and ignored.
        """

    def extract(
        self: "TikaTextExtractor",
        source_file: Union[Path, str],
    ) -> str:
        """Extract text from ``source_file``.

        :param source_file: Path to the file to extract text from. It is
            converted to ``str`` before being passed to Tika.
        :return: The extracted text, or an empty string when the parse
            result carries no text.
        """
        tika.initVM()
        parsed = parser.from_file(str(source_file))
        # NOTE(review): tika-python appears to report "no text" as a ``None``
        # (or absent) "content" entry - confirm against the tika-python docs.
        # Normalise that case to "" so the declared ``str`` return type
        # always holds for callers.
        return parsed.get("content") or ""
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc