• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

IBM / unitxt / 13481713164

23 Feb 2025 09:27AM UTC coverage: 80.826% (-0.09%) from 80.918%
13481713164

push

github

web-flow
Enable offline mode for Hugging Face by using local pre-downloaded metrics, datasets and models (#1603)

* Enable local caching with UNITXT_LOCAL_CACHE

Signed-off-by: elronbandel <elronbandel@gmail.com>

* Add cache_dir parameter to model and tokenizer loading for improved performance

Signed-off-by: elronbandel <elronbandel@gmail.com>

* Update line numbers and generated timestamp in secrets baseline

Signed-off-by: elronbandel <elronbandel@gmail.com>

* Update line number and generated timestamp in secrets baseline

Signed-off-by: elronbandel <elronbandel@gmail.com>

* Fix metrics.py

Signed-off-by: elronbandel <elronbandel@gmail.com>

* Use huggingface prefix

Signed-off-by: elronbandel <elronbandel@gmail.com>

* Clarify documentation for offline Hugging Face model, metrics, and datasets paths

Signed-off-by: elronbandel <elronbandel@gmail.com>

* Rename hf_offline_loaders_path to hf_offline_models_path for clarity

Signed-off-by: elronbandel <elronbandel@gmail.com>

---------

Signed-off-by: elronbandel <elronbandel@gmail.com>

1540 of 1899 branches covered (81.1%)

Branch coverage included in aggregate %.

9677 of 11979 relevant lines covered (80.78%)

0.81 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

86.59
src/unitxt/string_operators.py
1
import os
1✔
2
import re
1✔
3
from typing import (
1✔
4
    Any,
5
    Dict,
6
    List,
7
    Optional,
8
)
9

10
from .operators import FieldOperator, InstanceOperator
1✔
11
from .settings_utils import get_settings
1✔
12

13
# Module-level settings object; the tokenizer-based operators in this module
# read ``hf_offline_models_path`` from it when loading their tokenizer.
settings = get_settings()
1✔
14

15
class Split(FieldOperator):
    """Break a string value into a list of parts around a fixed separator."""

    # Separator handed to ``str.split``.
    by: str

    def process_value(self, value: str) -> List[str]:
        """Return the pieces of ``value`` obtained by splitting on ``self.by``."""
        separator = self.by
        pieces = value.split(separator)
        return pieces
1✔
20

21

22
class RegexSplit(FieldOperator):
    """Break a string value into a list of parts around regex matches."""

    # Regular expression used as the split pattern for ``re.split``.
    by: str

    def process_value(self, value: str) -> List[str]:
        """Return the result of ``re.split`` with ``self.by`` applied to ``value``."""
        pieces = re.split(pattern=self.by, string=value)
        return pieces
1✔
27

28

29
class TokensSplit(FieldOperator):
    """Tokenize a string value with a ``transformers`` tokenizer."""

    # Name (or relative path) passed to ``AutoTokenizer.from_pretrained``.
    model: str
    _requirements_list = ["transformers"]

    def prepare(self):
        """Load the tokenizer for ``self.model``.

        When ``settings.hf_offline_models_path`` is set, the tokenizer is
        loaded from ``self.model`` joined onto that directory.
        """
        super().prepare()
        from transformers import AutoTokenizer

        offline_root = settings.hf_offline_models_path
        if offline_root is None:
            tokenizer_source = self.model
        else:
            tokenizer_source = os.path.join(offline_root, self.model)
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_source)

    def process_value(self, value: str) -> List[str]:
        """Return the tokens the loaded tokenizer produces for ``value``."""
        tokens = self.tokenizer.tokenize(value)
        return tokens
1✔
43

44

45
class TokensSlice(FieldOperator):
    """Keep only a slice of a string's encoded token sequence.

    The value is encoded with the tokenizer, the encoded sequence is sliced
    with ``slice(start, stop, step)``, and the slice is decoded back to text.
    """

    # Name (or relative path) passed to ``AutoTokenizer.from_pretrained``.
    model: str
    # Slice bounds applied to the encoded sequence; ``None`` leaves a bound open.
    start: Optional[int] = None
    stop: Optional[int] = None
    step: Optional[int] = None

    _requirements_list = ["transformers"]

    def prepare(self):
        """Load the tokenizer for ``self.model``.

        When ``settings.hf_offline_models_path`` is set, the tokenizer is
        loaded from ``self.model`` joined onto that directory.
        """
        super().prepare()
        from transformers import AutoTokenizer

        offline_root = settings.hf_offline_models_path
        if offline_root is None:
            tokenizer_source = self.model
        else:
            tokenizer_source = os.path.join(offline_root, self.model)
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_source)

    def process_value(self, value: str) -> str:
        """Encode ``value``, slice the encoding, and decode the kept part."""
        token_ids = self.tokenizer.encode(value)
        window = slice(self.start, self.stop, self.step)
        return self.tokenizer.decode(token_ids[window])
1✔
66

67

68
class Join(FieldOperator):
    """Concatenate a list of strings into one string with a separator."""

    # Separator placed between consecutive items.
    by: str

    def process_value(self, value: List[str]) -> str:
        """Return the items of ``value`` joined with ``self.by``."""
        separator = self.by
        joined = separator.join(value)
        return joined
1✔
73

74

75
class FormatText(InstanceOperator):
    """Render a ``str.format`` template from an instance's own fields."""

    # Key of the instance under which the rendered text is stored.
    to_field: str
    # Template string; the instance's entries are supplied as keyword arguments.
    text: str

    def process(
        self, instance: Dict[str, Any], stream_name: Optional[str] = None
    ) -> Dict[str, Any]:
        """Format ``self.text`` with ``instance`` and store it in ``to_field``.

        The instance is modified in place and also returned.
        """
        rendered = self.text.format(**instance)
        instance[self.to_field] = rendered
        return instance
×
84

85

86
class Strip(FieldOperator):
    """Remove leading and trailing whitespace from a string value."""

    def process_value(self, value: str) -> str:
        """Return ``value.strip()``."""
        stripped = value.strip()
        return stripped
×
89

90

91
class Replace(FieldOperator):
    """Substitute every occurrence of one substring with another."""

    # Substring to look for.
    old: str
    # Text that takes the place of each occurrence of ``old``.
    new: str

    def process_value(self, value: str) -> str:
        """Return ``value`` with ``self.old`` replaced by ``self.new``."""
        target, substitute = self.old, self.new
        return value.replace(target, substitute)
×
97

98

99
class MapReplace(FieldOperator):
    """Apply a series of substring replacements taken from a mapping.

    Replacements run one after another in the mapping's iteration order,
    each on the result of the previous one.
    """

    # Maps each substring to search for onto its replacement text.
    mapping: Dict[str, str]

    def process_value(self, value: Any) -> Any:
        """Return ``value`` after applying every replacement in ``self.mapping``."""
        for source, target in self.mapping.items():
            value = value.replace(source, target)
        return value
×
106

107

108
class RegexReplace(FieldOperator):
    """Replace regex matches inside string values.

    Non-string values are returned unchanged.
    """

    pattern: str  # A regex pattern
    replacement: str  # The replacement string or template

    def prepare(self):
        """Compile ``self.pattern`` once so it is not re-parsed per value."""
        super().prepare()
        # Keep ``pattern`` as the declared ``str`` and store the compiled form
        # separately, so the field never silently changes type after prepare().
        self._compiled_pattern = re.compile(self.pattern)

    def process_value(self, value: Any) -> Any:
        """Return ``value`` with every match substituted by ``self.replacement``.

        Args:
            value: The field value; only ``str`` instances are processed.

        Returns:
            The substituted string, or ``value`` itself when it is not a string.
        """
        if not isinstance(value, str):
            return value  # If not a string, return the value as is
        return self._compiled_pattern.sub(self.replacement, value)
×
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc