• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

IBM / unitxt / 17061181673

19 Aug 2025 06:12AM UTC coverage: 80.228% (-0.9%) from 81.081%
17061181673

Pull #1706

github

web-flow
Merge 4d501037d into 7a48aa9d3
Pull Request #1706: Add audio support

1616 of 2029 branches covered (79.65%)

Branch coverage included in aggregate %.

10955 of 13640 relevant lines covered (80.32%)

0.8 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

80.21
src/unitxt/string_operators.py
1
import os
1✔
2
import re
1✔
3
from typing import (
1✔
4
    Any,
5
    Dict,
6
    List,
7
    Optional,
8
)
9

10
from .operators import FieldOperator, InstanceOperator
1✔
11
from .settings_utils import get_settings
1✔
12
from .utils import retry_connection_with_exponential_backoff
1✔
13

14
# Module-level settings object; the tokenizer-based operators in this module
# read ``hf_offline_models_path`` from it when resolving where to load from.
settings = get_settings()
1✔
15

16

17
class Split(FieldOperator):
    """Split a string value into a list of substrings on a fixed separator."""

    # Separator handed to ``str.split``.
    by: str

    def process_value(self, value: str) -> List[str]:
        """Return the pieces of ``value`` produced by splitting on ``self.by``."""
        separator = self.by
        pieces = value.split(separator)
        return pieces
1✔
22

23

24
class RegexSplit(FieldOperator):
    """Split a string value into a list of substrings using a regex pattern."""

    # Regular-expression pattern handed to ``re.split``.
    by: str

    def process_value(self, value: str) -> List[str]:
        """Return the pieces of ``value`` separated by matches of ``self.by``."""
        pattern = self.by
        pieces = re.split(pattern, value)
        return pieces
1✔
29

30

31
class TokensSplit(FieldOperator):
    """Tokenize a string value with a Hugging Face tokenizer.

    The tokenizer is loaded once in ``prepare`` and reused for every value.
    """

    # Model name (or relative path) used to locate the tokenizer.
    model: str
    _requirements_list = ["transformers"]

    # Same retry wrapper that TokensSlice.prepare uses for the identical
    # tokenizer-loading step, so both operators handle loading consistently.
    @retry_connection_with_exponential_backoff(backoff_factor=2)
    def prepare(self):
        """Load the tokenizer for ``self.model`` and store it on the instance.

        When ``settings.hf_offline_models_path`` is set, the model name is
        joined onto that directory and the tokenizer is loaded from the
        resulting path instead of the bare model name.
        """
        super().prepare()
        # Imported lazily so ``transformers`` is only needed when this
        # operator is actually used.
        from transformers import AutoTokenizer

        path = self.model
        if settings.hf_offline_models_path is not None:
            path = os.path.join(settings.hf_offline_models_path, path)
        self.tokenizer = AutoTokenizer.from_pretrained(path)

    def process_value(self, value: str) -> List[str]:
        """Return the tokens the loaded tokenizer produces for ``value``."""
        return self.tokenizer.tokenize(value)
1✔
46

47

48
class TokensSlice(FieldOperator):
    """Keep a slice of a string, measured in tokenizer token ids.

    The value is encoded to token ids, the ids are sliced with
    ``start``/``stop``/``step``, and the kept ids are decoded back to text.
    """

    # Model name (or relative path) used to locate the tokenizer.
    model: str
    # Slice bounds applied to the encoded token ids; ``None`` leaves that
    # bound open, as with an ordinary Python slice.
    start: Optional[int] = None
    stop: Optional[int] = None
    step: Optional[int] = None

    _requirements_list = ["transformers"]

    @retry_connection_with_exponential_backoff(backoff_factor=2)
    def prepare(self):
        """Load the tokenizer for ``self.model`` and store it on the instance.

        When ``settings.hf_offline_models_path`` is set, the tokenizer is
        loaded from the model name joined onto that directory.
        """
        super().prepare()
        from transformers import AutoTokenizer

        tokenizer_source = self.model
        offline_root = settings.hf_offline_models_path
        if offline_root is not None:
            tokenizer_source = os.path.join(offline_root, tokenizer_source)
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_source)

    def process_value(self, value: str) -> str:
        """Encode ``value``, slice the token ids, and decode the remainder."""
        token_ids = self.tokenizer.encode(value)
        kept_ids = token_ids[self.start : self.stop : self.step]
        return self.tokenizer.decode(kept_ids)
1✔
71

72

73
class Join(FieldOperator):
    """Concatenate a list of strings into one string with a separator."""

    # Separator placed between consecutive items.
    by: str

    def process_value(self, value: List[str]) -> str:
        """Return the items of ``value`` joined with ``self.by``."""
        separator = self.by
        joined = separator.join(value)
        return joined
1✔
78

79

80
class FormatText(InstanceOperator):
    """Render a ``str.format`` template from an instance's fields.

    The instance's top-level keys are passed as keyword arguments to
    ``self.text.format`` and the result is stored under ``self.to_field``.
    """

    # Name of the instance key that receives the rendered text.
    to_field: str
    # ``str.format``-style template.
    text: str

    def process(
        self, instance: Dict[str, Any], stream_name: Optional[str] = None
    ) -> Dict[str, Any]:
        """Write the rendered template into ``instance`` and return it.

        The instance is modified in place; the same dict is returned.
        """
        rendered = self.text.format(**instance)
        instance[self.to_field] = rendered
        return instance
×
89

90

91
class Strip(FieldOperator):
    """Remove leading and trailing whitespace from a string value."""

    def process_value(self, value: str) -> str:
        """Return ``value`` with surrounding whitespace removed."""
        stripped = value.strip()
        return stripped
×
94

95

96
class StripQuotation(FieldOperator):
    """Remove surrounding double quotes from a string value.

    Only values that both start and end with ``"`` are changed; for those,
    every leading and trailing ``"`` character is removed.
    """

    def process_value(self, value: str) -> str:
        """Return ``value`` without its enclosing double quotes, if any."""
        quote = '"'
        is_wrapped = value.startswith(quote) and value.endswith(quote)
        return value.strip(quote) if is_wrapped else value
×
101

102

103
class AddFullStop(FieldOperator):
    """Append a full stop to a string that lacks terminal punctuation.

    Values already ending in ``.``, ``?`` or ``!`` are returned unchanged.
    """

    def process_value(self, value: str) -> str:
        """Return ``value`` terminated by sentence-ending punctuation.

        An empty string is returned unchanged: there is no sentence to
        terminate, and indexing its last character would raise
        ``IndexError``.
        """
        if not value or value.endswith((".", "?", "!")):
            return value
        return value + "."
×
108

109

110
class Replace(FieldOperator):
    """Replace every occurrence of one substring with another."""

    # Substring to search for.
    old: str
    # Text substituted for each occurrence of ``old``.
    new: str

    def process_value(self, value: str) -> str:
        """Return ``value`` with each ``self.old`` replaced by ``self.new``."""
        target, substitute = self.old, self.new
        return value.replace(target, substitute)
×
116

117

118
class MapReplace(FieldOperator):
    """Apply a series of substring replacements taken from a mapping.

    Replacements run one after another in the mapping's iteration order, so
    a later entry sees the output of the earlier ones.
    """

    # Maps each substring to search for onto its replacement text.
    mapping: Dict[str, str]

    def process_value(self, value: Any) -> Any:
        """Return ``value`` after applying every replacement in ``mapping``."""
        result = value
        for target, substitute in self.mapping.items():
            result = result.replace(target, substitute)
        return result
×
125

126

127
class RegexReplace(FieldOperator):
    """Substitute regex matches in string values; pass other values through."""

    # Regular-expression pattern; replaced by its compiled form in ``prepare``.
    pattern: str
    # Replacement string or template handed to ``re.sub``.
    replacement: str

    def prepare(self):
        """Compile ``self.pattern`` once and store the compiled pattern."""
        super().prepare()
        compiled = re.compile(self.pattern)
        self.pattern = compiled

    def process_value(self, value: Any) -> Any:
        """Return ``value`` with pattern matches replaced.

        Values that are not strings are returned unchanged.
        """
        if not isinstance(value, str):
            return value
        return re.sub(self.pattern, self.replacement, value)
×
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc