• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

IBM / unitxt / 17888446790

21 Sep 2025 03:34AM UTC coverage: 80.804% (-0.1%) from 80.906%
17888446790

Pull #1939

github

web-flow
Merge e4db288a0 into 95ad743ba
Pull Request #1939: light fast removal of register_all_artifacts for unitxt classes

1599 of 1997 branches covered (80.07%)

Branch coverage included in aggregate %.

10924 of 13501 relevant lines covered (80.91%)

0.81 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

91.6
src/unitxt/text_utils.py
1
import re
1✔
2
import shutil
1✔
3
from typing import List, Tuple
1✔
4

5
import pandas as pd
1✔
6

7
from .logging_utils import get_logger
1✔
8

9
logger = get_logger()
10

11

12
def split_words(s):
    """Split an identifier-like string into its constituent words.

    Word boundaries are recognised for PascalCase, camelCase, snake_case,
    kebab-case, and at every transition between letters and digits.

    Args:
        s (str): The string to be split.

    Returns:
        list: The words found in ``s``, in order. An empty string yields an
        empty list.

    Examples:
        >>> split_words("HTTPResponse")
        ['HTTP', 'Response']
        >>> split_words("abc123def")
        ['abc', '123', 'def']
    """
    # Put a space before every run of uppercase letters ...
    s = re.sub(r"([A-Z]+)", r" \1", s)
    # ... and before every capitalized word, so that an acronym followed by a
    # word ("HTTPResponse") separates into "HTTP" and "Response".
    s = re.sub(r"([A-Z][a-z]+)", r" \1", s).strip()
    # snake_case and kebab-case separators become plain spaces.
    s = re.sub(r"[_-]", " ", s)
    # Separate digits from the letters they are attached to, on either side.
    s = re.sub(r"([a-zA-Z])(\d)", r"\1 \2", s)
    s = re.sub(r"(\d)([a-zA-Z])", r"\1 \2", s)
    # Whitespace splitting also collapses the doubled spaces created above.
    return s.split()
1✔
30

31

32
def is_camel_case(s):
    """Check whether a string is a CamelCase (upper-camel / PascalCase) name.

    A string qualifies when it starts with an uppercase ASCII letter and
    consists solely of ASCII letters and digits. Note that a name starting
    with a lowercase letter (e.g. ``"camelCase"``) does not qualify.

    Args:
        s (str): The string to be checked.

    Returns:
        bool: True if the string is in CamelCase, False otherwise.
    """
    # This pattern accepts the same language as the former
    # ^[A-Z]+([a-z0-9]*[A-Z]*[a-z0-9]*)*$ but has no nested quantifiers, so a
    # near-miss such as "Aaaaa..._" is rejected in linear time instead of
    # triggering exponential backtracking.
    # fullmatch (rather than match with "$") also rejects a trailing newline.
    return re.fullmatch(r"[A-Z][a-zA-Z0-9]*", s) is not None
1✔
42

43

44
def is_snake_case(s):
    """Check whether a string is in snake_case.

    A string qualifies when it is made of one or more non-empty groups of
    lowercase ASCII letters and digits, joined by single underscores.

    Args:
        s (str): The string to be checked.

    Returns:
        bool: True if the string is in snake_case, False otherwise.
    """
    # fullmatch instead of match(...$): "$" would also accept a string that
    # ends with a newline, which is not a valid snake_case name.
    return re.fullmatch(r"[a-z0-9]+(_[a-z0-9]+)*", s) is not None
1✔
54

55

56
def camel_to_snake_case(s):
    """Convert a string from camelCase / CamelCase to snake_case.

    Args:
        s (str): The string to be converted.

    Returns:
        str: The string converted to snake_case.

    Examples:
        >>> camel_to_snake_case("LoadHF")
        'load_hf'
        >>> camel_to_snake_case("HTTPResponse")
        'http_response'
    """
    # Insert an underscore before every uppercase letter whose preceding
    # character exists and is neither an uppercase letter, an underscore nor
    # a hyphen (i.e. at a lower/digit -> upper boundary).
    s = re.sub(r"(?<=[^A-Z_-])([A-Z])", r"_\1", s)

    # Separate a run of uppercase letters from a following capitalized word:
    # the last uppercase letter of the run belongs to the word when it is
    # followed by a lowercase letter or a digit ("HTTPResponse" -> "HTTP_Response").
    s = re.sub(r"([A-Z]+)([A-Z][a-z0-9])", r"\1_\2", s)

    return s.lower()
1✔
72

73

74
def snake_to_camel_case(s):
    """Convert a snake_case string to CamelCase.

    The result is assumed to be a class name, so it starts with an uppercase
    letter. Surrounding whitespace is stripped first.

    This is not always the inverse of ``camel_to_snake_case``: e.g.
    ``camel_to_snake_case("LoadHF")`` gives ``"load_hf"``, whereas
    ``snake_to_camel_case("load_hf")`` gives ``"LoadHf"``.

    Args:
        s (str): The snake_case string to be converted.

    Returns:
        str: The string converted to CamelCase.
    """
    # str.capitalize upper-cases the first character of each underscore-
    # separated part and lower-cases the rest of it.
    return "".join(word.capitalize() for word in s.strip().split("_"))
1✔
85

86

87
def _indent_and_wrap(text, indent_str, line_width):
    """Indent each line of ``text`` and hard-wrap the lines that are too long.

    ``line_width`` must be a positive integer. A line is wrapped into chunks
    of ``line_width`` characters when its length plus the length of
    ``indent_str`` exceeds ``line_width``. Every emitted line is right-stripped
    and terminated by a newline.
    """
    pieces = []
    for line in text.split("\n"):
        # NOTE: the callers pass a line_width that already excludes the
        # indent, so the indent is effectively counted twice in this test.
        # The threshold is kept as is to preserve the established layout.
        if len(line) + len(indent_str) > line_width:
            pieces.extend(
                f"{indent_str}{line[start : start + line_width].rstrip()}\n"
                for start in range(0, len(line), line_width)
            )
        else:
            pieces.append(f"{indent_str}{line.rstrip()}\n")
    return "".join(pieces)


def to_pretty_string(
    value,
    indent=0,
    indent_delta=4,
    max_chars=None,
    keys=None,
    item_label=None,
    float_format=None,
):
    """Constructs a formatted string representation of various data structures (dicts, lists, tuples, and DataFrames).

    Dicts, lists and tuples are rendered recursively: each item gets a header
    line of the form ``<key or index> (<type name>):`` followed by the item
    itself, indented by ``indent_delta`` more spaces. DataFrames are rendered
    through their ``repr`` with all rows and columns shown. Any other value is
    rendered with ``str`` (or with ``float_format`` for floats).

    Args:
        value: The Python data structure to be formatted.
        indent (int, optional): The current level of indentation. Defaults to 0.
        indent_delta (int, optional): Amount of spaces to add per indentation level. Defaults to 4.
        max_chars (int, optional): Max characters per line before wrapping. Defaults to terminal width - 10.
        keys (List[str], optional): For dicts, optionally specify keys and order.
            Applies to the top-level dict only; it is not passed on to nested values.
        item_label (str, optional): Internal parameter for labeling items.
            Currently unused; kept so that existing callers keep working.
        float_format (str, optional): Format string for float values (e.g., ".2f"). Defaults to None.

    Returns:
        str: The formatted text, every line terminated by a newline.

    Raises:
        ValueError: If ``keys`` names a key that is missing from the dict ``value``.
    """
    max_chars = max_chars or shutil.get_terminal_size()[0] - 10
    indent_str = " " * indent
    # Deep nesting (or a tiny max_chars) can leave no room at all for content.
    # A non-positive width would make the wrapping loop never advance, so
    # always allow at least one character per line.
    line_width = max(max_chars - indent, 1)

    def render_item(header, item):
        # One header line at the current level, then the item one level deeper.
        return f"{indent_str}{header} ({type(item).__name__}):\n" + to_pretty_string(
            item,
            indent=indent + indent_delta,
            indent_delta=indent_delta,
            max_chars=max_chars,
            float_format=float_format,
        )

    if isinstance(value, dict):
        keys_to_print = keys if keys is not None else list(value.keys())

        # Validate all requested keys before producing any output.
        for k in keys_to_print:
            if k not in value:
                raise ValueError(
                    f"Dictionary does not contain field '{k}' specified in 'keys' argument. "
                    f"The available keys are {list(value.keys())}"
                )

        return "".join(render_item(k, value[k]) for k in keys_to_print)

    if isinstance(value, (list, tuple)):
        # Lists are indexed as [i], tuples as (i).
        label_format = "[{}]" if isinstance(value, list) else "({})"
        return "".join(
            render_item(label_format.format(i), v) for i, v in enumerate(value)
        )

    if isinstance(value, pd.DataFrame):
        options = [
            "display.max_rows",
            None,
            "display.max_columns",
            None,
            "display.max_colwidth",
            None,
            "display.width",
            line_width,
        ]
        if float_format is not None:
            options.extend(
                ["display.float_format", ("{:," + float_format + "}").format]
            )
        # The display options only apply while the repr is being computed.
        with pd.option_context(*options):
            df_str = repr(value)
        return _indent_and_wrap(df_str, indent_str, line_width)

    # Scalar values, including floats.
    if isinstance(value, float) and float_format:
        formatted_value = f"{value:{float_format}}"
    else:
        formatted_value = str(value)
    return _indent_and_wrap(formatted_value, indent_str, line_width)
1✔
198

199

200
def construct_dict_as_yaml_lines(d, indent_delta=2) -> List[str]:
    """Constructs the lines of a dictionary formatted as yaml.

    Dicts and lists are handled recursively; any other value is rendered
    through ``str``, with newlines and double quotes backslash-escaped.

    Args:
        d: The element to be formatted.
        indent_delta (int, optional): The amount of spaces to add for each level of indentation. Defaults to 2.

    Returns:
        List[str]: The yaml lines (at least one), without trailing newlines.

    Raises:
        AssertionError: If ``indent_delta`` is smaller than 2.
    """
    # Checked before indent_delta is used, and raised explicitly (keeping the
    # exception type) so that the check is not stripped under ``python -O``.
    if indent_delta < 2:
        raise AssertionError(
            f"Needs at least 2 position indentations, for the case of list elements, that are to be preceded each by ' -'. Got indent_delta={indent_delta}."
        )
    indent_delta_str = " " * indent_delta
    # The indentation of the first line of a list element: its last two
    # positions are taken by the "- " marker.
    ticked_indent_delta_str = indent_delta_str[:-2] + "- "

    if isinstance(d, dict):
        if not d:
            return ["{}"]
        res = []
        for key, val in d.items():
            # Non-string keys (e.g. ints) are rendered through str().
            key = str(key)
            printable_key = f'"{key}"' if (" " in key) or (key == "") else key
            yaml_for_val = construct_dict_as_yaml_lines(val, indent_delta=indent_delta)
            if len(yaml_for_val) == 1:
                # NOTE(review): a nested dict or list that renders as a single
                # line is also inlined here (e.g. "a: b: 1"), which does not
                # look like valid yaml. Left unchanged because consumers may
                # rely on the current layout - confirm before changing.
                res.append(printable_key + ": " + yaml_for_val[0])
            else:
                res.append(printable_key + ": ")
                res.extend(indent_delta_str + line for line in yaml_for_val)
        return res

    if isinstance(d, list):
        if not d:
            return ["[]"]
        res = []
        for val in d:
            first_line, *other_lines = construct_dict_as_yaml_lines(
                val, indent_delta=indent_delta
            )
            res.append(ticked_indent_delta_str + first_line)
            res.extend(indent_delta_str + line for line in other_lines)
        return res

    # A scalar: one line. It is wrapped in double quotes only when it is empty
    # or contains an (escaped) newline.
    d1 = str(d).replace("\n", "\\n").replace('"', '\\"')
    if "\\n" in d1 or d1 == "":
        d1 = f'"{d1}"'
    return [d1]
1✔
245

246

247
def construct_dict_as_python_lines(d, indent_delta=4) -> List[str]:
    """Constructs the lines of a dictionary formatted as a piece of python code.

    A dict that has a ``"__type__"`` entry is rendered as a call-like
    expression, ``__type__<type value>(key=value, ...)``; any other dict is
    rendered as a dict literal, and a list as a list literal. Strings are
    wrapped in double quotes; ``None``, ints, floats and bools are rendered
    through ``str``.

    Args:
        d: The element to be formatted.
        indent_delta (int, optional): The amount of spaces to add for each level of indentation. Defaults to 4.

    Returns:
        List[str]: The python lines (at least one), without trailing newlines.

    Raises:
        RuntimeError: If a value of an unsupported type is encountered.
    """
    indent_delta_str = " " * indent_delta

    if isinstance(d, dict):
        if not d:
            return ["{}"]
        istype = "__type__" in d
        if istype:
            res = ["__type__" + d["__type__"] + "("]
            if len(d) == 1:
                # A type without arguments fits in a single line.
                res[0] += ")"
                return res
        else:
            res = ["{"]

        for key, val in d.items():
            if key == "__type__":
                continue
            # Keyword-argument style inside a type, quoted key inside a dict.
            prefix = f"{key}=" if istype else f'"{key}": '
            py_for_val = construct_dict_as_python_lines(val, indent_delta=indent_delta)
            if len(py_for_val) == 1:
                res.append(prefix + py_for_val[0] + ",")
                continue
            first_line, *inner_lines, last_line = py_for_val
            res.append(prefix + first_line)
            if first_line.startswith(("{", "[")):
                # Lines inside a dict or list literal are not indented yet.
                res.extend(indent_delta_str + line for line in inner_lines)
            else:
                # val is a type, its inner lines are already indented.
                res.extend(inner_lines)
            res.append(last_line + ",")

        res.append(")" if istype else "}")
        if istype:
            # Indent everything between the opening and the closing line.
            res[1:-1] = [indent_delta_str + line for line in res[1:-1]]
        return res

    if isinstance(d, list):
        if not d:
            return ["[]"]
        res = ["["]
        for val in d:
            *leading_lines, last_line = construct_dict_as_python_lines(
                val, indent_delta=indent_delta
            )
            res.extend(leading_lines)
            res.append(last_line + ",")
        res.append("]")
        return res

    if isinstance(d, str):
        # NOTE(review): quotes, backslashes and newlines inside d are not
        # escaped, so such strings do not yield a valid python literal.
        # Left unchanged because consumers may rely on the raw text - confirm.
        return [f'"{d}"']
    if d is None or isinstance(d, (int, float, bool)):
        return [f"{d}"]
    raise RuntimeError(f"unrecognized value to print as python: {d}")
×
312

313

314
def print_dict(
    d, indent=0, indent_delta=4, max_chars=None, keys_to_print=None, log_level="info"
):
    """Log a pretty-printed representation of ``d``.

    The text is produced by ``to_pretty_string`` and is emitted, preceded by a
    newline, through the method of the module logger named by ``log_level``.

    Args:
        d: The data structure to be printed.
        indent (int, optional): The initial indentation. Defaults to 0.
        indent_delta (int, optional): Amount of spaces to add per indentation level. Defaults to 4.
        max_chars (int, optional): Max characters per line before wrapping.
        keys_to_print (List[str], optional): For dicts, the keys to print, in order.
        log_level (str, optional): Name of the logger method to call. Defaults to "info".
    """
    dict_str = to_pretty_string(
        d,
        indent=indent,
        indent_delta=indent_delta,
        max_chars=max_chars,
        keys=keys_to_print,
    )
    # Start on a fresh line, so the first row is not pushed aside by whatever
    # the logger puts at the beginning of the message.
    getattr(logger, log_level)("\n" + dict_str)
320

321

322
def print_dict_as_yaml(d: dict, indent_delta=2) -> str:
    """Return ``d`` formatted as a yaml string.

    Despite its name, this function prints nothing: it joins the lines built
    by ``construct_dict_as_yaml_lines`` with newlines and returns the result.

    Args:
        d (dict): The element to be formatted.
        indent_delta (int, optional): The amount of spaces to add for each level of indentation. Defaults to 2.

    Returns:
        str: The yaml text, without a trailing newline.
    """
    yaml_lines = construct_dict_as_yaml_lines(d, indent_delta=indent_delta)
    return "\n".join(yaml_lines)
1✔
327

328

329
def print_dict_as_python(d: dict, indent_delta=4) -> str:
    """Return ``d`` formatted as a piece of python code.

    Despite its name, this function prints nothing: it joins the lines built
    by ``construct_dict_as_python_lines`` with newlines and returns the result.

    Args:
        d (dict): The element to be formatted.
        indent_delta (int, optional): The amount of spaces to add for each level of indentation. Defaults to 4.

    Returns:
        str: The python text, without a trailing newline.
    """
    py_lines = construct_dict_as_python_lines(d, indent_delta=indent_delta)
    assert len(py_lines) > 0
    return "\n".join(py_lines)
1✔
333

334

335
def nested_tuple_to_string(nested_tuple: tuple) -> str:
    """Converts a nested tuple to a string, with elements separated by underscores.

    Tuples nested at any depth are flattened in order; every other element is
    converted with ``str``.

    Args:
        nested_tuple (tuple): The nested tuple to be converted.

    Returns:
        str: The string representation of the nested tuple. An empty tuple
        yields an empty string.

    Examples:
        >>> nested_tuple_to_string((1, (2, 3), "a"))
        '1_2_3_a'
    """
    return "_".join(
        nested_tuple_to_string(item) if isinstance(item, tuple) else str(item)
        for item in nested_tuple
    )
×
351

352

353
def is_made_of_sub_strings(string, sub_strings):
    """Check whether ``string`` is a concatenation of one or more of ``sub_strings``.

    Each sub string may be used any number of times and in any order. The sub
    strings are matched literally (regex metacharacters in them are escaped).

    Args:
        string (str): The string to be checked.
        sub_strings (Iterable[str]): The allowed building blocks.

    Returns:
        bool: True if the whole of ``string`` can be composed from the sub
        strings, False otherwise.
    """
    pattern = "(" + "|".join(map(re.escape, sub_strings)) + ")+"
    # fullmatch instead of match with "^...$": "$" would also accept a string
    # that has a trailing newline after the last sub string.
    return bool(re.fullmatch(pattern, string))
1✔
356

357

358
# Given all the lines of a card preparer file, e.g. all the lines of prepare/cards/cohere_for_ai.py,
359
# and an object name, e.g. TaskCard(,
360
# return the ordinal number of the line that starts that object, in our example: the
361
# line number of the following line (notice that the line where TaskCard is imported
362
# is not supposed to return):
363
#         card = TaskCard(
364
# and the line number of the line that ends the object, in our case the line that include
365
# the matching close:
366
#         )
367
# This util depends on ruff to ensure this setting of the card file: that a close of one
368
# tag and the open of the next tag, do not sit in same line, when both tags being
369
# major level within TaskCard.
370
# It also prepares for the case that  __description__ tag does not contain balanced
371
# parentheses, since it is often cut in the middle, (with  "... see more at")
372
# flake8: noqa: B007
373
# flake8: noqa: C901
374
def lines_defining_obj_in_card(
    all_lines: List[str], obj_name: str, start_search_at_line: int = 0
) -> Tuple[int, int]:
    """Locate the lines of a card file that define a given object.

    The object starts at the first line, at or after ``start_search_at_line``,
    that contains ``obj_name``, and ends at the line on which the number of
    closing brackets seen so far equals the number of opening brackets seen so
    far. The content of a ``__description__`` tag is skipped while counting,
    since it is free text whose parentheses can not be trusted.

    Args:
        all_lines (List[str]): All the lines of the card file.
        obj_name (str): The text that marks the start of the object, e.g. ``"TaskCard("``.
        start_search_at_line (int, optional): Index of the line to start searching at. Defaults to 0.

    Returns:
        Tuple[int, int]: The indices of the first and the last line of the
        object, or ``(-1, -1)`` if ``obj_name`` is not found.

    Raises:
        ValueError: If the lines are exhausted before the end of the object
            (or of a ``__description__`` inside it) is found.
    """
    exhausted_message = "input lines were exhausted before the matching close is found"
    num_lines = len(all_lines)

    # None (and not -1) marks "not found", as -1 is a legitimate index when
    # start_search_at_line is negative. This also covers an empty search range.
    starting_line = next(
        (
            i
            for i in range(start_search_at_line, num_lines)
            if obj_name in all_lines[i]
        ),
        None,
    )
    if starting_line is None:
        # obj_name found nowhere in the input lines
        return (-1, -1)

    num_of_opens = 0
    num_of_closes = 0
    ending_line = starting_line - 1
    # Stop one short of num_lines: the index is advanced at the top of the body.
    while ending_line < num_lines - 1:
        ending_line += 1

        if "__description__" in all_lines[ending_line]:
            # Can not trust parentheses inside description, because this is mainly
            # truncated free text.
            # We do trust the indentation enforced by ruff, and the way we build
            # __description__: a line consisting of only __description__=(
            # followed by one or more lines of text, followed by a line consisting
            # of only:  ),  where the ) is indented with the beginning of __description__
            # We also prepare for the case that, when not entered by us, __description__=
            # is not followed by a ( and the whole description does not end with a
            # single ) in its line. We build on ruff making the line following the
            # description start with same indentation or 4 less (i.e., the following
            # line is the closing of the card).
            tag_indentation = all_lines[ending_line].index("__description__")
            starts_with_parent = "__description__=(" in all_lines[ending_line]
            if starts_with_parent:
                last_line_to_start_with = (" " * tag_indentation) + r"\)"
            else:
                # actually, the line that follows the description
                last_line_to_start_with = (
                    "("
                    + (" " * tag_indentation)
                    + "[^ ]"
                    + "|"
                    + (" " * (tag_indentation - 4))
                    + "[^ ]"
                    + ")"
                )
            end_of_description = re.compile("^" + last_line_to_start_with)

            ending_line += 1
            while ending_line < num_lines and not end_of_description.search(
                all_lines[ending_line]
            ):
                ending_line += 1
            if ending_line >= num_lines:
                raise ValueError(exhausted_message)

            if "__description__" in obj_name:
                # The description itself is the requested object. Without a
                # wrapping parenthesis, the matched line is already the line
                # that follows the description.
                return (
                    starting_line,
                    ending_line if starts_with_parent else ending_line - 1,
                )

            if starts_with_parent:
                # step over the line that closes the description
                ending_line += 1
                if ending_line >= num_lines:
                    raise ValueError(exhausted_message)

            # we continue in card, having passed the description, ending_line points
            # to the line that follows description

        num_of_opens += len(re.findall(r"[({[]", all_lines[ending_line]))
        num_of_closes += len(re.findall(r"[)}\]]", all_lines[ending_line]))
        if num_of_closes == num_of_opens:
            break

    if num_of_closes != num_of_opens:
        raise ValueError(exhausted_message)

    return (starting_line, ending_line)
1✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc