• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

IBM / unitxt / 15116080928

19 May 2025 02:45PM UTC coverage: 79.657% (-0.1%) from 79.799%
15116080928

push

github

web-flow
Cards for the Real MM RAG datasets (#1795)

* extending the rag e2e task

* adding AddIncrementalId

* mm rag cards

* mm rag cards

* format

* format

* update

* update

* update

* Improve speed readability and unit-testability

Signed-off-by: elronbandel <elronbandel@gmail.com>

* Revert naming

Signed-off-by: elronbandel <elronbandel@gmail.com>

* Revert changes to rag files

Signed-off-by: elronbandel <elronbandel@gmail.com>

* Fix hotpot qa

Signed-off-by: elronbandel <elronbandel@gmail.com>

---------

Signed-off-by: elronbandel <elronbandel@gmail.com>
Co-authored-by: elronbandel <elronbandel@gmail.com>

1653 of 2063 branches covered (80.13%)

Branch coverage included in aggregate %.

10278 of 12915 relevant lines covered (79.58%)

0.8 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

51.5
src/unitxt/text_utils.py
1
import re
1✔
2
import shutil
1✔
3
from typing import List, Tuple
1✔
4

5
import pandas as pd
1✔
6

7
from .logging_utils import get_logger
1✔
8

9
# Module-level logger, obtained from this package's logging_utils helper; used by print_dict.
logger = get_logger()
1✔
10

11

12
def split_words(s):
    """Break an identifier-like string into its component words.

    Word boundaries are recognised for PascalCase, camelCase, snake_case and
    kebab-case, and at every transition between a letter and a digit.

    Args:
        s (str): The string to be split.

    Returns:
        list: The words found in ``s``, in their order of appearance.
    """
    # First put a space before every run of capitals, then before every
    # capitalised word, so that "camelCase" and "HTTPServer" both get separated.
    spaced = re.sub(r"([A-Z]+)", r" \1", s)
    spaced = re.sub(r"([A-Z][a-z]+)", r" \1", spaced).strip()

    # Underscores and hyphens are plain word separators.
    spaced = re.sub(r"[_-]", " ", spaced)

    # Detach digits from neighbouring letters: letter->digit first, then digit->letter.
    for boundary in (r"([a-zA-Z])(\d)", r"(\d)([a-zA-Z])"):
        spaced = re.sub(boundary, r"\1 \2", spaced)

    # Any amount of whitespace now delimits the words.
    return spaced.split()
1✔
30

31

32
def is_camel_case(s):
    """Checks if a string is in (upper) camel case.

    The string must start with an uppercase ASCII letter and may continue with
    ASCII letters and digits only; so "CamelCase" and "HTTPServer2" qualify,
    while "camelCase", "Camel_Case" and the empty string do not.

    Args:
        s (str): The string to be checked.

    Returns:
        bool: True if the string matches the pattern described above, False otherwise.
    """
    # A single character class after the leading capital accepts the same
    # strings as nested repetitions of lower/upper runs would, but it cannot
    # backtrack exponentially on near-miss inputs such as "Aaaaa..._".
    return re.match(r"^[A-Z][a-zA-Z0-9]*$", s) is not None
1✔
42

43

44
def is_snake_case(s):
    """Tell whether a string is written in snake_case.

    A snake_case string consists of one or more groups of lowercase ASCII
    letters and digits, with a single underscore between consecutive groups.

    Args:
        s (str): The string to be checked.

    Returns:
        bool: True if the string is in snake_case, False otherwise.
    """
    snake_case_pattern = r"^[a-z0-9]+(_[a-z0-9]+)*$"
    return bool(re.match(snake_case_pattern, s))
1✔
54

55

56
def camel_to_snake_case(s):
    """Convert a camelCase (or PascalCase) string to snake_case.

    Args:
        s (str): The string to be converted.

    Returns:
        str: The lowercased string, with underscores inserted at word boundaries.
    """
    # Word boundary: an uppercase letter whose preceding character is neither
    # an uppercase letter, an underscore, nor a hyphen.
    with_word_breaks = re.sub(r"(?<=[^A-Z_-])([A-Z])", r"_\1", s)

    # Acronym boundary: inside a run of uppercase letters, break before the
    # last one when it is followed by a lowercase letter or a digit
    # (e.g. "HTTPServer" -> "HTTP_Server").
    with_acronym_breaks = re.sub(
        r"([A-Z]+)([A-Z][a-z0-9])", r"\1_\2", with_word_breaks
    )

    return with_acronym_breaks.lower()
1✔
72

73

74
def _wrap_into_indented_lines(text, indent_str, line_width):
    """Return ``text`` as newline-terminated lines prefixed by ``indent_str``, hard-wrapped at ``line_width`` (>= 1)."""
    pieces = []
    for line in text.split("\n"):
        # The indentation is counted here although ``line_width`` already has it
        # subtracted; this wrapping threshold is kept so output stays unchanged.
        if len(line) + len(indent_str) > line_width:
            for start in range(0, len(line), line_width):
                chunk = line[start : start + line_width].rstrip()
                pieces.append(f"{indent_str}{chunk}\n")
        else:
            pieces.append(f"{indent_str}{line.rstrip()}\n")
    return "".join(pieces)


def to_pretty_string(
    value,
    indent=0,
    indent_delta=4,
    max_chars=None,
    keys=None,
    item_label=None,
    float_format=None,
):
    """Constructs a formatted string representation of various data structures (dicts, lists, tuples, and DataFrames).

    Containers are rendered recursively: each item gets a header line of the
    form ``<key or index> (<type name>):`` followed by the item's own rendering,
    indented by ``indent_delta`` more spaces. Any other value is rendered with
    ``str`` (or ``float_format`` for floats) and hard-wrapped.

    Args:
        value: The Python data structure to be formatted.
        indent (int, optional): The current level of indentation. Defaults to 0.
        indent_delta (int, optional): Amount of spaces to add per indentation level. Defaults to 4.
        max_chars (int, optional): Max characters per line before wrapping. Defaults to terminal width - 10.
        keys (List[str], optional): For dicts, optionally specify keys and order.
            Applies to the top-level dict only; it is not passed to nested values.
        item_label (str, optional): Accepted for backward compatibility; not used.
        float_format (str, optional): Format string for float values (e.g., ".2f"). Defaults to None.

    Returns:
        str: The formatted text; every emitted line ends with a newline.

    Raises:
        ValueError: If ``keys`` names a field that is missing from the dict ``value``.
    """
    max_chars = max_chars or shutil.get_terminal_size()[0] - 10
    indent_str = " " * indent
    # Deep nesting (or a very narrow terminal) can leave no room at all; a
    # width below 1 would make the wrapping loop never advance, so clamp it.
    line_width = max(1, max_chars - indent)

    if isinstance(value, (dict, list, tuple)):
        if isinstance(value, dict):
            keys_to_print = keys if keys is not None else list(value.keys())
            # Validate all requested keys up front, before producing any output.
            for k in keys_to_print:
                if k not in value:
                    raise ValueError(
                        f"Dictionary does not contain field '{k}' specified in 'keys' argument. "
                        f"The available keys are {list(value.keys())}"
                    )
            labelled_items = [(k, value[k]) for k in keys_to_print]
        elif isinstance(value, list):
            labelled_items = [(f"[{i}]", v) for i, v in enumerate(value)]
        else:
            labelled_items = [(f"({i})", v) for i, v in enumerate(value)]

        parts = []
        for label, v in labelled_items:
            parts.append(f"{indent_str}{label} ({type(v).__name__}):\n")
            parts.append(
                to_pretty_string(
                    v,
                    indent=indent + indent_delta,
                    indent_delta=indent_delta,
                    max_chars=max_chars,
                    float_format=float_format,
                )
            )
        return "".join(parts)

    if isinstance(value, pd.DataFrame):
        # Show the frame in full (no row/column/cell truncation) and let pandas
        # fold wide frames at the available width.
        options = [
            "display.max_rows",
            None,
            "display.max_columns",
            None,
            "display.max_colwidth",
            None,
            "display.width",
            line_width,
        ]
        if float_format is not None:
            options.extend(
                ["display.float_format", ("{:," + float_format + "}").format]
            )
        with pd.option_context(*options):
            df_str = repr(value)
        return _wrap_into_indented_lines(df_str, indent_str, line_width)

    # Scalar values, including floats
    if isinstance(value, float) and float_format:
        formatted_value = f"{value:{float_format}}"
    else:
        formatted_value = str(value)
    return _wrap_into_indented_lines(formatted_value, indent_str, line_width)
1✔
185

186

187
def construct_dict_as_yaml_lines(d, indent_delta=2) -> List[str]:
1✔
188
    """Constructs the lines of a dictionary formatted as yaml.
189

190
    Args:
191
        d: The element to be formatted.
192
        indent_delta (int, optional): The amount of spaces to add for each level of indentation. Defaults to 2.
193
    """
194

195
    def is_simple(val) -> bool:
×
196
        # if can show in same line as dictionary's key
197
        return not isinstance(val, (dict, list)) or (len(val) == 0)
×
198

199
    indent_delta_str = " " * indent_delta
×
200
    ticked_indent_delta_str = indent_delta_str[:-2] + "- "
×
201
    assert (
×
202
        indent_delta >= 2
203
    ), f"Needs at least 2 position indentations, for the case of list elements, that are to be preceded each by ' -'. Got indent_delta={indent_delta}."
204
    res = []  # computed hereunder as a list of lines, that are indented only at the end
×
205

206
    if isinstance(d, dict):
×
207
        if len(d) == 0:
×
208
            return ["{}"]
×
209
        for key, val in d.items():
×
210
            printable_key = f'"{key}"' if (" " in key) or (key == "") else key
×
211
            res.append(printable_key + ": ")
×
212
            yaml_for_val = construct_dict_as_yaml_lines(val, indent_delta=indent_delta)
×
213
            assert len(yaml_for_val) > 0
×
214
            if is_simple(val):
×
215
                assert len(yaml_for_val) == 1
×
216
                res[-1] += yaml_for_val[0]
×
217
            else:
218
                for line in yaml_for_val:
×
219
                    res.append(indent_delta_str + line)
×
220
        return res
×
221

222
    if isinstance(d, list):
×
223
        if len(d) == 0:
×
224
            return ["[]"]
×
225
        for val in d:
×
226
            yaml_for_val = construct_dict_as_yaml_lines(val, indent_delta=indent_delta)
×
227
            assert len(yaml_for_val) > 0
×
228
            res.append(ticked_indent_delta_str + yaml_for_val[0])
×
229
            for line in yaml_for_val[1:]:
×
230
                res.append(indent_delta_str + line)
×
231
        return res
×
232

233
    # d1 = re.sub(r"(\n+)", r'"\1"', str(d))
234
    d1 = str(d).replace("\n", "\\n").replace('"', '\\"')
×
235
    if "\\n" in d1 or d1 == "":
×
236
        d1 = f'"{d1}"'
×
237
    return [d1]
×
238

239
def construct_dict_as_python_lines(d, indent_delta=4) -> List[str]:
    """Constructs the lines of a dictionary formatted as a piece of python code.

    A dict holding a ``"__type__"`` entry is rendered as a call-like expression:
    its first line is the literal text ``__type__`` immediately followed by the
    value of that entry and ``(``, its remaining entries become ``key=value,``
    lines, and it is closed by ``)``. Any other dict is rendered as a ``{...}``
    literal with double-quoted keys, and a list as a ``[...]`` literal.

    Args:
        d: The element to be formatted: a dict, a list, a str, a number, a bool or None.
        indent_delta (int, optional): The amount of spaces to add for each level of indentation. Defaults to 4.

    Returns:
        List[str]: The lines of code, without trailing newlines.

    Raises:
        RuntimeError: If ``d`` (or a nested value) is of an unsupported type.
    """
    indent_delta_str = " " * indent_delta

    def as_string_literal(text) -> str:
        # Escape what would otherwise end the literal early or break the line,
        # so that the emitted text is a valid single-line python string.
        escaped = (
            str(text).replace("\\", "\\\\").replace('"', '\\"').replace("\n", "\\n")
        )
        return f'"{escaped}"'

    if isinstance(d, dict):
        if len(d) == 0:
            return ["{}"]
        istype = "__type__" in d
        if istype:
            res = ["__type__" + d["__type__"] + "("]
            if len(d) == 1:
                # nothing but the type itself: open and close on the same line
                res[0] += ")"
                return res
        else:
            res = ["{"]

        for key, val in d.items():
            if key == "__type__":
                continue
            res.append((key + "=") if istype else (as_string_literal(key) + ": "))
            py_for_val = construct_dict_as_python_lines(val, indent_delta=indent_delta)
            assert len(py_for_val) > 0
            if len(py_for_val) == 1:
                res[-1] += py_for_val[0] + ","
                continue
            # Multi-line value: its opening line continues the key's line.
            res[-1] += py_for_val[0]
            if py_for_val[0].startswith(("{", "[")):
                res.extend(indent_delta_str + line for line in py_for_val[1:-1])
            else:
                # val is type, its inner lines are already indented
                res.extend(py_for_val[1:-1])
            res.append(py_for_val[-1] + ",")

        res.append(")" if istype else "}")
        if istype:
            # A type indents its own inner lines; plain dicts and lists are
            # indented by the dict that contains them (see above).
            res[1:-1] = [indent_delta_str + line for line in res[1:-1]]
        return res

    if isinstance(d, list):
        if len(d) == 0:
            return ["[]"]
        res = ["["]
        for val in d:
            py_for_val = construct_dict_as_python_lines(val, indent_delta=indent_delta)
            assert len(py_for_val) > 0
            res.extend(py_for_val[:-1])
            res.append(py_for_val[-1] + ",")
        res.append("]")
        return res

    if isinstance(d, str):
        return [as_string_literal(d)]
    if d is None or isinstance(d, (int, float, bool)):
        return [f"{d}"]
    raise RuntimeError(f"unrecognized value to print as python: {d}")
×
304

305

306
def print_dict(
    d, indent=0, indent_delta=4, max_chars=None, keys_to_print=None, log_level="info"
):
    """Log a pretty-printed rendering of ``d`` through the module logger.

    Args:
        d: The data structure to render (see ``to_pretty_string``).
        indent (int, optional): Initial indentation, in spaces. Defaults to 0.
        indent_delta (int, optional): Spaces added per nesting level. Defaults to 4.
        max_chars (int, optional): Max characters per line before wrapping.
        keys_to_print (optional): For a dict, the keys to show and their order.
        log_level (str, optional): Name of the logger method to call. Defaults to "info".
    """
    pretty = to_pretty_string(
        d,
        indent=indent,
        indent_delta=indent_delta,
        max_chars=max_chars,
        keys=keys_to_print,
    )
    emit = getattr(logger, log_level)
    # Start on a fresh line, so the rendering is not glued to the log prefix.
    emit("\n" + pretty)
1✔
312

313

314
def print_dict_as_yaml(d: dict, indent_delta=2) -> str:
    """Return ``d`` formatted as yaml text.

    Despite its name, this function prints nothing: it joins the lines built
    by ``construct_dict_as_yaml_lines`` with newlines and returns the result.

    Args:
        d (dict): The element to be formatted.
        indent_delta (int, optional): Spaces added per nesting level. Defaults to 2.
    """
    return "\n".join(construct_dict_as_yaml_lines(d, indent_delta=indent_delta))
×
319

320
def print_dict_as_python(d: dict, indent_delta=4) -> str:
    """Return ``d`` formatted as python code text.

    Despite its name, this function prints nothing: it joins the lines built
    by ``construct_dict_as_python_lines`` with newlines and returns the result.

    Args:
        d (dict): The element to be formatted.
        indent_delta (int, optional): Spaces added per nesting level. Defaults to 4.
    """
    code_lines = construct_dict_as_python_lines(d, indent_delta=indent_delta)
    assert len(code_lines) > 0
    return "\n".join(code_lines)
×
324

325
def nested_tuple_to_string(nested_tuple: tuple) -> str:
    """Flatten a nested tuple into a single underscore-separated string.

    Inner tuples are converted recursively; every other element contributes
    its ``str`` form.

    Args:
        nested_tuple (tuple): The nested tuple to be converted.

    Returns:
        str: The string representation of the nested tuple.
    """
    return "_".join(
        nested_tuple_to_string(item) if isinstance(item, tuple) else str(item)
        for item in nested_tuple
    )
×
341

342

343
def is_made_of_sub_strings(string, sub_strings):
    """Tell whether ``string`` is a concatenation of one or more of ``sub_strings``.

    Each sub string is matched literally (regex metacharacters are escaped)
    and may be used any number of times, in any order.

    Args:
        string (str): The string to be checked.
        sub_strings: An iterable of the allowed building blocks.

    Returns:
        bool: True if ``string`` can be fully covered by the sub strings.
    """
    alternatives = "|".join(re.escape(sub_string) for sub_string in sub_strings)
    return re.match("^(" + alternatives + ")+$", string) is not None
1✔
346

347

348
# Given all the lines of a card preparer file, e.g. all the lines of prepare/cards/cohere_for_ai.py,
349
# and an object name, e.g. TaskCard(,
350
# return the ordinal number of the line that starts that object, in our example: the
351
# line number of the following line (notice that the line where TaskCard is imported
352
# is not supposed to return):
353
#         card = TaskCard(
354
# and the line number of the line that ends the object, in our case the line that includes
355
# the matching close:
356
#         )
357
# This util depends on ruff to ensure this setting of the card file: that a close of one
358
# tag and the open of the next tag, do not sit in same line, when both tags being
359
# major level within TaskCard.
360
# It also prepares for the case that  __description__ tag does not contain balanced
361
# parentheses, since it is often cut in the middle, (with  "... see more at")
362
# flake8: noqa: B007
363
# flake8: noqa: C901
364
def lines_defining_obj_in_card(
    all_lines: List[str], obj_name: str, start_search_at_line: int = 0
) -> Tuple[int, int]:
    """Locate the lines that define an object inside the lines of a card file.

    The object starts at the first line, from ``start_search_at_line`` on, that
    contains ``obj_name``, and ends at the line on which the brackets opened
    since the starting line are all closed again.

    Args:
        all_lines (List[str]): The lines of the file to search in.
        obj_name (str): Text identifying the object's first line, e.g. ``TaskCard(``.
        start_search_at_line (int, optional): Index of the first line to look at. Defaults to 0.

    Returns:
        Tuple[int, int]: Indices (into ``all_lines``) of the first and last
        lines of the object, or ``(-1, -1)`` if ``obj_name`` is not found.

    Raises:
        ValueError: If the lines are exhausted before the end of the object
            (or of a ``__description__`` passed on the way) is found.
    """
    starting_line = next(
        (
            index
            for index in range(start_search_at_line, len(all_lines))
            if obj_name in all_lines[index]
        ),
        None,
    )
    if starting_line is None:
        # obj_name found nowhere in the searched lines (this includes the case
        # that there are no lines to search at all)
        return (-1, -1)

    exhausted_message = "input lines were exhausted before the matching close is found"
    num_of_lines = len(all_lines)
    num_of_opens = 0
    num_of_closes = 0
    ending_line = starting_line - 1
    # The bound keeps ending_line a valid index after the increment below.
    while ending_line < num_of_lines - 1:
        ending_line += 1

        if "__description__" in all_lines[ending_line]:
            # Parentheses inside a description can not be trusted, because it is
            # mainly truncated free text. We do trust the indentation enforced by
            # ruff, and the way we build __description__: a line consisting of
            # only __description__=( followed by one or more lines of text,
            # followed by a line consisting of only: ), where the ) is indented
            # with the beginning of __description__.
            # We also prepare for the case that __description__= is not followed
            # by a ( ; then the description ends just before the first line that
            # starts with the same indentation, or 4 less (i.e., that line is
            # the closing of the card).
            tag_indentation = all_lines[ending_line].index("__description__")
            starts_with_parent = "__description__=(" in all_lines[ending_line]
            if starts_with_parent:
                last_line_pattern = "^" + (" " * tag_indentation) + r"\)"
            else:
                # actually, this matches the line that follows the description
                same_level = (" " * tag_indentation) + "[^ ]"
                one_level_up = (" " * (tag_indentation - 4)) + "[^ ]"
                last_line_pattern = "^(" + same_level + "|" + one_level_up + ")"

            ending_line += 1
            while ending_line < num_of_lines and not re.search(
                last_line_pattern, all_lines[ending_line]
            ):
                ending_line += 1
            if ending_line >= num_of_lines:
                raise ValueError(exhausted_message)

            if "__description__" in obj_name:
                return (
                    starting_line,
                    ending_line if starts_with_parent else ending_line - 1,
                )

            if starts_with_parent:
                # step over the line that closes the description
                ending_line += 1
                if ending_line >= num_of_lines:
                    raise ValueError(exhausted_message)

            # we continue in card, having passed the description; ending_line
            # points to the line that follows the description

        num_of_opens += len(re.findall(r"[({[]", all_lines[ending_line]))
        num_of_closes += len(re.findall(r"[)}\]]", all_lines[ending_line]))
        if num_of_closes == num_of_opens:
            break

    if num_of_closes != num_of_opens:
        raise ValueError(exhausted_message)

    return (starting_line, ending_line)
1✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc