15116080928

Committed 19 May 2025 02:45PM UTC coverage: 79.657% (-0.1%) from 79.799%

Build # 15116080928

Build Type

push

github

Committed by

web-flow

Commit Message

Cards for the Real MM RAG datasets (#1795)

* extending the rag e2e task

* adding AddIncrementalId

* mm rag cards

* mm rag cards

* format

* format

* update

* update

* update

* Improve speed readability and unit-testability

Signed-off-by: elronbandel <elronbandel@gmail.com>

* Revert naming

Signed-off-by: elronbandel <elronbandel@gmail.com>

* Revert changes to rag files

Signed-off-by: elronbandel <elronbandel@gmail.com>

* Fix hotpot qa

Signed-off-by: elronbandel <elronbandel@gmail.com>

---------

Signed-off-by: elronbandel <elronbandel@gmail.com>
Co-authored-by: elronbandel <elronbandel@gmail.com>

Run Details

1653 of 2063 branches covered (80.13%)

Branch coverage included in aggregate %.

10278 of 12915 relevant lines covered (79.58%)

0.8 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

51.5

src/unitxt/text_utils.py

import re
import shutil
from typing import List, Tuple

import pandas as pd

from .logging_utils import get_logger

logger = get_logger()


def split_words(s):
    """Break an identifier-like string into its constituent words.

    Word boundaries are recognized at the capital letters of PascalCase and
    camelCase, at the underscores and hyphens of snake_case and kebab-case,
    and between letters and the digits attached to them.

    Args:
        s (str): The string to be split.

    Returns:
        list: The words found in ``s``, in their order of appearance.
    """
    # First put a space before every run of capitals, then before every
    # capital that opens a lowercase word; together this separates acronyms
    # from the word that follows them.
    spaced_capitals = re.sub(r"([A-Z]+)", r" \1", s)
    spaced_words = re.sub(r"([A-Z][a-z]+)", r" \1", spaced_capitals).strip()
    # Underscores and hyphens act as plain separators.
    text = re.sub(r"[_-]", " ", spaced_words)
    # Detach digits from neighbouring letters, in both directions.
    for boundary in (r"([a-zA-Z])(\d)", r"(\d)([a-zA-Z])"):
        text = re.sub(boundary, r"\1 \2", text)
    # Splitting on whitespace also discards the empty pieces left behind by
    # consecutive separators.
    return text.split()


def is_camel_case(s):
    """Checks if a string is in (upper) CamelCase.

    A string qualifies when it starts with an uppercase ASCII letter and
    continues with any mix of ASCII letters and digits. Separators such as
    ``_`` or ``-`` disqualify it, and so does a lowercase first letter
    (``"camelCase"`` is rejected, ``"CamelCase"`` is accepted).

    Args:
        s (str): The string to be checked.

    Returns:
        bool: True if the string is in CamelCase, False otherwise.
    """
    # This pattern accepts exactly the same strings as the nested-quantifier
    # form r"^[A-Z]+([a-z0-9]*[A-Z]*[a-z0-9]*)*$", but it has a single way to
    # consume each character, so a near-miss input (a long alphanumeric run
    # followed by one illegal character) is rejected in linear time instead
    # of triggering exponential backtracking.
    return re.match(r"^[A-Z][A-Za-z0-9]*$", s) is not None


def is_snake_case(s):
    """Tell whether a string is written in snake_case.

    A snake_case string is made of one or more groups of lowercase letters
    and digits, with exactly one underscore between consecutive groups.

    Args:
        s (str): The string to be checked.

    Returns:
        bool: True if the string is in snake_case, False otherwise.
    """
    snake_match = re.match(r"^[a-z0-9]+(_[a-z0-9]+)*$", s)
    return snake_match is not None


def camel_to_snake_case(s):
    """Convert a camelCase (or PascalCase) string to snake_case.

    Args:
        s (str): The string to be converted.

    Returns:
        str: The lowercased string, with underscores marking the former
        word boundaries.
    """
    # Start a new word at every capital whose preceding character is neither
    # a capital, an underscore nor a hyphen.
    with_word_breaks = re.sub(r"(?<=[^A-Z_-])([A-Z])", r"_\1", s)

    # Inside a run of capitals, the last capital belongs to the following
    # word when it is followed by a lowercase letter or a digit, so split the
    # run just before it.
    with_acronym_breaks = re.sub(
        r"([A-Z]+)([A-Z][a-z0-9])", r"\1_\2", with_word_breaks
    )

    return with_acronym_breaks.lower()


def _wrap_indented_lines(text, indent_str, line_width):
    """Indent every line of ``text`` and hard-wrap the lines that are too long.

    A line is cut into consecutive chunks of ``line_width`` characters when its
    length plus the indentation exceeds ``line_width``; otherwise it is emitted
    as is. Trailing whitespace is stripped from every emitted piece, and every
    piece ends with a newline. ``line_width`` must be positive.
    """
    pieces = []
    for line in text.split("\n"):
        if len(line) + len(indent_str) > line_width:
            for start in range(0, len(line), line_width):
                wrap_chunk = line[start : start + line_width].rstrip()
                pieces.append(f"{indent_str}{wrap_chunk}\n")
        else:
            pieces.append(f"{indent_str}{line.rstrip()}\n")
    return "".join(pieces)


def to_pretty_string(
    value,
    indent=0,
    indent_delta=4,
    max_chars=None,
    keys=None,
    item_label=None,
    float_format=None,
):
    """Constructs a formatted string representation of various data structures (dicts, lists, tuples, and DataFrames).

    Dicts, lists and tuples are rendered recursively: each item gets a header
    line of the form ``<key or index> (<type name>):`` followed by the item
    itself, indented by ``indent_delta`` more spaces. DataFrames are rendered
    through their ``repr`` with pandas' row, column and column-width limits
    lifted. Any other value is rendered with ``str`` (or with ``float_format``
    when it is a float). Header lines are never wrapped; value lines are.

    Args:
        value: The Python data structure to be formatted.
        indent (int, optional): The current level of indentation. Defaults to 0.
        indent_delta (int, optional): Amount of spaces to add per indentation level. Defaults to 4.
        max_chars (int, optional): Max characters per line before wrapping. Defaults to terminal width - 10.
        keys (List[str], optional): For dicts, optionally specify keys and order.
            Applies to the top-level dict only; it is not passed to nested values.
        item_label (str, optional): Internal parameter for labeling items. Currently unused.
        float_format (str, optional): Format string for float values (e.g., ".2f"). Defaults to None.

    Returns:
        str: The formatted text; every line ends with a newline.

    Raises:
        ValueError: If ``keys`` names a field that ``value`` (a dict) does not contain.
    """
    max_chars = max_chars or shutil.get_terminal_size()[0] - 10
    indent_str = " " * indent

    if isinstance(value, (dict, list, tuple)):
        if isinstance(value, dict):
            keys_to_print = keys if keys is not None else list(value.keys())
            # Validate all requested keys up front, before rendering anything.
            for k in keys_to_print:
                if k not in value:
                    raise ValueError(
                        f"Dictionary does not contain field '{k}' specified in 'keys' argument. "
                        f"The available keys are {list(value.keys())}"
                    )
            labeled_items = [(k, value[k]) for k in keys_to_print]
        else:
            # Lists are labeled [i], tuples (i).
            template = "[{}]" if isinstance(value, list) else "({})"
            labeled_items = [(template.format(i), v) for i, v in enumerate(value)]

        parts = []
        for label, v in labeled_items:
            parts.append(f"{indent_str}{label} ({type(v).__name__}):\n")
            parts.append(
                to_pretty_string(
                    v,
                    indent=indent + indent_delta,
                    indent_delta=indent_delta,
                    max_chars=max_chars,
                    float_format=float_format,
                )
            )
        return "".join(parts)

    # Keep at least one character per wrapped chunk: when the indentation
    # reaches max_chars (deep nesting or a narrow terminal) the remaining width
    # would be zero or negative and the wrapping could never make progress.
    line_width = max(1, max_chars - indent)

    if isinstance(value, pd.DataFrame):
        options = [
            "display.max_rows",
            None,
            "display.max_columns",
            None,
            "display.max_colwidth",
            None,
            "display.width",
            line_width,
        ]
        if float_format is not None:
            options.extend(
                ["display.float_format", ("{:," + float_format + "}").format]
            )
        with pd.option_context(*options):
            rendered = repr(value)
    elif isinstance(value, float) and float_format:
        rendered = f"{value:{float_format}}"
    else:
        rendered = str(value)

    return _wrap_indented_lines(rendered, indent_str, line_width)


def construct_dict_as_yaml_lines(d, indent_delta=2) -> List[str]:
    """Render a nested structure of dicts, lists and scalars as yaml text lines.

    Scalars and empty containers are written on the line of the dictionary key
    that holds them; non-empty dicts and lists follow their key on separate,
    further indented lines. List elements are introduced by ``- ``.

    Args:
        d: The element to be formatted.
        indent_delta (int, optional): The amount of spaces to add for each level of indentation.
            Must be at least 2. Defaults to 2.

    Returns:
        List[str]: The yaml lines, without line terminators.
    """
    pad = " " * indent_delta
    # A list element's first line uses the last two positions of the
    # indentation for its "- " marker.
    bullet_pad = pad[:-2] + "- "
    assert (
        indent_delta >= 2
    ), f"Needs at least 2 position indentations, for the case of list elements, that are to be preceded each by ' -'. Got indent_delta={indent_delta}."

    if isinstance(d, dict):
        if len(d) == 0:
            return ["{}"]
        lines = []
        for key, val in d.items():
            # Keys that are empty or contain a space must be quoted.
            needs_quotes = (" " in key) or (key == "")
            head = (f'"{key}"' if needs_quotes else key) + ": "
            rendered = construct_dict_as_yaml_lines(val, indent_delta=indent_delta)
            assert len(rendered) > 0
            # Scalars and empty containers share the line of their key.
            fits_on_key_line = not isinstance(val, (dict, list)) or (len(val) == 0)
            if fits_on_key_line:
                assert len(rendered) == 1
                lines.append(head + rendered[0])
            else:
                lines.append(head)
                lines.extend(pad + line for line in rendered)
        return lines

    if isinstance(d, list):
        if len(d) == 0:
            return ["[]"]
        lines = []
        for val in d:
            rendered = construct_dict_as_yaml_lines(val, indent_delta=indent_delta)
            assert len(rendered) > 0
            first, *rest = rendered
            lines.append(bullet_pad + first)
            lines.extend(pad + line for line in rest)
        return lines

    # Anything else is a scalar: keep it on one line by escaping newlines and
    # double quotes, and quote it when it is empty or contains an escaped newline.
    text = str(d).replace("\n", "\\n").replace('"', '\\"')
    if text == "" or "\\n" in text:
        return [f'"{text}"']
    return [text]

def construct_dict_as_python_lines(d, indent_delta=4) -> List[str]:
    """Constructs the lines of a dictionary formatted as a piece of python code.

    A dict holding a ``"__type__"`` entry is rendered as a call,
    ``__type__<type value>(key=value, ...)``, with one keyword argument per
    remaining entry. Any other dict is rendered as a dict literal, a list as a
    list literal, a string as a double-quoted literal, and ``None``, numbers
    and booleans through their ``str`` form.

    Args:
        d: The element to be formatted.
        indent_delta (int, optional): The amount of spaces to add for each level of indentation. Defaults to 4.

    Returns:
        List[str]: The lines of python code, without line terminators.

    Raises:
        RuntimeError: If ``d`` (or a nested value) is of a type that is not handled.
    """
    # Local import: json is needed only to escape string literals.
    import json

    def quoted(text: str) -> str:
        # Escape quotes, backslashes and control characters so that the text
        # stays a valid one-line, double-quoted literal. For text without such
        # characters this is exactly f'"{text}"'.
        return json.dumps(text, ensure_ascii=False)

    indent_delta_str = " " * indent_delta

    if isinstance(d, dict):
        if len(d) == 0:
            return ["{}"]
        istype = "__type__" in d
        if istype:
            res = ["__type__" + d["__type__"] + "("]
            if len(d) == 1:
                # Nothing but the type itself: an argument-less call.
                res[0] += ")"
                return res
        else:
            res = ["{"]
        for key, val in d.items():
            if key == "__type__":
                continue
            if istype:
                # Keyword argument of the call.
                res.append(key + "=")
            else:
                printable_key = quoted(key) if isinstance(key, str) else f'"{key}"'
                res.append(printable_key + ": ")
            py_for_val = construct_dict_as_python_lines(val, indent_delta=indent_delta)
            assert len(py_for_val) > 0
            if len(py_for_val) == 1:
                res[-1] += py_for_val[0] + ","
            else:
                res[-1] += py_for_val[0]
                if py_for_val[0].startswith(("{", "[")):
                    # Inner lines of dict and list literals arrive unindented.
                    res.extend(indent_delta_str + line for line in py_for_val[1:-1])
                else:
                    # val is type, its inner lines are already indented
                    res.extend(py_for_val[1:-1])
                res.append(py_for_val[-1] + ",")
        res.append(")" if istype else "}")
        if istype:
            # Indent the arguments of the call; its opening and closing lines stay put.
            res[1:-1] = [indent_delta_str + line for line in res[1:-1]]
        return res

    if isinstance(d, list):
        if len(d) == 0:
            return ["[]"]
        res = ["["]
        for val in d:
            py_for_val = construct_dict_as_python_lines(val, indent_delta=indent_delta)
            assert len(py_for_val) > 0
            res.extend(py_for_val[:-1])
            res.append(py_for_val[-1] + ",")
        res.append("]")
        return res

    if isinstance(d, str):
        return [quoted(d)]
    if d is None or isinstance(d, (int, float, bool)):
        return [f"{d}"]
    raise RuntimeError(f"unrecognized value to print as python: {d}")


def print_dict(
    d, indent=0, indent_delta=4, max_chars=None, keys_to_print=None, log_level="info"
):
    """Log a pretty-printed rendering of ``d`` through the module logger.

    The text is produced by ``to_pretty_string`` and is preceded by a newline,
    so that it starts on a line of its own in the log.

    Args:
        d: The value to render.
        indent (int, optional): Initial indentation, in spaces. Defaults to 0.
        indent_delta (int, optional): Spaces added per nesting level. Defaults to 4.
        max_chars (int, optional): Max characters per line before wrapping.
        keys_to_print (List[str], optional): For dicts, the keys to show, in order.
        log_level (str, optional): Name of the logger method to call. Defaults to "info".
    """
    rendered = to_pretty_string(d, indent, indent_delta, max_chars, keys_to_print)
    emit = getattr(logger, log_level)
    emit("\n" + rendered)


def print_dict_as_yaml(d: dict, indent_delta=2) -> str:
    """Format ``d`` as yaml and return the result as one newline-joined string.

    Despite its name, this function prints nothing; the text is returned.

    Args:
        d (dict): The element to be formatted.
        indent_delta (int, optional): Spaces added per indentation level. Defaults to 2.

    Returns:
        str: The yaml text, with lines separated by newlines.
    """
    return "\n".join(construct_dict_as_yaml_lines(d, indent_delta=indent_delta))

def print_dict_as_python(d: dict, indent_delta=4) -> str:
    """Format ``d`` as python code and return the result as one newline-joined string.

    Despite its name, this function prints nothing; the text is returned.

    Args:
        d (dict): The element to be formatted.
        indent_delta (int, optional): Spaces added per indentation level. Defaults to 4.

    Returns:
        str: The python code, with lines separated by newlines.
    """
    rendered_lines = construct_dict_as_python_lines(d, indent_delta=indent_delta)
    assert len(rendered_lines) > 0
    return "\n".join(rendered_lines)

def nested_tuple_to_string(nested_tuple: tuple) -> str:
    """Flatten a nested tuple into a single underscore-separated string.

    Inner tuples are converted recursively; every other item is converted
    with ``str``.

    Args:
        nested_tuple (tuple): The nested tuple to be converted.

    Returns:
        str: The string representation of the nested tuple.
    """
    return "_".join(
        nested_tuple_to_string(item) if isinstance(item, tuple) else str(item)
        for item in nested_tuple
    )


def is_made_of_sub_strings(string, sub_strings):
    """Tell whether ``string`` is a concatenation of items of ``sub_strings``.

    The sub-strings are matched literally (regex metacharacters in them are
    escaped) and each may be used any number of times.

    Args:
        string (str): The string to be checked.
        sub_strings: The allowed building blocks.

    Returns:
        bool: True if ``string`` consists solely of the given sub-strings.
    """
    alternatives = "|".join(re.escape(piece) for piece in sub_strings)
    return re.match("^(" + alternatives + ")+$", string) is not None


# Given all the lines of a card preparer file, e.g. all the lines of prepare/cards/cohere_for_ai.py,
# and an object name, e.g. TaskCard(,
# return the ordinal number of the line that starts that object, in our example: the
# line number of the following line (notice that the line where TaskCard is imported
# is not supposed to return):
#         card = TaskCard(
# and the line number of the line that ends the object, in our case the line that include
# the matching close:
#         )
# This util depends on ruff to ensure this setting of the card file: that a close of one
# tag and the open of the next tag, do not sit in same line, when both tags being
# major level within TaskCard.
# It also prepares for the case that  __description__ tag does not contain balanced
# parentheses, since it is often cut in the middle, (with  "... see more at")
# flake8: noqa: B007
# flake8: noqa: C901
def lines_defining_obj_in_card(
    all_lines: List[str], obj_name: str, start_search_at_line: int = 0
) -> Tuple[int, int]:
    """Locate the lines that define an object inside a card preparer file.

    The object starts at the first line, at or after ``start_search_at_line``,
    that contains ``obj_name`` (e.g. ``"TaskCard("``), and ends at the line on
    which the brackets opened since that start are all closed again.

    Brackets inside a ``__description__`` tag are not counted, because the
    description is free text that is often truncated. The end of the
    description is found through indentation instead (see the comments in the
    body), which relies on the layout that ruff enforces on card files.

    Args:
        all_lines (List[str]): The lines of the card file.
        obj_name (str): The text that identifies the first line of the object.
        start_search_at_line (int, optional): Index of the line at which the
            search for ``obj_name`` starts. Defaults to 0.

    Returns:
        Tuple[int, int]: The indices of the first and the last line of the
        object, or ``(-1, -1)`` when ``obj_name`` is not found.

    Raises:
        ValueError: If the input lines are exhausted before the end of the
            object (or of a ``__description__`` inside it) is found.
    """
    total_lines = len(all_lines)
    starting_line = next(
        (
            i
            for i in range(start_search_at_line, total_lines)
            if obj_name in all_lines[i]
        ),
        None,
    )
    if starting_line is None:
        # obj_name is found nowhere in the searched lines (this also covers an
        # empty input and a search start beyond the last line)
        return (-1, -1)

    num_of_opens = 0
    num_of_closes = 0
    ending_line = starting_line - 1
    # Stop before stepping past the last line, so that running out of input
    # is reported by the ValueError below rather than by an IndexError.
    while ending_line < total_lines - 1:
        ending_line += 1

        if "__description__" in all_lines[ending_line]:
            # Parentheses inside the description can not be trusted, because it
            # is mainly truncated free text.
            # We do trust the indentation enforced by ruff, and the way we build
            # __description__: a line consisting of only __description__=(
            # followed by one or more lines of text, followed by a line
            # consisting of only:  ),
            # where the ) is indented with the beginning of __description__.
            # We also prepare for the case that __description__= is not followed
            # by a ( and the description does not end with a ) on its own line.
            # There, we build on ruff making the line that follows the
            # description start with the same indentation, or 4 less (i.e., the
            # following line is the closing of the card).
            tag_indentation = all_lines[ending_line].index("__description__")
            starts_with_parent = "__description__=(" in all_lines[ending_line]
            if starts_with_parent:
                last_line_to_start_with = (" " * tag_indentation) + r"\)"
            else:
                # actually, the line that follows the description
                last_line_to_start_with1 = (" " * tag_indentation) + "[^ ]"
                last_line_to_start_with2 = (" " * (tag_indentation - 4)) + "[^ ]"
                last_line_to_start_with = (
                    "("
                    + last_line_to_start_with1
                    + "|"
                    + last_line_to_start_with2
                    + ")"
                )
            end_of_description = re.compile("^" + last_line_to_start_with)
            ending_line += 1
            while ending_line < total_lines and not end_of_description.search(
                all_lines[ending_line]
            ):
                ending_line += 1
            if ending_line >= total_lines:
                raise ValueError(
                    "input lines were exhausted before the end of __description__ is found"
                )
            if "__description__" in obj_name:
                # The description itself was asked for. Without a closing ")"
                # line, it ends one line before the line that follows it.
                return (
                    starting_line,
                    ending_line if starts_with_parent else ending_line - 1,
                )

            if starts_with_parent:
                # step over the closing ")" line of the description
                ending_line += 1
                if ending_line >= total_lines:
                    raise ValueError(
                        "input lines were exhausted before the matching close is found"
                    )

            # we continue in the card, having passed the description; ending_line
            # points to the line that follows the description

        num_of_opens += len(re.findall(r"[({[]", all_lines[ending_line]))
        num_of_closes += len(re.findall(r"[)}\]]", all_lines[ending_line]))
        if num_of_closes == num_of_opens:
            break

    if num_of_closes != num_of_opens:
        raise ValueError(
            "input lines were exhausted before the matching close is found"
        )

    return (starting_line, ending_line)

1	import re	1✔
2	import shutil	1✔
3	from typing import List, Tuple	1✔
4
5	import pandas as pd	1✔
6
7	from .logging_utils import get_logger	1✔
8
9	logger = get_logger()	1✔
10
11
12	def split_words(s):	1✔
13	"""Splits a string into words based on PascalCase, camelCase, snake_case, kebab-case, and numbers attached to strings.
14
15	Args:
16	s (str): The string to be split.
17
18	Returns:
19	list: The list of words obtained after splitting the string.
20	"""
21	# Split PascalCase or camelCase
22	s = re.sub(r"([A-Z][a-z]+)", r" \1", re.sub(r"([A-Z]+)", r" \1", s)).strip()	1✔
23	# Split snake_case or kebab-case
24	s = re.sub(r"[_-]", " ", s)	1✔
25	# Split numbers attached to strings
26	s = re.sub(r"([a-zA-Z])(\d)", r"\1 \2", s)	1✔
27	s = re.sub(r"(\d)([a-zA-Z])", r"\1 \2", s)	1✔
28	# Split the string into words based on spaces
29	return s.split()	1✔
30
31
32	def is_camel_case(s):	1✔
33	"""Checks if a string is in camelCase.
34
35	Args:
36	s (str): The string to be checked.
37
38	Returns:
39	bool: True if the string is in camelCase, False otherwise.
40	"""
41	return re.match(r"^[A-Z]+([a-z0-9]*[A-Z]*[a-z0-9]*)*$", s) is not None	1✔
42
43
44	def is_snake_case(s):	1✔
45	"""Checks if a string is in snake_case.
46
47	Args:
48	s (str): The string to be checked.
49
50	Returns:
51	bool: True if the string is in snake_case, False otherwise.
52	"""
53	return re.match(r"^[a-z0-9]+(_[a-z0-9]+)*$", s) is not None	1✔
54
55
56	def camel_to_snake_case(s):	1✔
57	"""Converts a string from camelCase to snake_case.
58
59	Args:
60	s (str): The string to be converted.
61
62	Returns:
63	str: The string converted to snake_case.
64	"""
65	# Add an underscore before every uppercase letter that is followed by a lowercase letter or digit and not preceded by an underscore, a hyphen or an uppercase letter
66	s = re.sub(r"(?<=[^A-Z_-])([A-Z])", r"_\1", s)	1✔
67
68	# Ensure there's an underscore before any uppercase letter that's followed by a lowercase letter or digit and comes after a sequence of uppercase letters
69	s = re.sub(r"([A-Z]+)([A-Z][a-z0-9])", r"\1_\2", s)	1✔
70
71	return s.lower()	1✔
72
73
74	def to_pretty_string(	1✔
75	value,
76	indent=0,
77	indent_delta=4,
78	max_chars=None,
79	keys=None,
80	item_label=None,
81	float_format=None,
82	):
83	"""Constructs a formatted string representation of various data structures (dicts, lists, tuples, and DataFrames).
84
85	Args:
86	value: The Python data structure to be formatted.
87	indent (int, optional): The current level of indentation. Defaults to 0.
88	indent_delta (int, optional): Amount of spaces to add per indentation level. Defaults to 4.
89	max_chars (int, optional): Max characters per line before wrapping. Defaults to terminal width - 10.
90	keys (List[str], optional): For dicts, optionally specify keys and order.
91	item_label (str, optional): Internal parameter for labeling items.
92	float_format (str, optional): Format string for float values (e.g., ".2f"). Defaults to None.
93	"""
94	max_chars = max_chars or shutil.get_terminal_size()[0] - 10	1✔
95	indent_str = " " * indent	1✔
96	res = ""	1✔
97
98	if isinstance(value, dict):	1✔
99	keys_to_print = keys if keys is not None else list(value.keys())	1✔
100
101	for k in keys_to_print:	1✔
102	if k not in value:	1✔
103	raise ValueError(	×
104	f"Dictionary does not contain field '{k}' specified in 'keys' argument. "
105	f"The available keys are {list(value.keys())}"
106	)
107
108	for k in keys_to_print:	1✔
109	v = value[k]	1✔
110	item_header = f"{k} ({type(v).__name__})"	1✔
111	res += f"{indent_str}{item_header}:\n"	1✔
112	res += to_pretty_string(	1✔
113	v,
114	indent=indent + indent_delta,
115	indent_delta=indent_delta,
116	max_chars=max_chars,
117	float_format=float_format,
118	)
119
120	elif isinstance(value, (list, tuple)):	1✔
121	for i, v in enumerate(value):	1✔
122	label = f"[{i}]" if isinstance(value, list) else f"({i})"	1✔
123	item_header = f"{label} ({type(v).__name__})"	1✔
124	res += f"{indent_str}{item_header}:\n"	1✔
125	res += to_pretty_string(	1✔
126	v,
127	indent=indent + indent_delta,
128	indent_delta=indent_delta,
129	max_chars=max_chars,
130	float_format=float_format,
131	)
132
133	elif isinstance(value, pd.DataFrame):	1✔
134	line_width = max_chars - indent	1✔
135	options = [	1✔
136	"display.max_rows",
137	None,
138	"display.max_columns",
139	None,
140	"display.max_colwidth",
141	None,
142	"display.width",
143	line_width,
144	# 'display.colheader_justify', 'left'
145	]
146	if float_format is not None:	1✔
147	options.extend(	×
148	["display.float_format", ("{:," + float_format + "}").format]
149	)
150	with pd.option_context(*options):	1✔
151	df_str = repr(value)	1✔
152
153	lines = df_str.split("\n")	1✔
154	for line in lines:	1✔
155	if len(line) + len(indent_str) > line_width:	1✔
156	start = 0	1✔
157	while start < len(line):	1✔
158	wrap_chunk = line[start : start + line_width].rstrip()	1✔
159	res += f"{indent_str}{wrap_chunk}\n"	1✔
160	start += line_width	1✔
161	else:
162	res += f"{indent_str}{line.rstrip()}\n"	1✔
163
164	else:
165	# Handle scalar values, including floats
166	if isinstance(value, float) and float_format:	1✔
167	formatted_value = f"{value:{float_format}}"	×
168	else:
169	formatted_value = str(value)	1✔
170
171	# Wrap lines according to max_chars
172	line_width = max_chars - indent	1✔
173	lines = formatted_value.split("\n")	1✔
174	for line in lines:	1✔
175	if len(line) + len(indent_str) > line_width:	1✔
176	start = 0	1✔
177	while start < len(line):	1✔
178	wrap_chunk = line[start : start + line_width].rstrip()	1✔
179	res += f"{indent_str}{wrap_chunk}\n"	1✔
180	start += line_width	1✔
181	else:
182	res += f"{indent_str}{line.rstrip()}\n"	1✔
183
184	return res	1✔
185
186
187	def construct_dict_as_yaml_lines(d, indent_delta=2) -> List[str]:	1✔
188	"""Constructs the lines of a dictionary formatted as yaml.
189
190	Args:
191	d: The element to be formatted.
192	indent_delta (int, optional): The amount of spaces to add for each level of indentation. Defaults to 2.
193	"""
194
195	def is_simple(val) -> bool:	×
196	# if can show in same line as dictionary's key
197	return not isinstance(val, (dict, list)) or (len(val) == 0)	×
198
199	indent_delta_str = " " * indent_delta	×
200	ticked_indent_delta_str = indent_delta_str[:-2] + "- "	×
201	assert (	×
202	indent_delta >= 2
203	), f"Needs at least 2 position indentations, for the case of list elements, that are to be preceded each by ' -'. Got indent_delta={indent_delta}."
204	res = [] # computed hereunder as a list of lines, that are indented only at the end	×
205
206	if isinstance(d, dict):	×
207	if len(d) == 0:	×
208	return ["{}"]	×
209	for key, val in d.items():	×
210	printable_key = f'"{key}"' if (" " in key) or (key == "") else key	×
211	res.append(printable_key + ": ")	×
212	yaml_for_val = construct_dict_as_yaml_lines(val, indent_delta=indent_delta)	×
213	assert len(yaml_for_val) > 0	×
214	if is_simple(val):	×
215	assert len(yaml_for_val) == 1	×
216	res[-1] += yaml_for_val[0]	×
217	else:
218	for line in yaml_for_val:	×
219	res.append(indent_delta_str + line)	×
220	return res	×
221
222	if isinstance(d, list):	×
223	if len(d) == 0:	×
224	return ["[]"]	×
225	for val in d:	×
226	yaml_for_val = construct_dict_as_yaml_lines(val, indent_delta=indent_delta)	×
227	assert len(yaml_for_val) > 0	×
228	res.append(ticked_indent_delta_str + yaml_for_val[0])	×
229	for line in yaml_for_val[1:]:	×
230	res.append(indent_delta_str + line)	×
231	return res	×
232
233	# d1 = re.sub(r"(\n+)", r'"\1"', str(d))
234	d1 = str(d).replace("\n", "\\n").replace('"', '\\"')	×
235	if "\\n" in d1 or d1 == "":	×
236	d1 = f'"{d1}"'	×
237	return [d1]	×
238
239	def construct_dict_as_python_lines(d, indent_delta=4) -> List[str]:	1✔
240	"""Constructs the lines of a dictionary formatted as a piece of python code.
241
242	Args:
243	d: The element to be formatted.
244	indent_delta (int, optional): The amount of spaces to add for each level of indentation. Defaults to 2.
245	"""
246	indent_delta_str = " " * indent_delta	×
247	res = [] # computed hereunder as a list of lines, that are indented only at the end	×
248
249	if isinstance(d, dict):	×
250	istype = False	×
251	if len(d) == 0:	×
252	return ["{}"]	×
253	if "__type__" in d:	×
254	istype = True	×
255	res = ["__type__" + d["__type__"] + "("]	×
256	if len(d) == 1:	×
257	res[0] += ")"	×
258	return res	×
259	else:
260	res = ["{"]	×
261	for key, val in d.items():	×
262	if key == "__type__":	×
263	continue	×
264	printable_key = f'"{key}"' if not istype else key	×
265	res.append(printable_key + ("=" if istype else ": "))	×
266	py_for_val = construct_dict_as_python_lines(val, indent_delta=indent_delta)	×
267	assert len(py_for_val) > 0	×
268	if len(py_for_val) == 1:	×
269	res[-1] += (py_for_val[0] +",")	×
270	else:
271	res[-1] += py_for_val[0]	×
272	if py_for_val[0].startswith("{") or py_for_val[0].startswith("["):	×
273	for line in py_for_val[1:-1]:	×
274	res.append(indent_delta_str + line)	×
275	else:
276	# val is type, its inner lines are already indented
277	res.extend(py_for_val[1:-1])	×
278	res.append(py_for_val[-1]+",")	×
279	res.append(")" if istype else "}")	×
280	if istype:	×
281	for i in range(1,len(res)-1):	×
282	res[i] = indent_delta_str+res[i]	×
283	return res	×
284
285	if isinstance(d, list):	×
286	if len(d) == 0:	×
287	return ["[]"]	×
288	res = ["["]	×
289	for val in d:	×
290	py_for_val = construct_dict_as_python_lines(val, indent_delta=indent_delta)	×
291	assert len(py_for_val) > 0	×
292	for line in py_for_val[:-1]:	×
293	res.append(line)	×
294	res.append(py_for_val[-1] + ",")	×
295	res.append("]")	×
296	return res	×
297
298	# d1 = re.sub(r"(\n+)", r'"\1"', str(d))
299	if isinstance(d, str):	×
300	return [f'"{d}"']	×
301	if d is None or isinstance (d, (int, float, bool)):	×
302	return [f"{d}"]	×
303	raise RuntimeError(f"unrecognized value to print as python: {d}")	×
304
305
306	def print_dict(	1✔
307	d, indent=0, indent_delta=4, max_chars=None, keys_to_print=None, log_level="info"
308	):
309	dict_str = to_pretty_string(d, indent, indent_delta, max_chars, keys_to_print)	1✔
310	dict_str = "\n" + dict_str	1✔
311	getattr(logger, log_level)(dict_str)	1✔
312
313
314	def print_dict_as_yaml(d: dict, indent_delta=2) -> str:	1✔
315	yaml_lines = construct_dict_as_yaml_lines(d, indent_delta=indent_delta)	×
316	# yaml_lines = [re.sub(r"(\n+)", r'"\1"', line) for line in yaml_lines]
317	# yaml_lines = [line.replace("\n", "\\n") for line in yaml_lines]
318	return "\n".join(yaml_lines)	×
319
320	def print_dict_as_python(d: dict, indent_delta=4) -> str:	1✔
321	py_lines = construct_dict_as_python_lines(d, indent_delta=indent_delta)	×
322	assert len(py_lines)> 0	×
323	return "\n".join(py_lines)	×
324
325	def nested_tuple_to_string(nested_tuple: tuple) -> str:	1✔
326	"""Converts a nested tuple to a string, with elements separated by underscores.
327
328	Args:
329	nested_tuple (tuple): The nested tuple to be converted.
330
331	Returns:
332	str: The string representation of the nested tuple.
333	"""
334	result = []	×
335	for item in nested_tuple:	×
336	if isinstance(item, tuple):	×
337	result.append(nested_tuple_to_string(item))	×
338	else:
339	result.append(str(item))	×
340	return "_".join(result)	×
341
342
343	def is_made_of_sub_strings(string, sub_strings):	1✔
344	pattern = "^(" + "|".join(map(re.escape, sub_strings)) + ")+$"	1✔
345	return bool(re.match(pattern, string))	1✔
346
347
348	# Given all the lines of a card preparer file, e.g. all the lines of prepare/cards/cohere_for_ai.py,
349	# and an object name, e.g. TaskCard(,
350	# return the ordinal number of the line that starts that object, in our example: the
351	# line number of the following line (notice that the line where TaskCard is imported
352	# is not supposed to return):
353	# card = TaskCard(
354	# and the line number of the line that ends the object, in our case the line that include
355	# the matching close:
356	# )
357	# This util depends on ruff to ensure this setting of the card file: that a close of one
358	# tag and the open of the next tag, do not sit in same line, when both tags being
359	# major level within TaskCard.
360	# It also prepares for the case that __description__ tag does not contain balanced
361	# parentheses, since it is often cut in the middle, (with "... see more at")
362	# flake8: noqa: B007
363	# flake8: noqa: C901
364	def lines_defining_obj_in_card(	1✔
365	all_lines: List[str], obj_name: str, start_search_at_line: int = 0
366	) -> Tuple[int, int]:
367	for starting_line in range(start_search_at_line, len(all_lines)):	1✔
368	line = all_lines[starting_line]	1✔
369	if obj_name in line:	1✔
370	break	1✔
371	if obj_name not in line:	1✔
372	# obj_name found no where in the input lines
373	return (-1, -1)	×
374	num_of_opens = 0	1✔
375	num_of_closes = 0	1✔
376	ending_line = starting_line - 1	1✔
377	while ending_line < len(all_lines):	1✔
378	ending_line += 1	1✔
379
380	if "__description__" in all_lines[ending_line]:	1✔
381	# can not trust parentheses inside description, because this is mainly truncated
382	# free text.
383	# We do trust the indentation enforced by ruff, and the way we build __description__:
384	# a line consisting of only __description__=(
385	# followed by one or more lines of text, can not trust opens and closes
386	# in them, followed by a line consisting of only: ),
387	# where the ) is indented with the beginning of __description__
388	# We also prepare for the case that, when not entered by us, __description__=
389	# is not followed by a ( and the whole description does not end with a single ) in its line.
390	# We build on ruff making the line following the description start with same indentation
391	# or 4 less (i.e., the following line is the closing of the card).
392	tag_indentation = all_lines[ending_line].index("__description__")	1✔
393	starts_with_parent = "__description__=(" in all_lines[ending_line]	1✔
394	if starts_with_parent:	1✔
395	last_line_to_start_with = (" " * tag_indentation) + r"\)"	1✔
396	else:
397	# actually, the line that follows the description
398	last_line_to_start_with1 = (" " * tag_indentation) + "[^ ]"	1✔
399	last_line_to_start_with2 = (" " * (tag_indentation - 4)) + "[^ ]"	1✔
400	last_line_to_start_with = (	1✔
401	"("
402	+ last_line_to_start_with1
403	+ "|"
404	+ last_line_to_start_with2
405	+ ")"
406	)
407	ending_line += 1	1✔
408	while not re.search("^" + last_line_to_start_with, all_lines[ending_line]):	1✔
409	ending_line += 1	1✔
410	if "__description__" in obj_name:	1✔
411	return (	1✔
412	starting_line,
413	ending_line if starts_with_parent else ending_line - 1,
414	)
415
416	if starts_with_parent:	1✔
417	ending_line += 1	1✔
418
419	# we conrinue in card, having passed the description, ending line points
420	# to the line that follows description
421
422	num_of_opens += len(re.findall(r"[({[]", all_lines[ending_line]))	1✔
423	num_of_closes += len(re.findall(r"[)}\]]", all_lines[ending_line]))	1✔
424	if num_of_closes == num_of_opens:	1✔
425	break	1✔
426
427	if num_of_closes != num_of_opens:	1✔
428	raise ValueError(	×
429	"input lines were exhausted before the matching close is found"
430	)
431
432	return (starting_line, ending_line)	1✔

IBM / unitxt / 15116080928

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous