17888446790

Committed 21 Sep 2025 03:34AM UTC coverage: 80.804% (-0.1%) from 80.906%

Build # 17888446790

Build Type

Pull #1939

github

Committed by

web-flow

Commit Message

Merge e4db288a0 into 95ad743ba

Pull Request Pull Request #1939: light fast removal of register_all_artifacts for unitxt classes

Run Details

1599 of 1997 branches covered (80.07%)

Branch coverage included in aggregate %.

10924 of 13501 relevant lines covered (80.91%)

0.81 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

91.6

src/unitxt/text_utils.py

import re
import shutil
from typing import List, Tuple

import pandas as pd

from .logging_utils import get_logger

logger = get_logger()


def split_words(s):
    """Breaks an identifier-like string into its constituent words.

    Word boundaries are recognised at PascalCase / camelCase humps, at underscores
    and hyphens, and between letters and digits.

    Args:
        s (str): The string to be split.

    Returns:
        list: The words found in ``s``, in their original order.
    """
    # First put a space before every run of capitals, then before every capital
    # that opens a lowercase word, so that each case hump becomes a gap.
    spaced = re.sub(r"([A-Z]+)", r" \1", s)
    spaced = re.sub(r"([A-Z][a-z]+)", r" \1", spaced)
    spaced = spaced.strip()
    # Underscores and hyphens are plain separators.
    spaced = re.sub(r"[_-]", " ", spaced)
    # Detach digits from neighbouring letters, first letter->digit, then digit->letter.
    for boundary in (r"([a-zA-Z])(\d)", r"(\d)([a-zA-Z])"):
        spaced = re.sub(boundary, r"\1 \2", spaced)
    # Whatever whitespace we accumulated now delimits the words.
    return spaced.split()


def is_camel_case(s):
    """Checks if a string is a CamelCase name that starts with an upper case letter.

    A string qualifies when its first character is an ASCII upper case letter and
    every following character is an ASCII letter or digit. Strings that start with
    a lower case letter (e.g. ``camelCase``), or that contain underscores, hyphens
    or spaces, do not qualify.

    Args:
        s (str): The string to be checked.

    Returns:
        bool: True if the string is in CamelCase as described above, False otherwise.
    """
    # Accepts exactly the same strings as the former pattern
    # "^[A-Z]+([a-z0-9]*[A-Z]*[a-z0-9]*)*$", but without the nested quantifiers
    # that made near-miss inputs (a long valid prefix followed by one invalid
    # character) backtrack exponentially.
    return re.match(r"^[A-Z][A-Za-z0-9]*$", s) is not None


def is_snake_case(s):
    """Tells whether a string is written in snake_case.

    A snake_case string is made of one or more groups of lower case letters and
    digits, with consecutive groups joined by exactly one underscore.

    Args:
        s (str): The string to be checked.

    Returns:
        bool: True if the string is in snake_case, False otherwise.
    """
    snake_pattern = r"^[a-z0-9]+(_[a-z0-9]+)*$"
    # A match object is always truthy, so bool() maps match / no-match to True / False.
    return bool(re.match(snake_pattern, s))


def camel_to_snake_case(s):
    """Rewrites a camelCase / PascalCase string as snake_case.

    Args:
        s (str): The string to be converted.

    Returns:
        str: The lower-cased string, with underscores inserted at word boundaries.
    """
    # Pass 1: a capital that directly follows anything other than a capital, an
    # underscore or a hyphen opens a new word.
    hump = re.compile(r"(?<=[^A-Z_-])([A-Z])")
    # Pass 2: inside a run of capitals, the last capital belongs to the next word
    # when it is followed by a lower case letter or a digit (e.g. HTTPServer).
    acronym_tail = re.compile(r"([A-Z]+)([A-Z][a-z0-9])")

    with_humps = hump.sub(r"_\1", s)
    separated = acronym_tail.sub(r"\1_\2", with_humps)
    return separated.lower()


def snake_to_camel_case(s):
    """Turns a snake_case string into CamelCase, starting with an upper case letter.

    The result is meant to serve as a class name, hence the leading capital.

    This is not always the inverse of camel_to_snake_case, e.g.:
    camel_to_snake_case(LoadHF) = load_hf, whereas snake_to_camel_case(load_hf) = LoadHf
    """
    # Surrounding whitespace is dropped, then every underscore-separated part is
    # capitalized (first letter upper, the rest lower) and the parts are glued together.
    return "".join(part.capitalize() for part in s.strip().split("_"))


def _wrap_indented_lines(text, indent_str, line_width):
    """Hard-wraps each line of ``text`` at ``line_width`` characters, prefixing every piece with ``indent_str``."""
    # A non-positive width (indentation deeper than max_chars) would keep the
    # slicing cursor from ever advancing; always move forward by at least one character.
    step = max(line_width, 1)
    pieces = []
    for line in text.split("\n"):
        if len(line) + len(indent_str) > line_width:
            for start in range(0, len(line), step):
                wrap_chunk = line[start : start + step].rstrip()
                pieces.append(f"{indent_str}{wrap_chunk}\n")
        else:
            pieces.append(f"{indent_str}{line.rstrip()}\n")
    return "".join(pieces)


def to_pretty_string(
    value,
    indent=0,
    indent_delta=4,
    max_chars=None,
    keys=None,
    item_label=None,
    float_format=None,
):
    """Constructs a formatted string representation of various data structures (dicts, lists, tuples, and DataFrames).

    Dicts, lists and tuples are rendered one item at a time: a header line of the form
    ``<key or index> (<type name>):`` followed by the item itself, rendered recursively
    with ``indent_delta`` more spaces of indentation. DataFrames are rendered through
    their ``repr`` and any other value through ``str`` (or ``float_format`` for floats);
    the resulting text lines are hard-wrapped to fit ``max_chars``.

    Args:
        value: The Python data structure to be formatted.
        indent (int, optional): The current level of indentation. Defaults to 0.
        indent_delta (int, optional): Amount of spaces to add per indentation level. Defaults to 4.
        max_chars (int, optional): Max characters per line before wrapping. Defaults to terminal width - 10.
        keys (List[str], optional): For dicts, optionally specify keys and order. Applies to the
            top level dict only; it is not passed on to nested values.
        item_label (str, optional): Currently unused; kept for backward compatibility.
        float_format (str, optional): Format string for float values (e.g., ".2f"). Defaults to None.

    Returns:
        str: The formatted text, every line terminated by a newline.

    Raises:
        ValueError: If ``keys`` names a key that is missing from the dict ``value``.
    """
    max_chars = max_chars or shutil.get_terminal_size()[0] - 10
    indent_str = " " * indent

    if isinstance(value, (dict, list, tuple)):
        if isinstance(value, dict):
            keys_to_print = keys if keys is not None else list(value.keys())
            # Validate all requested keys up front, before rendering anything.
            for k in keys_to_print:
                if k not in value:
                    raise ValueError(
                        f"Dictionary does not contain field '{k}' specified in 'keys' argument. "
                        f"The available keys are {list(value.keys())}"
                    )
            labelled_items = [(k, value[k]) for k in keys_to_print]
        else:
            # List items are labelled [i], tuple items (i).
            as_list = isinstance(value, list)
            labelled_items = [
                (f"[{i}]" if as_list else f"({i})", v) for i, v in enumerate(value)
            ]

        parts = []
        for label, v in labelled_items:
            parts.append(f"{indent_str}{label} ({type(v).__name__}):\n")
            parts.append(
                to_pretty_string(
                    v,
                    indent=indent + indent_delta,
                    indent_delta=indent_delta,
                    max_chars=max_chars,
                    float_format=float_format,
                )
            )
        return "".join(parts)

    line_width = max_chars - indent

    if isinstance(value, pd.DataFrame):
        # Lift pandas' own truncation limits so the whole frame is rendered, and
        # let pandas lay the columns out within the width that remains after indenting.
        options = [
            "display.max_rows",
            None,
            "display.max_columns",
            None,
            "display.max_colwidth",
            None,
            "display.width",
            line_width,
        ]
        if float_format is not None:
            options.extend(
                ["display.float_format", ("{:," + float_format + "}").format]
            )
        with pd.option_context(*options):
            text = repr(value)
    elif isinstance(value, float) and float_format:
        text = f"{value:{float_format}}"
    else:
        text = str(value)

    return _wrap_indented_lines(text, indent_str, line_width)


def construct_dict_as_yaml_lines(d, indent_delta=2) -> List[str]:
    """Constructs the lines of a dictionary formatted as yaml.

    Dicts become ``key: value`` lines and lists become ``- item`` lines. A scalar value,
    or an empty collection (rendered ``{}`` / ``[]``), is placed on the same line as its
    key; a non-empty nested dict or list always starts on the following line, indented by
    ``indent_delta`` spaces, so that the nesting stays unambiguous. Scalars are rendered
    through ``str``, with newlines and double quotes backslash-escaped; values that contain
    a newline, or are empty, are wrapped in double quotes.

    Args:
        d: The element to be formatted.
        indent_delta (int, optional): The amount of spaces to add for each level of indentation. Defaults to 2.

    Returns:
        List[str]: The yaml lines, without trailing newlines.

    Raises:
        AssertionError: If ``indent_delta`` is smaller than 2.
    """
    # Validate before deriving anything from indent_delta.
    assert (
        indent_delta >= 2
    ), f"Needs at least 2 position indentations, for the case of list elements, that are to be preceded each by ' -'. Got indent_delta={indent_delta}."
    indent_delta_str = " " * indent_delta
    # A list item is marked by "- " occupying the last two positions of the indentation.
    ticked_indent_delta_str = indent_delta_str[:-2] + "- "

    if isinstance(d, dict):
        if len(d) == 0:
            return ["{}"]
        res = []
        for key, val in d.items():
            # Keys are not necessarily strings (e.g. ints); render them through str.
            key_text = str(key)
            printable_key = (
                f'"{key_text}"' if (" " in key_text) or (key_text == "") else key_text
            )
            # The recursive call always yields at least one line.
            yaml_for_val = construct_dict_as_yaml_lines(val, indent_delta=indent_delta)
            # A non-empty nested collection must not share the line of its key, even
            # when it renders to a single line: "b: c: x" and "d: - 1" are not valid yaml.
            is_block_collection = isinstance(val, (dict, list)) and len(val) > 0
            if len(yaml_for_val) == 1 and not is_block_collection:
                res.append(printable_key + ": " + yaml_for_val[0])
            else:
                res.append(printable_key + ": ")
                res.extend(indent_delta_str + line for line in yaml_for_val)
        return res

    if isinstance(d, list):
        if len(d) == 0:
            return ["[]"]
        res = []
        for val in d:
            yaml_for_val = construct_dict_as_yaml_lines(val, indent_delta=indent_delta)
            # The first line of the item carries the tick, the rest align under it.
            res.append(ticked_indent_delta_str + yaml_for_val[0])
            res.extend(indent_delta_str + line for line in yaml_for_val[1:])
        return res

    d1 = str(d).replace("\n", "\\n").replace('"', '\\"')
    if "\\n" in d1 or d1 == "":
        d1 = f'"{d1}"'
    return [d1]


def construct_dict_as_python_lines(d, indent_delta=4) -> List[str]:
    """Constructs the lines of a dictionary formatted as a piece of python code.

    A dict holding a ``"__type__"`` entry is rendered in call form: the first line is the
    literal text ``__type__`` immediately followed by the value of that entry and ``(``,
    then one ``key=value,`` line per remaining entry, then ``)``. Any other dict is rendered
    as ``{`` / ``"key": value,`` / ``}`` and a list as ``[`` / ``value,`` / ``]``.
    Strings are wrapped in double quotes as they are (no escaping is applied), and
    None, ints, floats and bools are rendered through an f-string.

    Args:
        d: The element to be formatted.
        indent_delta (int, optional): The amount of spaces to add for each level of indentation. Defaults to 4.

    Returns:
        List[str]: The lines of code, without trailing newlines.

    Raises:
        RuntimeError: If a value is not a dict, list, str, None, int, float or bool.
    """
    indent_delta_str = " " * indent_delta
    res = []  # computed hereunder as a list of lines, that are indented only at the end

    if isinstance(d, dict):
        istype = False
        if len(d) == 0:
            return ["{}"]
        if "__type__" in d:
            istype = True
            # opening line of the call form: "__type__<type name>("
            res = ["__type__" + d["__type__"] + "("]
            if len(d) == 1:
                # nothing but the type itself: a call with no arguments, on one line
                res[0] += ")"
                return res
        else:
            res = ["{"]
        for key, val in d.items():
            if key == "__type__":
                # already consumed by the opening line
                continue
            # keyword-argument style (key=) inside a type, quoted-key style ("key": ) inside a plain dict
            printable_key = f'"{key}"' if not istype else key
            res.append(printable_key + ("=" if istype else ": "))
            py_for_val = construct_dict_as_python_lines(val, indent_delta=indent_delta)
            assert len(py_for_val) > 0
            if len(py_for_val) == 1:
                # single-line value: goes on the same line as its key
                res[-1] += py_for_val[0] + ","
            else:
                # multi-line value: its opening line joins the key's line
                res[-1] += py_for_val[0]
                if py_for_val[0].startswith("{") or py_for_val[0].startswith("["):
                    # plain dicts and lists do not indent their own inner lines, so it is done here
                    for line in py_for_val[1:-1]:
                        res.append(indent_delta_str + line)
                else:
                    # val is type, its inner lines are already indented
                    res.extend(py_for_val[1:-1])
                # the closing line of the value, followed by the item separator
                res.append(py_for_val[-1] + ",")
        res.append(")" if istype else "}")
        if istype:
            # a type indents everything between its opening and closing lines
            for i in range(1, len(res) - 1):
                res[i] = indent_delta_str + res[i]
        return res

    if isinstance(d, list):
        if len(d) == 0:
            return ["[]"]
        res = ["["]
        for val in d:
            py_for_val = construct_dict_as_python_lines(val, indent_delta=indent_delta)
            assert len(py_for_val) > 0
            # the lines of each element are copied as they are; only its last line gets the separator
            for line in py_for_val[:-1]:
                res.append(line)
            res.append(py_for_val[-1] + ",")
        res.append("]")
        return res

    # scalars: strings are double-quoted verbatim, the other supported types use their str form
    if isinstance(d, str):
        return [f'"{d}"']
    if d is None or isinstance(d, (int, float, bool)):
        return [f"{d}"]
    raise RuntimeError(f"unrecognized value to print as python: {d}")


def print_dict(
    d, indent=0, indent_delta=4, max_chars=None, keys_to_print=None, log_level="info"
):
    """Logs a pretty-printed rendering of ``d`` through the module logger.

    Args:
        d: The data structure to render (see to_pretty_string).
        indent (int, optional): Initial indentation, in spaces. Defaults to 0.
        indent_delta (int, optional): Spaces added per nesting level. Defaults to 4.
        max_chars (int, optional): Max characters per line before wrapping.
        keys_to_print (list, optional): For a dict, the keys to render and their order.
        log_level (str, optional): Name of the logger method to call. Defaults to "info".
    """
    rendered = to_pretty_string(
        d,
        indent=indent,
        indent_delta=indent_delta,
        max_chars=max_chars,
        keys=keys_to_print,
    )
    emit = getattr(logger, log_level)
    # The leading newline makes the rendering start on a fresh line of the log.
    emit("\n" + rendered)


def print_dict_as_yaml(d: dict, indent_delta=2) -> str:
    """Returns ``d`` formatted as yaml text (nothing is actually printed).

    Args:
        d (dict): The element to be formatted.
        indent_delta (int, optional): Spaces added per nesting level. Defaults to 2.

    Returns:
        str: The lines built by construct_dict_as_yaml_lines, joined by newlines.
    """
    return "\n".join(construct_dict_as_yaml_lines(d, indent_delta=indent_delta))


def print_dict_as_python(d: dict, indent_delta=4) -> str:
    """Returns ``d`` formatted as a piece of python code (nothing is actually printed).

    Args:
        d (dict): The element to be formatted.
        indent_delta (int, optional): Spaces added per nesting level. Defaults to 4.

    Returns:
        str: The lines built by construct_dict_as_python_lines, joined by newlines.
    """
    code_lines = construct_dict_as_python_lines(d, indent_delta=indent_delta)
    # The line builder is expected to always produce at least one line.
    assert len(code_lines) > 0
    return "\n".join(code_lines)


def nested_tuple_to_string(nested_tuple: tuple) -> str:
    """Flattens a nested tuple into a single underscore-separated string.

    Args:
        nested_tuple (tuple): The nested tuple to be converted.

    Returns:
        str: The ``str`` forms of the elements, joined by underscores; inner tuples
        are converted recursively in the same way.
    """
    return "_".join(
        nested_tuple_to_string(element) if isinstance(element, tuple) else str(element)
        for element in nested_tuple
    )


def is_made_of_sub_strings(string, sub_strings):
    """Tells whether ``string`` is a concatenation of one or more of the given ``sub_strings``.

    Each sub string is matched literally (regex special characters are escaped) and
    may be used any number of times.
    """
    alternatives = "|".join(re.escape(sub_string) for sub_string in sub_strings)
    pattern = "^(" + alternatives + ")+$"
    return re.match(pattern, string) is not None


# Given all the lines of a card preparer file, e.g. all the lines of prepare/cards/cohere_for_ai.py,
# and an object name, e.g. TaskCard(,
# return the ordinal number of the line that starts that object, in our example: the
# line number of the following line (notice that the line where TaskCard is imported
# is not supposed to be returned):
#         card = TaskCard(
# and the line number of the line that ends the object, in our case the line that includes
# the matching close:
#         )
# This util depends on ruff to ensure this layout of the card file: that the close of one
# tag and the open of the next tag do not sit on the same line, when both tags are
# major level within TaskCard.
# It also prepares for the case that the __description__ tag does not contain balanced
# parentheses, since it is often cut in the middle (with "... see more at").
# flake8: noqa: B007
# flake8: noqa: C901
def lines_defining_obj_in_card(
    all_lines: List[str], obj_name: str, start_search_at_line: int = 0
) -> Tuple[int, int]:
    """Locates the lines over which an object spans in the lines of a card preparer file.

    The object starts at the first line, from ``start_search_at_line`` on, that contains
    ``obj_name``, and ends at the first line at which the count of closing brackets
    (``)``, ``}``, ``]``) catches up with the count of opening ones (``(``, ``{``, ``[``).
    The text of a ``__description__`` tag is skipped by indentation rather than scanned,
    because brackets inside it are not necessarily balanced.

    Args:
        all_lines (List[str]): The lines of the file.
        obj_name (str): Text identifying the opening of the object, e.g. ``TaskCard(``.
        start_search_at_line (int, optional): Index of the first line to look at. Defaults to 0.

    Returns:
        Tuple[int, int]: The indices of the first and the last line of the object, or
        ``(-1, -1)`` if ``obj_name`` does not show in the searched lines.

    Raises:
        ValueError: If the lines are exhausted before the end of the object is found.
    """
    exhausted_message = "input lines were exhausted before the matching close is found"
    num_of_lines = len(all_lines)

    # Not finding obj_name includes the case that there are no lines to search at all.
    starting_line = None
    for candidate in range(start_search_at_line, num_of_lines):
        if obj_name in all_lines[candidate]:
            starting_line = candidate
            break
    if starting_line is None:
        # obj_name found no where in the input lines
        return (-1, -1)

    num_of_opens = 0
    num_of_closes = 0
    ending_line = starting_line - 1
    # Stop one short of the end, since the body advances ending_line before reading it.
    while ending_line < num_of_lines - 1:
        ending_line += 1

        if "__description__" in all_lines[ending_line]:
            # can not trust parentheses inside description, because this is mainly truncated
            # free text.
            # We do trust the indentation enforced by ruff, and the way we build __description__:
            # a line consisting of only __description__=(
            # followed by one or more lines of text, can not trust opens and closes
            # in them, followed by a line consisting of only:  ),
            # where the ) is indented with the beginning of __description__
            # We also prepare for the case that, when not entered by us, __description__=
            # is not followed by a ( and the whole description does not end with a single ) in its line.
            # We build on ruff making the line following the description start with same indentation
            # or 4 less (i.e., the following line is the closing of the card).
            tag_indentation = all_lines[ending_line].index("__description__")
            starts_with_parent = "__description__=(" in all_lines[ending_line]
            if starts_with_parent:
                last_line_to_start_with = (" " * tag_indentation) + r"\)"
            else:
                # actually, the line that follows the description
                last_line_to_start_with1 = (" " * tag_indentation) + "[^ ]"
                last_line_to_start_with2 = (" " * (tag_indentation - 4)) + "[^ ]"
                last_line_to_start_with = (
                    "("
                    + last_line_to_start_with1
                    + "|"
                    + last_line_to_start_with2
                    + ")"
                )
            description_end = re.compile("^" + last_line_to_start_with)
            ending_line += 1
            while ending_line < num_of_lines and not description_end.search(
                all_lines[ending_line]
            ):
                ending_line += 1
            if ending_line >= num_of_lines:
                # the description is never terminated
                raise ValueError(exhausted_message)
            if "__description__" in obj_name:
                return (
                    starting_line,
                    ending_line if starts_with_parent else ending_line - 1,
                )

            if starts_with_parent:
                # step over the line holding the closing ) of the description
                ending_line += 1
                if ending_line >= num_of_lines:
                    raise ValueError(exhausted_message)

            # we continue in card, having passed the description, ending line points
            # to the line that follows description

        num_of_opens += len(re.findall(r"[({[]", all_lines[ending_line]))
        num_of_closes += len(re.findall(r"[)}\]]", all_lines[ending_line]))
        if num_of_closes == num_of_opens:
            break

    if num_of_closes != num_of_opens:
        raise ValueError(exhausted_message)

    return (starting_line, ending_line)

1	import re	1✔
2	import shutil	1✔
3	from typing import List, Tuple	1✔
4
5	import pandas as pd	1✔
6
7	from .logging_utils import get_logger	1✔
8
9	logger = get_logger()
10
11
12	def split_words(s):	1✔
13	"""Splits a string into words based on PascalCase, camelCase, snake_case, kebab-case, and numbers attached to strings.
14
15	Args:
16	s (str): The string to be split.
17
18	Returns:
19	list: The list of words obtained after splitting the string.
20	"""
21	# Split PascalCase or camelCase
22	s = re.sub(r"([A-Z][a-z]+)", r" \1", re.sub(r"([A-Z]+)", r" \1", s)).strip()	1✔
23	# Split snake_case or kebab-case
24	s = re.sub(r"[_-]", " ", s)	1✔
25	# Split numbers attached to strings
26	s = re.sub(r"([a-zA-Z])(\d)", r"\1 \2", s)	1✔
27	s = re.sub(r"(\d)([a-zA-Z])", r"\1 \2", s)	1✔
28	# Split the string into words based on spaces
29	return s.split()	1✔
30
31
32	def is_camel_case(s):	1✔
33	"""Checks if a string is in camelCase.
34
35	Args:
36	s (str): The string to be checked.
37
38	Returns:
39	bool: True if the string is in camelCase, False otherwise.
40	"""
41	return re.match(r"^[A-Z]+([a-z0-9][A-Z][a-z0-9])$", s) is not None	1✔
42
43
44	def is_snake_case(s):	1✔
45	"""Checks if a string is in snake_case.
46
47	Args:
48	s (str): The string to be checked.
49
50	Returns:
51	bool: True if the string is in snake_case, False otherwise.
52	"""
53	return re.match(r"^[a-z0-9]+(_[a-z0-9]+)*$", s) is not None	1✔
54
55
56	def camel_to_snake_case(s):	1✔
57	"""Converts a string from camelCase to snake_case.
58
59	Args:
60	s (str): The string to be converted.
61
62	Returns:
63	str: The string converted to snake_case.
64	"""
65	# Add an underscore before every uppercase letter that is followed by a lowercase letter or digit and not preceded by an underscore, a hyphen or an uppercase letter
66	s = re.sub(r"(?<=[^A-Z_-])([A-Z])", r"_\1", s)	1✔
67
68	# Ensure there's an underscore before any uppercase letter that's followed by a lowercase letter or digit and comes after a sequence of uppercase letters
69	s = re.sub(r"([A-Z]+)([A-Z][a-z0-9])", r"\1_\2", s)	1✔
70
71	return s.lower()	1✔
72
73
74	def snake_to_camel_case(s):	1✔
75	"""Converts a snake_case string s to CamelCase. Assume a class name is in question so result to start with an upper case.
76
77	Not always the reciprocal of the above camel_to_snake_case. e.g: camel_to_snake_case(LoadHF) = load_hf,
78	whereas snake_to_camel_case(load_hf) = LoadHf
79	"""
80	s = s.strip()	1✔
81	words = s.split("_")	1✔
82	# Capitalize all words and join them
83	camel_case_parts = [word.capitalize() for word in words]	1✔
84	return "".join(camel_case_parts)	1✔
85
86
87	def to_pretty_string(	1✔
88	value,
89	indent=0,
90	indent_delta=4,
91	max_chars=None,
92	keys=None,
93	item_label=None,
94	float_format=None,
95	):
96	"""Constructs a formatted string representation of various data structures (dicts, lists, tuples, and DataFrames).
97
98	Args:
99	value: The Python data structure to be formatted.
100	indent (int, optional): The current level of indentation. Defaults to 0.
101	indent_delta (int, optional): Amount of spaces to add per indentation level. Defaults to 4.
102	max_chars (int, optional): Max characters per line before wrapping. Defaults to terminal width - 10.
103	keys (List[str], optional): For dicts, optionally specify keys and order.
104	item_label (str, optional): Internal parameter for labeling items.
105	float_format (str, optional): Format string for float values (e.g., ".2f"). Defaults to None.
106	"""
107	max_chars = max_chars or shutil.get_terminal_size()[0] - 10	1✔
108	indent_str = " " * indent	1✔
109	res = ""	1✔
110
111	if isinstance(value, dict):	1✔
112	keys_to_print = keys if keys is not None else list(value.keys())	1✔
113
114	for k in keys_to_print:	1✔
115	if k not in value:	1✔
116	raise ValueError(
117	f"Dictionary does not contain field '{k}' specified in 'keys' argument. "
118	f"The available keys are {list(value.keys())}"
119	)
120
121	for k in keys_to_print:	1✔
122	v = value[k]	1✔
123	item_header = f"{k} ({type(v).__name__})"	1✔
124	res += f"{indent_str}{item_header}:\n"	1✔
125	res += to_pretty_string(	1✔
126	v,
127	indent=indent + indent_delta,
128	indent_delta=indent_delta,
129	max_chars=max_chars,
130	float_format=float_format,
131	)
132
133	elif isinstance(value, (list, tuple)):	1✔
134	for i, v in enumerate(value):	1✔
135	label = f"[{i}]" if isinstance(value, list) else f"({i})"	1✔
136	item_header = f"{label} ({type(v).__name__})"	1✔
137	res += f"{indent_str}{item_header}:\n"	1✔
138	res += to_pretty_string(	1✔
139	v,
140	indent=indent + indent_delta,
141	indent_delta=indent_delta,
142	max_chars=max_chars,
143	float_format=float_format,
144	)
145
146	elif isinstance(value, pd.DataFrame):	1✔
147	line_width = max_chars - indent	1✔
148	options = [	1✔
149	"display.max_rows",
150	None,
151	"display.max_columns",
152	None,
153	"display.max_colwidth",
154	None,
155	"display.width",
156	line_width,
157	# 'display.colheader_justify', 'left'
158	]
159	if float_format is not None:	1✔
160	options.extend(	×
161	["display.float_format", ("{:," + float_format + "}").format]
162	)
163	with pd.option_context(*options):	1✔
164	df_str = repr(value)	1✔
165
166	lines = df_str.split("\n")	1✔
167	for line in lines:	1✔
168	if len(line) + len(indent_str) > line_width:	1✔
169	start = 0	1✔
170	while start < len(line):	1✔
171	wrap_chunk = line[start : start + line_width].rstrip()	1✔
172	res += f"{indent_str}{wrap_chunk}\n"	1✔
173	start += line_width	1✔
174	else:
175	res += f"{indent_str}{line.rstrip()}\n"	1✔
176
177	else:
178	# Handle scalar values, including floats
179	if isinstance(value, float) and float_format:	1✔
180	formatted_value = f"{value:{float_format}}"	×
181	else:
182	formatted_value = str(value)	1✔
183
184	# Wrap lines according to max_chars
185	line_width = max_chars - indent	1✔
186	lines = formatted_value.split("\n")	1✔
187	for line in lines:	1✔
188	if len(line) + len(indent_str) > line_width:	1✔
189	start = 0	1✔
190	while start < len(line):	1✔
191	wrap_chunk = line[start : start + line_width].rstrip()	1✔
192	res += f"{indent_str}{wrap_chunk}\n"	1✔
193	start += line_width	1✔
194	else:
195	res += f"{indent_str}{line.rstrip()}\n"	1✔
196
197	return res	1✔
198
199
200	def construct_dict_as_yaml_lines(d, indent_delta=2) -> List[str]:	1✔
201	"""Constructs the lines of a dictionary formatted as yaml.
202
203	Args:
204	d: The element to be formatted.
205	indent_delta (int, optional): The amount of spaces to add for each level of indentation. Defaults to 2.
206	"""
207	indent_delta_str = " " * indent_delta	1✔
208	ticked_indent_delta_str = indent_delta_str[:-2] + "- "	1✔
209	assert (	1✔
210	indent_delta >= 2
211	), f"Needs at least 2 position indentations, for the case of list elements, that are to be preceded each by ' -'. Got indent_delta={indent_delta}."
212	res = [] # computed hereunder as a list of lines, that are indented only at the end	1✔
213
214	if isinstance(d, dict):	1✔
215	if len(d) == 0:	1✔
216	return ["{}"]	×
217	for key, val in d.items():	1✔
218	printable_key = f'"{key}"' if (" " in key) or (key == "") else key	1✔
219	res.append(printable_key + ": ")	1✔
220	yaml_for_val = construct_dict_as_yaml_lines(val, indent_delta=indent_delta)	1✔
221	assert len(yaml_for_val) > 0	1✔
222	if len(yaml_for_val) == 1:	1✔
223	res[-1] += yaml_for_val[0]	1✔
224	else:
225	for line in yaml_for_val:	1✔
226	res.append(indent_delta_str + line)	1✔
227	return res	1✔
228
229	if isinstance(d, list):	1✔
230	if len(d) == 0:	1✔
231	return ["[]"]	×
232	for val in d:	1✔
233	yaml_for_val = construct_dict_as_yaml_lines(val, indent_delta=indent_delta)	1✔
234	assert len(yaml_for_val) > 0	1✔
235	res.append(ticked_indent_delta_str + yaml_for_val[0])	1✔
236	for line in yaml_for_val[1:]:	1✔
237	res.append(indent_delta_str + line)	1✔
238	return res	1✔
239
240	# d1 = re.sub(r"(\n+)", r'"\1"', str(d))
241	d1 = str(d).replace("\n", "\\n").replace('"', '\\"')	1✔
242	if "\\n" in d1 or d1 == "":	1✔
243	d1 = f'"{d1}"'	×
244	return [d1]	1✔
245
246
247	def construct_dict_as_python_lines(d, indent_delta=4) -> List[str]:	1✔
248	"""Constructs the lines of a dictionary formatted as a piece of python code.
249
250	Args:
251	d: The element to be formatted.
252	indent_delta (int, optional): The amount of spaces to add for each level of indentation. Defaults to 2.
253	"""
254	indent_delta_str = " " * indent_delta	1✔
255	res = [] # computed hereunder as a list of lines, that are indented only at the end	1✔
256
257	if isinstance(d, dict):	1✔
258	istype = False	1✔
259	if len(d) == 0:	1✔
260	return ["{}"]	×
261	if "__type__" in d:	1✔
262	istype = True	1✔
263	res = ["__type__" + d["__type__"] + "("]	1✔
264	if len(d) == 1:	1✔
265	res[0] += ")"	×
266	return res	×
267	else:
268	res = ["{"]	1✔
269	for key, val in d.items():	1✔
270	if key == "__type__":	1✔
271	continue	1✔
272	printable_key = f'"{key}"' if not istype else key	1✔
273	res.append(printable_key + ("=" if istype else ": "))	1✔
274	py_for_val = construct_dict_as_python_lines(val, indent_delta=indent_delta)	1✔
275	assert len(py_for_val) > 0	1✔
276	if len(py_for_val) == 1:	1✔
277	res[-1] += py_for_val[0] + ","	1✔
278	else:
279	res[-1] += py_for_val[0]	1✔
280	if py_for_val[0].startswith("{") or py_for_val[0].startswith("["):	1✔
281	for line in py_for_val[1:-1]:	1✔
282	res.append(indent_delta_str + line)	1✔
283	else:
284	# val is type, its inner lines are already indented
285	res.extend(py_for_val[1:-1])	1✔
286	res.append(py_for_val[-1] + ",")	1✔
287	res.append(")" if istype else "}")	1✔
288	if istype:	1✔
289	for i in range(1, len(res) - 1):	1✔
290	res[i] = indent_delta_str + res[i]	1✔
291	return res	1✔
292
293	if isinstance(d, list):	1✔
294	if len(d) == 0:	1✔
295	return ["[]"]	×
296	res = ["["]	1✔
297	for val in d:	1✔
298	py_for_val = construct_dict_as_python_lines(val, indent_delta=indent_delta)	1✔
299	assert len(py_for_val) > 0	1✔
300	for line in py_for_val[:-1]:	1✔
301	res.append(line)	1✔
302	res.append(py_for_val[-1] + ",")	1✔
303	res.append("]")	1✔
304	return res	1✔
305
306	# d1 = re.sub(r"(\n+)", r'"\1"', str(d))
307	if isinstance(d, str):	1✔
308	return [f'"{d}"']	1✔
309	if d is None or isinstance(d, (int, float, bool)):	×
310	return [f"{d}"]	×
311	raise RuntimeError(f"unrecognized value to print as python: {d}")	×
312
313
314	def print_dict(	1✔
315	d, indent=0, indent_delta=4, max_chars=None, keys_to_print=None, log_level="info"
316	):
317	dict_str = to_pretty_string(d, indent, indent_delta, max_chars, keys_to_print)	1✔
318	dict_str = "\n" + dict_str	1✔
319	getattr(logger, log_level)(dict_str)
320
321
322	def print_dict_as_yaml(d: dict, indent_delta=2) -> str:	1✔
323	yaml_lines = construct_dict_as_yaml_lines(d, indent_delta=indent_delta)	1✔
324	# yaml_lines = [re.sub(r"(\n+)", r'"\1"', line) for line in yaml_lines]
325	# yaml_lines = [line.replace("\n", "\\n") for line in yaml_lines]
326	return "\n".join(yaml_lines)	1✔
327
328
329	def print_dict_as_python(d: dict, indent_delta=4) -> str:	1✔
330	py_lines = construct_dict_as_python_lines(d, indent_delta=indent_delta)	1✔
331	assert len(py_lines) > 0	1✔
332	return "\n".join(py_lines)	1✔
333
334
335	def nested_tuple_to_string(nested_tuple: tuple) -> str:	1✔
336	"""Converts a nested tuple to a string, with elements separated by underscores.
337
338	Args:
339	nested_tuple (tuple): The nested tuple to be converted.
340
341	Returns:
342	str: The string representation of the nested tuple.
343	"""
344	result = []	×
345	for item in nested_tuple:	×
346	if isinstance(item, tuple):	×
347	result.append(nested_tuple_to_string(item))	×
348	else:
349	result.append(str(item))	×
350	return "_".join(result)	×
351
352
353	def is_made_of_sub_strings(string, sub_strings):	1✔
354	pattern = "^(" + "\|".join(map(re.escape, sub_strings)) + ")+$"	1✔
355	return bool(re.match(pattern, string))	1✔
356
357
358	# Giveמ all the lines of a card preparer file, e.g. all the lines of prepare/cards/cohere_for_ai.py,
359	# and an object name, e.g. TaskCard(,
360	# return the ordinal number of the line that starts that object, in our example: the
361	# line number of the following line (notice that the line where TaskCard is imported
362	# is not supposed to return):
363	# card = TaskCard(
364	# and the line number of the line that ends the object, in our case the line that include
365	# the matching close:
366	# )
367	# This util depends on ruff to ensure this setting of the card file: that a close of one
368	# tag and the open of the next tag, do not sit in same line, when both tags being
369	# major level within TaskCard.
370	# It also prepares for the case that __description__ tag does not contain balanced
371	# parentheses, since it is often cut in the middle, (with "... see more at")
372	# flake8: noqa: B007
373	# flake8: noqa: C901
374	def lines_defining_obj_in_card(	1✔
375	all_lines: List[str], obj_name: str, start_search_at_line: int = 0
376	) -> Tuple[int, int]:
377	for starting_line in range(start_search_at_line, len(all_lines)):	1✔
378	line = all_lines[starting_line]	1✔
379	if obj_name in line:	1✔
380	break	1✔
381	if obj_name not in line:	1✔
382	# obj_name found no where in the input lines
383	return (-1, -1)	×
384	num_of_opens = 0	1✔
385	num_of_closes = 0	1✔
386	ending_line = starting_line - 1	1✔
387	while ending_line < len(all_lines):	1✔
388	ending_line += 1	1✔
389
390	if "__description__" in all_lines[ending_line]:	1✔
391	# can not trust parentheses inside description, because this is mainly truncated
392	# free text.
393	# We do trust the indentation enforced by ruff, and the way we build __description__:
394	# a line consisting of only __description__=(
395	# followed by one or more lines of text, can not trust opens and closes
396	# in them, followed by a line consisting of only: ),
397	# where the ) is indented with the beginning of __description__
398	# We also prepare for the case that, when not entered by us, __description__=
399	# is not followed by a ( and the whole description does not end with a single ) in its line.
400	# We build on ruff making the line following the description start with same indentation
401	# or 4 less (i.e., the following line is the closing of the card).
402	tag_indentation = all_lines[ending_line].index("__description__")	1✔
403	starts_with_parent = "__description__=(" in all_lines[ending_line]	1✔
404	if starts_with_parent:	1✔
405	last_line_to_start_with = (" " * tag_indentation) + r"\)"	1✔
406	else:
407	# actually, the line that follows the description
408	last_line_to_start_with1 = (" " * tag_indentation) + "[^ ]"	1✔
409	last_line_to_start_with2 = (" " * (tag_indentation - 4)) + "[^ ]"	1✔
410	last_line_to_start_with = (	1✔
411	"("
412	+ last_line_to_start_with1
413	+ "\|"
414	+ last_line_to_start_with2
415	+ ")"
416	)
417	ending_line += 1	1✔
418	while not re.search("^" + last_line_to_start_with, all_lines[ending_line]):	1✔
419	ending_line += 1	1✔
420	if "__description__" in obj_name:	1✔
421	return (	1✔
422	starting_line,
423	ending_line if starts_with_parent else ending_line - 1,
424	)
425
426	if starts_with_parent:	1✔
427	ending_line += 1	1✔
428
429	# we conrinue in card, having passed the description, ending line points
430	# to the line that follows description
431
432	num_of_opens += len(re.findall(r"[({[]", all_lines[ending_line]))	1✔
433	num_of_closes += len(re.findall(r"[)}\]]", all_lines[ending_line]))	1✔
434	if num_of_closes == num_of_opens:	1✔
435	break	1✔
436
437	if num_of_closes != num_of_opens:	1✔
438	raise ValueError(
439	"input lines were exhausted before the matching close is found"
440	)
441
442	return (starting_line, ending_line)	1✔

IBM / unitxt / 17888446790

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous