15436196019

Committed 04 Jun 2025 07:24AM UTC coverage: 80.242% (-0.3%) from 80.504%

Build # 15436196019

Build Type

Pull #1579

github

Committed by

web-flow

Commit Message

Merge a3562c478 into 5d576f6fd

Pull Request #1579: Add Multi Turn Metrics Support

Run Details

1687 of 2079 branches covered (81.14%)

Branch coverage included in aggregate %.

10472 of 13074 relevant lines covered (80.1%)

0.8 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

85.22

src/unitxt/dialog_operators.py

"""Dialog Serializers.

Dialog serializers are the way to take dialog data and turn it into
text that can be fed to the model.

The format of the dialog is:

.. code-block:: text

    dialog = [
        {"user": "hello", "system": "hi"},
        {"user": "kkk", "system": ""},
        {"user": "kkk", "system": ""},
    ]
"""

from typing import Any, Dict, List, Optional

from .formats import SystemFormat
from .operators import FieldOperator, InstanceFieldOperator


class ToDialog(FieldOperator):
    """Flatten (question, answer) pairs into a list of role-tagged turns.

    Each pair contributes two consecutive entries: a ``"user"`` entry carrying
    the question followed by an ``"agent"`` entry carrying the answer.
    """

    def process_value(self, value: Any) -> Any:
        """Build the list of role/content turns from ``value``.

        Args:
            value: An iterable whose items each unpack into exactly two
                elements: the question first, then the answer.

        Returns:
            A list of ``{"role": ..., "content": ...}`` dicts, two per input
            pair, in the order the pairs were given.
        """
        return [
            {"role": speaker, "content": utterance}
            for question, answer in value
            for speaker, utterance in (("user", question), ("agent", answer))
        ]


class SerializeDialog(InstanceFieldOperator):
    """Serializes dialog data for feeding into a model.

    This class takes structured dialog data (a list of turns, each a dict with
    ``"user"`` and ``"system"`` keys) and converts it into a single string.
    Every turn is rendered with a template derived from the ``demo_format`` of
    a format object, where ``{source}`` is treated as the user text and
    ``{target}`` as the system text.

    Args:
        field (str):
            The field in the input data that contains the dialog.
        to_field (Optional[str]):
            The field in the output data where the serialized dialog will be stored.
        format (SystemFormat):
            Format whose ``demo_format`` supplies the per-turn template. When it
            is ``None``, the format is read from
            ``instance["recipe_metadata"]["format"]``.
        last_response_to_field (Optional[str]):
            When set, the ``"system"`` value of the last turn is stored in this
            instance field, and the template of the last turn is cut right
            after ``{user}`` so that this response is not part of the
            serialized text.
        context_field (Optional[str]):
            Field that contains additional context to be prepended to the dialog.
        context_separator (str):
            String placed between the context and the dialog. Defaults to ``" "``.
        slice_first_and_last_turns_format (bool):
            When ``True`` (the default), the template of the first turn starts
            at ``{user}`` and the template of the last turn ends at ``{system}``.
    """

    format: SystemFormat = None
    last_response_to_field: Optional[str] = None
    context_field: Optional[str] = None
    context_separator: str = " "
    slice_first_and_last_turns_format: bool = True

    def standardize_format(self, demo_format):
        """Rewrite a demo format into a turn template using dialog placeholders.

        ``{source}`` becomes ``{user}``, ``{target}`` becomes ``{system}`` and
        ``{target_prefix}`` is removed.
        """
        turn_format = demo_format.replace("{source}", "{user}")
        turn_format = turn_format.replace("{target}", "{system}")
        return turn_format.replace("{target_prefix}", "")

    def slice_first_turn(self, turn_format):
        """Drop everything that precedes ``{user}`` in the template.

        Raises:
            ValueError: If the template has no ``{user}`` placeholder.
        """
        return turn_format[turn_format.index("{user}") :]

    def slice_last_turn(self, turn_format):
        """Drop everything that follows ``{system}`` in the template.

        Raises:
            ValueError: If the template has no ``{system}`` placeholder.
        """
        return turn_format[: turn_format.index("{system}") + len("{system}")]

    def slice_last_response(self, turn_format):
        """Drop everything that follows ``{user}`` in the template.

        Raises:
            ValueError: If the template has no ``{user}`` placeholder.
        """
        return turn_format[: turn_format.index("{user}") + len("{user}")]

    def get_turn_format(self, turn_format, step, length):
        """Return the template to use for the turn at position ``step``.

        Args:
            turn_format: The general per-turn template.
            step: Zero-based index of the turn being rendered.
            length: Total number of turns in the dialog.

        Returns:
            The template, trimmed for the first and/or last turn according to
            ``slice_first_and_last_turns_format`` and ``last_response_to_field``.
        """
        if step == 0 and self.slice_first_and_last_turns_format:
            turn_format = self.slice_first_turn(turn_format)
        if step == length - 1:
            if self.slice_first_and_last_turns_format:
                turn_format = self.slice_last_turn(turn_format)
            if self.last_response_to_field is not None:
                # The last response is exported separately, so it must not
                # appear in the serialized dialog itself.
                turn_format = self.slice_last_response(turn_format)
        return turn_format

    def get_general_turn_format(self, instance):
        """Return the per-turn template for ``instance``.

        Uses ``self.format`` when it is set, otherwise the format stored under
        ``instance["recipe_metadata"]["format"]``.
        """
        general_format = (
            instance["recipe_metadata"]["format"]
            if self.format is None
            else self.format
        )
        return self.standardize_format(general_format.demo_format)

    def process_instance_value(
        self, structured_dialog: List[Dict[str, str]], instance: Dict[str, Any]
    ):
        """Serialize ``structured_dialog`` into one string.

        Args:
            structured_dialog: The turns to render; each is a dict whose keys
                fill the placeholders of the turn template.
            instance: The instance being processed. It is read for the context
                field and the recipe format, and written to when
                ``last_response_to_field`` is set.

        Returns:
            The optional context (followed by ``context_separator``) and then
            every rendered turn, concatenated in order.
        """
        pieces = (
            []
            if self.context_field is None
            else [instance[self.context_field] + self.context_separator]
        )
        general_turn_format = self.get_general_turn_format(instance)
        total_turns = len(structured_dialog)
        pieces.extend(
            self.get_turn_format(general_turn_format, step, total_turns).format(**turn)
            for step, turn in enumerate(structured_dialog)
        )
        if self.last_response_to_field is not None:
            # An empty dialog has no last turn; store an empty response rather
            # than failing on an unbound loop variable.
            instance[self.last_response_to_field] = (
                structured_dialog[-1]["system"] if structured_dialog else ""
            )
        return "".join(pieces)


class SerializeOpenAiFormatDialog(SerializeDialog):
    """Serializes dialog data given in the OpenAI format for feeding into a model.

    The input is a list of ``{"role": ..., "content": ...}`` entries with roles
    ``"user"`` and ``"assistant"``. It is validated, consecutive entries of the
    same role are merged, and the result is converted into ``{"user": ...,
    "system": ...}`` turns that are then serialized by the parent class.

    Args:
        field (str):
            The field in the input data that contains the dialog.
        to_field (Optional[str]):
            The field in the output data where the serialized dialog will be stored.
        context_field (Optional[str]):
            Field that contains additional context to be prepended to the dialog.
        is_last_turn_user_only (bool):
            Defaults to ``True``. NOTE(review): no method of this class reads
            it; confirm whether it is still needed.
    """

    is_last_turn_user_only: bool = True

    @staticmethod
    def validate_openai_dialog_format(dialog: List[Dict[str, str]]) -> None:
        """Validates that the given dialog follows the correct OpenAI format.

        The function checks that:
        1. The dialog is a non-empty list of dictionaries.
        2. Each dictionary contains the keys 'role' and 'content'.
        3. The 'role' value is either 'user' or 'assistant' (case-insensitive).
        4. Both 'role' and 'content' values are strings.
        5. The first 'role' is 'user'

        If the dialog does not conform to the expected format, a descriptive
        ValueError is raised indicating the issue.

        Args:
            dialog (List[Dict[str, str]]): The dialog to validate.

        Raises:
            ValueError: If the dialog does not meet the format requirements.
        """
        if not isinstance(dialog, list):
            raise ValueError("Dialog must be a list of dictionaries.")

        # Without this guard the first-entry check below would fail with an
        # IndexError instead of the documented ValueError.
        if not dialog:
            raise ValueError("Dialog must contain at least one entry.")

        for i, entry in enumerate(dialog):
            if not isinstance(entry, dict):
                raise ValueError(
                    f"Entry {i} is not a dictionary: {entry}. Each entry in the dialog must be a dictionary."
                )

            if "role" not in entry:
                raise ValueError(
                    f"Entry {i} is missing the 'role' key: {entry}. Each dictionary must have a 'role' key."
                )

            if "content" not in entry:
                raise ValueError(
                    f"Entry {i} is missing the 'content' key: {entry}. Each dictionary must have a 'content' key."
                )

            if not isinstance(entry["role"], str):
                raise ValueError(
                    f"Entry {i} has a non-string 'role': {entry['role']}. The 'role' value must be a string."
                )

            if not isinstance(entry["content"], str):
                raise ValueError(
                    f"Entry {i} has a non-string 'content': {entry['content']}. The 'content' value must be a string."
                )

            if entry["role"].lower() not in {"user", "assistant"}:
                raise ValueError(
                    f"Entry {i} has an invalid role: {entry['role']}. Allowed roles are 'user' and 'assistant'."
                )

        first_entry = dialog[0]
        if first_entry["role"].lower() != "user":
            raise ValueError(
                f"First entry role is expected to be 'user' It is  {first_entry['role']}."
            )

    @staticmethod
    def merge_dialog_entries(dialog: List[Dict[str, str]]) -> List[Dict[str, str]]:
        """Merges consecutive dialog entries with the same role.

        Roles are compared case-insensitively, matching the validation rules.
        The contents of merged entries are joined with a single space and the
        merged entry keeps the role of the first entry in the run. The input
        list and its dictionaries are left untouched.

        Args:
            dialog (List[Dict[str, str]]): The input dialog list where each dictionary has a
                string 'role' and a string 'content'.

        Returns:
            List[Dict[str, str]]: A new list of new dictionaries where consecutive entries
            with the same role are merged.
        """
        merged_dialog: List[Dict[str, str]] = []

        for entry in dialog:
            if (
                merged_dialog
                and entry["role"].lower() == merged_dialog[-1]["role"].lower()
            ):
                merged_dialog[-1]["content"] += " " + entry["content"]
            else:
                # Copy so that later merges never modify the caller's entries.
                merged_dialog.append(dict(entry))

        return merged_dialog

    def transform_dialog_to_standard_format(
        self, dialog: List[Dict[str, str]]
    ) -> List[Dict[str, str]]:
        """Transforms a dialog from OpenAI format to a simplified format.

        Each dictionary of the result contains 'user' and 'system' keys with
        their respective contents. Consecutive entries with the same role are
        merged first, so the merged entries alternate between user and
        assistant, starting with user.

        Args:
            dialog (List[Dict[str, str]]): The input dialog in OpenAI format.

        Returns:
            List[Dict[str, str]]: The transformed dialog.

        Raises:
            ValueError: If the dialog is not in the expected OpenAI format.
        """
        SerializeOpenAiFormatDialog.validate_openai_dialog_format(dialog)
        merged_dialog = SerializeOpenAiFormatDialog.merge_dialog_entries(dialog)

        # Pair every user entry (even positions) with the assistant entry that
        # follows it (odd positions).
        result = [
            {"user": user_entry["content"], "system": system_entry["content"]}
            for user_entry, system_entry in zip(
                merged_dialog[0::2], merged_dialog[1::2]
            )
        ]
        # A dialog that ends on a user entry is allowed: the trailing user
        # entry becomes a turn with an empty system response.
        if len(merged_dialog) % 2 != 0:
            result.append({"user": merged_dialog[-1]["content"], "system": ""})

        return result

    def process_instance_value(
        self, structured_dialog: List[Dict[str, str]], instance: Dict[str, Any]
    ):
        """Convert the OpenAI-format dialog to user/system turns and serialize it.

        Args:
            structured_dialog: The dialog in OpenAI format.
            instance: The instance being processed, passed on to the parent
                serializer.

        Returns:
            The serialized dialog produced by the parent class.
        """
        standard_format_dialog = self.transform_dialog_to_standard_format(
            structured_dialog
        )
        return super().process_instance_value(standard_format_dialog, instance)

1	"""Dialog Serializers.
2
3	Dialog serializers are the way to take dialog data and turn it into
4	text that can be fed to the model.
5
6	The format of the dialog is:
7
8	.. code-block:: text
9
10	dialog = [
11	{"user": "hello", "system": "hi"},
12	{"user": "kkk", "system": ""},
13	{"user": "kkk", "system": ""},
14	]
15	"""
16
17	from typing import Any, Dict, List, Optional	1✔
18
19	from .formats import SystemFormat	1✔
20	from .operators import FieldOperator, InstanceFieldOperator	1✔
21
22
23	class ToDialog(FieldOperator):	1✔
24	def process_value(self, value: Any) -> Any:	1✔
25	dialog = []	×
26	for question, answer in value:	×
27	dialog.append({"role": "user", "content": question})	×
28	dialog.append({"role": "agent", "content": answer})	×
29	return dialog	×
30
31
32	class SerializeDialog(InstanceFieldOperator):	1✔
33	"""Serializes dialog data for feeding into a model.
34
35	This class takes structured dialog data and converts it into a text format
36	according to a specified template. It allows for the inclusion or exclusion
37	of system responses and can operate on a per-turn basis or aggregate the entire
38	dialog.
39
40	Args:
41	field (str):
42	The field in the input data that contains the dialog.
43	to_field (Optional[str]):
44	The field in the output data where the serialized dialog will be stored.
45	last_user_turn_to_field (Optional[str]):
46	Field to store the last user turn.
47	last_system_turn_to_field (Optional[str]):
48	Field to store the last system turn.
49	context_field (Optional[str]):
50	Field that contains additional context to be prepended to the dialog.
51	"""
52
53	format: SystemFormat = None	1✔
54	last_response_to_field: Optional[str] = None	1✔
55	context_field: Optional[str] = None	1✔
56	context_separator: str = " "	1✔
57	slice_first_and_last_turns_format: bool = True	1✔
58
59	def standardize_format(self, demo_format):	1✔
60	turn_format = demo_format.replace("{source}", "{user}")	1✔
61	turn_format = turn_format.replace("{target}", "{system}")	1✔
62	return turn_format.replace("{target_prefix}", "")	1✔
63
64	def slice_first_turn(self, turn_format):	1✔
65	return turn_format[turn_format.index("{user}") :]	1✔
66
67	def slice_last_turn(self, turn_format):	1✔
68	return turn_format[: turn_format.index("{system}") + len("{system}")]	1✔
69
70	def slice_last_response(self, turn_format):	1✔
71	return turn_format[: turn_format.index("{user}") + len("{user}")]	1✔
72
73	def get_turn_format(self, turn_format, step, length):	1✔
74	if step == 0 and self.slice_first_and_last_turns_format:	1✔
75	turn_format = self.slice_first_turn(turn_format)	1✔
76	if step == length - 1:	1✔
77	if self.slice_first_and_last_turns_format:	1✔
78	turn_format = self.slice_last_turn(turn_format)	1✔
79	if self.last_response_to_field is not None:	1✔
80	turn_format = self.slice_last_response(turn_format)	1✔
81	return turn_format	1✔
82
83	def get_general_turn_format(self, instance):	1✔
84	general_format = (	1✔
85	instance["recipe_metadata"]["format"]
86	if self.format is None
87	else self.format
88	)
89	return self.standardize_format(general_format.demo_format)	1✔
90
91	def process_instance_value(	1✔
92	self, structured_dialog: List[Dict[str, str]], instance: Dict[str, Any]
93	):
94	dialog = (	1✔
95	""
96	if self.context_field is None
97	else instance[self.context_field] + self.context_separator
98	)
99	general_turn_format = self.get_general_turn_format(instance)	1✔
100	for i, turn in enumerate(structured_dialog):	1✔
101	turn_format = self.get_turn_format(	1✔
102	general_turn_format, i, len(structured_dialog)
103	)
104	dialog += turn_format.format(**turn)	1✔
105	if self.last_response_to_field is not None:	1✔
106	instance[self.last_response_to_field] = turn["system"]	1✔
107	return dialog	1✔
108
109
110	class SerializeOpenAiFormatDialog(SerializeDialog):	1✔
111	"""Serializes dialog data for feeding into a model.
112
113	This class takes structured dialog data in the OpenAi format, and converts it into a text format
114	according to a specified template. It allows for the inclusion or exclusion
115	of system responses and can operate on a per-turn basis or aggregate the entire
116	dialog.
117
118	Args:
119	field (str):
120	The field in the input data that contains the dialog.
121	to_field (Optional[str]):
122	The field in the output data where the serialized dialog will be stored.
123	last_user_turn_to_field (Optional[str]):
124	Field to store the last user turn.
125	last_system_turn_to_field (Optional[str]):
126	Field to store the last system turn.
127	context_field (Optional[str]):
128	Field that contains additional context to be prepended to the dialog.
129	"""
130
131	is_last_turn_user_only: bool = True	1✔
132
133	@staticmethod	1✔
134	def validate_openai_dialog_format(dialog: List[Dict[str, str]]) -> None:	1✔
135	"""Validates that the given dialog follows the correct OpenAI format.
136
137	The function checks that:
138	1. The dialog is a list of dictionaries.
139	2. Each dictionary contains the keys 'role' and 'content'.
140	3. The 'role' value is either 'user' or 'assistant'.
141	4. Both 'role' and 'content' values are strings.
142	5. The first 'role' is 'user'
143
144	If the dialog does not conform to the expected format, a descriptive
145	ValueError is raised indicating the issue.
146
147	Args:
148	dialog (List[Dict[str, str]]): The dialog to validate.
149
150	Raises:
151	ValueError: If the dialog does not meet the format requirements.
152	"""
153	if not isinstance(dialog, list):	1✔
154	raise ValueError("Dialog must be a list of dictionaries.")	×
155
156	for i, entry in enumerate(dialog):	1✔
157	if not isinstance(entry, dict):	1✔
158	raise ValueError(	×
159	f"Entry {i} is not a dictionary: {entry}. Each entry in the dialog must be a dictionary."
160	)
161
162	if "role" not in entry:	1✔
163	raise ValueError(	×
164	f"Entry {i} is missing the 'role' key: {entry}. Each dictionary must have a 'role' key."
165	)
166
167	if "content" not in entry:	1✔
168	raise ValueError(	×
169	f"Entry {i} is missing the 'content' key: {entry}. Each dictionary must have a 'content' key."
170	)
171
172	if not isinstance(entry["role"], str):	1✔
173	raise ValueError(	×
174	f"Entry {i} has a non-string 'role': {entry['role']}. The 'role' value must be a string."
175	)
176
177	if not isinstance(entry["content"], str):	1✔
178	raise ValueError(	×
179	f"Entry {i} has a non-string 'content': {entry['content']}. The 'content' value must be a string."
180	)
181
182	if entry["role"].lower() not in {"user", "assistant"}:	1✔
183	raise ValueError(	1✔
184	f"Entry {i} has an invalid role: {entry['role']}. Allowed roles are 'user' and 'assistant'."
185	)
186
187	first_entry = dialog[0]	1✔
188	if first_entry["role"].lower() != "user":	1✔
189	raise ValueError(	1✔
190	f"First entry role is expected to be 'user' It is {first_entry['role']}."
191	)
192
193	@staticmethod	1✔
194	def merge_dialog_entries(dialog: List[Dict[str, str]]) -> List[Dict[str, str]]:	1✔
195	"""Merges consecutive dialog entries with the same role.
196
197	Args:
198	dialog (List[Dict[str, str]]): The input dialog list where each dictionary has a 'role' and 'content'.
199
200	Returns:
201	List[Dict[str, str]]: A new list where consecutive entries with the same role are merged.
202	"""
203	if len(dialog) == 0:	1✔
204	return []	×
205
206	merged_dialog = [dialog[0]]	1✔
207
208	for entry in dialog[1:]:	1✔
209	if entry["role"] == merged_dialog[-1]["role"]:	1✔
210	merged_dialog[-1]["content"] += " " + entry["content"]	1✔
211	else:
212	merged_dialog.append(entry)	1✔
213
214	return merged_dialog	1✔
215
216	def transform_dialog_to_standard_format(	1✔
217	self, dialog: List[Dict[str, str]]
218	) -> List[Dict[str, str]]:
219	"""Transforms a dialog from OpenAI format to a simplified format.
220
221	Each dictionary
222	contains 'user' and 'system' keys with their respective contents. Consecutive entries
223	with the same role are merged. Entries with invalid roles raise an error.
224
225	Args:
226	dialog (List[Dict[str, str]]): The input dialog in OpenAI format.
227
228	Returns:
229	List[Dict[str, str]]: The transformed dialog.
230
231	Raises:
232	ValueError: If an invalid role is detected.
233	"""
234	SerializeOpenAiFormatDialog.validate_openai_dialog_format(dialog)	1✔
235	merged_dialog = SerializeOpenAiFormatDialog.merge_dialog_entries(dialog)	1✔
236	# self.validate_dialog_have_complete_pairs(merged_dialog)
237
238	result = []	1✔
239	for i in range(0, len(merged_dialog) - 1, 2):	1✔
240	user_entry = merged_dialog[i]	1✔
241	system_entry = merged_dialog[i + 1]	1✔
242
243	result.append(	1✔
244	{"user": user_entry["content"], "system": system_entry["content"]}
245	)
246	if len(merged_dialog) % 2 != 0:	1✔
247	user_entry = merged_dialog[-1]	×
248	result.append({"user": user_entry["content"], "system": ""})	×
249
250	return result	1✔
251
252	def process_instance_value(	1✔
253	self, structured_dialog: List[Dict[str, str]], instance: Dict[str, Any]
254	):
255	standard_format_dialog = self.transform_dialog_to_standard_format(	×
256	structured_dialog
257	)
258	return super().process_instance_value(standard_format_dialog, instance)	×

IBM / unitxt / 15436196019

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous