• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

IBM / unitxt / 15436196019

04 Jun 2025 07:24AM UTC coverage: 80.242% (-0.3%) from 80.504%
15436196019

Pull #1579

github

web-flow
Merge a3562c478 into 5d576f6fd
Pull Request #1579: Add Multi Turn Metrics Support

1687 of 2079 branches covered (81.14%)

Branch coverage included in aggregate %.

10472 of 13074 relevant lines covered (80.1%)

0.8 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

85.22
src/unitxt/dialog_operators.py
1
"""Dialog Serializers.
2

3
Dialog serializers are the way to take dialog data and turn it into
4
text that can be fed to the model.
5

6
The format of the dialog is:
7

8
.. code-block:: text
9

10
    dialog = [
11
        {"user": "hello", "system": "hi"},
12
        {"user": "kkk", "system": ""},
13
        {"user": "kkk", "system": ""},
14
    ]
15
"""
16

17
from typing import Any, Dict, List, Optional
1✔
18

19
from .formats import SystemFormat
1✔
20
from .operators import FieldOperator, InstanceFieldOperator
1✔
21

22

23
class ToDialog(FieldOperator):
    """Turn a sequence of (question, answer) pairs into a role/content dialog.

    Every pair contributes two consecutive entries to the output: a ``"user"``
    entry carrying the question, followed by an ``"agent"`` entry carrying the
    answer.
    """

    def process_value(self, value: Any) -> Any:
        """Flatten ``value`` into a list of ``{"role": ..., "content": ...}`` dicts.

        Args:
            value: An iterable whose items each unpack into exactly two
                elements, ``(question, answer)``.

        Returns:
            A list with two entries per input pair, in input order.
        """
        return [
            entry
            for question, answer in value
            for entry in (
                {"role": "user", "content": question},
                {"role": "agent", "content": answer},
            )
        ]
30

31

32
class SerializeDialog(InstanceFieldOperator):
    """Serializes dialog data for feeding into a model.

    The dialog is a list of turns, each a dictionary with ``"user"`` and
    ``"system"`` keys. Every turn is rendered with a per-turn template derived
    from the ``demo_format`` of a format object (``{source}`` becomes
    ``{user}``, ``{target}`` becomes ``{system}`` and ``{target_prefix}`` is
    dropped), and the rendered turns are concatenated into one string.

    Args:
        field (str):
            The field in the input data that contains the dialog.
            NOTE(review): not declared in this class; presumably provided by
            the base operator -- confirm.
        to_field (Optional[str]):
            The field in the output data where the serialized dialog will be
            stored. NOTE(review): same caveat as ``field``.
        format (SystemFormat):
            Format object whose ``demo_format`` is used as the turn template.
            When None, ``instance["recipe_metadata"]["format"]`` is used.
        last_response_to_field (Optional[str]):
            When set, the ``"system"`` value of the last turn is stored in this
            instance field, and the last turn's template is cut right after
            ``{user}`` so that response is left out of the serialized text.
        context_field (Optional[str]):
            Instance field whose value is prepended to the dialog, followed by
            ``context_separator``.
        context_separator (str):
            Text placed between the context and the dialog. Defaults to ``" "``.
        slice_first_and_last_turns_format (bool):
            When True, the first turn's template starts at ``{user}`` and the
            last turn's template ends right after ``{system}``.
    """

    format: SystemFormat = None
    last_response_to_field: Optional[str] = None
    context_field: Optional[str] = None
    context_separator: str = " "
    slice_first_and_last_turns_format: bool = True

    def standardize_format(self, demo_format):
        """Rewrite a demo format string to use ``{user}``/``{system}`` placeholders."""
        turn_format = demo_format.replace("{source}", "{user}")
        turn_format = turn_format.replace("{target}", "{system}")
        return turn_format.replace("{target_prefix}", "")

    def slice_first_turn(self, turn_format):
        """Drop everything that precedes ``{user}``.

        Raises:
            ValueError: If ``{user}`` does not occur in ``turn_format``.
        """
        return turn_format[turn_format.index("{user}") :]

    def slice_last_turn(self, turn_format):
        """Drop everything that follows ``{system}``.

        Raises:
            ValueError: If ``{system}`` does not occur in ``turn_format``.
        """
        return turn_format[: turn_format.index("{system}") + len("{system}")]

    def slice_last_response(self, turn_format):
        """Drop everything that follows ``{user}``, removing the system part.

        Raises:
            ValueError: If ``{user}`` does not occur in ``turn_format``.
        """
        return turn_format[: turn_format.index("{user}") + len("{user}")]

    def get_turn_format(self, turn_format, step, length):
        """Return the template to use for turn number ``step`` out of ``length``.

        The first and last turns may be trimmed, depending on
        ``slice_first_and_last_turns_format`` and ``last_response_to_field``.
        A single-turn dialog is both first and last, so both trims can apply.
        """
        if step == 0 and self.slice_first_and_last_turns_format:
            turn_format = self.slice_first_turn(turn_format)
        if step == length - 1:
            if self.slice_first_and_last_turns_format:
                turn_format = self.slice_last_turn(turn_format)
            if self.last_response_to_field is not None:
                # The last response is exported to its own field, so it must
                # not also appear in the serialized dialog.
                turn_format = self.slice_last_response(turn_format)
        return turn_format

    def get_general_turn_format(self, instance):
        """Return the untrimmed per-turn template for ``instance``.

        Uses ``self.format`` when set, otherwise the format object found under
        ``instance["recipe_metadata"]["format"]``.
        """
        general_format = (
            instance["recipe_metadata"]["format"]
            if self.format is None
            else self.format
        )
        return self.standardize_format(general_format.demo_format)

    def process_instance_value(
        self, structured_dialog: List[Dict[str, str]], instance: Dict[str, Any]
    ):
        """Serialize ``structured_dialog`` into a single string.

        Args:
            structured_dialog: The turns to serialize; each turn supplies the
                values for the placeholders of the turn template.
            instance: The instance being processed. It is read for the optional
                context and format, and it is updated in place with the last
                system response when ``last_response_to_field`` is set.

        Returns:
            The optional context followed by all rendered turns.
        """
        parts = []
        if self.context_field is not None:
            parts.append(instance[self.context_field] + self.context_separator)

        general_turn_format = self.get_general_turn_format(instance)
        num_turns = len(structured_dialog)
        for i, turn in enumerate(structured_dialog):
            turn_format = self.get_turn_format(general_turn_format, i, num_turns)
            parts.append(turn_format.format(**turn))

        if self.last_response_to_field is not None:
            # An empty dialog has no last response. Store an empty string
            # instead of failing on a loop variable that was never bound.
            instance[self.last_response_to_field] = (
                structured_dialog[-1]["system"] if structured_dialog else ""
            )
        return "".join(parts)
108

109

110
class SerializeOpenAiFormatDialog(SerializeDialog):
    """Serializes a dialog given in the OpenAI format for feeding into a model.

    The input is a list of ``{"role": ..., "content": ...}`` entries. It is
    validated, consecutive entries with the same role are merged, and the
    result is converted into ``{"user": ..., "system": ...}`` turns. Those turns
    are then serialized by ``SerializeDialog.process_instance_value``.

    Args:
        field (str):
            The field in the input data that contains the dialog.
            NOTE(review): not declared in this class; presumably provided by
            the base operator -- confirm.
        to_field (Optional[str]):
            The field in the output data where the serialized dialog will be
            stored. NOTE(review): same caveat as ``field``.
        is_last_turn_user_only (bool):
            Defaults to True. NOTE(review): no method of this class reads it --
            confirm where (or whether) it is used.

    The serialization options declared on ``SerializeDialog`` (``format``,
    ``last_response_to_field``, ``context_field``, ``context_separator`` and
    ``slice_first_and_last_turns_format``) apply here as well.
    """

    is_last_turn_user_only: bool = True

    @staticmethod
    def validate_openai_dialog_format(dialog: List[Dict[str, str]]) -> None:
        """Validates that the given dialog follows the correct OpenAI format.

        The function checks that:
        1. The dialog is a non-empty list of dictionaries.
        2. Each dictionary contains the keys 'role' and 'content'.
        3. The 'role' value is either 'user' or 'assistant' (case-insensitive).
        4. Both 'role' and 'content' values are strings.
        5. The first 'role' is 'user'

        If the dialog does not conform to the expected format, a descriptive
        ValueError is raised indicating the issue.

        Args:
            dialog (List[Dict[str, str]]): The dialog to validate.

        Raises:
            ValueError: If the dialog does not meet the format requirements.
        """
        if not isinstance(dialog, list):
            raise ValueError("Dialog must be a list of dictionaries.")

        # Without this guard the first-entry check below would fail with an
        # IndexError instead of the documented ValueError.
        if not dialog:
            raise ValueError("Dialog must contain at least one entry.")

        allowed_roles = {"user", "assistant"}
        for i, entry in enumerate(dialog):
            if not isinstance(entry, dict):
                raise ValueError(
                    f"Entry {i} is not a dictionary: {entry}. Each entry in the dialog must be a dictionary."
                )

            if "role" not in entry:
                raise ValueError(
                    f"Entry {i} is missing the 'role' key: {entry}. Each dictionary must have a 'role' key."
                )

            if "content" not in entry:
                raise ValueError(
                    f"Entry {i} is missing the 'content' key: {entry}. Each dictionary must have a 'content' key."
                )

            if not isinstance(entry["role"], str):
                raise ValueError(
                    f"Entry {i} has a non-string 'role': {entry['role']}. The 'role' value must be a string."
                )

            if not isinstance(entry["content"], str):
                raise ValueError(
                    f"Entry {i} has a non-string 'content': {entry['content']}. The 'content' value must be a string."
                )

            if entry["role"].lower() not in allowed_roles:
                raise ValueError(
                    f"Entry {i} has an invalid role: {entry['role']}. Allowed roles are 'user' and 'assistant'."
                )

        first_entry = dialog[0]
        if first_entry["role"].lower() != "user":
            raise ValueError(
                f"First entry role is expected to be 'user' It is  {first_entry['role']}."
            )

    @staticmethod
    def merge_dialog_entries(dialog: List[Dict[str, str]]) -> List[Dict[str, str]]:
        """Merges consecutive dialog entries with the same role.

        Roles are compared case-insensitively, matching the validation rules.
        The contents of merged entries are joined with a single space, and the
        merged entry keeps the role spelling of the first entry in its run.

        Args:
            dialog (List[Dict[str, str]]): The input dialog list where each dictionary has a 'role' and 'content'.

        Returns:
            List[Dict[str, str]]: A new list where consecutive entries with the same role are merged.
            The input list and its dictionaries are left unmodified.
        """
        merged_dialog: List[Dict[str, str]] = []
        for entry in dialog:
            if (
                merged_dialog
                and entry["role"].lower() == merged_dialog[-1]["role"].lower()
            ):
                merged_dialog[-1]["content"] += " " + entry["content"]
            else:
                # Copy so that later merges never write into the caller's data.
                merged_dialog.append(dict(entry))
        return merged_dialog

    def transform_dialog_to_standard_format(
        self, dialog: List[Dict[str, str]]
    ) -> List[Dict[str, str]]:
        """Transforms a dialog from OpenAI format to a simplified format.

        Each dictionary of the result contains 'user' and 'system' keys with
        their respective contents. Consecutive entries with the same role are
        merged. Entries with invalid roles raise an error.

        Args:
            dialog (List[Dict[str, str]]): The input dialog in OpenAI format.

        Returns:
            List[Dict[str, str]]: The transformed dialog.

        Raises:
            ValueError: If the dialog is not in the expected format.
        """
        SerializeOpenAiFormatDialog.validate_openai_dialog_format(dialog)
        merged_dialog = SerializeOpenAiFormatDialog.merge_dialog_entries(dialog)

        # After validation and merging the roles alternate, starting with the
        # user: even positions are user entries, odd positions are responses.
        result = [
            {"user": user_entry["content"], "system": system_entry["content"]}
            for user_entry, system_entry in zip(
                merged_dialog[0::2], merged_dialog[1::2]
            )
        ]
        if len(merged_dialog) % 2 != 0:
            # A trailing user entry without a response is allowed; it becomes
            # a turn with an empty system response.
            result.append({"user": merged_dialog[-1]["content"], "system": ""})

        return result

    def process_instance_value(
        self, structured_dialog: List[Dict[str, str]], instance: Dict[str, Any]
    ):
        """Convert the OpenAI-format dialog to user/system turns and serialize it.

        Args:
            structured_dialog: The dialog in OpenAI format.
            instance: The instance being processed, passed on to the parent
                implementation.

        Returns:
            The serialized dialog produced by the parent implementation.

        Raises:
            ValueError: If the dialog is not in the expected format.
        """
        standard_format_dialog = self.transform_dialog_to_standard_format(
            structured_dialog
        )
        return super().process_instance_value(standard_format_dialog, instance)
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc