• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

IBM / unitxt / 14991088734

13 May 2025 07:46AM UTC coverage: 79.866% (-0.2%) from 80.026%
14991088734

Pull #1770

github

web-flow
Merge dbeb7ef70 into 94ce2b0a4
Pull Request #1770: Improvements to tool calling - NONE BACKWARD COMPATIBLE CHANGES

1648 of 2053 branches covered (80.27%)

Branch coverage included in aggregate %.

10268 of 12867 relevant lines covered (79.8%)

0.8 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

81.22
src/unitxt/struct_data_operators.py
1
"""This section describes unitxt operators for structured data.
2

3
These operators are specialized in handling structured data like tables.
4
For tables, expected input format is:
5

6
.. code-block:: text
7

8
    {
9
        "header": ["col1", "col2"],
10
        "rows": [["row11", "row12"], ["row21", "row22"], ["row31", "row32"]]
11
    }
12

13
For triples, expected input format is:
14

15
.. code-block:: text
16

17
    [[ "subject1", "relation1", "object1" ], [ "subject1", "relation2", "object2"]]
18

19
For key-value pairs, expected input format is:
20

21
.. code-block:: text
22

23
    {"key1": "value1", "key2": value2, "key3": "value3"}
24
"""
25

26
import json
1✔
27
import random
1✔
28
from abc import ABC, abstractmethod
1✔
29
from typing import (
1✔
30
    Any,
31
    Dict,
32
    List,
33
    Optional,
34
    Tuple,
35
)
36

37
import pandas as pd
1✔
38

39
from .augmentors import TypeDependentAugmentor
1✔
40
from .dict_utils import dict_get
1✔
41
from .error_utils import UnitxtWarning
1✔
42
from .operators import FieldOperator, InstanceOperator
1✔
43
from .random_utils import new_random_generator
1✔
44
from .serializers import ImageSerializer, TableSerializer
1✔
45
from .type_utils import isoftype
1✔
46
from .types import Table, ToolCall
1✔
47
from .utils import recursive_copy
1✔
48

49

50
def shuffle_columns(table: Table, seed=0) -> Table:
    """Permute the columns of ``table`` in place.

    One permutation of the column positions is drawn from the generator
    returned by ``new_random_generator`` (which is handed the table and the
    seed) and then applied to the header and to every row.

    Args:
        table: table mapping; missing "header"/"rows" are treated as empty.
        seed: value passed along with the table to ``new_random_generator``.

    Returns:
        The same ``table`` object with "header" and "rows" replaced.
    """
    column_names = table.get("header", [])
    body = table.get("rows", [])

    # Draw the permutation before the table is modified.
    rng = new_random_generator({"table": table, "seed": seed})
    order = list(range(len(column_names)))
    rng.shuffle(order)

    table["header"] = [column_names[pos] for pos in order]
    table["rows"] = [[record[pos] for pos in order] for record in body]
    return table
67

68

69
def shuffle_rows(table: Table, seed=0) -> Table:
    """Shuffle the rows of ``table`` in place.

    The row list is shuffled by the generator returned from
    ``new_random_generator``, which is handed the table and the seed.

    Args:
        table: table mapping; a missing "rows" entry is treated as empty.
        seed: value passed along with the table to ``new_random_generator``.

    Returns:
        The same ``table`` object, with its "rows" entry set to the shuffled list.
    """
    body = table.get("rows", [])
    new_random_generator({"table": table, "seed": seed}).shuffle(body)
    # Re-assign so that a table without "rows" ends up with an explicit empty list.
    table["rows"] = body
    return table
78

79

80
class SerializeTable(ABC, TableSerializer):
    """Abstract base for serializers that flatten a table into a single value.

    The output format is decided by the concrete subclass through
    ``serialize_table``. Optional column/row shuffling is applied to a copy of
    the input before it is handed to the subclass.
    """

    # seed forwarded to the shuffle helpers
    seed: int = 0
    # when True, rows are shuffled before serialization
    shuffle_rows: bool = False
    # when True, columns are shuffled before serialization
    shuffle_columns: bool = False

    def serialize(self, value: Table, instance: Dict[str, Any]) -> str:
        """Copy ``value``, apply the configured shuffles and serialize the result."""
        # Work on a copy: the shuffle helpers modify the table they receive.
        working_table = recursive_copy(value)

        if self.shuffle_columns:
            working_table = shuffle_columns(table=working_table, seed=self.seed)
        if self.shuffle_rows:
            working_table = shuffle_rows(table=working_table, seed=self.seed)

        return self.serialize_table(working_table)

    @abstractmethod
    def serialize_table(self, table_content: Dict) -> str:
        """Serialize a table; must be implemented by concrete subclasses."""

    def process_header(self, header: List):
        """Hook for serializing the header; this base version returns None."""
        return None

    def process_row(self, row: List, row_index: int):
        """Hook for serializing one row; this base version returns None."""
        return None
112

113

114
# Concrete classes implementing table serializers
115
class SerializeTableAsIndexedRowMajor(SerializeTable):
    """Indexed Row Major Table Serializer.

    Commonly used row major serialization format.
    Format:  col : col1 | col2 | col 3 row 1 : val1 | val2 | val3 | val4 row 2 : val1 | ...
    """

    def serialize_table(self, table_content: Dict) -> str:
        """Serialize a table given in the prescribed header/rows format.

        Args:
            table_content: mapping with non-empty "header" and "rows" entries.

        Returns:
            The header segment followed by one segment per row (indexed from
            1), separated by single spaces, with outer whitespace stripped.

        Raises:
            AssertionError: if the header or the rows are missing or empty.
        """
        header = table_content.get("header", [])
        rows = table_content.get("rows", [])

        assert header and rows, "Incorrect input table format"

        segments = [self.process_header(header)]
        segments.extend(
            self.process_row(row, row_index=i) for i, row in enumerate(rows, start=1)
        )
        return " ".join(segments).strip()

    def process_header(self, header: List):
        """Return the column names joined by ' | ' after the 'col : ' prefix."""
        # Column names are stringified so that non-string names (e.g. ints)
        # do not make str.join raise TypeError, mirroring how cells are handled.
        return "col : " + " | ".join(str(col) for col in header)

    def process_row(self, row: List, row_index: int):
        """Return 'row <index> : ' followed by the cell values joined by ' | '."""
        cells = " | ".join(str(value) for value in row)
        return f"row {row_index} : {cells}"
154

155

156
class SerializeTableAsMarkdown(SerializeTable):
    """Markdown Table Serializer.

    Markdown table format is used in GitHub code primarily.
    Format:

    .. code-block:: text

        |col1|col2|col3|
        |---|---|---|
        |A|4|1|
        |I|2|1|
        ...

    """

    def serialize_table(self, table_content: Dict) -> str:
        """Serialize a table given in the prescribed header/rows format.

        Args:
            table_content: mapping with non-empty "header" and "rows" entries.

        Returns:
            The header lines followed by one line per row, with outer
            whitespace (including the final newline) stripped.

        Raises:
            AssertionError: if the header or the rows are missing or empty.
        """
        header = table_content.get("header", [])
        rows = table_content.get("rows", [])

        assert header and rows, "Incorrect input table format"

        lines = [self.process_header(header)]
        lines.extend(
            self.process_row(row, row_index=i) for i, row in enumerate(rows, start=1)
        )
        return "".join(lines).strip()

    def process_header(self, header: List):
        """Return the header line and the '---' separator line, each newline-terminated."""
        # Column names are stringified so that non-string names (e.g. ints)
        # do not make str.join raise TypeError, mirroring how cells are handled.
        names_line = "|{}|\n".format("|".join(str(col) for col in header))
        separator_line = "|{}|\n".format("|".join(["---"] * len(header)))
        return names_line + separator_line

    def process_row(self, row: List, row_index: int):
        """Return one newline-terminated markdown line; ``row_index`` is unused."""
        return "|{}|\n".format("|".join(str(cell) for cell in row))
202

203

204
class SerializeTableAsDFLoader(SerializeTable):
    """DFLoader Table Serializer.

    Renders the table as a pandas ``DataFrame`` constructor snippet.
    Format(Sample):

    .. code-block:: python

        pd.DataFrame({
            "name" : ["Alex", "Diana", "Donald"],
            "age" : [26, 34, 39]
        },
        index=[0,1,2])
    """

    def serialize_table(self, table_content: Dict) -> str:
        """Serialize a table given in the prescribed header/rows format.

        Raises:
            AssertionError: if the header or the rows are missing or empty.
        """
        header = table_content.get("header", [])
        rows = table_content.get("rows", [])

        assert header and rows, "Incorrect input table format"

        # Repeated column names get a "_<k>" suffix, where k is the number of
        # earlier occurrences; the first occurrence keeps its plain name.
        unique_header = []
        for position, col in enumerate(header):
            earlier = header[:position].count(col)
            unique_header.append(f"{col}_{earlier}" if earlier > 0 else col)

        frame = pd.DataFrame(rows, columns=unique_header)
        columns_as_lists = frame.to_dict(orient="list")

        # Drop the outer braces of the JSON object; they are re-added as part
        # of the surrounding "pd.DataFrame({...})" text.
        inner_json = json.dumps(columns_as_lists)[1:-1]
        index_repr = str(list(range(len(rows))))

        return "".join(["pd.DataFrame({\n", inner_json, "},\nindex=", index_repr, ")"])
247

248

249
class SerializeTableAsJson(SerializeTable):
    """JSON Table Serializer.

    Emits one JSON object per row, keyed by the row position.
    Format(Sample):

    .. code-block:: json

        {
            "0":{"name":"Alex","age":26},
            "1":{"name":"Diana","age":34},
            "2":{"name":"Donald","age":39}
        }
    """

    def serialize_table(self, table_content: Dict) -> str:
        """Serialize a table given in the prescribed header/rows format.

        Raises:
            AssertionError: if the header or the rows are missing or empty.
        """
        header = table_content.get("header", [])
        rows = table_content.get("rows", [])

        assert header and rows, "Incorrect input table format"

        # Map each row position to a {column name: cell} dictionary.
        records = {
            row_number: {header[col_number]: cell for col_number, cell in enumerate(row)}
            for row_number, row in enumerate(rows)
        }
        return json.dumps(records)
280

281

282
class SerializeTableAsHTML(SerializeTable):
    """HTML Table Serializer.

    HTML table format used for rendering tables in web pages.
    Format(Sample):

    .. code-block:: html

        <table>
            <thead>
                <tr><th>name</th><th>age</th><th>sex</th></tr>
            </thead>
            <tbody>
                <tr><td>Alice</td><td>26</td><td>F</td></tr>
                <tr><td>Raj</td><td>34</td><td>M</td></tr>
            </tbody>
        </table>
    """

    def serialize_table(self, table_content: Dict) -> str:
        """Serialize a table given in the prescribed header/rows format.

        Raises:
            AssertionError: if the header or the rows are missing or empty.
        """
        header = table_content.get("header", [])
        rows = table_content.get("rows", [])

        assert header and rows, "Incorrect input table format"

        pieces = [
            "<table>\n",
            self.process_header(header),
            "\n",
            self.process_rows(rows),
            "\n",
            "</table>",
        ]
        return "".join(pieces).strip()

    def process_header(self, header: List) -> str:
        """Return the <thead> section with one <th> per column name."""
        header_cells = "".join(f"<th>{col}</th>" for col in header)
        return "  <thead>\n    <tr>" + header_cells + "</tr>\n  </thead>"

    def process_rows(self, rows: List[List]) -> str:
        """Return the <tbody> section with one <tr> per row and one <td> per cell."""
        body_lines = []
        for row in rows:
            row_cells = "".join(f"<td>{cell}</td>" for cell in row)
            body_lines.append("\n    <tr>" + row_cells + "</tr>")
        return "  <tbody>" + "".join(body_lines) + "\n  </tbody>"
336

337

338
class SerializeTableAsConcatenation(SerializeTable):
    """Concat Serializer.

    Joins the header and then every row into one space-separated string.
    Format(Sample):
    name age Alex 26 Diana 34
    """

    def serialize_table(self, table_content: Dict) -> str:
        """Serialize a table given in the prescribed header/rows format.

        Raises:
            KeyError: if "header" or "rows" is absent from ``table_content``.
            AssertionError: if the header or the rows are empty.
        """
        header = table_content["header"]
        rows = table_content["rows"]

        assert header and rows, "Incorrect input table format"

        # Header first, then the rows in order; each part is itself space-joined.
        parts = [" ".join([str(name) for name in header])]
        parts.extend(" ".join([str(cell) for cell in row]) for row in rows)

        return " ".join(parts).strip()
362

363

364
class SerializeTableAsImage(SerializeTable):
    """Image Table Serializer.

    Draws the table with matplotlib, saves it as a PNG into an in-memory
    buffer and passes the resulting PIL image to ``ImageSerializer``.
    """

    _requirements_list = ["matplotlib", "pillow"]

    def serialize_table(self, table_content: Dict) -> str:
        """Not supported for this serializer; ``serialize`` does all the work."""
        raise NotImplementedError()

    def serialize(self, value: Table, instance: Dict[str, Any]) -> str:
        """Render ``value`` as a PNG image and serialize it via ``ImageSerializer``.

        Args:
            value: table in the prescribed header/rows format.
            instance: the instance being processed; forwarded to ``ImageSerializer``.

        Returns:
            Whatever ``ImageSerializer().serialize`` returns for the rendered image.

        Raises:
            AssertionError: if the header or the rows are missing or empty.
        """
        table_content = recursive_copy(value)
        if self.shuffle_columns:
            table_content = shuffle_columns(table=table_content, seed=self.seed)

        if self.shuffle_rows:
            table_content = shuffle_rows(table=table_content, seed=self.seed)

        # Optional dependencies are imported lazily (see _requirements_list).
        import io

        import matplotlib.pyplot as plt
        import pandas as pd
        from PIL import Image

        header = table_content.get("header", [])
        rows = table_content.get("rows", [])

        assert header and rows, "Incorrect input table format"

        # Fix duplicate columns, ensuring the first occurrence has no suffix
        header = [
            f"{col}_{header[:i].count(col)}" if header[:i].count(col) > 0 else col
            for i, col in enumerate(header)
        ]

        df = pd.DataFrame(rows, columns=header)

        # The suffixing above can itself produce a clash (e.g. "a", "a_1", "a"),
        # so any name still duplicated gets its position appended.
        duplicated_mask = df.columns.duplicated()
        df.columns = [
            f"{col}_{i}" if duplicated_mask[i] else col
            for i, col in enumerate(df.columns)
        ]

        # NOTE: this changes the process-wide matplotlib font setting.
        plt.rcParams["font.family"] = "Serif"
        fig, ax = plt.subplots(figsize=(len(header) * 1.5, len(rows) * 0.5))
        buf = io.BytesIO()
        try:
            ax.axis("off")  # Turn off the axes

            table = pd.plotting.table(ax, df, loc="center", cellLoc="center")
            table.auto_set_column_width(col=range(len(df.columns)))
            table.scale(1.5, 1.5)

            # Save the plot to the in-memory buffer
            plt.savefig(buf, format="png", bbox_inches="tight", dpi=150)
        finally:
            # Always release the figure, even when drawing or saving fails.
            plt.close(fig)
        buf.seek(0)

        # Load the image from the buffer using PIL
        image = Image.open(buf)
        return ImageSerializer().serialize({"image": image, "format": "png"}, instance)
423

424

425
# truncate cell value to maximum allowed length
426
def truncate_cell(cell_value, max_len):
    """Return the truncated form of a cell, or None when nothing is to be changed.

    Args:
        cell_value: the cell content.
        max_len: maximum number of characters to keep.

    Returns:
        ``cell_value[:max_len]`` when ``cell_value`` is a string that is not
        blank and is longer than ``max_len``; otherwise None, which callers
        interpret as "keep the original value".
    """
    # Only strings can be truncated. None and numbers always yielded None;
    # other non-string values (lists, dicts, ...) used to raise AttributeError
    # on .strip() and are now reported as "nothing to truncate" as well.
    if not isinstance(cell_value, str):
        return None

    # Blank cells are left untouched, whatever their length.
    if cell_value.strip() == "":
        return None

    if len(cell_value) > max_len:
        return cell_value[:max_len]

    return None
440

441

442
class TruncateTableCells(InstanceOperator):
    """Limit the maximum length of cell values in a table to reduce the overall length.

    For tasks whose answer is a cell value, every answer equal to a truncated
    cell is replaced by the truncated text so that it still matches the table.

    Args:
        max_length (int): maximum allowed length of cell values.
        table (str): field holding the table.
        text_output (str, optional): field holding the list of answers.
    """

    max_length: int = 15
    table: str = None
    text_output: Optional[str] = None

    def process(
        self, instance: Dict[str, Any], stream_name: Optional[str] = None
    ) -> Dict[str, Any]:
        """Truncate the table cells (and matching answers) of ``instance`` in place."""
        table = dict_get(instance, self.table)
        answers = (
            [] if self.text_output is None else dict_get(instance, self.text_output)
        )
        self.truncate_table(table_content=table, answers=answers)
        return instance

    def truncate_table(self, table_content: Dict, answers: Optional[List]):
        """Shorten over-long cells in place and mirror the change in ``answers``."""
        replacements = {}

        for row in table_content.get("rows", []):
            for column, original in enumerate(row):
                shortened = truncate_cell(original, self.max_length)
                if shortened is None:
                    # None means the cell is kept as it is.
                    continue
                replacements[original] = shortened
                row[column] = shortened

        if answers is None:
            return

        # Answers equal to a truncated cell get the truncated text.
        for position, answer in enumerate(answers):
            answers[position] = replacements.get(answer, answer)
485

486

487
class TruncateTableRows(FieldOperator):
    """Limits table rows to specified limit by removing excess rows via random selection.

    The rows to drop are chosen with the module-level ``random.sample``; the
    surviving rows keep their original relative order.

    Args:
        rows_to_keep (int): number of rows to keep.
    """

    rows_to_keep: int = 10

    def process_value(self, table: Any) -> Any:
        """Return ``table`` with at most ``rows_to_keep`` rows."""
        return self.truncate_table_rows(table_content=table)

    def truncate_table_rows(self, table_content: Dict):
        """Drop randomly chosen rows from ``table_content`` in place.

        Returns:
            The same ``table_content`` object; unchanged when it already has
            ``rows_to_keep`` rows or fewer.
        """
        rows = table_content.get("rows", [])
        num_rows = len(rows)

        # Nothing to do when the table is already small enough.
        if num_rows <= self.rows_to_keep:
            return table_content

        rows_to_delete = num_rows - self.rows_to_keep

        # A set keeps the membership test below O(1) per row.
        deleted_rows_indices = set(random.sample(range(num_rows), rows_to_delete))

        table_content["rows"] = [
            row for i, row in enumerate(rows) if i not in deleted_rows_indices
        ]

        return table_content
521

522

523
class GetNumOfTableCells(FieldOperator):
    """Get the number of cells in the given table.

    The count is the number of rows multiplied by the number of header
    columns; a missing (or None) "rows"/"header" entry counts as empty.
    """

    def process_value(self, table: Any) -> Any:
        """Return ``len(rows) * len(header)`` for ``table``."""
        # "or []" covers an absent key and an explicit None, both of which
        # would otherwise make len() raise TypeError.
        num_of_rows = len(table.get("rows") or [])
        num_of_cols = len(table.get("header") or [])
        return num_of_rows * num_of_cols
530

531

532
class SerializeTableRowAsText(InstanceOperator):
    """Serializes a table row as text.

    Each field contributes "<field> is <value>, " to the output, so the
    result ends with a trailing ", ".

    Args:
        fields (List[str]) - list of fields to be included in serialization.
        to_field (str) - serialized text field name.
        max_cell_length (int) - limits cell length to be considered, optional.
    """

    # The fields are iterated one by one, hence a list of names (not a single str).
    fields: List[str]
    to_field: str
    max_cell_length: Optional[int] = None

    def process(
        self, instance: Dict[str, Any], stream_name: Optional[str] = None
    ) -> Dict[str, Any]:
        """Write the serialized row into ``instance[self.to_field]`` and return ``instance``."""
        parts = []
        for field in self.fields:
            value = dict_get(instance, field)
            if self.max_cell_length is not None:
                # truncate_cell returns None when the value is to be kept as is.
                truncated_value = truncate_cell(value, self.max_cell_length)
                if truncated_value is not None:
                    value = truncated_value

            parts.append(f"{field} is {value}, ")

        instance[self.to_field] = "".join(parts)
        return instance
560

561

562
class SerializeTableRowAsList(InstanceOperator):
    """Serializes a table row as list.

    Each field contributes "<field>: <value>, " to the output, so the
    result ends with a trailing ", ".

    Args:
        fields (List[str]) - list of fields to be included in serialization.
        to_field (str) - serialized text field name.
        max_cell_length (int) - limits cell length to be considered, optional.
    """

    # The fields are iterated one by one, hence a list of names (not a single str).
    fields: List[str]
    to_field: str
    max_cell_length: Optional[int] = None

    def process(
        self, instance: Dict[str, Any], stream_name: Optional[str] = None
    ) -> Dict[str, Any]:
        """Write the serialized row into ``instance[self.to_field]`` and return ``instance``."""
        parts = []
        for field in self.fields:
            value = dict_get(instance, field)
            if self.max_cell_length is not None:
                # truncate_cell returns None when the value is to be kept as is.
                truncated_value = truncate_cell(value, self.max_cell_length)
                if truncated_value is not None:
                    value = truncated_value

            parts.append(f"{field}: {value}, ")

        instance[self.to_field] = "".join(parts)
        return instance
590

591

592
class SerializeTriples(FieldOperator):
    """Serializes triples into a flat sequence.

    Each triple becomes "subject : relation : object" with the relation
    lower-cased; triples are joined by " | ".

    Sample input in expected format:
    [[ "First Clearing", "LOCATION", "On NYS 52 1 Mi. Youngsville" ], [ "On NYS 52 1 Mi. Youngsville", "CITY_OR_TOWN", "Callicoon, New York"]]

    Sample output:
    First Clearing : location : On NYS 52 1 Mi. Youngsville | On NYS 52 1 Mi. Youngsville : city_or_town : Callicoon, New York

    """

    def process_value(self, tripleset: Any) -> Any:
        """Return the flat string form of ``tripleset``."""
        return self.serialize_triples(tripleset)

    def serialize_triples(self, tripleset) -> str:
        """Join the rendered triples with ' | '."""
        rendered = []
        for subj, rel, obj in tripleset:
            rendered.append(f"{subj} : {rel.lower()} : {obj}")
        return " | ".join(rendered)
610

611

612
class SerializeKeyValPairs(FieldOperator):
    """Serializes key, value pairs into a flat sequence.

    Sample input in expected format: {"name": "Alex", "age": 31, "sex": "M"}
    Sample output: name is Alex, age is 31, sex is M
    """

    def process_value(self, kvpairs: Any) -> Any:
        """Return the flat string form of ``kvpairs``."""
        return self.serialize_kvpairs(kvpairs)

    def serialize_kvpairs(self, kvpairs) -> str:
        """Render every pair as '<key> is <value>' and join the pairs with ', '."""
        # An empty mapping yields an empty string.
        return ", ".join(f"{key} is {value}" for key, value in kvpairs.items())
629

630

631
class ListToKeyValPairs(InstanceOperator):
    """Maps list of keys and values into key:value pairs.

    ``fields[0]`` names the field holding the keys and ``fields[1]`` the field
    holding the values; the resulting dictionary is stored in ``to_field``.

    Sample input in expected format: {"keys": ["name", "age", "sex"], "values": ["Alex", 31, "M"]}
    Sample output: {"name": "Alex", "age": 31, "sex": "M"}
    """

    fields: List[str]
    to_field: str

    def process(
        self, instance: Dict[str, Any], stream_name: Optional[str] = None
    ) -> Dict[str, Any]:
        """Pair keys with values positionally and store the mapping in ``to_field``."""
        keys_field, values_field = self.fields[0], self.fields[1]
        keylist = dict_get(instance, keys_field)
        valuelist = dict_get(instance, values_field)

        # zip stops at the shorter list; a repeated key keeps its last value.
        instance[self.to_field] = dict(zip(keylist, valuelist))
        return instance
654

655

656
class ConvertTableColNamesToSequential(FieldOperator):
    """Replaces actual table column names with static sequential names like col_0, col_1,...

    .. code-block:: text

        Sample input:
        {
            "header": ["name", "age"],
            "rows": [["Alex", 21], ["Donald", 34]]
        }

        Sample output:
        {
            "header": ["col_0", "col_1"],
            "rows": [["Alex", 21], ["Donald", 34]]
        }
    """

    def process_value(self, table: Any) -> Any:
        """Return a copy of ``table`` whose header holds sequential names."""
        # Copy first so that the input table is not modified.
        table_input = recursive_copy(table)
        return self.replace_header(table_content=table_input)

    def replace_header(self, table_content: Dict) -> Dict:
        """Overwrite the header of ``table_content`` in place and return the table.

        Raises:
            AssertionError: if the header is missing or empty.
        """
        header = table_content.get("header", [])

        assert header, "Input table missing header"

        table_content["header"] = [f"col_{i}" for i in range(len(header))]

        return table_content
689

690

691
class ShuffleTableRows(TypeDependentAugmentor):
    """Shuffles the input table rows randomly.

    .. code-block:: text

        Sample Input:
        {
            "header": ["name", "age"],
            "rows": [["Alex", 26], ["Raj", 34], ["Donald", 39]],
        }

        Sample Output:
        {
            "header": ["name", "age"],
            "rows": [["Donald", 39], ["Raj", 34], ["Alex", 26]],
        }
    """

    augmented_type = Table
    seed = 0

    def process_value(self, table: Any) -> Any:
        """Return a row-shuffled copy of ``table``."""
        # shuffle_rows works in place, so it is given a copy of the input.
        return shuffle_rows(recursive_copy(table), self.seed)
715

716

717
class ShuffleTableColumns(TypeDependentAugmentor):
    """Shuffles the table columns randomly.

    .. code-block:: text

        Sample Input:
            {
                "header": ["name", "age"],
                "rows": [["Alex", 26], ["Raj", 34], ["Donald", 39]],
            }

        Sample Output:
            {
                "header": ["age", "name"],
                "rows": [[26, "Alex"], [34, "Raj"], [39, "Donald"]],
            }
    """

    augmented_type = Table
    seed = 0

    def process_value(self, table: Any) -> Any:
        """Return a column-shuffled copy of ``table``."""
        # shuffle_columns works in place, so it is given a copy of the input.
        return shuffle_columns(recursive_copy(table), self.seed)
741

742

743
class LoadJson(FieldOperator):
    """Parse the field value as JSON.

    With ``allow_failure`` set, a ``json.JSONDecodeError`` is swallowed and
    ``failure_value`` is returned instead; otherwise the error propagates.
    """

    # value returned when parsing fails and allow_failure is True
    failure_value: Any = None
    # when True, JSON decode errors yield failure_value instead of raising
    allow_failure: bool = False

    def process_value(self, value: str) -> Any:
        """Return the parsed JSON value of ``value``."""
        if not self.allow_failure:
            return json.loads(value, strict=False)

        # NOTE(review): this path parses in strict mode, unlike the one above,
        # so control characters inside strings count as a failure here.
        # Confirm whether the asymmetry is intended.
        try:
            return json.loads(value)
        except json.JSONDecodeError:
            return self.failure_value
755

756

757
class ToolCallPostProcessor(FieldOperator):
    """Parse a model output string into a single ``ToolCall``.

    The value is parsed as JSON. A list holding exactly one tool call is
    unwrapped; a list with any other number of items, or a value that is not
    a ``ToolCall``, yields ``failure_value``. JSON decode errors yield
    ``failure_value`` only when ``allow_failure`` is set; otherwise they
    propagate.
    """

    # value returned when the output cannot be turned into one tool call
    failure_value: Any = None
    # when True, JSON decode errors yield failure_value instead of raising
    allow_failure: bool = False

    def process_value(self, value: str) -> ToolCall:
        """Return the tool call encoded in ``value`` or ``failure_value``."""
        if self.allow_failure:
            try:
                result = json.loads(value)
            except json.JSONDecodeError:
                return self.failure_value
        else:
            result = json.loads(value, strict=False)

        if isoftype(result, List[ToolCall]):
            if len(result) > 1:
                UnitxtWarning(f"More than one tool returned from model: {result}")
                return self.failure_value
            if not result:
                # An empty list has no tool call to unwrap; without this
                # guard result[0] would raise IndexError.
                return self.failure_value
            return result[0]

        if not isoftype(result, ToolCall):
            return self.failure_value
        return result
776

777
class DumpJson(FieldOperator):
    """Encode the field value as a JSON string using ``json.dumps``."""

    def process_value(self, value: str) -> str:
        """Return the JSON text for ``value``."""
        encoded = json.dumps(value)
        return encoded
780

781

782
class MapHTMLTableToJSON(FieldOperator):
    """Converts HTML table format to the basic one (JSON).

    JSON format:

    .. code-block:: json

        {
            "header": ["col1", "col2"],
            "rows": [["row11", "row12"], ["row21", "row22"], ["row31", "row32"]]
        }
    """

    _requirements_list = ["bs4"]

    def process_value(self, table: Any) -> Any:
        """Return the header/rows dictionary for the HTML ``table``."""
        return self.convert_to_json(table_content=table)

    def convert_to_json(self, table_content: str) -> Dict:
        """Parse ``table_content`` with BeautifulSoup and extract header and rows.

        The header is taken from the <th> cells of the first <thead>, the rows
        from the <td> cells of each <tr> in the first <tbody>.
        """
        from bs4 import BeautifulSoup

        soup = BeautifulSoup(table_content, "html.parser")

        # Column names
        header = [th.get_text() for th in soup.find("thead").find_all("th")]

        # One list of cell texts per table row
        rows = [
            [td.get_text() for td in tr.find_all("td")]
            for tr in soup.find("tbody").find_all("tr")
        ]

        return {"header": header, "rows": rows}
822

823

824
class MapTableListsToStdTableJSON(FieldOperator):
    """Converts lists table format to the basic one (JSON).

    The first element of the input becomes the header, the remaining
    elements become the rows.

    JSON format:

    .. code-block:: json

        {
            "header": ["col1", "col2"],
            "rows": [["row11", "row12"], ["row21", "row22"], ["row31", "row32"]]
        }
    """

    def process_value(self, table: Any) -> Any:
        """Return the header/rows dictionary for ``table``."""
        return self.map_tablelists_to_stdtablejson_util(table_content=table)

    def map_tablelists_to_stdtablejson_util(self, table_content: str) -> Dict:
        """Split ``table_content`` into its first element and the rest."""
        # NOTE(review): annotated as str, but presumably a list of row lists
        # is expected here - confirm against callers.
        first_entry = table_content[0]
        remaining_entries = table_content[1:]
        return {"header": first_entry, "rows": remaining_entries}
842

843

844
class ConstructTableFromRowsCols(InstanceOperator):
    """Maps column and row field into single table field encompassing both header and rows.

    field[0] = header string as List
    field[1] = rows string as List[List]
    field[2] = table caption string(optional)

    The header and rows strings are parsed with ``ast.literal_eval``.
    """

    fields: List[str]
    to_field: str

    def process(
        self, instance: Dict[str, Any], stream_name: Optional[str] = None
    ) -> Dict[str, Any]:
        """Build the table dictionary and store it in ``instance[self.to_field]``."""
        header_text = dict_get(instance, self.fields[0])
        rows_text = dict_get(instance, self.fields[1])

        # The caption is optional and only read when a third field is configured.
        caption = instance[self.fields[2]] if len(self.fields) >= 3 else None

        import ast

        table = {
            "header": ast.literal_eval(header_text),
            "rows": ast.literal_eval(rows_text),
        }
        if caption is not None:
            table["caption"] = caption

        instance[self.to_field] = table
        return instance
879

880

881
class TransposeTable(TypeDependentAugmentor):
    """Swap the rows and columns of a table.

    Every original column becomes a row whose first cell is the column name, and the
    new header is a blank cell followed by the original row numbers as strings.

    .. code-block:: text

        Sample Input:
            {
                "header": ["name", "age", "sex"],
                "rows": [["Alice", 26, "F"], ["Raj", 34, "M"], ["Donald", 39, "M"]],
            }

        Sample Output:
            {
                "header": [" ", "0", "1", "2"],
                "rows": [["name", "Alice", "Raj", "Donald"], ["age", 26, 34, 39], ["sex", "F", "M", "M"]],
            }

    """

    augmented_type = Table

    def process_value(self, table: Any) -> Any:
        return self.transpose_table(table)

    def transpose_table(self, table: Dict) -> Dict:
        """Return a new table dict holding the transposed header and rows."""
        source_header = table["header"]
        source_rows = table["rows"]

        # New header: a blank corner cell, then one label per original row.
        new_header = [" ", *map(str, range(len(source_rows)))]

        # New rows: one per original column, led by that column's name.
        new_rows = []
        for col_idx, col_name in enumerate(source_header):
            column_cells = [record[col_idx] for record in source_rows]
            new_rows.append([col_name, *column_cells])

        return {"header": new_header, "rows": new_rows}
1✔
917

918

919
class DuplicateTableRows(TypeDependentAugmentor):
    """Repeats selected rows of a table a given number of times.

    Args:
        row_indices (List[int]): positions of the rows to be repeated

        times(int): how many times each selected row appears in the output
    """

    augmented_type = Table

    row_indices: List[int] = []
    times: int = 1

    def process_value(self, table: Any) -> Any:
        header = table["header"]
        source_rows = table["rows"]

        expanded_rows = []
        for position, record in enumerate(source_rows):
            # Selected rows appear `times` times, every other row exactly once.
            copies = self.times if position in self.row_indices else 1
            expanded_rows.extend([record] * copies)

        return {"header": header, "rows": expanded_rows}
1✔
950

951

952
class DuplicateTableColumns(TypeDependentAugmentor):
    """Repeats selected columns of a table a given number of times.

    Args:
        column_indices (List[int]): positions of the columns to be repeated

        times(int): how many times each selected column appears in the output
    """

    augmented_type = Table

    column_indices: List[int] = []
    times: int = 1

    def process_value(self, table: Any) -> Any:
        header = table["header"]
        rows = table["rows"]

        def widen(cells):
            # Cells at a selected position appear `times` times, all others once.
            widened = []
            for position, cell in enumerate(cells):
                copies = self.times if position in self.column_indices else 1
                widened.extend([cell] * copies)
            return widened

        # The same expansion is applied to the header and to every row so that
        # column names and values stay aligned.
        return {"header": widen(header), "rows": [widen(row) for row in rows]}
1✔
992

993

994
class InsertEmptyTableRows(TypeDependentAugmentor):
    """Inserts empty rows in a table randomly for the given number of times.

    Each empty row holds one empty string per header column and is placed at a
    position drawn with the module-level ``random`` generator.

    Args:
        times(int) - how many times to insert
    """

    augmented_type = Table

    times: int = 0

    def process_value(self, table: Any) -> Any:
        header = table["header"]
        # Work on a shallow copy so the row list of the input table is not
        # modified in place by the inserts below.
        rows = list(table["rows"])
        width = len(header)

        for _ in range(self.times):
            # randint is inclusive on both ends, so the empty row may also be
            # placed after the last existing row.
            insert_pos = random.randint(0, len(rows))
            # A fresh list per insert keeps the empty rows independent objects.
            rows.insert(insert_pos, [""] * width)

        return {"header": header, "rows": rows}
1✔
1022

1023

1024
class MaskColumnsNames(TypeDependentAugmentor):
    """Replace the column names of a table with the dummies "Col1", "Col2" etc.

    The rows are passed through as they are; only the header is replaced.
    """

    augmented_type = Table

    def process_value(self, table: Any) -> Any:
        column_count = len(table["header"])
        # Dummy names are numbered from 1, one per original column.
        dummy_names = [f"Col{number}" for number in range(1, column_count + 1)]
        return {"header": dummy_names, "rows": table["rows"]}
×
1033

1034

1035
class ShuffleColumnsNames(TypeDependentAugmentor):
    """Shuffle table columns names to be displayed in random order.

    Only the header is reordered; the rows are returned as they are, so after the
    shuffle a column name is no longer guaranteed to sit above its original values.
    """

    augmented_type = Table

    def process_value(self, table: Any) -> Any:
        # random.shuffle reorders its argument in place, so shuffle a copy to
        # leave the header list of the input table untouched.
        shuffled_header = list(table["header"])
        random.shuffle(shuffled_header)

        return {"header": shuffled_header, "rows": table["rows"]}
×
1045

1046

1047
class JsonStrToDict(FieldOperator):
    """Convert a JSON string representing key-value pairs into a dictionary.

    Keys and values of the result are converted to strings, and entries whose value
    is None are dropped. If the text cannot be parsed as JSON, or does not parse to
    a dictionary with string keys, a UnitxtWarning is created and an empty
    dictionary is returned.
    """

    def process_value(self, text: str) -> Dict[str, str]:
        try:
            dict_value = json.loads(text)
        except Exception as e:
            # Deliberately best-effort: any parsing failure falls back to an empty dict.
            UnitxtWarning(
                f"Unable to convert input text to json format in JsonStrToDict due to {e}. Text: {text}"
            )
            dict_value = {}
        # Valid JSON that is not an object (e.g. a list or a number) is rejected too.
        if not isoftype(dict_value, Dict[str, Any]):
            UnitxtWarning(
                f"Unable to convert input text to dictionary in JsonStrToDict. Text: {text}"
            )
            dict_value = {}
        return {str(k): str(v) for k, v in dict_value.items() if v is not None}
1✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc