• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

IBM / unitxt / 12765217246

14 Jan 2025 09:58AM UTC coverage: 79.393% (+0.02%) from 79.372%
12765217246

push

github

web-flow
Add Tables Understanding Benchmark (#1506)

* init commit bench

Signed-off-by: ShirApp <shirashury@gmail.com>

* merge updates

* Make tables benchmark

Signed-off-by: elronbandel <elronbandel@gmail.com>

* modify prompts (instruction once)

* modify prompts (instruction once) in generation template

* change llm as judge metric for scigen (Yifan's code)

* updated recipes

* add table augmenter

* update table benchmark files

* delete some files from branch

* fix typo of augmeter list in benchmark code + update recipes to include loader limit

* fix typos

* drop personal scripts

* create updated json cards (tab fact+turl)

* updated cards (tab fact+turl)

* add tablebench visualization json file

* delete old file

* update df serializer test

* drop table bench visualization since it is not a part of the benchmark, and we are not sure about its evaluation metric

---------

Signed-off-by: ShirApp <shirashury@gmail.com>
Signed-off-by: elronbandel <elronbandel@gmail.com>
Co-authored-by: elronbandel <elronbandel@gmail.com>

1387 of 1735 branches covered (79.94%)

Branch coverage included in aggregate %.

8742 of 11023 relevant lines covered (79.31%)

0.79 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

95.5
src/unitxt/fusion.py
1
from abc import abstractmethod
1✔
2
from typing import Dict, Generator, List, Optional, Union
1✔
3

4
from .dataclass import NonPositionalField
1✔
5
from .operator import SourceOperator
1✔
6
from .random_utils import new_random_generator
1✔
7
from .stream import DynamicStream, MultiStream
1✔
8
from .type_utils import isoftype
1✔
9

10

11
class BaseFusion(SourceOperator):
    """Abstract source operator that merges several multistreams into one.

    Subclasses decide how instances are interleaved by implementing
    ``fusion_generator``.

    Args:
        subsets: the SourceOperator objects to fuse, given either as a dict
          keyed by subset name or as a plain list. Each one is called with no
          arguments to obtain the MultiStream it contributes.
        include_splits: names of the splits to take from the input
          multistreams. If None, all splits are included.
    """

    subsets: Union[List[SourceOperator], Dict[str, SourceOperator]]
    include_splits: Optional[List[str]] = NonPositionalField(default=None)

    @abstractmethod
    def fusion_generator(self, split) -> Generator:
        """Yield the fused instances of the given split."""
        pass

    def prepare_subsets(self):
        """Run every subset operator and store its output in ``self.named_subsets``.

        List subsets are keyed by their position, dict subsets by their name.

        Raises:
            RuntimeError: if running a named (dict) subset fails; the original
                exception is chained as the cause.
        """
        is_named = isoftype(self.subsets, Dict[str, SourceOperator])
        assert is_named or isoftype(self.subsets, List[SourceOperator])

        self.named_subsets = {}
        if isinstance(self.subsets, list):
            for position, subset in enumerate(self.subsets):
                self.named_subsets[position] = subset()
            return

        for name, origin in self.subsets.items():
            try:
                self.named_subsets[name] = origin()
            except Exception as e:
                raise RuntimeError(f"Exception in subset: {name}") from e

    def splits(self) -> List[str]:
        """Return the split names found across all subsets, in first-seen order.

        The subsets are (re)prepared first. When ``include_splits`` is set,
        only the names listed there are kept.
        """
        self.prepare_subsets()
        collected = []
        for multi_stream in self.named_subsets.values():
            for split_name in multi_stream.keys():
                if split_name in collected:
                    continue
                if (
                    self.include_splits is not None
                    and split_name not in self.include_splits
                ):
                    continue
                collected.append(split_name)
        return collected

    def process(
        self,
    ) -> MultiStream:
        """Build a MultiStream with one lazily generated stream per split."""
        return MultiStream(
            {
                split: DynamicStream(
                    self.fusion_generator, gen_kwargs={"split": split}
                )
                for split in self.splits()
            }
        )
62

63

64
class FixedFusion(BaseFusion):
    """FixedFusion operator that combines multiple multistreams into one, limiting the number of instances taken from each split of each input multistream.

    Subsets are consumed one after the other, in the order they were given.

    Args:
        subsets: Dict of named SourceOperator objects (each to yield a MultiStream), or a list thereof
        include_splits: List of splits (stream_names) to include, over all input multistreams. If None, all splits are included.
        max_instances_per_subset: Number of instances to take from each input split of each input multistream.
            If None, all instances of each split (that is specified in include_splits) are included in the result.

    """

    max_instances_per_subset: Optional[int] = None

    def prepare(self):
        super().prepare()

    # flake8: noqa: C901
    def fusion_generator(self, split) -> Generator:
        """Yield the instances of ``split`` from each subset in turn.

        Subsets that do not contain ``split`` are skipped. For subsets given by
        name (dict form), the name is inserted at the front of the instance's
        "subset" list, creating that list when it is missing.

        Raises:
            RuntimeError: if reading or tagging an instance of a subset fails;
                the original exception is chained as the cause.
        """
        limit = self.max_instances_per_subset
        for origin_name, origin in self.named_subsets.items():
            if split not in origin:
                continue
            # A non-positive limit yields nothing, so do not open the stream.
            if limit is not None and limit <= 0:
                continue
            try:
                for emitted, instance in enumerate(origin[split], start=1):
                    if isinstance(origin_name, str):
                        if "subset" not in instance:
                            instance["subset"] = []
                        instance["subset"].insert(0, origin_name)
                    yield instance
                    # Stop right after the last allowed instance, so that no
                    # extra instance is pulled from the underlying stream.
                    if limit is not None and emitted >= limit:
                        break
            except Exception as e:
                raise RuntimeError(f"Exception in subset: {origin_name}") from e
101

102

103
class WeightedFusion(BaseFusion):
    """Fusion operator that combines multiple MultiStream-s by weighted random sampling.

    Each instance is drawn from a subset chosen at random, in proportion to the
    subset's weight, among the subsets that still have instances left.

    Args:
        subsets: Dict of named SourceOperator objects (each to yield a MultiStream), or a list thereof
        weights: Dict of named weights for each origin, or a list thereof
        max_total_samples: Total number of instances to return per returned split.
            If None, all instances are returned
    """

    subsets: Union[Dict[str, SourceOperator], List[SourceOperator]] = None
    weights: Union[Dict[str, Union[float, int]], List[Union[int, float]]] = None
    max_total_samples: Optional[int] = None

    def verify(self):
        """Check that subsets and weights are both given, typed, and of matching shape."""
        super().verify()
        assert self.subsets is not None, "subsets must be specified"
        assert self.weights is not None, "weights must be specified"
        assert len(self.subsets) == len(
            self.weights
        ), "subsets and weights must have the same length"
        assert isoftype(self.subsets, Dict[str, SourceOperator]) or isoftype(
            self.subsets, List[SourceOperator]
        )
        assert isoftype(self.weights, Dict[str, Union[int, float]]) or isoftype(
            self.weights, List[Union[int, float]]
        )
        # Both must be dicts or both must be lists.
        assert isinstance(self.subsets, dict) == isinstance(self.weights, dict)

    def prepare(self):
        """Normalize the weights to floats, keyed like ``named_subsets``."""
        super().prepare()
        self.named_weights = (
            {i: float(weight) for i, weight in enumerate(self.weights)}
            if isinstance(self.weights, list)
            else {k: float(v) for (k, v) in self.weights.items()}
        )

    def fusion_generator(self, split) -> Generator:
        """Yield instances of ``split``, sampling the source subset by weight.

        Subsets that do not contain ``split`` take no part in the sampling.
        A subset is dropped from the sampling once it is exhausted. Generation
        ends when ``max_total_samples`` instances were yielded or when no
        subset has instances left. For subsets given by name (dict form), the
        name is inserted at the front of the instance's "subset" list,
        creating that list when it is missing.

        Raises:
            RuntimeError: if reading or tagging an instance of a subset fails;
                the original exception is chained as the cause.
        """
        # splits() reports the union of the split names of all subsets, so a
        # given subset may lack this split; indexing it would fail.
        iterators = {
            named_origin: iter(origin[split])
            for named_origin, origin in self.named_subsets.items()
            if split in origin
        }
        total_examples = 0
        random_generator = new_random_generator(sub_seed="weighted_fusion_" + split)
        while (
            self.max_total_samples is None or total_examples < self.max_total_samples
        ) and len(iterators) > 0:
            population = list(iterators.keys())
            origin_name = random_generator.choices(
                population=population,
                weights=[self.named_weights[name] for name in population],
            )[0]
            iterator = iterators[origin_name]
            try:
                instance = next(iterator)
                if isinstance(origin_name, str):
                    if "subset" not in instance:
                        instance["subset"] = []
                    instance["subset"].insert(0, origin_name)
                total_examples += 1
                yield instance

            except StopIteration:
                # This subset is exhausted; keep sampling from the others.
                iterators.pop(origin_name)
            except Exception as e:
                raise RuntimeError(f"Exception in subset: {origin_name}") from e
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc