• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

IBM / unitxt / 15021238866

14 May 2025 12:54PM UTC coverage: 79.766% (-0.2%) from 79.995%
15021238866

push

github

web-flow
Allow running benchmarks and recipes in cli (#1785)

* Allow running benchmarks and recipes in cli

Signed-off-by: elronbandel <elronbandel@gmail.com>

* remove duplicated tests

Signed-off-by: Yotam Perlitz <y.perlitz@ibm.com>

* Add support for max_train_instances/max_validation_instances/max_test_instances in benchmark

Signed-off-by: elronbandel <elronbandel@gmail.com>

* add example scripts

Signed-off-by: Yotam Perlitz <y.perlitz@ibm.com>

---------

Signed-off-by: elronbandel <elronbandel@gmail.com>
Signed-off-by: Yotam Perlitz <y.perlitz@ibm.com>
Co-authored-by: Yotam Perlitz <perlitz@gmail.com>
Co-authored-by: Yotam Perlitz <y.perlitz@ibm.com>

1660 of 2067 branches covered (80.31%)

Branch coverage included in aggregate %.

10281 of 12903 relevant lines covered (79.68%)

0.8 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

89.61
src/unitxt/benchmark.py
1
from abc import abstractmethod
1✔
2
from typing import Dict, List, Optional, Union
1✔
3

4
from .dataclass import NonPositionalField
1✔
5
from .formats import Format
1✔
6
from .fusion import FixedFusion
1✔
7
from .operator import SourceOperator
1✔
8
from .standard import DatasetRecipe
1✔
9
from .stream import MultiStream
1✔
10
from .system_prompts import SystemPrompt
1✔
11

12

13
class BaseBenchmark(SourceOperator):
1✔
14
    format: Format = NonPositionalField(default=None)
1✔
15
    num_demos: int = NonPositionalField(default=None)
1✔
16
    system_prompt: SystemPrompt = NonPositionalField(default=None)
1✔
17
    loader_limit: int = NonPositionalField(default=None)
1✔
18
    splits: List[str] = NonPositionalField(
1✔
19
        default_factory=lambda: ["train", "validation", "test"]
20
    )
21
    subset: Optional[str] = NonPositionalField(default=None)
1✔
22

23
    @abstractmethod
1✔
24
    def reset(self):
1✔
25
        pass
×
26

27

28
class Benchmark(BaseBenchmark):
1✔
29
    subsets: Dict[str, Union[DatasetRecipe, BaseBenchmark]]
1✔
30

31
    max_total_samples: int = None
1✔
32
    max_samples_per_subset: int = None
1✔
33
    max_train_instances: int = None
1✔
34
    max_validation_instances: int = None
1✔
35
    max_test_instances: int = None
1✔
36

37
    def verify(self):
1✔
38
        super().verify()
1✔
39
        if (
1✔
40
            self.max_total_samples is not None
41
            and self.max_samples_per_subset is not None
42
        ):
43
            raise ValueError("Set either max_total_samples or max_samples_per_subset")
×
44

45
    def prepare_args(self):
1✔
46
        self.subsets = dict(self.subsets)
1✔
47

48
    def reset(self):
1✔
49
        if (
1✔
50
            self.format is not None
51
            or self.num_demos is not None
52
            or self.system_prompt is not None
53
            or self.loader_limit is not None
54
        ):
55
            for subset in self.subsets.values():
1✔
56
                if self.num_demos is not None:
1✔
57
                    subset.num_demos = self.num_demos
×
58
                if self.format is not None:
1✔
59
                    subset.format = self.format
1✔
60
                if self.system_prompt is not None:
1✔
61
                    subset.system_prompt = self.system_prompt
×
62
                if self.loader_limit is not None:
1✔
63
                    subset.loader_limit = self.loader_limit
1✔
64

65
                subset.reset()
1✔
66

67
    def prepare(self):
1✔
68
        super().prepare()
1✔
69

70
        self.reset()
1✔
71

72
    def process(
1✔
73
        self,
74
    ) -> MultiStream:
75
        if self.subset is not None:
1✔
76
            subsets = {self.subset: self.subsets[self.subset]}
×
77
        else:
78
            subsets = self.subsets
1✔
79

80
        max_instances_per_split = {}
1✔
81
        if self.max_train_instances is not None:
1✔
82
            max_instances_per_split["train"] = self.max_train_instances
×
83
        if self.max_validation_instances is not None:
1✔
84
            max_instances_per_split["validation"] = self.max_validation_instances
×
85
        if self.max_test_instances is not None:
1✔
86
            max_instances_per_split["test"] = self.max_test_instances
1✔
87
        if len(max_instances_per_split) == 0:
1✔
88
            max_instances_per_split = None
1✔
89

90
        if self.max_total_samples is None:
1✔
91
            operator = FixedFusion(
1✔
92
                subsets=subsets,
93
                max_instances_per_subset=self.max_samples_per_subset,
94
                max_instances_per_split=max_instances_per_split,
95
                include_splits=self.splits,
96
            )
97
        else:
98
            raise NotImplementedError()
×
99

100
        return operator()
1✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc