Benchmarks

Benchmark

Bases: ABC

This abstract class defines the structure for a benchmarking task. Subclasses of Benchmark must implement the 'run' method, which takes in various parameters related to the benchmarking task and returns a BenchmarkResult object.

Methods:

run: Abstract method that must be implemented by subclasses. It runs the benchmarking task with the given parameters and returns a BenchmarkResult object.

Source code in parsbench/benchmarks/base.py
class Benchmark(ABC):
    """
    This abstract class defines the structure for a benchmarking task. Subclasses of Benchmark must implement the 'run' method, which takes in various parameters related to the benchmarking task and returns a BenchmarkResult object.

    Methods:
        run: Abstract method that must be implemented by subclasses. It runs the benchmarking task with the given parameters and returns a BenchmarkResult object.
    """

    @abstractmethod
    def run(
        self,
        prompt_lang: str = "fa",
        prompt_shots: list[int] | None = None,
        n_first: int | None = None,
        sort_by_score: bool = True,
        save_matches: bool = False,
        save_evaluation: bool = False,
        save_benchmark: bool = False,
        output_path: str | None = None,
        skip_existing_matches: bool = False,
        prefer_concurrency: bool = True,
        n_workers: int = 4,
    ) -> BenchmarkResult:
        """
        Abstract method that must be implemented by subclasses. It runs the benchmarking task with the given parameters and returns a BenchmarkResult object.

        Parameters:
            prompt_lang (str, optional): The language of the prompt (default is "fa").
            prompt_shots (list[int], optional): The list of prompt shots to evaluate (default is None).
            n_first (int, optional): The number of initial prompts to consider (default is None).
            sort_by_score (bool, optional): Whether to sort the model benchmarks by average score (default is True).
            save_matches (bool, optional): Flag to save the generated matches (default is False).
            save_evaluation (bool, optional): Flag to save the evaluation results (default is False).
            save_benchmark (bool, optional): Flag to save the benchmark result (default is False).
            output_path (str, optional): The output path to save the matches, evaluation, and benchmark results (default is None).
            skip_existing_matches (bool, optional): Flag to skip already generated matches in the output path (default is False).
            prefer_concurrency (bool, optional): Whether to use concurrent processing if the model and task support it (default is True).
            n_workers (int, optional): The number of workers for concurrent processing (default is 4).

        Returns:
            BenchmarkResult: An object containing the benchmarking results.
        """
        pass

run(prompt_lang='fa', prompt_shots=None, n_first=None, sort_by_score=True, save_matches=False, save_evaluation=False, save_benchmark=False, output_path=None, skip_existing_matches=False, prefer_concurrency=True, n_workers=4) abstractmethod

Abstract method that must be implemented by subclasses. It runs the benchmarking task with the given parameters and returns a BenchmarkResult object.

Parameters:

prompt_lang (str, default 'fa'): The language of the prompt.
prompt_shots (list[int] | None, default None): The list of prompt shots to evaluate.
n_first (int | None, default None): The number of initial prompts to consider.
sort_by_score (bool, default True): Whether to sort the model benchmarks by average score.
save_matches (bool, default False): Flag to save the generated matches.
save_evaluation (bool, default False): Flag to save the evaluation results.
save_benchmark (bool, default False): Flag to save the benchmark result.
output_path (str | None, default None): The output path to save the matches, evaluation, and benchmark results.
skip_existing_matches (bool, default False): Flag to skip already generated matches in the output path.
prefer_concurrency (bool, default True): Whether to use concurrent processing if the model and task support it.
n_workers (int, default 4): The number of workers for concurrent processing.

Returns:

BenchmarkResult: An object containing the benchmarking results.

Source code in parsbench/benchmarks/base.py
@abstractmethod
def run(
    self,
    prompt_lang: str = "fa",
    prompt_shots: list[int] | None = None,
    n_first: int | None = None,
    sort_by_score: bool = True,
    save_matches: bool = False,
    save_evaluation: bool = False,
    save_benchmark: bool = False,
    output_path: str | None = None,
    skip_existing_matches: bool = False,
    prefer_concurrency: bool = True,
    n_workers: int = 4,
) -> BenchmarkResult:
    """
    Abstract method that must be implemented by subclasses. It runs the benchmarking task with the given parameters and returns a BenchmarkResult object.

    Parameters:
        prompt_lang (str, optional): The language of the prompt (default is "fa").
        prompt_shots (list[int], optional): The list of prompt shots to evaluate (default is None).
        n_first (int, optional): The number of initial prompts to consider (default is None).
        sort_by_score (bool, optional): Whether to sort the model benchmarks by average score (default is True).
        save_matches (bool, optional): Flag to save the generated matches (default is False).
        save_evaluation (bool, optional): Flag to save the evaluation results (default is False).
        save_benchmark (bool, optional): Flag to save the benchmark result (default is False).
        output_path (str, optional): The output path to save the matches, evaluation, and benchmark results (default is None).
        skip_existing_matches (bool, optional): Flag to skip already generated matches in the output path (default is False).
        prefer_concurrency (bool, optional): Whether to use concurrent processing if the model and task support it (default is True).
        n_workers (int, optional): The number of workers for concurrent processing (default is 4).

    Returns:
        BenchmarkResult: An object containing the benchmarking results.
    """
    pass
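
For illustration, here is a minimal sketch of a concrete subclass. The class name SingleTaskBenchmark and its constructor are hypothetical; the run signature, the task context manager, Task.evaluate keyword arguments, and model.model_name are taken from the CustomBenchmark source shown below, and the import paths follow the "Source code in ..." paths on this page.

from parsbench.benchmarks.base import Benchmark
from parsbench.benchmarks.benchmark_result import BenchmarkResult, ModelBenchmarkResult


class SingleTaskBenchmark(Benchmark):
    """Hypothetical benchmark that evaluates every model on a single task."""

    def __init__(self, models, task):
        self.models = models
        self.task = task

    def run(
        self,
        prompt_lang: str = "fa",
        prompt_shots: list[int] | None = None,
        n_first: int | None = None,
        sort_by_score: bool = True,
        save_matches: bool = False,
        save_evaluation: bool = False,
        save_benchmark: bool = False,
        output_path: str | None = None,
        skip_existing_matches: bool = False,
        prefer_concurrency: bool = True,
        n_workers: int = 4,
    ) -> BenchmarkResult:
        model_benchmarks = []
        with self.task:  # tasks are used as context managers (see CustomBenchmark below)
            for model in self.models:
                # Task.evaluate returns a list of EvaluationResult objects.
                evaluation_results = self.task.evaluate(
                    model=model,
                    prompt_lang=prompt_lang,
                    prompt_shots=prompt_shots,
                    n_first=n_first,
                    save_matches=save_matches,
                    save_evaluation=save_evaluation,
                    output_path=output_path,
                    skip_existing_matches=skip_existing_matches,
                    prefer_concurrency=prefer_concurrency,
                    n_workers=n_workers,
                )
                model_benchmarks.append(
                    ModelBenchmarkResult(
                        model_name=model.model_name,
                        evaluation_results=evaluation_results,
                    )
                )
        if sort_by_score:
            model_benchmarks.sort(key=lambda mb: mb.average_score, reverse=True)
        benchmark_result = BenchmarkResult(model_benchmarks=model_benchmarks)
        if save_benchmark:
            benchmark_result.save(output_path)
        return benchmark_result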

CustomBenchmark

Bases: Benchmark

CustomBenchmark class represents a custom benchmarking task that extends the Benchmark abstract class. It defines the run method to execute the benchmarking process for a given list of models and tasks.

Attributes:

models (list[Model]): The list of models to evaluate in the benchmarking task.
tasks (list[Task]): The list of tasks to evaluate with the models.

Methods:

run: Executes the benchmarking process for the specified models and tasks, generating evaluation results for each model on each task.

Source code in parsbench/benchmarks/custom_benchmark.py
class CustomBenchmark(Benchmark):
    """
    CustomBenchmark class represents a custom benchmarking task that extends the Benchmark abstract class. It defines the run method to execute the benchmarking process for a given list of models and tasks.

    Attributes:
        models (list[Model]): The list of models to evaluate in the benchmarking task.
        tasks (list[Task]): The list of tasks to evaluate with the models.

    Methods:
        run: Executes the benchmarking process for the specified models and tasks, generating evaluation results for each model on each task.
    """

    def __init__(
        self,
        models: list[Model],
        tasks: list[Task],
    ):
        self.models = models
        self.tasks = tasks

    def run(
        self,
        prompt_lang: str = "fa",
        prompt_shots: list[int] | None = None,
        n_first: int | None = None,
        sort_by_score: bool = True,
        save_matches: bool = False,
        save_evaluation: bool = False,
        save_benchmark: bool = False,
        output_path: str | None = None,
        skip_existing_matches: bool = False,
        prefer_concurrency: bool = True,
        n_workers: int = 4,
    ) -> BenchmarkResult:
        """
        Run the benchmarking process for the given models and tasks.

        Parameters:
            prompt_lang (str, optional): The language of the prompt (default is "fa").
            prompt_shots (list[int], optional): The list of prompt shots to evaluate (default is None).
            n_first (int, optional): The number of initial prompts to consider (default is None).
            sort_by_score (bool, optional): Whether to sort the model benchmarks by average score (default is True).
            save_matches (bool, optional): Flag to save the generated matches (default is False).
            save_evaluation (bool, optional): Flag to save the evaluation results (default is False).
            save_benchmark (bool, optional): Flag to save the benchmark result (default is False).
            output_path (str, optional): The output path to save the matches, evaluation, and benchmark results (default is None).
            skip_existing_matches (bool, optional): Flag to skip already generated matches in the output path (default is False).
            prefer_concurrency (bool, optional): Whether to use concurrent processing if the model and task support it (default is True).
            n_workers (int, optional): The number of workers for concurrent processing (default is 4).

        Returns:
            BenchmarkResult: The result of the benchmarking process.
        """

        model_evaluations: dict[str, list[EvaluationResult]] = defaultdict(list)

        for task in self.tasks:
            print(f"Evaluating {task.task_name}:")

            if inspect.isclass(task):
                if issubclass(task, Task):
                    task: Task = task()
                else:
                    raise TypeError(
                        f"{task} is not a subclass/instance of the Task class."
                    )

            with task:
                for model in self.models:
                    print(f"Model: {model.model_name}")

                    evaluation_results = task.evaluate(
                        model=model,
                        prompt_lang=prompt_lang,
                        prompt_shots=prompt_shots,
                        n_first=n_first,
                        save_matches=save_matches,
                        save_evaluation=save_evaluation,
                        output_path=output_path,
                        skip_existing_matches=skip_existing_matches,
                        prefer_concurrency=prefer_concurrency,
                        n_workers=n_workers,
                    )
                    model_evaluations[model.model_name].extend(evaluation_results)

        model_benchmarks = [
            ModelBenchmarkResult(
                model_name=model_name,
                evaluation_results=evaluation_results,
            )
            for model_name, evaluation_results in model_evaluations.items()
        ]

        if sort_by_score:
            model_benchmarks.sort(key=lambda mb: mb.average_score, reverse=True)

        benchmark_result = BenchmarkResult(model_benchmarks=model_benchmarks)

        if save_benchmark:
            benchmark_result.save(output_path)

        return benchmark_result

run(prompt_lang='fa', prompt_shots=None, n_first=None, sort_by_score=True, save_matches=False, save_evaluation=False, save_benchmark=False, output_path=None, skip_existing_matches=False, prefer_concurrency=True, n_workers=4)

Run the benchmarking process for the given models and tasks.

Parameters:

prompt_lang (str, default 'fa'): The language of the prompt.
prompt_shots (list[int] | None, default None): The list of prompt shots to evaluate.
n_first (int | None, default None): The number of initial prompts to consider.
sort_by_score (bool, default True): Whether to sort the model benchmarks by average score.
save_matches (bool, default False): Flag to save the generated matches.
save_evaluation (bool, default False): Flag to save the evaluation results.
save_benchmark (bool, default False): Flag to save the benchmark result.
output_path (str | None, default None): The output path to save the matches, evaluation, and benchmark results.
skip_existing_matches (bool, default False): Flag to skip already generated matches in the output path.
prefer_concurrency (bool, default True): Whether to use concurrent processing if the model and task support it.
n_workers (int, default 4): The number of workers for concurrent processing.

Returns:

BenchmarkResult: The result of the benchmarking process.

Source code in parsbench/benchmarks/custom_benchmark.py
def run(
    self,
    prompt_lang: str = "fa",
    prompt_shots: list[int] | None = None,
    n_first: int | None = None,
    sort_by_score: bool = True,
    save_matches: bool = False,
    save_evaluation: bool = False,
    save_benchmark: bool = False,
    output_path: str | None = None,
    skip_existing_matches: bool = False,
    prefer_concurrency: bool = True,
    n_workers: int = 4,
) -> BenchmarkResult:
    """
    Run the benchmarking process for the given models and tasks.

    Parameters:
        prompt_lang (str, optional): The language of the prompt (default is "fa").
        prompt_shots (list[int], optional): The list of prompt shots to evaluate (default is None).
        n_first (int, optional): The number of initial prompts to consider (default is None).
        sort_by_score (bool, optional): Whether to sort the model benchmarks by average score (default is True).
        save_matches (bool, optional): Flag to save the generated matches (default is False).
        save_evaluation (bool, optional): Flag to save the evaluation results (default is False).
        save_benchmark (bool, optional): Flag to save the benchmark result (default is False).
        output_path (str, optional): The output path to save the matches, evaluation, and benchmark results (default is None).
        skip_existing_matches (bool, optional): Flag to skip already generated matches in the output path (default is False).
        prefer_concurrency (bool, optional): Whether to use concurrent processing if the model and task support it (default is True).
        n_workers (int, optional): The number of workers for concurrent processing (default is 4).

    Returns:
        BenchmarkResult: The result of the benchmarking process.
    """

    model_evaluations: dict[str, list[EvaluationResult]] = defaultdict(list)

    for task in self.tasks:
        print(f"Evaluating {task.task_name}:")

        if inspect.isclass(task):
            if issubclass(task, Task):
                task: Task = task()
            else:
                raise TypeError(
                    f"{task} is not a subclass/instance of the Task class."
                )

        with task:
            for model in self.models:
                print(f"Model: {model.model_name}")

                evaluation_results = task.evaluate(
                    model=model,
                    prompt_lang=prompt_lang,
                    prompt_shots=prompt_shots,
                    n_first=n_first,
                    save_matches=save_matches,
                    save_evaluation=save_evaluation,
                    output_path=output_path,
                    skip_existing_matches=skip_existing_matches,
                    prefer_concurrency=prefer_concurrency,
                    n_workers=n_workers,
                )
                model_evaluations[model.model_name].extend(evaluation_results)

    model_benchmarks = [
        ModelBenchmarkResult(
            model_name=model_name,
            evaluation_results=evaluation_results,
        )
        for model_name, evaluation_results in model_evaluations.items()
    ]

    if sort_by_score:
        model_benchmarks.sort(key=lambda mb: mb.average_score, reverse=True)

    benchmark_result = BenchmarkResult(model_benchmarks=model_benchmarks)

    if save_benchmark:
        benchmark_result.save(output_path)

    return benchmark_result
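
A usage sketch: the model and task objects below are placeholders (any parsbench Model implementation and any Task subclass or class), while CustomBenchmark, its import path, and the run parameters come from this page.

from parsbench.benchmarks.custom_benchmark import CustomBenchmark

model = ...  # placeholder: any parsbench Model implementation
task = ...   # placeholder: any Task subclass or Task class (run() instantiates classes)

benchmark = CustomBenchmark(models=[model], tasks=[task])
result = benchmark.run(
    prompt_lang="fa",
    prompt_shots=[0, 3],     # evaluate zero-shot and 3-shot prompts
    n_first=100,             # only the first 100 prompts per task
    save_matches=True,
    save_evaluation=True,
    save_benchmark=True,
    output_path="benchmark_output/",
)
print(result)  # per-model, per-task, per-shot scores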

ParsiNLUBenchmark

Bases: CustomBenchmark

This benchmark class includes all existing tasks which use ParsiNLU datasets.

Attributes:

models (list[Model]): The list of models to evaluate in the benchmarking task.

Methods:

run: Executes the benchmarking process for the specified models, generating evaluation results for each model on each task.

Source code in parsbench/benchmarks/parsinlu_benchmark.py
class ParsiNLUBenchmark(CustomBenchmark):
    """
    This benchmark class includes all existing tasks which use ParsiNLU datasets.

    Attributes:
        models (list[Model]): The list of models to evaluate in the benchmarking task.

    Methods:
        run: Executes the benchmarking process for the specified models, generating evaluation results for each model on each task.
    """

    def __init__(self, models: list[Model]):
        tasks = [
            ParsiNLUEntailment,
            ParsiNLUMachineTranslationEnFa,
            ParsiNLUMachineTranslationFaEn,
            ParsiNLUMultipleChoice,
            ParsiNLUReadingComprehension,
            ParsiNLUSentimentAnalysis,
        ]
        super().__init__(models, tasks)
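
A usage sketch, assuming a ready parsbench Model implementation (the model object and output path are placeholders):

from parsbench.benchmarks.parsinlu_benchmark import ParsiNLUBenchmark

model = ...  # placeholder: any parsbench Model implementation

benchmark = ParsiNLUBenchmark(models=[model])
result = benchmark.run(
    prompt_shots=[0],        # zero-shot prompts only
    n_first=200,             # limit each ParsiNLU task to its first 200 prompts
    save_benchmark=True,
    output_path="parsinlu_benchmark/",
)
print(result)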

ModelBenchmarkResult

Represents the results of benchmarking a model across multiple evaluations.

Attributes:

model_name (str): The name of the model being benchmarked.
evaluation_results (list[EvaluationResult]): A list of EvaluationResult objects representing the evaluation results for the model.

Source code in parsbench/benchmarks/benchmark_result.py
@dataclass
class ModelBenchmarkResult:
    """
    Represents the results of benchmarking a model across multiple evaluations.

    Attributes:
        model_name (str): The name of the model being benchmarked.
        evaluation_results (list[EvaluationResult]): A list of EvaluationResult objects representing the evaluation results for the model.
    """

    model_name: str
    evaluation_results: list[EvaluationResult]

    @classmethod
    def from_dict(cls, data: dict) -> "ModelBenchmarkResult":
        evaluation_results = [
            EvaluationResult.from_dict(task) for task in data.pop("evaluation_results")
        ]
        return cls(**data, evaluation_results=evaluation_results)

    def to_dict(self) -> dict:
        return {
            **asdict(self),
            "evaluation_results": [e.to_dict() for e in self.evaluation_results],
        }

    def to_pandas(self) -> pd.DataFrame:
        return pd.concat([er.to_pandas() for er in self.evaluation_results])

    def __str__(self) -> str:
        text = f"Model: {self.model_name}\nEvaluation Results:\n"
        for er in self.evaluation_results:
            text += f"- {er.task_name}"
            if er.sub_task:
                text += f" ({er.sub_task}):\n"
            else:
                text += ":\n"
            for psr in er.prompt_shot_results:
                text += f"  - {psr.n_shots}-shot prompt: {psr.score:.4f}\n"
        return text.strip("\n")

    @property
    def average_score(self) -> float:
        return sum([er.average_score for er in self.evaluation_results]) / len(
            self.evaluation_results
        )
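
For example, given a BenchmarkResult from a previous run (documented next; the variable name is hypothetical), individual model results can be inspected like this:

# benchmark_result: a BenchmarkResult produced by a benchmark run
best = benchmark_result.model_benchmarks[0]  # highest average score when sorted

print(f"{best.model_name}: {best.average_score:.4f}")
print(best)            # per-task, per-shot score listing (the __str__ format above)
df = best.to_pandas()  # all evaluation results as a single pandas DataFrame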

BenchmarkResult

Represents the results of benchmarking multiple models across various evaluations.

Attributes:

model_benchmarks (list[ModelBenchmarkResult]): A list of ModelBenchmarkResult objects representing the benchmark results for each model.

Source code in parsbench/benchmarks/benchmark_result.py
@dataclass
class BenchmarkResult:
    """
    Represents the results of benchmarking multiple models across various evaluations.

    Attributes:
        model_benchmarks (list[ModelBenchmarkResult]): A list of ModelBenchmarkResult objects representing the benchmark results for each model.
    """

    model_benchmarks: list[ModelBenchmarkResult]

    @classmethod
    def from_file(cls, path: str) -> "BenchmarkResult":
        with jsonlines.open(path, "r") as reader:
            model_benchmarks: list[ModelBenchmarkResult] = []
            for row in reader.iter(type=dict, skip_invalid=True):
                model_benchmarks.append(ModelBenchmarkResult.from_dict(row))

        return cls(model_benchmarks=model_benchmarks)

    @classmethod
    def from_evaluation_files(cls, path: str) -> "BenchmarkResult":
        models = [(d.name, d.path) for d in os.scandir(path) if d.is_dir()]
        model_benchmarks = []

        for model_name, model_path in models:
            eval_paths = [d.path for d in os.scandir(model_path) if d.is_dir()]

            evaluation_results = []

            for eval_path in eval_paths:
                eval_files = [
                    f
                    for f in os.scandir(eval_path)
                    if f.is_file() and f.name.startswith("evaluation")
                ]
                evaluation_results.extend(
                    [
                        EvaluationResult.from_file(eval_file.path)
                        for eval_file in eval_files
                    ]
                )

            model_benchmarks.append(
                ModelBenchmarkResult(
                    model_name=model_name,
                    evaluation_results=evaluation_results,
                )
            )

        return BenchmarkResult(model_benchmarks=model_benchmarks)

    @classmethod
    def from_matches_files(cls, path: str, rescore: bool = False) -> "BenchmarkResult":
        task_cls_mapping = {
            task_cls.task_name.replace("-", " "): task_cls
            for task_cls in load_all_tasks()
        }

        _with_subtask_pattern = re.compile(r"matches_([\w\s]+)_(\d+)_shot\.jsonl")
        _without_subtask_pattern = re.compile(r"matches_(\d+)_shot\.jsonl")

        matches_paths = glob.glob(f"{path}/*/*/matches*.jsonl")

        model_evals: list[tuple[str, str, str, TaskMatchGroup]] = []

        for match_path in matches_paths:
            match_file = os.path.basename(match_path)
            task_name = os.path.basename(os.path.dirname(match_path)).replace("_", " ")
            model_name = os.path.basename(os.path.dirname(os.path.dirname(match_path)))
            sub_task = None
            n_shots = 0

            if m := _with_subtask_pattern.match(match_file):
                sub_task = m.group(1)
                n_shots = int(m.group(2))
            elif m := _without_subtask_pattern.match(match_file):
                n_shots = int(m.group(1))
            else:
                raise Exception(
                    f"Matches file '{match_file}' doesn't match the expected pattern."
                )

            task_matches = TaskMatchGroup.from_file(
                Path(match_path).parent, n_shots=n_shots, sub_task=sub_task
            )
            assert (
                task_name in task_cls_mapping
            ), f"No task class found for '{task_name}'."

            model_evals.append((model_name, task_name, sub_task, task_matches))

        model_benchmarks: list[ModelBenchmarkResult] = []

        for model_name, task_evals in itertools.groupby(
            model_evals, key=lambda t: t[0]
        ):
            print(f"Model: {model_name}")
            evaluation_results: list[EvaluationResult] = []

            for task_name, task_matches_group in itertools.groupby(
                task_evals, key=lambda t: t[1]
            ):
                print(f"Re-scoring {task_name}:")
                task = task_cls_mapping[task_name]()
                prompt_shot_evals = defaultdict(list)

                for _, _, sub_task, task_matches in task_matches_group:
                    print(f"{sub_task} {task_matches.n_shots}-shot prompt:")
                    if rescore:
                        task_matches = task.score_matches(task_matches)

                    score = task.get_overall_score(task_matches)

                    prompt_shot_evals[sub_task].append(
                        PromptShotEvaluationResult(
                            n_shots=task_matches.n_shots, score=score
                        )
                    )

                evaluation_results.extend(
                    EvaluationResult(
                        model_name=model_name,
                        task_name=task_name,
                        task_category=task.task_category,
                        score_name=task.score_name,
                        prompt_shot_results=prompt_shot_results,
                        sub_task=sub_task,
                    )
                    for sub_task, prompt_shot_results in prompt_shot_evals.items()
                )

            model_benchmarks.append(
                ModelBenchmarkResult(
                    model_name=model_name, evaluation_results=evaluation_results
                )
            )
            print("-" * 10)

        return BenchmarkResult(model_benchmarks=model_benchmarks)

    @classmethod
    def from_dict(cls, data: dict) -> "BenchmarkResult":
        model_benchmarks = [
            ModelBenchmarkResult.from_dict(mbr) for mbr in data.pop("model_benchmarks")
        ]
        return cls(**data, model_benchmarks=model_benchmarks)

    def to_dict(self) -> dict:
        return {
            **asdict(self),
            "model_benchmarks": [mb.to_dict() for mb in self.model_benchmarks],
        }

    def to_pandas(self, pivot: bool = False) -> pd.DataFrame:
        df = pd.concat([mb.to_pandas() for mb in self.model_benchmarks])
        if pivot:
            return df.pivot(
                index=["task_category", "task_name", "sub_task", "score_name"],
                columns=["model_name", "n_shots"],
                values=["score"],
            )
        return df

    def show_radar_plot(self, title="Radar Plot"):
        data = []
        categories = set()

        for mb in self.model_benchmarks:
            values = []
            for _, evals in groupby(mb.evaluation_results, key=lambda e: e.task_name):
                evals = list(evals)
                score = sum(e.average_score for e in evals) / len(evals)
                values.append(score)

            data.append({"name": mb.model_name, "values": values})
            categories |= set(e.task_name for e in mb.evaluation_results)

        _radar_plot(data, categories, title)

    def save(self, path: str):
        benchmark_path = Path(path) / "benchmark.jsonl"

        with jsonlines.open(benchmark_path, "w") as writer:
            for mb in self.model_benchmarks:
                writer.write(mb.to_dict())

    def __str__(self) -> str:
        text = ""
        for mb in self.model_benchmarks:
            text += str(mb) + "\n" + "-" * 10 + "\n"
        return text.strip("\n")
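
A sketch of the typical ways to load and inspect a BenchmarkResult; the paths are placeholders, and the methods and import path are the ones defined above.

from parsbench.benchmarks.benchmark_result import BenchmarkResult

# Reload a benchmark saved with save(), which writes <output_path>/benchmark.jsonl ...
result = BenchmarkResult.from_file("benchmark_output/benchmark.jsonl")

# ... or rebuild one from previously saved evaluation or matches files.
result = BenchmarkResult.from_evaluation_files("benchmark_output/")
result = BenchmarkResult.from_matches_files("benchmark_output/", rescore=True)

print(result)                          # per-model summaries
df = result.to_pandas(pivot=True)      # scores pivoted by model_name and n_shots
result.show_radar_plot(title="ParsBench Results")
result.save("benchmark_output/")       # writes benchmark_output/benchmark.jsonl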

merge_benchmark_results(benchmarks, sort=True, keep_duplicates=False)

Merge multiple BenchmarkResult objects into a single BenchmarkResult object.

Parameters:

benchmarks (list[BenchmarkResult], required): A list of BenchmarkResult objects to merge.
sort (bool, default True): Whether to sort the merged ModelBenchmarkResult list by average score.
keep_duplicates (bool, default False): Whether to keep duplicate model names in the merged list.

Returns:

BenchmarkResult: A new BenchmarkResult object containing the merged ModelBenchmarkResult list.

Source code in parsbench/benchmarks/benchmark_result.py
def merge_benchmark_results(
    benchmarks: list[BenchmarkResult], sort: bool = True, keep_duplicates: bool = False
) -> BenchmarkResult:
    """
    Merge multiple BenchmarkResult objects into a single BenchmarkResult object.

    Parameters:
        benchmarks (list[BenchmarkResult]): A list of BenchmarkResult objects to merge.
        sort (bool, optional): Whether to sort the merged ModelBenchmarkResult list by average score. Defaults to True.
        keep_duplicates (bool, optional): Whether to keep duplicate model names in the merged list. Defaults to False.

    Returns:
        BenchmarkResult: A new BenchmarkResult object containing the merged ModelBenchmarkResult list.
    """
    model_benchmarks: list[ModelBenchmarkResult] = []
    for benchmark in benchmarks:
        model_benchmarks.extend(benchmark.model_benchmarks)

    if not keep_duplicates:
        model_names = set()
        skipped = 0
        for index in range(len(model_benchmarks)):
            mbr = model_benchmarks[index - skipped]
            if mbr.model_name in model_names:
                model_benchmarks.pop(index - skipped)
                skipped += 1
            model_names.add(mbr.model_name)

    if sort:
        model_benchmarks.sort(key=lambda m: m.average_score, reverse=True)

    return BenchmarkResult(model_benchmarks=model_benchmarks)
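
A usage sketch (file paths are placeholders; the function and import path come from the source above):

from parsbench.benchmarks.benchmark_result import (
    BenchmarkResult,
    merge_benchmark_results,
)

run_a = BenchmarkResult.from_file("runs/a/benchmark.jsonl")
run_b = BenchmarkResult.from_file("runs/b/benchmark.jsonl")

# Combine both runs; duplicate model names are dropped and the merged
# list is sorted by average score.
merged = merge_benchmark_results([run_a, run_b], sort=True, keep_duplicates=False)
merged.save("runs/")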

build_leaderboard_from_benchmark(benchmark_result, leaderboard_path)

This function generates leaderboard data from the benchmark result object.

Parameters:

benchmark_result (BenchmarkResult, required): The BenchmarkResult object to convert into leaderboard files.
leaderboard_path (str, required): Path to store the leaderboard data.

Source code in parsbench/benchmarks/benchmark_result.py
def build_leaderboard_from_benchmark(
    benchmark_result: BenchmarkResult, leaderboard_path: str
):
    """
    This function generates leaderboard data from the benchmark result object.

    Parameters:
        benchmark_result (BenchmarkResult): BenchmarkResult object.
        leaderboard_path (str): Path to store the leaderboard data.
    """
    requests_path = Path(leaderboard_path) / "requests"
    results_path = Path(leaderboard_path) / "results"

    requests_path.mkdir(exist_ok=True)
    results_path.mkdir(exist_ok=True)

    now = datetime.datetime.now(pytz.UTC).isoformat(timespec="seconds")

    for mb in benchmark_result.model_benchmarks:
        model_name = mb.model_name

        os.makedirs(results_path / model_name, exist_ok=True)

        request = {
            "model": model_name,
            "base_model": "",
            "revision": "main",
            "private": False,
            "precision": "?",
            "weight_type": "Original",
            "status": "FINISHED",
            "submitted_time": now,
            "model_type": "\ud83d\udfe2 : pretrained",
            "likes": 0,
            "params": 0.1,
            "license": "custom",
        }
        with open(
            requests_path / f"{model_name}_eval_request_nshot.json", "wt"
        ) as writer:
            writer.write(json.dumps(request))

        result = {
            "config": {"model_dtype": "", "model_name": model_name, "model_sha": ""},
            "results": {
                er.task_name: {er.score_name: round(er.max_score, 3)}
                for er in mb.evaluation_results
            },
        }

        with open(results_path / model_name / f"results_{now}.json", "wt") as writer:
            writer.write(json.dumps(result))
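
A usage sketch (the directories are placeholders; note that the leaderboard_path directory must already exist, since only the requests and results subdirectories are created inside it):

from parsbench.benchmarks.benchmark_result import (
    BenchmarkResult,
    build_leaderboard_from_benchmark,
)

result = BenchmarkResult.from_file("benchmark_output/benchmark.jsonl")

# Writes <leaderboard_path>/requests/<model>_eval_request_nshot.json and
# <leaderboard_path>/results/<model>/results_<timestamp>.json for each model.
build_leaderboard_from_benchmark(result, leaderboard_path="leaderboard/")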