
Tasks

Task

Bases: TaskMatchGenerator, TaskScorer

The Task class combines the match-generation functionality of TaskMatchGenerator with the scoring functionality of TaskScorer.

Attributes:
    task_name (str): The name of the task.
    task_category (TaskCategory): The category of the task.

Methods:
    evaluate: Evaluates the task by generating matches, scoring them, and saving the results.

Source code in parsbench/tasks/base/task.py
class Task(TaskMatchGenerator, TaskScorer, metaclass=ABCMeta):
    """
    Task class represents a task that combines functionality from TaskMatchGenerator and TaskScorer.

    Attributes:
        task_name (str): The name of the task.
        task_category (TaskCategory): The category of the task.

    Methods:
        evaluate: Method to evaluate the task by generating matches, scoring them, and saving the results.
    """

    task_name: str
    task_category: TaskCategory

    def evaluate(
        self,
        model: Model,
        prompt_lang: str = "fa",
        prompt_shots: list[int] = None,
        n_first: int = 200,
        sub_tasks: list[str] | None = None,
        save_matches: bool = False,
        save_evaluation: bool = False,
        output_path: str = None,
        skip_existing_matches: bool = False,
        prefer_concurrency: bool = True,
        n_workers: int = 4,
    ) -> list[EvaluationResult]:
        """
        Method to evaluate the task by generating matches, scoring them, and saving the results.

        Parameters:
            model (Model): The model to be evaluated.
            prompt_lang (str, optional): The language of the prompt (default is "fa").
            prompt_shots (list[int], optional): The list of prompt shots to evaluate (default is None).
            n_first (int, optional): The number of initial prompts to consider (default is 200).
            sub_tasks (list[str], optional): The list of sub-tasks to evaluate (default is None).
            save_matches (bool, optional): Flag to save the generated matches (default is False).
            save_evaluation (bool, optional): Flag to save the evaluation results (default is False).
            output_path (str, optional): The output path to save the matches and evaluation results.
            skip_existing_matches (bool, optional): Flag to skip already generated matches in the output path (default is False).
            prefer_concurrency (bool, optional): The flag to use concurrent processing if the model and task support that (default is True).
            n_workers (int, optional): The number of workers for concurrent processing (default is 4).

        Returns:
            list[EvaluationResult]: A list of EvaluationResult objects representing the evaluation results.

        Raises:
            Exception: If output_path is not provided when saving matches or evaluation.
            Exception: If output_path is not provided when skipping existing matches.
            Exception: If sub tasks are not defined or if invalid sub tasks are provided.

        """
        if (save_matches or save_evaluation) and not output_path:
            raise Exception(
                "You should set the output path to save matches/evaluation."
            )

        if skip_existing_matches and not output_path:
            raise Exception(
                "Cannot find already generated matches when output_path is not set."
            )

        task_path = None
        if output_path:
            task_path = get_task_path(output_path, model.model_name, self.task_name)

        prompt_shots = [0] if prompt_shots is None else prompt_shots

        if sub_tasks:
            if not self.sub_tasks:
                raise Exception("Sub tasks are not defined.")

            invalid_sub_tasks = set(sub_tasks) - set(self.sub_tasks)
            if invalid_sub_tasks:
                raise Exception(f"Sub tasks {invalid_sub_tasks} are not defined.")

        sub_tasks = sub_tasks or self._selected_sub_tasks or self.sub_tasks

        evaluation_results: list[EvaluationResult] = []

        for sub_task in sub_tasks or [None]:
            match_groups: list[TaskMatchGroup] = []

            for shots in prompt_shots:
                if skip_existing_matches and check_task_matches_exists(
                    task_path, shots, sub_task=sub_task
                ):
                    match_group = TaskMatchGroup.from_file(
                        task_path, shots, sub_task=sub_task
                    )
                    match_group._loaded_locally = True
                else:
                    match_group = self.generate_matches(
                        prompt_lang,
                        n_shots=shots,
                        n_first=n_first,
                        sub_task=sub_task,
                    )
                match_groups.append(match_group)

            for match_group in match_groups:
                eval_desc = f"{match_group.n_shots}-shot"
                if sub_task:
                    eval_desc = f"sub task '{sub_task}' with " + eval_desc
                desc = f"Evaluating {eval_desc} prompt:"
                print(desc)

                is_loaded_locally = getattr(match_group, "_loaded_locally", False)

                if is_loaded_locally:
                    total_skipped = sum(m.completion is not None for m in match_group)
                    print(
                        f"{total_skipped} of {len(match_group)} match completions will be loaded from local."
                    )

                try:
                    model.generate_completions(
                        match_group,
                        prefer_concurrency=prefer_concurrency,
                        skip_existing=is_loaded_locally,
                        n_workers=n_workers,
                    )
                    self.score_matches(match_group)
                finally:
                    if save_matches:
                        match_group.save(task_path, sub_task=sub_task)

            evaluation_result = EvaluationResult(
                model_name=model.model_name,
                task_name=self.task_name,
                task_category=self.task_category,
                score_name=self.score_name,
                sub_task=sub_task,
                prompt_shot_results=[
                    PromptShotEvaluationResult(
                        n_shots=m.n_shots,
                        score=self.get_overall_score(m),
                    )
                    for m in match_groups
                ],
            )
            evaluation_results.append(evaluation_result)

            if save_evaluation:
                evaluation_result.save(task_path)

        return evaluation_results

    def __enter__(self) -> "Task":
        self.load_data()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self._data = None

evaluate(model, prompt_lang='fa', prompt_shots=None, n_first=200, sub_tasks=None, save_matches=False, save_evaluation=False, output_path=None, skip_existing_matches=False, prefer_concurrency=True, n_workers=4)

Evaluates the task by generating matches, scoring them, and saving the results.

Parameters:
    model (Model): The model to be evaluated. Required.
    prompt_lang (str, optional): The language of the prompt. Defaults to "fa".
    prompt_shots (list[int], optional): The numbers of shots to evaluate. Defaults to None, which evaluates the 0-shot prompt only.
    n_first (int, optional): The number of initial prompts to consider. Defaults to 200.
    sub_tasks (list[str], optional): The sub-tasks to evaluate. Defaults to None, which falls back to the selected sub-tasks or all sub-tasks.
    save_matches (bool, optional): Whether to save the generated matches. Defaults to False.
    save_evaluation (bool, optional): Whether to save the evaluation results. Defaults to False.
    output_path (str, optional): The output path for saving matches and evaluation results. Defaults to None.
    skip_existing_matches (bool, optional): Whether to reuse matches already generated in the output path. Defaults to False.
    prefer_concurrency (bool, optional): Whether to use concurrent processing if the model and task support it. Defaults to True.
    n_workers (int, optional): The number of workers for concurrent processing. Defaults to 4.

Returns:
    list[EvaluationResult]: A list of EvaluationResult objects representing the evaluation results.

Raises:
    Exception: If output_path is not provided when saving matches or evaluation results.
    Exception: If output_path is not provided when skipping existing matches.
    Exception: If sub-tasks are not defined or if invalid sub-tasks are provided.
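
Below is a minimal usage sketch. MyTask and my_model are placeholders for a concrete Task subclass and a Model implementation from parsbench; only arguments documented above are used, and the context-manager form relies on __enter__ calling load_data().

# Minimal usage sketch; MyTask and my_model are hypothetical placeholders.
def run_task(task, model):
    # Entering the task as a context manager loads its data; exiting releases it.
    with task as t:
        results = t.evaluate(
            model,
            prompt_lang="fa",        # prompt language (default "fa")
            prompt_shots=[0, 3],     # evaluate 0-shot and 3-shot prompts
            n_first=100,             # limit evaluation to the first 100 samples
            save_matches=True,
            save_evaluation=True,
            output_path="results/",  # required whenever anything is saved
        )
    for result in results:           # one EvaluationResult per sub-task
        print(result)
    return results

# results = run_task(MyTask(), my_model)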

Source code in parsbench/tasks/base/task.py
def evaluate(
    self,
    model: Model,
    prompt_lang: str = "fa",
    prompt_shots: list[int] = None,
    n_first: int = 200,
    sub_tasks: list[str] | None = None,
    save_matches: bool = False,
    save_evaluation: bool = False,
    output_path: str = None,
    skip_existing_matches: bool = False,
    prefer_concurrency: bool = True,
    n_workers: int = 4,
) -> list[EvaluationResult]:
    """
    Method to evaluate the task by generating matches, scoring them, and saving the results.

    Parameters:
        model (Model): The model to be evaluated.
        prompt_lang (str, optional): The language of the prompt (default is "fa").
        prompt_shots (list[int], optional): The list of prompt shots to evaluate (default is None).
        n_first (int, optional): The number of initial prompts to consider (default is 200).
        sub_tasks (list[str], optional): The list of sub-tasks to evaluate (default is None).
        save_matches (bool, optional): Flag to save the generated matches (default is False).
        save_evaluation (bool, optional): Flag to save the evaluation results (default is False).
        output_path (str, optional): The output path to save the matches and evaluation results.
        skip_existing_matches (bool, optional): Flag to skip already generated matches in the output path (default is False).
        prefer_concurrency (bool, optional): The flag to use concurrent processing if the model and task support that (default is True).
        n_workers (int, optional): The number of workers for concurrent processing (default is 4).

    Returns:
        list[EvaluationResult]: A list of EvaluationResult objects representing the evaluation results.

    Raises:
        Exception: If output_path is not provided when saving matches or evaluation.
        Exception: If output_path is not provided when skipping existing matches.
        Exception: If sub tasks are not defined or if invalid sub tasks are provided.

    """
    if (save_matches or save_evaluation) and not output_path:
        raise Exception(
            "You should set the output path to save matches/evaluation."
        )

    if skip_existing_matches and not output_path:
        raise Exception(
            "Cannot find already generated matches when output_path is not set."
        )

    task_path = None
    if output_path:
        task_path = get_task_path(output_path, model.model_name, self.task_name)

    prompt_shots = [0] if prompt_shots is None else prompt_shots

    if sub_tasks:
        if not self.sub_tasks:
            raise Exception("Sub tasks are not defined.")

        invalid_sub_tasks = set(sub_tasks) - set(self.sub_tasks)
        if invalid_sub_tasks:
            raise Exception(f"Sub tasks {invalid_sub_tasks} are not defined.")

    sub_tasks = sub_tasks or self._selected_sub_tasks or self.sub_tasks

    evaluation_results: list[EvaluationResult] = []

    for sub_task in sub_tasks or [None]:
        match_groups: list[TaskMatchGroup] = []

        for shots in prompt_shots:
            if skip_existing_matches and check_task_matches_exists(
                task_path, shots, sub_task=sub_task
            ):
                match_group = TaskMatchGroup.from_file(
                    task_path, shots, sub_task=sub_task
                )
                match_group._loaded_locally = True
            else:
                match_group = self.generate_matches(
                    prompt_lang,
                    n_shots=shots,
                    n_first=n_first,
                    sub_task=sub_task,
                )
            match_groups.append(match_group)

        for match_group in match_groups:
            eval_desc = f"{match_group.n_shots}-shot"
            if sub_task:
                eval_desc = f"sub task '{sub_task}' with " + eval_desc
            desc = f"Evaluating {eval_desc} prompt:"
            print(desc)

            is_loaded_locally = getattr(match_group, "_loaded_locally", False)

            if is_loaded_locally:
                total_skipped = sum(m.completion is not None for m in match_group)
                print(
                    f"{total_skipped} of {len(match_group)} match completions will be loaded from local."
                )

            try:
                model.generate_completions(
                    match_group,
                    prefer_concurrency=prefer_concurrency,
                    skip_existing=is_loaded_locally,
                    n_workers=n_workers,
                )
                self.score_matches(match_group)
            finally:
                if save_matches:
                    match_group.save(task_path, sub_task=sub_task)

        evaluation_result = EvaluationResult(
            model_name=model.model_name,
            task_name=self.task_name,
            task_category=self.task_category,
            score_name=self.score_name,
            sub_task=sub_task,
            prompt_shot_results=[
                PromptShotEvaluationResult(
                    n_shots=m.n_shots,
                    score=self.get_overall_score(m),
                )
                for m in match_groups
            ],
        )
        evaluation_results.append(evaluation_result)

        if save_evaluation:
            evaluation_result.save(task_path)

    return evaluation_results

DataLoader

Bases: ABC

An abstract base class for defining data loaders.

Attributes:
    data_path (str): The path to the data source.

Methods:
    load: Abstract method to be implemented by subclasses for loading data.

Source code in parsbench/tasks/base/data_loader.py
class DataLoader(ABC):
    """
    An abstract base class for defining data loaders.

    Attributes:
        data_path (str): The path to the data source.

    Methods:
        load(self) -> list[dict]: Abstract method to be implemented by subclasses for loading data.
    """

    def __init__(self, data_path: str, **kwargs) -> None:
        self.data_path = data_path

    @abstractmethod
    def load(self) -> list[dict]:
        pass
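
The sketch below shows a hypothetical subclass implementing this interface for a file containing a plain JSON array of objects; only load() needs to be overridden.

import json

class JSONArrayDataLoader(DataLoader):
    """Hypothetical loader for a file containing a top-level JSON array of objects."""

    def load(self) -> list[dict]:
        # self.data_path is set by the DataLoader base constructor.
        with open(self.data_path, "r", encoding="utf-8") as f:
            rows = json.load(f)
        if not isinstance(rows, list):
            raise ValueError("Expected a top-level JSON array of objects.")
        return rows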

JSONLineDataLoader

Bases: DataLoader

A data loader class for loading JSON Lines data from either a local file or a URL.

Attributes:
    data_path (str): The path to the JSON Lines data source.

Methods:
    load: Loads the JSON Lines data from the specified source.

Source code in parsbench/tasks/base/data_loader.py
class JSONLineDataLoader(DataLoader):
    """
    A data loader class for loading JSON line data from either a local file or a URL.

    Attributes:
        data_path (str): The path to the JSON line data source.

    Methods:
        load(self) -> list[dict]: Loads the JSON line data from the specified source.
    """

    def load(self) -> list[dict]:
        content = _fetch_text_file(self.data_path)

        reader = jsonlines.Reader(content.split("\n"))
        return list(reader.iter(type=dict, skip_invalid=True, skip_empty=True))

HuggingFaceDataLoader

Bases: DataLoader

A data loader class for loading datasets with the Hugging Face datasets library.

Attributes:
    data_path (str): The path to the data source.
    split (str | None): The split of the dataset to load.

Methods:
    load: Loads the dataset from the specified data path and split.
    with_filter(func: Callable[..., bool]) -> HuggingFaceDataLoader: Adds a filter function to apply when loading the dataset.

Source code in parsbench/tasks/base/data_loader.py
class HuggingFaceDataLoader(DataLoader):
    """
    A data loader class for loading datasets using the Hugging Face library.

    Attributes:
        data_path (str): The path to the data source.
        split (str | None): The split of the dataset to load.

    Methods:
        load(self) -> list[dict]: Loads the dataset from the specified data path and split.
        with_filter(self, func: Callable[..., bool]) -> "HuggingFaceDataLoader": Adds a filter function to apply when loading the dataset.
    """

    def __init__(
        self,
        data_path: str,
        split: str | None = None,
        **optional_parameters: dict[str, Any],
    ) -> None:
        super().__init__(data_path)
        self.split = split
        self.optional_parameters = optional_parameters
        self._filters = []

    def load(self) -> list[dict]:
        dataset = datasets.load_dataset(
            self.data_path, split=self.split, **self.optional_parameters
        )
        if len(self._filters):
            for filter_ in self._filters:
                dataset = dataset.filter(filter_)
        return dataset.to_list()

    def with_filter(self, func: Callable[..., bool]) -> "HuggingFaceDataLoader":
        self._filters.append(func)
        return self
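
A usage sketch follows; the dataset path and column name are illustrative placeholders rather than ParsBench defaults. Filters registered with with_filter are applied through the dataset's filter() before the rows are converted to a list of dicts.

# Illustrative usage; the dataset path and column name are placeholders.
loader = HuggingFaceDataLoader(
    data_path="some-org/some-persian-dataset",
    split="validation",
).with_filter(lambda row: row.get("text", "") != "")  # drop rows with empty text

rows = loader.load()  # list[dict], filtered before conversion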

CSVDataLoader

Bases: DataLoader

A data loader class for loading CSV data from either a local file or a URL.

Attributes:
    data_path (str): The path to the CSV data source.

Methods:
    load: Loads the CSV data from the specified source.

Source code in parsbench/tasks/base/data_loader.py
class CSVDataLoader(DataLoader):
    """
    A data loader class for loading CSV line data from either a local file or a URL.

    Attributes:
        data_path (str): The path to the CSV line data source.

    Methods:
        load(self) -> list[dict]: Loads the CSV line data from the specified source.
    """

    def __init__(self, data_path: str, csv_arguments: dict | None = None, **kwargs):
        super().__init__(data_path)
        self.csv_arguments = csv_arguments or {}

    def load(self) -> list[dict]:
        content = _fetch_text_file(self.data_path)

        csv_reader = csv.DictReader(content.split("\n"), **self.csv_arguments)
        return list(csv_reader)
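
Usage sketch (the file path is a placeholder): anything passed in csv_arguments is forwarded to csv.DictReader, so its keyword arguments such as delimiter or fieldnames can be used here.

# Illustrative usage; the path is a placeholder.
loader = CSVDataLoader("data/samples.csv", csv_arguments={"delimiter": ";"})
rows = loader.load()  # one dict per row, keyed by the CSV header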

PromptTemplate

A class representing a prompt template.

Attributes:
    language_templates (dict[str, str]): A dictionary mapping language codes to prompt templates.
    prompt_variables_mapping (dict[str, str]): A dictionary mapping prompt variable names to corresponding data keys.
    target_variables_mapping (dict[str, str]): A dictionary mapping target variable names to corresponding data keys.
    prompt_shot_templates (dict[str, str] | None): A dictionary mapping language codes to prompt shot templates, or None if not provided.
    prompt_shot_examples (dict[str, dict[int, str]] | None): A dictionary mapping language codes to example texts keyed by shot count, or None if not provided.

Source code in parsbench/tasks/base/prompt_template.py
class PromptTemplate:
    """
    A class representing a prompt template.

    Attributes:
        language_templates (dict[str, str]): A dictionary mapping language codes to prompt templates.
        prompt_variables_mapping (dict[str, str]): A dictionary mapping prompt variable names to corresponding data keys.
        target_variables_mapping (dict[str, str]): A dictionary mapping target variable names to corresponding data keys.
        prompt_shot_templates (dict[str, str] | None): A dictionary mapping language codes to prompt shot templates, or None if not provided.
        prompt_shot_examples (dict[str, dict[int, str]] | None): A dictionary mapping language codes to example texts keyed by shot count, or None if not provided.
    """

    def __init__(
        self,
        language_templates: dict[str, str],
        prompt_variables_mapping: dict[str, str],
        target_variables_mapping: dict[str, str],
        prompt_shot_templates: dict[str, str] | None = None,
        prompt_shot_examples: dict[str, dict[int, str]] | None = None,
    ):
        self.language_templates = language_templates
        self.prompt_variables_mapping = prompt_variables_mapping
        self.target_variables_mapping = target_variables_mapping

        if prompt_shot_templates is not None and prompt_shot_examples is not None:
            raise ValueError("Cannot provide both prompt shot templates and examples")

        if prompt_shot_templates is None and prompt_shot_examples is None:
            raise ValueError("Must provide either prompt shot templates or examples")

        self.prompt_shot_templates = prompt_shot_templates
        self.prompt_shot_examples = prompt_shot_examples

    def get_prompt(
        self,
        prompt_lang: str,
        data: dict,
        n_shots: int = 0,
        sample_data: list[dict] | None = None,
    ):
        prompt_template = self.language_templates.get(prompt_lang, None)
        if not prompt_template:
            raise RuntimeError(
                f"There is no prompt template for language {prompt_lang}."
            )

        if n_shots > 0:
            if sample_data:
                example_text = self._gen_example_text(prompt_lang, n_shots, sample_data)
            else:
                example_text = self._get_static_example_text(prompt_lang, n_shots)
        else:
            example_text = ""

        prompt = prompt_template.format(
            example_shots=example_text, **self.get_prompt_variables(data)
        )
        prompt = prompt.replace("\n\n\n", "\n")

        return prompt

    def get_prompt_variables(self, data: dict) -> dict:
        mapped_data = {}
        for pk, dk in self.prompt_variables_mapping.items():
            if isinstance(dk, ConstantPromptVariable):
                mapped_data[pk] = dk.value
            else:
                if dk not in data:
                    raise ValueError(f"Key {dk} not in data.")
                mapped_data[pk] = data[dk]
        return mapped_data

    def get_target_variables(self, data: dict) -> dict:
        mapped_data = {}
        for tk, dk in self.target_variables_mapping.items():
            if dk not in data:
                raise ValueError(f"Key {dk} not in data.")
            mapped_data[tk] = data[dk]
        return mapped_data

    def _get_static_example_text(self, prompt_lang: str, n_shots: int) -> str:
        shot_examples = self.prompt_shot_examples.get(prompt_lang, None)
        if not shot_examples:
            raise RuntimeError(f"There is no shot example for language {prompt_lang}.")

        example_text = shot_examples.get(n_shots, "")
        if not example_text:
            raise RuntimeError(
                f"There is no {n_shots}-shot example for langauge {prompt_lang}. "
                f"You can only use {', '.join(map(str, shot_examples.keys()))} shot examples."
            )

        return example_text

    def _gen_example_text(
        self, prompt_lang: str, n_shots: int, sample_data: list[dict]
    ) -> str:
        if len(sample_data) != n_shots:
            raise RuntimeError(
                f"The number of samples ({len(sample_data)}) is not equal to the number of shots ({n_shots})."
            )

        if shot_template := self.prompt_shot_templates.get(prompt_lang):
            example_text = "\n".join(
                shot_template.format(
                    **self.get_prompt_variables(sample),
                    **self.get_target_variables(sample),
                )
                for sample in sample_data
            )
        else:
            sample_variables = [
                {
                    **self.get_prompt_variables(sample),
                    **self.get_target_variables(sample),
                }
                for sample in sample_data
            ]
            example_text = "\n".join(
                "\n".join(f"{k.capitalize()}:\n{v}" for k, v in variables)
                for variables in sample_variables
            )

        return example_text

    @property
    def has_shot_templates(self) -> bool:
        return bool(self.prompt_shot_templates)

    @property
    def has_shot_examples(self) -> bool:
        return bool(self.prompt_shot_examples)
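
A minimal construction sketch follows; the template strings and data keys are invented for illustration. Exactly one of prompt_shot_templates or prompt_shot_examples must be provided, and the {example_shots} placeholder marks where few-shot examples are injected.

# Illustrative sketch; template strings and data keys are made up.
template = PromptTemplate(
    language_templates={
        "en": "Answer the question.\n{example_shots}\nQuestion: {question}\nAnswer:",
    },
    prompt_variables_mapping={"question": "question_text"},
    target_variables_mapping={"answer": "answer_text"},
    prompt_shot_templates={
        "en": "Question: {question}\nAnswer: {answer}",
    },
)

sample = {"question_text": "2 + 2 = ?", "answer_text": "4"}
prompt = template.get_prompt("en", sample, n_shots=1, sample_data=[sample])
# The 1-shot example is rendered from the shot template, followed by the question itself.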

LazyLoadTemplates

Bases: Mapping

A Mapping that lazily loads template contents, reading each template file only on first access.

Attributes:
    template_paths (dict[str, str]): A dictionary mapping template keys to file paths.

Source code in parsbench/tasks/base/prompt_template.py
class LazyLoadTemplates(Mapping):
    """
    A class representing lazy loading of templates.

    Inherits from Mapping.

    Attributes:
        template_paths (dict[str, str]): A dictionary mapping template keys to file paths.
    """

    def __init__(self, template_paths: dict[str, str] | None = None, **kwargs):
        super().__init__()
        self.template_paths = template_paths or kwargs or {}
        self._contents: dict[str, str | None] = {
            key: None for key in self.template_paths
        }

    def _load_content(self, key):
        if key in self.template_paths:
            with open(self.template_paths[key], "r") as file:
                self._contents[key] = file.read()
        else:
            raise KeyError(f"Key '{key}' not found in template_paths")

    def __getitem__(self, key) -> str:
        if key not in self._contents:
            raise KeyError(f"Key '{key}' not found")
        if self._contents[key] is None:
            self._load_content(key)
        return self._contents[key]

    def __getattr__(self, key) -> str:
        try:
            return self.__getitem__(key)
        except KeyError:
            raise AttributeError(f"Attribute '{key}' not found")

    def __iter__(self):
        return iter(self.template_paths)

    def __len__(self):
        return len(self.template_paths)
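
Usage sketch (the file paths are placeholders): each template file is read only the first time it is accessed and then served from the in-memory cache.

# Illustrative usage; the paths are placeholders.
templates = LazyLoadTemplates({
    "fa": "templates/prompt_fa.txt",
    "en": "templates/prompt_en.txt",
})

fa_prompt = templates["fa"]              # the file is opened and read on first access
fa_again = templates.fa                  # attribute access works too; served from cache
print(len(templates), list(templates))   # Mapping interface: 2, ['fa', 'en']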

PromptShotEvaluationResult

A data class representing the evaluation result for a prompt shot, including the number of shots and the corresponding score.

Attributes:
    n_shots (int): The number of shots for the evaluation.
    score (float): The score obtained for the prompt shot evaluation.

Source code in parsbench/tasks/base/evaluation_result.py
@dataclass
class PromptShotEvaluationResult:
    """
    A data class representing the evaluation result for a prompt shot, including the number of shots and the corresponding score.

    Attributes:
        n_shots (int): The number of shots for the evaluation.
        score (float): The score obtained for the prompt shot evaluation.
    """

    n_shots: int
    score: float

    @classmethod
    def from_dict(cls, data: dict) -> "PromptShotEvaluationResult":
        return cls(**data)

    def to_dict(self) -> dict:
        return asdict(self)

    def to_pandas(self) -> pd.DataFrame:
        return pd.DataFrame([self])

    def __str__(self) -> str:
        return f"{self.n_shots}-shot score: {self.score:.4f}"

EvaluationResult

A data class representing the evaluation result for a model on a specific task, including the model name, task name, task category, score name, prompt shot results, and optional sub-task.

Attributes:
    model_name (str): The name of the model being evaluated.
    task_name (str): The name of the task for which the model is being evaluated.
    task_category (TaskCategory): The category of the task (e.g., CLASSIC, REASONING, MATH, KNOWLEDGE).
    score_name (str): The name of the score obtained for the evaluation.
    prompt_shot_results (list[PromptShotEvaluationResult]): A list of PromptShotEvaluationResult objects representing the evaluation results for prompt shots.
    sub_task (str, optional): The name of the sub-task being evaluated, if applicable.

Source code in parsbench/tasks/base/evaluation_result.py
@dataclass
class EvaluationResult:
    """
    A data class representing the evaluation result for a model on a specific task, including the model name, task name, task category, score name, prompt shot results, and optional sub-task.

    Attributes:
        model_name (str): The name of the model being evaluated.
        task_name (str): The name of the task for which the model is being evaluated.
        task_category (TaskCategory): The category of the task (e.g., CLASSIC, REASONING, MATH, KNOWLEDGE).
        score_name (str): The name of the score obtained for the evaluation.
        prompt_shot_results (list[PromptShotEvaluationResult]): A list of PromptShotEvaluationResult objects representing the evaluation results for prompt shots.
        sub_task (str, optional): The name of the sub-task being evaluated, if applicable.
    """

    model_name: str
    task_name: str
    task_category: TaskCategory
    score_name: str
    prompt_shot_results: list[PromptShotEvaluationResult]
    sub_task: str | None = None

    @classmethod
    def from_file(cls, path: str) -> "EvaluationResult":
        with jsonlines.open(path, "r") as reader:
            data = reader.read(type=dict)
            return cls.from_dict(data)

    @classmethod
    def from_dict(cls, data: dict) -> "EvaluationResult":
        prompt_shot_results = [
            PromptShotEvaluationResult.from_dict(psr)
            for psr in data.pop("prompt_shot_results")
        ]
        data["task_category"] = TaskCategory[data["task_category"].upper()]
        return cls(**data, prompt_shot_results=prompt_shot_results)

    def to_dict(self) -> dict:
        return {
            **asdict(self),
            "prompt_shot_results": [e.to_dict() for e in self.prompt_shot_results],
        }

    def to_pandas(self) -> pd.DataFrame:
        data = [
            {
                "model_name": self.model_name,
                "task_name": self.task_name,
                "task_category": self.task_category.value,
                "sub_task": self.sub_task,
                "n_shots": psr.n_shots,
                "score_name": self.score_name,
                "score": psr.score,
            }
            for psr in self.prompt_shot_results
        ]
        return pd.DataFrame(data)

    def save(self, path: str):
        file_name = (
            f"evaluation_{self.sub_task}.jsonl" if self.sub_task else "evaluation.jsonl"
        )
        task_path = path / file_name

        with jsonlines.open(task_path, "w") as writer:
            writer.write(self.to_dict())

    def __str__(self) -> str:
        text = f"Model: {self.model_name}\nTask: {self.task_name}"

        if self.sub_task:
            text += f" ({self.sub_task})"

        text += "\nScore:\n"

        for psr in self.prompt_shot_results:
            text += f" - {psr.n_shots}-shot prompt: {psr.score:.4f}\n"
        return text.strip("\n")

    @property
    def average_score(self) -> float:
        return sum([psr.score for psr in self.prompt_shot_results]) / len(
            self.prompt_shot_results
        )

    @property
    def max_score(self) -> float:
        return max([psr.score for psr in self.prompt_shot_results])
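
Since to_pandas() yields one row per prompt-shot result, the results returned by Task.evaluate can be combined into a single DataFrame, as sketched below (the results variable is assumed to be the list returned by evaluate()).

import pandas as pd

def results_to_frame(results: list) -> pd.DataFrame:
    # Each EvaluationResult contributes one row per (sub_task, n_shots) pair.
    return pd.concat([r.to_pandas() for r in results], ignore_index=True)

# df = results_to_frame(results)              # `results` from Task.evaluate(...)
# df.groupby("task_name")["score"].mean()     # average score across shots and sub-tasks
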
TaskMatch

A data class representing a single evaluation match: an id, the prompt, the target, and the model's completion (raw and formatted) together with its score.

Source code in parsbench/tasks/base/task_match.py
@dataclass
class TaskMatch:
    id: int
    prompt: str
    target: str
    completion: str | None = None
    formatted_completion: str | None = None
    score: int | None = None

    @classmethod
    def from_dict(cls, data: dict) -> "TaskMatch":
        return cls(**data)

    def format_completion(self, formatter: Callable[[str], str]):
        self.formatted_completion = formatter(self.completion)

    def format_prompt(self, formatter: Callable[[str], str]):
        self.prompt = formatter(self.prompt)

    def format_target(self, formatter: Callable[[str], str]):
        self.target = formatter(self.target)

    def to_dict(self) -> dict:
        return asdict(self)

    def to_pandas(self) -> pd.DataFrame:
        return pd.DataFrame([self])

    @property
    def cleaned_completion(self) -> str | None:
        return self.formatted_completion or self.completion

TaskMatchGroup

A data class grouping the TaskMatch objects generated for a given number of shots, with helpers for loading, saving, and formatting matches.

Source code in parsbench/tasks/base/task_match.py
@dataclass
class TaskMatchGroup:
    n_shots: int
    matches: list[TaskMatch]

    def __iter__(self):
        yield from iter(self.matches)

    def __len__(self) -> int:
        return len(self.matches)

    @classmethod
    def from_file(
        cls, path: str, n_shots: int, sub_task: str | None
    ) -> "TaskMatchGroup":
        if sub_task:
            matches_path = path / f"matches_{sub_task}_{n_shots}_shot.jsonl"
        else:
            matches_path = path / f"matches_{n_shots}_shot.jsonl"

        with jsonlines.open(matches_path, "r") as reader:
            matches: list[TaskMatch] = []
            for row in reader.iter(type=dict, skip_invalid=True):
                matches.append(TaskMatch.from_dict(row))

        return cls(n_shots=n_shots, matches=matches)

    @classmethod
    def from_dict(cls, data: dict) -> "TaskMatchGroup":
        matches = [TaskMatch.from_dict(m) for m in data.pop("matches")]
        return cls(**data, matches=matches)

    def format_completions(self, formatter: Callable[[str], str]):
        for m in self.matches:
            m.format_completion(formatter)

    def format_prompts(self, formatter: Callable[[str], str]):
        for m in self.matches:
            m.format_prompt(formatter)

    def format_targets(self, formatter: Callable[[str], str]):
        for m in self.matches:
            m.format_target(formatter)

    def to_dict(self) -> dict:
        return {
            **asdict(self),
            "matches": [match.to_dict() for match in self.matches],
        }

    def to_pandas(self) -> pd.DataFrame:
        df = pd.DataFrame(
            [
                {
                    **asdict(match),
                    "n_shots": self.n_shots,
                }
                for match in self.matches
            ]
        )
        return df

    def save(self, path: str, sub_task: str | None):
        if sub_task:
            matches_path = path / f"matches_{sub_task}_{self.n_shots}_shot.jsonl"
        else:
            matches_path = path / f"matches_{self.n_shots}_shot.jsonl"

        with jsonlines.open(matches_path, "w") as writer:
            writer.write_all(self.to_dict()["matches"])

    @property
    def prompts(self) -> list[str]:
        return [m.prompt for m in self.matches]

    @property
    def targets(self) -> list[str]:
        return [m.target for m in self.matches]

    @property
    def completions(self) -> list[str | None]:
        return [m.completion for m in self.matches]

    @property
    def scores(self) -> list[int | None]:
        return [m.score for m in self.matches]
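
The format_* helpers take a plain str-to-str callable. The sketch below normalizes completions before scoring by keeping only the first non-empty line (match_group is assumed to be a TaskMatchGroup whose completions have already been generated).

def first_line(completion: str) -> str:
    # Guard against None or whitespace-only completions, then keep the first line.
    stripped = (completion or "").strip()
    return stripped.splitlines()[0] if stripped else ""

match_group.format_completions(first_line)              # fills formatted_completion
answers = [m.cleaned_completion for m in match_group]   # formatted if set, else raw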

load_all_tasks()

Load all tasks from the 'parsbench.tasks' package and return a list of Task classes.

Returns:
    list[Task]: A list of Task classes representing all tasks found in the 'parsbench.tasks' package.

Source code in parsbench/tasks/utils.py
def load_all_tasks() -> list[Task]:
    """
    Load all tasks from the 'parsbench.tasks' package and return a list of Task objects.

    Returns:
        list[Task]: A list of Task objects representing all tasks found in the 'parsbench.tasks' package.

    """
    tasks: list[Task] = []

    module = importlib.import_module("parsbench.tasks")
    for attr_name in dir(module):
        attr = getattr(module, attr_name)
        if isinstance(attr, type) and issubclass(attr, Task) and attr is not Task:
            tasks.append(attr)

    return tasks
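
A small sketch of iterating over the discovered tasks follows; note that load_all_tasks returns the Task classes themselves, so each one is instantiated before use (this assumes concrete tasks set task_name and task_category as class attributes, as the base class annotations suggest).

# Sketch: list every bundled task, then instantiate one to evaluate it.
for task_cls in load_all_tasks():
    print(task_cls.task_name, task_cls.task_category)

# task = task_cls()            # instantiate a concrete task
# with task as t:
#     t.evaluate(my_model)     # my_model is a placeholder Model instance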