 # requires-python = "==3.11.11"
 # dependencies = [
 # "pandas==2.2.3",
-# "lighteval==0.10.0",
+# "lighteval==0.10.0",
 # "openai==1.83.0"
 # ]
 # ///
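The `# requires-python` / `# dependencies` comment block above is PEP 723 inline script metadata: runners such as `uv` parse it and build a matching throwaway environment before executing the file, so the script carries its own pins. A minimal sketch of a complete header (the `# /// script` opener line is not shown in this excerpt, and the file name `evaluate.py` is made up for illustration):

# /// script
# requires-python = "==3.11.11"
# dependencies = [
#     "pandas==2.2.3",
#     "lighteval==0.10.0",
#     "openai==1.83.0",
# ]
# ///

import pandas as pd  # resolvable because `uv run evaluate.py` installs the pins first

print(pd.__version__)  # 2.2.3 under the pinned environment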
 Evaluate LLM outputs using multiple metrics and compute associated costs
 """
 
-#This script evaluates LLM outputs using the `lighteval` library
-#https://huggingface.co/docs/lighteval/en/metric-list#automatic-metrics-for-generative-tasks
+# This script evaluates LLM outputs using the `lighteval` library
+# https://huggingface.co/docs/lighteval/en/metric-list#automatic-metrics-for-generative-tasks
 
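The linked metric list includes extractiveness metrics for generative tasks: coverage, density, and compression, following the extractive-fragment definitions of Grusky et al. (2018). As a rough illustration of what those numbers measure, here is a simplified plain-Python sketch of the definitions, not lighteval's implementation:

def extractive_fragments(article: list[str], summary: list[str]) -> list[list[str]]:
    # Greedily match each summary position to the longest token span
    # that also appears somewhere in the article (Grusky et al., 2018).
    fragments, i = [], 0
    while i < len(summary):
        best = 0
        for j in range(len(article)):
            k = 0
            while (i + k < len(summary) and j + k < len(article)
                   and summary[i + k] == article[j + k]):
                k += 1
            best = max(best, k)
        if best > 0:
            fragments.append(summary[i:i + best])
            i += best
        else:
            i += 1
    return fragments

def extractiveness(article: str, summary: str) -> dict[str, float]:
    # Assumes a non-empty summary; whitespace tokenization keeps the sketch short.
    a, s = article.split(), summary.split()
    frags = extractive_fragments(a, s)
    return {
        # share of summary tokens copied from the article
        "coverage": sum(len(f) for f in frags) / len(s),
        # length-weighted version: rewards long verbatim spans
        "density": sum(len(f) ** 2 for f in frags) / len(s),
        # how much shorter the summary is than the article
        "compression": len(a) / len(s),
    }

High coverage with low density means the summary reuses the article's words but recombines them; high density means long verbatim copies.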
-#This script uses Python 3.11 where prebuilt wheels for `sentencepiece` exist
+# This script uses Python 3.11 where prebuilt wheels for `sentencepiece` exist
 
 
-#TODO: Add tests on a small dummy dataset to confirm it handles errors gracefully and produces expected outputs
+# TODO: Add tests on a small dummy dataset to confirm it handles errors gracefully and produces expected outputs
 
 import sys
 import os

...
 )
 
 
-def evaluate_response(
-    model_name: str, query: str, context: str
-) -> pd.DataFrame:
+def evaluate_response(model_name: str, query: str, context: str) -> pd.DataFrame:
     """
     Evaluates the response of a model to a given query and context, computes extractiveness metrics, token usage, and cost
 
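On the token-usage and cost side of that docstring: with `openai` 1.x, the completion object carries `usage.prompt_tokens` and `usage.completion_tokens`, and cost is just those counts times a per-token price. A minimal sketch, with placeholder prices (real prices vary by model and change over time):

from openai import OpenAI

# Placeholder prices in USD per 1M tokens; NOT real price data.
PRICE_PER_1M = {"gpt-4o-mini": {"input": 0.15, "output": 0.60}}

def completion_with_cost(model_name: str, prompt: str) -> tuple[str, float]:
    client = OpenAI()  # reads OPENAI_API_KEY from the environment
    response = client.chat.completions.create(
        model=model_name,
        messages=[{"role": "user", "content": prompt}],
    )
    usage = response.usage  # prompt_tokens / completion_tokens counts
    prices = PRICE_PER_1M[model_name]
    cost = (usage.prompt_tokens * prices["input"]
            + usage.completion_tokens * prices["output"]) / 1_000_000
    return response.choices[0].message.content, cost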
@@ -91,6 +89,8 @@ def evaluate_response(
 
 
 if __name__ == "__main__":
+    # TODO: Add test evaluation argument to run on the first 10 rows of the config file
+
     # TODO: Add CLI argument to specify the metrics to be computed
     parser = argparse.ArgumentParser(
         description="Evaluate LLM outputs using multiple metrics and compute associated costs"
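Both TODOs in this hunk map onto standard argparse patterns. A sketch with hypothetical flag names (`--metrics`, `--test`) and an assumed `--config` CSV argument for the input table, since the loading code isn't shown in this diff:

parser.add_argument(
    "--metrics",
    nargs="+",
    default=["extractiveness"],
    help="names of the metrics to compute",
)
parser.add_argument(
    "--test",
    action="store_true",
    help="smoke-test run on only the first 10 rows of the config file",
)
args = parser.parse_args()

df_config = pd.read_csv(args.config)  # assumes a --config argument; not shown in the diff
if args.test:
    df_config = df_config.head(10)  # first 10 rows, per the TODO above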
@@ -149,9 +149,7 @@ def evaluate_response(
         df_evals = pd.concat(
             [
                 df_evals,
-                evaluate_response(
-                    row["Model Name"], row["Query"], row["Context"]
-                ),
+                evaluate_response(row["Model Name"], row["Query"], row["Context"]),
             ],
             axis=0,
         )
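One note on this hunk beyond the formatting change: calling `pd.concat` inside the loop re-copies every row accumulated so far on each iteration, which is quadratic in the number of rows. The idiomatic pandas pattern is to collect the per-row frames in a list and concatenate once, roughly as follows (keeping the column names from the diff and the assumed `df_config` input table):

results = [
    evaluate_response(row["Model Name"], row["Query"], row["Context"])
    for _, row in df_config.iterrows()
]
df_evals = pd.concat(results, axis=0, ignore_index=True)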