Commit fe302b5

ADD TODOs

1 parent 4f8cbad commit fe302b5

1 file changed: evaluation/evals.py (9 additions & 11 deletions)

evaluation/evals.py

@@ -3,7 +3,7 @@
 # requires-python = "==3.11.11"
 # dependencies = [
 #     "pandas==2.2.3",
-#    "lighteval==0.10.0",
+#     "lighteval==0.10.0",
 #     "openai==1.83.0"
 # ]
 # ///
@@ -12,13 +12,13 @@
 Evaluate LLM outputs using multiple metrics and compute associated costs
 """
 
-#This script evaluates LLM outputs using the `lighteval` library
-#https://huggingface.co/docs/lighteval/en/metric-list#automatic-metrics-for-generative-tasks
+# This script evaluates LLM outputs using the `lighteval` library
+# https://huggingface.co/docs/lighteval/en/metric-list#automatic-metrics-for-generative-tasks
 
-#This script uses Python 3.11 where prebuilt wheels for `sentencepiece` exist
+# This script uses Python 3.11 where prebuilt wheels for `sentencepiece` exist
 
 
-#TODO: Add tests on a small dummy dataset to confirm it handles errors gracefully and produces expected outputs
+# TODO: Add tests on a small dummy dataset to confirm it handles errors gracefully and produces expected outputs
 
 import sys
 import os
@@ -40,9 +40,7 @@
 )
 
 
-def evaluate_response(
-    model_name: str, query: str, context: str
-) -> pd.DataFrame:
+def evaluate_response(model_name: str, query: str, context: str) -> pd.DataFrame:
     """
     Evaluates the response of a model to a given query and context, computes extractiveness metrics, token usage, and cost
 
@@ -91,6 +89,8 @@ def evaluate_response(
 
 
 if __name__ == "__main__":
+    # TODO: Add test evaluation argument to run on the first 10 rows of the config file
+
     # TODO: Add CLI argument to specify the metrics to be computed
     parser = argparse.ArgumentParser(
         description="Evaluate LLM outputs using multiple metrics and compute associated costs"
@@ -149,9 +149,7 @@ def evaluate_response(
         df_evals = pd.concat(
             [
                 df_evals,
-                evaluate_response(
-                    row["Model Name"], row["Query"], row["Context"]
-                ),
+                evaluate_response(row["Model Name"], row["Query"], row["Context"]),
             ],
             axis=0,
         )
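
The first new TODO asks for tests on a small dummy dataset. A minimal sketch of what such a test could look like, assuming pytest, an import path of evaluation.evals, and that evaluate_response returns a non-empty pd.DataFrame on success; the model name and inputs below are made up:

# Sketch only: pytest, the import path, and the non-empty-DataFrame
# expectation are assumptions, not confirmed by the commit.
import pandas as pd

from evaluation.evals import evaluate_response


def test_evaluate_response_on_dummy_row():
    # Dummy inputs mirroring the config columns the script iterates over
    # ("Model Name", "Query", "Context"); the model name is hypothetical.
    df = evaluate_response(
        model_name="gpt-4o-mini",
        query="What is the capital of France?",
        context="France is a country in Europe. Its capital is Paris.",
    )
    assert isinstance(df, pd.DataFrame)
    assert not df.empty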

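The other two TODOs both touch the argparse block under if __name__ == "__main__":. One way they might be wired up, reusing the parser description from the diff; the flag names --metrics and --test-run, their defaults, and the inline config DataFrame are all hypothetical:

# Sketch only: flag names and defaults are hypothetical, not the script's API.
import argparse

import pandas as pd

parser = argparse.ArgumentParser(
    description="Evaluate LLM outputs using multiple metrics and compute associated costs"
)
# TODO sketch: let callers pick which metrics are computed
parser.add_argument(
    "--metrics",
    nargs="+",
    default=None,
    help="Names of the metrics to compute (default: all)",
)
# TODO sketch: quick test evaluation over the first 10 rows of the config file
parser.add_argument(
    "--test-run",
    action="store_true",
    help="Only evaluate the first 10 rows of the config file",
)
args = parser.parse_args()

# Hypothetical stand-in for the config file, with the columns the diff uses
df_config = pd.DataFrame(
    [{"Model Name": "gpt-4o-mini", "Query": "q", "Context": "c"}]
)
if args.test_run:
    df_config = df_config.head(10)  # first 10 rows only, per the TODO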