Skip to content

Commit f94dab3

Browse files
committed
add autoevals adapter
1 parent 285cc99 commit f94dab3

5 files changed

Lines changed: 401 additions & 142 deletions

File tree

langfuse/_client/experiments.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -747,3 +747,35 @@ async def _run_task(task: TaskFunction, item: ExperimentItem) -> Any:
747747
result = await result
748748

749749
return result
750+
751+
752+
def create_evaluator_from_autoevals(
    autoevals_evaluator: Any, **kwargs: Any
) -> EvaluatorFunction:
    """Create a Langfuse evaluator from an autoevals evaluator.

    Args:
        autoevals_evaluator: An autoevals evaluator instance. It is invoked
            as a callable with ``input``, ``output`` and ``expected`` keyword
            arguments, and its result must expose ``name``, ``score`` and
            ``metadata`` attributes.
        **kwargs: Additional keyword arguments forwarded to
            ``autoevals_evaluator`` on every invocation.

    Returns:
        A Langfuse-compatible evaluator function
    """

    def langfuse_evaluator(
        *,
        input: Any,
        output: Any,
        expected_output: Any,
        metadata: Optional[Dict[str, Any]],
        **langfuse_kwargs: Any,
    ) -> Evaluation:
        # The catch-all is deliberately NOT named ``kwargs``: doing so would
        # shadow the factory's ``kwargs`` and the options supplied to
        # ``create_evaluator_from_autoevals`` would never reach the
        # autoevals evaluator. ``metadata`` and ``langfuse_kwargs`` are
        # accepted only to satisfy the evaluator call signature and are not
        # forwarded.
        evaluation = autoevals_evaluator(
            input=input, output=output, expected=expected_output, **kwargs
        )

        # Langfuse's ``expected_output`` maps to the ``expected`` argument
        # above; the result's ``score`` maps to the Evaluation ``value``.
        return Evaluation(
            name=evaluation.name, value=evaluation.score, metadata=evaluation.metadata
        )

    return langfuse_evaluator

langfuse/experiment.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
from ._client.experiments import (
2+
Evaluation,
3+
EvaluatorFunction,
4+
ExperimentData,
5+
ExperimentItem,
6+
ExperimentItemResult,
7+
ExperimentResult,
8+
LocalExperimentItem,
9+
RunEvaluatorFunction,
10+
TaskFunction,
11+
create_evaluator_from_autoevals,
12+
)
13+
14+
__all__ = [
15+
"LocalExperimentItem",
16+
"ExperimentItem",
17+
"ExperimentData",
18+
"Evaluation",
19+
"ExperimentItemResult",
20+
"ExperimentResult",
21+
"TaskFunction",
22+
"EvaluatorFunction",
23+
"RunEvaluatorFunction",
24+
"create_evaluator_from_autoevals",
25+
]

langfuse/types.py

Lines changed: 0 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -39,19 +39,6 @@ def my_evaluator(*, output: str, **kwargs) -> Evaluation:
3939
from langfuse.api import MediaContentType, UsageDetails
4040
from langfuse.model import MapValue, ModelUsage, PromptClient
4141

42-
# Experiment types
43-
from ._client.experiments import (
44-
LocalExperimentItem,
45-
ExperimentItem,
46-
ExperimentData,
47-
Evaluation,
48-
ExperimentItemResult,
49-
ExperimentResult,
50-
TaskFunction,
51-
EvaluatorFunction,
52-
RunEvaluatorFunction,
53-
)
54-
5542
SpanLevel = Literal["DEBUG", "DEFAULT", "WARNING", "ERROR"]
5643

5744
ScoreDataType = Literal["NUMERIC", "CATEGORICAL", "BOOLEAN"]
@@ -116,19 +103,7 @@ class TraceContext(TypedDict):
116103
parent_span_id: NotRequired[str]
117104

118105

119-
# Export experiment types for easy access
120106
__all__ = [
121-
# Experiment types
122-
"LocalExperimentItem",
123-
"ExperimentItem",
124-
"ExperimentData",
125-
"Evaluation",
126-
"ExperimentItemResult",
127-
"ExperimentResult",
128-
"TaskFunction",
129-
"EvaluatorFunction",
130-
"RunEvaluatorFunction",
131-
# Core types (keeping existing functionality)
132107
"SpanLevel",
133108
"ScoreDataType",
134109
"TraceMetadata",

0 commit comments

Comments
 (0)