Skip to content

Commit a7758b5

Browse files
fix(litellm): Avoid double span exits when streaming
1 parent 60a3f06 commit a7758b5

3 files changed

Lines changed: 174 additions & 19 deletions

File tree

sentry_sdk/integrations/litellm.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -230,8 +230,14 @@ def _success_callback(
230230
)
231231

232232
finally:
233-
# Always finish the span and clean up
234-
span.__exit__(None, None, None)
233+
is_streaming = kwargs.get("stream")
234+
# Callback is fired multiple times when streaming a response.
235+
# Flag checked at https://github.com/BerriAI/litellm/blob/33c3f13443eaf990ac8c6e3da78bddbc2b7d0e7a/litellm/litellm_core_utils/litellm_logging.py#L1603
236+
if (
237+
is_streaming is not True
238+
or kwargs.get("complete_streaming_response") is not None
239+
):
240+
span.__exit__(None, None, None)
235241

236242

237243
def _failure_callback(

tests/conftest.py

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1063,6 +1063,120 @@ def inner(response_content, serialize_pydantic=False, request_headers=None):
10631063
return inner
10641064

10651065

1066+
@pytest.fixture
def streaming_chat_completions_model_response():
    """Return the chunk sequence of a streamed chat completion.

    The chunks spell out the assistant reply "Test response": an initial
    role-only delta, five content deltas, and a final empty delta carrying
    ``finish_reason="stop"`` plus the usage totals — mirroring the shape the
    OpenAI API emits when streaming.
    """

    def _chunk(delta, finish_reason=None, usage=None):
        # All chunks share the same envelope; only delta/finish_reason/usage
        # vary, so build the boilerplate in one place.
        return openai.types.chat.ChatCompletionChunk(
            id="chatcmpl-test",
            object="chat.completion.chunk",
            created=10000000,
            model="gpt-3.5-turbo",
            choices=[
                openai.types.chat.chat_completion_chunk.Choice(
                    index=0,
                    delta=delta,
                    finish_reason=finish_reason,
                ),
            ],
            # usage is Optional and defaults to None, so passing None here is
            # identical to omitting it on the non-final chunks.
            usage=usage,
        )

    delta_cls = openai.types.chat.chat_completion_chunk.ChoiceDelta

    return [
        # First chunk announces the assistant role with no content yet.
        _chunk(delta_cls(role="assistant")),
        # Content arrives split across several deltas: "Test response".
        _chunk(delta_cls(content="Tes")),
        _chunk(delta_cls(content="t r")),
        _chunk(delta_cls(content="esp")),
        _chunk(delta_cls(content="ons")),
        _chunk(delta_cls(content="e")),
        # Terminal chunk: empty delta, stop reason, and the usage summary.
        _chunk(
            delta_cls(),
            finish_reason="stop",
            usage=openai.types.CompletionUsage(
                prompt_tokens=10,
                completion_tokens=20,
                total_tokens=30,
            ),
        ),
    ]
1178+
1179+
10661180
@pytest.fixture
10671181
def nonstreaming_responses_model_response():
10681182
return openai.types.responses.Response(

tests/integrations/litellm/test_litellm.py

Lines changed: 52 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -31,10 +31,22 @@ async def __call__(self, *args, **kwargs):
3131
)
3232
from sentry_sdk.utils import package_version
3333

34+
from openai import OpenAI
35+
36+
from concurrent.futures import ThreadPoolExecutor
37+
38+
import litellm.litellm_core_utils.streaming_handler as streaming_handler
39+
3440

3541
LITELLM_VERSION = package_version("litellm")
3642

3743

44+
@pytest.fixture()
def reset_litellm_executor():
    """Recreate litellm's streaming thread pool after the test runs.

    The streaming test shuts the shared ``streaming_handler.executor`` down
    (to force pending logging callbacks to flush); replace it afterwards so
    subsequent streaming tests still have a live pool to submit work to.
    """
    yield
    # Fresh pool with litellm's default worker count.
    streaming_handler.executor = ThreadPoolExecutor(max_workers=100)
49+
3850
@pytest.fixture
3951
def clear_litellm_cache():
4052
"""
@@ -212,7 +224,14 @@ def test_nonstreaming_chat_completion(
212224
],
213225
)
214226
def test_streaming_chat_completion(
215-
sentry_init, capture_events, send_default_pii, include_prompts
227+
reset_litellm_executor,
228+
sentry_init,
229+
capture_events,
230+
send_default_pii,
231+
include_prompts,
232+
get_model_response,
233+
server_side_event_chunks,
234+
streaming_chat_completions_model_response,
216235
):
217236
sentry_init(
218237
integrations=[LiteLLMIntegration(include_prompts=include_prompts)],
@@ -222,29 +241,45 @@ def test_streaming_chat_completion(
222241
events = capture_events()
223242

224243
messages = [{"role": "user", "content": "Hello!"}]
225-
mock_response = MockCompletionResponse()
226244

227-
with start_transaction(name="litellm test"):
228-
kwargs = {
229-
"model": "gpt-3.5-turbo",
230-
"messages": messages,
231-
"stream": True,
232-
}
245+
client = OpenAI(api_key="z")
233246

234-
_input_callback(kwargs)
235-
_success_callback(
236-
kwargs,
237-
mock_response,
238-
datetime.now(),
239-
datetime.now(),
240-
)
247+
model_response = get_model_response(
248+
server_side_event_chunks(
249+
streaming_chat_completions_model_response,
250+
include_event_type=False,
251+
),
252+
request_headers={"X-Stainless-Raw-Response": "True"},
253+
)
254+
255+
with mock.patch.object(
256+
client.completions._client._client,
257+
"send",
258+
return_value=model_response,
259+
):
260+
with start_transaction(name="litellm test"):
261+
response = litellm.completion(
262+
model="gpt-3.5-turbo",
263+
messages=messages,
264+
client=client,
265+
stream=True,
266+
)
267+
for chunk in response:
268+
pass
269+
270+
streaming_handler.executor.shutdown(wait=True)
241271

242272
assert len(events) == 1
243273
(event,) = events
244274

245275
assert event["type"] == "transaction"
246-
assert len(event["spans"]) == 1
247-
(span,) = event["spans"]
276+
chat_spans = list(
277+
x
278+
for x in event["spans"]
279+
if x["op"] == OP.GEN_AI_CHAT and x["origin"] == "auto.ai.litellm"
280+
)
281+
assert len(chat_spans) == 1
282+
span = chat_spans[0]
248283

249284
assert span["op"] == OP.GEN_AI_CHAT
250285
assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True

0 commit comments

Comments (0)