Skip to content

Commit 4ec57fc

Browse files
committed
improve tests
1 parent 76c827f commit 4ec57fc

5 files changed

Lines changed: 67 additions & 19 deletions

File tree

Lines changed: 62 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,67 @@
11
import unittest
2-
3-
import fitz
2+
from unittest.mock import MagicMock, patch
43

54
import title
65

76
class TestGenerateTitle(unittest.TestCase):
8-
def test_prefers_metadata_title(self):
9-
with fitz.open("./testdata/lithium-longterm.pdf") as doc:
10-
self.assertEqual("Long-Term Lithium Therapy: Side Effects and Interactions", title.generate_title(doc))
11-
pass
12-
13-
def test_falls_back_to_first_sentence(self):
14-
with fitz.open("./testdata/advancespharmaco.pdf") as doc:
15-
expected_title = "Advances in Mood Disorder Pharmacotherapy: Evaluating New Antipsychotics and Mood Stabilizers for Bipolar Disorder and Schizophrenia"
16-
self.assertEqual(expected_title, title.generate_title(doc))
17-
18-
def test_australasian_psychiatry(self):
19-
with fitz.open("./testdata/creativitystabilizer.pdf") as doc:
20-
expected_title = "Impact of mood stabilizers on creativity"
21-
self.assertEqual(expected_title, title.generate_title(doc))
7+
def test_prefers_metadata_title_if_valid(self):
8+
doc = MagicMock()
9+
doc.metadata = {"title": "A Study Regarding The Efficacy of Drugs"}
10+
self.assertEqual("A Study Regarding The Efficacy of Drugs", title.generate_title(doc))
11+
12+
def test_falls_back_to_first_page_text_if_metadata_title_is_empty(self):
13+
doc = MagicMock()
14+
doc.metadata = {"title": ""}
15+
doc[0].get_text = MagicMock()
16+
17+
foo_block = [None] * 7
18+
foo_block[4] = "foo"
19+
foo_block[6] = 0
20+
21+
title_block = [None] * 7
22+
title_block[4] = "Advances in Mood Disorder Pharmacotherapy: Evaluating New Antipsychotics and Mood Stabilizers for Bipolar Disorder and Schizophrenia"
23+
title_block[6] = 0
24+
25+
bar_block = [None] * 7
26+
bar_block[4] = "bar"
27+
bar_block[6] = 0
28+
doc[0].get_text.return_value = [foo_block, title_block, bar_block]
29+
30+
expected_title = "Advances in Mood Disorder Pharmacotherapy: Evaluating New Antipsychotics and Mood Stabilizers for Bipolar Disorder and Schizophrenia"
31+
self.assertEqual(expected_title, title.generate_title(doc))
32+
33+
def test_falls_back_to_first_page_text_if_metadata_title_does_not_match_regex(self):
34+
doc = MagicMock()
35+
doc.metadata = {"title": "abcd1234"}
36+
doc[0].get_text = MagicMock()
37+
38+
foo_block = [None] * 7
39+
foo_block[4] = "foo"
40+
foo_block[6] = 0
41+
42+
title_block = [None] * 7
43+
title_block[4] = "Advances in Mood Disorder Pharmacotherapy: Evaluating New Antipsychotics and Mood Stabilizers for Bipolar Disorder and Schizophrenia"
44+
title_block[6] = 0
45+
46+
bar_block = [None] * 7
47+
bar_block[4] = "bar"
48+
bar_block[6] = 0
49+
doc[0].get_text.return_value = [foo_block, title_block, bar_block]
50+
51+
expected_title = "Advances in Mood Disorder Pharmacotherapy: Evaluating New Antipsychotics and Mood Stabilizers for Bipolar Disorder and Schizophrenia"
52+
self.assertEqual(expected_title, title.generate_title(doc))
53+
54+
@patch("server.api.services.openai_services.openAIServices.openAI")
55+
def test_falls_back_to_chatgpt_if_no_title_found(self, mock_openAI):
56+
doc = MagicMock()
57+
doc.metadata = {"title": None}
58+
doc.get_text.return_value = []
59+
60+
mock_response = MagicMock()
61+
mock_response.choices = [MagicMock()]
62+
mock_response.choices[0].message.content = "A Study Regarding The Efficacy of Drugs"
63+
mock_openAI.return_value = mock_response
64+
65+
title.generate_title(doc)
66+
67+
self.assertTrue(mock_openAI.called)

server/api/views/uploadFile/testdata/advancespharmaco.pdf renamed to server/api/views/uploadFile/testdata/advances-in-mood-disorder.pdf

File renamed without changes.

server/api/views/uploadFile/testdata/creativitystabilizer.pdf renamed to server/api/views/uploadFile/testdata/impact-mood-stabilizers-creativity.pdf

File renamed without changes.

server/api/views/uploadFile/testdata/lithium-longterm.pdf renamed to server/api/views/uploadFile/testdata/longterm-lithium-therapy.pdf

File renamed without changes.

server/api/views/uploadFile/title.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from server.api.services.openai_services import openAIServices
66

77
# Regular expression to match common research white paper titles. Created by ChatGPT.
8+
# Requires at least 3 words; rejects four-digit years (19xx/20xx) and version tokens such as "v2".
89
title_regex = re.compile(r'^(?=(?:\b\w+\b[\s:,\-\(\)]*){3,})(?!.*\b(?:19|20)\d{2}\b)(?!.*\bv\d+\b)[A-Za-z0-9][\w\s:,\-\(\)]*[A-Za-z\)]$', re.IGNORECASE)
910

1011
def generate_title(pdf: fitz.Document) -> str | None:
@@ -13,6 +14,8 @@ def generate_title(pdf: fitz.Document) -> str | None:
1314
if title_regex.match(document_metadata_title):
1415
print("suitable title was found in metadata")
1516
return document_metadata_title.strip()
17+
else:
18+
print("metadata title did not match regex")
1619

1720
print("Looking for title in first page text")
1821
first_page = pdf[0]
@@ -23,17 +26,16 @@ def generate_title(pdf: fitz.Document) -> str | None:
2326
if block[6] == 0 # only include text blocks.
2427
]
2528

29+
# Extracted PDF text can contain runs of whitespace; collapse each run of two or more whitespace characters into a single space.
2630
regex = r"\s{2,}"
2731
text_blocks = [re.sub(regex, " ", text) for text in text_blocks]
2832

29-
# replace redundant whitespaces with single space.
3033
if len(text_blocks) != 0:
3134
for text in text_blocks:
3235
if title_regex.match(text):
33-
print(f"suitable title was found in first page text {text}")
3436
return text
3537

36-
print("using chatgpt to generate title")
38+
print("no suitable title found in first page text. Using GPT-4 to summarize the PDF")
3739
gpt_title = summarize_pdf(pdf)
3840
return gpt_title or None
3941

0 commit comments

Comments
 (0)