Skip to content

Commit 76c827f

Browse files
committed
tests
1 parent 91c4be0 commit 76c827f

4 files changed

Lines changed: 24 additions & 56 deletions

File tree

server/api/services/__init__.py

Whitespace-only changes.

server/api/views/uploadFile/__init__.py

Whitespace-only changes.
Lines changed: 5 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -1,64 +1,21 @@
1-
import json
1+
import unittest
22

33
import fitz
4-
import os
5-
6-
import unittest
7-
from os import listdir
8-
from os.path import isfile, join
94

10-
from title import generate_title
5+
import title
116

127
class TestGenerateTitle(unittest.TestCase):
13-
# TODO: Good use for parameterized tests.
148
def test_prefers_metadata_title(self):
159
with fitz.open("./testdata/lithium-longterm.pdf") as doc:
16-
self.assertEqual("Long-Term Lithium Therapy: Side Effects and Interactions", generate_title(doc))
10+
self.assertEqual("Long-Term Lithium Therapy: Side Effects and Interactions", title.generate_title(doc))
1711
pass
1812

1913
def test_falls_back_to_first_sentence(self):
2014
with fitz.open("./testdata/advancespharmaco.pdf") as doc:
2115
expected_title = "Advances in Mood Disorder Pharmacotherapy: Evaluating New Antipsychotics and Mood Stabilizers for Bipolar Disorder and Schizophrenia"
22-
self.assertEqual(expected_title, generate_title(doc))
16+
self.assertEqual(expected_title, title.generate_title(doc))
2317

2418
def test_australasian_psychiatry(self):
2519
with fitz.open("./testdata/creativitystabilizer.pdf") as doc:
2620
expected_title = "Impact of mood stabilizers on creativity"
27-
self.assertEqual(expected_title, generate_title(doc))
28-
29-
def test_largest_text(self):
30-
first_page_json = None
31-
with open("/home/ricanontherun/Documents/balancer/uploads/output.json", "r") as f:
32-
content = ""
33-
for line in f:
34-
content += line
35-
first_page_json = json.loads(content)
36-
37-
blocks = first_page_json["blocks"]
38-
text_blocks = [block for block in blocks if block["type"] == 0]
39-
40-
print(json.dumps(text_blocks[0], indent=4))
41-
42-
def test_remaining(self):
43-
uploads_dir = "~/Documents/balancer/uploads"
44-
45-
# iterate over the files in uploads_dir
46-
# Expand the ~ to the actual home directory path
47-
expanded_path = os.path.expanduser(uploads_dir)
48-
49-
# Get all files in the directory
50-
onlyfiles = [f for f in listdir(expanded_path) if isfile(join(expanded_path, f))]
51-
52-
# Filter for PDF files
53-
pdf_files = [f for f in onlyfiles if f.lower().endswith('.pdf')]
54-
55-
for pdf_file in pdf_files:
56-
file_path = join(expanded_path, pdf_file)
57-
with fitz.open(file_path) as doc:
58-
title = generate_title(doc)
59-
# Print the filename and its generated title for debugging
60-
print(f"File: {pdf_file}, Title: {title}")
61-
# Assert that a title is generated (not empty)
62-
self.assertIsNotNone(title)
63-
self.assertNotEqual("", title)
64-
21+
self.assertEqual(expected_title, title.generate_title(doc))

server/api/views/uploadFile/title.py

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2,35 +2,46 @@
22

33
import fitz
44

5-
from ...services.openai_services import openAIServices
5+
from server.api.services.openai_services import openAIServices
66

7-
title_regex = re.compile(r'^([a-z0-9:-]+\s?)+$', re.IGNORECASE)
7+
# Regular expression to match common research white-paper titles. Created by ChatGPT.
8+
title_regex = re.compile(r'^(?=(?:\b\w+\b[\s:,\-\(\)]*){3,})(?!.*\b(?:19|20)\d{2}\b)(?!.*\bv\d+\b)[A-Za-z0-9][\w\s:,\-\(\)]*[A-Za-z\)]$', re.IGNORECASE)
89

910
def generate_title(pdf: fitz.Document) -> str | None:
10-
# 1. Check the Document's metadata first, likely to be the highest quality title if present.
1111
document_metadata_title = pdf.metadata["title"]
1212
if document_metadata_title is not None and document_metadata_title != "":
1313
if title_regex.match(document_metadata_title):
1414
print("suitable title was found in metadata")
1515
return document_metadata_title.strip()
1616

17-
print("Looking for title in first page text blocks")
18-
# 2. Find the first text-block which matches the title regex - likely to be on the first page.
17+
print("Looking for title in first page text")
1918
first_page = pdf[0]
2019
first_page_blocks = first_page.get_text("blocks")
21-
text_blocks = [block[4].replace('\n', '').strip() for block in first_page_blocks if block[6] == 0]
20+
text_blocks = [
21+
block[4].strip().replace("\n", " ")
22+
for block in first_page_blocks
23+
if block[6] == 0 # only include text blocks.
24+
]
25+
26+
regex = r"\s{2,}"
27+
text_blocks = [re.sub(regex, " ", text) for text in text_blocks]
28+
29+
# Runs of redundant whitespace are collapsed into a single space by the re.sub above.
2230
if len(text_blocks) != 0:
2331
for text in text_blocks:
2432
if title_regex.match(text):
25-
print("suitable title was found in first page text blocks")
33+
print(f"suitable title was found in first page text {text}")
2634
return text
2735

28-
print("falling back to chatgpt")
36+
print("using chatgpt to generate title")
2937
gpt_title = summarize_pdf(pdf)
3038
return gpt_title or None
3139

3240

3341
def summarize_pdf(pdf: fitz.Document) -> str:
42+
"""
43+
Summarize a PDF document using OpenAI's GPT-4 model.
44+
"""
3445
first_page = pdf[0]
3546
first_page_content = first_page.get_text()
3647

0 commit comments

Comments
 (0)