Skip to content

Commit 03b7639

Browse files
committed
Replace block-position title extraction with font-size-based approach
The old "scan first couple pages" logic used get_text("blocks") and picked the first block matching a title regex, which frequently selected preambles, journal names, and article headers instead of the actual title. The new approach uses get_text("dict") to find the largest font size across the first few pages and collects contiguous runs of text at that size, since research paper titles are typically the largest font.
1 parent f2f4274 commit 03b7639

1 file changed

Lines changed: 68 additions & 23 deletions

File tree

server/api/views/uploadFile/title.py

Lines changed: 68 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -15,35 +15,80 @@ def generate_title(pdf: fitz.Document) -> str | None:
1515
document_metadata_title = pdf.metadata["title"]
1616
if document_metadata_title is not None and document_metadata_title != "":
1717
if title_regex.match(document_metadata_title):
18-
print("suitable title was found in metadata")
1918
return document_metadata_title.strip()
20-
else:
21-
print("metadata title did not match regex")
2219

23-
print("Looking for title in first page text")
24-
first_page = pdf[0]
25-
first_page_blocks = first_page.get_text("blocks")
26-
text_blocks = [
27-
block[4].strip().replace("\n", " ")
28-
for block in first_page_blocks
29-
if block[6] == 0 # only include text blocks.
30-
]
31-
32-
# For some reason, extracted PDF text has extra spaces. Collapse them here.
33-
regex = r"\s{2,}"
34-
text_blocks = [re.sub(regex, " ", text) for text in text_blocks]
35-
36-
if len(text_blocks) != 0:
37-
for text in text_blocks:
38-
if title_regex.match(text):
39-
return text
40-
41-
print(
42-
"no suitable title found in first page text. Using GPT-4 to summarize the PDF")
20+
font_title = extract_title_by_font_size(pdf)
21+
if font_title:
22+
return font_title
23+
4324
gpt_title = summarize_pdf(pdf)
4425
return gpt_title or None
4526

4627

28+
def extract_title_by_font_size(pdf: fitz.Document, max_pages: int = 3) -> str | None:
    """
    Extract the title by finding the largest font size across the first few pages
    and collecting contiguous runs of text at that size.

    Args:
        pdf: The opened document to scan.
        max_pages: Upper bound on how many leading pages are examined.

    Returns:
        The longest max-font-size run that matches ``title_regex``, truncated to
        255 characters, or ``None`` when no run qualifies (including when no
        usable text span is found at all).
    """
    # Span sizes are compared as floats; two spans set at the same nominal size
    # can differ by a tiny amount, so "at the max size" is tested with a small
    # tolerance instead of exact equality, which would split a single title
    # into several shorter candidates.
    size_tolerance = 0.01

    pages_to_scan = min(max_pages, len(pdf))

    # First pass: collect all spans with their font size, and find the max font size.
    all_spans = []
    max_font_size = 0.0

    for page_idx in range(pages_to_scan):
        page_dict = pdf[page_idx].get_text("dict")
        for block in page_dict["blocks"]:
            # Only type-0 blocks are processed; they are the ones carrying "lines".
            if block.get("type") != 0:
                continue
            for line in block["lines"]:
                for span in line["spans"]:
                    text = span["text"].strip()
                    size = span["size"]
                    # Skip empty/one-character spans and very small text so they
                    # neither set the max size nor interrupt a run.
                    if len(text) < 2 or size < 6.0:
                        continue
                    all_spans.append((text, size))
                    if size > max_font_size:
                        max_font_size = size

    if max_font_size == 0.0:
        return None

    # Second pass: gather contiguous runs of spans at the max font size.
    # Runs continue across block boundaries so multi-block titles (e.g.,
    # "BIPOLAR DISORDER IN PRIMARY CARE:" in one block and "DIAGNOSIS AND
    # MANAGEMENT" in the next) are joined into a single candidate.
    # A run only ends when a non-max-size span interrupts it.
    candidates = []
    current_run = []

    for text, size in all_spans:
        # size never exceeds max_font_size, so a one-sided difference suffices.
        if max_font_size - size <= size_tolerance:
            current_run.append(text)
        elif current_run:
            candidates.append(" ".join(current_run))
            current_run = []

    if current_run:
        candidates.append(" ".join(current_run))

    # Collapse extra whitespace, validate against title regex, and pick the longest match.
    # Longest wins because real titles are typically longer than section headers
    # (e.g., "About the Author") that may share the same max font size.
    best = None
    for candidate in candidates:
        cleaned = re.sub(r"\s{2,}", " ", candidate).strip()
        if title_regex.match(cleaned) and (best is None or len(cleaned) > len(best)):
            best = cleaned

    if best:
        # Truncation can cut right after a space; strip so the result stays clean.
        return best[:255].rstrip()

    return None
90+
91+
4792
def summarize_pdf(pdf: fitz.Document) -> str:
4893
"""
4994
Summarize a PDF document using OpenAI's GPT-4 model.

0 commit comments

Comments
 (0)