@@ -15,35 +15,80 @@ def generate_title(pdf: fitz.Document) -> str | None:
1515 document_metadata_title = pdf .metadata ["title" ]
1616 if document_metadata_title is not None and document_metadata_title != "" :
1717 if title_regex .match (document_metadata_title ):
18- print ("suitable title was found in metadata" )
1918 return document_metadata_title .strip ()
20- else :
21- print ("metadata title did not match regex" )
2219
23- print ("Looking for title in first page text" )
24- first_page = pdf [0 ]
25- first_page_blocks = first_page .get_text ("blocks" )
26- text_blocks = [
27- block [4 ].strip ().replace ("\n " , " " )
28- for block in first_page_blocks
29- if block [6 ] == 0 # only include text blocks.
30- ]
31-
32- # For some reason, extracted PDF text has extra spaces. Collapse them here.
33- regex = r"\s{2,}"
34- text_blocks = [re .sub (regex , " " , text ) for text in text_blocks ]
35-
36- if len (text_blocks ) != 0 :
37- for text in text_blocks :
38- if title_regex .match (text ):
39- return text
40-
41- print (
42- "no suitable title found in first page text. Using GPT-4 to summarize the PDF" )
20+ font_title = extract_title_by_font_size (pdf )
21+ if font_title :
22+ return font_title
23+
4324 gpt_title = summarize_pdf (pdf )
4425 return gpt_title or None
4526
4627
def extract_title_by_font_size(
    pdf: fitz.Document,
    max_pages: int = 3,
    size_tolerance: float = 0.01,
) -> str | None:
    """
    Guess the document title from the largest text on the first few pages.

    The first ``max_pages`` pages are scanned for text spans. Consecutive
    spans rendered at the largest font size found are joined into candidate
    strings; the longest candidate accepted by the module-level
    ``title_regex`` is returned.

    Args:
        pdf: The opened document to inspect.
        max_pages: Maximum number of leading pages to scan.
        size_tolerance: How far (in font-size units) a span may fall below
            the largest size and still count as part of the title. This
            absorbs small floating-point differences between spans that
            are visually the same size.

    Returns:
        The best candidate, truncated to 255 characters, or ``None`` when no
        candidate matches ``title_regex`` (or no usable text was found).
    """
    pages_to_scan = min(max_pages, len(pdf))

    # First pass: collect (text, size) for every usable span and track the
    # largest size. A ``None`` entry is appended after each page so that a
    # run can never continue from one page onto the next.
    all_spans = []
    max_font_size = 0.0

    for page_idx in range(pages_to_scan):
        page_dict = pdf[page_idx].get_text("dict")
        for block in page_dict["blocks"]:
            # Only type-0 blocks are processed; anything else is skipped.
            if block.get("type") != 0:
                continue
            for line in block.get("lines", []):
                for span in line["spans"]:
                    text = span["text"].strip()
                    size = span["size"]
                    # Ignore very short fragments and very small print; they
                    # neither join nor interrupt a run.
                    if len(text) < 2 or size < 6.0:
                        continue
                    all_spans.append((text, size))
                    max_font_size = max(max_font_size, size)
        all_spans.append(None)  # page boundary marker

    if max_font_size == 0.0:
        return None

    # Second pass: gather contiguous runs of spans at the max font size.
    # Runs continue across block boundaries so multi-block titles (e.g.,
    # "BIPOLAR DISORDER IN PRIMARY CARE:" in one block and "DIAGNOSIS AND
    # MANAGEMENT" in the next) are joined into a single candidate.
    # A run ends when a smaller span or a page boundary interrupts it; the
    # marker after the last page guarantees the final run is flushed.
    candidates = []
    current_run = []

    for entry in all_spans:
        # Sizes are floats, so compare with a tolerance instead of ``==``.
        if entry is not None and max_font_size - entry[1] <= size_tolerance:
            current_run.append(entry[0])
            continue
        if current_run:
            candidates.append(" ".join(current_run))
            current_run = []

    # Collapse extra whitespace and keep only candidates the title regex accepts.
    matches = []
    for candidate in candidates:
        cleaned = re.sub(r"\s{2,}", " ", candidate).strip()
        if title_regex.match(cleaned):
            matches.append(cleaned)

    # Longest wins because real titles are typically longer than section headers
    # (e.g., "About the Author") that may share the same max font size.
    # ``max`` keeps the first of equally long candidates.
    best = max(matches, key=len, default="")

    # Strip again after truncating so the cut cannot leave trailing whitespace.
    return best[:255].rstrip() or None
90+
91+
4792def summarize_pdf (pdf : fitz .Document ) -> str :
4893 """
4994 Summarize a PDF document using OpenAI's GPT-4 model.
0 commit comments