CodeForPhilly
diff --git a/‎server/api/services/__init__.py‎ b/‎server/api/services/__init__.py‎
diff --git a/‎server/api/views/uploadFile/__init__.py‎ b/‎server/api/views/uploadFile/__init__.py‎
diff --git a/‎server/api/views/uploadFile/test_title.py‎
Lines changed: 5 additions & 48 deletions b/‎server/api/views/uploadFile/test_title.py‎
Lines changed: 5 additions & 48 deletions
diff --git a/‎server/api/views/uploadFile/title.py‎
Lines changed: 19 additions & 8 deletions b/‎server/api/views/uploadFile/title.py‎
Lines changed: 19 additions & 8 deletions
@@ -1,64 +1,21 @@
-import json
+import unittest
 
 import fitz
-import os
-
-import unittest
-from os import listdir
-from os.path import isfile, join
 
-from title import generate_title
+import title
 
 class TestGenerateTitle(unittest.TestCase):
-  # TODO: Good use for parameterized tests.
+  """Checks title.generate_title against the sample PDFs under ./testdata."""
+
   def test_prefers_metadata_title(self):
     with fitz.open("./testdata/lithium-longterm.pdf") as doc:
-      self.assertEqual("Long-Term Lithium Therapy: Side Effects and Interactions", generate_title(doc))
+      self.assertEqual("Long-Term Lithium Therapy: Side Effects and Interactions", title.generate_title(doc))
-      pass
 
   def test_falls_back_to_first_sentence(self):
     with fitz.open("./testdata/advancespharmaco.pdf") as doc:
       expected_title = "Advances in Mood Disorder Pharmacotherapy: Evaluating New Antipsychotics and Mood Stabilizers for Bipolar Disorder and Schizophrenia"
-      self.assertEqual(expected_title, generate_title(doc))
+      self.assertEqual(expected_title, title.generate_title(doc))
 
   def test_australasian_psychiatry(self):
     with fitz.open("./testdata/creativitystabilizer.pdf") as doc:
       expected_title = "Impact of mood stabilizers on creativity"
-      self.assertEqual(expected_title, generate_title(doc))
-
-  def test_largest_text(self):
-    first_page_json = None
-    with open("/home/ricanontherun/Documents/balancer/uploads/output.json", "r") as f:
-      content = ""
-      for line in f:
-        content += line
-      first_page_json = json.loads(content)
-
-    blocks = first_page_json["blocks"]
-    text_blocks = [block for block in blocks if block["type"] == 0]
-
-    print(json.dumps(text_blocks[0], indent=4))
-
-  def test_remaining(self):
-    uploads_dir  = "~/Documents/balancer/uploads"
-
-    # iterate over the files in uploads_dir
-    # Expand the ~ to the actual home directory path
-    expanded_path = os.path.expanduser(uploads_dir)
-    
-    # Get all files in the directory
-    onlyfiles = [f for f in listdir(expanded_path) if isfile(join(expanded_path, f))]
-    
-    # Filter for PDF files
-    pdf_files = [f for f in onlyfiles if f.lower().endswith('.pdf')]
-
-    for pdf_file in pdf_files:
-      file_path = join(expanded_path, pdf_file)
-      with fitz.open(file_path) as doc:
-        title = generate_title(doc)
-        # Print the filename and its generated title for debugging
-        print(f"File: {pdf_file}, Title: {title}")
-        # Assert that a title is generated (not empty)
-        self.assertIsNotNone(title)
-        self.assertNotEqual("", title)
-
+      self.assertEqual(expected_title, title.generate_title(doc))
@@ -2,35 +2,46 @@
 
 import fitz
 
-from ...services.openai_services import openAIServices
+from server.api.services.openai_services import openAIServices
 
-title_regex = re.compile(r'^([a-z0-9:-]+\s?)+$', re.IGNORECASE)
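+# Heuristic pattern for research-paper titles (generated with ChatGPT): at least
+# three words, no standalone 19xx/20xx year and no version token such as "v2",
+# starting with a letter or digit and ending with a letter or ")".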
+title_regex = re.compile(r'^(?=(?:\b\w+\b[\s:,\-\(\)]*){3,})(?!.*\b(?:19|20)\d{2}\b)(?!.*\bv\d+\b)[A-Za-z0-9][\w\s:,\-\(\)]*[A-Za-z\)]$', re.IGNORECASE)
 
 def generate_title(pdf: fitz.Document) -> str | None:
-    # 1. Check the Document's metadata first, likely to be the highest quality title if present.
-    document_metadata_title = pdf.metadata["title"]
-    if document_metadata_title is not None and document_metadata_title != "":
-        if title_regex.match(document_metadata_title):
-            print("suitable title was found in metadata")
-            return document_metadata_title.strip()
+    """
+    Pick a title for a PDF, trying cheaper sources before calling ChatGPT.
+
+    Order of preference: the document's metadata title, then the first text
+    block on the first page that matches title_regex, then whatever
+    summarize_pdf produces.
+
+    Returns the chosen title, or None when summarize_pdf yields an empty result.
+    """
+    # Strip before matching: title_regex is anchored on its first and last
+    # character, so a title padded with spaces would otherwise be rejected
+    # even though the padding is removed on return.
+    document_metadata_title = (pdf.metadata["title"] or "").strip()
+    if document_metadata_title and title_regex.match(document_metadata_title):
+        print("suitable title was found in metadata")
+        return document_metadata_title
 
-    print("Looking for title in first page text blocks")
-    # 2. Find the first text-block which matches the title regex - likely to be on the first page.
+    print("Looking for title in first page text")
     first_page = pdf[0]
     first_page_blocks = first_page.get_text("blocks")
-    text_blocks = [block[4].replace('\n', '').strip() for block in first_page_blocks if block[6] == 0]
-    if len(text_blocks) != 0:
-        for text in text_blocks:
-            if title_regex.match(text):
-                print("suitable title was found in first page text blocks")
-                return text
+    text_blocks = [
+        block[4].strip().replace("\n", " ")
+        for block in first_page_blocks
+        if block[6] == 0  # only include text blocks.
+    ]
+
+    # replace redundant whitespace with a single space.
+    text_blocks = [re.sub(r"\s{2,}", " ", text) for text in text_blocks]
+
+    for text in text_blocks:
+        if title_regex.match(text):
+            print(f"suitable title was found in first page text {text}")
+            return text
 
-    print("falling back to chatgpt")
+    print("using chatgpt to generate title")
     gpt_title = summarize_pdf(pdf)
     return gpt_title or None
 
 
 def summarize_pdf(pdf: fitz.Document) -> str:
+    """
+    Summarize a PDF document using OpenAI's GPT-4 model.
+    """
     first_page = pdf[0]
     first_page_content = first_page.get_text()