rabin před 2 měsíci
rodič
revize
c1275dfa19
Změněn 1 soubor: 46 přidání a 14 odebrání
  1. 46 14
      service/extract/docs/pdf.py

+ 46 - 14
service/extract/docs/pdf.py

@@ -1,8 +1,8 @@
 # -*- coding: utf-8 -*-
 from .__load__ import *
-#from langchain_community.document_loaders import PyPDFLoader
 import fitz
-#from PIL import Image
+from collections import Counter
+
 class Pdf(Base):
     def json(self):
         if not self.file:
@@ -12,6 +12,35 @@ class Pdf(Base):
         page_count = doc.page_count
         result = {'total': page_count, 'pages': []}
 
+        # 预扫,找出重复最多的页眉/页脚内容
+        header_footer_texts = []
+
+        for page_num in range(len(doc)):
+            page_obj = doc.load_page(page_num)
+            page_height = page_obj.rect.height
+            blocks = page_obj.get_text("dict", sort=True)["blocks"]
+
+            for b in blocks:
+                if b['type'] != 0:
+                    continue
+                y_top = b["bbox"][1]
+                y_bottom = b["bbox"][3]
+                text_content = ""
+                for line in b["lines"]:
+                    for span in line["spans"]:
+                        text_content += span["text"]
+                text_content = text_content.strip()
+
+                if not text_content:
+                    continue
+
+                # 只考虑页顶和页底 5%
+                if y_top < page_height * 0.05 or y_bottom > page_height * 0.95:
+                    header_footer_texts.append(text_content)
+
+        # 找出最常见的内容(页眉/页脚)
+        common_texts = [text for text, count in Counter(header_footer_texts).items() if count > 1]
+
         for page_num in range(len(doc)):
             page_obj = doc.load_page(page_num)
             page_height = page_obj.rect.height
@@ -25,12 +54,11 @@ class Pdf(Base):
             if not text and not has_visible_content:
                 continue
 
-            # 增加封面图
+            # 封面图
             try:
                 pix = page_obj.get_pixmap(matrix=fitz.Matrix(0.3, 0.3))
                 cover_file = f"{self.param['path']}cover_page_{page_num+1}.png"
                 pix.save(cover_file)
-
                 if 'host' in self.param and self.param['host']:
                     cover_file = cover_file.replace(Demeter.path + 'runtime/', self.param['host'])
             except Exception as e:
@@ -38,13 +66,9 @@ class Pdf(Base):
                 cover_file = ""
 
             page_items = []
-            blocks = page_obj.get_text("dict", sort=True)["blocks"]
-
             for i, b in enumerate(blocks):
                 y_top = b["bbox"][1]
                 y_bottom = b["bbox"][3]
-                if y_top < page_height * 0.02 or y_bottom > page_height * 0.98:
-                    continue
 
                 if b['type'] == 0:
                     text_content = ""
@@ -54,12 +78,19 @@ class Pdf(Base):
                         text_content += "\n"
                     text_content = text_content.strip()
                     text_content = self.clean_text(self.removeDomains(text_content))
-                    if text_content:
-                        page_items.append({
-                            "type": "text",
-                            "content": text_content,
-                            "pos": b["bbox"]
-                        })
+
+                    if not text_content:
+                        continue
+
+                    # 排除页眉/页脚内容
+                    if text_content in common_texts:
+                        continue
+
+                    page_items.append({
+                        "type": "text",
+                        "content": text_content,
+                        "pos": b["bbox"]
+                    })
 
                 elif b['type'] == 1:
                     image_bytes = b.get("image", b"")
@@ -88,6 +119,7 @@ class Pdf(Base):
 
         return result
 
+
     # 提取为langchain的Document格式
     def doc(self):
         if not self.file: