rabin há 2 meses atrás
pai
commit
19ab0ed322
2 ficheiros alterados com 45 adições e 28 exclusões
  1. 42 25
      service/extract/docs/pdf.py
  2. 3 3
      service/extract/parser.py

+ 42 - 25
service/extract/docs/pdf.py

@@ -4,23 +4,43 @@ from .__load__ import *
 import fitz
 #from PIL import Image
 class Pdf(Base):
-    # 提取为json格式
     def json(self):
         if not self.file:
             return False
         self.getPath()
         doc = fitz.open(self.file)
-        page = doc.page_count
-        result = {'page': page, 'content': []}
+        page_count = doc.page_count
+        result = {'total': page_count, 'pages': []}
 
         for page_num in range(len(doc)):
-            page = doc.load_page(page_num)
-            page_height = page.rect.height
-            blocks = page.get_text("dict", sort=True)["blocks"]
+            page_obj = doc.load_page(page_num)
+            page_height = page_obj.rect.height
+
+            # 空白页跳过
+            text = page_obj.get_text().strip()
+            blocks = page_obj.get_text("dict", sort=True)["blocks"]
+            has_visible_content = any(
+                b for b in blocks if b['type'] in (0, 1)
+            )
+            if not text and not has_visible_content:
+                continue
+
+            # 增加封面图
+            try:
+                pix = page_obj.get_pixmap(matrix=fitz.Matrix(0.3, 0.3))
+                cover_file = f"{self.param['path']}cover_page_{page_num+1}.png"
+                pix.save(cover_file)
+
+                if 'host' in self.param and self.param['host']:
+                    cover_file = cover_file.replace(Demeter.path + 'runtime/', self.param['host'])
+            except Exception as e:
+                print(f"封面图失败: {e}")
+                cover_file = ""
+
             page_items = []
+            blocks = page_obj.get_text("dict", sort=True)["blocks"]
 
             for i, b in enumerate(blocks):
-                # 去除页眉页脚:bbox[1] 是顶部Y坐标,bbox[3] 是底部Y坐标
                 y_top = b["bbox"][1]
                 y_bottom = b["bbox"][3]
                 if y_top < page_height * 0.05 or y_bottom > page_height * 0.95:
@@ -30,45 +50,42 @@ class Pdf(Base):
                     text_content = ""
                     for line in b["lines"]:
                         for span in line["spans"]:
-                            span_text = span["text"]
-                            text_content += span_text
+                            text_content += span["text"]
                         text_content += "\n"
                     text_content = text_content.strip()
                     text_content = self.removeDomains(text_content)
                     if text_content:
                         page_items.append({
                             "type": "text",
-                            "pos": b["bbox"],
                             "content": text_content,
-                            "page": page_num + 1
+                            "pos": b["bbox"]
                         })
 
-                elif b['type'] == 1:  # 图片块
+                elif b['type'] == 1:
                     image_bytes = b.get("image", b"")
                     if not image_bytes or len(image_bytes) < 100:
                         continue
-                    try:
-                        pix = fitz.Pixmap(doc, b["image"])
-                        if pix.width < 10 or pix.height < 10:
-                            continue
-                    except Exception:
-                        pass
+
                     image_ext = "png"
-                    file = f"{self.param['path']}page{page_num+1}_img_{i}.{image_ext}"
-                    with open(file, "wb") as f:
+                    image_file = f"{self.param['path']}page{page_num+1}_img_{i}.{image_ext}"
+                    with open(image_file, "wb") as f:
                         f.write(image_bytes)
 
                     if 'host' in self.param and self.param['host']:
-                        file = file.replace(Demeter.path + 'runtime/', self.param['host'])
+                        image_file = image_file.replace(Demeter.path + 'runtime/', self.param['host'])
+
                     page_items.append({
                         "type": "image",
-                        "pos": b["bbox"],
                         "ext": image_ext,
-                        "file": file,
-                        "page": page_num + 1,
+                        "content": image_file,
+                        "pos": b["bbox"]
                     })
 
-            result['content'].extend(page_items)
+            result['pages'].append({
+                "cover": cover_file,
+                "content": page_items
+            })
+
         return result
 
     # 提取为langchain的Document格式

+ 3 - 3
service/extract/parser.py

@@ -106,9 +106,9 @@ class Parser(object):
                 obj = Demeter.service('loader', 'extract').get(info['file'], {'path':info['path'], 'host':info['host']})
                 func = getattr(obj, method)
                 result = func()
-                if result and 'page' in result and result['page'] > 0:
-                    param['content'] = result['content']
-                    param['page'] = result['page']
+                if result and 'total' in result and result['total'] > 0:
+                    param['content'] = result['pages']
+                    param['page'] = result['total']
                     param['status'] = 3
                     if method == 'json':
                         method = 1