rabin 2 months ago
parent
commit
565db9f4b0
5 changed files with 60 additions and 47 deletions
  1. diviner.py  +4 -0
  2. pdf.py  +4 -4
  3. requirements.txt  +2 -0
  4. service/extract/docs/base.py  +1 -1
  5. service/extract/docs/pdf.py  +49 -42

+ 4 - 0
diviner.py

@@ -3,9 +3,13 @@
 import os
 import sys
 import subprocess
+import nltk
 from demeter.core import *
 
 def init():
+    nltk.download('punkt', quiet=True)
+    nltk.download('averaged_perceptron_tagger', quiet=True)
+    
     model = Demeter.model('manage_admin')
     model.id = 1
     info = model.select(type='fetchone')
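
The two nltk.download calls fetch the 'punkt' sentence tokenizer and the 'averaged_perceptron_tagger' POS model at startup, so later extraction code does not block on a first-use download. A minimal sketch of what these resources enable (illustrative only; this commit does not yet show where the extractor calls them, and newer NLTK releases may name the resources 'punkt_tab' / 'averaged_perceptron_tagger_eng'):

import nltk

nltk.download('punkt', quiet=True)                       # sentence/word tokenizer models
nltk.download('averaged_perceptron_tagger', quiet=True)  # POS tagger model

sentences = nltk.sent_tokenize("Frog is hungry. He looks for food.")
tags = nltk.pos_tag(nltk.word_tokenize(sentences[0]))
print(sentences)  # ['Frog is hungry.', 'He looks for food.']
print(tags)       # [('Frog', 'NNP'), ('is', 'VBZ'), ('hungry', 'JJ'), ('.', '.')]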

+ 4 - 4
pdf.py

@@ -7,12 +7,12 @@ param['file'] = 'f'
 Demeter.getopt(param)
 
 #file = Demeter.option['file']
-file = 'D://work/ai/diviner/dev/data/04、Frog Is Hungry.pdf'
+file = '/data/dm/container/web/diviner/data/test.pdf'
 # PDF extraction functionality
 
 # Direct extraction
-#result = Demeter.service('loader', 'extract').get(file).json()
+result = Demeter.service('loader', 'extract').get(file).json()
 
 # Generic method: extract synchronously and record it, so the next run reuses the extracted content directly
-result = Demeter.service('loader').get(obj='parser', module='extract', sync=False, site_id=1, uid=1, source_id=1, source=file, method='json')
-print(result)
+#result = Demeter.service('loader').get(obj='parser', module='extract', sync=True, site_id=1, uid=1, source_id=1, source=file, method='json')
+print(result)
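
With the direct call uncommented, the script prints the extractor's JSON. A hedged sketch of consuming that output, assuming the {'total': ..., 'pages': [...]} shape built by Pdf.json() below; the per-page key names are not visible in this diff and are assumptions:

result = Demeter.service('loader', 'extract').get(file).json()
print(result['total'])                  # page count
for page in result['pages']:
    for item in page.get('items', []):  # 'items' is a hypothetical key
        if item['type'] == 'text':
            print(item['content'])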

+ 2 - 0
requirements.txt

@@ -1,4 +1,6 @@
 demeter-lib
+regex
+nltk
 tornado==6.2
 redis
 requests

+ 1 - 1
service/extract/docs/base.py

@@ -12,7 +12,7 @@ class Base(object):
     def getPath(self):
         if 'path' not in self.param:
             pdf_dir = os.path.dirname(self.file)
-            pdf_name = os.path.splitext(os.path.basename(self.file))[0]
+            pdf_name = os.path.splitext(os.path.basename(self.file))[0] + '/'
             self.param['path'] = os.path.join(pdf_dir, pdf_name)
             if not os.path.exists(self.param['path']):
                 os.makedirs(self.param['path'])
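
Appending '/' to pdf_name gives getPath() a trailing separator, which os.path.join preserves. A quick sketch of the resulting behavior:

import os

file = '/data/dm/container/web/diviner/data/test.pdf'
pdf_dir = os.path.dirname(file)
pdf_name = os.path.splitext(os.path.basename(file))[0] + '/'
print(os.path.join(pdf_dir, pdf_name))
# -> '/data/dm/container/web/diviner/data/test/'  (trailing separator kept; os.makedirs accepts it)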

+ 49 - 42
service/extract/docs/pdf.py

@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 from .__load__ import *
 import fitz
-from collections import Counter
+import re
 
 class Pdf(Base):
     def json(self):
@@ -12,40 +12,37 @@ class Pdf(Base):
         page_count = doc.page_count
         result = {'total': page_count, 'pages': []}
 
-        # Pre-scan to find the most frequently repeated header/footer content
-        header_footer_texts = []
+        # Blacklisted symbols: any occurrence discards the whole line
+        blacklist_chars = "•★◆●◇▪…※§‡†¤₪"
+        blacklist_pattern = f"[{re.escape(blacklist_chars)}]"
 
-        for page_num in range(len(doc)):
-            page_obj = doc.load_page(page_num)
-            page_height = page_obj.rect.height
-            blocks = page_obj.get_text("dict", sort=True)["blocks"]
-
-            for b in blocks:
-                if b['type'] != 0:
-                    continue
-                y_top = b["bbox"][1]
-                y_bottom = b["bbox"][3]
-                text_content = ""
-                for line in b["lines"]:
-                    for span in line["spans"]:
-                        text_content += span["text"]
-                text_content = text_content.strip()
-
-                if not text_content:
-                    continue
-
-                # Only consider the top and bottom 5% of the page
-                if y_top < page_height * 0.05 or y_bottom > page_height * 0.95:
-                    header_footer_texts.append(text_content)
+        def is_page_number(text):
+            text = text.strip().lower()
+            return (
+                re.fullmatch(r"(page)?\s*\d+\s*(of\s*\d+)?", text)
+                or re.fullmatch(r"\d+", text)
+                or re.fullmatch(r"\d+\s*/\s*\d+", text)
+            )
 
-        # Find the most common content (headers/footers)
-        common_texts = [text for text, count in Counter(header_footer_texts).items() if count > 1]
+        def is_valid_english(text):
+            """
+            Keep pure-English phrases, words, and sentences, including punctuation (quotes, exclamation marks, periods, etc.)
+            """
+            # Filter out lines containing Chinese or other non-English text
+            if re.search(r'[\u4e00-\u9fff]', text):
+                return False
+            # Filter out characters that are non-ASCII and not standard English punctuation
+            if re.search(rf"{blacklist_pattern}", text):
+                return False
+            # Must contain at least one letter or other basic English structure (such as a period)
+            if not re.search(r"[A-Za-z]", text):
+                return False
+            return True
 
         for page_num in range(len(doc)):
             page_obj = doc.load_page(page_num)
             page_height = page_obj.rect.height
 
-            # Skip blank pages
             text = page_obj.get_text().strip()
             blocks = page_obj.get_text("dict", sort=True)["blocks"]
             has_visible_content = any(
@@ -66,31 +63,41 @@ class Pdf(Base):
                 cover_file = ""
 
             page_items = []
+
             for i, b in enumerate(blocks):
                 y_top = b["bbox"][1]
                 y_bottom = b["bbox"][3]
+                if y_top < page_height * 0.02 or y_bottom > page_height * 0.98:
+                    continue
 
                 if b['type'] == 0:
                     text_content = ""
                     for line in b["lines"]:
+                        line_text = ""
                         for span in line["spans"]:
-                            text_content += span["text"]
-                        text_content += "\n"
-                    text_content = text_content.strip()
-                    text_content = self.clean_text(self.removeDomains(text_content))
+                            span_text = span["text"].strip()
+                            if not span_text or is_page_number(span_text):
+                                continue
+                            line_text += span_text + " "
 
-                    if not text_content:
-                        continue
+                        line_text = line_text.strip()
+                        if not line_text:
+                            continue
 
-                    # 排除页眉/页脚内容
-                    if text_content in common_texts:
-                        continue
+                        # Whole-line filter: contains a blacklisted symbol or is not valid English
+                        if not is_valid_english(line_text):
+                            continue
 
-                    page_items.append({
-                        "type": "text",
-                        "content": text_content,
-                        "pos": b["bbox"]
-                    })
+                        text_content += line_text + "\n"
+
+                    text_content = text_content.strip()
+                    text_content = self.clean_text(self.removeDomains(text_content))
+                    if text_content:
+                        page_items.append({
+                            "type": "text",
+                            "content": text_content,
+                            "pos": b["bbox"]
+                        })
 
                 elif b['type'] == 1:
                     image_bytes = b.get("image", b"")
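
The two new filters can be exercised in isolation; this sketch copies them from the diff above and runs them against sample inputs:

import re

blacklist_chars = "•★◆●◇▪…※§‡†¤₪"
blacklist_pattern = f"[{re.escape(blacklist_chars)}]"

def is_page_number(text):
    text = text.strip().lower()
    return bool(
        re.fullmatch(r"(page)?\s*\d+\s*(of\s*\d+)?", text)
        or re.fullmatch(r"\d+", text)
        or re.fullmatch(r"\d+\s*/\s*\d+", text)
    )

def is_valid_english(text):
    if re.search(r'[\u4e00-\u9fff]', text):    # contains CJK characters
        return False
    if re.search(blacklist_pattern, text):     # contains a blacklisted symbol
        return False
    return bool(re.search(r"[A-Za-z]", text))  # needs at least one letter

print(is_page_number("Page 3 of 12"))       # True
print(is_page_number("7 / 30"))             # True
print(is_valid_english("Frog is hungry!"))  # True
print(is_valid_english("• bullet item"))    # False: blacklisted '•'
print(is_valid_english("青蛙饿了"))          # False: CJK text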