|
@@ -1,7 +1,7 @@
|
|
# -*- coding: utf-8 -*-
|
|
# -*- coding: utf-8 -*-
|
|
from .__load__ import *
|
|
from .__load__ import *
|
|
import fitz
|
|
import fitz
|
|
-from collections import Counter
|
|
|
|
|
|
+import re
|
|
|
|
|
|
class Pdf(Base):
|
|
class Pdf(Base):
|
|
def json(self):
|
|
def json(self):
|
|
@@ -12,40 +12,37 @@ class Pdf(Base):
|
|
page_count = doc.page_count
|
|
page_count = doc.page_count
|
|
result = {'total': page_count, 'pages': []}
|
|
result = {'total': page_count, 'pages': []}
|
|
|
|
|
|
- # 预扫,找出重复最多的页眉/页脚内容
|
|
|
|
- header_footer_texts = []
|
|
|
|
|
|
+ # 黑名单符号,出现即整句丢弃
|
|
|
|
+ blacklist_chars = "•★◆●◇▪…※§‡†¤₪"
|
|
|
|
+ blacklist_pattern = f"[{re.escape(blacklist_chars)}]"
|
|
|
|
|
|
- for page_num in range(len(doc)):
|
|
|
|
- page_obj = doc.load_page(page_num)
|
|
|
|
- page_height = page_obj.rect.height
|
|
|
|
- blocks = page_obj.get_text("dict", sort=True)["blocks"]
|
|
|
|
-
|
|
|
|
- for b in blocks:
|
|
|
|
- if b['type'] != 0:
|
|
|
|
- continue
|
|
|
|
- y_top = b["bbox"][1]
|
|
|
|
- y_bottom = b["bbox"][3]
|
|
|
|
- text_content = ""
|
|
|
|
- for line in b["lines"]:
|
|
|
|
- for span in line["spans"]:
|
|
|
|
- text_content += span["text"]
|
|
|
|
- text_content = text_content.strip()
|
|
|
|
-
|
|
|
|
- if not text_content:
|
|
|
|
- continue
|
|
|
|
-
|
|
|
|
- # 只考虑页顶和页底 5%
|
|
|
|
- if y_top < page_height * 0.05 or y_bottom > page_height * 0.95:
|
|
|
|
- header_footer_texts.append(text_content)
|
|
|
|
|
|
def is_page_number(text):
    """Report whether ``text`` consists solely of a page-number marker.

    The text is stripped and lower-cased before matching. Accepted forms are
    a bare number ("12"), an optional "page" prefix with an optional "of N"
    suffix ("page 3", "Page 3 of 10"), and the slash form ("3 / 10").

    Args:
        text: The candidate string.

    Returns:
        bool: True when the whole string is one of the accepted forms,
        False otherwise (including for an empty string).
    """
    normalized = text.strip().lower()
    # A single alternation covers every accepted form. A bare number is
    # already matched by the first branch because both the "page" prefix and
    # the "of N" suffix are optional, so no separate digits-only check is
    # needed.
    page_number_form = r"(?:page)?\s*\d+\s*(?:of\s*\d+)?|\d+\s*/\s*\d+"
    # Return a real boolean rather than leaking a Match object or None.
    return re.fullmatch(page_number_form, normalized) is not None
|
|
|
|
|
|
- # 找出最常见的内容(页眉/页脚)
|
|
|
|
- common_texts = [text for text, count in Counter(header_footer_texts).items() if count > 1]
|
|
|
|
|
|
def is_valid_english(text):
    """Decide whether a line of text should be kept as English content.

    A line is rejected when it contains a CJK ideograph (U+4E00 to U+9FFF),
    when it contains any character matched by ``blacklist_pattern`` (read
    from the enclosing scope), or when it has no ASCII letter at all.
    Every other line is kept, punctuation included.

    Args:
        text: The line to examine.

    Returns:
        bool: True to keep the line, False to drop it.
    """
    # Either of these matching anywhere in the line disqualifies it.
    rejecting_patterns = (r'[\u4e00-\u9fff]', blacklist_pattern)
    if any(re.search(pattern, text) for pattern in rejecting_patterns):
        return False
    # Digits or punctuation alone are not enough; one ASCII letter is required.
    return re.search(r"[A-Za-z]", text) is not None
|
|
|
|
|
|
for page_num in range(len(doc)):
|
|
for page_num in range(len(doc)):
|
|
page_obj = doc.load_page(page_num)
|
|
page_obj = doc.load_page(page_num)
|
|
page_height = page_obj.rect.height
|
|
page_height = page_obj.rect.height
|
|
|
|
|
|
- # 空白页跳过
|
|
|
|
text = page_obj.get_text().strip()
|
|
text = page_obj.get_text().strip()
|
|
blocks = page_obj.get_text("dict", sort=True)["blocks"]
|
|
blocks = page_obj.get_text("dict", sort=True)["blocks"]
|
|
has_visible_content = any(
|
|
has_visible_content = any(
|
|
@@ -66,31 +63,41 @@ class Pdf(Base):
|
|
cover_file = ""
|
|
cover_file = ""
|
|
|
|
|
|
page_items = []
|
|
page_items = []
|
|
|
|
+
|
|
for i, b in enumerate(blocks):
|
|
for i, b in enumerate(blocks):
|
|
y_top = b["bbox"][1]
|
|
y_top = b["bbox"][1]
|
|
y_bottom = b["bbox"][3]
|
|
y_bottom = b["bbox"][3]
|
|
|
|
+ if y_top < page_height * 0.02 or y_bottom > page_height * 0.98:
|
|
|
|
+ continue
|
|
|
|
|
|
if b['type'] == 0:
|
|
if b['type'] == 0:
|
|
text_content = ""
|
|
text_content = ""
|
|
for line in b["lines"]:
|
|
for line in b["lines"]:
|
|
|
|
+ line_text = ""
|
|
for span in line["spans"]:
|
|
for span in line["spans"]:
|
|
- text_content += span["text"]
|
|
|
|
- text_content += "\n"
|
|
|
|
- text_content = text_content.strip()
|
|
|
|
- text_content = self.clean_text(self.removeDomains(text_content))
|
|
|
|
|
|
+ span_text = span["text"].strip()
|
|
|
|
+ if not span_text or is_page_number(span_text):
|
|
|
|
+ continue
|
|
|
|
+ line_text += span_text + " "
|
|
|
|
|
|
- if not text_content:
|
|
|
|
- continue
|
|
|
|
|
|
+ line_text = line_text.strip()
|
|
|
|
+ if not line_text:
|
|
|
|
+ continue
|
|
|
|
|
|
- # 排除页眉/页脚内容
|
|
|
|
- if text_content in common_texts:
|
|
|
|
- continue
|
|
|
|
|
|
+ # 全行过滤:含黑名单符号或不符合英文语义
|
|
|
|
+ if not is_valid_english(line_text):
|
|
|
|
+ continue
|
|
|
|
|
|
- page_items.append({
|
|
|
|
- "type": "text",
|
|
|
|
- "content": text_content,
|
|
|
|
- "pos": b["bbox"]
|
|
|
|
- })
|
|
|
|
|
|
+ text_content += line_text + "\n"
|
|
|
|
+
|
|
|
|
+ text_content = text_content.strip()
|
|
|
|
+ text_content = self.clean_text(self.removeDomains(text_content))
|
|
|
|
+ if text_content:
|
|
|
|
+ page_items.append({
|
|
|
|
+ "type": "text",
|
|
|
|
+ "content": text_content,
|
|
|
|
+ "pos": b["bbox"]
|
|
|
|
+ })
|
|
|
|
|
|
elif b['type'] == 1:
|
|
elif b['type'] == 1:
|
|
image_bytes = b.get("image", b"")
|
|
image_bytes = b.get("image", b"")
|