|
@@ -1,8 +1,8 @@
|
|
# -*- coding: utf-8 -*-
|
|
# -*- coding: utf-8 -*-
|
|
from .__load__ import *
|
|
from .__load__ import *
|
|
-#from langchain_community.document_loaders import PyPDFLoader
|
|
|
|
import fitz
|
|
import fitz
|
|
-#from PIL import Image
|
|
|
|
|
|
+from collections import Counter
|
|
|
|
+
|
|
class Pdf(Base):
|
|
class Pdf(Base):
|
|
def json(self):
|
|
def json(self):
|
|
if not self.file:
|
|
if not self.file:
|
|
@@ -12,6 +12,35 @@ class Pdf(Base):
|
|
page_count = doc.page_count
|
|
page_count = doc.page_count
|
|
result = {'total': page_count, 'pages': []}
|
|
result = {'total': page_count, 'pages': []}
|
|
|
|
|
|
|
|
+ # 预扫,找出重复最多的页眉/页脚内容
|
|
|
|
+ header_footer_texts = []
|
|
|
|
+
|
|
|
|
+ for page_num in range(len(doc)):
|
|
|
|
+ page_obj = doc.load_page(page_num)
|
|
|
|
+ page_height = page_obj.rect.height
|
|
|
|
+ blocks = page_obj.get_text("dict", sort=True)["blocks"]
|
|
|
|
+
|
|
|
|
+ for b in blocks:
|
|
|
|
+ if b['type'] != 0:
|
|
|
|
+ continue
|
|
|
|
+ y_top = b["bbox"][1]
|
|
|
|
+ y_bottom = b["bbox"][3]
|
|
|
|
+ text_content = ""
|
|
|
|
+ for line in b["lines"]:
|
|
|
|
+ for span in line["spans"]:
|
|
|
|
+ text_content += span["text"]
|
|
|
|
+ text_content = text_content.strip()
|
|
|
|
+
|
|
|
|
+ if not text_content:
|
|
|
|
+ continue
|
|
|
|
+
|
|
|
|
+ # 只考虑页顶和页底 5%
|
|
|
|
+ if y_top < page_height * 0.05 or y_bottom > page_height * 0.95:
|
|
|
|
+ header_footer_texts.append(text_content)
|
|
|
|
+
|
|
|
|
+ # 找出最常见的内容(页眉/页脚)
|
|
|
|
+ common_texts = [text for text, count in Counter(header_footer_texts).items() if count > 1]
|
|
|
|
+
|
|
for page_num in range(len(doc)):
|
|
for page_num in range(len(doc)):
|
|
page_obj = doc.load_page(page_num)
|
|
page_obj = doc.load_page(page_num)
|
|
page_height = page_obj.rect.height
|
|
page_height = page_obj.rect.height
|
|
@@ -25,12 +54,11 @@ class Pdf(Base):
|
|
if not text and not has_visible_content:
|
|
if not text and not has_visible_content:
|
|
continue
|
|
continue
|
|
|
|
|
|
- # 增加封面图
|
|
|
|
|
|
+ # 封面图
|
|
try:
|
|
try:
|
|
pix = page_obj.get_pixmap(matrix=fitz.Matrix(0.3, 0.3))
|
|
pix = page_obj.get_pixmap(matrix=fitz.Matrix(0.3, 0.3))
|
|
cover_file = f"{self.param['path']}cover_page_{page_num+1}.png"
|
|
cover_file = f"{self.param['path']}cover_page_{page_num+1}.png"
|
|
pix.save(cover_file)
|
|
pix.save(cover_file)
|
|
-
|
|
|
|
if 'host' in self.param and self.param['host']:
|
|
if 'host' in self.param and self.param['host']:
|
|
cover_file = cover_file.replace(Demeter.path + 'runtime/', self.param['host'])
|
|
cover_file = cover_file.replace(Demeter.path + 'runtime/', self.param['host'])
|
|
except Exception as e:
|
|
except Exception as e:
|
|
@@ -38,13 +66,9 @@ class Pdf(Base):
|
|
cover_file = ""
|
|
cover_file = ""
|
|
|
|
|
|
page_items = []
|
|
page_items = []
|
|
- blocks = page_obj.get_text("dict", sort=True)["blocks"]
|
|
|
|
-
|
|
|
|
for i, b in enumerate(blocks):
|
|
for i, b in enumerate(blocks):
|
|
y_top = b["bbox"][1]
|
|
y_top = b["bbox"][1]
|
|
y_bottom = b["bbox"][3]
|
|
y_bottom = b["bbox"][3]
|
|
- if y_top < page_height * 0.02 or y_bottom > page_height * 0.98:
|
|
|
|
- continue
|
|
|
|
|
|
|
|
if b['type'] == 0:
|
|
if b['type'] == 0:
|
|
text_content = ""
|
|
text_content = ""
|
|
@@ -54,12 +78,19 @@ class Pdf(Base):
|
|
text_content += "\n"
|
|
text_content += "\n"
|
|
text_content = text_content.strip()
|
|
text_content = text_content.strip()
|
|
text_content = self.clean_text(self.removeDomains(text_content))
|
|
text_content = self.clean_text(self.removeDomains(text_content))
|
|
- if text_content:
|
|
|
|
- page_items.append({
|
|
|
|
- "type": "text",
|
|
|
|
- "content": text_content,
|
|
|
|
- "pos": b["bbox"]
|
|
|
|
- })
|
|
|
|
|
|
+
|
|
|
|
+ if not text_content:
|
|
|
|
+ continue
|
|
|
|
+
|
|
|
|
+ # 排除页眉/页脚内容
|
|
|
|
+ if text_content in common_texts:
|
|
|
|
+ continue
|
|
|
|
+
|
|
|
|
+ page_items.append({
|
|
|
|
+ "type": "text",
|
|
|
|
+ "content": text_content,
|
|
|
|
+ "pos": b["bbox"]
|
|
|
|
+ })
|
|
|
|
|
|
elif b['type'] == 1:
|
|
elif b['type'] == 1:
|
|
image_bytes = b.get("image", b"")
|
|
image_bytes = b.get("image", b"")
|
|
@@ -88,6 +119,7 @@ class Pdf(Base):
|
|
|
|
|
|
return result
|
|
return result
|
|
|
|
|
|
|
|
+
|
|
# 提取为langchain的Document格式
|
|
# 提取为langchain的Document格式
|
|
def doc(self):
|
|
def doc(self):
|
|
if not self.file:
|
|
if not self.file:
|