|
@@ -4,23 +4,43 @@ from .__load__ import *
|
|
import fitz
|
|
import fitz
|
|
#from PIL import Image
|
|
#from PIL import Image
|
|
class Pdf(Base):
|
|
class Pdf(Base):
|
|
- # 提取为json格式
|
|
|
|
def json(self):
|
|
def json(self):
|
|
if not self.file:
|
|
if not self.file:
|
|
return False
|
|
return False
|
|
self.getPath()
|
|
self.getPath()
|
|
doc = fitz.open(self.file)
|
|
doc = fitz.open(self.file)
|
|
- page = doc.page_count
|
|
|
|
- result = {'page': page, 'content': []}
|
|
|
|
|
|
+ page_count = doc.page_count
|
|
|
|
+ result = {'total': page_count, 'pages': []}
|
|
|
|
|
|
for page_num in range(len(doc)):
|
|
for page_num in range(len(doc)):
|
|
- page = doc.load_page(page_num)
|
|
|
|
- page_height = page.rect.height
|
|
|
|
- blocks = page.get_text("dict", sort=True)["blocks"]
|
|
|
|
|
|
+ page_obj = doc.load_page(page_num)
|
|
|
|
+ page_height = page_obj.rect.height
|
|
|
|
+
|
|
|
|
+ # 空白页跳过
|
|
|
|
+ text = page_obj.get_text().strip()
|
|
|
|
+ blocks = page_obj.get_text("dict", sort=True)["blocks"]
|
|
|
|
+ has_visible_content = any(
|
|
|
|
+ b for b in blocks if b['type'] in (0, 1)
|
|
|
|
+ )
|
|
|
|
+ if not text and not has_visible_content:
|
|
|
|
+ continue
|
|
|
|
+
|
|
|
|
+ # 增加封面图
|
|
|
|
+ try:
|
|
|
|
+ pix = page_obj.get_pixmap(matrix=fitz.Matrix(0.3, 0.3))
|
|
|
|
+ cover_file = f"{self.param['path']}cover_page_{page_num+1}.png"
|
|
|
|
+ pix.save(cover_file)
|
|
|
|
+
|
|
|
|
+ if 'host' in self.param and self.param['host']:
|
|
|
|
+ cover_file = cover_file.replace(Demeter.path + 'runtime/', self.param['host'])
|
|
|
|
+ except Exception as e:
|
|
|
|
+ print(f"封面图失败: {e}")
|
|
|
|
+ cover_file = ""
|
|
|
|
+
|
|
page_items = []
|
|
page_items = []
|
|
|
|
+ blocks = page_obj.get_text("dict", sort=True)["blocks"]
|
|
|
|
|
|
for i, b in enumerate(blocks):
|
|
for i, b in enumerate(blocks):
|
|
- # 去除页眉页脚:bbox[1] 是顶部Y坐标,bbox[3] 是底部Y坐标
|
|
|
|
y_top = b["bbox"][1]
|
|
y_top = b["bbox"][1]
|
|
y_bottom = b["bbox"][3]
|
|
y_bottom = b["bbox"][3]
|
|
if y_top < page_height * 0.05 or y_bottom > page_height * 0.95:
|
|
if y_top < page_height * 0.05 or y_bottom > page_height * 0.95:
|
|
@@ -30,45 +50,42 @@ class Pdf(Base):
|
|
text_content = ""
|
|
text_content = ""
|
|
for line in b["lines"]:
|
|
for line in b["lines"]:
|
|
for span in line["spans"]:
|
|
for span in line["spans"]:
|
|
- span_text = span["text"]
|
|
|
|
- text_content += span_text
|
|
|
|
|
|
+ text_content += span["text"]
|
|
text_content += "\n"
|
|
text_content += "\n"
|
|
text_content = text_content.strip()
|
|
text_content = text_content.strip()
|
|
text_content = self.removeDomains(text_content)
|
|
text_content = self.removeDomains(text_content)
|
|
if text_content:
|
|
if text_content:
|
|
page_items.append({
|
|
page_items.append({
|
|
"type": "text",
|
|
"type": "text",
|
|
- "pos": b["bbox"],
|
|
|
|
"content": text_content,
|
|
"content": text_content,
|
|
- "page": page_num + 1
|
|
|
|
|
|
+ "pos": b["bbox"]
|
|
})
|
|
})
|
|
|
|
|
|
- elif b['type'] == 1: # 图片块
|
|
|
|
|
|
+ elif b['type'] == 1:
|
|
image_bytes = b.get("image", b"")
|
|
image_bytes = b.get("image", b"")
|
|
if not image_bytes or len(image_bytes) < 100:
|
|
if not image_bytes or len(image_bytes) < 100:
|
|
continue
|
|
continue
|
|
- try:
|
|
|
|
- pix = fitz.Pixmap(doc, b["image"])
|
|
|
|
- if pix.width < 10 or pix.height < 10:
|
|
|
|
- continue
|
|
|
|
- except Exception:
|
|
|
|
- pass
|
|
|
|
|
|
+
|
|
image_ext = "png"
|
|
image_ext = "png"
|
|
- file = f"{self.param['path']}page{page_num+1}_img_{i}.{image_ext}"
|
|
|
|
- with open(file, "wb") as f:
|
|
|
|
|
|
+ image_file = f"{self.param['path']}page{page_num+1}_img_{i}.{image_ext}"
|
|
|
|
+ with open(image_file, "wb") as f:
|
|
f.write(image_bytes)
|
|
f.write(image_bytes)
|
|
|
|
|
|
if 'host' in self.param and self.param['host']:
|
|
if 'host' in self.param and self.param['host']:
|
|
- file = file.replace(Demeter.path + 'runtime/', self.param['host'])
|
|
|
|
|
|
+ image_file = image_file.replace(Demeter.path + 'runtime/', self.param['host'])
|
|
|
|
+
|
|
page_items.append({
|
|
page_items.append({
|
|
"type": "image",
|
|
"type": "image",
|
|
- "pos": b["bbox"],
|
|
|
|
"ext": image_ext,
|
|
"ext": image_ext,
|
|
- "file": file,
|
|
|
|
- "page": page_num + 1,
|
|
|
|
|
|
+ "content": image_file,
|
|
|
|
+ "pos": b["bbox"]
|
|
})
|
|
})
|
|
|
|
|
|
- result['content'].extend(page_items)
|
|
|
|
|
|
+ result['pages'].append({
|
|
|
|
+ "cover": cover_file,
|
|
|
|
+ "content": page_items
|
|
|
|
+ })
|
|
|
|
+
|
|
return result
|
|
return result
|
|
|
|
|
|
# 提取为langchain的Document格式
|
|
# 提取为langchain的Document格式
|