pdf.py 4.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110
  1. # -*- coding: utf-8 -*-
  2. from .__load__ import *
  3. #from langchain_community.document_loaders import PyPDFLoader
  4. import fitz
  5. #from PIL import Image
  6. class Pdf(Base):
  7. # 提取为json格式
  def json(self):
      """Extract the PDF's text and image blocks into a plain dict.

      Blocks whose top edge lies in the upper 5% of the page, or whose
      bottom edge lies in the lower 5%, are treated as header/footer and
      skipped. Text blocks are flattened to one string (one line of output
      per PDF text line) and passed through ``self.removeDomains``; image
      blocks are written to disk and referenced by file name.

      Returns:
          ``False`` when ``self.file`` is falsy; otherwise a dict
          ``{'page': <page count>, 'content': [<item>, ...]}`` where each
          item is either
          ``{'type': 'text', 'pos': bbox, 'content': str, 'page': n}`` or
          ``{'type': 'image', 'pos': bbox, 'ext': str, 'filename': str,
          'page': n}`` (``page`` is 1-based).
      """
      if not self.file:
          return False
      # Presumably prepares self.param['path'], which is used below as the
      # prefix of the image file names -- confirm against the base class.
      self.getPath()

      min_image_bytes = 100  # smaller payloads are ignored
      min_image_side = 10    # pixels; smaller images are ignored

      # The context manager closes the document even if extraction fails.
      with fitz.open(self.file) as pdf:
          result = {'page': pdf.page_count, 'content': []}
          for page_num in range(pdf.page_count):
              page = pdf.load_page(page_num)
              page_height = page.rect.height
              header_limit = page_height * 0.05
              footer_limit = page_height * 0.95
              blocks = page.get_text("dict", sort=True)["blocks"]
              for i, b in enumerate(blocks):
                  # bbox[1] is the top Y coordinate, bbox[3] the bottom one.
                  bbox = b["bbox"]
                  if bbox[1] < header_limit or bbox[3] > footer_limit:
                      continue

                  if b['type'] == 0:  # text block
                      text_content = "\n".join(
                          "".join(span["text"] for span in line["spans"])
                          for line in b["lines"]
                      ).strip()
                      text_content = self.removeDomains(text_content)
                      if text_content:
                          result['content'].append({
                              "type": "text",
                              "pos": bbox,
                              "content": text_content,
                              "page": page_num + 1
                          })
                  elif b['type'] == 1:  # image block
                      image_bytes = b.get("image", b"")
                      if not image_bytes or len(image_bytes) < min_image_bytes:
                          continue
                      # The "dict" output already carries the image size.
                      # The previous fitz.Pixmap(doc, <bytes>) call always
                      # raised (it expects an xref), and the swallowed
                      # exception disabled this filter entirely.
                      if (b.get("width", min_image_side) < min_image_side
                              or b.get("height", min_image_side) < min_image_side):
                          continue
                      # Use the real format of the embedded bytes instead of
                      # always labelling the file as PNG.
                      image_ext = b.get("ext") or "png"
                      filename = f"{self.param['path']}page{page_num+1}_img_{i}.{image_ext}"
                      with open(filename, "wb") as f:
                          f.write(image_bytes)
                      result['content'].append({
                          "type": "image",
                          "pos": bbox,
                          "ext": image_ext,
                          "filename": filename,
                          "page": page_num + 1,
                      })
      return result
  65. # 提取为langchain的Document格式
  def doc(self):
      """Extract every page as a langchain ``Document``.

      For each page the plain text is combined with the text recognised by
      OCR in the images embedded on that page (page text first, then one
      line per OCR result).

      Returns:
          ``False`` when ``self.file`` is falsy; otherwise a dict
          ``{'page': <page count>, 'content': [Document, ...]}`` with one
          ``langchain.schema.Document`` per page, in page order.
      """
      if not self.file:
          return False
      # NOTE(review): `Image`, `io`, `ocr` and `langchain` are not imported in
      # the visible part of this file (the PIL import is commented out);
      # presumably they come from `from .__load__ import *` -- confirm.

      # The context manager closes the document even if OCR fails.
      with fitz.open(self.file) as pdf:
          # The page count must come from the document: the old code read the
          # not-yet-assigned local `page` here and raised UnboundLocalError.
          result = {'page': pdf.page_count, 'content': []}
          for page_num in range(pdf.page_count):
              page = pdf.load_page(page_num)
              # Plain text of the page.
              text = page.get_text()
              # Text recognised inside the page's embedded images.
              image_texts = []
              for img in page.get_images(full=True):
                  xref = img[0]
                  image_bytes = pdf.extract_image(xref)["image"]
                  image = Image.open(io.BytesIO(image_bytes))
                  ocr_result = ocr.ocr(image)
                  # Skip images for which OCR produced nothing.
                  if not ocr_result or not ocr_result[0]:
                      continue
                  # NOTE(review): line[1] is assumed to be the recognised
                  # string; verify against the OCR engine bound to `ocr`.
                  image_texts.extend(line[1] for line in ocr_result[0])
              # Merge page text and image text.
              full_text = text.strip() + "\n" + "\n".join(image_texts)
              document = langchain.schema.Document(page_content=full_text)
              result['content'].append(document)
      return result