pdf.py 4.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133
  1. # -*- coding: utf-8 -*-
  2. from .__load__ import *
  3. #from langchain_community.document_loaders import PyPDFLoader
  4. import fitz
  5. #from PIL import Image
  6. class Pdf(Base):
  def json(self):
      """Extract the PDF at ``self.file`` into a JSON-serialisable dict.

      For every non-blank page a thumbnail ("cover") image is rendered and
      the page's text and image blocks are collected in reading order
      (``sort=True``). Image blocks are written to disk under
      ``self.param['path']``.

      Returns:
          ``False`` when ``self.file`` is falsy, otherwise
          ``{'total': <page count>, 'pages': [{'cover': str, 'content': [...]}]}``.
          ``total`` counts every page of the document, while ``pages`` only
          holds the pages that were not skipped as blank.
      """
      if not self.file:
          return False
      # NOTE(review): getPath() is defined outside this block; presumably it
      # prepares self.param['path'] -- confirm against Base.
      self.getPath()
      doc = fitz.open(self.file)
      try:
          result = {'total': doc.page_count, 'pages': []}
          for page_num in range(doc.page_count):
              page_obj = doc.load_page(page_num)
              # Extract the block structure once and reuse it for both the
              # blank-page test and the content extraction.
              blocks = page_obj.get_text("dict", sort=True)["blocks"]
              has_visible_content = any(b['type'] in (0, 1) for b in blocks)
              # Skip blank pages: no text/image block and no plain text.
              if not has_visible_content and not page_obj.get_text().strip():
                  continue
              result['pages'].append({
                  "cover": self._pdf_cover(page_obj, page_num),
                  "content": self._pdf_page_items(
                      blocks, page_obj.rect.height, page_num
                  ),
              })
          return result
      finally:
          # Always release the underlying file handle, even on errors.
          doc.close()

  def _pdf_public_path(self, path):
      """Swap the local runtime prefix for ``self.param['host']`` when a host is set."""
      if 'host' in self.param and self.param['host']:
          return path.replace(Demeter.path + 'runtime/', self.param['host'])
      return path

  def _pdf_cover(self, page_obj, page_num):
      """Render a 0.3x-scaled PNG of the page; return its path, or "" on failure."""
      try:
          pix = page_obj.get_pixmap(matrix=fitz.Matrix(0.3, 0.3))
          cover_file = f"{self.param['path']}cover_page_{page_num+1}.png"
          pix.save(cover_file)
          return self._pdf_public_path(cover_file)
      except Exception as e:
          # Best effort: a missing cover must not abort the whole extraction.
          print(f"封面图失败: {e}")
          return ""

  def _pdf_page_items(self, blocks, page_height, page_num):
      """Convert one page's text/image blocks into the ``content`` item list."""
      # Short blocks touching the top/bottom 2% of the page are dropped
      # (presumably running headers, footers and page numbers).
      edge_ratio = 0.02
      max_edge_block_height = 20
      items = []
      for i, b in enumerate(blocks):
          y_top = b["bbox"][1]
          y_bottom = b["bbox"][3]
          near_edge = (
              y_top < page_height * edge_ratio
              or y_bottom > page_height * (1 - edge_ratio)
          )
          if near_edge and (y_bottom - y_top) < max_edge_block_height:
              continue
          if b['type'] == 0:
              # One output line per block line; spans of a line are joined.
              raw_text = "\n".join(
                  "".join(span["text"] for span in line["spans"])
                  for line in b["lines"]
              ).strip()
              text_content = self.clean_text(self.removeDomains(raw_text))
              if text_content:
                  items.append({
                      "type": "text",
                      "content": text_content,
                      "pos": b["bbox"],
                  })
          elif b['type'] == 1:
              image_bytes = b.get("image", b"")
              # Ignore missing or tiny (< 100 bytes) image payloads.
              if not image_bytes or len(image_bytes) < 100:
                  continue
              # Use the format reported by the block so the file extension
              # matches the bytes written; fall back to the former "png".
              image_ext = b.get("ext") or "png"
              # The index i is the block's position on the page, which keeps
              # file names unique per page.
              image_file = f"{self.param['path']}page{page_num+1}_img_{i}.{image_ext}"
              with open(image_file, "wb") as f:
                  f.write(image_bytes)
              items.append({
                  "type": "image",
                  "ext": image_ext,
                  "content": self._pdf_public_path(image_file),
                  "pos": b["bbox"],
              })
      return items
  78. # 提取为langchain的Document格式
  def doc(self):
      """Extract each page of the PDF at ``self.file`` as a langchain Document.

      The page's plain text is combined with text recognised (OCR) in the
      images embedded on that page, and wrapped in one
      ``langchain.schema.Document`` per page.

      Returns:
          ``False`` when ``self.file`` is falsy, otherwise
          ``{'page': <page count>, 'content': [Document, ...]}``.

      NOTE(review): ``Image``, ``ocr`` and ``langchain`` are not imported in
      the visible part of this file; presumably they come from
      ``from .__load__ import *`` -- confirm.
      """
      import io  # stdlib; local import so the block does not rely on the star import

      if not self.file:
          return False
      doc = fitz.open(self.file)
      try:
          # The original read the loop variable ``page`` here before it was
          # ever assigned, which raised UnboundLocalError on every call.
          # NOTE(review): the page count is assumed to be the intended value
          # (mirrors 'total' in json()) -- confirm with callers.
          result = {'page': doc.page_count, 'content': []}
          for page_num in range(doc.page_count):
              page = doc.load_page(page_num)
              # Plain text of the page.
              text = page.get_text()
              # Text recognised inside the page's embedded images.
              image_texts = []
              for img in page.get_images(full=True):
                  xref = img[0]
                  base_image = doc.extract_image(xref)
                  image = Image.open(io.BytesIO(base_image["image"]))
                  ocr_result = ocr.ocr(image)
                  # Nothing recognised in this image: skip instead of
                  # iterating over None / an empty result.
                  if not ocr_result or not ocr_result[0]:
                      continue
                  for line in ocr_result[0]:
                      recognised = line[1]
                      # NOTE(review): some OCR engines return (text, score)
                      # pairs here; keep only the text so the join below
                      # cannot fail on a non-string -- confirm engine output.
                      if isinstance(recognised, (tuple, list)):
                          recognised = recognised[0]
                      image_texts.append(recognised)
              # Merge page text and image text.
              full_text = text.strip() + "\n" + "\n".join(image_texts)
              document = langchain.schema.Document(page_content=full_text)
              result['content'].append(document)
          return result
      finally:
          # Always release the underlying file handle, even on errors.
          doc.close()
  def clean_text(self, s):
      """Return ``s`` with ASCII control characters removed.

      Removes U+0000-U+0008, U+000B, U+000C, U+000E-U+001F and U+007F.
      Tab, line feed and carriage return are left in place.
      """
      control_chars = re.compile(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]')
      cleaned = control_chars.sub('', s)
      return cleaned