pdf.py 5.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164
  1. # -*- coding: utf-8 -*-
  2. from .__load__ import *
  3. import fitz
  4. from collections import Counter
  5. class Pdf(Base):
  6. def json(self):
  7. if not self.file:
  8. return False
  9. self.getPath()
  10. doc = fitz.open(self.file)
  11. page_count = doc.page_count
  12. result = {'total': page_count, 'pages': []}
  13. # 预扫,找出重复最多的页眉/页脚内容
  14. header_footer_texts = []
  15. for page_num in range(len(doc)):
  16. page_obj = doc.load_page(page_num)
  17. page_height = page_obj.rect.height
  18. blocks = page_obj.get_text("dict", sort=True)["blocks"]
  19. for b in blocks:
  20. if b['type'] != 0:
  21. continue
  22. y_top = b["bbox"][1]
  23. y_bottom = b["bbox"][3]
  24. text_content = ""
  25. for line in b["lines"]:
  26. for span in line["spans"]:
  27. text_content += span["text"]
  28. text_content = text_content.strip()
  29. if not text_content:
  30. continue
  31. # 只考虑页顶和页底 5%
  32. if y_top < page_height * 0.05 or y_bottom > page_height * 0.95:
  33. header_footer_texts.append(text_content)
  34. # 找出最常见的内容(页眉/页脚)
  35. common_texts = [text for text, count in Counter(header_footer_texts).items() if count > 1]
  36. for page_num in range(len(doc)):
  37. page_obj = doc.load_page(page_num)
  38. page_height = page_obj.rect.height
  39. # 空白页跳过
  40. text = page_obj.get_text().strip()
  41. blocks = page_obj.get_text("dict", sort=True)["blocks"]
  42. has_visible_content = any(
  43. b for b in blocks if b['type'] in (0, 1)
  44. )
  45. if not text and not has_visible_content:
  46. continue
  47. # 封面图
  48. try:
  49. pix = page_obj.get_pixmap(matrix=fitz.Matrix(0.3, 0.3))
  50. cover_file = f"{self.param['path']}cover_page_{page_num+1}.png"
  51. pix.save(cover_file)
  52. if 'host' in self.param and self.param['host']:
  53. cover_file = cover_file.replace(Demeter.path + 'runtime/', self.param['host'])
  54. except Exception as e:
  55. print(f"封面图失败: {e}")
  56. cover_file = ""
  57. page_items = []
  58. for i, b in enumerate(blocks):
  59. y_top = b["bbox"][1]
  60. y_bottom = b["bbox"][3]
  61. if b['type'] == 0:
  62. text_content = ""
  63. for line in b["lines"]:
  64. for span in line["spans"]:
  65. text_content += span["text"]
  66. text_content += "\n"
  67. text_content = text_content.strip()
  68. text_content = self.clean_text(self.removeDomains(text_content))
  69. if not text_content:
  70. continue
  71. # 排除页眉/页脚内容
  72. if text_content in common_texts:
  73. continue
  74. page_items.append({
  75. "type": "text",
  76. "content": text_content,
  77. "pos": b["bbox"]
  78. })
  79. elif b['type'] == 1:
  80. image_bytes = b.get("image", b"")
  81. if not image_bytes or len(image_bytes) < 100:
  82. continue
  83. image_ext = "png"
  84. image_file = f"{self.param['path']}page{page_num+1}_img_{i}.{image_ext}"
  85. with open(image_file, "wb") as f:
  86. f.write(image_bytes)
  87. if 'host' in self.param and self.param['host']:
  88. image_file = image_file.replace(Demeter.path + 'runtime/', self.param['host'])
  89. page_items.append({
  90. "type": "image",
  91. "ext": image_ext,
  92. "content": image_file,
  93. "pos": b["bbox"]
  94. })
  95. result['pages'].append({
  96. "cover": cover_file,
  97. "content": page_items
  98. })
  99. return result
  100. # 提取为langchain的Document格式
  101. def doc(self):
  102. if not self.file:
  103. return False
  104. #loader = PyPDFLoader(self.file, extract_images=False)
  105. #return loader.load()
  106. doc = fitz.open(self.file)
  107. result = {'page': page, 'content': []}
  108. for page_num in range(len(doc)):
  109. page = doc.load_page(page_num)
  110. # 提取文本
  111. text = page.get_text()
  112. # 提取图片中的文字
  113. image_texts = []
  114. for img in page.get_images(full=True):
  115. xref = img[0]
  116. base_image = doc.extract_image(xref)
  117. image_bytes = base_image["image"]
  118. image = Image.open(io.BytesIO(image_bytes))
  119. #result = Demeter.service('loader', 'extract').get(image)
  120. ocr_result = ocr.ocr(image)
  121. for line in ocr_result[0]:
  122. image_texts.append(line[1])
  123. '''
  124. # OCR 识别
  125. ocr_result = ocr_reader.readtext(image)
  126. image_texts = " ".join([line[1] for line in ocr_result]).strip()
  127. '''
  128. # 合并文字 + 图片文字
  129. full_text = text.strip() + "\n" + "\n".join(image_texts)
  130. document = langchain.schema.Document(page_content=full_text)
  131. result['content'].append(document)
  132. return result
  133. def clean_text(self, s):
  134. return re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]', '', s)