pdf.py 4.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127
  1. # -*- coding: utf-8 -*-
  2. from .__load__ import *
  3. import fitz
  4. import re
  5. class Pdf(Base):
  6. def json(self):
  7. if not self.file:
  8. return False
  9. self.audio = ''
  10. if 'audio' in self.param and self.param['audio']:
  11. # 有音频文件
  12. self.audio = self.param['audio']
  13. self.audio = Demeter.service('loader', 'extract').get(self.audio).json()
  14. self.getPath()
  15. doc = fitz.open(self.file)
  16. page_count = doc.page_count
  17. result = {'total': page_count, 'pages': [], 'text': []}
  18. scale = 2.0
  19. for page_num in range(doc.page_count):
  20. page = page_num + 1
  21. page_obj = doc.load_page(page_num)
  22. page_width, page_height = page_obj.rect.width, page_obj.rect.height
  23. # 提取文字
  24. # 获取每页的 words
  25. # 每个 word 是 [x0, y0, x1, y1, "word_text", block_no, line_no, word_no]
  26. words = page_obj.get_text("words")
  27. for w in words:
  28. x0, y0, x1, y1, text, *_ = w
  29. # 转百分比,方便前端高亮
  30. rel_bbox = [
  31. (x0 / (page_width)) * 100,
  32. (y0 / (page_height)) * 100,
  33. (x1 / (page_width)) * 100,
  34. (y1 / (page_height)) * 100,
  35. ]
  36. result['text'].append({
  37. "page": page,
  38. "text": text,
  39. "bbox": rel_bbox
  40. })
  41. # 提取封面图
  42. mat = fitz.Matrix(scale, scale)
  43. pix = page_obj.get_pixmap(matrix=mat, alpha=False)
  44. cover_file = f"{self.param['path']}cover_page_{page_num+1}.png"
  45. pix.save(cover_file)
  46. if 'host' in self.param and self.param['host']:
  47. cover_file = cover_file.replace(Demeter.path + 'runtime/', self.param['host'])
  48. result['pages'].append({
  49. "page": page,
  50. "img": cover_file,
  51. "width": page_width * scale,
  52. "height": page_height * scale,
  53. })
  54. if self.audio:
  55. text = []
  56. for item in self.audio:
  57. state = self.find(result['text'], item);
  58. if state:
  59. text.append(state)
  60. result['text'] = text
  61. return result
  62. def find(self, text, audio):
  63. for idx, words in enumerate(text):
  64. if words['page'] <= 1:
  65. continue
  66. if audio['text'].lower() in words['text'].lower():
  67. text_item = {
  68. "page": words['page'],
  69. "text": words['text'],
  70. "bbox": words['bbox'],
  71. "start": audio['start'],
  72. "end": audio['end'],
  73. }
  74. text.pop(idx) # 根据下标删除,避免重复 dict 的问题
  75. return text_item
  76. return False
  77. # 提取为langchain的Document格式
  78. def doc(self):
  79. if not self.file:
  80. return False
  81. #loader = PyPDFLoader(self.file, extract_images=False)
  82. #return loader.load()
  83. doc = fitz.open(self.file)
  84. result = {'page': page, 'content': []}
  85. for page_num in range(len(doc)):
  86. page = doc.load_page(page_num)
  87. # 提取文本
  88. text = page.get_text()
  89. # 提取图片中的文字
  90. image_texts = []
  91. for img in page.get_images(full=True):
  92. xref = img[0]
  93. base_image = doc.extract_image(xref)
  94. image_bytes = base_image["image"]
  95. image = Image.open(io.BytesIO(image_bytes))
  96. #result = Demeter.service('loader', 'extract').get(image)
  97. ocr_result = ocr.ocr(image)
  98. for line in ocr_result[0]:
  99. image_texts.append(line[1])
  100. '''
  101. # OCR 识别
  102. ocr_result = ocr_reader.readtext(image)
  103. image_texts = " ".join([line[1] for line in ocr_result]).strip()
  104. '''
  105. # 合并文字 + 图片文字
  106. full_text = text.strip() + "\n" + "\n".join(image_texts)
  107. document = langchain.schema.Document(page_content=full_text)
  108. result['content'].append(document)
  109. return result
  110. def clean_text(self, s):
  111. return re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]', '', s)