pdf.py 5.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132
  1. # -*- coding: utf-8 -*-
  2. from .__load__ import *
  3. import fitz
  4. import re
  5. class Pdf(Base):
  6. def json(self):
  7. if not self.file:
  8. return False
  9. self.audio = ''
  10. if 'audit' in self.param:
  11. # 有音频文件
  12. self.audio = self.param['audio']
  13. #self.audio = Demeter.service('loader', 'extract').get(self.audio).json()
  14. self.audio = [{'start': 0.0, 'end': 0.437, 'text': 'Farm'}, {'start': 0.437, 'end': 1.311, 'text': 'animals'}, {'start': 1.311, 'end': 2.185, 'text': 'written'}, {'start': 2.185, 'end': 2.622, 'text': 'by'}, {'start': 2.622, 'end': 3.496, 'text': 'Cheryl'}, {'start': 3.496, 'end': 3.933, 'text': 'Ryan'}, {'start': 3.933, 'end': 5.681, 'text': 'Illustrated'}, {'start': 5.681, 'end': 6.118, 'text': 'by'}, {'start': 6.118, 'end': 6.555, 'text': 'Nora'}, {'start': 6.555, 'end': 7.43, 'text': 'Buddhist'}, {'start': 9.54, 'end': 10.385, 'text': 'The'}, {'start': 10.385, 'end': 11.23, 'text': 'dog'}, {'start': 12.82, 'end': 13.515, 'text': 'The'}, {'start': 13.515, 'end': 14.21, 'text': 'pig'}, {'start': 15.84, 'end': 16.293, 'text': 'The'}, {'start': 16.293, 'end': 17.2, 'text': 'chicken'}, {'start': 18.87, 'end': 19.67, 'text': 'The'}, {'start': 19.67, 'end': 20.47, 'text': 'goat'}, {'start': 22.06, 'end': 22.82, 'text': 'The'}, {'start': 22.82, 'end': 23.58, 'text': 'cow'}, {'start': 25.19, 'end': 25.87, 'text': 'The'}, {'start': 25.87, 'end': 26.55, 'text': 'duck'}, {'start': 28.36, 'end': 29.055, 'text': 'The'}, {'start': 29.055, 'end': 29.75, 'text': 'sheep'}, {'start': 31.46, 'end': 31.96, 'text': 'The'}, {'start': 31.96, 'end': 32.96, 'text': 'animals'}]
  15. self.getPath()
  16. doc = fitz.open(self.file)
  17. page_count = doc.page_count
  18. result = {'total': page_count, 'pages': [], 'text': []}
  19. scale = 2.0
  20. for page_num in range(doc.page_count):
  21. page = page_num + 1
  22. page_obj = doc.load_page(page_num)
  23. page_width, page_height = page_obj.rect.width, page_obj.rect.height
  24. # 提取文字
  25. # 获取每页的 words
  26. # 每个 word 是 [x0, y0, x1, y1, "word_text", block_no, line_no, word_no]
  27. words = page_obj.get_text("words")
  28. for w in words:
  29. x0, y0, x1, y1, text, *_ = w
  30. # 按 scale 缩放
  31. x0 *= scale
  32. y0 *= scale
  33. x1 *= scale
  34. y1 *= scale
  35. # 转百分比,方便前端高亮
  36. rel_bbox = [
  37. (x0 / (page_width * scale)) * 100,
  38. (y0 / (page_height * scale)) * 100,
  39. (x1 / (page_width * scale)) * 100,
  40. (y1 / (page_height * scale)) * 100,
  41. ]
  42. result['text'].append({
  43. "page": page,
  44. "text": text,
  45. "bbox": rel_bbox
  46. })
  47. # 提取封面图
  48. mat = fitz.Matrix(scale, scale)
  49. pix = page_obj.get_pixmap(matrix=mat, alpha=False)
  50. cover_file = f"{self.param['path']}cover_page_{page_num+1}.png"
  51. pix.save(cover_file)
  52. if 'host' in self.param and self.param['host']:
  53. cover_file = cover_file.replace(Demeter.path + 'runtime/', self.param['host'])
  54. result['pages'].append({
  55. "page": page,
  56. "img": cover_file,
  57. "width": page_width * scale,
  58. "height": page_height * scale,
  59. })
  60. if self.audio:
  61. text = []
  62. for item in self.audio:
  63. state = self.find(result['text'], item);
  64. if state:
  65. text.append(state)
  66. result['text'] = text
  67. return result
  68. def find(self, text, audio):
  69. for words in text:
  70. if words['page'] <= 1:
  71. continue
  72. if audio['text'].lower() in words['text'].lower():
  73. text = {
  74. "page": words['page'],
  75. "text": words['text'],
  76. "bbox": words['bbox'],
  77. "start": audio['start'],
  78. "end": audio['end'],
  79. }
  80. return text
  81. return False
  82. # 提取为langchain的Document格式
  83. def doc(self):
  84. if not self.file:
  85. return False
  86. #loader = PyPDFLoader(self.file, extract_images=False)
  87. #return loader.load()
  88. doc = fitz.open(self.file)
  89. result = {'page': page, 'content': []}
  90. for page_num in range(len(doc)):
  91. page = doc.load_page(page_num)
  92. # 提取文本
  93. text = page.get_text()
  94. # 提取图片中的文字
  95. image_texts = []
  96. for img in page.get_images(full=True):
  97. xref = img[0]
  98. base_image = doc.extract_image(xref)
  99. image_bytes = base_image["image"]
  100. image = Image.open(io.BytesIO(image_bytes))
  101. #result = Demeter.service('loader', 'extract').get(image)
  102. ocr_result = ocr.ocr(image)
  103. for line in ocr_result[0]:
  104. image_texts.append(line[1])
  105. '''
  106. # OCR 识别
  107. ocr_result = ocr_reader.readtext(image)
  108. image_texts = " ".join([line[1] for line in ocr_result]).strip()
  109. '''
  110. # 合并文字 + 图片文字
  111. full_text = text.strip() + "\n" + "\n".join(image_texts)
  112. document = langchain.schema.Document(page_content=full_text)
  113. result['content'].append(document)
  114. return result
  115. def clean_text(self, s):
  116. return re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]', '', s)