|
@@ -23,24 +23,19 @@ class Pdf(Base):
|
|
|
page = page_num + 1
|
|
|
page_obj = doc.load_page(page_num)
|
|
|
page_width, page_height = page_obj.rect.width, page_obj.rect.height
|
|
|
-
|
|
|
+
|
|
|
# 提取文字
|
|
|
# 获取每页的 words
|
|
|
# 每个 word 是 [x0, y0, x1, y1, "word_text", block_no, line_no, word_no]
|
|
|
words = page_obj.get_text("words")
|
|
|
for w in words:
|
|
|
x0, y0, x1, y1, text, *_ = w
|
|
|
- # 按 scale 缩放
|
|
|
- x0 *= scale
|
|
|
- y0 *= scale
|
|
|
- x1 *= scale
|
|
|
- y1 *= scale
|
|
|
# 转百分比,方便前端高亮
|
|
|
rel_bbox = [
|
|
|
- (x0 / (page_width * scale)) * 100,
|
|
|
- (y0 / (page_height * scale)) * 100,
|
|
|
- (x1 / (page_width * scale)) * 100,
|
|
|
- (y1 / (page_height * scale)) * 100,
|
|
|
+ (x0 / (page_width)) * 100,
|
|
|
+ (y0 / (page_height)) * 100,
|
|
|
+ (x1 / (page_width)) * 100,
|
|
|
+ (y1 / (page_height)) * 100,
|
|
|
]
|
|
|
result['text'].append({
|
|
|
"page": page,
|
|
@@ -74,18 +69,19 @@ class Pdf(Base):
|
|
|
return result
|
|
|
|
|
|
def find(self, text, audio):
    """Find the first extracted word that contains an audio segment's text.

    Entries whose ``page`` is 1 or lower are skipped. Matching is a
    case-insensitive substring test of ``audio['text']`` against each
    entry's ``text``. The first matching entry is removed from ``text``
    so that a later call cannot match the same entry again.

    Args:
        text: Mutable list of dicts with ``page``, ``text`` and ``bbox``
            keys. The matched entry, if any, is removed in place.
        audio: Dict with ``text``, ``start`` and ``end`` keys.

    Returns:
        A dict with ``page``, ``text``, ``bbox`` (copied from the matched
        entry) and ``start``, ``end`` (copied from ``audio``), or ``False``
        when nothing matches.
    """
    # Loop-invariant: lower-case the search text once instead of per entry.
    needle = audio['text'].lower()
    if not needle:
        # An empty string is a substring of every string, so it would
        # "match" (and consume) the first entry past page 1. Treat it as
        # no match instead.
        return False

    for idx, words in enumerate(text):
        if words['page'] <= 1:
            continue
        if needle in words['text'].lower():
            text_item = {
                "page": words['page'],
                "text": words['text'],
                "bbox": words['bbox'],
                "start": audio['start'],
                "end": audio['end'],
            }
            # Remove by index rather than by value: several entries may be
            # equal dicts, and removing by value would drop the wrong one.
            # Mutating during iteration is safe here because we return
            # immediately afterwards.
            text.pop(idx)
            return text_item
    return False
|
|
|
|
|
|
# 提取为langchain的Document格式
|