|
@@ -23,19 +23,24 @@ class Pdf(Base):
|
|
|
page = page_num + 1
|
|
|
page_obj = doc.load_page(page_num)
|
|
|
page_width, page_height = page_obj.rect.width, page_obj.rect.height
|
|
|
-
|
|
|
+
|
|
|
# 提取文字
|
|
|
# 获取每页的 words
|
|
|
# 每个 word 是 [x0, y0, x1, y1, "word_text", block_no, line_no, word_no]
|
|
|
words = page_obj.get_text("words")
|
|
|
for w in words:
|
|
|
x0, y0, x1, y1, text, *_ = w
|
|
|
+ # 按 scale 缩放
|
|
|
+ x0 *= scale
|
|
|
+ y0 *= scale
|
|
|
+ x1 *= scale
|
|
|
+ y1 *= scale
|
|
|
# 转百分比,方便前端高亮
|
|
|
rel_bbox = [
|
|
|
- (x0 / (page_width)) * 100,
|
|
|
- (y0 / (page_height)) * 100,
|
|
|
- (x1 / (page_width)) * 100,
|
|
|
- (y1 / (page_height)) * 100,
|
|
|
+ (x0 / (page_width * scale)) * 100,
|
|
|
+ (y0 / (page_height * scale)) * 100,
|
|
|
+ (x1 / (page_width * scale)) * 100,
|
|
|
+ (y1 / (page_height * scale)) * 100,
|
|
|
]
|
|
|
result['text'].append({
|
|
|
"page": page,
|