rabin před 1 měsícem
rodič
revize
04736ffc89
Změněn 1 soubor: 10 přidání a 5 odebrání
  1. 10 5
      service/extract/docs/pdf.py

+ 10 - 5
service/extract/docs/pdf.py

@@ -23,19 +23,24 @@ class Pdf(Base):
             page = page_num + 1
             page_obj = doc.load_page(page_num)
             page_width, page_height = page_obj.rect.width, page_obj.rect.height
-
+            
             # 提取文字
             # 获取每页的 words
             # 每个 word 是 [x0, y0, x1, y1, "word_text", block_no, line_no, word_no]
             words = page_obj.get_text("words")
             for w in words:
                 x0, y0, x1, y1, text, *_ = w
+                # 按 scale 缩放
+                x0 *= scale
+                y0 *= scale
+                x1 *= scale
+                y1 *= scale
                 # 转百分比,方便前端高亮
                 rel_bbox = [
-                    (x0 / (page_width)) * 100,
-                    (y0 / (page_height)) * 100,
-                    (x1 / (page_width)) * 100,
-                    (y1 / (page_height)) * 100,
+                    (x0 / (page_width * scale)) * 100,
+                    (y0 / (page_height * scale)) * 100,
+                    (x1 / (page_width * scale)) * 100,
+                    (y1 / (page_height * scale)) * 100,
                 ]
                 result['text'].append({
                     "page": page,