rabin преди 1 месец
родител
ревизия
fd25548a01
променен е 1 файл, в който са добавени 9 реда и са изтрити 13 реда
  1. 9 13
      service/extract/docs/pdf.py

+ 9 - 13
service/extract/docs/pdf.py

@@ -23,24 +23,19 @@ class Pdf(Base):
             page = page_num + 1
             page_obj = doc.load_page(page_num)
             page_width, page_height = page_obj.rect.width, page_obj.rect.height
-            
+
             # 提取文字
             # 获取每页的 words
             # 每个 word 是 [x0, y0, x1, y1, "word_text", block_no, line_no, word_no]
             words = page_obj.get_text("words")
             for w in words:
                 x0, y0, x1, y1, text, *_ = w
-                # 按 scale 缩放
-                x0 *= scale
-                y0 *= scale
-                x1 *= scale
-                y1 *= scale
                 # 转百分比,方便前端高亮
                 rel_bbox = [
-                    (x0 / (page_width * scale)) * 100,
-                    (y0 / (page_height * scale)) * 100,
-                    (x1 / (page_width * scale)) * 100,
-                    (y1 / (page_height * scale)) * 100,
+                    (x0 / (page_width)) * 100,
+                    (y0 / (page_height)) * 100,
+                    (x1 / (page_width)) * 100,
+                    (y1 / (page_height)) * 100,
                 ]
                 result['text'].append({
                     "page": page,
@@ -74,18 +69,19 @@ class Pdf(Base):
         return result
 
     def find(self, text, audio):
-        for words in text:
+        for idx, words in enumerate(text):
             if words['page'] <= 1:
                 continue
             if audio['text'].lower() in words['text'].lower():
-                text = {
+                text_item = {
                     "page": words['page'],
                     "text": words['text'],
                     "bbox": words['bbox'],
                     "start": audio['start'],
                     "end": audio['end'],
                 }
-                return text
+                text.pop(idx)  # 根据下标删除,避免重复 dict 的问题
+                return text_item
         return False
 
     # 提取为langchain的Document格式