rabin 1 mēnesi atpakaļ
vecāks
revīzija
5262ac4587
1 mainīts fails ar 4 papildinājumiem un 4 dzēšanām
  1. 4 4
      service/extract/docs/pdf.py

+ 4 - 4
service/extract/docs/pdf.py

@@ -13,7 +13,6 @@ class Pdf(Base):
             # 有音频文件
             self.audio = self.param['audio']
             self.audio = Demeter.service('loader', 'extract').get(self.audio).json()
-            #self.audio = [{'start': 0.0, 'end': 0.437, 'text': 'Farm'}, {'start': 0.437, 'end': 1.311, 'text': 'animals'}, {'start': 1.311, 'end': 2.185, 'text': 'written'}, {'start': 2.185, 'end': 2.622, 'text': 'by'}, {'start': 2.622, 'end': 3.496, 'text': 'Cheryl'}, {'start': 3.496, 'end': 3.933, 'text': 'Ryan'}, {'start': 3.933, 'end': 5.681, 'text': 'Illustrated'}, {'start': 5.681, 'end': 6.118, 'text': 'by'}, {'start': 6.118, 'end': 6.555, 'text': 'Nora'}, {'start': 6.555, 'end': 7.43, 'text': 'Buddhist'}, {'start': 9.54, 'end': 10.385, 'text': 'The'}, {'start': 10.385, 'end': 11.23, 'text': 'dog'}, {'start': 12.82, 'end': 13.515, 'text': 'The'}, {'start': 13.515, 'end': 14.21, 'text': 'pig'}, {'start': 15.84, 'end': 16.293, 'text': 'The'}, {'start': 16.293, 'end': 17.2, 'text': 'chicken'}, {'start': 18.87, 'end': 19.67, 'text': 'The'}, {'start': 19.67, 'end': 20.47, 'text': 'goat'}, {'start': 22.06, 'end': 22.82, 'text': 'The'}, {'start': 22.82, 'end': 23.58, 'text': 'cow'}, {'start': 25.19, 'end': 25.87, 'text': 'The'}, {'start': 25.87, 'end': 26.55, 'text': 'duck'}, {'start': 28.36, 'end': 29.055, 'text': 'The'}, {'start': 29.055, 'end': 29.75, 'text': 'sheep'}, {'start': 31.46, 'end': 31.96, 'text': 'The'}, {'start': 31.96, 'end': 32.96, 'text': 'animals'}]
         self.getPath()
         doc = fitz.open(self.file)
         page_count = doc.page_count
@@ -69,18 +68,19 @@ class Pdf(Base):
         return result
 
     def find(self, text, audio):
-        for words in text:
+        for idx, words in enumerate(text):
             if words['page'] <= 1:
                 continue
             if audio['text'].lower() in words['text'].lower():
-                text = {
+                text_item = {
                     "page": words['page'],
                     "text": words['text'],
                     "bbox": words['bbox'],
                     "start": audio['start'],
                     "end": audio['end'],
                 }
-                return text
+                text.pop(idx)  # 根据下标删除,避免重复 dict 的问题
+                return text_item
         return False
 
     # 提取为langchain的Document格式