|
@@ -7,129 +7,86 @@ class Pdf(Base):
|
|
|
def json(self):
    """Render every page of the PDF to PNG and collect per-word boxes.

    Returns:
        False when ``self.file`` is empty. Otherwise a dict with:

        * ``total`` - number of pages in the document.
        * ``pages`` - one entry per page holding the 1-based ``page``
          number, the rendered image path/URL in ``img`` and the rendered
          ``width``/``height`` (page size multiplied by the render scale).
        * ``text``  - one entry per word with its 1-based ``page``, the
          word ``text`` and ``bbox`` as ``[x0, y0, x1, y1]`` expressed in
          percent of the page width/height. When an audio transcript is
          available this list is replaced by the transcript items that
          ``self.find`` could match, each carrying ``start``/``end``.

    Side effects: sets ``self.audio``, calls ``self.getPath()`` and writes
    ``cover_page_<n>.png`` files under ``self.param['path']``.
    """
    if not self.file:
        return False

    self.audio = ''
    # The original tested for the key 'audit' but then read 'audio', so a
    # supplied audio file was ignored (or raised KeyError).
    if self.param.get('audio'):
        self.audio = self.param['audio']
        # TODO(review): hard-coded debug transcript. The source has the real
        # call commented out:
        #   Demeter.service('loader', 'extract').get(self.audio).json()
        # Until that is wired in, every audio request is aligned against
        # this fixture instead of the supplied file.
        self.audio = [
            {'start': 0.0, 'end': 0.437, 'text': 'Farm'},
            {'start': 0.437, 'end': 1.311, 'text': 'animals'},
            {'start': 1.311, 'end': 2.185, 'text': 'written'},
            {'start': 2.185, 'end': 2.622, 'text': 'by'},
            {'start': 2.622, 'end': 3.496, 'text': 'Cheryl'},
            {'start': 3.496, 'end': 3.933, 'text': 'Ryan'},
            {'start': 3.933, 'end': 5.681, 'text': 'Illustrated'},
            {'start': 5.681, 'end': 6.118, 'text': 'by'},
            {'start': 6.118, 'end': 6.555, 'text': 'Nora'},
            {'start': 6.555, 'end': 7.43, 'text': 'Buddhist'},
            {'start': 9.54, 'end': 10.385, 'text': 'The'},
            {'start': 10.385, 'end': 11.23, 'text': 'dog'},
            {'start': 12.82, 'end': 13.515, 'text': 'The'},
            {'start': 13.515, 'end': 14.21, 'text': 'pig'},
            {'start': 15.84, 'end': 16.293, 'text': 'The'},
            {'start': 16.293, 'end': 17.2, 'text': 'chicken'},
            {'start': 18.87, 'end': 19.67, 'text': 'The'},
            {'start': 19.67, 'end': 20.47, 'text': 'goat'},
            {'start': 22.06, 'end': 22.82, 'text': 'The'},
            {'start': 22.82, 'end': 23.58, 'text': 'cow'},
            {'start': 25.19, 'end': 25.87, 'text': 'The'},
            {'start': 25.87, 'end': 26.55, 'text': 'duck'},
            {'start': 28.36, 'end': 29.055, 'text': 'The'},
            {'start': 29.055, 'end': 29.75, 'text': 'sheep'},
            {'start': 31.46, 'end': 31.96, 'text': 'The'},
            {'start': 31.96, 'end': 32.96, 'text': 'animals'},
        ]

    self.getPath()
    doc = fitz.open(self.file)
    try:
        page_count = doc.page_count
        result = {'total': page_count, 'pages': [], 'text': []}
        scale = 2.0  # render factor for the page images
        host = self.param.get('host')

        for page_num in range(page_count):
            page = page_num + 1  # 1-based page number exposed to callers
            page_obj = doc.load_page(page_num)
            page_width = page_obj.rect.width
            page_height = page_obj.rect.height

            # Each word is (x0, y0, x1, y1, "word_text", block_no, line_no,
            # word_no). The bbox is stored as a percentage of the page size
            # so the front end can highlight it at any zoom level; the
            # render scale cancels out of that ratio and is not applied.
            for x0, y0, x1, y1, word, *_ in page_obj.get_text("words"):
                result['text'].append({
                    "page": page,
                    "text": word,
                    "bbox": [
                        x0 / page_width * 100,
                        y0 / page_height * 100,
                        x1 / page_width * 100,
                        y1 / page_height * 100,
                    ],
                })

            # Render the whole page as an opaque PNG.
            pix = page_obj.get_pixmap(matrix=fitz.Matrix(scale, scale), alpha=False)
            cover_file = f"{self.param['path']}cover_page_{page}.png"
            pix.save(cover_file)
            if host:
                # Swap the local runtime prefix for the configured host.
                cover_file = cover_file.replace(Demeter.path + 'runtime/', host)

            result['pages'].append({
                "page": page,
                "img": cover_file,
                "width": page_width * scale,
                "height": page_height * scale,
            })
    finally:
        # The original never released the document handle.
        doc.close()

    if self.audio:
        # Keep only transcript items that could be matched to a PDF word;
        # self.find returns a falsy value when there is no match.
        matched = (self.find(result['text'], item) for item in self.audio)
        result['text'] = [hit for hit in matched if hit]

    return result
|
|
|
|
|
|
def find(self, text, audio):
    """Locate the first PDF word that contains a transcript word.

    ``text`` is a list of word dicts with ``page``, ``text`` and ``bbox``;
    ``audio`` is one transcript item with ``text``, ``start`` and ``end``.
    Words on page 1 (or lower) are ignored. The comparison is a
    case-insensitive substring test. Returns a dict combining the matched
    word's ``page``/``text``/``bbox`` with the item's ``start``/``end``,
    or ``False`` when nothing matches.
    """
    hit = next(
        (
            entry
            for entry in text
            if entry['page'] > 1
            and audio['text'].lower() in entry['text'].lower()
        ),
        None,
    )
    if hit is None:
        return False
    return {
        "page": hit['page'],
        "text": hit['text'],
        "bbox": hit['bbox'],
        "start": audio['start'],
        "end": audio['end'],
    }
|
|
|
|
|
|
# 提取为langchain的Document格式
|
|
|
def doc(self):
|