# -*- coding: utf-8 -*-
"""PDF loader built on PyMuPDF (``fitz``).

Provides :class:`Pdf`, which turns a PDF file into either a JSON-friendly
page/block structure (:meth:`Pdf.json`) or a list of langchain documents
(:meth:`Pdf.doc`).
"""
# The star import stays first so the explicit imports below take precedence.
from .__load__ import *

import io
import re

import fitz

# Decorative / bullet symbols: a line containing any of them is dropped whole.
_BLACKLIST_CHARS = "•★◆●◇▪…※§‡†¤₪"
_BLACKLIST_RE = re.compile(f"[{re.escape(_BLACKLIST_CHARS)}]")
_CJK_RE = re.compile(r"[\u4e00-\u9fff]")
_LATIN_LETTER_RE = re.compile(r"[A-Za-z]")
# "12", "page 12", "page 12 of 30", "12 of 30" or "12/30".
_PAGE_NUMBER_RE = re.compile(r"(?:page)?\s*\d+\s*(?:of\s*\d+)?|\d+\s*/\s*\d+")
# ASCII control characters except tab, line feed and carriage return.
_CONTROL_CHARS_RE = re.compile(r"[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]")

# Blocks starting in the top 2% or ending in the bottom 2% of the page are
# skipped (header / footer band).
_MARGIN_RATIO = 0.02
# Zoom factor used when rendering the per-page thumbnail.
_COVER_ZOOM = 0.3
# Embedded images smaller than this many bytes are ignored.
_MIN_IMAGE_BYTES = 100


def _is_page_number(text):
    """Return True when *text* looks like a bare page-number marker."""
    return _PAGE_NUMBER_RE.fullmatch(text.strip().lower()) is not None


def _is_valid_english(text):
    """Return True when *text* is an acceptable English line.

    A line is rejected when it contains a CJK ideograph (U+4E00-U+9FFF),
    contains any blacklisted symbol, or has no ASCII letter at all.
    """
    if _CJK_RE.search(text):
        return False
    if _BLACKLIST_RE.search(text):
        return False
    return _LATIN_LETTER_RE.search(text) is not None


class Pdf(Base):
    """Extract text and images from the PDF at ``self.file``."""

    def json(self):
        """Return the PDF as a dict of pages with text and image items.

        Returns:
            ``False`` when ``self.file`` is not set; otherwise
            ``{'total': <page count>, 'pages': [...]}`` where every page
            entry is ``{'cover': <thumbnail path or "">, 'content': [...]}``.
            Pages with neither text nor text/image blocks are omitted, so
            ``len(result['pages'])`` may be smaller than ``result['total']``.
        """
        if not self.file:
            return False
        self.getPath()

        doc = fitz.open(self.file)
        try:
            result = {'total': doc.page_count, 'pages': []}
            for page_num in range(doc.page_count):
                page_obj = doc.load_page(page_num)
                page_height = page_obj.rect.height
                text = page_obj.get_text().strip()
                blocks = page_obj.get_text("dict", sort=True)["blocks"]

                has_visible_content = any(b['type'] in (0, 1) for b in blocks)
                if not text and not has_visible_content:
                    continue

                cover_file = self._render_cover(page_obj, page_num)

                top_limit = page_height * _MARGIN_RATIO
                bottom_limit = page_height * (1 - _MARGIN_RATIO)
                page_items = []
                for i, b in enumerate(blocks):
                    y_top = b["bbox"][1]
                    y_bottom = b["bbox"][3]
                    if y_top < top_limit or y_bottom > bottom_limit:
                        continue

                    if b['type'] == 0:
                        item = self._text_item(b)
                    elif b['type'] == 1:
                        item = self._image_item(b, page_num, i)
                    else:
                        item = None
                    if item is not None:
                        page_items.append(item)

                result['pages'].append({
                    "cover": cover_file,
                    "content": page_items,
                })
            return result
        finally:
            # Release the file handle even if extraction fails midway.
            doc.close()

    def _public_path(self, path):
        """Swap the local runtime prefix of *path* for ``param['host']``, if set."""
        if 'host' in self.param and self.param['host']:
            return path.replace(Demeter.path + 'runtime/', self.param['host'])
        return path

    def _render_cover(self, page_obj, page_num):
        """Render a thumbnail PNG of the page; return its path, or "" on failure."""
        # Best effort: a failed thumbnail must not abort the whole extraction.
        try:
            pix = page_obj.get_pixmap(matrix=fitz.Matrix(_COVER_ZOOM, _COVER_ZOOM))
            cover_file = f"{self.param['path']}cover_page_{page_num+1}.png"
            pix.save(cover_file)
            return self._public_path(cover_file)
        except Exception as e:
            print(f"封面图失败: {e}")
            return ""

    def _text_item(self, block):
        """Build the "text" item for a text block, or None when nothing survives."""
        kept_lines = []
        for line in block["lines"]:
            spans = (span["text"].strip() for span in line["spans"])
            line_text = " ".join(
                s for s in spans if s and not _is_page_number(s)
            )
            # Whole-line filter: drop empty lines and lines that contain a
            # blacklisted symbol or are not English text.
            if not line_text or not _is_valid_english(line_text):
                continue
            kept_lines.append(line_text)

        text_content = "\n".join(kept_lines)
        text_content = self.clean_text(self.removeDomains(text_content))
        if not text_content:
            return None
        return {
            "type": "text",
            "content": text_content,
            "pos": block["bbox"],
        }

    def _image_item(self, block, page_num, index):
        """Save an image block to disk and return its "image" item, or None."""
        image_bytes = block.get("image", b"")
        if not image_bytes or len(image_bytes) < _MIN_IMAGE_BYTES:
            return None
        # The bytes are stored in the image's native format, so name the file
        # after the extension PyMuPDF reports rather than assuming PNG.
        image_ext = block.get("ext") or "png"
        image_file = f"{self.param['path']}page{page_num+1}_img_{index}.{image_ext}"
        with open(image_file, "wb") as f:
            f.write(image_bytes)
        return {
            "type": "image",
            "ext": image_ext,
            "content": self._public_path(image_file),
            "pos": block["bbox"],
        }

    # Extract the PDF as langchain Document objects.
    def doc(self):
        """Return page text plus OCR'd image text as langchain documents.

        Returns:
            ``False`` when ``self.file`` is not set; otherwise
            ``{'page': <page count>, 'content': [Document, ...]}`` with one
            document per page.

        NOTE(review): ``Image``, ``ocr`` and ``langchain`` are not imported in
        this file; presumably they come from ``.__load__`` — TODO confirm.
        """
        if not self.file:
            return False

        doc = fitz.open(self.file)
        try:
            # The original read an undefined ``page`` here, which raised on
            # every call; the page count is the assumed intent.
            result = {'page': doc.page_count, 'content': []}
            for page_num in range(doc.page_count):
                page = doc.load_page(page_num)
                # Plain text of the page.
                text = page.get_text()

                # Text recognised inside the page's embedded images.
                image_texts = []
                for img in page.get_images(full=True):
                    xref = img[0]
                    base_image = doc.extract_image(xref)
                    image = Image.open(io.BytesIO(base_image["image"]))
                    ocr_result = ocr.ocr(image)
                    # Skip images for which OCR produced nothing.
                    if not ocr_result or not ocr_result[0]:
                        continue
                    for line in ocr_result[0]:
                        entry = line[1]
                        # NOTE(review): some OCR backends return a
                        # (text, score) pair here — TODO confirm which backend
                        # `ocr` is. A non-string entry would break the join
                        # below, so unwrap it.
                        if isinstance(entry, (tuple, list)):
                            entry = entry[0]
                        image_texts.append(entry)

                # Page text followed by the image text.
                full_text = text.strip() + "\n" + "\n".join(image_texts)
                document = langchain.schema.Document(page_content=full_text)
                result['content'].append(document)
            return result
        finally:
            # Release the file handle even if extraction fails midway.
            doc.close()

    def clean_text(self, s):
        """Return *s* with ASCII control characters removed.

        Tab, line feed and carriage return are kept.
        """
        return _CONTROL_CHARS_RE.sub('', s)