# -*- coding: utf-8 -*-
from .__load__ import *
import io
#from langchain_community.document_loaders import PyPDFLoader
import fitz
#from PIL import Image

# Blocks whose top edge is above this fraction of the page height are
# treated as page headers and dropped.
_HEADER_RATIO = 0.05
# Blocks whose bottom edge is below this fraction of the page height are
# treated as page footers and dropped.
_FOOTER_RATIO = 0.95
# Embedded images smaller than this many bytes are ignored.
_MIN_IMAGE_BYTES = 100
# Embedded images narrower or shorter than this many pixels are ignored.
_MIN_IMAGE_SIDE = 10


class Pdf(Base):
    """PDF loader built on PyMuPDF (``fitz``).

    Reads ``self.file`` (path of the PDF) and ``self.param`` (a mapping with
    at least ``'path'`` and optionally ``'host'``); both are expected to be
    provided by ``Base`` or the caller.

    NOTE(review): ``doc()`` references ``Image``, ``ocr`` and ``langchain``,
    none of which are imported in this file (the PIL import above is
    commented out). Presumably they are exported by ``.__load__`` — confirm.
    """

    # Extract the PDF as JSON-style data
    def json(self):
        """Extract text and image blocks from every page.

        Blocks lying in the top/bottom 5% of a page are skipped as
        headers/footers. Text blocks are passed through
        ``self.removeDomains``; image blocks are written to disk under
        ``self.param['path']``.

        Returns:
            ``False`` when ``self.file`` is empty, otherwise a dict
            ``{'page': <page count>, 'content': [<item>, ...]}`` where each
            item is either
            ``{'type': 'text', 'pos': bbox, 'content': str, 'page': int}`` or
            ``{'type': 'image', 'pos': bbox, 'ext': str, 'file': str,
            'page': int}``. Page numbers are 1-based.
        """
        if not self.file:
            return False
        # presumably prepares self.param['path'] for the image files — confirm
        self.getPath()

        with fitz.open(self.file) as doc:
            result = {'page': doc.page_count, 'content': []}
            for page_num in range(doc.page_count):
                page = doc.load_page(page_num)
                page_height = page.rect.height
                blocks = page.get_text("dict", sort=True)["blocks"]

                for i, b in enumerate(blocks):
                    # Drop headers/footers: bbox[1] is the top Y coordinate,
                    # bbox[3] is the bottom Y coordinate.
                    y_top = b["bbox"][1]
                    y_bottom = b["bbox"][3]
                    if (y_top < page_height * _HEADER_RATIO
                            or y_bottom > page_height * _FOOTER_RATIO):
                        continue

                    if b['type'] == 0:
                        item = self._textItem(b, page_num + 1)
                    elif b['type'] == 1:
                        item = self._imageItem(b, page_num + 1, i)
                    else:
                        item = None

                    if item:
                        result['content'].append(item)
        return result

    def _textItem(self, b, page_no):
        """Build the output item for a text block, or ``None`` if it is empty."""
        # One output line per PDF line; spans inside a line are concatenated.
        text_content = "\n".join(
            "".join(span["text"] for span in line["spans"])
            for line in b["lines"]
        ).strip()
        text_content = self.removeDomains(text_content)
        if not text_content:
            return None
        return {
            "type": "text",
            "pos": b["bbox"],
            "content": text_content,
            "page": page_no
        }

    def _imageItem(self, b, page_no, index):
        """Save an image block to disk and build its output item.

        Returns ``None`` for images that are missing, tiny in byte size, or
        smaller than the minimum pixel dimensions.
        """
        image_bytes = b.get("image", b"")
        if not image_bytes or len(image_bytes) < _MIN_IMAGE_BYTES:
            return None

        # The image block already carries its pixel size. The previous code
        # built fitz.Pixmap(doc, <bytes>), which always raised and was
        # silently swallowed, so tiny images were never filtered out.
        # A missing dimension keeps the image rather than dropping it.
        width = b.get("width", _MIN_IMAGE_SIDE)
        height = b.get("height", _MIN_IMAGE_SIDE)
        if width < _MIN_IMAGE_SIDE or height < _MIN_IMAGE_SIDE:
            return None

        # The bytes are written unchanged, so name the file after the real
        # format reported by PyMuPDF instead of always claiming PNG.
        image_ext = b.get("ext") or "png"
        file = f"{self.param['path']}page{page_no}_img_{index}.{image_ext}"
        with open(file, "wb") as f:
            f.write(image_bytes)

        # Swap the local runtime prefix for the configured host, if any.
        if 'host' in self.param and self.param['host']:
            file = file.replace(Demeter.path + 'runtime/', self.param['host'])

        return {
            "type": "image",
            "pos": b["bbox"],
            "ext": image_ext,
            "file": file,
            "page": page_no,
        }

    # Extract the PDF as langchain Document objects
    def doc(self):
        """Extract each page as a langchain ``Document`` (page text + OCR text).

        For every page the plain text is combined with the text recognised
        by ``ocr.ocr`` in each embedded image.

        Returns:
            ``False`` when ``self.file`` is empty, otherwise a dict
            ``{'page': <page count>, 'content': [Document, ...]}`` with one
            document per page.
        """
        if not self.file:
            return False
        #loader = PyPDFLoader(self.file, extract_images=False)
        #return loader.load()

        with fitz.open(self.file) as doc:
            # The page count must come from the document; the previous code
            # read the not-yet-assigned local `page` here and crashed.
            result = {'page': doc.page_count, 'content': []}
            for page_num in range(doc.page_count):
                page = doc.load_page(page_num)

                # Plain page text
                text = page.get_text()

                # Text recognised inside embedded images
                image_texts = []
                for img in page.get_images(full=True):
                    xref = img[0]
                    base_image = doc.extract_image(xref)
                    image = Image.open(io.BytesIO(base_image["image"]))
                    #result = Demeter.service('loader', 'extract').get(image)
                    ocr_result = ocr.ocr(image)
                    # Guard against an empty result or a None first entry
                    # (no text found in the image).
                    ocr_lines = ocr_result[0] if ocr_result else None
                    for line in ocr_lines or []:
                        recognized = line[1]
                        # NOTE(review): if the OCR engine returns
                        # (text, confidence) pairs here, keep only the text
                        # so the join below receives strings — confirm
                        # against the actual `ocr` backend.
                        if isinstance(recognized, (tuple, list)):
                            recognized = recognized[0]
                        image_texts.append(recognized)
                    # Alternative OCR backend kept for reference:
                    #   ocr_result = ocr_reader.readtext(image)
                    #   image_texts = " ".join([line[1] for line in ocr_result]).strip()

                # Merge page text + image text
                full_text = text.strip() + "\n" + "\n".join(image_texts)
                document = langchain.schema.Document(page_content=full_text)
                result['content'].append(document)
        return result