1234567891011121314151617181920212223242526272829 |
- # -*- coding: utf-8 -*-
- from .__load__ import *
- from langchain_community.document_loaders import UnstructuredImageLoader
- from langchain.schema import Document
- from aip import AipOcr
- #from PIL import Image
- #from cnocr import CnOcr
- #from pix2text import Pix2Text, merge_line_texts
class Img(object):
    """Image loader that extracts text from a picture via Baidu OCR."""

    def run(self, file, param=None):
        """Recognize the text in an image file and wrap it in a Document.

        Args:
            file: Path of the image file; it is opened in binary mode.
            param: Optional dict of extra loader options. It is not used by
                the Baidu OCR implementation and is kept only so the call
                signature stays compatible (the earlier
                UnstructuredImageLoader-based implementation forwarded it
                as keyword arguments).

        Returns:
            A ``Document`` whose ``page_content`` holds every recognized
            line followed by a newline, and whose metadata is
            ``{"source": "ocr"}``. When the OCR response carries no
            ``words_result`` key, ``page_content`` is an empty string.
        """
        # A None sentinel replaces the former mutable ``{}`` default, which
        # would have been one dict object shared by every call.
        if param is None:
            param = {}

        with open(file, 'rb') as image_file:
            image_data = image_file.read()

        # Initialize the Baidu OCR client from the project configuration.
        ocr_config = Demeter.config['baiduocr']
        client = AipOcr(
            ocr_config['app_id'],
            ocr_config['api_key'],
            ocr_config['secret_key'],
        )

        # Call the Baidu general-purpose OCR endpoint to recognize the text.
        result = client.basicGeneral(image_data)

        # Best effort: a response without 'words_result' yields empty text
        # instead of raising.
        text = ''
        if 'words_result' in result:
            text = ''.join(
                item['words'] + '\n' for item in result['words_result']
            )

        return Document(page_content=text, metadata={"source": "ocr"})
|