rabin há 1 mês atrás
pai
commit
464cbee758

+ 1 - 0
model/extract.py

@@ -9,6 +9,7 @@ class Extract(Model):
     uid = Fields(type='varchar(200)', comment='上传者')
     source_id = Fields(type='int(11)', comment='源文件id')
     source = Fields(type='varchar(500)', comment='源文件')
+    audio = Fields(type='varchar(500)', comment='语音文件')
     notify = Fields(type='varchar(2000)', comment='回调地址')
     name = Fields(type='varchar(200)', comment='文件名')
     page = Fields(type='int(11)', comment='页数')

+ 109 - 2
pdf.py

@@ -8,11 +8,118 @@ Demeter.getopt(param)
 
 #file = Demeter.option['file']
 file = '/data/dm/container/web/diviner/data/test.pdf'
+audio = 'http://s.fxjk.vip/f4/af8bf6ebcf837c717f2aeabe6fd618.mp3'
 # pdf提取功能
 
 # 直接提取
-result = Demeter.service('loader', 'extract').get(file).json()
+#result = Demeter.service('loader', 'extract').get(file, {'aud1io':audio}).json()
 
 # 用通用方法 同步提取并记录已提取,下次直接用提取后的内容
-#result = Demeter.service('loader').get(obj='parser', module='extract', sync=True, site_id=1, uid=1, source_id=1, source=file, method='json')
+result = Demeter.service('loader').get(obj='parser', module='extract', sync=True, site_id=1, uid=1, source_id=1, source=file, audio=audio, method='json')
 print(result)
+
+# NOTE: a stand-alone Aliyun file-transcription sample (a `fileTrans` helper kept
+# inside a module-level string literal, so it was never executed) was dropped
+# from this script: it was dead code and embedded an AccessKey id/secret in
+# plain text. The transcription logic used by the service lives in
+# service/extract/docs/audio.py (class Audio).

+ 2 - 1
requirements.txt

@@ -15,4 +15,5 @@ openai
 faiss-cpu
 transformers
 edge-tts>=6.1.3
-pysrt>=1.1.2
+pysrt>=1.1.2
+aliyun-python-sdk-core==2.13.3

+ 2 - 1
service/callback.py

@@ -3,7 +3,7 @@ from .__load__ import *
 # 回调
 class Callback(object):
 
-    def send(self, site_id, param):
+    def send(self, site_id, method, param):
         model = Demeter.model('site')
         model.id = site_id
         site = model.select(type='fetchone')
@@ -11,6 +11,7 @@ class Callback(object):
             site['api'] = param['notify']
         if 'api' in site and site['api']:
             api = site['api']
+            param['method'] = method
             param['appid'] = site['appid']
             param['appsecret'] = site['appsecret']
             param['timestamp'] = Demeter.time()

+ 95 - 0
service/extract/docs/audio.py

@@ -0,0 +1,95 @@
+# -*- coding: utf-8 -*-
+from .__load__ import *
+import json
+import os
+import time
+from aliyunsdkcore.acs_exception.exceptions import ClientException, ServerException
+from aliyunsdkcore.client import AcsClient
+from aliyunsdkcore.request import CommonRequest
+
+class Audio(Base):
+    """Speech-to-text extractor built on the Aliyun NLS file-transcription API.
+
+    ``json()`` submits ``self.file`` as the ``file_link`` of a ``SubmitTask``
+    request, polls ``GetTaskResult`` until the task leaves the
+    RUNNING/QUEUEING states, and flattens the returned ``Result.Words``
+    entries into word-level timings.
+    """
+
+    # Seconds slept between two GetTaskResult polls.
+    POLL_INTERVAL = 5
+    # Status texts meaning the task has not finished yet.
+    PENDING_STATUSES = ('RUNNING', 'QUEUEING')
+
+    def __init__(self):
+        # Credentials are read from the environment first so they can be
+        # rotated without a code change.
+        # NOTE(review): the literal fallbacks are kept only so existing
+        # deployments keep working; they look like real credentials committed
+        # to the repository and should be rotated and then removed.
+        self.akId = os.environ.get('ALIYUN_NLS_AK_ID', 'LTAI5tCFiVxuXz39MMkXFcMm')
+        self.akSecret = os.environ.get('ALIYUN_NLS_AK_SECRET', 'sdK3jVSrrqzz2nONAGyd6kvXZZwkie')
+        self.appKey = os.environ.get('ALIYUN_NLS_APPKEY', 'm0mDna21AWao7b0A')
+        self.region = 'cn-beijing'
+        self.client = AcsClient(self.akId, self.akSecret, self.region)
+
+        # Fixed API configuration
+        self.PRODUCT = 'nls-filetrans'
+        self.DOMAIN = f'filetrans.{self.region}.aliyuncs.com'
+        self.API_VERSION = '2018-08-17'
+        self.POST_REQUEST_ACTION = 'SubmitTask'
+        self.GET_REQUEST_ACTION = 'GetTaskResult'
+
+    def json(self):
+        """Transcribe ``self.file`` and return word-level timings.
+
+        Returns:
+            ``False`` when ``self.file`` is empty; otherwise a list of
+            ``{"start": seconds, "end": seconds, "text": word}`` dicts (empty
+            when the response carries no ``Result.Words``).
+
+        Raises:
+            Exception: when the SDK call fails, when the task cannot be
+                submitted, or when the final status is not ``SUCCESS``.
+        """
+        if not self.file:
+            return False
+
+        taskId = self._submitTask()
+        result_json = self._waitResult(taskId)
+        return self._toWords(result_json)
+
+    def _nlsRequest(self, action, method):
+        """Build a CommonRequest for ``action`` with the shared API settings."""
+        request = CommonRequest()
+        request.set_domain(self.DOMAIN)
+        request.set_version(self.API_VERSION)
+        request.set_product(self.PRODUCT)
+        request.set_action_name(action)
+        request.set_method(method)
+        return request
+
+    def _nlsCall(self, request, label):
+        """Send ``request`` and decode the JSON reply; SDK errors are re-raised with ``label``."""
+        try:
+            raw = self.client.do_action_with_exception(request)
+        except (ServerException, ClientException) as e:
+            # Keep the original SDK error attached as the cause.
+            raise Exception(f'{label}: {str(e)}') from e
+        return json.loads(raw)
+
+    def _submitTask(self):
+        """Submit the transcription task and return its ``TaskId``."""
+        task = {
+            'appkey': self.appKey,
+            'file_link': self.file,
+            'version': '4.0',
+            'enable_words': True,
+            'auto_split': True,
+        }
+        postRequest = self._nlsRequest(self.POST_REQUEST_ACTION, 'POST')
+        postRequest.add_body_params('Task', json.dumps(task))
+
+        postResponse = self._nlsCall(postRequest, '提交任务异常')
+        if postResponse.get('StatusText') != 'SUCCESS':
+            raise Exception(f'提交任务失败: {postResponse}')
+        return postResponse['TaskId']
+
+    def _waitResult(self, taskId):
+        """Poll ``GetTaskResult`` until the task finishes and return the final response."""
+        getRequest = self._nlsRequest(self.GET_REQUEST_ACTION, 'GET')
+        getRequest.add_query_param('TaskId', taskId)
+
+        # NOTE(review): there is no upper bound on the polling time; a task
+        # that never leaves RUNNING/QUEUEING blocks the caller indefinitely.
+        while True:
+            result_json = self._nlsCall(getRequest, '查询任务异常')
+            statusText = result_json.get('StatusText', '')
+            if statusText not in self.PENDING_STATUSES:
+                break
+            time.sleep(self.POLL_INTERVAL)
+
+        if statusText != 'SUCCESS':
+            raise Exception(f'识别失败: {result_json}')
+        return result_json
+
+    def _toWords(self, result_json):
+        """Flatten ``Result.Words`` into a list of start/end/text dicts (times in seconds)."""
+        # ``Result`` may be missing or null; treat both as "no words".
+        words = (result_json.get('Result') or {}).get('Words') or []
+        final_result = []
+        for w in words:
+            text = w.get('Word', '').strip()
+            if not text:
+                continue
+            final_result.append({
+                # BeginTime / EndTime are divided by 1000: milliseconds -> seconds
+                "start": w.get('BeginTime', 0) / 1000.0,
+                "end": w.get('EndTime', 0) / 1000.0,
+                "text": text
+            })
+        return final_result

+ 70 - 113
service/extract/docs/pdf.py

@@ -7,129 +7,86 @@ class Pdf(Base):
     def json(self):
+        """Extract page images and word boxes from the PDF in ``self.file``.
+
+        Every page is rendered to ``cover_page_<n>.png`` under
+        ``self.param['path']`` and every word is recorded with a bounding box
+        expressed as a percentage of the page width/height. When
+        ``self.param['audio']`` is set, the audio is transcribed through the
+        extract loader and ``result['text']`` is replaced by the words matched
+        to an audio word (each carrying ``start``/``end`` timings).
+
+        Returns:
+            ``False`` when ``self.file`` is empty, otherwise a dict with the
+            keys ``total`` (page count), ``pages`` and ``text``.
+
+        Raises:
+            Whatever the audio extractor raises when transcription fails.
+        """
         if not self.file:
             return False
+
+        # Word-level audio timings; stays empty when no audio file was given.
+        self.audio = []
+        audio_src = self.param.get('audio')
+        if audio_src:
+            self.audio = Demeter.service('loader', 'extract').get(audio_src).json()
         self.getPath()
         doc = fitz.open(self.file)
         page_count = doc.page_count
-        result = {'total': page_count, 'pages': []}
-
-        # 黑名单符号,出现即整句丢弃
-        blacklist_chars = "•★◆●◇▪…※§‡†¤₪"
-        blacklist_pattern = f"[{re.escape(blacklist_chars)}]"
-
-        def is_page_number(text):
-            text = text.strip().lower()
-            return (
-                re.fullmatch(r"(page)?\s*\d+\s*(of\s*\d+)?", text)
-                or re.fullmatch(r"\d+", text)
-                or re.fullmatch(r"\d+\s*/\s*\d+", text)
-            )
-
-        def is_valid_english(text):
-            """
-            保留纯英文短语、单词、句子,包括标点符号(如引号、感叹号、句号等)
-            """
-            # 过滤掉包含中文或其他语言的行
-            if re.search(r'[\u4e00-\u9fff]', text):
-                return False
-            # 过滤掉非 ascii 且非常规英文标点的字符
-            if re.search(rf"{blacklist_pattern}", text):
-                return False
-            # 至少要包含一个字母或句号等基本英文结构
-            if not re.search(r"[A-Za-z]", text):
-                return False
-            return True
-
-        for page_num in range(len(doc)):
+        result = {'total': page_count, 'pages': [], 'text': []}
+        # Render factor for the page images; the same matrix serves every page.
+        scale = 2.0
+        mat = fitz.Matrix(scale, scale)
+        for page_num in range(page_count):
+            page = page_num + 1
             page_obj = doc.load_page(page_num)
-            page_height = page_obj.rect.height
-
-            text = page_obj.get_text().strip()
-            blocks = page_obj.get_text("dict", sort=True)["blocks"]
-            has_visible_content = any(
-                b for b in blocks if b['type'] in (0, 1)
-            )
-            if not text and not has_visible_content:
-                continue
-
-            # 封面图
-            try:
-                pix = page_obj.get_pixmap(matrix=fitz.Matrix(0.3, 0.3))
-                cover_file = f"{self.param['path']}cover_page_{page_num+1}.png"
-                pix.save(cover_file)
-                if 'host' in self.param and self.param['host']:
-                    cover_file = cover_file.replace(Demeter.path + 'runtime/', self.param['host'])
-            except Exception as e:
-                print(f"封面图失败: {e}")
-                cover_file = ""
-
-            page_items = []
-
-            for i, b in enumerate(blocks):
-                y_top = b["bbox"][1]
-                y_bottom = b["bbox"][3]
-                '''
-                if y_top < page_height * 0.02 or y_bottom > page_height * 0.98:
-                    continue
-                '''
-
-                if b['type'] == 0:
-                    text_content = ""
-                    for line in b["lines"]:
-                        line_text = ""
-                        for span in line["spans"]:
-                            span_text = span["text"].strip()
-                            if not span_text or is_page_number(span_text):
-                                continue
-                            line_text += span_text + " "
-
-                        line_text = line_text.strip()
-                        if not line_text:
-                            continue
-
-                        # 全行过滤:含黑名单符号或不符合英文语义
-                        if not is_valid_english(line_text):
-                            continue
-
-                        text_content += line_text + "\n"
-
-                    text_content = text_content.strip()
-                    text_content = self.clean_text(self.removeDomains(text_content))
-                    if text_content:
-                        page_items.append({
-                            "type": "text",
-                            "content": text_content,
-                            "pos": b["bbox"]
-                        })
-
-                elif b['type'] == 1:
-                    image_bytes = b.get("image", b"")
-                    '''
-                    if not image_bytes or len(image_bytes) < 100:
-                        continue
-                    '''
-
-                    image_ext = "png"
-                    image_file = f"{self.param['path']}page{page_num+1}_img_{i}.{image_ext}"
-                    with open(image_file, "wb") as f:
-                        f.write(image_bytes)
-
-                    if 'host' in self.param and self.param['host']:
-                        image_file = image_file.replace(Demeter.path + 'runtime/', self.param['host'])
-
-                    page_items.append({
-                        "type": "image",
-                        "ext": image_ext,
-                        "content": image_file,
-                        "pos": b["bbox"]
-                    })
+            page_width, page_height = page_obj.rect.width, page_obj.rect.height
+
+            # Word extraction: each entry of get_text("words") is
+            # [x0, y0, x1, y1, "word_text", block_no, line_no, word_no].
+            for x0, y0, x1, y1, word, *_ in page_obj.get_text("words"):
+                result['text'].append({
+                    "page": page,
+                    "text": word,
+                    # Percentages of the page size, so the front end can
+                    # highlight words whatever the rendered image size is
+                    # (the render scale cancels out of the ratio).
+                    "bbox": [
+                        (x0 / page_width) * 100,
+                        (y0 / page_height) * 100,
+                        (x1 / page_width) * 100,
+                        (y1 / page_height) * 100,
+                    ]
+                })
+
+            # Page image
+            pix = page_obj.get_pixmap(matrix=mat, alpha=False)
+            cover_file = f"{self.param['path']}cover_page_{page}.png"
+            pix.save(cover_file)
+            if 'host' in self.param and self.param['host']:
+                # Expose the file through the public host instead of the runtime path.
+                cover_file = cover_file.replace(Demeter.path + 'runtime/', self.param['host'])
 
             result['pages'].append({
-                "cover": cover_file,
-                "content": page_items
+                "page": page,
+                "img": cover_file,
+                "width": page_width * scale,
+                "height": page_height * scale,
             })
 
+        # Everything needed has been copied out of the document by now.
+        doc.close()
+
+        if self.audio:
+            # Keep only the PDF words that match an audio word, with timings.
+            # NOTE(review): find() returns the first matching word for every
+            # audio item, so a repeated word always maps to the same box.
+            aligned = []
+            for item in self.audio:
+                match = self.find(result['text'], item)
+                if match:
+                    aligned.append(match)
+            result['text'] = aligned
+
         return result
 
+    def find(self, text, audio):
+        """Return the first PDF word past page 1 that contains the audio word.
+
+        ``text`` is the list of word dicts (``page``/``text``/``bbox``) and
+        ``audio`` one timing dict (``text``/``start``/``end``). The comparison
+        is a case-insensitive substring test. Returns the matched word merged
+        with the audio timings, or ``False`` when nothing matches.
+        """
+        hit = next(
+            (
+                entry for entry in text
+                # Words on page 1 (or lower) are never matched.
+                if entry['page'] > 1
+                and audio['text'].lower() in entry['text'].lower()
+            ),
+            None,
+        )
+        if hit is None:
+            return False
+        return {
+            "page": hit['page'],
+            "text": hit['text'],
+            "bbox": hit['bbox'],
+            "start": audio['start'],
+            "end": audio['end'],
+        }
 
     # 提取为langchain的Document格式
     def doc(self):

+ 20 - 21
service/extract/loader.py

@@ -3,25 +3,24 @@ from .__load__ import *
 
 class Loader(object):
-    def get(self, file, param = {}):
-        if 'http' in file:
-            loader = 'web'
-        elif File.exists(file):
-            if '.csv' in file:
-                loader = 'csv'
-            elif '.pdf' in file:
-                loader = 'pdf'
-            elif '.html' in file:
-                loader = 'html'
-            elif '.json' in file:
-                loader = 'json'
-            elif '.xls' in file:
-                loader = 'excel'
-            elif '.ppt' in file:
-                loader = 'ppt'
-            elif '.doc' in file:
-                loader = 'word'
-            elif '.jpg' in file or '.png' in file or '.gif' in file or '.webp' in file:
-                loader = 'img'
-            else:
-                loader = 'text'
+    """Pick the ``extract.docs`` service that matches a file name or URL."""
+
+    # (loader name, markers) pairs, checked in this order; the first loader
+    # with a marker contained in the lower-cased name wins.
+    _LOADERS = (
+        ('csv', ('.csv',)),
+        ('pdf', ('.pdf',)),
+        ('html', ('.html',)),
+        ('json', ('.json',)),
+        ('excel', ('.xls',)),
+        ('ppt', ('.ppt',)),
+        ('word', ('.doc',)),
+        ('img', ('.jpg', '.png', '.gif', '.webp')),
+        ('audio', ('.mp3', '.wav', '.m4a', '.3gp', '.amr')),
+    )
+
+    def get(self, file, param=None):
+        """Return the extractor for ``file``, initialised with ``file`` and ``param``.
+
+        ``file`` is a path or URL; the extractor is chosen by the first
+        extension marker found anywhere in its name (case-insensitive),
+        falling back to ``text``. ``param`` is the option dict handed to the
+        extractor's ``init``; a fresh empty dict is used when omitted.
+        """
+        # A new dict per call: a ``{}`` default would be one shared object
+        # handed to every extractor.
+        if param is None:
+            param = {}
+        # Lower-case once so names such as "REPORT.PDF" are recognised too.
+        name = file.lower()
+        loader = 'text'
+        for candidate, markers in self._LOADERS:
+            if any(marker in name for marker in markers):
+                loader = candidate
+                break
         return Demeter.service(loader, 'extract.docs').init(file, param)

+ 6 - 5
service/extract/parser.py

@@ -3,7 +3,7 @@ from .__load__ import *
 # 提取器
 class Parser(object):
 
-    def get(self, host = '', id=0, site_id = 0, uid = 0, source_id = 0, source = '', notify='', sync=True, method='json', **kwargs):
+    def get(self, host = '', id=0, site_id = 0, uid = 0, source_id = 0, source = '', audio = '', notify='', sync=True, method='json', **kwargs):
         extract = Demeter.model('extract')
         if int(id) > 0:
             extract.id = id
@@ -16,6 +16,7 @@ class Parser(object):
             info = self.getFile(site_id, source, host)
             info['uid'] = uid
             info['source_id'] = source_id
+            info['audio'] = audio
             info['notify'] = notify
             info['status'] = 1
             info['id'] = Demeter.service('common').update('extract', False, info)
@@ -91,6 +92,7 @@ class Parser(object):
         param['page'] = 0
         param['status'] = 4
         param['source_id'] = info['source_id']
+        param['source'] = info['source']
         status = True
         if info['status'] == 1 or info['status'] == 4:
             status = False
@@ -103,12 +105,11 @@ class Parser(object):
                 self.download(info['source'], info['file'])
             if True:
                 File.mkdir(info['path'])
-                obj = Demeter.service('loader', 'extract').get(info['file'], {'path':info['path'], 'host':info['host']})
+                obj = Demeter.service('loader', 'extract').get(info['file'], {'path':info['path'], 'host':info['host'], 'audio':info['audio']})
                 func = getattr(obj, method)
                 result = func()
                 if result and 'total' in result and result['total'] > 0:
-                    param['content'] = result['pages']
-                    param['page'] = result['total']
+                    param['content'] = result
                     param['status'] = 3
                     if method == 'json':
                         method = 1
@@ -121,5 +122,5 @@ class Parser(object):
                         data['content'] = result
                         Demeter.service('common').update('extract_content', False, data)
             Demeter.service('common').update('extract', info['id'], {'status':param['status'], 'page':param['page']})
-            Demeter.service('callback').send(info['site_id'], param)
+            Demeter.service('callback').send(info['site_id'], 'extract', param)
             return param