rabin vor 2 Monaten
Ursprung
Commit
ea9d15c655
100 geänderte Dateien mit 2167 neuen und 0 gelöschten Zeilen
  1. 8 0
      README.rst
  2. 10 0
      admin.py
  3. 1 0
      admin/__init__.py
  4. 5 0
      admin/main.py
  5. 1 0
      admin/page/__init__.py
  6. 2 0
      admin/page/__load__.py
  7. 55 0
      admin/page/lang_model.py
  8. 77 0
      admin/page/role.py
  9. 75 0
      admin/page/site.py
  10. 23 0
      check.py
  11. 96 0
      conf/dev.conf
  12. 68 0
      conf/env.conf
  13. 3 0
      cron.py
  14. BIN
      data/04、Frog Is Hungry.pdf
  15. BIN
      data/04、Frog Is Hungry/page10_img_0.png
  16. BIN
      data/04、Frog Is Hungry/page11_img_0.png
  17. BIN
      data/04、Frog Is Hungry/page12_img_0.png
  18. BIN
      data/04、Frog Is Hungry/page16_img_4.png
  19. BIN
      data/04、Frog Is Hungry/page1_img_3.png
  20. BIN
      data/04、Frog Is Hungry/page2_img_1.png
  21. BIN
      data/04、Frog Is Hungry/page3_img_0.png
  22. BIN
      data/04、Frog Is Hungry/page4_img_0.png
  23. BIN
      data/04、Frog Is Hungry/page5_img_0.png
  24. BIN
      data/04、Frog Is Hungry/page6_img_0.png
  25. BIN
      data/04、Frog Is Hungry/page7_img_0.png
  26. BIN
      data/04、Frog Is Hungry/page8_img_0.png
  27. BIN
      data/04、Frog Is Hungry/page9_img_0.png
  28. BIN
      data/24、Extreme Animals.pdf
  29. BIN
      data/76、 Show Some Love.pdf
  30. BIN
      diviner
  31. 10 0
      front.py
  32. 1 0
      front/__init__.py
  33. 1 0
      front/api/__init__.py
  34. 4 0
      front/api/__load__.py
  35. 24 0
      front/api/main.py
  36. 4 0
      front/main.py
  37. 25 0
      install.py
  38. 14 0
      loader.py
  39. 56 0
      master_cron.py
  40. 1 0
      model/__init__.py
  41. 7 0
      model/__load__.py
  42. 13 0
      model/data.py
  43. 25 0
      model/extract.py
  44. 12 0
      model/extract_content.py
  45. 13 0
      model/file.py
  46. 13 0
      model/lang_model.py
  47. 17 0
      model/role.py
  48. 30 0
      model/role_data.py
  49. 12 0
      model/role_sample.py
  50. 14 0
      model/signature.py
  51. 16 0
      model/site.py
  52. 12 0
      model/user.py
  53. 14 0
      model/user_data.py
  54. 15 0
      model/user_history.py
  55. 18 0
      pdf.py
  56. 15 0
      requirements.txt
  57. 6 0
      runtime/__init__.py
  58. 1 0
      runtime/sqlite/extract
  59. 1 0
      runtime/sqlite/extract_content
  60. 1 0
      runtime/sqlite/site
  61. 1 0
      service/__init__.py
  62. 6 0
      service/__load__.py
  63. 23 0
      service/agent/agent.py
  64. 12 0
      service/agent/init.py
  65. 23 0
      service/agent/rag.py
  66. 104 0
      service/agent/role.py
  67. 74 0
      service/auth.py
  68. 19 0
      service/callback.py
  69. 163 0
      service/comfyui/comfyui.py
  70. 1 0
      service/data/__init__.py
  71. 28 0
      service/data/db.py
  72. 28 0
      service/data/faiss.py
  73. 1 0
      service/extract/__init__.py
  74. 6 0
      service/extract/__load__.py
  75. 1 0
      service/extract/docs/__init__.py
  76. 6 0
      service/extract/docs/__load__.py
  77. 27 0
      service/extract/docs/base.py
  78. 9 0
      service/extract/docs/csv.py
  79. 9 0
      service/extract/docs/excel.py
  80. 9 0
      service/extract/docs/html.py
  81. 29 0
      service/extract/docs/img.py
  82. 10 0
      service/extract/docs/json.py
  83. 110 0
      service/extract/docs/pdf.py
  84. 9 0
      service/extract/docs/ppt.py
  85. 9 0
      service/extract/docs/text.py
  86. 9 0
      service/extract/docs/web.py
  87. 9 0
      service/extract/docs/word.py
  88. 335 0
      service/extract/extract.py
  89. 27 0
      service/extract/loader.py
  90. 122 0
      service/extract/parser.py
  91. 1 0
      service/linker/__init__.py
  92. 3 0
      service/linker/__load__.py
  93. 12 0
      service/linker/ali.py
  94. 13 0
      service/linker/baidu.py
  95. 13 0
      service/linker/deepseek.py
  96. 12 0
      service/linker/moonshot.py
  97. 15 0
      service/linker/spark.py
  98. 12 0
      service/linker/zhipu.py
  99. 47 0
      service/loader.py
  100. 1 0
      service/spliter/__init__.py

+ 8 - 0
README.rst

@@ -0,0 +1,8 @@
+# diviner 先知控制器
+
+安装
+
+python install.py
+
+前台:python front.py
+后台:python admin.py

+ 10 - 0
admin.py

@@ -0,0 +1,10 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+    demeter web
+    name:admin.py
+"""
+from demeter.core import *
+
+# Script entry point: hand control to Demeter.webInit for the 'admin' application.
+if __name__ == "__main__":
+	Demeter.webInit('admin')

+ 1 - 0
admin/__init__.py

@@ -0,0 +1 @@
+# -*- coding: utf-8 -*-

+ 5 - 0
admin/main.py

@@ -0,0 +1,5 @@
+# -*- coding: utf-8 -*-
+from demeter.web import *
+import demeter.admin.page as admin_page
+import admin.page
+Web.start(application=[admin.page,admin_page])

+ 1 - 0
admin/page/__init__.py

@@ -0,0 +1 @@
+# -*- coding: utf-8 -*-

+ 2 - 0
admin/page/__load__.py

@@ -0,0 +1,2 @@
+# -*- coding: utf-8 -*-
+from demeter.admin.page.__load__ import *

+ 55 - 0
admin/page/lang_model.py

@@ -0,0 +1,55 @@
+# -*- coding: utf-8 -*-
+from .__load__ import *
+
+class lang_model_path(Load):
+	@Web.auth
+	@Web.setting
+	def get(self):
+		self.set(
+			name = u'语言模型'
+			,path = '/lang_model/lang_model'
+			,width = '600'
+			,height = '600'
+			,search = (('label-1','cdate-time-start','cdate-time-end','name-input-mlike'), (u'日期范围',u'开始时间',u'截止时间',u'模型名称'))
+			,thead = (u'模型名称', u'模型渠道', u'模型标识', u'创建时间')
+			,tbody = ('name', 'channel', 'model', 'time')
+			,state = True
+		)
+		self.list('lang_model')
+		if self.data['list']:
+			llm = Demeter.config['llm']
+			for key, value in enumerate(self.data['list']):
+				id = str(value['id'])
+				self.data['list'][key]['time'] = Demeter.date(value['cdate'])
+				self.data['list'][key]['channel'] = llm[value['channel']]
+		self.show('list')
+
+class lang_model_update_path(Load):
+	@Web.auth
+	@Web.setting
+	def get(self):
+		llm = Demeter.config['llm']
+		channel = []
+		for key, value in llm.items():
+			channel.append({'id':key, 'name':value})
+		status = [
+			{'id':'1', 'name': '正常'},
+			{'id':'2', 'name': '封禁'},
+		]
+		self.set(
+			path = '/lang_model/lang_model'
+			,label = (u'模型名称',u'模型渠道',u'模型标识')
+			,update = ('name-input-required','channel-select-required','model-input-required')
+			,update_channel = channel
+			#,update_status = status
+		)
+		self.one('lang_model')
+		self.show('update')
+	@Web.auth
+	@Web.setting
+	def post(self):
+		self.update('lang_model')
+	@Web.auth
+	@Web.setting
+	def delete(self):
+		self.drop('lang_model')

+ 77 - 0
admin/page/role.py

@@ -0,0 +1,77 @@
+# -*- coding: utf-8 -*-
+"""
+    demeter web page
+    name:role.py 角色相关
+    author:rabin
+"""
+from .__load__ import *
+
+class role_path(Load):
+	@Web.auth
+	@Web.setting
+	def get(self):
+		self.set(
+			name = u'角色列表'
+			,path = '/role/role'
+			,width = '600'
+			,height = '600'
+			,edit = True
+			,add = True
+			,search = (('label-1','cdate-time-start','cdate-time-end','site_id-select-','status-select-','name-input-mlike'), (u'日期范围',u'开始时间',u'截止时间',u'选择站点',u'选择状态',u'角色名称'))
+			,thead = (u'角色ID', u'角色名称', u'模型名称', u'使用次数', u'角色状态', u'创建时间')
+			,tbody = ('id','name', 'model','use_num', 'status', 'cdate')
+			,state = True
+		)
+		self.data['common']['search_site_id-select-'] = self.service('common').list('site')
+		self.data['common']['search_status-select-'] = [{'id':1,'value':1,'name':u'正常'},{'id':2,'value':2, 'name':u'封禁'}]
+
+		self.list('role')
+		status = {}
+		status[1] = '正常'
+		status[2] = '封禁'
+		if self.data['list']:
+			for key, value in enumerate(self.data['list']):
+				site = self.service('common').one('site', id=value['site_id'])
+				self.data['list'][key]['site'] = site['name']
+				lang_model = self.service('common').one('lang_model', id=value['lang_model_id'])
+				self.data['list'][key]['model'] = lang_model['name']
+				self.data['list'][key]['status'] = status[value['status']]
+
+		self.show('list')
+
+class role_update_path(Load):
+	@Web.auth
+	@Web.setting
+	def get(self):
+		status = [
+			{'id':'1', 'name': '正常'},
+			{'id':'2', 'name': '封禁'},
+		]
+		self.set(
+			path = '/role/role'
+			,label = (u'所属站点', u'角色名称', u'语言模型', u'人设')
+			,update = ('site_id-select-required', 'name-input-required', 'lang_model_id-select-', 'persona-text-required')
+			,update_lang_model_id = self.service('common').list('lang_model')
+			,update_site_id = self.service('common').list('site')
+			#,update_status = status
+		)
+		self.one('role')
+		self.show('update')
+	@Web.auth
+	@Web.setting
+	def post(self):
+		self.data['update']['create_uid'] = -1
+		self.data['update']['owner_uid'] = -1
+
+		sid = self.update('role')
+
+		'''
+		if sid:
+			redis = Demeter.redis()
+			config = Demeter.config['redis']
+			redis.rpush(config['name'], sid)
+		'''
+	@Web.auth
+	@Web.setting
+	def delete(self):
+		self.drop('role')

+ 75 - 0
admin/page/site.py

@@ -0,0 +1,75 @@
+# -*- coding: utf-8 -*-
+"""
+    demeter web page
+    name:site.py 站点相关
+    author:rabin
+"""
+from .__load__ import *
+
+class site_path(Load):
+	@Web.auth
+	@Web.setting
+	def get(self):
+		self.set(
+			name = u'站点管理'
+			,path = '/site/site'
+			,width = '600'
+			,height = '600'
+			,search = (('label-1','workdate-time-start','workdate-time-end','name-input-mlike'), (u'日期范围',u'开始时间',u'截止时间',u'站点名称'))
+			,thead = (u'站点名称', u'快捷功能', u'授权信息', u'授权时间')
+			,tbody = ('name', 'func', 'app', 'time')
+			,state = True
+		)
+		menu = (
+			{'name':'角色列表', 'url':'/role/role'},
+		)
+		self.list('site')
+		if self.data['list']:
+			for key, value in enumerate(self.data['list']):
+				id = str(value['id'])
+				self.data['list'][key]['time'] = Demeter.date(value['sdate']) + ' 至 ' + Demeter.date(value['edate'])
+				self.data['list'][key]['app'] = u'[appid]:' + value['appid'] + u'<br />[appsecret]:' + value['appsecret']
+				param = '?search_site_id-select-=' + id
+				self.data['list'][key]['func'] = ''
+				for i in menu:
+					self.data['list'][key]['func'] = self.data['list'][key]['func'] + '<a href="'+i['url']+''+param+'">'+i['name']+'</a>&nbsp;&nbsp;&nbsp;&nbsp;'
+		self.show('list')
+
+class site_update_path(Load):
+	@Web.auth
+	@Web.setting
+	def get(self):
+		self.set(
+			path = '/site/site'
+			,label = (u'站点名称',u'站点网址',u'通知接口',u'开始时间', u'结束时间')
+			,update = ('name-input-required','link-input-required','api-input-required','sdate-date-required','edate-date-required')
+		)
+		self.one('site')
+		self.show('update')
+	@Web.auth
+	@Web.setting
+	def post(self):
+		id = self.input('id')
+		if not id:
+			self.getAppId()
+		else:
+			info = self.service('common').one('site', id=id)
+			if not info['appid'] or not info['appsecret']:
+				self.getAppId()
+		self.update('site')
+	@Web.auth
+	@Web.setting
+	def delete(self):
+		self.drop('site')
+	@Web.auth
+	@Web.setting
+	def getAppId(self):
+		self.data['update']['appid'] = Demeter.compressUuid(Demeter.uuid('convert'))
+		self.data['update']['appsecret'] = Demeter.hash()
+
+		model = Demeter.model('site')
+		model.appid = self.data['update']['appid']
+		model.appsecret = self.data['update']['appsecret']
+		info = model.select(type='fetchone')
+		if info:
+			self.getAppId()

+ 23 - 0
check.py

@@ -0,0 +1,23 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import time
+import os
+timeSleep = 15
+
+def popen(command, bg=False):
+        string = command
+        if bg == True:
+                command = command + ' &'
+        process = os.popen(command)
+        output = process.read()
+        process.close()
+        return output
+
+def process():
+        command = 'python /data/dm/container/web/master_cron.py 1>/dev/null 2>&1 &'
+        check = 'ps -ef | grep master_cron.py | grep -v grep | wc -l'
+        value = int(popen(check))
+        if value == 0:
+                popen(command)
+
+process()

+ 96 - 0
conf/dev.conf

@@ -0,0 +1,96 @@
+[base]
+;开发环境配置
+[setting]
+name				= 先知控制器
+site				= http://diviner.shemic.com/doc/
+copyright			= 2025 shemic.com v1.0.0
+
+;打印路由表
+route				= True
+
+[db]
+rdb					= sqlite
+
+[sqlite]
+file                = diviner
+;允许自动建表
+create              = True
+
+[mysql]
+host				= 0.0.0.0
+port				= 3306
+username			= root
+password			= 123456
+dbname				= diviner
+prefix				= oc
+charset				= utf8
+;允许自动建表
+create				= True
+
+[redis]
+host				= 127.0.0.1
+password			= dm_redis_123
+port				= 6379
+name				= diviner
+prefix				= shemic_
+
+;定义一些tornado的配置,可为空
+[tornado]
+
+;后台配置
+[admin]
+port				= 8087
+debug				= True
+;请求的buffersize
+max_buffer_size		= 210763776
+;子进程
+process				= 0
+;定义后台父级菜单
+menu_parent			= 站点设置:&#xe62e;
+;定义后台子级菜单
+menu_child			= 站点管理:/site/site,语言模型:/lang_model/lang_model
+
+;cookie
+cookie_secret       = 61oETzKXQAGaYekL5gEmGeJJFuYh7EQnp2XdTP1o/Vo=
+login_url           = /user/login
+;是否使用安全cookie
+xsrf_cookies        = True
+
+;前台配置
+[front]
+port				= 8089
+debug				= True
+;请求的buffersize
+max_buffer_size		= 210763776
+;子进程
+process				= 0
+;支持手机版
+mobile				= True
+
+;llm大模型配置
+[llm]
+dp                  = deepseek
+baidu               = 文心一言
+ali                 = 阿里
+moonshot            = 月之暗面
+zhipu               = 智普AI
+spark               = 讯飞星火
+[baidu]
+QIANFAN_AK               = hhpP53ks4dbiCaOjYIWABRFd
+QIANFAN_SK               = xhXgsFZCQQ4xTqOuYey4Qpp3FBm2Iunx
+[dp]
+api_key = 1111
+[ali]
+api_key = sk-4d894d18ec194c498b8fc7e1a741b7ea
+base_url = https://dashscope.aliyuncs.com/compatible-mode/v1
+[moonshot]
+api_key = 1111
+[zhipu]
+api_key = 111
+[spark]
+api_key = 111
+
+[baiduocr]
+app_id = 11141139
+api_key = zaOaRKxp7tH977WPHEkv1YGy
+secret_key = gtNP7AIS93YXyYnAv93f6oHkqvcxxAH6

+ 68 - 0
conf/env.conf

@@ -0,0 +1,68 @@
+[base]
+;线上环境配置,请设置环境变量DEMETER_CONF = env
+[setting]
+name                = 先知控制器
+site                = http://diviner.shemic.com/doc/
+copyright           = 2024 shemic.com v1.0.0
+
+;文档转换后 保存的路径 一般用于nginx使用 默认保存到runtime
+;save				= /share/files/
+
+[db]
+rdb                 = sqlite
+
+[sqlite]
+file                = diviner
+;允许自动建表
+create              = True
+
+[mysql]
+host				= web-mysql
+port				= 3306
+username			= root
+password			= 123456
+dbname				= diviner
+prefix				= oc
+charset				= utf8
+;允许自动建表
+create				= True
+
+[redis]
+host				= web-redis
+password			= dm_redis_123
+port				= 6379
+name				= diviner
+prefix				= shemic_
+
+;定义一些tornado的配置,可为空
+[tornado]
+
+;后台配置
+[admin]
+port				= 8087
+debug				= False
+;请求的buffersize
+max_buffer_size		= 210763776
+;子进程
+process				= 0
+;定义后台父级菜单
+menu_parent			= 站点设置:&#xe62e;
+;定义后台子级菜单
+menu_child			= 站点管理:/site/site,转换文件列表:/site/convert
+
+;cookie
+cookie_secret		= 61oETzKXQAGaYekL5gEmGeJJFuYh7EQnp2XdTP1o/Vo=
+login_url			= /user/login
+;是否使用安全cookie
+xsrf_cookies		= True
+
+;前台配置
+[front]
+port				= 8088
+debug				= False
+;请求的buffersize
+max_buffer_size		= 210763776
+;子进程
+process				= 30
+;支持手机版
+mobile				= True

+ 3 - 0
cron.py

@@ -0,0 +1,3 @@
+# -*- coding: utf-8 -*-
+from demeter.core import *
+Demeter.service('loader').start()

BIN
data/04、Frog Is Hungry.pdf


BIN
data/04、Frog Is Hungry/page10_img_0.png


BIN
data/04、Frog Is Hungry/page11_img_0.png


BIN
data/04、Frog Is Hungry/page12_img_0.png


BIN
data/04、Frog Is Hungry/page16_img_4.png


BIN
data/04、Frog Is Hungry/page1_img_3.png


BIN
data/04、Frog Is Hungry/page2_img_1.png


BIN
data/04、Frog Is Hungry/page3_img_0.png


BIN
data/04、Frog Is Hungry/page4_img_0.png


BIN
data/04、Frog Is Hungry/page5_img_0.png


BIN
data/04、Frog Is Hungry/page6_img_0.png


BIN
data/04、Frog Is Hungry/page7_img_0.png


BIN
data/04、Frog Is Hungry/page8_img_0.png


BIN
data/04、Frog Is Hungry/page9_img_0.png


BIN
data/24、Extreme Animals.pdf


BIN
data/76、 Show Some Love.pdf


BIN
diviner


+ 10 - 0
front.py

@@ -0,0 +1,10 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+    demeter web
+    name:front.py
+"""
+from demeter.core import *
+
+# Script entry point: hand control to Demeter.webInit for the 'front' application.
+if __name__ == "__main__":
+	Demeter.webInit('front')

+ 1 - 0
front/__init__.py

@@ -0,0 +1 @@
+# -*- coding: utf-8 -*-

+ 1 - 0
front/api/__init__.py

@@ -0,0 +1 @@
+# -*- coding: utf-8 -*-

+ 4 - 0
front/api/__load__.py

@@ -0,0 +1,4 @@
+# -*- coding: utf-8 -*-
+from demeter.web import *
+
+#可以在此定义一些核心类库

+ 24 - 0
front/api/main.py

@@ -0,0 +1,24 @@
+# -*- coding: utf-8 -*-
+from .__load__ import *
+
+# 请求处理 /main/loader 接口必须后端获取,token不允许暴露
+#http://192.168.33.10:8088/main/loader?signature=44e3cd684a9fe697792a235c8c57838211f5823a&appid=mo1209&nonce=1529659172&method=extract&site_id=1&uid=1&source_id=1&source=
+class loader_path(Base):
+	@Web.setting
+	def get(self):
+		# 这里定义开放的功能
+		config = {
+			'extract': ['parser', 'extract']
+		}
+		method = self.input('method')
+		param = self.request.arguments
+		if method in config:
+			site = Demeter.service('auth').init(param)
+			if isinstance(site, str):
+				self.out('no', site)
+			else:
+				method = config[method]
+				param['sync'] = False
+				Demeter.service('loader').get(obj=method[0], module=method[1], **param)
+		else:
+			self.out('no', 'method error')

+ 4 - 0
front/main.py

@@ -0,0 +1,4 @@
+# -*- coding: utf-8 -*-
+from demeter.web import *
+import front.api
+Web.start(application=[front.api])

+ 25 - 0
install.py

@@ -0,0 +1,25 @@
+# -*- coding: utf-8 -*-
+from demeter.core import *
+
+#CREATE DATABASE IF NOT EXISTS yourdbname DEFAULT CHARSET utf8 COLLATE utf8_general_ci;
+def manage():
+	"""Seed the admin account (id 1) and the admin role (id 1) if they do not exist.
+
+	NOTE(review): the default credentials (admin / 123456) and the mobile
+	number are hard-coded here; whether the model hashes the password on
+	insert cannot be seen from this file. Change them after installation.
+	"""
+	model = Demeter.model('manage_admin')
+	# Look the row up by primary key; the id stays set for the insert below.
+	model.id = 1
+	info = model.select(type='fetchone')
+	if not info:
+		model.role_id = 1
+		model.username = 'admin'
+		model.mobile = '15810090845'
+		model.password = '123456'
+		model.insert()
+
+	model = Demeter.model('manage_role')
+	model.id = 1
+	info = model.select(type='fetchone')
+	if not info:
+		model.name = u'管理员'
+		model.insert()
+
+manage()
+
+Demeter.echo('install success!')

+ 14 - 0
loader.py

@@ -0,0 +1,14 @@
+# -*- coding: utf-8 -*-
+from demeter.core import *
+
+# Command-line usage: python loader.py -o parser -m extract -i 1
+# Map each option name to its short flag and let Demeter parse the command line.
+param = {}
+param['obj'] = 'o'
+param['module'] = 'm'
+param['id'] = 'i'
+Demeter.getopt(param)
+
+# Read the parsed values back and invoke the loader service with sync=True.
+obj = Demeter.option['obj']
+module = Demeter.option['module']
+id = Demeter.option['id']
+Demeter.service('loader').get(obj=obj, module=module, sync=True, id=id)

+ 56 - 0
master_cron.py

@@ -0,0 +1,56 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# 用于批量处理转换 运行多个转换容器
+# nohup python /data/dm/container/web/master.py 2>/dev/null &
+import time
+import subprocess
+import os
+import pprint
+timeSleep = 120
+
+def redis():
+        import redis
+        host = '0.0.0.0'
+        port = 6379
+        password = 'dm_redis_123'
+        pool = redis.ConnectionPool(host=host, password=password, port=int(port))
+        return redis.Redis(connection_pool=pool)
+
+def command(file):
+        #return 'dm call office-convert_call id=' + file
+        return 'docker run -d -it --entrypoint python --rm -v /data/dm/container/share:/share -v /etc/hosts:/etc/hosts.main --env HOSTIP="172.30.0.6" --net=dm -v /data/dm/container/web:/www -v /data/dm/container/web/convert/static:/usr/local/convert/front/static -v /data/dm/container/web/convert/files:/usr/local/convert/runtime/files registry.cn-hangzhou.aliyuncs.com/shemic/convert  /usr/local/convert/convert.py -f ' + file
+
+def popen(command, bg=False):
+        string = command
+        if bg == True:
+                command = command + ' &'
+        process = os.popen(command)
+        output = process.read()
+        process.close()
+        return output
+
+# 文档转换
+def convert():
+        check = 'ps -ef | grep master_cron.py | grep -v grep | wc -l'
+        value = int(popen(check))
+        if value > 1:
+                return 1;
+        r = redis()
+        c = 'office_file'
+        i = 0
+        # r.ltrim("list2", 0, 1)
+        # n = r.llen(c)
+        while 1:
+                file = r.lpop(c)
+                if file:
+                        g = command(file)
+                        popen(g, False)
+                i = i+1
+                if i >= 10:
+                        time.sleep(timeSleep)
+                        i = 0
+
+def handle():
+        """Entry point of the script; currently it only runs convert()."""
+        convert()
+
+handle()

+ 1 - 0
model/__init__.py

@@ -0,0 +1 @@
+# -*- coding: utf-8 -*-

+ 7 - 0
model/__load__.py

@@ -0,0 +1,7 @@
+# -*- coding: utf-8 -*-
+"""
+    demeter database
+    name:__load__.py
+"""
+from demeter.model import *
+from demeter.core import *

+ 13 - 0
model/data.py

@@ -0,0 +1,13 @@
+# -*- coding: utf-8 -*-
+from .__load__ import *
+
+class Data(Model):
+    # ORM mapping for the 'data' table (table comment: knowledge base).
+    # Each row belongs to a site and a role; uid defaults to '-1'.
+    __table__ = 'data'
+    __comment__ = '知识库'
+    id = Fields(type='int', primaryKey=True, autoIncrement=True, comment='ID')
+    site_id = Fields(type='int', comment='所属站点')
+    role_id = Fields(type='int', comment='角色id')
+    uid = Fields(type='int', comment='用户ID', default='-1')
+    name = Fields(type='varchar(32)', comment='知识库标识')
+    state = Fields(type='boolean', default='True', comment='数据存在状态')
+    cdate = Fields(type='int', default='time', comment='创建时间')

+ 25 - 0
model/extract.py

@@ -0,0 +1,25 @@
+# -*- coding: utf-8 -*-
+from .__load__ import *
+
+class Extract(Model):
+    # ORM mapping for the 'extract' table: one row per source file whose
+    # content is extracted (table comment: extract file content).
+    __table__ = 'extract'
+    __comment__ = '提取文件内容'
+    id = Fields(type='int', primaryKey=True, autoIncrement=True, comment='ID')
+    site_id = Fields(type='int', comment='所属站点')
+    uid = Fields(type='varchar(200)', comment='上传者')
+    source_id = Fields(type='int(11)', comment='源文件id')
+    source = Fields(type='varchar(500)', comment='源文件')
+    # NOTE(review): 'notify' is not listed in the committed runtime/sqlite/extract
+    # column file; confirm that file is regenerated.
+    notify = Fields(type='varchar(2000)', comment='回调地址')
+    name = Fields(type='varchar(200)', comment='文件名')
+    page = Fields(type='int(11)', comment='页数')
+    # Indexed column (index name 'search').
+    key = Fields(type='varchar(100)', index='search', comment='文件key')
+    ext = Fields(type='varchar(20)', comment='后缀名')
+    size = Fields(type='varchar(200)', comment='源文件大小')
+
+    file = Fields(type='varchar(200)', comment='本地地址')
+    path = Fields(type='varchar(200)', comment='本地资源目录')
+
+    state = Fields(type='boolean', default='True', comment='数据存在状态')
+    # Per the column comment: 1 idle, 2 extracting, 3 extracted, 4 failed.
+    status = Fields(type='tinyint', default='1', comment='转换状态1待机2提取中3提取完成4提取失败')
+    cdate = Fields(type='int', default='time', comment='创建时间')

+ 12 - 0
model/extract_content.py

@@ -0,0 +1,12 @@
+# -*- coding: utf-8 -*-
+from .__load__ import *
+
+class Extract_content(Model):
+    # ORM mapping for the 'extract_content' table: extracted content stored
+    # per extract row (extract_id) and per method.
+    __table__ = 'extract_content'
+    __comment__ = 'pdf内容'
+    id = Fields(type='int', primaryKey=True, autoIncrement=True, comment='ID')
+    extract_id = Fields(type='int', comment='文件id')
+    method = Fields(type='tinyint(1)', comment='方法')
+    # NOTE(review): 'text(255)' looks like an unintended length limit for
+    # extracted document content; confirm how the backend interprets it.
+    content = Fields(type='text(255)', comment='内容')
+    state = Fields(type='boolean', default='True', comment='数据存在状态')
+    cdate = Fields(type='int', default='time', comment='创建时间')

+ 13 - 0
model/file.py

@@ -0,0 +1,13 @@
+# -*- coding: utf-8 -*-
+from .__load__ import *
+
+class File(Model):
+    # ORM mapping for the 'file' table.
+    # NOTE(review): the table comment and every column are identical to the
+    # Data model; this looks like an unfinished copy, confirm the intended schema.
+    __table__ = 'file'
+    __comment__ = '知识库'
+    id = Fields(type='int', primaryKey=True, autoIncrement=True, comment='ID')
+    site_id = Fields(type='int', comment='所属站点')
+    role_id = Fields(type='int', comment='角色id')
+    uid = Fields(type='int', comment='用户ID', default='-1')
+    name = Fields(type='varchar(32)', comment='知识库标识')
+    state = Fields(type='boolean', default='True', comment='数据存在状态')
+    cdate = Fields(type='int', default='time', comment='创建时间')

+ 13 - 0
model/lang_model.py

@@ -0,0 +1,13 @@
+# -*- coding: utf-8 -*-
+from .__load__ import *
+
+class Lang_model(Model):
+	# ORM mapping for the 'lang_model' table: one row per configured language model.
+	__table__ = 'lang_model'
+	__comment__ = '语言模型表'
+	# NOTE(review): the column comment says "site ID"; it appears copied from the Site model.
+	id = Fields(type='int', primaryKey=True, autoIncrement=True, comment='站点ID')
+	name = Fields(type='varchar(50)', comment='模型名称')
+	# Channel key; the admin pages resolve it against Demeter.config['llm'].
+	channel = Fields(type='varchar(20)', comment='模型渠道', default='baidu')
+	# NOTE(review): same column comment as 'name'; the admin form labels this field as the model identifier.
+	model = Fields(type='varchar(100)', comment='模型名称', default='ernie-speed-128k')
+	status = Fields(type='tinyint', default='1', comment='状态:1是正常2是不可用')
+	state = Fields(type='boolean', default='True', comment='数据存在状态')
+	cdate = Fields(type='int', default='time', comment='创建时间')

+ 17 - 0
model/role.py

@@ -0,0 +1,17 @@
+# -*- coding: utf-8 -*-
+from .__load__ import *
+
+class Role(Model):
+    # ORM mapping for the 'role' table: a persona bound to a site and a language model.
+    __table__ = 'role'
+    __comment__ = '角色表'
+    id = Fields(type='int', primaryKey=True, autoIncrement=True, comment='ID')
+    site_id = Fields(type='int', comment='所属站点')
+    # Creator and owner default to '-1'; the admin role form also writes -1 for both.
+    create_uid = Fields(type='int', comment='创建人ID', default='-1')
+    owner_uid = Fields(type='int', comment='持有人ID', default='-1')
+    name = Fields(type='varchar(500)', comment='名称')
+    lang_model_id = Fields(type='int(11)', comment='模型ID', default='1')
+    # Persona prompt text.
+    persona = Fields(type='text', comment='人设提示词', default='')
+    use_num = Fields(type='int', comment='使用次数')
+    # Per the column comment: 1 normal, 2 frozen.
+    status = Fields(type='tinyint', default='1', comment='角色状态:1是正常2是冻结')
+    state = Fields(type='boolean', default='True', comment='数据存在状态')
+    cdate = Fields(type='int', default='time', comment='创建时间')

+ 30 - 0
model/role_data.py

@@ -0,0 +1,30 @@
+# -*- coding: utf-8 -*-
+from .__load__ import *
+
+class Role_data(Model):
+    __table__ = 'role_data'
+    __comment__ = '角色知识库'
+    id = Fields(type='int', primaryKey=True, autoIncrement=True, comment='ID')
+    role_id = Fields(type='int', comment='角色id')
+    data_id = Fields(type='int', comment='数据文件id')
+    state = Fields(type='boolean', default='True', comment='数据存在状态')
+    cdate = Fields(type='int', default='time', comment='创建时间')
+    
+    def getCur(self, id):
+        self.id = id
+        data = self.select(type='fetchone')
+        return data
+
+    def getList(self, page = 1, param = {}):
+        cate = Demeter.model('video_cate').getCur()
+        self.cate_id = cate['id']
+        Demeter.config['page'] = {}
+        Demeter.config['page']['current'] = page
+        data = self.select(page=True)
+        result = []
+        if data:
+            for key, value in enumerate(data):
+                #cate = Demeter.service('common').one('video_cate', id=value['cate_id'])
+                value['cdate'] = Demeter.date(value['cdate'])
+                result.append([str(value['id']), value['name'], value['cdate'], ''])
+        return result

+ 12 - 0
model/role_sample.py

@@ -0,0 +1,12 @@
+# -*- coding: utf-8 -*-
+from .__load__ import *
+
+class User_data(Model):
+    # NOTE(review): this file is model/role_sample.py, yet the class name,
+    # table name and table comment are those of model/user_data.py. It was
+    # most likely meant to define a Role_sample model on a 'role_sample'
+    # table; confirm and rename.
+    __table__ = 'user_data'
+    __comment__ = '用户数据'
+    id = Fields(type='int', primaryKey=True, autoIncrement=True, comment='ID')
+    site_id = Fields(type='int', comment='所属站点')
+    role_id = Fields(type='int', comment='角色id')
+    content = Fields(type='text', comment='内容', default='')
+    state = Fields(type='boolean', default='True', comment='数据存在状态')
+    cdate = Fields(type='int', default='time', comment='创建时间')

+ 14 - 0
model/signature.py

@@ -0,0 +1,14 @@
+# -*- coding: utf-8 -*-
+"""
+    demeter database
+    name:signature.py
+"""
+from .__load__ import *
+
+class Signature(Model):
+	# ORM mapping for the 'signature' table: one row per recorded request signature, per site.
+	__table__ = 'signature'
+	__comment__ = 'signature记录表'
+	id = Fields(type='int', primaryKey=True, autoIncrement=True, comment='ID')
+	site_id = Fields(type='int', comment='所属站点')
+	signature = Fields(type='varchar(500)', comment='signature')
+	cdate = Fields(type='int', default='time', comment='创建时间')

+ 16 - 0
model/site.py

@@ -0,0 +1,16 @@
+# -*- coding: utf-8 -*-
+from .__load__ import *
+
+class Site(Model):
+	# ORM mapping for the 'site' table (table comment: main site table).
+	__table__ = 'site'
+	__comment__ = '站点主表'
+	id = Fields(type='int', primaryKey=True, autoIncrement=True, comment='站点ID')
+	name = Fields(type='varchar(50)', comment='站点名')
+	link = Fields(type='varchar(200)', comment='站点网址')
+	# Credentials generated by the admin site form when a record has none.
+	appid = Fields(type='varchar(150)', comment='appid')
+	appsecret = Fields(type='varchar(300)', comment='appsecret')
+	# Start / end of the period, stored as ints and formatted with Demeter.date in the admin list.
+	sdate = Fields(type='int', comment='开始时间')
+	edate = Fields(type='int', comment='结束时间')
+	api = Fields(type='varchar(2000)', comment='接口地址')
+	state = Fields(type='boolean', default='True', comment='数据存在状态')
+	cdate = Fields(type='int', default='time', comment='创建时间')

+ 12 - 0
model/user.py

@@ -0,0 +1,12 @@
+# -*- coding: utf-8 -*-
+from .__load__ import *
+
+class User(Model):
+    # ORM mapping for the 'user' table: a user of a site, keyed by the site's own user id.
+    __table__ = 'user'
+    __comment__ = '用户表'
+    id = Fields(type='int', primaryKey=True, autoIncrement=True, comment='ID')
+    site_id = Fields(type='int', comment='所属站点')
+    site_uid = Fields(type='varchar(200)', comment='站点用户ID')
+    name = Fields(type='varchar(500)', comment='用户名称')
+    state = Fields(type='boolean', default='True', comment='数据存在状态')
+    cdate = Fields(type='int', default='time', comment='创建时间')

+ 14 - 0
model/user_data.py

@@ -0,0 +1,14 @@
+# -*- coding: utf-8 -*-
+from .__load__ import *
+
+class User_data(Model):
+    # ORM mapping for the 'user_data' table: text content stored per site, role and user.
+    __table__ = 'user_data'
+    __comment__ = '用户数据'
+    id = Fields(type='int', primaryKey=True, autoIncrement=True, comment='ID')
+    site_id = Fields(type='int', comment='所属站点')
+    role_id = Fields(type='int', comment='角色id')
+    uid = Fields(type='int', comment='用户ID', default='-1')
+    name = Fields(type='varchar(32)', comment='知识库标识')
+    content = Fields(type='text', comment='内容', default='')
+    state = Fields(type='boolean', default='True', comment='数据存在状态')
+    cdate = Fields(type='int', default='time', comment='创建时间')

+ 15 - 0
model/user_history.py

@@ -0,0 +1,15 @@
+# -*- coding: utf-8 -*-
+from .__load__ import *
+
+class User_history(Model):
+    __table__ = 'user_data'
+    __comment__ = '用户历史对话'
+    id = Fields(type='int', primaryKey=True, autoIncrement=True, comment='ID')
+    site_id = Fields(type='int', comment='所属站点')
+    role_id = Fields(type='int', comment='角色id')
+    uid = Fields(type='int', comment='用户ID', default='-1')
+    user_input = Fields(type='text', comment='用户输入', default='')
+    ai_response = Fields(type='text', comment='AI响应', default='')
+    summary = Fields(type='text', comment='摘要', default='')
+    state = Fields(type='boolean', default='True', comment='数据存在状态')
+    cdate = Fields(type='int', default='time', comment='创建时间')

+ 18 - 0
pdf.py

@@ -0,0 +1,18 @@
+# -*- coding: utf-8 -*-
+from demeter.core import *
+
+# Test converting a pdf: python convert.py -f file.doc
+param = {}
+param['file'] = 'f'
+Demeter.getopt(param)
+
+# NOTE(review): the -f option is parsed above but ignored; a hard-coded local
+# Windows path is used instead.
+#file = Demeter.option['file']
+file = 'D://work/ai/diviner/dev/data/04、Frog Is Hungry.pdf'
+# PDF extraction
+
+# Direct extraction
+#result = Demeter.service('loader', 'extract').get(file).json()
+
+# Generic path: extract synchronously and record the result, so the next run
+# reuses the already extracted content.
+# NOTE(review): the comment says synchronous but sync=False is passed; confirm.
+result = Demeter.service('loader').get(obj='parser', module='extract', sync=False, site_id=1, uid=1, source_id=1, source=file, method='json')
+print(result)

+ 15 - 0
requirements.txt

@@ -0,0 +1,15 @@
+demeter-lib
+redis
+requests
+gevent
+PyMuPDF
+langchain
+langchain_community
+langchain-huggingface
+langgraph
+zhipuai
+openai
+faiss-cpu
+transformers
+edge-tts>=6.1.3
+pysrt>=1.1.2

+ 6 - 0
runtime/__init__.py

@@ -0,0 +1,6 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+    demeter init
+    author:rabin
+"""

+ 1 - 0
runtime/sqlite/extract

@@ -0,0 +1 @@
+[["id", 1], ["site_id", 1], ["uid", 1], ["source_id", 1], ["source", 1], ["name", 1], ["page", 1], ["key", 1], ["ext", 1], ["size", 1], ["file", 1], ["path", 1], ["state", 1], ["status", 1], ["cdate", 1]]

+ 1 - 0
runtime/sqlite/extract_content

@@ -0,0 +1 @@
+[["id", 1], ["extract_id", 1], ["method", 1], ["content", 1], ["state", 1], ["cdate", 1]]

+ 1 - 0
runtime/sqlite/site

@@ -0,0 +1 @@
+[["id", 1], ["name", 1], ["link", 1], ["appid", 1], ["appsecret", 1], ["sdate", 1], ["edate", 1], ["api", 1], ["state", 1], ["cdate", 1]]

+ 1 - 0
service/__init__.py

@@ -0,0 +1 @@
+# -*- coding: utf-8 -*-

+ 6 - 0
service/__load__.py

@@ -0,0 +1,6 @@
+# -*- coding: utf-8 -*-
+from demeter.core import *
+from datetime import *
+import uuid
+import os
+import os.path

+ 23 - 0
service/agent/agent.py

@@ -0,0 +1,23 @@
+# -*- coding: utf-8 -*-
+from .__load__ import *
+from langchain_core.language_models.chat_models import HumanMessage
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_core.output_parsers import StrOutputParser
+
+class Chat(object):
+
+    chain = False
+    def say(self, content):
+        return [HumanMessage(content=content)]
+
+    def set(self, prompts = []):
+        chain = ChatPromptTemplate.from_messages(prompts)
+        if not self.chain:
+            self.chain = chain
+        else:
+            self.chain = self.chain | chain
+
+    def out(self, handle, var = {}):
+        chain = self.chain | handle | StrOutputParser()
+        for chunk in chain.stream(var):
+            print(chunk, end="")

+ 12 - 0
service/agent/init.py

@@ -0,0 +1,12 @@
+# -*- coding: utf-8 -*-
+from .__load__ import *
+from langchain_huggingface import HuggingFaceEmbeddings
+class Init(object):
+
+    def __init__(self):
+        model_name = "BAAI/bge-small-zh-v1.5"
+        model_kwargs = {"device": "cpu"}
+        encode_kwargs = {"normalize_embeddings": True}
+        Demeter.embedding = HuggingFaceEmbeddings(
+            model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
+        )

+ 23 - 0
service/agent/rag.py

@@ -0,0 +1,23 @@
+# -*- coding: utf-8 -*-
+from .__load__ import *
+from langchain_core.language_models.chat_models import HumanMessage
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_core.output_parsers import StrOutputParser
+
+class Chat(object):
+
+    chain = False
+    def say(self, content):
+        return [HumanMessage(content=content)]
+
+    def set(self, prompts = []):
+        chain = ChatPromptTemplate.from_messages(prompts)
+        if not self.chain:
+            self.chain = chain
+        else:
+            self.chain = self.chain | chain
+
+    def out(self, handle, var = {}):
+        chain = self.chain | handle | StrOutputParser()
+        for chunk in chain.stream(var):
+            print(chunk, end="")

+ 104 - 0
service/agent/role.py

@@ -0,0 +1,104 @@
+# -*- coding: utf-8 -*-
+from .__load__ import *
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.runnables import RunnablePassthrough
+class Role(object):
+
+    def init(self, site_id, role_id, uid):
+        self.info = Demeter.service('common').one('role', id=role_id)
+        if self.info:
+            self.model = Demeter.service('common').one('lang_model', id=self.info['lang_model_id'])
+        self.db = None
+        self.piece = None
+        #self.memory()
+        # 知识库挂载
+        data = Demeter.service('data').init(site_id)
+        context = data.load('similarity', {'k':5, 'fetch_k':50, 'filter': {'role_id': role_id, 'uid' : uid}})
+        #sample = data.load('similarity', {'k':5, 'fetch_k':50, 'filter': {'role_id': role_id, 'uid': 'sample'}})
+        print(context)
+        self.piece = {"context": context | self.format_docs, "question": RunnablePassthrough()}
+        return self
+
+    # 写入记忆
+    def write(self, memory):
+        pass
+
+    # 挂载工具
+    def tool(self, tool):
+        pass
+
+    def set(self, prompts):
+        chain = ChatPromptTemplate.from_template(prompts)
+        if not self.piece:
+            self.piece = chain
+        else:
+            self.piece = self.piece | chain
+        return self
+
+    def out(self, query, type = []):
+        if self.info:
+            #self.info['persona'] = '你是一个精美时尚杂志社的编辑,根据以下上下文来回答这个问题{context}'
+            template = """你是一个精美时尚杂志社的编辑,根据以下上下文来回答这个问题:
+
+            {context}
+
+            Question: {question},请用中文输出答案。
+            """
+
+            template = """你是一位专业医生。以下是病人的病例内容,请根据医学规范生成详细分析报告。
+
+            病例内容:
+            {context}
+
+
+            请根据上面提供的病例内容生成报告。根据病人的核心关注需求提供解决方案。
+            报告要求:
+            1. 核心健康问题汇总
+            2. 潜在风险与关联性分析
+            3. 综合健康建议
+            4. 紧急情况预警
+            5. 解决方案
+
+            请以word格式输出,我好直接生成word。
+            """
+            
+            self.set(template)
+            self.model = Demeter.service(self.model['channel'], 'llm').load(model='deepseek-r1', streaming=True)
+            full_report = ""
+            chain = (self.piece | self.model | StrOutputParser())
+            for chunk in chain.stream(query):
+                print(chunk, end="")
+                full_report += chunk
+            #self.save_docx(full_report)
+
+    def format_docs(self, docs):
+        return "\n\n".join([d.page_content for d in docs])
+
+    def save_docx(self, content):
+        patient_id = self.info.get('uid', 'unknown')  # 或者 role_id
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        filename = f"report_{patient_id}_{timestamp}.docx"
+        output_dir = "reports"
+        os.makedirs(output_dir, exist_ok=True)
+        filepath = os.path.join(output_dir, filename)
+
+        doc = Document()
+        doc.add_heading('诊断报告', 0)
+        doc.add_paragraph(report_text)
+        doc.save(filepath)
+
+        print(f"\n\n📝 报告已保存为:{filepath}")
+
+    # 生成角色
+    def create(self, site_id, uid, name, persona, lang_model_id, data, tool):
+        db = Demeter.db('role')
+        db.site_id = site_id
+        db.create_uid = create_uid
+        db.owner_uid = owner_uid
+        db.persona = persona
+        db.lang_model_id = lang_model_id
+        id = db.insert()
+        if len(data) > 0:
+            for key, value in enumerate(data):
+                pass

+ 74 - 0
service/auth.py

@@ -0,0 +1,74 @@
+# -*- coding: utf-8 -*-
+from .__load__ import *
+class Auth(object):
+    def init(self, param=param, request=False):
+        if 'appid' not in param:
+            return '参数错误:appid'
+        if 'timestamp' not in param:
+            return '参数错误:timestamp'
+        if 'nonce' not in param:
+            return '参数错误:nonce'
+        if 'signature' not in param:
+            return '参数错误:signature'
+
+        model = Demeter.model('site')
+        model.appid = param['appid']
+        site = model.select(type='fetchone')
+        if not site:
+            return '站点信息不存在'
+
+        time = Demeter.time()
+        if time < site['sdate'] or time > site['edate']:
+            return '授权已失效'
+        if time - int(param['timestamp']) > 600:
+            return '签名已过期'
+
+        if request:
+            # 针对域名做白名单
+            referer = request.headers.get("Referer")
+            if not referer:
+                return '验证失败:来源错误'
+
+            host = Demeter.host(site['link'])
+            if host != Demeter.host(referer):
+                return '验证失败:来源错误'
+
+            uri = self.getHost(request) + request.uri
+            if referer == uri:
+                return '验证失败:来源错误'
+
+        param['appsecret'] = site['appsecret']
+        if self.signature(param) != signature:
+            return '验签失败'
+        return site
+
+    def getHost(self, request):
+        host = request.host.replace(':8088', '')
+        host = request.protocol + "://" + host
+        return host
+
+    def signature(self, param):
+        for k, v in param.items():
+            all_params[k] = str(v)
+        sorted_items = sorted(all_params.items(), key=lambda x: x[0])
+        param_str = "&".join(f"{k}={v}" for k, v in sorted_items)
+        return Demeter.md5(param_str)
+
+
+    # sign 只能使用一次 以后再说吧
+    def check(self, param):
+        model = Demeter.model('signature')
+        model.appid = param['appid']
+        model.signature = param['signature']
+        info = model.select(type='fetchone')
+        if info:
+            return False
+        model.site_id = param['appid']
+        model.signature = param['signature']
+        model.insert()
+
+    def clear(self):
+        num = Demeter.time() - 3600*24
+        model = Demeter.model('signature')
+        model.cdate.assign(num, exp='<=')
+        model.delete()

+ 19 - 0
service/callback.py

@@ -0,0 +1,19 @@
+# -*- coding: utf-8 -*-
+from .__load__ import *
+# 回调
+class Callback(object):
+
+    def send(self, site_id, param):
+        model = Demeter.model('site')
+        model.id = site_id
+        site = model.select(type='fetchone')
+        if 'notify' in param and param['notify']:
+            site['api'] = param['notify']
+        if 'api' in site and site['api']:
+            api = site['api']
+            param['appid'] = site['appid']
+            param['appsecret'] = site['appsecret']
+            param['timestamp'] = Demeter.time()
+            param['nonce'] = Demeter.hash()
+            param['signature'] = Dever.service('auth').signature(param)
+            Demeter.curl(api, param, 'post')

+ 163 - 0
service/comfyui/comfyui.py

@@ -0,0 +1,163 @@
+# -*- coding: utf-8 -*-
+from .__load__ import *
+class Comfyui(object):
+
+    def info(self, file, check = False):
+        output = self.act(file, False, False, check)
+        info = {}
+        if output:
+            # 提取格式信息
+            format_info = re.search(r"Input #0, (.*?), from", output)
+            if format_info:
+                info['format'] = format_info.group(1)
+
+            # 提取时长信息
+            duration_match = re.search(r'Duration: (\d+):(\d+):(\d+).(\d+)', output)
+            if duration_match:
+                hours = duration_match.group(1)
+                minutes = duration_match.group(2)
+                seconds = duration_match.group(3)
+                milliseconds = duration_match.group(4)
+                info['total_seconds'] = int(hours) * 3600 + int(minutes) * 60 + int(seconds) + int(milliseconds) / 100
+                info['duration'] = hours + ':' + minutes + ':' + seconds + '.' + milliseconds
+
+            # 提取比特率信息
+            bitrate_info = re.search(r"bitrate: (\d+ kb/s)", output)
+            if bitrate_info:
+                info['bitrate'] = bitrate_info.group(1)
+
+            # 提取视频流信息
+            video_stream_info = re.search(r"Stream #(\d+:\d+).*: Video: (.*?), (\d+x\d+)", output)
+            if video_stream_info:
+                info['video_stream'] = video_stream_info.group(1)
+                info['video_codec'] = video_stream_info.group(2)
+                resolution = video_stream_info.group(3)
+                info['resolution'] = resolution
+                width, height = resolution.split('x')
+                info['width'] = int(width)
+                info['height'] = int(height)
+
+            # 提取音频流信息
+            audio_stream_info = re.search(r"Stream #(\d+:\d+).*: Audio: (.*?), (\d+ Hz)", output)
+            if audio_stream_info:
+                info['audio_stream'] = audio_stream_info.group(1)
+                info['audio_codec'] = audio_stream_info.group(2)
+                info['audio_sample_rate'] = audio_stream_info.group(3)
+
+            match = re.search(r'(\d+(\.\d+)?) fps', output)
+            if match:
+                info['fps'] = float(match.group(1))
+                info['frame'] = info['fps'] * info['total_seconds']
+
+        return info
+
+    def act(self, input, output, option = {}, check = False):
+        self.cmd = [Demeter.ffmpeg]
+        self.cmd.append(self.input(input))
+        if option:
+            for k, v in option.items():
+                if not isinstance(v, list):
+                    v = [v]
+                method = getattr(self, k)
+                self.cmd.append(method(*v))
+
+        if output:
+            self.cmd.append(self.output(output))
+        cmd = self.implode(' ', self.cmd)
+        if check:
+            print(cmd);
+            return
+        result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, bufsize=1, encoding='latin-1')
+        return result.stderr
+
+    # 获取视频
+    def input(self, video):
+        if isinstance(video, dict):
+            return self.option(video['option'], ' -i ' + video['file'])
+        return '-i "' + video + '"'
+
+    # 输出视频
+    def output(self, video):
+        #'-b:v 10000k'
+        if isinstance(video, dict):
+            return self.option(video['option'], ' ' + video['file'])
+        return '-y ' + video
+
+    # 按照时间截取
+    def time(self, max, start = 0):
+        return '-ss ' + self.gmdate('%H:%M:%S', start) + ' -t ' + self.gmdate('%H:%M:%S', max)
+
+    # video过滤器
+    def video(self, video):
+        if video:
+            cmd = []
+            for k, v in video.items():
+                cmd.append(self.filter('video', k, v))
+            return '-vf "'+self.implode(',', cmd)+'"'
+        return ''
+
+    # audio过滤器
+    def audio(self, audio):
+        if video:
+            cmd = []
+            for k, v in audio.items():
+                cmd.append(self.filter('audio', k, v))
+            return '-af "'+self.implode(',', cmd)+'"'
+        return ''
+
+    # 复杂过滤器
+    def filter_complex(self, param):
+        cmd = []
+        for k, v in param.items():
+            cmd.append(v)
+        return '-filter_complex "'+self.implode(';', cmd)+'"'
+
+    # 过滤器 -简单过滤器
+    def filter(self, type, method, param):
+        service = Demeter.service(type, 'filter')
+        if hasattr(service, method):
+            method = getattr(service, method)
+            if not isinstance(param, list):
+                param = [param]
+            return method(*param)
+        else:
+            if isinstance(param, list):
+                tmp = []
+                for k, v in param.items():
+                    if v:
+                        v = '=' + v
+                    tmp.append(k + v)
+                param = self.implode(':', tmp)
+            if param:
+                param = '=' + param
+            return method + param
+
+    # 获取选项
+    def option(self, option, suffix = ''):
+        tmp = []
+        if option:
+            for k, v in option.items():
+                if v:
+                    v = ' ' + v
+                if isinstance(k, str) and k:
+                    tmp.append('-' + self.alias(k) + v)
+                else:
+                    tmp.append(v)
+        return self.implode(' ', tmp) + suffix
+
+    # 设置别名
+    def alias(self, k):
+        if k == 'audio':
+            return 'b:a'
+        if k == 'video':
+            return 'b:v'
+        return k
+
+    def implode(self, stn, lst):
+        return stn.join(map(str, lst))
+
+    def gmdate(self, format, start):
+        # 将给定的时间戳转换为 datetime 对象
+        dt = datetime.fromtimestamp(start, tz=timezone.utc)
+        # 使用 strftime 方法格式化 datetime 对象
+        return dt.strftime(format)

+ 1 - 0
service/data/__init__.py

@@ -0,0 +1 @@
+# -*- coding: utf-8 -*-

+ 28 - 0
service/data/db.py

@@ -0,0 +1,28 @@
+# -*- coding: utf-8 -*-
+from .__load__ import *
+from langchain_community.vectorstores import FAISS
+
+class Db(object):
+
+    def __init__(self):
+        Demeter.service('init')
+        self.embedding = Demeter.embedding
+        self.path = File.path() + 'data/db/'
+
+    def write(self, id, data):
+        name = self.name(id)
+        if File.exists(name):
+            db = FAISS.load_local(name, embeddings=self.embedding, allow_dangerous_deserialization=True)
+            db.add_documents(data)
+        else:
+            db = FAISS.from_documents(data, embedding=self.embedding)
+        db.save_local(name)
+        return db
+
+    def read(self, id):
+        db = FAISS.load_local(self.name(id), embeddings=self.embedding, allow_dangerous_deserialization=True)
+        return db
+
+    def name(self, name):
+        name = str(name)
+        return File.dest(self.path, Demeter.md5(name))

+ 28 - 0
service/data/faiss.py

@@ -0,0 +1,28 @@
+# -*- coding: utf-8 -*-
+from .__load__ import *
+from langchain_community.vectorstores import FAISS
+
+class Db(object):
+
+    def __init__(self):
+        Demeter.service('init')
+        self.embedding = Demeter.embedding
+        self.path = File.path() + 'data/db/'
+
+    def write(self, id, data):
+        name = self.name(id)
+        if File.exists(name):
+            db = FAISS.load_local(name, embeddings=self.embedding, allow_dangerous_deserialization=True)
+            db.add_documents(data)
+        else:
+            db = FAISS.from_documents(data, embedding=self.embedding)
+        db.save_local(name)
+        return db
+
+    def read(self, id):
+        db = FAISS.load_local(self.name(id), embeddings=self.embedding, allow_dangerous_deserialization=True)
+        return db
+
+    def name(self, name):
+        name = str(name)
+        return File.dest(self.path, Demeter.md5(name))

+ 1 - 0
service/extract/__init__.py

@@ -0,0 +1 @@
+# -*- coding: utf-8 -*-

+ 6 - 0
service/extract/__load__.py

@@ -0,0 +1,6 @@
+# -*- coding: utf-8 -*-
+from demeter.core import *
+from datetime import *
+import uuid
+import os
+import os.path

+ 1 - 0
service/extract/docs/__init__.py

@@ -0,0 +1 @@
+# -*- coding: utf-8 -*-

+ 6 - 0
service/extract/docs/__load__.py

@@ -0,0 +1,6 @@
+# -*- coding: utf-8 -*-
+from demeter.core import *
+import os
+import io
+import re
+from .base import Base

+ 27 - 0
service/extract/docs/base.py

@@ -0,0 +1,27 @@
+# -*- coding: utf-8 -*-
+from .__load__ import *
+class Base(object):
+
+    # 初始化
+    def init(self, file, param = {}):
+        self.file = file
+        self.param = param
+        return self
+
+    # 获取路径
+    def getPath(self):
+        if 'path' not in self.param:
+            pdf_dir = os.path.dirname(self.file)
+            pdf_name = os.path.splitext(os.path.basename(self.file))[0]
+            self.param['path'] = os.path.join(pdf_dir, pdf_name)
+            if not os.path.exists(self.param['path']):
+                os.makedirs(self.param['path'])
+
+    # 移除域名
+    def removeDomains(self, text):
+        # 匹配 URL、域名,包含 http(s)、www、裸域名
+        domain_pattern = re.compile(
+            r"(https?://[^\s]+|www\.[^\s]+|(?<!@)\b[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}(?!\w))",
+            re.IGNORECASE
+        )
+        return domain_pattern.sub("", text).strip()

+ 9 - 0
service/extract/docs/csv.py

@@ -0,0 +1,9 @@
+# -*- coding: utf-8 -*-
+from .__load__ import *
+from langchain_community.document_loaders.csv_loader import CSVLoader
+
+class Csv(object):
+
+    def run(self, file, param = {}):
+        loader = CSVLoader(file_path=file, encoding='utf-8')
+        return loader.load()

+ 9 - 0
service/extract/docs/excel.py

@@ -0,0 +1,9 @@
+# -*- coding: utf-8 -*-
+from .__load__ import *
+from langchain_community.document_loaders import UnstructuredExcelLoader
+
+class Excel(object):
+
+    def run(self, file, param = {}):
+        loader = UnstructuredExcelLoader(file, mode='elements', **param)
+        return loader.load()

+ 9 - 0
service/extract/docs/html.py

@@ -0,0 +1,9 @@
+# -*- coding: utf-8 -*-
+from .__load__ import *
+from langchain_community.document_loaders import UnstructuredHTMLLoader
+
+class Html(object):
+
+    def run(self, file, param = {}):
+        loader = UnstructuredHTMLLoader(file, encoding='utf-8')
+        return loader.load()

+ 29 - 0
service/extract/docs/img.py

@@ -0,0 +1,29 @@
+# -*- coding: utf-8 -*-
+from .__load__ import *
+from langchain_community.document_loaders import UnstructuredImageLoader
+from langchain.schema import Document
+from aip import AipOcr
+#from PIL import Image
+#from cnocr import CnOcr
+#from pix2text import Pix2Text, merge_line_texts
+
+class Img(object):
+
+    def run(self, file, param = {}):
+        #loader = UnstructuredImageLoader(file, mode='single', **param)
+        #return loader.load()
+        with open(file, 'rb') as image_file:
+            image_data = image_file.read()
+
+
+        # 初始化AipOcr对象
+        client = AipOcr(Demeter.config['baiduocr']['app_id'], Demeter.config['baiduocr']['api_key'], Demeter.config['baiduocr']['secret_key'])
+
+        # 调用百度OCR接口识别文字
+        result = client.basicGeneral(image_data)
+        text = ''
+        if 'words_result' in result:
+            for item in result['words_result']:
+                text += item['words'] + '\n'
+        data = Document(page_content=text, metadata={"source": "ocr"})
+        return data

+ 10 - 0
service/extract/docs/json.py

@@ -0,0 +1,10 @@
+# -*- coding: utf-8 -*-
+# pip install jq
+from .__load__ import *
+from langchain_community.document_loaders import JSONLoader
+
+class Json(object):
+
+    def run(self, file, param = {}):
+        loader = JSONLoader(file_path=file, **param)
+        return loader.load()

+ 110 - 0
service/extract/docs/pdf.py

@@ -0,0 +1,110 @@
+# -*- coding: utf-8 -*-
+from .__load__ import *
+#from langchain_community.document_loaders import PyPDFLoader
+import fitz
+#from PIL import Image
+class Pdf(Base):
+    # 提取为json格式
+    def json(self):
+        if not self.file:
+            return False
+        self.getPath()
+        doc = fitz.open(self.file)
+        page = doc.page_count
+        result = {'page': page, 'content': []}
+
+        for page_num in range(len(doc)):
+            page = doc.load_page(page_num)
+            page_height = page.rect.height
+            blocks = page.get_text("dict", sort=True)["blocks"]
+            page_items = []
+
+            for i, b in enumerate(blocks):
+                # 去除页眉页脚:bbox[1] 是顶部Y坐标,bbox[3] 是底部Y坐标
+                y_top = b["bbox"][1]
+                y_bottom = b["bbox"][3]
+                if y_top < page_height * 0.05 or y_bottom > page_height * 0.95:
+                    continue
+
+                if b['type'] == 0:
+                    text_content = ""
+                    for line in b["lines"]:
+                        for span in line["spans"]:
+                            span_text = span["text"]
+                            text_content += span_text
+                        text_content += "\n"
+                    text_content = text_content.strip()
+                    text_content = self.removeDomains(text_content)
+                    if text_content:
+                        page_items.append({
+                            "type": "text",
+                            "pos": b["bbox"],
+                            "content": text_content,
+                            "page": page_num + 1
+                        })
+
+                elif b['type'] == 1:  # 图片块
+                    image_bytes = b.get("image", b"")
+                    if not image_bytes or len(image_bytes) < 100:
+                        continue
+                    try:
+                        pix = fitz.Pixmap(doc, b["image"])
+                        if pix.width < 10 or pix.height < 10:
+                            continue
+                    except Exception:
+                        pass
+                    image_ext = "png"
+                    filename = f"{self.param['path']}page{page_num+1}_img_{i}.{image_ext}"
+                    with open(filename, "wb") as f:
+                        f.write(image_bytes)
+
+                    page_items.append({
+                        "type": "image",
+                        "pos": b["bbox"],
+                        "ext": image_ext,
+                        "filename": filename,
+                        "page": page_num + 1,
+                    })
+
+            result['content'].extend(page_items)
+        return result
+
+    # 提取为langchain的Document格式
+    def doc(self):
+        if not self.file:
+            return False
+        #loader = PyPDFLoader(self.file, extract_images=False)
+        #return loader.load()
+        doc = fitz.open(self.file)
+        result = {'page': page, 'content': []}
+        for page_num in range(len(doc)):
+            page = doc.load_page(page_num)
+            
+            # 提取文本
+            text = page.get_text()
+
+            # 提取图片中的文字
+            image_texts = []
+            for img in page.get_images(full=True):
+                xref = img[0]
+                base_image = doc.extract_image(xref)
+                image_bytes = base_image["image"]
+                image = Image.open(io.BytesIO(image_bytes))
+
+                #result = Demeter.service('loader', 'extract').get(image)
+
+                ocr_result = ocr.ocr(image)
+                for line in ocr_result[0]:
+                    image_texts.append(line[1])
+
+                '''
+                # OCR 识别
+                ocr_result = ocr_reader.readtext(image)
+                image_texts = " ".join([line[1] for line in ocr_result]).strip()
+                '''
+
+            # 合并文字 + 图片文字
+            full_text = text.strip() + "\n" + "\n".join(image_texts)
+            document = langchain.schema.Document(page_content=full_text)
+            result['content'].append(document)
+        return result

+ 9 - 0
service/extract/docs/ppt.py

@@ -0,0 +1,9 @@
+# -*- coding: utf-8 -*-
+from .__load__ import *
+from langchain_community.document_loaders import UnstructuredPowerPointLoader
+
+class Ppt(object):
+
+    def run(self, file, param = {}):
+        loader = UnstructuredPowerPointLoader(file, mode='elements', **param)
+        return loader.load()

+ 9 - 0
service/extract/docs/text.py

@@ -0,0 +1,9 @@
+# -*- coding: utf-8 -*-
+from .__load__ import *
+from langchain_community.document_loaders import TextLoader
+
+class Text(object):
+
+    def run(self, file, param = {}):
+        loader = TextLoader(file, encoding='utf-8')
+        return loader.load()

+ 9 - 0
service/extract/docs/web.py

@@ -0,0 +1,9 @@
+# -*- coding: utf-8 -*-
+from .__load__ import *
+from langchain_community.document_loaders import WebBaseLoader
+
+class Web(object):
+
+    def run(self, file, param = {}):
+        loader = WebBaseLoader(file, **param)
+        return loader.load()

+ 9 - 0
service/extract/docs/word.py

@@ -0,0 +1,9 @@
+# -*- coding: utf-8 -*-
+from .__load__ import *
+from langchain_community.document_loaders import UnstructuredWordDocumentLoader
+
+class Word(object):
+
+    def run(self, file, param = {}):
+        loader = UnstructuredWordDocumentLoader(file, mode='elements', **param)
+        return loader.load()

+ 335 - 0
service/extract/extract.py

@@ -0,0 +1,335 @@
+# -*- coding: utf-8 -*-
+from .__load__ import *
+
+class Extract(object):
+
+    def update(self, site, appid, source_file, source_id, source_type, uid):
+
+        info = self.getFile(appid, source_file)
+
+        extract = Demeter.model('extract')
+        extract.site_id = site
+        extract.key = info['key']
+
+        data = extract.select(type='fetchone')
+        if not data:
+            extract.site_id = site
+            extract.uid = uid
+            extract.key = info['key']
+            extract.name = info['name']
+            extract.source_id = source_id
+            extract.source_type = source_type
+            extract.source_size = 0
+            extract.source_file = info['source_file']
+            extract.local_file = info['local_file']
+            extract.local_path = info['local_path']
+            id = extract.insert()
+            info['status'] = 1
+            info['id'] = id
+        else:
+            info['id'] = data['id']
+            info['status'] = data['status']
+        if uid:
+            self.auth(site, uid, info['id'], 1)
+
+        return info
+
+    def getAuth(self, site, uid, extract_id):
+        auth = Demeter.model('extract_auth')
+        auth.uid = uid
+        auth.site_id = site
+        auth.extract_id = extract_id
+        data = auth.select(type='fetchone')
+        return data
+
+
+    def auth(self, site, uid, extract_id, status):
+        auth = Demeter.model('extract_auth')
+        auth.uid = uid
+        auth.site_id = site
+        auth.extract_id = extract_id
+        data = auth.select(type='fetchone')
+        if not data:
+            auth.site_id = site
+            auth.uid = uid
+            auth.extract_id = extract_id
+            auth.status = status
+            auth.insert()
+        elif data['status'] != status:
+            # 适用于文档转让
+            auth.id = data['id']
+            update = {}
+            update['status'] = status
+            auth.update(update)
+
+        return True
+
+    def get(self, site, appid, file):
+        extract = Demeter.model('extract')
+        extract.site_id = site
+        #extract.key = self.getKey(appid, file)
+        extract.key = file
+        data = extract.select(type='fetchone')
+        return data
+
+    def getKey(self, appid, file):
+        return Demeter.sha1(str(appid) + '_' + str(file))
+
+    def getFile(self, appid, file):
+        info = {}
+
+        (filepath,temp) = os.path.split(file)
+        (filename,extension) = os.path.splitext(temp)
+
+        info['source_file'] = file
+        info['key'] = self.getKey(appid, file)
+        info['ext'] = extension
+        info['name'] = filename
+
+        info = self.getLocalFile(appid, file, info)
+
+        return info
+
+    def getLocalFile(self, appid, file, info):
+
+        day = str(date.today())
+        day = day.split('-')
+
+        #filename =  Demeter.md5(str(uuid.uuid5(uuid.uuid1(), info['key'])))
+        filename =  info['key']
+        filepath = str(appid) + '/' + day[0] + '/' + day[1] + '/' + day[2]
+        path = ''
+
+        if 'save' in Demeter.config['setting']:
+            filepath = File.mkdirs(os.path.join(Demeter.config['setting']['save'], filepath)) + '/' + filename
+        else:
+            filepath = File.mkdirs(os.path.join(Demeter.path, 'runtime','files', filepath)) + '/' + filename
+
+        local = filepath + info['ext']
+
+        info['local_file'] = local
+        info['local_path'] = filepath + '/'
+
+        return info
+        if File.exists(local):
+            return info
+        else:
+            self.download(file, local);
+            return info
+
+    def download(self, file, local):
+        if 'http' in file:
+            import requests
+            r = requests.get(file, stream=True)
+            with open(local, 'wb') as up:
+                for chunk in r.iter_content(chunk_size=1024):
+                    if chunk:
+                        up.write(chunk)
+
+        else:
+            import shutil
+            shutil.copyfile(file, local)
+
+        if File.exists(local):
+            return True
+        return False
+
+    def total(self, path):
+        page = 0
+        for parentdir,dirname,filenames in os.walk(path):  
+            for filename in filenames:
+                if os.path.splitext(filename)[1]=='.page':
+                    page = page + 1
+        return page
+
+    def handle(self, id):
+        model = Demeter.model('extract')
+        model.id = id
+        info = model.select(type='fetchone')
+
+        if not info:
+            return
+        siteModel = Demeter.model('site')
+        siteModel.id = info['site_id']
+        site = siteModel.select(type='fetchone')
+
+        status = True
+        if info['status'] == 1 or info['status'] == 4:
+            status = False
+
+        if info and status == False:
+            model.id = id
+            update = {}
+            update['status'] = 2
+            model.update(update)
+
+            if not File.exists(info['local_file']):
+                self.download(info['source_file'], info['local_file'])
+
+            if True:
+                File.mkdir(info['local_path'])
+
+                Demeter.service(info['ext'], 'docs').load(info)
+
+
+                if 'txt' in info['ext']:
+                    import chardet
+                    file_path = info['local_file']
+                    with open(file_path, 'rb') as fp:
+                        file_data = fp.read()
+                        result = chardet.detect(file_data)
+                        fp.close()
+                        if result['encoding'] != 'utf-8':
+                            file_content = file_data.decode(result['encoding'])
+                            f = open(file_path, 'w')
+                            f.write(file_content)
+                            f.close()
+                if 'pdf' in info['ext']:
+
+                handle = self.command(info)
+                Shell.popen(handle)
+                if File.exists(info['html']):
+
+                    # 处理图片
+                    self.pic(info)
+                    #self.string_switch(info['html'], "taste", "tasting")
+                    # 获取有多少页
+                    page = self.total(info['path'])
+                    model.id = id
+                    size = os.path.getsize(info['local'])
+                    update = {}
+                    update['file_size'] = size
+                    update['page'] = page
+                    update['status'] = 3
+                    model.update(update)
+
+                    # 通知接口 通知应用成功转换
+                    info['page'] = page
+                    info['file_size'] = size
+                    info['extract_status'] = 1
+                    self.api(info, site)
+                    return
+
+            model.id = id
+            update = {}
+            update['status'] = 4
+            model.update(update)
+            # 通知接口 通知应用失败转换
+            info['page'] = 0
+            info['file_size'] = 0
+            info['extract_status'] = 2
+            self.api(info, site)
+
+    def api(self, info, site):
+        if 'file_id' in info and info['file_id']:
+            api = site['api']
+
+            appid = site['appid']
+            appsecret = site['appsecret']
+            timestamp = Demeter.time()
+            nonce = Demeter.hash()
+            file = info['key']
+            file_id = info['file_id']
+            uid = info['uid']
+
+            param = self.signature(site['id'], appid, appsecret, timestamp, nonce, file, file_id, uid)
+
+            param['url'] = 'main/view'
+            param['img'] = info['url'] + '.jpg'
+            param['page'] = info['page']
+            param['ext'] = info['ext']
+            param['file_size'] = info['file_size']
+            param['status'] = info['extract_status']
+
+            Demeter.curl(api, param, 'post')
+
+    def pic(self, info):
+        from extract2jpg import extract2jpg
+        from wand.image import Image
+        width = 800
+        # 对生成的图片进行缩放
+        files = File.getFiles(info['path'])
+        if files:
+            for file in files:
+                if 'jpg' in file or 'png' in file:
+                    file = info['path'] + file
+                    with Image(filename=file) as img:
+                        target_width, target_height = self.getSize(width, img.width, img.height)
+                        img.sample(target_width, target_height)
+                        img.save(filename=file)
+
+        # 生成图片
+        dest = info['html'] + '.photo'
+        result = extract2jpg.extract_extract2jpg(info['extract'], dest, pages="0,1")
+
+        i = 0
+        for j in result[0]['output_jpgfiles']:
+            source = j
+            if i == 0:
+                dest = info['html'] + '.jpg'
+            else:
+                dest = info['html'] + '.jpg_' + str(i) + '.jpg'
+            command = 'mv '+source+' ' + dest
+            Shell.popen(command)
+            
+            with Image(filename=dest) as img:
+                target_width, target_height = self.getSize(width, img.width, img.height)
+                img.sample(target_width, target_height)
+                img.save(filename=dest)
+            i = i+1
+
+        '''
+        from wand.image import Image
+        extract = Image(filename=source, resolution=50)
+        jpg = extract.extract('jpg')
+        req_image = []
+        i = 0
+        for img in jpg.sequence:
+            if i == 0:
+                img_page = Image(image=img)
+                req_image.append(img_page.make_blob('jpg'))
+            i = i+1
+
+        for img in req_image:
+            ff = open(dest, 'wb')
+            ff.write(img)
+            ff.close()
+        '''
+
+
+    def getSize(self, target_width, img_width, img_height):
+        if img_width > target_width:
+            ratio = target_width / img_width
+            target_height = int(ratio * img_height)
+        else:
+            target_width = img_width
+            target_height = img_height
+        return target_width, target_height
+
+    def string_switch(self, x,y,z,s=1):
+        with open(x, "r", encoding="utf-8") as f:
+            #readlines以列表的形式将文件读出
+            lines = f.readlines()
+     
+        with open(x, "w", encoding="utf-8") as f_w:
+            #定义一个数字,用来记录在读取文件时在列表中的位置
+            n = 0
+            #默认选项,只替换第一次匹配到的行中的字符串
+            if s == 1:
+                for line in lines:
+                    if y in line:
+                        line = line.replace(y,z)
+                        f_w.write(line)
+                        n += 1
+                        break
+                    f_w.write(line)
+                    n += 1
+                #将剩余的文本内容继续输出
+                for i in range(n,len(lines)):
+                    f_w.write(lines[i])
+            #全局匹配替换
+            elif s == 'g':
+                for line in lines:
+                    if y in line:
+                        line = line.replace(y,z)
+                    f_w.write(line)

+ 27 - 0
service/extract/loader.py

@@ -0,0 +1,27 @@
+# -*- coding: utf-8 -*-
+from .__load__ import *
+
+class Loader(object):
+    def get(self, file, param = {}):
+        if 'http' in file:
+            loader = 'web'
+        elif File.exists(file):
+            if '.csv' in file:
+                loader = 'csv'
+            elif '.pdf' in file:
+                loader = 'pdf'
+            elif '.html' in file:
+                loader = 'html'
+            elif '.json' in file:
+                loader = 'json'
+            elif '.xls' in file:
+                loader = 'excel'
+            elif '.ppt' in file:
+                loader = 'ppt'
+            elif '.doc' in file:
+                loader = 'word'
+            elif '.jpg' in file or '.png' in file or '.gif' in file or '.webp' in file:
+                loader = 'img'
+            else:
+                loader = 'text'
+        return Demeter.service(loader, 'extract.docs').init(file, param)

+ 122 - 0
service/extract/parser.py

@@ -0,0 +1,122 @@
+# -*- coding: utf-8 -*-
+from .__load__ import *
+# 提取器
+class Parser(object):
+
+    def get(self, id=0, site_id = 0, uid = 0, source_id = 0, source = '', notify='', sync=True, method='json'):
+        extract = Demeter.model('extract')
+        if int(id) > 0:
+            extract.id = id
+        else:
+            extract.key = self.getKey(site_id, source)
+        data = extract.select(type='fetchone')
+        if not data:
+            if not source:
+                return source
+            info = self.getFile(site_id, source)
+            info['uid'] = uid
+            info['source_id'] = source_id
+            info['notify'] = notify
+            info['status'] = 1
+            info['id'] = Demeter.service('common').update('extract', False, info)
+        else:
+            info = data
+        if sync:
+            return self.handle(info, method);
+        return info['id']
+
+    def getFile(self, site_id, file):
+        info = {}
+        (filepath,temp) = os.path.split(file)
+        (filename,extension) = os.path.splitext(temp)
+        info['site_id'] = site_id;
+        info['source'] = file
+        info['key'] = self.getKey(site_id, file)
+        info['ext'] = extension
+        info['name'] = filename
+        info['file'] = file
+        info['path'] = filepath
+        info = self.getLocal(info)
+        info['size'] = os.path.getsize(info['file'])
+        return info
+
+    def getKey(self, site_id, file):
+        return Demeter.md5(str(site_id) + '_' + str(file))
+
+    def getLocal(self, info):
+        if 'http' in info['source']:
+            day = str(date.today())
+            day = day.split('-')
+            filename =  info['key']
+            filepath = str(info['site_id']) + '/' + day[0] + '/' + day[1] + '/' + day[2]
+        else:
+            filename = info['name']
+            filepath = info['path']
+
+        if 'save' in Demeter.config['setting']:
+            filepath = File.mkdirs(os.path.join(Demeter.config['setting']['save'], filepath)) + '/' + filename
+        else:
+            filepath = File.mkdirs(os.path.join(Demeter.path, 'runtime','files', filepath)) + '/' + filename
+
+        info['file'] = filepath + info['ext']
+        info['path'] = filepath + '/'
+        if File.exists(info['file']):
+            return info
+        else:
+            self.download(info['source'], info['file']);
+            return info
+
+    def download(self, file, local):
+        if 'http' in file:
+            import requests
+            r = requests.get(file, stream=True)
+            with open(local, 'wb') as up:
+                for chunk in r.iter_content(chunk_size=1024):
+                    if chunk:
+                        up.write(chunk)
+        else:
+            import shutil
+            shutil.copyfile(file, local)
+
+        if File.exists(local):
+            return True
+        return False
+
+    def handle(self, info, method):
+        param = {}
+        param['method'] = 'extract'
+        param['page'] = 0
+        param['status'] = 4
+        param['source_id'] = info['source_id']
+        status = True
+        if info['status'] == 1 or info['status'] == 4:
+            status = False
+
+        #if info and status == False:
+        if info:
+            Demeter.service('common').update('extract', info['id'], {'status':2})
+
+            if not File.exists(info['file']):
+                self.download(info['source'], info['file'])
+            if True:
+                File.mkdir(info['path'])
+                obj = Demeter.service('loader', 'extract').get(info['file'], {'path':info['path']})
+                func = getattr(obj, method)
+                result = func()
+                if result and 'page' in result and result['page'] > 0:
+                    param['content'] = result['content']
+                    param['page'] = result['page']
+                    param['status'] = 3
+                    if method == 'json':
+                        method = 1
+                    else:
+                        method = 2
+                    data = {'extract_id': info['id'], 'method' : method}
+                    content = Demeter.service('common').one('extract_content', **data)
+                    if not content:
+                        result = json.dumps(result, ensure_ascii=False)
+                        data['content'] = result
+                        Demeter.service('common').update('extract_content', False, data)
+            Demeter.service('common').update('extract', info['id'], {'status':param['status'], 'page':param['page']})
+            Demeter.service('callback').send(info['site_id'], param)
+            return param

+ 1 - 0
service/linker/__init__.py

@@ -0,0 +1 @@
+# -*- coding: utf-8 -*-

+ 3 - 0
service/linker/__load__.py

@@ -0,0 +1,3 @@
+# -*- coding: utf-8 -*-
+from demeter.core import *
+import os

+ 12 - 0
service/linker/ali.py

@@ -0,0 +1,12 @@
+# -*- coding: utf-8 -*-
+from .__load__ import *
+from langchain_community.chat_models.tongyi import ChatTongyi
+
+class Ali(object):
+
+    def load(self, model = 'qwen-plus-latest', streaming = True):
+        return ChatTongyi(
+            model=model,
+            api_key=Demeter.config['ali']['api_key'],
+            streaming=streaming,
+        )

+ 13 - 0
service/linker/baidu.py

@@ -0,0 +1,13 @@
+# -*- coding: utf-8 -*-
+from .__load__ import *
+from langchain_community.chat_models import QianfanChatEndpoint
+
+class Baidu(object):
+
+    def load(self, model = 'ERNIE-Bot-turbo', streaming = True):
+        for key,value in Demeter.config['baidu'].items():
+            os.environ[key.upper()] = value
+        return QianfanChatEndpoint(
+            streaming=streaming,
+            model=model,
+        )

+ 13 - 0
service/linker/deepseek.py

@@ -0,0 +1,13 @@
+# -*- coding: utf-8 -*-
+from .__load__ import *
+from langchain.chat_models import ChatOpenAI
+
+class Deepseek(object):
+
+    def load(self, model = '', streaming = True):
+        return ChatOpenAI(
+            model_name=model,
+            openai_api_key=Demeter.config['dp']['api_key'],
+            openai_api_base="https://api.deepseek.com",
+            streaming=streaming,
+        )

+ 12 - 0
service/linker/moonshot.py

@@ -0,0 +1,12 @@
+# -*- coding: utf-8 -*-
+from .__load__ import *
+from langchain_community.chat_models import MoonshotChat
+
+class Moonshot(object):
+
+    def load(self, model = 'moonshot-v1-32k', streaming = True):
+        return MoonshotChat(
+            model=model,
+            moonshot_api_key=Demeter.config['moonshot']['api_key'],
+            streaming=streaming,
+        )

+ 15 - 0
service/linker/spark.py

@@ -0,0 +1,15 @@
+# -*- coding: utf-8 -*-
+from .__load__ import *
+from langchain_community.chat_models import SparkLLM
+
+class Spark(object):
+
+    def load(self, model = 'qwen-turbo', streaming = True):
+        return SparkLLM(  # 科大讯飞星火(豆包)适配
+            app_id=Demeter.config['spark']['app_id'],
+            api_key=Demeter.config['spark']['api_key'],
+            api_secret=Demeter.config['spark']['api_secret'],
+            domain=Demeter.config['spark']['domain'],
+            spark_url=Demeter.config['spark']['spark_url'],
+            streaming=streaming,
+        )

+ 12 - 0
service/linker/zhipu.py

@@ -0,0 +1,12 @@
+# -*- coding: utf-8 -*-
+from .__load__ import *
+from langchain.chat_models import ChatOpenAI
+
+class Zhipu(object):
+
+    def load(self, model = 'glm-4', streaming = True):
+        return ZhipuAI(
+            model=model,
+            api_key=Demeter.config['zhipu']['api_key'],
+            streaming=streaming,
+        )

+ 47 - 0
service/loader.py

@@ -0,0 +1,47 @@
+# -*- coding: utf-8 -*-
+from .__load__ import *
+from gevent import monkey; monkey.patch_socket()
+import gevent
+class Loader(object):
+
+    # 获取
+    def get(self, obj='', module='', **param):
+        if 'sync' in param and param['sync']:
+            return Demeter.service(obj, module).get(**param)
+        else:
+            param['sync'] = False
+            id = Demeter.service(obj, module).get(**param)
+            redis = Demeter.redis()
+            config = Demeter.config['redis']
+            content = obj + '|' + module + '|' + str(id) 
+            redis.rpush(config['name'], content)
+            return content
+
+    # 启动任务
+    def start(self):
+        gevent.joinall([
+            gevent.spawn(self.run),
+        ])
+
+    # 定时运行异步任务
+    def run(self):
+        timeSleep = 1
+        redis = Demeter.redis()
+        config = Demeter.config['redis']
+        i = 0
+        while 1:
+            content = redis.lpop(config['name'])
+            if content:
+                print(content)
+                command = self.command(content.decode('utf-8'))
+                Shell.popen(command, False, False)
+            i = i+1
+            if i >= 10:
+                gevent.sleep(timeSleep)
+                i = 0
+
+    def command(self, content):
+        temp = content.split('|')
+        python_path = sys.executable
+        loader_path = os.path.join(File.path(), 'loader.py')  # 生成系统规范路径
+        return f'"{python_path}" "{loader_path}" -o {temp[0]} -m {temp[1]} -i {temp[2]}'

+ 1 - 0
service/spliter/__init__.py

@@ -0,0 +1 @@
+# -*- coding: utf-8 -*-

Einige Dateien werden nicht angezeigt, da zu viele Dateien in diesem Diff geändert wurden.