# pdf.py
  1. # -*- coding: utf-8 -*-
  2. from demeter.core import *
  3. # 测试转换pdf python convert.py -f file.doc
  4. param = {}
  5. param['file'] = 'f'
  6. Demeter.getopt(param)
  7. #file = Demeter.option['file']
  8. file = '/data/dm/container/web/diviner/data/test.pdf'
  9. audio = 'http://s.fxjk.vip/f4/af8bf6ebcf837c717f2aeabe6fd618.mp3'
  10. # pdf提取功能
  11. # 直接提取
  12. #result = Demeter.service('loader', 'extract').get(file, {'aud1io':audio}).json()
  13. # 用通用方法 同步提取并记录已提取,下次直接用提取后的内容
  14. result = Demeter.service('loader').get(obj='parser', module='extract', sync=True, site_id=1, uid=1, source_id=1, source=file, audio=audio, method='json')
  15. print(result)
  16. '''
  17. import json
  18. import time
  19. from aliyunsdkcore.acs_exception.exceptions import ClientException
  20. from aliyunsdkcore.acs_exception.exceptions import ServerException
  21. from aliyunsdkcore.client import AcsClient
  22. from aliyunsdkcore.request import CommonRequest
  23. def fileTrans(akId, akSecret, appKey, fileLink) :
  24. # 地域ID,固定值。
  25. REGION_ID = "cn-beijing"
  26. PRODUCT = "nls-filetrans"
  27. DOMAIN = "filetrans.cn-beijing.aliyuncs.com"
  28. API_VERSION = "2018-08-17"
  29. POST_REQUEST_ACTION = "SubmitTask"
  30. GET_REQUEST_ACTION = "GetTaskResult"
  31. # 请求参数
  32. KEY_APP_KEY = "appkey"
  33. KEY_FILE_LINK = "file_link"
  34. KEY_VERSION = "version"
  35. KEY_ENABLE_WORDS = "enable_words"
  36. # 是否开启智能分轨
  37. KEY_AUTO_SPLIT = "auto_split"
  38. # 响应参数
  39. KEY_TASK = "Task"
  40. KEY_TASK_ID = "TaskId"
  41. KEY_STATUS_TEXT = "StatusText"
  42. KEY_RESULT = "Result"
  43. # 状态值
  44. STATUS_SUCCESS = "SUCCESS"
  45. STATUS_RUNNING = "RUNNING"
  46. STATUS_QUEUEING = "QUEUEING"
  47. # 创建AcsClient实例
  48. client = AcsClient(akId, akSecret, REGION_ID)
  49. # 提交录音文件识别请求
  50. postRequest = CommonRequest()
  51. postRequest.set_domain(DOMAIN)
  52. postRequest.set_version(API_VERSION)
  53. postRequest.set_product(PRODUCT)
  54. postRequest.set_action_name(POST_REQUEST_ACTION)
  55. postRequest.set_method('POST')
  56. # 新接入请使用4.0版本,已接入(默认2.0)如需维持现状,请注释掉该参数设置。
  57. # 设置是否输出词信息,默认为false,开启时需要设置version为4.0。
  58. task = {KEY_APP_KEY : appKey, KEY_FILE_LINK : fileLink, KEY_VERSION : "4.0", KEY_ENABLE_WORDS : False}
  59. # 开启智能分轨,如果开启智能分轨,task中设置KEY_AUTO_SPLIT为True。
  60. # task = {KEY_APP_KEY : appKey, KEY_FILE_LINK : fileLink, KEY_VERSION : "4.0", KEY_ENABLE_WORDS : False, KEY_AUTO_SPLIT : True}
  61. task = json.dumps(task)
  62. print(task)
  63. postRequest.add_body_params(KEY_TASK, task)
  64. taskId = ""
  65. try :
  66. postResponse = client.do_action_with_exception(postRequest)
  67. postResponse = json.loads(postResponse)
  68. print (postResponse)
  69. statusText = postResponse[KEY_STATUS_TEXT]
  70. if statusText == STATUS_SUCCESS :
  71. print ("录音文件识别请求成功响应!")
  72. taskId = postResponse[KEY_TASK_ID]
  73. else :
  74. print ("录音文件识别请求失败!")
  75. return
  76. except ServerException as e:
  77. print (e)
  78. except ClientException as e:
  79. print (e)
  80. # 创建CommonRequest,设置任务ID。
  81. getRequest = CommonRequest()
  82. getRequest.set_domain(DOMAIN)
  83. getRequest.set_version(API_VERSION)
  84. getRequest.set_product(PRODUCT)
  85. getRequest.set_action_name(GET_REQUEST_ACTION)
  86. getRequest.set_method('GET')
  87. getRequest.add_query_param(KEY_TASK_ID, taskId)
  88. # 提交录音文件识别结果查询请求
  89. # 以轮询的方式进行识别结果的查询,直到服务端返回的状态描述符为"SUCCESS"、"SUCCESS_WITH_NO_VALID_FRAGMENT",
  90. # 或者为错误描述,则结束轮询。
  91. statusText = ""
  92. while True :
  93. try :
  94. getResponse = client.do_action_with_exception(getRequest)
  95. getResponse = json.loads(getResponse)
  96. print (getResponse)
  97. statusText = getResponse[KEY_STATUS_TEXT]
  98. if statusText == STATUS_RUNNING or statusText == STATUS_QUEUEING :
  99. # 继续轮询
  100. time.sleep(10)
  101. else :
  102. # 退出轮询
  103. break
  104. except ServerException as e:
  105. print (e)
  106. except ClientException as e:
  107. print (e)
  108. if statusText == STATUS_SUCCESS :
  109. print ("录音文件识别成功!")
  110. else :
  111. print ("录音文件识别失败!")
  112. return
  113. accessKeyId = 'LTAI5tCFiVxuXz39MMkXFcMm'
  114. accessKeySecret = 'sdK3jVSrrqzz2nONAGyd6kvXZZwkie'
  115. appKey = 'm0mDna21AWao7b0A'
  116. fileLink = "http://s.fxjk.vip/f4/af8bf6ebcf837c717f2aeabe6fd618.mp3"
  117. # 执行录音文件识别
  118. fileTrans(accessKeyId, accessKeySecret, appKey, fileLink)
  119. '''