base.py 897 B

123456789101112131415161718192021222324252627
  1. # -*- coding: utf-8 -*-
  2. from .__load__ import *
  3. class Base(object):
  4. # 初始化
  5. def init(self, file, param = {}):
  6. self.file = file
  7. self.param = param
  8. return self
  9. # 获取路径
  10. def getPath(self):
  11. if 'path' not in self.param:
  12. pdf_dir = os.path.dirname(self.file)
  13. pdf_name = os.path.splitext(os.path.basename(self.file))[0]
  14. self.param['path'] = os.path.join(pdf_dir, pdf_name)
  15. if not os.path.exists(self.param['path']):
  16. os.makedirs(self.param['path'])
  17. # 移除域名
  18. def removeDomains(self, text):
  19. # 匹配 URL、域名,包含 http(s)、www、裸域名
  20. domain_pattern = re.compile(
  21. r"(https?://[^\s]+|www\.[^\s]+|(?<!@)\b[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}(?!\w))",
  22. re.IGNORECASE
  23. )
  24. return domain_pattern.sub("", text).strip()