123456789101112131415161718192021222324252627 |
- # -*- coding: utf-8 -*-
- from .__load__ import *
class Base(object):
    """Hold a source file path and an options dict, plus shared helpers.

    Instances are configured through :meth:`init` (not ``__init__``), which
    returns ``self`` so calls can be chained: ``Base().init(file).getPath()``.
    """

    # Matches URLs and domains: http(s) links, www-prefixed hosts, and bare
    # domains that are not directly preceded by "@".
    # Compiled once for the class instead of on every removeDomains() call.
    _DOMAIN_PATTERN = re.compile(
        r"(https?://[^\s]+|www\.[^\s]+|(?<!@)\b[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}(?!\w))",
        re.IGNORECASE
    )

    # Initialize
    def init(self, file, param=None):
        """Store the source file path and options, then return ``self``.

        Args:
            file: Path of the source file this instance works on.
            param: Optional options dict. When omitted, a fresh dict is
                created for this instance. A dict passed by the caller is
                stored by reference, so later writes (e.g. the ``'path'``
                key set by :meth:`getPath`) are visible to the caller.

        Returns:
            This instance, to allow call chaining.
        """
        self.file = file
        # A ``{}`` default would be shared by every instance created without
        # ``param``; getPath() writes into the dict, so the first computed
        # path would leak into all later instances.
        self.param = {} if param is None else param
        return self

    # Get the path
    def getPath(self):
        """Resolve the working directory, create it if missing, and return it.

        If ``self.param`` has no ``'path'`` entry, one is derived from
        ``self.file``: a directory placed next to the file and named after
        the file without its extension, with a trailing ``/``. The result is
        stored in ``self.param['path']``.

        Returns:
            The directory path held in ``self.param['path']``.

        Raises:
            OSError: If the directory cannot be created.
        """
        if 'path' not in self.param:
            parent_dir = os.path.dirname(self.file)
            stem = os.path.splitext(os.path.basename(self.file))[0] + '/'
            self.param['path'] = os.path.join(parent_dir, stem)
        path = self.param['path']
        if not os.path.exists(path):
            try:
                os.makedirs(path)
            except OSError:
                # Another process may have created the directory between the
                # existence check and makedirs(); only re-raise real failures.
                if not os.path.isdir(path):
                    raise
        return path

    # Remove domains
    def removeDomains(self, text):
        """Strip URLs and domain names from ``text``.

        Removes ``http(s)://`` links, ``www.``-prefixed hosts, and bare
        domains, then trims leading and trailing whitespace. Whitespace that
        surrounded a removed match inside the string is left as is.

        Args:
            text: The string to clean.

        Returns:
            ``text`` with every match removed and the ends stripped.
        """
        return self._DOMAIN_PATTERN.sub("", text).strip()
|