shemic
/
diviner


			
				
					
						
						
							123456789101112131415161718192021222324252627
							# -*- coding: utf-8 -*-
from .__load__ import *
class Base(object):
    """Common helpers shared by file-based processors.

    An instance is configured through :meth:`init` (not ``__init__``), which
    stores the source file path and an options dict on the instance.
    """

    def init(self, file, param=None):
        """Store the source file and its options, then return ``self``.

        Args:
            file: Path of the source file this instance works on.
            param: Optional dict of options. When omitted, a new empty dict
                is created for this instance. A dict supplied by the caller
                is stored as-is (not copied), so later writes such as the
                ``'path'`` entry set by :meth:`getPath` are visible to the
                caller as well.

        Returns:
            This instance, so the call can be chained after construction.
        """
        self.file = file
        # A literal ``{}`` default is evaluated once and shared by every call;
        # getPath() writes into this dict, so a shared default would leak one
        # instance's output path into all later instances.
        self.param = {} if param is None else param
        return self

    def getPath(self):
        """Ensure ``self.param['path']`` is set and the directory exists.

        When ``'path'`` is already present in ``self.param`` nothing is done
        (the directory is not created or checked). Otherwise the path is
        derived from ``self.file``: a directory next to the file, named after
        the file without its extension, with a trailing ``/``. The directory
        is created if it does not exist yet.

        Returns:
            None. The result is stored in ``self.param['path']``.
        """
        if 'path' in self.param:
            return

        parent_dir = os.path.dirname(self.file)
        stem = os.path.splitext(os.path.basename(self.file))[0]
        # The trailing slash is kept so callers can append file names directly.
        target = os.path.join(parent_dir, stem + '/')
        self.param['path'] = target
        if not os.path.exists(target):
            os.makedirs(target)

    def removeDomains(self, text):
        """Return ``text`` with URLs and domain names removed.

        Three forms are stripped: ``http://`` / ``https://`` URLs, tokens
        starting with ``www.``, and bare domains such as ``example.com``.
        A domain directly preceded by ``@`` is left alone, which keeps e-mail
        addresses intact. Whitespace around removed matches is not collapsed;
        only the leading and trailing whitespace of the result is stripped.

        Args:
            text: The string to clean.

        Returns:
            The cleaned string.
        """
        domain_pattern = re.compile(
            r"(https?://[^\s]+|www\.[^\s]+|(?<!@)\b[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}(?!\w))",
            re.IGNORECASE
        )
        return domain_pattern.sub("", text).strip()