# -*- coding: utf-8 -*-
from .__load__ import *
from langchain.text_splitter import HTMLHeaderTextSplitter, RecursiveCharacterTextSplitter


class Html(object):
    """Split HTML (given inline or by URL) into header-delimited documents."""

    # (tag, metadata key) pairs handed to HTMLHeaderTextSplitter.
    _HEADERS_TO_SPLIT_ON = (
        ("h1", "Header 1"),
        ("h2", "Header 2"),
        ("h3", "Header 3"),
        ("h4", "Header 4"),
        ("h5", "Header 5"),
        ("h6", "Header 6"),
    )

    # Schemes that mark ``data`` as an address rather than markup.
    _URL_PREFIXES = ("http://", "https://")

    def run(self, data: str, chunk_size: int = 500, chunk_overlap: int = 30):
        """Split ``data`` on its h1-h6 headers.

        Args:
            data: Either an ``http://`` / ``https://`` URL or a string of
                HTML markup.
            chunk_size: Chunk size given to ``RecursiveCharacterTextSplitter``.
                Only used when ``data`` is a URL.
            chunk_overlap: Chunk overlap given to
                ``RecursiveCharacterTextSplitter``. Only used when ``data``
                is a URL.

        Returns:
            When ``data`` is a URL: the result of ``split_text_from_url``
            further divided by ``RecursiveCharacterTextSplitter.split_documents``.
            Otherwise: the result of ``HTMLHeaderTextSplitter.split_text``
            with no further chunking.
        """
        header_splitter = HTMLHeaderTextSplitter(
            headers_to_split_on=list(self._HEADERS_TO_SPLIT_ON)
        )

        # Decide by how the input *starts*, not by whether a scheme appears
        # anywhere in it: raw HTML routinely contains absolute links
        # (``<a href="https://...">``) and must not be treated as a URL.
        candidate = data.strip()
        if not candidate.lower().startswith(self._URL_PREFIXES):
            return header_splitter.split_text(data)

        sections = header_splitter.split_text_from_url(candidate)
        chunker = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
        return chunker.split_documents(sections)