1234567891011121314151617181920212223242526 |
- # -*- coding: utf-8 -*-
- from .__load__ import *
- from langchain.text_splitter import HTMLHeaderTextSplitter,RecursiveCharacterTextSplitter
class Html(object):
    """Split HTML content into documents along its ``h1``-``h6`` headers."""

    # Schemes that mark ``data`` as an address rather than HTML markup.
    _URL_PREFIXES = ("http://", "https://")

    @staticmethod
    def _looks_like_url(data):
        """Return True when *data* is a bare http(s) URL, not HTML text.

        The whole (stripped) input must be one whitespace-free token that
        starts with ``http://`` or ``https://``.  A plain substring test is
        not enough: ordinary HTML routinely embeds absolute links such as
        ``<a href="https://...">`` and must still be treated as markup.
        """
        candidate = data.strip()
        if not candidate.startswith(Html._URL_PREFIXES):
            return False
        # A real URL has no embedded whitespace; prose or markup that merely
        # begins with a link does.
        return len(candidate.split()) == 1

    def run(self, data, chunk_size=500, chunk_overlap=30):
        """Split *data* by HTML headers and return the resulting pieces.

        Args:
            data: Either raw HTML text or a single ``http://`` / ``https://``
                URL.
            chunk_size: Passed to ``RecursiveCharacterTextSplitter`` as
                ``chunk_size``.  Only used when *data* is a URL.
            chunk_overlap: Passed to ``RecursiveCharacterTextSplitter`` as
                ``chunk_overlap``.  Only used when *data* is a URL.

        Returns:
            For a URL, the result of ``split_documents`` applied to what
            ``HTMLHeaderTextSplitter.split_text_from_url`` produced; for raw
            HTML, the result of ``HTMLHeaderTextSplitter.split_text``.
        """
        # (tag, metadata label) pairs for every heading level h1..h6.
        headers = [("h%d" % level, "Header %d" % level) for level in range(1, 7)]
        header_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers)

        if not self._looks_like_url(data):
            # NOTE: raw HTML is split on headers only; chunk_size and
            # chunk_overlap are intentionally left unused here to keep the
            # existing output for callers of this branch.
            return header_splitter.split_text(data)

        sections = header_splitter.split_text_from_url(data.strip())
        chunker = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
        return chunker.split_documents(sections)
|