html.py 894 B

1234567891011121314151617181920212223242526
  1. # -*- coding: utf-8 -*-
  2. from .__load__ import *
  3. from langchain.text_splitter import HTMLHeaderTextSplitter,RecursiveCharacterTextSplitter
  4. class Html(object):
  5. def run(self, data, chunk_size=500, chunk_overlap=30):
  6. header = [
  7. ("h1", "Header 1"),
  8. ("h2", "Header 2"),
  9. ("h3", "Header 3"),
  10. ("h4", "Header 4"),
  11. ("h5", "Header 5"),
  12. ("h6", "Header 6"),
  13. ]
  14. splitter = HTMLHeaderTextSplitter(headers_to_split_on=header)
  15. if 'http://' in data or 'https://' in data:
  16. data = splitter.split_text_from_url(data)
  17. splitter = RecursiveCharacterTextSplitter(
  18. chunk_size=chunk_size, chunk_overlap=chunk_overlap
  19. )
  20. data = splitter.split_documents(data)
  21. else:
  22. data = splitter.split_text(data)
  23. return data