text.py 613 B

1234567891011121314151617
  1. # -*- coding: utf-8 -*-
  2. from .__load__ import *
  3. from langchain.text_splitter import RecursiveCharacterTextSplitter
  4. # Text splitting
  5. class Text(object):
  6. def run(self, data, separator=["\n\n", "\n", " ", ""], chunk_size=100, chunk_overlap=20):
  7. splitter = RecursiveCharacterTextSplitter(
  8. chunk_size=chunk_size, # 指定每块大小
  9. chunk_overlap=chunk_overlap, # 指定每块可以重叠的字符数
  10. length_function=len,
  11. is_separator_regex=True,
  12. separators=separator
  13. )
  14. data = splitter.split_documents(data)
  15. return data