1234567891011121314151617 |
- # -*- coding: utf-8 -*-
- from .__load__ import *
- from langchain.text_splitter import RecursiveCharacterTextSplitter
- # 文本分割
class Text(object):
    """Split documents into overlapping, character-based chunks."""

    def run(self, data, separator=None, chunk_size=100, chunk_overlap=20):
        """Split ``data`` using a ``RecursiveCharacterTextSplitter``.

        Args:
            data: Documents to split; passed unchanged to
                ``split_documents``.
            separator: Separator patterns handed to the splitter as
                ``separators``. The splitter is configured with
                ``is_separator_regex=True``, so they are treated as regular
                expressions. Defaults to ``["\\n\\n", "\\n", " ", ""]``.
            chunk_size: Size of each chunk, measured with ``len``.
            chunk_overlap: Number of characters adjacent chunks may overlap.

        Returns:
            The result of ``split_documents`` for ``data``.
        """
        # Build the default per call instead of using a list literal in the
        # signature: a mutable default is created once and shared by every
        # call (and by every splitter that keeps a reference to it).
        if separator is None:
            separator = ["\n\n", "\n", " ", ""]

        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,  # size of each chunk
            chunk_overlap=chunk_overlap,  # characters adjacent chunks may overlap
            length_function=len,
            is_separator_regex=True,
            separators=separator,
        )
        return splitter.split_documents(data)
|