业务背景
在RAG方案中,由于使用langchain默认按字数切分,文本的召回效果不够理想。本方案面向某证券公司,其知识库大多为规章制度、法律条例等,因此个性化地提供三种切分方式:默认(按字数)切分、章节切分、条切分。
技术细节
- 使用langchain读取docx、pdf、txt文档
# Choose a document loader from the file extension; only pdf/txt/docx are supported.
from langchain_community.document_loaders import PyPDFLoader, TextLoader, Docx2txtLoader

_LOADER_BY_SUFFIX = {
    ".pdf": PyPDFLoader,
    ".txt": TextLoader,
    ".docx": Docx2txtLoader,
}
for suffix, loader_cls in _LOADER_BY_SUFFIX.items():
    if file_name.endswith(suffix):
        loader = loader_cls(file_path)
        break
else:
    # No supported extension matched.
    raise BizException("目前只支持pdf文件与txt、docx文件")
- 按照给定的方式切分(默认、章节、条)。切分过程中,如果某一章节或条所含字数超过配置的 extra_word_filter_size,则会将该章节或条再按默认的(500,100)方式切分。最后统一转化为 Document 类型,是因为导入向量数据库时必须使用该类型,无需额外关注。
def load_pages_(loader, file_path, split_type, chunk_size, chunk_overlap):
    """Load a document via `loader` and split it into Document pages.

    Args:
        loader: a langchain document loader (PyPDFLoader / TextLoader / Docx2txtLoader).
        file_path: source path, recorded in each Document's metadata.
        split_type: SplitterType.DEFAULT, CHAPTER, or STRIP.
        chunk_size / chunk_overlap: window sizes for the DEFAULT splitter.

    Returns:
        list[Document] — the split pages.

    Raises:
        BizException: if split_type is not one of the three supported modes.
    """
    # Flatten every loaded page into cleaned lines.
    # (Original used `_` as a meaningful loop variable and a manual append loop.)
    contents = [
        clean_text(line, filter_words)
        for page in loader.load()
        for line in page.page_content.splitlines()
    ]
    if split_type == SplitterType.DEFAULT:
        # DEFAULT: split the concatenated text purely by character count
        # (typically 500 chars with 100 overlap).
        texts = get_text_splitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap).split_text("".join(contents))
        pages = [Document(page_content=txt, metadata={'source': file_path}) for txt in texts]
    elif split_type == SplitterType.CHAPTER:
        # CHAPTER: split on chapter headings ("第X章"); oversized chapters
        # fall back to the DEFAULT splitter inside pages_add_document.
        chapter_pattern = re.compile(r'^第[\d一二三四五六七八九十百千]+章')
        pages = docx_spilt_common(file_path, contents, chapter_pattern)
    elif split_type == SplitterType.STRIP:
        # STRIP: split on article headings ("第X条").
        strip_pattern = re.compile(r'^第[\d一二三四五六七八九十百千]+条')
        pages = docx_spilt_common(file_path, contents, strip_pattern)
    else:
        raise BizException("目前只支持按DEFAULT、CHAPTER、STRIP三种切分方式文件")
    return pages
def get_text_splitter(chunk_size: int = 500, chunk_overlap: int = 100):
    """Build a RecursiveCharacterTextSplitter with the given window sizes."""
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter
def docx_spilt_common(doc_path, contents, chapter_pattern):
    """Group cleaned lines into sections delimited by `chapter_pattern` headings.

    Lines before the first heading form a preamble section; each heading line
    starts a new section and is included in it.
    (The name keeps the historical "spilt" typo because existing callers use it.)

    Args:
        doc_path: source path, recorded in each Document's metadata.
        contents: cleaned text lines of the whole document.
        chapter_pattern: compiled regex matched against the start of each line.

    Returns:
        list[Document] — one or more Documents per emitted section
        (oversized sections are re-split inside pages_add_document).
    """
    pages = []
    section_lines = []
    for line in contents:
        if chapter_pattern.match(line):
            # A new heading closes the previous section.
            section = ''.join(section_lines)
            # NOTE(review): mid-document sections of length <= extra_word_filter_size
            # are silently dropped here while the final section is always kept —
            # confirm this asymmetry is intentional.
            if len(section) > extra_word_filter_size:
                pages_add_document(pages, section, doc_path)
            section_lines = []
        section_lines.append(line)
    # Emit the trailing section. The original checked `current_chapter is not None`,
    # which was always true (it was initialized to "") and therefore emitted an
    # empty Document when `contents` was empty; guard on non-empty text instead.
    final_section = ''.join(section_lines)
    if final_section:
        pages_add_document(pages, final_section, doc_path)
    return pages
def pages_add_document(pages, page_content, doc_path):
    """Append `page_content` to `pages` as Document(s), re-splitting oversized text.

    Sections longer than extra_word_default_size are re-split with the
    extra-word splitter (extra_word_chunk_size / extra_word_chunk_overlap);
    shorter sections are appended as a single Document.
    Mutates `pages` in place; returns None.
    """
    document = Document(page_content=page_content, metadata={'source': doc_path})
    if len(page_content) > extra_word_default_size:
        # Oversized section: fall back to the character-count splitter.
        # Use extend instead of an append loop (the original loop also
        # shadowed the outer `document` variable).
        chunks = get_text_splitter(extra_word_chunk_size, extra_word_chunk_overlap).split_documents([document])
        pages.extend(chunks)
    else:
        pages.append(document)