langchain 文件分割
安装 langchain 包（PDFLoader、DocxLoader、EPubLoader 在运行时还分别依赖 pdf-parse、mammoth、epub2 和 html-to-text，使用对应格式时需要一并安装）
npm i langchain
代码示例
const path = require("path");

const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter");
const { PDFLoader } = require("langchain/document_loaders/fs/pdf");
const { TextLoader } = require("langchain/document_loaders/fs/text");
const { DocxLoader } = require("langchain/document_loaders/fs/docx");
const { EPubLoader } = require("langchain/document_loaders/fs/epub");
/**
 * Create the LangChain document loader that matches a file type.
 *
 * @param {string} filepath - Path of the file to load; handed unchanged to the loader constructor.
 * @param {string} mimeType - File type key: 'pdf', 'epub', 'txt' or 'docx'. Despite the
 *   parameter name this is a file extension, not a MIME type. Matching ignores letter case,
 *   surrounding whitespace and one leading dot, so '.PDF' is treated as 'pdf'.
 * @returns {PDFLoader|EPubLoader|TextLoader|DocxLoader} A loader instance bound to `filepath`.
 * @throws {Error} When `filepath` is falsy, or when the type is not one of the supported keys.
 */
function documentClassification(filepath, mimeType) {
  if (!filepath) throw new Error(`路径出现问题:${filepath}`);

  // Normalise string keys only; any other value falls through to the
  // "unsupported type" error below and is reported exactly as received.
  const type =
    typeof mimeType === 'string'
      ? mimeType.trim().toLowerCase().replace(/^\./, '')
      : mimeType;

  switch (type) {
    case 'pdf':
      return new PDFLoader(filepath);
    case 'epub':
      return new EPubLoader(filepath);
    case 'txt':
      return new TextLoader(filepath);
    case 'docx':
      return new DocxLoader(filepath);
    default:
      throw new Error(`无法解析的类型:${mimeType}`);
  }
}
/**
 * Load a document from disk and split it into text chunks.
 *
 * The loader is chosen by `documentClassification` from the file's extension, and the
 * loaded documents are split with a `RecursiveCharacterTextSplitter`.
 *
 * @param {object} options
 * @param {string} options.filepath - Path of the file to read; its final extension selects the loader.
 * @param {string} [options.filename] - Accepted for caller compatibility; not used by this function.
 * @param {number} [options.chunkSize=1000] - Chunk size passed to the splitter and used for the estimate.
 * @returns {Promise<{splitterDocs: Array, useTokens: number}>} The split documents plus
 *   `useTokens`, which is simply `splitterDocs.length * chunkSize`. It is a rough
 *   estimate, not a measured token count.
 * @throws {Error} When `filepath` is missing, or when the extension has no matching loader.
 */
async function documentReading({ filepath, filename, chunkSize = 1000 } = {}) {
  if (!filepath) throw new Error(`路径出现问题:${filepath}`);

  // Take the LAST extension of the path. Splitting on the first "." picks the wrong
  // segment for paths such as "./uploads/a.pdf" or "my.report.pdf".
  const mimeType = path.extname(filepath).slice(1).toLowerCase();
  console.log('文件类型', mimeType);

  const splitter = new RecursiveCharacterTextSplitter({ chunkSize });
  const loader = documentClassification(filepath, mimeType);
  const docs = await loader.load();
  const splitterDocs = await splitter.splitDocuments(docs);
  console.log('文档分割完成');

  return { splitterDocs, useTokens: splitterDocs.length * chunkSize };
}
// Export documentReading as this module's export value.
module.exports = documentReading