前端交互
文件上传交互
<!-- Hidden native file picker; selection is handled by handleFileChange.
     Presumably add_file() forwards the click to this input via the
     "fileInput" ref — confirm in the methods section. -->
<input type="file" ref="fileInput" style="display: none" @change="handleFileChange">
<!-- Upload button; while isupload is true it is swapped for a spinner. -->
<div v-if="!isupload" class="add-butt" @click="add_file"><i class="el-icon-upload" style="font-size: 28px;"></i></div>
<div v-else class="add-butt"><i class="el-icon-loading"></i></div>
目前没有发现Vue.js怎么操作客户端文件,所以还是把文件上传到服务器,服务器将用户文件解析分割并向量化,将向量数据库序列化以方便下次加载而不用重新embedding。
由于本项目没有使用数据库,由localStorage储存的userID唯一标识用户。
handleFileChange(event) {
const file = event.target.files[0];
if (!file) return;
const fileName = file.name.toLowerCase();
if (fileName.endsWith('.pdf') || fileName.endsWith('.txt') || fileName.endsWith('.doc') || fileName.endsWith('.docx')) {
const maxSizeMB = 3;
const maxSizeBytes = maxSizeMB * 1024 * 1024;
if (file.size > maxSizeBytes) {
this.$message(`文件大小超过 ${maxSizeMB} MB,请选择小于 ${maxSizeMB} MB 的文件`);
return;
}
this.isupload = true;
let fileType;
if (fileName.endsWith('.pdf')) {
fileType = 'pdf';
} else if (fileName.endsWith('.txt')) {
fileType = 'txt';
} else if (fileName.endsWith('.doc') || fileName.endsWith('.docx')) {
fileType = 'doc';
}
let user_ID = localStorage.getItem("AIEDU_ID");
let count = parseInt(localStorage.getItem("count"))
upload({ file, type: fileType, userID: user_ID, count:count}).then((res) => {
if(res.msg=="ok"){
localStorage.setItem("count",count+1)
let new_chat = {
name: file.name,
id: generateUUID(),
time: new Date().toLocaleString(),
type : 1,
filename : file.name,
count: count
}
this.personList.push(new_chat)
let chats = JSON.parse(localStorage.getItem("chats"));
chats.push(new_chat)
localStorage.setItem("chats", JSON.stringify(chats));
this.$message("ok");
this.isupload = false;
return;
}else{
}
});
} else {
// 提示用户选择支持的文件类型
this.$message("请选择 PDF、文本或 Word 文件~🥳");
}
}
用户提问
与之前的FreeChat类似,只是需要携带用户id和文件索引以在服务器找到对应向量数据库。
// Send the latest question, with up to the 7 most recent chat messages as
// context, to the RAG endpoint; render the markdown answer and persist the
// conversation under this chat's uuid in localStorage.
let userID = localStorage.getItem("AIEDU_ID")
let data = { "msgs": [], "userID": userID, "count": this.frinedInfo.count }
// Cap the history at the last 7 messages to bound the prompt size.
const start = Math.max(0, this.chatList.length - 7);
for (let i = start; i < this.chatList.length; i++) {
  data.msgs.push({ "msg": this.chatList[i].msg })
}
ragChat(data).then((res) => {
  // Server returns markdown; convert to HTML for rendering.
  let converter = new Showdown.Converter();
  let convertedHtml = converter.makeHtml(res.msg);
  // The last entry is the pending assistant placeholder being filled in.
  this.chatList[this.chatList.length - 1].msg = convertedHtml;
  this.isSend = false;
  // "history" may not exist yet on a first visit — default to [].
  let history = JSON.parse(localStorage.getItem("history") ?? "[]");
  let found = false;
  for (let i = 0; i < history.length; i++) {
    if (history[i].uuid == this.frinedInfo.id) {
      history[i].data = this.chatList;
      found = true;
      break; // uuids are unique; no need to scan further
    }
  }
  if (!found) {
    history.push({ uuid: this.frinedInfo.id, data: this.chatList });
  }
  localStorage.setItem("history", JSON.stringify(history));
  this.scrollBottom();
}).catch(() => {
  // Request failed: unblock the send button so the user can retry.
  this.isSend = false;
});
后端
文件处理
后端拿到用户文件将进行解析,得到文本后将其分割向量化,序列化数据库到磁盘。目前文本分割和索引策略都较简单,有待后续改进。
@user_rag.post("/upload")
async def uploadFile(file: UploadFile = File(...), userID: str = Form(...), count: int = Form(...)):
    """Persist an uploaded document and serialize its FAISS vector store.

    The raw file is saved as ../userFile/<userID>/<count>.<ext>; its
    embedded vector store is written to ../userVector/<userID>/<count> so
    later chats can load it without re-embedding.

    Returns ``{"msg": "ok"}`` on success.
    """
    contents = await file.read()
    # NOTE(review): userID and count come straight from the client; a
    # crafted userID containing ".." could escape the storage folders.
    # TODO confirm upstream validation or sanitize before joining paths.
    # 创建用户文件夹
    user_folder = os.path.join('../userFile', userID)
    os.makedirs(user_folder, exist_ok=True)
    # 创建用户向量文件夹
    vector_folder = os.path.join('../userVector', userID)
    os.makedirs(vector_folder, exist_ok=True)
    # 构建文件保存路径 (文件索引代替原文件名, 规避FAISS中文路径问题)
    file_path = os.path.join(user_folder, str(count) + "." + get_type(file.filename))
    # 向量保存路径
    vector_path = os.path.join(vector_folder, str(count))
    # Only write the file the first time this (userID, count) pair is seen.
    if not os.path.isfile(file_path):
        with open(file_path, 'wb') as f:
            f.write(contents)
    # Skip the embedding pass when the serialized store already exists —
    # that is the point of persisting it (previously it was always rebuilt).
    if not os.path.isdir(vector_path):
        # NOTE(review): PyPDFLoader is applied to every accepted extension,
        # but the frontend also allows .txt/.doc/.docx — those need their
        # own loaders (e.g. TextLoader / Docx2txtLoader).
        loader = PyPDFLoader(file_path)
        docs = loader.load()
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=300,
            chunk_overlap=25,
        )
        documents = text_splitter.split_documents(docs)
        db = FAISS.from_documents(documents, embeddings_model)
        db.save_local(vector_path)
    return {"msg": "ok"}
用户提问
由用户id和文件索引在服务器找到对应向量数据库,加载后针对用户问题进行召回,形成context以供LLM参考,并附带历史聊天记录。
@user_rag.post("/chat")
async def ragChat(msgs: Msgs):
    """Answer the user's latest question via RAG over their uploaded file.

    Loads the FAISS store identified by (userID, count), retrieves the
    top-3 chunks for the last message as context, and invokes the LLM with
    the earlier messages as chat history.

    Returns ``{"msg": <answer>}``, or an error message when the vector
    store does not exist or no message was supplied.
    """
    # 向量路径
    vector_folder = os.path.join('../userVector', msgs.userID)
    vector_path = os.path.join(vector_folder, str(msgs.count))
    if not os.path.isdir(vector_path):
        # Previously `else: pass` — the client silently received no answer.
        return {"msg": "未找到对应的向量数据库,请先上传文件"}
    if not msgs.msgs:
        return {"msg": "未收到任何消息"}
    # The last message is the new question; the preceding ones are
    # (human, ai) pairs of earlier turns.
    query_text = msgs.msgs[-1].msg
    history_msgs = msgs.msgs[:-1]
    chat_history = []
    i = 0
    # Pair up strictly; an unmatched trailing message is dropped instead of
    # overrunning the list (the original crashed on an even message count).
    while i + 1 < len(history_msgs):
        chat_history.append(HumanMessage(content=history_msgs[i].msg))
        # ("ai", ...) tuples are converted to AIMessage by LangChain; the
        # original appended a bare str, which is coerced to a *human* turn.
        chat_history.append(("ai", history_msgs[i + 1].msg))
        i += 2
    chatLLM = ChatTongyi(
        streaming=True,
        model_name="qwen-turbo",
    )
    q_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", "仅依据以下提供的内容回答给出的问题{context}"),
            MessagesPlaceholder(variable_name="chat_history"),
            ("human", "{question}"),
        ]
    )
    db = FAISS.load_local(vector_path, embeddings_model, allow_dangerous_deserialization=True)
    retriever = db.as_retriever(search_kwargs={"k": 3})
    # Retrieve the top-3 chunks and fold them into one context string.
    context = (retriever | form_docs).invoke(query_text)
    chain = q_prompt | chatLLM | StrOutputParser()
    return {"msg": chain.invoke({"context": context, "question": query_text, "chat_history": chat_history})}
踩坑
FAISS
FAISS的Python库竟然不支持中文字符,若是路径或文件名有中文会有奇怪的事情发生,目前解决方案是将用户文件名替换为文件索引(泪),但这样增加了业务逻辑的复杂性,后续可以再想想有什么更好的解决方案。