支持 网络搜索 / 远程知识库数据库对话(Pinecone)。先来讲一下:网上那些 Node + LangChain 本地知识库对话的教程、文章一大堆,拽一堆专业词汇(拽啥,不一样是看人家的英文文档用翻译搞的,弄得看个中文教程都难受),看得迷迷糊糊。跟着他们用 LangChain 搞下来,是能用,哎,但不支持上下文对话,就一个人工智障,跟个鸡肋似的。
我也就是用langchain来进行数据切片。
本地知识库对话主要逻辑用大白话讲其实就是 把用户的提问转换成向量(openai有提供的接口) ,
字符串(string) => 向量(Array: [0.0122, 0.0399, ...])
然后去远程向量数据库通过余弦相似度算法去检索(Pinecone 有现成的文档,不需要知道原理,配置一下调用就行)有一定相似度的数据(也就是相关数据)
再然后把这些提示整理一下注入到 openai 的聊天上下文中(role: 'system' 是 openai 中的角色参数,设置为系统,{ role: 'system', content: Here is the result of querying the remote vector database \n content: \n ${promptContent.text}
})
远程向量数据库其实也一样就是把用户提供的pdf txt 什么的数据(先切片,因为有大小限制)转为向量存到数据库中
网络搜索同上
这种问答的基础流程 (langchain在这里最大的用处就是数据库上传的时候切片)
用户的提问=>转为向量=>去向量数据查询=>获取前几个搜索结果注入到会话提示中=>向openai发起请求
调用main.js
// askQuestionData holds the official OpenAI chat/completions request
// parameters — see the official API reference for details.
// Register the callable plugins (web search / local knowledge-base QA);
// each entry maps to a method of the same name in main.js that OpenAI's
// function-calling mechanism may ask us to invoke.
askQuestionData['functions'] = [
  {
    "name": "getNetworkPrompt",
    "description": "This is your Search engine. Calling this function will connect to the Internet to search and return relevant data",
    "parameters": {
      "type": "object",
      "properties": {
        "q": {
          "type": "string",
          "description": "User raised questions"
        }
      },
      "required": ["q"]
    }
  },
  {
    "name": "getLocalPrompt",
    "description": "Query the remote vector library and obtain relevant content based on user questions",
    "parameters": {
      "type": "object",
      "properties": {
        "content": {
          "type": "string",
          "description": "User raised questions"
        },
        // BUG FIX: the comma separating the "content" and "namespace"
        // property definitions was missing — a syntax error in an object literal.
        "namespace": {
          "type": "string",
          "description": "The name of the user's query database"
        }
      },
      "required": ["content","namespace"]
    }
  }];
// Fire the question at OpenAI; callbackFn receives each parsed stream event.
service.openai.main.askQuestion(askQuestionData, {
  callbackFn: (res) => {
    if (res.catch) { // 网络出错 — transport-level failure
      console.log(res.catch, '网络出错')
    } else if (res.error) { // 请求出错 — API returned an error payload
      data.status = 'success'
      data.content = res.error
    } else {
      // BUG FIX: the original ternary was missing its `?` token, and
      // `content` was logged outside the block scope it was declared in;
      // the log now lives in the same scope as the declaration.
      const content = res.choices[0].delta.content !== undefined ? res.choices[0].delta.content : '';
      console.log(content, '流式响应数据')
    }
  },
  readStreamCallback: (readStream) => {
    // Receives the readable stream; can be used to destroy/stop the response.
    console.log(readStream, 'readStream,可以执行销毁停止操作')
  }
});
main.js
const Service = require('egg').Service;
const { Configuration, OpenAIApi } = require("openai");
const { PineconeClient } = require("@pinecone-database/pinecone");
const axios = require("axios");
const { htmlToText } = require('html-to-text');
const { encodingForModel } = require("js-tiktoken");
const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter");
const { EPubLoader } = require("langchain/document_loaders/fs/epub");
const { PineconeStore } = require("langchain/vectorstores/pinecone");
const { OpenAIEmbeddings } = require("langchain/embeddings/openai");
const { PDFLoader } = require("langchain/document_loaders/fs/pdf");
const { TextLoader } = require("langchain/document_loaders/fs/text");
const { DocxLoader } = require("langchain/document_loaders/fs/docx");
const fs = require('fs');
// Service implementing the knowledge-base chat pipeline on top of Egg.js:
// streamed OpenAI chat completions (with function-calling dispatch), text
// embeddings, Pinecone vector similarity search, web-search summarization,
// and document loading / splitting / upload into the vector store.
class OpenaiService extends Service {
  // Default chat request body: stream the response from gpt-3.5-turbo.
  questionConfiguration = {
    stream: true,
    model: "gpt-3.5-turbo"
  }
  // Default ctx.curl options for the /chat/completions request.
  // NOTE(review): class fields initialize during construction; this relies on
  // the Egg Service base constructor having already set `this.ctx` — confirm.
  askQuestionCurlParameter = {
    headers: {
      "Authorization": `Bearer ${this.ctx.app.config.openai.apiKey}`, // OpenAI API key
      'Content-Type': 'application/json', // JSON request body
    },
    timeout: 10000, // request timeout (ms)
    dataType: 'json', // response data type: json
    streaming: true, // enable streaming curl response
    method: 'post', // HTTP method: POST
  }
  // Try to JSON-parse `data`; returns the parsed value, or null when the string
  // is still an incomplete fragment (used when re-joining split SSE chunks).
  trySplicing(data) {
    try {
      return JSON.parse(data)
    } catch (error) {
      console.log(data, '拼接失败')
      return null
    }
  }
  // Parse one raw chunk of the OpenAI SSE stream. The chunk is split on the
  // "data:" prefix and each piece is JSON-parsed. A piece that fails to parse
  // is either the "[DONE]" terminator (dropped) or half of a JSON object that
  // was split across network chunks: such fragments are stashed in `BrokenData`
  // and re-joined with the next fragment. Returns only the parsed event objects.
  dataProcessing(dataStream, BrokenData = []) {
    return String(dataStream).split("data:").filter((e) => e).flat(Infinity).map(item => {
      try {
        return JSON.parse(item)
      } catch (error) {
        // Strip every whitespace character to detect the stream terminator.
        if (item.split(/[\t\r\f\n\s]*/g).join('') == '[DONE]') {
          return false
        } else {
          let result = null;
          if (BrokenData.length >= 1) {
            // Try appending this fragment to the previously stashed fragment.
            result = this.trySplicing(`${BrokenData[BrokenData.length - 1]}${item}`)
          }
          if (result) {
            BrokenData.pop();
            return result;
          } else {
            BrokenData.push(item)
            return false
          }
        }
      }
    }).filter((e) => e)
  }
  // Convert between a URL query string and a parameter object:
  // "path?x=1&y=2" -> { x: '1', y: '2' }; { x: 1, y: 2 } -> "?x=1&y=2".
  // Returns undefined when the input is null/undefined or (for strings) has
  // no "?" query part.
  routingParameterGeneration(parameter) {
    if (parameter == null || parameter == undefined) return;
    const parameterType = typeof parameter;
    if (parameterType == 'string') {
      let obj = {};
      let splitParameter = parameter.split('?');
      if (splitParameter.length <= 1) {
        console.warn("暂无路径参数");
        return
      }
      splitParameter[1].split('&').forEach(item => {
        if (item.indexOf('=') !== -1) {
          let splitItem = item.split('=')
          obj[splitItem[0]] = splitItem[1]
        } else {
          console.warn("error");
        }
      })
      return obj
    } else if (parameterType == 'object') {
      let stringParameter = '?'
      let parameterEntries = Object.entries(parameter);
      parameterEntries.forEach((key, index) => {
        let itemString = `${key[0]}=${key[1]}${parameterEntries.length - 1 == index ? '' : '&'}`;
        stringParameter += itemString
      })
      return stringParameter
    }
  }
  /**
   * @description Fetch web search results (Bing/Google custom search) over HTTP.
   * @param { string } BingSearchParameter - query string built by routingParameterGeneration
   * @returns { Promise } axios response holding the search results
   */
  async findNetworkdata(BingSearchParameter) {
    // console.log(BingSearchParameter, 'BingSearchParameter');
    return new Promise((resolve, reject) => {
      // NOTE(review): `this.ctx.app.config` is the whole config object and will
      // stringify as "[object Object]"; presumably a specific base-URL config
      // property (the search endpoint) was intended here — confirm.
      axios.get(`${this.ctx.app.config}${BingSearchParameter}`).then(res => resolve(res)).catch(err => reject(err))
    })
  }
  /**
   * @description Convert HTML to plain text.
   * @param {Object|Array} html - a single HTML string or an array of HTML strings
   * @returns { Array } plain-text conversion of each input item
   * @throws {Error} when `html` is missing
   */
  parsingHTML(html) {
    if (!html) throw new Error('缺少html参数');
    const htmlValues = Array.isArray(html) ? html : [html];
    return htmlValues.map(item => htmlToText(item));
  }
  // Query Pinecone for vectors similar to `pineconeParameter.vector`
  // (top 10 by default, with values and metadata included). The caller's
  // `pineconeParameter` overrides the defaults below. Resolves with the raw
  // Pinecone query response.
  findSimilarData(pineconeParameter) {
    const { apiKey, environment } = this.ctx.app.config.pinecone;
    return new Promise((resolve, reject) => {
      // Instantiate the vector-store client.
      const client = new PineconeClient();
      // Initialize it, e.g. https://first-database-a81708f.svc.us-east-1-aws.pinecone.io
      client.init({
        apiKey,
        environment,
      }).then(() => {
        const pineconeIndex = client.Index('first-database');
        const queryRequest = Object.assign({
          vector: [],
          topK: 10,
          includeValues: true,
          includeMetadata: true,
          namespace: "",
        }, pineconeParameter);
        pineconeIndex.query({ queryRequest }).then(res => {
          resolve(res)
        }).catch(err => {
          reject(err)
        })
      })
    })
  }
  // Embed `input` with the OpenAI embeddings endpoint; resolves with the raw
  // API response (the vector lives at res.data.data[0].embedding).
  getEmbeddings(input, model = 'text-embedding-ada-002') {
    return new Promise((resolve, reject) => {
      const configuration = new Configuration({
        apiKey: this.ctx.app.config.openai.apiKey,
        basePath: this.ctx.app.config.openai.basePath
      });
      const openai = new OpenAIApi(configuration);
      openai.createEmbedding({
        model,
        input
      }).then(res => {
        resolve(res)
      }).catch(err => {
        reject(err)
      })
    })
  }
  /**
   * @description Build a system prompt from web search results for the question.
   * @param { Object } parameter - search parameters; `q` is the user question
   * @returns a { role: 'system', content } message, or undefined when `q` is absent
   */
  async getNetworkPrompt(parameter = {}) {
    if (!parameter.q) return;
    // NOTE(review): `this.app.ctx.config` looks transposed — the rest of the
    // file reads config as `this.ctx.app.config`; confirm.
    let webSearchParameter = this.routingParameterGeneration(Object.assign({...this.app.ctx.config.GoogleSearch}, parameter, { q: encodeURI(parameter.q) }));
    let searchResults = await this.findNetworkdata(webSearchParameter); // fetch the related URLs
    let webDatas = await this.parsingWebPageContent(searchResults.data.items, { question: parameter.q, top: 1 });
    return {
      role: 'system',
      content: ` You can refer to the following online search results for ${decodeURIComponent(parameter.q)} to improve your answer. If it is not helpful to you, please ignore this prompt. Network search data \n ${webDatas.map((item, index) => ` ${index + 1} \n content:${item.value} \n Network source:${item.link} \n `).join('\n')}`
    }
  }
  // Ask the model (via function-calling) which of the first five search hits
  // relates to the user's question; returns an array holding the parsed
  // function-call arguments ({ link, title, snippet }), empty on failure.
  async webCorrelationAnalysis({ question, websites }) {
    websites = websites.map(item => { let { title, link, snippet } = item; return { title, link, snippet } });
    websites.length = 5; // keep only the first five results
    const configuration = new Configuration({
      apiKey: this.ctx.app.config.openai.apiKey,
      basePath: this.ctx.app.config.openai.basePath
    });
    const openai = new OpenAIApi(configuration);
    let chatCompletion = [];
    await openai.createChatCompletion({
      model: "gpt-3.5-turbo-0613",
      messages: [
        { role: "system", content: `Query a question related to the user's question in the provided JSON list and pass the JSON as a parameter to call the parseWebPageContent method` },
        {
          role: "system",
          // NOTE(review): `websites` is an array of objects, so this template
          // interpolates as "[object Object],..."; JSON.stringify(websites)
          // was presumably intended — confirm.
          content: `JSON data: \n ${websites}`
        },
        { role: "user", content: question }
      ],
      functions: [{
        "name": "parseWebPageContent",
        "description": "This is a method of obtaining web page content by parsing JSON data",
        "parameters": {
          "type": "object",
          "properties": {
            "link": {
              "type": "string",
              "description": `This is the 'link' field in JSON data`,
            },
            "title": {
              "type": "string",
              "description": `This is the 'title' field in JSON data`,
            },
            "snippet": {
              "type": "string",
              "description": `This is the 'snippet' field in JSON data`,
            }
          },
          "required": ["link", "title", "snippet"],
        }
      }]
    }).then(res => {
      if (res.data.choices[0].message?.function_call) {
        let { arguments: parameters } = res.data.choices[0].message?.function_call;
        chatCompletion.push(JSON.parse(parameters));
      }
    }).catch(err => {
      console.log(err)
    });
    return chatCompletion;
  }
  /**
   * @description Fetch the pages at the given addresses and summarize each one.
   * @param {Array} websites - search result items (each carrying a `link`)
   * @param { Number } top - maximum number of pages to fetch and summarize
   * @returns array of { value: summary-or-'', link }
   */
  async parsingWebPageContent(websites, { question, top = 1 }) {
    // NOTE(review): forEach returns undefined, so the outer console.log prints
    // "undefined"; this looks like leftover debug output.
    console.log(websites.forEach(item=>{console.log(item.snippet)}))
    if (websites.length > top) websites.length = top;
    let websiteValues = await Promise.all(websites.map(async item => {
      let { link } = item;
      let html;
      // Failed fetches degrade to an empty summary instead of rejecting.
      await axios.get(link).then(res => { html = res }).catch(err => { html = null; console.log(err) });
      let value = '';
      if (html) {
        value = await this.generateSummary(question, this.parsingHTML(html.data));
      }
      return { value, link }
    }));
    return websiteValues
  }
  /**
   * @description Summarize fetched page text with OpenAI. The page content is
   * truncated to a token budget of 3096 minus the question's token count.
   * @param {string} question - the user question guiding the summary
   * @param {Array} value - plain-text page contents (joined with ',')
   * @returns {Promise<string>} the summary ('' when the API call fails)
   */
  async generateSummary(question, value) {
    let enc = encodingForModel('gpt-3.5-turbo');
    let valueTokens = enc.encode(value.join(","));
    let residueTokensLength = 3096 - enc.encode(question).length;
    // Truncate the page tokens so question + content fit the context budget.
    if (valueTokens.length > residueTokensLength) valueTokens.length = residueTokensLength;
    const configuration = new Configuration({
      apiKey: this.ctx.app.config.openai.apiKey,
      basePath: this.ctx.app.config.openai.basePath
    });
    const openai = new OpenAIApi(configuration);
    let chatCompletion;
    await openai.createChatCompletion({
      model: "gpt-3.5-turbo",
      messages: [{ role: "system", content: `Summarize the provided network search content based on user questions` }, {
        role: "system",
        content: `Network search results: \n ${enc.decode(valueTokens)}`
      }, { role: "user", content: question }],
    }).then(res => {
      chatCompletion = res.data.choices[0].message.content
    }).catch(err => {
      console.log(err)
      chatCompletion = '';
    });
    return chatCompletion;
  }
  // Merge one streamed delta into the accumulated function_call descriptor:
  // name/other fields are overwritten, `arguments` fragments are concatenated.
  functionCallSplicing(FunctionConfiguration, item) {
    let { function_call = {} } = item.choices[0].delta;
    return Object.assign(FunctionConfiguration, function_call, { arguments: FunctionConfiguration.arguments + (function_call?.arguments || '') });
  }
  /**
   * @description Send a question to OpenAI (/chat/completions).
   * @param {object} questionConfiguration - OpenAI request parameters (merged over the defaults)
   * @param {Object} callBackConfiguration - callback holder
   * @param {Function} callBackConfiguration.callbackFn - invoked once per parsed stream event; with the full body when not streaming; with { catch: error } on transport failure
   * @param {Function} callBackConfiguration.readStreamCallback - receives the readable-stream response (for stop/destroy) when questionConfiguration.stream is true
   */
  async askQuestion(questionConfiguration = this.questionConfiguration, callBackConfiguration) {
    const data = Object.assign({}, this.questionConfiguration, questionConfiguration);
    const askQuestionCurlParameter = Object.assign({}, this.askQuestionCurlParameter, { data });
    this.ctx.curl(`${this.ctx.app.config.openai.basePath}/chat/completions`, askQuestionCurlParameter).then(res => {
      let BrokenData = []; // stash for JSON fragments split across chunks
      let FunctionConfiguration = { is: false, name: "", arguments: "" }; // accumulated function_call
      if (askQuestionCurlParameter.data.stream) {
        if (callBackConfiguration.hasOwnProperty('readStreamCallback')) callBackConfiguration.readStreamCallback(res);
        res.res.on('data', dataStream => {
          let a = this.dataProcessing(dataStream, BrokenData);
          // Once a delta carries function_call, switch to accumulating it
          // instead of forwarding events to the caller.
          if (a.length && a[0].choices[0].delta.hasOwnProperty('function_call')) FunctionConfiguration.is = true;
          a.forEach(item => FunctionConfiguration.is ? this.functionCallSplicing(FunctionConfiguration, item) : callBackConfiguration.callbackFn(item));
        });
        res.res.on('close', () => {
          // Stream finished: if the model requested a function, dispatch it and
          // continue the conversation with its result.
          if (FunctionConfiguration.is) this.functionCall(FunctionConfiguration, { messages: questionConfiguration.messages, callBackConfiguration });
        })
      } else {
        callBackConfiguration.callbackFn(res.data)
      }
    }).catch(error => {
      callBackConfiguration.callbackFn({ catch: error })
    })
  }
  // Build a system prompt from the remote vector DB: embed the question, query
  // Pinecone in `namespace`, then concatenate match texts up to a token budget
  // of 3596 minus the question's token count.
  getLocalPrompt({ content,namespace, pineconeParameter = {} }) {
    let enc = encodingForModel('gpt-3.5-turbo');
    let contentLength = enc.encode(content).length;
    return new Promise((resolve, reject) => {
      this.getEmbeddings(content).then(res => {
        let vector = res.data.data[0].embedding;
        this.findSimilarData({ ...pineconeParameter, vector, namespace }).then(res => {
          // NOTE(review): reduce is called without an initial value, so the first
          // match itself seeds `total` and is converted to { text, textLength }
          // on the second iteration (currentIndex == 1). With exactly one match
          // the callback never runs and promptContent.text is undefined — confirm.
          let promptContent = res.matches.reduce((total, currentValue, currentIndex, arr) => {
            if (currentIndex == 1) total = { text: total.metadata.text, textLength: enc.encode(total.metadata.text).length };
            let calLength = 3596 - contentLength - total.textLength;
            let remainingLength = calLength > 0 ? calLength : 0;
            if (!remainingLength) return total; // token budget exhausted
            let currentValueCodes = enc.encode(currentValue.metadata.text);
            currentValueCodes.length = remainingLength; // truncate to remaining budget
            let text = total.text + enc.decode(currentValueCodes);
            let textLength = total.textLength + currentValueCodes.length;
            return { text, textLength };
          });
          resolve({
            role: 'system',
            content: `Here is the result of querying the remote vector database \n content: \n ${promptContent.text} `
          })
        }).catch(err => {
          reject(err)
        })
      })
    })
  }
  // Load the uploaded file, split it into ~1000-character chunks, and return
  // the chunks plus a rough token-usage estimate (chunk count * 1000).
  async documentReading() {
    let { filepath, filename, } = this.ctx.request.files[0];
    // NOTE(review): split('.')[1] breaks when the path contains other dots
    // (e.g. "a.b/file.pdf"); the segment after the LAST dot is presumably wanted.
    let mimeType = filepath.split('.')[1];
    console.log('文件类型', mimeType)
    // Splitter used for chunking.
    const splitter = new RecursiveCharacterTextSplitter({
      chunkSize: 1000
    });
    // Pick a loader by extension.
    const loader = this.documentClassification(filepath, mimeType);
    console.log('文档读取')
    const docs = await loader.load();
    // Split into chunks.
    const splitterDocs = await splitter.splitDocuments(docs);
    console.log( '文档分割完成');
    return { splitterDocs, useTokens: splitterDocs.length * 1000 }
  }
  // List the current user's knowledge-base namespace records.
  async findNamespace() {
    let namespaces = await this.ctx.model.Namespace.findAll({
      where: {
        user_id: this.ctx.userinfo.id
      }
    });
    return namespaces
  }
  // Find or create the namespace record for the current user, then mark it busy
  // ('Loading' for a fresh 'Ready' namespace, 'Adding' otherwise). Throws 500
  // when the namespace is already in 'Loading'/'Adding' state.
  async createdOrUpdateNamespace() {
    let result = null;
    await this.ctx.model.Namespace.findOrCreate(
      {
        where: {
          name:this.ctx.request.body.namespace,
          user_id: this.ctx.userinfo.id
        },
        defaults: {
          name: this.ctx.request.body.namespace,
          user_id: this.ctx.userinfo.id,
          upload_status: 'Ready',
          details:this.ctx.request.body.describe,
          title:this.ctx.request.body.title||''
        }
      }
    ).then(async (res) => {
      let namespace = res[0];
      if (namespace.upload_status == 'Loading'|| namespace.upload_status == 'Adding') this.ctx.throw(500, '当前空间正在使用');
      result = await namespace.update({
        upload_status: namespace.upload_status == 'Ready' ? 'Loading' : 'Adding'
      });
    });
    return result
  }
  // Embed the split documents and upsert them into Pinecone under the namespace
  // "<userId>-<namespace>", always deleting the temp upload file afterwards.
  // Returns 'Success' or 'Fail'.
  async fileUpload({ splitterDocs, useTokens }) {
    let upload_status ;
    const { apiKey, environment } = this.ctx.app.config.pinecone;
    const client = new PineconeClient();
    await client.init({ apiKey, environment });
    const pineconeIndex = client.Index('first-database');
    let namespace = `${this.ctx.userinfo.id}-${this.ctx.request.body.namespace}`;
    await PineconeStore.fromDocuments(splitterDocs, new OpenAIEmbeddings({
      openAIApiKey: this.ctx.app.config.openai.apiKey
    }, {
      basePath: this.ctx.app.config.openai.basePath
    }), {
      pineconeIndex,
      namespace,
    }).then(res => {
      console.log('操作成功');
      upload_status = 'Success'
    }).catch(err => {
      console.log(err, 'err');
      upload_status = 'Fail'
    }).finally(() => {
      // Always remove the temporary uploaded file.
      let { filepath } = this.ctx.request.files[0];
      fs.unlink(filepath, (err) => {
        console.log(err, 'File deleted!');
      });
    });
    return upload_status
  }
  // Map a file extension to the matching LangChain document loader.
  // Throws for a missing path or an unsupported extension.
  documentClassification(filepath, mimeType) {
    if (!filepath) throw new Error(`路径出现问题:${filepath}`)
    let loader = null;
    switch (mimeType) {
      case 'pdf':
        loader = new PDFLoader(filepath)
        break;
      case 'epub':
        loader = new EPubLoader(filepath)
        break;
      case 'txt':
        loader = new TextLoader(filepath)
        break;
      case 'docx':
        loader = new DocxLoader(filepath)
        break;
      default:
        break;
    }
    if (!loader) throw new Error(`无法解析的类型:${mimeType}`)
    return loader;
  }
  /**
   * @description Dispatch an accumulated function_call: invoke the service
   * method named by the model with the parsed arguments, then continue the
   * chat with the returned system prompt.
   * @param {Object} FunctionConfiguration - accumulated function_call object ({ name, arguments })
   * @param {*} askQuestionParameter - askQuestion parameters used to continue the conversation after the call completes
   */
  functionCall(FunctionConfiguration, askQuestionParameter) {
    console.log(FunctionConfiguration,'FunctionConfiguration')
    let { messages, callBackConfiguration } = askQuestionParameter;
    // NOTE(review): only the tool prompt plus the LAST original message are
    // forwarded, so earlier conversation context is dropped on the follow-up.
    this[FunctionConfiguration.name](JSON.parse(FunctionConfiguration.arguments)).then(res => {
      this.askQuestion({ messages: [res, messages[messages.length - 1]],model:'gpt-3.5-turbo-0613' }, callBackConfiguration)
    })
  }
}
module.exports = OpenaiService