支持 网络搜索 / 远程知识库数据库对话(Pinecone)。先来讲一下:网上那些 Node + LangChain 本地知识库对话的教程、文章一大堆,拽一堆专业词汇(拽啥,不一样是看人家的英文文档用翻译搞的,弄得看个中文教程都难受),看得迷迷糊糊。跟着他们用 LangChain 搞下来,是能用,哎,但不支持上下文对话,就一个人工智障,跟个鸡肋似的。
我也就是用langchain来进行数据切片。
本地知识库对话主要逻辑用大白话讲其实就是 把用户的提问转换成向量(openai有提供的接口) ,
字符串(string) => 向量(Array: [0.0122, 0.0399, ...])
然后去远程向量数据库通过余弦相似度算法去检索(Pinecone 有现成的文档,不需要知道原理,配置一下调用就行)有一定相似度的数据(也就是相关数据)
再然后把这些提示整理一下注入到 openai 的聊天上下文中(role: 'system' 是 openai 中的角色参数,设置为系统,{ role: 'system', content: Here is the result of querying the remote vector database \n content: \n ${promptContent.text}
})
远程向量数据库其实也一样就是把用户提供的pdf txt 什么的数据(先切片,因为有大小限制)转为向量存到数据库中
网络搜索同上
这种问答的基础流程 (langchain在这里最大的用处就是数据库上传的时候切片)
用户的提问=>转为向量=>去向量数据查询=>获取前几个搜索结果注入到会话提示中=>向openai发起请求
调用main.js
// askQuestionData holds the official OpenAI chat/completions request
// parameters — see the official API reference for details.
// Register the callable plugins (web search / local knowledge-base QA);
// each entry maps to a method of the same name in main.js that OpenAI's
// function-calling mechanism may ask us to invoke.
askQuestionData['functions'] = [
  {
    "name": "getNetworkPrompt",
    "description": "This is your Search engine. Calling this function will connect to the Internet to search and return relevant data",
    "parameters": {
      "type": "object",
      "properties": {
        "q": {
          "type": "string",
          "description": "User raised questions"
        }
      },
      "required": ["q"]
    }
  },
  {
    "name": "getLocalPrompt",
    "description": "Query the remote vector library and obtain relevant content based on user questions",
    "parameters": {
      "type": "object",
      "properties": {
        "content": {
          "type": "string",
          "description": "User raised questions"
        },
        // BUG FIX: the comma separating the "content" and "namespace"
        // property definitions was missing — a syntax error in an object literal.
        "namespace": {
          "type": "string",
          "description": "The name of the user's query database"
        }
      },
      "required": ["content","namespace"]
    }
  }];
// Fire the question at OpenAI; callbackFn receives each parsed stream event.
service.openai.main.askQuestion(askQuestionData, {
  callbackFn: (res) => {
    if (res.catch) { // 网络出错 — transport-level failure
      console.log(res.catch, '网络出错')
    } else if (res.error) { // 请求出错 — API returned an error payload
      data.status = 'success'
      data.content = res.error
    } else {
      // BUG FIX: the original ternary was missing its `?` token, and
      // `content` was logged outside the block scope it was declared in;
      // the log now lives in the same scope as the declaration.
      const content = res.choices[0].delta.content !== undefined ? res.choices[0].delta.content : '';
      console.log(content, '流式响应数据')
    }
  },
  readStreamCallback: (readStream) => {
    // Receives the readable stream; can be used to destroy/stop the response.
    console.log(readStream, 'readStream,可以执行销毁停止操作')
  }
});
main.js
const Service = require('egg').Service;
const { Configuration, OpenAIApi } = require("openai");
const { PineconeClient } = require("@pinecone-database/pinecone");
const axios = require("axios");
const { htmlToText } = require('html-to-text');
const { encodingForModel } = require("js-tiktoken");
const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter");
const { EPubLoader } = require("langchain/document_loaders/fs/epub");
const { PineconeStore } = require("langchain/vectorstores/pinecone");
const { OpenAIEmbeddings } = require("langchain/embeddings/openai");
const { PDFLoader } = require("langchain/document_loaders/fs/pdf");
const { TextLoader } = require("langchain/document_loaders/fs/text");
const { DocxLoader } = require("langchain/document_loaders/fs/docx");
const fs = require('fs');
// Service implementing the knowledge-base chat pipeline on top of Egg.js:
// streamed OpenAI chat completions (with function-calling dispatch), text
// embeddings, Pinecone vector similarity search, web-search summarization,
// and document loading / splitting / upload into the vector store.
class OpenaiService extends Service {
  // Default chat request body: stream the response from gpt-3.5-turbo.
  questionConfiguration = {
    stream: true,
    model: "gpt-3.5-turbo"
  }
  // Default ctx.curl options for the /chat/completions request.
  // NOTE(review): class fields initialize during construction; this relies on
  // the Egg Service base constructor having already set `this.ctx` — confirm.
  askQuestionCurlParameter = {
    headers: {
      "Authorization": `Bearer ${this.ctx.app.config.openai.apiKey}`, // OpenAI API key
      'Content-Type': 'application/json', // JSON request body
    },
    timeout: 10000, // request timeout (ms)
    dataType: 'json', // response data type: json
    streaming: true, // enable streaming curl response
    method: 'post', // HTTP method: POST
  }
  // Try to JSON-parse `data`; returns the parsed value, or null when the string
  // is still an incomplete fragment (used when re-joining split SSE chunks).
  trySplicing(data) {
    try {
      return JSON.parse(data)
    } catch (error) {
      console.log(data, '拼接失败')
      return null
    }
  }
  // Parse one raw chunk of the OpenAI SSE stream. The chunk is split on the
  // "data:" prefix and each piece is JSON-parsed. A piece that fails to parse
  // is either the "[DONE]" terminator (dropped) or half of a JSON object that
  // was split across network chunks: such fragments are stashed in `BrokenData`
  // and re-joined with the next fragment. Returns only the parsed event objects.
  dataProcessing(dataStream, BrokenData = []) {
    return String(dataStream).split("data:").filter((e) => e).flat(Infinity).map(item => {
      try {
        return JSON.parse(item)
      } catch (error) {
        // Strip every whitespace character to detect the stream terminator.
        if (item.split(/[\t\r\f\n\s]*/g).join('') == '[DONE]') {
          return false
        } else {
          let result = null;
          if (BrokenData.length >= 1) {
            // Try appending this fragment to the previously stashed fragment.
            result = this.trySplicing(`${BrokenData[BrokenData.length - 1]}${item}`)
          }
          if (result) {
            BrokenData.pop();
            return result;
          } else {
            BrokenData.push(item)
            return false
          }
        }
      }
    }).filter((e) => e)
  }
  // Convert between a URL query string and a parameter object:
  // "path?x=1&y=2" -> { x: '1', y: '2' }; { x: 1, y: 2 } -> "?x=1&y=2".
  // Returns undefined when the input is null/undefined or (for strings) has
  // no "?" query part.
  routingParameterGeneration(parameter) {
    if (parameter == null || parameter == undefined) return;
    const parameterType = typeof parameter;
    if (parameterType == 'string') {
      let obj = {};
      let splitParameter = parameter.split('?');
      if (splitParameter.length <= 1) {
        console.warn("暂无路径参数");
        return
      }
      splitParameter[1].split('&').forEach(item => {
        if (item.indexOf('=') !== -1) {
          let splitItem = item.split('=')
          obj[splitItem[0]] = splitItem[1]
        } else {
          console.warn("error");
        }
      })
      return obj
    } else if (parameterType == 'object') {
      let stringParameter = '?'
      let parameterEntries = Object.entries(parameter);
      parameterEntries.forEach((key, index) => {
        let itemString = `${key[0]}=${key[1]}${parameterEntries.length - 1 == index ? '' : '&'}`;
        stringParameter += itemString
      })
      return stringParameter
    }
  }
  /**
   * @description Fetch web search results (Bing/Google custom search) over HTTP.
   * @param { string } BingSearchParameter - query string built by routingParameterGeneration
   * @returns { Promise } axios response holding the search results
   */
  async findNetworkdata(BingSearchParameter) {
    // console.log(BingSearchParameter, 'BingSearchParameter');
    return new Promise((resolve, reject) => {
      // NOTE(review): `this.ctx.app.config` is the whole config object and will
      // stringify as "[object Object]"; presumably a specific base-URL config
      // property (the search endpoint) was intended here — confirm.
      axios.get(`${this.ctx.app.config}${BingSearchParameter}`).then(res => resolve(res)).catch(err => reject(err))
    })
  }
  /**
   * @description Convert HTML to plain text.
   * @param {Object|Array} html - a single HTML string or an array of HTML strings
   * @returns { Array } plain-text conversion of each input item
   * @throws {Error} when `html` is missing
   */
  parsingHTML(html) {
    if (!html) throw new Error('缺少html参数');
    const htmlValues = Array.isArray(html) ? html : [html];
    return htmlValues.map(item => htmlToText(item));
  }
  // Query Pinecone for vectors similar to `pineconeParameter.vector`
  // (top 10 by default, with values and metadata included). The caller's
  // `pineconeParameter` overrides the defaults below. Resolves with the raw
  // Pinecone query response.
  findSimilarData(pineconeParameter) {
    const { apiKey, environment } = this.ctx.app.config.pinecone;
    return new Promise((resolve, reject) => {
      // Instantiate the vector-store client.
      const client = new PineconeClient();
      // Initialize it, e.g. https://first-database-a81708f.svc.us-east-1-aws.pinecone.io
      client.init({
        apiKey,
        environment,
      }).then(() => {
        const pineconeIndex = client.Index('first-database');
        const queryRequest = Object.assign({
          vector: [],
          topK: 10,
          includeValues: true,
          includeMetadata: true,
          namespace: "",
        }, pineconeParameter);
        pineconeIndex.query({ queryRequest }).then(res => {
          resolve(res)
        }).catch(err => {
          reject(err)
        })
      })
    })
  }
  // Embed `input` with the OpenAI embeddings endpoint; resolves with the raw
  // API response (the vector lives at res.data.data[0].embedding).
  getEmbeddings(input, model = 'text-embedding-ada-002') {
    return new Promise((resolve, reject) => {
      const configuration = new Configuration({
        apiKey: this.ctx.app.config.openai.apiKey,
        basePath: this.ctx.app.config.openai.basePath
      });
      const openai = new OpenAIApi(configuration);
      openai.createEmbedding({
        model,
        input
      }).then(res => {
        resolve(res)
      }).catch(err => {
        reject(err)
      })
    })
  }
  /**
   * @description Build a system prompt from web search results for the question.
   * @param { Object } parameter - search parameters; `q` is the user question
   * @returns a { role: 'system', content } message, or undefined when `q` is absent
   */
  async getNetworkPrompt(parameter = {}) {
    if (!parameter.q) return;
    // NOTE(review): `this.app.ctx.config` looks transposed — the rest of the
    // file reads config as `this.ctx.app.config`; confirm.
    let webSearchParameter = this.routingParameterGeneration(Object.assign({...this.app.ctx.config.GoogleSearch}, parameter, { q: encodeURI(parameter.q) }));
    let searchResults = await this.findNetworkdata(webSearchParameter); // fetch the related URLs
    let webDatas = await this.parsingWebPageContent(searchResults.data.items, { question: parameter.q, top: 1 });
    return {
      role: 'system',
      content: ` You can refer to the following online search results for ${decodeURIComponent(parameter.q)} to improve your answer. If it is not helpful to you, please ignore this prompt. Network search data \n ${webDatas.map((item, index) => ` ${index + 1} \n content:${item.value} \n Network source:${item.link} \n `).join('\n')}`
    }
  }
  // Ask the model (via function-calling) which of the first five search hits
  // relates to the user's question; returns an array holding the parsed
  // function-call arguments ({ link, title, snippet }), empty on failure.
  async webCorrelationAnalysis({ question, websites }) {
    websites = websites.map(item => { let { title, link, snippet } = item; return { title, link, snippet } });
    websites.length = 5; // keep only the first five results
    const configuration = new Configuration({
      apiKey: this.ctx.app.config.openai.apiKey,
      basePath: this.ctx.app.config.openai.basePath
    });
    const openai = new OpenAIApi(configuration);
    let chatCompletion = [];
    await openai.createChatCompletion({
      model: "gpt-3.5-turbo-0613",
      messages: [
        { role: "system", content: `Query a question related to the user's question in the provided JSON list and pass the JSON as a parameter to call the parseWebPageContent method` },
        {
          role: "system",
          // NOTE(review): `websites` is an array of objects, so this template
          // interpolates as "[object Object],..."; JSON.stringify(websites)
          // was presumably intended — confirm.
          content: `JSON data: \n ${websites}`
        },
        { role: "user", content: question }
      ],
      functions: [{
        "name": "parseWebPageContent",
        "description": "This is a method of obtaining web page content by parsing JSON data",
        "parameters": {
          "type": "object",
          "properties": {
            "link": {
              "type": "string",
              "description": `This is the 'link' field in JSON data`,
            },
            "title": {
              "type": "string",
              "description": `This is the 'title' field in JSON data`,
            },
            "snippet": {
              "type": "string",
              "description": `This is the 'snippet' field in JSON data`,
            }
          },
          "required": ["link", "title", "snippet"],
        }
      }]
    }).then(res => {
      if (res.data.choices[0].message?.function_call) {
        let { arguments: parameters } = res.data.choices[0].message?.function_call;
        chatCompletion.push(JSON.parse(parameters));
      }
    }).catch(err => {
      console.log(err)
    });
    return chatCompletion;
  }
  /**
   * @description Fetch the pages at the given addresses and summarize each one.
   * @param {Array} websites - search result items (each carrying a `link`)
   * @param { Number } top - maximum number of pages to fetch and summarize
   * @returns array of { value: summary-or-'', link }
   */
  async parsingWebPageContent(websites, { question, top = 1 }) {
    // NOTE(review): forEach returns undefined, so the outer console.log prints
    // "undefined"; this looks like leftover debug output.
    console.log(websites.forEach(item=>{console.log(item.snippet)}))
    if (websites.length > top) websites.length = top;
    let websiteValues = await Promise.all(websites.map(async item => {
      let { link } = item;
      let html;
      // Failed fetches degrade to an empty summary instead of rejecting.
      await axios.get(link).then(res => { html = res }).catch(err => { html = null; console.log(err) });
      let value = '';
      if (html) {
        value = await this.generateSummary(question, this.parsingHTML(html.data));
      }
      return { value, link }
    }));
    return websiteValues
  }
  /**
   * @description Summarize fetched page text with OpenAI. The page content is
   * truncated to a token budget of 3096 minus the question's token count.
   * @param {string} question - the user question guiding the summary
   * @param {Array} value - plain-text page contents (joined with ',')
   * @returns {Promise<string>} the summary ('' when the API call fails)
   */
  async generateSummary(question, value) {
    let enc = encodingForModel('gpt-3.5-turbo');
    let valueTokens = enc.encode(value.join(","));
    let residueTokensLength = 3096 - enc.encode(question).length;
    // Truncate the page tokens so question + content fit the context budget.
    if (valueTokens.length > residueTokensLength) valueTokens.length = residueTokensLength;
    const configuration = new Configuration({
      apiKey: this.ctx.app.config.openai.apiKey,
      basePath: this.ctx.app.config.openai.basePath
    });
    const openai = new OpenAIApi(configuration);
    let chatCompletion;
    await openai.createChatCompletion({
      model: "gpt-3.5-turbo",
      messages: [{ role: "system", content: `Summarize the provided network search content based on user questions` }, {
        role: "system",
        content: `Network search results: \n ${enc.decode(valueTokens)}`
      }, { role: "user", content: question }],
    }).then(res => {
      chatCompletion = res.data.choices[0].message.content
    }).catch(err => {
      console.log(err)
      chatCompletion = '';
    });
    return chatCompletion;
  }
  // Merge one streamed delta into the accumulated function_call descriptor:
  // name/other fields are overwritten, `arguments` fragments are concatenated.
  functionCallSplicing(FunctionConfiguration, item) {
    let { function_call = {} } = item.choices[0].delta;
    return Object.assign(FunctionConfiguration, function_call, { arguments: FunctionConfiguration.arguments + (function_call?.arguments || '') });
  }
  /**
   * @description Send a question to OpenAI (/chat/completions).
   * @param {object} questionConfiguration - OpenAI request parameters (merged over the defaults)
   * @param {Object} callBackConfiguration - callback holder
   * @param {Function} callBackConfiguration.callbackFn - invoked once per parsed stream event; with the full body when not streaming; with { catch: error } on transport failure
   * @param {Function} callBackConfiguration.readStreamCallback - receives the readable-stream response (for stop/destroy) when questionConfiguration.stream is true
   */
  async askQuestion(questionConfiguration = this.questionConfiguration, callBackConfiguration) {
    const data = Object.assign({}, this.questionConfiguration, questionConfiguration);
    const askQuestionCurlParameter = Object.assign({}, this.askQuestionCurlParameter, { data });
    this.ctx.curl(`${this.ctx.app.config.openai.basePath}/chat/completions`, askQuestionCurlParameter).then(res => {
      let BrokenData = []; // stash for JSON fragments split across chunks
      let FunctionConfiguration = { is: false, name: "", arguments: "" }; // accumulated function_call
      if (askQuestionCurlParameter.data.stream) {
        if (callBackConfiguration.hasOwnProperty('readStreamCallback')) callBackConfiguration.readStreamCallback(res);
        res.res.on('data', dataStream => {
          let a = this.dataProcessing(dataStream, BrokenData);
          // Once a delta carries function_call, switch to accumulating it
          // instead of forwarding events to the caller.
          if (a.length && a[0].choices[0].delta.hasOwnProperty('function_call')) FunctionConfiguration.is = true;
          a.forEach(item => FunctionConfiguration.is ? this.functionCallSplicing(FunctionConfiguration, item) : callBackConfiguration.callbackFn(item));
        });
        res.res.on('close', () => {
          // Stream finished: if the model requested a function, dispatch it and
          // continue the conversation with its result.
          if (FunctionConfiguration.is) this.functionCall(FunctionConfiguration, { messages: questionConfiguration.messages, callBackConfiguration });
        })
      } else {
        callBackConfiguration.callbackFn(res.data)
      }
    }).catch(error => {
      callBackConfiguration.callbackFn({ catch: error })
    })
  }
  // Build a system prompt from the remote vector DB: embed the question, query
  // Pinecone in `namespace`, then concatenate match texts up to a token budget
  // of 3596 minus the question's token count.
  getLocalPrompt({ content,namespace, pineconeParameter = {} }) {
    let enc = encodingForModel('gpt-3.5-turbo');
    let contentLength = enc.encode(content).length;
    return new Promise((resolve, reject) => {
      this.getEmbeddings(content).then(res => {
        let vector = res.data.data[0].embedding;
        this.findSimilarData({ ...pineconeParameter, vector, namespace }).then(res => {
          // NOTE(review): reduce is called without an initial value, so the first
          // match itself seeds `total` and is converted to { text, textLength }
          // on the second iteration (currentIndex == 1). With exactly one match
          // the callback never runs and promptContent.text is undefined — confirm.
          let promptContent = res.matches.reduce((total, currentValue, currentIndex, arr) => {
            if (currentIndex == 1) total = { text: total.metadata.text, textLength: enc.encode(total.metadata.text).length };
            let calLength = 3596 - contentLength - total.textLength;
            let remainingLength = calLength > 0 ? calLength : 0;
            if (!remainingLength) return total; // token budget exhausted
            let currentValueCodes = enc.encode(currentValue.metadata.text);
            currentValueCodes.length = remainingLength; // truncate to remaining budget
            let text = total.text + enc.decode(currentValueCodes);
            let textLength = total.textLength + currentValueCodes.length;
            return { text, textLength };
          });
          resolve({
            role: 'system',
            content: `Here is the result of querying the remote vector database \n content: \n ${promptContent.text} `
          })
        }).catch(err => {
          reject(err)
        })
      })
    })
  }
  // Load the uploaded file, split it into ~1000-character chunks, and return
  // the chunks plus a rough token-usage estimate (chunk count * 1000).
  async documentReading() {
    let { filepath, filename, } = this.ctx.request.files[0];
    // NOTE(review): split('.')[1] breaks when the path contains other dots
    // (e.g. "a.b/file.pdf"); the segment after the LAST dot is presumably wanted.
    let mimeType = filepath.split('.')[1];
    console.log('文件类型', mimeType)
    // Splitter used for chunking.
    const splitter = new RecursiveCharacterTextSplitter({
      chunkSize: 1000
    });
    // Pick a loader by extension.
    const loader = this.documentClassification(filepath, mimeType);
    console.log('文档读取')
    const docs = await loader.load();
    // Split into chunks.
    const splitterDocs = await splitter.splitDocuments(docs);
    console.log( '文档分割完成');
    return { splitterDocs, useTokens: splitterDocs.length * 1000 }
  }
  // List the current user's knowledge-base namespace records.
  async findNamespace() {
    let namespaces = await this.ctx.model.Namespace.findAll({
      where: {
        user_id: this.ctx.userinfo.id
      }
    });
    return namespaces
  }
  // Find or create the namespace record for the current user, then mark it busy
  // ('Loading' for a fresh 'Ready' namespace, 'Adding' otherwise). Throws 500
  // when the namespace is already in 'Loading'/'Adding' state.
  async createdOrUpdateNamespace() {
    let result = null;
    await this.ctx.model.Namespace.findOrCreate(
      {
        where: {
          name:this.ctx.request.body.namespace,
          user_id: this.ctx.userinfo.id
        },
        defaults: {
          name: this.ctx.request.body.namespace,
          user_id: this.ctx.userinfo.id,
          upload_status: 'Ready',
          details:this.ctx.request.body.describe,
          title:this.ctx.request.body.title||''
        }
      }
    ).then(async (res) => {
      let namespace = res[0];
      if (namespace.upload_status == 'Loading'|| namespace.upload_status == 'Adding') this.ctx.throw(500, '当前空间正在使用');
      result = await namespace.update({
        upload_status: namespace.upload_status == 'Ready' ? 'Loading' : 'Adding'
      });
    });
    return result
  }
  // Embed the split documents and upsert them into Pinecone under the namespace
  // "<userId>-<namespace>", always deleting the temp upload file afterwards.
  // Returns 'Success' or 'Fail'.
  async fileUpload({ splitterDocs, useTokens }) {
    let upload_status ;
    const { apiKey, environment } = this.ctx.app.config.pinecone;
    const client = new PineconeClient();
    await client.init({ apiKey, environment });
    const pineconeIndex = client.Index('first-database');
    let namespace = `${this.ctx.userinfo.id}-${this.ctx.request.body.namespace}`;
    await PineconeStore.fromDocuments(splitterDocs, new OpenAIEmbeddings({
      openAIApiKey: this.ctx.app.config.openai.apiKey
    }, {
      basePath: this.ctx.app.config.openai.basePath
    }), {
      pineconeIndex,
      namespace,
    }).then(res => {
      console.log('操作成功');
      upload_status = 'Success'
    }).catch(err => {
      console.log(err, 'err');
      upload_status = 'Fail'
    }).finally(() => {
      // Always remove the temporary uploaded file.
      let { filepath } = this.ctx.request.files[0];
      fs.unlink(filepath, (err) => {
        console.log(err, 'File deleted!');
      });
    });
    return upload_status
  }
  // Map a file extension to the matching LangChain document loader.
  // Throws for a missing path or an unsupported extension.
  documentClassification(filepath, mimeType) {
    if (!filepath) throw new Error(`路径出现问题:${filepath}`)
    let loader = null;
    switch (mimeType) {
      case 'pdf':
        loader = new PDFLoader(filepath)
        break;
      case 'epub':
        loader = new EPubLoader(filepath)
        break;
      case 'txt':
        loader = new TextLoader(filepath)
        break;
      case 'docx':
        loader = new DocxLoader(filepath)
        break;
      default:
        break;
    }
    if (!loader) throw new Error(`无法解析的类型:${mimeType}`)
    return loader;
  }
  /**
   * @description Dispatch an accumulated function_call: invoke the service
   * method named by the model with the parsed arguments, then continue the
   * chat with the returned system prompt.
   * @param {Object} FunctionConfiguration - accumulated function_call object ({ name, arguments })
   * @param {*} askQuestionParameter - askQuestion parameters used to continue the conversation after the call completes
   */
  functionCall(FunctionConfiguration, askQuestionParameter) {
    console.log(FunctionConfiguration,'FunctionConfiguration')
    let { messages, callBackConfiguration } = askQuestionParameter;
    // NOTE(review): only the tool prompt plus the LAST original message are
    // forwarded, so earlier conversation context is dropped on the follow-up.
    this[FunctionConfiguration.name](JSON.parse(FunctionConfiguration.arguments)).then(res => {
      this.askQuestion({ messages: [res, messages[messages.length - 1]],model:'gpt-3.5-turbo-0613' }, callBackConfiguration)
    })
  }
}
module.exports = OpenaiService