python-sklearn实现一个简易的智能问答机器人

最新推荐文章于 2024-06-22 20:07:46 发布

LiangJun.py

最新推荐文章于 2024-06-22 20:07:46 发布

阅读量1.2w

点赞数 29

分类专栏：数据挖掘 NLP 文章标签：智能问答文本分类

本文链接：https://blog.csdn.net/qq_26535271/article/details/100748210

版权

数据挖掘同时被 2 个专栏收录

1 篇文章 1 订阅

订阅专栏

NLP

1 篇文章 0 订阅

订阅专栏

随着AI的发展，各大企业采用智能问答机器人取代了人工客服。智能问答系统实现的方法有很多，本篇文章介绍之前做的一个简易的智能问答机器人。采用的方法是使用朴素贝叶斯模型进行问题分类，模糊匹配查询近似问题。

实现步骤

1.1 总体流程设计

问答系统的总体实现步骤如下方流程图所示，主要包括数据预处理、模型训练、结果映射以及答案匹配四个部分。数据预处理主要是收集语料库，并将其筛选、转换为需要的格式；然后使用朴素贝叶斯对处理好的语料库进行训练。语料库在输入模型训练之前，需要先进行预处理和转换（分词、去停用词、TF-IDF 计算），经 TF-IDF 计算后再输入朴素贝叶斯模型进行训练。由于我的语料库比较简易，采用默认参数训练即可达到较好的分类效果。在结果映射步骤中，主要是对事先确定好的类别进行映射处理（可用于脚本网页跳转）。答案匹配采用模糊匹配的方法，对用户提出的问题进行匹配，搜索出相似的问题并给出其对应的答案。

1.2 语料库收集

语料库收集如下图。这里第一列为需要分类的类别，第二列为相关的问题。本篇中的语料库主要分为人社信息语料库以及娱乐聊天语料库。

人社信息语料库：

娱乐聊天语料库：

1.3 主要程序介绍

可视化界面GUI主要采用了tkinter工具包完成，rum_main.py程序如下：

#!/usr/bin/env python3
# _*_ coding:utf-8 _*_

from tkinter import *
import time
from speech_test import *

'''
定义消息发送函数：
1、在<消息列表分区>的文本控件中实时添加时间；
2、获取<发送消息分区>的文本内容，添加到列表分区的文本中；
3、将<发送消息分区>的文本内容清空。
'''

def msgsend():
    """Send the typed message to the bot and append both sides to the chat log.

    Reads the input widget once, hands the text to ``main`` and writes a
    timestamped header plus the message, followed by a timestamped header plus
    the bot's reply, into the message-list widget. The input widget is cleared
    afterwards. Blank input is discarded without contacting the bot.
    """
    # Text.get(..., END) also returns the newline Tk keeps at the end of the
    # widget; strip it so the matcher sees only what the user typed.
    query = txt_msgsend.get('0.0', END).strip()
    if not query:
        txt_msgsend.delete('0.0', END)
        return

    msg = '我:' + time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()) + '\n'
    txt_msglist.insert(END, msg, 'green')  # header line for the user's message
    print(query)
    result = main(query)  # entry point of the question-answering pipeline
    print('result:', result)
    txt_msglist.insert(END, query + '\n')  # echo the message into the log
    txt_msglist.insert(END, '\n')
    txt_msgsend.delete('0.0', END)  # clear the input box
    robot = '小Y:' + time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()) + '\n'
    txt_msglist.insert(END, robot, 'red')  # header line for the bot's reply
    txt_msglist.insert(END, result + '\n')

'''定义取消发送 消息 函数'''
def cancel():
    """Discard the pending message by emptying the input text widget."""
    first_index, last_index = '0.0', END
    txt_msgsend.delete(first_index, last_index)

'''绑定up键'''
def msgsendEvent(event):
    """Key handler: send the current message when the Up arrow key is pressed."""
    # Any other key is ignored.
    if event.keysym != 'Up':
        return
    msgsend()

# Build the chat window: a root window, four frames laid out on a grid, the
# text widgets and buttons inside them, then enter the Tk event loop.
tk = Tk()
tk.title('聊天窗口')

# Create the layout frames (no explicit parent widget is passed).
'''创建分区'''
f_msglist = Frame(height=300, width=300)  # frame for the message list
f_msgsend = Frame(height=300, width=300)  # frame for the message input
f_floor = Frame(height=100, width=300)  # frame for the buttons
f_right = Frame(height=700, width=100)  # frame for the picture area
# Create the widgets.
'''创建控件'''
txt_msglist = Text(f_msglist)  # text widget that displays the conversation
txt_msglist.tag_config('green', foreground='blue')  # tag is named 'green' but is configured with a blue foreground
txt_msglist.tag_config('red', foreground='red')  # tag with a red foreground
txt_msgsend = Text(f_msgsend)  # text widget the user types into

txt_show = Text(f_msglist)  # second text widget in the message-list frame; it is never placed with grid() below
txt_show.tag_config('red', foreground='red')  # 'red' tag on that second widget
txt_showsend = Text(f_msgsend)  # second text widget in the input frame; it is never placed with grid() below

txt_msgsend.bind('<KeyPress-Up>', msgsendEvent)  # pressing the Up key inside the input box triggers msgsendEvent
# Disabled code kept as a bare string literal: a text widget for the picture frame.
'''txt_right = Text(f_right) #图片显示分区创建文本控件'''
button_send = Button(f_floor, text='Send',command=msgsend)  # Send button, bound to msgsend
button_cancel = Button(f_floor, text='Cancel', command=cancel)  # Cancel button, bound to cancel
# Place the frames and widgets on the grid.
'''分区布局'''
f_msglist.grid(row=0, column=0)  # message-list frame
f_msgsend.grid(row=1, column=0)  # message-input frame
f_floor.grid(row=2, column=0)  # button frame
f_right.grid(row=0, column=1, rowspan=3)  # picture frame, spanning all three rows
txt_msglist.grid()  # show the conversation text widget
txt_msgsend.grid()  # show the input text widget
button_send.grid(row=0, column=0, sticky=W)  # show the Send button
button_cancel.grid(row=0, column=1, sticky=W)  # show the Cancel button
tk.mainloop()  # hand control to the Tk event loop (blocks until the window closes)

智能问答机器人相关程序为 speech_test.py，程序如下：

#-*- coding:utf-8 -*-
import logging
logging.getLogger("requests").setLevel(logging.WARNING)
import csv
import jieba
import pickle
from fuzzywuzzy import fuzz
import math
from scipy import sparse
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from scipy.sparse import lil_matrix
import jieba.posseg as pseg
import sys
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from speech_recognition import *
import warnings
warnings.filterwarnings("ignore")


def load_label_url():
    """Read ``znwd_label_url.csv`` into a dict.

    The first row is treated as a header and skipped. For every following row
    the first column becomes the key and the second column the value (judging
    by the file name, presumably label -> URL; confirm against the data file).
    Rows with fewer than two columns, such as blank lines, are ignored, and an
    empty file yields an empty dict.

    Returns:
        dict mapping first-column values to second-column values.
    """
    name_id = {}
    # newline='' is what the csv module expects for correct quoted-newline handling.
    with open('znwd_label_url.csv', 'r', encoding='utf-8', newline='') as f:
        rows = csv.reader(f)
        next(rows, None)  # skip the header without failing on an empty file
        for row in rows:
            if len(row) < 2:
                continue
            name_id[row[0]] = row[1]
    return name_id


def load_cut_save(filename,load = False):
    """Tokenize a labelled question corpus and optionally pickle the result.

    The CSV file is read with its first row skipped as a header. For each
    remaining row, column 0 is the label and column 1 the question text. The
    question is segmented with jieba (after loading the user dictionary
    ``UserDefined_words.txt``); tokens found in the module-level ``stop_words``
    and purely numeric tokens are dropped, and the rest are joined with single
    spaces. Rows with fewer than two columns (e.g. blank lines) are skipped.

    Args:
        filename: path of the UTF-8 CSV corpus.
        load: despite its name, a truthy value *saves* the result to
            ``corpus.oj`` and ``label.oj`` in the working directory.

    Returns:
        ``(corpus, label)``: two parallel lists of space-joined token strings
        and their labels.
    """
    jieba.load_userdict('UserDefined_words.txt')
    # Build the lookup once; membership tests on a set are O(1).
    stops = set(stop_words)
    corpus = []
    label = []
    with open(filename, 'rt', encoding='utf-8', newline='') as f:
        data_corpus = csv.reader(f)
        next(data_corpus, None)  # skip the header, tolerate an empty file
        for words in data_corpus:
            if len(words) < 2:
                continue
            doc = [x for x in jieba.cut(words[1])
                   if x not in stops and not x.isdigit()]
            corpus.append(' '.join(doc))
            label.append(words[0])
    if load:
        with open('corpus.oj', 'wb') as f:
            pickle.dump(corpus, f)
        with open('label.oj', 'wb') as f:
            pickle.dump(label, f)
    return corpus,label

def train_model():
    """Train the Naive Bayes question classifier and persist it.

    Loads the tokenized corpus from ``corpus.oj`` and the labels from
    ``label.oj``, builds term counts with ``CountVectorizer``, converts them to
    TF-IDF with ``TfidfTransformer`` and fits a ``MultinomialNB`` model. The
    fitted model is pickled to ``model.oj``; an ``input_tfidf`` helper holding
    the vocabulary, the term-count matrix and the document count is pickled to
    ``idf.oj`` so that new queries can be vectorized later.

    Returns:
        ``(model, tfidf, label)``: the fitted classifier, the training TF-IDF
        matrix and the training labels.
    """
    with open('corpus.oj','rb') as f_corpus:
        corpus = pickle.load(f_corpus)

    with open('label.oj','rb') as f_label:
        # encoding='bytes' keeps Python 2 byte strings as bytes objects.
        label = pickle.load(f_label,encoding='bytes')

    vectorizer = CountVectorizer(min_df=1)
    # Fit the vectorizer once and reuse the counts for both the TF-IDF
    # transform and the stored term-frequency matrix.
    words_frequency = vectorizer.fit_transform(corpus)
    tfidf = TfidfTransformer().fit_transform(words_frequency)
    saved = input_tfidf(vectorizer.vocabulary_,sparse.csc_matrix(words_frequency),len(corpus))
    model = MultinomialNB()
    model.fit(tfidf,label)

    with open('model.oj','wb') as f_model:
        pickle.dump(model,f_model)

    with open('idf.oj','wb') as f_idf:
        pickle.dump(saved,f_idf)

    return model,tfidf,label

class input_tfidf(object):
    """Turn a new query into a TF-IDF row vector using stored training counts.

    Args:
        feature_index: dict mapping a token to its column index.
        frequency: term-count matrix whose columns are indexed by
            ``feature_index`` (one row per training document).
        docs: number of training documents.

    NOTE(review): the vector returned by ``getTdidf`` is not length-normalized,
    whereas ``TfidfTransformer`` applies L2 normalization by default; confirm
    whether the classifier's probability threshold relies on that.
    """

    def __init__(self,feature_index,frequency,docs):
        self.feature_index = feature_index
        self.frequency = frequency
        self.docs = docs
        self.len = len(feature_index)

    def key_count(self,input_words):
        """Segment *input_words* with jieba and return a token -> count dict."""
        count = {}
        for key in jieba.cut(input_words):
            count[key] = count.get(key, 0) + 1
        return count

    def _weights_from_counts(self,count):
        """Build the 1 x vocabulary TF-IDF row for a token -> count mapping.

        Tokens missing from ``feature_index`` are ignored. The weight of a
        known token is ``tf * (ln((docs + 1) / (df + 1)) + 1)`` where ``df`` is
        the number of training documents containing the token.
        """
        result = lil_matrix((1, self.len))
        frequency = sparse.csc_matrix(self.frequency)
        for token, occurrences in count.items():
            column = self.feature_index.get(token)
            if column is None or column < 0:
                continue
            # Document frequency = number of rows with a non-zero count, not
            # the sum of the counts (a token repeated inside one document must
            # still count as a single document).
            feature_docs = frequency.getcol(column).count_nonzero()
            idf = math.log((self.docs + 1) / (feature_docs + 1)) + 1
            result[0, column] = occurrences * idf
        return result

    def getTdidf(self,input_words):
        """Return the TF-IDF row vector (``lil_matrix``) for *input_words*."""
        return self._weights_from_counts(self.key_count(input_words))

def model_predict(input_str):
    """Classify a question and return its label when the model is confident.

    Loads the vectorizer helper from ``idf.oj`` and the classifier from
    ``model.oj`` (unpickling ``idf.oj`` requires the ``input_tfidf`` class to be
    importable), vectorizes *input_str* and predicts its class.

    Args:
        input_str: the raw question text.

    Returns:
        The predicted label as ``str`` when the highest class probability is
        at least 0.5, otherwise ``None``.
    """
    with open('idf.oj','rb') as f_idf:
        idf = pickle.load(f_idf)
    with open('model.oj','rb') as f_model:
        model = pickle.load(f_model)
    tfidf = idf.getTdidf(input_str)
    classifiction = model.predict(tfidf)
    prob = model.predict_proba(tfidf).max()
    if prob >= 0.5:
        predicted = classifiction[0]
        # Labels may be byte strings (pickles loaded with encoding='bytes') or
        # already text; str(x, 'utf-8') would raise TypeError for the latter.
        if isinstance(predicted, bytes):
            answer1 = predicted.decode('utf-8')
        else:
            answer1 = str(predicted)
    else:
        answer1 = None
    return answer1

def similarity(input_questions):
    """Find up to three stored questions that resemble *input_questions*.

    Loads three parallel lists from ``corpus_1233.oj``, ``question_1233.oj`` and
    ``answer_1233.oj``. Every corpus entry is compared with the input using
    ``fuzz.ratio``; entries scoring above 40 are kept and the three best (ties
    keep corpus order) are returned.

    Args:
        input_questions: the query text to compare against the corpus.

    Returns:
        A list of ``{"question": ..., "answer": ...}`` dicts taken unchanged
        from the pickled lists, best match first. When nothing scores above
        40 the list holds a single dict whose two values are ``None``.
    """
    with open('corpus_1233.oj', 'rb') as f:
        corpus = pickle.load(f,encoding='bytes')

    with open('question_1233.oj', 'rb') as f:
        question = pickle.load(f,encoding='bytes')

    with open('answer_1233.oj', 'rb') as f:
        answer = pickle.load(f,encoding='bytes')

    scores = {}
    for index, candidate in enumerate(corpus):
        # encoding='bytes' can yield byte strings; decode them so the ratio is
        # computed on the actual text rather than on a bytes object.
        if isinstance(candidate, bytes):
            candidate = candidate.decode('utf-8')
        score = fuzz.ratio(input_questions, candidate)
        if score > 40:
            scores[index] = score

    # sorted() is stable, so equal scores stay in corpus order.
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)[:3]
    if not ranked:
        return [{"question": None, "answer": None}]
    return [{"question": question[index], "answer": answer[index]}
            for index, _ in ranked]

def get_greeting(input_questions,question,answer):
    """Return the canned reply whose greeting best matches the input.

    Each entry of *question* is scored against *input_questions* with
    ``fuzz.ratio``. The reply at the same position in *answer* is returned for
    the highest score above 60 (the earliest entry wins ties); ``None`` is
    returned when no entry scores above 60.
    """
    best_index = None
    best_score = 60
    for index, candidate in enumerate(question):
        score = fuzz.ratio(input_questions, candidate)
        # Strictly greater: keeps the first entry among equal scores.
        if score > best_score:
            best_index = index
            best_score = score
    if best_index is None:
        return None
    return answer[best_index]


def sim(doc):
    """Strip stop words from *doc* and look up similar stored questions.

    The text is segmented with jieba, tokens present in the module-level
    ``stop_words`` are dropped, and the remaining tokens are concatenated
    without separators before being passed to ``similarity``.
    """
    kept_tokens = [token for token in jieba.cut(doc) if token not in stop_words]
    return similarity(''.join(kept_tokens))

def _ans_text(value):
    """Return *value* as str, decoding UTF-8 byte strings."""
    if isinstance(value, bytes):
        return value.decode('utf-8')
    return str(value)


def _format_matches(matches):
    """Render question/answer dicts as a numbered list, one entry per block."""
    entries = [
        "%d、" % number
        + " 问题：%s\n" % _ans_text(item['question'])
        + " 答案：%s" % _ans_text(item['answer'])
        for number, item in enumerate(matches, 1)
    ]
    # Separate entries with a newline so the next number does not run into
    # the previous answer.
    return "\n".join(entries)


def ans_show(returnSet):
    """Compose the reply text from the three partial answers.

    Args:
        returnSet: sequence ``[category, matches, greeting]`` where *category*
            is the predicted label or ``None``, *matches* is a list of
            ``{"question", "answer"}`` dicts (a single dict with ``None``
            values means "no match"; ``None`` is also accepted) and *greeting*
            is a canned reply or ``None``.

    Returns:
        The greeting if present; otherwise the category line followed by any
        similar questions; otherwise the similar questions alone; otherwise a
        fixed apology message.
    """
    if returnSet[2] is not None:
        return "%s"%returnSet[2]

    matches = returnSet[1] or []
    has_matches = bool(matches) and matches[0]['question'] is not None

    if returnSet[0] is not None:
        ans = "您的问题属于<%s>专栏\n"%returnSet[0]
        if has_matches:
            ans = ans + "小Y还知道其他一些问题例如：\n" + _format_matches(matches)
        return ans
    if has_matches:
        return "小Y知道相似的问题：\n" + _format_matches(matches)
    return "您问的问题太过深奥，Mike才疏学浅暂时无法为您解答，待我读书破万卷后成为您的百科机器人"

# Module-level data loaded once at import time.

# Stop words are read as text: jieba yields str tokens, and a str never equals
# a bytes object, so a list of bytes would filter nothing. 'utf-8-sig' also
# drops a leading byte-order mark if the file has one.
with open('stop_words.txt', 'r', encoding='utf-8-sig') as f:
    stop_words = f.read().splitlines()

# Greeting table: column 0 is the greeting, column 1 the canned reply.
question_greeting = []
answer_greeting = []
with open("greeting.csv", 'r', encoding='utf-8', newline='') as f:
    greeting = csv.reader(f)
    header = next(greeting, None)  # skip the header, tolerate an empty file
    for words in greeting:
        if len(words) < 2:  # ignore blank or incomplete rows
            continue
        question_greeting.append(words[0])
        answer_greeting.append(words[1])

# Tokenize the classification corpus (load=False: nothing is written to disk).
filename = 'znwd_corpus.csv'
corpus, label = load_cut_save(filename,load=False)

def main(question):
    """Answer one user question and return the reply text.

    The question is first matched against the greeting table. If no greeting
    matches, it is classified with ``model_predict`` and similar stored
    questions are looked up with ``sim``. The partial answers are rendered by
    ``ans_show``.

    Args:
        question: the user's text, or ``None`` when no input is available.
            Surrounding whitespace is ignored.

    Returns:
        The reply text, or a fixed "invalid input" message when *question* is
        ``None`` or blank.
    """
    # NOTE (from the original author): an error was reported at this point
    # when speech input could not be recognized.
    query = question.strip() if isinstance(question, str) else question
    if query is None or query == '':
        return "输入有误请重新输入！"

    print("我 > %s" %query)
    answer3 = get_greeting(query,question_greeting,answer_greeting)
    if answer3 is None:
        answer1 = model_predict(query)
        answer2 = sim(query)
    else:
        # A greeting short-circuits classification and similarity search.
        answer1 = None
        answer2 = None
    return ans_show([answer1,answer2,answer3])

至此，一个简单的智能问答机器人就实现了。若需要问答机器人能够回答更多的内容，可以对语料库进行修改和扩充，从而丰富智能问答机器人的聊天范围。程序中读取语料库时采用了 pickle 工具包，将预处理后的语料库序列化到本地，这样下次使用语料库时就不需要再次进行预处理，可以节省处理时间。需要注意的是，修改语料库后需要重新生成并覆盖本地已序列化的语料库文件和模型文件。

效果展示

最终效果如下，我的语料库主要用了社保卡相关的数据，该问答系统可以当作是一个社保信息方面的问答。

工程源码：https://github.com/liangjunAI/chatting_robot

LiangJun.py

关注

29
点赞
踩
176

收藏

觉得还不错? 一键收藏
18
评论
python-sklearn实现一个简易的智能问答机器人

随着AI的发展，各大企业采用智能问答机器人取代了人工客服。智能问答系统实现的方法有很多，本篇文章介绍之前做的一个简易的智能问答机器人。采用的方法是使用朴素贝叶斯模型进行问题分类，模糊匹配查询近似问题。实现步骤1.1 总体流程设计问答系统总体实现步骤如下流程图主要包括数据预处理，模型训练，结果映射以及答案匹配。数据预处理主要是对语料库进行收集，并对语料库进行筛选转换为需要的格式；然...
复制链接

扫一扫

专栏目录