1 Bag-of-Words Model
import jieba
from gensim import corpora
from gensim import models
from gensim import similarities
#from corpora.corpus import Corpus
# 1 Word segmentation
# 1.1 Segment the historical documents to compare against
all_location_list = []
for doc in location_list:
    doc_list = [word for word in jieba.cut_for_search(doc)]
    # doc_list = [word for word in jieba.cut(doc)]
    all_location_list.append(doc_list)
# 1.2 Segment the test document
doc_test = "A市A市经济学院体育学院"
doc_test_list = [word for word in jieba.cut_for_search(doc_test)]
# doc_test_list = [word for word in jieba.cut(doc_test)]
# 2 Build the corpus
# 2.1 Build the dictionary (bag of words)
dictionary = corpora.Dictionary(all_location_list)
# 2.2 Build the corpus
# Convert the historical documents to (word id, count) vectors
corpus = [dictionary.doc2bow(doc) for doc in all_location_list]
# Convert the test document to a (word id, count) vector
doc_test_vec = dictionary.doc2bow(doc_test_list)
# 3 Similarity analysis
# 3.1 Fit a TF-IDF model on the corpus
tfidf = models.TfidfModel(corpus)
# TF-IDF weight of each word in the test document
tfidf[doc_test_vec]
# 3.2 Compute the similarity between the test document and every target document
index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=len(dictionary.keys()))
sim = index[tfidf[doc_test_vec]]
# 3.3 Sort by similarity
sorted(enumerate(sim), key=lambda item: -item[1])
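# Minimal usage sketch (assumption: location_list holds the historical documents segmented above):
# pick out the best-matching historical document and its similarity score.
ranked = sorted(enumerate(sim), key=lambda item: -item[1])
best_idx, best_score = ranked[0]
print(location_list[best_idx], best_score)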
2 TF-IDF
import jieba
from gensim import corpora,models,similarities
all_location_list = []
for doc in location_list:
    doc_list = [word for word in jieba.cut_for_search(doc)]
    all_location_list.append(doc_list)
# Build the dictionary (bag of words) and the corpus
dictionary = corpora.Dictionary(all_location_list)
corpus = [dictionary.doc2bow(doc) for doc in all_location_list]
# Fit a TF-IDF model on the corpus
tfidf = models.TfidfModel(corpus)
# Number of features
featureNUM = len(dictionary.token2id.keys())
# Transform the whole corpus with TF-IDF and index it, ready for similarity queries
index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=featureNUM)
# Sparse vectors: dictionary.doc2bow(doc) turns document doc into a sparse vector such as [(0, 1), (1, 1)],
# meaning the words with ids 0 and 1 each occur once and no other word occurs.
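# Toy illustration of doc2bow on a tiny, made-up corpus (the documents here are hypothetical):
toy_dictionary = corpora.Dictionary([['经济', '学院'], ['体育', '学院']])
print(toy_dictionary.doc2bow(['学院', '学院', '经济']))  # (word id, count) pairs, e.g. [(0, 2), (1, 1)]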
doc_test = 'A市A市魅力之城商铺无排烟管道,小区'
doc_test_list = [word for word in jieba.cut_for_search(doc_test)]
# Convert the test document to a (word id, count) vector
new_vec = dictionary.doc2bow(doc_test_list)
# TF-IDF weight of each word in the test document
tfidf[new_vec]
# Compute the similarity between the test vector and the corpus
sim = index[tfidf[new_vec]]
print(sim)
for i in range(len(location_list)):
    doc_test = location_list[i]
    # w_ID = biaoge2_paqu.loc[i, '问题ID']
    w_ID = biaoge2.loc[i, '问题ID']
    if w_ID:
        pass
    else:
        # p = biaoge2_paqu['问题ID'].max() + 1
        # biaoge2_paqu.loc[i, '问题ID'] = p
        # w1_ID = biaoge2_paqu.loc[i, '问题ID']
        p = biaoge2['问题ID'].max() + 1
        biaoge2.loc[i, '问题ID'] = p
        w1_ID = biaoge2.loc[i, '问题ID']
        doc_test_list = [word for word in jieba.cut_for_search(doc_test)]
        # Convert the document to a (word id, count) vector
        new_vec = dictionary.doc2bow(doc_test_list)
        # TF-IDF weight of each word in the document
        tfidf[new_vec]
        # Compute similarity against the whole corpus
        sim = index[tfidf[new_vec]]
        for j in range(len(biaoge2)):
            w2_ID = biaoge2.loc[j, '问题ID']
            if w2_ID:
                pass
            elif list(sim)[j]:
                # Any document with nonzero similarity gets the same question ID
                biaoge2.loc[j, '问题ID'] = w1_ID
        # print(sim)
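# Hedged note: list(sim)[j] above treats any nonzero similarity as a duplicate. A similarity cutoff
# (the value 0.5 here is an assumption, not from the original code) can be previewed like this:
matched = [j for j, s in enumerate(sim) if s > 0.5]
print(matched)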
3 Cosine Similarity
import jieba
import re
import numpy as np
import os
import pandas as pd
#os.chdir(r'C:\Users\Lenovo\Desktop\01040730kg73')
os.chdir(r'C:\Users\Administrator\Desktop\示例数据')
data4 = pd.read_excel('4.xlsx')
data4_message = data4['详情']
data4_answer = data4['意见']
message_list = list(data4_message)
# Data cleaning (de-identification)
def qingli(s):
    string1 = s.apply(lambda x: re.sub('[0-9]', '*', str(x)))  # remove digits
    m = re.compile(r'\s+')  # whitespace pattern
    string2 = string1.apply(lambda x: re.sub(m, '*', x))  # remove whitespace
    punctuation = """,!?。"#$%&'()*+-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘'‛“”„‟…‧﹏"""
    re_punctuation = "[{}]+".format(punctuation)  # punctuation pattern
    string3 = string2.apply(lambda x: re.sub(re_punctuation, '*', x))  # remove punctuation
    a = string3.apply(lambda x: re.sub(r'\*', '', x))  # drop the '*' placeholders
    return a
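# Minimal usage sketch (the example text is made up): qingli takes a pandas Series and strips
# digits, whitespace and punctuation via '*' placeholders that are then removed.
print(qingli(pd.Series(['电话 12345'])).iloc[0])  # -> '电话'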
# Segment one message, count keyword frequencies, and return the counts as a list
def Count(infile):
    t = {}
    words = jieba.lcut(infile)
    for word in words:
        if word != "" and word in t:
            t[word] = t[word] + 1
        elif word != "":
            t[word] = 1
    # Sort the (word, count) pairs by count, descending
    dic = sorted(t.items(), key=lambda item: item[1], reverse=True)
    return dic
# Merge the keywords of two documents into one vocabulary
def MergeWord(T1, T2):
    MergeWord = []
    duplicateWord = 0
    for ch in range(len(T1)):
        MergeWord.append(T1[ch][0])
    for ch in range(len(T2)):
        if T2[ch][0] in MergeWord:
            duplicateWord = duplicateWord + 1
        else:
            MergeWord.append(T2[ch][0])
    # print('duplicates = ' + str(duplicateWord))
    # print(MergeWord)
    return MergeWord
# Build the term-frequency vector of a document over the merged vocabulary
def CalVector(T1, MergeWord):
    TF1 = [0] * len(MergeWord)
    for ch in range(len(T1)):
        TermFrequence = T1[ch][1]
        word = T1[ch][0]
        i = 0
        while i < len(MergeWord):
            if word == MergeWord[i]:
                TF1[i] = TermFrequence
                break
            else:
                i = i + 1
    return TF1
# Cosine similarity of two vectors
def CalConDis(v1, v2, lengthVector):
    # Dot product of the two vectors
    B = 0
    i = 0
    while i < lengthVector:
        B = v1[i] * v2[i] + B
        i = i + 1
    # Product of the norms of the two vectors
    A1 = 0
    A2 = 0
    i = 0
    while i < lengthVector:
        A1 = A1 + v1[i] * v1[i]
        i = i + 1
    i = 0
    while i < lengthVector:
        A2 = A2 + v2[i] * v2[i]
        i = i + 1
    A = np.sqrt(A1) * np.sqrt(A2)
    print('Similarity of message and reply = ' + format(float(B) / A, ".3f"))
# Data cleaning is done once, outside the loop
D_message = qingli(data4_message)
D_answer = qingli(data4_answer)
for i in range(len(data4_message)):
    # Word-frequency statistics
    T_message = Count(D_message[i])
    T_answer = Count(D_answer[i])
    # Merged keyword vocabulary
    mergeword = MergeWord(T_message, T_answer)
    # Vectorise both documents
    V_message = CalVector(T_message, mergeword)
    V_answer = CalVector(T_answer, mergeword)
    # Cosine similarity: the closer the value is to 1, the more similar the two documents are
    print('Item ' + str(i))
    CalConDis(V_message, V_answer, len(V_message))
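# Hedged sanity check with tiny hand-made vectors (not from the data):
# identical vectors should print 1.000, orthogonal vectors 0.000.
CalConDis([1, 2, 0], [1, 2, 0], 3)
CalConDis([1, 0, 0], [0, 1, 0], 3)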
4 Python Built-in Similarity (difflib)
import difflib
def string_similar(s1, s2):
    return difflib.SequenceMatcher(None, s1, s2).quick_ratio()
for i in range(len(data4_message)):
    s1 = data4_message[i]
    s2 = data4_answer[i]
    print(string_similar(s1, s2))
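# Quick sanity check on made-up strings: quick_ratio is an upper bound based on shared characters.
print(string_similar('abcd', 'abce'))  # e.g. 0.75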
5 word2vec
import re
def qingli(s):
    # pattern = r"(https?://|[@#])\S*"
    # a = re.sub(pattern, '', s)
    # string1 = s.apply(lambda x: re.sub('[A-z]', '*', str(x)))  # remove letters
    string2 = s.apply(lambda x: re.sub('[0-9]', '*', str(x)))  # remove digits
    m = re.compile(r'\s+')  # whitespace pattern
    string3 = string2.apply(lambda x: re.sub(m, '*', x))  # remove whitespace
    punctuation = """,!?。"#$%&'()*+-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘'‛“”„‟…‧﹏"""
    re_punctuation = "[{}]+".format(punctuation)  # punctuation pattern
    string4 = string3.apply(lambda x: re.sub(re_punctuation, '*', x))  # remove punctuation (applied to string3 so the whitespace step is not discarded)
    a = string4.apply(lambda x: re.sub(r'\*', '', x))  # drop the '*' placeholders
    return a
data4_message_qingli = qingli(data4_message)
data4_answer_qingli = qingli(data4_answer)
data4_all_message_qingli = data4_message_qingli+data4_answer_qingli
def stopwordslist(filepath):
    stopwords = [line.strip() for line in open(filepath, 'r', encoding='GB18030').readlines()]
    return stopwords
stopwords = stopwordslist("stopword.txt")
def preprocess_text_unsupervised(content_lines, sentences):
    for line in content_lines:
        try:
            segs = jieba.cut(line)
            segs = filter(lambda x: len(x) > 1, segs)
            segs = filter(lambda x: x not in stopwords, segs)
            sentences.append(list(segs))
        except Exception:
            print(line)
            continue
# Build the unsupervised training data
sentences = []
preprocess_text_unsupervised(data4_all_message_qingli, sentences)
sentences
import gensim
model = gensim.models.word2vec.Word2Vec(sentences, min_count=1, sg=1, size=100, window=5)
# Note: gensim >= 4.0 renamed size to vector_size, and most_similar lives on model.wv
model.most_similar(['管理'])
# Stop words need to be removed for this to work well
from numpy.linalg import norm
def vector_similarity(s1, s2):
    def sentence_vector(s):
        words = jieba.lcut(s)
        # words = jieba.analyse.extract_tags(s, allowPOS=('n','nr','nr1','nr2','nrj','nrf','ns','nsf','nt','nz','nl','ng','nrfg'))
        ba = []
        for i in range(len(words)):
            if len(words[i]) <= 1:
                ba.append(words[i])
        words = list(set(words) - set(ba))
        words = list(set(words) - set(stopwords))
        # Average the word vectors to get a sentence vector
        v = np.zeros(100)
        for word in words:
            v += model[word]  # in gensim >= 4.0 use model.wv[word]
        v /= len(words)
        return v
    v1, v2 = sentence_vector(s1), sentence_vector(s2)
    return np.dot(v1, v2) / (norm(v1) * norm(v2))
s1 = data4_message_qingli[1]
s2 = data4_answer_qingli[1]
s3 = '您好,由于本人爱人身份证过期,回I6市办了临时身份证,正式身份证要1个月后才能拿到,现在又办不了加急,医院不给办出生证明,必须要正式身份证才给办理,但是小孩刚出生,因黄旦太高住院花了不少钱,急着办落地险,希望能报销一部分,现在医院不给办出生证明无法办理新生儿落地险,等正式身份证拿到,已然过了办理落地险的时间,我很疑惑,临时身份证效力等同正式身份证,信息一样可以手动录入,为什么就是不给办理?'
vector_similarity(s1, s2)
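# The extra example message s3 defined above can be compared the same way (this pairing is an assumption):
vector_similarity(s1, s3)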
6 JS Divergence
import string
from math import log
import numpy as np
# Kullback-Leibler divergence with base-2 logs
KLD = (lambda p, q: sum([_p * log(_p, 2) - _p * log(_q, 2) for (_p, _q) in zip(p, q)]))
def JSD_core(p, q):
    # Drop positions where both probabilities are 0
    p, q = zip(*filter(lambda pair: pair[0] != 0 or pair[1] != 0, zip(p, q)))
    M = [0.5 * (_p + _q) for _p, _q in zip(p, q)]
    # Add a tiny epsilon to avoid log(0)
    p = np.asarray(p) + np.spacing(1)
    q = np.asarray(q) + np.spacing(1)
    M = np.asarray(M) + np.spacing(1)
    # print(p, q, M)
    return 0.5 * KLD(p, M) + 0.5 * KLD(q, M)
# Character frequency distribution counted over lowercase ASCII letters only,
# so this variant is meaningful for English text (see the commented examples below)
reg = lambda x: [x.count(i) for i in string.ascii_lowercase]
# Probability distribution
rate = lambda y: [round(i * 1.0 / sum(reg(y)), 4) for i in reg(y)]
s1 = data4_message[1]
s2 = data4_answer[1]
# s1 = 'ahaebssa'
# s2 = 'awohwsess'
print(JSD_core(rate(s1), rate(s2)))
import numpy as np
import scipy.stats
p = np.asarray([0.65, 0.25, 0.07, 0.03])
q = np.array([0.6, 0.25, 0.1, 0.05])
q2 = np.array([0.1, 0.2, 0.3, 0.4])
def JS_divergence(p, q):
    M = (p + q) / 2
    return 0.5 * scipy.stats.entropy(p, M) + 0.5 * scipy.stats.entropy(q, M)
print(JS_divergence(p, q))   # 0.003093977084273652
print(JS_divergence(p, q2))  # 0.24719159952098618
print(JS_divergence(p, p))   # 0.0
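# Note: scipy.stats.entropy uses the natural log by default, so this JS divergence is symmetric
# and bounded above by ln(2) ≈ 0.693.
print(JS_divergence(q2, p))  # equals JS_divergence(p, q2) by symmetry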
7 simtext (reference: https://www.colabug.com/2020/0419/7278348/amp/)
# simtext similarity:
# simtext computes four text-similarity metrics between two documents:
# Sim_Cosine   cosine similarity
# Sim_Jaccard  Jaccard similarity
# Sim_MinEdit  minimum edit distance
# Sim_Simple   similar to track changes in Microsoft Word
from simtext import similarity
sim = similarity()
for i in range(len(data4_message)):
    text1 = data4_message[i]
    text2 = data4_answer[i]
    res = sim.compute(text1, text2)
    print('Item ' + str(i))
    print(res)
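# Assumption (not verified against the simtext docs): res is a dict keyed by the metric names
# listed above, so a single score could be read like this:
print(res.get('Sim_Cosine'))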