1. TF-IDF
Code example
# Imports needed for the TF-IDF computation
import jieba
from gensim import corpora
from gensim import models
# Use demo_txt as the working example
demo_txt = [
"Apple iPhone 8 Plus (A1864) 64GB 深空灰色 移动联通电信4G手机",
"荣耀 畅玩7X 4GB+32GB 全网通4G全面屏手机 标配版 铂光金",
"Apple iPhone 8 (A1863) 64GB 深空灰色 移动联通电信4G手机",
"Apple iPhone 7 Plus (A1661) 128G 黑色 移动联通电信4G手机",
"小米 红米5 Plus 全面屏拍照手机 全网通版 3GB+32GB 金色 移动联通电信4G手机 双卡双待",
"Apple iPhone 7 (A1660) 128G 黑色 移动联通电信4G手机",
"Apple iPhone X (A1865) 64GB 深空灰色 移动联通电信4G手机",
"小米 红米Note5A 移动4G+版全网通 4GB+64GB 铂银灰 移动联通电信4G手机 双卡双待 拍照手机",
"荣耀 V10全网通 标配版 4GB+64GB 幻夜黑 移动联通电信4G全面屏手机 双卡双待",
"荣耀 畅玩6 2GB+16GB 金色 全网通4G手机 双卡双待",
"Apple iPhone 6s Plus (A1699) 128G 玫瑰金色 移动联通电信4G手机",
"Apple iPhone 6 32GB 金色 移动联通电信4G手机",
"小米Note3 美颜双摄拍照手机 6GB+64GB 黑色 全网通4G手机 双卡双待",
"小米5X 美颜双摄拍照手机 4GB+64GB 金色 全网通4G手机 双卡双待",
"魅族 魅蓝 Note6 3GB+32GB 全网通公开版 皓月银 移动联通电信4G手机 双卡双待",
"荣耀畅玩7C 全面屏手机 全网通标配版 3GB+32GB 铂光金 移动联通电信4G手机 双卡双待",
"Apple iPhone 5s (A1530) 16GB 金色 移动联通4G手机",
"荣耀10 GT游戏加速 AIS手持夜景 6GB+64GB 幻影蓝全网通 移动联通电信4G 双卡双待",
]
# Load the user-defined segmentation dictionary
jieba.load_userdict('MobilePhone_Userdict.txt')
# Read the stop-word file into a list
filepath = r'stopwords.txt'
stopwords = [line.strip() for line in open(filepath, 'r', encoding='utf-8').readlines()]
# Tokenize each title
word_list = []
for corpu in demo_txt:
    seg_list = jieba.cut(corpu)  # segment the title into words
    seg_list = [i for i in seg_list if i not in stopwords and i != ' ']  # drop stop words and spaces
    word_list.append(seg_list)
print(word_list)
# Build a dictionary of every word that appears
dictionary = corpora.Dictionary(word_list)
# Print the dictionary's token-to-id mapping
print(dictionary.token2id)
# Encode each document as (token_id, count) pairs (bag of words)
new_corpus = [dictionary.doc2bow(word) for word in word_list]
print(new_corpus)
# Train the TF-IDF model on the bag-of-words corpus and save it
tfidf = models.TfidfModel(new_corpus)
tfidf.save("my_model.tfidf")
# Load it back
tfidf = models.TfidfModel.load("my_model.tfidf")
# Compute the TF-IDF weights of each document
tfidf_vec = []
for i in new_corpus:
    string_tfidf = tfidf[i]
    tfidf_vec.append(string_tfidf)
print(tfidf_vec)
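Each entry of tfidf_vec is a list of (token_id, weight) pairs. To read the weights back as words, the ids can be mapped through the dictionary built above; a minimal sketch:

# Map token ids back to readable tokens (first two documents only, for brevity)
for doc in tfidf_vec[:2]:
    print([(dictionary[token_id], round(weight, 3)) for token_id, weight in doc])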
2. Cosine similarity
Let vectors $A = (A_1, A_2, \ldots, A_n)$ and $B = (B_1, B_2, \ldots, B_n)$; generalized to $n$ dimensions:
$$\cos\theta = \frac{A \cdot B}{\|A\|\,\|B\|} = \frac{\sum_{i=1}^{n} A_i B_i}{\sqrt{\sum_{i=1}^{n} A_i^2}\,\sqrt{\sum_{i=1}^{n} B_i^2}}$$
Code example
import numpy as np
# Cosine similarity computed with numpy (for the formula, see e.g. the Baidu Baike entry on cosine similarity)
# Note: despite its name, this function returns the cosine *similarity*, not a distance
def np_cos_Distance(vector1, vector2):
    vec1 = np.array(vector1)
    vec2 = np.array(vector2)
    return float(np.sum(vec1 * vec2)) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
vector1 = [0.1,0.5,0.36,0.44,0.21,0.96,0.35,0.4,0.16,0.22]
vector2 = [0.22,0.59,0.44,0.37,0.55,0.95,0.44,0.64,0.18,0.22]
vector3 = [0.72,0.29,0.44,0.88,0.55,0.49,0.74,0.36,0.8,0.62]
print(len(vector1))
print(len(vector2))
print(len(vector3))
cos_dis = np_cos_Distance(vector1, vector2)
print(cos_dis)
cos_dis = np_cos_Distance(vector1, vector3)
print(cos_dis)
Output:
10
10
10
0.964299424321
0.747731556816
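As a cross-check, SciPy's built-in helper should agree; note that scipy.spatial.distance.cosine returns the cosine distance, i.e. one minus the similarity:

from scipy.spatial.distance import cosine
# cosine() returns 1 - similarity, so invert it for comparison
print(1 - cosine(vector1, vector2))  # ~0.9643, matches np_cos_Distance
print(1 - cosine(vector1, vector3))  # ~0.7477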
3. TF-IDF task
Code example
import jieba
import pickle
import numpy as np
# Load the precomputed vocabulary index and IDF dictionary
with open('data.pk', 'rb') as f:
    all_dick, idf_dict = pickle.load(f)
# Read a file line by line
def read_file(file_path):
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        fina_outlist = [line.strip() for line in f.readlines()]
    return fina_outlist
# Load the user-defined segmentation dictionary
jieba.load_userdict("userdict.txt")
# Read the stop words into the stopwords list
filepath = r'stopwords.txt'
stopwords = [line.strip() for line in open(filepath, 'r', encoding='utf-8').readlines()]
# Tokenize in search-engine mode, dropping stop words
def split_words(words):
    word_list = jieba.cut_for_search(words.lower().strip(), HMM=True)
    word_list = [i for i in word_list if i not in stopwords and i != ' ']
    return word_list
# Count word frequencies and return them as a dict
def make_word_freq(word_list):
    freword = {}
    for i in word_list:
        if str(i) in freword:
            freword[str(i)] += 1
        else:
            freword[str(i)] = 1
    return freword
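# Note (not in the original post): make_word_freq is equivalent to the
# standard library's collections.Counter, which could replace it directly:
#   from collections import Counter
#   word_freq = Counter(str(i) for i in word_list)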
# Compute the TF-IDF vector of one document
def make_tfidf(word_list, all_dick, idf_dict):
    length = len(word_list)  # total token count, used as the TF denominator
    word_list = [word for word in word_list if word in all_dick]
    word_freq = make_word_freq(word_list)
    w_dic = np.zeros(len(all_dick))
    for word in word_list:
        ind = all_dick[word]
        idf = idf_dict[word]
        w_dic[ind] = float(word_freq[word] / length) * float(idf)
    return w_dic
# Cosine similarity based on numpy
def Cos_Distance(vector1, vector2):
    vec1 = np.array(vector1)
    vec2 = np.array(vector2)
    return float(np.sum(vec1 * vec2)) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
# Similarity between one query vector and every vector in a list
def similarity_words(vec, vecs_list):
    Similarity_list = []
    for vec_i in vecs_list:
        Similarity = Cos_Distance(vec, vec_i)
        Similarity_list.append(Similarity)
    return Similarity_list
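# The original post calls read_file2matrix() below but never defines it.
# A minimal sketch, assuming the tf-idf file stores one space-separated
# vector per line (the format is an assumption, not from the source):
def read_file2matrix(file_path):
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        return [np.array([float(x) for x in line.split()]) for line in f if line.strip()]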
# Main pipeline: vectorize the query and rank the titles by similarity
def main(words, file_path, readed_path):
    words_list = read_file(file_path)
    vecs_list = read_file2matrix(readed_path)
    word_list = split_words(words)
    vec = make_tfidf(word_list, all_dick, idf_dict)
    similarity_lists = similarity_words(vec, vecs_list)
    sorted_res = sorted(enumerate(similarity_lists), key=lambda x: x[1])
    outputs = [[words_list[i[0]], i[1]] for i in sorted_res[-10:]]  # keep the 10 best matches
    return outputs
# Test: pick one query (alternatives left commented out)
# words = '小米8 全面屏游戏智能手机 6GB+128GB 黑色 全网通4G 双卡双待 拍照手机'
# words = '荣耀 畅玩7X 4GB+32GB 全网通4G全面屏手机 标配版 铂光金'
words = 'Apple iPhone 8 Plus (A1864) 64GB 深空灰色 移动联通电信4G手机'
# words = '小米8'
# words = "黑色手机"
# words = 'Apple iPhone 8'
# words = '索尼 sony'
file_path = r'MobilePhoneTitle.txt'
readed_path = r"MobilePhoneTitle_tfidf.txt"
outputs = main(words, file_path, readed_path)
# print(outputs)
for i in outputs[::-1]:  # print from most to least similar
    print(i[0] + ' ' + str(i[1]))
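The script assumes MobilePhoneTitle_tfidf.txt already holds one precomputed TF-IDF vector per title. The post does not show how that file is built; a minimal sketch using the helpers above, with a space-separated format chosen to match the read_file2matrix assumption:

# Hypothetical precompute step: vectorize every title once and persist the matrix
def build_tfidf_file(title_path, out_path):
    titles = read_file(title_path)
    with open(out_path, 'w', encoding='utf-8-sig') as f:
        for title in titles:
            vec = make_tfidf(split_words(title), all_dick, idf_dict)
            f.write(' '.join(str(x) for x in vec) + '\n')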
4. word2vec
word2vec comes in two modes: CBOW (Continuous Bag of Words) and Skip-Gram. CBOW predicts the target word from its surrounding context, while Skip-Gram does the opposite, predicting the surrounding context from the target word. CBOW suits smaller datasets, whereas Skip-Gram tends to perform better on large corpora.
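The two modes map directly onto the sg flag in gensim. A minimal sketch, assuming gensim 4.x (where the dimension parameter is named vector_size) and reusing the tokenized word_list from section 1:

from gensim.models import Word2Vec

# sg=0 selects CBOW, sg=1 selects Skip-Gram
cbow_model = Word2Vec(word_list, vector_size=100, window=5, min_count=1, sg=0)
sg_model = Word2Vec(word_list, vector_size=100, window=5, min_count=1, sg=1)

# Look up a token's vector and its nearest neighbors
# (assumes the token '手机' survives segmentation, which holds for these titles)
print(cbow_model.wv['手机'].shape)             # (100,)
print(sg_model.wv.most_similar('手机', topn=5))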
The related materials have been uploaded to Baidu Cloud: https://pan.baidu.com/s/1zw-yxVFF-UrolpW7echk-w
Extraction code: s4by