1. 模型预测
1.1 jieba分词预处理
去除文本中的特殊符号、停用词等,对文本进行分词。
lda_jieba_preprocess.py
# -*- coding: UTF-8 -*-
import re
import jieba
import jieba.analyse
def cut_seg(input_file_name, output_file_name, stop_word_list):
    """Segment each line of a TSV file with jieba and write filtered tokens.

    Each input line is expected to be "<id>\t<text>"; lines with a different
    field count are skipped.  Emoticon markup such as "[龇牙]" / "[握手]" is
    stripped before segmentation, and stop words are dropped from the output.

    Args:
        input_file_name: path to the tab-separated input file.
        output_file_name: output path; one space-joined token line is written
            per input line that yields at least one token.
        stop_word_list: iterable of stop words to exclude.
    """
    stop_words = set(stop_word_list)
    # Raw string so \[ reliably escapes the literal bracket (the original
    # non-raw '\[' is an invalid escape and warns on modern Python).
    # Compiled once here instead of re.sub re-parsing it for every line.
    emoticon_re = re.compile(r'\[[0-9a-zA-Z\u4e00-\u9fa5.,,。?“”]+\]')
    with open(input_file_name, 'r') as fin, open(output_file_name, 'w') as fout:
        for line in fin:
            item_list = line.strip().split("\t")
            if len(item_list) != 2:
                continue
            # Drop emoticon markers before segmenting.
            text = emoticon_re.sub('', item_list[1])
            temp_list = jieba.cut(text, cut_all=True)
            content = ' '.join(
                item for item in temp_list
                if len(item) >= 1 and item not in stop_words
            )
            if len(content) > 0:
                fout.write(content + "\n")
if __name__ == "__main__":
    # BUG FIX: the original referenced sys.argv without importing sys,
    # which raises NameError; import it here at the script entry point.
    import sys

    # Usage: python lda_jieba_preprocess.py <input> <output> <stopwords>
    input_file = sys.argv[1]
    output_file = sys.argv[2]
    stop_words_file = sys.argv[3]
    # 1. Load stop words; a list comprehension is much faster here.
    with open(stop_words_file, encoding='UTF-8') as f:
        stop_word_list = [line.strip() for line in f]
    # 2. Segment the corpus with jieba.
    jieba.initialize()
    cut_seg(input_file, output_file, stop_word_list)
并行训练参考: 人脸识别Demo_MusicDancing的博客-CSDN博客
运行
python lda_jieba_preprocess.py chat.txt result.txt stopwords.txt
chat.txt
result.txt
1.2 LDA 文本格式数据
1.3 LDA libsvm格式数据
# converting UCI format docword to libsvm format data
def text2libsvm(input_file, output_file):
    """Convert UCI "docword" triples (docid wordid count) to libsvm lines.

    Word ids are shifted to zero-based.  Consecutive rows sharing a doc id
    are merged onto one output line of the form
    "<docid>\t<wid>:<cnt> <wid>:<cnt> ...".  Rows that do not contain
    exactly three fields (e.g. the UCI header lines) are ignored.
    """
    pending = ""
    prev_doc = 0
    with open(input_file, 'r') as src, open(output_file, 'w') as dst:
        for row in src:
            fields = row.strip().split(' ')
            if len(fields) != 3:
                continue
            doc, word, cnt = (int(v) for v in fields)
            word -= 1  # libsvm feature ids are zero-based
            if doc != prev_doc:
                # Flush the finished document before starting a new one.
                if pending.strip():
                    dst.write(pending.strip() + '\n')
                pending = str(doc) + '\t'
            pending += str(word) + ':' + str(cnt) + ' '
            prev_doc = doc
        # Flush the final document.
        if pending.strip():
            dst.write(pending.strip() + '\n')
if __name__ == "__main__":
    # Convert the NYTimes UCI docword dump into libsvm format.
    source_path = 'docword.nytimes.txt'
    target_path = 'nytimes.libsvm'
    text2libsvm(source_path, target_path)
docword.nytimes.txt
字段依次为 docid wordid cnt
nytimes.libsvm
1.4 转为二进制文件
./bin/dump_binary nytimes.libsvm nytimes.word_id.dict . 0
# 0 是传入参数,不需改动
输入文件是上一步产生的nytimes.libsvm 和训练模型时产生的
nytimes.word_id.dict
共计134715条。
输出文件则是在当前目录(.)内产生3个文件,其中block.0、vocab.0 是二进制文件。
vocab.0.txt
共计 91966 行。
1.5 LDA模型预测
bin/infer -num_vocabs 134715 \
-num_topics 60 \
-num_iterations 120 \
-alpha 0.83 \
-beta 0.1 \
-mh_steps 2 \
-num_local_workers 1 \
-num_blocks 1 \
-max_num_document 45070400 \
-input_dir . \
-data_capacity 1000
输出三个文件:
block.0 二进制文件与上一步输出的相同
doc_topic.0 (647975 lines)
LightLDA_infer.1985.log
2. 分词结果统计
# -*- coding: utf-8 -*-
def get_result_dict(file_name):
    """Count word occurrences overall and within positive-label samples.

    The input file is tab-separated: <uid>\t<label>\t<space-joined words>.
    Lines with fewer than three fields are skipped, matching the tolerant
    parsing style of the other preprocessing scripts (the original raised
    IndexError on such lines).

    Args:
        file_name: path to the labelled, pre-segmented sample file.

    Returns:
        (total_dict, pos_dict): word -> count over all samples, and
        word -> count over samples whose label field is '1'.
    """
    total_dict = {}
    pos_dict = {}
    with open(file_name, 'r') as f:
        for line in f:
            arr = line.strip().split("\t")
            if len(arr) < 3:
                continue
            is_pos = (arr[1] == '1')
            for word in arr[2].split(" "):
                # dict.get with a default replaces the manual if/else pair.
                total_dict[word] = total_dict.get(word, 0) + 1
                if is_pos:
                    pos_dict[word] = pos_dict.get(word, 0) + 1
    return total_dict, pos_dict
if __name__ == "__main__":
    file_name = "/data/liupg/chat_v1/data/uid_label_data.txt"
    total_dict, pos_dict = get_result_dict(file_name)
    # Header columns: total count, positive count, positive ratio, word.
    print("样本数\t正样本数\t正样本占比\t单词")
    # Per-word stats, filtered to frequent words with a non-trivial
    # positive rate.
    for word, pos_num in pos_dict.items():
        total_num = total_dict[word]
        pos_rate = pos_num * 1.0 / total_num
        if total_num > 20 and pos_rate >= 0.05:
            # BUG FIX: the original concatenated ints with str via "+",
            # which raises TypeError; use %-formatting instead.
            print("%d\t%d\t%f\t%s" % (total_num, pos_num, pos_rate, word))
    # Vocabulary-level summary.
    total_cnt, pos_cnt = len(total_dict), len(pos_dict)
    pos_rate = pos_cnt * 1.0 / total_cnt
    print("%d\t%d\t%f" % (total_cnt, pos_cnt, pos_rate))