Information Content Security: Chinese Text Preprocessing Lab
Introduction
Chinese text is segmented with the jieba tokenizer, and keywords are then extracted with the TF-IDF method: each word is scored by its frequency within the article (TF) multiplied by the log of its rarity across the corpus (IDF), so words that are frequent in this article but uncommon elsewhere rank highest.
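jieba also ships a ready-made TF-IDF keyword extractor, jieba.analyse.extract_tags, which is useful as a cross-check against the hand-rolled implementation below. A minimal sketch (the sample sentence is made up, and jieba scores words against its bundled IDF table rather than a local corpus):

import jieba.analyse

text = '信息内容安全实验使用jieba分词和TF-IDF方法提取中文文本的关键词'
# Print the top 5 keywords together with their TF-IDF weights
for word, weight in jieba.analyse.extract_tags(text, topK=5, withWeight=True):
    print(word, round(weight, 4))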
Code
# Information content security lab - Chinese text preprocessing (segmentation + TF-IDF)
import jieba
import os
import math
from collections import defaultdict

# Return True if the character is a Chinese character (range U+4E00..U+9FA5)
def is_chinese(char):
    return u'\u4e00' <= char <= u'\u9fa5'
# Normalization: keep only Chinese characters
def standardize():
    with open(material, 'r', encoding='utf-8') as f:
        text = f.read()
    st_text = ''
    for char in text:
        if is_chinese(char):
            st_text += char
    with open(st, 'a', encoding='utf-8') as s:
        s.write("Original text:\n%s" % text)
        s.write("\n-------------------------------------------------------------------")
        s.write("\nNormalized result:\n%s" % st_text)
    return st_text
# Word segmentation with jieba
def divide(st_text):
    di_list = jieba.lcut(st_text, cut_all=False)  # accurate (non-full) mode
    with open(st, 'a', encoding='utf-8') as s:
        s.write("\n-------------------------------------------------------------------")
        s.write("\nSegmentation result:\n")
        s.write('/'.join(di_list))
    return di_list
# Filter the segmented words against a stop-word list
def delete_stopword(di_list):
    de_list = []
    global essay_word
    with open(cn_stopword_list, 'r', encoding='utf-8') as c:
        stopwords = set(c.read().splitlines())  # one stop word per line
    for word in di_list:
        essay_word += 1  # count every word, stop words included
        if word not in stopwords:
            de_list.append(word)
    with open(st, 'a', encoding='utf-8') as s:
        s.write("\n-------------------------------------------------------------")
        s.write("\nFiltered result:\n")
        s.write('/'.join(de_list))
    return de_list
# Count how many times each word occurs in the article
def frequency(de_list):
    count_dict = defaultdict(int)
    for word in de_list:
        count_dict[word] += 1
    return sorted(count_dict.items(), key=lambda x: x[1], reverse=True)
# IDF = log(total number of documents / (number of documents containing the word + 1))
def IDF(word):
    count = 0  # number of documents containing the word
    for name in os.listdir(corpus_path):
        path = os.path.join(corpus_path, name)
        if os.path.isfile(path):
            with open(path, 'r', encoding='utf-8') as file:
                if word in file.read():
                    count += 1
    '''
    # Equivalent traversal with os.walk:
    for root, dirs, files in os.walk(corpus_path):
        for fn in files:
            with open(os.path.join(root, fn), 'r', encoding='utf-8') as f:
                if word in f.read():
                    count += 1
    '''
    return math.log(essay_sum / (count + 1))
# Keyword extraction with TF-IDF
def TF_IDF(count_dict):
    ti_dic = {}
    for word in count_dict:
        # TF: frequency of the word in the article (occurrences / total words)
        tf = word[1] / essay_word
        idf = IDF(word[0])
        # TF-IDF score = TF * IDF
        ti_dic[word[0]] = tf * idf
    # Sort the dictionary by value, highest score first
    feature = sorted(ti_dic.items(), key=lambda x: x[1], reverse=True)
    with open(st, 'a', encoding='utf-8') as s:
        s.write("\n-------------------------------------------------------------")
        s.write("\nTF-IDF ranking:\n")
        s.writelines(str(feature))
        s.write("\n-------------------------------------------------------------")
        s.write("\nKeyword of %s: %s" % (material, feature[0][0]))
if __name__ == '__main__':
    # Corpus file, Chinese stop-word list, and output file
    material = 'Auto_0.txt'
    cn_stopword_list = 'baidu_stopwords.txt'
    st = '处理结果.txt'  # output file for all intermediate results
    corpus_path = '../preprocess/corpus'
    essay_sum = 0   # total number of documents in the corpus
    essay_word = 0  # total number of words in the article
    # Count the documents in the corpus
    for root, dirs, files in os.walk(corpus_path):
        for fn in files:
            essay_sum += 1
    # Processing pipeline
    st_text = standardize()
    di_list = divide(st_text)
    de_list = delete_stopword(di_list)
    count_dict = frequency(de_list)
    TF_IDF(count_dict)
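To see what a single score looks like, here is a quick hand computation with made-up numbers that follows the formulas above: a word occurring 5 times in an article of 200 segmented words (essay_word = 200), and appearing in 2 of 10 corpus documents (essay_sum = 10), scores roughly 0.03.

import math
tf = 5 / 200                  # term frequency in the article
idf = math.log(10 / (2 + 1))  # natural log, as in the IDF() function above
print(tf * idf)               # ≈ 0.0301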