Random Extraction of High-Frequency Words
High-frequency words are words that occur often in a document and are not useless. Extracting them is essentially the TF (Term Frequency) strategy from natural language processing, and there are two main sources of noise:
(1) Punctuation marks, which usually carry no real value.
(2) Stop words: words with no real meaning, such as "是" or "哈".
Both kinds of noise need to be removed before counting.
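As a minimal sketch of the idea, counting term frequencies after stripping punctuation and stop words might look like the snippet below. The sentence and the stop-word set are made up purely for illustration; the real corpus processing follows in the next sections.
# Minimal TF sketch (illustrative; the sentence and stop-word set below are made up)
import jieba
from collections import Counter

demo_text = '今天是个好日子，今天天气很好。'
demo_stop = {'是', '个', '很', '，', '。'}          # hypothetical stop-word/punctuation set
demo_tokens = [w for w in jieba.cut(demo_text) if w not in demo_stop]
print(Counter(demo_tokens).most_common(3))           # e.g. [('今天', 2), ...]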
Randomly extracting a given number of high-frequency words
# Convert the corpus to plain text (this step is optional)
import codecs

def document_scratch(input_file, output_file):
    input_data = codecs.open(input_file, 'r', 'utf-8')
    output_data = codecs.open(output_file, 'w', 'utf-8')
    line = input_data.readline()       # read the file line by line
    while line:
        a = line.split()
        if a:                          # skip blank lines
            output_data.write(a[0])    # keep only the first field of each line
        line = input_data.readline()
    input_data.close()
    output_data.close()

document_scratch('detailResult10.txt', '0010.txt')
# Random extraction code
#======================== Main code ===========================
import codecs
import glob
import random
import jieba
import re
# Read the data
def get_content(path):
    with open(path, 'r', encoding='utf-8', errors='ignore') as f:
        content = ''
        for l in f:
            l = l.strip()
            content += l
        return content
# Define the high-frequency word counting function
def get_TF(words, topK):
    tf_dic = {}
    for w in words:
        tf_dic[w] = tf_dic.get(w, 0) + 1
    return sorted(tf_dic.items(), key=lambda x: x[1], reverse=True)[:topK]
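For reference, get_TF returns a list of (word, count) tuples sorted by count in descending order. A quick standalone check (the toy token list below is made up, not taken from the corpus):
# Illustrative only: toy input, not part of the pipeline above
print(get_TF(['苹果', '香蕉', '苹果', '苹果', '香蕉', '橘子'], topK=2))
# -> [('苹果', 3), ('香蕉', 2)]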
# Load the stop-word list (the actual filtering happens in the main function)
def stop_words(path):
    with open(path, 'r', encoding='utf-8', errors='ignore') as f:
        return [l.strip() for l in f]
# Collect pure-digit strings among the top-K words
# (not used below; the main function filters digits with isdigit() instead)
def find_shuzi(split_shuzi):
    topk_word = []
    for i in split_shuzi:
        topk_word.append(i[0])                        # keep only the words from the (word, count) tuples
    shuzi = re.findall(r'\d+', ''.join(topk_word))    # pick out the digit runs
    return shuzi
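A quick illustration of what find_shuzi returns (the input tuples are made up): it yields the digit strings found among the words, rather than a filtered word list.
# Illustrative only: toy input, not part of the pipeline above
print(find_shuzi([('2023', 10), ('价格', 8), ('100', 5)]))
# -> ['2023', '100']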
# View the high-frequency words
def main_with_stop_words():
    files = glob.glob('0010.txt')                   # read the file(s)
    corpus = [get_content(x) for x in files]        # -> list
    corpus = ''.join(corpus)                        # -> str
    stopwords = stop_words('cn_stopwords.txt')      # load the stop-word list once
    # keep tokens that are not stop words and are longer than one character
    split_words = [x for x in jieba.cut(corpus)
                   if x not in stopwords and len(x) > 1]
    length = len(split_words)
    print('token count:', length)
    fenci_result = '/ '.join(split_words)           # segmented text, for inspection
    # drop pure-digit tokens
    split_words = [word for word in split_words if not word.isdigit()]
    topK_result = get_TF(split_words, topK=length)
    return fenci_result, topK_result

fenci_result, topK_result = main_with_stop_words()
print(fenci_result)
# Randomly select rand_value words and write them to a file
def write_file(rand_value):
    topK_length = len(topK_result)
    high_freq = int(topK_length * 0.7)
    common_freq = int(topK_length * 0.9)
    topK_list = [word[0] for word in topK_result]   # keep only the words
    a = topK_list[0:high_freq]                      # high-frequency subset
    b = topK_list[high_freq:common_freq]            # mid-frequency subset
    c = topK_list[common_freq:]                     # low-frequency subset
    rand_a = int(rand_value * 0.7)                  # how many words to draw from each subset
    rand_b = int(rand_value * 0.2)
    rand_c = int(rand_value * 0.1)
    a_choice = random.sample(a, rand_a)             # random sample from each subset
    b_choice = random.sample(b, rand_b)
    c_choice = random.sample(c, rand_c)
    result_merge = a_choice + b_choice + c_choice   # merge the sampled subsets
    output_data = codecs.open('a10.txt', 'w+', 'utf-8')   # write to file
    for word in result_merge:
        output_data.write(word + '\n')
    output_data.close()

write_file(500)
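One caveat: random.sample raises ValueError if the requested count exceeds the size of a subset (for example, when the corpus is small). A minimal guard, added here only as an illustration and not part of the original code, could cap the draw size:
# Illustrative helper (assumption, not in the original code): cap the sample
# size at the population size so random.sample never raises ValueError
def safe_sample(population, k):
    return random.sample(population, min(k, len(population)))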