Random Extraction of High-Frequency Words
High-frequency words are words that occur often in a document and are not useless. Extracting them is essentially the TF (Term Frequency) strategy from natural language processing, and there are two main sources of noise:
(1) Punctuation marks, which usually carry no real value.
(2) Stop words: words with no real meaning, such as "是" or "哈".
Both kinds of noise need to be removed before counting.
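As a minimal sketch of the idea, counting term frequencies after stripping punctuation and stop words might look like the snippet below. The sentence and the stop-word set are made up purely for illustration; the real corpus processing follows in the next sections.
# Minimal TF sketch (illustrative; the sentence and stop-word set below are made up)
import jieba
from collections import Counter

demo_text = '今天是个好日子，今天天气很好。'
demo_stop = {'是', '个', '很', '，', '。'}          # hypothetical stop-word/punctuation set
demo_tokens = [w for w in jieba.cut(demo_text) if w not in demo_stop]
print(Counter(demo_tokens).most_common(3))           # e.g. [('今天', 2), ...]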
Randomly extracting a given number of high-frequency words
# Convert the corpus to plain text (this step is optional)
import codecs

def document_scratch(input_file, output_file):
    input_data = codecs.open(input_file, 'r', 'utf-8')
    output_data = codecs.open(output_file, 'w', 'utf-8')
    line = input_data.readline()       # read the file line by line
    while line:
        a = line.split()
        if a:                          # skip blank lines
            output_data.write(a[0])    # keep only the first field of each line
        line = input_data.readline()
    input_data.close()
    output_data.close()

document_scratch('detailResult10.txt', '0010.txt')
# Random extraction code
#======================== Main code ===========================
import codecs
import glob
import random
import jieba
import re
# Read the data
def get_content(path):
    with open(path, 'r', encoding='utf-8', errors='ignore') as f:
        content = ''
        for l in f:
            l = l.strip()
            content += l
        return content
# Define the high-frequency word counting function
def get_TF(words, topK):
    tf_dic = {}
    for w in words:
        tf_dic[w] = tf_dic.get(w, 0) + 1
    return sorted(tf_dic.items(), key=lambda x: x[1], reverse=True)[:topK]
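For reference, get_TF returns a list of (word, count) tuples sorted by count in descending order. A quick standalone check (the toy token list below is made up, not taken from the corpus):
# Illustrative only: toy input, not part of the pipeline above
print(get_TF(['苹果', '香蕉', '苹果', '苹果', '香蕉', '橘子'], topK=2))
# -> [('苹果', 3), ('香蕉', 2)]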
# Load the stop-word list (the actual filtering happens in the main function)
def stop_words(path):
    with open(path, 'r', encoding='utf-8', errors='ignore') as f:
        return [l.strip() for l in f]
# Collect pure-digit strings among the top-K words
# (not used below; the main function filters digits with isdigit() instead)
def find_shuzi(split_shuzi):
    topk_word = []
    for i in split_shuzi:
        topk_word.append(i[0])                        # keep only the words from the (word, count) tuples
    shuzi = re.findall(r'\d+', ''.join(topk_word))    # pick out the digit runs
    return shuzi
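A quick illustration of what find_shuzi returns (the input tuples are made up): it yields the digit strings found among the words, rather than a filtered word list.
# Illustrative only: toy input, not part of the pipeline above
print(find_shuzi([('2023', 10), ('价格', 8), ('100', 5)]))
# -> ['2023', '100']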
# View the high-frequency words
def main_with_stop_words():
    files = glob.glob('0010.txt')                   # read the file(s)
    corpus = [get_content(x) for x in files]        # -> list
    corpus = ''.join(corpus)                        # -> str
    stopwords = stop_words('cn_stopwords.txt')      # load the stop-word list once
    # keep tokens that are not stop words and are longer than one character
    split_words = [x for x in jieba.cut(corpus)
                   if x not in stopwords and len(x) > 1]
    length = len(split_words)
    print('token count:', length)
    fenci_result = '/ '.join(split_words)           # segmented text, for inspection
    # drop pure-digit tokens
    split_words = [word for word in split_words if not word.isdigit()]
    topK_result = get_TF(split_words, topK=length)
    return fenci_result, topK_result

fenci_result, topK_result = main_with_stop_words()
print(fenci_result)
# Randomly select rand_value words and write them to a file
def write_file(rand_value):
    topK_length = len(topK_result)
    high_freq = int(topK_length * 0.7)
    common_freq = int(topK_length * 0.9)
    topK_list = [word[0] for word in topK_result]   # keep only the words
    a = topK_list[0:high_freq]                      # high-frequency subset
    b = topK_list[high_freq:common_freq]            # mid-frequency subset
    c = topK_list[common_freq:]                     # low-frequency subset
    rand_a = int(rand_value * 0.7)                  # how many words to draw from each subset
    rand_b = int(rand_value * 0.2)
    rand_c = int(rand_value * 0.1)
    a_choice = random.sample(a, rand_a)             # random sample from each subset
    b_choice = random.sample(b, rand_b)
    c_choice = random.sample(c, rand_c)
    result_merge = a_choice + b_choice + c_choice   # merge the sampled subsets
    output_data = codecs.open('a10.txt', 'w+', 'utf-8')   # write to file
    for word in result_merge:
        output_data.write(word + '\n')
    output_data.close()

write_file(500)
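One caveat: random.sample raises ValueError if the requested count exceeds the size of a subset (for example, when the corpus is small). A minimal guard, added here only as an illustration and not part of the original code, could cap the draw size:
# Illustrative helper (assumption, not in the original code): cap the sample
# size at the population size so random.sample never raises ValueError
def safe_sample(population, k):
    return random.sample(population, min(k, len(population)))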