(二)文本挖掘——TF-IDF

# @Time : 2021/3/9 15:35
# @Author : chao
# -*- coding: utf-8 -*-

import pandas as pd
import numpy as np
import jieba
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer


def seg_sentence(sentence, stopwords):
    """Tokenize a sentence with jieba and remove stopwords.

    Requires the jieba package: pip install jieba

    Args:
        sentence: raw input sentence (str).
        stopwords: stopwords to drop (list or set of str).

    Returns:
        str: kept tokens joined by single spaces, with a trailing
        space after the last token (kept for backward compatibility
        with the original output format); '' when nothing survives.
    """
    # Hoist membership testing into a set: O(1) per token instead of
    # O(len(stopwords)) for a list.
    stopset = set(stopwords)
    tokens = [word for word in jieba.cut(sentence.strip())
              if word not in stopset and word != '\t']
    # ' '.join avoids the quadratic += string building of the original;
    # the original appended ' ' after every kept token, so reproduce
    # that trailing space when the result is non-empty.
    return ' '.join(tokens) + ' ' if tokens else ''


# Path to the stopword dictionary (one word per line, UTF-8).
swPath = r'C:\Users\\词典\stopword停用词.txt'
# Context manager guarantees the handle is closed even if reading raises
# (the original used a bare open()/close() pair, which leaks on error).
with open(swPath, 'r', encoding='utf-8') as f:
    lines = f.readlines()
# Strip line endings; the filter on '' is kept from the original even
# though readlines() lines always carry a '\n' and are never empty.
stopwords = [line.strip() for line in lines if line != '']
# A single space must also be treated as a stopword.
stopwords.append(' ')
# Deduplicate; note the resulting order is arbitrary, which is fine
# because stopwords is only ever used for membership tests.
stopwords = list(set(stopwords))

# Read the preprocessed corpus, one review per line. NOTE(review):
# 'ANSI' is a Windows-only codepage alias; on other platforms replace
# it with the concrete codec (e.g. 'gbk') — confirm against the data.
with open(r'C:\Users\数据\预处理后数据\zong_data.txt', encoding='ANSI') as f:
    text = f.readlines()
seg_text = [seg_sentence(review, stopwords) for review in text]

# Count 1- to 3-gram term frequencies, keeping the 50 most frequent features.
ngram_vectorizer = CountVectorizer(ngram_range=(1, 3), max_features=50)
count = ngram_vectorizer.fit_transform(seg_text).toarray()

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in
# 1.2; prefer get_feature_names_out() when available, falling back to
# the old API for older installs.
if hasattr(ngram_vectorizer, 'get_feature_names_out'):
    featureName = list(ngram_vectorizer.get_feature_names_out())
else:
    featureName = ngram_vectorizer.get_feature_names()
# Replace the spaces inside multi-word n-grams with underscores so each
# feature name is a single token in the CSV header row.
featureName = [name.replace(' ', '_') for name in featureName]
seg_text = []  # release the segmented corpus; no longer needed below

# Convert the raw term-frequency matrix into TF-IDF weights.
tfidf_transformer = TfidfTransformer()
word_vec = tfidf_transformer.fit_transform(count).toarray()

# Stack the feature names as a header row above the TF-IDF matrix.
# np.vstack broadcasts the plain 1-D name array to a row itself, so the
# officially discouraged np.matrix class is unnecessary.
final = np.vstack((np.asarray(featureName), word_vec))
df = pd.DataFrame(final)

# Save feature names and TF-IDF values to CSV (no extra index/header).
df.to_csv(r'C:\Users\代码\特征提取\特征提取数据\zong_tfidf.csv',
          index=False, header=False, encoding='gbk')

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值