Text Mining: preprocessing and analysis of Chinese and English text data

Chinese

Data overview

First, import the required libraries:

import os
import re
import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import scatter_matrix

# scikit-learn: models, clustering, feature extraction and evaluation
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn import decomposition, ensemble
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfTransformer, TfidfVectorizer

# Chinese word segmentation
import jieba

# NLTK: tokenization, stop words, stemming/lemmatization, POS tagging, NER
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer
from nltk.tag import pos_tag, StanfordNERTagger

# Word clouds
from wordcloud import WordCloud

Display the data

data=pd.read_csv('chineseData',encoding='utf-8')
data


Data preprocessing

Remove digits and Latin letters

def find_unchinese(file):
    # Strip ASCII letters and digits, keeping the Chinese text
    pattern = re.compile(r'[a-zA-Z0-9]')
    unchinese = re.sub(pattern, '', file)
    return unchinese
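
A quick sanity check (the sample string below is made up for illustration):

find_unchinese('GDP增长6.1%')
# expected: '增长.%' -- letters and digits are gone; punctuation is handled in a later step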

Load jieba for Chinese word segmentation (it works really well!)

import jieba
s = '今天天气好冷,快出太阳'
jieba.lcut(s)  # returns the sentence as a list of segmented words

Load the stop-word list

def stopwordslist():
    # Read the stop-word file (one word per line)
    with open('1.txt', encoding='UTF-8') as f:
        stopwords = [line.strip() for line in f.readlines()]
    return stopwords

Load the punctuation list

# Both ASCII and full-width (Chinese) punctuation, plus digits
punctuation = r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~“”?,�!【】()、。:;’‘……¥·0123456789–"""
dicts = {i: '' for i in punctuation}
punc_table = str.maketrans(dicts)  # translation table mapping each of these characters to ''
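
As a quick illustration (the sample string is made up), str.translate with this table strips those characters:

'今天,天气好!2020'.translate(punc_table)
# expected: '今天天气好' -- full-width punctuation and digits are mapped to ''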

Putting the preprocessing steps together

This combines removing non-Chinese characters, removing punctuation, and removing stop words, so the final text contains only Chinese words (space-separated) with stop words removed.

wordnet_lemmatizer = WordNetLemmatizer()  # WordNet lemmatizer (not used for Chinese; see the English section)
def Data_regularization(sentence):
    # Segment each document
    print("Starting segmentation")
    sentence = find_unchinese(sentence)        # remove letters and digits
    sentence = sentence.translate(punc_table)  # remove punctuation
    sentence_depart = jieba.lcut(sentence)
    # Load the stop-word list
    stopwords = stopwordslist()
    # Collect the result in outstr
    outstr = ''
    # Remove stop words
    for word in sentence_depart:
        if word not in stopwords:
            if word != '\t':
                if word != '\n':
                    outstr += word
                    outstr += ' '
    return outstr
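
A rough usage sketch (the sample sentence is made up; the exact output depends on your stop-word file and on jieba's segmentation):

Data_regularization('2020年GDP增长6.1%,经济运行总体平稳。')
# e.g. something like '经济 运行 总体 平稳 ' once digits, letters, punctuation
# and stop words have been removed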

Process the text

Select a topic

data2=data.dropna(axis=0, how='any', inplace=False)
data2=data2.reset_index(drop=True)
data2

Some texts have more than one topic, so we use random to pick one of them as the text's single topic.

for i in range(0, data2.shape[0]):
    s = data2.loc[i, 'topic'].split(',')
    if len(s) != 1:
        # pick one of the listed topics at random
        r = random.randint(0, len(s) - 1)
        data2.loc[i, 'topic'] = s[r]
    print('Topic ' + str(i) + ' successfully selected')

Remove empty values

for i in range(0, len(data2)):
    # rows whose body is 0 / '0' are treated as empty and dropped
    if data2.loc[i, 'Body in Chinese'] == 0 or data2.loc[i, 'Body in Chinese'] == '0':
        data2 = data2.drop(i, axis=0)
data2 = data2.reset_index(drop=True)
data2
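
Equivalently, a vectorized alternative (same result, assuming the same 0 / '0' convention for empty bodies):

data2 = data2[~data2['Body in Chinese'].isin([0, '0'])].reset_index(drop=True)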

Keep a copy of the data in a new dataframe so you can go back and redo the processing later.

df = data2.copy()  # .copy() so that modifying df does not change data2
df

Regularize the text

##Regularization
for i in range(0, df.shape[0]):
    print(i)
    df.loc[i,'Body in Chinese']=Data_regularization(df.loc[i,'Body in Chinese'])

Save the processed text as CSV files for later CNN classification training.

outputpath = 'dfchi.csv'
df.to_csv(outputpath, sep=',', index=False, header=False)

# Check which topics are present
np.unique(df['topic'])

# Write one CSV per topic
topics = ['book', 'business', 'culture', 'economy', 'lifestyle',
          'management', 'markets', 'people', 'politics', 'society']
for t in topics:
    df_t = df[df['topic'].isin([t])].reset_index(drop=True)
    df_t.to_csv(t + '.csv', sep=',', index=False, header=True)

English text

Data preprocessing (normalization)

Load the data

data=pd.read_csv('classification_data3.csv',encoding='utf-8')
data

Remove Chinese characters

# Remove Chinese characters (this redefines find_unchinese: here it strips Chinese and keeps the rest)
def find_unchinese(file):
    pattern = re.compile(r'[\u4e00-\u9fa5]')
    unchinese = re.sub(pattern, '', file)
    return unchinese

Extract nouns

Nouns tend to give better classification results.

def Extract_Noun(file):
    outstr = ''
    sentence_depart = word_tokenize(file.strip())
    for word in sentence_depart:
        # tag each token individually and keep only nouns (NN, NNS, NNP, NNPS)
        p = pos_tag(word_tokenize(word))
        if p[0][1] in ('NN', 'NNS', 'NNP', 'NNPS'):
            outstr += word
            outstr += ' '
    return outstr
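
A rough usage sketch (made-up sentence; because each token is tagged in isolation, the exact set of kept words depends on the NLTK tagger):

Extract_Noun('The company reported strong growth in quarterly profits')
# expected to keep roughly the noun-like tokens, e.g. 'company growth profits '
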
Remove stop words

There are two options: a stop-word list downloaded from the web, or NLTK's built-in English stop-word list.

def stopwordslist():
    # Custom stop-word file downloaded from the web (one word per line)
    with open('stopWord.txt', encoding='UTF-8') as f:
        stopwords = [line.strip() for line in f.readlines()]
    return stopwords

# NLTK's built-in list (needs nltk.download('stopwords') on first use)
stop_words = set(stopwords.words('english'))
Stemming and lemmatization

Stemming is more aggressive and often leaves incomplete word stems, so lemmatization is used instead: it reduces, for example, "dogs" to "dog".

wordnet_lemmatizer = WordNetLemmatizer()
# snowball_stemmer = SnowballStemmer('english')  # Snowball stemming algorithm (not used)
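
To see the difference described above, a small comparison sketch (the word list is chosen just for illustration):

snowball_stemmer = SnowballStemmer('english')
for w in ['dogs', 'studies', 'happiness']:
    print(w, '->', snowball_stemmer.stem(w), 'vs', wordnet_lemmatizer.lemmatize(w))
# stemming tends to truncate ('studies' -> 'studi', 'happiness' -> 'happi'),
# while lemmatization returns dictionary forms ('studies' -> 'study', 'dogs' -> 'dog')
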
Combining the steps

Lemmatization is applied for nouns, verbs, adjectives and adverbs.

def Data_regularization(sentence):
    # Tokenize each document
    print("Starting tokenization")

    sentence = Extract_Noun(sentence)
    sentence = find_unchinese(sentence)
    sentence = sentence.translate(punc_table)
    sentence_depart = word_tokenize(sentence.strip())

    # Stop-word list (the custom file could be used instead)
    # stopwords = stopwordslist()
    # Collect the result in outstr
    outstr = ''
    # Remove stop words
    for word in sentence_depart:
        if word not in stop_words:
            if word != '\t':
                if word != '\n':
                    # snowball_stemmer.stem(word) would apply stemming here instead
                    word1 = wordnet_lemmatizer.lemmatize(word, pos='n')
                    word2 = wordnet_lemmatizer.lemmatize(word1, pos='v')
                    word3 = wordnet_lemmatizer.lemmatize(word2, pos='a')
                    word4 = wordnet_lemmatizer.lemmatize(word3, pos='r')
                    # Convert to lower case
                    word = word4.lower()
                    outstr += word
                    outstr += ' '
    return outstr
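
A rough usage sketch (made-up sentence; the output depends on the tagger, the stop-word list and the lemmatizer):

Data_regularization('The markets 市场 reacted to the new policies')
# nouns are kept, Chinese characters and stop words are dropped, the remaining
# words are lemmatized and lower-cased, giving roughly 'market policy '
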
Remove empty and duplicate values

data = data.drop_duplicates(subset='title in English', keep='first')
data = data.drop_duplicates(subset='body in English', keep='first')
data2 = data.dropna(axis=0, how='any', inplace=False)
nandata = data2.isnull().sum().to_frame('Number of NaN')  # number of missing (NaN) values per column
nandata
data2 = data2.reset_index(drop=True)
data2
Randomly select a topic

Some texts have more than one topic, so we use random to pick one of them as the text's single topic.

for i in range(0, data2.shape[0]):
    s = data2.loc[i, 'topic'].split(',')
    if len(s) != 1:
        r = random.randint(0, len(s) - 1)
        data2.loc[i, 'topic'] = s[r]
    print('Topic ' + str(i) + ' successfully selected')
Keep only the columns we need

df = data2[['body in English', 'topic']].copy()  # .copy() avoids SettingWithCopyWarning when assigning via .loc below
df
Run the preprocessing

## Regularization
for i in range(0, df.shape[0]):
    df.loc[i, 'body in English'] = Data_regularization(df.loc[i, 'body in English'])
Export the data

outputpath = 'df.csv'
df.to_csv(outputpath, sep=',', index=False, header=False)

# Write one CSV per topic
topics = ['book', 'business', 'culture', 'economy', 'lifestyle',
          'management', 'markets', 'people', 'politics', 'society']
for t in topics:
    df_t = df[df['topic'].isin([t])].reset_index(drop=True)
    df_t.to_csv(t + '.csv', sep=',', index=False, header=True)

Create and load the corpus

corpus = []
for i in range(0, df.shape[0]):
    corpus.append(df.loc[i, 'body in English'])
    print('Text ' + str(i) + ' successfully added to the corpus')
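
Equivalently, pandas can produce the same list in one call:

corpus = df['body in English'].tolist()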

Text analysis

TF-IDF

1. Use TF-IDF to preprocess the text and turn each document into a vector representation.
2. TfidfVectorizer converts raw text into a TF-IDF feature matrix, laying the groundwork for downstream tasks such as text similarity, topic models (e.g. LSI) and search ranking.

from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)  # sparse document-term count matrix
print(X)
X.toarray()

If the corpus is too large, this step can run out of memory, so we can use the hash trick to reduce the dimensionality.

print(vectorizer.get_feature_names())  # on scikit-learn >= 1.0, use get_feature_names_out()
Hash Trick

from sklearn.feature_extraction.text import HashingVectorizer
vectorizer2 = HashingVectorizer(n_features=11, norm=None)  # n_features=11 is a toy value; real corpora use far more
X2 = vectorizer2.fit_transform(corpus)
X2
X2.toarray()

from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(X2)  # apply TF-IDF weighting to the hashed counts
print(tfidf)
tfidf.toarray()
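
TfidfVectorizer, mentioned above, combines both steps (counting and TF-IDF weighting) in a single estimator; a minimal sketch on the same corpus:

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()  # optionally set max_features to cap the vocabulary size
X_tfidf = tfidf_vectorizer.fit_transform(corpus)  # sparse TF-IDF matrix, one row per document
print(X_tfidf.shape)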