import pandas as pd
import numpy as np
import jieba
# Load the data
def getdata():
    path = "M:/python练习/data/Python/data/val.txt"
    # tab-separated file with four columns: category, theme, URL, content
    df_news = pd.read_table(path, names=['category', 'theme', 'URL', 'content'], encoding='utf-8')
    df_news = df_news.dropna()
    contents = df_news.content.values.tolist()
    return df_news, contents
# Segment the text with the jieba tokenizer
def fenci(data):
    content = data.content.values.tolist()  # pull the column out as a plain list
    content_s = []
    for line in content:
        current_segment = jieba.lcut(line)
        # keep lines that segment into more than one token and are not a bare newline
        if len(current_segment) > 1 and current_segment != ['\r\n']:
            content_s.append(current_segment)
    df_content = pd.DataFrame({'content_s': content_s})
    return df_content, content_s
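# For reference, jieba.lcut returns the tokens as a plain Python list, e.g.
# jieba.lcut("我爱北京天安门") -> ['我', '爱', '北京', '天安门']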
# Clean the data: filter out stopwords
def clearData(contents_clean1):
    # load the stopword list
    path = 'M:/python练习/data/Python/data/stopwords.txt'
    stopwords = pd.read_csv(path, index_col=False, sep='\t', quoting=3, names=['stopword'], encoding='utf-8')
    # put the stopwords into a set: membership tests are O(1), and testing
    # `word in stopwords` on the DataFrame would only check its column names
    stopword_set = set(stopwords.stopword.values)
    contents_clean = []
    all_words = []
    for line in contents_clean1:
        line_clean = []
        for word in line:
            if word in stopword_set:
                continue
            line_clean.append(str(word))
            all_words.append(str(word))
        contents_clean.append(line_clean)
    df_content = pd.DataFrame({'contents_clean': contents_clean})
    df_all_words = pd.DataFrame({'all_words': all_words})
    return df_content, df_all_words, contents_clean
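# e.g. with '的' in the stopword list (hypothetical sample):
#   ['今天', '的', '天气'] -> ['今天', '天气']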
# Count word frequencies
def count_words(df_all_words):
    # groupby + size replaces the dict form of .agg(), which recent pandas removed
    words_count = df_all_words.groupby('all_words').size().reset_index(name='count')
    words_count = words_count.sort_values(by='count', ascending=False)
    return words_count
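# A quick check on hypothetical tokens:
#   count_words(pd.DataFrame({'all_words': ['中国', '发展', '中国']}))
# returns a frame sorted by frequency:
#   all_words  count
#   中国        2
#   发展        1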
# Build a word cloud
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import matplotlib
def wordCloud(words_count):
    matplotlib.rcParams['figure.figsize'] = (10.0, 5.0)
    path = 'M:/python练习/data/Python/data/simhei.ttf'  # a CJK font is required to render Chinese glyphs
    wordcloud = WordCloud(font_path=path, background_color='white', max_font_size=80)
    # take the 100 most frequent words as a {word: count} dict
    word_frequence = {x[0]: x[1] for x in words_count.head(100).values}
    wordcloud = wordcloud.fit_words(word_frequence)
    plt.imshow(wordcloud)
    plt.show()
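# WordCloud can also save the rendered image directly; the filename here is
# just an example:
# wordcloud.to_file('wordcloud.png')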
import jieba.analyse
# Keyword extraction
def getkey(df_news, content_S):
    index = 2400  # inspect one sample article
    print(df_news['content'][index])
    content_S_str = "".join(content_S[index])
    # extract_tags ranks keywords by TF-IDF
    print(" ".join(jieba.analyse.extract_tags(content_S_str, topK=5, withWeight=False)))
# LDA topic model
# Expected input: a list of lists, i.e. the whole corpus already segmented
from gensim import corpora
import gensim
def LDA(contents_clean):
    # build the word-to-id mapping (essentially the bag-of-words vocabulary)
    dictionary = corpora.Dictionary(contents_clean)
    corpus = [dictionary.doc2bow(sentence) for sentence in contents_clean]
    lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=20)
    # show the top 5 words of topic 1
    print(lda.print_topic(1, topn=5))
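# To inspect every topic instead of just one, gensim's print_topics lists them all:
# for topic_id, words in lda.print_topics(num_topics=20, num_words=5):
#     print(topic_id, words)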
def getTrain(contents_clean, df_news):
    # assumes fenci kept every row, so the labels still line up with the cleaned texts
    df_train = pd.DataFrame({'contents_clean': contents_clean, 'label': df_news['category'].values.tolist()})
    return df_train
# Map category names to integer labels
def Label(df_train):
    label_mapping = {name: i for i, name in enumerate(df_train.label.unique(), start=1)}
    df_train['label'] = df_train['label'].map(label_mapping)
    return df_train
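# An equivalent one-liner with scikit-learn, if preferred (labels start at 0):
# from sklearn.preprocessing import LabelEncoder
# df_train['label'] = LabelEncoder().fit_transform(df_train['label'])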
# Split the data into training and test sets
from sklearn.model_selection import train_test_split
def split_data(data):
    x_train, x_test, y_train, y_test = train_test_split(
        data['contents_clean'].values, data['label'].values, random_state=1)
    return x_train, x_test, y_train, y_test
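# If the categories are imbalanced, a stratified split keeps the class ratios
# identical in both halves; a sketch:
# train_test_split(data['contents_clean'].values, data['label'].values,
#                  random_state=1, stratify=data['label'].values)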
# Join each token list back into one space-separated string, the input format
# CountVectorizer / TfidfVectorizer expect
def strtolist(x_train):
    words = []
    for line_index in range(len(x_train)):
        try:
            # ' '.join keeps the token boundaries; ''.join would glue the words
            # together and defeat the vectorizer's word-level tokenization
            words.append(' '.join(x_train[line_index]))
        except TypeError:
            print(line_index)
    return words
# Build feature vectors and train a Naive Bayes classifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
def createVectorizer_Bayes(words, y_word):
    # vec = CountVectorizer()  # plain counts also work; TF-IDF usually scores a bit higher
    vec = TfidfVectorizer(analyzer='word', max_features=4000, lowercase=False)
    vec.fit(words)
    classifier = MultinomialNB()
    classifier.fit(vec.transform(words), y_word)
    print(classifier.score(vec.transform(words), y_word))  # accuracy on the training data
    # return the fitted objects so the test set can be scored without refitting
    return vec, classifier
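# The same vectorize-then-classify flow can be written as a scikit-learn
# Pipeline; a minimal sketch:
# from sklearn.pipeline import make_pipeline
# model = make_pipeline(TfidfVectorizer(analyzer='word', max_features=4000, lowercase=False),
#                       MultinomialNB())
# model.fit(train_words, y_train)
# print(model.score(test_words, y_test))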
def main():
    data, contents = getdata()
    df_content, content_s = fenci(data)
    # clean the *segmented* texts; passing the raw strings would iterate characters
    df_content, df_all_words, contents_clean = clearData(content_s)
    # print(data.shape)
    # print(len(contents_clean))
    # words_count = count_words(df_all_words)
    # wordCloud(words_count)
    # getkey(data, content_s)
    # LDA(contents_clean)
    data_train = getTrain(contents_clean, data)
    df_train = Label(data_train)
    x_train, x_test, y_train, y_test = split_data(df_train)
    train_words = strtolist(x_train)
    test_words = strtolist(x_test)
    # fit on the training set only, then score the held-out test set with the
    # same fitted vectorizer and classifier
    vec, classifier = createVectorizer_Bayes(train_words, y_train)
    print(classifier.score(vec.transform(test_words), y_test))
if __name__ == '__main__':
    main()
These are preliminary study notes recording my understanding; corrections are welcome if anything is wrong. Thank you!