News Classifier

import pandas as pd
import numpy as np
import jieba

# Read the data
def getdata():
    path = "M:/python练习/data/Python/data/val.txt"
    df_news = pd.read_table(path, names=['category', 'theme', 'URL', 'content'], encoding='utf-8')
    df_news = df_news.dropna()
    contents = df_news.content.values.tolist()
    return df_news, contents
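
A quick sanity check of what getdata() returns (this assumes val.txt exists at the path above and is tab-separated with the four columns named there):

df_news, contents = getdata()
print(df_news.shape)       # (number of articles, 4)
print(df_news.head())      # category, theme, URL, content
print(contents[0][:50])    # first 50 characters of the first article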

# Tokenize with the jieba segmenter
def fenci(data):
    content = data.content.values.tolist()   # turn the column into a plain list
    content_s = []
    for line in content:
        current_segment = jieba.lcut(line)
        # keep segmentations with more than one token and skip rows that are only a line break
        if len(current_segment) > 1 and current_segment != ['\r\n']:
            content_s.append(current_segment)
    df_content = pd.DataFrame({'content_s': content_s})
    return df_content, content_s
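
A minimal sketch of what jieba.lcut returns for a single sentence (the exact segmentation depends on jieba's default dictionary):

import jieba

print(jieba.lcut("我爱北京天安门"))
# typically: ['我', '爱', '北京', '天安门']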


# Clean the data: filter out unwanted (stop) words
def clearData(contents_clean1):
    # load the stopword list
    path = 'M:/python练习/data/Python/data/stopwords.txt'
    stopwords = pd.read_csv(path, index_col=False, sep='\t', quoting=3, names=['stopword'], encoding='utf-8')
    # membership tests on a DataFrame check column names, so build a set of the stopwords for the lookup
    stopword_set = set(stopwords['stopword'].values)

    contents_clean = []
    all_words = []
    for line in contents_clean1:
        line_clean = []
        for word in line:
            if word in stopword_set:
                continue
            line_clean.append(str(word))
            all_words.append(str(word))
        contents_clean.append(line_clean)
    df_content = pd.DataFrame({'contents_clean': contents_clean})
    df_all_words = pd.DataFrame({'all_words': all_words})
    return df_content, df_all_words, contents_clean
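
The core of the cleaning step is a set-membership filter; a tiny illustration with a made-up stopword set:

stopword_set = {'的', '了', '是'}
tokens = ['今天', '的', '天气', '是', '晴朗', '的']
print([t for t in tokens if t not in stopword_set])   # ['今天', '天气', '晴朗']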


# Count word frequencies
def groupby(all_words):
    words_count = all_words.groupby('all_words').size().reset_index(name='count')
    words_count = words_count.sort_values(by='count', ascending=False)
    return words_count
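
The groupby/size/sort pattern used above, shown on a toy DataFrame:

import pandas as pd

toy = pd.DataFrame({'all_words': ['中国', '发展', '中国', '经济', '中国']})
print(toy.groupby('all_words').size().reset_index(name='count')
         .sort_values(by='count', ascending=False))
# '中国' comes first with a count of 3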

# Build a word cloud
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import matplotlib
def wordCloud(words_count):
    matplotlib.rcParams['figure.figsize'] = (10.0, 5.0)

    path = 'M:/python练习/data/Python/data/simhei.ttf'
    wordcloud = WordCloud(font_path=path, background_color='white', max_font_size=80)
    word_frequence = {x[0]: x[1] for x in words_count.head(100).values}
    wordcloud = wordcloud.fit_words(word_frequence)
    plt.imshow(wordcloud)
    plt.show()

import jieba.analyse
# Keyword extraction
def getkey(df_news, content_S):
    index = 2400
    print(df_news['content'][index])
    content_S_str = "".join(content_S[index])
    print("  ".join(jieba.analyse.extract_tags(content_S_str, topK=5, withWeight=False)))


# LDA topic model
# Expected input: a list of lists, i.e. the whole corpus already tokenized
from gensim import corpora, models, similarities
import gensim
def LDA(contents_clean):
    # build the token-to-id mapping, which acts as the bag-of-words vocabulary
    dictionary = corpora.Dictionary(contents_clean)
    corpus = [dictionary.doc2bow(sentence) for sentence in contents_clean]
    lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=20)
    print(lda.print_topic(1, topn=5))
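
A self-contained LDA sketch on a two-document toy corpus (topics learned from data this small are not meaningful; it only illustrates the API flow):

from gensim import corpora
import gensim

docs = [['苹果', '发布', '新款', '手机'],
        ['球队', '赢得', '比赛', '冠军']]
dictionary = corpora.Dictionary(docs)
corpus = [dictionary.doc2bow(doc) for doc in docs]
lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=2, passes=10)
for topic_id, topic in lda.print_topics(num_topics=2, num_words=3):
    print(topic_id, topic)
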
# Build the training DataFrame: cleaned documents plus their category labels
def getTrain(contents_clean, df_news):
    df_train = pd.DataFrame({'contents_clean': contents_clean, 'label': df_news['category']})
    return df_train

# Map the text labels to integer ids
def Labe(df_train):
    label = df_train.label.unique()
    label_mapping = {}
    i = 1
    for label_name in label:
        label_mapping[label_name] = i
        i += 1
    df_train['label'] = df_train['label'].map(label_mapping)
    return df_train
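
pandas can build the same text-to-integer mapping with factorize; a small sketch on made-up labels (ids start at 1, matching the loop above):

import pandas as pd

labels = pd.Series(['体育', '科技', '体育', '财经'])
codes, uniques = pd.factorize(labels)
print(dict(zip(uniques, range(1, len(uniques) + 1))))   # {'体育': 1, '科技': 2, '财经': 3}
print(codes + 1)                                        # [1 2 1 3]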

# Split the data into training and test sets
from sklearn.model_selection import train_test_split
def splist(data):
    x_train, x_test, y_train, y_test = train_test_split(data['contents_clean'].values, data['label'].values, random_state=1)
    return x_train, x_test, y_train, y_test
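
train_test_split uses a 75/25 split when test_size is not given; a quick check on toy arrays:

from sklearn.model_selection import train_test_split
import numpy as np

X = np.arange(20).reshape(10, 2)
y = np.arange(10)
x_tr, x_te, y_tr, y_te = train_test_split(X, y, random_state=1)
print(x_tr.shape, x_te.shape)   # (7, 2) (3, 2) with the default 0.25 test share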




# Join each tokenized document back into a single space-separated string,
# which is the input format CountVectorizer/TfidfVectorizer expects
def strtolist(x_train):
    words = []
    for line_index in range(len(x_train)):
        try:
            words.append(' '.join(x_train[line_index]))
        except Exception:
            print(line_index)
    return words

# Build feature vectors and train a Naive Bayes classifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
def creatVectorizer_Bayes(words, y_word):
    # vec = CountVectorizer()
    vec = TfidfVectorizer(analyzer='word', max_features=4000, lowercase=False)
    print("creatVectorizer_Bayes:", words[0])
    print("creatVectorizer_Bayes:", len(words))
    vec.fit(words)
    classifier = MultinomialNB()
    classifier.fit(vec.transform(words), y_word)
    print("train score:", classifier.score(vec.transform(words), y_word))
    # return the fitted vectorizer and classifier so the held-out test set can be scored with them
    return vec, classifier
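
The vectorizer expects the space-joined strings produced by strtolist; a minimal end-to-end sketch on made-up documents and labels:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

docs = ['球队 赢得 比赛 冠军', '手机 芯片 正式 发布', '球队 比赛 进球']
labels = [0, 1, 0]
vec = TfidfVectorizer(analyzer='word', lowercase=False)
X = vec.fit_transform(docs)
clf = MultinomialNB().fit(X, labels)
print(clf.predict(vec.transform(['芯片 手机 发布'])))   # expected to lean towards label 1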



def main():
    data, contents = getdata()
    df_content, content_s = fenci(data)
    # clean the segmented documents (content_s), not the raw strings
    df_content, df_all_words, contents_clean = clearData(content_s)
    # print(data.shape)
    # print(len(contents_clean))
    # words_count = groupby(df_all_words)
    # wordCloud(words_count)
    # getkey(data, content_s)
    # LDA(contents_clean)
    data_train = getTrain(contents_clean, data)
    df_train = Labe(data_train)
    x_train, x_test, y_train, y_test = splist(df_train)
    train_words = strtolist(x_train)
    vec, classifier = creatVectorizer_Bayes(train_words, y_train)
    # score the model fitted on the training data against the held-out test set
    test_words = strtolist(x_test)
    print("test score:", classifier.score(vec.transform(test_words), y_test))



if __name__ == '__main__':
    main()

This is an early-stage study and a record of my own understanding; if anything is wrong, corrections are welcome. Thank you.
