import pandas as pd
import jieba
# Load a user dictionary so domain-specific terms segment as single tokens.
jieba.load_userdict('D:/jiebaDict.txt')

# One line of the corpus file == one document in the 'content' column.
df = pd.read_table('D:/特色小镇.txt', names=['content'], encoding='gb18030')
df.tail()

# Pull the documents out as a plain Python list and spot-check one.
content = df.content.values.tolist()
print(content[1])
# Segment every document into a list of tokens, keeping only lines that
# produce more than one token and contain non-whitespace text.
content_S = []
for line in content:
    current_segment = jieba.lcut(line)
    # BUGFIX: the original tested `current_segment != '\r\n'`, comparing a
    # *list* of tokens to a string — always True, so blank lines were never
    # filtered.  Check the joined text for non-whitespace content instead.
    if len(current_segment) > 1 and ''.join(current_segment).strip():
        content_S.append(current_segment)

content_S[100]  # notebook-style peek at one segmented document
# Wrap the segmented documents in a DataFrame for downstream processing.
df_content = pd.DataFrame({'content_S': content_S})

# Raw string: the original "D:\\data\stopwords.txt" relied on Python passing
# the unrecognized escape '\s' through verbatim — a SyntaxWarning since 3.12
# and slated to become an error.  The runtime path value is unchanged.
# quoting=3 (csv.QUOTE_NONE) keeps quote characters as literal stop words.
stopwords = pd.read_csv(r'D:\data\stopwords.txt', index_col=False, sep='\t',
                        quoting=3, names=['stopword'], encoding='utf-8')
def drop_stopwords(contents, stopwords):
    """Remove stop words from segmented documents.

    Args:
        contents: list of documents, each a list of word tokens.
        stopwords: iterable of stop words to drop.

    Returns:
        A tuple ``(contents_clean, all_words)``:
        - contents_clean mirrors ``contents`` with stop words removed;
        - all_words is a flat list of every kept word (as ``str``),
          in document order, for corpus-wide frequency counting.
    """
    # O(1) membership test; the original scanned the stop-word *list* for
    # every token, which is quadratic over the corpus.
    stopword_set = set(stopwords)
    contents_clean = []
    all_words = []
    for line in contents:
        line_clean = [word for word in line if word not in stopword_set]
        contents_clean.append(line_clean)
        all_words.extend(str(word) for word in line_clean)
    return contents_clean, all_words
# Convert both frames back to plain lists and strip the stop words.
contents = df_content.content_S.values.tolist()
stopwords = stopwords.stopword.values.tolist()
contents_clean, all_words = drop_stopwords(contents, stopwords)

# Cleaned documents, one row per document.
df_content = pd.DataFrame({'contents_clean': contents_clean})
df_content.head()

# Flat list of every kept word — one row per token, for frequency counts.
df_all_words = pd.DataFrame({'all_words': all_words})
df_all_words
import numpy  # kept from the original script; not referenced below

# Count how often each word occurs, most frequent first.
words_count = (
    df_all_words.groupby('all_words')
    .size()
    .reset_index(name='count')
    .sort_values(by=['count'], ascending=False)
    .reset_index(drop=True)
)
words_count
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# NOTE: the original had the IPython magic `%matplotlib inline` here, which
# is a SyntaxError in a plain .py file; it is only needed inside a notebook.
import matplotlib

matplotlib.rcParams['figure.figsize'] = (10.0, 5.0)

# Raw string avoids the invalid '\s' escape in the original font path
# (SyntaxWarning on Python 3.12+); the runtime value is unchanged.
# simhei.ttf supplies a CJK-capable font so Chinese words render correctly.
wordcloud = WordCloud(font_path=r'D:\data\simhei.ttf',
                      background_color='white', max_font_size=80)

# Top-100 words -> {word: count} mapping for the cloud layout.
word_frequence = {x[0]: x[1] for x in words_count.head(100).values}
wordcloud = wordcloud.fit_words(word_frequence)

plt.imshow(wordcloud)
plt.savefig('D:/特色小镇词云.png')
import jieba.analyse

# Show one sample document alongside its top tf-idf keywords.
index = 152
print(df['content'][index])
content_S_str = ''.join(content_S[index])
print(' '.join(jieba.analyse.extract_tags(content_S_str, topK=5, withWeight=False)))

# tf-idf keyword extraction: top-5 keywords for each of the first 152 documents.
for i in range(152):
    index = i
    content_S_str = ''.join(content_S[index])
    print(' '.join(jieba.analyse.extract_tags(content_S_str, topK=5, withWeight=False)))
# LDA topic model over the cleaned, tokenized documents.
from gensim import corpora, models, similarities
import gensim

# Map each token to an integer id, then encode documents as bag-of-words.
dictionary = corpora.Dictionary(contents_clean)
corpus = [dictionary.doc2bow(sentence) for sentence in contents_clean]

lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=20)

# Top words of the first topic, then of every topic.
print(lda.print_topic(0, topn=5))
for topic in lda.print_topics(num_topics=20, num_words=5):
    print(topic[1])

# Relative frequency of each word across the whole corpus.
words_count['word_frequency'] = words_count['count'] / words_count['count'].sum()
words_count