import pandas as pd
import jieba
import jieba.analyse
import re

# Load the user dictionary and the raw data
jieba.load_userdict('./dataset/dictdata/newwords.txt')
data1 = pd.read_csv('./dataset/rawdata/total.csv', encoding='utf-8', dtype=str)
data1 = data1.astype(str)
data1["fenci"] = data1["content"]

# Build the stopword set
with open('./dataset/dictdata/stopwords.txt', encoding='utf-8') as f:
    stopwords = {}.fromkeys([line.rstrip().strip('\n') for line in f])
eng_stopwords = set(stopwords)

# Strip punctuation and digits, segment each document, and drop stopwords
for index, row in data1.iterrows():
    tmp = re.sub(r"[\s+\.\!\/_,$%^*(【】:\]\[\-:;+\"\']+|[+——!,。?、~@#¥%……&*()]+|[0-9]+", "", row['content'])
    words = jieba.cut_for_search(tmp, HMM=True)
    w_ = [w for w in words if w not in eng_stopwords]
    # Assigning to `row` does not write back to the DataFrame, so use .at
    data1.at[index, 'fenci'] = " ".join(w_)

print(data1.head())
data1.to_csv('./dataset/rawdata/total_fenci_1.csv', columns=['title', 'content', 'fenci'], encoding='utf-8')
# to_excel does not accept an encoding argument in current pandas
data1.to_excel('./dataset/rawdata/total_fenci_1.xlsx', columns=['title', 'content', 'fenci'])
```
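Since `jieba.analyse` is imported but never used above, the same pipeline can also pull TF-IDF keywords per document. A minimal sketch, assuming `data1['fenci']` from the block above; the `topK=10` cutoff and the choice of the first row are illustrative only.

```python
import jieba.analyse

# Top TF-IDF keywords of the first cleaned document (topK is arbitrary)
text = data1.loc[0, 'fenci']
for word, weight in jieba.analyse.extract_tags(text, topK=10, withWeight=True):
    print(word, round(weight, 4))
```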
Word frequency statistics
```python
# -*- coding: UTF-8 -*-
import numpy as np
import pandas as pd
import jieba
import jieba.analyse
import codecs

# Load the segmented data: columns are title, content, fenci
data1 = pd.read_csv('./dataset/rawdata/total_fenci_1.csv', header=0, encoding='utf-8', dtype=str)
data1 = data1.astype(str)

# Record every token across the corpus
segments = []
for index, row in data1.iterrows():
    words = str(row['fenci']).strip('\n').split(' ')
    for word in words:
        segments.append({'word': word, 'count': 1})

# Aggregate the per-token counts and export them
dfSg = pd.DataFrame(segments)
dfWord = dfSg.groupby('word')['count'].sum()
dfWord.to_csv('./dataset/wordfrequency/word_frequency01.csv', encoding='utf-8')
dfWord.to_excel('./dataset/wordfrequency/word_frequency01.xlsx')
```
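Before relying on the exported table, the counts can be spot-checked by looking at the most common tokens. A minimal sketch assuming `dfWord` from the block above; the top-20 cutoff is arbitrary.

```python
# Print the 20 most frequent tokens, highest count first
print(dfWord.sort_values(ascending=False).head(20))
```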
Sentiment analysis
```python
import jieba
from snownlp import SnowNLP
import numpy as np
import pandas as pd

# Example: SnowNLP(u'这个东西真心赞').sentiments returns a positivity score in [0, 1]

# Load the segmented data: columns are title, content, fenci
data1 = pd.read_csv('./dataset/rawdata/total_fenci_1.csv', header=0, encoding='utf-8', dtype=str)
data1 = data1.astype(str)

# Add a sentiment column, initially empty
data1["sentiment"] = None

# Score each segmented document; assigning to `row` does not write back, so use .at
for index, row in data1.iterrows():
    data1.at[index, "sentiment"] = SnowNLP(str(row['fenci']).strip('\n')).sentiments

data1.to_csv('./dataset/sentiment/sentiment_1.csv', columns=['title', 'content', 'fenci', 'sentiment'], encoding='utf-8')
data1.to_excel('./dataset/sentiment/sentiment1.xlsx', columns=['title', 'content', 'fenci', 'sentiment'])
```
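Since the SnowNLP score is the probability that a document is positive, it can be bucketed into coarse labels for a quick overview. A minimal sketch assuming `data1` from the block above; the 0.5 threshold and the `label` column name are assumptions, not part of the original pipeline.

```python
import numpy as np

# Treat scores above 0.5 as positive, the rest as negative (threshold is an assumption)
data1['label'] = np.where(data1['sentiment'].astype(float) > 0.5, 'positive', 'negative')
print(data1['label'].value_counts())
```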
LDA topic model
```python
# -*- coding: utf-8 -*-
import codecs
import jieba
from gensim import corpora
from gensim.models import LdaModel
from gensim.corpora import Dictionary

# Read the whitespace-separated segmented documents, one per line
fr = open('./dataset/rawdata/total_fenci_2.csv', 'r', encoding='utf-8')
train = []
for line in fr:
    train.append(line.strip().split(' '))

# Build the dictionary and convert each document to a bag-of-words vector
dictionary = corpora.Dictionary(train)
corpus = [dictionary.doc2bow(text) for text in train]

# Train a 5-topic LDA model and print the top words of each topic
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=5)
topic_list = lda.print_topics(5)
for topic in topic_list:
    print(topic)
```
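To judge whether five topics is a sensible choice, the trained model can be scored with gensim's `CoherenceModel` and inspected per document. A minimal sketch assuming `lda`, `corpus`, `train`, and `dictionary` from the block above; the `c_v` measure is one common option, not the only one.

```python
from gensim.models import CoherenceModel

# Topic mixture of the first document
print(lda.get_document_topics(corpus[0]))

# Corpus-level topic coherence (c_v); higher generally means more interpretable topics
cm = CoherenceModel(model=lda, texts=train, dictionary=dictionary, coherence='c_v')
print(cm.get_coherence())
```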