## 1、文本清理
import pandas as pd
import pymysql
from sqlalchemy import create_engine
import re
import jieba
import jieba.analyse
# Load the scraped Weibo posts into a DataFrame; index_col=None keeps all
# CSV columns as regular columns rather than using one as the index.
mblog_frame = pd.read_csv('mblog.csv',index_col=None)
# Notebook-style preview of the first two rows (output shown in the image below).
mblog_frame.head(2)
![在这里插入图片描述](https://i-blog.csdnimg.cn/blog_migrate/ed15bf18c0403c752bc75856de792974.png)
def clean_text(raw):
    """
    Strip non-body characters from a Weibo post's raw text.

    Removes HTML-like tags, '#' characters and newlines, and a trailing
    URL.  Takes a row mapping with a 'raw_text' key and returns the
    cleaned string, or None when 'raw_text' is falsy.
    (NOTE(review): the original docstring claimed a tuple return — the
    function has always returned a plain string.  Also, a pandas NaN in
    'raw_text' is truthy and would crash re.sub; rows are assumed to
    hold str or None here — confirm upstream.)
    """
    if raw['raw_text']:
        text = re.sub(r'<[^<]*>', '', raw['raw_text'])  # drop markup tags
        text = re.sub(r'[#\n]+', '', text)              # drop hashes and newlines
        text = re.sub(r'http://.*$', '', text)          # drop trailing share URL
        return text
    return None
def get_chinese_text(raw):
    """
    Keep only the Chinese characters of a post's cleaned text.

    Collects every run of two or more CJK characters from raw['text']
    and concatenates them; digits, Latin letters and isolated single
    Chinese characters are discarded.  Returns (mid, chinese_text),
    or None when 'text' is falsy.
    """
    if not raw['text']:
        return None
    cjk_runs = re.findall(r"[\u4e00-\u9fff]{2,}", raw['text'])
    return (raw['mid'], ''.join(cjk_runs))
def get_keywords(raw):
    """
    Extract keywords from the cleaned Chinese text via jieba's TF-IDF.

    Regular posts use jieba's default topK (20 keywords); posts with
    isLongText == 1 get topK=50.  (NOTE(review): the original docstring
    said 40 keywords for long texts while the code passes 50 — confirm
    which was intended.)

    Returns (mid, keyword_list), or None when 'chinese_text' is falsy.
    """
    if raw['chinese_text']:
        if raw['isLongText'] == 1:
            keywords = jieba.analyse.extract_tags(raw['chinese_text'],topK=50)
        else:
            keywords = jieba.analyse.extract_tags(raw['chinese_text'])
        return (raw['mid'],keywords)
    else:
        return None
def clean_created_date(raw, today='09-15', yesterday='09-14'):
    """
    Normalize a relative 'created_at' timestamp to an MM-DD date string.

    Weibo renders recent times as e.g. '5分钟前' (ends with '前') or
    '昨天 12:00' (starts with '昨天'); these are mapped to *today* and
    *yesterday* respectively.  Any other value is returned unchanged.

    The defaults preserve the original hard-coded scrape date
    (September 15); pass today/yesterday explicitly to reuse this on a
    different scrape day.
    """
    created_date = raw['created_at']
    if created_date.endswith('前'):
        created_date = today
    elif created_date.startswith('昨天'):
        created_date = yesterday
    return created_date
# Apply the row-level cleaners and persist the cleaned posts.
mblog_frame['created_date'] = mblog_frame.apply(clean_created_date, axis=1)
# NOTE(review): this stores clean_text's output under 'chinese_text' even
# though get_chinese_text (the Chinese-only extractor) is never applied —
# confirm whether that was intended.
mblog_frame['chinese_text'] = mblog_frame.apply(clean_text, axis=1)
res_mblog = pd.DataFrame(mblog_frame, columns=['mid','chinese_text','like_count','comments_count','reposts_count','created_date','user_id'])
# utf_8_sig adds a BOM so Excel opens the Chinese text correctly.
res_mblog.to_csv('clean_mblog.csv', encoding='utf_8_sig', index=False)
# Flatten (mid, [keywords]) pairs into one row per (mid, keyword).
# Skip rows where get_keywords returned None (empty chinese_text); the
# original comprehension would raise TypeError on pair[1] for those.
mid_with_keyword = list(mblog_frame.apply(get_keywords, axis=1))
keywords_list = [(pair[0], w) for pair in mid_with_keyword if pair for w in pair[1]]
mid_with_keyword = pd.DataFrame(keywords_list, columns=['mid', 'keyword'])
mid_with_keyword.to_csv('keyword.csv', encoding='utf_8_sig', index=False)
## 2、制作词云
keyword_frame = pd.read_csv('keyword.csv',index_col=False)
all_keyword = list(keyword_frame.keyword)
from collections import Counter
word_freq_frame = pd.DataFrame(Counter(all_keyword).items())
word_freq_frame.columns=['word','count']
top100_freq_word = word_freq_frame.sort_values('count',ascending=0).head(100)
top100_freq_word_dict=dict(list(top100_freq_word.apply(lambda w:(w['word'],w['count']),axis=1)))
from wordcloud import WordCloud,STOPWORDS
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif']=['SimHei']
plt.rcParams['axes.unicode_minus']=False
%matplotlib inline
plt.rcParams['figure.dpi'] = 100
wc = WordCloud(background_color="white",max_words=2000,font_path='simhei.ttf')
wc.generate_from_frequencies(top100_freq_word_dict)
plt.imshow(wc)
plt.axis('off')
plt.show()
![在这里插入图片描述](https://i-blog.csdnimg.cn/blog_migrate/a7c5606ca847408718c6bcd81688388f.png)