1.爬取微博数据
首先,从微博爬取以“华为5G芯片”为关键词搜索的微博,并将数据整理写入数据库。
import requests  # do not alias as `re`: that shadows the stdlib regex module used in step 2
import time
import pandas as pd
import pymysql
from sqlalchemy import create_engine  # create_engine is called below but was never imported

# Search-result API URL obtained from a manual m.weibo.cn search for the
# keyword "华为5G芯片"; the page number is appended to fetch successive pages.
target_url = "https://m.weibo.cn/api/container/getIndex?containerid=100103type%3D1%26q%3D%E5%8D%8E%E4%B8%BA5G%E8%8A%AF%E7%89%87&page_type=searchall&page="
# Total number of result pages to crawl.
total_page = 400
# Accumulator for post records.
mblog = list()
# Accumulator for author records.
user = list()
# Crawl page by page, collecting post and user fields.
for page in range(total_page):
    print('Crawling page:%d/%d' % (page + 1, total_page))
    cur_url = target_url + str(page + 1)
    # timeout keeps a stalled request from hanging the whole crawl
    source_json = requests.get(cur_url, timeout=10).json()
    time.sleep(1)  # throttle so we are not flagged as a malicious crawler
    # The structure below follows the JSON actually returned by the API;
    # .get() guards against pages where 'data'/'cards' is absent.
    source_data = source_json.get('data', {})
    cards = source_data.get('cards', [])
    for card in cards:
        if 'mblog' in card:
            cur_blog = card['mblog']
            cur_user = cur_blog['user']
            mblog.append({'user_id': cur_user['id'],
                          'text': cur_blog['text'],
                          'mid': cur_blog['mid'],
                          'created_at': cur_blog['created_at'],
                          'comments_count': cur_blog['comments_count'],
                          'source': cur_blog['source'],
                          })
            user.append({'user_id': cur_user['id'],
                         'description': cur_user['description'],
                         'follow_count': cur_user['follow_count'],
                         'followers_count': cur_user['followers_count'],
                         'gender': cur_user['gender'],
                         'screen_name': cur_user['screen_name'],
                         })
print('Crawl finished')
# Number of posts collected.
print(len(mblog))
# Convert to DataFrames and drop exact duplicate rows.
mblog_frame = pd.DataFrame(mblog).drop_duplicates()
user_frame = pd.DataFrame(user).drop_duplicates()
# Database connection (MySQL via pymysql; utf8mb4 so emoji in posts survive).
engine = create_engine('mysql+pymysql://root:password@localhost:3306/weibo_keyword?charset=utf8mb4', echo=False)
# Persist both tables, appending to any existing rows.
mblog_frame.to_sql(name='mblog', con=engine, if_exists='append', index=False)
user_frame.to_sql(name='user', con=engine, if_exists='append', index=False)
2.数据清理
获取的微博数据中的text中含有大量的乱码或html格式数据,需要将其中的中文内容提取出来,这里使用到正则表达式模块。
import re
import jieba
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif']=['SimHei']#用来显示中文标签
plt.rcParams['axes.unicode_minus']=False #用来显示负号
plt.rcParams['figure.figsize'] = (16.0, 10.0)
%matplotlib inline
# 创建sql链接,从数据库中读取保存的微博数据
engine = create_engine('mysql+pymysql://root:password@localhost:3306/weibo_keyword?charset=utf8mb4',echo=False)
mblog_frame = pd.read_sql_table('mblog',con=engine).drop_duplicates()
user_frame = pd.read_sql_table('user',con=engine).drop_duplicates()
# 定义数据清理函数,使用到jieba模块的cut方法
def clear_text(text):
    """Return only the Chinese content of *text*, or None if none remains.

    Runs of fewer than two consecutive Chinese characters are discarded,
    which drops stray single characters embedded in HTML/mojibake noise.
    """
    if not text:
        return None
    # [\u4e00-\u9fff] is the Unicode block of common CJK ideographs.
    chinese_runs = re.findall(r"[\u4e00-\u9fff]{2,}", text)
    cleaned = ''.join(chinese_runs)
    return cleaned or None
def get_word_list(text):
    """Segment *text* with jieba and return the tokens as a list.

    Returns None for empty or None input.
    """
    return list(jieba.cut(text)) if text else None
def drop_stopword(word_list, stopword_list=None):
    """Return *word_list* with every word in *stopword_list* removed.

    Returns None when *word_list* is falsy. A None *stopword_list* is
    treated as empty — the original raised TypeError ("in None") when the
    default was used.
    """
    if not word_list:
        return None
    # A set gives O(1) membership tests instead of O(n) per word.
    stopwords = set(stopword_list) if stopword_list else set()
    return [word for word in word_list if word not in stopwords]
def get_final_word_list(pure_text, stopword_list):
    """Segment *pure_text* and strip stopwords, returning the final word list.

    Returns None when *pure_text* is empty/None (propagated from the helpers).
    """
    return drop_stopword(get_word_list(pure_text), stopword_list)
# Load the stopword list from disk, one word per line.
with open('stopword.txt', 'r', encoding='utf-8') as f:
    stopword = [line.strip('\n') for line in f]
# Keep only the Chinese content of each post, then produce its
# stopword-free token list.
pure_chinese_text = mblog_frame['text'].apply(clear_text)
word_list = pure_chinese_text.apply(get_final_word_list, stopword_list=stopword)
# Build word-frequency data (a dict) from the token lists.
from collections import Counter
from wordcloud import WordCloud

# Flatten the per-post token lists into one list. Posts whose cleaned text
# was empty are None (not lists), so skip them — the original crashed with
# "TypeError: 'NoneType' object is not iterable" on such entries.
all_word_list = [w for words in word_list if words for w in words]
word_freq_frame = pd.DataFrame(Counter(all_word_list).items())
word_freq_frame.columns = ['word', 'count']
# Keep the 100 most frequent words.
top100_freq_word = word_freq_frame.sort_values('count', ascending=False).head(100)
top100_freq_word_dict = dict(zip(top100_freq_word['word'], top100_freq_word['count']))
# Draw the word cloud (a CJK-capable font is required for Chinese glyphs).
wc = WordCloud(background_color="white", max_words=2000, font_path='simhei.ttf')
# The original line was mangled: generate_from_frequencies() lost its closing
# parenthesis and swallowed the plt.imshow() call — split back into two statements.
wc.generate_from_frequencies(top100_freq_word_dict)
plt.imshow(wc)
plt.axis('off')
plt.show()