本文主要目标是获取新浪微博上知乎官方账号所发的微博内容信息。
1. 分析网页
通过仔细分析知乎官方账号页面信息(https://m.weibo.cn/u/1904769205),发现可以通过JSON方式获取微博数据。如下图所示:
知乎官方账号的JSON链接为:https://m.weibo.cn/api/container/getIndex?type=uid&value=1904769205&containerid=1076031904769205&page=1
2. 提取关键信息
在上述分析的基础之上,提取微博数据并存入MySQL数据库中。
具体实现代码如下所示:
# -*- coding: utf-8 -*-
"""
Created on Tue Aug 1 19:55:55 2017
@author: Administrator
"""
import requests
from bs4 import BeautifulSoup
import json
import time
import random
import pymysql.cursors
def crawlDetailPage(url,page):
#读取微博网页的JSON信息
req = requests.get(url)
jsondata = req.text
data = json.loads(jsondata)
#获取每一页的数据
#content = data['cards']
#备注:近期知乎JSON数据字段有所改变,只需要在取内容之前加个['data']即可,如下所示
content = data['data']['cards']
#循环输出每一页的微博信息
for i in range(0,10):
contentUrl = content[i]['scheme']
createdTime = content[i]['mblog']['created_at']
rawText = content[i]['mblog']['text']
soup1 = BeautifulSoup(rawText,'lxml')
text = soup1.get_text()
#textLength = content[i]['mblog']['textLength']
#source = content[i]['mblog']['source']
#userId = content[i]['mblog']['user']['id']
#userName = content[i]['mblog']['user']['screen_name']
#userFollowers = content[i]['mblog']['user']['followers_count']
forward = content[i]['mblog']['reposts_count']
comment = content[i]['mblog']['comments_count']
like = content[i]['mblog']['attitudes_count']
print("第{}页第{}条微博的链接为:{}".format(page,i+1,contentUrl))
print("创建于:{}".format(createdTime))
print("正文内容为:{}".format(text))
#print("正文长度为:{}".format(textLength))
#print("来源为:{}".format(source))
#print("用户ID为:{}".format(userId))
#print("用户名为:{}".format(userName))
#print("用户粉丝数:{}".format(userFollowers))
print("转发数:{}".format(forward))
print("评论数:{}".format(comment))
print("点赞数:{}".format(like))
'''
数据库操作
'''
#获取数据库链接
connection = pymysql.connect(host = 'localhost',
user = 'root',
password = '123456',
db = 'weibo',
charset = 'utf8mb4')
try:
#获取会话指针
with connection.cursor() as cursor:
#创建sql语句
sql = "insert into `zhihu` (`url`,`createTime`,`text`,`forward`,`comment`,`like`) values (%s,%s,%s,%s,%s,%s)"
#执行sql语句
cursor.execute(sql,(contentUrl,createdTime,text,forward,comment,like))
#提交数据库
connection.commit()
finally:
connection.close()
for i in range(1,6):
print("正在获取第{}页微博数据:".format(i))
#知乎官方微博数据的JSON链接
url = "https://m.weibo.cn/api/container/getIndex?type=uid&value=1904769205&containerid=1076031904769205&page=" + str(i)
crawlDetailPage(url,i)
#设置休眠时间
time.sleep(random.randint(31,33))
运行结果如下图所示:
MySQL数据库中的数据存储如下图所示: