一、说在前面
微博的反爬做得还是挺好的,登录过程中有跳转、加密、验证码等玩意儿夹杂在一起,模拟登录的难度比较大。所以偷懒用了cookie登录的方式,访问wap站来减小难度。
思路是:从一个大V开始抓取他的粉丝列表,再从粉丝列表中筛选出粉丝数较多的账号,继续爬取这些账号的粉丝列表;获取大量微博UID后,拼接出每个用户的个人资料页和所发微博页的地址,从中爬取需要的数据。
二、上代码
2.1 获取微博UID
import requests
from bs4 import BeautifulSoup
import re
import pymysql
import time
import random
# Matches the follower count in text such as "粉丝123人" and captures the digits.
fansnum_pat = re.compile(r'粉丝(\d+)人')
# Captures the numeric user id from a "uid=<digits>" query parameter.
uid_pat = re.compile(r'uid=(\d+)')


def get_table_content(obj):
    """Extract ``(uid, nickname, fans_num)`` from one fan-list entry.

    ``obj`` must expose ``obj.tr.find_all('td')``; it is presumably a
    BeautifulSoup ``<table>`` tag for one entry of a fans list page
    (confirm against the caller). Only the second ``<td>`` is read.

    Returns:
        A tuple ``(uid, nickname, fans_num)`` where ``uid`` is the digit
        string captured from the href of the cell's second ``<a>``,
        ``nickname`` is the text of the cell's ``.a`` link, and
        ``fans_num`` is the follower count as an ``int``, or ``-1`` when
        the count cannot be found in the cell's text.

    Raises:
        IndexError: if the row has fewer than two cells or the cell has
            fewer than two links.
        AttributeError: if the link's href contains no ``uid=<digits>``.
    """
    cells = obj.tr.find_all('td')
    info_cell = cells[1]
    name_link = info_cell.a

    # The second link in the cell carries the uid in its query string.
    href = info_cell.find_all('a')[1].get('href')
    uid = uid_pat.search(href).group(1)

    # The follower count is expected in the second text fragment of the
    # cell. A missing fragment is treated like an unparsable one rather
    # than raising IndexError.
    fragments = list(info_cell.stripped_strings)
    fans_text = fragments[1] if len(fragments) > 1 else ''
    match = fansnum_pat.search(fans_text)
    if match is None:
        fans_num = -1
        print('fans count not found:', fans_text)
    else:
        fans_num = int(match.group(1))

    nickname = name_link.get_text()
    return uid, nickname, fans_num
def sql_commit(sql, args=None):
    """Execute one SQL statement and commit it, rolling back on failure.

    Uses the module-level ``cursor`` and ``con`` objects, which must exist
    before this function is called.

    Args:
        sql: The statement to run. May contain ``%s`` placeholders when
            ``args`` is given.
        args: Optional parameters forwarded to ``cursor.execute`` so the
            driver does the quoting. Prefer this over formatting scraped
            values (e.g. nicknames containing quotes) into ``sql``.

    Any exception raised by execute/commit is printed together with the
    statement and swallowed after a rollback, so one bad row does not
    stop the caller.
    """
    try:
        cursor.execute(sql, args)
        con.commit()
    except Exception as e:
        # Best-effort: undo the failed statement and keep going.
        con.rollback()
        print(e, sql)
if __name__ == '__main__':
ua = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36'
headers = {
'User-Agent': ua}
#my_cookie = '_T_WM=c13752c6f0ebbb0d5ce702a225841bed; H5_INDEX=3; H5_INDEX_TITLE=%E5%AF%BF%E5%AF%BF%E4%B8%8A%E8%AF%BE%E9%83%BD%E5%90%AC%E4%B8%8D%E6%87%82%E4%BA%86T_T; SUB=_2A250rL6kDeThGeRM6lYY8CvFzjyIHXVUbsLsrDV6PUJbkdBeLXnekW1aDDbKylZr7vp_NvLVNcVobIaZUQ..; SUHB=0y0IKSrzA6PoeE; SCF=AoLS5R_zSY-F3wyOUjqx66T8mZJzgNGU4seu2b56gfc_DQywsxi_ld8YsmxwhZLG9CqaSVDC7e61z8kXNEYLglM.; SSOLoginState=1504235252'
my_cookie = '_T_WM=c13752c6f0ebbb0d5ce702a225841bed; SCF=AoLS5R_zSY-F3wyOUjqx66T8mZJzgNGU4seu2b56gfc_DQywsxi_ld8YsmxwhZLG9CqaSVDC7e61z8kXNEYLglM.; SUB=_2A250tM0YDeRhGeBN7lcS9yfFzDuIHXVUVtNQrDV6PUJbkdBeLXbGkW0q0ENC4BfdsLTFX55Ty_AcjDJEyw..; SUHB=0LcSLfObfbhXAy; SSOLoginState=1504755016'
cookie_dict = {
"Cookie": my_cookie}
cookie = requests.utils.cookiejar_from_dict(cookie_dict, cookiejar=None, overwrite=True)
s = requests.Session()
s.headers.update(headers)
s.cookies = cookie
con = pymysql.connect(host="localhost", user="root", password="root", db="data", charset="utf8", port=3306)
cursor = con.cursor()
count = 0
out_flag = 0
uid_waiting = {
'6190532921'} #以某一个粉丝数量较大的微博为起点