目录
一、前言
前几天突然想爬一爬微博的热门评论玩,就间断地挤出来了一点时间写了一个简略版。
最大的难点是新浪的访客机制,导致无法直接用requests爬取,需要先在cookie上下功夫,但是又不想花太多时间去破解,于是决定用selenium先模拟一下,获取cookie之后再通过requests爬取。
程序已打包成exe,需要的同学可以下载体验,压缩包内有运行教程
二、代码
代码描述:需要安装谷歌浏览器并下载安装与谷歌浏览器版本对应的驱动
驱动下载镜像:https://registry.npmmirror.com/binary.html?path=chromedriver/ (原 npm.taobao.org 镜像已于 2022 年停用)
selenium仅仅用于获取cookie,实际爬取将直接使用requests请求,以保证爬取效率
话不多说,代码也不复杂,直接上代码了,关键的地方有注释
import requests
# import selenium
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import re
import json
import time
class spider_weibo(object):
    """Scrape the hot comments of a single Weibo post.

    Sina's visitor system blocks plain ``requests`` sessions, so a headless
    Chrome driven by Selenium is used exactly once to obtain the visitor
    cookies; all subsequent crawling goes through ``requests`` for speed.
    """

    # Template for the hot-comment AJAX endpoint: format with (weibo_id, page).
    COMMENT_URL = ('https://weibo.com/aj/v6/comment/big?ajwvr=6&id={}'
                   '&root_comment_max_id_type=0&root_comment_ext_param='
                   '&page={}&filter=hot&filter_tips_before=0&from=singleWeiBo')

    def __init__(self, id):
        # id: numeric id of the Weibo post whose comments will be crawled.
        self.chrome_options = Options()
        # Headless mode: no browser window pops up.
        self.chrome_options.add_argument('--headless')
        self.driver = webdriver.Chrome(options=self.chrome_options,
                                       executable_path='chromedriver.exe')
        self.wait = WebDriverWait(self.driver, 100)
        self.headers = {
            'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
            # Filled in by spider() once the visitor cookie has been fetched.
            'Cookie': '',
        }
        self.weibo_id = id

    def get_cookielist(self):
        """Load the comment endpoint in headless Chrome and return the
        visitor cookies as a single ``name=value;name=value;`` string."""
        print('正在获取cookie')
        url = self.COMMENT_URL.format(self.weibo_id, 1)
        self.driver.get(url)
        # Give Sina's visitor system time to issue the cookies.
        time.sleep(7)
        cookies = self.driver.get_cookies()
        # join() instead of repeated += — builds the header string in one pass.
        return ''.join('{}={};'.format(c['name'], c['value']) for c in cookies)

    def get_proxy(self, order_id):
        """Fetch one proxy address (feature still incomplete).

        Bug fix: the original discarded the response; return its text so a
        caller can actually use the proxy.
        """
        url = 'http://tpv.daxiangdaili.com/ip/?tid={}&num=1&sortby=speed&delay=5'.format(order_id)
        response = requests.get(url)
        return response.text

    def use_bs4(self, retext):
        """Parse one JSON payload from the comment endpoint and return the
        concatenated user-name + comment texts, one per line."""
        text = ''
        payload = json.loads(retext)
        # The rendered comment markup lives under data.html in the envelope.
        html = payload.get("data").get('html')
        soup = BeautifulSoup(html, 'lxml')
        roots = (soup.select('.list_box')[0]
                     .select('.list_ul')[0]
                     .find_all('div', attrs={'node-type': 'root_comment'}))
        for root in roots:
            try:
                wrap = root.find_all('div', attrs={'node-type': 'replywrap'})[0]
                # WB_text holds "username: comment body".
                content = wrap.find_all('div', attrs={'class': 'WB_text'})[0].text
                text += content + '\n'
            except (IndexError, AttributeError) as e:
                # Skip malformed comment nodes, but keep the diagnostic
                # instead of the original bare print('error').
                print('error', e)
        return text

    def spider(self, page_num):
        """Crawl pages 1..page_num of hot comments into comment.txt."""
        session = requests.Session()
        # Obtain the visitor cookie via Selenium, then reuse it for requests.
        cookie_str = self.get_cookielist()
        print("cookie:", cookie_str)
        self.headers['Cookie'] = cookie_str
        # utf-8 so Chinese text is written correctly; ``with`` guarantees the
        # file is closed even if a request raises (original leaked the handle).
        with open('comment.txt', 'w', encoding='utf-8') as file:
            # Bug fix: comment pages are 1-based; the original started at 0.
            for page in range(1, page_num + 1):
                try:
                    url = self.COMMENT_URL.format(self.weibo_id, page)
                    response = session.get(url, headers=self.headers)
                    # Bug fix: 'unicode' is not a valid codec name, so the
                    # original raised LookupError on every response.text
                    # access; the endpoint serves utf-8.
                    response.encoding = 'utf-8'
                    text = self.use_bs4(response.text)
                    print(text)
                    file.write(text)
                    # Throttle so we don't hammer the endpoint.
                    time.sleep(2)
                except Exception as e:
                    print(e)
if __name__ == '__main__':
    # Number of hot-comment pages to crawl.
    page_num = int(input("Enter page num: "))
    # Renamed from `id` — the original shadowed the builtin; also dropped
    # the stray C-style semicolons.
    weibo_id = input("Enter Weibo id: ")
    weibo_spider = spider_weibo(weibo_id)
    weibo_spider.spider(page_num)
才疏学浅,代码简陋,如有不足之处恳请指出!