使用selenium获取cookie后,使用request请求,使用Bs4获取信息
转载至:点击此处到原地址https://blog.csdn.net/u010244522/article/details/79644067
import re
import requests
from selenium import webdriver
from bs4 import BeautifulSoup
import time
#新建session
s = requests.session()
url = "https://home.cnblogs.com/u/TankXiao"
def get_cookies(url):
u'启动selenium获取浏览器cookies'
driver = webdriver.Firefox()
driver.get(url+"/followers")
driver.find_element_by_xpath('//*[@id="input1"]').send_keys("***")
driver.find_element_by_xpath('//*[@id="input2"]').send_keys("****/")
driver.find_element_by_xpath('//*[@id="signin"]').click()
time.sleep(15)
cookie = driver.get_cookies() #获取浏览器cookies
driver.quit()
return cookie
def add_cookies(cookie):
u"往session添加cookies"
c = requests.cookies.RequestsCookieJar()
for i in cookie: #添加cookie到CookieJar
c.set(i["name"], i["value"])
s.cookies.update(c) #更新session里的cookie
def get_ye_num(url):
u'获取粉丝页面数量'
r1 = s.get(url+"/relation/followers")
soup = BeautifulSoup(r1.content,"html.parser")
#获取粉丝数
fans_text = soup.find_all(class_="current_nav")
print(fans_text[0].string)
fans_num = re.findall(u"的粉丝\((.+?)\)",fans_text[0].string) #返回未list数据
fans_num = int(fans_num[0]) #将fans_num从string转换为int
#计算粉丝页数
ye = int(fans_num/45)+1 #ye为int型
return ye
def save_name(ye):
u'保存粉丝名'
text = ''
for i in range(1, ye+1):
r2 = s.get(url+"/relation/followers/?page=%s"%i)
soup1 = BeautifulSoup(r2.content,"html.parser")
fans_list = soup1.find_all(class_="avatar_name") #查找粉丝
for j in fans_list: #从list中获取每个粉丝的名并添加到text
text = text+j.string
name = bytes(text, encoding="utf-8") #将text转换为bytes格式
w = open('name.txt','wb') #打开name.txt文件为可写入
w.write(name) #写入
w.close()
if __name__ == "__main__":
cookie = get_cookies(url)
add_cookies(cookie)
ye = get_ye_num(url)
save_name(ye)
可以使用下面方法,跳过报错
try:
*******
except Exception as msg:
print(u"抓取粉丝名称过程中报错了 :%s"%str(msg))
在文件中追加数据:
with open("name.txt", "a") as f: # 追加写入
f.write(name.encode("utf-8")+"\n")
# python3的改成下面这两行
# with open("name.txt", "a", encoding="utf-8") as f: # 追加写入
# f.write(name+"\n")
————————————————
版权声明:本文为CSDN博主「bluelikk」的原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接及本声明。
原文链接:https://blog.csdn.net/u010244522/article/details/79644067