小白也能懂
- 因知乎主页必须登录后才能访问信息,且登录界面的验证方式破解难度较高,经过一番分析,最终选择了使用selenium模拟登录、再获取cookie的方式。
- 主页信息爬取及热点信息爬取的实现均无问题。
- 在对指定问题进行搜索时返回无内容,应该是知乎的反爬措施较为严格。查阅了许多博主的文章及各种帖子,都没有找到较好的解决方法,各位大佬如果有的话,还望不吝指点。该函数也一并放进帖子中;尝试了许多种办法,写得比较乱,将就着看看吧。
上代码,不懂私信可回。
1、第三方库
import json
import requests
from selenium import webdriver
from bs4 import BeautifulSoup
from loguru import logger
import time
2、获取cookie值
def get_cookie(base_url):
    """Log in to Zhihu through the QQ OAuth popup with Selenium and persist
    the session cookies to ``zhihucookie.txt``.

    The cookies are written as a single ``name=value; name=value; ...``
    string, ready to be used directly as a ``Cookie`` request header.

    Parameters
    ----------
    base_url : str
        The Zhihu page to open; its login wall exposes the QQ login button.

    Side effects: drives a Chrome browser, overwrites ``zhihucookie.txt``.
    """
    browser = webdriver.Chrome(executable_path="********")  # your chromedriver path
    try:
        browser.get(base_url)
        cur_handle = browser.current_window_handle  # remember the main window handle
        print(cur_handle)
        # Click the "QQ login" button on Zhihu's login wall.
        qq_login = browser.find_element_by_xpath('//*[@id="root"]/div/main/div/div/div/div[3]/span[2]/button[2]')
        qq_login.click()
        time.sleep(3)  # wait for the QQ login popup to load
        # QQ login opens a new browser window; switch to it.
        for handle in browser.window_handles:
            if handle != cur_handle:
                browser.switch_to.window(handle)
        time.sleep(3)
        # The actual QQ form is embedded in an iframe.
        browser.switch_to.frame("ptlogin_iframe")
        browser.find_element_by_link_text('帐号密码登录').click()
        time.sleep(3)
        # Fill in credentials and submit.
        text_qq_account = browser.find_element_by_id("u")
        text_qq_password = browser.find_element_by_id("p")
        bt_qq_login = browser.find_element_by_id("login_button")
        text_qq_account.send_keys("********")  # your QQ account
        text_qq_password.send_keys("********")  # your QQ password
        bt_qq_login.click()
        time.sleep(3)
        browser.switch_to.window(cur_handle)  # back to the main Zhihu window
        # Serialize cookies for a Cookie header. (The original code round-tripped
        # the list through json.dumps/json.loads, which was a no-op and is removed.)
        cookies = browser.get_cookies()
        cookiestr = '; '.join(item["name"] + "=" + item["value"] for item in cookies)
        with open('zhihucookie.txt', 'w') as f:
            f.write(cookiestr)
        time.sleep(4)
    finally:
        # Always release the browser, even if any step above raises.
        browser.quit()
3、实现登录知乎主页热点
def login(base_url):
    """Fetch the Zhihu hot list at *base_url* using the saved cookie, print
    each entry, append all entries to a dated text file, and return the links.

    If the saved cookie has expired (detected by a login form in the
    response), a fresh cookie is obtained via ``get_cookie`` and the whole
    request is retried.

    Parameters
    ----------
    base_url : str
        URL of the Zhihu hot list page.

    Returns
    -------
    list[str]
        The article/question links of the hot items.
    """
    with open('zhihucookie.txt', 'r', encoding='utf-8') as f:
        zhihucookie = f.read()
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36',
        'Cookie': zhihucookie}
    resp = requests.get(base_url, headers=headers)
    soup = BeautifulSoup(resp.text, 'lxml')
    if soup.find('button', type='submit'):
        print('原cookie信息失效,即将重新获取cookie信息!!')
        get_cookie(base_url)
        # BUG FIX: the original fell through after the retry and kept parsing
        # the stale logged-out page (crashing on the missing hot list).
        # Return the retry's result instead.
        return login(base_url)
    hots = soup.find('div', class_='HotList-list').find_all('section', class_='HotItem')
    link_lists = []
    time_name = '知乎hot50' + str(time.strftime("%Y-%m-%d", time.localtime())) + '.txt'
    with open(time_name, 'a', encoding='utf-8') as f:
        for hot in hots:
            rank = hot.find('div', class_='HotItem-rank').get_text()
            print(rank, end='\t')
            hot_value = str(hot.find('div', class_='HotItem-metrics').get_text()).replace('分享', '')
            print(hot_value, end='\t')
            title = hot.find('h2').get_text()
            print(title)
            # Excerpt is optional on some items.
            mes = ''
            if hot.find('p', class_='HotItem-excerpt'):
                mes = hot.find('p', class_='HotItem-excerpt').get_text()
            hot_link = hot.find('a')['href']
            link_lists.append(hot_link)
            print("原文链接: " + hot_link)
            hot_mes = parse_hotlink(hot_link)
            # Insert line breaks after each sentence for readability in the file.
            hot_mes = str(hot_mes).replace('。', '。\n\t')
            time.sleep(1)  # be polite: throttle the per-item detail requests
            print('')
            f.write('{}\t{}\t{}\t{}\n\t{}\n\t{}\n\n\n\n'.format(rank, hot_value, title, mes, hot_link, hot_mes))
    return link_lists
4、解析热点函数
def parse_hotlink(link_list):
    """Fetch one hot-item question page and return the text of its first
    visible answer, or ``None`` if the page cannot be fetched/parsed.

    Parameters
    ----------
    link_list : str
        URL of a single question page. (NOTE: the name is historical — it is
        one link, not a list; kept for caller compatibility.)
    """
    try:
        with open('zhihucookie.txt', 'r', encoding='utf-8') as f:
            zhihucookie = f.read()
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36',
            'cookie': zhihucookie}
        resp = requests.get(link_list, headers=headers)
        soup = BeautifulSoup(resp.text, 'lxml')
        if soup.find('button', type='submit'):
            print('原cookie信息失效,即将重新获取cookie信息!!')
            get_cookie(link_list)
            # BUG FIX: propagate the retry's result instead of falling through
            # to parse the stale, logged-out page.
            return parse_hotlink(link_list)
        hot_mess = soup.find('div', class_='List').find_all('div', class_='List-item')
        for hot_mes in hot_mess:
            # Only the first answer's rich-text content is needed.
            return hot_mes.find('div', class_='RichContent-inner').get_text()
    except Exception as err:
        # Best effort: the original used a bare ``except: pass`` which hid
        # every failure. Keep the best-effort contract but log the cause.
        logger.warning('解析热点链接失败 {}: {}', link_list, err)
    return None
5、指定问题爬取
def get_answer():
    """Interactively search Zhihu for a user-supplied question via Selenium.

    Logs in through the QQ OAuth popup, prompts on stdin for a query, submits
    the site search, and prints the parsed result page.

    NOTE(review): Zhihu's anti-scraping measures may still return an empty
    result page here — this is the unresolved issue described in the post.
    """
    url = 'https://www.zhihu.com'
    browser = webdriver.Chrome(executable_path="E:\chromedriver.exe")  # your chromedriver path
    browser.get(url)
    cur_handle = browser.current_window_handle  # remember the main window handle
    print(cur_handle)
    # Click the "QQ login" button on Zhihu's login wall.
    bt_opt_login = browser.find_element_by_xpath('//*[@id="root"]/div/main/div/div/div/div[3]/span[2]/button[2]')
    bt_opt_login.click()
    time.sleep(3)  # wait for the QQ login popup to load
    # QQ login opens a new browser window; switch to it.
    for handle in browser.window_handles:
        if handle != cur_handle:
            browser.switch_to.window(handle)
    time.sleep(3)
    # The actual QQ form is embedded in an iframe.
    browser.switch_to.frame("ptlogin_iframe")
    browser.find_element_by_link_text('帐号密码登录').click()
    time.sleep(3)
    # Fill in credentials and submit.
    text_qq_account = browser.find_element_by_id("u")
    text_qq_password = browser.find_element_by_id("p")
    bt_qq_login = browser.find_element_by_id("login_button")
    text_qq_account.send_keys("******")  # your QQ account
    text_qq_password.send_keys("*****")  # your QQ password
    bt_qq_login.click()
    time.sleep(3)
    browser.switch_to.window(cur_handle)  # back to the main Zhihu window
    question = input("请输入您要搜索的问题,按回车键结束!")
    browser.find_element_by_xpath('//*[@id="Popover1-toggle"]').send_keys(question)
    browser.find_element_by_xpath('//*[@id="root"]/div/div[2]/header/div[1]/div[1]/div/form/div/div/label/button').click()
    time.sleep(3)
    # BUG FIX: ``page_source`` is already a str; the original then accessed
    # ``.text`` on it, which raises AttributeError.
    html = browser.page_source
    soup = BeautifulSoup(html, 'lxml')
    print(soup)
6、主函数
def start():
    """Entry point: configure error logging, then crawl the Zhihu hot list."""
    # Rotate the error log file once it grows past 500 MB.
    logger.add("runtime_err.log", rotation="500 MB")
    hot_url = 'https://www.zhihu.com/hot'
    login(hot_url)


if __name__ == '__main__':
    start()