小白也能懂
- 因知乎主页必须登录后才能访问信息,且登录界面的验证方式破解难度较高,经过一番分析,最终选择了使用selenium模拟登录、再获取cookie的方式。
- 主页信息爬取及热点信息爬取的实现均无问题。
- 在对指定问题进行搜索时返回无内容,应该是知乎的反爬措施较为严格。查阅了许多博主的文章及各种帖子,都没有找到较好的解决方法,各位大佬如果有的话,还望不吝指点。该函数也一并放进帖子中;尝试了许多种办法,写得比较乱,将就着看看吧。
上代码,不懂私信可回。
1、第三方库
import json
import requests
from selenium import webdriver
from bs4 import BeautifulSoup
from loguru import logger
import time
2、获取cookie值
def get_cookie(base_url):
    """Log in to Zhihu through the QQ OAuth popup with Selenium and persist
    the session cookies to ``zhihucookie.txt``.

    The cookies are written as a single ``name=value; name=value; ...``
    string, ready to be used directly as a ``Cookie`` request header.

    Parameters
    ----------
    base_url : str
        The Zhihu page to open; its login wall exposes the QQ login button.

    Side effects: drives a Chrome browser, overwrites ``zhihucookie.txt``.
    """
    browser = webdriver.Chrome(executable_path="********")  # your chromedriver path
    try:
        browser.get(base_url)
        cur_handle = browser.current_window_handle  # remember the main window handle
        print(cur_handle)
        # Click the "QQ login" button on Zhihu's login wall.
        qq_login = browser.find_element_by_xpath('//*[@id="root"]/div/main/div/div/div/div[3]/span[2]/button[2]')
        qq_login.click()
        time.sleep(3)  # wait for the QQ login popup to load
        # QQ login opens a new browser window; switch to it.
        for handle in browser.window_handles:
            if handle != cur_handle:
                browser.switch_to.window(handle)
        time.sleep(3)
        # The actual QQ form is embedded in an iframe.
        browser.switch_to.frame("ptlogin_iframe")
        browser.find_element_by_link_text('帐号密码登录').click()
        time.sleep(3)
        # Fill in credentials and submit.
        text_qq_account = browser.find_element_by_id("u")
        text_qq_password = browser.find_element_by_id("p")
        bt_qq_login = browser.find_element_by_id("login_button")
        text_qq_account.send_keys("********")  # your QQ account
        text_qq_password.send_keys("********")  # your QQ password
        bt_qq_login.click()
        time.sleep(3)
        browser.switch_to.window(cur_handle)  # back to the main Zhihu window
        # Serialize cookies for a Cookie header. (The original code round-tripped
        # the list through json.dumps/json.loads, which was a no-op and is removed.)
        cookies = browser.get_cookies()
        cookiestr = '; '.join(item["name"] + "=" + item["value"] for item in cookies)
        with open('zhihucookie.txt', 'w') as f:
            f.write(cookiestr)
        time.sleep(4)
    finally:
        # Always release the browser, even if any step above raises.
        browser.quit()
3、实现登录知乎主页热点
def login(base_url):
    """Fetch the Zhihu hot list at *base_url* using the saved cookie, print
    each entry, append all entries to a dated text file, and return the links.

    If the saved cookie has expired (detected by a login form in the
    response), a fresh cookie is obtained via ``get_cookie`` and the whole
    request is retried.

    Parameters
    ----------
    base_url : str
        URL of the Zhihu hot list page.

    Returns
    -------
    list[str]
        The article/question links of the hot items.
    """
    with open('zhihucookie.txt', 'r', encoding='utf-8') as f:
        zhihucookie = f.read()
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36',
        'Cookie': zhihucookie}
    resp = requests.get(base_url, headers=headers)
    soup = BeautifulSoup(resp.text, 'lxml')
    if soup.find('button', type='submit'):
        print('原cookie信息失效,即将重新获取cookie信息!!')
        get_cookie(base_url)
        # BUG FIX: the original fell through after the retry and kept parsing
        # the stale logged-out page (crashing on the missing hot list).
        # Return the retry's result instead.
        return login(base_url)
    hots = soup.find('div', class_='HotList-list').find_all('section', class_='HotItem')
    link_lists = []
    time_name = '知乎hot50' + str(time.strftime("%Y-%m-%d", time.localtime())) + '.txt'
    with open(time_name, 'a', encoding='utf-8') as f:
        for hot in hots:
            rank = hot.find('div', class_='HotItem-rank').get_text()
            print(rank, end='\t')
            hot_value = str(hot.find('div', class_='HotItem-metrics').get_text()).replace('分享', '')
            print(hot_value, end='\t')
            title = hot.find('h2').get_text()
            print(title)
            # Excerpt is optional on some items.
            mes = ''
            if hot.find('p', class_='HotItem-excerpt'):
                mes = hot.find('p', class_='HotItem-excerpt').get_text()
            hot_link = hot.find('a')['href']
            link_lists.append(hot_link)
            print("原文链接: " + hot_link)
            hot_mes = parse_hotlink(hot_link)
            # Insert line breaks after each sentence for readability in the file.
            hot_mes = str(hot_mes).replace('。', '。\n\t')
            time.sleep(1)  # be polite: throttle the per-item detail requests
            print('')
            f.write('{}\t{}\t{}\t{}\n\t{}\n\t{}\n\n\n\n'.format(rank, hot_value, title, mes, hot_link, hot_mes))
    return link_lists
4、解析热点函数
def parse_hotlink(link_list):
    """Fetch one hot-item question page and return the text of its first
    visible answer, or ``None`` if the page cannot be fetched/parsed.

    Parameters
    ----------
    link_list : str
        URL of a single question page. (NOTE: the name is historical — it is
        one link, not a list; kept for caller compatibility.)
    """
    try:
        with open('zhihucookie.txt', 'r', encoding='utf-8') as f:
            zhihucookie = f.read()
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36',
            'cookie': zhihucookie}
        resp = requests.get(link_list, headers=headers)
        soup = BeautifulSoup(resp.text, 'lxml')
        if soup.find('button', type='submit'):
            print('原cookie信息失效,即将重新获取cookie信息!!')
            get_cookie(link_list)
            # BUG FIX: propagate the retry's result instead of falling through
            # to parse the stale, logged-out page.
            return parse_hotlink(link_list)
        hot_mess = soup.find('div', class_='List').find_all('div', class_='List-item')
        for hot_mes in hot_mess:
            # Only the first answer's rich-text content is needed.
            return hot_mes.find('div', class_='RichContent-inner').get_text()
    except Exception as err:
        # Best effort: the original used a bare ``except: pass`` which hid
        # every failure. Keep the best-effort contract but log the cause.
        logger.warning('解析热点链接失败 {}: {}', link_list, err)
    return None
5、指定问题爬取
def get_answer():
    """Interactively search Zhihu for a user-supplied question via Selenium.

    Logs in through the QQ OAuth popup, prompts on stdin for a query, submits
    the site search, and prints the parsed result page.

    NOTE(review): Zhihu's anti-scraping measures may still return an empty
    result page here — this is the unresolved issue described in the post.
    """
    url = 'https://www.zhihu.com'
    browser = webdriver.Chrome(executable_path="E:\chromedriver.exe")  # your chromedriver path
    browser.get(url)
    cur_handle = browser.current_window_handle  # remember the main window handle
    print(cur_handle)
    # Click the "QQ login" button on Zhihu's login wall.
    bt_opt_login = browser.find_element_by_xpath('//*[@id="root"]/div/main/div/div/div/div[3]/span[2]/button[2]')
    bt_opt_login.click()
    time.sleep(3)  # wait for the QQ login popup to load
    # QQ login opens a new browser window; switch to it.
    for handle in browser.window_handles:
        if handle != cur_handle:
            browser.switch_to.window(handle)
    time.sleep(3)
    # The actual QQ form is embedded in an iframe.
    browser.switch_to.frame("ptlogin_iframe")
    browser.find_element_by_link_text('帐号密码登录').click()
    time.sleep(3)
    # Fill in credentials and submit.
    text_qq_account = browser.find_element_by_id("u")
    text_qq_password = browser.find_element_by_id("p")
    bt_qq_login = browser.find_element_by_id("login_button")
    text_qq_account.send_keys("******")  # your QQ account
    text_qq_password.send_keys("*****")  # your QQ password
    bt_qq_login.click()
    time.sleep(3)
    browser.switch_to.window(cur_handle)  # back to the main Zhihu window
    question = input("请输入您要搜索的问题,按回车键结束!")
    browser.find_element_by_xpath('//*[@id="Popover1-toggle"]').send_keys(question)
    browser.find_element_by_xpath('//*[@id="root"]/div/div[2]/header/div[1]/div[1]/div/form/div/div/label/button').click()
    time.sleep(3)
    # BUG FIX: ``page_source`` is already a str; the original then accessed
    # ``.text`` on it, which raises AttributeError.
    html = browser.page_source
    soup = BeautifulSoup(html, 'lxml')
    print(soup)
6、主函数
def start():
    """Entry point: configure error logging, then crawl the Zhihu hot list."""
    # Rotate the error log file once it grows past 500 MB.
    logger.add("runtime_err.log", rotation="500 MB")
    hot_url = 'https://www.zhihu.com/hot'
    login(hot_url)


if __name__ == '__main__':
    start()