Simulating Browser Visits to a CSDN Blog with a Selenium Crawler

This script uses Selenium to drive a real Chrome browser through every article on a CSDN blog. It first probes how many list pages the blog has, scrapes the article URLs from each list page with urllib and BeautifulSoup, and then opens each article in the browser.

import time
from urllib.request import urlopen
from urllib import request

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options


def visit_article(articles):
    # Launch Chrome with flags that keep it stable in containers and CI;
    # uncomment --headless to run without a visible browser window.
    chrome_options = Options()
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    # chrome_options.add_argument('--headless')
    driver = webdriver.Chrome(options=chrome_options)

    time.sleep(2)
    for article in articles:
        # Open the article, let it load, reload it once, then move on.
        driver.get(article)
        time.sleep(2)
        driver.refresh()
        time.sleep(5)
    driver.quit()
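
The fixed sleeps are the simplest way to let each page finish loading; if they prove flaky, Selenium's explicit waits can be used instead. A minimal sketch, assuming CSDN article pages expose an <article> element (that selector is my guess and may need adjusting):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def wait_for_article(driver, timeout=10):
    # Block until the article body appears in the DOM, raising
    # TimeoutException if it never does within `timeout` seconds.
    WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.TAG_NAME, 'article'))
    )

Calling wait_for_article(driver) right after driver.get(article) would then replace the first time.sleep(2) in the loop.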


def get_page_nums(page_url):
    # Probe /article/list/1, /article/list/2, ... until a page comes back
    # without an article list; the previous page number is the page count.
    page_num = 0
    while True:
        page_num += 1
        req = request.Request(f'{page_url}/article/list/{page_num}')
        user_agent = ('Mozilla/5.0 (Windows NT 6.3; WOW64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/45.0.2454.101 Safari/537.36')
        req.add_header('User-Agent', user_agent)
        html = urlopen(req)
        bs_obj = BeautifulSoup(html.read(), 'html.parser')

        article_div = bs_obj.find('div', {'class': 'article-list'})
        if not article_div:
            return page_num - 1
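
get_page_nums above and get_page_article_urls below build the same urllib request by hand. A small helper could factor that duplication out; a minimal sketch (fetch_soup and USER_AGENT are names introduced here, not from the original script):

from urllib.request import Request, urlopen
from bs4 import BeautifulSoup

USER_AGENT = ('Mozilla/5.0 (Windows NT 6.3; WOW64) '
              'AppleWebKit/537.36 (KHTML, like Gecko) '
              'Chrome/45.0.2454.101 Safari/537.36')

def fetch_soup(url):
    # Request the page with a desktop User-Agent and return parsed HTML.
    req = Request(url, headers={'User-Agent': USER_AGENT})
    with urlopen(req) as resp:
        return BeautifulSoup(resp.read(), 'html.parser')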


def get_page_article_urls(page_url):
    # Fetch a single list page and collect the href of every link inside
    # the article-list container.
    req = request.Request(page_url)
    user_agent = ('Mozilla/5.0 (Windows NT 6.3; WOW64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/45.0.2454.101 Safari/537.36')
    req.add_header('User-Agent', user_agent)
    html = urlopen(req)
    bs_obj = BeautifulSoup(html.read(), 'html.parser')

    articles = []
    article_div = bs_obj.find('div', {'class': 'article-list'})
    for link in article_div.findAll('a'):
        if 'href' in link.attrs:
            articles.append(link.attrs['href'])
    return articles
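
findAll('a') collects every link inside the article-list container, so the returned list can contain duplicates or links that are not article pages. If that matters, a post-filter like this sketch could help; the /article/details/ pattern is an assumption about CSDN's current URL scheme:

import re

def filter_article_urls(urls):
    # Keep only links that look like article pages, dropping duplicates
    # while preserving their original order.
    pattern = re.compile(r'/article/details/\d+')
    seen = set()
    kept = []
    for url in urls:
        if pattern.search(url) and url not in seen:
            seen.add(url)
            kept.append(url)
    return kept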


def main():
    # Count the list pages first, then visit every article on each page.
    page_num = get_page_nums('https://blog.csdn.net/u011503666')
    print(f'page_num: {page_num}')
    for x in range(1, page_num + 1):
        articles = get_page_article_urls(
            f'https://blog.csdn.net/u011503666/article/list/{x}')
        visit_article(articles)

if __name__ == '__main__':
    main()
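
The user ID u011503666 is hard-coded in two places inside main. A hypothetical variant (not part of the original script) that takes the blog's home URL on the command line instead:

import sys

def main(blog_url):
    # Same flow as above, but with the blog URL passed in as a parameter.
    page_num = get_page_nums(blog_url)
    print(f'page_num: {page_num}')
    for x in range(1, page_num + 1):
        articles = get_page_article_urls(f'{blog_url}/article/list/{x}')
        visit_article(articles)

if __name__ == '__main__':
    main(sys.argv[1] if len(sys.argv) > 1 else 'https://blog.csdn.net/u011503666')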