Using a Selenium crawler to simulate browser visits to a blog

The script below first counts how many article-list pages a CSDN blog has, then collects the article URLs from each list page, and finally drives Chrome through Selenium to open every article, simulating a reader's visit.
import time

from urllib import request
from urllib.request import urlopen

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# One User-Agent header shared by both scraping functions, so the
# requests look like they come from a normal desktop browser.
USER_AGENT = ('Mozilla/5.0 (Windows NT 6.3; WOW64) '
              'AppleWebKit/537.36 (KHTML, like Gecko) '
              'Chrome/45.0.2454.101 Safari/537.36')
def visit_article(articles):
    """Open every article URL in Chrome so each visit is counted."""
    chrome_options = Options()
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    driver = webdriver.Chrome(options=chrome_options)
    time.sleep(2)
    for article in articles:
        driver.get(article)
        time.sleep(2)     # give the page time to load
        driver.refresh()  # refresh once more so the view registers
        time.sleep(5)
    driver.quit()
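If the script has to run on a machine without a display (a server or a CI box), Chrome can be started headless. A minimal sketch of the same driver setup with the flag added; whether CSDN counts headless visits the same way is not something the original script establishes:

    chrome_options = Options()
    chrome_options.add_argument('--headless')             # no browser window
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    driver = webdriver.Chrome(options=chrome_options)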
def get_page_nums(page_url):
    """Probe /article/list/<n> pages in order and return the number of
    the last page that still contains an article list."""
    page_num = 0
    while True:
        page_num += 1
        req = request.Request(f'{page_url}/article/list/{page_num}')
        req.add_header('User-Agent', USER_AGENT)
        html = urlopen(req)
        bs_obj = BeautifulSoup(html.read(), 'html.parser')
        # Pages past the end come back without the article-list div.
        article_div = bs_obj.find('div', {'class': 'article-list'})
        if not article_div:
            return page_num - 1
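get_page_nums assumes CSDN answers out-of-range list pages with a normal HTML page that merely lacks the article-list div. If the server returned an HTTP error instead, urlopen would raise. A defensive variant, sketched here as a hypothetical get_page_nums_safe that reuses the script's imports and USER_AGENT, treats an error page the same as an empty one:

from urllib.error import HTTPError

def get_page_nums_safe(page_url):
    page_num = 0
    while True:
        page_num += 1
        req = request.Request(f'{page_url}/article/list/{page_num}')
        req.add_header('User-Agent', USER_AGENT)
        try:
            html = urlopen(req)
        except HTTPError:
            return page_num - 1   # an error page also means "past the end"
        bs_obj = BeautifulSoup(html.read(), 'html.parser')
        if not bs_obj.find('div', {'class': 'article-list'}):
            return page_num - 1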
def get_page_article_urls(page_url):
    """Collect the article URLs linked from one article-list page."""
    req = request.Request(page_url)
    req.add_header('User-Agent', USER_AGENT)
    html = urlopen(req)
    bs_obj = BeautifulSoup(html.read(), 'html.parser')
    articles = []
    article_div = bs_obj.find('div', {'class': 'article-list'})
    for link in article_div.find_all('a'):
        if 'href' in link.attrs:
            articles.append(link.attrs['href'])
    return articles
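Depending on the page markup, the article-list div may link the same article several times (title link, read-more link, comment link), in which case the returned list contains duplicates. If that happens, the list can be deduplicated before it is handed to visit_article; dedupe_urls below is a hypothetical helper, not part of the original script:

def dedupe_urls(urls):
    """Drop duplicate URLs while keeping the original order."""
    seen = set()
    unique = []
    for url in urls:
        if url not in seen:
            seen.add(url)
            unique.append(url)
    return unique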
def main():
    page_num = get_page_nums('https://blog.csdn.net/u011503666')
    print(f'page_num: {page_num}')
    for x in range(1, page_num + 1):
        articles = get_page_article_urls(
            f'https://blog.csdn.net/u011503666/article/list/{x}')
        visit_article(articles)


if __name__ == '__main__':
    main()
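The fixed time.sleep calls are the bluntest way to wait for a page. Selenium's explicit waits poll for a condition and move on as soon as it holds; the sketch below rewrites the visiting loop around WebDriverWait, assuming that the <body> element appearing is an acceptable "page is ready" signal (visit_article_with_waits is a hypothetical variant, not the original function):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def visit_article_with_waits(articles):
    chrome_options = Options()
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    driver = webdriver.Chrome(options=chrome_options)
    wait = WebDriverWait(driver, 10)   # up to 10 seconds per condition
    for article in articles:
        driver.get(article)
        wait.until(EC.presence_of_element_located((By.TAG_NAME, 'body')))
        driver.refresh()
        wait.until(EC.presence_of_element_located((By.TAG_NAME, 'body')))
    driver.quit()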