初学python,花了一天时间鼓捣了一个爬虫。
#coding=utf-8
import requests
from bs4 import BeautifulSoup
import re
import string
from selenium import webdriver
import time
# Base URL of the target site (domain anonymized by the author); per-page URLs
# are built by appending '440_<page>.html' to this prefix inside webcrawler().
# NOTE(review): name is a misspelling of "urlprefix"; kept as-is because
# webcrawler() below references it by this exact name.
urlprelix = 'http://www.******.com/'
def webcrawler(max_page):
page = 1
driverold = webdriver.Chrome('/Applications/chromedriver')#chromedriver的路径
driverold.get('https://pan.baidu.com/')#登录百度网盘生成cookie,后面可以导入新的网页,不用重复登录
time.sleep(30)
cookies_list = driverold.get_cookies()
driverold.close()
while page <= max_page:
if page is 1:
url = urlprelix
else:
url = urlprelix +'440_'+ str(page) + '.html'#每页的full url
#添加header防反爬虫
headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
s