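# Scrape the books a Douban user has read, turning pages automatically by changing the page URL; however, the IP seems to have been banned.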
import urllib.request
import http.cookiejar
import requests
from bs4 import BeautifulSoup
import re
#保存文件位置
#filename = 'cookies.txt'
#创建一个实例对象保存cookies
#cookie = http.cookiejar.MozillaCookieJar(filename)
#创建一个存放cookies的容器
#handler = urllib.request.HTTPCookieProcessor(cookie)
#新建一个支持cookie的opener
#opener = urllib.request.build_opener(handler)
#打开网站
#response = opener.open("https://www.douban.com/")
#存储cookies
#cookie.save(ignore_discard=True, ignore_expires=True)
#https://book.douban.com/people/3551583/collect
# URL of the user's "collect" (books read) listing; {start} is the paging offset.
PAGE_URL_TEMPLATE = (
    'https://book.douban.com/people/162338500/collect'
    '?start={start}&sort=time&rating=all&filter=all&mode=grid'
)
# The offset advances by this much per request.
PAGE_SIZE = 15
# Last offset to request (inclusive).
LAST_OFFSET = 345
# Captures the value of every title="..." attribute that is followed by a space.
# Compiled once here instead of on every loop iteration.
TITLE_PATTERN = re.compile(r'(?<=title=").+?(?=" )')
# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 10


def build_page_url(start):
    """Return the listing URL for the page that begins at offset ``start``.

    The original code tried to patch the offset in with
    ``replace("start=", ...)``, but the URL contained ``start&`` (no ``=``),
    so the offset was never inserted. Formatting a template avoids that.
    """
    return PAGE_URL_TEMPLATE.format(start=start)


def extract_titles(html):
    """Return the list of ``title="..."`` attribute values found in ``html``."""
    return TITLE_PATTERN.findall(html)


def main():
    """Fetch every listing page and print the titles found on each one."""
    # Cookies are read from a cookies.txt file in the working directory.
    cookies = http.cookiejar.MozillaCookieJar()
    cookies.load('cookies.txt', ignore_discard=True, ignore_expires=True)

    for start in range(0, LAST_OFFSET + 1, PAGE_SIZE):
        # Request the URL for *this* offset; the original always requested
        # the unmodified template and so re-downloaded the same page.
        html = requests.get(
            build_page_url(start),
            cookies=cookies,
            timeout=REQUEST_TIMEOUT,
        ).text
        for title in extract_titles(html):
            print(title)


if __name__ == "__main__":
    main()
#soup = BeautifulSoup(r, 'html5lib')
#items = soup.find_all(text='title')
#print (items)