import requests
import time
from bs4 import BeautifulSoup
# First listing page (start=0) of the Douban book tag "小说" (URL-encoded in the path).
# NOTE: this address is not part of `urls`, so the crawl loop does not visit it.
url = 'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=0&type=T'

# Follow-up listing pages, one URL per `start` offset: 20, 40, 60, 80.
# The comprehension has to wrap the whole format() call. Passing a generator
# expression *into* format() yields a single URL containing the generator's
# repr, which is why only one page was ever requested.
urls = [
    'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start={}'.format(i)
    for i in range(20, 100, 20)
]
# for a in urls:
# print(a)
def get_books(url, data=None):
    """Download one listing page and print the details of every book on it.

    For each book the title, publication/author line, rating and detail-page
    link are printed to stdout as ``key value`` pairs.

    Args:
        url: Address of the listing page to fetch.
        data: Output is produced only when this is ``None`` (the default);
            any other value suppresses printing. The page is still fetched.

    Returns:
        None.

    Raises:
        requests.RequestException: If the request times out, fails, or the
            server answers with an HTTP error status.
    """
    # A timeout keeps a stalled connection from hanging the crawl forever.
    web_data = requests.get(url, timeout=10)
    # Fail loudly on 4xx/5xx instead of silently parsing an error page.
    web_data.raise_for_status()
    soup = BeautifulSoup(web_data.text, 'lxml')

    # The title anchor carries both the book title and its link (href).
    title_links = soup.select('div.info > h2 > a')
    pub_infos = soup.select('div.pub')
    ratings = soup.select('span.rating_nums')

    # Pause between requests so consecutive pages are not fetched back to back.
    time.sleep(2)

    if data is not None:
        return

    # NOTE(review): zip() pairs the three lists by position and stops at the
    # shortest; if any book lacks one of these elements the columns would
    # shift - confirm against the real page markup.
    for title, pub_info, rating in zip(title_links, pub_infos, ratings):
        record = {
            'title:': title.get('title'),
            'author:': pub_info.get_text(),
            'rate:': rating.get_text(),
            'link:': title.get('href'),
        }
        for key, value in record.items():
            print(key, value)
# Crawl each listing page in `urls` in turn; get_books() prints the results
# and sleeps between requests.
for single_url in urls:
    get_books(single_url)
# Question: 为什么只能爬取一页? (Why does this only crawl one page?)