import requests
from bs4 import BeautifulSoup
import json
def get_html(url, timeout=10):
    """Download ``url`` and return the response body as text.

    The request is sent with a desktop-browser ``user-agent`` header.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The decoded body of the HTTP response.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status (4xx/5xx).
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36',
    }
    resp = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on an error status instead of handing an error page to
    # the HTML parser, which would silently produce no records.
    resp.raise_for_status()
    return resp.text
# Module-level accumulator: html_parse() appends one dict per book here,
# and the script tail serialises the whole list to JSON.
allbks = []
def html_parse():
    """Scrape every list page and collect one record per book.

    Each URL from ``all_page()`` is downloaded with ``get_html`` and parsed
    with BeautifulSoup (lxml). For every page the texts of the ``div.title``
    links, the ``div.abstract`` blocks and the ``span.rating_nums`` inside
    ``div.rating`` are gathered and paired by position. Each resulting dict
    (keys ``'name'``, ``'author'``, ``'star'``) is printed and appended to
    the module-level ``allbks`` list.

    Returns:
        None. Results are accumulated in ``allbks``.
    """
    for url in all_page():
        soup = BeautifulSoup(get_html(url), 'lxml')

        names = [
            _text_or_empty(div.find('a')).strip()
            for div in soup.find_all('div', class_='title')
        ]
        authors = [
            div.get_text().replace("\n", "").strip()
            for div in soup.find_all('div', class_='abstract')
        ]
        stars = [
            _text_or_empty(div.find('span', class_='rating_nums'))
            for div in soup.find_all('div', class_='rating')
        ]

        # NOTE(review): the three lists are matched purely by position; if a
        # page contains an entry lacking one of these divs altogether, the
        # fields of later entries would shift. Confirm against the markup.
        for name, author, star in zip(names, authors, stars):
            record = {'name': name, 'author': author, 'star': star}
            print(record)
            allbks.append(record)


def _text_or_empty(tag):
    """Return ``tag.get_text()``, or ``''`` when the lookup found nothing."""
    # find() yields None for a missing element; substituting '' keeps the
    # entry in place instead of raising AttributeError mid-scrape.
    return tag.get_text() if tag is not None else ''
def all_page(total=100, page_size=25):
    """Build the URLs of the paginated list pages.

    Args:
        total: Exclusive upper bound for the ``start`` offset. The default
            of 100 yields the offsets 0, 25, 50 and 75.
        page_size: Step between consecutive ``start`` offsets.

    Returns:
        A list of URL strings, one per offset in
        ``range(0, total, page_size)``; empty when ``total`` is 0 or less.

    Raises:
        ValueError: If ``page_size`` is 0 (raised by ``range``).
    """
    base_url = 'https://www.douban.com/doulist/45004834/?start='
    return [base_url + str(offset) for offset in range(0, total, page_size)]
# Script entry: scrape all pages into ``allbks``, then persist them as JSON.
html_parse()
filename = "E:/T100.json"
# Open with 'w', not 'a': appending a second JSON array on a re-run would
# leave a file that no longer parses as JSON.
with open(filename, 'w', encoding="utf-8") as f:
    # ensure_ascii=False keeps non-ASCII text readable in the output file.
    json.dump(allbks, f, ensure_ascii=False)
# Report success only after the file has actually been written.
print('保存成功!')
# Python 爬取豆瓣 Top100 图书信息
# 最新推荐文章于 2024-05-08 15:12:03 发布