参考了这个哥们的思路:https://blog.csdn.net/qq_40123329/article/details/81807759
一共250本书,分10页展示,每页25本书。
每一个图书的信息:
豆瓣网每一个table中的源码格式:
<table width="100%">
<tr class="item">
<td width="100" valign="top">
<a class="nbg" href="https://book.douban.com/subject/1770782/"
onclick="moreurl(this,{i:'0'})"
>
<img src="https://img3.doubanio.com/view/subject/m/public/s1727290.jpg" width="90" />
</a>
</td>
<td valign="top">
<div class="pl2">
<a href="https://book.douban.com/subject/1770782/" onclick="moreurl(this,{i:'0'})" title="追风筝的人"
>
追风筝的人
</a>
<img src="https://img3.doubanio.com/pics/read.gif" alt="可试读" title="可试读"/>
<br/>
<span style="font-size:12px;">The Kite Runner</span>
</div>
<p class="pl">[美] 卡勒德·胡赛尼 / 李继宏 / 上海人民出版社 / 2006-5 / 29.00元</p>
<div class="star clearfix">
<span class="allstar45"></span>
<span class="rating_nums">8.9</span>
<span class="pl">(
445012人评价
)</span>
</div>
<p class="quote" style="margin: 10px 0; color: #666">
<span class="inq">为你,千千万万遍</span>
</p>
</td>
</tr>
</table>
<p class="ul"></p>
代码实现:
from bs4 import BeautifulSoup
import re
import requests
import pandas as pd
def getHTMLText(url):
    """Fetch *url* and return its decoded HTML text.

    Returns None (after printing 'error') if the request fails, so the
    caller can detect a failed page fetch.
    """
    try:
        # BUG FIX: a User-Agent belongs in the request *headers*; the
        # original passed it via params=, which only appended it to the
        # query string and sent no UA header at all.
        headers = {'user-agent': 'chrome/10.0'}
        r = requests.get(url, headers=headers)
        r.raise_for_status()  # raise on 4xx/5xx responses
        # Guess the encoding from the body to avoid mojibake on Chinese text.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Narrow catch: network/HTTP errors only, not programming errors.
        print('error')
        return None
def parseHTML(bookinfo, html):
    """Parse one Top-250 page and append one row per book to *bookinfo*.

    Each row is [title, score, people, detail, abstract, link].
    Returns *bookinfo* for convenience (it is also mutated in place).
    """
    soup = BeautifulSoup(html, 'html.parser')
    # Each book lives in its own <table width="100%"> (see sample above).
    for table in soup.find_all('table', {"width": "100%"}):  # find_all: findAll is a deprecated alias
        # First <div> is class "pl2"; its first <a> is the title link.
        link = table.div.a['href']
        title = table.div.a.text.strip().replace('\n', '').replace(' ', '')
        # "( 445012人评价 )" -> "445012"; guard against a missing number
        # instead of indexing findall()[0], which raised IndexError.
        people_text = table.find('span', {'class': 'pl'}).text.strip()
        match = re.search(r'[0-9]+', people_text)
        people = match.group() if match else '0'
        score = table.find('span', {'class': 'rating_nums'}).text.strip()
        detail = table.find('p', {"class": "pl"}).text
        # The one-line quote (<span class="inq">) is optional on some books.
        inq = table.find('span', {"class": "inq"})
        abstract = inq.text.strip() if inq else 'no abstract'
        bookinfo.append([title, score, people, detail, abstract, link])
    return bookinfo
def saveInfo(bookinfo, path):
    """Persist the collected book rows as an Excel spreadsheet at *path*."""
    columns = ['书籍名称', '豆瓣评分', '评价人数', '书籍信息', '书籍描述', '链接']
    frame = pd.DataFrame(bookinfo, columns=columns)
    frame.to_excel(path, index=False)
def main():
    """Crawl all 10 pages of the Douban book Top-250 and save one Excel file."""
    book = []
    path = r'doubanbook2.xlsx'
    base_url = 'https://book.douban.com/top250?start='
    # 10 pages x 25 books; the start= offset steps by 25.
    # FIX: the inner copy loop reused `i` and shadowed this loop variable;
    # renamed to `page` and replaced the manual append loop with extend().
    for page in range(10):
        url = base_url + str(page * 25)
        print(url)
        html = getHTMLText(url)
        # Fresh per-page list, merged into the overall collection.
        book.extend(parseHTML([], html))
    saveInfo(book, path)
# Guard the entry point so importing this module does not start the crawl.
if __name__ == '__main__':
    main()
爬虫效果: