一、分析页面确定需要获取的信息
- 页面通过控制 URL 的最后一个参数,实现分页
- 确定我们想要获取的信息:排名、书名、图片地址、作者、推荐指数、五星评分次数、价格
- 通过审查元素,确定怎么得到这些信息(XPath)
- 确定思路
使用 page 变量实现分页,使用 requests 请求当当网,对返回的 HTML 进行 XPath 解析,解析之后的内容存到文件中
二、实现爬取代码
- 爬取网页
def request_dangdang(url, timeout=10):
    """Fetch the HTML of one Dangdang ranking page.

    Args:
        url: Page URL to request.
        timeout: Seconds before the request is aborted. The original call had
            no timeout, so a stalled server would hang the crawler forever.

    Returns:
        The response body as text on HTTP 200, otherwise None.
    """
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code == 200:
            return response.text
        return None  # explicit: non-200 responses yield no content
    except requests.RequestException:
        return None
- 解析爬取的网页,获得需要的内容
def parse_result(html):
    """Parse a Dangdang bestseller page into a list of book dicts.

    Args:
        html: Raw HTML text of one ranking page.

    Returns:
        A list of dicts with keys: range (rank), title, recommend,
        author, times (five-star count) and price.
    """
    boards = []
    tree = etree.HTML(html)
    rows = tree.xpath("//ul[@class='bang_list clearfix bang_list_mode']/li")
    for row in rows:
        board = {}
        columns = row.xpath('div')
        board['range'] = columns[0].text
        board['title'] = columns[2].xpath('a')[0].text
        # './/' keeps the search relative to this column; the original bare
        # '//' searched the whole document and returned the first matching
        # span on the page for EVERY row.
        board['recommend'] = columns[3].xpath('.//span[@class="tuijian"]')[0].text
        board['author'] = columns[4].xpath('a')[0].text
        board['times'] = columns[6].xpath('span')[0].text
        board['price'] = columns[7].xpath('.//span[@class="price_n"]')[0].text
        boards.append(board)
    return boards
这里利用生成器得到一个迭代器也是不错的方法
def parse_result(html):
    """Lazily yield one book dict per row of a Dangdang ranking page.

    Generator variant of the list version: same keys, but rows are
    produced one at a time instead of materializing the whole list.
    """
    # (removed unused leftover `boards = []` from the list version)
    tree = etree.HTML(html)
    rows = tree.xpath("//ul[@class='bang_list clearfix bang_list_mode']/li")
    for row in rows:
        columns = row.xpath('div')
        # './/' scopes the span search to this column; the original '//'
        # matched document-wide and repeated the first page hit per row.
        yield {
            'range': columns[0].text,
            'title': columns[2].xpath('a')[0].text,
            'recommend': columns[3].xpath('.//span[@class="tuijian"]')[0].text,
            'author': columns[4].xpath('a')[0].text,
            'times': columns[6].xpath('span')[0].text,
            'price': columns[7].xpath('.//span[@class="price_n"]')[0].text,
        }
对于关键信息的筛选需要借助浏览器审查元素,具体分析
- 写入文件
def write_file(board):
    """Append one record to book.txt as a single JSON line."""
    print('开始写入数据 ==>' + str(board))
    # ensure_ascii=False keeps the Chinese text readable in the file
    # instead of \u-escaped sequences.
    line = json.dumps(board, ensure_ascii=False)
    with open('book.txt', 'a', encoding='utf-8') as out:
        out.write(line + '\n')
- main 函数实现 分页读取,控制存储
def main(page):
    """Crawl one page of the bestseller ranking and persist every record."""
    base = 'http://bang.dangdang.com/books/fivestars/01.00.00.00.00.00-recent30-0-0-1-'
    html = request_dangdang(base + str(page))
    for board in parse_result(html):
        write_file(board)
- 完整代码:
import requests
import re
from lxml import etree
import json
def request_dangdang(url, timeout=10):
    """Fetch the HTML of one Dangdang ranking page.

    Args:
        url: Page URL to request.
        timeout: Seconds before the request is aborted. The original call had
            no timeout, so a stalled server would hang the crawler forever.

    Returns:
        The response body as text on HTTP 200, otherwise None.
    """
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code == 200:
            return response.text
        return None  # explicit: non-200 responses yield no content
    except requests.RequestException:
        return None
def parse_result(html):
    """Lazily yield one book dict per row of a Dangdang ranking page.

    Yields dicts with keys: range (rank), title, recommend, author,
    times (five-star count) and price.
    """
    # (removed the unused `boards = []` and the large block of
    # commented-out list-building code left over from an earlier version)
    tree = etree.HTML(html)
    rows = tree.xpath("//ul[@class='bang_list clearfix bang_list_mode']/li")
    for row in rows:
        columns = row.xpath('div')
        # './/' keeps the span search relative to this column; the original
        # bare '//' searched from the document root and returned the same
        # first match on the page for every row.
        yield {
            'range': columns[0].text,
            'title': columns[2].xpath('a')[0].text,
            'recommend': columns[3].xpath('.//span[@class="tuijian"]')[0].text,
            'author': columns[4].xpath('a')[0].text,
            'times': columns[6].xpath('span')[0].text,
            'price': columns[7].xpath('.//span[@class="price_n"]')[0].text,
        }
def write_file(board):
    """Append one record to book.txt as a single JSON line.

    Fixes two defects in the original:
    - mode 'w' truncated the file on every call, so after the crawl only
      the very last record survived; 'a' (append) accumulates them all.
    - json.dumps without ensure_ascii=False wrote the Chinese text as
      \\u-escaped sequences, contradicting the earlier version's own note.
    """
    print('开始写入数据 ==>' + str(board))
    with open('book.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(board, ensure_ascii=False) + '\n')
def main(page):
    """Fetch, parse and store one page of the bestseller ranking."""
    url = f'http://bang.dangdang.com/books/fivestars/01.00.00.00.00.00-recent30-0-0-1-{page}'
    html = request_dangdang(url)
    for board in parse_result(html):
        write_file(board)
if __name__ == '__main__':
    # The ranking spans 25 pages; crawl them all in order.
    for page_no in range(1, 26):
        main(page_no)