0x01前言
图书畅销榜的图书信息较多,此案例用于数据提取之xpath教学
0x02接口分析
不包含加密参数;
翻页之后也很容易确定url的构造
http://bang.dangdang.com/books/bestsellers/01.00.00.00.00.00-recent7-0-0-1-{}
0x03请求测试
需要携带Cookie、User-Agent、Referer三个请求头字段,自己测试一下吧
0x04解析响应数据
从图中可以看出,两个div标签的class属性值是相同的。如何准确地提取我们想要的数据?答案是指定下标,注意XPath的下标是从1开始的
# XPath positions are 1-based: a[1] is the first <a> child, and
# div[...][2] is the second matching div under its parent.
author_list = html_str.xpath(r'//div[@class="publisher_info"]/a[1]/@title')
time_list = html_str.xpath(r'//div[@class="publisher_info"][2]/span/text()')
其他有冲突的地方,同样处理之。
测试结果:
0x05保存数据
我这里将其保存到csv当中
csv.DictWriter的writerow方法需要传入dict,dict的键要与表头(fieldnames)一一对应
# Map one book's ten extracted values onto the CSV column names.
data_dict = {
    '标题': data[0],
    '评论': data[1],
    '推荐': data[2],
    '作者': data[3],
    '日期': data[4],
    '出版社': data[5],
    '售价': data[6],
    '原价': data[7],
    '折扣': data[8],
    # The detail-page link is the tenth value (index 9); index 8 is the discount.
    '详情页': data[9],
}
csv_writer.writerow(data_dict)
0x06完整代码
import atexit
import csv
from threading import Thread

import requests
from lxml import etree

# Module-level CSV sink used by DDSpider.save_data.
# The file is opened in append mode so repeated runs accumulate rows.
f = open('data_2.csv', mode='a', encoding='utf-8', newline='')
# Flush and close the handle when the interpreter exits.
atexit.register(f.close)

csv_writer = csv.DictWriter(f, fieldnames=[
    '标题',
    '评论',
    '推荐',
    '作者',
    '日期',
    '出版社',
    '售价',
    '原价',
    '折扣',
    '详情页',
])

# In append mode the stream starts at the end of the file, so a position of 0
# means the file is empty. Write the header only then; otherwise every re-run
# would insert another header line in the middle of the data.
if f.tell() == 0:
    csv_writer.writeheader()
class DDSpider(object):
    """Spider for one page of the Dangdang best-seller book ranking.

    The page is downloaded with ``requests``, split into ten parallel column
    lists with XPath queries, and each book is written as one row through the
    module-level ``csv_writer``.
    """

    def __init__(self):
        """Set the start URL, the request headers and an empty column buffer."""
        self.start_url = "http://bang.dangdang.com/books/bestsellers/01.00.00.00.00.00-recent7-0-0-1-1"
        self.headers = {
            "Cookie": "ddscreen=2; __permanent_id=20220625212706250242675814334910236; dest_area=country_id=9000&province_id=111&city_id =0&district_id=0&town_id=0; __visit_id=20220626175228017192325585967631123; __out_refer=; __rpm=...1656237227370|...1656237229891; __trace_id=20220626175350103290391377653452499",
            "Referer": "http://bang.dangdang.com/books/bestsellers/01.00.00.00.00.00-recent7-0-0-1-3",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.124 Safari/537.36 Edg/102.0.1245.44",
        }
        # Column lists collected by parse_response, ten per parsed page.
        self.data_list = []

    def parse_start_url(self):
        """Download the start page and hand its HTML text to parse_response."""
        # A timeout keeps the crawl from hanging forever on a stalled connection.
        response = requests.get(
            self.start_url, headers=self.headers, timeout=10
        ).text
        print("请求成功")
        self.parse_response(response)

    def parse_response(self, response):
        """Extract the ten data columns from the page and save them as rows.

        :param response: HTML text of a ranking page.
        :return: None. The columns are appended to ``self.data_list`` in the
            order title, comment count, recommendation rate, author, date,
            publisher, current price, original price, discount, detail link,
            and are then passed to ``parse_data``.
        """
        html_str = etree.HTML(response)
        title_list = html_str.xpath(r'//div[@class="name"]/a/@title')
        star_commentNumberOfPerson_list = html_str.xpath(r'//div[@class="star"]/a/text()')
        star_recommendNumber_list = html_str.xpath(r'//span[@class="tuijian"]/text()')
        # Two divs share the class "publisher_info"; XPath positions are
        # 1-based, so a[1] is the first <a> and [2] is the second div.
        author_list = html_str.xpath(r'//div[@class="publisher_info"]/a[1]/@title')
        time_list = html_str.xpath(r'//div[@class="publisher_info"][2]/span/text()')
        publisher_list = html_str.xpath(r'//div[@class="publisher_info"][2]/a/text()')
        price_n_list = html_str.xpath(r'//div[@class="price"]/p[1]/span[@class="price_n"]/text()')
        price_r_list = html_str.xpath(r'//div[@class="price"]/p/span[@class="price_r"]/text()')
        price_s_list = html_str.xpath(r'//div[@class="price"]/p/span[@class="price_s"]/text()')
        detailedPage_list = html_str.xpath(r'//div[@class="pic"]/a/@href')

        columns = [
            title_list,
            star_commentNumberOfPerson_list,
            star_recommendNumber_list,
            author_list,
            time_list,
            publisher_list,
            price_n_list,
            price_r_list,
            price_s_list,
            detailedPage_list,
        ]
        for column in columns:
            print(column)
        self.data_list.extend(columns)
        # Persist only this page's columns; self.data_list may already hold
        # columns from earlier pages.
        self.parse_data(columns)

    def parse_data(self, data):
        """Turn ten parallel column lists into per-book rows and save each one.

        :param data: list whose first ten items are the column lists produced
            by ``parse_response``.
        :return: None.
        """
        # zip stops at the shortest column, so a book missing one field
        # shortens the output and shifts the later rows of that column.
        for row in zip(*data[:10]):
            self.save_data(list(row))

    def save_data(self, data):
        """Write one book to the CSV file through the module-level writer.

        :param data: sequence of ten values in column order (title, comments,
            recommendation, author, date, publisher, current price, original
            price, discount, detail link).
        :return: None.
        """
        data_dict = {
            '标题': data[0],
            '评论': data[1],
            '推荐': data[2],
            '作者': data[3],
            '日期': data[4],
            '出版社': data[5],
            '售价': data[6],
            '原价': data[7],
            '折扣': data[8],
            # The detail link is the tenth value; index 8 is the discount.
            '详情页': data[9],
        }
        csv_writer.writerow(data_dict)

    def start_thread(self):
        """Run the crawl in a separate thread.

        :return: None.
        """
        # Pass the bound method itself. Calling it here would run the crawl in
        # the current thread and hand its None result to Thread as the target.
        Thread(target=self.parse_start_url).start()
if __name__ == '__main__':
    # Script entry point: build the spider and start the crawl via start_thread.
    spider = DDSpider()
    spider.start_thread()
0x07代码完善
至少需要增加翻页功能,同时检测风控力度,调整代码,提高代码的健壮性。