# 爬取qidian网站图书信息(书名、作者、简介、图片url)
import requests
from lxml import etree
import json
class BookSpider(object):
    """Crawl qidian.com listing pages and collect basic book information.

    For every book found on a listing page, the name, cover image URL,
    author and introduction are stored as a dict in ``self.data_list``.
    ``save_data`` then writes the collected list to a JSON file.
    """

    def __init__(self, page_count=5, timeout=10):
        """Set up the URL template, request headers and the result list.

        Args:
            page_count: Number of listing pages to crawl, starting at page 1.
            timeout: Seconds to wait for each HTTP request before giving up.
        """
        self.url = 'https://www.qidian.com/finish?action=hidden&orderId=&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=2&page={}'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
        }
        self.page_count = page_count
        self.timeout = timeout
        self.data_list = []

    # 1. Build the URL of every page to crawl
    def get_url_list(self):
        """Return the listing-page URLs for pages 1..``page_count``."""
        return [self.url.format(page) for page in range(1, self.page_count + 1)]

    # 2. Send the request
    def send_request(self, url):
        """Fetch *url* and return the response body decoded to ``str``.

        Raises:
            requests.exceptions.RequestException: on connection errors or
                when the server does not answer within ``self.timeout``.
        """
        # Without a timeout a stalled connection would block the crawl forever.
        response = requests.get(url, headers=self.headers, timeout=self.timeout)
        data = response.content.decode()
        print(url)
        return data

    @staticmethod
    def _first(nodes, default=''):
        """Return the first XPath match, or *default* when nothing matched."""
        return nodes[0] if nodes else default

    # 3. Parse the data with XPath
    def parse_xpath_data(self, data):
        """Extract every book on one listing page into ``self.data_list``.

        Args:
            data: HTML source of a listing page.

        A field that cannot be found in a book entry is stored as an empty
        string instead of aborting the whole crawl with an ``IndexError``.
        """
        parse_data = etree.HTML(data)
        # 1. Locate every book entry on the page
        book_list = parse_data.xpath('//div[@class="book-img-text"]/ul/li')
        # 2. Pull the details out of each entry
        for book in book_list:
            # The leading "." keeps each query relative to the current <li>;
            # a bare "//div" would search the whole document again.
            book_dict = {
                'book_name': self._first(
                    book.xpath('.//div[@class="book-mid-info"]/h4/a/text()')),
                'book_img_url': self._first(
                    book.xpath('.//div[@class="book-img-box"]/a/img/@src')),
                'book_author': self._first(
                    book.xpath('.//div[@class="book-mid-info"]/p[@class="author"]/a[@class="name"]/text()')),
                # strip() removes the whitespace surrounding the intro text
                'book_info': self._first(
                    book.xpath('.//div[@class="book-mid-info"]/p[@class="intro"]/text()')).strip(),
            }
            self.data_list.append(book_dict)

    # 4. Save the data
    def save_data(self, path='book.json'):
        """Write ``self.data_list`` as JSON to *path* (default ``book.json``)."""
        # The context manager guarantees the file is flushed and closed.
        with open(path, 'w', encoding='utf-8') as fh:
            json.dump(self.data_list, fh)

    # 5. Orchestrate the whole crawl
    def start(self):
        """Fetch and parse every listing page, then save the results."""
        for url in self.get_url_list():
            data = self.send_request(url)
            self.parse_xpath_data(data)
        self.save_data()
# Script entry: build the spider, then fetch, parse and save all pages.
_spider = BookSpider()
_spider.start()
# 爬取起点网站图书信息(书名、作者、简介、图片url)
# 最新推荐文章于 2024-01-13 13:02:45 发布