"""Scrape e-book metadata from http://www.allitebooks.com/.

Requirement: for every book on the listing pages, collect four fields —
title, author, summary, and cover-image URL.  Two interchangeable parsers
are provided (lxml XPath and BeautifulSoup); pick either one in ``start()``.
"""
import json

import requests
from bs4 import BeautifulSoup
from lxml import etree


class BookSpider(object):
    """Crawler that walks the paginated book listing and saves JSON."""

    def __init__(self):
        # Listing-page template; format() with the 1-based page number.
        self.base_url = "http://www.allitebooks.com/page/{}/"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.20 Safari/537.36"
        }
        self.book_list_data = []  # accumulates one dict per parsed book

    # 1. Build every listing-page URL.
    def get_url_list(self, max_page=10):
        """Return the listing URLs for pages 1..max_page.

        ``max_page`` defaults to 10, matching the original hard-coded
        range; pass the site's real page count to crawl everything.
        """
        return [self.base_url.format(page) for page in range(1, max_page + 1)]

    # 2. Fetch one page.
    def send_request(self, url):
        """GET *url* and return the response body decoded as UTF-8.

        A timeout is set so one stalled page cannot hang the whole crawl.
        """
        data = requests.get(url=url, headers=self.headers, timeout=10).content.decode("utf-8")
        print(url)
        return data

    # 3a. Parse one page with lxml XPath.
    def parse_xpath_data(self, data):
        """Extract every book on the page into ``self.book_list_data``."""
        parse_data = etree.HTML(data)
        # Each <article> under the main content area is one book.
        book_list = parse_data.xpath('//div[@class="main-content-inner clearfix"]/article')
        for book in book_list:
            # Leading "." anchors each query to the current <article>.
            book_dict = {
                "book_name": book.xpath('.//h2[@class="entry-title"]/a/text()'),
                "book_img_url": book.xpath('./div[@class="entry-thumbnail hover-thumb"]/a/img/@src'),
                "book_author": book.xpath('.//h5[@class="entry-author"]/a/text()'),
                "book_info": book.xpath('.//div[@class="entry-summary"]/p/text()'),
            }
            self.book_list_data.append(book_dict)

    # 3b. Parse one page with BeautifulSoup.
    def parse_s4_data(self, data):
        """Extract every book on the page into ``self.book_list_data``."""
        soup = BeautifulSoup(data, "lxml")
        for book in soup.select("article"):
            # Guard: some articles may lack a thumbnail element; the old
            # code raised AttributeError on select_one(...) returning None.
            img = book.select_one(".attachment-post-thumbnail")
            book_dict = {
                # Key fixed from "name" so both parsers emit the same schema.
                "book_name": book.select_one(".entry-title").get_text(),
                "book_img_url": img.get("src") if img is not None else None,
                # [3:] strips the leading "By " prefix from the byline.
                "book_author": book.select_one(".entry-author").get_text()[3:],
                "book_info": book.select_one(".entry-summary p").get_text(),
            }
            self.book_list_data.append(book_dict)
        print(self.book_list_data)

    # 4. Persist everything collected so far.
    def save_data(self):
        """Write ``self.book_list_data`` to 04-book.json as UTF-8 JSON."""
        data = json.dumps(self.book_list_data, ensure_ascii=False)
        with open("04-book.json", "w", encoding="utf-8") as f:
            f.write(data)

    def start(self):
        """Crawl every page, parse it, then write the combined JSON file."""
        for url in self.get_url_list():
            data = self.send_request(url)
            # self.parse_xpath_data(data)  # alternative parser, same schema
            self.parse_s4_data(data)
        self.save_data()


if __name__ == "__main__":
    # Guarded so importing this module no longer triggers a full crawl.
    BookSpider().start()
# Additional notes:
# - In step 3 (parsing), use either the XPath parser or the BeautifulSoup
#   parser — only one is needed; pick whichever you prefer.
# - To crawl the entire site, first check how many listing pages exist in
#   total, then change the upper bound of the page loop to (n + 1); adding
#   a short delay between requests is also recommended.
# - General approach: first locate the tag that wraps a single book and
#   extract that whole fragment, then pull the individual fields you need
#   out of that fragment.