本次小案例是在 http://www.allitebooks.org/ 网站上爬取书本信息,用到requests模块、lxml模块、csv模块,将信息存储到csv文件中。代码中有详细的注释,代码如下:
# Author: K
# Scrape book information (title, author, year, language, category, cover image)
# from http://www.allitebooks.org/ and store it in a CSV file.

import requests
from lxml import etree
import os
import csv


class BooksSpider(object):
    """Crawl allitebooks.org listing pages and write one CSV row per book."""

    def open_file(self):
        """Create the output directory if needed, open the CSV file and write the header row."""
        if not os.path.exists('H:/allitebooks数据'):
            os.mkdir('H:/allitebooks数据')

        # utf-8-sig BOM so Excel detects the encoding; newline='' per csv module docs.
        self.fp = open('H:/allitebooks数据/allitebooks.csv', 'w',
                       encoding='utf-8-sig', newline='')
        self.writer = csv.writer(self.fp)
        # Header columns: book name, author, year, language, category, image URL.
        headers = ['书名', '作者', '年份', '语言', '分类', '图片']
        self.writer.writerow(headers)

    def run(self, start_page=1, end_page=2):
        """Crawl listing pages [start_page, end_page) and scrape every book found.

        Defaults preserve the original behavior (page 1 only).
        """
        self.get_urls(start_page, end_page)

    def get_urls(self, start_page=1, end_page=2):
        """Collect detail-page URLs from each listing page and parse them."""
        for page in range(start_page, end_page):
            page_url = 'http://www.allitebooks.org/page/%s/' % page
            response = requests.get(page_url)
            tree = etree.HTML(response.text)
            detail_urls = tree.xpath('//h2[@class="entry-title"]/a/@href')
            self.parse_page(detail_urls)

    @staticmethod
    def _first(nodes, default=''):
        """Return the first xpath result, stripped, or *default* when the list is empty.

        Guards against IndexError on pages where an optional field is missing.
        """
        return nodes[0].strip() if nodes else default

    def parse_page(self, urls):
        """Fetch each detail page and extract one row of book metadata."""
        for url in urls:
            response = requests.get(url=url)
            tree = etree.HTML(response.text)
            header_infos = tree.xpath('//header[@class="entry-header"]')
            for info in header_infos:
                data = []
                # Book title.
                data.append(self._first(info.xpath('./h1/text()')))
                # Authors: join with ' / '.
                # BUG FIX: the original compared each name to author_list[-1],
                # which mis-joined duplicate author names and crashed with
                # IndexError when the author field was absent.
                author_list = info.xpath('.//div[@class="book-detail"]/dl/dd[1]/a/text()')
                data.append(' / '.join(author_list))
                # Publication year.
                data.append(self._first(info.xpath('.//div[@class="book-detail"]/dl/dd[3]/text()')))
                # Book language.
                data.append(self._first(info.xpath('.//div[@class="book-detail"]/dl/dd[5]/text()')))
                # Category (first link only, as in the original).
                data.append(self._first(info.xpath('.//div[@class="book-detail"]/dl/dd[8]/a/text()')))
                # Cover image URL.
                data.append(self._first(info.xpath('.//div[1]/a/img/@src')))
                self.save_data(data)

    def save_data(self, data):
        """Append one row to the CSV file."""
        self.writer.writerow(data)

    def close_file(self):
        """Close the underlying CSV file handle."""
        self.fp.close()


if __name__ == '__main__':
    spider = BooksSpider()
    spider.open_file()
    try:
        spider.run()
    finally:
        # BUG FIX: ensure the file is closed (and buffered rows flushed)
        # even when the crawl raises a network/parse error.
        spider.close_file()