方法一:使用面向过程爬取1.0
import json

import requests
from lxml import etree

# Procedural scraper: fetch one Douban "doulist" page, extract one record per
# listed book, and save all records to book.json as a single JSON array.

url = "https://www.douban.com/doulist/1264675/?start=0"

# NOTE(review): Douban is reported to reject the default python-requests
# User-Agent; a browser-like one is sent here -- confirm against the live site.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0 Safari/537.36"
    )
}

# A timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status() stops us from parsing an HTTP error page as if it
# were the book list.
reply = requests.get(url, headers=headers, timeout=10)
reply.raise_for_status()
response = reply.content.decode()  # bytes.decode() defaults to UTF-8

dom = etree.HTML(response)
books = dom.xpath('//div[@class="article"]/div[@class="doulist-item"]')

# normalize-space() makes each XPath call return a single whitespace-collapsed
# string (empty when the node is missing) rather than a list of nodes.
items = []
for book in books:
    item = {
        'title': book.xpath('normalize-space(.//div/div[2]/div[3]/a/text())'),
        'author': book.xpath('normalize-space(.//div/div[2]/div[5]/text()[1])'),
        'year': book.xpath('normalize-space(.//div/div[2]/div[5]/text()[3])'),
        'rank': book.xpath('normalize-space(.//div/div[2]/div[4]/span[@class="rating_nums"]/text())'),
    }
    items.append(item)

# Write one valid JSON array. The previous "<object>,\n" per-record output
# (trailing commas, no enclosing brackets) could not be parsed by json.load().
with open('book.json', 'w', encoding='utf-8') as f:
    json.dump(items, f, ensure_ascii=False, indent=2)
    f.write('\n')
方法一: