练习一:
抓取起点中文网免费小说(抓取免费完本小说)
import scrapy
from scrapy.linkextractors import LinkExtractor
from ..items import ChapterItem
class BooksSpider(scrapy.Spider):
    """Crawl free completed novels from qidian.com.

    ``parse`` extracts chapter entry links from the listing pages;
    ``parse_book`` scrapes one chapter and then follows the
    "previous chapter" link, so each book is walked chapter by chapter.
    """
    name = "begin"
    # The free/completed listing only ever shows two result pages, so
    # both start URLs are spelled out instead of following pagination.
    start_urls = [
        'https://www.qidian.com/free/all?action=1&orderId=&vip=hidden&style=2&pageSize=50&siteid=1&pubflag'
        '=0&hiddenField=1&page=1',
        'https://www.qidian.com/free/all?action=1&orderId=&vip=hidden&style=2&pageSize=50&siteid=1&pubflag'
        '=0&hiddenField=1&page=2',
    ]

    # Link extractors are stateless; build them once at class level
    # instead of re-instantiating on every parsed response.
    _chapter_links = LinkExtractor(restrict_css='div.all-book-list a.chapter')
    _prev_link = LinkExtractor(restrict_css='div.chapter-control a#j_chapterPrev')

    def parse(self, response):
        """Queue a request for every chapter link on a listing page."""
        for link in self._chapter_links.extract_links(response):
            yield scrapy.Request(link.url, callback=self.parse_book)

    def parse_book(self, response):
        """Scrape one chapter page, then follow the previous-chapter link.

        Yields a ChapterItem with the novel name, chapter title and the
        list of paragraph texts.
        """
        chapter = ChapterItem()
        chapter['name'] = response.css('div.crumbs-nav a.act::text').extract_first()
        sel = response.css('div.main-text-wrap')
        chapter['chapter'] = sel.css('h3 span.content-wrap::text').extract_first()
        chapter['content'] = sel.css('div.read-content p::text').extract()
        yield chapter
        # NOTE(review): this follows a#j_chapterPrev, i.e. it appears to
        # walk the book backwards from the landing chapter — confirm
        # that is intended.
        links = self._prev_link.extract_links(response)
        if links:
            yield scrapy.Request(links[0].url, callback=self.parse_book)
不知道是什么原因,查找免费完本小说只显示有两页列表,所以起始网页直接列出来了。
import scrapy
class ChapterItem(scrapy.Item):
    """One scraped chapter: the novel it belongs to, the chapter
    title, and the chapter body paragraphs."""
    name = scrapy.Field()     # novel title
    chapter = scrapy.Field()  # chapter title
    content = scrapy.Field()  # list of paragraph text strings
获取3个数据,小说名称,章节名称,章节内容。
import csv
import os
# 按照小说名称,将数据分别保存到不同的文件中
class QidianPipeline(object):
    """Append each scraped chapter to a per-novel CSV file under book/.

    One CSV file per novel (keyed by item['name']); every chapter
    becomes one row of [chapter title, chapter text].
    """

    def process_item(self, item, spider):
        # Create the output directory up front instead of crashing with
        # FileNotFoundError on the very first item.
        os.makedirs('book', exist_ok=True)
        # Full-width double spaces (U+3000) mark paragraph starts on
        # qidian.com; turn them into newlines.  ''.join avoids the
        # quadratic repeated string concatenation of the old loop.
        content = ''.join(
            paragraph.replace('\u3000\u3000', '\n')
            for paragraph in item['content']
        )
        # newline='' is required by the csv module; an explicit encoding
        # keeps the output byte-stable across platforms (the default is
        # locale-dependent, which is what caused the mojibake issues).
        path = 'book/' + item['name'] + '.csv'
        with open(path, 'a+', newline='', encoding='utf-8') as csvfile:
            csv.writer(csvfile).writerow([item['chapter'], content])
        return item
将抓取内容保存到csv中可能会出现中文乱码的问题,解决该问题只需要在settings.py中加入
FEED_EXPORT_ENCODING = 'gb18030'
抓取展示:
练习二:
下面爬取http://books.toscrape.com网站中的书籍信息。
其中每一本书的信息包括:
- 书名
- 价格
- 评价等级
- 产品编码
- 库存量
- 评价数量
代码如下:
import scrapy
from scrapy.linkextractors import LinkExtractor
from ..items import BookItem
class BooksSpider(scrapy.Spider):
    """Spider for http://books.toscrape.com.

    Collects, for every book: title, price, star rating, UPC,
    stock count and number of reviews.
    """
    name = "books"
    start_urls = ['http://books.toscrape.com/']
    allowed_domains = ['books.toscrape.com']

    def parse(self, response):
        """Queue every book detail page, then follow the pager."""
        book_links = LinkExtractor(restrict_css='article.product_pod h3')
        for book_link in book_links.extract_links(response):
            yield scrapy.Request(book_link.url, callback=self.parse_book)
        # Move on to the next listing page, if there is one.
        pager = LinkExtractor(restrict_css='ul.pager li.next')
        next_links = pager.extract_links(response)
        if next_links:
            yield scrapy.Request(next_links[0].url, callback=self.parse)

    def parse_book(self, response):
        """Extract one book's fields from its detail page."""
        book = BookItem()
        # Headline fields come from the product summary section.
        main = response.css('div.product_main')
        book['name'] = main.xpath('./h1/text()').extract_first()
        book['price'] = main.css('p.price_color::text').extract_first()
        # The star rating is encoded in the CSS class, e.g.
        # "star-rating Three" — the second class token is the rating.
        rating_classes = main.css('p.star-rating::attr(class)').extract_first()
        book['review_rating'] = rating_classes.split(' ')[1]
        # The remaining fields live in the product-information table.
        table = response.css('table.table.table-striped')
        book['upc'] = table.xpath('(.//tr)[1]/td/text()').extract_first()
        # Stock cell reads like "In stock (22 available)" — pull the number.
        stock_text = table.xpath('(.//tr)[last()-1]/td/text()').extract_first()
        book['stock'] = stock_text.split('(')[1].split(' ')[0]
        book['review_num'] = table.xpath('(.//tr)[last()]/td/text()').extract_first()
        yield book
items如下:
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class BookItem(scrapy.Item):
    """Scraped fields for a single book on books.toscrape.com."""
    name = scrapy.Field()           # book title
    price = scrapy.Field()          # price text as shown on the site
    review_rating = scrapy.Field()  # rating word ('One'..'Five') until the pipeline maps it
    review_num = scrapy.Field()     # number of reviews
    upc = scrapy.Field()            # product code (UPC)
    stock = scrapy.Field()          # units available
pipelines如下:
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# 将星级标识从字母映射到数字
class BookPipeline(object):
    """Normalise the textual star rating ('One'..'Five') to an int 1-5."""

    # Rating word (as it appears in the page's CSS class) -> star count.
    review_rating_map = {
        'One': 1,
        'Two': 2,
        'Three': 3,
        'Four': 4,
        'Five': 5,
    }

    def process_item(self, item, spider):
        rating = item.get('review_rating')
        if rating:
            # Pass unknown rating words through unchanged instead of
            # raising KeyError and dropping the whole item.
            item['review_rating'] = self.review_rating_map.get(rating, rating)
        return item
settings如下:
# 按照指定格式排序
FEED_EXPORT_FIELDS = ['upc', 'name', 'price', 'stock', 'review_rating', 'review_num']
# 启动pipelines
ITEM_PIPELINES = {'eight_live.pipelines.BookPipeline': 300, }