[Web Scraping] 6. Exercises (a scraping practice site and Qidian)

Exercise 1:

Scrape free novels from Qidian (specifically, free completed novels).

import scrapy
from scrapy.linkextractors import LinkExtractor
from ..items import ChapterItem


class BooksSpider(scrapy.Spider):
    name = "begin"
    start_urls = ['https://www.qidian.com/free/all?action=1&orderId=&vip=hidden&style=2&pageSize=50&siteid=1&pubflag'
                  '=0&hiddenField=1&page=1',
                  'https://www.qidian.com/free/all?action=1&orderId=&vip=hidden&style=2&pageSize=50&siteid=1&pubflag'
                  '=0&hiddenField=1&page=2']

    # Extract each book's chapter link from the list page.
    def parse(self, response):
        le = LinkExtractor(restrict_css='div.all-book-list a.chapter')
        for link in le.extract_links(response):
            yield scrapy.Request(link.url, callback=self.parse_book)

    # Scrape one chapter page: novel name, chapter title, and paragraphs.
    def parse_book(self, response):
        chapter = ChapterItem()
        chapter['name'] = response.css('div.crumbs-nav a.act::text').extract_first()
        sel = response.css('div.main-text-wrap')
        chapter['chapter'] = sel.css('h3 span.content-wrap::text').extract_first()
        chapter['content'] = sel.css('div.read-content p::text').extract()
        print(len(chapter['content']))  # debug: number of paragraphs scraped
        yield chapter
        # Walk through the chapters via the adjacent-chapter link
        # (a#j_chapterPrev) until there is none left.
        le = LinkExtractor(restrict_css='div.chapter-control a#j_chapterPrev')
        link = le.extract_links(response)
        if link:
            next_url = link[0].url
            yield scrapy.Request(next_url, callback=self.parse_book)

For some reason, the search for free completed novels only returns two pages of results, so the two start URLs are simply listed out.
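If more result pages ever show up, the hard-coded start_urls can also be replaced by a start_requests method that builds the page URLs from a template. A minimal sketch (the two-page range simply mirrors what the site currently returns; parse() and parse_book() stay exactly as above):

import scrapy


class BooksSpider(scrapy.Spider):
    name = "begin"

    def start_requests(self):
        # URL template for the free completed-novel list; only the page number varies.
        base = ('https://www.qidian.com/free/all?action=1&orderId=&vip=hidden'
                '&style=2&pageSize=50&siteid=1&pubflag=0&hiddenField=1&page={}')
        for page in range(1, 3):  # the site currently shows only two pages
            yield scrapy.Request(base.format(page), callback=self.parse)

The ChapterItem definition: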

import scrapy


# One scraped chapter: the novel's name, the chapter title, and its paragraphs.
class ChapterItem(scrapy.Item):
    name = scrapy.Field()
    chapter = scrapy.Field()
    content = scrapy.Field()

Three fields are collected: the novel's name, the chapter title, and the chapter content.
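Scrapy items behave like dicts, which is how the pipeline below reads them. A quick illustration (the values are made up; ChapterItem is the class defined above):

# Assumes the ChapterItem class defined above is importable.
chapter = ChapterItem(name='Example Novel', chapter='Chapter 1',
                      content=['first paragraph', 'second paragraph'])
print(chapter['name'])  # dict-style access, as used in the pipeline below
print(dict(chapter))    # items also convert cleanly to plain dicts

The pipeline saves each chapter to a per-novel CSV file: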

import csv
import os


# Save the chapters to separate files, one per novel name.
class QidianPipeline(object):
    def process_item(self, item, spider):
        os.makedirs('book', exist_ok=True)  # the target directory must exist
        # The file is opened manually, so the encoding must be set here;
        # FEED_EXPORT_ENCODING does not apply to files opened in a pipeline.
        with open('book/' + item['name'] + '.csv', 'a+', newline='',
                  encoding='gb18030') as csvfile:
            writer = csv.writer(csvfile)
            content = ''
            for i in item['content']:
                # Turn the full-width indent spaces into line breaks.
                content += i.replace('\u3000\u3000', '\n')
            writer.writerow([item['chapter'], content])
        return item

Saving the scraped Chinese text to CSV can otherwise produce garbled characters. Note that the pipeline above opens its files itself, which is why the encoding is passed to open() directly; if you instead export items through Scrapy's feed exports (for example with scrapy crawl begin -o chapters.csv), add the following to settings.py:

FEED_EXPORT_ENCODING = 'gb18030'
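As a quick sanity check, a file written by the pipeline can be read back with the same encoding. A minimal sketch ('book/example.csv' is a hypothetical file name; substitute one of the scraped novels):

import csv

# Read back one of the per-novel CSV files written by the pipeline above.
with open('book/example.csv', encoding='gb18030', newline='') as f:
    for chapter_title, content in csv.reader(f):
        print(chapter_title, len(content))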

Sample output:
[screenshots of the scraped chapter files]

Exercise 2:

Next, scrape the book information on http://books.toscrape.com.
For each book, the following fields are collected:

  • Title
  • Price
  • Star rating
  • Product code (UPC)
  • Stock
  • Number of reviews

The code is as follows:

import scrapy
from scrapy.linkextractors import LinkExtractor
from ..items import BookItem


class BooksSpider(scrapy.Spider):
    name = "books"
    start_urls = ['http://books.toscrape.com/']
    allowed_domains = ['books.toscrape.com']

    # Extract the detail-page links and follow the pagination.
    def parse(self, response):
        le = LinkExtractor(restrict_css='article.product_pod h3')
        for link in le.extract_links(response):
            yield scrapy.Request(link.url, callback=self.parse_book)
        le = LinkExtractor(restrict_css='ul.pager li.next')
        links = le.extract_links(response)
        if links:
            next_url = links[0].url
            yield scrapy.Request(next_url, callback=self.parse)

    # Extract the book details from a detail page.
    def parse_book(self, response):
        book = BookItem()
        sel = response.css('div.product_main')
        book['name'] = sel.xpath('./h1/text()').extract_first()
        book['price'] = sel.css('p.price_color::text').extract_first()
        # The rating is encoded in the class attribute, e.g. "star-rating Three".
        book['review_rating'] = sel.css('p.star-rating::attr(class)').extract_first().split(' ')[1]
        sel = response.css('table.table.table-striped')
        book['upc'] = sel.xpath('(.//tr)[1]/td/text()').extract_first()
        # The availability cell reads like "In stock (22 available)"; keep the number.
        book['stock'] = sel.xpath('(.//tr)[last()-1]/td/text()').extract_first().split('(')[1].split(' ')[0]
        book['review_num'] = sel.xpath('(.//tr)[last()]/td/text()').extract_first()
        yield book

items.py is as follows:

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


# One book from books.toscrape.com.
class BookItem(scrapy.Item):
    name = scrapy.Field()
    price = scrapy.Field()
    review_rating = scrapy.Field()
    review_num = scrapy.Field()
    upc = scrapy.Field()
    stock = scrapy.Field()

pipelines.py is as follows:

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# Map the star-rating word (e.g. 'Three') to its numeric value.
class BookPipeline(object):
    review_rating_map = {
        'One': 1,
        'Two': 2,
        'Three': 3,
        'Four': 4,
        'Five': 5,
    }

    def process_item(self, item, spider):
        rating = item.get('review_rating')
        if rating:
            item['review_rating'] = self.review_rating_map.get(rating, rating)
        # Always return the item so later pipelines and the exporter receive it.
        return item

settings.py is as follows:

# Export the fields in this order
FEED_EXPORT_FIELDS = ['upc', 'name', 'price', 'stock', 'review_rating', 'review_num']
# Enable the pipeline
ITEM_PIPELINES = {'eight_live.pipelines.BookPipeline': 300, }
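For reference, the spider can also be launched from a plain Python script instead of the scrapy command. A minimal sketch (run it from the project root so get_project_settings() can find settings.py):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Load the project settings (including ITEM_PIPELINES above)
# and run the "books" spider by name.
process = CrawlerProcess(get_project_settings())
process.crawl('books')
process.start()  # blocks until the crawl finishes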