豆瓣爬虫

settings.py

# --- Project identity -------------------------------------------------
BOT_NAME = 'doubanbook'

# --- Spider discovery -------------------------------------------------
# New spiders are generated into the same package that is searched for
# existing ones, so the search list is derived from that single name.
NEWSPIDER_MODULE = 'doubanbook.spiders'
SPIDER_MODULES = [NEWSPIDER_MODULE]

# --- HTTP identity ----------------------------------------------------
# Desktop Firefox User-Agent string sent with the crawler's requests.
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:45.0) Gecko/20100101 Firefox/45.0'

# --- Feed export ------------------------------------------------------
# Scraped items are exported in CSV format to a file on the E: drive.
FEED_FORMAT = 'CSV'
FEED_URI = u'file:///E://douban3.csv'

main.py

# -*- coding: UTF-8 -*-
from scrapy import cmdline

# Start the "dbbook" spider from Python, using the same argument vector
# the `scrapy crawl dbbook` shell command would produce.
crawl_command = "scrapy crawl dbbook"
cmdline.execute(crawl_command.split())

items.py

import scrapy


class DoubanbookItem(scrapy.Item):
    """One book record scraped from a Douban book list.

    The item declares three fields: ``title``, ``rate`` and ``author``.
    """
    # Book title text, with spaces and newlines stripped by the spider.
    title = scrapy.Field()
    # Rating text taken from the entry's "rating_nums" span.
    rate = scrapy.Field()
    # First line of the entry's abstract block (everything before the
    # first <br>), with spaces and newlines stripped by the spider.
    author = scrapy.Field()

dbbook.py

# -*- coding: utf-8 -*-
import scrapy
import re
from doubanbook.items import DoubanbookItem
class DbbookSpider(scrapy.Spider):
    """Crawl a Douban book list and emit one item per listed book.

    Each list page is parsed for its book entries; afterwards the page's
    "next" link, if present, is followed with the same callback.
    """

    name = "dbbook"
    start_urls = (
        'https://www.douban.com/doulist/1264675//',
    )

    # The author line has no element of its own: it is the text between
    # the opening of the abstract div and its first <br>, so it is cut
    # out of the entry's raw HTML. Compiled once instead of per book.
    _AUTHOR_RE = re.compile(r'<div class="abstract">(.*?)<br', re.S)

    @staticmethod
    def _squash(text):
        """Return *text* with every space and newline removed."""
        return text.replace(' ', '').replace('\n', '')

    def parse(self, response):
        """Extract the books on one list page and follow the next page.

        Args:
            response: the downloaded list page.

        Yields:
            One ``DoubanbookItem`` per book entry, with ``title``,
            ``rate`` and ``author`` set (``rate`` and ``author`` are
            empty strings when the entry does not provide them), then at
            most one ``Request`` for the following list page.
        """
        selector = scrapy.Selector(response)

        for book in selector.xpath('//div[@class="bd doulist-subject"]'):
            titles = book.xpath('div[@class="title"]/a/text()').extract()
            if not titles:
                # An entry without a title link cannot be recorded;
                # skip it instead of aborting the whole page.
                continue

            rates = book.xpath(
                'div[@class="rating"]/span[@class="rating_nums"]/text()'
            ).extract()
            author_match = self._AUTHOR_RE.search(book.extract())

            # A fresh item per book: reusing one instance would hand the
            # same mutable object to every downstream consumer.
            item = DoubanbookItem()
            # Values stay text; encoding is left to the feed exporter so
            # the output does not contain byte-string representations.
            item['title'] = self._squash(titles[0])
            item['rate'] = rates[0] if rates else ''
            item['author'] = (
                self._squash(author_match.group(1)) if author_match else ''
            )
            yield item

        # Pagination is handled once per page, after all of its books,
        # so it also works for a page that contains no book entries.
        next_links = selector.xpath(
            '//span[@class="next"]/link/@href'
        ).extract()
        if next_links:
            next_url = next_links[0]
            print(next_url)
            yield scrapy.http.Request(next_url, callback=self.parse)

 

转载于:https://www.cnblogs.com/Erick-L/p/6739882.html

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值