Scrapy 案例:爬取数据并保存到 Excel

# -*- coding: utf-8 -*-

import scrapy


class Mkw1Item(scrapy.Item):
    """Container for one entry scraped by the ``muke`` spider (four fields)."""
    # URL-like substring pulled out of the entry's inline ``style`` attribute.
    img = scrapy.Field()
    # Text of the entry's first ``<p>`` element.
    title = scrapy.Field()
    # Text of the entry's second ``<p>`` element.
    type = scrapy.Field()
    # Text of the first ``<span>`` inside the entry's third ``<p>`` element.
    pic = scrapy.Field()

# -*- coding: utf-8 -*-
import scrapy
from .. import items
import re


class MukeSpider(scrapy.Spider):
    """Spider that scrapes the entries of the imooc.com new-course list page.

    Every ``<a>`` element selected from the listing container produces one
    ``Mkw1Item`` with the keys ``img``, ``title``, ``type`` and ``pic``.
    """

    name = 'muke'
    allowed_domains = ['imooc.com']
    start_urls = ['https://www.imooc.com/new/course/list']

    # Same expression as before, now a raw string so ``\.`` is not an invalid
    # string escape.  ``\.*`` matches zero or more dots, so the pattern
    # effectively captures from the first ``//`` up to the last ``g`` in the
    # style value.  Compiled once instead of on every loop iteration.
    _IMG_URL_RE = re.compile(r'//.*\.*g')

    def parse(self, response):
        """Yield one ``Mkw1Item`` per course anchor found in ``response``.

        All field lookups are relative to the anchor being processed, so the
        values of one item always come from the same anchor.  A field whose
        node is missing is set to ``None`` instead of raising ``IndexError``
        and aborting the rest of the page.

        Args:
            response: the downloaded listing page.

        Yields:
            A freshly created ``Mkw1Item`` for each selected anchor.
        """
        for card in response.xpath('//*[@id="main"]/div[5]/div[1]/a'):
            # A new item per anchor: re-yielding one shared instance would let
            # later iterations overwrite data still held by consumers.
            item = items.Mkw1Item()

            style = card.xpath('./div/@style').extract_first()
            match = self._IMG_URL_RE.search(style) if style else None
            item['img'] = match.group(0) if match else None

            item['title'] = card.xpath('./p[1]/text()').extract_first()
            item['type'] = card.xpath('./p[2]/text()').extract_first()
            item['pic'] = card.xpath('./p[3]/span[1]/text()').extract_first()
            yield item

# -*- coding: utf-8 -*-

import xlwt


class Mkw1Pipeline(object):
    """Item pipeline that collects scraped items into an xlwt workbook.

    Row 0 of the sheet holds the column headings; each processed item is
    written to the next free row, and the workbook is saved when the spider
    closes.
    """

    def __init__(self):
        """Create the workbook and sheet, and write the header row."""
        # Index of the next row to fill; row 0 is taken by the header.
        self.num = 1
        self.wb = xlwt.Workbook()
        self.sheet = self.wb.add_sheet('慕课网')
        # Column order of the sheet; also the item keys that get exported.
        self.list = ['img', 'title', 'type', 'pic']
        for col, heading in enumerate(self.list):
            self.sheet.write(0, col, heading)

    def process_item(self, item, spider):
        """Write ``item`` as one sheet row and pass it on.

        Values are looked up by the header's key order, so every value lands
        under its own heading regardless of the order in which the item's
        fields were populated.  A key missing from the item is written as
        ``None``.

        Args:
            item: mapping-like scraped item (must support ``.get``).
            spider: the spider that produced the item (unused).

        Returns:
            The same ``item``, so that later pipeline stages still receive it.
        """
        for col, key in enumerate(self.list):
            self.sheet.write(self.num, col, item.get(key))
        self.num += 1
        return item

    def close_spider(self, spider):
        """Save the workbook to disk once the spider has finished."""
        # NOTE(review): xlwt writes the legacy .xls format, so the '.xlsx'
        # extension does not match the file content.  The path is left
        # unchanged here because other tooling may depend on it - confirm and
        # rename to '.xls' if nothing does.
        self.wb.save('../mkw.xlsx')


评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值