Python Scrapy framework projects: pipeline code and several ways to save scraped data

Terminal command to save the scraped items as JSON:

scrapy crawl <spider name> -o <output file name>.json -s FEED_EXPORT_ENCODING=utf-8

import scrapy
# items.py lives one level above the spiders folder,
# so use ".." to go back to the parent package and import it
from ..items import MokoItem

class MokokoSpider(scrapy.Spider):
    name = 'mokoko'
    allowed_domains = ['moko.cc']
    # start_urls is usually the only part that needs to be changed
    start_urls = ['http://www.moko.cc/channels/post/153/1.html']

    def parse(self, response):
        # print(response.text)
        ul_list = response.xpath('//ul[@class="post small-post"]')
        print(ul_list)
        for ul in ul_list:
            # initialize an item object
            item = MokoItem()
            # xpath() always returns a list-like SelectorList;
            # each element is a scrapy Selector, which can be
            # iterated further or queried again with xpath()
            title = ul.xpath('.//div[@class="cover"]/@cover-text')
            # print(title)
            # print(type(title))
            # extract() converts the selectors into a list of strings; [0] takes the first one
            title = title.extract()[0]
            # print(title)
            # once extracted, the result is a plain list of strings:
            # it can still be iterated, but xpath() can no longer be used on it
            # print(type(title))

            clicknum = ul.xpath('.//li[last()]/span/text()').extract()[0]

            imgsrc = ul.xpath('.//img/@src2').extract()[0]
            item['title'] = title
            item['imgsrc'] = imgsrc
            item['clicknum'] = clicknum
            yield item
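
The spider above assumes an items.py next to the spiders folder that defines MokoItem with one Field per key used in parse(). A minimal sketch of that file (the field names are taken from the spider; everything else about the real items.py is an assumption):

import scrapy

class MokoItem(scrapy.Item):
    # field names must match the keys the spider assigns: item['title'], etc.
    title = scrapy.Field()
    imgsrc = scrapy.Field()
    clicknum = scrapy.Field()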

# Project requirement: fetch the Douyu API response as JSON, extract the image links to download, and save the output as JSON from the terminal.


import scrapy
from ..items import DoudouyuItem
import json

class TupianSpider(scrapy.Spider):
    name = 'tupian'
    allowed_domains = ['api.douyucdn.cn']

    start_urls = ['http://api.douyucdn.cn/api/v1/getverticalRoom?limit=20&offset=']

    def parse(self, response):

        jsobj = json.loads(response.text)
        for src in jsobj["data"]:
            item = DoudouyuItem()
            src = src["room_src"]
            # print(src)
            item['src'] = [src]
            yield item

        # print('fetching the first page')
        # queue the remaining pages (offset 0-480); Scrapy's duplicate
        # filter drops any url that has already been requested
        for x in range(0, 500, 20):
            url = 'http://api.douyucdn.cn/api/v1/getverticalRoom?limit=20&offset=' + str(x)
            yield scrapy.Request(url=url, callback=self.parse)





To download only the images, uncomment ITEM_PIPELINES in settings.py and configure it as follows:

ITEM_PIPELINES = {
   # 'doudouyu.pipelines.DoudouyuPipeline': 300,
   # the built-in pipeline dedicated to downloading images
    'scrapy.pipelines.images.ImagesPipeline': 1
}
IMAGES_STORE = 'tutuqian'
IMAGES_URLS_FIELD = 'src'
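
These settings assume items.py defines a src field; the spider wraps each URL in a one-element list because ImagesPipeline reads a list of image URLs from the field named by IMAGES_URLS_FIELD. A minimal sketch of the assumed DoudouyuItem:

import scrapy

class DoudouyuItem(scrapy.Item):
    # ImagesPipeline expects this field to hold a list of image URLs
    src = scrapy.Field()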

Project requirement: download the images from http://pic.netbian.com/4kmeishi/.





# -*- coding: utf-8 -*-
import scrapy

from  ..items import ImagenetItem
class ImageSpider(scrapy.Spider):
    name = 'image'
    allowed_domains = ['pic.netbian.com']
    start_urls = ['http://pic.netbian.com/4kmeishi/']

    def parse(self, response):
        img_list = response.xpath('//ul[@class="clearfix"]/li/a/img/@src')
        print(img_list)
        for img in img_list:
            item=ImagenetItem()
            src = 'http://pic.netbian.com/' + img.extract()
            # print(src)
            item['src']=[src]
            yield item
        next_url=response.xpath('//div[@class="page"]/a[text()="下一页"]/@href').extract()
        if len(next_url)!=0:

            print('*****************************')
            url='http://pic.netbian.com/'+next_url[0]
            yield scrapy.Request(url=url,callback=self.parse)
            # print(next_url)



ITEM_PIPELINES = {
   # 'imagenet.pipelines.ImagenetPipeline': 300,

   # the key must be the full path of the ImagesPipeline class
    'scrapy.pipelines.images.ImagesPipeline': 1
}
# directory where the downloaded images are stored
IMAGES_STORE = 'imagessss'
IMAGES_URLS_FIELD = 'src'

 

Save as CSV:

scrapy crawl <spider name> -o <spider name>.csv

Save as XML (the .xml file can then be opened in Excel):

scrapy crawl <spider name> -o <spider name>.xml

Save as JSON with Chinese characters kept readable:

scrapy crawl <spider name> -o <spider name>.json -s FEED_EXPORT_ENCODING=utf-8
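
For example, with the tupian spider above, the same commands would look like this (run from the project directory; the output file names are arbitrary):

scrapy crawl tupian -o tupian.csv
scrapy crawl tupian -o tupian.json -s FEED_EXPORT_ENCODING=utf-8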

Save as a TXT file

Project requirement: save every post by the thread starter in a Tieba thread to a TXT file.




# -*- coding: utf-8 -*-
import scrapy
# from..items import XiaoshuospiderItem

class ZhigengniaoSpider(scrapy.Spider):
    name = 'zhigengniao'
    allowed_domains = ['tieba.baidu.com']
    start_urls = ['https://tieba.baidu.com/p/5815118868?pn=']

    def parse(self, response):
        info_list = response.xpath('//div[@class="l_post l_post_bright j_l_post clearfix  "]')
        for info in info_list:
            name_list = info.xpath('.//ul[@class="p_author"]/li/a/text()').extract()
            for name in name_list:
                if name == '乔深沉':
                    content_list = info.xpath('.//div[@class="p_content  "]/cc/div/text()')
                    for con in content_list:
                        # item = XiaoshuospiderItem()
                        con = con.extract()

                        # item['con'] = con
                        # yield item

                        with open('xiaoshuo.txt','a',encoding='utf-8')as f:
                            f.write(con)
                            f.write('\n')
        next_url = response.xpath('//li[@class="l_pager pager_theme_5 pb_list_pager"]/a[text()="下一页"]/@href').extract()
        if len(next_url) != 0:
            url = 'https://tieba.baidu.com' + next_url[0]

            yield scrapy.Request(url=url, callback=self.parse)


Project requirement: save both the novels and their cover images to local files.



# -*- coding: utf-8 -*-
import scrapy

from ..items import QishuItem

class XiaoshuoSpider(scrapy.Spider):
    name = 'xiaoshuo'
    allowed_domains = ['qisuu.la']
    start_urls = ['https://www.qisuu.la/']

    def parse(self, response):
        # print(response.text)
        # collect every novel category
        type_list = response.xpath('//div[@class="nav"]/a/@href').extract()
        # print(type_list)
        # the first entry in the list is the home page, so drop it
        del type_list[0]
        # print(response.url)
        for type in type_list:
            # build the url of each category
            # inside this method response.url is the start_url
            url = response.url + type[1:]
            print(url)
            yield scrapy.Request(url=url, callback=self.get_content_with_type_url)

    # find the novels that belong to one category
    def get_content_with_type_url(self, response):
        # print(response.text)
        # detail-page links of every novel on the first page of the category
        book_list = response.xpath('//div[@class="listBox"]/ul/li/a/@href').extract()
        print('************************')
        # print(book_list)
        for book in book_list:
            # inside this method response.url is https://www.qisuu.la/soft/sort0(x)/
            url = 'https://www.qisuu.la' + book
            yield scrapy.Request(url=url, callback=self.get_detail_with_book_url)

    # scrape the details of a single book
    def get_detail_with_book_url(self, response):
        # extract_first(): convert to a list and take its first element
        item = QishuItem()
        # novel title
        name = response.xpath('//div[@class="detail_right"]/h1/text()').extract_first('')

        info_list = response.xpath('//div[@class="detail_right"]/ul/li/text()').extract()
        # cover image to download
        imageurl = response.xpath('//div[@class="detail_pic"]/img/@src').extract_first('')
        imageurl = 'https://www.qisuu.la' + imageurl
        # download link of the novel file
        downloadurl = response.xpath('//div[@class="showDown"]/ul/li[3]/script').extract_first('').split(',')[1].strip("'")
        print(downloadurl)
        print(imageurl)
        item['imageurl'] = [imageurl]
        item['downloadurl'] = [downloadurl]
        item['name'] = name
        clicknum = info_list[0]
        item['clicknum'] = clicknum
        filesize = info_list[1]
        item['filesize'] = filesize
        booktype = info_list[2]
        item['booktype'] = booktype
        updatetime = info_list[3]
        item['updatetime'] = updatetime
        bookstatus = info_list[4]
        item['bookstatus'] = bookstatus
        bookauthor = info_list[5]
        item['bookauthor'] = bookauthor
        print('//')
        print(info_list)
        yield item










ITEM_PIPELINES = {
   'qishu.pipelines.QishuPipeline': 300,
    # image download pipeline
    'scrapy.pipelines.images.ImagesPipeline': 1,
    # file (text) download pipeline
    'scrapy.pipelines.files.FilesPipeline': 2
}
IMAGES_STORE = 'file/image'
IMAGES_URLS_FIELD = 'imageurl'

FILES_STORE = 'file/book'
FILES_URLS_FIELD = 'downloadurl'
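
These settings assume a QishuItem whose fields match the keys set in the spider; imageurl feeds ImagesPipeline and downloadurl feeds FilesPipeline, and both must hold lists of URLs. A minimal sketch of the assumed items.py:

import scrapy

class QishuItem(scrapy.Item):
    name = scrapy.Field()
    imageurl = scrapy.Field()      # list of cover-image URLs for ImagesPipeline
    downloadurl = scrapy.Field()   # list of file URLs for FilesPipeline
    clicknum = scrapy.Field()
    filesize = scrapy.Field()
    booktype = scrapy.Field()
    updatetime = scrapy.Field()
    bookstatus = scrapy.Field()
    bookauthor = scrapy.Field()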


Making the JSON output well-formed

Project requirement: save the data as before, and make the JSON written by the pipeline complete and well-formed.


# -*- coding: utf-8 -*-
import scrapy

import re
from ..items import HongxiuxiuItem
class XiuxiuSpider(scrapy.Spider):
    name = 'xiuxiu'
    allowed_domains = ['hongxiu.com']
    start_urls = ['https://www.hongxiu.com/all?gender=2&catId=-1']

    def parse(self, response):
        type_list=response.xpath('//ul[@type="category"]/li/a/@href').extract()
        del type_list[0]
        for type in type_list:

            url='https://www.hongxiu.com'+type
            split=re.compile(r'.*?catId=(.*?)&.*?',re.S)
            catId=re.findall(split,url)
            print(catId)
            yield scrapy.Request(url=url,meta={'type':catId[0]},callback=self.get_content_with_type_url)
    def get_content_with_type_url(self,response):
        catId=response.meta['type']
        for page_num in range(1,11):
            url='https://www.hongxiu.com/all?pageNum='+str(page_num)+'&pageSize=10&gender=2&catId='+catId+'&isFinish=-1&isVip=-1&size=-1&updT=-1&orderBy=0'
            print(url)
            yield scrapy.Request(url=url,callback=self.get_book_with_url)
    def get_book_with_url(self,response):
        detail_list=response.xpath('//div[@class="book-info"]/h3/a/@href').extract()
        for book in detail_list:
            url='https://www.hongxiu.com'+book
            print('********************************************************')
            print(url)
            yield scrapy.Request(url=url,callback=self.get_detail_with_url)
    def get_detail_with_url(self,response):
        type = response.xpath('//div[@class="crumbs-nav center1020"]/span/a[2]/text()').extract_first('')
        print(type)
        name = response.xpath('//div[@class="book-info"]/h1/em/text()').extract_first('')
        print(name)
        author = response.xpath('//div[@class="book-info"]/h1/a/text()').extract_first('')
        print(author)
        total = response.xpath('//p[@class="total"]/span/text()').extract_first('') + response.xpath(
            '//p[@class="total"]/em/text()').extract_first('')
        print(total)
        love = response.xpath('//p[@class="total"]/span[2]/text()').extract_first('') + response.xpath(
            '//p[@class="total"]/em[2]/text()').extract_first('')
        print(love)
        cilk = response.xpath('//p[@class="total"]/span[3]/text()').extract_first('') + response.xpath(
            '//p[@class="total"]/em[3]/text()').extract_first('')
        print(cilk)
        introduce = response.xpath('//p[@class="intro"]/text()').extract_first('')
        print(introduce)
        url = 'https:' + response.xpath('//div[@class="book-img"]//img/@src').extract_first('')
        url = url.replace('\r', '')
        print(url)

        item=HongxiuxiuItem()
        item['type']=type
        item['name'] = name
        item['author'] = author
        item['total'] = total
        item['love'] = love
        item['cilk'] = cilk
        item['introduce'] = introduce
        item['url']=[url]
        yield item





# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import codecs
import os
import json
class HongxiuxiuPipeline(object):
    def __init__(self):
        self.file=codecs.open(filename='book.json',mode='w+',encoding='utf-8')
        self.file.write('"book_list":[')
    def process_item(self, item, spider):
        res=dict(item)
        str=json.dumps(res,ensure_ascii=False)
        self.file.write(str)
        self.file.write(',\n')
        return item
    def close_spider(self,spider):
        self.file.seek(-1,os.SEEK_END)
        self.file.truncate()

        self.file.seek(-1,os.SEEK_END)
        self.file.truncate()

        self.file.write(']')
        self.file.close()
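
The two seek/truncate calls in close_spider strip the trailing '\n' and ',' left by the last item before the closing ']' is written, so book.json should end up looking roughly like this (values illustrative):

"book_list":[{"type": "...", "name": "...", "author": "..."},{"type": "...", "name": "...", "author": "..."}]

Note that this is still a JSON fragment rather than a complete document; to parse it with json.load you would also need the outer braces, e.g. write '{"book_list":[' when the file is opened and ']}' when it is closed.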



ITEM_PIPELINES = {
   'hongxiuxiu.pipelines.HongxiuxiuPipeline': 300,
    'scrapy.pipelines.images.ImagesPipeline':1
}
IMAGES_STORE='TUTUTUTUTU'
IMAGES_URLS_FIELD='url'



Project requirement: download the images and use each image module's title as the folder name for its downloaded files.





# -*- coding: utf-8 -*-
import scrapy

from ..items import SucaiItem
class TubiaoSpider(scrapy.Spider):
    name = 'tubiao'
    allowed_domains = ['sc.chinaz.com']
    start_urls = ['http://sc.chinaz.com/']

    def parse(self, response):
        icon_url=response.xpath('//li[@class="nos"]/a[3]/@href').extract_first('')
        full_url='http://sc.chinaz.com'+icon_url
        yield scrapy.Request(url=full_url,callback=self.parse_icon_url)
    def parse_icon_url(self, response):
        a_list = response.xpath('//ul[@class="pngblock imgload"]/li/span/a')
        for a in a_list:
            href = a.xpath('@href').extract_first('')
            title = a.xpath('text()').extract_first('')
            print(title)
            # meta carries extra data along to the next callback
            yield scrapy.Request(url=href, meta={'title': title}, callback=self.get_detail_with_url)

    def get_detail_with_url(self, response):
        title = response.meta['title']
        img_list = response.xpath('//div[@class="png_sl"]/div/img/@src').extract()
        for img in img_list:
            item = SucaiItem()
            item['title'] = title
            item['img'] = [img]
            yield item





# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import scrapy
# the built-in image download pipeline
from scrapy.pipelines.images import ImagesPipeline

# the built-in pipeline already knows how to download images;
# by inheriting from it our pipeline gets that behaviour for free
class SucaiPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # print('pipeline method called')
        # this method runs once for every item: each item yielded by the
        # spider goes to the engine, the engine hands it to the pipeline,
        # and the pipeline methods below run one after another
        yield scrapy.Request(url=item['img'][0], meta={'item': item})

    # the pipeline provides a series of built-in methods that run in order
    def file_path(self, request, response=None, info=None):
        print('/8*/*/*/*965327-**/-*-/-*/*/*/***/*/**/**/*/')
        item = request.meta['item']
        print(item['title'])
        print(item['img'])
        # store each image under  <module title>/<original file name>
        image_name = item['img'][0].split('/')[-1]
        path = '%s/%s' % (item['title'], image_name)
        return path






DOWNLOAD_DELAY = 0.3  # uncomment this setting


ITEM_PIPELINES = {
   'sucai.pipelines.SucaiPipeline': 300,

}
IMAGES_STORE='imagesssssss'
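
Because SucaiPipeline overrides get_media_requests itself, only IMAGES_STORE is needed here; IMAGES_URLS_FIELD is not used. With the file_path above, the downloaded files should land roughly like this (names illustrative):

imagesssssss/
    <module title>/
        <original file name>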







Download the 4K landscape images.



# -*- coding: utf-8 -*-
import scrapy
from ..items import TuwangItem
class BianSpider(scrapy.Spider):
    name = 'bian'
    allowed_domains = ['pic.netbian.com']
    start_urls = ['http://pic.netbian.com/4kfengjing/']

    def parse(self, response):
        img_list=response.xpath('//ul[@class="clearfix"]/li//img/@src').extract()
        for img in img_list:
            url='http://pic.netbian.com'+img
            print(url)
            item=TuwangItem()
            item['url']=[url]
            yield item
        next_url=response.xpath('//div[@class="page"]/a[text()="下一页"]/@href').extract()
        if len(next_url) != 0:
            url='http://pic.netbian.com'+next_url[0]
            yield scrapy.Request(url=url,callback=self.parse)








ITEM_PIPELINES = {
   'tuwang.pipelines.TuwangPipeline': 300,
    'scrapy.pipelines.images.ImagesPipeline':1
}
IMAGES_STORE='tutupian'
IMAGES_URLS_FIELD='url'

Project requirement: scrape the novel listings and save them as JSON inside a pipeline.






# -*- coding: utf-8 -*-
import scrapy
from ..items import XiaoshuoItem

class XiaoxioashuoSpider(scrapy.Spider):
    name = 'xiaoxiaoshuo'
    allowed_domains = ['readnovel.com']
    start_urls = ['https://www.readnovel.com/']

    def parse(self, response):
        # book_list=response.xpath('//div[@class="book-info"]')
        book_list=response.css('.book-info')
        print(book_list)
        for book in book_list:
            # novel title
            name=book.xpath('.//h4/a/@title').extract_first('')
            if len(name) ==0:
                name = book.xpath('.//h3/a/@title').extract_first('')

            des=book.xpath('.//p/text()').extract_first('')

            author=book.xpath('.//div[@class="state-box cf"]/a/text()').extract_first('')

            type=book.xpath('.//div[@class="state-box cf"]/i/text()').extract_first('')
            item=XiaoshuoItem()
            item['name']=name
            item['des']=des
            item['author']=author
            item['type']=type
            yield item



# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# codecs opens the file with an explicit encoding to avoid mojibake
import codecs
import json
import os

class XiaoshuoPipeline(object):
    def __init__(self):
        # w: write    r: read
        # w+: read/write, creates the file if it does not exist
        # r+: read/write, raises an exception if the file does not exist
        self.file = codecs.open(filename='book.json', mode='w+', encoding='utf-8')
        self.file.write('"list":[')

    # keep this method whenever you write data to disk or to a database
    def process_item(self, item, spider):
        # convert the item object into a dict
        res = dict(item)
        # dumps turns the dict into a string; ensure_ascii=False keeps
        # non-ASCII characters readable. A dict cannot be written to a
        # file directly, so it must be serialized to a string first
        str = json.dumps(res, ensure_ascii=False)
        # write the data to the file
        self.file.write(str)
        self.file.write(',\n')
        return item

    def open_spider(self, spider):
        pass

    def close_spider(self, spider):
        # remove the last character of the file:
        # seek(-1, os.SEEK_END) moves to one character before the end
        self.file.seek(-1, os.SEEK_END)
        # truncate drops everything after the current position (the '\n')
        self.file.truncate()
        # repeat once more to drop the trailing ','
        self.file.seek(-1, os.SEEK_END)
        self.file.truncate()

        self.file.write(']')
        self.file.close()


# uncomment ITEM_PIPELINES in settings.py
ITEM_PIPELINES = {
   'xiaoshuo.pipelines.XiaoshuoPipeline': 300,
}


Project requirement: scrape the data and save it to a database.

# -*- coding: utf-8 -*-
import scrapy

from ..items import HonghongxiuxiuItem
class HongxiuxiuSpider(scrapy.Spider):
    name = 'hongxiuxiu'
    allowed_domains = ['hongxiu.com']
    start_urls = ['https://www.hongxiu.com/finish?gender=2&catId=-1']

    def parse(self, response):
        li_list=response.xpath('//div[@class="right-book-list"]/ul/li')
        for li in li_list:
            img='https:'+li.xpath('.//div[@class="book-img"]/a/img/@src').extract_first('')
            print(img)
            name=li.xpath('.//div[@class="book-info"]/h3/a/text()').extract_first('')
            print(name)
            author=li.xpath('.//div[@class="book-info"]/h4/a/text()').extract_first('')
            print(author)
            intro=li.xpath('.//p[@class="intro"]/text()').extract_first('')
            print(intro)
            item=HonghongxiuxiuItem()
            item['img']=img
            item['name']=name
            item['author']=author
            item['intro']=intro
            yield item









# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import sqlite3
class HonghongxiuxiuPipeline(object):
    def process_item(self, item, spider):
        return item
class HongXiuDBPipeline(object):
    def open_spider(self,spider):
        self.connect=sqlite3.connect('hongxiuDB')
        self.cursor=self.connect.cursor()
        self.cursor.execute('create table if not exists bookTable(name text,author text,img text, intro text)')
        self.connect.commit()
    def process_item(self,item,spider):
        self.cursor.execute('insert into bookTable (name,author,img,intro)VALUES ("{}","{}","{}","{}")'.format(item['name'],item['author'],item['img'],item['intro']))
        self.connect.commit()
        # return the item so the next pipeline still receives it
        return item
    def close_spider(self,spider):
        self.cursor.close()
        self.connect.close()
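
To check what the pipeline wrote after a crawl, a quick standalone sketch (it assumes the crawl has already created the hongxiuDB file in the working directory; the table and columns come from the pipeline above):

import sqlite3

connect = sqlite3.connect('hongxiuDB')
cursor = connect.cursor()
cursor.execute('select name, author, intro from bookTable limit 5')
for row in cursor.fetchall():
    print(row)
connect.close()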






ITEM_PIPELINES = {
   'honghongxiuxiu.pipelines.HonghongxiuxiuPipeline': 300,
    'honghongxiuxiu.pipelines.HongXiuDBPipeline':1
}










Project requirement: scrape the job data and normalize the salary units.

# -*- coding: utf-8 -*-
import scrapy
# https://search.51job.com/list/170200,000000,0000,00,9,99,java,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=
from ..items import JobItem
class JavaSpider(scrapy.Spider):
    name = 'zhiye'
    allowed_domains = ['search.51job.com']
    start_urls = ['https://search.51job.com/list/170200,000000,0000,00,9,99,java,2,1.html?','https://search.51job.com/list/170200,000000,0000,00,9,99,python,2,1.html?']

    def parse(self, response):
        # 1. extract all data on the current page and store it in items
        # 2. get the next-page link and request it

        div_list=response.xpath('//div[@id="resultList"]/div[@class="el"]')
        for div in div_list:
            # contains(): matches as long as the attribute contains the value
            jobname=div.xpath('.//p[contains(@class,"t1")]/span/a/@title').extract_first('')
            print(jobname)
            # company name
            companyname=div.xpath('.//span[@class="t2"]/a/@title').extract_first('')
            print(companyname)
            cityname=div.xpath('.//span[@class="t3"]/text()').extract_first('')
            print(cityname)
            salary=div.xpath('.//span[@class="t4"]/text()').extract_first('')
            print(salary)
            min_salary=0
            max_salary=0
            if u'年' in salary:
                money = salary.split('万')[0].split('-')
                min_salary=int(money[0])/12
                min_salary='%.1f'% min_salary
                max_salary='%.1f'% (int(money[1])/12)
            elif u'万' in salary:
                money =salary.split('万')[0].split('-')
                min_salary=money[0]
                max_salary=money[1]
            elif u'千' in salary:
                money =salary.split('千')[0]
                if '-' in money:
                    min_salary=float(money.split('-')[0])*0.1
                    max_salary=float(money.split('-')[1])*0.1
                else:
                    min_salary=0
                    max_salary=float(money)*0.1
            elif u'日' in salary:
                money=salary.split('元')
                min_salary=0
                max_salary=int(money[0])*30/10000
            else:
                min_salary=0
                max_salary=0
            date=div.xpath('.//span[@class="t5"]/text()').extract_first('')
            item = JobItem()
            item['jobname']=jobname
            item['companyname'] = companyname
            item['cityname'] = cityname
            item['min_salary'] = min_salary
            item['max_salary'] = max_salary
            item['date'] = date
            yield item

        next_url=response.xpath('//li[@class="bk"]/a[text()="下一页"]/@href').extract()
        print('**********************************************')
        print(next_url)
        if len(next_url) != 0:
            print(next_url[0])
            yield scrapy.Request(url=next_url[0],callback=self.parse)
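
The salary branches above are easier to test outside Scrapy. A minimal standalone sketch (Python 3) of the same normalization; the example input formats are assumptions about what 51job returns, and everything is converted to 万 per month:

def normalize_salary(salary):
    # returns (min_salary, max_salary), mirroring the branches in parse()
    min_salary, max_salary = 0, 0
    if u'年' in salary:                      # e.g. '15-20万/年'
        money = salary.split('万')[0].split('-')
        min_salary = '%.1f' % (int(money[0]) / 12)
        max_salary = '%.1f' % (int(money[1]) / 12)
    elif u'万' in salary:                    # e.g. '1-1.5万/月'
        money = salary.split('万')[0].split('-')
        min_salary, max_salary = money[0], money[1]
    elif u'千' in salary:                    # e.g. '6-8千/月'
        money = salary.split('千')[0]
        if '-' in money:
            min_salary = float(money.split('-')[0]) * 0.1
            max_salary = float(money.split('-')[1]) * 0.1
        else:
            max_salary = float(money) * 0.1
    elif u'日' in salary:                    # e.g. '150元/日'
        max_salary = int(salary.split('元')[0]) * 30 / 10000
    return min_salary, max_salary

print(normalize_salary('1-1.5万/月'))   # ('1', '1.5')
print(normalize_salary('6-8千/月'))     # roughly (0.6, 0.8)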

Project requirement: scrape the data and store it in MySQL.

# -*- coding: utf-8 -*-
import scrapy
from ..items import DianyingItem

class TiantangSpider(scrapy.Spider):
    name = 'tiantang'
    allowed_domains = ['ygdy8.net']
    start_urls = ['http://www.ygdy8.net/html/gndy/index.html']

    def parse(self, response):
        detail_list=response.xpath('//div[@class="co_area2"]//tr')
        for detail in detail_list:
            url = 'http://www.ygdy8.net'+detail.xpath('.//td[1]/a[2]/@href').extract_first('')
            print(url)
            yield scrapy.Request(url=url,callback=self.detail_info)
    def detail_info(self,response):
        title=response.xpath('//div[@class="title_all"]//font/text()').extract_first('')
        href =response.xpath('//tr[@style="WORD-WRAP : break-word"]/a/@href').extract_first('')
        print(href)
        item=DianyingItem()
        item['title']=title
        item['href']=href
        yield item






import pymysql
class DianyingPipeline(object):
    def __init__(self):
        self.connect=pymysql.connect(host='localhost',user='root',password='666666',db='movie',port=3306)
        self.cursor=self.connect.cursor()
    def process_item(self, item, spider):
        self.cursor.execute('insert into movieTable(title,href)VALUES ("{}","{}")'.format(item['title'],item['href']))
        self.connect.commit()
        return item
    def close_spider(self,spider):
        self.cursor.close()
        self.connect.close()



ITEM_PIPELINES = {
   'dianying.pipelines.DianyingPipeline': 300,
}
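
The pipeline assumes the movie database and movieTable already exist. A minimal one-off sketch to create them with the same credentials; the column types here are assumptions, the pipeline only needs title and href columns:

import pymysql

connect = pymysql.connect(host='localhost', user='root', password='666666', port=3306)
cursor = connect.cursor()
cursor.execute('create database if not exists movie default character set utf8mb4')
cursor.execute('use movie')
cursor.execute('create table if not exists movieTable('
               'id int primary key auto_increment,'
               'title varchar(255),'
               'href varchar(1000))')
connect.commit()
cursor.close()
connect.close()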










Project requirement: save the scraped data as JSON and download the images.

 

# -*- coding: utf-8 -*-
import scrapy

from ..items import HongxiuItem
class HongxiuxiuSpider(scrapy.Spider):
    name = 'hongxiuxiu'
    allowed_domains = ['hongxiu.com']
    start_urls = ['https://www.hongxiu.com/all?catId=30008']

    def parse(self, response):
        li_list=response.xpath('//div[@class="right-book-list"]/ul/li')
        print(li_list)
        for li in li_list:
            img = 'https:'+li.xpath('.//div[@class="book-img"]/a/img/@src').extract_first('')
            title=li.xpath('.//div[@class="book-img"]/a/img/@alt').extract_first('')
            author=li.xpath('.//div[@class="book-info"]/h4/a/text()').extract_first('')
            intro=li.xpath('.//div[@class="book-info"]/p[@class="intro"]/text()').extract_first('')
            item=HongxiuItem()
            item["img"]=[img]
            item["title"]=title
            item["author"]=author
            item["intro"]=intro
            yield item






import scrapy
import codecs
import os
import json
from scrapy.pipelines.images import ImagesPipeline
class HongxiuPipeline(object):
    def __init__(self):
        self.file=codecs.open(filename='hongxiu.json',mode='w+',encoding='utf-8')

    def process_item(self, item, spider):
        res=dict(item)
        str=json.dumps(res,ensure_ascii=False)
        self.file.write(str)
        self.file.write('\n')
        return item
class HongXiuDownloadPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        url=item['img'][0]
        yield scrapy.Request(url=url,meta={'item':item})

    def file_path(self, request, response=None, info=None):
        item=request.meta['item']
        bookname=item['title']
        path= bookname+'.jpg'
        return path






ITEM_PIPELINES = {
   'hongxiu.pipelines.HongxiuPipeline': 300,
    'hongxiu.pipelines.HongXiuDownloadPipeline':1

}
IMAGES_STORE='imgggg'



Project requirement: get the total number of result pages and their links for the Python and Java jobs on 51job.

# -*- coding: utf-8 -*-
import scrapy
import re

class JobSpider(scrapy.Spider):
    name = 'job'
    allowed_domains = ['51job.com']
    start_urls = ['https://search.51job.com/list/170200,000000,0000,00,9,99,java,2,1.html','https://search.51job.com/list/170200,000000,0000,00,9,99,Python,2,1.html']
    def parse(self, response):
        total_page=response.xpath('//div[@class="p_in"]/span[1]/text()').extract_first('')
        print(total_page)
        # use a regex to pull the digits out of the page-count text
        res=re.compile(r'\d+')
        # findall returns a list of matches; take the first one
        result=re.findall(res,total_page)[0]
        print(result)
        # the url that was requested
        url=response.url
        print(url)
        if 'java' in url:
            for page in range(1,int(result)+1):
                java_url='https://search.51job.com/list/170200,000000,0000,00,9,99,java,2,{}.html'.format(page)
                yield scrapy.Request(url=java_url,callback=self.get_detail_with_page)

        else:
            for page in range(1,int(result)+1):
                python_url='https://search.51job.com/list/170200,000000,0000,00,9,99,Python,2,{}.html'.format(page)
                yield scrapy.Request(url=python_url,callback=self.get_detail_with_page)
    def get_detail_with_page(self,response):
        print(response.url)

Project requirement: scrape dynamically rendered data and save it to an Excel file. First, add the following code to the downloader middleware:

from scrapy.http.response.html import HtmlResponse
from scrapy import signals
class taobaospidermiddleware(object):
    def process_request(self,request,spider):
        if spider.name=='shishang':
            spider.driver.get(request.url)
            spider.driver.implicitly_wait(10)
            response=HtmlResponse(url=spider.driver.current_url,
                                  request=request,
                                  body=spider.driver.page_source,
                                  encoding='utf-8')
            return response


Then put the following code in the spider file:

# -*- coding: utf-8 -*-
import scrapy
from ..items import TaobaoItem
from selenium import webdriver
class ShishangSpider(scrapy.Spider):
    name = 'shishang'
    allowed_domains = ['taobao.com']
    start_urls = ['https://s.taobao.com/search?q=%E6%97%B6%E5%B0%9A%E7%94%B7%E9%9E%8B&imgfile=&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index&spm=a21bo.2017.201856-taobao-item.1&ie=utf8&initiative_id=tbindexz_20170306&bcoffset=6&ntoffset=6&p4ppushleft=1%2C48&s=0']
    def __init__(self):
        self.driver=webdriver.PhantomJS()
    def parse(self, response):

        content_list=response.xpath('//div[@class="ctx-box J_MouseEneterLeave J_IconMoreNew"]')
        for x in content_list:
            name=x.xpath('.//div[@class="row row-2 title"]/a').xpath('string(.)').extract()[0].strip('\n').replace(' ','').strip('\n')
            price=x.xpath('.//div[@class="price g_price g_price-highlight"]/strong/text()').extract_first('')
            dian_name=x.xpath('.//div[@class="shop"]/a/span[2]/text()').extract_first('')
            item=TaobaoItem()
            item['name']=name
            item['price']=price
            item['dian_name']=dian_name
            yield item
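
One thing the spider above never does is quit the PhantomJS process it starts in __init__. Scrapy calls a spider's closed() method when the crawl finishes, so a small addition like this inside ShishangSpider (a sketch, not part of the original code) takes care of it:

    def closed(self, reason):
        # shut down the headless browser started in __init__
        self.driver.quit()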

The items.py code is omitted here.

Next, the pipeline file writes the data to Excel:

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

from openpyxl import Workbook
class TaobaoPipeline(object):
    def __init__(self):
        self.wb=Workbook()
        self.ws=self.wb.active
        self.ws.append(['名称','价格','店铺'])
    def process_item(self, item, spider):
        line=[item['name'],item['price'],item['dian_name']]
        self.ws.append(line)
        self.wb.save('时尚男鞋.xlsx')
        return item
    def close_spider(self, spider):
        pass
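
Saving the workbook on every item works, but it rewrites the whole .xlsx file once per scraped product. A variant sketch that appends rows as items arrive and writes the file only once, when the spider closes:

from openpyxl import Workbook

class TaobaoPipeline(object):
    def __init__(self):
        self.wb = Workbook()
        self.ws = self.wb.active
        self.ws.append(['名称', '价格', '店铺'])

    def process_item(self, item, spider):
        self.ws.append([item['name'], item['price'], item['dian_name']])
        return item

    def close_spider(self, spider):
        # write the workbook a single time at the end of the crawl
        self.wb.save('时尚男鞋.xlsx')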

Finally, in settings.py uncomment and edit the downloader middleware and item pipeline settings:

   

DOWNLOADER_MIDDLEWARES = {
   'taobao.middlewares.taobaospidermiddleware': 543,
}








ITEM_PIPELINES = {
   'taobao.pipelines.TaobaoPipeline': 300,
}

 
