Python Scrapy framework projects: pipeline code and several ways to save scraped data

Terminal command to save the scraped items as JSON:

scrapy crawl <spider name> -o <output file name>.json -s FEED_EXPORT_ENCODING=utf-8

import scrapy
# items.py lives one level above the spiders folder,
# so use ".." to go back to the parent package and import it
from ..items import MokoItem

class MokokoSpider(scrapy.Spider):
    name = 'mokoko'
    allowed_domains = ['moko.cc']
    # start_urls is usually the only part that needs to be changed
    start_urls = ['http://www.moko.cc/channels/post/153/1.html']

    def parse(self, response):
        # print(response.text)
        ul_list = response.xpath('//ul[@class="post small-post"]')
        print(ul_list)
        for ul in ul_list:
            # initialize an item object
            item = MokoItem()
            # xpath() always returns a list-like SelectorList;
            # each element is a scrapy Selector, which can be
            # iterated further or queried again with xpath()
            title = ul.xpath('.//div[@class="cover"]/@cover-text')
            # print(title)
            # print(type(title))
            # extract() converts the selectors into a list of strings; [0] takes the first one
            title = title.extract()[0]
            # print(title)
            # once extracted, the result is a plain list of strings:
            # it can still be iterated, but xpath() can no longer be used on it
            # print(type(title))

            clicknum = ul.xpath('.//li[last()]/span/text()').extract()[0]

            imgsrc = ul.xpath('.//img/@src2').extract()[0]
            item['title'] = title
            item['imgsrc'] = imgsrc
            item['clicknum'] = clicknum
            yield item
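
The spider above assumes an items.py next to the spiders folder that defines MokoItem with one Field per key used in parse(). A minimal sketch of that file (the field names are taken from the spider; everything else about the real items.py is an assumption):

import scrapy

class MokoItem(scrapy.Item):
    # field names must match the keys the spider assigns: item['title'], etc.
    title = scrapy.Field()
    imgsrc = scrapy.Field()
    clicknum = scrapy.Field()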

# Project requirement: fetch the Douyu API response as JSON, extract the image links to download, and save the output as JSON from the terminal.


import scrapy
from ..items import DoudouyuItem
import json

class TupianSpider(scrapy.Spider):
    name = 'tupian'
    allowed_domains = ['api.douyucdn.cn']

    start_urls = ['http://api.douyucdn.cn/api/v1/getverticalRoom?limit=20&offset=']

    def parse(self, response):

        jsobj = json.loads(response.text)
        for src in jsobj["data"]:
            item = DoudouyuItem()
            src = src["room_src"]
            # print(src)
            item['src'] = [src]
            yield item

        # print('fetching the first page')
        # queue the remaining pages (offset 0-480); Scrapy's duplicate
        # filter drops any url that has already been requested
        for x in range(0, 500, 20):
            url = 'http://api.douyucdn.cn/api/v1/getverticalRoom?limit=20&offset=' + str(x)
            yield scrapy.Request(url=url, callback=self.parse)





To download only the images, uncomment ITEM_PIPELINES in settings.py and configure it as follows:

ITEM_PIPELINES = {
   # 'doudouyu.pipelines.DoudouyuPipeline': 300,
   # the built-in pipeline dedicated to downloading images
    'scrapy.pipelines.images.ImagesPipeline': 1
}
IMAGES_STORE = 'tutuqian'
IMAGES_URLS_FIELD = 'src'
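
These settings assume items.py defines a src field; the spider wraps each URL in a one-element list because ImagesPipeline reads a list of image URLs from the field named by IMAGES_URLS_FIELD. A minimal sketch of the assumed DoudouyuItem:

import scrapy

class DoudouyuItem(scrapy.Item):
    # ImagesPipeline expects this field to hold a list of image URLs
    src = scrapy.Field()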

Project requirement: download the images from http://pic.netbian.com/4kmeishi/.





# -*- coding: utf-8 -*-
import scrapy

from  ..items import ImagenetItem
class ImageSpider(scrapy.Spider):
    name = 'image'
    allowed_domains = ['pic.netbian.com']
    start_urls = ['http://pic.netbian.com/4kmeishi/']

    def parse(self, response):
        img_list = response.xpath('//ul[@class="clearfix"]/li/a/img/@src')
        print(img_list)
        for img in img_list:
            item=ImagenetItem()
            src = 'http://pic.netbian.com/' + img.extract()
            # print(src)
            item['src']=[src]
            yield item
        next_url=response.xpath('//div[@class="page"]/a[text()="下一页"]/@href').extract()
        if len(next_url)!=0:

            print('*****************************')
            url='http://pic.netbian.com/'+next_url[0]
            yield scrapy.Request(url=url,callback=self.parse)
            # print(next_url)



ITEM_PIPELINES = {
   # 'imagenet.pipelines.ImagenetPipeline': 300,

   # the key must be the full path of the ImagesPipeline class
    'scrapy.pipelines.images.ImagesPipeline': 1
}
# directory where the downloaded images are stored
IMAGES_STORE = 'imagessss'
IMAGES_URLS_FIELD = 'src'

 

Save as CSV:

scrapy crawl <spider name> -o <spider name>.csv

Save as XML (the .xml file can then be opened in Excel):

scrapy crawl <spider name> -o <spider name>.xml

Save as JSON with Chinese characters kept readable:

scrapy crawl <spider name> -o <spider name>.json -s FEED_EXPORT_ENCODING=utf-8
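
For example, with the tupian spider above, the same commands would look like this (run from the project directory; the output file names are arbitrary):

scrapy crawl tupian -o tupian.csv
scrapy crawl tupian -o tupian.json -s FEED_EXPORT_ENCODING=utf-8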

Save as a TXT file

Project requirement: save every post by the thread starter in a Tieba thread to a TXT file.




# -*- coding: utf-8 -*-
import scrapy
# from..items import XiaoshuospiderItem

class ZhigengniaoSpider(scrapy.Spider):
    name = 'zhigengniao'
    allowed_domains = ['tieba.baidu.com']
    start_urls = ['https://tieba.baidu.com/p/5815118868?pn=']

    def parse(self, response):
        info_list = response.xpath('//div[@class="l_post l_post_bright j_l_post clearfix  "]')
        for info in info_list:
            name_list = info.xpath('.//ul[@class="p_author"]/li/a/text()').extract()
            for name in name_list:
                if name == '乔深沉':
                    content_list = info.xpath('.//div[@class="p_content  "]/cc/div/text()')
                    for con in content_list:
                        # item = XiaoshuospiderItem()
                        con = con.extract()

                        # item['con'] = con
                        # yield item

                        with open('xiaoshuo.txt','a',encoding='utf-8')as f:
                            f.write(con)
                            f.write('\n')
        next_url = response.xpath('//li[@class="l_pager pager_theme_5 pb_list_pager"]/a[text()="下一页"]/@href').extract()
        if len(next_url) != 0:
            url = 'https://tieba.baidu.com' + next_url[0]

            yield scrapy.Request(url=url, callback=self.parse)


Project requirement: save both the novels and their cover images to local files.



# -*- coding: utf-8 -*-
import scrapy

from ..items import QishuItem

class XiaoshuoSpider(scrapy.Spider):
    name = 'xiaoshuo'
    allowed_domains = ['qisuu.la']
    start_urls = ['https://www.qisuu.la/']

    def parse(self, response):
        # print(response.text)
        # collect every novel category
        type_list = response.xpath('//div[@class="nav"]/a/@href').extract()
        # print(type_list)
        # the first entry in the list is the home page, so drop it
        del type_list[0]
        # print(response.url)
        for type in type_list:
            # build the url of each category
            # inside this method response.url is the start_url
            url = response.url + type[1:]
            print(url)
            yield scrapy.Request(url=url, callback=self.get_content_with_type_url)

    # find the novels that belong to one category
    def get_content_with_type_url(self, response):
        # print(response.text)
        # detail-page links of every novel on the first page of the category
        book_list = response.xpath('//div[@class="listBox"]/ul/li/a/@href').extract()
        print('************************')
        # print(book_list)
        for book in book_list:
            # inside this method response.url is https://www.qisuu.la/soft/sort0(x)/
            url = 'https://www.qisuu.la' + book
            yield scrapy.Request(url=url, callback=self.get_detail_with_book_url)

    # scrape the details of a single book
    def get_detail_with_book_url(self, response):
        # extract_first(): convert to a list and take its first element
        item = QishuItem()
        # novel title
        name = response.xpath('//div[@class="detail_right"]/h1/text()').extract_first('')

        info_list = response.xpath('//div[@class="detail_right"]/ul/li/text()').extract()
        # cover image to download
        imageurl = response.xpath('//div[@class="detail_pic"]/img/@src').extract_first('')
        imageurl = 'https://www.qisuu.la' + imageurl
        # download link of the novel file
        downloadurl = response.xpath('//div[@class="showDown"]/ul/li[3]/script').extract_first('').split(',')[1].strip("'")
        print(downloadurl)
        print(imageurl)
        item['imageurl'] = [imageurl]
        item['downloadurl'] = [downloadurl]
        item['name'] = name
        clicknum = info_list[0]
        item['clicknum'] = clicknum
        filesize = info_list[1]
        item['filesize'] = filesize
        booktype = info_list[2]
        item['booktype'] = booktype
        updatetime = info_list[3]
        item['updatetime'] = updatetime
        bookstatus = info_list[4]
        item['bookstatus'] = bookstatus
        bookauthor = info_list[5]
        item['bookauthor'] = bookauthor
        print('//')
        print(info_list)
        yield item










ITEM_PIPELINES = {
   'qishu.pipelines.QishuPipeline': 300,
    # image download pipeline
    'scrapy.pipelines.images.ImagesPipeline': 1,
    # file (text) download pipeline
    'scrapy.pipelines.files.FilesPipeline': 2
}
IMAGES_STORE = 'file/image'
IMAGES_URLS_FIELD = 'imageurl'

FILES_STORE = 'file/book'
FILES_URLS_FIELD = 'downloadurl'
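
These settings assume a QishuItem whose fields match the keys set in the spider; imageurl feeds ImagesPipeline and downloadurl feeds FilesPipeline, and both must hold lists of URLs. A minimal sketch of the assumed items.py:

import scrapy

class QishuItem(scrapy.Item):
    name = scrapy.Field()
    imageurl = scrapy.Field()      # list of cover-image URLs for ImagesPipeline
    downloadurl = scrapy.Field()   # list of file URLs for FilesPipeline
    clicknum = scrapy.Field()
    filesize = scrapy.Field()
    booktype = scrapy.Field()
    updatetime = scrapy.Field()
    bookstatus = scrapy.Field()
    bookauthor = scrapy.Field()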


Making the JSON output well-formed

Project requirement: save the data as before, and make the JSON written by the pipeline complete and well-formed.


# -*- coding: utf-8 -*-
import scrapy

import re
from ..items import HongxiuxiuItem
class XiuxiuSpider(scrapy.Spider):
    name = 'xiuxiu'
    allowed_domains = ['hongxiu.com']
    start_urls = ['https://www.hongxiu.com/all?gender=2&catId=-1']

    def parse(self, response):
        type_list=response.xpath('//ul[@type="category"]/li/a/@href').extract()
        del type_list[0]
        for type in type_list:

            url='https://www.hongxiu.com'+type
            split=re.compile(r'.*?catId=(.*?)&.*?',re.S)
            catId=re.findall(split,url)
            print(catId)
            yield scrapy.Request(url=url,meta={'type':catId[0]},callback=self.get_content_with_type_url)
    def get_content_with_type_url(self,response):
        catId=response.meta['type']
        for page_num in range(1,11):
            url='https://www.hongxiu.com/all?pageNum='+str(page_num)+'&pageSize=10&gender=2&catId='+catId+'&isFinish=-1&isVip=-1&size=-1&updT=-1&orderBy=0'
            print(url)
            yield scrapy.Request(url=url,callback=self.get_book_with_url)
    def get_book_with_url(self,response):
        detail_list=response.xpath('//div[@class="book-info"]/h3/a/@href').extract()
        for book in detail_list:
            url='https://www.hongxiu.com'+book
            print('********************************************************')
            print(url)
            yield scrapy.Request(url=url,callback=self.get_detail_with_url)
    def get_detail_with_url(self,response):
        type = response.xpath('//div[@class="crumbs-nav center1020"]/span/a[2]/text()').extract_first('')
        print(type)
        name = response.xpath('//div[@class="book-info"]/h1/em/text()').extract_first('')
        print(name)
        author = response.xpath('//div[@class="book-info"]/h1/a/text()').extract_first('')
        print(author)
        total = response.xpath('//p[@class="total"]/span/text()').extract_first('') + response.xpath(
            '//p[@class="total"]/em/text()').extract_first('')
        print(total)
        love = response.xpath('//p[@class="total"]/span[2]/text()').extract_first('') + response.xpath(
            '//p[@class="total"]/em[2]/text()').extract_first('')
        print(love)
        cilk = response.xpath('//p[@class="total"]/span[3]/text()').extract_first('') + response.xpath(
            '//p[@class="total"]/em[3]/text()').extract_first('')
        print(cilk)
        introduce = response.xpath('//p[@class="intro"]/text()').extract_first('')
        print(introduce)
        url = 'https:' + response.xpath('//div[@class="book-img"]//img/@src').extract_first('')
        url = url.replace('\r', '')
        print(url)

        item=HongxiuxiuItem()
        item['type']=type
        item['name'] = name
        item['author'] = author
        item['total'] = total
        item['love'] = love
        item['cilk'] = cilk
        item['introduce'] = introduce
        item['url']=[url]
        yield item





# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import codecs
import os
import json
class HongxiuxiuPipeline(object):
    def __init__(self):
        self.file=codecs.open(filename='book.json',mode='w+',encoding='utf-8')
        self.file.write('"book_list":[')
    def process_item(self, item, spider):
        res=dict(item)
        str=json.dumps(res,ensure_ascii=False)
        self.file.write(str)
        self.file.write(',\n')
        return item
    def close_spider(self,spider):
        self.file.seek(-1,os.SEEK_END)
        self.file.truncate()

        self.file.seek(-1,os.SEEK_END)
        self.file.truncate()

        self.file.write(']')
        self.file.close()
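
The two seek/truncate calls in close_spider strip the trailing '\n' and ',' left by the last item before the closing ']' is written, so book.json should end up looking roughly like this (values illustrative):

"book_list":[{"type": "...", "name": "...", "author": "..."},{"type": "...", "name": "...", "author": "..."}]

Note that this is still a JSON fragment rather than a complete document; to parse it with json.load you would also need the outer braces, e.g. write '{"book_list":[' when the file is opened and ']}' when it is closed.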



ITEM_PIPELINES = {
   'hongxiuxiu.pipelines.HongxiuxiuPipeline': 300,
    'scrapy.pipelines.images.ImagesPipeline':1
}
IMAGES_STORE='TUTUTUTUTU'
IMAGES_URLS_FIELD='url'



Project requirement: download the images and use each image module's title as the folder name for its downloaded files.





# -*- coding: utf-8 -*-
import scrapy

from ..items import SucaiItem
class TubiaoSpider(scrapy.Spider):
    name = 'tubiao'
    allowed_domains = ['sc.chinaz.com']
    start_urls = ['http://sc.chinaz.com/']

    def parse(self, response):
        icon_url=response.xpath('//li[@class="nos"]/a[3]/@href').extract_first('')
        full_url='http://sc.chinaz.com'+icon_url
        yield scrapy.Request(url=full_url,callback=self.parse_icon_url)
    def parse_icon_url(self, response):
        a_list = response.xpath('//ul[@class="pngblock imgload"]/li/span/a')
        for a in a_list:
            href = a.xpath('@href').extract_first('')
            title = a.xpath('text()').extract_first('')
            print(title)
            # meta carries extra data along to the next callback
            yield scrapy.Request(url=href, meta={'title': title}, callback=self.get_detail_with_url)

    def get_detail_with_url(self, response):
        title = response.meta['title']
        img_list = response.xpath('//div[@class="png_sl"]/div/img/@src').extract()
        for img in img_list:
            item = SucaiItem()
            item['title'] = title
            item['img'] = [img]
            yield item





# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import scrapy
# the built-in image download pipeline
from scrapy.pipelines.images import ImagesPipeline

# the built-in pipeline already knows how to download images;
# by inheriting from it our pipeline gets that behaviour for free
class SucaiPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # print('pipeline method called')
        # this method runs once for every item: each item yielded by the
        # spider goes to the engine, the engine hands it to the pipeline,
        # and the pipeline methods below run one after another
        yield scrapy.Request(url=item['img'][0], meta={'item': item})

    # the pipeline provides a series of built-in methods that run in order
    def file_path(self, request, response=None, info=None):
        print('/8*/*/*/*965327-**/-*-/-*/*/*/***/*/**/**/*/')
        item = request.meta['item']
        print(item['title'])
        print(item['img'])
        # store each image under  <module title>/<original file name>
        image_name = item['img'][0].split('/')[-1]
        path = '%s/%s' % (item['title'], image_name)
        return path






DOWNLOAD_DELAY = 0.3  # uncomment this setting


ITEM_PIPELINES = {
   'sucai.pipelines.SucaiPipeline': 300,

}
IMAGES_STORE='imagesssssss'
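
Because SucaiPipeline overrides get_media_requests itself, only IMAGES_STORE is needed here; IMAGES_URLS_FIELD is not used. With the file_path above, the downloaded files should land roughly like this (names illustrative):

imagesssssss/
    <module title>/
        <original file name>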







Download the 4K landscape images.



# -*- coding: utf-8 -*-
import scrapy
from ..items import TuwangItem
class BianSpider(scrapy.Spider):
    name = 'bian'
    allowed_domains = ['pic.netbian.com']
    start_urls = ['http://pic.netbian.com/4kfengjing/']

    def parse(self, response):
        img_list=response.xpath('//ul[@class="clearfix"]/li//img/@src').extract()
        for img in img_list:
            url='http://pic.netbian.com'+img
            print(url)
            item=TuwangItem()
            item['url']=[url]
            yield item
        next_url=response.xpath('//div[@class="page"]/a[text()="下一页"]/@href').extract()
        if len(next_url) != 0:
            url='http://pic.netbian.com'+next_url[0]
            yield scrapy.Request(url=url,callback=self.parse)








ITEM_PIPELINES = {
   'tuwang.pipelines.TuwangPipeline': 300,
    'scrapy.pipelines.images.ImagesPipeline':1
}
IMAGES_STORE='tutupian'
IMAGES_URLS_FIELD='url'

Project requirement: scrape the novel listings and save them as JSON inside a pipeline.






# -*- coding: utf-8 -*-
import scrapy
from ..items import XiaoshuoItem

class XiaoxioashuoSpider(scrapy.Spider):
    name = 'xiaoxiaoshuo'
    allowed_domains = ['readnovel.com']
    start_urls = ['https://www.readnovel.com/']

    def parse(self, response):
        # book_list=response.xpath('//div[@class="book-info"]')
        book_list=response.css('.book-info')
        print(book_list)
        for book in book_list:
            # novel title
            name=book.xpath('.//h4/a/@title').extract_first('')
            if len(name) ==0:
                name = book.xpath('.//h3/a/@title').extract_first('')

            des=book.xpath('.//p/text()').extract_first('')

            author=book.xpath('.//div[@class="state-box cf"]/a/text()').extract_first('')

            type=book.xpath('.//div[@class="state-box cf"]/i/text()').extract_first('')
            item=XiaoshuoItem()
            item['name']=name
            item['des']=des
            item['author']=author
            item['type']=type
            yield item



# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# codecs opens the file with an explicit encoding to avoid mojibake
import codecs
import json
import os

class XiaoshuoPipeline(object):
    def __init__(self):
        # w: write    r: read
        # w+: read/write, creates the file if it does not exist
        # r+: read/write, raises an exception if the file does not exist
        self.file = codecs.open(filename='book.json', mode='w+', encoding='utf-8')
        self.file.write('"list":[')

    # keep this method whenever you write data to disk or to a database
    def process_item(self, item, spider):
        # convert the item object into a dict
        res = dict(item)
        # dumps turns the dict into a string; ensure_ascii=False keeps
        # non-ASCII characters readable. A dict cannot be written to a
        # file directly, so it must be serialized to a string first
        str = json.dumps(res, ensure_ascii=False)
        # write the data to the file
        self.file.write(str)
        self.file.write(',\n')
        return item

    def open_spider(self, spider):
        pass

    def close_spider(self, spider):
        # remove the last character of the file:
        # seek(-1, os.SEEK_END) moves to one character before the end
        self.file.seek(-1, os.SEEK_END)
        # truncate drops everything after the current position (the '\n')
        self.file.truncate()
        # repeat once more to drop the trailing ','
        self.file.seek(-1, os.SEEK_END)
        self.file.truncate()

        self.file.write(']')
        self.file.close()


# uncomment ITEM_PIPELINES in settings.py
ITEM_PIPELINES = {
   'xiaoshuo.pipelines.XiaoshuoPipeline': 300,
}


Project requirement: scrape the data and save it to a database.

# -*- coding: utf-8 -*-
import scrapy

from ..items import HonghongxiuxiuItem
class HongxiuxiuSpider(scrapy.Spider):
    name = 'hongxiuxiu'
    allowed_domains = ['hongxiu.com']
    start_urls = ['https://www.hongxiu.com/finish?gender=2&catId=-1']

    def parse(self, response):
        li_list=response.xpath('//div[@class="right-book-list"]/ul/li')
        for li in li_list:
            img='https:'+li.xpath('.//div[@class="book-img"]/a/img/@src').extract_first('')
            print(img)
            name=li.xpath('.//div[@class="book-info"]/h3/a/text()').extract_first('')
            print(name)
            author=li.xpath('.//div[@class="book-info"]/h4/a/text()').extract_first('')
            print(author)
            intro=li.xpath('.//p[@class="intro"]/text()').extract_first('')
            print(intro)
            item=HonghongxiuxiuItem()
            item['img']=img
            item['name']=name
            item['author']=author
            item['intro']=intro
            yield item









# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import sqlite3
class HonghongxiuxiuPipeline(object):
    def process_item(self, item, spider):
        return item
class HongXiuDBPipeline(object):
    def open_spider(self,spider):
        self.connect=sqlite3.connect('hongxiuDB')
        self.cursor=self.connect.cursor()
        self.cursor.execute('create table if not exists bookTable(name text,author text,img text, intro text)')
        self.connect.commit()
    def process_item(self,item,spider):
        self.cursor.execute('insert into bookTable (name,author,img,intro)VALUES ("{}","{}","{}","{}")'.format(item['name'],item['author'],item['img'],item['intro']))
        self.connect.commit()
        # return the item so the next pipeline still receives it
        return item
    def close_spider(self,spider):
        self.cursor.close()
        self.connect.close()
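
To check what the pipeline wrote after a crawl, a quick standalone sketch (it assumes the crawl has already created the hongxiuDB file in the working directory; the table and columns come from the pipeline above):

import sqlite3

connect = sqlite3.connect('hongxiuDB')
cursor = connect.cursor()
cursor.execute('select name, author, intro from bookTable limit 5')
for row in cursor.fetchall():
    print(row)
connect.close()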






ITEM_PIPELINES = {
   'honghongxiuxiu.pipelines.HonghongxiuxiuPipeline': 300,
    'honghongxiuxiu.pipelines.HongXiuDBPipeline':1
}










Project requirement: scrape the job data and normalize the salary units.

# -*- coding: utf-8 -*-
import scrapy
# https://search.51job.com/list/170200,000000,0000,00,9,99,java,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=
from ..items import JobItem
class JavaSpider(scrapy.Spider):
    name = 'zhiye'
    allowed_domains = ['search.51job.com']
    start_urls = ['https://search.51job.com/list/170200,000000,0000,00,9,99,java,2,1.html?','https://search.51job.com/list/170200,000000,0000,00,9,99,python,2,1.html?']

    def parse(self, response):
        # 1. extract all data on the current page and store it in items
        # 2. get the next-page link and request it

        div_list=response.xpath('//div[@id="resultList"]/div[@class="el"]')
        for div in div_list:
            # contains(): matches as long as the attribute contains the value
            jobname=div.xpath('.//p[contains(@class,"t1")]/span/a/@title').extract_first('')
            print(jobname)
            # company name
            companyname=div.xpath('.//span[@class="t2"]/a/@title').extract_first('')
            print(companyname)
            cityname=div.xpath('.//span[@class="t3"]/text()').extract_first('')
            print(cityname)
            salary=div.xpath('.//span[@class="t4"]/text()').extract_first('')
            print(salary)
            min_salary=0
            max_salary=0
            if u'年' in salary:
                money = salary.split('万')[0].split('-')
                min_salary=int(money[0])/12
                min_salary='%.1f'% min_salary
                max_salary='%.1f'% (int(money[1])/12)
            elif u'万' in salary:
                money =salary.split('万')[0].split('-')
                min_salary=money[0]
                max_salary=money[1]
            elif u'千' in salary:
                money =salary.split('千')[0]
                if '-' in money:
                    min_salary=float(money.split('-')[0])*0.1
                    max_salary=float(money.split('-')[1])*0.1
                else:
                    min_salary=0
                    max_salary=float(money)*0.1
            elif u'日' in salary:
                money=salary.split('元')
                min_salary=0
                max_salary=int(money[0])*30/10000
            else:
                min_salary=0
                max_salary=0
            date=div.xpath('.//span[@class="t5"]/text()').extract_first('')
            item = JobItem()
            item['jobname']=jobname
            item['companyname'] = companyname
            item['cityname'] = cityname
            item['min_salary'] = min_salary
            item['max_salary'] = max_salary
            item['date'] = date
            yield item

        next_url=response.xpath('//li[@class="bk"]/a[text()="下一页"]/@href').extract()
        print('**********************************************')
        print(next_url)
        if len(next_url) != 0:
            print(next_url[0])
            yield scrapy.Request(url=next_url[0],callback=self.parse)
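
The salary branches above are easier to test outside Scrapy. A minimal standalone sketch (Python 3) of the same normalization; the example input formats are assumptions about what 51job returns, and everything is converted to 万 per month:

def normalize_salary(salary):
    # returns (min_salary, max_salary), mirroring the branches in parse()
    min_salary, max_salary = 0, 0
    if u'年' in salary:                      # e.g. '15-20万/年'
        money = salary.split('万')[0].split('-')
        min_salary = '%.1f' % (int(money[0]) / 12)
        max_salary = '%.1f' % (int(money[1]) / 12)
    elif u'万' in salary:                    # e.g. '1-1.5万/月'
        money = salary.split('万')[0].split('-')
        min_salary, max_salary = money[0], money[1]
    elif u'千' in salary:                    # e.g. '6-8千/月'
        money = salary.split('千')[0]
        if '-' in money:
            min_salary = float(money.split('-')[0]) * 0.1
            max_salary = float(money.split('-')[1]) * 0.1
        else:
            max_salary = float(money) * 0.1
    elif u'日' in salary:                    # e.g. '150元/日'
        max_salary = int(salary.split('元')[0]) * 30 / 10000
    return min_salary, max_salary

print(normalize_salary('1-1.5万/月'))   # ('1', '1.5')
print(normalize_salary('6-8千/月'))     # roughly (0.6, 0.8)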

Project requirement: scrape the data and store it in MySQL.

# -*- coding: utf-8 -*-
import scrapy
from ..items import DianyingItem

class TiantangSpider(scrapy.Spider):
    name = 'tiantang'
    allowed_domains = ['ygdy8.net']
    start_urls = ['http://www.ygdy8.net/html/gndy/index.html']

    def parse(self, response):
        detail_list=response.xpath('//div[@class="co_area2"]//tr')
        for detail in detail_list:
            url = 'http://www.ygdy8.net'+detail.xpath('.//td[1]/a[2]/@href').extract_first('')
            print(url)
            yield scrapy.Request(url=url,callback=self.detail_info)
    def detail_info(self,response):
        title=response.xpath('//div[@class="title_all"]//font/text()').extract_first('')
        href =response.xpath('//tr[@style="WORD-WRAP : break-word"]/a/@href').extract_first('')
        print(href)
        item=DianyingItem()
        item['title']=title
        item['href']=href
        yield item






import pymysql
class DianyingPipeline(object):
    def __init__(self):
        self.connect=pymysql.connect(host='localhost',user='root',password='666666',db='movie',port=3306)
        self.cursor=self.connect.cursor()
    def process_item(self, item, spider):
        self.cursor.execute('insert into movieTable(title,href)VALUES ("{}","{}")'.format(item['title'],item['href']))
        self.connect.commit()
        return item
    def close_spider(self,spider):
        self.cursor.close()
        self.connect.close()



ITEM_PIPELINES = {
   'dianying.pipelines.DianyingPipeline': 300,
}
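
The pipeline assumes the movie database and movieTable already exist. A minimal one-off sketch to create them with the same credentials; the column types here are assumptions, the pipeline only needs title and href columns:

import pymysql

connect = pymysql.connect(host='localhost', user='root', password='666666', port=3306)
cursor = connect.cursor()
cursor.execute('create database if not exists movie default character set utf8mb4')
cursor.execute('use movie')
cursor.execute('create table if not exists movieTable('
               'id int primary key auto_increment,'
               'title varchar(255),'
               'href varchar(1000))')
connect.commit()
cursor.close()
connect.close()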










Project requirement: save the scraped data as JSON and download the images.

 

# -*- coding: utf-8 -*-
import scrapy

from ..items import HongxiuItem
class HongxiuxiuSpider(scrapy.Spider):
    name = 'hongxiuxiu'
    allowed_domains = ['hongxiu.com']
    start_urls = ['https://www.hongxiu.com/all?catId=30008']

    def parse(self, response):
        li_list=response.xpath('//div[@class="right-book-list"]/ul/li')
        print(li_list)
        for li in li_list:
            img = 'https:'+li.xpath('.//div[@class="book-img"]/a/img/@src').extract_first('')
            title=li.xpath('.//div[@class="book-img"]/a/img/@alt').extract_first('')
            author=li.xpath('.//div[@class="book-info"]/h4/a/text()').extract_first('')
            intro=li.xpath('.//div[@class="book-info"]/p[@class="intro"]/text()').extract_first('')
            item=HongxiuItem()
            item["img"]=[img]
            item["title"]=title
            item["author"]=author
            item["intro"]=intro
            yield item






import scrapy
import codecs
import os
import json
from scrapy.pipelines.images import ImagesPipeline
class HongxiuPipeline(object):
    def __init__(self):
        self.file=codecs.open(filename='hongxiu.json',mode='w+',encoding='utf-8')

    def process_item(self, item, spider):
        res=dict(item)
        str=json.dumps(res,ensure_ascii=False)
        self.file.write(str)
        self.file.write('\n')
        return item
class HongXiuDownloadPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        url=item['img'][0]
        yield scrapy.Request(url=url,meta={'item':item})

    def file_path(self, request, response=None, info=None):
        item=request.meta['item']
        bookname=item['title']
        path= bookname+'.jpg'
        return path






ITEM_PIPELINES = {
   'hongxiu.pipelines.HongxiuPipeline': 300,
    'hongxiu.pipelines.HongXiuDownloadPipeline':1

}
IMAGES_STORE='imgggg'



Project requirement: get the total number of result pages and their links for the Python and Java jobs on 51job.

# -*- coding: utf-8 -*-
import scrapy
import re

class JobSpider(scrapy.Spider):
    name = 'job'
    allowed_domains = ['51job.com']
    start_urls = ['https://search.51job.com/list/170200,000000,0000,00,9,99,java,2,1.html','https://search.51job.com/list/170200,000000,0000,00,9,99,Python,2,1.html']
    def parse(self, response):
        total_page=response.xpath('//div[@class="p_in"]/span[1]/text()').extract_first('')
        print(total_page)
        # use a regex to pull the digits out of the page-count text
        res=re.compile(r'\d+')
        # findall returns a list of matches; take the first one
        result=re.findall(res,total_page)[0]
        print(result)
        # the url that was requested
        url=response.url
        print(url)
        if 'java' in url:
            for page in range(1,int(result)+1):
                java_url='https://search.51job.com/list/170200,000000,0000,00,9,99,java,2,{}.html'.format(page)
                yield scrapy.Request(url=java_url,callback=self.get_detail_with_page)

        else:
            for page in range(1,int(result)+1):
                python_url='https://search.51job.com/list/170200,000000,0000,00,9,99,Python,2,{}.html'.format(page)
                yield scrapy.Request(url=python_url,callback=self.get_detail_with_page)
    def get_detail_with_page(self,response):
        print(response.url)

Project requirement: scrape dynamically rendered data and save it to an Excel file. First, add the following code to the downloader middleware:

from scrapy.http.response.html import HtmlResponse
from scrapy import signals
class taobaospidermiddleware(object):
    def process_request(self,request,spider):
        if spider.name=='shishang':
            spider.driver.get(request.url)
            spider.driver.implicitly_wait(10)
            response=HtmlResponse(url=spider.driver.current_url,
                                  request=request,
                                  body=spider.driver.page_source,
                                  encoding='utf-8')
            return response


Then put the following code in the spider file:

# -*- coding: utf-8 -*-
import scrapy
from ..items import TaobaoItem
from selenium import webdriver
class ShishangSpider(scrapy.Spider):
    name = 'shishang'
    allowed_domains = ['taobao.com']
    start_urls = ['https://s.taobao.com/search?q=%E6%97%B6%E5%B0%9A%E7%94%B7%E9%9E%8B&imgfile=&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index&spm=a21bo.2017.201856-taobao-item.1&ie=utf8&initiative_id=tbindexz_20170306&bcoffset=6&ntoffset=6&p4ppushleft=1%2C48&s=0']
    def __init__(self):
        self.driver=webdriver.PhantomJS()
    def parse(self, response):

        content_list=response.xpath('//div[@class="ctx-box J_MouseEneterLeave J_IconMoreNew"]')
        for x in content_list:
            name=x.xpath('.//div[@class="row row-2 title"]/a').xpath('string(.)').extract()[0].strip('\n').replace(' ','').strip('\n')
            price=x.xpath('.//div[@class="price g_price g_price-highlight"]/strong/text()').extract_first('')
            dian_name=x.xpath('.//div[@class="shop"]/a/span[2]/text()').extract_first('')
            item=TaobaoItem()
            item['name']=name
            item['price']=price
            item['dian_name']=dian_name
            yield item
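
One thing the spider above never does is quit the PhantomJS process it starts in __init__. Scrapy calls a spider's closed() method when the crawl finishes, so a small addition like this inside ShishangSpider (a sketch, not part of the original code) takes care of it:

    def closed(self, reason):
        # shut down the headless browser started in __init__
        self.driver.quit()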

The items.py code is omitted here.

Next, the pipeline file writes the data to Excel:

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

from openpyxl import Workbook
class TaobaoPipeline(object):
    def __init__(self):
        self.wb=Workbook()
        self.ws=self.wb.active
        self.ws.append(['名称','价格','店铺'])
    def process_item(self, item, spider):
        line=[item['name'],item['price'],item['dian_name']]
        self.ws.append(line)
        self.wb.save('时尚男鞋.xlsx')
        return item
    def close_spider(self, spider):
        pass
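
Saving the workbook on every item works, but it rewrites the whole .xlsx file once per scraped product. A variant sketch that appends rows as items arrive and writes the file only once, when the spider closes:

from openpyxl import Workbook

class TaobaoPipeline(object):
    def __init__(self):
        self.wb = Workbook()
        self.ws = self.wb.active
        self.ws.append(['名称', '价格', '店铺'])

    def process_item(self, item, spider):
        self.ws.append([item['name'], item['price'], item['dian_name']])
        return item

    def close_spider(self, spider):
        # write the workbook a single time at the end of the crawl
        self.wb.save('时尚男鞋.xlsx')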

Finally, in settings.py uncomment and edit the downloader middleware and item pipeline settings:

   

DOWNLOADER_MIDDLEWARES = {
   'taobao.middlewares.taobaospidermiddleware': 543,
}








ITEM_PIPELINES = {
   'taobao.pipelines.TaobaoPipeline': 300,
}

 
