A simple example of crawling a website with CrawlSpider

1 Create the project: scrapy startproject shetu   (here I'm crawling 摄图网, 699pic.com)

2 cd into the project directory

3 Create the spider: scrapy genspider -t crawl <spider name> <domain>
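For this project the spider defined in step 7 is named image and the site's domain is 699pic.com, so the concrete command sequence would look like this:

scrapy startproject shetu
cd shetu
scrapy genspider -t crawl image 699pic.com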

4 Define the item fields in items.py according to your own needs

import scrapy


class ShetuItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    img_name = scrapy.Field()
    img_url = scrapy.Field()
    img_time = scrapy.Field()
    img_looknum = scrapy.Field()
    img_collect = scrapy.Field()
    img_down = scrapy.Field()

5 In pipelines.py, write the scraped items to a text file

import json

class ShetuPipeline(object):
    def open_spider(self, spider):  # open the output file when the spider starts
        self.fp = open('shetu.txt', 'w', encoding='utf8')
    def close_spider(self, spider):  # close the file when the spider finishes
        self.fp.close()
    def process_item(self, item, spider):  # serialize each item as one JSON line
        dic = dict(item)
        string = json.dumps(dic, ensure_ascii=False)
        self.fp.write(string + "\n")
        return item
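Because each item is written as one JSON object per line, the output file can be read back line by line. A minimal sketch, assuming the shetu.txt produced by the pipeline above:

import json

with open('shetu.txt', encoding='utf8') as fp:
    for line in fp:
        item = json.loads(line)  # one crawled image per line
        print(item['img_name'], item['img_url'])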

6 In settings.py, add the following configuration

USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'  # set the User-Agent
ROBOTSTXT_OBEY = False  # do not obey robots.txt
DOWNLOAD_DELAY = 3  # delay (in seconds) between requests
ITEM_PIPELINES = {
   'shetu.pipelines.ShetuPipeline': 300,
}  # enable the pipeline

7 Finally, the spider image.py    (the "image" spider generated by the genspider command above)

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from shetu.items import ShetuItem

class ImageSpider(CrawlSpider):
    name = 'image'
    allowed_domains = ['699pic.com']
    start_urls = ['http://699pic.com/people.html']

    # links to each image's detail page
    xiangxi_link = LinkExtractor(restrict_xpaths='//div[@class="list"]/a')
    # pagination links (defined here, but not followed by the rules below)
    page_link = LinkExtractor(restrict_xpaths='//div[@class="pager-linkPage"]/a')
    rules = (
        Rule(xiangxi_link, callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        item = ShetuItem()
        # image title
        img_name = response.xpath('//div[@class="photo-view"]/h1/text()')[0].extract()
        # image URL
        img_url = response.xpath('//div[@class="huabu"]/a/img/@src')[0].extract()
        # publication time
        img_time = response.xpath('//div[@class="photo-view"]/div/span[@class="publicityt"]/text()')[0].extract()
        # view count
        img_looknum = response.xpath('//div[@class="photo-view"]/div/span[@class="look"]/read/text()')[0].extract()
        # favourite count
        img_collect = response.xpath('//div[@class="photo-view"]/div/span[@class="collect"]/text()')[0].extract().strip(
            "收藏")
        # download count
        img_down = response.xpath('//div[@class="photo-view"]/div/span[@class="download"]/text()')[1].extract().strip(
            "\n\t下载")

        for field in item.fields.keys():  # copy each local variable with the same name into the item
            item[field] = eval(field)
        yield item
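If you also want the spider to walk through the listing pages, the page_link extractor defined above can be wired into a second Rule. A minimal sketch replacing the rules tuple, assuming the pagination XPath still matches the site:

    rules = (
        Rule(xiangxi_link, callback='parse_item', follow=False),  # detail pages -> parse_item
        Rule(page_link, follow=True),  # keep following pagination links
    )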

8 Run the spider: scrapy crawl image

9 The scraped results are written to shetu.txt, one JSON line per image.
