Paginated crawling with Scrapy

This article is for learning purposes only; do not use it for commercial purposes.


Goal:

Crawl recipe content from meishichina.com. One of the entry URLs (there are several such category index pages; the full list appears in the spider below): http://home.meishichina.com/recipe/liangcai/#utm_source=recipe_index_tags_type

Example detail page: http://home.meishichina.com/recipe-262879.html


1. Create the project

scrapy startproject meishi
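
Running the command generates the standard Scrapy project skeleton (shown here for orientation; these are the files edited in the following steps):

meishi/
    scrapy.cfg            # deploy configuration
    meishi/
        __init__.py
        items.py          # item definitions (step 2)
        pipelines.py      # item pipeline (step 5)
        settings.py       # project settings (step 6)
        spiders/
            __init__.py   # ListSpider.py and rotate_useragent.py go here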


2. Create the Item that stores the crawl results

cat items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

from scrapy.item import Item, Field


class MeishiItem(Item):
    url = Field()      # recipe detail page URL
    title = Field()    # recipe title
    img_url = Field()  # main image URL
    detail = Field()   # ingredients/categories, a list of {key, value} dicts
    steps = Field()    # cooking steps, "text\timage_url" strings
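
An Item behaves like a dict, so a quick interactive sanity check (values borrowed from the sample output in step 8) looks like this:

from meishi.items import MeishiItem

item = MeishiItem()
item['title'] = u'苦菊拌花生米'
item['url'] = 'http://home.meishichina.com/recipe-261887.html'
print dict(item)   # {'url': 'http://...', 'title': u'...'}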



3. Core spider logic

cat spiders/ListSpider.py

#-*-coding:utf-8-*-
import sys
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from meishi.items import MeishiItem

# Python 2 hack: make utf-8 the default codec so mixed str/unicode
# operations do not raise UnicodeDecodeError
reload(sys)
sys.setdefaultencoding("utf-8")


class ListSpider(CrawlSpider):
    # spider name, used by "scrapy crawl"
    name = "ListSpider"
    # download delay in seconds, to crawl politely
    download_delay = 1
    # restrict the crawl to this domain
    allowed_domains = ["meishichina.com"]
    # category index pages to start from
    start_urls = [
        "http://home.meishichina.com/recipe/liangcai/#utm_source=recipe_index_tags_type",
        "http://home.meishichina.com/recipe/recai/#utm_source=recipe_collect_tags_type",
        "http://home.meishichina.com/recipe/huncai/#utm_source=recipe_collect_tags_type",
        "http://home.meishichina.com/recipe/sucai/#utm_source=recipe_collect_tags_type",
        "http://home.meishichina.com/recipe/kaiweicai/#utm_source=recipe_collect_tags_type",
        "http://home.meishichina.com/recipe/jiachang/#utm_source=recipe_collect_tags_type",
        "http://home.meishichina.com/recipe/sifangcai/#utm_source=recipe_collect_tags_type",
        "http://home.meishichina.com/recipe/tanggeng/#utm_source=recipe_collect_tags_type",
        "http://home.meishichina.com/recipe/xiaochi/#utm_source=recipe_collect_tags_type"
    ]
    # crawl rules: a Rule without a callback only follows the matching links
    # recursively (here: pagination); recipe detail pages go to parse_content
    rules = (
        Rule(LinkExtractor(allow=('page/[0-9]+', ))),
        Rule(LinkExtractor(allow=('recipe-[0-9]+', )), callback='parse_content'),
    )

    # parse one recipe detail page into a MeishiItem
    def parse_content(self, response):
        item = MeishiItem()

        # current page URL
        item['url'] = response.url

        # recipe title, stored in a hidden input field on the page
        item['title'] = response.selector.xpath(
            "//div[@class='recipDetail']/input[@id='recipe_title']/@value"
        )[0].extract()

        # main recipe image URL
        item['img_url'] = response.selector.xpath(
            "//div[@class='recipDetail']/div[@id='recipe_De_imgBox']"
            "/a[@class='J_photo']/img/@src"
        )[0].extract()

        # ingredients and categories: each block has a label column (key)
        # and a content column (value)
        detail = []
        mts = response.selector.xpath(
            "//div[@class='recipDetail']/div[@class='recipeCategory clear']"
            "/div[@class='recipeCategory_sub clear']"
        )
        for sel in mts:
            temp_map = {}
            mt = sel.xpath("div")
            temp_map['key'] = mt[0].xpath("text()")[0].extract()
            s1 = ""
            for mt_r in mt[1].xpath("ul/li | div | div/a"):
                # ingredient or category name
                name = "".join(
                    ss.extract().replace(" ", "").replace("\t", "").replace("\r\n", "")
                    for ss in mt_r.xpath(
                        "span[@class='category_s1']/a/text() | "
                        "span[@class='category_s1']/text() | text()"
                    )
                )
                # amount, e.g. "(1棵)"
                amount = "".join(
                    ss.extract().replace(" ", "").replace("\t", "").replace("\r\n", "")
                    for ss in mt_r.xpath("span[@class='category_s2']/text()")
                )
                s1 = "%s\t%s %s" % (s1, name, amount)
            temp_map['value'] = s1
            detail.append(temp_map)
        item['detail'] = detail

        # cooking steps: step text plus step image URL, tab separated
        steps_list = []
        steps = response.selector.xpath(
            "//div[@class='recipDetail']/div[@class='recipeStep']/ul/li"
        )
        for sel in steps:
            s_url = "".join(
                ss.extract().replace(" ", "").replace("\t", "").replace("\r\n", "")
                for ss in sel.xpath("div[@class='recipeStep_img']/img/@src")
            )
            s_content = "".join(
                ss.extract().replace(" ", "").replace("\t", "").replace("\r\n", "").strip()
                for ss in sel.xpath("div[@class='recipeStep_word']/text()")
            )
            steps_list.append("%s\t%s" % (s_content, s_url))
        item['steps'] = steps_list

        yield item
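
The XPath expressions above can be verified interactively before running the full crawl, using Scrapy's built-in shell against the sample detail page:

scrapy shell 'http://home.meishichina.com/recipe-262879.html'

>>> response.xpath("//div[@class='recipDetail']/input[@id='recipe_title']/@value")[0].extract()
>>> len(response.xpath("//div[@class='recipDetail']/div[@class='recipeStep']/ul/li"))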



4. Rotate the browser User-Agent

cat spiders/rotate_useragent.py

# -*-coding:utf-8-*-
"""One anti-ban measure: rotate through a pool of user agent strings.

Note: the middleware must be registered in settings.py (see step 6).
"""

import random

from scrapy import log
from scrapy.contrib.downloadermiddleware.useragent import UserAgentMiddleware


class RotateUserAgentMiddleware(UserAgentMiddleware):

    def __init__(self, user_agent=''):
        self.user_agent = user_agent

    def process_request(self, request, spider):
        # pick a random user agent for every outgoing request
        ua = random.choice(self.user_agent_list)
        if ua:
            # show the user agent currently in use
            print "********Current UserAgent:%s************" % ua

            # and record it in the log
            log.msg('Current UserAgent: ' + ua, level=log.INFO)
            request.headers.setdefault('User-Agent', ua)

    # the default user_agent_list contains Chrome, IE, Firefox, Mozilla,
    # Opera and Netscape strings; more can be found at
    # http://www.useragentstring.com/pages/useragentstring.php
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
        "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
    ]
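
The middleware can also be exercised outside a crawl; a minimal sketch (constructing a bare Request purely for local testing):

from scrapy.http import Request
from meishi.spiders.rotate_useragent import RotateUserAgentMiddleware

mw = RotateUserAgentMiddleware()
req = Request('http://home.meishichina.com/recipe-262879.html')
mw.process_request(req, spider=None)
print req.headers['User-Agent']   # one of the strings from user_agent_list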



5. Set up storage for the crawl results

cat pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
import codecs


class MeishiPipeline(object):
    def __init__(self):
        # one JSON object per line, written as utf-8 text
        self.file = codecs.open('data.json', mode='w', encoding='utf-8')

    def process_item(self, item, spider):
        # json.dumps escapes non-ASCII as \uXXXX; decoding with
        # unicode_escape turns it back into readable text
        line = json.dumps(dict(item)) + "\n"
        self.file.write(line.decode("unicode_escape"))
        return item

    def close_spider(self, spider):
        # close the output file when the spider finishes
        self.file.close()
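
For simply dumping items to a file, Scrapy's built-in feed exports are an alternative to a custom pipeline; the crawl in step 7 could instead be started with the -o flag, which writes one JSON object per line to a .jl (JSON Lines) file:

scrapy crawl ListSpider -o data.jl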


6. Configure the crawler

cat settings.py
# -*- coding: utf-8 -*-


# Scrapy settings for meishi project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#


BOT_NAME = 'meishi'


SPIDER_MODULES = ['meishi.spiders']
NEWSPIDER_MODULE = 'meishi.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'meishi (+http://www.yourdomain.com)'


# disable cookies to reduce the chance of being banned
COOKIES_ENABLED = False


# register the pipeline that writes items to data.json
ITEM_PIPELINES = {
    'meishi.pipelines.MeishiPipeline': 300
}


# maximum crawl depth
DEPTH_LIMIT = 100


# disable the default UserAgentMiddleware and enable the rotating one
DOWNLOADER_MIDDLEWARES = {
    'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None,
    'meishi.spiders.rotate_useragent.RotateUserAgentMiddleware': 400
}
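
The effective settings can be checked from the project directory with the standard Scrapy CLI before launching a crawl:

scrapy settings --get DEPTH_LIMIT
scrapy settings --get COOKIES_ENABLED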


7. Run the crawler

scrapy crawl ListSpider
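
For a quick smoke test it helps to cap the run; -s overrides any setting from the command line, and CLOSESPIDER_ITEMCOUNT (handled by the built-in CloseSpider extension) stops the spider after the given number of items:

scrapy crawl ListSpider -s CLOSESPIDER_ITEMCOUNT=20 -s LOG_LEVEL=INFO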


8. Crawl results

head -n 2 data.json
{"url": "http://home.meishichina.com/recipe-261887.html", "img_url": "http://i3.meishichina.com/attachment/recipe/2016/04/24/p800_2016042414614586261017519551.jpg", "steps": ["准备好材料http://i3.meishichina.com/attachment/recipe/2016/04/24/p320_2016042414614591577587519551.jpg", "苦菊切成适当长度http://i3.meishichina.com/attachment/recipe/2016/04/24/p320_2016042414614592794477519551.jpg", "花生米炒熟,注意不要炒火太大http://i3.meishichina.com/attachment/recipe/2016/04/24/p320_2016042414614593554077519551.jpg", "辣椒切成段,加油炒香http://i3.meishichina.com/attachment/recipe/2016/04/24/p320_2016042414614594432247519551.jpg", "把所有处理好的食材放到一起http://i3.meishichina.com/attachment/recipe/2016/04/24/p320_2016042414614596522347519551.jpg", "把所有调料放进去,拌匀即可http://i3.meishichina.com/attachment/recipe/2016/04/24/p320_2016042414614596429577519551.jpg", "又是美美哒一餐http://i3.meishichina.com/attachment/recipe/2016/04/24/p320_2016042414614596953887519551.jpg"], "detail": [{"value": "苦菊 (1棵)花生米 (适量)", "key": "主料"}, {"value": "醋 (2大勺)白糖 (2大勺)盐 (1勺)干辣椒 (2个)食用油 (2勺)香油 (适量)", "key": "辅料"}, {"value": "其它 ", "key": "厨具"}, {"value": "家常菜凉菜清淡廿分钟 简单难度 ", "key": "分类"}], "title": "苦菊拌花生米"}
{"url": "http://home.meishichina.com/recipe-261910.html", "img_url": "http://i3.meishichina.com/attachment/recipe/2016/04/24/p800_20160424q1hjintc10emansp.JPG", "steps": ["先将木兰牙用开水焯变色,时间太久口感不好http://i3.meishichina.com/attachment/recipe/2016/04/24/p320_20160424y80dehh1k3bovmw9.JPG", "过凉水捞出,挤干水份,切段http://i3.meishichina.com/attachment/recipe/2016/04/24/p320_20160424uvac9t6uco351qdg.JPG", "准备好小葱干辣椒蒜,没有小葱用大葱代替http://i3.meishichina.com/attachment/recipe/2016/04/24/p320_20160424bazvyt9gp1z1ewq1.JPG", "将小葱干辣椒蒜切碎,(葱建议不要切太小)http://i3.meishichina.com/attachment/recipe/2016/04/24/p320_20160424uafkca0eeauzefie.JPG", "准备好所有调料,醋,盐,味精,花椒,葱,辣椒,蒜,少许香油http://i3.meishichina.com/attachment/recipe/2016/04/24/p320_20160424brub4oz93ohmrp12.jpg", "将准备好的所有调料放入木兰牙上先不要搅拌http://i3.meishichina.com/attachment/recipe/2016/04/24/p320_201604242oii6pe48zn350nt.JPG", "热锅放入油,等锅稍微冒烟将热油倒入在木兰牙上(一定要油熟透在倒入)凉油会有味道,过熟油主要是把调料的味道炝出来http://i3.meishichina.com/attachment/recipe/2016/04/24/p320_20160424bxedige0sxgv9mzv.JPG", "搅拌均匀装盘http://i3.meishichina.com/attachment/recipe/2016/04/24/p320_2016042495zyg196d70w2rfd.JPG"], "detail": [{"value": "木兰牙 (300克)蒜 (少许)", "key": "主料"}, {"value": "小葱 (20克)醋 (少许)干辣椒 (少许)香油 (少许)热油 (少许)盐花椒味精 (适量)", "key": "辅料"}, {"value": "炒锅 ", "key": "厨具"}, {"value": "家常菜凉菜微辣十分钟 普通难度 ", "key": "分类"}], "title": "凉拌木兰牙"}




