本文仅供学习,不要用作商业用途
目的:
爬取某网站内容,入口URL:http://home.meishichina.com/recipe/liangcai/#utm_source=recipe_index_tags_type (类似的分类入口URL还有多个,见下文 start_urls)
详细内容页面:http://home.meishichina.com/recipe-262879.html
1.创建项目
scrapy startproject tutorial
2.创建item,存储爬取结果
cat items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
from scrapy.item import Item, Field
class MeishiItem(Item):
    """Container for one scraped recipe from meishichina.com."""
    url = scrapy.Field()      # detail-page URL of the recipe
    title = scrapy.Field()    # recipe title
    img_url = scrapy.Field()  # main photo URL
    detail = scrapy.Field()   # ingredient/category groups (list of {key, value} dicts)
    steps = scrapy.Field()    # cooking steps, each "text\timage_url"
3.爬虫核心逻辑
cat spiders/ListSpider.py
#-*-coding:utf-8-*-
import scrapy
import sys
import time
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from scrapy.selector import Selector
from scrapy.item import Item
from meishi.items import MeishiItem
# Python 2-only hack: reload(sys) restores setdefaultencoding (which
# site.py deletes) and forces UTF-8 so implicit str<->unicode
# conversions of the Chinese page content don't raise UnicodeDecodeError.
reload(sys)
sys.setdefaultencoding("utf-8")
class ListSpider(CrawlSpider):
    """Crawl meishichina.com recipe listings and extract one MeishiItem
    per recipe detail page (recipe-<id>.html)."""
    # Spider name, used by `scrapy crawl ListSpider`.
    name = "ListSpider"
    # Per-request download delay in seconds (politeness / anti-ban).
    download_delay = 1
    # Restrict the crawl to this domain.
    allowed_domains = ["meishichina.com"]
    # Seed URLs: one listing page per recipe category.
    start_urls = [
        "http://home.meishichina.com/recipe/liangcai/#utm_source=recipe_index_tags_type",
        "http://home.meishichina.com/recipe/recai/#utm_source=recipe_collect_tags_type",
        "http://home.meishichina.com/recipe/huncai/#utm_source=recipe_collect_tags_type",
        "http://home.meishichina.com/recipe/sucai/#utm_source=recipe_collect_tags_type",
        "http://home.meishichina.com/recipe/kaiweicai/#utm_source=recipe_collect_tags_type",
        "http://home.meishichina.com/recipe/jiachang/#utm_source=recipe_collect_tags_type",
        "http://home.meishichina.com/recipe/sifangcai/#utm_source=recipe_collect_tags_type",
        "http://home.meishichina.com/recipe/tanggeng/#utm_source=recipe_collect_tags_type",
        "http://home.meishichina.com/recipe/xiaochi/#utm_source=recipe_collect_tags_type"
    ]
    # Crawl rules: a Rule without a callback just follows matching links
    # recursively (pagination); the one with a callback parses recipe
    # detail pages.
    rules = (
        Rule(LinkExtractor(allow=('page/[0-9]+', ))),
        Rule(LinkExtractor(allow=('recipe-[0-9]+', )), callback='parse_content'),
    )

    def parse_content(self, response):
        """Parse one recipe detail page and yield a populated MeishiItem."""
        item = MeishiItem()
        # URL of the page currently being parsed.
        #print response.url
        item['url'] = response.url
        # Recipe title: taken from the hidden <input id="recipe_title"> value.
        title = response.selector.xpath("//div[@class='recipDetail']/input[@id='recipe_title']/@value")[0].extract().decode('utf-8')
        print title
        item['title'] = title
        # Main photo URL of the recipe.
        img_url = response.selector.xpath("//div[@class='recipDetail']/div[@id='recipe_De_imgBox']/a[@class='J_photo']/img/@src")[0].extract().decode('utf-8')
        #print img_url
        item['img_url'] = img_url
        # Ingredient / category groups: each recipeCategory_sub div holds a
        # label (e.g. 主料/辅料) and its entries.
        detail = []
        mts = response.selector.xpath("//div[@class='recipDetail']/div[@class='recipeCategory clear']/div[@class='recipeCategory_sub clear']")
        for sel in mts:
            temp_map = {}
            mt = sel.xpath("div")
            # First child div is the group label.
            mt_l = mt[0].xpath("text()")[0].extract().decode('utf-8')
            temp_map['key'] = mt_l
            s1 = ""
            # Second child div lists the entries; concatenate name (category_s1)
            # and amount (category_s2), stripping spaces/tabs/newlines.
            for mt_r in mt[1].xpath("ul/li | div | div/a "):
                s1 = "%s\t%s %s" % (s1 , "".join([ ss.extract().decode('utf-8').replace(" ","").replace("\t","").replace("\r\n","") for ss in mt_r.xpath("span[@class='category_s1']/a/text() | span[@class='category_s1']/text() | text() ") ]) , "".join([ ss.extract().decode('utf-8').replace(" ","").replace("\t","").replace("\r\n","") for ss in mt_r.xpath("span[@class='category_s2']/text()")]) )
            #print s1
            temp_map['value'] = s1
            detail.append(temp_map)
        item['detail'] = detail
        # Cooking steps: one <li> per step, pairing step text with its image URL.
        steps_list = []
        steps = response.selector.xpath("//div[@class='recipDetail']/div[@class='recipeStep']/ul/li")
        for sel in steps:
            s_url = "%s" % ( "".join( [ ss.extract().decode('utf-8').replace(" ","").replace("\t","").replace("\r\n","") for ss in sel.xpath("div[@class='recipeStep_img']/img/@src")]) )
            s_content = "%s" % ( "".join( [ ss.extract().decode('utf-8').replace(" ","").replace("\t","").replace("\r\n","").strip() for ss in sel.xpath("div[@class='recipeStep_word']/text()")]) )
            #print "%s:%s" % (s_content,s_url)
            steps_list.append("%s\t%s" % (s_content,s_url))
        item['steps'] = steps_list
        yield item
4.模拟浏览器UA
cat spiders/rotate_useragent.py
# -*-coding:utf-8-*-
from scrapy import log
"""避免被ban策略之一:使用useragent池。
使用注意:需在settings.py中进行相应的设置。
"""
import random
from scrapy.contrib.downloadermiddleware.useragent import UserAgentMiddleware
class RotateUserAgentMiddleware(UserAgentMiddleware):
def __init__(self,user_agent=''):
self.user_agent = user_agent
def process_request(self,request, spider):
ua = random.choice(self.user_agent_list)
if ua:
#显示当前使用的useragent
print "********Current UserAgent:%s************" %ua
#记录
log.msg('Current UserAgent: '+ua, level='INFO')
request.headers.setdefault('User-Agent', ua)
#the default user_agent_list composes chrome,I E,firefox,Mozilla,opera,netscape
#for more user agent strings,you can find it in http://www.useragentstring.com/pages/useragentstring.php
user_agent_list = [
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
"(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
"Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
"(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
"(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
"(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
"(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
"(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
"(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
"(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
"(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]
5.设置爬取结果存取
cat pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
import codecs
class MeishiPipeline(object):
    """Write each scraped item to data.json, one JSON object per line."""

    def __init__(self):
        # codecs handles the UTF-8 encoding of unicode text on write.
        self.file = codecs.open('data.json', mode='w', encoding='utf-8')

    def process_item(self, item, spider):
        """Serialize *item* as one JSON line and return it unchanged.

        ensure_ascii=False keeps CJK characters readable in the output
        instead of \\uXXXX escapes. This replaces the fragile
        .decode("unicode_escape") round-trip of the original, which
        corrupted any data containing literal backslash sequences.
        """
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item

    def close_spider(self, spider):
        # Flush and release the file handle when the crawl finishes
        # (the original leaked it).
        self.file.close()
6.配置爬虫
rolindeMacBook-Pro:meishi rolin$ cat settings.py
# -*- coding: utf-8 -*-
# Scrapy settings for meishi project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
#
# Project identity and where Scrapy discovers spiders.
BOT_NAME = 'meishi'
SPIDER_MODULES = ['meishi.spiders']
NEWSPIDER_MODULE = 'meishi.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'meishi (+http://www.yourdomain.com)'
# Disable cookies to make the crawler harder to fingerprint/ban.
COOKIES_ENABLED = False
# NOTE(review): COOKIES_ENABLES looks like a misspelling of COOKIES_ENABLED,
# possibly kept for an older Scrapy version — confirm whether it can be removed.
COOKIES_ENABLES = False
# Item pipeline: MeishiPipeline writes every scraped item to data.json.
ITEM_PIPELINES = {
'meishi.pipelines.MeishiPipeline':300
}
# Maximum crawl depth relative to the start URLs.
DEPTH_LIMIT=100
# Disable the stock UserAgentMiddleware and plug in the rotating one.
DOWNLOADER_MIDDLEWARES = {
'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware' : None,
'meishi.spiders.rotate_useragent.RotateUserAgentMiddleware' :400
}
7.启动
scrapy crawl ListSpider
8.爬取结果
rolindeMacBook-Pro:meishi rolin$ head -n 2 data.json
{"url": "http://home.meishichina.com/recipe-261887.html", "img_url": "http://i3.meishichina.com/attachment/recipe/2016/04/24/p800_2016042414614586261017519551.jpg", "steps": ["准备好材料http://i3.meishichina.com/attachment/recipe/2016/04/24/p320_2016042414614591577587519551.jpg", "苦菊切成适当长度http://i3.meishichina.com/attachment/recipe/2016/04/24/p320_2016042414614592794477519551.jpg", "花生米炒熟,注意不要炒火太大http://i3.meishichina.com/attachment/recipe/2016/04/24/p320_2016042414614593554077519551.jpg", "辣椒切成段,加油炒香http://i3.meishichina.com/attachment/recipe/2016/04/24/p320_2016042414614594432247519551.jpg", "把所有处理好的食材放到一起http://i3.meishichina.com/attachment/recipe/2016/04/24/p320_2016042414614596522347519551.jpg", "把所有调料放进去,拌匀即可http://i3.meishichina.com/attachment/recipe/2016/04/24/p320_2016042414614596429577519551.jpg", "又是美美哒一餐http://i3.meishichina.com/attachment/recipe/2016/04/24/p320_2016042414614596953887519551.jpg"], "detail": [{"value": "苦菊 (1棵)花生米 (适量)", "key": "主料"}, {"value": "醋 (2大勺)白糖 (2大勺)盐 (1勺)干辣椒 (2个)食用油 (2勺)香油 (适量)", "key": "辅料"}, {"value": "其它 ", "key": "厨具"}, {"value": "家常菜凉菜清淡拌 廿分钟 简单难度 ", "key": "分类"}], "title": "苦菊拌花生米"}
{"url": "http://home.meishichina.com/recipe-261910.html", "img_url": "http://i3.meishichina.com/attachment/recipe/2016/04/24/p800_20160424q1hjintc10emansp.JPG", "steps": ["先将木兰牙用开水焯变色,时间太久口感不好http://i3.meishichina.com/attachment/recipe/2016/04/24/p320_20160424y80dehh1k3bovmw9.JPG", "过凉水捞出,挤干水份,切段http://i3.meishichina.com/attachment/recipe/2016/04/24/p320_20160424uvac9t6uco351qdg.JPG", "准备好小葱干辣椒蒜,没有小葱用大葱代替http://i3.meishichina.com/attachment/recipe/2016/04/24/p320_20160424bazvyt9gp1z1ewq1.JPG", "将小葱干辣椒蒜切碎,(葱建议不要切太小)http://i3.meishichina.com/attachment/recipe/2016/04/24/p320_20160424uafkca0eeauzefie.JPG", "准备好所有调料,醋,盐,味精,花椒,葱,辣椒,蒜,少许香油http://i3.meishichina.com/attachment/recipe/2016/04/24/p320_20160424brub4oz93ohmrp12.jpg", "将准备好的所有调料放入木兰牙上先不要搅拌http://i3.meishichina.com/attachment/recipe/2016/04/24/p320_201604242oii6pe48zn350nt.JPG", "热锅放入油,等锅稍微冒烟将热油倒入在木兰牙上(一定要油熟透在倒入)凉油会有味道,过熟油主要是把调料的味道炝出来http://i3.meishichina.com/attachment/recipe/2016/04/24/p320_20160424bxedige0sxgv9mzv.JPG", "搅拌均匀装盘http://i3.meishichina.com/attachment/recipe/2016/04/24/p320_2016042495zyg196d70w2rfd.JPG"], "detail": [{"value": "木兰牙 (300克)蒜 (少许)", "key": "主料"}, {"value": "小葱 (20克)醋 (少许)干辣椒 (少许)香油 (少许)热油 (少许)盐花椒味精 (适量)", "key": "辅料"}, {"value": "炒锅 ", "key": "厨具"}, {"value": "家常菜凉菜微辣拌 十分钟 普通难度 ", "key": "分类"}], "title": "凉拌木兰牙"}