1. Install Scrapy
Install Scrapy by following the installation guide.
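If pip is available, installation normally comes down to a single command (a minimal sketch; some platforms additionally need build dependencies such as libxml2 and OpenSSL):
[root@ cgls]# pip install scrapy
[root@ cgls]# scrapy version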
2. Write the spider
2.1 Create the project
[root@ cgls]# scrapy startproject beautiful_album_spider
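startproject generates a standard skeleton roughly like the one below. Note that it names the inner package after the project (beautiful_album_spider), while the listings in this article use a package called xiaohuar, so the package was evidently renamed afterwards:
beautiful_album_spider/
├── scrapy.cfg
└── xiaohuar/
    ├── __init__.py
    ├── items.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        └── __init__.py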
2.2 Generate the spider
[root@ cgls]# scrapy genspider xiaohuar_spider xiaohuar.com
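genspider drops a minimal skeleton into the spiders/ package, roughly as follows (the exact template varies across Scrapy versions, and section 2.3 replaces it entirely):
# -*- coding: utf-8 -*-
import scrapy

class XiaohuarSpiderSpider(scrapy.Spider):
    name = 'xiaohuar_spider'
    allowed_domains = ['xiaohuar.com']
    start_urls = ['http://xiaohuar.com/']

    def parse(self, response):
        pass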
2.3 Write the spider file
[root@ cgls]# vi /cgls/beautiful_album_spider/xiaohuar/spiders/xiaohuar_spider.py
# -*- coding:utf-8 -*-
import os
import re

import requests
import scrapy
from scrapy.http import Request
from scrapy.selector import Selector

from xiaohuar.items import XiaohuarItem
from xiaohuar.settings import PROJECT_ROOT_PATH


class Xiaohuar_spider(scrapy.spiders.Spider):
    name = "xiaohuar"  # spider name, referenced by "scrapy crawl xiaohuar"
    # Domain whitelist: the crawler only follows pages under this domain
    allowed_domains = ["xiaohuar.com"]
    start_urls = ["http://www.xiaohuar.com/list-1-1.html"]

    # The method must be named "parse": it is the default callback that
    # Scrapy invokes for every downloaded response
    def parse(self, response):
        hxs = Selector(response)  # selector for querying the response
        # Only scrape pages whose URL matches the list-page pattern
        if re.match(r'http://www\.xiaohuar\.com/list-1-\d+\.html', response.url):
            # Every child div of the big container is one photo card
            items = hxs.xpath('//div[@class="item_list infinite_scroll"]/div')
            # XPath positions are 1-based, so count from 1 to len(items)
            for i in range(1, len(items) + 1):
                # img/@src: photo URL; span text: name; "btns" link text: school
                src = hxs.xpath('//div[@class="item_list infinite_scroll"]/div[%d]//div[@class="img"]/a/img/@src' % i).extract()
                name = hxs.xpath('//div[@class="item_list infinite_scroll"]/div[%d]//div[@class="img"]/span/text()' % i).extract()
                school = hxs.xpath('//div[@class="item_list infinite_scroll"]/div[%d]//div[@class="img"]/div[@class="btns"]/a/text()' % i).extract()
                # Build a fresh item for every card; reusing a single instance
                # would make every yielded item carry the last card's data
                item = XiaohuarItem()
                item["photo_url"] = src
                item["name"] = name
                item["school"] = school
                yield item
                if src and name and school:
                    # .extract() returns a list; each card holds one image, so take element 0
                    absolute_src = "http://www.xiaohuar.com" + src[0]
                    file_name = "%s_%s.jpg" % (school[0], name[0])  # school_name.jpg
                    xiaohuar_photo_dir = os.path.join(PROJECT_ROOT_PATH, "result_data", "photo")
                    if not os.path.exists(xiaohuar_photo_dir):
                        os.makedirs(xiaohuar_photo_dir)
                    headers = {
                        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.73 Safari/537.36'}
                    resp = requests.get(absolute_src, headers=headers)
                    fname = os.path.join(xiaohuar_photo_dir, file_name)
                    with open(fname, 'wb') as f:  # the with block closes the file automatically
                        f.write(resp.content)
        # Collect every link on the page and keep following the list pages
        all_urls = hxs.xpath('//a/@href').extract()
        print "*** Please be patient, images are downloading ***"
        for url in all_urls:
            if url.startswith('http://www.xiaohuar.com/list-1-'):
                yield Request(url, callback=self.parse)
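Before running the full crawl, the XPath expressions above can be sanity-checked interactively with scrapy shell (in recent Scrapy versions response.xpath works directly; output elided here):
[root@ cgls]# scrapy shell "http://www.xiaohuar.com/list-1-1.html"
>>> response.xpath('//div[@class="item_list infinite_scroll"]/div[1]//div[@class="img"]/span/text()').extract()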
2.4 Write the item file
[root@ cgls]# vi /cgls/beautiful_album_spider/xiaohuar/items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
from scrapy import Item, Field


class XiaohuarItem(Item):
    '''One scraped entry: photo URL, name, and school'''
    photo_url = Field()
    name = Field()
    school = Field()
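Item instances behave like dicts restricted to their declared fields, which is exactly what the pipeline below relies on when it calls dict(item). A quick interactive check (placeholder value):
>>> from xiaohuar.items import XiaohuarItem
>>> item = XiaohuarItem()
>>> item['name'] = [u'placeholder']
>>> dict(item)
{'name': [u'placeholder']}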
2.5 Write the pipeline file
[root@ cgls]# vi /cgls/beautiful_album_spider/xiaohuar/pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
import os
import time

from xiaohuar.settings import PROJECT_ROOT_PATH


class XiaohuarPipeline(object):
    def __init__(self):
        super(XiaohuarPipeline, self).__init__()
        self.count = 0  # number of items processed so far
        self.per = 50   # number of items stored per JSON file
        self.p_id = os.getpid()
        self.json_xiaohuar_dir = os.path.join(PROJECT_ROOT_PATH, 'result_data', 'info_xiaohuar_json')
        self.today = time.strftime('%Y-%m-%d', time.localtime(time.time()))
        if not os.path.exists(self.json_xiaohuar_dir):
            os.makedirs(self.json_xiaohuar_dir)

    def process_item(self, item, spider):
        # Append the item as one JSON line; a new file starts every self.per items
        with open(self.json_xiaohuar_dir + '/%s-XiaoHuar-%s-%s.json' % (self.today, str(self.count // self.per + 1), self.p_id), 'a+') as f:
            line = json.dumps(dict(item)).decode("unicode-escape") + u',\n'
            f.write(line.encode('utf-8'))
        info1 = "%s fetched the info of %s" % (self.p_id, item['name'])
        print info1
        self.count += 1
        return item
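With per = 50, the output rolls over into a new file every 50 items; file names follow the date-XiaoHuar-chunk-pid pattern built above, for example (hypothetical date and PID):
result_data/info_xiaohuar_json/2016-05-20-XiaoHuar-1-3021.json
result_data/info_xiaohuar_json/2016-05-20-XiaoHuar-2-3021.json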
2.6 Write the settings file
[root@ cgls]# vi /cgls/beautiful_album_spider/xiaohuar/settings.py
from os.path import dirname

# Custom constant: project root, used by the spider and pipeline to build output paths
PROJECT_ROOT_PATH = dirname(__file__)
DOWNLOAD_TIMEOUT = 10  # abort a download after 10 seconds
DOWNLOAD_DELAY = 0.1   # pause 0.1 s between requests
ITEM_PIPELINES = {
    # the value is a 0-1000 order key; lower-numbered pipelines run first
    'xiaohuar.pipelines.XiaohuarPipeline': 305,
}
2.7 Write the start_xiaohuar file
[root@ cgls]# vi /cgls/beautiful_album_spider/start_xiaohuar.py
# coding=utf-8
import os


def start_spider():
    # --nolog suppresses Scrapy's console logging; drop it while debugging
    cmd = 'scrapy crawl xiaohuar --nolog'
    os.system(cmd)


if __name__ == '__main__':
    start_spider()
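The wrapper simply shells out to Scrapy, so the spider can equally be started directly from the project root:
[root@ cgls/beautiful_album_spider]# scrapy crawl xiaohuar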
2.8 Start the spider and crawl the images
[root@ cgls/beautiful_album_spider]# python start_xiaohuar.py
2.9 Image showcase