Scrapy is an open-source crawling framework for Python, and in my experience it is quite pleasant to work with.
I won't cover installation here; the official site walks through it in detail.
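For the record, on most systems it boils down to a single pip command (you may need the lxml and OpenSSL build dependencies installed first):
pip install scrapy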
Create the project:
scrapy startproject web_spider
This produces the following directory structure:
web_spider/
    scrapy.cfg
    web_spider/
        __init__.py
        custom_filters.py   # created by hand: custom dedup filter
        model.py            # created by hand: SQLAlchemy model layer
        middlewares.py      # created by hand: custom middleware that adds a User-Agent before every request
        items.py            # data structures for the scraped items
        pipelines.py        # mainly responsible for saving the data
        settings.py         # project-wide configuration
        spiders/
            __init__.py
            wb_spider.py    # created by hand: the main crawling logic lives here
wb_spider.py
# -*- coding: utf-8 -*-
# Spider logic: log in to Douban (entering the captcha by hand),
# then crawl movie detail pages via CrawlSpider rules.
from PIL import Image
from StringIO import StringIO
import scrapy
from web_spider.items import WebSpiderItem
from scrapy.spiders import CrawlSpider, Rule
import requests
import urlparse
from scrapy.http import Request
from scrapy.linkextractors import LinkExtractor
class WbSpider(CrawlSpider):
    name = "weibo_spider"
    # per-spider settings: route scraped items through our pipeline
    custom_settings = {
        'ITEM_PIPELINES': {
            'web_spider.pipelines.WebSpiderPipeline': 1,
        },
    }
start_urls = [
"https://movie.douban.com/chart",
]
    rules = (
        # Follow links to tag listing pages
        # (no callback, so follow defaults to True)
        Rule(LinkExtractor(allow=('https://movie\.douban\.com/tag/', ))),
        # Hand movie detail pages to parse_item
        Rule(LinkExtractor(allow=('https://movie\.douban\.com/subject/\d+/', )),
             callback='parse_item'),
    )
    def __init__(self, *a, **kwargs):
        super(WbSpider, self).__init__(*a, **kwargs)

    def start_requests(self):
        # fetch the login page first so we can detect and solve the captcha
        return [scrapy.FormRequest("https://accounts.douban.com/login", callback=self.login)]
    def getcapid(self, imgUrl):
        # Show the captcha image and return what the user types in.
        # Image.open(...).show() pops up Preview on OS X; behaviour on other systems may differ.
        Image.open(StringIO(requests.get(imgUrl).content)).show()
        return raw_input('Enter the captcha: ')
def login(self, response):
imgUrl = response.xpath('//img[@id="captcha_image"]/@src').extract_first()
        # replace with your own Douban account credentials
        data_dict = {
            'form_email': 'xxx@gmail.com',
            'form_password': 'xxxxx',
        }
if imgUrl is not None:
cap_id = urlparse.parse_qs(urlparse.urlparse(imgUrl).query)['id'][0]
cap_name = self.getcapid(imgUrl)
data_dict['captcha-solution'] = cap_name
data_dict['captcha-id'] = cap_id
return scrapy.FormRequest("https://accounts.douban.com/login",
formdata=data_dict,callback=self.after_login)
def after_login(self, response):
for _url in self.start_urls:
yield self.make_requests_from_url(_url)
    # Overridden so every start request carries a PhantomJS flag in its meta
    # (not actually used in this crawl)
    def make_requests_from_url(self, url):
        return Request(url, dont_filter=True, meta={'PhantomJS': True})
def parse_item(self, response):
referr = response.request.headers.get('Referer', None)
movieName = response.xpath('//span[@property="v:itemreviewed"]/text()').extract_first()
rating = response.xpath('//strong[@class="ll rating_num"]/text()').extract_first()
actor = ', '.join(response.xpath('//span[@class="actor"]/span[@class="attrs"]/a/text()').extract())
movieType = ', '.join(response.xpath('//div[@id="info"]/span[@property="v:genre"]/text()').extract())
        # The country and language entries in the info block are plain text nodes;
        # this relies on their position after splitting on newlines, which is brittle
        areaInfo = ''.join(response.xpath('//div[@id="info"]/text()').extract()).split('\n')[6:8]
        area = areaInfo[0].strip()
        lan = areaInfo[1].strip()
item = WebSpiderItem()
item['movie_name'] = movieName
item['movie_type'] = movieType
item['url'] = response.url
item['referr'] = referr
item['actor'] = actor
item['area'] = area
item['lan'] = lan
item['rating'] = rating if rating is not None else 0
# print item
return item
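The XPath expressions in parse_item are easiest to work out interactively before running a full crawl. scrapy shell fetches a page and drops you into a prompt with the response object ready; the subject URL below is just an example detail page:
scrapy shell 'https://movie.douban.com/subject/1292052/'
>>> response.xpath('//span[@property="v:itemreviewed"]/text()').extract_first()
>>> response.xpath('//strong[@class="ll rating_num"]/text()').extract_first()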
custom_filters.py
# -*- coding: utf-8 -*-
# Define your custom dupe filter here.
# The default fingerprint is a sha1 over the request method, canonicalized URL and body,
# which doesn't filter out much: some requests carry a timestamp parameter that changes
# on every call, so we need our own dedup rule.
from scrapy.conf import settings
from scrapy.dupefilters import RFPDupeFilter
import redis

class CustomURLFilter(RFPDupeFilter):
    """Deduplicate on the URL alone (query string stripped)."""
    def __init__(self, path=None, debug=False):
        redis_url = settings.get('REDIS_FILTER')
        self.redis_db = redis.Redis(connection_pool=redis.ConnectionPool.from_url(redis_url))
        RFPDupeFilter.__init__(self, path, debug)

    def request_seen(self, request):
        # drop the query string so URLs that differ only in volatile parameters dedupe together
        url = request.url.split('?')[0]
        if self.redis_db.exists(url):
            return True
        self.redis_db.set(url, 1)
        return False
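One caveat: the exists-then-set pair is not atomic, so with several concurrent requests the same URL can occasionally slip through twice. If that matters, request_seen can be collapsed into a single Redis SETNX call (a sketch of the same method, using the redis-py client created above):

    def request_seen(self, request):
        url = request.url.split('?')[0]
        # setnx stores the key only if it does not exist yet and reports whether it did,
        # so the URL has been "seen" exactly when the write is refused
        return not self.redis_db.setnx(url, 1)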
items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class WebSpiderItem(scrapy.Item):
# define the fields for your item here like:
movie_name = scrapy.Field()
movie_type = scrapy.Field()
url = scrapy.Field()
referr = scrapy.Field()
actor = scrapy.Field()
area = scrapy.Field()
lan = scrapy.Field()
rating = scrapy.Field()
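WebSpiderItem behaves like a dict, which is exactly what lets the pipeline below build a row with model.Movies(**item). A quick sketch (the values are made up):

    item = WebSpiderItem(movie_name=u'test', rating='9.0')
    item['area'] = u'USA'
    print dict(item)  # plain dict of the fields that were set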
middlewares.py
# -*- coding: utf-8 -*-
import random
from scrapy.conf import settings
# Attach a random User-Agent to every outgoing request
class RandomUserAgentMiddleware(object):
    def process_request(self, request, spider):
        ua = random.choice(settings.get('USER_AGENT_LIST'))
        request.headers.setdefault('User-Agent', ua)
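A quick way to confirm the middleware is firing is to open a shell from inside the project (so settings.py, including USER_AGENT_LIST and DOWNLOADER_MIDDLEWARES, is picked up) and look at the header that was attached to the request; it should be one of the entries from USER_AGENT_LIST:
scrapy shell 'https://movie.douban.com/chart'
>>> request.headers.get('User-Agent')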
model.py
# -*- coding: utf-8 -*-
# SQLAlchemy model layer: engine factory, table creation helper and the Movies table.
from scrapy.conf import settings
from sqlalchemy import create_engine, Column, Integer, String
from sqlalchemy.ext.declarative import declarative_base
DeclarativeBase = declarative_base()
def db_connect():
    # Build an engine pointing at the target database (Greenplum/PostgreSQL here)
    # return create_engine(settings.get('MYSQL_SAVE'), isolation_level="READ UNCOMMITTED", echo=True)
    return create_engine(settings.get('GPDB_SAVE'), echo=True)

def create_myblog_table(engine):
    DeclarativeBase.metadata.create_all(engine)  # create the tables if they do not exist

class Movies(DeclarativeBase):
    __tablename__ = 'movies'  # table name
# id = Column(Integer, primary_key = True)
referr = Column('referr', String(600))
url = Column('url', String(600), primary_key = True)
movie_name = Column('movie_name', String(500))
movie_type = Column('movie_type', String(200))
rating = Column('rating', String(10))
actor = Column('actor', String(900))
area = Column('area', String(200))
lan = Column('lan', String(200))
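The model layer only touches Scrapy to read the connection string, so it can be exercised on its own. A standalone sketch, run from the inner web_spider package directory and using a local SQLite file instead of the project's GPDB_SAVE connection:

    from sqlalchemy import create_engine
    from sqlalchemy.orm import sessionmaker
    import model

    engine = create_engine('sqlite:///movies_test.db')
    model.create_myblog_table(engine)   # creates the movies table
    session = sessionmaker(bind=engine)()
    session.add(model.Movies(url='https://movie.douban.com/subject/1/', movie_name=u'test'))
    session.commit()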
pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import model
from sqlalchemy.orm import sessionmaker
class WebSpiderPipeline(object):
def open_spider(self, spider):
engine = model.db_connect()
model.create_myblog_table(engine)
self.dbSession = sessionmaker(bind = engine)()
self.cnt = 0
    def process_item(self, item, spider):
        movie = model.Movies(**item)
        self.dbSession.add(movie)
        self.dbSession.commit()
        self.cnt += 1
        if self.cnt >= 100:
            # crude progress marker every 100 items
            print "#######################", self.cnt
            self.cnt = 0
        return item
def close_spider(self, spider):
self.dbSession.close()
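Committing once per item is simple but costs a database round trip per movie. A common tweak is to batch the commits, reusing the counter that is already there (a sketch of the two methods, flushing every 100 items and once more on close):

    def process_item(self, item, spider):
        self.dbSession.add(model.Movies(**item))
        self.cnt += 1
        if self.cnt >= 100:
            # flush in batches of 100 instead of once per item
            self.dbSession.commit()
            self.cnt = 0
        return item

    def close_spider(self, spider):
        self.dbSession.commit()  # persist the final partial batch
        self.dbSession.close()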
settings.py
# -*- coding: utf-8 -*-
# Scrapy settings for web_spider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'web_spider'
SPIDER_MODULES = ['web_spider.spiders']
NEWSPIDER_MODULE = 'web_spider.spiders'
LOG_LEVEL = 'INFO'
LOG_ENCODING = 'utf-8'
LOG_FILE = './task.log'
# Keep and send cookies (the login session relies on them)
COOKIES_ENABLED = True
COOKIES_DEBUG = True
# Item concurrency (Scrapy's default is 100)
CONCURRENT_ITEMS = 10
# Request concurrency (Scrapy's default is 16)
CONCURRENT_REQUESTS = 5
# Concurrent requests per domain (default 8)
CONCURRENT_REQUESTS_PER_DOMAIN = 8
# Concurrent requests per IP; 0 means no per-IP limit
CONCURRENT_REQUESTS_PER_IP = 0
# Wait 1 second between requests
DOWNLOAD_DELAY = 1
# Randomize the delay to 0.5x - 1.5x of DOWNLOAD_DELAY
RANDOMIZE_DOWNLOAD_DELAY = True
# Maximum crawl depth; 0 means unlimited
# DEPTH_LIMIT = 0
USER_AGENT_LIST = [
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7',
'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0) Gecko/16.0 Firefox/16.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/534.55.3 (KHTML, like Gecko) Version/5.1.3 Safari/534.53.10',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.54 Safari/536.5'
]
DOWNLOADER_MIDDLEWARES = {
    'web_spider.middlewares.RandomUserAgentMiddleware': 400,
    # disable the built-in UserAgentMiddleware so ours is the only one setting the header
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}
# Custom dupe filter
DUPEFILTER_CLASS = 'web_spider.custom_filters.CustomURLFilter'
REDIS_FILTER = "redis://:111@1111/1"
MYSQL_SAVE = "mysql://111@1111/sdd?charset=utf8"
GPDB_SAVE = "postgresql://222@1222/bi?client_encoding=utf8"
Run it
scrapy crawl weibo_spider
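If you also want a flat file alongside the database rows, Scrapy's feed export can be enabled straight from the command line, e.g.:
scrapy crawl weibo_spider -o movies.csv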
What it does
Logs in to Douban (captcha entered by hand), crawls movie information, and saves it to GP (Greenplum).