moviepan.py
# -*- coding: utf-8 -*-
import scrapy
from doubanmovie.items import DoubanmovieItem
class MoviepanSpider(scrapy.Spider):
    """Crawl the Douban Movie Top 250 chart and each movie's detail page."""

    name = 'moviepan'
    allowed_domains = ['douban.com']
    # Entry point: first page of the Top 250 chart.
    start_urls = ['https://movie.douban.com/top250']

    def parse(self, response):
        """Parse one list page.

        Yields one detail-page Request per movie (carrying the partially
        filled item in ``meta``) and, if present, a Request for the next
        list page.
        """
        # Each movie entry lives in a <div class="item"> container.
        movie_list = response.xpath('//div[@class="item"]')
        for entry in movie_list:
            item = DoubanmovieItem()
            item['rank'] = entry.xpath('div[@class="pic"]/em/text()').extract()
            # The first <span class="title"> holds the primary (Chinese) title.
            item['title'] = entry.xpath(
                'div[@class="info"]/div[@class="hd"]/a/span[@class="title"][1]/text()'
            ).extract()
            item['href'] = entry.xpath('div[@class="info"]/div/a/@href').extract()
            # Bug fix: the original indexed item['href'][0] unconditionally,
            # raising IndexError when the XPath matched nothing.
            if item['href']:
                yield scrapy.Request(item['href'][0],
                                     callback=self.parse_detail,
                                     meta={'item': item})
        # Follow pagination: the link text "后页>" means "next page".
        next_url = response.xpath('//a[text()="后页>"]/@href').extract_first()
        if next_url is not None:
            # urljoin resolves the relative "?start=N" href against the
            # current page instead of hard-coded string concatenation.
            yield scrapy.Request(response.urljoin(next_url), callback=self.parse)

    def parse_detail(self, response):
        """Parse a movie detail page, completing the item begun in parse()."""
        item = response.meta['item']
        item['pic_url'] = response.xpath('//*[@id="mainpic"]/a/img/@src').extract()
        item['play_url'] = response.xpath(
            '//*[@id="content"]/div[3]/div[2]/div[1]/ul/li[1]/a/@href'
        ).extract()
        yield item
pipelinesmyjson.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
class DoubanmoviePipeline(object):
    """Buffer the title/rank of every scraped item and dump the list to a
    JSON file when the spider closes.
    """

    def __init__(self):
        # NOTE(review): hard-coded absolute path — consider reading it from
        # a project setting so the code is portable across machines.
        self.jsond = open(r'E:\doubanmovie\doubanmovie\data.json', 'w',
                          encoding='utf-8')
        self.listd = []

    def process_item(self, item, spider=None):
        """Record the first title/rank value of *item* and pass it through.

        Bug fix: Scrapy invokes ``process_item(item, spider)``; the original
        two-argument signature raised TypeError on every item. ``spider``
        defaults to None so any existing direct callers keep working.
        """
        self.listd.append({
            'title': item['title'][0],
            'rank': item['rank'][0],
        })
        return item

    def close_spider(self, spider):
        """Serialize the buffered records and release the file handle."""
        # ensure_ascii=False keeps Chinese titles readable in the output.
        # (The original passed '' which only worked by being falsy.)
        json.dump(self.listd, self.jsond, ensure_ascii=False)
        # Bug fix: the original never closed the file, risking lost
        # buffered data on interpreter exit.
        self.jsond.close()
Scrapy详解之Spiders
https://zhuanlan.zhihu.com/p/39125300
Scrapy库安装
先离线安装 Twisted-18.4.0-cp36-cp36m-win_amd64.whl
先切换地址
pip install Twisted-18.4.0-cp36-cp36m-win_amd64.whl
然后安装1.5.0版本Scrapy
pip install Scrapy==1.5.0
scrapy
dos下执行
d:
cd 定位到指定盘上
scrapy startproject 工程名
创建爬虫
scrapy startproject baidunews
scrapy genspider baidu baidu.com
pycharm 下打开项目
open 打开项目
创建主程序
设置 浏览器标识
DOWNLOADER_MIDDLEWARES = {
'doubanmovie.middlewares.DoubanmovieDownloaderMiddleware': 543,
'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None,
#设置浏览器标识
'doubanmovie.rotate_useragent.RotateUserAgentMiddleware':400,
}
开启输出
ITEM_PIPELINES = {
'doubanmovie.pipelines.DoubanmoviePipeline': 300,}
scrapy crawl moviepan -o items.json -t json
创建json-wj
FEED_EXPORT_ENCODING='UTF-8'
scrapy crawl moviepan -o items.csv -t csv
作者:Zarten
链接:https://zhuanlan.zhihu.com/p/54691192
来源:知乎
著作权归作者所有。商业转载请联系作者获得授权,非商业转载请注明出处。
设置列表
1.ROBOTSTXT_OBEY
是否遵循 robots 协议;一般不遵循,设为 False 即可
ROBOTSTXT_OBEY = False
2.日志设置
DEBUG 默认
INFO
WARNING
ERROR
CRITICAL
LOG_FILE = 'log.log'
LOG_LEVEL = 'ERROR'
3.DOWNLOAD_DELAY
下载延迟
实际是一个范围随机值: 0.5倍-1.5倍 单位秒
DOWNLOAD_DELAY = 3
若不设置随机值,设置如下:
RANDOMIZE_DOWNLOAD_DELAY = False
4.DOWNLOAD_TIMEOUT
下载超时时间设
默认180s
DOWNLOAD_TIMEOUT = 10
5.RETRY_ENABLED
是否禁用重试请求,默认允许重试
RETRY_ENABLED = False #禁止重试
6.RETRY_TIMES
重试次数
默认另外重试2次
RETRY_TIMES = 8
7.COOKIES_ENABLED
是否禁用cookies
禁用后速度会快些,但可能会反爬,视情况而定
COOKIES_ENABLED = False #禁用cookies
8.CONCURRENT_REQUESTS
下载器最大并发数
默认16个
CONCURRENT_REQUESTS = 100
每个域名最大并发数
默认8个
CONCURRENT_REQUESTS_PER_DOMAIN = 100
每个ip的最大并发数
默认0,代表没有限制
CONCURRENT_REQUESTS_PER_IP = 100
注意:
1.若设置了此值,CONCURRENT_REQUESTS_PER_DOMAIN(域名最大并发数)就没有用了,即并发数是根据每个ip来计算而不是根据域名
2.DOWNLOAD_DELAY若不为0,则下载延迟是限制每个ip而不是每个域名
9.redis分布式爬虫设置
设置调度器和过滤器
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
10.crawlera代理设置
scrapy高级开发者提供的代理
DOWNLOADER_MIDDLEWARES = {
'scrapy_crawlera.CrawleraMiddleware': 543
}
CRAWLERA_ENABLED = True
CRAWLERA_APIKEY = 'your key'
11.DEFAULT_REQUEST_HEADERS
默认请求头
DEFAULT_REQUEST_HEADERS = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) \
Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134'
}
12.禁止重定向
REDIRECT_ENABLED = False
第一步是定义我们需要爬取的数据。在Scrapy中, 这是通过 Scrapy Items 来完成的。(在本例子中为种子文件)
我们定义的Item:
import scrapy
class TorrentItem(scrapy.Item):
    """Item describing a single torrent: its URL, display name,
    free-text description, and file size."""

    url = scrapy.Field()
    name = scrapy.Field()
    description = scrapy.Field()
    size = scrapy.Field()
输出
scrapy crawl 名字
使用手册
https://scrapy-chs.readthedocs.io/zh_CN/latest/intro/overview.html#