1.0 Creating a crawler
1.1 Create a project
scrapy startproject mySpider
1.2 Create the spiders
# Create a spider based on the scrapy.Spider class
scrapy genspider spiderName "domain.com"
# Create a spider based on the scrapy.spiders.CrawlSpider class
scrapy genspider -t crawl spiderName "domain.com"
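The generated file lands in mySpider/spiders/ and, for the plain Spider template, looks roughly like this (the exact stub varies a little between Scrapy versions):

# mySpider/spiders/spiderName.py -- approximately what `scrapy genspider` produces
import scrapy

class SpidernameSpider(scrapy.Spider):
    name = 'spiderName'
    allowed_domains = ['domain.com']
    start_urls = ['http://domain.com/']

    def parse(self, response):
        # fill in the extraction logic here
        pass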
1.3 File structure
What the main files do (the resulting directory layout is sketched after this list):
- scrapy.cfg: the project's configuration file
- mySpider/: the project's Python package; code is imported from here
- mySpider/items.py: defines the target data structures (Items)
- mySpider/pipelines.py: the project's item pipelines
- mySpider/settings.py: the project's settings
- mySpider/spiders/: the directory that holds the spider code
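The directory layout created by startproject looks like this (newer Scrapy versions also generate middlewares.py):

mySpider/
    scrapy.cfg
    mySpider/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py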
2.0 The spider (SpiderName)
A spider issues requests, receives the responses, and then either follows URLs found in the response or extracts data from the results.
Spiders are usually built on one of two classes (a short sketch contrasting them follows this list):
- Spider
- CrawlSpider
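As a rough, minimal contrast (the class and spider names below are placeholders, not part of this project): a plain Spider follows links by yielding Requests yourself inside parse(), whereas a CrawlSpider declares Rule/LinkExtractor pairs that follow links automatically and should not override parse() at all.

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class PlainSpider(scrapy.Spider):
    name = 'plain_example'                      # placeholder name
    start_urls = ['http://example.com/']

    def parse(self, response):
        # extract data here, then follow links by hand
        yield scrapy.Request('http://example.com/page/2', callback=self.parse)

class RuleSpider(CrawlSpider):
    name = 'rule_example'                       # placeholder name
    start_urls = ['http://example.com/']
    rules = (
        # links matched by the LinkExtractor are requested automatically;
        # use a callback other than parse(), which CrawlSpider reserves for itself
        Rule(LinkExtractor(allow=r'/page/\d+'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        pass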
2.1 A spider based on Spider
# tencent.py
from mySpider.items import TencentItem
import scrapy
import re
class TencentSpider(scrapy.Spider):
    name = "tencent"
    allowed_domains = ["hr.tencent.com"]
    start_urls = [
        "http://hr.tencent.com/position.php?&start=0#a"
    ]

    def parse(self, response):
        for each in response.xpath('//*[@class="even"]'):
            item = TencentItem()
            name = each.xpath('./td[1]/a/text()').extract()[0]
            detailLink = each.xpath('./td[1]/a/@href').extract()[0]
            positionInfo = each.xpath('./td[2]/text()').extract()[0]
            peopleNumber = each.xpath('./td[3]/text()').extract()[0]
            workLocation = each.xpath('./td[4]/text()').extract()[0]
            publishTime = each.xpath('./td[5]/text()').extract()[0]
            # print(name, detailLink, positionInfo, peopleNumber, workLocation, publishTime)
            item['name'] = name.encode('utf-8')
            item['detailLink'] = detailLink.encode('utf-8')
            item['positionInfo'] = positionInfo.encode('utf-8')
            item['peopleNumber'] = peopleNumber.encode('utf-8')
            item['workLocation'] = workLocation.encode('utf-8')
            item['publishTime'] = publishTime.encode('utf-8')
            # build the next page's URL: the start= offset grows by 10 per page
            curpage = re.search(r'(\d+)', response.url).group(1)
            page = int(curpage) + 10
            url = re.sub(r'\d+', str(page), response.url)
            # enqueue a request for the next page, with self.parse as its callback
            # (duplicate requests are filtered out by Scrapy's scheduler)
            yield scrapy.Request(url, callback=self.parse)
            # hand the extracted data to the pipeline
            yield item
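The spider above imports TencentItem from mySpider/items.py. That class is not shown in this note, but judging from the fields assigned above it would presumably look something like this (an assumed sketch, not the original file):

# items.py -- assumed TencentItem definition matching the fields used above
import scrapy

class TencentItem(scrapy.Item):
    name = scrapy.Field()
    detailLink = scrapy.Field()
    positionInfo = scrapy.Field()
    peopleNumber = scrapy.Field()
    workLocation = scrapy.Field()
    publishTime = scrapy.Field()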
2.2 A spider based on CrawlSpider
Scraping the Douban Movie Top 250 list as an example.
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from mySpider.items import doubanItem
class Douban250Spider(CrawlSpider):
    # spider name
    name = 'douban250'
    # allowed domains
    allowed_domains = ['douban.com']
    # start URL
    start_urls = ['https://movie.douban.com/top250']

    # LinkExtractor that picks out the pagination links, using a regex
    pageLink = LinkExtractor(allow=(r'start=\d+&filter='))

    # rules for handling the extracted links: each sets a callback and whether to keep following
    rules = (
        Rule(pageLink, callback='parse_item', follow=True),
        # Rule(pageLink, callback='parse_item', process_links='deal_links', follow=True),
    )

    # parse one result page; receives the response
    def parse_item(self, response):
        # grab every movie entry with XPath
        content = response.xpath('//div[@class="item"]')
        # loop over the entries
        for i in content:
            # doubanItem is defined in ../items.py
            item = doubanItem()
            # extract the fields with XPath
            item['rank'] = i.xpath('div/em/text()').extract()[0]
            item['title'] = i.xpath('div//div[@class="hd"]/a/span/text()').extract()[0]
            item['url'] = i.xpath('div//div[@class="hd"]/a/@href').extract()[0]
            item['rating_num'] = i.xpath('div//span[@class="rating_num"]/text()').extract()[0]
            item['comment'] = i.xpath('div//span[@class="inq"]/text()').extract()
            text = i.xpath('div/div[@class="bd"]/p/text()').extract()
            item['date_year'] = text[1].split("/")[0].strip()[:4]
            item['country'] = text[1].split("/")[-2].strip().replace(' ', "、")
            item['genre'] = text[1].split("/")[-1].strip().replace(' ', "、")
            # simple post-processing: fall back to a placeholder when there is no quote
            if len(item['comment']) == 0:
                item['comment'] = "暂无简介!"  # i.e. "no quote available"
            else:
                item['comment'] = item['comment'][0]
            # yield each item to the pipeline
            yield item
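Either spider is started from the project root with scrapy crawl and the spider's name; the built-in feed export (-o) is a quick way to inspect the items before a pipeline is wired up:

scrapy crawl tencent
scrapy crawl douban250 -o douban250.json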
3.0 items.py
Defines the data models, i.e. the structure in which the scraped data is stored.
import scrapy
class doubanItem(scrapy.Item):
    rank = scrapy.Field()
    date_year = scrapy.Field()
    title = scrapy.Field()
    url = scrapy.Field()
    country = scrapy.Field()
    genre = scrapy.Field()
    rating_num = scrapy.Field()
    comment = scrapy.Field()
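A scrapy.Item behaves much like a dict, except that only declared Fields can be set; a quick illustration (the values here are placeholders):

item = doubanItem()
item['title'] = 'example title'   # fine: 'title' is a declared Field
# item['director'] = '...'        # would raise KeyError: no such field declared
print(dict(item))                 # {'title': 'example title'}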
4.0 Pipelines: pipelines.py
# -*- coding: utf-8 -*-
# codecs lets files be opened with an explicit encoding
import codecs
import json
import MySQLdb


class doubanPipline(object):
    def __init__(self):
        # open the output file
        # self.filename = codecs.open('./outfiles/doubanTop250.json', 'w', encoding='utf-8')
        # open the database connection
        self.db = MySQLdb.connect(host='localhost', port=3306, user='root', passwd='root',
                                  db='spider', charset='utf8', use_unicode=True)
        self.cursor = self.db.cursor()

    def process_item(self, item, spider):
        # write to MySQL (`rank` is backquoted because it is a reserved word in newer MySQL versions)
        self.cursor.execute(
            """insert into doubantop250 (`rank`,title,url,rating_num,comment,date_year,country,genre)
               values (%s,%s,%s,%s,%s,%s,%s,%s)""",
            [int(item['rank']), item['title'], item['url'], item['rating_num'],
             item['comment'], item['date_year'], item['country'], item['genre']])
        self.db.commit()
        # write to a file instead
        # content = json.dumps(dict(item), ensure_ascii=False) + "\n"
        # self.filename.write(content)
        return item

    def close_spider(self, spider):
        # close_spider is called automatically by Scrapy when the spider finishes
        # close the database connection
        self.db.close()
        # close the file
        # self.filename.close()
5.0 Configuration: settings.py
# -*- coding: utf-8 -*-
# Scrapy settings for mySpider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'mySpider'
SPIDER_MODULES = ['mySpider.spiders']
NEWSPIDER_MODULE = 'mySpider.spiders'
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
# COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False
# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
# }
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
# 'mySpider.middlewares.MyspiderSpiderMiddleware': 543,
# }
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
# 'mySpider.middlewares.MyspiderDownloaderMiddleware': 543,
# }
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
# }
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    # 'mySpider.pipelines.MyspiderPipeline': 300,
    # "mySpider.pipelines.ItcastJsonPipeline": 300,
    # "mySpider.pipelines.tencentJsonPipline": 300,
    # "mySpider.pipelines.sunJsonPipeline": 300,
    "mySpider.pipelines.doubanPipline": 300,
}
#
# LOG_FILE = "../Log/SUN.log"
# LOG_LEVEL = "DEBUG"
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'