1. Create the Scrapy project
scrapy startproject AllBooks
2. Go into the project directory and create a Spider with the genspider command
scrapy genspider allbooks allitebooks.org
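The two commands above generate roughly the following project layout (the exact files can vary slightly with the Scrapy version):
AllBooks/
    scrapy.cfg
    AllBooks/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            allbooks.py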
3. Define the data to scrape (items.py)
# -*- coding: utf-8 -*-
import scrapy
class AllbooksItem(scrapy.Item):
    book_name = scrapy.Field()
    image_url = scrapy.Field()
    author = scrapy.Field()
    book_info = scrapy.Field()
4. Write the Spider that extracts the item data (spiders/allbooks.py)
# -*- coding: utf-8 -*-
import scrapy
from ..items import AllbooksItem
class AllbooksSpider(scrapy.Spider):
    name = 'allbooks'
    allowed_domains = ['allitebooks.org']
    url = 'http://www.allitebooks.org/page/{}'
    start_urls = [url.format(1)]

    def parse(self, response):
        # Read the total number of pages from the pagination links
        total_page = response.xpath('//*[@id="main-content"]/div/div/a[5]/text()').extract_first()
        # with open('dd.html', 'w', encoding='utf-8') as f:
        #     f.write(response.text)
        print("Found %s pages in total!" % total_page)
        # Change this value to control how many pages are crawled
        total_page = 2
        for page in range(1, int(total_page) + 1):
            print("Processing page %d..." % page)
            url = self.url.format(page)
            yield scrapy.Request(url=url, callback=self.parse_allbooks)

    def parse_allbooks(self, response):
        # redirect_urls only exists in meta when the request was redirected, so fall back to response.url
        print("Current page: %s" % response.meta.get('redirect_urls', [response.url])[0])
        all_book_list = response.xpath('//div[@class="main-content-inner clearfix"]/article')
        for book in all_book_list:
            item = AllbooksItem()
            # Book title
            item['book_name'] = book.xpath('.//h2[@class="entry-title"]//text()').extract_first()
            # Cover image URL
            item['image_url'] = book.xpath('.//div/a/img/@src').extract_first()
            # Authors; extract() returns a list, e.g. ['Adam Karneboge', 'Arek Dreyer'] when there are several
            author = book.xpath('.//h5[@class="entry-author"]/a/text()').extract()
            item['author'] = ",".join(author)
            # Book description; strip non-breaking spaces and zero-width spaces
            book_info = book.xpath('.//div[@class="entry-summary"]/p/text()').extract_first()
            item['book_info'] = book_info.replace('\xa0', ' ').replace('\u200b', '')
            yield item
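Before running the full crawl, the XPath expressions above can be checked interactively in the Scrapy shell. This is just a quick sanity check; the selectors are the same ones used in the spider and assume the page structure has not changed:
scrapy shell "http://www.allitebooks.org/page/1"
>>> books = response.xpath('//div[@class="main-content-inner clearfix"]/article')
>>> books[0].xpath('.//h2[@class="entry-title"]//text()').extract_first()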
5. Save the data in the item pipelines; results can be written to a file or a database (pipelines.py)
# -*- coding: utf-8 -*-
import time
import json
import pymysql
import pymongo
import redis
from openpyxl import Workbook
from scrapy import Item
from scrapy.exceptions import DropItem
class MyEncoder(json.JSONEncoder):
    def default(self, o):
        if isinstance(o, bytes):
            return str(o, encoding='utf-8')
        return json.JSONEncoder.default(self, o)
# Store into Redis; this also serves as de-duplication: if the book is already in the set, drop the item (DropItem)
class RedisPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        cls.REDIS_HOST = crawler.settings.get('REDIS_HOST')
        cls.REDIS_PORT = crawler.settings.get('REDIS_PORT')
        cls.REDIS_DBNAME = crawler.settings.get('REDIS_DBNAME')
        cls.REDIS_decode_responses = crawler.settings.getbool('REDIS_decode_responses')
        return cls()

    def open_spider(self, spider):
        try:
            self.redis_client = redis.StrictRedis(host=self.REDIS_HOST, port=self.REDIS_PORT,
                                                  db=self.REDIS_DBNAME,
                                                  decode_responses=self.REDIS_decode_responses)
        except Exception as e:
            print("Error connecting to Redis:", e)

    def process_item(self, item, spider):
        # sadd returns 1 if the value is new, 0 if it is already in the set
        if self.redis_client.sadd('books:items', item['book_name']):
            return item
        raise DropItem("duplicate book: %s" % item['book_name'])

    def close_spider(self, spider):
        print("Redis pipeline finished")
# Store into MongoDB
class MongoPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        # Read MONGO_DB_URL and MONGO_DB_NAME from the settings (fall back to the defaults if missing)
        cls.DB_URL = crawler.settings.get('MONGO_DB_URL', 'mongodb://localhost:27017')
        cls.DB_NAME = crawler.settings.get('MONGO_DB_NAME', 'py4')
        # cls() is an instance of <class 'AllBooks.pipelines.MongoPipeline'>
        # print(type(cls()))
        return cls()

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.DB_URL)
        self.db = self.client[self.DB_NAME]  # select the database

    def process_item(self, item, spider):
        book_collection = self.db[spider.name]  # select the collection; it is created automatically on first insert
        # insert_one expects a plain dict, not an Item object
        data = dict(item) if isinstance(item, Item) else item
        # De-duplication; with RedisPipeline enabled this check can stay disabled
        # count = book_collection.find({'book_name': item['book_name']}).count()
        # if count == 0:
        #     book_collection.insert_one(data)
        book_collection.insert_one(data)
        return item

    def close_spider(self, spider):
        print("MongoDB pipeline finished")
        self.client.close()
# Store into MySQL
class MysqlPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        cls.MYSQL_HOST = crawler.settings.get('MYSQL_HOST')
        cls.MYSQL_PORT = crawler.settings.get('MYSQL_PORT')
        cls.MYSQL_USER = crawler.settings.get('MYSQL_USER')
        cls.MYSQL_PASSWD = crawler.settings.get('MYSQL_PASSWD')
        cls.MYSQL_DBNAME = crawler.settings.get('MYSQL_DBNAME')
        cls.MYSQL_CHARSET = crawler.settings.get('MYSQL_CHARSET')
        return cls()

    def open_spider(self, spider):
        self.db = pymysql.connect(host=self.MYSQL_HOST, port=self.MYSQL_PORT,
                                  user=self.MYSQL_USER, passwd=self.MYSQL_PASSWD,
                                  db=self.MYSQL_DBNAME, charset=self.MYSQL_CHARSET)
        self.cur = self.db.cursor()

    def process_item(self, item, spider):
        try:
            # Duplicate check against the table:
            # self.cur.execute("select book_name from books where book_name=%s", item['book_name'])
            # repetition = self.cur.fetchone()
            # if not repetition:
            #     keys, values = zip(*item.items())
            #     sql = "insert into {}({}) VALUES ({})".format('books', ','.join(keys), ','.join(['%s'] * len(values)))
            #     self.cur.execute(sql, values)
            # With RedisPipeline enabled, duplicate book names are dropped before they reach MySQL,
            # so the table check above is not needed
            keys, values = zip(*item.items())
            sql = "insert into {}({}) VALUES ({})".format('books', ','.join(keys), ','.join(['%s'] * len(values)))
            self.cur.execute(sql, values)
            self.db.commit()
            # Print the SQL statement with the bound values (public alternative to the private _last_executed attribute)
            print(self.cur.mogrify(sql, values))
            return item
        except Exception as e:
            print("MySQL error:", e)
            self.db.rollback()

    def close_spider(self, spider):
        print("MySQL pipeline finished")
        self.cur.close()
        self.db.close()
# Store into an Excel workbook
class AllbooksPipeline(object):
    def __init__(self):
        self.wb = Workbook()
        self.ws = self.wb.active
        self.ws.title = 'AllBooks'
        # Header row
        self.ws.append(['Title', 'Author', 'Cover image URL', 'Description'])

    def process_item(self, item, spider):
        text = [item['book_name'], item['author'], item['image_url'], item['book_info']]
        self.ws.append(text)
        return item

    def close_spider(self, spider):
        file_end_name = time.strftime("%Y-%m-%d", time.localtime())
        self.wb.save(spider.name + file_end_name + '.xlsx')
        print("Excel pipeline finished!")
6. Configure the settings file (settings.py)
# Redis configuration
REDIS_HOST = 'localhost'
REDIS_PORT = 6379
REDIS_DBNAME = 3
REDIS_decode_responses = True
# MySQL configuration
MYSQL_HOST = 'localhost'
MYSQL_PORT = 3306
MYSQL_USER = 'root'
MYSQL_PASSWD = '123456'
MYSQL_DBNAME = 'python4'
MYSQL_CHARSET = 'utf8'
# MongoDB configuration
MONGO_DB_URL = 'mongodb://localhost:27017'
MONGO_DB_NAME = 'py4'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 3
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);',
    # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # 'Accept-Language': 'en',
}
# Lower numbers run first; don't pick values at random, see the pipeline-related defaults in Scrapy's own settings for reasonable ranges.
# Note: AllbooksPipeline (300) runs before RedisPipeline (301), so duplicates dropped by Redis still reach the Excel file; only MysqlPipeline and MongoPipeline benefit from the de-duplication.
ITEM_PIPELINES = {
    'AllBooks.pipelines.AllbooksPipeline': 300,
    'AllBooks.pipelines.RedisPipeline': 301,
    'AllBooks.pipelines.MysqlPipeline': 302,
    'AllBooks.pipelines.MongoPipeline': 303,
}
# Proxy pool
PROXIES = [
    'HTTP://171.112.165.176:9999',
    'HTTPS://218.24.16.198:43620',
    'HTTP://112.85.130.38:9999',
    'HTTPS://221.218.102.146:33323',
    'HTTP://110.52.235.44:9999',
]
# Enable this if the random-proxy middleware is needed
DOWNLOADER_MIDDLEWARES = {
    # 'AllBooks.middlewares.AllbooksDownloaderMiddleware': 543,
    'AllBooks.middlewares.RandomProxyMiddleware': 749,
}
DOWNLOAD_TIMEOUT = 5  # the default is 180 seconds (3 minutes)
# Optionally write the log to a local file
LOG_FILE = "allbooks.log"
LOG_LEVEL = "DEBUG"
# Redirect print output into the log as well
LOG_STDOUT = True
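The RandomProxyMiddleware in the next step raises NotConfigured unless HTTPPROXY_ENABLED is true. Scrapy enables this setting by default, but it can be set explicitly so the dependency is visible (optional):
HTTPPROXY_ENABLED = True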
7. Optional: add a random-proxy downloader middleware (middlewares.py)
import random
from collections import defaultdict

from scrapy import signals
from scrapy.exceptions import NotConfigured
from twisted.internet.error import ConnectionRefusedError, TimeoutError


class RandomProxyMiddleware(object):
    def __init__(self, settings):
        # Proxy schemes may be written in upper case; normalize them to lower case
        proxies_list = settings.getlist('PROXIES')
        self.proxies = [s.lower() for s in proxies_list if isinstance(s, str)]
        # Failure count per proxy
        self.count = defaultdict(int)
        # Maximum number of failures before a proxy is removed
        self.max_failed = 3

    @classmethod
    def from_crawler(cls, crawler):
        if not crawler.settings.getbool('HTTPPROXY_ENABLED'):
            raise NotConfigured
        return cls(crawler.settings)

    def process_request(self, request, spider):
        if self.proxies and not request.meta.get('proxy') and request.url not in spider.start_urls:
            request.meta['proxy'] = random.choice(self.proxies)
            print("Using proxy:", request.meta['proxy'])

    def process_response(self, request, response, spider):
        # HTTP status code of the response
        get_status = response.status
        cur_proxy = request.meta.get('proxy')
        if get_status in (400, 403):
            self.count[cur_proxy] += 1
            if self.count[cur_proxy] > self.max_failed:
                print("got error http code(%s) when use proxy:%s" % (get_status, cur_proxy))
                self.remove_proxy(cur_proxy)
                del request.meta['proxy']
                # Reschedule the request so it is retried with another proxy
                return request
        # e.g. response: <301 http://www.allitebooks.org/page/1>
        return response

    def process_exception(self, request, exception, spider):
        cur_proxy = request.meta.get('proxy')
        # If a proxy was set and the request failed with one of these errors, drop the bad proxy and retry
        if cur_proxy and isinstance(exception, (ConnectionRefusedError, TimeoutError)):
            print("ERROR(%s) when use proxy:%s" % (exception, cur_proxy))
            self.remove_proxy(cur_proxy)
            del request.meta['proxy']
            return request

    def remove_proxy(self, cur_proxy):
        if cur_proxy in self.proxies:
            # Delete the dead proxy from the pool so it is not picked again
            self.proxies.remove(cur_proxy)
            print("remove proxy:%s from proxy list" % cur_proxy)
8. Remember to start the MySQL/Redis/MongoDB servers beforehand and create the required MySQL table:
CREATE TABLE IF NOT EXISTS books(
    id INT AUTO_INCREMENT PRIMARY KEY NOT NULL,
    book_name VARCHAR(200) NOT NULL,
    author VARCHAR(200),
    image_url VARCHAR(300),
    book_info VARCHAR(800)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
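Redis and MongoDB need no schema in advance; the set and the collection are created on the first write. For MySQL, if the database itself does not exist yet, it can be created before the table (a sketch, assuming the MYSQL_DBNAME value 'python4' from settings.py):
CREATE DATABASE IF NOT EXISTS python4 DEFAULT CHARSET utf8;
USE python4;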
9. With everything configured, start the crawl by running the crawl command to launch the Spider:
scrapy crawl allbooks
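Run the command from the project root (the directory containing scrapy.cfg). Alternatively, the crawl can be started from a small Python script; a minimal sketch using Scrapy's CrawlerProcess (the file name run.py is just an example):
# run.py: place next to scrapy.cfg and run with: python run.py
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())  # load the project's settings.py
process.crawl('allbooks')                         # spider name defined in AllbooksSpider.name
process.start()                                   # block until the crawl finishes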