Using Scrapy to crawl book information from a foreign book site and save it to MongoDB, Redis, MySQL, and a local spreadsheet via item pipelines (with an optional random proxy middleware)


1. Create the Scrapy project

scrapy startproject AllBooks

2. Enter the project directory and create the spider with the genspider command

scrapy genspider allbooks allitebooks.org
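After these two commands the project layout should look roughly like the sketch below (allbooks.py is the spider just generated; the other files come from the standard Scrapy template):

AllBooks/
├── scrapy.cfg
└── AllBooks/
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        ├── __init__.py
        └── allbooks.py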

3. Define the data to scrape (items.py)

# -*- coding: utf-8 -*-

import scrapy

class AllbooksItem(scrapy.Item):
    book_name = scrapy.Field()
    image_url = scrapy.Field()
    author = scrapy.Field()
    book_info = scrapy.Field()

4. Write the spider that extracts the item data (spiders/allbooks.py)

# -*- coding: utf-8 -*-
import scrapy
from ..items import AllbooksItem

class AllbooksSpider(scrapy.Spider):
    name = 'allbooks'
    allowed_domains = ['allitebooks.org']
    url = 'http://www.allitebooks.org/page/{}'
    start_urls = [url.format(1)]

    def parse(self, response):
        # Get the total number of pages
        total_page = response.xpath('//*[@id="main-content"]/div/div/a[5]/text()').extract_first()
        # with open('dd.html','w',encoding='utf-8')as f:
        #     f.write(response.text)
        print("总共获取到%s页!" % total_page)
        # 修改这个值控制循环多少次
        total_page = 2
        for page in range(1,int(total_page)+1):
            print("处理第%d页..."%page)
            url = self.url.format(page)
            yield scrapy.Request(url=url,callback=self.parse_allbooks)

    def parse_allbooks(self,response):
        print("当前处理的页面:%s"%response.meta['redirect_urls'][0])
        all_book_list = response.xpath('//div[@class="main-content-inner clearfix"]/article')
        for book in all_book_list:
            item = AllbooksItem()
            # Book title
            item['book_name'] = book.xpath('.//h2[@class="entry-title"]//text()').extract_first()
            # Cover image URL
            item['image_url'] = book.xpath('.//div/a/img/@src').extract_first()
            # Authors come back as a list, e.g. ['Adam Karneboge', 'Arek Dreyer'] when there are several
            author = book.xpath('.//h5[@class="entry-author"]/a/text()').extract()
            item['author'] = ",".join(author)
            # Book summary; strip non-breaking spaces and zero-width characters (may be missing)
            book_info = book.xpath('.//div[@class="entry-summary"]/p/text()').extract_first()
            item['book_info'] = (book_info or '').replace('\xa0', ' ').replace('\u200b', '')
            yield item
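Before running the full crawl, the XPath expressions above can be sanity-checked in Scrapy's interactive shell (optional, but handy if the page layout has changed since this was written):

scrapy shell "http://www.allitebooks.org/page/1"

# then, inside the shell:
>>> books = response.xpath('//div[@class="main-content-inner clearfix"]/article')
>>> books[0].xpath('.//h2[@class="entry-title"]//text()').extract_first()
>>> books[0].xpath('.//h5[@class="entry-author"]/a/text()').extract()

If these return empty results, adjust the selectors before running the spider.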

5. Write the item pipelines that store the data (pipelines.py)

# -*- coding: utf-8 -*-

import time
import json
import pymysql
import pymongo
import redis
from openpyxl import Workbook
from scrapy import Item
from scrapy.exceptions import DropItem

class MyEncoder(json.JSONEncoder):
    def default(self, o):
        if isinstance(o, bytes):
            return str(o, encoding='utf-8')
        return json.JSONEncoder.default(self, o)

# Store into Redis; this also serves as de-duplication: items whose book name already exists are dropped via DropItem
class RedisPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        cls.REDIS_HOST = crawler.settings.get('REDIS_HOST')
        cls.REDIS_PORT = crawler.settings.get('REDIS_PORT')
        cls.REDIS_DBNAME = crawler.settings.get('REDIS_DBNAME')
        cls.REDIS_decode_responses = crawler.settings.getbool('REDIS_decode_responses')
        return cls()
    def open_spider(self, spider):
        try:
            self.redis_client = redis.StrictRedis(host=self.REDIS_HOST, port=self.REDIS_PORT,
                                                  db=self.REDIS_DBNAME,decode_responses=self.REDIS_decode_responses)
        except Exception as e:
            print("链接redis出错:",e)
    def process_item(self, item, spider):
        if self.redis_client.sadd('books:items', item['book_name']):
            return item
        raise DropItem("Duplicate book: %s" % item['book_name'])
    def close_spider(self, spider):
        print("redis处理完毕")
# 存入MongoDB数据库
class MongoPipeline(object):
    @classmethod
    def from_crawler(cls,crawler):
        # Read MONGO_DB_URL and MONGO_DB_NAME from settings (fall back to defaults if missing)
        cls.DB_URL = crawler.settings.get('MONGO_DB_URL','mongodb://localhost:27017')
        cls.DB_NAME = crawler.settings.get('MONGO_DB_NAME', 'py4')
        # <class 'AllBooks.pipelines.MongoPipeline'>
        # print(type(cls()))
        return cls()

    def open_spider(self,spider):
        self.client = pymongo.MongoClient(self.DB_URL)
        self.db = self.client[self.DB_NAME]  # Select the database

    def process_item(self, item, spider):
        book_collection = self.db[spider.name]  # Select the collection; created automatically on first insert
        # insert_one() expects a plain dict, not an Item object
        data = dict(item) if isinstance(item, Item) else item
        # De-duplication check; can stay disabled when RedisPipeline is enabled
        # count = book_collection.find({'book_name':item['book_name']}).count()
        # if count == 0:
            # book_collection.insert_one(data)
        book_collection.insert_one(data)
        return item
    def close_spider(self,spider):
        print("mongoDB处理完毕")
        self.client.close()
# Store into MySQL
class MysqlPipeline(object):
    @classmethod
    def from_crawler(cls,crawler):
        cls.MYSQL_HOST = crawler.settings.get('MYSQL_HOST')
        cls.MYSQL_PORT = crawler.settings.get('MYSQL_PORT')
        cls.MYSQL_USER = crawler.settings.get('MYSQL_USER')
        cls.MYSQL_PASSWD = crawler.settings.get('MYSQL_PASSWD')
        cls.MYSQL_DBNAME = crawler.settings.get('MYSQL_DBNAME')
        cls.MYSQL_CHARSET = crawler.settings.get('MYSQL_CHARSET')
        return cls()

    def open_spider(self,spider):
        self.db = pymysql.connect(host=self.MYSQL_HOST, port=self.MYSQL_PORT,
                                  user=self.MYSQL_USER, passwd=self.MYSQL_PASSWD,
                                  db=self.MYSQL_DBNAME, charset=self.MYSQL_CHARSET)
        self.cur = self.db.cursor()

    def process_item(self, item, spider):
        try:
            # De-duplication check
            # self.cur.execute("select book_name from books where book_name=%s",item['book_name'])
            # repetition = self.cur.fetchone()
            # if not repetition:
            #     keys,values = zip(*item.items())
            #     sql = "insert into {}({})VALUES ({})".format('books',','.join(keys),','.join(['%s']*len(values)))
            #     self.cur.execute(sql,values)
            # RedisPipeline already drops items whose book name is in Redis, so duplicates never reach MySQL
            keys, values = zip(*item.items())
            sql = "INSERT INTO {} ({}) VALUES ({})".format('books', ','.join(keys), ','.join(['%s'] * len(values)))
            self.cur.execute(sql, values)
            self.db.commit()
            print(self.cur._last_executed)  # Print the executed SQL (private pymysql attribute)
            return item
        except Exception as e:
            print("出错:",e)
            self.db.rollback()
    def close_spider(self,spider):
        print("mysql处理完毕")
        self.cur.close()
        self.db.close()
# Store into a spreadsheet
class AllbooksPipeline(object):
    def __init__(self):
        self.wb = Workbook()
        self.ws = self.wb.active
        self.ws.title = 'AllBooks'
        # Create the header row
        self.ws.append(['book_name', 'author', 'image_url', 'book_info'])

    def process_item(self, item, spider):
        text = [item['book_name'],item['author'],item['image_url'],item['book_info']]
        self.ws.append(text)
        return item

    def close_spider(self,spider):
        file_end_name = time.strftime("%Y-%m-%d",time.localtime())
        self.wb.save(spider.name+file_end_name+'.xlsx')
        print("存入表格处理完毕!")

6. Configure the project settings (settings.py)

# Redis settings
REDIS_HOST = 'localhost'
REDIS_PORT = 6379
REDIS_DBNAME = 3
REDIS_decode_responses = True

# MySQL settings
MYSQL_HOST = 'localhost'
MYSQL_PORT = 3306
MYSQL_USER = 'root'
MYSQL_PASSWD = '123456'
MYSQL_DBNAME = 'python4'
MYSQL_CHARSET = 'utf8'

# MongoDB settings
MONGO_DB_URL = 'mongodb://localhost:27017'
MONGO_DB_NAME = 'py4'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

DOWNLOAD_DELAY = 3

DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);',
    # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # 'Accept-Language': 'en',
}

# Lower numbers run first; keep the values in line with Scrapy's default settings rather than picking them arbitrarily
ITEM_PIPELINES = {
    'AllBooks.pipelines.AllbooksPipeline': 300,
    'AllBooks.pipelines.RedisPipeline': 301,
    'AllBooks.pipelines.MysqlPipeline': 302,
    'AllBooks.pipelines.MongoPipeline': 303,
}

# Proxy pool (example proxies; replace with working ones before use)
PROXIES = ['HTTP://171.112.165.176:9999',
           'HTTPS://218.24.16.198:43620',
           'HTTP://112.85.130.38:9999',
           'HTTPS://221.218.102.146:33323',
           'HTTP://110.52.235.44:9999',
           ]
# Enable the downloader middleware below if the proxy middleware is needed
DOWNLOADER_MIDDLEWARES = {
    # 'AllBooks.middlewares.AllbooksDownloaderMiddleware': 543,
    'AllBooks.middlewares.RandomProxyMiddleware': 749,
}
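# Priority 749 places RandomProxyMiddleware just before Scrapy's built-in HttpProxyMiddleware (750),
# so request.meta['proxy'] is already set when the built-in middleware applies it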
DOWNLOAD_TIMEOUT = 5  # The default is 180 seconds (3 minutes)

# Optionally write the log to a local file
LOG_FILE = "allbooks.log"
LOG_LEVEL = "DEBUG"
# Redirect print output into the log file as well
LOG_STDOUT = True

7. (Optional) Add a random proxy downloader middleware (middlewares.py)

import random
from scrapy import signals
from scrapy.exceptions import NotConfigured
from collections import defaultdict

class RandomProxyMiddleware(object):
    def __init__(self, settings):
        # The proxy scheme may be uppercase in PROXIES; normalize everything to lowercase
        proxies_list = settings.getlist('PROXIES')
        self.proxies = [s.lower() for s in proxies_list if isinstance(s, str)]
        # Track how many times each proxy has failed
        self.count = defaultdict(int)
        # Maximum allowed failures per proxy
        self.max_failed = 3
    @classmethod
    def from_crawler(cls,crawler):
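        # Note: HTTPPROXY_ENABLED is True in Scrapy's default settings, so this check normally passes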
        if not crawler.settings.getbool('HTTPPROXY_ENABLED'):
            raise NotConfigured
        return cls(crawler.settings)

    def process_request(self,request,spider):
        if self.proxies and not request.meta.get('proxy') and request.url not in spider.start_urls:
            request.meta['proxy'] = random.choice(self.proxies)
            print("当前使用的代理是:",request.meta['proxy'])

    def process_response(self,request,response,spider):
        # Get the response status code
        get_status = response.status
        cur_proxy = request.meta.get('proxy')
        if get_status in (400, 403):
            self.count[cur_proxy] += 1
        # Only drop and retry when a proxy was actually used and it has failed too many times
        if cur_proxy and self.count[cur_proxy] > self.max_failed:
            print("got error http code(%s) when use proxy:%s" % (get_status, cur_proxy))
            self.remove_proxy(cur_proxy)
            del request.meta['proxy']
            return request
        # e.g. response: <301 http://www.allitebooks.org/page/1>
        return response
    def process_exception(self,request,exception,spider):
        cur_proxy = request.meta.get('proxy')
        from twisted.internet.error import ConnectionRefusedError,TimeoutError
        # If a proxy was in use and one of these errors occurred, report and drop the failing proxy
        if cur_proxy and isinstance(exception,(ConnectionRefusedError,TimeoutError)):
            print("ERROR(%s) when use proxy:%s"%(exception,cur_proxy))
            self.remove_proxy(cur_proxy)
            del request.meta['proxy']
            return request
    def remove_proxy(self,cur_proxy):
        if cur_proxy in self.proxies:
            # Remove the dead proxy from the list so it is not picked again
            self.proxies.remove(cur_proxy)
            print("remove proxy:%s from proxy list" %cur_proxy)

8. Before crawling, make sure the MySQL, Redis, and MongoDB servers are running, that the MySQL database named in MYSQL_DBNAME (here python4) exists, and create the books table in advance:

CREATE TABLE IF NOT EXISTS books(
    id INT AUTO_INCREMENT PRIMARY KEY NOT NULL,
    book_name VARCHAR(200) NOT NULL,
    author VARCHAR(200),
    image_url VARCHAR(300),
    book_info VARCHAR(800)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

9. With everything in place, run the crawl command to start the spider:

scrapy crawl allbooks
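Once the crawl completes, a small check script can confirm that data reached all three databases. This is only a sketch that reuses the connection values from settings.py above:

import pymongo
import pymysql
import redis

# MongoDB: MongoPipeline names the collection after the spider ('allbooks')
mongo = pymongo.MongoClient('mongodb://localhost:27017')
print("MongoDB documents:", mongo['py4']['allbooks'].count_documents({}))

# Redis: RedisPipeline stores book names in the 'books:items' set
r = redis.StrictRedis(host='localhost', port=6379, db=3, decode_responses=True)
print("Redis set size:", r.scard('books:items'))

# MySQL: MysqlPipeline inserts into the 'books' table
db = pymysql.connect(host='localhost', port=3306, user='root',
                     passwd='123456', db='python4', charset='utf8')
cur = db.cursor()
cur.execute("SELECT COUNT(*) FROM books")
print("MySQL rows:", cur.fetchone()[0])
cur.close()
db.close()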

 
