利用scrapy爬取文件后并基于管道化的持久化存储

最新推荐文章于 2024-11-13 17:26:18 发布

weixin_30216561

最新推荐文章于 2024-11-13 17:26:18 发布

阅读量155

点赞数

文章标签： python 数据库爬虫

原文链接：http://www.cnblogs.com/tianshuai1/p/10841290.html

版权

本文详细介绍使用PyCharm和Scrapy框架进行网页爬虫的全过程，从项目创建到数据抓取，再到数据存储至本地文件、MySQL数据库及Redis缓存。通过实例演示如何解析网页数据，设置item对象属性，并在pipelines.py中实现数据持久化存储。

摘要由CSDN通过智能技术生成

我们在pycharm上爬取

首先我们可以在本文件打开命令框或在Terminal下创建

scrapy startproject xiaohuaPro ------------创建文件

scrapy genspider xiaohua www.xxx.com ----------创建执行文件

一.首先我们要进行数据的爬取　

import scrapy
from xioahuaPro.items import XioahuaproItem


class XiaohuaSpider(scrapy.Spider):
    name = 'xiaohua'
    start_urls=['http://www.521609.com/daxuemeinv/']
    #生成一个通用的url模板
    url = 'http://www.521609.com/daxuemeinv/list8%d.html'
    pageNum =1

    def parse(self, response):
        li_list=response.xpath('//div[@class="index_img list_center"]/ul/li')
        for li in li_list:
            name = li.xpath('./a[2]/text() | ./a[2]/b/text()').extract_first()
            img_url = 'http://www.521609.com'+li.xpath('./a[1]/img/@src').extract_first()
            #实例化一个item类型的对象
            item = XioahuaproItem()
            item['name'] = name
            item['img_url'] = img_url
            #item提交给管道
            yield item
        # 对其他页码的url进行手动i请求的发送
        if self.pageNum <= 24:   ------爬取的页数
            self.pageNum += 1
            new_url = format(self.url%self.pageNum)
            yield scrapy.Request(url=new_url,callback=self.parse)

之后再items.py文件下为item对象设置属性

将爬取到的所有信息全部设置为item的属性

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class XioahuaproItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()
    img_url = scrapy.Field()

二.写入pipelines.py内容

首先写入到自定义的文件里去

作用：将解析到的数据存储到某一个平台中。
import pymysql
from redis import Redis
class XioahuaproPipeline(object):
    fp = None
    def open_spider(self,spider):
        print('开始爬虫！')
        self.fp = open('./xiaohua.txt','w',encoding='utf-8')
    #作用：实现持久化存储的操作
    #该方法的item参数就可以接收爬虫文件提交过来的item对象
    #该方法每接收一个item就会被调用一次（调用多次）
    def process_item(self, item, spider):
        name = item['name']
        img_url = item['img_url']
        self.fp.write(name+':'+img_url+'\n')
        #返回值的作用：就是将item传递给下一个即将被执行的管道类
        return item
#
    def close_spider(self,spider):
        print('结束爬虫！')
        self.fp.close()
#

写到数据库里面,我们要在数据库里面创建个表(将mysql和redis都启动)

class MysqlPipeline(object):
    conn = None
    cursor = None
    def open_spider(self, spider):
        #解决数据库字段无法存储中文处理：alter table tableName convert to charset utf8;
        self.conn = pymysql.Connect(host='127.0.0.1',port=3306,user='root',password='123',db='test',charset='utf8')
        print(self.conn)
    def process_item(self, item, spider):
        self.cursor = self.conn.cursor()
        try:
            self.cursor.execute('insert into xiaohua values ("%s","%s")'%(item['name'],item['img_url']))
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item
    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()

在相同的文件下创建redis类写入数据

class RedisPipeline(object):
    conn = None
    def open_spider(self, spider):
        self.conn = Redis(host='127.0.0.1',port=6379)
        print(self.conn)
    def process_item(self, item, spider):
        dic = {
            'name':item['name'],
            'img_url':item['img_url']
        }
        print(str(dic))
        self.conn.lpush('xiaohua',str(dic))
        return item
    def close_spider(self, spider):
        pass

三.更改配置文件,在settings.py里面

#添加上这行代码
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False       -----改成False

ITEM_PIPELINES = {
    'xioahuaPro.pipelines.XioahuaproPipeline': 300,  ---对应文件
    # 'xioahuaPro.pipelines.MysqlPipeline': 301,    ----对应数据库

    # 'xioahuaPro.pipelines.RedisPipeline': 302,    -----对应redis
}

LOG_LEVEL = 'ERROR'     
　　　　　　　　　　　　# CRITICAL --严重错误
　　　　　　　　　　　　#ERROR    ---一般错误
　　　　　　　　　　　　#WARNING ---警告信息
　　　　　　　　　　　　#INFO  ---一般信息
　　　　　　　　　　　　#DEBUG  --调试信息