scrapy-crawlspider爬取某新闻网站住房有关文章

最新推荐文章于 2024-01-24 17:47:13 发布

fiery_heart

最新推荐文章于 2024-01-24 17:47:13 发布

阅读量476

点赞数

分类专栏：爬虫文章标签： scrapy crawl-spider

本文链接：https://blog.csdn.net/fiery_heart/article/details/82189273

版权

爬虫专栏收录该内容

10 篇文章 0 订阅

订阅专栏

首先创建项目，创建爬虫

scrapy startproject qianlongwang # 创建了一个项目

在项目的根目录下，创建爬虫，一个项目可以有多个爬虫

scrapy genspider -t crawl fangchan xxxx.com # 创建了一个名为fangchan的爬虫，并指定了爬虫的活动范围

第一步：明确需求

也就是搞清楚自己想要什么数据，然后在items.py文件中定义字段（算是一种映射）
我想要的就是文章的标题和内容
items.py文件

import scrapy
class QianlongwangItem(scrapy.Item):
    """Container for one scraped article: its title and its body text."""

    # Title of the article.
    title = scrapy.Field()
    # Body text of the article.
    content = scrapy.Field()

第二步，爬虫文件的编写

这一步就是爬虫文件的编写了，包括解析响应内容，提取出需要的链接和数据

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from qianlongwang.items import QianlongwangItem

class FangchanSpider(CrawlSpider):
    """Crawl the real-estate channel of house.qianlong.com and extract articles.

    The spider starts from the first listing page, keeps following the
    paginated listing pages, and passes every article page to
    :meth:`parse_item`.
    """

    # Name of the spider.
    name = 'fangchan'
    # Domains the spider is allowed to crawl.
    allowed_domains = ['house.qianlong.com']
    # First URL to download when the spider starts running.
    start_urls = ['http://house.qianlong.com/shoudufangchan/1.shtml']

    # How the two rules below work:
    #
    # * The first rule extracts the links to the listing pages. They must be
    #   followed, so ``follow`` is True. (``follow`` defaults to True when a
    #   rule has no callback and to False when it has one.)
    # * The second rule extracts the link of every article found on a listing
    #   page and names the callback that extracts text from the article page.
    #
    # Processing flow: the spider takes its first task from ``start_urls`` and
    # Scrapy downloads that page. The response is then matched against each
    # rule in ``rules``; the rule's callback and ``follow`` flag decide what
    # happens next.
    #
    # * ``LinkExtractor(allow=...)`` searches the whole page for links that
    #   match the ``allow`` regular expression.
    # * ``follow=True`` puts the extracted links on the task queue (after
    #   checking they have not been crawled already); their responses are
    #   matched against the rules again, and so on.
    # * ``callback='parse_item'`` hands the downloaded page to the named
    #   method so it can extract the useful data.
    #
    # The dot before ``shtml`` is escaped so it only matches a literal ".".
    rules = (
        # Listing pages (pagination).
        Rule(LinkExtractor(allow=r'shoudufangchan/\d+\.shtml'), follow=True),
        # Article pages.
        Rule(LinkExtractor(allow=r'shoudufangchan/\d+/\d+/\d+\.shtml'), callback='parse_item'),
    )

    def parse_item(self, response):
        """Extract the title and body text from one article page.

        Args:
            response: the downloaded article page; every callback receives it.

        Returns:
            A populated ``QianlongwangItem`` that is handed on to the item
            pipelines, or ``None`` when the page has no title node.
        """
        title = response.xpath('//div[@class="span12"]/h1/text()').extract_first()
        if title is None:
            # Pages without the expected heading are skipped explicitly
            # instead of failing with an IndexError.
            self.logger.warning('no title found on %s, skipping', response.url)
            return None

        item = QianlongwangItem()
        item['title'] = title
        # string(...) yields the concatenated text of the node, or an empty
        # string when the node is absent.
        item['content'] = response.xpath(
            'string(//div[@class="article-content"])'
        ).extract_first()
        return item

第三步，pipelines文件的编写。

数据在爬虫文件中已经拿到了，接下来就是对数据进行保存等处理。
pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import io
import json
import pymysql
#导入settings文件，因为在settings文件中定义了数据的相关信息
from qianlongwang import settings

# 这个管道文件将数据存储到本地json文件中
class QianlongwangPipeline(object):
    """Item pipeline that writes every item to ``fangchan.json``, one JSON object per line."""

    def __init__(self):
        # Text mode with an explicit UTF-8 encoding for the output file.
        self.file = io.open('fangchan.json', mode='w', encoding='utf-8')

    def process_item(self, item, spider):
        """Serialize ``item`` as one JSON line and append it to the output file.

        Args:
            item: the item returned by the spider; it is dict-like, so it is
                converted to a plain dict before being dumped as JSON.
            spider: the spider that produced the item (unused).

        Returns:
            The same ``item``; it must be returned so it continues on.
        """
        record = dict(item)
        # ensure_ascii=False keeps non-ASCII text (e.g. Chinese) unescaped.
        serialized = json.dumps(record, ensure_ascii=False)
        self.file.write(serialized + "\n")
        return item

    def close_spider(self, spider):
        """Close the output file; runs automatically once the spider has finished."""
        self.file.close()

# 这个管道文件将数据存储到mysql数据库中
class MysqlPipeline(object):
    """Item pipeline that inserts every item as a row into a MySQL table."""

    # Placeholders are filled in by the driver, which quotes the values itself;
    # this avoids building SQL by string interpolation.
    # NOTE(review): the table is spelled "qinalong" in the original query --
    # confirm this matches the real table name.
    _INSERT_SQL = "insert into qinalong(title,content) values (%s,%s)"

    def __init__(self):
        # Connection parameters come from the project's settings module.
        self.conn = pymysql.connect(
            host=settings.MYSQL_HOST,
            db=settings.MYSQL_DBNAME,
            user=settings.MYSQL_USER,
            passwd=settings.MYSQL_PASSWD,
            # Keep these two options, otherwise encoding errors are raised.
            charset='utf8',
            use_unicode=True,
        )
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        """Insert the item's title and content and commit the transaction.

        A failed insert is rolled back and logged through the spider's logger;
        it does not stop the crawl.

        Args:
            item: dict-like item with ``title`` and ``content`` keys.
            spider: the spider that produced the item; its logger is used.

        Returns:
            The same ``item`` so that later pipelines still receive it.

        Raises:
            KeyError: if ``item`` lacks ``title`` or ``content``.
        """
        params = (item['title'], item['content'])
        try:
            self.cursor.execute(self._INSERT_SQL, params)
            self.conn.commit()
        except Exception:
            # Best effort: undo the failed statement and record why it failed
            # instead of dropping the error silently.
            self.conn.rollback()
            spider.logger.exception('insert error')
        else:
            spider.logger.debug('insert success')
        return item

    def close_spider(self, spider):
        """Release the cursor and the database connection when the spider closes."""
        self.cursor.close()
        self.conn.close()

第四步，下载中间件的编写

写下载中间件是为了对发出的请求进行包装，比如加个ua，设置个代理之类的，并且还可以对发生异常的请求进行处理
middlewares.py

# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals

import random
# 从settings文件导入设置好的ua列表
from qianlongwang.settings import USER_AGENTS

class RandomUserAgent(object):
    """Downloader middleware that gives each outgoing request a random User-Agent."""

    def process_request(self, request, spider):
        """Pick a random entry from ``USER_AGENTS`` and set it on the request.

        Request decoration such as adding a User-Agent or a proxy is done in
        this hook; the method must have exactly this name.

        Args:
            request: the outgoing request whose headers are updated.
            spider: the spider issuing the request (unused).

        Returns:
            None.
        """
        user_agent = random.choice(USER_AGENTS)
        # The HTTP header is spelled "User-Agent"; a header named "UserAgent"
        # does not change the agent string that is actually sent.
        # setdefault leaves a User-Agent that is already present untouched.
        request.headers.setdefault('User-Agent', user_agent)

    # Other hooks exist as well, for example one that handles requests that
    # raised an exception:
    #
    #     def process_exception(self, request, exception, spider):
    #         pass

最后一步，settings文件的设置

只列出一部分

# MySQL connection settings.
MYSQL_HOST = "127.0.0.1"
MYSQL_DBNAME = "duanzi"
MYSQL_USER = "root"
MYSQL_PASSWD = ""

# Pool of User-Agent strings the downloader middleware chooses from.
USER_AGENTS = [
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
]

# Download delay; the default is 0.
DOWNLOAD_DELAY = 3

# Enable the downloader middleware. The trailing 100 is its level: the smaller
# the number, the higher the priority, and priority decides the order in which
# middlewares run. Scrapy ships with many built-in middlewares worth a look.
DOWNLOADER_MIDDLEWARES = {
    "qianlongwang.middlewares.RandomUserAgent": 100,
}

# Enable the item pipelines.
ITEM_PIPELINES = {
    # The local-file pipeline is commented out: items go to MySQL, so the
    # local copy is not needed.
    # "qianlongwang.pipelines.QianlongwangPipeline": 300,
    "qianlongwang.pipelines.MysqlPipeline": 300,
}