Python crawler 07: defining and using scrapy items, and getting an md5 hash

Defining and using scrapy items

When there are many fields to scrape, they need to be bundled together into a single object. Scrapy provides the Item class for this, and an items.py file was already generated when the project was created.

An Item is an enhanced version of a dict: fields are read and written with the same square-bracket syntax, but only fields that have been declared are accepted.
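
A minimal standalone sketch of what "enhanced dict" means in practice (DemoItem is only an illustration, not part of the project):

import scrapy


class DemoItem(scrapy.Item):
    title = scrapy.Field()
    url = scrapy.Field()


item = DemoItem()
item['title'] = 'hello'           # dict-style access works for declared fields
print(item['title'], dict(item))  # an Item also converts back to a plain dict
# item['author'] = 'x'            # raises KeyError: DemoItem does not support field: author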

Disabling the robots protocol:
A site's robots rules can be viewed in the browser at domain + /robots.txt.
To stop obeying robots.txt, search settings.py for ROBOTSTXT_OBEY = True and change True to False.
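
After the change, the line in settings.py reads:

# Obey robots.txt rules
ROBOTSTXT_OBEY = False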

While you are in settings.py, also search for ITEM_PIPELINES and uncomment the default block:

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
   'ArticleSpider.pipelines.ArticlespiderPipeline': 300,
}

The 300 is the priority; pipelines with smaller numbers run first.
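
For example, if another pipeline were added later (the JsonExporterPipeline name below is hypothetical), the priorities would decide the order in which each pipeline's process_item is called:

ITEM_PIPELINES = {
   'ArticleSpider.pipelines.JsonExporterPipeline': 200,   # hypothetical exporter, runs first
   'ArticleSpider.pipelines.ArticlespiderPipeline': 300,  # default pipeline, runs second
}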

Adding an md5 helper:

Inside the project, create a new Python package, add a utils folder inside it, create a common.py file there (so the import path ArticleSpider.utils.common used later resolves), and then copy the following code into it.

import hashlib


def get_md5(url):
    # hashlib only accepts bytes, so encode str URLs first
    if isinstance(url, str):
        url = url.encode('utf8')
    m = hashlib.md5()
    m.update(url)

    return m.hexdigest()


if __name__ == '__main__':
    print(get_md5('https://cnblogs.com'))
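
With the helper in place, the layout should look roughly like this (a sketch, assuming the ArticleSpider project and main.py entry script from the earlier posts):

ArticleSpider/
    scrapy.cfg
    main.py
    ArticleSpider/
        items.py
        pipelines.py
        settings.py
        utils/
            __init__.py
            common.py
        spiders/
            jobbole.py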

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class ArticlespiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


class JobBoleArticleItem(scrapy.Item):
    url = scrapy.Field()
    url_object_id = scrapy.Field()
    title = scrapy.Field()
    create_date = scrapy.Field()
    front_image_url = scrapy.Field()
    front_image_path = scrapy.Field() # where the image is stored after download
    praise_nums = scrapy.Field()
    comment_nums = scrapy.Field()
    fav_nums = scrapy.Field()
    tags = scrapy.Field()
    content = scrapy.Field()

jobbole.py

# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request
from urllib import parse
import requests
import json
import re

from ArticleSpider.utils.common import get_md5
from ArticleSpider.items import JobBoleArticleItem


class JobboleSpider(scrapy.Spider):
    name = 'jobbole'
    allowed_domains = ['news.cnblogs.com']
    start_urls = ['http://news.cnblogs.com/']

    def parse(self, response):
        """
        parse 中一般写抓取策略,网页解析写在别处
        1.这里是获取新闻列表页中的新闻url并交给scrapy进行下载,之后调用相应的解析方法
        2.还要获取下一个列表页,继续循环
        :param response: 调用此方法的结果
        :return:
        """
        post_nodes = response.css('#news_list .news_block')[:1] # 一个selector列表 加切片方便调试
        for post_node in post_nodes:
            image_url = post_node.css('.entry_summary a img::attr(href)').extract_first("") # 封面图片网址
            post_url = post_node.css('h2 a::attr(href)').extract_first("")
            yield Request(url=parse.urljoin(response.url, post_url),
                          meta={"front_image_url": image_url},
                          callback=self.parse_detail
                          ) # 简化代码处理异常url
        '''
        提取下一页并交给scrapy下载
        next_url = response.css("div.pager a:last_chile::text").extract_first("")
        if next_url == "Next >":
            next_url = response.css("div.pager a:last_chile::attr(href)").extract_first("")
            yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)  # 简化代码处理异常url
        或者
        next_url = response.xpath("//a[contains(text(), 'Next >')]/@href").extract_first("")
        yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)
        '''

    def parse_detail(self, response):
        '''Parse the downloaded detail page.'''
        match_id = re.match(r".*?(\d+)", response.url) # pull the article id out with a regex; this pattern is not very robust
        if match_id:
            article_item = JobBoleArticleItem()

            # XPath versions; the equivalent CSS selectors are kept in the comments
            # title = response.css("#news_title a::text").extract_first("")
            title = response.xpath("//*[@id='news_title']//a/text()").extract_first("")
            # create_date = response.css("#news_info .time::text").extract_first("")
            create_date = response.xpath("//*[@id='news_info']//*[@class='time']/text()").extract_first("")
            match_re = re.match(r".*?(\d+.*)", create_date)
            if match_re:
                create_date = match_re.group(1)
            # content = response.css("#news_content").extract()[0]  # usually grab the html block the article sits in
            content = response.xpath("//*[@id='news_content']").extract()[0]
            # tag_list = response.css(".news_tags a::text").extract()
            tag_list = response.xpath("//*[@class='news_tags']//a/text()").extract()
            tags = ", ".join(tag_list)

            post_id = match_id.group(1)

            article_item['url'] = response.url
            article_item['url_object_id'] = get_md5(article_item['url'])
            article_item['title'] = title
            article_item['create_date'] = create_date
            article_item['content'] = content
            article_item['tags'] = tags
            article_item['front_image_url'] = response.meta.get("front_image_url", "") # .get avoids a KeyError if the key is missing

            yield Request(url=parse.urljoin(response.url, "/NewsAjax/GetAjaxNewsInfo?contentId={}".format(post_id)),
                          meta={'article_item': article_item},
                          callback=self.parse_page_nums,)

    '''Everything is callback based, even if that makes the flow a little more cumbersome.'''
    def parse_page_nums(self, response):
        article_item = response.meta.get('article_item', '')

        j_data = json.loads(response.text)
        praise_nums = j_data["DiggCount"]
        fav_nums = j_data["TotalView"]
        comment_nums = j_data["CommentCount"]
        print(j_data)

        article_item['praise_nums'] = praise_nums
        article_item['fav_nums'] = fav_nums
        article_item['comment_nums'] = comment_nums

        yield article_item # both Item and Request objects can be yielded at any point; Items go through the logic in pipelines.py

        print('stop')
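
Both callbacks above rely on parse.urljoin to turn relative links into absolute URLs; a quick standalone illustration (the 123456 id is made up):

from urllib import parse

base = 'https://news.cnblogs.com/'
print(parse.urljoin(base, '/n/123456/'))             # -> https://news.cnblogs.com/n/123456/
print(parse.urljoin(base, 'https://example.com/x'))  # absolute URLs come back unchanged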

pipelines.py
Set a breakpoint on the last line below, run the main.py configured in the earlier posts, and you can watch execution jump into pipelines.py:

class ArticlespiderPipeline:
    def process_item(self, item, spider):
        return item
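
The generated pipeline simply passes the item through. As a preview of what process_item can be used for (the JsonLinesPipeline below is an illustrative sketch, not the course code), a pipeline might append every item to a JSON Lines file:

import codecs
import json


class JsonLinesPipeline:
    def open_spider(self, spider):
        # called once when the spider starts
        self.file = codecs.open('articles.jl', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # dict(item) works because an Item behaves like a dict
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item  # return the item so any later pipeline still receives it

    def close_spider(self, spider):
        self.file.close()

To take effect, such a pipeline would also have to be registered in ITEM_PIPELINES with its own priority.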