Scrapy ItemLoader

1.导入包


2.提取数据页面


3.item.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

# import scrapy
#
#
# class BolespiderItem(scrapy.Item):
#     url = scrapy.Field()
#     title = scrapy.Field()
#     time = scrapy.Field()
#     sort = scrapy.Field()
#     content = scrapy.Field()
#     praise = scrapy.Field()
#     collect = scrapy.Field()
#     comment = scrapy.Field()
import scrapy, re
from scrapy.contrib.loader import ItemLoader
from datetime import datetime
from scrapy.contrib.loader.processor import Join, MapCompose, TakeFirst


def convert_time(value):
    """Parse a ``YYYY/MM/DD`` date string into a ``datetime``.

    ``value`` is a single item of the extracted list, not the list itself.
    Every ``.`` is removed and surrounding whitespace is stripped before
    the text is parsed.

    Returns:
        The parsed ``datetime``, or the current local time when the cleaned
        text does not match the ``%Y/%m/%d`` format.
    """
    cleaned = value.replace('.', '').strip()
    try:
        return datetime.strptime(cleaned, '%Y/%m/%d')
    except ValueError:
        # strptime signals a format mismatch with ValueError. Catching only
        # that keeps the best-effort fallback without also swallowing
        # KeyboardInterrupt/SystemExit or unrelated programming errors.
        return datetime.now()


def convert_sort(value):
    """Blank out values containing the comment marker; pass others through.

    Returns an empty string when ``value`` contains ``'评论 '`` (with the
    trailing space), otherwise returns ``value`` unchanged.
    """
    return "" if '评论 ' in value else value


def convert_praise(value):
    """Extract the first integer from ``value``.

    Returns:
        ``None`` when ``value`` is empty or whitespace-only, the first run
        of digits as an ``int`` when one exists, and ``0`` when the text is
        non-blank but contains no digits.
    """
    # Blank input yields None rather than 0; the original comment notes the
    # extracted values may be '' alone or '' followed by a number.
    if value.strip() == "":
        return None
    first_number = re.search(r'\d+', value)
    if first_number is None:
        return 0
    return int(first_number.group())


def convert_collect(value):
    """Return the first run of digits in ``value`` as an ``int``, else ``0``."""
    # A regular expression picks the number out of the surrounding text.
    first_number = re.search(r'\d+', value)
    return int(first_number.group()) if first_number else 0


def convert_comment(value):
    """Return the first run of digits in ``value`` as an ``int``, else ``0``."""
    digit_runs = re.compile(r'\d+').findall(value)
    if not digit_runs:
        return 0
    return int(digit_runs[0])


class BolespiderItem(scrapy.Item):
    """Scrapy item whose fields declare their own loader processors.

    Each ``scrapy.Field`` carries an ``output_processor`` and, where the raw
    text needs converting, an ``input_processor`` built with ``MapCompose``
    around one of this module's ``convert_*`` functions.
    """

    # Fields with no conversion step, only an output processor.
    title = scrapy.Field(output_processor=TakeFirst())

    # Each extracted value goes through convert_time.
    time = scrapy.Field(input_processor=MapCompose(convert_time), output_processor=TakeFirst())

    # Each extracted value goes through convert_sort; results are joined.
    sort = scrapy.Field(input_processor=MapCompose(convert_sort), output_processor=Join())

    content = scrapy.Field(output_processor=Join())

    # Counter fields: each extracted value goes through its convert_* function.
    praise = scrapy.Field(input_processor=MapCompose(convert_praise), output_processor=TakeFirst())
    collect = scrapy.Field(input_processor=MapCompose(convert_collect), output_processor=TakeFirst())
    comment = scrapy.Field(input_processor=MapCompose(convert_comment), output_processor=TakeFirst())

    detail_url = scrapy.Field(output_processor=Join())

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值