Scraping the Douban Top 250 with Python Scrapy (ItemLoader)

*No proper formatting here yet (too lazy =,=); I'll rewrite this with a detailed analysis when I have time.*

ItemLoader makes the spider easier to maintain and noticeably shortens the code. (For comparison, see my other post that does the same crawl without ItemLoader.)
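To make that concrete, here is a minimal before/after sketch (the xpath is illustrative only, not the exact one used below):

```python
# Without ItemLoader: extract, clean and assign every field by hand
item = doubanSpiderItem()
item["title"] = response.xpath('//span[@class="title"]/text()').extract_first()

# With ItemLoader: declare where each field comes from; the input/output
# processors declared in items.py do the cleanup in one central place
item_loader = doubanItemLoader(item=doubanSpiderItem(), response=response)
item_loader.add_xpath("title", '//span[@class="title"]/text()')
item = item_loader.load_item()
```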
**You can see here that Play_time comes out wrong; this is covered in the items data-conversion section below, where a lambda can be used to keep only the year.**
![screenshot of the scraped data](https://img-blog.csdnimg.cn/20200303184426170.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3FxXzQzNjgxMTE1,size_16,color_FFFFFF,t_70)

The full source follows. First the spider:



import scrapy
from douban.items import doubanItemLoader, doubanSpiderItem
from scrapy.loader import ItemLoader
from scrapy.http import Request
from urllib import parse
class Douban2019Spider(scrapy.Spider):
    name = 'douban2019'
    # allowed_domains = ['https://movie.douban.com/top250']
    start_urls = ['https://movie.douban.com/top250']

    def parse(self, response):
        # Use a callback: hand the current page to parse_detail as well
        yield Request(url=response.url, callback=self.parse_detail)
        # Follow the pagination links (page 1 links to all the other pages)
        post_urls = response.xpath('//*[@id="content"]/div/div[1]/div[2]/a/@href').extract()
        for post_url in post_urls:
            yield Request(url=parse.urljoin(response.url, post_url), callback=self.parse_detail)



    # I didn't simplify these xpaths; they are copied straight from the browser
    def parse_detail(self, response):
        # Each Top 250 page lists 25 movies, li[1]..li[25]
        for i in range(1, 26):
            # doubanItemLoader (see items.py) overrides the default output
            # processor so every extracted list collapses to a single str
            item_loader = doubanItemLoader(item=doubanSpiderItem(), response=response)
            item_loader.add_xpath("image_url",'//*[@id="content"]/div/div[1]/ol/li[ %s ]/div/div[1]/a/img/@src'%i)
            item_loader.add_xpath("Rank", '//*[@id="content"]/div/div[1]/ol/li[ %s ]/div/div[1]/em/text()'%i)
            item_loader.add_xpath("title",'//*[@id="content"]/div/div[1]/ol/li[ %s ]/div/div[2]/div[1]/a/span[1]/text()'%i)
            item_loader.add_xpath('Main_role','//*[@id="content"]/div/div[1]/ol/li[ %s ]/div/div[2]/div[2]/p[1]/text()[1]'%i)
            item_loader.add_xpath('Play_time','//*[@id="content"]/div/div[1]/ol/li[ %s ]/div/div[2]/div[2]/p[1]/text()[2]'%i)
            item_loader.add_xpath('tags', '//*[@id="content"]/div/div[1]/ol/li[ %s ]/div/div[2]/div[2]/p[1]/text()[2]'%i)
            item_loader.add_xpath('Score', '//*[@id="content"]/div/div[1]/ol/li[ %s ]/div/div[2]/div[2]/div/span[2]/text()'%i)
            item_loader.add_xpath('comment', '//*[@id="content"]/div/div[1]/ol/li[ %s ]/div/div[2]/div[2]/div/span[4]/text()'%i)
            item_loader.add_xpath('Theme_sentence', '//*[@id="content"]/div/div[1]/ol/li[ %s ]/div/div[2]/div[2]/p[2]/span/text()'%i)
            item_loader.add_xpath('source','//div/div[1]/ol/li[ %s ]/div/div[2]/div[1]/a/@href'%i)
            douban_item = item_loader.load_item()
            yield douban_item

items.py
# Item definitions plus the loader and processor functions (a bit messy)

import re
import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, MapCompose
import datetime


class DoubanItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass

class doubanItemLoader(ItemLoader):
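    # TakeFirst returns the first non-empty value from each extracted list,
    # so every loaded field comes out as a single str rather than a list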
    default_output_processor = TakeFirst()

def RemoveFormatter(value):
    # Strip whitespace and format characters such as \n, \xa0 and \u3000
    return "".join(value.split())
def date_convert(value):
    # Convert the year string to a date for easy storage in the database;
    # if you want the year as an int instead, append a lambda in MapCompose
    # (see the sketch after the item class below)
    try:
        Time = datetime.datetime.strptime(value, "%Y").date()
    except Exception:
        Time = datetime.datetime.now().date()
    return Time
# Note the difference between greedy and non-greedy matching:
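# e.g. re.match(r".*(\d+)", "abc1994").group(1) == "4"    (greedy .* swallows digits)
#      re.match(r".*?(\d+)", "abc1994").group(1) == "1994" (lazy .*? stops as early as it can)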
def get_nums(value):
    # Pull the first run of digits out of a string (used for the vote count)
    rex_str = r".*?(\d+).*"
    match_obj = re.match(rex_str, value)
    if match_obj:
        value = match_obj.group(1)
    return value
def get_time(value):
    # Same digit extraction, here used to pull the year out of the info line
    rex_str = r".*?(\d+).*"
    match_obj = re.match(rex_str, value)
    if match_obj:
        value = match_obj.group(1)
    return value
def Remove_nums(value):
    # Drop the leading year, keeping everything after the first "/"
    rex_str = r"\d+/(.*)"
    match_obj = re.match(rex_str, value)
    if match_obj:
        value = match_obj.group(1)
    return value

def return_value(value):
    # Identity processor: handy for bypassing TakeFirst on a field
    # (defined here but not attached to any field below)
    return value


def get_Madein(value):
    # Grab the country by slicing two characters at fixed offsets 89-90 of
    # the raw info string; very fragile, and not used by any field below
    value = list(value)
    return str(value[89] + value[90])

def add_Num(value):
    return "No."+value

class doubanSpiderItem(scrapy.Item):
    # MapCompose chains the listed input-processor functions over each value
    image_url=scrapy.Field()
    source=scrapy.Field()
    Rank=scrapy.Field(
        # input_processor=MapCompose(add_Num)
    )
    title=scrapy.Field()
    Main_role=scrapy.Field(
        input_processor=MapCompose(RemoveFormatter)
    )
    Play_time=scrapy.Field(
        input_processor=MapCompose(RemoveFormatter,get_time,date_convert)
    )
    tags=scrapy.Field(
        input_processor=MapCompose(RemoveFormatter,Remove_nums)
    )
    Score=scrapy.Field()
    comment=scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    Theme_sentence=scrapy.Field()
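As promised above, if you would rather store Play_time as a plain int year than a date, a minimal sketch (assuming get_time has already reduced the value to a digit string such as "1994") is to append a lambda to the processor chain:

```python
Play_time = scrapy.Field(
    # the trailing lambda turns the year string into an int instead of a date
    input_processor=MapCompose(RemoveFormatter, get_time, lambda v: int(v))
)
```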

pipelines.py
# Pipelines persist the data. The dataset is small, so I use the synchronous
# approach (it is easier to understand); an asynchronous Twisted-based
# version is sketched after the code.

import MySQLdb

class DoubanPipeline(object):
    def process_item(self, item, spider):
        return item

class MysqlPipeline(object):
    # Synchronous: each item blocks until its INSERT is committed
    def __init__(self):
        self.conn = MySQLdb.connect('127.0.0.1', 'root', 'root', 'douban', charset="utf8", use_unicode=True)
        self.cursor = self.conn.cursor()
    def process_item(self, item, spider):
        # Backticks because Rank and comment can collide with MySQL keywords
        insert_sql = """
            insert into top250(`Rank`, Title, url, Main_role, tags, Theme_sen, `comment`, score, Play_time, image_url)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        """
        self.cursor.execute(insert_sql, (
            item["Rank"], item["title"], item["source"], item["Main_role"],
            item["tags"], item["Theme_sentence"], item["comment"],
            item["Score"], item["Play_time"], item["image_url"]))
        self.conn.commit()
        return item
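For reference, here is a minimal sketch of the asynchronous alternative mentioned above, using Twisted's adbapi connection pool (a common pattern in Scrapy projects; the credentials and table layout are assumed to match the synchronous version):

```python
from twisted.enterprise import adbapi

class MysqlTwistedPipeline(object):
    def __init__(self):
        # A pool of MySQLdb connections; queries run on Twisted's thread pool
        self.dbpool = adbapi.ConnectionPool(
            "MySQLdb", host="127.0.0.1", user="root", passwd="root",
            db="douban", charset="utf8", use_unicode=True)

    def process_item(self, item, spider):
        # runInteraction returns a Deferred, so the crawl is never blocked
        # waiting on a MySQL round-trip
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error)
        return item

    def do_insert(self, cursor, item):
        insert_sql = """
            insert into top250(`Rank`, Title, url, Main_role, tags, Theme_sen, `comment`, score, Play_time, image_url)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        """
        cursor.execute(insert_sql, (
            item["Rank"], item["title"], item["source"], item["Main_role"],
            item["tags"], item["Theme_sentence"], item["comment"],
            item["Score"], item["Play_time"], item["image_url"]))
        # No explicit commit: runInteraction commits when this function returns

    def handle_error(self, failure):
        # Log the Twisted Failure instead of silently dropping it
        print(failure)
```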

main.py (debug entry point)

	from scrapy.cmdline import execute
	import os
	import sys
	# Take this file's absolute path, then append its directory (the project
	# root) to sys.path so the douban package can be imported
	sys.path.append(os.path.dirname(os.path.abspath(__file__)))
	# print(os.path.dirname(os.path.abspath(__file__)))
	# Equivalent to running "scrapy crawl douban2019"; see also scrapy shell
	execute(["scrapy", "crawl", "douban2019"])
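As the last comment hints, scrapy shell is the quickest way to test xpaths against the live page before putting them in the spider. For example (the expected output assumes the page structure used above):

```python
# In a terminal:
#   scrapy shell "https://movie.douban.com/top250"
# Then, at the interactive prompt:
response.xpath('//*[@id="content"]/div/div[1]/ol/li[1]/div/div[1]/em/text()').extract()
# -> ['1']  (the rank of the first movie)
```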