csdn没排版,太懒=,=,之后有时间重写并进行详细分析
ItemLoader更方便维护,代码量也相对更少。(可以看看另一篇没用ItemLoader的对比)
**这里可以看到Play_time是有问题的,后面看items data转换讲到 可以用lambda取年**![在这里插入图片描述](https://img-blog.csdnimg.cn/20200303184426170.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3FxXzQzNjgxMTE1,size_16,color_FFFFFF,t_70)
贴上源码;
import scrapy
from douban.items import doubanItemLoader, doubanSpiderItem
from scrapy.loader import ItemLoader
from scrapy.http import Request
from urllib import parse
class Douban2019Spider(scrapy.Spider):
    """Crawl the Douban Top-250 movie list and yield one item per movie."""

    name = 'douban2019'
    # allowed_domains = ['https://movie.douban.com/top250']
    start_urls = ['https://movie.douban.com/top250']

    def parse(self, response):
        # Parse the first page directly. The original code yielded a new
        # Request back to response.url, but Scrapy's dupefilter drops a
        # second request to an already-fetched URL, so page 1 was never
        # scraped. Delegating to parse_detail avoids the duplicate request.
        yield from self.parse_detail(response)
        # Follow the pagination links at the bottom of the list page.
        post_urls = response.xpath('//*[@id="content"]/div/div[1]/div[2]/a/@href').extract()
        for post_url in post_urls:
            # urljoin resolves relative hrefs such as "?start=25" against
            # the current page URL.
            yield Request(url=parse.urljoin(response.url, post_url), callback=self.parse_detail)

    def parse_detail(self, response):
        """Extract the 25 movies of one list page (li[1] .. li[25])."""
        # NOTE: the XPaths below are copied verbatim from the browser and
        # not simplified.
        for i in range(1, 26):
            # doubanItemLoader overrides the default output processor so each
            # extracted list collapses to a single string.
            item_loader = doubanItemLoader(item=doubanSpiderItem(), response=response)
            item_loader.add_xpath("image_url", '//*[@id="content"]/div/div[1]/ol/li[ %s ]/div/div[1]/a/img/@src' % i)
            item_loader.add_xpath("Rank", '//*[@id="content"]/div/div[1]/ol/li[ %s ]/div/div[1]/em/text()' % i)
            item_loader.add_xpath("title", '//*[@id="content"]/div/div[1]/ol/li[ %s ]/div/div[2]/div[1]/a/span[1]/text()' % i)
            item_loader.add_xpath('Main_role', '//*[@id="content"]/div/div[1]/ol/li[ %s ]/div/div[2]/div[2]/p[1]/text()[1]' % i)
            # Play_time and tags deliberately share one XPath: the info line
            # "1994 / USA / Crime Drama" holds both; the item processors
            # (get_time / Remove_nums) split out the relevant part.
            item_loader.add_xpath('Play_time', '//*[@id="content"]/div/div[1]/ol/li[ %s ]/div/div[2]/div[2]/p[1]/text()[2]' % i)
            item_loader.add_xpath('tags', '//*[@id="content"]/div/div[1]/ol/li[ %s ]/div/div[2]/div[2]/p[1]/text()[2]' % i)
            item_loader.add_xpath('Score', '//*[@id="content"]/div/div[1]/ol/li[ %s ]/div/div[2]/div[2]/div/span[2]/text()' % i)
            item_loader.add_xpath('comment', '//*[@id="content"]/div/div[1]/ol/li[ %s ]/div/div[2]/div[2]/div/span[4]/text()' % i)
            item_loader.add_xpath('Theme_sentence', '//*[@id="content"]/div/div[1]/ol/li[ %s ]/div/div[2]/div[2]/p[2]/span/text()' % i)
            item_loader.add_xpath('source', '//div/div[1]/ol/li[ %s ]/div/div[2]/div[1]/a/@href' % i)
            douban_item = item_loader.load_item()
            yield douban_item
### items.py
#items加载item 写的可能有点乱
import re
import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst,MapCompose
import datetime
class DoubanItem(scrapy.Item):
    # Placeholder generated by `scrapy startproject`; the real item used by
    # the spider is doubanSpiderItem.
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
class doubanItemLoader(ItemLoader):
    # Custom loader: TakeFirst() collapses each extracted list down to its
    # first element, so every field is loaded as a single string rather
    # than a list.
    default_output_processor = TakeFirst()
def RemoveFormatter(value):
    """Strip all whitespace/formatting characters (\\n, \\xa0, \\u3000, ...).

    str.split() with no argument splits on any whitespace run, so joining
    the pieces removes every such character in one pass.
    """
    return "".join(value.split())
def date_convert(value):
    """Parse a 4-digit year string into a date (Jan 1 of that year).

    The date form is convenient for storing in the database. Falls back to
    today's date when the value is not a plain year, so loading never fails.
    (If an int year is needed instead, a lambda taking .year can be chained.)
    """
    try:
        return datetime.datetime.strptime(value, "%Y").date()
    except (ValueError, TypeError):
        # Narrowed from a bare `except Exception`: only parse failures
        # (bad format / non-string input) should trigger the fallback.
        return datetime.datetime.now().date()
# NOTE: `.*?` is non-greedy, so the group captures the FIRST digit run.
def get_nums(value):
    """Extract the first run of digits from *value* (e.g. a vote count).

    Returns the input unchanged when it contains no digits.
    """
    # Raw string: "\d" in a plain literal is an invalid escape sequence
    # (DeprecationWarning, SyntaxError in newer Pythons).
    match_obj = re.match(r".*?(\d+).*", value)
    if match_obj:
        value = match_obj.group(1)
    return value
def get_time(value):
    """Extract the leading year digits from an info line like '1994/...'.

    Same implementation as get_nums (kept separate so the two fields can
    diverge later); returns the input unchanged when no digits are found.
    """
    # Raw string fixes the invalid "\d" escape in the original literal.
    match_obj = re.match(r".*?(\d+).*", value)
    if match_obj:
        value = match_obj.group(1)
    return value
def Remove_nums(value):
    """Drop the leading 'YYYY/' prefix, e.g. '1994/USA/Drama' -> 'USA/Drama'.

    Returns the input unchanged when it does not start with digits + '/'.
    """
    # Raw string fixes the invalid "\d" escape in the original literal.
    match_obj = re.match(r"\d+/(.*)", value)
    if match_obj:
        value = match_obj.group(1)
    return value
def return_value(value):
    """Identity processor: pass the extracted value through untouched.

    Useful as a placeholder in a MapCompose chain.
    """
    return value
def get_Madein(value):
    """Return the two elements at positions 89-90 of *value*, concatenated.

    NOTE(review): relies on a fixed character offset into the raw page
    string — fragile; verify against the actual page layout before reuse.
    """
    chars = list(value)
    return str(chars[89] + chars[90])
def add_Num(value):
    """Prefix a rank string with 'No.', e.g. '1' -> 'No.1'.

    Currently unused (commented out on the Rank field).
    """
    return "No." + value
class doubanSpiderItem(scrapy.Item):
    """One Top-250 movie. MapCompose lists the input-processor functions
    applied, in order, to every raw value extracted for a field; the
    loader's TakeFirst() output processor then collapses the result to a
    single value."""

    image_url = scrapy.Field()
    source = scrapy.Field()
    Rank = scrapy.Field(
        # input_processor=MapCompose(add_Num)
    )
    title = scrapy.Field()
    Main_role = scrapy.Field(
        input_processor=MapCompose(RemoveFormatter)
    )
    # Year string -> cleaned -> digits only -> datetime.date.
    Play_time = scrapy.Field(
        input_processor=MapCompose(RemoveFormatter, get_time, date_convert)
    )
    # Same raw info line as Play_time; Remove_nums strips the year prefix.
    tags = scrapy.Field(
        input_processor=MapCompose(RemoveFormatter, Remove_nums)
    )
    Score = scrapy.Field()
    comment = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    Theme_sentence = scrapy.Field()
### pipelines.py
#pipelines 用于存放数据 这里数据不多我采用的同步机制(同步比较好理解) 异步百度上也有
import MySQLdb
class DoubanPipeline(object):
    """Default no-op pipeline: forwards every item unchanged."""

    def process_item(self, item, spider):
        # Returning the item passes it on to the next pipeline stage.
        return item
class MysqlPipeline(object):
    """Store items in MySQL with a synchronous (blocking) insert per item.

    The dataset is small, so the simpler synchronous approach is used here;
    for larger crawls use Twisted's adbapi for asynchronous inserts.
    """

    def __init__(self):
        # NOTE(review): credentials are hard-coded — move them to
        # settings.py and read via crawler settings.
        self.conn = MySQLdb.connect('127.0.0.1', 'root', 'root', 'douban',
                                    charset="utf8", use_unicode=True)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # Column names are backtick-quoted: `Rank` is a reserved word in
        # MySQL 8.0+ and an unquoted use raises a syntax error; quoting the
        # rest is harmless and future-proof.
        insert_sql = """
            insert into top250(`Rank`,`Title`,`url`,`Main_role`,`tags`,`Theme_sen`,`comment`,`score`,`Play_time`,`image_url`)
            VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
        """
        try:
            self.cursor.execute(insert_sql, (
                item["Rank"], item["title"], item["source"], item["Main_role"],
                item["tags"], item["Theme_sentence"], item["comment"],
                item["Score"], item["Play_time"], item["image_url"]))
            self.conn.commit()
        except MySQLdb.MySQLError:
            # Roll back the failed insert so the connection is not left in
            # an aborted transaction; re-raise so Scrapy logs the failure.
            self.conn.rollback()
            raise
        return item

    def close_spider(self, spider):
        # Scrapy calls this when the spider finishes: release the
        # cursor/connection instead of leaking them.
        self.cursor.close()
        self.conn.close()
main.py 调试函数
from scrapy.cmdline import execute
import os
import sys
# Put the project root (this file's directory) on sys.path so the `douban`
# package resolves no matter where the script is launched from.
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
# print(os.path.dirname(os.path.abspath(__file__)))

if __name__ == "__main__":
    # Equivalent to running `scrapy crawl douban2019` from the shell
    # (try `scrapy shell` for interactive debugging). The guard prevents
    # the crawl from starting if this module is ever imported.
    execute(["scrapy", "crawl", "douban2019"])