1、items.py
class DoubanItem(scrapy.Item):
    """Container for one scraped movie entry (rank, title, poster URL).

    Each field is a plain ``scrapy.Field``; values are assigned in the
    spider's ``parse`` method and consumed by the item pipeline.
    """
    # Movie ranking number (text of the <em> inside div.pic)
    rank = scrapy.Field()
    # Movie title (text of span.title inside div.info/div.hd)
    title = scrapy.Field()
    # Poster image URL (src attribute of the <img> inside div.pic)
    picUrl = scrapy.Field()
2、doubanspider.py
# -*- coding: utf-8 -*-
import scrapy
# from 工程名称.文件名 import 类名
from douban.items import DoubanItem
import sys
# Force the default string encoding to UTF-8 so Chinese text prints and
# serialises correctly. ``reload`` and ``sys.setdefaultencoding`` exist
# only on Python 2 (site.py removes setdefaultencoding, so the module must
# be reloaded to restore it); guard with try/except so this file still
# imports cleanly on Python 3, where UTF-8 is already the default.
utf8 = "utf-8"
if sys.getdefaultencoding() != utf8:
    try:
        reload(sys)  # Python 2 only: re-expose sys.setdefaultencoding
        sys.setdefaultencoding(utf8)
    except NameError:
        # Python 3: ``reload`` is not a builtin and the default encoding
        # is already UTF-8 — nothing to do.
        pass
class DoubanspiderSpider(scrapy.Spider):
    """Spider that extracts rank, title and poster URL for every movie
    entry on the target listing page, yielding one DoubanItem apiece."""

    name = 'doubanspider'
    allowed_domains = ['https://~~~~~~~要爬取的网页网址~~~~~~~~~']
    start_urls = ['https://~~~~~~~要爬取的网页网址~~~~~~~~~']

    def parse(self, response):
        """Walk every <div class="item"> node and emit a populated item."""
        # Each movie entry is wrapped in a div.item container; the XPaths
        # below are relative, so they are evaluated against that node.
        for movie_node in response.xpath("//div[@class='item']"):
            movie = DoubanItem()
            movie['rank'] = movie_node.xpath('div[@class="pic"]/em/text()').extract()
            movie['title'] = movie_node.xpath('div[@class="info"]/div[@class="hd"]/a/span[@class="title"]/text()').extract()
            movie['picUrl'] = movie_node.xpath('div[@class="pic"]/a/img/@src').extract()
            yield movie
3、pipelines.py(数据处理)
(1)输出pipelines.py
class DoubanPipeline(object):
def process_item(self, item, spider):
print(item['rank'][