Scrapy Spider System (Part 3)

3. Saving the data

import scrapy
import re

class FengtianSpider(scrapy.Spider):
    name = 'fengtian'
    # allowed_domains = ['www.che168.com']
    start_urls = ['https://www.che168.com/china/fengtian/#pvareaid=104649']

    def parse(self, response):
        cars = response.xpath("//div[@id='goodStartSolrQuotePriceCore0']/ul/li[@name='lazyloadcpc']")
        for car in cars:
            # extract_first() returns the first match (or None if nothing matched)
            car_name = car.xpath(".//a/div[@class='cards-bottom']/h4/text()").extract_first()
            price = car.xpath(".//a/div[@class='cards-bottom']/div/span/em/text()").extract_first()
            car_price = price + '万'

            car_inf = car.xpath(".//a/div[@class='cards-bottom']/p/text()").extract_first()
            # Regexes: pull mileage, purchase date, and city out of the info
            # string, e.g. car_inf like "3.2万公里/2019-06/上海/..." (illustrative)
            car_journey = ''.join(re.findall(r'.*万公里', car_inf))
            car_buytime = ''.join(re.findall(r'\d{4}-\d{2}', car_inf))
            car_city = ''.join(re.findall(r'.*万公里/.*/(.*)/', car_inf))

            # Detail page link
            detail_path = car.xpath('./a/@href').extract_first()
            detail_url = f'https://www.che168.com{detail_path}'
            inf = {'型号': car_name, '里程数': car_journey, '所在地': car_city, '日期': car_buytime, '价格': car_price}

            # yield works like a return here; callback names the method that
            # parses the detail-page response, and meta carries the data along
            yield scrapy.Request(detail_url, callback=self.parse_detail_page, meta=inf)

        # Pagination: there is one "next" link per listing page, so handle it
        # outside the per-car loop
        next_url = response.xpath('//*[@id="listpagination"]/a[@class="page-item-next"]/@href').extract_first()
        if next_url:
            full_nextlink = 'https://www.che168.com/' + next_url
            yield scrapy.Request(full_nextlink, callback=self.parse)

    def parse_detail_page(self, response):
        seller_info = re.findall(
            '<span class="manger-name">(.*?)</span>', response.text)[0]
        seller_location = re.findall(
            '<div class="protarit-adress">(.*?)</div>', response.text)[0]
        inf = response.meta
        print(f"{inf}, seller info: {seller_info}, seller address: {seller_location}")

items.py — defines the data model for the saved items

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy

class ErshoucheItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    model = scrapy.Field()
    price = scrapy.Field()
    dist = scrapy.Field()
    city = scrapy.Field()
    date = scrapy.Field()
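A scrapy.Item behaves like a dict with a fixed set of keys, so a typo in a field name fails loudly instead of silently creating a new key, for example:

item = ErshoucheItem(model='卡罗拉', price='8.5万')
item['city'] = '北京'
print(item['model'])     # 卡罗拉
item['colour'] = 'red'   # raises KeyError: item does not support this field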

pipelines.py — processes and stores the items

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface

from itemadapter import ItemAdapter

# All operations on the scraped data happen here
class ErshouchePipeline:
    # Simplest version: pass the item through unchanged
    # def process_item(self, item, spider):
    #     return item

    # Alternative: reopen the file for every item (works, but opens and
    # closes the file once per item)
    # def process_item(self, item, spider):
    #     with open('lao.txt', 'a', encoding='utf-8') as f:
    #         line = f"型号:{item['model']},价格:{item['price']},里程:{item['dist']},日期:{item['date']},城市:{item['city']}\n"
    #         f.write(line)
    #     return item

    def process_item(self, item, spider):
        line = f"型号:{item['model']},价格:{item['price']},里程:{item['dist']},日期:{item['date']},城市:{item['city']}\n"
        self.f.write(line)
        return item

    # open_spider runs once when the spider starts: open files or
    # database connections here
    def open_spider(self, spider):
        print('Spider started crawling!')
        self.f = open('lao.txt', 'w', encoding='utf-8')

    # close_spider runs once when the spider finishes: close the file or
    # database connection here
    def close_spider(self, spider):
        print('Spider finished crawling!')
        self.f.close()
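As the template comment above says, a pipeline only runs once it is registered in settings.py. A minimal entry, again assuming the project package is named ershouche (the integer is the pipeline's priority; lower values run earlier when several pipelines are enabled):

# settings.py — enable the pipeline ('ershouche' package name is assumed)
ITEM_PIPELINES = {
    'ershouche.pipelines.ErshouchePipeline': 300,
}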

Run the spider and check the results:
scrapy crawl fengtian
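If the spider yields items (as in the parse_detail_page sketch above) rather than printing them, Scrapy's built-in feed exports can also dump the data to a file with no custom pipeline at all:

scrapy crawl fengtian -o cars.json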
