3. Saving the data
import scrapy
import re


class FengtianSpider(scrapy.Spider):
    name = 'fengtian'
    # allowed_domains = ['www.che168.com']
    start_urls = ['https://www.che168.com/china/fengtian/#pvareaid=104649']

    def parse(self, response):
        cars = response.xpath("//div[@id='goodStartSolrQuotePriceCore0']/ul/li[@name='lazyloadcpc']")
        for car in cars:
            # extract_first() takes the first match (None if nothing matches)
            car_name = car.xpath(".//a/div[@class='cards-bottom']/h4/text()").extract_first()
            price = car.xpath(".//a/div[@class='cards-bottom']/div/span/em/text()").extract_first()
            car_price = price + '万'
            car_inf = car.xpath(".//a/div[@class='cards-bottom']/p/text()").extract_first()
            # Regexes: pull the mileage, purchase date and city out of the info line
            car_journey = ''.join(re.findall(r'.*万公里', car_inf))
            car_buytime = ''.join(re.findall(r'\d{4}-\d{2}', car_inf))
            car_city = ''.join(re.findall(r'.*万公里/.*/(.*)/', car_inf))
            # Detail page
            detail_path = car.xpath('./a/@href').extract_first()
            detail_url = f'https://www.che168.com{detail_path}'
            inf = {'型号': car_name, '里程数': car_journey, '所在地': car_city,
                   '日期': car_buytime, '价格': car_price}
            # Yield inside the loop so every car's detail page is requested,
            # passing the listing data to the callback via meta
            yield scrapy.Request(detail_url, callback=self.parse_detail_page, meta=inf)

        # Pagination is page-level, so it belongs outside the per-car loop
        next_url = response.xpath('//*[@id="listpagination"]/a[@class="page-item-next"]/@href').extract_first()
        if next_url:
            full_nextlink = 'https://www.che168.com/' + next_url
            # Works like a return: callback names the method that parses the next response
            yield scrapy.Request(full_nextlink, callback=self.parse)

    def parse_detail_page(self, response):
        seller_info = re.findall(
            '<span class="manger-name">(.*?)</span>', response.text)[0]
        seller_location = re.findall(
            '<div class="protarit-adress">(.*?)</div>', response.text)[0]
        inf = response.meta
        print(f"{inf}, dealer/private-seller info: {seller_info}, dealer/private-seller address: {seller_location}")
items.py: the Item class that defines which fields get saved

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class ErshoucheItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    model = scrapy.Field()
    price = scrapy.Field()
    dist = scrapy.Field()
    city = scrapy.Field()
    date = scrapy.Field()
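With these fields defined, parse_detail_page can yield an ErshoucheItem instead of printing, so the pipeline below actually receives the data. A minimal sketch, assuming the project package is named ershouche, that maps the Chinese meta keys used in the spider onto the Item fields:

    from ershouche.items import ErshoucheItem  # package name "ershouche" is an assumption

    def parse_detail_page(self, response):
        item = ErshoucheItem()
        # Map the Chinese meta keys onto the English Item fields
        item['model'] = response.meta['型号']
        item['price'] = response.meta['价格']
        item['dist'] = response.meta['里程数']
        item['date'] = response.meta['日期']
        item['city'] = response.meta['所在地']
        yield item  # handed to ErshouchePipeline.process_item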
pipelines.py: writes each item out to a file

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter


# All per-item processing happens here
class ErshouchePipeline:
    # Simpler variant that reopens the file for every single item:
    # def process_item(self, item, spider):
    #     with open('lao.txt', 'a', encoding='utf-8') as f:
    #         line = f"model:{item['model']},price:{item['price']},mileage:{item['dist']},date:{item['date']},city:{item['city']}\n"
    #         f.write(line)
    #     return item

    def process_item(self, item, spider):
        line = f"model:{item['model']},price:{item['price']},mileage:{item['dist']},date:{item['date']},city:{item['city']}\n"
        self.f.write(line)
        return item

    # open_spider runs once at startup: open the file (or a database connection) here
    def open_spider(self, spider):
        print('Spider started crawling!')
        self.f = open('lao.txt', 'w', encoding='utf-8')

    # close_spider runs once at shutdown: close the file / database connection
    def close_spider(self, spider):
        print('Spider finished crawling!')
        self.f.close()
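The pipeline only runs once it is enabled in settings.py; a minimal sketch, again assuming the project package is named ershouche:

    # settings.py (package name "ershouche" is an assumption)
    ITEM_PIPELINES = {
        'ershouche.pipelines.ErshouchePipeline': 300,  # lower number = earlier in the chain
    }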
Run the crawl as follows:
scrapy crawl fengtian