3. Saving the data
import scrapy
import re


class FengtianSpider(scrapy.Spider):
    name = 'fengtian'
    # allowed_domains = ['www.che168.com']
    start_urls = ['https://www.che168.com/china/fengtian/#pvareaid=104649']

    def parse(self, response):
        cars = response.xpath("//div[@id='goodStartSolrQuotePriceCore0']/ul/li[@name='lazyloadcpc']")
        for car in cars:
            # extract_first() takes the first match (None if nothing matches)
            car_name = car.xpath(".//a/div[@class='cards-bottom']/h4/text()").extract_first()
            price = car.xpath(".//a/div[@class='cards-bottom']/div/span/em/text()").extract_first()
            car_price = price + '万'
            car_inf = car.xpath(".//a/div[@class='cards-bottom']/p/text()").extract_first()
            # Regexes: pull the mileage, purchase date and city out of the info line
            car_journey = ''.join(re.findall(r'.*万公里', car_inf))
            car_buytime = ''.join(re.findall(r'\d{4}-\d{2}', car_inf))
            car_city = ''.join(re.findall(r'.*万公里/.*/(.*)/', car_inf))
            # Detail page
            detail_path = car.xpath('./a/@href').extract_first()
            detail_url = f'https://www.che168.com{detail_path}'
            inf = {'型号': car_name, '里程数': car_journey, '所在地': car_city,
                   '日期': car_buytime, '价格': car_price}
            # Yield inside the loop so every car's detail page is requested,
            # passing the listing data to the callback via meta
            yield scrapy.Request(detail_url, callback=self.parse_detail_page, meta=inf)

        # Pagination is page-level, so it belongs outside the per-car loop
        next_url = response.xpath('//*[@id="listpagination"]/a[@class="page-item-next"]/@href').extract_first()
        if next_url:
            full_nextlink = 'https://www.che168.com/' + next_url
            # Works like a return: callback names the method that parses the next response
            yield scrapy.Request(full_nextlink, callback=self.parse)

    def parse_detail_page(self, response):
        seller_info = re.findall(
            '<span class="manger-name">(.*?)</span>', response.text)[0]
        seller_location = re.findall(
            '<div class="protarit-adress">(.*?)</div>', response.text)[0]
        inf = response.meta
        print(f"{inf}, dealer/private-seller info: {seller_info}, dealer/private-seller address: {seller_location}")
items.py: the Item class that defines which fields get saved

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class ErshoucheItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    model = scrapy.Field()
    price = scrapy.Field()
    dist = scrapy.Field()
    city = scrapy.Field()
    date = scrapy.Field()
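With these fields defined, parse_detail_page can yield an ErshoucheItem instead of printing, so the pipeline below actually receives the data. A minimal sketch, assuming the project package is named ershouche, that maps the Chinese meta keys used in the spider onto the Item fields:

    from ershouche.items import ErshoucheItem  # package name "ershouche" is an assumption

    def parse_detail_page(self, response):
        item = ErshoucheItem()
        # Map the Chinese meta keys onto the English Item fields
        item['model'] = response.meta['型号']
        item['price'] = response.meta['价格']
        item['dist'] = response.meta['里程数']
        item['date'] = response.meta['日期']
        item['city'] = response.meta['所在地']
        yield item  # handed to ErshouchePipeline.process_item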
pipelines.py: writes each item out to a file

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter


# All per-item processing happens here
class ErshouchePipeline:
    # Simpler variant that reopens the file for every single item:
    # def process_item(self, item, spider):
    #     with open('lao.txt', 'a', encoding='utf-8') as f:
    #         line = f"model:{item['model']},price:{item['price']},mileage:{item['dist']},date:{item['date']},city:{item['city']}\n"
    #         f.write(line)
    #     return item

    def process_item(self, item, spider):
        line = f"model:{item['model']},price:{item['price']},mileage:{item['dist']},date:{item['date']},city:{item['city']}\n"
        self.f.write(line)
        return item

    # open_spider runs once at startup: open the file (or a database connection) here
    def open_spider(self, spider):
        print('Spider started crawling!')
        self.f = open('lao.txt', 'w', encoding='utf-8')

    # close_spider runs once at shutdown: close the file / database connection
    def close_spider(self, spider):
        print('Spider finished crawling!')
        self.f.close()
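The pipeline only runs once it is enabled in settings.py; a minimal sketch, again assuming the project package is named ershouche:

    # settings.py (package name "ershouche" is an assumption)
    ITEM_PIPELINES = {
        'ershouche.pipelines.ErshouchePipeline': 300,  # lower number = earlier in the chain
    }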
Run the crawl as follows:
scrapy crawl fengtian