爬下某car之家数据，透视下各个价格区间的车系

最新推荐文章于 2024-04-14 18:54:03 发布

映之123

最新推荐文章于 2024-04-14 18:54:03 发布

阅读量480

点赞数

文章标签： python 爬虫学习方法

本文链接：https://blog.csdn.net/zhongzhongge/article/details/130234251

版权

一、代码爬取数据思路

首先，从某car之家的首页找到获取所有品牌的接口。

然后把各个品牌及其对应的url存成一个字典。

接着写一个获取车系数据的方法，循环遍历各个品牌下的车系，把所有车系的数据写入 Excel 表；爬取完所有品牌的车系后，将数据保存为本地的 Excel 文件。

# -*- coding: utf-8 -*-
# auth:映之

import requests,re
import time,datetime
import pandas as pd
from bs4 import BeautifulSoup
import csv,parsel
import random
import sys
import xlwt
import importlib
importlib.reload(sys)

# Left-menu endpoint that is requested to obtain the list of all brands.
url = 'https://car.autohome.com.cn/AsLeftMenu/As_LeftListNew.ashx?typeId=1%20&brandId=0%20&fctId=0%20&seriesId=0'
headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
}
# requests applies no timeout unless one is given; without it a stalled
# connection would block the whole script forever.
response = requests.get(url, headers=headers, timeout=10)
# The first 18 and the last 3 characters of the body are dropped before
# parsing. NOTE(review): presumably they are a JavaScript wrapper such as
# document.writeln("...") around the HTML fragment -- confirm against a
# live response.
soup = BeautifulSoup(response.text[18:-3], 'lxml')
brands = soup.find_all('a')

brand_urls = {}          # maps brand name -> absolute URL of the brand page
for brand in brands:
    brand_name = brand.text
    brand_href = 'https://car.autohome.com.cn' + brand['href']
    # Only the link text before the first '(' is used as the key.
    brand_urls[brand_name.split('(')[0]] = brand_href


def fetch_brand_cars_info(brand, url, timeout=10):            # fetch the data of every car series of one brand
    """Download a brand page and return one dict per car series found on it.

    Args:
        brand: Brand name, stored under the '品牌' key of every returned dict.
        url: Address of the brand page; also stored under the '链接' key.
        timeout: Seconds handed to ``requests.get`` so that a stalled
            connection raises instead of blocking forever.

    Returns:
        A list with one dict per ``div.list-cont`` element of the page. Every
        dict has the keys '品牌', '车系', '评分', '最低指导价', '最高指导价'
        and '链接', plus one entry per "label：value" item of the card's
        ``ul.lever-ul`` list (items whose label contains '颜色' are skipped).

    Raises:
        IndexError: If a card has no ``a.font-bold``, ``ul.lever-ul`` or
            ``span.font-arial`` element.
        requests.exceptions.RequestException: If the download fails or
            times out.
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.text, 'lxml')

    brand_cars = []
    for car in soup.select('div.list-cont'):
        name = car.select('a.font-bold')[0].text
        score_tags = car.select('span.score-number')
        car_info = {
            '品牌': brand,
            '车系': name,
            # Cards without a rating element get the placeholder '暂无'.
            '评分': score_tags[0].text if score_tags else '暂无',
        }

        spec_list = car.select('ul.lever-ul')[0]
        for li in spec_list.select('li'):
            text = li.text.replace('\xa0', '').replace('&nbsp;', '').replace(' ', '').strip()
            # Split on the first full-width colon only, so a value that itself
            # contains '：' is kept whole instead of being cut short.
            label, sep, value = text.partition('：')
            if not sep or '颜色' in label:
                continue
            car_info[label] = value

        price = car.select('span.font-arial')[0].text.split('-')
        if len(price) == 1:
            # No '-' in the text: the same text is used for both bounds.
            car_info['最低指导价'] = price[0]
            car_info['最高指导价'] = price[0]
        else:
            # A 'low-high' range: '万' is appended to the lower bound, the
            # upper bound is stored as found (presumably it already carries
            # the unit -- confirm against the page).
            car_info['最低指导价'] = price[0] + '万'
            car_info['最高指导价'] = price[1]

        car_info['链接'] = url
        brand_cars.append(car_info)

    return brand_cars

# The workbook is built in memory and written to disk once, after the crawl.
book = xlwt.Workbook(encoding='utf-8',style_compression=0)
sheet = book.add_sheet('汽车之家数据',cell_overwrite_ok=True)
col = ('品牌','车系','评分','级别','车身结构|续航里程','发动机|电动机','变速箱|充电时间','最低指导价','最高指导价','链接')
# Header row; iterating over `col` keeps it in step with the tuple's length.
for i, title in enumerate(col):
    sheet.write(0, i, title)

# Each header is either a single dict key or 'key1|key2', meaning "use key1 if
# the car has it, otherwise key2". Splitting on '|' gives, per column, the
# candidate keys in priority order.
candidate_keys = [title.split('|') for title in col]

k = 1   # next row to write; row 0 holds the header
for u, b in brand_urls.items():     # u: brand name, b: URL of the brand page
    brand_car = fetch_brand_cars_info(u,b)
    for tmp in brand_car:
        for i, keys in enumerate(candidate_keys):
            # Write the first candidate the car provides. A column with no
            # matching key (including '级别') is left blank, so one
            # incomplete card cannot abort the crawl with a KeyError before
            # anything has been saved.
            for key in keys:
                if key in tmp:
                    sheet.write(k, i, tmp[key])
                    break
        k += 1

book.save('汽车之家数据.xls')

二、对数据进行透视

爬取到的数据如下，共1493条数据：