Web scraping for vehicle data visualization

This script scrapes the vehicle sales ranking from the Dongchedi (懂车帝) website. If you have not connected a database, you can comment out spiderObj.save_to_sql(). A quick probe of the ranking endpoint is shown right below; the full script follows.
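Before running the full script, it can help to request the ranking endpoint once and confirm the JSON fields the scraper relies on. The sketch below reuses the same URL and headers as the script; the field names (brand_name, series_name, count, min_price, max_price, series_id) are simply the keys the script reads, not a documented API contract, so check the live response before relying on them.

import requests

RANK_URL = ("https://www.dongchedi.com/motor/pc/car/rank_data?aid=1839&app_name=auto_web_pc"
            "&city_name=%E8%9A%8C%E5%9F%A0&count=10&month=&new_energy_type=&rank_data_type=11"
            "&brand_id=&price=&manufacturer=&outter_detail_type=&nation=0")
HEADERS = {"User-Agent": "Mozilla/5.0"}

# Fetch the first page (offset 0) and print the fields used by the scraper below
resp = requests.get(RANK_URL, headers=HEADERS, params={"offset": 0}, timeout=10)
for car in resp.json()["data"]["list"]:
    print(car.get("rank"), car.get("brand_name"), car.get("series_name"),
          car.get("count"), car.get("min_price"), car.get("max_price"), car.get("series_id"))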

import requests
from lxml import etree
import csv
import os
import time
import json
import pandas as pd
import re
import django

# Point Django at the project settings so the ORM model can be imported outside manage.py
os.environ.setdefault('DJANGO_SETTINGS_MODULE', '可视化.settings')
django.setup()
from myApp.models import CarInformation


class spider(object):

    def __init__(self):
        self.spiderUrl = ('https://www.dongchedi.com/motor/pc/car/rank_data?aid=1839&app_name=auto_web_pc&city_name'
                          '=%E8%9A%8C%E5%9F%A0&count=10&month=&new_energy_type=&rank_data_type=11&brand_id'
                          '=&price=&manufacturer=&outter_detail_type=&nation=0')
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, '
                          'like Gecko) Chrome/120.0.0.0 Mobile Safari/537.36 Edg/120.0.0.0'
        }

    def init(self):
        # Create the CSV buffer with a header row on the first run
        if not os.path.exists('./temp.csv'):
            with open('./temp.csv', 'a', newline='', encoding='utf-8') as wf:
                write = csv.writer(wf)
                write.writerow(["brand", "carName", "carImg", "saleVolume", "price", "manufacturer", "rank", "carModel",
                                "energyType", "marketTime", "insure"])
        # Seed the page-offset file so get_page() has something to read on the first run
        if not os.path.exists('./spiderPage.txt'):
            with open('./spiderPage.txt', 'w') as wf:
                wf.write('0')

    def get_page(self):
        # Read the last recorded offset from spiderPage.txt
        with open('./spiderPage.txt', 'r') as r_f:
            return r_f.readlines()[-1].strip()

    def set_page(self, newPage):
        # Append the next offset so an interrupted crawl can resume where it left off
        with open('./spiderPage.txt', 'a') as a_f:
            a_f.write('\n' + str(newPage))

    def main(self):
        count = self.get_page()
        params = {
            'offset': int(count)
        }
        print("Starting crawl from record {}".format(int(count) + 1))
        pageJson = requests.get(self.spiderUrl, headers=self.headers, params=params).json()
        pageJson = pageJson["data"]["list"]
        try:
            for index, car in enumerate(pageJson):
                carData = []
                print("Scraping record %d on this page" % (index + 1))
                # brand name
                carData.append(car["brand_name"])
                # car (series) name
                carData.append(car["series_name"])
                # image URL
                carData.append(car["image"])
                # sales volume
                carData.append(car["count"])
                # price range [min, max]
                price = []
                price.append(car["min_price"])
                price.append(car["max_price"])
                carData.append(price)
                # manufacturer
                carData.append(car["sub_brand_name"])
                # rank
                carData.append(car["rank"])
                # series id, used to fetch the detail (spec) page
                carNumber = car["series_id"]

                infoHTML = requests.get("https://www.dongchedi.com/auto/params-carIds-x-%s" % carNumber,
                                        headers=self.headers)
                infoHTMLpath = etree.HTML(infoHTML.text)
                # carModel
                carModel = infoHTMLpath.xpath("//div[@data-row-anchor='jb']/div[2]/div/text()")[0]
                carData.append(carModel)
                # energyType
                energyType = infoHTMLpath.xpath("//div[@data-row-anchor='fuel_form']/div[2]/div/text()")[0]
                carData.append(energyType)
                # marketTime
                marketTime = infoHTMLpath.xpath("//div[@data-row-anchor='market_time']/div[2]/div/text()")[0]
                carData.append(marketTime)
                # insure
                insure = infoHTMLpath.xpath("//div[@data-row-anchor='period']/div[2]/div/text()")[0]
                carData.append(insure)
                print(carData)
                self.save_to_csv(carData)
        except Exception as e:
            # If a detail page cannot be parsed, log the error and skip the rest of this page
            print("Parse error, skipping the rest of this page:", e)
        # Record the next offset (10 records per page) and recurse to fetch the next page
        self.set_page(int(count) + 10)
        self.main()

    def save_to_csv(self, resultData):
        # Append one scraped row to the CSV buffer file
        with open('temp.csv', 'a', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(resultData)

    def clear_csv(self):
        # Load the CSV buffer, drop empty and duplicate rows, and return the cleaned rows
        df = pd.read_csv('temp.csv')
        df.dropna(inplace=True)
        df.drop_duplicates(inplace=True)
        print("Total records: %d" % df.shape[0])
        return df.values

    def save_to_sql(self):
        # Write the cleaned rows into the Django model (requires a configured database)
        data = self.clear_csv()
        for car in data:
            CarInformation.objects.create(
                brand=car[0],
                carName=car[1],
                carImg=car[2],
                saleVolume=car[3],
                price=car[4],
                manufacturer=car[5],
                rank=car[6],
                carModel=car[7],
                energyType=car[8],
                marketTime=car[9],
                insure=car[10]
            )


if __name__ == '__main__':
    spiderObj = spider()
    spiderObj.init()
    spiderObj.main()
    # spiderObj.save_to_sql()

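For reference, here is a minimal sketch of what the CarInformation model in myApp/models.py might look like, inferred only from the fields the scraper writes in save_to_sql(); the actual model in the project may use different field types or lengths (note that price is appended as a [min, max] list, so a text field stores its string representation).

from django.db import models


class CarInformation(models.Model):
    # Field names mirror the columns written by the scraper; the types are assumptions
    brand = models.CharField(max_length=255)
    carName = models.CharField(max_length=255)
    carImg = models.CharField(max_length=255)
    saleVolume = models.CharField(max_length=255)
    price = models.CharField(max_length=255)
    manufacturer = models.CharField(max_length=255)
    rank = models.CharField(max_length=255)
    carModel = models.CharField(max_length=255)
    energyType = models.CharField(max_length=255)
    marketTime = models.CharField(max_length=255)
    insure = models.CharField(max_length=255)

After defining the model, run python manage.py makemigrations and python manage.py migrate before enabling spiderObj.save_to_sql().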
