Crawler: Scraping Vehicle Information from the 人人车 (Renrenche) Website

import base64
import time

import pymysql
from fontTools.ttLib import TTFont
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.options import Options
import re

car_id_list = []


# Font parsing: recover the digit mapping from the embedded web font
def font_analysis(driver):
    style_text = driver.find_element(By.XPATH, "//style[1]").get_attribute("innerHTML")

    # Extract the string that follows 'base64,' in the inline @font-face rule
    base64_str = re.match("(.*?)base64,(.*?)'(.*?)", style_text).group(2)
    # Convert it back into the raw font bytes:
    # encode(): encodes the string with the given encoding, 'utf-8' by default
    # base64.decodebytes(s): decodes a base64-encoded bytes object and returns the raw binary data
    base64_bytes = base64.decodebytes(base64_str.encode())

    with open("./rrCar.ttf", 'wb') as fp:
        fp.write(base64_bytes)

    # Read the font file with TTFont()
    ttf = TTFont("./rrCar.ttf")
    # Glyph names in order; the index of each glyph corresponds to the real digit ('.notdef' at index 0 is skipped)
    li = ttf.getGlyphOrder()[1:]
    # Mapping from character code to glyph name, returned as a dict
    dic = ttf.getBestCmap()
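    # Illustration (hypothetical values): if the cmap contains {0xE7A5: 'uni4E03'}
    # and 'uni4E03' sits at index 7 of the sliced glyph order above, then the
    # on-page character chr(0xE7A5) actually represents the digit 7.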

    new_dict = {}
    for k, v in dic.items():
        new_dict[chr(k)] = li.index(v)
    return new_dict


# Scrape the listing data on the current page
def fetch_data(driver, font_dict):
    li_list = driver.find_elements(By.XPATH, "//ul[@class='infos infos-card h-clearfix']/li")

    # Holds tuples of (car_id, car_brand, car_title, car_year, car_mile, new_car_price)
    car_data = []

    for li in li_list:
        try:
            car_id = li.get_attribute("data-entid")
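            # data-entid is the listing's unique id; it is used below to skip
            # cars that were already collected in an earlier iteration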
            # These XPaths located the wrong elements and were replaced by the ones below
            # car_brand = li.find_element(By.XPATH, "div/a/div[2]/h2/span/font").text.strip()
            # car_title = li.find_element(By.XPATH, "div/a/div[2]/h2/span").text.strip()
            # car_year_mile = li.find_element(By.XPATH, "div/a/div[2]/h2/div[2]").text.strip()
            # car_price = li.find_element(By.XPATH, "div/a/div[3]/b").text.strip()


            car_brand = li.find_element(By.XPATH,
                                        "div[@class='info--wrap']//span[@class='info_link']/font").text.strip()
            car_title = li.find_element(By.XPATH, "div[@class='info--wrap']//span[@class='info_link']").text.strip()
            car_year_mile = li.find_element(By.XPATH,
                                            "div[@class='info--wrap']//div[@class='info_params']").text.strip()
            car_price = li.find_element(By.XPATH,
                                        "div[@class='info--wrap']//b[@class='info_price fontSecret']").text.strip()

            car_year = re.match("(.*?)年", car_year_mile).group(1)
            car_mile = re.match("(.*?)·(.*?)万", car_year_mile).group(2)

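            # The displayed price is obfuscated by the custom font, so map every
            # character back to its real digit with font_dict and keep the
            # decimal point as-is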
            new_car_price = ''
            for i in car_price:
                if i == '.':
                    new_car_price += '.'
                else:
                    new_car_price += str(font_dict.get(i))

            print(car_id, "#", car_brand, "#", car_title, "#", car_year, "#", car_mile, "#", car_price, "#",
                  new_car_price)

            if car_id not in car_id_list:
                car_id_list.append(car_id)
                car_data.append((car_id, car_brand, car_title, car_year, car_mile, new_car_price))
        except Exception as e:
            print(e)
            continue
    return car_data


# Get the pinyin slugs of all city names
def fetch_cities(driver):
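    # The city selector panel is hidden by default, so make it visible with a
    # bit of JavaScript before reading the city links out of it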
    js = """document.getElementsByClassName('citySelectWrap')[0].style.display='block';"""
    driver.execute_script(js)

    time.sleep(5)

    city_list = []
    a_list = driver.find_elements(By.XPATH, "//div[@class='citySelectWrap']//a[@class='city-item']")
    for a in a_list:
        city_list.append(a.get_attribute("listname"))

    print(city_list)
    return city_list


# Write to the database
# Table creation statement
"""
create table project.rrCar(
    car_id bigint PRIMARY KEY
    , car_brand varchar(255)
    , car_title varchar(255)
    , car_year int
    , car_mile float
    , car_price float
    , create_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP
    , update_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP
);
"""


# Insert rows into the database
def mysql_insert(conn, cursor, car_data):
    try:
        insert_sql = "insert int project.rrCar(car_id, car_brand,car_title, car_year, car_mile, new_car_price) values(%s,%s,%s,%s,%s,%s)"
        cursor.execute(insert_sql, car_data)
    except Exception as e:
        print(e)
        print("failure")
    else:
        conn.commit()
        print("success")


if __name__ == '__main__':
    # Configure headless mode
    opt = Options()
    opt.add_argument("--headless")
    driver = webdriver.Edge(options=opt)

    # Connect to the database (charset utf8mb4 so Chinese brand/title text is stored correctly)
    conn = pymysql.connect(host='master', port=3306, user='root', password='123456', charset='utf8mb4')
    # Create a cursor
    cursor = conn.cursor()

    # Get all cities
    # This URL is only used to pull the list of cities
    url = 'https://www.renrenche.com/hf/ershouche'
    driver.get(url)
    time.sleep(5)

    city_list = fetch_cities(driver)

    for city in city_list:
        print(city)

        url = f'https://www.renrenche.com/{city}/ershouche'
        driver.get(url)
        # Give the page time to load before parsing the font and the listings
        time.sleep(5)
        # Build the character-to-digit map from this page's embedded font
        font_dict = font_analysis(driver)
        # Pass the driver in and scrape the listing details inside the function
        car_data = fetch_data(driver, font_dict)
        # Write the data to MySQL
        mysql_insert(conn, cursor, car_data)

        print(city)

    driver.close()
    cursor.close()
    conn.close()
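
A quick way to sanity-check the font mapping without launching the browser is to open the saved rrCar.ttf with fontTools directly. The snippet below is a minimal sketch that assumes the script above has already written ./rrCar.ttf to the working directory; it rebuilds the same character-to-digit dictionary and prints it so you can compare it against a price shown on the page.

from fontTools.ttLib import TTFont

ttf = TTFont("./rrCar.ttf")
glyph_order = ttf.getGlyphOrder()[1:]   # skip the '.notdef' placeholder glyph
cmap = ttf.getBestCmap()                # character code -> glyph name

# Same mapping rule as font_analysis(): position in the glyph order is the real digit
mapping = {chr(code): glyph_order.index(name) for code, name in cmap.items()}
for char, digit in mapping.items():
    print(repr(char), "->", digit)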
