import random
import time
import urllib.parse
import json
import requests
import pymysql
# Open the MySQL connection and the single cursor shared by the rest of the script.
conn = pymysql.connect(
    host='localhost',
    user='root',
    password='admin123456',
    database='demo',
    port=3306,
)
cur = conn.cursor()

# DDL executed at startup, in order: the listings table first, then the
# table that holds the request payloads recorded by the crawl.
_SCHEMA_STATEMENTS = (
    '''
CREATE TABLE if not exists dcd_app_car (
car_type VARCHAR(200) COMMENT '车辆类型',
method VARCHAR(255) COMMENT '汽车认证情况',
brand_name VARCHAR(255) COMMENT '品牌名称',
city VARCHAR(255) COMMENT '城市',
car_name VARCHAR(255) COMMENT '车名',
car_source_type VARCHAR(255) COMMENT '车源类型',
car_year INT COMMENT '车辆年份',
image VARCHAR(255) COMMENT '图片',
series_name VARCHAR(255) COMMENT '系列名称',
xq_url VARCHAR(2000) COMMENT '详情URL',
title VARCHAR(255) COMMENT '标题'
) COMMENT='汽车采集表';
''',
    '''
CREATE TABLE if not exists url_list (
url VARCHAR(2000) COMMENT '链接'
) COMMENT='异常存储';
''',
)
for _statement in _SCHEMA_STATEMENTS:
    cur.execute(_statement)

# Commit the setup transaction.
conn.commit()
# Endpoint that every page request is POSTed to.
url = "https://www.dongchedi.com/motor/pc/sh/sh_sku_list?aid=1839&app_name=auto_web_pc"
# Headers sent with every POST. They look like a capture of a desktop browser
# session (Edge 115 on Windows) — TODO confirm they are still accepted by the site.
# NOTE(review): the 'cookie' value below embeds session identifiers and account
# data directly in source; consider loading it from configuration instead.
headers = {
'authority': 'www.dongchedi.com',
'accept': '*/*',
'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
'cache-control': 'no-cache',
# The request body is a pre-encoded form string, hence this content type.
'content-type': 'application/x-www-form-urlencoded',
'cookie': 'ttwid=1%7CigpRreng75gaT_9jJaSl6SSnSRrrkn59yhYKH7Uveo8%7C1710601221%7C7d8328e49d1f59f86904a6c0800c26703783929bcb7d1168d388bda16a06c679; tt_webid=7346976246184887843; tt_web_version=new; is_dev=false; is_boe=false; city_name=%E5%8D%97%E5%AE%81; s_v_web_id=verify_lvm7drez_t25CNR2h_C4zk_4KHG_8Zof_uEz8cxth7x2n; passport_csrf_token=ffa52e4284abaad76a11a4c7860678ef; passport_csrf_token_default=ffa52e4284abaad76a11a4c7860678ef; passport_auth_status=31234545e0b825eea6782fe98f2dd2de%2C; passport_auth_status_ss=31234545e0b825eea6782fe98f2dd2de%2C; sid_guard=abb88cfb6a435a22cb0e31fabd21a612%7C1714470379%7C5184001%7CSat%2C+29-Jun-2024+09%3A46%3A20+GMT; uid_tt=c9fec34f74d1cda6b233be0e4984131e; uid_tt_ss=c9fec34f74d1cda6b233be0e4984131e; sid_tt=abb88cfb6a435a22cb0e31fabd21a612; sessionid=abb88cfb6a435a22cb0e31fabd21a612; sessionid_ss=abb88cfb6a435a22cb0e31fabd21a612; sid_ucp_v1=1.0.0-KGM3NmJkMjM5NjhjMmRkNmU0NDM4ZGQwMjgwMmQzNGI1NDA2MWY5YzEKGQjg7cDesMy1ARDr-8KxBhivDiAMOAJA7AcaAmxmIiBhYmI4OGNmYjZhNDM1YTIyY2IwZTMxZmFiZDIxYTYxMg; ssid_ucp_v1=1.0.0-KGM3NmJkMjM5NjhjMmRkNmU0NDM4ZGQwMjgwMmQzNGI1NDA2MWY5YzEKGQjg7cDesMy1ARDr-8KxBhivDiAMOAJA7AcaAmxmIiBhYmI4OGNmYjZhNDM1YTIyY2IwZTMxZmFiZDIxYTYxMg; user_data=%7B%22gender%22%3A0%2C%22name%22%3A%22%E7%94%A8%E6%88%B73626937244617%22%2C%22screen_name%22%3A%22%E7%94%A8%E6%88%B73626937244617%22%2C%22user_id%22%3A798670841722592%2C%22avatar_url%22%3A%22https%3A%2F%2Fp9-passport.byteacctimg.com%2Fimg%2Fmosaic-legacy%2F3792%2F5112637127~120x256.image%22%2C%22mobile%22%3A%22191******72%22%7D; rit_city=%E5%8D%97%E5%AE%81; odin_tt=629b1a06f142a8173c4ad539b3ef688bbc5fa50b74535c49da2965a60d087dd2fa69818385ad84b3c001143246054189; odin_tt=0872cd22bc4d18af12b62a082cd6f6da2962885515a9df920a40f475680e49b8faef2a674d45121e945a7a09d9fafbc2',
'origin': 'https://www.dongchedi.com',
'pragma': 'no-cache',
'referer': 'https://www.dongchedi.com/usedcar/x-x-x-x-x-x-x-x-x-x-x-x-x-x-x-x-x-x-x',
'sec-ch-ua': '"Not/A)Brand";v="99", "Microsoft Edge";v="115", "Chromium";v="115"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'sec-fetch-dest': 'empty',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'same-origin',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.203',
# Sent with an empty value.
'x-forwarded-for': ''
}
# Prompt for the city to crawl and percent-encode it for the form body.
name = urllib.parse.quote(input('请输入城市:'))

# Fuel category label -> value placed in the request's fuel_form field.
lx_dict = {
    '汽油': 1,
    '柴油': 2,
    "油电混合": 3,
    "新能源": "4,5,6",
}

# Load the request payloads recorded in url_list by earlier runs and keep
# them in a set so the crawl loop can test membership cheaply.
cur.execute("SELECT url FROM url_list")
fetched_urls = cur.fetchall()
visited_urls = {row[0] for row in fetched_urls}
# SQL that stores one listing; the column order must match _car_row().
_INSERT_CAR_SQL = (
    "INSERT INTO dcd_app_car (car_type,method, brand_name, city, car_name, car_source_type, car_year, image, series_name, xq_url, title) VALUES (%s,%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
)


def _car_row(car_type, item):
    """Return the dcd_app_car row tuple for one listing dict from the API.

    Args:
        car_type: Fuel category label (a key of ``lx_dict``) stored as car_type.
        item: One element of the response's ``search_sh_sku_info_list``.

    Raises:
        KeyError: If the listing lacks one of the fields read below.
    """
    return (
        car_type,
        item['authentication_method'],
        item['brand_name'],
        item['brand_source_city_name'],
        item['car_name'],
        item['car_source_type'],
        item['car_year'],
        item['image'],
        item['series_name'],
        # The detail-page URL is derived from the listing's sku_id.
        "https://www.dongchedi.com/usedcar/" + str(item['sku_id']),
        item['title'],
    )


# Crawl every fuel category page by page. A page is recorded in url_list only
# after its rows are stored, so pages that fail are retried on the next run.
try:
    for k, v in lx_dict.items():
        for page in range(1, 350):
            payload = f'fuel_form={v}&sh_city_name={name}&page={page}&limit=20'
            if payload in visited_urls:
                # Already stored by an earlier run.
                continue

            try:
                response = requests.request(
                    "POST", url, headers=headers, data=payload, timeout=30
                )
                response.raise_for_status()
            except requests.RequestException as e:
                # The original handler read response.text here, which is
                # unbound (or stale) when the request itself failed.
                print(e, payload)
                response = None

            # Throttle after every attempt, successful or not.
            time.sleep(random.uniform(3, 5))
            if response is None:
                continue

            try:
                page_data = json.loads(response.text)['data']
                has_more = page_data['has_more']
                rows = [
                    _car_row(k, item)
                    for item in page_data['search_sh_sku_info_list'] or []
                ]
            except (ValueError, KeyError, TypeError) as e:
                # Non-JSON body or unexpected shape: skip the page without
                # recording it so a later run can try it again.
                print(e, payload)
                continue

            # Store the rows and the page marker in one transaction, so a
            # crash mid-page neither loses nor duplicates listings on rerun.
            if rows:
                cur.executemany(_INSERT_CAR_SQL, rows)
            cur.execute("INSERT INTO url_list (url) VALUES (%s)", (payload,))
            conn.commit()
            print(payload)  # e.g. fuel_form=1&sh_city_name=%E5%8C%97%E4%BA%AC&page=65&limit=20

            # Any rows on the final page were stored above before stopping.
            if not has_more:
                print("结束咯,往下面也没有数据啦")
                break
finally:
    # Release the cursor and connection even if the crawl aborts.
    cur.close()
    conn.close()