import re,requests,json
from lxml import etree
import pymysql
class MySql:
    """Base class that owns a PyMySQL connection and cursor.

    Connects immediately on construction; subclasses use ``self.conn`` /
    ``self.cur`` to run queries.  The connection parameters below are
    placeholders (``xxxx``) and must be configured before running.
    """

    def __init__(self):
        # Simple counter available to subclasses (e.g. for row numbering).
        self.count = 1
        self.conn_mysql()

    def conn_mysql(self):
        """Open the database connection and create a cursor on it."""
        # NOTE(review): host/password/database are placeholders — supply real
        # values (ideally from environment variables, not source) before use.
        self.conn = pymysql.connect(host='xxxxx', user='root', password='xxxx',
                                    charset='utf8', database='xxxx')
        self.cur = self.conn.cursor()

    def close(self):
        """Release the cursor and connection.

        The original class never closed either, leaking the DB connection;
        call this when the spider finishes.
        """
        self.cur.close()
        self.conn.close()
class YouXin(MySql):
# def __init__(self):
# self.spider_name = '优信二手车'
# self.count = 1
def __call__(self, *args, **kwargs):
    """Crawl entry point: visit every (city, brand) listing page.

    Fetches the city list and brand list once, then walks the cross
    product, collecting the car-series links on each listing page.
    """
    base_url = 'https://www.xin.com/beijing/?'
    self.headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36",
    }
    html, html_xml = self.get_html(base_url, self.headers)

    # Each entry pairs a city name with the city's base URL.
    self.city_constitute_list = self.get_city_list()
    # Each entry pairs a brand name with its URL slug.
    brand_total_list = self.get_brand_list()

    # Build each city+brand listing URL and pull the car series from it.
    for city_entry in self.city_constitute_list:
        city_name, city_url = city_entry[0], city_entry[1]
        for brand_entry in brand_total_list:
            listing_url = city_url + brand_entry[1] + '/'
            car_series_list = self.get_car_series(
                listing_url, city_name, brand_entry[0]
            )
def get_html(self, url, headers):
    """Fetch *url* and return ``(raw_html_text, parsed_lxml_tree)``.

    Bug fix: the original accepted *headers* but ignored it, always sending
    ``self.headers``.  The parameter is now honoured — behaviour is unchanged
    for existing call sites, which all pass ``self.headers`` anyway.

    :param url: absolute URL to fetch.
    :param headers: dict of HTTP request headers to send.
    :returns: tuple of the decoded response body and its lxml HTML tree.
    """
    # Timeout guards against one hung connection stalling the whole crawl.
    html = requests.get(url, headers=headers, timeout=30).text
    html_xml = etree.HTML(html)
    return html, html_xml
#获取所有城市名和对应url列表
def get_city_list(self):
base_url = 'https://www.xin.com/apis/Ajax_common/get_home_city/'
city_html,city_html_xml = self.get_html(base_url,self.headers)
# print(city_html)
#将城市接口页面获取的字典转换为json格式
city_json = json.loads(city_html)
# print(city_json)
#获取全部城市cityid
cityid_list = city_json.get('data').get('city_all').keys()
# print(cityid_list)
# print(len(cityid_list))#286
#由cityid循环获取每个城市cityname,ename(ename用于拼接城市url)
self.city_constitute_list = []
for cityid in cityid_list:
cityname = city_json.get('data').get('city_all').g
优信二手车爬虫
最新推荐文章于 2023-06-16 19:24:08 发布
本文介绍如何使用网络爬虫技术抓取优信二手车平台上的数据,包括车辆信息、价格、里程等关键指标,通过实例解析爬虫实现步骤,并探讨数据清洗与分析的应用。
摘要由CSDN通过智能技术生成