# Step 1: fetch the URL of each car model's report page (首先获取各车型的网址)
from urllib import request
from bs4 import BeautifulSoup as bs
import re
import pandas as pd
# Step 1 of the scraper: download the car-series list page, pull out every
# model name together with the URL of its report page, and save the result.
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
url = "https://www.xiaoxiongyouhao.com/chxi_report_list.php"


def parse_car_links(anchors_text, site_root="https://www.xiaoxiongyouhao.com"):
    """Extract car names and absolute report URLs from stringified anchors.

    Args:
        anchors_text: ``str()`` of the ``find_all("a", target="_blank")``
            result for the list page.
        site_root: Prefix put in front of every extracted path.

    Returns:
        A ``(car_list, address_list)`` tuple of lists of strings.
    """
    link_pattern = re.compile(r'/.+?html')  # "?" makes the match non-greedy
    address_list = []
    for raw_match in link_pattern.findall(anchors_text):
        # A match normally starts at the "/" of the previous closing tag,
        # i.e. it looks like '/a>, <a href="/path.html' (presumably because
        # the anchors are rendered as a comma-separated list). The original
        # code removed that prefix with a fixed [14:] slice, which truncated
        # any match that starts directly at the href (e.g. the first link).
        # Keeping only the text after the last double quote handles both.
        path = raw_match.rpartition('"')[2]
        address_list.append(site_root + path)

    name_pattern = re.compile(r'>.+?</a>')
    name_matches = name_pattern.findall(anchors_text)
    # Strip the leading ">" and the trailing "</a>". The last match is
    # skipped, as in the original code.
    # NOTE(review): presumably the last anchor is not a car model - confirm
    # against the live page.
    car_list = [match[1:-4] for match in name_matches[:-1]]
    return car_list, address_list


def fetch_car_list():
    """Download the list page and return a carType/address DataFrame.

    Prints the HTTP status of the response. Errors raised by ``urlopen``
    or by decoding the body are not caught here.
    """
    req = request.Request(url=url, headers=headers)
    # The context manager closes the HTTP response even if decoding fails.
    with request.urlopen(req) as res:
        print(res.status)
        html_data = res.read().decode("utf-8")
    soup = bs(html_data, "html.parser")
    anchors = soup.find_all("a", target="_blank")
    car_list, address_list = parse_car_links(str(anchors))
    # pandas raises ValueError here if the two lists differ in length.
    return pd.DataFrame({"carType": car_list,
                         "address": address_list})


if __name__ == "__main__":
    df = fetch_car_list()
    print(df)
    # NOTE(review): pandas 2.0 removed the `encoding` argument of to_excel
    # and the xlwt-based ".xls" writer. The call is kept unchanged so the
    # output file stays the same - confirm the pandas version in use.
    df.to_excel("carList.xls", index=False, encoding="utf-8")
    df.to_csv("carList.txt", index=False)
# Step 2: scrape the data from each car model's page (提取各车型网址数据)
from urllib import request
from bs4 import BeautifulSoup as bs
import re
import pandas as pd
# Step 2 of the scraper: visit every report page listed in carList.txt,
# read its specification table and save all rows into one table.

# Output columns, in order. Every table row on a page is read as seven
# consecutive <td> cells that map onto the columns after "car_type".
# NOTE(review): "diaplacement" looks like a typo of "displacement"; it is
# kept because it is the column name written to main.xls / main.txt.
SPEC_COLUMNS = ("car_type",
                "diaplacement",
                "gearbox",
                "engine",
                "engine_type",
                "fuel_consumption",
                "fuel_consumption_range",
                "owner_quantity")


def parse_spec_rows(table_html, car):
    """Turn the stringified ``<tbody>`` of one page into a DataFrame.

    Args:
        table_html: ``str()`` of the page's first ``<tbody>`` element
            (the string ``"None"`` when the page has no table).
        car: Car model name stored in the ``car_type`` column of every row.

    Returns:
        A DataFrame with the ``SPEC_COLUMNS`` columns and one row per
        complete group of seven ``<td>`` cells; trailing cells that do not
        fill a whole group are ignored.
    """
    cell_pattern = re.compile(r'<td>.*?</td>')  # "?" makes the match non-greedy
    # Drop the surrounding "<td>" (4 chars) and "</td>" (5 chars).
    cells = [cell[4:-5] for cell in cell_pattern.findall(table_html)]
    row_count = len(cells) // 7
    data = {"car_type": [car] * row_count}
    for offset, column in enumerate(SPEC_COLUMNS[1:]):
        # Every 7th cell starting at `offset` belongs to this column.
        data[column] = cells[offset:row_count * 7:7]
    return pd.DataFrame(data)


def fetch_car_specs(address_df):
    """Download every page in *address_df* and collect its table rows.

    Args:
        address_df: DataFrame with ``carType`` and ``address`` columns.

    Returns:
        One DataFrame holding the rows of all pages that could be read.
        A page that fails for any reason is reported on stdout and skipped.
    """
    request_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
    # Seed with an empty frame so the result has the expected columns even
    # when no page yields any row.
    frames = [pd.DataFrame({column: [] for column in SPEC_COLUMNS})]
    total = len(address_df)
    pages = zip(address_df["carType"], address_df["address"])
    for index, (car, page_url) in enumerate(pages):
        try:
            req = request.Request(url=page_url, headers=request_headers)
            # The context manager closes the HTTP response in every case.
            with request.urlopen(req) as res:
                print(index, "/", total, ":", res.status)
                html_data = res.read().decode("utf-8")
            soup = bs(html_data, "html.parser")
            frames.append(parse_spec_rows(str(soup.find("tbody")), car))
        except Exception as exc:
            # Best effort: keep going, but say which page was skipped and
            # why. Unlike a bare `except`, this lets Ctrl-C stop the run.
            print(index, "/", total, ": skipped", page_url, "-", repr(exc))
            continue
    # A single concat at the end avoids re-copying the table per page.
    return pd.concat(frames, axis=0, ignore_index=True)


if __name__ == "__main__":
    address_df = pd.read_csv("carList.txt")
    df = fetch_car_specs(address_df)
    # NOTE(review): pandas 2.0 removed the `encoding` argument of to_excel
    # and the xlwt-based ".xls" writer. The call is kept unchanged so the
    # output file stays the same - confirm the pandas version in use.
    df.to_excel("main.xls", index=False, encoding="utf-8")
    df.to_csv("main.txt", index=False)