1.题干
不同数据科学领域的工作薪酬数据集包含11列,分别为work_year:支付工资的年份 experience_level:一年中工作的经验水平 employment_type:雇用类型 job_title:当年担任的职位 salary:支付的工资总额 salary_currency:以ISO 4217货币代码支付的工资的货币 salary_in_usd:以美元为单位的工资 employee_residence:根据ISO 3166国家代码,员工在工作年度的主要居住国 remote_ratio:远程完成的总工作量比例 company_location:雇主主要办事处或合同分公司所在的国家 company_size:一年中为公司工作的人数中位数。
任务要求:导入数据,查看数据有无缺失值、重复值,每列数据有多少个不同的取值,输出数据的基本统计量,删除重复值,绘制条形图,展示不同年份数据频数,不同经验水平的条形图,绘制“remote_ratio”列不同工作方式的饼形图,以及工资分布直方图。使用label encode将类型变量转换为数值型,工资数据标准化,分割数据为训练集和测试集,建立线性回归,决策树回归,随机森林回归,梯度提升树回归模型,并输出评价指标MSE,RMSE,R2_SCORE。利用IQR准则去掉异常值,将工资按照四分位数分割为:低工资等级,中等,较高等级,高等级,建立LogisticRegression,RandomForestClassifier,GradientBoostingClassifier,KNeighborsClassifier,DecisionTreeClassifier模型,并输出各模型的预测准确率。
2.数据格式
3.代码
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import (RandomForestRegressor, RandomForestClassifier,
GradientBoostingClassifier, GradientBoostingRegressor)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_squared_error, r2_score
def printf(n, strf):
    """Print a section header to stdout.

    The header is: an empty line, a rule of ``n`` dashes, the title in bold,
    and a trailing empty line.

    Args:
        n: Number of '-' characters in the horizontal rule.
        strf: Title text; it is wrapped in ANSI escape codes so terminals
            that support them render it in bold.
    """
    print()
    print('-' * n)
    # \033[1m turns bold on, \033[0m resets the terminal attributes.
    print(f"\033[1m{strf}\033[0m")
    print()
# Load the salary dataset; the path is relative to the working directory.
df = pd.read_csv('dataset/ds_salaries.csv')

printf(100, '查看数据基本信息')
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line.
df.info()

printf(100, '查看是否有缺失值')
print(df.isnull().sum())

printf(100, '查看重复行与计数')
# Compute the duplicate mask once and reuse it for both the rows and the count.
duplicate_mask = df.duplicated()
print(df[duplicate_mask])
print('重复行数量为:', duplicate_mask.sum())

printf(100, '查看每列数据有多少个不同的取值')
print(df.nunique())

printf(100, '输出基本统计量')
print(df.describe())

printf(100, '查看删除重复值后的重复行与计数')
df.drop_duplicates(inplace=True)
# Re-check after dropping to confirm that no duplicated rows remain.
remaining_mask = df.duplicated()
print(df[remaining_mask])
print('重复行数量为:', remaining_mask.sum())
def plot_bar(df, strf):
    """Show a bar chart of how often each distinct value occurs.

    Args:
        df: The values to count; the callers in this script pass a single
            DataFrame column (a pandas Series).
        strf: Human-readable name of the column, used for the x-axis label
            and as the prefix of the chart title.
    """
    # Count once and reuse: bar positions, bar heights and tick positions
    # all come from the same value_counts() result.
    counts = df.value_counts()

    plt.figure(figsize=(10, 6))
    plt.bar(counts.index, counts)
    plt.xlabel(strf)
    plt.ylabel('Frequency')
    # Put a tick on every bar so only the observed values are labelled.
    plt.xticks(counts.index)
    plt.title(strf + ' Frequency')
    plt.show()
# Frequency bar charts for the work year and the experience level.
plot_bar(df['work_year'], 'Work Year')
plot_bar(df['experience_level'], 'Experience Level')

# Pie chart of the share of each remote_ratio value.
# The labels must come from the same value_counts() result as the wedge
# sizes: value_counts() orders by frequency while unique() orders by first
# appearance, so mixing the two can attach labels to the wrong wedges.
remote_share = df['remote_ratio'].value_counts(normalize=True)
plt.figure(figsize=(8, 8))
plt.pie(remote_share, labels=remote_share.index, autopct='%1.1f%%')
plt.title('Remote Work Ratio Distribution')
plt.show()
# Histogram of salary_in_usd using 10 equal-width bins.
plt.figure(figsize=(10, 6))
salary_usd = df['salary_in_usd']
salary_min = salary_usd.min()
salary_max = salary_usd.max()
Gl = (salary_max - salary_min) / 10  # width of one bin
# linspace always returns exactly 11 edges ending at the maximum; arange
# with a floating-point step may produce one edge more or fewer because of
# rounding.
bins = np.linspace(salary_min, salary_max, 11)
plt.hist(salary_usd, bins=bins, edgecolor='black')
plt.xticks(bins)  # use the bin edges as the x-axis ticks
plt.xlabel('salary_in_usd')
plt.ylabel('Frequency')
plt.title('salary_in_usd Distribution')
plt.show()
# Label-encode every non-numeric column in place.
# A separate encoder is kept per column so each mapping can be inspected or
# inverted later; a single shared encoder only remembers the last column.
label_encoders = {}
le = LabelEncoder()
for col in df.columns:
    # Test for "not numeric" rather than dtype == 'object' so that columns
    # stored with the pandas string dtype are encoded as well.
    if not pd.api.types.is_numeric_dtype(df[col]):
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le

# Standardize the regression target (zero mean, unit variance).
scaler = StandardScaler()
df['salary_in_usd'] = scaler.fit_transform(df[['salary_in_usd']]).ravel()

# Features are all remaining columns; the target is the standardized salary.
# NOTE(review): X still contains 'salary' and 'salary_currency', which
# presumably determine salary_in_usd almost directly -- confirm whether this
# target leakage is intended before trusting the scores.
X = df.drop('salary_in_usd', axis=1)
y = df['salary_in_usd']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Regression models to compare. The stochastic estimators are seeded so the
# reported metrics are reproducible, matching the seeded train/test split.
models = {
    "LinearRegression": LinearRegression(),
    "DecisionTreeRegressor": DecisionTreeRegressor(random_state=42),
    "RandomForestRegressor": RandomForestRegressor(random_state=42),
    "GradientBoostingRegressor": GradientBoostingRegressor(random_state=42),
}

# Fit each model on the training split and report its test-set metrics.
# y was standardized before the split, so MSE and RMSE are in standardized
# units, not dollars.
for name, model in models.items():
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)  # reused for the RMSE
    printf(100, f'{name}的MSE、RMSE、R2 Score分别为:')
    print(f"{name} MSE: {mse}")
    print(f"{name} RMSE: {np.sqrt(mse)}")
    print(f"{name} R2 Score: {r2_score(y_test, predictions)}")
# Remove salary outliers with the 1.5 * IQR rule.
Q1 = df['salary_in_usd'].quantile(0.25)
Q3 = df['salary_in_usd'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
# Take an explicit copy of the filtered rows: adding a column to a
# boolean-indexed selection otherwise triggers SettingWithCopyWarning.
within_bounds = (df['salary_in_usd'] >= lower_bound) & (df['salary_in_usd'] <= upper_bound)
df = df[within_bounds].copy()

# Bin the remaining salaries into four quartile-based levels.
df['salary_level'] = pd.qcut(df['salary_in_usd'], q=4, labels=["低工资等级", "中等", "较高等级", "高等级"])

# Classification data: predict the salary level from the other columns.
X = df.drop(['salary_in_usd', 'salary_level'], axis=1)
y = df['salary_level']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Classifiers to compare. The stochastic estimators are seeded so the
# reported accuracies are reproducible, matching the seeded train/test split.
# NOTE(review): LogisticRegression and KNeighborsClassifier are trained on
# unscaled features here -- consider standardizing X if convergence warnings
# or poor distance behaviour show up.
classification_models = {
    "LogisticRegression": LogisticRegression(max_iter=1000, random_state=42),
    "DecisionTreeClassifier": DecisionTreeClassifier(random_state=42),
    "RandomForestClassifier": RandomForestClassifier(random_state=42),
    "GradientBoostingClassifier": GradientBoostingClassifier(random_state=42),
    "KNeighborsClassifier": KNeighborsClassifier(),
}

# Fit each classifier and report its accuracy on the held-out test split.
for name, model in classification_models.items():
    printf(100, f'{name}分类的准确率为:')
    model.fit(X_train, y_train)
    accuracy = model.score(X_test, y_test)  # mean accuracy for classifiers
    print(f"{name} Accuracy: {accuracy}")