天池算法大赛是阿里巴巴（阿里云）旗下的算法竞赛平台
赛题链接: https://tianchi.aliyun.com/competition/entrance/231702/introduction
数据获取
数据清洗整理分析
https://www.cnblogs.com/jp-mao/p/10487082.html
code
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Pipeline: load the survey CSVs, clean them, standardize, reduce with PCA,
# grid-search the regularization strength C of a multinomial logistic
# regression on a hold-out split, then write test-set predictions to
# 'log_reg_pca.csv'.

# Load the train and test sets. encoding='gbk' is required; utf-8 does not
# work for these files.
train_set = pd.read_csv('happiness_train_complete.csv', encoding='gbk')
test_set = pd.read_csv('happiness_test_complete.csv', encoding='gbk')

# Remove rows whose label is not a valid value (e.g. -8).
train_set = train_set[train_set.happiness > 0]
y_label = np.array(train_set.happiness, dtype=int)

# Columns dropped from both sets; the training set additionally loses the
# label column.
ind2 = ['id', 'survey_time', 'edu_other', 'join_party', 'property_other', 'invest_other']
ind1 = ind2 + ['happiness']
X_train_set = np.array(train_set.drop(ind1, axis=1), dtype=float)
X_test_set = np.array(test_set.drop(ind2, axis=1), dtype=float)

# Replace missing values with -1. strategy='constant' is required: with the
# default strategy ('mean') the fill_value argument is ignored.
imputer = SimpleImputer(strategy='constant', fill_value=-1).fit(X_train_set)
X_train_set = imputer.transform(X_train_set)
X_test_set = imputer.transform(X_test_set)

# Collapse every negative value to -1.
X_train_set[X_train_set < 0] = -1
X_test_set[X_test_set < 0] = -1

# Standardize using statistics fitted on the training set only.
std = StandardScaler().fit(X_train_set)
X_train__std = std.transform(X_train_set)
X_test__std = std.transform(X_test_set)

# PCA keeping enough components to explain 95% of the variance.
pca = PCA(0.95)
pca.fit(X_train__std)
X_train_pca = pca.transform(X_train__std)
X_test_pca = pca.transform(X_test__std)

# Hold-out split used to select C.
X_train, X_test, y_train, y_test = train_test_split(X_train_pca, y_label, random_state=666)

# Grid-search C, keeping the value with the lowest mean squared error on the
# hold-out split. Starting from +inf guarantees the first candidate is taken.
# NOTE(review): `multi_class` is reported as deprecated in recent
# scikit-learn releases -- confirm against the installed version.
best_c = 0.
best_score = 0.
best_sum = np.inf
for c in np.arange(0.001, 0.3, 0.001):
    log_reg2 = LogisticRegression(C=c, multi_class='multinomial', solver='newton-cg').fit(X_train, y_train)
    y_pre = log_reg2.predict(X_test)
    s = np.mean((y_pre - y_test) ** 2)
    if s < best_sum:
        best_sum = s
        best_c = c
        best_score = log_reg2.score(X_test, y_test)

print('c:', best_c)
print('score:', best_score)
print('sum:', best_sum)

# Fit the final model with the C selected above (previously hard-coded to
# 0.01, which discarded the search result) and predict the competition test set.
log_reg = LogisticRegression(C=best_c, multi_class='multinomial', solver='newton-cg').fit(X_train, y_train)
y_pre2 = log_reg.predict(X_test_pca)

# The prediction column is named 'happiness' (was misspelled 'happniess').
df = pd.DataFrame({'id': test_set.id, 'happiness': y_pre2})
df.to_csv('log_reg_pca.csv', index=False)
运行完后会在当前目录生成一个 csv 文件（log_reg_pca.csv）
提交结果
查看成绩
参考:
https://www.cnblogs.com/jp-mao/p/10487082.html
比较不错的