机器学习
sklearn机器学习库
import sklearn
sklearn.__version__
sklearn提供了哪些内置的数据集用于学习
from sklearn import datasets
datasets.load_boston #任务:回归预测
datasets.load_digits #任务:分类
datasets.load_breast_cancer #任务:分类或者聚类
datasets.load_iris #任务:分类或者聚类
数据集划分
from sklearn.model_selection import train_test_split
# Split the features and labels into train/test parts. The four results are,
# in order: training features, test features, training labels, test labels.
# test_size=0.25 puts 25% of the samples in the test part; random_state=58
# is the seed passed to the splitter.
cancer_data_train,cancer_data_test,cancer_target_train,cancer_target_test=train_test_split(cancer_data,cancer_target,test_size=0.25,random_state=58)
转换器
fit:主要通过分析特征和目标值提取有价值的信息 transform:主要用来对特征进行转换 fit_transform:先调用fit方法,然后再调用transform方法
from sklearn.preprocessing import MinMaxScaler
# Fit the min-max scaler on the training features only, then apply that same
# fitted scaler to both the training and the test features.
Scaler=MinMaxScaler().fit(cancer_data_train)
cancer_trainScaler=Scaler.transform(cancer_data_train)
cancer_testScaler=Scaler.transform(cancer_data_test)
特征处理
机器学习数据降维方法总结(附python代码) https://zhuanlan.zhihu.com/p/44258470
import eli5
# Build a permutation-importance wrapper around an XGBClassifier fitted on the
# training data, fit the wrapper on the same data, then display the weights.
# NOTE(review): scoring='r2' is a regression metric while the estimator is a
# classifier -- confirm this scoring choice is intended.
performance=eli5.sklearn.PermutationImportance(XGBClassifier().fit(X_train,y_train),random_state=42,scoring='r2')
performance.fit(X_train,y_train)
# Feature names are taken from columns 9..12 of `train`.
eli5.show_weights(performance,feature_names=train.iloc[:,9:13].columns.tolist())
PCA
from sklearn.decomposition import PCA
# Shape of the array of distinct values found in the scaled training data.
np.unique(cancer_trainScaler).shape
# Fit a 10-component PCA on the scaled training data, then project both the
# training and the test data with that same fitted model.
pca_model=PCA(n_components=10).fit(cancer_trainScaler)
cancer_trainPca=pca_model.transform(cancer_trainScaler)
cancer_testPca=pca_model.transform(cancer_testScaler)
# Shape of the projected training data.
cancer_trainPca.shape
LabelEncoder
from sklearn.preprocessing import LabelEncoder
# Integer-encode every column of train1 in place.  The single encoder is
# re-fitted on each column, so after the loop it only holds the classes of
# the last column processed.
labelencoder = LabelEncoder()
for col in train1.columns:
    train1[col] = labelencoder.fit_transform(train1[col])
train1.head()  # the result is a pandas DataFrame
词频统计
# TODO:数据处理
from sklearn.feature_extraction.text import CountVectorizer
# Build a word-count matrix from the 'description' column of data1.
count_vect = CountVectorizer()
# The column values are cast to unicode strings before fitting.
# Despite its name, X_train_counts is used below as the fitted vectorizer
# (get_feature_names / transform are called on it), not as a count matrix.
X_train_counts = count_vect.fit(data1['description'].values.astype('U'))
# Get the distinct words found in the document collection.
# NOTE(review): newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out() -- confirm against the installed version.
X_train_counts.get_feature_names()
# Dense array of the per-document word counts.
doc_array = X_train_counts.transform(data1['description'].values.astype('U')).toarray()
# One column per vocabulary word.
frequency_matrix = pd.DataFrame(doc_array, columns=count_vect.get_feature_names())
frequency_matrix
数据处理
不符合正态分布的变量给转化成正态分布的
sns.distplot(df_train['SalePrice'], fit=norm);
fig = plt.figure()
res = stats.probplot(df_train['SalePrice'], plot=plt)
这个图里可以看到 ‘SalePrice’ 的分布是正偏度,在正偏度的情况下,用 log 取对数后可以做到转换:
#applying log transformation
df_train['SalePrice'] = np.log(df_train['SalePrice'])
构造特征
1. 数值变类别型
例如,MoSold: Month Sold 这个变量看起来是数值型的,但其实更符合类别型的,所以要做一下转换:
"MoSold" : {1 : "Jan", 2 : "Feb", 3 : "Mar", 4 : "Apr", 5 : "May", 6 : "Jun", 7 : "Jul", 8 : "Aug", 9 : "Sep", 10 : "Oct", 11 : "Nov", 12 : "Dec"}
2. 类别型加顺序
例如,Functional: Home functionality rating 这个变量,它是个 rating,那么这种数值应该是有序的,并且这种顺序是带有信息的,那我们就给转化成数字:
"Functional" : {"Sal" : 1, "Sev" : 2, "Maj2" : 3, "Maj1" : 4, "Mod": 5, "Min2" : 6, "Min1" : 7, "Typ" : 8}
3. 简化类别
当然类别太多了的不好,可以进一步简化成两三个等级:
# Collapse the 8-level Functional rating into 4 coarse levels.
# Assumes train.Functional already holds the integer codes 1..8 -- TODO confirm.
train["SimplFunctional"] = train.Functional.replace(
    {
        1: 1, 2: 1,        # bad
        3: 2, 4: 2,        # major
        5: 3, 6: 3, 7: 3,  # minor
        8: 4,              # typical
    }
)
4. 构造多项式
另外一种常用的方式是构造多项式,一般是 2次项,3次项,开平方:
# Polynomial features derived from OverallQual: square, cube and square root.
train["OverallQual-s2"] = train["OverallQual"] ** 2
train["OverallQual-s3"] = train["OverallQual"] ** 3
train["OverallQual-Sq"] = np.sqrt(train["OverallQual"])
5. 加减乘除
还有通过加减乘除的数学关系构造:
OverallQual: Overall material and finish quality
OverallCond: Overall condition rating
# Interaction feature: product of the OverallQual and OverallCond columns.
train["OverallGrade"] = train["OverallQual"] * train["OverallCond"]
6. 变为 one-hot
然后我们来把 categorical 的变量给变成 one-hot 的形式:
#convert categorical variable into dummy
df_train = pd.get_dummies(df_train)
模型选择
详情见笔记sklearn
模型评估
序号 | 名称 | 作用 | 类别 |
---|---|---|---|
1 | from sklearn.metrics import precision_score | #查看精确率,average='micro'设为全局指标 precision_score(Y_test,predict,average='micro') | 分类 |
2 | recall_score | #查看召回率 recall_score(Y_test,predict,average='micro') | 分类 |
3 | f1_score | #查看F1值 f1_score(Y_test,predict,average='micro') | 分类 |
4 | cohen_kappa_score | #查看Cohen’s Kappa系数 cohen_kappa_score(Y_test,predict) | 分类 |
5 | classification_report | #导入分类模型评价报告模块 print(classification_report(Y_test,predict)) | 分类 |
6 | explained_variance_score | #可释方差也叫解释方差,越大越好 explained_variance_score(y_test,lr_y_predict) | 回归 |
7 | mean_absolute_error / r2_score | #平均绝对误差,越小越好 mean_absolute_error(y_test,lr_y_predict);#R方值,决定系数,越大越好 r2_score(y_test,lr_y_predict) | 回归 |
8 | mean_squared_error | #均方误差,越小越好 mean_squared_error(y_test,lr_y_predict) | 回归 |
9 | median_absolute_error | #中值绝对误差,越小越好 median_absolute_error(y_test,lr_y_predict) | 回归 |
10 | inertias | 表示各样本到其最近聚类中心的距离平方和,该值越小越好 | 聚类 |
11 | adjusted_rand_s | 调整后的兰德指数(Adjusted Rand Index) ,其取值范围为[-1,1],负数代表结果不好,越接近于1越好 | 聚类 |
12 | mutual_info_s | 指的是相同数据的两个标签之间的相似度的量度,结果是非负值 | 聚类 |
13 | adjusted_mutual_info_s | 调整后的互信息(Adjusted Mutual Information,AMI),也可能为负数 | 聚类 |
14 | homogeneity_s | 同质化得分(Homogeneity),其取值范围[0,1]值越大意味着聚类结果与真实情况越吻合 | 聚类 |
15 | Completeness | 完整性得分(Completeness),其取值范围[0,1],值越大意味着聚类结果与真实情况越吻合 | 聚类 |
16 | v_measure_s | 它是同质化和完整性之间的调和平均值,其取值范围[0,1],值越大意味着聚类结果与真实情况越吻合。 | 聚类 |
17 | silhouette_s【常用】 | 轮廓系数(Silhouette),其最高值为1,最差值为-1,0附近的值表示重叠的聚类,负值通常表示样本已被分配到错误的集群。 | 聚类 |
18 | calinski_harabaz_s | calinski_harabaz_s:该分数定义为簇间离散与簇内离散的比值,值越大表示聚类效果越好,它是一种非监督式评估指标 | 聚类 |
保存模型
# Save the model.
# The model file uses the .pkl extension.
joblib.dump(svc,'myDigitsModel.pkl')
加载模型
mysvc=joblib.load('myDigitsModel.pkl')
mysvc
保存结果
# TODO:保存结果到指定文件
np.savetxt('../output/result.txt', y_pred)
#保存结果
pd.DataFrame(y_pred, columns=['label']).to_csv('../result/test_predict.csv', index=False, header=True)
用户界面设计
import tkinter as tk
from tkinter import filedialog
from tkinter import ttk
win = tk.Tk()
win.geometry('800x600+400+300')
win.title("AINDT2019")
def open_py():
    """Ask the user for a file and show its contents in the ``py_txt`` box.

    The chosen path is stored in the module-level ``filename`` so that
    ``open_image`` can read the same file afterwards.  Cancelling the dialog
    leaves the previous selection and the text box untouched.
    """
    global filename
    selected = filedialog.askopenfilename()
    if not selected:
        # Dialog was cancelled; keep whatever was selected before.
        return
    filename = selected
    # Read as text rather than raw bytes so the widget receives real
    # characters; undecodable bytes are replaced instead of raising.
    with open(filename, encoding='utf-8', errors='replace') as f:
        content = f.read()
    py_txt.delete('1.0', tk.END)
    py_txt.insert(tk.INSERT, content)
    print('文件名称' + filename)
def open_image():
    """Run the pickled model on the selected file and show the output.

    Despite its name, this callback displays no image.  It reads the file
    whose path ``open_py`` stored in the module-level ``filename``, skips the
    first row, drops the first column, transposes the table, and feeds the
    first 1038 columns of the result to the model's ``predict``.  The
    prediction and the transposed ``describe()`` summary are then written to
    the ``py_txt2`` box.
    """
    import pandas as pd
    import pickle

    # ``filename`` only exists once open_py() has stored a selection; show a
    # hint instead of failing with NameError inside the Tk callback.
    selected = globals().get('filename')
    if not selected:
        py_txt2.delete('1.0', tk.END)
        py_txt2.insert(tk.INSERT, 'Please select a file first.')
        return

    # Raw string: "\s" is not a valid escape in an ordinary string literal.
    result68 = pd.read_csv(selected, engine="python", sep=r"\s+",
                           header=None, skiprows=1)
    result68 = result68.iloc[:, 1:]
    result68 = result68.T
    w = result68.describe().T

    # NOTE(review): pickle.load can execute arbitrary code; only load a model
    # file that comes from a trusted source.  The path is hard-coded.
    with open('D:/金砖比赛训练资料/培训金砖2020/gsc.pickle', 'rb') as f:
        gsc2 = pickle.load(f)
    presult = gsc2.predict(result68.iloc[:, :1038])

    py_txt2.delete('1.0', tk.END)
    py_txt2.insert(tk.INSERT, presult)
    py_txt2.insert(tk.INSERT, w)
# Three-line usage description shown at the top of the window.  Each label is
# created first and packed on a separate line: pack() returns None, so
# chaining it would bind label1..label3 to None instead of the widget.
label1 = tk.Label(win, text="This application is designed to automate the work of the operator while decoding the results ")
label1.pack()
label2 = tk.Label(win, text="of the inspection of welds. In order to use the function of this software, you must press the buttons to select files.")
label2.pack()
label3 = tk.Label(win, text="download them and wait for the analysis and output of the results in the window below.")
label3.pack()

# Button that lets the user choose the input file (handled by open_py).
tk.Button(win, text="Select file(s)", bg="orange", command=open_py).place(x=350, y=70)
# Upper text box: open_py writes the selected file's contents here.
py_txt = tk.Text(win)
py_txt.place(x=20, y=110, width=700, height=200)

# Button that runs the analysis (handled by open_image).
tk.Button(win, text='Results', command=open_image).place(x=350, y=320)
# Lower text box: open_image writes the prediction and statistics here.
py_txt2 = tk.Text(win)
py_txt2.place(x=20, y=350, width=700, height=200)

# Start the Tk event loop.
win.mainloop()