每天一个python知识点
1 将类别变量转换为数值编码(也可以用 pandas 的 category 类型直接转换)
import numpy as np
import pandas as pd
# Distinct class labels, sorted once so every label maps to a stable integer code.
# A plain list has no .unique() method; de-duplicate through set() instead.
raw_labels = ["a", "b", "c", "c"]
unique_class = sorted(set(raw_labels))

# Both columns have five entries; a shorter Series would leave NaN in the last row.
data1 = {
    "one": pd.Series(["a", "b", "c", "a", "c"]),
    "two": pd.Series(np.random.rand(5)),
}
df3 = pd.DataFrame(data1)


def transfer_class2num(val, unique_class=unique_class):
    """Return the integer code of ``val``: its position among the sorted labels.

    Args:
        val: The class label to encode.
        unique_class: Known class labels. Defaults to the module-level list.

    Returns:
        The zero-based index of ``val`` in the sorted labels.

    Raises:
        ValueError: If ``val`` is not one of the labels.
    """
    # sorted() works on a copy, so the caller's list is no longer reordered in place.
    return sorted(unique_class).index(val)


df3['one'] = df3['one'].apply(transfer_class2num)
print(df3)
2 用不同颜色画出不同类型的散点图
# Scatter plot that draws each category in its own color.
# NOTE(review): `data` is expected to be a DataFrame with 'deaths', 'py' and
# 'factor' columns defined earlier — it is not created in this snippet.
import matplotlib.pyplot as plt  # the snippet previously used plt without importing it

fig1 = plt.figure(1, figsize=(6, 4))
colors = ['b', 'g', 'r', 'orange', '#F0F8FF', '#FAEBD7']
Label_Com = [1, 2, 3, 4, 5, 6]
for index in range(6):
    # Select the rows of this category once and reuse them for both axes.
    subset = data.loc[data['deaths'] == index]
    Price = subset["py"]
    Index = subset['factor']
    # One fixed color per group: `cmap` only applies to numeric `c` values, so it
    # is dropped. `label=` registers the group so the legend can pick it up.
    plt.scatter(Index, Price, color=colors[index], s=40, alpha=0.2, marker='8',
                linewidth=0, label=Label_Com[index])
# Optional: pin the y-axis range with plt.ylim(0.01, 0.09)
ax = fig1.gca()
# Rotate the x tick labels so they do not overlap.
for label in ax.xaxis.get_ticklabels():
    label.set_rotation(30)
plt.xlabel('Time')
plt.ylabel('Price')
# The handles now exist because every scatter call carries a label; handles and
# labels are both passed positionally to avoid mixing positional/keyword forms.
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, labels, loc='upper right')
画出相关系数的热力图方法一
# Correlation heatmap, approach 1: pandas correlation matrix drawn with seaborn.
import matplotlib.pyplot as plt
import seaborn as sns

a = data.corr()  # pairwise correlation between the columns of `data`
print(a)

plt.subplots(figsize=(9, 9))
# The colormap ("Reds") can be replaced by any other colormap name.
heatmap_options = dict(annot=True, vmax=1, square=True, cmap="Reds")
sns.heatmap(a, **heatmap_options)
plt.show()
画出相关系数的热力图方法二
# Correlation heatmap, approach 2: NumPy correlation coefficients drawn with matshow.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

column_names = data.keys()
tick_positions = np.arange(len(column_names))

# np.corrcoef treats each row as one variable, hence the transpose.
cov = np.corrcoef(data.T)
# The colors are selected through the `cmap` argument of matshow.
img = plt.matshow(cov, cmap=plt.cm.winter)
plt.colorbar(img, ticks=[-1, 0, 1])
plt.xticks(tick_positions, column_names)
plt.yticks(tick_positions, column_names)
plt.show()
画出多分类的 ROC 曲线(以 iris 数据为例)
# -*- coding: utf-8 -*-
"""
Created on Sat Jun 5 19:40:21 2021
@author: l't
"""
# ROC curves for a multi-class problem (one-vs-rest) on the iris data set.
# Required libraries
import numpy as np
import matplotlib.pyplot as plt
from itertools import cycle
from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
# NOTE: `from scipy import interp` used to be imported here. That alias of
# numpy.interp was deprecated and is missing from current SciPy releases, so the
# interpolation below calls np.interp directly.

# Load the data
iris = datasets.load_iris()
X = iris.data
y = iris.target

# Binarize the labels: one indicator column per class
y = label_binarize(y, classes=[0, 1, 2])

# Number of classes = number of indicator columns
n_classes = y.shape[1]

# Train the model and predict
random_state = np.random.RandomState(0)
n_samples, n_features = X.shape

# Shuffle and split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5, random_state=0)

# Learn to predict each class against the others
classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True,
                                         random_state=random_state))
y_score = classifier.fit(X_train, y_train).decision_function(X_test)

# ROC curve and area for every single class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Micro-average ROC curve and area (approach 2): all class columns are
# flattened into one long binary problem.
fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

# Macro-average ROC curve and area (approach 1)
# First aggregate all false positive rates
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

# Then interpolate every per-class ROC curve at these points
mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
    mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])

# Finally average and compute the AUC
mean_tpr /= n_classes
fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
# Draw every ROC curve on a single figure.
lw = 2
plt.figure()

# Averaged curves first, as dotted lines: (key, label template, line color).
for avg_key, avg_template, avg_color in (
    ("micro", 'micro-average ROC curve (area = {0:0.2f})', 'deeppink'),
    ("macro", 'macro-average ROC curve (area = {0:0.2f})', 'navy'),
):
    plt.plot(fpr[avg_key], tpr[avg_key],
             label=avg_template.format(roc_auc[avg_key]),
             color=avg_color, linestyle=':', linewidth=4)

# One solid curve per class; cycle() repeats the colors if they run out.
colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
for i in range(n_classes):
    color = next(colors)
    class_label = 'ROC curve of class {0} (area = {1:0.2f})'.format(i, roc_auc[i])
    plt.plot(fpr[i], tpr[i], color=color, lw=lw, label=class_label)

# Dashed black diagonal from (0, 0) to (1, 1) as a visual reference.
plt.plot([0, 1], [0, 1], 'k--', lw=lw)

# Axis ranges, axis titles, figure title and legend.
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Some extension of Receiver operating characteristic to multi-class')
plt.legend(loc="lower right")
plt.show()
# Cross-validation of a linear SVM on the digits data set.
from time import time

from sklearn import svm
from sklearn.datasets import load_digits
from sklearn.model_selection import cross_val_score

# NOTE(review): the original called `loaddigits()`, which is not defined in this
# file; scikit-learn's load_digits is assumed. return_X_y=True yields a
# (features, targets) tuple, which is what data[0] / data[1] below expect.
data = load_digits(return_X_y=True)
# The estimator class is svm.SVC (not SCV) and its regularization parameter is
# the upper-case `C`; a lower-case `c` is rejected as an unknown keyword.
svcClassifier = svm.SVC(kernel="linear", C=1000, gamma=0.001)
# Cross-validation
star = time()  # start timestamp, kept from the original snippet
scores = cross_val_score(svcClassifier, data[0], data[1], cv=8)
print("交叉验证得分情况:\n", scores)
print("平均分", scores.mean())
基于某一列修改某一列的值
当roundness中的值大于0.8时将data这个dataframe中的lala列改为a
# Write "a" into column "lala" for every row whose roundness exceeds 0.8.
above_threshold = data.roundness > 0.8
data.loc[above_threshold, "lala"] = "a"
数据正态性检验
from scipy import stats
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt  # required by the figures below

# NOTE(review): `data` is assumed to be a one-dimensional pandas Series of
# numeric values defined earlier — confirm against the surrounding notes.

# Normality test (Kolmogorov-Smirnov against a normal distribution with the
# sample mean and standard deviation); a p-value above 0.05 is taken as
# "consistent with a normal distribution".
u = np.mean(data)
std = np.std(data)
ks_result = stats.kstest(data, "norm", (u, std))
print(ks_result)  # the result was previously computed and thrown away

# Density curve (upper panel)
fig = plt.figure(figsize=(10, 6))  # figsize is a keyword argument, not a function
ax1 = fig.add_subplot(2, 1, 1)
data.plot(kind='kde', style='--k', grid=True, title='密度曲线', ax=ax1)
# 3-sigma limits, centered on the mean so they agree with the outlier filter
# below. The old `hold=` keyword no longer exists in matplotlib and is dropped.
ax1.axvline(u + 3*std, linestyle='--', color='r')
ax1.axvline(u - 3*std, linestyle='--', color='r')

# Split the values into outliers (more than 3 sigma from the mean) and the rest
error = data[np.abs(data - u) > 3*std]
data_c = data[np.abs(data - u) <= 3*std]

# Scatter of regular values and outliers (lower panel, outliers in red)
ax2 = fig.add_subplot(2, 1, 2)
ax2.scatter(data_c.index, data_c, alpha=0.3)
ax2.scatter(error.index, error, color='r', marker='o', alpha=0.8)
模糊匹配和精确匹配
import difflib

# Candidates must be a flat list of strings. Wrapping the column names in an
# extra list made difflib compare 'a' against the whole list instead of each name.
col_list = data.columns.tolist()
# Third argument (n=1): return at most one match. cutoff=0.7: minimum similarity
# ratio a candidate needs in order to be returned.
a = difflib.get_close_matches('a', col_list, 1, cutoff=0.7)
时间转换函数
直接转化
# Parse the loan_month column with an explicit format; the result is not assigned here.
# NOTE(review): the original note gave 20200101 as the sample value, but '%Y%m'
# only covers year and month (e.g. 202001) — confirm the actual layout of loan_month.
pd.to_datetime(data['loan_month'],format='%Y%m')
# Alternative for values written with a dash between year and month, e.g. 2020-01.
pd.to_datetime(data['loan_month'],format='%Y-%m')