python知识点

每天一个python知识点

1 将变量转换为分类变量(pandas 的 category 类型也可以直接转换)

import numpy as np
import pandas as pd
# Distinct class labels, sorted so that each label's position is its integer code.
# (A plain list has no .unique(); set() removes the duplicates instead.)
unique_class = sorted(set(["a", "b", "c", "c"]))
data1 = {
    "one": pd.Series(["a", "b", "c", "a", "c"]),
    # Same length as "one"; a shorter Series would leave NaN in the last row.
    "two": pd.Series(np.random.rand(5)),
}
df3 = pd.DataFrame(data1)


def transfer_class2num(val, unique_class=unique_class):
    """Return the integer code of ``val``: its position among the sorted labels.

    Args:
        val: A class label that occurs in ``unique_class``.
        unique_class: Known class labels; defaults to the module-level list.

    Returns:
        Zero-based index of ``val`` in the sorted labels.

    Raises:
        ValueError: If ``val`` is not one of the labels.
    """
    # sorted() works on a copy, so neither the caller's list nor the shared
    # default argument is reordered as a side effect.
    return sorted(unique_class).index(val)


# pandas alternative: df3['one'].astype('category').cat.codes
df3['one'] = df3['one'].apply(transfer_class2num)
print(df3)

2 用不同颜色画出不同类型的散点图

# Scatter plot that draws each category in its own colour.
# NOTE(review): `data` is not defined in this snippet; it is expected to be a
# DataFrame with 'deaths', 'py' and 'factor' columns - confirm against the caller.
import matplotlib.pyplot as plt

fig1 = plt.figure(1, figsize=(6, 4))
ax = fig1.gca()
colors = ['b', 'g', 'r', 'orange', '#F0F8FF', '#FAEBD7']
Label_Com = [1, 2, 3, 4, 5, 6]
for index in range(6):
    # Select the rows of the current category once and reuse them for both axes.
    subset = data.loc[data['deaths'] == index]
    Price = subset["py"]
    Index = subset['factor']
    # `cmap` is omitted: it is ignored when `c` is a single colour name.
    # `label` registers the series with the legend built below.
    ax.scatter(Index, Price, c=colors[index], label=str(Label_Com[index]),
               s=40, alpha=0.2, marker='8', linewidth=0)
# plt.ylim(0.01,0.09)
for label in ax.xaxis.get_ticklabels():
    label.set_rotation(30)
plt.xlabel('Time')
plt.ylabel('Price')
# The legend picks up the handles and labels registered by the scatter calls.
ax.legend(loc='upper right')

画出相关系数的热力图方法一

# Correlation heatmap, method one: pandas computes the matrix, seaborn draws it.
import matplotlib.pyplot as plt
import seaborn as sns

# Pairwise correlation coefficients between the columns of `data`.
a = data.corr()
print(a)

# Draw the matrix as an annotated, square-celled heatmap on a 9x9 inch figure.
heat_fig, heat_ax = plt.subplots(figsize=(9, 9))
sns.heatmap(
    a,
    annot=True,
    vmax=1,
    square=True,
    cmap="Reds",  # the colour scheme can be changed here
    ax=heat_ax,
)
plt.show()

画出相关系数的热力图方法二

# Correlation heatmap, method two: numpy computes the matrix, matplotlib draws it.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# np.corrcoef treats each row as one variable, hence the transpose.
# Despite its name, `cov` holds correlation coefficients.
cov = np.corrcoef(data.T)
# The colour scheme is selected through the `cmap` argument.
img = plt.matshow(cov, cmap=plt.cm.winter)
plt.colorbar(img, ticks=[-1, 0, 1])

# Label both axes with the column names of `data`.
column_names = data.keys()
tick_positions = np.arange(len(column_names))
plt.xticks(tick_positions, column_names)
plt.yticks(tick_positions, column_names)
plt.show()

画出多分类的roc曲线以iris数据为例

链接

# -*- coding: utf-8 -*-
"""
Created on Sat Jun  5 19:40:21 2021

@author: l't
"""
# Multi-class ROC curves (one-vs-rest) on the iris data set.
# Required libraries.
import numpy as np
import matplotlib.pyplot as plt
from itertools import cycle
from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
# `from scipy import interp` was dropped: it is a deprecated alias of
# numpy.interp that newer SciPy releases no longer provide. np.interp is used below.

# Load the data.
iris = datasets.load_iris()
X = iris.data
y = iris.target
# Binarize the labels: one indicator column per class.
y = label_binarize(y, classes=[0, 1, 2])
# Number of classes.
n_classes = y.shape[1]
# Train the model and predict.
random_state = np.random.RandomState(0)
# Shuffle and split training and test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5, random_state=0)
# Learn to predict each class against the others.
classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True,
                                 random_state=random_state))
y_score = classifier.fit(X_train, y_train).decision_function(X_test)

# Compute the ROC curve and AUC of every class.
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Micro-average: pool every (sample, class) decision into one binary problem.
fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

# Macro-average: average the per-class curves on a common FPR grid.
# First aggregate all false positive rates.
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
# Then interpolate every ROC curve at these points.
mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
    mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
# Finally average and compute the AUC.
mean_tpr /= n_classes
fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

# Plot all ROC curves.
lw = 2
plt.figure()
plt.plot(fpr["micro"], tpr["micro"],
         label='micro-average ROC curve (area = {0:0.2f})'.format(roc_auc["micro"]),
         color='deeppink', linestyle=':', linewidth=4)

plt.plot(fpr["macro"], tpr["macro"],
         label='macro-average ROC curve (area = {0:0.2f})'.format(roc_auc["macro"]),
         color='navy', linestyle=':', linewidth=4)

colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
for i, color in zip(range(n_classes), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=lw,
             label='ROC curve of class {0} (area = {1:0.2f})'.format(i, roc_auc[i]))
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], 'k--', lw=lw)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Some extension of Receiver operating characteristic to multi-class')
plt.legend(loc="lower right")
plt.show()

# Cross-validation of a linear SVM on the digits data set.
from time import time

from sklearn import svm
from sklearn.datasets import load_digits
from sklearn.model_selection import cross_val_score

# return_X_y=True yields a (features, targets) tuple, so data[0] / data[1] work.
data = load_digits(return_X_y=True)
# The class is SVC and its regularisation parameter is the upper-case `C`.
svcClassifier = svm.SVC(kernel="linear", C=1000, gamma=0.001)
# Cross-validation.
star = time()  # start timestamp; not used further in this snippet
scores = cross_val_score(svcClassifier, data[0], data[1], cv=8)
print("交叉验证得分情况:\n", scores)
print("平均分", scores.mean())

基于某一列修改某一列的值

当roundness中的值大于0.8时将data这个dataframe中的lala列改为a

# Set column "lala" to "a" on every row whose "roundness" value exceeds 0.8.
data.loc[data["roundness"] > 0.8, "lala"] = "a"

数据正态性检验

from scipy import stats
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Kolmogorov-Smirnov normality test against a normal distribution with the
# sample mean and standard deviation. A p-value above 0.05 means normality is
# not rejected.
# NOTE(review): `data` is assumed to be a numeric pandas Series defined
# earlier - confirm against the caller.
u = np.mean(data)
std = np.std(data)
# Keep and show the result; as a bare expression it was silently discarded.
ks_result = stats.kstest(data, "norm", (u, std))
print(ks_result)

# Density curve of the data (top panel).
fig = plt.figure(figsize=(10, 6))
ax1 = fig.add_subplot(2, 1, 1)
data.plot(kind='kde', style='--k', grid=True, title='密度曲线', ax=ax1)
# 3-sigma limits centred on the mean, matching the outlier rule below.
# The removed `hold` keyword is no longer passed.
ax1.axvline(u + 3 * std, linestyle='--', color='r')
ax1.axvline(u - 3 * std, linestyle='--', color='r')

# Split the data into outliers and normal values with the 3-sigma rule.
error = data[np.abs(data - u) > 3 * std]
data_c = data[np.abs(data - u) <= 3 * std]
# Bottom panel: normal values faded, outliers highlighted in red.
ax2 = fig.add_subplot(2, 1, 2)
ax2.scatter(data_c.index, data_c, alpha=0.3)
ax2.scatter(error.index, error, color='r', marker='o', alpha=0.8)

模糊匹配和精确匹配

import difflib

# Candidate strings to search: the column names of `data`, as a flat list.
# (Wrapping the list in another list would compare 'a' against the whole
# list instead of against each column name.)
col_list = data.columns.tolist()
# Return at most one column name whose similarity ratio to 'a' is >= 0.7.
# The third argument is the maximum number of matches to return, not a
# switch between exact and fuzzy matching; strictness is set by `cutoff`.
a = difflib.get_close_matches('a', col_list, n=1, cutoff=0.7)

时间转换函数

直接转化

# Two alternative conversions of the 'loan_month' column; use the one whose
# format matches the stored text. The results are not assigned here.
# '%Y%m' parses year-month text such as 202001 (a full date like 20200101
# would need '%Y%m%d' instead).
pd.to_datetime(data['loan_month'],format='%Y%m')
# '%Y-%m' parses year-month text written with a dash, such as 2020-01.
pd.to_datetime(data['loan_month'],format='%Y-%m')
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值