1.sklearn.metrics.roc_curve
(来源)
sklearn.metrics.roc_curve(y_true, y_score, pos_label=None, sample_weight=None, drop_intermediate=True)
Parameters :
y_true : 数组,shape = [样本数]
在范围{0,1}或{-1,1}中真正的二进制标签。如果标签不是二进制的,则应该显式地给出pos_label
y_score : 数组, shape = [样本数]
目标得分,可以是积极类的概率估计,信心值,或者是决定的非阈值度量(在某些分类器上由“decision_function”返回)。
pos_label:int or str, 标签被认为是积极的,其他的被认为是消极的。
sample_weight: 顾名思义,样本的权重,可选择的
drop_intermediate: boolean, optional (default=True)
是否放弃一些不出现在绘制的ROC曲线上的次优阈值。这有助于创建更轻的ROC曲线
Returns : fpr, tpr, thresholds
import numpy as np
from sklearn import metrics
# Ground-truth labels for four samples.
y = np.array([1, 1, 2, 2])
# Scores predicted for the same four samples.
scores = np.array([0.1, 0.4, 0.35, 0.8])
# pos_label=2: label 2 is the positive class, so label 1 is the negative class.
fpr, tpr, thresholds = metrics.roc_curve(y, scores, pos_label=2)

# Values at index 1 of each returned array.
print(*(curve[1] for curve in (fpr, tpr, thresholds)))
# The three full arrays, separated by newlines.
print(fpr, '\n', tpr, '\n', thresholds)
阈值选择从大到小:
(1)取0.8为阈值(得分大于等于阈值的预测为1,其余为0),预测值为:[0,0,0,1]
fpr=反例中预测为正例的个数/实际反例个数=0/2=0
tpr=正例中预测为正例的个数/实际正例个数=1/2=0.5
(2)取0.4为阈值,预测为:[0,1,0,1]
fpr=1/2=0.5 tpr=1/2=0.5
(3)取0.35为阈值,预测为:[0,1,1,1]
fpr=1/2=0.5 tpr=2/2=1
0.0 0.5 0.8 #取阈值为0.8时结果
[ 0. 0.5 0.5 1. ] #fpr
[ 0.5 0.5 1. 1. ] #tpr
[ 0.8 0.4 0.35 0.1 ] #thresholds
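注:较新版本的 sklearn 会在返回结果的最前面额外添加一个点(fpr=0, tpr=0, 阈值为 max(y_score)+1=1.8,更新的版本中为 inf),用来保证曲线从 (0,0) 出发;分析时可以不管 0,0,1.8 这一列值。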
2.interp1d(x, y, kind='linear', ...)
一维插值(来源)
x和y参数是一系列已知的数据点,kind参数是插值类型,可以是字符串或整数。
插值是离散函数逼近的重要方法,利用它可通过函数在有限个点处的取值状况,估算出函数在其他点处的近似值。与拟合不同的是,要求曲线通过所有的已知数据。
import numpy as np
from scipy.interpolate import interp1d
import pylab as pl
# Known data points: 20 samples of cos(x) on [0, 10*pi].
x = np.linspace(0, 10*np.pi, 20)
y = np.cos(x)

# Build two interpolants over the same samples.
fl = interp1d(x, y, kind='linear')     # piecewise-linear interpolation
fq = interp1d(x, y, kind='quadratic')  # second-order spline interpolation

# Evaluate only between x.min() and x.max() so no query point falls
# outside the range covered by the known data.
xint = np.linspace(x.min(), x.max(), 1000)
yintl = fl(xint)
yintq = fq(xint)

# Plot the values computed above instead of evaluating the interpolants a second time.
pl.plot(xint, yintl, color="green", label = "Linear")
pl.plot(xint, yintq, color="yellow", label ="Quadratic")
pl.legend(loc = "best")
pl.show()
3.np.argwhere( a )
返回数组中非零(即条件为真)元素的索引,结果的每一行是一个元素的坐标;其中 a 是待检索的数组或条件表达式(例如 x>1)。
import numpy as np
# 2x3 array holding 0..5.
x = np.reshape(np.arange(6), (2, 3))
print("x:", x)

# Coordinates (row, column) of every element of x that is greater than 1.
greater_than_one = x > 1
index = np.argwhere(greater_than_one)
print("index:", '\n', index)
4.二分类的ROC曲线(来源)
本实例中的数据来源于sklearn中的鸢尾花(iris)数据
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc ###计算roc和auc
from sklearn.model_selection import train_test_split
# Load the iris data set.
iris = datasets.load_iris()
X = iris.data
y = iris.target

# Drop class 2 so that only two classes remain (binary problem).
X, y = X[y != 2], y[y != 2]

# Add noisy features to make the problem harder.
random_state = np.random.RandomState(0)
n_samples, n_features = X.shape
X = np.c_[X, random_state.randn(n_samples, 200 * n_features)]

# Shuffle and split into training and test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3,random_state=0)

# Fit a linear SVM. The estimator is bound to `classifier` rather than `svm`
# so that the imported `svm` module is not overwritten by the instance.
classifier = svm.SVC(kernel='linear', probability=True,random_state=random_state)

# decision_function() yields the scores that roc_curve() consumes.
y_score = classifier.fit(X_train, y_train).decision_function(X_test)

# Compute the ROC curve (false/true positive rates) and the area under it.
fpr,tpr,threshold = roc_curve(y_test, y_score)
roc_auc = auc(fpr,tpr)

# Create a single figure; an extra bare plt.figure() would only open an empty window.
lw = 2
plt.figure(figsize=(10,10))
# False positive rate on the x axis, true positive rate on the y axis.
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()
5.查看hdf5文件的键值变量
import h5py #导入工具包
import numpy as np
# Read an HDF5 file. The with-block closes the file even when one of the
# lookups below raises (e.g. the key does not exist).
with h5py.File('D:/software/facedata/3dmad/3dmad/session01/Data/01_01_01.hdf5','r') as f:
    keys = f.keys()            # all top-level keys of the file
    print(keys)
    a = f['Color_Data'][:]     # read everything stored under the key 'Color_Data'
    values = f.values()
    print(values)
    print(f.items())
    print(a)
6.hdf5图像文件转换成avi
import h5py
import imageio
import numpy as np
from pathlib import Path
from argparse import ArgumentParser
def main():
    """Convert a video stored in an HDF5 variable into an AVI file.

    Command-line arguments:
        --infn   path of the HDF5 file to read (required)
        --key    name of the HDF5 variable that holds the frames (required)
        -o, --outfn  output file name; when omitted, "<input stem>_<key>.avi"
                     is created next to the input file
        -fps     frames per second of the written video (default 1)
    """
    p = ArgumentParser()
    p.add_argument("--infn", type=str, required=True,help="HDF5 file to convert")
    p.add_argument("--key", type=str, required=True,help="HDF5 variable containing video to convert to AVI")
    p.add_argument("-o", "--outfn", help="output filename (ending in .avi)")
    p.add_argument("-fps", help="frames/second", type=int, default=1)
    p = p.parse_args()

    infn = Path(p.infn).expanduser()
    if p.outfn:
        outfn = Path(p.outfn).expanduser()
    else:
        # Must be an f-string: the previous f["..."] subscripted a name `f`
        # that does not exist yet at this point and raised NameError.
        outfn = infn.parent / (infn.stem + f"_{p.key}.avi")

    # Read the whole variable into memory, then close the file.
    with h5py.File(infn, "r") as f:
        dat = f[p.key][:]

    print("writing", dat.shape, outfn)
    # NOTE(review): the frames are passed to imageio exactly as stored; if the
    # variable is laid out channel-first it presumably has to be transposed to
    # (frames, height, width, channels) first -- TODO confirm against the file.
    imageio.mimwrite(outfn, dat.astype(np.uint8), codec="ffv1", fps=p.fps)


if __name__ == "__main__":
    main()
执行命令,最后图像转换失败了,说是在mimwrite中需要二维图像输入,
PS D:\software\facedata\3dmad\3dmad\session01\Data> python .\1.py --infn 01_01_01.hdf5 --key Color_Data --outfn 1.avi -fps 1
测试结果失败,怀疑所用hdf5文件格式的问题
7.image.shape np.asarray
import numpy as np
import cv2
image_path = "C:/facedata/AddData/1.jpg"
i = cv2.imread(image_path)
if i is None:
    # cv2.imread reports an unreadable or missing file by returning None
    # instead of raising; fail here with a clear message rather than with
    # an AttributeError on `.shape` below.
    raise FileNotFoundError(image_path)

print(i.shape[0:3])  # e.g. (480, 640, 3): height, width, channels; the slice end is exclusive, so 0:3 is needed to include the channel count
print(i.shape[0:2])  # e.g. (480, 640)
print(i.shape[0])    # e.g. 480
print(i.shape[1])    # e.g. 640
print(i.shape[2])    # e.g. 3
print(np.asarray(i.shape)[0:2])  # the (height, width) tuple as an array, e.g. [480 640]
8.记录程序运行时间(用于记录训练网络时间)
import time # 导入time模块
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() follows the wall clock and can jump if it is adjusted.
start_time = time.perf_counter()  # timestamp before the measured code
for i in range(10000000):
    a = 1 + 1
end_time = time.perf_counter()    # timestamp after the measured code
run_time = end_time - start_time  # elapsed time in seconds
print('run_time: ', run_time)
得到的结果为秒单位。
9.图片的数据增强
from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img
# Random transformations applied to every generated image.
augmentation = dict(
    rotation_range=30,        # degree range for random rotations
    width_shift_range=0.2,    # horizontal shift, as a fraction of the image width
    height_shift_range=0.2,   # vertical shift, as a fraction of the image height
    shear_range=0.2,          # shear intensity
    zoom_range=0.2,           # a single float z draws the zoom factor from [1 - z, 1 + z]
    horizontal_flip=True,     # randomly flip images horizontally
    fill_mode='nearest',      # fill pixels exposed by a transform with the nearest value
)
datagen = ImageDataGenerator(**augmentation)

img = load_img('D:/software/code/face-anti-spoofing/FASNet-master(oesllelucena)/FASNet/real2.JPG')  # a PIL image
x = img_to_array(img)        # NumPy array; (height, width, channels) under the default channels_last format
x = x.reshape(1, *x.shape)   # add a leading batch axis of length 1

# Each batch drawn from .flow() is one randomly transformed image, written to
# save_to_dir with the given file-name prefix and image format.
# .flow() never stops on its own, so leave the loop once the 11th batch is drawn.
for i, batch in enumerate(
        datagen.flow(x, batch_size=1,
                     save_to_dir='D:/software/code/face-anti-spoofing/FASNet-master(oesllelucena)/FASNet/testjpg',
                     save_prefix='fake1', save_format='jpeg'),
        start=1):
    if i > 10:
        break
原图:
变换后:
10.随机生成向左向右像素图像
from numpy import zeros
from random import randint
from random import random
from matplotlib import pyplot
def next_frame(last_step, last_frame, column):
    """Return a copy of ``last_frame`` with one more pixel set, plus its row.

    The new row is drawn uniformly from the rows within one step of
    ``last_step``, clipped to the frame, and the pixel at (row, ``column``)
    of the copy is set to 1. ``last_frame`` itself is not modified.
    """
    row_count = last_frame.shape[0]
    # Candidate rows: one above to one below the previous row, kept inside the frame.
    lowest_row = max(0, last_step - 1)
    highest_row = min(row_count - 1, last_step + 1)
    step = randint(lowest_row, highest_row)

    frame = last_frame.copy()
    frame[step, column] = 1
    return frame, step
def build_frames(size):
    """Build ``size`` frames of a dot crossing a ``size`` x ``size`` image.

    Returns ``(frames, right)``: ``frames`` is the list of cumulative frames
    and ``right`` is 1 when the dot travels left-to-right, 0 otherwise.
    """
    frame = zeros((size, size))
    step = randint(0, size - 1)
    # Pick the direction of travel.
    right = 1 if random() < 0.5 else 0
    # Columns in visiting order: ascending when heading right, descending otherwise.
    columns = range(size) if right else range(size - 1, -1, -1)

    # First frame: the dot sits in the starting column.
    frame[step, columns[0]] = 1
    frames = [frame]
    # Remaining frames: advance one column at a time.
    for col in columns[1:]:
        frame, step = next_frame(step, frame, col)
        frames.append(frame)
    return frames, right
# Generate a sequence of frames.
size = 5
frames, right = build_frames(size)

# Plot all frames side by side.
pyplot.figure()
for position in range(1, size + 1):
    # One grayscale subplot per frame.
    pyplot.subplot(1, size, position)
    pyplot.imshow(frames[position - 1], cmap= 'Greys')
    # Hide both axes so only the pixels are shown.
    ax = pyplot.gca()
    for axis in (ax.get_xaxis(), ax.get_yaxis()):
        axis.set_visible(False)
# Show the plot.
pyplot.show()
11.二分查找法
适合在有序的数组中使用。
def bSearch(array, target):
    """Binary-search a sorted sequence.

    Args:
        array: sequence sorted in ascending order.
        target: value to look for.

    Returns:
        An index ``i`` with ``array[i] == target``, or ``None`` when the
        target is absent (including when ``array`` is empty).
    """
    left = 0
    right = len(array) - 1
    while left <= right:
        # Integer floor division keeps the midpoint an exact int; the
        # float-based int((left + right) / 2) goes through a float and can
        # lose precision for very large indices.
        mid = (left + right) // 2
        if array[mid] == target:
            return mid
        elif array[mid] < target:
            # Target can only be in the upper half.
            left = mid + 1
        else:
            # Target can only be in the lower half.
            right = mid - 1
    return None
if __name__ == '__main__':
    # Look up the value 1 in a sorted list and print the index that is returned.
    # (An unsorted list such as [2,1,6,5,3,7,8,9,4] is not a valid input.)
    print(bSearch([1,2,3,4,5,6,7,8,9,10],1))
12.cap.get(propId)
import cv2
vidcap = cv2.VideoCapture("D:\\software\\1.avi")
try:
    # Print the value of every property id from 0 to 18.
    for i in range(0,19):
        a = vidcap.get(i)
        print(i,a)
finally:
    # Release the capture handle once the properties have been read.
    vidcap.release()
cap.get(propId) 来获得视频的一些参数信息。这里 propId 可以是 0 到 18 之间的任何整数。get括号内主要参数注释:
0:视频文件的当前位置,以毫秒为单位(结果为0.0?)
1:基于索引的帧被解码/捕获下一个。获取当前帧的索引位置?(结果为0.0?)
2:视频文件相对位置:0-开头,1-结尾
3:视频中帧的宽度 640
4:视频中帧的高度 480
5:帧的速率 25帧/秒
6:编解码器的4字码(二进制?)
7:视频中帧的数量 200帧
13.np.expand_dims()
来源
import numpy as np
def _dump(name, arr, index_paths):
    """Print ``arr``'s shape, ``arr`` itself, then ``arr`` at each index path.

    Every element line is labelled the way chained indexing would be
    written, e.g. the path (0, 1) is printed as ``name[0][1]:``.
    """
    print(name + ".shape:", arr.shape)
    print(name + ":", arr)
    for path in index_paths:
        label = name + "".join("[%d]" % k for k in path)
        print(label + ":", arr[path])


_SEPARATOR = "--------------------------"

# Original array.
a = np.array([[[1,2,3],[4,5,6]]])
_dump("a", a, [(0,), (0, 0), (0, 1),
               (0, 0, 0), (0, 0, 1), (0, 0, 2),
               (0, 1, 0), (0, 1, 1), (0, 1, 2)])
print(_SEPARATOR)

# New axis inserted at position 0.
b = np.expand_dims(a, axis=0)
_dump("b", b, [(0,), (0, 0), (0, 0, 0), (0, 0, 1),
               (0, 0, 0, 0), (0, 0, 0, 1), (0, 0, 0, 2), (0, 0, 1, 0)])
print(_SEPARATOR)

# New axis inserted at position 1.
c = np.expand_dims(a, axis=1)
_dump("c", c, [(0,), (0, 0), (0, 0, 0), (0, 0, 1),
               (0, 0, 0, 0), (0, 0, 0, 1), (0, 0, 0, 2), (0, 0, 1, 0)])
print(_SEPARATOR)

# New axis inserted at position 2.
d = np.expand_dims(a, axis=2)
_dump("d", d, [(0,), (0, 0), (0, 0, 0), (0, 1, 0),
               (0, 0, 0, 0), (0, 0, 0, 1), (0, 0, 0, 2),
               (0, 1, 0, 0), (0, 1, 0, 1)])
print(_SEPARATOR)

# New axis inserted at position 3.
e = np.expand_dims(a, axis=3)
_dump("e", e, [(0,), (0, 0), (0, 0, 0), (0, 0, 1),
               (0, 0, 0, 0), (0, 0, 1, 0), (0, 0, 2, 0),
               (0, 1, 0, 0), (0, 1, 1, 0)])
14.np_utils.to_categorical
将整型的类别标签转为onehot编码。y为int数组,num_classes为标签类别总数,大于max(y)(标签从0开始的)。
简单说就是把类别标签转换为onehot编码(categorical就是类别标签的意思,表示现实世界中你分类的各类别), 而onehot编码是一种方便计算机处理的二元编码。
# Import the public helper directly rather than going through the legacy
# `np_utils` submodule.
from keras.utils import to_categorical

# Two classes: labels 0 and 1.
N_CLASSES = 2
label = [0,0,0,1,1]  # a third distinct label value here would raise an error
train_label = to_categorical(label, N_CLASSES)
print(train_label)

# Three classes: labels 0, 1 and 2.
n_CLASSES = 3
Label = [0,0,0,1,1,1,2,2,2]
Train_label = to_categorical(Label, n_CLASSES)
print(Train_label)
14.将.npy文件转为.txt文件
import numpy as np
test = np.load('F:\\code\\PAD-LSTM\\test\\out_hsv.npy', encoding="latin1")  # load the array
# Append the printed form of the array to the text file; the with-block
# flushes and closes the file, which the bare open() never did.
# NOTE(review): print() abbreviates large arrays with "..." under NumPy's
# default print options -- consider np.savetxt if every value is needed.
with open('F:\\code\\PAD-LSTM\\test\\out_hsv.txt', 'a') as doc:
    print(test, file=doc)
15.对txt文本进行词频统计
(1)对英文
def process_file(file):
    """Read a UTF-8 text file and return its full contents.

    Args:
        file: path of the file to read.

    Returns:
        The file contents as a string, or ``None`` when the file cannot be
        opened or is not valid UTF-8 (a message is printed in both cases).
    """
    try:
        # The with-block closes the handle on every path, including a
        # decoding failure in read().
        with open(file, 'r', encoding='utf-8') as f:
            return f.read()
    except OSError as s:
        # Missing file, permission problem, etc. (IOError is an alias of OSError.)
        print(s)
    except UnicodeDecodeError:
        print('读取文件时解码错误!')
    return None
def process_buffer(bvffer):
    """Count how often each word occurs in a text buffer.

    The text is lower-cased, the punctuation characters “ ‘ ! ; , . ? ” are
    treated as spaces, and the result is split on whitespace.

    Args:
        bvffer: the text to analyse (may be empty or ``None``).

    Returns:
        A dict mapping each word to its number of occurrences, or ``None``
        when ``bvffer`` is empty or ``None``.
    """
    if not bvffer:
        return None

    # Lower-case once and blank out all punctuation in a single pass, rather
    # than re-lowering and re-scanning the text for every punctuation mark.
    punctuation_to_space = str.maketrans({ch: " " for ch in '“‘!;,.?”'})
    words = bvffer.lower().translate(punctuation_to_space).split()

    word_freq = {}
    for word in words:
        word_freq[word] = word_freq.get(word, 0) + 1
    return word_freq
def output_result(word_freq, top_n=20):
    """Print the most frequent words, one "word count" pair per line.

    Args:
        word_freq: dict mapping word -> count; nothing is printed when it is
            empty or ``None``.
        top_n: how many of the most frequent words to print (default 20,
            the number the function has always printed).
    """
    if not word_freq:
        return
    # Highest count first; words with equal counts keep their dict order.
    ranked = sorted(word_freq.items(), key=lambda v: v[1], reverse=True)
    for word, count in ranked[:top_n]:
        print(word, count)
if __name__ == "__main__":
    # Pipeline: read the text file, count its words, print the most frequent ones.
    output_result(process_buffer(process_file("Steve_Jobs.txt")))
(2)对中文
#这种方法单纯出现名词排序,不管人物还是物品
# import jieba
# txt = open("threekingdoms.txt","r",encoding="utf-8").read()
# words = jieba.lcut(txt)
# counts = {}
# for word in words:
# if len(word) == 1:
# continue
# else:
# counts[word] = counts.get(word,0) + 1
# items = list(counts.items())
# items.sort(key=lambda x:x[1],reverse=True)
# for i in range(15):
# word,count = items[i]
# print("{0:<10}{1:>5}".format(word,count))
#这种方法是写好要找的词去寻找文本中词的个数
import jieba
# Read the whole text; the with-block closes the file handle as soon as the
# contents are loaded instead of leaving it open.
with open("threekingdoms.txt", encoding="utf-8") as novel_file:
    txt = novel_file.read()
def jiebafenci(txt, wordslist):
    """Print how often each wanted word occurs in a segmented text.

    The text is segmented with jieba after loading 'findwords.txt' as a user
    dictionary. For every entry of ``wordslist`` that occurs in the segmented
    text, "word count" is printed.

    Args:
        txt: the text to segment.
        wordslist: the words to look up.

    Returns:
        The list of words from ``wordslist`` that do not occur in the text
        (previously this list was built and then discarded).
    """
    jieba.load_userdict('findwords.txt')

    counts = {}
    for word in jieba.lcut(txt):
        counts[word] = counts.get(word, 0) + 1

    lst = []  # wanted words that never occur in the text
    for wanted in wordslist:
        # Explicit membership test instead of a bare `except:`, which would
        # also have hidden unrelated errors (even KeyboardInterrupt).
        if wanted in counts:
            print(wanted, counts[wanted])
        else:
            lst.append(wanted)
    return lst
if __name__ == '__main__':
    # Read both inputs inside with-blocks so the file handles are closed.
    with open("threekingdoms.txt", encoding="utf-8") as novel_file:
        txt = novel_file.read()
    # findwords.txt lists the words to look up, one per line.
    with open("findwords.txt", encoding="utf-8") as words_file:
        need_words = words_file.read()
    find = need_words.split()
    jiebafenci(txt, find)
第一种的结果
第二种的结果