小知识

最新推荐文章于 2023-02-21 22:13:19 发布

qq_26896611

最新推荐文章于 2023-02-21 22:13:19 发布

阅读量293

点赞数 1

本文链接：https://blog.csdn.net/qq_26896611/article/details/105114659

版权

1.sklearn.metrics.roc_curve（来源）

sklearn.metrics.roc_curve(y_true, y_score, pos_label=None, sample_weight=None, drop_intermediate=True)

Parameters :
y_true : 数组，shape = [样本数]
在范围{0,1}或{-1,1}中真正的二进制标签。如果标签不是二进制的，则应该显式地给出pos_label
y_score : 数组, shape = [样本数]
目标得分，可以是积极类的概率估计，信心值，或者是决定的非阈值度量(在某些分类器上由“decision_function”返回)。
pos_label：int or str, 标签被认为是积极的，其他的被认为是消极的。
sample_weight: 顾名思义，样本的权重，可选择的
drop_intermediate: boolean, optional (default=True)
是否放弃一些不出现在绘制的ROC曲线上的次优阈值。这有助于创建更轻的ROC曲线
Returns : fpr, tpr, thresholds

import numpy as np
from sklearn import metrics
# Minimal roc_curve demo: four samples whose labels are 1 or 2.
y = np.array([1, 1, 2, 2]) # ground-truth labels
scores = np.array([0.1, 0.4, 0.35, 0.8]) # predicted scores
fpr, tpr, thresholds = metrics.roc_curve(y, scores, pos_label=2) # pos_label=2: label 2 is the positive class, label 1 the negative class
print(fpr[1],tpr[1],thresholds[1]) 
print(fpr,'\n',tpr,'\n',thresholds)

阈值选择从大到小:
（1）取0.8为1，其余为0 预测值为：[0,0,0,1]
fpr=反例中预测为正例的个数/实际反例个数=0/2=0
tpr=正例中预测为正例的个数/实际正例个数=1/2=0.5
（2）取0.4为阈值，预测为：[0,1,0,1]
fpr=1/2=0.5 tpr=1/2=0.5
(3)取0.35为阈值，预测为:[0,1,1,1]
fpr=1/2=0.5 tpr=2/2=1

0.0 0.5 0.8 #取阈值为0.8时结果
[ 0.  0.5 0.5 1. ] #fpr
 [ 0.5 0.5 1.  1. ] #tpr
 [ 0.8  0.4  0.35 0.1 ] #thresholds

在这里插入图片描述
不管0，0,1.8 这列值

2.interp1d(x, y, kind='linear', ...) 一维插值（来源）
x和y参数是一系列已知的数据点，kind参数是插值类型，可以是字符串或整数。
插值是离散函数逼近的重要方法，利用它可通过函数在有限个点处的取值状况，估算出函数在其他点处的近似值。与拟合不同的是，要求曲线通过所有的已知数据。

import numpy as np
from scipy.interpolate import interp1d
import pylab as pl

# Known data points to interpolate between: 20 samples of cos(x) on [0, 10*pi].
x = np.linspace(0, 10*np.pi, 20)
y = np.cos(x)

# Build one interpolating function per method.
fl = interp1d(x, y, kind='linear')     # piecewise-linear interpolation
fq = interp1d(x, y, kind='quadratic')  # second-order spline interpolation

# Evaluate only between x.min() and x.max() so the query points never leave
# the sampled range.
xint = np.linspace(x.min(), x.max(), 1000)
yintl = fl(xint)
yintq = fq(xint)

# Plot the values computed above instead of evaluating both interpolants a
# second time.
pl.plot(xint, yintl, color="green", label="Linear")
pl.plot(xint, yintq, color="yellow", label="Quadratic")
pl.legend(loc="best")
pl.show()

在这里插入图片描述
3.np.argwhere( a )
返回数组中非零元素的索引（每一行是一个元素的下标）。参数a通常是一个条件表达式（如 x>1），此时返回满足该条件的元素的索引。

import numpy as np
# Build a 2x3 array holding 0..5 and list the positions of its elements > 1.
x=np.arange(6).reshape(2,3)
print("x:",x)
index=np.argwhere(x>1) # indices of the elements of x that are greater than 1
print("index：",'\n',index)

在这里插入图片描述
4.二分类的ROC曲线（来源）
本实例中的数据来源于sklearn中的鸢尾花（iris）数据

import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc  # ROC points and area under the curve
from sklearn.model_selection import train_test_split

# Binary ROC-curve demo on the iris data set.

# Import some data to play with.
iris = datasets.load_iris()
X = iris.data
y = iris.target

# Drop class 2 so that only two classes remain.
X, y = X[y != 2], y[y != 2]

# Add noisy features to make the problem harder.
random_state = np.random.RandomState(0)
n_samples, n_features = X.shape
X = np.c_[X, random_state.randn(n_samples, 200 * n_features)]

# Shuffle and split into training and test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=0)

# Learn to predict each class against the other.
# The estimator is bound to `classifier` rather than `svm`, so the imported
# `sklearn.svm` module is no longer shadowed by an instance.
classifier = svm.SVC(kernel='linear', probability=True, random_state=random_state)

# decision_function() yields the scores that roc_curve() consumes.
y_score = classifier.fit(X_train, y_train).decision_function(X_test)

# Compute the ROC curve (false/true positive rates) and the area under it.
fpr, tpr, threshold = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)

# Create a single figure; the earlier bare plt.figure() call only left an
# empty extra figure behind.
lw = 2
plt.figure(figsize=(10, 10))
# False positive rate on the x axis, true positive rate on the y axis.
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()

在这里插入图片描述
5.查看hdf5文件的键值变量

import h5py  # HDF5 reader
import numpy as np

# Inspect an HDF5 file: list its keys, values and items, and load one dataset.
# The context manager closes the file even if one of the reads below raises.
with h5py.File('D:/software/facedata/3dmad/3dmad/session01/Data/01_01_01.hdf5', 'r') as f:
    keys = f.keys()              # all top-level keys
    print(keys)
    a = f['Color_Data'][:]       # read the whole 'Color_Data' dataset into memory
    values = f.values()
    print(values)
    print(f.items())
    print(a)

在这里插入图片描述
6.hdf5图像文件转换成avi

import h5py
import imageio
import numpy as np
from pathlib import Path
from argparse import ArgumentParser


def main():
    """Convert one variable of an HDF5 file into an AVI video.

    Command-line arguments:
        --infn       HDF5 file to convert (required).
        --key        HDF5 variable containing the video to convert (required).
        -o/--outfn   Output filename (ending in .avi). When omitted, the file
                     is written next to the input as "<input stem>_<key>.avi".
        -fps         Frames per second of the output (default 1).
    """
    parser = ArgumentParser()
    parser.add_argument("--infn", type=str, required=True, help="HDF5 file to convert")
    parser.add_argument("--key", type=str, required=True,
                        help="HDF5 variable containing video to convert to AVI")
    parser.add_argument("-o", "--outfn", help="output filename (ending in .avi)")
    parser.add_argument("-fps", help="frames/second", type=int, default=1)
    args = parser.parse_args()

    infn = Path(args.infn).expanduser()

    if args.outfn:
        outfn = Path(args.outfn).expanduser()
    else:
        # Build the default name with an f-string. The previous code indexed
        # an undefined name `f` here and raised NameError whenever --outfn
        # was omitted.
        outfn = infn.parent / f"{infn.stem}_{args.key}.avi"

    # Read the whole dataset into memory, then close the file.
    with h5py.File(infn, "r") as f:
        dat = f[args.key][:]

    print("writing", dat.shape, outfn)

    imageio.mimwrite(outfn, dat.astype(np.uint8), codec="ffv1", fps=args.fps)


if __name__ == "__main__":
    main()

执行下面的命令，最后图像转换失败了，报错提示 mimwrite 需要二维图像输入：

PS D:\software\facedata\3dmad\3dmad\session01\Data> python .\1.py --infn 01_01_01.hdf5 --key Color_Data  --outfn 1.avi -fps 1

在这里插入图片描述
测试结果失败，怀疑所用hdf5文件格式的问题

7.image.shape np.asarray

import numpy as np
import cv2
# Load an image with OpenCV and look at the pieces of its shape tuple.
# NOTE(review): cv2.imread presumably returns None when the file cannot be
# read, in which case the .shape accesses below would fail - confirm the path.
i=cv2.imread("C:/facedata/AddData/1.jpg")
print(i.shape[0:3]) # original note: prints (480,640,3) = height, width, channels; the slice end is exclusive, so 0:3 is needed to include the channel count
print(i.shape[0:2]) # original note: gives (480,640)
print(i.shape[0]) # original note: 480
print(i.shape[1]) # original note: 640
print(i.shape[2]) # original note: 3
print(np.asarray(i.shape)[0:2]) # converts the shape tuple to an array and keeps the first two entries; original note: [480 640]

在这里插入图片描述
8.记录程序运行时间（用于记录训练网络时间）

import time                              # standard-library timing utilities

# Measure how long a block of code takes to run (e.g. a training loop).
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() is the wall clock and can jump if the system time
# is adjusted while the code is running.
start_time = time.perf_counter()         # reading taken before the work
for i in range(10000000):
    a = 1 + 1
end_time = time.perf_counter()           # reading taken after the work
run_time = end_time - start_time         # elapsed time in seconds
print('run_time: ', run_time)

在这里插入图片描述
得到的结果为秒单位。

9.图片的数据增强

from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img
 
# Configure the random transformations applied to each generated image.
datagen = ImageDataGenerator(
    rotation_range=30,  # random rotation, in the 0-30 degree range
    width_shift_range=0.2, # horizontal shift, as a fraction (0-0.2) of the image width
    height_shift_range=0.2,# vertical shift, as a fraction of the image height
    shear_range=0.2, # shear: one coordinate stays fixed while the other is shifted proportionally
    zoom_range=0.2,  # random zoom. NOTE(review): the original note says values between 0 and 1 zoom in and values above 1 zoom out - confirm against the Keras docs
    horizontal_flip=True,# randomly flip images horizontally
    fill_mode='nearest') # how pixels left empty by a transform are filled; the original note describes 'nearest' as the default
 
 
img = load_img('D:/software/code/face-anti-spoofing/FASNet-master(oesllelucena)/FASNet/real2.JPG')  # this is a PIL image
x = img_to_array(img)  # Numpy array; NOTE(review): original comment claims shape (3, 150, 150) - actual shape depends on the image and the Keras data format, confirm
x = x.reshape((1,) + x.shape)  # prepend a batch axis of length 1
# .flow() produces batches of randomly transformed images and saves them to save_to_dir
 
# The loop below breaks once i exceeds 10, i.e. after the 11th batch has been
# pulled from the generator.
i = 0
for batch in datagen.flow(x, batch_size=1,  # save_to_dir: output folder; save_prefix: file-name prefix; save_format: image format
                          save_to_dir='D:/software/code/face-anti-spoofing/FASNet-master(oesllelucena)/FASNet/testjpg', save_prefix='fake1', save_format='jpeg'):
    i += 1
    if i >10: 
        break

原图：
在这里插入图片描述
变换后：

10.随机生成向左向右像素图像

from numpy import zeros
from random import randint
from random import random
from matplotlib import pyplot

# generate the next frame in the sequence
def next_frame(last_step, last_frame, column):
    """Return a copy of *last_frame* with one more pixel set, plus its row.

    The new row is drawn uniformly from the rows within one step of
    *last_step*, clipped to the frame's first and last row. The pixel at
    (row, column) of the copy is set to 1; *last_frame* itself is untouched.
    """
    last_row = last_frame.shape[0] - 1
    # Candidate rows: one above to one below the previous row, kept in bounds.
    lowest = max(0, last_step - 1)
    highest = min(last_row, last_step + 1)
    row = randint(lowest, highest)
    # Work on a copy so the earlier frame keeps its own contents.
    updated = last_frame.copy()
    updated[row, column] = 1
    return updated, row
   
# generate a sequence of frames of a dot moving across an image
def build_frames(size):
    """Build *size* frames of a dot trail crossing a size-by-size image.

    Returns ``(frames, right)``: ``frames`` is the list of frames and
    ``right`` is 1 when the trail runs from the first column to the last,
    0 when it runs from the last column to the first.
    """
    canvas = zeros((size, size))
    row = randint(0, size - 1)

    # Pick the travel direction, then list the columns in visiting order.
    right = 1 if random() < 0.5 else 0
    columns = list(range(size))
    if not right:
        columns.reverse()

    # The first frame holds only the starting pixel.
    canvas[row, columns[0]] = 1
    frames = [canvas]

    # Every later frame extends the previous one by one pixel in the next column.
    for col in columns[1:]:
        canvas, row = next_frame(row, canvas, col)
        frames.append(canvas)
    return frames, right
   
# Generate one sequence of frames.
size = 5
frames, right = build_frames(size)

# Plot all frames side by side in a single row.
pyplot.figure()
for position, frame in enumerate(frames, start=1):
    # One grayscale subplot per frame.
    pyplot.subplot(1, size, position)
    pyplot.imshow(frame, cmap='Greys')
    # Hide both axis scales to make the picture clearer.
    axes = pyplot.gca()
    for axis in (axes.get_xaxis(), axes.get_yaxis()):
        axis.set_visible(False)

# Show the plot.
pyplot.show()

在这里插入图片描述
11.二分查找法
适合在有序的数组中使用。

def bSearch(array,target):
    """Binary-search *array* for *target*.

    *array* must be sorted in ascending order; on unsorted input (for example
    [2,1,6,5,3,7,8,9,4]) the result is not meaningful.

    Returns the index of an element equal to *target*, or None when no such
    element exists (including when *array* is empty).
    """
    left, right = 0, len(array) - 1
    while left <= right:
        # Integer floor division keeps the midpoint an exact int; the earlier
        # int((left + right) / 2) went through a float on the way.
        mid = (left + right) // 2
        if array[mid] == target:
            return mid
        if array[mid] < target:
            # Target can only be in the right half.
            left = mid + 1
        else:
            # Target can only be in the left half.
            right = mid - 1
    return None


if __name__ == '__main__':
    index = bSearch([1,2,3,4,5,6,7,8,9,10], 1)
    print(index)

在这里插入图片描述
12.cap.get(propId)

import cv2

# Print the capture properties with ids 0..18 of a video file.
vidcap = cv2.VideoCapture("D:\\software\\1.avi")
try:
    for i in range(0,19):
        a = vidcap.get(i)
        print(i,a)
finally:
    # Release the capture explicitly so the file handle is not left open.
    vidcap.release()

cap.get(propId) 来获得视频的一些参数信息。这里 propId 可以是 0 到 18 之间的任何整数。get括号内主要参数注释：
0：视频文件的当前位置，以毫秒为单位（结果为0.0？）
1：下一个将被解码/捕获的帧的索引（从0开始），即当前帧的位置（结果为0.0？）
2：视频文件相对位置：0-开头，1-结尾
3：视频中帧的宽度 640
4：视频中帧的高度 480
5：帧的速率 25帧/秒
6：编解码器的四字符代码（FOURCC）
7：视频中帧的数量 200帧
在这里插入图片描述

13.np.expand_dims()
来源

import numpy as np


def _show(name, arr, index_paths):
    """Print the shape of *arr*, the array itself, then each indexed element.

    *index_paths* is a list of index tuples; the tuple (0, 1) is printed as
    "<name>[0][1]:" followed by arr[0][1].
    """
    print(name + ".shape:", arr.shape)
    print(name + ":", arr)
    for path in index_paths:
        label = name + "".join("[%d]" % k for k in path)
        item = arr
        for k in path:
            item = item[k]
        print(label + ":", item)


_RULE = "--------------------------"

# Base array of shape (1, 2, 3).
a = np.array([[[1,2,3],[4,5,6]]])
_show("a", a, [
    (0,), (0, 0), (0, 1),
    (0, 0, 0), (0, 0, 1), (0, 0, 2),
    (0, 1, 0), (0, 1, 1), (0, 1, 2),
])
print(_RULE)

# New axis inserted at position 0.
b = np.expand_dims(a, axis=0)
_show("b", b, [
    (0,), (0, 0), (0, 0, 0), (0, 0, 1),
    (0, 0, 0, 0), (0, 0, 0, 1), (0, 0, 0, 2), (0, 0, 1, 0),
])
print(_RULE)

# New axis inserted at position 1.
c = np.expand_dims(a, axis=1)
_show("c", c, [
    (0,), (0, 0), (0, 0, 0), (0, 0, 1),
    (0, 0, 0, 0), (0, 0, 0, 1), (0, 0, 0, 2), (0, 0, 1, 0),
])
print(_RULE)

# New axis inserted at position 2.
d = np.expand_dims(a, axis=2)
_show("d", d, [
    (0,), (0, 0), (0, 0, 0), (0, 1, 0),
    (0, 0, 0, 0), (0, 0, 0, 1), (0, 0, 0, 2),
    (0, 1, 0, 0), (0, 1, 0, 1),
])
print(_RULE)

# New axis inserted at position 3 (the end).
e = np.expand_dims(a, axis=3)
_show("e", e, [
    (0,), (0, 0), (0, 0, 0), (0, 0, 1),
    (0, 0, 0, 0), (0, 0, 1, 0), (0, 0, 2, 0),
    (0, 1, 0, 0), (0, 1, 1, 0),
])

在这里插入图片描述

14.np_utils.to_categorical
将整型的类别标签转为onehot编码。y为int数组，num_classes为标签类别总数，大于max(y)（标签从0开始的）。
简单说就是把类别标签转换为onehot编码（categorical就是类别标签的意思，表示现实世界中你分类的各类别），而onehot编码是一种方便计算机处理的二元编码。

from keras.utils import np_utils
# One-hot encode integer class labels, first with two classes, then with three.
N_CLASSES = 2
label = [0,0,0,1,1] # original note: a third distinct value here would raise an error
train_label = np_utils.to_categorical(label, N_CLASSES)
print(train_label)

n_CLASSES = 3
Label = [0,0,0,1,1,1,2,2,2]
Train_label = np_utils.to_categorical(Label, n_CLASSES)
print(Train_label)

在这里插入图片描述

14.将.npy文件转为.txt文件

import numpy as np

# Dump the printed form of a .npy array into a text file.
test = np.load('F:\\code\\PAD-LSTM\\test\\out_hsv.npy', encoding="latin1")  # load the array
# Append mode: repeated runs add to the file instead of overwriting it.
# The context manager flushes and closes the file; previously the handle was
# never closed.
# NOTE(review): print() writes numpy's display form, which is presumably
# abbreviated with "..." for large arrays - confirm if the full data is needed.
with open('F:\\code\\PAD-LSTM\\test\\out_hsv.txt', 'a') as doc:
    print(test, file=doc)

15.对txt文本进行词频统计
（1）对英文

def process_file(file):
    """Read the UTF-8 text file *file* and return its whole content.

    Returns the content as a string, or None when the file cannot be opened
    or read, or is not valid UTF-8. A short message is printed on failure.
    """
    try:
        # The context manager closes the file on every path, including a
        # decode failure part-way through the read.
        with open(file, 'r', encoding='utf-8') as f:
            return f.read()
    except OSError as s:
        # Missing file, permission problem, path is a directory, ...
        print(s)
    except LookupError:
        print('指定了未知的编码!')
    except UnicodeDecodeError:
        print('读取文件时解码错误!')
    # Every failure path returns None explicitly; previously two handlers
    # fell through to `return bvffer` with the variable never assigned.
    return None
def process_buffer(bvffer):
    """Count how often each word occurs in the text *bvffer*.

    The text is lower-cased, the punctuation marks listed below are replaced
    by spaces, and the result is split on whitespace. Returns a dict mapping
    word -> count, or None when *bvffer* is empty or None.
    """
    if not bvffer:
        return None
    # Map every listed punctuation mark to a single space.
    blank_out = str.maketrans({mark: " " for mark in '“‘!;,.?”'})
    tokens = bvffer.lower().translate(blank_out).split()
    word_freq = {}
    for token in tokens:
        if token in word_freq:
            word_freq[token] += 1
        else:
            word_freq[token] = 1
    return word_freq
    
def output_result(word_freq, top_n=20):
    """Print the most frequent words, one "word count" pair per line.

    word_freq: dict mapping word -> count; nothing is printed when it is
        empty or None.
    top_n: maximum number of entries to print (default 20, the limit the
        code has always used; the old comment said "Top 10").

    Words are listed by descending count; words with equal counts keep the
    order in which they appear in *word_freq*.
    """
    if not word_freq:
        return
    ranked = sorted(word_freq.items(), key=lambda v: v[1], reverse=True)
    for word, count in ranked[:top_n]:
        print(word, count)


if __name__ == "__main__":
    file = "Steve_Jobs.txt"
    bvffer = process_file(file)
    word_freq = process_buffer(bvffer)
    output_result(word_freq)

在这里插入图片描述
（2）对中文

#这种方法单纯出现名词排序，不管人物还是物品
# import jieba
# txt = open("threekingdoms.txt","r",encoding="utf-8").read()
# words = jieba.lcut(txt)
# counts = {}
# for word in words:
#     if len(word) == 1:
#         continue
#     else:
#         counts[word] = counts.get(word,0) + 1
# items = list(counts.items())
# items.sort(key=lambda x:x[1],reverse=True)
# for i in range(15):
#     word,count = items[i]
#     print("{0:<10}{1:>5}".format(word,count))

#这种方法是写好要找的词去寻找文本中词的个数
import jieba
# Module-level copy of the text, read with a context manager so the file is
# closed again. The __main__ section below reads the same file once more.
with open("threekingdoms.txt", encoding="utf-8") as _source:
    txt = _source.read()


def jiebafenci(txt, wordslist):
    """Print how often each word of *wordslist* occurs in *txt*.

    *txt* is segmented with jieba after 'findwords.txt' has been loaded as a
    user dictionary. For every word of *wordslist* found among the segments,
    "word count" is printed.

    Returns the list of words from *wordslist* that were not found.
    Previously this list was built and then dropped, and the function
    returned None.
    """
    jieba.load_userdict('findwords.txt')
    counts = {}
    for word in jieba.lcut(txt):
        counts[word] = counts.get(word, 0) + 1
    lst = []
    for wanted in wordslist:
        # An explicit membership test replaces the bare `except:`, which hid
        # every error rather than only the missing-word case.
        if wanted in counts:
            print(wanted, counts[wanted])
        else:
            lst.append(wanted)
    return lst
if __name__ == '__main__':
    # Context managers close both files; they were previously left open.
    with open("threekingdoms.txt", encoding="utf-8") as source:
        txt = source.read()
    # findwords.txt holds the words to look up, one per line (character names).
    with open("findwords.txt", encoding="utf-8") as wanted:
        need_words = wanted.read()
    find = need_words.split()
    jiebafenci(txt, find)