# UCI iris dataset: 150 CSV rows of four numeric measurements followed by
# the species label.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
# dtype='object' keeps every field unconverted (presumably bytes) — TODO confirm.
iris = np.genfromtxt(url, delimiter=',', dtype='object')
# Column names, in file order.
names = ('sepallength','sepalwidth', 'petallength', 'petalwidth', 'species')
# 省略文本字段,转化为二维数组 (drop the text field and convert to a 2-D array)
# 1: read every column; dtype=None lets genfromtxt infer a per-column
#    (structured) dtype, so each row comes back as a record of mixed fields.
#    (Original used smart quotes ’,’ for the delimiter — a SyntaxError.)
iris_1d = np.genfromtxt(url, delimiter=',', dtype=None)
# Keep only the first four fields of each record as a plain 2-D array.
iris_2d = np.array([row.tolist()[:4] for row in iris_1d])
iris_2d[:4]
# The element type after this round-trip may not be float (the original
# author observed strings), so cast explicitly.
iris_float = iris_2d.astype(float)
# 2: request only the four numeric columns, so the text column never
#    enters the array in the first place.
iris_2d = np.genfromtxt(url, delimiter=',', dtype='float', usecols=(0, 1, 2, 3))
iris_2d[:4]
# 计算第一列的平均值、中位数和标准差 (mean, median and standard deviation of column 0)
# Load only the first column (sepallength) and report its basic statistics.
sepallength = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0])
mu = sepallength.mean()
med = np.median(sepallength)
sd = sepallength.std()
print(mu, med, sd)
# softmax函数,又称归一化指数函数,实际上是有限项离散概率分布的梯度对数归一化。因此,softmax函数在多项逻辑回归、多项线性判别分析、朴素贝叶斯分类器和人工神经网络等多种基于概率的多分类方法中都有广泛应用。
# Column 0 (sepallength) as floats; iris was read with dtype='object'.
sepallength = iris[:, 0].astype(float)
def softmax(x):
    """Numerically stable softmax: shift by the max before exponentiating,
    then normalize so the outputs sum to 1."""
    shifted = np.exp(x - np.max(x))
    return shifted / shifted.sum(axis=0)
# Softmax over all 150 sepal lengths; the printed values sum to 1.
print(softmax(sepallength))
# 查找缺失值的数量和位置 (count and locate the missing values)
iris_2d = np.genfromtxt(url, delimiter=',', dtype='float')
# Scatter 20 NaNs at random positions.
iris_2d[np.random.randint(150, size=20), np.random.randint(4, size=20)] = np.nan
# Count and locate the NaNs in the first column only.
nan_in_col0 = np.isnan(iris_2d[:, 0])
print("Number of missing values:\n", nan_in_col0.sum())
print("Position of missing values:\n", np.where(nan_in_col0))
# 删除缺失值的行 (drop rows containing missing values)
iris_2d = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0, 1, 2, 3])
# Poke 20 random NaNs into the array.
iris_2d[np.random.randint(150, size=20), np.random.randint(4, size=20)] = np.nan
# 1: boolean mask of rows that contain no NaN (~ is element-wise NOT).
any_nan_in_row = np.array([~np.any(np.isnan(row)) for row in iris_2d])
iris_2d[any_nan_in_row][:5]
# 2: same thing via the per-row NaN count (the original line had unbalanced
# parentheses and passed axis to isnan instead of sum).
iris_2d[np.sum(np.isnan(iris_2d), axis=1) == 0][:5]
# 筛选第三列>1.5且第一列<5.0的行 (rows with petallength > 1.5 and sepallength < 5.0)
iris_2d = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0, 1, 2, 3])
# Row filter: petallength > 1.5 AND sepallength < 5.0.
long_petal = iris_2d[:, 2] > 1.5
short_sepal = iris_2d[:, 0] < 5.0
condition = long_petal & short_sepal
iris_2d[condition]
# 找出numpy数组的两列(第1列和第3列)之间的相关性 (correlation between columns 1 and 3)
# 1: np.corrcoef returns the 2x2 correlation matrix; entry [0, 1] is the
#    coefficient between column 1 and column 3. Note that Pearson's r lies
#    in [-1, 1], not [0, 1] as the original comment claimed.
np.corrcoef(iris_2d[:, 0], iris_2d[:, 2])[0, 1]
# 2: scipy's pearsonr also reports the p-value. The old import path
#    scipy.stats.stats is deprecated — import from scipy.stats instead.
#    iris was read with dtype='object', so cast the columns to float
#    before handing them to pearsonr.
from scipy.stats import pearsonr
corr, p_value = pearsonr(iris[:, 0].astype(float), iris[:, 2].astype(float))
print(corr)
# 查看是否存在缺失值 (check whether any missing values remain)
# True if the array still contains a NaN anywhere.
np.any(np.isnan(iris_2d))
# 将数字转换为文本分类(第三列<3→small,3-5→medium,>=5→large)
# Bin petallength: [0,3) -> 1, [3,5) -> 2, [5,10) -> 3, >=10 -> 4.
petal_length_bin = np.digitize(iris[:, 2].astype('float'), [0, 3, 5, 10])
# Map each bin index to its text category (bin 4 never occurs for iris,
# so it falls through to NaN).
label_map = {1: 'small', 2: 'medium', 3: 'large', 4: np.nan}
petal_length_cat = [label_map[idx] for idx in petal_length_bin]
# Peek at the first few categories.
petal_length_cat[:4]
# 概率抽样 (probabilistic sampling)
# 随机抽取样本,使得setosa的样本数约为versicolor和virginica的两倍
# Draw 150 species labels; setosa is twice as likely as each of the
# other two species.
a = np.array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'])
species_out = np.random.choice(a, size=150, p=[0.5, 0.25, 0.25])
# 在二维数组中,按分类列分组求数值列的平均值 (grouped mean of a numeric column by category)
# Mean of column 1 (sepalwidth) for each species group.
numeric_column = iris[:, 1].astype('float')
grouping_column = iris[:, 4]
[[species, numeric_column[grouping_column == species].mean()]
 for species in np.unique(grouping_column)]