pandas，numpy学习记录（不断补充）

最新推荐文章于 2022-12-25 10:01:28 发布

赶在日落之前

最新推荐文章于 2022-12-25 10:01:28 发布

阅读量335

点赞数

分类专栏：数据库

本文链接：https://blog.csdn.net/lzz781699880/article/details/89708688

版权

数据库专栏收录该内容

45 篇文章 1 订阅

订阅专栏

pandas安装：

conda install pandas

导入pandas包：

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

通过传递值列表来创建一个Series，让pandas创建一个默认的整数索引：

# Build a Series from a plain list of values; no index is passed, so pandas
# supplies its default integer index.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
print(s)

对于numpy的日常练习

import numpy as np
import pandas as pd
# Replace odd numbers.
# In-place alternative (writes -1 into the array itself):
# arr = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
# arr[arr % 2 == 1] = -1
# print(arr)
arr = np.arange(10)
# Non-mutating variant: odd entries become 0, `arr` itself is left untouched.
out = np.where(arr % 2 != 0, 0, arr)
print(out)

# Reshape a 1-D array into 2 rows; -1 lets NumPy infer the column count.
arr = np.arange(10)
arr1 = np.reshape(arr, (2, -1))
print(arr1)

# Stack two arrays vertically (three equivalent spellings).
a = np.arange(10).reshape((2, -1))
b = np.full((2, 5), 1)
c = np.concatenate((a, b), axis=0)
d = np.vstack((a, b))
e = np.r_[a, b]
print(d)

# Stack two arrays horizontally (three equivalent spellings).
a = np.arange(10).reshape((2, -1))
b = np.full((2, 5), 1)
c = np.concatenate((a, b), axis=1)
d = np.hstack((a, b))
e = np.c_[a, b]
print(e)
# How to get the common items between two numpy arrays?
a = np.array([1, 2, 3, 2, 3, 4, 3, 4, 5, 6])
b = np.array([7, 2, 10, 2, 7, 4, 9, 4, 9, 8])
c = np.intersect1d(a, b)
print(c)

# How to remove from one array the items that exist in another?
a = np.array([1, 2, 3, 4, 5])
b = np.array([5, 6, 7, 8, 9])
# Keep only the values of `a` that do not occur in `b`.
c = np.setdiff1d(a, b)
print(c)

# How to get the positions where the elements of two arrays match?
a = np.array([1, 2, 3, 2, 3, 4, 3, 4, 5, 6])
b = np.array([7, 2, 10, 2, 7, 4, 9, 4, 9, 8])
c = np.nonzero(a == b)  # tuple holding one index array
print(c)

# How to extract all numbers within a given range from a numpy array?
a = np.array([2, 6, 1, 9, 10, 3, 27])
# Variant 1: index positions from an element-wise AND of both bounds.
index = np.nonzero((a >= 5) & (a <= 10))
print(a[index])
# Variant 2: the same mask spelled with np.logical_and.
index = np.nonzero(np.logical_and(a >= 5, a <= 10))
# print(a[index])
# Variant 3: index directly with the boolean mask.
b = a[(a >= 5) & (a <= 10)]
print(b)

# 如何创建一个python函数来处理scalars并在numpy数组上工作？
def maxx(x, y):
    """Return the larger of two scalars; ``x`` is returned when they compare equal."""
    return x if x >= y else y

a = np.array([5, 7, 9, 8, 6, 4, 5])
b = np.array([6, 3, 4, 8, 9, 7, 1])
# Wrap the scalar-only `maxx` so it is applied element by element; the output
# dtype is forced to float.
pair_max = np.vectorize(maxx, otypes=[float])
print(pair_max(a, b))

# How to swap two columns of a 2-D numpy array? (columns 0 and 2 here)
arr = np.arange(9).reshape((3, 3))
print(arr)
a = arr[:, [2, 1, 0]]
print(a)

# How to swap two rows of a 2-D numpy array? (rows 0 and 1 here)
arr = np.arange(9).reshape((3, 3))
print(arr)
a = arr[[1, 0, 2], :]
print(a)


# How to reverse the rows of a 2-D array?
arr = np.arange(9).reshape((3, 3))
print(arr)
a = arr[::-1, :]
print(a)

# How to reverse the columns of a 2-D array?
arr = np.arange(9).reshape((3, 3))
print(arr)
a = arr[..., ::-1]
print(a)

# How to create a 2-D array of random floats between 5 and 10?
arr = np.arange(9).reshape((3, 3))
# Variant 1: random integer part in [5, 10) plus a random fraction in [0, 1).
rand_arr = np.random.randint(low=5, high=10, size=(5, 3)) + np.random.random((5, 3))
# Variant 2: draw directly from a uniform distribution (replaces variant 1's result).
rand_arr = np.random.uniform(5, 10, size=(5, 3))
print(rand_arr)

# How to print only three decimal places of a numpy array?
rand_arr = np.random.random((5, 3))
print(rand_arr)
np.set_printoptions(precision=3)
a = rand_arr
print(a)

# How to suppress scientific (e) notation such as 1e10 when printing an array?
np.random.seed(100)
rand_arr = np.random.random([3, 3]) / 1e3
print(rand_arr)  # very small values; NumPy may show these in e-notation
np.set_printoptions(suppress=True, precision=6)
print(rand_arr)  # same values with scientific notation suppressed


# How to limit the number of items printed for a numpy array?
a = np.arange(15)
print(a)
np.set_printoptions(threshold=6)  # arrays with more than 6 items are summarized
b = np.arange(15)
print(b)

# How to print the full numpy array without truncation?
import sys

np.set_printoptions(threshold=6)
a = np.arange(15)
print(a)
# threshold=np.nan is rejected by current NumPy (ValueError); sys.maxsize is
# the documented value for "never summarize".
np.set_printoptions(threshold=sys.maxsize)
print(a)


# How to import a dataset containing numbers and text while keeping the text intact?
# url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
# iris = np.genfromtxt(url, delimiter=',', dtype='object')
# names = ('sepallength', 'sepalwidth', 'petallength', 'petalwidth', 'species')
# print(iris)

# How to extract a particular column from a 1-D array of tuples?
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
# dtype=None lets genfromtxt pick a type per column.
iris_1d = np.genfromtxt(url, dtype=None, delimiter=',')
print(iris_1d.shape)
# Field 4 of every record is collected into its own array.
species = np.array([record[4] for record in iris_1d])
a = species[:5]
print(a)


# How to convert a 1-D array of tuples into a 2-D numpy array?
# url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
# iris_1d = np.genfromtxt(url, delimiter=',', dtype=None)
# iris_2d = np.array([row.tolist()[:4] for row in iris_1d])
# iris_2d1 = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0,1,2,3])
# print(iris_1d)
# print(iris_2d)
# print(iris_2d1)


# How to compute the mean, median and standard deviation of a numpy array?
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, dtype='object', delimiter=',')
# Only column 0 is read, parsed as float.
sepallength = np.genfromtxt(url, usecols=[0], dtype='float', delimiter=',')
mu = np.mean(sepallength)
med = np.median(sepallength)
sd = np.std(sepallength)
print(mu, med, sd)


# How to normalize an array so that its values lie exactly between 0 and 1?
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
sepallength = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0])
Smax, Smin = sepallength.max(), sepallength.min()
# Min-max scaling: the minimum maps to 0 and the maximum to 1.
S = (sepallength - Smin) / (Smax - Smin)
# Same result via peak-to-peak (max - min). The function np.ptp is used
# because the ndarray.ptp() method no longer exists in NumPy 2.0.
# (ptp variant credited to David Ojeda in the original note.)
S1 = (sepallength - Smin) / np.ptp(sepallength)


# How to compute softmax scores?
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, dtype='object', delimiter=',')
# Column 0 is read as raw objects, so convert it to float before doing math.
sepallength = iris[:, 0].astype(float)
def softmax(x):
    """Return the softmax of the scores in ``x``.

    The overall maximum of ``x`` is subtracted before exponentiating, so no
    exponent is positive; the exponentials are then divided by their sum
    along axis 0.

    Adapted from
    https://stackoverflow.com/questions/34968722/how-to-implement-the-softmax-function-in-python
    """
    shifted = x - np.max(x)
    weights = np.exp(shifted)
    return weights / weights.sum(axis=0)
print(softmax(sepallength))


# How to find the percentile scores of a numpy array?
# Find the 5th and 95th percentile of the iris sepal length.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
sepallength = np.genfromtxt(url, usecols=[0], dtype='float', delimiter=',')
a = np.percentile(sepallength, [5, 95])
print(a)


# How to insert values at random positions in an array?
# Insert np.nan at 20 random positions of the iris_2d dataset.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_2d = np.genfromtxt(url, dtype='object', delimiter=',')
i, j = np.nonzero(iris_2d)
# Alternative: sample row/column positions from i and j.
# np.random.seed(100)
# iris_2d[np.random.choice((i), 20), np.random.choice((j), 20)] = np.nan
np.random.seed(100)
# Draw row indices first, then column indices (limited to the first 4 columns).
nan_rows = np.random.randint(150, size=20)
nan_cols = np.random.randint(4, size=20)
iris_2d[nan_rows, nan_cols] = np.nan
print(iris_2d[:10])

# How to find the position of missing values in a numpy array?
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_2d = np.genfromtxt(url, usecols=[0, 1, 2, 3], dtype='float', delimiter=',')
nan_rows = np.random.randint(150, size=20)
nan_cols = np.random.randint(4, size=20)
iris_2d[nan_rows, nan_cols] = np.nan
# Only column 0 is inspected here.
first_col_is_nan = np.isnan(iris_2d[:, 0])
print("Number of missing values: \n", first_col_is_nan.sum())
print("Position of missing values: \n", np.nonzero(first_col_is_nan))


# How to filter a numpy array on two or more conditions?
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_2d = np.genfromtxt(url, usecols=[0, 1, 2, 3], dtype='float', delimiter=',')
# Keep rows where column 2 > 1.5 and column 0 < 5.0.
condition = (iris_2d[:, 0] < 5.0) & (iris_2d[:, 2] > 1.5)
print(iris_2d[condition])


# How to drop rows that contain a missing value from a numpy array?
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_2d = np.genfromtxt(url, usecols=[0, 1, 2, 3], dtype='float', delimiter=',')
nan_rows = np.random.randint(150, size=20)
nan_cols = np.random.randint(4, size=20)
iris_2d[nan_rows, nan_cols] = np.nan
# print(iris_2d)
# NOTE: despite its name, this mask is True for rows that contain NO NaN.
any_nan_in_row = ~np.isnan(iris_2d).any(axis=1)
a = iris_2d[any_nan_in_row]
print(a)
# Same selection: keep rows whose NaN count is zero.
b = iris_2d[np.isnan(iris_2d).sum(axis=1) == 0]
print(b)

# How to find the correlation between two columns of a numpy array?

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0,1,2,3])
# Method 1: off-diagonal entry of the 2x2 correlation matrix of columns 0 and 2.
a = np.corrcoef(iris[:, 0], iris[:, 2])[0, 1]
print(a)
# Method 2: SciPy. Import from the public `scipy.stats` namespace; the old
# `scipy.stats.stats` module path is deprecated.
from scipy.stats import pearsonr
corr, p_value = pearsonr(iris[:, 0], iris[:, 2])
print(corr)

# How to check whether a given array has any null values?
# Find out whether iris_2d has any missing values.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_2d = np.genfromtxt(url, usecols=[0, 1, 2, 3], dtype='float', delimiter=',')
a = np.any(np.isnan(iris_2d))
print(a)


# How to replace all missing values with 0 in a numpy array?
# Task: replace every occurrence of nan with 0.

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_2d = np.genfromtxt(url, usecols=[0, 1, 2, 3], dtype='float', delimiter=',')
nan_rows = np.random.randint(150, size=20)
nan_cols = np.random.randint(4, size=20)
iris_2d[nan_rows, nan_cols] = np.nan
nan_mask = np.isnan(iris_2d)
iris_2d[nan_mask] = 0
print(iris_2d[:4])


# How to find the count of unique values in a numpy array?
# Task: find the unique values of the iris species and how often each occurs.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, dtype='object', delimiter=',')
names = ('sepallength', 'sepalwidth', 'petallength', 'petalwidth', 'species')
# Column 4 is the species label (see `names`).
species = np.array(iris[:, 4].tolist())
a = np.unique(species, return_counts=True)
print(a)

# How to convert a numeric column into a categorical (text) array?
# Task: bin the petal length (3rd column) into text labels:
#   [0, 3) -> 'small', [3, 5) -> 'medium', [5, 10) -> 'large', >= 10 -> nan
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, dtype='object', delimiter=',')
names = ('sepallength', 'sepalwidth', 'petallength', 'petalwidth', 'species')
petal_length = iris[:, 2].astype('float')
petal_length_bin = np.digitize(petal_length, [0, 3, 5, 10])
label_map = {1: 'small', 2: 'medium', 3: 'large', 4: np.nan}
petal_length_cat = [label_map[bin_id] for bin_id in petal_length_bin]
print(petal_length_cat[:4])


# Replace values in an array: everything below 10 becomes 10 and everything
# above 30 becomes 30.
np.set_printoptions(precision=2)
np.random.seed(100)
a = np.random.uniform(1, 50, 20)
print(a)
# Method 1: np.clip returns a NEW array, so the result has to be bound and
# printed; calling it as a bare statement has no visible effect.
clipped = np.clip(a, 10, 30)
print(clipped)
# Method 2: the same capping written as nested np.where calls.
print(np.where(a < 10, 10, np.where(a > 30, 30, a)))



# Get the positions of the 5 largest values in a given array `a`.
np.random.seed(100)
a = np.random.uniform(1, 50, 20)
print(a)
order = np.argsort(a)  # indices that sort `a` ascending
print(order)
print(a[order][-5:])                    # 5 largest values via a full sort
print(np.argpartition(-a, 5)[:5])       # positions of the 5 largest
print(np.sort(a)[-5:])                  # 5 largest values, ascending
print(np.partition(a, -5)[-5:])         # 5 largest values via partial sort
print(a[np.argpartition(-a, 5)][:5])    # values at the positions found above

np.random.seed(100)
arr = np.random.randint(1, 11, size=(6, 10))
print(arr)

def counts_of_all_values_rowwise(arr2d):
    """Count, for every row, how often each distinct value of ``arr2d`` occurs.

    Args:
        arr2d: 2-D array (numeric or string).

    Returns:
        A list with one inner list per row. Inner lists follow the sorted
        order of ``np.unique(arr2d)`` and hold plain ``int`` counts; a value
        that is absent from a row gets 0.
    """
    all_values = np.unique(arr2d).tolist()
    result = []
    for row in arr2d:
        values, counts = np.unique(row, return_counts=True)
        # A per-row lookup table avoids converting 1-element arrays to int
        # (deprecated in NumPy) and avoids one mask per output cell.
        row_counts = dict(zip(values.tolist(), counts.tolist()))
        result.append([row_counts.get(value, 0) for value in all_values])
    return result
# Column legend for the numeric example: the values 1..10.
print(np.arange(1, 11))
# The return value has to be printed explicitly; a bare call shows nothing
# when this is run as a script.
print(counts_of_all_values_rowwise(arr))

# The same counter works on a 2-D array of single characters.
arr = np.array([np.array(list('bill clinton')), np.array(list('narendramodi')), np.array(list('jjayalalitha'))])
print(arr)
print(np.unique(arr))
print(counts_of_all_values_rowwise(arr))



# Convert an array of arrays into a flat 1-D array.
arr1 = np.arange(3)
arr2 = np.arange(3, 7)
arr3 = np.arange(7, 10)
# The pieces have different lengths (3, 4, 3), so NumPy must be told to store
# them as objects; without dtype=object current NumPy raises ValueError.
array_of_arrays = np.array([arr1, arr2, arr3], dtype=object)
print('array_of_arrays: ', array_of_arrays)

# Method 1: flatten with a nested comprehension.
arr_2d = np.array([item for piece in array_of_arrays for item in piece])
print(arr_2d)

# Method 2: let NumPy join the pieces end to end.
arr_2d = np.concatenate(array_of_arrays)
print(arr_2d)

# Compute one-hot encodings (a dummy binary variable for each unique value
# in the array).
np.random.seed(101)
arr = np.random.randint(low=1, high=4, size=6)
print(arr)
# Method 1
def one_hot_encodings(arr):
    """Return the one-hot encoding of a 1-D array.

    Args:
        arr: 1-D array-like of values.

    Returns:
        Float array of shape ``(len(arr), n_unique)``. Row ``i`` holds a
        single 1 in the column that corresponds to ``arr[i]``; columns follow
        the sorted order of the unique values.
    """
    arr = np.asarray(arr)
    uniqs = np.unique(arr)
    out = np.zeros((arr.shape[0], uniqs.shape[0]))
    # Look up each value's column in the sorted unique values instead of
    # assuming the values are exactly 1..n (``k - 1`` indexed the wrong column
    # or raised IndexError for any other value set).
    columns = np.searchsorted(uniqs, arr)
    out[np.arange(arr.shape[0]), columns] = 1
    return out


print(one_hot_encodings(arr))
# Method 2: compare every element against the unique values via broadcasting,
# then reinterpret the boolean result as int8 to display it as 0/1.
matches = arr[:, np.newaxis] == np.unique(arr)
print(matches.view(np.int8))

# Convert a 1-D array into a 2-D array with 2 rows.
arr = np.arange(10)
arr1 = np.reshape(arr, (2, -1))
print(arr)
print(arr1)

pandas 排序

DataFrame.sort_values(by, axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last')  
axis:{0 or ‘index’, 1 or ‘columns’}, default 0；axis=0 时按 by 指定的列的值对各行排序（纵向排序），axis=1 时按 by 指定的行的值对各列排序（横向排序）  
by:str or list of str；如果axis=0，那么by="列名"；如果axis=1，那么by="行名"；  
ascending:布尔型，True则升序，可以是[True,False]，即第一字段升序，第二个降序  
inplace:布尔型，是否用排序后的数据框替换现有的数据框  
kind:排序算法，{‘quicksort’, ‘mergesort’, ‘heapsort’}, default ‘quicksort’；仅在按单个列或标签排序时生效，一般保持默认即可  
na_position : {‘first’, ‘last’}, default ‘last’，默认缺失值排在最后面

pandas 取绝对值

# 4x3 frame of standard-normal samples with labelled rows and columns.
frame = pd.DataFrame(
    np.random.randn(4, 3),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
    columns=['b', 'd', 'e'],
)
print(frame)
# NumPy ufuncs apply element-wise to a DataFrame.
print(np.abs(frame))

pandas 映射

DataFrame的apply方法：将函数应用到由各列或行所形成的一维数组上。

frame = pd.DataFrame(
    np.random.randn(4, 3),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
    columns=['b', 'd', 'e'],
)
print(frame)
# print(np.abs(frame))


def f(x):
    """Return the spread (max minus min) of ``x``."""
    return x.max() - x.min()


# With the default axis, `f` is called once per column.
a = frame.apply(f)
print(a)

swaplevel接受两个级别编号或名称，并返回一个互换了级别的新对象（但数据不会发生变化）：

# Frame with a two-level row index (key1, key2) and a two-level column index
# (state, color).
row_index = pd.MultiIndex.from_arrays(
    [['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
    names=['key1', 'key2'],
)
col_index = pd.MultiIndex.from_arrays(
    [['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']],
    names=['state', 'color'],
)
frame = pd.DataFrame(np.arange(12).reshape((4, 3)), index=row_index, columns=col_index)
# Exchange the two row-index levels; the data itself is not modified.
a = frame.swaplevel('key1', 'key2')
print(a)

sort_index则根据单个级别中的值对数据进行排序。

# Sort the rows by the values of index level 1 only.
b = frame.sort_index(axis=0, level=1)
print(b)

汇总统计根据行或列上的级别来进行求和：

# Sum the rows that share the same 'key2' index value. The `level=` argument
# of DataFrame.sum was removed in pandas 2.0; grouping by the index level is
# the supported way to get the same result.
c = frame.groupby(level='key2').sum()
print(c)

赶在日落之前

关注

0
点赞
踩
2

收藏

觉得还不错? 一键收藏
0
评论
pandas，numpy学习记录（不断补充）

pandas安装：conda install pandas；导入pandas包：import pandas as pd、import numpy as np、import matplotlib.pyplot as plt；通过传递值列表来创建一个Series，让pandas创建一个默认的整数索引：s = pd.Series([1,3,5,np.nan,6,8])，print(...
复制链接

扫一扫

专栏目录