numpy 是否为零_pandas 和 numpy 学习记录

weixin_39635567
于 2020-11-20 10:33:31 发布
阅读量151
点赞数
文章标签： numpy 是否为零 numpy合并循环数组的array numpy找到最大值坐标
学习python也有一段时间了，之前一直在忙，也一直没时间整理自己的学习记录，这几天自己挤出了一点时间，整理了一些自己的学习记录
也希望自己能继续学习下去，也算是督促自己吧！在这个学习的过程，自己发现好像真的喜欢上了python，人生苦短，我用python，下一步，要开始实际的清洗和实现数据的可视化！
这篇文章是我在网上找到的一个numpy 和pandas的练习。网址如下https://www.hackerearth.com/zh/practice/machine-learning/data-manipulation-visualisation-r-python/tutorial-data-manipulation-numpy-pandas-python/tutorial/
有兴趣的可以去看下，由于这个练习后面涉及到了机器学习，所以机器学习这一part我就跳过了！
import numpy as np
import pandas as pd
# 第一个看一下numpy的版本
np.__version__ 
'1.16.2'
# 创造一个list，从零到九的数
data = list(range(10))
data
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
# # 把整型数据转换成字符串形式 有很多种方法
[str(c) for c in data]
# 上面用的是列表推导式；下面同样用列表推导式查看每个元素的数据类型
[type(item) for item in data]
[int, int, int, int, int, int, int, int, int, int]
# 创造一个新的numpy数组
# 1创造一个全是零的数组
np.zeros(10,dtype = int)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
# 创造一个 3行5列、元素全为1的数组
np.ones((3,5),dtype = float )
array([[1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.]])
# 使用随机的数字创建3行5列的数组
np.random.randn(15).reshape(3,5)
array([[ 1.17224013, -1.31714361, -0.57372094, -1.45059089, -0.94262067],
       [-0.84514455,  1.95279195, -1.4052422 , -0.09462012,  1.23075506],
       [-1.11058125, -0.82823642, -0.66151707,  0.71700257, -1.12698597]])
# 创建一个15以内的3行5列的数组
np.arange(15).reshape(3,5)
array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14]])
# 创建一个全是某一个数字的数组
np.full((3,5),3)
array([[3, 3, 3, 3, 3],
       [3, 3, 3, 3, 3],
       [3, 3, 3, 3, 3]])
# 创建一个有固定步长的数组(如步长是二)
np.arange(0,20,2)
array([ 0,  2,  4,  6,  8, 10, 12, 14, 16, 18])
# 创建一个均分某个区间的数组
np.linspace(1,2,5,dtype = int)
array([1, 1, 1, 1, 2])
# 创建一个均分区间的数组 默认的数据类型
np.linspace(2,4,5)
array([2. , 2.5, 3. , 3.5, 4. ])
# 创建一个3*3的随机数组
np.random.normal(0,1,(3,3))
array([[ 1.00737997, -1.53233115, -0.74612576],
       [-1.1850143 , -0.1165628 , -0.67767718],
       [-1.01919855,  0.08808351, -0.67987634]])
# 创建一个单位矩阵（数组），对应线性代数中的E：主对角线上的元素全为1，其余元素全为0
np.eye(3)


array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])
# Numpy中很重要的一个元素 索引
#值得注意的是，index是从零开始的，据说这是为了避免在计算某个数的内存时要减一的麻烦
x1 = np.array([4,3,4,4,8,4])
x1
array([4, 3, 4, 4, 8, 4])
# 表示出索引为零的值
x1[0]
4
# 表示第5个元素的值
x1[4]
8
#表示出最后一个数据的值
x1[-1]
4
#表示出倒数第二个的值
x1[-2]
8
# 对于多元数组，我们需要行和列的数字去定位
# 这就是索引的作用
x2 = np.arange(9).reshape(3,3)
x2
array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])
#找出坐标是第二行第三列的值
x2[1,2] # 需要注意的是，python默认是从零开始的，这个可以参照后面的pandas中的索引
5
# 我想看一看坐标是第一行第二列的数据
x2[0,1]
1
# 同样也可以使用-1这种形式
x2[2,-1]

8
# 再看一个例子
x2[1,-1]
5
# 可以给数组中的某一个元素赋值
x2[1,-1] = 10
x2
array([[ 0,  1,  2],
       [ 3,  4, 10],
       [ 6,  7,  8]])
x2[0,-1]
2
x2[1,-1]
10
x2[1,-2]
4
x2[-1,-1]
8
x2[-1,1]
7
# 可以给某一个值进行赋值
x2[1,-1] = 19
x2
array([[ 0,  1,  2],
       [ 3,  4, 19],
       [ 6,  7,  8]])
# 下一步我们要看一下切片的做法
# 切片个人感觉可以理解为按照一定的刻度进行切分
# 首先我们来生成一组数据
x = np.arange(10)
x
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
# 从开始切到第五个数据位置
x[:5]
array([0, 1, 2, 3, 4])
#从索引5（也就是第六个数据）开始往后切
x[5:]
array([5, 6, 7, 8, 9])
#从索引5到索引7的数据（也就是第6个到第8个数据，切片不包含结束索引8）
x[5:8]
array([5, 6, 7])
# 每隔一个切分
x[::2]
array([0, 2, 4, 6, 8])
# 从索引1（也就是第二个数据）开始每隔一个数切分
x[1::2]
array([1, 3, 5, 7, 9])
# 把数组倒序处理
x[::-1]
array([9, 8, 7, 6, 5, 4, 3, 2, 1, 0])
# 下面的几行代码是数组的合并等操作
#使用concatenate完成合并的操作 
x = np.array([1,2,3])
y = np.array([3,2,1])
z = np.array([21,21,21])
np.concatenate([x,y,z]) # 需要注意的是这里类似于集合的并，但是合并以后不能改变原有元素数
array([ 1,  2,  3,  3,  2,  1, 21, 21, 21])
# 对于2维数组，也有着类似的做法
grid = np.array([[1,2,3],[2,3,4]])
np.concatenate([grid,grid])
array([[1, 2, 3],
       [2, 3, 4],
       [1, 2, 3],
       [2, 3, 4]])
# 使用axis语句可以控制数组是按照行合并还是列合并
np.concatenate([grid,grid],axis=1)#需要注意的是axis = 1代表的是列，这一点在后面pandas中用的很多
array([[1, 2, 3, 1, 2, 3],
       [2, 3, 4, 2, 3, 4]])
 np.concatenate([grid,grid],axis=0)
    #默认的是对行操作，这一点可以通过shift+tab键来看一下
np.concatenate
<function numpy.concatenate>
#上面的两个都是针对相同维数的数组进行操作的
# 如果要合并的两个数组是不同的维数怎么办呢？
# 可以使用 np.vstack 和np.hstack
x = np.array([3,4,5])
grid =np.array([[1,2,3],[17,18,19]])
np.vstack([x,grid])
array([[ 3,  4,  5],
       [ 1,  2,  3],
       [17, 18, 19]])
# 水平方向的相加
z = np.array([[9],[8]])
np.hstack([grid,z])
array([[ 1,  2,  3,  9],
       [17, 18, 19,  8]])
#当然这个合并是有先后顺序的
np.hstack([z,grid])
array([[ 9,  1,  2,  3],
       [ 8, 17, 18, 19]])
# 我们来看一看如果数组的形状不一致时是否可以合并
x = np.array([1,1,1,2])
np.vstack([x,grid])
# 可以看到这里报错了，因为要保证要合并的数组要是匹配的
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-188-68a013f30813> in <module>
      1 # 我们来看一看如果数据结构不一致是是否可以合并
      2 x = np.array([1,1,1,2])
----> 3 np.vstack([x,grid])
      4 # 可以看到这里报错了，因为要保证要合并的数组要是匹配的

C:ProgramDataAnaconda3envspy3libsite-packagesnumpycoreshape_base.py in vstack(tup)
    281     """
    282     _warn_for_nonsequence(tup)
--> 283     return _nx.concatenate([atleast_2d(_m) for _m in tup], 0)
    284 
    285 

ValueError: all the input array dimensions except for the concatenation axis must match exactly

# 下面让我们看一看split 功能
x = np.arange(10)
x
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
x1,x2,x3 = np.split(x,[3,6])
print(x1,x2,x3)
[0 1 2] [3 4 5] [6 7 8 9]
grid = np.arange(16).reshape((4,4))
grid
array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15]])
upper,lower = np.vsplit(grid,[2])
grid
array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15]])
upper,lower = np.vsplit(grid,[2])
print(upper)
print(lower)
[[0 1 2 3]
 [4 5 6 7]]
[[ 8  9 10 11]
 [12 13 14 15]]
### not too bad
# 一般pandas的标准名称是pd，所以我就使用pd作为别名
# pandas 主要有DataFrame and Series
# 创建一个DataFrame 这个类似于excel表格,可以使用字典
data = pd.DataFrame({'Country': ['Russia','Colombia','Chile','Equador','Nigeria'],
                    'Rank':[121,40,100,130,11]})
data 
Country	Rank
0	Russia	121
1	Colombia	40
2	Chile	100
3	Equador	130
4	Nigeria	11
数字的平均数
# 使用describe 看一下数据的描述性统计
data.describe()
# 因为这个数据第二列是纯数字，所以描述的内容很全
# count：计算存在的数据
# mean：数字的平均数
# std：标准差
#min：数字最小值
#max：最大值
Rank
count	5.000000
mean	80.400000
std	52.300096
min	11.000000
25%	40.000000
50%	100.000000
75%	121.000000
max	130.000000
# 想要知道更多的数据信息，可以使用info
data.info
<bound method DataFrame.info of     Country  Rank
0    Russia   121
1  Colombia    40
2     Chile   100
3   Equador   130
4   Nigeria    11>
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 2 columns):
Country    5 non-null object
Rank       5 non-null int64
dtypes: int64(1), object(1)
memory usage: 160.0+ bytes
# 创建一个新的dataframe
data = pd.DataFrame({'group':['a','a','a','b','b','b','c','c','c'],'ounces':[4,3,12,6,7.5,8,3,5,6]})
data
group	ounces
0	a	4.0
1	a	3.0
2	a	12.0
3	b	6.0
4	b	7.5
5	b	8.0
6	c	3.0
7	c	5.0
8	c	6.0
# 按照ounce列进行排序
data.sort_values(by=['ounces'],ascending = True,inplace= False)
# 可以仔细看一下这个数据是怎么排序的一个过程 这里是按照所有数据由小到大的排序
group	ounces
1	a	3.0
6	c	3.0
0	a	4.0
7	c	5.0
3	b	6.0
8	c	6.0
4	b	7.5
5	b	8.0
2	a	12.0
# 按照ounce列进行排序
data.sort_values(by=['ounces'],ascending = True,inplace= True)
data
# 可以仔细看一下这个数据是怎么排序的一个过程 这里是按照所有数据由小到大的排序
group	ounces
1	a	3.0
6	c	3.0
0	a	4.0
7	c	5.0
3	b	6.0
8	c	6.0
4	b	7.5
5	b	8.0
2	a	12.0
data
group	ounces
1	a	3.0
6	c	3.0
0	a	4.0
7	c	5.0
3	b	6.0
8	c	6.0
4	b	7.5
5	b	8.0
2	a	12.0
data1 = data.copy()
data1
group	ounces
1	a	3.0
6	c	3.0
0	a	4.0
7	c	5.0
3	b	6.0
8	c	6.0
4	b	7.5
5	b	8.0
2	a	12.0
# 我们按照多列进行排序，下面语句的意思是按照group和ounces排序，group按照升序排列，ounces按照降序排列
data.sort_values(by=['group','ounces'],ascending=[True,False],inplace=False)
group	ounces
2	a	12.0
0	a	4.0
1	a	3.0
5	b	8.0
4	b	7.5
3	b	6.0
8	c	6.0
7	c	5.0
6	c	3.0
# 去除重复值
data = pd.DataFrame({'k1':['one']*3 + ['two']*4, 'k2':[3,2,1,3,3,4,4]})
data
k1	k2
0	one	3
1	one	2
2	one	1
3	two	3
4	two	3
5	two	4
6	two	4
# 对数据排序
data.sort_values(by='k2')

k1	k2
2	one	1
1	one	2
0	one	3
3	two	3
4	two	3
5	two	4
6	two	4
# 可以看到这个数据集有重复的，那么我们来去重
#首先来看一下有哪些重复值
data.duplicated() 
#因为这个数据量很小，所以可以直接看到，但是当数据量很大的时候，使用duplicated就可以直接判断每一行是否与前面的行重复

0    False
1    False
2    False
3    False
4     True
5    False
6     True
dtype: bool
# 删除重复值
data.drop_duplicates()
k1	k2
0	one	3
1	one	2
2	one	1
3	two	3
5	two	4
# 看一下data
data
#可以看到这里data是没有改变的，这是因为drop_duplicates默认返回一个去重后的新DataFrame，不会直接改变原始数据
#也就是inplace = False

k1	k2
0	one	3
1	one	2
2	one	1
3	two	3
4	two	3
5	two	4
6	two	4
data.drop_duplicates()
k1	k2
0	one	3
1	one	2
2	one	1
3	two	3
5	two	4
# 下面删除指定列的重复值
data.drop_duplicates(subset='k1')

k1	k2
0	one	3
3	two	3
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami','corned beef', 'Bacon', 'pastrami', 'honey ham','nova lox'],
                 'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data
food	ounces
0	bacon	4.0
1	pulled pork	3.0
2	bacon	12.0
3	Pastrami	6.0
4	corned beef	7.5
5	Bacon	8.0
6	pastrami	3.0
7	honey ham	5.0
8	nova lox	6.0
# 下面我要新建一列数据，添加到原有数据中
#首先使用一个dictionary（字典）
meat_to_animal = {'bacon':'pig','pulled pork':'pig','pastrami':'cow','corned beef':'cow','honey ham':'pig','nova lox':'salmon'}
#下面要创建一个函数
def meat_2_animal(series):
    """Return the animal that the meat in ``series['food']`` comes from.

    Intended to be applied row by row, e.g.
    ``data.apply(meat_2_animal, axis='columns')``.

    Parameters
    ----------
    series : mapping-like
        Any object supporting ``series['food']`` (a DataFrame row or a
        plain dict). The food name is matched exactly and
        case-sensitively, so lower-case the column first if needed.

    Returns
    -------
    str
        ``'pig'`` or ``'cow'`` for the known meats, ``'salmon'`` for
        every other value.
    """
    known_sources = {
        'bacon': 'pig',
        'pulled pork': 'pig',
        'pastrami': 'cow',
        'corned beef': 'cow',
        'honey ham': 'pig',
    }
    # Every unlisted food (not only 'nova lox') falls back to 'salmon',
    # matching the final ``else`` branch of the original if/elif chain.
    return known_sources.get(series['food'], 'salmon')


# The function was first published under a misspelled name; keep it as an
# alias so existing references to ``meat_2_anmial`` continue to work.
meat_2_anmial = meat_2_animal
# 使用map 遍历上述字典
data['animal'] = data['food'].map(str.lower).map(meat_to_animal)
data
food	ounces	animal
0	bacon	4.0	pig
1	pulled pork	3.0	pig
2	bacon	12.0	pig
3	Pastrami	6.0	cow
4	corned beef	7.5	cow
5	Bacon	8.0	pig
6	pastrami	3.0	cow
7	honey ham	5.0	pig
8	nova lox	6.0	salmon
# 第二种方式是使用lambda函数，即匿名函数
lower = lambda x: x.lower()
data['food'] = data['food'].apply(lower)
data['animal2'] = data.apply(meat_2_animal, axis='columns')
data
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-213-69a41af39b0d> in <module>
      2 lower = lambda x: x.lower()
      3 data['food'] = data['food'].apply(lower)
----> 4 data['animal2'] = data.apply(meat_2_animal, axis='columns')
      5 data

NameError: name 'meat_2_animal' is not defined

# 还有一种生成一个新列的方式是
data.assign(new_variable = data['ounces']*10)
food	ounces	animal	new_variable
0	bacon	4.0	pig	40.0
1	pulled pork	3.0	pig	30.0
2	bacon	12.0	pig	120.0
3	pastrami	6.0	cow	60.0
4	corned beef	7.5	cow	75.0
5	bacon	8.0	pig	80.0
6	pastrami	3.0	cow	30.0
7	honey ham	5.0	pig	50.0
8	nova lox	6.0	salmon	60.0
# 在实际应用中，有时会因为各种原因会产生各种数据的缺失
#这里使用 pd.isna 来判断是否存在na
data = pd.Series([1,-999,2,-999,-1000,3])
data
0       1
1    -999
2       2
3    -999
4   -1000
5       3
dtype: int64
# 使用replace将-999换成NaN
data.replace(-999,np.nan,inplace = True)
data

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64
# 同样可以一次替换多个值
data = pd.Series([1,-999,2,-999,-10000,3])
data.replace([-999,-10000],np.nan,inplace = True)
data
0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64
# 现在我们来看一看如何判断是否存在na
data.isna() 
# 可以看到这里存在na的
0    False
1     True
2    False
3     True
4     True
5    False
dtype: bool
# 有时候数据的列名不是很好理解，这个时候就可以使用rename的形式，来重命名
data = pd.DataFrame(np.arange(12).reshape((3, 4)),index=['Ohio', 'Colorado', 'New York'],columns=['one', 'two', 'three', 'four'])
data
one	two	three	four
Ohio	0	1	2	3
Colorado	4	5	6	7
New York	8	9	10	11
# 重命名
data.rename(index = {'Ohio':'SanF'},columns = {'one':'one_p','two':'two_p'},inplace = True)
data
one_p	two_p	three	four
SanF	0	1	2	3
Colorado	4	5	6	7
New York	8	9	10	11
# 同样使用string 函数来改变行名和列名的大小写字母
# str.upper 是把所有字母都写成大写
# str.title 是把首字母大写

data.rename(index = str.upper,columns = str.title,inplace= True)
data
One_P	Two_P	Three	Four
SANF	0	1	2	3
COLORADO	4	5	6	7
NEW YORK	8	9	10	11
# 使用pd.cut 这里可以理解为分段函数
# demo 
ages = [20,22,25,27,21,23,37,31,61,45,41,32]
bins = [18,25,35,60,100]
cats = pd.cut(ages,bins)
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]
# right=False 表示区间为左闭右开，即包含左端点、不包含右端点
pd.cut(ages,bins,right = False)
[[18, 25), [18, 25), [25, 35), [25, 35), [18, 25), ..., [25, 35), [60, 100), [35, 60), [35, 60), [25, 35)]
Length: 12
Categories (4, interval[int64]): [[18, 25) < [25, 35) < [35, 60) < [60, 100)]
cats.labels
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-224-af28c901402c> in <module>
----> 1 cats.labels

AttributeError: 'Categorical' object has no attribute 'labels'

# 看一下各个阶段都有几个数据
pd.value_counts(cats)
(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64
# 我们也可以使用有实际意义的名字去代替
bin_names = ['Youth','YoungAdult','MiddleAge','Senior']
new_cats = pd.cut(ages,bins,labels = bin_names)
pd.value_counts(new_cats)
Youth         5
MiddleAge     3
YoungAdult    3
Senior        1
dtype: int64
# 也可以使用累加
pd.value_counts(new_cats).cumsum()
Youth          5
MiddleAge      8
YoungAdult    11
Senior        12
dtype: int64
# 下面看一下GROUP  group很类似于sql中的group 也就是所谓的分组
df = pd.DataFrame({'key1' : ['a', 'a', 'b', 'b', 'a'],
                   'key2' : ['one', 'two', 'one', 'two', 'one'],
                   'data1' : np.random.randn(5),
                   'data2' : np.random.randn(5)})
df
key1	key2	data1	data2
0	a	one	-0.687908	-0.456465
1	a	two	0.040095	-1.286556
2	b	one	-1.771255	-2.742873
3	b	two	0.620357	-0.959591
4	a	one	-1.074818	1.024162
# 第一个是按照key1 去分组计算data1的平均值
grouped = df['data1'].groupby(df['key1'])
grouped.mean()
# 其实这里只是求得了一类，还有很多函数，比如求最值，等等
key1
a   -0.574211
b   -0.575449
Name: data1, dtype: float64
# 下面看一下dataframe的切分
dates = pd.date_range('20130101',periods = 6)
df = pd.DataFrame(np.random.randn(6,4),index = dates,columns = list('ABCD'))
df
A	B	C	D
2013-01-01	-0.605596	-2.268895	-0.134708	0.250404
2013-01-02	0.715821	0.022899	-0.085071	0.284433
2013-01-03	-0.130514	-1.364424	0.506235	0.528120
2013-01-04	0.126970	0.259528	-0.696607	-0.112268
2013-01-05	-0.107711	-1.349693	-0.320895	1.399890
2013-01-06	-0.425530	1.208063	0.277692	0.412692
# 得到dataframe的前几行
df[:3]
A	B	C	D
2013-01-01	-0.605596	-2.268895	-0.134708	0.250404
2013-01-02	0.715821	0.022899	-0.085071	0.284433
2013-01-03	-0.130514	-1.364424	0.506235	0.528120
# 根据日期范围来切分
df['20130101':'20130104']
A	B	C	D
2013-01-01	-0.605596	-2.268895	-0.134708	0.250404
2013-01-02	0.715821	0.022899	-0.085071	0.284433
2013-01-03	-0.130514	-1.364424	0.506235	0.528120
2013-01-04	0.126970	0.259528	-0.696607	-0.112268
df[['20130101'],['20130104']]
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-238-379945f05320> in <module>
----> 1 df[['20130101'],['20130104']]

C:ProgramDataAnaconda3envspy3libsite-packagespandascoreframe.py in __getitem__(self, key)
   2925             if self.columns.nlevels > 1:
   2926                 return self._getitem_multilevel(key)
-> 2927             indexer = self.columns.get_loc(key)
   2928             if is_integer(indexer):
   2929                 indexer = [indexer]

C:ProgramDataAnaconda3envspy3libsite-packagespandascoreindexesbase.py in get_loc(self, key, method, tolerance)
   2655                                  'backfill or nearest lookups')
   2656             try:
-> 2657                 return self._engine.get_loc(key)
   2658             except KeyError:
   2659                 return self._engine.get_loc(self._maybe_cast_indexer(key))

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

TypeError: '(['20130101'], ['20130104'])' is an invalid key

ab两列
#按照列名进行切分
df.loc[:,['A','B']] #这句代码是得到ab两列
A	B
2013-01-01	-0.605596	-2.268895
2013-01-02	0.715821	0.022899
2013-01-03	-0.130514	-1.364424
2013-01-04	0.126970	0.259528
2013-01-05	-0.107711	-1.349693
2013-01-06	-0.425530	1.208063
#使用具体的行和列的坐标去定位分析
df.loc['20130102':'20130103',['A','B']]
A	B
2013-01-02	0.715821	0.022899
2013-01-03	-0.130514	-1.364424
# 按照整数位置（而不是索引名）来切分
df.iloc[3] # 返回的是第4行的数据

A    0.126970
B    0.259528
C   -0.696607
D   -0.112268
Name: 2013-01-04 00:00:00, dtype: float64
# 使用特定的行和列去定位
df.iloc[[1,5],[0,2]]
A	C
2013-01-02	0.715821	-0.085071
2013-01-06	-0.425530	0.277692
 
df[df.A > 1]
A	B	C	D
df2
# 使用copy()得到一份独立的拷贝（默认是深拷贝），之后修改df2不会影响df
df2 = df.copy()
df2['E'] = ['one','one','two','three','four','three']
df2
A	B	C	D	E
2013-01-01	-0.605596	-2.268895	-0.134708	0.250404	one
2013-01-02	0.715821	0.022899	-0.085071	0.284433	one
2013-01-03	-0.130514	-1.364424	0.506235	0.528120	two
2013-01-04	0.126970	0.259528	-0.696607	-0.112268	three
2013-01-05	-0.107711	-1.349693	-0.320895	1.399890	four
2013-01-06	-0.425530	1.208063	0.277692	0.412692	three
列
# 根据列数值选择行
df2[df2['E'].isin(['two','four'])]
A	B	C	D	E
2013-01-03	-0.130514	-1.364424	0.506235	0.52812	two
2013-01-05	-0.107711	-1.349693	-0.320895	1.39989	four
# 选择出不包含two和four的行
df2[~df2['E'].isin(['two','four'])]
A	B	C	D	E
2013-01-01	-0.605596	-2.268895	-0.134708	0.250404	one
2013-01-02	0.715821	0.022899	-0.085071	0.284433	one
2013-01-04	0.126970	0.259528	-0.696607	-0.112268	three
2013-01-06	-0.425530	1.208063	0.277692	0.412692	three
d
#下面使用query的形式来得到想要的数据
df.query('A > C')
A	B	C	D
2013-01-02	0.715821	0.022899	-0.085071	0.284433
2013-01-04	0.126970	0.259528	-0.696607	-0.112268
2013-01-05	-0.107711	-1.349693	-0.320895	1.399890
# 使用逻辑或
df.query('A < B | C > A')
A	B	C	D
2013-01-01	-0.605596	-2.268895	-0.134708	0.250404
2013-01-03	-0.130514	-1.364424	0.506235	0.528120
2013-01-04	0.126970	0.259528	-0.696607	-0.112268
2013-01-06	-0.425530	1.208063	0.277692	0.412692
pivot_table的几个例子
# excel这么流行的原因在于数据透视表
#下面我们将会看一看pivot_table的几个例子
data = pd.DataFrame({'group': ['a', 'a', 'a', 'b','b', 'b', 'c', 'c','c'],
                 'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data
group	ounces
0	a	4.0
1	a	3.0
2	a	12.0
3	b	6.0
4	b	7.5
5	b	8.0
6	c	3.0
7	c	5.0
8	c	6.0
# 计算每一组的平均值
data.pivot_table(values ='ounces',index='group',aggfunc=np.mean)

ounces
group	
a	6.333333
b	7.166667
c	4.666667
#统计每一组的数据数
data.pivot_table(values='ounces',index='group',aggfunc='count')
ounces
group	
a	3
b	3
c	3
r
#截至目前，已经了解了很多的基本pandas numpy的操作
#下面将看一看具体的数据处理的过程
#数据来源于https://s3-ap-southeast-1.amazonaws.com/he-public-data/datafiles19cdaf8.zip
train = pd.read_csv(r'E:pythondatafiles19cdaf8train.csv')
test = pd.read_csv(r'E:pythondatafiles19cdaf8test.csv')
#看一下数据的基础信息
train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
age               32561 non-null int64
workclass         30725 non-null object
fnlwgt            32561 non-null int64
education         32561 non-null object
education.num     32561 non-null int64
marital.status    32561 non-null object
occupation        30718 non-null object
relationship      32561 non-null object
race              32561 non-null object
sex               32561 non-null object
capital.gain      32561 non-null int64
capital.loss      32561 non-null int64
hours.per.week    32561 non-null int64
native.country    31978 non-null object
target            32561 non-null object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB
#可以看到train set 共有32561行15列
#我们看一下train set的前五行
train.head()
age	workclass	fnlwgt	education	education.num	marital.status	occupation	relationship	race	sex	capital.gain	capital.loss	hours.per.week	native.country	target
0	39	State-gov	77516	Bachelors	13	Never-married	Adm-clerical	Not-in-family	White	Male	2174	0	40	United-States	<=50K
1	50	Self-emp-not-inc	83311	Bachelors	13	Married-civ-spouse	Exec-managerial	Husband	White	Male	0	0	13	United-States	<=50K
2	38	Private	215646	HS-grad	9	Divorced	Handlers-cleaners	Not-in-family	White	Male	0	0	40	United-States	<=50K
3	53	Private	234721	11th	7	Married-civ-spouse	Handlers-cleaners	Husband	Black	Male	0	0	40	United-States	<=50K
4	28	Private	338409	Bachelors	13	Married-civ-spouse	Prof-specialty	Wife	Black	Female	0	0	40	Cuba	<=50K
# 看一下有多少缺失值
nans = train.shape[0] - train.dropna().shape[0]
print('%d rows missing values in the train data' %nans)
nand = test.shape[0] - test.dropna().shape[0]
print('%d rows have missing values in the test data' %nand)
2399 rows missing values in the train data
1221 rows have missing values in the test data
# 看一下哪些列有缺失数据
train.isnull().sum()
age                  0
workclass         1836
fnlwgt               0
education            0
education.num        0
marital.status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital.gain         0
capital.loss         0
hours.per.week       0
native.country     583
target               0
dtype: int64
cat = train.select_dtypes(include =['0'])
cat.apply(pd.Series.nunique)
# 看一下每一列的非重复变量
cat = train.select_dtypes(include=['O'])
cat.apply(pd.Series.nunique)
workclass          8
education         16
marital.status     7
occupation        14
relationship       6
race               5
sex                2
native.country    41
target             2
dtype: int64
# 因为每一列对于缺失值的处理都不相同
#Workclass
train.workclass.value_counts(sort=True)
train.workclass.fillna('Private',inplace=True)


#Occupation
train.occupation.value_counts(sort=True)
train.occupation.fillna('Prof-specialty',inplace=True)


#Native Country
train['native.country'].value_counts(sort=True)
train['native.country'].fillna('United-States',inplace=True)
# 看一下处理以后还有没有缺失的数据
train.isnull().sum()
age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
target            0
dtype: int64
train.target.value_counts()/train.shape[0]
train.target.value_counts()/train.shape[0]
 <=50K    0.75919
 >50K     0.24081
Name: target, dtype: float64
0
train.shape[0]
32561
1
train.shape[1]
15

Install selected packages