Python基础学习
numpy & pandas
mofan python 视频入门
目前还是为了作业在学习
用于数据分析,科学计算
import numpy as np 惯用写法
array = np.array([[1,2,3],
[2,3,4]])
print(array)
print(array.ndim) 矩阵维度(1,2,···)
print(array.shape) 矩阵形状(行列数)
print(array.size) 矩阵元素个数
创建array
import numpy as np
a = np.array([1,2,4])
print(a) 结果 数之间无逗号
a = np.array([1,2,4],dtype=np.int64) int64 可以换成float(np.int 别名已在 NumPy 1.24 中移除,可用内置 int 或 np.int64)
print(a.dtype)
import numpy as np
a = np.array([ [2,3,4],
[4,5,6]])
print(a) 生成矩阵
b = np.zeros((3,4))
print(b) 生成3行4列的矩阵
c=np.ones((2,3),dtype=np.int16)
print(c) 2行3列的全为1的矩阵
d = np.empty((1,2))
print(d) 生成一个未初始化的矩阵(内容是内存中的任意值,常常接近0但不保证)
e = np.arange(10,20,2) 生成有序数列
print(e)
[10,12,14,16,18]
f = np.arange(12).reshape((3,4))
print(f) 生成3行4列0-11的矩阵
g = np.linspace(1,10,5)
print(g) 生成等距数列,1到10之间(含两端)等间距的5个数
numpy的基础运算和形式
import numpy as np
a = np.array([10,20,30,40])
b=np.arange(4)
print(a,b)
c=a-b,b**2,10*np.sin(a)
print(c) 矩阵加减乘除 同理,次方,三角函数
print(b)
print(b<3) 生成一个布尔数组(逐个判断)
c =np.array([[1,2],
[0,3]])
d = np.arange(4).reshape((2,2))
q=c*d 逐个相乘
q = np.dot(c,d) 或 c.dot(d) 矩阵乘法
e = np.random.random((2,4)) 随机生成一个矩阵
print(np.sum(e)) 求和
print(np.sum(e,axis=1)) 沿列方向求和,得到每一行的和
print(np.sum(e,axis=0)) 沿行方向求和,得到每一列的和
print(np.max(e,axis=1)) 每一行求最大值
print(np.min(e)) 求最小值
print(np.max(e)) 求最大值
numpy基础运算2
import numpy as np
A = np.arange(1,13).reshape((3,4))
print(np.argmin(A)) 生成最小值索引
print(np.mean(A))
print(A.mean()) 计算矩阵平均值
print(np.median(A)) 中位数
print(np.cumsum(A)) 累加,项数同
print(np.diff(A)) 每两个数的差
print(np.nonzero(A)) 返回非零元素的索引(一个行索引数组和一个列索引数组)
print(np.sort(A)) 逐行排序
print(np.transpose(A))
print(A.T) 矩阵的转置
print(np.clip(A,5,9)) <5变为5,>9变为9
print(np.mean(A,axis=0)) 可以进行行列平均值计算
print(A)
暂且看到 第7个视频
numpy 索引
import numpy as np
A = np.arange(3,15)
print(A) 生成3-14的一维矩阵
print(A[3])
B = np.arange(3,15).reshape((3,4)) 生成二维矩阵
print(B)
print(B[2])
print(B[2,:]) 行数索引,打印整行
print(B[:,1]) 索引为1的列(即第2列)
print(B[1,1:2]) 索引为1的行,索引1到2(不含2)的列
print(B[1][1])
print(B[1,1]) 打印某个元素
for row in B:
    print(row) 迭代每一行的值
for column in B.T:
    print(column) 迭代每一列的值
print(B.flatten()) 打印B展平后的一维数组
for item in B.flat:
    print(item) 打印迭代器的值
numpy array合并
import numpy as np
A = np.array([1,2,3])
B = np.array([2,3,4])
C = np.vstack((A,B))
D = np.hstack((A,B)) #horizontal stack左右合并
print(np.vstack((A,B))) #vertical stack上下合并成一个矩阵
print(A.shape,C.shape)
(3,) (2,3)
不可用transpose将横向数列变为竖向数列
print(A[np.newaxis,:].shape)
(1,3) 行上加了一个维度
print(A[:,np.newaxis].shape)
(3,1) 列上加维度
A1= np.array([1,2,3])[:,np.newaxis]
B1 = np.array([2,3,4])[:,np.newaxis]
C = np.concatenate((A,B,B,A),axis=0) 多个合并
numpy array分割
import numpy as np
A = np.arange(12).reshape((3,4))
print(A)
print(np.split(A,2,axis=1)) 按列分为两块
不能进行不等的分割
print(np.array_split(A,3,axis=1)) 可进行不等分割,第一个array包含两列
print(np.vsplit(A,3)) 纵向分割
print(np.hsplit(A,2)) 横向分割
numpy array copy&deep copy
>>>import numpy as np
>>>a = np.arange(4)
>>>b=a
>>>b is a
True 改变a的同时,b也会改变,赋值传递;改变b,a也会改变,会相关联
>>>b = a.copy() #deep copy a,b不会相关联
mofan numpy 就此结束
之后是pandas入门
>>>import pandas as pd
>>>import numpy as np
>>>s = pd.Series([1,3,6,np.nan,44,]) nan表示缺失值(Not a Number),不是无穷
>>>s
>>>dates = pd.date_range('20190215',periods=6)
>>>dates 生成日期的序列
>>>df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=['a','b','c','d'])
>>>df1 = pd.DataFrame(np.arange(12).reshape((3,4)))
>>>df1
>>>df2 = pd.DataFrame(里面加上一个dict)
>>>df2.dtypes 返回每一列不同的数据形式
>>>df2.index 行的序号(索引)
>>>df2.columns 列的名字
>>>df2.values
>>>df.describe() 统计数字列的个数、均值、标准差、最小值、分位数、最大值
>>>df2.T 当成一个矩阵,转置
>>>df2.sort_index(axis=1,ascending=False) 列向,反序,
>>>df2.sort_values(by='E') 指定某列进行排列
pandas 选择数据
import pandas as pd
import numpy as np
dates = pd.date_range('20190101',periods=6)
df = pd.DataFrame(np.arange(24).reshape((6,4)),index=dates,columns=['A','B','C','D']) 使用的数据
print(df['A'],df.A) 打印同样的两列数据
print(df[0:3],df['20190102':'20190104']) 类似‘切片’操作
# select by label:loc 根据标签来选择
print(df.loc['20190102'])
print(df.loc[:,['A','B']]) 选择列,打印行数据
print(df.loc['20190102':,['A','B']])
# select by position:iloc
print(df.iloc[3]) 索引为3的行(即第4行)
print(df.iloc[3,1]) 索引为3的行、索引为1的列的元素
print(df.iloc[3:5,1:3]) 3-5行1-3列
print(df.iloc[[1,3,5],1:3]) 逐个不连续筛选
# mixed selection:ix 综合上面两者(注意:ix 已在 pandas 1.0 中移除,新版本请用 loc/iloc 代替)
print(df.iloc[:3][['A','C']]) 旧写法:df.ix[:3,['A','C']]
# Boolean indexing
print(df[df.A>8]) 筛选
pandas 设置值,改变标签对应的值,位置
import pandas as pd
import numpy as np
dates = pd.date_range('20190101',periods=6)
df = pd.DataFrame(np.arange(24).reshape((6,4)),index=dates,columns=['A','B','C','D']) 使用的数据
df.iloc[2,2] = 1111
df.loc['20190101','B']=2222
df[df.A>4] = 0
print(df)
df.A[df.A>4] = 0
print(df)
df.B[df.A>4] = 0
print(df)
df['F'] = np.nan 新增一列空值
df['F'] = pd.Series([1,2,3,4,5,6],index=pd.date_range('20190101',periods=6))
print(df)
pandas 处理丢失数据
import pandas as pd
import numpy as np
dates = pd.date_range('20190101',periods=6)
df = pd.DataFrame(np.arange(24).reshape((6,4)),index=dates,columns=['A','B','C','D']) 使用的数据
df.iloc[0,1] = np.nan
df.iloc[1,2] = np.nan
print(df)
print(df.dropna(axis=0,how='any')) #how={'any','all'} 按照行列丢失,怎么样的方式
print(df.fillna(value=0)) 用指定的值(这里是0)填上丢失的数据
print(df.isnull()) 是否有缺失数据
print(np.any(df.isnull()) == True) 是否包含True值,>=1,返回True
导入导出,目录,读取数据
pandas 合并concatenating
import pandas as pd
import numpy as np
# concatenating
df1 = pd.DataFrame(np.ones((3,4))*0,columns=['a','b','c','d'])
df2 = pd.DataFrame(np.ones((3,4))*1,columns=['a','b','c','d'])
df3 = pd.DataFrame(np.ones((3,4))*2,columns=['a','b','c','d'])
print(df1)
print(df2)
print(df3)
res = pd.concat([df1,df2,df3],axis=0) 竖向合并
print(res)
res = pd.concat([df1,df2,df3],axis=0,ignore_index=True)忽略index
print(res)
# join,['inner','outer']
df1 = pd.DataFrame(np.ones((3,4))*0,columns=['a','b','c','d'],index=[1,2,3])
df2 = pd.DataFrame(np.ones((3,4))*1,columns=['b','c','d','e'],index=[2,3,4])
print(df1)
print(df2)
res = pd.concat([df1,df2],join='outer')
print(res) 默认
res = pd.concat([df1,df2],join='inner',ignore_index=True) 合并相同的,忽略序号
# join_axes(注意:join_axes 参数已在 pandas 1.0 中移除,新版本用 reindex 代替)
res = pd.concat([df1,df2],axis=1).reindex(df1.index) 指定只考虑df1的index
print(res)
res = pd.concat([df1,df2],axis=1,)
# append(注意:DataFrame.append 已在 pandas 2.0 中移除,新版本请用 pd.concat 代替)
df1 = pd.DataFrame(np.ones((3,4))*0,columns=['a','b','c','d'])
df2 = pd.DataFrame(np.ones((3,4))*1,columns=['a','b','c','d'])
res = df1.append(df2,ignore_index=True)
s1 = pd.Series([1,2,3,4],index=['a','b','c','d'])
print(res)
df3 = pd.DataFrame(np.ones((3,4))*1,columns=['a','b','c','d'])
res = df1.append([df2,df3],ignore_index=True) 多个合并(行)
df1 = pd.DataFrame(np.ones((3,4))*0,columns=['a','b','c','d'])
s1 = pd.Series([1,2,3,4],index=['a','b','c','d'])
res = df1.append(s1,ignore_index=True)
print(res) 添加一个元素到最后
pandas 合并merge
merge 多种索引,key等合并
import pandas as pd
# merging two df by key/keys.(may be used in database)
left = pd.DataFrame({'key':['K0','K1','K2','K3'],
'A':['A0','A1','A2','A3'],
'B':['B0','B1','B2','B3']})
right = pd.DataFrame({'key':['K0','K1','K2','K3'],
'C':['C0','C1','C2','C3'],
'D':['D0','D1','D2','D3']})
print(left)
print(right)
res = pd.merge(left,right,on='key')
print(res)
# consider two key
left = pd.DataFrame({'key1':['K0','K1','K2','K3'],
'key2':['K0','K1','K2','K3'],
'A':['A0','A1','A2','A3'],
'B':['B0','B1','B2','B3']})
right = pd.DataFrame({'key1':['K0','K1','K2','K3'],
'key2':['K0','K1','K2','K3'],
'C':['C0','C1','C2','C3'],
'D':['D0','D1','D2','D3']})
res = pd.merge(left,right,on=['key1','key2'],how='inner') 默认为inner,只合并相同的部分
# how = ['left','right','outer','inner']
print(res)
res = pd.merge(left,right,on=['key1','key2'],how='right')
# indicator
res = pd.merge(left,right,on=['key1','key2'],how='right',indicator=True)
# give the indicator a custom name
res = pd.merge(left,right,on=['key1','key2'],how='outer',indicator='indicator_column')
print(res)
# merge by index
left = pd.DataFrame({'A':['A0','A1','A2'],
'B':['B0','B1','B2']},
index=['K0','K1','K2'])
right = pd.DataFrame({'C':['C0','C1','C2'],
'D':['D0','D1','D2']},
index=['K0','K1','K2'])
print(left)
print(right)
#left_index and right_index
res = pd.merge(left,right,left_index=True,right_index=True,how='outer')
print(res)
res = pd.merge(left,right,left_index=True,right_index=True,how='inner')
print(res)
# handle overlapping
boys = pd.DataFrame({'k':['K0','K1','K2'],'age':[1,2,3]})
girls = pd.DataFrame({'k':['K0','K0','K3'],'age':[4,5,6]})
print(boys)
print(girls)
res = pd.merge(boys,girls,on='k',suffixes=['_boy','_girl'],how='inner')
print(res)
pandas plot 图表
数据可视化,显示在屏幕上
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#plot data
#Series 线性数据
data = pd.Series(np.random.randn(1000),index=np.arange(1000))
data = data.cumsum()
data.plot() #plt.plot(x=,y=)
plt.show()
# DataFrame
data = pd.DataFrame(np.random.randn(1000,4),
index = np.arange(1000),
columns = list('ABCD'))
data = data.cumsum()
print(data.head())
data.plot()
plt.show()
# plot methods:
# 'bar','hist','kde','area','scatter','hexbin','pie'
# data.plot.scatter(x=,y=) 散点图,显示数据点的分布
ax = data.plot.scatter(x='A',y='B',color='DarkBlue',label='Class 1')
data.plot.scatter(x='A',y='C',color='DarkGreen',label='Class 2',ax=ax)
plt.show()
mofan pandas 到此为止