# Pin the library versions these notes were written against.
pip install --upgrade numpy==1.13.3
# The named-aggregation examples below require pandas >= 0.25; 1.0.5 satisfies that.
# NOTE(review): confirm numpy 1.13.3 (the minimum pandas 1.0.5 accepts) is intended,
# not just the floor version.
pip install --upgrade pandas==1.0.5
读取文件
查看基本信息
# Inspect the DataFrame's overall structure.
print(df.shape)    # (rows, columns)
# (174, 69)
print(df.info())   # per-column dtype and non-null counts
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 174 entries, 0 to 173
# Data columns (total 69 columns):
# Fab 174 non-null object
# Family 0 non-null float64
# ...
print(df.index)    # row labels
print(df.columns)  # column labels
print(df.values)   # underlying data as a numpy array
基本操作
- 设置index
df = df.set_index('insert_time') # use an existing column as the row index
- 简单计算
# Vectorized, element-wise arithmetic on an entire column.
df['Wafer Number'] = df['Wafer Number'] + 100
df['Wafer Number'] = df['Wafer Number'] - 100
df['Wafer Number'] = df['Wafer Number'] * 100
df['Wafer Number'].max()
- 简单统计
print(df.describe())  # summary statistics for the numeric columns
- 查找位置: loc & iloc
a = df.loc['MRB463_03',:] # select a row by index label
b = df.iloc[2,:] # select a row by integer position
- 查找数据:可以使用query(),参考文章;
- 赋值
df.loc['MRB463_03','Wafer Number'] = 2 # assign directly to a single cell
- 布尔类型索引
可以作为删选数据的方法;
df = df.set_index('Wafer')
df[df['Wafer Number']>10][:5]  # boolean mask, then take the first 5 rows
# output omitted
df[df['Wafer Number']==10][:5]
# all rows whose Wafer Number equals 10
df.loc[df['Wafer Number']==10,'yield (%)'].mean()
# 93.4
ratio = df[df['yield (%)']>95]['yield (%)'].count()/df['yield (%)'].count()
# fraction of wafers whose yield exceeds 95%
Group by
- 单一列的单一统计值
df.groupby('Source Lot')['Program'].count()
# wafer count per source lot
df.groupby('Source Lot')['yield (%)'].mean()
# mean yield per source lot
- 单一列的多个统计值
以下代码,pandas 必须为0.25以上版本;
# Named aggregation (pandas >= 0.25): each keyword becomes a result column name.
df.groupby('Source Lot')['yield (%)'].agg(mean = "mean",min = "min", max="max")
# NOTE(review): passing numpy callables to agg is deprecated in pandas >= 2.0;
# prefer the string form on the line above.
df.groupby('Source Lot')['yield (%)'].agg(mean = np.mean,min = np.min, max=np.max)
# the two calls above produce identical results
# mean min max
# Source Lot
# MRB463.S 92.328675 87.993292 94.030184
# MRB466.S 93.339296 91.626607 95.226383
# MRB513.S 90.696385 84.326439 93.694802
# MRB516.S 92.161431 86.193404 94.712130
# MRB556.S 90.844047 88.060369 93.817775
# MRB557.S 92.387703 88.384572 94.488541
# MRB559.S 92.118055 88.194522 94.264952
- 不同列的不同统计值
# Per-column named aggregation: output_name=(source_column, aggfunc).
df.groupby('Source Lot').agg(mean_yield = ('yield (%)','mean'),Wafer_Count =('Program','count'))
df.groupby('Source Lot').agg(mean_yield = ('yield (%)',np.mean),Wafer_Count =('Program',np.count_nonzero))
# the two calls above produce identical results
# mean_yield Wafer_Count
# Source Lot
# MRB463.S 92.328675 25
# MRB466.S 93.339296 25
# MRB513.S 90.696385 24
# MRB516.S 92.161431 25
# MRB556.S 90.844047 25
# MRB557.S 92.387703 25
# MRB559.S 92.118055 25
数值运算
# Column-wise reductions; axis=0 (the default) reduces down each column,
# axis=1 reduces across each row.
df['35 - F - ER-SRT (%)'].sum()
df['35 - F - ER-SRT (%)'].mean()
df.sum(axis=0) # sum of each column
df.sum(axis=1) # sum of each row
df.max(axis=0) # maximum of each column
df.min()       # minimum of each column (axis=0 is the default)
df.median()    # median of each column
df.cov() # pairwise covariance of the numeric columns
df.corr() # pairwise correlation of the numeric columns
统计每个值出现的个数;
df['Source Lot'].value_counts()  # frequency of each distinct value, descending
# MRB557.S 25
# MRB516.S 25
df['Source Lot'].value_counts(ascending=True) # ascending order
df['43 - F - VGNDA2 (cnt)'].value_counts(ascending=True,bins=8) # bin numeric values into 8 equal-width intervals
# (18.375, 21.0] 2
# (15.75, 18.375] 3
# (13.125, 15.75] 5
# (5.25, 7.875] 6
# (7.875, 10.5] 6
# (10.5, 13.125] 8
# (2.625, 5.25] 25
# (-0.022000000000000002, 2.625] 119
df['43 - F - VGNDA2 (cnt)'].value_counts(normalize = True, ascending=True,bins=8)  # same, but as fractions of the total
# (18.375, 21.0] 0.011494
# (15.75, 18.375] 0.017241
# (13.125, 15.75] 0.028736
# (5.25, 7.875] 0.034483
# (7.875, 10.5] 0.034483
# (10.5, 13.125] 0.045977
# (2.625, 5.25] 0.143678
# (-0.022000000000000002, 2.625] 0.683908
- 改操作;
# NOTE(review): calling an inplace replace on the selected column is chained
# assignment — it may not propagate to df in all pandas versions and is
# deprecated in pandas >= 2.x; verify, or assign the result back explicitly.
df['Wafer Number'].replace(to_replace=10,value=100,inplace=True) # replace 10 with 100, mutating the data
df.rename(index= {'MRB463_11':'MRB463_110'},inplace=True)  # rename one row label in place
- 增操作;
# Row-wise combination of two frames.
# DataFrame.append is deprecated since pandas 1.4 and removed in 2.0;
# pd.concat([...]) is the supported equivalent and yields identical results.
df1 = pd.DataFrame([[1,2],[3,4],[5,6]],index=[1,2,3],columns=['A','B'])
df2 = pd.DataFrame([[1,2],[3,4],[5,6]],index=[1,4,5],columns=['A','B'])
print(pd.concat([df1, df2]))  # same columns: rows are simply stacked
df1 = pd.DataFrame([[1,2],[3,4],[5,6]],index=[1,2,3],columns=['A','B'])
df2 = pd.DataFrame([[1,2],[3,4],[5,6]],index=[1,4,5],columns=['C','B'])
print(pd.concat([df1, df2]))  # differing columns: union of columns, gaps become NaN
# Append a new row by assigning to a fresh index label through .loc.
df1 = pd.DataFrame({'A': [1, 3, 5], 'B': [2, 4, 6]}, index=['a', 'b', 'c'])
df1.loc['d'] = [7, 8]
print(df1)
# Concatenate two frames: axis=0 stacks rows, axis=1 aligns columns on the index.
df1 = pd.DataFrame({'A': [1, 4], 'B': [2, 5], 'C': [3, 6]}, index=['a', 'b'])
df2 = pd.DataFrame({'A': [2, 5], 'B': [2, 5], 'D': [3, 6]}, index=['a', 'c'])
df3 = pd.concat([df1, df2])           # row-wise
df4 = pd.concat([df1, df2], axis=1)   # column-wise
print('df3:\n', df3)
print('df4:\n', df4)
- 删操作
删除列
# Two ways to remove a column: the `del` statement, or drop(axis=1);
# drop(axis=0) removes rows. inplace=True mutates the frame directly.
df1 = pd.DataFrame({'A': [1, 3, 5], 'B': [2, 4, 6]}, index=['a', 'b', 'c'])
del df1['A']                           # remove column 'A'
df1 = pd.DataFrame({'A': [1, 3, 5], 'B': [2, 4, 6]}, index=['a', 'b', 'c'])
df1.drop(['A'], axis=1, inplace=True)  # remove column 'A'
df1.drop(['a'], axis=0, inplace=True)  # remove row 'a'
Merge 操作
# Inner-join the two frames on the shared column 'B'; the overlapping
# non-key column 'A' comes out suffixed as A_x / A_y.
df1 = pd.DataFrame({'A': [1, 4], 'B': [2, 5], 'C': [3, 6]}, index=['a', 'b'])
df2 = pd.DataFrame({'A': [2, 5], 'B': [2, 5], 'D': [3, 6]}, index=['a', 'c'])
df3 = pd.merge(df1, df2, on='B')
print('df1:\n', df1)
print('df2:\n', df2)
print('df3:\n', df3)
也可以根据多个值,只要传列表给on就可以了~
默认的连接方式为inner; 可以通过how= 来修改连接方式;如下:
# Merge on multiple keys by passing a list to `on`; `how` picks the join
# type — inner (default), outer, left, or right.
df1 = pd.DataFrame({'A': [1, 4], 'B': [2, 5], 'C': [3, 6]}, index=['a', 'b'])
df2 = pd.DataFrame({'A': [1, 5], 'B': [2, 5], 'D': [3, 6]}, index=['a', 'c'])
df3 = pd.merge(df1, df2, on=['B', 'A'])                    # inner join (default)
df_outer = pd.merge(df1, df2, on=['B', 'A'], how='outer')  # union of keys
df_left = pd.merge(df1, df2, on=['B', 'A'], how='left')    # keep all df1 keys
df_right = pd.merge(df1, df2, on=['B', 'A'], how='right')  # keep all df2 keys
可以增加指示器,来标注数据来源,如下:
# indicator=True adds a '_merge' column recording where each row came
# from: 'left_only', 'right_only', or 'both'.
df1 = pd.DataFrame({'A': [1, 4], 'B': [2, 5], 'C': [3, 6]}, index=['a', 'b'])
df2 = pd.DataFrame({'A': [1, 5], 'B': [2, 5], 'D': [3, 6]}, index=['a', 'c'])
df_outer = pd.merge(df1, df2, on=['B', 'A'], how='outer', indicator=True)
显示设置
- 查看设置:
pd.get_option('display.max_rows')
- 更改设置:
# Batch-apply display settings; set_option mutates pandas' global config.
for option, value in {
    'display.max_columns': 12,     # columns shown before truncation
    'display.max_rows': 20,        # rows shown before truncation
    'display.max_colwidth': 10,    # max characters per cell
    'display.width': 10000,        # total console width
    'display.precision': 4,        # digits shown for floats
    'display.max_info_columns': 5, # columns fully described by df.info()
}.items():
    pd.set_option(option, value)
Pivot
# pivot_table: rows come from `index`, columns from `columns`, and each cell
# is the aggregated `values` entry.
df_pivot = df.pivot_table(index = 'Wafer Number',columns='Source Lot',values= 'yield (%)') # aggfunc defaults to 'mean'
df_pivot = df.pivot_table(index = 'Wafer Number',values= 'yield (%)',aggfunc='min')
df_pivot = df.pivot_table(index = 'Wafer Number',values= ['yield (%)'],aggfunc=['min','count'])
df_pivot = df.pivot_table(index = 'Wafer Number',values= ['yield (%)'],aggfunc=['min','count'],margins =True)  # margins adds an 'All' totals row
# NOTE(review): builtin max/min and np.mean work as aggfunc here, but the
# string forms ('max', 'min', 'mean') are preferred — numpy callables are
# deprecated in pandas >= 2.0.
df_pivot = df.pivot_table(index = 'Source Lot',values =['yield (%)','25 - F - FAIL1H (cnt)'] ,aggfunc={'yield (%)':[max,min],'25 - F - FAIL1H (cnt)':np.mean})
时间操作
# Timestamp is pandas' point-in-time type; Timedelta is a duration, and
# the two support direct arithmetic.
ts = pd.Timestamp('2020-01-12')
print(ts)
# 2020-01-12 00:00:00
five_days_later = ts + pd.Timedelta('5 days')
print(five_days_later)
# 2020-01-17 00:00:00
- 各种格式的日期时间字符串,都可以通过to_datetime 转换成datetime格式;
# to_datetime parses many date-string formats; '3/2/2020' is read
# month-first (US style), i.e. March 2. Use dayfirst=True for day-first input.
ts= pd.to_datetime('3/2/2020')
# 2020-03-02 00:00:00
- datetime 类型有一些性质,可以方便的调出各种时间的属性值;
print(ts.hour)       # 0 here — no time-of-day component was parsed above
print(ts.weekday())  # day of week as an int, Monday=0 .. Sunday=6
- 按规定生成时间序列
# date_range builds a DatetimeIndex: start/end yields every day in the
# range (freq defaults to 'D'); start + periods + freq generates that many
# points at the given frequency.
daily = pd.date_range(start='2020-01-10',end='2020-01-20')
print(daily)     # 11 daily timestamps, 2020-01-10 .. 2020-01-20
biweekly = pd.date_range(start='2020-01-10',periods=5,freq='2W')
print(biweekly)  # 5 dates spaced 2 weeks apart, anchored on Sundays ('2W-SUN')
- 对时间进行处理的时候,需要先设置成index,才可以;
# Parse the string column to datetime and move it into the index, so the
# date-based slicing and accessors below become available.
df['str_year'] = pd.to_datetime(df['start_time'])  # NOTE(review): the name suggests a string/year but it holds full datetimes
df = df.set_index('str_year')
df['day']=df.index.day  # day-of-month of each row's timestamp
print(df['day'].value_counts()) # count of rows per day-of-month
df['2021-12-04':'2021-12-05'] # slice rows by date strings (works on a DatetimeIndex)
df[(df.index.day > 13) & (df.index.day < 16)] # rows whose day-of-month is 14 or 15
df.between_time('08:00','12:00') # filter by time-of-day only
- 按时间重采样
df.resample('D').mean() # resample into daily bins and average each bin
df.resample('3D').mean() # resample into 3-day bins and average each bin
常用操作
- 排序
# Sort by Wafer Number, then by yield, both descending, mutating df.
df.sort_values(by=['Wafer Number','yield (%)'],ascending=[False,False],inplace=True)
print(df.loc[:,['Wafer Number','yield (%)']])
- 删去重复值
# NOTE(review): drop_duplicates returns a new frame — without assignment or
# inplace=True the results of these two lines are discarded.
df.drop_duplicates() # drop rows that duplicate another row in every column
df.drop_duplicates(subset='Wafer Number') # de-duplicate considering only this column
- 同类项合并 :mapping 方法
定义一个映射字典,然后通过mapping调用该字典,如果没有对应关系,则为NaN;
# Wafer numbers 1-5 map to 'Head', 21-25 to 'Tail'; anything else will
# come out as NaN when this dict is used with Series.map.
mapping = {n: 'Head' for n in range(1, 6)}
mapping.update({n: 'Tail' for n in range(21, 26)})
df['cate'] = df['Wafer Number'].map(mapping)
- 分类;按区间进行分类
例如,我想要按照yield 在 [0,90,92,100]的区间上进行分类;
# Bin yields into the intervals (0,90], (90,92], (92,100].
bins = [0,90,92,100]
bins_cut = pd.cut(df['yield (%)'],bins)
# NOTE(review): the counts below look like value_counts() output — the call
# that produces them (e.g. bins_cut.value_counts()) is missing from the snippet.
# (92, 100] 101
# (90, 92] 48
# (0, 90] 25
可以增加标签:
# Same binning, but `labels=` names the intervals instead of showing
# interval notation.
bins = [0,90,92,100]
names= ['Low Yield','Normal Yield','High Yield']
df['bins_cut'] = pd.cut(df['yield (%)'],bins,labels=names)
print(df['bins_cut'])
# FIX: pd.value_counts(...) is deprecated (pandas >= 2.0); the Series
# method is the supported, equivalent spelling.
d = df['bins_cut'].value_counts()
print(d)
# High Yield 101
# Normal Yield 48
# Low Yield 25
- apply 函数:对每行或者每列做相同的操作;
df['Product'] = df['Product'] +"_"+ df['Program'].str[11:13].apply(lambda x:"X8" if x =='8X' else "X16")
- 缺失值处理
df.isnull().all(axis=0) # True for each column that is entirely null
df.isnull().all(axis=1) # True for each row that is entirely null
# NOTE(review): fillna returns a new frame — without assignment or
# inplace=True the result of this line is discarded.
df.fillna(5) # fill null values with 5
- 字符串操作
# Vectorized string operations on a text column.
df['Source Lot'].str.lower() # lower-case
df['Source Lot'].str.upper() # upper-case
df['Source Lot'].str.len() # string length
df['Source Lot'].str.strip() # strip whitespace from both ends
df['Source Lot'].str.lstrip() # strip leading whitespace
df['Source Lot'].str.rstrip() # strip trailing whitespace
# FIX: str.replace treats the pattern as a regex by default, so '.S'
# matched ANY character followed by 'S'; regex=False makes it a literal.
df['Source Lot'].str.replace('.S','_R',regex=False) # replace the literal substring '.S'
df['Source Lot'].str.split('.') # split each value into a list
df['Source Lot'].str.split('.',expand=True) # split into columns named 0,1,...
df['Source Lot'].str.split('.',expand=True,n=1) # split at most once -> two columns
df['Source Lot'].str.contains('2') # does each value contain '2'
用逗号分割每个值,并生成 one-hot 指示矩阵:
# One-hot encode comma-separated tags: one indicator column per distinct tag.
df_1 = pd.DataFrame({'A': ['a,b,c', 'a,c', 'c,b']}, index=[1, 2, 3])
a = df_1['A'].str.get_dummies(sep=',')
print(a)
- isin()
df_1['A'].isin(['d','a'])