dropna()去空
# Drop every row that contains a NaN value.
af = af.dropna()
value_counts() 分类、遍历
# Collect the values of column 'a' whose frequency is below 3% of the
# row count, so they can be filtered out in a later isin() step.
vc = af['a'].value_counts()
threshold = len(af) * 0.03
# Series.iteritems() was removed in pandas 2.0 -- items() is the
# supported spelling; a comprehension replaces the append loop.
delist = [value for value, count in vc.items() if count < threshold]
把value_counts()的结果转成DataFrame
# Turn the value_counts() Series of column 'col' into a DataFrame with
# a category column ('a') and a count column ('数量').
g = af['col'].value_counts()
df_g = pd.DataFrame({'a': g.index, '数量': g.values})
对value_counts()结果的计数列求和,并计算每个类别的占比
# Total of the count column, then a percentage-of-total string column.
# BUG FIX: the original summed df.数量, but the counts frame built
# above is named df_g -- sum df_g's column instead.
s = np.sum(df_g.数量)
# str(...)[:6] keeps at most 6 characters of the percentage text
# before appending the '%' sign.
df_g['比率'] = df_g.数量.apply(lambda x: str(x * 100 / s)[:6] + '%')
按条件筛选
# Keep only the rows whose 'a' value is NOT one of the rare values.
rare_mask = af['a'].isin(delist)
af = af[~rare_mask]
drop_duplicates()去重
# Remove duplicate rows from the frame.
colist = af.drop_duplicates()
二维数组合成一维
# Flatten the 2-D list of rows into a flat 1-D list,
# e.g. [['Z96.101'], ['H25.901']] -> ['Z96.101', 'H25.901'].
colist = af.values.tolist()
print(colist)
# reduce(operator.add, ...) rebuilds the list on every step (O(n^2));
# a nested comprehension flattens in a single O(n) pass.
colist = [item for row in colist for item in row]
print(colist)
[ [ 'Z96.101' ] , [ 'H25.901' ] , [ 'H25.900' ] , [ 'H52.701' ] ]
[ 'Z96.101' , 'H25.901' , 'H25.900' , 'H52.701' ]
注意:若dtype改为str,空值会变成字符串'nan',此时dropna()无效。read_csv()在读取大量数据时需要加上low_memory=False。下面演示把多个结构相同的CSV文件拼接成一个DataFrame。
# Concatenate every CSV under dir_path (all sharing the same columns)
# into a single DataFrame, reading only the two columns of interest.
# DataFrame.append() was removed in pandas 2.0, and appending inside a
# loop is O(n^2) anyway -- collect the frames and concat once instead.
frames = []
for root, _, files in os.walk(dir_path):
    for file in files:
        if '.csv' in file:
            data = pd.read_csv(os.path.join(root, file), encoding='gbk',
                               low_memory=False, usecols=['主码', '其他码'])
            frames.append(data)
# Fall back to an empty frame when no CSV was found, matching the
# original's initial all_df = pd.DataFrame().
all_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
转置
# Transpose the frame (rows become columns and vice versa).
# The original round-tripped through .values, which casts every column
# to one common dtype; DataFrame.T performs the identical
# index/column swap while preserving per-column dtypes.
all_df = all_df.T
某列的某值数量少于规定数量时,删除此列
# Drop a column when the value 'False' accounts for more than 98% of
# btsize rows, i.e. the column carries almost no information.
# NOTE(review): 'False' is looked up as a *string* -- confirm the
# columns really hold str values rather than booleans.
colist = list(af.columns)
for col in colist:
    # .get() avoids the KeyError the original raised whenever a column
    # contains no 'False' value at all.
    num = af[str(col)].value_counts().get('False', 0)
    if num > btsize * 0.98:
        print(num, col)
        af = af.drop(columns=[col])
list去重
# De-duplicate a list while preserving order: dict keys are unique and
# (Python 3.7+) keep insertion order.
newlist = dict.fromkeys(newlist)
klist = list(newlist)
concat合并:axis=0 纵向拼接(按行堆叠),axis=1 横向拼接(按列并排)
# Stack the new rows underneath the existing ones (axis=0 = vertical).
new_rows = pd.DataFrame(t)
out = pd.concat([out, new_rows], axis=0)
获取有特定字符的列的行
# Keep only the rows whose 'ICD' string contains a comma.
has_comma = af['ICD'].str.contains(',')
af = af[has_comma]