import pandas as pd
1. Series 构造
1.1 普通构造
pd. Series( data 一组数据,
index 索引标签 默认为0 开始,
dtype 数据类型- 默认自行判断,
name 设置名称,
copy 拷贝数据- 默认为False )
1.2 使用kv对,类似字典来创建Series
dist = { 1 : "A" , 2 : "B" , 3 : "C" }
myvar = pd. Series( dist)
myvar = pd. Series( dist, index = [ 1 , 2 ] )
2. 索引
2.1 指定索引值
data = [ "A" , "B" , "C" ]
myvar = pd. Series( data, index= [ "x" , "y" , "z" ] )
2.2 根据索引值读取数据
myvar[ "y" ]
== == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == =
1. 生成数据表
1.1 导入csv或xlsx文件
df = pd. DataFrame( pd. read_csv( 'name.csv' , header = 1 ) )
df = pd. DataFrame( pd. read_excel( 'name.xlsx' ) )
方法2 :
from collections import namedtuple
Item = namedtuple( 'Item' , 'reply pv' )
items = [ ]
with codecs. open ( 'reply.pv.07' , 'r' , 'utf-8' ) as f:
for line in f:
line_split = line. strip( ) . split( '\t)
items. append( Item( line_split[ 0 ] . strip( ) , line_split[ 1 ] . strip( ) ) )
df = pd. DataFrame. from_records( items, columns= [ 'reply' , 'pv' ] )
1.2 用pandas创建数据表
df = pd. DataFrame( {
"id" : [ 1001 , 1002 , 1003 , 1004 , 1005 , 1006 ] ,
"date" : pd. date_range( '20130102' , periods= 6 ) ,
"city" : [ 'Beijing ' , 'SH' , ' guangzhou ' , 'Shenzhen' , 'shanghai' , 'BEIJING ' ] ,
"age" : [ 23 , 44 , 54 , 32 , 34 , 32 ] ,
"category" : [ '100-A' , '100-B' , '110-A' , '110-C' , '210-A' , '130-F' ] ,
"price" : [ 1200 , np. nan, 2133 , 5433 , np. nan, 4432 ]
} ,
columns = [ 'id' , 'date' , 'city' , 'category' , 'age' , 'price' ] )
== == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == =
2. 数据表信息查看
2.1 查看维度
df. shape
2.2 查看基本信息:维度、列名、数据格式、所占空间
df. info( )
2.3 每一列数据的格式
df. dtypes
2.4 某一列格式
df[ 'B' ] . dtype
2.5 空值
df. isnull( )
df[ 'B' ] . isnull( )
df. isnull( ) . count( )
2.6 查看某一列的唯一值
df. unique( )
2.7 查看数据表的值
df. values
2.8 查看列名称
df. columns
2.9 查看前五行、后五行数据
df. head( )
df. tail( )
== == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == =
3. 数据表清洗
df. fillna( value = 0 )
df[ 'price' ] . fillna( df[ 'price' ] . mean( ) )
df[ 'city' ] = df[ 'city' ] . map ( str . strip)
df[ 'city' ] = df[ 'city' ] . str . lower( )
df[ 'price' ] . astype( 'int' )
df. rename( columns = { 'date' : 'datetime' } )
df[ 'city' ] . drop_duplicates( )
df[ 'city' ] . drop_duplicates( keep = 'last' )
df[ 'city' ] . replace( 'sh' , 'shanghai' )
== == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == =
4. 数据预处理
df1= pd. DataFrame( {
"id" : [ 1001 , 1002 , 1003 , 1004 , 1005 , 1006 , 1007 , 1008 ] ,
"gender" : [ 'male' , 'female' , 'male' , 'female' , 'male' , 'female' , 'male' , 'female' ] ,
"pay" : [ 'Y' , 'N' , 'Y' , 'Y' , 'N' , 'Y' , 'N' , 'Y' , ] ,
"m-point" : [ 10 , 12 , 20 , 40 , 40 , 40 , 30 , 20 ] } )
df_inner = pd. merge( df, df1, how = 'inner' )
df_left = pd. merge( df, df1, how = 'left' )
df_right = pd. merge( df, df1, how = 'right' )
df_outer = pd. merge( df, df1, how = 'outer' )
result1 = df1. append( df2)
result = left. join( right, on = 'key' )
pd. concat(
objs,
axis = 0 ,
join_axes = None ,
ignore_index = False ,
. . .
)
frames = [ df1, df2, df3]
result = pd. concat( frames)
df. set_index( 'id' )
df. sort_values( by= [ 'age' ] )
df. sort_index( )
df. [ 'group' ] = np. where( df[ 'price' ] > 2000 , 'high' , 'low' )
df. loc[ ( df[ 'city' ] == 'beijing' ) & ( df[ 'price' ] >= 4000 ) , 'sign' ] = 1
split = pd. DataFrame( ( x. split( '-' ) for x in df[ 'category' ] ) , index= df. index, columns = [ 'category' , 'size' ] )
df = pd. merge( df, split, right_index = True , left_index = True )
== == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == =
5. 数据提取
loc iloc ix
df. loc[ 3 ]
df. iloc[ 0 : 5 ]
df. reset_index
df = df. set_index( 'date' )
df[ : '2013-01-04' ]
df. iloc[ : 3 , : 2 ]
df. iloc[ [ 0 , 2 , 5 ] , [ 4 , 5 ] ]
df. ix[ : '2013-01-04' , : 4 ]
df[ 'city' ] . isin( [ 'beijing' ] )
df. loc[ df[ 'city' ] . isin( [ 'beijing' , 'shanghai' ] ) ]
pd. DataFrame( category. str [ : 3 ] )
== == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == =
6. 数据筛选
使用与、或、非三个条件配合大于、小于、等于对数据进行筛选,并进行计数和求和。
df. loc[ ( 与 或 非条件) , [ 列1 , 列2 ] ]
df. loc[ ( df[ 'city' ] != 'beijing' & df[ 'price' ] >= 200 ) , [ 'id' , 'city' , 'age' , 'category' , 'gender' ] ] . sort( [ 'id' ] ]
df_inner. loc[ ( df_inner[ 'city' ] != 'beijing' ) , [ 'id' , 'city' , 'age' , 'category' , 'gender' ] ] . sort( [ 'id' ] ) . city. count( )
df. query( 'city == ["beijing", "shanghai"]' )
df. query( 'city == ["beijing", "shanghai"]' ) . price. sum ( )
== == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == =
7. 数据汇总
df. groupby( 'city' ) . count( )
df. groupby( 'city' ) [ 'id' ] . count( )
df. groupby( [ 'city' , 'size' ] ) [ 'id' ] . count( )
df. groupby( 'city' ) [ 'price' ] . agg( [ len , np. sum , np. mean] )
== == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == =
8. 数据统计
df. sample( n= 3 )
weights = [ 0 , 0 , 0 , 0 , 0.4 , 0.6 ]
df. sample( n= 200 , weights = weights)
df. sample( n= 4 , replace = False )
df. sample( n= 5 , replace = True )
df. describe( ) . round ( 2 ) . T
df[ 'price' ] . std( )
df[ 'price' ] . cov[ df[ 'm-point' ] ]
df. cov( )
df[ 'price' ] . corr( df[ 'm-point' ] )
df. corr( )
== == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == =
df. to_excel( 'test.xlsx' , sheet_name = 'abc' )
df. to_csv( 'test2.csv' )