pandas—清理、转换、合并、重塑

In [1]:
from pandas import *
df1=DataFrame({'key':['b','b','a','c','a','a','b'],
          'data1':range(7)})
df2=DataFrame({'key':['a','b','d'],
              'data2':range(3)})
print (df1)
print (df2)
   data1 key
0      0   b
1      1   b
2      2   a
3      3   c
4      4   a
5      5   a
6      6   b
   data2 key
0      0   a
1      1   b
2      2   d
In [3]:
merge(df1,df2,on='key')  # inner join ("glue") on the shared 'key' column
Out[3]:
   data1 key  data2
0      0   b      1
1      1   b      1
2      6   b      1
3      2   a      0
4      4   a      0
5      5   a      0
In [4]:
# merge when the key columns are named differently on the two sides.
df3 = DataFrame({'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                 'data1': range(7)})
df4 = DataFrame({'rkey': ['a', 'b', 'd'],
                 'data2': range(3)})
print(df3)
print(df4)
# Name each side's key explicitly with left_on / right_on.
merge(df3, df4, left_on='lkey', right_on='rkey')
   data1 lkey
0      0    b
1      1    b
2      2    a
3      3    c
4      4    a
5      5    a
6      6    b
   data2 rkey
0      0    a
1      1    b
2      2    d
Out[4]:
   data1 lkey  data2 rkey
0      0    b      1    b
1      1    b      1    b
2      6    b      1    b
3      2    a      0    a
4      4    a      0    a
5      5    a      0    a
In [5]:
merge(df1,df2,how='outer')   # outer join: union of keys from both sides, NaN where a side is missing
Out[5]:
 data1keydata2
00.0b1.0
11.0b1.0
26.0b1.0
32.0a0.0
44.0a0.0
55.0a0.0
63.0cNaN
7NaNd2.0
In [6]:
df1=DataFrame({'key':['b','b','a','c','a','b'],
              'data1':range(6)})
df2=DataFrame({'key':['a','b','a','b','d'],
              'data2':range(5)})
print (df1)
print (df2)
merge(df1,df2,on='key',how='left')   # Cartesian product of matching rows: 3 b's on the left x 2 b's on the right -> 6 b rows
   data1 key
0      0   b
1      1   b
2      2   a
3      3   c
4      4   a
5      5   b
   data2 key
0      0   a
1      1   b
2      2   a
3      3   b
4      4   d
Out[6]:
 data1keydata2
00b1.0
10b3.0
21b1.0
31b3.0
42a0.0
52a2.0
63cNaN
74a0.0
84a2.0
95b1.0
105b3.0
In [7]:
# merge on more than one key column.
left = DataFrame({'key1': ['foo', 'foo', 'bar'],
                  'key2': ['one', 'two', 'one'],
                  'lval': [1, 2, 3]})
right = DataFrame({'key1': ['foo', 'foo', 'bar', 'bar'],
                   'key2': ['one', 'one', 'one', 'two'],
                   'rval': [4, 5, 6, 7]})
print(left)
print(right)
# Rows pair up on the combined (key1, key2) value.
merge(left, right, on=['key1', 'key2'], how='outer')
  key1 key2  lval
0  foo  one     1
1  foo  two     2
2  bar  one     3
  key1 key2  rval
0  foo  one     4
1  foo  one     5
2  bar  one     6
3  bar  two     7
Out[7]:
 key1key2lvalrval
0fooone1.04.0
1fooone1.05.0
2footwo2.0NaN
3barone3.06.0
4bartwoNaN7.0
In [8]:
left1=DataFrame({'key':['a','b','a','a','b','c'],
                'value':range(6)})
right1=DataFrame({'group_val':[3.5,7]},index=['a','b'])
print(left1)
print(right1)
merge(left1,right1,left_on='key',right_index=True)  # use right1's index as the right-side join key
  key  value
0   a      0
1   b      1
2   a      2
3   a      3
4   b      4
5   c      5
   group_val
a        3.5
b        7.0
Out[8]:
 keyvaluegroup_val
0a03.5
2a23.5
3a33.5
1b17.0
4b47.0
In [9]:
# Merge two key columns on the left against a two-level MultiIndex on the right.
# NOTE(review): relies on np (numpy) being in scope -- numpy is never imported
# in this file as shown; confirm an earlier cell provides it.
lefth=DataFrame({'key1':['Ohio','Ohio','Ohio','Nevada','Nevada'],
                'key2':[2000,2001,2002,2001,2002],
                'data':np.arange(5)})
righth=DataFrame(np.arange(12).reshape((6,2)),
                index=[['Nevada','Nevada','Ohio','Ohio','Ohio','Ohio'],
                      [2001,2000,2000,2000,2001,2002]],
                columns=['event1','event2'])
print(lefth)
print(righth)
merge(lefth,righth,left_on=['key1','key2'],right_index=True)    # default how='inner': keep only keys present on both sides
   data    key1  key2
0     0    Ohio  2000
1     1    Ohio  2001
2     2    Ohio  2002
3     3  Nevada  2001
4     4  Nevada  2002
             event1  event2
Nevada 2001       0       1
       2000       2       3
Ohio   2000       4       5
       2000       6       7
       2001       8       9
       2002      10      11
Out[9]:
 datakey1key2event1event2
00Ohio200045
00Ohio200067
11Ohio200189
22Ohio20021011
33Nevada200101
In [10]:
# Index-on-index merge, and the equivalent join() shortcut.
left2 = DataFrame([[1, 2], [3, 4], [5, 6]],
                  index=['a', 'c', 'e'],
                  columns=['Ohio', 'Nevada'])
right2 = DataFrame([[7, 8], [9, 10], [11, 12], [13, 14]],
                   index=['b', 'c', 'd', 'e'],
                   columns=['Missouri', 'Alabama'])
print(left2)
print(right2)
merge(left2, right2, how='outer', left_index=True, right_index=True)
left2.join(right2, how='outer')   # join() defaults to joining on the index
   Ohio  Nevada
a     1       2
c     3       4
e     5       6
   Missouri  Alabama
b         7        8
c         9       10
d        11       12
e        13       14
Out[10]:
 OhioNevadaMissouriAlabama
a1.02.0NaNNaN
bNaNNaN7.08.0
c3.04.09.010.0
dNaNNaN11.012.0
e5.06.013.014.0
In [11]:
#concat连接
# concat: glue Series end-to-end (axis=0) or side-by-side (axis=1).
s1 = Series([0, 1], index=['a', 'b'])
s2 = Series([2, 3, 4], index=['c', 'd', 'e'])
s3 = Series([5, 6], index=['f', 'g'])
print(concat([s1, s2, s3]))

s4 = concat([s1 * 5, s3])
print(s4)
print(concat([s1, s4], axis=1))          # column-wise: align on the index, NaN where absent
concat([s1, s4], axis=1, join='inner')   # keep only index labels present in both
a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64
a    0
b    5
f    5
g    6
dtype: int64
     0  1
a  0.0  0
b  1.0  5
f  NaN  5
g  NaN  6
Out[11]:
 01
a00
b15
In [12]:
result=concat([s1,s1,s3],keys=['one','two','three']) # keys build the outer level of a hierarchical index
print(result)
result.unstack()   # spread the inner index level out into columns
one    a    0
       b    1
two    a    0
       b    1
three  f    5
       g    6
dtype: int64
Out[12]:
 abfg
one0.01.0NaNNaN
two0.01.0NaNNaN
threeNaNNaN5.06.0
In [13]:
# With axis=1, the keys become the column names.
print(concat([s1,s2,s3],axis=1,keys=['one','two','three']))
#concat([df1,df2],axis=1,keys=['level1','level2'],
#      names=['upper','lower'])
df1=DataFrame(np.random.randn(3,4),columns=['a','b','c','d'])
df2=DataFrame(np.random.randn(2,3),columns=['b','d','a'])
print(df1)
print(df2)
   one  two  three
a  0.0  NaN    NaN
b  1.0  NaN    NaN
c  NaN  2.0    NaN
d  NaN  3.0    NaN
e  NaN  4.0    NaN
f  NaN  NaN    5.0
g  NaN  NaN    6.0
          a         b         c         d
0  0.200428 -1.370206 -0.297848 -0.690504
1 -1.012528 -0.661101  0.823504  0.058414
2  1.122031 -0.180694  0.967949  0.892187
          b         d         a
0  0.215430  0.819751 -1.513330
1  1.029311  1.013309  0.777116
In [14]:
concat([df1,df2],ignore_index=True)  # discard both original row indices and renumber 0..n-1
Out[14]:
 abcd
00.200428-1.370206-0.297848-0.690504
1-1.012528-0.6611010.8235040.058414
21.122031-0.1806940.9679490.892187
3-1.5133300.215430NaN0.819751
40.7771161.029311NaN1.013309
In [15]:
#stack旋转
# stack/unstack: rotate between column labels and row-index levels.
# FIX: the bare name 'pandas' is never imported in this file (only
# 'from pandas import *' and 'import pandas as pd' later), so
# pandas.Index raised NameError -- use Index from the star import.
data = DataFrame(np.arange(6).reshape((2, 3)),
                 index=Index(['Ohio', 'Colorado'], name='state'),
                 columns=Index(['one', 'two', 'three'], name='number'))
print(data)
result = data.stack()    # pivot the columns into the inner row-index level
print(result)
type(result)             # result is a Series with a two-level MultiIndex
result.unstack()         # inverse: inner index level back to columns
number    one  two  three
state                    
Ohio        0    1      2
Colorado    3    4      5
state     number
Ohio      one       0
          two       1
          three     2
Colorado  one       3
          two       4
          three     5
dtype: int32
Out[15]:
numberonetwothree
state   
Ohio012
Colorado345
In [16]:
# unstack can move any index level, selected by position or by name.
print(result.unstack(0))
print(result.unstack('state'))
state   Ohio  Colorado
number                
one        0         3
two        1         4
three      2         5
state   Ohio  Colorado
number                
one        0         3
two        1         4
three      2         5
In [17]:
import pandas as pd

# Concatenating with keys makes a two-level index;
# unstack() then spreads the inner level into columns.
s1 = Series([0, 1, 2, 3], index=['a', 'b', 'c', 'd'])
s2 = Series([4, 5, 6], index=['c', 'd', 'e'])
data2 = pd.concat([s1, s2], keys=['one', 'two'])
print(data2)
data2.unstack()
one  a    0
     b    1
     c    2
     d    3
two  c    4
     d    5
     e    6
dtype: int64
Out[17]:
 abcde
one0.01.02.03.0NaN
twoNaNNaN4.05.06.0
In [18]:
data2.unstack().stack()   # stack drops the NaNs introduced by unstack, so the round-trip yields a float Series
Out[18]:
one  a    0.0
     b    1.0
     c    2.0
     d    3.0
two  c    4.0
     d    5.0
     e    6.0
dtype: float64
In [19]:
# Two value columns on the (state, number) MultiIndex; unstack one level.
df=DataFrame({'left':result,'right':result+5},columns=pd.Index(['left','right'],name='side'))   # hierarchical row index inherited from result
print(df)
df.unstack('state')
side             left  right
state    number             
Ohio     one        0      5
         two        1      6
         three      2      7
Colorado one        3      8
         two        4      9
         three      5     10
Out[19]:
sideleftright
stateOhioColoradoOhioColorado
number    
one0358
two1469
three25710
In [20]:
# Long ("stacked") to wide format using macrodata.csv.
data = pd.read_csv('macrodata.csv')
periods = pd.PeriodIndex(year=data.year, quarter=data.quarter, name='date')
data = DataFrame(data.to_records(),
                 columns=pd.Index(['realgdp', 'infl', 'unemp'], name='item'),
                 index=periods.to_timestamp('D', 'end'))

# One row per (date, item) pair -> long format.
ldata = data.stack().reset_index().rename(columns={0: 'value'})
# FIX: pivot() arguments are keyword-only in modern pandas (positional removed in 2.0).
wdata = ldata.pivot(index='date', columns='item', values='value')

print(ldata[:10])
pivoted = ldata.pivot(index='date', columns='item', values='value')   # date as rows, item as columns, value as the cell fill
pivoted.head()
        date     item     value
0 1959-03-31  realgdp  2710.349
1 1959-03-31     infl     0.000
2 1959-03-31    unemp     5.800
3 1959-06-30  realgdp  2778.801
4 1959-06-30     infl     2.340
5 1959-06-30    unemp     5.100
6 1959-09-30  realgdp  2775.488
7 1959-09-30     infl     2.740
8 1959-09-30    unemp     5.300
9 1959-12-31  realgdp  2785.204
Out[20]:
iteminflrealgdpunemp
date   
1959-03-310.002710.3495.8
1959-06-302.342778.8015.1
1959-09-302.742775.4885.3
1959-12-310.272785.2045.6
1960-03-312.312847.6995.2
In [24]:
ldata['value2'] = np.random.randn(len(ldata))   # add a second value column
print(ldata[:10])
# FIX: keyword arguments for pivot() (positional form removed in pandas 2.0).
# With no 'values' argument, both value columns are kept under hierarchical columns.
pivoted = ldata.pivot(index='date', columns='item')
print(pivoted[:5])            # pivot table with MultiIndex columns
print(pivoted['value'][:5])   # select one of the value blocks
        date     item     value    value2
0 1959-03-31  realgdp  2710.349 -1.090362
1 1959-03-31     infl     0.000 -0.406879
2 1959-03-31    unemp     5.800  2.608990
3 1959-06-30  realgdp  2778.801 -1.557126
4 1959-06-30     infl     2.340 -1.500277
5 1959-06-30    unemp     5.100  0.975767
6 1959-09-30  realgdp  2775.488  1.186471
7 1959-09-30     infl     2.740 -1.040368
8 1959-09-30    unemp     5.300 -0.855401
9 1959-12-31  realgdp  2785.204  0.227901
           value                    value2                    
item        infl   realgdp unemp      infl   realgdp     unemp
date                                                          
1959-03-31  0.00  2710.349   5.8 -0.406879 -1.090362  2.608990
1959-06-30  2.34  2778.801   5.1 -1.500277 -1.557126  0.975767
1959-09-30  2.74  2775.488   5.3 -1.040368  1.186471 -0.855401
1959-12-31  0.27  2785.204   5.6 -0.715787  0.227901 -0.219652
1960-03-31  2.31  2847.699   5.2 -0.988710 -2.672743 -0.731635
item        infl   realgdp  unemp
date                             
1959-03-31  0.00  2710.349    5.8
1959-06-30  2.34  2778.801    5.1
1959-09-30  2.74  2775.488    5.3
1959-12-31  0.27  2785.204    5.6
1960-03-31  2.31  2847.699    5.2
In [29]:
# duplicated()/drop_duplicates(): detect and remove repeated rows.
data = DataFrame({'k1': ['one'] * 3 + ['two'] * 4,
                  'k2': [1, 1, 2, 3, 3, 4, 4]})
print(data)
print(data.duplicated())   # True for any row identical to an earlier one
data.drop_duplicates()     # keep the first occurrence of each row
    k1  k2
0  one   1
1  one   1
2  one   2
3  two   3
4  two   3
5  two   4
6  two   4
0    False
1     True
2    False
3    False
4     True
5    False
6     True
dtype: bool
Out[29]:
 k1k2
0one1
2one2
3two3
5two4
In [31]:
data['v1']=range(7)
data.drop_duplicates(['k1'])   # de-duplicate considering only column 'k1'; keeps the first occurrence
Out[31]:
 k1k2v1
0one10
3two33
In [33]:
# A column of food names with inconsistent capitalization.
foods = ['bacon', 'pulled pork', 'bacon', 'Pastrami',
         'corned beef', 'Bacon', 'pastrami', 'honey ham',
         'nova lox']
data = DataFrame({'food': foods,
                  'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data
Out[33]:
 foodounces
0bacon4.0
1pulled pork3.0
2bacon12.0
3Pastrami6.0
4corned beef7.5
5Bacon8.0
6pastrami3.0
7honey ham5.0
8nova lox6.0
In [40]:
# Mapping from (lower-cased) meat name to its source animal.
meat_to_animal = {
    'bacon': 'pig',
    'pulled pork': 'pig',
    'pastrami': 'cow',
    'corned beef': 'cow',
    'honey ham': 'pig',
    'nova lox': 'salmon',
}
meat_to_animal     # dictionary used with map() below
Out[40]:
{'bacon': 'pig',
 'corned beef': 'cow',
 'honey ham': 'pig',
 'nova lox': 'salmon',
 'pastrami': 'cow',
 'pulled pork': 'pig'}
In [41]:
# NOTE(review): 'data' is the food DataFrame from the cell above.
data['animal']=data['food'].map(str.lower).map(meat_to_animal)
print (data)
data['food'].map(lambda x:meat_to_animal[x.lower()])          # same result in one step with a lambda
          food  ounces  animal
0        bacon     4.0     pig
1  pulled pork     3.0     pig
2        bacon    12.0     pig
3     Pastrami     6.0     cow
4  corned beef     7.5     cow
5        Bacon     8.0     pig
6     pastrami     3.0     cow
7    honey ham     5.0     pig
8     nova lox     6.0  salmon
Out[41]:
0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object
In [43]:
#数据替换
data=Series([1,-999,2,-999,-1000,3])
data
data.replace(-999,np.nan)
Out[43]:
0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64
In [47]:
print(data.replace([-999,-1000],np.nan))    # list form: replace several values with one
print(data.replace({-999:np.nan,-1000:0}))  # dict form: a different replacement per value
0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64
0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64
In [49]:
# Transforming axis labels with Index.map.
data = DataFrame(np.arange(12).reshape((3, 4)),
                 index=['Ohio', 'Colorado', 'New York'],
                 columns=['Ohio', 'two', 'three', 'four'])
print(data)
data.index.map(str.upper)   # transformed index (not assigned back here)
          Ohio  two  three  four
Ohio         0    1      2     3
Colorado     4    5      6     7
New York     8    9     10    11
Out[49]:
Index(['OHIO', 'COLORADO', 'NEW YORK'], dtype='object')
In [50]:
data.rename(index=str.title,columns=str.upper)   # rename returns a transformed copy; the original is untouched
Out[50]:
          OHIO  TWO  THREE  FOUR
Ohio         0    1      2     3
Colorado     4    5      6     7
New York     8    9     10    11
In [52]:
data.rename(index={'Ohio':'INDIANA'},
           columns={'three':'peekaboo'})    # dicts update only the listed labels on each axis
Out[52]:
 Ohiotwopeekaboofour
INDIANA0123
Colorado4567
New York891011
In [53]:
# pd.cut with explicit bin edges.
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]   # hand-chosen interval boundaries
cats = pd.cut(ages, bins)      # Categorical of half-open intervals (lo, hi]
cats
Out[53]:
[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]
In [60]:
# FIX: Categorical.labels is deprecated (see the FutureWarning below) -- use .codes.
print(cats.codes)      # integer bin index (0..3) for each age
pd.value_counts(cats)  # occupancy count of each interval
[0 0 0 1 0 0 2 1 3 2 2 1]
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: 'labels' is deprecated. Use 'codes' instead
  """Entry point for launching an IPython kernel.
Out[60]:
(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64
In [61]:
group_names=['Youth','YoungAdult','MiddleAged','Senior']    # one human-readable label per bin
pd.cut(ages,bins,labels=group_names)
Out[61]:
[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]
In [65]:
data=np.random.rand(20)
cc=pd.cut(data,4,precision=2)
print(cc)
pd.value_counts(cc)        # note: with an integer bin count, the interval widths are equal (counts are not)
[(0.13, 0.33], (0.53, 0.72], (0.33, 0.53], (0.53, 0.72], (0.13, 0.33], ..., (0.53, 0.72], (0.53, 0.72], (0.33, 0.53], (0.33, 0.53], (0.33, 0.53]]
Length: 20
Categories (4, interval[float64]): [(0.13, 0.33] < (0.33, 0.53] < (0.53, 0.72] < (0.72, 0.92]]
Out[65]:
(0.53, 0.72]    7
(0.33, 0.53]    6
(0.13, 0.33]    4
(0.72, 0.92]    3
dtype: int64
In [68]:
data=np.random.randn(1000)
cats=pd.qcut(data,4)
print(cats)
pd.value_counts(cats)      # qcut cuts at sample quantiles, so every bucket holds roughly the same count
[(-3.137, -0.708], (0.664, 4.283], (0.0363, 0.664], (0.664, 4.283], (0.0363, 0.664], ..., (-0.708, 0.0363], (0.664, 4.283], (0.664, 4.283], (0.0363, 0.664], (-3.137, -0.708]]
Length: 1000
Categories (4, interval[float64]): [(-3.137, -0.708] < (-0.708, 0.0363] < (0.0363, 0.664] < (0.664, 4.283]]
Out[68]:
(0.664, 4.283]      250
(0.0363, 0.664]     250
(-0.708, 0.0363]    250
(-3.137, -0.708]    250
dtype: int64
In [69]:
#检测和过滤异常值
np.random.seed(12345)
data=DataFrame(np.random.randn(1000,4))
data.describe()
Out[69]:
 0123
count1000.0000001000.0000001000.0000001000.000000
mean-0.0676840.0679240.025598-0.002298
std0.9980350.9921061.0068350.996794
min-3.428254-3.548824-3.184377-3.745356
25%-0.774890-0.591841-0.641675-0.644144
50%-0.1164010.1011430.002073-0.013611
75%0.6163660.7802820.6803910.654328
max3.3666262.6536563.2603833.927528
In [72]:
col=data[3]      # rows where column 3 exceeds 3 in absolute value
print(col[np.abs(col)>3])
data[np.abs(col)>3]
97     3.927528
305   -3.399312
400   -3.745356
Name: 3, dtype: float64
Out[72]:
 0123
97-0.7743630.5529360.1060613.927528
305-2.3155550.457246-0.025907-3.399312
4000.1463260.508391-0.196713-3.745356
In [78]:
data[(np.abs(data)>3).any(1)]  # rows where ANY column exceeds 3 in absolute value
Out[78]:
 0123
5-0.5397410.4769853.248944-1.021228
97-0.7743630.5529360.1060613.927528
102-0.655054-0.5652303.1768730.959533
305-2.3155550.457246-0.025907-3.399312
3240.0501881.9513123.2603830.963301
4000.1463260.508391-0.196713-3.745356
499-0.293333-0.242459-3.0569901.918403
523-3.428254-0.296336-0.439938-0.867165
5860.2751441.179227-3.1843771.369891
808-0.362528-3.5488241.553205-2.186301
9003.366626-2.3722140.8510101.332846
In [82]:
data[np.abs(data)>3]=np.sign(data)*3   # np.sign yields -1 or 1, so outliers become exactly -3 or 3
data.describe()                        # values are now capped to the [-3, 3] range
Out[82]:
 0123
count1000.0000001000.0000001000.0000001000.000000
mean-0.0676230.0684730.025153-0.002081
std0.9954850.9902531.0039770.989736
min-3.000000-3.000000-3.000000-3.000000
25%-0.774890-0.591841-0.641675-0.644144
50%-0.1164010.1011430.002073-0.013611
75%0.6163660.7802820.6803910.654328
max3.0000002.6536563.0000003.000000
In [2]:
# Permutation and random sampling.
import pandas as pd

df = DataFrame(np.arange(5 * 4).reshape(5, 4))
sampler = np.random.permutation(5)   # a random ordering of 0..4
sampler
Out[2]:
array([0, 2, 4, 1, 3])
In [3]:
print(df)
print(df.take(sampler))     # reorder the rows by the permutation
    0   1   2   3
0   0   1   2   3
1   4   5   6   7
2   8   9  10  11
3  12  13  14  15
4  16  17  18  19
    0   1   2   3
0   0   1   2   3
2   8   9  10  11
4  16  17  18  19
1   4   5   6   7
3  12  13  14  15
In [5]:
df.take(np.random.permutation(len(df))[:3])   # one way to draw a random subset of rows!
Out[5]:
 0123
14567
312131415
416171819
In [6]:
#似乎完成了一个重采样的过程?
# Sampling with replacement: draw 10 random positions, then take().
bag = np.array([5, 7, -1, 6, 4])
sampler = np.random.randint(0, len(bag), size=10)
print(sampler)
draws = bag.take(sampler)   # bag values at the sampled positions
draws
[3 0 3 3 4 1 1 0 1 1]
Out[6]:
array([6, 5, 6, 6, 4, 7, 7, 5, 7, 7])
In [11]:
#哑变量
df=DataFrame({'key':['b','b','a','c','a','b'],
             'data1':np.random.randint(0,6,6)})
print(df)
pd.get_dummies(df['key'])   #索引填充!
   data1 key
0      3   b
1      0   b
2      2   a
3      4   c
4      4   a
5      2   b
Out[11]:
 abc
0010
1010
2100
3001
4100
5010
In [12]:
dummies=pd.get_dummies(df['key'],prefix='key')
df_with_dummy=df[['data1']].join(dummies)   # attach the prefixed indicator columns to the value column
df_with_dummy
Out[12]:
 data1key_akey_bkey_c
03010
10010
22100
34001
44100
52010
In [14]:
mnames = ['movie_id', 'title', 'genres']
# FIX: engine='python' silences the ParserWarning shown below -- the C engine
# does not support multi-character separators (they are treated as regex).
movies = pd.read_table('movies.dat', sep='::', header=None, names=mnames,
                       engine='python')
movies[:10]
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:2: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.
  
Out[14]:
 movie_idtitlegenres
01Toy Story (1995)Animation|Children's|Comedy
12Jumanji (1995)Adventure|Children's|Fantasy
23Grumpier Old Men (1995)Comedy|Romance
34Waiting to Exhale (1995)Comedy|Drama
45Father of the Bride Part II (1995)Comedy
56Heat (1995)Action|Crime|Thriller
67Sabrina (1995)Comedy|Romance
78Tom and Huck (1995)Adventure|Children's
89Sudden Death (1995)Action
910GoldenEye (1995)Action|Adventure|Thriller
In [16]:
genre_iter=(set(x.split('|')) for x in movies.genres)   # lazy generator of per-movie genre sets
print(genre_iter)
genres=sorted(set.union(*genre_iter))   # union of all the sets -> every distinct genre, sorted
print(genres)
<generator object <genexpr> at 0x000002243A180B48>
['Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
In [17]:
dummies = DataFrame(np.zeros((len(movies), len(genres))), columns=genres)
for i, gen in enumerate(movies.genres):   # enumerate yields (index, element) pairs
    # FIX: .ix is deprecated (see the warnings below) -- label-based .loc instead
    dummies.loc[i, gen.split('|')] = 1
movies_windic = movies.join(dummies.add_prefix('Genre_'))
movies_windic.iloc[0]                     # positional row access replaces .ix[0]
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:3: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  This is separate from the ipykernel package so we can avoid doing imports until
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:5: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """
Out[17]:
movie_id                                       1
title                           Toy Story (1995)
genres               Animation|Children's|Comedy
Genre_Action                                   0
Genre_Adventure                                0
Genre_Animation                                1
Genre_Children's                               1
Genre_Comedy                                   1
Genre_Crime                                    0
Genre_Documentary                              0
Genre_Drama                                    0
Genre_Fantasy                                  0
Genre_Film-Noir                                0
Genre_Horror                                   0
Genre_Musical                                  0
Genre_Mystery                                  0
Genre_Romance                                  0
Genre_Sci-Fi                                   0
Genre_Thriller                                 0
Genre_War                                      0
Genre_Western                                  0
Name: 0, dtype: object
In [18]:
#数据离散化
values=np.random.rand(10)
print(values)
bins=[0,0.2,0.4,0.6,0.8,1]
pd.cut(values,bins)
[ 0.75708896  0.6200304   0.72268342  0.95469938  0.40578312  0.34845387
  0.18576112  0.44983497  0.66012749  0.3210205 ]
Out[18]:
[(0.6, 0.8], (0.6, 0.8], (0.6, 0.8], (0.8, 1.0], (0.4, 0.6], (0.2, 0.4], (0.0, 0.2], (0.4, 0.6], (0.6, 0.8], (0.2, 0.4]]
Categories (5, interval[float64]): [(0.0, 0.2] < (0.2, 0.4] < (0.4, 0.6] < (0.6, 0.8] < (0.8, 1.0]]
In [19]:
pd.get_dummies(pd.cut(values,bins))     # dummy / one-hot encoding of each value's bin membership
Out[19]:
 (0.0, 0.2](0.2, 0.4](0.4, 0.6](0.6, 0.8](0.8, 1.0]
000010
100010
200010
300001
400100
501000
610000
700100
800010
901000
In [31]:
val = 'a,b,  guido'
print(val.split(','))
# str.strip removes surrounding whitespace (spaces, tabs, newlines).
pieces = [piece.strip() for piece in val.split(',')]
pieces
['a', 'b', '  guido']
Out[31]:
['a', 'b', 'guido']
In [35]:
first,second,third=pieces
print(first+'::'+second+'::'+third)         
print('::'.join(pieces))   # join is the idiomatic way to concatenate with a delimiter
val.count(',')            # count occurrences of a substring
a::b::guido
a::b::guido
Out[35]:
2
In [36]:
val.replace(',','::')   # str.replace substitutes every occurrence and returns a new string
Out[36]:
'a::b::  guido'
In [40]:
## re正则模块
## the re regular-expression module
import re

text = "foo bar\t baz \tqux"
# FIX: regex patterns should be raw strings -- '\s' in a normal string is an
# invalid escape sequence (SyntaxWarning on modern Python).
print(re.split(r'\s+', text))

regex = re.compile(r'\s+')    # compile once so the pattern can be reused
print(regex.split(text))
regex.findall(text)           # the matched whitespace runs themselves
['foo', 'bar', 'baz', 'qux']
['foo', 'bar', 'baz', 'qux']
Out[40]:
[' ', '\t ', ' \t']
In [41]:
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""
# A simple e-mail pattern; {2,4} allows a TLD of two to four letters.
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'
regex = re.compile(pattern, re.IGNORECASE)
regex.findall(text)   # every address in the text
Out[41]:
['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']
In [43]:
m=regex.search(text)   # search returns only the first match, as a Match object
print(m)
text[m.start():m.end()]   # slice the matched span out of the original text
<_sre.SRE_Match object; span=(5, 20), match='dave@google.com'>
Out[43]:
'dave@google.com'
In [50]:
print(regex.sub('REDACTED',text))   # sub replaces every match
# Parenthesized groups split each address into user, domain and suffix.
pattern=r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
regex=re.compile(pattern,flags=re.IGNORECASE)
m=regex.match('wesm@bright.net')
m
m.groups()   # groups() returns the captured parts as a tuple
Dave REDACTED
Steve REDACTED
Rob REDACTED
Ryan REDACTED

Out[50]:
('wesm', 'bright', 'net')
In [51]:
regex.findall(text)   # with capture groups, findall returns a list of tuples
Out[51]:
[('dave', 'google', 'com'),
 ('steve', 'gmail', 'com'),
 ('rob', 'gmail', 'com'),
 ('ryan', 'yahoo', 'com')]
In [54]:
# A Series of e-mail addresses with one missing value.
data = {'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com',
        'Rob': 'rob@gmail.com', 'Wes': np.nan}
data = Series(data)
print(data)
data.isnull()   # True only for the NaN entry
Dave     dave@google.com
Rob        rob@gmail.com
Steve    steve@gmail.com
Wes                  NaN
dtype: object
Out[54]:
Dave     False
Rob      False
Steve    False
Wes       True
dtype: bool
In [56]:
pattern   # the grouped pattern splits each address into three parts
data.str.findall(pattern,flags=re.IGNORECASE)   # vectorized string method; NaN entries pass through
Out[56]:
Dave     [(dave, google, com)]
Rob        [(rob, gmail, com)]
Steve    [(steve, gmail, com)]
Wes                        NaN
dtype: object
In [59]:
matches=data.str.match(pattern,flags=re.IGNORECASE)
print(matches)
print(matches.str.get(1))   # matches holds booleans here, so element access yields NaN (see output below)
matches.str[0]
Dave     True
Rob      True
Steve    True
Wes       NaN
dtype: object
Dave    NaN
Rob     NaN
Steve   NaN
Wes     NaN
dtype: float64
Out[59]:
Dave    NaN
Rob     NaN
Steve   NaN
Wes     NaN
dtype: float64

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值