pandas—清理、转换、合并、重塑

In [1]:
from pandas import *
df1=DataFrame({'key':['b','b','a','c','a','a','b'],
          'data1':range(7)})
df2=DataFrame({'key':['a','b','d'],
              'data2':range(3)})
print (df1)
print (df2)
   data1 key
0      0   b
1      1   b
2      2   a
3      3   c
4      4   a
5      5   a
6      6   b
   data2 key
0      0   a
1      1   b
2      2   d
In [3]:
merge(df1,df2,on='key')  # inner join ("glue") on the shared 'key' column
Out[3]:
   data1 key  data2
0      0   b      1
1      1   b      1
2      6   b      1
3      2   a      0
4      4   a      0
5      5   a      0
In [4]:
# merge when the key columns are named differently on the two sides.
df3 = DataFrame({'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                 'data1': range(7)})
df4 = DataFrame({'rkey': ['a', 'b', 'd'],
                 'data2': range(3)})
print(df3)
print(df4)
# Name each side's key explicitly with left_on / right_on.
merge(df3, df4, left_on='lkey', right_on='rkey')
   data1 lkey
0      0    b
1      1    b
2      2    a
3      3    c
4      4    a
5      5    a
6      6    b
   data2 rkey
0      0    a
1      1    b
2      2    d
Out[4]:
   data1 lkey  data2 rkey
0      0    b      1    b
1      1    b      1    b
2      6    b      1    b
3      2    a      0    a
4      4    a      0    a
5      5    a      0    a
In [5]:
merge(df1,df2,how='outer')   # outer join: union of keys from both sides, NaN where a side is missing
Out[5]:
 data1keydata2
00.0b1.0
11.0b1.0
26.0b1.0
32.0a0.0
44.0a0.0
55.0a0.0
63.0cNaN
7NaNd2.0
In [6]:
df1=DataFrame({'key':['b','b','a','c','a','b'],
              'data1':range(6)})
df2=DataFrame({'key':['a','b','a','b','d'],
              'data2':range(5)})
print (df1)
print (df2)
merge(df1,df2,on='key',how='left')   # Cartesian product of matching rows: 3 b's on the left x 2 b's on the right -> 6 b rows
   data1 key
0      0   b
1      1   b
2      2   a
3      3   c
4      4   a
5      5   b
   data2 key
0      0   a
1      1   b
2      2   a
3      3   b
4      4   d
Out[6]:
 data1keydata2
00b1.0
10b3.0
21b1.0
31b3.0
42a0.0
52a2.0
63cNaN
74a0.0
84a2.0
95b1.0
105b3.0
In [7]:
# merge on more than one key column.
left = DataFrame({'key1': ['foo', 'foo', 'bar'],
                  'key2': ['one', 'two', 'one'],
                  'lval': [1, 2, 3]})
right = DataFrame({'key1': ['foo', 'foo', 'bar', 'bar'],
                   'key2': ['one', 'one', 'one', 'two'],
                   'rval': [4, 5, 6, 7]})
print(left)
print(right)
# Rows pair up on the combined (key1, key2) value.
merge(left, right, on=['key1', 'key2'], how='outer')
  key1 key2  lval
0  foo  one     1
1  foo  two     2
2  bar  one     3
  key1 key2  rval
0  foo  one     4
1  foo  one     5
2  bar  one     6
3  bar  two     7
Out[7]:
 key1key2lvalrval
0fooone1.04.0
1fooone1.05.0
2footwo2.0NaN
3barone3.06.0
4bartwoNaN7.0
In [8]:
left1=DataFrame({'key':['a','b','a','a','b','c'],
                'value':range(6)})
right1=DataFrame({'group_val':[3.5,7]},index=['a','b'])
print(left1)
print(right1)
merge(left1,right1,left_on='key',right_index=True)  # use right1's index as the right-side join key
  key  value
0   a      0
1   b      1
2   a      2
3   a      3
4   b      4
5   c      5
   group_val
a        3.5
b        7.0
Out[8]:
 keyvaluegroup_val
0a03.5
2a23.5
3a33.5
1b17.0
4b47.0
In [9]:
# Merge two key columns on the left against a two-level MultiIndex on the right.
# NOTE(review): relies on np (numpy) being in scope -- numpy is never imported
# in this file as shown; confirm an earlier cell provides it.
lefth=DataFrame({'key1':['Ohio','Ohio','Ohio','Nevada','Nevada'],
                'key2':[2000,2001,2002,2001,2002],
                'data':np.arange(5)})
righth=DataFrame(np.arange(12).reshape((6,2)),
                index=[['Nevada','Nevada','Ohio','Ohio','Ohio','Ohio'],
                      [2001,2000,2000,2000,2001,2002]],
                columns=['event1','event2'])
print(lefth)
print(righth)
merge(lefth,righth,left_on=['key1','key2'],right_index=True)    # default how='inner': keep only keys present on both sides
   data    key1  key2
0     0    Ohio  2000
1     1    Ohio  2001
2     2    Ohio  2002
3     3  Nevada  2001
4     4  Nevada  2002
             event1  event2
Nevada 2001       0       1
       2000       2       3
Ohio   2000       4       5
       2000       6       7
       2001       8       9
       2002      10      11
Out[9]:
 datakey1key2event1event2
00Ohio200045
00Ohio200067
11Ohio200189
22Ohio20021011
33Nevada200101
In [10]:
# Index-on-index merge, and the equivalent join() shortcut.
left2 = DataFrame([[1, 2], [3, 4], [5, 6]],
                  index=['a', 'c', 'e'],
                  columns=['Ohio', 'Nevada'])
right2 = DataFrame([[7, 8], [9, 10], [11, 12], [13, 14]],
                   index=['b', 'c', 'd', 'e'],
                   columns=['Missouri', 'Alabama'])
print(left2)
print(right2)
merge(left2, right2, how='outer', left_index=True, right_index=True)
left2.join(right2, how='outer')   # join() defaults to joining on the index
   Ohio  Nevada
a     1       2
c     3       4
e     5       6
   Missouri  Alabama
b         7        8
c         9       10
d        11       12
e        13       14
Out[10]:
 OhioNevadaMissouriAlabama
a1.02.0NaNNaN
bNaNNaN7.08.0
c3.04.09.010.0
dNaNNaN11.012.0
e5.06.013.014.0
In [11]:
#concat连接
# concat: glue Series end-to-end (axis=0) or side-by-side (axis=1).
s1 = Series([0, 1], index=['a', 'b'])
s2 = Series([2, 3, 4], index=['c', 'd', 'e'])
s3 = Series([5, 6], index=['f', 'g'])
print(concat([s1, s2, s3]))

s4 = concat([s1 * 5, s3])
print(s4)
print(concat([s1, s4], axis=1))          # column-wise: align on the index, NaN where absent
concat([s1, s4], axis=1, join='inner')   # keep only index labels present in both
a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64
a    0
b    5
f    5
g    6
dtype: int64
     0  1
a  0.0  0
b  1.0  5
f  NaN  5
g  NaN  6
Out[11]:
 01
a00
b15
In [12]:
result=concat([s1,s1,s3],keys=['one','two','three']) # keys build the outer level of a hierarchical index
print(result)
result.unstack()   # spread the inner index level out into columns
one    a    0
       b    1
two    a    0
       b    1
three  f    5
       g    6
dtype: int64
Out[12]:
 abfg
one0.01.0NaNNaN
two0.01.0NaNNaN
threeNaNNaN5.06.0
In [13]:
# With axis=1, the keys become the column names.
print(concat([s1,s2,s3],axis=1,keys=['one','two','three']))
#concat([df1,df2],axis=1,keys=['level1','level2'],
#      names=['upper','lower'])
df1=DataFrame(np.random.randn(3,4),columns=['a','b','c','d'])
df2=DataFrame(np.random.randn(2,3),columns=['b','d','a'])
print(df1)
print(df2)
   one  two  three
a  0.0  NaN    NaN
b  1.0  NaN    NaN
c  NaN  2.0    NaN
d  NaN  3.0    NaN
e  NaN  4.0    NaN
f  NaN  NaN    5.0
g  NaN  NaN    6.0
          a         b         c         d
0  0.200428 -1.370206 -0.297848 -0.690504
1 -1.012528 -0.661101  0.823504  0.058414
2  1.122031 -0.180694  0.967949  0.892187
          b         d         a
0  0.215430  0.819751 -1.513330
1  1.029311  1.013309  0.777116
In [14]:
concat([df1,df2],ignore_index=True)  # discard both original row indices and renumber 0..n-1
Out[14]:
 abcd
00.200428-1.370206-0.297848-0.690504
1-1.012528-0.6611010.8235040.058414
21.122031-0.1806940.9679490.892187
3-1.5133300.215430NaN0.819751
40.7771161.029311NaN1.013309
In [15]:
#stack旋转
# stack/unstack: rotate between column labels and row-index levels.
# FIX: the bare name 'pandas' is never imported in this file (only
# 'from pandas import *' and 'import pandas as pd' later), so
# pandas.Index raised NameError -- use Index from the star import.
data = DataFrame(np.arange(6).reshape((2, 3)),
                 index=Index(['Ohio', 'Colorado'], name='state'),
                 columns=Index(['one', 'two', 'three'], name='number'))
print(data)
result = data.stack()    # pivot the columns into the inner row-index level
print(result)
type(result)             # result is a Series with a two-level MultiIndex
result.unstack()         # inverse: inner index level back to columns
number    one  two  three
state                    
Ohio        0    1      2
Colorado    3    4      5
state     number
Ohio      one       0
          two       1
          three     2
Colorado  one       3
          two       4
          three     5
dtype: int32
Out[15]:
numberonetwothree
state   
Ohio012
Colorado345
In [16]:
# unstack can move any index level, selected by position or by name.
print(result.unstack(0))
print(result.unstack('state'))
state   Ohio  Colorado
number                
one        0         3
two        1         4
three      2         5
state   Ohio  Colorado
number                
one        0         3
two        1         4
three      2         5
In [17]:
import pandas as pd

# Concatenating with keys makes a two-level index;
# unstack() then spreads the inner level into columns.
s1 = Series([0, 1, 2, 3], index=['a', 'b', 'c', 'd'])
s2 = Series([4, 5, 6], index=['c', 'd', 'e'])
data2 = pd.concat([s1, s2], keys=['one', 'two'])
print(data2)
data2.unstack()
one  a    0
     b    1
     c    2
     d    3
two  c    4
     d    5
     e    6
dtype: int64
Out[17]:
 abcde
one0.01.02.03.0NaN
twoNaNNaN4.05.06.0
In [18]:
data2.unstack().stack()   # stack drops the NaNs introduced by unstack, so the round-trip yields a float Series
Out[18]:
one  a    0.0
     b    1.0
     c    2.0
     d    3.0
two  c    4.0
     d    5.0
     e    6.0
dtype: float64
In [19]:
# Two value columns on the (state, number) MultiIndex; unstack one level.
df=DataFrame({'left':result,'right':result+5},columns=pd.Index(['left','right'],name='side'))   # hierarchical row index inherited from result
print(df)
df.unstack('state')
side             left  right
state    number             
Ohio     one        0      5
         two        1      6
         three      2      7
Colorado one        3      8
         two        4      9
         three      5     10
Out[19]:
sideleftright
stateOhioColoradoOhioColorado
number    
one0358
two1469
three25710
In [20]:
# Long ("stacked") to wide format using macrodata.csv.
data = pd.read_csv('macrodata.csv')
periods = pd.PeriodIndex(year=data.year, quarter=data.quarter, name='date')
data = DataFrame(data.to_records(),
                 columns=pd.Index(['realgdp', 'infl', 'unemp'], name='item'),
                 index=periods.to_timestamp('D', 'end'))

# One row per (date, item) pair -> long format.
ldata = data.stack().reset_index().rename(columns={0: 'value'})
# FIX: pivot() arguments are keyword-only in modern pandas (positional removed in 2.0).
wdata = ldata.pivot(index='date', columns='item', values='value')

print(ldata[:10])
pivoted = ldata.pivot(index='date', columns='item', values='value')   # date as rows, item as columns, value as the cell fill
pivoted.head()
        date     item     value
0 1959-03-31  realgdp  2710.349
1 1959-03-31     infl     0.000
2 1959-03-31    unemp     5.800
3 1959-06-30  realgdp  2778.801
4 1959-06-30     infl     2.340
5 1959-06-30    unemp     5.100
6 1959-09-30  realgdp  2775.488
7 1959-09-30     infl     2.740
8 1959-09-30    unemp     5.300
9 1959-12-31  realgdp  2785.204
Out[20]:
iteminflrealgdpunemp
date   
1959-03-310.002710.3495.8
1959-06-302.342778.8015.1
1959-09-302.742775.4885.3
1959-12-310.272785.2045.6
1960-03-312.312847.6995.2
In [24]:
ldata['value2'] = np.random.randn(len(ldata))   # add a second value column
print(ldata[:10])
# FIX: keyword arguments for pivot() (positional form removed in pandas 2.0).
# With no 'values' argument, both value columns are kept under hierarchical columns.
pivoted = ldata.pivot(index='date', columns='item')
print(pivoted[:5])            # pivot table with MultiIndex columns
print(pivoted['value'][:5])   # select one of the value blocks
        date     item     value    value2
0 1959-03-31  realgdp  2710.349 -1.090362
1 1959-03-31     infl     0.000 -0.406879
2 1959-03-31    unemp     5.800  2.608990
3 1959-06-30  realgdp  2778.801 -1.557126
4 1959-06-30     infl     2.340 -1.500277
5 1959-06-30    unemp     5.100  0.975767
6 1959-09-30  realgdp  2775.488  1.186471
7 1959-09-30     infl     2.740 -1.040368
8 1959-09-30    unemp     5.300 -0.855401
9 1959-12-31  realgdp  2785.204  0.227901
           value                    value2                    
item        infl   realgdp unemp      infl   realgdp     unemp
date                                                          
1959-03-31  0.00  2710.349   5.8 -0.406879 -1.090362  2.608990
1959-06-30  2.34  2778.801   5.1 -1.500277 -1.557126  0.975767
1959-09-30  2.74  2775.488   5.3 -1.040368  1.186471 -0.855401
1959-12-31  0.27  2785.204   5.6 -0.715787  0.227901 -0.219652
1960-03-31  2.31  2847.699   5.2 -0.988710 -2.672743 -0.731635
item        infl   realgdp  unemp
date                             
1959-03-31  0.00  2710.349    5.8
1959-06-30  2.34  2778.801    5.1
1959-09-30  2.74  2775.488    5.3
1959-12-31  0.27  2785.204    5.6
1960-03-31  2.31  2847.699    5.2
In [29]:
# duplicated()/drop_duplicates(): detect and remove repeated rows.
data = DataFrame({'k1': ['one'] * 3 + ['two'] * 4,
                  'k2': [1, 1, 2, 3, 3, 4, 4]})
print(data)
print(data.duplicated())   # True for any row identical to an earlier one
data.drop_duplicates()     # keep the first occurrence of each row
    k1  k2
0  one   1
1  one   1
2  one   2
3  two   3
4  two   3
5  two   4
6  two   4
0    False
1     True
2    False
3    False
4     True
5    False
6     True
dtype: bool
Out[29]:
 k1k2
0one1
2one2
3two3
5two4
In [31]:
data['v1']=range(7)
data.drop_duplicates(['k1'])   # de-duplicate considering only column 'k1'; keeps the first occurrence
Out[31]:
 k1k2v1
0one10
3two33
In [33]:
# A column of food names with inconsistent capitalization.
foods = ['bacon', 'pulled pork', 'bacon', 'Pastrami',
         'corned beef', 'Bacon', 'pastrami', 'honey ham',
         'nova lox']
data = DataFrame({'food': foods,
                  'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data
Out[33]:
 foodounces
0bacon4.0
1pulled pork3.0
2bacon12.0
3Pastrami6.0
4corned beef7.5
5Bacon8.0
6pastrami3.0
7honey ham5.0
8nova lox6.0
In [40]:
# Mapping from (lower-cased) meat name to its source animal.
meat_to_animal = {
    'bacon': 'pig',
    'pulled pork': 'pig',
    'pastrami': 'cow',
    'corned beef': 'cow',
    'honey ham': 'pig',
    'nova lox': 'salmon',
}
meat_to_animal     # dictionary used with map() below
Out[40]:
{'bacon': 'pig',
 'corned beef': 'cow',
 'honey ham': 'pig',
 'nova lox': 'salmon',
 'pastrami': 'cow',
 'pulled pork': 'pig'}
In [41]:
# NOTE(review): 'data' is the food DataFrame from the cell above.
data['animal']=data['food'].map(str.lower).map(meat_to_animal)
print (data)
data['food'].map(lambda x:meat_to_animal[x.lower()])          # same result in one step with a lambda
          food  ounces  animal
0        bacon     4.0     pig
1  pulled pork     3.0     pig
2        bacon    12.0     pig
3     Pastrami     6.0     cow
4  corned beef     7.5     cow
5        Bacon     8.0     pig
6     pastrami     3.0     cow
7    honey ham     5.0     pig
8     nova lox     6.0  salmon
Out[41]:
0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object
In [43]:
#数据替换
data=Series([1,-999,2,-999,-1000,3])
data
data.replace(-999,np.nan)
Out[43]:
0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64
In [47]:
print(data.replace([-999,-1000],np.nan))    # list form: replace several values with one
print(data.replace({-999:np.nan,-1000:0}))  # dict form: a different replacement per value
0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64
0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64
In [49]:
# Transforming axis labels with Index.map.
data = DataFrame(np.arange(12).reshape((3, 4)),
                 index=['Ohio', 'Colorado', 'New York'],
                 columns=['Ohio', 'two', 'three', 'four'])
print(data)
data.index.map(str.upper)   # transformed index (not assigned back here)
          Ohio  two  three  four
Ohio         0    1      2     3
Colorado     4    5      6     7
New York     8    9     10    11
Out[49]:
Index(['OHIO', 'COLORADO', 'NEW YORK'], dtype='object')
In [50]:
data.rename(index=str.title,columns=str.upper)   # rename returns a transformed copy; the original is untouched
Out[50]:
          OHIO  TWO  THREE  FOUR
Ohio         0    1      2     3
Colorado     4    5      6     7
New York     8    9     10    11
In [52]:
data.rename(index={'Ohio':'INDIANA'},
           columns={'three':'peekaboo'})    # dicts update only the listed labels on each axis
Out[52]:
 Ohiotwopeekaboofour
INDIANA0123
Colorado4567
New York891011
In [53]:
# pd.cut with explicit bin edges.
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]   # hand-chosen interval boundaries
cats = pd.cut(ages, bins)      # Categorical of half-open intervals (lo, hi]
cats
Out[53]:
[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]
In [60]:
# FIX: Categorical.labels is deprecated (see the FutureWarning below) -- use .codes.
print(cats.codes)      # integer bin index (0..3) for each age
pd.value_counts(cats)  # occupancy count of each interval
[0 0 0 1 0 0 2 1 3 2 2 1]
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: 'labels' is deprecated. Use 'codes' instead
  """Entry point for launching an IPython kernel.
Out[60]:
(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64
In [61]:
group_names=['Youth','YoungAdult','MiddleAged','Senior']    # one human-readable label per bin
pd.cut(ages,bins,labels=group_names)
Out[61]:
[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]
In [65]:
data=np.random.rand(20)
cc=pd.cut(data,4,precision=2)
print(cc)
pd.value_counts(cc)        # note: with an integer bin count, the interval widths are equal (counts are not)
[(0.13, 0.33], (0.53, 0.72], (0.33, 0.53], (0.53, 0.72], (0.13, 0.33], ..., (0.53, 0.72], (0.53, 0.72], (0.33, 0.53], (0.33, 0.53], (0.33, 0.53]]
Length: 20
Categories (4, interval[float64]): [(0.13, 0.33] < (0.33, 0.53] < (0.53, 0.72] < (0.72, 0.92]]
Out[65]:
(0.53, 0.72]    7
(0.33, 0.53]    6
(0.13, 0.33]    4
(0.72, 0.92]    3
dtype: int64
In [68]:
data=np.random.randn(1000)
cats=pd.qcut(data,4)
print(cats)
pd.value_counts(cats)      # qcut cuts at sample quantiles, so every bucket holds roughly the same count
[(-3.137, -0.708], (0.664, 4.283], (0.0363, 0.664], (0.664, 4.283], (0.0363, 0.664], ..., (-0.708, 0.0363], (0.664, 4.283], (0.664, 4.283], (0.0363, 0.664], (-3.137, -0.708]]
Length: 1000
Categories (4, interval[float64]): [(-3.137, -0.708] < (-0.708, 0.0363] < (0.0363, 0.664] < (0.664, 4.283]]
Out[68]:
(0.664, 4.283]      250
(0.0363, 0.664]     250
(-0.708, 0.0363]    250
(-3.137, -0.708]    250
dtype: int64
In [69]:
#检测和过滤异常值
np.random.seed(12345)
data=DataFrame(np.random.randn(1000,4))
data.describe()
Out[69]:
 0123
count1000.0000001000.0000001000.0000001000.000000
mean-0.0676840.0679240.025598-0.002298
std0.9980350.9921061.0068350.996794
min-3.428254-3.548824-3.184377-3.745356
25%-0.774890-0.591841-0.641675-0.644144
50%-0.1164010.1011430.002073-0.013611
75%0.6163660.7802820.6803910.654328
max3.3666262.6536563.2603833.927528
In [72]:
col=data[3]      # rows where column 3 exceeds 3 in absolute value
print(col[np.abs(col)>3])
data[np.abs(col)>3]
97     3.927528
305   -3.399312
400   -3.745356
Name: 3, dtype: float64
Out[72]:
 0123
97-0.7743630.5529360.1060613.927528
305-2.3155550.457246-0.025907-3.399312
4000.1463260.508391-0.196713-3.745356
In [78]:
data[(np.abs(data)>3).any(1)]  # rows where ANY column exceeds 3 in absolute value
Out[78]:
 0123
5-0.5397410.4769853.248944-1.021228
97-0.7743630.5529360.1060613.927528
102-0.655054-0.5652303.1768730.959533
305-2.3155550.457246-0.025907-3.399312
3240.0501881.9513123.2603830.963301
4000.1463260.508391-0.196713-3.745356
499-0.293333-0.242459-3.0569901.918403
523-3.428254-0.296336-0.439938-0.867165
5860.2751441.179227-3.1843771.369891
808-0.362528-3.5488241.553205-2.186301
9003.366626-2.3722140.8510101.332846
In [82]:
data[np.abs(data)>3]=np.sign(data)*3   # np.sign yields -1 or 1, so outliers become exactly -3 or 3
data.describe()                        # values are now capped to the [-3, 3] range
Out[82]:
 0123
count1000.0000001000.0000001000.0000001000.000000
mean-0.0676230.0684730.025153-0.002081
std0.9954850.9902531.0039770.989736
min-3.000000-3.000000-3.000000-3.000000
25%-0.774890-0.591841-0.641675-0.644144
50%-0.1164010.1011430.002073-0.013611
75%0.6163660.7802820.6803910.654328
max3.0000002.6536563.0000003.000000
In [2]:
# Permutation and random sampling.
import pandas as pd

df = DataFrame(np.arange(5 * 4).reshape(5, 4))
sampler = np.random.permutation(5)   # a random ordering of 0..4
sampler
Out[2]:
array([0, 2, 4, 1, 3])
In [3]:
print(df)
print(df.take(sampler))     # reorder the rows by the permutation
    0   1   2   3
0   0   1   2   3
1   4   5   6   7
2   8   9  10  11
3  12  13  14  15
4  16  17  18  19
    0   1   2   3
0   0   1   2   3
2   8   9  10  11
4  16  17  18  19
1   4   5   6   7
3  12  13  14  15
In [5]:
df.take(np.random.permutation(len(df))[:3])   # one way to draw a random subset of rows!
Out[5]:
 0123
14567
312131415
416171819
In [6]:
#似乎完成了一个重采样的过程?
# Sampling with replacement: draw 10 random positions, then take().
bag = np.array([5, 7, -1, 6, 4])
sampler = np.random.randint(0, len(bag), size=10)
print(sampler)
draws = bag.take(sampler)   # bag values at the sampled positions
draws
[3 0 3 3 4 1 1 0 1 1]
Out[6]:
array([6, 5, 6, 6, 4, 7, 7, 5, 7, 7])
In [11]:
#哑变量
df=DataFrame({'key':['b','b','a','c','a','b'],
             'data1':np.random.randint(0,6,6)})
print(df)
pd.get_dummies(df['key'])   #索引填充!
   data1 key
0      3   b
1      0   b
2      2   a
3      4   c
4      4   a
5      2   b
Out[11]:
 abc
0010
1010
2100
3001
4100
5010
In [12]:
dummies=pd.get_dummies(df['key'],prefix='key')
df_with_dummy=df[['data1']].join(dummies)   # attach the prefixed indicator columns to the value column
df_with_dummy
Out[12]:
 data1key_akey_bkey_c
03010
10010
22100
34001
44100
52010
In [14]:
mnames = ['movie_id', 'title', 'genres']
# FIX: engine='python' silences the ParserWarning shown below -- the C engine
# does not support multi-character separators (they are treated as regex).
movies = pd.read_table('movies.dat', sep='::', header=None, names=mnames,
                       engine='python')
movies[:10]
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:2: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.
  
Out[14]:
 movie_idtitlegenres
01Toy Story (1995)Animation|Children's|Comedy
12Jumanji (1995)Adventure|Children's|Fantasy
23Grumpier Old Men (1995)Comedy|Romance
34Waiting to Exhale (1995)Comedy|Drama
45Father of the Bride Part II (1995)Comedy
56Heat (1995)Action|Crime|Thriller
67Sabrina (1995)Comedy|Romance
78Tom and Huck (1995)Adventure|Children's
89Sudden Death (1995)Action
910GoldenEye (1995)Action|Adventure|Thriller
In [16]:
genre_iter=(set(x.split('|')) for x in movies.genres)   # lazy generator of per-movie genre sets
print(genre_iter)
genres=sorted(set.union(*genre_iter))   # union of all the sets -> every distinct genre, sorted
print(genres)
<generator object <genexpr> at 0x000002243A180B48>
['Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
In [17]:
dummies = DataFrame(np.zeros((len(movies), len(genres))), columns=genres)
for i, gen in enumerate(movies.genres):   # enumerate yields (index, element) pairs
    # FIX: .ix is deprecated (see the warnings below) -- label-based .loc instead
    dummies.loc[i, gen.split('|')] = 1
movies_windic = movies.join(dummies.add_prefix('Genre_'))
movies_windic.iloc[0]                     # positional row access replaces .ix[0]
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:3: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  This is separate from the ipykernel package so we can avoid doing imports until
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:5: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """
Out[17]:
movie_id                                       1
title                           Toy Story (1995)
genres               Animation|Children's|Comedy
Genre_Action                                   0
Genre_Adventure                                0
Genre_Animation                                1
Genre_Children's                               1
Genre_Comedy                                   1
Genre_Crime                                    0
Genre_Documentary                              0
Genre_Drama                                    0
Genre_Fantasy                                  0
Genre_Film-Noir                                0
Genre_Horror                                   0
Genre_Musical                                  0
Genre_Mystery                                  0
Genre_Romance                                  0
Genre_Sci-Fi                                   0
Genre_Thriller                                 0
Genre_War                                      0
Genre_Western                                  0
Name: 0, dtype: object
In [18]:
#数据离散化
values=np.random.rand(10)
print(values)
bins=[0,0.2,0.4,0.6,0.8,1]
pd.cut(values,bins)
[ 0.75708896  0.6200304   0.72268342  0.95469938  0.40578312  0.34845387
  0.18576112  0.44983497  0.66012749  0.3210205 ]
Out[18]:
[(0.6, 0.8], (0.6, 0.8], (0.6, 0.8], (0.8, 1.0], (0.4, 0.6], (0.2, 0.4], (0.0, 0.2], (0.4, 0.6], (0.6, 0.8], (0.2, 0.4]]
Categories (5, interval[float64]): [(0.0, 0.2] < (0.2, 0.4] < (0.4, 0.6] < (0.6, 0.8] < (0.8, 1.0]]
In [19]:
pd.get_dummies(pd.cut(values,bins))     # dummy / one-hot encoding of each value's bin membership
Out[19]:
 (0.0, 0.2](0.2, 0.4](0.4, 0.6](0.6, 0.8](0.8, 1.0]
000010
100010
200010
300001
400100
501000
610000
700100
800010
901000
In [31]:
val = 'a,b,  guido'
print(val.split(','))
# str.strip removes surrounding whitespace (spaces, tabs, newlines).
pieces = [piece.strip() for piece in val.split(',')]
pieces
['a', 'b', '  guido']
Out[31]:
['a', 'b', 'guido']
In [35]:
first,second,third=pieces
print(first+'::'+second+'::'+third)         
print('::'.join(pieces))   # join is the idiomatic way to concatenate with a delimiter
val.count(',')            # count occurrences of a substring
a::b::guido
a::b::guido
Out[35]:
2
In [36]:
val.replace(',','::')   # str.replace substitutes every occurrence and returns a new string
Out[36]:
'a::b::  guido'
In [40]:
## re正则模块
## the re regular-expression module
import re

text = "foo bar\t baz \tqux"
# FIX: regex patterns should be raw strings -- '\s' in a normal string is an
# invalid escape sequence (SyntaxWarning on modern Python).
print(re.split(r'\s+', text))

regex = re.compile(r'\s+')    # compile once so the pattern can be reused
print(regex.split(text))
regex.findall(text)           # the matched whitespace runs themselves
['foo', 'bar', 'baz', 'qux']
['foo', 'bar', 'baz', 'qux']
Out[40]:
[' ', '\t ', ' \t']
In [41]:
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""
# A simple e-mail pattern; {2,4} allows a TLD of two to four letters.
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'
regex = re.compile(pattern, re.IGNORECASE)
regex.findall(text)   # every address in the text
Out[41]:
['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']
In [43]:
m=regex.search(text)   # search returns only the first match, as a Match object
print(m)
text[m.start():m.end()]   # slice the matched span out of the original text
<_sre.SRE_Match object; span=(5, 20), match='dave@google.com'>
Out[43]:
'dave@google.com'
In [50]:
print(regex.sub('REDACTED',text))   # sub replaces every match
# Parenthesized groups split each address into user, domain and suffix.
pattern=r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
regex=re.compile(pattern,flags=re.IGNORECASE)
m=regex.match('wesm@bright.net')
m
m.groups()   # groups() returns the captured parts as a tuple
Dave REDACTED
Steve REDACTED
Rob REDACTED
Ryan REDACTED

Out[50]:
('wesm', 'bright', 'net')
In [51]:
regex.findall(text)   # with capture groups, findall returns a list of tuples
Out[51]:
[('dave', 'google', 'com'),
 ('steve', 'gmail', 'com'),
 ('rob', 'gmail', 'com'),
 ('ryan', 'yahoo', 'com')]
In [54]:
# A Series of e-mail addresses with one missing value.
data = {'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com',
        'Rob': 'rob@gmail.com', 'Wes': np.nan}
data = Series(data)
print(data)
data.isnull()   # True only for the NaN entry
Dave     dave@google.com
Rob        rob@gmail.com
Steve    steve@gmail.com
Wes                  NaN
dtype: object
Out[54]:
Dave     False
Rob      False
Steve    False
Wes       True
dtype: bool
In [56]:
pattern   # the grouped pattern splits each address into three parts
data.str.findall(pattern,flags=re.IGNORECASE)   # vectorized string method; NaN entries pass through
Out[56]:
Dave     [(dave, google, com)]
Rob        [(rob, gmail, com)]
Steve    [(steve, gmail, com)]
Wes                        NaN
dtype: object
In [59]:
matches=data.str.match(pattern,flags=re.IGNORECASE)
print(matches)
print(matches.str.get(1))   # matches holds booleans here, so element access yields NaN (see output below)
matches.str[0]
Dave     True
Rob      True
Steve    True
Wes       NaN
dtype: object
Dave    NaN
Rob     NaN
Steve   NaN
Wes     NaN
dtype: float64
Out[59]:
Dave    NaN
Rob     NaN
Steve   NaN
Wes     NaN
dtype: float64

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值