pandas常用操作

最新推荐文章于 2024-08-11 16:05:35 发布

wangquannuaa

最新推荐文章于 2024-08-11 16:05:35 发布

阅读量1.8k

点赞数

分类专栏： python

本文链接：https://blog.csdn.net/wangquannuaa/article/details/46973925

版权

python 专栏收录该内容

41 篇文章 0 订阅

订阅专栏

In [6]: dates = pd.date_range(’20130101’,periods=6)
In [8]: df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=list(’ABCD’))
In [9]: df
Out[9]:
                  A         B         C         D
2013-01-01 0.469112 -0.282863 -1.509059 -1.135632
2013-01-02 1.212112 -0.173215 0.119209 -1.044236
2013-01-03 -0.861849 -2.104569 -0.494929 1.071804
2013-01-04 0.721555 -0.706771 -1.039575 0.271860
2013-01-05 -0.424972 0.567020 0.276232 -1.087401
2013-01-06 -0.673690 0.113648 -1.478427 0.524988

df.values

df.sort_index(axis=1, ascending=False)

df.sort(columns=’B’)

df[’A’]
df[0:3]
df[’20130102’:’20130104’]
df.loc[:,[’A’,’B’]]
df.loc[’20130102’:’20130104’,[’A’,’B’]]
df.iloc[3]
df.iloc[3:5,0:2]
df.iloc[[1,2,4],[0,2]]

df1 = df.reindex(index=dates[0:4],columns=list(df.columns) + [’E’])

df.mean()
df.mean(1)
df.apply(lambda x: x.max() - x.min())

In [77]: left = pd.DataFrame({’key’: [’foo’, ’foo’], ’lval’: [1, 2]})
In [78]: right = pd.DataFrame({’key’: [’foo’, ’foo’], ’rval’: [4, 5]})
In [79]: left
Out[79]:
  key lval
0 foo 1
1 foo 2
In [80]: right
Out[80]:
  key rval
0 foo 4
1 foo 5
In [81]: pd.merge(left, right, on=’key’)
Out[81]:
key lval rval
0 foo 1 4
1 foo 1 5
2 foo 2 4
3 foo 2 5

df.append(s, ignore_index=True)

In [87]: df
Out[87]:
    A   B         C         D
0 foo one -1.202872 -0.055224
1 bar one -1.814470 2.395985
2 foo two 1.018601 1.552825
3 bar three -0.595447 0.166599
4 foo two 1.395433 0.047609
5 bar two -0.392670 -0.136473
6 foo one 0.007207 -0.561757
7 foo three 1.928123 -1.623033
In [88]: df.groupby(’A’).sum()
Out[88]:
            C       D
A
bar -2.802588 2.42611
foo 3.146492 -0.63958
In [89]: df.groupby([’A’,’B’]).sum()
Out[89]:
                  C        D
A B
bar one   -1.814470 2.395985
    three -0.595447 0.166599
    two   -0.392670 -0.136473
foo one   -1.195665 -0.616981
    three 1.928123 -1.623033
    two   2.414034 1.600434

In [101]: df
Out[101]:
    A B   C        D         E
0 one A foo 1.418757 -0.179666
1 one B foo -1.879024 1.291836
2 two C foo 0.536826 -0.009614
3 three A bar 1.006160 0.392149
4 one B bar -0.029716 0.264599
5 one C bar -1.146178 -0.057409
6 two A foo 0.100900 -1.425638
7 three B foo -1.035018 1.024098
8 one C foo 0.314665 -0.106062
9 one A bar -0.773723 1.824375
10 two B bar -1.170653 0.595974
11 three C bar 0.648740 1.167115
In [102]: pd.pivot_table(df, values=’D’, index=[’A’, ’B’], columns=[’C’])
Out[102]:
C             bar      foo
A     B
one   A -0.773723 1.418757
      B -0.029716 -1.879024
      C -1.146178 0.314665
three A 1.006160 NaN
      B NaN -1.035018
      C 0.648740 NaN
two   A NaN 0.100900
      B -1.170653 NaN
      C NaN 0.536826

# List to hold file names
FileNames = []

# Your path will be different, please modify the path below.
#os.chdir()函数功能：改变当前路径到指定路径
os.chdir(r"C:\Users\david\notebooks\pandas")

# Find any file that ends with ".xlsx"
for files in os.listdir("."):
    if files.endswith(".xlsx"):
        FileNames.append(files)

FileNames
def GetFile(fnombre):

    # Path to excel file
    # Your path will be different, please modify the path below.
    location = r'C:\Users\david\notebooks\pandas\\' + fnombre

    # Parse the excel file
    # 0 = first sheet
    df = pd.read_excel(location, 0)

    # Tag record to file name
    df['File'] = fnombre

    # Make the "File" column the index of the df
    return df.set_index(['File'])
# Create a list of dataframes
df_list = [GetFile(fname) for fname in FileNames]
# Combine all of the dataframes into one
big_df = pd.concat(df_list)

df
 One_X One_Y Two_X Two_Y row
0 1.1   1.2   1.11   1.22  0
1 1.1   1.2   1.11   1.22  1
2 1.1   1.2   1.11   1.22  2
In [60]: df = df.set_index(’row’);df
Out[60]:
   One_X One_Y Two_X Two_Y
row
0   1.1   1.2   1.11  1.22
1   1.1   1.2   1.11  1.22
2   1.1   1.2   1.11  1.22

In [61]: df.columns = pd.MultiIndex.from_tuples([tuple(c.split(’_’)) for c in df.columns]);df
Out[61]:
      One     Two
     X   Y    X   Y
row
0   1.1 1.2 1.11 1.22
1   1.1 1.2 1.11 1.22
2   1.1 1.2 1.11 1.22

In [62]: df = df.stack(0).reset_index(1);df
Out[62]:
   level_1  X   Y
row
0   One   1.10 1.20
0   Two   1.11 1.22
1   One   1.10 1.20
1   Two   1.11 1.22
2   One   1.10 1.20
2   Two   1.11 1.22

In [63]: df.columns = [’Sample’,’All_X’,’All_Y’];df
Out[63]:
   Sample All_X All_Y
row
0   One   1.10   1.20
0   Two   1.11   1.22
1   One   1.10   1.20
1   Two   1.11   1.22
2   One   1.10   1.20
2   Two   1.11   1.22

In [64]: cols = pd.MultiIndex.from_tuples([ (x,y) for x in [’A’,’B’,’C’] for y in [’O’,’I’]])
In [65]: df = pd.DataFrame(np.random.randn(2,6),index=[’n’,’m’],columns=cols); df
Out[65]:
     A      B      C
     O  I   O  I   O  I
n 1.920906 -0.388231 -2.314394 0.665508 0.402562 0.399555
m -1.765956 0.850423 0.388054 0.992312 0.744086 -0.739776

In [72]: index = list(itertools.product([’Ada’,’Quinn’,’Violet’],[’Comp’,’Math’,’Sci’]))
In [73]: headr = list(itertools.product([’Exams’,’Labs’],[’I’,’II’]))
In [74]: indx = pd.MultiIndex.from_tuples(index,names=[’Student’,’Course’])
In [75]: cols = pd.MultiIndex.from_tuples(headr) #Notice these are un-named
In [76]: data = [[70+x+y+(x*y)%3 for x in range(4)] for y in range(9)]
In [77]: df = pd.DataFrame(data,indx,cols); df
Out[77]:
                Exams    Labs
                I   II   I   II
Student Course
Ada     Comp    70  71   72  73
        Math    71  73   75  74
        Sci     72  75   75  75
Quinn   Comp    73  74   75  76
        Math    74  76   78  77
        Sci     75  78   78  78
Violet  Comp    76  77   78  79
        Math    77  79   81  80
        Sci     78  81   81  81

In [78]: All = slice(None)
In [79]: df.loc[’Violet’]
Out[79]:
      Exams   Labs
      I   II   I   II
Course
Comp  76  77   78  79
Math  77  79   81  80
Sci   78  81   81  81

In [80]: df.loc[(All,’Math’),All]
Out[80]:
              Exams   Labs
              I  II   I  II
Student Course
Ada     Math  71 73   75 74
Quinn   Math  74 76   78 77
Violet  Math  77 79   81 80

In [81]: df.loc[(slice(’Ada’,’Quinn’),’Math’),All]
Out[81]:
              Exams   Labs
              I  II   I  II
Student Course
Ada     Math  71 73   75 74
Quinn   Math  74 76   78 77

In [82]: df.loc[(All,’Math’),(’Exams’)]
Out[82]:
              I  II
Student Course
Ada     Math  71 73
Quinn   Math  74 76
Violet  Math  77 79

In [83]: df.loc[(All,’Math’),(All,’II’)]
Out[83]:
              Exams Labs
              II    II
Student Course
Ada     Math  73    74
Quinn   Math  76    77
Violet  Math  79    80

df = pd.DataFrame({’animal’: ’cat dog cat fish dog cat cat’.split(),
....: ’size’: list(’SSMMMLL’),
....: ’weight’: [8, 10, 11, 1, 20, 12, 12],
....: ’adult’ : [False] * 5 + [True] * 2});
df
  adult animal size weight
0 False cat      S    8
1 False dog      S   10
2 False cat      M   11
3 False fish     M    1
4 False dog      M   20
5 True  cat      L   12
6 True  cat      L   12
#List the size of the animals with the highest weight.
In [90]: df.groupby(’animal’).apply(lambda subf: subf[’size’][subf[’weight’].idxmax()])
Out[90]:
animal
cat L
dog M
fish M
dtype: object
In [91]: gb = df.groupby([’animal’])
In [92]: gb.get_group(’cat’)
Out[92]:
  adult animal size weight
0 False cat     S     8
2 False cat     M     11
5 True  cat     L     12
6 True  cat     L     12

Apply to different items in a group
In [93]: def GrowUp(x):
....: avg_weight = sum(x[x.size == ’S’].weight * 1.5)
....: avg_weight += sum(x[x.size == ’M’].weight * 1.25)
....: avg_weight += sum(x[x.size == ’L’].weight)
....: avg_weight = avg_weight / len(x)
....: return pd.Series([’L’,avg_weight,True], index=[’size’, ’weight’, ’adult’])
....:
In [94]: expected_df = gb.apply(GrowUp)
In [95]: expected_df
Out[95]:
      size weight  adult
animal
cat     L  12.4375 True
dog     L  20.0000 True
fish    L  1.2500  True

Create a value counts column and reassign back to the DataFrame
In [115]: df = pd.DataFrame({’Color’: ’Red Red Red Blue’.split(),
.....: ’Value’: [100, 150, 50, 50]}); df
.....:
Out[115]:
  Color Value
0 Red   100
1 Red   150
2 Red   50
3 Blue  50
In [116]: df[’Counts’] = df.groupby([’Color’]).transform(len)
In [117]: df
Out[117]:
  Color Value Counts
0 Red   100   3
1 Red   150   3
2 Red   50    3
3 Blue  50    1

Rolling Apply to multiple columns where function returns a Scalar (Volume Weighted Average Price)
In [138]: rng = pd.date_range(start = ’2014-01-01’,periods = 100)
In [139]: df = pd.DataFrame({’Open’ : np.random.randn(len(rng)),
.....: ’Close’ : np.random.randn(len(rng)),’Volume’ : np.random.randint(100,2000,len(rng))}, index=rng)
df
           Close    Open     Volume
2014-01-01 1.550590 0.458513 1371
2014-01-02 -0.818812 -0.508850 1433
2014-01-03 1.160619 0.257610 645
2014-01-04 0.081521 -1.773393 878
2014-01-05 1.083284 -0.560676 1143
2014-01-06 -0.518721 0.284174 1088
2014-01-07 0.140661 1.146889 1722
... ... ... ...
2014-04-04 0.458193 -0.669474 1768
2014-04-05 0.108502 -1.616315 836
2014-04-06 1.418082 -1.294906 694
2014-04-07 0.486530 1.171647 796
2014-04-08 0.181885 0.501639 265
2014-04-09 -0.707238 -0.361868 1293
2014-04-10 1.211432 1.564429 1088
In [140]: def vwap(bars): return ((bars.Close*bars.Volume).sum()/bars.Volume.sum()).round(2)
In [141]: window = 5
In [142]: s = pd.concat([ (pd.Series(vwap(df.iloc[i:i+window]), index=[df.index[i+window]])) for i in range(95)])
Out[142]:
2014-01-06 0.55
2014-01-07 0.06
2014-01-08 0.32
2014-01-09 0.03
2014-01-10 0.08
...
2014-04-05 0.48
2014-04-06 0.54
2014-04-07 0.46
2014-04-08 0.45
2014-04-09 0.53
2014-04-10 0.15
Length: 95

Replacing some values with mean of the rest of a group
In [100]: df = pd.DataFrame({’A’ : [1, 1, 2, 2], ’B’ : [1, -1, 1, 2]})
In [101]: gb = df.groupby(’A’)
In [102]: def replace(g):
.....: mask = g < 0
.....: g.loc[mask] = g[~mask].mean()
.....: return g
In [103]: gb.transform(replace)
Out[103]:
B
0 1
1 1
2 1
3 2

Sort groups by aggregated data
In [104]: df = pd.DataFrame({’code’: [’foo’, ’bar’, ’baz’] * 2,
.....: ’data’: [0.16, -0.21, 0.33, 0.45, -0.59, 0.62],
.....: ’flag’: [False, True] * 3})
.....:
In [105]: code_groups = df.groupby(’code’)
In [106]: agg_n_sort_order = code_groups[[’data’]].transform(sum).sort(’data’)
In [107]: sorted_df = df.ix[agg_n_sort_order.index]
In [108]: sorted_df
Out[108]:
  code data flag
1 bar -0.21 True
4 bar -0.59 False
0 foo 0.16 False
3 foo 0.45 True
2 baz 0.33 False
5 baz 0.62 True

Create multiple aggregated columns
In [109]: rng = pd.date_range(start="2014-10-07",periods=10,freq=’2min’)
In [110]: ts = pd.Series(data = list(range(10)), index = rng)
In [111]: def MyCust(x):
.....: if len(x) > 2:
.....: return x[1] * 1.234
.....: return pd.NaT
In [112]: mhc = {’Mean’ : np.mean, ’Max’ : np.max, ’Custom’ : MyCust}
In [113]: ts.resample("5min",how = mhc)
Out[113]:
                   Max Custom Mean
2014-10-07 00:00:00 2  1.234  1.0
2014-10-07 00:05:00 4  NaN    3.5
2014-10-07 00:10:00 7  7.404  6.0
2014-10-07 00:15:00 9  NaN    8.5
In [114]: ts
Out[114]:
2014-10-07 00:00:00 0
2014-10-07 00:02:00 1
2014-10-07 00:04:00 2
2014-10-07 00:06:00 3
2014-10-07 00:08:00 4
2014-10-07 00:10:00 5
2014-10-07 00:12:00 6
2014-10-07 00:14:00 7
2014-10-07 00:16:00 8
2014-10-07 00:18:00 9
Freq: 2T, dtype: int64

Create a value counts column and reassign back to the DataFrame
In [115]: df = pd.DataFrame({’Color’: ’Red Red Red Blue’.split(),
.....: ’Value’: [100, 150, 50, 50]}); df
.....:
Out[115]:
 Color Value
0 Red  100
1 Red  150
2 Red  50
3 Blue 50
In [116]: df[’Counts’] = df.groupby([’Color’]).transform(len)
In [117]: df
Out[117]:
 Color Value Counts
0 Red  100   3
1 Red  150   3
2 Red  50    3
3 Blue 50    1

In [30]: i = pd.date_range(’20000101’,periods=10000)
In [31]: df = pd.DataFrame(dict(year = i.year, month = i.month, day = i.day))
In [32]: df.head()
Out[32]:
  day month year
0 1   1     2000
1 2   1     2000
2 3   1     2000
3 4   1     2000
4 5   1     2000
In [34]: ds = df.apply(lambda x: "%04d%02d%02d" % (x[’year’],x[’month’],x[’day’]),axis=1)
In [35]: ds.head()
Out[35]:
0 20000101
1 20000102
2 20000103
3 20000104
4 20000105
dtype: object

In [180]: def expand_grid(data_dict):
.....: rows = itertools.product(*data_dict.values())
.....: return pd.DataFrame.from_records(rows, columns=data_dict.keys())
In [181]: df = expand_grid(
.....: {’height’: [60, 70],
.....: ’weight’: [100, 140, 180],
.....: ’sex’: [’Male’, ’Female’]})
Out[182]:
  sex      weight height
0 Male     100    60
1 Male     100    70
2 Male     140    60
3 Male     140    70
4 Male     180    60
5 Male     180    70
6 Female   100    60
7 Female   100    70
8 Female   140    60
9 Female   140    70
10 Female  180    60
11 Female  180    70

In [43]: DataFrame(data)
Out[43]:
A B C
0 1 2 Hello
1 2 3 World
In [44]: DataFrame(data, index=[’first’, ’second’])
Out[44]:
       A B C
first  1 2 Hello
second 2 3 World
In [45]: DataFrame(data, columns=[’C’, ’A’, ’B’])
Out[45]:
  C     A B
0 Hello 1 2
1 World 2 3