In [6]: dates = pd.date_range(’20130101’,periods=6)
In [8]: df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=list(’ABCD’))
In [9]: df
Out[9]:
A B C D
2013-01-010.469112 -0.282863 -1.509059 -1.1356322013-01-021.212112 -0.1732150.119209 -1.0442362013-01-03 -0.861849 -2.104569 -0.4949291.0718042013-01-040.721555 -0.706771 -1.0395750.2718602013-01-05 -0.4249720.5670200.276232 -1.0874012013-01-06 -0.6736900.113648 -1.4784270.524988
df.values
df.sort_index(axis=1, ascending=False)
df.sort(columns=’B’)
df[’A’]
df[0:3]
df[’20130102’:’20130104’]
df.loc[:,[’A’,’B’]]
df.loc[’20130102’:’20130104’,[’A’,’B’]]
df.iloc[3]
df.iloc[3:5,0:2]
df.iloc[[1,2,4],[0,2]]
df1 = df.reindex(index=dates[0:4],columns=list(df.columns) + [’E’])
df.mean()
df.mean(1)
df.apply(lambda x: x.max() - x.min())
In [77]: left = pd.DataFrame({’key’: [’foo’, ’foo’], ’lval’: [1, 2]})
In [78]: right = pd.DataFrame({’key’: [’foo’, ’foo’], ’rval’: [4, 5]})
In [79]: left
Out[79]:
key lval
0 foo 11 foo 2In [80]: right
Out[80]:
key rval
0 foo 41 foo 5In [81]: pd.merge(left, right, on=’key’)
Out[81]:
key lval rval
0 foo 141 foo 152 foo 243 foo 25
df.append(s, ignore_index=True)
In [87]: df
Out[87]:
A B C D
0 foo one -1.202872 -0.0552241 bar one -1.8144702.3959852 foo two1.0186011.5528253 bar three -0.5954470.1665994 foo two1.3954330.0476095 bar two -0.392670 -0.1364736 foo one0.007207 -0.5617577 foo three1.928123 -1.623033
In [88]: df.groupby(’A’).sum()
Out[88]:
C D
A
bar -2.8025882.42611
foo 3.146492 -0.63958
In [89]: df.groupby([’A’,’B’]).sum()
Out[89]:
C D
A B
bar one -1.8144702.395985three -0.5954470.166599two -0.392670 -0.136473
foo one -1.195665 -0.616981three1.928123 -1.623033two2.4140341.600434
In [101]: df
Out[101]:
A B CDE0 one A foo 1.418757 -0.1796661 one B foo -1.8790241.2918362 two C foo 0.536826 -0.0096143 three A bar 1.0061600.3921494 one B bar -0.0297160.2645995 one C bar -1.146178 -0.0574096 two A foo 0.100900 -1.4256387 three B foo -1.0350181.0240988 one C foo 0.314665 -0.1060629 one A bar -0.7737231.82437510 two B bar -1.1706530.59597411 three C bar 0.6487401.167115In [102]: pd.pivot_table(df, values=’D’, index=[’A’, ’B’], columns=[’C’])
Out[102]:
C bar foo
A B
one A -0.7737231.418757
B -0.029716 -1.879024C -1.1461780.314665
three A 1.006160 NaN
B NaN -1.035018C0.648740 NaN
two A NaN 0.100900
B -1.170653 NaN
C NaN 0.536826
# List to hold file names
FileNames = []
# Your path will be different, please modify the path below.#os.chdir()函数功能:改变当前路径到指定路径
os.chdir(r"C:\Users\david\notebooks\pandas")
# Find any file that ends with ".xlsx"for files in os.listdir("."):
if files.endswith(".xlsx"):
FileNames.append(files)
FileNames
defGetFile(fnombre):# Path to excel file# Your path will be different, please modify the path below.
location = r'C:\Users\david\notebooks\pandas\\' + fnombre
# Parse the excel file# 0 = first sheet
df = pd.read_excel(location, 0)
# Tag record to file name
df['File'] = fnombre
# Make the "File" column the index of the dfreturn df.set_index(['File'])
# Create a list of dataframes
df_list = [GetFile(fname) for fname in FileNames]
# Combine all of the dataframes into one
big_df = pd.concat(df_list)
df
One_X One_Y Two_X Two_Y row
01.11.21.111.22011.11.21.111.22121.11.21.111.222In [60]: df = df.set_index(’row’);dfOut[60]:
One_X One_Y Two_X Two_Y
row
01.11.21.111.2211.11.21.111.2221.11.21.111.22In [61]: df.columns = pd.MultiIndex.from_tuples([tuple(c.split(’_’)) for c in df.columns]);dfOut[61]:
One Two
XYXY
row
01.11.21.111.2211.11.21.111.2221.11.21.111.22In [62]: df = df.stack(0).reset_index(1);dfOut[62]:
level_1 XY
row
0 One 1.101.200 Two 1.111.221 One 1.101.201 Two 1.111.222 One 1.101.202 Two 1.111.22In [63]: df.columns = [’Sample’,’All_X’,’All_Y’];dfOut[63]:
Sample All_X All_Y
row
0 One 1.101.200 Two 1.111.221 One 1.101.201 Two 1.111.222 One 1.101.202 Two 1.111.22In [64]: cols = pd.MultiIndex.from_tuples([ (x,y) for xin [’A’,’B’,’C’] for yin [’O’,’I’]])
In [65]: df = pd.DataFrame(np.random.randn(2,6),index=[’n’,’m’],columns=cols); dfOut[65]:
A B C
O I O I O I
n 1.920906 -0.388231 -2.3143940.6655080.4025620.399555
m -1.7659560.8504230.3880540.9923120.744086 -0.739776In [72]: index = list(itertools.product([’Ada’,’Quinn’,’Violet’],[’Comp’,’Math’,’Sci’]))
In [73]: headr = list(itertools.product([’Exams’,’Labs’],[’I’,’II’]))
In [74]: indx = pd.MultiIndex.from_tuples(index,names=[’Student’,’Course’])
In [75]: cols = pd.MultiIndex.from_tuples(headr) #Notice these are un-namedIn [76]: data = [[70+x+y+(x*y)%3 for xin range(4)] for yin range(9)]
In [77]: df = pd.DataFrame(data,indx,cols); dfOut[77]:
Exams Labs
I II I II
Student Course
Ada Comp 70717273
Math 71737574
Sci 72757575
Quinn Comp 73747576
Math 74767877
Sci 75787878
Violet Comp 76777879
Math 77798180
Sci 78818181In [78]: All = slice(None)
In [79]: df.loc[’Violet’]
Out[79]:
Exams Labs
I II I II
Course
Comp 76777879
Math 77798180
Sci 78818181In [80]: df.loc[(All,’Math’),All]
Out[80]:
Exams Labs
I II I II
Student Course
Ada Math 71737574
Quinn Math 74767877
Violet Math 77798180In [81]: df.loc[(slice(’Ada’,’Quinn’),’Math’),All]
Out[81]:
Exams Labs
I II I II
Student Course
Ada Math 71737574
Quinn Math 74767877In [82]: df.loc[(All,’Math’),(’Exams’)]
Out[82]:
I II
Student Course
Ada Math 7173
Quinn Math 7476
Violet Math 7779In [83]: df.loc[(All,’Math’),(All,’II’)]
Out[83]:
Exams Labs
II II
Student Course
Ada Math 7374
Quinn Math 7677
Violet Math 7980
df = pd.DataFrame({’animal’: ’cat dog cat fish dog cat cat’.split(),
....: ’size’: list(’SSMMMLL’),
....: ’weight’: [8, 10, 11, 1, 20, 12, 12],
....: ’adult’ : [False] * 5 + [True] * 2});
df
adult animal size weight
0 False cat S 81 False dog S 102 False cat M 113 False fish M 14 False dog M 205 True cat L 126 True cat L 12#List the size of the animals with the highest weight.In [90]: df.groupby(’animal’).apply(lambda subf: subf[’size’][subf[’weight’].idxmax()])
Out[90]:
animal
cat L
dog M
fish M
dtype: object
In [91]: gb = df.groupby([’animal’])
In [92]: gb.get_group(’cat’)
Out[92]:
adult animal size weight
0 False cat S 82 False cat M 115 True cat L 126 True cat L 12
Apply to different items in a group
In [93]: def GrowUp(x):
....: avg_weight = sum(x[x.size == ’S’].weight * 1.5)
....: avg_weight += sum(x[x.size == ’M’].weight * 1.25)
....: avg_weight += sum(x[x.size == ’L’].weight)
....: avg_weight = avg_weight / len(x)
....: return pd.Series([’L’,avg_weight,True], index=[’size’, ’weight’, ’adult’])
....:In [94]: expected_df = gb.apply(GrowUp)
In [95]: expected_df
Out[95]:
size weight adult
animal
cat L 12.4375 True
dog L 20.0000 True
fish L 1.2500 True
Create a value counts column and reassign back to the DataFrame
In [115]: df = pd.DataFrame({’Color’: ’Red Red Red Blue’.split(),
.....: ’Value’: [100, 150, 50, 50]}); df.....:Out[115]:
Color Value
0 Red 1001 Red 1502 Red 503 Blue 50In [116]: df[’Counts’] = df.groupby([’Color’]).transform(len)
In [117]: df
Out[117]:
Color Value Counts
0 Red 10031 Red 15032 Red 5033 Blue 501
Rolling Apply to multiple columns where function returns a Scalar (Volume Weighted Average Price)
In [138]: rng = pd.date_range(start = ’2014-01-01’,periods = 100)
In [139]: df = pd.DataFrame({’Open’ : np.random.randn(len(rng)),
.....: ’Close’ : np.random.randn(len(rng)),’Volume’ : np.random.randint(100,2000,len(rng))}, index=rng)
df
Close Open Volume
2014-01-011.5505900.45851313712014-01-02 -0.818812 -0.50885014332014-01-031.1606190.2576106452014-01-040.081521 -1.7733938782014-01-051.083284 -0.56067611432014-01-06 -0.5187210.28417410882014-01-070.1406611.1468891722
... ... ... ...
2014-04-040.458193 -0.66947417682014-04-050.108502 -1.6163158362014-04-061.418082 -1.2949066942014-04-070.4865301.1716477962014-04-080.1818850.5016392652014-04-09 -0.707238 -0.36186812932014-04-101.2114321.5644291088In [140]: def vwap(bars): return ((bars.Close*bars.Volume).sum()/bars.Volume.sum()).round(2)
In [141]: window = 5In [142]: s = pd.concat([ (pd.Series(vwap(df.iloc[i:i+window]), index=[df.index[i+window]])) for i in range(95)])
Out[142]:
2014-01-060.552014-01-070.062014-01-080.322014-01-090.032014-01-100.08
...
2014-04-050.482014-04-060.542014-04-070.462014-04-080.452014-04-090.532014-04-100.15Length:95
Replacing some values with mean of the rest of a group
In [100]: df = pd.DataFrame({’A’ : [1, 1, 2, 2], ’B’ : [1, -1, 1, 2]})
In [101]: gb = df.groupby(’A’)
In [102]: def replace(g):
.....: mask = g < 0.....: g.loc[mask] = g[~mask].mean()
.....: return g
In [103]: gb.transform(replace)
Out[103]:
B
01112132
Sort groups by aggregated data
In [104]: df = pd.DataFrame({’code’: [’foo’, ’bar’, ’baz’] * 2,
.....: ’data’: [0.16, -0.21, 0.33, 0.45, -0.59, 0.62],
.....: ’flag’: [False, True] * 3})
.....:In [105]: code_groups = df.groupby(’code’)
In [106]: agg_n_sort_order = code_groups[[’data’]].transform(sum).sort(’data’)
In [107]: sorted_df = df.ix[agg_n_sort_order.index]
In [108]: sorted_df
Out[108]:
code data flag
1 bar -0.21 True
4 bar -0.59 False
0 foo 0.16 False
3 foo 0.45 True
2 baz 0.33 False
5 baz 0.62 True
Create multiple aggregated columns
In [109]: rng = pd.date_range(start="2014-10-07",periods=10,freq=’2min’)
In [110]: ts = pd.Series(data = list(range(10)), index = rng)
In [111]: def MyCust(x):
.....: if len(x) > 2:
.....: return x[1] * 1.234.....: return pd.NaTIn [112]: mhc = {’Mean’ : np.mean, ’Max’ : np.max, ’Custom’ : MyCust}
In [113]: ts.resample("5min",how = mhc)
Out[113]:
Max Custom Mean
2014-10-0700:00:0021.2341.02014-10-0700:05:004 NaN 3.52014-10-0700:10:0077.4046.02014-10-0700:15:009 NaN 8.5In [114]: ts
Out[114]:
2014-10-0700:00:0002014-10-0700:02:0012014-10-0700:04:0022014-10-0700:06:0032014-10-0700:08:0042014-10-0700:10:0052014-10-0700:12:0062014-10-0700:14:0072014-10-0700:16:0082014-10-0700:18:009Freq:2T, dtype: int64
Create a value counts column and reassign back to the DataFrame
In [115]: df = pd.DataFrame({’Color’: ’Red Red Red Blue’.split(),
.....: ’Value’: [100, 150, 50, 50]}); df.....:Out[115]:
Color Value
0 Red 1001 Red 1502 Red 503 Blue 50In [116]: df[’Counts’] = df.groupby([’Color’]).transform(len)
In [117]: df
Out[117]:
Color Value Counts
0 Red 10031 Red 15032 Red 5033 Blue 501
In [30]: i = pd.date_range(’20000101’,periods=10000)
In [31]: df = pd.DataFrame(dict(year = i.year, month = i.month, day = i.day))
In [32]: df.head()
Out[32]:
daymonthyear01120001212000231200034120004512000In [34]: ds = df.apply(lambda x: "%04d%02d%02d" % (x[’year’],x[’month’],x[’day’]),axis=1)
In [35]: ds.head()
Out[35]:
020000101120000102220000103320000104420000105
dtype: object
In [180]: defexpand_grid(data_dict):
.....: rows = itertools.product(*data_dict.values())
.....: return pd.DataFrame.from_records(rows, columns=data_dict.keys())
In [181]: df = expand_grid(
.....: {’height’: [60, 70],
.....: ’weight’: [100, 140, 180],
.....: ’sex’: [’Male’, ’Female’]})
Out[182]:
sex weight height
0 Male 100601 Male 100702 Male 140603 Male 140704 Male 180605 Male 180706 Female 100607 Female 100708 Female 140609 Female 1407010 Female 1806011 Female 18070
In [43]: DataFrame(data)
Out[43]:
A B C012 Hello
123 World
In [44]: DataFrame(data, index=[’first’, ’second’])
Out[44]:
A B C
first 12 Hello
second 23 World
In [45]: DataFrame(data, columns=[’C’, ’A’, ’B’])
Out[45]:
C A B
0 Hello 121 World 23