#coding = utf-8
import pandas as pd
import numpy as np
import matplotlib as plt
dates = pd.date_range('20170601', periods=6)
# make a random 6*4 matrix
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
print df
# statistic basics. exclude missing data in general
# mean. mean of cols as default
print df.mean()
'''
A -0.640908
B -0.216183
C 0.316962
D -0.634263
dtype: float64
'''
# mean of rows
print df.mean(1)
# move down
s = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates).shift(2)
#print s
'''
2017-06-01 NaN
2017-06-02 NaN
2017-06-03 1.0
2017-06-04 3.0
2017-06-05 5.0
2017-06-06 NaN
Freq: D, dtype: float64
'''
# df-s. pandas will make Series into DataFrame
# df will change
print df
print df.sub(s, axis='index')
# cumulate by rows. default is by cols
print df.apply(np.cumsum, axis=1)
# apply lambda
print df.apply(lambda x: x.max() - x.min())
s = pd.Series(np.random.randint(0, 7, size=10))
# there are duplicate values
# value_counts behaves like histogram
print s.value_counts()
# string methods
# s.str.lower() means to lowercase
print df
# first 3 rows. index by rows in default
print df[:3]
# concat. use list as parameter
pieces = [df[:3], df[4:]]
print pd.concat(pieces)
# join.
left = pd.DataFrame({'key':['1', '2'], 'lvar':['leftVar1', 'leftVar2']})
right = pd.DataFrame({'key':['1', '2'], 'rvar':['rightVar1', 'rightVar2']})
print left
print right
# merge by same key value
print pd.merge(left, right, on='key')
'''
key lvar rvar
0 1 leftVar1 rightVar1
1 2 leftVar2 rightVar2
'''
# append. add a row to the tail
# ignore_index = False, the index will be appended too. If True, then all index will be 0...n (int)
print df.append(df.iloc[3], ignore_index=False)
# group
df1 = pd.DataFrame({'A' : ['f', 'b', 'f', 'f', 'b'],
'B' : ['1', '2', '2', '1', '2'],
'C' : np.random.randn(5)})
# use sum()
print df1.groupby(['A', 'B']).sum()
# stack unstack means transformation between matrix and DataFrame
# pivot_table means group by index and cols, use values. if there's function, execute it
# pivot_table(df, values='D', index=['A', 'B'], columnes=['C'])
# time series for time
# categoricals
# declare as category
s1 = pd.Series(['A', 'B', 'B', 'C', 'A', 'E']).astype("category")
# set category. Must same number of unique levels
s1.cat.categories = ["good", "bad", 'A', 'B']
print s1
# df.sort_values(by="categoryName")
# df.groupby("categoryName").size()
# plot
df2 = pd.DataFrame(np.random.randn(1000, 4), columns=['A','B','C','D'])
df2 = df2.cumsum()
# four lines, four colors. with legend.
df2.plot()
#plt.pyplot.show()
# file in & out
df2.to_csv("df2.csv")
df3 = pd.read_csv("df2.csv")
print df3.head(3)
#df2.to_hdf("df2.h5", 'df')
#pd.read_hdf('df2.h5', 'df')
# need module openpyxl...
df2.to_excel('df2.xlsx', sheet_name='sheet1')
pd.read_excel('df2.xlsx', 'sheet1', index_col=None, na_values=['NA'])