Python 学习Pandas第三课

最新推荐文章于 2024-04-11 15:12:19 发布

程志伟

最新推荐文章于 2024-04-11 15:12:19 发布

阅读量860

点赞数

文章标签： python

本文链接：https://blog.csdn.net/c1z2w3456789/article/details/107882780

版权

关注微信公共号：小程在线

关注CSDN博客：程志伟的博客

完整脚本在微信公共号

Python 3.7.6 (default, Jan 8 2020, 20:23:39) [MSC v.1916 64 bit (AMD64)]
Type "copyright", "credits" or "license" for more information.

IPython 7.12.0 -- An enhanced Interactive Python.

import pandas as pd
import matplotlib.pyplot as plt
import numpy.random as np
import sys
import matplotlib

np.seed(111)

# 生成测试数据的函数
def CreateDataSet(Number=1):
Output = []
for i in range(Number):
# 创建一个按周计算的日期范围(每周一起始)
rng = pd.date_range(start='1/1/2009', end='12/31/2012',
freq='W-MON')
# 创建一些随机数
data = np.randint(low=25, high=1000, size=len(rng))
# 状态池
status = [1, 2, 3]
# 创建一个随机的状态列表
random_status = [status[np.randint(low=0, high=len(status))] for i in range(len(rng))]
# 行政州(state)的列表
states = ['GA','FL','fl','NY','NJ','TX']
# 创建一个行政周的随机列表
random_states = [states[np.randint(low=0, high=len(states))] for i in range(len(rng))]
Output.extend(zip(random_states, random_status, data, rng))
return Output

dataset = CreateDataSet(4)
df = pd.DataFrame(data=dataset, columns=['State','Status','CustomerCount','StatusDate'])
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 836 entries, 0 to 835
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 State 836 non-null object
1 Status 836 non-null int64
2 CustomerCount 836 non-null int64
3 StatusDate 836 non-null datetime64[ns]
dtypes: datetime64[ns](1), int64(2), object(1)
memory usage: 26.2+ KB

df.head()
Out[4]:
State Status CustomerCount StatusDate
0 GA 1 877 2009-01-05
1 FL 1 901 2009-01-12
2 fl 3 749 2009-01-19
3 FL 3 111 2009-01-26
4 GA 1 300 2009-02-02

df.to_excel('Lesson3.xlsx', index=False) #不保存索引，但是保存列名(column header)
print('Done')
Done

Location = r'./Lesson3.xlsx'
# 读取第一个页签(sheet)，并指定索引列是 StatusDate
df = pd.read_excel(Location, sheetname=0, index_col='StatusDate') #译者注: 需要 xlrd 包
df.dtypes
Traceback (most recent call last):

File "<ipython-input-6-125ca988f91e>", line 3, in <module>
df = pd.read_excel(Location, sheetname=0, index_col='StatusDate') #译者注: 需要 xlrd 包

File "E:\anaconda3\lib\site-packages\pandas\io\excel\_base.py", line 301, in read_excel
raise TypeError(f"read_excel() got an unexpected keyword argument `{arg}`")

TypeError: read_excel() got an unexpected keyword argument `sheetname`

df = pd.read_excel(Location, sheet_name=0, index_col='StatusDate') #译者注: 需要 xlrd 包
df.dtypes
Out[7]:
State object
Status int64
CustomerCount int64
dtype: object

df.index
Out[8]:
DatetimeIndex(['2009-01-05', '2009-01-12', '2009-01-19', '2009-01-26',
'2009-02-02', '2009-02-09', '2009-02-16', '2009-02-23',
'2009-03-02', '2009-03-09',
...
'2012-10-29', '2012-11-05', '2012-11-12', '2012-11-19',
'2012-11-26', '2012-12-03', '2012-12-10', '2012-12-17',
'2012-12-24', '2012-12-31'],
dtype='datetime64[ns]', name='StatusDate', length=836, freq=None)

df.head()
Out[9]:
State Status CustomerCount
StatusDate
2009-01-05 GA 1 877
2009-01-12 FL 1 901
2009-01-19 fl 3 749
2009-01-26 FL 3 111
2009-02-02 GA 1 300

df['State'].unique()
Out[10]: array(['GA', 'FL', 'fl', 'TX', 'NY', 'NJ'], dtype=object)

df['State'] = df.State.apply(lambda x: x.upper())

df['State'].unique()
Out[12]: array(['GA', 'FL', 'TX', 'NY', 'NJ'], dtype=object)

mask = df['Status'] == 1
df = df[mask]

mask = df.State == 'NJ'
df['State'][mask] = 'NY'

df['State'].unique()
Out[15]: array(['GA', 'FL', 'NY', 'TX'], dtype=object)

df['CustomerCount'].plot(figsize=(15,5));

sortdf = df[df['State']=='NY'].sort_index(axis=0)
sortdf.head(10)
Out[17]:
State Status CustomerCount
StatusDate
2009-01-19 NY 1 522
2009-02-23 NY 1 710
2009-03-09 NY 1 992
2009-03-16 NY 1 355
2009-03-23 NY 1 728
2009-03-30 NY 1 863
2009-04-13 NY 1 520
2009-04-20 NY 1 820
2009-04-20 NY 1 937
2009-04-27 NY 1 447

Daily = df.reset_index().groupby(['State','StatusDate']).sum()
Daily.head()
Out[18]:
Status CustomerCount
State StatusDate
FL 2009-01-12 1 901
2009-02-02 1 653
2009-03-23 1 752
2009-04-06 2 1086
2009-06-08 1 649

Daily.index
Out[19]:
MultiIndex([('FL', '2009-01-12'),
('FL', '2009-02-02'),
('FL', '2009-03-23'),
('FL', '2009-04-06'),
('FL', '2009-06-08'),
('FL', '2009-07-06'),
('FL', '2009-07-13'),
('FL', '2009-07-20'),
('FL', '2009-08-10'),
('FL', '2009-08-24'),
...
('TX', '2012-01-09'),
('TX', '2012-02-27'),
('TX', '2012-03-12'),
('TX', '2012-04-23'),
('TX', '2012-04-30'),
('TX', '2012-08-06'),
('TX', '2012-08-20'),
('TX', '2012-08-27'),
('TX', '2012-09-03'),
('TX', '2012-10-29')],
names=['State', 'StatusDate'], length=239)

# 选择 State 这个索引

Daily.index.levels[0]
Out[20]: Index(['FL', 'GA', 'NY', 'TX'], dtype='object', name='State')

# 选择 StatusDate 这个索引

Daily.index.levels[1]
Out[21]:
DatetimeIndex(['2009-01-05', '2009-01-12', '2009-01-19', '2009-02-02',
'2009-02-23', '2009-03-09', '2009-03-16', '2009-03-23',
'2009-03-30', '2009-04-06',
...
'2012-09-24', '2012-10-01', '2012-10-08', '2012-10-22',
'2012-10-29', '2012-11-05', '2012-11-12', '2012-11-19',
'2012-11-26', '2012-12-10'],
dtype='datetime64[ns]', name='StatusDate', length=161, freq=None)

#将图表按照不同的 State 区分开，我们能看到更清晰的数据

Daily.loc['FL'].plot()
Out[22]: <matplotlib.axes._subplots.AxesSubplot at 0x20663a1adc8>

Daily.loc['GA'].plot()
Out[23]: <matplotlib.axes._subplots.AxesSubplot at 0x20663a69108>

Daily.loc['NY'].plot()
Out[24]: <matplotlib.axes._subplots.AxesSubplot at 0x20663be8688>

Daily.loc['TX'].plot()
Out[25]: <matplotlib.axes._subplots.AxesSubplot at 0x20664d98b88>

# 计算离群值

StateYearMonth = Daily.groupby([Daily.index.get_level_values(0),
Daily.index.get_level_values(1).year, Daily.index.get_level_values(1).month])
Daily['Lower'] = StateYearMonth['CustomerCount'].transform( lambda x: x.quantile(q=.25) - (1.5*x.quantile(q=.75)-x.quantile(q=.25)) )
Daily['Upper'] = StateYearMonth['CustomerCount'].transform( lambda x: x.quantile(q=.75) + (1.5*x.quantile(q=.75)-x.quantile(q=.25)) )
Daily['Outlier'] = (Daily['CustomerCount'] < Daily['Lower']) | (
Daily['CustomerCount'] > Daily['Upper'])

# 移除离群值

Daily = Daily[Daily['Outlier'] == False]

Daily.head()
Out[28]:
Status CustomerCount Lower Upper Outlier
State StatusDate
FL 2009-01-12 1 901 450.5 1351.5 False
2009-02-02 1 653 326.5 979.5 False
2009-03-23 1 752 376.0 1128.0 False
2009-04-06 2 1086 543.0 1629.0 False
2009-06-08 1 649 324.5 973.5 False

#我们创建一个单独的 dataframe，叫 ALL，仅用 StatusDate 来为 Daily 数据集做索引。
#我们简单地去掉 State 这一列。 Max 列则代表了每一个月最大的客户数量。Max 列是用来是的图形更顺滑的。
# 合并所有市场的
# 按日期计算出最大的客户数

ALL = pd.DataFrame(Daily['CustomerCount'].groupby(Daily.index.get_level_values(1)).sum())
ALL.columns = ['CustomerCount'] # rename column

# 按照年和月来分组
YearMonth = ALL.groupby([lambda x: x.year, lambda x: x.month])
# 找出每一个年和月的组合中最大的客户数
ALL['Max'] = YearMonth['CustomerCount'].transform(lambda x: x.max())
ALL.head()
Out[29]:
CustomerCount Max
StatusDate
2009-01-05 877 901
2009-01-12 901 901
2009-01-19 522 901
2009-02-02 953 953
2009-02-23 710 953

data = [1000, 2000, 3000]
idx = pd.date_range(start='12/31/2011', end='12/31/2013', freq='A')
BHAG = pd.DataFrame(data, index=idx, columns=['BHAG'])
BHAG
Out[30]:
BHAG
2011-12-31 1000
2012-12-31 2000
2013-12-31 3000

combined = pd.concat([ALL,BHAG], axis=0)
combined = combined.sort_index(axis=0)
combined.tail()
Out[31]:
CustomerCount Max BHAG
2012-11-19 136.0 1115.0 NaN
2012-11-26 1115.0 1115.0 NaN
2012-12-10 1269.0 1269.0 NaN
2012-12-31 NaN NaN 2000.0
2013-12-31 NaN NaN 3000.0

fig, axes = plt.subplots(figsize=(12, 7))
combined['BHAG'].fillna(method='pad').plot(color='green', label='BHAG')
combined['Max'].plot(color='blue', label='All Markets')
plt.legend(loc='best');

Year = combined.groupby(lambda x: x.year).max()
Year
Out[33]:
CustomerCount Max BHAG
2009 2452.0 2452.0 NaN
2010 2065.0 2065.0 NaN
2011 2711.0 2711.0 1000.0
2012 2061.0 2061.0 2000.0
2013 NaN NaN 3000.0

Year['YR_PCT_Change'] = Year['Max'].pct_change(periods=1)
Year
Out[34]:
CustomerCount Max BHAG YR_PCT_Change
2009 2452.0 2452.0 NaN NaN
2010 2065.0 2065.0 NaN -0.157830
2011 2711.0 2711.0 1000.0 0.312833
2012 2061.0 2061.0 2000.0 -0.239764
2013 NaN NaN 3000.0 0.000000

(1 + Year.ix[2012,'YR_PCT_Change']) * Year.ix[2012,'Max']
Traceback (most recent call last):

File "<ipython-input-35-b56b28b23dec>", line 1, in <module>
(1 + Year.ix[2012,'YR_PCT_Change']) * Year.ix[2012,'Max']

File "E:\anaconda3\lib\site-packages\pandas\core\generic.py", line 5274, in __getattr__
return object.__getattribute__(self, name)

AttributeError: 'DataFrame' object has no attribute 'ix'

(1 + Year.loc[2012,'YR_PCT_Change']) * Year.loc[2012,'Max']
Out[37]: 1566.8465510881595

ALL['Max'].plot(figsize=(10, 5));plt.title('ALL Markets')

# 后面四张
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(20, 10))
fig.subplots_adjust(hspace=1.0) ## Create space between plots
Daily.loc['FL']['CustomerCount']['2012':].fillna(method='pad').plot(ax=axes[0,0])
Daily.loc['GA']['CustomerCount']['2012':].fillna(method='pad').plot(ax=axes[0,1])
Daily.loc['TX']['CustomerCount']['2012':].fillna(method='pad').plot(ax=axes[1,0])
Daily.loc['NY']['CustomerCount']['2012':].fillna(method='pad').plot(ax=axes[1,1])

# 增加图表的抬头
axes[0,0].set_title('Florida')
axes[0,1].set_title('Georgia')
axes[1,0].set_title('Texas')
axes[1,1].set_title('North East');

程志伟

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
Python 学习Pandas第三课

Python 3.7.6 (default, Jan 8 2020, 20:23:39) [MSC v.1916 64 bit (AMD64)]Type "copyright", "credits" or "license" for more information.IPython 7.12.0 -- An enhanced Interactive Python.import pandas as pdimport matplotlib.pyplot as pltimport numpy.ra.
复制链接

扫一扫