Python 学习Pandas第三课

关注微信公共号:小程在线

关注CSDN博客:程志伟的博客

 

完整脚本在微信公共号

 

Python 3.7.6 (default, Jan  8 2020, 20:23:39) [MSC v.1916 64 bit (AMD64)]
Type "copyright", "credits" or "license" for more information.

IPython 7.12.0 -- An enhanced Interactive Python.

import pandas as pd
import matplotlib.pyplot as plt
import numpy.random as np
import sys
import matplotlib

np.seed(111)

# 生成测试数据的函数
def CreateDataSet(Number=1):
    Output = []
    for i in range(Number):
# 创建一个按周计算的日期范围(每周一起始)
        rng = pd.date_range(start='1/1/2009', end='12/31/2012',
                            freq='W-MON')
# 创建一些随机数
        data = np.randint(low=25, high=1000, size=len(rng))
# 状态池
        status = [1, 2, 3]
# 创建一个随机的状态列表
        random_status = [status[np.randint(low=0, high=len(status))] for i in range(len(rng))]
# 行政州(state)的列表
        states = ['GA','FL','fl','NY','NJ','TX']
# 创建一个行政周的随机列表
        random_states = [states[np.randint(low=0, high=len(states))] for i in range(len(rng))]
        Output.extend(zip(random_states, random_status, data, rng))
    return Output

dataset = CreateDataSet(4)
df = pd.DataFrame(data=dataset, columns=['State','Status','CustomerCount','StatusDate'])
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 836 entries, 0 to 835
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   State          836 non-null    object        
 1   Status         836 non-null    int64         
 2   CustomerCount  836 non-null    int64         
 3   StatusDate     836 non-null    datetime64[ns]
dtypes: datetime64[ns](1), int64(2), object(1)
memory usage: 26.2+ KB

df.head()
Out[4]: 
  State  Status  CustomerCount StatusDate
0    GA       1            877 2009-01-05
1    FL       1            901 2009-01-12
2    fl       3            749 2009-01-19
3    FL       3            111 2009-01-26
4    GA       1            300 2009-02-02

df.to_excel('Lesson3.xlsx', index=False) #不保存索引, 但是保存列名(column header)
print('Done')
Done

Location = r'./Lesson3.xlsx'
# 读取第一个页签(sheet), 并指定索引列是 StatusDate
df = pd.read_excel(Location, sheetname=0, index_col='StatusDate') #译者注: 需要 xlrd 包
df.dtypes
Traceback (most recent call last):

  File "<ipython-input-6-125ca988f91e>", line 3, in <module>
    df = pd.read_excel(Location, sheetname=0, index_col='StatusDate') #译者注: 需要 xlrd 包

  File "E:\anaconda3\lib\site-packages\pandas\io\excel\_base.py", line 301, in read_excel
    raise TypeError(f"read_excel() got an unexpected keyword argument `{arg}`")

TypeError: read_excel() got an unexpected keyword argument `sheetname`


df = pd.read_excel(Location, sheet_name=0, index_col='StatusDate') #译者注: 需要 xlrd 包
df.dtypes
Out[7]: 
State            object
Status            int64
CustomerCount     int64
dtype: object

df.index
Out[8]: 
DatetimeIndex(['2009-01-05', '2009-01-12', '2009-01-19', '2009-01-26',
               '2009-02-02', '2009-02-09', '2009-02-16', '2009-02-23',
               '2009-03-02', '2009-03-09',
               ...
               '2012-10-29', '2012-11-05', '2012-11-12', '2012-11-19',
               '2012-11-26', '2012-12-03', '2012-12-10', '2012-12-17',
               '2012-12-24', '2012-12-31'],
              dtype='datetime64[ns]', name='StatusDate', length=836, freq=None)

df.head()
Out[9]: 
           State  Status  CustomerCount
StatusDate                             
2009-01-05    GA       1            877
2009-01-12    FL       1            901
2009-01-19    fl       3            749
2009-01-26    FL       3            111
2009-02-02    GA       1            300

df['State'].unique()
Out[10]: array(['GA', 'FL', 'fl', 'TX', 'NY', 'NJ'], dtype=object)

df['State'] = df.State.apply(lambda x: x.upper())

df['State'].unique()
Out[12]: array(['GA', 'FL', 'TX', 'NY', 'NJ'], dtype=object)

mask = df['Status'] == 1
df = df[mask]

mask = df.State == 'NJ'
df['State'][mask] = 'NY'

df['State'].unique()
Out[15]: array(['GA', 'FL', 'NY', 'TX'], dtype=object)

df['CustomerCount'].plot(figsize=(15,5));

sortdf = df[df['State']=='NY'].sort_index(axis=0)
sortdf.head(10)
Out[17]: 
           State  Status  CustomerCount
StatusDate                             
2009-01-19    NY       1            522
2009-02-23    NY       1            710
2009-03-09    NY       1            992
2009-03-16    NY       1            355
2009-03-23    NY       1            728
2009-03-30    NY       1            863
2009-04-13    NY       1            520
2009-04-20    NY       1            820
2009-04-20    NY       1            937
2009-04-27    NY       1            447

Daily = df.reset_index().groupby(['State','StatusDate']).sum()
Daily.head()
Out[18]: 
                  Status  CustomerCount
State StatusDate                       
FL    2009-01-12       1            901
      2009-02-02       1            653
      2009-03-23       1            752
      2009-04-06       2           1086
      2009-06-08       1            649

Daily.index
Out[19]: 
MultiIndex([('FL', '2009-01-12'),
            ('FL', '2009-02-02'),
            ('FL', '2009-03-23'),
            ('FL', '2009-04-06'),
            ('FL', '2009-06-08'),
            ('FL', '2009-07-06'),
            ('FL', '2009-07-13'),
            ('FL', '2009-07-20'),
            ('FL', '2009-08-10'),
            ('FL', '2009-08-24'),
            ...
            ('TX', '2012-01-09'),
            ('TX', '2012-02-27'),
            ('TX', '2012-03-12'),
            ('TX', '2012-04-23'),
            ('TX', '2012-04-30'),
            ('TX', '2012-08-06'),
            ('TX', '2012-08-20'),
            ('TX', '2012-08-27'),
            ('TX', '2012-09-03'),
            ('TX', '2012-10-29')],
           names=['State', 'StatusDate'], length=239)

 

# 选择 State 这个索引

Daily.index.levels[0]
Out[20]: Index(['FL', 'GA', 'NY', 'TX'], dtype='object', name='State')

# 选择 StatusDate 这个索引

Daily.index.levels[1]
Out[21]: 
DatetimeIndex(['2009-01-05', '2009-01-12', '2009-01-19', '2009-02-02',
               '2009-02-23', '2009-03-09', '2009-03-16', '2009-03-23',
               '2009-03-30', '2009-04-06',
               ...
               '2012-09-24', '2012-10-01', '2012-10-08', '2012-10-22',
               '2012-10-29', '2012-11-05', '2012-11-12', '2012-11-19',
               '2012-11-26', '2012-12-10'],
              dtype='datetime64[ns]', name='StatusDate', length=161, freq=None)

 

#将图表按照不同的 State 区分开, 我们能看到更清晰的数据

Daily.loc['FL'].plot()
Out[22]: <matplotlib.axes._subplots.AxesSubplot at 0x20663a1adc8>

Daily.loc['GA'].plot()
Out[23]: <matplotlib.axes._subplots.AxesSubplot at 0x20663a69108>

Daily.loc['NY'].plot()
Out[24]: <matplotlib.axes._subplots.AxesSubplot at 0x20663be8688>

Daily.loc['TX'].plot()
Out[25]: <matplotlib.axes._subplots.AxesSubplot at 0x20664d98b88>

# 计算离群值

StateYearMonth = Daily.groupby([Daily.index.get_level_values(0),
                                Daily.index.get_level_values(1).year, Daily.index.get_level_values(1).month])
Daily['Lower'] = StateYearMonth['CustomerCount'].transform( lambda x: x.quantile(q=.25) - (1.5*x.quantile(q=.75)-x.quantile(q=.25)) )
Daily['Upper'] = StateYearMonth['CustomerCount'].transform( lambda x: x.quantile(q=.75) + (1.5*x.quantile(q=.75)-x.quantile(q=.25)) )
Daily['Outlier'] = (Daily['CustomerCount'] < Daily['Lower']) | (
Daily['CustomerCount'] > Daily['Upper'])

# 移除离群值

Daily = Daily[Daily['Outlier'] == False]

Daily.head()
Out[28]: 
                  Status  CustomerCount  Lower   Upper  Outlier
State StatusDate                                               
FL    2009-01-12       1            901  450.5  1351.5    False
      2009-02-02       1            653  326.5   979.5    False
      2009-03-23       1            752  376.0  1128.0    False
      2009-04-06       2           1086  543.0  1629.0    False
      2009-06-08       1            649  324.5   973.5    False

 

#我们创建一个单独的 dataframe, 叫 ALL, 仅用 StatusDate 来为 Daily 数据集做索引。 
#我们简单地去掉 State 这一列。 Max 列则代表了每一个月最大的客户数量。Max 列是用来是的图形更顺滑的。
# 合并所有市场的
# 按日期计算出最大的客户数

ALL = pd.DataFrame(Daily['CustomerCount'].groupby(Daily.index.get_level_values(1)).sum())
ALL.columns = ['CustomerCount'] # rename column

# 按照年和月来分组
YearMonth = ALL.groupby([lambda x: x.year, lambda x: x.month])
# 找出每一个年和月的组合中最大的客户数
ALL['Max'] = YearMonth['CustomerCount'].transform(lambda x: x.max())
ALL.head()
Out[29]: 
            CustomerCount  Max
StatusDate                    
2009-01-05            877  901
2009-01-12            901  901
2009-01-19            522  901
2009-02-02            953  953
2009-02-23            710  953

data = [1000, 2000, 3000]
idx = pd.date_range(start='12/31/2011', end='12/31/2013', freq='A')
BHAG = pd.DataFrame(data, index=idx, columns=['BHAG'])
BHAG
Out[30]: 
            BHAG
2011-12-31  1000
2012-12-31  2000
2013-12-31  3000

combined = pd.concat([ALL,BHAG], axis=0)
combined = combined.sort_index(axis=0)
combined.tail()
Out[31]: 
            CustomerCount     Max    BHAG
2012-11-19          136.0  1115.0     NaN
2012-11-26         1115.0  1115.0     NaN
2012-12-10         1269.0  1269.0     NaN
2012-12-31            NaN     NaN  2000.0
2013-12-31            NaN     NaN  3000.0

fig, axes = plt.subplots(figsize=(12, 7))
combined['BHAG'].fillna(method='pad').plot(color='green', label='BHAG')
combined['Max'].plot(color='blue', label='All Markets')
plt.legend(loc='best');

Year = combined.groupby(lambda x: x.year).max()
Year
Out[33]: 
      CustomerCount     Max    BHAG
2009         2452.0  2452.0     NaN
2010         2065.0  2065.0     NaN
2011         2711.0  2711.0  1000.0
2012         2061.0  2061.0  2000.0
2013            NaN     NaN  3000.0

Year['YR_PCT_Change'] = Year['Max'].pct_change(periods=1)
Year
Out[34]: 
      CustomerCount     Max    BHAG  YR_PCT_Change
2009         2452.0  2452.0     NaN            NaN
2010         2065.0  2065.0     NaN      -0.157830
2011         2711.0  2711.0  1000.0       0.312833
2012         2061.0  2061.0  2000.0      -0.239764
2013            NaN     NaN  3000.0       0.000000

(1 + Year.ix[2012,'YR_PCT_Change']) * Year.ix[2012,'Max']
Traceback (most recent call last):

  File "<ipython-input-35-b56b28b23dec>", line 1, in <module>
    (1 + Year.ix[2012,'YR_PCT_Change']) * Year.ix[2012,'Max']

  File "E:\anaconda3\lib\site-packages\pandas\core\generic.py", line 5274, in __getattr__
    return object.__getattribute__(self, name)

AttributeError: 'DataFrame' object has no attribute 'ix'


(1 + Year.loc[2012,'YR_PCT_Change']) * Year.loc[2012,'Max']
Out[37]: 1566.8465510881595

ALL['Max'].plot(figsize=(10, 5));plt.title('ALL Markets')


# 后面四张
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(20, 10))
fig.subplots_adjust(hspace=1.0) ## Create space between plots
Daily.loc['FL']['CustomerCount']['2012':].fillna(method='pad').plot(ax=axes[0,0])
Daily.loc['GA']['CustomerCount']['2012':].fillna(method='pad').plot(ax=axes[0,1])
Daily.loc['TX']['CustomerCount']['2012':].fillna(method='pad').plot(ax=axes[1,0])
Daily.loc['NY']['CustomerCount']['2012':].fillna(method='pad').plot(ax=axes[1,1])

# 增加图表的抬头
axes[0,0].set_title('Florida')
axes[0,1].set_title('Georgia')
axes[1,0].set_title('Texas')
axes[1,1].set_title('North East');


 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值