数据分析——从入门到精通(十七)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

1.数据载入和总览

  • 1.1数据载入
election = pd.read_csv('data2/usa_election.txt',low_memory=False)
election.head()
cmte_idcand_idcand_nmcontbr_nmcontbr_citycontbr_stcontbr_zipcontbr_employercontbr_occupationcontb_receipt_amtcontb_receipt_dtreceipt_descmemo_cdmemo_textform_tpfile_num
0C00410118P20002978Bachmann, MichelleHARVEY, WILLIAMMOBILEAL366010290RETIREDRETIRED250.020-JUN-11NaNNaNNaNSA17A736166
1C00410118P20002978Bachmann, MichelleHARVEY, WILLIAMMOBILEAL366010290RETIREDRETIRED50.023-JUN-11NaNNaNNaNSA17A736166
2C00410118P20002978Bachmann, MichelleSMITH, LANIERLANETTAL368633403INFORMATION REQUESTEDINFORMATION REQUESTED250.005-JUL-11NaNNaNNaNSA17A749073
3C00410118P20002978Bachmann, MichelleBLEVINS, DARONDAPIGGOTTAR724548253NONERETIRED250.001-AUG-11NaNNaNNaNSA17A749073
4C00410118P20002978Bachmann, MichelleWARDENBURG, HAROLDHOT SPRINGS NATIONAR719016467NONERETIRED300.020-JUN-11NaNNaNNaNSA17A736166
  • 1.2数据预览和基本统计分析
election.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 536041 entries, 0 to 536040
Data columns (total 16 columns):
cmte_id              536041 non-null object
cand_id              536041 non-null object
cand_nm              536041 non-null object
contbr_nm            536041 non-null object
contbr_city          536026 non-null object
contbr_st            536040 non-null object
contbr_zip           535973 non-null object
contbr_employer      525088 non-null object
contbr_occupation    530520 non-null object
contb_receipt_amt    536041 non-null float64
contb_receipt_dt     536041 non-null object
receipt_desc         8479 non-null object
memo_cd              49718 non-null object
memo_text            52740 non-null object
form_tp              536041 non-null object
file_num             536041 non-null int64
dtypes: float64(1), int64(1), object(14)
memory usage: 36.8+ MB
election.describe()
contb_receipt_amtfile_num
count5.360410e+05536041.000000
mean3.750373e+02761472.107800
std3.564436e+035148.893508
min-3.080000e+04723511.000000
25%5.000000e+01756218.000000
50%1.000000e+02763233.000000
75%2.500000e+02763621.000000
max1.944042e+06767394.000000

2.数据清洗

  • 2.1缺失值处理
  • 填充:NOT PROVIDE
# 查看哪些列存在NAN值
election.isna().any(axis=0)
cmte_id              False
cand_id              False
cand_nm              False
contbr_nm            False
contbr_city           True
contbr_st             True
contbr_zip            True
contbr_employer       True
contbr_occupation     True
contb_receipt_amt    False
contb_receipt_dt     False
receipt_desc          True
memo_cd               True
memo_text             True
form_tp              False
file_num             False
dtype: bool
# 把NAN值进行填充
election.fillna('NOT PROVIDE',inplace=True)
election.isna().any(axis=0)
cmte_id              False
cand_id              False
cand_nm              False
contbr_nm            False
contbr_city          False
contbr_st            False
contbr_zip           False
contbr_employer      False
contbr_occupation    False
contb_receipt_amt    False
contb_receipt_dt     False
receipt_desc         False
memo_cd              False
memo_text            False
form_tp              False
file_num             False
dtype: bool
  • 2.2数据转换
  • 利用字典映射进行转换:党派分析
#通过搜索引擎等途径,获取到每个总统候选人的所属党派,建立字典parties,候选人名字作为键,所属党派作为对应的值"
parties = {'Bachmann, Michelle': 'Republican',
            'Cain, Herman': 'Republican',
            'Gingrich, Newt': 'Republican',
               'Huntsman, Jon': 'Republican',
               'Johnson, Gary Earl': 'Republican',
               'McCotter, Thaddeus G': 'Republican',
               'Obama, Barack': 'Democrat',
               'Paul, Ron': 'Republican',
               'Pawlenty, Timothy': 'Republican',
               'Perry, Rick': 'Republican',
               "Roemer, Charles E. 'Buddy' III": 'Republican',
               'Romney, Mitt': 'Republican',
               'Santorum, Rick': 'Republican'}
election.head()
cmte_idcand_idcand_nmcontbr_nmcontbr_citycontbr_stcontbr_zipcontbr_employercontbr_occupationcontb_receipt_amtcontb_receipt_dtreceipt_descmemo_cdmemo_textform_tpfile_num
0C00410118P20002978Bachmann, MichelleHARVEY, WILLIAMMOBILEAL366010290RETIREDRETIRED250.020-JUN-11NOT PROVIDENOT PROVIDENOT PROVIDESA17A736166
1C00410118P20002978Bachmann, MichelleHARVEY, WILLIAMMOBILEAL366010290RETIREDRETIRED50.023-JUN-11NOT PROVIDENOT PROVIDENOT PROVIDESA17A736166
2C00410118P20002978Bachmann, MichelleSMITH, LANIERLANETTAL368633403INFORMATION REQUESTEDINFORMATION REQUESTED250.005-JUL-11NOT PROVIDENOT PROVIDENOT PROVIDESA17A749073
3C00410118P20002978Bachmann, MichelleBLEVINS, DARONDAPIGGOTTAR724548253NONERETIRED250.001-AUG-11NOT PROVIDENOT PROVIDENOT PROVIDESA17A749073
4C00410118P20002978Bachmann, MichelleWARDENBURG, HAROLDHOT SPRINGS NATIONAR719016467NONERETIRED300.020-JUN-11NOT PROVIDENOT PROVIDENOT PROVIDESA17A736166
election.dtypes
cmte_id               object
cand_id               object
cand_nm               object
contbr_nm             object
contbr_city           object
contbr_st             object
contbr_zip            object
contbr_employer       object
contbr_occupation     object
contb_receipt_amt    float64
contb_receipt_dt      object
receipt_desc          object
memo_cd               object
memo_text             object
form_tp               object
file_num               int64
dtype: object
  • map方法添加party属性
election['party'] = election['cand_nm'].map(parties)
election.head()
cmte_idcand_idcand_nmcontbr_nmcontbr_citycontbr_stcontbr_zipcontbr_employercontbr_occupationcontb_receipt_amtcontb_receipt_dtreceipt_descmemo_cdmemo_textform_tpfile_numparty
0C00410118P20002978Bachmann, MichelleHARVEY, WILLIAMMOBILEAL366010290RETIREDRETIRED250.020-JUN-11NOT PROVIDENOT PROVIDENOT PROVIDESA17A736166Republican
1C00410118P20002978Bachmann, MichelleHARVEY, WILLIAMMOBILEAL366010290RETIREDRETIRED50.023-JUN-11NOT PROVIDENOT PROVIDENOT PROVIDESA17A736166Republican
2C00410118P20002978Bachmann, MichelleSMITH, LANIERLANETTAL368633403INFORMATION REQUESTEDINFORMATION REQUESTED250.005-JUL-11NOT PROVIDENOT PROVIDENOT PROVIDESA17A749073Republican
3C00410118P20002978Bachmann, MichelleBLEVINS, DARONDAPIGGOTTAR724548253NONERETIRED250.001-AUG-11NOT PROVIDENOT PROVIDENOT PROVIDESA17A749073Republican
4C00410118P20002978Bachmann, MichelleWARDENBURG, HAROLDHOT SPRINGS NATIONAR719016467NONERETIRED300.020-JUN-11NOT PROVIDENOT PROVIDENOT PROVIDESA17A736166Republican
  • value_counts统计各党派总捐频次
# 候选人党派信息
election['cand_nm'].unique()
array(['Bachmann, Michelle', 'Romney, Mitt', 'Obama, Barack',
       "Roemer, Charles E. 'Buddy' III", 'Pawlenty, Timothy',
       'Johnson, Gary Earl', 'Paul, Ron', 'Santorum, Rick',
       'Cain, Herman', 'Gingrich, Newt', 'McCotter, Thaddeus G',
       'Huntsman, Jon', 'Perry, Rick'], dtype=object)
election['party'].value_counts()
Democrat      292400
Republican    243641
Name: party, dtype: int64
  • 排序:按照职业的汇总对赞助总金额进行排序

  • 按照职位进行汇总,计算赞助总金额,展示前50项,发现不少职业是相同的,只不过是表达不一样而已,如C.E.O与CEO,都是一个职业

# 查看岗位
election['contbr_occupation'].unique()
array(['RETIRED', 'INFORMATION REQUESTED', 'RN', ...,
       'SECRETARY TO VICE PRESIDENT', 'PRESIDENT/TREASURER',
       'LONGWALL MAINTENANCE FOREMAN'], dtype=object)
election['contbr_occupation'].value_counts()
RETIRED                  117973
INFORMATION REQUESTED     23439
ATTORNEY                  19943
HOMEMAKER                 15756
PHYSICIAN                 13171
                          ...  
POLYMER SCIENTIST             1
TAX PARTNER / CPA             1
STARTUP EDUCATION             1
C.P.A./SHAREHOLDER            1
INTERACTIVE PRODUCER          1
Name: contbr_occupation, Length: 29620, dtype: int64
#  C.E.O与CEO,做映射处理,改成一样的
f = lambda item: 'CEO'       if 'CEO' in item or item.find('C.E.O')>-1 else item
election['contbr_occupation'] = election['contbr_occupation'].map(f)
election['contbr_occupation'].value_counts()
RETIRED                    117973
INFORMATION REQUESTED       23439
ATTORNEY                    19943
HOMEMAKER                   15756
PHYSICIAN                   13171
                            ...  
HOMEAKER                        1
OIL/GAS                         1
TITLE INSURANCE SALES           1
EMS HELICOPTER MECHANIC         1
INTERACTIVE PRODUCER            1
Name: contbr_occupation, Length: 29201, dtype: int64
# 按照岗位进行分组
election.groupby('contbr_occupation')['contb_receipt_amt'].sum()
contbr_occupation
   MIXED-MEDIA ARTIST / STORYTELLER     100.00
 AREA VICE PRESIDENT                    250.00
 RESEARCH ASSOCIATE                     100.00
 TEACHER                                500.00
 THERAPIST                             3900.00
                                        ...   
ZIMMERMANS DAIRY                         83.71
ZOMBIE SLAYER                          1556.00
ZOOLOGIST                               100.00
\NONE\                                  250.00
~                                        75.00
Name: contb_receipt_amt, Length: 29201, dtype: float64
# 分完组排序
election.groupby('contbr_occupation')['contb_receipt_amt'].sum().sort_values()
contbr_occupation
METAL SMITH                                  -4250.00
PRES OF GAS & ELECTRIC                       -2500.00
AVIATION ATTORNEY                            -2500.00
MFG. REP                                       -50.00
GOVERNMENT MEMBER                                0.00
                                             ...     
CEO                                        7310417.46
INFORMATION REQUESTED PER BEST EFFORTS     9832750.00
HOMEMAKER                                 12410217.32
ATTORNEY                                  12580812.94
RETIRED                                   29865435.00
Name: contb_receipt_amt, Length: 29201, dtype: float64
# 去掉里面负值排序
amt_sum = election.groupby('contbr_occupation')['contb_receipt_amt'].sum()
amt_sum.sort_values(ascending=False)
contbr_occupation
RETIRED                                   29865435.00
ATTORNEY                                  12580812.94
HOMEMAKER                                 12410217.32
INFORMATION REQUESTED PER BEST EFFORTS     9832750.00
CEO                                        7310417.46
                                             ...     
GOVERNMENT MEMBER                                0.00
MFG. REP                                       -50.00
AVIATION ATTORNEY                            -2500.00
PRES OF GAS & ELECTRIC                       -2500.00
METAL SMITH                                  -4250.00
Name: contb_receipt_amt, Length: 29201, dtype: float64
#  取前面五十个
amt_sum.sort_values(ascending=False)[:50]
contbr_occupation
RETIRED                                   29865435.00
ATTORNEY                                  12580812.94
HOMEMAKER                                 12410217.32
INFORMATION REQUESTED PER BEST EFFORTS     9832750.00
CEO                                        7310417.46
INFORMATION REQUESTED                      6652373.48
PHYSICIAN                                  4732676.66
PRESIDENT                                  4663978.28
EXECUTIVE                                  3897020.71
CONSULTANT                                 3445289.59
INVESTOR                                   2499492.16
OWNER                                      2397779.75
LAWYER                                     2305835.79
ENGINEER                                   1743040.13
REAL ESTATE                                1668360.55
PROFESSOR                                  1524503.64
NOT PROVIDE                                1456138.20
MANAGER                                    1429909.01
SELF-EMPLOYED                              1374817.30
CHAIRMAN                                   1339631.37
FINANCE                                    1330261.88
SALES                                      1225337.97
BUSINESS OWNER                             1179904.84
STUDENT                                    1155684.17
BANKER                                     1133177.68
NOT EMPLOYED                               1065220.57
TEACHER                                     984944.53
PARTNER                                     933350.51
INVESTMENTS                                 884374.03
VICE PRESIDENT                              861050.63
WRITER                                      843070.91
INVESTMENT BANKER                           812166.20
MANAGING DIRECTOR                           733055.32
DIRECTOR                                    583423.44
ARTIST                                      570839.79
ACCOUNTANT                                  568414.18
SOFTWARE ENGINEER                           559254.70
DENTIST                                     494070.23
PRIVATE EQUITY                              479509.00
FARMER                                      469295.72
INVESTMENT MANAGER                          468120.80
REAL ESTATE DEVELOPER                       458958.00
INVESTMENT MANAGEMENT                       443547.52
FINANCIAL ADVISOR                           436870.72
PRINCIPAL                                   432636.32
ARCHITECT                                   416997.09
CPA                                         402906.76
MANAGING PARTNER                            399183.80
C.P.A.                                      388737.11
ENTREPRENEUR                                383773.62
Name: contb_receipt_amt, dtype: float64
  • 利用函数进行数据转换:职业与雇主信息分析

  • 建立一个职业对应字典,把相同职业的不同表达映射为对应的职业,比如把C.E.O映射CEO

  • 2.3 数据筛选

  • 赞助金额筛选,赞助金额大于0

amt_election = election.query('contb_receipt_amt >0')
amt_election.head()
cmte_idcand_idcand_nmcontbr_nmcontbr_citycontbr_stcontbr_zipcontbr_employercontbr_occupationcontb_receipt_amtcontb_receipt_dtreceipt_descmemo_cdmemo_textform_tpfile_numparty
0C00410118P20002978Bachmann, MichelleHARVEY, WILLIAMMOBILEAL366010290RETIREDRETIRED250.020-JUN-11NOT PROVIDENOT PROVIDENOT PROVIDESA17A736166Republican
1C00410118P20002978Bachmann, MichelleHARVEY, WILLIAMMOBILEAL366010290RETIREDRETIRED50.023-JUN-11NOT PROVIDENOT PROVIDENOT PROVIDESA17A736166Republican
2C00410118P20002978Bachmann, MichelleSMITH, LANIERLANETTAL368633403INFORMATION REQUESTEDINFORMATION REQUESTED250.005-JUL-11NOT PROVIDENOT PROVIDENOT PROVIDESA17A749073Republican
3C00410118P20002978Bachmann, MichelleBLEVINS, DARONDAPIGGOTTAR724548253NONERETIRED250.001-AUG-11NOT PROVIDENOT PROVIDENOT PROVIDESA17A749073Republican
4C00410118P20002978Bachmann, MichelleWARDENBURG, HAROLDHOT SPRINGS NATIONAR719016467NONERETIRED300.020-JUN-11NOT PROVIDENOT PROVIDENOT PROVIDESA17A736166Republican
  • 查看候选人获得的赞助总金额
amt_election.groupby('cand_nm')['contb_receipt_amt'].sum()
cand_nm
Bachmann, Michelle                2.639510e+06
Cain, Herman                      7.058163e+06
Gingrich, Newt                    9.895836e+06
Huntsman, Jon                     3.324373e+06
Johnson, Gary Earl                4.132769e+05
McCotter, Thaddeus G              3.903000e+04
Obama, Barack                     8.259441e+07
Paul, Ron                         1.559408e+07
Pawlenty, Timothy                 5.978124e+06
Perry, Rick                       2.020167e+07
Roemer, Charles E. 'Buddy' III    3.429658e+05
Romney, Mitt                      5.702315e+07
Santorum, Rick                    3.364115e+06
Name: contb_receipt_amt, dtype: float64
# 单位划分到一百万
amt_election.groupby('cand_nm')['contb_receipt_amt'].sum() // 1000000
cand_nm
Bachmann, Michelle                 2.0
Cain, Herman                       7.0
Gingrich, Newt                     9.0
Huntsman, Jon                      3.0
Johnson, Gary Earl                 0.0
McCotter, Thaddeus G               0.0
Obama, Barack                     82.0
Paul, Ron                         15.0
Pawlenty, Timothy                  5.0
Perry, Rick                       20.0
Roemer, Charles E. 'Buddy' III     0.0
Romney, Mitt                      57.0
Santorum, Rick                     3.0
Name: contb_receipt_amt, dtype: float64
# 排序
s1 =(amt_election.groupby('cand_nm')['contb_receipt_amt'].sum() // 1000000).sort_values()
s1
cand_nm
Johnson, Gary Earl                 0.0
McCotter, Thaddeus G               0.0
Roemer, Charles E. 'Buddy' III     0.0
Bachmann, Michelle                 2.0
Huntsman, Jon                      3.0
Santorum, Rick                     3.0
Pawlenty, Timothy                  5.0
Cain, Herman                       7.0
Gingrich, Newt                     9.0
Paul, Ron                         15.0
Perry, Rick                       20.0
Romney, Mitt                      57.0
Obama, Barack                     82.0
Name: contb_receipt_amt, dtype: float64
  • 饼图绘制各候选人政治献金情况
s1.plot(kind='bar')
<matplotlib.axes._subplots.AxesSubplot at 0x7fe8370>

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-o9aE0bGc-1649908284419)(output_37_1.png)]

  • 筛选选取候选人为Obama、Romney的子集数据,多种方式
amt_election
cmte_idcand_idcand_nmcontbr_nmcontbr_citycontbr_stcontbr_zipcontbr_employercontbr_occupationcontb_receipt_amtcontb_receipt_dtreceipt_descmemo_cdmemo_textform_tpfile_numparty
0C00410118P20002978Bachmann, MichelleHARVEY, WILLIAMMOBILEAL366010290RETIREDRETIRED250.020-JUN-11NOT PROVIDENOT PROVIDENOT PROVIDESA17A736166Republican
1C00410118P20002978Bachmann, MichelleHARVEY, WILLIAMMOBILEAL366010290RETIREDRETIRED50.023-JUN-11NOT PROVIDENOT PROVIDENOT PROVIDESA17A736166Republican
2C00410118P20002978Bachmann, MichelleSMITH, LANIERLANETTAL368633403INFORMATION REQUESTEDINFORMATION REQUESTED250.005-JUL-11NOT PROVIDENOT PROVIDENOT PROVIDESA17A749073Republican
3C00410118P20002978Bachmann, MichelleBLEVINS, DARONDAPIGGOTTAR724548253NONERETIRED250.001-AUG-11NOT PROVIDENOT PROVIDENOT PROVIDESA17A749073Republican
4C00410118P20002978Bachmann, MichelleWARDENBURG, HAROLDHOT SPRINGS NATIONAR719016467NONERETIRED300.020-JUN-11NOT PROVIDENOT PROVIDENOT PROVIDESA17A736166Republican
......................................................
536036C00500587P20003281Perry, RickANDERSON, MARILEE MRS.INFO REQUESTEDXX99999INFORMATION REQUESTED PER BEST EFFORTSINFORMATION REQUESTED PER BEST EFFORTS2500.031-AUG-11NOT PROVIDENOT PROVIDENOT PROVIDESA17A751678Republican
536037C00500587P20003281Perry, RickTOLBERT, DARYL MR.INFO REQUESTEDXX99999T.A.C.C.LONGWALL MAINTENANCE FOREMAN500.030-SEP-11NOT PROVIDENOT PROVIDENOT PROVIDESA17A751678Republican
536038C00500587P20003281Perry, RickGRANE, BRYAN F. MR.INFO REQUESTEDXX99999INFORMATION REQUESTED PER BEST EFFORTSINFORMATION REQUESTED PER BEST EFFORTS500.029-SEP-11NOT PROVIDENOT PROVIDENOT PROVIDESA17A751678Republican
536039C00500587P20003281Perry, RickDUFFY, DAVID A. MR.INFO REQUESTEDXX99999DUFFY EQUIPMENT COMPANY INC.BUSINESS OWNER2500.030-SEP-11NOT PROVIDENOT PROVIDENOT PROVIDESA17A751678Republican
536040C00500587P20003281Perry, RickGORMAN, CHRIS D. MR.INFO REQUESTEDXX99999INFORMATION REQUESTED PER BEST EFFORTSINFORMATION REQUESTED PER BEST EFFORTS5000.029-SEP-11REATTRIBUTION / REDESIGNATION REQUESTED (AUTOM...NOT PROVIDEREATTRIBUTION / REDESIGNATION REQUESTED (AUTOM...SA17A751678Republican

530314 rows × 17 columns

Obama = amt_election.query('cand_nm == "Obama, Barack"')
Obama
cmte_idcand_idcand_nmcontbr_nmcontbr_citycontbr_stcontbr_zipcontbr_employercontbr_occupationcontb_receipt_amtcontb_receipt_dtreceipt_descmemo_cdmemo_textform_tpfile_numparty
58829C00431445P80003338Obama, BarackHILL, HARRY AAICHI-GUNAANOT PROVIDEOAK LAWN MARKETING INC.PRESIDENT2500.011-DEC-11NOT PROVIDEX* OBAMA VICTORY FUND 2012SA18763233Democrat
58831C00431445P80003338Obama, BarackTORCHIANA, WILLIAM D.FRANCEAANOT PROVIDESULLIVAN & CROMWELLLAWYER2500.025-NOV-11NOT PROVIDEX* OBAMA VICTORY FUND 2012SA18763233Democrat
58832C00431445P80003338Obama, BarackBERGMAN, MARKLONDONAANOT PROVIDEPAUL WEISS RIFKIND ET ALATTORNEY2500.016-DEC-11NOT PROVIDEX* OBAMA VICTORY FUND 2012SA18763233Democrat
58833C00431445P80003338Obama, BarackBRIDGEWATER, PAMELADPOAA340080001DEPT. OF STATEDIPLOMAT500.024-JAN-12NOT PROVIDENOT PROVIDENOT PROVIDESA17A766535Democrat
58834C00431445P80003338Obama, BarackHIGHTOWER, CAROLYNDPOAA340049997DEPARTMENT OF STATEPUBLIC ADMINISTRATOR500.012-JAN-12NOT PROVIDENOT PROVIDENOT PROVIDESA17A766535Democrat
......................................................
351224C00431445P80003338Obama, BarackMOUNTS, ROBERTAPOZZ962043077HQ USFK (FKDC-SA)GS-15 INTERNATIONAL RELATIONS OFFICER25.026-NOV-11NOT PROVIDENOT PROVIDENOT PROVIDESA17A763233Democrat
351225C00431445P80003338Obama, BarackMOUNTS, ROBERTAPOZZ962043077HQ USFK (FKDC-SA)GS-15 INTERNATIONAL RELATIONS OFFICER25.026-JAN-12NOT PROVIDENOT PROVIDENOT PROVIDESA17A766535Democrat
351226C00431445P80003338Obama, BarackJONES, LESLIEHAMILTON PARISHZZ0CR04SELF-EMPLOYEDATTORNEY2500.030-DEC-11NOT PROVIDEX* OBAMA VICTORY FUND 2012SA18763233Democrat
351227C00431445P80003338Obama, BarackJONES, LESLIEHAMILTON PARISHZZ0CR04SELF-EMPLOYEDATTORNEY2500.030-DEC-11NOT PROVIDEX* OBAMA VICTORY FUND 2012SA18763233Democrat
351228C00431445P80003338Obama, BarackMOUNTS, ROBERTAPOZZ962043077HQ USFK (FKDC-SA)GS-15 INTERNATIONAL RELATIONS OFFICER25.026-OCT-11NOT PROVIDENOT PROVIDENOT PROVIDESA17A763233Democrat

289999 rows × 17 columns

# 另一种取法
romeny = amt_election[amt_election['cand_nm'] == "Romney, Mitt"]
romeny
cmte_idcand_idcand_nmcontbr_nmcontbr_citycontbr_stcontbr_zipcontbr_employercontbr_occupationcontb_receipt_amtcontb_receipt_dtreceipt_descmemo_cdmemo_textform_tpfile_numparty
409C00431171P80003353Romney, MittEDWARDS, BRYANT MR.LONDON UKAE90071LATHAM & WATKINSATTORNEY2500.020-JUN-11NOT PROVIDENOT PROVIDENOT PROVIDESA17A760248Republican
410C00431171P80003353Romney, MittDELUCA, PIERREAPOAE091280005CISCOENGINEER250.028-APR-11NOT PROVIDENOT PROVIDENOT PROVIDESA17A760248Republican
411C00431171P80003353Romney, MittDELUCA, PIERREAPOAE091280005CISCOENGINEER250.027-JUN-11NOT PROVIDENOT PROVIDENOT PROVIDESA17A760248Republican
412C00431171P80003353Romney, MittKAMMER, DONALDAPOAE09267USMILITARY500.031-DEC-11NOT PROVIDENOT PROVIDENOT PROVIDESA17A763507Republican
413C00431171P80003353Romney, MittCLAUDE, FOUKSPARIS FRANCEAE75011PALPATINESECRET250.029-DEC-11NOT PROVIDENOT PROVIDENOT PROVIDESA17A763507Republican
......................................................
58824C00431171P80003353Romney, MittKANUTSIZ, SINANINFO REQUESTEDXX99999K-COMMOWNER500.023-SEP-11NOT PROVIDENOT PROVIDENOT PROVIDESA17A760261Republican
58825C00431171P80003353Romney, MittDUFFEY, JAMES D. MR. JR.INFO REQUESTEDXX99999INFORMATION REQUESTED PER BEST EFFORTSINFORMATION REQUESTED PER BEST EFFORTS500.028-NOV-11NOT PROVIDENOT PROVIDENOT PROVIDESA17A763507Republican
58826C00431171P80003353Romney, MittOSBORN, JOHN E. MR.INFO REQUESTEDXX99999INFORMATION REQUESTED PER BEST EFFORTSINFORMATION REQUESTED PER BEST EFFORTS1000.017-AUG-11NOT PROVIDENOT PROVIDENOT PROVIDESA17A760261Republican
58827C00431171P80003353Romney, MittMAFFUCCI, NANCY L. MS.INFO REQUESTEDXX99999WUNDERMANMARKETING COMMUNICATIONS500.007-JUL-11NOT PROVIDENOT PROVIDENOT PROVIDESA17A760261Republican
58828C00431171P80003353Romney, MittMAFFUCCI, NANCY L. MS.INFO REQUESTEDXX99999WUNDERMANMARKETING COMMUNICATIONS1000.023-SEP-11NOT PROVIDENOT PROVIDENOT PROVIDESA17A760261Republican

57471 rows × 17 columns

3.数据聚合与分组

  • 3.1透视表(pivot_table)分析党派和职业

  • 按照党派、职业对赞助金额进行汇总,类似excel中的透视表操作,聚合函数为sum

s2 =amt_election.pivot_table(['contb_receipt_amt'],index=['party','contbr_occupation'],aggfunc='sum')
s2
contb_receipt_amt
partycontbr_occupation
DemocratMIXED-MEDIA ARTIST / STORYTELLER100.00
AREA VICE PRESIDENT250.00
RESEARCH ASSOCIATE100.00
TEACHER500.00
THERAPIST3900.00
.........
RepublicanZIG ZAG RESTAURANT GROUP950.00
ZIMMERMANS DAIRY83.71
ZOMBIE SLAYER1556.00
\NONE\250.00
~75.00

32750 rows × 1 columns

  • 过滤掉赞助金额小于200W的数据
s2 > 2000000
contb_receipt_amt
partycontbr_occupation
DemocratMIXED-MEDIA ARTIST / STORYTELLERFalse
AREA VICE PRESIDENTFalse
RESEARCH ASSOCIATEFalse
TEACHERFalse
THERAPISTFalse
.........
RepublicanZIG ZAG RESTAURANT GROUPFalse
ZIMMERMANS DAIRYFalse
ZOMBIE SLAYERFalse
\NONE\False
~False

32750 rows × 1 columns

s2.unstack()
contb_receipt_amt
contbr_occupationMIXED-MEDIA ARTIST / STORYTELLERAREA VICE PRESIDENTRESEARCH ASSOCIATETEACHERTHERAPIST(PART-TIME) SALES CONSULTANT & WRITER(RETIRED)---.NET PROGRAMMER...YOUTH SPECIALISTYOUTH WORKERZEN BUDDHIST PRIESTZEPPOS AND ASSOCIATESZIG ZAG RESTAURANT GROUPZIMMERMANS DAIRYZOMBIE SLAYERZOOLOGIST\NONE\~
party
Democrat100.0250.0100.0500.03900.0NaNNaN5000.0NaN441.0...525.0NaN300.01000.0NaNNaNNaN100.0NaNNaN
RepublicanNaNNaNNaNNaNNaN285.0250.03085.0425.0NaN...NaN595.12NaNNaN950.083.711556.0NaN250.075.0

2 rows × 29198 columns

# 把NAN填充,用0填充
s2.unstack(fill_value=0)
contb_receipt_amt
contbr_occupationMIXED-MEDIA ARTIST / STORYTELLERAREA VICE PRESIDENTRESEARCH ASSOCIATETEACHERTHERAPIST(PART-TIME) SALES CONSULTANT & WRITER(RETIRED)---.NET PROGRAMMER...YOUTH SPECIALISTYOUTH WORKERZEN BUDDHIST PRIESTZEPPOS AND ASSOCIATESZIG ZAG RESTAURANT GROUPZIMMERMANS DAIRYZOMBIE SLAYERZOOLOGIST\NONE\~
party
Democrat100.0250.0100.0500.03900.00.00.05000.00.0441.0...525.00.00300.01000.00.00.000.0100.00.00.0
Republican0.00.00.00.00.0285.0250.03085.0425.00.0...0.0595.120.00.0950.083.711556.00.0250.075.0

2 rows × 29198 columns

s2.unstack(level=0,fill_value=0)
contb_receipt_amt
partyDemocratRepublican
contbr_occupation
MIXED-MEDIA ARTIST / STORYTELLER100.00.00
AREA VICE PRESIDENT250.00.00
RESEARCH ASSOCIATE100.00.00
TEACHER500.00.00
THERAPIST3900.00.00
.........
ZIMMERMANS DAIRY0.083.71
ZOMBIE SLAYER0.01556.00
ZOOLOGIST100.00.00
\NONE\0.0250.00
~0.075.00

29198 rows × 2 columns

  • 绘制图形
s2.plot()
<matplotlib.axes._subplots.AxesSubplot at 0x1172dd0>

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-hWpr65kS-1649908284424)(output_51_1.png)]

s22 = s2.unstack(level=0,fill_value=0)
s22
contb_receipt_amt
partyDemocratRepublican
contbr_occupation
MIXED-MEDIA ARTIST / STORYTELLER100.00.00
AREA VICE PRESIDENT250.00.00
RESEARCH ASSOCIATE100.00.00
TEACHER500.00.00
THERAPIST3900.00.00
.........
ZIMMERMANS DAIRY0.083.71
ZOMBIE SLAYER0.01556.00
ZOOLOGIST100.00.00
\NONE\0.0250.00
~0.075.00

29198 rows × 2 columns

s22_1 = s22.sum(axis=1)
s22_1
contbr_occupation
   MIXED-MEDIA ARTIST / STORYTELLER     100.00
 AREA VICE PRESIDENT                    250.00
 RESEARCH ASSOCIATE                     100.00
 TEACHER                                500.00
 THERAPIST                             3900.00
                                        ...   
ZIMMERMANS DAIRY                         83.71
ZOMBIE SLAYER                          1556.00
ZOOLOGIST                               100.00
\NONE\                                  250.00
~                                        75.00
Length: 29198, dtype: float64
s3 = amt_election.pivot_table(['contb_receipt_amt'],
                             columns='party',
                             index=['contbr_occupation'],aggfunc='sum')
s3.fillna(0,inplace=True)
s3
contb_receipt_amt
partyDemocratRepublican
contbr_occupation
MIXED-MEDIA ARTIST / STORYTELLER100.00.00
AREA VICE PRESIDENT250.00.00
RESEARCH ASSOCIATE100.00.00
TEACHER500.00.00
THERAPIST3900.00.00
.........
ZIMMERMANS DAIRY0.083.71
ZOMBIE SLAYER0.01556.00
ZOOLOGIST100.00.00
\NONE\0.0250.00
~0.075.00

29198 rows × 2 columns

s3[s22.sum(axis=1) > 1000000]
contb_receipt_amt
partyDemocratRepublican
contbr_occupation
ATTORNEY7115093.355561875.69
BANKER147439.001000738.68
BUSINESS OWNER313243.70889236.14
CEO2382425.675284341.79
CHAIRMAN390823.001021352.75
CONSULTANT1636016.361859573.23
ENGINEER552192.401218073.23
EXECUTIVE972931.633088889.08
FINANCE154856.401202855.48
HOMEMAKER2784593.609899812.15
INFORMATION REQUESTED2654049.904006935.69
INFORMATION REQUESTED PER BEST EFFORTS0.0010149354.09
INVESTOR639607.001954885.16
LAWYER1974727.92343741.87
MANAGER419079.161044834.85
NOT EMPLOYED1035639.5330581.04
NOT PROVIDE4560685.43591690.46
OWNER643033.421848252.45
PHYSICIAN2244445.922554100.10
PRESIDENT1285419.953552588.33
PROFESSOR1322512.54203491.10
REAL ESTATE377200.551326560.00
RETIRED15103811.0715177817.96
SALES210130.341039807.63
SELF-EMPLOYED378857.221018210.08
STUDENT414364.53752169.64
# 大于一百万的
s3[s22.sum(axis=1) > 1000000].plot()
<matplotlib.axes._subplots.AxesSubplot at 0x5f480f0>

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-epVuE5OA-1649908284426)(output_56_1.png)]

# 大于两百万的
s3[s22.sum(axis=1) > 2000000].plot()
<matplotlib.axes._subplots.AxesSubplot at 0x5f177b0>

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-b9vEyw2p-1649908284428)(output_57_1.png)]

# 大于两百万的
s3[s22.sum(axis=1) > 2000000].plot(kind='bar',stacked=True)
<matplotlib.axes._subplots.AxesSubplot at 0x5f0edf0>

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-xSMOaAKE-1649908284429)(output_58_1.png)]

  • 3.2分组级运算和转换
  • 根据职业与雇主信息分布分组运算捐献金额总数
  • 由于职业和雇主的处理非常相似,我们定义函数get_top_amounts()对两个字段进行分析处理
  • 从数据可以看出,Obama更受精英群体(律师、医生、咨询顾问)的欢迎,Romney则得到更多企业家或高管的支持
  • 同样的,使用get_top_amounts()对雇主进行分析处理
  • Obama:微软、盛德国际律师事务所;Rommney:瑞士瑞信银行、摩根斯坦利、高盛公司、巴克莱资本、H.I.G.资本
amt_election.columns
Index(['cmte_id', 'cand_id', 'cand_nm', 'contbr_nm', 'contbr_city',
       'contbr_st', 'contbr_zip', 'contbr_employer', 'contbr_occupation',
       'contb_receipt_amt', 'contb_receipt_dt', 'receipt_desc', 'memo_cd',
       'memo_text', 'form_tp', 'file_num', 'party'],
      dtype='object')
amt_election.pivot_table('contb_receipt_amt',
                         index='contbr_occupation',
                        aggfunc='sum')
contb_receipt_amt
contbr_occupation
MIXED-MEDIA ARTIST / STORYTELLER100.00
AREA VICE PRESIDENT250.00
RESEARCH ASSOCIATE100.00
TEACHER500.00
THERAPIST3900.00
......
ZIMMERMANS DAIRY83.71
ZOMBIE SLAYER1556.00
ZOOLOGIST100.00
\NONE\250.00
~75.00

29198 rows × 1 columns

# 处理特殊符号
# 对小于4的字符进行处理
f2 = lambda item:np.nan if len(item) < 4 else item

amt_election['contbr_employer'] = amt_election['contbr_employer'].map(f2)
D:\yingyong\Anaconda3\lib\site-packages\ipykernel_launcher.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
amt_election['contbr_employer'].isna()
0         False
1         False
2         False
3         False
4         False
          ...  
536036    False
536037    False
536038    False
536039    False
536040    False
Name: contbr_employer, Length: 530314, dtype: bool
amt_election[amt_election['contbr_employer'].isna()]
cmte_idcand_idcand_nmcontbr_nmcontbr_citycontbr_stcontbr_zipcontbr_employercontbr_occupationcontb_receipt_amtcontb_receipt_dtreceipt_descmemo_cdmemo_textform_tpfile_numparty
172C00410118P20002978Bachmann, MichelleGRANARA, FRANKNORWELLMA02061NaNPRESIDENT500.030-JUN-11NOT PROVIDENOT PROVIDENOT PROVIDESA17A736166Republican
356C00410118P20002978Bachmann, MichelleFITE, ROYHOUSTONTX770423445NaNAUDITOR500.021-JUN-11NOT PROVIDENOT PROVIDENOT PROVIDESA17A736166Republican
412C00431171P80003353Romney, MittKAMMER, DONALDAPOAE09267NaNMILITARY500.031-DEC-11NOT PROVIDENOT PROVIDENOT PROVIDESA17A763507Republican
414C00431171P80003353Romney, MittJONES, BRENDONQATARAE09898NaNMEDIA ANALYST250.022-NOV-11NOT PROVIDENOT PROVIDENOT PROVIDESA17A763507Republican
804C00431171P80003353Romney, MittFONDREN, SHELIA MRS.BIRMINGHAMAL352163796NaNVP OF HUMAN RESOURCES100.001-NOV-11NOT PROVIDENOT PROVIDENOT PROVIDESA17A763507Republican
......................................................
535693C00500587P20003281Perry, RickCORLE, FREDERIC W. MR.ALEXANDRIAVA223141575NaNSR DIR1000.029-SEP-11NOT PROVIDENOT PROVIDENOT PROVIDESA17A751678Republican
535741C00500587P20003281Perry, RickKOZUCH, RANDY J. MR.ALEXANDRIAVA223146250NaNDIRECTOR1000.030-SEP-11NOT PROVIDENOT PROVIDENOT PROVIDESA17A751678Republican
535750C00500587P20003281Perry, RickMCBRIDE, DONALD S. MR.FAIRFAXVA220331122NaNVICE PRESIDENT COMMUNICATIONS1000.029-SEP-11NOT PROVIDENOT PROVIDENOT PROVIDESA17A751678Republican
535795C00500587P20003281Perry, RickSHREWSBERRY, HAROLD MR.FAIRFAXVA220305833NaNASSISTANT INSPECTOR GENERAL2500.019-SEP-11NOT PROVIDENOT PROVIDENOT PROVIDESA17A751678Republican
536029C00500587P20003281Perry, RickTEXAS ENERGY L.L.C.CHEYENNEWY820013163NaNLLC250.030-SEP-11NOT PROVIDEXSEE ATTRIBUTION BELOWSA17A761750Republican

10280 rows × 17 columns

# 删除NAN值
amt_election.dropna(axis=0,inplace=True)
amt_election[amt_election['contbr_employer'].isna()]
D:\yingyong\Anaconda3\lib\site-packages\ipykernel_launcher.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

cmte_idcand_idcand_nmcontbr_nmcontbr_citycontbr_stcontbr_zipcontbr_employercontbr_occupationcontb_receipt_amtcontb_receipt_dtreceipt_descmemo_cdmemo_textform_tpfile_numparty
# 再进行聚合操作
amt_election.pivot_table('contb_receipt_amt',
                         index=['contbr_occupation','contbr_employer'],
                        aggfunc='sum')
contb_receipt_amt
contbr_occupationcontbr_employer
MIXED-MEDIA ARTIST / STORYTELLERMOVINGSTORIES LLC100.00
AREA VICE PRESIDENTTIME WARNER CABLE250.00
RESEARCH ASSOCIATEHARVARD MEDICAL SCHOOL100.00
TEACHERDREW SCHOOL500.00
THERAPISTSEQUOIA COUNSELING SERVICES3900.00
.........
ZIG ZAG RESTAURANT GROUPZIG ZAG RESTAURANT GROUP950.00
ZIMMERMANS DAIRYZIMMERMANS DAIRY83.71
ZOMBIE SLAYERATLANTA ZOMBIE SERVICES, LLC1556.00
ZOOLOGISTSMITH INST100.00
\NONE\NONE250.00

124369 rows × 1 columns

s4 = Obama.pivot_table('contb_receipt_amt',
                         index=['contbr_occupation','contbr_employer'],
                        aggfunc='sum')
s4
contb_receipt_amt
contbr_occupationcontbr_employer
MIXED-MEDIA ARTIST / STORYTELLERMOVINGSTORIES LLC100.0
AREA VICE PRESIDENTTIME WARNER CABLE250.0
RESEARCH ASSOCIATEHARVARD MEDICAL SCHOOL100.0
TEACHERDREW SCHOOL500.0
THERAPISTSEQUOIA COUNSELING SERVICES3900.0
.........
YOUTH SPECIALISTLINCOLN UNIVERSITY200.0
NH-JAG325.0
ZEN BUDDHIST PRIESTROCHESTER ZEN CENTER300.0
ZEPPOS AND ASSOCIATESFOUNDER AND CEO1000.0
ZOOLOGISTSMITH INST100.0

59886 rows × 1 columns

s4.sort_values('contb_receipt_amt',ascending=False)
contb_receipt_amt
contbr_occupationcontbr_employer
RETIREDRETIRED12390060.73
NOT PROVIDENOT PROVIDE4529180.78
INFORMATION REQUESTEDINFORMATION REQUESTED2559801.28
HOMEMAKERHOMEMAKER1996712.95
RETIREDNOT EMPLOYED1851167.34
.........
INDUSTRIAL & COMMERCIAL NATIONAL TITLERETIRED3.00
EXECUTIVE DIRECTORLOCAL INITIATIVES SUPPORT CORPORATION,3.00
ART GALLERYSELBY FLEETWOOD GALLERY3.00
POLICY/ LAWYERU.S. DEPARTMENT OF LABOR3.00
MANAGEMENT CONSULTANTHARBERT BERNARD & ASSOCIATES, LLC3.00

59927 rows × 1 columns

# Obama的数据,取前10条,画图
s4.sort_values('contb_receipt_amt',ascending=False)[:10].plot(kind='bar')
<matplotlib.axes._subplots.AxesSubplot at 0x6173a90>

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-Ebg2Vhoc-1649908284433)(output_69_1.png)]

# 雇主
s4.sort_values('contbr_employer',ascending=False)[:10].plot(kind='bar')
<matplotlib.axes._subplots.AxesSubplot at 0x8793e70>

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-8A2oVgj9-1649908284434)(output_70_1.png)]

4.时间处理

  • 4.1str转datetime
amt_election.dtypes
cmte_id               object
cand_id               object
cand_nm               object
contbr_nm             object
contbr_city           object
contbr_st             object
contbr_zip            object
contbr_employer       object
contbr_occupation     object
contb_receipt_amt    float64
contb_receipt_dt      object
receipt_desc          object
memo_cd               object
memo_text             object
form_tp               object
file_num               int64
party                 object
dtype: object
amt_election['contb_receipt_dt'].unique()
array(['20-JUN-11', '23-JUN-11', '05-JUL-11', '01-AUG-11', '21-JUN-11',
       '22-JUN-11', '17-JUN-11', '11-JUL-11', '08-JUL-11', '27-JUN-11',
       '13-JUN-11', '16-JUN-11', '18-JUL-11', '05-AUG-11', '01-JUL-11',
       '06-JUL-11', '13-JUL-11', '28-JUN-11', '30-JUN-11', '08-AUG-11',
       '15-AUG-11', '16-AUG-11', '18-AUG-11', '24-JUN-11', '11-OCT-11',
       '14-JUL-11', '29-JUN-11', '18-OCT-11', '30-SEP-11', '11-AUG-11',
       '02-AUG-11', '14-JUN-11', '23-AUG-11', '07-JUL-11', '25-JUL-11',
       '21-JUL-11', '24-OCT-11', '12-JUL-11', '28-JUL-11', '09-SEP-11',
       '10-NOV-11', '12-SEP-11', '04-OCT-11', '05-DEC-11', '31-OCT-11',
       '12-DEC-11', '29-JUL-11', '22-JUL-11', '27-OCT-11', '28-NOV-11',
       '26-AUG-11', '26-SEP-11', '27-JUL-11', '28-APR-11', '29-DEC-11',
       '28-JAN-12', '29-NOV-11', '21-AUG-11', '16-MAY-11', '18-MAY-11',
       '12-MAY-11', '19-MAY-11', '19-SEP-11', '16-APR-11', '15-MAY-11',
       '15-JUN-11', '11-SEP-11', '06-SEP-11', '15-OCT-11', '20-NOV-11',
       '22-NOV-11', '15-DEC-11', '27-DEC-11', '02-DEC-11', '06-DEC-11',
       '09-DEC-11', '29-SEP-11', '28-SEP-11', '01-SEP-11', '15-SEP-11',
       '09-AUG-11', '16-SEP-11', '14-SEP-11', '03-MAY-11', '02-SEP-11',
       '07-SEP-11', '15-JUL-11', '26-JUL-11', '25-AUG-11', '02-MAY-11',
       '23-JAN-12', '09-JAN-12', '11-JAN-12', '25-JAN-12', '05-JAN-12',
       '13-JAN-12', '20-JAN-12', '15-JAN-12', '30-JAN-12', '16-JAN-12',
       '04-JAN-12', '26-JAN-12', '27-JAN-12', '10-JAN-12', '12-JAN-12',
       '22-JAN-12', '01-JAN-12', '30-DEC-11', '17-DEC-11', '08-NOV-11',
       '25-OCT-11', '17-NOV-11', '07-NOV-11', '13-OCT-11', '18-NOV-11',
       '22-DEC-11', '17-JAN-12', '21-DEC-11', '07-DEC-11', '29-OCT-11',
       '10-OCT-11', '17-AUG-11', '24-MAY-11', '06-JUN-11', '31-MAY-11',
       '29-APR-11', '13-APR-11', '27-SEP-11', '17-SEP-11', '30-AUG-11',
       '20-SEP-11', '29-AUG-11', '11-JUN-11', '23-MAY-11', '08-SEP-11',
       '22-SEP-11', '22-AUG-11', '12-AUG-11', '27-MAY-11', '19-APR-11',
       '24-AUG-11', '19-AUG-11', '28-AUG-11', '21-SEP-11', '10-AUG-11',
       '20-JUL-11', '21-NOV-11', '02-OCT-11', '19-OCT-11', '18-JAN-12',
       '24-JAN-12', '09-NOV-11', '14-NOV-11', '16-DEC-11', '28-OCT-11',
       '31-DEC-11', '02-NOV-11', '25-NOV-11', '23-DEC-11', '17-OCT-11',
       '10-DEC-11', '01-DEC-11', '13-DEC-11', '30-NOV-11', '12-OCT-11',
       '15-NOV-11', '14-DEC-11', '16-NOV-11', '28-DEC-11', '20-DEC-11',
       '03-NOV-11', '20-OCT-11', '01-NOV-11', '03-JAN-12', '07-JAN-12',
       '31-JAN-12', '19-JAN-12', '13-SEP-11', '01-JUN-11', '11-APR-11',
       '07-JUN-11', '02-JAN-12', '08-JAN-12', '29-JAN-12', '08-DEC-11',
       '19-DEC-11', '19-JUL-11', '06-JAN-12', '21-JAN-12', '06-OCT-11',
       '06-NOV-11', '19-NOV-11', '27-NOV-11', '23-NOV-11', '21-OCT-11',
       '23-OCT-11', '25-DEC-11', '07-OCT-11', '04-NOV-11', '16-OCT-11',
       '05-NOV-11', '30-OCT-11', '14-OCT-11', '04-DEC-11', '01-OCT-11',
       '05-OCT-11', '26-NOV-11', '03-OCT-11', '26-DEC-11', '04-AUG-11',
       '23-SEP-11', '31-AUG-11', '03-AUG-11', '16-JUL-11', '14-AUG-11',
       '05-SEP-11', '10-SEP-11', '04-JUL-11', '24-SEP-11', '25-SEP-11',
       '13-MAY-11', '26-APR-11', '06-MAY-11', '17-MAY-11', '10-JUN-11',
       '15-APR-11', '10-MAY-11', '03-JUN-11', '12-APR-11', '05-MAY-11',
       '09-MAY-11', '20-APR-11', '25-APR-11', '25-JUN-11', '25-MAY-11',
       '12-JUN-11', '07-MAY-11', '04-MAY-11', '14-APR-11', '14-MAY-11',
       '20-MAY-11', '18-JUN-11', '24-JUL-11', '14-JAN-12', '11-DEC-11',
       '13-NOV-11', '18-DEC-11', '24-DEC-11', '11-NOV-11', '26-OCT-11',
       '03-DEC-11', '08-OCT-11', '24-NOV-11', '12-NOV-11', '23-JUL-11',
       '20-AUG-11', '18-SEP-11', '10-JUL-11', '07-AUG-11', '13-AUG-11',
       '22-APR-11', '19-JUN-11', '09-JUN-11', '08-JUN-11', '26-MAY-11',
       '27-APR-11', '18-APR-11', '17-APR-11', '24-APR-11', '21-APR-11',
       '04-JUN-11', '22-MAY-11', '26-JUN-11', '28-MAY-11', '01-MAY-11',
       '11-MAY-11', '30-APR-11', '02-JUN-11', '30-MAY-11', '29-MAY-11',
       '05-JUN-11', '23-APR-11', '06-AUG-11', '27-AUG-11', '21-MAY-11',
       '03-JUL-11', '31-JUL-11', '09-OCT-11', '22-OCT-11', '17-JUL-11',
       '30-JUL-11', '02-JUL-11', '08-MAY-11', '04-SEP-11', '09-JUL-11',
       '03-SEP-11', '05-APR-11', '04-APR-11', '08-APR-11', '10-APR-11',
       '06-APR-11', '07-APR-11', '09-APR-11', '24-MAR-11', '07-MAR-11',
       '05-MAR-11', '16-MAR-11', '04-MAR-11', '15-MAR-11', '17-MAR-11',
       '03-MAR-11', '06-MAR-11', '10-MAR-11', '08-MAR-11', '25-MAR-11',
       '22-MAR-11', '12-MAR-11', '13-MAR-11', '09-MAR-11', '11-MAR-11',
       '29-MAR-11', '30-MAR-11', '18-MAR-11', '20-MAR-11', '02-MAR-11',
       '21-MAR-11', '31-MAR-11', '03-APR-11', '14-MAR-11', '19-MAR-11',
       '28-MAR-11', '23-MAR-11', '02-APR-11', '26-MAR-11', '01-APR-11',
       '09-FEB-11', '28-FEB-11', '07-FEB-11', '27-MAR-11', '22-FEB-11',
       '14-JAN-11', '17-JAN-11', '17-FEB-11', '31-JAN-11', '12-JAN-11',
       '29-JAN-11', '15-JAN-11', '20-JAN-11', '21-JAN-11', '11-FEB-11',
       '16-FEB-11', '28-JAN-11', '18-FEB-11', '23-FEB-11', '01-FEB-11',
       '03-FEB-11', '04-FEB-11', '08-FEB-11', '13-JAN-11', '16-JAN-11',
       '12-FEB-11', '13-FEB-11', '04-JAN-11', '10-FEB-11', '24-FEB-11',
       '22-JAN-11', '24-JAN-11', '26-JAN-11', '01-JAN-11', '03-JAN-11',
       '01-MAR-11', '27-JAN-11', '15-FEB-11', '14-FEB-11', '21-FEB-11',
       '18-JAN-11'], dtype=object)
# 扩展:字符串转时间
from datetime import datetime
datetime.now()
datetime.datetime(2022, 4, 14, 10, 32, 8, 500374)
datetime.now().strftime('%d-%B-%y')
'14-April-22'
# strftime是一种计算机函数,根据区域设置格式化本地时间/日期,函数的功能:将时间格式化
# strptime(),功能:按照特定时间格式将字符串转换(解析)为时间类型。
# pd.to_datetime(obama['contb_receipt_dt']) 太慢了
# 封装函数
def covert_date(item):
    return datetime.strptime(item,'%d-%B-%y')
# 字母变小写,然后首字母大写
'14-April-22'.lower()
'14-april-22'
'14-April-22'.lower().title()
'14-April-22'
# 封装函数
def covert_date(item):
    return datetime.strptime(item.lower().title(),'%d-%b-%y')
Obama['amt_dt'] = Obama['contb_receipt_dt'].map(covert_date)
D:\yingyong\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
Obama['amt_dt']
58829    2011-12-11
58831    2011-11-25
58832    2011-12-16
58833    2012-01-24
58834    2012-01-12
            ...    
351224   2011-11-26
351225   2012-01-26
351226   2011-12-30
351227   2011-12-30
351228   2011-10-26
Name: amt_dt, Length: 289999, dtype: datetime64[ns]
  • 4.2以时间为索引
# 排序
Obama.sort_values('amt_dt')
cmte_idcand_idcand_nmcontbr_nmcontbr_citycontbr_stcontbr_zipcontbr_employercontbr_occupationcontb_receipt_amtcontb_receipt_dtreceipt_descmemo_cdmemo_textform_tpfile_numpartyamt_dt
61913C00431445P80003338Obama, BarackSMITH, WILLIS BTEXARKANAAR718547720RETIREDFORMER ATTORNEY100.004-APR-11NOT PROVIDENOT PROVIDENOT PROVIDESA17A756214Democrat2011-04-04
137424C00431445P80003338Obama, BarackFISHER, TADJACKSONVILLEFL322251084FL ACADEMY OF FAMILY PHYSICIANSCEO100.004-APR-11NOT PROVIDENOT PROVIDENOT PROVIDESA17A756214Democrat2011-04-04
137407C00431445P80003338Obama, BarackCOMMESSO, CATHYPALM HARBORFL346852225NOT EMPLOYEDNOT EMPLOYED100.004-APR-11NOT PROVIDENOT PROVIDENOT PROVIDESA17A756214Democrat2011-04-04
137388C00431445P80003338Obama, BarackDE HAAN, RUTHHOLMES BEACHFL34217RETIREDRETIRED50.004-APR-11NOT PROVIDENOT PROVIDENOT PROVIDESA17A756214Democrat2011-04-04
71701C00431445P80003338Obama, BarackGONZALEZ, DAVIDSAN FRANCISCOCA941072023GENENTECHSR. HUMAN RESOURCE MANAGER100.004-APR-11NOT PROVIDENOT PROVIDENOT PROVIDESA17A756214Democrat2011-04-04
.........................................................
241140C00431445P80003338Obama, BarackUZUN, OZCANPISCATAWAYNJ08854NYIT AND CUNYINSTRUCTOR AND BUSINESS OWNER2500.031-JAN-12NOT PROVIDEX* OBAMA VICTORY FUND 2012SA18766535Democrat2012-01-31
221876C00431445P80003338Obama, BarackROESLER, DEBORAHWAYZATAMN55391NOT EMPLOYEDRETIRED500.031-JAN-12NOT PROVIDEX* OBAMA VICTORY FUND 2012SA18766535Democrat2012-01-31
298651C00431445P80003338Obama, BarackKNOLL, SANDRA LPHOENIXVILLEPA194602272INFORMATION REQUESTEDINFORMATION REQUESTED150.031-JAN-12NOT PROVIDENOT PROVIDENOT PROVIDESA17A766535Democrat2012-01-31
113520C00431445P80003338Obama, BarackTORBOHN, GRACEMODESTOCA953560436RETIREDRETIRED55.031-JAN-12NOT PROVIDENOT PROVIDENOT PROVIDESA17A766535Democrat2012-01-31
231259C00431445P80003338Obama, BarackVRANA, BRUCEDURHAMNC277072835SYNGENTAATTORNEY250.031-JAN-12NOT PROVIDENOT PROVIDENOT PROVIDESA17A766535Democrat2012-01-31

289999 rows × 18 columns

# 设置索引
obama_amt_dt = Obama.sort_values('amt_dt').set_index('amt_dt')
obama_amt_dt.head()
cmte_idcand_idcand_nmcontbr_nmcontbr_citycontbr_stcontbr_zipcontbr_employercontbr_occupationcontb_receipt_amtcontb_receipt_dtreceipt_descmemo_cdmemo_textform_tpfile_numparty
amt_dt
2011-04-04C00431445P80003338Obama, BarackSMITH, WILLIS BTEXARKANAAR718547720RETIREDFORMER ATTORNEY100.004-APR-11NOT PROVIDENOT PROVIDENOT PROVIDESA17A756214Democrat
2011-04-04C00431445P80003338Obama, BarackFISHER, TADJACKSONVILLEFL322251084FL ACADEMY OF FAMILY PHYSICIANSCEO100.004-APR-11NOT PROVIDENOT PROVIDENOT PROVIDESA17A756214Democrat
2011-04-04C00431445P80003338Obama, BarackCOMMESSO, CATHYPALM HARBORFL346852225NOT EMPLOYEDNOT EMPLOYED100.004-APR-11NOT PROVIDENOT PROVIDENOT PROVIDESA17A756214Democrat
2011-04-04C00431445P80003338Obama, BarackDE HAAN, RUTHHOLMES BEACHFL34217RETIREDRETIRED50.004-APR-11NOT PROVIDENOT PROVIDENOT PROVIDESA17A756214Democrat
2011-04-04C00431445P80003338Obama, BarackGONZALEZ, DAVIDSAN FRANCISCOCA941072023GENENTECHSR. HUMAN RESOURCE MANAGER100.004-APR-11NOT PROVIDENOT PROVIDENOT PROVIDESA17A756214Democrat
obama_amt_dt.plot()
<matplotlib.axes._subplots.AxesSubplot at 0x1065fe70>

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-jqDPPsIl-1649908284437)(output_86_1.png)]

obama_amt_dt['contb_receipt_amt'].plot()
<matplotlib.axes._subplots.AxesSubplot at 0x22286a70>

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-tCutu2nF-1649908284438)(output_87_1.png)]

Obama.groupby('amt_dt')['contb_receipt_amt'].sum().plot()
<matplotlib.axes._subplots.AxesSubplot at 0x8779710>

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-DosxXpFZ-1649908284440)(output_88_1.png)]

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

今晚务必早点睡

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值