python数据分析基础03——练习项目

最新推荐文章于 2024-07-16 17:08:46 发布

友培

最新推荐文章于 2024-07-16 17:08:46 发布

阅读量516

点赞数

分类专栏：大数据——数据挖掘

本文链接：https://blog.csdn.net/xiaoyoupei/article/details/122813594

版权

人口分析、美国大选、数据分析、政党献金、人口密度

关键词由CSDN通过智能技术生成

大数据——数据挖掘专栏收录该内容

19 篇文章 3 订阅

订阅专栏

文章目录

人口分析案例

需求：
- 导入文件，查看原始数据
- 将人口数据和各州简称数据进行数据汇总
- 将汇总的数据中重复的abbreviation列进行删除
- 在汇总的数据中查看存在缺失数据的列
- 在汇总的数据中找到有哪些state/region使得state的值为NaN，进行去重操作
- 为找到的这些state/region的state项补上正确的值，从而去除掉state这一列的所有NaN
- 汇总的数据和各州面积数据areas进行汇总
- 我们会发现 area (sq. mi) 这一列有缺失数据，找出是哪些行
- 去除含有缺失数据的行
- 找出2010年的全民人口数据
- 计算各州的人口密度

import pandas as pd
import numpy as np

# Load the state lookup table.
# Columns: state = full state name, abbreviation = the state's abbreviation.
abb = pd.read_csv('./state-abbrevs.csv')
abb

	state	abbreviation
0	Alabama	AL
1	Alaska	AK
2	Arizona	AZ
3	Arkansas	AR
4	California	CA
5	Colorado	CO
6	Connecticut	CT
7	Delaware	DE
8	District of Columbia	DC
9	Florida	FL
10	Georgia	GA
11	Hawaii	HI
12	Idaho	ID
13	Illinois	IL
14	Indiana	IN
15	Iowa	IA
16	Kansas	KS
17	Kentucky	KY
18	Louisiana	LA
19	Maine	ME
20	Montana	MT
21	Nebraska	NE
22	Nevada	NV
23	New Hampshire	NH
24	New Jersey	NJ
25	New Mexico	NM
26	New York	NY
27	North Carolina	NC
28	North Dakota	ND
29	Ohio	OH
30	Oklahoma	OK
31	Oregon	OR
32	Maryland	MD
33	Massachusetts	MA
34	Michigan	MI
35	Minnesota	MN
36	Mississippi	MS
37	Missouri	MO
38	Pennsylvania	PA
39	Rhode Island	RI
40	South Carolina	SC
41	South Dakota	SD
42	Tennessee	TN
43	Texas	TX
44	Utah	UT
45	Vermont	VT
46	Virginia	VA
47	Washington	WA
48	West Virginia	WV
49	Wisconsin	WI
50	Wyoming	WY

# Population table columns: state/region = state abbreviation, ages = age group,
# year = year of the figure, population = head count.
pop = pd.read_csv('./state-population.csv')

# Area table columns: state = full state name, area (sq. mi) = area.
area = pd.read_csv('./state-areas.csv')

# Outer join so that codes present in only one of the two tables are kept
# (their unmatched columns come back as NaN).
abb_pop = abb.merge(
    pop,
    how='outer',
    left_on='abbreviation',
    right_on='state/region',
)
abb_pop

	state	abbreviation	state/region	ages	year	population
0	Alabama	AL	AL	under18	2012	1117489.0
1	Alabama	AL	AL	total	2012	4817528.0
2	Alabama	AL	AL	under18	2010	1130966.0
3	Alabama	AL	AL	total	2010	4785570.0
4	Alabama	AL	AL	under18	2011	1125763.0
...	...	...	...	...	...	...
2539	NaN	NaN	USA	total	2010	309326295.0
2540	NaN	NaN	USA	under18	2011	73902222.0
2541	NaN	NaN	USA	total	2011	311582564.0
2542	NaN	NaN	USA	under18	2012	73708179.0
2543	NaN	NaN	USA	total	2012	313873685.0

2544 rows × 6 columns

# The abbreviation column is redundant next to state/region, so remove it
# from abb_pop in place.
abb_pop.drop(columns='abbreviation', inplace=True)

# For each column, report whether it contains at least one missing value.
abb_pop.isna().any()

state            True
state/region    False
ages            False
year            False
population       True
dtype: bool

# Find which state/region codes are responsible for the NaN values in the
# state column, without duplicates.
# 1. Boolean mask marking the rows whose state is missing.
ex = abb_pop['state'].isna()
# 2. The full rows selected by that mask.
abb_pop[ex]
# 3. Only the state/region code of those rows (row and column picked in one step).
abb_pop.loc[ex, 'state/region']
# 4. The distinct codes among them.
abb_pop.loc[ex, 'state/region'].unique()

array(['PR', 'USA'], dtype=object)

# Give every state/region code that has no full name its correct name, so that
# the state column no longer contains NaN.
# 1. Boolean mask of the rows whose code is USA.
abb_pop['state/region'] == 'USA'
# 2. Use that mask as the row indexer to pull out the USA rows.
abb_pop.loc[abb_pop['state/region'] == 'USA']
# 3. Row labels of the USA rows.
indexs = abb_pop.loc[abb_pop['state/region'] == 'USA'].index
# 4. Fill the state column of those rows with the full name of USA.
abb_pop.loc[indexs,'state'] = 'United States'

# Same steps for PR (Puerto Rico). A real name is required here, not a
# placeholder: the later merge with the area table joins on `state`, so only a
# name spelled like the one in that table can pick up an area.
# NOTE(review): presumably state-areas.csv lists it as 'Puerto Rico' - confirm.
abb_pop['state/region'] == 'PR'
abb_pop.loc[abb_pop['state/region'] == 'PR']
indexs = abb_pop.loc[abb_pop['state/region'] == 'PR'].index
abb_pop.loc[indexs,'state'] = 'Puerto Rico'

# Count the missing values left in the state column (expected to be 0 now).
abb_pop['state'].isnull().sum()

# Attach the area of each state by joining on the full state name; the outer
# join keeps rows that have no partner in the other table.
abb_pop_area = abb_pop.merge(area, how='outer', on='state')
abb_pop_area.head()

	state	state/region	ages	year	population	area (sq. mi)
0	Alabama	AL	under18	2012.0	1117489.0	52423.0
1	Alabama	AL	total	2012.0	4817528.0	52423.0
2	Alabama	AL	under18	2010.0	1130966.0	52423.0
3	Alabama	AL	total	2010.0	4785570.0	52423.0
4	Alabama	AL	under18	2011.0	1125763.0	52423.0

# The area (sq. mi) column has missing values: find the affected rows so they
# can be removed afterwards.
# Boolean mask of the rows without an area.
abb_pop_area['area (sq. mi)'].isna()
# The rows selected by that mask.
abb_pop_area[abb_pop_area['area (sq. mi)'].isna()]
# Row labels of those rows.
indexs = abb_pop_area[abb_pop_area['area (sq. mi)'].isna()].index

Int64Index([2448, 2449, 2450, 2451, 2452, 2453, 2454, 2455, 2456, 2457, 2458,
            2459, 2460, 2461, 2462, 2463, 2464, 2465, 2466, 2467, 2468, 2469,
            2470, 2471, 2472, 2473, 2474, 2475, 2476, 2477, 2478, 2479, 2480,
            2481, 2482, 2483, 2484, 2485, 2486, 2487, 2488, 2489, 2490, 2491,
            2492, 2493, 2494, 2495, 2496, 2497, 2498, 2499, 2500, 2501, 2502,
            2503, 2504, 2505, 2506, 2507, 2508, 2509, 2510, 2511, 2512, 2513,
            2514, 2515, 2516, 2517, 2518, 2519, 2520, 2521, 2522, 2523, 2524,
            2525, 2526, 2527, 2528, 2529, 2530, 2531, 2532, 2533, 2534, 2535,
            2536, 2537, 2538, 2539, 2540, 2541, 2542, 2543],
           dtype='int64')

# Remove, in place, the rows whose labels were collected in indexs.
abb_pop_area.drop(index=indexs, inplace=True)

# Whole-population figures for 2010: year is 2010 and the age group is "total".
abb_pop_area[(abb_pop_area['year'] == 2010) & (abb_pop_area['ages'] == 'total')]

	state	state/region	ages	year	population	area (sq. mi)
3	Alabama	AL	total	2010.0	4785570.0	52423.0
91	Alaska	AK	total	2010.0	713868.0	656425.0
101	Arizona	AZ	total	2010.0	6408790.0	114006.0
189	Arkansas	AR	total	2010.0	2922280.0	53182.0
197	California	CA	total	2010.0	37333601.0	163707.0
283	Colorado	CO	total	2010.0	5048196.0	104100.0
293	Connecticut	CT	total	2010.0	3579210.0	5544.0
379	Delaware	DE	total	2010.0	899711.0	1954.0
389	District of Columbia	DC	total	2010.0	605125.0	68.0
475	Florida	FL	total	2010.0	18846054.0	65758.0
485	Georgia	GA	total	2010.0	9713248.0	59441.0
570	Hawaii	HI	total	2010.0	1363731.0	10932.0
581	Idaho	ID	total	2010.0	1570718.0	83574.0
666	Illinois	IL	total	2010.0	12839695.0	57918.0
677	Indiana	IN	total	2010.0	6489965.0	36420.0
762	Iowa	IA	total	2010.0	3050314.0	56276.0
773	Kansas	KS	total	2010.0	2858910.0	82282.0
858	Kentucky	KY	total	2010.0	4347698.0	40411.0
869	Louisiana	LA	total	2010.0	4545392.0	51843.0
954	Maine	ME	total	2010.0	1327366.0	35387.0
965	Montana	MT	total	2010.0	990527.0	147046.0
1050	Nebraska	NE	total	2010.0	1829838.0	77358.0
1061	Nevada	NV	total	2010.0	2703230.0	110567.0
1146	New Hampshire	NH	total	2010.0	1316614.0	9351.0
1157	New Jersey	NJ	total	2010.0	8802707.0	8722.0
1242	New Mexico	NM	total	2010.0	2064982.0	121593.0
1253	New York	NY	total	2010.0	19398228.0	54475.0
1338	North Carolina	NC	total	2010.0	9559533.0	53821.0
1349	North Dakota	ND	total	2010.0	674344.0	70704.0
1434	Ohio	OH	total	2010.0	11545435.0	44828.0
1445	Oklahoma	OK	total	2010.0	3759263.0	69903.0
1530	Oregon	OR	total	2010.0	3837208.0	98386.0
1541	Maryland	MD	total	2010.0	5787193.0	12407.0
1626	Massachusetts	MA	total	2010.0	6563263.0	10555.0
1637	Michigan	MI	total	2010.0	9876149.0	96810.0
1722	Minnesota	MN	total	2010.0	5310337.0	86943.0
1733	Mississippi	MS	total	2010.0	2970047.0	48434.0
1818	Missouri	MO	total	2010.0	5996063.0	69709.0
1829	Pennsylvania	PA	total	2010.0	12710472.0	46058.0
1914	Rhode Island	RI	total	2010.0	1052669.0	1545.0
1925	South Carolina	SC	total	2010.0	4636361.0	32007.0
2010	South Dakota	SD	total	2010.0	816211.0	77121.0
2021	Tennessee	TN	total	2010.0	6356683.0	42146.0
2106	Texas	TX	total	2010.0	25245178.0	268601.0
2117	Utah	UT	total	2010.0	2774424.0	84904.0
2202	Vermont	VT	total	2010.0	625793.0	9615.0
2213	Virginia	VA	total	2010.0	8024417.0	42769.0
2298	Washington	WA	total	2010.0	6742256.0	71303.0
2309	West Virginia	WV	total	2010.0	1854146.0	24231.0
2394	Wisconsin	WI	total	2010.0	5689060.0	65503.0
2405	Wyoming	WY	total	2010.0	564222.0	97818.0

# Population density of each row: population divided by area.
midu = abb_pop_area['population'].div(abb_pop_area['area (sq. mi)'])

# Store the result as a new column named midu.
abb_pop_area['midu'] = midu
abb_pop_area.head()

	state	state/region	ages	year	population	area (sq. mi)	midu
0	Alabama	AL	under18	2012.0	1117489.0	52423.0	21.316769
1	Alabama	AL	total	2012.0	4817528.0	52423.0	91.897221
2	Alabama	AL	under18	2010.0	1130966.0	52423.0	21.573851
3	Alabama	AL	total	2010.0	4785570.0	52423.0	91.287603
4	Alabama	AL	under18	2011.0	1125763.0	52423.0	21.474601

2012美国大选献金项目数据分析

import numpy as np
import pandas as pd
# Lookup tables: month abbreviation -> month number, and candidate -> party.
months = {
    abbr: number
    for number, abbr in enumerate(
        ('JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN',
         'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC'),
        start=1,
    )
}

# Built from (candidate, party) pairs; the pair order fixes the dict order.
parties = dict([
    ('Bachmann, Michelle', 'Republican'),
    ('Romney, Mitt', 'Republican'),
    ('Obama, Barack', 'Democrat'),
    ("Roemer, Charles E. 'Buddy' III", 'Reform'),
    ('Pawlenty, Timothy', 'Republican'),
    ('Johnson, Gary Earl', 'Libertarian'),
    ('Paul, Ron', 'Republican'),
    ('Santorum, Rick', 'Republican'),
    ('Cain, Herman', 'Republican'),
    ('Gingrich, Newt', 'Republican'),
    ('McCotter, Thaddeus G', 'Republican'),
    ('Huntsman, Jon', 'Republican'),
    ('Perry, Rick', 'Republican'),
])

需求

加载数据
查看数据的基本信息
指定数据截取，将如下字段的数据进行提取，其他数据舍弃
- cand_nm ：候选人姓名
- contbr_nm ：捐赠人姓名
- contbr_st ：捐赠人所在州
- contbr_employer ：捐赠人所在公司
- contbr_occupation ：捐赠人职业
- contb_receipt_amt ：捐赠数额（美元）
- contb_receipt_dt ：捐款的日期
对新数据进行总览,查看是否存在缺失数据
用统计学指标快速描述数值型属性的概要。
空值处理。可能因为忘记填写或者保密等等原因，相关字段出现了空值，将其填充为NOT PROVIDE
异常值处理。将捐款金额<=0的数据删除
新建一列为各个候选人所在党派party
查看party这一列中有哪些不同的元素
统计party列中各个元素出现次数
查看各个党派收到的政治献金总数contb_receipt_amt
查看具体每天各个党派收到的政治献金总数contb_receipt_amt
将表中日期格式转换为'yyyy-mm-dd'。
查看老兵(捐献者职业)DISABLED VETERAN主要支持谁

# Load the contribution records and keep only the seven columns used by this
# analysis, in this order.
df = pd.read_csv('./data/usa_election.txt', low_memory=False)[[
    'cand_nm',
    'contbr_nm',
    'contbr_st',
    'contbr_employer',
    'contbr_occupation',
    'contb_receipt_amt',
    'contb_receipt_dt',
]]
df.head()

	cand_nm	contbr_nm	contbr_st	contbr_employer	contbr_occupation	contb_receipt_amt	contb_receipt_dt
0	Bachmann, Michelle	HARVEY, WILLIAM	AL	RETIRED	RETIRED	250.0	20-JUN-11
1	Bachmann, Michelle	HARVEY, WILLIAM	AL	RETIRED	RETIRED	50.0	23-JUN-11
2	Bachmann, Michelle	SMITH, LANIER	AL	INFORMATION REQUESTED	INFORMATION REQUESTED	250.0	05-JUL-11
3	Bachmann, Michelle	BLEVINS, DARONDA	AR	NONE	RETIRED	250.0	01-AUG-11
4	Bachmann, Michelle	WARDENBURG, HAROLD	AR	NONE	RETIRED	300.0	20-JUN-11

# Summarise the frame (column dtypes and non-null counts) to spot missing data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 536041 entries, 0 to 536040
Data columns (total 7 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   cand_nm            536041 non-null  object 
 1   contbr_nm          536041 non-null  object 
 2   contbr_st          536040 non-null  object 
 3   contbr_employer    525088 non-null  object 
 4   contbr_occupation  530520 non-null  object 
 5   contb_receipt_amt  536041 non-null  float64
 6   contb_receipt_dt   536041 non-null  object 
dtypes: float64(1), object(6)
memory usage: 28.6+ MB

# Replace every missing value with the placeholder NOT PROVIDE, modifying df in place.
df.fillna(value='NOT PROVIDE',inplace=True)

# Summarise again to check the non-null counts after filling.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 536041 entries, 0 to 536040
Data columns (total 7 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   cand_nm            536041 non-null  object 
 1   contbr_nm          536041 non-null  object 
 2   contbr_st          536041 non-null  object 
 3   contbr_employer    536041 non-null  object 
 4   contbr_occupation  536041 non-null  object 
 5   contb_receipt_amt  536041 non-null  float64
 6   contb_receipt_dt   536041 non-null  object 
dtypes: float64(1), object(6)
memory usage: 28.6+ MB

# Outlier handling: remove the rows whose donation amount is <= 0.
# Number of rows that will be removed.
(df['contb_receipt_amt'] <= 0).sum()

# Mask of the rows to keep (amount not <= 0).
~(df['contb_receipt_amt'] <= 0)
# .copy() gives df its own data: without it, the column assignment below writes
# to a selection of the old frame and pandas emits SettingWithCopyWarning.
df = df.loc[~(df['contb_receipt_amt'] <= 0)].copy()

# New column party: the party of each candidate, looked up in `parties`
# (candidates missing from the mapping get NaN).
df['party'] = df['cand_nm'].map(parties)
df.head()

	cand_nm	contbr_nm	contbr_st	contbr_employer	contbr_occupation	contb_receipt_amt	contb_receipt_dt	party
0	Bachmann, Michelle	HARVEY, WILLIAM	AL	RETIRED	RETIRED	250.0	20-JUN-11	Republican
1	Bachmann, Michelle	HARVEY, WILLIAM	AL	RETIRED	RETIRED	50.0	23-JUN-11	Republican
2	Bachmann, Michelle	SMITH, LANIER	AL	INFORMATION REQUESTED	INFORMATION REQUESTED	250.0	05-JUL-11	Republican
3	Bachmann, Michelle	BLEVINS, DARONDA	AR	NONE	RETIRED	250.0	01-AUG-11	Republican
4	Bachmann, Michelle	WARDENBURG, HAROLD	AR	NONE	RETIRED	300.0	20-JUN-11	Republican

# List the distinct values that occur in the party column.
df['party'].unique()

array(['Republican', 'Democrat', 'Reform', 'Libertarian'], dtype=object)

# Display the frame after the filtering and the new party column.
df

	cand_nm	contbr_nm	contbr_st	contbr_employer	contbr_occupation	contb_receipt_amt	contb_receipt_dt	party
0	Bachmann, Michelle	HARVEY, WILLIAM	AL	RETIRED	RETIRED	250.0	20-JUN-11	Republican
1	Bachmann, Michelle	HARVEY, WILLIAM	AL	RETIRED	RETIRED	50.0	23-JUN-11	Republican
2	Bachmann, Michelle	SMITH, LANIER	AL	INFORMATION REQUESTED	INFORMATION REQUESTED	250.0	05-JUL-11	Republican
3	Bachmann, Michelle	BLEVINS, DARONDA	AR	NONE	RETIRED	250.0	01-AUG-11	Republican
4	Bachmann, Michelle	WARDENBURG, HAROLD	AR	NONE	RETIRED	300.0	20-JUN-11	Republican
...	...	...	...	...	...	...	...	...
536036	Perry, Rick	ANDERSON, MARILEE MRS.	XX	INFORMATION REQUESTED PER BEST EFFORTS	INFORMATION REQUESTED PER BEST EFFORTS	2500.0	31-AUG-11	Republican
536037	Perry, Rick	TOLBERT, DARYL MR.	XX	T.A.C.C.	LONGWALL MAINTENANCE FOREMAN	500.0	30-SEP-11	Republican
536038	Perry, Rick	GRANE, BRYAN F. MR.	XX	INFORMATION REQUESTED PER BEST EFFORTS	INFORMATION REQUESTED PER BEST EFFORTS	500.0	29-SEP-11	Republican
536039	Perry, Rick	DUFFY, DAVID A. MR.	XX	DUFFY EQUIPMENT COMPANY INC.	BUSINESS OWNER	2500.0	30-SEP-11	Republican
536040	Perry, Rick	GORMAN, CHRIS D. MR.	XX	INFORMATION REQUESTED PER BEST EFFORTS	INFORMATION REQUESTED PER BEST EFFORTS	5000.0	29-SEP-11	Republican

530314 rows × 8 columns

# Count how many rows (contributions) each party value has.
df['party'].value_counts()

Democrat       289999
Republican     234300
Reform           5313
Libertarian       702
Name: party, dtype: int64

# Total donation amount (contb_receipt_amt) received by each party.
df['contb_receipt_amt'].groupby(df['party']).sum()

party
Democrat       8.259441e+07
Libertarian    4.132769e+05
Reform         3.429658e+05
Republican     1.251181e+08
Name: contb_receipt_amt, dtype: float64

# Total donation amount (contb_receipt_amt) received by each party on each day.
df['contb_receipt_amt'].groupby([df['contb_receipt_dt'], df['party']]).sum()

contb_receipt_dt  party      
01-APR-11         Reform             50.00
                  Republican      12635.00
01-AUG-11         Democrat       182198.00
                  Libertarian      1000.00
                  Reform           1847.00
                                   ...    
31-MAY-11         Republican     313839.80
31-OCT-11         Democrat       216971.87
                  Libertarian      4250.00
                  Reform           3205.00
                  Republican     751542.36
Name: contb_receipt_amt, Length: 1183, dtype: float64

# Convert the dates in the table to the 'yyyy-mm-dd' format.
def transform_date(d):
    """Convert a 'DD-MON-YY' date string to 'yyyy-mm-dd'.

    Args:
        d: Date text made of day, month abbreviation and two-digit year
            joined by '-', e.g. '20-JUN-11'. The month abbreviation must be
            a key of the module-level ``months`` mapping.

    Returns:
        The date as text with a four-digit year (the two-digit year prefixed
        with '20') and zero-padded month and day, e.g. '2011-06-20'.

    Raises:
        ValueError: If ``d`` does not split into exactly three '-' parts.
        KeyError: If the month abbreviation is not in ``months``.
    """
    day, month, year = d.split('-')
    # Pad month and day to two digits: that is what 'yyyy-mm-dd' requires, and
    # it also makes the strings sort in chronological order.
    return '20{}-{:02d}-{}'.format(year, months[month], day.zfill(2))
    
# Rewrite every value of the date column with transform_date.
df['contb_receipt_dt'] = df['contb_receipt_dt'].map(transform_date)

# Show the first rows to check the converted dates.
df.head()

	cand_nm	contbr_nm	contbr_st	contbr_employer	contbr_occupation	contb_receipt_amt	contb_receipt_dt	party
0	Bachmann, Michelle	HARVEY, WILLIAM	AL	RETIRED	RETIRED	250.0	2011-6-20	Republican
1	Bachmann, Michelle	HARVEY, WILLIAM	AL	RETIRED	RETIRED	50.0	2011-6-23	Republican
2	Bachmann, Michelle	SMITH, LANIER	AL	INFORMATION REQUESTED	INFORMATION REQUESTED	250.0	2011-7-05	Republican
3	Bachmann, Michelle	BLEVINS, DARONDA	AR	NONE	RETIRED	250.0	2011-8-01	Republican
4	Bachmann, Michelle	WARDENBURG, HAROLD	AR	NONE	RETIRED	300.0	2011-6-20	Republican

# Find out which candidate disabled veterans (occupation DISABLED VETERAN)
# support the most.
# Boolean mask of the rows whose donor occupation is exactly DISABLED VETERAN.
df['contbr_occupation'].eq('DISABLED VETERAN')
# Keep only those rows.
old_bing_df = df[df['contbr_occupation'].eq('DISABLED VETERAN')]
old_bing_df.head()

	cand_nm	contbr_nm	contbr_st	contbr_employer	contbr_occupation	contb_receipt_amt	contb_receipt_dt	party
149790	Obama, Barack	MAHURIN, DAVID	FL	VETERANS ADMINISTRATION	DISABLED VETERAN	10.0	2012-1-17	Democrat
150910	Obama, Barack	MAHURIN, DAVID	FL	VETERANS ADMINISTRATION	DISABLED VETERAN	20.0	2012-1-01	Democrat
174041	Obama, Barack	KRUCHTEN, MICHAEL	IL	DISABLED	DISABLED VETERAN	50.0	2011-12-02	Democrat
175244	Obama, Barack	KRUCHTEN, MICHAEL	IL	DISABLED	DISABLED VETERAN	250.0	2011-10-12	Democrat
183790	Obama, Barack	BRYANT, J.L.	KS	RET ARMY	DISABLED VETERAN	100.0	2011-10-12	Democrat

# Total amount donated by disabled veterans to each candidate.
old_bing_df.groupby(by='cand_nm')['contb_receipt_amt'].sum()

cand_nm
Cain, Herman       300.00
Obama, Barack     4205.00
Paul, Ron         2425.49
Santorum, Rick     250.00
Name: contb_receipt_amt, dtype: float64