老男孩-数据分析 05 人口分析案例

需求:

  • 导入文件,查看原始数据

  • 将人口数据和各州简称数据进行合并

  • 将合并的数据中重复的abbreviation列进行删除

  • 查看存在缺失数据的列

  • 找到有哪些state/region使得state的值为NaN,进行去重操作

  • 为找到的这些state/region的state项补上正确的值,从而去除掉state这一列的所有NaN

  • 合并各州面积数据areas

  • 我们会发现area(sq.mi)这一列有缺失数据,找出是哪些行

  • 去除含有缺失数据的行

  • 找出2010年的全民人口数据

  • 计算各州的人口密度
    排序,并找出人口密度最高的州

      import numpy as np
      import pandas as pd
      from pandas import DataFrame
      #导入文件,查看原始数据
      abb = pd.read_csv('../data/state-abbrevs.csv') #state(州的全称)abbreviation(州的简称)
      abb
      state	abbreviation
      0	Alabama	AL
      1	Alaska	AK
      2	Arizona	AZ
      3	Arkansas	AR
      4	California	CA
      5	Colorado	CO
      6	Connecticut	CT
      7	Delaware	DE
      8	District of Columbia	DC
      9	Florida	FL
      10	Georgia	GA
      11	Hawaii	HI
      12	Idaho	ID
      13	Illinois	IL
      14	Indiana	IN
      15	Iowa	IA
      16	Kansas	KS
      17	Kentucky	KY
      18	Louisiana	LA
      19	Maine	ME
      20	Montana	MT
      21	Nebraska	NE
      22	Nevada	NV
      23	New Hampshire	NH
      24	New Jersey	NJ
      25	New Mexico	NM
      26	New York	NY
      27	North Carolina	NC
      28	North Dakota	ND
      29	Ohio	OH
      30	Oklahoma	OK
      31	Oregon	OR
      32	Maryland	MD
      33	Massachusetts	MA
      34	Michigan	MI
      35	Minnesota	MN
      36	Mississippi	MS
      37	Missouri	MO
      38	Pennsylvania	PA
      39	Rhode Island	RI
      40	South Carolina	SC
      41	South Dakota	SD
      42	Tennessee	TN
      43	Texas	TX
      44	Utah	UT
      45	Vermont	VT
      46	Virginia	VA
      47	Washington	WA
      48	West Virginia	WV
      49	Wisconsin	WI
      50	Wyoming	WY
      area = pd.read_csv('../data/state-areas.csv') #state州的全称,area (sq. mi)州的面积
      area
      state	area (sq. mi)
      0	Alabama	52423
      1	Alaska	656425
      2	Arizona	114006
      3	Arkansas	53182
      4	California	163707
      5	Colorado	104100
      6	Connecticut	5544
      7	Delaware	1954
      8	Florida	65758
      9	Georgia	59441
      10	Hawaii	10932
      11	Idaho	83574
      12	Illinois	57918
      13	Indiana	36420
      14	Iowa	56276
      15	Kansas	82282
      16	Kentucky	40411
      17	Louisiana	51843
      18	Maine	35387
      19	Maryland	12407
      20	Massachusetts	10555
      21	Michigan	96810
      22	Minnesota	86943
      23	Mississippi	48434
      24	Missouri	69709
      25	Montana	147046
      26	Nebraska	77358
      27	Nevada	110567
      28	New Hampshire	9351
      29	New Jersey	8722
      30	New Mexico	121593
      31	New York	54475
      32	North Carolina	53821
      33	North Dakota	70704
      34	Ohio	44828
      35	Oklahoma	69903
      36	Oregon	98386
      37	Pennsylvania	46058
      38	Rhode Island	1545
      39	South Carolina	32007
      40	South Dakota	77121
      41	Tennessee	42146
      42	Texas	268601
      43	Utah	84904
      44	Vermont	9615
      45	Virginia	42769
      46	Washington	71303
      47	West Virginia	24231
      48	Wisconsin	65503
      49	Wyoming	97818
      50	District of Columbia	68
      51	Puerto Rico	3515
      pop = pd.read_csv('../data/state-population.csv')#state/region简称,ages年龄,year时间,population人口数量
      pop
      state/region	ages	year	population
      0	AL	under18	2012	1117489.0
      1	AL	total	2012	4817528.0
      2	AL	under18	2010	1130966.0
      3	AL	total	2010	4785570.0
      4	AL	under18	2011	1125763.0
      ...	...	...	...	...
      2539	USA	total	2010	309326295.0
      2540	USA	under18	2011	73902222.0
      2541	USA	total	2011	311582564.0
      2542	USA	under18	2012	73708179.0
      2543	USA	total	2012	313873685.0
      2544 rows × 4 columns
      
      #将人口数据和各州简称数据进行合并
      abb_pop = pd.merge(abb,pop,left_on='abbreviation',right_on='state/region',how='outer')
      abb_pop.head()
      state	abbreviation	state/region	ages	year	population
      0	Alabama	AL	AL	under18	2012	1117489.0
      1	Alabama	AL	AL	total	2012	4817528.0
      2	Alabama	AL	AL	under18	2010	1130966.0
      3	Alabama	AL	AL	total	2010	4785570.0
      4	Alabama	AL	AL	under18	2011	1125763.0
      #将合并的数据中重复的abbreviation列进行删除
      abb_pop.drop(labels='abbreviation',axis=1,inplace=True)
      ​
      abb_pop.head()
      state	state/region	ages	year	population
      0	Alabama	AL	under18	2012	1117489.0
      1	Alabama	AL	total	2012	4817528.0
      2	Alabama	AL	under18	2010	1130966.0
      3	Alabama	AL	total	2010	4785570.0
      4	Alabama	AL	under18	2011	1125763.0
      #查看存在缺失数据的列
      #方式1:isnull,notll,any,all
      abb_pop.isnull().any(axis=0)
      #state,population这两列中是存在空值
      state            True
      state/region    False
      ages            False
      year            False
      population       True
      dtype: bool
      #方式2:
      abb_pop.info()
      <class 'pandas.core.frame.DataFrame'>
      Int64Index: 2544 entries, 0 to 2543
      Data columns (total 5 columns):
       #   Column        Non-Null Count  Dtype  
      ---  ------        --------------  -----  
       0   state         2448 non-null   object 
       1   state/region  2544 non-null   object 
       2   ages          2544 non-null   object 
       3   year          2544 non-null   int64  
       4   population    2524 non-null   float64
      dtypes: float64(1), int64(1), object(3)
      memory usage: 119.2+ KB
      #找到有哪些state/region使得state的值为NaN,进行去重操作(将state中的空值对应的简称找到,且对简称进行去重)
      abb_pop.head()
      abb_pop
      state	state/region	ages	year	population
      0	Alabama	AL	under18	2012	1117489.0
      1	Alabama	AL	total	2012	4817528.0
      2	Alabama	AL	under18	2010	1130966.0
      3	Alabama	AL	total	2010	4785570.0
      4	Alabama	AL	under18	2011	1125763.0
      ...	...	...	...	...	...
      2539	NaN	USA	total	2010	309326295.0
      2540	NaN	USA	under18	2011	73902222.0
      2541	NaN	USA	total	2011	311582564.0
      2542	NaN	USA	under18	2012	73708179.0
      2543	NaN	USA	total	2012	313873685.0
      2544 rows × 5 columns
      
      #思路:可以将state这一列中的空值对应的行数据取出,从该行数据中就可以取出简称的值
      #1.将state中的空值定位到
      abb_pop['state'].isnull()
      #2.将上述的布尔值作为源数据的行索引
      abb_pop.loc[abb_pop['state'].isnull()]#将state中空对应的行数据取出
      #3.将简称取出
      abb_pop.loc[abb_pop['state'].isnull()]['state/region']
      #4.对简称去重
      abb_pop.loc[abb_pop['state'].isnull()]['state/region'].unique()
      #['state/region'].unique()
      ​
      #结论:只有PR和USA对应的全称数据为空值
      array(['PR', 'USA'], dtype=object)
      #为找到的这些state/region的state项补上正确的值,从而去除掉state这一列的所有NaN
      #思考:填充该需求中的空值可不可以使用fillna?
          # - 不可以。fillna可以使用空的紧邻值做填充。fillna(value='xxx')使用指定的值填充空值
          # 使用给元素赋值的方式进行填充!
      #1.先给USA的全称对应的空值进行批量赋值
      #1.1将USA对应的行数据找出(行数据中就存在state的空值)
      #abb_pop['state/region'] == 'USA'
      abb_pop.loc[abb_pop['state/region'] == 'USA']#将usa对应的行数据取出
      state	state/region	ages	year	population
      2496	NaN	USA	under18	1990	64218512.0
      2497	NaN	USA	total	1990	249622814.0
      2498	NaN	USA	total	1991	252980942.0
      2499	NaN	USA	under18	1991	65313018.0
      2500	NaN	USA	under18	1992	66509177.0
      2501	NaN	USA	total	1992	256514231.0
      2502	NaN	USA	total	1993	259918595.0
      2503	NaN	USA	under18	1993	67594938.0
      2504	NaN	USA	under18	1994	68640936.0
      2505	NaN	USA	total	1994	263125826.0
      2506	NaN	USA	under18	1995	69473140.0
      2507	NaN	USA	under18	1996	70233512.0
      2508	NaN	USA	total	1995	266278403.0
      2509	NaN	USA	total	1996	269394291.0
      2510	NaN	USA	total	1997	272646932.0
      2511	NaN	USA	under18	1997	70920738.0
      2512	NaN	USA	under18	1998	71431406.0
      2513	NaN	USA	total	1998	275854116.0
      2514	NaN	USA	under18	1999	71946051.0
      2515	NaN	USA	total	2000	282162411.0
      2516	NaN	USA	under18	2000	72376189.0
      2517	NaN	USA	total	1999	279040181.0
      2518	NaN	USA	total	2001	284968955.0
      2519	NaN	USA	under18	2001	72671175.0
      2520	NaN	USA	total	2002	287625193.0
      2521	NaN	USA	under18	2002	72936457.0
      2522	NaN	USA	total	2003	290107933.0
      2523	NaN	USA	under18	2003	73100758.0
      2524	NaN	USA	total	2004	292805298.0
      2525	NaN	USA	under18	2004	73297735.0
      2526	NaN	USA	total	2005	295516599.0
      2527	NaN	USA	under18	2005	73523669.0
      2528	NaN	USA	total	2006	298379912.0
      2529	NaN	USA	under18	2006	73757714.0
      2530	NaN	USA	total	2007	301231207.0
      2531	NaN	USA	under18	2007	74019405.0
      2532	NaN	USA	total	2008	304093966.0
      2533	NaN	USA	under18	2008	74104602.0
      2534	NaN	USA	under18	2013	73585872.0
      2535	NaN	USA	total	2013	316128839.0
      2536	NaN	USA	total	2009	306771529.0
      2537	NaN	USA	under18	2009	74134167.0
      2538	NaN	USA	under18	2010	74119556.0
      2539	NaN	USA	total	2010	309326295.0
      2540	NaN	USA	under18	2011	73902222.0
      2541	NaN	USA	total	2011	311582564.0
      2542	NaN	USA	under18	2012	73708179.0
      2543	NaN	USA	total	2012	313873685.0
      #1.2将USA对应的全称空对应的行索引取出
      indexs = abb_pop.loc[abb_pop['state/region'] == 'USA'].index
      abb_pop.iloc[indexs]
      abb_pop.loc[indexs]['state']='88'
      ​
      abb_pop.loc[indexs]['state']
      #['state'] = 'United States33'
      ​
      ​
      2496    NaN
      2497    NaN
      2498    NaN
      2499    NaN
      2500    NaN
      2501    NaN
      2502    NaN
      2503    NaN
      2504    NaN
      2505    NaN
      2506    NaN
      2507    NaN
      2508    NaN
      2509    NaN
      2510    NaN
      2511    NaN
      2512    NaN
      2513    NaN
      2514    NaN
      2515    NaN
      2516    NaN
      2517    NaN
      2518    NaN
      2519    NaN
      2520    NaN
      2521    NaN
      2522    NaN
      2523    NaN
      2524    NaN
      2525    NaN
      2526    NaN
      2527    NaN
      2528    NaN
      2529    NaN
      2530    NaN
      2531    NaN
      2532    NaN
      2533    NaN
      2534    NaN
      2535    NaN
      2536    NaN
      2537    NaN
      2538    NaN
      2539    NaN
      2540    NaN
      2541    NaN
      2542    NaN
      2543    NaN
      Name: state, dtype: object
      #2.可以将PR的全称进行赋值
      abb_pop['state/region'] == 'PR'
      abb_pop.loc[abb_pop['state/region'] == 'PR'] #PR对应的行数据
      # indexs = abb_pop.loc[abb_pop['state/region'] == 'PR'].index
      abb_pop.loc[indexs,'state'] = 'PPPRRR'
      #合并各州面积数据areas
      abb_pop_area = pd.merge(abb_pop,area,how='outer')
      #我们会发现area(sq.mi)这一列有缺失数据,找出是哪些行
      abb_pop_area['area (sq. mi)'].isnull()
      abb_pop_area.loc[abb_pop_area['area (sq. mi)'].isnull()] #空对应的行数据
      indexs = abb_pop_area.loc[abb_pop_area['area (sq. mi)'].isnull()].index
      #去除含有缺失数据的行
      abb_pop_area.drop(labels=indexs,axis=0,inplace=True)
      #找出2010年的全民人口数据(基于df做条件查询)
      abb_pop_area.query('ages == "total" & year == 2010')
      #计算各州的人口密度(人口除以面积)
      abb_pop_area['midu'] = abb_pop_area['population'] / abb_pop_area['area (sq. mi)']
      abb_pop_area
      #排序,并找出人口密度最高的州
      abb_pop_area.sort_values(by='midu',axis=0,ascending=False).iloc[0]['state']
    

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值