数据分析项目-合集-day02

最新推荐文章于 2024-09-15 15:29:29 发布

weixin_47049321

最新推荐文章于 2024-09-15 15:29:29 发布

阅读量262

点赞数

文章标签：人口数据州简称人口密度数据合并缺失值处理

本文链接：https://blog.csdn.net/weixin_47049321/article/details/124639173

版权

数据分析项目-合集-day02

需求：

- 导入文件，查看原始数据
- 将人口数据和各州的简称数据进行合并
- 将合并的数据中重复的abbreviation进行删除
- 查看存在缺失数据的列
- 找到有哪些state/region使得state的值为NaN,进行取重操作
- 为找到的这些state/region的state项补上正确的值，从而去除掉state这一列的所有NaN
- 合并各州面积数据areas
- 我们会发现area(sq.m)这一列有缺失数据，找出是哪行
- 去除含有缺失数据的行
- 找出2010年的全民人口数据
- 计算各州的人口密度
- 排序，并找出人口密度最高的州

import numpy as np
import pandas as pd
from pandas import DataFrame
import shutil

#导入其他文件夹的源数据，导入一次即可
#shutil.copytree(r"C:\01-工作相关\第2阶段-03-21-数据分析资料\数据分析项目-合集",r"数据分析项目-合集")

#导入文件,查看原始数据
abb=pd.read_csv("../data/state-abbrevs.csv") #state(州的全称) #abbreviation(州的简称)
abb.head()

	state	abbreviation
0	Alabama	AL
1	Alaska	AK
2	Arizona	AZ
3	Arkansas	AR
4	California	CA

area=pd.read_csv("../data/state-areas.csv") #state(州的全称) #area(州的面积)
area.head()

	state	area (sq. mi)
0	Alabama	52423
1	Alaska	656425
2	Arizona	114006
3	Arkansas	53182
4	California	163707

pop=pd.read_csv("../data/state-population.csv") #state/region(州的简称) #age年龄 #year时间 #population人口
pop.head()

	state/region	ages	year	population
0	AL	under18	2012	1117489.0
1	AL	total	2012	4817528.0
2	AL	under18	2010	1130966.0
3	AL	total	2010	4785570.0
4	AL	under18	2011	1125763.0

#将人口数据和各州的简称数据进行合并
abb_pop=pd.merge(abb,pop,left_on="abbreviation",right_on="state/region",how="outer")
abb_pop.head()

	state	abbreviation	state/region	ages	year	population
0	Alabama	AL	AL	under18	2012	1117489.0
1	Alabama	AL	AL	total	2012	4817528.0
2	Alabama	AL	AL	under18	2010	1130966.0
3	Alabama	AL	AL	total	2010	4785570.0
4	Alabama	AL	AL	under18	2011	1125763.0

#将合并的数据中重复的abbreviation进行删除
abb_pop.drop(labels="abbreviation",axis=1,inplace=True)

abb_pop.head()

	state	state/region	ages	year	population
0	Alabama	AL	under18	2012	1117489.0
1	Alabama	AL	total	2012	4817528.0
2	Alabama	AL	under18	2010	1130966.0
3	Alabama	AL	total	2010	4785570.0
4	Alabama	AL	under18	2011	1125763.0

#查看存在缺失数据的列
#方式1：isnull, notll,any,all
abb_pop.isnull().any(axis=0)
#state,population这两列是存在空值的

state            True
state/region    False
ages            False
year            False
population       True
dtype: bool

#方式2：
abb_pop.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2544 entries, 0 to 2543
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   state         2448 non-null   object 
 1   state/region  2544 non-null   object 
 2   ages          2544 non-null   object 
 3   year          2544 non-null   int64  
 4   population    2524 non-null   float64
dtypes: float64(1), int64(1), object(3)
memory usage: 119.2+ KB

#找到有哪些state/region使得state的值为NaN,进行取重操作(将state中的空值对应的简称找到，且对简称进行去重)
abb_pop

	state	state/region	ages	year	population
0	Alabama	AL	under18	2012	1117489.0
1	Alabama	AL	total	2012	4817528.0
2	Alabama	AL	under18	2010	1130966.0
3	Alabama	AL	total	2010	4785570.0
4	Alabama	AL	under18	2011	1125763.0
...	...	...	...	...	...
2539	NaN	USA	total	2010	309326295.0
2540	NaN	USA	under18	2011	73902222.0
2541	NaN	USA	total	2011	311582564.0
2542	NaN	USA	under18	2012	73708179.0
2543	NaN	USA	total	2012	313873685.0

2544 rows × 5 columns

#思路：可以将state这一列中的空值对应的行数据取出，从该行数据中就可以取出简称的值

#1.将state中的空值定位
abb_pop["state"].isnull()
#2.将上述布尔值作为源数据的行索引
abb_pop.loc[abb_pop["state"].isnull()] #将state中空对应的行数据取出
#3.将简称取出
abb_pop.loc[abb_pop["state"].isnull()]["state/region"]
#4.对简称去重
abb_pop.loc[abb_pop["state"].isnull()]["state/region"].unique()

#结论：只有PR和USA对应的全称数据为空值

array(['PR', 'USA'], dtype=object)

#为找到的这些state/region的state项补上正确的值，从而去除掉state这一列的所有NaN
#思考：填充该需求中的空值可不可以使用fillna?
    #- 不可以。fillna可以使用空的紧邻值做填充。fillna(value="xxx")使用指定的值填充空值
    #（而这里需要填充的有两种，PR 和USA 对应的全称，无法统一填充）
    # 使用给元素赋值的方式进行填充！

#1.先给USA的全称对应的空值进行批量赋值

#1.1将USA对应的行数据找到（行数据中就存在state的空值）
abb_pop["state/region"]=="USA"
abb_pop.loc[abb_pop["state/region"]=="USA"] #将USA对应的行数据取出

	state	state/region	ages	year	population
2496	NaN	USA	under18	1990	64218512.0
2497	NaN	USA	total	1990	249622814.0
2498	NaN	USA	total	1991	252980942.0
2499	NaN	USA	under18	1991	65313018.0
2500	NaN	USA	under18	1992	66509177.0
2501	NaN	USA	total	1992	256514231.0
2502	NaN	USA	total	1993	259918595.0
2503	NaN	USA	under18	1993	67594938.0
2504	NaN	USA	under18	1994	68640936.0
2505	NaN	USA	total	1994	263125826.0
2506	NaN	USA	under18	1995	69473140.0
2507	NaN	USA	under18	1996	70233512.0
2508	NaN	USA	total	1995	266278403.0
2509	NaN	USA	total	1996	269394291.0
2510	NaN	USA	total	1997	272646932.0
2511	NaN	USA	under18	1997	70920738.0
2512	NaN	USA	under18	1998	71431406.0
2513	NaN	USA	total	1998	275854116.0
2514	NaN	USA	under18	1999	71946051.0
2515	NaN	USA	total	2000	282162411.0
2516	NaN	USA	under18	2000	72376189.0
2517	NaN	USA	total	1999	279040181.0
2518	NaN	USA	total	2001	284968955.0
2519	NaN	USA	under18	2001	72671175.0
2520	NaN	USA	total	2002	287625193.0
2521	NaN	USA	under18	2002	72936457.0
2522	NaN	USA	total	2003	290107933.0
2523	NaN	USA	under18	2003	73100758.0
2524	NaN	USA	total	2004	292805298.0
2525	NaN	USA	under18	2004	73297735.0
2526	NaN	USA	total	2005	295516599.0
2527	NaN	USA	under18	2005	73523669.0
2528	NaN	USA	total	2006	298379912.0
2529	NaN	USA	under18	2006	73757714.0
2530	NaN	USA	total	2007	301231207.0
2531	NaN	USA	under18	2007	74019405.0
2532	NaN	USA	total	2008	304093966.0
2533	NaN	USA	under18	2008	74104602.0
2534	NaN	USA	under18	2013	73585872.0
2535	NaN	USA	total	2013	316128839.0
2536	NaN	USA	total	2009	306771529.0
2537	NaN	USA	under18	2009	74134167.0
2538	NaN	USA	under18	2010	74119556.0
2539	NaN	USA	total	2010	309326295.0
2540	NaN	USA	under18	2011	73902222.0
2541	NaN	USA	total	2011	311582564.0
2542	NaN	USA	under18	2012	73708179.0
2543	NaN	USA	total	2012	313873685.0

#1.2 将USA对应的全称空的对应行索引取出
indexs=abb_pop.loc[abb_pop["state/region"]=="USA"].index

abb_pop.iloc[indexs]
abb_pop.loc[indexs,"state"]="United States"

#2.将PR的全称进行赋值
abb_pop["state/region"]=="PR"
indexs2=abb_pop.loc[abb_pop["state/region"]=="PR"].index

abb_pop.loc[indexs2,"state"]="Puerto Rico"

#- 合并各州面积数据areas
#- 我们会发现area(sq.m)这一列有缺失数据，找出是哪行
#- 去除含有缺失数据的行
#- 找出2010年的全民人口数据
#- 计算各州的人口密度
#- 排序，并找出人口密度最高的州

#- 合并各州面积数据areas
abb_pop_area=pd.merge(abb_pop,area,how="outer")
abb_pop_area

	state	state/region	ages	year	population	area (sq. mi)
0	Alabama	AL	under18	2012	1117489.0	52423.0
1	Alabama	AL	total	2012	4817528.0	52423.0
2	Alabama	AL	under18	2010	1130966.0	52423.0
3	Alabama	AL	total	2010	4785570.0	52423.0
4	Alabama	AL	under18	2011	1125763.0	52423.0
...	...	...	...	...	...	...
2539	United States	USA	total	2010	309326295.0	NaN
2540	United States	USA	under18	2011	73902222.0	NaN
2541	United States	USA	total	2011	311582564.0	NaN
2542	United States	USA	under18	2012	73708179.0	NaN
2543	United States	USA	total	2012	313873685.0	NaN

2544 rows × 6 columns

#- 我们会发现area(sq.m)这一列有缺失数据，找出是哪行
abb_pop_area["area (sq. mi)"].isnull()
indexs3=abb_pop_area.loc[abb_pop_area["area (sq. mi)"].isnull()].index

#- 去除含有缺失数据的行
abb_pop_area.drop(labels=indexs3,axis=0,inplace=True)

abb_pop_area

	state	state/region	ages	year	population	area (sq. mi)
0	Alabama	AL	under18	2012	1117489.0	52423.0
1	Alabama	AL	total	2012	4817528.0	52423.0
2	Alabama	AL	under18	2010	1130966.0	52423.0
3	Alabama	AL	total	2010	4785570.0	52423.0
4	Alabama	AL	under18	2011	1125763.0	52423.0
...	...	...	...	...	...	...
2491	Puerto Rico	PR	under18	2010	896945.0	3515.0
2492	Puerto Rico	PR	under18	2011	869327.0	3515.0
2493	Puerto Rico	PR	total	2011	3686580.0	3515.0
2494	Puerto Rico	PR	under18	2012	841740.0	3515.0
2495	Puerto Rico	PR	total	2012	3651545.0	3515.0

2496 rows × 6 columns

#- 找出2010年的全民人口数据(基于df做条件查询)
abb_pop_area.query('ages=="total" & year==2010')

	state	state/region	ages	year	population	area (sq. mi)
3	Alabama	AL	total	2010	4785570.0	52423.0
91	Alaska	AK	total	2010	713868.0	656425.0
101	Arizona	AZ	total	2010	6408790.0	114006.0
189	Arkansas	AR	total	2010	2922280.0	53182.0
197	California	CA	total	2010	37333601.0	163707.0
283	Colorado	CO	total	2010	5048196.0	104100.0
293	Connecticut	CT	total	2010	3579210.0	5544.0
379	Delaware	DE	total	2010	899711.0	1954.0
389	District of Columbia	DC	total	2010	605125.0	68.0
475	Florida	FL	total	2010	18846054.0	65758.0
485	Georgia	GA	total	2010	9713248.0	59441.0
570	Hawaii	HI	total	2010	1363731.0	10932.0
581	Idaho	ID	total	2010	1570718.0	83574.0
666	Illinois	IL	total	2010	12839695.0	57918.0
677	Indiana	IN	total	2010	6489965.0	36420.0
762	Iowa	IA	total	2010	3050314.0	56276.0
773	Kansas	KS	total	2010	2858910.0	82282.0
858	Kentucky	KY	total	2010	4347698.0	40411.0
869	Louisiana	LA	total	2010	4545392.0	51843.0
954	Maine	ME	total	2010	1327366.0	35387.0
965	Montana	MT	total	2010	990527.0	147046.0
1050	Nebraska	NE	total	2010	1829838.0	77358.0
1061	Nevada	NV	total	2010	2703230.0	110567.0
1146	New Hampshire	NH	total	2010	1316614.0	9351.0
1157	New Jersey	NJ	total	2010	8802707.0	8722.0
1242	New Mexico	NM	total	2010	2064982.0	121593.0
1253	New York	NY	total	2010	19398228.0	54475.0
1338	North Carolina	NC	total	2010	9559533.0	53821.0
1349	North Dakota	ND	total	2010	674344.0	70704.0
1434	Ohio	OH	total	2010	11545435.0	44828.0
1445	Oklahoma	OK	total	2010	3759263.0	69903.0
1530	Oregon	OR	total	2010	3837208.0	98386.0
1541	Maryland	MD	total	2010	5787193.0	12407.0
1626	Massachusetts	MA	total	2010	6563263.0	10555.0
1637	Michigan	MI	total	2010	9876149.0	96810.0
1722	Minnesota	MN	total	2010	5310337.0	86943.0
1733	Mississippi	MS	total	2010	2970047.0	48434.0
1818	Missouri	MO	total	2010	5996063.0	69709.0
1829	Pennsylvania	PA	total	2010	12710472.0	46058.0
1914	Rhode Island	RI	total	2010	1052669.0	1545.0
1925	South Carolina	SC	total	2010	4636361.0	32007.0
2010	South Dakota	SD	total	2010	816211.0	77121.0
2021	Tennessee	TN	total	2010	6356683.0	42146.0
2106	Texas	TX	total	2010	25245178.0	268601.0
2117	Utah	UT	total	2010	2774424.0	84904.0
2202	Vermont	VT	total	2010	625793.0	9615.0
2213	Virginia	VA	total	2010	8024417.0	42769.0
2298	Washington	WA	total	2010	6742256.0	71303.0
2309	West Virginia	WV	total	2010	1854146.0	24231.0
2394	Wisconsin	WI	total	2010	5689060.0	65503.0
2405	Wyoming	WY	total	2010	564222.0	97818.0
2490	Puerto Rico	PR	total	2010	3721208.0	3515.0

#- 计算各州的人口密度(人口./面积)
abb_pop_area["midu"]=abb_pop_area["population"]/abb_pop_area["area (sq. mi)"]
abb_pop_area

	state	state/region	ages	year	population	area (sq. mi)	midu
0	Alabama	AL	under18	2012	1117489.0	52423.0	21.316769
1	Alabama	AL	total	2012	4817528.0	52423.0	91.897221
2	Alabama	AL	under18	2010	1130966.0	52423.0	21.573851
3	Alabama	AL	total	2010	4785570.0	52423.0	91.287603
4	Alabama	AL	under18	2011	1125763.0	52423.0	21.474601
...	...	...	...	...	...	...	...
2491	Puerto Rico	PR	under18	2010	896945.0	3515.0	255.176387
2492	Puerto Rico	PR	under18	2011	869327.0	3515.0	247.319203
2493	Puerto Rico	PR	total	2011	3686580.0	3515.0	1048.813656
2494	Puerto Rico	PR	under18	2012	841740.0	3515.0	239.470839
2495	Puerto Rico	PR	total	2012	3651545.0	3515.0	1038.846373

2496 rows × 7 columns

#- 排序，并找出人口密度最高的州
abb_pop_area.sort_values(by="midu",axis=0,ascending=False).iloc[0]["state"]

'District of Columbia'

weixin_47049321

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
打赏
0
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫