数据分析项目-合集-day02

数据分析项目-合集-day02

需求:

- 导入文件,查看原始数据
- 将人口数据和各州的简称数据进行合并
- 将合并的数据中重复的abbreviation进行删除
- 查看存在缺失数据的列
- 找到有哪些state/region使得state的值为NaN,进行取重操作
- 为找到的这些state/region的state项补上正确的值,从而去除掉state这一列的所有NaN
- 合并各州面积数据areas
- 我们会发现area(sq.m)这一列有缺失数据,找出是哪行
- 去除含有缺失数据的行
- 找出2010年的全民人口数据
- 计算各州的人口密度
- 排序,并找出人口密度最高的州
import numpy as np
import pandas as pd
from pandas import DataFrame
import shutil
#导入其他文件夹的源数据,导入一次即可
#shutil.copytree(r"C:\01-工作相关\第2阶段-03-21-数据分析资料\数据分析项目-合集",r"数据分析项目-合集")
#导入文件,查看原始数据
abb=pd.read_csv("../data/state-abbrevs.csv") #state(州的全称) #abbreviation(州的简称)
abb.head()
stateabbreviation
0AlabamaAL
1AlaskaAK
2ArizonaAZ
3ArkansasAR
4CaliforniaCA
area=pd.read_csv("../data/state-areas.csv") #state(州的全称) #area(州的面积)
area.head()
statearea (sq. mi)
0Alabama52423
1Alaska656425
2Arizona114006
3Arkansas53182
4California163707
pop=pd.read_csv("../data/state-population.csv") #state/region(州的简称) #age年龄 #year时间 #population人口
pop.head()
state/regionagesyearpopulation
0ALunder1820121117489.0
1ALtotal20124817528.0
2ALunder1820101130966.0
3ALtotal20104785570.0
4ALunder1820111125763.0
#将人口数据和各州的简称数据进行合并
abb_pop=pd.merge(abb,pop,left_on="abbreviation",right_on="state/region",how="outer")
abb_pop.head()
stateabbreviationstate/regionagesyearpopulation
0AlabamaALALunder1820121117489.0
1AlabamaALALtotal20124817528.0
2AlabamaALALunder1820101130966.0
3AlabamaALALtotal20104785570.0
4AlabamaALALunder1820111125763.0
#将合并的数据中重复的abbreviation进行删除
abb_pop.drop(labels="abbreviation",axis=1,inplace=True)
abb_pop.head()
statestate/regionagesyearpopulation
0AlabamaALunder1820121117489.0
1AlabamaALtotal20124817528.0
2AlabamaALunder1820101130966.0
3AlabamaALtotal20104785570.0
4AlabamaALunder1820111125763.0
#查看存在缺失数据的列
#方式1:isnull, notll,any,all
abb_pop.isnull().any(axis=0)
#state,population这两列是存在空值的
state            True
state/region    False
ages            False
year            False
population       True
dtype: bool
#方式2:
abb_pop.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2544 entries, 0 to 2543
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   state         2448 non-null   object 
 1   state/region  2544 non-null   object 
 2   ages          2544 non-null   object 
 3   year          2544 non-null   int64  
 4   population    2524 non-null   float64
dtypes: float64(1), int64(1), object(3)
memory usage: 119.2+ KB
#找到有哪些state/region使得state的值为NaN,进行取重操作(将state中的空值对应的简称找到,且对简称进行去重)
abb_pop
statestate/regionagesyearpopulation
0AlabamaALunder1820121117489.0
1AlabamaALtotal20124817528.0
2AlabamaALunder1820101130966.0
3AlabamaALtotal20104785570.0
4AlabamaALunder1820111125763.0
..................
2539NaNUSAtotal2010309326295.0
2540NaNUSAunder18201173902222.0
2541NaNUSAtotal2011311582564.0
2542NaNUSAunder18201273708179.0
2543NaNUSAtotal2012313873685.0

2544 rows × 5 columns

#思路:可以将state这一列中的空值对应的行数据取出,从该行数据中就可以取出简称的值
#1.将state中的空值定位
abb_pop["state"].isnull()
#2.将上述布尔值作为源数据的行索引
abb_pop.loc[abb_pop["state"].isnull()] #将state中空对应的行数据取出
#3.将简称取出
abb_pop.loc[abb_pop["state"].isnull()]["state/region"]
#4.对简称去重
abb_pop.loc[abb_pop["state"].isnull()]["state/region"].unique()

#结论:只有PR和USA对应的全称数据为空值
array(['PR', 'USA'], dtype=object)
#为找到的这些state/region的state项补上正确的值,从而去除掉state这一列的所有NaN
#思考:填充该需求中的空值可不可以使用fillna?
    #- 不可以。fillna可以使用空的紧邻值做填充。fillna(value="xxx")使用指定的值填充空值
    #(而这里需要填充的有两种,PR 和USA 对应的全称,无法统一填充)
    # 使用给元素赋值的方式进行填充!
#1.先给USA的全称对应的空值进行批量赋值
#1.1将USA对应的行数据找到(行数据中就存在state的空值)
abb_pop["state/region"]=="USA"
abb_pop.loc[abb_pop["state/region"]=="USA"] #将USA对应的行数据取出
statestate/regionagesyearpopulation
2496NaNUSAunder18199064218512.0
2497NaNUSAtotal1990249622814.0
2498NaNUSAtotal1991252980942.0
2499NaNUSAunder18199165313018.0
2500NaNUSAunder18199266509177.0
2501NaNUSAtotal1992256514231.0
2502NaNUSAtotal1993259918595.0
2503NaNUSAunder18199367594938.0
2504NaNUSAunder18199468640936.0
2505NaNUSAtotal1994263125826.0
2506NaNUSAunder18199569473140.0
2507NaNUSAunder18199670233512.0
2508NaNUSAtotal1995266278403.0
2509NaNUSAtotal1996269394291.0
2510NaNUSAtotal1997272646932.0
2511NaNUSAunder18199770920738.0
2512NaNUSAunder18199871431406.0
2513NaNUSAtotal1998275854116.0
2514NaNUSAunder18199971946051.0
2515NaNUSAtotal2000282162411.0
2516NaNUSAunder18200072376189.0
2517NaNUSAtotal1999279040181.0
2518NaNUSAtotal2001284968955.0
2519NaNUSAunder18200172671175.0
2520NaNUSAtotal2002287625193.0
2521NaNUSAunder18200272936457.0
2522NaNUSAtotal2003290107933.0
2523NaNUSAunder18200373100758.0
2524NaNUSAtotal2004292805298.0
2525NaNUSAunder18200473297735.0
2526NaNUSAtotal2005295516599.0
2527NaNUSAunder18200573523669.0
2528NaNUSAtotal2006298379912.0
2529NaNUSAunder18200673757714.0
2530NaNUSAtotal2007301231207.0
2531NaNUSAunder18200774019405.0
2532NaNUSAtotal2008304093966.0
2533NaNUSAunder18200874104602.0
2534NaNUSAunder18201373585872.0
2535NaNUSAtotal2013316128839.0
2536NaNUSAtotal2009306771529.0
2537NaNUSAunder18200974134167.0
2538NaNUSAunder18201074119556.0
2539NaNUSAtotal2010309326295.0
2540NaNUSAunder18201173902222.0
2541NaNUSAtotal2011311582564.0
2542NaNUSAunder18201273708179.0
2543NaNUSAtotal2012313873685.0
#1.2 将USA对应的全称空的对应行索引取出
indexs=abb_pop.loc[abb_pop["state/region"]=="USA"].index
abb_pop.iloc[indexs]
abb_pop.loc[indexs,"state"]="United States"
#2.将PR的全称进行赋值
abb_pop["state/region"]=="PR"
indexs2=abb_pop.loc[abb_pop["state/region"]=="PR"].index
abb_pop.loc[indexs2,"state"]="Puerto Rico"
#- 合并各州面积数据areas
#- 我们会发现area(sq.m)这一列有缺失数据,找出是哪行
#- 去除含有缺失数据的行
#- 找出2010年的全民人口数据
#- 计算各州的人口密度
#- 排序,并找出人口密度最高的州
#- 合并各州面积数据areas
abb_pop_area=pd.merge(abb_pop,area,how="outer")
abb_pop_area
statestate/regionagesyearpopulationarea (sq. mi)
0AlabamaALunder1820121117489.052423.0
1AlabamaALtotal20124817528.052423.0
2AlabamaALunder1820101130966.052423.0
3AlabamaALtotal20104785570.052423.0
4AlabamaALunder1820111125763.052423.0
.....................
2539United StatesUSAtotal2010309326295.0NaN
2540United StatesUSAunder18201173902222.0NaN
2541United StatesUSAtotal2011311582564.0NaN
2542United StatesUSAunder18201273708179.0NaN
2543United StatesUSAtotal2012313873685.0NaN

2544 rows × 6 columns

#- 我们会发现area(sq.m)这一列有缺失数据,找出是哪行
abb_pop_area["area (sq. mi)"].isnull()
indexs3=abb_pop_area.loc[abb_pop_area["area (sq. mi)"].isnull()].index
#- 去除含有缺失数据的行
abb_pop_area.drop(labels=indexs3,axis=0,inplace=True)
abb_pop_area
statestate/regionagesyearpopulationarea (sq. mi)
0AlabamaALunder1820121117489.052423.0
1AlabamaALtotal20124817528.052423.0
2AlabamaALunder1820101130966.052423.0
3AlabamaALtotal20104785570.052423.0
4AlabamaALunder1820111125763.052423.0
.....................
2491Puerto RicoPRunder182010896945.03515.0
2492Puerto RicoPRunder182011869327.03515.0
2493Puerto RicoPRtotal20113686580.03515.0
2494Puerto RicoPRunder182012841740.03515.0
2495Puerto RicoPRtotal20123651545.03515.0

2496 rows × 6 columns

#- 找出2010年的全民人口数据(基于df做条件查询)
abb_pop_area.query('ages=="total" & year==2010')
statestate/regionagesyearpopulationarea (sq. mi)
3AlabamaALtotal20104785570.052423.0
91AlaskaAKtotal2010713868.0656425.0
101ArizonaAZtotal20106408790.0114006.0
189ArkansasARtotal20102922280.053182.0
197CaliforniaCAtotal201037333601.0163707.0
283ColoradoCOtotal20105048196.0104100.0
293ConnecticutCTtotal20103579210.05544.0
379DelawareDEtotal2010899711.01954.0
389District of ColumbiaDCtotal2010605125.068.0
475FloridaFLtotal201018846054.065758.0
485GeorgiaGAtotal20109713248.059441.0
570HawaiiHItotal20101363731.010932.0
581IdahoIDtotal20101570718.083574.0
666IllinoisILtotal201012839695.057918.0
677IndianaINtotal20106489965.036420.0
762IowaIAtotal20103050314.056276.0
773KansasKStotal20102858910.082282.0
858KentuckyKYtotal20104347698.040411.0
869LouisianaLAtotal20104545392.051843.0
954MaineMEtotal20101327366.035387.0
965MontanaMTtotal2010990527.0147046.0
1050NebraskaNEtotal20101829838.077358.0
1061NevadaNVtotal20102703230.0110567.0
1146New HampshireNHtotal20101316614.09351.0
1157New JerseyNJtotal20108802707.08722.0
1242New MexicoNMtotal20102064982.0121593.0
1253New YorkNYtotal201019398228.054475.0
1338North CarolinaNCtotal20109559533.053821.0
1349North DakotaNDtotal2010674344.070704.0
1434OhioOHtotal201011545435.044828.0
1445OklahomaOKtotal20103759263.069903.0
1530OregonORtotal20103837208.098386.0
1541MarylandMDtotal20105787193.012407.0
1626MassachusettsMAtotal20106563263.010555.0
1637MichiganMItotal20109876149.096810.0
1722MinnesotaMNtotal20105310337.086943.0
1733MississippiMStotal20102970047.048434.0
1818MissouriMOtotal20105996063.069709.0
1829PennsylvaniaPAtotal201012710472.046058.0
1914Rhode IslandRItotal20101052669.01545.0
1925South CarolinaSCtotal20104636361.032007.0
2010South DakotaSDtotal2010816211.077121.0
2021TennesseeTNtotal20106356683.042146.0
2106TexasTXtotal201025245178.0268601.0
2117UtahUTtotal20102774424.084904.0
2202VermontVTtotal2010625793.09615.0
2213VirginiaVAtotal20108024417.042769.0
2298WashingtonWAtotal20106742256.071303.0
2309West VirginiaWVtotal20101854146.024231.0
2394WisconsinWItotal20105689060.065503.0
2405WyomingWYtotal2010564222.097818.0
2490Puerto RicoPRtotal20103721208.03515.0
#- 计算各州的人口密度(人口./面积)
abb_pop_area["midu"]=abb_pop_area["population"]/abb_pop_area["area (sq. mi)"]
abb_pop_area
statestate/regionagesyearpopulationarea (sq. mi)midu
0AlabamaALunder1820121117489.052423.021.316769
1AlabamaALtotal20124817528.052423.091.897221
2AlabamaALunder1820101130966.052423.021.573851
3AlabamaALtotal20104785570.052423.091.287603
4AlabamaALunder1820111125763.052423.021.474601
........................
2491Puerto RicoPRunder182010896945.03515.0255.176387
2492Puerto RicoPRunder182011869327.03515.0247.319203
2493Puerto RicoPRtotal20113686580.03515.01048.813656
2494Puerto RicoPRunder182012841740.03515.0239.470839
2495Puerto RicoPRtotal20123651545.03515.01038.846373

2496 rows × 7 columns

#- 排序,并找出人口密度最高的州
abb_pop_area.sort_values(by="midu",axis=0,ascending=False).iloc[0]["state"]
'District of Columbia'
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

weixin_47049321

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值