import numpy as np
import pandas as pd
from pandas import DataFrame,Series
# Read the state-area table from the local CSV file and display it.
areas = pd.read_csv(filepath_or_buffer='./state-areas.csv')
areas
|
state |
area (sq. mi) |
0 |
Alabama |
52423 |
1 |
Alaska |
656425 |
2 |
Arizona |
114006 |
3 |
Arkansas |
53182 |
4 |
California |
163707 |
5 |
Colorado |
104100 |
6 |
Connecticut |
5544 |
7 |
Delaware |
1954 |
8 |
Florida |
65758 |
9 |
Georgia |
59441 |
10 |
Hawaii |
10932 |
11 |
Idaho |
83574 |
12 |
Illinois |
57918 |
13 |
Indiana |
36420 |
14 |
Iowa |
56276 |
15 |
Kansas |
82282 |
16 |
Kentucky |
40411 |
17 |
Louisiana |
51843 |
18 |
Maine |
35387 |
19 |
Maryland |
12407 |
20 |
Massachusetts |
10555 |
21 |
Michigan |
96810 |
22 |
Minnesota |
86943 |
23 |
Mississippi |
48434 |
24 |
Missouri |
69709 |
25 |
Montana |
147046 |
26 |
Nebraska |
77358 |
27 |
Nevada |
110567 |
28 |
New Hampshire |
9351 |
29 |
New Jersey |
8722 |
30 |
New Mexico |
121593 |
31 |
New York |
54475 |
32 |
North Carolina |
53821 |
33 |
North Dakota |
70704 |
34 |
Ohio |
44828 |
35 |
Oklahoma |
69903 |
36 |
Oregon |
98386 |
37 |
Pennsylvania |
46058 |
38 |
Rhode Island |
1545 |
39 |
South Carolina |
32007 |
40 |
South Dakota |
77121 |
41 |
Tennessee |
42146 |
42 |
Texas |
268601 |
43 |
Utah |
84904 |
44 |
Vermont |
9615 |
45 |
Virginia |
42769 |
46 |
Washington |
71303 |
47 |
West Virginia |
24231 |
48 |
Wisconsin |
65503 |
49 |
Wyoming |
97818 |
50 |
District of Columbia |
68 |
51 |
Puerto Rico |
3515 |
# Read the state-name / abbreviation lookup table from the local CSV file
# and display it.
ab = pd.read_csv(filepath_or_buffer='./state-abbrevs.csv')
ab
|
state |
abbreviation |
0 |
Alabama |
AL |
1 |
Alaska |
AK |
2 |
Arizona |
AZ |
3 |
Arkansas |
AR |
4 |
California |
CA |
5 |
Colorado |
CO |
6 |
Connecticut |
CT |
7 |
Delaware |
DE |
8 |
District of Columbia |
DC |
9 |
Florida |
FL |
10 |
Georgia |
GA |
11 |
Hawaii |
HI |
12 |
Idaho |
ID |
13 |
Illinois |
IL |
14 |
Indiana |
IN |
15 |
Iowa |
IA |
16 |
Kansas |
KS |
17 |
Kentucky |
KY |
18 |
Louisiana |
LA |
19 |
Maine |
ME |
20 |
Montana |
MT |
21 |
Nebraska |
NE |
22 |
Nevada |
NV |
23 |
New Hampshire |
NH |
24 |
New Jersey |
NJ |
25 |
New Mexico |
NM |
26 |
New York |
NY |
27 |
North Carolina |
NC |
28 |
North Dakota |
ND |
29 |
Ohio |
OH |
30 |
Oklahoma |
OK |
31 |
Oregon |
OR |
32 |
Maryland |
MD |
33 |
Massachusetts |
MA |
34 |
Michigan |
MI |
35 |
Minnesota |
MN |
36 |
Mississippi |
MS |
37 |
Missouri |
MO |
38 |
Pennsylvania |
PA |
39 |
Rhode Island |
RI |
40 |
South Carolina |
SC |
41 |
South Dakota |
SD |
42 |
Tennessee |
TN |
43 |
Texas |
TX |
44 |
Utah |
UT |
45 |
Vermont |
VT |
46 |
Virginia |
VA |
47 |
Washington |
WA |
48 |
West Virginia |
WV |
49 |
Wisconsin |
WI |
50 |
Wyoming |
WY |
# Read the population table from the local CSV file and display it.
pop = pd.read_csv(filepath_or_buffer='./state-population.csv')
pop
|
state/region |
ages |
year |
population |
0 |
AL |
under18 |
2012 |
1117489.0 |
1 |
AL |
total |
2012 |
4817528.0 |
2 |
AL |
under18 |
2010 |
1130966.0 |
3 |
AL |
total |
2010 |
4785570.0 |
4 |
AL |
under18 |
2011 |
1125763.0 |
... |
... |
... |
... |
... |
2539 |
USA |
total |
2010 |
309326295.0 |
2540 |
USA |
under18 |
2011 |
73902222.0 |
2541 |
USA |
total |
2011 |
311582564.0 |
2542 |
USA |
under18 |
2012 |
73708179.0 |
2543 |
USA |
total |
2012 |
313873685.0 |
2544 rows × 4 columns
# Preview the first five rows of the population table.
pop.head(n=5)
|
state/region |
ages |
year |
population |
0 |
AL |
under18 |
2012 |
1117489.0 |
1 |
AL |
total |
2012 |
4817528.0 |
2 |
AL |
under18 |
2010 |
1130966.0 |
3 |
AL |
total |
2010 |
4785570.0 |
4 |
AL |
under18 |
2011 |
1125763.0 |
# Show the population table's dimensions as (rows, columns).
pop.shape
(2544, 4)
将各州全称与人口数据对应起来(按州缩写合并 pop 与 ab 两张表)
# Attach each state's full name to the population rows by matching the
# population table's 'state/region' codes against the 'abbreviation' column
# of the lookup table. An outer join keeps keys from both tables, so rows
# without a match are retained with missing values on the unmatched side.
n1 = pop.merge(
    ab,
    how='outer',
    left_on='state/region',
    right_on='abbreviation',
)
n1.head()
|
state/region |
ages |
year |
population |
state |
abbreviation |
0 |
AL |
under18 |
2012 |
1117489.0 |
Alabama |
AL |
1 |
AL |
total |
2012 |
4817528.0 |
Alabama |
AL |
2 |
AL |
under18 |
2010 |
1130966.0 |
Alabama |
AL |
3 |
AL |
total |
2010 |
4785570.0 |
Alabama |
AL |
4 |
AL |
under18 |
2011 |
1125763.0 |
Alabama |
AL |
# Show the merged table's dimensions as (rows, columns).
n1.shape
(2544, 6)
# Print per-column dtypes and non-null counts; columns whose non-null count
# is below the total row count contain missing values.
n1.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2544 entries, 0 to 2543
Data columns (total 6 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 state/region 2544 non-null object
1 ages 2544 non-null object
2 year 2544 non-null int64
3 population 2524 non-null float64
4 state 2448 non-null object
5 abbreviation 2448 non-null object
dtypes: float64(1), int64(1), object(4)
memory usage: 139.1+ KB
# Remove the 'abbreviation' column from n1 in place, then preview the
# first rows of the result.
n1.drop(columns='abbreviation', inplace=True)
n1.head()
|
state/region |
ages |
year |
population |
state |
0 |
AL |
under18 |
2012 |
1117489.0 |
Alabama |
1 |
AL |
total |
2012 |
4817528.0 |
Alabama |
2 |
AL |
under18 |
2010 |
1130966.0 |
Alabama |
3 |
AL |
total |
2010 |
4785570.0 |
Alabama |
4 |
AL |
under18 |
2011 |
1125763.0 |
Alabama |
查找缺失值(空数据)并进行填充
# Print per-column dtypes and non-null counts to locate the columns that
# still contain missing values.
n1.info()
<clas