Step 2: Intermediate Python And Pandas/Data Cleaning
>>表格的合并,pandas.concat()
z = pd.concat([x,y], axis=0) #合并x和y,y在x下面(对每一列操作得到行)
>>pd.read_csv() 也可以读取txt文件,根据传入的参数,对编码格式和分隔符进行设置
all_survey=pandas.read_csv("schools/survey_all.txt",delimiter="\t",encoding="windows-1252")
d75_survey=pandas.read_csv("schools/survey_d75.txt",delimiter="\t",encoding="windows-1252")
survey=pandas.concat([all_survey,d75_survey],axis=0)
>>关于pd.DataFrame.apply()的用法:
如果是DataFrame.apply(func) 则传入的func的参数一般是Series,即DataFrame的某一行或列
如果是Series.apply(func1) 则传入func1的参数是某个元素,例子如下:
data['hs_directory']['DBN']=data['hs_directory']['dbn']
def padded(ele):
ele=str(ele)
num=len(ele)
if num==1:
ele='0'+ele
return ele
data['class_size']["padded_csd"]=data['class_size']['CSD'].apply(padded)
data['class_size']['DBN']=data['class_size']['padded_csd']+data['class_size']['SCHOOL CODE']
print(data['class_size']['DBN'].head(10))
data是一个存入多个数据集的字典,data['class_size']相当于取出class_size数据集DataFrame
>> pandas.to_numeric() 将字符串类型的数据转换为数值类型
cols = ['SAT Math Avg. Score', 'SAT Critical Reading Avg. Score', 'SAT Writing Avg. Score']
for c in cols:
data["sat_results"][c] = pd.to_numeric(data["sat_results"][c], errors="coerce")
data['sat_results']['sat_score'] = data['sat_results'][cols[0]] + data['sat_results'][cols[1]] + data['sat_results'][cols[2]]
print(data['sat_results']['sat_score'].head())
>>对数据集数据进行提取:
#从数据集提取经纬度信息
#例:从字符串"1110 Boston Road\nBronx, NY 10456\n(40.8276026690005, -73.90447525699966)"提取出经纬度信息40.8276026690005
import re
def extract_titude(info):
loc=re.findall(r'\(.+\)',info)
return loc[0].split(',')[0].replace('(','')
data['hs_directory']['lat']=data['hs_directory']['Location 1'].apply(extract_titude)
#提取经度,并转化为数值
def extra_lon(info):
loc=re.findall(r'\(.+\)',info)
return loc[0].split(',')[1].replace(')','')
data['hs_directory']['lon']=data['hs_directory']['Location 1'].apply(extra_lon)
data['hs_directory']['lon']=pd.to_numeric(data['hs_directory']['lon'],errors="coerce")
data['hs_directory']['lat']=pd.to_numeric(data['hs_directory']['lat'],errors="coerce")
print(data['hs_directory'])
>>对数据集中某列有相同数值的行进行分组 pandas.DataFrame.groupby()
import numpy as np
class_size=class_size.groupby("DBN")
class_size=class_size.agg(np.mean)
class_size.reset_index(inplace=True)
data['class_size']=class_size
print(class_size.head())
>>对不同的DataFrame进行连接pandas.DataFrame.merge()
有四种方式left, right, inner, outer
combined = data["sat_results"]
combined=combined.merge(data['ap_2010'],how='left',on='DBN')
combined=combined.merge(data['graduation'],how='left',on='DBN')
print(combined.head())
print(combined.shape)
>>对连接后的数据集中存在元素为空Nan的情况,进行填充,其中一种就是填充均值
pandas.DataFrame.mean()
pandas.DataFrame.fillna() :
ave=combined.mean()
combined=combined.fillna(ave)
combined=combined.fillna(0)
print(combined.head())
>>求解DataFrame中列与列的相关系数pandas.DataFrame.corr() :
correlations=combined.corr()
correlations=correlations['sat_score']
print(correlations)
>>利用DataFrame结构绘制散点图
pandas.DataFrame.plot() :
import matplotlib.pyplot as plt
combined.plot.scatter(y='sat_score',x='total_enrollment')
plt.show()
>>根据各个学校的经纬度坐标绘制地形图:
from mpl_toolkits.basemap import Basemap
m = Basemap(
projection='merc',
llcrnrlat=40.496044,
urcrnrlat=40.915256,
llcrnrlon=-74.255735,
urcrnrlon=-73.700272,
resolution='i'
)
m.drawmapboundary(fill_color='#85A6D9')
m.drawcoastlines(color='#6D5F47', linewidth=.4)
m.drawrivers(color='#6D5F47', linewidth=.4)
longitudes=combined['lon'].tolist()
latitudes=combined['lat'].tolist()
m.scatter(longitudes,latitudes,s=20,zorder=2,latlon=True)
plt.show()
将scatter作图段改为其他:m.scatter(longitudes,latitudes,s=20,zorder=2,latlon=True,c=combined['ell_percent'],cmap="summer")
>>关于进一步分析 Analyzing NYC High School Data的代码在:https://github.com/dataquestio/solutions/blob/master/Mission217Solutions.ipynb
Guided Project: Star Wars Survey部分工作,数据集their GitHub repository ;代码the project's GitHub repository:
用到的函数: pandas.Series.map() pandas.Series.value_counts()
ser = ["Yes", "No", NaN, "Yes"]
yes_no = {"Yes": True,"No": False}
ser = ser.map(yes_no)
>>[True, False, NaN, True]
pandas.DataFrame.rename() 给columns重命名
star_wars = star_wars.rename(columns={ "Which of the following Star Wars films have you seen? Please select all that apply.": "seen_1"})
pandas.DataFrame.astype()进行类型转换
该工程的代码:
# -*- coding: utf-8 -*-
"""
Spyder Editor
https://github.com/dataquestio/solutions/blob/master/Mission201Solution.ipynb
This is a temporary script file.
"""
import pandas as pd
star_wars = pd.read_csv("StarWars.csv", encoding="ISO-8859-1")
print(star_wars.head(10))
print(star_wars.columns)
star_wars=star_wars[pd.notnull(star_wars['RespondentID'])]
print(star_wars.head(10))
yes_no={'Yes':True,'No':False}
star_wars["Have you seen any of the 6 films in the Star Wars franchise?"]=star_wars["Have you seen any of the 6 films in the Star Wars franchise?"].map(yes_no)
star_wars["Do you consider yourself to be a fan of the Star Wars film franchise?"]=star_wars["Do you consider yourself to be a fan of the Star Wars film franchise?"].map(yes_no)
#Convert each column above so that it only contains the values True and False,star_wars.columns[3:9]
movie_bool={
"Star Wars: Episode I The Phantom Menace": True,
"Star Wars: Episode II Attack of the Clones":True,
"Star Wars: Episode III Revenge of the Sith":True,
"Star Wars: Episode IV A New Hope":True,
"Star Wars: Episode V The Empire Strikes Back":True,
"Star Wars: Episode VI Return of the Jedi":True,
None: False
}
def con_vt(s):
return s.map(movie_bool)
star_wars.iloc[:,3:9]=star_wars.iloc[:,3:9].apply(con_vt)
#for col in star_wars.columns[3:9]:
# star_wars[col]=star_wars[col].map(movie_bool)
#print(star_wars.head(10))
#Rename each of the above columns
column_cov={
"Which of the following Star Wars films have you seen? Please select all that apply.":"seen_1",
"Unnamed: 4": "seen_2",
"Unnamed: 5": "seen_3",
"Unnamed: 6": "seen_4",
"Unnamed: 7": "seen_5",
"Unnamed: 8": "seen_6"
}
star_wars=star_wars.rename(columns=column_cov)
print(star_wars.head(10))
#Convert each of the columns star_wars.columns[9:15] to a float type
star_wars.iloc[:,9:15]=star_wars.iloc[:,9:15].astype(float)
#Give each column a more descriptive name. We suggest ranking_1, ranking_2, and so on
rank_cov={
"Please rank the Star Wars films in order of preference with 1 being your favorite film in the franchise and 6 being your least favorite film.":"ranking_1",
"Unnamed: 10":"ranking_2",
"Unnamed: 11":"ranking_3",
"Unnamed: 12":"ranking_4",
"Unnamed: 13":"ranking_5",
"Unnamed: 14":"ranking_6",
}
star_wars=star_wars.rename(columns=rank_cov)
print(star_wars.iloc[:,9:15])
#compute the mean of each of the ranking columns and bar the value
mean_value=star_wars.iloc[:,9:15].mean()
import matplotlib.pyplot as plt
mean_value.plot.bar()
plt.show()
#compute the sum of each of the seen columns and bar the value
sum_value=star_wars.iloc[:,3:9].sum()
sum_value.plot.bar()
plt.show()