Dataquest Learning Notes [5]

Step 2: Intermediate Python and Pandas / Data Cleaning

>> Concatenating tables: pandas.concat()

z = pd.concat([x, y], axis=0)  # stack x and y vertically: the rows of y go below the rows of x
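
As a minimal sketch with made-up toy frames (not part of the course data), axis=0 stacks rows while axis=1 joins columns side by side:

import pandas as pd

x = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
y = pd.DataFrame({"a": [5, 6], "b": [7, 8]})

rows = pd.concat([x, y], axis=0)   # 4 rows x 2 columns: y stacked below x
cols = pd.concat([x, y], axis=1)   # 2 rows x 4 columns: y placed to the right of x
print(rows.shape, cols.shape)      # (4, 2) (2, 4)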

>> pd.read_csv() can also read .txt files; the encoding and the delimiter are set through its parameters

import pandas

all_survey = pandas.read_csv("schools/survey_all.txt", delimiter="\t", encoding="windows-1252")
d75_survey = pandas.read_csv("schools/survey_d75.txt", delimiter="\t", encoding="windows-1252")
survey = pandas.concat([all_survey, d75_survey], axis=0)

>> On the usage of pd.DataFrame.apply():

With DataFrame.apply(func), the argument passed to func is normally a Series, i.e. one row or one column of the DataFrame.

With Series.apply(func1), the argument passed to func1 is a single element. Example:

# hs_directory already has a dbn column; copy it into a DBN column
data['hs_directory']['DBN'] = data['hs_directory']['dbn']

# class_size has no DBN column; build one by left-padding CSD to two digits
# and prepending it to SCHOOL CODE
def padded(ele):
    ele = str(ele)
    if len(ele) == 1:
        ele = '0' + ele
    return ele

data['class_size']["padded_csd"] = data['class_size']['CSD'].apply(padded)
data['class_size']['DBN'] = data['class_size']['padded_csd'] + data['class_size']['SCHOOL CODE']
print(data['class_size']['DBN'].head(10))

data is a dictionary that holds several datasets; data['class_size'] retrieves the class_size DataFrame.
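
To illustrate the DataFrame.apply() case described above, here is a small sketch with a made-up DataFrame: the function receives one column at a time as a Series and returns one value per column:

import pandas as pd

df = pd.DataFrame({"math": [500, 600], "reading": [450, 550]})

def col_range(col):              # col is a Series: one column of df
    return col.max() - col.min()

print(df.apply(col_range))       # math 100, reading 100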

>> pandas.to_numeric() converts string data to a numeric type

cols = ['SAT Math Avg. Score', 'SAT Critical Reading Avg. Score', 'SAT Writing Avg. Score']
for c in cols:
    data["sat_results"][c] = pd.to_numeric(data["sat_results"][c], errors="coerce")

data['sat_results']['sat_score'] = data['sat_results'][cols[0]] + data['sat_results'][cols[1]] + data['sat_results'][cols[2]]
print(data['sat_results']['sat_score'].head())
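
A quick sketch of what errors="coerce" does, using toy values (not from the dataset): anything that cannot be parsed as a number becomes NaN instead of raising an error:

import pandas as pd

s = pd.Series(["512", "498", "s"])         # "s" stands in for a non-numeric entry
print(pd.to_numeric(s, errors="coerce"))   # 512.0, 498.0, NaN
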
>> Extracting information from a dataset:

# Extract the latitude/longitude from the dataset
# e.g. from the string "1110 Boston Road\nBronx, NY 10456\n(40.8276026690005, -73.90447525699966)"
# extract the latitude 40.8276026690005
import re
def extract_titude(info):
    loc=re.findall(r'\(.+\)',info)
    return loc[0].split(',')[0].replace('(','')
data['hs_directory']['lat']=data['hs_directory']['Location 1'].apply(extract_titude)    
# Extract the longitude and convert it to a numeric value
def extra_lon(info):
    loc=re.findall(r'\(.+\)',info)
    return loc[0].split(',')[1].replace(')','')
data['hs_directory']['lon']=data['hs_directory']['Location 1'].apply(extra_lon)
data['hs_directory']['lon']=pd.to_numeric(data['hs_directory']['lon'],errors="coerce")
data['hs_directory']['lat']=pd.to_numeric(data['hs_directory']['lat'],errors="coerce")
print(data['hs_directory'])
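
As an aside, the same extraction could probably be done without custom functions by using the vectorized pandas.Series.str.extract() with a regex that captures both coordinates at once; this is an alternative sketch, not the Dataquest solution:

coords = data['hs_directory']['Location 1'].str.extract(r'\((?P<lat>.+),\s*(?P<lon>.+)\)')
data['hs_directory']['lat'] = pd.to_numeric(coords['lat'], errors="coerce")
data['hs_directory']['lon'] = pd.to_numeric(coords['lon'], errors="coerce")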

>> Grouping rows that share the same value in a column: pandas.DataFrame.groupby()

import numpy as np
# class_size is the class_size DataFrame from the data dictionary
class_size = class_size.groupby("DBN")   # group rows that share the same DBN
class_size = class_size.agg(np.mean)     # average the numeric columns within each group
class_size.reset_index(inplace=True)     # turn the DBN group key back into a regular column
data['class_size'] = class_size
print(class_size.head())
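
A minimal sketch of what groupby().agg() does, using a made-up two-school frame: rows that share a DBN collapse into a single row of column means:

import pandas as pd
import numpy as np

toy = pd.DataFrame({
    "DBN": ["01M015", "01M015", "02M047"],
    "AVERAGE CLASS SIZE": [20.0, 24.0, 18.0],
})
grouped = toy.groupby("DBN").agg(np.mean)   # one row per DBN, values averaged
grouped.reset_index(inplace=True)           # DBN moves back from the index to a column
print(grouped)                              # 01M015 -> 22.0, 02M047 -> 18.0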

>> Joining different DataFrames: pandas.DataFrame.merge()

There are four join types: left, right, inner, and outer (a toy comparison follows the example below).

combined = data["sat_results"]
combined=combined.merge(data['ap_2010'],how='left',on='DBN')
combined=combined.merge(data['graduation'],how='left',on='DBN')
print(combined.head())
print(combined.shape)
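
To make the join types concrete, a toy sketch with made-up DBNs showing how a left join and an inner join keep different rows:

import pandas as pd

sat = pd.DataFrame({"DBN": ["01M292", "01M448"], "sat_score": [1200.0, 1150.0]})
ap = pd.DataFrame({"DBN": ["01M292"], "AP Test Takers": [39.0]})

left = sat.merge(ap, how="left", on="DBN")     # keeps both SAT rows; the missing AP value becomes NaN
inner = sat.merge(ap, how="inner", on="DBN")   # keeps only DBNs present in both frames
print(left.shape, inner.shape)                 # (2, 3) (1, 3)
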
>> After the merge some entries are NaN and need to be filled in; one option is to fill with the column means, using pandas.DataFrame.mean() and pandas.DataFrame.fillna():

ave=combined.mean()
combined=combined.fillna(ave)
combined=combined.fillna(0)
print(combined.head())
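
For reference, DataFrame.mean() returns a Series of per-column means indexed by column name, and fillna() with that Series fills each column's NaN values with that column's own mean; a toy sketch:

import pandas as pd
import numpy as np

df = pd.DataFrame({"sat_score": [1200.0, np.nan], "AP Test Takers": [np.nan, 40.0]})
print(df.fillna(df.mean()))   # each NaN is replaced by its own column's mean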

>> Computing the correlation coefficients between columns of a DataFrame: pandas.DataFrame.corr():

correlations=combined.corr()
correlations=correlations['sat_score']
print(correlations)

>> Drawing a scatter plot from a DataFrame: pandas.DataFrame.plot():

import matplotlib.pyplot as plt
combined.plot.scatter(y='sat_score',x='total_enrollment')
plt.show()

>> Plotting a map from each school's latitude/longitude coordinates:

from mpl_toolkits.basemap import Basemap
m = Basemap(
    projection='merc', 
    llcrnrlat=40.496044, 
    urcrnrlat=40.915256, 
    llcrnrlon=-74.255735, 
    urcrnrlon=-73.700272,
    resolution='i'
)
m.drawmapboundary(fill_color='#85A6D9')
m.drawcoastlines(color='#6D5F47', linewidth=.4)
m.drawrivers(color='#6D5F47', linewidth=.4)
longitudes=combined['lon'].tolist()
latitudes=combined['lat'].tolist()
m.scatter(longitudes,latitudes,s=20,zorder=2,latlon=True)
plt.show()

To color the points by another column, replace the scatter call with: m.scatter(longitudes, latitudes, s=20, zorder=2, latlon=True, c=combined['ell_percent'], cmap="summer")


>> The code for the further analysis, Analyzing NYC High School Data, is at: https://github.com/dataquestio/solutions/blob/master/Mission217Solutions.ipynb


Guided Project: Star Wars Survey — the dataset is available in their GitHub repository, and the code is in the project's GitHub repository.

Functions used: pandas.Series.map() and pandas.Series.value_counts() (sketches below)

import pandas as pd
import numpy as np

ser = pd.Series(["Yes", "No", np.nan, "Yes"])
yes_no = {"Yes": True, "No": False}
ser = ser.map(yes_no)
# ser is now [True, False, NaN, True] -- values missing from the dict stay NaN
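
pandas.Series.value_counts(), which is not shown above, counts how many times each distinct value appears; a minimal sketch with made-up data:

import pandas as pd

fans = pd.Series([True, False, True, True])
print(fans.value_counts())   # True: 3, False: 1
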
pandas.DataFrame.rename() renames columns

star_wars = star_wars.rename(columns={ "Which of the following Star Wars films have you seen? Please select all that apply.": "seen_1"})
pandas.DataFrame.astype() performs type conversion
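
A one-line sketch of astype() on made-up string rankings:

import pandas as pd

rankings = pd.Series(["1", "6", "3"])
print(rankings.astype(float))   # 1.0, 6.0, 3.0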

The code for this project:

# -*- coding: utf-8 -*-
"""
Spyder Editor
https://github.com/dataquestio/solutions/blob/master/Mission201Solution.ipynb
This is a temporary script file.
"""

import pandas as pd
import numpy as np
star_wars = pd.read_csv("StarWars.csv", encoding="ISO-8859-1")
print(star_wars.head(10))
print(star_wars.columns)

star_wars=star_wars[pd.notnull(star_wars['RespondentID'])]
print(star_wars.head(10))

yes_no={'Yes':True,'No':False}
star_wars["Have you seen any of the 6 films in the Star Wars franchise?"]=star_wars["Have you seen any of the 6 films in the Star Wars franchise?"].map(yes_no)
star_wars["Do you consider yourself to be a fan of the Star Wars film franchise?"]=star_wars["Do you consider yourself to be a fan of the Star Wars film franchise?"].map(yes_no)

# Convert each of the columns star_wars.columns[3:9] so that it only contains the values True and False
movie_bool={
    "Star Wars: Episode I  The Phantom Menace": True,
    "Star Wars: Episode II  Attack of the Clones":True,
    "Star Wars: Episode III  Revenge of the Sith":True,
    "Star Wars: Episode IV  A New Hope":True,
    "Star Wars: Episode V The Empire Strikes Back":True,
    "Star Wars: Episode VI Return of the Jedi":True,
    np.nan: False  # an unanswered (NaN) entry means the respondent has not seen the film
}
def con_vt(s):
    return s.map(movie_bool)
star_wars.iloc[:,3:9]=star_wars.iloc[:,3:9].apply(con_vt)
#for col in star_wars.columns[3:9]:
#    star_wars[col]=star_wars[col].map(movie_bool)
#print(star_wars.head(10))

#Rename each of the above columns
column_cov={
        "Which of the following Star Wars films have you seen? Please select all that apply.":"seen_1",
        "Unnamed: 4": "seen_2",
        "Unnamed: 5": "seen_3",
        "Unnamed: 6": "seen_4",
        "Unnamed: 7": "seen_5",
        "Unnamed: 8": "seen_6" 
}
star_wars=star_wars.rename(columns=column_cov)
print(star_wars.head(10))

#Convert each of the columns star_wars.columns[9:15] to a float type
star_wars.iloc[:,9:15]=star_wars.iloc[:,9:15].astype(float)
#Give each column a more descriptive name. We suggest ranking_1, ranking_2, and so on
rank_cov={
        "Please rank the Star Wars films in order of preference with 1 being your favorite film in the franchise and 6 being your least favorite film.":"ranking_1",
        "Unnamed: 10":"ranking_2",
        "Unnamed: 11":"ranking_3",
        "Unnamed: 12":"ranking_4",
        "Unnamed: 13":"ranking_5",
        "Unnamed: 14":"ranking_6",
        }
star_wars=star_wars.rename(columns=rank_cov)
print(star_wars.iloc[:,9:15])

# Compute the mean of each ranking column and show the values in a bar chart
mean_value=star_wars.iloc[:,9:15].mean()
import matplotlib.pyplot as plt
mean_value.plot.bar()
plt.show()

# Compute the sum of each 'seen' column and show the values in a bar chart
sum_value=star_wars.iloc[:,3:9].sum()
sum_value.plot.bar()
plt.show()