Q1: Map Visualization of density by neighborhood
1. 导入库,初始化
import pandas as pd
import numpy as np
import folium
import json
import geojsonio
# Load the per-zip table (zip, borough, neighborhood, population, density
# columns are used by the cells below).
file = 'nyc_zip_borough_neighborhoods_pop.csv'
df = pd.read_csv(filepath_or_buffer=file)
2. 通过groupby处理数据,根据 密度=人口/面积,计算每个neighborhood的密度
*merge得到新表 *zip在画图前转化成str
# density = population / area, so each zip's area is population / density.
df['zip_area'] = df['population'].div(df['density'])

# Sum population and area over all zips belonging to the same neighborhood,
# then derive the neighborhood-level density from those totals.
neighborhood_totals = (
    df[['neighborhood', 'population', 'zip_area']]
    .groupby("neighborhood")
    .sum()
)
neighborhood_totals = pd.DataFrame(neighborhood_totals)
neighborhood_totals['n_density'] = (
    neighborhood_totals['population'] / neighborhood_totals['zip_area']
)

# Attach the neighborhood density back onto every zip row and keep only the
# columns needed for the map.
newdf = df.copy().merge(neighborhood_totals, on='neighborhood')
newdf = newdf[['zip', 'borough', 'neighborhood', 'n_density']]
# The zip must be a string before plotting (it is matched against GeoJSON
# property values).
newdf['zip'] = newdf['zip'].map(str)
3. 画map
# Center the map on Times Square.
m = folium.Map(location=[40.7589, -73.9851], zoom_start=12)

# geo_data is the GeoJSON with the zip-code boundaries, data is the input
# table, columns are [key column, value column] used for coloring, and key_on
# is the GeoJSON property that is matched against the key column.
# Map.choropleth() is deprecated and absent from newer folium releases;
# the folium.Choropleth layer takes the same arguments.
folium.Choropleth(
    geo_data='zipcode.geojson',
    data=newdf,
    columns=['zip', 'n_density'],
    key_on='feature.properties.postalCode',
    fill_color='RdYlGn',
    fill_opacity=0.7,
    line_opacity=0.8,
    legend_name='Distribution of density by neighborhood',
).add_to(m)
folium.LayerControl().add_to(m)
m
Q2: ML 暂时搞不明白
Q3: ML + Predictor Selection
(a) 挑选top5 predictor
1.初始化
import pandas as pd
import numpy as np
# Load the heart dataset; the cells below expect a `target` column.
file = 'heart.csv'
df = pd.read_csv(filepath_or_buffer=file)
2.计算与target的correlation *记得绝对值,再降序排序
# Pairwise correlation matrix over the columns of df.
corrdf = df.corr()
# New column: strength of each column's correlation with `target`,
# ignoring the sign.
corrdf['abstargetcorr'] = corrdf['target'].abs()
# Rank in descending order. `target` correlates perfectly with itself, so it
# is expected at position 0; positions 1..5 are the top-5 predictors.
sortedcorr = corrdf.sort_values('abstargetcorr', ascending=False)
sortedcorr.iloc[1:6].index
(b) 比较两个model(1.用全部predictor2.用top5)
1. 使用top5 predictor(一次随机划分下的accuracy约为0.769)
# Model 1: random forest trained only on the five selected predictors.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

top5df = df[['exang', 'cp', 'oldpeak', 'thalach', 'ca', 'target']]

# Train / test split. A fixed seed makes the reported accuracy reproducible
# from run to run instead of depending on a fresh random split.
train, test = train_test_split(top5df, test_size=0.3, random_state=42)
# Select the predictors by name (everything except the label) rather than
# by column position.
x_train = train.drop(columns='target')
y_train = train['target']
x_test = test.drop(columns='target')
y_test = test['target']

# Build and fit the model; the forest is seeded as well for reproducibility.
model = RandomForestClassifier(n_estimators=10, random_state=42)
model.fit(x_train, y_train)

# Evaluate: mean accuracy on the held-out rows.
model.score(x_test, y_test)
2. 使用全部13个predictor(一次随机划分下的accuracy约为0.802)
# Model 2: random forest trained on every predictor column of df.
# A fixed seed makes the reported accuracy reproducible from run to run.
train, test = train_test_split(df, test_size=0.3, random_state=42)
# Drop the label by name instead of slicing the first 13 columns, so the
# feature matrix stays correct even if the column order changes.
x_train = train.drop(columns='target')
y_train = train['target']
x_test = test.drop(columns='target')
y_test = test['target']

model = RandomForestClassifier(n_estimators=10, random_state=42)
model.fit(x_train, y_train)
# Mean accuracy on the held-out rows.
model.score(x_test, y_test)
3. 选择使用全部13个predictor的模型,因为它的accuracy更高。也可以用precision、recall和f_score来比较两个模型。
Q4: Pandas
(a) dataframe操作,计算Female&Republican的平均工资
# Row masks for the two conditions, combined element-wise.
is_female = employees['Gender'] == 'F'
is_republican = employees['Affiliation'] == 'Republican'
df1 = employees[is_female & is_republican]
# Average salary of the rows that satisfy both conditions.
df1['Salary'].mean()
(b) 通过定义函数,进行分组groupby,再计算平均值
def groups(ind, df):
    """Return the group label for the row of *df* with index label *ind*.

    Args:
        ind: Index label of the row to classify.
        df: DataFrame that has 'Grad' and 'Affiliation' columns.

    Returns:
        'WorkHard' when Grad is 'no' and Affiliation is 'Republican',
        'WorkSmart' when Grad is 'yes' and Affiliation is 'Democrat',
        and 'TGIF' for every other row.
    """
    # One label-based lookup per value instead of chained df[col].loc[ind].
    grad = df.loc[ind, 'Grad']
    affiliation = df.loc[ind, 'Affiliation']
    # These are scalars, so use short-circuit `and` rather than bitwise `&`.
    if grad == 'no' and affiliation == 'Republican':
        return 'WorkHard'
    if grad == 'yes' and affiliation == 'Democrat':
        return 'WorkSmart'
    return 'TGIF'
# Group rows by the label that groups() assigns to each index value.
grouped = employees.groupby(lambda x: groups(x, employees))
# Select Salary before aggregating: taking mean() over every column fails on
# recent pandas (TypeError) when the frame also holds string columns such as
# Gender / Affiliation / Grad. The result is still a one-column DataFrame.
grouped[['Salary']].mean()
Q7: Network
(a) 出现频率最高的degree及它出现的次数
1. 初始化
import networkx as nx
import pandas as pd
import numpy as np
import pickle

# nx.read_gpickle() was removed in NetworkX 3.0; a gpickle file is a plain
# pickle, so load it with the standard library instead.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
with open('friend_graph', 'rb') as graph_file:
    G = pickle.load(graph_file)

# (node, degree) pairs ordered by ascending degree.
nodedeg = sorted(G.degree(), key=lambda x: x[1])
2.用dic #the most frequently occurring degree is 1 with frequency 36
from collections import Counter

# Tally how many nodes have each degree (degree -> number of nodes).
dic = Counter(degree for _, degree in nodedeg)
# (count, degree) tuples compare by count first, so max() returns the most
# frequent degree together with its frequency; ties go to the larger degree.
max(zip(dic.values(), dic.keys()))
3.或者用df
# Build the Node/Degree table in a single constructor call instead of growing
# the frame one row at a time with .loc (which re-allocates on every insert).
df = pd.DataFrame(nodedeg, columns=['Node', 'Degree'])
# Number of nodes per degree, in ascending order of frequency.
df.groupby('Degree').size().sort_values()
(b) histogram of node degree. 并判断相近的distribution
import seaborn as sns
from scipy.stats import norm, gamma
#Q7b: The histogram approximates a gamma dist.
degree_values = df['Degree']
# Histogram of the node degrees with a fitted gamma density overlaid.
# NOTE(review): distplot is marked deprecated in recent seaborn releases --
# confirm it is still available in the installed version.
sns.distplot(degree_values, fit=gamma)
(c) histogram of clustering coefficients. 并判断相近的distribution
# (node, clustering coefficient) pairs ordered by ascending coefficient.
node_clustering_coeff = sorted(nx.clustering(G).items(), key=lambda x: x[1])
# Build the table in a single constructor call instead of growing the frame
# one row at a time with .loc (which re-allocates on every insert).
df2 = pd.DataFrame(node_clustering_coeff, columns=['Node', 'Clust_Coeff'])
import seaborn as sns
from scipy.stats import norm, gamma
#Q7c: The histogram does not approximate any theoretical distribution
clustering_values = df2['Clust_Coeff']
# Histogram of the clustering coefficients with a fitted gamma density
# overlaid for comparison.
# NOTE(review): distplot is marked deprecated in recent seaborn releases --
# confirm it is still available in the installed version.
sns.distplot(clustering_values, fit=gamma)
(d) 出现最频繁的top3 clustering coefficient,以及各自对应多少个node
#Q7d: The top 3 clustering coefficients are 0 shared by 43 nodes,0.66 shared by 30 nodes, and 1 shared by 22 nodes;
# Count the nodes sharing each coefficient value, then order the counts
# ascending so the most frequent values appear at the end of the listing.
nodes_per_coeff = df2.groupby('Clust_Coeff').size()
nodes_per_coeff.sort_values()