import os
import tarfile
#import urllib
import pandas as pd
import urllib.request #书上是import urllib,但是作者github里改成了这一句
#fetch data
# Base URL of the raw files in the handson-ml2 repository. The trailing slash
# matters because HOUSING_URL is built by appending a relative path directly.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
# Local directory (relative to the working directory) for the housing data.
HOUSING_PATH = os.path.join("datasets", "housing")
# Full URL of the compressed housing archive.
HOUSING_URL = f"{DOWNLOAD_ROOT}datasets/housing/housing.tgz"
#set function fetchdata
def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and extract it into ``housing_path``.

    Args:
        housing_url: URL of the ``housing.tgz`` archive to download.
        housing_path: Directory that receives the downloaded archive and its
            extracted contents; it is created if it does not exist yet.
    """
    # The book uses exist_ok=True, while the author's GitHub version checks
    # os.path.isdir() first. exist_ok=True is used here because it leaves no
    # gap between the existence check and the directory creation.
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, 'housing.tgz')
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even if extraction raises.
    with tarfile.open(tgz_path) as housing_tgz:
        # NOTE(review): the archive is fetched from the network; on Python
        # versions that support extraction filters, consider passing
        # filter="data" to extractall().
        housing_tgz.extractall(path=housing_path)
fetch_housing_data() # call the function defined above to download and extract the data
#return dataFrame
def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))
# Load the housing CSV into a DataFrame and print the rows returned by head().
housing = load_housing_data()
housing_head = housing.head()
print(housing_head)
DOWNLOAD DATA
这一段代码之前报错的原因就这么几个
1. 书上的内容和作者github里的修改不一样,比如书上是import urllib,github里是import urllib.request;还有书上的os.makedirs(housing_path, exist_ok=True),在github里改成了if not os.path.isdir(housing_path)
2. 把变量和网址打错了:tgz_path这一行最后的'housing.tgz'打成了housing_tgz;还有一开始的网页链接,漏了https://里的//,以及末尾的/
DATA STRUCTURE
用pandas来看数据的构成,读取csv表格
import pandas as pd
def load_house_data(housing_path=HOUSING_PATH):
    """Load the housing CSV found in ``housing_path`` into a pandas DataFrame."""
    housing_csv = os.path.join(housing_path, 'housing.csv')
    frame = pd.read_csv(housing_csv)
    return frame
housing = load_house_data()
# Recorded run: 5 rows * 10 columns:
# longitude, latitude ... median_house_value, ocean_proximity
print('housing head',housing.head())
# DataFrame.info() prints its summary itself and returns None, so passing it
# to print() would emit a stray "housing info None" line. Print the label and
# call info() on its own instead.
# Recorded run: RangeIndex of 20640 entries, i.e. 20640 data instances in the
# dataset.
print('housing info')
housing.info()
# Find out which categories exist and how many districts belong to each
# category with value_counts().
oceanProximity = housing['ocean_proximity'].value_counts()
print(oceanProximity)
# Recorded output:
# <1H OCEAN 9136
# INLAND 6551
# NEAR OCEAN 2658
# NEAR BAY 2290
# ISLAND 5

# Use the describe() method to show a summary of the numerical attributes.
numericial_attributes = housing.describe()
print(numericial_attributes)
# Recorded output:
# longitude latitude ... median_income median_house_value
# count 20640.000000 20640.000000 ... 20640.000000 20640.000000
# mean -119.569704 35.631861 ... 3.870671 206855.816909
# std 2.003532 2.135952 ... 1.899822 115395.615874
# min -124.350000 32.540000 ... 0.499900 14999.000000
# 25% -121.800000 33.930000 ... 2.563400 119600.000000
# 50% -118.490000 34.260000 ... 3.534800 179700.000000
# 75% -118.010000 37.710000 ... 4.743250 264725.000000
# max -114.310000 41.950000 ... 15.000100 500001.000000
用matplotlib把所有数值属性画成直方图
import matplotlib.pyplot as plt
# Use DataFrame.hist() to draw histograms of the numerical data in the data
# set (50 bins, 20x15 figure size), then display them with matplotlib.
housing.hist(bins=50,figsize=(20,15))
plt.show()
CREATE TEST SET 创建测试集
import numpy as np
def split_train_test(data, test_ratio, seed=None):
    """Randomly split ``data`` into a training set and a test set.

    Args:
        data: Object supporting ``len()`` and positional ``.iloc`` indexing,
            e.g. a pandas DataFrame.
        test_ratio: Fraction of the rows assigned to the test set, in
            ``[0, 1]``. The test-set size is ``int(len(data) * test_ratio)``.
        seed: Optional seed for a reproducible split. When ``None`` the
            global NumPy random state is used, so a caller can still control
            it with ``np.random.seed``.

    Returns:
        A ``(train_set, test_set)`` tuple.

    Raises:
        ValueError: If ``test_ratio`` is outside ``[0, 1]``.
    """
    # A negative ratio would produce a negative slice bound and silently
    # return a wrong split, so reject out-of-range values up front.
    if not 0 <= test_ratio <= 1:
        raise ValueError(
            "test_ratio must be between 0 and 1, got {!r}".format(test_ratio)
        )
    if seed is None:
        # Keep using the global state so existing callers behave as before.
        shuffled_indices = np.random.permutation(len(data))
    else:
        shuffled_indices = np.random.default_rng(seed).permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]
# Split the housing data with a test ratio of 0.2 and report both set sizes.
train_set, test_set = split_train_test(housing,0.2)
print('test set len',len(test_set)) # 4128 in the recorded run
print('train set len',len(train_set)) # 16512 in the recorded run