Chapter2:预测房价【原链接失效,没有任何数据-已解决,链接是好的】

import os
import tarfile
#import urllib
import pandas as pd
import urllib.request #书上是import urllib,但是作者github里改成了这一句


#fetch data
# Remote location of the book's repository and of the housing archive in it.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_URL = "".join((DOWNLOAD_ROOT, "datasets/housing/housing.tgz"))
# Local directory (relative to the working directory) that holds the data.
HOUSING_PATH = os.path.join("datasets", "housing")

#set function fetchdata
def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Args:
        housing_url: URL of the ``housing.tgz`` archive to download.
        housing_path: Directory that receives the downloaded archive and
            its extracted contents; it is created if it does not exist.
    """
    # exist_ok=True replaces the isdir() check followed by makedirs(), which
    # could fail if the directory appeared between the two calls.
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, 'housing.tgz')
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even when extraction raises.
    # NOTE(review): the archive comes from a remote URL; on Python versions
    # that support it, consider passing filter="data" to extractall().
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path)

fetch_housing_data() #call the function defined above to download and extract the data

#return dataFrame
def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))


# Load the CSV into a DataFrame and print its first rows as a quick check.
housing = load_housing_data()
housing_head = housing.head()
print(housing_head)



DOWNLOAD DATA

这一段代码之前报错的原因就这么几个

1. 书上的内容和作者github里的修改不一样,比如import urllib 和import urllib.request;还有if not os.path.isdir(housing_path)

2.把变量和网址打错了,tgz_path这一行最后的housing.tgz打成了housing_tgz;还有一开始的网页链接,漏了https://里的//,以及最后的/

DATA STRUCTURE

用pandas来看数据的构成,读取csv表格

import pandas as pd

def load_house_data(housing_path=HOUSING_PATH):
    """Return the contents of ``housing.csv`` under ``housing_path`` as a DataFrame."""
    housing_csv = os.path.join(housing_path, 'housing.csv')
    frame = pd.read_csv(housing_csv)
    return frame

housing = load_house_data()
print('housing head',housing.head()) #5 rows x 10 columns: longitude, latitude ... median_house_value, ocean_proximity
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print(); doing so printed a stray "None" after the label.
print('housing info')
housing.info() #RangeIndex: 20640 entries, i.e. 20640 instances in the dataset

#find out which categories exist and how many districts belong to each one, using value_counts()
oceanProximity = housing['ocean_proximity'].value_counts()
print(oceanProximity)
'''
<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
'''

#use the describe() method to show a summary of the numerical attributes
numericial_attributes = housing.describe()
print(numericial_attributes)

'''
   longitude      latitude  ...  median_income  median_house_value
count  20640.000000  20640.000000  ...   20640.000000        20640.000000
mean    -119.569704     35.631861  ...       3.870671       206855.816909
std        2.003532      2.135952  ...       1.899822       115395.615874
min     -124.350000     32.540000  ...       0.499900        14999.000000
25%     -121.800000     33.930000  ...       2.563400       119600.000000
50%     -118.490000     34.260000  ...       3.534800       179700.000000
75%     -118.010000     37.710000  ...       4.743250       264725.000000
max     -114.310000     41.950000  ...      15.000100       500001.000000
'''

用matplotlib把所有数值属性画成直方图

import matplotlib.pyplot as plt
#use hist() to draw the numerical data of the data set as histograms with matplotlib
housing.hist(bins=50,figsize=(20,15))
plt.show()

CREATE TEST SET 创建测试集

import numpy as np
def split_train_test(data, test_ratio, random_state=None):
    """Randomly split ``data`` into a training set and a test set.

    Args:
        data: Object supporting ``len()`` and ``.iloc`` positional indexing
            (a pandas DataFrame in this file).
        test_ratio: Fraction of the rows to place in the test set; must lie
            in the closed interval [0, 1].
        random_state: Optional seed. When given, the shuffle is reproducible
            across calls; when ``None``, the global NumPy random state is
            used, exactly as before this parameter existed.

    Returns:
        A ``(train_set, test_set)`` tuple of row selections from ``data``.

    Raises:
        ValueError: If ``test_ratio`` is outside [0, 1].
    """
    # Out-of-range ratios used to slip through and yield wrong-sized splits
    # (e.g. a negative ratio turned into a negative slice bound).
    if not 0 <= test_ratio <= 1:
        raise ValueError("test_ratio must be between 0 and 1")
    # Fall back to the module-level generator so np.random.seed() still works.
    if random_state is None:
        rng = np.random
    else:
        rng = np.random.RandomState(random_state)
    shuffled_indices = rng.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

# Split the housing data with a test ratio of 0.2 and report both set sizes.
train_set, test_set = split_train_test(housing,0.2)
print('test set len',len(test_set)) #4128
print('train set len',len(train_set)) #16512

  • 1
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值