端到端机器学习项目
获取数据
下载数据
可以直接用浏览器下载数据文件,再解压出其中的CSV文件;但更好的办法是编写一个函数来完成这些步骤。特别是当数据会定期更新时,通过函数可以随时获取最新的数据。
import pdb
# pdb.set_trace()
import os
import tarfile
from six.moves import urllib
# Base raw-content URL; the dataset's relative path is appended to it.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
# Path of the housing dataset directory, relative to DOWNLOAD_ROOT.
HOUSING_PATH = "datasets/housing"
# Full URL of the compressed housing archive.
HOUSING_URL = "{}{}/housing.tgz".format(DOWNLOAD_ROOT, HOUSING_PATH)
# Default local directory used for the downloaded archive and the extracted CSV.
HOUSING_LOCAL_PATH = r"E:\Hands-On ML data"
def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_LOCAL_PATH):
    """Download the housing archive and extract it into ``housing_path``.

    Args:
        housing_url: URL of the ``housing.tgz`` archive to download.
        housing_path: Local directory that receives the downloaded archive
            and its extracted contents. It is created when missing.
    """
    # os.makedirs is the real API (os.mkdirs does not exist) and it also
    # creates any missing intermediate directories.
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    # Fetch the .tgz file from the network and store it at tgz_path.
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even if extraction raises.
    # NOTE(review): extractall() is applied to a downloaded archive without
    # any member filtering; confirm the source is trusted, or pass an
    # extraction filter on Python versions that support one.
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path)
# Download and extract the dataset with the default URL and local directory.
# This runs at module top level and performs a network download.
fetch_housing_data()
调用fetch_housing_data()函数,就会从网络上下载housing.tgz,并将其中的housing.csv解压到本地数据目录。
使用Pandas库来加载数据
import pandas as pd
def load_housing_data(housing_path=HOUSING_LOCAL_PATH):
    """Read ``housing.csv`` from ``housing_path`` using pandas.

    Args:
        housing_path: Directory that contains ``housing.csv``.

    Returns:
        Whatever ``pandas.read_csv`` returns for that file.
    """
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))
该函数调用pandas库的read_csv