以housing.csv数据集为例展开演示
(1)获取数据集
如果数据是动态的建议利用爬虫技术,使你持续获得最新数据
从https://github.com/ageron/handson-ml/raw/master/datasets/housing上下载数据
import os
import tarfile
from six.moves import urllib
# Base URL of the handson-ml repository's raw files.
download_url = "https://github.com/ageron/handson-ml/raw/master/"
# Relative directory where the housing archive is stored locally.
housing_path = "datasets/housing"
# Full URL of the compressed housing dataset, assembled from the parts above.
housing_url = "{}{}/housing.tgz".format(download_url, housing_path)
print(housing_url)
def fetch_housng_data(housing_url=housing_url, housing_path=housing_path):
    """Download the housing archive and unpack it under *housing_path*.

    Fetches housing.tgz from *housing_url*, saves it into *housing_path*
    (creating the directory if it does not exist) and extracts its
    contents there.

    NOTE(review): the function name has a typo ("housng") — kept as-is so
    existing callers are not broken.
    """
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)  # create the directory that will hold the data
    tgz_path = os.path.join(housing_path, "housing.tgz")  # local target file
    # Copy the object referenced by the URL into the local file.
    urllib.request.urlretrieve(housing_url, tgz_path)
    # Context manager closes the archive even if extraction raises —
    # the original explicit open/close leaked the handle on error.
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path)

fetch_housng_data()
import pandas
def load_data(housing_path=housing_path):
    """Read housing.csv from *housing_path* and return it as a DataFrame.

    Fixes two bugs in the original: ``csv.path=`` (a typo that raised a
    NameError) is now the local variable ``csv_path``, and the call uses
    ``pandas.read_csv`` to match the file's ``import pandas`` statement
    instead of the undefined alias ``pd``.
    """
    csv_path = os.path.join(housing_path, "housing.csv")
    return pandas.read_csv(csv_path)
如果数据不是动态的,可以跳过下载步骤,直接读取本地已保存的数据文件:
(2)了解数据集
import pandas as pd
# Load the already-downloaded housing dataset from disk.
# NOTE(review): hard-coded Windows path — assumes the CSV was saved at this
# location on this machine; adjust the path before running elsewhere.
housing=pd.read_csv(r"D:\sublime\机器学习\dataset\housing.csv")
print(housing.head())#preview the first five rows of the data
'''
longitude latitude ... median_house_value ocea