# encoding=utf8
'''
Tutorial topics covered by this script:
1. What is machine learning?
2. What is the k-NN algorithm?
3. What is a loss function, and why is it the key to machine learning?
4. What is Gradient Descent?
'''
# step1: load data & data analysis
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 — running this script requires an older scikit-learn version.
from sklearn.datasets import load_boston
dataset = load_boston()  # Bunch with the data matrix, target prices and metadata
print(dir(dataset)) # ['DESCR', 'data', 'feature_names', 'filename', 'target']
print(dataset["feature_names"]) # ['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO' 'B' 'LSTAT']
print(dataset["data"][:, 0])  # first feature column (CRIM)
# define the problem
'''
assuming you were a real state saleperson in boston
given some description data about a real state --->its price
'''
import pandas as pd
# Wrap the raw feature matrix in a DataFrame for easier inspection.
dataframe = pd.DataFrame(dataset['data'])
# print(dataframe)
dataframe.columns = dataset['feature_names']
print(dataframe)
dataframe['price'] = dataset['target']  # append the target as a 'price' column
print(dataframe)
# question:what's the most significant feature in the house price?
import seaborn as sns
import numpy as np
# Pairwise feature correlations, annotated to two decimal places.
sns.heatmap(dataframe.corr(), annot=True, fmt=".2f")
# Based on the heatmap above, the number of rooms (RM) is the feature most
# positively correlated with the house price.
# Simplification: how do we estimate a house's price from its number of rooms?
X_rm = dataframe['RM'].values   # feature: average number of rooms
Y = dataframe['price'].values   # target: house price
# NOTE(review): duplicate RM values collapse here — the last price seen for a
# given room count wins.
rm_to_price = {r: y for r, y in zip(X_rm, Y)}
print(rm_to_price)
def find_price_by_similar(history_price, query_x, topn=3):
    """Estimate a price for ``query_x`` as the mean price of its nearest neighbours.

    Neighbours are the ``topn`` keys of ``history_price`` closest to
    ``query_x`` by squared distance.

    For a good engineer or algorithm developer, code readability
    always matters more than brevity.
    """
    by_closeness = sorted(
        history_price.items(),
        key=lambda item: (item[0] - query_x) ** 2,
    )
    neighbour_prices = [price for _, price in by_closeness[:topn]]
    return np.mean(neighbour_prices)
find_price_by_similar(rm_to_price, 7)
# The approach above — find the k closest neighbours and average them — is the
# classic machine-learning algorithm: KNN.
# Drawback: it is inefficient when the dataset is large (among other issues).
# A more efficient way to learn:
# If we can find the functional relationship between X_rm and Y, then at
# prediction time we just feed the input into that function and directly get
# the predicted value.
# Fit a functional relationship.
import matplotlib.pyplot as plt
plt.scatter(X_rm, Y)
real_y = [3, 6, 7]    # toy ground-truth values for illustrating the loss
y_hats = [3, 4, 7]    # one candidate set of predictions
y_hats_2 = [3, 6, 6]  # an alternative candidate set of predictions
# The LOSS function
def loss(y, yhat):
    """Mean squared error (MSE) between ground truth and predictions.

    Args:
        y: sequence of true values.
        yhat: sequence of predicted values, same length as ``y``.

    Returns:
        float: ``mean((y - yhat) ** 2)``.
    """
    # Bug fix: the original was np.mean(np.array(y) - np.array(yhat) ** 2) —
    # since ** binds tighter than -, only yhat was squared.  MSE squares the
    # residuals, so the difference must be parenthesised.
    loss_result = np.mean((np.array(y) - np.array(yhat)) ** 2)
    # print(loss_result)
    return loss_result
def model(x, k, b):
    """Linear model: predict ``y = k * x + b`` (x may be a scalar or array)."""
    return b + k * x
# MSE: mean squared error
loss(real_y, y_hats)
# Now that we have an evaluation criterion, how do we get the optimal k and b?
# 1. Compute them directly with calculus.
# 2. Use random simulation (random search) to look for them.
import random
var_max, var_min = 100, -100        # search range for k and b
min_loss = float('inf')             # positive infinity: best loss seen so far
best_k, best_b = None, None
total_times = 1000                  # number of search / update iterations
# for t in range(total_times):
# k, b = random.randint(var_min, var_max), random.randint(var_min, var_max)
# print(k, b)
# loss_ = loss(Y, model(X_rm, k, b))
# print("i am looking for ~~~~~")
# if loss_ < min_loss:
# min_loss = loss_
# best_k, best_b = k, b
# print('在{}时刻我找到了最优的K:{}和b:{},这个时候的loss是:{}'.format(t, k, b, loss_))
# in the begining, the updating is more frequent
# when time passed by,the updating will be more and more difficult
# how to get the better k and b? Gradient Descent
def partial_k(x, y, k_n, b_n):
    """Partial derivative of the MSE loss with respect to k, at (k_n, b_n)."""
    residual = y - (k_n * x + b_n)
    return 2 * np.mean(-x * residual)
def partial_b(x, y, k_n, b_n):
    """Partial derivative of the MSE loss with respect to b, at (k_n, b_n)."""
    residual = y - (k_n * x + b_n)
    return -2 * np.mean(residual)
alpha = 1e-3  # learning rate: size of each gradient step
# Random initialisation of the two parameters.
k, b = random.randint(var_min, var_max), random.randint(var_min, var_max)
for t in range(total_times):
    # Step each parameter in the direction opposite to its gradient.
    k = k + (-1) * partial_k(X_rm, Y, k, b) * alpha
    # Bug fix: the original read "b = k + ..." which discarded b's value and
    # overwrote it with the freshly-updated k plus the gradient term, instead
    # of performing b's own gradient step.
    b = b + (-1) * partial_b(X_rm, Y, k, b) * alpha
    print(k, b)
    loss_ = loss(Y, model(X_rm, k, b))
    print("I am looking for k and b")
    # Track the best parameters seen so far.
    if loss_ < min_loss:
        min_loss = loss_
        best_k, best_b = k, b
        print('在{}时刻我找到了最优的K:{}和b:{},这个时候的loss是:{}'.format(t, k, b, loss_))
plt.scatter(X_rm, Y)
plt.scatter(X_rm, best_k * X_rm + best_b)  # overlay the fitted line
# (removed trailing page artifacts — dates and view counters copied from the
# blog platform this script was pasted from; they were not valid Python)