1、耗费空间
2、数据量增大或需要更新时,效率低
3、遗漏很多重要信息
def one_hot(elements):
    """Return a one-hot vector for every item of *elements*.

    Each distinct value is assigned one column, in order of first
    appearance, so the encoding is deterministic across runs.

    Args:
        elements: Iterable of hashable values (may contain duplicates).

    Returns:
        A list with one row per input item. Every row is a list of ints
        whose length equals the number of distinct values, with a single
        1 in the column assigned to that item's value. An empty input
        yields an empty list.
    """
    # Materialize once so that one-shot iterators can be traversed twice.
    elements = list(elements)
    # dict.fromkeys de-duplicates while keeping first-seen order; the mapping
    # gives O(1) column lookup instead of a linear list.index() per item.
    column_of = {
        value: column for column, value in enumerate(dict.fromkeys(elements))
    }
    width = len(column_of)

    vectors = []
    for element in elements:
        vec = [0] * width
        vec[column_of[element]] = 1
        vectors.append(vec)
    return vectors
# Demo: one-hot encode a list of city names in which one name occurs twice.
print(one_hot(["北京", "上海", "南京", "北京", "重庆", "深圳", "广州", "天津"]))
多分类问题
# Toy multi-class example: one sample with three raw features.
x = [1237, 4512, 7845]
y = [0, 0, 1, 0, 0] # true class of x as a one-hot vector; the third class is chosen here
# NOTE(review): `normalize` is defined outside this section - presumably it
# rescales x while keeping its length of 3; confirm against its definition.
x = np.array(normalize(x))
# Random 3x5 weight matrix; no seed is set here, so values differ per run.
weights = np.random.random(size=(3, 5))
# print(np.dot(x, weights)) # logits
def softmax(x, axis=None):
    """Return the softmax of *x*.

    Args:
        x: Array-like of scores. It is not modified.
        axis: Axis along which to normalize. The default ``None``
            normalizes over all entries of *x* together.

    Returns:
        An array with the same shape as *x* whose entries are positive and
        sum to 1 along *axis* (over the whole array when *axis* is None).

    Raises:
        ValueError: If *x* is empty (the maximum is undefined).
    """
    values = np.asarray(x)
    # Integer and boolean inputs are promoted to float64 first: subtracting
    # the maximum from an unsigned array would wrap around, and np.exp on
    # small integer dtypes would fall back to low-precision floats.
    if not np.issubdtype(values.dtype, np.inexact):
        values = values.astype(np.float64)
    # Shifting by the maximum leaves the result unchanged mathematically
    # but keeps np.exp from overflowing on large scores.
    shifted = values - np.max(values, axis=axis, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=axis, keepdims=True)
# ic(softmax(np.dot(x, weights)))
def cross_entropy(y, yhat):
    """Return the cross-entropy between true values *y* and predictions *yhat*.

    Computes ``-sum(y * log(yhat))`` along the first axis, so 1-D inputs
    give a single number. Terms with ``y == 0`` contribute exactly 0 (the
    usual ``0 * log(0) = 0`` convention), even when the matching prediction
    is 0.

    Args:
        y: Array-like of true values, e.g. a one-hot label vector.
        yhat: Array-like of predicted probabilities, same shape as *y*.

    Returns:
        The cross-entropy. It is infinite when a prediction is 0 at a
        position where *y* is non-zero.

    Raises:
        ValueError: If *y* and *yhat* do not have the same shape.
    """
    y = np.asarray(y, dtype=float)
    yhat = np.asarray(yhat, dtype=float)
    if y.shape != yhat.shape:
        raise ValueError(
            f"y and yhat must have the same shape, got {y.shape} and {yhat.shape}"
        )
    # Where y is 0 the term must vanish; swapping the prediction for 1 there
    # makes log() return 0 and avoids the NaN produced by 0 * log(0).
    contributes = y != 0
    log_yhat = np.log(np.where(contributes, yhat, 1.0))
    # Vectorized sum over the first axis replaces np.sum(<generator>), which
    # NumPy has deprecated.
    return -np.sum(y * log_yhat, axis=0)
# Demo: cross-entropy between the label y and the softmax of np.dot(x, weights).
print(cross_entropy(y, softmax(np.dot(x, weights))))