"""one-hot 编码的优劣势:

优势: 操作简单, 容易理解。
劣势: 完全割裂了词与词之间的联系; 而且在大语料集下, 每个向量的长度过大, 占据大量内存。
"""
import torch
from pyhanlp import *
from sklearn.preprocessing import OneHotEncoder
import numpy as np
# Tokenize a Chinese sentence with HanLP, then one-hot encode the tokens
# with scikit-learn's OneHotEncoder and round-trip one token back.
content = "虽然原始的食材便具有食物原始的风情,云初还是认为," \
"最美味的食物还是需要经过分割,烹调,处置,最后端上桌的食物才是最符合大唐人肠胃的食物。"
# HanLP.segment returns Term objects; keep only the surface form of each.
key = [term.word for term in HanLP.segment(content)]
print(key)
# OneHotEncoder expects a 2-D array of shape (n_samples, n_features);
# build the column vector once and reuse it for printing and fitting.
samples = np.array(key).reshape(-1, 1)
print(samples)
enc = OneHotEncoder()
enc.fit(samples)
# Encoding one token yields a row vector with a single 1 at the index of
# that word in enc.categories_ (categories are sorted, duplicates removed).
one_hot = enc.transform([['虽然']]).toarray()
print(one_hot)
print(enc.categories_)
# Derive the inverse example from the encoder's own output instead of a
# hand-typed fixed-length vector, so it stays correct if the vocabulary
# (and hence the vector length / hot index) ever changes.
print(enc.inverse_transform(one_hot))
# Example output for reference (produced by running the script above):
'''
['虽然', '原始', '的', '食材', '便', '具有', '食物', '原始', '的', '风情', ',', '云初', '还是', '认为', ',', '最', '美味', '的', '食物', '还是', '需要', '经过', '分割', ',', '烹调', ',', '处置', ',', '最后', '端', '上桌', '的', '食物', '才', '是', '最', '符合', '大唐', '人', '肠胃', '的', '食物', '。']
[['虽然']
['原始']
['的']
['食材']
['便']
['具有']
['食物']
['原始']
['的']
['风情']
[',']
['云初']
['还是']
['认为']
[',']
['最']
['美味']
['的']
['食物']
['还是']
['需要']
['经过']
['分割']
[',']
['烹调']
[',']
['处置']
[',']
['最后']
['端']
['上桌']
['的']
['食物']
['才']
['是']
['最']
['符合']
['大唐']
['人']
['肠胃']
['的']
['食物']
['。']]
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.
0. 0. 0. 0. 0.]]
[array(['。', '上桌', '云初', '人', '便', '具有', '分割', '原始', '处置', '大唐', '才', '是',
'最', '最后', '烹调', '的', '端', '符合', '经过', '美味', '肠胃', '虽然', '认为',
'还是', '需要', '风情', '食材', '食物', ','], dtype='<U2')]
[['虽然']]
'''