Numerical Features
Handling Discrete Values
The most common step when preparing such features is encoding their discrete (categorical) values.
A freshly loaded dataset usually contains values that a model cannot consume directly, so they have to be converted first.
LabelEncoder
import pandas as pd
import numpy as np
# Load the data
vg_df = pd.read_csv('datasets/vgsales.csv', encoding = "ISO-8859-1")
vg_df[['Name', 'Platform', 'Year', 'Genre', 'Publisher']].head()
# Get the distinct values of one categorical feature
genres = np.unique(vg_df['Genre'])
genres
Original data:
array(['Action', 'Adventure', 'Fighting', 'Misc', 'Platform', 'Puzzle',
'Racing', 'Role-Playing', 'Shooter', 'Simulation', 'Sports',
'Strategy'], dtype=object)
Use LabelEncoder from sklearn for the conversion
LabelEncoder() assigns an integer code to every distinct text value, turning the non-numeric column into a numeric one.
from sklearn.preprocessing import LabelEncoder
gle = LabelEncoder()
genre_labels = gle.fit_transform(vg_df['Genre'])
genre_mappings = {index: label for index, label in enumerate(gle.classes_)}
genre_mappings
{0: 'Action',
1: 'Adventure',
2: 'Fighting',
3: 'Misc',
4: 'Platform',
5: 'Puzzle',
6: 'Racing',
7: 'Role-Playing',
8: 'Shooter',
9: 'Simulation',
10: 'Sports',
11: 'Strategy'}
View the result:
vg_df['GenreLabel'] = genre_labels
vg_df[['Name', 'Platform', 'Year', 'Genre', 'GenreLabel']].head()
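The encoding can also be reversed with inverse_transform; a minimal sketch using the fitted gle from above:
# Decode the first few integer labels back into genre names
gle.inverse_transform(genre_labels[:5])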
Map
poke_df = pd.read_csv('datasets/Pokemon.csv', encoding='utf-8')
poke_df[['Name', 'Generation']].head()
Original data:
poke_df = poke_df.sample(random_state=1, frac=1).reset_index(drop=True)
np.unique(poke_df['Generation'])
array(['Gen 1', 'Gen 2', 'Gen 3', 'Gen 4', 'Gen 5', 'Gen 6'], dtype=object)
Build a map that assigns a numeric value to each category
gen_ord_map = {'Gen 1': 1, 'Gen 2': 2, 'Gen 3': 3,
'Gen 4': 4, 'Gen 5': 5, 'Gen 6': 6}
poke_df['GenerationLabel'] = poke_df['Generation'].map(gen_ord_map)
poke_df[['Name', 'Generation', 'GenerationLabel']].iloc[4:10]
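One thing to watch with map(): any category missing from the dictionary silently becomes NaN instead of raising an error. A quick check (a sketch, assuming gen_ord_map covers every generation string):
# Count rows whose generation was not found in the mapping
poke_df['GenerationLabel'].isnull().sum()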
One-hot Encoding
First apply a LabelEncoder, for later comparison
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
poke_df = pd.read_csv('datasets/Pokemon.csv', encoding='utf-8')
# generations
gen_le = LabelEncoder()
gen_labels = gen_le.fit_transform(poke_df['Generation'])
poke_df['Gen_Label'] = gen_labels
# legendary
leg_le = LabelEncoder()
leg_labels = leg_le.fit_transform(poke_df['Legendary'])
poke_df['Lgnd_Label'] = leg_labels
poke_df_sub = poke_df[['Name', 'Generation', 'Gen_Label', 'Legendary', 'Lgnd_Label']]
poke_df_sub.iloc[4:10]
Apply the One-hot Encoding:
# generation
gen_ohe = OneHotEncoder()
gen_feature_arr = gen_ohe.fit_transform(poke_df[['Generation']]).toarray()
gen_feature_labels = list(gen_le.classes_)
print (gen_feature_labels)
gen_features = pd.DataFrame(gen_feature_arr, columns=gen_feature_labels)
['Gen 1', 'Gen 2', 'Gen 3', 'Gen 4', 'Gen 5', 'Gen 6']
# legendary
leg_ohe = OneHotEncoder()
leg_feature_arr = leg_ohe.fit_transform(poke_df[['Legendary']]).toarray()
leg_feature_labels = ['Legendary_'+str(cls_label) for cls_label in leg_le.classes_]
print (leg_feature_labels)
leg_features = pd.DataFrame(leg_feature_arr, columns=leg_feature_labels)
['Legendary_False', 'Legendary_True']
poke_df_ohe = pd.concat([poke_df_sub, gen_features, leg_features], axis=1)
columns = sum([['Name', 'Generation', 'Gen_Label'],gen_feature_labels,
['Legendary', 'Lgnd_Label'],leg_feature_labels], [])
poke_df_ohe[columns].head()
View the result:
get_dummies
The pandas get_dummies() function converts a variable with several distinct values into 0/1 indicator columns.
# Generation
gen_onehot_features = pd.get_dummies(poke_df['Generation'])
pd.concat([poke_df[['Name', 'Generation']], gen_onehot_features], axis=1).head()
View the result:
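pd.get_dummies also accepts a column-name prefix and a drop_first flag that removes one redundant indicator column; a small sketch of the same call with these options:
# Same one-hot expansion, with prefixed column names and the first level dropped
gen_dummy_features = pd.get_dummies(poke_df['Generation'], prefix='Gen', drop_first=True)
pd.concat([poke_df[['Name', 'Generation']], gen_dummy_features], axis=1).head()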
Binary Features
# Read the data
popsong_df = pd.read_csv('datasets/song_views.csv', encoding='utf-8')
popsong_df.head(10)
watched = np.array(popsong_df['listen_count'])
watched[watched >= 1] = 1
popsong_df['watched'] = watched
popsong_df.head(10)
View the result:
Binarize with sklearn
from sklearn.preprocessing import Binarizer
bn = Binarizer(threshold=0.9)  # values above 0.9 (i.e. counts >= 1) become 1, matching the manual version above
pd_watched = bn.transform([popsong_df['listen_count']])[0]
popsong_df['sklearn_watched'] = pd_watched
popsong_df.head(10)
View the result:
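With the threshold set as above, the manual and sklearn versions should produce the same column; a quick sanity check (a sketch):
# The numpy-based and Binarizer-based columns should match
np.array_equal(popsong_df['watched'].values, popsong_df['sklearn_watched'].values)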
Polynomial Features
Polynomial features map the data into a higher-dimensional space; they are often used together with support vector machines.
poke_df = pd.read_csv('datasets/Pokemon.csv', encoding='utf-8')
poke_df.head()
atk_def = poke_df[['Attack', 'Defense']]
atk_def.head()
from sklearn.preprocessing import PolynomialFeatures
pf = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
res = pf.fit_transform(atk_def)
print(res)
intr_features = pd.DataFrame(res, columns=['Attack', 'Defense', 'Attack^2', 'Attack x Defense', 'Defense^2'])
intr_features.head(5)
array([[ 49., 49., 2401., 2401., 2401.],
[ 62., 63., 3844., 3906., 3969.],
[ 82., 83., 6724., 6806., 6889.],
...,
[ 110., 60., 12100., 6600., 3600.],
[ 160., 60., 25600., 9600., 3600.],
[ 110., 120., 12100., 13200., 14400.]])
View the result (squares and cross-products of the original columns):
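Instead of hard-coding the new column names, they can be read from the transformer itself; a sketch (in newer scikit-learn versions the method is called get_feature_names_out):
# Ask PolynomialFeatures for the names of the generated features
pf.get_feature_names(['Attack', 'Defense'])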
Handling Continuous Values
Binning Features
Discretizing continuous values
fcc_survey_df = pd.read_csv('datasets/fcc_2016_coder_survey_subset.csv', encoding='utf-8')
fcc_survey_df[['ID.x', 'EmploymentField', 'Age', 'Income']].head()
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
fcc_survey_df['Age'].hist(color='#A9C5D3')
ax.set_title('Developer Age Histogram', fontsize=12)
ax.set_xlabel('Age', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)
Binning based on rounding
Age Range: Bin
---------------
0 - 9 : 0
10 - 19 : 1
20 - 29 : 2
30 - 39 : 3
40 - 49 : 4
50 - 59 : 5
60 - 69 : 6
... and so on
fcc_survey_df['Age_bin_round'] = np.array(np.floor(np.array(fcc_survey_df['Age']) / 10.))
fcc_survey_df[['ID.x', 'Age', 'Age_bin_round']].iloc[1071:1076]
View the result:
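The same decade bins can also be produced with pd.cut and explicit edges. A rough sketch; the edge list is an assumption chosen to cover the observed ages, and because pd.cut uses right-closed intervals, ages that fall exactly on an edge may land one bin lower than in the floor-division version:
# Decade bins via pd.cut with explicit (hypothetical) edges
bin_ranges = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
bin_labels = list(range(len(bin_ranges) - 1))
fcc_survey_df['Age_bin_cut'] = pd.cut(fcc_survey_df['Age'], bins=bin_ranges, labels=bin_labels)
fcc_survey_df[['ID.x', 'Age', 'Age_bin_round', 'Age_bin_cut']].iloc[1071:1076]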
Quantile-based binning
# Inspect the data
fcc_survey_df[['ID.x', 'Age', 'Income']].iloc[4:9]
# Plot the income distribution
fig, ax = plt.subplots()
fcc_survey_df['Income'].hist(bins=30, color='#A9C5D3')
ax.set_title('Developer Income Histogram', fontsize=12)
ax.set_xlabel('Developer Income', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)
# Compute the quantiles
quantile_list = [0, .25, .5, .75, 1.]
quantiles = fcc_survey_df['Income'].quantile(quantile_list)
# Plot again, marking the quantiles
fig, ax = plt.subplots()
fcc_survey_df['Income'].hist(bins=30, color='#A9C5D3')
for quantile in quantiles:
    qvl = plt.axvline(quantile, color='r')
ax.legend([qvl], ['Quantiles'], fontsize=10)
ax.set_title('Developer Income Histogram with Quantiles', fontsize=12)
ax.set_xlabel('Developer Income', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)
Bin the data:
quantile_labels = ['0-25Q', '25-50Q', '50-75Q', '75-100Q']
fcc_survey_df['Income_quantile_range'] = pd.qcut(fcc_survey_df['Income'],
q=quantile_list)
fcc_survey_df['Income_quantile_label'] = pd.qcut(fcc_survey_df['Income'],
q=quantile_list, labels=quantile_labels)
fcc_survey_df[['ID.x', 'Age', 'Income',
'Income_quantile_range', 'Income_quantile_label']].iloc[4:9]
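pd.qcut can also return the computed bin edges for inspection; a small sketch:
# Retrieve the income values at the quantile boundaries
income_binned, income_bin_edges = pd.qcut(fcc_survey_df['Income'], q=quantile_list, retbins=True)
income_bin_edges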
Log Transform
Applying a log transform to heavily skewed data brings it closer to a normal distribution.
fcc_survey_df['Income_log'] = np.log((1+ fcc_survey_df['Income']))
fcc_survey_df[['ID.x', 'Age', 'Income', 'Income_log']].iloc[4:9]
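np.log1p computes log(1 + x) directly and is the usual idiom for this transform; a quick equivalence check (a sketch, with equal_nan=True in case of missing incomes):
# log1p should reproduce the Income_log column
np.allclose(np.log1p(fcc_survey_df['Income']), fcc_survey_df['Income_log'], equal_nan=True)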
Original data:
income_mean = np.round(np.mean(fcc_survey_df['Income']), 2)
fig, ax = plt.subplots()
fcc_survey_df['Income'].hist(bins=30, color='#A9C5D3')
plt.axvline(income_mean, color='r')
ax.set_title('Developer Income Histogram', fontsize=12)
ax.set_xlabel('Developer Income', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)
ax.text(income_mean, 500, r'$\mu$='+str(income_mean), fontsize=10)  # annotate the mean (text position is approximate)
income_log_mean = np.round(np.mean(fcc_survey_df['Income_log']), 2)
fig, ax = plt.subplots()
fcc_survey_df['Income_log'].hist(bins=30, color='#A9C5D3')
plt.axvline(income_log_mean, color='r')
ax.set_title('Developer Income Histogram after Log Transform', fontsize=12)
ax.set_xlabel('Developer Income (log scale)', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)
ax.text(11.5, 450, r'$\mu$='+str(income_log_mean), fontsize=10)
Text Features
Construct a small text dataset
corpus = ['The sky is blue and beautiful.',
'Love this blue and beautiful sky!',
'The quick brown fox jumps over the lazy dog.',
'The brown fox is quick and the blue dog is lazy!',
'The sky is very blue and the sky is very beautiful today',
'The dog is lazy but the brown fox is quick!'
]
labels = ['weather', 'weather', 'animals', 'animals', 'weather', 'animals']
corpus = np.array(corpus)
corpus_df = pd.DataFrame({'Document': corpus,
'Category': labels})
corpus_df = corpus_df[['Document', 'Category']]
corpus_df
Set up the stop words (taken from the nltk package via nltk.corpus.stopwords.words('english'))
stop_words = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
# Tokenizer and stop words
import re
import nltk

wpt = nltk.WordPunctTokenizer()
#stop_words = nltk.corpus.stopwords.words('english')
print (stop_words)
def normalize_document(doc):
    # remove special characters and extra whitespace, then lowercase
    doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc, flags=re.I)
    doc = doc.lower()
    doc = doc.strip()
    # tokenize the document
    tokens = wpt.tokenize(doc)
    #print(tokens)
    # filter the stop words out of the document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create the document from the remaining tokens
    doc = ' '.join(filtered_tokens)
    return doc

# vectorize the function so it can be applied to the whole corpus (needed by the next cell)
normalize_corpus = np.vectorize(normalize_document)
View the result (the text after stop-word removal):
norm_corpus = normalize_corpus(corpus)
norm_corpus
array(['sky blue beautiful',
'love blue beautiful sky',
'quick brown fox jumps lazy dog',
'brown fox quick blue dog lazy',
'sky blue sky beautiful today',
'dog lazy brown fox quick'], dtype='<U30')
After removing the stop words, only the key words of each document remain, but they are still stored as whole sentences, which a model cannot use directly. They need to be split into features, which is what the following methods do.
Bag-of-Words Model
# Use CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
print (norm_corpus)
# Instantiate CountVectorizer and fit it on the corpus
cv = CountVectorizer(min_df=0., max_df=1.)
cv.fit(norm_corpus)
print (cv.get_feature_names())
# Document-term count matrix
cv_matrix = cv.fit_transform(norm_corpus)
# Convert to a dense array
cv_matrix = cv_matrix.toarray()
cv_matrix
['sky blue beautiful' 'love blue beautiful sky'
'quick brown fox jumps lazy dog' 'brown fox quick blue dog lazy'
'sky blue sky beautiful today' 'dog lazy brown fox quick']
['beautiful', 'blue', 'brown', 'dog', 'fox', 'jumps', 'lazy', 'love', 'quick', 'sky', 'today']
View the result:
vocab = cv.get_feature_names()
pd.DataFrame(cv_matrix, columns=vocab)
N-Gram Model
The model above only records the frequency of individual words and says nothing about how neighbouring words relate to each other. One refinement is to build the model on pairs of adjacent words instead.
bv = CountVectorizer(ngram_range=(2,2))
bv_matrix = bv.fit_transform(norm_corpus)
bv_matrix = bv_matrix.toarray()
vocab = bv.get_feature_names()
pd.DataFrame(bv_matrix, columns=vocab)
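Setting ngram_range=(1, 2) keeps both unigrams and bigrams in a single matrix; a small sketch:
# Unigrams and bigrams together
ubv = CountVectorizer(ngram_range=(1, 2))
ubv_matrix = ubv.fit_transform(norm_corpus).toarray()
pd.DataFrame(ubv_matrix, columns=ubv.get_feature_names())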
TF-IDF Model
The TF-IDF model was covered in an earlier note; for details see 机器学习入门 — 贝叶斯 - 中文新闻分类任务.
from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(min_df=0., max_df=1., use_idf=True)
tv_matrix = tv.fit_transform(norm_corpus)
tv_matrix = tv_matrix.toarray()
vocab = tv.get_feature_names()
pd.DataFrame(np.round(tv_matrix, 2), columns=vocab)
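TfidfVectorizer is equivalent to running CountVectorizer and then TfidfTransformer with matching settings; a sketch that checks this against the cv_matrix from the bag-of-words section (assuming it is still in memory):
# Re-weight the raw counts and compare with the TfidfVectorizer output
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_trans = TfidfTransformer(use_idf=True)
np.allclose(tfidf_trans.fit_transform(cv_matrix).toarray(), tv_matrix)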
Similarity Features
With the document vectors in hand, cosine similarity between documents can be computed.
from sklearn.metrics.pairwise import cosine_similarity
similarity_matrix = cosine_similarity(tv_matrix)
similarity_df = pd.DataFrame(similarity_matrix)
similarity_df
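Cosine similarity is just the dot product of the two L2-normalized vectors; a manual check for the first two documents (a sketch):
# Should match similarity_df.iloc[0, 1]
v1, v2 = tv_matrix[0], tv_matrix[1]
np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))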
Clustering Features
The KMeans model was covered in an earlier note; for details see 机器学习入门 — K-means、DBSCAN聚类算法(概念、图解、代码示例).
from sklearn.cluster import KMeans
km = KMeans(n_clusters=2)
km.fit_transform(similarity_df)
cluster_labels = km.labels_
cluster_labels = pd.DataFrame(cluster_labels, columns=['ClusterLabel'])
pd.concat([corpus_df, cluster_labels], axis=1)
Topic Models
LDA here means Latent Dirichlet Allocation, a probabilistic topic model (not the Linear Discriminant Analysis covered in the earlier note 机器学习入门 — LDA与PCA算法(公式推导、纯python代码实现、scikit-learn api调用对比结果)).
from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_components=2, max_iter=100, random_state=42)
dt_matrix = lda.fit_transform(tv_matrix)
features = pd.DataFrame(dt_matrix, columns=['T1', 'T2'])
features
The topic-word weights can also be inspected:
tt_matrix = lda.components_
for topic_weights in tt_matrix:
    topic = [(token, weight) for token, weight in zip(vocab, topic_weights)]
    topic = sorted(topic, key=lambda x: -x[1])
    topic = [item for item in topic if item[1] > 0.6]
    print(topic)
[('fox', 1.7265536238698524), ('quick', 1.7264910761871224), ('dog', 1.7264019823624879), ('brown', 1.7263774760262807), ('lazy', 1.7263567668213813), ('jumps', 1.0326450363521607), ('blue', 0.7770158513472083)]
[('sky', 2.263185143458752), ('beautiful', 1.9057084998062579), ('blue', 1.7954559705805624), ('love', 1.1476805311187976), ('today', 1.0064979209198706)]
Word Embedding Model (word2vec)
from gensim.models import word2vec
wpt = nltk.WordPunctTokenizer()
tokenized_corpus = [wpt.tokenize(document) for document in norm_corpus]
# Set the model parameters
feature_size = 10     # dimensionality of the word vectors
window_context = 10   # context window size
min_word_count = 1    # minimum word count
sample = 1e-3         # downsampling rate for frequent words
w2v_model = word2vec.Word2Vec(tokenized_corpus, size=feature_size,
window=window_context, min_count = min_word_count,
sample=sample)
# Look at the vector for 'sky'
w2v_model.wv['sky']
array([-0.03016021, -0.0126131 , -0.04036982, -0.00525936, 0.01884378,
0.0091414 , 0.025741 , -0.00395817, -0.02595217, 0.02557161],
dtype=float32)
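The trained model can also be queried for the nearest neighbours of a word in the embedding space; a sketch (on a corpus this small the neighbours are essentially noise):
# Words closest to 'sky' in the learned vector space
w2v_model.wv.most_similar('sky', topn=3)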
Average the word vectors of all the words in each document:
# average the word vectors of one document
def average_word_vectors(words, model, vocabulary, num_features):
    feature_vector = np.zeros((num_features,), dtype="float64")
    nwords = 0.
    for word in words:
        if word in vocabulary:
            nwords = nwords + 1.
            feature_vector = np.add(feature_vector, model[word])
    if nwords:
        feature_vector = np.divide(feature_vector, nwords)
    return feature_vector

def averaged_word_vectorizer(corpus, model, num_features):
    vocabulary = set(model.wv.index2word)
    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
                for tokenized_sentence in corpus]
    return np.array(features)
This produces one feature vector per document:
w2v_feature_array = averaged_word_vectorizer(corpus=tokenized_corpus, model=w2v_model,
num_features=feature_size)
pd.DataFrame(w2v_feature_array)
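These document vectors can be used like any other numeric features, for example to compare documents directly; a sketch reusing cosine_similarity from the Similarity section:
# Pairwise similarity between the averaged word2vec document vectors
pd.DataFrame(cosine_similarity(w2v_feature_array))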