# 推荐系统 — 基于 tf.feature_column 构建数据处理 pipeline

import tensorflow as tf
import pandas as pd

# Demo: tf.feature_column pipelines for an integer-coded categorical feature.
# Category ids in 'pets': cat=0, dog=1, rabbit=2, pig=3 (ids must be < 20).
# NOTE(review): the original sample data contained -1, which
# categorical_column_with_identity rejects (ids must lie in [0, num_buckets));
# replaced with a valid id so the demo can run.
pets = {'pets': [[2, 4], [3, 8], [0, 1], [1, 0]]}
df = pd.DataFrame(pets)

# Identity column: the integer id itself is the bucket index (0..19).
id_column = tf.feature_column.categorical_column_with_identity(
    key='pets', num_buckets=20)

# Hash column: alternative mapping for when the id space is unknown or large;
# ids are hashed into 15 buckets. Built for illustration, not used below.
hash_column = tf.feature_column.categorical_column_with_hash_bucket(
    key='pets', hash_bucket_size=15)

# embedding_column must wrap a *categorical* column — the original wrapped a
# numeric_column, which DenseFeatures rejects at call time.
column = tf.feature_column.embedding_column(id_column, 3)
indicator = tf.feature_column.indicator_column(id_column)

# Batch all 4 rows so DenseFeatures receives a rank-2 feature dict.
# (The original called next(iter(ds)) before `ds` existed.)
dataset = tf.data.Dataset.from_tensor_slices(dict(df))
ds = dataset.batch(4)
example_batch = next(iter(ds))

feature_layer = tf.keras.layers.DenseFeatures(column)
print(feature_layer(example_batch).numpy())

for feature_batch in ds.take(1):
    print('Some feature keys:', list(feature_batch.keys()))
    print()
    print('A batch of class:', feature_batch['pets'].numpy())
    print(tf.keras.layers.DenseFeatures([indicator])(feature_batch).numpy())
# NOTE(review): stray lines pasted from another example followed here —
# they referenced undefined names (feature_column, color_column, color_data,
# _LazyBuilder) with broken indentation and could never execute; removed.

# Fix the global seed so the randomly initialized embedding is reproducible.
tf.random.set_seed(123)

# Four sample rows; each row carries two color tokens.
color_data = {'color': [['E','F'], ['G','B'], ['B','B'], ['A','F']]}
df = pd.DataFrame(color_data)

# Vocabulary lookup: each string maps to its index in the list;
# out-of-vocabulary strings map to default_value (-1).
color_column = tf.feature_column.categorical_column_with_vocabulary_list(
    'color', ['A', 'B', 'C', 'D', 'G', 'E', 'F'], dtype=tf.string, default_value=-1)

# Multi-hot view and a trainable 7-dimensional embedding of the same column.
indicator = tf.feature_column.indicator_column(color_column)
color_embeding = tf.feature_column.embedding_column(color_column, 7)

dataset = tf.data.Dataset.from_tensor_slices(dict(df))
ds = dataset.batch(4)  # one batch containing all 4 rows
example_batch = next(iter(ds))

feature_layer = tf.keras.layers.DenseFeatures(color_embeding)
print(feature_layer(example_batch).numpy())

# Six sample rows, a single color token per row (the original comment
# said "4 rows", which was wrong).
color_data = {'color': [['A'], ['B'], ['C'], ['D'], ['E'], ['F']]}
df = pd.DataFrame(color_data)

# Same 7-word vocabulary; unknown strings map to default_value (-1).
color_column = tf.feature_column.categorical_column_with_vocabulary_list(
    'color', ['A', 'B', 'C', 'D', 'G', 'E', 'F'], dtype=tf.string, default_value=-1)

indicator = tf.feature_column.indicator_column(color_column)
color_embeding = tf.feature_column.embedding_column(color_column, 7)

dataset = tf.data.Dataset.from_tensor_slices(dict(df))
ds = dataset.batch(11)  # 11 > 6 rows, so a single batch holds everything
example_batch = next(iter(ds))

feature_layer = tf.keras.layers.DenseFeatures(color_embeding)
print(feature_layer(example_batch).numpy())

# One 2-dim embedding layer applied to two differently shaped inputs: the
# same DenseFeatures instance reuses its embedding table for both batches.
color_column = tf.feature_column.categorical_column_with_vocabulary_list(
    'color', ['A', 'B', 'C', 'D', 'G', 'E', 'F'], dtype=tf.string, default_value=-1)
color_embeding = tf.feature_column.embedding_column(color_column, 2)
feature_layer = tf.keras.layers.DenseFeatures(color_embeding)

# 7 rows of two tokens each vs. 6 rows of one token each
# (the original "4行样本" comments were inaccurate).
color_data1 = {'color': [['E','F'], ['G','B'], ['B','B'], ['A','F'], ['A', 'C'], ['D', 'D'], ['F', 'F']]}
color_data2 = {'color': [['A'], ['B'], ['C'], ['D'], ['E'], ['F']]}

df1 = pd.DataFrame(color_data1)
df2 = pd.DataFrame(color_data2)
dataset1 = tf.data.Dataset.from_tensor_slices(dict(df1))
dataset2 = tf.data.Dataset.from_tensor_slices(dict(df2))

# batch(11) exceeds both row counts, so each dataset yields one full batch.
ds1 = dataset1.batch(11)
ds2 = dataset2.batch(11)

example_batch1 = next(iter(ds1))
print(feature_layer(example_batch1).numpy())
example_batch2 = next(iter(ds2))
print(feature_layer(example_batch2).numpy())

# 参考 1: https://www.jianshu.com/p/fceb64c790f3

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
# K-Means clustering of the sklearn "digits" dataset (scraped notebook code,
# reformatted into a runnable script). Three experiments are timed and their
# inertia recorded: (1) scale + KMeans, (2) scale + KMeans with explicit
# k-means++ init (a 10-component PCA is computed as an exercise but, as in
# the original, not used inside the pipeline), and
# (3) scale + PCA keeping 98% variance + KMeans.
import numpy as np
import matplotlib.pyplot as plt
from time import time

from sklearn.cluster import KMeans
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# NOTE(review): the notebook had "%matplotlib inline" — an IPython magic that
# is a syntax error in a plain .py file, so it is omitted here.

data, labels = load_digits(return_X_y=True)
(n_samples, n_features), n_digits = data.shape, np.unique(labels).size
print(f"# 类别数: {n_digits}; # 样本数: {n_samples}; # 特征数: {n_features}")
print(data[:2])

# --- 实验 1: StandardScaler + KMeans (default init) -------------------------
kmeans = KMeans(n_clusters=10, random_state=42)
# 创建管道并训练, 记录训练时间
t0 = time()
estimator = make_pipeline(StandardScaler(), kmeans).fit(data)
fit_time = time() - t0
print("训练时间:", fit_time)
# 通过惯性 (inertia) 评估聚类性能
print(estimator)
print(estimator[-1].inertia_)
# NOTE(review): key was "inertia:" (stray colon typo) in the original.
result1 = {"fit-time": fit_time, "inertia": estimator[-1].inertia_}

# --- 实验 2: 同管道, 显式 k-means++ 初始化 ----------------------------------
# 练习: 使用 PCA 分解得到 10 个主成分, 放到变量 components 中
# (计算了但未接入下方管道, 与原 notebook 一致).
pca = PCA(n_components=10)
components = pca.fit_transform(data)

kmeans = KMeans(n_clusters=10, init="k-means++", random_state=42)
t0 = time()
estimator = make_pipeline(StandardScaler(), kmeans).fit(data)
fit_time = time() - t0
print("训练时间:", fit_time)
print(estimator)
print(estimator[-1].inertia_)
result2 = {"fit-time": fit_time, "inertia": estimator[-1].inertia_}

# --- 实验 3: StandardScaler + PCA(98% 方差) + KMeans ------------------------
pca = PCA(n_components=0.98)  # 0 < n_components < 1: keep 98% of the variance
kmeans = KMeans(n_clusters=10, random_state=42)
t0 = time()
estimator = make_pipeline(StandardScaler(), pca, kmeans).fit(data)  # 增加 pca 预处理
fit_time = time() - t0
print("训练时间:", fit_time)
print(estimator)
print(estimator[-1].inertia_)
result3 = {"fit-time": fit_time, "inertia": estimator[-1].inertia_}

# 可以选择不同的 KMeans 参数对 digits 进行聚类, 比较实验结果,
# 并选择一个针对此问题的最好模型。
最新发布
05-25

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值