# 2.20190824
#
# 最新推荐文章于 2021-05-29 00:52:05 发布
# 依概率收敛
# 最新推荐文章于 2021-05-29 00:52:05 发布
# 阅读量216
# 点赞数
# 本文链接：https://blog.csdn.net/weixin_41341999/article/details/100058770
# 版权
'''
0.attention机制 https://www.jianshu.com/p/0f0c674837e3
1.双样本T检验 https://www.jianshu.com/p/7555c4311a57
1.迁移学习demo：
2.capsule胶囊网络 https://www.jianshu.com/p/271d5f1f0e25
3.wide and Deep 
4.使用部分神经网络 保留模型特征

5.多标签分类 https://www.jianshu.com/p/76f9e4c0d0a2  多标签CNN分类

6.catboost树模型 https://www.jianshu.com/p/49ab87122562
7.知识蒸馏 https://www.jianshu.com/p/5c38872cdc0f

8.双向lstm 与 简单的attention策略 与maxpooling padding策略 与自定义层策略 
    使用lambda层让你的keras更加灵活
    https://mp.weixin.qq.com/s/FvVr44RVsbKotITH29u1CQ
    keras attention https://github.com/philipperemy/keras-attention-mechanism
    keras attention https://www.jianshu.com/p/31c0acf94e0e    
    keras attention https://blog.csdn.net/u010041824/article/details/78855435
    
9.LightGBM + LR + FM https://mp.weixin.qq.com/s/Qpaw8TsnX46hFZ0htYq7qA

10.PMML跨平台的机器学习模型上线 https://mp.weixin.qq.com/s/hOjriQe__z4_dL0l6bRyCA

11.gan的发展 https://mp.weixin.qq.com/s/b4Ep6NmWP92bKBGDyJs4rA

12. GBDT的模型解释 https://yq.aliyun.com/articles/594969?utm_content=m_50745 GBDT模型解释

13. 阿里巴巴的神经网络特征重要性解释  https://cloud.tencent.com/developer/news/330520 神经网络判断特征重要性

14. tensorflow实现gan

15. Linux命令行

16.谱聚类 https://mp.weixin.qq.com/s/29XFWMAzmyvZwLOs0KJTIw
'''
from keras.datasets import mnist
from keras.layers import Input, Dense, Reshape, Flatten, Dropout, multiply, GaussianNoise
from keras.layers import BatchNormalization, Activation, Embedding, ZeroPadding2D
from keras.layers.advanced_activations import LeakyReLU
from keras.layers.convolutional import UpSampling2D, Conv2D
from keras.models import Sequential, Model
from keras.optimizers import Adam
from keras import losses
from keras.utils import to_categorical
from keras.layers import LSTM, Embedding
import keras.backend as K
from keras import initializers
import matplotlib.pyplot as plt
import tensorflow as tf

# Placeholder section for the capsule-network topic. The guarded body holds
# only bare string literals (notes), so running the script executes nothing
# here.
if __name__ == '__main__':
        # Note 1 (translated): capsule network, see the linked article.
        '''capsule胶囊网络 https://www.jianshu.com/p/271d5f1f0e25'''
        # Note 2 (translated): Capsule is a network architecture proposed by
        # Hinton in 2017. Its main characteristic is "vector in, vector out",
        # whereas an ordinary neuron is "vector in, scalar out"; the vector a
        # capsule outputs expresses richer features than a neuron's scalar.
        '''Capsule是深度学习之父hinton在2017年提出来的一个较为轰动的网络结构。
            capsule这个结构主要的特点是：Vector in Vector out——向量进，向量出，
            而普通的神经元(Neuron)是Vector in Scalar out——向量进，标量出。
            capsule输出的向量比Neuron输出的标量表达出更丰富的特征。'''

if __name__ == '__main__':
    # GBDT feature generation demo: fit a gradient-boosted classifier on
    # random data, use the index of the leaf each sample lands in (per tree)
    # as a categorical feature, train a logistic regression on the one-hot
    # encoded leaves, and define a small Keras network over the same leaf ids.
    # https://scikit-learn.org/stable/auto_examples/ensemble/plot_feature_transformation.html#example-ensemble-plot-feature-transformation-py
    # https://blog.csdn.net/shine19930820/article/details/71713680#generate-features-for-ffm
    import pandas as pd
    import numpy as np
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import accuracy_score
    from sklearn.preprocessing import OneHotEncoder

    n_classes = 3
    n_trees = 100

    # Synthetic data: 20000 samples, 10 uniform features, random class labels.
    x = np.random.random((20000, 10))
    y = np.random.randint(n_classes, size=(20000, 1))
    labels = y.ravel()

    gbdt = GradientBoostingClassifier(max_depth=3, min_samples_leaf=10,
                                      n_estimators=n_trees, learning_rate=0.2,
                                      random_state=2)
    gbdt.fit(x, labels)

    # apply() yields one leaf index per (sample, boosting stage, class);
    # only the trees of class 0 are kept, giving an (n_samples, n_trees)
    # matrix. Computed once and reused below instead of re-applying the
    # model for every consumer.
    leaves = gbdt.apply(x)[:, :, 0]
    print(leaves)
    print(leaves.shape)

    # Leaf ids are categorical, not ordinal, so they are one-hot encoded
    # before being handed to the linear model (the encoder used to be fitted
    # but its output was never used).
    gbdt_enc = OneHotEncoder()
    leaf_onehot = gbdt_enc.fit_transform(leaves)

    lr = LogisticRegression()
    lr.fit(leaf_onehot, labels)
    lr_predict = lr.predict(leaf_onehot)
    # NOTE: this is accuracy on the training data itself, not a held-out score.
    print(accuracy_score(labels, lr_predict))

    # Keras model over the raw leaf ids: one integer per tree is embedded,
    # then passed through an LSTM and dense layers.
    inputs = Input(shape=(n_trees,))
    keras_model = Embedding(input_dim=100, output_dim=200)(inputs)
    keras_model = LSTM(48, activation='relu')(keras_model)
    keras_model = Dense(48, activation='relu')(keras_model)
    # categorical_crossentropy expects a probability distribution over the
    # classes, so the output layer uses softmax rather than relu.
    out_model = Dense(n_classes, activation='softmax')(keras_model)
    model = Model(inputs=inputs, outputs=out_model)
    model.summary()
    model.compile(loss='categorical_crossentropy', optimizer=Adam(0.00001), metrics=['accuracy'])
    keras_y = to_categorical(y)
    # Training is intentionally left disabled in this demo:
    # model.fit(leaves, keras_y, batch_size=12, epochs=20)

if __name__ == '__main__':
    import warnings
    warnings.filterwarnings("ignore")
    # Topic of this section (translated): bidirectional LSTM + simple
    # attention strategy + custom strategy + (max-pooling / padding).
    import pandas as pd
    import numpy as np
    dicts = dict(zip(np.arange(5),[[] for x in range(5)]))
    # Synthetic data: 10000 samples, 20 uniform features, 5 random classes.
    x = np.random.random((10000,20))
    y = np.random.randint(5,size=(10000,1))

    import lightgbm as lgb
    x_train = pd.DataFrame(x)
    y_train = pd.DataFrame(y.ravel())
    lgb_train = lgb.Dataset(x_train, y_train)


    def grid_searchs(modeol_names, lgb_train):
        """Cross-validate a LightGBM booster to pick the number of boosting rounds.

        Args:
            modeol_names: model selector; only ``'LGB'`` is handled, any other
                value makes the function a no-op.
            lgb_train: ``lightgbm.Dataset`` holding the training data and labels.

        Returns:
            The result dict of ``lightgbm.cv`` for ``'LGB'``, otherwise ``None``.

        ``lightgbm.cv`` evaluates ONE parameter setting; it does not expand
        lists of candidates. The previous parameter dict mixed candidate lists
        (max_depth 4..18, num_leaves 10..90, min_child_samples 15..19,
        min_child_weight 0.0005..0.003, learning_rate 0.0..0.7) into the
        booster parameters. Those entries are left at LightGBM's defaults here;
        searching over them needs a separate driver such as ``GridSearchCV``.
        """
        import warnings
        warnings.filterwarnings("ignore")
        if modeol_names != 'LGB':
            return None

        import numpy as np
        import lightgbm as lgb

        # Choose objective/metric from the labels: AUC (the metric originally
        # requested) only applies to a binary target.
        # NOTE(review): assumes the labels were passed to the Dataset
        # constructor, so get_label() works before the Dataset is built.
        n_classes = int(np.unique(np.asarray(lgb_train.get_label())).size)
        if n_classes <= 2:
            objective = {'objective': 'binary'}
            metric = 'auc'
        else:
            objective = {'objective': 'multiclass', 'num_class': n_classes}
            metric = 'multi_logloss'

        lgb_params = {
            'boosting_type': 'gbdt',
            'feature_fraction': 0.5,
            'bagging_fraction': 0.8,
            'bagging_freq': 12,
        }
        lgb_params.update(objective)

        # Keyword arguments follow the LightGBM API this file was written
        # against (early stopping / logging passed directly to cv()).
        cv_results = lgb.cv(lgb_params, lgb_train, num_boost_round=1000, nfold=5,
                            stratified=False, shuffle=True, metrics=metric,
                            early_stopping_rounds=50, verbose_eval=50,
                            show_stdv=True, seed=0)

        # Look the mean-score series up by suffix instead of hard-coding the
        # metric name in the key.
        mean_key = next(key for key in cv_results if key.endswith('-mean'))
        print('best n_estimators:', len(cv_results[mean_key]))
        print('best cv score:', cv_results[mean_key][-1])
        print(cv_results)
        return cv_results


    grid_searchs('LGB',lgb_train)
'''最优化LGB或GBDT构造特征；输入神经网络中做FM层；再通过Embedding层；再通过自定义的padding与pooling；再通过双向LSTM；再通过CNN卷积，再输出结果'''
'''DeepFM https://blog.csdn.net/songbinxu/article/details/80151814 的原理及实现'''






'''XGB的参数搜索与LGB的参数搜索'''
if __name__ == '__main__':
    # Imported here so this section does not rely on an earlier section
    # having bound ``np`` at module level.
    import numpy as np
    import sklearn
    import xgboost as xgb
    import lightgbm as lgb
    from sklearn.model_selection import GridSearchCV

    def grid_searchs(modeol_names, x, y):
        """Run an exhaustive 5-fold grid search for an XGBoost or LightGBM classifier.

        Args:
            modeol_names: ``'XGB'`` or ``'LGB'``; any other value is a no-op.
            x: feature matrix passed to ``GridSearchCV.fit``.
            y: 1-D label vector passed to ``GridSearchCV.fit``.

        Returns:
            The fitted ``GridSearchCV`` object, or ``None`` when
            ``modeol_names`` is not recognised. ``cv_results_``,
            ``best_params_`` and ``best_score_`` are also printed.
        """
        import warnings
        warnings.filterwarnings("ignore")
        warnings.filterwarnings(module='sklearn*', action='ignore', category=DeprecationWarning)
        from sklearn.model_selection import GridSearchCV

        gsearch1 = None
        if modeol_names == 'XGB':
            import xgboost as xgb
            param_test1 = {
                'max_depth': list(range(1, 10, 2)),
                'min_child_weight': list(range(1, 10, 2)),
                'gamma': [i / 10.0 for i in range(0, 8)],
                'subsample': [i / 10.0 for i in range(6, 10)],
                'colsample_bytree': [i / 10.0 for i in range(6, 10)],
                'reg_alpha': [0, 0.001, 0.005, 0.01, 0.05],
                # Starts at 0.1: a learning rate of 0.0 adds trees that
                # contribute nothing, so that candidate only wasted fits.
                'learning_rate': [i / 10.0 for i in range(1, 8)],
            }
            # Plain 'f1' is the binary F1 and raises on multiclass targets;
            # fall back to the macro average when there are more than 2 classes.
            scoring = 'f1' if np.unique(y).size <= 2 else 'f1_macro'
            # The ``iid`` argument is no longer passed: it was deprecated and
            # then removed from GridSearchCV in newer scikit-learn releases.
            gsearch1 = GridSearchCV(estimator=xgb.XGBClassifier(learning_rate=0.1, n_estimators=1000, max_depth=5,
                                                                min_child_weight=1, gamma=0, subsample=0.8,
                                                                colsample_bytree=0.8, nthread=4, scale_pos_weight=1,
                                                                seed=27),
                                    param_grid=param_test1, scoring=scoring, n_jobs=-1, cv=5, verbose=1)
            gsearch1.fit(x, y)
            print(gsearch1.cv_results_ , gsearch1.best_params_, gsearch1.best_score_)
        elif modeol_names == 'LGB':
            import lightgbm as lgb
            # (translated) Build LightGBM's scikit-learn style model; the
            # learning rate and number of estimators are meant to come from a
            # preceding selection step, so only tree-shape and regularisation
            # parameters are searched here.
            params_test1 = {
                'max_depth': list(range(4, 20, 2)),  # raise accuracy
                'num_leaves': list(range(10, 100, 10)),  # raise accuracy
                'min_child_samples': list(range(15, 20, 1)),  # reduce overfitting
                'min_child_weight': [0.0005, 0.001, 0.0015, 0.002, 0.0025, 0.003],  # reduce overfitting
            }
            gsearch1 = GridSearchCV(estimator=lgb.LGBMClassifier(),
                                    param_grid=params_test1,
                                    cv=5, verbose=-1, n_jobs=-1)
            gsearch1.fit(x, y)
            print(gsearch1.cv_results_ , gsearch1.best_params_, gsearch1.best_score_)
        return gsearch1


    # Small synthetic 3-class problem to exercise the LightGBM search.
    x = np.random.random((100,20))
    y = np.random.randint(3,size=(100,1))
    y = y.ravel()
    #y = to_categorical(y)
    grid_searchs('LGB',x,y)