# lightgbm xgboost

# encoding=utf-8
# import tensorflow as tf
# from tensorflow.contrib import learn
# from lstm import generate_data, lstm_model, load_csvdata
import time
import matplotlib.pyplot as plt
import pylab as pl
import xgboost as xgb
import re
print time.strftime("%I:%M:%S")
from operator import itemgetter
from sklearn import linear_model
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier
from sklearn.cross_validation import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib
# cross_validation.cross_val_score
from sklearn.svm import SVC
from sklearn import preprocessing
from sklearn.datasets import load_svmlight_file
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.metrics import auc, roc_curve, roc_auc_score
import lightgbm as lgb
import operator
import math
import argparse
from collections import Counter
# Module-level configuration constants.
# NOTE(review): none of these names is read anywhere in the visible part of the
# file; several (LOG_DIR, TIMESTEPS, RNN_LAYERS, ...) look like leftovers from
# the commented-out TensorFlow/LSTM imports above -- confirm before removing.

# Absolute path to a LightGBM executable on the author's machine.
exece = "/Users/wangsanpeng/LightGBM/lightgbm"
LOG_DIR = './ops_logs/lstm_weather'
feature_beg = 1
TIMESTEPS = 451
RNN_LAYERS = [{'num_units': 5},{'num_units': 5}]
DENSE_LAYERS = [451]
TRAINING_STEPS = 100000
BATCH_SIZE = 30
# The file uses Python 2 print statements, so `/` on two ints is integer
# division here and PRINT_STEPS is the int 1000.
PRINT_STEPS = TRAINING_STEPS / 100


remove_begin = 32
remove_end = 36
# Command-line interface. The parser is built and parsed at import time, so
# importing this module consumes sys.argv.
parser = argparse.ArgumentParser()
# Free-form options with no type or default (they are None when omitted).
parser.add_argument('--filename')
parser.add_argument('--output')
parser.add_argument('--feat_flag')
parser.add_argument('--model')

# parser = argparse.ArgumentParser(description="skeanrn Classification")

# Input data sets (help text: training data set / test data set).
parser.add_argument('--train_file', type=str,
                    default="", help='训练数据集')
parser.add_argument('--test_file', type=str,
                    default="", help='测试数据集')
# Model hyper-parameters.
parser.add_argument('--n_estimators', type=int,
                    default=200, help='n_estimators')
parser.add_argument('--learning_rate', type=float,
                    default=0.1, help='learning_rate')
parser.add_argument('--max_depth', type=int,
                    default=5, help='max_depth')
parser.add_argument('--subsample', type=float,
                    default=1.0, help='subsample')
parser.add_argument('--max_features', type=str,
                    default=None, help='max_features: None,auto,sqrt,log2')
parser.add_argument('--threshold', type=float,
                    default=0.5, help='threshold for classifier')
parser.add_argument('--feature', type=str,
                    default=None, help='file to save feature importances')
parser.add_argument('--city', type=str,
                    default="", help='city file to predict')
# Model persistence.
parser.add_argument('--load_model', type=int,
                    default=0, help='load model skip trainning if not zero')
parser.add_argument('--model_file', type=str,
                    default="", help='save model_file if not null. '
                                     'require not null if load_model.')
parser.add_argument('--cost', type=float,
                    default=1, help='weight of samples')
# Classifier selection (help text: "Classification classifier").
parser.add_argument('--clf', type=str,
                    default="gbdt", help='Classification 分类器')
# ROC reporting options. Note that the --roc default is written as the
# string "0" although the option type is int.
parser.add_argument('--roc', type=int,
                    default="0", help='compute roc table if not zero')
parser.add_argument('--roc_file', type=str,
                    default="roc", help='roc_file prefix file name')
parser.add_argument('--roc_percent', type=float,
                    default=-1, help='roc_percent to predict')
parser.add_argument('--roc_thres', type=str,
                    default="0", help='roc_thres to predict, a num or list')
# Validation / data-splitting options.
parser.add_argument('--cv', type=int,
                    default=1, help='Cross_Validation')
parser.add_argument('--kfold', type=int, default=0,
                    help='use kfold if not zero')
parser.add_argument('--size', type=float, default=0.7,
                    help='trian set size, float or int')
# Input format and preprocessing (help text of --scale: "standardise";
# help text of --aggr: "ensemble / fused model").
parser.add_argument('--issvmformat', type=int,
                    default=0, help='issvmformat')
parser.add_argument('--startfield', type=int, default=0,
                    help='which field to start anasy if not svmformat')
parser.add_argument('--scale', type=int, default=0,
                    help='scale the data,标准化')
parser.add_argument('--aggr', type=int, default=0,
                    help='融合模型')

# Parse once and copy a subset of the options into module-level globals.
# Options not copied here (e.g. train_file, test_file, max_features, city,
# load_model, clf, roc_file, size, scale) remain reachable only through `args`.
args = parser.parse_args()
n_estimators = args.n_estimators
subsample = args.subsample
learning_rate = args.learning_rate
max_depth = args.max_depth
cost = args.cost
threshold = args.threshold
feature = args.feature
model_file = args.model_file
roc = args.roc
cv = args.cv
issvmformat = args.issvmformat
kfold = args.kfold
aggr = args.aggr
file_name = args.filename
model_name = args.model
output = args.output
# feat_flag = int(args.feat_flag)

def is_chinese(uchar):
    """Return True if the unicode character *uchar* is a Chinese ideograph.

    A character counts as Chinese when its code point lies in the inclusive
    range U+4E00..U+9FA5.

    The previous implementation spelled the bounds with a forward slash
    (u'/u4e00'), which are ordinary six-character strings rather than
    unicode escapes, so every ideograph was rejected.

    :param uchar: a single unicode character.
    :return: True when *uchar* is within U+4E00..U+9FA5, otherwise False.
    """
    return u'\u4e00' <= uchar <= u'\u9fa5'
# Substrings looked for in the lower-cased device-model column (column 6) by
# get_dic() and getdata(); a value containing one of them is replaced by it.
need_device = ['h60']
def get_dic(sourceFile,dic_loc,lable_index):
    """Build the category -> one-hot position table for the given CSV columns.

    The header line of *sourceFile* is skipped and only data rows 1..20085 are
    scanned. For every column index in *dic_loc* the raw cell is normalised
    (column 5: phone brand, 6: device model, 4: operator, 2: lat/lon bucket
    built from columns 2 and 3, 7: OS version digits; any other column is used
    verbatim) and each previously unseen value is registered under the key
    ``"<column>_<value>"`` with the next free position for that column.
    Finally ``"<column>_others"`` is set to the number of categories
    registered for that column.

    Brand values outside the brand whitelist and device values that contain
    none of the ``need_device`` substrings are not registered.

    Several diagnostic summaries are printed along the way.

    Changes compared with the previous version: the file is closed even when
    a row raises, reading stops at the row cap instead of scanning to EOF,
    ``has_key`` / integer ``/`` were replaced by ``in`` / ``//`` (same results
    under Python 2, also valid under Python 3), prints use a form that yields
    the same text under both, and never-read locals were removed.

    :param sourceFile: path of the comma-separated feature file.
    :param dic_loc: list of column indexes to encode.
    :param lable_index: labels indexed by row number (row 0 is the header);
        a falsy label puts the row in the "wrong" diagnostics, anything else
        in the "right" diagnostics.
    :return: dict mapping ``"<column>_<value>"`` to its position, plus one
        ``"<column>_others"`` entry per column holding the category count.
    :raises IndexError: if a row is shorter than a requested column or
        *lable_index* is shorter than the rows inspected.
    """
    # Rows after this row number are ignored.
    max_row = 20085

    # Brand aliases folded into one canonical brand name.
    map_shouji_reverse = {'mi':'xiaomi','lemobile':'letv','lephone':'lenovo','r7plusm':'oppo','yulong':'coolpad','qiku':'360','hm':'redmi'}
    # Brands that get their own one-hot position (only the keys are used).
    map_need_shouji = {'huawei': 3074,'vivo': 901,'360': 191,'coolpad': 159, 'gionee': 126, 'lenovo': 112}
    # Operator aliases folded into one canonical operator name.
    map_yunying_reverse = {'中国电信':'chinatelecom','中国移动':'cmcc','chinamobile':'cmcc','中国联通':'chinaunicom','chnunicome':'chinaunicom','unicom':'chinaunicom','chnunicom':'chinaunicom'}
    # Operators considered "known"; others are only collected for diagnostics.
    map_need_yunying = {'': True,'cmcc': True,'chinatelecom': True,'chinaunicom': True}
    # Lat/lon buckets merged into another bucket.
    map_latlon_reverse = {'10_4':'10_3','9_4':'8_4','13_4':'12_4','10_1':'10_2','9_2':'10_2'}

    map_dic = {}
    # Number of categories registered so far for each slot of dic_loc.
    category_counts = [0] * len(dic_loc)

    # Diagnostic collections (printed below, not returned).
    shouji = []
    shouji_right = []
    shouji_wrong = []
    device = []
    yunyingshang_wrong = []
    yunyingshang_right = []

    with open(sourceFile, "r") as f:
        for index_flag, data in enumerate(f):
            if not index_flag:
                # Header line.
                continue
            if index_flag > max_row:
                break
            datablock = data.strip('\n').split(',')
            for slot, col in enumerate(dic_loc):
                index_value_bl = datablock[col]
                if col == 5:
                    # Brand: keep the text before the first space, '-' or '_'.
                    index_value_bl = datablock[col].split(' ')[0].split('-')[0].split('_')[0].lower()
                    index_value_bl = map_shouji_reverse.get(index_value_bl, index_value_bl)
                    shouji.append(index_value_bl)
                    if not lable_index[index_flag]:
                        shouji_wrong.append(index_value_bl)
                    else:
                        shouji_right.append(index_value_bl)
                    if index_value_bl not in map_need_shouji:
                        continue
                    # NOTE(review): whitelisted brands are appended a second
                    # time, so they are double-counted in the printed
                    # Counter(shouji). Kept as-is; confirm whether intended.
                    shouji.append(index_value_bl)
                if col == 6:
                    index_value_bl = datablock[col].lower()
                    device.append(index_value_bl)
                    # Collapse the model name to the first matching substring.
                    matched = next((di for di in need_device if di in index_value_bl), None)
                    if matched is None:
                        continue
                    index_value_bl = matched
                if col == 4:
                    index_value_bl = datablock[col].replace(' ','').replace('-','').replace('_','').lower()
                    index_value_bl = map_yunying_reverse.get(index_value_bl, index_value_bl)
                    if index_value_bl not in map_need_yunying:
                        # Unknown operators are still registered below; they
                        # are only recorded here for the diagnostics.
                        if not lable_index[index_flag]:
                            yunyingshang_wrong.append(index_value_bl)
                        else:
                            yunyingshang_right.append(index_value_bl)
                if col == 2:
                    if datablock[col] == '':
                        index_value_bl = ''
                    elif datablock[col] == '0.0':
                        index_value_bl = '0.0'
                    else:
                        # Bucket = integer part of the coordinate divided by
                        # 10, for this column and the next one.
                        index_value_bl = str(int(float(datablock[col])) // 10) + '_' + str(int(float(datablock[col + 1])) // 10)
                    index_value_bl = map_latlon_reverse.get(index_value_bl, index_value_bl)
                if col == 7:
                    index_value_bl = datablock[col]
                    if index_value_bl in ('G-UI-5', 'G-UI-5.0.2'):
                        index_value_bl = '7'
                    # Concatenate up to three dot-separated parts, padding
                    # with '0' when fewer than three are present.
                    count_point = index_value_bl.count(".")
                    point_block = index_value_bl.split('.')
                    if not count_point:
                        index_value_bl = index_value_bl + '00'
                    elif count_point == 1:
                        index_value_bl = point_block[0] + point_block[1] + '0'
                    else:
                        index_value_bl = point_block[0] + point_block[1] + point_block[2]
                    if index_value_bl == '5120' or index_value_bl == '442ro':
                        print("%s %s" % (index_value_bl, datablock[col]))
                index_key = str(col) + '_' + index_value_bl
                if index_key not in map_dic:
                    map_dic[index_key] = category_counts[slot]
                    category_counts[slot] += 1

    # Diagnostics.
    print(Counter(shouji))
    map_shouji_wrong = Counter(shouji_wrong)
    map_shouji_right = Counter(shouji_right)
    print(map_shouji_wrong)
    print(map_shouji_right)
    print("%d %d" % (len(yunyingshang_right), len(yunyingshang_wrong)))
    print(yunyingshang_right)
    print(yunyingshang_wrong)
    print(Counter(device).most_common(20))
    for brand in map_shouji_wrong:
        print("%s %s %s" % (brand, map_shouji_wrong[brand], map_shouji_right[brand]))

    # "<column>_others" stores the number of categories of the column.
    for slot, col in enumerate(dic_loc):
        map_dic[str(col) + '_' + 'others'] = category_counts[slot]
    print(map_dic)
    return map_dic
def get_one_hot(all_length,index):
    """Return a flat one-hot list of floats.

    The result has *all_length* entries; the entry whose position equals
    *index* is 1.0 and every other entry is 0.0. When *index* matches no
    position (for example -1) the result is all zeros.
    """
    return [1.0 if position == index else 0.0 for position in range(all_length)]
def get_one_hot_dl(all_length,index):
    """Return a one-hot encoding in which every entry is a one-element list.

    The result has *all_length* entries; the entry whose position equals
    *index* is ``[1.0]`` and every other entry is ``[0.0]``. When *index*
    matches no position the result contains only ``[0.0]`` entries.
    """
    return [[1.0] if position == index else [0.0] for position in range(all_length)]
def getdatalable():
    """Read the binary labels from './task1_training_set/train_val_tag.csv'.

    Every line of the file (including the first one) yields one label: 0.0
    when the second comma-separated field is exactly '0', otherwise 1.0.
    Labels are returned in file order, so the position in the result equals
    the line number in the file.

    The file is now opened with a context manager so the handle is released
    even if a malformed line raises.

    :return: list of floats (0.0 / 1.0), one per line.
    :raises IndexError: if a line has fewer than two comma-separated fields.
    """
    labels = []
    with open('./task1_training_set/train_val_tag.csv', "r") as f:
        for data in f:
            datablock = data.strip('\n').split(',')
            labels.append(0.0 if datablock[1] == '0' else 1.0)
    return labels
# def get_use_label():
    
def get_use_label_not0():
    """Select features whose train and test values are close to each other.

    Reads './train_test_not0radio.csv': line 0 holds the feature names, line 1
    the per-feature values for the training set and line 2 those for the test
    set (further lines are ignored). A feature is selected when the smaller of
    its two values is non-zero and ``abs(test - train) / min(test, train)`` is
    below 0.8.

    The three list lengths are printed as a sanity check. The file is now
    opened with a context manager, and the print uses a form that produces
    the same text under Python 2 and Python 3.

    :return: dict mapping each selected feature name to True.
    :raises IndexError: if line 1 or line 2 has fewer fields than the header.
    :raises ValueError: if a field on line 1 or 2 is not a number.
    """
    features = []
    train_ave = []
    test_ave = []
    with open('./train_test_not0radio.csv', "r") as f:
        for index_flag, data in enumerate(f):
            datablock = data.strip('\n').split(',')
            if not index_flag:
                features = datablock
            elif index_flag == 1:
                train_ave.extend(float(v) for v in datablock)
            elif index_flag == 2:
                test_ave.extend(float(v) for v in datablock)
    print("%d %d %d" % (len(features), len(train_ave), len(test_ave)))

    res = {}
    for ii, name in enumerate(features):
        index_test = test_ave[ii]
        index_train = train_ave[ii]
        little = min(index_test, index_train)
        if not little:
            # Avoid dividing by zero; such features are never selected.
            continue
        if abs(index_test - index_train) / little < 0.8:
            res[name] = True
    return res
def get_ip_same(file):
    """Collect the first-column values of a CSV file into a lookup dict.

    The first line (header) is skipped; for every following line the first
    comma-separated field becomes a key mapped to 1.

    The previous version called ``f.close()`` after the ``return`` statement,
    so the handle was never closed; the file is now managed by ``with``.

    :param file: path of the CSV file to read.
    :return: dict ``{first_field: 1}`` for every non-header line.
    """
    map_res = {}
    with open(file, "r") as f:
        for index_flag, data in enumerate(f):
            if not index_flag:
                continue
            map_res[data.strip('\n').split(',')[0]] = 1
    return map_res
def getdata(sourceFile,class_res=None,vali_flag=None):
    """Load the feature CSV and turn it into model inputs.

    The first line of *sourceFile* is the header (feature names). For every
    following row a feature vector is built from

    * the one-hot encodings of the categorical columns listed in ``dic_loc``
      (currently an empty list, so nothing is encoded), and
    * the numeric columns from index 8 onwards whose header name is in the
      selected feature set; cells that cannot be parsed as float become -1.0.

    Labels come from ``getdatalable()`` and are looked up by row number.

    Modes:

    * ``class_res`` truthy: rows 1..40183 are training rows (label from the
      tag file); later rows are test rows whose "label" is the raw first
      column. Returns ``(train_x, train_y, test_x, test_y)``.
    * ``class_res`` falsy: every row is a training row, except that rows
      below 20086 are dropped when ``vali_flag`` is truthy. Returns
      ``(train_x, train_y, ip_same)`` where ``ip_same[i]`` is 1 when the
      row's first column appears in './train_ip_mobel_same_uid.csv', else 0.

    Changes compared with the previous version: the selected-feature table is
    no longer a literal broken across physical lines (which did not parse),
    the source file is closed even when a row raises, the bare ``except`` is
    narrowed to ``ValueError``, accumulators that were never returned or
    written were removed, and ``has_key`` / integer ``/`` were replaced by
    ``in`` / ``//`` (same results under Python 2).

    :param sourceFile: path of the comma-separated feature file.
    :param class_res: selects the train/test split mode (see above).
    :param vali_flag: when truthy (and ``class_res`` is falsy) skip rows
        below 20086.
    :return: a 4-tuple or a 3-tuple depending on ``class_res`` (see above).
    :raises IndexError: if the tag file has fewer labels than rows used, or a
        row has more columns than the header.
    """
    # Row-number boundaries; row 0 is the header.
    last_train_row = 40183
    first_row_with_vali_flag = 20086

    train_x_res = []
    train_y_res = []
    test_x = []
    test_y = []
    ip_same = []

    # Categorical columns to one-hot encode. It is empty, so the per-column
    # loop below does not run.
    dic_loc = []
    label_block = getdatalable()
    map_dic = get_dic(sourceFile,dic_loc,label_block)

    # Numeric suffixes of the 'app_cat1_<n>' / 'app_cat2_<n>' header names
    # that are kept as features.
    app_cat1_ids = [
        1, 2, 3, 4, 5, 7, 8, 9, 11, 13, 14, 15, 16, 18, 21, 23, 24,
        26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 38, 40, 41, 42, 43, 44,
    ]
    app_cat2_ids = [
        1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17,
        20, 21, 22, 23, 26, 27, 28, 29, 30, 33, 34, 35, 36, 37, 39, 40,
        42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
        58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72,
        74, 75, 76, 77, 78, 79, 80, 81, 83,
        85, 86, 87, 88, 89, 90, 91, 92, 93, 94,
        96, 97, 98, 99, 100, 101,
        103, 104, 105, 106, 107, 108, 109,
        111, 112, 113, 114, 115, 116, 117, 118, 119, 120,
        122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135,
        137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148,
        151,
        153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165,
        166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178,
        179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190,
        192, 193, 194, 196,
        198, 199, 200, 201, 202, 203, 204, 205,
        207, 208, 209, 210, 211, 212, 213,
    ]
    features_use = set(
        ['app_cat1_%d' % n for n in app_cat1_ids]
        + ['app_cat2_%d' % n for n in app_cat2_ids]
    )

    map_ip_same = get_ip_same('./train_ip_mobel_same_uid.csv')
    features_block = []

    with open(sourceFile, "r") as f:
        for index_flag, data in enumerate(f):
            datablock = data.strip('\n').split(',')
            if not index_flag:
                features_block = datablock
                continue

            # Label handling and row filtering.
            if class_res:
                if index_flag <= last_train_row:
                    train_y_res.append(label_block[index_flag])
                else:
                    test_y.append(datablock[0])
            else:
                if vali_flag and index_flag < first_row_with_vali_flag:
                    continue
                train_y_res.append(label_block[index_flag])

            index_value = []

            # One-hot encode the categorical columns.
            # NOTE(review): map_shouji_reverse, map_yunying_reverse and
            # map_latlon_reverse are locals of get_dic in the visible code.
            # Unless they are also defined at module level elsewhere, listing
            # column 5, 4 or 2 in dic_loc raises NameError here.
            for di in dic_loc:
                index_value_db = datablock[di]
                if di == 5:
                    index_value_db = datablock[di].split(' ')[0].split('-')[0].split('_')[0].lower()
                    if index_value_db in map_shouji_reverse:
                        index_value_db = map_shouji_reverse[index_value_db]
                if di == 4:
                    index_value_db = datablock[di].replace(' ','').replace('-','').replace('_','').lower()
                    if index_value_db in map_yunying_reverse:
                        index_value_db = map_yunying_reverse[index_value_db]
                if di == 2:
                    if datablock[di] == '':
                        index_value_db = ''
                    elif datablock[di] == '0.0':
                        index_value_db = '0.0'
                    else:
                        index_value_db = str(int(float(datablock[di])) // 10) + '_' + str(int(float(datablock[di + 1])) // 10)
                    if index_value_db in map_latlon_reverse:
                        index_value_db = map_latlon_reverse[index_value_db]
                if di == 6:
                    index_value_db = datablock[di].lower()
                    for dii in need_device:
                        if dii in index_value_db:
                            index_value_db = dii
                            break
                if di == 7:
                    # OS version: build a three-digit number from the
                    # dot-separated parts, then bucket it by major version.
                    index_value_bl = datablock[di]
                    if index_value_bl in ('G-UI-5', 'G-UI-5.0.2'):
                        index_value_bl = '7'
                    count_point = index_value_bl.count(".")
                    point_block = index_value_bl.split('.')
                    if not count_point:
                        index_value_bl = index_value_bl + '00'
                    elif count_point == 1:
                        index_value_bl = point_block[0] + point_block[1] + '0'
                    else:
                        index_value_bl = point_block[0] + point_block[1] + point_block[2]
                    if index_value_bl == '5120':
                        index_value_bl = '512'
                    if index_value_bl == '442ro':
                        index_value_bl = '442'
                    os_version = int(index_value_bl)
                    if os_version < 500:
                        os_bucket = 0.0
                    elif os_version < 600:
                        os_bucket = 1.0
                    elif os_version < 700:
                        os_bucket = 2.0
                    else:
                        os_bucket = 3.0
                    # NOTE(review): there are four buckets but the encoding is
                    # three wide, so versions >= 700 encode as all zeros.
                    # Behaviour kept; confirm whether intended.
                    index_value.extend(get_one_hot(3, os_bucket))
                    continue

                index_key = str(di) + '_' + index_value_db
                if index_key in map_dic:
                    index_loc = map_dic[index_key]
                else:
                    # Unknown categories match no position -> all zeros.
                    index_loc = -1
                index_value.extend(get_one_hot(map_dic[str(di) + '_' + 'others'], index_loc))

            # Numeric feature columns (index 8 onwards), filtered by name.
            for col, raw in enumerate(datablock[8:], 8):
                if features_block[col] not in features_use:
                    continue
                try:
                    index_value_1 = float(raw)
                except ValueError:
                    # Empty / non-numeric cells are encoded as -1.0.
                    index_value_1 = -1.0
                index_value.append(index_value_1)

            if class_res and (index_flag > last_train_row):
                test_x.append(index_value)
            else:
                ip_same.append(1 if datablock[0] in map_ip_same else 0)
                train_x_res.append(index_value)

    if class_res:
        return train_x_res,train_y_res,test_x,test_y
    else:
        return train_x_res,train_y_res,ip_same
def get_data_convert_string(file,file2,file3):
    """Paste two text files together line by line and write the result.

    Line ``i`` of ``file`` (newline removed) is immediately followed by line
    ``i`` of ``file2`` (newline kept); no separator is inserted between them.
    Extra lines at the end of ``file2`` are ignored.

    :param file: path of the left-hand input file.
    :param file2: path of the right-hand input file; it needs at least as
        many lines as ``file``.
    :param file3: path of the output file (overwritten).
    :raises IndexError: if ``file`` has more lines than ``file2``; the
        output file is not touched in that case.
    """
    with open(file2, "r") as right:
        suffixes = right.readlines()

    pieces = []
    with open(file, "r") as left:
        for row, line in enumerate(left):
            # Index instead of zip() so a length mismatch still fails loudly.
            pieces.append(line.strip('\n') + suffixes[row])

    with open(file3, 'w') as out:
        out.write(''.join(pieces))
def get_data_convert(file):
    """Load a tab-separated numeric file into feature rows and labels.

    Every line is ``label<TAB>v1<TAB>v2...``; each field is parsed with
    ``float()``.

    :param file: path of the tab-separated file.
    :return: ``(train_x_res, train_y_res)`` - the list of feature rows and
        the list of labels, in file order.
    :raises ValueError: if any field (including a blank line) is not a
        valid float.
    """
    train_x_res = []
    train_y_res = []
    # The with-block closes the file even when a field fails to parse.
    with open(file, "r") as f:
        for line in f:
            fields = line.strip('\n').split('\t')
            train_y_res.append(float(fields[0]))
            train_x_res.append([float(value) for value in fields[1:]])
    return train_x_res,train_y_res
def confusematrix(testlabel=None, prey=None):
    """
    Print a binary confusion matrix with precision, recall, accuracy and F1.

    Every label and prediction is truncated with ``int()``.  Pairs whose
    values are both 0 or 1 fill the matrix; any other pair only counts
    towards the total.  The F1 value printed is the one for class 0.

    :param testlabel: sequence of true labels (``None`` means empty).
    :param prey: sequence of predicted labels, indexed in step with
        ``testlabel`` (``None`` means empty).
    :return: None; the report is printed to stdout.
    """
    testlabel = [] if testlabel is None else testlabel
    prey = [] if prey is None else prey

    def ratio(numerator, denominator):
        # An undefined rate (empty denominator) is reported as 0.
        return 1.0 * numerator / denominator if denominator else 0

    # Keys are (true value, predicted value).
    cells = {(1, 1): 0, (0, 1): 0, (1, 0): 0, (0, 0): 0}
    cnt = 0
    for i in range(len(testlabel)):
        cnt += 1
        pair = (int(testlabel[i]), int(prey[i]))
        if pair in cells:
            cells[pair] += 1

    tt = cells[(1, 1)]
    tf = cells[(0, 1)]
    ft = cells[(1, 0)]
    ff = cells[(0, 0)]
    true_one = tt + ft
    true_zero = tf + ff
    predict_one = tt + tf
    predict_zero = ft + ff

    print("数据总数: %d" % (cnt))
    if cnt <= 1:
        # NOTE(review): this ends the whole interpreter when there are fewer
        # than two samples; kept as-is because callers may rely on it.
        exit()
    print(" ".join(["真实不为0: ", str(true_one), ", 真实为0: ", str(true_zero)]))
    print(" ".join(["预测不为0: ", str(predict_one), ", 预测为0: ", str(predict_zero)]))
    print("类别\t\t真实不为0\t真实为0\t精确率")
    print("预测不为0\t%d\t\t%d\t\t%.4f" % (tt, tf, ratio(tt, predict_one)))
    if predict_zero == 0:
        print("预测为0:   \t\t%d\t\t%d\t\t%.4f" % (ft, ff, 0))
    else:
        print("预测为0\t\t%d\t\t%d\t\t%.4f" % (ft, ff, 1.0 * ff / predict_zero))
    print("召回率\t\t%.4f\t\t%.4f\t\t%.4f" % (
        ratio(tt, true_one), ratio(ff, true_zero), 1.0 * (tt + ff) / cnt))
    # Guarded: the denominator is 0 when nothing is, or is predicted as, 0.
    print("F1值\t\t%.4f" % (ratio(2.0 * ff, predict_zero + true_zero)))
    print("--------------------------------------------------------------")

def write_ks(traindata, trainlabel, testdata, testlabel,write_file,feat_name):
    """Evaluate every feature column with ``plot_features.plot_one_feature``.

    For each column the train and test values (train first) and their labels
    are handed to ``plot_one_feature``; the feature name is printed first.

    :param traindata: training rows; the column count is taken from row 0.
    :param trainlabel: labels aligned with ``traindata``.
    :param testdata: test rows.
    :param testlabel: labels aligned with ``testdata``.
    :param write_file: passed through to ``plot_one_feature``.
    :param feat_name: feature names indexed by column.
    :return: list of ``[name, plot_one_feature result, share of values
        that are not -1]``, one entry per column.
    """
    column_count = len(traindata[0])
    ks_funcs = {"plot_ks"}
    total_rows = len(traindata) + len(testdata)
    summary = []
    for col in range(column_count):
        name = feat_name[col]
        values = [row[col] for row in traindata]
        values.extend(row[col] for row in testdata)
        labels = [trainlabel[pos] for pos in range(len(traindata))]
        labels.extend(testlabel[pos] for pos in range(len(testdata)))
        # -1 is treated as "missing" when computing the fill share.
        present = sum(1 for value in values if not (value == -1))
        print(name)

        ks_value = plot_features.plot_one_feature(values, labels, 100, write_file, ks_funcs, 0, name, name, 0)
        summary.append([name, ks_value, present / float(total_rows)])
    return summary
def cacul_feat(feat_imp,feat_ks):
    """Write two feature reports, one sorted by KS and one by importance.

    ``feat_imp[i]`` is appended in place to row ``i`` of ``feat_ks``, so each
    row becomes ``[name, ks, ratio, importance]``.  The reports go to
    ``./feat_result_kssort<output>`` and ``./feat_result_impsort<output>``,
    where ``output`` is a module-level name.

    :param feat_imp: importance values indexed like ``feat_ks``.
    :param feat_ks: rows of ``[name, ks, ratio]``; modified in place.
    """
    for pos, row in enumerate(feat_ks):
        row.append(feat_imp[pos])

    def render(rows):
        # One "name, ks:.., import:.., radio:.." line per feature.
        return ''.join(
            row[0] + ', ks:' + str(row[1]) + ', import:' + str(row[3]) + ', radio:' + str(row[2]) + '\n'
            for row in rows)

    by_ks = sorted(feat_ks, key=lambda row: row[1], reverse=True)
    ks_report = render(by_ks)
    fp = open('./feat_result_kssort' + output ,'w')
    fp.write(ks_report)
    fp.close()

    by_importance = sorted(feat_ks, key=lambda row: row[3], reverse=True)
    importance_report = render(by_importance)
    fp = open('./feat_result_impsort' + output ,'w')
    fp.write(importance_report)
    fp.close()

def write_res(prey_value,testlabel):
    """Write ``prediction,label`` lines to ``./result_<output>``.

    ``output`` is a module-level name used as the file-name suffix.

    :param prey_value: predicted values.
    :param testlabel: true labels, indexed in step with ``prey_value``.
    """
    rows = [str(pred) + ',' + str(testlabel[pos]) + '\n'
            for pos, pred in enumerate(prey_value)]
    fp = open('./result_' + output,'w')
    fp.write(''.join(rows))
    fp.close()
def cross_spilit(traindata,trainlabel,number):
    """Split samples into training and validation parts by position.

    A sample goes to the validation part when ``position % 10`` is contained
    in ``number``; every other sample goes to the training part.

    :param traindata: feature rows.
    :param trainlabel: labels aligned with ``traindata``.
    :param number: container of remainders (0-9) selecting validation rows.
    :return: ``(train_x, train_y, val_x, val_y)``.
    """
    train_x, train_y, val_x, val_y = [], [], [], []
    for pos, label in enumerate(trainlabel):
        if pos % 10 in number:
            xs, ys = val_x, val_y
        else:
            xs, ys = train_x, train_y
        xs.append(traindata[pos])
        ys.append(label)
    return train_x,train_y,val_x,val_y
def calcul_spilit(traindata,trainlabel):
    clf = GradientBoostingClassifier(n_estimators=n_estimators,
                                         subsample=subsample,
                                         learning_rate=learning_rate,
                                         max_features=0.5,
                                         max_depth=max_depth)

    sum_split = 0.0
    sum_auc = 0.0
    for i in range(10):
        next_i = i + 1
        if next_i > 9:
            next_i = next_i % 10
        train_x,train_y,val_x,val_y = cross_spilit(traindata,trainlabel,[i])
        clf.fit(train_x, train_y)
        prey_prob = clf.predict_proba(val_x)
        prey_value = []
        new_label_all = []
        index_flag = 0
        for ii in prey_prob:
            prey_value.append(ii[1])
        func_name = {"plot_ks"}
        sum_split += plot_features.plot_one_feature(prey_value, val_y, 200,'./ks/',func_name, 0,'classfication' + str(i), 'classfication',0)
        sum_auc += auc_score(clf,val_x, val_y)[4]
    print 'average_split',sum_split / float(1000)
    print 'average_auc',sum_auc / float(10)
    return sum_split / float(1000)

def get_rnn_res(file):
    """Read a text file that holds one float per line.

    :param file: path of the file.
    :return: list of the parsed floats in file order.
    """
    values = []
    handle = open(file, "r")
    # readline() returns '' only at end of file.
    for line in iter(handle.readline, ''):
        values.append(float(line.strip('\n')))
    handle.close()
    return values
def log_loss(test_y,predict_y):
    """Mean squared difference of targets and predictions on a scaled axis.

    Each value ``v`` is mapped to ``log10(exp(v))``, which equals
    ``v / ln(10)``.  The scaled form is computed directly: going through
    ``math.exp`` raised OverflowError for values above roughly 709 and a
    math domain error for very negative values.  A value of 0 maps to 0.

    :param test_y: true values.
    :param predict_y: predicted values, indexed in step with ``test_y``.
    :return: sum of squared scaled differences divided by
        ``len(predict_y)``.
    :raises ZeroDivisionError: if ``predict_y`` is empty.
    """
    ln10 = math.log(10)
    all_loss = 0
    for ii in range(len(test_y)):
        test_res = test_y[ii] / ln10
        predict_res = predict_y[ii] / ln10
        all_loss += (test_res - predict_res) * (test_res - predict_res)
    return all_loss / float(len(predict_y))
def merge_rnn(train_x,train_rnn):
    """Append ``train_rnn[i]`` to row ``i`` of ``train_x`` in place.

    :param train_x: list of feature rows (lists); modified in place.
    :param train_rnn: one extra value per row.
    :return: the same ``train_x`` object.
    """
    for pos, extra in enumerate(train_rnn):
        train_x[pos].append(extra)
    return train_x
def regression(train_x,train_y,test_x,test_y,class_res):
    # clf = linear_model.LinearRegression()
    # train_rnn = get_rnn_res('./lstm-regression/train2_rnn.txt')
    # test_rnn = get_rnn_res('./lstm-regression/test2_rnn.txt')
    print len(train_x[0])
    # train_x = merge_rnn(train_x,train_rnn)
    print len(train_x[0])
    # test_x = merge_rnn(test_x,test_rnn)
    clf=GradientBoostingRegressor(loss='ls',learning_rate=0.1,n_estimators=n_estimators, subsample=1, min_samples_split=2, min_samples_leaf=1, max_depth=3, init=None, random_state=None, max_features=None, alpha=0.9, verbose=0, max_leaf_nodes=None, warm_start=False)
    clf.fit(train_x,train_y)
    # joblib.dump(clf, './huodong.model')
    # gbdt = joblib.load('./huodong.model')
    # print 'feature_importances_'
    # print clf.feature_importances_
    # print 'end'
    # print clf.alpha_
    # load test data
    # predict with test data 
    predict_y = clf.predict(test_x)
    # print predict_y
    # test_y = array(test_y)

    for ii in range(len(predict_y)):
        if not class_res[ii]:
            predict_y[ii] = 0.0
    all_loss = log_loss(test_y,predict_y)
    print 'sum_loss',all_loss
def get_split_data_file(file):
    """Load one tab-separated split file into features, targets and classes.

    Field 0 of each line is the target.  Among the remaining fields, those
    whose zero-based position lies between the module-level ``remove_begin``
    and ``remove_end`` (inclusive) are left out of the row; their sum is
    subtracted from the kept feature at index 25.

    :param file: path of the split file.
    :return: ``(train_x, train_y, train_y_class)`` where the class is 0.0
        when the target text is exactly ``'0.0'`` and 1.0 otherwise.
    """
    train_x, train_y, train_y_class = [], [], []
    src = open(file, "r")
    for line in iter(src.readline, ''):
        fields = line.strip('\n').split('\t')
        kept = []
        removed_total = 0
        for pos, raw in enumerate(fields[1:]):
            if remove_begin <= pos <= remove_end:
                removed_total += float(raw)
            else:
                kept.append(float(raw))
        kept[25] -= removed_total
        train_x.append(kept)
        train_y.append(float(fields[0]))
        train_y_class.append(0.0 if fields[0] == '0.0' else 1.0)
    src.close()
    return train_x,train_y,train_y_class
def get_split_data():
    """Load the ten split files ``./split/0`` to ``./split/9``.

    :return: ``(train_x, train_y, train_y_class)``; each is a list with one
        entry per split file, as returned by ``get_split_data_file``.
    """
    train_x, train_y, train_y_class = [], [], []
    for part in range(10):
        xs, ys, classes = get_split_data_file('./split/' + str(part))
        train_x.append(xs)
        train_y.append(ys)
        train_y_class.append(classes)
    return train_x,train_y,train_y_class
def split_reg_radio(train_x,train_y,zero_radio):
    """Keep all truthy-label samples and top them up with falsy-label ones.

    Falsy-label samples are appended after the truthy ones, taking the
    falsy samples at positions 0, 10, 20, ... first, then 1, 11, 21, ... and
    so on.  Appending stops right after the share of appended samples in the
    result exceeds ``zero_radio``.  The final share is printed.

    :param train_x: feature rows.
    :param train_y: labels aligned with ``train_x``.
    :param zero_radio: share threshold for the appended samples.
    :return: ``(train_x_res, train_y_res)``; both empty for empty input
        (which used to raise ZeroDivisionError).
    """
    train_x_res = []
    train_y_res = []
    train_x_0 = []
    train_y_0 = []
    for pos, label in enumerate(train_y):
        if label:
            train_x_res.append(train_x[pos])
            train_y_res.append(label)
        else:
            train_x_0.append(train_x[pos])
            train_y_0.append(label)

    add_zero_num = 0
    # A stable sort on "position % 10" yields 0, 10, 20, ..., 1, 11, ...,
    # the same visiting order as ten successive passes over the list.
    visit_order = sorted(range(len(train_x_0)), key=lambda pos: pos % 10)
    for pos in visit_order:
        add_zero_num += 1
        train_x_res.append(train_x_0[pos])
        train_y_res.append(train_y_0[pos])
        if add_zero_num / float(len(train_x_res)) > zero_radio:
            break

    if train_x_res:
        zero_share = add_zero_num / float(len(train_x_res))
    else:
        zero_share = 0.0
    print(' '.join(['index 0 zero:', str(zero_share)]))
    return  train_x_res,train_y_res
def split_reg(train_x,train_y):
    """Drop every sample whose label is falsy (for example 0).

    :param train_x: feature rows.
    :param train_y: labels aligned with ``train_x``.
    :return: ``(train_x_new, train_y_new)`` holding only truthy-label rows.
    """
    kept = [(train_x[pos], label) for pos, label in enumerate(train_y) if label]
    train_x_new = [features for features, _ in kept]
    train_y_new = [label for _, label in kept]
    return train_x_new,train_y_new
def add_block(predict_res,predict_y):
    """Add two sequences element by element.

    :param predict_res: left operands; determines the result length.
    :param predict_y: right operands, at least as long as ``predict_res``.
    :return: list of ``predict_res[i] + predict_y[i]``.
    """
    return [left + predict_y[pos] for pos, left in enumerate(predict_res)]
def get_csv(file):
    """Read a comma-separated file with a header row.

    The first line is skipped.  On every other line, field 0 is kept as the
    row id and the remaining fields are parsed with ``float()``.

    The former progress print was removed: its counter was only advanced
    for the header line, so the print could never fire.

    :param file: path of the CSV file.
    :return: ``(train_x_res, train_x_id)`` - feature rows and row ids.
    :raises ValueError: if a value field is not a valid float.
    """
    train_x_res = []
    train_x_id = []
    # The with-block closes the file even when a field fails to parse.
    with open(file, "r") as f:
        next(f, None)  # skip the header line, if there is one
        for line in f:
            fields = line.strip('\n').split(',')
            train_x_res.append([float(value) for value in fields[1:]])
            train_x_id.append(fields[0])
    return train_x_res,train_x_id
def auc_score_xgb(pytestprob,ytest, pos_label=1, auc_score=True):
    """Print and return the ROC AUC for a sequence of scores.

    :param pytestprob: iterable with one score per sample.
    :param ytest: true labels.
    :param pos_label: label treated as positive by ``roc_curve``.
    :param auc_score: when false, nothing is computed or printed and
        ``None`` is returned (this used to fail on unbound names).
    :return: the AUC value, or ``None`` when ``auc_score`` is false.
    """
    if not auc_score:
        return None
    sample_scores = list(pytestprob)
    fpr, tpr, thresholds = roc_curve(ytest, sample_scores, pos_label=pos_label)
    scores = auc(fpr, tpr)
    print("\tauc")
    print(scores)
    return scores
def auc_score(pytestprob,ytest, pos_label=1, auc_score=True):
    """Print and return the ROC AUC for per-class probability rows.

    :param pytestprob: iterable of rows; element ``pos_label`` of each row
        is used as the sample's score.
    :param ytest: true labels.
    :param pos_label: both the index into each row and the label treated as
        positive by ``roc_curve``.
    :param auc_score: when false, nothing is computed or printed and
        ``None`` is returned (this used to fail on unbound names).
    :return: the AUC value, or ``None`` when ``auc_score`` is false.
    """
    if not auc_score:
        return None
    positive_scores = [row[pos_label] for row in pytestprob]
    fpr, tpr, thresholds = roc_curve(ytest, positive_scores, pos_label=pos_label)
    scores = auc(fpr, tpr)
    print("\tauc")
    print(scores)
    return scores
def ceate_feature_map(features):
    """Write ``xgb.fmap`` in the current directory.

    One line is written per feature: ``<index>\\t<name>\\tq``, with indexes
    counting from 0.

    :param features: iterable of feature names.
    """
    outfile = open('xgb.fmap', 'w')
    for position, feat in enumerate(features):
        outfile.write('{0}\t{1}\tq\n'.format(position, feat))
    outfile.close()
def getdistrict():
    """Load ``./split_data/april_loc_all.txt`` into a lookup table.

    Each comma-separated line is keyed by field 0.  Field 1 is replaced by a
    float code assigned in order of first appearance (an empty field is
    treated as ``'-1'``).  Fields 2 and 3 are decoded as UTF-8 but not used.
    Field 4 is parsed with ``float()``.

    :return: dict mapping field 0 to ``[code of field 1, float(field 4)]``.
    """
    res = {}
    codes = {}
    src = open('./split_data/april_loc_all.txt', "r")
    for line in iter(src.readline, ''):
        fields = line.strip('\n').split(',')
        encoded = []
        for col in range(1, 4):
            name = fields[col].decode('utf8')
            if name == '':
                name = '-1'
            if col != 1:
                # Only field 1 contributes a feature.
                continue
            if name not in codes:
                codes[name] = float(len(codes))
            encoded.append(codes[name])
        encoded.append(float(fields[4]))
        res[fields[0]] = encoded
    src.close()
    return res
def get_bad_new():
    """Collect the first field of every line of the "bad" CSV file.

    Reads ``../data/sdk_log_all/yirendai_bad_0725.csv``.

    :return: dict mapping each first field to ``True``.
    """
    src = open('../data/sdk_log_all/yirendai_bad_0725.csv', "r")
    result = {}
    for line in iter(src.readline, ''):
        first_field = line.strip('\n').split(',')[0]
        result[first_field] = True
    src.close()
    return result
def get_mobile():
    """Group the rows of ``../data/april/mobile_new_test.csv`` by brand.

    The first line is skipped.  The brand is the first space-separated word
    of field 2; rows whose brand is not a key of the table below are
    ignored.  For kept rows, field 2 with the brand text and all spaces
    removed is stored together with field 3.

    :return: dict mapping brand to a list of ``[model text, field 3]``.
    """
    # Only the keys are consulted here; the numbers are not read.
    known_brands = {'HUAWEI':4488, 'OPPO':3049, 'vivo':2591, 'Xiaomi':2209, 'samsung':1342, 'Meizu':851, 'GiONEE':191, '360':124, 'LeMobile':115, 'Letv':92, 'YuLong':86, 'BBK':82, 'nubia':81, 'GIONEE':81, 'ZTE':79, 'smartisan':67, 'LENOVO':50, 'HTC':49, 'QiKU':44, 'Meitu':37}
    src = open('../data/april/mobile_new_test.csv', "r")
    result = {}
    src.readline()  # the first line is always skipped
    for line in iter(src.readline, ''):
        fields = line.strip('\n').split(',')
        brand = fields[2].split(' ')[0]
        if brand not in known_brands:
            continue
        model = fields[2].replace(brand,"").replace(" ","")
        price = fields[3]
        result.setdefault(brand, []).append([model, price])
    src.close()
    return result
def get_input():
    """Load the two app feature files into one lookup table.

    ``app_tr.txt`` is read first and ``app_te.txt`` second, so rows from the
    second file replace rows with the same key.  The first line of each
    file is skipped.

    :return: dict mapping field 0 to the list of the remaining fields
        parsed with ``float()``.
    """
    result = {}

    def absorb(path):
        # Merge one comma-separated file into ``result``.
        src = open(path, "r")
        src.readline()  # the first line is always skipped
        for line in iter(src.readline, ''):
            fields = line.strip('\n').replace('\r',"").split(',')
            result[fields[0]] = [float(value) for value in fields[1:]]
        src.close()

    absorb('../data/all2month/sdk_env_tr_te/app_tr.txt')
    absorb('../data/all2month/sdk_env_tr_te/app_te.txt')

    return result
def get_csv_app(file,remove_feature):
    """Load the labelled rows of a comma-separated feature file.

    Every row's fields after the first are parsed with ``float()``.  A row
    is kept only when its first field is a key of the table returned by
    ``get_mob6()``; its label is the part after the comma of that entry.

    :param file: path of the feature file.
    :param remove_feature: not used.
    :return: ``(train_x_res, train_y)``.
    """
    src = open(file, "r")
    labels_by_id = get_mob6()
    train_x_res, train_y = [], []
    for line in iter(src.readline, ''):
        fields = line.strip('\n').split(',')
        features = [float(value) for value in fields[1:]]
        if fields[0] not in labels_by_id:
            continue
        label = float(labels_by_id[fields[0]].split(',')[1])
        train_x_res.append(features)
        train_y.append(label)
    src.close()
    return train_x_res,train_y
def get_app():
    """Return the labelled app feature rows of both app files combined.

    Rows from ``app_tr.txt`` come first, followed by those of
    ``app_te.txt``.  Labels are combined too but not returned.

    :return: list of feature rows.
    """
    features, labels = get_csv_app('../data/all2month/sdk_env_tr_te/app_tr.txt',1)
    more_features, more_labels = get_csv_app('../data/all2month/sdk_env_tr_te/app_te.txt',1)
    features += more_features
    labels += more_labels
    return features
def get_add_feats(file):
    """Load a comma-separated feature file into a lookup table.

    No line is skipped.  Carriage returns are removed before splitting.  A
    later line replaces an earlier one with the same key.

    :param file: path of the feature file.
    :return: dict mapping field 0 to the list of the remaining fields
        parsed with ``float()``.
    """
    src = open(file, "r")
    table = {}
    for line in iter(src.readline, ''):
        fields = line.strip('\n').replace('\r',"").split(',')
        table[fields[0]] = [float(value) for value in fields[1:]]
    src.close()
    return table
def get_add_imei():
    """Load ``../data/new_data/imei.txt`` into a lookup table.

    The file is tab-separated.  The number of rows whose field 1 is greater
    than 1 is printed once the file has been read.

    :return: dict mapping field 0 to ``[float(field 1)]``.
    """
    src = open('../data/new_data/imei.txt', "r")
    result = {}
    above_one = 0
    for line in iter(src.readline, ''):
        fields = line.strip('\n').replace('\r',"").split('\t')
        value = float(fields[1])
        if value > 1:
            above_one += 1
        result[fields[0]] = [value]
    print(above_one)
    src.close()
    return result
def get_applyid():
    """Map field 0 to field 3 for each row of the apply-id CSV.

    Reads ``../data/new_data/apply_id.csv``; the first line is skipped.

    :return: dict mapping field 0 to field 3 (both kept as text).
    """
    src = open('../data/new_data/apply_id.csv', "r")
    result = {}
    src.readline()  # the first line is always skipped
    for line in iter(src.readline, ''):
        fields = line.strip('\n').replace('\r',"").split(',')
        result[fields[0]] = fields[3]
    src.close()
    return result
def get_mob6():
    file = './label_1101_all.csv'
    f = open(file, "r")
    # apply_id = get_applyid()
    result = {}
    flag = -1
    while True:
        data = f.readline()
        flag += 1
        if not flag:
            continue
        if not data:
            break
        datablock = data.strip('\n').replace('\r',"").split(',')
        # has_apply = apply_id.has_key(datablock[0])
        # if not has_apply:
        #     continue
        index_label = datablock[4] if (datablock[4] == '0') else '1'
        # if index_label == '1':
        #     print datablock
        if float(index_label) > 1:
            print index_label
        result[datablock[3]] = datablock[2].split(' ')[0] + ',' + index_label
    f.close()
    print 'result',len(result)
    return result
def get_emp(num):
    """Return a list of ``num`` placeholder values, each ``-1.0``.

    :param num: number of placeholders.
    :return: list of ``-1.0`` repeated ``num`` times.
    """
    return [-1.0 for _ in range(num)]
def jaccard_distance(w1dict, w2dict):
    """Return the Jaccard similarity of the elements of two iterables.

    Despite the name, the value is a similarity: the size of the
    intersection divided by the size of the union, so 1.0 means identical
    element sets and 0.0 means nothing in common.

    :param w1dict: first iterable (for a string, its characters).
    :param w2dict: second iterable.
    :return: float in [0.0, 1.0]; two empty inputs count as identical and
        give 1.0 (this case used to raise ZeroDivisionError).
    """
    w1set, w2set = set(w1dict), set(w2dict)
    union = w1set | w2set
    if not union:
        return 1.0
    return 1.0 * len(w1set & w2set) / len(union)
def get_csv_mob6(file,write_file):
    # Build one feature row per line of the comma-separated `file`, write all
    # rows to './<write_file>' and return the rows that have a label.
    #
    # file:       path of the input file.
    # write_file: name of the output file, created under './'.
    # returns:    (train_x_res, train_y, apply_id_all) - feature rows, labels
    #             and the field-1 ids of the rows found in get_mob6()'s table.
    #
    # NOTE(review): map_brand, map_all_brand, test_x_res, test_y, input_num,
    # repeat and the index_mobile_* values are assigned but never read.
    mob_6_lable = get_mob6()
    # map_brand_all = get_mobile()
    map_brand = []
    map_all_brand = []
    # input_res = get_input()
    # app_res = get_app()
    add_feats_res = get_add_feats('./split_data/april_hotel_all.txt')
    add_bank = get_add_feats('./split_data/april_bank_all.txt')
    f = open(file, "r")
    train_x_res = []
    train_y = []
    test_x_res = []
    test_y = []
    input_num = 0
    district = getdistrict()
    flag = -1
    add_num = 0
    repeat = {}
    content_write = ''
    apply_id_all = []
    while True:
        data = f.readline()
        flag += 1
        if not data:
            break
        datablock = data.strip('\n').split(',')
        index_value = []
        index_flag = -1
        # Base features: fields 3 .. len-4; index_flag is the position
        # inside that slice.
        for ii in datablock[3:len(datablock) - 3]:
            index_flag += 1
            # Slice positions 1, 2 and 14 are left out of the row.
            if (index_flag == 1) or (index_flag == 2)  or (index_flag == 14):
                continue
            index_ii = float(ii)
            # Slice positions 5, 6, 9 and 10 are scaled down by 10000.
            if (index_flag == 5) or (index_flag == 6) or (index_flag == 9) or (index_flag == 10):
                index_ii = index_ii / float(10000)
            index_value.append(index_ii)
        # District features are looked up by field 0; two -1 placeholders
        # are used when the key is unknown.
        if district.has_key(datablock[0]):            
            index_district = district[datablock[0]]
        else:
            index_district = [-1,-1]
        index_value.extend(index_district)
        index_mobile_price = -1
        index_mobile_score = -1
        index_mobile_comments = -1
        # NOTE(review): the normalised model name computed below is never
        # used afterwards, but the field accesses still raise IndexError on
        # rows with fewer than five fields.
        brand = datablock[3]
        index_mobile_type = datablock[4].replace(brand,"").replace(" ","")
        if (index_mobile_type == "R9m") or (index_mobile_type == "R9tm"):
            index_mobile_type = "R9"
        if (index_mobile_type == "R7sm"):
            index_mobile_type = "R7"
        if (index_mobile_type == "MI5"):
            index_mobile_type = "5"
        if (index_mobile_type == "MI4LTE"):
            index_mobile_type = "4"
        # Features from april_hotel_all.txt, keyed by field 0; four -1
        # placeholders on a miss, and add_num counts the misses.
        if add_feats_res.has_key(datablock[0]):
            index_add = add_feats_res[datablock[0]]
        else:
            index_add = [-1,-1,-1,-1]
            add_num += 1
        index_value.extend(index_add)
        # Features from april_bank_all.txt, keyed by field 0; two -1
        # placeholders on a miss.
        if add_bank.has_key(datablock[0]):
            index_add = add_bank[datablock[0]]
        else:
            index_add = [-1,-1]
        index_value.extend(index_add)
        # Every row is written out as "<field 1>,<features>", labelled or not.
        write_block = [str(item) for item in index_value]
        content_write += datablock[1] + ',' + ",".join(write_block) + '\n'
        # Only rows whose field 1 is in the label table are returned.
        if mob_6_lable.has_key(datablock[1]):
            index_label_content = mob_6_lable[datablock[1]]
            index_date = index_label_content.split(',')[0]
            index_label = float(index_label_content.split(',')[1])
            # A date-based train/test split used to live here; all labelled
            # rows now go to the training lists.
            train_x_res.append(index_value)
            train_y.append(index_label)
            apply_id_all.append(datablock[1])
    fp = open('./' + write_file,'w')
    fp.write(content_write)
    fp.close()
    print "add_num",add_num    
    f.close()
    return train_x_res,train_y,apply_id_all
def get_csv_index(file,remove_feature):
    # Build feature rows and labels from the comma-separated `file`, adding a
    # looked-up phone price plus features from get_input() and
    # get_add_feats().
    #
    # file:           path of the input file.
    # remove_feature: not used.
    # returns:        (train_x_res, train_y); the label is float(field 1).
    #
    # NOTE(review): get_add_feats is defined in this file with a required
    # `file` parameter, so the argument-less call below raises TypeError as
    # written - confirm which feature file was intended.
    # NOTE(review): district, new_result, index_mobile_score and
    # index_mobile_comments are assigned but never read.
    map_brand_all = get_mobile()
    input_res = get_input()
    add_feats_res = get_add_feats()
    map_brand = []
    map_all_brand = []
    f = open(file, "r")
    train_x_res = []
    train_y = []
    district = getdistrict()
    new_result = get_bad_new()
    flag = -1
    input_num = 0
    add_num = 0
    while True:
        data = f.readline()
        flag += 1
        if not data:
            break
        datablock = data.strip('\n').split(',')
        index_value = []
        index_flag = -1
        # Base features: fields 2 onwards; positions 1 and 2 inside that
        # slice are left out.
        for ii in datablock[2:len(datablock)]:
            index_flag += 1
            if (index_flag == 1) or (index_flag == 2):
                continue
            index_value.append(float(ii))
        # index_district = district[datablock[0]]
        # index_value.extend(index_district)
        index_mobile_price = -1
        index_mobile_score = -1
        index_mobile_comments = -1
        brand = datablock[3]
        # Model name: field 4 with the brand text and all spaces removed,
        # followed by a few hard-coded aliases.
        index_mobile_type = datablock[4].replace(brand,"").replace(" ","")
        if (index_mobile_type == "R9m") or (index_mobile_type == "R9tm"):
            index_mobile_type = "R9"
        if (index_mobile_type == "R7sm"):
            index_mobile_type = "R7"
        if (index_mobile_type == "MI5"):
            index_mobile_type = "5"
        if (index_mobile_type == "MI4LTE"):
            index_mobile_type = "4"
        if map_brand_all.has_key(brand):
            brand_all_mobile = map_brand_all[brand]
            most_similar = 0.0
            most_res = []
            # Pick the catalogue entry for this model: a substring hit wins
            # immediately; otherwise the entry with the highest
            # jaccard_distance() value is kept (on ties the later one).
            for i_type in brand_all_mobile:
                if (i_type[0].find(index_mobile_type)) != -1:
                    most_similar = 1
                    most_res = i_type
                    break
                else:
                    similar = jaccard_distance(i_type[0],index_mobile_type)
                    if similar >= most_similar:
                        most_similar = similar
                        most_res = i_type
            # Models without a full-score match are collected for inspection.
            if most_similar != 1:
                map_brand.append(index_mobile_type)
            # Element 1 of the chosen catalogue entry is used as the price.
            index_mobile_price = float(most_res[1])
        map_all_brand.append(index_mobile_type)
        # The price stays -1 when the brand is not in the catalogue.
        index_value.append(index_mobile_price)
        index_label = float(datablock[1])
        # Features from get_input(), keyed by field 0; a fixed run of -1
        # placeholders on a miss, and input_num counts the misses.
        if input_res.has_key(datablock[0]):
            index_input = input_res[datablock[0]]
        else:
            input_num += 1
            index_input = [-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1]
        index_value.extend(index_input)
        # Additional features keyed by field 0; two -1 placeholders on a
        # miss, and add_num counts the misses.
        if add_feats_res.has_key(datablock[0]):
            index_add = add_feats_res[datablock[0]]
        else:
            index_add = [-1,-1]
            add_num += 1
        index_value.extend(index_add)
        train_x_res.append(index_value)
        train_y.append(index_label)
    # print Counter(map_brand).most_common(20)
    print "input_num",input_num
    print "add_num",add_num
    f.close()
    return train_x_res,train_y
def write_app(label,prey_prob,val_apply,file):
    """Write ids, labels and probabilities as a CSV file with a header.

    The header is ``applyid,label,prob``; one line follows per entry of
    ``prey_prob``.

    :param label: labels, indexed in step with ``prey_prob``.
    :param prey_prob: predicted probabilities.
    :param val_apply: id strings, indexed in step with ``prey_prob``.
    :param file: path of the output file (overwritten).
    """
    lines = ["applyid,label,prob\n"]
    for pos, prob in enumerate(prey_prob):
        lines.append(val_apply[pos] + ',' + str(label[pos]) + ',' + str(prob) + '\n')
    # The text is assembled first so a bad row never truncates the file.
    content = ''.join(lines)
    fp = open(file,'w')
    fp.write(content)
    fp.close()
def train_val():
    remove_feature = 0
    train_x_all,train_y_all,train_apply = get_csv_mob6('./train_info/part-00000','feature_sdk_tr.txt')
    val_x_all,val_y_all,val_apply = get_csv_mob6('./test_info/part-00000','feature_sdk_te.txt')
    # train_x_all,train_y_all,val_x_all,val_y_all = get_csv_mob6('../data/new_data/train_test_0824.txt',1)
    print len(train_x_all)
    print train_x_all[0]
    sum_split = 0.0
    sum_auc = 0.0
    features_block = ['operator','version','manufacturer','release','mocklocationenabled','usbdebugenabled','cellinfo','memory','availablememory','cpunum','cpustat','screenlight','rommemroy','sdcardmemory','speakervolume','batteryinfo','channelid']
    ceate_feature_map(features_block)
    map_res = {}
    train_x = []
    train_y = []
    for ii in range(len(train_x_all)):
        train_x.append(train_x_all[ii])
        train_y.append(train_y_all[ii])
    val_x = []
    val_y = []
    for ii in range(len(val_x_all)):
        val_x.append(val_x_all[ii])
        val_y.append(val_y_all[ii])
    print len(train_x),len(train_y),len(val_x),len(val_y),len(train_x[0])
    # param = {} 
    # param['objective'] = 'binary:logistic' 
    # param['booster'] = 'gbtree' 
    # param['learning_rate'] = 0.01 
    # param['max_depth'] = 5
    # param['subsample'] = 0.7
    # param['colsample_bytree'] = 0.1
    # param['scale_pos_weight'] = 1 
    # param['min_child_weight'] = 70 
    # param['max_delta_step'] = 0 
    # param['nthread'] = 8 
    # param['silent'] = 1 
    # param['reg_lambda'] = 0.1 
    # param['reg_alpha'] = 0.1 
    # num_round = 1500

    # xg_train = xgb.DMatrix(train_x, label=train_y)
    # xg_test = xgb.DMatrix(val_x)
    # bst = xgb.train(param, xg_train, num_round)

    # prey_prob = bst.predict(xg_train)

    params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'l2', 'auc'},
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
    }
    lgb_train = lgb.Dataset(train_x, train_y)
    gbm = lgb.train(params,
                lgb_train,
                num_boost_round=20,
                valid_sets=lgb_train,
                early_stopping_rounds=5)
    prey_prob = gbm.predict(train_x,num_iteration=gbm.best_iteration)

    sum_auc = auc_score_xgb(prey_prob,train_y)
    print "train auc: ",sum_auc
    write_app(train_y,prey_prob,train_apply,"./sdk_prob_tr.csv")

    prey_prob = gbm.predict(val_x)
    sum_auc = auc_score_xgb(prey_prob,val_y)
    print "test auc: ",sum_auc
    write_app(val_y,prey_prob,val_apply,"./sdk_prob_te.csv")
def main():
    """Script entry point: run the train/validation pipeline."""
    run_pipeline = True  # hard-wired switch
    if not run_pipeline:
        return
    train_val()

# Run the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值