# encoding=utf-8
# import tensorflow as tf
# from tensorflow.contrib import learn
# from lstm import generate_data, lstm_model, load_csvdata
import time
import matplotlib.pyplot as plt
import pylab as pl
import xgboost as xgb
import re
print time.strftime("%I:%M:%S")
from operator import itemgetter
from sklearn import linear_model
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier
from sklearn.cross_validation import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib
# cross_validation.cross_val_score
from sklearn.svm import SVC
from sklearn import preprocessing
from sklearn.datasets import load_svmlight_file
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.metrics import auc, roc_curve, roc_auc_score
import lightgbm as lgb
import operator
import math
import argparse
from collections import Counter
# Machine-specific path; presumably the locally built LightGBM command-line
# binary -- TODO confirm it is still used by code outside this section.
exece = "/Users/wangsanpeng/LightGBM/lightgbm"
# The names below look like hyper-parameters of the LSTM experiment whose
# imports are commented out at the top of the file; none of them is referenced
# by the functions visible in this part of the file.
LOG_DIR = './ops_logs/lstm_weather'
feature_beg = 1
TIMESTEPS = 451
RNN_LAYERS = [{'num_units': 5},{'num_units': 5}]
DENSE_LAYERS = [451]
TRAINING_STEPS = 100000
BATCH_SIZE = 30
# Floor division keeps this an int (1000) regardless of the interpreter's
# division semantics; a plain "/" yields 1000.0 under true division.
PRINT_STEPS = TRAINING_STEPS // 100
remove_begin = 32
remove_end = 36
# Command-line interface.  Every option is declared once in the table below
# (flag, keyword arguments for add_argument) and registered in table order, so
# the generated --help output lists them in the same sequence as before.
_OPTION_SPECS = [
    ('--filename', {}),
    ('--output', {}),
    ('--feat_flag', {}),
    ('--model', {}),
    ('--train_file', dict(type=str, default="", help='训练数据集')),
    ('--test_file', dict(type=str, default="", help='测试数据集')),
    ('--n_estimators', dict(type=int, default=200, help='n_estimators')),
    ('--learning_rate', dict(type=float, default=0.1, help='learning_rate')),
    ('--max_depth', dict(type=int, default=5, help='max_depth')),
    ('--subsample', dict(type=float, default=1.0, help='subsample')),
    ('--max_features', dict(type=str, default=None,
                            help='max_features: None,auto,sqrt,log2')),
    ('--threshold', dict(type=float, default=0.5,
                         help='threshold for classifier')),
    ('--feature', dict(type=str, default=None,
                       help='file to save feature importances')),
    ('--city', dict(type=str, default="", help='city file to predict')),
    ('--load_model', dict(type=int, default=0,
                          help='load model skip trainning if not zero')),
    ('--model_file', dict(type=str, default="",
                          help='save model_file if not null. '
                               'require not null if load_model.')),
    ('--cost', dict(type=float, default=1, help='weight of samples')),
    ('--clf', dict(type=str, default="gbdt", help='Classification 分类器')),
    # The string default is run through type=int by argparse, giving 0.
    ('--roc', dict(type=int, default="0",
                   help='compute roc table if not zero')),
    ('--roc_file', dict(type=str, default="roc",
                        help='roc_file prefix file name')),
    ('--roc_percent', dict(type=float, default=-1,
                           help='roc_percent to predict')),
    ('--roc_thres', dict(type=str, default="0",
                         help='roc_thres to predict, a num or list')),
    ('--cv', dict(type=int, default=1, help='Cross_Validation')),
    ('--kfold', dict(type=int, default=0, help='use kfold if not zero')),
    ('--size', dict(type=float, default=0.7,
                    help='trian set size, float or int')),
    ('--issvmformat', dict(type=int, default=0, help='issvmformat')),
    ('--startfield', dict(type=int, default=0,
                          help='which field to start anasy if not svmformat')),
    ('--scale', dict(type=int, default=0, help='scale the data,标准化')),
    ('--aggr', dict(type=int, default=0, help='融合模型')),
]
parser = argparse.ArgumentParser()
for _flag, _settings in _OPTION_SPECS:
    parser.add_argument(_flag, **_settings)
# Drop the scaffolding so the module namespace holds only parser/args and the
# unpacked option values.
del _flag, _settings, _OPTION_SPECS
# Parsing happens at import time and reads sys.argv.
args = parser.parse_args()

# Model hyper-parameters.
n_estimators, subsample, learning_rate, max_depth = (
    args.n_estimators, args.subsample, args.learning_rate, args.max_depth)
# Training / evaluation switches.
cost, threshold, feature, model_file = (
    args.cost, args.threshold, args.feature, args.model_file)
roc, cv, issvmformat, kfold, aggr = (
    args.roc, args.cv, args.issvmformat, args.kfold, args.aggr)
# Input / output names; `output` is used as a file-name suffix by the result
# writers further down the file.
file_name, model_name, output = args.filename, args.model, args.output
# --feat_flag is parsed but deliberately left unread here.
def is_chinese(uchar):
    """Return True when ``uchar`` is a Chinese character (U+4E00..U+9FA5).

    The bounds were previously written with forward slashes (``u'/u4e00'``),
    which are six-character strings rather than the intended code points, so
    no Han character could ever satisfy the comparison.

    :param uchar: a single unicode character.  Longer strings are compared
        lexicographically, as before.
    :return: True if the character lies in the inclusive range, else False.
    """
    return u'\u4e00' <= uchar <= u'\u9fa5'
# Substrings looked for in the lower-cased device field (column 6) by get_dic
# and getdata; a field containing one of them is collapsed to that substring.
need_device = ['h60']
# get_dic: scan a CSV file and build the category dictionary used for one-hot
# encoding.
#
# Parameters:
#   sourceFile  - path of the comma-separated input file; its first line is
#                 skipped as a header.
#   dic_loc     - list of column indices to treat as categorical.  Columns
#                 2, 4, 5, 6 and 7 get special normalisation below.
#   lable_index - sequence of labels indexed by physical line number (the
#                 header is line 0); a falsy label files the value under the
#                 "*_wrong" lists, a truthy one under "*_right".
# Returns:
#   map_dic, mapping "<column>_<normalised value>" to the position that value
#   received within its column, plus one "<column>_others" entry per column
#   holding the number of distinct values registered for that column.
# Side effects: prints several Counter summaries to stdout.
#
# NOTE(review): indentation is absent in this copy of the file, so the nesting
# described in the comments below is inferred from the statements themselves
# -- confirm against the original.
def get_dic(sourceFile,dic_loc,lable_index):
f = open(sourceFile, "r")
# One list per requested column; it grows by one element per distinct value,
# so its length is the next free position for that column.
all_block = [[] for i in range(len(dic_loc))]
map_dic = {}
map_lenth = {}
# Raw per-column value logs, kept only for the printed statistics.
shouji = []
shouji_right = []
shouji_wrong = []
device = []
yunyingshang = []
latlon = []
os = []
os_odds = []
# Alias tables (raw value -> canonical value) and whitelists per column.
map_shouji_reverse = {'mi':'xiaomi','lemobile':'letv','lephone':'lenovo','r7plusm':'oppo','yulong':'coolpad','qiku':'360','hm':'redmi'}
# map_need_shouji = {'htc': True, 'gionee': True, 'redmi': True, 'bbk': True, 'coolpad': True, 'pioneer': True, 'zte': True, 'smartisan': True, 'huawei': True, 'meizu': True, 'letv': True, 'nubia': True, 'sprd': True, '360': True, 'xiaomi': True, 'lge': True, 'motorola': True, '4g': True, 'sony': True, 'cmdc': True, 'alps': True, 'lenovo': True, 'samsung': True, 'vivo': True, 'asus': True, 'zuk': True, 'oneplus': True, 'oppo': True}
# map_need_shouji = {'huawei': 3074, 'xiaomi': 1641, 'vivo': 901, 'oppo': 883, 'samsung': 654, 'meizu': 643, 'letv': 289, '360': 191, 'coolpad': 159, 'gionee': 126, 'lenovo': 112}
map_need_shouji = {'huawei': 3074,'vivo': 901,'360': 191,'coolpad': 159, 'gionee': 126, 'lenovo': 112}
# map_need_device = {'hua': 1226, 'viv': 940, 'opp': 709, 'sm-': 624, 'mi ': 584, 'red': 580, 'eva': 205, 'plk': 176, 'kiw': 158, 'h60': 143, 'coo': 141, '150': 132, 'pe-': 130, 'che': 122, 'le ': 119, 'frd': 116, 'hm ': 107, 'len': 102, 'mi-': 92}
map_yunying_reverse = {'中国电信':'chinatelecom','中国移动':'cmcc','chinamobile':'cmcc','中国联通':'chinaunicom','chnunicome':'chinaunicom','unicom':'chinaunicom','chnunicom':'chinaunicom'}
map_need_yunying = {'': True,'cmcc': True,'chinatelecom': True,'chinaunicom': True}
map_latlon_reverse = {'10_4':'10_3','9_4':'8_4','13_4':'12_4','10_1':'10_2','9_2':'10_2'}
map_need_latlon = {'': True, '12_2': True, '12_3': True, '12_4': True, '0.0': True, '8_4': True, '10_2': True, '10_3': True, '11_4': True, '11_3': True, '11_2': True, '0_0': True}
# map_shouji_reverse = {''}
yunyingshang_wrong = []
yunyingshang_right = []
# Physical line counter; becomes 0 for the header line.
index_flag = -1
while True:
data = f.readline()
if not data:
break
index_flag += 1
# print index_flag
# Skip the header line.
if not index_flag:
continue
# Only lines 1..20085 contribute to the dictionary.
if index_flag > 20085:
continue
# if index_flag <= 40183:
# continue
# if (index_flag > 40183) or (index_flag < 20085):
# continue
datablock = data.strip('\n').split(',')
for ii in range(len(dic_loc)):
index_value_bl = datablock[dic_loc[ii]]
# Column 5 (the "shouji" lists): keep the text before the first space, '-' or
# '_', lower-case it and apply the alias table.
if dic_loc[ii] == 5:
index_value_bl = datablock[dic_loc[ii]].split(' ')[0].split('-')[0].split('_')[0]
# print index_value_bl
index_value_bl = str.lower(index_value_bl)
if map_shouji_reverse.has_key(index_value_bl):
# print index_value_bl,map_shouji_reverse[index_value_bl]
index_value_bl = map_shouji_reverse[index_value_bl]
shouji.append(index_value_bl)
if not lable_index[index_flag]:
shouji_wrong.append(index_value_bl)
else:
shouji_right.append(index_value_bl)
# Values outside the whitelist are not registered in map_dic.
if not map_need_shouji.has_key(index_value_bl):
# index_value_bl = 'others'
continue
# if not lable_index[index_flag]:
# shouji_wrong.append(index_value_bl)
# else:
# shouji_right.append(index_value_bl)
# NOTE(review): second append of the same value to shouji; whitelisted values
# therefore appear to be counted twice in Counter(shouji) -- confirm intended.
shouji.append(index_value_bl)
# if not map_need_shouji.has_key(index_value_bl):
# continue
# Column 6 (the "device" list): lower-case, then collapse to the first
# need_device substring it contains; fields matching none are not registered.
if dic_loc[ii] == 6:
index_value_bl = str.lower(datablock[dic_loc[ii]])
# [0:3]
# print index_value_bl
index_value_bl = str.lower(index_value_bl)
device.append(index_value_bl)
has_flag = 0
for di in need_device:
if index_value_bl.find(di) == -1:
continue
else:
index_value_bl = di
has_flag = 1
break
if not has_flag:
continue
# if not map_need_device.has_key(index_value_bl):
# continue
# Column 4 (the "yunyingshang" lists): strip spaces, '-' and '_', lower-case
# and apply the alias table.
if dic_loc[ii] == 4:
index_value_bl = str.lower(datablock[dic_loc[ii]].replace(' ','').replace('-','').replace('_',''))
if map_yunying_reverse.has_key(index_value_bl):
index_value_bl = map_yunying_reverse[index_value_bl]
yunyingshang.append(index_value_bl)
# With the `continue` commented out, the if/else that follows is presumably
# the body of this test, i.e. only non-whitelisted values are logged by label.
if not map_need_yunying.has_key(index_value_bl):
# continue
if not lable_index[index_flag]:
yunyingshang_wrong.append(index_value_bl)
else:
yunyingshang_right.append(index_value_bl)
# Column 2 (the "latlon" list): '' and '0.0' pass through; otherwise columns 2
# and 3 are truncated to int, divided by 10 and joined as "<a>_<b>".
# The "/" relies on Python 2 integer division.
if dic_loc[ii] == 2:
if datablock[dic_loc[ii]] == '':
index_value_bl = ''
elif datablock[dic_loc[ii]] == '0.0':
index_value_bl = '0.0'
else:
index_value_bl = str(int(float(datablock[dic_loc[ii]])) / 10) + '_' + str(int(float(datablock[dic_loc[ii] + 1])) / 10)
if map_latlon_reverse.has_key(index_value_bl):
index_value_bl = map_latlon_reverse[index_value_bl]
# if not map_need_latlon.has_key(index_value_bl):
# continue
latlon.append(index_value_bl)
# Column 7 (the "os" list): a dotted version string is flattened to its first
# three components with the dots removed, padding missing components with '0'
# ("5" -> "500", "5.1" -> "510", "4.4.2" -> "442").
if dic_loc[ii] == 7:
index_value_bl = datablock[dic_loc[ii]]
if (index_value_bl == 'G-UI-5') or (index_value_bl == 'G-UI-5.0.2'):
index_value_bl = '7'
count_point = index_value_bl.count(".")
point_block = index_value_bl.split('.')
if not count_point:
index_value_bl = index_value_bl + '00'
elif count_point == 1:
index_value_bl = point_block[0] + point_block[1] + '0'
else:
# print datablock[dic_loc[ii]]
index_value_bl = point_block[0] + point_block[1] + point_block[2]
os.append(index_value_bl)
# os_odds.append(index_value_bl + '_' + str(lable_index[index_flag]))
# Debug output for two specific flattened values.
if index_value_bl == '5120' or index_value_bl == '442ro':
print index_value_bl,datablock[dic_loc[ii]]
# if not map_need_yunying.has_key(index_value_bl):
# continue
# Register the value under "<column>_<value>" the first time it is seen.
index_key = str(dic_loc[ii]) + '_' + index_value_bl
if not map_dic.has_key(index_key):
map_dic[index_key] = len(all_block[ii])
all_block[ii].append(1)
f.close()
# Diagnostic summaries of what was seen.
print Counter(shouji)
print Counter(shouji_wrong)
print Counter(shouji_right)
print len(yunyingshang_right),len(yunyingshang_wrong)
print yunyingshang_right
print yunyingshang_wrong
print Counter(device).most_common(20)
# print Counter(os_odds)
map_shouji_wrong = Counter(shouji_wrong)
map_shouji_right = Counter(shouji_right)
for ii in map_shouji_wrong:
print ii,map_shouji_wrong[ii],map_shouji_right[ii]
# Computed but only used by the commented-out report code below.
os_count = Counter(os)
os_odds_count = Counter(os_odds)
# print Counter(yunyingshang).most_common(20)
# content = 'features,count_all,count_0,count_1,count_1_radio,adivice\n'
# index_i = -1
# for ii in os_count:
# index_i += 1
# print '-----'
# print ii,os_count[ii]
# print os_odds_count[ii + '_0.0'],os_odds_count[ii + '_1.0'],os_odds_count[ii + '_1.0'] / float(os_count[ii])
# content += ii + ',' + str(os_count[ii]) + ',' + str(os_odds_count[ii + '_0.0']) + ',' + str(os_odds_count[ii + '_1.0']) + ',' + str(os_odds_count[ii + '_1.0'] / float(os_count[ii])) + '\n'
# # map_need_shouji[ii[0]] = True
# fp = open('./os_train_statis.csv','w')
# fp.write(content)
# fp.close()
# content = 'features,count_all\n'
# for ii in os_count:
# print '-----'
# print ii,os_count[ii]
# content += ii + ',' + str(os_count[ii]) + '\n'
# # map_need_shouji[ii[0]] = True
# fp = open('./os_test_statis.csv','w')
# fp.write(content)
# fp.close()
# Add the per-column "<column>_others" entry; its value equals the number of
# distinct values registered for that column.
for ii in range(len(dic_loc)):
index_key = str(dic_loc[ii]) + '_' + 'others'
# map_lenth[dic_loc[ii]] = len(all_block[ii] + 1)
map_dic[index_key] = len(all_block[ii])
print map_dic
return map_dic
def get_one_hot(all_length,index):
    """Return a flat one-hot list of ``all_length`` floats.

    The slot whose position equals ``index`` holds 1.0 and every other slot
    holds 0.0.  An ``index`` outside ``0 .. all_length - 1`` (for example -1)
    therefore yields a list of zeros.
    """
    return [1.0 if slot == index else 0.0 for slot in range(all_length)]
def get_one_hot_dl(all_length,index):
    """Return a one-hot encoding as a list of single-element lists.

    Same rule as a flat one-hot vector, but each slot is wrapped in its own
    list: ``[1.0]`` at position ``index`` and ``[0.0]`` everywhere else.  An
    out-of-range ``index`` yields only ``[0.0]`` entries.
    """
    return [[1.0] if slot == index else [0.0] for slot in range(all_length)]
def getdatalable():
    """Load the binary labels from ``./task1_training_set/train_val_tag.csv``.

    Every physical line of the file produces one entry, in file order: 0.0
    when the second comma-separated field is exactly ``'0'`` and 1.0 for
    anything else.  A header line is not skipped, so it also yields an entry
    (1.0 unless its second field happens to be ``'0'``); the callers in this
    file index the result by physical line number.

    The file is read inside a ``with`` block so the handle is released even
    when a malformed line raises.

    :return: list of floats (0.0 / 1.0), one per line of the file.
    :raises IndexError: if a line has no second comma-separated field.
    """
    # Earlier runs used './task1_training_set/scenario1_10_30_tag.csv'.
    labels = []
    with open('./task1_training_set/train_val_tag.csv', "r") as f:
        for line in f:
            fields = line.strip('\n').split(',')
            labels.append(0.0 if fields[1] == '0' else 1.0)
    return labels
# def get_use_label():
def get_use_label_not0():
    """Select features whose train and test statistics are close to each other.

    Reads ``./train_test_not0radio.csv``: line 0 holds the feature names,
    line 1 one float per feature for the training set and line 2 one float per
    feature for the test set; any further lines are ignored.  A feature is
    kept when the smaller of its two values is non-zero and
    ``abs(test - train) / min(test, train) < 0.8``.

    The three column counts are printed to stdout.  The file is read inside a
    ``with`` block so the handle is released even when a value fails to parse.

    :return: dict mapping each kept feature name to True.
    :raises ValueError: if a value on line 1 or 2 is not a float.
    :raises IndexError: if line 1 or 2 has fewer values than there are names.
    """
    res = {}
    features = []
    train_ave = []
    test_ave = []
    with open('./train_test_not0radio.csv', "r") as f:
        for row_number, line in enumerate(f):
            fields = line.strip('\n').split(',')
            if row_number == 0:
                features = fields
            elif row_number == 1:
                train_ave.extend(float(value) for value in fields)
            elif row_number == 2:
                test_ave.extend(float(value) for value in fields)
    # Single pre-formatted argument: identical output whether print is a
    # statement or a function.
    print("%d %d %d" % (len(features), len(train_ave), len(test_ave)))
    for ii in range(len(features)):
        index_test = test_ave[ii]
        index_train = train_ave[ii]
        little = min(index_test, index_train)
        # A zero minimum would make the relative difference undefined.
        if not little:
            continue
        if (abs(index_test - index_train) / little) < 0.8:
            res[features[ii]] = True
    return res
def get_ip_same(file):
    """Collect the first-column values of a CSV file into a lookup dict.

    The first line is treated as a header and skipped.  For every other line
    the text before the first comma becomes a key mapped to 1.

    The original returned before reaching ``f.close()``, so the handle was
    never closed; the file is now read inside a ``with`` block.

    :param file: path of the comma-separated file to read.
    :return: dict mapping each first-column value to 1.
    """
    map_res = {}
    with open(file, "r") as f:
        for line_number, line in enumerate(f):
            if line_number == 0:
                continue
            map_res[line.strip('\n').split(',')[0]] = 1
    return map_res
# getdata: read the feature CSV and build model inputs.
#
# Parameters:
#   sourceFile - path of the comma-separated feature file; its first line is
#                the header (kept in features_block).
#   class_res  - truthy: split rows into train (line number <= 40183, label
#                taken from getdatalable()) and test (later lines, with the
#                raw first column collected in test_y) and return
#                (train_x_res, train_y_res, test_x, test_y).
#                falsy: return (train_x_res, train_y_res, ip_same).
#   vali_flag  - only read when class_res is falsy; when truthy, lines with
#                line number < 20086 are skipped.
#
# Each feature vector holds the values of columns 8.. whose header name is a
# key of features_use, converted with float(); values that fail to convert
# become -1.0.
#
# NOTE(review): indentation is absent in this copy of the file, so the nesting
# described in the comments below is inferred -- confirm against the original.
def getdata(sourceFile,class_res=None,vali_flag=None):
f = open(sourceFile, "r")
train_x_res = []
train_y_res = []
test_x = []
test_y = []
train_x_dl = []
train_y_dl = []
# train_y_reg = []
flag = 0
# NOTE(review): dic_loc is empty, so get_dic() registers no columns and the
# `for di in dic_loc` loop further down never executes as written.
dic_loc = []
# dic_loc = []
label_block = getdatalable()
# dic_loc = []
test_lable = []
map_dic = get_dic(sourceFile,dic_loc,label_block)
# Whitelist of header names copied into the feature vector.
# NOTE(review): in this copy the literal below is hard-wrapped over three
# physical lines and the first break falls inside the key 'app_cat2_85' --
# confirm against the original file.
features_use = {'app_cat1_1':True,'app_cat1_2':True,'app_cat1_3':True,'app_cat1_4':True,'app_cat1_5':True,'app_cat1_7':True,'app_cat1_8':True,'app_cat1_9':True,'app_cat1_11':True,'app_cat1_13':True,'app_cat1_14':True,'app_cat1_15':True,'app_cat1_16':True,'app_cat1_18':True,'app_cat1_21':True,'app_cat1_23':True,'app_cat1_24':True,'app_cat1_26':True,'app_cat1_27':True,'app_cat1_28':True,'app_cat1_29':True,'app_cat1_30':True,'app_cat1_31':True,'app_cat1_32':True,'app_cat1_33':True,'app_cat1_34':True,'app_cat1_35':True,'app_cat1_38':True,'app_cat1_40':True,'app_cat1_41':True,'app_cat1_42':True,'app_cat1_43':True,'app_cat1_44':True,'app_cat2_1':True,'app_cat2_2':True,'app_cat2_3':True,'app_cat2_4':True,'app_cat2_5':True,'app_cat2_6':True,'app_cat2_7':True,'app_cat2_8':True,'app_cat2_9':True,'app_cat2_10':True,'app_cat2_11':True,'app_cat2_12':True,'app_cat2_13':True,'app_cat2_14':True,'app_cat2_15':True,'app_cat2_17':True,'app_cat2_20':True,'app_cat2_21':True,'app_cat2_22':True,'app_cat2_23':True,'app_cat2_26':True,'app_cat2_27':True,'app_cat2_28':True,'app_cat2_29':True,'app_cat2_30':True,'app_cat2_33':True,'app_cat2_34':True,'app_cat2_35':True,'app_cat2_36':True,'app_cat2_37':True,'app_cat2_39':True,'app_cat2_40':True,'app_cat2_42':True,'app_cat2_43':True,'app_cat2_44':True,'app_cat2_45':True,'app_cat2_46':True,'app_cat2_47':True,'app_cat2_48':True,'app_cat2_49':True,'app_cat2_50':True,'app_cat2_51':True,'app_cat2_52':True,'app_cat2_53':True,'app_cat2_54':True,'app_cat2_55':True,'app_cat2_56':True,'app_cat2_57':True,'app_cat2_58':True,'app_cat2_59':True,'app_cat2_60':True,'app_cat2_61':True,'app_cat2_62':True,'app_cat2_63':True,'app_cat2_64':True,'app_cat2_65':True,'app_cat2_66':True,'app_cat2_67':True,'app_cat2_68':True,'app_cat2_69':True,'app_cat2_70':True,'app_cat2_71':True,'app_cat2_72':True,'app_cat2_74':True,'app_cat2_75':True,'app_cat2_76':True,'app_cat2_77':True,'app_cat2_78':True,'app_cat2_79':True,'app_cat2_80':True,'app_cat2_81':True,'app_cat2_83':True,'app_c
at2_85':True,'app_cat2_86':True,'app_cat2_87':True,'app_cat2_88':True,'app_cat2_89':True,'app_cat2_90':True,'app_cat2_91':True,'app_cat2_92':True,'app_cat2_93':True,'app_cat2_94':True,'app_cat2_96':True,'app_cat2_97':True,'app_cat2_98':True,'app_cat2_99':True,'app_cat2_100':True,'app_cat2_101':True,'app_cat2_103':True,'app_cat2_104':True,'app_cat2_105':True,'app_cat2_106':True,'app_cat2_107':True,'app_cat2_108':True,'app_cat2_109':True,'app_cat2_111':True,'app_cat2_112':True,'app_cat2_113':True,'app_cat2_114':True,'app_cat2_115':True,'app_cat2_116':True,'app_cat2_117':True,'app_cat2_118':True,'app_cat2_119':True,'app_cat2_120':True,'app_cat2_122':True,'app_cat2_123':True,'app_cat2_124':True,'app_cat2_125':True,'app_cat2_126':True,'app_cat2_127':True,'app_cat2_128':True,'app_cat2_129':True,'app_cat2_130':True,'app_cat2_131':True,'app_cat2_132':True,'app_cat2_133':True,'app_cat2_134':True,'app_cat2_135':True,'app_cat2_137':True,'app_cat2_138':True,'app_cat2_139':True,'app_cat2_140':True,'app_cat2_141':True,'app_cat2_142':True,'app_cat2_143':True,'app_cat2_144':True,'app_cat2_145':True,'app_cat2_146':True,'app_cat2_147':True,'app_cat2_148':True,'app_cat2_151':True,'app_cat2_153':True,'app_cat2_154':True,'app_cat2_155':True,'app_cat2_156':True,'app_cat2_157':True,'app_cat2_158':True,'app_cat2_159':True,'app_cat2_160':True,'app_cat2_161':True,'app_cat2_162':True,'app_cat2_163':True,'app_cat2_164':True,'app_cat2_165':True,'app_cat2_166':True,'app_cat2_167':True,'app_cat2_168':True,'app_cat2_169':True,'app_cat2_170':True,'app_cat2_171':True,'app_cat2_172':True,'app_cat2_173':True,'app_cat2_174':True,'app_cat2_175':True,'app_cat2_176':True,'app_cat2_177':True,'app_cat2_178':True,'app_cat2_179':True,'app_cat2_180':True,'app_cat2_181':True,'app_cat2_182':True,'app_cat2_183':True,'app_cat2_184':True,'app_cat2_185':True,'app_cat2_186':True,'app_cat2_187':True,'app_cat2_188':True,'app_cat2_189':True,'app_cat2_190':True,'app_cat2_192':True,'app_cat2_193':True,'app_cat2_194':True,
'app_cat2_196':True,'app_cat2_198':True,'app_cat2_199':True,'app_cat2_200':True,'app_cat2_201':True,'app_cat2_202':True,'app_cat2_203':True,'app_cat2_204':True,'app_cat2_205':True,'app_cat2_207':True,'app_cat2_208':True,'app_cat2_209':True,'app_cat2_210':True,'app_cat2_211':True,'app_cat2_212':True,'app_cat2_213':True}
# features_use = {'app_cat1_35': True, 'app_cat2_208': True, 'app_cat2_120': True, 'app_cat2_89': True, 'app_cat2_126': True, 'app_cat2_202': True, 'app_cat2_164': True, 'app_cat2_143': True, 'app_cat2_211': True, 'app_cat2_138': True, 'app_cat2_71': True, 'app_cat2_144': True, 'app_cat2_159': True, 'app_cat2_26': True, 'app_cat2_184': True, 'app_cat2_12': True, 'app_cat2_180': True, 'app_cat2_181': True, 'app_cat2_50': True, 'app_cat2_53': True, 'app_cat2_52': True, 'app_cat2_55': True, 'app_cat2_54': True, 'app_cat2_79': True, 'app_cat2_14': True, 'app_cat2_77': True, 'app_cat2_168': True, 'app_cat2_74': True, 'app_cat1_32': True, 'app_cat2_154': True, 'app_cat1_15': True, 'app_cat2_93': True, 'app_cat2_132': True, 'app_cat2_96': True, 'app_cat2_99': True, 'app_cat2_97': True, 'app_cat2_111': True, 'app_cat2_113': True, 'app_cat2_156': True, 'app_cat2_115': True, 'app_cat2_158': True, 'app_cat1_8': True, 'app_cat1_6': True, 'app_cat1_7': True, 'app_cat2_131': True, 'app_cat1_5': True, 'app_cat1_2': True, 'app_cat2_135': True, 'app_cat2_1': True, 'app_cat2_175': True, 'app_cat2_174': True, 'app_cat2_5': True, 'app_cat2_172': True, 'app_cat2_7': True, 'app_cat2_170': True, 'app_cat2_191': True, 'app_cat2_177': True, 'app_cat2_178': True, 'app_cat2_142': True, 'app_cat2_45': True, 'app_cat2_68': True, 'app_cat2_43': True, 'app_cat2_40': True, 'app_cat1_21': True, 'app_cat1_24': True, 'app_cat1_27': True, 'app_cat2_49': True, 'app_cat2_20': True, 'app_cat2_22': True, 'app_cat2_2': True, 'app_cat2_118': True, 'app_cat2_27': True, 'app_cat2_29': True, 'app_cat1_41': True}
# features_not_use = {'app_cat1_10':True,'app_cat1_17':True,'app_cat1_12':True,'app_cat1_19':True,'app_cat1_20':True,'app_cat1_22':True,'app_cat1_25':True,'app_cat1_36':True,'app_cat1_37':True,'app_cat1_39':True,'app_cat1_45':True,'app_cat2_102':True,'app_cat2_150':True,'app_cat2_197':True,'app_cat2_24':True,'app_cat2_32':True,'app_cat2_195':True,'app_cat2_214':True,'app_cat2_31':True,'app_cat2_73':True,'app_cat2_110':True,'app_cat2_121':True,'app_cat2_136':True,'app_cat2_149':True,'app_cat2_152':True,'app_cat2_16':True,'app_cat2_18':True,'app_cat2_19':True,'app_cat2_206':True,'app_cat2_38':True,'app_cat2_41':True,'app_cat2_82':True,'app_cat2_84':True,'app_cat2_95':True}
# print use_feature
# print len(use_feature)
# Text buffers and counters filled only inside the per-column loop below; they
# are not read again in the visible part of this function.
yunyingshang_tezheng = 'cmcc,chinatelecom,chinaunicom,null,others\n'
yunyingshang_tezheng_test = 'cmcc,chinatelecom,chinaunicom,null,others\n'
features_block = []
index_flag = -1
yunyingshang_num = [0 for i in range(10)]
yunyingshang_test_num = [0 for i in range(10)]
shouji_features = '\thuawei\tvivo\tgionee\tlenovo\t360\tcoolpad\n'
shouji_features_test = '\thuawei\tvivo\tgionee\tlenovo\t360\tcoolpad\n'
# First-column values listed in this file mark a row as 1 in ip_same.
map_ip_same = get_ip_same('./train_ip_mobel_same_uid.csv')
ip_same = []
while True:
data = f.readline()
if not data:
break
index_flag += 1
# print index_flag
# Line 0 is the header: remember the column names and move on.
if not index_flag:
features_block = data.strip('\n').split(',')
# print features_block,len(features_block)
continue
# if index_flag == 40183:
# print '40183',data
datablock = data.strip('\n').split(',')
# if datablock[0] == '0':
# index_label = 0.0
# else:
# index_label = 1.0
# Label handling: in class_res mode lines up to 40183 are training rows and
# the rest are test rows; otherwise every kept row is a training row.
if class_res:
# if (index_flag < 20086):
# index_label = label_block[index_flag]
if (index_flag <= 40183):
train_y_res.append(label_block[index_flag])
else:
test_y.append(datablock[0])
else:
if vali_flag:
if (index_flag < 20086):
continue
train_y_res.append(label_block[index_flag])
train_y_dl.append([label_block[index_flag]])
index_value = []
index_value_dl = []
# print data
# Per-column categorical encoding, using the same normalisation rules as
# get_dic.
# NOTE(review): map_shouji_reverse, map_yunying_reverse and map_latlon_reverse
# are assigned only inside get_dic in the visible source; unless they are also
# defined at module level elsewhere, this loop would raise NameError as soon
# as dic_loc is non-empty.
for di in dic_loc:
index_value_db = datablock[di]
if di == 5:
index_value_db = datablock[di].split(' ')[0].split('-')[0].split('_')[0]
index_value_db = str.lower(index_value_db)
if map_shouji_reverse.has_key(index_value_db):
index_value_db = map_shouji_reverse[index_value_db]
if di == 4:
index_value_db = str.lower(datablock[di].replace(' ','').replace('-','').replace('_',''))
if map_yunying_reverse.has_key(index_value_db):
index_value_db = map_yunying_reverse[index_value_db]
if di == 2:
if datablock[di] == '':
index_value_db = ''
elif datablock[di] == '0.0':
index_value_db = '0.0'
else:
index_value_db = str(int(float(datablock[di])) / 10) + '_' + str(int(float(datablock[di + 1])) / 10)
if map_latlon_reverse.has_key(index_value_db):
index_value_db = map_latlon_reverse[index_value_db]
if di == 6:
index_value_db = str.lower(datablock[di])
has_flag = 0
for dii in need_device:
if index_value_db.find(dii) == -1:
continue
else:
index_value_db = dii
has_flag = 1
break
# if di == 7:
# index_value_bl = datablock[di].split('.')[0]
# if index_value_bl == 'G-UI-5':
# index_value_bl = '7'
# if index_value_bl == '2':
# index_value_bl = '3'
# index_value_bl = float(index_value_bl)
# index_value.append(index_value_bl)
# if di == 7:
# index_content = ''
# index_content += str(index_value_bl) + '\n'
# if (index_flag < 20086):
# # yunyingshang_num[index_loc] += 1
# yunyingshang_tezheng += index_content
# else:
# # yunyingshang_test_num[index_loc] += 1
# yunyingshang_tezheng_test += index_content
# continue
# Column 7: flatten the dotted version string to an integer such as 442 or
# 510, then bucket it by hundreds.
if di == 7:
index_value_bl = datablock[di]
if (index_value_bl == 'G-UI-5') or (index_value_bl == 'G-UI-5.0.2'):
index_value_bl = '7'
count_point = index_value_bl.count(".")
point_block = index_value_bl.split('.')
if not count_point:
index_value_bl = index_value_bl + '00'
elif count_point == 1:
index_value_bl = point_block[0] + point_block[1] + '0'
else:
# print datablock[dic_loc[ii]]
index_value_bl = point_block[0] + point_block[1] + point_block[2]
# Two flattened values that need a manual fix before int().
if index_value_bl == '5120':
index_value_bl = '512'
if index_value_bl == '442ro':
index_value_bl = '442'
index_value_bl = int(index_value_bl)
# not detail
# if index_value_bl <= 440:
# index_value_bl = 0.0
# if (index_value_bl >= 442) and (index_value_bl <= 443):
# index_value_bl = 1.0
# if index_value_bl == 444:
# index_value_bl = 2.0
# if (index_value_bl >= 445) and (index_value_bl <= 502):
# index_value_bl = 3.0
# if index_value_bl == 510:
# index_value_bl = 4.0
# if (index_value_bl >= 511) and (index_value_bl <= 530):
# index_value_bl = 5.0
# if index_value_bl == 600:
# index_value_bl = 6.0
# if (index_value_bl >= 601) and (index_value_bl <= 610):
# index_value_bl = 7.0
# if (index_value_bl >= 700):
# index_value_bl = 8.0
# detail
# if index_value_bl <= 420:
# index_value_bl = 0.0
# if (index_value_bl >= 421) and (index_value_bl <= 422):
# index_value_bl = 1.0
# if (index_value_bl >= 430) and (index_value_bl <= 440):
# index_value_bl = 2.0
# if (index_value_bl >= 442) and (index_value_bl <= 443):
# index_value_bl = 3.0
# if index_value_bl == 444:
# index_value_bl = 4.0
# if (index_value_bl >= 445) and (index_value_bl <= 501):
# index_value_bl = 5.0
# if index_value_bl == 502:
# index_value_bl = 6.0
# if index_value_bl == 510:
# index_value_bl = 7.0
# if (index_value_bl >= 511) and (index_value_bl <= 530):
# index_value_bl = 8.0
# if index_value_bl == 600:
# index_value_bl = 9.0
# if (index_value_bl >= 601) and (index_value_bl <= 610):
# index_value_bl = 10.0
# if (index_value_bl >= 700):
# index_value_bl = 11.0
# Buckets: <500 -> 0.0, 500..599 -> 1.0, 600..699 -> 2.0, >=700 -> 3.0.
# The tests are independent `if`s; a value already replaced by 0.0/1.0/2.0 is
# below 500 and so fails all later tests.
if index_value_bl < 500:
index_value_bl = 0.0
if (index_value_bl >= 500) and (index_value_bl < 600):
index_value_bl = 1.0
if (index_value_bl >= 600) and (index_value_bl < 700):
index_value_bl = 2.0
# if (index_value_bl < 700):
# index_value_bl = 1.0
if (index_value_bl >= 700):
index_value_bl = 3.0
# NOTE(review): the one-hot has 3 slots (positions 0..2), so bucket 3.0 is
# encoded as all zeros -- confirm this is intended.
value_index = get_one_hot(3,index_value_bl)
index_value.extend(value_index)
# index_value.append(index_value_bl)
index_content = ''
for vi in value_index:
index_content += str(vi) + '\t'
index_content += '\n'
if (index_flag < 20086):
# yunyingshang_num[index_loc] += 1
yunyingshang_tezheng += index_content
else:
# yunyingshang_test_num[index_loc] += 1
yunyingshang_tezheng_test += index_content
continue
# Generic path for the remaining columns: look the normalised value up in
# map_dic.
index_key = str(di) + '_' + index_value_db
if map_dic.has_key(index_key):
index_loc = map_dic[index_key]
else:
# The "others" position is immediately overwritten with -1, so unknown
# values end up with an all-zero one-hot.
index_loc = map_dic[str(di) + '_others']
index_loc = -1
# print map_dic[str(di) + '_' + 'others']
value_index = get_one_hot(map_dic[str(di) + '_' + 'others'],index_loc)
# The nested-list variant is one slot longer than the flat one.
value_index_dl = get_one_hot_dl(map_dic[str(di) + '_' + 'others'] + 1,index_loc)
index_value.extend(value_index)
index_value_dl.extend(value_index_dl)
if di == 4:
index_content = ''
# Reads the first five one-hot slots; assumes column 4 has at least five
# registered values -- TODO confirm.
for iii in range(5):
index_content += str(value_index[iii]) + ' '
index_content += '\n'
# index_loc may be -1 here, which increments the last counter slot.
if (index_flag < 20086):
yunyingshang_num[index_loc] += 1
yunyingshang_tezheng += index_content
else:
yunyingshang_test_num[index_loc] += 1
yunyingshang_tezheng_test += index_content
# if di == 5:
# index_content = ''
# for iii in range(6):
# index_content += '\t' + str(value_index[iii])
# index_content += '\n'
# if (index_flag <= 40183):
# shouji_features += index_content
# else:
# shouji_features_test += index_content
# lat = 0.0
# lng = 0.0
# if datablock[2] == '':
# lat = -1.0
# lng = -1.0
# else:
# lat = float(datablock[2])
# lng = float(datablock[3])
# index_value.append(lat)
# index_value.append(lng)
# Numeric features: walk columns 8.. in step with the header.  index_ii is
# incremented before the header lookup, so it names the current column.
index_ii = 7
for ii in datablock[8:len(datablock)]:
# Any value that float() rejects (e.g. an empty field) is encoded as -1.0.
try:
index_value_1 = float(ii)
except:
index_value_1 = -1.0
index_ii += 1
# print index_ii,features_block[index_ii]
has_use_feature = features_use.has_key(features_block[index_ii])
if not has_use_feature:
continue
# print index_ii,features_block[index_ii]
index_value.append(index_value_1)
index_value_dl.append([index_value_1])
# if len(index_value) != 259:
# print index_flag,'not 259!!!!!!',len(index_value),data
# Route the finished vector: test rows only exist in class_res mode.
if class_res and (index_flag > 40183):
test_x.append(index_value)
else:
# ip_same gets 1 when the row's first column is a key of map_ip_same.
index_type = map_ip_same.has_key(datablock[0])
if index_type:
ip_same.append(1)
else:
ip_same.append(0)
train_x_res.append(index_value)
# train_x_dl.append(index_value_dl)
flag += 1
f.close()
if class_res:
# print map_dic
# fp = open('./mobil_features.txt','w')
# fp.write(shouji_features)
# fp.close()
# fp = open('./mobil_features_test.txt','w')
# fp.write(shouji_features_test)
# fp.close()
# get_data_convert_string('./224_all/task1_osge7/211_train_vali_mobile_ip_osge7','./mobil_features.txt','./224_all/224_all_train_mobil')
# get_data_convert_string('./224_all/task1_osge7/211_test_mobile_ip_osge7','./mobil_features_test.txt','./224_all/224_all_test_mobil')
return train_x_res,train_y_res,test_x,test_y
else:
return train_x_res,train_y_res,ip_same
def get_data_convert_string(file,file2,file3):
    """Join two text files line by line and write the result to a third.

    Line ``i`` of the output is line ``i`` of ``file`` with its newline
    characters stripped from both ends, immediately followed by line ``i`` of
    ``file2`` (which keeps its own line terminator).  Extra lines in ``file2``
    are ignored.

    All three files are handled with ``with`` blocks so no handle is left
    open on error, and the output is assembled with ``str.join`` rather than
    repeated string concatenation.

    :param file: path of the file supplying the left part of each line.
    :param file2: path of the file supplying the right part of each line.
    :param file3: path of the output file (overwritten).
    :raises IndexError: if ``file`` has more lines than ``file2``; nothing is
        written in that case.
    """
    with open(file2, "r") as f:
        suffix_lines = f.readlines()
    merged = []
    with open(file, "r") as f:
        for line_number, line in enumerate(f):
            # Index rather than zip so a short file2 still raises.
            merged.append(line.strip('\n') + suffix_lines[line_number])
    with open(file3, 'w') as fp:
        fp.write(''.join(merged))
def get_data_convert(file):
    """Load a tab-separated numeric file into feature rows and labels.

    Each line is split on tabs; the first field is the label and the remaining
    fields form the feature row.  Everything is converted with ``float``.

    The file is read inside a ``with`` block so the handle is released even
    when a field fails to parse.

    :param file: path of the tab-separated file.
    :return: ``(train_x_res, train_y_res)`` -- a list of float lists and the
        matching list of float labels, in file order.
    :raises ValueError: if any field is not a valid float (including a blank
        line).
    """
    train_x_res = []
    train_y_res = []
    with open(file, "r") as f:
        for line in f:
            fields = line.strip('\n').split('\t')
            train_y_res.append(float(fields[0]))
            train_x_res.append([float(value) for value in fields[1:]])
    return train_x_res,train_y_res
def _ratio_or_zero(numerator, denominator):
    """Return ``numerator / denominator`` as a float, or 0 if the denominator is 0."""
    if denominator == 0:
        return 0
    return 1.0 * numerator / denominator


def confusematrix(testlabel=None, prey=None):
    """Compute the binary confusion matrix and print it.

    Each (true label, prediction) pair is truncated with ``int`` and classified
    by the two-character key "<true><predicted>"; pairs whose key is not one of
    "11", "01", "10", "00" count towards the total but no cell.  The printed
    report holds the totals, a table with per-row precision, the recall of both
    classes with the overall accuracy, and the F1 value of class 0.

    Changes from the previous version: the mutable ``[]`` defaults are replaced
    by ``None``; the unreachable ``except`` handler that referenced an
    undefined name is gone; and the F1 line prints 0.0000 instead of raising
    ZeroDivisionError when there are neither true nor predicted zeros.

    NOTE: when fewer than two samples are given the function still calls
    ``exit()`` after printing the total, which terminates the interpreter.

    :param testlabel: sequence of true labels (values convertible with int).
    :param prey: sequence of predicted labels, at least as long as testlabel.
    :return: None; the report goes to stdout.
    """
    testlabel = [] if testlabel is None else testlabel
    prey = [] if prey is None else prey
    # In each key the first digit is the true label, the second the prediction.
    tally = {"11": 0, "01": 0, "10": 0, "00": 0}
    cnt = 0
    for i in range(len(testlabel)):
        cnt += 1
        tagkey = str(int(testlabel[i])) + str(int(prey[i]))
        if tagkey in tally:
            tally[tagkey] += 1
    tt, tf, ft, ff = tally["11"], tally["01"], tally["10"], tally["00"]
    true_one = tt + ft
    true_zero = tf + ff
    predict_one = tt + tf
    predict_zero = ft + ff
    # Every print passes one pre-formatted string, so the output is the same
    # whether print is a statement or a function.
    print("数据总数: %d" % (cnt))
    if cnt <= 1:
        exit()
    print("%s %s %s %s" % ("真实不为0: ", true_one, ", 真实为0: ", true_zero))
    print("%s %s %s %s" % ("预测不为0: ", predict_one, ", 预测为0: ", predict_zero))
    print("类别\t\t真实不为0\t真实为0\t精确率")
    print("预测不为0\t%d\t\t%d\t\t%.4f" % (tt, tf, _ratio_or_zero(tt, predict_one)))
    # The two row labels differ in the original output; both are kept verbatim.
    if predict_zero == 0:
        print("预测为0: \t\t%d\t\t%d\t\t%.4f" % (ft, ff, 0))
    else:
        print("预测为0\t\t%d\t\t%d\t\t%.4f" % (ft, ff, 1.0 * ff / predict_zero))
    # cnt is at least 2 here, so the accuracy division is safe.
    print("召回率\t\t%.4f\t\t%.4f\t\t%.4f" % (
        _ratio_or_zero(tt, true_one),
        _ratio_or_zero(ff, true_zero),
        1.0 * (tt + ff) / cnt))
    print("F1值\t\t%.4f" % (2.0 * _ratio_or_zero(ff, predict_zero + true_zero)))
    print("--------------------------------------------------------------")
def write_ks(traindata, trainlabel, testdata, testlabel,write_file,feat_name):
    """Score every feature column with ``plot_features.plot_one_feature``.

    For each column the train values followed by the test values (and their
    labels in the same order) are handed to the plotting helper.  The column's
    name is printed before the helper is called.

    :return: one ``[name, helper result, share of values != -1]`` list per
        column, in column order.  The share is taken over all train and test
        rows together.
    """
    column_count = len(traindata[0])
    func_name = {"plot_ks"}
    total_rows = len(traindata) + len(testdata)
    train_rows = range(len(traindata))
    test_rows = range(len(testdata))
    feat_ks = []
    for col in range(column_count):
        name = feat_name[col]
        values = [traindata[r][col] for r in train_rows]
        values += [testdata[r][col] for r in test_rows]
        labels = [trainlabel[r] for r in train_rows]
        labels += [testlabel[r] for r in test_rows]
        # Entries equal to -1 are treated as "not present" for the share.
        present = sum(1 for value in values if not (value == -1))
        print(name)
        score = plot_features.plot_one_feature(
            values, labels, 100, write_file, func_name, 0, name, name, 0)
        feat_ks.append([name, score, present / float(total_rows)])
    return feat_ks
def cacul_feat(feat_imp, feat_ks):
    """Write two feature reports, one ranked by KS value and one by importance.

    Every entry of ``feat_ks`` is read as ``[name, ks, ratio]``.  The
    importance at the same position of ``feat_imp`` is appended to each
    entry IN PLACE, so callers see ``[name, ks, ratio, importance]``
    afterwards.

    The reports are written to ``./feat_result_kssort<output>`` and
    ``./feat_result_impsort<output>``, where ``output`` is a module-level
    name.  Files are opened with ``with`` so they are closed even when a
    write fails.

    Args:
        feat_imp: Importance values, indexed like ``feat_ks``.
        feat_ks: Per-feature entries; mutated as described above.
    """
    for pos, entry in enumerate(feat_ks):
        entry.append(feat_imp[pos])

    def _render(rows):
        # One text line per feature; joined once instead of repeated "+=".
        return ''.join(
            row[0] + ', ks:' + str(row[1]) + ', import:' + str(row[3])
            + ', radio:' + str(row[2]) + '\n'
            for row in rows)

    content = _render(sorted(feat_ks, key=lambda row: row[1], reverse=True))
    with open('./feat_result_kssort' + output, 'w') as fp:
        fp.write(content)

    content = _render(sorted(feat_ks, key=lambda row: row[3], reverse=True))
    with open('./feat_result_impsort' + output, 'w') as fp:
        fp.write(content)
def write_res(prey_value, testlabel):
    """Dump ``prediction,label`` pairs, one per line, to ``./result_<output>``.

    ``output`` is a module-level name.  The text is assembled with a single
    join and the file is opened with ``with`` so it is closed even when the
    write fails.

    Args:
        prey_value: Predicted values; its length decides how many lines
            are written.
        testlabel: Labels, indexed like ``prey_value``.
    """
    content = ''.join(
        str(prey_value[pos]) + ',' + str(testlabel[pos]) + '\n'
        for pos in range(len(prey_value)))
    with open('./result_' + output, 'w') as fp:
        fp.write(content)
def cross_spilit(traindata, trainlabel, number):
    """Split rows into train/validation parts by ``index % 10``.

    Args:
        traindata: Feature rows.
        trainlabel: Labels aligned with ``traindata``; its length decides
            how many rows are visited.
        number: Container of remainders (0-9); a row whose index modulo 10
            is in it goes to the validation part.

    Returns:
        ``(train_x, train_y, val_x, val_y)`` as four lists.
    """
    kept_x, kept_y = [], []
    held_x, held_y = [], []
    for pos, label in enumerate(trainlabel):
        if pos % 10 in number:
            bucket_x, bucket_y = held_x, held_y
        else:
            bucket_x, bucket_y = kept_x, kept_y
        bucket_x.append(traindata[pos])
        bucket_y.append(label)
    return kept_x, kept_y, held_x, held_y
def calcul_spilit(traindata, trainlabel):
    """Run ten modulo-based folds with a gradient-boosting classifier.

    For fold ``i`` the rows whose index modulo 10 equals ``i`` form the
    validation part (see ``cross_spilit``).  The classifier is configured
    from the module-level ``n_estimators``, ``subsample``, ``learning_rate``
    and ``max_depth``.  Each fold's positive-class probabilities are passed
    to ``plot_features.plot_one_feature`` and to ``auc_score``.

    Fix: the AUC is now taken from ``auc_score(prey_prob, val_y)``.  The
    previous call ``auc_score(clf, val_x, val_y)[4]`` did not match the
    ``auc_score`` defined in this file, which takes class probabilities and
    labels and returns a single number.

    Args:
        traindata: Feature rows.
        trainlabel: Labels aligned with ``traindata``.

    Returns:
        The summed ``plot_one_feature`` results divided by 1000.
    """
    clf = GradientBoostingClassifier(n_estimators=n_estimators,
                                     subsample=subsample,
                                     learning_rate=learning_rate,
                                     max_features=0.5,
                                     max_depth=max_depth)
    func_name = {"plot_ks"}
    sum_split = 0.0
    sum_auc = 0.0
    for i in range(10):
        train_x, train_y, val_x, val_y = cross_spilit(traindata, trainlabel, [i])
        clf.fit(train_x, train_y)
        prey_prob = clf.predict_proba(val_x)
        # Column 1 holds the probability of the positive class.
        prey_value = [row[1] for row in prey_prob]
        sum_split += plot_features.plot_one_feature(
            prey_value, val_y, 200, './ks/', func_name, 0,
            'classfication' + str(i), 'classfication', 0)
        sum_auc += auc_score(prey_prob, val_y)
    # NOTE(review): ten folds are summed but the total is divided by 1000,
    # not 10 -- presumably plot_one_feature reports a percentage; confirm.
    average_split = sum_split / float(1000)
    print('average_split %s' % (average_split,))
    print('average_auc %s' % (sum_auc / float(10),))
    return average_split
def get_rnn_res(file):
    """Read one float per line from ``file``.

    The file is opened with ``with`` so the handle is released even when a
    line cannot be parsed.

    Args:
        file: Path of a text file holding one number per line.

    Returns:
        The parsed values as a list of floats, in file order.

    Raises:
        ValueError: If a line is not a valid float (blank lines included).
    """
    with open(file, "r") as f:
        return [float(line.strip('\n')) for line in f]
def log_loss(test_y, predict_y):
    """Mean squared difference of ``log10(exp(y))`` between truth and prediction.

    ``log10(exp(y))`` equals ``y * log10(e)``, so the result is the mean
    squared error scaled by ``log10(e) ** 2``.  The value is now computed
    in that closed form: evaluating ``math.exp`` first raised
    ``OverflowError`` for inputs above roughly 709 and led to a math domain
    error for very negative inputs (``exp`` underflows to 0).  Zero inputs
    still contribute 0, as before.

    Args:
        test_y: True values; its length decides how many pairs are used.
        predict_y: Predicted values, indexed like ``test_y``.

    Returns:
        The summed squared differences divided by ``len(predict_y)``.

    Raises:
        ZeroDivisionError: If ``predict_y`` is empty.
    """
    scale = math.log10(math.e)
    all_loss = 0.0
    for pos, actual in enumerate(test_y):
        diff = (actual - predict_y[pos]) * scale
        all_loss += diff * diff
    return all_loss / float(len(predict_y))
def merge_rnn(train_x, train_rnn):
    """Append each value of ``train_rnn`` to the row of ``train_x`` at the same index.

    The rows are modified in place and the same ``train_x`` object is
    returned.

    Args:
        train_x: List of feature rows (each a list).
        train_rnn: Extra values, one per row; its length decides how many
            rows are extended.

    Returns:
        ``train_x`` itself.
    """
    for pos, extra in enumerate(train_rnn):
        train_x[pos].append(extra)
    return train_x
def regression(train_x, train_y, test_x, test_y, class_res):
    """Fit a gradient-boosting regressor and print its ``log_loss`` on the test set.

    Predictions whose entry in ``class_res`` is falsy are forced to 0.0
    before scoring.  The number of estimators comes from the module-level
    ``n_estimators``.

    Args:
        train_x: Training feature rows.
        train_y: Training targets.
        test_x: Test feature rows.
        test_y: Test targets.
        class_res: Per-test-row flags, indexed like the predictions.
    """
    # The width is printed twice; a (now disabled) merge step used to sit
    # between the two prints.
    print(len(train_x[0]))
    print(len(train_x[0]))
    model = GradientBoostingRegressor(
        loss='ls', learning_rate=0.1, n_estimators=n_estimators,
        subsample=1, min_samples_split=2, min_samples_leaf=1, max_depth=3,
        init=None, random_state=None, max_features=None, alpha=0.9,
        verbose=0, max_leaf_nodes=None, warm_start=False)
    model.fit(train_x, train_y)
    predicted = model.predict(test_x)
    for pos in range(len(predicted)):
        if class_res[pos]:
            continue
        predicted[pos] = 0.0
    print('sum_loss %s' % (log_loss(test_y, predicted),))
def get_split_data_file(file):
    """Load one tab-separated split file into features, targets and class flags.

    Column 0 of every line is the target; the remaining columns are
    features.  Feature positions ``remove_begin`` .. ``remove_end``
    (module-level, inclusive) are dropped, and their sum is subtracted from
    the kept feature at index 25.

    Changes from the previous version:
      * the file is opened with ``with`` so it is closed on parse errors;
      * the class flag is derived from the numeric target instead of
        comparing the raw text with ``'0.0'``, so a target written as ``0``
        is no longer flagged as non-zero.

    Args:
        file: Path of the split file.

    Returns:
        ``(train_x, train_y, train_y_class)``: feature rows, float targets,
        and 1.0/0.0 flags telling whether the target is non-zero.

    Raises:
        IndexError: If a line keeps fewer than 26 features.
    """
    train_x = []
    train_y = []
    train_y_class = []
    with open(file, "r") as f:
        for line in f:
            fields = line.strip('\n').split('\t')
            kept = []
            removed_total = 0
            for pos, raw in enumerate(fields[1:]):
                if remove_begin <= pos <= remove_end:
                    removed_total += float(raw)
                else:
                    kept.append(float(raw))
            # NOTE(review): feature 25 presumably is a total that includes
            # the removed columns -- confirm against the data layout.
            kept[25] -= removed_total
            target = float(fields[0])
            train_x.append(kept)
            train_y.append(target)
            train_y_class.append(1.0 if target != 0.0 else 0.0)
    return train_x, train_y, train_y_class
def get_split_data():
    """Load the ten split files ``./split/0`` .. ``./split/9``.

    Returns:
        ``(train_x, train_y, train_y_class)``; each is a list of ten
        per-file results as returned by ``get_split_data_file``.
    """
    parts = [get_split_data_file('./split/' + str(i)) for i in range(10)]
    train_x = [part[0] for part in parts]
    train_y = [part[1] for part in parts]
    train_y_class = [part[2] for part in parts]
    return train_x, train_y, train_y_class
def split_reg_radio(train_x, train_y, zero_radio):
    """Keep all non-zero-target rows and add zero-target rows up to a ratio.

    Rows with a truthy target are kept first.  Zero-target rows are then
    appended in ten passes (pass ``k`` takes the zero rows at positions
    ``k, k + 10, k + 20, ...``); adding stops as soon as the share of added
    zero rows among all collected rows exceeds ``zero_radio``.  The final
    share is printed.

    Args:
        train_x: Feature rows.
        train_y: Targets aligned with ``train_x``.
        zero_radio: Upper bound for the share of zero-target rows.

    Returns:
        ``(rows, targets)`` of the selected samples.

    Raises:
        ZeroDivisionError: If no row is collected at all.
    """
    picked_x, picked_y = [], []
    zero_x, zero_y = [], []
    for pos, target in enumerate(train_y):
        if target:
            picked_x.append(train_x[pos])
            picked_y.append(target)
        else:
            zero_x.append(train_x[pos])
            zero_y.append(target)

    zeros_added = 0
    for offset in range(10):
        for pos in range(offset, len(zero_x), 10):
            zeros_added += 1
            picked_x.append(zero_x[pos])
            picked_y.append(zero_y[pos])
            if zeros_added / float(len(picked_x)) > zero_radio:
                break
        # Re-check after every pass so the outer loop stops as well.
        if zeros_added / float(len(picked_x)) > zero_radio:
            break
    print('index 0 zero: %s' % (zeros_added / float(len(picked_x)),))
    return picked_x, picked_y
def split_reg(train_x, train_y):
    """Return only the rows (and targets) whose target is truthy.

    Args:
        train_x: Feature rows.
        train_y: Targets aligned with ``train_x``.

    Returns:
        ``(rows, targets)`` restricted to non-zero targets, in input order.
    """
    keep = [pos for pos, target in enumerate(train_y) if target]
    return [train_x[pos] for pos in keep], [train_y[pos] for pos in keep]
def add_block(predict_res, predict_y):
    """Add two sequences element by element.

    Args:
        predict_res: First sequence; its length decides the result length.
        predict_y: Second sequence, indexed like ``predict_res``.

    Returns:
        A new list with ``predict_res[i] + predict_y[i]`` at position ``i``.
    """
    return [value + predict_y[pos] for pos, value in enumerate(predict_res)]
def get_csv(file):
    """Load a comma-separated file with a header line into features and ids.

    The first line is skipped.  On every other line column 0 is the row id
    and the remaining columns are parsed as floats.

    Changes from the previous version: the file is opened with ``with`` so
    it is closed on parse errors, and the progress print was removed
    because it could never fire (its counter stayed at 1 after the header).

    Args:
        file: Path of the CSV file.

    Returns:
        ``(rows, ids)``: the float rows and the matching id strings.
    """
    train_x_res = []
    train_x_id = []
    with open(file, "r") as f:
        next(f, None)  # skip the header line
        for line in f:
            fields = line.strip('\n').split(',')
            train_x_res.append([float(value) for value in fields[1:]])
            train_x_id.append(fields[0])
    return train_x_res, train_x_id
def auc_score_xgb(pytestprob, ytest, pos_label=1, auc_score=True):
    """Compute the ROC AUC from one score per sample.

    The score is now always computed; previously it was only assigned when
    ``auc_score`` was true, so calling with ``auc_score=False`` failed with
    an unbound local at ``return``.

    Args:
        pytestprob: Iterable with one score per sample.
        ytest: True labels.
        pos_label: Label treated as the positive class.
        auc_score: When true, print a header line and the score.

    Returns:
        The area under the ROC curve.
    """
    fpr, tpr, _thresholds = roc_curve(ytest, list(pytestprob), pos_label=pos_label)
    scores = auc(fpr, tpr)
    if auc_score:
        print("\tauc")
        print(scores)
    return scores
def auc_score(pytestprob, ytest, pos_label=1, auc_score=True):
    """Compute the ROC AUC from per-class probability rows.

    Element ``pos_label`` of every row in ``pytestprob`` is used as the
    sample's score.  The score is now always computed; previously it was
    only assigned when ``auc_score`` was true, so calling with
    ``auc_score=False`` failed with an unbound local at ``return``.

    Args:
        pytestprob: Iterable of rows indexable by ``pos_label``.
        ytest: True labels.
        pos_label: Positive class label; also the index read from each row.
        auc_score: When true, print a header line and the score.

    Returns:
        The area under the ROC curve.
    """
    positive_scores = [row[pos_label] for row in pytestprob]
    fpr, tpr, _thresholds = roc_curve(ytest, positive_scores, pos_label=pos_label)
    scores = auc(fpr, tpr)
    if auc_score:
        print("\tauc")
        print(scores)
    return scores
def ceate_feature_map(features):
    """Write ``xgb.fmap`` in the current directory, one line per feature.

    Each line has the form ``<index>\\t<name>\\tq``.  The file is opened
    with ``with`` so it is closed even when a write fails.

    Args:
        features: Iterable of feature names.
    """
    with open('xgb.fmap', 'w') as outfile:
        for index, feat in enumerate(features):
            outfile.write('{0}\t{1}\tq\n'.format(index, feat))
def getdistrict():
    """Load ``./split_data/april_loc_all.txt`` into ``key -> [province_code, value]``.

    Lines are comma-separated.  Column 0 is the key and column 1 the
    province name, which is replaced by a float code assigned in order of
    first appearance (0.0, 1.0, ...); an empty name is treated as ``'-1'``.
    Column 4 is parsed as a float.

    Changes from the previous version: the file is opened with ``with`` so
    it is closed on parse errors, and columns 2 and 3 are no longer
    decoded, because their encoding step is disabled and the decoded text
    was never used.

    Returns:
        Dict mapping the key to ``[province_code, float(column 4)]``.
    """
    res = {}
    province_codes = {}
    with open('./split_data/april_loc_all.txt', "r") as f:
        for line in f:
            fields = line.strip('\n').split(',')
            province = fields[1].decode('utf8')
            if province == '':
                province = '-1'
            # setdefault evaluates the new code before inserting, so the
            # first province gets 0.0, the next new one 1.0, and so on.
            code = province_codes.setdefault(province, float(len(province_codes)))
            res[fields[0]] = [code, float(fields[4])]
    return res
def get_bad_new():
    """Collect the first column of ``yirendai_bad_0725.csv`` as a lookup set.

    The file is opened with ``with`` so it is closed even when reading
    fails.

    Returns:
        Dict mapping the first comma-separated field of every line to
        ``True``.
    """
    result = {}
    with open('../data/sdk_log_all/yirendai_bad_0725.csv', "r") as f:
        for line in f:
            result[line.strip('\n').split(',')[0]] = True
    return result
def get_mobile():
    """Group ``[model, price]`` pairs by brand from ``mobile_new_test.csv``.

    The header line is skipped.  Column 2 holds ``"<brand> <model...>"``;
    the brand is the text before the first space, and the model is column 2
    with the brand and all spaces removed.  Column 3 is kept as the (string)
    price.  Lines whose brand is not in the known-brand table are ignored.

    The file is opened with ``with`` so it is closed on malformed lines.

    Returns:
        Dict mapping brand to a list of ``[model, price]`` lists, in file
        order.
    """
    # Only the keys are used (membership test); the numbers presumably are
    # observed brand frequencies -- TODO confirm.
    map_process = {'HUAWEI':4488, 'OPPO':3049, 'vivo':2591, 'Xiaomi':2209, 'samsung':1342, 'Meizu':851, 'GiONEE':191, '360':124, 'LeMobile':115, 'Letv':92, 'YuLong':86, 'BBK':82, 'nubia':81, 'GIONEE':81, 'ZTE':79, 'smartisan':67, 'LENOVO':50, 'HTC':49, 'QiKU':44, 'Meitu':37}
    result = {}
    with open('../data/april/mobile_new_test.csv', "r") as f:
        next(f, None)  # skip the header line
        for line in f:
            fields = line.strip('\n').split(',')
            brand = fields[2].split(' ')[0]
            if brand not in map_process:
                continue
            model = fields[2].replace(brand, "").replace(" ", "")
            result.setdefault(brand, []).append([model, fields[3]])
    return result
def _load_app_features(path, result):
    """Add ``key -> [float, ...]`` rows from a headered CSV file to ``result``."""
    with open(path, "r") as f:
        next(f, None)  # skip the header line
        for line in f:
            fields = line.strip('\n').replace('\r', "").split(',')
            result[fields[0]] = [float(value) for value in fields[1:]]


def get_input():
    """Load the app feature rows of the train and test files into one dict.

    Both ``app_tr.txt`` and ``app_te.txt`` are read (header skipped); a key
    present in both keeps the row from the test file because it is read
    last.  The duplicated reader loop now lives in one helper that opens
    each file with ``with``, so handles are closed on parse errors.

    Returns:
        Dict mapping column 0 of every line to the remaining columns as
        floats.
    """
    result = {}
    _load_app_features('../data/all2month/sdk_env_tr_te/app_tr.txt', result)
    _load_app_features('../data/all2month/sdk_env_tr_te/app_te.txt', result)
    return result
def get_csv_app(file, remove_feature):
    """Load labelled feature rows from a comma-separated file.

    Column 0 of every line is looked up in the label table returned by
    ``get_mob6``; only lines with a label are kept.  The remaining columns
    are parsed as floats.  The label is the second comma-separated part of
    the table entry.

    Changes from the previous version: the file is opened with ``with`` so
    it is closed on parse errors, and unused bookkeeping variables were
    removed.

    Args:
        file: Path of the feature file.
        remove_feature: Unused; kept for interface compatibility.

    Returns:
        ``(rows, labels)`` for the lines that have a label.
    """
    labels = get_mob6()
    train_x_res = []
    train_y = []
    with open(file, "r") as f:
        for line in f:
            fields = line.strip('\n').split(',')
            features = [float(value) for value in fields[1:]]
            if fields[0] in labels:
                label = float(labels[fields[0]].split(',')[1])
                train_x_res.append(features)
                train_y.append(label)
    return train_x_res, train_y
def get_app():
    """Return the labelled app feature rows of the train file followed by the test file.

    Labels are loaded and concatenated as well, but only the feature rows
    are returned.
    """
    features, labels = get_csv_app('../data/all2month/sdk_env_tr_te/app_tr.txt', 1)
    more_features, more_labels = get_csv_app('../data/all2month/sdk_env_tr_te/app_te.txt', 1)
    # "+=" on a list extends it in place, exactly like extend().
    features += more_features
    labels += more_labels
    return features
def get_add_feats(file):
    """Load a header-less comma-separated feature file into a dict.

    Carriage returns are stripped from every line.  The file is opened with
    ``with`` so it is closed on parse errors.

    Args:
        file: Path of the feature file.

    Returns:
        Dict mapping column 0 of every line to the remaining columns as
        floats; later lines overwrite earlier ones with the same key.
    """
    result = {}
    with open(file, "r") as f:
        for line in f:
            fields = line.strip('\n').replace('\r', "").split(',')
            result[fields[0]] = [float(value) for value in fields[1:]]
    return result
def get_add_imei():
    """Load ``../data/new_data/imei.txt`` into ``key -> [value]``.

    Lines are tab-separated; column 0 is the key and column 1 is parsed as
    a float.  The number of lines whose value is greater than 1 is printed.
    The file is opened with ``with`` so it is closed on parse errors.

    Returns:
        Dict mapping the key to a one-element list holding the float value.
    """
    result = {}
    count = 0
    with open('../data/new_data/imei.txt', "r") as f:
        for line in f:
            fields = line.strip('\n').replace('\r', "").split('\t')
            value = float(fields[1])
            if value > 1:
                count += 1
            result[fields[0]] = [value]
    print(count)
    return result
def get_applyid():
    """Map column 0 to column 3 for every data line of ``apply_id.csv``.

    The header line is skipped and carriage returns are stripped.  The file
    is opened with ``with`` so it is closed on malformed lines.

    Returns:
        Dict mapping column 0 to column 3 (both strings).
    """
    result = {}
    with open('../data/new_data/apply_id.csv', "r") as f:
        next(f, None)  # skip the header line
        for line in f:
            fields = line.strip('\n').replace('\r', "").split(',')
            result[fields[0]] = fields[3]
    return result
def get_mob6():
    """Load the label table from ``./label_1101_all.csv``.

    The header line is skipped.  For every other line the entry
    ``result[column 3] = "<date>,<label>"`` is stored, where ``<date>`` is
    column 2 up to the first space and ``<label>`` is ``'0'`` when column 4
    is exactly ``'0'`` and ``'1'`` otherwise.  The number of entries is
    printed.

    Changes from the previous version: the file is opened with ``with`` so
    it is closed on malformed lines, and the ``label > 1`` debug print was
    removed because the binarised label can only be ``'0'`` or ``'1'``.

    Returns:
        Dict mapping column 3 to the ``"<date>,<label>"`` string.
    """
    result = {}
    with open('./label_1101_all.csv', "r") as f:
        next(f, None)  # skip the header line
        for line in f:
            fields = line.strip('\n').replace('\r', "").split(',')
            label = '0' if fields[4] == '0' else '1'
            result[fields[3]] = fields[2].split(' ')[0] + ',' + label
    print('result %s' % (len(result),))
    return result
def get_emp(num):
    """Return a list of ``num`` placeholder values, each ``-1.0``."""
    return [-1.0] * num
def jaccard_distance(w1dict, w2dict):
    """Return ``|A & B| / |A | B|`` for the element sets of the two arguments.

    Despite the name this is the Jaccard similarity: identical sets give
    1.0 and disjoint sets give 0.0.

    Args:
        w1dict: Any iterable; converted to a set.
        w2dict: Any iterable; converted to a set.

    Raises:
        ZeroDivisionError: If both arguments are empty.
    """
    left = set(w1dict)
    right = set(w2dict)
    shared = left.intersection(right)
    combined = left.union(right)
    return 1.0 * len(shared) / len(combined)
def get_csv_mob6(file, write_file):
    """Build labelled feature rows from a comma-separated SDK dump.

    For every line of ``file``:
      * the columns from index 3 up to (not including) the last three are
        the base features; positions 1, 2 and 14 within that slice are
        skipped and positions 5, 6, 9 and 10 are divided by 10000;
      * the entry of ``getdistrict()`` for column 0 is appended
        (``[-1, -1]`` when missing);
      * the entries for column 0 from ``april_hotel_all.txt`` (default
        ``[-1, -1, -1, -1]``) and ``april_bank_all.txt`` (default
        ``[-1, -1]``) are appended.

    Every assembled row is written to ``./<write_file>`` as
    ``<column 1>,<features...>``.  Only rows whose column 1 appears in the
    ``get_mob6()`` label table are returned.  The number of rows without
    hotel features is printed.

    Changes from the previous version: both files are opened with ``with``
    so they are closed on parse errors; the brand/model normalisation whose
    result was never used has been removed; the dump text is assembled with
    one join instead of repeated ``+=``.

    Args:
        file: Path of the input dump.
        write_file: File name (relative to ``./``) for the feature dump.

    Returns:
        ``(rows, labels, apply_ids)`` for the labelled lines; ``apply_ids``
        holds column 1 of each kept line.
    """
    labels = get_mob6()
    hotel_feats = get_add_feats('./split_data/april_hotel_all.txt')
    bank_feats = get_add_feats('./split_data/april_bank_all.txt')
    district = getdistrict()

    skipped_positions = (1, 2, 14)
    scaled_positions = (5, 6, 9, 10)

    train_x_res = []
    train_y = []
    apply_id_all = []
    dumped_lines = []
    add_num = 0  # rows that had no hotel features
    with open(file, "r") as f:
        for line in f:
            fields = line.strip('\n').split(',')
            row = []
            for pos, raw in enumerate(fields[3:len(fields) - 3]):
                if pos in skipped_positions:
                    continue
                value = float(raw)
                if pos in scaled_positions:
                    value = value / float(10000)
                row.append(value)

            row_key = fields[0]
            row.extend(district.get(row_key, [-1, -1]))
            if row_key in hotel_feats:
                row.extend(hotel_feats[row_key])
            else:
                row.extend([-1, -1, -1, -1])
                add_num += 1
            row.extend(bank_feats.get(row_key, [-1, -1]))

            apply_id = fields[1]
            dumped_lines.append(
                apply_id + ',' + ",".join([str(item) for item in row]) + '\n')
            if apply_id in labels:
                label = float(labels[apply_id].split(',')[1])
                train_x_res.append(row)
                train_y.append(label)
                apply_id_all.append(apply_id)

    with open('./' + write_file, 'w') as fp:
        fp.write(''.join(dumped_lines))
    print("add_num %s" % (add_num,))
    return train_x_res, train_y, apply_id_all
def get_csv_index(file,remove_feature):
    # Build feature rows and labels from a comma-separated file.
    #
    # Per line: column 1 is the label, columns from index 2 on are numeric
    # features (positions 1 and 2 of that slice are skipped), followed by a
    # phone price looked up via get_mobile(), the row from get_input() for
    # column 0 (24 x -1 when missing) and the row from get_add_feats() for
    # column 0 ([-1, -1] when missing).
    #
    # `remove_feature` is not used.  Returns (rows, labels).
    map_brand_all = get_mobile()
    input_res = get_input()
    # NOTE(review): get_add_feats is defined with a required `file`
    # parameter, so this call raises TypeError as written; the intended
    # path is not visible here -- confirm and pass it explicitly.
    add_feats_res = get_add_feats()
    map_brand = []
    map_all_brand = []
    f = open(file, "r")
    train_x_res = []
    train_y = []
    # Loaded but not used below (the district features are commented out).
    district = getdistrict()
    new_result = get_bad_new()
    flag = -1
    input_num = 0
    add_num = 0
    while True:
        data = f.readline()
        flag += 1
        if not data:
            break
        # if not flag:
        # flag += 1
        # continue
        # if not (flag % 100):
        # print flag
        datablock = data.strip('\n').split(',')
        index_value = []
        index_flag = -1
        # print data
        for ii in datablock[2:len(datablock)]:
            index_flag += 1
            # Positions 1 and 2 (columns 3 and 4) are text and are handled
            # separately below as brand / model.
            if (index_flag == 1) or (index_flag == 2):
                continue
            index_value.append(float(ii))
        # index_district = district[datablock[0]]
        # index_value.extend(index_district)
        index_mobile_price = -1
        index_mobile_score = -1
        index_mobile_comments = -1
        brand = datablock[3]
        # Model name = column 4 with the brand and all spaces removed.
        index_mobile_type = datablock[4].replace(brand,"").replace(" ","")
        # Collapse a few model variants onto the name used in the price table.
        if (index_mobile_type == "R9m") or (index_mobile_type == "R9tm"):
            index_mobile_type = "R9"
        if (index_mobile_type == "R7sm"):
            index_mobile_type = "R7"
        if (index_mobile_type == "MI5"):
            index_mobile_type = "5"
        if (index_mobile_type == "MI4LTE"):
            index_mobile_type = "4"
        if map_brand_all.has_key(brand):
            brand_all_mobile = map_brand_all[brand]
            most_similar = 0.0
            most_res = []
            # Pick the table entry containing the model name; otherwise the
            # one with the highest character-set overlap (later entries win
            # ties because of ">=").
            for i_type in brand_all_mobile:
                if (i_type[0].find(index_mobile_type)) != -1:
                    most_similar = 1
                    most_res = i_type
                    break
                else:
                    similar = jaccard_distance(i_type[0],index_mobile_type)
                    if similar >= most_similar:
                        most_similar = similar
                        most_res = i_type
            if most_similar != 1:
                map_brand.append(index_mobile_type)
            # print datablock[0],most_similar,most_res[1],index_mobile_type,"---",most_res[0]
            index_mobile_price = float(most_res[1])
            # print similar,i_type[0],index_mobile_type
        map_all_brand.append(index_mobile_type)
        # Price stays -1 when the brand is not in the price table.
        index_value.append(index_mobile_price)
        index_label = float(datablock[1])
        if input_res.has_key(datablock[0]):
            index_input = input_res[datablock[0]]
        else:
            input_num += 1
            index_input = [-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1]
        index_value.extend(index_input)
        if add_feats_res.has_key(datablock[0]):
            index_add = add_feats_res[datablock[0]]
        else:
            index_add = [-1,-1]
            add_num += 1
        index_value.extend(index_add)
        train_x_res.append(index_value)
        train_y.append(index_label)
    # print Counter(map_brand)
    # print Counter(map_brand).most_common(20)
    # print Counter(map_all_brand)
    # Counts of rows that fell back to the placeholder features.
    print "input_num",input_num
    print "add_num",add_num
    f.close()
    return train_x_res,train_y
def write_app(label, prey_prob, val_apply, file):
    """Write ``applyid,label,prob`` rows (with a header line) to ``file``.

    The text is assembled with a single join and the file is opened with
    ``with`` so it is closed even when the write fails.

    Args:
        label: Labels, indexed like ``prey_prob``.
        prey_prob: Predicted probabilities; its length decides how many
            rows are written.
        val_apply: Apply-id strings, indexed like ``prey_prob``.
        file: Destination path; overwritten if it exists.
    """
    lines = ["applyid,label,prob\n"]
    for pos in range(len(prey_prob)):
        lines.append(
            val_apply[pos] + ',' + str(label[pos]) + ',' + str(prey_prob[pos]) + '\n')
    with open(file, 'w') as fp:
        fp.write(''.join(lines))
def train_val():
    """Train a LightGBM binary model on the SDK features and dump its scores.

    Steps:
      1. load train and test rows with ``get_csv_mob6`` (which also writes
         ``feature_sdk_tr.txt`` / ``feature_sdk_te.txt``);
      2. write the feature map via ``ceate_feature_map``;
      3. train with ``lgb.train`` (the training set doubles as the
         validation set used for early stopping);
      4. print the train and test AUC and write the per-row probabilities
         to ``./sdk_prob_tr.csv`` and ``./sdk_prob_te.csv``.

    An earlier XGBoost configuration was replaced by the LightGBM setup
    below.
    """
    train_x_all, train_y_all, train_apply = get_csv_mob6(
        './train_info/part-00000', 'feature_sdk_tr.txt')
    val_x_all, val_y_all, val_apply = get_csv_mob6(
        './test_info/part-00000', 'feature_sdk_te.txt')
    print(len(train_x_all))
    print(train_x_all[0])

    ceate_feature_map([
        'operator', 'version', 'manufacturer', 'release',
        'mocklocationenabled', 'usbdebugenabled', 'cellinfo', 'memory',
        'availablememory', 'cpunum', 'cpustat', 'screenlight', 'rommemroy',
        'sdcardmemory', 'speakervolume', 'batteryinfo', 'channelid',
    ])

    # Work on shallow copies of the loaded lists.
    train_x = list(train_x_all)
    train_y = list(train_y_all)
    val_x = list(val_x_all)
    val_y = list(val_y_all)
    print("%s %s %s %s %s" % (
        len(train_x), len(train_y), len(val_x), len(val_y), len(train_x[0])))

    params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': {'l2', 'auc'},
        'num_leaves': 31,
        'learning_rate': 0.05,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'verbose': 0
    }
    lgb_train = lgb.Dataset(train_x, train_y)
    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=20,
                    valid_sets=lgb_train,
                    early_stopping_rounds=5)

    train_prob = gbm.predict(train_x, num_iteration=gbm.best_iteration)
    train_auc = auc_score_xgb(train_prob, train_y)
    print("train auc:  %s" % (train_auc,))
    write_app(train_y, train_prob, train_apply, "./sdk_prob_tr.csv")

    test_prob = gbm.predict(val_x)
    test_auc = auc_score_xgb(test_prob, val_y)
    print("test auc:  %s" % (test_auc,))
    write_app(val_y, test_prob, val_apply, "./sdk_prob_te.csv")
def main():
    """Script entry point: run ``train_val`` while the hard-coded switch is on."""
    run_training = 1
    if not run_training:
        return
    train_val()
# Run the pipeline only when this file is executed as a script.
if __name__=='__main__':
    main()
# Tags: lightgbm, xgboost
# (Web-page footer captured with the code: "latest recommended article published 2022-10-16 10:34:48")