# coding=utf-8

import pandas as pd
import xgboost as xgb
from sklearn.metrics import f1_score

import param

############################ Define evaluation function ############################
def micro_avg_f1(preds, dtrain):
    y_true = dtrain.get_label()
    return 'micro_avg_f1', f1_score(y_true, preds, average='micro')

############################ Load features & labels ############################
df_tfidf_lr = pd.read_csv(param.data_path + '/output/feature/tfidf/lr_prob_12w.csv')
df_tfidf_bnb = pd.read_csv(param.data_path + '/output/feature/tfidf/bnb_prob_12w.csv')
df_tfidf_mnb = pd.read_csv(param.data_path + '/output/feature/tfidf/mnb_prob_12w.csv')
df_tfidf_svc = pd.read_csv(param.data_path + '/output/feature/tfidf/svc_prob_12w.csv')
df_amt = pd.read_csv(param.data_path + '/output/feature/amt/amt_12w.csv')
df_dbow_nn = pd.read_csv(param.data_path + '/output/feature/dbowd2v/nn_prob_12w.csv')
df_w2v = pd.read_csv(param.data_path + '/output/feature/w2v/w2v_12w.csv')

df_dm = pd.read_csv(param.data_path + 'dmd2v_stack_20W.csv')

df_lb = pd.read_csv(param.data_path + '/output/corpus/all_data.csv', usecols=['id', 'penalty'], nrows=param.train_num)
df_lb['penalty'] = df_lb['penalty'] - 1  # shift labels into [0, 8)

############################ xgboost ############################
tr_num = param.cv_train_num
df_sub = pd.DataFrame()
df_sub['id'] = df_lb.iloc[tr_num:]['id']
seed = param.seed

n_trees = 10000
esr = 200
evals = 20

df = pd.concat([df_tfidf_lr, df_tfidf_bnb, df_tfidf_mnb, df_amt, df_dbow_nn, df_w2v], axis=1)
print(df.columns)
num_class = len(pd.value_counts(df_lb['penalty']))
x = df.iloc[:tr_num]
y = df_lb['penalty'][:tr_num]
x_te = df.iloc[tr_num:]
y_te = df_lb['penalty'][tr_num:]

max_depth = 7
min_child_weight = 1
subsample = 0.8
colsample_bytree = 0.8
gamma = 1
lam = 0

params = {
    'objective': 'multi:softmax',
    'booster': 'gbtree',
    'stratified': True,
    'num_class': num_class,
    'max_depth': max_depth,
    'min_child_weight': min_child_weight,
    'subsample': subsample,
    'colsample_bytree': colsample_bytree,
    # 'gamma': gamma,
    # 'lambda': lam,
    'eta': 0.02,
    'silent': 1,
    'seed': seed,
}

dtrain = xgb.DMatrix(x, y)
dvalid = xgb.DMatrix(x_te, y_te)
watchlist = [(dtrain, 'train'), (dvalid, 'test')]
bst = xgb.train(params, dtrain, n_trees, evals=watchlist, feval=micro_avg_f1, maximize=True,
                early_stopping_rounds=esr, verbose_eval=evals)
df_sub['penalty'] = (bst.predict(dvalid) + 1).astype(int)

df_sub['id'] = df_sub['id'].astype(str)
df_sub['laws'] = [[1]] * len(df_sub)
df_sub.to_json(param.data_path + '/output/result/val/1209-xgb-tfidf_lr_bnb_mnb+amt+dbow_nn+w2v.json', orient='records', lines=True)
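A note on prediction with early stopping: older xgboost builds keep every boosted round in the model, so bst.predict(dvalid) above uses all trees even when training stopped early. A hedged sketch, relying on xgboost's documented best_ntree_limit attribute, restricts prediction to the best round:

# Sketch: re-predict with only the trees up to the best early-stopping round.
# best_ntree_limit is set on the booster when early_stopping_rounds fires.
if hasattr(bst, 'best_ntree_limit'):
    df_sub['penalty'] = (bst.predict(dvalid, ntree_limit=bst.best_ntree_limit) + 1).astype(int)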

# coding=utf-8

import time

def log(stri):
    now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    print(str(now) + ' ' + str(stri))
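For reference, log() just prefixes a timestamp to whatever it is given; a minimal usage sketch (the printed timestamp is illustrative):

import util

util.log('Loading data...')
# prints, e.g.: 2017-12-09 10:00:00 Loading data...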

# coding=utf-8
from collections import defaultdict

import pandas as pd
import param
import util
from gensim.models import Word2Vec

############################ Load data ############################
df_all = pd.read_csv(param.data_path + '/output/corpus/all_data.csv', encoding='utf8', nrows=param.train_num)
df_all['penalty'] = df_all['penalty'] - 1

############################ w2v ############################
documents = df_all['content'].values
util.log('documents number %d' % len(documents))

texts = [[word for word in document.split(' ')] for document in documents]
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1
texts = [[token for token in text if frequency[token] >= 5] for text in texts]

util.log('Train Model...')
w2v = Word2Vec(texts, size=param.w2v_dim, window=5, iter=15, workers=12, seed=param.seed)
w2v.save(param.data_path + '/output/model/w2v_12w.model')
util.log('Save done!')
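As a quick sanity check of the saved embeddings, a minimal sketch; the query token below is a placeholder, and any word that survived the frequency filter works:

from gensim.models import Word2Vec

import param

w2v = Word2Vec.load(param.data_path + '/output/model/w2v_12w.model')
for word, sim in w2v.most_similar(u'罚金', topn=5):  # u'罚金' is a hypothetical query word
    print(word, sim)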

# coding=utf-8

import codecs
import subprocess
from collections import namedtuple

import numpy as np
import pandas as pd
from gensim.models import Doc2Vec
from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import LogisticRegression

import param
import util

############################ Load data ############################
df_all = pd.read_csv(param.data_path + '/output/corpus/all_data.csv', encoding='utf8', nrows=param.train_num).reset_index()
df_all['penalty'] = df_all['penalty'] - 1

############################ Define functions, classes & variables ############################
def run_cmd(cmd):
    print(cmd)
    process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    for t, line in enumerate(iter(process.stdout.readline, b'')):
        line = line.decode('utf8').rstrip()
        print(line)
    process.communicate()
    return process.returncode

SentimentDocument = namedtuple('SentimentDocument', 'words tags')

class Doc_list(object):
    def __init__(self, f):
        self.f = f

    def __iter__(self):
        for i, line in enumerate(codecs.open(self.f, encoding='utf8')):
            words = line.strip().split(' ')
            tags = [int(words[0][2:])]
            words = words[1:]
            yield SentimentDocument(words, tags)

############################ Prepare data ############################
doc_f = codecs.open(param.data_path + '/output/corpus/doc_for_d2v_12w.txt', 'w', encoding='utf8')
for i, contents in enumerate(df_all.iloc[:param.train_num]['content']):
    words = []
    for word in contents.split(' '):
        words.append(word)
    tags = [i]
    if i % 10000 == 0:
        util.log('iter = %d' % i)
    doc_f.write(u'_*{} {}\n'.format(i, ' '.join(words)))
doc_f.close()

############################ dbow d2v ############################
d2v = Doc2Vec(dm=0, size=300, negative=5, hs=0, min_count=3, window=30, sample=1e-5, workers=8, alpha=0.025, min_alpha=0.025)
doc_list = Doc_list(param.data_path + '/output/corpus/doc_for_d2v_12w.txt')
d2v.build_vocab(doc_list)

df_lb = df_all['penalty']

for i in range(5):
    util.log('pass: ' + str(i))
    # run_cmd('shuf alldata-id.txt > alldata-id-shuf.txt')
    doc_list = Doc_list(param.data_path + '/output/corpus/doc_for_d2v_12w.txt')
    d2v.train(doc_list, total_examples=d2v.corpus_count, epochs=d2v.iter)
    X_d2v = np.array([d2v.docvecs[i] for i in range(param.train_num)])
    scores = cross_val_score(LogisticRegression(C=3), X_d2v, df_lb, cv=5)
    util.log('dbow: ' + str(scores) + ' ' + str(np.mean(scores)))
d2v.save(param.data_path + '/output/model/dbow_d2v_12w.model')
util.log('Save done!')

############################ dm d2v ############################
d2v = Doc2Vec(dm=1, size=300, negative=5, hs=0, min_count=3, window=30, sample=1e-5, workers=8, alpha=0.025, min_alpha=0.025)
doc_list = Doc_list(param.data_path + '/output/corpus/doc_for_d2v_12w.txt')
d2v.build_vocab(doc_list)

df_lb = df_all['penalty']

for i in range(10):
    util.log('pass: ' + str(i))
    # run_cmd('shuf alldata-id.txt > alldata-id-shuf.txt')
    doc_list = Doc_list(param.data_path + '/output/corpus/doc_for_d2v_12w.txt')
    d2v.train(doc_list, total_examples=d2v.corpus_count, epochs=d2v.iter)
    X_d2v = np.array([d2v.docvecs[i] for i in range(param.train_num)])
    scores = cross_val_score(LogisticRegression(C=3), X_d2v, df_lb, cv=5)
    util.log('dm: ' + str(scores) + ' ' + str(np.mean(scores)))
d2v.save(param.data_path + '/output/model/dm_d2v_12w.model')
util.log('Save done!')
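To reuse the saved paragraph vectors downstream, a minimal sketch, assuming the DBOW model saved above (each document was tagged with its integer row index):

from gensim.models import Doc2Vec

import param

d2v = Doc2Vec.load(param.data_path + '/output/model/dbow_d2v_12w.model')
vec = d2v.docvecs[0]  # vector of the first training document
print(vec.shape)      # expected: (300,)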

# coding=utf-8

import codecs

import jieba
import jieba.analyse
import jieba.posseg
import pandas as pd

import param
import util

############################ Define the word segmentation function ############################
def split_word(text, stopwords):
    word_list = jieba.cut(text)
    start = True
    result = ''
    for word in word_list:
        word = word.strip()
        if word not in stopwords:
            if start:
                result = word
                start = False
            else:
                result += ' ' + word
    return result.encode('utf-8')

############################ Load stopwords ############################
stopwords = {}
for line in codecs.open(param.data_path + '/input/stop.txt', 'r', 'utf-8'):
    stopwords[line.rstrip()] = 1

############################ Load data & segment words ############################
df_tr = []
for i, line in enumerate(open(param.data_path + '/input/train.txt')):
    if i % 1000 == 1:
        util.log('iter = %d' % i)
    segs = line.split('\t')
    row = {}
    row['id'] = segs[0]
    row['content'] = split_word(segs[1].strip(), stopwords)
    row['penalty'] = segs[2]
    row['laws'] = segs[3].strip()
    df_tr.append(row)
df_tr = pd.DataFrame(df_tr)

df_te = []
for i, line in enumerate(open(param.data_path + '/input/test.txt')):
    if i % 1000 == 1:
        util.log('iter = %d' % i)
    segs = line.split('\t')
    row = {}
    row['id'] = segs[0]
    row['content'] = split_word(segs[1].strip(), stopwords)
    df_te.append(row)
df_te = pd.DataFrame(df_te)

print(df_tr.shape)
print(df_te.shape)

############################ Write out data ############################
df_all = pd.concat([df_tr, df_te]).fillna(0)
df_all.to_csv(param.data_path + '/output/corpus/all_data.csv', index=None)
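A quick illustration of split_word, assuming it (and its jieba import) is available from the script above; the sentence and the tiny stopword set are made up:

# -*- coding: utf-8 -*-
demo_stopwords = {u'的': 1, u'了': 1}
print(split_word(u'被告人犯盗窃罪,判处罚金人民币五千元', demo_stopwords))
# prints the jieba tokens joined by spaces, stopwords removed (a UTF-8 byte string)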

# coding=utf-8

data_path = '../data'

cv_train_num = 100000  # number of rows used as the training split for cross-validation

train_num = 120000
test_num = 90000

w2v_dim = 300

seed = 2017

# coding=utf-8

from collections import defaultdict

import numpy as np
import pandas as pd
import param
import util
from gensim.models import Word2Vec

############################ Load data & model ############################
df_all = pd.read_csv(param.data_path + '/output/corpus/all_data.csv', encoding='utf8', nrows=param.train_num)
df_all['penalty'] = df_all['penalty'] - 1
documents = df_all['content'].values
texts = [[word for word in document.split(' ')] for document in documents]
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1
texts = [[token for token in text if frequency[token] >= 5] for text in texts]

model = Word2Vec.load(param.data_path + '/output/model/w2v_12w.model')

############################ w2v ############################
util.log('Start get w2v feat...')
w2v_feat = np.zeros((len(texts), param.w2v_dim))
w2v_feat_avg = np.zeros((len(texts), param.w2v_dim))
i = 0
for line in texts:
    num = 0
    for word in line:
        num += 1
        vec = model[word]
        w2v_feat[i, :] += vec
    w2v_feat_avg[i, :] = w2v_feat[i, :] / num
    i += 1
    if i % 1000 == 0:
        util.log(i)

pd.DataFrame(w2v_feat).to_csv(param.data_path + '/output/feature/w2v/w2v_12w.csv', encoding='utf8', index=None)
pd.DataFrame(w2v_feat_avg).to_csv(param.data_path + '/output/feature/w2v/w2v_avg_12w.csv', encoding='utf8', index=None)
util.log('Save w2v and w2v_avg feat done!')
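A hedged shape check on the written features, assuming the script above has completed:

import pandas as pd

import param

feat_avg = pd.read_csv(param.data_path + '/output/feature/w2v/w2v_avg_12w.csv')
print(feat_avg.shape)  # expected: (param.train_num, param.w2v_dim), i.e. (120000, 300)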

# coding=utf-8

import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.cross_validation import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.naive_bayes import BernoulliNB, MultinomialNB

import param
import util

############################ Define evaluation function ############################
def micro_avg_f1(y_true, y_pred):
    return f1_score(y_true, y_pred, average='micro')

############################ Load data ############################
df_all = pd.read_csv(param.data_path + '/output/corpus/all_data.csv', encoding='utf8', nrows=param.train_num)
df_all['penalty'] = df_all['penalty'] - 1

############################ tfidf ############################
tfv = TfidfVectorizer(min_df=3, max_df=0.95, sublinear_tf=True)
x_sp = tfv.fit_transform(df_all['content'])

############################ lr stack ############################
tr_num = param.cv_train_num
num_class = len(pd.value_counts(df_all['penalty']))
n = 5

x = x_sp[:tr_num]
y = df_all['penalty'][:tr_num]
x_te = x_sp[tr_num:]
y_te = df_all['penalty'][tr_num:]

stack = np.zeros((x.shape[0], num_class))
stack_te = np.zeros((x_te.shape[0], num_class))

score_va = 0
score_te = 0
for i, (tr, va) in enumerate(StratifiedKFold(y, n_folds=n, random_state=param.seed)):
    util.log('stack:%d/%d' % ((i + 1), n))
    clf = LogisticRegression(C=2)
    clf.fit(x[tr], y[tr])
    y_pred_va = clf.predict_proba(x[va])
    y_pred_te = clf.predict_proba(x_te)
    util.log('va acc:%f' % micro_avg_f1(y[va], clf.predict(x[va])))
    util.log('te acc:%f' % micro_avg_f1(y_te, clf.predict(x_te)))
    score_va += micro_avg_f1(y[va], clf.predict(x[va]))
    score_te += micro_avg_f1(y_te, clf.predict(x_te))
    stack[va] += y_pred_va
    stack_te += y_pred_te
score_va /= n
score_te /= n
util.log('va avg acc:%f' % score_va)
util.log('te avg acc:%f' % score_te)
stack_te /= n
stack_all = np.vstack([stack, stack_te])
df_stack = pd.DataFrame(index=range(len(df_all)))
for i in range(stack_all.shape[1]):
    df_stack['tfidf_lr_{}'.format(i)] = stack_all[:, i]

df_stack.to_csv(param.data_path + '/output/feature/tfidf/lr_prob_12w.csv', index=None, encoding='utf8')

############################ bnb stack ############################
tr_num = param.cv_train_num
num_class = len(pd.value_counts(df_all['penalty']))
n = 5

x = x_sp[:tr_num]
y = df_all['penalty'][:tr_num]
x_te = x_sp[tr_num:]
y_te = df_all['penalty'][tr_num:]

stack = np.zeros((x.shape[0], num_class))
stack_te = np.zeros((x_te.shape[0], num_class))

score_va = 0
score_te = 0
for i, (tr, va) in enumerate(StratifiedKFold(y, n_folds=n, random_state=param.seed)):
    util.log('stack:%d/%d' % ((i + 1), n))
    clf = BernoulliNB()
    clf.fit(x[tr], y[tr])
    y_pred_va = clf.predict_proba(x[va])
    y_pred_te = clf.predict_proba(x_te)
    util.log('va acc:%f' % micro_avg_f1(y[va], clf.predict(x[va])))
    util.log('te acc:%f' % micro_avg_f1(y_te, clf.predict(x_te)))
    score_va += micro_avg_f1(y[va], clf.predict(x[va]))
    score_te += micro_avg_f1(y_te, clf.predict(x_te))
    stack[va] += y_pred_va
    stack_te += y_pred_te
score_va /= n
score_te /= n
util.log('va avg acc:%f' % score_va)
util.log('te avg acc:%f' % score_te)
stack_te /= n
stack_all = np.vstack([stack, stack_te])
df_stack = pd.DataFrame(index=range(len(df_all)))
for i in range(stack_all.shape[1]):
    df_stack['tfidf_bnb_{}'.format(i)] = stack_all[:, i]

df_stack.to_csv(param.data_path + '/output/feature/tfidf/bnb_prob_12w.csv', index=None, encoding='utf8')

############################ mnb stack ############################
tr_num = param.cv_train_num
num_class = len(pd.value_counts(df_all['penalty']))
n = 5

x = x_sp[:tr_num]
y = df_all['penalty'][:tr_num]
x_te = x_sp[tr_num:]
y_te = df_all['penalty'][tr_num:]

stack = np.zeros((x.shape[0], num_class))
stack_te = np.zeros((x_te.shape[0], num_class))

score_va = 0
score_te = 0
for i, (tr, va) in enumerate(StratifiedKFold(y, n_folds=n, random_state=param.seed)):
    util.log('stack:%d/%d' % ((i + 1), n))
    clf = MultinomialNB()
    clf.fit(x[tr], y[tr])
    y_pred_va = clf.predict_proba(x[va])
    y_pred_te = clf.predict_proba(x_te)
    util.log('va acc:%f' % micro_avg_f1(y[va], clf.predict(x[va])))
    util.log('te acc:%f' % micro_avg_f1(y_te, clf.predict(x_te)))
    score_va += micro_avg_f1(y[va], clf.predict(x[va]))
    score_te += micro_avg_f1(y_te, clf.predict(x_te))
    stack[va] += y_pred_va
    stack_te += y_pred_te
score_va /= n
score_te /= n
util.log('va avg acc:%f' % score_va)
util.log('te avg acc:%f' % score_te)
stack_te /= n
stack_all = np.vstack([stack, stack_te])
df_stack = pd.DataFrame(index=range(len(df_all)))
for i in range(stack_all.shape[1]):
    df_stack['tfidf_mnb_{}'.format(i)] = stack_all[:, i]

df_stack.to_csv(param.data_path + '/output/feature/tfidf/mnb_prob_12w.csv', index=None, encoding='utf8')

############################ svc stack ############################
tr_num = param.cv_train_num
num_class = len(pd.value_counts(df_all['penalty']))
n = 5

x = x_sp[:tr_num]
y = df_all['penalty'][:tr_num]
x_te = x_sp[tr_num:]
y_te = df_all['penalty'][tr_num:]

stack = np.zeros((x.shape[0], num_class))
stack_te = np.zeros((x_te.shape[0], num_class))

score_va = 0
score_te = 0
for i, (tr, va) in enumerate(StratifiedKFold(y, n_folds=n, random_state=param.seed)):
    util.log('stack:%d/%d' % ((i + 1), n))
    clf = svm.LinearSVC(loss='hinge', tol=0.000001, C=0.5, verbose=1, random_state=param.seed, max_iter=5000)
    clf.fit(x[tr], y[tr])
    y_pred_va = clf.decision_function(x[va])
    y_pred_te = clf.decision_function(x_te)
    util.log('va acc:%f' % micro_avg_f1(y[va], clf.predict(x[va])))
    util.log('te acc:%f' % micro_avg_f1(y_te, clf.predict(x_te)))
    score_va += micro_avg_f1(y[va], clf.predict(x[va]))
    score_te += micro_avg_f1(y_te, clf.predict(x_te))
    stack[va] += y_pred_va
    stack_te += y_pred_te
score_va /= n
score_te /= n
util.log('va avg acc:%f' % score_va)
util.log('te avg acc:%f' % score_te)
stack_te /= n
stack_all = np.vstack([stack, stack_te])
df_stack = pd.DataFrame(index=range(len(df_all)))
for i in range(stack_all.shape[1]):
    df_stack['tfidf_svc_{}'.format(i)] = stack_all[:, i]

df_stack.to_csv(param.data_path + '/output/feature/tfidf/svc_prob_12w.csv', index=None, encoding='utf8')
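The four blocks above repeat one out-of-fold stacking pattern: each fold's held-out predictions fill the train-side matrix, and the test-side matrix averages the n fold models. A hedged refactoring sketch; the helper name and signature are mine, not part of the original code, and it relies on the same old-style StratifiedKFold API used above:

def oof_stack(make_clf, x, y, x_te, n, num_class, prefix,
              predict=lambda c, d: c.predict_proba(d)):
    # stack[va] collects held-out predictions; stack_te averages the fold models.
    stack = np.zeros((x.shape[0], num_class))
    stack_te = np.zeros((x_te.shape[0], num_class))
    for tr, va in StratifiedKFold(y, n_folds=n, random_state=param.seed):
        clf = make_clf()
        clf.fit(x[tr], y[tr])
        stack[va] += predict(clf, x[va])
        stack_te += predict(clf, x_te)
    stack_te /= n
    cols = ['%s_%d' % (prefix, j) for j in range(num_class)]
    return pd.DataFrame(np.vstack([stack, stack_te]), columns=cols)

# e.g. the lr block would reduce to:
# df_stack = oof_stack(lambda: LogisticRegression(C=2), x, y, x_te, n, num_class, 'tfidf_lr')
# and the svc block would pass predict=lambda c, d: c.decision_function(d)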

# coding=utf-8

import numpy as np
import pandas as pd
from gensim.models import Doc2Vec
from keras.layers.core import Dense, Dropout, Activation
from keras.models import Sequential
from keras.utils import np_utils
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import f1_score

import param
import util

############################ Define evaluation function ############################
def micro_avg_f1(y_true, y_pred):
    return f1_score(y_true, y_pred, average='micro')

############################ Load data ############################
df_all = pd.read_csv(param.data_path + '/output/corpus/all_data.csv', encoding='utf8', nrows=param.train_num)
df_all['penalty'] = df_all['penalty'] - 1

model = Doc2Vec.load(param.data_path + '/output/model/dm_d2v_12w.model')
x_sp = np.array([model.docvecs[i] for i in range(param.train_num)])

############################ dmd2v stack ############################
np.random.seed(param.seed)  # fix the seed for reproducibility
df_stack = pd.DataFrame(index=range(len(df_all)))
tr_num = param.cv_train_num
num_class = len(pd.value_counts(df_all['penalty']))
n = 5

x = x_sp[:tr_num]
y = df_all['penalty'][:tr_num]
x_te = x_sp[tr_num:]
y_te = df_all['penalty'][tr_num:]

feat = 'dmd2v'
stack = np.zeros((x.shape[0], num_class))
stack_te = np.zeros((x_te.shape[0], num_class))

score_va = 0
score_te = 0
for i, (tr, va) in enumerate(StratifiedKFold(y, n_folds=n, random_state=param.seed)):
    util.log('stack:%d/%d' % ((i + 1), n))
    y_train = np_utils.to_categorical(y[tr], num_class)
    y_test = np_utils.to_categorical(y_te, num_class)
    model = Sequential()
    model.add(Dense(300, input_shape=(x[tr].shape[1],)))
    model.add(Dropout(0.1))
    model.add(Activation('tanh'))
    model.add(Dense(num_class))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer='adadelta',
                  metrics=['accuracy'])
    history = model.fit(x[tr], y_train, shuffle=True,
                        batch_size=128, nb_epoch=35,
                        verbose=2, validation_data=(x_te, y_test))
    y_pred_va = model.predict_proba(x[va])
    y_pred_te = model.predict_proba(x_te)
    util.log('va acc:%f' % micro_avg_f1(y[va], model.predict_classes(x[va])))
    util.log('te acc:%f' % micro_avg_f1(y_te, model.predict_classes(x_te)))
    score_va += micro_avg_f1(y[va], model.predict_classes(x[va]))
    score_te += micro_avg_f1(y_te, model.predict_classes(x_te))
    stack[va] += y_pred_va
    stack_te += y_pred_te
score_va /= n
score_te /= n
util.log('va avg acc:%f' % score_va)
util.log('te avg acc:%f' % score_te)
stack_te /= n
stack_all = np.vstack([stack, stack_te])
for l in range(stack_all.shape[1]):
    df_stack['{}_{}'.format(feat, l)] = stack_all[:, l]

df_stack.to_csv(param.data_path + '/output/feature/dmd2v/nn_prob_12w.csv', encoding='utf8', index=None)
util.log('Save dmd2v stack done!')

# coding=utf-8

import numpy as np
import pandas as pd
from gensim.models import Doc2Vec
from keras.layers.core import Dense, Dropout, Activation
from keras.models import Sequential
from keras.utils import np_utils
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import f1_score

import param
import util

############################ Define evaluation function ############################
def micro_avg_f1(y_true, y_pred):
    return f1_score(y_true, y_pred, average='micro')

############################ Load data ############################
df_all = pd.read_csv(param.data_path + '/output/corpus/all_data.csv', encoding='utf8', nrows=param.train_num)
df_all['penalty'] = df_all['penalty'] - 1

model = Doc2Vec.load(param.data_path + '/output/model/dbow_d2v_12w.model')
x_sp = np.array([model.docvecs[i] for i in range(param.train_num)])

############################ dbowd2v stack ############################
np.random.seed(param.seed)  # fix the seed for reproducibility
df_stack = pd.DataFrame(index=range(len(df_all)))
tr_num = param.cv_train_num
num_class = len(pd.value_counts(df_all['penalty']))
n = 5

x = x_sp[:tr_num]
y = df_all['penalty'][:tr_num]
x_te = x_sp[tr_num:]
y_te = df_all['penalty'][tr_num:]

feat = 'dbowd2v'
stack = np.zeros((x.shape[0], num_class))
stack_te = np.zeros((x_te.shape[0], num_class))

score_va = 0
score_te = 0
for i, (tr, va) in enumerate(StratifiedKFold(y, n_folds=n, random_state=param.seed)):
    util.log('stack:%d/%d' % ((i + 1), n))
    y_train = np_utils.to_categorical(y[tr], num_class)
    y_test = np_utils.to_categorical(y_te, num_class)
    model = Sequential()
    model.add(Dense(300, input_shape=(x[tr].shape[1],)))
    model.add(Dropout(0.1))
    model.add(Activation('tanh'))
    model.add(Dense(num_class))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer='adadelta',
                  metrics=['accuracy'])
    history = model.fit(x[tr], y_train, shuffle=True,
                        batch_size=128, nb_epoch=35,
                        verbose=2, validation_data=(x_te, y_test))
    y_pred_va = model.predict_proba(x[va])
    y_pred_te = model.predict_proba(x_te)
    util.log('va acc:%f' % micro_avg_f1(y[va], model.predict_classes(x[va])))
    util.log('te acc:%f' % micro_avg_f1(y_te, model.predict_classes(x_te)))
    score_va += micro_avg_f1(y[va], model.predict_classes(x[va]))
    score_te += micro_avg_f1(y_te, model.predict_classes(x_te))
    stack[va] += y_pred_va
    stack_te += y_pred_te
score_va /= n
score_te /= n
util.log('va avg acc:%f' % score_va)
util.log('te avg acc:%f' % score_te)
stack_te /= n
stack_all = np.vstack([stack, stack_te])
for l in range(stack_all.shape[1]):
    df_stack['{}_{}'.format(feat, l)] = stack_all[:, l]

df_stack.to_csv(param.data_path + '/output/feature/dbowd2v/nn_prob_12w.csv', encoding='utf8', index=None)
util.log('Save dbowd2v stack done!')

# coding=utf-8

import re

import numpy as np
import pandas as pd

import param
import util

df_tr = []
util.log('For train.txt:')
for i, line in enumerate(open(param.data_path + '/input/train.txt')):
    if i % 1000 == 1:
        util.log('iter = %d' % i)
    segs = line.split('\t')
    row = {}
    row['id'] = segs[0]
    row['raw_content'] = segs[1].strip()
    df_tr.append(row)
df_tr = pd.DataFrame(df_tr)

df_te = []
util.log('For test.txt:')
for i, line in enumerate(open(param.data_path + '/input/test.txt')):
    if i % 1000 == 1:
        util.log('iter = %d' % i)
    segs = line.split('\t')
    row = {}
    row['id'] = segs[0]
    row['raw_content'] = segs[1].strip()
    df_te.append(row)
df_te = pd.DataFrame(df_te)

df_all = pd.concat([df_tr, df_te]).reset_index(drop=True)

amt_list = []
for i, row in df_all.iterrows():
    if i % 1000 == 1:
        util.log('iter = %d' % i)
    amt = re.findall(u'(\d*\.?\d+)元', row['raw_content'].decode('utf8'))
    amt_tt = re.findall(u'(\d*\.?\d+)万元', row['raw_content'].decode('utf8'))
    for a in amt:
        amt_list.append([row['id'], float(a)])
    for a in amt_tt:
        amt_list.append([row['id'], float(a) * 10000])
amt_feat = pd.DataFrame(amt_list, columns=['id', 'amount'])
amt_feat = amt_feat.groupby('id')['amount'].agg([sum, min, max, np.ptp, np.mean, np.std]).reset_index()
amt_feat = pd.merge(df_all, amt_feat, how='left', on='id').drop(['id', 'raw_content'], axis=1)
amt_feat.columns = ['amt_' + i for i in amt_feat.columns]

amt_feat.to_csv(param.data_path + '/output/feature/amt/amt_21w.csv', index=None, encoding='utf8')
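To see what the two patterns capture, a small check on a made-up sentence:

# -*- coding: utf-8 -*-
import re

text = u'判处罚金人民币5000元,并处罚金2.5万元'
print(re.findall(u'(\d*\.?\d+)元', text))    # [u'5000']
print(re.findall(u'(\d*\.?\d+)万元', text))  # [u'2.5']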
