1.sns调色板使用
# Grab seaborn's default qualitative palette (a list of RGB tuples).
pal = sns.color_palette()
2.打印文件大小
# Print the size (in MB) of every non-zip file in ../input, with the
# filenames left-aligned so the sizes line up in one column.
print('# File sizes')
for f in os.listdir('../input'):
    if 'zip' not in f:
        # str.ljust(30) pads the name with spaces on the right up to width
        # 30; if the name is already longer, it is returned unchanged.
        print(f.ljust(30) + str(round(os.path.getsize('../input/' + f) / 1000000, 2)) + 'MB')
3.Pandas 中两列合并成一个Series的方法
# Stack the two question-id columns end-to-end into one Series
# (e.g. to count how often each question id occurs overall).
qids = pd.Series(df_train['qid1'].tolist() + df_train['qid2'].tolist())
4.matplotlib中y轴的缩放
# Log-scale the y axis; clip non-positive values to a small positive
# number instead of masking them out.
# NOTE: the axis-specific 'nonposy' keyword was deprecated in Matplotlib
# 3.3 and removed in 3.5; 'nonpositive' is the current spelling.
plt.yscale('log', nonpositive='clip')
5.判断是否是数字或者大写
# str.isupper(): True if all cased characters are uppercase and there is
# at least one cased character.
isupper()
# str.isdigit(): True if the string is non-empty and every character is a
# digit.
isdigit()
6.按顺序输出GridSearchCV结果
# Print one line per GridSearchCV candidate, in evaluation order:
# rank, mean/std of the test score, and the parameter setting.
# (Iterating the parallel cv_results_ arrays directly avoids the original
# error-prone range(1, n+1) / [i-1] index juggling.)
results = cv.cv_results_
for rank, mean, std, params in zip(results['rank_test_score'],
                                   results['mean_test_score'],
                                   results['std_test_score'],
                                   results['params']):
    print("{0}. Mean validation neg log loss: {1:.3f} (std: {2:.3f}) - {3}".format(
        rank,
        mean,
        std,
        params
    ))
7.绘制ROC曲线
精确率(Precision)就是A/(A+B)(A=真正例TP,B=假正例FP) 大白话就是“你预测为正的里面,有多少是对的比例”
召回率就是A/(A+C) 大白话就是“正例里你的预测覆盖的比例”
False Positive Rate FPR(横轴):预测为正例,但实际为负例的样本,占全部真正负例的比例
True Positive Rate TPR(纵轴):预测为正例,且实际为正例的样本,占全部真正正例的比例(即召回率;ROC 纵轴是 TPR,不是 TNR)。
首先AUC值是一个概率值,当你随机挑选一个正样本以及负样本,当前的分类算法根据计算得到的Score值将这个正样本排在负样本前面的概率就是AUC值,AUC值越大,当前分类算法越有可能将正样本排在负样本前面,从而能够更好地分类。
# Plot ROC curves for logistic-regression classifiers at several inverse
# regularisation strengths C, against the random-guess diagonal.
colors = ['r', 'g', 'b', 'y', 'k', 'c', 'm', 'brown', 'r']
lw = 1
Cs = [1e-6, 1e-4, 1e0]
plt.figure(figsize=(12, 8))
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for different classifiers')
# Diagonal = a random classifier (AUC 0.5).
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
labels = []
for idx, C in enumerate(Cs):
    clf = LogisticRegression(C=C)
    clf.fit(X_train, y_train)
    print("C: {}, parameters {} and intercept {}".format(C, clf.coef_, clf.intercept_))
    # Rank by the positive-class probability (column 1 of predict_proba).
    fpr, tpr, _ = roc_curve(y_test, clf.predict_proba(X_test)[:, 1])
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, lw=lw, color=colors[idx])
    labels.append("C: {}, AUC = {}".format(C, np.round(roc_auc, 4)))
plt.legend(['random AUC = 0.5'] + labels)
8.文件目录的设置
# Input-file locations (Kaggle-style ../input layout).
BASE_DIR = '../input/'
# Pretrained Google News word2vec vectors, binary word2vec format.
EMBEDDING_FILE = BASE_DIR + 'GoogleNews-vectors-negative300.bin'
TRAIN_DATA_FILE = BASE_DIR + 'train.csv'
TEST_DATA_FILE = BASE_DIR + 'test.csv'
9.载入已训练的词向量
from gensim.models import KeyedVectors
print('Indexing word vectors')
# Load only the pretrained vectors (no training state); binary=True
# because the Google News file is in binary word2vec format.
word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, \
binary=True)
# NOTE(review): KeyedVectors.vocab was removed in gensim 4 — this line
# assumes gensim 3.x; on gensim>=4 use len(word2vec.key_to_index).
print('Found %s word vectors of word2vec' % len(word2vec.vocab))
10.去除停用词
# Optionally drop English stopwords; at this point `text` is a list of
# tokens, and it stays a list afterwards.
if remove_stopwords:
    stops = set(stopwords.words("english"))
    text = [w for w in text if w not in stops]
11.字符连接成字符串
# Re-join the token list into a single space-separated string.
text = " ".join(text)
12.提取词干
# Optionally reduce every token to its stem with the English Snowball
# stemmer, then re-join into one string.
if stem_words:
    text = text.split()
    stemmer = SnowballStemmer('english')
    stemmed_words = [stemmer.stem(word) for word in text]
    text = " ".join(stemmed_words)
13.读取NLP的csv文件
# Read the test CSV: column 0 is the row id, columns 1 and 2 are the two
# question texts, each cleaned through text_to_wordlist.
test_texts_1 = []
test_texts_2 = []
test_ids = []
with codecs.open(TEST_DATA_FILE, encoding='utf-8') as f:
    reader = csv.reader(f, delimiter=',')
    header = next(reader)  # skip the header row
    for values in reader:
        test_texts_1.append(text_to_wordlist(values[1]))
        test_texts_2.append(text_to_wordlist(values[2]))
        test_ids.append(values[0])
print('Found %s texts in test.csv' % len(test_texts_1))
13.Keras中的分词处理
# Fit ONE shared vocabulary over all four text collections so train and
# test questions map to the same integer word ids (capped at MAX_NB_WORDS
# most frequent words).
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts_1 + texts_2 + test_texts_1 + test_texts_2)
sequences_1 = tokenizer.texts_to_sequences(texts_1)
sequences_2 = tokenizer.texts_to_sequences(texts_2)
14.填充序列长度
# Pad/truncate every sequence to MAX_SEQUENCE_LENGTH so they stack into a
# single fixed-width matrix.
test_data_1 = pad_sequences(test_sequences_1, maxlen=MAX_SEQUENCE_LENGTH)
15.词向量预处理
# Build the embedding matrix: row i holds the pretrained vector for the
# word with Tokenizer index i; rows for unknown words stay all-zero.
print('Preparing embedding matrix')
# +1 because Keras Tokenizer word indices start at 1 (index 0 is unused).
nb_words = min(MAX_NB_WORDS, len(word_index)) + 1
embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for word, i in word_index.items():
    # word_index keeps the FULL vocabulary, so indices can exceed
    # nb_words; skipping them avoids an out-of-bounds write (bug in the
    # original, which indexed embedding_matrix[i] unguarded).
    if i >= nb_words:
        continue
    if word in word2vec.vocab:
        embedding_matrix[i] = word2vec.word_vec(word)
print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))
16.train,valid数据采样提取
# Shuffle the sample indices once, then cut them into train / validation
# parts according to VALIDATION_SPLIT.
perm = np.random.permutation(len(data_1))
n_train = int(len(data_1) * (1 - VALIDATION_SPLIT))
idx_train = perm[:n_train]
idx_val = perm[n_train:]
# Duplicate every training pair in both orders, (q1,q2) and (q2,q1), so
# the model sees the task symmetrically; labels are doubled to match.
data_1_train = np.vstack((data_1[idx_train], data_2[idx_train]))
data_2_train = np.vstack((data_2[idx_train], data_1[idx_train]))
labels_train = np.concatenate((labels[idx_train], labels[idx_train]))
- 17.