这个短文本情感分类任务,一开始照着别人的 baseline 做了一次,用的是 LSTM,线上得分 0.82 左右;后来自己改用了双向的模型,也就是 Bi-LSTM。
代码如下:
import numpy as np
import pandas as pd
import re
# Read the training and test CSV files.
train = pd.read_csv("C:/Users/Nicht_sehen/Desktop/train.csv", lineterminator='\n')
test = pd.read_csv("C:/Users/Nicht_sehen/Desktop/test.csv", lineterminator='\n')
# Recode the string labels as integers: Negative -> 0, Positive -> 1.
label_codes = {'Negative': 0, 'Positive': 1}
train['label'] = train['label'].map(label_codes)
# Strip punctuation, digits and other symbols that carry no lexical meaning.
def clean(string):
    """Remove or blank out non-word characters and lower-case the text.

    The substitutions are applied strictly in the order listed, because some
    of them are anchored to the start of the string and depend on what the
    earlier ones produced.

    Args:
        string: Raw text (a single token or a whole sentence).

    Returns:
        The cleaned, lower-cased text. Separators such as ``,`` ``-`` ``_``
        ``!`` ``(`` ``)`` ``?`` become spaces; quotes, tabs, newlines, digits,
        dots and backslashes are deleted.
    """
    rules = (
        (r"\'", ""),
        (r",", " "),
        (r"\t", ""),
        (r"\n", ""),
        (r"\d", ""),
        (r"\.", ""),
        (r"\\", ""),
        (r"\-", " "),
        # Drop a leading underscore together with the character after it.
        (r"^_.", ""),
        # Drop one leading space (e.g. left behind by a leading ',' or '-').
        (r"^ ", ""),
        (r"_", " "),
        (r"!", " "),
        (r"\(", " "),
        (r"\)", " "),
        (r"\?", " "),
    )
    # Four substitutions from the earlier version are intentionally absent
    # because they could never change the string: the two "\x..." patterns
    # ran after every backslash had been deleted, the "#." pattern ran after
    # every dot had been deleted, and the trailing-space rule replaced a
    # space with a space.
    for pattern, replacement in rules:
        string = re.sub(pattern, replacement, string)
    return string.lower()
def hashing(word):
    """Rewrite a word into a canonical spelling via a fixed chain of regexes.

    The rules collapse repeated letters and rewrite certain vowel/consonant
    combinations (presumably so that different spellings of the same word end
    up as one token; the rule set itself is taken as given).

    Args:
        word: A lower-cased token. The empty string is allowed.

    Returns:
        The rewritten token.
    """
    # Unconditional rewrites; order matters because later patterns see the
    # output of earlier ones.
    for pattern, replacement in (
        (r'ain$', r'ein'),
        (r'ai', r'ae'),
        (r'ay$', r'e'),
        (r'ey$', r'e'),
        (r'ie$', r'y'),
        (r'^es', r'is'),
        (r'a+', r'a'),
        (r'j+', r'j'),
        (r'd+', r'd'),
        (r'u', r'o'),
        (r'o+', r'o'),
        (r'ee+', r'i'),
    ):
        word = re.sub(pattern, replacement, word)
    # Shorten "ar" to "r" unless the word itself starts with "ar".
    if not re.match(r'ar', word):
        word = re.sub(r'ar', r'r', word)
    word = re.sub(r'iy+', r'i', word)
    word = re.sub(r'ih+', r'eh', word)
    word = re.sub(r's+', r's', word)
    # Inspect the actual word here: the previous code searched the string
    # literal 'word', so this rule could never fire. endswith() is also safe
    # on an empty word, unlike indexing word[-1].
    if re.search(r'[rst]y', word) and not word.endswith('y'):
        word = re.sub(r'y', r'i', word)
    if re.search(r'[bcdefghijklmnopqrtuvwxyz]i', word):
        word = re.sub(r'i$', r'y', word)
    if re.search(r'[acefghijlmnoqrstuvwxyz]h', word):
        word = re.sub(r'h', '', word)
    word = re.sub(r'k', r'q', word)
    return word
# Second cleaning pass: normalise a sentence token by token.
def delk(sentence):
    """Clean and hash every space-separated token of a sentence.

    Args:
        sentence: Raw review text.

    Returns:
        The processed tokens, each preceded by a single space (so a non-empty
        result starts with a space). Returns '' when the sentence contains no
        tokens.
    """
    # split(' ') yields '' (never ' ') for consecutive, leading or trailing
    # spaces, so empty tokens are what has to be skipped here.
    tokens = (word for word in sentence.split(' ') if word)
    # One join instead of repeated string concatenation.
    return ''.join(' ' + hashing(clean(word)) for word in tokens)
# Run every review through delk() and store the result back in place.
train['review'] = pd.DataFrame(list(map(delk, train['review'])))
test['review'] = pd.DataFrame(list(map(delk, test['review'])))
print('data clean end')
# Slice out the model inputs by position: column 1 for the text and the last
# column of train for the label (assumes the review text is column 1 in both
# files; confirm against the CSV headers).
temp_test = test.values
X_train = train.values[:, 1]
X_test = test.values[:, 1]
Y_train = train.values[:, -1]
print('data slide end')
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional
from keras import regularizers
max_features = 3000  # number of rows in the embedding table
batch_size = 32
# Tokenise train and test together so both are indexed with one vocabulary.
X_all = np.concatenate([X_train, X_test])
len_train = len(X_train)
# NOTE(review): the tokenizer is capped at 2500 words while the embedding is
# sized with max_features=3000. This runs because the embedding is the larger
# of the two, but the limits look like they were meant to match; confirm.
tokenizer = Tokenizer(num_words=2500,
                      filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                      lower=True, split=' ')
tokenizer.fit_on_texts(X_all)
X = tokenizer.texts_to_sequences(X_all)
# Zero-pad every sequence to a common length.
X = pad_sequences(X)
# Embedding -> bidirectional LSTM -> dropout -> single sigmoid output.
model = Sequential()
model.add(Embedding(max_features, 128, input_length=X.shape[1]))
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.25))
model.add(Dense(1, activation='sigmoid', kernel_regularizer=regularizers.l2(0.01),
                activity_regularizer=regularizers.l1(0.001)))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# Undo the train/test concatenation.
x_train = X[:len_train]
x_test = X[len_train:]
# Y_train was sliced out of a frame that also holds text, so it is typically
# an object-dtype array; cast it to a numeric dtype before training, as the
# other two pipelines in this file do.
y_train = np.asarray(Y_train).astype('int8')
print('Train...')
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=5)
print("training end!")
pre_prob = model.predict(x_test)
submmit = pd.DataFrame(columns=['ID', 'Pred'])
submmit['ID'] = temp_test[:, 0]
# The Dense(1) head yields one column per sample; flatten it so a plain 1-D
# vector is stored in the 'Pred' column.
submmit['Pred'] = pre_prob.ravel()
submmit.to_csv('C:/Users/Nicht_sehen/Desktop/te.csv', index=False)
因为是短文本,所以不能训练太久,否则很容易过拟合;epochs=5 时提交,线上得分 0.84。
考虑到是短文本,深度模型比较容易过拟合,于是又尝试了传统的 SVM + TF-IDF,不过结果不是很理想:用 10 折交叉验证,线下也只有 0.79–0.80 左右。代码如下:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
# Location of the test file.
data_path = "./1.txt"
# Read the training CSV and the test file.
train = pd.read_csv("C:/Users/Nicht_sehen/Desktop/train.csv", lineterminator='\n')
test = pd.read_csv(data_path)
# Recode the string labels as integers: Negative -> 0, Positive -> 1.
label_codes = {'Negative': 0, 'Positive': 1}
train['label'] = train['label'].map(label_codes)
# Strip punctuation, digits and other symbols that carry no lexical meaning.
def clean(string):
    """Remove or blank out non-word characters and lower-case the text.

    The substitutions are applied strictly in the order listed, because some
    of them are anchored to the start of the string and depend on what the
    earlier ones produced.

    Args:
        string: Raw text (a single token or a whole sentence).

    Returns:
        The cleaned, lower-cased text. Separators such as ``,`` ``-`` ``_``
        ``!`` ``(`` ``)`` ``?`` become spaces; quotes, tabs, newlines, digits,
        dots and backslashes are deleted.
    """
    rules = (
        (r"\'", ""),
        (r",", " "),
        (r"\t", ""),
        (r"\n", ""),
        (r"\d", ""),
        (r"\.", ""),
        (r"\\", ""),
        (r"\-", " "),
        # Drop a leading underscore together with the character after it.
        (r"^_.", ""),
        # Drop one leading space (e.g. left behind by a leading ',' or '-').
        (r"^ ", ""),
        (r"_", " "),
        (r"!", " "),
        (r"\(", " "),
        (r"\)", " "),
        (r"\?", " "),
    )
    # Four substitutions from the earlier version are intentionally absent
    # because they could never change the string: the two "\x..." patterns
    # ran after every backslash had been deleted, the "#." pattern ran after
    # every dot had been deleted, and the trailing-space rule replaced a
    # space with a space.
    for pattern, replacement in rules:
        string = re.sub(pattern, replacement, string)
    return string.lower()
def hashing(word):
    """Rewrite a word into a canonical spelling via a fixed chain of regexes.

    The rules collapse repeated letters and rewrite certain vowel/consonant
    combinations (presumably so that different spellings of the same word end
    up as one token; the rule set itself is taken as given).

    Args:
        word: A lower-cased token. The empty string is allowed.

    Returns:
        The rewritten token.
    """
    # Unconditional rewrites; order matters because later patterns see the
    # output of earlier ones.
    for pattern, replacement in (
        (r'ain$', r'ein'),
        (r'ai', r'ae'),
        (r'ay$', r'e'),
        (r'ey$', r'e'),
        (r'ie$', r'y'),
        (r'^es', r'is'),
        (r'a+', r'a'),
        (r'j+', r'j'),
        (r'd+', r'd'),
        (r'u', r'o'),
        (r'o+', r'o'),
        (r'ee+', r'i'),
    ):
        word = re.sub(pattern, replacement, word)
    # Shorten "ar" to "r" unless the word itself starts with "ar".
    if not re.match(r'ar', word):
        word = re.sub(r'ar', r'r', word)
    word = re.sub(r'iy+', r'i', word)
    word = re.sub(r'ih+', r'eh', word)
    word = re.sub(r's+', r's', word)
    # Inspect the actual word here: the previous code searched the string
    # literal 'word', so this rule could never fire. endswith() is also safe
    # on an empty word, unlike indexing word[-1].
    if re.search(r'[rst]y', word) and not word.endswith('y'):
        word = re.sub(r'y', r'i', word)
    if re.search(r'[bcdefghijklmnopqrtuvwxyz]i', word):
        word = re.sub(r'i$', r'y', word)
    if re.search(r'[acefghijlmnoqrstuvwxyz]h', word):
        word = re.sub(r'h', '', word)
    word = re.sub(r'k', r'q', word)
    return word
# Second cleaning pass: normalise a sentence token by token.
def delk(sentence):
    """Clean and hash every space-separated token of a sentence.

    Args:
        sentence: Raw review text.

    Returns:
        The processed tokens, each preceded by a single space (so a non-empty
        result starts with a space). Returns '' when the sentence contains no
        tokens.
    """
    # split(' ') yields '' (never ' ') for consecutive, leading or trailing
    # spaces, so empty tokens are what has to be skipped here.
    tokens = (word for word in sentence.split(' ') if word)
    # One join instead of repeated string concatenation.
    return ''.join(' ' + hashing(clean(word)) for word in tokens)
# Run every review through delk() and store the result back in place.
train['review'] = pd.DataFrame(list(map(delk, train['review'])))
test['review'] = pd.DataFrame(list(map(delk, test['review'])))
print('data clean end')
# Slice out the model inputs by position: column 1 for the text and the last
# column of train for the label (assumes the review text is column 1 in both
# files; confirm against the file headers).
temp_test = test.values
X_train = train.values[:, 1]
X_test = test.values[:, 1]
Y_train = train.values[:, -1]
# Labels as a compact integer array.
y_train = np.array(Y_train).astype('int8')
print('data slide end')
# TF-IDF over unigrams and bigrams with sublinear term-frequency scaling;
# terms occurring in more than half of the documents are dropped.
vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    ngram_range=(1, 2),
    max_df=0.5,
)
# Fit on train and test text together so both share one feature space.
len_train = len(X_train)
X = np.concatenate([X_train, X_test])
vectorizer.fit(X)
X = vectorizer.transform(X)
# Undo the concatenation: the first len_train rows are the training set.
x_train, x_test = X[:len_train], X[len_train:]
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
# Unshuffled stratified folds are already deterministic, so no seed is
# passed: random_state is only meaningful with shuffle=True, and recent
# scikit-learn releases raise ValueError when it is set with shuffle=False.
folds = StratifiedKFold(n_splits=10, shuffle=False)
# Accumulates the fold-averaged test prediction.
predictions = np.zeros(x_test.shape[0])
for fold_, (trn_idx, val_idx) in enumerate(folds.split(x_train, y_train)):
    print("Fold :{}".format(fold_ + 1))
    trn_data, trn_label = x_train[trn_idx], y_train[trn_idx]
    val_data, val_label = x_train[val_idx], y_train[val_idx]
    model_SVM = SVC(C=1, kernel="linear")
    model_SVM.fit(trn_data, trn_label)
    # AUC is a ranking metric, so score it on the signed distance to the
    # separating hyperplane rather than on thresholded 0/1 predictions.
    val_score = model_SVM.decision_function(val_data)
    print("auc score: {:<8.5f}".format(metrics.roc_auc_score(val_label, val_score)))
    # NOTE(review): the submission still averages hard 0/1 votes, which gives
    # at most n_splits + 1 distinct values. If the leaderboard metric is AUC,
    # averaging decision_function(x_test) would rank better; confirm the
    # expected value range of 'Pred' before changing it.
    predictions += model_SVM.predict(x_test) / folds.n_splits
output = pd.DataFrame({"ID": test["ID"], "Pred": predictions})
output.to_csv('C:/Users/Nicht_sehen/Desktop/sgd.csv', index=False)
最后采用了 SGD + TF-IDF,成功通过预选赛。预选赛的分数线应该在 0.85 左右,本地 0.87,提交后公榜 0.87;私榜看不到分数,但是显示通过。:)
代码如下:
import numpy as np
import pandas as pd
import re
# Location of the test file.
data_path = "./1.txt"
# Read the training CSV and the test file.
train = pd.read_csv("C:/Users/Nicht_sehen/Desktop/train.csv", lineterminator='\n')
test = pd.read_csv(data_path)
# Recode the string labels as integers: Negative -> 0, Positive -> 1.
label_codes = {'Negative': 0, 'Positive': 1}
train['label'] = train['label'].map(label_codes)
# Strip punctuation, digits and other symbols that carry no lexical meaning.
def clean(string):
    """Remove or blank out non-word characters and lower-case the text.

    The substitutions are applied strictly in the order listed, because some
    of them are anchored to the start of the string and depend on what the
    earlier ones produced.

    Args:
        string: Raw text (a single token or a whole sentence).

    Returns:
        The cleaned, lower-cased text. Separators such as ``,`` ``-`` ``_``
        ``!`` ``(`` ``)`` ``?`` become spaces; quotes, tabs, newlines, digits,
        dots and backslashes are deleted.
    """
    rules = (
        (r"\'", ""),
        (r",", " "),
        (r"\t", ""),
        (r"\n", ""),
        (r"\d", ""),
        (r"\.", ""),
        (r"\\", ""),
        (r"\-", " "),
        # Drop a leading underscore together with the character after it.
        (r"^_.", ""),
        # Drop one leading space (e.g. left behind by a leading ',' or '-').
        (r"^ ", ""),
        (r"_", " "),
        (r"!", " "),
        (r"\(", " "),
        (r"\)", " "),
        (r"\?", " "),
    )
    # Four substitutions from the earlier version are intentionally absent
    # because they could never change the string: the two "\x..." patterns
    # ran after every backslash had been deleted, the "#." pattern ran after
    # every dot had been deleted, and the trailing-space rule replaced a
    # space with a space.
    for pattern, replacement in rules:
        string = re.sub(pattern, replacement, string)
    return string.lower()
def hashing(word):
    """Rewrite a word into a canonical spelling via a fixed chain of regexes.

    The rules collapse repeated letters and rewrite certain vowel/consonant
    combinations (presumably so that different spellings of the same word end
    up as one token; the rule set itself is taken as given).

    Args:
        word: A lower-cased token. The empty string is allowed.

    Returns:
        The rewritten token.
    """
    # Unconditional rewrites; order matters because later patterns see the
    # output of earlier ones.
    for pattern, replacement in (
        (r'ain$', r'ein'),
        (r'ai', r'ae'),
        (r'ay$', r'e'),
        (r'ey$', r'e'),
        (r'ie$', r'y'),
        (r'^es', r'is'),
        (r'a+', r'a'),
        (r'j+', r'j'),
        (r'd+', r'd'),
        (r'u', r'o'),
        (r'o+', r'o'),
        (r'ee+', r'i'),
    ):
        word = re.sub(pattern, replacement, word)
    # Shorten "ar" to "r" unless the word itself starts with "ar".
    if not re.match(r'ar', word):
        word = re.sub(r'ar', r'r', word)
    word = re.sub(r'iy+', r'i', word)
    word = re.sub(r'ih+', r'eh', word)
    word = re.sub(r's+', r's', word)
    # Inspect the actual word here: the previous code searched the string
    # literal 'word', so this rule could never fire. endswith() is also safe
    # on an empty word, unlike indexing word[-1].
    if re.search(r'[rst]y', word) and not word.endswith('y'):
        word = re.sub(r'y', r'i', word)
    if re.search(r'[bcdefghijklmnopqrtuvwxyz]i', word):
        word = re.sub(r'i$', r'y', word)
    if re.search(r'[acefghijlmnoqrstuvwxyz]h', word):
        word = re.sub(r'h', '', word)
    word = re.sub(r'k', r'q', word)
    return word
# Second cleaning pass: normalise a sentence token by token.
def delk(sentence):
    """Clean and hash every space-separated token of a sentence.

    Args:
        sentence: Raw review text.

    Returns:
        The processed tokens, each preceded by a single space (so a non-empty
        result starts with a space). Returns '' when the sentence contains no
        tokens.
    """
    # split(' ') yields '' (never ' ') for consecutive, leading or trailing
    # spaces, so empty tokens are what has to be skipped here.
    tokens = (word for word in sentence.split(' ') if word)
    # One join instead of repeated string concatenation.
    return ''.join(' ' + hashing(clean(word)) for word in tokens)
# Run every review through delk() and store the result back in place.
train['review'] = pd.DataFrame(list(map(delk, train['review'])))
test['review'] = pd.DataFrame(list(map(delk, test['review'])))
print('data clean end')
# Slice out the model inputs by position: column 1 for the text and the last
# column of train for the label (assumes the review text is column 1 in both
# files; confirm against the file headers).
temp_test = test.values
X_train = train.values[:, 1]
X_test = test.values[:, 1]
Y_train = train.values[:, -1]
# Labels as a compact integer array.
y_train = np.array(Y_train).astype('int8')
print('data slide end')
# tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF over unigrams and bigrams with sublinear term-frequency scaling;
# terms occurring in more than 60% of the documents are dropped.
tf = TfidfVectorizer(
    sublinear_tf=True,
    ngram_range=(1, 2),
    max_df=0.6,
)
# Fit on train and test text together so both share one feature space.
len_train = len(X_train)
X = np.concatenate([X_train, X_test])
tf.fit(X)
X = tf.transform(X)
# Undo the concatenation: the first len_train rows are the training set.
x_train, x_test = X[:len_train], X[len_train:]
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from sklearn.linear_model import SGDClassifier
# Unshuffled stratified folds are already deterministic, so no seed is
# passed: random_state is only meaningful with shuffle=True, and recent
# scikit-learn releases raise ValueError when it is set with shuffle=False.
folds = StratifiedKFold(n_splits=10, shuffle=False)
# Accumulates the fold-averaged positive-class probability for the test set.
predictions = np.zeros(x_test.shape[0])
for fold_, (trn_idx, val_idx) in enumerate(folds.split(x_train, y_train)):
    trn_data, trn_label = x_train[trn_idx], y_train[trn_idx]
    val_data, val_label = x_train[val_idx], y_train[val_idx]
    # Logistic-regression loss so that predict_proba is available.
    # NOTE(review): newer scikit-learn releases spell this loss 'log_loss'
    # and no longer accept 'log'; switch the name when upgrading.
    SGD = SGDClassifier(alpha=0.00001, random_state=2, shuffle=True, loss='log')
    SGD.fit(trn_data, trn_label)
    # Validation AUC on the predicted probability of the positive class.
    val_prob = SGD.predict_proba(val_data)[:, 1]
    print("score: {:.5f}".format(metrics.roc_auc_score(val_label, val_prob)))
    predictions += SGD.predict_proba(x_test)[:, 1] / folds.n_splits
# Save the submission file.
pre = pd.DataFrame(columns=['ID', 'Pred'])
pre['ID'] = test["ID"]
pre['Pred'] = predictions
pre.to_csv('C:/Users/Nicht_sehen/Desktop/sgd.csv', index=False)
总结:
1、这次预选赛学到了一些文本数据处理的知识,比如 TF-IDF、word2vec(w2v)等。
2、在短文本分类上,传统的分类方法比神经网络效果更好,或者说泛化能力更强。
3、了解了 HAN、TextCNN 等模型(不过效果不是很好,也可能是数据没处理好)。