import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from textblob import Word
import re
from sklearn.model_selection import train_test_split
Read the data
# Read the data
data = pd.read_csv('spam.csv', encoding="ISO-8859-1")
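The code below indexes data['label'] and data['text'], and the sample row printed next already has its punctuation stripped. If you start from the raw Kaggle SMS Spam Collection file (whose columns are named v1 and v2), a preprocessing step along these lines bridges the gap; this is an assumption, as the original does not show it:

# Assumed preprocessing (not shown in the excerpt): rename the raw Kaggle
# columns and strip punctuation so the sample output below matches
data = data[['v1', 'v2']].rename(columns={'v1': 'label', 'v2': 'text'})
data['text'] = data['text'].str.replace(r'[^\w\s]', '', regex=True)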
data['text'][0]
'go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat '
Convert words to lowercase
# Convert words to lowercase
data['text'] = data['text'].apply(lambda x: " ".join(word.lower() for word in x.split()))
# or simply: data['text'] = data['text'].apply(lambda x: x.lower())
data['text'][0]
'go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat'
Remove stop words
# Remove stop words such as a, an, the, and common prepositions, conjunctions, and pronouns
stop = stopwords.words('english')
data['text'] = data['text'].apply(lambda x: " ".join(word for word in x.split() if word not in stop))
data['text'][0]
'go jurong point crazy available bugis n great world la e buffet cine got amore wat'
Stemming and lemmatization
# Stemming and lemmatization, to reduce English words to their base forms
st = PorterStemmer()
data['text'] = data['text'].apply(lambda x: " ".join(st.stem(word) for word in x.split()))
data['text'] = data['text'].apply(lambda x: " ".join(Word(word).lemmatize() for word in x.split()))
data['text'][0]
'go jurong point crazi avail bugi n great world la e buffet cine got amor wat'
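Note that running a stemmer and a lemmatizer back-to-back is largely redundant: the Porter stemmer already truncates aggressively ('crazy' -> 'crazi', 'available' -> 'avail', as seen in the output above), leaving the dictionary-based lemmatizer little to do. A quick comparison:

print(st.stem('crazy'), st.stem('available'))               # crazi avail  (aggressive suffix stripping)
print(Word('crazy').lemmatize(), Word('cars').lemmatize())  # crazy car    (dictionary-based, keeps real words)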
data.head()
  label                                               text
0   ham  go jurong point crazi avail bugi n great world...
1   ham                              ok lar joke wif u oni
2  spam  free entri 2 wkli comp win fa cup final tkt 21...
3   ham                u dun say earli hor u c alreadi say
4   ham               nah think goe usf live around though
3 Feature extraction
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
Using TensorFlow backend.
Split into training and test sets
# Split the data into training and test sets at an 8:2 ratio
train, test = train_test_split(data, test_size=0.2)
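num_words and max_sequence_length are used below but never defined in this excerpt. Plausible definitions follow; num_words = 10000 is an assumed value, while max_sequence_length = 300 is implied by the padded shapes printed later:

num_words = 10000            # vocabulary cap for the Tokenizer (assumed value)
max_sequence_length = 300    # padding length, matching the (4457, 300) / (1115, 300) shapes below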
# Tokenizer: index the most frequently occurring words
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(train.text)
train_sequences = tokenizer.texts_to_sequences(train.text)
test_sequences = tokenizer.texts_to_sequences(test.text)
# dictionary mapping each word to its integer index
word_index = tokenizer.word_index
# print(tokenizer.word_index)
print('Found %s unique tokens.' % len(word_index))
# pad/truncate every train sequence to a fixed length
train_x = pad_sequences(train_sequences, maxlen=max_sequence_length)
# pad/truncate every test sequence to the same length
test_x = pad_sequences(test_sequences, maxlen=max_sequence_length)
print(train_x.shape)
print(test_x.shape)
Found 6702 unique tokens.
(4457, 300)
(1115, 300)
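To make the tokenizer's behavior concrete, here is a toy run on made-up sentences (same API, hypothetical data):

demo = Tokenizer(num_words=10)
demo.fit_on_texts(['free entry win', 'win a prize', 'free prize'])
print(demo.word_index)                              # {'free': 1, 'win': 2, 'prize': 3, 'entry': 4, 'a': 5}
print(demo.texts_to_sequences(['win free prize']))  # [[2, 1, 3]]
print(pad_sequences(demo.texts_to_sequences(['win free prize']), maxlen=5))
# [[0 0 2 1 3]]  (zero-padded on the left by default)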
Vectorize the labels
# Vectorize the labels: ham -> [1, 0]; spam -> [0, 1]
import numpy as np

def label_vectorize(labels):
    label_vec = np.zeros([len(labels), 2])
    for i, label in enumerate(labels):
        if str(label) == 'ham':
            label_vec[i][0] = 1
        else:
            label_vec[i][1] = 1
    return label_vec

train_y = label_vectorize(train['label'])
test_y = label_vectorize(test['label'])

# Alternatively, use scikit-learn's LabelEncoder:
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
# Convert the label strings to integers, assigning a level to each unique label
train_labels = train['label']
test_labels = test['label']
le = LabelEncoder()
le.fit(train_labels)
train_labels = le.transform(train_labels)
test_labels = le.transform(test_labels)
# convert the integer labels to one-hot vectors
labels_train = to_categorical(np.asarray(train_labels))
labels_test = to_categorical(np.asarray(test_labels))
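Both routes produce the same layout here: LabelEncoder sorts classes alphabetically, so 'ham' becomes 0 and 'spam' becomes 1, and to_categorical then yields ham -> [1, 0], spam -> [0, 1], exactly like label_vectorize. A quick sanity check:

print(np.array_equal(train_y, labels_train))   # True
print(np.array_equal(test_y, labels_test))     # True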
4 Build and train the model
# Import libraries
import sys, os, re, csv, codecs, numpy as np, pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D, Conv1D, SimpleRNN
from keras.layers import Flatten, BatchNormalization, MaxPooling1D
from keras.models import Model, Sequential
from keras import initializers, regularizers, constraints, optimizers, layers
# embedding_dim is not defined anywhere in the excerpt; 100 is an assumed value
embedding_dim = 100

model = Sequential()
model.add(Embedding(num_words,
                    embedding_dim,
                    input_length=max_sequence_length))
model.add(Dropout(0.5))
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPooling1D(5))
model.add(Dropout(0.5))
model.add(BatchNormalization())
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPooling1D(5))
model.add(Dropout(0.5))
model.add(BatchNormalization())
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy',
optimizer='rmsprop',
metrics=['acc'])
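Before training, it can help to print the architecture (not in the original, but standard Keras):

model.summary()   # layer output shapes and parameter counts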
model.fit(train_x, train_y,
batch_size=64,
epochs=5,
validation_split=0.2)
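The excerpt stops at training; a natural follow-up, sketched here with the variables defined above, is to score the held-out test set:

loss, acc = model.evaluate(test_x, test_y, batch_size=64)
print('Test accuracy: %.4f' % acc)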