import numpy as np
import re
import nltk, nltk.stem.porter
def process_email(email_contents):
    """Normalize, tokenize and stem the text of an email.

    The text is lower-cased, HTML tags are replaced by a space, and
    numbers, URLs, email addresses and dollar signs are replaced by the
    placeholder words ``number``, ``httpaddr``, ``emailaddr`` and
    ``dollar``. The result is split on punctuation and whitespace, every
    piece is reduced to its alphanumeric characters and stemmed with the
    Porter stemmer, and empty pieces are dropped. Each kept token is also
    printed, framed by a header and a footer line.

    Note that this function returns the stemmed *tokens*, not vocabulary
    indices; callers that need indices must map the tokens themselves.

    Args:
        email_contents: Raw email text as a string.

    Returns:
        A numpy array of stemmed token strings in order of appearance, or
        an empty ``int64`` array when the email yields no tokens.
    """
    # ===================== Preprocess Email =====================
    email_contents = email_contents.lower()

    # Strip HTML tags.
    email_contents = re.sub(r'<[^<>]+>', ' ', email_contents)
    # Any run of digits is replaced with the string 'number'.
    email_contents = re.sub(r'[0-9]+', 'number', email_contents)
    # Anything starting with http:// or https:// is replaced with 'httpaddr'.
    email_contents = re.sub(r'(http|https)://[^\s]*', 'httpaddr', email_contents)
    # Strings with "@" in the middle are considered emails --> 'emailaddr'.
    email_contents = re.sub(r'[^\s]+@[^\s]+', 'emailaddr', email_contents)
    # One or more '$' signs are replaced with 'dollar'.
    email_contents = re.sub(r'[$]+', 'dollar', email_contents)

    # ===================== Tokenize Email =====================
    # Output the email
    print('==== Processed Email ====')

    stemmer = nltk.stem.porter.PorterStemmer()

    # The hyphen is escaped so it is a literal delimiter rather than forming
    # the range '.'-':' by accident, and \s makes newlines and tabs split
    # words too; otherwise the last word of one line would be glued to the
    # first word of the next once non-alphanumerics are stripped below.
    pieces = re.split(r'''[@$/#.\-:&*+=\[\]?!(){},'">_<;%\s]''', email_contents)

    # Collect into a list; growing a numpy array with np.append inside the
    # loop copies the whole array on every iteration.
    tokens = []
    for piece in pieces:
        word = re.sub(r'[^a-zA-Z0-9]', '', piece)
        word = stemmer.stem(word)
        if not word:
            continue
        tokens.append(word)
        print(word)

    print('==================')

    if not tokens:
        # Keep the empty-result type the function has always returned.
        return np.array([], dtype=np.int64)
    return np.array(tokens)
def get_vocab_list():
    """Load the vocabulary from ``vocab.txt`` in the current directory.

    Every non-blank line must hold exactly two whitespace-separated
    fields: an integer index followed by the word. Blank lines (for
    example a trailing empty line) are skipped.

    Returns:
        A dict mapping each integer index to its word.

    Raises:
        OSError: If ``vocab.txt`` cannot be opened.
        ValueError: If a non-blank line does not have exactly two fields
            or its first field is not an integer.
    """
    vocab_dict = {}
    with open('vocab.txt') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank line: nothing to unpack.
                continue
            index, word = fields
            vocab_dict[int(index)] = word
    return vocab_dict
spamEmailFeatures.py — 特征提取 (feature extraction)
import numpy as np
def email_features(word_indices, vocab_size=1899):
    """Build a binary bag-of-words feature vector from word indices.

    For every index ``i`` in ``word_indices`` the entry ``i - 1`` of the
    result is set to 1; all other entries are 0. Repeated indices have no
    additional effect.

    Args:
        word_indices: Iterable or array of integer word indices.
        vocab_size: Total number of words in the dictionary. Defaults to
            1899.

    Returns:
        A float numpy array of length ``vocab_size + 1``. Because of the
        ``i - 1`` mapping, indices ``1..vocab_size`` fill positions
        ``0..vocab_size - 1`` and the final position stays 0 (an index of
        0 would wrap around to it through negative indexing).

    Raises:
        IndexError: If an index falls outside the vector or the indices
            are not integers.
    """
    # One slot more than the vocabulary size, as in the original layout.
    features = np.zeros(vocab_size + 1)

    indices = np.asarray(word_indices)
    # An empty input converts to a float array, which cannot be used as an
    # index, so only index when there is something to mark.
    if indices.size:
        features[indices - 1] = 1

    return features
svmClf_spam.py — 训练 (training)
import matplotlib.pyplot as plt
import numpy as np
import scipy.io as scio
from sklearn import svm
import svmClassification.svmClassification.spamProcessEmail as pe
import svmClassification.svmClassification.spamEmailFeatures as ef
def word_indices(processed_f, vocab_list):
    """Map processed email tokens to 1-based positions in the vocabulary.

    For every token in ``processed_f``, in order, the position ``j + 1``
    of each entry ``vocab_list[j]`` equal to that token is appended to the
    result. Tokens that are not in the vocabulary contribute nothing.

    Args:
        processed_f: Sequence of token strings.
        vocab_list: Sequence of vocabulary words.

    Returns:
        A list of 1-based vocabulary positions.
    """
    # Build the word -> positions table once instead of scanning the whole
    # vocabulary for every token.
    positions = {}
    for j, word in enumerate(vocab_list):
        positions.setdefault(word, []).append(j + 1)

    indices = []
    for token in processed_f:
        indices.extend(positions.get(token, ()))
    return indices


plt.ion()
np.set_printoptions(formatter={'float': '{: 0.6f}'.format})

# ===================== Part 1: Email Preprocessing =====================
# Complete spamProcessEmail.py: each email is first converted into a
# feature vector, then an SVM is used to classify emails as spam or
# non-spam.
print('Preprocessing sample email (emailSample1.txt) ...')

# Read through a context manager so the file handle is always closed.
with open('emailSample1.txt', 'r') as email_file:
    file_contents = email_file.read()
tokens = pe.process_email(file_contents)

# Print stats
print('Word Indices: ')
print(tokens)

input('Program paused. Press ENTER to continue')

# ===================== Part 2: Feature Extraction =====================
# Complete spamEmailFeatures.py to convert an email into a feature vector
# in R^N.
print('Extracting Features from sample email (emailSample1.txt) ... ')

# Added by the author: look the processed tokens up in the vocabulary.
# The result of the first process_email call is reused rather than
# processing (and printing) the same email a second time.
f = tokens
li = np.loadtxt('vocab.txt', dtype='str', usecols=1)
f_indices = word_indices(f, li)

# Extract features
features = ef.email_features(f_indices)

# Print stats
print('Length of feature vector: {}'.format(features.size))
print('Number of non-zero entries: {}'.format(np.flatnonzero(features).size))

input('Program paused. Press ENTER to continue')

# ===================== Part 3: Train Linear SVM for Spam Classification =====================
# In this section, you will train a linear classifier to determine if an
# email is Spam or Not-spam.

# Load the Spam Email dataset
# You will have X, y in your environment
data = scio.loadmat('spamTrain.mat')
X = data['X']
y = data['y'].flatten()

print('Training Linear SVM (Spam Classification)')
print('(this may take 1 to 2 minutes)')

c = 0.1
# C must be passed by keyword: current scikit-learn releases make the
# estimator constructor arguments keyword-only.
clf = svm.SVC(C=c, kernel='linear')
clf.fit(X, y)

p = clf.predict(X)

print('Training Accuracy: {}'.format(np.mean(p == y) * 100))

# ===================== Part 4: Test Spam Classification =====================
# After training the classifier, we can evaluate it on a test set. We have
# included a test set in spamTest.mat

# Load the test dataset
data = scio.loadmat('spamTest.mat')
Xtest = data['Xtest']
ytest = data['ytest'].flatten()

print('Evaluating the trained linear SVM on a test set ...')

p = clf.predict(Xtest)

print('Test Accuracy: {}'.format(np.mean(p == ytest) * 100))

input('Program paused. Press ENTER to continue')

# ===================== Part 5: Top Predictors of Spam =====================
# Because the trained model is a linear SVM, its learned weights (w) can be
# inspected to understand how it decides whether an email is spam.
# The code below lists the words with the largest weights, i.e. the words
# the classifier "thinks" are the strongest indicators of spam.
vocab_list = pe.get_vocab_list()
weights = clf.coef_.flatten()
indices = np.argsort(weights)[::-1]
print(indices)

for i in range(15):
    # Feature columns are 0-based while the vocabulary dict is keyed by the
    # numbers stored in vocab.txt; this assumes that numbering starts at 1,
    # the same assumption the j + 1 mapping in word_indices makes.
    print('{} ({:0.6f})'.format(vocab_list[indices[i] + 1], weights[indices[i]]))

input('ex6_spam Finished. Press ENTER to exit')