SVM Spam Classification

spamProcessEmail.py (preprocessing)

import numpy as np
import re
import nltk, nltk.stem.porter


def process_email(email_contents):
    vocab_list = get_vocab_list()

    word_indices = np.array([], dtype=np.int64)

    # ===================== Preprocess Email =====================

    email_contents = email_contents.lower()

    # Strip all HTML tags
    email_contents = re.sub(r'<[^<>]+>', ' ', email_contents)

    # Any numbers get replaced with the string 'number'
    email_contents = re.sub(r'[0-9]+', 'number', email_contents)

    # Anything starting with http:// or https:// gets replaced with 'httpaddr'
    email_contents = re.sub(r'(http|https)://[^\s]*', 'httpaddr', email_contents)

    # Strings with "@" in the middle are considered emails --> 'emailaddr'
    email_contents = re.sub(r'[^\s]+@[^\s]+', 'emailaddr', email_contents)

    # The '$' sign gets replaced with 'dollar'
    email_contents = re.sub(r'[$]+', 'dollar', email_contents)

    # ===================== Tokenize Email =====================

    # Output the email
    print('==== Processed Email ====')

    stemmer = nltk.stem.porter.PorterStemmer()

    # print('email contents : {}'.format(email_contents))

    # Split on punctuation and whitespace (the '-' is escaped so it is not read as a range)
    tokens = re.split(r'[@$/#.\-:&*+=\[\]?!(){},\'">_<;%\s]', email_contents)
    for token in tokens:
        # Strip any remaining non-alphanumeric characters and stem the token
        token = re.sub(r'[^a-zA-Z0-9]', '', token)
        token = stemmer.stem(token)

        if len(token) < 1:
            continue

        # ===================== Your Code Here =====================
        # Hint:
        # For example, if token == 'action', look up 'action' in vocab_list.
        # If vocab_list[18] == 'action', then 18 should be appended to word_indices.
        for idx, word in vocab_list.items():
            if word == token:
                word_indices = np.append(word_indices, idx)
                break

        # ==========================================================

        print(token)

    print('==================')

    return word_indices


def get_vocab_list():
    vocab_dict = {}
    with open('vocab.txt') as f:
        for line in f:
            (val, key) = line.split()
            vocab_dict[int(val)] = key

    return vocab_dict
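
For a quick manual check of the preprocessing, a minimal sketch along the following lines can be appended to the module (the sample string is made up, and vocab.txt is assumed to sit next to the script):

if __name__ == '__main__':
    # Hypothetical sample text, used only to eyeball the preprocessing output
    sample = 'Buy now at http://example.com for only $100! Contact me@example.com'
    print(process_email(sample))  # vocabulary indices of the stemmed tokens found in vocab.txt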

spamEmailFeatures.py (feature extraction)

import numpy as np



def email_features(word_indices):
    # Total number of words in the dictionary
    n = 1899

    # You need to return the following variables correctly.
    # Since numpy arrays are 0-indexed, we allocate n + 1 entries so that feature i
    # lines up with vocabulary index i (index 0 is left unused)
    features = np.zeros(n + 1)

    # ===================== Your Code Here =====================
    # Hint:
    # This function returns a feature vector for the given email (word_indices).
    # To make emails easier to handle, each email has already been preprocessed
    # and every word in it converted to an index; word_indices is the list of
    # those indices.
    # For example, if an email reads:
    #     The quick brown fox jumped over the lazy dog.
    # then its word_indices vector might look like:
    #     60 100 33 44 10 53 60 58 5
    # where each word is mapped to a number, e.g.:
    #     The   -> 60
    #     quick -> 100
    #     ...
    #
    # The task is to use the word_indices vector to build a binary feature vector
    # indicating which words occur in the email: if a word with index i is present,
    # then features[i] = 1.
    # Concretely, if the word "the" (say, index 60) appears in the email,
    # then features[60] = 1, e.g.
    #     features = [0, 0, 0, 0, 1, 0, 0, 0, ..., 0, 0, 0, 1, ..., 0, 0, 0, 1, 0]
    #
    # See also: one-hot encoding
    for each in word_indices:
        features[each] = 1  # mark the vocabulary slot of every word that occurs in the email

    # ==========================================================

    return features
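
To make the hint concrete, a small illustrative check can be appended to the module; the indices below are the made-up example values from the comment above, not real vocabulary lookups:

if __name__ == '__main__':
    # Example indices from the hint above (illustrative values only)
    example_indices = np.array([60, 100, 33, 44, 10, 53, 60, 58, 5])
    fv = email_features(example_indices)
    print(fv.size)             # 1900
    print(int(fv.sum()))       # 8 distinct words are marked (index 60 appears twice)
    print(np.flatnonzero(fv))  # [  5  10  33  44  53  58  60 100]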


svmClf_spam.py (training)

import matplotlib.pyplot as plt
import numpy as np
import scipy.io as scio
from sklearn import svm

import svmClassification.svmClassification.spamProcessEmail as pe
import svmClassification.svmClassification.spamEmailFeatures as ef

plt.ion()
np.set_printoptions(formatter={'float': '{: 0.6f}'.format})

# ===================== Part 1: Email Preprocessing =====================
# Complete spamProcessEmail.py: each email is first converted into a feature
# vector, and an SVM is then used to classify it as spam or non-spam.
#


print('Preprocessing sample email (emailSample1.txt) ...')

file_contents = open('emailSample1.txt', 'r').read()
word_indices = pe.process_email(file_contents)

# Print stats
print('Word Indices: ')
print(word_indices)

input('Program paused. Press ENTER to continue')

# ===================== Part 2: Feature Extraction =====================
# Complete spamEmailFeatures.py to convert each email into a feature vector in R^n


print('Extracting Features from sample email (emailSample1.txt) ... ')

# Extract features from the word indices returned by process_email
features = ef.email_features(word_indices)

# Print stats
print('Length of feature vector: {}'.format(features.size))
print('Number of non-zero entries: {}'.format(np.flatnonzero(features).size))

input('Program paused. Press ENTER to continue')

# ===================== Part 3: Train Linear SVM for Spam Classification =====================
# In this section, you will train a linear classifier to determine if an
# email is Spam or Not-spam.

# Load the Spam Email dataset
# You will have X, y in your environment
data = scio.loadmat('spamTrain.mat')
X = data['X']
y = data['y'].flatten()

print('Training Linear SVM (Spam Classification)')
print('(this may take 1 to 2 minutes)')

c = 0.1
clf = svm.SVC(C=c, kernel='linear')
clf.fit(X, y)

p = clf.predict(X)

print('Training Accuracy: {}'.format(np.mean(p == y) * 100))

# ===================== Part 4: Test Spam Classification =====================
# After training the classifier, we can evaluate it on a test set. We have
# included a test set in spamTest.mat

# Load the test dataset
data = scio.loadmat('spamTest.mat')
Xtest = data['Xtest']
ytest = data['ytest'].flatten()

print('Evaluating the trained linear SVM on a test set ...')

p = clf.predict(Xtest)

print('Test Accuracy: {}'.format(np.mean(p == ytest) * 100))

input('Program paused. Press ENTER to continue')

# ===================== Part 5: Top Predictors of Spam =====================
# Since the trained model is a linear SVM, we can inspect the learned weights (w)
# to better understand how it decides whether an email is spam.
# The code below looks at the largest weights in the classifier; these are the
# words the classifier "thinks" are the strongest indicators of spam.


vocab_list = pe.get_vocab_list()
indices = np.argsort(clf.coef_).flatten()[::-1]
print(indices)

for i in range(15):
    # Column j of X corresponds to vocabulary index j + 1, hence the + 1 in the lookup
    print('{} ({:0.6f})'.format(vocab_list[indices[i] + 1], clf.coef_.flatten()[indices[i]]))

input('ex6_spam Finished. Press ENTER to exit')
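
As an optional follow-up (not part of the original script), the trained classifier can be applied to a new raw email end to end. The snippet below assumes spamSample1.txt is one of the sample emails shipped with the exercise data:

new_contents = open('spamSample1.txt', 'r').read()
new_indices = pe.process_email(new_contents)
new_features = ef.email_features(new_indices)
# Drop the unused 0-th entry so the vector lines up with the 1899 training columns
pred = clf.predict(new_features[1:].reshape(1, -1))
print('Spam Classification: {} (1 = spam, 0 = not spam)'.format(pred[0]))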