文章目录
前言
本次作业是用SVM对邮件进行分类,筛选出垃圾邮件
邮件为txt文件
- 首先我们要对邮件进行预处理,去除或替换一些符号
- 然后,将处理好的邮件str转化为单词列表
- 根据给定的单词dict,将邮件单词列表转化为0/1表示的列向量
- 根据此向量,训练线性核SVM
- 得到的SVM模型即可用于分类
代码分析
首先,导入类库
import numpy as np
import matplotlib.pyplot as plt
import scipy.io #Used to load the OCTAVE *.mat files
from sklearn import svm #SVM software
import re #regular expression for e-mail processing
from stemming.porter2 import stem#词干提取
import nltk, nltk.stem.porter
%matplotlib inline
查看email文件
print ("emailSample1.txt:")
# IPython shell escape running the Windows cmd `type` command, which prints
# the contents of the sample e-mail file (the path uses Windows separators).
!type data\emailSample1.txt
输出:
emailSample1.txt:
> Anyone knows how much it costs to host a web portal ?
>
Well, it depends on how many visitors you’re expecting.
This can be anywhere from less than 10 bucks a month to a couple of $100.
You should checkout http://www.rackspace.com/ or perhaps Amazon EC2
if youre running something big…
To unsubscribe yourself from this mailing list, send an email to:
groupname-unsubscribe@egroups.com
预处理email文件
def preProcess(email):
    """Normalize a raw e-mail string before it is tokenized.

    The text is lower-cased and the following substitutions are then
    applied, in this order:

    1. anything of the form ``<...>`` (e.g. HTML tags) -> one space
    2. every run of digits -> ``number``
    3. ``http://`` / ``https://`` URLs -> ``httpaddr``
    4. whitespace-delimited tokens containing ``@`` -> ``emailaddr``
    5. every run of ``$`` -> ``dollar``

    Args:
        email: Raw e-mail text.

    Returns:
        The normalized e-mail text.
    """
    email = email.lower()
    # Patterns are raw strings so that regex escapes such as \s are not
    # interpreted (and warned about) as Python string escapes.
    # Replace <...> spans with a space.
    email = re.sub(r'<[^<>]+>', ' ', email)
    # Replace digit runs with the word 'number'.
    email = re.sub(r'[0-9]+', 'number', email)
    # Replace anything starting with http:// or https:// with 'httpaddr'.
    email = re.sub(r'(http|https)://[^\s]*', 'httpaddr', email)
    # Replace tokens with an '@' in the middle with 'emailaddr'.
    email = re.sub(r'[^\s]+@[^\s]+', 'emailaddr', email)
    # Replace '$' signs with the word 'dollar'.
    email = re.sub(r'[$]+', 'dollar', email)
    return email
将email文件先preProcess,再提取词干,处理成单词列表
def email2TokenList(raw_email):
    """Turn a raw e-mail into a list of stemmed word tokens.

    The e-mail is first normalized with ``preProcess``, then split on
    whitespace and punctuation. Each piece is stripped of any remaining
    non-alphanumeric characters, empty pieces are dropped, and the rest are
    stemmed with NLTK's Porter stemmer.

    Args:
        raw_email: Raw e-mail text.

    Returns:
        list: Stemmed tokens in order of appearance. Duplicates are kept;
        no de-duplication is performed.
    """
    stemmer = nltk.stem.porter.PorterStemmer()
    # Normalize the raw e-mail first.
    email = preProcess(raw_email)
    # Split on any whitespace (\s) as well as punctuation. Splitting only on
    # the space character would glue the last word of one line to the first
    # word of the next once the newline is stripped below.
    tokens = re.split(
        r'[\s\@\$\/\#\.\-\:\&\*\+\=\[\]\?\!\(\)\{\}\,\'\"\>\_\<\;\%]', email
    )
    tokenlist = []
    for token in tokens:
        # Remove every character that is not a letter or digit.
        token = re.sub(r'[^a-zA-Z0-9]', '', token)
        # Drop empty tokens before doing any stemming work.
        if not token:
            continue
        # Stem the token (e.g. played --> play) and keep it.
        tokenlist.append(stemmer.stem(token))
    return tokenlist
# Parse the raw vocabulary mapping file into a lookup dictionary.
def getVocabDict(reverse=False):
    """Load the vocabulary mapping from ``data/vocab.txt``.

    Each non-blank line of the file holds two whitespace-separated fields:
    an integer index followed by a word. Blank lines are skipped.

    Args:
        reverse: If False (default), the result maps word -> index.
            If True, it maps index -> word.

    Returns:
        dict: The vocabulary mapping in the requested direction.

    Raises:
        ValueError: If a non-blank line does not have exactly two fields or
            its first field is not an integer.
    """
    vocab_dict = {}
    # Open the mapping table; the encoding is explicit so the result does
    # not depend on the platform's locale.
    with open("data/vocab.txt", encoding="utf-8") as f:
        for line in f:
            fields = line.split()
            # A blank (e.g. trailing) line would otherwise break the
            # two-field unpacking below.
            if not fields:
                continue
            val, key = fields
            if not reverse:
                vocab_dict[key] = int(val)
            else:
                vocab_dict[int(val)] = key
    return vocab_dict
#根据给定的单词字典,将单词list转化为索引list
def email2VocabIndices( raw_email, vocab_dict ):
#将email预处理,并加工为单词list
tokenlist = email2TokenList( raw_email )
#得到单词list的索引list
index_list = [ vocab_dic