Spam Detection | Hands-On (Part 4)

1. Fetching and Loading the Data

import os
import tarfile
from six.moves import urllib

DOWNLOAD_ROOT = "http://spamassassin.apache.org/old/publiccorpus/"
HAM_URL = DOWNLOAD_ROOT + "20030228_easy_ham.tar.bz2"
SPAM_URL = DOWNLOAD_ROOT + "20030228_spam.tar.bz2"
SPAM_PATH = os.path.join("datasets", "spam")

def fetch_spam_data(spam_url=SPAM_URL, spam_path=SPAM_PATH):
	if not os.path.isdir(spam_path):
		os.makedirs(spam_path)
	for filename, url in (("ham.tar.bz2", HAM_URL), ("spam.tar.bz2", SPAM_URL)):
		path = os.path.join(spam_path, filename)
		if not os.path.isfile(path):
			urllib.request.urlretrieve(url, path)  # download the archive only if it is missing
		with tarfile.open(path) as tar_bz2_file:  # extract into the datasets/spam directory
			tar_bz2_file.extractall(path=spam_path)

fetch_spam_data()
HAM_DIR = os.path.join(SPAM_PATH, "easy_ham")
SPAM_DIR = os.path.join(SPAM_PATH, "spam")
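# Real messages have long, hash-like filenames; the short `cmds` file that ships
# with each archive is skipped by the length filter below (an assumption about the corpus layout).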
ham_filenames = [name for name in sorted(os.listdir(HAM_DIR)) if len(name) > 20]
spam_filenames = [name for name in sorted(os.listdir(SPAM_DIR)) if len(name) > 20]
len(ham_filenames)

len(spam_filenames)

import email
import email.parser
import email.policy

# Parse one email file into an EmailMessage object
def load_email(is_spam, filename, spam_path=SPAM_PATH):
	directory = "spam" if is_spam else "easy_ham"
	with open(os.path.join(spam_path, directory, filename), "rb") as f:
		return email.parser.BytesParser(policy=email.policy.default).parse(f)

# Store the parsed emails in lists
ham_emails = [load_email(is_spam=False, filename=name) for name in ham_filenames]
spam_emails = [load_email(is_spam=True, filename=name) for name in spam_filenames]

print(ham_emails[1].get_content().strip())

print(spam_emails[6].get_content().strip())

2. Exploring Email Structures and Types

# Recursively describe an email's MIME structure, e.g. "multipart(text/plain, text/html)"
def get_email_structure(email):
	if isinstance(email, str):
		return email
	payload = email.get_payload()
	if isinstance(payload, list):
		return "multipart({})".format(", ".join([
			get_email_structure(sub_email) 
			for sub_email in payload
		]))
	else:
		return email.get_content_type()

from collections import Counter

def structures_counter(emails):
	structures = Counter()
	for email in emails:
		structure = get_email_structure(email)
		structures[structure] += 1
	return structures

structures_counter(ham_emails).most_common()

structures_counter(spam_emails).most_common()

for header, value in spam_emails[0].items():
	print(header, ":", value)

spam_emails[0]["Subject"]

3. Converting Email Content into Workable Text

import numpy as np
from sklearn.model_selection import train_test_split

X = np.array(ham_emails + spam_emails, dtype=object)  # object array of EmailMessage instances
y = np.array([0] * len(ham_emails) + [1] * len(spam_emails))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
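
As a quick sanity check on the split (a hypothetical aside, not in the original post), the class counts in each subset can be inspected with np.bincount:

print(np.bincount(y_train))  # [number of ham, number of spam] in the training set
print(np.bincount(y_test))   # same for the test set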

import re
from html import unescape

def html_to_plain_text(html):
	text = re.sub(r'<head.*?>.*?</head>', '', html, flags=re.M | re.S | re.I)  # drop the <head> section
	text = re.sub(r'<a\s.*?>', ' HYPERLINK ', text, flags=re.M | re.S | re.I)  # replace links with a marker
	text = re.sub(r'<.*?>', '', text, flags=re.M | re.S)  # strip all remaining tags
	text = re.sub(r'(\s*\n)+', '\n', text, flags=re.M | re.S)  # collapse runs of blank lines
	return unescape(text)
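
To see what these substitutions do, here is a quick check on a hand-written snippet (a hypothetical example, not from the corpus):

sample = '<head><title>Hi</title></head><p>Save <a href="http://x">now</a> &amp; win!</p>'
print(html_to_plain_text(sample))  # roughly: Save  HYPERLINK now & win!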

html_spam_emails = [email for email in X_train[y_train==1] if get_email_structure(email) == "text/html"]
sample_html_spam = html_spam_emails[7]
print(sample_html_spam.get_content().strip()[:1000], "...")

print(html_to_plain_text(sample_html_spam.get_content())[:1000], "...")

def email_to_text(email):
	html = None
	for part in email.walk():
		ctype = part.get_content_type()
		if ctype not in ("text/plain", "text/html"):
			continue
		try:
			content = part.get_content()
		except Exception:  # in case of encoding issues
			content = str(part.get_payload())
		if ctype == "text/plain":
			return content  # prefer plain text when it is available
		else:
			html = content
	if html:
		return html_to_plain_text(html)

print(email_to_text(sample_html_spam)[:100], "...")

try: 
	import nltk

	stemmer = nltk.PorterStemmer()
	for word in ("Computations", "Computation", "Computing", "Computed", "Computer", "Compulsive"):
		print(word, "=>", stemmer.stem(word))
except ImportError:
	print("Error: stemming requires the NLTK module.")
	stemmer = None

try:
	import urlextract

	url_extractor = urlextract.URLExtract()
	print(url_extractor.find_urls("Will it detect github.com and https://youtu.be/7Pq-S557XQU?t=3m32s"))
except ImportError:
	print("Error: replacing URLs reqiores the urlextract module.")
	url_extractor = None

from sklearn.base import BaseEstimator, TransformerMixin

class EmailToWordCounterTransformer(BaseEstimator, TransformerMixin):
	def __init__(self, strip_headers=True, lower_case=True, remove_punctuation=True, replace_urls=True, replace_numbers=True, stemming=True):
		self.strip_headers = strip_headers
		self.lower_case = lower_case
		self.remove_punctuation = remove_punctuation
		self.replace_urls = replace_urls
		self.replace_numbers = replace_numbers
		self.stemming = stemming
	def fit(self, X, y=None):
		return self
	def transform(self, X, y=None):
		X_transformed = []
		for email in X:
			text = email_to_text(email) or ""
			if self.lower_case:
				text = text.lower()
			if self.replace_urls and url_extractor is not None:
				urls = list(set(url_extractor.find_urls(text)))
				urls.sort(key=lambda url: len(url), reverse=True)  # longest first, so shorter substrings don't break longer URLs
				for url in urls:
					text = text.replace(url, "URL")
			if self.replace_numbers:
				text = re.sub(r'\d+(?:\.\d*)?(?:[eE][+-]?\d+)?', 'NUMBER', text)
			if self.remove_punctuation:
				text = re.sub(r'\W+', ' ', text, flags=re.M)
			word_counts = Counter(text.split())
			if self.stemming and stemmer is not None:
				stemmed_word_counts = Counter()
				for word, count in word_counts.items():
					stemmed_word = stemmer.stem(word)
					stemmed_word_counts[stemmed_word] += count
				word_counts = stemmed_word_counts
			X_transformed.append(word_counts)
		return np.array(X_transformed)


X_few = X_train[:3]
X_few_wordcounts = EmailToWordCounterTransformer().fit_transform(X_few)
X_few_wordcounts

4. Vectorizing the Text

from scipy.sparse import csr_matrix

class WordCounterToVectorTransformer(BaseEstimator, TransformerMixin):
	def __init__(self, vocabulary_size=1000):
		self.vocabulary_size = vocabulary_size
	def fit(self, X, y=None):
		total_count = Counter()
		for word_count in X:
			for word, count in word_count.items():
				total_count[word] += min(count, 10)  # cap each email's contribution to a word at 10
		most_common = total_count.most_common()[:self.vocabulary_size]
		self.most_common_ = most_common
		self.vocabulary_ = {word: index + 1 for index, (word, count) in enumerate(most_common)}  # index 0 is reserved for unknown words
		return self
	def transform(self, X, y=None):
		rows = []
		cols = []
		data = []
		for row, word_count in enumerate(X):
			for word, count in word_count.items():
				rows.append(row)
				cols.append(self.vocabulary_.get(word, 0))  # out-of-vocabulary words all map to column 0
				data.append(count)
		return csr_matrix((data, (rows, cols)), shape=(len(X), self.vocabulary_size + 1))


vocab_transformer = WordCounterToVectorTransformer(vocabulary_size=10)
X_few_vectors = vocab_transformer.fit_transform(X_few_wordcounts)
X_few_vectors

# The first column counts out-of-vocabulary words
X_few_vectors.toarray()

vocab_transformer.vocabulary_
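
To interpret the remaining columns, the fitted vocabulary can be inverted (a small hypothetical helper, not part of the original code):

inverse_vocab = {index: word for word, index in vocab_transformer.vocabulary_.items()}
print(inverse_vocab)  # column i (for i >= 1) counts occurrences of inverse_vocab[i]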

5. Building a Classifier

from sklearn.pipeline import Pipeline

preprocess_pipeline = Pipeline([
		("email_to_wordcount", EmailToWordCounterTransformer()),
		("wordcount_to_vector", WordCounterToVectorTransformer()),
	])

X_train_transformed = preprocess_pipeline.fit_transform(X_train)

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

log_clf = LogisticRegression(solver='liblinear', random_state=42)
score = cross_val_score(log_clf, X_train_transformed, y_train, cv=3, verbose=3)
score.mean()
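
For context, it can be interesting to compare against a simple baseline (a hypothetical aside, not in the original post; Multinomial Naive Bayes is a common choice for word-count features):

from sklearn.naive_bayes import MultinomialNB

nb_clf = MultinomialNB()
print(cross_val_score(nb_clf, X_train_transformed, y_train, cv=3).mean())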

from sklearn.metrics import precision_score, recall_score

X_test_transformed = preprocess_pipeline.transform(X_test)

log_clf = LogisticRegression(solver="liblinear", random_state=42)
log_clf.fit(X_train_transformed, y_train)

y_pred = log_clf.predict(X_test_transformed)

print("Precision: {:.2f}%".format(100 * precision_score(y_test, y_pred)))
print("Recall: {:.2f}%".format(100 * recall_score(y_test, y_pred)))
