python垃圾邮件识别_机器学习垃圾邮件识别.ipynb

{

"cells": [

{

"cell_type": "markdown",

"metadata": {},

"source": [

"# 机器学习作业\n",

"\n",

"## 垃圾邮件分类"

]

},

{

"cell_type": "markdown",

"metadata": {},

"source": [

"套路第1步,数据整理"

]

},

{

"cell_type": "code",

"execution_count": 1,

"metadata": {},

"outputs": [],

"source": [

"import os\n",

"\n",

"# Corpus root, with one sub-folder per class.\n",

"SPAM_PATH = os.path.join('datasets', 'spam')\n",

"HAM_DIR = os.path.join(SPAM_PATH, 'easy_ham')\n",

"SPAM_DIR = os.path.join(SPAM_PATH, 'spam')\n",

"\n",

"# Only names longer than 20 characters are kept; the original note says this\n",

"# is there to skip a cmd file stored in the same folders.\n",

"ham_filenames = sorted(entry for entry in os.listdir(HAM_DIR) if len(entry) > 20)\n",

"spam_filenames = sorted(entry for entry in os.listdir(SPAM_DIR) if len(entry) > 20)"

]

},

{

"cell_type": "markdown",

"metadata": {},

"source": [

"得到2500封正常邮件,和500封垃圾邮件"

]

},

{

"cell_type": "code",

"execution_count": 7,

"metadata": {},

"outputs": [],

"source": [

"# Parse the raw messages with Python's 'email' package (it handles headers, encodings, etc.).\n",

"import email\n",

"import email.parser  # imported explicitly: 'import email' alone does not load this submodule\n",

"import email.policy\n",

"\n",

"\n",

"def load_email(is_spam, filename, spam_path=SPAM_PATH):\n",

"    \"\"\"Read one message file from the corpus and parse it.\n",

"\n",

"    Args:\n",

"        is_spam: truthy to read from the 'spam' sub-folder, falsy for 'easy_ham'.\n",

"        filename: name of the message file inside that sub-folder.\n",

"        spam_path: corpus root directory; defaults to SPAM_PATH.\n",

"\n",

"    Returns:\n",

"        The message object returned by\n",

"        email.parser.BytesParser(policy=email.policy.default).parse().\n",

"    \"\"\"\n",

"    directory = \"spam\" if is_spam else \"easy_ham\"\n",

"    # Opened in binary mode because BytesParser.parse() reads a binary file object.\n",

"    with open(os.path.join(spam_path, directory, filename), \"rb\") as f:\n",

"        return email.parser.BytesParser(policy=email.policy.default).parse(f)"

]

},

{

"cell_type": "code",

"execution_count": 9,

"metadata": {},

"outputs": [

{

"name": "stdout",

"output_type": "stream",

"text": [

"1) Fight The Risk of Cancer!\n",

"http://www.adclick.ws/p.cfm?o=315&s=pk007\n",

"\n",

"2) Slim Down - Guaranteed to lose 10-12 lbs in 30 days\n",

"http://www.adclick.ws/p.cfm?o=249&s=pk007\n",

"\n",

"3) Get the Child Support You Deserve - Free Legal Advice\n",

"http://www.adclick.ws/p.cfm?o=245&s=pk002\n",

"\n",

"4) Join the Web's Fastest Growing Singles Community\n",

"http://www.adclick.ws/p.cfm?o=259&s=pk007\n",

"\n",

"5) Start Your Private Photo Album Online!\n",

"http://www.adclick.ws/p.cfm?o=283&s=pk007\n",

"\n",

"Have a Wonderful Day,\n",

"Offer Manager\n",

"PrizeMama\n",

"\n",

"\n",

"\n",

"\n",

"\n",

"\n",

"\n",

"\n",

"\n",

"\n",

"\n",

"\n",

"\n",

"If you wish to leave this list please use the link below.\n",

"http://www.qves.com/trim/?ilug@linux.ie%7C17%7C114258\n",

"\n",

"\n",

"-- \n",

"Irish Linux Users' Group: ilug@linux.ie\n",

"http://www.linux.ie/mailman/listinfo/ilug for (un)subscription information.\n",

"List maintainer: listmaster@linux.ie\n"

]

}

],

"source": [

"# get all email data list\n",

"ham_emails = [load_email(is_spam=False, filename=name) for name in ham_filenames]\n",

"spam_emails = [load_email(is_spam=True, filename=name) for name in spam_filenames]\n",

"print(spam_emails[1].get_content().strip())"

]

},

{

"cell_type": "code",

"execution_count": 24,

"metadata": {},

"outputs": [],

"source": [

"def get_email_structure(email):\n",

"    \"\"\"Describe the MIME layout of a message as a string.\n",

"\n",

"    Note: inside this function the parameter name hides the 'email' module.\n",

"\n",

"    Args:\n",

"        email: a parsed message object, or a str (which is returned unchanged).\n",

"\n",

"    Returns:\n",

"        'multipart(a,b,...)' built recursively when get_payload() returns a\n",

"        list, otherwise the value of get_content_type() for the message.\n",

"    \"\"\"\n",

"    if isinstance(email, str):\n",

"        return email\n",

"    payload = email.get_payload()\n",

"    # A list payload means the message carries sub-parts; describe each one recursively.\n",

"    if isinstance(payload, list):\n",

"        return 'multipart({})'.format(','.join([\n",

"            get_email_structure(sub_email) for sub_email in payload\n",

"        ]))\n",

"    else:\n",

"        return email.get_content_type()"

]

},

{

"cell_type": "code",

"execution_count": 25,

"metadata": {},

"outputs": [

{

"name": "stdout",

"output_type": "stream",

"text": [

"text/plain\n"

]

}

],

"source": [

"print(get_email_structure(ham_emails[5]))"

]

},

{

"cell_type": "code",

"execution_count": 26,

"metadata": {},

"outputs": [],

"source": [

"from collections import Counter\n",

"\n",

"\n",

"def structures_counter(emails):\n",

"    \"\"\"Count how many messages share each structure string.\n",

"\n",

"    Args:\n",

"        emails: iterable of items accepted by get_email_structure().\n",

"\n",

"    Returns:\n",

"        collections.Counter mapping each structure string to its number of occurrences.\n",

"    \"\"\"\n",

"    return Counter(get_email_structure(message) for message in emails)"

]

},

{

"cell_type": "markdown",

"metadata": {},

"source": [

"查看两种邮件的类型汇总,作简单的分析"

]

},

{

"cell_type": "code",

"execution_count": 27,

"metadata": {},

"outputs": [

{

"data": {

"text/plain": [

"[('text/plain', 2408),\n",

" ('multipart(text/plain,application/pgp-signature)', 66),\n",

" ('multipart(text/plain,text/html)', 8),\n",

" ('multipart(text/plain,text/plain)', 4),\n",

" ('multipart(text/plain)', 3),\n",

" ('multipart(text/plain,application/octet-stream)', 2),\n",

" ('multipart(text/plain,text/enriched)', 1),\n",

" ('multipart(text/plain,application/ms-tnef,text/plain)', 1),\n",

" ('multipart(multipart(text/plain,text/plain,text/plain),application/pgp-signature)',\n",

" 1),\n",

" ('multipart(text/plain,video/mng)', 1),\n",

" ('multipart(text/plain,multipart(text/plain))', 1),\n",

" ('multipart(text/plain,application/x-pkcs7-signature)', 1),\n",

" ('multipart(text/plain,multipart(text/plain,text/plain),text/rfc822-headers)',\n",

" 1),\n",

" ('multipart(text/plain,multipart(text/plain,text/plain),multipart(multipart(text/plain,application/x-pkcs7-signature)))',\n",

" 1),\n",

" ('multipart(text/plain,application/x-java-applet)', 1)]"

]

},

"execution_count": 27,

"metadata": {},

"output_type": "execute_result"

}

],

"source": [

"structures_counter(ham_emails).most_common()"

]

},

{

"cell_type": "code",

"execution_count": 28,

"metadata": {

"scrolled": true

},

"outputs": [

{

"data": {

"text/plain": [

"[('text/plain', 218),\n",

" ('text/html', 183),\n",

" ('multipart(text/plain,text/html)', 45),\n",

" ('multipart(text/html)', 20),\n",

" ('multipart(text/plain)', 19),\n",

" ('multipart(multipart(text/html))', 5),\n",

" ('multipart(text/plain,image/jpeg)', 3),\n",

" ('multipart(text/html,application/octet-stream)', 2),\n",

" ('multipart(text/plain,application/octet-stream)', 1),\n",

" ('multipart(text/html,text/plain)', 1),\n",

" ('multipart(multipart(text/html),application/octet-stream,image/jpeg)', 1),\n",

" ('multipart(multipart(text/plain,text/html),image/gif)', 1),\n",

" ('multipart/alternative', 1)]"

]

},

"execution_count": 28,

"metadata": {},

"output_type": "execute_result"

}

],

"source": [

"structures_counter(spam_emails).most_common()"

]

},

{

"cell_type": "markdown",

"metadata": {},

"source": [

"感觉html的占比不是183/500,而是(183+45+20+5+2+1+1+1)/500"

]

},

{

"cell_type": "code",

"execution_count": 29,

"metadata": {},

"outputs": [

{

"name": "stdout",

"output_type": "stream",

"text": [

"Return-Path : <12a1mailbot1@web.de>\n",

"Delivered-To : zzzz@localhost.spamassassin.taint.org\n",

"Received : from localhost (localhost [127.0.0.1])\tby phobos.labs.spamassassin.taint.org (Postfix) with ESMTP id 136B943C32\tfor ; Thu, 22 Aug 2002 08:17:21 -0400 (EDT)\n",

"Received : from mail.webnote.net [193.120.211.219]\tby localhost with POP3 (fetchmail-5.9.0)\tfor zzzz@localhost (single-drop); Thu, 22 Aug 2002 13:17:21 +0100 (IST)\n",

"Received : from dd_it7 ([210.97.77.167])\tby webnote.net (8.9.3/8.9.3) with ESMTP id NAA04623\tfor ; Thu, 22 Aug 2002 13:09:41 +0100\n",

"From : 12a1mailbot1@web.de\n",

"Received : from r-smtp.korea.com - 203.122.2.197 by dd_it7 with Microsoft SMTPSVC(5.5.1775.675.6);\t Sat, 24 Aug 2002 09:42:10 +0900\n",

"To : dcek1a1@netsgo.com\n",

"Subject : Life Insurance - Why Pay More?\n",

"Date : Wed, 21 Aug 2002 20:31:57 -1600\n",

"MIME-Version : 1.0\n",

"Message-ID : <0103c1042001882DD_IT7@dd_it7>\n",

"Content-Type : text/html; charset=\"iso-8859-1\"\n",

"Content-Transfer-Encoding : quoted-printable\n"

]

}

],

"source": [

"for header, value in spam_emails[0].items():\n",

" print(header,\":\",value)"

]

},

{

"cell_type": "code",

"execution_count": 30,

"metadata": {},

"outputs": [

{

"data": {

"text/plain": [

"'Life Insurance - Why Pay More?'"

]

},

"execution_count": 30,

"metadata": {},

"output_type": "execute_result"

}

],

"source": [

"spam_emails[0][\"Subject\"]"

]

},

{

"cell_type": "code",

"execution_count": 31,

"metadata": {},

"outputs": [

{

"data": {

"text/plain": [

"'text/html; charset=\"iso-8859-1\"'"

]

},

"execution_count": 31,

"metadata": {},

"output_type": "execute_result"

}

],

"source": [

"spam_emails[0][\"Content-Type\"]"

]

},

{

"cell_type": "code",

"execution_count": 32,

"metadata": {},

"outputs": [

{

"data": {

"text/plain": [

"'<12a1mailbot1@web.de>'"

]

},

"execution_count": 32,

"metadata": {},

"output_type": "execute_result"

}

],

"source": [

"spam_emails[0][\"Return-Path\"]"

]

},

{

"cell_type": "markdown",

"metadata": {},

"source": [

"首先需要一个函数来将html转换为纯文本,使用[Beautifulsoup]库,下面的函数首先删除`

`部分,然后将所有` `标记转换为单词hyperlink,然后去掉所有html标记,只留下纯文本。为了可读性,它还用一个换行符替换多个换行符,最后它取消了HTML实体(例如`&gt;`或`&nbsp;`)"

]

},

{

"cell_type": "code",

"execution_count": 33,

"metadata": {},

"outputs": [],

"source": [

"# re performs the substitutions; unescape() decodes HTML entities.\n",

"import re\n",

"from html import unescape\n",

"\n",

"\n",

"def html_to_plain_text(html):\n",

"    \"\"\"Convert HTML markup to rough plain text using regular expressions.\n",

"\n",

"    Steps, in order: drop the <head>...</head> section, replace every opening\n",

"    <a ...> tag with the word HYPERLINK, strip all remaining tags, collapse\n",

"    runs of line breaks (with any whitespace before them) into one newline,\n",

"    and finally decode HTML entities such as &gt; or &nbsp;.\n",

"\n",

"    Args:\n",

"        html: HTML markup as a str.\n",

"\n",

"    Returns:\n",

"        The extracted text as a str.\n",

"    \"\"\"\n",

"    # NOTE(review): the first two patterns were unreadable in the stored notebook and\n",

"    # were rebuilt from the step description in the preceding markdown cell; the exact\n",

"    # replacement token ' HYPERLINK ' (case, padding) should be confirmed.\n",

"    text = re.sub(r'<head.*?>.*?</head>', '', html, flags=re.M | re.S | re.I)\n",

"    text = re.sub(r'<a\\s.*?>', ' HYPERLINK ', text, flags=re.M | re.S | re.I)\n",

"    # Non-greedy match so each tag is removed on its own, not everything between the first '<' and last '>'.\n",

"    text = re.sub(r'<.*?>', '', text, flags=re.M | re.S)\n",

"    text = re.sub(r'(\\s*\\n)+', '\\n', text, flags=re.M | re.S)\n",

"    return unescape(text)"

]

},

{

"cell_type": "markdown",

"metadata": {},

"source": [

"## 拆分训练集和测试集合"

]

},

{

"cell_type": "code",

"execution_count": 34,

"metadata": {},

"outputs": [],

"source": [

"# Use numpy and scikit-learn to split the data into a training set and a test set.\n",

"import numpy as np\n",

"from sklearn.model_selection import train_test_split\n",

"\n",

"# The elements are parsed message objects, not numbers, so the object dtype is\n",

"# requested explicitly instead of relying on NumPy's implicit inference.\n",

"X = np.array(ham_emails + spam_emails, dtype=object)\n",

"# Labels follow the same order as X: 0 for ham, 1 for spam.\n",

"y = np.array([0] * len(ham_emails) + [1] * len(spam_emails))\n",

"# Guard against X being built with an unexpected shape (one entry per message is required).\n",

"assert X.shape == y.shape, 'expected one array element per email'\n",

"\n",

"# Hold out 20% of the messages; the fixed random_state makes the split repeatable.\n",

"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)"

]

},

{

"cell_type": "code",

"execution_count": 38,

"metadata": {},

"outputs": [

{

"name": "stdout",

"output_type": "stream",

"text": [

"

MILFhunter\n",

"

"content=\"text/html; charset=windows-1252\">

"CONTENT=\"NO-CACHE\">\n",

"

"\n",

"\n",

"\n",

"\n",

"

"leftMargin=0 background=\"http://www.fromyou2.com/nasty/milf/bg.jpg\"\n",

"topMargin=0>\n",

"


\n",

"

\n",

"

\n",

"
\n",

" \n",

"

\n",

"

\n",

"

\n",

"

\n",

"

\n",

"

\n",

"

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值