实现网页数据可视化
Web数据挖掘
项目内容:
根据指定的标题或者内容关键字,爬取网易邮件标题、日期、发件人、邮件内容信息,并将爬取的内容存成Excel 文件。
要求:整个爬取过程实现全自动或者半自动登录,输入标题或者内容关键字,然后自动完成爬取和信息存储。
首先了解所需的工具,在 PyCharm 中导入相应的模块:
import urllib.request
import re
import http.cookiejar
import urllib.parse
import xlwt
import jieba
import jieba.posseg as pseg
初始化
import urllib.request
import re
import http.cookiejar
import urllib.parse
import xlwt
import jieba
import jieba.posseg as pseg
class MAIL:
    """Crawler state for the 163.com web mailbox.

    Holds the login URL, request headers, credentials, the urlencoded login
    form, and a cookie-aware urllib opener that is shared by every request
    made through this object.
    """

    def __init__(self, username='', pwd=''):
        """Prepare the login request data and a cookie-aware opener.

        Args:
            username: 163 mailbox account. Defaults to an empty string, which
                matches the previous hard-coded value.
            pwd: Mailbox password. Defaults to an empty string.

        Side effects:
            Installs the created opener as the process-wide default urllib
            opener via ``urllib.request.install_opener``.
        """
        # URL the login form is POSTed to.
        self.loginUrl = "https://mail.163.com/entry/cgi/ntesdoor?style=-1&df=mail163_letter&net=&language=-1&from=web&race=&iframe=1&product=mail163&funcid=loginone&passtype=1&allssl=true&url2=https://mail.163.com/errorpage/error163.htm"
        # Proxy address kept for reference.
        # NOTE(review): no ProxyHandler is built from this value, so requests
        # are NOT routed through the proxy.
        self.proxyUrl = "http://202.106.16.36:3128"
        # Session id; empty until a login has succeeded.
        self.sid = ""
        # Request headers sent with the login POST.
        # Fix: the Accept value previously contained an empty media range
        # ("q=0.9,,image/webp").
        self.loginHeaders = {
            'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            'Accept-Language': "zh-CN,zh;q=0.9",
            'Connection': "keep-alive",
            'Host': "mail.163.com",
            'Referer': "http://mail.163.com/",
            'User-Agent': "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Mobile Safari/537.36",
        }
        # Credentials (163 mailbox account and password).
        self.username = username
        self.pwd = pwd
        # Fields of the login form.
        self.post = {
            'savelogin': "0",
            'url2': "http://mail.163.com/errorpage/error163.htm",
            'username': self.username,
            'password': self.pwd,
        }
        # The form must be urlencoded and sent as bytes.
        self.postData = urllib.parse.urlencode(self.post).encode('utf8')
        # Cookie jar + processor so cookies set by the login response are
        # replayed on later requests made through the same opener.
        self.cjar = http.cookiejar.CookieJar()
        self.cookie = urllib.request.HTTPCookieProcessor(self.cjar)
        self.opener = urllib.request.build_opener(self.cookie)
        # Make the opener the global default so urllib.request.urlopen()
        # shares the same cookie jar.
        urllib.request.install_opener(self.opener)
模拟登录并获取 sid 码
def loginPage(self):
    """POST the login form and store the session id in ``self.sid``.

    The request is sent through ``self.opener`` so that cookies set by the
    login response land in this object's cookie jar. The decoded response
    body is kept in ``self.content`` and the sid is extracted from the first
    ``sid=...&`` occurrence in it.

    Raises:
        urllib.error.HTTPError: If the server answers with an HTTP error
            status. The status code and body are printed first.
        RuntimeError: If the response contains no non-empty sid.
    """
    self.request = urllib.request.Request(
        self.loginUrl, self.postData, self.loginHeaders)
    # The network call is what can raise HTTPError, so it (and not the
    # Request construction) belongs inside the try block.
    try:
        with self.opener.open(self.request) as response:
            self.response = response
            # The page is served as UTF-8.
            self.content = response.read().decode("utf8")
    except urllib.error.HTTPError as e:
        print(e.code)
        print(e.read().decode("utf8", errors="replace"))
        raise
    # Regular expression that pulls the sid out of the returned page.
    self.sidpattern = re.compile('sid=(.*?)&', re.S)
    self.result = re.search(self.sidpattern, self.content)
    if self.result is None or not self.result.group(1):
        # Without this check a failed login surfaced as an AttributeError
        # on None, which hid the real cause.
        raise RuntimeError(
            "login failed: no sid found in the login response")
    self.sid = self.result.group(1)
通过邮件的 mid(并在 Referer 中携带 sid 码)获取单封邮件的正文内容
def messageList_1(self, mid, cookie=None):
    """Fetch one mail's HTML body and return its Chinese text fragments.

    Args:
        mid: Message id, inserted verbatim into the ``readhtml.jsp`` URL.
        cookie: Optional value for the ``Cookie`` request header. When
            omitted, the session cookie that was previously hard-coded in
            this method is sent, so existing callers behave as before.

    Returns:
        A list alternating between a run of CJK characters found in the page
        (with the font name "微软雅黑" removed) and a single-space string.
    """
    if cookie is None:
        # NOTE(review): this is a captured browser session cookie. It is
        # account-specific and expires; pass ``cookie=`` with a fresh value
        # when it stops working. An explicit Cookie header presumably
        # replaces the cookies collected at login — confirm against
        # http.cookiejar.
        # Fix: the literal used to be broken across two source lines.
        cookie = (
            'mail_health_check_time=1556609066959; locale=; '
            'mail_psc_fingerprint=a8dffedbebe63745cf15fd89c0a7beab; '
            'usertrack=CrH7v1zGXraRqTW2AxYUAg==; '
            '_ntes_nnid=75d008d2fbf85b5f283e79661d25054e,1556504250620; '
            '_ntes_nuid=75d008d2fbf85b5f283e79661d25054e; '
            '__yadk_uid=JIrHKqgtaS6VWijpsx8neWDebsCUx4GQ; '
            'Province=028; City=028; '
            'nts_mail_user=luyuan_y@163.com:-1:1; '
            'NTES_SESS=qdvGTMy1RczM_evMhDHgzmbBcXt0kLGPMwy9MnJfIry2krW0kCD17o3hutBrklbO_lxl8GpF9205MHbn0UgD8SNrxzjTvHaoLP_crV0UOmEmVKlhfhM6nKuozZ_IEj093ORX7ux3FebmANC8oMHymnrq4I6i2UL4hof9QKZegcqq5XYdqJf4xpAvSLBy.d0sDMHVNpB4O992o; '
            'S_INFO=1556974521|0|3&80##|luyuan_y; '
            'P_INFO=luyuan_y@163.com|1556974521|0|urs|00&99|sic&1556974220&mail163#sic&510100#10#0#0|&0|mail163|luyuan_y@163.com; '
            'mail_upx=t4gd.mail.163.com|t1gd.mail.163.com|t2gd.mail.163.com|t3gd.mail.163.com|t3bj.mail.163.com|t4bj.mail.163.com|t1bj.mail.163.com|t2bj.mail.163.com; '
            'mail_upx_nf=; mail_idc=; '
            'Coremail=49b62afb204b2%yAgtufmkEcJAvvgxkykkfhxWeSJLEcLt%g6a47.mail.163.com; '
            'MAIL_MISC=luyuan_y; '
            'cm_last_info=dT1sdXl1YW5feSU0MDE2My5jb20mZD1odHRwJTNBJTJGJTJGbWFpbC4xNjMuY29tJTJGanM2JTJGbWFpbi5qc3AlM0ZzaWQlM0R5QWd0dWZta0VjSkF2dmd4a3lra2ZoeFdlU0pMRWNMdCZzPXlBZ3R1Zm1rRWNKQXZ2Z3hreWtrZmh4V2VTSkxFY0x0Jmg9aHR0cCUzQSUyRiUyRm1haWwuMTYzLmNvbSUyRmpzNiUyRm1haW4uanNwJTNGc2lkJTNEeUFndHVmbWtFY0pBdnZneGt5a2tmaHhXZVNKTEVjTHQmdz1tYWlsLjE2My5jb20mbD0tMSZ0PS0x; '
            'MAIL_SESS=qdvGTMy1RczM_evMhDHgzmbBcXt0kLGPMwy9MnJfIry2krW0kCD17o3hutBrklbO_lxl8GpF9205MHbn0UgD8SNrxzjTvHaoLP_crV0UOmEmVKlhfhM6nKuozZ_IEj093ORX7ux3FebmANC8oMHymnrq4I6i2UL4hof9QKZegcqq5XYdqJf4xpAvSLBy.d0sDMHVNpB4O992o; '
            'MAIL_SINFO=1556974521|0|3&80##|luyuan_y; '
            'MAIL_PINFO=luyuan_y@163.com|1556974521|0|urs|00&99|sic&1556974220&mail163#sic&510100#10#0#0|&0|mail163|luyuan_y@163.com; '
            'secu_info=1; '
            'mail_entry_sess=12b0551f63634e38ee8cac3f784f08069530ac9a68acfcd7cc2ec20f0b56fd986a2ea45d266ebef4c9094970bba69bd98faf6f0c9dd8bea99b53094082559b15d14c1e8c0a913d485f2aa25b8285dc94a98b242892190e6f55349a3edb87f104d73edc86987e92910bdff80ff47815b1357840d1ea8d2e54f503790ce40403ab463059c917f30b2a03565f17274ff278b8f7533f24fe2cff89902c886d17703423f3918238efb3bd44b5943f3ed29c70870efd3ac0803af6d1163e26b7b691f5; '
            'Coremail.sid=yAgtufmkEcJAvvgxkykkfhxWeSJLEcLt; '
            'mail_style=js6; mail_uid=luyuan_y@163.com; '
            'mail_host=mail.163.com; '
            'NNSSPID=f958ae82ac9d48a0b03831ec1ec0c2c3'
        )
    listUrl = 'https://mail.163.com/js6/read/readhtml.jsp?mid=%s&userType=browser&font=15&color=064977' % mid
    # Fix: the former 'Accept - Encoding' entry was dropped. Its name was
    # malformed, and advertising gzip/br would be wrong anyway because the
    # body below is decoded as plain UTF-8 without decompression.
    Headers = {
        'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        'Accept-Language': "zh-CN,zh;q=0.9",
        'Connection': "keep-alive",
        'Cookie': cookie,
        'Host': "mail.163.com",
        'Referer': "https://mail.163.com/js6/main.jsp?sid=%s&df=mail163_letter" % self.sid,
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3676.400 QQBrowser/10.4.3469.400",
    }
    request = urllib.request.Request(listUrl, headers=Headers)
    # Close the response as soon as the body has been read.
    with self.opener.open(request) as response:
        content1 = response.read().decode('utf-8')
    # Only runs of CJK characters are kept; markup and ASCII are discarded.
    pattern = re.compile('[\u4e00-\u9fa5]+', re.S)
    list_new = []
    for fragment in pattern.findall(content1):
        # The font name leaks in from inline CSS and is not mail content.
        list_new.append(fragment.replace("微软雅黑", ""))
        list_new.append(" ")
    return list_new
请求收件箱列表接口,获取收件箱中的邮件列表信息
def messageList(self):
    """Request the inbox listing and return the raw response text.

    The request goes to the ``mbox:listMessages`` endpoint with the current
    ``self.sid`` and is sent through ``self.opener``.

    Returns:
        The response body decoded as UTF-8.
    """
    listUrl = 'http://mail.163.com/js6/s?sid=%s&func=mbox:listMessages&TopTabReaderShow=1&TopTabLofterShow=1&welcome_welcomemodule_mailrecom_click=1&LeftNavfolder1Click=1&mbox_folder_enter=1' % self.sid
    # Headers for the listing request.
    # Fix: the Referer used to have a stale sid from an old session glued
    # directly after the placeholder, producing "sid=<current><old>".
    Headers = {
        'Accept': "text/javascript",
        'Accept-Language': "zh-CN,zh;q=0.9",
        'Connection': "keep-alive",
        'Host': "mail.163.com",
        'Referer': "https://mail.163.com/js6/main.jsp?sid=%s&df=mail163_letter" % self.sid,
        'User-Agent': "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Mobile Safari/537.36",
    }
    request = urllib.request.Request(listUrl, headers=Headers)
    # The body holds the inbox listing; close the response once it is read.
    with self.opener.open(request) as response:
        content = response.read().decode('utf-8')
    return content
获取邮件信息
def getmail(self, keyword=None, output_path='excelwrite.xls'):
    """Find inbox mails matching a keyword and export them to an .xls file.

    The keyword is segmented with ``pseg.cut``. A mail matches when any
    segment of its sender, recipient, subject, sent date or received date
    equals one of the keyword's segments. Matching mails are printed and
    written to the sheet, one row each, with the body text fetched through
    ``self.messageList_1``.

    Args:
        keyword: Text to search for. When ``None`` the user is prompted on
            stdin, which is the previous behaviour.
        output_path: Where the workbook is saved. Defaults to the previously
            hard-coded ``'excelwrite.xls'``.
    """
    if keyword is None:
        keyword = input("请输入想要爬取的内容:")
    # A set gives O(1) membership tests while scanning every mail field.
    keywords = {token.word for token in pseg.cut(keyword)}

    messages = self.messageList()
    # Captures, per mail: id, fid, size, from, to, subject, sentDate,
    # receivedDate, hmid (tuple indices 0-8).
    pattern = re.compile(
        "'id'..(.*?),.*?fid..(.*?),.*?size..(.*?),.*?from..(.*?),.*?to..(.*?),.*?subject..(.*?),.*?sentDate':new Dat..(.*?)...'.*?receivedDate':new Dat..(.*?)...'.*?hmid..(.*?),\n",
        re.S)
    mails = pattern.findall(messages)

    workbook = xlwt.Workbook()
    worksheet = workbook.add_sheet('test')
    # Fix: the fifth column title read '接受时间' (a typo); it now matches
    # the '接收时间' label used in the console output below.
    titles = ('发件人', '收件人', '主题', '发送时间', '接收时间', '邮件内容')
    for col, title in enumerate(titles):
        worksheet.write(0, col, title)

    print('\n')
    print('和{}相关的内容有:'.format(keyword))
    row = 1
    for mail in mails:
        # Fields 3..7 are from, to, subject, sentDate and receivedDate.
        # any() stops at the first hit, unlike the old break that only left
        # the innermost loop.
        matched = any(
            token.word in keywords
            for field in mail[3:8]
            for token in pseg.cut(field)
        )
        if not matched:
            continue
        print('-' * 50)
        print('发件人:', mail[3], '主题:', mail[5], '发送时间:', mail[6])
        print('收件人:', mail[4], u'接收时间:', mail[7])
        for col, value in enumerate(mail[3:8]):
            worksheet.write(row, col, value)
        # The captured id still carries its quotes; strip them before use.
        worksheet.write(row, 5, self.messageList_1(mail[0].replace("'", "")))
        row += 1
    workbook.save(output_path)
创建163邮箱爬虫类的实例,并依次执行登录和邮件爬取
# Run the crawler only when the file is executed as a script, so that
# importing the module does not trigger a login and an input prompt.
if __name__ == "__main__":
    mail = MAIL()
    # Log in first: the later requests need the sid obtained here.
    mail.loginPage()
    # Prompt for a keyword, search the inbox and write the Excel file.
    mail.getmail()
如 Excel 结果显示登录异常,通常是 Cookie 已失效或网页发生了变化,请在浏览器中登录邮箱并打开任意一封邮件,从请求头中复制新的 Cookie,替换 messageList_1 请求头中 'Cookie' 的值,位置如下:
部分结果如下: