web数据挖掘-异步加载与请求头

实现网页数据可视化

Web数据挖掘

项目内容:
根据指定的标题或者内容关键字,爬取网易邮件标题、日期、发件人、邮件内容信息,并将爬取的内容存成Excel 文件。
要求:整个爬取过程实现全自动或者半自动登录,输入标题或者内容关键字,然后自动完成爬取和信息存储。

首先了解所需的工具,并在 PyCharm 中导入相应的模块:

import urllib.request
import re
import http.cookiejar
import urllib.parse
import xlwt
import jieba
import jieba.posseg as pseg

初始化

import urllib.request
import re
import http.cookiejar
import urllib.parse
import xlwt
import jieba
import jieba.posseg as pseg

class MAIL:
    """Scraper session for the 163.com webmail, built on ``urllib``.

    Constructing an instance prepares the login request data and a
    cookie-aware opener; no network traffic happens until a method is called.
    """

    def __init__(self, username='', pwd=''):
        """Prepare login data, request headers and the cookie-aware opener.

        Args:
            username: 163 mailbox account. Defaults to an empty string, the
                same value the attribute had when it was hard-coded.
            pwd: mailbox password. Defaults to an empty string.

        Side effects:
            Installs the opener as urllib's process-wide default, so plain
            ``urllib.request.urlopen`` calls share this instance's cookie jar.
        """
        # URL that receives the login POST.
        self.loginUrl = "https://mail.163.com/entry/cgi/ntesdoor?style=-1&df=mail163_letter&net=&language=-1&from=web&race=&iframe=1&product=mail163&funcid=loginone&passtype=1&allssl=true&url2=https://mail.163.com/errorpage/error163.htm"
        # Proxy address kept for callers that read it.
        # NOTE(review): nothing in this class wires it into the opener.
        self.proxyUrl = "http://202.106.16.36:3128"
        # Session id; filled in after a successful login.
        self.sid = ""
        # Request headers for the login POST (the stray ",," that used to sit
        # in the Accept value has been removed).
        self.loginHeaders = {
            'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            'Accept-Language': "zh-CN,zh;q=0.9",
            'Connection': "keep-alive",
            'Host': "mail.163.com",
            'Referer': "http://mail.163.com/",
            'User-Agent': "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Mobile Safari/537.36",
        }
        # Credentials used to build the login form.
        self.username = username
        self.pwd = pwd
        # Form fields sent with the login POST.
        self.post = {
            'savelogin': "0",
            'url2': "http://mail.163.com/errorpage/error163.htm",
            'username': self.username,
            'password': self.pwd,
        }
        # URL-encode the form and convert it to bytes, as urllib requires.
        self.postData = urllib.parse.urlencode(self.post).encode('utf8')
        # The cookie jar collects cookies set by the server; the processor
        # replays them on later requests made through the opener.
        self.cjar = http.cookiejar.CookieJar()
        self.cookie = urllib.request.HTTPCookieProcessor(self.cjar)
        self.opener = urllib.request.build_opener(self.cookie)
        # Make the opener global so urlopen() calls share the same cookies.
        urllib.request.install_opener(self.opener)

模拟登录并获取 sid 码

    def loginPage(self):
        try:
            #发出一个请求
            self.request = urllib.request.Request(self.loginUrl,self.postData,self.loginHeaders)
        except urllib.error.HTTPError as e:
            print(e.code)
            print(e.read().decode("utf8"))
        #得到响应
        self.response = urllib.request.urlopen(self.request)
        #需要将响应中的内容用read读取出来获得网页代码,网页编码为utf-8
        self.content = self.response.read().decode("utf8")

        # 设定提取sid码的正则表达式
        self.sidpattern = re.compile('sid=(.*?)&', re.S)
        self.result = re.search(self.sidpattern, self.content)
        self.sid = self.result.group(1)
    

通过邮件的 mid 获取单封邮件的正文内容(请求头中的 Referer 需要带上 sid 码)

    def messageList_1(self,mid):
        listUrl = 'https://mail.163.com/js6/read/readhtml.jsp?mid=%s&userType=browser&font=15&color=064977'%mid
        Headers = {
            'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            'Accept-Language': "zh-CN,zh;q=0.9",
            'Accept - Encoding':'gzip, deflate, br',
            'Connection': "keep-alive",
            'Cookie':'mail_health_check_time=1556609066959; locale=; mail_psc_fingerprint=a8dffedbebe63745cf15fd89c0a7beab; usertrack=CrH7v1zGXraRqTW2AxYUAg==; _ntes_nnid=75d008d2fbf85b5f283e79661d25054e,1556504250620; _ntes_nuid=75d008d2fbf85b5f283e79661d25054e; __yadk_uid=JIrHKqgtaS6VWijpsx8neWDebsCUx4GQ; Province=028; City=028; nts_mail_user=luyuan_y@163.com:-1:1; NTES_SESS=qdvGTMy1RczM_evMhDHgzmbBcXt0kLGPMwy9MnJfIry2krW0kCD17o3hutBrklbO_lxl8GpF9205MHbn0UgD8SNrxzjTvHaoLP_crV0UOmEmVKlhfhM6nKuozZ_IEj093ORX7ux3FebmANC8oMHymnrq4I6i2UL4hof9QKZegcqq5XYdqJf4xpAvSLBy.d0sDMHVNpB4O992o; S_INFO=1556974521|0|3&80##|luyuan_y; P_INFO=luyuan_y@163.com|1556974521|0|urs|00&99|sic&1556974220&mail163#sic&510100#10#0#0|&0|mail163|luyuan_y@163.com; mail_upx=t4gd.mail.163.com|t1gd.mail.163.com|t2gd.mail.163.com|t3gd.mail.163.com|t3bj.mail.163.com|t4bj.mail.163.com|t1bj.mail.163.com|t2bj.mail.163.com; mail_upx_nf=; mail_idc=; Coremail=49b62afb204b2%yAgtufmkEcJAvvgxkykkfhxWeSJLEcLt%g6a47.mail.163.com; MAIL_MISC=luyuan_y; cm_last_info=dT1sdXl1YW5feSU0MDE2My5jb20mZD1odHRwJTNBJTJGJTJGbWFpbC4xNjMuY29tJTJGanM2JTJGbWFpbi5qc3AlM0ZzaWQlM0R5QWd0dWZta0VjSkF2dmd4a3lra2ZoeFdlU0pMRWNMdCZzPXlBZ3R1Zm1rRWNKQXZ2Z3hreWtrZmh4V2VTSkxFY0x0Jmg9aHR0cCUzQSUyRiUyRm1haWwuMTYzLmNvbSUyRmpzNiUyRm1haW4uanNwJTNGc2lkJTNEeUFndHVmbWtFY0pBdnZneGt5a2tmaHhXZVNKTEVjTHQmdz1tYWlsLjE2My5jb20mbD0tMSZ0PS0x; MAIL_SESS=qdvGTMy1RczM_evMhDHgzmbBcXt0kLGPMwy9MnJfIry2krW0kCD17o3hutBrklbO_lxl8GpF9205MHbn0UgD8SNrxzjTvHaoLP_crV0UOmEmVKlhfhM6nKuozZ_IEj093ORX7ux3FebmANC8oMHymnrq4I6i2UL4hof9QKZegcqq5XYdqJf4xpAvSLBy.d0sDMHVNpB4O992o; MAIL_SINFO=1556974521|0|3&80##|luyuan_y; MAIL_PINFO=luyuan_y@163.com|1556974521|0|urs|00&99|sic&1556974220&mail163#sic&510100#10#0#0|&0|mail163|luyuan_y@163.com; secu_info=1; 
mail_entry_sess=12b0551f63634e38ee8cac3f784f08069530ac9a68acfcd7cc2ec20f0b56fd986a2ea45d266ebef4c9094970bba69bd98faf6f0c9dd8bea99b53094082559b15d14c1e8c0a913d485f2aa25b8285dc94a98b242892190e6f55349a3edb87f104d73edc86987e92910bdff80ff47815b1357840d1ea8d2e54f503790ce40403ab463059c917f30b2a03565f17274ff278b8f7533f24fe2cff89902c886d17703423f3918238efb3bd44b5943f3ed29c70870efd3ac0803af6d1163e26b7b691f5; Coremail.sid=yAgtufmkEcJAvvgxkykkfhxWeSJLEcLt; mail_style=js6; mail_uid=luyuan_y@163.com; mail_host=mail.163.com; NNSSPID=f958ae82ac9d48a0b03831ec1ec0c2c3',
            'Host': "mail.163.com",
            'Referer': "https://mail.163.com/js6/main.jsp?sid=%s&df=mail163_letter"%self.sid,
            'Upgrade-Insecure-Requests':'1',
            'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3676.400 QQBrowser/10.4.3469.400"

        }
        request = urllib.request.Request(listUrl, headers=Headers)
        response = self.opener.open(request)
        content1 = response.read().decode('utf-8')
        pattern = re.compile('[\u4e00-\u9fa5]+', re.S)
        mass = re.findall(pattern, content1)
        list_new = []
        for mm in mass:
            list_new.append(mm.replace("微软雅黑",""))
            list_new.append(" ")
        return list_new

通过 sid 码请求收件箱列表接口,获取收件箱的邮件列表信息

    def messageList(self):
       
        listUrl =  'http://mail.163.com/js6/s?sid=%s&func=mbox:listMessages&TopTabReaderShow=1&TopTabLofterShow=1&welcome_welcomemodule_mailrecom_click=1&LeftNavfolder1Click=1&mbox_folder_enter=1'%self.sid
        #listUrl = 'http://mail.163.com/js6/s?sid=%s&func=mbox:listMessages' % self.sid
        #新的请求头
        Headers = {
            'Accept': "text/javascript",
            'Accept-Language': "zh-CN,zh;q=0.9",
            'Connection': "keep-alive",
            'Host': "mail.163.com",
            'Referer': "https://mail.163.com/js6/main.jsp?sid=%suCFJZNnnRnInrsigqunnSrQXsvMMqctH&df=mail163_letter"%self.sid,
            'User-Agent':"Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Mobile Safari/537.36"
        }
        #发出请求并获得响应
        request = urllib.request.Request(listUrl, headers = Headers)
        response = self.opener.open(request)
        #提取响应的页面内容,里面是收件箱的信息
        content = response.read().decode('utf-8')
        return content
        
        

获取邮件信息

    
    def getmail(self, keyword=None, output_path='excelwrite.xls'):
        """Find inbox messages matching a keyword and save them to an .xls file.

        The keyword and the from / to / subject / sentDate / receivedDate
        fields of each listed message are segmented with jieba; a message
        matches when any of its tokens equals one of the keyword tokens.
        Matching messages are printed and written to the workbook together
        with the text returned by ``messageList_1`` for that message.

        Args:
            keyword: search text. When ``None`` the user is prompted on stdin.
            output_path: where the workbook is saved.
        """
        if keyword is None:
            keyword = input("请输入想要爬取的内容:")
        input_word = keyword

        # Whitespace-only tokens are discarded: jieba yields the blanks
        # between words as tokens, and keeping them would make any field that
        # contains a space count as a match.
        keywords = {w.word for w in pseg.cut(input_word) if w.word.strip()}

        messages = self.messageList()
        # Capture groups, in order: id, fid, size, from, to, subject,
        # sentDate, receivedDate, hmid.
        pattern = re.compile(
            "'id'..(.*?),.*?fid..(.*?),.*?size..(.*?),.*?from..(.*?),.*?to..(.*?),.*?subject..(.*?),.*?sentDate':new Dat..(.*?)...'.*?receivedDate':new Dat..(.*?)...'.*?hmid..(.*?),\n",
            re.S)
        mails = re.findall(pattern, messages)

        workbook = xlwt.Workbook()
        worksheet = workbook.add_sheet('test')
        titles = ('发件人', '收件人', '主题', '发送时间', '接收时间', '邮件内容')
        for col, title in enumerate(titles):
            worksheet.write(0, col, title)

        print('\n')
        print('和{}相关的内容有:'.format(input_word))

        row = 1  # row 0 holds the column titles
        for mail in mails:
            # Groups 3..7 are from, to, subject, sentDate and receivedDate.
            fields = mail[3:8]
            matched = any(
                w.word in keywords
                for field in fields
                for w in pseg.cut(field)
            )
            if not matched:
                continue
            print('-' * 50)
            print('发件人:', mail[3], '主题:', mail[5], '发送时间:', mail[6])
            print('收件人:', mail[4], u'接收时间:', mail[7])
            for col, value in enumerate(fields):
                worksheet.write(row, col, value)
            # The captured id still carries its quotes; strip them before
            # requesting the message body.
            worksheet.write(row, 5, self.messageList_1(mail[0].replace("'", "")))
            row += 1
        workbook.save(output_path)

创建163邮箱爬虫对象,并依次执行登录与爬取


if __name__ == "__main__":
    # Run the scraper only when executed as a script, so importing this
    # module does not log in, prompt for input or touch the network.
    mail = MAIL()
    mail.loginPage()
    mail.getmail()

如果 Excel 结果中显示登录异常,通常是因为 Cookie 已过期或网页发生了变化;请登录邮箱后打开任意一封邮件,从浏览器的请求头中复制新的 Cookie,并更新 messageList_1 所使用的 Cookie,位置如下:

在这里插入图片描述

部分结果如下:
在这里插入图片描述

  • 3
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值