今日头条爬取程序+邮件发到邮箱

需要安装requests,smtplib和openpyxl库,加入了发送邮件功能,部分代码来自CSDN:

其中smtplib和email属于Python标准库,无需单独安装;requests和openpyxl可使用pip install requests openpyxl 命令来安装

完整代码如下

import requests  
import json
from openpyxl import Workbook
import time
import hashlib
import os
import datetime
import smtplib
import random
from email.mime.text import MIMEText
from email.header import Header
from email.mime.multipart import MIMEMultipart
 
# Hot-news feed endpoint; the max_behot_time value and the as/cp signing
# parameters are appended to this prefix when a request is built.
start_url = (
    "https://www.toutiao.com/api/pc/feed/"
    "?category=news_hot&utm_source=toutiao&widen=1&max_behot_time="
)
# Site root, prepended to relative article and account paths.
url = "https://www.toutiao.com"

headers = {
    "user-agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/71.0.3578.98 Safari/537.36"
    ),
}
# A random tt_webid cookie value is used to reduce the chance of being
# blocked by the site's anti-crawling checks.
cookies = {
    "tt_webid": str(random.randint(66499490848923624618, 98746537462725254568)),
}

max_behot_time = "0"  # paging parameter of the feed URL
title = []            # collected news titles
source_url = []       # article links as returned by the feed
s_url = []            # absolute article links
source = []           # names of the publishing accounts
media_url = {}        # account name -> absolute account link

# Third-party SMTP service settings.
# NOTE(review): the password is hard-coded in the source; consider loading it
# from the environment or a config file that is not committed.
mail_host = "smtp.exmail.qq.com"   # SMTP server
mail_user = "201XXXXX@XXX.com"     # login user name
mail_pass = "r6ctG345kw8Mdai"      # login password
sender = "20XXXXX@XXX.com"         # sending address (e.g. your QQ mailbox)
receivers = ["XXXXX@qq.com"]       # receiving addresses
 
def get_as_cp():
    """Build the ``as`` and ``cp`` request parameters from the current time.

    The scheme follows Toutiao's obfuscated script ``home_4abea46.js``: the
    Unix timestamp is rendered as upper-case hex, its decimal form is hashed
    with MD5, and characters of both strings are interleaved.

    Returns:
        dict: ``{'as': ..., 'cp': ...}``. When the hex timestamp is not
        exactly 8 characters long, a fixed fallback pair is returned instead.
    """
    timestamp = round(time.time())
    print(timestamp)  # current machine time, in whole seconds
    hex_time = format(int(timestamp), 'X')  # upper-case hex without the 0x prefix
    print('e:', hex_time)
    hasher = hashlib.md5()
    print('a:', hasher)
    hasher.update(str(int(timestamp)).encode('utf-8'))
    digest = hasher.hexdigest().upper()
    print('i:', digest)

    # The interleaving below indexes hex_time[0..7], so any other length
    # falls back to the constant pair.
    if len(hex_time) != 8:
        return {'as': '479BB4B7254C150', 'cp': '7E0AC8874BB0985'}

    head, tail = digest[:5], digest[-5:]
    # as-body: digest head characters alternated with the first 5 hex digits.
    as_body = ''.join(d + h for d, h in zip(head, hex_time))
    # cp-body: the last 5 hex digits alternated with the digest tail characters.
    cp_body = ''.join(h + d for h, d in zip(hex_time[3:], tail))

    params = {
        'as': 'A1' + as_body + hex_time[-3:],
        'cp': hex_time[:3] + cp_body + 'E1',
    }
    print('zz:', params)
    return params
 
 
def getdata(url, headers, cookies, timeout=10):
    """Fetch *url* with a GET request and return the decoded JSON body.

    Args:
        url: Full request URL.
        headers: HTTP headers sent with the request.
        cookies: Cookies sent with the request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        The object produced by parsing the response text as JSON.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            4xx/5xx response status.
        ValueError: If the response body is not valid JSON.
    """
    print(url)  # log the URL up front so it is visible even if the request fails
    response = requests.get(url, headers=headers, cookies=cookies, timeout=timeout)
    # Fail with a clear HTTP error instead of trying to parse an error page.
    response.raise_for_status()
    return json.loads(response.text)
 
 
def savedata(title, s_url, source, media_url):
    """Write the collected articles to ``toutiao.xlsx`` in the working directory.

    The workbook has one sheet named ``data`` with a header row followed by
    one row per article: title, article link, account name, account link.

    Args:
        title: List of article titles.
        s_url: List of article links, parallel to ``title``.
        source: List of account names, parallel to ``title``.
        media_url: Mapping of account name to account link.
    """
    wb = Workbook()
    filename = 'toutiao.xlsx'  # output workbook
    ws = wb.active
    ws.title = 'data'
    ws.append(['标题', '新闻链接', '头条号', '头条号链接'])  # header row
    for headline, link, account in zip(title, s_url, source):
        # The account link is looked up by account name; an unknown account
        # leaves the cell empty rather than aborting the export.
        ws.append([headline, link, account, media_url.get(account)])

    wb.save(filename=filename)

def main(max_behot_time, title, source_url, s_url, source, media_url):
    """Poll the hot-news feed and collect de-duplicated articles.

    The feed is requested up to ten times. Every article whose title has not
    been seen yet is appended to the four parallel lists, so ``title``,
    ``source_url``, ``s_url`` and ``source`` always stay aligned by index.
    All containers are mutated in place; nothing is returned.

    Args:
        max_behot_time: Paging value (string) for the first request.
        title: List receiving article titles.
        source_url: List receiving article links as returned by the feed.
        s_url: List receiving absolute article links.
        source: List receiving the publishing account names.
        media_url: Dict receiving account name -> absolute account link.
    """
    seen_titles = set(title)  # constant-time duplicate check
    for _ in range(10):  # number of refreshes; more refreshes yield more data
        ascp = get_as_cp()
        feed_url = (
            start_url + max_behot_time
            + '&max_behot_time_tmp=' + max_behot_time
            + '&tadrequire=true&as=' + ascp['as']
            + '&cp=' + ascp['cp']
        )
        demo = getdata(feed_url, headers, cookies)

        # A response without 'data' is treated as an empty page.
        for item in demo.get('data') or []:
            headline = item.get('title')
            if not headline:
                continue  # entry without a title cannot be recorded
            account = item.get('source', '')

            if headline not in seen_titles:
                seen_titles.add(headline)
                link = item.get('source_url') or ''
                # Relative paths get the site root; absolute links are kept.
                if link.startswith(('http://', 'https://')):
                    full_link = link
                else:
                    full_link = url + link
                title.append(headline)
                source_url.append(link)
                s_url.append(full_link)
                source.append(account)
                print('标题:', headline)
                print('新闻链接:', full_link)
                print('头条号:', account)

            if account not in media_url:
                media_path = item.get('media_url')
                media_url[account] = url + media_path if media_path else ''

        print(len(title))  # number of articles collected so far

        # The next page is addressed by the max_behot_time the server returns.
        next_info = demo.get('next') or {}
        if 'max_behot_time' not in next_info:
            print('未获取到下一页参数,停止抓取')
            break
        max_behot_time = str(next_info['max_behot_time'])

def sendmail(mail_host, mail_user, mail_pass, sender, receivers,
             port=25, attachment='toutiao.xlsx'):
    """Mail the exported workbook as an attachment.

    Args:
        mail_host: SMTP server host name.
        mail_user: Login user name.
        mail_pass: Login password.
        sender: Envelope and header sender address.
        receivers: A recipient address or a list of recipient addresses.
        port: SMTP port. Defaults to 25 with a plain ``smtplib.SMTP``
            connection; no TLS is negotiated here.
            NOTE(review): port 465 normally needs ``smtplib.SMTP_SSL``.
        attachment: Path of the file to attach.

    Raises:
        OSError: If the attachment file cannot be read.

    Delivery failures are reported on stdout and are not raised.
    """
    from email.mime.application import MIMEApplication
    from email.utils import formataddr

    to_addrs = [receivers] if isinstance(receivers, str) else list(receivers)

    # Multipart container: text body plus one attachment.
    message = MIMEMultipart()
    # Headers carry a display name and a real address.
    message['From'] = formataddr(("每日今日头条文章爬取", sender))
    message['To'] = ', '.join(
        formataddr(("不知道是谁", addr)) for addr in to_addrs
    )
    message['Subject'] = Header('今天的头条新闻都在这里了', 'utf-8')

    # Mail body.
    message.attach(MIMEText('今天的新闻,请查收', 'plain', 'utf-8'))

    # Binary attachment; the file handle is closed as soon as it is read.
    with open(attachment, 'rb') as fh:
        part = MIMEApplication(fh.read(), 'octet-stream')
    # The filename given here is the name shown in the received mail.
    part.add_header('Content-Disposition', 'attachment',
                    filename=os.path.basename(attachment))
    message.attach(part)

    try:
        # The context manager sends QUIT and closes the socket on exit.
        with smtplib.SMTP(mail_host, port) as smtp:
            smtp.login(mail_user, mail_pass)
            smtp.sendmail(sender, to_addrs, message.as_string())
    except (smtplib.SMTPException, OSError) as exc:
        print("Error: 无法发送邮件", exc)
    else:
        print("邮件发送成功")
 
if __name__ == '__main__':
    # Pipeline: crawl the feed, export the results to xlsx, mail the workbook.
    main(max_behot_time, title, source_url, s_url, source, media_url)
    savedata(title, s_url, source, media_url)
    sendmail(mail_host, mail_user, mail_pass, sender, receivers)
    # exit() is a helper injected by the site module for interactive use and
    # is not guaranteed to exist; raise SystemExit directly instead.
    raise SystemExit(0)
代码均采用结构化程序设计,简单易读,修改方便,只需要修改参数部分即可。
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值