1.定位到热搜在哪块部分
发现热搜标题都放在 class 为 "keyword" 的 td 标签下的第一个 a 标签里面
2.爬取热搜榜
from lxml import etree
import requests
if __name__ == "__main__":
    # First attempt: download the Baidu hot-search board and print the titles.
    url = 'http://top.baidu.com/buzz?b=1&fr=topindex'
    headers = {"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36 Edg/88.0.705.74"}
    # A timeout keeps the script from blocking forever on a stalled connection,
    # and raise_for_status() stops us from parsing an HTTP error page.
    response = requests.get(url=url, headers=headers, timeout=10)
    response.raise_for_status()
    # NOTE: the encoding is deliberately left to requests' own guess in this
    # first attempt; the tutorial text reports that the titles then print as
    # garbled characters.
    page_text = response.text
    # 1. Parse the HTML text into an lxml element tree.
    tree = etree.HTML(page_text)
    # 2. Second-level lookup: text of the first <a> in each <td class="keyword">.
    keywords = tree.xpath('//td[@class="keyword"]/a[1]/text()')
    print(keywords)
发现打印出来是乱码,反过来查询获取的page_text
发现原来的page_text本身就是乱码的,需要给获取到的response对象设置正确的编码(该页面是gbk编码),并且必须在读取.text之前设置:
# Keep the Response object under its own name so the encoding can be set
# before the body is decoded; the timeout avoids hanging on a stalled
# connection and raise_for_status() rejects HTTP error pages.
response = requests.get(url=url, headers=headers, timeout=10)
response.raise_for_status()
# The encoding must be assigned before .text is read, because .text decodes
# the raw bytes with whatever encoding is set at that moment.
response.encoding = 'gbk'
page_text = response.text
这里所获取的page_text中的中文已经转码过来了,中文可以正常显示了
# -*- coding: utf-8 -*-
"""
Created on Fri Apr 23 15:24:00 2021
@author: zhongxi
"""
from lxml import etree
import requests
if __name__ == "__main__":
    # Download the Baidu hot-search board, decode it as GBK and print the titles.
    url = 'http://top.baidu.com/buzz?b=1&fr=topindex'
    headers = {"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36 Edg/88.0.705.74"}
    # A timeout keeps the script from blocking forever on a stalled connection,
    # and raise_for_status() stops us from parsing an HTTP error page.
    response = requests.get(url=url, headers=headers, timeout=10)
    response.raise_for_status()
    # Set the encoding before reading .text so the Chinese titles decode correctly.
    response.encoding = 'gbk'
    page_text = response.text
    # 1. Parse the HTML text into an lxml element tree.
    tree = etree.HTML(page_text)
    # 2. Second-level lookup: text of the first <a> in each <td class="keyword">.
    keywords = tree.xpath('//td[@class="keyword"]/a[1]/text()')
    print(keywords)
热搜榜就爬取到了!!!
3 永久化存储
# 2. Select the first <a> element inside every <td class="keyword"> cell.
hot_news = tree.xpath('//td[@class="keyword"]/a[1]')
# print(len(hot_news))

# 3. Extract the title and the link of every hot-search entry.
rows = []
for item in hot_news:
    title = item.text
    print(title)
    # .get() returns None for an anchor without href instead of raising
    # IndexError the way xpath('./@href')[0] would.
    url_news = item.get('href')
    print(url_news)
    rows.append((title, url_news))

# Build the frame in one step. Writing cell by cell through
# df.loc[row]["col"] = value is chained assignment: pandas may hand back a
# temporary copy of the row, in which case the write never reaches df.
# The index starts at 1 so it doubles as the ranking position.
df = pd.DataFrame(
    rows,
    index=pd.RangeIndex(1, len(rows) + 1),
    columns=["hot-news", "url"],
)
print(df)
这样存储到了一个dataframe里面,可以存储至excel表格,也可以发送至个人邮箱
存储于本地excel表格中:
# 4. Build a timestamp for the output file name. Dots separate the time
#    fields because the Excel file name must not contain colons.
#    (strftime formats the current local time when no time tuple is given.)
time_now = time.strftime("%Y-%m-%d %H.%M.%S")
print(time_now)
# Write the hot-search table to a timestamped workbook on drive D:.
save_name = f"D:/百度热搜榜_{time_now}爬取.xlsx"
df.to_excel(save_name)
以email发送到指定邮箱,这里以163邮箱作为发件邮箱,接收邮箱可以是任意邮箱
前提是网易邮箱的POP3/SMTP服务必须开启,然后会得到一串授权码,用这个授权码取代之前的邮箱密码,填入password处,就可以了
# email
# Local import keeps this snippet self-contained; escape() is only used here.
from html import escape

# Build the HTML body first: one <p> per hot-search entry.
# A single timestamp is taken for the whole batch so every row reports the
# same crawl time.
crawled_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
paragraphs = [
    # Title and link come from a scraped page, so they are HTML-escaped before
    # being embedded; str() mirrors the original %s formatting for cells that
    # are not strings (e.g. a missing value).
    '<p>%d:<a href="%s">热搜标题title:%s </a>爬取时间:%s</p>'
    % (rank, escape(str(link)), escape(str(title)), crawled_time)
    for rank, (title, link) in enumerate(zip(df['hot-news'], df['url']), start=1)
]
# Join once instead of growing the string with += on every iteration.
data_email = ''.join(paragraphs)
print('邮件发送的总字符数目:',len(data_email))

# Mail delivery parameters.
smtpserver = 'smtp.163.com'  # outgoing mail server
port = 0  # 0 lets smtplib fall back to its default SMTP port
sender = '*********@163.com'  # sender address
psw = '********'  # the mailbox authorization code, used in place of the password
receiver = "*********@163.com"  # recipient address

# Compose the message: subject plus an HTML body, UTF-8 encoded.
subject = '这是一封来自钟曦的电脑,以python发送的测试邮件'
body = data_email
msg = MIMEText(body, 'html', 'utf-8')
msg['from'] = sender
msg['to'] = receiver
msg['subject'] = subject

# Send the message. The context manager closes the session even when login or
# delivery raises, and the timeout avoids hanging on an unreachable server.
# NOTE(review): this session is not encrypted; consider smtplib.SMTP_SSL or
# starttls() if the provider supports it - confirm ports with the provider.
with smtplib.SMTP(smtpserver, port, timeout=30) as smtp:
    smtp.login(sender, psw)
    smtp.sendmail(sender, receiver, msg.as_string())
4 从爬取到发送完整代码
# -*- coding: utf-8 -*-
"""
Created on Fri Apr 23 15:24:00 2021
@author: zhongxi
"""
from lxml import etree
import requests
import pandas as pd
import time
import smtplib
from email.mime.text import MIMEText
if __name__ == "__main__":
    # End-to-end run: crawl the Baidu hot-search board, collect title/link
    # pairs in a DataFrame and mail them as an HTML list.

    # Local import: escape() is only needed for the e-mail body built below.
    from html import escape

    url = 'http://top.baidu.com/buzz?b=1&fr=topindex'
    headers = {"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36 Edg/88.0.705.74"}
    # A timeout keeps the script from blocking forever on a stalled connection,
    # and raise_for_status() stops us from parsing an HTTP error page.
    response = requests.get(url=url, headers=headers, timeout=10)
    response.raise_for_status()
    # Set the encoding before reading .text so the Chinese titles decode correctly.
    response.encoding = 'gbk'
    page_text = response.text

    # 1. Parse the HTML text into an lxml element tree.
    tree = etree.HTML(page_text)
    # 2. Select the first <a> element inside every <td class="keyword"> cell.
    hot_news = tree.xpath('//td[@class="keyword"]/a[1]')
    # 3. Extract (title, link) per entry. .get() returns None for an anchor
    #    without href instead of raising IndexError.
    rows = [(item.text, item.get('href')) for item in hot_news]
    # Build the frame in one step. Writing cell by cell through
    # df.loc[row]["col"] = value is chained assignment: pandas may hand back
    # a temporary copy of the row, in which case the write never reaches df.
    # The index starts at 1 so it doubles as the ranking position.
    df = pd.DataFrame(
        rows,
        index=pd.RangeIndex(1, len(rows) + 1),
        columns=["hot-news", "url"],
    )
    print("当日爬取热搜条数:",len(df))

    # 4. Optional: also store the table in a timestamped Excel workbook.
    # time_now = time.strftime("%Y-%m-%d %H.%M.%S", time.localtime())
    # print(time_now)
    # save_name = "D:/百度热搜榜_" + str(time_now) + "爬取.xlsx"
    # df.to_excel(save_name)

    # Build the HTML e-mail body: one <p> per hot-search entry.
    # A single timestamp is taken for the whole batch so every row reports
    # the same crawl time.
    crawled_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    paragraphs = [
        # Title and link come from a scraped page, so they are HTML-escaped
        # before being embedded; str() mirrors the original %s formatting for
        # cells that are not strings (e.g. a missing href).
        '<p>%d:<a href="%s">热搜标题title:%s </a>爬取时间:%s</p>'
        % (rank, escape(str(link)), escape(str(title)), crawled_time)
        for rank, (title, link) in enumerate(zip(df['hot-news'], df['url']), start=1)
    ]
    # Join once instead of growing the string with += on every iteration.
    data_email = ''.join(paragraphs)
    print('邮件发送的总字符数目:',len(data_email))

    # Mail delivery parameters.
    smtpserver = 'smtp.163.com'  # outgoing mail server
    port = 0  # 0 lets smtplib fall back to its default SMTP port
    sender = '*******@163.com'  # sender address
    psw = '**********'  # the mailbox authorization code, used in place of the password
    receiver = "***********8@163.com"  # recipient address

    # Compose the message: subject plus an HTML body, UTF-8 encoded.
    subject = '这是一封来自钟曦的电脑,以python发送的测试邮件'
    body = data_email
    msg = MIMEText(body, 'html', 'utf-8')
    msg['from'] = sender
    msg['to'] = receiver
    msg['subject'] = subject

    # Send the message. The context manager closes the session even when
    # login or delivery raises, and the timeout avoids hanging on an
    # unreachable server.
    # NOTE(review): this session is not encrypted; consider smtplib.SMTP_SSL
    # or starttls() if the provider supports it - confirm ports with the provider.
    with smtplib.SMTP(smtpserver, port, timeout=30) as smtp:
        smtp.login(sender, psw)
        smtp.sendmail(sender, receiver, msg.as_string())
这里也可以发送带附件的邮件,或者同时发送给多个收件人,参考这篇博客:
python操作发送邮件