Results
Code
import time
from WebWorm.RandomHeader import *
import requests
from bs4 import BeautifulSoup


# Scrape Weibo's real-time hot topics
def weiBo():
    # Weibo hot-search URL
    url = 'https://s.weibo.com/top/summary'
    # Random User-Agent header
    headers = {"User-Agent": getRandomHeader()}
    # Request the page
    response = requests.get(url, headers=headers)
    # Set the encoding
    response.encoding = 'utf-8'
    # Get the page content
    html = response.text
    # Parse the HTML
    info = BeautifulSoup(html, "html.parser")
    # Grab the <a> tags under <tbody>
    a = info.find('tbody').findAll('a')
    # Hold all of the headlines
    news = []
    # Iterate over the <a> tags
    for i in a:
        # Append each headline to the list
        news.append(i.text)
    # Get the current time for the filename
    date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    # Replace spaces and colons so the name is also valid on Windows
    date = date.replace(' ', '-').replace(':', '-')
    # Write the headlines to a file
    with open(date + '-WeiBoNews.txt', mode='w', encoding='utf-8') as file:
        for i in news:
            file.write(i + '\n')
# Scrape Baidu's real-time hot topics
def baiDu():
    # Baidu hot-search URL
    url = 'http://top.baidu.com/buzz?b=1&fr=topindex'
    # Random User-Agent header
    headers = {"User-Agent": getRandomHeader()}
    # Request the page
    response = requests.get(url, headers=headers)
    # Set the encoding (the Baidu page is GBK-encoded)
    response.encoding = 'gbk'
    # Get the page content
    html = response.text
    # Parse the HTML
    info = BeautifulSoup(html, "html.parser")
    # Grab the <a> tags with class "list-title"
    a = info.findAll(name="a", attrs={"class": "list-title"})
    # Hold all of the headlines
    news = []
    # Iterate over the <a> tags
    for i in a:
        # Append each headline to the list
        news.append(i.text)
    # Get the current time for the filename
    date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    # Replace spaces and colons so the name is also valid on Windows
    date = date.replace(' ', '-').replace(':', '-')
    # Write the headlines to a file
    with open(date + '-BaiDuNews.txt', mode='w', encoding='utf-8') as file:
        for i in news:
            file.write(i + '\n')
if __name__ == '__main__':
    # Scrape Weibo's real-time hot topics
    weiBo()
    # Scrape Baidu's real-time hot topics
    baiDu()
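The getRandomHeader() call comes from the author's own WebWorm.RandomHeader helper module, which is not shown in this post. If you do not have that module, a minimal stand-in that simply returns a random User-Agent string works; the function body and the User-Agent strings below are assumptions for illustration, not the original implementation:

import random

# Hypothetical stand-in for WebWorm.RandomHeader.getRandomHeader():
# return one User-Agent string at random so repeated requests look less uniform.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.1 Safari/605.1.15",
    "Mozilla/5.0 (X11; Linux x86_64; rv:95.0) Gecko/20100101 Firefox/95.0",
]

def getRandomHeader():
    # Pick one User-Agent string at random
    return random.choice(USER_AGENTS)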
Summary
You can run the scraper on a schedule and email the results to yourself: two minutes of reading a day is enough to keep up with what is happening.
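As a sketch of that idea, the two scrapers above could be wrapped in a daily loop that mails the freshly written files to yourself with smtplib. The addresses, SMTP server, and password below are placeholders, and the loop assumes the weiBo() and baiDu() functions defined earlier; adjust them to your own mail provider:

import glob
import time
import smtplib
from email.mime.text import MIMEText
from email.header import Header

def sendMail(subject, body):
    # Placeholder addresses, SMTP server, and password; replace with your own
    msg = MIMEText(body, 'plain', 'utf-8')
    msg['Subject'] = Header(subject, 'utf-8')
    msg['From'] = 'me@example.com'
    msg['To'] = 'me@example.com'
    with smtplib.SMTP_SSL('smtp.example.com', 465) as server:
        server.login('me@example.com', 'app-password')
        server.sendmail('me@example.com', ['me@example.com'], msg.as_string())

if __name__ == '__main__':
    while True:
        # Scrape both sources; each call writes a timestamped .txt file
        weiBo()
        baiDu()
        # Collect the newest Weibo and Baidu files and mail their contents
        parts = []
        for pattern in ('*-WeiBoNews.txt', '*-BaiDuNews.txt'):
            # Filenames start with the timestamp, so the lexicographic max is the latest
            latest = max(glob.glob(pattern))
            with open(latest, encoding='utf-8') as f:
                parts.append(latest + '\n' + f.read())
        sendMail('Daily hot-search digest', '\n\n'.join(parts))
        # Wait 24 hours before the next run
        time.sleep(24 * 60 * 60)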