最近常常在36kr网站的快讯及资讯/最新里查看自己感兴趣内容的及时信息,由于快讯及资讯/最新里信息更新得比较及时快速,自己也很难一直盯着看,故想着要是写个脚本让其自动在后台挂着,每隔5分钟查询一次,有新内容的话就写入txt档中并在控制台打印出来,这样自己有空时看一眼即可,感觉会方便不少,就是玩玩. 脚本如下,供参考:
#!/usr/bin/env python3
#-*- coding:utf-8 -*-
import requests
from lxml import etree
from time import sleep
import time
import os
from retrying import retry
'''
python 3.6.5
lxml 4.3.3
requests 2.21.0
windows10
'''
class Check36kr:
    """Poll 36kr's newsflash (快讯) and latest-news (资讯/最新) pages,
    pick out items whose title contains a configured keyword, then print
    each new hit and append it to a text file on the user's desktop.
    """

    def __init__(self):
        # Per-fetch scratch storage for the newsflash page.
        self.shortnews_hrefList = list()
        self.shortnews_titleList = list()
        self.shortnews_focusInfoDict = dict()
        # Per-fetch scratch storage for the latest-news page.
        self.info_hrefList = list()
        self.info_titleList = list()
        self.info_focusInfoDict = dict()
        # Titles already reported since this process started (in-memory dedupe).
        self.oldinfoList = list()
        # Keywords of interest; customize freely.
        self.keywords = ('微信', '微博', 'QQ', '腾讯', '阿里', '百度', '多闪', '视频', '优酷', '爱奇艺', 'AI', '识别')
        # Network targets and request headers.
        self.shortnews_url = 'https://36kr.com/newsflashes'  # 快讯
        self.info_url = 'https://36kr.com/information/web_news'  # 资讯/最新
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3833.0 Safari/537.36'}
        # Output file on the current user's desktop; compute the path once
        # instead of rebuilding the same string in four places.
        self.save_path = "C:\\Users\\" + os.getlogin() + "\\Desktop\\新资讯收集.txt"
        # Create the collection file with a header row on first ever run.
        # (was: `os.path.exists(...) == False`)
        if not os.path.exists(self.save_path):
            with open(self.save_path, 'a', encoding='utf-8') as f:
                f.write('Title' + ' ' * 65 + 'Link' + ' ' * 36 + 'Time\n')

    def getShortNewsPageContent(self):
        """Fetch the newsflash page and return its items as {title: href}."""
        self.shortnews_hrefList.clear()
        self.shortnews_titleList.clear()
        self.shortnews_focusInfoDict.clear()
        # timeout so a stalled connection cannot hang the polling loop forever
        r = requests.get(self.shortnews_url, headers=self.headers, timeout=10)
        html = etree.HTML(r.text)
        self.shortnews_titleList = html.xpath('//div[1]/div[2]/div/div[1]/div/div[2]/div/a/text()')
        self.shortnews_hrefList = html.xpath('//div[1]/div[2]/div/div[1]/div/div[2]/div/a/@href')
        # zip pairs titles with hrefs and stops at the shorter sequence —
        # identical to the original map(lambda x, y: [x, y], ...) trick.
        return dict(zip(self.shortnews_titleList, self.shortnews_hrefList))

    def getInfoPageContent(self):
        """Fetch the latest-news page and return its items as {title: href}."""
        self.info_hrefList.clear()
        self.info_titleList.clear()
        self.info_focusInfoDict.clear()
        r = requests.get(self.info_url, headers=self.headers, timeout=10)
        html = etree.HTML(r.text)
        self.info_titleList = html.xpath('//div/div[1]/div/div[2]/div/div[2]/div[2]/p/a/text()')
        self.info_hrefList = html.xpath('//div/div[1]/div/div[2]/div/div[2]/div[2]/p/a/@href')
        # Strip a leading "column|" prefix from titles (e.g. "氪星晚报 | ...").
        for i, title in enumerate(self.info_titleList):
            if '|' in title:
                self.info_titleList[i] = title.split('|')[1].strip()
        return dict(zip(self.info_titleList, self.info_hrefList))

    def isExistsOfKeyword(self, keywords):
        """Collect not-yet-seen items whose title contains any keyword.

        Fills self.info_focusInfoDict / self.shortnews_focusInfoDict with
        {title: absolute_url} and returns True when at least one page
        yielded a match.
        """
        # 1. latest-news (资讯/最新) page — the original comment here was
        #    mislabeled "快讯"; the code calls getInfoPageContent().
        for title, href in self.getInfoPageContent().items():
            for keyword in keywords:
                if keyword in title and title not in self.oldinfoList:
                    self.oldinfoList.append(title)
                    self.info_focusInfoDict.update({title: 'https://36kr.com' + href})
                    break  # one keyword hit per title is enough
        # 2. newsflash (快讯) page — original comment was likewise swapped.
        for title, href in self.getShortNewsPageContent().items():
            for keyword in keywords:
                if keyword in title and title not in self.oldinfoList:
                    self.oldinfoList.append(title)
                    self.shortnews_focusInfoDict.update({title: 'https://36kr.com' + href})
                    break
        return len(self.shortnews_focusInfoDict) > 0 or len(self.info_focusInfoDict) > 0

    # Bounded retries with a pause between attempts. The original bare
    # @retry retried forever with no delay on ANY exception, hammering the
    # site on a persistent network failure.
    @retry(stop_max_attempt_number=3, wait_fixed=5000)
    def printInfo(self, isFirst=False):
        """Fetch both pages once; print and save every new matching item.

        On the first call (isFirst=True) titles already present in the
        collection file are skipped, so re-running the script does not
        duplicate entries that were saved by a previous run.
        """
        savedTitles = self._loadSavedTitles() if isFirst else []
        if not self.isExistsOfKeyword(self.keywords):
            return
        # Same order as the original: newsflash hits first, then latest-news.
        for focusDict in (self.shortnews_focusInfoDict, self.info_focusInfoDict):
            for title, href in focusDict.items():
                if isFirst and title in savedTitles:
                    print('已存在', title)
                else:
                    self.savedata(title, href)
                    print(title, time.strftime(' %Y-%m-%d %H:%M:%S', time.localtime(time.time())))

    def _loadSavedTitles(self):
        """Return the titles already stored in the collection file (header skipped)."""
        with open(self.save_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()[1:]
        # Each stored row is "<padded title><padded link><time>"; the title is
        # everything before the link, which always starts with "http".
        return [line.split('http')[0].strip() for line in lines]

    def savedata(self, title, href):
        """Append one item to the collection file as a fixed-width row.

        Title and href are space-padded so the columns line up when the file
        is viewed in a fixed-width editor (e.g. Notepad); Chinese characters
        and fullwidth punctuation count as two display columns.
        """
        def is_Chinese(word):
            # True when every char of `word` is a CJK unified ideograph.
            chinese_count = 0
            for ch in word:
                if '\u4e00' <= ch <= '\u9fff':
                    chinese_count += 1
            return chinese_count == len(word)

        # Fullwidth punctuation: double width, same as Chinese characters.
        specific_char = ('(', ')', '、', ':', ',', '!', '《', '》', '?')

        def display_width(text):
            # Halfwidth chars count 1 column; everything else counts 2.
            narrow = 0
            for single in text:
                if not is_Chinese(single) and single not in specific_char:
                    narrow += 1
            return narrow + (len(text) - narrow) * 2

        # Pad the title to 70 display columns and the href to 40
        # (the original comment incorrectly said 60).
        title_length = display_width(title)
        href_length = display_width(href)
        if title_length < 70:
            title = title + ' ' * (70 - title_length)
        if href_length < 40:
            href = href + ' ' * (40 - href_length)
        with open(self.save_path, 'a', encoding='utf-8') as f:
            release_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            f.write(title)
            f.write(href)
            f.write(release_time + '\n')

    def run(self):
        """Poll forever: the first pass dedupes against the file, then every 5 min."""
        iteration = 0  # the original used `sum`, shadowing the builtin
        while True:
            self.printInfo(iteration == 0)
            iteration += 1
            sleep(300)
if __name__ == '__main__':
    # Entry point: build the checker and start the endless polling loop.
    check36kr = Check36kr()
    check36kr.run()