Using Python 3 to automatically find content you are interested in on 36kr

Lately I have been checking the 快讯 (newsflash) and 资讯/最新 (Information/Latest) sections of 36kr for timely updates on topics I care about. Both sections refresh quickly and I cannot keep watching them all day, so I wrote a script that runs in the background, queries both pages every 5 minutes, writes any matching items to a txt file, and prints them to the console. That way I only need to glance at the results when I have a spare moment. It is just for fun; the script is below for reference:
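
The script relies on requests, lxml, and retrying (the exact versions I used are listed in the docstring at the top of the script). If they are not already installed, pip install requests lxml retrying should pull them in.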

 

#!/usr/bin/env python3
#-*- coding:utf-8 -*-

import requests
from lxml import etree
from time import sleep
import time
import os
from retrying import retry
   
'''
python           3.6.5
lxml             4.3.3
requests         2.21.0
windows10
'''

class Check36kr:
    def __init__(self):
        # Containers for items from the newsflash (快讯) page
        self.shortnews_hrefList       = list()
        self.shortnews_titleList      = list()
        self.shortnews_focusInfoDict  = dict()
        # Containers for items from the Information/Latest (资讯/最新) page
        self.info_hrefList            = list()
        self.info_titleList           = list()
        self.info_focusInfoDict       = dict()
        # Titles already seen since the script started, used to avoid duplicates
        self.oldinfoList              = list()
        # Keywords for the content you are interested in; customize as needed
        self.keywords       = ('微信', '微博', 'QQ', '腾讯', '阿里', '百度', '多闪', '视频', '优酷', '爱奇艺', 'AI', '识别')
        
        # Network-related settings
        self.shortnews_url = 'https://36kr.com/newsflashes'        # newsflashes (快讯)
        self.info_url = 'https://36kr.com/information/web_news'    # Information/Latest (资讯/最新)
        self.headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3833.0 Safari/537.36'}
        
        # Path of the collection txt on the desktop; create it with a header row if it does
        # not exist yet (the header widths, 70/40 columns, match the padding in savedata)
        self.txt_path = "C:\\Users\\" + os.getlogin() + "\\Desktop\\新资讯收集.txt"
        if not os.path.exists(self.txt_path):
            with open(self.txt_path, 'a') as f:
                f.write('Title' + ' '*65 + 'Link' + ' '*36 + 'Time\n')
    
    # Fetch the newsflash page and return a {title: href} dict
    def getShortNewsPageContent(self):
        self.shortnews_hrefList.clear() 
        self.shortnews_titleList.clear()
        self.shortnews_focusInfoDict.clear()

        r    = requests.get(self.shortnews_url, headers = self.headers)
        html = etree.HTML(r.text)
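        # Note: the XPath expressions below mirror the newsflash page layout at the time
        # of writing; if 36kr changes its markup they will need to be updated.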
        self.shortnews_titleList   = html.xpath('//div[1]/div[2]/div/div[1]/div/div[2]/div/a/text()')
        self.shortnews_hrefList    = html.xpath('//div[1]/div[2]/div/div[1]/div/div[2]/div/a/@href')
        return dict(zip(self.shortnews_titleList, self.shortnews_hrefList))
      
    # Fetch the Information/Latest page and return a {title: href} dict
    def getInfoPageContent(self):
        self.info_hrefList.clear()
        self.info_titleList.clear()
        self.info_focusInfoDict.clear()
        
        r    = requests.get(self.info_url, headers = self.headers)
        html = etree.HTML(r.text)          
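        # Same caveat as above: this XPath is tied to the current layout of the
        # Information/Latest page.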
        self.info_titleList = html.xpath('//div/div[1]/div/div[2]/div/div[2]/div[2]/p/a/text()')
        self.info_hrefList  = html.xpath('//div/div[1]/div/div[2]/div/div[2]/div[2]/p/a/@href')
        # Clean up the titles: if a title contains '|', keep only the part after it
        for i in range(len(self.info_titleList)):
            if self.info_titleList[i].find('|') != -1:
                self.info_titleList[i] = self.info_titleList[i].split('|')[1].strip()
        return dict(zip(self.info_titleList, self.info_hrefList))
    
    def isExistsOfKeyword(self, keywords):
        # 1. Handle the Information/Latest (资讯/最新) items
        for title, href in self.getInfoPageContent().items():
            for keyword in keywords:
                if keyword in title:
                    if title not in self.oldinfoList:
                        self.oldinfoList.append(title)
                        self.info_focusInfoDict.update({title: 'https://36kr.com' + href})
                        
        # 2. Handle the newsflash (快讯) items
        for title, href in self.getShortNewsPageContent().items():
            for keyword in keywords:
                if keyword in title:
                    if title not in self.oldinfoList:
                        self.oldinfoList.append(title)
                        self.shortnews_focusInfoDict.update({title: 'https://36kr.com' + href})
        return bool(self.shortnews_focusInfoDict) or bool(self.info_focusInfoDict)
            
    @retry
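    # retrying's bare @retry re-invokes printInfo whenever it raises an exception
    # (e.g. a transient network error), retrying indefinitely by default.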
    def printInfo(self, isFirst = False): 
        # Read the titles already saved in the txt so that, after a restart of the script, items already in the file are not written again
        fileList = list()
        if isFirst:
            path = "C:\\Users\\" + os.getlogin() + "\\Desktop\\新资讯收集.txt"
            with open(path, 'r') as f:
                fileList = f.readlines()
            fileList = fileList[1:]
            for i in range(len(fileList)):
                fileList[i] = fileList[i].split('http')[0].strip() 
            
        if self.isExistsOfKeyword(self.keywords):
            if len(self.shortnews_focusInfoDict) > 0:
                for title, href in self.shortnews_focusInfoDict.items():
                    if isFirst:
                        if title not in fileList:
                            self.savedata(title, href)
                            print(title, time.strftime('  %Y-%m-%d %H:%M:%S', time.localtime(time.time())))
                        else:
                            print('Already saved:', title)
                    else:
                        self.savedata(title, href)
                        print(title, time.strftime('  %Y-%m-%d %H:%M:%S', time.localtime(time.time())))
            if len(self.info_focusInfoDict) > 0:
                for title, href in self.info_focusInfoDict.items():
                    if isFirst:
                        if title not in fileList:
                            self.savedata(title, href)
                            print(title, time.strftime('  %Y-%m-%d %H:%M:%S', time.localtime(time.time())))
                        else:
                            print('Already saved:', title)
                    else:
                        self.savedata(title, href)
                        print(title, time.strftime('  %Y-%m-%d %H:%M:%S', time.localtime(time.time())))
                    
    def savedata(self, title, href):
        # Pad title to 70 and href to 40 display columns with spaces (CJK characters are
        # counted as 2 columns wide) so the columns line up in the txt file; open the
        # file with Notepad to see the alignment.
        def is_Chinese(word):
            # True only if every character lies in the CJK Unified Ideographs range
            chinese_count = 0
            for ch in word:
                if '\u4e00' <= ch <= '\u9fff':
                    chinese_count += 1
            return chinese_count == len(word)
            
        title_e_length = 0
        href_e_length  = 0
        title_length   = 0
        href_length    = 0
        
        specific_char = ('(', ')', '、', ':', ',', '!', '《', '》', '?')
        for single in title:
            if is_Chinese(single) == False and single not in specific_char:
                title_e_length += 1
        for single in href:
            if is_Chinese(single) == False and single not in specific_char:
                href_e_length += 1
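        # Assumed display widths: ASCII-width characters count as 1 column, CJK characters
        # and the full-width punctuation above count as 2 (roughly true in Notepad).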
        title_length = title_e_length*1 + (len(title) - title_e_length)*2
        href_length  = href_e_length*1 + (len(href) - href_e_length)*2
        
        if title_length < 70:
            title = title + ' '*(70-title_length)
            
        if href_length < 40:
            href = href + ' '*(40-href_length)
        
        # Append the record to the txt file
        with open(self.txt_path, 'a') as f:
            release_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            f.write(title)
            f.write(href)
            f.write(release_time + '\n')
            
    def run(self):
        count = 0
        while True:
            # Pass isFirst=True only on the very first round so that entries already
            # saved in the txt file are skipped
            self.printInfo(count == 0)
            count += 1
            sleep(300)    # poll every 5 minutes
                
if __name__ == '__main__':
    check36kr = Check36kr()
    check36kr.run()  
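
To use it, save the code as, say, check36kr.py (the filename is arbitrary), run it with python check36kr.py, and leave the console window open: the script polls both pages every 300 seconds, prints each new match, and appends it to 新资讯收集.txt on the desktop.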

 
