Adding a scheduled-execution feature
Let's go straight to the code:
# -*- coding:UTF-8 -*-
import json
import random
import re
import threading
import time
from datetime import timedelta
import requests
from bs4 import BeautifulSoup
from loguru import logger
uid = "qq_17328759" # CSDN的ID
host = "https://blog.csdn.net"
headers = {
    'Accept'         : 'application/json, text/plain, */*',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
    'Referer'        : f'https://blog.csdn.net/{uid}',
    'Connection'     : 'keep-alive',
    'Sec-Fetch-Dest' : 'empty',
    'Sec-Fetch-Mode' : 'cors',
    'Sec-Fetch-Site' : 'same-origin'
}
user_agent = [
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 "
    "Safari/534.50",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; "
    ".NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 "
    "Safari/535.11",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR "
    "2.0.50727; SE 2.X MetaSr 1.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
]
def parseIPList(url = "https://www.kuaidaili.com/free/inha/"):
    """
    Fetch a list of free proxies
    :param url: free-proxy site to scrape  # alternatives: "https://www.beesproxy.com/free" "https://proxy.mimvp.com/freeopen"
    :return: list of [ip, port] pairs
    """
    def search_by_re(string):
        # kuaidaili embeds the proxy table as JSON in a JS constant
        IPs = []
        pattern = re.compile(r'const fpsList = (.*?);')
        re_result = pattern.search(string)
        if not re_result:
            return IPs
        ips = json.loads(re_result.groups()[0])
        for ip_info in ips:
            IPs.append([ip_info['ip'], ip_info['port']])
        return IPs

    def search_by_bs(string):
        # fallback: scrape the rendered HTML table via its data-title attributes
        IPs = []
        soup = BeautifulSoup(string, "html.parser")
        tds = soup.find_all("td")
        ip = ''
        port = ''
        for td in tds:
            if 'data-title' not in td.attrs:
                continue
            if 'IP' == td.attrs['data-title']:
                ip = td.text
            if "PORT" == td.attrs['data-title']:
                port = td.text
                IPs.append([ip, port])  # a row is complete once its PORT cell is seen
        return IPs

    url += f'{random.randint(1, 300)}/'  # pick a random results page
    headers_proxy = headers.copy()
    headers_proxy.update({ "User-Agent": random.choice(user_agent) })
    del headers_proxy['Referer']  # don't send the CSDN Referer to the proxy site
    response = requests.get(url, headers=headers_proxy, timeout=10)
    IPs = []
    IPs.extend(search_by_bs(response.text))
    IPs.extend(search_by_re(response.text))
    return IPs
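As an aside, parseIPList can be smoke-tested on its own. A minimal sketch (hypothetical usage; free-proxy pages change their markup often, so an empty list just means neither parser matched anything):

proxies = parseIPList()
print(len(proxies), proxies[:3])  # e.g. [['1.2.3.4', '8080'], ...]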
def articleId(uid):
    """
    Scrape the user's blog list from the profile page
    :param uid: CSDN user ID
    :return: list of article URLs
    """
    articleIds = []
    articleList = host + "/" + uid
    headers.update({ "User-Agent": random.choice(user_agent) })
    response = requests.get(articleList, headers=headers).content
    soup = BeautifulSoup(response, "html.parser")
    articles = soup.find_all("article", attrs={ "class": "blog-list-box" })
    for article in articles:
        art_info = article.find_all('a', attrs={ "target": "_blank", 'href': True })
        # print(art_info[0].attrs['href'])
        articleIds.append(art_info[0].attrs['href'])
    return articleIds
def articleIdByApi(uid, page = 1, size = 20):
    """
    Fetch the user's blog list via the community API
    :param uid: CSDN user ID
    :return: list of article URLs
    """
    blogListUrl = f'{host}/community/home-api/v1/get-business-list'
    params = {
        "page"        : page,
        "size"        : size,
        "businessType": "lately",
        "orderby"     : "",
        "noMore"      : False,
        "year"        : "",
        "month"       : "",
        "username"    : uid
    }
    articleIds = []
    headers.update({ "User-Agent": random.choice(user_agent) })
    currentPage = page
    while True:
        try:
            response = requests.get(blogListUrl, params=params, headers=headers).json()
            articleList = response.get("data", { }).get('list', [])
            logger.debug(f"page {currentPage}: fetched {len(articleList)} posts")
            for article_info in articleList:
                articleIds.append(article_info.get('url'))
            # stop on a short page, or after 30 pages as a safety cap
            if len(articleList) < size or currentPage > 30:
                break
            currentPage += 1
            params.update({ 'page': currentPage })
        except Exception as e:
            logger.error(f'failed to fetch blog list for {uid}: {e}')
            break
    logger.debug(f"{uid} has {len(articleIds)} posts")
    return articleIds
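For reference, a quick hypothetical check of the API-based listing (the API route tends to be more stable than scraping the profile page, whose markup changes frequently):

ids = articleIdByApi(uid)
for u in ids[:5]:
    print(u)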
def PV(IPs, uid, codes):
    s = requests.Session()
    count = 0
    url = host + "/{}/article/details/{}"
    while True:  # note: loops indefinitely; the caller decides when to stop
        count += 1
        logger.info("visit round {}".format(count))
        proxie = random.choice(IPs)
        ua = random.choice(user_agent)
        logger.debug("{} -- {}".format(proxie, ua))
        # register the proxy for both schemes; blog.csdn.net is served over https
        proxy_url = "http://{}:{}".format(proxie[0], proxie[1])
        s.proxies = { "http": proxy_url, "https": proxy_url }
        headers.update({ "User-Agent": ua })
        s.headers = headers  # dict.update() returns None, so assign the dict itself
        for code in codes:
            articleUrl = code if 'http' in code else url.format(uid, code)
            try:
                html = s.get(articleUrl, timeout=10).text
            except requests.RequestException:
                html = ''
            if not html:
                # this proxy looks dead; switch to a different one
                proxie = random.choice(IPs)
                proxy_url = "http://{}:{}".format(proxie[0], proxie[1])
                s.proxies = { "http": proxy_url, "https": proxy_url }
                continue
            soup = BeautifulSoup(html, "html.parser")
            spans = soup.find_all(name="span", attrs={ "class": "read-count" })
            if spans:
                logger.debug(f"{code} current read count: {spans[0].text}")
            time.sleep(random.randint(1, 35))
class addReadNum(threading.Thread):
    def __init__(self, IPs, uid, articleIds):
        threading.Thread.__init__(self)
        self.IPs = IPs
        self.uid = uid
        self.articleIds = articleIds

    def run(self):
        PV(self.IPs, self.uid, self.articleIds)
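As before, addReadNum lets several PV loops run in parallel. A hypothetical usage sketch (IPs and articleIds are assumed to have been fetched already):

threads = [addReadNum(IPs, uid, articleIds) for _ in range(3)]
for t in threads:
    t.start()  # PV loops forever, so these threads run until the process exits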
def demo_schedule():
    articleIds = articleIdByApi(uid)
    IPs = parseIPList()
    PV(IPs, uid, articleIds)

import schedule

# fire at the top of every hour; note that until() only sets a deadline 3 minutes
# from now, after which the job is descheduled; it does not stop a run in progress
schedule.every().hours.at(':00').do(demo_schedule).until(timedelta(minutes=3))
schedule.run_all()
while True:
    schedule.run_pending()  # run whatever jobs are due
    time.sleep(30)
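One caveat: PV loops forever, so demo_schedule never returns and schedule.run_all() blocks on its first call, never reaching the run_pending() loop. A minimal sketch of one workaround (my own addition, not part of the original script) is to hand each scheduled run to a daemon thread:

def demo_schedule_async():
    # a daemon thread dies with the main process, so the scheduler loop stays responsive
    threading.Thread(target=demo_schedule, daemon=True).start()

schedule.every().hours.at(':00').do(demo_schedule_async)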