o(∩_∩)o Setting Up Proxies to Visit a Blog (Part 3) o(∩_∩)o


Adding scheduled execution

Let's jump straight to the implementation:

# -*- coding:UTF-8 -*-

import json
import random
import re
import threading
import time
from datetime import timedelta

import requests
from bs4 import BeautifulSoup
from loguru import logger

uid = "qq_17328759"  # CSDN user ID

host = "https://blog.csdn.net"
headers = {
    'Accept'         : 'application/json, text/plain, */*',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
    'Referer'        : f'https://blog.csdn.net/{uid}',
    'Connection'     : 'keep-alive',
    'Sec-Fetch-Dest' : 'empty',
    'Sec-Fetch-Mode' : 'cors',
    'Sec-Fetch-Site' : 'same-origin'
}

user_agent = [
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 "
    "Safari/534.50",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; "
    ".NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 "
    "Safari/535.11",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR "
    "2.0.50727; SE 2.X MetaSr 1.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
]


def parseIPList(url = "https://www.kuaidaili.com/free/inha/"):
    """
    Fetch a list of free proxy addresses.
    :param url: proxy-list site  # alternatives: "https://www.beesproxy.com/free"  "https://proxy.mimvp.com/freeopen"
    :return: list of [ip, port] pairs
    """

    def search_by_re(string):
        IPs = []
        pattern = re.compile(r'const fpsList = (.*?);')
        re_result = pattern.search(string)
        if not re_result:
            return IPs
        ips = json.loads(re_result.groups()[0])
        for ip_info in ips:
            IPs.append([ip_info['ip'], ip_info['port']])
        return IPs

    def search_by_bs(string):
        IPs = []
        soup = BeautifulSoup(string, "html.parser")
        tds = soup.find_all("td")
        ip = ''
        port = ''
        for td in tds:
            if 'data-title' not in td.attrs:
                continue
            if 'IP' == td.attrs['data-title']:
                ip = td.text
            if "PORT" == td.attrs['data-title']:
                port = td.text
                IPs.append([ip, port])
        return IPs

    url += f'{random.randint(1, 300)}/'  # jump to a random page of the free-proxy list
    headers_proxy = headers.copy()
    headers_proxy.update({ "User-Agent": random.choice(user_agent) })
    del headers_proxy['Referer']
    response = requests.get(url, headers=headers_proxy)
    IPs = []
    IPs.extend(search_by_bs(response.text))
    IPs.extend(search_by_re(response.text))
    return IPs
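

# Free proxies go stale quickly, so it can help to probe each address before
# using it. This checker is an optional sketch, not part of the original post;
# the probe target "http://www.baidu.com" is an arbitrary choice.
def checkIPs(IPs, timeout = 3):
    alive = []
    for ip, port in IPs:
        proxy_url = f"http://{ip}:{port}"
        try:
            requests.get("http://www.baidu.com",
                         proxies={ "http": proxy_url, "https": proxy_url },
                         timeout=timeout)
            alive.append([ip, port])
        except requests.RequestException:
            continue
    return alive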


def articleId(uid):
    """
    Fetch the user's blog post list by scraping the profile page.
    :param uid: CSDN user ID
    :return: list of article URLs
    """
    articleIds = []
    articleList = host + "/" + uid
    headers.update({ "User-Agent": random.choice(user_agent) })
    response = requests.get(articleList, headers=headers).content
    soup = BeautifulSoup(response, "html.parser")
    articles = soup.find_all("article", attrs={ "class": "blog-list-box" })
    for article in articles:
        art_info = article.find_all('a', attrs={ "target": "_blank", 'href': True })
        # print(art_info[0].attrs['href'])
        articleIds.append(art_info[0].attrs['href'])
    return articleIds


def articleIdByApi(uid, page = 1, size = 20):
    """
    Fetch the user's blog post list via the community API.
    :param uid: CSDN user ID
    :param page: starting page number
    :param size: page size
    :return: list of article URLs
    """
    blogListUrl = f'{host}/community/home-api/v1/get-business-list'
    params = {
        "page"        : page,
        "size"        : size,
        "businessType": "lately",
        "orderby"     : "",
        "noMore"      : False,
        "year"        : "",
        "month"       : "",
        "username"    : uid
    }
    articleIds = []
    headers.update({ "User-Agent": random.choice(user_agent) })
    currentPage = page

    while True:
        try:
            response = requests.get(blogListUrl, params=params, headers=headers).json()
            articleList = response.get("data", { }).get('list', [])
            logger.debug(f"获取第 {currentPage} 页博客 {articleList.__len__()} 篇")
            for article_info in articleList:
                articleIds.append(article_info.get('url'))
            if len(articleList) < size or currentPage > 30:  # a short page is the last page; 30 pages is a safety cap
                break
            currentPage += 1
            params.update({ 'page': currentPage })
        except Exception as e:
            logger.error(f'Failed to fetch blog list for {uid}: {e}')
            break
    logger.debug(f"{uid}{articleIds.__len__()} 篇博客")

    return articleIds
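
# For reference, the loop above only reads data.list[].url from the response;
# the assumed JSON shape is roughly:
# { "data": { "list": [ { "url": "https://blog.csdn.net/<uid>/article/details/<id>", ... }, ... ] } }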


def PV(IPs, uid, codes):
    s = requests.Session()
    count = 0
    url = host + "/{}/article/details/{}"

    while True:
        count += 1
        logger.info("Visit round {}".format(count))
        proxie = random.choice(IPs)
        ua = random.choice(user_agent)
        logger.debug("{} -- {}".format(proxie, ua))
        # Apply the proxy to both schemes; blog.csdn.net is served over HTTPS,
        # so an http-only proxy mapping would never be used.
        proxy_url = "http://{}:{}".format(proxie[0], proxie[1])
        s.proxies = { "http": proxy_url, "https": proxy_url }
        # dict.update() returns None, so build the header dict before assigning it.
        request_headers = headers.copy()
        request_headers["User-Agent"] = ua
        s.headers = request_headers

        for code in codes:
            articleUrl = code if 'http' in code else url.format(uid, code)
            try:
                html = s.get(articleUrl, timeout=10).text
            except requests.RequestException:
                html = ""
            if not html:
                # Dead proxy; rotate to a fresh one instead of re-setting the same address.
                proxie = random.choice(IPs)
                proxy_url = "http://{}:{}".format(proxie[0], proxie[1])
                s.proxies = { "http": proxy_url, "https": proxy_url }
                continue
            soup = BeautifulSoup(html, "html.parser")
            spans = soup.find_all(name="span", attrs={ "class": "read-count" })
            if spans:
                logger.debug(f"{code} current read count: {spans[0].text}")
        time.sleep(random.randint(1, 35))


class addReadNum(threading.Thread):
    def __init__(self, IPs, uid, articleIds):
        threading.Thread.__init__(self)
        self.IPs = IPs
        self.uid = uid
        self.articleIds = articleIds

    def run(self):
        PV(self.IPs, self.uid, self.articleIds)
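

# Example use of the thread wrapper above (a sketch; the original code defines
# addReadNum but never starts it). Each thread runs its own PV() visiting loop:
def demo_threads(thread_count = 3):
    articleIds = articleIdByApi(uid)
    IPs = parseIPList()
    threads = [addReadNum(IPs, uid, articleIds) for _ in range(thread_count)]
    for t in threads:
        t.start()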


def demo_schedule():
    articleIds = articleIdByApi(uid)
    IPs = parseIPList()
    PV(IPs, uid, articleIds)


import schedule

# Run at minute :00 of every hour. Note: until() sets a deadline (3 minutes
# from scheduling time) after which the job is canceled; it does not stop a
# run that is already in progress.
schedule.every().hours.at(':00').do(demo_schedule).until(timedelta(minutes=3))

schedule.run_all()  # also trigger every job once immediately

while True:
    schedule.run_pending()  # run whichever jobs are due
    time.sleep(30)
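
One caveat about the scheduling above: until() only cancels future runs, and PV() itself loops forever, so the first run triggered by run_all() never returns and no later run is ever reached. Below is a minimal deadline-bounded sketch; the demo_schedule_bounded name and the run_for parameter are illustrative additions, not part of the original code:

from datetime import datetime


def demo_schedule_bounded(run_for = timedelta(minutes=3)):
    """Visit articles repeatedly, but return once the deadline passes."""
    deadline = datetime.now() + run_for
    articleIds = articleIdByApi(uid)
    IPs = parseIPList()
    s = requests.Session()
    while datetime.now() < deadline:
        proxie = random.choice(IPs)
        proxy_url = f"http://{proxie[0]}:{proxie[1]}"
        s.proxies = { "http": proxy_url, "https": proxy_url }
        request_headers = headers.copy()
        request_headers["User-Agent"] = random.choice(user_agent)
        s.headers = request_headers
        for code in articleIds:
            articleUrl = code if 'http' in code else f"{host}/{uid}/article/details/{code}"
            try:
                s.get(articleUrl, timeout=10)
            except requests.RequestException:
                continue
        time.sleep(random.randint(1, 35))


# Scheduling this bounded job makes the hourly cadence behave as intended:
# schedule.every().hours.at(':00').do(demo_schedule_bounded)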
