o(∩_∩)o Setting Up Proxies to Visit a Blog (Part 3) o(∩_∩)o


Adding scheduled execution

Let's jump straight to the implementation:

# -*- coding:UTF-8 -*-

import json
import random
import re
import threading
import time
from datetime import timedelta

import requests
from bs4 import BeautifulSoup
from loguru import logger

uid = "qq_17328759"  # CSDN user ID

host = "https://blog.csdn.net"
headers = {
    'Accept'         : 'application/json, text/plain, */*',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
    'Referer'        : f'https://blog.csdn.net/{uid}',
    'Connection'     : 'keep-alive',
    'Sec-Fetch-Dest' : 'empty',
    'Sec-Fetch-Mode' : 'cors',
    'Sec-Fetch-Site' : 'same-origin'
}

user_agent = [
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 "
    "Safari/534.50",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; "
    ".NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 "
    "Safari/535.11",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR "
    "2.0.50727; SE 2.X MetaSr 1.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
]


def parseIPList(url = "https://www.kuaidaili.com/free/inha/"):
    """
    Fetch a list of free proxy addresses.
    :param url: proxy-list site  # alternatives: "https://www.beesproxy.com/free"  "https://proxy.mimvp.com/freeopen"
    :return: list of [ip, port] pairs
    """

    def search_by_re(string):
        IPs = []
        pattern = re.compile(r'const fpsList = (.*?);')
        re_result = pattern.search(string)
        if not re_result:
            return IPs
        ips = json.loads(re_result.groups()[0])
        for ip_info in ips:
            IPs.append([ip_info['ip'], ip_info['port']])
        return IPs

    def search_by_bs(string):
        IPs = []
        soup = BeautifulSoup(string, "html.parser")
        tds = soup.find_all("td")
        ip = ''
        port = ''
        for td in tds:
            if 'data-title' not in td.attrs:
                continue
            if 'IP' == td.attrs['data-title']:
                ip = td.text
            if "PORT" == td.attrs['data-title']:
                port = td.text
                IPs.append([ip, port])
        return IPs

    url += f'{random.randint(1, 300)}/'  # jump to a random page of the free-proxy list
    headers_proxy = headers.copy()
    headers_proxy.update({ "User-Agent": random.choice(user_agent) })
    del headers_proxy['Referer']
    response = requests.get(url, headers=headers_proxy)
    IPs = []
    IPs.extend(search_by_bs(response.text))
    IPs.extend(search_by_re(response.text))
    return IPs
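

# Free proxies go stale quickly, so it can help to probe each address before
# using it. This checker is an optional sketch, not part of the original post;
# the probe target "http://www.baidu.com" is an arbitrary choice.
def checkIPs(IPs, timeout = 3):
    alive = []
    for ip, port in IPs:
        proxy_url = f"http://{ip}:{port}"
        try:
            requests.get("http://www.baidu.com",
                         proxies={ "http": proxy_url, "https": proxy_url },
                         timeout=timeout)
            alive.append([ip, port])
        except requests.RequestException:
            continue
    return alive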


def articleId(uid):
    """
    Fetch the user's blog post list by scraping the profile page.
    :param uid: CSDN user ID
    :return: list of article URLs
    """
    articleIds = []
    articleList = host + "/" + uid
    headers.update({ "User-Agent": random.choice(user_agent) })
    response = requests.get(articleList, headers=headers).content
    soup = BeautifulSoup(response, "html.parser")
    articles = soup.find_all("article", attrs={ "class": "blog-list-box" })
    for article in articles:
        art_info = article.find_all('a', attrs={ "target": "_blank", 'href': True })
        # print(art_info[0].attrs['href'])
        articleIds.append(art_info[0].attrs['href'])
    return articleIds


def articleIdByApi(uid, page = 1, size = 20):
    """
    Fetch the user's blog post list via the community API.
    :param uid: CSDN user ID
    :param page: starting page number
    :param size: page size
    :return: list of article URLs
    """
    blogListUrl = f'{host}/community/home-api/v1/get-business-list'
    params = {
        "page"        : page,
        "size"        : size,
        "businessType": "lately",
        "orderby"     : "",
        "noMore"      : False,
        "year"        : "",
        "month"       : "",
        "username"    : uid
    }
    articleIds = []
    headers.update({ "User-Agent": random.choice(user_agent) })
    currentPage = page

    while True:
        try:
            response = requests.get(blogListUrl, params=params, headers=headers).json()
            articleList = response.get("data", { }).get('list', [])
            logger.debug(f"获取第 {currentPage} 页博客 {articleList.__len__()} 篇")
            for article_info in articleList:
                articleIds.append(article_info.get('url'))
            if len(articleList) < size or currentPage > 30:  # a short page is the last page; 30 pages is a safety cap
                break
            currentPage += 1
            params.update({ 'page': currentPage })
        except Exception as e:
            logger.error(f'Failed to fetch blog list for {uid}: {e}')
            break
    logger.debug(f"{uid}{articleIds.__len__()} 篇博客")

    return articleIds
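
# For reference, the loop above only reads data.list[].url from the response;
# the assumed JSON shape is roughly:
# { "data": { "list": [ { "url": "https://blog.csdn.net/<uid>/article/details/<id>", ... }, ... ] } }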


def PV(IPs, uid, codes):
    s = requests.Session()
    count = 0
    url = host + "/{}/article/details/{}"

    while True:
        count += 1
        logger.info("Visit round {}".format(count))
        proxie = random.choice(IPs)
        ua = random.choice(user_agent)
        logger.debug("{} -- {}".format(proxie, ua))
        # Apply the proxy to both schemes; blog.csdn.net is served over HTTPS,
        # so an http-only proxy mapping would never be used.
        proxy_url = "http://{}:{}".format(proxie[0], proxie[1])
        s.proxies = { "http": proxy_url, "https": proxy_url }
        # dict.update() returns None, so build the header dict before assigning it.
        request_headers = headers.copy()
        request_headers["User-Agent"] = ua
        s.headers = request_headers

        for code in codes:
            articleUrl = code if 'http' in code else url.format(uid, code)
            try:
                html = s.get(articleUrl, timeout=10).text
            except requests.RequestException:
                html = ""
            if not html:
                # Dead proxy; rotate to a fresh one instead of re-setting the same address.
                proxie = random.choice(IPs)
                proxy_url = "http://{}:{}".format(proxie[0], proxie[1])
                s.proxies = { "http": proxy_url, "https": proxy_url }
                continue
            soup = BeautifulSoup(html, "html.parser")
            spans = soup.find_all(name="span", attrs={ "class": "read-count" })
            if spans:
                logger.debug(f"{code} current read count: {spans[0].text}")
        time.sleep(random.randint(1, 35))


class addReadNum(threading.Thread):
    def __init__(self, IPs, uid, articleIds):
        threading.Thread.__init__(self)
        self.IPs = IPs
        self.uid = uid
        self.articleIds = articleIds

    def run(self):
        PV(self.IPs, self.uid, self.articleIds)
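

# Example use of the thread wrapper above (a sketch; the original code defines
# addReadNum but never starts it). Each thread runs its own PV() visiting loop:
def demo_threads(thread_count = 3):
    articleIds = articleIdByApi(uid)
    IPs = parseIPList()
    threads = [addReadNum(IPs, uid, articleIds) for _ in range(thread_count)]
    for t in threads:
        t.start()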


def demo_schedule():
    articleIds = articleIdByApi(uid)
    IPs = parseIPList()
    PV(IPs, uid, articleIds)


import schedule

# Run at minute :00 of every hour. Note: until() sets a deadline (3 minutes
# from scheduling time) after which the job is canceled; it does not stop a
# run that is already in progress.
schedule.every().hours.at(':00').do(demo_schedule).until(timedelta(minutes=3))

schedule.run_all()  # also trigger every job once immediately

while True:
    schedule.run_pending()  # run whichever jobs are due
    time.sleep(30)
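
One caveat about the scheduling above: until() only cancels future runs, and PV() itself loops forever, so the first run triggered by run_all() never returns and no later run is ever reached. Below is a minimal deadline-bounded sketch; the demo_schedule_bounded name and the run_for parameter are illustrative additions, not part of the original code:

from datetime import datetime


def demo_schedule_bounded(run_for = timedelta(minutes=3)):
    """Visit articles repeatedly, but return once the deadline passes."""
    deadline = datetime.now() + run_for
    articleIds = articleIdByApi(uid)
    IPs = parseIPList()
    s = requests.Session()
    while datetime.now() < deadline:
        proxie = random.choice(IPs)
        proxy_url = f"http://{proxie[0]}:{proxie[1]}"
        s.proxies = { "http": proxy_url, "https": proxy_url }
        request_headers = headers.copy()
        request_headers["User-Agent"] = random.choice(user_agent)
        s.headers = request_headers
        for code in articleIds:
            articleUrl = code if 'http' in code else f"{host}/{uid}/article/details/{code}"
            try:
                s.get(articleUrl, timeout=10)
            except requests.RequestException:
                continue
        time.sleep(random.randint(1, 35))


# Scheduling this bounded job makes the hourly cadence behave as intended:
# schedule.every().hours.at(':00').do(demo_schedule_bounded)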
