增加浏览量

import importlib
import requests
import random
import time
import sys
import re

# --------------------------
# Text encoding
#
# Python 2 scripts commonly forced UTF-8 by calling reload(sys) followed by
# sys.setdefaultencoding('utf-8'). Python 3 strings are Unicode and
# sys.setdefaultencoding no longer exists, so the importlib.reload(sys) call
# that used to sit here served no purpose and has been removed. The response
# encoding is set explicitly where the page is downloaded.
# --------------------------

# --------------------------
# Browser identities to send with the HTTP requests. To see the string your
# own browser reports, open about:version in its address bar.
user_agent = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3741.400 QQBrowser/10.5.3863.400',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; InfoPath.3)',
    # The closing parenthesis was missing from this entry.
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; GTB7.0)',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; ) AppleWebKit/534.12 (KHTML, like Gecko) Maxthon/3.0 Safari/534.12',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E)',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
]

# Fixed headers for the profile-page request. The value is the QQBrowser entry
# of the list above, referenced by index instead of repeated as a second
# literal so the two cannot drift apart.
headers = {'User-Agent': user_agent[1]}

# --------------------------
# Download the CSDN profile page, collect the article URLs it links to, then
# request randomly chosen articles with a pause between requests.

# Profile page whose article links are collected.
PROFILE_URL = "https://blog.csdn.net/XXXX"

# Article links look like 'https://blog.csdn.net/XXXX/article/details/<digits>'.
# Raw string so that \d reaches the regex engine untouched; the dots are
# escaped so they match only a literal '.'.
ARTICLE_URL_PATTERN = re.compile(
    r'https://blog\.csdn\.net/XXXX/article/details/\d+'
)

# Seconds to wait for a response. Without a timeout, requests can block
# forever on a stalled connection.
REQUEST_TIMEOUT = 10


def extract_article_urls(page_text):
    """Return the distinct article URLs found in *page_text*.

    The page lists each article more than once, so duplicates are dropped.
    The result is sorted to make the order reproducible.
    """
    return sorted(set(ARTICLE_URL_PATTERN.findall(page_text)))


def fetch_article_urls(profile_url=PROFILE_URL):
    """Download *profile_url* and return the article URLs it contains.

    Raises:
        requests.RequestException: on network failure, timeout, or an HTTP
            error status.
    """
    response = requests.get(profile_url, headers=headers,
                            timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    # The page is decoded as UTF-8 regardless of what the server declares.
    response.encoding = 'utf-8'
    return extract_article_urls(response.text)


def refresh_articles(url_lists, total=1000, min_wait=40, max_wait=60):
    """Request randomly chosen URLs from *url_lists*, *total* times.

    Each request uses a User-Agent picked at random from ``user_agent`` and
    is followed by a pause of ``min_wait``..``max_wait`` seconds (inclusive).
    A failed request is reported and skipped instead of aborting the run.

    Returns:
        The number of requests that completed successfully.
    """
    if not url_lists:
        # random.choice() raises IndexError on an empty sequence.
        print("未找到任何博客文章地址,无需刷新。")
        return 0

    refreshed = 0
    for _ in range(total):
        target = random.choice(url_lists)
        request_headers = {'User-Agent': random.choice(user_agent)}
        try:
            response = requests.get(target, headers=request_headers,
                                    timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.RequestException as exc:
            print("网页 %s 刷新失败: %s" % (target, exc))
        else:
            refreshed += 1
            print("网页 %s 刷新成功!已刷新 %d 次!" % (target, refreshed))
        time.sleep(random.randint(min_wait, max_wait))
    return refreshed


def main():
    """Collect the article URLs from the profile page and refresh them."""
    refresh_articles(fetch_article_urls())


# Run only when executed as a script, so importing this module performs no
# network traffic.
if __name__ == "__main__":
    main()
  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值