当有人浏览博客时,博客访问量会增加。只要规避csdn的监控机制,那么写一个python爬虫来访问博客,也可以达到增加访问量的效果。
程序环境为:python3.5
所需的库为:
import requests
import re
import random
import time
1. 获取 User-Agent 列表
# url = 'https://www.cnblogs.com/1906859953Lucas/p/9027165.html'  # page listing User-Agent strings
# pattern = "</strong></span><br>(.*?)</p>"  # regex capturing the raw UA text block
def getUserAgent(url, pattern):
    """Scrape User-Agent strings from *url* and store them in ``valuesDict``.

    Args:
        url: HTML page that lists User-Agent strings.
        pattern: regex whose group 1 captures the block of UA text.

    Side effects:
        Rebinds the module-level ``valuesDict`` to a list of
        ``{"User-Agent": value}`` header dicts.

    Raises:
        requests.HTTPError: on a non-2xx response (via raise_for_status).
    """
    global valuesDict
    # timeout prevents the script from hanging forever on a stalled connection
    res = requests.get(url, headers=header, timeout=10)
    res.raise_for_status()
    res.encoding = res.apparent_encoding
    raw_blocks = re.findall(pattern, res.text, re.M | re.S | re.I)
    # each matched block contains several UAs separated by <br> tags
    split_blocks = [block.replace('</p>\r\n', '').split('<br>') for block in raw_blocks]
    agents = [ua for block in split_blocks for ua in block]
    valuesDict = [{"User-Agent": ua} for ua in agents]
同一个 User-Agent 重复访问会被识别出来,因此可以使用多个 User-Agent 随机访问。上面链接的博客中有一位博主总结的 User-Agent 列表,我们要把这些 User-Agent 爬取下来。
2. 访问主页,获取博客的链接
# pattern = 'data-articleid="(.*?)"'
def visitPage(urls, pattern):
    """Collect article links from the blog list pages into ``blogLinks``.

    Args:
        urls: list of blog article-list page URLs.
        pattern: regex whose group 1 captures an article id.

    Side effects:
        Extends and rebinds the module-level ``blogLinks`` with
        deduplicated article URLs.
    """
    global blogLinks
    # Change 40 to your own number of blog posts -- important!
    while len(blogLinks) < 40:
        found_before = len(blogLinks)
        for url in urls:
            # random UA per request; timeout so a dead connection cannot hang us
            res = requests.get(url, headers=random.choice(valuesDict), timeout=10)
            article_ids = re.findall(pattern, res.text, re.M | re.S | re.I)
            # Change to your own article base URL
            blogLinks.extend(
                'https://blog.csdn.net/daoyone/article/details/{}'.format(i)
                for i in article_ids
            )
        blogLinks = list(set(blogLinks))
        # Guard: if a full pass over urls found nothing new, the blog has
        # fewer articles than the threshold -- stop instead of looping forever.
        if len(blogLinks) == found_before:
            break
从CSDN的个人主页上获取所有文章的链接。正则表达式的书写可以通过查看页面源码,然后根据信息自己写出来。
3.开始随机访问博客
# Endlessly visit random article links, rotating User-Agent headers.
while True:
    # pick a random article URL from the collected pool
    url = random.choice(blogLinks)
    print("正在访问{}".format(url))
    # visit; timeout keeps one stalled request from hanging the loop forever
    requests.get(url=url, headers=random.choice(valuesDict), timeout=10)
    # random delay of up to 6 seconds between visits
    time.sleep(random.random() * 6)
4.所有代码
import requests
import re
import random
import time
# Pool of {"User-Agent": ...} header dicts, filled by getUserAgent().
valuesDict = []
# Deduplicated article URLs, filled by visitPage().
blogLinks = []
# Default header used only for the initial User-Agent scrape.
header = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:78.0) Gecko/20100101 Firefox/78.0'}
# Fetch a pool of User-Agent strings scraped from a reference page.
def getUserAgent(url, pattern):
    """Scrape User-Agent strings from *url* and store them in ``valuesDict``.

    Args:
        url: HTML page that lists User-Agent strings.
        pattern: regex whose group 1 captures the block of UA text.

    Side effects:
        Rebinds the module-level ``valuesDict`` to a list of
        ``{"User-Agent": value}`` header dicts.

    Raises:
        requests.HTTPError: on a non-2xx response (via raise_for_status).
    """
    global valuesDict
    # timeout prevents the script from hanging forever on a stalled connection
    res = requests.get(url, headers=header, timeout=10)
    res.raise_for_status()
    res.encoding = res.apparent_encoding
    raw_blocks = re.findall(pattern, res.text, re.M | re.S | re.I)
    # each matched block contains several UAs separated by <br> tags
    split_blocks = [block.replace('</p>\r\n', '').split('<br>') for block in raw_blocks]
    agents = [ua for block in split_blocks for ua in block]
    valuesDict = [{"User-Agent": ua} for ua in agents]
# Visit the list pages and gather article links.
def visitPage(urls, pattern):
    """Collect article links from the blog list pages into ``blogLinks``.

    Args:
        urls: list of blog article-list page URLs.
        pattern: regex whose group 1 captures an article id.

    Side effects:
        Extends and rebinds the module-level ``blogLinks`` with
        deduplicated article URLs.
    """
    global blogLinks
    # Change 40 to your own number of blog posts -- important!
    while len(blogLinks) < 40:
        found_before = len(blogLinks)
        for url in urls:
            # random UA per request; timeout so a dead connection cannot hang us
            res = requests.get(url, headers=random.choice(valuesDict), timeout=10)
            article_ids = re.findall(pattern, res.text, re.M | re.S | re.I)
            # Change to your own article base URL
            blogLinks.extend(
                'https://blog.csdn.net/daoyone/article/details/{}'.format(i)
                for i in article_ids
            )
        blogLinks = list(set(blogLinks))
        # Guard: if a full pass over urls found nothing new, the blog has
        # fewer articles than the threshold -- stop instead of looping forever.
        if len(blogLinks) == found_before:
            break
# Orchestrate the whole run.
def runVisitPage():
    """Entry point: scrape the User-Agent pool, collect the article
    links, then visit random articles forever with random delays."""
    # Page listing User-Agent strings; no need to change this
    url = 'https://www.cnblogs.com/1906859953Lucas/p/9027165.html'
    pattern = "</strong></span><br>(.*?)</p>"
    getUserAgent(url=url, pattern=pattern)
    # Change to your own blog address; extend the list to cover your page range
    urls = ['https://blog.csdn.net/daoyone/article/list/1']
    pattern = 'data-articleid="(.*?)"'
    visitPage(urls, pattern)
    while True:
        # pick a random article URL from the collected pool
        url = random.choice(blogLinks)
        print("正在访问{}".format(url))
        # visit; timeout keeps one stalled request from hanging the loop forever
        requests.get(url=url, headers=random.choice(valuesDict), timeout=10)
        # random delay of up to 6 seconds between visits
        time.sleep(random.random() * 6)
# Script entry point: run the full scrape-and-visit loop.
if __name__ == '__main__':
    runVisitPage()