# Code:
import requests, re, random
# Pool of request-header dicts of the form {"User-Agent": ...}; filled by getUserAgent().
valuesDict = []
# Article URLs collected by visitPage(); runVisitPage() picks random entries from it.
blogLinks = []
# Build the pool of User-Agent headers
def getUserAgent(url, pattern):
    """Download a page listing User-Agent strings and fill ``valuesDict``.

    Every match of ``pattern`` in the page text has the literal
    ``'</p>\\r\\n'`` removed and is then split on ``'<br />'``; each
    non-empty fragment becomes one ``{"User-Agent": fragment}`` dict.

    Args:
        url: Address of the page to download.
        pattern: Regular expression with a single capturing group, applied
            with ``re.M | re.S | re.I``.

    Returns:
        None. The module-level ``valuesDict`` is rebound to the new list.

    Raises:
        requests.RequestException: If the download fails or times out.
    """
    global valuesDict
    # Without a timeout a stalled server would block this call forever.
    res = requests.get(url, timeout=10)
    res.encoding = res.apparent_encoding
    chunks = re.findall(pattern, res.text, re.M | re.S | re.I)
    agents = []
    for chunk in chunks:
        for fragment in chunk.replace('</p>\r\n', '').split('<br />'):
            # requests refuses header values that start with whitespace or
            # contain line breaks, so trim each fragment and skip empty ones.
            agent = fragment.strip()
            if agent:
                agents.append(agent)
    valuesDict = [{"User-Agent": agent} for agent in agents]
# Visit the list pages and collect article links
def visitPage(urls, pattern, minLinks=50,
              linkTemplate=r'https://blog.csdn.net/weixin_43690548/article/details/{}'):
    """Collect article URLs from the given list pages into ``blogLinks``.

    Each page in ``urls`` is downloaded, ``pattern`` is applied to its text,
    and every captured article id is formatted into ``linkTemplate``. Links
    not seen before are appended to the module-level ``blogLinks`` list.
    Passes over ``urls`` repeat until ``blogLinks`` holds at least
    ``minLinks`` entries or a complete pass adds nothing new.

    Args:
        urls: Iterable of list-page URLs to scan.
        pattern: Regular expression with a single capturing group (the
            article id), applied with ``re.M | re.S | re.I``.
        minLinks: Number of links to aim for; set it to the number of
            articles on the blog.
        linkTemplate: ``str.format`` template for an article URL; change it
            to your own blog's article address.

    Returns:
        None. ``blogLinks`` is extended in place, in first-seen order.

    Raises:
        requests.RequestException: If a download fails or times out.
    """
    seen = set(blogLinks)
    while len(blogLinks) < minLinks:
        countBefore = len(blogLinks)
        for url in urls:
            # Fall back to requests' default headers when the User-Agent
            # pool is empty instead of failing on an empty population.
            headers = random.choice(valuesDict) if valuesDict else None
            res = requests.get(url, headers=headers, timeout=10)
            for articleId in re.findall(pattern, res.text, re.M | re.S | re.I):
                link = linkTemplate.format(articleId)
                if link not in seen:
                    seen.add(link)
                    blogLinks.append(link)
        # Re-fetching the same pages cannot reach the target once a full pass
        # finds nothing new; stop rather than loop forever.
        if len(blogLinks) == countBefore:
            break
# Entry point
def runVisitPage():
    """Build the User-Agent pool, collect article links, then visit them.

    Calls ``getUserAgent`` and ``visitPage`` with the addresses and patterns
    configured below, then requests a randomly chosen entry of ``blogLinks``
    in an endless loop, sleeping a random 0-6 seconds between requests.
    Returns early if no article link was collected; otherwise it runs until
    interrupted or until a request raises.

    NOTE(review): automated repeat visits like this may conflict with the
    target site's terms of service - confirm before running.

    Raises:
        requests.RequestException: If any download fails or times out.
    """
    import time

    # Page that lists User-Agent strings; no need to change it.
    url = r'https://www.cnblogs.com/1906859953Lucas/p/9027165.html'
    pattern = "</strong></span><br />(.*?)<p>.*?<br />"
    getUserAgent(url=url, pattern=pattern)
    # Change to your own blog address; range(1, 4) covers list pages 1-3,
    # adjust it to the number of list pages your blog has.
    urls = [r'https://blog.csdn.net/weixin_43690548/article/list/{}?'.format(i) for i in range(1, 4)]
    pattern = 'data-articleid="(.*?)"'
    visitPage(urls, pattern)
    # Picking from an empty list would raise an opaque error; stop cleanly.
    if not blogLinks:
        print("未获取到任何文章链接,已退出")
        return
    while True:
        # Pick a random article URL.
        url = random.choice(blogLinks)
        print("正在访问{}".format(url))
        # Use requests' default headers when the User-Agent pool is empty.
        headers = random.choice(valuesDict) if valuesDict else None
        # The timeout keeps one stalled connection from freezing the loop.
        requests.get(url=url, headers=headers, timeout=10)
        # Random pause before the next request.
        time.sleep(random.random() * 6)
# Start the visitor only when this file is executed as a script, not on import.
if __name__ == "__main__":
    runVisitPage()