我的博客第一页与第二页及以后各页的格式不一样,所以需要对数据分两种方式进行处理。
大家可以运行看一下,代码就在下面,先说一下思路吧。
首先,第一个函数获取到最大页数,因为我的最大页数要从首页的分页链接里提取。
第二个函数,分析处理第一页的内容并得到标题等参数。
第三个函数,分析处理第二页及以后各页的内容并得到标题等参数。为什么同样是分析参数却要两个函数?因为博客第一页和第二页不一样:第一页有一个轮播,页面代码的结构被打乱了。
主函数我就不解释了,很简单。
import requests
import re
def most():
    """Return the blog's highest page number, scraped from the home page.

    Downloads http://www.anyuer.club, finds the pagination "next" marker
    (``class="next">›``) and returns the value of the first ``page=`` query
    parameter that appears after it.

    Returns:
        str: The captured page-number text. Callers convert it with ``int``.

    Raises:
        IndexError: If the "next" marker, or a ``page=`` link after it, is
            not present in the downloaded page.
        requests.RequestException: On network failure or timeout.
    """
    url = "http://www.anyuer.club"
    headers = {
        'cache-control': "no-cache",
        'Postman-Token': "fae95858-0f94-4ab2-8bee-d8db99bee459"
    }
    # A timeout keeps the script from hanging forever on a dead connection.
    response = requests.get(url, headers=headers, timeout=10)
    html = response.text

    # NOTE(review): the published pattern was 'class="next">›([\s\S]*?)' with
    # nothing after the lazy group, so it could only ever capture ''. Its
    # closing delimiter looks like it was lost when the code was posted.
    # Searching everything after the marker yields the same first "page="
    # hit whatever that delimiter was -- confirm against the live markup.
    marker = re.search(r'class="next">›', html)
    if marker is None:
        raise IndexError('pagination marker class="next">› not found')

    page_match = re.search(r'page=(.*?)">', html[marker.end():])
    if page_match is None:
        raise IndexError('no page= link found after the pagination marker')
    return page_match.group(1)
def page1(page):
    """Fetch one page of the blog and print every post found on it.

    This variant is meant for the first page, where post titles are read
    from the ``alt`` attribute of the post image.

    Args:
        page: Page number, sent to the site as the ``page`` query parameter.

    Returns:
        None. One formatted entry per post is written to stdout.

    Raises:
        requests.RequestException: On network failure or timeout.
    """
    url = "http://www.anyuer.club"
    headers = {
        'cache-control': "no-cache",
        'Postman-Token': "738f9be1-5d9a-44e3-a4da-5f5b92919cb1"
    }
    # A timeout keeps the script from hanging forever on a dead connection.
    response = requests.get(
        url, headers=headers, params={"page": page}, timeout=10
    )
    html = response.text

    title_pattern = r' alt="(.*?)">'
    # NOTE(review): the three patterns below are reproduced exactly as they
    # appear in this file. It looks like the HTML tags inside the literals
    # were stripped when the code was published, so they currently match
    # only empty strings and link/time/category print blank. Restore the
    # real tag-delimited patterns after checking the page markup.
    link_pattern = r''
    time_pattern = r'([\s\S]*?)'
    kind_pattern = r'([\s\S]*?)'

    # The greedy group spans from the first post image to the last comment
    # icon, i.e. the whole post listing.
    listing_pattern = r'class="img">([\s\S]*)class="fa fa-comments"'
    for section in re.findall(listing_pattern, html):
        titles = re.findall(title_pattern, section)  # post titles
        links = re.findall(link_pattern, section)    # post links
        times = re.findall(time_pattern, section)    # publication times
        kinds = re.findall(kind_pattern, section)    # post categories
        # zip() stops at the shortest list, so a missing field drops the
        # trailing entries instead of raising IndexError mid-print.
        for title, link, posted, kind in zip(titles, links, times, kinds):
            print("\n标题:" + title +
                  "\n链接:" + link +
                  "\n时间:" + posted +
                  "\n分类:" + kind +
                  "\n\n")
def page2(page):
    """Fetch one page of the blog and print every post found on it.

    This variant is meant for page 2 and later, where post titles are read
    from a ``title`` attribute instead of the image ``alt`` text.

    Args:
        page: Page number, sent to the site as the ``page`` query parameter.

    Returns:
        None. One formatted entry per post is written to stdout.

    Raises:
        requests.RequestException: On network failure or timeout.
    """
    url = "http://www.anyuer.club"
    headers = {
        'cache-control': "no-cache",
        'Postman-Token': "738f9be1-5d9a-44e3-a4da-5f5b92919cb1"
    }
    # A timeout keeps the script from hanging forever on a dead connection.
    response = requests.get(
        url, headers=headers, params={"page": page}, timeout=10
    )
    html = response.text

    # NOTE(review): in the published file these four literals were cut off
    # mid-string (apparently the HTML tags inside them were stripped). Each
    # is closed here exactly where its surviving text ends, so the function
    # parses again; link/time/category currently match only empty strings
    # and print blank. Restore the real tag-delimited patterns after
    # checking the page markup.
    title_pattern = r'" title="(.*?)">'
    link_pattern = r''
    time_pattern = r'([\s\S]*?)'
    kind_pattern = r'([\s\S]*?)'

    # The greedy group spans from the first post image to the last comment
    # icon, i.e. the whole post listing.
    listing_pattern = r'class="img">([\s\S]*)class="fa fa-comments"'
    for section in re.findall(listing_pattern, html):
        titles = re.findall(title_pattern, section)  # post titles
        links = re.findall(link_pattern, section)    # post links
        times = re.findall(time_pattern, section)    # publication times
        kinds = re.findall(kind_pattern, section)    # post categories
        # zip() stops at the shortest list, so a missing field drops the
        # trailing entries instead of raising IndexError mid-print.
        for title, link, posted, kind in zip(titles, links, times, kinds):
            print("\n标题:" + title +
                  "\n链接:" + link +
                  "\n时间:" + posted +
                  "\n分类:" + kind +
                  "\n\n")
# Driver: walk every page of the blog, from page 1 up to the maximum page
# number reported by most(). The first page has its own layout and is
# handled by page1(); every later page goes through page2().
#
# NOTE(review): the published loop header was truncated to "while x". The
# bound used here (pages 1..max inclusive) is reconstructed from the
# surviving "most = most + 1" and "x = 1" lines -- confirm it matches the
# original intent.
# The page count is kept in its own name so the most() function is not
# overwritten by an integer.
last_page = int(most())
for page_number in range(1, last_page + 1):
    if page_number == 1:
        page1(page_number)
    else:
        page2(page_number)