注意:Response类型不是JSON,直接使用BS4库即可
源码:
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 8 08:03:49 2019
@author: dell
"""
# Note: a tag's .text is the combined text of ALL of its descendant tags.
import requests
import time
from urllib.parse import urlencode
from bs4 import BeautifulSoup
def getPage(number):
    """Fetch one page of the OSChina recommended-blog widget and log its entries.

    The endpoint answers with an HTML fragment rather than JSON, so the body
    is parsed with BeautifulSoup. For every ``div.content`` entry, the
    ``title`` and ``href`` of its ``a.header`` anchor are recorded, followed
    by the text of up to five ``div.item`` children. Those children are
    labelled purely by position (author ID, publish time, views, comments,
    likes) -- this assumes the page lists them in that order.

    Args:
        number: Page index, sent as the ``p`` query parameter.

    Returns:
        None. Each entry is appended to "【开源中国Blog】:大数据.txt" in the
        current working directory and one progress line is printed per entry.
        Connection failures, timeouts and non-200 responses produce no output.
    """
    params = {
        "classification": "5593654",
        "p": str(number),
        "type": "ajax",
    }
    url = "https://www.oschina.net/blog/widgets/_blog_index_recommend_list?" + urlencode(params)
    request_timeout = 10  # seconds; without it a stalled server blocks forever

    try:
        response = requests.get(url, timeout=request_timeout)
    except (requests.ConnectionError, requests.Timeout):
        # Best-effort crawl: an unreachable or slow page is simply skipped.
        return None
    if response.status_code != 200:
        return None

    soup = BeautifulSoup(response.content.decode("UTF-8"), "html.parser")
    # Position-based labels for the first five div.item children of an entry;
    # zip() below ignores any further items, as the original counter did.
    item_labels = ("博主ID: ", "发布时间: ", "浏览量: ", "评论量: ", "点赞量: ")

    for entry in soup.find_all("div", {"class": "content"}):
        header = entry.find("a", {"class": "header"})
        if header is None:
            # Not a blog entry (no title anchor): nothing to record.
            continue
        # A missing attribute yields None, which cannot be concatenated.
        blogTitle = header.get("title") or ""
        blogUrl = header.get("href") or ""

        record = [
            "文章名:《" + blogTitle + "》\n",
            "文章链接:" + blogUrl + "\n",
        ]
        items = entry.find_all("div", {"class": "item"})
        record.extend(label + item.text + "\n" for label, item in zip(item_labels, items))
        record.append("\n")  # blank line separates consecutive entries

        with open("【开源中国Blog】:大数据.txt", "a+", encoding="UTF-8") as f:
            f.writelines(record)
        print(blogTitle + "------记录成功!")
    return None
def main():
    """Crawl list pages 0 through 9, pausing two seconds after each page."""
    page_total = 10
    pause_seconds = 2
    for page_index in range(page_total):
        getPage(page_index)
        # Announce the pause, then wait before requesting the following page.
        print("-----------------等待2s,抓取下一个页面------------------")
        time.sleep(pause_seconds)
# Start the crawl only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()