Python爬虫 - 【开源中国】Ajax渲染页面信息爬取

注意:Response类型不是JSON,直接使用BS4库即可



   源码:

# -*- coding: utf-8 -*-
"""
Created on Fri Mar  8 08:03:49 2019

@author: dell
"""
#.text表示该标签下的所有子标签的文本信息!!!



import requests
import time
from urllib.parse import urlencode
from bs4 import BeautifulSoup

def getPage(number):
    """Fetch one page of OSChina's Ajax-rendered recommended-blog list and
    append each entry's metadata to a local text file.

    Parameters:
        number: zero-based page index, sent as the "p" query parameter.

    Returns:
        None in all cases; network errors are swallowed (best-effort crawl),
        matching the original contract that callers ignore the return value.
    """
    params = {
        "classification": "5593654",
        "p": str(number),
        "type": "ajax",
    }
    url = ("https://www.oschina.net/blog/widgets/_blog_index_recommend_list?"
           + urlencode(params))

    # Labels matching the positional order of the div.item elements in each
    # entry (author id, publish time, views, comments, likes).
    labels = ("博主ID: ", "发布时间: ", "浏览量: ", "评论量: ", "点赞量: ")

    try:
        # timeout= keeps the crawler from hanging forever on a stalled server.
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            return None
        # Response is HTML fragments (not JSON), so parse with BeautifulSoup.
        soup = BeautifulSoup(response.content.decode("UTF-8"), "html.parser")
        # Open the output file once per page instead of once per entry.
        with open("【开源中国Blog】:大数据.txt", "a+", encoding="UTF-8") as f:
            # `entry` instead of the original `list`, which shadowed a builtin.
            for entry in soup.find_all("div", {"class": "content"}):
                header = entry.find("a", {"class": "header"})
                blogTitle = header.get("title")
                blogUrl = header.get("href")
                f.write("文章名:《" + blogTitle + "》\n")
                f.write("文章链接:" + blogUrl + "\n")
                # zip pairs each div.item with its label and stops at the
                # shorter sequence — same effect as the original count/elif
                # chain that ignored items past index 4.
                for label, item in zip(labels, entry.find_all("div", {"class": "item"})):
                    f.write(label + item.text + "\n")
                f.write("\n")
                print(blogTitle + "------记录成功!")
    except requests.RequestException:
        # Broadened from ConnectionError so timeouts and other transport
        # failures also follow the best-effort "return None" path.
        return None

def main():
    """Crawl the first 10 pages, pausing 2 seconds between requests."""
    for page in range(10):  # pages 0..9
        getPage(page)
        print("-----------------等待2s,抓取下一个页面------------------")
        time.sleep(2)


if __name__ == "__main__":
    main()

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值