Python爬虫 - 【开源中国】Ajax渲染页面信息爬取

注意:Response类型不是JSON,直接使用BS4库即可



   源码:

# -*- coding: utf-8 -*-
"""
Created on Fri Mar  8 08:03:49 2019

@author: dell
"""
#.text表示该标签下的所有子标签的文本信息!!!



import requests
import time
from urllib.parse import urlencode
from bs4 import BeautifulSoup

def getPage(number):
    """Fetch one page of OSChina's Ajax-rendered recommended-blog list and
    append each entry's metadata to a local text file.

    Parameters:
        number: zero-based page index, sent as the "p" query parameter.

    Returns:
        None in all cases; network errors are swallowed (best-effort crawl),
        matching the original contract that callers ignore the return value.
    """
    params = {
        "classification": "5593654",
        "p": str(number),
        "type": "ajax",
    }
    url = ("https://www.oschina.net/blog/widgets/_blog_index_recommend_list?"
           + urlencode(params))

    # Labels matching the positional order of the div.item elements in each
    # entry (author id, publish time, views, comments, likes).
    labels = ("博主ID: ", "发布时间: ", "浏览量: ", "评论量: ", "点赞量: ")

    try:
        # timeout= keeps the crawler from hanging forever on a stalled server.
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            return None
        # Response is HTML fragments (not JSON), so parse with BeautifulSoup.
        soup = BeautifulSoup(response.content.decode("UTF-8"), "html.parser")
        # Open the output file once per page instead of once per entry.
        with open("【开源中国Blog】:大数据.txt", "a+", encoding="UTF-8") as f:
            # `entry` instead of the original `list`, which shadowed a builtin.
            for entry in soup.find_all("div", {"class": "content"}):
                header = entry.find("a", {"class": "header"})
                blogTitle = header.get("title")
                blogUrl = header.get("href")
                f.write("文章名:《" + blogTitle + "》\n")
                f.write("文章链接:" + blogUrl + "\n")
                # zip pairs each div.item with its label and stops at the
                # shorter sequence — same effect as the original count/elif
                # chain that ignored items past index 4.
                for label, item in zip(labels, entry.find_all("div", {"class": "item"})):
                    f.write(label + item.text + "\n")
                f.write("\n")
                print(blogTitle + "------记录成功!")
    except requests.RequestException:
        # Broadened from ConnectionError so timeouts and other transport
        # failures also follow the best-effort "return None" path.
        return None

def main():
    """Crawl the first 10 pages, pausing 2 seconds between requests."""
    for page in range(10):  # pages 0..9
        getPage(page)
        print("-----------------等待2s,抓取下一个页面------------------")
        time.sleep(2)


if __name__ == "__main__":
    main()

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值