Python 爬虫:ASMR 网站在线音频的爬取(解决 Ajax 动态加载网页的问题)

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2020/12/17 13:32
# @Author  : huni
# @File    : 18asmr爬取.py
# @Software: PyCharm
import re
import os
import requests
from lxml import etree
import json
# Headers sent with every request; the site expects a referer from the
# category page and a browser-like User-Agent.
HEADERS = {
    'referer': 'https://18asmr.net/online/yinpin',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36'
}
# First page of the category is plain HTML; later pages come from the AJAX endpoint.
FIRST_PAGE_URL = 'https://18asmr.net/online/yinpin'
# The `action` query parameter is supplied once via `params=` (the original
# sent it twice: in the URL and in `params`).
LOAD_MORE_URL = 'https://18asmr.net/wp-admin/admin-ajax.php'
LOAD_MORE_PARAMS = {'action': 'zrz_load_more_posts'}
# NOTE(review): 'catL17' is kept exactly as found; it looks like it may be a
# typo for a numeric category id -- confirm against the browser's AJAX payload.
CATEGORY_TYPE = 'catL17'
# AJAX pages to crawl (pages 2 to 4); adjust as needed.
AJAX_PAGES = range(2, 5)
SAVE_DIR = './18ASMR'
# Seconds to wait for any single HTTP request before giving up.
REQUEST_TIMEOUT = 30
# Post links inside the AJAX HTML fragment, and the audio URL inside a post page.
POST_LINK_RE = re.compile(r'href="(.*?)" class="link-block"')
AUDIO_URL_RE = re.compile(r"url: '(.*?)'")


def extract_audio_url(page_html):
    """Return the first ``url: '...'`` value found in *page_html*, or None."""
    match = AUDIO_URL_RE.search(page_html)
    return match.group(1) if match else None


def audio_filename(audio_url):
    """Return the last ``/``-separated segment of *audio_url*."""
    return audio_url.split('/')[-1]


def is_online_audio(keywords):
    """Return True when the keyword string's last comma-separated tag is '在线音频'.

    Strings of 10 characters or fewer are rejected, as in the original check.
    """
    return len(keywords) > 10 and keywords.split(',')[-1] == '在线音频'


def fetch_text(url):
    """GET *url* and return the response body as text.

    Raises:
        requests.RequestException: on connection errors, timeouts or an
            HTTP error status.
    """
    resp = requests.get(url=url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    resp.raise_for_status()
    return resp.text


def save_audio(page_html):
    """Download the audio file referenced by a post page into SAVE_DIR.

    Returns True when a file was written, False when the page contains no
    usable audio URL.

    Raises:
        requests.RequestException: if the audio download fails.
        OSError: if the file cannot be written.
    """
    audio_url = extract_audio_url(page_html)
    if not audio_url:
        return False
    name = audio_filename(audio_url)
    if not name:
        # URL ended with '/', so there is no file name to save under.
        return False
    resp = requests.get(url=audio_url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    # Avoid saving an HTTP error page as if it were audio data.
    resp.raise_for_status()
    os.makedirs(SAVE_DIR, exist_ok=True)
    with open(f'{SAVE_DIR}/{name}', 'wb') as fp:
        fp.write(resp.content)
    print(name, '下载完成!')
    return True


def crawl_ajax_page(page):
    """Download audio from every online-audio post listed on AJAX page *page*.

    A failure on one post is reported and skipped so the rest of the page is
    still processed.

    Raises:
        requests.RequestException: if the listing request itself fails.
        ValueError: if the listing response is not valid JSON.
    """
    data = {
        'type': CATEGORY_TYPE,
        'paged': f'{page}'
    }
    resp = requests.post(url=LOAD_MORE_URL, headers=HEADERS, params=LOAD_MORE_PARAMS,
                         data=data, timeout=REQUEST_TIMEOUT)
    resp.raise_for_status()
    payload = json.loads(resp.text)
    # 'msg' carries an HTML fragment with the post cards.
    fragment = payload.get('msg') or ''
    for href in POST_LINK_RE.findall(fragment):
        try:
            page_html = fetch_text(href)
            tree = etree.HTML(page_html)
            # The ninth <meta> in <head> holds the comma-separated keywords.
            keywords = tree.xpath('/html/head/meta[9]/@content') if tree is not None else []
            if keywords and is_online_audio(keywords[0]):
                save_audio(page_html)
        except (requests.RequestException, OSError) as exc:
            print(href, '下载失败:', exc)


def crawl_first_page():
    """Download audio from every post listed on the first (static) page.

    Raises:
        requests.RequestException: if the listing page cannot be fetched.
    """
    tree = etree.HTML(fetch_text(FIRST_PAGE_URL))
    if tree is None:
        return
    for div in tree.xpath('//*[@id="main"]/div[1]/div'):
        # xpath() returns a list; the original passed the list itself as the URL.
        hrefs = div.xpath('./div/div[1]/a/@href')
        if not hrefs:
            continue
        try:
            save_audio(fetch_text(hrefs[0]))
        except (requests.RequestException, OSError) as exc:
            print(hrefs[0], '下载失败:', exc)


def main():
    """Crawl the AJAX-loaded pages first, then the first page."""
    for page in AJAX_PAGES:
        try:
            crawl_ajax_page(page)
        except (requests.RequestException, ValueError) as exc:
            print(f'page {page}', '下载失败:', exc)

    # First page
    try:
        crawl_first_page()
    except requests.RequestException as exc:
        print(FIRST_PAGE_URL, '下载失败:', exc)


if __name__ == '__main__':
    main()

如果觉得还可以,可以打赏下小编,感恩

在这里插入图片描述

  • 1
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值