Python 批量下载及BeautifulSoup的一些用法

最新推荐文章于 2024-03-01 01:59:13 发布

whu_xxie

最新推荐文章于 2024-03-01 01:59:13 发布

阅读量464

点赞数

分类专栏： Python 文章标签： python

本文链接：https://blog.csdn.net/whu_xxie/article/details/106171053

版权

Python 专栏收录该内容

2 篇文章 0 订阅

订阅专栏

小学网课，想把所有视频都下载下来以后学习。可以人工下载，下了几个之后发现了一些规律，用F12开发人员工具分析了页面代码，果断用python批量下载。由于后台文件命名不规范，有几个还是得人工查询了id再下载。本人python刚入门，再加上没必要做成全自动，所以算是半自动下载，需要自己先把各年级的html文件下载下来。

# -*- coding: utf-8 -*-
"""
Created on Wed May 13 14:39:55 2020

@author: xxiew
"""
import requests
from bs4 import BeautifulSoup
import re

# Collect the id and name of every matching lesson and download it.
def GetIdAndName():
    """Download every lesson in ``grade3.html`` whose markup mentions '湘少'.

    The locally saved page is parsed, every ``div`` with class ``sub-item``
    whose serialized markup contains '湘少' is selected, and for each one the
    element id and the text of its first ``span`` are printed and passed to
    ``getPlayerWeb``.

    Entries that have no ``id`` attribute or no ``span`` cannot be downloaded
    and are reported and skipped instead of aborting the whole batch.
    """
    # Read the page up front so the file handle is not held open while the
    # (potentially long) downloads run.
    with open('grade3.html', encoding='UTF-8') as read_file:
        htmlcontent = read_file.read()
    soup = BeautifulSoup(htmlcontent, 'html.parser')

    for item in soup.find_all('div', class_='sub-item'):
        if '湘少' not in str(item):
            continue

        idd = item.get('id')
        # Take the span from the matched element itself rather than searching
        # the whole document again by id: cheaper, and it cannot resolve to a
        # different element if an id happens to be duplicated.
        span = item.find('span')
        if idd is None or span is None:
            print('skipped: sub-item without id or span')
            continue

        # .string is None when the span has more than one child node; fall
        # back to the concatenated text so a name is always available.
        name = span.string if span.string is not None else span.get_text()
        print(idd)
        print(name)
        getPlayerWeb(idd, name)
        
# Download one video. The vid used by the download endpoint is not the parent
# page's id; it is a new value generated in the player page's JavaScript.
def getPlayerWeb(idd,name):
    """Fetch the player page for ``idd`` and save its video as ``name``.mp4.

    The player page is requested, the ``var url = '...'`` assignment is
    located inside one of its ``<script>`` tags, and everything after the
    first ``=`` in that URL is used as the ``vid`` for the download endpoint.
    The response is streamed into ``./grade3/<name>.mp4``.

    Args:
        idd: Lesson id appended to the player page URL.
        name: Lesson title used as the file name. Surrounding whitespace is
            stripped and characters that are invalid in file names are
            replaced with ``_``; if it is empty or None the id is used.

    Raises:
        requests.HTTPError: If either HTTP request returns an error status.
        ValueError: If the player page does not contain the expected script
            or the URL in it carries no ``vid`` value.
    """
    import os  # local import: only needed to create the output directory

    url = 'http://yun.***jiao.com/Home/Wk/zbxq/id/'
    # (connect, read) timeouts so a stalled server cannot hang the script.
    timeout = (10, 120)

    ret = requests.get(url + str(idd), timeout=timeout)
    ret.raise_for_status()
    soup = BeautifulSoup(ret.text, "html.parser")
    pattern = re.compile(r"var url = '(.*?)';", re.MULTILINE | re.DOTALL)
    # `string=` is the current name of the deprecated `text=` filter.
    script = soup.find("script", string=pattern)
    if script is None:
        raise ValueError('no "var url" script found on player page for id %s' % idd)

    # The filter above matched on script.string, so it is present here.
    fullurl = pattern.search(script.string).group(1)
    # Without the separator check a URL lacking '=' would silently be used
    # whole as the vid.
    _, sep, vid = fullurl.partition('=')
    if not sep or not vid:
        raise ValueError('no vid found in player url %r' % fullurl)

    urld = 'http://218.***.***.23:8181/index.php?a=downF&vid=' + vid

    # Build a file name that is valid on common file systems.
    base = (name or '').strip() or str(idd)
    base = re.sub(r'[\\/:*?"<>|]', '_', base)
    out_dir = './grade3/'
    os.makedirs(out_dir, exist_ok=True)

    # Stream to disk in chunks instead of holding the whole video in memory,
    # and refuse to save an HTTP error page as an .mp4.
    with requests.get(urld, stream=True, timeout=timeout) as resp:
        resp.raise_for_status()
        with open(out_dir + base + '.mp4', 'wb') as f:
            for chunk in resp.iter_content(chunk_size=1024 * 1024):
                if chunk:
                    f.write(chunk)
    print('done')
# Download a single lesson by id, for the cases where the uploaded file
# naming is irregular.
def getById(idd):
    """Look up ``idd`` in ``grade3.html`` and download that one lesson.

    The element with the given id is located in the locally saved page, the
    text of its first ``span`` is taken as the lesson name, both values are
    printed, and the download is delegated to ``getPlayerWeb``.
    """
    with open('grade3.html', encoding='UTF-8') as page:
        document = BeautifulSoup(page.read(), 'html.parser')

    print(idd)
    target = document.find(id=idd)
    first_span = target.select('span')[0]
    name = first_span.string
    print(name)
    getPlayerWeb(idd, name)
# Lessons fetched individually by id. Other ids kept for reference but
# currently disabled: 478, 623.
for _lesson_id in (283, 143):
    getById(_lesson_id)
# Disabled alternatives: batch-download everything, or download with an
# explicitly supplied id and name.
#GetIdAndName()
#getPlayerWeb(1794,"湘少版Unit7I'mnotafraid!(第")