python测试实验

该代码实现了一个Python爬虫,使用多线程从特定网站抓取课程信息,包括课程名称、价格和视频链接。抓取到的信息被存储在文本文件中,同时程序还下载了关联的视频文件。爬虫处理了分页,解析JSON数据,并对网络请求进行了错误处理和重试机制。
摘要由CSDN通过智能技术生成
import requests
import json
import time
import os
import queue
import threading
from tqdm import tqdm


# Global registry of [title, file_name, url] records parsed from urls.txt by
# getProcess(); read (not mutated) by worker threads in download().
arr_map=[]

class Work(threading.Thread):
    """Worker thread that drains video URLs from a shared queue and downloads each."""

    def __init__(self, q, name):
        threading.Thread.__init__(self)
        self.q = q        # queue.Queue of video URLs to download
        self.name = name  # thread name, e.g. 'Thread-1'

    def run(self):
        # Pull URLs until the queue is drained. Using get_nowait() with
        # queue.Empty removes the race in the original
        # `if q.empty(): break else: q.get()` pattern, where another worker
        # could empty the queue between the check and the get, leaving this
        # thread blocked forever on a blocking get().
        while True:
            try:
                url = self.q.get_nowait()
            except queue.Empty:
                break
            try:
                download(url)
            finally:
                # Always balance get() with task_done() so Queue.join() in the
                # main thread cannot hang if download() raises.
                self.q.task_done()

def getPageContent():
    """Scrape pages 1-9 of the FreeBuf course index.

    For each course, prints and appends 'title------money-----id' to
    result.txt, then fetches the course's chapter list via getCal().
    """
    for page in range(1, 10):
        url = "https://live.freebuf.com/index/course/?type=1&page=" + str(page)
        r = requests.get(url)
        payload = json.loads(r.text)
        courses = payload['data']['list']  # renamed: original shadowed builtin `list`
        for course in courses:
            money = course['money']
            title = course['title']
            # course['url'] holds a detail-page path; chars 19+ are taken as
            # the numeric course id -- assumes a fixed-length prefix, TODO
            # confirm against the live payload.
            course_id = course['url'][19:]  # renamed: original shadowed builtin `id`
            line = title + "------" + money + '-----' + course_id
            print(line)
            with open("result.txt", "a+", encoding="utf-8") as f1:
                f1.write(line + "\n")
            getCal(title, money, course_id)
    # NOTE: the original called f1.close() here, after the with-block. The
    # context manager already closes the file, and if no course was ever
    # written f1 was unbound, raising NameError -- so the call is removed.


def getCal(title, money, id):
    """Fetch one course's detail page and record every lesson it contains.

    Lessons come from two JSON sections: 'list' (chapters, each with a 'son'
    lesson array) and 'other' (loose lessons). Each lesson is logged to
    result.txt; lessons with a non-empty video id also get a resolved download
    link written to videoUrl.txt and urls.txt via _record_lesson().
    """
    url = "https://live.freebuf.com/course/detail/?id=" + str(id)
    r = requests.get(url)
    payload = json.loads(r.text)
    try:
        series = payload['data']['series_course']
        chapters = series['list']   # renamed: original shadowed builtin `list`
        extras = series['other']
        for chapter in chapters:
            for lesson in chapter['son']:
                _record_lesson(title, money, lesson)
        for lesson in extras:
            _record_lesson(title, money, lesson)
    except (KeyError, TypeError):
        # Narrowed from a bare `except:`: missing/None JSON keys mean the
        # course has no content yet; anything else (e.g. network errors from
        # geturl) now propagates instead of being silently swallowed.
        msg = title + "---" + "暂未发现课程"
        print(msg)
        with open("result.txt", "a+", encoding="utf-8") as f:
            f.write(msg + "\n")


def _record_lesson(title, money, lesson):
    """Log one lesson; when it has a video id, resolve and log its download URL."""
    name = lesson['name']
    video = lesson['video']
    print("----------" + name + "------" + video)
    with open("result.txt", "a+", encoding="utf-8") as f:
        f.write("----------" + name + "------" + video + "\n")
    if video != "":
        videoUrl = geturl(title, name, video)
        print("----------" + name + "------" + videoUrl)
        with open("videoUrl.txt", "a+", encoding="utf-8") as f:
            f.write(title + "------" + money + "\n")
            # Original used 12 dashes for chapter lessons and 10 for 'other'
            # lessons; normalized to 10 so the log format is consistent.
            f.write("----------" + name + "------" + videoUrl + "\n")
        with open("urls.txt", "a+", encoding="utf-8") as f:
            f.write(title + "------" + name + "------" + videoUrl + "\n")


def geturl(title, name, video):
    """Resolve a Tencent VOD video id into a browser-extension download link.

    Queries the qcloud play-info API for the source video URL, then wraps it
    in a chrome-extension download.html deep link carrying the FreeBuf referer
    and a '<title>/<name>.mp4' target filename.
    """
    api = "https://playvideo.qcloud.com/getplayinfo/v2/1251001764/" + str(video)
    resp = requests.get(api)
    payload = json.loads(resp.text)
    source_url = payload['videoInfo']['sourceVideo']['url']
    prefix = "chrome-extension://jfedfbgedapdagkghmgibemcoggfppbb/download.html?url="
    suffix = "&referer=https://live.freebuf.com/&filename=" + str(title) + "/" + str(name) + ".mp4"
    return str(prefix + source_url + suffix)

def getProcess():
    """Parse urls.txt ('title------name------url' lines) into arr_map and
    ensure an ./out/<title> directory exists for each title."""
    # Text mode with an explicit encoding replaces the original 'rb' open plus
    # manual bytes->str decode; the context manager also closes the file,
    # which the original never did.
    with open(os.path.join("./", "urls.txt"), encoding="utf-8") as txt:
        for raw in txt:
            line = raw.strip()
            # The original tested `line != ""` on a *bytes* line, which is
            # always True, so a genuinely blank line crashed on arr[1] below;
            # skip blanks properly instead.
            if not line:
                continue
            title, name, url = line.split("------")[:3]
            arr_map.append([title, name, url])
    for entry in arr_map:
        # exist_ok collapses the original exists()/makedirs() pair and is
        # race-free.
        os.makedirs("./out/" + entry[0], exist_ok=True)

def download(url):
    """Download one video URL to ./out/<title>/<name>.mp4 with a progress bar.

    Retries up to 6 attempts; on total failure removes the partial output file
    and logs the failure. title/file_name are looked up from the global
    arr_map populated by getProcess().
    """
    title = ""
    file_name = ""
    for entry in arr_map:
        if entry[2] == url:
            title = entry[0]
            file_name = entry[1]
    out_path = "./out/" + title + "/" + file_name + ".mp4"
    # Hoisted out of the retry loop: the header never changes between attempts.
    header = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Host': '1251001764.vod2.myqcloud.com',
        'Referer': 'https://live.freebuf.com/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36',
    }
    retry_times = 0
    while retry_times <= 5:
        try:
            video_content = requests.get(url, headers=header, stream=True)
            total_mb = int(video_content.headers['content-length']) / 1024 / 1024
            print(file_name + "  开始下载  共" + "{:.2f}MB".format(total_mb) + "\n")
            process_bar = tqdm(colour='blue', total=total_mb, unit='MB', desc=file_name, initial=0)
            try:
                with open(out_path, 'wb') as fh:
                    for chunk in video_content.iter_content(chunk_size=1024):
                        if chunk:
                            fh.write(chunk)
                            # Advance by the chunk size converted to MB so the
                            # bar's units match `total`. The original called
                            # update(1024) per KB chunk, overshooting the
                            # MB-denominated total by a factor of ~10^6.
                            process_bar.update(len(chunk) / 1024 / 1024)
            finally:
                process_bar.close()  # original leaked the bar on exceptions
            print("\n" + file_name + "  下载结束\n")
            break
        except Exception:
            # Best-effort retry. Narrowed from a bare `except:` so that
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            pass
        retry_times += 1
    else:
        try:
            # The original removed `file_name` (the bare lesson name) from the
            # CWD; the partial file actually lives at out_path.
            os.remove(out_path)
        except OSError:
            pass
        print("Failed to retrieve %s from %s.\n" % (url, file_name))



if __name__ == '__main__':
    # Scrape the course index and every course's lesson list.
    getPageContent()
    # Parse urls.txt into arr_map and pre-create per-title output directories.
    getProcess()
    if os.path.exists('./out'):
        print("file exist!")
    else:
        os.makedirs("out")
    start_time = time.time()
    work_queue = queue.Queue(len(arr_map))
    threads = []
    for arrs in arr_map:
        work_queue.put(arrs[2])
    # Three daemon workers drain the queue concurrently (I/O-bound downloads).
    for i in range(3):
        thread = Work(work_queue, 'Thread-%s' % (i + 1))
        thread.daemon = True  # setDaemon() is deprecated since Python 3.10
        thread.start()
        threads.append(thread)
    # BUG FIX: Queue.join() returns None, so the original
    # `if work_queue.join():` guard meant the thread joins never executed.
    # Wait for all queued work, then join each worker.
    work_queue.join()
    for t in threads:
        t.join()
    end_time = time.time()
    print("用时:%s" % (end_time - start_time))
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值