自动爬取中国大学mooc的pdf文档

由于中国大学mooc里课程的pdf文档不方便下载,因此这里敲一个能自动下载课程里所有pdf的代码:

窗体代码:

from tkinter import *
from tkinter.filedialog import askdirectory
from down_main import download


def center_window(tk: Tk, width: int, height: int):
    """Resize the window to width x height and center it on the screen.

    :param tk: the Tk root window to position
    :param width: desired window width in pixels
    :param height: desired window height in pixels
    """
    screen_w = tk.winfo_screenwidth()
    screen_h = tk.winfo_screenheight()
    # Integer division avoids feeding floats into the geometry string
    # (the original relied on '%d' truncating the float quotient).
    x = (screen_w - width) // 2
    y = (screen_h - height) // 2
    tk.geometry(f'{width}x{height}+{x}+{y}')


# Root window; `var` appears unused in this script — TODO confirm before removing.
tk = Tk()
var = IntVar()

tk.title('my window')
center_window(tk, 800, 300)
# Empty labels act as vertical spacers between rows.
Label(tk, text='').pack(anchor=CENTER)
Label(tk, text='课程链接URL').pack(anchor=CENTER)
Label(tk, text='例如: https://www.icourse163.org/learn/....#/learn/content', ).pack(anchor=CENTER)
Label(tk, text='').pack(anchor=CENTER)
# Entry widget where the user pastes the course URL.
url_entry = Entry(tk, width=100)
url_entry.pack(anchor=CENTER)
Label(tk, text='').pack(anchor=CENTER)


def select_save_path():
    """Open a directory-chooser dialog and show the chosen path on the label."""
    chosen_dir = askdirectory()
    save_path_label.config(text=chosen_dir)


# Button opens the directory chooser; the label below shows the selection
# and is later read back by begin() as the save path.
Button(tk, text="选择保存的目录", command=select_save_path).pack()
save_path_label = Label(tk, text='选择保存的目录')  # placeholder text until a directory is picked
save_path_label.pack()


def begin():
    """Read the course URL and target directory from the widgets, then download."""
    course_url = url_entry.get()
    target_dir = save_path_label.cget("text")
    download(course_url, target_dir)


Label(tk, text='').pack(anchor=CENTER)  # spacer

Button(tk, text="开始下载", command=begin).pack()

# Main event loop: blocks here until the window is closed.
mainloop()

爬虫代码:

参考博客:https://blog.csdn.net/weixin_43833642/article/details/105138838

import requests
import re
import threading
import time
import functools

# Shared HTTP session reused across all requests (keeps the connection pool).
req = requests.session()
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/80.0.3987.132 Safari/537.36',
    'content-type': 'text/plain'
}

# Module-level crawl state, filled in by set_content_id/set_id/set_pdf_url.
content_id = []  # content ids scraped from the course outline
_id = []  # matching section ids, parallel to content_id
pdf_url = []  # resolved .pdf download URLs
threads = []  # download worker threads
save_path = ''  # target directory, set by download()


def request_mooc_get(url):
    """GET *url* through the shared session so cookies persist between calls."""
    return req.get(url)


def request_mooc_post(url, data):
    """POST *data* to *url* with the shared session, headers and cookies."""
    return req.post(url=url, data=data, headers=headers, cookies=cookies)


def get_timestamp():
    """Return the current time as a 13-digit millisecond timestamp (int)."""
    # round() with one argument already returns an int.
    return round(time.time() * 1000)


def get_session_id(cookie: str):
    """Extract the NTESSTUDYSI session id from a raw cookie string.

    Fix: the original greedy pattern (a whitespace-free wildcard followed
    by ';') matched up to the LAST semicolon, so on a multi-pair string
    like ``NTESSTUDYSI=abc;other=def;`` it captured ``abc;other=def``.
    ``[^;]+`` stops at the first semicolon.

    :param cookie: cookie header string of ``key=value;`` pairs
    :return: the NTESSTUDYSI value
    :raises Exception: if no NTESSTUDYSI entry is present
    """
    result = re.findall(r'NTESSTUDYSI=([^;]+);', cookie)
    if not result:
        raise Exception('cookie error!')
    return result[0]


# url中tid=1206878228
def get_course_id():
    """Return the stored course term id (tid) as a string."""
    return f'{_tid}'


def set_course_id(tid):
    """Remember the course term id (tid) in the module-level ``_tid``."""
    global _tid
    _tid = tid


def get_cookie():
    """Return the raw cookie string previously stored via set_cookie()."""
    return cookies['cookie']


def set_cookie(cookie):
    """Store the raw cookie string in the module-level ``cookies`` dict."""
    global cookies
    cookies = {'cookie': cookie}


def set_pdf_url():
    """Resolve the download URL of every PDF attachment in the course.

    For each (content_id, section_id) pair a DWR call is made; responses
    containing a ``textOrigUrl`` ending in ``.pdf`` are appended to the
    module-level ``pdf_url`` list.  Units without a PDF are skipped.

    Fixes over the original: the ``urllib.parse`` import is hoisted out
    of the loop, and ``zip`` replaces the manual index plus
    IndexError-based break (zip stops at the shorter list).
    """
    from urllib.parse import unquote

    url = 'https://www.icourse163.org/dwr/call/plaincall/CourseBean.getLessonUnitLearnVo.dwr'
    print(len(content_id))
    for cid, sid in zip(content_id, _id):
        print(cid + " " + sid)
        data = {
            'callCount': 1,
            'scriptSessionId': '${scriptSessionId}190',
            'httpSessionId': get_session_id(cookies['cookie']),
            'c0-scriptName': 'CourseBean',
            'c0-methodName': 'getLessonUnitLearnVo',
            'c0-id': 0,
            'c0-param0': 'number:' + cid,  # content id of the unit
            'c0-param1': 'number:3',  # presumably selects the document resource type — unverified
            'c0-param2': 'number:0',
            'c0-param3': 'number:' + sid,  # section id, e.g. 1245454394
            'batchId': get_timestamp()
        }
        res = req.post(url=url, cookies=cookies, data=data, headers=headers)
        r = re.findall(r'textOrigUrl:"(.*)\.pdf"', res.text)
        if not r:
            continue  # this unit carries no PDF attachment
        pdf_url.append(r[0] + '.pdf')
        print('get ' + get_file_name(unquote(r[0])))
        # pdf_url.append(r[0])


# 1211971774 id
#
def get_course_info():
    """Fetch the course outline via the DWR getLastLearnedMocTermDto call.

    :return: the raw ``requests`` response; its text embeds lesson ids
        that set_content_id()/set_id() later scrape.
    """
    endpoint = 'https://www.icourse163.org/dwr/call/plaincall/CourseBean.getLastLearnedMocTermDto.dwr'
    payload = {
        "callCount": "1",
        "scriptSessionId": "${scriptSessionId}190",
        "httpSessionId": get_session_id(cookies['cookie']),
        "c0-scriptName": "CourseBean",
        "c0-methodName": "getLastLearnedMocTermDto",
        "c0-id": "0",
        "c0-param0": "number:" + get_course_id(),
        "batchId": get_timestamp()
    }
    return request_mooc_post(url=endpoint, data=payload)


def set_content_id(content: str):
    """Scrape every ``contentId=NNN;`` occurrence and store the list globally."""
    global content_id
    content_id = re.findall(r'contentId=(\d+);', content)
    print(content_id)


def set_id(context: str):
    """Scrape section ids (``id=NNN;s...jsonContent``) and store the list globally."""
    global _id
    _id = re.findall(r'id=(\d+);s.*jsonContent', context)
    print(_id)


def _time(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        start = time.time()
        func(*args, **kwargs)
        end = time.time()
        print('Took ' + str(end - start))

    return wrapper


@_time
def start_download():
    """Download every collected PDF concurrently, one thread per URL.

    Fix: joins only the threads started by THIS call.  The original
    joined the module-level ``threads`` list, which re-joined stale
    threads if the function ran more than once in a session.
    """
    started = []
    for url in pdf_url:
        t = threading.Thread(target=start_write, args=(url,))
        t.start()
        threads.append(t)  # kept so existing readers of `threads` still work
        started.append(t)

    # Wait for all downloads spawned above to finish.
    for t in started:
        t.join()

    print("ok!")


def start_write(url):
    """Fetch one PDF from *url* and write it into the chosen save directory."""
    from urllib.parse import unquote

    response = request_mooc_get(url)
    # Unquote so the file name extracted from the URL is human-readable.
    file_name = get_file_name(unquote(url))
    print(file_name + ' downloading...')
    target = globals()['save_path'] + '/' + file_name
    with open(target, 'wb') as out:
        for chunk in response.iter_content(1024):
            out.write(chunk)


def get_file_name(url):
    """Return the file name embedded after ``&download=`` in *url*.

    :param url: an (unquoted) attachment URL
    :return: everything following the first ``&download=`` marker
    :raises ValueError: if the URL carries no ``&download=`` parameter
        (the original raised an opaque IndexError here)
    """
    result = re.findall(r'&download=(.*)', url)
    if not result:
        raise ValueError('no "&download=" parameter in url: ' + url)
    return result[0]


def main(tid, cookie):
    """Run the full pipeline: configure ids/cookie, scrape PDF urls, download.

    :param tid: course term id extracted from the course URL
    :param cookie: raw cookie string obtained from the initial page visit
    """
    set_course_id(tid)
    set_cookie(cookie)
    outline = get_course_info().text
    set_content_id(outline)
    set_id(outline)
    set_pdf_url()
    print(len(pdf_url))
    start_download()


def get_tid(url: str):
    """Extract the numeric term id (``tid``) from a course URL.

    Fix: matches digits only, so a URL such as
    ``...?tid=123&foo=1#/learn/content`` no longer drags trailing query
    parameters into the id, and a missing parameter raises a clear
    ValueError instead of the original's bare IndexError.

    :param url: course URL containing ``tid=NNN``
    :return: the tid as a string
    :raises ValueError: if no ``tid=`` parameter is found
    """
    match = re.search(r'tid=(\d+)', url)
    if match is None:
        raise ValueError('no "tid=" parameter in url: ' + url)
    return match.group(1)


def download(url, path):
    """Entry point: visit *url* to obtain session cookies, then start the crawl.

    :param url: course page URL containing a ``tid=`` parameter
    :param path: directory where the downloaded PDFs will be saved
    """
    globals()["save_path"] = path
    ua = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
          'Chrome/80.0.3987.132 Safari/537.36')
    response = requests.get(url, headers={
        'user-agent': ua,
        'Referer': "http://www.baidu.com"
    })
    # Flatten the response cookies into a single "k=v;" string.
    jar = requests.utils.dict_from_cookiejar(response.cookies)
    cookie = ''.join(key + "=" + value + ";" for key, value in jar.items())
    main(get_tid(url=url), cookie)

使用方法:

1. 直接运行窗体代码

2. 或者在自己的代码中直接调用下载函数:

download(url, path) 
# url是课程链接,如:https://www.icourse163.org/learn/....#/learn/content
# path是保存的路径

 

评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

呆萌的代Ma

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值