[爬虫]一个关于课堂派课件的爬虫

最新推荐文章于 2025-01-29 17:00:00 发布

梅森姑娘

最新推荐文章于 2025-01-29 17:00:00 发布

阅读量2.7k

点赞数 8

分类专栏：爬虫 文章标签：python

本文链接：https://blog.csdn.net/JMU_Ma/article/details/106888860

版权

爬虫专栏收录该内容

1 篇文章

订阅专栏

文章目录

前言

这篇博文主要是记录一下自己在这次爬虫中的心得体会，以及学到的一些新知识。个人认为主要有以下几点：

网页频繁刷新导致无法爬取到想要的数据
在网页中如何实现登录等（这里本打算要写保持登录状态的，后来觉得不必，因为很多网站已经做到及时刷新等技术，保持登录状态只适用于一些比较传统的技术，遂仅写到登录）
抓不到想要的数据怎么办

大致分为以上的几个新的点。

如何实现登录功能

因为这次主要是做课堂派的课件爬虫，因此以课堂派的机制作为主要的讨论对象。以下的素材均来自本次课堂派的爬虫。

在这里插入图片描述

登录帐号后可以发现，课堂派的登录是一个post请求，请求参数有email、password和remember三个，分别对应帐号、密码、是否记住帐号。我们只需关注前两者即可。
在这里插入图片描述

遂我们写下以下的代码。并做全文的注释，使得更容易理解。

# Request headers sent with the crawler's requests. A single User-Agent is
# enough here (per the author: the site's anti-crawling checks are not strict).
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36',
}
# One shared requests.Session, so requests made after login() reuse the same
# session and stay logged in.
login_session = requests.Session()
# Ask for the credentials at start-up so any account can be used; getpass
# reads the password without echoing it.
account = input('Please input your telephone number:')
password = getpass.getpass("Please input your password:")
# ------------------
def login():
    """Log in to ketangpai.com with the credentials entered at start-up.

    Posts the module-level ``account`` and ``password`` through the shared
    ``login_session`` so that later requests made with that session are
    authenticated.

    Raises:
        SystemExit: with a non-zero exit status (and the HTTP status in the
            message) when the login endpoint does not answer with HTTP 200.
    """
    login_data = {
        'email': account,
        'password': password,
        'remember': 0,  # "remember this account" option of the login form
    }
    # Certificate verification is left at the requests default (enabled):
    # the password must not be sent over an unverified TLS connection.
    response = login_session.post(
        'https://www.ketangpai.com/UserApi/login',
        data=login_data,
        headers=headers,
    )
    if response.status_code != 200:
        # Only the HTTP status is checked here; anything but 200 (e.g. the
        # API being unavailable) aborts the script. SystemExit with a message
        # prints it to stderr and exits with status 1, so the failure is
        # visible to the shell instead of being reported as success.
        raise SystemExit(
            f'Login request failed with HTTP status {response.status_code}.'
        )

获取列表等

这个地方，我卡了很久（笑哭），因为网页一直无法提供给我课程id。刚开始在考虑强行构造，后台发现我泰（bi～～～～！）。后来无意中把浏览器的缓存删了重新进入，发现居然能读取出来了。原来这里的课程id是存在了缓存中，所以之前一直无法获得。这也是我在这次的爬虫中学到的“清理缓存大法”。

course_id_list = []  # course ids collected by get_course_id()


def get_course_id():
    """Collect the ids of the logged-in user's courses into ``course_id_list``.

    Requests the main page through the shared ``login_session`` (printing a
    confirmation when it answers with HTTP 200), then reads the course list
    API and appends the id of every pinned course (``toplists``) followed by
    every regular course (``lists``).

    Returns:
        list: the module-level ``course_id_list`` after the ids were appended.

    Raises:
        KeyError: if the API response lacks ``toplists`` or ``lists``, or an
            entry has no ``id``.
    """
    response = login_session.post(
        'https://www.ketangpai.com/Main/index.html',
        headers=headers,
        allow_redirects=False,
    )
    if response.status_code == 200:
        print('登录成功')
    company_detail = login_session.get('https://www.ketangpai.com/CourseApi/lists')
    # Parse the response body once and reuse it for both sections.
    course_data = json.loads(company_detail.content)
    # Pinned and regular courses arrive in separate lists; each entry's own
    # 'id' is taken (pinned courses first, then regular ones).
    for section in ('toplists', 'lists'):
        course_id_list.extend(course['id'] for course in course_data[section])
    return course_id_list

获取到课程id后面就很好进行一系列操作了。

获取课件列表和信息

因为已经获取到了课程id因此后面的内容就很好开展了。

def into_course_web():
    """Download the courseware files of every course in ``course_id_list``.

    For each course id the course page is fetched to read the course name,
    then the courseware list API is queried and every listed file is
    downloaded with ``wget`` into a directory named after the course, created
    under the current working directory when needed.

    Courses whose name cannot be read, or that have no courseware, are skipped
    with a message. A failed download is reported and the remaining files are
    still attempted.
    """
    for course_id in course_id_list:
        # Fetch the course page with the logged-in session to read its name.
        the_course = login_session.get(
            f'https://www.ketangpai.com/Courseware/index/courseid/{course_id}.html'
        )
        # Name the parser explicitly so the result does not depend on which
        # optional parsers are installed.
        soup = BeautifulSoup(the_course.content, 'html.parser')
        title_box = soup.find('div', attrs={'class': 'topm cl'})
        title_tag = title_box.find('h1') if title_box is not None else None
        if title_tag is None or title_tag.string is None:
            print(f'Could not read the name of course {course_id}, skipping it.')
            continue
        the_course_name = str(title_tag.string)

        # Fetch the courseware list of this course.
        listing = login_session.get(
            f'https://www.ketangpai.com/CoursewareApi/listAll?courseid={course_id}',
            headers=headers,
            allow_redirects=False,
        )
        # The body is always bytes; decode and parse it as JSON once. The
        # earlier type comparison against the string 'bytes' was never true,
        # and remote data must not be passed to eval().
        get_the_files_info = json.loads(listing.content.decode())['lists']
        if not get_the_files_info:
            print(f'{the_course_name}没有课件则跳出')
            continue

        # One directory per course, created once instead of once per file.
        course_dir = os.path.join(os.getcwd(), the_course_name)
        os.makedirs(course_dir, exist_ok=True)
        for the_files_info in get_the_files_info:
            the_files_name = the_files_info['name']  # used as the local file name
            file_url = the_files_info['url']
            print(file_url)
            try:
                wget.download(file_url, out=os.path.join(course_dir, the_files_name))
            except Exception as err:
                # Best effort: per the original author's note, folder entries
                # are served through several redirects as a zip and are not
                # supported. Report the failure and move on to the next file.
                print(f'Failed to download {the_files_name}: {err}')

完整代码

#!/usr/bin/env python3
import getpass
import json
import requests
import os
from bs4 import BeautifulSoup
import wget
from tqdm import tqdm

# Request headers sent with the crawler's requests (a single browser-like
# User-Agent).
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36',
}
# One shared requests.Session used by login() and by every later request.
login_session = requests.Session()
# Ask for the credentials at start-up; getpass reads the password without
# echoing it.
account = input('Please input your telephone number:')
password = getpass.getpass("Please input your password:")

# ------------------
def login():
    """Log in to ketangpai.com with the credentials entered at start-up.

    Posts the module-level ``account`` and ``password`` through the shared
    ``login_session`` so that later requests made with that session are
    authenticated.

    Raises:
        SystemExit: with a non-zero exit status (and the HTTP status in the
            message) when the login endpoint does not answer with HTTP 200.
    """
    login_data = {
        'email': account,
        'password': password,
        'remember': 0,
    }
    # Certificate verification is left at the requests default (enabled):
    # the password must not be sent over an unverified TLS connection.
    response = login_session.post(
        'https://www.ketangpai.com/UserApi/login',
        data=login_data,
        headers=headers,
    )
    if response.status_code != 200:
        # Only the HTTP status is checked here. SystemExit with a message
        # prints it to stderr and exits with status 1, so a failed login is
        # not reported to the shell as success.
        raise SystemExit(
            f'Login request failed with HTTP status {response.status_code}.'
        )

# -------------------
course_id_list = []  # course ids collected by get_course_id()


def get_course_id():
    """Collect the ids of the logged-in user's courses into ``course_id_list``.

    Requests the main page through the shared ``login_session`` (printing a
    confirmation when it answers with HTTP 200), then reads the course list
    API and appends the id of every pinned course (``toplists``) followed by
    every regular course (``lists``).

    Returns:
        list: the module-level ``course_id_list`` after the ids were appended.
            Existing callers that ignore the return value are unaffected.

    Raises:
        KeyError: if the API response lacks ``toplists`` or ``lists``, or an
            entry has no ``id``.
    """
    response = login_session.post(
        'https://www.ketangpai.com/Main/index.html',
        headers=headers,
        allow_redirects=False,
    )
    if response.status_code == 200:
        print('登录成功')
    company_detail = login_session.get('https://www.ketangpai.com/CourseApi/lists')
    # Parse the response body once and reuse it for both sections.
    course_data = json.loads(company_detail.content)
    # Pinned and regular courses arrive in separate lists; each entry's own
    # 'id' is taken (pinned courses first, then regular ones).
    for section in ('toplists', 'lists'):
        course_id_list.extend(course['id'] for course in course_data[section])
    return course_id_list

# --------------------
def into_course_web():
    """Download the courseware files of every course in ``course_id_list``.

    For each course id the course page is fetched to read the course name,
    then the courseware list API is queried and every listed file is
    downloaded with ``wget`` into a directory named after the course, created
    under the current working directory when needed.

    Courses whose name cannot be read, or that have no courseware, are skipped
    with a message. A failed download is reported and the remaining files are
    still attempted.
    """
    for course_id in course_id_list:
        # Fetch the course page with the logged-in session to read its name.
        the_course = login_session.get(
            f'https://www.ketangpai.com/Courseware/index/courseid/{course_id}.html'
        )
        # Name the parser explicitly so the result does not depend on which
        # optional parsers are installed.
        soup = BeautifulSoup(the_course.content, 'html.parser')
        title_box = soup.find('div', attrs={'class': 'topm cl'})
        title_tag = title_box.find('h1') if title_box is not None else None
        if title_tag is None or title_tag.string is None:
            print(f'Could not read the name of course {course_id}, skipping it.')
            continue
        the_course_name = str(title_tag.string)

        # Fetch the courseware list of this course.
        listing = login_session.get(
            f'https://www.ketangpai.com/CoursewareApi/listAll?courseid={course_id}',
            headers=headers,
            allow_redirects=False,
        )
        # The body is always bytes; decode and parse it as JSON once. The
        # earlier type comparison against the string 'bytes' was never true,
        # and remote data must not be passed to eval().
        get_the_files_info = json.loads(listing.content.decode())['lists']
        if not get_the_files_info:
            print(f'{the_course_name}没有课件则跳出')
            continue

        # One directory per course, created once instead of once per file.
        course_dir = os.path.join(os.getcwd(), the_course_name)
        os.makedirs(course_dir, exist_ok=True)
        for the_files_info in get_the_files_info:
            the_files_name = the_files_info['name']  # used as the local file name
            file_url = the_files_info['url']
            print(file_url)
            try:
                wget.download(file_url, out=os.path.join(course_dir, the_files_name))
            except Exception as err:
                # Best effort: a download that fails is reported and skipped
                # so the remaining files are still fetched.
                print(f'Failed to download {the_files_name}: {err}')

# -------------------
def main():
    """Run the crawler: log in, collect the course ids, download the courseware."""
    login()
    get_course_id()
    into_course_web()


if __name__ == "__main__":
    main()

后记

如果觉得不错就点个赞吧。