Building on the previous post, I modified the code to scrape exercise knowledge points from an education website. The site has a strong bank of exercises, but its pages are more complex than before, and there is a hidden pitfall waiting for you. (Math is used as the example here; other subjects work the same way, just swap the URL.)
URL: https://tiku.21cnjy.com/tiku.php?mod=quest&channel=3&cid=1373&xd=3
Main flow: crawl by the knowledge points expanded in the left sidebar; the content scraped is what appears under "查看解析" (view solution).
The pitfall is this: every sentence in the scraped text is followed by a junk string of varying length and content. Note that this is not an encoding problem. I assumed it was at first, but it turned out to be less simple: these strings are deliberately injected interference ("jammer" text) and must be removed before extraction.
My solution is to locate the jammer elements by their CSS classes and replace the text inside them with an empty string.
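The idea can be sketched with a minimal example. The class names (`this_jammer`, `hidejammersa`, `jammerd42`) are the ones found in the site's markup; the HTML snippet itself is made up for illustration:

```python
from bs4 import BeautifulSoup

# Hypothetical snippet mimicking the site's markup: jammer spans are
# interleaved with the real answer text.
html = ('<div class="answer_detail">answer'
        '<span class="this_jammer">junk1</span> text'
        '<span class="jammerd42">junk2</span></div>')

soup = BeautifulSoup(html, 'html.parser')

# Blank out the text inside every jammer element before extracting.
for tag in soup.find_all(class_=["this_jammer", "hidejammersa", "jammerd42"]):
    if tag.string:
        tag.string.replace_with("")

clean = soup.find('div', class_='answer_detail').text
print(clean)  # -> "answer text"
```

Because `replace_with` mutates the parse tree in place, a later call to `.text` on the parent `div` no longer sees the junk strings.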
The full code follows (kept for my own record; not recommended as a reference):
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup

def contents_save(file_path, content):
    """
    :param file_path: path of the file the scraped text is appended to
    :param content: scraped text content
    :return: None
    """
    with open(file_path, 'a', encoding="utf-8", errors='ignore') as f:
        try:
            f.write(content)
        except TypeError:
            # content may not be a string if extraction failed upstream
            pass
        f.write('\n')
def get_html(url_list, file_path, headers):
    for url in url_list:
        req = requests.get(url, headers=headers)
        content = req.content.decode('utf-8', 'ignore')
        bf = BeautifulSoup(content, 'html.parser')
        # Remove the anti-scraping "jammer" elements before extracting text
        del_text = bf.find_all(class_=["this_jammer", "hidejammersa", "jammerd42"])
        for i in del_text:
            try:
                i.string.replace_with("")
            except AttributeError:
                # element has no single string child; nothing to blank out
                pass
        texts = bf.find_all('div', class_='answer_detail')
        if not texts:
            continue
        # Strip non-breaking spaces, regular spaces, and newlines
        text = texts[0].text.replace('\xa0', '')
        text = text.replace(" ", "")
        text = text.replace("\n", '')
        print(text)
        contents_save(file_path, text)
def get_url(target_url, server, headers):
    req = requests.get(target_url, headers=headers)
    # Page encoding is utf-8 (checked via requests.utils.get_encodings_from_content)
    bf = BeautifulSoup(req.text, 'html.parser')
    div = bf.find_all('div', class_='questions_col')
    a = div[0].find_all('a')
    check_parsing_url = []
    for each in a:
        if each.string == "查看解析":  # link text: "view solution"
            full_url = server + each.get('href')
            check_parsing_url.append(full_url)
    print(check_parsing_url)
    return check_parsing_url
def change_page(target_url, server, headers):
    req = requests.get(target_url, headers=headers)
    bf = BeautifulSoup(req.text, 'html.parser')
    div = bf.find_all('div', class_='fenye')  # pagination bar
    a = div[0].find_all('a')
    full_url = None
    for each in a:
        if each.string == "下一页":  # link text: "next page"
            full_url = server + each.get('href')
            print(full_url)
    return full_url
def get_category(target_url, server, headers):
    req = requests.get(target_url, headers=headers)
    bf = BeautifulSoup(req.text, 'html.parser')
    div = bf.find_all('div', class_='shiti_catagory frame')
    a = div[0].find_all('a')
    category = []
    for each in a:
        full_url = server + each.get('href')
        category.append(full_url)
    print(category)
    return category
if __name__ == "__main__":
    main_url = "https://tiku.21cnjy.com/tiku.php?mod=quest&channel=3&cid=1373&xd=3"
    server = "https://tiku.21cnjy.com/"
    save_dir = "D:/work/data"
    subject_file = "senior_Math.txt"
    file_path = save_dir + '/' + subject_file
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36',
        'Accept-Encoding': 'gzip'}
    categorys = get_category(main_url, server, headers)
    for category_url in categorys:
        counting = 0
        target_url = category_url
        while counting < 100:  # cap at 100 pages per category
            check_parsing_url = get_url(target_url, server, headers)
            get_html(check_parsing_url, file_path, headers)
            target_url = change_page(target_url, server, headers)
            if target_url is None:  # no "next page" link: category exhausted
                break
            counting += 1