Scraping a Middle and High School Exercise Question Bank

Building on the previous post, I modified the code to scrape exercise knowledge points from an education website. The site's question bank is extensive, but its pages are more complex than before, and there is a pit already dug and waiting for you to fall into. (Math is used as the example; the other subjects work the same way, just swap the url.)

URL: https://tiku.21cnjy.com/tiku.php?mod=quest&channel=3&cid=1373&xd=3

Main flow: crawl along the knowledge points expanded in the left sidebar; what gets scraped is the content behind each question's 查看解析 (view analysis) link.

The pit is this: every sentence in the scraped text is followed by a junk string of varying length and content. Note that this is not an encoding problem. I assumed so at first too, but it turned out not to be that simple: these strings are deliberately injected interference and must be removed before the text is extracted.


My solution is to locate the jammer elements by their class and replace their text with an empty string, as sketched below.
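
A minimal, standalone sketch of the idea. The markup here is hand-written to mimic the pattern, not the site's actual HTML; only the three class names (this_jammer, hidejammersa, jammerd42) come from the real pages:

from bs4 import BeautifulSoup

# Hypothetical markup imitating the jammer pattern: junk spans injected mid-sentence
html = '''<div class="answer_detail">
解:<span class="this_jammer">xK9fQ</span>由题意得<span class="jammerd42">Zp3</span>x=2
</div>'''

soup = BeautifulSoup(html, 'html.parser')
# Remove every element whose class marks it as interference;
# blanking its text with replace_with("") works just as well
for tag in soup.find_all(class_=["this_jammer", "hidejammersa", "jammerd42"]):
    tag.decompose()

print(soup.get_text(strip=True))  # 解:由题意得x=2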

The full code follows. (Kept for the record only; not recommended as a reference.)

# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
def contents_save(file_path, content):
    """
    :param file_path: path of the file the scraped text is appended to
    :param content: scraped text content
    :return: None
    """
    with open(file_path, 'a', encoding="utf-8", errors='ignore') as f:
        try:
            f.write(content)
        except UnicodeEncodeError:
            pass  # skip anything that still cannot be encoded
        f.write('\n')


def get_html(url_list, file_path, headers):
    for url in url_list:
        req = requests.get(url, headers=headers)
        content = req.content.decode('utf-8', 'ignore')
        bf = BeautifulSoup(content, 'html.parser')
        # Blank out the anti-scraping jammer elements before extracting text
        del_text = bf.find_all(class_=["this_jammer", "hidejammersa", "jammerd42"])
        for tag in del_text:
            if tag.string:
                tag.string.replace_with("")
        texts = bf.find_all('div', class_='answer_detail')
        try:
            texts = texts[0].text.replace('\xa0', '')
        except IndexError:
            continue  # no answer_detail block on this page
        texts = texts.replace(" ", "").replace("\n", '')
        print(texts)
        contents_save(file_path, texts)

def get_url(target_url, server, headers):
    req = requests.get(target_url, headers=headers)
    # The pages report utf-8, so req.text decodes correctly
    bf = BeautifulSoup(req.text, 'html.parser')
    div = bf.find_all('div', class_='questions_col')
    a = div[0].find_all('a')
    cheak_parsing_url = []
    # Keep only the links behind the "查看解析" (view analysis) anchors
    for each in a:
        if each.string == "查看解析":
            full_url = server + each.get('href')
            cheak_parsing_url.append(full_url)
    print(cheak_parsing_url)
    return cheak_parsing_url

def change_page(target_url, server, headers):
    req = requests.get(target_url, headers=headers)
    bf = BeautifulSoup(req.text, 'html.parser')
    div = bf.find_all('div', class_='fenye')
    a = div[0].find_all('a')
    full_url = None
    # Follow the "下一页" (next page) link; None signals the last page
    for each in a:
        if each.string == "下一页":
            full_url = server + each.get('href')
            print(full_url)
    return full_url

def get_category(target_url, server, headers):
    req = requests.get(target_url, headers=headers)
    bf = BeautifulSoup(req.text, 'html.parser')
    div = bf.find_all('div', class_='shiti_catagory frame')
    a = div[0].find_all('a')
    category = []
    # One link per knowledge point in the left-hand category tree
    for each in a:
        full_url = server + each.get('href')
        category.append(full_url)
    print(category)
    return category

if __name__ == "__main__":
    main_url = "https://tiku.21cnjy.com/tiku.php?mod=quest&channel=3&cid=1373&xd=3"
    server = "https://tiku.21cnjy.com/"
    save_dir = "D:/work/data"
    subject_file = "senior_Math.txt"
    file_path = save_dir + '/' + subject_file
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36',
        'Accept-Encoding': 'gzip'}
    categorys = get_category(main_url, server, headers)
    for category_url in categorys:
        counting = 0
        target_url = category_url
        # Walk at most 100 pages per category; stop early when there is no next page
        while counting < 100:
            cheak_parsing_url = get_url(target_url, server, headers)
            get_html(cheak_parsing_url, file_path, headers)
            target_url = change_page(target_url, server, headers)
            if target_url is None:
                break
            counting += 1
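
One practical note if you run it: the script appends to D:/work/data/senior_Math.txt but never creates the directory, so open() raises FileNotFoundError when the folder is missing. A small guard (my addition, not part of the original script) avoids that:

import os

os.makedirs("D:/work/data", exist_ok=True)  # create the save directory if it is missing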