# 真的是闲的慌,自己作业都没写
# -*- coding: utf-8 -*-
import scrapy
import re
from bs4 import BeautifulSoup
import os
class GetalllinkSpider(scrapy.Spider):
    """Crawl zt.gxtv.cn course pages and record video download links.

    Crawl flow: ``parse`` (subject index) -> ``parsesecond`` (course list)
    -> ``parsethird`` (lesson list, paginated) -> ``parseinside`` (video page).

    Output: one ``.txt`` file per lesson under ``./高中``, ``./初中`` or
    ``./小学``, each line being ``<video title>:<mp4 url>``.
    """

    name = 'getalllink'
    # allowed_domains = ['zt.gxtv.cn']
    start_urls = ['https://zt.gxtv.cn/zt/default.html']

    # Base used to absolutize relative links. The original kept this in a
    # mutable module-level global assigned inside parse(); a class attribute
    # gives the same value to every callback without `global`.
    domain = 'http://zt.gxtv.cn'

    # (css class of the section div, output directory, number of <h3> entries)
    # 高中 = senior high, 初中 = junior high, 小学 = primary school.
    SECTIONS = (
        ('gz', './高中/', 3),
        ('cz', './初中/', 3),
        ('xx', './小学/', 6),
    )

    def _section_links(self, response, css_class, count):
        """Yield absolute hrefs of the first *count* <h3><a> entries
        inside ``div.<css_class>`` on the index page."""
        html = ''.join(response.css('div.%s *' % css_class).extract())
        soup = BeautifulSoup(html, 'html.parser')
        # Slice instead of indexed loop: no IndexError when the page
        # exposes fewer entries than expected.
        for h3 in soup.find_all('h3')[:count]:
            yield self.domain + h3.a.get('href')

    def parse(self, response):
        """Entry point: fan out one request per course in every subject
        section, tagging each with its output directory via meta['grade']."""
        for css_class, grade_dir, count in self.SECTIONS:
            # makedirs(exist_ok=True) is race-free, unlike exists()+mkdir().
            os.makedirs(grade_dir, exist_ok=True)
            for link in self._section_links(response, css_class, count):
                yield scrapy.Request(link, callback=self.parsesecond,
                                     meta={'grade': grade_dir})

    def parsesecond(self, response):
        """Course list page: follow every lesson-list link (``/zt/sj...html``)
        found in the page's anchors."""
        anchors = ''.join(response.css('a').extract())
        # Renamed from `list` in the original — it shadowed the builtin.
        for href in re.findall('/zt/sj.*?html', anchors):
            yield scrapy.Request(self.domain + href, callback=self.parsethird,
                                 meta={'grade': response.meta['grade'],
                                       'time': 1})

    def parsethird(self, response):
        """Lesson list page: create the per-lesson output file, request each
        video page, and follow pagination links after the 'current' marker."""
        # time: 1 = reached from a course list, 2 = reached via pagination.
        print(response.meta['time'])
        ftitle = response.css('a#ftitle::text').extract_first()
        title = response.css('span#title::text').extract_first()
        out_path = response.meta['grade'] + ftitle + ''.join(title) + '.txt'
        # Touch the file once so parseinside can append; utf-8 keeps Chinese
        # titles intact regardless of platform default encoding.
        if not os.path.exists(out_path):
            with open(out_path, 'a', encoding='utf-8'):
                pass
        links_html = ''.join(response.css('a[target=_blank]').extract())
        soup = BeautifulSoup(links_html, 'html.parser')
        for anchor in soup.find_all('a'):
            yield scrapy.Request(self.domain + anchor['href'],
                                 callback=self.parseinside,
                                 meta={'title': anchor['title'],
                                       'ftitle': out_path})
        # Pagination: every element after the one marked 'current' holds a
        # link to a later page (scrapy's dupefilter absorbs repeats).
        past_current = False
        for fragment in response.css('p#page_control_bar *').extract():
            if re.search('current', fragment) is not None:
                past_current = True
                continue
            if past_current:
                matches = re.findall('/zt/sj.*?html', str(fragment))
                if matches:  # guard: original indexed [0] unconditionally
                    yield scrapy.Request(self.domain + matches[0],
                                         callback=self.parsethird,
                                         meta={'grade': response.meta['grade'],
                                               'time': 2})

    def parseinside(self, response):
        """Video page: extract the mp4 CDN url and append ``title:url``
        to the lesson's output file."""
        out_path = response.meta['ftitle']
        # Regex matching either CDN host that serves the mp4 files.
        playhost = 'http://video.cdn.liangtv.cn.*mp4|https://videocdn.liangtv.cn.*mp4'
        # NOTE(review): the original also read response.meta['title'] here but
        # immediately overwrote it with the css value below; the dead read
        # was dropped.
        title = response.css('h3#title::text').extract_first()
        playlink = re.search(playhost, response.text)
        if playlink is not None:
            # `with` closes the file even if write() raises; the original's
            # explicit f.close() inside the with-block was redundant.
            with open(out_path, 'a', encoding='utf-8') as f:
                f.write(title + ':' + str(playlink.group(0)) + '\n')
# 爬是爬到了,就是后期还要改成xlsx以便排序
# 本文需要的库:scrapy,BeautifulSoup4