Crawling every URL posted on the Guangxi "Air Classroom" (广西空中课堂) site since it launched (senior high, junior high, and primary school)

I honestly had way too much free time on my hands; I hadn't even finished my own homework.

# -*- coding: utf-8 -*-
import scrapy
import re
from bs4 import BeautifulSoup
import os


class GetalllinkSpider(scrapy.Spider):
    name = 'getalllink'
    # allowed_domains = ['zt.gxtv.cn']
    start_urls = ['https://zt.gxtv.cn/zt/default.html']

    def parse(self, response):
        # keep the domain prefix in a global so the later callbacks can build absolute URLs with it
        global domain
        domain = 'http://zt.gxtv.cn'
        gz = response.css('div.gz *').extract()             # senior high section
        cz = response.css('div.cz *').extract()             # junior high section
        xx = response.css('div.xx *').extract()             # primary school section
        gz = ''.join(gz)
        cz = ''.join(cz)
        xx = ''.join(xx)
        # print(type(gz))
        gzsoup = BeautifulSoup(gz, 'html.parser')
        czsoup = BeautifulSoup(cz, 'html.parser')
        xxsoup = BeautifulSoup(xx, 'html.parser')
        gzh3 = gzsoup.find_all('h3')
        czh3 = czsoup.find_all('h3')
        xxh3 = xxsoup.find_all('h3')
        # senior-high block: create the output folder once, then queue the first three links
        if not os.path.exists('./高中'):
            os.mkdir('./高中')
        for i in range(0, 3):
            link = domain + gzh3[i].a.get('href')
            yield scrapy.Request(link, callback=self.parsesecond, meta={'grade': './高中/'})

        # junior-high block: first three links
        if not os.path.exists('./初中'):
            os.mkdir('./初中')
        for i in range(0, 3):
            link = domain + czh3[i].a.get('href')
            yield scrapy.Request(link, callback=self.parsesecond, meta={'grade': './初中/'})

        # primary-school block: first six links
        if not os.path.exists('./小学'):
            os.mkdir('./小学')
        for i in range(0, 6):
            link = domain + xxh3[i].a.get('href')
            yield scrapy.Request(link, callback=self.parsesecond, meta={'grade': './小学/'})

    def parsesecond(self, response):
        a = response.css('a').extract()
        a = ''.join(a)
        links = re.findall('/zt/sj.*?html', a)              # lesson-list pages look like /zt/sj...html
        for i in links:
            yield scrapy.Request(domain + i, callback=self.parsethird, meta={'grade': response.meta['grade'], 'time': 1})

    def parsethird(self, response):
        print(response.meta['time'])                        # debug: 1 = reached from the grade page, 2 = reached via pagination
        ftitle = response.css('a#ftitle::text').extract_first()
        title = response.css('span#title::text').extract_first()
        title = (ftitle or '') + (title or '') + '.txt'
        alist = response.css('a[target=_blank]').extract()
        alist = ''.join(alist)
        linksoup = BeautifulSoup(alist, 'html.parser')
        # make sure the per-lesson output file exists before the detail pages start appending to it
        if not os.path.exists(response.meta['grade'] + title):
            with open(response.meta['grade'] + title, 'a', encoding='utf-8'):
                pass

        for i in linksoup.find_all('a'):
            videotitle = i['title']
            url = domain + i['href']
            yield scrapy.Request(url, callback=self.parseinside,
                                 meta={'title': videotitle, 'ftitle': response.meta['grade'] + title})
        # next page: follow the first page link that appears after the "current" marker in the pager
        a = response.css('p#page_control_bar *').extract()
        flag = False
        for i in a:
            if re.search('current', i) is not None:
                flag = True
                continue
            if flag:
                nextpage = re.findall('/zt/sj.*?html', str(i))
                if not nextpage:
                    continue
                yield scrapy.Request(domain + nextpage[0], callback=self.parsethird,
                                     meta={'grade': response.meta['grade'], 'time': 2})
                break

    def parseinside(self, response):
        ftitle = response.meta['ftitle']
        # pattern for the CDN video address (non-greedy so it stops at the first .mp4)
        playhost = 'http://video.cdn.liangtv.cn.*?mp4|https://videocdn.liangtv.cn.*?mp4'
        resp = response.text
        # prefer the title on the detail page, fall back to the one carried over in meta
        title = response.css('h3#title::text').extract_first() or response.meta['title']
        playlink = re.search(playhost, resp)
        if playlink is not None:
            video = str(playlink.group(0))
            with open(ftitle, 'a', encoding='utf-8') as f:
                f.write(title + ':' + video + '\n')
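
To try the spider, it can be run as a standalone file without creating a full Scrapy project (assuming it is saved as getalllink.py; the file name is just an example):

scrapy runspider getalllink.py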

The crawl itself works; I still need to convert the output to xlsx later so it can be sorted.
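
A minimal sketch of that conversion, assuming the per-lesson .txt files written above (one "title:url" pair per line, UTF-8) and the openpyxl library; the output file name is only an example:

# -*- coding: utf-8 -*-
import os
from openpyxl import Workbook

wb = Workbook()
ws = wb.active
ws.append(['grade', 'lesson file', 'video title', 'url'])

for grade in ('高中', '初中', '小学'):
    if not os.path.isdir(grade):
        continue
    for fname in os.listdir(grade):
        with open(os.path.join(grade, fname), encoding='utf-8') as f:
            for line in f:
                # each line written by the spider looks like "title:http(s)://...mp4"
                head, sep, tail = line.strip().partition(':http')
                if sep:
                    ws.append([grade, fname, head, 'http' + tail])

wb.save('kongzhongketang.xlsx')

With everything in one sheet, the rows can then be sorted by grade or lesson directly in Excel.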

Libraries needed for this post: scrapy, BeautifulSoup4 (pip install scrapy beautifulsoup4).
