# 真的是闲的慌,自己作业都没写
# -*- coding: utf-8 -*-
import scrapy
import re
from bs4 import BeautifulSoup
import os
class GetalllinkSpider(scrapy.Spider):
    """Crawl zt.gxtv.cn course pages and record video download links.

    Crawl flow: ``parse`` (subject index) -> ``parsesecond`` (course list)
    -> ``parsethird`` (lesson list, paginated) -> ``parseinside`` (video page).

    Output: one ``.txt`` file per lesson under ``./高中``, ``./初中`` or
    ``./小学``, each line being ``<video title>:<mp4 url>``.
    """

    name = 'getalllink'
    # allowed_domains = ['zt.gxtv.cn']
    start_urls = ['https://zt.gxtv.cn/zt/default.html']

    # Base used to absolutize relative links. The original kept this in a
    # mutable module-level global assigned inside parse(); a class attribute
    # gives the same value to every callback without `global`.
    domain = 'http://zt.gxtv.cn'

    # (css class of the section div, output directory, number of <h3> entries)
    # 高中 = senior high, 初中 = junior high, 小学 = primary school.
    SECTIONS = (
        ('gz', './高中/', 3),
        ('cz', './初中/', 3),
        ('xx', './小学/', 6),
    )

    def _section_links(self, response, css_class, count):
        """Yield absolute hrefs of the first *count* <h3><a> entries
        inside ``div.<css_class>`` on the index page."""
        html = ''.join(response.css('div.%s *' % css_class).extract())
        soup = BeautifulSoup(html, 'html.parser')
        # Slice instead of indexed loop: no IndexError when the page
        # exposes fewer entries than expected.
        for h3 in soup.find_all('h3')[:count]:
            yield self.domain + h3.a.get('href')

    def parse(self, response):
        """Entry point: fan out one request per course in every subject
        section, tagging each with its output directory via meta['grade']."""
        for css_class, grade_dir, count in self.SECTIONS:
            # makedirs(exist_ok=True) is race-free, unlike exists()+mkdir().
            os.makedirs(grade_dir, exist_ok=True)
            for link in self._section_links(response, css_class, count):
                yield scrapy.Request(link, callback=self.parsesecond,
                                     meta={'grade': grade_dir})

    def parsesecond(self, response):
        """Course list page: follow every lesson-list link (``/zt/sj...html``)
        found in the page's anchors."""
        anchors = ''.join(response.css('a').extract())
        # Renamed from `list` in the original — it shadowed the builtin.
        for href in re.findall('/zt/sj.*?html', anchors):
            yield scrapy.Request(self.domain + href, callback=self.parsethird,
                                 meta={'grade': response.meta['grade'],
                                       'time': 1})

    def parsethird(self, response):
        """Lesson list page: create the per-lesson output file, request each
        video page, and follow pagination links after the 'current' marker."""
        # time: 1 = reached from a course list, 2 = reached via pagination.
        print(response.meta['time'])
        ftitle = response.css('a#ftitle::text').extract_first()
        title = response.css('span#title::text').extract_first()
        out_path = response.meta['grade'] + ftitle + ''.join(title) + '.txt'
        # Touch the file once so parseinside can append; utf-8 keeps Chinese
        # titles intact regardless of platform default encoding.
        if not os.path.exists(out_path):
            with open(out_path, 'a', encoding='utf-8'):
                pass
        links_html = ''.join(response.css('a[target=_blank]').extract())
        soup = BeautifulSoup(links_html, 'html.parser')
        for anchor in soup.find_all('a'):
            yield scrapy.Request(self.domain + anchor['href'],
                                 callback=self.parseinside,
                                 meta={'title': anchor['title'],
                                       'ftitle': out_path})
        # Pagination: every element after the one marked 'current' holds a
        # link to a later page (scrapy's dupefilter absorbs repeats).
        past_current = False
        for fragment in response.css('p#page_control_bar *').extract():
            if re.search('current', fragment) is not None:
                past_current = True
                continue
            if past_current:
                matches = re.findall('/zt/sj.*?html', str(fragment))
                if matches:  # guard: original indexed [0] unconditionally
                    yield scrapy.Request(self.domain + matches[0],
                                         callback=self.parsethird,
                                         meta={'grade': response.meta['grade'],
                                               'time': 2})

    def parseinside(self, response):
        """Video page: extract the mp4 CDN url and append ``title:url``
        to the lesson's output file."""
        out_path = response.meta['ftitle']
        # Regex matching either CDN host that serves the mp4 files.
        playhost = 'http://video.cdn.liangtv.cn.*mp4|https://videocdn.liangtv.cn.*mp4'
        # NOTE(review): the original also read response.meta['title'] here but
        # immediately overwrote it with the css value below; the dead read
        # was dropped.
        title = response.css('h3#title::text').extract_first()
        playlink = re.search(playhost, response.text)
        if playlink is not None:
            # `with` closes the file even if write() raises; the original's
            # explicit f.close() inside the with-block was redundant.
            with open(out_path, 'a', encoding='utf-8') as f:
                f.write(title + ':' + str(playlink.group(0)) + '\n')
# 爬是爬到了,就是后期还要改成xlsx以便排序
# 本文需要的库:scrapy,BeautifulSoup4