9月10日是教师节,很多同学不知道给老师送什么。当我看到老师备课时的辛苦,就决定用 Python 爬虫帮老师在网上找课件,于是我找到了这个网址:www.leleketang.com。里面的课件做得特别好,还有视频,但是需要账号,而账号申请很麻烦,很多老师都没有。我准备爬取时,发现它的课件不是图片,而是 html
于是就有了保存整个网页的想法,但是我用普通的爬虫总是取不到标签的链接,查看源代码才知道,它的所有标签都是用JavaScript添加的
这个body里没有任何的img,video等标签,仔细查找后发现它们全在JavaScript中,再用函数添加进html
因为 JavaScript 中的链接是写成 "src":".*" 这种形式的,其中 .* 代表一段任意字符,这是正则表达式的写法
既然知道链接怎么表示的,我们就先找到全部链接
这是这个程序需要用到的所有库
import re
import requests
import os
from random import sample #这个也可以不加,它是用来做随机请求头的,因为最近老是被封ip
找到全部链接,正则匹配
ua = []  # fill in your own User-Agent strings (rotated to dodge IP bans)
headers = {
    'User-Agent': sample(ua, 1)[0]  # pick one UA at random per run
}
rst = requests.get(url, headers=headers)
rst.encoding = 'utf-8'
hoby = rst.text  # full page source, including the JS that carries the links
# The JS payload writes links as "src":"...", so match that literal form
# non-greedily (each match stops at the first closing quote).
# NOTE(review): the original pattern was r'("src"=".*?")' — but the payload
# (and the full program below, see the '"src":' pass) uses ':', not '='.
pattern = re.compile(r'("src":".*?")')
m = pattern.finditer(hoby)
找到链接以后我们要请求对应的链接,获得图片,视频等,因为链接有多个用for循环效率最高
# Handle the JSON-style "src":"..." links (images / video) embedded in the JS.
for i in m:
    trt = i.group()  # raw matched fragment, e.g. '"src":"\/file\/img.png"'
    print(trt)
    # Strip the JS backslash escapes ("\/" -> "/") to get a plain path.
    s = re.sub(r'\\', "", trt)
    print(s)
    # Already an absolute URL? Otherwise prepend the site root.
    hit = re.search(r'http.*', s)
    if hit is None:
        frag = re.search(r':".*"', s).group()
        frag = re.sub(r':', "", frag)
        frag = re.sub(r'"', "", frag)
        yyy = 'https://www.leleketang.com' + frag
    else:
        yyy = re.sub(r'"', "", hit.group())
    print(yyy)
    c = requests.get(yyy, headers=headers)
    # Flat local file name: a path segment of >=4 word chars plus a
    # 3-char extension, with the slash removed.
    fname = re.search(r'/(\w){4,}\....', yyy).group().replace("/", "")
    path = root + fname
    with open(path, 'wb') as f:  # binary write of the downloaded resource
        f.write(c.content)
    print("文件保存成功")
    # Rewrite the page so this link points at the local copy.  re.escape()
    # neutralises every regex metacharacter in the matched fragment (the
    # original hand-doubled only the backslashes, leaving '.' etc. live).
    hoby = re.sub(re.escape(trt), '"src":"{}"'.format(path), hoby)
因为快开学了,又要补作业,又要写完这个程序,所以程序写的快,没来得及写注释
处理完图片,视频链接我们还需要处理js,css链接将其保存换成本地链接
因为 js、css 的引用写在标签属性里,所以它们的链接就是 html 的写法:src=".*"(js)和 href=".*"(css)
处理js
# Handle HTML-style src="..." links (mainly <script> JS files).
pattern = re.compile(r'(src=".*?")')
m = pattern.finditer(hoby)
for i in m:
    trt = i.group()  # raw matched fragment, e.g. 'src="/static/a.js?v=1"'
    print(trt)
    s = re.sub(r'\\', "", trt)  # drop any stray backslash escapes
    print(s)
    hit = re.search(r'http.*', s)
    if hit is None:
        # Relative link: pull the quoted value out and prepend the site root.
        frag = re.search(r'=".*"', s).group()
        frag = re.sub(r'=', "", frag)
        frag = re.sub(r'"', "", frag)
        yyy = 'https://www.leleketang.com' + frag
    else:
        yyy = re.sub(r'"', "", hit.group())
    print(yyy)
    c = requests.get(yyy, headers=headers)
    # Local file name: the URL path up to ".js", with all slashes removed.
    fname = re.search(r'/.*\.js?', yyy).group().replace("/", "")
    path = root + fname
    with open(path, 'wb') as f:  # binary write of the downloaded script
        f.write(c.content)
    print("文件保存成功")
    # Rewrite the page to the local copy.  re.escape() escapes every
    # metacharacter (the original escaped only '?', leaving '.' live).
    hoby = re.sub(re.escape(trt), 'src="{}"'.format(path), hoby)
处理css
# Handle HTML-style href="..." links (CSS stylesheets).
pattern = re.compile(r'(href=".*?")')
m = pattern.finditer(hoby)
for i in m:
    trt = i.group()  # raw matched fragment, e.g. 'href="/static/a.css"'
    print(trt)
    s = re.sub(r'\\', "", trt)  # drop any stray backslash escapes
    print(s)
    hit = re.search(r'http.*', s)
    if hit is None:
        # Relative link: pull the quoted value out and prepend the site root.
        frag = re.search(r'=".*"', s).group()
        frag = re.sub(r'=', "", frag)
        frag = re.sub(r'"', "", frag)
        yyy = 'https://www.leleketang.com' + frag
    else:
        yyy = re.sub(r'"', "", hit.group())
    print(yyy)
    c = requests.get(yyy, headers=headers)
    # Local file name: the URL path up to ".css", with all slashes removed.
    fname = re.search(r'/.*\.css?', yyy).group().replace("/", "")
    path = root + fname
    with open(path, 'wb') as f:  # binary write of the downloaded stylesheet
        f.write(c.content)
    print("文件保存成功")
    # Rewrite the page to the local copy.  re.escape() escapes every
    # metacharacter (the original escaped only '?', leaving '.' live).
    hoby = re.sub(re.escape(trt), 'href="{}"'.format(path), hoby)
将所有链接保存,替换成本地的链接后,我们要生成一个html文件
# Write the rewritten page out as the local entry-point HTML.  The text is
# encoded to bytes explicitly because the file is opened in binary mode;
# the redundant f.close() inside the `with` block is gone.
with open(file_path, 'wb') as f:
    f.write(hoby.encode())
print("文件保存成功")
点开这个html文件就是这个网页的内容了
全部源码
import re
import requests
import time
import os
from random import sample

# ---- run parameters -------------------------------------------------------
url = input('链接:')
name = input('保存名称:')
file_name = name
name = name + '_files'
root = 'D://爬虫//' + name + '//'               # folder for downloaded assets
file_path = 'D://爬虫//' + file_name + '.html'  # local entry-point page
if not os.path.exists(root):
    os.mkdir(root)

ua = []  # fill in your own User-Agent strings (rotated to dodge IP bans)
headers = {
    'User-Agent': sample(ua, 1)[0]
}


def _localize(html, link_pattern, sep, name_pattern, template):
    """Download every resource matched by *link_pattern* in *html*, save it
    under the global *root* folder, and rewrite the page so each link points
    at the saved local copy.

    link_pattern -- regex matching the whole link fragment, e.g. r'(src=".*?")'
    sep          -- separator between attribute and value (':' or '=')
    name_pattern -- regex extracting a file-name portion from the absolute URL
    template     -- format string producing the replacement fragment
    Returns the rewritten html string.
    """
    for match in re.finditer(link_pattern, html):
        fragment = match.group()
        print(fragment)
        # Strip JS backslash escapes ("\/" -> "/") to get a plain path.
        plain = re.sub(r'\\', "", fragment)
        print(plain)
        hit = re.search(r'http.*', plain)
        if hit is None:
            # Relative link: take the quoted value and prepend the site root.
            value = re.search(sep + r'".*"', plain).group()
            value = re.sub(sep, "", value)
            value = re.sub(r'"', "", value)
            absolute = 'https://www.leleketang.com' + value
        else:
            absolute = re.sub(r'"', "", hit.group())
        print(absolute)
        resp = requests.get(absolute, headers=headers)
        # Flat local file name: matched URL portion with slashes removed.
        fname = re.search(name_pattern, absolute).group().replace("/", "")
        local = root + fname
        with open(local, 'wb') as f:  # binary write of the downloaded resource
            f.write(resp.content)
        print("文件保存成功")
        # re.escape() neutralises every regex metacharacter in the matched
        # fragment (the original hand-escaped only '?' or backslashes).
        html = re.sub(re.escape(fragment), template.format(local), html)
    return html


# Fetch the page, then localise JS, CSS and JSON-embedded media links in turn
# (same order as before: src="..." js, href="..." css, "src":"..." media).
rst = requests.get(url, headers=headers)
rst.encoding = 'utf-8'
hoby = rst.text
hoby = _localize(hoby, r'(src=".*?")', '=', r'/.*\.js?', 'src="{}"')
hoby = _localize(hoby, r'(href=".*?")', '=', r'/.*\.css?', 'href="{}"')
hoby = _localize(hoby, r'("src":".*?")', ':', r'/(\w){4,}\....', '"src":"{}"')
print(hoby)

# Save the rewritten page as the local entry-point HTML.
with open(file_path, 'wb') as f:
    f.write(hoby.encode())
print("文件保存成功")
这个ua就是你们的请求头,是一个列表,我的就不放出来了