python爬取整个网页,教师节不知道给老师送什么?

    9月10日是教师节,很多同学不知道给老师送什么。当我看到老师备课时的辛苦,就决定用 python 爬虫帮老师在网上找课件,于是我找到了这个网址:www.leleketang.com,里面的课件做得特别好,还有视频,但是需要账号,账号申请很麻烦,很多老师都没有。我准备爬取时,发现它的课件不是图片,而是 html

 于是就有了保存整个网页的想法,但是我用普通的爬虫总是取不到标签的链接,查看源代码才知道,它的所有标签都是用JavaScript添加的

 这个body里没有任何的img,video等标签,仔细查找后发现它们全在JavaScript中,再用函数添加进html

因为 JavaScript 中的 src=".*" 是写成 "src":".*" 这种形式的(其中 .* 代表任意一段字符,这是正则表达式的写法)

既然知道链接怎么表示的,我们就先找到全部链接

这是这个程序需要用到的所有库

import re
import requests
import os
from random import sample  #这个也可以不加,它是用来做随机请求头的,因为最近老是被封ip

找到全部链接,正则匹配

ua = []  # fill in your own pool of User-Agent strings
if not ua:
    # sample() on an empty list raises an opaque "Sample larger than population"
    # error - fail early with a message that says what to do instead.
    raise ValueError("the ua list is empty - add at least one User-Agent string")
headers = {
    'User-Agent': sample(ua, 1)[0]  # random UA per run to reduce IP-ban risk
}
# NOTE(review): `url` is expected to be defined earlier (see the full script below).
rst = requests.get(url, headers=headers, timeout=30)  # timeout: don't hang forever

rst.encoding = 'utf-8'  # the site serves UTF-8 pages

hoby = rst.text  # full HTML of the page; rewritten in place by the loops below

# The page injects media via JavaScript as JSON-style pairs: "src":"...".
# (The original pattern used "src"=" with an equals sign, which never matches
# the JSON form and made the extraction below fail - fixed to a colon.)
pattern = re.compile(r'("src":".*?")')
m = pattern.finditer(hoby)

找到链接以后我们要请求对应的链接,获得图片、视频等。因为链接有多个,所以用 for 循环逐个处理

#"src"
# Download every "src":"..." media link and point the page at the local copy.
for match in m:
    trt = match.group()          # raw pair, e.g. "src":"\/cr\/image\/1.png"
    print(trt)
    s = trt.replace('\\', '')    # drop JS-style escaped slashes -> clean link
    absolute = re.search(r'http.*', s)
    if absolute is None:
        # Relative link: extract the quoted value and prefix the site root.
        quoted = re.search(r':".*"', s)
        if quoted is None:
            continue             # no quoted value - nothing to download
        rel = quoted.group().replace(':', '').replace('"', '')
        yyy = 'https://www.leleketang.com' + rel
    else:
        yyy = absolute.group().replace('"', '')
    print(yyy)

    c = requests.get(yyy, headers=headers, timeout=30)  # timeout: skip dead links fast
    # Local file name: /name.ext with >=4 word chars and a 3-char extension.
    name_match = re.search(r'/(\w){4,}\....', yyy)
    if name_match is None:
        continue                 # URL has no recognizable file name - skip it
    local_path = root + name_match.group().replace('/', '')
    with open(local_path, 'wb') as f:  # binary write of the downloaded bytes
        f.write(c.content)
        print("文件保存成功")
    # Rewrite the page to use the local copy.  Plain str.replace sidesteps the
    # regex-escaping bugs of re.sub(trt, ...) when the link contains . ? ( ).
    hoby = hoby.replace(trt, '"src":"{}"'.format(local_path))

因为快开学了,又要补作业,又要写完这个程序,所以程序写的快,没来得及写注释

处理完图片,视频链接我们还需要处理js,css链接将其保存换成本地链接

因为 js、css 写在 script、link 标签里,所以它们的链接就是 html 的写法:src=".*"、href=".*"

处理js

# src="..." links: scripts referenced HTML-style inside the page.
pattern = re.compile(r'(src=".*?")')
m = pattern.finditer(hoby)

for match in m:
    trt = match.group()          # raw attribute text, e.g. src="/js/app.js"
    print(trt)
    s = trt.replace('\\', '')    # drop JS-style escaped slashes, if any
    absolute = re.search(r'http.*', s)
    if absolute is None:
        # Relative link: extract the quoted value and prefix the site root.
        quoted = re.search(r'=".*"', s)
        if quoted is None:
            continue             # no quoted value - nothing to download
        rel = quoted.group().replace('=', '').replace('"', '')
        yyy = 'https://www.leleketang.com' + rel
    else:
        yyy = absolute.group().replace('"', '')
    print(yyy)

    c = requests.get(yyy, headers=headers, timeout=30)  # timeout: skip dead links fast
    # Local file name: path from the first slash to the .js extension,
    # with the slashes squeezed out (same naming scheme as the original).
    name_match = re.search(r'/.*\.js?', yyy)
    if name_match is None:
        continue                 # URL has no recognizable .js name - skip it
    local_path = root + name_match.group().replace('/', '')
    with open(local_path, 'wb') as f:  # binary write of the downloaded bytes
        f.write(c.content)
        print("文件保存成功")
    # Rewrite the page to use the local copy.  Plain str.replace sidesteps the
    # regex-escaping bugs of re.sub(trt, ...) when the link contains . ? ( ).
    hoby = hoby.replace(trt, 'src="{}"'.format(local_path))
    #print(set)

处理css

# href="..." links: stylesheets referenced HTML-style inside the page.
pattern = re.compile(r'(href=".*?")')
m = pattern.finditer(hoby)

for match in m:
    trt = match.group()          # raw attribute text, e.g. href="/css/a.css"
    print(trt)
    s = trt.replace('\\', '')    # drop JS-style escaped slashes, if any
    absolute = re.search(r'http.*', s)
    if absolute is None:
        # Relative link: extract the quoted value and prefix the site root.
        quoted = re.search(r'=".*"', s)
        if quoted is None:
            continue             # no quoted value - nothing to download
        rel = quoted.group().replace('=', '').replace('"', '')
        yyy = 'https://www.leleketang.com' + rel
    else:
        yyy = absolute.group().replace('"', '')
    print(yyy)

    c = requests.get(yyy, headers=headers, timeout=30)  # timeout: skip dead links fast
    # Local file name: path from the first slash to the .css extension,
    # with the slashes squeezed out (same naming scheme as the original).
    name_match = re.search(r'/.*\.css?', yyy)
    if name_match is None:
        continue                 # URL has no recognizable .css name - skip it
    local_path = root + name_match.group().replace('/', '')
    with open(local_path, 'wb') as f:  # binary write of the downloaded bytes
        f.write(c.content)
        print("文件保存成功")
    # Rewrite the page to use the local copy.  Plain str.replace sidesteps the
    # regex-escaping bugs of re.sub(trt, ...) when the link contains . ? ( ).
    hoby = hoby.replace(trt, 'href="{}"'.format(local_path))

将所有链接保存,替换成本地的链接后,我们要生成一个html文件

# Write the rewritten HTML out; encode to bytes so the UTF-8 text survives
# round-trip exactly.  (No f.close() needed - `with` already closes the file.)
with open(file_path, 'wb') as f:
    f.write(hoby.encode())
    print("文件保存成功")

点开这个html文件就是这个网页的内容了

全部源码

import re
import requests
import time
import os
from random import sample

url = input('链接:')
name = input('保存名称:')
file_name = name
name = name + '_files'  # browser-style "<page>_files" asset folder name
root = 'D://爬虫//' + name + '//'                 # where downloaded assets go
file_path = 'D://爬虫//' + file_name + '.html'   # where the page itself goes
# makedirs creates missing parents too - os.mkdir fails when D:/爬虫 is absent.
os.makedirs(root, exist_ok=True)
ua = []  # fill in your own pool of User-Agent strings
if not ua:
    # sample() on an empty list raises an opaque "Sample larger than population"
    # error - fail early with a message that says what to do instead.
    raise ValueError("the ua list is empty - add at least one User-Agent string")
headers = {
    'User-Agent': sample(ua, 1)[0]  # random UA per run to reduce IP-ban risk
}
rst = requests.get(url, headers=headers, timeout=30)  # timeout: don't hang forever

rst.encoding = 'utf-8'  # the site serves UTF-8 pages

hoby = rst.text  # full HTML of the page; rewritten in place by the loops below

# src="..." links: scripts referenced HTML-style inside the page.
pattern = re.compile(r'(src=".*?")')
m = pattern.finditer(hoby)

for match in m:
    trt = match.group()          # raw attribute text, e.g. src="/js/app.js"
    print(trt)
    s = trt.replace('\\', '')    # drop JS-style escaped slashes, if any
    absolute = re.search(r'http.*', s)
    if absolute is None:
        # Relative link: extract the quoted value and prefix the site root.
        quoted = re.search(r'=".*"', s)
        if quoted is None:
            continue             # no quoted value - nothing to download
        rel = quoted.group().replace('=', '').replace('"', '')
        yyy = 'https://www.leleketang.com' + rel
    else:
        yyy = absolute.group().replace('"', '')
    print(yyy)

    c = requests.get(yyy, headers=headers, timeout=30)  # timeout: skip dead links fast
    # Local file name: path from the first slash to the .js extension,
    # with the slashes squeezed out (same naming scheme as the original).
    name_match = re.search(r'/.*\.js?', yyy)
    if name_match is None:
        continue                 # URL has no recognizable .js name - skip it
    local_path = root + name_match.group().replace('/', '')
    with open(local_path, 'wb') as f:  # binary write of the downloaded bytes
        f.write(c.content)
        print("文件保存成功")
    # Rewrite the page to use the local copy.  Plain str.replace sidesteps the
    # regex-escaping bugs of re.sub(trt, ...) when the link contains . ? ( ).
    hoby = hoby.replace(trt, 'src="{}"'.format(local_path))


# href="..." links: stylesheets referenced HTML-style inside the page.
pattern = re.compile(r'(href=".*?")')
m = pattern.finditer(hoby)

for match in m:
    trt = match.group()          # raw attribute text, e.g. href="/css/a.css"
    print(trt)
    s = trt.replace('\\', '')    # drop JS-style escaped slashes, if any
    absolute = re.search(r'http.*', s)
    if absolute is None:
        # Relative link: extract the quoted value and prefix the site root.
        quoted = re.search(r'=".*"', s)
        if quoted is None:
            continue             # no quoted value - nothing to download
        rel = quoted.group().replace('=', '').replace('"', '')
        yyy = 'https://www.leleketang.com' + rel
    else:
        yyy = absolute.group().replace('"', '')
    print(yyy)

    c = requests.get(yyy, headers=headers, timeout=30)  # timeout: skip dead links fast
    # Local file name: path from the first slash to the .css extension,
    # with the slashes squeezed out (same naming scheme as the original).
    name_match = re.search(r'/.*\.css?', yyy)
    if name_match is None:
        continue                 # URL has no recognizable .css name - skip it
    local_path = root + name_match.group().replace('/', '')
    with open(local_path, 'wb') as f:  # binary write of the downloaded bytes
        f.write(c.content)
        print("文件保存成功")
    # Rewrite the page to use the local copy.  Plain str.replace sidesteps the
    # regex-escaping bugs of re.sub(trt, ...) when the link contains . ? ( ).
    hoby = hoby.replace(trt, 'href="{}"'.format(local_path))


# "src":"..." links: images/videos the page injects via JavaScript as
# JSON-style key/value pairs.
pattern = re.compile(r'("src":".*?")')
m = pattern.finditer(hoby)

for match in m:
    trt = match.group()          # raw pair, e.g. "src":"\/cr\/image\/1.png"
    print(trt)
    s = trt.replace('\\', '')    # drop JS-style escaped slashes -> clean link
    absolute = re.search(r'http.*', s)
    if absolute is None:
        # Relative link: extract the quoted value and prefix the site root.
        quoted = re.search(r':".*"', s)
        if quoted is None:
            continue             # no quoted value - nothing to download
        rel = quoted.group().replace(':', '').replace('"', '')
        yyy = 'https://www.leleketang.com' + rel
    else:
        yyy = absolute.group().replace('"', '')
    print(yyy)

    c = requests.get(yyy, headers=headers, timeout=30)  # timeout: skip dead links fast
    # Local file name: /name.ext with >=4 word chars and a 3-char extension.
    name_match = re.search(r'/(\w){4,}\....', yyy)
    if name_match is None:
        continue                 # URL has no recognizable file name - skip it
    local_path = root + name_match.group().replace('/', '')
    with open(local_path, 'wb') as f:  # binary write of the downloaded bytes
        f.write(c.content)
        print("文件保存成功")
    # Rewrite the page to use the local copy.  Plain str.replace sidesteps the
    # regex-escaping bugs of re.sub(trt, ...) when the link contains . ? ( ).
    hoby = hoby.replace(trt, '"src":"{}"'.format(local_path))
print(hoby)
# Write the rewritten HTML out; encode to bytes so the UTF-8 text survives
# round-trip exactly.  (No f.close() needed - `with` already closes the file.)
with open(file_path, 'wb') as f:
    f.write(hoby.encode())
    print("文件保存成功")

这个ua就是你们的请求头,是一个列表,我的就不放出来了

  • 11
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 4
    评论
评论 4
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

beginner2021

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值