# 导入requests包
import urllib.request
import re
import requests
from bs4 import BeautifulSoup
import time
from threading import Lock, Thread # 线程包
# Request headers sent with every HTTP call made by this script.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36',
    "Referer": "https://www.baidu.com/link?url=S9qOAJnnFVvK9MaArz9E-MpFsvJW2y3H8fAo044AAz1EfTWte8eO3ny3aXgoBIE2&wd=&eqid=8e91efc50002b2cb000000066183ad90",
    "Host": "www.kluniv.edu.cn",
    # NOTE(review): this session id is hard-coded and presumably stale; the
    # requests.Session below keeps its own cookie jar, so this entry may be
    # unnecessary -- confirm before relying on it.
    "Cookie": "JSESSIONID=A764E4DE3C7C57E3F13BB55DADAABA90",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    # "Remote Address" is a field shown by browser dev tools, not an HTTP
    # request header (field names cannot contain spaces), so it is not sent.
}

# Proxy mapping in the form {'protocol': 'protocol://IP:port'}.
# NOTE(review): nothing in the visible code passes `proxies` to a request.
proxies = {
    'http': 'http://{}'.format('8.129.28.247:8888'),
    'https': 'https://{}'.format('8.129.28.247:8888'),
}

texturl = "https://www.kluniv.edu.cn/"

# One shared Session so cookies set by the first response are reused by
# every later request.
session = requests.Session()

# Initial visit to the site root; the decoded body is kept in `text`.
# NOTE(review): this is a POST to the home page -- a GET looks more natural,
# but the method is left unchanged because the server's behaviour is unknown.
# The timeout stops a dead connection from hanging the script at start-up.
text = session.post(url=texturl, headers=header, timeout=30).content.decode()
def paqutu1(xx):
    """Download the images of every news article linked from one list page.

    Page ``0`` is ``index/xw.htm``; any other ``xx`` selects
    ``index/xw/<xx>.htm``.  Every ``<a>`` whose ``href`` contains
    ``info/1028`` is treated as an article link.  Each article is fetched and,
    for every ``<img>`` whose ``orisrc`` attribute contains ``local``, the
    first ``/__local...jpg`` path found in the tag markup (or, if there is
    none, the first ``/__local...png`` path) is downloaded into ``tu/``.

    Uses the module-level ``session`` and ``header`` for page requests and
    ``urllib.request.urlretrieve`` for the image downloads.

    Args:
        xx: Index of the news-list page to crawl.
    """
    import os  # function-scope import: the file header does not import os

    site = 'https://www.kluniv.edu.cn'
    if xx == 0:
        url = site + '/index/xw.htm'
    else:
        url = site + '/index/xw/' + str(xx) + '.htm'

    # urlretrieve does not create missing directories, so make sure the
    # output folder exists before the first download.
    os.makedirs('tu', exist_ok=True)

    # Compile the patterns once instead of on every loop iteration.
    article_path = re.compile(r"/info\S*htm")
    jpg_path = re.compile(r"/__local\S*jpg")
    png_path = re.compile(r"/__local\S*png")

    time.sleep(0.1)
    # The timeout keeps a stalled connection from blocking this worker forever.
    listing = session.get(url=url, headers=header, timeout=30).content
    soup = BeautifulSoup(listing, "html.parser", from_encoding="utf-8")
    link_nodes = soup.find_all('a', href=re.compile(r"info/1028"))

    # Articles and images are numbered from 1, matching the saved file names.
    for j, link in enumerate(link_nodes, start=1):
        found = article_path.findall(str(link))
        if not found:
            # The href contains "info/1028" but no "/info...htm" path:
            # skip this link instead of failing the whole page.
            continue
        url2 = site + found[0]
        time.sleep(0.1)
        article = session.get(url=url2, headers=header, timeout=30).content
        soup1 = BeautifulSoup(article, "html.parser", from_encoding="utf-8")
        images = soup1.find_all('img', orisrc=re.compile(r"local"))
        print(url2)
        print(images)
        for k, img in enumerate(images, start=1):
            markup = str(img)
            p2 = jpg_path.findall(markup)
            p3 = png_path.findall(markup)
            # A jpg path takes priority; png is only used when no jpg matched.
            # The file-name suffixes ("u.jpg" / "tu.png") are kept exactly as
            # they were so existing downloads keep their names.
            if p2:
                print(p2)
                url3 = site + p2[0]
                print(url3)
                urllib.request.urlretrieve(url3, f'tu/第{xx}页-第{j}个新闻-第{k}张u.jpg')
            elif p3:
                print(p3)
                url4 = site + p3[0]
                print(url4)
                urllib.request.urlretrieve(url4, f'tu/第{xx}页-第{j}个新闻-第{k}张tu.png')
        print(f"第{xx}页的第{j}个新闻完成")
if __name__ == '__main__':
    from concurrent.futures import ThreadPoolExecutor, as_completed

    # Crawl list pages 0..280.  A bounded pool replaces one thread per page so
    # at most a handful of requests hit the site at the same time.
    with ThreadPoolExecutor(max_workers=8) as pool:
        pending = {pool.submit(paqutu1, page): page for page in range(281)}
        for finished in as_completed(pending):
            # A failing page must not stop the others; report it and move on.
            error = finished.exception()
            if error is not None:
                print(f"page {pending[finished]} failed: {error!r}")
# Study notes + pachong (web crawler)
# Latest recommended article published on 2024-11-12 23:15:59