zyz
众所周知,lsp是学习的最强动力。
偶得网站,通过图片和名称挑选,用链接看各类资源。
但每次都手动点太麻烦,于是练习写个脚本,搜集图片和名称,在资源管理器中做一览表。
前期准备
- lxml、requests、selenium、bs4;
- PyCharm 2020.3.3 (Community Edition)
- python3.7
python pip 国内源[1]
- 清华:https://pypi.tuna.tsinghua.edu.cn/simple/
- 阿里云:http://mirrors.aliyun.com/pypi/simple/
- 中国科学技术大学:https://pypi.mirrors.ustc.edu.cn/simple/
- 华中理工大学:http://pypi.hustunique.com/
- 山东理工大学:http://pypi.sdutlinux.org/
- 豆瓣:http://pypi.douban.com/simple/
pip install package -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
Windows 打开 cmd:Win + R,输入“cmd”
win的cmd中
# 使用方法
pip install -i https://pypi.doubanio.com/simple/ 包名
# 升级pip本身
python -m pip install --upgrade pip
效果
- 获取图片到指定文件夹
- 用标题名作为图片名
- 将资源链接保存到文档中
源码
# -*- coding:utf-8 -*-
# @Time:2021/2/17 15:38
# @Author:sun
# @Software:zyz
import os
from selenium import webdriver
from bs4 import BeautifulSoup
import requests
class Spider:
    """Scrape a listing page: collect detail-page URLs into url.txt, then for
    each detail page save the title + resource link to a text file and download
    the cover image, named after the title, into a local folder.

    Fixes over the original: request headers are actually sent, every request
    has a timeout, text files are opened with explicit UTF-8 encoding, and
    scraped titles are sanitized before being used as Windows filenames.
    """

    # Characters that are illegal in Windows filenames.
    _BAD_FILENAME_CHARS = '\\/:*?"<>|'

    def __init__(self):
        # NOTE(review): placeholder host — replace with the real site root.
        self.base_url = r"http://www.xx.com"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko ",
            'Connection': 'close'}
        self.solo_url = None    # most recently collected detail-page URL
        self.solo_src = None    # unused; kept for interface compatibility
        self.solo_soup = None   # unused; kept for interface compatibility
        self.count_line = 0     # number of URLs written by get_url()

    def _sanitize(self, name):
        """Strip spaces and Windows-invalid characters from a scraped title."""
        name = name.replace(' ', '')
        return ''.join(c for c in name if c not in self._BAD_FILENAME_CHARS)

    def html_save(self):
        """Save the raw listing-page HTML to disk for offline inspection."""
        # Bug fix: the original never sent self.headers and had no timeout.
        html_src = requests.get(self.base_url, headers=self.headers, timeout=30)
        with open(r"D:\pyfiles\proj\zyz\html.html", "w", encoding="utf-8") as f:
            f.write(html_src.text)

    def get_url(self, N=None):
        """Collect detail-page URLs from the listing page into url.txt.

        N optionally bounds the slice of anchors scanned. The [1:N:2] slice
        skips the first anchor and takes every other link (the page pairs an
        image link with a title link for each entry).
        """
        html_src = requests.get(self.base_url, headers=self.headers, timeout=30)
        soup = BeautifulSoup(html_src.content, 'lxml')
        html_a = soup.find_all("a", target="_blank")
        # Start from a clean file; ignore its absence on the first run.
        try:
            os.remove(r"D:\pyfiles\proj\zyz\url.txt")
        except FileNotFoundError:
            pass
        self.count_line = 0
        # Open once instead of reopening the file on every iteration.
        with open(r"D:\pyfiles\proj\zyz\url.txt", "a+", encoding="utf-8") as f:
            for href in html_a[1:N:2]:
                self.solo_url = self.base_url + href["href"]
                f.write(self.solo_url + "\n")
                self.count_line += 1
        print("页面总行数:" + str(self.count_line))

    def get_pic(self):
        """For each URL in url.txt, append 'index.title + resource link' to
        txt.txt and download the entry's cover image as 'index.title.png'."""
        # Read the URL list produced by get_url().
        listOfLines = []
        with open(r"D:\pyfiles\proj\zyz\url.txt", "r", encoding="utf-8") as mffile:
            for line in mffile:
                listOfLines.append(line.strip())
        # Start the resource list from scratch.
        try:
            os.remove(r"D:\pyfiles\proj\zyz\pic\txt.txt")
        except FileNotFoundError:
            pass
        j = 0
        for i in listOfLines:
            handle = requests.get(i, headers=self.headers, timeout=30)
            soup = BeautifulSoup(handle.content, 'lxml')
            # Image link, title text, and resource link live in fixed slots
            # of the detail page's markup.
            html_img = soup.find_all("img", attrs={"class": "img"})
            html_font = soup.find_all("font")
            html_a = soup.find_all("a", target="_blank")
            # Robustness: skip malformed pages instead of crashing on None.
            if len(html_font) < 2 or html_font[1].string is None \
                    or len(html_a) < 3 or not html_img:
                print("跳过:" + i)
                continue
            solo_name = self._sanitize(html_font[1].string)
            # Bug fix: explicit UTF-8 — the default locale encoding can fail
            # on Chinese titles under Windows.
            with open(r"D:\pyfiles\proj\zyz\pic\txt.txt", 'a+', encoding="utf-8") as mf_src:
                mf_src.write(str(j) + "." + solo_name + "\n" + html_a[2].string + "\n\n")
            # Download the cover image, named after the sanitized title.
            r = requests.get(html_img[0]["src"], headers=self.headers, timeout=30)
            with open(r"D:\pyfiles\proj\zyz\pic/" + str(j) + "." + solo_name + ".png", 'wb') as f:
                f.write(r.content)
            j += 1

    def __del__(self):
        print("end")
        # self.driver.close()
if __name__ == '__main__':
    # Drive the scraper end to end: collect the links, then fetch resources.
    crawler = Spider()
    # crawler.html_save()  # optional: dump raw listing HTML for debugging
    crawler.get_url()
    crawler.get_pic()