# -*- encoding: utf-8 -*-
# @ModuleName: pic_crawler
# @Function: image crawler (图片爬虫)
# @Author: Yokon
# @Time: 2024/3/19 16:13
import os
import random
import re
import string
import requests
from bs4 import BeautifulSoup
import urllib3
from tqdm import tqdm
urllib3.disable_warnings()
# Generate a random string for use as a filename.
def generate_random_string(length):
    """Return a random string of `length` characters drawn from ASCII letters and digits."""
    alphabet = string.ascii_letters + string.digits
    return ''.join(random.choice(alphabet) for _ in range(length))
def download_image(image_urls, save_path):
    """Download one wallpaper from each detail-page URL in `image_urls`.

    Each page is fetched and the first <img> inside its "scrollbox" <div>
    is taken as the full-size image, which is streamed into `save_path`
    under a random 12-character .jpg filename. Failures on individual
    pages are skipped (best-effort crawl).

    Parameters:
        image_urls: iterable of detail-page URLs (strings).
        save_path: directory to write images into; created if missing.
    """
    os.makedirs(save_path, exist_ok=True)
    for page_url in tqdm(image_urls, desc='Processing items', unit='item'):
        try:
            page = requests.get(page_url, timeout=30)
            page.raise_for_status()
        except requests.RequestException:
            continue  # skip pages that fail to load
        soup = BeautifulSoup(page.text, 'html.parser')
        # Find the <img> inside the "scrollbox" container directly,
        # instead of re-parsing the stringified tag list a second time.
        scrollbox = soup.find('div', attrs={"class": "scrollbox"})
        img_tag = scrollbox.find('img') if scrollbox else None
        image_url = img_tag.get('src') if img_tag else None
        if not image_url:
            # Original code would call requests.get("None") here.
            continue
        image_path = os.path.join(save_path, generate_random_string(12) + '.jpg')
        try:
            response = requests.get(image_url, stream=True, timeout=30)
        except requests.RequestException:
            continue
        if response.status_code == 200:
            with open(image_path, 'wb') as f:
                for chunk in response.iter_content(1024):
                    f.write(chunk)
def crawl_wallpapers(urls):
    """Collect wallpaper detail-page links from a list of index-page URLs.

    Each index page is fetched (TLS verification disabled, matching the
    original crawl behavior) and every <a class="preview"> anchor's href
    is extracted.

    Parameters:
        urls: iterable of index-page URLs.

    Returns:
        list of detail-page URL strings (possibly empty).
    """
    image_urls = []
    for page_url in urls:
        try:
            response = requests.get(page_url, verify=False, timeout=30)
            response.raise_for_status()
        except requests.RequestException:
            continue  # skip unreachable index pages
        soup = BeautifulSoup(response.text, 'html.parser')
        # Read the href attribute directly instead of regex-matching the
        # stringified tag: the regex broke when attribute order differed
        # and `matches[0]` raised IndexError when nothing matched.
        for anchor in soup.find_all('a', attrs={"class": "preview"}):
            href = anchor.get('href')
            if href:
                image_urls.append(href)
    print("Getbase!!!")
    return image_urls
def split_list(input_list, chunk_size):
    """Split `input_list` into consecutive chunks of at most `chunk_size` items."""
    chunks = []
    start = 0
    while start < len(input_list):
        chunks.append(input_list[start:start + chunk_size])
        start += chunk_size
    return chunks
if __name__ == "__main__":
# 获取基础页面网址
base = 'https:seed=67cziB&page='
baseurl = []
for i in range(60, 17000):
url = base + str(i)
baseurl.append(url)
new_list = split_list(baseurl, 10)
i = 70
# 查询二级索引
for l1_list in new_list:
l2_urls = crawl_wallpapers(l1_list)
print(l2_urls)
print("第 %s 组" % i)
i += 1
# 进入二级索引并下载
save_directory = 'wallpapers'
download_image(l2_urls, save_directory)
# -*- encoding: utf-8 -*-
# @ModuleName: pic_divide
# @Function: separate portrait from landscape images (分开竖屏横屏)
# @Author: Yokon
# @Time: 2024/3/20 16:05
from PIL import Image
import os
import shutil
def move_images(input_folder, output_folder):
    """Move portrait-orientation images (height > width) from
    `input_folder` into `output_folder`.

    Only .jpg/.png files are considered; `output_folder` is created if
    it does not exist. Landscape and square images are left in place.

    Parameters:
        input_folder: directory to scan (non-recursive).
        output_folder: destination directory for portrait images.
    """
    os.makedirs(output_folder, exist_ok=True)
    for file in os.listdir(input_folder):
        # endswith accepts a tuple — one check covers both extensions.
        if file.endswith((".jpg", ".png")):
            image_path = os.path.join(input_folder, file)
            # Context manager guarantees the file handle is released
            # before shutil.move touches the file (matters on Windows).
            with Image.open(image_path) as image:
                width, height = image.size
            if height > width:
                shutil.move(image_path, os.path.join(output_folder, file))
    print("移动完成!")
# Input and output folder paths; guard so importing this module does not
# immediately start moving files.
if __name__ == "__main__":
    input_folder = "wallpapers"
    output_folder = "vertical_images"
    move_images(input_folder, output_folder)