#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2021/06/06 14:48
# @Author : Liu
# @Site :
# @File : 爬虫2.py
# @Software: PyCharm
import os
import time
import re  # used by the commented-out pagination logic below
import requests
import urllib.request
from bs4 import BeautifulSoup
url_address = "https://www.umei.cc"
"""
def get_url_path(url):
获取地址内容信息
:param url:
:return:
# time.sleep(1) # 获取源码的时候睡眠一秒
obj = requests.get(url)
obj.encoding = obj.apparent_encoding
return obj.text
"""
# Open a connection and fetch the page source
def get_url_path(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'
    }
    time.sleep(1)  # throttle requests a little
    req = urllib.request.Request(url=url, headers=headers)
    content = urllib.request.urlopen(req).read().decode('utf-8')
    # print(f'content 000 : {content} ')
    return content
def get_image_info(ret, name, i):
    """Parse one detail page, then download and save its image."""
    soup = BeautifulSoup(ret, "html.parser")
    img = soup.select(".ImageBody img")[0]
    # print(f"img : {img}")
    image_path = img["src"]  # image URL
    print(f"img_url : {image_path}")
    image_name = name  # Chinese title the image belongs to
    # print(f"image_name : {image_name}")
    ext = os.path.splitext(image_path)[1].lstrip(".")  # file extension of the image
    img_name = f"{image_name}_{i}.{ext}"  # real file name for the image
    # Store the image
    print(f"img_name : {img_name}")
    image_dir = f"girl1/{image_name}"
    # print(f"image_dir : {image_dir}")
    if not os.path.isdir(image_dir):
        os.makedirs(image_dir)
    # Fetch the remote image and write it locally (method 1: open())
    with open(f"{image_dir}/{img_name}", mode="wb") as add:
        add.write(requests.get(image_path).content)
    # Fetch the remote image and write it locally (method 2: urllib)
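    # The listing stops at the comment above; a minimal sketch of that second
    # method (not in the original, assumes image_path is an absolute URL),
    # kept commented out so behavior is unchanged:
    # urllib.request.urlretrieve(image_path, f"{image_dir}/{img_name}")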
def get_bs4(ret):
    """Parse a listing page and download the first image of every item."""
    soup = BeautifulSoup(ret, "html.parser")
    li_list = soup.select(".TypeList")[0].find_all(name="li")
    for i in li_list:
        # Fetch the first image of this item
        img_src = url_address + i.a["href"]
        # print(f"image_src : {img_src}")
        ret1 = get_url_path(img_src)  # fetch the detail page
        get_image_info(ret1, i.a.span.string, 1)
        # print(f"ret1 : {ret1}")
        # Get the number of images behind the pagination:
        # script_reg = r'<script type="text\/javascript">Next\("\d+","(?P<num>\d+)",.*?\)<\/script>'
        # script_reg = r'<a href="/bizhitupian/diannaobizhi/(?P<num>[0-9_]+).htm">尾页</a>'
        # num_str = re.search(script_reg, ret1, re.S).group("num")
        # page_num = int(num_str.split("_")[1])  # number of images
        #
        # img_lst = os.path.basename(i.a["href"]).split(".")  # file name and extension of the image page
        # img_dir = os.path.dirname(i.a["href"])  # directory part of the image page URL
        # # print(f"img_dir__ : {img_dir}")
        # for j in range(2, page_num + 1):
        #     img_src = f"{url_address}{img_dir}/{img_lst[0]}_{j}.{img_lst[1]}"
        #     res = get_url_path(img_src)  # fetch the detail page
        #     # print(f"str : {i.a.span.string}")
        #     # print(f"res : {res}")
        #     # print(f"img_src : {img_src}")
        #     get_image_info(res, i.a.span.string, j)
def get_page_info():
    """
    Crawl each listing page.
    :return:
    """
    nums = 10  # int(input("Number of pages to crawl: "))
    for i in range(nums):
        if i < 1:
            url = f"{url_address}/bizhitupian/diannaobizhi/"
            # print(f'url 000 : {url} ')
        else:
            url = f"{url_address}/bizhitupian/diannaobizhi/index_{i + 1}.htm"
            # print(f'url 111 : {url} ')
        ret = get_url_path(url)  # fetch the listing page
        get_bs4(ret)  # scrape this page's items
        print(f"Page {i + 1} done")
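
# Not part of the original listing: a minimal entry point so the script runs
# when executed directly, assuming get_page_info() is the intended driver.
if __name__ == "__main__":
    get_page_info()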