python爬取图片（thumbURL和html文件标签分别爬取）

疯疯癫癫才自由

于 2024-01-19 20:57:50 发布

阅读量833

点赞数 8

分类专栏：python爬虫　文章标签：python

本文链接：https://blog.csdn.net/qq_51825761/article/details/135706160

版权

python爬虫专栏收录该内容

4 篇文章 0 订阅

订阅专栏

当查看源代码，发现网址在thumbURL之后时，用此代码:

# Use this script when the page source lists each image address right after a
# "thumbURL" key.
#
# Flow: ask for a page URL, fetch it, ask for a target folder, extract every
# thumbURL value with a regular expression, then download each one into the
# folder as 1.jpg, 2.jpg, ...

import os
import re

import requests

# Browser-like headers sent with the page request.
# NOTE(review): 'br' is advertised in Accept-Encoding; confirm a Brotli decoder
# is installed in the environment, otherwise a Brotli-compressed reply may not
# decode into readable text.
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
    'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
    'Accept-Encoding':'gzip, deflate, br',
    'Accept-Language':'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2'
}

url = input("请输入你想保存的图片的网址：")
response = requests.get(url, headers=headers)
print(response)
print(response.status_code)

file = input("请输入你想图片保存在的文件夹名称：")

# Create the directory that will hold the images. With exist_ok=True an
# already existing directory is accepted instead of raising FileExistsError.
os.makedirs(f'./{file}', exist_ok=True)

html = response.text
# Capture the (non-greedy) text between '"thumbURL":"' and the closing '",'.
image_url_list = re.findall(r'"thumbURL":"(.*?)",', html, re.S)

q = 0
# A dedicated loop name keeps the page URL in `url` from being overwritten.
for image_url in image_url_list:
    res = requests.get(image_url)
    picture = res.content
    q += 1
    # os.path.join picks the separator of the running OS; a hard-coded
    # backslash only works as a directory separator on Windows.
    with open(os.path.join(file, f'{q}.jpg'), mode='wb') as f:
        f.write(picture)

# 当用requests.get请求得到的源代码是html文件，每一行是一个标签时，可以用此代码

# Use this script when requests.get returns ordinary HTML in which every image
# is an <img> tag.
#
# Flow: ask for a page URL, fetch and parse it, ask for a target folder, then
# download the 'data-original' address of every <img> found inside
# <div class="taotu-main"> into ./图片/<folder>/ as 1.jpg, 2.jpg, ...

import os

import requests
from bs4 import BeautifulSoup

# Browser-like headers sent with the page request.
# NOTE(review): 'br' is advertised in Accept-Encoding; confirm a Brotli decoder
# is installed in the environment, otherwise a Brotli-compressed reply may not
# decode into readable text.
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
    'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
    'Accept-Encoding':'gzip, deflate, br',
    'Accept-Language':'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2'
}

url = input("请输入你想保存的图片的网址：")
response = requests.get(url, headers=headers)
print(response)
print(response.status_code)

file = input("请输入你想图片保存在的文件夹名称：")

# Force UTF-8 decoding of the body before it is handed to the parser.
response.encoding = 'utf-8'
soup = BeautifulSoup(response.text, 'html.parser')

# Create the output directory; an already existing one is accepted.
os.makedirs(f'./图片/{file}', exist_ok=True)

lis = soup.find_all('div', class_="taotu-main")
print("*********")

q = 0
for index, container in enumerate(lis):
    # Show only the first matching container, as a quick look at the markup.
    if index == 0:
        print(container)
    for img in container.find_all('img'):
        pic = img.get('data-original')
        print(pic)
        # An <img> without 'data-original' yields None; requesting that would
        # raise and abort the whole run, so skip it.
        if not pic:
            continue
        res = requests.get(pic)
        picture = res.content
        q += 1
        # os.path.join picks the separator of the running OS; a hard-coded
        # backslash only works as a directory separator on Windows.
        with open(os.path.join('图片', file, f'{q}.jpg'), mode='wb') as f:
            f.write(picture)