# 代码 (code listing):
#先导入需要的模块
import time
import requests
import re
import os
# --- Load the list of image URLs from the source file ---
# NOTE: the path is a raw string — "\爬" / "\微" are invalid escape
# sequences in a normal string literal (DeprecationWarning today,
# SyntaxError in future Python versions). The raw form has the same value.
line = []  # list[str]: URLs parsed from the file (empty if the file is empty)
with open(r"D:\爬虫资料\微博图片.txt", encoding='utf-8') as file:
    for raw_line in file:  # iterate the file lazily, line by line
        # Each line is comma-separated; drop the trailing element
        # (the empty fragment / newline left after the final comma).
        line = raw_line.split(",")[0:-1]
# As in the original script, each iteration OVERWRITES `line`, so after the
# loop it holds only the URLs from the LAST line of the file — presumably the
# file is a single long comma-separated line. TODO(review): confirm; if the
# file has many lines, accumulate with `line.extend(...)` instead.
# --- Crawl each URL and save the response body to disk ---
# HTTP proxy for all requests; only plain-http URLs are proxied.
# TODO(review): verify the proxy is still alive before a long run.
proxies = {"http": "http://101.132.111.208:8082"}

# Target directory for downloads — create it once, up front, instead of
# re-checking inside every loop iteration. exist_ok avoids a race/except.
dir_path = "D://photo//"
os.makedirs(dir_path, exist_ok=True)

# Resume from index 14587 — presumably where a previous run stopped.
# TODO(review): consider making this resume point a variable/argument.
for url in line[14587::]:
    time.sleep(1)  # throttle: one request per second, be polite to the host
    # Last path segment of the URL names the local file.
    file_name = url.split('/')[-1]
    path = dir_path + file_name
    try:
        if os.path.exists(path):
            print("文件已存在")
            continue  # already downloaded — skip without a network request

        # Fetch ONCE (the original fetched every URL twice: once to parse,
        # once to save). timeout prevents a dead proxy hanging the run.
        response = requests.get(url, proxies=proxies, timeout=10)
        response.raise_for_status()  # surface HTTP 4xx/5xx as exceptions

        # Extract <img> sources from the page. NOTE(review): this result was
        # never used in the original script either — decide whether these
        # nested image URLs should be downloaded instead of the page itself.
        url_photos = re.findall(
            '<img style=".*?" src="(.*?)" width=".*?" height=".*?">',
            response.text,
        )

        # Save the raw response body.
        with open(path, 'wb') as f:
            f.write(response.content)
        print("爬取成功")
    except (requests.RequestException, OSError) as e:
        # Narrow handling: network/HTTP failures and filesystem errors only,
        # so programming errors are not silently swallowed.
        print("爬取失败" + str(e))