# 代码 (code listing):
#先导入需要的模块
import time
import requests
import re
import os
# --- Load the list of image URLs from the source file ---
# NOTE: the path is a raw string — "\爬" / "\微" are invalid escape
# sequences in a normal string literal (DeprecationWarning today,
# SyntaxError in future Python versions). The raw form has the same value.
line = []  # list[str]: URLs parsed from the file (empty if the file is empty)
with open(r"D:\爬虫资料\微博图片.txt", encoding='utf-8') as file:
    for raw_line in file:  # iterate the file lazily, line by line
        # Each line is comma-separated; drop the trailing element
        # (the empty fragment / newline left after the final comma).
        line = raw_line.split(",")[0:-1]
# As in the original script, each iteration OVERWRITES `line`, so after the
# loop it holds only the URLs from the LAST line of the file — presumably the
# file is a single long comma-separated line. TODO(review): confirm; if the
# file has many lines, accumulate with `line.extend(...)` instead.
# --- Crawl each URL and save the response body to disk ---
# HTTP proxy for all requests; only plain-http URLs are proxied.
# TODO(review): verify the proxy is still alive before a long run.
proxies = {"http": "http://101.132.111.208:8082"}

# Target directory for downloads — create it once, up front, instead of
# re-checking inside every loop iteration. exist_ok avoids a race/except.
dir_path = "D://photo//"
os.makedirs(dir_path, exist_ok=True)

# Resume from index 14587 — presumably where a previous run stopped.
# TODO(review): consider making this resume point a variable/argument.
for url in line[14587::]:
    time.sleep(1)  # throttle: one request per second, be polite to the host
    # Last path segment of the URL names the local file.
    file_name = url.split('/')[-1]
    path = dir_path + file_name
    try:
        if os.path.exists(path):
            print("文件已存在")
            continue  # already downloaded — skip without a network request

        # Fetch ONCE (the original fetched every URL twice: once to parse,
        # once to save). timeout prevents a dead proxy hanging the run.
        response = requests.get(url, proxies=proxies, timeout=10)
        response.raise_for_status()  # surface HTTP 4xx/5xx as exceptions

        # Extract <img> sources from the page. NOTE(review): this result was
        # never used in the original script either — decide whether these
        # nested image URLs should be downloaded instead of the page itself.
        url_photos = re.findall(
            '<img style=".*?" src="(.*?)" width=".*?" height=".*?">',
            response.text,
        )

        # Save the raw response body.
        with open(path, 'wb') as f:
            f.write(response.content)
        print("爬取成功")
    except (requests.RequestException, OSError) as e:
        # Narrow handling: network/HTTP failures and filesystem errors only,
        # so programming errors are not silently swallowed.
        print("爬取失败" + str(e))