python爬取网页图片并下载保存本地
以http://www.baidu.com为例
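新建项目,目录结构如下:入口脚本 start.py;配置文件 conf/settings.py;modules/ 目录下的 my_request.py、is_file_exist.py、get_picture.py;以及用于存放结果的 database/ 和 pictures/ 文件夹。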
start.py
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import modules.my_request
import modules.is_file_exist
import modules.get_picture
import conf.settings
# Page whose content is fetched and scanned for PNG links.
TARGET_URL = 'http://www.baidu.com'


def main():
    """Run the whole pipeline once for TARGET_URL.

    Steps, in order: make sure the response file exists, save the page body
    to it, append the response metadata to the result file, extract the PNG
    links from the saved page, and download each linked picture.
    """
    modules.is_file_exist.is_file_exist(conf.settings.my_file)
    modules.my_request.my_request_content(TARGET_URL)
    modules.my_request.my_request_result(TARGET_URL)
    # modules.get_picture.read_all()  # optional: print the saved page
    modules.get_picture.read_pictures()
    modules.get_picture.download_pictures()


# Guard the entry point so importing this module does not trigger network
# requests and file writes.
if __name__ == "__main__":
    main()
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import os
import sys
import platform
# Project root: the parent of the directory that contains this file.
# os.path.dirname handles the native separator on every platform and, unlike
# splitting the path on a separator and re-joining the pieces, never collapses
# to "" (POSIX) or a bare drive such as "C:" (Windows) when the package lives
# directly under the filesystem root.
BASE_DIR = os.path.dirname(os.path.abspath(os.path.dirname(__file__)))

# Directory that holds the generated data files.
database_path = os.path.join(BASE_DIR, "database")

if platform.system() == "Windows":
    # The root is only echoed on Windows, as before.
    print('BASE_DIR'+BASE_DIR)

# File that stores the raw HTTP response body.
# Note on os.path.join: if a later argument is an absolute path, every
# argument before it is discarded.
my_file = os.path.join(database_path, "http.txt")
# print(my_file)

# File that stores the picture links extracted from the response.
my_picfile = os.path.join(database_path, "my_picfile.txt")
# print(my_picfile)

# File that stores the request result (status, headers, cookies).
my_result = os.path.join(database_path, "my_result.csv")

# Directory for downloaded pictures. The trailing "/" is intentional: the
# downloader builds file names by plain string concatenation onto this value.
my_pictures = os.path.join(BASE_DIR, "pictures/")
print(my_pictures)
以上为 conf/settings.py 的内容。database 目录下的文件(http.txt、my_picfile.txt、my_result.csv)由程序自动生成,只需事先手动创建 database 文件夹。
get_picture.py
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import re
import conf.settings
import modules.is_file_exist
import urllib.request
def read_all():
    """Print every non-blank line of the saved HTTP response.

    Reads ``conf.settings.my_file`` as UTF-8 and prints each line that is not
    just a newline character. Lines are printed as read, so each keeps its
    own trailing newline.
    """
    # The context manager closes the file even if printing raises, and
    # iterating the handle streams lines instead of loading them all at once.
    with open(conf.settings.my_file, "r", encoding='UTF-8') as page:
        for line in page:
            if line != '\n':
                print(line)
def read_pictures():
    """Extract PNG links from the saved response and append them to the link file.

    Each line of ``conf.settings.my_file`` is scanned with the pattern
    ``http.*?png``; only the first match on a line is kept. Every link found
    is printed and appended, one per line, to ``conf.settings.my_picfile``.
    The link file is opened in append mode, so repeated runs add to it.
    When nothing matches, the link file is left untouched.
    """
    link_pattern = re.compile('http.*?png', re.S)

    # Collect first, write afterwards: the input handle is released promptly
    # and the output file is prepared and opened once instead of per match.
    links = []
    with open(conf.settings.my_file, "r", encoding='UTF-8') as page:
        for line in page:
            found = link_pattern.search(line)
            if found:
                links.append(found.group(0))

    if not links:
        return

    modules.is_file_exist.is_file_exist(conf.settings.my_picfile)
    with open(conf.settings.my_picfile, "a", encoding='UTF-8') as link_file:
        for link in links:
            print(link)
            link_file.write(link + '\n')
def download_pictures():
    """Download every link listed in the link file into the pictures directory.

    Reads ``conf.settings.my_picfile`` line by line. The picture for the line
    at zero-based index ``i`` is saved as ``<my_pictures><i>.png``. Blank
    lines are skipped but still consume an index, so file names stay aligned
    with line positions. A failed download is reported and skipped; it does
    not stop the remaining downloads.
    """
    import os  # local import: this module's header does not import os

    # Without the target directory every write below would fail.
    os.makedirs(conf.settings.my_pictures, exist_ok=True)

    with open(conf.settings.my_picfile, "r", encoding='UTF-8') as link_file:
        for index, line in enumerate(link_file):
            url = line.strip()
            if not url:
                # e.g. the newline the link file is initialised with
                continue
            target = conf.settings.my_pictures + str(index) + ".png"
            try:
                # Fetch before opening the target so a failed request does
                # not leave an empty .png behind.
                with urllib.request.urlopen(url) as response:
                    data = response.read()
                with open(target, 'wb') as image:
                    image.write(data)
            except Exception as e:
                # Best-effort per link: report the cause and carry on.
                print(url + " error: " + str(e))
    print("All pictures have been download!")
is_file_exist.py
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import os
import conf.settings
# print('is_file_exist'+conf.settings.my_file)
# print('is_file_exist'+conf.settings.my_picfile)
def is_file_exist(is_file):
    """Make sure *is_file* exists and is not empty.

    The file is opened in append mode, which creates it when it is missing
    and leaves existing content alone. If the file is empty, a single newline
    is written as its initial content.

    Args:
        is_file: Path of the file to create or initialise.

    Raises:
        OSError: If the file cannot be opened, for example because its parent
            directory does not exist.
    """
    with open(is_file, "a") as handle:
        # Always measure the file that was passed in. Previously any path
        # other than the two known settings paths was judged by the size of
        # the result file instead.
        if os.path.getsize(is_file) == 0:
            handle.write("\n")
my_request.py
# encoding:utf-8
import urllib.request
import conf.settings
import requests
def my_request_content(url):
    """Fetch *url* and store the response body in ``conf.settings.my_file``."""
    destination = conf.settings.my_file
    urllib.request.urlretrieve(url, filename=destination)
def my_request_result(url, timeout=10):
    """Request *url* and append the call result to ``conf.settings.my_result``.

    The appended lines are, in order: status code, final URL, the full header
    mapping, five selected headers (Cache-Control, Content-Encoding,
    Content-Type, Date, Connection) and the cookies. Values are joined with
    commas and are not quoted, so a value that itself contains commas spans
    several columns.

    Args:
        url: Address to request with HTTP GET.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``.

    Raises:
        requests.exceptions.RequestException: If the request fails or times out.
    """
    response = requests.get(url, timeout=timeout)
    headers = response.headers

    lines = [
        "status_code" + ',' + str(response.status_code) + '\n',
        u"url" + ',' + str(response.url) + '\n',
        "headers" + ',' + str(headers) + '\n',
    ]
    # A server is free to omit any of these headers; write an empty value
    # instead of failing with KeyError.
    for name in ("Cache-Control", "Content-Encoding", "Content-Type",
                 "Date", "Connection"):
        lines.append("," + name + "," + headers.get(name, "") + '\n')
    lines.append("cookie," + str(response.cookies) + '\n')

    # The context manager flushes and closes the file even if a write fails.
    with open(conf.settings.my_result, "a", encoding='utf-8') as result_file:
        result_file.writelines(lines)
    # Also available on the response object: response.cookies (cookie info),
    # response.text (page source as text), response.content (raw bytes).
效果: