1.代码
代码如下(示例):
import random
import requests
import re
import time
import os
import threading
from lxml import etree
# Upper bound on simultaneously running download threads: the main loop
# acquires one slot before starting a worker and the worker releases it.
thread_lock = threading.BoundedSemaphore(200)

# Output directory template (a folder created on the desktop for the
# pictures); `{}` is filled in with the sub-folder name.
Address1 = r'C:\Users\***\Desktop\新建文件夹 (2)\{}'

# Request headers, copied from the browser's DevTools "Network" tab.
# Without a browser-like User-Agent the target site is very likely to
# treat the requests as an attack.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36',
}

# Listing page that is crawled.
url_main = 'https://*******/index.php/art/type/id/30.html'
def jiexi_main(url):
    '''
    Download a listing page and extract the links to its detail pages.

    :param url: address of the listing page to fetch.
    :return: list of ``href`` attribute values matched by the XPath below.
    :raises requests.exceptions.RequestException: if the request fails or
        does not complete within the timeout.
    '''
    # A timeout keeps a stalled server from blocking the crawl forever;
    # the ``with`` block returns the connection to the pool when done.
    with requests.get(url, headers=headers, timeout=10) as response1:
        res_xpath = etree.HTML(response1.text)
    urls = res_xpath.xpath('/html/body/div[2]/div[3]/div/div/ul/li/a/@href')
    # Debug output: shows whether the links are absolute or whether the
    # caller has to prepend the site address.
    print(urls)
    return urls
def jiexi_url(url):
    '''
    Download a single detail page and return the image URLs found in it.

    :param url: address of the detail page to fetch.
    :return: list of the ``src`` values of every ``<img src="..."/>`` tag
        matched by the regular expression below.
    :raises requests.exceptions.RequestException: if the request fails or
        does not complete within the timeout.
    '''
    # ``with`` guarantees the response is closed even if reading the body
    # raises; the timeout prevents an unresponsive server from hanging us.
    with requests.get(url, headers=headers, timeout=10) as response:
        html = response.text
    urls = re.findall('<img src="(.*?)"/>', html)  # capture the image addresses
    print(urls)  # debug output: shows whether the extraction succeeded
    return urls
def download_pics(url, n, path):
    '''
    Download one picture into ``path`` and release one ``thread_lock`` slot.

    Intended to run as a thread target: the caller acquires ``thread_lock``
    before starting the thread and this function always releases it, even
    when the download or the file write fails.

    :param url: address of the picture to download.
    :param n: running number of the picture, used only in progress messages.
    :param path: directory the picture is written to.
    :return: None
    '''
    try:
        time.sleep(1)  # small delay before each request
        # The file is named after the last path segment of the URL.
        file_name = url.split('/')[-1]
        # Only add ".jpg" when the name has no extension of its own, so
        # "abc.jpg" is not saved as "abc.jpg.jpg".
        if not os.path.splitext(file_name)[1]:
            file_name += '.jpg'
        response = requests.get(url, headers=headers, timeout=30)
        # Do not store an HTTP error page as if it were an image.
        response.raise_for_status()
        with open(os.path.join(path, file_name), 'wb') as f:
            f.write(response.content)
        print('下载完成第{}张图片'.format(n))
    except (requests.exceptions.RequestException, OSError) as err:
        # Report the failure instead of killing the worker with a traceback.
        print('第{}张图片下载失败: {} ({})'.format(n, url, err))
    finally:
        # Always give the slot back; otherwise failed downloads would use up
        # the semaphore and the main thread would block on acquire().
        thread_lock.release()
def main_1():
    '''
    Download every picture linked from the listing page ``url_main``.

    Each detail page found on the listing page is parsed for image URLs and
    every image is downloaded in its own thread, throttled by
    ``thread_lock``. The elapsed time is printed once all downloads have
    finished.

    :return: None
    '''
    start = time.time()
    # NOTE: an earlier attempt looped over several listing pages with
    # url_main.format(index) for index in range(4, 10); it did not work and
    # is not part of this function.
    url_main_pic = jiexi_main(url_main)
    file_name = '8'  # name of the sub-folder the pictures are stored in
    path = Address1.format('./' + file_name)
    # Creates missing parent folders too and tolerates an existing folder.
    os.makedirs(path, exist_ok=True)
    n = 0
    workers = []
    for url_py in url_main_pic:
        time.sleep(0.5)  # pause between detail-page requests
        try:
            # The site address is prepended to each listing link.
            pic_urls = jiexi_url('https://dj66.pw' + url_py)
        except requests.exceptions.RequestException as err:
            # One page failing (e.g. connection reset by the server) should
            # not abort the whole crawl; report it and continue.
            print('页面解析失败,已跳过: {} ({})'.format(url_py, err))
            continue
        for url in pic_urls:
            n += 1
            print('正在下载_{}'.format(n))
            # Take a slot; download_pics releases it when it finishes.
            thread_lock.acquire()
            t = threading.Thread(target=download_pics, args=(url, n, path))
            t.start()
            workers.append(t)
    # Wait for all downloads so the timing below covers the real work and
    # not just the time needed to start the threads.
    for t in workers:
        t.join()
    end = time.time()
    print('Running time: %s Seconds' % (end - start))  # total run time
def main_2():
    '''
    Download every picture found on one detail page.

    The page address is hard-coded in ``url_py``; each image is downloaded
    in its own thread, throttled by ``thread_lock``. The elapsed time is
    printed once all downloads have finished.

    :return: None
    '''
    # Change this address to download the pictures of a different page.
    url_py = 'https://******/index.php/art/detail/id/35231.html'
    start = time.time()
    file_name = '5'  # name of the sub-folder the pictures are stored in
    pic_urls = jiexi_url(url_py)
    path = Address1.format('./' + file_name)
    # Creates missing parent folders too and tolerates an existing folder.
    os.makedirs(path, exist_ok=True)
    workers = []
    for n, url in enumerate(pic_urls, start=1):
        # Take a slot; download_pics releases it when it finishes.
        thread_lock.acquire()
        t = threading.Thread(target=download_pics, args=(url, n, path))
        t.start()
        workers.append(t)
    # Wait for all downloads so the timing below covers the real work and
    # not just the time needed to start the threads.
    for t in workers:
        t.join()
    end = time.time()
    print('Running time: %s Seconds' % (end - start))  # total run time
# Script entry point. Either main program can be run from here: main_1()
# crawls the whole listing page; replace the call with main_2() to download
# a single detail page instead.
if __name__ == '__main__':
    main_1()
2.运行结果
运行输出如下(示例):
C:\Users\lee\PycharmProjects\python_Pachong\venv\Scripts\python.exe C:/Users/lee/PycharmProjects/python_Pachong/tupian.py
['/index.php/art/detail/id/32605.html', '/index.php/art/detail/id/32604.html', '/index.php/art/detail/id/32603.html', '/index.php/art/detail/id/32602.html', '/index.php/art/detail/id/32601.html', '/index.php/art/detail/id/32551.html', '/index.php/art/detail/id/32550.html', '/index.php/art/detail/id/32549.html', '/index.php/art/detail/id/32548.html', '/index.php/art/detail/id/32547.html', '/index.php/art/detail/id/32504.html', '/index.php/art/detail/id/32503.html', '/index.php/art/detail/id/32502.html', '/index.php/art/detail/id/32501.html', '/index.php/art/detail/id/32500.html', '/index.php/art/detail/id/32454.html', '/index.php/art/detail/id/32453.html', '/index.php/art/detail/id/32452.html', '/index.php/art/detail/id/32451.html', '/index.php/art/detail/id/32450.html', '/index.php/art/detail/id/32400.html', '/index.php/art/detail/id/32399.html', '/index.php/art/detail/id/32398.html', '/index.php/art/detail/id/32397.html', '/index.php/art/detail/id/32396.html', '/index.php/art/detail/id/32352.html', '/index.php/art/detail/id/32351.html', '/index.php/art/detail/id/32350.html', '/index.php/art/detail/id/32349.html', '/index.php/art/detail/id/32348.html']
['https://mei.lbpicmt.com/upload/art/img/lctk/85f99c8dae715a22578e1f95873a930d.jpg', 'https://mei.lbpicmt.com/upload/art/img/lctk/6b0adeba5f567b47af2174d6cb9a6a52.jpg', 'https://mei.lbpicmt.com/upload/art/img/lctk/857742df813b50596189eb229c600208.jpg', 'https://mei.lbpicmt.com/upload/art/img/lctk/f99ebf01b38b74397d31edec5f5a3f03.jpg', 'https://mei.lbpicmt.com/upload/art/img/lctk/b21b4a95ad6fac05bb1693ac0b7b70e8.jpg', 'https://mei.lbpicmt.com/upload/art/img/lctk/f61e8cebf99028c552673680f1b5b5c8.jpg', 'https://mei.lbpicmt.com/upload/art/img/lctk/a355c9fc2ac0d3a6f2cb126002591d2b.jpg', 'https://mei.lbpicmt.com/upload/art/img/lctk/a2886729e6792e73382a82f8ffb8be89.jpg', 'https://mei.lbpicmt.com/upload/art/img/lctk/b7bd5589f078d9c2eed5ec65687ef88e.jpg', 'https://mei.lbpicmt.com/upload/art/img/lctk/2506d711d0949e5f73aa44e6995840ec.jpg', 'https://mei.lbpicmt.com/upload/art/img/lctk/6362cd69504fde74961f0f79666ddd21.jpg', 'https://mei.lbpicmt.com/upload/art/img/lctk/74c47d60954cf559cf42d6707daf23d0.jpg', 'https://mei.lbpicmt.com/upload/art/img/lctk/0379cfef889808eacd04da4543f656b5.jpg']
正在下载_1
正在下载_2
正在下载_3
正在下载_4
正在下载_5
正在下载_6
正在下载_7
正在下载_8
正在下载_9
正在下载_10
正在下载_11
正在下载_12
正在下载_13
['https://mei.lbpicmt.com/upload/art/img/lctk/98db56003c1d6392e94569102667cdf1.jpg', 'https://mei.lbpicmt.com/upload/art/img/lctk/33b57eaaf30a1c99042ad639af1dd591.jpg', 'https://mei.lbpicmt.com/upload/art/img/lctk/fc827e5d9f91bc293336c55b8b7fa8b6.jpg', 'https://mei.lbpicmt.com/upload/art/img/lctk/a5ff04d37b2811814408b76339270ed1.jpg', 'https://mei.lbpicmt.com/upload/art/img/lctk/6e79ac0cb5364c047ea77cbbfd946bea.jpg', 'https://mei.lbpicmt.com/upload/art/img/lctk/c6b8a0e80b7c7d80ab435ff1cc5ed683.jpg', 'https://mei.lbpicmt.com/upload/art/img/lctk/1275f9f465d24a4773253013b5348fd4.jpg', 'https://mei.lbpicmt.com/upload/art/img/lctk/f1dcf4d39e4dca88793d1e5bfff862b6.jpg', 'https://mei.lbpicmt.com/upload/art/img/lctk/bcba83d63cac2f4ab2c041693be8fee8.jpg', 'https://mei.lbpicmt.com/upload/art/img/lctk/0f50a5e66b76ed76abd0bb50019bf94e.jpg']
正在下载_14
正在下载_15
正在下载_16
正在下载_17
正在下载_18
正在下载_19
正在下载_20
正在下载_21
正在下载_22
正在下载_23
下载完成第7张图片
下载完成第8张图片
总结
可以爬取目标网站的图片,但仍然存在不少问题。
运行过程中有时会报以下错误:远程主机重置了连接(ConnectionResetError 10054),而程序没有做异常处理和重试,因此会在解析页面时直接中断:
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\lee\PycharmProjects\python_Pachong\venv\lib\site-packages\requests\adapters.py", line 439, in send
resp = conn.urlopen(
File "C:\Users\lee\PycharmProjects\python_Pachong\venv\lib\site-packages\urllib3\connectionpool.py", line 726, in urlopen
retries = retries.increment(
File "C:\Users\lee\PycharmProjects\python_Pachong\venv\lib\site-packages\urllib3\util\retry.py", line 403, in increment
raise six.reraise(type(error), error, _stacktrace)
File "C:\Users\lee\PycharmProjects\python_Pachong\venv\lib\site-packages\urllib3\packages\six.py", line 734, in reraise
raise value.with_traceback(tb)
File "C:\Users\lee\PycharmProjects\python_Pachong\venv\lib\site-packages\urllib3\connectionpool.py", line 670, in urlopen
httplib_response = self._make_request(
File "C:\Users\lee\PycharmProjects\python_Pachong\venv\lib\site-packages\urllib3\connectionpool.py", line 381, in _make_request
self._validate_conn(conn)
File "C:\Users\lee\PycharmProjects\python_Pachong\venv\lib\site-packages\urllib3\connectionpool.py", line 978, in _validate_conn
conn.connect()
File "C:\Users\lee\PycharmProjects\python_Pachong\venv\lib\site-packages\urllib3\connection.py", line 362, in connect
self.sock = ssl_wrap_socket(
File "C:\Users\lee\PycharmProjects\python_Pachong\venv\lib\site-packages\urllib3\util\ssl_.py", line 384, in ssl_wrap_socket
return context.wrap_socket(sock, server_hostname=server_hostname)
File "C:\Users\lee\AppData\Local\Programs\Python\Python38-32\lib\ssl.py", line 500, in wrap_socket
return self.sslsocket_class._create(
File "C:\Users\lee\AppData\Local\Programs\Python\Python38-32\lib\ssl.py", line 1040, in _create
self.do_handshake()
File "C:\Users\lee\AppData\Local\Programs\Python\Python38-32\lib\ssl.py", line 1309, in do_handshake
self._sslobj.do_handshake()
urllib3.exceptions.ProtocolError: ('Connection aborted.', ConnectionResetError(10054, '远程主机强迫关闭了一个现有的连接。', None, 10054, None))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:/Users/lee/PycharmProjects/python_Pachong/tupian.py", line 125, in <module>
main_1()
File "C:/Users/lee/PycharmProjects/python_Pachong/tupian.py", line 83, in main_1
pic_urls = jiexi_url('https://dj66.pw' + url_py)
File "C:/Users/lee/PycharmProjects/python_Pachong/tupian.py", line 36, in jiexi_url
response = requests.get(url, headers=headers)
File "C:\Users\lee\PycharmProjects\python_Pachong\venv\lib\site-packages\requests\api.py", line 76, in get
return request('get', url, params=params, **kwargs)
File "C:\Users\lee\PycharmProjects\python_Pachong\venv\lib\site-packages\requests\api.py", line 61, in request
return session.request(method=method, url=url, **kwargs)
File "C:\Users\lee\PycharmProjects\python_Pachong\venv\lib\site-packages\requests\sessions.py", line 530, in request
resp = self.send(prep, **send_kwargs)
File "C:\Users\lee\PycharmProjects\python_Pachong\venv\lib\site-packages\requests\sessions.py", line 643, in send
r = adapter.send(request, **kwargs)
File "C:\Users\lee\PycharmProjects\python_Pachong\venv\lib\site-packages\requests\adapters.py", line 498, in send
raise ConnectionError(err, request=request)
requests.exceptions.ConnectionError: ('Connection aborted.', ConnectionResetError(10054, '远程主机强迫关闭了一个现有的连接。', None, 10054, None))