# 千图网电商淘宝素材网址 (58pic.com e-commerce / Taobao material listing URL): https://www.58pic.com/piccate/3-0-0-p1.html
from urllib import request
import urllib
import random
from urllib.error import URLError
from urllib.request import ProxyHandler, build_opener
import re
def get_ip(path='ip.txt'):
    """Return one proxy address picked at random from the proxy pool file.

    Args:
        path: Text file holding one proxy address per line. Defaults to
            ``'ip.txt'``, resolved against the current working directory.

    Returns:
        A randomly chosen non-blank line with surrounding whitespace removed.

    Raises:
        OSError: If the file cannot be opened.
        IndexError: If the file contains no non-blank lines.
    """
    # The context manager closes the file handle even if reading fails.
    with open(path, 'r') as pool_file:
        # Skip blank lines so an empty string is never returned as a proxy.
        candidates = [line.strip() for line in pool_file if line.strip()]
    return random.choice(candidates)
# Pick one proxy from the pool and register it for both HTTP and HTTPS requests.
proxy = get_ip()
proxy_handler = ProxyHandler({
    'http': 'http://' + proxy,
    'https': 'https://' + proxy,
})
opener = build_opener(proxy_handler)
# Building an opener alone does not affect urllib.request.urlopen() or
# urlretrieve(); install it globally so those calls actually use the proxy.
urllib.request.install_opener(opener)
import threading
class One(threading.Thread):
    """Thread that downloads the preview images of listing pages 1 and 3.

    For every page it fetches the listing HTML, extracts the image URL
    prefixes that follow a ``class="thumb-box"`` marker, appends the suffix
    that completes the high-definition image address, and saves each image
    as ``<page><index>.jpg`` under ``SAVE_DIR``.

    Args:
        pages: Iterable of page numbers to crawl. Defaults to
            ``range(1, 5, 2)``, i.e. pages 1 and 3.
    """

    # Listing page URL; ``{}`` is replaced by the page number.
    PAGE_URL = 'http://www.58pic.com/piccate/3-0-0-p{}.html'
    # Captures the image URL up to (not including) the literal ``.jpg!``.
    # Compiled once, with the dot escaped so it only matches a real ``.``.
    IMAGE_PATTERN = re.compile(r'class="thumb-box".*?src="(.*?)\.jpg!')
    # Appended to the captured prefix to form the full high-definition URL.
    IMAGE_SUFFIX = '.jpg!/fw/1024/watermark/url/L2ltYWdlcy93YXRlcm1hcmsveGlhb3R1LnBuZw==/align/center'
    # Directory the images are written to (must already exist).
    SAVE_DIR = 'D:/软件(学习)/Python/Test/chapter6/qiantu.photo/'

    def __init__(self, pages=range(1, 5, 2)):
        super().__init__()
        self.pages = pages

    def run(self):
        """Download every matched image of each page in ``self.pages``.

        Network failures are reported by printing the error reason. A page
        that cannot be fetched is skipped and the remaining pages are still
        processed; a failed image download does not stop the other images.
        """
        for page in self.pages:
            try:
                # ``with`` closes the HTTP response once the body is read.
                with urllib.request.urlopen(self.PAGE_URL.format(page)) as response:
                    html = response.read().decode('utf-8', 'ignore')
            except URLError as err:
                print(err.reason)
                continue
            for index, prefix in enumerate(self.IMAGE_PATTERN.findall(html)):
                image_url = prefix + self.IMAGE_SUFFIX
                target = '{}{}{}.jpg'.format(self.SAVE_DIR, page, index)
                try:
                    urllib.request.urlretrieve(image_url, target)
                except URLError as err:
                    print(err.reason)
                else:
                    print('第{}页第{}个图片成功'.format(page, index))
# 采用多线程的方法 (multithreaded approach: the listing pages are split across two worker threads)
class Two(threading.Thread):
    """Thread that downloads the preview images of listing pages 2 and 4.

    For every page it fetches the listing HTML, extracts the image URL
    prefixes that follow a ``class="thumb-box"`` marker, appends the suffix
    that completes the high-definition image address, and saves each image
    as ``<page><index>.jpg`` under ``SAVE_DIR``.

    Args:
        pages: Iterable of page numbers to crawl. Defaults to
            ``range(2, 5, 2)``, i.e. pages 2 and 4.
    """

    # Listing page URL; ``{}`` is replaced by the page number.
    PAGE_URL = 'http://www.58pic.com/piccate/3-0-0-p{}.html'
    # Captures the image URL up to (not including) the literal ``.jpg!``.
    # Compiled once, with the dot escaped so it only matches a real ``.``.
    IMAGE_PATTERN = re.compile(r'class="thumb-box".*?src="(.*?)\.jpg!')
    # Appended to the captured prefix to form the full high-definition URL.
    IMAGE_SUFFIX = '.jpg!/fw/1024/watermark/url/L2ltYWdlcy93YXRlcm1hcmsveGlhb3R1LnBuZw==/align/center'
    # Directory the images are written to (must already exist).
    SAVE_DIR = 'D:/软件(学习)/Python/Test/chapter6/qiantu.photo/'

    def __init__(self, pages=range(2, 5, 2)):
        super().__init__()
        self.pages = pages

    def run(self):
        """Download every matched image of each page in ``self.pages``.

        Network failures are reported by printing the error reason. A page
        that cannot be fetched is skipped and the remaining pages are still
        processed; a failed image download does not stop the other images.
        """
        for page in self.pages:
            try:
                # ``with`` closes the HTTP response once the body is read.
                with urllib.request.urlopen(self.PAGE_URL.format(page)) as response:
                    html = response.read().decode('utf-8', 'ignore')
            except URLError as err:
                print(err.reason)
                continue
            for index, prefix in enumerate(self.IMAGE_PATTERN.findall(html)):
                image_url = prefix + self.IMAGE_SUFFIX
                target = '{}{}{}.jpg'.format(self.SAVE_DIR, page, index)
                try:
                    urllib.request.urlretrieve(image_url, target)
                except URLError as err:
                    print(err.reason)
                else:
                    print('第{}页第{}个图片成功'.format(page, index))
# Create both downloader threads, then launch them so they run concurrently.
one, two = One(), Two()
for worker in (one, two):
    worker.start()
# 爬取成功!!! (Crawl succeeded!)