To train a U-GAT-IT model I need to collect a batch of anime images (the code below scrapes konachan.net). The job is split into two parts: a collector that scrapes the pages, extracts the direct image links and pushes them into Redis, and a downloader that pulls the links from Redis and downloads them with multiple threads.
Collector
# -*- coding: utf-8 -*-
import requests
from lxml import etree
import redis
from urllib import parse

header = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1.1 Safari/605.1.15",
    "Connection": "keep-alive",
}
baseUrl = 'http://konachan.net'

def createurl(pagenum, baseurl):
    # Build the list of listing-page URLs (pages 500 .. pagenum-1).
    urllist = []
    for i in range(500, pagenum):
        urllist.append(baseurl % str(i))
    print("collection pages", urllist)
    return urllist

def getPageUrl(urllist) -> list:
    # Visit every listing page and collect the detail-page link of each post.
    pageUrlList = []
    for p in urllist:
        rs = requests.get(p, headers=header)
        dom = etree.HTML(rs.content)
        nodes = dom.xpath('//*[@id="post-list-posts"]/li/div/a/@href')
        for m in nodes:
            pageUrlList.append(m)
        print("page %s collected, %s links so far" % (p, len(pageUrlList)))
    return pageUrlList

def downPic(pageUrlList):
    # Open every detail page, extract the direct image URL and push it into a Redis set.
    pool = redis.ConnectionPool(host='192.168.1.13', port=32771)  # change to your own Redis
    con = redis.Redis(connection_pool=pool)
    for m in pageUrlList:
        url = parse.urljoin(baseUrl, m)
        rs = requests.get(url, headers=header)
        dom = etree.HTML(rs.text)
        Imgurl = dom.xpath('//*[@id="image"]/@src')
        if not Imgurl:
            continue
        print("collected image address %s" % Imgurl[0])
        con.sadd("picUrl", Imgurl[0])  # the set automatically deduplicates URLs
    con.close()

urllist = createurl(1000, "http://konachan.net/post?page=%s")
pageUrlList = getPageUrl(urllist)
downPic(pageUrlList)
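Before running the downloader it is worth checking how many unique links actually made it into Redis. A small sanity check, assuming the same Redis host/port as in the collector above:

import redis

con = redis.Redis(host='192.168.1.13', port=32771)
print("unique image URLs collected:", con.scard("picUrl"))   # size of the set
print("sample entries:", con.srandmember("picUrl", 3))       # peek at a few URLs
con.close()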
Downloader

The downloader pops a set name from a Redis set called "title", drains that set of image URLs into a local queue, and then downloads the queued URLs with 10 worker threads.
import os
import queue
import threading
import time
import uuid

import redis
import requests


def getclient():
    # Build a Redis client; change host/port to your own instance.
    pool = redis.ConnectionPool(host='192.168.1.11', port=32768)
    con = redis.Redis(connection_pool=pool)
    return con


def init(name, q, lock):
    # Refill the work queue: pop one set name from "title", then drain that set
    # of image URLs into the queue. The set name is also handed to every thread
    # through the "name" queue so it can be reused in the saved file names.
    urls = []
    redisClient = getclient()
    i = redisClient.spop("title")
    if i is not None:
        title = i.decode()
        while True:
            imglisti = redisClient.spop(title)
            if imglisti is None:
                break
            urls.append(imglisti.decode())
    redisClient.close()
    print("queue size before refill: %s" % q.qsize())
    for url in urls:
        q.put(url)
    if i is not None:
        for _ in range(num):  # num is the thread count defined in __main__
            name.put(i.decode())
    return name, q


def fetch_img_func(name, q, redisClient, lock):
    imagename = None
    while True:
        # If the queue is empty, one thread refills it while the others wait on the lock.
        while q.empty():
            lock.acquire()
            if q.empty():
                imagename = None
                name, q = init(name, q, lock)
                lock.release()
                if q.empty():
                    # Redis has nothing left either: stop this thread.
                    redisClient.close()
                    return
            else:
                lock.release()
                break
        try:
            if imagename is None:
                imagename = name.get()
            url = q.get()  # read the next image URL from the queue
            i = q.qsize()
        except Exception as e:
            print(e)
            break
        print("%s tasks left" % i, url, threading.current_thread().name)
        res = None
        try:
            res = requests.get(url, stream=True)
        except Exception as e:
            print(e)
        if res is not None and res.status_code == 200:
            # Save the downloaded image under pic/ with a unique file name.
            save_img_path = 'pic/%s%s.jpg' % (imagename, uuid.uuid1())
            with open(save_img_path, 'wb') as fs:
                for chunk in res.iter_content(1024):
                    fs.write(chunk)
            print("saved %s" % save_img_path)


if __name__ == '__main__':
    os.makedirs('pic', exist_ok=True)  # make sure the output directory exists
    q = queue.Queue()
    name = queue.Queue()
    num = 10  # number of threads
    threads = []
    lock = threading.Lock()
    start = time.time()
    for i in range(num):
        t = threading.Thread(target=fetch_img_func, args=(name, q, getclient(), lock),
                             name="child_thread_%s" % i)
        threads.append(t)
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    print(time.time() - start)
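Note that the collector writes everything into a single set called picUrl, while the downloader expects a set called title holding the names of the URL sets to drain. A minimal way to connect the two, assuming both scripts point at the same Redis instance (the two snippets above use different host/port values, so adjust one of them), is to register picUrl once before starting the downloader:

import redis

con = redis.Redis(host='192.168.1.13', port=32771)  # the Redis the collector wrote to
con.sadd("title", "picUrl")  # tell the downloader which set holds the image URLs
con.close()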
With a batch of images collected, the next step is to use OpenCV to crop out the faces (see the sketch below).
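A minimal cropping sketch, assuming the downloaded images sit in pic/ and that nagadomi's lbpcascade_animeface.xml cascade file has been downloaded next to the script (OpenCV's standard frontal-face cascades perform poorly on anime faces):

import os
import cv2

# Assumption: lbpcascade_animeface.xml is present in the working directory.
cascade = cv2.CascadeClassifier("lbpcascade_animeface.xml")

os.makedirs("faces", exist_ok=True)
for fname in os.listdir("pic"):
    img = cv2.imread(os.path.join("pic", fname))
    if img is None:
        continue
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    gray = cv2.equalizeHist(gray)
    faces = cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5, minSize=(64, 64))
    for idx, (x, y, w, h) in enumerate(faces):
        face = img[y:y + h, x:x + w]
        face = cv2.resize(face, (256, 256))  # U-GAT-IT is usually trained on 256x256 crops
        cv2.imwrite(os.path.join("faces", "%s_%d.jpg" % (os.path.splitext(fname)[0], idx)), face)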