K站 动漫头像 爬虫 python

由于要训练 U-GAT-IT 模型,需要采集一些 K 站(konachan.net)的动漫图片。程序分为两个部分:采集器负责采集图片地址并存入 redis,下载器负责多线程下载这些链接。
采集器

## -*- coding: utf-8 -*-
import requests
from lxml import etree
from lxml.etree import tostring
import redis
from urllib import parse
# Header-style dict (Accept / User-Agent / Connection) that getPageUrl and
# downPic pass to requests.get for every page they fetch.
header={"Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1.1 Safari/605.1.15",
        "Connection":"keep-alive"
        }
# Site root; downPic resolves each collected href against it with parse.urljoin.
baseUrl='http://konachan.net'
def createurl(pagenum,baseurl,start=500) :
    """Build the list of listing-page URLs to crawl.

    Args:
        pagenum: Exclusive upper bound of the page numbers.
        baseurl: URL template with a single ``%s`` placeholder that receives
            the page number.
        start: First page number (inclusive). Defaults to 500, the value
            that used to be hard-coded, so existing callers are unaffected.

    Returns:
        A list with one URL per page in ``range(start, pagenum)``; empty
        when ``pagenum <= start``.
    """
    urllist = [baseurl % str(page) for page in range(start, pagenum)]
    print("collection page",urllist)
    return urllist
def getPageUrl(urllist) -> (list):
    """Collect the post-page links found on each listing page.

    Args:
        urllist: Iterable of listing-page URLs to fetch.

    Returns:
        A flat list of the ``href`` values of every
        ``#post-list-posts > li > div > a`` link, in page order.
    """
    pageUrlList=[]
    for p in urllist:
        # Must be a keyword argument: the second positional parameter of
        # requests.get is ``params``, so passing the dict positionally would
        # append it to the query string instead of sending it as headers.
        rs=requests.get(p,headers=header)
        dom=etree.HTML(rs.content)
        # Guard against a parse that yields no tree (e.g. an empty body).
        nodes=dom.xpath('//*[@id="post-list-posts"]/li/div/a/@href') if dom is not None else []
        pageUrlList.extend(nodes)
        print("now page%s  collection complete %s" %(p,pageUrlList))
    return pageUrlList
def downPic(pageUrlList) -> None:
    """Resolve each post page to its image URL and store it in redis.

    Despite the name, nothing is downloaded here: for every post link the
    page is fetched, the ``src`` of the element with id ``image`` is read,
    and that URL is added to the redis set ``picUrl``.

    Args:
        pageUrlList: Iterable of post-page hrefs; each is joined against
            ``baseUrl``.

    Returns:
        None.
    """
    # Change to your own redis host/port.
    # One pool/client for the whole run instead of a new pool per page.
    pool = redis.ConnectionPool(host='192.168.1.13', port=32771)
    con = redis.Redis(connection_pool=pool)
    try:
        for m in pageUrlList:
            url=parse.urljoin(baseUrl,m)
            # Keyword argument on purpose: the second positional parameter
            # of requests.get is ``params``, not ``headers``.
            rs=requests.get(url,headers=header)
            dom=etree.HTML(rs.text)
            Imgurl=dom.xpath('//*[@id="image"]/@src') if dom is not None else []
            print("collection information address %s"%Imgurl)
            if not Imgurl:
                # Page without an #image element: skip it rather than
                # raising IndexError and aborting the whole run.
                continue
            con.sadd("picUrl", Imgurl[0])
    finally:
        con.close()
        # The pool was created explicitly, so release its sockets explicitly.
        pool.disconnect()
# Collector entry point (runs whenever this code is executed, there is no
# __main__ guard): build the listing-page URLs for pages 500-999, gather the
# post links found on those pages, then store each post's image URL in redis.
urllist= createurl(1000,"http://konachan.net/post?page=%s")
pageUrlList= getPageUrl(urllist)
downPic(pageUrlList)

下载器

import redis
import uuid
#con.sadd("caomaoX","qiao312")
import re
def getclient(host='192.168.1.11', port=32768):
    """Create a redis client backed by its own connection pool.

    Args:
        host: Redis server host. Defaults to the previously hard-coded
            address; change it (or pass your own) for your environment.
        port: Redis server port. Defaults to the previously hard-coded port.

    Returns:
        A ``redis.Redis`` instance bound to a new ``redis.ConnectionPool``.
    """
    pool = redis.ConnectionPool(host=host, port=port)
    return redis.Redis(connection_pool=pool)

import requests
import time
import threading
import queue
# Module-level list. init() creates its own local list with the same name, so
# none of the code visible here reads or writes this one.
urls=[]

def  init(name,q,lock):
    """Refill the work queues from redis with the URLs of one title.

    Pops one member from the redis set ``title``, then pops every member of
    the set stored under that title and puts them on ``q``. Afterwards the
    popped title (``None`` when the ``title`` set is exhausted) is put on
    ``name`` ``num`` times.

    Args:
        name: Queue that receives the title, ``num`` copies per call.
        q: Queue that receives the URLs.
        lock: Not used inside this function; fetch_img_func calls init
            while holding it.

    Returns:
        The same ``(name, q)`` pair that was passed in.
    """
    urls=[]
    redisClient=getclient()
    try:
        i = redisClient.spop("title")
        if i is not None:
            # Use the popped value itself as the key. The old
            # ``str(i)[2:-1]`` trick only round-trips ASCII bytes and, when
            # no title was left, queried an unrelated set named "n".
            urls.extend(iter(lambda: redisClient.spop(i), None))
    finally:
        redisClient.close()
    for url in urls:
        q.put(url)
    # Report after filling so the number reflects what was just queued.
    print("获取队列数%s" %(q.qsize()))
    # ``num`` is the module-level worker count assigned in the __main__ block.
    for m in range(num):
        name.put(i)

    return name,q

def fetch_img_func(name,q,redisClient,lock):
    """Worker loop: take URLs from ``q`` and save each image under ``pic/``.

    When ``q`` is empty the worker takes ``lock`` and calls ``init`` to
    refill it. If the refill yields nothing the worker returns, so the
    threads (and the ``join`` calls waiting on them) can finish.

    Args:
        name: Queue of titles filled by ``init``; one entry is taken as the
            filename prefix whenever this worker has none.
        q: Queue of image URLs filled by ``init``.
        redisClient: Not used for any command here; it is closed once when
            the worker exits.
        lock: Lock serialising refills of ``q`` across workers.

    Returns:
        None.
    """
    import os  # local import: the file's import block does not provide os

    # open() below fails if the target directory is missing.
    os.makedirs('pic', exist_ok=True)
    imagename=None
    try:
        while True:
            while q.empty():
                # ``with`` releases the lock even if init raises, so the
                # other workers cannot deadlock on it.
                with lock:
                    # Re-check under the lock: another worker may have
                    # refilled the queue while this one was waiting.
                    if not q.empty():
                        break
                    imagename = None
                    name,q=init(name,q,lock)
                    if q.empty():
                        # Refill produced nothing: stop instead of spinning
                        # on redis forever.
                        # NOTE(review): a title whose URL set is missing
                        # also leaves q empty, so this worker stops then
                        # too - confirm titles are never published without
                        # URLs.
                        return
            if imagename is None:
                imagename = name.get()
                if isinstance(imagename, bytes):
                    # Avoid "b'...'" ending up in the file name.
                    imagename = imagename.decode('utf-8', errors='replace')
            try:
                # Non-blocking read: another worker may have taken the last
                # item after the emptiness check above.
                url = q.get_nowait()
            except queue.Empty:
                continue
            i = q.qsize()
            print("当前还有%s个任务"% i ,url,threading.current_thread().name)

            try:
                # Context manager returns the streamed connection to the pool.
                with requests.get(url, stream=True) as res:
                    if res.status_code == 200:
                        save_img_path ='pic/%s'%imagename +str(uuid.uuid1())+".jpg"
                        # Save the downloaded image.
                        with open(save_img_path, 'wb') as fs:
                            for chunk in res.iter_content(1024):
                                fs.write(chunk)
                        print("保存成功~ %s"%(imagename))
            except Exception as e:
                # Best effort, as before: report the failure and move on to
                # the next URL rather than killing the worker.
                print(e)
    finally:
        redisClient.close()
if __name__ == '__main__':
    # Number of worker threads; init() reads this module-level name.
    num = 10
    # q carries image URLs, name carries the title used as filename prefix.
    q = queue.Queue()
    name = queue.Queue()
    lock = threading.Lock()
    start = time.time()
    threads = [
        threading.Thread(
            target=fetch_img_func,
            args=(name, q, getclient(), lock),
            name="child_thread_%s" % worker_no,
        )
        for worker_no in range(num)
    ]
    # Start every worker before waiting on any of them.
    for worker in threads:
        worker.start()
    for worker in threads:
        worker.join()
    # Elapsed wall-clock seconds for the whole run.
    print(time.time()-start)

采集了一些图片之后,还要用 OpenCV 截取头像。
在这里插入图片描述

评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值