# coding: utf-8  (must be on the first or second line to take effect)
'''
Created on 2013-2-1
@author: 蒲文辉
@mailto: pwh0996@gmail.com
Runtime environment: Python 3
'''
import re
import urllib.request
import os
import threading
import time
from html.parser import HTMLParser
try:
    # HTMLParseError was deprecated in Python 3.3 and removed in 3.5;
    # provide a stub so the except clauses below keep working.
    from html.parser import HTMLParseError
except ImportError:
    class HTMLParseError(Exception):
        pass
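# Overall flow: the main loop pops a URL from UrlSrc, spawns a ScratchFactory
# thread that downloads the page, feeds it to MyHtmlParser to extract links
# and image URLs, pushes new links back onto UrlSrc, and hands the images to
# a separate downloader thread. Crawl state (UrlSrc / UrlDiged) is written to
# disk periodically so an interrupted run can be resumed.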
class MyHtmlParser(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.url = []      # collected <a href> links
        self.img = []      # collected image URLs
        self.title = ''    # set to 1 while inside <title>, then replaced by its text
        self.charset = ''  # charset sniffed from <meta>
    def handle_starttag(self, tag, attrs):
        if tag == "a":
            for i in attrs:
                if i[0] == "href":
                    self.url.append(i[1])
        elif tag == "meta":
            for i in attrs:
                # str.find returns -1 when absent (truthy), so the original
                # "i[1].find('charset')" test was inverted; use "in" instead.
                if (i[0] == 'content' and 'charset' in str(i[1])) or i[0] == 'charset':
                    if re.match(r'.*(GB2312|gb2312).*', str(i[1])):
                        self.charset = 'gb2312'
                    elif re.match(r'.*(utf-8|UTF-8).*', str(i[1])):
                        self.charset = 'utf-8'
                    elif re.match(r'.*(gbk|GBK).*', str(i[1])):
                        self.charset = 'gbk'
        elif tag == "title":
            self.title = 1  # flag: the next data event carries the title text
        # Any attribute value that looks like an image URL is collected,
        # whatever the tag ("jepg" in the original pattern was a typo).
        for i in attrs:
            if re.match(r'http://.+\.(jpg|jpeg|png)', str(i[1])):
                self.img.append(i[1])
    def handle_data(self, data):
        if self.title == 1:
            self.title = data
        # Image URLs can also appear in plain text or scripts; grab those too.
        self.img += re.findall(r'http://.+?\.jpg', data)
    def handle_startendtag(self, tag, attrs):
        if tag == "a":
            for i in attrs:
                if i[0] == "href":  # the original compared i[1] == "href", which never matches
                    self.url.append(i[1])
        elif tag == "meta":
            for i in attrs:
                if (i[0] == 'content' and 'charset' in str(i[1])) or i[0] == 'charset':
                    if re.match(r'.*(GB2312|gb2312).*', str(i[1])):
                        self.charset = 'gb2312'
                    elif re.match(r'.*(utf-8|UTF-8).*', str(i[1])):
                        self.charset = 'utf-8'
                    elif re.match(r'.*(gbk|GBK).*', str(i[1])):
                        self.charset = 'gbk'
        for i in attrs:
            if re.match(r'http://.+\.(jpg|jpeg|png)', str(i[1])):
                self.img.append(i[1])
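# A minimal usage sketch of MyHtmlParser (illustrative only; ScratchFactory
# below drives it the same way on real pages):
#
#     p = MyHtmlParser()
#     p.feed('<title>demo</title><a href="/page.html">x</a>')
#     # p.title == 'demo', p.url == ['/page.html']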
class ScratchFactory(threading.Thread):
    def __init__(self, url):
        threading.Thread.__init__(self)
        self.url = url
        self.tempImgs = []
        self.tempUrls = []
        self.title = ''
        self.pwd = ''  # "current directory" of this page, for resolving relative links
        global seed
        # re.escape keeps the dots in the seed from acting as wildcards.
        match = re.search(re.escape(seed) + '.*/', url)
        if match:
            self.pwd = match.group()
    def addHeader(self, data):
        # Turn relative links into absolute ones.
        global seed
        for i in range(0, len(data)):
            if re.match("http.+", data[i]) is None:
                if re.match("/.*", data[i]):
                    data[i] = seed + data[i]           # root-relative: /a/b.html
                elif re.match(r'\./.*', data[i]):      # the original './.*' matched any string whose 2nd char is '/'
                    data[i] = self.pwd + data[i][2:]   # explicit ./a.html
                else:
                    data[i] = self.pwd + data[i]       # bare relative: a.html
        return data
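    # Note: urllib.parse.urljoin in the standard library performs this kind
    # of resolution more robustly, e.g. urljoin('http://host/a/', './x')
    # yields 'http://host/a/x'; addHeader is kept as the original hand-rolled
    # variant, which only handles the three shapes it matches.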
    def run(self):
        try:
            conect = urllib.request.urlopen(self.url)  # fetch the page
            data = conect.read()
            conect.close()
            htmlx = MyHtmlParser()
            htmlx.feed(data[:500].decode('utf-8', 'ignore'))
            t = htmlx.charset  # charset sniffed from the first 500 bytes
            if t == '':
                t = 'gb2312'
            htmlx.reset()
            htmlx.feed(data.decode(t, 'ignore'))
            # Guard against pages with no <title>, where the flag stays an int.
            self.title = htmlx.title if isinstance(htmlx.title, str) else ''
            self.tempUrls = self.addHeader(htmlx.url)  # absolutize relative links
            self.tempImgs = self.addHeader(htmlx.img)
            htmlx.close()
            self.clearData()  # drop irrelevant links
            threading.Thread(target=self.saveImages, args=()).start()  # download images
        except HTMLParseError as e:
            print("####Error : 1 ######:", e, '--->', self.url)
        except Exception as e:
            print("####Error : 2 ######:", e, '--->', self.url)
        global UrlSrc, UrlDiged, mLock
        mLock.acquire()
        # Enqueue only links that are neither crawled nor already queued.
        t = []
        for temp in self.tempUrls:
            if temp not in UrlDiged and temp not in UrlSrc:
                t.append(temp)
        UrlSrc += t
        mLock.release()
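    # Charset detection above is a two-pass trick: the first 500 bytes are
    # decoded leniently just to read the <meta charset> declaration, then the
    # whole document is re-decoded with the declared codec (gb2312 as the
    # fallback) before the real parse.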
    def clearData(self):
        # De-duplicate links and images.
        self.tempUrls = set(self.tempUrls)
        self.tempImgs = set(self.tempImgs)
        global seed
        t = []
        for temp in self.tempUrls:  # keep only links under the seed site
            if re.match(re.escape(seed) + "/.*", temp):
                t.append(temp)
        self.tempUrls = t
        t = []
        for temp in self.tempImgs:  # keep only image-looking URLs
            if re.match(r".+\.(gif|jpg|png)", temp):  # the original ".+.(...)" left the dot unescaped
                t.append(temp)
        self.tempImgs = t
        # Keep the part of the title before the first '-', then strip
        # characters that are illegal or awkward in file names.
        self.title = self.title.split('-')[0]
        for ch in ' /\\:|?*<>\r\n\t':
            self.title = self.title.replace(ch, '')
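    # Equivalent one-liner for the sanitizing loop above:
    #     self.title = re.sub(r'[ /\\:|?*<>\r\n\t]', '', self.title)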
    def save(self, path, url):
        global MinSize
        try:
            req = urllib.request.Request(url)
            # The original joined this string with a backslash continuation,
            # which embedded the next line's leading spaces into the UA value.
            req.add_header("User-Agent", "Mozilla/5.0 (Windows NT 6.1) "
                           "AppleWebKit/537.11 (KHTML, like Gecko) "
                           "Chrome/23.0.1271.64 Safari/537.11")
            # Some sites block hotlinking, so send the page itself as Referer.
            req.add_header("Referer", self.url)
            conect = urllib.request.urlopen(req)
            t = conect.read()
            conect.close()
            if len(t) < MinSize:  # skip thumbnails and icons
                return
            if not os.path.exists(path):
                os.mkdir(path)
            f = open(os.path.join(path, self.title +
                     time.strftime("%H%M%S", time.localtime()) + ".jpg"), "wb")
            f.write(t)
            f.close()
        except HTMLParseError as e:
            print("####Error : 3 ######:", e, '--->', url)
        except Exception as e:
            print("####Error : 4 ######:", e, '--->', url)
    def saveImages(self):
        global IMG_TIME
        global SAVE_PATH
        if len(self.tempImgs) == 0:
            return
        path = os.path.join(SAVE_PATH, self.title)
        print("Download------->", self.title)
        while len(self.tempImgs) != 0:
            t = threading.Thread(target=self.save,
                                 args=(path, self.tempImgs.pop(0)))
            if len(self.tempImgs) != 0:
                t.start()
                time.sleep(IMG_TIME)  # throttle: one image per IMG_TIME seconds
            else:
                t.start()
                t.join()  # wait for the last one before checking the directory
        if os.path.exists(path) and len(os.listdir(path)) == 0:
            os.rmdir(path)  # everything was filtered out; remove the empty folder
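    # Each image gets its own short-lived thread, spaced IMG_TIME seconds
    # apart; only the last thread is joined, so the empty-directory cleanup
    # runs after the final download attempt (earlier threads may still be in
    # flight, a race the original code tolerates).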
def save():
    # Persist the crawl state: one URL per line.
    global mLock
    global UrlSrc
    global SAVE_PATH
    mLock.acquire()
    try:
        f = open(os.path.join(SAVE_PATH, "UrlDiged.txt"), 'w')
        for i in UrlDiged:
            f.write(i + '\n')
        f.close()
        f = open(os.path.join(SAVE_PATH, "UrlSrc.txt"), 'w')
        for i in UrlSrc:
            f.write(i + '\n')
        f.close()
        print("********************* Saved **********************")
    except Exception as e:
        print(e)
    finally:
        mLock.release()
def readBackup():
    # Restore the crawl state written by save().
    global UrlDiged
    global UrlSrc
    try:
        f = open(os.path.join(SAVE_PATH, "UrlDiged.txt"), 'r')
        for line in f:
            UrlDiged.append(line.replace('\n', ''))
        f.close()
        f = open(os.path.join(SAVE_PATH, "UrlSrc.txt"), 'r')
        for line in f:
            UrlSrc.append(line.replace('\n', ''))
        f.close()
    except Exception as e:
        print(e)
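# The backup format is deliberately trivial: one URL per line, rewritten
# wholesale every couple of minutes by save(), so a crash loses at most the
# last save interval of progress.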
#*****************************start********************************
if __name__ == '__main__':
    #timeout = 20
    #socket.setdefaulttimeout(timeout)
    seed = "http://www.xxx.com/"  # <- root page of the target site
    SAVE_PATH = r"d:\scratch"     # <- storage directory
    THREAD_NUM = 35    # <- thread cap, to limit download speed and avoid a DDoS-like load
    SLEEP_TIME = 2.5   # <- delay between page requests (seconds); faster is not always better!
    MinSize = 32000    # <- skip images smaller than this (initially 32 KB)
    IMG_TIME = 1.5     # <- image download pacing (initially one every 1.5 s)
    UrlSrc = [seed]    # links discovered but not yet crawled
    UrlDiged = []      # links already crawled
    mLock = threading.Lock()  # guards UrlSrc and UrlDiged
    savetime = time.time()
    if not os.path.exists(SAVE_PATH):
        os.mkdir(SAVE_PATH)
    if seed[-1:] == '/':
        seed = seed[:-1]  # normalize: drop the trailing slash
    # Restore the state of the previous run, if any.
    if not os.path.exists(os.path.join(SAVE_PATH, 'UrlDiged.txt')):
        try:
            f = open(os.path.join(SAVE_PATH, 'UrlDiged.txt'), 'w')
            f.close()
            f = open(os.path.join(SAVE_PATH, 'UrlSrc.txt'), 'w')
            f.close()
        except Exception as e:
            print(e)
    else:
        readBackup()
    while True:
        if len(threading.enumerate()) > THREAD_NUM:
            time.sleep(0.1)  # avoid busy-spinning while at the thread cap
            continue
        mLock.acquire()
        if len(UrlSrc):
            temp = UrlSrc.pop(0)
            t = ScratchFactory(temp)
            UrlDiged.append(temp)
            t.start()
        mLock.release()
        # Report queue size, live threads, and total table length (in thousands).
        print("Connections:", len(UrlSrc), "*****threads:",
              len(threading.enumerate()), "****TableLength:",
              (len(UrlSrc) + len(UrlDiged)) / 1000)
        # Checkpoint the crawl state roughly every two minutes.
        if time.localtime().tm_min % 2 == 0 and time.time() - savetime > 60:
            save()
            savetime = time.time()
        time.sleep(SLEEP_TIME)
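# To run: set seed to the target site's root URL and SAVE_PATH to a writable
# directory, then start the script (e.g. "python scraper.py"; the file name
# is hypothetical). Stop with Ctrl+C; the next start resumes from the
# UrlDiged.txt / UrlSrc.txt backups.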
# Reposted from: https://my.oschina.net/u/212831/blog/306075