python爬duitang的摄影类图片

最新推荐文章于 2024-09-24 10:12:47 发布

heyneo

最新推荐文章于 2024-09-24 10:12:47 发布

阅读量1.4k

点赞数 1

分类专栏： python 文章标签：爬虫图片

本文链接：https://blog.csdn.net/whoami021/article/details/25325701

版权

python 专栏收录该内容

4 篇文章 0 订阅

订阅专栏

这几天闲着没事，写了个 Python 爬虫，专门把堆糖上的摄影类图片爬下来。

废话不多说，直接上代码，不用解释应该也能看懂。

#coding: utf-8
# 抓堆糖摄影图片
from html.parser import HTMLParser
import urllib.request
import string
import queue
from datetime import datetime
import os

# Crawl frontier: category page URLs waiting to be fetched.
queue_url = queue.Queue()
site = "http://www.duitang.com"
# Path prefix under which downloaded images are saved.
savePath = "/home/michael/Pictures/"

# Log file names.
url_log, img_log, err_log = "urls.log", "imgs.log", "errors.log"

# Stage labels; get_html records one of them in the URL log.
action = ("connecting", "downloading", "parsing")

class MyHTMLParser(HTMLParser):
    """HTML parser that downloads large images and collects crawl links.

    For every ``<img>`` tag whose ``width`` and ``height`` attributes are both
    greater than ``MIN_SIZE``, the image is downloaded under ``savePath`` and a
    line is appended to ``img_file``.  When ``follow`` is true, ``<a>`` tags
    are inspected as well: ``/people/mblog/`` links are fetched immediately
    through ``get_img_in_url`` and ``/category/photography/`` links are put on
    ``queue_url``.
    """

    # An image is downloaded only when both dimensions exceed this value.
    MIN_SIZE = 300
    # Dimension assumed when an <img> tag omits width or height; it is below
    # MIN_SIZE, so images without explicit dimensions are skipped.
    DEFAULT_SIZE = 200

    def __init__(self, strict, follow=True):
        """Create the parser.

        Args:
            strict: Accepted so existing callers keep working; it is not used.
            follow: Whether ``<a>`` links should be followed or queued.
        """
        super().__init__()
        self.follow = follow

    def handle_starttag(self, tag, attrs):
        """Dispatch ``<img>`` tags and, when following, ``<a>`` tags."""
        if tag == "img":
            self._handle_img(attrs)
        elif tag == "a" and self.follow:
            self._handle_link(attrs)

    @staticmethod
    def _parse_size(value):
        """Return *value* as a float, or None when it is not a plain number."""
        try:
            return float(value)
        except (TypeError, ValueError):
            # Values such as "100%", "auto" or a valueless attribute (None).
            return None

    def _handle_img(self, attrs):
        """Download the image described by *attrs* if it is large enough."""
        imgurl = None
        width = self.DEFAULT_SIZE
        height = self.DEFAULT_SIZE
        for name, value in attrs:
            if name == 'src':
                imgurl = value
            elif name == 'width':
                width = value
                print('width=%s' % width)
            elif name == 'height':
                height = value
                print('height=%s' % height)
        if not imgurl:
            return

        w = self._parse_size(width)
        h = self._parse_size(height)
        # Written as "not (a and b)" so that NaN dimensions are rejected too.
        if w is None or h is None or not (w > self.MIN_SIZE and h > self.MIN_SIZE):
            return

        print(imgurl)
        # Keeps the leading "/" of the last path segment, as the log expects.
        name = imgurl[imgurl.rfind("/"):]
        try:
            # Download the image to the local disk.
            urllib.request.urlretrieve(imgurl, '%s%s' % (savePath, name))
        except (OSError, ValueError) as exc:
            # OSError covers network and file errors (URLError derives from
            # it); ValueError is raised for malformed or relative URLs.  One
            # bad image must not abort the whole crawl.
            print('failed to download %s: %s' % (imgurl, exc))
            return
        # Record the successful download in the image log.
        img_file.write("%s\t%s\t%s\t%s\t%s\n" % (datetime.now(), current_url,
                                                 name, width, height))

    def _handle_link(self, attrs):
        """Fetch detail pages at once and queue photography category pages."""
        hrefs = [value for name, value in attrs if name == 'href']
        # A valueless href attribute is reported as None by HTMLParser.
        if not hrefs or not hrefs[0]:
            return
        href = hrefs[0]
        if href.startswith("/people/mblog/"):
            # Detail page: pull the full-size image out of it right away.
            get_img_in_url("%s%s" % (site, href))
        elif href.startswith("/category/photography/"):
            queue_url.put('%s%s' % (site, href))

# Request headers used by get_html; the User-agent value is a desktop Firefox string.
ua = {'User-agent': 'Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11'}

def get_html(url_address):
    """Fetch *url_address* and return its body decoded as UTF-8.

    A line is appended to ``url_file`` recording the time, the URL, the
    action label and whether the fetch succeeded ("YES" or "NO").

    Args:
        url_address: URL of the page to download.

    Returns:
        The decoded page text, or None when the request or decoding failed.
    """
    url_file.write("%s\t%s\t%s\t" % (datetime.now(), url_address, action[2]))
    try:
        req = urllib.request.Request(url_address, headers=ua)
        # Open the URL exactly once and close the response when done.
        with urllib.request.urlopen(req) as response:
            html = response.read().decode('utf-8')
    except Exception:
        # Best-effort crawl: any network or decoding failure is recorded in
        # the log and reported to the caller as None.
        url_file.write("%s\n" % ("NO"))
        return None
    url_file.write("%s\n" % ("YES"))
    return html

def get_img_in_url(url_address):
    """Fetch the page at *url_address* and parse it without following its links."""
    page = get_html(url_address)
    if not page:
        # Nothing to parse: the fetch failed or the page was empty.
        return
    MyHTMLParser(strict=False, follow=False).feed(page)

# Parser for category listing pages; follow keeps its default of True.
parser = MyHTMLParser(strict=False)
url = "http://www.duitang.com/category/photography/"
queue_url.put(url)

# Check for the log files before opening them, so the header row is written
# only when a log is being created.
url_log_is_new = not os.path.isfile(url_log)
img_log_is_new = not os.path.isfile(img_log)

current_url = None
# URLs already taken from the queue.  Pages that link to each other would
# otherwise be queued again and again and the crawl would never finish.
visited = set()

# 'a+' creates a missing file and appends to an existing one; the with
# statement closes both logs even when the crawl stops on an error.
with open(url_log, 'a+') as url_file, open(img_log, 'a+') as img_file:
    if url_log_is_new:
        url_file.write("%s\t%s\t%s\t%s\n" % ("time", "url", "action", "success"))
    if img_log_is_new:
        img_file.write("%s\t%s\t%s\t%s\t%s\n" % ("time", "url", "name", "width", "height"))

    while not queue_url.empty():
        current_url = queue_url.get()
        if current_url in visited:
            continue
        visited.add(current_url)
        html = get_html(current_url)
        if html:
            # Drop any unparsed tail left over from the previous page before
            # feeding a new document into the shared parser.
            parser.reset()
            parser.feed(html)