这几天闲着没事,写了个 Python 爬虫,专门把堆糖上的摄影类图片爬下来。
废话不多说,直接上代码,不用解释应该也能看懂。
#coding: utf-8
# 抓堆糖摄影图片
from html.parser import HTMLParser
import urllib.request
import string
import queue
from datetime import datetime
import os
# Pages waiting to be crawled; queue.Queue hands them back first-in, first-out.
queue_url = queue.Queue()

# Origin that gets prepended to the site-relative links found in pages.
site = 'http://www.duitang.com'
# Path prefix under which downloaded pictures are saved.
savePath = '/home/michael/Pictures/'

# Log file names (relative to the working directory).
url_log, img_log, err_log = 'urls.log', 'imgs.log', 'errors.log'

# Stage labels written into the "action" column of the URL log.
action = ("connecting", "downloading", "parsing")
class MyHTMLParser(HTMLParser):
    """Parser that saves large images and follows Duitang photography links.

    For every ``<img>`` whose declared width and height are both greater than
    ``MIN_SIZE`` the picture is downloaded under ``savePath`` and a row is
    appended to the image log.  When ``follow`` is true, ``<a>`` links are
    followed as well: ``/people/mblog/`` pages are fetched immediately to
    pick up the full-size image, and ``/category/photography/`` pages are
    pushed onto ``queue_url`` for the main loop.

    Args:
        strict: Accepted for backward compatibility only; it is not used.
        follow: Whether ``<a>`` links should be followed.
    """

    # A picture is kept only if both declared dimensions exceed this value.
    MIN_SIZE = 300
    # Size assumed when a tag declares none; it is below MIN_SIZE, so images
    # without explicit dimensions are skipped.
    DEFAULT_SIZE = 200
    # Links already handled.  Shared by all instances because detail pages are
    # parsed by fresh parser objects; without it, category pages that link to
    # each other would be queued over and over.
    _seen_links = set()

    def __init__(self, strict, follow=True):
        super().__init__()
        self.follow = follow

    @staticmethod
    def _to_pixels(value):
        """Return *value* as a float, or None if it is not a plain number."""
        if isinstance(value, str):
            value = value.strip().lower()
            if value.endswith("px"):
                value = value[:-2]
        try:
            return float(value)
        except (TypeError, ValueError):
            # e.g. "100%", "auto", or an attribute given without a value
            return None

    @classmethod
    def _first_visit(cls, link):
        """Record *link* and report whether this is the first time it is seen."""
        if link in cls._seen_links:
            return False
        cls._seen_links.add(link)
        return True

    def handle_starttag(self, tag, attrs):
        """Dispatch ``<img>`` and (when following) ``<a>`` start tags."""
        if tag == "img":
            self._handle_img(attrs)
        elif tag == "a" and self.follow:
            self._handle_link(attrs)

    def _handle_img(self, attrs):
        """Download the image described by *attrs* if it is large enough."""
        imgurl = None
        width = height = self.DEFAULT_SIZE
        for name, value in attrs:
            if name == 'src':
                imgurl = value
            elif name == 'width':
                width = value
                print('width=%s' % width)
            elif name == 'height':
                height = value
                print('height=%s' % height)
        if not imgurl:
            return

        w = self._to_pixels(width)
        h = self._to_pixels(height)
        if w is None or h is None or not (w > self.MIN_SIZE and h > self.MIN_SIZE):
            return

        print(imgurl)
        # Last path segment of the URL; also correct when there is no slash.
        filename = imgurl.rsplit("/", 1)[-1]
        if not filename:
            return
        try:
            # Download the picture to local disk.
            urllib.request.urlretrieve(imgurl, os.path.join(savePath, filename))
        except (OSError, ValueError) as exc:
            # A single broken image must not abort the whole crawl: record
            # the failure in the error log and carry on.
            with open(err_log, 'a', encoding='utf-8') as err_file:
                err_file.write("%s\t%s\t%s\t%s\n"
                               % (datetime.now(), imgurl, action[1], exc))
            return
        # Record the successful download in the image log.
        img_file.write("%s\t%s\t%s\t%s\t%s\n"
                       % (datetime.now(), current_url, filename, width, height))

    def _handle_link(self, attrs):
        """Follow a detail-page link or enqueue a category-page link."""
        href = next((value for name, value in attrs if name == 'href'), None)
        if not href:
            return
        if href.startswith("/people/mblog/"):
            # Detail page: fetch it right away to get the full-size image.
            target = "%s%s" % (site, href)
            if self._first_visit(target):
                get_img_in_url(target)
        elif href.startswith("/category/photography/"):
            target = '%s%s' % (site, href)
            if self._first_visit(target):
                queue_url.put(target)

    def handle_endtag(self, tag):
        """End tags carry nothing this crawler needs."""
        pass

    def handle_data(self, data):
        """Text content is ignored."""
        pass
# Headers sent with every page request; identifies the client as Firefox.
ua = {'User-agent': ('Mozilla/5.0 (X11; U; Linux i686) '
                     'Gecko/20071127 Firefox/2.0.0.11')}
def get_html(url_address, timeout=30):
    """Fetch *url_address* and return its body as text.

    Every attempt is recorded in the URL log: the timestamp, URL and action
    are written first, then ``YES`` or ``NO`` once the outcome is known.

    Args:
        url_address: Absolute URL of the page to fetch.
        timeout: Seconds to wait on the network before giving up.

    Returns:
        The decoded page, or None if the request or decoding failed.
    """
    url_file.write("%s\t%s\t%s\t" % (datetime.now(), url_address, action[2]))
    try:
        req = urllib.request.Request(url_address, headers=ua)
        # Open the URL exactly once and close the response when done.
        with urllib.request.urlopen(req, timeout=timeout) as response:
            # Honour the charset the server declares; fall back to UTF-8.
            charset = response.headers.get_content_charset() or 'utf-8'
            html = response.read().decode(charset)
    except Exception:
        # Deliberately best-effort: a page that cannot be fetched or decoded
        # is logged as failed and skipped so the crawl keeps going.
        url_file.write("%s\n" % ("NO"))
        return None
    url_file.write("%s\n" % ("YES"))
    return html
def get_img_in_url(url_address):
    """Fetch a single page and save its large images without following links."""
    page = get_html(url_address)
    if not page:
        return
    # follow=False: only <img> tags are acted on, <a> links are ignored.
    MyHTMLParser(strict=False, follow=False).feed(page)
# Parser used for listing pages; it follows links (follow defaults to True).
parser = MyHTMLParser(strict=False)

# Seed the crawl with the photography category front page.
url = "http://www.duitang.com/category/photography/"
queue_url.put(url)

# A log that does not exist yet gets a header row; an existing one is only
# appended to.  Mode 'a+' creates the file when it is missing.
_url_log_is_new = not os.path.isfile(url_log)
_img_log_is_new = not os.path.isfile(img_log)

# URL of the listing page being parsed; read by the parser when it logs images.
current_url = None

# The with-statement binds url_file and img_file as module globals (the other
# functions write to them) and guarantees both logs are flushed and closed
# even if the crawl is interrupted or fails.
with open(url_log, 'a+') as url_file, open(img_log, 'a+') as img_file:
    if _url_log_is_new:
        url_file.write("%s\t%s\t%s\t%s\n" % ("time", "url", "action", "success"))
    if _img_log_is_new:
        img_file.write("%s\t%s\t%s\t%s\t%s\n" % ("time", "url", "name", "width", "height"))

    # Drain the queue; parsing a page may push further category pages onto it.
    while not queue_url.empty():
        current_url = queue_url.get()
        html = get_html(current_url)
        if html:
            parser.feed(html)