# -*- coding: utf-8 -*-
"""
@author: Administrator
"""
from bs4 import BeautifulSoup as BS
#import tornado.httpclient
import urllib,urllib2
import socket
import time
def reporthook(block_read, block_size, total_size):
    """Progress callback passed to urllib.urlretrieve.

    Parameters:
        block_read  -- number of blocks transferred so far (0 on the first
                       call, i.e. when the connection has just been opened)
        block_size  -- size of one transfer block in bytes
        total_size  -- total size of the file in bytes, or a negative
                       value when the server did not report a length

    Prints a progress line to stdout and returns None.
    """
    if not block_read:
        # First callback: connection established, nothing transferred yet.
        print("connection opened")
        return
    if total_size < 0:
        # Unknown total size (no Content-Length header).
        print("read %d blocks (%dbytes)" % (block_read, block_read * block_size))
    else:
        print('Read %d blocks,or %d/%d' % (block_read, block_read * block_size, total_size))
    return
# Desktop Chrome/Opera User-Agent sent with every request so the site serves
# normal browser markup instead of rejecting the default urllib2 agent.
http_header = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36 OPR/22.0.1471.70'}
def fetch(www):
    """Download every image under the '#picture' element of an article page.

    www -- URL of the article page to scrape.

    Each image is saved into the local 'photos/' directory; the file name is
    the last 12 characters of the image URL with '/' replaced by '@'.
    A failed download of one image is reported and skipped so that it does
    not abort the rest of the page.
    """
    request = urllib2.Request(www, None, http_header)
    html = urllib2.urlopen(request).read()
    soup = BS(html)
    # CSS selector: <img> tags inside <p> tags under the element id="picture".
    for img in soup.select(" #picture > p > img "):
        src = img["src"]
        name = src[-12:].replace('/', '@')
        print(name)
        savepath = 'photos/' + name
        try:
            urllib.urlretrieve(src, savepath, reporthook=reporthook)
        except IOError as err:
            # Best-effort: skip this image but say why, instead of the old
            # bare "except: pass" that hid every possible error.
            print("failed to fetch %s: %s" % (src, err))
if __name__ == "__main__":
    # Give up on any connection that stalls for more than 30 seconds.
    socket.setdefaulttimeout(30)
    for article_id in range(4076, 4100):
        url = 'http://www.meizitu.com/a/%s.html' % article_id
        fetch(url)
        # Pause between pages to avoid hammering the server.
        time.sleep(2)
有不少行是注释,先留着,也作为是当时写代码的一些发散思路吧。
首先,刚刚接触爬虫,据传使用tornado.httpclient.HTTPClient,要比urllib等库更有效率,所以首先采用tornado,但是运行一段时间后发现,连接会常常因超时而被断开,仔细查看参数,并设置timeout后,问题依旧,无奈之下,换作urllib,几乎没有再出现断开的情况。
其次,BeautifulSoup4相比以前的版本变化较大,比如代码中用到的select选择器soup.select(" #picture > p > img "),可以很方便的取到id为picture下的p标记下的img属性,都说效率没有正则表达式高,但由于网站使用的是wordpress的cms,用Bs更容易定位到目标数据。但仍然存在一些bug,比如在取http://www.meizitu.com/a/3921.html,img的alt属性值中有个汉字“躶”,不知道是不是编码问题,导致取到的列表不完整,下一页也是因为这个字出现问题,如有兴趣的高手可以给些指点,不胜荣幸。
参考文档:
http://www.crummy.com/software/BeautifulSoup/bs4/doc/index.zh.html
https://docs.python.org/2/library/urllib.html