The script is currently written in a procedural style; I'll refactor it into an object-oriented design another day. The code is commented, so I won't walk through each function here.
Pass the start and end page numbers to createNewSet and the crawl begins: it extracts the girls' profile links from each listing page and stores them in a set (which deduplicates them), parses each profile page for image URLs, and finally saves the images locally, with each filename prefixed by the girl's account name.
from bs4 import BeautifulSoup
import re
import urllib2
import os, sys

def getHtml(url):
    # Fetch a page with browser-like headers so the site treats us as a normal visitor.
    send_headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36',
        'Referer': 'http://www.dbmeinv.com'
    }
    req = urllib2.Request(url, headers=send_headers)
    page = urllib2.urlopen(req)
    return page.getcode(), page.read()

def getGirlInfo(html_cont):
    # Parse a profile page for the girl's account name and her full-size image nodes.
    soup = BeautifulSoup(html_cont, 'html.parser', from_encoding="utf-8")
    name_node = soup.find('li', class_='name')
    image_nodes = soup.find_all('img', src=re.compile(r'large/[a-zA-Z0-9]+\.jpg'))
    return name_node.get_text(), image_nodes

def saveImages(girl_name, image_nodes):
    # Download every image node and save it as <girl_name>-<count>.jpg.
    count = 0
    for image_node in image_nodes:
        image_url = image_node['src']
        try:
            print 'Getting image: %s' % image_url
            img = urllib2.urlopen(image_url)
        except urllib2.HTTPError, e:
            if hasattr(e, "code"):
                print e.code
            if hasattr(e, "reason"):
                print e.reason
            print "Error: image %s download failed!" % image_url
            continue  # img is undefined on failure, so skip this image
        else:
            print "OK"
        data = img.read()
        file_path = girl_name + '-' + str(count) + '.jpg'
        if not os.path.exists(file_path):
            image_file = open(file_path, 'wb')
            image_file.write(data)
            image_file.close()
            print 'Saved as %s' % file_path
        else:
            print "File %s exists!" % file_path
        count = count + 1  # count every image, or later ones would collide with this name
    return True

def getGirlNodes(html_cont):
    # Collect the profile links (<a class="link">) from a listing page.
    soup = BeautifulSoup(html_cont, 'html.parser', from_encoding="utf-8")
    girl_nodes = soup.find_all('a', class_='link')
    return girl_nodes

def createNewSet(start, end):
    # Crawl listing pages start..end and collect every girl's profile
    # link into a set, which deduplicates links seen on several pages.
    link_set = set()
    for i in range(start, end + 1):
        url = 'http://www.dbmeinv.com/?pager_offset=' + str(i)
        page_code, page = getHtml(url)
        print 'Searching page %s' % url,
        if int(page_code) == 200:
            print page_code, "OK"
            girl_nodes = getGirlNodes(page)
            print 'Adding links to the girl set'
            for girl_node in girl_nodes:
                girl_url = girl_node['href']
                link_set.add(girl_url)
        else:
            print page_code, "Error"
    return link_set

def downloadImage(girl_set):
    # Visit each profile link, parse it and save all of that girl's images.
    for girl_link in girl_set:
        print
        try:
            print 'Getting information: %s' % girl_link
            page_code, page = getHtml(girl_link)
        except urllib2.HTTPError, e:
            if hasattr(e, "code"):
                print e.code
            if hasattr(e, "reason"):
                print e.reason
            print "Error: getting information failed.\nLink: %s" % girl_link
        else:
            name, image_nodes = getGirlInfo(page)
            print 'Getting %s\'s images.' % name
            saveImages(name, image_nodes)
    print
    return True

if __name__ == '__main__':
    # Default to pages 1..2 when no arguments are given.
    page_start = 1
    page_end = 2
    if len(sys.argv) > 2:
        try:
            page_start = int(sys.argv[1])
            page_end = int(sys.argv[2])
        except ValueError:
            print "Please input two integers.\nUsage: python %s <start> <end>" % sys.argv[0]
            exit(0)
        if page_start > page_end:
            page_start, page_end = page_end, page_start
    girl_set = createNewSet(page_start, page_end)
    print
    downloadImage(girl_set)

Directions for improvement:
Crawling the links one by one in a single thread is too slow; it could be made multithreaded, as sketched below;
With multiple threads sharing one link container, thread synchronization comes into play and needs some study; the sketch below sidesteps it with a thread-safe queue;
With many threads the server may start refusing requests, so proxies would be needed; see the proxy sketch at the end.
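
A minimal sketch of the multithreaded direction, staying with Python 2 to match the script above. The worker function, the thread count, and reusing downloadImage for a single link are my assumptions, not part of the original; Queue.Queue does its own locking, so several threads can safely share one link container:

import threading
from Queue import Queue, Empty  # Queue.Queue is thread-safe; no explicit lock needed

def worker(link_queue):
    # Each worker pulls profile links off the shared queue until it runs dry.
    while True:
        try:
            girl_link = link_queue.get_nowait()
        except Empty:
            return
        try:
            downloadImage(set([girl_link]))  # reuse the existing function for one link
        finally:
            link_queue.task_done()

def downloadImagesThreaded(girl_set, num_threads=4):
    # Fill a thread-safe queue from the link set, then let the workers drain it.
    link_queue = Queue()
    for girl_link in girl_set:
        link_queue.put(girl_link)
    threads = [threading.Thread(target=worker, args=(link_queue,)) for _ in range(num_threads)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()

Four threads is an arbitrary starting point; too many concurrent connections is exactly what gets a crawler blocked, which leads to the last item.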
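As for proxies, urllib2 supports them through ProxyHandler; a rough sketch, where the proxy address is a placeholder rather than a working server:

# Hypothetical proxy address; replace it with a real HTTP proxy.
proxy_handler = urllib2.ProxyHandler({'http': 'http://127.0.0.1:8087'})
opener = urllib2.build_opener(proxy_handler)
urllib2.install_opener(opener)  # later urllib2.urlopen calls go through the proxy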