# 一个人无聊,写了个小爬虫爬取不可描述图片……
# 代码太短,就暂时先往这里贴一下做备份吧。
#! /usr/bin/python
import chardet
import urllib3
import uuid
import os
import logging
import time
import sys
import re
import threading
from bs4 import BeautifulSoup
"""
http://www.qiubaichengren.com/1.html
"""
class PageNotFoundException(BaseException):
    """Signals that a requested page answered with HTTP 404.

    Derives from BaseException rather than Exception, so handlers written
    as ``except Exception`` do not catch it; it has to be caught by name
    (or via ``except BaseException``).
    """
class ResponseStatusException(BaseException):
    """Signals an HTTP response whose status is neither 200 nor 404.

    The offending status code is passed as the exception argument.
    Derives from BaseException rather than Exception, so handlers written
    as ``except Exception`` do not catch it; it has to be caught by name
    (or via ``except BaseException``).
    """
class QiuBaiChengRenSpider:
    """Walk numbered list pages and download every image found on them.

    Pages are fetched one after another from a URL template; each image
    matched by the ``div.mala-text img`` selector is downloaded on its own
    thread and written below ``img_save_dir``.
    """

    # Connection pool shared by every request made by the spider.
    http_pool_manager = urllib3.PoolManager()
    # Prefix (directory, with trailing slash) prepended to every saved file name.
    img_save_dir = 'D:/QiuBaiChengRen/'
    logger = logging.getLogger('QiuBaiChengRenSpider')
    # How often list_visitor re-requests a list page whose download failed
    # before it stops crawling, and how long it waits between attempts.
    max_page_retries = 3
    retry_delay_seconds = 1
    # Characters removed from generated file names: the ones the original
    # pattern stripped (whitespace, '+', ',', quotes) plus the characters
    # Windows forbids in file names.
    _UNSAFE_FILENAME_CHARS = re.compile(r'[\s+,"\'\\/:*?<>|]')

    def __init__(self):
        self.init_log()

    def init_log(self):
        """Route the spider's log records to stdout at DEBUG level."""
        # The logger is shared by all instances; attaching a handler per
        # instance would print every record once per instance.
        if not self.logger.handlers:
            self.logger.addHandler(logging.StreamHandler(sys.stdout))
        self.logger.setLevel(logging.DEBUG)

    def get(self, url):
        """Fetch *url* with a GET request and return the response body.

        Returns:
            The raw response data on HTTP 200, or ``''`` when the request
            itself failed (the failure is logged).

        Raises:
            PageNotFoundException: the server answered 404.
            ResponseStatusException: the server answered with any other
                non-200 status.
        """
        try:
            http_response = self.http_pool_manager.request('GET', url)
        except Exception:
            self.logger.exception(u'获取网页的时候发生了异常')
            return ''
        if http_response.status == 404:
            raise PageNotFoundException('404')
        if http_response.status != 200:
            raise ResponseStatusException(http_response.status)
        return http_response.data

    def extract_img(self, html_doc):
        """Return the ``<img>`` tags found inside ``div.mala-text`` of *html_doc*."""
        bs = BeautifulSoup(html_doc, 'lxml')
        return bs.select('div.mala-text img')

    def save_img(self, img_tag):
        """Download the image referenced by *img_tag* and store it on disk.

        The file name is built from the tag's ``alt`` text, a random hex id
        and the extension of the image URL. Tags without a ``src``, images
        that answer with a non-200 status and failed downloads are skipped.
        """
        img_link = (img_tag.get('src') or '').strip()
        if not img_link:
            self.logger.info('Skip img without src: %s', img_tag)
            return

        # Take the extension from the URL path only; a query string or
        # fragment must not leak into the file name.
        url_path = img_link.split('?', 1)[0].split('#', 1)[0]
        extension = os.path.splitext(url_path)[1]
        file_name = (img_tag.get('alt') or '') + '___' + uuid.uuid4().hex + extension
        # Strip characters that are not legal in the generated file name.
        # Only the name is cleaned so the directory prefix stays intact.
        file_name = self._UNSAFE_FILENAME_CHARS.sub('', file_name)
        save_name = self.img_save_dir + file_name
        self.logger.info('Save img: %s %s', save_name, img_link)

        try:
            img_byte = self.get(img_link)
        except (PageNotFoundException, ResponseStatusException) as error:
            # These derive from BaseException; left uncaught they would
            # surface as a traceback from the worker thread.
            self.logger.info('Skip img %s: HTTP status %s', img_link, error)
            return
        if not img_byte:
            return

        self._ensure_dir(os.path.dirname(save_name))
        with open(save_name, 'wb') as img_file:
            img_file.write(img_byte)

    def _ensure_dir(self, directory):
        """Create *directory* if it is missing; safe to call from several threads."""
        if not directory or os.path.isdir(directory):
            return
        try:
            os.makedirs(directory)
        except OSError:
            # Another worker may have created it between the check and the call.
            if not os.path.isdir(directory):
                raise

    def list_visitor(self, seed):
        """Crawl list pages 1, 2, 3, ... and download their images.

        Args:
            seed: URL template containing a ``%(page)d`` placeholder.

        Crawling ends at the first page that answers 404, on any other
        error (which is logged), or when one page could not be downloaded
        after ``max_page_retries`` retries. All download threads are joined
        before the method returns.
        """
        threads = []
        page = 1
        failures = 0
        try:
            while True:
                url = seed % {'page': page}
                self.logger.info('Begin process:%s', url)
                html_doc = self.get(url)
                if html_doc == '':
                    # get() already logged the failure. Retry the same page,
                    # but bounded and with a pause instead of spinning forever.
                    failures += 1
                    if failures > self.max_page_retries:
                        self.logger.info('Give up on %s after %d failed attempts', url, failures)
                        break
                    time.sleep(self.retry_delay_seconds)
                    continue
                failures = 0
                for img in self.extract_img(html_doc):
                    worker = threading.Thread(target=self.save_img, args=(img,))
                    worker.start()
                    threads.append(worker)
                page += 1
        except PageNotFoundException:
            # The first missing page marks the end of the list.
            self.logger.info('404')
        except (ResponseStatusException, Exception):
            self.logger.exception('Stop crawling because of an unexpected error')
        finally:
            for worker in threads:
                worker.join()
if __name__ == '__main__':
    # Script entry point: crawl the site's numbered list pages
    # (1.html, 2.html, ...); list_visitor substitutes %(page)d itself.
    spider = QiuBaiChengRenSpider()
    spider.list_visitor('http://www.qiubaichengren.com/%(page)d.html')
# 原文:http://www.cnblogs.com/cc11001100/p/7624927.html