python爬虫练手 —— 一个小爬虫练手

一个人无聊,写了个小爬虫爬取不可描述图片....

代码太短,就暂时先往这里贴一下做备份吧。

#! /usr/bin/python

import chardet

import urllib3

import uuid

import os

import logging

import time

import sys

import re

import threading

from bs4 import BeautifulSoup

"""

http://www.qiubaichengren.com/1.html

"""

class PageNotFoundException(BaseException):
    """Raised when a requested URL answers with HTTP 404.

    NOTE(review): deliberately derives from BaseException rather than
    Exception — the spider's ``get()`` uses ``except Exception`` for
    transport errors, so this must bypass that handler to reach the
    pagination loop and stop it.
    """

class ResponseStatusException(BaseException):
    """Raised for any non-200, non-404 HTTP response status.

    NOTE(review): like PageNotFoundException, this intentionally extends
    BaseException so the broad ``except Exception`` in ``get()`` does not
    swallow it.
    """

class QiuBaiChengRenSpider:
    """Spider that walks the numbered pages of qiubaichengren.com and saves
    every image found in each page's ``div.mala-text`` blocks to a local
    directory, one download thread per image.
    """

    # One shared connection pool for every HTTP request this spider makes.
    http_pool_manager = urllib3.PoolManager()
    # Directory images are written to; created on demand in save_img().
    img_save_dir = 'D:/QiuBaiChengRen/'
    logger = logging.getLogger('QiuBaiChengRenSpider')

    def __init__(self):
        self.init_log()

    def init_log(self):
        """Route this spider's log output to stdout at DEBUG level."""
        stream_handler = logging.StreamHandler(sys.stdout)
        self.logger.addHandler(stream_handler)
        self.logger.setLevel(logging.DEBUG)

    def get(self, url):
        """Fetch *url* and return the response body as bytes.

        Returns empty bytes on any transport-level error (logged, not
        raised).  Raises PageNotFoundException on HTTP 404 and
        ResponseStatusException on any other non-200 status; both derive
        from BaseException on purpose so the ``except Exception`` below
        lets them propagate to the caller.
        """
        try:
            http_response = self.http_pool_manager.request('GET', url)
            if http_response.status == 404:
                raise PageNotFoundException('404')
            if http_response.status != 200:
                raise ResponseStatusException(http_response.status)
            return http_response.data
        except Exception:
            self.logger.info(u'获取网页的时候发生了异常')
            # Bug fix: return bytes to match the success-path type
            # (the original returned a str '' alongside bytes data).
            return b''

    def extract_img(self, html_doc):
        """Return all <img> tags inside ``div.mala-text`` of *html_doc*."""
        bs = BeautifulSoup(html_doc, 'lxml')
        imgs = bs.select('div.mala-text img')
        return imgs

    def save_img(self, img_tag):
        """Download the image referenced by *img_tag* and write it to disk.

        The file name is built from the tag's ``alt`` text plus a random
        uuid hex (to avoid collisions) plus the link's original extension.
        """
        img_link = img_tag['src'].strip()
        save_name = (self.img_save_dir + img_tag['alt'] + '___'
                     + uuid.uuid4().hex + os.path.splitext(img_link)[1])
        # Remove whitespace and quote characters that would make the
        # generated file name invalid.
        save_name = re.compile('[\\s+,\",\']').sub('', save_name)
        self.logger.info('Save img: %s %s' % (save_name, img_link))
        img_byte = self.get(img_link)
        if not img_byte:
            return
        # Bug fix: make sure the target directory exists before writing.
        if not os.path.isdir(self.img_save_dir):
            os.makedirs(self.img_save_dir)
        # Bug fix: 'with' guarantees the handle is closed even if write fails
        # (the original leaked the handle on error).
        with open(save_name, 'wb') as img_file:
            img_file.write(img_byte)

    def list_visitor(self, seed):
        """Visit ``seed % {'page': n}`` for n = 1, 2, ... until a 404,
        spawning one download thread per image and joining them at the end.

        *seed* is a format string with a ``%(page)d`` placeholder.
        """
        threads = []
        i = 1
        while True:
            try:
                url = seed % {'page': i}
                self.logger.info('Begin process:%s' % url)
                html_doc = self.get(url)
                # Bug fix: advance the page counter BEFORE the empty-page
                # 'continue'; the original incremented only at the loop end,
                # so a failed fetch retried the same page forever.
                i += 1
                if not html_doc:
                    continue
                imgs = self.extract_img(html_doc)
                for img in imgs:
                    # Bug fix: Thread args must be a tuple; the original
                    # passed the set {img}.
                    t1 = threading.Thread(target=self.save_img, args=(img,))
                    t1.start()
                    threads.append(t1)
            except PageNotFoundException:
                self.logger.info('404')
                break
            except BaseException:
                # Bug fix: log the failure instead of silently breaking.
                self.logger.exception('Unexpected error, stopping crawl')
                break
        for t1 in threads:
            t1.join()

if __name__ == '__main__':
    # Entry point: crawl pages 1, 2, ... until the site answers 404.
    spider = QiuBaiChengRenSpider()
    seed_url = 'http://www.qiubaichengren.com/%(page)d.html'
    spider.list_visitor(seed_url)

原文:http://www.cnblogs.com/cc11001100/p/7624927.html

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值