python: a script to scrape images from [博海拾贝]

Practice code, kept here as a memo:

# encoding: utf-8
# from __future__ import unicode_literals

import urllib
import urllib2
import re
import os
import time
from threading import Thread

class BhsbSpider(object):
    _url = r'https://bh.sb/post/category/main/'
    _page_count = 0
    _page_index = 0

    def __init__(self, url, page_count=0):
        self._url = url
        self._page_count = page_count
        # decode to unicode so os.mkdir handles the Chinese folder name (Python 2)
        folder = '博海拾贝'.decode('utf-8')
        if not os.path.exists(folder):
            os.mkdir(folder)

    def spider(self):
        while self._page_index < self._page_count:
            self._page_index += 1
            self._url = r'https://bh.sb/post/category/main/page/%d' % self._page_index
            self.do_spider(self._url)

    def do_spider(self, url):
        html = self.get_html(url)
        # match each post's link and title on the listing page
        pattern = r'(?s)<h2><a\s+href="(?P<url>[^"]+).*?>\[博海拾贝\d+\](?P<title>[^<]+).*?'
        for i, m in enumerate(re.findall(pattern, html)):
            # the listing shows 20 posts per page, hence the running index
            info = '%d. url: %s, title: %s' % ((self._page_index - 1) * 20 + i + 1, m[0], m[1])
            print info
            # scrape each post page on its own thread
            Thread(target=self.download, args=(m[0], m[1])).start()
            time.sleep(2)

    def download(self, url, title):
        # one folder per post under the 博海拾贝 root; os.path.join keeps it portable
        title = os.path.join('博海拾贝', title).decode('utf-8')
        if not os.path.exists(title):
            os.mkdir(title)
        html = self.get_html(url)
        # match each image caption and the image URL that follows it
        pattern = r'(?s)<p>(?P<title>[^<]+).*?<p><img\s+src="(?P<image>[^"]+)"'
        for i, m in enumerate(re.findall(pattern, html)):
            img_title = m[0]
            img_url = m[1]
            img_filename = os.path.join(title.encode('utf-8'), img_title + os.path.splitext(img_url)[1])
            img_filename = img_filename.decode('utf-8')
            print 'download %s ...' % img_filename
            if not os.path.exists(img_filename):
                # download on a worker thread, throttled to be polite to the site
                Thread(target=urllib.urlretrieve, args=(img_url, img_filename)).start()
                time.sleep(1)

    def get_html(self, url):
        try:
            url = url.encode('utf-8')
            req = urllib2.Request(url)
            req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.9.5.1000 Chrome/39.0.2146.0 Safari/537.36')
            page = urllib2.urlopen(req)
            return page.read()
        except Exception as ex:
            print 'get url_%s html error, ex=%s' % (url, ex)
            # return an empty page so callers' re.findall does not choke on None
            return ''


if __name__ == '__main__':
    url = r'https://bh.sb/post/category/main/'
    bs = BhsbSpider(url, 10)
    bs.spider()
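The listing above is Python 2 only (urllib2, print statements, str.decode). For reference, here is a minimal sketch of the get_html part on Python 3's urllib.request; the utf-8 decoding is an assumption about what the site serves:

# Python 3 sketch of get_html; utf-8 response encoding is assumed
import urllib.request

UA = ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
      '(KHTML, like Gecko) Chrome/39.0.2146.0 Safari/537.36')

def get_html(url):
    try:
        req = urllib.request.Request(url, headers={'User-Agent': UA})
        with urllib.request.urlopen(req) as page:
            return page.read().decode('utf-8', errors='replace')
    except Exception as ex:
        print('get url_%s html error, ex=%s' % (url, ex))
        return ''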

Not tested carefully yet; some images go missing during the crawl.
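The lost images are probably down to the fire-and-forget urlretrieve threads giving up on any transient network error. Below is a sketch of a retry wrapper that the download method could hand to Thread instead; the retry count and delay are placeholder values, not something from the original script:

# retry wrapper for the Python 2 script above; retries/delay are arbitrary
def retrieve_with_retry(img_url, img_filename, retries=3, delay=2):
    for attempt in range(retries):
        try:
            urllib.urlretrieve(img_url, img_filename)
            return True
        except IOError as ex:
            print('retry %d for %s failed, ex=%s' % (attempt + 1, img_url, ex))
            time.sleep(delay)
    return False

# in BhsbSpider.download:
# Thread(target=retrieve_with_retry, args=(img_url, img_filename)).start()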

Reposted from: https://www.cnblogs.com/crwy/p/10623378.html
