# Scrape gallery images from the picture site (www.5442.com)

#coding=utf-8

from bs4 import BeautifulSoup as BS4
import requests
import wget
import sys
import chardet
import os

__author__ = 'Administrator'

# Site root and the listing page the crawl starts from.
HOST = 'http://www.5442.com/meinv/'
START_URL = 'http://www.5442.com/meinv/'
# HTML attribute names read from parsed <a>/<img> tags.
ATTR_HREF = 'href'
ATTR_TITLE = 'title'
ATTR_ALT = 'alt'
ATTR_SRC = 'src'
# The site serves its pages GB2312-encoded (see url2bs4 / gbk2utf8).
HOST_CONTENT_ENCODING = 'GB2312'
# Local root directory under which per-album folders are created.
MAIN_DOWNLOAD_FOLDER = 'G:/meinv/'
# Resume checkpoint: albums are skipped until one whose name contains this
# string is seen, after which downloading resumes (see download_album).
NEXT_ALBLUM = '可爱熊吖BOBO'
ALREDY_TO_DOWNLOAD_ALBLUMN = False
# Cap on how many paginated listing URLs are expanded per tag.
MAX_THEME_PAGE_COUNT = 200

def utf82GBK(s):
   """Transcode a UTF-8 byte string into its GB2312 encoding."""
   text = s.decode('utf-8')
   return text.encode('gb2312')

def gbkprint(s):
   """Print a UTF-8 string after re-encoding it to GB2312 for the console."""
   converted = utf82GBK(s)
   print(converted)

def uni2utf8(s):
   """Encode a unicode string as UTF-8 bytes."""
   encoded = s.encode('utf-8')
   return encoded

def gbk2utf8(s):
   """Transcode GB2312-encoded bytes to UTF-8, dropping undecodable bytes."""
   text = s.decode('gb2312', 'ignore')
   return text.encode('utf-8', 'ignore')

def myprint(s):
   """Thin logging wrapper around print()."""
   print(s)

# Shorthand alias: `_(...)` is used throughout this script as the log helper.
_ = myprint

def url2bs4(url):
   """Fetch *url* and return a BeautifulSoup tree, or None on failure.

   The site serves GB2312-encoded pages; the raw response bytes are
   transcoded to UTF-8 before parsing so all downstream code can work
   with UTF-8 strings.
   """
   # Timeout so a stalled connection cannot hang the whole crawl.
   res = requests.get(url, timeout=30)
   if res.status_code != 200:
      return None

   #enc = chardet.detect(res.content)
   #print enc
   # NOTE: the original assigned res.encoding = HOST_CONTENT_ENCODING, but
   # res.content is the raw byte payload, so that assignment had no effect;
   # the GB2312 -> UTF-8 conversion is done explicitly here instead.
   content = gbk2utf8(res.content)
   return BS4(content)

def read_tags(fromURL):
   """Collect the tag links from a listing page as a {name: href} dict."""
   soup = url2bs4(fromURL)
   anchors = soup.find_all("a", attrs={'class': 'yxtag'})
   return {uni2utf8(a.text): uni2utf8(a[ATTR_HREF]) for a in anchors}

def dump_tags(tags):
   """Log every discovered tag name together with its URL."""
   for name in tags:
      _('名称:%s 地址:%s' % (name, tags[name]))

def next_download_addr(album_name, folder_name):
   """Return the next free numbered .jpg path inside the album's folder.

   The directory is created on demand under MAIN_DOWNLOAD_FOLDER; file
   names count up from 1.jpg, skipping files that already exist on disk.
   `folder_name` is currently unused (the album name alone picks the dir).
   """
   album_dir = utf82GBK(os.path.join(MAIN_DOWNLOAD_FOLDER, album_name))#, folder_name)
   if not os.path.isdir(album_dir):
       os.makedirs(album_dir)

   index = 1
   while True:
       candidate = os.path.join(album_dir, str(index) + '.jpg')
       if not os.path.isfile(candidate):
           return candidate
       index += 1

def download_album_one_page(url, album_name):
   """Download every image found on a single album page.

   Skips the page silently when it cannot be fetched (url2bs4 returns
   None on a non-200 response, which previously caused an AttributeError
   here) or when the expected image container is missing, so one broken
   page does not abort the whole album.
   """
   bs = url2bs4(url)
   if bs is None:
      return
   container = bs.find(name='p', attrs={'id': 'contents',})
   if container is None:
      return
   for img in container.find_all(name='img'):
      wget.download(img[ATTR_SRC], next_download_addr(album_name, uni2utf8(img[ATTR_ALT])))


def download_album(url, name):
   """Download one photo album, honoring the resume-from checkpoint.

   While the module flag ALREDY_TO_DOWNLOAD_ALBLUMN is False, albums are
   only logged as already downloaded; once an album whose name contains
   NEXT_ALBLUM is seen the flag flips and later calls actually download
   (note: the matching album itself is still skipped by the early return).
   """
   global ALREDY_TO_DOWNLOAD_ALBLUMN
   if not ALREDY_TO_DOWNLOAD_ALBLUMN:
       _('已经下载 %s' % (name,))
       if NEXT_ALBLUM in name:
           _('准备开始继续下载...')
           ALREDY_TO_DOWNLOAD_ALBLUMN = True
       return

   _('正在下载影集:%s 地址:%s' % (name, url))
   bs = url2bs4(url)
   page = bs.find(name='div', attrs={'class':'page'})
   lis = page.find_all(name='li')
   sum_page = 0
   for li in lis:
      page_name = uni2utf8(li.a.text)
      # Pager entry presumably reads like '共N页...'; the slice below strips
      # a 3-byte UTF-8 '共' prefix and 5 trailing bytes to isolate the page
      # count N — byte offsets assume UTF-8; TODO confirm against the live
      # pager markup.
      if page_name.startswith('共'):
         sum_page = int(page_name[3:-5])
         break

   # Page 1 is the album URL itself; later pages use <url-minus-ext>_<n>.html.
   download_album_one_page(url, name)
   cur_page = 2
   prefix = url[:url.rfind('.')]
   while cur_page <= sum_page:
      download_album_one_page('%s_%d.html' % (prefix,cur_page,), name)
      cur_page = cur_page + 1

def process_one_tag_page(tag_name, tag_url):
   """Walk one page of a tag listing and download each album it links to."""
   _('当前主题:%s' % (tag_name,))
   soup = url2bs4(tag_url)
   bricks = soup.find_all(name='div', attrs={'class': 'item masonry_brick masonry-brick'})
   for brick in bricks:
      link = brick.find(name='a')
      if link:
         download_album(uni2utf8(link[ATTR_HREF]), uni2utf8(link[ATTR_TITLE]))

def process_tags(tags):
   """Process the tag map: for each selected tag, expand its listing
   pagination into a (capped) URL list and download every album on each page.

   Only tags whose name contains '丝袜' are processed; the other filters
   are kept below, commented out.
   """
   for k in tags:
      #if '全部' in k:
      #   continue
      #if not '人体艺术' in k and not '丝袜' in k:
      #   continue
      if not '丝袜' in k:
         continue

      #process_one_tag_page(k, tags[k])
      bs_first_page = url2bs4(tags[k])
      # The '.pageinfo' element presumably holds text like 'x/y…'; the
      # slice takes everything after '/' minus the last character to get
      # the total page count — TODO confirm the exact pager text format.
      page_num = bs_first_page.find(name='li', attrs={'class':'pageinfo',})
      if not page_num:
         continue
      page_num = page_num.text
      seprator_index = page_num[page_num.find('/')+1:-1]
      page_num = int(seprator_index)

      # NOTE(review): pagination here builds '<prefix>/<n>.html' while
      # download_album builds '<prefix>_<n>.html', and the '<' comparison
      # below never emits the final page — both look suspicious; verify
      # against the live site before changing.
      prefix_url = tags[k][0:tags[k].rfind('.')]
      url_list = [tags[k],]
      start_page_num = 2
      while start_page_num < page_num:
          next = '%s/%d.html' % (prefix_url,start_page_num)
          if len(url_list) >= MAX_THEME_PAGE_COUNT:
            break
          url_list.append(next)
          start_page_num = start_page_num + 1

      for n  in url_list:
         process_one_tag_page(k, n)

if __name__ == '__main__':
   #reload(sys)
   #sys.setdefaultencoding('gb2312')

   # Entry point: read the tag index from the start page, then crawl the
   # selected tags and download their albums.
   tags = read_tags(START_URL)
   #dump_tags(tags)
   process_tags(tags)

  • 2
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值