Scraping Beauty Pictures [Absolute Perk]

 All configuration changes go in the config block below. At first the crawler ran without multiprocessing and was as slow as a snail; switching to a process pool sped it up considerably, but that brought a new problem: within two days the site administrator banned me. The only fix was to add a random sleep interval before every request to mimic a real visitor, and the ban has not come back since.
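Before the full script, here is a minimal sketch of that throttling idea, assuming the same requests library: a decorator that sleeps a random 1–3 seconds before each call, so requests from the pool workers look less like a burst of automated hits. The names throttle and fetch are illustrative only; the script below applies the same delay inline rather than via a decorator.

import functools
import random
import time

import requests


def throttle(func):
    """Sleep a random 1-3 seconds before each call to look like a human visitor."""
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        time.sleep(random.randint(1, 3))
        return func(*args, **kwargs)
    return wrapper


@throttle
def fetch(url, headers=None):
    # A throttled GET; the full script below repeats this sleep inline at the
    # top of every function that touches the network.
    resp = requests.get(url, headers=headers, timeout=10)
    return resp.text if resp.status_code == 200 else None

The full script follows.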

# coding=utf-8
import functools
import hashlib
import random
import time
from multiprocessing import Pool, Manager
from urllib.request import urlopen
import requests
import os
import re
import logging
# Get a logger instance
logger = logging.getLogger("Meizi_img")
logger.setLevel(logging.DEBUG)
# Set the logger's output format
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
# File log handler (a console handler could be added the same way)
file_handler = logging.FileHandler("Meizi_img.log")
file_handler.setFormatter(formatter)
# Set the handler's default level
file_handler.setLevel(logging.DEBUG)
logger.addHandler(file_handler)


#============config===================
Type_ = 'riben'        # category segment of the URL
Mumber = '182'         # gallery number (appears in the URL)
Local = "image\\"      # local save directory (Windows-style path)
Num = 2                # first page handed to the process pool
URL = 'http://www.xixirenti.cc/%s/%s_' % (Type_, Mumber)
B_url = "http://www.xixirenti.cc/%s/%s.html" % (Type_, Mumber)
Logpath = Local + 'log\\'
All_url = "http://www.xixirenti.cc"
Re_img = r"""var totalpage = ([\s\S]*?);"""
Re_obj = r"""<div class="arcBody">[\s\S]*?href=[\s\S]*?<img src='([\s\S]*?)' id=[\s\S]*?"""
Ua_headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36",
    "Referer": "http://www.xixirenti.cc/riben/2418_10.html"}
#======================================


def despath():
    """Create (if needed) and return the directory the images are saved into."""
    path = Local + Mumber + '\\'
    if not os.path.exists(path):
        try:
            os.makedirs(path)  # also creates the parent image\ directory if missing
        except Exception as e:
            logger.error(e)
    return path


def hashStr(strInfo):
    """
    Hash a string (sha256; md5 or sha1 would also work).
    """
    h = hashlib.sha256()
    h.update(strInfo.encode("utf-8"))
    return h.hexdigest()


def get_one_page(url):
    """
    Send an HTTP request and return the response body, or None on failure.
    """
    time.sleep(random.randint(1, 3))
    response = requests.get(url, headers=Ua_headers)
    if response.status_code == 200:  # ok
        return response.text
    return None


def get_img_url(html):
    time.sleep(random.randint(1, 3))
    pattern = re.compile(Re_obj)
    items = re.findall(pattern, html)
    if not items:  # findall returns [] when nothing matches
        return None
    url = All_url + items[0]
    logger.info(url)
    return url


def save_img(url):
    time.sleep(random.randint(1, 3))
    # time.ctime() changes every second and downloads are at least 1 s apart,
    # so the hashed name is effectively unique per image
    imgName = hashStr(time.ctime()) + '.jpg'
    path = despath()
    local = path + imgName
    try:
        data = urlopen(url.strip()).read()
        with open(local, "wb") as f:
            f.write(data)
    except Exception as e:
        logger.error("download error in %s: %s" % (url, e))


def get_all_page(html):
    pattern = re.compile(Re_img)
    items = re.findall(pattern, html)
    if not items:
        return None
    try:
        return int(items[0])
    except ValueError as e:  # totalpage was not a number
        logger.error(e)


def CrawlPictureInfo(q, page):
    time.sleep(random.randint(1, 3))
    new_url = URL + str(page) + ".html"
    html = get_one_page(new_url)
    if html:
        result = get_img_url(html)
        if result:
            q.put(result)
        else:
            logger.error('image url not found in %s' % new_url)
    else:
        logger.error('html not found for %s' % new_url)


def main():
    p = Pool()
    q = Manager().Queue()  # a queue that can be shared across the pool's worker processes
    html = get_one_page(B_url)
    result = get_img_url(html)
    q.put(result)
    page = get_all_page(html)
    cont = str(page)+'_page'
    logger.info(cont+'\n\n')
    if page:
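        # bind the shared queue with functools.partial so p.map only has to
        # hand each worker a page number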
        partial_CrawlPictureInfo = functools.partial(CrawlPictureInfo, q)
        p.map(partial_CrawlPictureInfo, [i for i in range(Num, page + 1)])
        p.close()  # no more tasks will be submitted to the pool
        All_img = []
        while not q.empty():
            img_url = q.get()
            All_img.append(img_url)
        for i, img_url in enumerate(set(All_img)):  # de-duplicate before downloading
            save_img(img_url)
            logger.info("%s page download ok!" % (str(i + 1)))
        p.join()
    else:
        logger.error("not find page ")


if __name__ == '__main__':

    try:
        main()
    except Exception as e:
        logger.error(e)
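
To use the script, edit the config block at the top: Type_ and Mumber pick the gallery (they are combined into URL and B_url), Local is the save directory, and Num is the first page handed to the process pool; then run the file directly. Progress and errors are written to Meizi_img.log. Note that the paths use Windows-style backslashes, so they would need adjusting on Linux or macOS.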

 
