Web scraping: getting past anti-hotlinking (the Referer check)

Disclaimer: a certain picture site is used as the example; the code is for learning and reference only!

1. Use Fiddler to visit the site's homepage and capture the request headers (captured result below)

headers = {
    "Accept":"image/webp,image/apng,image/*,*/*;q=0.8",
    # "Accept-Encoding":"gzip, deflate",  # left out: the gzip-compressed response shows up as garbled text when inspected locally
    "Accept-Language":"zh-CN,zh;q=0.8",
    "User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36",
    "Connection":"keep-alive",
    "Referer":"http://www.mzitu.com"}

2. Convert the headers into the (key, value) list that the opener expects

headall = []
for key, value in headers.items():
    item = (key, value)
    headall.append(item)
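Since dict.items() already yields (key, value) pairs, the loop above can also be collapsed to one line:

headall = list(headers.items())  # equivalent to the loop above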

3. Fetch the HTML

def openhtml():
    cjar = http.cookiejar.CookieJar()
    # 127.0.0.1:8888 is Fiddler's local proxy address; routing requests through it makes them easy to inspect and debug
    proxy = urllib.request.ProxyHandler({'http': '127.0.0.1:8888'})
    opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler, urllib.request.HTTPCookieProcessor(cjar))
    opener.addheaders = headall
    urllib.request.install_opener(opener)
    data = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
    return data
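Note that the ProxyHandler sends every request through Fiddler; if Fiddler is not actually listening on 127.0.0.1:8888, urlopen raises a URLError. A variant without the proxy (a sketch with the same headers and cookie handling, minus the debugging proxy) looks like this:

def openhtml_no_proxy():
    # same as openhtml(), but without routing traffic through Fiddler
    cjar = http.cookiejar.CookieJar()
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cjar))
    opener.addheaders = headall
    return opener.open(url).read().decode("utf-8", "ignore")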

4. Extract all image URLs with a regular expression and save the images locally

def download(data):
    # regex that matches the image URLs in the page
    reg = r"data-original='.*?\.jpg"
    imgre = re.compile(reg)
    imglist = re.findall(imgre, data)
    x = 0
    for image_url in imglist:
        image_url = image_url.replace("data-original='", "")
        print(image_url)
        opener = urllib.request.build_opener()
        # the heart of the anti-hotlinking bypass: the Referer header is sent with every image request
        opener.addheaders = headall
        img_data = opener.open(image_url).read()
        with open("C:\\Users\\zzz\\Desktop\\images\\" + str(x) + ".jpg", "wb") as f:
            f.write(img_data)
        x += 1
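One practical detail: open() does not create missing directories, so the hard-coded images folder must exist before the first write. A small guard (using the same path as above) can create it up front:

import os

save_dir = "C:\\Users\\zzz\\Desktop\\images"
os.makedirs(save_dir, exist_ok=True)  # create the output folder if it does not exist yet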

5. Complete code

# coding=utf8
import urllib.request
import http.cookiejar
import re

url = "http://www.mzitu.com/xinggan"
headers = {
    "Accept":"image/webp,image/apng,image/*,*/*;q=0.8",
    # "Accept-Encoding":"gzip, deflate",
    "Accept-Language":"zh-CN,zh;q=0.8",
    "User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36",
    "Connection":"keep-alive",
    "Referer":"http://www.mzitu.com"}

headall = []
for key, value in headers.items():
    item = (key, value)
    headall.append(item)

# fetch the HTML
def openhtml():
    cjar = http.cookiejar.CookieJar()
    # 127.0.0.1:8888 is Fiddler's local proxy address; routing requests through it makes them easy to inspect and debug
    proxy = urllib.request.ProxyHandler({'http': '127.0.0.1:8888'})
    opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler, urllib.request.HTTPCookieProcessor(cjar))
    opener.addheaders = headall
    urllib.request.install_opener(opener)
    data = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
    return data

# download the images
def download(data):
    # regex that matches the image URLs in the page
    reg = r"data-original='.*?\.jpg"
    imgre = re.compile(reg)
    imglist = re.findall(imgre, data)
    x = 0
    for image_url in imglist:
        image_url = image_url.replace("data-original='", "")
        print(image_url)
        opener = urllib.request.build_opener()
        # the heart of the anti-hotlinking bypass: the Referer header is sent with every image request
        opener.addheaders = headall
        img_data = opener.open(image_url).read()
        with open("C:\\Users\\zzz\\Desktop\\images\\" + str(x) + ".jpg", "wb") as f:
            f.write(img_data)
        x += 1

if __name__ == '__main__':
    data = openhtml()
    download(data)
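For comparison, the same Referer trick with the third-party requests library comes down to putting the header on a Session. A sketch, assuming requests is installed and the output folder already exists:

import re
import requests  # third-party: pip install requests

url = "http://www.mzitu.com/xinggan"
session = requests.Session()
session.headers.update({
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36",
    "Referer": "http://www.mzitu.com",  # the header the anti-hotlinking check looks at
})

html = session.get(url).text
for i, image_url in enumerate(re.findall(r"data-original='(.*?\.jpg)", html)):
    print(image_url)
    img = session.get(image_url)  # the Referer travels with every request made through the session
    with open("C:\\Users\\zzz\\Desktop\\images\\" + str(i) + ".jpg", "wb") as f:
        f.write(img.content)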

Reposted from: https://www.cnblogs.com/z-z-z/p/7755763.html
