python爬虫 js混淆与破解

爬虫与反爬是一场没有声音的战斗,不断的爬与反爬,在交替中推动技术的前进。

今天要说的是js混淆

那么什么是js混淆反爬呢?呵呵,我也没有严格的定义^_^。大致是指:服务器返回的不是页面本身,而是一段经过混淆的js,必须先执行(或还原)这段js拿到结果,才能继续正常访问。

一、现象

我使用的是python语言,使用requests请求网站,得到的响应不是我们熟悉的html文件,而是一串比较陌生的内容,它就是js。需要执行这段js,从中得到相应的cookies,放在请求(request)中,才能正常请求得到html源码。js如下:

<html><body><script language="javascript"> window.onload=setTimeout("eq(137)", 200); function eq(SM) {var qo, mo="", no="", oo = [0x59,0xfa,0xe8,0xb3,0xfb,0x4a,0xdb,0x28,0xb5,0xbd,0x5b,0x28,0xce,0x9c,0x6a,0x37,0x86,0xd2,0xf6,0x4f,0x2b,0xfe,0x48,0x54,0x60,0x2b,0x39,0x85,0xd2,0x59,0x64,0xaf,0x3d,0x0a,0x56,0xb0,0x04,0x5c,0x75,0x8d,0x99,0xb2,0xf9,0xc6,0x12,0x9e,0x29,0x82,0xac,0x06,0x1f,0xb7,0x03,0x5e,0x66,0xbe,0xca,0x16,0xae,0xc5,0x01,0xd9,0xb1,0xc9,0x61,0xbf,0x98,0xb1,0x89,0xd5,0xe1,0x40,0xbd,0xd5,0x6e,0x7a,0x45,0xa3,0x5d,0xe9,0x02,0x1b,0x73,0xd1,0x0b,0x26,0xfd,0x57,0x2f,0xc7,0x06,0x5e,0x76,0x8e,0xd9,0x26,0xef,0x09,0xe1,0x40,0x97,0xef,0x57,0xaf,0x48,0x60,0x78,0xd0,0xe8,0x02,0x1b,0xf4,0xcd,0xea,0x22,0x6e,0x77,0x80,0xcd,0x56,0xc6,0x11,0xd9,0x34,0x04,0x50,0xaf,0xbb,0xd9,0xf6,0x4e,0x25,0x2a,0x88,0xde,0x2a,0xb2,0x0f,0x40,0x97,0xf2,0x0e,0x25,0xbd,0x48,0xe2,0x39,0x54,0xee,0x06,0xcc,0xa4,0xbf,0x93,0xe9,0xf9,0xf2,0xcb,0xe8,0xf4,0xc2,0x0f,0x41,0x8e,0xdb,0x69,0xc3,0x61,0xa6,0x71,0xba,0x06,0xa3,0x70,0x2f,0xbe,0x97,0xb4,0xbb,0x07,0x89,0x91,0xa0,0xfa,0xd8,0xb4,0xce,0xa7,0xc4,0x8c,0xd9,0x67,0x15,0x21,0xf0,0xb8,0x56,0x62,0xe1,0xaf,0x7c,0xc4,0x11,0x5d,0xdc,0x6a,0x72,0x10,0x1e,0xeb,0x05,0xd1,0x1d,0x25,0x74,0x42,0x56,0xe3,0x3e,0x4a,0x18,0xe4,0xc3,0x0c,0x5a,0xa6,0x34,0x3c,0xd8,0x76,0xc6,0x55,0x8f,0x3b];qo = "qo=232; do{oo[qo]=(-oo[qo])&0xff; oo[qo]=(((oo[qo]>>1)|((oo[qo]<<7)&0xff))-243)&0xff;} while(--qo>=2);"; eval(qo);qo = 231; do { oo[qo] = (oo[qo] - oo[qo - 1]) & 0xff; } while (-- qo >= 3 );qo = 1; for (;;) { if (qo > 231) break; oo[qo] = ((((((oo[qo] + 129) & 0xff) + 66) & 0xff) << 3) & 0xff) | (((((oo[qo] + 129) & 0xff) + 66) & 0xff) >> 5); qo++;}po = ""; for (qo = 1; qo < oo.length - 1; qo++) if (qo % 6) po += String.fromCharCode(oo[qo] ^ SM);eval("qo=eval;qo(po);");} </script> </body></html>

对,你没有看错,原文就是没有换行、挤成一整行的字符串。如果第一次看到,肯定是“朕头痛欲裂”。不要怕,使用pycharm格式化之后如下:

<html>
<body>
<script language="javascript"> window.onload = setTimeout("eq(137)", 200);
// setTimeout is called immediately: after 200 ms it evaluates the string
// "eq(137)". Its return value (not a function) is what gets assigned to
// window.onload.

/**
 * Decode the byte table `oo` in three rewriting passes, XOR the result with
 * `SM`, and execute the decoded text as JavaScript.
 *
 * According to the surrounding article, the decoded script is what yields the
 * cookie needed for the follow-up request; that is not visible in this code.
 *
 * @param {number} SM - XOR key applied to every kept byte (137 in this response).
 */
function eq(SM) {
    // `mo` and `no` are declared but never used in this function.
    var qo, mo = "", no = "",
        oo = [0x59, 0xfa, 0xe8, 0xb3, 0xfb, 0x4a, 0xdb, 0x28, 0xb5, 0xbd, 0x5b, 0x28, 0xce, 0x9c, 0x6a, 0x37, 0x86, 0xd2, 0xf6, 0x4f, 0x2b, 0xfe, 0x48, 0x54, 0x60, 0x2b, 0x39, 0x85, 0xd2, 0x59, 0x64, 0xaf, 0x3d, 0x0a, 0x56, 0xb0, 0x04, 0x5c, 0x75, 0x8d, 0x99, 0xb2, 0xf9, 0xc6, 0x12, 0x9e, 0x29, 0x82, 0xac, 0x06, 0x1f, 0xb7, 0x03, 0x5e, 0x66, 0xbe, 0xca, 0x16, 0xae, 0xc5, 0x01, 0xd9, 0xb1, 0xc9, 0x61, 0xbf, 0x98, 0xb1, 0x89, 0xd5, 0xe1, 0x40, 0xbd, 0xd5, 0x6e, 0x7a, 0x45, 0xa3, 0x5d, 0xe9, 0x02, 0x1b, 0x73, 0xd1, 0x0b, 0x26, 0xfd, 0x57, 0x2f, 0xc7, 0x06, 0x5e, 0x76, 0x8e, 0xd9, 0x26, 0xef, 0x09, 0xe1, 0x40, 0x97, 0xef, 0x57, 0xaf, 0x48, 0x60, 0x78, 0xd0, 0xe8, 0x02, 0x1b, 0xf4, 0xcd, 0xea, 0x22, 0x6e, 0x77, 0x80, 0xcd, 0x56, 0xc6, 0x11, 0xd9, 0x34, 0x04, 0x50, 0xaf, 0xbb, 0xd9, 0xf6, 0x4e, 0x25, 0x2a, 0x88, 0xde, 0x2a, 0xb2, 0x0f, 0x40, 0x97, 0xf2, 0x0e, 0x25, 0xbd, 0x48, 0xe2, 0x39, 0x54, 0xee, 0x06, 0xcc, 0xa4, 0xbf, 0x93, 0xe9, 0xf9, 0xf2, 0xcb, 0xe8, 0xf4, 0xc2, 0x0f, 0x41, 0x8e, 0xdb, 0x69, 0xc3, 0x61, 0xa6, 0x71, 0xba, 0x06, 0xa3, 0x70, 0x2f, 0xbe, 0x97, 0xb4, 0xbb, 0x07, 0x89, 0x91, 0xa0, 0xfa, 0xd8, 0xb4, 0xce, 0xa7, 0xc4, 0x8c, 0xd9, 0x67, 0x15, 0x21, 0xf0, 0xb8, 0x56, 0x62, 0xe1, 0xaf, 0x7c, 0xc4, 0x11, 0x5d, 0xdc, 0x6a, 0x72, 0x10, 0x1e, 0xeb, 0x05, 0xd1, 0x1d, 0x25, 0x74, 0x42, 0x56, 0xe3, 0x3e, 0x4a, 0x18, 0xe4, 0xc3, 0x0c, 0x5a, 0xa6, 0x34, 0x3c, 0xd8, 0x76, 0xc6, 0x55, 0x8f, 0x3b];
    // Pass 1 (hidden inside an eval'd string): for indices 232 down to 2,
    // negate the byte modulo 256, rotate it right by one bit within 8 bits,
    // then subtract 243 modulo 256.
    qo = "qo=232; do{oo[qo]=(-oo[qo])&0xff; oo[qo]=(((oo[qo]>>1)|((oo[qo]<<7)&0xff))-243)&0xff;} while(--qo>=2);";
    eval(qo);
    // Pass 2: for indices 231 down to 3, subtract the preceding element
    // modulo 256. The loop runs downwards, so oo[qo - 1] has not yet been
    // rewritten by this pass when it is read.
    qo = 231;
    do {
        oo[qo] = (oo[qo] - oo[qo - 1]) & 0xff;
    } while (--qo >= 3);
    // Pass 3: for indices 1 through 231, add 129 and then 66 (each modulo 256)
    // and rotate the result left by three bits within 8 bits.
    qo = 1;
    for (; ;) {
        if (qo > 231) break;
        oo[qo] = ((((((oo[qo] + 129) & 0xff) + 66) & 0xff) << 3) & 0xff) | (((((oo[qo] + 129) & 0xff) + 66) & 0xff) >> 5);
        qo++;
    }
    // `po` is assigned without `var`, so it becomes a global variable.
    po = "";
    // Build the decoded text from indices 1 .. oo.length - 2, skipping every
    // index that is a multiple of 6, XOR-ing each kept byte with SM.
    for (qo = 1; qo < oo.length - 1; qo++) if (qo % 6) po += String.fromCharCode(oo[qo] ^ SM);
    // Aliases eval as `qo` and calls it through the alias to run the decoded
    // text; a call through an alias is an indirect eval, which evaluates in
    // global scope, where the global `po` is visible.
    eval("qo=eval;qo(po);");
} </script>
</body>
</html>

这样一来,熟悉js的同学读起来应该没有什么问题。它的逻辑不过是对一个数组(oo)反复改写(取负、按位循环移位、减去前一个元素、加常数等),最后逐个与参数SM异或,再转成字符拼成一段新的js并执行。(sorry,此处没有写得太详尽,如果能够看到这里,说明你已经可以整明白整个过程了)

二、解决方法

解决方法一、

      在python中使用可以执行js的模块

在python中使用可以执行js的模块,如execjs、js2py等模块,引入后直接执行得出结果。这样的好处是可以方便地得出结果,如果对执行效率没有要求可以这样解决。我使用execjs执行一次的时间,大概在3s左右。就后面的经验来看,js2py模块执行速度会快一点。当然还有些帖子会提到使用pyv8,我也被坑得不轻,因为pyv8不能直接pip安装,最要命的是,pyv8只支持python2。试问,现在谁还用python2?

这里就不展示怎么使用execjs和js2py了。

解决方法二、

使用python重写js逻辑。这个相对来说比较难,但这个js混淆比较简单,所以是可行的。优点是重写后执行速度超快。所以,如果有能力,这个是解决此类问题的最好的办法。因为js的解密过程也只是一些特定的运算,从逻辑上讲,js能够实现的,python和其他语言也一样能够实现。前提是你能够看懂js的逻辑。

不绕弯子,重写的python代码如下:

class rewriteJsConfusion(object):
    """Pure-Python port of the byte-table decoding done by the obfuscated JS.

    The JS rewrites a byte array in three passes and then XORs the result into
    a string. The numeric constants differ from one server response to the
    next, so they are keyword-only parameters here; their defaults are the
    values of the single response this demo was originally hard-coded for.

    Args:
        oo: Sequence of byte values (ints in 0..255). It is copied, so the
            caller's sequence is left untouched.
        String: Stored on the instance but not used by any method here.
        start: Highest index rewritten by pass 1. Passes 2 and 3 use
            ``start - 1`` as their upper index, which is what both JS samples
            in the article do. ``oo`` must contain at least ``start + 1``
            items, otherwise the passes raise ``IndexError``.
        minus: Value subtracted (mod 256) at the end of pass 1.
        plus_a: First addend (mod 256) of pass 3.
        plus_b: Second addend (mod 256) of pass 3.
        skip: Indices that are multiples of this value are left out of the
            decoded text.
        xor_key: Value XOR-ed with every kept byte when building the text.
    """

    def __init__(self, oo, String, *, start=226, minus=19,
                 plus_a=246, plus_b=231, skip=7, xor_key=120):
        # Private copy: the passes below rewrite the table in place.
        self.oo = list(oo)
        self.String = String
        self.start = start
        self.minus = minus
        self.plus_a = plus_a
        self.plus_b = plus_b
        self.skip = skip
        self.xor_key = xor_key

    def update_oo_1(self):
        """Pass 1: negate, rotate right by one bit, subtract ``minus``.

        Applied to indices ``start`` down to 2, all arithmetic modulo 256.
        """
        for qo in range(self.start, 1, -1):
            negated = (-self.oo[qo]) & 255
            rotated = (negated >> 1) | ((negated << 7) & 255)
            self.oo[qo] = (rotated - self.minus) & 255

    def update_oo_2(self):
        """Pass 2: subtract the preceding element, modulo 256.

        Applied to indices ``start - 1`` down to 3. Iterating downwards means
        the preceding element is still un-rewritten by this pass when read.
        """
        for qo in range(self.start - 1, 2, -1):
            self.oo[qo] = (self.oo[qo] - self.oo[qo - 1]) & 255

    def update_oo_3(self):
        """Pass 3: add ``plus_a`` and ``plus_b``, then rotate left by three bits.

        Applied to indices 1 through ``start - 1``, all arithmetic modulo 256.
        """
        for qo in range(1, self.start):
            shifted = (((self.oo[qo] + self.plus_a) & 255) + self.plus_b) & 255
            self.oo[qo] = ((shifted << 3) & 255) | (shifted >> 5)

    def get_char(self):
        """Build and return the decoded text from the current table.

        Uses indices 1 .. len(oo) - 2, skips every index divisible by
        ``skip``, and XORs each kept byte with ``xor_key``.

        Returns:
            str: The decoded text. (The original only printed it and returned
            ``None``, which made ``run()`` useless to callers.)
        """
        last = len(self.oo) - 1
        return "".join(
            chr(self.oo[qo] ^ self.xor_key)
            for qo in range(1, last)
            if qo % self.skip
        )

    def run(self):
        """Run the three passes in order and return the decoded text.

        The passes mutate ``self.oo``, so call this once per instance.

        Returns:
            str: The decoded text produced by ``get_char``.
        """
        self.update_oo_1()
        self.update_oo_2()
        self.update_oo_3()
        return self.get_char()



if __name__ == "__main__":
    # Byte table copied from one captured obfuscated response (228 values).
    encoded_table = [0xe1, 0xa6, 0xd0, 0xc3, 0x3c, 0xb2, 0x25, 0x4b, 0x81, 0x3a, 0x60, 0x53, 0x49, 0x3f, 0xc7, 0xbe, 0x34, 0xa8, 0x11, 0xf6, 0xe8, 0xc9, 0x43, 0xf7, 0xec, 0xa1, 0x16, 0x89, 0x8a, 0xc1, 0x35, 0x6b, 0x60, 0xd4, 0x3d, 0xc1, 0x2a, 0x52, 0x85, 0xee, 0xd6, 0xbd, 0x61, 0x09, 0x7d, 0xa6, 0x10, 0x38, 0xab, 0x77, 0xe2, 0xc9, 0x72, 0x59, 0xcd, 0x36, 0xb9, 0x22, 0x09, 0xfd, 0x72, 0x5a, 0x41, 0x58, 0xbe, 0x71, 0x26, 0x9a, 0xce, 0x33, 0x41, 0xe9, 0x52, 0x3a, 0x2d, 0x93, 0x3d, 0xca, 0xb1, 0x5a, 0x82, 0xe8, 0x8f, 0x44, 0x26, 0x8e, 0x01, 0x76, 0xe9, 0x1d, 0x46, 0x56, 0xbd, 0x25, 0x99, 0x03, 0x69, 0xd2, 0x7d, 0xe5, 0x4e, 0x75, 0x1d, 0x06, 0xed, 0xbd, 0xe6, 0x50, 0xb7, 0xa2, 0x46, 0xba, 0xcf, 0x89, 0x41, 0xb8, 0xef, 0x64, 0x5b, 0x1f, 0x89, 0xb5, 0xee, 0x64, 0x09, 0xae, 0xb9, 0x61, 0x0b, 0x71, 0xdf, 0x53, 0x8c, 0xf2, 0x57, 0xc0, 0x2a, 0xcd, 0x75, 0xe0, 0x87, 0xb2, 0xd9, 0x83, 0xae, 0x16, 0x7d, 0xd4, 0x78, 0x64, 0xd2, 0x81, 0x6c, 0x10, 0xe0, 0x93, 0x89, 0xff, 0x73, 0xea, 0x1f, 0x89, 0xf3, 0x1a, 0x0d, 0x86, 0xf9, 0x20, 0x27, 0x1c, 0x52, 0x3b, 0xe0, 0x98, 0x0c, 0xad, 0x66, 0x1c, 0x86, 0x6c, 0x50, 0x39, 0x13, 0xb8, 0xb0, 0x26, 0x5c, 0x0f, 0x05, 0x1d, 0x15, 0x3b, 0xef, 0xe5, 0xda, 0x51, 0x8c, 0x01, 0x75, 0xab, 0x64, 0x8a, 0x3f, 0xe3, 0xd9, 0xce, 0x42, 0xfa, 0x70, 0x66, 0xd6, 0x0b, 0x75, 0x2a, 0x20, 0x13, 0x8b, 0x5a, 0xd0, 0x43, 0x79, 0x32, 0x58, 0xc8, 0x6b, 0x9c, 0x72, 0x3b]
    # Passed through as the constructor's ``String`` argument.
    label = "OD"

    decoder = rewriteJsConfusion(encoded_table, String=label)
    # Print whatever run() hands back.
    outcome = decoder.run()
    print(outcome)

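这只是一个简单的功能demo,如果需要集成到代码中,需要使用正则匹配出所有可变的参数并传入,方可实现。注意:上面demo里写死的常量(226、19、246、231、7、120)和OO数组来自另一次响应,与前文展示的js中的数值(232、243、129、66、6、137)并不相同,这正说明这些参数每次响应都会变化。因为参数较多,所以没有贴上完整代码,大概参数如下: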

    def get_args(self, html):
        """Extract the variable parameters of the obfuscated JS and decode it.

        Twelve values are captured from ``html`` with one regular expression,
        in this order: the argument of the ``setTimeout`` call, the ``oo``
        array literal, the three-digit start index, the right- and left-shift
        amounts and the subtrahend of the first pass, then the two addends and
        the left-shift amount of the third pass, followed by the two addends
        and right-shift amount of its second half.

        Args:
            html: Text of the response containing the obfuscated script.

        Returns:
            Whatever ``rewriteJsConfusion(...).run()`` returns.

        Raises:
            IndexError: If the obfuscated-script pattern is not found.
        """
        # Debug output kept from the original implementation.
        print(html)
        # Raw strings: the pattern is full of regex escapes (\( \[ \d \+),
        # which are invalid escape sequences in an ordinary string literal.
        pattern = (
            r'setTimeout\("..\((.*?)\)".*?no="", oo = (.*?);qo = "qo=(\d{3});'
            r'.*?oo\[qo\]=\(\(\(oo\[qo\]>>(\d*).*?\(\(oo\[qo\]<<(\d*)\)&0xff\)\)-(\d+)\)&0xff;'
            r'.*?oo\[qo\] = \(\(\(\(\(\(oo\[qo\] \+ (\d*)\) & 0xff\) \+ (\d*)\) & 0xff\) << (\d*)\) & 0xff\)'
            r'.*?\(\(\(\(\(oo\[qo\] \+ (\d*)\) & 0xff\) \+ (\d*)\) & 0xff\) >> (\d*)\);'
        )
        matches = re.findall(pattern, html, re.S)
        if not matches:
            # Same exception type as the former ``[0]`` on an empty list,
            # but with a message that says what actually went wrong.
            raise IndexError("obfuscated JS parameters not found in html")

        (arg, oo, qo, mr_1, ml_1, minus_1,
         p31, p32, ml31, p33, p34, mr31) = matches[0]

        # All captured values are strings at this point.
        # NOTE(review): this passes twelve positional arguments; confirm that
        # the rewriteJsConfusion constructor in use accepts this signature and
        # converts the strings itself.
        ac = rewriteJsConfusion(arg, oo, qo, mr_1, ml_1, minus_1,
                                p31, p32, ml31, p33, p34, mr31)
        cookie_str = ac.run()
        return cookie_str

兄弟,大哥只能帮你到这里了,其他的,自己参详一下, 肯定没有问题的。

下一篇我要讲一下另外一种js混淆

 

 

评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值