[爬虫必看] 2025最新JavaScript逆向爬虫知识手册-CSDN博客

本文链接：https://blog.csdn.net/qq_28247201/article/details/147579739

JavaScript逆向爬虫知识手册

1. JavaScript基础核心概念

1.1 语法与执行环境

变量作用域（var/let/const）

var：

函数作用域或全局作用域
存在变量提升（hoisting）
可以重复声明

// Demonstrates var hoisting: the declaration of `a` is hoisted, the assignment is not.
console.log(a); // undefined — the declaration is hoisted, the assignment has not run yet
var a = 10;
var a = 20; // redeclaring the same name with var is allowed

let/const：

块级作用域（{}内有效）
声明会被提升，但在声明语句之前处于暂时性死区（TDZ），此时访问会抛出ReferenceError
不可重复声明
const声明必须初始化且不能修改（对于对象是引用不可变）

// Demonstrates let/const rules; the commented-out lines are the ones that would fail.
// console.log(b); // error: temporal dead zone (b is read before its declaration)
let b = 10;
// let b = 20; // error: redeclaration is not allowed

const obj = {x:1};
obj.x = 2; // allowed: mutates a property, the binding itself is untouched
// obj = {}; // error: a const binding cannot be reassigned

逆向场景：

混淆代码常用var制造变量提升陷阱
加密逻辑中常用const保护关键变量不被修改

严格模式（"use strict"）的陷阱

严格模式特点：

禁止意外创建全局变量
禁止删除不可删除的属性
函数参数不能重名
禁止使用with语句
普通函数被直接调用时this为undefined（全局作用域中的this仍然指向全局对象）

// Intentionally failing sample: each marked line is rejected under strict mode.
"use strict";
a = 10; // error: assignment to an undeclared variable

function test(a, a) { // error: duplicate parameter names
  console.log(a);
}

逆向注意：

某些加密函数依赖非严格模式特性
补环境时需要确保严格模式一致性

1.2 自执行函数（IIFE）

语法变体

基础形式：

// Basic IIFE: the parenthesised function expression is invoked immediately.
(function() {
  console.log('IIFE执行');
})();

带参数形式：

// IIFE with arguments: 10 and 20 are bound to a and b at the call site.
(function(a, b) {
  console.log(a + b); // 30
})(10, 20);

运算符前缀形式：

// Operator-prefixed IIFEs: a leading unary operator makes `function` parse as an
// expression, so it can be called directly without wrapping parentheses.
!function() {
  console.log('使用!运算符');
}();

+function() {
  console.log('使用+运算符');
}();

赋值形式：

// Assignment-form IIFE: `result` receives the function's return value, not the function.
const result = function(x) {
  return x * 2;
}(10);
console.log(result); // 20

逆向应用：

加密逻辑常封装在IIFE中防止全局污染
可能通过参数传递关键配置或密钥

1.3 作用域与闭包

闭包的内存泄漏风险

/**
 * Builds a closure that keeps a one-million-element array reachable.
 *
 * @returns {function(): void} Function that logs the captured array's length.
 */
function createLeak() {
  const payload = Array(1000000).fill('*');
  return () => {
    console.log(payload.length);
  };
}

// As long as leakFn is reachable, the array captured by the closure stays alive.
const leakFn = createLeak();

解决方法：

/**
 * Builds a closure that reports the size of a large array without pinning the
 * array in memory after the first call.
 *
 * @returns {function(): number} Function returning the array length (1000000).
 *   The first call records the length and drops the array; later calls return
 *   the recorded length.
 */
function createSafe() {
  // `let`, not `const`: the binding is cleared below so the array can be collected.
  let bigData = new Array(1000000).fill('*');
  let cachedLength;
  return function() {
    if (bigData !== null) {
      cachedLength = bigData.length; // keep only the value that is needed
      bigData = null; // release the array manually
    }
    return cachedLength;
  };
}

通过闭包隐藏关键变量

// Closure-based module: `key` lives only inside the IIFE and is reachable solely
// through the two methods of the returned object.
const getEncryptKey = (() => {
  const key = 'SECRET_KEY_123';

  // XORs a character's UTF-16 code unit with the key length.
  const xorWithKeyLength = (ch) => ch.charCodeAt(0) ^ key.length;

  return {
    getKey() {
      return key;
    },
    encrypt(data) {
      const codes = data.split('').map((ch) => xorWithKeyLength(ch));
      return codes.join('-');
    }
  };
})();

console.log(getEncryptKey.getKey()); // reachable through the accessor
console.log(getEncryptKey.key); // undefined — the variable is not a property

逆向技巧：

在Chrome调试器中设置断点，查看闭包变量
修改函数返回值暴露隐藏变量
使用console.dir(func)查看闭包信息

2. 浏览器与Node.js环境差异

2.1 全局对象对比

特性	浏览器	Node.js
全局对象	window	global
DOM访问	document可用	默认不可用
模块系统	ES6 Modules	CommonJS（新版本同时支持ES6 Modules）
文件操作	不可用	fs模块

环境补全示例：

// Environment check: treated as a browser only when both window and document exist.
const isBrowser = ![typeof window, typeof document].includes('undefined');

// Outside a browser, expose a jsdom window/document pair on the Node.js global object.
if (!isBrowser) {
  const { JSDOM } = require('jsdom');
  const dom = new JSDOM('<!DOCTYPE html>');
  global.window = dom.window;
  global.document = dom.window.document;
}

2.2 模块系统差异

CommonJS (Node.js):

// Export: replace the whole exports object
module.exports = { a: 1 };
// or: attach a property to the existing exports object
// NOTE(review): shown as alternatives — if both ran in one module, `exports` would presumably
// still reference the object that module.exports replaced, so `b` would be lost; confirm before combining.
exports.b = 2;

// Import
const mod = require('./module');

ES6 Module (浏览器):

// Export: one named export and one default export
export const a = 1;
export default { b: 2 };

// Import: default export bound to `mod`, named export `a`
// NOTE(review): presumably belongs to a separate consuming file; shown together for brevity.
import mod, { a } from './module.js';

逆向处理：

Webpack打包的代码通常是两者的混合体
需要识别__webpack_require__这种自定义加载器

2.3 环境检测与模拟

常见检测点：

// 1. Probe for the window object
const hasWindow = typeof window !== 'undefined';
if (hasWindow) {
  console.log('浏览器环境');
}

// 2. Probe for the process object and its Node.js version field
const isNode = typeof process !== 'undefined' && Boolean(process.versions?.node);
if (isNode) {
  console.log('Node.js环境');
}

// 3. Probe for the navigator object
const hasNavigator = typeof navigator !== 'undefined';
if (hasNavigator) {
  console.log('浏览器环境，UA:', navigator.userAgent);
}

补全技巧：

// Provide a minimal navigator stub, but only when the global has none.
global.navigator ||= {
  userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
};

// Provide a minimal document stub whose `cookie` accessor follows browser semantics:
// - writing "name=value; attr..." stores only the name/value pair (attributes are dropped),
// - writing an existing name overwrites it instead of appending a duplicate,
// - reading returns all pairs joined with "; ".
// Expiry and deletion (expires / max-age) are not emulated.
if (!global.document) {
  // The jar lives in the closure so it does not appear as a property of document.
  const cookieJar = new Map();

  global.document = {
    get cookie() {
      return [...cookieJar]
        .map(([name, value]) => (name === '' ? value : `${name}=${value}`))
        .join('; ');
    },
    set cookie(val) {
      // Only the first ";"-separated segment carries the name/value pair.
      const [pair] = String(val).split(';');
      const eq = pair.indexOf('=');
      const name = eq === -1 ? '' : pair.slice(0, eq).trim();
      const value = (eq === -1 ? pair : pair.slice(eq + 1)).trim();
      if (name === '' && value === '') {
        return; // nothing to store
      }
      cookieJar.set(name, value);
    }
  };
}

3. Webpack模块化与逆向核心

3.1 Webpack模块加载原理

典型Webpack模块结构：

// Module table: maps a module id to its factory function.
var modules = {
  "./src/index.js": function(module, exports, __webpack_require__) {
    "use strict";
    // NOTE(review): "..." looks like a placeholder for the bundled module source.
    eval("...");
  }
};

// Cache of modules that have already been instantiated, keyed by module id.
// It must exist before the first __webpack_require__ call.
var installedModules = {};

/**
 * Loads a module by id, running its factory at most once.
 *
 * @param {string|number} moduleId Key into `modules`.
 * @returns {*} The module's `exports` object (the cached one on repeated calls).
 */
function __webpack_require__(moduleId) {
  // Cache hit: return the exports created on the first load.
  if (installedModules[moduleId]) {
    return installedModules[moduleId].exports;
  }
  // Register the module before running it, so a circular require sees the partial exports.
  var module = installedModules[moduleId] = {
    i: moduleId,
    exports: {}
  };
  // Run the factory with `this` bound to the exports object.
  modules[moduleId].call(module.exports, module, module.exports, __webpack_require__);
  return module.exports;
}

3.2 定位Webpack核心模块

查找技巧：

搜索__webpack_require__调用
查找window["webpackJsonp"]数组推送
搜索入口模块（通常为0或数字最小的模块）

示例Hook：

// Keep a reference to the original loader.
const origRequire = __webpack_require__;

// Intercept calls through a Proxy rather than a plain wrapper function, so that
// static members of the loader (for example `__webpack_require__.m`) stay readable
// and writable through the hooked binding.
// NOTE(review): this only rebinds the name visible in this scope; code that captured
// the loader earlier presumably keeps calling the original — confirm per bundle.
__webpack_require__ = new Proxy(origRequire, {
  apply(target, thisArg, args) {
    const [moduleId] = args;
    console.log('加载模块:', moduleId);
    const exports = Reflect.apply(target, thisArg, args);

    // Dump the exports of the module under investigation.
    if (moduleId === 'xyz') {
      console.log('导出内容:', exports);
    }

    return exports;
  }
});

3.3 模块导出提取技巧

方法1：直接获取导出对象

// Assume the module id is 123.
const targetModule = __webpack_require__.m[123];
const module = { exports: {} };
// Run the factory against a fresh module object, with `this` bound to the exports
// object, so factories that reference `this` see an exports object rather than undefined.
// This bypasses the loader's cache, so the module body is executed again.
targetModule.call(module.exports, module, module.exports, __webpack_require__);
console.log(module.exports);

方法2：Hook导出过程

// Replace every own export with a logging accessor pair; the real values are kept
// in originalExports.
const originalExports = {};
for (const key of Object.keys(module.exports)) {
  const descriptor = Object.getOwnPropertyDescriptor(module.exports, key);

  // Redefining a non-configurable property throws TypeError, so skip it and
  // report it instead of aborting the whole loop.
  if (!descriptor.configurable) {
    console.warn('无法Hook不可配置的导出属性:', key);
    continue;
  }

  originalExports[key] = module.exports[key];

  Object.defineProperty(module.exports, key, {
    get: function() {
      console.log('获取导出属性:', key);
      return originalExports[key];
    },
    set: function(val) {
      console.log('设置导出属性:', key, val);
      originalExports[key] = val;
    }
  });
}

4. 函数、对象与高阶编程

4.1 高阶函数与工厂模式

函数柯里化示例：

/**
 * Curried three-argument addition: add(a)(b)(c) === a + b + c.
 *
 * @param {number} a First operand.
 * @returns {function(number): function(number): number} Chain collecting b, then c.
 */
function add(a) {
  return (b) => (c) => a + b + c;
}

// Operands supplied one call at a time.
const add5 = add(5);
const add5And10 = add5(10);
console.log(add5And10(20)); // 35

工厂模式示例：

/**
 * Factory returning an encryptor bound to `secret` through a closure.
 *
 * @param {string|number} secret Secret whose string length is used as the shift.
 *   Non-string values are converted with String() first, so a numeric secret such
 *   as 123 yields a shift of 3 rather than NaN.
 * @returns {{encrypt: function(string): string, getVersion: function(): string}}
 */
function createEncryptor(secret) {
  const version = '1.0';
  const shift = String(secret).length;

  return {
    // Adds the shift to each UTF-16 code unit and concatenates the decimal results.
    encrypt: function(data) {
      return data.split('').map(c => c.charCodeAt(0) + shift).join('');
    },
    getVersion: function() {
      return version;
    }
  };
}

const encryptor = createEncryptor(123);
console.log(encryptor.encrypt('abc')); // uses the secret captured by the closure

4.2 原型链与对象篡改

监听数组变化：

// Keep the native implementation so the hook can delegate to it.
const originalPush = Array.prototype.push;

// Logging wrapper: reports the length the array will have, then performs the real push.
Array.prototype.push = function(...items) {
  const lengthAfterPush = this.length + items.length;
  console.log('数组被修改，新长度:', lengthAfterPush);
  return originalPush.apply(this, items);
};

const arr = [1, 2];
arr.push(3); // triggers the console.log above

逆向应用：

监控关键对象的修改
替换原生方法以拦截数据

4.3 Proxy与反射API

拦截对象示例：

const target = { secretKey: 'abc123', normalData: 'hello' };

// Proxy that masks reads of `secretKey`, rejects writes to it, and passes every
// other property straight through to the target.
const proxy = new Proxy(target, {
  get(obj, prop) {
    if (prop !== 'secretKey') {
      return obj[prop];
    }
    console.warn('有人试图访问secretKey!');
    return '******';
  },
  set(obj, prop, value) {
    const isProtected = prop === 'secretKey';
    if (isProtected) {
      throw new Error('secretKey不可修改!');
    }
    obj[prop] = value;
    return true;
  }
});

console.log(proxy.secretKey); // warns and prints ******
proxy.normalData = 'world'; // ordinary write, forwarded to the target

5. 异步编程与网络请求逆向

5.1 事件循环与异步拦截

执行顺序示例：

// Scheduling demo: two synchronous logs, one timer callback and two microtasks.
console.log('1. 同步代码');

// Timer callback (macrotask), delay 0.
setTimeout(() => console.log('2. setTimeout'), 0);

// Microtask queued through a resolved promise.
Promise.resolve().then(() => console.log('3. Promise微任务'));

// Microtask queued directly.
queueMicrotask(() => console.log('4. queueMicrotask'));

console.log('5. 同步代码结束');

/* Output order (synchronous code first, then microtasks in queue order, then the timer):
1. 同步代码
5. 同步代码结束
3. Promise微任务
4. queueMicrotask
2. setTimeout
*/

逆向技巧：

在微任务中修改数据可确保在渲染前生效
宏任务适合插入长时间运行的操作

5.2 AJAX/请求参数捕获

Hook XMLHttpRequest：

const originalSend = XMLHttpRequest.prototype.send;

// Hook send(): log the request, attach a response logger, then delegate to the native send.
XMLHttpRequest.prototype.send = function(body) {
  // _url and _method are recorded by the open() hook below.
  console.log('请求URL:', this._url || this.url);
  console.log('请求方法:', this._method || this.method);
  console.log('请求体:', body);

  this.addEventListener('load', function() {
    // Reading responseText throws InvalidStateError unless responseType is '' or 'text',
    // so fall back to the generic `response` for json / blob / arraybuffer / document requests.
    const isTextResponse = this.responseType === '' || this.responseType === 'text';
    console.log('响应:', isTextResponse ? this.responseText : this.response);
  });

  return originalSend.call(this, body);
};

// Hook open() to remember the method and URL on the instance for the send() hook.
const originalOpen = XMLHttpRequest.prototype.open;
XMLHttpRequest.prototype.open = function(method, url) {
  this._method = method;
  this._url = url;
  return originalOpen.apply(this, arguments);
};

Hook fetch API：

const originalFetch = window.fetch;

/**
 * Logging wrapper around fetch.
 *
 * The response is returned to the caller as soon as the original fetch resolves.
 * The body is logged from a clone in the background, so streaming responses are
 * not held back and a failure while reading the clone cannot reject the caller's request.
 */
window.fetch = async function(url, options) {
  console.log('Fetch请求:', url);
  console.log('请求选项:', options);

  const response = await originalFetch(url, options);

  // Read a clone so the caller can still consume the original body.
  response.clone().text().then(
    (text) => console.log('响应内容:', text),
    (err) => console.warn('响应内容读取失败:', err)
  );

  return response;
};

6. 加密算法与浏览器环境对抗

6.1 Webpack中的加密模块定位

查找技巧：

搜索CryptoJS、encrypt、decrypt等关键字
查找包含createCipheriv、createHash等Node.js加密方法的模块
查找包含BEGIN PUBLIC KEY等典型标记的模块

示例定位：

// Scan every registered module factory for crypto-related keywords and dump the
// exports of each match.
for (const moduleId in __webpack_require__.m) {
  const moduleFactory = __webpack_require__.m[moduleId];
  const moduleCode = moduleFactory.toString();

  if (moduleCode.includes('CryptoJS') ||
      moduleCode.includes('createCipheriv') ||
      moduleCode.includes('encrypt')) {
    console.log('发现加密模块:', moduleId);

    // Run the factory on a fresh module object, with `this` bound to the exports object.
    // A factory that throws must not abort the scan of the remaining modules.
    const module = { exports: {} };
    try {
      moduleFactory.call(module.exports, module, module.exports, __webpack_require__);
      console.log('模块导出:', module.exports);
    } catch (err) {
      console.warn('模块执行失败:', moduleId, err);
    }
  }
}

6.2 常见加密算法特征

AES特征：

// Typical AES encryption code built on CryptoJS
const CryptoJS = require('crypto-js');
const key = CryptoJS.enc.Utf8.parse('1234567890123456');
const iv = CryptoJS.enc.Utf8.parse('1234567890123456');

/**
 * Encrypts `data` with AES using the key and iv defined above.
 *
 * @param data Plaintext handed directly to CryptoJS.AES.encrypt.
 * @returns {string} Result of calling toString() on the CryptoJS cipher output.
 */
function encrypt(data) {
  const cipherOptions = {
    iv,
    mode: CryptoJS.mode.CBC, // mode: CBC/ECB/CFB/OFB
    padding: CryptoJS.pad.Pkcs7 // padding: Pkcs7/ZeroPadding/NoPadding
  };
  const cipherOutput = CryptoJS.AES.encrypt(data, key, cipherOptions);
  return cipherOutput.toString();
}

RSA特征：

// Typical PEM-encoded RSA public key
// NOTE(review): the "..." line looks like an elision, so this key is presumably not usable as written.
const publicKey = `-----BEGIN PUBLIC KEY-----
MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAu1SU1LfVLPHCozMxH2Mo
4lgOEePzNm0tRgeLezV6ffAt0gunVTLw7onLRnrq1Xz5B9G4xYDY6MrkC5c1tRg
...
-----END PUBLIC KEY-----`;

// RSA encryption with the Node.js crypto module, using OAEP padding
const crypto = require('crypto');
const encrypted = crypto.publicEncrypt(
  {
    key: publicKey,
    padding: crypto.constants.RSA_PKCS1_OAEP_PADDING
  },
  Buffer.from('要加密的数据')
);

6.3 浏览器环境补全

UserAgent伪造：

// Pin navigator.userAgent to a fixed string as a read-only, non-configurable data property.
const spoofedUserAgentDescriptor = {
  value: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
  configurable: false,
  writable: false
};
Object.defineProperty(navigator, 'userAgent', spoofedUserAgentDescriptor);

Canvas指纹伪造：

// 1. Override toDataURL so every canvas returns the same fixed data URL,
//    whatever was drawn on it.
// NOTE(review): the base64 payload ends in "..." and looks truncated for the article.
HTMLCanvasElement.prototype.toDataURL = function() {
  return 'data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...';
};

// 2. Finer-grained control: wrap fillText on 2d contexts handed out by getContext.
const originalGetContext = HTMLCanvasElement.prototype.getContext;
// Contexts that already carry the wrapper; repeated getContext('2d') calls on one
// canvas return the same context object, which must not be wrapped again.
const patchedContexts = new WeakSet();

HTMLCanvasElement.prototype.getContext = function(...args) {
  const context = originalGetContext.apply(this, args);

  // getContext returns null for an unsupported type or when the canvas is already
  // bound to a different context type, so guard before touching the result.
  if (args[0] === '2d' && context && !patchedContexts.has(context)) {
    patchedContexts.add(context);
    const originalFillText = context.fillText;
    context.fillText = function() {
      // Hook point: adjust the text rendering here before delegating.
      return originalFillText.apply(this, arguments);
    };
  }

  return context;
};

9. 代码混淆与反混淆实战

9.1 Webpack混淆代码特征

典型特征：

变量名被替换为短名称（a, b, c等）
字符串被编码或拆解
控制流扁平化（大量switch-case结构）
使用数组存储字符串，运行时拼接
关键逻辑使用eval或Function动态执行

示例混淆代码：

// String table: hex-escaped literals that decode to "Hello", "world" and "log".
var _0x3f8c = ["\x48\x65\x6C\x6C\x6F", "\x77\x6F\x72\x6C\x64", "\x6C\x6F\x67"];

// Control-flow flattening sample: the argument selects a switch case, and each case
// calls a helper stored in a lookup object.
// Returns 11 for 0 and 40 for 1; any other argument falls through and returns undefined.
function _0x45a8(_0x12d93f) {
  // Helper table: 'a' adds 1, 'b' doubles.
  var _0x5e8b26 = {
    'a': function(_0x3f8c0x1) {
      return _0x3f8c0x1 + 1;
    },
    'b': function(_0x3f8c0x2) {
      return _0x3f8c0x2 * 2;
    }
  };
  
  switch (_0x12d93f) {
    case 0:
      return _0x5e8b26['a'](10);
    case 1:
      return _0x5e8b26['b'](20);
  }
}

9.2 AST反混淆技术

使用Babel解析代码：

const parser = require('@babel/parser');
const traverse = require('@babel/traverse').default;
const generator = require('@babel/generator').default;

// 1. Parse the source into an AST
const code = `function test() { return 'hello'; }`;
const ast = parser.parse(code);

// 2. Walk the AST and rewrite nodes
traverse(ast, {
  FunctionDeclaration(path) {
    // Rename the function; anonymous declarations (id === null) are left alone.
    if (path.node.id) {
      path.node.id.name = 'newTest';
    }
  },
  StringLiteral(path) {
    const { node } = path;

    // Case 1: the source text used \xNN / \uNNNN escapes. The parser has already
    // decoded them into node.value; the original spelling survives only in
    // node.extra.raw, which the generator reprints verbatim. Dropping `extra`
    // makes the generator print the decoded value.
    if (node.extra && /\\(?:x[0-9a-fA-F]{2}|u[0-9a-fA-F]{4})/.test(node.extra.raw)) {
      delete node.extra;
    }

    // Case 2: the decoded value itself still contains literal "\xNN" text
    // (a double-escaped string); decode that text as well.
    if (/\\x[0-9a-fA-F]{2}/.test(node.value)) {
      node.value = node.value.replace(/\\x([0-9a-fA-F]{2})/g,
        (_, hex) => String.fromCharCode(parseInt(hex, 16)));
    }
  }
});

// 3. Generate code from the modified AST
const output = generator(ast).code;
console.log(output);

控制流扁平化还原：

// Code before processing (flattened control flow):
// returns 11 for 0 and 40 for 1; any other argument returns undefined.
function _0x45a8(_0x12d93f) {
  switch (_0x12d93f) {
    case 0: return 11;
    case 1: return 40;
  }
}

// Code after processing: the branches are written out with their original arithmetic.
/**
 * @param {number} flag Branch selector.
 * @returns {number|undefined} 11 for flag 0, 40 for flag 1, otherwise undefined.
 */
function decodedFunc(flag) {
  if (flag === 0) return 10 + 1; // restored original logic
  if (flag === 1) return 20 * 2; // restored original logic
  return undefined;
}