猫眼反扒措施:
1.验证码
解决方案
* 先手动通过验证码,获取请求头的信息(cookie,Referer,等),将此信息放置到请求对象里。
* puppeteer自动模拟验证码,获取请求头,在将请求信息放置到对象中
2-限制请求网站次数
解决方案
* 用动态无数个代理去访问,就可以降低每一个IP地址访问次数
3-字体文件动态加密
解决方案
* 解析加密的字体问价,将字体的编码与图像进行比对,将图像转为SVG(矢量图)字符串编码后,比对字符串相似性。
* 解析加密的字体问价,将字体的编码与图像进行比对,用卷积神经网络算法,去识别字体。
分析爬取过程
0-初识数据
**1. 将验证后的请求头信息解析到请求对象里**
let options2 = {
headers:{
"User-Agent":" Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36",
"Cookie": "__mta=222114471.1574126937575.1575561742585.1575562035163.57; uuid_n_v=v1; uuid=F3A5F6E00A6B11EA866F9F7B8B645BB609807B37AECE45E69BBA4F3E5FC94038; _lxsdk_cuid=16e8146e542c8-04aa42a59aa0e4-5373e62-1fa400-16e8146e542c8; _lxsdk=F3A5F6E00A6B11EA866F9F7B8B645BB609807B37AECE45E69BBA4F3E5FC94038; _csrf=e7d0bd2ca9710f6294df933a1ea8d7546b48f8ed7a44f83a25177dd5f96233d0; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1575507926,1575509335,1575525980,1575537558; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; __mta=222114471.1574126937575.1575537635171.1575537840771.39; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1575562035; _lxsdk_s=16ed6c2213d-710-b40-6ad%7C%7C12",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
"Accept-Encoding": "gzip, deflate, br",
"Accept-Language": "zh-CN,zh;q=0.9",
"Cache-Control": "no-cache",
"Connection":"keep-alive",
"Host": "maoyan.com",
"Pragma": "no-cache",
"Sec-Fetch-Mode": "navigate",
"Sec-Fetch-Site": "none",
"Sec-Fetch-User": "?1",
"Upgrade-Insecure-Requests": "1",
"Referer": "https://maoyan.com/films?showType=3"
}
}
**2.数据库初始化**
//创建与数据库的连接的对象
let con = mysql.createConnection(options);
//建立连接
con.connect((err)=>{
//如果建立连接失败
if(err){
console.log(err)
}else{
console.log('数据库连接成功')
}
})
**3.设置爬取的页面和条数变量**
//index代表第多少页
let index = 0;
//代表第n页第k条数据
let k = 0;
//放置每个页面详情链接地址的数组
let hrefList = []
1-获取列表页的详情页链接地址
async function getPageList(index){
let httpUrl = "https://maoyan.com/films?showType=3&offset="+ index*30;
let response = await axios.get(httpUrl,options2)
//用cheerio解析页面html
let $ = cheerio.load(response.data);
$(".movie-list .movie-item>a").each(function(i,element){
let href = "https://maoyan.com"+$(element).attr('href')
hrefList.push(href);
})
//将当前页面的所有详情页链接地址获取
//console.log(hrefList)
//获取当前页面第k条详情链接地址
getMovieInfo(hrefList[k])
}
2-通过详情页地址获取详细信息
**1.申明获取详情页信息的函数**
async function getMovieInfo(href){}
**2.获取详情页,并用cheerio进行解析**
let response = await axios.get(href,options2)
let $ = cheerio.load(response.data)
**3.获取电影名称/电影图片/电影分类/电影地区和发布时间**
let moviename = $('.movie-brief-container .name').text();
let movieimg = $('.avatar-shadow>.avatar').attr('src');
let cataory = $('body > div.banner > div > div.celeInfo-right.clearfix > div.movie-brief-container > ul > li:nth-child(1)').text();
let areaTime = $("body > div.banner > div > div.celeInfo-right.clearfix > div.movie-brief-container > ul > li:nth-child(2)").text();
let area = areaTime.split('/')[0].trim();
let pubtime = $("body > div.banner > div > div.celeInfo-right.clearfix > div.movie-brief-container > ul > li:nth-child(3)").text().substring(0,10)
**4.由于有些电影未上映,没有时间,所有获取不到duration**
try {
var duration = parseInt(areaTime.split('/')[1].trim());
} catch (error) {
var duration = 0
}
**5.根据是否有评分,采取获取评分的操作或者直接设置未0,获取操作如下**
score = $(".index-right .star-on").css("width").substring(0,2);
**6.评分人的数量和票房获取**
挑战:
* 每一次请求获取到的编码都是不一样的
* 每一次通过页面获取的解析编码的字体文件的链接地址也是不一样的
* 字体文件里的编码和数字图像每一次都不样。
1.获取评分数量和票房的编码
let aa = $("body > div.banner > div > div.celeInfo-right.clearfix > div.movie-stats-container > div:nth-child(1) > div > div > span > span").text()
let bb = $("body > div.banner > div > div.celeInfo-right.clearfix > div.movie-stats-container > div:nth-child(2) > div > span.stonefont").text()
2.获取随机字体文件的链接地址
let fontUrl = $("head > style").html()
let reg = /format.*?url\('(.*?woff)'\)/igs;
//console.log(fontUrl)
let result = reg.exec(fontUrl)
let fontPath = result[1]
3.将随机的字体文件下载并进行解析
let ws = fs.createWriteStream('a.woff')
res.data.pipe(ws);
4.将评分乱码字符编码分割成数组,因为解析编码是一个异步的操纵,设置1个变量b来判断是否以及所有编码解析完毕。将解析完毕的编码放置到新的数组arr11;
let arr1 = aa.split("")
let b = 0;
let arr11 = []
5.循环解析字符串编码
arr1.forEach(async (item,i)=>{})
6.解析乱码字符串
let result = await parseNum(item);
7.解析字符串的过程,opentype.js可以帮助我们快速的解析字体文件
async function parseNum(str){
return new Promise(function(resolve,reject){
//载入字体文件,并解析出字体对象
opentype.load("a.woff", function(err, font) {
if (err) {
reject('Font could not be loaded: ' + err)
} else {
//通过字体对象,并在getPath方法中传入乱码的编码,获取出绘制该字体图像路径,0,100,72分别表示绘制的x坐标为0,y坐标为100,字体大小为72
var path = font.getPath(str,0,100,72);
//通过绘制的图像路径,得出SVG矢量图的字符串
let a = path.toSVG();
let max = 0;
let key = 0;
let arr = []
for(let i=0;i<10;i++){
let temp = similar(a,svgPaths[i])
if(max<temp){
max = temp;
key = i;
}
arr.push({temp,i})
}
resolve({key,max})
}
});
})
//similar(a,b)
}
8.比较字符串的相似程度,将最相似的数据的,key值提取出来
let max = 0;
let key = 0;
let arr = []
for(let i=0;i<10;i++){
let temp = similar(a,svgPaths[i])
if(max<temp){
max = temp;
key = i;
}
arr.push({temp,i})
}
resolve({key,max})
9.判断相似度的最大值是否大于50%,如果大于就将获取得到的k值返回到新数组里,否则将原来的字符串返回。
arr11[i] = parseFloat(result.max)>0.5?result.key:item;
10.最终将评分的数量获取
scorenum = arr11.join("")
scorenum = parseChar(scorenum);
scorenum = isNaN(scorenum)?0:scorenum;
11.同理可得票数数据12.跳转至下一页获取获取下个链接数据
k++;
if(k==hrefList.length){
hrefList = [];
k=0;
console.log(index,"页-"+k+"数据已爬取完毕!-----")
index++;
getPageList(index)
}else{
getMovieInfo(hrefList[k])
}
3-存放到数据库
let arr = [moviename,movieimg,cataory,area,duration,pubtime,score,scorenum,boxoffice,brief,director]
let strSql = 'insert into movie (moviename,movieimg,cataory,area,duration,pubtime,score,scorenum,boxoffice,brief,director) values (?,?,?,?,?,?,?,?,?,?,?)'
await sqlQuery(strSql,arr)
完整代码(少了数字0-9的svg图像路径)
index.js
let mysql = require('mysql');
let axios = require('axios');
let cheerio = require('cheerio');
var parseNum = require('./parseNum');
let options = {
host: "localhost",
// port: "3306", 可选,默认是3306
user: "root",
password: "123456789",
// database: "shop" 删除shop后注释
database: "sobook"
};
// axios请求头的配置信息
let options2 = {
headers: {
"User-Agent": " Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36",
"Cookie": "__mta=222114471.1574126937575.1575561742585.1575562035163.57; uuid_n_v=v1; uuid=F3A5F6E00A6B11EA866F9F7B8B645BB609807B37AECE45E69BBA4F3E5FC94038; _lxsdk_cuid=16e8146e542c8-04aa42a59aa0e4-5373e62-1fa400-16e8146e542c8; _lxsdk=F3A5F6E00A6B11EA866F9F7B8B645BB609807B37AECE45E69BBA4F3E5FC94038; _csrf=e7d0bd2ca9710f6294df933a1ea8d7546b48f8ed7a44f83a25177dd5f96233d0; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1575507926,1575509335,1575525980,1575537558; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; __mta=222114471.1574126937575.1575537635171.1575537840771.39; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1575562035; _lxsdk_s=16ed6c2213d-710-b40-6ad%7C%7C12",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
"Accept-Encoding": "gzip, deflate, br",
"Accept-Language": "zh-CN,zh;q=0.9",
"Cache-Control": "no-cache",
"Connection": "keep-alive",
"Host": "maoyan.com",
"Pragma": "no-cache",
"Sec-Fetch-Mode": "navigate",
"Sec-Fetch-Site": "none",
"Sec-Fetch-User": "?1",
"Upgrade-Insecure-Requests": "1",
"Referer": "https://maoyan.com/films?showType=3"
}
}
//创建与数据库的连接的对象
let con = mysql.createConnection(options);
//建立连接
con.connect((err) => {
//如果建立连接失败
if (err) {
console.log(err)
} else {
console.log('数据库连接成功')
}
})
//index代表第多少页
let index = 0;
//代表第n页第k条数据
let k = 0;
//放置每个页面详情链接地址的数组
let hrefList = []
async function getPageList(index) {
let httpUrl = "https://maoyan.com/films?showType=3&offset=" + index * 30;
let response = await axios.get(httpUrl, options2)
//用cheerio解析页面html
let $ = cheerio.load(response.data);
$(".movie-list .movie-item>a").each(function (i, element) {
let href = "https://maoyan.com" + $(element).attr('href')
hrefList.push(href);
})
//将当前页面的所有详情页链接地址获取
//console.log(hrefList)
//获取当前页面第k条详情链接地址
getMovieInfo(hrefList[k])
}
// 获取电影的详细信息
async function getMovieInfo(href) {
// 获取详情页,并用cheerio进行解析
let response = await axios.get(href, options2)
let $ = cheerio.load(response.data);
// 获取电影名称/电影图片/电影分类/电影地区和发布时间
let moviename = $('.movie-brief-container .name').text();
let movieimg = $('.avatar-shadow>.avatar').attr('src');
let cataory = $('body > div.banner > div > div.celeInfo-right.clearfix > div.movie-brief-container > ul > li:nth-child(1)').text();
let areaTime = $("body > div.banner > div > div.celeInfo-right.clearfix > div.movie-brief-container > ul > li:nth-child(2)").text();
let area = areaTime.split('/')[0].trim();
let pubtime = $("body > div.banner > div > div.celeInfo-right.clearfix > div.movie-brief-container > ul > li:nth-child(3)").text().substring(0, 10);
// 由于有些电影未上映,没有时间,所有获取不到duration
try {
var duration = parseInt(areaTime.split('/')[1].trim());
} catch (error) {
var duration = 0
}
let score;
let scorenum;
let boxoffice;
// 根据是否有评分,采取获取评分的操作或者直接设置未0,获取操作如下
if ($(".index-right .star-on").length == 0) {
score = 0;
scorenum = 0;
boxoffice = 0;
let brief = $('#app > div > div.main-content > div > div.tab-content-container > div.tab-desc.tab-content.active > div:nth-child(1) > div.mod-content > span').text();
let director = $('#app > div > div.main-content > div > div.tab-content-container > div.tab-desc.tab-content.active > div:nth-child(2) > div.mod-content > div > div:nth-child(1) > ul > li > div > a').text();
let arr = [moviename, movieimg, cataory, area, duration, pubtime, score, scorenum, boxoffice, brief, director];
let strSql = 'insert into movie (moviename,movieimg,cataory,area,duration,pubtime,score,scorenum,boxoffice,brief,director) values (?,?,?,?,?,?,?,?,?,?,?)';
await sqlQuery(strSql, arr)
console.log(index, "页-", k, "电影信息已写入数据库:", moviename, href);
k++;
if (k == hrefList.length) {
hrefList = [];
k = 0;
console.log(index, "页数据已爬取完毕!-----");
index++;
getPageList(index);
} else {
// 页面未爬完继续
getMovieInfo(hrefList[k]);
}
} else {
score = $(".index-right .star-on").css("width").substring(0, 2);
let fontUrl = $("head > style").html();
let reg = /format.*?url\('.*?woff'\)/igs;
let result = reg.exec(fontUrl);
let fontPath = result[1];
axios.get("http:" + fontPath, {
responseType: "stream"
}).then(function (res, req) {
let ws = fs.createWriteStream('a.woff');
res.data.pipe(ws);
// 写入完成之后进行解析
ws.on("close", () => {
console.log("字体文件写入");
let aa = $("body > div.banner > div > div.celeInfo-right.clearfix > div.movie-stats-container > div:nth-child(1) > div > div > span > span").text();
let bb = $("body > div.banner > div > div.celeInfo-right.clearfix > div.movie-stats-container > div:nth-child(2) > div > span.stonefont").text();
// 评分数量
let arr1 = aa.split("");
let b = 0;
let arr11 = [];
arr1.forEach(async (item, i) => {
// 将编码传入解析
let result = await parseNum(item);
b++;
// 将解析的图像和svg里面的10个图像进行对比,如果相似度超过50%,就认为它为这个数字
arr11[i] = parseFloat(result.max) > 0.5 ? result.key : item;
if (b == arr1.length) {
scorenum = arr11.join("");
scorenum = parseChar(scorenum);
scorenum = isNaN(scorenum) ? 0 : scorenum;
console.log(scorenum);
let bb = $("body > div.banner > div > div.celeInfo-right.clearfix > div.movie-stats-container > div:nth-child(2) > div > span.stonefont").text();
// 票房
let arr2 = bb.split("");
if (arr2.length == 0) {
arr2 = ["0"];
}
let arr22 = [];
let a = 0;
arr2.forEach(async (item, i) => {
// 将编码传入解析
let result = await parseNum(item, fontPath);
a++;
// 将解析的图像和svg里面的10个图像进行对比,如果相似度超过50%,就认为它为这个数字
arr22[i] = parseFloat(result.max) > 0.5 ? result.key : item;
if (a == arr2.length) {
boxoffice = arr22.join("") + $("body > div.banner > div > div.celeInfo-right.clearfix > div.movie-stats-container > div:nth-child(2) > div > span.stonefont").text();
boxoffice = parseChar(boxoffice);
boxoffice = isNaN(boxoffice) ? 0 : boxoffice;
console.log(boxoffice);
let arr = [moviename, movieimg, cataory, area, duration, pubtime, score, scorenum, boxoffice, brief, director];
let strSql = 'insert into movie (moviename,movieimg,cataory,area,duration,pubtime,score,scorenum,boxoffice,brief,director) values (?,?,?,?,?,?,?,?,?,?,?)';
await sqlQuery(strSql, arr)
console.log(index, "页-", k, "电影信息已写入数据库:", moviename, href);
k++;
if (k == hrefList.length) {
hrefList = [];
k = 0;
console.log(index, "页数据已爬取完毕!-----");
index++;
getPageList(index);
} else {
// 页面未爬完继续
getMovieInfo(hrefList[k]);
}
}
})
}
})
})
})
}
}
// https://maoyan.com/films/246300
// getMovieInfo("https://maoyan.com/films/1250952")
// getMovieInfo("https://maoyan.com/films/342773")
function parseChar(str) {
let unit = str[str.length - 1];
switch (unit) {
case "万":
return parseFloat(str.substring(0, str.length - 1)) * 10000;
case "亿":
return parseFloat(str.substring(0, str.length - 1)) * 100000000;
default:
return parseFloat(str);
}
}
function sqlQuery(strSql, arr) {
return new Promise(function (resolve, reject) {
con.query(strSql, arr, (err, results) => {
if (err) {
reject(err);
} else {
resolve(results);
}
})
})
}
parseNum.js
let fs = require("fs");
var openType = require('opentype.js');
var similar = require('./similar1');
// let svgPaths = {}
async function parseNum(str) {
return new Promise(function (resolve, reject) {
//载入字体文件,并解析出字体对象
opentype.load("a.woff", function (err, font) {
if (err) {
reject('Font could not be loaded: ' + err)
} else {
//通过字体对象,并在getPath方法中传入乱码的编码,获取出绘制该字体图像路径,0,100,72分别表示绘制的x坐标为0,y坐标为100,字体大小为72
var path = font.getPath(str, 0, 100, 72);
console.log(path.toSVG());
//通过绘制的图像路径,得出SVG矢量图的字符串
let a = path.toSVG();
// 比较字符串的相似程度,将最相似的数据的,key值提取出来
let max = 0;
let key = 0;
let arr = []
for (let i = 0; i < 10; i++) {
let temp = similar(a, svgPaths[i])
if (max < temp) {
max = temp;
key = i;
}
arr.push({
temp,
i
})
}
resolve({
key,
max
})
}
});
})
//similar(a,b)
}
similar1.js
function similar(s, t, f) {
if (!s || !t) {
return 0;
}
var l = s.length > t.length ? s.length : t.length;
var n = s.length;
var m = t.length;
var d = [];
f = f || 3;
var min = function (a, b, c) {
return a < b ? (a < c ? a : c) : (b < c ? b : c);
}
var i, j, si, tj, cost;
if (n === 0) return m;
if (m === 0) return n;
for (i = 0; i <= n; i++) {
d[i] = [];
d[i][0] = i;
}
for (j = 0; j <= m; j++) {
d[0][j] = j;
}
for (i = 1; i <= n; i++) {
si = s.charAt(i - 1);
for (j = 1; j <= m; j++) {
tj = t.charAt(j - 1);
if (si === tj) {
cost = 0;
} else {
cost = 1;
}
d[i][j] = min(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1] - 1)
}
}
let res = (1 - d[n][m] / l);
return res.toFixed(f);
}