// Shared crawl state, read and updated by the request callbacks below.

// Seed page; `URL` is reassigned to whichever page the level-by-level crawl fetches next.
var URL = "http://www.xinhuanet.com";
// Page currently being reported on during the final per-URL pass.
var url = URL;

// Running total of image references seen by the level-by-level crawl.
var total = 0;
// Number of levels completed so far.
var count = 0;
// Index into `arr` of the page currently being crawled.
var x = 0;
// Index into `arr` of the page currently being reported on.
var num = 0;

// Per-extension totals accumulated during the final per-URL pass.
var sum_png = 0;
var sum_jpg = 0;
var sum_gif = 0;
var sum_jpeg = 0;

// Every URL discovered so far, in discovery order (the seed is always first).
var arr = [];
// URL -> JSON-encoded report string.
var js = {};
// URL -> level at which the URL was discovered (the seed is level 1).
var mark = {};

arr.push(URL);
mark[url] = 1;
var request = require('request');
/**
 * Count the non-overlapping occurrences of `needle` inside `text`.
 *
 * @param {string} text - Text to search.
 * @param {string} needle - Non-empty substring to look for.
 * @returns {number} How many times `needle` occurs.
 */
function countOccurrences(text, needle) {
  var hits = 0;
  var at = text.indexOf(needle);
  while (at !== -1) {
    hits++;
    at = text.indexOf(needle, at + needle.length);
  }
  return hits;
}

/**
 * Count how often each tracked image extension appears in a page body.
 *
 * @param {string} body - Raw page text.
 * @returns {{png: number, jpg: number, gif: number, jpeg: number}} Per-extension counts.
 */
function countImageRefs(body) {
  return {
    png: countOccurrences(body, ".png"),
    jpg: countOccurrences(body, ".jpg"),
    gif: countOccurrences(body, ".gif"),
    jpeg: countOccurrences(body, ".jpeg")
  };
}

/**
 * Scan `body` for links that start with `scheme` and end at the next ".com",
 * and append every new link shorter than 25 characters to `arr`, recording
 * its level (`count + 2`) in `mark`.
 *
 * @param {string} body - Raw page text.
 * @param {string} scheme - Link prefix, e.g. "http://" or "https://".
 * @returns {number} Number of links newly appended to `arr`.
 */
function collectLinks(body, scheme) {
  var added = 0;
  var cursor = 0;
  while (true) {
    var start = body.indexOf(scheme, cursor);
    if (start === -1) {
      break;
    }
    var domainEnd = body.indexOf(".com", start + scheme.length);
    if (domainEnd === -1) {
      // No ".com" anywhere after this match, so no later match can be
      // completed either. Without this guard the cursor was reset to 4 and
      // the same match was found again forever.
      break;
    }
    var end = domainEnd + 4;
    var link = body.substring(start, end);
    if (link.length < 25 && arr.indexOf(link) === -1) {
      arr.push(link);
      mark[link] = count + 2;
      added++;
    }
    cursor = end + 1;
  }
  return added;
}

/**
 * Move the crawl on to the next stored URL and fetch it with `callback`.
 * Stops with a message instead of requesting `undefined` when the list of
 * known URLs is exhausted.
 */
function advanceToNextUrl() {
  if (x + 1 >= arr.length) {
    console.log("抓取不到合适的URL,运行结束。");
    return;
  }
  URL = arr[++x];
  request(URL, callback);
}

/**
 * Response handler for the final pass: records a JSON report for the page in
 * `url` under `js[url]`, adds its image counts to the `sum_*` totals, then
 * requests the next entry of `arr`, or prints the grand totals after the last.
 *
 * @param {*} error - Error reported by `request`, if any.
 * @param {Object} response - Response object; only `statusCode` is read.
 * @param {string} body - Raw page text.
 */
function reportPage(error, response, body) {
  var report;
  if (!error && response.statusCode === 200) {
    var images = countImageRefs(body);
    sum_png += images.png;
    sum_jpg += images.jpg;
    sum_gif += images.gif;
    sum_jpeg += images.jpeg;
    report = {
      "URL ": url,
      "层级 ": mark[url],
      ".png文件个数 ": images.png,
      ".jpg文件个数 ": images.jpg,
      ".gif文件个数 ": images.gif,
      ".jpeg文件个数 ": images.jpeg,
      "该页符合条件的文件总数 ": images.png + images.jpg + images.gif + images.jpeg
    };
  } else {
    report = {
      "URL ": url,
      "层级 ": mark[url],
      "index ": num,
      "error ": "URL错误。"
    };
  }
  js[url] = JSON.stringify(report);
  console.log("index:", num, ",URL(Key):", url, "\nMessage(Vaule):", js[url], "\n");

  if (num < arr.length - 1) {
    url = arr[++num];
    request(url, reportPage);
    return;
  }
  console.log(JSON.stringify({
    "全部URL.png文件个数累计 ": sum_png,
    "全部URL.jpg文件个数累计 ": sum_jpg,
    "全部URL.gif文件个数累计 ": sum_gif,
    "全部URL.jpeg文件个数累计 ": sum_jpeg,
    "全部URL符合条件的文件数量累计 ": sum_png + sum_jpg + sum_gif + sum_jpeg
  }));
}

/**
 * Response handler for the level-by-level crawl.
 *
 * For a successful response it harvests new "http://" and "https://" links
 * from the page into `arr`, counts image-extension references, and logs a
 * summary. It then either descends to the first newly found link, moves on to
 * the next stored URL when the page yielded no links, or, once ten levels have
 * been completed, starts the final per-URL reporting pass over `arr`.
 * A failed response ends the run at the first level and is skipped otherwise.
 *
 * Reads and updates the file-level state `URL`, `url`, `x`, `count`, `total`,
 * `arr` and `mark`.
 *
 * @param {*} error - Error reported by `request`, if any.
 * @param {Object} response - Response object; only `statusCode` is read.
 * @param {string} body - Raw page text.
 */
function callback(error, response, body) {
  if (error || response.statusCode !== 200) {
    if (count === 0) {
      console.log("URL错误,运行结束。");
      return;
    }
    console.log("error:", "URL错误(index:", x, ",URL:", URL, ")");
    advanceToNextUrl();
    return;
  }

  // Plain-http links are harvested before https ones; the order fixes the
  // positions the new links get in `arr`.
  var cnt = collectLinks(body, "http://");
  cnt += collectLinks(body, "https://");

  var images = countImageRefs(body);
  var sum = images.png + images.jpg + images.gif + images.jpeg;
  total += sum;
  var str = {
    "URL ": URL,
    "index ": x,
    ".png文件个数 ": images.png,
    ".jpg文件个数 ": images.jpg,
    ".gif文件个数 ": images.gif,
    ".jpeg文件个数 ": images.jpeg,
    "该层符合条件的文件总数 ": sum
  };

  // A page without new links does not count as a level, except on the tenth
  // level (count 9), where no further descent is needed anyway.
  if (cnt === 0 && count <= 8) {
    if (count === 0) {
      console.log(JSON.stringify(str));
      console.log("抓取不到合适的URL,运行结束。");
      return;
    }
    console.log("warn:该页(index:", x, ",URL:", URL, ")抓取不到合适的URL,先回上一层再继续往下走。");
    advanceToNextUrl();
    return;
  }

  console.log("\n当前位置:第 ", count + 1, " 层");
  console.log(JSON.stringify(str));
  console.log("前", count + 1, "层符合条件的文件数量累计:", total);
  if (cnt === 0) {
    console.log("该页没有抓到合适的URL。");
  } else {
    console.log("数组存储该页(抓取的)URL的下标范围:", arr.length - cnt, " 到 ", arr.length - 1);
    console.log("第", count + 1, "层抓取到的URL:");
    for (var i = arr.length - cnt; i < arr.length; i++) {
      console.log("index:", i, ",URL:", arr[i]);
    }
  }

  count++;
  if (count === 10) {
    // Ten levels done: switch to the reporting pass, starting from `url`.
    console.log("\n\n数组元素(URL)对应的键值:");
    request(url, reportPage);
    return;
  }

  // Descend into the first link discovered on this page.
  x = arr.length - cnt;
  URL = arr[x];
  console.log("去到下一个层\n");
  request(URL, callback);
}
// Start the crawl: fetch the page stored in URL and hand the response to
// callback, which issues every subsequent request itself.
request(URL, callback);
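// Crawler that extracts information (counts the files matching the criteria).
// Source blog page note: "latest recommended article published 2022-07-29 01:10:46".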