爬虫程序提取信息(统计符合条件的文件数量)

// ---- Crawl state shared by the response handlers further down ----

// Page currently being crawled; starts at the seed site.
var URL = "http://www.xinhuanet.com";
// Page currently being reported on in the final per-URL pass.
var url = URL;

// Running image-file total over every page crawled so far.
var total = 0;
// Number of layers already completed.
var count = 0;
// Index into `arr` of the page being crawled.
var x = 0;
// Index into `arr` of the page being reported on.
var num = 0;

// Per-extension totals accumulated during the final per-URL pass.
var sum_png = 0;
var sum_jpg = 0;
var sum_gif = 0;
var sum_jpeg = 0;

// Every URL discovered so far, in discovery order.
var arr = [];
// URL -> JSON report string produced by the final pass.
var js = {};
// URL -> layer number at which the URL was discovered.
var mark = {};

// Seed the queue with the start page and record it as layer 1.
arr.push(URL);
mark[url] = 1;
var request = require('request');

// A discovered "scheme://host.com" string is queued only when it is shorter than this.
var SITE_URL_LENGTH_LIMIT = 25;
// Number of layers to descend before switching to the per-URL report pass.
var MAX_CRAWL_DEPTH = 10;

/**
 * Count non-overlapping occurrences of `needle` in `text`.
 *
 * @param {string} text - Text to scan.
 * @param {string} needle - Non-empty substring to look for.
 * @returns {number} How many times `needle` occurs.
 */
function countOccurrences(text, needle) {
    var hits = 0;
    var at = text.indexOf(needle);
    while (at !== -1) {
        hits++;
        at = text.indexOf(needle, at + needle.length);
    }
    return hits;
}

/**
 * Count the ".png", ".jpg", ".gif" and ".jpeg" substrings in a page body.
 *
 * @param {string} body - Page text.
 * @returns {{png: number, jpg: number, gif: number, jpeg: number, sum: number}}
 *          Per-extension counts and their total.
 */
function countImageFiles(body) {
    var png = countOccurrences(body, ".png");
    var jpg = countOccurrences(body, ".jpg");
    var gif = countOccurrences(body, ".gif");
    var jpeg = countOccurrences(body, ".jpeg");
    return { png: png, jpg: jpg, gif: gif, jpeg: jpeg, sum: png + jpg + gif + jpeg };
}

/**
 * Scan `body` for "<scheme>….com" strings and append every new, short enough
 * one to `arr`, recording its layer (`count + 2`) in `mark`.
 *
 * @param {string} body - Page text.
 * @param {string} scheme - "http://" or "https://".
 * @returns {number} Number of URLs appended to `arr`.
 */
function collectSiteUrls(body, scheme) {
    var added = 0;
    var from = 0;
    while (true) {
        var start = body.indexOf(scheme, from);
        if (start === -1) {
            break;
        }
        var tld = body.indexOf(".com", start + scheme.length);
        if (tld === -1) {
            // No ".com" after this point, so no later match can have one either.
            // Without this check the "-1 + 4" end index sent the scan back to
            // the start of the body and the loop never terminated.
            break;
        }
        var end = tld + 4;
        var site = body.substring(start, end);
        if (site.length < SITE_URL_LENGTH_LIMIT && arr.indexOf(site) === -1) {
            arr.push(site);
            mark[site] = count + 2;
            added++;
        }
        from = end + 1;
    }
    return added;
}

/**
 * Move on to the next queued URL and crawl it, or stop when the queue is
 * exhausted (previously this requested `undefined`).
 */
function crawlNextQueuedUrl() {
    if (x + 1 >= arr.length) {
        console.log("没有更多可抓取的URL,运行结束。");
        return;
    }
    URL = arr[++x];
    request(URL, callback);
}

/**
 * Response handler for the final pass: records a JSON report for the page
 * held in `url` under `js[url]`, adds its image counts to the `sum_*`
 * totals, then requests the next entry of `arr` or prints the grand totals
 * once every entry has been visited.
 *
 * @param {*} error - Request error, if any.
 * @param {{statusCode: number}} response - HTTP response.
 * @param {string} body - Page text.
 */
function reportQueuedPage(error, response, body) {
    var str;
    if (!error && response && response.statusCode === 200) {
        var images = countImageFiles(body);
        sum_png += images.png;
        sum_jpg += images.jpg;
        sum_gif += images.gif;
        sum_jpeg += images.jpeg;
        str = {
            "URL ": url,
            "层级 ": mark[url],
            ".png文件个数 ": images.png,
            ".jpg文件个数 ": images.jpg,
            ".gif文件个数 ": images.gif,
            ".jpeg文件个数 ": images.jpeg,
            "该页符合条件的文件总数 ": images.sum,
        };
    }
    else {
        str = {
            "URL ": url,
            "层级 ": mark[url],
            "index ": num,
            "error ": "URL错误。"
        };
    }
    js[url] = JSON.stringify(str);
    console.log("index:", num, ",URL(Key):", url, "\nMessage(Vaule):", js[url], "\n");

    if (num < arr.length - 1) {
        url = arr[++num];
        request(url, reportQueuedPage);
        return;
    }

    // Last queued URL handled: print the totals over all pages.
    console.log(JSON.stringify({
        "全部URL.png文件个数累计 ": sum_png,
        "全部URL.jpg文件个数累计 ": sum_jpg,
        "全部URL.gif文件个数累计 ": sum_gif,
        "全部URL.jpeg文件个数累计 ": sum_jpeg,
        "全部URL符合条件的文件数量累计 ": sum_png + sum_jpg + sum_gif + sum_jpeg
    }));
}

/**
 * Response handler for the crawl pass.
 *
 * On a successful response it queues the new site URLs found in `body`,
 * counts image-file references, prints a per-layer report and descends into
 * the first URL discovered on this page. After MAX_CRAWL_DEPTH layers it
 * starts the per-URL report pass over everything in `arr`. A page that
 * yields no new URL, or a failed request, makes the crawl try the next
 * queued URL instead; on the very first page either condition ends the run.
 *
 * Reads and updates the shared crawl state (`URL`, `url`, `x`, `count`,
 * `total`, `arr`, `mark`).
 *
 * @param {*} error - Request error, if any.
 * @param {{statusCode: number}} response - HTTP response.
 * @param {string} body - Page text.
 */
function callback(error, response, body) {
    if (error || !response || response.statusCode !== 200) {
        if (count === 0) {
            console.log("URL错误,运行结束。");
            return;
        }
        console.log("error:", "URL错误(index:", x, ",URL:", URL, ")");
        crawlNextQueuedUrl();
        return;
    }

    // "http://" links are collected before "https://" ones, as before.
    var cnt = collectSiteUrls(body, "http://") + collectSiteUrls(body, "https://");
    var images = countImageFiles(body);
    total += images.sum;
    var str = {
        "URL ": URL,
        "index ": x,
        ".png文件个数 ": images.png,
        ".jpg文件个数 ": images.jpg,
        ".gif文件个数 ": images.gif,
        ".jpeg文件个数 ": images.jpeg,
        "该层符合条件的文件总数 ": images.sum,
    };

    // Dead end before the last layer: stop on the start page, otherwise try
    // the next queued URL without counting this page as a layer.
    if (cnt === 0 && count < MAX_CRAWL_DEPTH - 1) {
        if (count === 0) {
            console.log(JSON.stringify(str));
            console.log("抓取不到合适的URL,运行结束。");
            return;
        }
        console.log("warn:该页(index:", x, ",URL:", URL, ")抓取不到合适的URL,先回上一层再继续往下走。");
        crawlNextQueuedUrl();
        return;
    }

    console.log("\n当前位置:第 ", count + 1, " 层");
    console.log(JSON.stringify(str));
    console.log("前", count + 1, "层符合条件的文件数量累计:", total);

    if (cnt === 0) {
        console.log("该页没有抓到合适的URL。");
    }
    else {
        console.log("数组存储该页(抓取的)URL的下标范围:", arr.length - cnt, " 到 ", arr.length - 1);
        console.log("第", count + 1, "层抓取到的URL:");
        for (var i = arr.length - cnt; i < arr.length; i++) {
            console.log("index:", i, ",URL:", arr[i]);
        }
    }

    count++;
    if (count === MAX_CRAWL_DEPTH) {
        // Depth limit reached: report on every queued URL, starting at `url`.
        console.log("\n\n数组元素(URL)对应的键值:");
        request(url, reportQueuedPage);
        return;
    }

    // Descend into the first URL discovered on this page.
    x = arr.length - cnt;
    URL = arr[x];
    console.log("去到下一个层\n");
    request(URL, callback);
}
// Start the crawl: fetch the initial URL and pass the response to callback.
request(URL, callback);

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值