Node.js + jsdom 小爬虫(并不是框架 >.<)
用来解决某些个人问题。
利用了 DFS(深度优先)搜索:
"use strict";
let https=require('https');
let fs=require('fs');
let path=require('path');
let jsdom=require('jsdom');
// URLs the crawler has already seen; isUrlVisited() reads and appends to it.
let visted=[];
/**
 * Decide whether a URL path names a directory and, if so, try to create it.
 *
 * A path containing a '.' is treated as a file. Any other path is treated as
 * a directory: a local directory with that exact path is created, and a
 * failure to create it is logged but does not change the result.
 *
 * @param {string} url Path to classify; also used as the local directory path.
 * @returns {boolean} false when the path contains a '.', true otherwise.
 */
function isDir(url){
    const looksLikeFile=url.includes('.');
    if(!looksLikeFile){
        try{
            fs.mkdirSync(url);
            console.log('mkdir:'+url);
        }catch(ioError){
            // Best effort: the failure is reported, the path still counts as a directory.
            console.log(`IOerror:${ioError}`);
        }
    }
    return !looksLikeFile;
}
/**
 * Report whether a URL was seen before, recording it when it was not.
 *
 * @param {string} url URL to look up in the visited list.
 * @returns {boolean} true if the URL was already recorded; false if this call
 *     just recorded it.
 */
function isUrlVisited(url){
    if(visted.indexOf(url)!==-1){
        return true;
    }
    // First sighting: remember it so the next lookup reports it as visited.
    visted.push(url);
    return false;
}
/**
 * Start downloading `url` to disk, unless it is a directory that has already
 * been recorded as visited.
 *
 * @param {string} url Site-relative path of the resource to download.
 */
function writeFile(url){
    // isDir() is evaluated first, so isUrlVisited() (which records the URL)
    // only runs for directory-like paths.
    // NOTE(review): a directory-like path that was NOT visited yet falls through
    // to the download below; confirm that is intended.
    if(isDir(url)&&isUrlVisited(url)){
        return;
    }
    try{
        urltoFile(url);
    }catch(error){
        // Only failures thrown synchronously while starting the request land here.
        console.log(error);
    }
}
/**
 * Download `root + url` and store the response body at the local path `url`.
 *
 * Fire-and-forget: the function returns as soon as the request is sent.
 * Request, response and write failures are logged, never thrown.
 *
 * @param {string} url Site-relative path; used verbatim as the local file path.
 */
function urltoFile(url){
    https.request(root+url,(res)=>{
        // Only a 200 carries the file itself. Saving an error page or a redirect
        // body under the file's name would leave garbage on disk.
        if(res.statusCode!==200){
            console.log(`skip:${url} status:${res.statusCode}`);
            res.resume(); // drain the body so the socket is released
            return;
        }
        // A write stream truncates an existing file, so running the crawler again
        // replaces an old download instead of appending a second copy to it.
        const out=fs.createWriteStream(url);
        out.on('error',(e)=>{
            // Disk-side failure: report it and stop reading the response.
            console.log(e);
            res.destroy();
        });
        res.on('error',(e)=>{
            // Network-side failure mid-body: report it and close the partial file.
            console.log(e);
            out.end();
        });
        res.pipe(out);
    }).on('error',(e)=>{
        console.log(e);
    }).end();
}
/**
 * Process one URL in three ordered steps: record it as visited, create its
 * local directory when the path has no '.', then hand it to writeFile.
 * Not called anywhere in the visible part of this file.
 *
 * @param {string} url Site-relative path to process.
 */
function dealUrl(url){
// Record the URL first; the return value is not needed here.
isUrlVisited(url);
// Called for its side effect only: creates the directory for dot-less paths.
isDir(url);
writeFile(url);
}
// Site origin; prepended to every site-relative path before it is requested.
let root='https://www.seryox.com';
/**
 * Crawl one page: load it with jsdom (with jQuery injected), then visit every
 * <a> whose href starts with "/pic" and has not been seen before.
 * Directory-like links are crawled recursively; all others go to writeFile.
 *
 * Failures are logged together with the page URL and never thrown.
 *
 * @param {string} url Absolute URL of the page to load.
 */
function applyUrl(url){
    jsdom.env({
        url: url,
        scripts: ["http://code.jquery.com/jquery.js"],
        done: function (err, window) {
            if(err){
                console.log(err+'@'+url);
            }
            // A failed load may deliver no window at all; there is nothing to scan then.
            if(!window){
                return;
            }
            console.log('done');
            try{
                const $=window.$;
                const arr=$('a');
                console.log(arr.length);
                for(let i=0;i<arr.length;i++){
                    const href=$(arr[i]).attr('href');
                    // Anchors without an href yield undefined; skip them so one
                    // such anchor does not abort the scan of the remaining links.
                    if(typeof href!=='string'){
                        continue;
                    }
                    if(href.match(/^\/pic/)&&!isUrlVisited(href)){
                        console.log(href);
                        if(isDir(href)){
                            console.log('dir:'+url+href);
                            applyUrl(root+href);
                        }else{
                            console.log('file:'+url+href);
                            writeFile(href);
                        }
                    }
                }
            }catch(e){
                console.log(e+'@'+url);
            }finally{
                // Every href has been copied out as a string by now, so the
                // window can be released instead of piling up one per page.
                if(typeof window.close==='function'){
                    window.close();
                }
            }
        }
    });
}
// Entry point: start the crawl at the site's front page.
applyUrl(root+'/');
生成的文件夹名称是 UTF-8 的 URL 编码(百分号编码),
用下面这个脚本可以把名称解码还原:
"use strict";
let fs=require('fs');
// Folder whose entries get their names decoded; passed to main() below.
let root='/pic';
/**
 * Guess from the name alone whether an entry is a directory:
 * a name without any '.' counts as one.
 *
 * @param {string} url Entry name to inspect.
 * @returns {boolean} true when the name contains no '.', false otherwise.
 */
function isDir(url){
    return !url.includes('.');
}
/**
 * Rename one directory entry from its percent-encoded name to the decoded one.
 *
 * Entries whose name has no '%', decodes to itself, or cannot be decoded are
 * left untouched.
 *
 * @param {string} path Directory that contains the entry.
 * @param {string} name Current (possibly percent-encoded) entry name.
 */
function rename(path,name){
    // No '%' means there is nothing to decode, so no rename is needed.
    if(name.indexOf('%')===-1){
        return;
    }
    let decoded;
    try{
        decoded=decodeURI(name);
    }catch(e){
        // decodeURI throws on malformed sequences (e.g. a lone '%'); one odd
        // name should not abort the whole pass.
        console.log('skip '+path+'/'+name+': '+e);
        return;
    }
    // decodeURI leaves reserved escapes such as %2F alone, so the name may be unchanged.
    if(decoded===name){
        return;
    }
    console.log(path+'/'+name+' to '+path+'/'+decoded);
    fs.renameSync(path+'/'+name,path+'/'+decoded);
}
/**
 * Walk the tree under `path` depth-first and pass every entry to rename().
 *
 * A directory's contents are processed before the directory itself is renamed,
 * so the paths used during the descent stay valid.
 *
 * @param {string} path Directory to process.
 */
function main(path){
    const entries=fs.readdirSync(path);
    for(const name of entries){
        // Ask the filesystem instead of guessing from the name: an extension-less
        // file is not a directory, and a directory may have a '.' in its name.
        if(fs.statSync(path+'/'+name).isDirectory()){
            main(path+'/'+name);
        }
        rename(path,name);
    }
}
// Entry point: run the rename pass over the whole tree under root.
main(root);