导出csdn博客

最新推荐文章于 2024-02-04 21:11:10 发布

hxpjava1

最新推荐文章于 2024-02-04 21:11:10 发布

阅读量1k

点赞数

分类专栏：爬虫

本文链接：https://blog.csdn.net/hxpjava1/article/details/77850772

版权

爬虫专栏收录该内容

2 篇文章 0 订阅

订阅专栏

package com.mark.demo.crawler;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/*
*hxp(hxpwangyi@126.com)
*2017年9月5日
*
*/
/**
 * Exports the articles of one CSDN blog as local HTML files.
 *
 * <p>The crawler reads the blog's index page to find out how many list pages
 * exist, collects the article links (elements with class {@code link_title})
 * from every list page, and then downloads each article's raw bytes into
 * {@code saveDir}, naming the file after the page title.
 *
 * <p>All state is kept in static fields; the class is not designed for
 * concurrent use.
 */
public class CSDNCrawler {
	private static String baseUrl="http://blog.csdn.net";
	private static String blogUrl="http://blog.csdn.net/hxpjava1";
	private static List<String> articleUrls=new ArrayList<String>();
	// Defaults to a single list page so getArticleUrls() is usable even when
	// main() has not computed the real page count yet.
	private static int totalIndexPageNum=1;
	private static String saveDir="d://tmp/";
	
	/**
	 * Entry point: determines the number of list pages, collects all article
	 * links and saves every article to disk.
	 *
	 * @param args unused
	 * @throws IOException if the blog index page or a list page cannot be fetched
	 */
	public static void main(String[] args) throws IOException {
		// NOTE(review): pages are requested with POST although they are plain
		// page fetches; kept as in the original - confirm whether GET works too.
		Document doc=Jsoup.connect(blogUrl).userAgent("Mozilla").post();
		totalIndexPageNum=readTotalIndexPageNum(doc);
		
		getArticleUrls();
		
		savePages();
	}
	
	/**
	 * Reads the number of list pages from the last link inside the element
	 * with id {@code papelist}. Falls back to 1 when that element is missing,
	 * has no links, or the last link does not end in a number.
	 */
	private static int readTotalIndexPageNum(Document indexPage){
		Element pager=indexPage.getElementById("papelist");
		if(pager==null){
			return 1;
		}
		Elements links=pager.getElementsByTag("a");
		if(links.isEmpty()){
			return 1;
		}
		String lastPageUrl=links.get(links.size()-1).attr("href");
		String[] urlSplited=lastPageUrl.split("/");
		// split() drops trailing empty strings, so "/" yields an empty array.
		if(urlSplited.length==0){
			System.err.println("Cannot read page count from pager link '"+lastPageUrl+"', assuming 1 page");
			return 1;
		}
		try{
			return Integer.parseInt(urlSplited[urlSplited.length-1]);
		}catch(NumberFormatException e){
			System.err.println("Cannot read page count from pager link '"+lastPageUrl+"', assuming 1 page");
			return 1;
		}
	}
	
	/** Prints every collected article link to standard output, one per line. */
	public static void printUrls(){
		for(int i=0;i<articleUrls.size();i++){
			System.out.println(articleUrls.get(i));
		}
	}
	
	/**
	 * Walks list pages 1..totalIndexPageNum and appends the {@code href} of the
	 * first anchor of every {@code link_title} element to {@code articleUrls}.
	 * Entries without an anchor are skipped.
	 *
	 * @throws IOException if a list page cannot be fetched
	 */
	public  static  void getArticleUrls() throws IOException{
		for(int i=1;i<=totalIndexPageNum;i++){
			Document doc=Jsoup.connect(baseUrl+"/hxpjava1/article/list/"+i).userAgent("Mozilla").post();
			Elements elements=doc.getElementsByClass("link_title");
			for(int j=0;j<elements.size();j++){
				Elements es=elements.get(j).getElementsByTag("a");
				if(es.isEmpty()){
					continue;
				}
				articleUrls.add(es.get(0).attr("href"));
			}
		}
	}
	
	/**
	 * Downloads every collected article into {@code saveDir} as
	 * {@code <page title>.html}. A failure on one article is reported on
	 * standard error and does not stop the remaining downloads.
	 */
	public static void savePages(){
		File dir=new File(saveDir);
		if(!dir.isDirectory() && !dir.mkdirs()){
			System.err.println("Cannot create output directory "+dir);
			return;
		}
		for(int i=0;i<articleUrls.size();i++){
			String pageUrl=toAbsoluteUrl(articleUrls.get(i));
			try{
				// The page is parsed once only to obtain its title for the file name;
				// the saved content is the raw byte stream fetched in download().
				Document doc=Jsoup.connect(pageUrl).post();
				File dest=new File(dir, toFileName(doc.title(), i)+".html");
				download(pageUrl, dest);
			}catch(Exception e){
				// Best effort per article: report and continue with the next one.
				System.err.println("Failed to save "+pageUrl);
				e.printStackTrace();
			}
		}
	}
	
	/** Returns {@code href} unchanged when it is already absolute, otherwise prefixes {@code baseUrl}. */
	private static String toAbsoluteUrl(String href){
		if(href.startsWith("http://") || href.startsWith("https://")){
			return href;
		}
		return baseUrl+href;
	}
	
	/**
	 * Turns a page title into a usable file name by replacing path separators,
	 * characters Windows forbids in file names, and control characters with
	 * {@code _}. An empty result falls back to {@code article-<n>}.
	 */
	private static String toFileName(String title, int index){
		String name=title==null ? "" : title.replaceAll("[\\\\/:*?\"<>|\\p{Cntrl}]", "_").trim();
		if(name.length()==0){
			name="article-"+(index+1);
		}
		return name;
	}
	
	/**
	 * Copies the bytes served at {@code pageUrl} into {@code dest}. Both streams
	 * are closed whether or not the copy succeeds.
	 */
	private static void download(String pageUrl, File dest) throws IOException{
		BufferedInputStream bis=null;
		BufferedOutputStream bos=null;
		try{
			// Open the remote stream first so a failed request leaves no empty file behind.
			bis=new BufferedInputStream(new URL(pageUrl).openStream());
			bos=new BufferedOutputStream(new FileOutputStream(dest));
			
			byte[] bytes=new byte[1024*20];
			int length;
			while((length=bis.read(bytes, 0, bytes.length))!=-1){
				bos.write(bytes, 0, length);
			}
			// Flush inside the try so a write error surfaces to the caller
			// instead of being swallowed by the quiet close below.
			bos.flush();
		}finally{
			closeQuietly(bos);
			closeQuietly(bis);
		}
	}
	
	/** Closes {@code resource} if it is non-null, reporting (not propagating) any close failure. */
	private static void closeQuietly(java.io.Closeable resource){
		if(resource==null){
			return;
		}
		try{
			resource.close();
		}catch(IOException e){
			e.printStackTrace();
		}
	}
}

由于部分 CSS 文件的请求会返回 403，导出的页面样式会有些问题。

解决方法是手动把这些 CSS、JS 文件下载到本地，再用程序把页面中对应的 URL 改为本地路径。

demo地址：https://github.com/13567436138/export-csdn-blog.git

hxpjava1

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
打赏
0
评论
导出csdn博客

package com.mark.demo.crawler;import java.io.BufferedInputStream;import java.io.BufferedOutputStream;import java.io.File;import java.io.FileOutputStream;import java.io.IOException;import java.i
复制链接

扫一扫