抓取CSDN指定用户的博文

HTTP请求类:

package com.blog.collection;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.util.List;
import java.util.Map;

public class HttpRequest {

    /** Charset assumed when a response declares none; also used to encode POST bodies. */
    private static final Charset DEFAULT_CHARSET = Charset.forName("UTF-8");

    /**
     * Sends a GET request to the given URL with a fixed set of browser-like headers.
     *
     * @param url
     *            the URL to request
     * @param param
     *            the query string, in the form name1=value1&name2=value2;
     *            when null or empty, the URL is requested without a query part
     * @return the response body with every line terminated by "\n"; if the
     *         request fails, whatever was read before the failure (possibly
     *         the empty string)
     */
    public static String sendGet(String url, String param) {
        StringBuilder result = new StringBuilder();
        try {
            // Only add the query separator when there is a query to send,
            // so a null/empty param no longer yields "url?null" or "url?".
            String urlNameString = (param == null || param.length() == 0)
                    ? url
                    : url + "?" + param;
            URLConnection connection = new URL(urlNameString).openConnection();
            // Common request headers.
            // NOTE(review): the leading tab in the two values below is preserved
            // from the original source; it looks accidental - confirm before removing.
            connection.setRequestProperty("accept", "*/*");
            connection.setRequestProperty("connection", "Keep-Alive");
            connection.setRequestProperty("Cache-Control", "public, no-store, max-age=60");
            connection.setRequestProperty("Content-Encoding", "\tgzip");
            connection.setRequestProperty("user-agent","\tMozilla/5.0 (Windows NT 6.1; rv:29.0) Gecko/20100101 Firefox/29.0");
            connection.connect();
            readBody(connection, result, true);
        } catch (Exception e) {
            System.out.println("发送GET请求出现异常!" + e);
            e.printStackTrace();
        }
        return result.toString();
    }

    /**
     * Requests the given URL as-is, without setting any request headers.
     *
     * @param url
     *            the complete URL to request
     * @return the response body with every line terminated by "\n"; if the
     *         request fails, whatever was read before the failure (possibly
     *         the empty string)
     */
    public static String send(String url){
        StringBuilder result = new StringBuilder();
        try {
            URLConnection connection = new URL(url).openConnection();
            connection.connect();
            readBody(connection, result, true);
        } catch (Exception e) {
            System.out.println("发送GET请求出现异常!" + e);
            e.printStackTrace();
        }
        return result.toString();
    }

    /**
     * Sends a POST request to the given URL.
     *
     * @param url
     *            the URL to post to
     * @param param
     *            the request body, in the form name1=value1&name2=value2;
     *            it is written UTF-8 encoded
     * @return the response body with its lines concatenated (line breaks are
     *         not kept); if the request fails, whatever was read before the
     *         failure (possibly the empty string)
     */
    public static String sendPost(String url, String param) {
        PrintWriter out = null;
        StringBuilder result = new StringBuilder();
        try {
            URLConnection conn = new URL(url).openConnection();
            // Common request headers.
            conn.setRequestProperty("accept", "*/*");
            conn.setRequestProperty("connection", "Keep-Alive");
            conn.setRequestProperty("user-agent",
                    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)");
            // Both flags must be enabled to write a body and read the reply.
            conn.setDoOutput(true);
            conn.setDoInput(true);
            // Encode the body explicitly instead of relying on the platform charset.
            out = new PrintWriter(new OutputStreamWriter(conn.getOutputStream(), DEFAULT_CHARSET));
            out.print(param);
            out.flush();
            readBody(conn, result, false);
        } catch (Exception e) {
            System.out.println("发送 POST 请求出现异常!"+e);
            e.printStackTrace();
        } finally {
            if (out != null) {
                out.close();
            }
        }
        return result.toString();
    }

    /**
     * Reads the response of a connection line by line and appends it to sink.
     * Text already appended stays in sink when reading fails part-way, so
     * callers can still return the partial body.
     *
     * @param connection
     *            the connection to read from
     * @param sink
     *            receives the decoded text
     * @param keepLineBreaks
     *            whether a "\n" is appended after each line
     * @throws IOException
     *             if the response stream cannot be opened or read
     */
    private static void readBody(URLConnection connection, StringBuilder sink, boolean keepLineBreaks)
            throws IOException {
        // Open the stream before inspecting headers so that a failed request
        // surfaces as an exception here.
        InputStream body = connection.getInputStream();
        BufferedReader reader = new BufferedReader(
                new InputStreamReader(body, responseCharset(connection)));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                sink.append(line);
                if (keepLineBreaks) {
                    sink.append('\n');
                }
            }
        } finally {
            try {
                reader.close();
            } catch (IOException closeFailure) {
                // A failed close must not discard the text that was already read.
                closeFailure.printStackTrace();
            }
        }
    }

    /**
     * Picks the charset named by the "charset=" parameter of the response
     * content type, falling back to UTF-8 when it is absent or not supported.
     *
     * @param connection
     *            the connection whose content type is inspected
     * @return the charset to decode the response with, never null
     */
    private static Charset responseCharset(URLConnection connection) {
        String contentType = connection.getContentType();
        if (contentType == null) {
            return DEFAULT_CHARSET;
        }
        String[] parts = contentType.split(";");
        // parts[0] is the media type itself; parameters follow it.
        for (int i = 1; i < parts.length; i++) {
            String part = parts[i].trim();
            if (part.regionMatches(true, 0, "charset=", 0, 8)) {
                String name = part.substring(8).trim();
                if (name.length() >= 2 && name.charAt(0) == '"'
                        && name.charAt(name.length() - 1) == '"') {
                    name = name.substring(1, name.length() - 1);
                }
                try {
                    return Charset.forName(name);
                } catch (IllegalArgumentException unknownCharset) {
                    // Illegal or unsupported charset name: use the default.
                    return DEFAULT_CHARSET;
                }
            }
        }
        return DEFAULT_CHARSET;
    }
}

博文处理类:

package com.blog.collection;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.blog.model.Blog;

/**
 * Collects the articles of one blog.csdn.net user: reads the paged article
 * list, gathers the article links, then extracts title, content and category
 * from every article page and hands the result to the configured Progress.
 */
public class CollectionHandler {

	/** Prefix put in front of the relative article links found in list pages. */
	private static final String BLOG_HOST = "http://blog.csdn.net";

	// The patterns are constant, so they are compiled once instead of on
	// every page and every article.

	/** Total page count shown in the pager of the article list. */
	private static final Pattern PAGE_COUNT = Pattern.compile(
			"(?<=<div[\\s\\S]{0,10}id=\"papelist\"[\\s\\S]{0,10}class=\"pagelist\">[\\s\\S]{1,100}共)\\d+(?=页</span>)");

	/** Relative link of an article inside a list page. */
	private static final Pattern ARTICLE_LINK = Pattern.compile(
			"(?<=<span[\\s\\S]{0,10}class=\"link_title\"><a[\\s\\S]{0,10}\")[\\s\\S]*?(?=\">)");

	/** Title text inside an article page. */
	private static final Pattern ARTICLE_TITLE = Pattern.compile(
			"(?<=<span class=\"link_title\"><a[\\s\\S]{0,1000}?>)[\\s\\S]*?(?=</a></span>)");

	/** Markup of the article body inside an article page. */
	private static final Pattern ARTICLE_CONTENT = Pattern.compile(
			"(?<=<div[\\s\\S]{0,100}id=\"article_content\"[\\s\\S]{0,100}class=\"article_content\">)[\\s\\S]*?(?=</div>[\\s\\S]{0,100}<!--)");

	/** Category link text inside an article page. */
	private static final Pattern ARTICLE_CATEGORY = Pattern.compile(
			"(?<=<span[\\s\\S]{0,100}class=\"link_categories\">[\\s\\S]{0,1000}<a[\\s\\S]{0,200}?>)[\\s\\S]*?(?=</a>)");

	/** Optional callback that receives every processed article. */
	private Progress progress;

	public void setProgress(Progress progress) {
		this.progress = progress;
	}

	public Progress getProgress() {
		return progress;
	}

	/**
	 * Collects and processes every article of the given user.
	 *
	 * @param user the user name as it appears in the blog URL
	 */
	public void go(String user){
		System.out.println("加载中...");
		String listBase = BLOG_HOST + "/" + user + "/article/list/";
		String firstPage = HttpRequest.sendGet(listBase + "1", "");
		// Page count as shown by the summary view; missing pager means one page only.
		String count = firstMatch(PAGE_COUNT, firstPage);
		int pageCount = count.equals("") ? 0 : Integer.parseInt(count);

		List<String> urls = new ArrayList<String>();
		// The first page is already loaded, so it is parsed without a second request.
		getUrls(firstPage, urls, null);
		for (int page = 2; page <= pageCount; page++) {
			getUrls(null, urls, listBase + page);
		}

		System.out.println("数量:"+urls.size());
		for (String articleUrl : urls) {
			System.out.println(articleUrl);
			handler(articleUrl);
		}
		System.out.println("处理完成");
	}

	/**
	 * Appends the absolute URL of every article linked from a list page to urls.
	 *
	 * @param text the list page markup, or null to download it from url
	 * @param urls receives the article URLs
	 * @param url  the list page address; only used when text is null
	 */
	public void getUrls(String text,List<String> urls,String url){
		String content = (text == null) ? HttpRequest.sendGet(url, "") : text;
		Matcher links = ARTICLE_LINK.matcher(content);
		while (links.find()) {
			urls.add(BLOG_HOST + links.group());
		}
	}

	/**
	 * Downloads one article page, extracts its title, content and category
	 * into a Blog and, when a Progress is set, passes the result to it.
	 *
	 * @param url the article URL
	 */
	public void handler(String url){
		Blog blog = new Blog();
		String content = HttpRequest.sendGet(url, "");

		// Title, with line breaks and spaces removed.
		String title = firstMatch(ARTICLE_TITLE, content).replace("\n", "").replace(" ", "");
		System.out.println("标题");
		System.out.println(title);
		blog.setTitle(title);

		// Article body.
		System.out.println("博文");
		String text = firstMatch(ARTICLE_CONTENT, content);
		blog.setContent(text);

		// Category; stored as the blog's tags.
		System.out.println("分类");
		String type = firstMatch(ARTICLE_CATEGORY, content);
		blog.setTags(type);
		System.out.println(type);

		if (this.progress != null) {
			progress.handler(blog, type);
		}
	}

	/**
	 * Returns the first match of a regular expression in the given text.
	 *
	 * @param content the text to search
	 * @param regex   the regular expression
	 * @return the first matching substring, or "" when there is no match
	 */
	public String matcher(String content,String regex){
		return firstMatch(Pattern.compile(regex), content);
	}

	/** Returns the first match of a compiled pattern in content, or "" when there is none. */
	private static String firstMatch(Pattern pattern, String content) {
		Matcher found = pattern.matcher(content);
		return found.find() ? found.group(0) : "";
	}
}


  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值