Java 采集 CSDN 论坛源码

最新推荐文章于 2024-08-12 13:21:58 发布

娃娃鱼

最新推荐文章于 2024-08-12 13:21:58 发布

阅读量1.2k

点赞数

分类专栏： java 文章标签： java string regex list exception url

java 专栏收录该内容

6 篇文章 0 订阅

订阅专栏

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Scrapes CSDN forum pages: downloads a page and extracts titles, thread links,
 * post bodies, anchors, scripts and style sheets from the markup with regular
 * expressions.
 *
 * <p>Originally published by Java Classroom (www.javakt.com).
 * Created 2010-2-20 15:45:12.
 */
public class WebContent {

    /** Text that separates the search expression from its replacement in a replacement rule. */
    private static final String BJ_SEPARATOR = "NLLD76";

    // Patterns are compiled once; Pattern instances are immutable and safe to share.
    private static final Pattern TITLE_PATTERN =
            Pattern.compile("<title>.*?</title>", Pattern.CANON_EQ);
    private static final Pattern POST_PATTERN =
            Pattern.compile("msgfont.*?</div>", Pattern.MULTILINE);
    // Dots are escaped so that ".html" only matches a literal dot, and the body of the
    // URL is limited to characters that can appear unquoted in a URL so that a match
    // cannot run across attribute quotes or tags into unrelated text.
    private static final Pattern NEW_LINK_PATTERN =
            Pattern.compile("http://topic\\.csdn\\.net/u[^\\s\"'<>]*?\\.html", Pattern.MULTILINE);
    private static final Pattern HIS_LINK_PATTERN =
            Pattern.compile("http://topic\\.csdn\\.net[^\\s\"'<>]*?\\.html", Pattern.MULTILINE);
    // href value may be double-quoted, single-quoted or bare.
    private static final Pattern ANCHOR_PATTERN =
            Pattern.compile("<a\\s[^>]*href=(\"[^\"]*\"|'[^']*'|[^\\s>]*)[^>]*>.*?</a>",
                    Pattern.DOTALL);
    private static final Pattern SCRIPT_PATTERN =
            Pattern.compile("<script.*?</script>", Pattern.DOTALL);
    private static final Pattern STYLE_PATTERN =
            Pattern.compile("<style.*?</style>", Pattern.DOTALL);
    private static final Pattern TAG_PATTERN = Pattern.compile("<.*?>");

    /**
     * Replacement rules applied to every post body by {@link #getTiezi(String)}.
     * Each entry has the form {@code <regex>NLLD76<replacement>}; the defaults turn the
     * entities {@code &lt;}, {@code &gt;} and {@code &quot;} back into the characters
     * they stand for.
     */
    private static String[] bjs = {
        "&lt;" + BJ_SEPARATOR + "<",
        "&gt;" + BJ_SEPARATOR + ">",
        "&quot;" + BJ_SEPARATOR + "\"",
    };

    /**
     * Downloads the whole content of one web page.
     *
     * <p>The page is decoded as UTF-8 and read line by line; the line terminators are
     * not kept, so the result is a single line of text.
     *
     * @param htmlurl address of the page to read
     * @return the page content with line breaks removed
     * @throws Exception if the URL is malformed or the page cannot be read
     */
    public String getOneHtml(String htmlurl) throws Exception {
        URL url = new URL(htmlurl);
        StringBuilder page = new StringBuilder();
        // try-with-resources closes the stream even when reading fails half-way.
        try (BufferedReader in = new BufferedReader(
                new InputStreamReader(url.openStream(), "utf-8"))) {
            String line;
            while ((line = in.readLine()) != null) {
                page.append(line);
            }
        }
        return page.toString();
    }

    /**
     * Extracts the page title.
     *
     * @param s     page markup
     * @param isnew not used; kept so existing callers keep compiling
     * @return the text of every {@code <title>} element found, concatenated, with the
     *         tags removed; empty when there is no title
     */
    public String getTitle(String s, boolean isnew) {
        StringBuilder title = new StringBuilder();
        for (String match : findAll(TITLE_PATTERN, s)) {
            title.append(match);
        }
        return outTag(title.toString());
    }

    /**
     * Extracts the opening post and the replies of a (new-style) thread page.
     *
     * <p>Every span from {@code msgfont} up to the next {@code </div>} is treated as one
     * post. Line-break tags become CR LF, {@code &nbsp;} entities are dropped, remaining
     * tags are stripped and finally the replacement rules in the current table are
     * applied.
     *
     * @param s page markup
     * @return one plain-text string per post, in page order; empty when nothing matches
     */
    public String[] getTiezi(String s) {
        List<String> posts = new ArrayList<>();
        for (String raw : findAll(POST_PATTERN, s)) {
            String text = raw
                    .replace("msgfont\">", "")
                    .replace("<br />", "\r\n")
                    .replace("<br/>", "\r\n")
                    .replace("&nbsp;", "");
            // Tags must be stripped before entities are decoded, otherwise a decoded
            // "<" could be mistaken for the start of a tag.
            posts.add(replaceByBj(bjs, outTag(text)));
        }
        return posts.toArray(new String[0]);
    }

    /**
     * Applies a list of replacement rules to a string, in order.
     *
     * @param bjs   rules of the form {@code <regex>NLLD76<replacement>}; the left part is
     *              a regular expression and the right part a {@code replaceAll}
     *              replacement string, which may be empty
     * @param nrstr text to transform
     * @return the text after all rules have been applied
     * @throws IllegalArgumentException if a rule does not contain the separator
     */
    public static String replaceByBj(String[] bjs, String nrstr) {
        for (String rule : bjs) {
            // Limit -1 keeps a trailing empty string so "xNLLD76" means "delete x".
            String[] bjd = rule.split(BJ_SEPARATOR, -1);
            if (bjd.length < 2) {
                throw new IllegalArgumentException(
                        "Replacement rule \"" + rule + "\" does not contain the separator "
                                + BJ_SEPARATOR);
            }
            nrstr = nrstr.replaceAll(bjd[0], bjd[1]);
        }
        return nrstr;
    }

    /**
     * Returns the replacement rules currently in use.
     *
     * @return the live rule array (not a copy)
     */
    public static String[] getBjs() {
        return bjs;
    }

    /**
     * Replaces the rule table used by {@link #getTiezi(String)}.
     *
     * @param bjs new rules, each of the form {@code <regex>NLLD76<replacement>}
     */
    public static void setBjs(String[] bjs) {
        WebContent.bjs = bjs;
    }

    /**
     * Extracts thread links from a topic-list page.
     * Created 2010-2-20 16:42:08.
     *
     * @param s     page markup
     * @param isnew {@code true} to collect links under {@code http://topic.csdn.net/u};
     *              {@code false} to delegate to {@link #getCsdnHisLink(String)}
     * @return the matching URLs in page order; empty when nothing matches
     */
    public String[] getCsdnLink(String s, boolean isnew) {
        if (!isnew) {
            return getCsdnHisLink(s);
        }
        return findAll(NEW_LINK_PATTERN, s).toArray(new String[0]);
    }

    /**
     * Extracts every {@code http://topic.csdn.net...html} link from a page.
     *
     * @param s page markup
     * @return the matching URLs in page order; empty when nothing matches
     */
    public String[] getCsdnHisLink(String s) {
        return findAll(HIS_LINK_PATTERN, s).toArray(new String[0]);
    }

    /**
     * Extracts the anchor elements of a page.
     *
     * @param s page markup
     * @return every {@code <a ... href=...>...</a>} element, as markup, in page order
     */
    public List<String> getLink(String s) {
        return findAll(ANCHOR_PATTERN, s);
    }

    /**
     * Extracts the script elements of a page.
     *
     * @param s page markup
     * @return every {@code <script>...</script>} element, as markup, in page order
     */
    public List<String> getScript(String s) {
        return findAll(SCRIPT_PATTERN, s);
    }

    /**
     * Extracts the style elements of a page.
     *
     * @param s page markup
     * @return every {@code <style>...</style>} element, as markup, in page order
     */
    public List<String> getCSS(String s) {
        return findAll(STYLE_PATTERN, s);
    }

    /**
     * Removes markup tags.
     *
     * @param s text that may contain tags
     * @return the text with every {@code <...>} span on a single line removed
     */
    public String outTag(String s) {
        return TAG_PATTERN.matcher(s).replaceAll("");
    }

    /** Collects every match of {@code pattern} in {@code s}, in order of appearance. */
    private static List<String> findAll(Pattern pattern, String s) {
        List<String> matches = new ArrayList<>();
        Matcher matcher = pattern.matcher(s);
        while (matcher.find()) {
            matches.add(matcher.group());
        }
        return matches;
    }

    /**
     * Demo: downloads the first page of the Java forum's open-topic list and prints
     * every post body found on it.
     */
    public static void main(String[] args) {
        WebContent w = new WebContent();
        String url = "http://forum.csdn.net/PointForum/Forum/BFTopicList.aspx?Alias=Java&ListType=UnClosedList&page=1";
        try {
            String s = w.getOneHtml(url);
            String[] posts = w.getTiezi(s);
            for (int i = 0; i < posts.length; i++) {
                System.out.println(posts[i]);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}