import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Downloads a web page and extracts pieces of it (title, forum posts, links,
 * scripts, style sheets) using regular expressions.
 *
 * <p>Originally from Java Classroom (www.javakt.com), a paid Q&amp;A site.
 *
 * <p>Created 2010-02-20 15:45:12.
 */
public class WebContent {

    /**
     * Separator between the text to be replaced and its replacement inside a
     * replacement rule, e.g. {@code "&lt;FGF<"}.
     */
    private static final String RULE_SEPARATOR = "FGF";

    /**
     * Separator that {@link #replaceByBj(String[], String)} historically split
     * on. It is still recognised (and checked first) so that callers passing
     * rules in that form keep working.
     */
    private static final String LEGACY_RULE_SEPARATOR = "NLLD76";

    // Patterns are compiled once; Pattern instances are immutable and reusable.
    // The flags are the ones the original per-call compilations used.
    private static final Pattern TITLE_PATTERN =
            Pattern.compile("<title>.*?</title>", Pattern.CANON_EQ);
    private static final Pattern POST_PATTERN =
            Pattern.compile("msgfont.*?</div>", Pattern.MULTILINE);
    // Dots are escaped so that they only match a literal '.' in the URL.
    private static final Pattern NEW_LINK_PATTERN =
            Pattern.compile("http://topic\\.csdn\\.net/u.*?\\.html", Pattern.MULTILINE);
    private static final Pattern HISTORY_LINK_PATTERN =
            Pattern.compile("http://topic\\.csdn\\.net.*?\\.html", Pattern.MULTILINE);
    // NOTE(review): the original expression was truncated to "<a[^>]*href=</a>",
    // which cannot match a real anchor. This matches a whole <a ... href=...>...</a>
    // element with a double-quoted, single-quoted or unquoted href value.
    private static final Pattern ANCHOR_PATTERN = Pattern.compile(
            "<a[^>]*href=(\"([^\"]*)\"|'([^']*)'|([^\\s>]*))[^>]*>(.*?)</a>",
            Pattern.DOTALL);
    private static final Pattern SCRIPT_PATTERN =
            Pattern.compile("<script.*?</script>", Pattern.DOTALL);
    private static final Pattern STYLE_PATTERN =
            Pattern.compile("<style.*?</style>", Pattern.DOTALL);

    /**
     * Replacement rules applied to extracted post text. Each entry has the form
     * {@code <regex-to-replace>FGF<replacement>}.
     *
     * <p>NOTE(review): the literals in the original source were corrupted (one of
     * them was not even a valid string literal). They are reconstructed here as
     * HTML-entity decodings for {@code <}, {@code >} and {@code "}; confirm
     * against the intended rule set.
     */
    private static String[] bjs = {"&lt;FGF<", "&gt;FGF>", "&quot;FGF\""};

    /**
     * Reads the entire content of a web page.
     *
     * <p>The response is decoded as UTF-8 and its lines are concatenated
     * without line terminators, so the result is a single line of text.
     *
     * @param htmlurl the URL of the page to read
     * @return the page content with all line breaks removed
     * @throws Exception if the URL is malformed or the page cannot be read
     */
    public String getOneHtml(String htmlurl) throws Exception {
        URL url = new URL(htmlurl);
        StringBuffer content = new StringBuffer();
        BufferedReader in = new BufferedReader(
                new InputStreamReader(url.openStream(), "utf-8"));
        try {
            String line;
            while ((line = in.readLine()) != null) {
                content.append(line);
            }
        } finally {
            // Close the connection even when reading fails part-way through.
            in.close();
        }
        return content.toString();
    }

    /**
     * Extracts the page title.
     *
     * @param s the HTML text to search
     * @param isnew unused; kept so existing callers continue to compile
     * @return the text of every {@code <title>...</title>} element found,
     *         concatenated, with the tags removed; empty if there is none
     */
    public String getTitle(String s, boolean isnew) {
        List titles = findAll(TITLE_PATTERN, s);
        StringBuffer joined = new StringBuffer();
        for (int i = 0; i < titles.size(); i++) {
            joined.append((String) titles.get(i));
        }
        return outTag(joined.toString());
    }

    /**
     * Extracts the body and the replies of a (new-style) forum thread.
     *
     * <p>Every span from {@code msgfont} up to the next {@code </div>} is taken
     * as one post. {@code <br />} and {@code <br/>} become CR LF, remaining tags
     * are stripped and the current replacement rules are applied.
     *
     * @param s the HTML text of the thread page
     * @return one cleaned-up string per post, in document order
     */
    public String[] getTiezi(String s) {
        List posts = findAll(POST_PATTERN, s);
        String[] result = new String[posts.size()];
        for (int i = 0; i < result.length; i++) {
            String post = (String) posts.get(i);
            // NOTE(review): the last pattern was a bare " " in the corrupted
            // source, which would delete every space; "&nbsp;" is the presumed
            // original - confirm.
            post = post.replaceAll("msgfont\">", "")
                    .replaceAll("<br />", "\r\n")
                    .replaceAll("<br/>", "\r\n")
                    .replaceAll("&nbsp;", "");
            result[i] = replaceByBj(bjs, outTag(post));
        }
        return result;
    }

    /**
     * Applies a list of replacement rules to a string.
     *
     * <p>Each rule is {@code <regex><separator><replacement>}, where the
     * separator is {@code NLLD76} or, if that is absent, {@code FGF}. The part
     * before the separator is used as a regular expression and the part after
     * it as the replacement, exactly as in {@link String#replaceAll}. Rules
     * that are {@code null}, have no separator, or have an empty left-hand side
     * are skipped.
     *
     * @param bjs the replacement rules; may be {@code null}
     * @param nrstr the text to process; may be {@code null}
     * @return the text after all rules have been applied in order
     */
    public static String replaceByBj(String[] bjs, String nrstr) {
        if (bjs == null || nrstr == null) {
            return nrstr;
        }
        for (int i = 0; i < bjs.length; i++) {
            String rule = bjs[i];
            if (rule == null) {
                continue;
            }
            String separator = LEGACY_RULE_SEPARATOR;
            int at = rule.indexOf(separator);
            if (at < 0) {
                separator = RULE_SEPARATOR;
                at = rule.indexOf(separator);
            }
            if (at <= 0) {
                continue; // malformed rule: nothing sensible to search for
            }
            int from = at + separator.length();
            // Anything after a second separator is ignored, as split() did.
            int to = rule.indexOf(separator, from);
            String replacement = to < 0 ? rule.substring(from) : rule.substring(from, to);
            nrstr = nrstr.replaceAll(rule.substring(0, at), replacement);
        }
        return nrstr;
    }

    /**
     * Returns the replacement rules used by {@link #getTiezi(String)}.
     *
     * @return the shared rule array itself, not a copy
     */
    public static String[] getBjs() {
        return bjs;
    }

    /**
     * Replaces the rules used by {@link #getTiezi(String)} for all instances.
     *
     * @param bjs the new rules, in the format described at
     *        {@link #replaceByBj(String[], String)}
     */
    public static void setBjs(String[] bjs) {
        WebContent.bjs = bjs;
    }

    /**
     * Extracts CSDN topic links.
     *
     * <p>Created 2010-02-20 16:42:08.
     *
     * @param s the HTML text to search
     * @param isnew {@code true} to collect only {@code http://topic.csdn.net/u...html}
     *        links; {@code false} to delegate to {@link #getCsdnHisLink(String)}
     * @return the matching URLs in document order
     */
    public String[] getCsdnLink(String s, boolean isnew) {
        if (!isnew) {
            return getCsdnHisLink(s);
        }
        return toStringArray(findAll(NEW_LINK_PATTERN, s));
    }

    /**
     * Extracts every {@code http://topic.csdn.net...html} link.
     *
     * @param s the HTML text to search
     * @return the matching URLs in document order
     */
    public String[] getCsdnHisLink(String s) {
        return toStringArray(findAll(HISTORY_LINK_PATTERN, s));
    }

    /**
     * Extracts hyperlinks.
     *
     * @param s the HTML text to search
     * @return a list of {@code String}s, each a complete
     *         {@code <a ... href=...>...</a>} element
     */
    public List getLink(String s) {
        return findAll(ANCHOR_PATTERN, s);
    }

    /**
     * Extracts script blocks.
     *
     * @param s the HTML text to search
     * @return a list of {@code String}s, each running from {@code <script} to
     *         the next {@code </script>}
     */
    public List getScript(String s) {
        return findAll(SCRIPT_PATTERN, s);
    }

    /**
     * Extracts CSS blocks.
     *
     * @param s the HTML text to search
     * @return a list of {@code String}s, each running from {@code <style} to
     *         the next {@code </style>}
     */
    public List getCSS(String s) {
        return findAll(STYLE_PATTERN, s);
    }

    /**
     * Removes markup tags.
     *
     * @param s the text to clean
     * @return {@code s} with every {@code <...>} span removed
     */
    public String outTag(String s) {
        return s.replaceAll("<.*?>", "");
    }

    /** Collects every match of {@code pattern} in {@code s}, in order, as Strings. */
    private static List findAll(Pattern pattern, String s) {
        // Raw List is used because the public methods of this class return raw List.
        List matches = new ArrayList();
        Matcher matcher = pattern.matcher(s);
        while (matcher.find()) {
            matches.add(matcher.group());
        }
        return matches;
    }

    /** Copies a list that contains only Strings into a String array. */
    private static String[] toStringArray(List strings) {
        return (String[]) strings.toArray(new String[strings.size()]);
    }

    /**
     * Demo: downloads a CSDN forum listing page and prints each extracted post.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        WebContent w = new WebContent();
        String url = "http://forum.csdn.net/PointForum/Forum/BFTopicList.aspx?Alias=Java&ListType=UnClosedList&page=1";
        try {
            String s = w.getOneHtml(url);
            String[] title2 = w.getTiezi(s);
            for (int i = 0; i < title2.length; i++) {
                System.out.println(title2[i]);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}