Java中发送url请求,获取html内容,dom对象等

一、根据url请求返回状态码
/**
 * Sends a request to the given URL and reports the HTTP status code of the response.
 *
 * <p>This is a best-effort probe: every failure is reported as {@code 0} rather than
 * as an exception.
 *
 * @param url the address to probe
 * @return the HTTP status code of the response, or {@code 0} when the URL is malformed,
 *         does not use an HTTP(S) protocol, or the request fails for any reason
 */
private static int getResultHttpUrl(String url) {
    HttpURLConnection connection = null;
    try {
        java.net.URLConnection opened = new URL(url).openConnection();
        // Non-HTTP schemes (file:, jar:, ...) yield other connection types; casting them
        // blindly would raise ClassCastException instead of the documented 0.
        if (!(opened instanceof HttpURLConnection)) {
            return 0;
        }
        connection = (HttpURLConnection) opened;
        connection.connect();
        return connection.getResponseCode();
    } catch (Exception ignored) {
        // Deliberately swallowed: callers only distinguish "got a status code" from 0.
        return 0;
    } finally {
        // The response body is never read, so release the connection explicitly.
        if (connection != null) {
            connection.disconnect();
        }
    }
}

二、根据请求获取html代码,提取css路径和body内容(去除注释和script标签)
/**
 * Loads a template file from beneath the web application's real root path, caches its raw
 * HTML, and fills a {@link TemplateVO} with the stylesheet paths and the cleaned body markup.
 *
 * <p>When the loaded HTML is the empty string, an untouched {@code TemplateVO} is returned
 * and nothing is cached.
 *
 * @param request      current request; used only to resolve the real path of "/"
 * @param templatePath path of the template file, appended to the real path of "/"
 * @param templateId   suffix appended to {@code TEMPLATE_CACHE_KEY} to form the cache key
 * @return the populated template, or an empty one when no HTML was loaded
 * @throws Exception declared by the signature; which callee raises it is not visible here
 */
private  TemplateVO getTemplateValue(HttpServletRequest request,
        String templatePath,String templateId) throws Exception {
    TemplateVO result = new TemplateVO();
    String webRoot = request.getSession().getServletContext().getRealPath("/");
    String rawHtml = URLUtil.getValue(webRoot + templatePath);
    if ("".equals(rawHtml)) {
        return result;
    }

    // Cache the raw HTML under the template-specific key before any processing.
    memcacheDao.set(TEMPLATE_CACHE_KEY + templateId, rawHtml);

    Document dom = DocumentUtils.getHtmlDoc(rawHtml);

    // The first <link> supplies the main stylesheet path; it is read before the DOM is modified.
    Element firstLink = dom.getElementsByTag("link").get(0);
    String mainCssPath = firstLink.attr("href").replace("../../../", "/templateCenter/");

    // Strip href from every anchor so the extracted body carries no link targets.
    dom.getElementsByTag("a").removeAttr("href");

    // Collect every stylesheet href, rewritten by pathReplace (defined elsewhere).
    Elements linkTags = dom.getElementsByTag("link");
    for (int i = 0; i < linkTags.size(); i++) {
        String href = linkTags.get(i).attr("href");
        result.getCsss().add(pathReplace(templatePath, href));
    }
    result.setCssPath(mainCssPath);

    // Clean-up order matters: line breaks go first so the comment and script patterns,
    // which do not match across newlines, see each construct on a single line.
    String cleanedBody = pathReplace(templatePath, dom.body().html())
            .replaceAll("(\r\n|\r|\n|\n\r)", "")
            .replaceAll("<!--.*?-->", "")
            .replaceAll("\\s{2,10}|\t|\r|\n", " ")
            .replaceAll("<\\s*(script).*?>.*?</\\1>", "");
    result.setBody(cleanedBody);
    return result;
}


三、将html代码转为dom对象,并做相应的处理
/**
 * Parses an HTML string into a DOM document and logs how long the parse took.
 *
 * @param html the HTML text to parse
 * @return the parsed document, or {@code null} when parsing raised an exception
 */
private Document getHtmlDoc(String html) {
    final long startedAt = System.currentTimeMillis();
    try {
        return (Document) Jsoup.parse(html);
    } catch (Exception e) {
        e.printStackTrace();
        return null;
    } finally {
        // Runs on both the success and the failure path, so the elapsed time is always logged.
        LogMgr.writeSysInfoLog("Time is:"+ (System.currentTimeMillis() - startedAt) + "ms");
    }
}


四、抓取网页内容
/**
 * Fetches the page at the given URL and returns its text with garbled lines filtered out.
 *
 * <p>The response is read line by line. A line is kept when it contains a {@code <}
 * character, or when {@link #isMessyCode(String)} does not consider it garbled. Kept lines
 * are concatenated without any line separator.
 *
 * @param htmlUrl  URL of the page to fetch
 * @param htmlCode name of the charset used to decode the response
 * @return the filtered page content, or {@code null} when the URL is invalid, the charset
 *         is unsupported, or the request or read fails
 */
public static String getUrlToHtml(String htmlUrl, String htmlCode) {
    HttpURLConnection httpConn = null;
    BufferedReader bufReader = null;
    try {
        httpConn = (HttpURLConnection) new URL(htmlUrl).openConnection();
        bufReader = new BufferedReader(
                new InputStreamReader(httpConn.getInputStream(), htmlCode));
        StringBuilder contentBuf = new StringBuilder();
        String line;
        while ((line = bufReader.readLine()) != null) {
            // Lines carrying markup are always kept; plain-text lines only when not garbled.
            if (line.indexOf('<') >= 0 || !isMessyCode(line)) {
                contentBuf.append(line);
            }
        }
        return contentBuf.toString();
    } catch (Exception e) {
        // Any failure is reported to the caller as null.
        return null;
    } finally {
        if (bufReader != null) {
            try {
                bufReader.close();
            } catch (IOException ignored) {
                // A failed close must not discard content that was already read successfully.
            }
        }
        // Also releases the stream when constructing the reader failed after it was opened.
        if (httpConn != null) {
            httpConn.disconnect();
        }
    }
}

/** Matches each run of whitespace; compiled once instead of on every call. */
private static final Pattern MESSY_CODE_WHITESPACE = Pattern.compile("\\s+");

/** Matches a single Unicode punctuation character; compiled once instead of on every call. */
private static final Pattern MESSY_CODE_PUNCTUATION = Pattern.compile("\\p{P}");

/**
 * Decides whether a string looks like garbled (mis-decoded) text.
 *
 * <p>Whitespace and punctuation are removed first. Of the remaining characters, those that
 * are neither letters/digits nor accepted by {@link #isChinese(char)} are counted; the
 * string is considered garbled when they make up more than 0.4 of the remainder.
 *
 * @param strName the text to inspect; must not be {@code null}
 * @return {@code true} when the text is considered garbled; {@code false} otherwise,
 *         including when nothing remains after whitespace and punctuation are removed
 */
public static boolean isMessyCode(String strName) {
    String stripped = MESSY_CODE_WHITESPACE.matcher(strName).replaceAll("");
    stripped = MESSY_CODE_PUNCTUATION.matcher(stripped).replaceAll("").trim();

    int total = stripped.length();
    if (total == 0) {
        // Nothing left to judge; stated explicitly rather than relying on 0/0 being NaN.
        return false;
    }

    int suspicious = 0;
    for (int i = 0; i < total; i++) {
        char c = stripped.charAt(i);
        if (!Character.isLetterOrDigit(c) && !isChinese(c)) {
            suspicious++;
        }
    }

    // Single-precision division is kept so results at the 0.4 boundary match the
    // previous implementation.
    return (float) suspicious / total > 0.4;
}



/** Unicode blocks whose characters {@code isChinese} accepts. */
private static final Character.UnicodeBlock[] CHINESE_BLOCKS = {
    Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS,
    Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS,
    Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A,
    Character.UnicodeBlock.GENERAL_PUNCTUATION,
    Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION,
    Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS
};

/**
 * Tells whether a character belongs to one of the Unicode blocks treated as Chinese text:
 * CJK ideographs (unified, compatibility, extension A), general punctuation, CJK symbols
 * and punctuation, or half-width/full-width forms.
 *
 * @param c the character to classify
 * @return {@code true} when the character's Unicode block is one of the accepted blocks
 */
public static boolean isChinese(char c) {
    final Character.UnicodeBlock actual = Character.UnicodeBlock.of(c);
    for (Character.UnicodeBlock accepted : CHINESE_BLOCKS) {
        if (accepted == actual) {
            return true;
        }
    }
    return false;
}
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值