工作中遇到一个新的需求:根据提供的一个接口,采集访问到的页面内容,并封装到自己的页面中显示。当时根本没接触过这些内容,一窍不通,只能上网查找解决办法。找来找去,的确找到了解决办法,也解决了问题,但是很多代码、很多方法都看不懂。虽然过程中也上网查过相关代码的含义,但是时间久了也就忘了,所以抽空记录下来,方便以后用到的时候查看。
页面的代码就省略了。当时是把接口页面的内容抓取下来,做成一个静态页面来做测试的。
后台方法
package com.sudytech.webplus.web.magaz.action;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.URIException;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.methods.PostMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
import org.apache.commons.httpclient.util.URIUtil;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import com.sudytech.webplus.base.BaseAction;
import com.sudytech.webplus.web.magaz.action.vo.MagazDetailVO;
public class GetTextByUrlAction extends BaseAction{
private static MagazDetailVO magaz;
private static Log log = LogFactory.getLog(GetTextByUrlAction.class);
/**
* 执行一个HTTP GET请求,返回请求响应的HTML
*
* @param url 请求的URL地址
* @param queryString 请求的查询参数,可以为null
* @return 返回请求响应的HTML
*/
public static String doGet(String url, String queryString) {
String response = null;
HttpClient client = new HttpClient();
HttpMethod method = new GetMethod(url);
try {
if (StringUtils.isNotBlank(queryString)) {
method.setQueryString(URIUtil.encodeQuery(queryString));
client.executeMethod(method);
method.getParams().setContentCharset("utf-16");
if (method.getStatusCode() == HttpStatus.SC_OK) {
response = method.getResponseBodyAsString();
}
}
} catch (URIException e) {
log.error("执行HTTP Get请求时,编码查询字符串“" + queryString + "”发生异常!", e);
} catch (IOException e) {
log.error("执行HTTP Get请求" + url + "时,发生异常!", e);
} finally {
method.releaseConnection();
}
return response;
}
/**
* 执行一个HTTP POST请求,返回请求响应的HTML
*
* @param url 请求的URL地址
* @param params 请求的查询参数,可以为null
* @return 返回请求响应的HTML
*/
public static String doPost(String url, Map<String, String> params) {
String response = null;
HttpClient client = new HttpClient();
HttpMethod method = new PostMethod(url);
method.setRequestHeader("Content-type","text/html; charset=utf-16");
/*for (Iterator it = params.entrySet().iterator(); it.hasNext();) {
}*/
//设置Http Post数据
if (params != null) {
HttpMethodParams p = new HttpMethodParams();
for (Map.Entry<String, String> entry : params.entrySet()) {
p.setParameter(entry.getKey(), entry.getValue());
}
method.setParams(p);
}
try {
client.executeMethod(method);
method.getParams().setContentCharset("utf-16");
if (method.getStatusCode() == HttpStatus.SC_OK) {
response = method.getResponseBodyAsString();
}
} catch (IOException e) {
log.error("执行HTTP Post请求" + url + "时,发生异常!", e);
} finally {
method.releaseConnection();
}
return response;
}
/**
* 将页面的解析的内容封装到自己的页面中,显示出来
* http://172.16.1.88/kns55/detail.aspx?filename=NCDG901.013&dbname=CJFD1999
*/
public String getMagazDetail() {
magaz=new MagazDetailVO();
String urlStr="";
String url="";
String filename="";
String dbname="";
Map<String, String> map = new HashMap<String, String>();
//url根据参数获得 测试urlStr="http://172.16.1.88/kns55/detail.aspx?filename=NCDG901.013&dbname=CJFD1999";
urlStr=getRequest().getParameter("url");
if(urlStr!=null && !urlStr.equals("")){//下面是根据获得的URL来截取后面的参数FileName和dbname
url=urlStr.substring(0,urlStr.indexOf("?"));
filename=urlStr.substring(urlStr.indexOf("filename"), urlStr.indexOf("&"));
dbname=urlStr.substring(urlStr.indexOf("dbname"));
}
map.put("filename", filename);
map.put("dbname", dbname);
String html="";
String title="";
String remark="";
String downloadPDF="";
String downloadCAJ="";
html=doGet(urlStr,new String());
// html=doPost(url, map);//获得页面html
// 下面是doPost和doGet的方法,获得html
// html=doPost("http://localhost:8080/webplus/_web/magaz/2test.html?_p=YXM9MSZwPTEmbT1OJg__&act=1", new HashMap());
// html=doGet("http://localhost:8080/webplus/_web/magaz/2test.html?_p=YXM9MSZwPTEmbT1OJg__&act=1",new String());
if(html==null || html.equals("")){
System.out.println("========html is null========");
}
// 下面将根据获得的页面内容,截取想要的内容,进行封装
title=getTitle(html);
remark=getRemark(html);
downloadPDF=getdoanloadPDF(html);
downloadCAJ=getdoanloadCAJ(html);
magaz.setTitle(title);
magaz.setRemark(remark);
magaz.setDownloadCAJ(downloadCAJ);
magaz.setDownloadPDF(downloadPDF);
return SUCCESS;
}
/**
* 根据获取的页面html,抓取标题内容
*/
public static String getTitle(final String s){
String regex;
String title="";
StringBuffer sb=new StringBuffer();
regex="<title>.*?</title>";
final Pattern pt=Pattern.compile(regex, Pattern.CANON_EQ);
final Matcher ma=pt.matcher(s);
while(ma.find()){
sb.append(ma.group());
}
title=deleteTag(sb.toString());
title=title.replaceAll(" - 中国期刊全文数据库", "");//去除title里的标记
return title;
}
/**
* 根据获取的页面html,抓取摘要内容
*/
public static String getRemark(String s){
String regex;
String remark="";
List<String> list=new ArrayList<String>();
regex="<span id=\"ChDivSummary\" name=\"ChDivSummary\">(.*?)</span>";
Pattern pt=Pattern.compile(regex, Pattern.DOTALL);
Matcher ma=pt.matcher(s);
while(ma.find()){
list.add(ma.group());
}
for(int i=0;i<list.size();i++){
remark+=list.get(i);
}
remark=deleteTag(remark);
return remark;
}
/**
* 获得PDF下载链接
*/
public static String getdoanloadPDF(String s){
String regex;
String pdfUrl="";
StringBuffer sb=new StringBuffer();
regex="\"><a target=\"_blank\" href=\"
(.*?)dflag=pdfdown";
Pattern pt=Pattern.compile(regex, Pattern.DOTALL);
Matcher ma=pt.matcher(s);
while(ma.find()){
sb.append(ma.group());
}
pdfUrl=deleteTag(sb.toString());
return pdfUrl;
}
/**
* 获得CAJ下载链接
*/
public static String getdoanloadCAJ(String s){
String regex;
String cajUrl="";
StringBuffer sb=new StringBuffer();
regex="../download.aspx.*?<b>CAJ下载</b></a>";
Pattern pt=Pattern.compile(regex, Pattern.DOTALL);
Matcher ma=pt.matcher(s);
while(ma.find()){
sb.append(ma.group());
}
cajUrl=deleteTag(sb.toString());
return cajUrl;
}
/**
* 去除tag标记
*/
public static String deleteTag(String s){
s=s.replaceAll("<.*?>", "");
s=s.replaceAll("
", "");//转换的换行符,全部去掉
if(s.contains("<正>")){
s=s.replaceAll("<正>", "");
}
if(s.contains(" ")){
s=s.replaceAll(" ", "");
}
if(s.contains("pdf")){
s=s.replaceAll("pdf", "");
}
if(s.contains("\"><a target=\"_blank\" href=\"")){
s=s.replaceAll("\"><a target=\"_blank\" href=\"", "");
}
if(s.contains("\">PDF下载")){
s=s.replaceAll("\">PDF下载", "");
}
if(s.contains("<a target=\"_blank\" href=\"")){
s=s.replaceAll("<a target=\"_blank\" href=\"", "");
}
if(s.contains("\">CAJ下载")){
s=s.replaceAll("\">CAJ下载", "");
}
return s;
}
public MagazDetailVO getMagaz() {
return magaz;
}
public void setMagaz(MagazDetailVO magaz) {
this.magaz = magaz;
}
}