测试类:
public class JsoupCleanTest {
/**
 * Downloads a sample page and prints, for every {@code <script>} element in it,
 * the first absolute URL found in that element's content.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    // scheme "://" followed by everything up to the next whitespace character.
    // The previous pattern "[a-zA-z]+://[^s]*" had two typos: the range "A-z"
    // also accepts the punctuation between 'Z' and 'a', and "[^s]" (a "\s" that
    // lost its backslash) stopped the match at the first letter 's', cutting
    // URLs such as ".../2280138.shtml" short.
    String urlRegex = "[a-zA-Z]+://[^\\s]*";
    String url = "http://dnf.766.com/mood/20140512/2280138.shtml";
    String html = RequestUrlUtil.requestUrl(url);
    Pattern urlPattern = Pattern.compile(urlRegex);

    // requestUrl returns null when the download fails, so there is nothing to scan.
    if (!StringUtils.isNotBlank(html)) {
        return;
    }

    // The page URL is passed as base URI for the parsed document.
    Document document = Jsoup.parse(html, url);
    Elements scripts = document.select("script");
    for (Element script : scripts) {
        Matcher matcher = urlPattern.matcher(script.html());
        // Only the first match of each script element is reported.
        if (matcher.find()) {
            System.out.println(matcher.group());
        }
    }
}
获得网页html工具类:RequestUrlUtil
package commons;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.io.InputStream;
import java.util.Date;
/**
 * Fetches the HTML source of a web page as a string.
 *
 * <p>URLs containing {@code www.gq.com.cn} are fetched through Jsoup's own
 * connection. Every other URL is downloaded with the shared HttpClient and the
 * raw bytes are decoded as GBK or UTF-8, depending on the charset declared in
 * the page's {@code <meta>} tags.
 */
public class RequestUrlUtil {
    /** URLs containing this host name are fetched with Jsoup instead of HttpClient. */
    private static final String JSOUP_HOST = "www.gq.com.cn";
    /** Browser-like User-Agent header sent with every HttpClient request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.9) Gecko/20100315 Firefox/3.5.9";
    /** Charset used for the first decoding pass and as the fallback result. */
    private static final String GBK = "gbk";
    /** The only charset declaration that switches decoding away from GBK. */
    private static final String UTF_8 = "utf-8";
    /** Pause applied after a failed HttpClient request, in milliseconds. */
    private static final long FAILURE_PAUSE_MILLIS = 1000;

    /** Client instance reused by every call to {@link #requestUrl(String)}. */
    private static final DefaultHttpClient client = new DefaultHttpClient();

    /**
     * Downloads the page at {@code url} and returns its HTML.
     *
     * @param url absolute URL of the page to fetch
     * @return the page content, or {@code null} when {@code url} is blank, the
     *         response carries no body, or the request fails with an I/O error
     */
    public static String requestUrl(String url) {
        // A blank URL cannot be requested; report it like any other failure.
        if (StringUtils.isBlank(url)) {
            return null;
        }
        if (StringUtils.contains(url, JSOUP_HOST)) {
            return fetchWithJsoup(url);
        }
        return fetchWithHttpClient(url);
    }

    /** Fetches the page through Jsoup and returns the serialized document, or null on I/O failure. */
    private static String fetchWithJsoup(String url) {
        try {
            Document document = Jsoup.connect(url).get();
            return document.toString();
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        }
    }

    /** Fetches the page through the shared HttpClient and decodes it, or returns null on I/O failure. */
    private static String fetchWithHttpClient(String url) {
        HttpGet get = new HttpGet(url);
        get.setHeader("User-Agent", USER_AGENT);
        try {
            byte[] contentBytes = readBody(get);
            if (contentBytes == null) {
                return null;
            }
            return decodeBody(contentBytes);
        } catch (IOException e) {
            // ClientProtocolException is an IOException, so one handler covers both.
            e.printStackTrace();
            pauseAfterFailure();
            return null;
        }
    }

    /** Executes the request and returns the raw response bytes, or null when there is no entity. */
    private static byte[] readBody(HttpGet get) throws IOException {
        HttpResponse response = client.execute(get);
        HttpEntity entity = response.getEntity();
        if (entity == null) {
            return null;
        }
        InputStream inputStream = entity.getContent();
        try {
            return IOUtils.toByteArray(inputStream);
        } finally {
            // Always close the content stream, also when reading fails half-way.
            try {
                inputStream.close();
            } catch (IOException ignored) {
                // Either the body was read completely or the read error is already propagating.
            }
        }
    }

    /**
     * Decodes the response bytes. The bytes are first decoded as GBK so the markup
     * can be parsed and inspected for a charset declaration; when that declaration
     * selects UTF-8 the bytes are decoded again as UTF-8.
     */
    private static String decodeBody(byte[] contentBytes) throws IOException {
        String gbkText = new String(contentBytes, GBK);
        Document document = Jsoup.parse(gbkText);
        if (shouldDecodeAsUtf8(document)) {
            return new String(contentBytes, UTF_8);
        }
        return gbkText;
    }

    /** Decides from the document's meta tags whether the page must be decoded as UTF-8. */
    private static boolean shouldDecodeAsUtf8(Document document) {
        Elements contentTypeMetas = document.getElementsByAttributeValue("http-equiv", "content-type");
        if (contentTypeMetas.size() == 0) {
            // No http-equiv declaration: the first meta tag with a charset attribute decides.
            for (Element meta : document.getElementsByTag("meta")) {
                if (meta.hasAttr("charset")) {
                    return meta.attr("charset").equalsIgnoreCase(UTF_8);
                }
            }
            // No charset declared anywhere: UTF-8 is used.
            return true;
        }

        Element first = contentTypeMetas.first();
        if (first.hasAttr("charset")) {
            return first.attr("charset").equalsIgnoreCase(UTF_8);
        }
        // Otherwise look at the end of the content attribute, e.g. "text/html; charset=utf-8".
        String content = first.attr("content");
        return StringUtils.isNotBlank(content)
                && StringUtils.endsWith(content.toLowerCase(), UTF_8);
    }

    /** Sleeps briefly after a failed request, keeping the interrupt flag if interrupted. */
    private static void pauseAfterFailure() {
        try {
            Thread.sleep(FAILURE_PAUSE_MILLIS);
        } catch (InterruptedException e) {
            // Restore the flag so callers can still observe the interruption.
            Thread.currentThread().interrupt();
        }
    }
}