使用 HttpClient 获取页面 HTML
/**
 * Fetches a page with an HTTP GET via Apache HttpClient and returns its body as text.
 *
 * <p>Both the client and the response are closed before returning, so the
 * underlying connection is always released. I/O failures are printed to
 * standard error and result in an empty string rather than an exception.
 *
 * @param url the address to request
 * @return the response body, or {@code ""} when the request fails or the
 *         response carries no entity
 */
public static String getPageContent(String url) {
    String content = "";
    // try-with-resources closes the response first, then the client.
    try (CloseableHttpClient client = HttpClients.custom().build();
         CloseableHttpResponse response = client.execute(new HttpGet(url))) {
        HttpEntity entity = response.getEntity();
        // Responses without a body have no entity to convert.
        if (entity != null) {
            content = EntityUtils.toString(entity);
        }
    } catch (IOException e) {
        // ClientProtocolException is an IOException, so one handler covers both.
        e.printStackTrace();
    }
    return content;
}
使用URL获取HTML
/**
 * Reads the resource at {@code url} through {@link URL#openConnection()} and
 * returns its text.
 *
 * <p>The content is read line by line and the lines are concatenated without
 * their line terminators. Bytes are decoded as UTF-8. The reader is always
 * closed. A malformed URL or an I/O failure is printed to standard error, and
 * whatever was read before the failure is returned.
 *
 * @param url the address to read from
 * @return the concatenated lines, or {@code ""} if nothing could be read
 */
public static String getContent(String url) {
    StringBuilder result = new StringBuilder();
    try {
        URLConnection conn = new URL(url).openConnection();
        // Closing the reader also closes the connection's input stream.
        try (BufferedReader in = new BufferedReader(new InputStreamReader(
                conn.getInputStream(), java.nio.charset.StandardCharsets.UTF_8))) {
            String line;
            while ((line = in.readLine()) != null) {
                result.append(line);
            }
        }
    } catch (IOException e) {
        // MalformedURLException is an IOException, so one handler covers both.
        e.printStackTrace();
    }
    return result.toString();
}
运行结果
使用正则获取关键内容
/**
 * Searches {@code content} for the first match of {@code regex} and prints the outcome.
 *
 * <p>Prints {@code true} or {@code false} depending on whether a match exists,
 * followed by the text of that first match when there is one.
 * {@link Matcher#find()} advances the matcher on every call, so it is invoked
 * exactly once and its result reused; calling it again would skip the first match.
 *
 * @param regex   the regular expression to compile
 * @param content the text to search
 */
public static void regex(String regex, String content) {
    Matcher matcher = Pattern.compile(regex).matcher(content);
    boolean found = matcher.find();
    System.out.println(found);
    if (found) {
        System.out.println(matcher.group());
    }
}
使用htmlcleaner
/**
 * Parses the file {@code xxx.html} with HtmlCleaner and prints selected parts of it.
 *
 * <p>First prints the text of the first {@code title} element, if any is found,
 * then prints every result of the XPath query
 * {@code //div[@class='d_1']//li}, one per line.
 *
 * @throws Exception if parsing the file or evaluating the XPath expression fails
 */
public static void htmlCleaner() throws Exception {
    TagNode root = new HtmlCleaner().clean(new File("xxx.html"));

    // Look up elements by tag name (recursive search).
    TagNode[] titles = root.getElementsByName("title", true);
    if (titles.length != 0) {
        System.out.println(titles[0].getText());
    }

    // Look up elements by XPath expression.
    Object[] matches = root.evaluateXPath("//div[@class='d_1']//li");
    for (int i = 0; i < matches.length; i++) {
        System.out.println(matches[i]);
    }
}
未完待续，持续更新中……