1. 提取HTML网页的内容
代码如下：
import java.net.*;
import java.io.*;
/**
 * Downloads a web page and echoes it to standard output, one line at a time.
 */
public class hello {
    /** Page fetched when no URL is supplied on the command line. */
    private static final String DEFAULT_URL =
            "https://docs.oracle.com/javase/tutorial/networking/urls/readingURL.html";

    /**
     * Reads the page line by line and prints each line to {@code System.out}.
     *
     * @param args optional; when present, {@code args[0]} is the URL to read instead of
     *             the default tutorial page
     * @throws Exception if the URL is malformed or the page cannot be read
     */
    public static void main(String[] args) throws Exception {
        String address = (args != null && args.length > 0) ? args[0] : DEFAULT_URL;
        URL oracle = new URL(address);
        // try-with-resources closes the reader (and the underlying stream) even when
        // readLine() fails part-way through; the charset is explicit so the output does
        // not depend on the platform default encoding.
        try (BufferedReader in = new BufferedReader(
                new InputStreamReader(oracle.openStream(), java.nio.charset.StandardCharsets.UTF_8))) {
            String inputLine;
            while ((inputLine = in.readLine()) != null) {
                System.out.println(inputLine);
            }
        }
    }
}
另一种实现方法如下：
import java.net.*;
import java.io.*;
/**
 * Downloads a web page into a single string and prints it.
 */
public class hello {
    /**
     * Fetches the raw content behind {@code urlString}.
     *
     * @param urlString address of the page to download
     * @return the page content decoded as UTF-8, or a short HTML error message followed
     *         by {@code urlString} when the URL is malformed or the download fails
     */
    public static String loadWebPage(String urlString) {
        byte[] buffer = new byte[1024];
        // try-with-resources guarantees the stream is closed on every path.
        try (InputStream in = new URL(urlString).openStream()) {
            ByteArrayOutputStream page = new ByteArrayOutputStream();
            int len;
            while ((len = in.read(buffer)) != -1) {
                // Keep only the bytes this read produced; the rest of the buffer holds
                // zeros or leftovers from an earlier read.
                page.write(buffer, 0, len);
            }
            // Decode once at the end so a multi-byte character split across two reads
            // is not corrupted.
            return new String(page.toByteArray(), java.nio.charset.StandardCharsets.UTF_8);
        } catch (IOException e) {
            return "<h1>Fail to download the page</h1>" + urlString;
        }
    }

    /**
     * Prints the content of the Oracle "reading a URL" tutorial page.
     *
     * @param args unused
     */
    public static void main(String[] args) throws Exception {
        String url = "https://docs.oracle.com/javase/tutorial/networking/urls/readingURL.html";
        System.out.println(loadWebPage(url));
    }
}
2. ParserCallback处理HTML文本数据
ParserCallback 提供了处理 HTML 文本的方法，示例如下：
import java.net.*;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;
import java.io.*;
/**
 * Parser callback that gathers the text passed to {@link #handleText}.
 * Every text run is appended to {@link #content}, preceded by a single space.
 */
class MyParserCallback extends HTMLEditorKit.ParserCallback {
    /** Text collected so far; starts out empty. */
    public String content = "";

    /**
     * Appends one run of text to {@link #content}.
     *
     * @param data characters of the text run
     * @param pos  position argument supplied by the caller; not used here
     */
    @Override
    public void handleText(char[] data, int pos) {
        String textRun = String.valueOf(data);
        content = content + " " + textRun;
    }
}
/**
 * Downloads a web page either as raw HTML or as plain text with the markup removed.
 */
public class hello {
    /**
     * Downloads the page and returns only its text content.
     * The page is run through a {@link ParserDelegator} with a {@code MyParserCallback},
     * and the text that callback collected is returned.
     *
     * @param urlString address of the page to download
     * @return the text collected by the callback
     * @throws IOException if the URL is malformed or the page cannot be read or parsed
     */
    public static String loadPlainText(String urlString) throws IOException {
        MyParserCallback callback = new MyParserCallback();
        ParserDelegator parser = new ParserDelegator();
        URL url = new URL(urlString);
        // try-with-resources closes the network stream even when parsing fails; the
        // charset is explicit so decoding does not depend on the platform default.
        try (InputStreamReader reader = new InputStreamReader(
                url.openStream(), java.nio.charset.StandardCharsets.UTF_8)) {
            parser.parse(reader, callback, true);
        }
        return callback.content;
    }

    /**
     * Fetches the raw content behind {@code urlString}.
     *
     * @param urlString address of the page to download
     * @return the page content decoded as UTF-8, or a short HTML error message followed
     *         by {@code urlString} when the URL is malformed or the download fails
     */
    public static String loadWebPage(String urlString) {
        byte[] buffer = new byte[1024];
        // try-with-resources guarantees the stream is closed on every path.
        try (InputStream in = new URL(urlString).openStream()) {
            ByteArrayOutputStream page = new ByteArrayOutputStream();
            int len;
            while ((len = in.read(buffer)) != -1) {
                // Keep only the bytes this read produced; the rest of the buffer holds
                // zeros or leftovers from an earlier read.
                page.write(buffer, 0, len);
            }
            // Decode once at the end so a multi-byte character split across two reads
            // is not corrupted.
            return new String(page.toByteArray(), java.nio.charset.StandardCharsets.UTF_8);
        } catch (IOException e) {
            return "<h1>Unable to download the page</h1>" + urlString;
        }
    }

    /**
     * Prints the plain-text content of the Oracle "reading a URL" tutorial page.
     *
     * @param args unused
     */
    public static void main(String[] args) throws Exception {
        String url = "https://docs.oracle.com/javase/tutorial/networking/urls/readingURL.html";
        System.out.println(loadPlainText(url));
    }
}
参考资料
[1. Oracle URL website] https://docs.oracle.com/javase/tutorial/networking/urls/readingURL.html