jsoup 是一款 Java 的 HTML 解析器,可直接解析某个 URL 地址或 HTML 文本内容。它提供了一套非常省力的 API,可通过 DOM、CSS 以及类似于 jQuery 的操作方法来取出和操作数据。
学习的过程中可能还会用到其他的 API,下面用一个类做一个简单的测试:
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URL;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Small jsoup demo: fetches a web page, selects every {@code <img>} whose
 * {@code src} ends in {@code .jpg}, and downloads each image into a local
 * directory as {@code haha<N>.jpg}.
 */
public class JsoupTest {

    /** Directory the downloaded images are written to. */
    private static final File IMAGE_DIR = new File("f:/img");

    /** Size of the copy buffer used while streaming an image to disk. */
    private static final int BUFFER_SIZE = 1024;

    /** Without explicit timeouts HttpURLConnection waits forever on a stalled server. */
    private static final int CONNECT_TIMEOUT_MS = 10000;
    private static final int READ_TIMEOUT_MS = 30000;

    /** Running index used to build the local file name; advanced once per download attempt. */
    private static int count = 0;

    public static void main(String[] args) throws IOException {
        JsoupTest jsoupTest = new JsoupTest();
        String url = "http://tieba.baidu.com/p/4549504175";
        // 1. Simple use of jsoup
        jsoupTest.getHtmlElements(url);
    }

    /**
     * Downloads the image at {@code destUrl} and stores it as
     * {@code haha<count>.jpg} inside the image directory.
     *
     * <p>Failures are reported on {@code System.err} and do not propagate, so a
     * single bad image does not abort a batch. The counter is advanced on every
     * call, whether or not the download succeeds.
     *
     * @param destUrl absolute HTTP(S) URL of the image to download
     */
    public void saveToFile(String destUrl) {
        // Reserve the index up front; this matches advancing the counter after every attempt.
        int index = count++;

        HttpURLConnection httpUrl = null;
        BufferedInputStream bis = null;
        FileOutputStream fos = null;
        try {
            java.net.URLConnection connection = new URL(destUrl).openConnection();
            // Check the type explicitly instead of casting and catching ClassCastException.
            if (!(connection instanceof HttpURLConnection)) {
                System.err.println("Skipping non-HTTP URL: " + destUrl);
                return;
            }
            httpUrl = (HttpURLConnection) connection;
            httpUrl.setConnectTimeout(CONNECT_TIMEOUT_MS);
            httpUrl.setReadTimeout(READ_TIMEOUT_MS);
            httpUrl.connect();
            bis = new BufferedInputStream(httpUrl.getInputStream());

            System.out.println(stripSchemeAndExtension(destUrl));

            if (!IMAGE_DIR.isDirectory() && !IMAGE_DIR.mkdirs()) {
                throw new IOException("cannot create directory " + IMAGE_DIR.getAbsolutePath());
            }
            // Resolve against the directory object so directory and file path always agree.
            File file = new File(IMAGE_DIR, "haha" + index + ".jpg");
            System.out.println(file.getAbsolutePath());

            fos = new FileOutputStream(file);
            byte[] buf = new byte[BUFFER_SIZE];
            int size;
            while ((size = bis.read(buf)) != -1) {
                fos.write(buf, 0, size);
            }
            fos.flush();
        } catch (IOException e) {
            System.err.println("Failed to save " + destUrl + ": " + e);
        } finally {
            // Release each resource independently so one failure cannot leak the others.
            closeQuietly(fos);
            closeQuietly(bis);
            if (httpUrl != null) {
                httpUrl.disconnect();
            }
        }
    }

    /**
     * Returns {@code url} without its {@code scheme://} prefix and without the
     * part from the last dot onwards, e.g. {@code http://host/a/b.jpg} becomes
     * {@code host/a/b}. Never throws for URLs lacking a scheme or a dot.
     */
    private static String stripSchemeAndExtension(String url) {
        int schemeEnd = url.indexOf("://");
        int start = (schemeEnd < 0) ? 0 : schemeEnd + 3;
        int end = url.lastIndexOf('.');
        if (end < start) {
            end = url.length();
        }
        return url.substring(start, end);
    }

    /** Closes {@code resource} if it is non-null, reporting (not propagating) any failure. */
    private static void closeQuietly(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            System.err.println("Failed to close resource: " + e);
        }
    }

    /**
     * Fetches the page at {@code url}, selects every {@code <img>} element whose
     * {@code src} attribute ends with {@code .jpg}, and downloads each one.
     *
     * @param url address of the HTML page to scan
     */
    private void getHtmlElements(String url) {
        try {
            Document doc = Jsoup.connect(url).get();
            // Select img elements whose src ends with ".jpg"
            Elements jpgs = doc.select("img[src$=.jpg]");
            for (Element element : jpgs) {
                // Resolve relative and protocol-relative paths against the page URL;
                // absUrl returns "" when it cannot, so fall back to the raw attribute.
                String src = element.absUrl("src");
                if (src.length() == 0) {
                    src = element.attr("src");
                }
                saveToFile(src);
            }
        } catch (IOException e) {
            System.err.println("Failed to fetch " + url + ": " + e);
        }
    }
}
参考资料
1.http://www.open-open.com/jsoup/parsing-a-document.htm
2.http://blog.csdn.net/withiter/article/details/15339579
3.http://blog.csdn.net/csh159/article/details/7310009