心血来潮，写了一个读取论坛图片的程序，能够自动把图片保存到硬盘上，使用 HTMLParser（org.htmlparser）组件。
http://hintcnuie.javaeye.com/blog/172132
- package com.chen;
- import java.io.BufferedInputStream;
- import java.io.File;
- import java.io.FileOutputStream;
- import java.io.IOException;
- import java.net.HttpURLConnection;
- import java.net.MalformedURLException;
- import java.net.URL;
- import java.util.HashSet;
- import java.util.Set;
- import org.htmlparser.NodeFilter;
- import org.htmlparser.Parser;
- import org.htmlparser.filters.TagNameFilter;
- import org.htmlparser.tags.ImageTag;
- import org.htmlparser.tags.LinkTag;
- import org.htmlparser.tags.TitleTag;
- import org.htmlparser.util.NodeList;
- import org.htmlparser.util.ParserException;
- /
- public class HttpGet {
- private static int BUFFER_SIZE = 8096;// 缓冲区大小
- /**
- * 将HTTP资源另存为文件
- *
- * @param destUrl
- * String
- * @param title
- * @param fileName
- * String
- * @throws IOException
- * @throws Exception
- */
- public static void saveToFile(String destUrl, String title) {
- FileOutputStream fos = null;
- BufferedInputStream bis = null;
- HttpURLConnection httpUrl = null;
- URL url = null;
- byte[] buf = new byte[BUFFER_SIZE];
- int size = 0;
- int pos = destUrl.lastIndexOf('/');
- String fileName = "";
- if (pos != -1)
- fileName = destUrl.substring(pos + 1, destUrl.length());
- else
- fileName = destUrl.substring(destUrl.length() - 10, destUrl
- .length());
- String path = "D:" + File.separator + "temp" + File.separator
- + "images" + File.separator;
- System.out.println("title: " + title);
- if (null != title && !"".equals(title)) {
- File file = new File(path + title + File.separator);
- if (!file.exists()) {
- file.mkdirs();
- }
- path = file.getPath();
- }
- path = path + File.separator + fileName;
- System.out.print("/t" + path);
- // 建立链接
- try {
- url = new URL(destUrl);
- httpUrl = (HttpURLConnection) url.openConnection();
- // 连接指定的资源
- httpUrl.connect();
- // 获取网络输入流
- bis = new BufferedInputStream(httpUrl.getInputStream());
- // 建立文件
- fos = new FileOutputStream(path);
- // 保存文件
- while ((size = bis.read(buf)) != -1)
- fos.write(buf, 0, size);
- fos.close();
- bis.close();
- httpUrl.disconnect();
- } catch (MalformedURLException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- } catch (IOException ex) {
- // TODO Auto-generated catch block
- ex.printStackTrace();
- }
- System.out.println(" /tsave completely");
- }
- /**
- * 主方法
- *
- * @param argv
- * String[]
- */
- public static void main(String argv[]) {
- String url = "http://xxx.com";
- // getImagesFromSinglePage(url);
- try {
- String page = null;
- for(int i=2;i<=105;i++){
- page="http://xx.com/html/13/13_"+i+".shtml";
- getPageLinks(page);
- }
- getPageLinks(url);
- // String title=getTitle(url);
- // getImages(url,title);
- } catch (Exception e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- }
- // getImagesByParser(url);
- }
- private static void getPageLinks(String page) throws ParserException {
- Parser myParser = new Parser(page);
- // 设置编码
- myParser.setEncoding("UTF-8");
- String filterStr = "a";
- NodeFilter filter = new TagNameFilter(filterStr);
- NodeList nodeList = myParser.extractAllNodesThatMatch(filter);
- System.out.println("size: " + nodeList.size());
- for (int i = 0; i < nodeList.size(); i++) {
- LinkTag linkTag = (LinkTag) nodeList.elementAt(i);
- String link = linkTag.getLink();
- String text = linkTag.getLinkText();
- text = TextProcess(text);
- if (link.endsWith(".shtml") && text.length() > 2) {
- try {
- getImages(link,text);
- } catch (Exception e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- }
- }
- }
- }
- private static String TextProcess(String text) {
- text = text.trim();
- text = text.replaceAll(">", "");
- text = text.replaceAll("<", "");
- text = text.replaceAll("/", "");
- text = text.replaceAll(">", "");
- text = text.replaceAll(" ", "");
- int pos = 0;
- if ((pos = text.indexOf(":-")) != -1)
- text = text.substring(pos + 2);
- pos = text.indexOf("-");
- if (pos != -1)
- text = text.substring(0, pos);
- pos = text.indexOf("-");
- if (pos != -1)
- text = text.substring(0, pos);
- text = text.replace(".", "");
- text = text.replaceAll(",", "");
- text = text.replaceAll(",", "");
- return text;
- }
- private static String getTitle(String url) throws ParserException {
- Parser myParser = new Parser(url);
- // 设置编码
- myParser.setEncoding("UTF-8");
- String titleTag = "title";
- NodeFilter titleFilter = new TagNameFilter(titleTag);
- NodeList titleList = myParser.extractAllNodesThatMatch(titleFilter);
- int size = titleList.size();
- String title = null;
- if (size == 1) {
- TitleTag titleT = (TitleTag) titleList.elementAt(0);
- title = titleT.getTitle();
- }
- return title;
- }
- public static void getImages(String resource, String title)
- throws Exception {
- // Set
- Set<String> imagesSet = new HashSet<String>();
- Parser myParser = new Parser(resource);
- // 设置编码
- myParser.setEncoding("UTF-8");
- String filterStr = "img";
- NodeFilter filter = new TagNameFilter(filterStr);
- NodeList nodeList = myParser.extractAllNodesThatMatch(filter);
- System.out.println("size: " + nodeList.size());
- for (int i = 0; i < nodeList.size(); i++) {
- ImageTag imageTag = (ImageTag) nodeList.elementAt(i);
- String imageUrl = imageTag.getImageURL();
- System.out.println("iamge " + i + ": " + imageUrl);
- if (!imagesSet.contains(imageUrl)) {
- System.out.print("/t saving ...");
- saveToFile(imageTag.getImageURL(), title);
- } else {
- System.out.print("/t exist already,no need to save");
- }
- }
- }
- }