使用HtmlParser读取论坛图片

最新推荐文章于 2024-09-09 18:12:37 发布

chz20072008

最新推荐文章于 2024-09-09 18:12:37 发布

阅读量114

点赞数

文章标签： .net

package com.chen;

import java.io.BufferedInputStream;
import java.io.Closeable;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashSet;
import java.util.Set;

import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.TitleTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

/
public class HttpGet {

	private static int BUFFER_SIZE = 8096;// 缓冲区大小

	/**
	 * 将HTTP资源另存为文件
	 * 
	 * @param destUrl
	 *            String
	 * @param title
	 * @param fileName
	 *            String
	 * @throws IOException
	 * @throws Exception
	 */
	public static void saveToFile(String destUrl, String title) {
		FileOutputStream fos = null;
		BufferedInputStream bis = null;
		HttpURLConnection httpUrl = null;
		URL url = null;
		byte[] buf = new byte[BUFFER_SIZE];
		int size = 0;
		int pos = destUrl.lastIndexOf('/');

		String fileName = "";
		if (pos != -1)
			fileName = destUrl.substring(pos + 1, destUrl.length());
		else
			fileName = destUrl.substring(destUrl.length() - 10, destUrl
					.length());
		String path = "D:" + File.separator + "temp" + File.separator
				+ "images" + File.separator;

		System.out.println("title: " + title);

		if (null != title && !"".equals(title)) {
			File file = new File(path + title + File.separator);
			if (!file.exists()) {
				file.mkdirs();

			}
		
			path = file.getPath();
		}
		path = path + File.separator + fileName;
		System.out.print("\t" + path);
		// 建立链接
		try {
			url = new URL(destUrl);
			httpUrl = (HttpURLConnection) url.openConnection();
			// 连接指定的资源
			httpUrl.connect();
			// 获取网络输入流
			bis = new BufferedInputStream(httpUrl.getInputStream());
			// 建立文件

			fos = new FileOutputStream(path);

			// 保存文件
			while ((size = bis.read(buf)) != -1)
				fos.write(buf, 0, size);

			fos.close();
			bis.close();
			httpUrl.disconnect();
		} catch (MalformedURLException e) {
			// TODO Auto-generated catch block
			e.printStackTrace();
		} catch (IOException ex) {
			// TODO Auto-generated catch block
			ex.printStackTrace();
		}
		System.out.println(" \tsave completely");
	}

	/**
	 * 主方法
	 * 
	 * @param argv
	 *            String[]
	 */
	public static void main(String argv[]) {

		String url = "http://xxx.com";
		
		// getImagesFromSinglePage(url);
		try {
			
			String page = null;
			for(int i=2;i<=105;i++){
				page="http://xx.com/html/13/13_"+i+".shtml";
				getPageLinks(page);
			}
			getPageLinks(url);
			// String title=getTitle(url);
			// getImages(url,title);
		} catch (Exception e) {
			// TODO Auto-generated catch block
			e.printStackTrace();
		}
		// getImagesByParser(url);

	}

	private static void getPageLinks(String page) throws ParserException {

		Parser myParser = new Parser(page);

		// 设置编码
		myParser.setEncoding("UTF-8");
		String filterStr = "a";
		NodeFilter filter = new TagNameFilter(filterStr);
		NodeList nodeList = myParser.extractAllNodesThatMatch(filter);
		System.out.println("size: " + nodeList.size());

		for (int i = 0; i < nodeList.size(); i++) {
			LinkTag linkTag = (LinkTag) nodeList.elementAt(i);
			String link = linkTag.getLink();
			String text = linkTag.getLinkText();
			text = TextProcess(text);
			if (link.endsWith(".shtml") && text.length() > 2) {
				try {
					getImages(link,text);
				} catch (Exception e) {
					// TODO Auto-generated catch block
					e.printStackTrace();
				}
			}
		}

	}

	private static String TextProcess(String text) {
		text = text.trim();
		text = text.replaceAll(">", "");
		text = text.replaceAll("<", "");
		text = text.replaceAll("/", "");
		text = text.replaceAll(">", "");
		text = text.replaceAll(" ", "");
		int pos = 0;
		if ((pos = text.indexOf(":-")) != -1)
			text = text.substring(pos + 2);

		pos = text.indexOf("－");
		if (pos != -1)
			text = text.substring(0, pos);
		pos = text.indexOf("-");
		if (pos != -1)
			text = text.substring(0, pos);
		text = text.replace(".", "");
		text = text.replaceAll(",", "");
		text = text.replaceAll("，", "");
		return text;
	}

	private static String getTitle(String url) throws ParserException {

		Parser myParser = new Parser(url);

		// 设置编码
		myParser.setEncoding("UTF-8");

		String titleTag = "title";
		NodeFilter titleFilter = new TagNameFilter(titleTag);
		NodeList titleList = myParser.extractAllNodesThatMatch(titleFilter);
		int size = titleList.size();
		String title = null;
		if (size == 1) {
			TitleTag titleT = (TitleTag) titleList.elementAt(0);
			title = titleT.getTitle();

		}
		return title;

	}

	public static void getImages(String resource, String title)
			throws Exception {

		// Set
		Set<String> imagesSet = new HashSet<String>();
		Parser myParser = new Parser(resource);

		// 设置编码
		myParser.setEncoding("UTF-8");
		String filterStr = "img";
		NodeFilter filter = new TagNameFilter(filterStr);
		NodeList nodeList = myParser.extractAllNodesThatMatch(filter);
		System.out.println("size: " + nodeList.size());

		for (int i = 0; i < nodeList.size(); i++) {
			ImageTag imageTag = (ImageTag) nodeList.elementAt(i);
			String imageUrl = imageTag.getImageURL();
			System.out.println("iamge " + i + ": " + imageUrl);
			if (!imagesSet.contains(imageUrl)) {
				System.out.print("\t saving ...");
				saveToFile(imageTag.getImageURL(), title);
			} else {
				System.out.print("\t exist already,no need to save");
			}
		}

	}

}

心血来潮，写了一个读取论坛图片的程序，能够自动把图片保存到硬盘上，使用HtmlParser组件。

chz20072008

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
使用HtmlParser读取论坛图片

package com.chen;import java.io.BufferedInputStream;import java.io.File;import java.io.FileOutputStream;import java.io.IOException;import java.net.HttpURLConnection;import java.net.MalformedURL...
复制链接

扫一扫