使用HtmlParser获取网页目标数据

最新推荐文章于 2017-11-30 19:41:40 发布

MockingBird9527

最新推荐文章于 2017-11-30 19:41:40 发布

阅读量432

点赞数 1

本文链接：https://blog.csdn.net/weixin_36146275/article/details/54944168

版权

准备工具：

htmlParser架包下载地址：【https://sourceforge.net/projects/htmlparser/files/htmlparser/】

参考API地址：【http://tool.oschina.net/apidocs/apidoc?api=HTMLParser】

程序实现获取【http://www.ygdy8.net/html/gndy/dyzz/index.html】此网站上所有电影的下载地址。

分析页面数据：本网页共有157页，每页25部电影介绍，点击进入电影介绍就可以看见下载链接

点击电影介绍进入下载页面

一、获取157页"电影介绍地址"

经过分析，value的值是目标数据：

       /**
	 * 从http://www.ygdy8.net/html/gndy/dyzz/index.html网页里获取157个电影分页的地址
	 * 
	 * @return List集合存储157个分页地址
	 */
	public static List getAllPageList() {
		List<String> pageList = new ArrayList();
		try {
			Parser parser = new Parser("http://www.ygdy8.net/html/gndy/dyzz/index.html");
			// match匹配带option属性的标签
			NodeList nodeList = parser.extractAllNodesThatMatch(new TagNameFilter("option"))
					.extractAllNodesThatMatch(new HasAttributeFilter("value"));
			for (int i = 0; i < nodeList.size(); i++) {
				OptionTag tag = (OptionTag) nodeList.elementAt(i);
				String link = tag.getAttribute("value");
				// 筛选既带有option属性又含有list的标签
				if (link.contains("list")) {
					// System.out.println(i+",,,"+link);
					pageList.add("http://www.ygdy8.net/html/gndy/dyzz/" + link);
				}
			}
		} catch (ParserException e) {
			e.printStackTrace();
		}
		return pageList;
	}

二、获取每页含有的"电影介绍地址"

	/**
	 * 
	 * 获取每页25个电影的介绍地址
	 */
	public static List getAllMovieUrlFromOnePageList(String pageListUrl) {
		List<String> allMoveUrl = new ArrayList<String>();
		try {
			Parser parser = new Parser(pageListUrl);
			NodeList nodeList = parser.extractAllNodesThatMatch(new HasAttributeFilter("class", "ulink"));
			for (int i = 0; i < nodeList.size(); i++) {
				LinkTag tag = (LinkTag) nodeList.elementAt(i);
				// System.out.println(tag.getLink());
				allMoveUrl.add("http://www.ygdy8.net" + tag.getLink());
			}
		} catch (ParserException e) {
			e.printStackTrace();
		}
		return allMoveUrl;
	}

三、获取下载地址

	/**
	 * 获取电影的下载地址
	 */
	public static String[] getMovieDownloadUrl(String movieIntroUrl) {
		String[] downloadUrl = null;
		try {
			Parser parser = new Parser(movieIntroUrl);
			NodeList nodeList = parser.extractAllNodesThatMatch(new LinkStringFilter("ftp"));

			for (int i = 0; i < nodeList.size(); i++) {
				LinkTag tag = (LinkTag) nodeList.elementAt(i);
				System.out.println(tag.getLink());
			}
		} catch (ParserException e) {
			e.printStackTrace();
		}
		return downloadUrl;
	}

运行结果：

愉快地下载电影：

最后把完整源码献上：

public class HtmlParserDemo {

	/**
	 * 获取电影的下载地址
	 */
	public static String[] getMovieDownloadUrl(String movieIntroUrl) {
		String[] downloadUrl = null;
		try {
			Parser parser = new Parser(movieIntroUrl);
			NodeList nodeList = parser.extractAllNodesThatMatch(new LinkStringFilter("ftp"));

			for (int i = 0; i < nodeList.size(); i++) {
				LinkTag tag = (LinkTag) nodeList.elementAt(i);
				System.out.println(tag.getLink());
			}
		} catch (ParserException e) {
			e.printStackTrace();
		}
		return downloadUrl;
	}

	/**
	 * 
	 * 获取每页25个电影的介绍地址
	 */
	public static List getAllMovieUrlFromOnePageList(String pageListUrl) {
		List<String> allMoveUrl = new ArrayList<String>();
		try {
			Parser parser = new Parser(pageListUrl);
			NodeList nodeList = parser.extractAllNodesThatMatch(new HasAttributeFilter("class", "ulink"));
			for (int i = 0; i < nodeList.size(); i++) {
				LinkTag tag = (LinkTag) nodeList.elementAt(i);
				// System.out.println(tag.getLink());
				allMoveUrl.add("http://www.ygdy8.net" + tag.getLink());
			}
		} catch (ParserException e) {
			e.printStackTrace();
		}
		return allMoveUrl;
	}

	/**
	 * 从http://www.ygdy8.net/html/gndy/dyzz/index.html网页里获取157个电影分页的地址
	 * 
	 * @return List集合存储157个分页地址
	 */
	public static List getAllPageList() {
		List<String> pageList = new ArrayList();
		try {
			Parser parser = new Parser("http://www.ygdy8.net/html/gndy/dyzz/index.html");
			// match匹配带option属性的标签
			NodeList nodeList = parser.extractAllNodesThatMatch(new TagNameFilter("option"))
					.extractAllNodesThatMatch(new HasAttributeFilter("value"));
			for (int i = 0; i < nodeList.size(); i++) {
				OptionTag tag = (OptionTag) nodeList.elementAt(i);
				String link = tag.getAttribute("value");
				// 筛选既带有option属性又含有list的标签
				if (link.contains("list")) {
					// System.out.println(i+",,,"+link);
					pageList.add("http://www.ygdy8.net/html/gndy/dyzz/" + link);
				}
			}
		} catch (ParserException e) {
			e.printStackTrace();
		}
		return pageList;
	}
	
	/**
	 * 三个方法联系在一起执行，输出抓取的下载地址
	 */
	public static void getAllMovie() {
		// 获取所有的的电影列表页，共有157页
		List pageList = getAllPageList();
		for (Iterator iterator = pageList.iterator(); iterator.hasNext();) {
			String oneMoveListUrl = (String) iterator.next();
			// 获取一个电影列表页里面的25个电影页面的地址
			List moveListFromOnePage = getAllMovieUrlFromOnePageList(oneMoveListUrl);
			try {
				BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream("下载地址.txt")));
				String downloadUrl[] = new String[moveListFromOnePage.size()];
				for (Iterator iterator2 = moveListFromOnePage.iterator(); iterator2.hasNext();) {
					String movieIntroUrl = (String) iterator2.next();
					// 获取一个电影的下载地址（ftp地址）
					downloadUrl = getMovieDownloadUrl(movieIntroUrl);
					bw.write(downloadUrl + "\n");
					bw.flush();
				}

				bw.close();
			} catch (FileNotFoundException e) {
				// TODO Auto-generated catch block
				e.printStackTrace();
			} catch (IOException e) {
				// TODO Auto-generated catch block
				e.printStackTrace();
			}

		}
	}

	public static void main(String[] args) {
		getAllMovie();
	}
}