htmlparser初体验

最新推荐文章于 2022-05-08 22:56:09 发布

dufei07

最新推荐文章于 2022-05-08 22:56:09 发布

阅读量81

点赞数

分类专栏：搜索引擎　文章标签：MyEclipse F# HTML

搜索引擎专栏收录该内容

10 篇文章 0 订阅

订阅专栏

昨天晚上完成了网页的下载，暂时不用和heritrix打交道了，有空我要好好研究下它的代码，现在没那么多时间。

今天对 htmlparser 有了初步了解，并自己写了一个简单的小程序，可以提取出网页中图片的 URL：

package test;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;

import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.TableColumn;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

/**
 * Extracts the URL of one image from an HTML page and writes it to a text file.
 *
 * <p>The page is parsed with the configured {@link Parser}. The first
 * {@code <td class="series_sy_intro_pic">} cell is located, and the image URL is
 * read from the first child of that cell's second child node. The URL is printed
 * to standard output and written to {@code aa.txt} under the configured output
 * path.
 *
 * <p>Typical use: set the output path, set a parser with {@link #setParse(Parser)},
 * then call {@link #extract()}.
 */
public class Extractor {
	/** Input page used by {@link #main(String[])} when no argument is supplied. */
	private static final String DEFAULT_INPUT_PATH = "F:/Workspaces/MyEclipse 7.1/test/src/test/index.html";

	/** Output path prefix used by {@link #main(String[])} when no second argument is supplied. */
	private static final String DEFAULT_OUTPUT_PATH = "F:/Workspaces/MyEclipse 7.1/test/src/test/";

	/** Name appended to the output path to form the result file. */
	private static final String OUTPUT_FILE_NAME = "aa.txt";

	/** Character encoding applied to the parser before parsing. */
	private static final String PAGE_ENCODING = "gb2312";

	/** Prefix the output file name is appended to; expected to end with a path separator. */
	private String outputPath;

	/** Location of the page to parse; stored for callers, not read by {@link #extract()}. */
	private String inputPath;

	/** Parser that {@link #extract()} reads the page from. */
	private Parser parse;

	/** @return the prefix the output file name is appended to */
	public String getOutputPath() {
		return outputPath;
	}

	/** @param outputPath prefix the output file name is appended to (no separator is added) */
	public void setOutputPath(String outputPath) {
		this.outputPath = outputPath;
	}

	/** @return the configured input page location */
	public String getInputPath() {
		return inputPath;
	}

	/** @param inputPath location of the page to parse */
	public void setInputPath(String inputPath) {
		this.inputPath = inputPath;
	}

	/** @return the parser used by {@link #extract()}, or {@code null} if none was set */
	public Parser getParse() {
		return parse;
	}

	/** @param parse parser that {@link #extract()} should read the page from */
	public void setParse(Parser parse) {
		this.parse = parse;
	}

	/**
	 * Runs the extraction once.
	 *
	 * @param args optional overrides: {@code args[0]} is the input page,
	 *             {@code args[1]} is the output path prefix; the built-in
	 *             defaults are used for any argument that is missing
	 */
	public static void main(String[] args) {
		Extractor ex = new Extractor();
		ex.setInputPath(args.length > 0 ? args[0] : DEFAULT_INPUT_PATH);
		ex.setOutputPath(args.length > 1 ? args[1] : DEFAULT_OUTPUT_PATH);
		try {
			// Build the parser from the path just configured so the two cannot drift apart.
			ex.setParse(new Parser(ex.getInputPath()));
			ex.extract();
		} catch (ParserException e) {
			e.printStackTrace();
		}
	}

	/**
	 * Parses the page, prints the image URL and writes it to the output file.
	 *
	 * <p>If the page does not contain the expected cell/image structure, a message
	 * is printed to standard error and no file is written. Parser and I/O failures
	 * are reported via {@code printStackTrace()} and not rethrown.
	 *
	 * @throws IllegalStateException if no parser has been set
	 */
	public void extract() {
		Parser parser = this.getParse();
		if (parser == null) {
			throw new IllegalStateException("No parser configured; call setParse() before extract()");
		}

		NodeFilter picFilter = new AndFilter(new TagNameFilter("td"),
				new HasAttributeFilter("class", "series_sy_intro_pic"));

		try {
			parser.setEncoding(PAGE_ENCODING);
			NodeList picNodes = parser.parse(picFilter);

			String imgURL = findImageUrl(picNodes);
			if (imgURL == null) {
				System.err.println("No image found in a <td class=\"series_sy_intro_pic\"> cell; nothing written.");
				return;
			}
			System.out.println(imgURL);

			// try/finally (not try-with-resources) keeps the file compilable on pre-Java 7
			// toolchains; close() also flushes the buffered content.
			BufferedWriter bw = new BufferedWriter(
					new FileWriter(new File(this.getOutputPath() + OUTPUT_FILE_NAME)));
			try {
				bw.write(imgURL);
			} finally {
				bw.close();
			}

			// TODO: also extract the attribute cells, i.e. <td> elements with
			// class "bor1_c1" and style "padding:5px;", and handle every matched
			// picture cell instead of only the first one.
		} catch (ParserException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	/**
	 * Returns the image URL held by the first matched cell, or {@code null} when the
	 * node structure is not the expected one.
	 *
	 * @param picNodes nodes matched by the picture-cell filter; may be {@code null} or empty
	 * @return the image URL, or {@code null} if it cannot be located
	 */
	private static String findImageUrl(NodeList picNodes) {
		if (picNodes == null || picNodes.size() == 0) {
			return null;
		}
		Object first = picNodes.elementAt(0);
		if (!(first instanceof TableColumn)) {
			return null;
		}
		NodeList cellChildren = ((TableColumn) first).getChildren();
		// The image sits under the cell's child at index 1; presumably index 0 is a
		// whitespace text node in the target page -- confirm against the real markup.
		if (cellChildren == null || cellChildren.size() < 2) {
			return null;
		}
		NodeList wrapperChildren = cellChildren.elementAt(1).getChildren();
		if (wrapperChildren == null || wrapperChildren.size() == 0) {
			return null;
		}
		Object candidate = wrapperChildren.elementAt(0);
		if (!(candidate instanceof ImageTag)) {
			return null;
		}
		return ((ImageTag) candidate).getImageURL();
	}
}

过节，休息一下，明天继续……