昨天晚上完成了网页的下载,暂时不用和 Heritrix 打交道了。有空我要好好研究一下它的代码,只是现在没那么多时间。
今天对 HTMLParser 有了初步了解,并自己写了一个简单的小程序,可以提取出网页中图片的 URL:
package test;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.TableColumn;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
/**
 * Extracts the URL of the first image located inside a
 * {@code <td class="series_sy_intro_pic">} cell of an HTML page and writes it
 * to the file {@code aa.txt} under the configured output path.
 *
 * <p>Typical use: set the output path, supply a {@link Parser} for the page via
 * {@link #setParse(Parser)}, then call {@link #extract()}. The input path is
 * only stored by this class; {@link #extract()} reads from the parser.
 */
public class Extractor {
    /** Name of the file, appended to the output path, that receives the image URL. */
    private static final String OUTPUT_FILE_NAME = "aa.txt";

    private String outputPath;
    private String inputPath;
    private Parser parse;

    /** @return the prefix (directory, including trailing separator) for the output file */
    public String getOutputPath() {
        return outputPath;
    }

    /** @param outputPath prefix that the output file name is appended to verbatim */
    public void setOutputPath(String outputPath) {
        this.outputPath = outputPath;
    }

    /** @return the stored location of the HTML page */
    public String getInputPath() {
        return inputPath;
    }

    /** @param inputPath location of the HTML page */
    public void setInputPath(String inputPath) {
        this.inputPath = inputPath;
    }

    /** @return the parser that {@link #extract()} reads from */
    public Parser getParse() {
        return parse;
    }

    /** @param parse the parser that {@link #extract()} should read from */
    public void setParse(Parser parse) {
        this.parse = parse;
    }

    /**
     * Runs the extractor against a fixed local page.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        Extractor ex = new Extractor();
        ex.setInputPath("F:/Workspaces/MyEclipse 7.1/test/src/test/index.html");
        ex.setOutputPath("F:/Workspaces/MyEclipse 7.1/test/src/test/");
        try {
            // Build the parser from the configured input path so the two cannot drift apart.
            ex.setParse(new Parser(ex.getInputPath()));
            ex.extract();
        } catch (ParserException e) {
            e.printStackTrace();
        }
    }

    /**
     * Parses the page, locates the first image inside a
     * {@code td.series_sy_intro_pic} cell, prints its URL and writes the URL to
     * the output file.
     *
     * <p>If the page does not have the expected structure, a message is printed
     * to {@code System.err} and nothing is written. Parser and I/O failures are
     * reported with a stack trace and are not propagated.
     *
     * @throws IllegalStateException if no parser has been set
     */
    public void extract() {
        Parser parser = this.getParse();
        if (parser == null) {
            throw new IllegalStateException("No Parser configured; call setParse(...) before extract()");
        }

        NodeFilter picFilter = new AndFilter(new TagNameFilter("td"),
                new HasAttributeFilter("class", "series_sy_intro_pic"));
        // TODO: extraction of attribute cells (td with class "bor1_c1" and
        // style "padding:5px;") was planned here but is not implemented yet.

        try {
            parser.setEncoding("gb2312");
            NodeList picNodes = parser.parse(picFilter);
            System.out.println("a");

            ImageTag image = findFirstImage(picNodes);
            if (image == null) {
                System.err.println("No image found in a td.series_sy_intro_pic cell; nothing written");
                return;
            }

            String imgURL = image.getImageURL();
            System.out.println(imgURL);
            writeImageUrl(imgURL);
        } catch (ParserException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Returns the image at the expected position: first matched cell, second
     * child of that cell, first child of that child.
     *
     * @param picNodes nodes matched by the picture-cell filter; may be null or empty
     * @return the image tag, or {@code null} if the structure does not match
     */
    private ImageTag findFirstImage(NodeList picNodes) {
        if (picNodes == null || picNodes.size() == 0) {
            return null;
        }
        if (!(picNodes.elementAt(0) instanceof TableColumn)) {
            return null;
        }
        TableColumn cell = (TableColumn) picNodes.elementAt(0);
        // The image wrapper is the cell's second child (index 1).
        if (cell.getChildCount() < 2) {
            return null;
        }
        NodeList wrapperChildren = cell.childAt(1).getChildren();
        if (wrapperChildren == null || wrapperChildren.size() == 0) {
            return null;
        }
        if (!(wrapperChildren.elementAt(0) instanceof ImageTag)) {
            return null;
        }
        return (ImageTag) wrapperChildren.elementAt(0);
    }

    /**
     * Writes the URL to the output file, always closing the writer.
     *
     * @param imgURL the image URL to store
     * @throws IOException if the file cannot be opened, written or closed
     */
    private void writeImageUrl(String imgURL) throws IOException {
        BufferedWriter bw = new BufferedWriter(
                new FileWriter(new File(this.getOutputPath() + OUTPUT_FILE_NAME)));
        try {
            bw.write(imgURL);
        } finally {
            // close() flushes buffered output and releases the file handle.
            bw.close();
        }
    }
}
过节,休息一下,明天继续……