html网页解析代码,htmlparser解析网页内容代码

/*抽取html网页文本,循环到值节点并判断是否有src链接*/

/**
 * Parses the page at {@code url} and hands the resulting top-level node
 * list to {@link #processNodeList(NodeList)}, which prints the leaf nodes.
 *
 * <p>A {@code ParserException} is caught here and its stack trace printed;
 * any other exception (for example a malformed URL) propagates to the caller.
 *
 * @param url address of the page to parse
 * @throws Exception if anything other than a {@code ParserException} fails
 */
public void htmlText(String url) throws Exception
{
    try {
        URL pageUrl = new URL(url);
        boolean overHttps = "https".equalsIgnoreCase(pageUrl.getProtocol());
        if (overHttps) {
            // NOTE(review): presumably relaxes certificate/hostname checks so
            // that HTTPS pages can be fetched - confirm against SslUtils.
            SslUtils.ignoreSsl();
        }

        // The parser is built directly from the page address.
        Parser htmlParser = new Parser(url);
        String detectedEncoding = htmlParser.getEncoding();
        if (detectedEncoding.equals("ISO-8859-1")) {
            // When the parser reports ISO-8859-1, read the page as UTF-8 instead.
            htmlParser.setEncoding("UTF-8");
        }

        // A null filter means "no NodeFilter": every node is returned.
        NodeList topLevelNodes = htmlParser.parse(null);

        // Walk down to the leaf (value) nodes and print them.
        // To dump the top-level nodes instead, iterate topLevelNodes.elements()
        // and print each node's toHtml().
        processNodeList(topLevelNodes);
    } catch (ParserException parseFailure) {
        parseFailure.printStackTrace();
    }
}

/**
 * Recursively walks {@code list}, printing the HTML of every leaf node
 * (a node whose child list is {@code null}). When a leaf's HTML contains
 * {@code src=}, the value that follows it is printed on its own line.
 *
 * @param list nodes to walk; nodes with children are descended into
 */
private void processNodeList(NodeList list) {
    SimpleNodeIterator iterator = list.elements();
    while (iterator.hasMoreNodes()) {
        Node node = iterator.nextNode();
        NodeList childList = node.getChildren();
        if (childList != null) {
            // Not a leaf yet: keep descending.
            processNodeList(childList);
            continue;
        }

        // No children: this is a value (leaf) node.
        String txt = node.toHtml();
        System.out.println(txt);

        String src = extractSrc(txt);
        if (src != null) {
            System.out.println(src);
        }
    }
}

/**
 * Returns the value following the first {@code src=} in {@code html}, or
 * {@code null} when {@code src=} does not occur.
 *
 * <p>Handles double-quoted, single-quoted and unquoted values. A quoted value
 * ends at the matching quote (so it may contain spaces); an unquoted value
 * ends at the first whitespace or {@code >}. If the terminator is missing the
 * rest of the string is returned instead of throwing.
 */
private static String extractSrc(String html) {
    final String marker = "src=";
    int markerAt = html.indexOf(marker);
    if (markerAt < 0) {
        return null;
    }
    int valueStart = markerAt + marker.length();
    if (valueStart >= html.length()) {
        return "";
    }

    char first = html.charAt(valueStart);
    if (first == '"' || first == '\'') {
        int closingQuote = html.indexOf(first, valueStart + 1);
        return closingQuote < 0
                ? html.substring(valueStart + 1)
                : html.substring(valueStart + 1, closingQuote);
    }

    // Unquoted value: runs until whitespace or the end of the tag.
    int valueEnd = valueStart;
    while (valueEnd < html.length()) {
        char c = html.charAt(valueEnd);
        if (c == '>' || Character.isWhitespace(c)) {
            break;
        }
        valueEnd++;
    }
    return html.substring(valueStart, valueEnd);
}

/*

* 在htmlparser中,Node分成三类,都继承AbstractNode:

* RemarkNode:代表Html中的注释

* TagNode:标签节点。

* TextNode:文本节点

*/

/**
 * Parses the page at {@code url} and prints one line per visitor callback:
 * start tags, text nodes, remark (comment) nodes, end tags, and the
 * begin/finish parsing notifications.
 *
 * <p>A {@code ParserException} is caught and its stack trace printed.
 *
 * @param url address of the page to parse
 */
public void tagGet(String url){
    try {
        Parser pageParser = new Parser(url);
        String detectedEncoding = pageParser.getEncoding();
        if (detectedEncoding.equals("ISO-8859-1")) {
            // When the parser reports ISO-8859-1, read the page as UTF-8 instead.
            pageParser.setEncoding("UTF-8");
        }

        NodeVisitor printingVisitor = newPrintingVisitor();
        pageParser.visitAllNodesWith(printingVisitor);

        // Alternatives: a visitor that overrides only visitTag to print
        // tag.getTagName() and tag.getClass(), or parser.parse(null).toHtml()
        // to dump the whole document.
    } catch (ParserException parseFailure) {
        parseFailure.printStackTrace();
    }
}

/**
 * Builds a visitor that reports every callback on standard output.
 * Both constructor flags are {@code true} (recurse into children and into
 * the node itself, per htmlparser's {@code NodeVisitor(boolean, boolean)}).
 */
private NodeVisitor newPrintingVisitor() {
    return new NodeVisitor(true, true) {

        @Override
        public void beginParsing() {
            System.out.println("beginParsing");
        }

        @Override
        public void visitTag(Tag startTag) {
            System.out.println("This is Tag:" + startTag.getText());
        }

        @Override
        public void visitEndTag(Tag endTag) {
            System.out.println("visitEndTag:" + endTag.getText());
        }

        @Override
        public void visitStringNode(Text textNode) {
            // Concatenation uses the node's toString(), as the original did.
            System.out.println("This is Text:" + textNode);
        }

        @Override
        public void visitRemarkNode(Remark remarkNode) {
            System.out.println("This is Remark:" + remarkNode.getText());
        }

        @Override
        public void finishedParsing() {
            System.out.println("finishedParsing");
        }
    };
}

/*

* Extracts only the InputTag, FormTag and FrameTag nodes from the page

*/

/**
 * Parses the page at {@code url}, keeps only input, form and frame tags,
 * and prints a summary line for each. For every form, the inputs returned
 * by {@code getFormInputs()} are printed as well.
 *
 * @param url address of the page to parse
 * @throws ParserException if the page cannot be fetched or parsed
 */
public void formGet(String url) throws ParserException{
    Parser parser = new Parser(url);
    if (parser.getEncoding().equals("ISO-8859-1")) {
        // When the parser reports ISO-8859-1, read the page as UTF-8 instead.
        parser.setEncoding("UTF-8");
    }

    NodeFilter inputFilter = new NodeClassFilter(InputTag.class);
    NodeFilter formFilter = new NodeClassFilter(FormTag.class);
    NodeFilter frameFilter = new NodeClassFilter(FrameTag.class);

    // Accept a node when any one of the three class filters matches.
    OrFilter lastFilter = new OrFilter();
    lastFilter.setPredicates(new NodeFilter[] { formFilter, inputFilter, frameFilter });

    NodeList nodeList = parser.parse(lastFilter);

    // Valid indices are 0 .. size()-1; the former "<=" bound read one slot
    // past the last element.
    for (int i = 0; i < nodeList.size(); i++) {
        Node node = nodeList.elementAt(i);

        if (node instanceof InputTag) {
            InputTag tag = (InputTag) node;
            System.out.println("Input Info:" + tag.getTagName() + ":"
                    + tag.getText());
        } else if (node instanceof FormTag) {
            FormTag tag = (FormTag) node;
            System.out.println("Form Info: " + tag.getFormName() + ":"
                    + tag.getText());

            NodeList inputnodes = tag.getFormInputs();
            // Same bound fix: "<=" dereferenced a non-existent element and
            // failed on every form.
            for (int j = 0; j < inputnodes.size(); j++) {
                InputTag tag1 = (InputTag) inputnodes.elementAt(j);
                System.out.println("Input Info:" + tag1.getTagName() + ":"
                        + tag1.getText());
            }
        } else if (node instanceof FrameTag) {
            FrameTag tag = (FrameTag) node;
            System.out.println("Frame Info: " + tag.getFrameName() + ":"
                    + tag.getText());
        }
    }
}

/*

* org.htmlparser Class PrototypicalNodeFactory

* A node factory based on the prototype pattern.

* This factory uses the prototype pattern to generate new nodes.

* These are cloned as needed to form new Text, Remark and Tag nodes.

*/

/**
 * Parses the page at {@code url} with a {@code PrototypicalNodeFactory}
 * that has a {@code FormTag} prototype registered, then prints the name
 * and text of every form found.
 *
 * @param url address of the page to parse
 * @throws ParserException if the page cannot be fetched or parsed
 */
public void hyperGet(String url) throws ParserException{
    // Register the FormTag prototype with a fresh node factory.
    PrototypicalNodeFactory nodeFactory = new PrototypicalNodeFactory();
    nodeFactory.registerTag(new FormTag());

    Parser pageParser = new Parser(url);
    pageParser.setNodeFactory(nodeFactory);

    // Keep only nodes whose class is FormTag.
    NodeFilter formOnly = new NodeClassFilter(FormTag.class);
    Node[] matches = pageParser.parse(formOnly).toNodeArray();

    for (int k = 0; k < matches.length; k++) {
        if (!(matches[k] instanceof FormTag)) {
            continue;
        }
        FormTag form = (FormTag) matches[k];
        String summary = "Form Info: " + form.getFormName() + ":" + form.getText();
        System.out.println(summary);
    }
}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值