使用dom4j提取包含混合(Mixed)内容的复杂元素。首先看个例子:

 

 
  
  1. <?xml version="1.0" encoding="utf-8"?> 
  2. <resources> 
  3. <!-- About --> 
  4. <string name="ABOUT_TERMS_OF_SERVICE_LINK"><a href="http://www.webex.com/terms-of-service.html">Terms of Service</a></string> 
  5. <string name="ABOUT_PRIVACY_STATEMENT_LINK"><a href="http://www.cisco.com/web/siteassets/legal/privacy.html">Privacy Statement</a></string> 
  6. <string name="ABOUT_THIRD_PARTY_LINK"><a href="http://www.webex.com/legal/license.html">Third Party Licenses and Notices (including free/open source software)</a></string> 
  7.  
  8. <!-- Term of use --> 
  9. <string name="TERMSOFUSE_LINK">I have reviewed and agree to the <a href="http://m.webex.com">Terms of Service</a></string> 
  10. <string name="TERMSOFUSE_TITLE">Cisco WebEx Meetings</string> 
  11. <string name="TERMSOFUSE_BUTTON_OK">I accept</string> 
  12. <string name="TERMSOFUSE_BUTTON_CANCEL">I do not accept</string> 
  13. </resources>   

现在需要提取<string name="ABOUT_THIRD_PARTY_LINK"><a href="http://www.webex.com/legal/license.html">Third Party Licenses and Notices (including free/open source software)</a></string> 中的内容,即 <a href="http://www.webex.com/legal/license.html">Third Party Licenses and Notices (including free/open source software)</a>。如果仅仅使用dom4j提供的getXX方法,得到的结果多半是让人失望的。为此需要查看Element接口所继承的父接口。在它继承的Branch、Cloneable、Node三个接口中,Branch接口声明了一个content()方法,用来返回一个Node类型的List列表:

public List content()

Returns the content nodes of this branch as a backed List so that the content of this branch may be modified directly using the List interface. The List is backed by the Branch so that changes to the list are reflected in the branch and vice versa.

 

Returns:
the nodes that this branch contains as a List

因此可使用它来提取节点内容。代码如下:

 

 
  
  1. public Map<String, String> getTagOfEnglishStrFromXml(Document doc) {
  2. Map<String,String> map = new HashMap<String,String>();
  3. if(null==doc.getRootElement()||!doc.getRootElement().hasContent()){
  4. return null;
  5. }
  6. Element root = doc.getRootElement();
  7. String rootName = root.getName();
  8. int childNum = root.elements().size();
  9. if(childNum<1){
  10. return null;
  11. }
  12. int elementSequence = 0;
  13. for(int cindex=0;cindex<childNum;cindex++){/* for each string element */
  14. ++elementSequence;
  15. Element stringElem = (Element)root.elements().get(cindex);
  16. String TagName = "";
  17. /* produce the tag name by rules*/
  18. int attrCount = stringElem.attributeCount();
  19. if(attrCount<1){
  20. TagName = rootName+"_"+stringElem.getName()+"_"+String.valueOf(elementSequence);
  21. }else{
  22. for(int i=0;i<attrCount;i++){
  23. TagName+=stringElem.attribute(i).getValue();
  24. }
  25. }
  26. String englishStr = "";
  27. if(stringElem.isTextOnly()){
  28. englishStr = stringElem.getText();
  29. }else{
  30. List list = stringElem.content();
  31. Iterator iterator = list.iterator();
  32. while(iterator.hasNext()){
  33. Node node = (Node)iterator.next();
  34. switch(node.getNodeType()){
  35. case Node.ELEMENT_NODE: englishStr += node.asXML();
  36. break;
  37. case Node.TEXT_NODE: englishStr += node.getText();
  38. break;
  39. }
  40. }
  41. }
  42. map.put(TagName, englishStr);
  43. }
  44. return map;
  45. }

  请特别留意以下代码

 

 
  
  // Excerpt: builds englishStr from stringElem (declared outside this fragment).
  1. String englishStr = "";  
  2.             if(stringElem.isTextOnly()){  // no child elements: take the text as-is
  3.                 englishStr = stringElem.getText();  
  4.             }else{  // mixed content: walk the content nodes one by one
  5.                 List list = stringElem.content();  
  6.                 Iterator iterator = list.iterator();  
  7.                 while(iterator.hasNext()){  
  8.                     Node node = (Node)iterator.next();  
  9.                     switch(node.getNodeType()){  // node types without a case below are skipped
  10.                     case Node.ELEMENT_NODE: englishStr += node.asXML();  // nested element appended as XML markup
  11.                         break;  
  12.                     case Node.TEXT_NODE: englishStr += node.getText();  // text node appended as plain text
  13.                         break;  
  14.                     }  
  15.                 }  
  16.             }