批量提取网页数据

 

要提取网页中的数据,在网上搜到的最常用的方法就是利用正则表达式来提取内容,正则表达式我没怎么研究,所以具体怎么实现,我不知道。但是我想到了另外一种解决方案,很傻很天真,但真的很有效,完全达到了我想要的效果,具体方法下面慢慢道来。

 

我们知道,早期的HTML语法被定义成较松散的规则,以有助于不熟悉网络出版的人采用。网页浏览器接受了这个现实,并且可以显示语法不严格的网页。随着时间的流逝,官方标准渐渐趋于严格的语法,但是浏览器继续显示一些远称不上合乎标准的HTML。使用XML的严格规则的XHTML(可扩展超文本置标语言)是W3C计划中的HTML的接替者。虽然很多人认为它已经成为当前的HTML标准,但是它实际上是一个独立的、和HTML平行发展的标准。W3C目前的建议是使用XHTML 1.1、XHTML 1.0或者HTML 4.01进行网络出版。

 

由上面的介绍可以看出,最难提取的是早期的HTML语法规则的网页内容,最容易提取的是使用XML的严格规则的XHTML语法实现的内容。既然如此,我也可以把较松散的规则的HTML网页变成一个符合XHTML语法规则的网页。这样一来,我们就有了一个标准的XML文件,这时要采集其中的数据就非常容易了。我们可以根据XML的节点来提取内容,也可以一次性提取所有的XML中的文本再来解析字符串,我用的就是第二种方法。

 

这种方法实现起来比较麻烦,所以比较适合大量的网页数据的提取,我这次提取的网页数据共有1200多个文件,结构基本上差不多,所以值得一试。

 

我所用的开发语言是JAVA,所以用JAVA来实现是顺理成章的事。

 

一、首先把所有的网页文件放入一个文件夹中,写一个方法,把所有的HTML文件收集起来。

 

代码如下:

 

package com.hailite.ecs.hazard.importing;

 

import java.io.File;

import java.util.ArrayList;

import java.util.List;

 

/**
 * Collects the HTML files that the import pipeline works on.
 *
 * <p>Only the root directory itself and its immediate sub-directories are
 * inspected; deeper levels are not visited.
 *
 * @author 严军
 * @date 2009-3-18 21:54:00
 */
public class FilesSearcher {

    /** Root directory scanned by {@link #searchFiles()}. */
    static final String fileDir = "E://危险化学品"; //$NON-NLS-1$

    /** Only files whose name ends with this suffix are collected. */
    private static final String HTML_SUFFIX = ".htm"; //$NON-NLS-1$

    /**
     * Searches the default root directory {@link #fileDir}.
     *
     * @return the matching files; empty (never {@code null}) when the root
     *         does not exist or is not a directory
     */
    public List<File> searchFiles() {
        return searchFiles(new File(fileDir));
    }

    /**
     * Searches {@code root} and its direct sub-directories for {@code .htm} files.
     *
     * <p>The suffix filter is applied to files directly under {@code root} as
     * well. The original code added every top-level file unfiltered, although
     * the converter that consumes this list overwrites each returned file.
     *
     * @param root directory to search; {@code null} or a non-directory yields
     *             an empty list
     * @return the matching files, never {@code null}
     */
    public List<File> searchFiles(File root) {
        List<File> files = new ArrayList<File>();
        if (root == null || !root.isDirectory()) {
            return files;
        }
        // listFiles() returns null on an I/O error or missing permission.
        File[] entries = root.listFiles();
        if (entries == null) {
            return files;
        }
        for (File entry : entries) {
            if (entry.isDirectory()) {
                File[] children = entry.listFiles();
                if (children == null) {
                    continue;
                }
                for (File child : children) {
                    if (isHtmlFile(child)) {
                        files.add(child);
                    }
                }
            } else if (isHtmlFile(entry)) {
                files.add(entry);
            }
        }
        return files;
    }

    /** Returns whether {@code f} is a regular file with the HTML suffix. */
    private static boolean isHtmlFile(File f) {
        return f.isFile() && f.getName().endsWith(HTML_SUFFIX);
    }

    public static void main(String[] args) {
        FilesSearcher searcher = new FilesSearcher();
        searcher.searchFiles();
    }

}

 

二、定义一个名为ICuter的接口,从名字上可以知道,是用来剪切文本内容的,至于要剪切什么内容,后面的代码中有实现。接口只有一个方法,相当简单:

 

package com.hailite.ecs.hazard.importing;

 

/**
 * One text-cleaning step applied to HTML text.
 *
 * <p>The interface extends {@link IHTML}, so implementations can refer to the
 * constants declared there by their simple names.
 *
 * @author 严军
 * @date 2009-4-14 12:42:43
 */

public interface ICuter extends IHTML {

   

    /**
     * Applies this step to {@code text} and returns the result.
     * HtmlContentCuterFactory calls it with one line of a file at a time.
     *
     * @param text the text to clean
     * @return the cleaned text
     */
    String cuting(String text);

}

 

三、定义一个工厂类,用来剪切HTML文本中无用的内容,也就是把不符合XML语法的内容去掉,让HTML变成一个合法的XML文档。工厂类中有多个内部类实现了ICuter接口,分别剪切了不同的内容,各个实现都有自己的职责,如MetaCuter去掉HTML中的meta标签,LinkCuter去掉link标签,WordCuter去掉一些特定的字符等等,具体实现如下(注释很少,看起来可能比较费力,呵呵):

 

package com.hailite.ecs.hazard.importing;

 

import java.io.BufferedReader;

import java.io.File;

import java.io.FileInputStream;

import java.io.FileOutputStream;

import java.io.InputStreamReader;

import java.util.List;

 

import javax.xml.parsers.DocumentBuilderFactory;

 

import org.w3c.dom.Document;

 

/**

 * @author 严军

 * @date 2009-4-14 上午11:17:54

 */

public class HtmlContentCuterFactory {

    private static final ICuter[] cuters = new ICuter[]{

       new MetaCuter(), new LinkCuter(), new ImgCuter(),

       new WordCuter()

    };

    public static void main(String[] args) {

 

       List<File> files = new FilesSearcher().searchFiles();

       for (File f : files) {

           try {

              FileInputStream in = new FileInputStream(f);

              String temp = null;

              StringBuffer sb = new StringBuffer();

              //sb.append("<?xml version=/"1.0/" encoding=/"UTF-8/"?>"); //$NON-NLS-1$

              sb.append("<?xml version=/"1.0/" encoding=/"GB2312/"?>/n");  //$NON-NLS-1$

              BufferedReader inr = new BufferedReader(new InputStreamReader(in, IHTML.GB2312));// 读取网页全部内容

              while ((temp = inr.readLine()) != null) {

                  temp = converterText(temp);

                  sb.append(temp + "/n"); //$NON-NLS-1$

                  //System.out.println(temp);

              }

              inr.close();

              in.close();

              FileOutputStream out = new FileOutputStream(f);

              out.write(sb.toString().getBytes());

              out.close();

              out.flush();

 

              Document doc = DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(f);

              doc.getElementsByTagName(IHTML.body);

           } catch (Exception e) {

              e.printStackTrace();

              System.out.println(f.getName() + ":" + e.getMessage()); //$NON-NLS-1$

           }

       }

    }

   

    static String converterText(String text) {

       for (ICuter cuter : cuters) {

           text = cuter.cuting(text);

       }

       return text;

    }

 

    static class MetaCuter implements ICuter {

       @Override

       public String cuting(String text) {

           if (text.startsWith(l + meta)) {

              text = ""; //$NON-NLS-1$

           }

           return text;

       }

    }

 

    static class LinkCuter implements ICuter {

       @Override

       public String cuting(String text) {

           if (text.startsWith(l + link)) {

              text = ""; //$NON-NLS-1$

           }

           return text;

       }

    }

 

    static class ImgCuter implements ICuter {

       @Override

       public String cuting(String text) {

           if (text.indexOf(l + img) != -1) {

              String[] ss = text.split(r);

              StringBuffer sb = new StringBuffer();

              for (int i = 0; i < ss.length; i ++) {

                  String s = ss[i];

                  if (!s.startsWith(l + img)) {

                     sb.append(s + r);

                  }

              }

              text = sb.toString();

           }

           return text;

       }

    }

 

    static class WordCuter implements ICuter {

       //换掉数组0中的字符为1

       final String[][] sss = new String[][]{

              {"class=youxianljie", ""},  //$NON-NLS-1$//$NON-NLS-2$

              {"class=zhengwenkuan", ""}, //$NON-NLS-1$ //$NON-NLS-2$

              {"color=#0033ff", ""},  //$NON-NLS-1$//$NON-NLS-2$

              {"color=#0000ff", ""},  //$NON-NLS-1$//$NON-NLS-2$$

              {"width=760", ""},  //$NON-NLS-1$//$NON-NLS-2$

              {"width=600", ""},  //$NON-NLS-1$//$NON-NLS-2$

              {"width=160", ""},  //$NON-NLS-1$//$NON-NLS-2$

              {"border=0", ""},  //$NON-NLS-1$//$NON-NLS-2$

              {"<br>", ""},  //$NON-NLS-1$//$NON-NLS-2$

              {"bgcolor=#6495ed", ""},  //$NON-NLS-1$//$NON-NLS-2$

              {"height=30", ""},  //$NON-NLS-1$//$NON-NLS-2$

              {"align=left", ""},  //$NON-NLS-1$//$NON-NLS-2$

              {"type=text/javascript", ""},  //$NON-NLS-1$//$NON-NLS-2$

              {"&nbsp; ", ""},  //$NON-NLS-1$//$NON-NLS-2$

              {"&nbsp;", ""},  //$NON-NLS-1$//$NON-NLS-2$

              {"<td width=/"13%/" height=/"24/">", ""},  //$NON-NLS-1$//$NON-NLS-2$

              {"='", "=/""},  //$NON-NLS-1$//$NON-NLS-2$

              {"' ", "/" "},  //$NON-NLS-1$//$NON-NLS-2$

              {"'>", "/">"},  //$NON-NLS-1$//$NON-NLS-2$

              {"=/"url", "='url"},  //$NON-NLS-1$//$NON-NLS-2$

              {"&web_id", ""},  //$NON-NLS-1$//$NON-NLS-2$

              {"frame width=0 scrolling=no height=0", ""},  //$NON-NLS-1$//$NON-NLS-2$

              {"cellspacing=0", ""},  //$NON-NLS-1$//$NON-NLS-2$

              {"cellpadding=2", ""},  //$NON-NLS-1$//$NON-NLS-2$

              {"class=tdbg_leftall", ""},  //$NON-NLS-1$//$NON-NLS-2$

              {"style=cursor:hand", "style=/"cursor:hand/""},  //$NON-NLS-1$//$NON-NLS-2$

              {"<p align=/"left/">", ""},  //$NON-NLS-1$//$NON-NLS-2$

             

              {"&middot;", "¡¤"},  //$NON-NLS-1$//$NON-NLS-2$

              {"&sup3;", "-sup3;"},  //$NON-NLS-1$//$NON-NLS-2$

              {"&szlig;", "-szlig;"},  //$NON-NLS-1$//$NON-NLS-2$

              {"&micro;", "-micro;"},  //$NON-NLS-1$//$NON-NLS-2$

             

              {"<div align=/"center/">国标编号</div>", "<td>国标编号</td>"},  //$NON-NLS-1$//$NON-NLS-2$

              {"<div align=/"center/">CAS</div>", "<td>CAS</td>"},  //$NON-NLS-1$//$NON-NLS-2$

              {"<div align=/"center/">中文名称</div>", "<td>中文名称</td>"},  //$NON-NLS-1$//$NON-NLS-2$

              {"<div align=/"center/">英文名称</div>", "<td>英文名称</td>"},  //$NON-NLS-1$//$NON-NLS-2$

              {"<div align=/"center/">别名</div>", "<td>别名</td>"},  //$NON-NLS-1$//$NON-NLS-2$

              {"<div align=/"center/">分子式</div>", "<td>分子式</td>"},  //$NON-NLS-1$//$NON-NLS-2$

              {"<div align=/"center/">分子量</div>", "<td>分子量</td>"},  //$NON-NLS-1$//$NON-NLS-2$

              {"<div align=/"center/">熔点</div>", "<td>熔点</td>"},  //$NON-NLS-1$//$NON-NLS-2$

              {"<div align=/"center/">密度</div>", "<td>密度</td>"},  //$NON-NLS-1$//$NON-NLS-2$

              {"<div align=/"center/">危险标记</div>", "<td>危险标记</td>"},  //$NON-NLS-1$//$NON-NLS-2$

             

             

       };

       @Override

       public String cuting(String text) {

           for (String[] ss : sss) {

              text = text.replaceAll(ss[0], ss[1]);

           }

           return text;

       }

    }

}

 

下面是两个常量类IHTML和IHazardProperties:

package com.hailite.ecs.hazard.importing;

 

/**
 * String and character constants for the HTML tag names and punctuation used
 * by the import classes. Classes pick the constants up by implementing this
 * interface (or {@link ICuter}, which extends it).
 *
 * @author 严军
 * @date 2009-4-15 14:32:26
 */
public interface IHTML {
    // Tag names.
    public static final String br = "br"; //$NON-NLS-1$
    public static final String meta = "meta"; //$NON-NLS-1$
    public static final String link = "link"; //$NON-NLS-1$
    public static final String img = "img"; //$NON-NLS-1$
    public static final String classs = "class"; //$NON-NLS-1$
    public static final String script = "script"; //$NON-NLS-1$
    public static final String body = "body"; //$NON-NLS-1$
    public static final String html = "html"; //$NON-NLS-1$

    public static final String table = "table"; //$NON-NLS-1$
    public static final String tr = "tr"; //$NON-NLS-1$
    public static final String td = "td"; //$NON-NLS-1$
    public static final String div = "div"; //$NON-NLS-1$
    public static final String p = "p"; //$NON-NLS-1$
    public static final String hr = "hr"; //$NON-NLS-1$

    // Punctuation as strings.
    public static final String l = "<"; //$NON-NLS-1$
    public static final String r = ">"; //$NON-NLS-1$
    public static final String s = "/"; //$NON-NLS-1$
    public static final String d = "="; //$NON-NLS-1$
    // NOTE(review): the values of mc, dh, mcc and dhc were lost in the pasted
    // source (empty string / empty char literal, which does not compile).
    // Best guess: mc is the full-width colon U+FF1A that ends a field label,
    // dh the full-width comma U+FF0C. Confirm against the original file.
    public static final String mc = "\uFF1A"; //$NON-NLS-1$
    public static final String dh = "\uFF0C"; //$NON-NLS-1$

    // Charset names.
    public static final String GB2312 = "GB2312"; //$NON-NLS-1$
    public static final String UTF8 = "UTF-8"; //$NON-NLS-1$

    // Punctuation as chars; mcc and dhc mirror mc and dh above.
    public static final char lc = '<';
    public static final char rc = '>';
    public static final char sc = '/';
    public static final char dc = '=';
    public static final char yc = '"';
    public static final char spc = ' ';
    public static final char mcc = '\uFF1A';
    public static final char dhc = '\uFF0C';
}

 

下面的IHazardProperties类提取一部分代码:

package com.hailite.ecs.hazard.importing;

 

/**
 * Label and heading texts searched for in the flattened page text.
 * This is an excerpt of the full constant list. Fields of an interface are
 * implicitly {@code public static final}, so the modifiers are left out.
 *
 * @author 严军
 * @date 2009-4-15 17:04:54
 */
public interface IHazardProperties {

    // Section 1 heading, in the two spellings seen in the pages.
    String s_1 = "1.物质的理化常数"; //$NON-NLS-1$
    String s_11 = "1、物质的理化常数"; //$NON-NLS-1$

    // Labels of the basic identification fields.
    String gbbh = "国标编号"; //$NON-NLS-1$
    String cash = "CAS"; //$NON-NLS-1$
    String cname = "中文名称"; //$NON-NLS-1$
    String ename = "英文名称"; //$NON-NLS-1$
    String aname = "别名"; //$NON-NLS-1$

    // Headings of the handling sections, including their variant spellings.
    String xlcz = "一、泄漏处置"; //$NON-NLS-1$
    String xlcz1 = "一、泄漏应急处理"; //$NON-NLS-1$
    String fhcs = "二、防护措施"; //$NON-NLS-1$
    String fhcs1 = "二、防护措"; //$NON-NLS-1$
    String fhcs2 = "一、防护措施"; //$NON-NLS-1$
    String jjcs = "三、急救措施"; //$NON-NLS-1$

}

 

四、运行main方法,不断地进行调试,直到能顺利执行整个方法,所有的网页就变成了合法的XML文件了,要注意的是,转成XML文件时,要加入encoding的定义,如<?xml version="1.0" encoding="GB2312"?>。这句Document doc = DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(f);就是用来检测XML文件的合法性的。

 

五、提取转换成XML文件后的HTML数据内容,并把提取到的数据赋值给JAVA Bean,然后再保存到数据库中。代码如下:

 

 

 

package com.hailite.ecs.hazard.importing;

 

import java.io.File;

import java.util.ArrayList;

import java.util.List;

 

import javax.xml.parsers.DocumentBuilderFactory;

 

import org.w3c.dom.Document;

import org.w3c.dom.Node;

import org.w3c.dom.NodeList;

 

import com.hailite.ecs.entity.hazard.ContingencyMethod;

import com.hailite.ecs.entity.hazard.EnvironmentHazard;

import com.hailite.ecs.entity.hazard.HazardousGoods;

import com.hailite.ecs.entity.hazard.MonitorMethod;

import com.hailite.ecs.entity.hazard.ProtectionMethod;

 

/**

 * @author 严军

 * @date 2009-4-15 下午02:37:26

 */

public class HtmlContentScannerFactory implements IHTML, IHazardProperties {

 

    public static void main(String[] args) {

       getHazardousGoodses();

    }

   

    public static List<HazardousGoods> getHazardousGoodses() {

       //List<String> goodsStrings = new ArrayList<String>();

       List<HazardousGoods> goodses = new ArrayList<HazardousGoods>();

       List<File> files = new FilesSearcher().searchFiles();

 

       for (File f : files) {

           try {

              Document doc = DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(f);

              NodeList nl = doc.getElementsByTagName(IHTML.body);

              StringBuffer sb = new StringBuffer();

              iteratorNodeList(nl, sb);

             

             

              if (sb.toString().length() > 0) {

                  String str = sb.toString();

                  int i1 = str.lastIndexOf(gbbh);

                  int i2 = str.indexOf("天地大方"); //$NON-NLS-1$

                  str = sb.substring(i1, i2);

                  //goodsStrings.add(str);

 

                  HazardousGoods goods = new HazardousGoods();

                  goods.setEnvironmentHazard(new EnvironmentHazard());

                  goods.setLocaleMonitorMethod(new MonitorMethod());

                  goods.setLaboratoryMonitorMethod(new MonitorMethod());

                  goods.setContingencyMethod(new ContingencyMethod());

                  goods.setProtectionMethod(new ProtectionMethod());

                  HazardousGoodsPropertiesSetter.setBaseProperty(str, goods);

                  goodses.add(goods);

              }

              System.out.println("Scannered-->:" + f.getPath()); //$NON-NLS-1$

           } catch (Exception e) {

              System.err.println(f.getPath());

              e.printStackTrace();

           }

       }

       System.out.println("共有" + goodses.size() + "......");  //$NON-NLS-1$//$NON-NLS-2$

       return goodses;

    }

 

    static void iteratorNodeList(NodeList nl, StringBuffer sb) {

       if (nl != null) {

           for (int i = 0; i < nl.getLength(); i++) {

              Node nd = nl.item(i);

              if (nd.getChildNodes().getLength() > 0) {

                  iteratorNodeList(nd.getChildNodes(), sb);

              }

              if (table.equals(nd.getNodeName())) {

                  iteratorTable(nd, sb);

              }

           }

       }

    }

   

    static void iteratorTable(Node table, StringBuffer sb) {

       NodeList trs = table.getChildNodes();

       for (int i = 0; i < trs.getLength(); i ++) {

           Node tr = trs.item(i);

           NodeList tds = tr.getChildNodes();

           for (int j = 0; j < tds.getLength(); j ++) {

              Node td = tds.item(j);

              if (IHTML.td.equals(td.getNodeName())) {

                  String va = td.getTextContent().trim();

                  if (va.length() > 0) {

                     va = va.replaceAll(" ", "");  //$NON-NLS-1$//$NON-NLS-2$

                     //System.out.println(va);

                     sb.append(va + "/n"); //$NON-NLS-1$

                  }

              }

           }

       }

    }

 

   

    /**

     * 取真实的tr

     * @param table

     * @return

     */

    static int getTrSize(Node table) {

       int size = 0;

       NodeList trs = table.getChildNodes();

       for (int i = 0; i < trs.getLength(); i ++) {

           Node tr = trs.item(i);

           if (IHTML.tr.equals(tr.getNodeName())) {

              size ++;

           }

       }

       return size;

    }

 

    /**

     * 取真实的td

     * @param tr

     * @return

     */

    static int getTdSize(Node tr) {

       int size = 0;

       NodeList tds = tr.getChildNodes();

       for (int j = 0; j < tds.getLength(); j ++) {

           Node td = tds.item(j);

           if (IHTML.td.equals(td.getNodeName()) || IHTML.div.equals(td.getNodeName())) {

              size ++;

           }

       }

       return size;

    }

}

 

其中有一段比较重要:

                  HazardousGoods goods = new HazardousGoods();

                  goods.setEnvironmentHazard(new EnvironmentHazard());

                  goods.setLocaleMonitorMethod(new MonitorMethod());

                  goods.setLaboratoryMonitorMethod(new MonitorMethod());

                  goods.setContingencyMethod(new ContingencyMethod());

                  goods.setProtectionMethod(new ProtectionMethod());

                  HazardousGoodsPropertiesSetter.setBaseProperty(str, goods);

 

HazardousGoodsPropertiesSetter类是根据JAVA Bean属性来赋值的。

HazardousGoodsPropertiesSetter类相当长,而且写得基本上是重复代码,只从里面提取几段有代表性的代码,代码如下:

 

package com.hailite.ecs.hazard.importing;

 

import com.hailite.ecs.entity.hazard.HazardousGoods;

 

/**

 * @author 严军

 * @date 2009-4-16 上午11:11:32

 */

public class HazardousGoodsPropertiesSetter implements IHazardProperties, IHTML {

    public static void setBaseProperty(String value, HazardousGoods goods) {

       String[] ss = value.split("/n"); //$NON-NLS-1$

       for (int i = 0; i < ss.length; i ++) {

           String s = ss[i];

           if (s.length() == 0) {

              continue;

           }

           if (gbbh.equals(s.trim())) {

              String v = ss[++i];

              goods.setGbbm(v);

              continue;

           }

           if (s.indexOf(xr) != -1) {

              String v = null;

              String[] vs = s.split(xr);

              if (vs.length > 1) {

                  v = vs[1];

                  StringBuffer sb = new StringBuffer(v);

                  int j = i + 1;

                  while (true) {

                     if (j < ss.length) {

                         String js = ss[j];

                         if (js.length() > 0) {

                            if (js.indexOf(mc) == -1 && js.indexOf(jjcs) == -1 && js.indexOf(fhcs) == -1 && js.indexOf(fhcs1) == -1 && js.indexOf(fhcs2) == -1) {

                                sb.append(js + "/n"); //$NON-NLS-1$

                            } else {

                                v = sb.toString();

                                break;

                            }

                         }

                     }

                     j ++;

                  }

              } else {

                  StringBuffer sb = new StringBuffer();

                  int j = i + 1;

                  while (true) {

                     if (j < ss.length) {

                         String js = ss[j];

                         if (js.length() > 0) {

                            if (js.indexOf(mc) == -1 && js.indexOf(jjcs) == -1 && js.indexOf(fhcs) == -1 && js.indexOf(fhcs1) == -1 && js.indexOf(fhcs2) == -1) {

                                sb.append(js + "/n"); //$NON-NLS-1$

                            } else {

                                v = sb.toString();

                                break;

                            }

                         }

                     }

                     j ++;

                  }

              }

              goods.getProtectionMethod().setXr(v);

              continue;

           }

       }

      

    }

}

 

五、保存方法,代码如下:

package com.hailite.ecs.hazard.importing;

 

import java.util.List;

 

import com.hailite.ecs.data.importing.EntitysCache;

import com.hailite.ecs.entity.hazard.HazardousGoods;

import com.hailite.ecs.service.ICommonsService;

 

/**

 * @author 严军

 * @date 2009-4-17 上午11:04:46

 */

public class HazardousGoodsesSaver {

    static final String[] types = "胺类,烃类,卤代烃类,芳烃类,酯类,醛和酮类,醇和醚类,酚及杂环类,硅烷、酰氯及肼类,腈及氰化物类,酸及酸酐类,氧化物及硫化物类,卤化物类,盐类,有机金属类,无机金属及非金属类,农药类,其它".split(",");  //$NON-NLS-1$//$NON-NLS-2$

   

    public static void main(String[] args) {

       List<HazardousGoods> goodses = HtmlContentScannerFactory.getHazardousGoodses();

       ICommonsService service = EntitysCache.getInstance().getService();

       /*for (String s : types) {

           ChemistryType type = new ChemistryType();

           type.setFlmc(s);

           service.save(type);

           System.out.println("Saved-->:" + type.getFlmc());

       }*/

       for (HazardousGoods goods : goodses) {

           try {

              service.save(goods);

           } catch (Exception e) {

              e.printStackTrace();

           }

           System.out.println("Saved-->:" + goods.getMc());

       }

    }

 

六、把数据从数据库中导出为一个XML数据文件,这样以便今后重复导入。导出代码(提取一部份代码段):

 

package com.hailite.ecs.hazard.exporting;

 

import java.io.FileNotFoundException;

import java.io.FileOutputStream;

import java.io.IOException;

import java.util.List;

 

import com.hailite.ecs.data.importing.EntitysCache;

import com.hailite.ecs.entity.hazard.ContingencyMethod;

import com.hailite.ecs.entity.hazard.EnvironmentHazard;

import com.hailite.ecs.entity.hazard.HazardousGoods;

import com.hailite.ecs.entity.hazard.MonitorMethod;

import com.hailite.ecs.entity.hazard.ProtectionMethod;

import com.hailite.ecs.entity.hazard.StoreMethod;

import com.hailite.ecs.hazard.importing.IHTML;

 

/**

 * 导出危化品到XML

 * @author 严军

 * @date 2009-4-20 上午10:32:26

 */

public class HazardGoodsExport {

 

    @SuppressWarnings("unchecked")

    public static void main(String[] args) {

       List<HazardousGoods> goodses = EntitysCache.getInstance().getService().loadAll(HazardousGoods.class);

       exportingToXML(goodses);

    }

   

    public static void exportingToXML(List<HazardousGoods> goodses) {

       StringBuffer sb = new StringBuffer();

       sb.append("<?xml version=/"1.0/" encoding=/"" + IHTML.GB2312 + "/"?>/n"); //$NON-NLS-1$ //$NON-NLS-2$

       sb.append("<root>/n"); //$NON-NLS-1$

       for (HazardousGoods goods : goodses) {

           sb.append("/t<HazardousGoods>/n"); //$NON-NLS-1$

          

          

           sb.append("/t<Bbsd>/n"); //$NON-NLS-1$

           sb.append("/t/t" + goods.getBbsd() + "/n"); //$NON-NLS-1$ //$NON-NLS-2$

           sb.append("/t</Bbsd>/n"); //$NON-NLS-1$

          

           sb.append("/t<Bhzqy>/n"); //$NON-NLS-1$

           sb.append("/t/t" + checkString(goods.getBhzqy()) + "/n"); //$NON-NLS-1$ //$NON-NLS-2$

           sb.append("/t</Bhzqy>/n"); //$NON-NLS-1$

          

           sb.append("/t<Bm>/n"); //$NON-NLS-1$

           sb.append("/t/t" + checkString(goods.getBm()) + "/n"); //$NON-NLS-1$ //$NON-NLS-2$

           sb.append("/t</Bm>/n"); //$NON-NLS-1$

          

           sb.append("/t<Bzsx>/n"); //$NON-NLS-1$

           sb.append("/t/t" + checkString(goods.getBzsx()) + "/n"); //$NON-NLS-1$ //$NON-NLS-2$

           sb.append("/t</Bzsx>/n"); //$NON-NLS-1$

       sb.append("</root>/n"); //$NON-NLS-1$

       //System.out.println(sb.toString());

       saveToXML(sb.toString());

    }

   

    private static String checkString(String value) {

       if (null == value || value.length() == 0 || "????".equals(value) || "----".equals(value)) //$NON-NLS-1$ //$NON-NLS-2$

           return ""; //$NON-NLS-1$

       String[][] fixes = new String[][]{

              {"<", "小于"},  //$NON-NLS-1$//$NON-NLS-2$

              //{"????", ""},  //$NON-NLS-1$//$NON-NLS-2$

              //{"----", ""},  //$NON-NLS-1$//$NON-NLS-2$

       };

       for (String fixs[] : fixes) {

           value = value.replaceAll(fixs[0], fixs[1]);

       }

       return value;

    }

   

    private static void saveToXML(String xml) {

       try {

           FileOutputStream out = new FileOutputStream("hazard.xml"); //$NON-NLS-1$

           out.write(xml.getBytes());

           out.flush();

           out.close();

       } catch (FileNotFoundException e) {

           e.printStackTrace();

       } catch (IOException e) {

           e.printStackTrace();

       }

    }

}

 

七、再把XML数据文件导入到数据库,代码如下(提取部分代码段):

package com.hailite.ecs.hazard.importing;

 

import java.io.File;

import java.io.IOException;

 

import javax.xml.parsers.DocumentBuilderFactory;

import javax.xml.parsers.ParserConfigurationException;

 

import org.w3c.dom.Document;

import org.w3c.dom.Node;

import org.w3c.dom.NodeList;

import org.xml.sax.SAXException;

 

import com.hailite.ecs.data.importing.EntitysCache;

import com.hailite.ecs.entity.hazard.ContingencyMethod;

import com.hailite.ecs.entity.hazard.EnvironmentHazard;

import com.hailite.ecs.entity.hazard.HazardousGoods;

import com.hailite.ecs.entity.hazard.MonitorMethod;

import com.hailite.ecs.entity.hazard.ProtectionMethod;

import com.hailite.ecs.service.ICommonsService;

import com.hailiteframework.utils.value.FieldValueSetter;

import com.hailiteframework.utils.value.ValueSetter;

 

public class XMLHazardGoodsImport {

    private static final String fileName = "hazard.xml"; //$NON-NLS-1$

    private static ICommonsService service = EntitysCache.getInstance().getService();

 

    public static void main(String[] args) {

       service.testService();

       try {

           Document doc = DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(new File(fileName));

          

           NodeList nl = doc.getElementsByTagName("HazardousGoods"); //$NON-NLS-1$

           iterateNodeList(nl);

       } catch (SAXException e) {

           e.printStackTrace();

       } catch (IOException e) {

           e.printStackTrace();

       } catch (ParserConfigurationException e) {

           e.printStackTrace();

       }

    }

 

    private static void iterateNodeList(NodeList nl) {

        //List<HazardousGoods> goodses = new ArrayList<HazardousGoods>();

       for (int i = 0; i < nl.getLength(); i ++) {

           HazardousGoods hg = new HazardousGoods();

           Node nd = nl.item(i);

           if ("HazardousGoods".equals(nd.getNodeName())) { //$NON-NLS-1$

              NodeList gnl = nd.getChildNodes();

              for (int j = 0; j < gnl.getLength(); j ++) {

                  Node gnd = gnl.item(j);

                  if ("LocaleMonitorMethod".equals(gnd.getNodeName())) { //$NON-NLS-1$

                     if (hg.getLocaleMonitorMethod() == null) {

                         hg.setLocaleMonitorMethod(new MonitorMethod());

                     }

                     NodeList nnl = gnd.getChildNodes();

                     for (int k = 0; k < nnl.getLength(); k ++) {

                         Node n = nnl.item(k);

                         if ("#text".equals(n.getNodeName())) { //$NON-NLS-1$

                            continue;

                         }

                         ValueSetter vs = new FieldValueSetter(n.getNodeName());

                         vs.setValue(hg.getLocaleMonitorMethod(), n.getTextContent().trim());

                     }

                     continue;

                  }

                  if ("LaboratoryMonitorMethod".equals(gnd.getNodeName())) { //$NON-NLS-1$

                     if (hg.getLaboratoryMonitorMethod() == null) {

                         hg.setLaboratoryMonitorMethod(new MonitorMethod());

                     }

                     NodeList nnl = gnd.getChildNodes();

                     for (int k = 0; k < nnl.getLength(); k ++) {

                         Node n = nnl.item(k);

                         if ("#text".equals(n.getNodeName())) { //$NON-NLS-1$

                            continue;

                         }

                         ValueSetter vs = new FieldValueSetter(n.getNodeName());

                         vs.setValue(hg.getLaboratoryMonitorMethod(), n.getTextContent().trim());

                     }

                     continue;

                  }

                 

                  if ("#text".equals(gnd.getNodeName())) { //$NON-NLS-1$

                     continue;

                  }

                  ValueSetter vs = new FieldValueSetter(gnd.getNodeName());

                  vs.setValue(hg, gnd.getTextContent().trim());

                 

              }

           }

           //goodses.add(hg);

           System.out.println("Saved:" + hg.getMc() + ",ID:" + service.save(hg));

       }

       //System.out.println(goodses.size());

    }

}

 

借助了两个工具类:

ValueSetter类:

package com.hailiteframework.utils.value;

 

/**
 * Writes a value onto a target object.
 * NOTE(review): ValueVisitor is declared outside the visible part of the file;
 * what it contributes to this interface cannot be told from here.
 */
public interface ValueSetter extends ValueVisitor {

    /**
     * Sets {@code value} on {@code obj}.
     *
     * @param obj   the object to modify
     * @param value the value to set
     */
    void setValue(Object obj, Object value);

}

 

FieldValueSetter类:

package com.hailiteframework.utils.value;

 

import java.lang.reflect.InvocationTargetException;

import java.lang.reflect.Method;

import java.util.ArrayList;

import java.util.List;

 

public class FieldValueSetter implements ValueSetter {

 

    private boolean fieldMode = true;

 

    private String[] methodNameArray;

 

    private String methodNames;

 

    private Method[] methodArray;

 

    private Object[] parameterArray;

 

    public String getMethodNameArray() {

       StringBuffer sb = new StringBuffer();

       for (int i = 0; i < methodNameArray.length; i++) {

           if (i > 0) {

              sb.append('.');

           }

           sb.append(methodNameArray[i]);

       }

       return sb.toString();

    }

 

    public FieldValueSetter(String methodNames) {

       List list = new ArrayList();

       int index = 0;

       while (index < methodNames.length()) {

           int nextIndex = methodNames.indexOf('.', index);

           if (nextIndex == -1) {

              list.add(methodNames.substring(index));

              break;

           } else {

              list.add(methodNames.substring(index, nextIndex));

              index = nextIndex + 1;

           }

       }

       this.methodNameArray = (String[]) list.toArray(new String[list.size()]);

       this.parameterArray = new Object[this.methodNameArray.length];

       this.methodArray = new Method[this.methodNameArray.length];

 

       for (int i = 0; i < methodNameArray.length; i++) {

           String name = methodNameArray[i];

           index = name.indexOf(PARAMETER_CHAR);

           if (index != -1) {

              methodNameArray[i] = name.substring(0, index);

              parameterArray[i] = name.substring(index + 1);

           }

       }

 

    }

 

    public void setFieldMode(boolean fieldMode) {

       this.fieldMode = fieldMode;

    }

 

    public String getFormat() {

       return methodNames;

    }

 

    private RuntimeException getMethodNameException(Object obj) {

       StringBuffer sb = new StringBuffer();

       for (int j = 0; j < methodNameArray.length; j++) {

           if (j > 0) {

              sb.append('.');

           }

           sb.append(methodNameArray[j]);

       }

       return new RuntimeException("设置方法名:" + sb + "/n对参数/n/tclass=" + obj.getClass().getName() + ":/n/tvalue=" + obj  //$NON-NLS-1$ //$NON-NLS-2$//$NON-NLS-3$

              + "/n而言没有意义"); //$NON-NLS-1$

    }

 

    private String[] getMethodName(String name) {

       if (fieldMode) {

           return new String[] { "get" + Character.toUpperCase(name.charAt(0)) + name.substring(1), //$NON-NLS-1$

                  "is" + Character.toUpperCase(name.charAt(0)) + name.substring(1) }; //$NON-NLS-1$

       }

       return new String[] { name };

    }

 

    private String getSetMethodName(String name) {

       if (fieldMode) {

           name = "set" + Character.toUpperCase(name.charAt(0)) + name.substring(1); //$NON-NLS-1$

       }

       return name;

    }

 

    private Object getSetTarget(Object obj) {

       Object result = obj;

       for (int i = 0; i < methodArray.length - 1 && result != null; i++) {

           Method method = methodArray[i];

           if (method == null) {

              Class clazz = result.getClass();

              String[] methodName = getMethodName(methodNameArray[i]);

 

              while (method == null && !clazz.equals(Object.class)) {

                  Method[] methods = clazz.getDeclaredMethods();

                  for (int im = 0; method == null && im < methods.length; im++) {

                     Method m = methods[im];

                     for (int jm = 0; method == null && jm < methodName.length; jm++) {

                         if (m.getName().equals(methodName[jm])) {

                            if (parameterArray[i] == null && m.getParameterTypes().length == 0

                                   || parameterArray[i] != null && m.getParameterTypes().length == 1) {

                                method = m;

                            }

                         }

                     }

                  }

                  clazz = clazz.getSuperclass();

              }

              if (method == null) {

                  throw getMethodNameException(obj);

              }

              method.setAccessible(true);

              methodArray[i] = method;

           }

           try {

              if (parameterArray[i] == null) {

                  result = method.invoke(result, (Object[]) null);

              } else {

                  result = method.invoke(result, new Object[] { parameterArray[i] });

              }

           } catch (NoSuchMethodError nme) {

              throw getMethodNameException(obj);

           } catch (InvocationTargetException ite) {

              throw new RuntimeException(ite.getTargetException());

           } catch (Exception e) {

              throw new RuntimeException(e);

           }

       }

       // 强制赋值对象不能为空,赋值必须可以进行

       if (result == null) {

           throw getMethodNameException(obj);

       }

       return result;

    }

 

    /**

     *

     * @param obj

     *            设置对象

     * @param result

     *            obj中取出来的最终设值对象,如果为NULL,则从obj中取出

     * @return

     */

    private Method getSetMethod(Object obj, Object result) {

       // Thread.currentThread().setContextClassLoader(obj.getClass().getClassLoader());

       Method setMethod = methodArray[methodArray.length - 1];

       if (setMethod == null) {

 

           if (result == null) {

              result = getSetTarget(obj);

           }

 

           String methodName = getSetMethodName(methodNameArray[methodNameArray.length - 1]);

           Class clazz = result.getClass();

 

           while (setMethod == null && clazz != Object.class) {

              Method[] mm = clazz.getDeclaredMethods();

              for (int i = 0; setMethod == null && i < mm.length; i++) {

                  Method m = mm[i];

                  if (m.getName().equals(methodName)) {

                     if (parameterArray[methodArray.length - 1] == null && m.getParameterTypes().length == 1

                            || parameterArray[methodArray.length - 1] != null && m.getParameterTypes().length == 2) {

                         m.setAccessible(true);

                         methodArray[methodArray.length - 1] = m;

                         setMethod = m;

                     }

                  }

              }

              clazz = clazz.getSuperclass();

           }

 

           if (setMethod == null) {

              throw getMethodNameException(obj);

           }

 

       }

       return setMethod;

    }

 

    /**

     * 返回设置数据的方法

     *

     * @param obj

     * @return

     */

    public Method getSetMethod(Object obj) {

       Method setMethod = getSetMethod(obj, null);

       return setMethod;

    }

 

    /**

     * 返回设置数据的类型

     *

     * @param obj

     * @return

     */

    public Class getSetMethodType(Object obj) {

       Method setMethod = getSetMethod(obj, null);

       return setMethod == null ? null : setMethod.getParameterTypes()[setMethod.getParameterTypes().length - 1];

    }

 

    public void setValue(Object obj, Object value) {

       Object result = getSetTarget(obj);

       Method setMethod = getSetMethod(obj, result);

       try {

 

           if (value == null && setMethod.getParameterTypes()[setMethod.getParameterTypes().length - 1].isPrimitive()) {

              // 对于基本类型,当值未Null时,忽略操作

              return;

           }

 

           Object[] parameter = setMethod.getParameterTypes().length == 1 ? new Object[] { value } : new Object[] {

                  parameterArray[methodArray.length - 1], value };

           setMethod.invoke(result, parameter);

       } catch (NoSuchMethodError nme) {

           throw getMethodNameException(obj);

       } catch (InvocationTargetException ite) {

           throw new RuntimeException(ite.getTargetException());

       } catch (Exception e) {

           throw new RuntimeException(e);

       }

    }

}

 

 

整个数据提取的过程是费了一番周折,但是感觉非常有趣,而且也大大缩短了提取数据的时间,并且提取的数据也非常的准确。

}

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值