将html文件中的图片导出到某一文件夹或者生成xml文件

最新推荐文章于 2021-06-04 13:15:53 发布
kingmaxno1
最新推荐文章于 2021-06-04 13:15:53 发布
阅读量2.2k
点赞数
分类专栏：代码人生　文章标签：html、xml、string、null、byte、file
本文链接：https://blog.csdn.net/kingmaxno1/article/details/3276770
版权
代码人生专栏收录该内容
6 篇文章 0 订阅
订阅专栏
 
  
     package net.risesoft.riseinfo.integration.parse;
import java.io.File;
import java.io.FileInputStream;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;
import com.hothouseobjects.tags.Inspector;
import com.hothouseobjects.tags.Tag;
import com.hothouseobjects.tags.TagTiller;
/*
 * 从html中将img标签的src属性解析出来，并对解析的图片地址进行处理，
 * 主要是为了解决组织部中组工网不在OA上，但信息发布是用fckedit做的，
 * 他的图片不能直接和数据一起通过webservices传到组工网上的问题
 * $author sking huang $date 2008-11-11
 */
public class ParseHtml {
    
    private String htmlSrc=null;
    
    
    private Map tagList =new HashMap();
    
    
    
    private static org.apache.log4j.Logger log = net.risesoft.commons.log.LogFactory
    .getLog(ParseHtml.class);
    public ParseHtml(String htmlSrc){
        this.htmlSrc = htmlSrc;
        
        
    }
    
    
    
    //向标签中列表中增加img标签 
    private void initTagToList(){
        //增加取得<a href=...的html标签 
//      tagList.put("a", new String[]{"href"}); 
        
//      增加取得<img src=...的html标签 
        tagList.put("img", new String[]{"src"});
    }
    
    public void addTagList(String key,String[] value){
        tagList.put(key, value);
        
    }
    
    public void remove(String key){
        tagList.remove(key);
    }
    public List parse(){
        List imageName=new ArrayList();
        
        log.debug("********开始解析html标签***********");
        //增加标签列表 
        initTagToList();
        try{
            
            
        Reader read = new StringReader(htmlSrc);
        
        TagTiller tagtiller = new TagTiller(read);
        
        tagtiller.runTiller();
        
        Tag thePage = tagtiller.getTilledTags();
        
        Set     tagSet   =   tagList.entrySet(); 
        Iterator   iter   =   tagSet.iterator(); 
        //从标签列表中取出要解析的标签，并将解析完的标签加入标签列表 
        while(iter.hasNext()) 
        { 
              Map.Entry   entry   =   (Map.Entry)iter.next(); 
              String   key   =  (String)entry.getKey();
              String[]   value   = (String[])  entry.getValue();
              
              if(key == null || "".equals(key)){
                  continue;
              }
              if(value == null || value.length==0){
                  continue;
              }
              List theHref = Inspector.collectByType(thePage,key);
              
              int i = theHref.size();
              
              while (i>0) {
                for(int ii=0;ii<value.length;ii++){
                    String filterStr=filterStr(((Tag)theHref.get(i-1)).getAttributeValue(value[ii]));
                    if(filterStr!=null){
                        imageName.add(filterStr);
                    }
              }
                i -=1;
              }
              
        } 
        
        
        
        log.debug("********html标签解析完毕***********");
        
        }catch(Exception e){
            log.error("在解析html的过程中出现问题", e);
        }
        
        return imageName;
    }
    
    //对字符串进行过滤 
    private String filterStr(String addr){
        
        if(addr==null) return addr;
        
         StringTokenizer parser =new StringTokenizer(addr,"/"///");
         
         String rtn="";
         //取最后一个，因为最后一个为图片的名字 
         while(parser.hasMoreTokens()) { 
             rtn=parser.nextToken();
         }
        return rtn;
        
    }
    public static void main(String[] args) {
        
        
        
        try {
            File file = new File("d://ttt.htm");
            int len = (int)file.length();
            byte[] b;
            b = new byte[len];
            FileInputStream fis = new FileInputStream(file);
            fis.read(b);
            fis.close();
         
            ParseHtml pp=new ParseHtml(new String(b));
            List list =pp.parse();
            for(int i=0;list.size()>0;i++){
                System.out.println(list.get(i));
            }
         
        }
        catch (Exception ex) {
            ex.printStackTrace();
        }
    }
}
 
    
package net.risesoft.riseinfo.integration.parse;
import java.util.ArrayList;
import java.util.List;
public abstract class ExportImg {
    private List imgList =new ArrayList();
    private ParseHtml parseHtml =null;
    
    private String imgSrc = null; // 图片存放文件夹 
    private String imgDest = null;// 图片将要被转移到文件夹,如果不需要转移图片可以调用两个构造函数的方法 
    
    public ExportImg(String srcHtml){
        this(srcHtml,null,null);
    }
    
    public ExportImg(String srcHtml,String imgSrc){
        this(srcHtml,imgSrc,null);
    }
    
    public ExportImg(String srcHtml,String imgSrc,String imgDest){
        this.imgSrc=imgSrc;
        this.imgDest=imgDest;
        parseHtml =new ParseHtml(srcHtml);
    }
    
    public List getImgList() {
        return imgList;
    }
    public void setImgList(List imgList) {
        this.imgList = imgList;
    }
    
    public  String export(){
        imgList = parseHtml.parse();
        return operate();
    }
    public abstract String operate();
    public String getImgDest() {
        return imgDest;
    }
    public void setImgDest(String imgDest) {
        this.imgDest = imgDest;
    }
    public String getImgSrc() {
        return imgSrc;
    }
    public void setImgSrc(String imgSrc) {
        this.imgSrc = imgSrc;
    }
}
 
     package net.risesoft.riseinfo.integration.parse;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/*
 *将fckedit中的图片取出并导入到指定目录
 *@author sking huang
 *@2008-11-11 
 */
public class ExportImgToFile extends ExportImg {
    public ExportImgToFile(String srcHtml, String imgSrc, String imgDest) {
        super(srcHtml, imgSrc, imgDest);
    }
    private static org.apache.log4j.Logger log = net.risesoft.commons.log.LogFactory
            .getLog(ExportImgToFile.class);
    // 在将文件从指定数据源拷贝到指定目录之前请先给图片列表赋值,图片列表中只存图片名称 
    public String operate() {
        for (int i = 0; i < super.getImgList().size(); i++) {
            String imageName = (String) super.getImgList().get(i);
            try {
                File file = new File(getImgSrc() + File.separator + imageName);
                //如果文件存在且是文件 
                if (file.exists() && file.isFile()) {
                    FileInputStream input = new FileInputStream(file);
                    FileOutputStream output = new FileOutputStream(getImgDest()
                            + File.separator + imageName);
                   byte[] b= new byte[1024];
                                     int size=0;
                    while ((size = input.read(b)) != -1) {
                        output.write(b,0,size);
                                           }
                    input.close();
                    output.close();
                }
            } catch (IOException e) {
                log.error("文件导出过程中出现问题", e);
            }
        }
        return null;
    }
    public static void main(String[] args) {
        // 生成图片 
        try {
            File file = new File("d://ttt.htm");
            int len = (int) file.length();
            byte[] b;
            b = new byte[len];
            FileInputStream fis = new FileInputStream(file);
            fis.read(b);
            fis.close();
            ExportImg eif = new ExportImgToFile(new String(b), "D://ttt.files",
                    "D://img//dest");
            eif.export();
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }
} 
         
         package net.risesoft.riseinfo.integration.parse;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import net.risesoft.integration.adapter.AdapterUtil;
public class ExportImgToXml extends ExportImg {
    private static org.apache.log4j.Logger log = net.risesoft.commons.log.LogFactory
    .getLog(ExportImgToXml.class);
    
    public ExportImgToXml(String srcHtml, String imgSrc) {
        super(srcHtml, imgSrc);
    }
    public String operate() {
        
        StringBuffer sb=new StringBuffer();
        sb.append("<?xml version=/"1.0/" encoding=/"GB2312/"?>");
        sb.append("<DATA>");
        for (int i = 0; i < super.getImgList().size(); i++) {
            String imageName = (String) super.getImgList().get(i);
            try {
                File file = new File(getImgSrc() + File.separator + imageName);
                //如果文件存在且是文件 
                if (file.exists() && file.isFile()) {
                    sb.append("<IMGLIST>");
                    sb.append("<IMGNAME>" + file.getName() + "</IMGNAME>");
                    
                    FileInputStream input = new FileInputStream(file);
                    
                    byte[] b=new byte[(int)file.length()];
                    input.read(b);
                    
                    sb.append("<IMGVALUE>" + AdapterUtil.base64Encode(b) + 
                            "</IMGVALUE>");
                    sb.append("</IMGLIST>");
                    input.close();
                }
            } catch (Exception e) {
                log.error("文件生成xml过程中出现问题", e);
            }
        }
        sb.append("</DATA>");
        return sb.toString();
    
    }
    //此方法为组工网一端接收xml的例子，只做参考用 
    public void parseXml(InputStream is){
        try {
            DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
            factory.setNamespaceAware(true);
            factory.setValidating(true);
            DocumentBuilder builder = factory.newDocumentBuilder();
            Document doc = builder.parse(is);
            
            
    
                NodeList attnodeList = doc
                        .getElementsByTagName("IMGLIST");
                int attlength = attnodeList.getLength();
                for (int attIndex = 0; attIndex < attlength; attIndex++) { // 实际上，Attachment有0到n个 
                    
                    Node attnode = attnodeList.item(attIndex);
                    NodeList attlist = attnode.getChildNodes();
                    String fileName=null;
                    byte[] fileContent=null;
                    for (int j = 0; j < attlist.getLength(); j++) {
                        Node col = attlist.item(j);
                        if (col.getNodeName() == null) {
                            continue;
                        }
                        Node firstChild = col.getFirstChild();
                        if (firstChild == null) {
                            continue;
                        }
                        String value = firstChild.getNodeValue();
                        if (value == null && value.length() == 0) {
                            continue;
                        }
                        String field = col.getNodeName();
                        
                        if (field.equals("IMGVALUE")) {
                            fileContent=AdapterUtil.base64Decode(value);
                            
                        } else if (field.equals("IMGNAME")) {
                            fileName= value;
                        } 
                    }
                    if(fileName!=null && fileContent!=null){
                        
                        File file =new File("D://img//dest//"+fileName);
                        if(!file.exists())
                            file.createNewFile();
                        OutputStream fos =new FileOutputStream(file);
                        fos.write(fileContent);
                        fos.close();
                    }
                
                }
                log.info("**************数据写入成功********************");
            
            
        } catch (Exception ex) {
            
            log.error("附件写入出错了！", ex);
            
        }  
    
    }
    public static void main(String[] args) {
        // 生成图片 
        try {
            File file = new File("d://ttt.htm");
            int len = (int) file.length();
            byte[] b;
            b = new byte[len];
            FileInputStream fis = new FileInputStream(file);
            fis.read(b);
            fis.close();
            ExportImgToXml eif = new ExportImgToXml(new String(b), "D://ttt.files");
            
            String img=eif.export();
            //System.out.println(img); 
            InputStream is =new ByteArrayInputStream(img.getBytes());
            eif.parseXml(is);
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }
}
 
        
 
    
 
kingmaxno1
关注
0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
将html文件中的图片导出到某一文件夹或者生成xml文件

package net.risesoft.riseinfo.integration.parse;import java.io.File;import java.io.FileInputStream;import java.io.Reader;import java.io.StringReader;import java.util.ArrayLis
复制链接

扫一扫