aspose 将word文档按照模板拆分为html页面

最新推荐文章于 2024-06-06 09:57:47 发布

H愚公移山H

最新推荐文章于 2024-06-06 09:57:47 发布

阅读量631

点赞数 1

分类专栏：工具 word html 文章标签： html aspose java

本文链接：https://blog.csdn.net/HAN_789/article/details/110479571

版权

工具同时被 3 个专栏收录

16 篇文章 0 订阅

订阅专栏

word

6 篇文章 0 订阅

订阅专栏

html

2 篇文章 0 订阅

订阅专栏

1.定义文档模板 TocTemplate.doc

2.需要拆分的WORD文档 SOI 2007-2012-DeeM with footnote added.doc

代码如下，开箱即用。pom 文件需要引入以下依赖：

<!-- Aspose.Words for Java: provides the com.aspose.words classes imported by the example code below. -->
<!-- NOTE(review): this artifact may require Aspose's own Maven repository and a JDK classifier; confirm against the Aspose installation docs. -->
<dependency>
         <groupId>com.aspose</groupId>
         <artifactId>aspose-words</artifactId>
         <version>19.3</version>
 </dependency>

/*
 * Copyright 2001-2015 Aspose Pty Ltd. All Rights Reserved.
 *
 * This file is part of Aspose.Words. The source code in this file
 * is only intended as a supplement to the documentation, and is provided
 * "as is", without warranty of any kind, either expressed or implied.
 */

package com.aspose.words.examples.loading_saving;

import com.aspose.words.*;
import com.aspose.words.examples.Utils;
import com.aspose.words.ref.Ref;

import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Set;
import java.util.TreeSet;

/**
 * Entry point of the example: splits a sample Word document into one HTML page
 * per topic and generates a table of contents page next to them.
 */
public class SplitIntoHtmlPages {
    public static void main(String[] args) throws Exception {
        // A valid Aspose.Words license is needed. The recommended approach is to embed
        // the license file (e.g. "Aspose.Words.lic") as a project resource and apply it
        // by file name only, before any document is processed.

        //ExStart:SplitIntoHtmlPages
        // Directory that holds the input documents of this example.
        final String docsDir = Utils.getDataDir(SplitIntoHtmlPages.class);

        // All generated HTML goes into an "Out" sub-folder, created on demand.
        final File htmlOutDir = new File(docsDir, "Out");
        htmlOutDir.mkdirs();

        // The Worker does the actual job: source document, TOC template, output folder.
        new Worker().execute(
                docsDir + "SOI 2007-2012-DeeM with footnote added.doc",
                docsDir + "TocTemplate.doc",
                htmlOutDir.getPath());
        //ExEnd:SplitIntoHtmlPages

        System.out.println("Document split into HTML pages successfully.");
    }

}
//ExStart:TocMailMergeDataSource

/**
 * A custom data source for Aspose.Words mail merge.
 * <p>
 * Walks over a list of topic objects, exposing each one as the value of the
 * single merge field "TocEntry" inside the mail merge region "TOC".
 */
class TocMailMergeDataSource implements IMailMergeDataSource {
    /** The records to iterate over; elements are handed to the merge engine as-is. */
    private final ArrayList<?> mTopics;
    /** Index of the current record; -1 means "before the first record". */
    private int mIndex;

    /**
     * Creates a data source positioned before the first topic.
     *
     * @param topics The topic objects to expose, in table-of-contents order.
     */
    TocMailMergeDataSource(ArrayList<?> topics) throws Exception {
        mTopics = topics;
        // Initialize to BOF.
        mIndex = -1;
    }

    /**
     * Advances to the next topic.
     *
     * @return true if the data source now points at a topic, false once the end is reached.
     */
    @Override
    public boolean moveNext() throws Exception {
        if (mIndex < mTopics.size() - 1) {
            mIndex++;
            return true;
        }
        // Reached EOF.
        return false;
    }

    /**
     * Supplies the value of a merge field for the current topic.
     *
     * @param fieldName  The name of the merge field being filled.
     * @param fieldValue Receives the current topic for "TocEntry", or null for any other field.
     * @return true if the field is known to this data source, false otherwise.
     */
    @Override
    public boolean getValue(String fieldName, Ref<Object> fieldValue) throws Exception {
        if ("TocEntry".equals(fieldName)) {
            // The template document is supposed to have only one field called "TocEntry".
            fieldValue.set(mTopics.get(mIndex));
            return true;
        }
        fieldValue.set(null);
        return false;
    }

    /**
     * @return The name of the mail merge region served by this data source ("TOC").
     */
    @Override
    public String getTableName() throws Exception {
        return "TOC";
    }

    /**
     * This data source has no nested regions.
     *
     * @return Always null.
     */
    @Override
    public IMailMergeDataSource getChildDataSource(String tableName) throws Exception {
        return null;
    }
}
//ExEnd:TocMailMergeDataSource
//ExStart:Topic

/**
 * Immutable pair of a topic title and the name of the HTML file the topic is saved to.
 */
class Topic {
    private final String title;
    private final String fileName;

    /**
     * @param title    The human-readable title of the topic.
     * @param fileName The name of the HTML file that holds the topic.
     */
    Topic(String title, String fileName) throws Exception {
        this.title = title;
        this.fileName = fileName;
    }

    /** @return The title passed to the constructor. */
    String getTitle() throws Exception {
        return this.title;
    }

    /** @return The file name passed to the constructor. */
    String getFileName() throws Exception {
        return this.fileName;
    }
}
//ExEnd:Topic

//ExStart:Worker

/**
 * This class takes a Microsoft Word document, splits it into topics at paragraphs formatted
 * with the Heading 1 style and saves every topic as an HTML file.
 * <p>
 * Also generates contents.html file that provides links to all saved topics.
 */
class Worker {
    /** Base name (without extension) of the generated table of contents page. */
    private static final String TOC_FILE_BASE_NAME = "contents";

    /**
     * Performs the Word to HTML conversion.
     *
     * @param srcFileName The MS Word file to convert.
     * @param tocTemplate An MS Word file that is used as a template to build
     *                    a table of contents. This file needs to have a mail merge region called "TOC" defined
     *                    and one mail merge field called "TocEntry".
     * @param dstDir      The output directory where to write HTML files. Must exist.
     */
    void execute(String srcFileName, String tocTemplate, String dstDir) throws Exception {
        mDoc = new Document(srcFileName);
        mTocTemplate = tocTemplate;
        mDstDir = dstDir;

        ArrayList<Paragraph> topicStartParas = selectTopicStarts();
        insertSectionBreaks(topicStartParas);
        ArrayList<Topic> topics = saveHtmlTopics();
        saveTableOfContents(topics);
    }

    /**
     * Selects heading paragraphs that must become topic starts.
     * We can't modify them in this loop, we have to remember them in a list first.
     *
     * @return All paragraphs of the document whose style is Heading 1, in document order.
     */
    private ArrayList<Paragraph> selectTopicStarts() throws Exception {
        NodeCollection paras = mDoc.getChildNodes(NodeType.PARAGRAPH, true);
        ArrayList<Paragraph> topicStartParas = new ArrayList<Paragraph>();

        for (Paragraph para : (Iterable<Paragraph>) paras) {
            int style = para.getParagraphFormat().getStyleIdentifier();
            if (style == StyleIdentifier.HEADING_1) {
                topicStartParas.add(para);
            }
        }

        return topicStartParas;
    }

    /**
     * Inserts section breaks before the specified paragraphs.
     *
     * @param topicStartParas The heading paragraphs that must each start a section.
     */
    private void insertSectionBreaks(ArrayList<Paragraph> topicStartParas) throws Exception {
        DocumentBuilder builder = new DocumentBuilder(mDoc);
        for (Paragraph para : topicStartParas) {
            Section section = para.getParentSection();

            // Insert section break if the paragraph is not at the beginning of a section already.
            if (para != section.getBody().getFirstParagraph()) {
                builder.moveTo(para.getFirstChild());
                builder.insertBreak(BreakType.SECTION_BREAK_NEW_PAGE);

                // This is the paragraph that was inserted at the end of the now old section.
                // We don't really need the extra paragraph, we just needed the section.
                section.getBody().getLastParagraph().remove();
            }
        }
    }

    /**
     * Splits the current document into one topic per section and saves each topic
     * as an HTML file.
     * <p>
     * File names are derived from the first paragraph of each section. Names that would
     * clash with an earlier topic (or with the table of contents page) get a numeric
     * suffix so that no topic silently overwrites another one.
     *
     * @return The saved topics, in document order.
     */
    private ArrayList<Topic> saveHtmlTopics() throws Exception {
        ArrayList<Topic> topics = new ArrayList<Topic>();

        // Compared case-insensitively because some file systems do not distinguish case.
        Set<String> usedFileNames = new TreeSet<String>(String.CASE_INSENSITIVE_ORDER);
        // Reserve the name of the table of contents page written by saveTableOfContents().
        usedFileNames.add(TOC_FILE_BASE_NAME);

        for (int sectionIdx = 0; sectionIdx < mDoc.getSections().getCount(); sectionIdx++) {
            Section section = mDoc.getSections().get(sectionIdx);

            // A section body without any paragraph is treated like an untitled section.
            Paragraph firstPara = section.getBody().getFirstParagraph();
            String paraText = (firstPara != null) ? firstPara.getText() : "";

            // The text of the heading paragraph is used to generate the HTML file name.
            String fileName = makeTopicFileName(paraText);
            if ("".equals(fileName)) {
                fileName = "UNTITLED SECTION " + sectionIdx;
            }
            fileName = uniqueFileName(fileName, usedFileNames);

            // NOTE: the stored file name includes the output directory; the same value is
            // later used as the hyperlink target in the table of contents.
            fileName = new File(mDstDir, fileName + ".html").getPath();

            // The text of the heading paragraph is also used to generate the title for the TOC.
            String title = makeTopicTitle(paraText);
            if ("".equals(title)) {
                title = "UNTITLED SECTION " + sectionIdx;
            }

            Topic topic = new Topic(title, fileName);
            topics.add(topic);

            saveHtmlTopic(section, topic);
        }

        return topics;
    }

    /**
     * Returns a file base name that has not been handed out before and records it as used.
     * The first request for a name gets it unchanged; later requests get "_2", "_3", ... appended.
     *
     * @param baseName  The desired file name without extension.
     * @param usedNames The names already taken; the returned name is added to this set.
     */
    private static String uniqueFileName(String baseName, Set<String> usedNames) {
        String candidate = baseName;
        for (int suffix = 2; !usedNames.add(candidate); suffix++) {
            candidate = baseName + "_" + suffix;
        }
        return candidate;
    }

    /**
     * Leaves letters and digits, replaces space characters with underscores
     * and removes all other characters from a string.
     * <p>
     * Only the plain space character is mapped to an underscore; other white space
     * (including the trailing paragraph break) is dropped.
     */
    private static String makeTopicFileName(String paraText) throws Exception {
        StringBuilder b = new StringBuilder();
        for (int i = 0; i < paraText.length(); i++) {
            char c = paraText.charAt(i);
            if (Character.isLetterOrDigit(c)) {
                b.append(c);
            } else if (c == ' ') {
                b.append('_');
            }
        }
        return b.toString();
    }

    /**
     * Removes the last character (which is expected to be the paragraph break character)
     * from the given string. An empty string is returned unchanged.
     */
    private static String makeTopicTitle(String paraText) throws Exception {
        if (paraText.isEmpty()) {
            return "";
        }
        return paraText.substring(0, paraText.length() - 1);
    }

    /**
     * Saves one section of a document as an HTML file.
     * Any embedded images are saved as separate files in the same folder as the HTML file.
     *
     * @param section The section to export.
     * @param topic   Supplies the document title and the destination file name.
     */
    private static void saveHtmlTopic(Section section, Topic topic) throws Exception {
        // Build a temporary document that contains nothing but a copy of the section.
        Document dummyDoc = new Document();
        dummyDoc.removeAllChildren();
        dummyDoc.appendChild(dummyDoc.importNode(section, true, ImportFormatMode.KEEP_SOURCE_FORMATTING));

        dummyDoc.getBuiltInDocumentProperties().setTitle(topic.getTitle());

        HtmlSaveOptions saveOptions = new HtmlSaveOptions();
        saveOptions.setPrettyFormat(true);
        // This is to allow headings to appear to the left of main text.
        saveOptions.setAllowNegativeIndent(true);
        saveOptions.setExportHeadersFootersMode(ExportHeadersFootersMode.NONE);

        dummyDoc.save(topic.getFileName(), saveOptions);
    }

    /**
     * Generates a table of contents for the topics and saves to contents.html.
     *
     * @param topics The topics to list, in the order they should appear.
     */
    private void saveTableOfContents(ArrayList<Topic> topics) throws Exception {
        Document tocDoc = new Document(mTocTemplate);

        // We use a custom mail merge event handler defined below.
        tocDoc.getMailMerge().setFieldMergingCallback(new HandleTocMergeField());
        // We use a custom mail merge data source based on the collection of the topics we created.
        tocDoc.getMailMerge().executeWithRegions(new TocMailMergeDataSource(topics));

        tocDoc.save(new File(mDstDir, TOC_FILE_BASE_NAME + ".html").getPath());
    }

    /**
     * Mail merge callback that replaces every merged field with a hyperlink to the topic
     * supplied as the field value.
     */
    private static class HandleTocMergeField implements IFieldMergingCallback {
        @Override
        public void fieldMerging(FieldMergingArgs e) throws Exception {
            // The builder is created lazily, on the first merged field, for the document being merged.
            if (mBuilder == null) {
                mBuilder = new DocumentBuilder(e.getDocument());
            }

            // Our custom data source returns topic objects.
            Topic topic = (Topic) e.getFieldValue();

            // We use the document builder to move to the current merge field and insert a hyperlink.
            mBuilder.moveToMergeField(e.getFieldName());
            mBuilder.insertHyperlink(topic.getTitle(), topic.getFileName(), false);

            // Signal to the mail merge engine that it does not need to insert text into the field
            // as we've done it already.
            e.setText("");
        }

        @Override
        public void imageFieldMerging(ImageFieldMergingArgs args) throws Exception {
            // Do nothing.
        }

        private DocumentBuilder mBuilder;
    }

    /** The source document; modified in place while section breaks are inserted. */
    private Document mDoc;
    /** Path of the Word template used to build the table of contents. */
    private String mTocTemplate;
    /** Directory that receives all generated HTML files. */
    private String mDstDir;
}
//ExEnd:Worker

拆分结果：Word 文档按照一级标题（Heading 1）被拆分为多个 HTML 页面，并生成目录页 contents.html。

H愚公移山H

关注

1
点赞
踩
2

收藏

觉得还不错? 一键收藏
1
评论
aspose 将word文档按照模板拆分为html页面

1.定义文档模板TocTemplate.doc2.需要拆分的WORD文档SOI 2007-2012-DeeM with footnote added.doc代码如下，开箱即用 pom文件需要引入<dependency> <groupId>com.aspose</groupId> <artifactId>aspose-words</artifactId> <ve...
复制链接

扫一扫