word emz html,java - Memory leak using Apache POI WordToHtmlConverter - Stack Overflow

I am using the Apache POI library to convert .doc files to .html files in my Java EE project (running on Apache Tomcat 7.0.32).

I used the WordToHtmlConverter.java class from the org.apache.poi.hwpf.converter package as a basis. The only thing I changed is that the file for the output stream is passed in the constructor (not in the main() method).

When I run my application and upload a .doc file (size < 500 KB), I see that the java.exe process grows by 10-15 MB. When I upload another .doc file, java.exe grows by another 2-3 MB. So java.exe grows by 2-3 MB every time I upload a .doc file. If I upload a large .doc file (size > 3 MB), java.exe grows by another 20-30 MB. This continues until a java.lang.OutOfMemoryError: Java heap space is thrown.

Why does this happen? Why does the java.exe process grow by 2-3 MB every time I upload a .doc file? What should I do to avoid the java.lang.OutOfMemoryError?

Here is the source of the WordToHtmlConverter.java variant that I am using in my application:

package ru.emzior.view.convertors;

import java.io.Writer;

import java.io.OutputStreamWriter;

import java.io.FileOutputStream;

import org.apache.poi.hwpf.converter.WordToHtmlConverter;

import org.apache.poi.hwpf.converter.HtmlDocumentFacade;

import org.apache.poi.hwpf.converter.AbstractWordConverter;

import java.io.File;

import java.io.FileWriter;

import java.util.List;

import java.util.Stack;

import javax.xml.parsers.DocumentBuilderFactory;

import javax.xml.transform.OutputKeys;

import javax.xml.transform.Transformer;

import javax.xml.transform.TransformerFactory;

import javax.xml.transform.dom.DOMSource;

import javax.xml.transform.stream.StreamResult;

import org.apache.poi.hpsf.SummaryInformation;

import org.apache.poi.hwpf.HWPFDocument;

import org.apache.poi.hwpf.HWPFDocumentCore;

import org.apache.poi.hwpf.converter.FontReplacer.Triplet;

import org.apache.poi.hwpf.usermodel.Bookmark;

import org.apache.poi.hwpf.usermodel.CharacterRun;

import org.apache.poi.hwpf.usermodel.OfficeDrawing;

import org.apache.poi.hwpf.usermodel.Paragraph;

import org.apache.poi.hwpf.usermodel.Picture;

import org.apache.poi.hwpf.usermodel.Range;

import org.apache.poi.hwpf.usermodel.Section;

import org.apache.poi.hwpf.usermodel.Table;

import org.apache.poi.hwpf.usermodel.TableCell;

import org.apache.poi.hwpf.usermodel.TableRow;

import org.apache.poi.util.Beta;

import org.apache.poi.util.POILogFactory;

import org.apache.poi.util.POILogger;

import org.w3c.dom.Document;

import org.w3c.dom.Element;

import org.w3c.dom.Text;

import static org.apache.poi.hwpf.converter.AbstractWordUtils.TWIPS_PER_INCH;

/**
 * Converts Word files (95-2007) into HTML files.
 * <p>
 * This implementation doesn't create images or links to them. This can be
 * changed by overriding
 * {@link #processImage(Element, boolean, Picture, String)}.
 * <p>
 * In addition to the DOM-building callbacks, {@link #convertToFile()} converts
 * the Word file given to {@link #DocToHtmlConverter(String, String)} and
 * serializes the result to an <code>.htm</code> file.
 *
 * @author Sergey Vladimirov (vlsergey {at} gmail {dot} com)
 */
@Beta
public class DocToHtmlConverter extends AbstractWordConverter implements HtmlConverter {

    /** Paragraph-level font settings that character runs are compared against. */
    private static class BlockProperies {
        final String pFontName;
        final int pFontSize;

        public BlockProperies(String pFontName, int pFontSize) {
            this.pFontName = pFontName;
            this.pFontSize = pFontSize;
        }
    }

    private static final POILogger logger = POILogFactory.getLogger(DocToHtmlConverter.class);

    /** Path of the source Word file; {@code null} for DOM-only instances. */
    private final String filePath;

    /** Prefix prepended to the output file name; {@code null} for DOM-only instances. */
    private final String tempDirectory;

    /** Font properties of the enclosing blocks, innermost on top. */
    private final Stack<BlockProperies> blocksProperies = new Stack<BlockProperies>();

    /** Target HTML DOM; {@code null} for instances that only drive {@link #convertToFile()}. */
    private final HtmlDocumentFacade htmlDocumentFacade;

    /** Lazily created container for footnotes/endnotes, appended to the body in {@link #afterProcess()}. */
    private Element notes = null;

    /**
     * Creates a converter used only as an entry point for {@link #convertToFile()}.
     * No HTML facade is attached, so the {@code process*} callbacks must not be
     * invoked on such an instance.
     *
     * @param filePath      path of the Word file to convert
     * @param tempDirectory prefix of the output file path (concatenated as-is)
     */
    public DocToHtmlConverter(String filePath, String tempDirectory) {
        this.filePath = filePath;
        this.tempDirectory = tempDirectory;
        this.htmlDocumentFacade = null;
    }

    /**
     * Creates a converter that builds HTML into the given DOM document.
     *
     * @param document the (empty) DOM document to populate
     */
    public DocToHtmlConverter(Document document) {
        this.filePath = null;
        this.tempDirectory = null;
        this.htmlDocumentFacade = new HtmlDocumentFacade(document);
    }

    /**
     * Creates a converter that builds HTML through the given facade.
     *
     * @param htmlDocumentFacade facade wrapping the target DOM document
     */
    public DocToHtmlConverter(HtmlDocumentFacade htmlDocumentFacade) {
        this.filePath = null;
        this.tempDirectory = null;
        this.htmlDocumentFacade = htmlDocumentFacade;
    }

    /**
     * Builds the CSS declarations (margins and, for multi-column sections,
     * column count and gap) for a section. All lengths are expressed in inches.
     */
    private static String getSectionStyle(Section section) {
        float leftMargin = section.getMarginLeft() / TWIPS_PER_INCH;
        float rightMargin = section.getMarginRight() / TWIPS_PER_INCH;
        float topMargin = section.getMarginTop() / TWIPS_PER_INCH;
        float bottomMargin = section.getMarginBottom() / TWIPS_PER_INCH;

        StringBuilder style = new StringBuilder();
        style.append("margin: " + topMargin + "in " + rightMargin + "in "
                + bottomMargin + "in " + leftMargin + "in;");

        if (section.getNumColumns() > 1) {
            style.append("column-count: " + (section.getNumColumns()) + ";");
            if (section.isColumnsEvenlySpaced()) {
                float distance = section.getDistanceBetweenColumns() / TWIPS_PER_INCH;
                style.append("column-gap: " + distance + "in;");
            } else {
                style.append("column-gap: 0.25in;");
            }
        }
        return style.toString();
    }

    /**
     * Converts the Word file at {@code filePath} to HTML and writes it, UTF-8
     * encoded, to {@code tempDirectory + <word file name> + ".htm"}. If that
     * file already exists it is returned without converting again.
     * <p>
     * Conversion stays best-effort: a failure is reported on stderr and the
     * (then non-existent) target file is still returned. The output stream is
     * always closed and a partially written file is removed, so a failed run
     * neither leaks a file handle nor poisons the {@code exists()} shortcut for
     * later calls.
     *
     * @return the target HTML file; it may not exist if the conversion failed
     * @throws IllegalStateException if this instance was not created with
     *                               {@link #DocToHtmlConverter(String, String)}
     */
    public File convertToFile() {
        if (filePath == null) {
            throw new IllegalStateException("convertToFile() requires a source file path");
        }

        // Accept Windows and UNIX separators (even mixed) regardless of host OS.
        int lastSeparator = Math.max(filePath.lastIndexOf('\\'), filePath.lastIndexOf('/'));
        String wordFileName = filePath.substring(lastSeparator + 1);

        // tempDirectory is concatenated as-is, so it has to end with a separator.
        File htmlFile = new File(tempDirectory + wordFileName + ".htm");
        if (htmlFile.exists()) {
            return htmlFile;
        }

        try {
            Document doc = DocToHtmlConverter.process(new File(filePath));

            Writer out = new OutputStreamWriter(new FileOutputStream(htmlFile), "UTF-8");
            try {
                Transformer serializer = TransformerFactory.newInstance().newTransformer();
                // TODO set encoding from a command argument
                serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
                serializer.setOutputProperty(OutputKeys.INDENT, "yes");
                serializer.setOutputProperty(OutputKeys.METHOD, "html");
                serializer.transform(new DOMSource(doc), new StreamResult(out));
                out.flush();
            } finally {
                // Release the file handle even when serialization fails.
                out.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
            // Drop truncated output so the next call retries the conversion.
            if (htmlFile.exists()) {
                htmlFile.delete();
            }
        }
        return htmlFile;
    }

    /**
     * Loads a Word document and converts it into a new HTML DOM document.
     *
     * @param docFile the Word file to read
     * @return the populated HTML DOM document
     * @throws Exception if loading or converting the document fails
     */
    static Document process(File docFile) throws Exception {
        final HWPFDocumentCore wordDocument = WordToHtmlUtils.loadDoc(docFile);
        Document target = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
        DocToHtmlConverter wordToHtmlConverter = new DocToHtmlConverter(target);
        wordToHtmlConverter.processDocument(wordDocument);
        return wordToHtmlConverter.getDocument();
    }

    /** Appends the collected notes (if any) to the body and refreshes the stylesheet. */
    @Override
    protected void afterProcess() {
        if (notes != null) {
            htmlDocumentFacade.getBody().appendChild(notes);
        }
        htmlDocumentFacade.updateStylesheet();
    }

    /** Returns the DOM document that this converter writes into. */
    public Document getDocument() {
        return htmlDocumentFacade.getDocument();
    }

    /**
     * Emits {@code text} as a {@code <span>} under {@code pElement}, styling it
     * only where the run differs from the enclosing block's font name/size.
     */
    @Override
    protected void outputCharacters(Element pElement, CharacterRun characterRun, String text) {
        Element span = htmlDocumentFacade.getDocument().createElement("span");
        pElement.appendChild(span);

        StringBuilder style = new StringBuilder();
        BlockProperies blockProperies = this.blocksProperies.peek();
        Triplet triplet = getCharacterRunTriplet(characterRun);

        if (WordToHtmlUtils.isNotEmpty(triplet.fontName)
                && !WordToHtmlUtils.equals(triplet.fontName, blockProperies.pFontName)) {
            style.append("font-family:" + triplet.fontName + ";");
        }
        // getFontSize() is halved before use, matching the value stored per block.
        if (characterRun.getFontSize() / 2 != blockProperies.pFontSize) {
            style.append("font-size:" + characterRun.getFontSize() / 2 + "pt;");
        }
        if (triplet.bold) {
            style.append("font-weight:bold;");
        }
        if (triplet.italic) {
            style.append("font-style:italic;");
        }

        WordToHtmlUtils.addCharactersProperties(characterRun, style);
        if (style.length() != 0) {
            htmlDocumentFacade.addStyleClass(span, "s", style.toString());
        }

        Text textNode = htmlDocumentFacade.createText(text);
        span.appendChild(textNode);
    }

    /**
     * Nests one bookmark element per entry of {@code rangeBookmarks} (each
     * inside the previous one) and renders {@code range} inside the innermost.
     */
    @Override
    protected void processBookmarks(HWPFDocumentCore wordDocument, Element currentBlock,
            Range range, int currentTableLevel, List<Bookmark> rangeBookmarks) {
        Element parent = currentBlock;
        for (Bookmark bookmark : rangeBookmarks) {
            Element bookmarkElement = htmlDocumentFacade.createBookmark(bookmark.getName());
            parent.appendChild(bookmarkElement);
            parent = bookmarkElement;
        }

        if (range != null) {
            processCharacters(wordDocument, currentTableLevel, range, parent);
        }
    }

    /** Copies non-empty title, author, keywords and comments into the HTML head. */
    @Override
    protected void processDocumentInformation(SummaryInformation summaryInformation) {
        if (WordToHtmlUtils.isNotEmpty(summaryInformation.getTitle())) {
            htmlDocumentFacade.setTitle(summaryInformation.getTitle());
        }
        if (WordToHtmlUtils.isNotEmpty(summaryInformation.getAuthor())) {
            htmlDocumentFacade.addAuthor(summaryInformation.getAuthor());
        }
        if (WordToHtmlUtils.isNotEmpty(summaryInformation.getKeywords())) {
            htmlDocumentFacade.addKeywords(summaryInformation.getKeywords());
        }
        if (WordToHtmlUtils.isNotEmpty(summaryInformation.getComments())) {
            htmlDocumentFacade.addDescription(summaryInformation.getComments());
        }
    }

    /** Delegates to the superclass and then runs {@link #afterProcess()}. */
    @Override
    public void processDocumentPart(HWPFDocumentCore wordDocument, Range range) {
        super.processDocumentPart(wordDocument, range);
        afterProcess();
    }

    /** Appends an image element pointing at {@code path} to {@code block}. */
    @Override
    protected void processDrawnObject(HWPFDocument doc, CharacterRun characterRun,
            OfficeDrawing officeDrawing, String path, Element block) {
        Element img = htmlDocumentFacade.createImage(path);
        block.appendChild(img);
    }

    /** Renders an auto-numbered endnote via {@link #processNoteAutonumbered}. */
    @Override
    protected void processEndnoteAutonumbered(HWPFDocument wordDocument, int noteIndex,
            Element block, Range endnoteTextRange) {
        processNoteAutonumbered(wordDocument, "end", noteIndex, block, endnoteTextRange);
    }

    /** Renders an auto-numbered footnote via {@link #processNoteAutonumbered}. */
    @Override
    protected void processFootnoteAutonumbered(HWPFDocument wordDocument, int noteIndex,
            Element block, Range footnoteTextRange) {
        processNoteAutonumbered(wordDocument, "foot", noteIndex, block, footnoteTextRange);
    }

    /** Appends a hyperlink element and renders {@code textRange} (if any) inside it. */
    @Override
    protected void processHyperlink(HWPFDocumentCore wordDocument, Element currentBlock,
            Range textRange, int currentTableLevel, String hyperlink) {
        Element basicLink = htmlDocumentFacade.createHyperlink(hyperlink);
        currentBlock.appendChild(basicLink);

        if (textRange != null) {
            processCharacters(wordDocument, currentTableLevel, textRange, basicLink);
        }
    }

    /**
     * Appends an image for {@code picture} to {@code currentBlock}, sized in
     * inches. A cropped picture is wrapped in two blocks: the outer one reserves
     * the visible area, the inner one clips the absolutely positioned image.
     *
     * @param currentBlock    element receiving the image
     * @param inlined         not used by this implementation
     * @param picture         source of dimensions, scaling and crop values
     * @param imageSourcePath value for the image's source attribute
     */
    protected void processImage(Element currentBlock, boolean inlined, Picture picture,
            String imageSourcePath) {
        final int aspectRatioX = picture.getHorizontalScalingFactor();
        final int aspectRatioY = picture.getVerticalScalingFactor();

        final float imageWidth;
        final float imageHeight;
        final float cropTop;
        final float cropBottom;
        final float cropLeft;
        final float cropRight;

        // A positive scaling factor is applied as a per-mille multiplier.
        if (aspectRatioX > 0) {
            imageWidth = picture.getDxaGoal() * aspectRatioX / 1000 / TWIPS_PER_INCH;
            cropRight = picture.getDxaCropRight() * aspectRatioX / 1000 / TWIPS_PER_INCH;
            cropLeft = picture.getDxaCropLeft() * aspectRatioX / 1000 / TWIPS_PER_INCH;
        } else {
            imageWidth = picture.getDxaGoal() / TWIPS_PER_INCH;
            cropRight = picture.getDxaCropRight() / TWIPS_PER_INCH;
            cropLeft = picture.getDxaCropLeft() / TWIPS_PER_INCH;
        }

        if (aspectRatioY > 0) {
            imageHeight = picture.getDyaGoal() * aspectRatioY / 1000 / TWIPS_PER_INCH;
            cropTop = picture.getDyaCropTop() * aspectRatioY / 1000 / TWIPS_PER_INCH;
            cropBottom = picture.getDyaCropBottom() * aspectRatioY / 1000 / TWIPS_PER_INCH;
        } else {
            imageHeight = picture.getDyaGoal() / TWIPS_PER_INCH;
            cropTop = picture.getDyaCropTop() / TWIPS_PER_INCH;
            cropBottom = picture.getDyaCropBottom() / TWIPS_PER_INCH;
        }

        Element root;
        if (cropTop != 0 || cropRight != 0 || cropBottom != 0 || cropLeft != 0) {
            float visibleWidth = Math.max(0, imageWidth - cropLeft - cropRight);
            float visibleHeight = Math.max(0, imageHeight - cropTop - cropBottom);

            root = htmlDocumentFacade.createBlock();
            htmlDocumentFacade.addStyleClass(root, "d",
                    "vertical-align:text-bottom;width:" + visibleWidth
                            + "in;height:" + visibleHeight + "in;");

            // complex
            Element inner = htmlDocumentFacade.createBlock();
            htmlDocumentFacade.addStyleClass(inner, "d",
                    "position:relative;width:" + visibleWidth + "in;height:"
                            + visibleHeight + "in;overflow:hidden;");
            root.appendChild(inner);

            Element image = htmlDocumentFacade.createImage(imageSourcePath);
            // Offsets carry an explicit unit: a unit-less non-zero CSS length is
            // invalid and would be ignored, leaving the crop unapplied.
            htmlDocumentFacade.addStyleClass(image, "i",
                    "position:absolute;left:" + (-cropLeft) + "in;top:" + (-cropTop)
                            + "in;width:" + imageWidth + "in;height:"
                            + imageHeight + "in;");
            inner.appendChild(image);
        } else {
            root = htmlDocumentFacade.createImage(imageSourcePath);
            root.setAttribute("style", "width:" + imageWidth + "in;height:"
                    + imageHeight + "in;vertical-align:text-bottom;");
        }

        currentBlock.appendChild(root);
    }

    /** Leaves an HTML comment naming the picture's suggested file instead of an image. */
    @Override
    protected void processImageWithoutPicturesManager(Element currentBlock, boolean inlined,
            Picture picture) {
        // no default implementation -- skip
        currentBlock.appendChild(htmlDocumentFacade.getDocument().createComment(
                "Image link to '" + picture.suggestFullFileName() + "' can be here"));
    }

    /** Appends a line break element to {@code block}. */
    @Override
    protected void processLineBreak(Element block, CharacterRun characterRun) {
        block.appendChild(htmlDocumentFacade.createLineBreak());
    }

    /**
     * Emits a superscript anchor for a note into {@code block} and the note body
     * (back-link plus text) into the shared notes container, which is created on
     * first use.
     *
     * @param doc           document the note belongs to
     * @param type          prefix used for ids and CSS classes ("foot" or "end")
     * @param noteIndex     zero-based note index; shown to the reader one-based
     * @param block         element that receives the anchor
     * @param noteTextRange range holding the note text
     */
    protected void processNoteAutonumbered(HWPFDocument doc, String type, int noteIndex,
            Element block, Range noteTextRange) {
        final String textIndex = String.valueOf(noteIndex + 1);
        final String textIndexClass = htmlDocumentFacade.getOrCreateCssClass(
                "a", "vertical-align:super;font-size:smaller;");
        final String forwardNoteLink = type + "note_" + textIndex;
        final String backwardNoteLink = type + "note_back_" + textIndex;

        Element anchor = htmlDocumentFacade.createHyperlink("#" + forwardNoteLink);
        anchor.setAttribute("name", backwardNoteLink);
        anchor.setAttribute("class", textIndexClass + " " + type + "noteanchor");
        anchor.setTextContent(textIndex);
        block.appendChild(anchor);

        if (notes == null) {
            notes = htmlDocumentFacade.createBlock();
            notes.setAttribute("class", "notes");
        }

        Element note = htmlDocumentFacade.createBlock();
        note.setAttribute("class", type + "note");
        notes.appendChild(note);

        Element bookmark = htmlDocumentFacade.createBookmark(forwardNoteLink);
        bookmark.setAttribute("href", "#" + backwardNoteLink);
        bookmark.setTextContent(textIndex);
        bookmark.setAttribute("class", textIndexClass + " " + type + "noteindex");
        note.appendChild(bookmark);
        note.appendChild(htmlDocumentFacade.createText(" "));

        Element span = htmlDocumentFacade.getDocument().createElement("span");
        span.setAttribute("class", type + "notetext");
        note.appendChild(span);

        // Placeholder block properties so every run inside the note states its own font.
        this.blocksProperies.push(new BlockProperies("", -1));
        try {
            processCharacters(doc, Integer.MIN_VALUE, noteTextRange, span);
        } finally {
            this.blocksProperies.pop();
        }
    }

    /** Represents a page break as a plain line break. */
    @Override
    protected void processPageBreak(HWPFDocumentCore wordDocument, Element flow) {
        flow.appendChild(htmlDocumentFacade.createLineBreak());
    }

    /** Appends an in-document link to {@code #pageref} and renders {@code textRange} inside it. */
    protected void processPageref(HWPFDocumentCore hwpfDocument, Element currentBlock,
            Range textRange, int currentTableLevel, String pageref) {
        Element basicLink = htmlDocumentFacade.createHyperlink("#" + pageref);
        currentBlock.appendChild(basicLink);

        if (textRange != null) {
            processCharacters(hwpfDocument, currentTableLevel, textRange, basicLink);
        }
    }

    /**
     * Appends a paragraph element for {@code paragraph}. The font of the first
     * character run becomes the paragraph's base font, so that runs only emit
     * font styles where they differ from it. A paragraph without character runs
     * is emitted empty and unstyled.
     */
    protected void processParagraph(HWPFDocumentCore hwpfDocument, Element parentElement,
            int currentTableLevel, Paragraph paragraph, String bulletText) {
        final Element pElement = htmlDocumentFacade.createParagraph();
        parentElement.appendChild(pElement);

        StringBuilder style = new StringBuilder();
        WordToHtmlUtils.addParagraphProperties(paragraph, style);

        final int charRuns = paragraph.numCharacterRuns();
        if (charRuns == 0) {
            return;
        }

        final String pFontName;
        final int pFontSize;
        final CharacterRun characterRun = paragraph.getCharacterRun(0);
        if (characterRun != null) {
            Triplet triplet = getCharacterRunTriplet(characterRun);
            pFontSize = characterRun.getFontSize() / 2;
            pFontName = triplet.fontName;
            WordToHtmlUtils.addFontFamily(pFontName, style);
            WordToHtmlUtils.addFontSize(pFontSize, style);
        } else {
            pFontSize = -1;
            pFontName = "";
        }
        blocksProperies.push(new BlockProperies(pFontName, pFontSize));

        try {
            if (WordToHtmlUtils.isNotEmpty(bulletText)) {
                if (bulletText.endsWith("\t")) {
                    /*
                     * We don't know how to handle all cases in HTML, but at
                     * least simplest case shall be handled
                     */
                    final float defaultTab = TWIPS_PER_INCH / 2;
                    // + 20: char have some space
                    float firstLinePosition = paragraph.getIndentFromLeft()
                            + paragraph.getFirstLineIndent() + 20;
                    float nextStop = (float) (Math.ceil(firstLinePosition / defaultTab)
                            * defaultTab);
                    final float spanMinWidth = nextStop - firstLinePosition;

                    Element span = htmlDocumentFacade.getDocument().createElement("span");
                    htmlDocumentFacade.addStyleClass(span, "s",
                            "display: inline-block; text-indent: 0; min-width: "
                                    + (spanMinWidth / TWIPS_PER_INCH) + "in;");
                    pElement.appendChild(span);

                    // Trailing tab is replaced by zero-width + no-break spaces.
                    Text textNode = htmlDocumentFacade.createText(
                            bulletText.substring(0, bulletText.length() - 1)
                                    + UNICODECHAR_ZERO_WIDTH_SPACE
                                    + UNICODECHAR_NO_BREAK_SPACE);
                    span.appendChild(textNode);
                } else {
                    // NOTE(review): the last character is dropped here even though
                    // it is not a tab; kept as in the original -- confirm intent.
                    Text textNode = htmlDocumentFacade.createText(
                            bulletText.substring(0, bulletText.length() - 1));
                    pElement.appendChild(textNode);
                }
            }

            processCharacters(hwpfDocument, currentTableLevel, paragraph, pElement);
        } finally {
            blocksProperies.pop();
        }

        if (style.length() > 0) {
            htmlDocumentFacade.addStyleClass(pElement, "p", style.toString());
        }
        WordToHtmlUtils.compactSpans(pElement);
    }

    /** Wraps a section in its own styled block appended to the body. */
    protected void processSection(HWPFDocumentCore wordDocument, Section section,
            int sectionCounter) {
        Element div = htmlDocumentFacade.createBlock();
        htmlDocumentFacade.addStyleClass(div, "d", getSectionStyle(section));
        htmlDocumentFacade.getBody().appendChild(div);

        processParagraphes(wordDocument, div, section, Integer.MIN_VALUE);
    }

    /** Applies the section style to the body itself and renders paragraphs directly into it. */
    @Override
    protected void processSingleSection(HWPFDocumentCore wordDocument, Section section) {
        htmlDocumentFacade.addStyleClass(htmlDocumentFacade.getBody(), "b",
                getSectionStyle(section));

        processParagraphes(wordDocument, htmlDocumentFacade.getBody(), section,
                Integer.MIN_VALUE);
    }
}

And here is the method that starts the file conversion:

/**
 * Converts the Word file at {@code filePath} to HTML and writes it, UTF-8
 * encoded, to {@code tempDirectory + <word file name> + ".htm"}. If that file
 * already exists it is returned without converting again.
 * <p>
 * Conversion stays best-effort: a failure is reported on stderr and the (then
 * non-existent) target file is still returned. The output stream is always
 * closed and a partially written file is removed, so a failed run neither
 * leaks a file handle nor makes later calls return a truncated file.
 *
 * @return the target HTML file; it may not exist if the conversion failed
 */
public File convertToFile() {
    // Accept Windows and UNIX separators (even mixed) regardless of host OS.
    int lastSeparator = Math.max(filePath.lastIndexOf('\\'), filePath.lastIndexOf('/'));
    String wordFileName = filePath.substring(lastSeparator + 1);

    // tempDirectory is concatenated as-is, so it has to end with a separator.
    File htmlFile = new File(tempDirectory + wordFileName + ".htm");
    if (htmlFile.exists()) {
        return htmlFile;
    }

    try {
        Document doc = DocToHtmlConverter.process(new File(filePath));

        Writer out = new OutputStreamWriter(new FileOutputStream(htmlFile), "UTF-8");
        try {
            Transformer serializer = TransformerFactory.newInstance().newTransformer();
            // TODO set encoding from a command argument
            serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
            serializer.setOutputProperty(OutputKeys.INDENT, "yes");
            serializer.setOutputProperty(OutputKeys.METHOD, "html");
            serializer.transform(new DOMSource(doc), new StreamResult(out));
            out.flush();
        } finally {
            // Release the file handle even when serialization fails.
            out.close();
        }
    } catch (Exception e) {
        e.printStackTrace();
        // Drop truncated output so the next call retries the conversion.
        if (htmlFile.exists()) {
            htmlFile.delete();
        }
    }
    return htmlFile;
}

1、资源项目源码均已通过严格测试验证,保证能够正常运行; 2、项目问题、技术讨论,可以给博主私信或留言,博主看到后会第一时间与您进行沟通; 3、本项目比较适合计算机领域相关的毕业设计课题、课程作业等使用,尤其对于人工智能、计算机科学与技术等相关专业,更为适合; 4、下载使用后,可先查看README.md或论文文件(如有),本项目仅用作交流学习参考,请切勿用于商业用途。 5、资源来自互联网采集,如有侵权,私聊博主删除。、可私 6信博主看论文后选择购买源代码。 1、资源项目源码均已通过严格测试验证,保证能够正常运行; 2、项目问题、技术讨论,可以给博主私信或留言,博主看到后会第一时间与您进行沟通; 3、本项目比较适合计算机领域相关的毕业设计课题、课程作业等使用,尤其对于人工智能、计算机科学与技术等相关专业,更为适合; 4、下载使用后,可先查看README.md或论文文件(如有),本项目仅用作交流学习参考,请切勿用于商业用途。 5、资源来自互联网采集,如有侵权,私聊博主删除。 、可私信6博主看论文后选择购买源代码。 1、资源项目源码均已通过严格测试验证,保证能够正常运行; 2、项目问题、技术讨论,可以给博主私信或留言,博主看到后会第一时间与您进行沟通; 3、本项目比较适合计算机领域相关的毕业设计课题、课程作业等使用,尤其对于人工智能、计算机科学与技术等相关专业,更为适合; 4、下载使用后,可先查看README.md或论文文件(如有),本项目仅用作交流学习参考,请切勿用于商业用途。 5、资源来自互联网采集,如有侵权,私聊博主删除。 、可私信6博主看论文后选择购买源代码。
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值