<!-- Apache POI modules; every artifact below is pinned to the same version (4.1.2). -->
<!-- Core POI module (presumably supplies the org.apache.poi.hssf / hpsf / poifs packages imported by the Java code; confirm against the POI component map). -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>4.1.2</version>
</dependency>
<!-- NOTE(review): nothing in the visible Java code references poi-excelant; confirm it is still needed. -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-excelant</artifactId>
<version>4.1.2</version>
</dependency>
<!-- OOXML support (presumably supplies the org.apache.poi.xssf / xwpf / xslf packages used for .xlsx, .docx and .pptx). -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>4.1.2</version>
</dependency>
<!-- NOTE(review): poi-ooxml-schemas is normally resolved transitively through poi-ooxml; confirm whether the explicit declaration is required. -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml-schemas</artifactId>
<version>4.1.2</version>
</dependency>
<!-- Scratchpad module (presumably supplies the org.apache.poi.hwpf / hslf packages used for .doc and .ppt). This is the only entry with an explicit scope. -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>4.1.2</version>
<scope>compile</scope>
</dependency>
获取不同类型文件的字数
import com.spire.doc.Document;
import com.spire.doc.FileFormat;
import com.spire.pdf.PdfDocument;
import com.spire.pdf.PdfPageBase;
import org.apache.commons.lang.StringUtils;
import org.apache.fop.svg.PDFTextElementBridge;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.PDFTextStripperByArea;
import org.apache.poi.hpsf.DocumentSummaryInformation;
import org.apache.poi.hslf.usermodel.*;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.hwpf.usermodel.HeaderStories;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.ss.usermodel.*;
import org.apache.poi.xslf.usermodel.*;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFRun;
import org.springframework.web.multipart.MultipartFile;
import java.io.*;
import java.util.*;
import java.util.Objects;
/**
 * Static helpers that count the characters contained in uploaded documents
 * (.doc, .docx, .pdf, .txt, .xls, .xlsx, .ppt, .pptx).
 *
 * <p>Two families of helpers live here: the Spire-based ones ({@link #docCount},
 * {@link #wordCount(String, FileFormat)}, {@link #pdfCount}, {@link #docToTxt}) and the
 * POI/PDFBox-based ones reached through {@link #wordCount(MultipartFile)}.
 *
 * <p>The counting methods that catch exceptions are best-effort: a failure is logged and the
 * count accumulated so far (usually 0) is returned instead of propagating the error.
 */
public class DocUtil {

    /** Receives the failures that the best-effort counting methods swallow. */
    private static final java.util.logging.Logger LOGGER =
            java.util.logging.Logger.getLogger(DocUtil.class.getName());

    /** Document inserted by {@link #sectionBreakAndPageBreak(InputStream, String)}. */
    private static final String DEFAULT_INSERT_FILE =
            "E:\\common_data\\translate\\document\\20221101\\111.docx";

    /**
     * Counts the characters of an uploaded file, dispatching on its extension.
     *
     * <p>For .doc/.docx the upload is first converted to a text file next to
     * {@code remoteFile} (see {@link #docToTxt}); that intermediate file is left on disk.
     * For .pdf/.txt the already stored {@code remoteFile} is read directly.
     *
     * @param multipartFile the uploaded file whose stream is converted for Word formats
     * @param remoteFile    the stored copy of the upload
     * @param fileFormat    the extension including the leading dot (e.g. {@code ".docx"});
     *                      matched case-insensitively
     * @return the character count, or 0 for unsupported formats or when processing fails
     */
    public static Integer docCount(MultipartFile multipartFile, File remoteFile, String fileFormat) {
        Integer num = 0;
        String format = fileFormat == null ? "" : fileFormat.toLowerCase(Locale.ROOT);
        // try-with-resources guarantees the upload stream is released on every path.
        try (InputStream uploadStream = FileUtil.fileToStream(multipartFile)) {
            switch (format) {
                case ".doc":
                    num = txtCount(new File(docToTxt(uploadStream, FileFormat.Doc, remoteFile.getPath())));
                    break;
                case ".docx":
                    num = txtCount(new File(docToTxt(uploadStream, FileFormat.Docx, remoteFile.getPath())));
                    break;
                case ".pdf":
                    num = pdfCount(remoteFile);
                    break;
                case ".txt":
                    num = txtCount(remoteFile);
                    break;
                default:
                    num = 0;
                    break;
            }
        } catch (Exception e) {
            logFailure("count characters of " + format + " upload", e);
        }
        return num;
    }

    /**
     * Reads the character count stored in a Word document's built-in properties.
     *
     * @param filePath   path of the document to load
     * @param fileFormat Spire format of the document
     * @return the value of the document's built-in character-count property
     */
    public static Integer wordCount(String filePath, FileFormat fileFormat) {
        Document document = new Document();
        try {
            document.loadFromFile(filePath, fileFormat);
            return document.getBuiltinDocumentProperties().getCharCount();
        } finally {
            // Release the document even when loading fails.
            document.close();
        }
    }

    /**
     * Counts the characters of the text extracted from every page of a PDF.
     *
     * @param file the PDF file to read
     * @return the length of the concatenated page texts
     */
    public static Integer pdfCount(File file) {
        String path = file.getPath();
        PdfDocument pdfDocument = new PdfDocument();
        try {
            pdfDocument.loadFromFile(path);
            StringBuilder extracted = new StringBuilder();
            int pageCount = pdfDocument.getPages().getCount();
            for (int i = 0; i < pageCount; i++) {
                PdfPageBase page = pdfDocument.getPages().get(i);
                extracted.append(page.extractText(true));
            }
            return extracted.length();
        } finally {
            // Release the PDF even when loading or extraction fails.
            pdfDocument.close();
        }
    }

    /**
     * Counts the characters of a text file, ignoring CR and LF.
     *
     * <p>The file is decoded with the platform default charset (behavior of {@link FileReader}).
     *
     * @param file the text file to read
     * @return the number of characters other than {@code '\n'} and {@code '\r'}; the count
     *         reached so far when reading fails
     */
    public static Integer txtCount(File file) {
        int charCount = 0;
        try (Reader reader = new BufferedReader(new FileReader(file))) {
            int next;
            while ((next = reader.read()) != -1) {
                if (next != '\n' && next != '\r') {
                    charCount++;
                }
            }
        } catch (Exception e) {
            logFailure("read text file " + file, e);
        }
        return charCount;
    }

    /**
     * Converts a Word document to a plain-text file.
     *
     * @param inputStream stream of the Word document
     * @param fileFormat  Spire format of the stream content
     * @param tofile      path whose final extension is replaced by {@code .txt} to form the
     *                    output path
     * @return the path of the written text file
     */
    public static String docToTxt(InputStream inputStream, FileFormat fileFormat, String tofile) {
        Document document = new Document();
        try {
            document.loadFromStream(inputStream, fileFormat);
            String target = replaceExtension(tofile, ".txt");
            document.saveToFile(target, FileFormat.Txt);
            return target;
        } finally {
            document.close();
        }
    }

    /**
     * Appends the default sample document to a .docx stream and saves the result as
     * {@code <tofile without extension>-copy.docx}.
     *
     * @param inputStream stream of the .docx document
     * @param tofile      path used to derive the output path
     * @return the path of the written document
     */
    public static String sectionBreakAndPageBreak(InputStream inputStream, String tofile) {
        return sectionBreakAndPageBreak(inputStream, tofile, DEFAULT_INSERT_FILE);
    }

    /**
     * Appends the document at {@code insertFilePath} to a .docx stream and saves the result as
     * {@code <tofile without extension>-copy.docx}.
     *
     * @param inputStream    stream of the .docx document
     * @param tofile         path used to derive the output path
     * @param insertFilePath path of the .docx document to insert
     * @return the path of the written document
     */
    public static String sectionBreakAndPageBreak(InputStream inputStream, String tofile, String insertFilePath) {
        Document document = new Document();
        try {
            document.loadFromStream(inputStream, FileFormat.Docx);
            document.insertTextFromFile(insertFilePath, FileFormat.Docx_2013);
            String target = replaceExtension(tofile, "-copy.docx");
            document.saveToFile(target, FileFormat.Docx_2013);
            return target;
        } finally {
            document.close();
        }
    }

    /**
     * Manual smoke test: counts the characters of {@code D:\1.ppt}.
     *
     * @param args unused
     * @throws Exception if the file cannot be opened or counted
     */
    public static void main(String[] args) throws Exception {
        String fileName = "1.ppt";
        try (FileInputStream input = new FileInputStream("D:\\" + fileName)) {
            wordCount(FileUtil.streamToFile(input, fileName));
        }
    }

    /**
     * Counts the characters of an uploaded file, dispatching on the extension of its original
     * file name (matched case-insensitively, without a leading dot).
     *
     * @param file the uploaded file
     * @return the character count; 0 for unsupported extensions or when parsing fails
     * @throws Exception if the upload's input stream cannot be obtained
     */
    public static Integer wordCount(MultipartFile file) throws Exception {
        Integer count = 0;
        String rawExt = com.sunther.idb.file.FileUtil.getFileExt(file.getOriginalFilename());
        String fileExt = rawExt == null ? "" : rawExt.toLowerCase(Locale.ROOT);
        switch (fileExt) {
            case "doc":
                count = getCountByDoc(file.getInputStream());
                break;
            case "docx":
                count = getCountByDocx(file.getInputStream());
                break;
            case "pdf":
                count = getCountByPdf(file.getInputStream());
                break;
            case "txt":
                count = getCountByTxt(file.getInputStream());
                break;
            case "xls":
                count = getCountByXls(file.getInputStream());
                break;
            case "xlsx":
                count = getCountByXlsx(file.getInputStream());
                break;
            case "ppt":
                count = getCountByPPT(file.getInputStream());
                break;
            case "pptx":
                count = getCountByPPTX(file.getInputStream());
                break;
            default:
                break;
        }
        System.out.println("文章总字数:"+ count);
        return count;
    }

    /**
     * Sums the trimmed paragraph lengths of a binary .doc document. The stream is always closed.
     *
     * @param is stream of the .doc document
     * @return the character count, or the count reached so far when parsing fails
     */
    private static Integer getCountByDoc(InputStream is) {
        int count = 0;
        try (InputStream in = is;
             POIFSFileSystem fs = new POIFSFileSystem(in);
             HWPFDocument doc = new HWPFDocument(fs);
             WordExtractor extractor = new WordExtractor(doc)) {
            for (String paragraph : extractor.getParagraphText()) {
                count += paragraph.trim().length();
            }
        } catch (Exception e) {
            logFailure("count characters of .doc document", e);
        }
        return count;
    }

    /**
     * Sums the trimmed run lengths of the paragraphs returned by
     * {@link XWPFDocument#getParagraphs()}. The stream is always closed.
     *
     * @param is stream of the .docx document
     * @return the character count, or the count reached so far when parsing fails
     */
    private static Integer getCountByDocx(InputStream is) {
        int count = 0;
        try (InputStream in = is; XWPFDocument docx = new XWPFDocument(in)) {
            for (XWPFParagraph paragraph : docx.getParagraphs()) {
                for (XWPFRun run : paragraph.getRuns()) {
                    count += run.toString().trim().length();
                }
            }
        } catch (Exception e) {
            logFailure("count characters of .docx document", e);
        }
        return count;
    }

    /**
     * Sums the trimmed line lengths of a text stream, decoded with the platform default
     * charset. The stream is always closed.
     *
     * @param is stream of the text file
     * @return the character count, or the count reached so far when reading fails
     */
    private static Integer getCountByTxt(InputStream is) {
        int count = 0;
        // Closing the scanner also closes the underlying stream.
        try (Scanner scanner = new Scanner(is)) {
            while (scanner.hasNextLine()) {
                count += scanner.nextLine().trim().length();
            }
        } catch (Exception e) {
            logFailure("count characters of text stream", e);
        }
        return count;
    }

    /**
     * Counts the cell text of the first sheet of a binary .xls workbook. The stream is always
     * closed.
     *
     * @param is stream of the .xls workbook
     * @return the character count, or 0 when parsing fails
     */
    private static Integer getCountByXls(InputStream is) {
        int count = 0;
        try (InputStream in = is;
             POIFSFileSystem fs = new POIFSFileSystem(in);
             HSSFWorkbook workbook = new HSSFWorkbook(fs)) {
            count = countFirstSheet(workbook);
        } catch (Exception e) {
            logFailure("count characters of .xls workbook", e);
        }
        return count;
    }

    /**
     * Counts the cell text of the first sheet of an .xlsx workbook. The stream is always closed.
     *
     * @param is stream of the .xlsx workbook
     * @return the character count, or 0 when parsing fails
     */
    private static Integer getCountByXlsx(InputStream is) {
        int count = 0;
        try (InputStream in = is; Workbook workbook = new XSSFWorkbook(in)) {
            count = countFirstSheet(workbook);
        } catch (Exception e) {
            logFailure("count characters of .xlsx workbook", e);
        }
        return count;
    }

    /**
     * Sums {@link #getCellLength} over every cell of the workbook's first sheet.
     * NOTE(review): only sheet 0 is counted, as before; confirm whether other sheets should be.
     */
    private static int countFirstSheet(Workbook workbook) {
        // One formatter per workbook: it caches formats internally.
        DataFormatter formatter = new DataFormatter();
        int total = 0;
        for (Row row : workbook.getSheetAt(0)) {
            for (Cell cell : row) {
                total += getCellLength(cell, formatter);
            }
        }
        return total;
    }

    /**
     * Returns the number of characters a cell contributes to the count.
     *
     * <p>String cells contribute their trimmed length, boolean cells contribute 1, and numeric
     * cells contribute the length of their formatted (displayed) text, so that {@code 1} is
     * not counted as {@code "1.0"}. All other cell types (blank, formula, error) contribute 0.
     *
     * @param cell      the cell to measure
     * @param formatter formatter used to render numeric cells
     * @return the character count of the cell
     */
    private static Integer getCellLength(Cell cell, DataFormatter formatter) {
        switch (cell.getCellType()) {
            case STRING:
                return cell.getStringCellValue().trim().length();
            case NUMERIC:
                return formatter.formatCellValue(cell).trim().length();
            case BOOLEAN:
                return 1;
            default:
                return 0;
        }
    }

    /**
     * Sums the line lengths of the text extracted from a PDF (line terminators excluded).
     * The stream and the PDF document are always closed.
     *
     * @param is stream of the PDF
     * @return the character count, or the count reached so far when parsing fails
     */
    public static Integer getCountByPdf(InputStream is) {
        int countNum = 0;
        try (InputStream in = is; PDDocument pdDocument = PDDocument.load(in)) {
            String text = new PDFTextStripper().getText(pdDocument);
            for (String line : text.split("\\r?\\n")) {
                countNum += line.length();
            }
        } catch (Exception e) {
            logFailure("count characters of PDF", e);
        }
        return countNum;
    }

    /**
     * Counts the text of the top-level text shapes and table cells on every slide of a binary
     * .ppt slide show. The stream is always closed.
     *
     * @param is stream of the .ppt slide show
     * @return the character count, or the count reached so far when parsing fails
     */
    public static Integer getCountByPPT(InputStream is) {
        int countNum = 0;
        try (InputStream in = is; HSLFSlideShow slideShow = new HSLFSlideShow(in)) {
            for (HSLFSlide slide : slideShow.getSlides()) {
                for (HSLFShape shape : slide.getShapes()) {
                    if (shape instanceof HSLFTextShape) {
                        countNum += countVisibleText(((HSLFTextShape) shape).getText());
                    }
                    if (shape instanceof HSLFTable) {
                        HSLFTable table = (HSLFTable) shape;
                        int rows = table.getNumberOfRows();
                        int columns = table.getNumberOfColumns();
                        for (int r = 0; r < rows; r++) {
                            for (int c = 0; c < columns; c++) {
                                HSLFTableCell cell = table.getCell(r, c);
                                // getCell may yield null (e.g. merged cells); skip instead of
                                // aborting the whole count with a swallowed NPE.
                                if (cell != null) {
                                    countNum += countVisibleText(cell.getText());
                                }
                            }
                        }
                    }
                }
            }
        } catch (Exception e) {
            logFailure("count characters of .ppt slide show", e);
        }
        return countNum;
    }

    /**
     * Counts the text of the top-level text shapes and table cells on every slide of a .pptx
     * slide show. The stream is always closed.
     *
     * @param is stream of the .pptx slide show
     * @return the character count, or the count reached so far when parsing fails
     */
    public static Integer getCountByPPTX(InputStream is) {
        int countNum = 0;
        try (InputStream in = is; XMLSlideShow slideShow = new XMLSlideShow(in)) {
            for (XSLFSlide slide : slideShow.getSlides()) {
                for (XSLFShape shape : slide.getShapes()) {
                    if (shape instanceof XSLFTextShape) {
                        countNum += countVisibleText(((XSLFTextShape) shape).getText());
                    }
                    if (shape instanceof XSLFTable) {
                        for (XSLFTableRow row : ((XSLFTable) shape).getRows()) {
                            for (XSLFTableCell cell : row.getCells()) {
                                countNum += countVisibleText(cell.getText());
                            }
                        }
                    }
                }
            }
        } catch (Exception e) {
            logFailure("count characters of .pptx slide show", e);
        }
        return countNum;
    }

    /** Returns the trimmed length of {@code text}, or 0 when it is null or blank. */
    private static int countVisibleText(String text) {
        if (text == null) {
            return 0;
        }
        String trimmed = text.trim();
        return StringUtils.isNotBlank(trimmed) ? trimmed.length() : 0;
    }

    /**
     * Replaces the final extension of the file name in {@code path} with {@code newSuffix};
     * appends the suffix when the file name has no extension. Dots in directory names are
     * left untouched.
     */
    private static String replaceExtension(String path, String newSuffix) {
        int lastSeparator = Math.max(path.lastIndexOf('/'), path.lastIndexOf('\\'));
        int lastDot = path.lastIndexOf('.');
        String base = lastDot > lastSeparator ? path.substring(0, lastDot) : path;
        return base + newSuffix;
    }

    /** Logs a swallowed failure with its stack trace. */
    private static void logFailure(String action, Exception e) {
        LOGGER.log(java.util.logging.Level.WARNING, "Failed to " + action, e);
    }
}
合并单元格
if (null != mergeParams && mergeParams.size() > 0){
for (List<Integer> list : mergeParams) {
sheet.addMergedRegion(new CellRangeAddress(list.get(0), list.get(1), list.get(2), list.get(3)));
}
}