有时做数据处理的时候会用到Excel,所以我们可以将Excel文件转换为csv就可以供weka接口使用了。
需要用到的包:
weka.jar
poi-3.10.1-20140818.jar
poi-ooxml-3.10.1-20140818.jar
dom4j-1.6.1.jar
xmlbeans-2.3.0.jar
- XLS2CSV.java源码如下:
XLS2CSV.java源码如下:
package com.java.convert;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.PrintStream;
import java.util.ArrayList;
import org.apache.poi.hssf.eventusermodel.FormatTrackingHSSFListener;
import org.apache.poi.hssf.eventusermodel.HSSFEventFactory;
import org.apache.poi.hssf.eventusermodel.HSSFListener;
import org.apache.poi.hssf.eventusermodel.HSSFRequest;
import org.apache.poi.hssf.eventusermodel.MissingRecordAwareHSSFListener;
import org.apache.poi.hssf.eventusermodel.EventWorkbookBuilder.SheetRecordCollectingListener;
import org.apache.poi.hssf.eventusermodel.dummyrecord.LastCellOfRowDummyRecord;
import org.apache.poi.hssf.eventusermodel.dummyrecord.MissingCellDummyRecord;
import org.apache.poi.hssf.model.HSSFFormulaParser;
import org.apache.poi.hssf.record.BOFRecord;
import org.apache.poi.hssf.record.BlankRecord;
import org.apache.poi.hssf.record.BoolErrRecord;
import org.apache.poi.hssf.record.BoundSheetRecord;
import org.apache.poi.hssf.record.FormulaRecord;
import org.apache.poi.hssf.record.LabelRecord;
import org.apache.poi.hssf.record.LabelSSTRecord;
import org.apache.poi.hssf.record.NoteRecord;
import org.apache.poi.hssf.record.NumberRecord;
import org.apache.poi.hssf.record.RKRecord;
import org.apache.poi.hssf.record.Record;
import org.apache.poi.hssf.record.SSTRecord;
import org.apache.poi.hssf.record.StringRecord;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
public class XLS2CSV implements HSSFListener{
private int minColumns;
private POIFSFileSystem fs;
private PrintStream output;
private int lastRowNumber;
private int lastColumnNumber;
/** Should we output the formula, or the value it has? */
private boolean outputFormulaValues = true;
/** For parsing Formulas */
private SheetRecordCollectingListener workbookBuildingListener;
private HSSFWorkbook stubWorkbook;
// Records we pick up as we process
private SSTRecord sstRecord;
private FormatTrackingHSSFListener formatListener;
/** So we known which sheet we're on */
private int sheetIndex = -1;
private BoundSheetRecord[] orderedBSRs;
private ArrayList boundSheetRecords = new ArrayList();
// For handling formulas with string results
private int nextRow;
private int nextColumn;
private boolean outputNextStringRecord;
private final String OUTPUT_CHARSET = "GBK";
/**
* Creates a new XLS -> CSV converter
*
* @param fs
* The POIFSFileSystem to process
* @param output
* The PrintStream to output the CSV to
* @param minColumns
* The minimum number of columns to output, or -1 for no minimum
*/
public XLS2CSV(POIFSFileSystem fs, PrintStream output, int minColumns) {
this.fs = fs;
this.output = output;
this.minColumns = minColumns;
}
public XLS2CSV(String inputFilePath, String outputFilePath) throws Exception {
fs = new POIFSFileSystem(new FileInputStream(inputFilePath));
output = new PrintStream(outputFilePath, OUTPUT_CHARSET);
minColumns = -1;
}
/**
* Creates a new XLS -> CSV converter
*
* @param filename
* The file to process
* @param minColumns
* The minimum number of columns to output, or -1 for no minimum
* @throws IOException
* @throws FileNotFoundException
*/
public XLS2CSV(String filename, int minColumns) throws IOException,
FileNotFoundException {
this(new POIFSFileSystem(new FileInputStream(filename)), System.out,
minColumns);
}
/**
* Initiates the processing of the XLS file to CSV
*/
public void process() throws IOException {
MissingRecordAwareHSSFListener listener = new MissingRecordAwareHSSFListener(
this);
formatListener = new FormatTrackingHSSFListener(listener);
HSSFEventFactory factory = new HSSFEventFactory();
HSSFRequest request = new HSSFRequest();
if (outputFormulaValues) {
request.addListenerForAllRecords(formatListener);
} else {
workbookBuildingListener = new SheetRecordCollectingListener(
formatListener);
request.addListenerForAllRecords(workbookBuildingListener);
}
factory.processWorkbookEvents(request, fs);
}
/**
* Main HSSFListener method, processes events, and outputs the CSV as the
* file is processed.
*/
public void processRecord(Record record) {
int thisRow = -1;
int thisColumn = -1;
String thisStr = null;
switch (record.getSid()) {
case BoundSheetRecord.sid:
//boundSheetRecords.add(record);
break;
case BOFRecord.sid:
BOFRecord br = (BOFRecord) record;
if (br.getType() == BOFRecord.TYPE_WORKSHEET) {
// Create sub workbook if required
if (workbookBuildingListener != null && stubWorkbook == null) {
stubWorkbook = workbookBuildingListener
.getStubHSSFWorkbook();
}
// Output the worksheet name
// Works by ordering the BSRs by the location of
// their BOFRecords, and then knowing that we
// process BOFRecords in byte offset order
sheetIndex++;
if (orderedBSRs == null) {
orderedBSRs = BoundSheetRecord
.orderByBofPosition(boundSheetRecords);
}
//output.println();
//不需要分页
//output.println(orderedBSRs[sheetIndex].getSheetname() + " ["
// + (sheetIndex + 1) + "]:");
}
break;
case SSTRecord.sid:
sstRecord = (SSTRecord) record;
break;
case BlankRecord.sid:
BlankRecord brec = (BlankRecord) record;
thisRow = brec.getRow();
thisColumn = brec.getColumn();
thisStr = "";
break;
case BoolErrRecord.sid:
BoolErrRecord berec = (BoolErrRecord) record;
thisRow = berec.getRow();
thisColumn = berec.getColumn();
thisStr = "";
break;
case FormulaRecord.sid:
FormulaRecord frec = (FormulaRecord) record;
thisRow = frec.getRow();
thisColumn = frec.getColumn();
if (outputFormulaValues) {
if (Double.isNaN(frec.getValue())) {
// Formula result is a string
// This is stored in the next record
outputNextStringRecord = true;
nextRow = frec.getRow();
nextColumn = frec.getColumn();
} else {
thisStr = formatListener.formatNumberDateCell(frec);
}
} else {
thisStr = '"' + HSSFFormulaParser.toFormulaString(stubWorkbook,
frec.getParsedExpression()) + '"';
}
break;
case StringRecord.sid:
if (outputNextStringRecord) {
// String for formula
StringRecord srec = (StringRecord) record;
thisStr = srec.getString();
thisRow = nextRow;
thisColumn = nextColumn;
outputNextStringRecord = false;
}
break;
case LabelRecord.sid:
LabelRecord lrec = (LabelRecord) record;
thisRow = lrec.getRow();
thisColumn = lrec.getColumn();
thisStr = '"' + lrec.getValue() + '"';
break;
case LabelSSTRecord.sid:
LabelSSTRecord lsrec = (LabelSSTRecord) record;
thisRow = lsrec.getRow();
thisColumn = lsrec.getColumn();
if (sstRecord == null) {
thisStr = '"' + "(No SST Record, can't identify string)" + '"';
} else {
thisStr = '"' + sstRecord.getString(lsrec.getSSTIndex())
.toString() + '"';
}
break;
case NoteRecord.sid:
NoteRecord nrec = (NoteRecord) record;
thisRow = nrec.getRow();
thisColumn = nrec.getColumn();
// TODO: Find object to match nrec.getShapeId()
thisStr = '"' + "(TODO)" + '"';
break;
case NumberRecord.sid:
NumberRecord numrec = (NumberRecord) record;
thisRow = numrec.getRow();
thisColumn = numrec.getColumn();
// Format
thisStr = formatListener.formatNumberDateCell(numrec);
break;
case RKRecord.sid:
RKRecord rkrec = (RKRecord) record;
thisRow = rkrec.getRow();
thisColumn = rkrec.getColumn();
thisStr = '"' + "(TODO)" + '"';
break;
default:
break;
}
// Handle new row
if (thisRow != -1 && thisRow != lastRowNumber) {
lastColumnNumber = -1;
}
// Handle missing column
if (record instanceof MissingCellDummyRecord) {
MissingCellDummyRecord mc = (MissingCellDummyRecord) record;
thisRow = mc.getRow();
thisColumn = mc.getColumn();
thisStr = "";
}
// If we got something to print out, do so
if (thisStr != null) {
if (thisColumn > 0) {
output.print(',');
}
output.print(thisStr);
}
// Update column and row count
if (thisRow > -1)
lastRowNumber = thisRow;
if (thisColumn > -1)
lastColumnNumber = thisColumn;
// Handle end of row
if (record instanceof LastCellOfRowDummyRecord) {
// Print out any missing commas if needed
if (minColumns > 0) {
// Columns are 0 based
if (lastColumnNumber == -1) {
lastColumnNumber = 0;
}
for (int i = lastColumnNumber; i < (minColumns); i++) {
output.print(',');
}
}
// We're onto a new row
lastColumnNumber = -1;
// End the row
output.println();
}
}
public static void main(String[] args) throws Exception {
XLS2CSV xls2csv = new XLS2CSV("data\\联系方式.xls","data\\out.csv");
xls2csv.process();
}
}
- XLSX2CSV.java源码如下
XLSX2CSV.java源码如下
package com.java.getdata;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import com.java.convert.XLS2CSV;
import com.java.convert.XLSX2CSV;
import weka.core.Instances;
import weka.core.converters.ArffLoader;
import weka.core.converters.ConverterUtils.DataSource;
//三种获得数据方式,arff,csv,excel(xls,xlsx)
public class GetData {
public static void main(String[] args) throws Exception{
String fileName1="E:\\Program Files\\Weka-3-6\\data\\cpu.arff";
String fileName2="data\\out.csv";
System.out.println(getDataFromExcel("data\\data.xlsx"));
//System.out.println(getDataFromXlsx(fileName1));
//System.out.println(getDataFromCsv(fileName2));
}
public static Instances getDataFromArff(String fileName)throws Exception
{
//3.5版本之后方法
ArffLoader loader=new ArffLoader();
loader.setFile(new File(fileName));
Instances data=loader.getDataSet();
//3.5之前的方法
//Instances data=new Instances(new BufferedReader(new FileReader(fileName)));
if (data.classIndex()==-1) {
data.setClassIndex(data.numAttributes()-1);
}
return data;
}
public static Instances getDataFromCsv(String fileName)throws Exception {
DataSource source=new DataSource(fileName);
Instances data=source.getDataSet();
if (data.classIndex()==-1) {
data.setClassIndex(data.numAttributes()-1);
}
return data;
}
private static Instances getDataFromXls(String fileName)throws Exception
{
XLS2CSV xls2csv = new XLS2CSV(fileName,"data\\out.csv");
xls2csv.process();
Instances data=getDataFromCsv("data\\out.csv");
return data;
}
private static Instances getDataFromXlsx(String fileName)throws Exception
{
XLSX2CSV xlsx2csv = new XLSX2CSV(fileName,"data\\outs.csv");
xlsx2csv.process();
Instances data=getDataFromCsv("data\\outs.csv");
return data;
}
public static Instances getDataFromExcel(String fileName)throws Exception
{
String extension = fileName.lastIndexOf(".")==-1?"":fileName.substring(fileName.lastIndexOf(".")+1);
if("xls".equals(extension)){
return getDataFromXls(fileName);
}else if("xlsx".equals(extension)){
return getDataFromXlsx(fileName);
}else{
throw new Exception("不支持的文件类型");
}
}
}