解析大数据量文本

最新推荐文章于 2022-07-23 22:29:15 发布

程程程住气

最新推荐文章于 2022-07-23 22:29:15 发布

阅读量2.1k

点赞数

--------------------------------------------------------------------我不懂什么坚持，只是死撑而已

解析大数据量文本

POI解析xls

/**
 * Streaming reader for legacy {@code .xls} workbooks built on the POI HSSF
 * event API. Records are delivered one at a time to
 * {@link #processRecord(Record)}; the text of each captured cell is collected
 * into a per-row list that is handed to
 * {@link #optRows(int, int, List)} when the row ends.
 *
 * <p>Only label, shared-string label, number and missing-cell records are
 * stored in the row list. Blank, boolean/error, formula, note and RK records
 * are not stored; whatever text they produce is written to the print stream
 * (standard output) instead.
 *
 * <p>Instances keep parsing state in fields and are not designed for
 * concurrent use.
 */
public abstract class HxlsAbstract implements HSSFListener {
    /** Placeholder stored for empty or missing cells. */
    private static final String EMPTY_CELL = " ";

    private POIFSFileSystem fs;
    private PrintStream output;

    /** Should we output the formula, or the value it has? */
    private boolean outputFormulaValues = true;

    /** For parsing formulas. */
    private SheetRecordCollectingListener workbookBuildingListener;
    private HSSFWorkbook stubWorkbook;

    // Records we pick up as we process
    private SSTRecord sstRecord;
    private FormatTrackingHSSFListener formatListener;

    /** So we know which sheet we're on. */
    private int sheetIndex = -1;
    private BoundSheetRecord[] orderedBSRs;
    private List<BoundSheetRecord> boundSheetRecords = new ArrayList<BoundSheetRecord>();

    // For handling formulas with string results
    private int nextColumn;
    private boolean outputNextStringRecord;

    private int curRow;
    private List<String> rowlist;
    @SuppressWarnings("unused")
    private String sheetName;

    public HxlsAbstract() {
        super();
        this.output = System.out;
        this.curRow = 0;
        this.rowlist = new ArrayList<String>();
    }

    /**
     * Row callback, invoked once at the end of every row.
     *
     * @param sheetIndex zero-based index of the sheet being read
     * @param curRow     row index taken from the last captured cell of the row
     * @param rowlist    captured cell texts; the same list instance is cleared
     *                   and reused after this call returns, so copy it if it
     *                   must be kept
     * @throws SQLException if the implementation fails while persisting the row
     */
    public abstract void optRows(int sheetIndex, int curRow, List<String> rowlist) throws SQLException;

    /**
     * Reads the whole workbook at {@code filename}, firing
     * {@link #optRows(int, int, List)} for each row.
     *
     * <p>All parsing state is reset first so that one instance can be used for
     * several files in sequence; previously the sheet counter and the cached
     * sheet records of an earlier file leaked into the next run.
     *
     * @param filename path of the {@code .xls} file
     * @throws IOException if the file cannot be opened or read
     */
    public void process(String filename) throws IOException {
        resetState();

        // NOTE(review): POIFSFileSystem(InputStream) is documented to read the
        // stream to EOF and close it; confirm for the POI version in use.
        this.fs = new POIFSFileSystem(new FileInputStream(filename));
        MissingRecordAwareHSSFListener listener = new MissingRecordAwareHSSFListener(this);
        formatListener = new FormatTrackingHSSFListener(listener);

        HSSFEventFactory factory = new HSSFEventFactory();
        HSSFRequest request = new HSSFRequest();

        if (outputFormulaValues) {
            request.addListenerForAllRecords(formatListener);
        } else {
            workbookBuildingListener = new SheetRecordCollectingListener(formatListener);
            request.addListenerForAllRecords(workbookBuildingListener);
        }

        factory.processWorkbookEvents(request, fs);
    }

    /** Clears everything a previous {@link #process(String)} run left behind. */
    private void resetState() {
        sheetIndex = -1;
        orderedBSRs = null;
        boundSheetRecords.clear();
        sstRecord = null;
        stubWorkbook = null;
        workbookBuildingListener = null;
        outputNextStringRecord = false;
        curRow = 0;
        rowlist.clear();
    }

    /**
     * Stores {@code value} at {@code column} in the current row. Columns that
     * were skipped (cells whose record type is not captured) are first filled
     * with the empty-cell placeholder; without this padding
     * {@code List.add(int, E)} throws {@code IndexOutOfBoundsException} as soon
     * as the column index exceeds the list size.
     */
    private void putCell(int column, String value) {
        while (rowlist.size() < column) {
            rowlist.add(EMPTY_CELL);
        }
        rowlist.add(column, value);
    }

    /** Trims {@code raw} and maps an empty result to the empty-cell placeholder. */
    private static String normalize(String raw) {
        String trimmed = raw.trim();
        return trimmed.equals("") ? EMPTY_CELL : trimmed;
    }

    /**
     * HSSFListener callback: handles a single workbook record.
     *
     * @param record the record delivered by the event factory
     */
    public void processRecord(Record record) {
        int thisColumn = -1;
        String thisStr = null;

        switch (record.getSid()) {
        case BoundSheetRecord.sid:
            boundSheetRecords.add((BoundSheetRecord) record);
            break;
        case BOFRecord.sid:
            BOFRecord br = (BOFRecord) record;
            if (br.getType() == BOFRecord.TYPE_WORKSHEET) {
                // Create sub workbook if required
                if (workbookBuildingListener != null && stubWorkbook == null) {
                    stubWorkbook = workbookBuildingListener.getStubHSSFWorkbook();
                }

                // Works by ordering the BSRs by the location of
                // their BOFRecords, and then knowing that we
                // process BOFRecords in byte offset order
                sheetIndex++;
                if (orderedBSRs == null) {
                    orderedBSRs = BoundSheetRecord.orderByBofPosition(boundSheetRecords);
                }
                sheetName = orderedBSRs[sheetIndex].getSheetname();
            }
            break;

        case SSTRecord.sid:
            sstRecord = (SSTRecord) record;
            break;

        case BlankRecord.sid:
            thisColumn = ((BlankRecord) record).getColumn();
            thisStr = "";
            break;
        case BoolErrRecord.sid:
            thisColumn = ((BoolErrRecord) record).getColumn();
            thisStr = "";
            break;

        case FormulaRecord.sid:
            FormulaRecord frec = (FormulaRecord) record;
            thisColumn = frec.getColumn();

            if (outputFormulaValues) {
                if (Double.isNaN(frec.getValue())) {
                    // Formula result is a string
                    // This is stored in the next record
                    outputNextStringRecord = true;
                    nextColumn = frec.getColumn();
                } else {
                    thisStr = formatListener.formatNumberDateCell(frec);
                }
            } else {
                thisStr = '"' + HSSFFormulaParser.toFormulaString(stubWorkbook,
                        frec.getParsedExpression()) + '"';
            }
            break;
        case StringRecord.sid:
            if (outputNextStringRecord) {
                // String for formula
                StringRecord srec = (StringRecord) record;
                thisStr = srec.getString();
                thisColumn = nextColumn;
                outputNextStringRecord = false;
            }
            break;

        case LabelRecord.sid:
            LabelRecord lrec = (LabelRecord) record;
            curRow = lrec.getRow();
            thisColumn = lrec.getColumn();
            putCell(thisColumn, normalize(lrec.getValue()));
            break;
        case LabelSSTRecord.sid:
            LabelSSTRecord lsrec = (LabelSSTRecord) record;
            curRow = lsrec.getRow();
            thisColumn = lsrec.getColumn();
            if (sstRecord == null) {
                putCell(thisColumn, EMPTY_CELL);
            } else {
                putCell(thisColumn,
                        normalize(sstRecord.getString(lsrec.getSSTIndex()).toString()));
            }
            break;
        case NoteRecord.sid:
            thisColumn = ((NoteRecord) record).getColumn();
            // TODO: Find object to match nrec.getShapeId()
            thisStr = '"' + "(TODO)" + '"';
            break;
        case NumberRecord.sid:
            NumberRecord numrec = (NumberRecord) record;
            curRow = numrec.getRow();
            thisColumn = numrec.getColumn();
            // Rendered through the format listener
            putCell(thisColumn, normalize(formatListener.formatNumberDateCell(numrec)));
            break;
        case RKRecord.sid:
            thisColumn = ((RKRecord) record).getColumn();
            thisStr = '"' + "(TODO)" + '"';
            break;
        default:
            break;
        }

        // Cell that is absent from the file: keep the column with a placeholder
        if (record instanceof MissingCellDummyRecord) {
            MissingCellDummyRecord mc = (MissingCellDummyRecord) record;
            curRow = mc.getRow();
            thisColumn = mc.getColumn();
            putCell(thisColumn, EMPTY_CELL);
        }

        // Anything printable that was not stored in the row goes to the stream
        if (thisStr != null) {
            if (thisColumn > 0) {
                output.print(',');
            }
            output.print(thisStr);
        }

        // End of row: hand the collected cells to the subclass
        if (record instanceof LastCellOfRowDummyRecord) {
            try {
                optRows(sheetIndex, curRow, rowlist);
            } catch (SQLException e) {
                // This listener method cannot throw checked exceptions; the
                // failure is reported and parsing continues with the next row.
                e.printStackTrace();
            }
            rowlist.clear();
        }
    }
}

POI解析xlsx

/**
 * Streaming reader for {@code .xlsx} workbooks: each sheet's XML is parsed
 * with SAX and this class, acting as the content handler, collects the cell
 * values of a row and passes them to {@link #optRow(int, int, List)} when the
 * {@code row} element closes.
 *
 * <p>Instances keep parsing state in fields and are not designed for
 * concurrent use.
 */
public abstract class HxlsxAbstract extends DefaultHandler {

    /** Placeholder stored for empty cells and for skipped columns. */
    private static final String EMPTY_CELL = " ";

    private SharedStringsTable sst;
    private String lastContents = "";
    private boolean nextIsString;

    private int sheetIndex = -1;

    private List<String> rowlist = new ArrayList<String>();

    private int curRow = 0;
    private int curCol = 0;

    /**
     * Reads only the sheet whose relationship id is {@code rId1}. Rows are
     * reported with sheet index 0 and row numbers starting at 0.
     *
     * @param path path of the {@code .xlsx} file
     * @throws Exception if the package cannot be opened or the sheet XML
     *                   cannot be parsed
     */
    public void readOneSheet(String path) throws Exception {
        OPCPackage pkg = OPCPackage.open(path);
        try {
            XSSFReader r = new XSSFReader(pkg);
            XMLReader parser = fetchSheetParser(r.getSharedStringsTable());

            sheetIndex = 0;
            curRow = 0;
            parseSheet(parser, r.getSheet("rId1"));
        } finally {
            // Release the file without writing anything back to it.
            pkg.revert();
        }
    }

    /**
     * Reads every sheet of the workbook in the order returned by the reader.
     * The sheet index starts at 0 on every call, and the row counter restarts
     * at 0 for each sheet.
     *
     * @param path path of the {@code .xlsx} file
     * @throws Exception if the package cannot be opened or a sheet cannot be
     *                   parsed
     */
    public void process(String path) throws Exception {
        OPCPackage pkg = OPCPackage.open(path);
        try {
            XSSFReader r = new XSSFReader(pkg);
            XMLReader parser = fetchSheetParser(r.getSharedStringsTable());

            sheetIndex = -1;
            Iterator<InputStream> sheets = r.getSheetsData();
            while (sheets.hasNext()) {
                curRow = 0;
                sheetIndex++;
                parseSheet(parser, sheets.next());
            }
        } finally {
            // Release the file without writing anything back to it.
            pkg.revert();
        }
    }

    /** Parses one sheet stream and always closes it, even when parsing fails. */
    private void parseSheet(XMLReader parser, InputStream sheet) throws Exception {
        try {
            // Drop anything left over from a parse that was aborted mid-row.
            rowlist.clear();
            curCol = 0;
            parser.parse(new InputSource(sheet));
        } finally {
            sheet.close();
        }
    }

    /**
     * Row callback, invoked once for every {@code row} element; put the
     * business logic here.
     *
     * @param sheetIndex zero-based index of the sheet being read
     * @param curRow     zero-based count of {@code row} elements seen so far
     *                   in the current sheet
     * @param rowList    cell texts of the row; the same list instance is
     *                   cleared and reused after this call returns, so copy it
     *                   if it must be kept
     */
    public abstract void optRow(int sheetIndex, int curRow, List<String> rowList);

    /**
     * Creates an XML reader that reports sheet content to this handler and
     * remembers {@code sst} for resolving shared-string cells.
     *
     * @param sst shared strings table of the workbook
     * @return the configured reader
     * @throws SAXException if no XML reader can be created
     */
    public XMLReader fetchSheetParser(SharedStringsTable sst) throws SAXException {
        XMLReader parser = XMLReaderFactory.createXMLReader();
        this.sst = sst;
        parser.setContentHandler(this);
        return parser;
    }

    @Override
    public void startElement(String uri, String localName, String name,
            Attributes attributes) throws SAXException {
        // c => cell
        if (name.equals("c")) {
            // t="s" means the cell value is an index into the shared strings table
            String cellType = attributes.getValue("t");
            nextIsString = cellType != null && cellType.equals("s");

            // Position the value by the cell reference (e.g. "C5") so that
            // cells missing from the XML do not shift later columns left.
            String ref = attributes.getValue("r");
            if (ref != null) {
                int col = columnIndexOf(ref);
                if (col >= 0) {
                    curCol = col;
                }
            }
        }
        // Start collecting character data afresh for every element
        lastContents = "";
    }

    @Override
    public void endElement(String uri, String localName, String name)
            throws SAXException {
        // v => cell value; for shared-string cells it holds the table index
        if (name.equals("v")) {
            String raw = lastContents;
            if (nextIsString) {
                try {
                    int idx = Integer.parseInt(raw);
                    raw = new XSSFRichTextString(sst.getEntryAt(idx)).toString();
                } catch (RuntimeException ignored) {
                    // Best effort: keep the raw text when the index is not a
                    // number or not present in the table.
                }
                nextIsString = false;
            }

            String value = raw.trim();
            value = value.equals("") ? EMPTY_CELL : value;

            // Fill skipped columns so the value lands at its real position
            while (rowlist.size() < curCol) {
                rowlist.add(EMPTY_CELL);
            }
            rowlist.add(curCol, value);
            curCol++;
        } else if (name.equals("row")) {
            // End of the row: hand the collected cells to the subclass
            optRow(sheetIndex, curRow, rowlist);
            rowlist.clear();
            curRow++;
            curCol = 0;
        }
    }

    @Override
    public void characters(char[] ch, int start, int length)
            throws SAXException {
        // May be called several times for one element, so append
        lastContents += new String(ch, start, length);
    }

    /**
     * Converts the column letters at the start of a cell reference to a
     * zero-based column index ("A1" gives 0, "AA3" gives 26).
     *
     * @return the index, or -1 when the reference does not start with a letter
     */
    private static int columnIndexOf(String cellRef) {
        int col = 0;
        for (int i = 0; i < cellRef.length(); i++) {
            char ch = cellRef.charAt(i);
            if (ch >= 'A' && ch <= 'Z') {
                col = col * 26 + (ch - 'A' + 1);
            } else if (ch >= 'a' && ch <= 'z') {
                col = col * 26 + (ch - 'a' + 1);
            } else {
                break;
            }
        }
        return col - 1;
    }

}

解析csv

/**
 * Line-by-line reader for simple comma-separated text files. Every line is
 * split on commas and passed to {@link #optRows(int, List)}.
 *
 * <p>Fields are split on every comma; quoted fields and embedded line breaks
 * are not interpreted.
 */
public abstract class HcsvAbstract {

	/** Charset used by {@link #process(String)}. */
	private static final String DEFAULT_CHARSET = "gbk";

	/** Zero-based index of the line most recently read; -1 before the first line. */
	public int lineIndex = -1;

	/**
	 * Reads {@code filename} as GBK-encoded text.
	 *
	 * @param filename path of the file to read
	 * @throws IOException if the file cannot be opened or read
	 */
	public void process(String filename) throws IOException {
		process(filename, DEFAULT_CHARSET);
	}

	/**
	 * Reads {@code filename} using the given charset and invokes
	 * {@link #optRows(int, List)} once per line. The line counter restarts at
	 * 0 on every call, and the file is closed when reading finishes or fails.
	 *
	 * @param filename    path of the file to read
	 * @param charsetName name of the charset used to decode the file
	 * @throws IOException if the file cannot be opened or read, or the charset
	 *                     is not supported
	 */
	public void process(String filename, String charsetName) throws IOException {
		lineIndex = -1;
		InputStream is = new FileInputStream(new File(filename));
		try {
			BufferedReader reader = new BufferedReader(new InputStreamReader(is, charsetName));
			String line;
			while ((line = reader.readLine()) != null) {
				lineIndex++;
				// Limit -1 keeps trailing empty fields, so "a,b,," yields four
				// entries and column positions stay stable across lines.
				String[] items = line.split(",", -1);
				List<String> lineList = new ArrayList<String>(items.length);
				for (String item : items) {
					lineList.add(item);
				}
				optRows(lineIndex, lineList);
			}
		} finally {
			is.close();
		}
	}

	/**
	 * Line callback; put the business logic here.
	 *
	 * @param lineIndex zero-based index of the line within the file
	 * @param lineList  the fields of the line, in order
	 */
	public abstract void optRows(int lineIndex, List<String> lineList);

}

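以上为 3 个抽象类。需要解析文件时，继承对应的抽象类并实现其中的抽象方法：HxlsAbstract 和 HcsvAbstract 中为 optRows，HxlsxAbstract 中为 optRow。每读完一行，该方法会被自动调用一次，参数为当前行的位置信息（xls/xlsx 为工作表序号和行号，csv 为行号）以及该行各单元格（字段）内容的列表，可在其中编写业务逻辑。读取文件时，调用父类的 process(String filePath) 方法，参数为文件路径。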

程程程住气

关注

0
点赞
踩
4

收藏

觉得还不错? 一键收藏
0
评论
解析大数据量文本

------------------------------------------------------------------------------------------------我不懂什么坚持，只是死撑而已POI解析xlspublic abstract class HxlsAbstract implements HSSFListener { private i
复制链接

扫一扫