CSV文件准确读取两种思路

通过查询网上资料,发现有两种解析思路:a.通过pattern分割各字段,b.逐字符读取并判断,当然还有通过第三方Jar包来解析的方法。

1.通过Pattern准确分割字段(Reference:csv文件读取

package xufei;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/*
* 文件规则
* Microsoft的格式是最简单的。以逗号分隔的值要么是“纯粹的”(仅仅包含在括号之前),
* 要么是在双引号之间(这时数据中的双引号以一对双引号表示)。
* Ten Thousand,10000, 2710 ,,"10,000","It's ""10 Grand"", baby",10K
* 这一行包含七个字段(fields):
*        Ten Thousand
*        10000
*         2710 
*        空字段
*        10,000
*        It's "10 Grand", baby
*        10K
* 每条记录占一行
* 以逗号为分隔符
* 逗号前后的空格会被忽略
* 字段中包含有逗号,该字段必须用双引号括起来。如果是全角的没有问题。
* 字段中包含有换行符,该字段必须用双引号括起来
* 字段前后包含有空格,该字段必须用双引号括起来
* 字段中的双引号用两个双引号表示
* 字段中如果有双引号,该字段必须用双引号括起来
* 第一条记录,可以是字段名
*/
/** 

*
タイトル: xufei.CSVAnalysis.java

*
説明:

*
著作権: Copyright (c) 2006

*
会社名: technodia

* @author        徐飞
* @version 1.0
* createDate Aug 11, 2008
* 修正履歴
* 修正日      修正者       修正理由
*/
public class CSVAnalysis {
        private InputStreamReader fr = null;
        private BufferedReader br = null;

        public CSVAnalysis(String f) throws IOException {
                fr = new InputStreamReader(new FileInputStream(f));
        }

        /**
         * 解析csv文件 到一个list中
         * 每个单元个为一个String类型记录,每一行为一个list。
         * 再将所有的行放到一个总list中
         * @return
         * @throws IOException
         */
        public List> readCSVFile() throws IOException {
                br = new BufferedReader(fr);
                String rec = null;//一行
                String str;//一个单元格
                List> listFile = new ArrayList>();
                try {                        
                        //读取一行
                        while ((rec = br.readLine()) != null) {
                                Pattern pCells = Pattern
                                                .compile("(\"[^\"]*(\"{2})*[^\"]*\")*[^,]*,");
                                Matcher mCells = pCells.matcher(rec);
                                List cells = new ArrayList();//每行记录一个list
                                //读取每个单元格
                                while (mCells.find()) {
                                        str = mCells.group();
                                        str = str.replaceAll(
                                                        "(?sm)\"?([^\"]*(\"{2})*[^\"]*)\"?.*,", "$1");
                                        str = str.replaceAll("(?sm)(\"(\"))", "$2");
                                        cells.add(str);
                                }
                                listFile.add(cells);
                        }                        
                } catch (Exception e) {
                        e.printStackTrace();
                } finally {
                        if (fr != null) {
                                fr.close();
                        }
                        if (br != null) {
                                br.close();
                        }
                }
                return listFile;
        }

        public static void main(String[] args) throws Throwable {
                CSVAnalysis parser = new CSVAnalysis("c:/test2.csv");
                parser.readCSVFile();
        }
}

2.逐字符读取字符串,并对字符判断(Reference:csv文件读取

import java.io.BufferedReader;
import java.io.Closeable;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;

/**
 * A very simple CSV reader released under a commercial-friendly license.
 * 
 * @author Glen Smith
 * 
 */
public class CSVReader implements Closeable {

    private BufferedReader br;

    private boolean hasNext = true;

    private final char separator;

    private final char quotechar;

    private final char escape;

    private int skipLines;

    private boolean linesSkiped;

    /** The default separator to use if none is supplied to the constructor. */
    public static final char DEFAULT_SEPARATOR = ',';

    public static final int INITIAL_READ_SIZE = 64;

    /**
     * The default quote character to use if none is supplied to the
     * constructor.
     */
    public static final char DEFAULT_QUOTE_CHARACTER = '"';


    /**
     * The default escape character to use if none is supplied to the
     * constructor.
     */
    public static final char DEFAULT_ESCAPE_CHARACTER = '\\';

    /**
     * The default line to start reading.
     */
    public static final int DEFAULT_SKIP_LINES = 0;

    /**
     * Constructs CSVReader using a comma for the separator.
     * 
     * @param reader
     *            the reader to an underlying CSV source.
     */
    public CSVReader(Reader reader) {
        this(reader, DEFAULT_SEPARATOR);
    }

    /**
     * Constructs CSVReader with supplied separator.
     * 
     * @param reader
     *            the reader to an underlying CSV source.
     * @param separator
     *            the delimiter to use for separating entries.
     */
    public CSVReader(Reader reader, char separator) {
        this(reader, separator, DEFAULT_QUOTE_CHARACTER, DEFAULT_ESCAPE_CHARACTER);
    }



    /**
     * Constructs CSVReader with supplied separator and quote char.
     * 
     * @param reader
     *            the reader to an underlying CSV source.
     * @param separator
     *            the delimiter to use for separating entries
     * @param quotechar
     *            the character to use for quoted elements
     */
    public CSVReader(Reader reader, char separator, char quotechar) {
        this(reader, separator, quotechar, DEFAULT_ESCAPE_CHARACTER, DEFAULT_SKIP_LINES);
    }

    public CSVReader(Reader reader, char separator,
            char quotechar, char escape) {
        this(reader, separator, quotechar, escape, DEFAULT_SKIP_LINES);
    }

    /**
     * Constructs CSVReader with supplied separator and quote char.
     * 
     * @param reader
     *            the reader to an underlying CSV source.
     * @param separator
     *            the delimiter to use for separating entries
     * @param quotechar
     *            the character to use for quoted elements
     * @param line
     *            the line number to skip for start reading 
     */
    public CSVReader(Reader reader, char separator, char quotechar, int line) {
        this(reader, separator, quotechar, DEFAULT_ESCAPE_CHARACTER, line);
    }

    /**
     * Constructs CSVReader with supplied separator and quote char.
     * 
     * @param reader
     *            the reader to an underlying CSV source.
     * @param separator
     *            the delimiter to use for separating entries
     * @param quotechar
     *            the character to use for quoted elements
     * @param escape
     *            the character to use for escaping a separator or quote
     * @param line
     *            the line number to skip for start reading 
     */
    public CSVReader(Reader reader, char separator, char quotechar, char escape, int line) {
        this.br = new BufferedReader(reader);
        this.separator = separator;
        this.quotechar = quotechar;
        this.escape = escape;
        this.skipLines = line;
    }


    /**
     * Reads the entire file into a List with each element being a String[] of
     * tokens.
     * 
     * @return a List of String[], with each String[] representing a line of the
     *         file.
     * 
     * @throws IOException
     *             if bad things happen during the read
     */
    public List<String[]> readAll() throws IOException {

        List<String[]> allElements = new ArrayList<String[]>();
        while (hasNext) {
            String[] nextLineAsTokens = readNext();
            if (nextLineAsTokens != null)
                allElements.add(nextLineAsTokens);
        }
        return allElements;

    }

    /**
     * Reads the next line from the buffer and converts to a string array.
     * 
     * @return a string array with each comma-separated element as a separate
     *         entry.
     * 
     * @throws IOException
     *             if bad things happen during the read
     */
    public String[] readNext() throws IOException {

        String nextLine = getNextLine();
        return hasNext ? parseLine(nextLine) : null;
    }

    /**
     * Reads the next line from the file.
     * 
     * @return the next line from the file without trailing newline
     * @throws IOException
     *             if bad things happen during the read
     */
    private String getNextLine() throws IOException {
        if (!this.linesSkiped) {
            for (int i = 0; i < skipLines; i++) {
                br.readLine();
            }
            this.linesSkiped = true;
        }
        String nextLine = br.readLine();
        if (nextLine == null) {
            hasNext = false;
        }
        return hasNext ? nextLine : null;
    }

    /**
     * Parses an incoming String and returns an array of elements.
     * 
     * @param nextLine
     *            the string to parse
     * @return the comma-tokenized list of elements, or null if nextLine is null
     * @throws IOException if bad things happen during the read
     */
    private String[] parseLine(String nextLine) throws IOException {

        if (nextLine == null) {
            return null;
        }

        List<String>tokensOnThisLine = new ArrayList<String>();
        StringBuilder sb = new StringBuilder(INITIAL_READ_SIZE);
        boolean inQuotes = false;
        do {
            if (inQuotes) {
                // continuing a quoted section, reappend newline
                sb.append("\n");
                nextLine = getNextLine();
                if (nextLine == null)
                    break;
            }
            for (int i = 0; i < nextLine.length(); i++) {

                char c = nextLine.charAt(i);
                if (c == this.escape) {
                    if( isEscapable(nextLine, inQuotes, i) ){ 
                        sb.append(nextLine.charAt(i+1));
                        i++;
                    } else {
                        i++; // ignore the escape
                    }
                } else if (c == quotechar) {
                    if( isEscapedQuote(nextLine, inQuotes, i) ){ 
                        sb.append(nextLine.charAt(i+1));
                        i++;
                    }else{
                        inQuotes = !inQuotes;
                        // the tricky case of an embedded quote in the middle: a,bc"d"ef,g
                        if(i>2 //not on the beginning of the line
                                && nextLine.charAt(i-1) != this.separator //not at the beginning of an escape sequence 
                                && nextLine.length()>(i+1) &&
                                nextLine.charAt(i+1) != this.separator //not at the end of an escape sequence
                        ){
                            sb.append(c);
                        }
                    }
                } else if (c == separator && !inQuotes) {
                    tokensOnThisLine.add(sb.toString());
                    sb = new StringBuilder(INITIAL_READ_SIZE); // start work on next token
                } else {
                    sb.append(c);
                }
            }
        } while (inQuotes);
        tokensOnThisLine.add(sb.toString());
        return tokensOnThisLine.toArray(new String[0]);

    }

    /**  
     * precondition: the current character is a quote or an escape
     * @param nextLine the current line
     * @param inQuotes true if the current context is quoted
     * @param i current index in line
     * @return true if the following character is a quote
     */
    private boolean isEscapedQuote(String nextLine, boolean inQuotes, int i) {
        return inQuotes  // we are in quotes, therefore there can be escaped quotes in here.
            && nextLine.length() > (i+1)  // there is indeed another character to check.
            && nextLine.charAt(i+1) == quotechar;
    }

    /**  
     * precondition: the current character is an escape
     * @param nextLine the current line
     * @param inQuotes true if the current context is quoted
     * @param i current index in line
     * @return true if the following character is a quote
     */
    private boolean isEscapable(String nextLine, boolean inQuotes, int i) {
        return inQuotes  // we are in quotes, therefore there can be escaped quotes in here.
            && nextLine.length() > (i+1)  // there is indeed another character to check.
            && ( nextLine.charAt(i+1) == quotechar || nextLine.charAt(i+1) == this.escape);
    }

    /**
     * Closes the underlying reader.
     * 
     * @throws IOException if the close fails
     */
    public void close() throws IOException{
        br.close();
    }

}
  • 1
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值