java 解析 csv 文件

最新推荐文章于 2024-07-20 03:01:13 发布

Terry_5008

最新推荐文章于 2024-07-20 03:01:13 发布

阅读量213

点赞数

分类专栏： java 文章标签： Java 正则表达式 Excel J# 算法

本文链接：https://blog.csdn.net/iteye_5008/article/details/81928569

版权

java 专栏收录该内容

41 篇文章 0 订阅

订阅专栏

一、此方法似乎有 bug；如果不能满足需要，请改用方法二或方法三。

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/*
 * 文件规则
 * Microsoft的格式是最简单的。以逗号分隔的值要么是“纯粹的”（即逗号之间的原样内容），
 * 要么是在双引号之间（这时数据中的双引号以一对双引号表示）。
 * Ten Thousand,10000, 2710 ,,"10,000","It's ""10 Grand"", baby",10K
 * 这一行包含七个字段（fields）：
 *	Ten Thousand
 *	10000
 *	 2710 
 *	空字段
 *	10,000
 *	It's "10 Grand", baby
 *	10K
 * 每条记录占一行
 * 以逗号为分隔符
 * 逗号前后的空格会被忽略
 * 字段中包含有逗号，该字段必须用双引号括起来。如果是全角的没有问题。
 * 字段中包含有换行符，该字段必须用双引号括起来
 * 字段前后包含有空格，该字段必须用双引号括起来
 * 字段中的双引号用两个双引号表示
 * 字段中如果有双引号，该字段必须用双引号括起来
 * 第一条记录，可以是字段名
 */

/**
 * Reads a comma-separated text file into memory, one list of cell strings per
 * physical line of the file.
 *
 * <p>Parsing rules:
 * <ul>
 * <li>A comma terminates a field. The text after the last comma of a line is
 * also a field; a comma at the very end of a line does not start an extra
 * empty field.</li>
 * <li>A field that begins with a double quote runs to the matching closing
 * quote and may contain commas; inside it a pair of double quotes stands for
 * one literal double quote.</li>
 * <li>Whitespace is kept exactly as it appears in the file.</li>
 * <li>Records are line based: a quoted field cannot span several lines.</li>
 * </ul>
 */
public class CSVAnalysis {
	/** Character that terminates a field. */
	private static final char SEPARATOR = ',';
	/** Character that encloses a field containing separators or quotes. */
	private static final char QUOTE = '"';

	// Reader over the file named in the constructor; consumed and closed by readCSVFile().
	private InputStreamReader fr = null;
	// Buffered wrapper around fr, created by readCSVFile().
	private BufferedReader br = null;

	/**
	 * Opens the given file for reading, decoding it with the platform default
	 * charset.
	 *
	 * @param f path of the CSV file
	 * @throws IOException if the file cannot be opened
	 */
	public CSVAnalysis(String f) throws IOException {
		fr = new InputStreamReader(new FileInputStream(f));
	}

	/**
	 * Parses the whole file. Every cell becomes a String, every line becomes a
	 * list of cells, and all lines are collected into the returned list. An
	 * empty line yields an empty list of cells.
	 *
	 * <p>The underlying stream is closed when this method returns, so it can be
	 * called only once per instance; a second call fails with an IOException.
	 *
	 * @return one list of cells per line, in file order
	 * @throws IOException if reading or closing the file fails
	 */
	public List<List<String>> readCSVFile() throws IOException {
		br = new BufferedReader(fr);
		List<List<String>> listFile = new ArrayList<List<String>>();
		try {
			String rec;
			while ((rec = br.readLine()) != null) {
				listFile.add(parseRecord(rec));
			}
		} finally {
			// Closing the buffered reader also closes the wrapped fr.
			br.close();
		}
		return listFile;
	}

	/**
	 * Splits one line into its cells with a single left-to-right scan.
	 *
	 * @param rec one line of the file, without its line terminator
	 * @return the cells of that line, unquoted and unescaped
	 */
	private static List<String> parseRecord(String rec) {
		List<String> cells = new ArrayList<String>();
		StringBuilder cell = new StringBuilder();
		int len = rec.length();
		int i = 0;
		while (i < len) {
			cell.setLength(0);
			if (rec.charAt(i) == QUOTE) {
				// Quoted part: copy up to the closing quote, un-doubling "" on the way.
				i++;
				while (i < len) {
					char c = rec.charAt(i);
					if (c != QUOTE) {
						cell.append(c);
						i++;
					} else if (i + 1 < len && rec.charAt(i + 1) == QUOTE) {
						cell.append(QUOTE);
						i += 2;
					} else {
						i++; // closing quote
						break;
					}
				}
			}
			// Unquoted text (or anything trailing a closing quote) runs to the next separator.
			while (i < len && rec.charAt(i) != SEPARATOR) {
				cell.append(rec.charAt(i));
				i++;
			}
			i++; // step over the separator, or past the end of the line
			cells.add(cell.toString());
		}
		return cells;
	}

	/**
	 * Parses the file named by the first argument, or c:/test2.csv when no
	 * argument is given.
	 */
	public static void main(String[] args) throws Throwable {
		String path = args.length > 0 ? args[0] : "c:/test2.csv";
		CSVAnalysis parser = new CSVAnalysis(path);
		parser.readCSVFile();
	}
}

二。

在解析csv文件之前，先来看看什么是csv文件以及csv文件的格式。

csv（Comma-Separated Values）文件即逗号分隔值文件，它是一种文本文件，可以直接以文本方式打开，字段之间以逗号分隔。windows默认用excel打开。它的格式包括以下几点（它的格式最好就看excel是如何解析的。）：

①每条记录占一行；
②以逗号为分隔符；
③逗号前后的空格会被忽略；
④字段中包含有逗号，该字段必须用双引号括起来；
⑤字段中包含有换行符，该字段必须用双引号括起来；
⑥字段前后包含有空格，该字段必须用双引号括起来；
⑦字段中的双引号用两个双引号表示；
⑧字段中如果有双引号，该字段必须用双引号括起来；
⑨第一条记录，可以是字段名；

⑩以上提到的逗号和双引号均为半角字符。

下面通过正则表达式和java解析csv文件。

首先给出匹配csv文件的一个最小单位数据的正则表达式（如：1,2,3是csv文件的一行数据，则1,是该csv文件的一个最小单位数据）：

"(([^",\n 　]*[,\n 　])*([^",\n 　]*"{2})*)*[^",\n 　]*"[ 　]*,[ 　]*|[^",\n]*[ 　]*,[ 　]*|"(([^",\n 　]*[,\n 　])*([^",\n 　]*"{2})*)*[^",\n 　]*"[ 　]*|[^",\n]*[ 　]*

下面是解析文件的java代码：

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Static helpers that read a CSV file into a list of String arrays and write
 * such a list back to a CSV file.
 *
 * <p>Non-ASCII characters in this source are written as unicode escapes so the
 * class compiles identically whatever the source-file encoding is.
 *
 * @author  panhf2003
 * @version 2008/09/05,
 */

public class CsvFileUtil {

    /** Any character except double quote, comma, newline, ASCII space and full-width space (U+3000). */
    private static final String SPECIAL_CHAR_A = "[^\",\\n \u3000]";
    /** Any character except double quote, comma and newline. */
    private static final String SPECIAL_CHAR_B = "[^\",\\n]";
    /** Matches one field of a line, including its trailing comma when there is one. Compiled once. */
    private static final Pattern FIELD_PATTERN = Pattern.compile(getRegExp());

    /**
     * Private constructor: this class is not meant to be instantiated.
     */
    private CsvFileUtil() {
    }

    /**
     * Reads the file named by the first argument (e:\test1.csv when no argument
     * is given) and prints each row on its own line, fields separated by spaces.
     */
    public static void main(String[] args) {
        String path = args.length > 0 ? args[0] : "e:\\test1.csv";
        try {
            for (String[] row : readCsvFile(path)) {
                StringBuilder line = new StringBuilder();
                for (String field : row) {
                    line.append(field).append(' ');
                }
                System.out.println(line);
            }
        } catch (FileNotFoundException ex) {
            Logger.getLogger(CsvFileUtil.class.getName()).log(Level.SEVERE, null, ex);
        } catch (IOException ex) {
            Logger.getLogger(CsvFileUtil.class.getName()).log(Level.SEVERE, null, ex);
        }
    }

    /**
     * Reads the CSV file at {@code argPath} and returns its rows.
     *
     * <p>The file is processed one physical line at a time, so a quoted field
     * cannot span lines. Each field is trimmed, its enclosing double quotes are
     * removed and doubled quotes inside them are collapsed to one. Fields that
     * are empty after this processing are NOT included in the row, so a row's
     * array can be shorter than the number of columns in the file.
     *
     * @param argPath path of the CSV file
     * @return one String[] per line of the file, in file order
     * @throws FileNotFoundException if the file cannot be opened
     * @throws IOException if reading or closing the file fails
     */
    public static List<String[]> readCsvFile(String argPath) throws FileNotFoundException, IOException {
        List<String[]> rows = new ArrayList<String[]>();
        BufferedReader reader = new BufferedReader(new FileReader(new File(argPath)));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                rows.add(parseLine(line));
            }
        } finally {
            // Closing the buffered reader also closes the wrapped FileReader.
            reader.close();
        }
        return rows;
    }

    /**
     * Writes {@code argList} to the file {@code argFileName} inside the
     * directory {@code argPath}, creating missing parent directories.
     *
     * <p>A field containing a double quote is enclosed in quotes with every
     * quote doubled; a field containing a comma, line break, space or
     * full-width space is enclosed in quotes; a null field is written as an
     * empty field. The arrays in {@code argList} are not modified.
     *
     * @param argList rows to write; must be non-empty and contain only String[] elements
     * @param argPath directory of the CSV file
     * @param argFileName name of the CSV file
     * @param isNewFile true to overwrite an existing file, false to append to it
     * @throws IOException if the file cannot be opened or written
     * @throws Exception (an IllegalArgumentException) if {@code argList} is null,
     *         empty, or contains an element that is not a String[]
     */
    public static void writeCsvFile(List<?> argList, String argPath, String argFileName, boolean isNewFile)
        throws IOException, Exception {
        // Validate the data before touching the file system.
        if (argList == null || argList.size() == 0) {
            // Message text is Chinese for "no data".
            throw new IllegalArgumentException("\u6ca1\u6709\u6570\u636e");
        }
        for (Object row : argList) {
            if (!(row instanceof String[])) {
                // Message text is Chinese for "wrong data format".
                throw new IllegalArgumentException("\u6570\u636e\u683c\u5f0f\u4e0d\u5bf9");
            }
        }
        // File(parent, child) inserts the platform's own separator.
        File file = new File(argPath, argFileName);
        File parent = file.getParentFile();
        if (parent != null && !parent.exists()) {
            parent.mkdirs();
        }
        BufferedWriter writer = new BufferedWriter(new FileWriter(file, !isNewFile));
        try {
            for (Object row : argList) {
                String[] fields = (String[]) row;
                for (int j = 0; j < fields.length; j++) {
                    if (j > 0) {
                        writer.write(",");
                    }
                    writer.write(escapeField(fields[j]));
                }
                writer.newLine();
            }
        } finally {
            // Closing the buffered writer flushes it and closes the wrapped FileWriter.
            writer.close();
        }
    }

    /**
     * Turns the text of one line into the array of its non-empty fields.
     *
     * @param line one line of the file, without its line terminator
     * @return the processed, non-empty fields of that line
     */
    private static String[] parseLine(String line) {
        List<String> fields = new ArrayList<String>();
        Matcher matcher = FIELD_PATTERN.matcher(line);
        while (matcher.find()) {
            String field = matcher.group().trim();
            if (field.endsWith(",")) {
                // Drop the separator that the pattern includes in the match.
                field = field.substring(0, field.length() - 1).trim();
            }
            if (field.length() >= 2 && field.startsWith("\"") && field.endsWith("\"")) {
                field = field.substring(1, field.length() - 1).replace("\"\"", "\"");
            }
            // The pattern can also match the empty string (e.g. at end of line),
            // so empty results are skipped.
            if (field.length() > 0) {
                fields.add(field);
            }
        }
        return fields.toArray(new String[fields.size()]);
    }

    /**
     * Returns the CSV representation of one field value.
     *
     * @param field raw value, may be null
     * @return the value, quoted and escaped where required; "" for null
     */
    private static String escapeField(String field) {
        if (field == null) {
            return "";
        }
        if (field.indexOf('"') >= 0) {
            return "\"" + field.replace("\"", "\"\"") + "\"";
        }
        boolean needsQuotes = field.indexOf(',') >= 0
                || field.indexOf('\n') >= 0
                || field.indexOf('\r') >= 0
                || field.indexOf(' ') >= 0
                || field.indexOf('\u3000') >= 0;
        return needsQuotes ? "\"" + field + "\"" : field;
    }

    /**
     * Builds the regular expression that matches the smallest unit of a CSV
     * line: a quoted or unquoted field, optionally followed by a comma, with
     * optional ASCII or full-width spaces around the comma.
     *
     * @return the regular expression source
     */
    private static String getRegExp() {
        String quotedField =
            "\"((" + SPECIAL_CHAR_A + "*[,\\n \u3000])*(" + SPECIAL_CHAR_A + "*\"{2})*)*"
            + SPECIAL_CHAR_A + "*\"[ \u3000]*";
        String plainField = SPECIAL_CHAR_B + "*[ \u3000]*";
        String separator = ",[ \u3000]*";

        return quotedField + separator
            + "|" + plainField + separator
            + "|" + quotedField
            + "|" + plainField;
    }

}

三。

该解析算法的解析规则与excel或者wps大致相同。另外包含去掉注释的方法。

构造方法：该类包含一个构造方法，参数为要读取的csv文件的文件名（包含绝对路径）。

普通方法：

① getVContent()：一个得到当前行的值向量的方法。如果调用此方法前未调用readCSVNextRecord方法，则将返回Null。

② getLineContentVector()：一个得到下一行值向量的方法。如果该方法返回Null，则说明已经读到文件末尾。

③ close()：关闭流。该方法为调用该类后应该被最后调用的方法。

④ readCSVNextRecord()：该方法读取csv文件的下一行，如果该方法已经读到了文件末尾，则返回false；

⑤ readAtomString(String)：该方法返回csv文件逻辑一行的第一个值，和该逻辑行第一个值后面的内容，如果该内容以逗号开始，则已经去掉了该逗号。这两个值以一个长度为 2 的字符串数组返回（下标 0 为第一个值，下标 1 为剩余内容）。

⑥ isQuoteAdjacent(String)：判断一个给定字符串的引号是否两两相邻。如果两两相邻，返回真。如果该字符串不包含引号，也返回真。

⑦ readCSVFileTitle()：该方法返回csv文件中的第一行——该行不以#号开始（包括正常解析后的#号），且该行不为空

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.Vector;

/**
 * Record-at-a-time CSV reader. A record ("logical line") may span several
 * physical lines when a quoted field contains line breaks, and physical lines
 * starting with '#' are skipped as comments.
 *
 * <p>Typical use: construct with a file name, call
 * {@link #getLineContentVector()} until it returns null, then call
 * {@link #close()}.
 */
public class CsvParse {
	// Stream the CSV file is read from; opened by the constructor.
	private BufferedReader inStream = null;
	// Fields of the most recently read record; null before the first read and after end of file.
	private Vector<String> vContent = null;

	/**
	 * Opens the given CSV file for reading.
	 *
	 * @param csvFileName name of the CSV file
	 * @throws FileNotFoundException if the file cannot be opened; no reading is
	 *         possible in that case
	 */
	public CsvParse(String csvFileName) throws FileNotFoundException {
		inStream = new BufferedReader(new FileReader(csvFileName));
	}

	/**
	 * Returns the fields of the record read most recently.
	 *
	 * @return the current record, or null if no record has been read yet or the
	 *         end of the file has been reached
	 */
	public Vector<String> getVContent() {
		return this.vContent;
	}

	/**
	 * Reads the next record and returns its fields.
	 *
	 * @return the fields of the next record, or null when no further record
	 *         could be read
	 * @throws IOException if reading the file fails
	 * @throws Exception if any other error occurs while reading
	 */
	public Vector<String> getLineContentVector() throws IOException, Exception {
		if (this.readCSVNextRecord()) {
			return this.vContent;
		}
		return null;
	}

	/**
	 * Closes the underlying stream. Call this once reading is finished. A
	 * failure to close is printed to standard error and otherwise ignored.
	 */
	public void close() {
		if (inStream != null) {
			try {
				inStream.close();
			} catch (IOException e) {
				e.printStackTrace();
			}
		}
	}

	/**
	 * Reads the next logical line of the file and splits it into fields, which
	 * are then available through {@link #getVContent()}.
	 *
	 * <p>Each record is stored in a newly created vector, so a vector returned
	 * for an earlier record is not changed by later reads. If an exception
	 * occurs the stream is closed and no further reading should be attempted.
	 *
	 * @return true if a record was read; false if the stream was never opened
	 *         or the end of the file was reached
	 * @throws IOException if reading the file fails
	 * @throws Exception if any other error occurs while reading
	 */
	public boolean readCSVNextRecord() throws IOException, Exception {
		// Nothing to read from.
		if (inStream == null) {
			return false;
		}
		// Fresh vector per record: callers may still hold the previous one.
		vContent = new Vector<String>();
		// The complete logical line once it has been assembled.
		String logicLineStr = "";
		// Accumulates the physical lines that make up the logical line.
		StringBuilder strb = new StringBuilder();
		// Becomes true once strb holds a complete logical line (or at end of file).
		boolean isLogicLine = false;
		try {
			while (!isLogicLine) {
				String newLineStr = inStream.readLine();
				if (newLineStr == null) {
					// End of file: whatever was accumulated so far is discarded.
					strb = null;
					vContent = null;
					isLogicLine = true;
					break;
				}
				if (newLineStr.startsWith("#")) {
					// Comment line.
					continue;
				}
				if (strb.length() != 0) {
					// Restore the line break that readLine() removed inside the record.
					strb.append("\r\n");
				}
				strb.append(newLineStr);
				String oldLineStr = strb.toString();
				if (oldLineStr.indexOf(",") == -1) {
					// No comma so far.
					if (containsNumber(oldLineStr, "\"") % 2 == 0) {
						// Even number of quotes: the record is complete.
						isLogicLine = true;
						break;
					} else {
						if (oldLineStr.startsWith("\"")) {
							if (oldLineStr.equals("\"")) {
								// Only the opening quote so far: keep reading.
								continue;
							} else {
								String tempOldStr = oldLineStr.substring(1);
								if (isQuoteAdjacent(tempOldStr)) {
									// Every remaining quote is part of a "" pair, so the
									// opening quote is still unclosed: keep reading.
									continue;
								} else {
									// Otherwise treat the text as a complete record.
									isLogicLine = true;
									break;
								}
							}
						}
					}
				} else {
					// Ignore escaped quotes ("") when looking for the last real quote.
					String tempOldLineStr = oldLineStr.replace("\"\"", "");
					int lastQuoteIndex = tempOldLineStr.lastIndexOf("\"");
					if (lastQuoteIndex == 0) {
						// Only an opening quote at the very start: keep reading.
						continue;
					} else if (lastQuoteIndex == -1) {
						// No real quotes at all: the record is complete.
						isLogicLine = true;
						break;
					} else {
						// Also ignore quote-comma-quote boundaries between quoted fields.
						tempOldLineStr = tempOldLineStr.replace("\",\"", "");
						lastQuoteIndex = tempOldLineStr.lastIndexOf("\"");
						if (lastQuoteIndex == 0) {
							continue;
						}
						if (tempOldLineStr.charAt(lastQuoteIndex - 1) == ',') {
							// The last quote opens a field: keep reading.
							continue;
						} else {
							isLogicLine = true;
							break;
						}
					}
				}
			}
		} catch (IOException ioe) {
			ioe.printStackTrace();
			// Close the stream on failure.
			if (inStream != null) {
				inStream.close();
			}
			throw ioe;
		} catch (Exception e) {
			e.printStackTrace();
			// Close the stream on failure.
			if (inStream != null) {
				inStream.close();
			}
			throw e;
		}
		if (strb == null) {
			// End of file was reached.
			return false;
		}
		logicLineStr = strb.toString();
		if (logicLineStr != null) {
			// Peel off one field at a time until nothing is left.
			while (!logicLineStr.equals("")) {
				String[] ret = readAtomString(logicLineStr);
				String atomString = ret[0];
				logicLineStr = ret[1];
				vContent.add(atomString);
			}
		}
		return true;
	}

	/**
	 * Extracts the first field of a logical line.
	 *
	 * @param lineStr a logical line, or the not yet parsed rest of one
	 * @return an array of length 2: index 0 is the first field with quoting
	 *         removed, index 1 is the rest of the line without the comma that
	 *         followed the first field
	 */
	public String[] readAtomString(String lineStr) {
		String atomString = "";// the field being assembled
		String orgString = "";// raw text consumed so far, before any unquoting
		String[] ret = new String[2];// result handed back to the caller
		boolean isAtom = false;// true once atomString is a complete field
		String[] commaStr = lineStr.split(",");
		if (commaStr.length == 0) {
			// split() drops trailing empty strings, so text made up only of commas
			// gives an empty array; without a piece to examine the loop below
			// would never end. Such text starts with an empty field.
			commaStr = new String[] { "" };
		}
		while (!isAtom) {
			for (String str : commaStr) {
				if (!atomString.equals("")) {
					atomString = atomString + ",";
				}
				atomString = atomString + str;
				orgString = atomString;
				if (!isQuoteContained(atomString)) {
					// No quote at all: an ordinary field.
					isAtom = true;
					break;
				} else {
					if (!atomString.startsWith("\"")) {
						// Does not start with a quote, so it is not a quoted field.
						isAtom = true;
						break;
					} else {
						// Starts with a quote: a quoted field.
						if (containsNumber(atomString, "\"") % 2 == 0) {
							// Even number of quotes.
							String temp = atomString;
							if (temp.endsWith("\"")) {
								temp = temp.replace("\"\"", "");
								if (temp.equals("")) {
									// Nothing but quote pairs: the field is empty.
									atomString = "";
									isAtom = true;
									break;
								} else {
									// Strip the enclosing quotes.
									temp = temp.substring(1, temp
											.lastIndexOf("\""));
									if (temp.indexOf("\"") > -1) {
										// Quotes remain after removing the enclosing quotes
										// and the pairs, so they occur singly: drop the
										// first two quotes of the raw text.
										temp = atomString;
										temp = temp.substring(1);
										temp = temp.substring(0, temp
												.indexOf("\""))
												+ temp.substring(temp
														.indexOf("\"") + 1);
										atomString = temp;
										isAtom = true;
										break;
									} else {
										// Well-formed quoted field: strip the enclosing
										// quotes and collapse "" to ".
										temp = atomString;
										temp = temp.substring(1, temp
												.lastIndexOf("\""));
										temp = temp.replace("\"\"", "\"");
										atomString = temp;
										isAtom = true;
										break;
									}
								}
							} else {
								// Does not end with a quote: drop the first two quotes.
								temp = temp.substring(1, temp.indexOf('\"', 1))
										+ temp
												.substring(temp
														.indexOf('\"', 1) + 1);
								atomString = temp;
								isAtom = true;
								break;
							}
						} else {
							// Odd number of quotes.
							if (!atomString.equals("\"")) {
								String tempAtomStr = atomString.substring(1);
								if (!isQuoteAdjacent(tempAtomStr)) {
									// The text after the opening quote contains a single
									// quote, so the field ends within this piece: drop the
									// opening quote and that first remaining quote.
									tempAtomStr = atomString.substring(1);
									int tempQutoIndex = tempAtomStr
											.indexOf("\"");
									tempAtomStr = tempAtomStr.substring(0,
											tempQutoIndex)
											+ tempAtomStr
													.substring(tempQutoIndex + 1);
									atomString = tempAtomStr;
									isAtom = true;
									break;
								}
							}
							// Otherwise the field is still open: append the next piece.
						}
					}
				}
			}
		}
		// Remove the raw text of the field from the front of the line.
		if (lineStr.length() > orgString.length()) {
			lineStr = lineStr.substring(orgString.length());
		} else {
			lineStr = "";
		}
		// Remove the comma that terminated the field, if any.
		if (lineStr.startsWith(",")) {
			if (lineStr.length() > 1) {
				lineStr = lineStr.substring(1);
			} else {
				lineStr = "";
			}
		}
		ret[0] = atomString;
		ret[1] = lineStr;
		return ret;
	}

	/**
	 * Counts the non-overlapping occurrences of {@code parameter} in
	 * {@code parentStr}.
	 *
	 * @param parentStr string to search in
	 * @param parameter string to search for
	 * @return the number of occurrences; 0 if either argument is null or empty
	 */
	public int containsNumber(String parentStr, String parameter) {
		if (parentStr == null || parentStr.equals("")) {
			return 0;
		}
		if (parameter == null || parameter.equals("")) {
			return 0;
		}
		int containNumber = 0;
		int from = parentStr.indexOf(parameter);
		while (from > -1) {
			containNumber++;
			from = parentStr.indexOf(parameter, from + parameter.length());
		}
		return containNumber;
	}

	/**
	 * Tells whether every double quote in the given string is part of a pair
	 * of adjacent double quotes.
	 *
	 * @param p_String string to examine
	 * @return true if no quote remains after removing all "" pairs (which
	 *         includes strings without any quote), false otherwise
	 */
	public boolean isQuoteAdjacent(String p_String) {
		String temp = p_String.replace("\"\"", "");
		return temp.indexOf("\"") == -1;
	}

	/**
	 * Tells whether the given string contains a double quote.
	 *
	 * @param p_String string to examine
	 * @return true if it contains a double quote; false if it does not or if
	 *         it is null or empty
	 */
	public boolean isQuoteContained(String p_String) {
		if (p_String == null || p_String.equals("")) {
			return false;
		}
		return p_String.indexOf("\"") > -1;
	}

	/**
	 * Advances to the title record: the first record that has at least one
	 * non-empty field and whose first field does not start with '#' after
	 * trimming. The record is then available through {@link #getVContent()}.
	 *
	 * @return true if such a record was found, false if the end of the file
	 *         was reached first
	 * @throws Exception if any other error occurs while reading
	 * @throws IOException if reading the file fails
	 */
	public boolean readCSVFileTitle() throws IOException, Exception {
		String strValue;
		boolean isLineEmpty;
		do {
			// Judge every record on its own: reset the state left by the previous one.
			strValue = "";
			isLineEmpty = true;
			if (!readCSVNextRecord()) {
				return false;
			}
			if (vContent.size() > 0) {
				strValue = (String) vContent.get(0);
			}
			for (String str : vContent) {
				if (str != null && !str.equals("")) {
					isLineEmpty = false;
					break;
				}
			}
			// Records whose first field starts with '#' are comments.
		} while (strValue.trim().startsWith("#") || isLineEmpty);
		return true;
	}
}