1、配置文件（columnselector.properties，放在程序运行目录下，UTF-8 编码；列号从 0 开始）
source_dir=E:\\路径
columns=9,12
chinese2pinyin=9
dst_dir=C:\\路径
2、基于第三方 pinyin4j 库的拼音工具类
import net.sourceforge.pinyin4j.PinyinHelper;
import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType;
import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat;
import net.sourceforge.pinyin4j.format.HanyuPinyinToneType;
import net.sourceforge.pinyin4j.format.HanyuPinyinVCharType;
import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;
/**
 * Static helpers that convert Chinese text to pinyin using the pinyin4j library.
 */
public class PinYin {

    /** First code point of the range that {@link #getPingYin(String)} treats as Chinese. */
    private static final char HANZI_FIRST = 0x4E00;

    /** Last code point of the range that {@link #getPingYin(String)} treats as Chinese. */
    private static final char HANZI_LAST = 0x9FA5;

    /**
     * Converts every Chinese character of {@code src} to its full pinyin spelling.
     *
     * <p>The output format is lower case, without tone marks, using the {@code WITH_V}
     * setting of pinyin4j. Characters outside U+4E00..U+9FA5 are copied unchanged. When a
     * character has several readings, the first one returned by pinyin4j is used. A
     * character inside the range for which pinyin4j returns no reading is copied unchanged
     * instead of causing a {@code NullPointerException}.
     *
     * @param src the text to convert; must not be {@code null}
     * @return the converted text; if pinyin4j rejects the output format, the part converted
     *         so far is returned
     */
    public static String getPingYin(String src) {
        HanyuPinyinOutputFormat format = new HanyuPinyinOutputFormat();
        format.setCaseType(HanyuPinyinCaseType.LOWERCASE);
        format.setToneType(HanyuPinyinToneType.WITHOUT_TONE);
        format.setVCharType(HanyuPinyinVCharType.WITH_V);

        StringBuilder result = new StringBuilder(src.length() * 2);
        try {
            for (int i = 0; i < src.length(); i++) {
                char ch = src.charAt(i);
                // A plain range comparison replaces the per-character regex match.
                String[] readings = isHanzi(ch)
                        ? PinyinHelper.toHanyuPinyinStringArray(ch, format)
                        : null;
                if (readings != null && readings.length > 0) {
                    result.append(readings[0]);
                } else {
                    result.append(ch);
                }
            }
        } catch (BadHanyuPinyinOutputFormatCombination e) {
            // Best effort, as before: report the problem and return what was converted.
            e.printStackTrace();
        }
        return result.toString();
    }

    /**
     * Returns the initials of {@code str}: for every character that pinyin4j has a reading
     * for, the first letter of its first reading; every other character is copied unchanged.
     *
     * @param str the text to abbreviate; must not be {@code null}
     * @return the string of initials, one output character per input character
     */
    public static String getPinYinHeadChar(String str) {
        StringBuilder initials = new StringBuilder(str.length());
        for (int i = 0; i < str.length(); i++) {
            char ch = str.charAt(i);
            String[] readings = PinyinHelper.toHanyuPinyinStringArray(ch);
            if (readings != null) {
                initials.append(readings[0].charAt(0));
            } else {
                initials.append(ch);
            }
        }
        return initials.toString();
    }

    /**
     * Returns the bytes of {@code cnStr} as a lower-case hexadecimal string.
     *
     * <p>NOTE(review): the bytes are obtained with the platform default charset, so the
     * result differs between machines; the original local name suggested GBK was intended -
     * confirm before relying on it. Byte values below 0x10 are written as a single hex
     * digit (no zero padding), so the output cannot always be decoded unambiguously.
     *
     * @param cnStr the text to encode; must not be {@code null}
     * @return the concatenated hex values of the encoded bytes
     */
    public static String getCnASCII(String cnStr) {
        StringBuilder hex = new StringBuilder();
        byte[] bytes = cnStr.getBytes();
        for (byte b : bytes) {
            hex.append(Integer.toHexString(b & 0xff));
        }
        return hex.toString();
    }

    /** Tells whether {@code ch} lies in the U+4E00..U+9FA5 range. */
    private static boolean isHanzi(char ch) {
        return ch >= HANZI_FIRST && ch <= HANZI_LAST;
    }
}
3、DataColumnSelector
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.util.Properties;
/**
 * Command-line tool that copies selected columns out of tab-separated files and appends the
 * pinyin of one column to every row.
 *
 * <p>Settings are read from {@code ./columnselector.properties} (UTF-8):
 * {@code source_dir} (input directory), {@code dst_dir} (output directory),
 * {@code columns} (comma-separated, zero-based column indexes to keep) and
 * {@code chinese2pinyin} (zero-based index of the column to convert to pinyin).
 * Input files are decoded as GB2312; output files are written as UTF-8 under the same name.
 */
public class DataColumnSelector {
    static String SOURCE;
    static String DST;
    static int chinese;
    static int[] columns;

    /**
     * Loads the configuration and converts every regular file found directly in the source
     * directory. Prints {@code end} when finished.
     *
     * @param args ignored
     * @throws IOException if the configuration or any data file cannot be read or written
     * @throws IllegalArgumentException if a setting is missing or a data row has too few columns
     */
    public static void main(String[] args) throws UnsupportedEncodingException, FileNotFoundException, IOException {
        loadConfig(new File("./columnselector.properties"));

        File sourceDir = new File(SOURCE);
        if (sourceDir.isDirectory()) {
            File dstDir = new File(DST);
            if (!dstDir.isDirectory() && !dstDir.mkdirs()) {
                throw new IOException("Cannot create output directory: " + DST);
            }
            // listFiles() returns null when the directory cannot be listed.
            File[] files = sourceDir.listFiles();
            if (files != null) {
                for (File f : files) {
                    // Subdirectories cannot be opened as data files; skip them.
                    if (f.isFile()) {
                        convertFile(f, new File(dstDir, f.getName()));
                    }
                }
            }
        } else {
            System.err.println("source_dir is not a directory, nothing converted: " + SOURCE);
        }
        System.out.println("end");
    }

    /** Reads the properties file and fills the static settings of this class. */
    private static void loadConfig(File propertiesFile) throws IOException {
        Properties conf = new Properties();
        try (FileInputStream in = new FileInputStream(propertiesFile);
                InputStreamReader reader = new InputStreamReader(in, "utf-8")) {
            conf.load(reader);
        }
        SOURCE = requireProperty(conf, "source_dir");
        DST = requireProperty(conf, "dst_dir");
        String[] columnArray = requireProperty(conf, "columns").split(",");
        columns = new int[columnArray.length];
        for (int i = 0; i < columnArray.length; i++) {
            columns[i] = Integer.parseInt(columnArray[i].trim());
        }
        chinese = Integer.parseInt(requireProperty(conf, "chinese2pinyin").trim());
    }

    /** Returns the value of {@code key}, failing with a clear message when it is absent. */
    private static String requireProperty(Properties conf, String key) {
        String value = conf.getProperty(key);
        if (value == null) {
            throw new IllegalArgumentException("Missing property in columnselector.properties: " + key);
        }
        return value;
    }

    /** Converts one GB2312 input file into one UTF-8 output file, dropping the header row. */
    private static void convertFile(File input, File output) throws IOException {
        try (BufferedReader br = new BufferedReader(
                        new InputStreamReader(new FileInputStream(input), Charset.forName("GB2312")));
                BufferedWriter bw = new BufferedWriter(
                        new OutputStreamWriter(new FileOutputStream(output), Charset.forName("utf-8")))) {
            br.readLine(); // the first line is the header row; discard it
            int lineNo = 1;
            String line;
            while ((line = br.readLine()) != null) {
                lineNo++;
                if (line.isEmpty()) {
                    continue; // a blank line carries no columns
                }
                // limit -1 keeps trailing empty fields, so column indexes stay valid
                String[] cols = line.split("\t", -1);
                bw.write(buildRow(cols, input, lineNo));
                bw.newLine();
            }
        }
    }

    /** Builds one output row: the selected columns followed by the pinyin column. */
    private static String buildRow(String[] cols, File input, int lineNo) {
        StringBuilder sb = new StringBuilder();
        for (int index : columns) {
            sb.append(field(cols, index, input, lineNo)).append('\t');
        }
        // The pinyin is the last field, so there is no trailing tab to strip afterwards.
        sb.append(PinYin.getPingYin(field(cols, chinese, input, lineNo)));
        return sb.toString();
    }

    /** Returns column {@code index} of a row, reporting file and line when it does not exist. */
    private static String field(String[] cols, int index, File input, int lineNo) {
        if (index < 0 || index >= cols.length) {
            throw new IllegalArgumentException(input.getPath() + " line " + lineNo + ": column "
                    + index + " requested but the row has " + cols.length + " column(s)");
        }
        return cols[index];
    }
}