I recently looked into the performance of loading data into Phoenix over JDBC. Test environment:
Hadoop (2.3.0+cdh5.0.0+548)
HBase (0.96.1.1+cdh5.0.0+60)
Phoenix 4.0.0.0
Create table TAB_PS_XDR2 with four indexed columns, and create a separate index table for each of them, as follows:
create table TAB_PS_XDR2 ( K VARCHAR NOT NULL PRIMARY KEY, A VARCHAR, B VARCHAR, C VARCHAR, D VARCHAR, V VARCHAR) COMPRESSION='SNAPPY',SALT_BUCKETS=12,TTL='7200',VERSIONS='5';
create index CALLING_NUMBER2 on TAB_PS_XDR2 (A);
create index CALLED_NUMBER2 on TAB_PS_XDR2 (B);
create index IMSI2 on TAB_PS_XDR2 (C);
create index IMEI2 on TAB_PS_XDR2 (D);
Test code:
// Requires: java.io.*, java.lang.management.ManagementFactory, java.nio.MappedByteBuffer,
// java.nio.channels.FileChannel, java.sql.Connection, java.sql.PreparedStatement
public static void readCVS5(int pre, String path) {
    File file = new File(path);
    long fileLength = file.length();
    // The buffer covers the whole file, so the chunk loop below runs only once
    // for these ~95 MB test files.
    int BUFFER_SIZE = (int) fileLength;
    Connection _Connection = null;
    PreparedStatement _PreparedStatement = null;
    FileChannel fc = null;
    String tSQL = "UPSERT INTO TAB_PS_XDR2 VALUES(?,?,?,?,?,?)";
    try {
        _Connection = HBaseUtility.getConnection();
        _Connection.setAutoCommit(false);
        _PreparedStatement = _Connection.prepareStatement(tSQL);
        long start = System.currentTimeMillis();
        String pid = ManagementFactory.getRuntimeMXBean().getName().split("@")[0];
        fc = new RandomAccessFile(file, "r").getChannel();
        // Memory-map the whole file for reading.
        MappedByteBuffer inputBuffer = fc.map(
                FileChannel.MapMode.READ_ONLY, 0, fileLength);
        byte[] dst = new byte[BUFFER_SIZE];
        int j = 0;
        for (long offset = 0; offset < fileLength; offset += BUFFER_SIZE) {
            // Copy the next chunk out of the mapped buffer.
            int chunkLength = (int) Math.min(BUFFER_SIZE, fileLength - offset);
            for (int i = 0; i < chunkLength; i++) {
                dst[i] = inputBuffer.get((int) (offset + i));
            }
            // Hand only this chunk's bytes to the reader, not the whole array.
            InputStream in = new ByteArrayInputStream(dst, 0, chunkLength);
            BufferedReader reader = new BufferedReader(new InputStreamReader(in), BUFFER_SIZE);
            String line = reader.readLine();
            String[] strs = null;
            while (line != null) {
                strs = parserLine2(line, ",");
                if (strs.length > 5) {
                    _PreparedStatement.setString(1, strs[0]);
                    _PreparedStatement.setString(2, strs[1]);
                    _PreparedStatement.setString(3, strs[2]);
                    _PreparedStatement.setString(4, strs[3]);
                    _PreparedStatement.setString(5, strs[4]);
                    _PreparedStatement.setString(6, strs[5]);
                    _PreparedStatement.addBatch();
                    // Execute and commit every `pre` rows.
                    if ((++j) % pre == 0) {
                        _PreparedStatement.executeBatch();
                        _PreparedStatement.clearBatch();
                        _Connection.commit();
                        System.out.println("executeInsert::" + pid);
                    }
                } else {
                    System.out.println("Malformed line at row " + j);
                }
                line = reader.readLine();
            }
            reader.close();
            // Flush the final partial batch.
            _PreparedStatement.executeBatch();
            _PreparedStatement.clearBatch();
            _Connection.commit();
            System.out.println("executeInsert-LastFlush!!" + pid);
            long totalTime = System.currentTimeMillis() - start;
            System.out.println("Rows per second: " + j * 1000L / Math.max(totalTime, 1));
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        try {
            if (fc != null) fc.close();
            if (_PreparedStatement != null) _PreparedStatement.close();
            if (_Connection != null) _Connection.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
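The code above depends on two author helpers that are not shown: parserLine2 (a line splitter) and HBaseUtility.getConnection(). A Phoenix connection helper usually just loads the Phoenix JDBC driver and opens a connection against the ZooKeeper quorum; the sketch below is only an assumption about what such a helper might look like (the class name is kept, but the quorum address is a placeholder):

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;

// Hypothetical sketch of the helper used above; the real HBaseUtility is not
// shown in this post. Adjust the ZooKeeper quorum to match your cluster.
public class HBaseUtility {
    private static final String PHOENIX_URL = "jdbc:phoenix:zk1,zk2,zk3:2181";

    public static Connection getConnection() throws SQLException {
        try {
            // Register the Phoenix JDBC driver (newer drivers auto-register,
            // but loading it explicitly is harmless).
            Class.forName("org.apache.phoenix.jdbc.PhoenixDriver");
        } catch (ClassNotFoundException e) {
            throw new SQLException("Phoenix JDBC driver not on the classpath", e);
        }
        return DriverManager.getConnection(PHOENIX_URL);
    }
}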
Single-process test (commit every 10,000 rows):
sh cmd0.sh 10000 /home/cloudil/limq/test/a-bssap-1.cdr.csv &
The run took about 23 seconds.
Ten-process test (commit every 10,000 rows):
The ten processes finished in about 79 seconds. With each file averaging about 95 MB, that is roughly 10 × 95 MB / 79 s ≈ 12 MB of data processed per second.
Committing every 10,000-11,000 rows appears to be the sweet spot.
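cmd0.sh presumably just passes the batch size and file path into the JVM; a minimal main() that wires those two arguments to readCVS5 might look like the following (this is an assumption about how the script invokes the class, not the actual script):

// Hypothetical entry point; assumes cmd0.sh runs this class with
// <commit batch size> <csv path> as its two arguments.
public static void main(String[] args) {
    if (args.length < 2) {
        System.out.println("Usage: <commit batch size> <csv file path>");
        return;
    }
    int batchSize = Integer.parseInt(args[0]);  // e.g. 10000
    String csvPath = args[1];                   // e.g. /home/cloudil/limq/test/a-bssap-1.cdr.csv
    readCVS5(batchSize, csvPath);
}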