需求是:每天读取一个大小在1G到3G之间的文件,数据量约400万条记录,读取完成后调用谷歌翻译API译成中文,再保存到数据库。
实现方面采用多线程,使用RandomAccessFile读取文件,并通过缓冲区分流,其中还用到了一些并发工具类,例如CyclicBarrier,
AtomicInteger等。读取完成后会与Redis交互:因为每天会有重复数据,所以这里通过Redis做对比过滤。调用谷歌API翻译的部分,
采用了分段提交,例如每100条记录为一组调用一次API。同时,为了解决并发频繁调用API的问题(这里曾被谷歌限流,返回403错误),采用了Guava库进行限流;如果是分布式系统,建议采用基于Redis的解决方案。关于并发限流算法,大家可以上网查阅。最后是保存入库:目前公司用的都是JPA和Hibernate,其中实体类使用了version乐观锁,导致并发时报了乐观锁错误;如果对数据一致性要求不高,可以忽略,我这里是去掉了version字段。下面贴上完整代码,主要是3个类。
package com.hilton.hcs.china.ari.file;

import org.springframework.stereotype.Service;

/**
 * Callback for processing a single line of text.
 *
 * <p>Presumably invoked once per line read from the input file; confirm
 * against the caller.
 *
 * <p>NOTE(review): {@code @Service} is declared on the interface itself.
 * Spring component scanning registers concrete classes only, so confirm that
 * the implementations carry their own stereotype annotation.
 */
@Service
public interface IHandle {

    /**
     * Handles one line of text.
     *
     * @param line the line to process
     */
    void handle(String line);
}
package com.hilton.hcs.china.ari.file; import java.io.*; import java.nio.MappedByteBuffer; import java.nio.channels.FileChannel.MapMode; import java.security.InvalidParameterException; import java.util.HashSet; import java.util.Set; import java.util.concurrent.CyclicBarrier; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.atomic.AtomicLong; public class BigFileReader { private int threadSize; private String charset; private int bufferSize; private IHandle handle; private ExecutorService executorService; private long fileLength; private RandomAccessFile rAccessFile; private Set<StartEndPair> startEndPairs; private CyclicBarrier cyclicBarrier; private AtomicLong counter = new AtomicLong(0); private BigFileReader(File file, IHandle handle, String charset, int bufferSize, int threadSize) { this.fileLength = file.length(); this.handle = handle; this.charset = charset; this.bufferSize = bufferSize; this.threadSize = threadSize; try { this.rAccessFile = new RandomAccessFile(file, "r"); } catch (FileNotFoundException e) { e.printStackTrace(); } this.executorService = Executors.newFixedThreadPool(threadSize); startEndPairs = new HashSet<StartEndPair>(); } public void start() { long everySize = this.fileLength / this.threadSize; try { calculateStartEnd(0, everySize); } catch (IOException e) { e.printStackTrace(); return; } final long startTime = System.currentTimeMillis(); cyclicBarrier = new CyclicBarrier(startEndPairs.size(), new Runnable() { @Override public void run() { System.out.println("use time: " + (System.currentTimeMillis() - startTime)); System.out.println("all line: " + counter.get()); } }); for (StartEndPair pair : startEndPairs) { System.out.println("分配分片:" + pair); this.executorService.execute(new SliceReaderTask(pair)); } } private void calculateStartEnd(long start, long size) throws IOException { if (start > fileLength - 1) { return; } StartEndPair pair = new StartEndPair(); pair.start = start; 
long endPosition = start + size - 1; if (endPosition >= fileLength - 1) { pair.end = fileLength - 1; startEndPairs.add(pair); return; } rAccessFile.seek(endPosition); byte tmp = (byte) rAccessFile.read();