Java 高性能多线程读取大文件并分段分流翻译入库

需求:每天读取一个大小在 1 GB 到 3 GB 之间、数据量约 400 万行的文件,读取完成后调用谷歌翻译 API 翻译成中文,再保存到数据库。

实现方面采用多线程配合 RandomAccessFile 分片读取,并通过缓冲区分流处理,其中还用到了一些并发工具类,例如 CyclicBarrier、AtomicInteger 等。

读取完成后会与 Redis 交互:因为每天都会有重复数据,所以这里先与 Redis 中的记录做对比过滤。

调用谷歌翻译 API 的部分采用了分段提交,例如每组 100 条记录调用一次 API。同时,为了解决并发频繁调用 API 的问题(这里曾被谷歌限流,返回 403 错误),采用了 Guava 的 RateLimiter 做限流;如果是分布式系统,建议采用基于 Redis 的限流方案,关于并发限流算法,大家可以在网上查阅。最后是保存入库:目前公司用的是 JPA(Hibernate),其中实体类用了 version 乐观锁,导致并发保存时报了乐观锁错误;如果对数据一致性要求不高,可以忽略,我这里是直接去掉了 version 字段。下面贴上完整代码,主要是 3 个类。


package com.hilton.hcs.china.ari.file;

import org.springframework.stereotype.Service;

/**
 * Callback that receives the lines decoded by {@code BigFileReader}.
 *
 * <p>{@code BigFileReader} invokes {@link #handle(String)} once for every non-empty line,
 * from the worker threads of its thread pool, so several calls may run at the same time.
 * Implementations that keep state must therefore provide their own synchronization.
 *
 * <p>The interface deliberately carries no Spring stereotype annotation: component scanning
 * only registers concrete classes, so an annotation placed here would have no effect and
 * would wrongly suggest that implementations are managed beans.
 */
@FunctionalInterface
public interface IHandle {

    /**
     * Processes a single line of the file.
     *
     * @param line the decoded line content, without its line terminator; never empty when
     *             called by {@code BigFileReader}
     */
    void handle(String line);
}
package com.hilton.hcs.china.ari.file;

import java.io.*;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel.MapMode;
import java.security.InvalidParameterException;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.CyclicBarrier;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.atomic.AtomicLong;

/**
 * Reads a large text file in parallel and hands every non-empty line to an {@link IHandle}.
 *
 * <p>The file is cut into at most {@code threadSize} byte ranges ("slices"). Every slice ends
 * on a line terminator ({@code '\n'} or {@code '\r'}) or at the end of the file, so a line is
 * never split between two workers. Each slice is read through memory-mapped buffers by one task
 * of a fixed thread pool. When all slices are done, the elapsed time and the number of handled
 * lines are printed to standard output.
 *
 * <p>Lines are delivered concurrently and in no particular order. Instances are created through
 * {@link Builder}.
 */
public class BigFileReader {
    /** Largest region that a single {@code FileChannel.map} call accepts. */
    private static final long MAX_MAP_SIZE = Integer.MAX_VALUE;

    private final int threadSize;
    private final String charset;
    private final int bufferSize;
    private final IHandle handle;
    private final ExecutorService executorService;
    private final long fileLength;
    private final RandomAccessFile rAccessFile;
    private final Set<StartEndPair> startEndPairs;
    private CyclicBarrier cyclicBarrier;
    private final AtomicLong counter = new AtomicLong(0);

    /**
     * @param file       file to read
     * @param handle     callback receiving each non-empty line
     * @param charset    charset name used to decode lines, or {@code null} for the platform default
     * @param bufferSize size in bytes of each worker's read buffer
     * @param threadSize number of worker threads, which is also the maximum number of slices
     * @throws IllegalArgumentException if the file cannot be opened for reading
     */
    private BigFileReader(File file, IHandle handle, String charset, int bufferSize, int threadSize) {
        this.fileLength = file.length();
        this.handle = handle;
        this.charset = charset;
        this.bufferSize = bufferSize;
        this.threadSize = threadSize;
        try {
            this.rAccessFile = new RandomAccessFile(file, "r");
        } catch (FileNotFoundException e) {
            // Fail fast: continuing with a null file would only surface later as a NullPointerException.
            throw new IllegalArgumentException("Cannot open file for reading: " + file, e);
        }
        this.executorService = Executors.newFixedThreadPool(threadSize);
        this.startEndPairs = new HashSet<StartEndPair>();
    }

    /**
     * Computes the slices and submits one reader task per slice. The method returns as soon as
     * the tasks are submitted; completion is reported on standard output by the last worker.
     */
    public void start() {
        if (fileLength <= 0) {
            // No slice can be built from an empty file, and a CyclicBarrier needs at least one party.
            System.out.println("use time: 0");
            System.out.println("all line: " + counter.get());
            return;
        }
        // Round up: with a rounded-down size the remainder bytes could form one slice more than
        // there are pool threads, and that extra task would never run while every pool thread
        // is parked at the barrier. Rounding up also keeps the size >= 1 for tiny files.
        long everySize = (this.fileLength + this.threadSize - 1) / this.threadSize;
        try {
            calculateStartEnd(0, everySize);
        } catch (IOException e) {
            e.printStackTrace();
            return;
        }

        final long startTime = System.currentTimeMillis();
        cyclicBarrier = new CyclicBarrier(startEndPairs.size(), new Runnable() {

            @Override
            public void run() {
                System.out.println("use time: " + (System.currentTimeMillis() - startTime));
                System.out.println("all line: " + counter.get());
            }
        });
        for (StartEndPair pair : startEndPairs) {
            System.out.println("分配分片:" + pair);
            this.executorService.execute(new SliceReaderTask(pair));
        }
    }

    /**
     * Fills {@code startEndPairs} with consecutive slices covering the file from {@code start}
     * to its last byte.
     *
     * @param start offset of the first byte to cover
     * @param size  nominal slice size in bytes (at least 1); a slice is extended until it ends on
     *              a line terminator or at the end of the file
     * @throws IOException if the file cannot be read while looking for a line terminator
     */
    private void calculateStartEnd(long start, long size) throws IOException {
        long sliceStart = start;
        while (sliceStart <= fileLength - 1) {
            StartEndPair pair = new StartEndPair();
            pair.start = sliceStart;
            long endPosition = sliceStart + size - 1;
            if (endPosition >= fileLength - 1) {
                pair.end = fileLength - 1;
                startEndPairs.add(pair);
                return;
            }

            rAccessFile.seek(endPosition);
            byte tmp = (byte) rAccessFile.read();
            while (tmp != '\n' && tmp != '\r') {
                endPosition++;
                if (endPosition >= fileLength - 1) {
                    endPosition = fileLength - 1;
                    break;
                }
                // read() already advanced the file pointer to endPosition, so no seek is needed.
                tmp = (byte) rAccessFile.read();
            }
            pair.end = endPosition;
            startEndPairs.add(pair);

            sliceStart = endPosition + 1;
        }
    }

    /**
     * Closes the file and shuts the thread pool down. Already submitted tasks are not cancelled,
     * but a task that has not mapped its slice yet will fail once the file is closed, so call
     * this only after the completion report has been printed.
     */
    public void shutdown() {
        try {
            this.rAccessFile.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
        this.executorService.shutdown();
    }

    /**
     * Decodes the bytes of one line and forwards it to the callback; empty lines are ignored
     * and not counted.
     *
     * @param bytes raw bytes of the line, without the line terminator
     * @throws UnsupportedEncodingException if the configured charset name is not supported
     */
    private void handle(byte[] bytes) throws UnsupportedEncodingException {
        String line;
        if (this.charset == null) {
            line = new String(bytes);
        } else {
            line = new String(bytes, charset);
        }
        if (!line.isEmpty()) {
            this.handle.handle(line);
            counter.incrementAndGet();
        }
    }

    /** Inclusive byte range {@code [start, end]} of one slice of the file. */
    private static class StartEndPair {
        public long start;
        public long end;

        @Override
        public String toString() {
            return "star=" + start + ";end=" + end;
        }

        @Override
        public int hashCode() {
            final int prime = 31;
            int result = 1;
            result = prime * result + (int) (end ^ (end >>> 32));
            result = prime * result + (int) (start ^ (start >>> 32));
            return result;
        }

        @Override
        public boolean equals(Object obj) {
            if (this == obj) {
                return true;
            }
            if (obj == null || getClass() != obj.getClass()) {
                return false;
            }
            StartEndPair other = (StartEndPair) obj;
            return end == other.end && start == other.start;
        }

    }

    /** Reads one slice, splits it into lines and then waits at the barrier for the other slices. */
    private class SliceReaderTask implements Runnable {
        private final long start;
        private final long sliceSize;
        private final byte[] readBuff;

        public SliceReaderTask(StartEndPair pair) {
            this.start = pair.start;
            this.sliceSize = pair.end - pair.start + 1;
            this.readBuff = new byte[bufferSize];
        }

        @Override
        public void run() {
            try {
                readSlice();
            } catch (Exception e) {
                e.printStackTrace();
            } finally {
                // Always arrive at the barrier: if a failed slice skipped it, every other
                // worker would stay blocked and the completion report would never be printed.
                awaitOtherSlices();
            }
        }

        /** Maps the slice window by window and emits every line found in it. */
        private void readSlice() throws IOException {
            ByteArrayOutputStream bos = new ByteArrayOutputStream();
            long position = start;
            long remaining = sliceSize;
            while (remaining > 0) {
                // A single mapping cannot exceed Integer.MAX_VALUE bytes, so a larger slice is
                // mapped piecewise; "bos" carries a line that crosses a window border.
                long windowSize = Math.min(remaining, MAX_MAP_SIZE);
                MappedByteBuffer mapBuffer = rAccessFile.getChannel().map(MapMode.READ_ONLY, position, windowSize);
                while (mapBuffer.hasRemaining()) {
                    int readLength = Math.min(bufferSize, mapBuffer.remaining());
                    mapBuffer.get(readBuff, 0, readLength);
                    for (int i = 0; i < readLength; i++) {
                        byte tmp = readBuff[i];
                        if (tmp == '\n' || tmp == '\r') {
                            handle(bos.toByteArray());
                            bos.reset();
                        } else {
                            bos.write(tmp);
                        }
                    }
                }
                position += windowSize;
                remaining -= windowSize;
            }
            // Last line of the slice when it is not followed by a terminator.
            if (bos.size() > 0) {
                handle(bos.toByteArray());
            }
        }

        /** Blocks until every slice task has arrived at the barrier. */
        private void awaitOtherSlices() {
            try {
                cyclicBarrier.await();
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                e.printStackTrace();
            } catch (java.util.concurrent.BrokenBarrierException e) {
                e.printStackTrace();
            }
        }

    }

    /** Fluent builder for {@link BigFileReader}. */
    public static class Builder {
        private int threadSize = 1;
        private String charset = null;
        private int bufferSize = 1024 * 1024;
        private IHandle handle;
        private File file;

        /**
         * @param file   path of the file to read
         * @param handle callback receiving each non-empty line
         * @throws IllegalArgumentException if the file does not exist
         */
        public Builder(String file, IHandle handle) {
            this.file = new File(file);
            if (!this.file.exists()) {
                throw new IllegalArgumentException("The file does not exist: " + file);
            }
            this.handle = handle;

        }

        /**
         * Sets the number of worker threads. The misspelled name is kept for existing callers;
         * {@link #withThreadSize(int)} is the correctly spelled equivalent.
         *
         * @param size number of worker threads, at least 1
         * @return this builder
         * @throws InvalidParameterException if {@code size} is less than 1
         */
        public Builder withTreahdSize(int size) {
            if (size < 1) {
                throw new InvalidParameterException("The threadCount can not be less than 1");
            }
            this.threadSize = size;
            return this;
        }

        /**
         * Sets the number of worker threads.
         *
         * @param size number of worker threads, at least 1
         * @return this builder
         * @throws InvalidParameterException if {@code size} is less than 1
         */
        public Builder withThreadSize(int size) {
            return withTreahdSize(size);
        }

        /**
         * @param charset charset name used to decode lines, or {@code null} for the platform default
         * @return this builder
         */
        public Builder withCharset(String charset) {
            this.charset = charset;
            return this;
        }

        /**
         * @param bufferSize size in bytes of each worker's read buffer, at least 1
         * @return this builder
         * @throws InvalidParameterException if {@code bufferSize} is less than 1
         */
        public Builder withBufferSize(int bufferSize) {
            if (bufferSize < 1) {
                // A non-positive size would make the read loop spin forever or fail on allocation.
                throw new InvalidParameterException("The bufferSize can not be less than 1");
            }
            this.bufferSize = bufferSize;
            return this;
        }

        /**
         * @return a reader configured with the values collected so far
         * @throws IllegalArgumentException if the file cannot be opened for reading
         */
        public BigFileReader build() {
            return new BigFileReader(this.file, this.handle, this.charset, this.bufferSize, this.threadSize);
        }
    }


}
package com.hilton.hcs.china.ari.file;

import com.google.common.util.concurrent.RateLimiter;
import com.google.gson.Gson;
import com.hilton.hcs.china.ari.model.TransData;
import com.hilton.hcs.china.ari.redis.RedisClientTemplate;
import com.hilton.hcs.china.ari.service.AbstractService;
import com.hilton.hcs.china.ari.utils.StringUtil;
import com.hilton.hcs.data.content.ShopPropertyInfo;
import com.hilton.hcs.data.content.ShopPropertyInfoRepository;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.http.HttpMethod;
import org.springframework.http.ResponseEntity;
import org.springframework.stereotype.Service;

import java.io.File;
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;


/**
 * Imports description files: every file of a fixed local directory is read with
 * {@link BigFileReader}, descriptions that are new or changed compared to Redis are collected,
 * and the collected records are saved through the repository in batches.
 *
 * <p>The Google translation call itself is currently disabled in the batch step; every record
 * is stored with the placeholder description {@code "暂无"}.
 */
@Service
public class HiltonTransFile extends AbstractService {
    /** Maximum number of descriptions handled per batch. */
    private static final int BATCH_SIZE = 100;
    /** File kind for room type files (file name does not contain "srp"). */
    private static final int FILE_TYPE_ROOM = 0;
    /** File kind for rate plan / price files (file name contains "srp"). */
    private static final int FILE_TYPE_RATE_PLAN = 1;

    private static AtomicInteger count = new AtomicInteger(0);
    @Autowired
    private RedisClientTemplate redisClientTemplate;

    /**
     * Starts one asynchronous {@link BigFileReader} per regular file found in the import
     * directory. The method returns after the readers have been started; it does not wait for
     * them to finish. Failures are printed to standard error.
     *
     * <p>NOTE(review): the readers are never shut down here, because
     * {@code BigFileReader.shutdown()} closes the file and must not be called before the
     * asynchronous read has completed.
     *
     * @param shopPropertyInfoRepository repository used to persist the collected records
     */
    public void transFile(ShopPropertyInfoRepository shopPropertyInfoRepository) {
        // Files are taken from a fixed local directory (an earlier FTP download step is disabled).
        try {
            File filedir = new File("C:\\Users\\DT302\\Desktop\\翻译\\");
            if (!filedir.exists()) {
                throw new Exception("file dir not exists");
            }
            File[] files = filedir.listFiles();
            if (files == null) {
                // listFiles() returns null when the path is not a directory or cannot be read.
                throw new Exception("file dir can not be listed");
            }
            for (File file : files) {
                if (file.isDirectory()) {
                    continue;
                }
                // The kind is fixed per file and handed to that file's handler. A shared static
                // would be overwritten for the next file while this file's workers still run.
                int fileType = file.getName().contains("srp") ? FILE_TYPE_RATE_PLAN : FILE_TYPE_ROOM;
                BigFileReader.Builder builder = new BigFileReader.Builder(
                        "C:\\Users\\DT302\\Desktop\\翻译\\" + file.getName(),
                        new DescriptionLineHandler(fileType, shopPropertyInfoRepository));
                // One worker per available processor; 1 MiB read buffer per worker.
                builder.withTreahdSize(Runtime.getRuntime().availableProcessors()).withCharset("gbk")
                        .withBufferSize(1024 * 1024);
                BigFileReader bigFileReader = builder.build();
                bigFileReader.start();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Line callback for one file. All state is guarded by the monitor of the handler instance,
     * because {@link BigFileReader} calls {@link #handle(String)} from several threads.
     */
    private class DescriptionLineHandler implements IHandle {
        private final int fileType;
        private final ShopPropertyInfoRepository shopPropertyInfoRepository;
        /** Records waiting to be saved; index-aligned with {@link #trans}. */
        private final List<ShopPropertyInfo> shopPropertyInfos = new ArrayList<>();
        /** Source texts waiting to be translated; index-aligned with {@link #shopPropertyInfos}. */
        private final List<String> trans = new ArrayList<>();
        /** Limits batch submissions to 50 per second. */
        private final RateLimiter limiter = RateLimiter.create(50.0);

        DescriptionLineHandler(int fileType, ShopPropertyInfoRepository shopPropertyInfoRepository) {
            this.fileType = fileType;
            this.shopPropertyInfoRepository = shopPropertyInfoRepository;
        }

        /**
         * Parses one {@code '|'}-separated line, queues its new or changed descriptions and
         * flushes the queue. Any failure is printed and does not stop the import.
         *
         * @param line one line of the import file
         */
        @Override
        public synchronized void handle(String line) {
            try {
                String[] arr = line.split("\\|");
                if (fileType == FILE_TYPE_ROOM || arr.length < 123) {
                    collectRoomType(arr);
                } else {
                    collectRatePlan(arr);
                }
                transToDB();
            } catch (Exception e) {
                e.printStackTrace();
            }
            System.out.println("=========line is " + count.addAndGet(1));
        }

        /** Queues the room type description (field 14) when it differs from the value in Redis. */
        private void collectRoomType(String[] arr) {
            if (arr.length < 15) {
                // The description column is missing; skip instead of failing on the array access.
                System.err.println("skip line with only " + arr.length + " fields");
                return;
            }
            String propCode = arr[0];
            String roomTypeCode = arr[1];
            String sourceDesc = arr[14];
            String redisKey = propCode + "_" + roomTypeCode;
            String sourceDescRedis = redisClientTemplate.get(redisKey);
            if (StringUtil.isEmpty(sourceDescRedis) || !sourceDesc.equals(sourceDescRedis)) {
                ShopPropertyInfo shopPropertyInfo = new ShopPropertyInfo();
                shopPropertyInfo.setPropPode(propCode);
                shopPropertyInfo.setRoomTypeCode(roomTypeCode);
                shopPropertyInfo.setSourceDesc(sourceDesc);
                shopPropertyInfos.add(shopPropertyInfo);
                trans.add(sourceDesc.toLowerCase());
                redisClientTemplate.set(redisKey, sourceDesc);
            }
        }

        /**
         * Queues the rate plan descriptions (fields 113 to 122) that differ from the values in
         * Redis. Only called for lines with at least 123 fields.
         */
        private void collectRatePlan(String[] arr) {
            // NOTE(review): lines with exactly 123 fields are skipped, as in the original
            // condition "arr.length > 123" - confirm whether ">= 123" was intended.
            if (arr.length <= 123) {
                return;
            }
            String propCode = arr[3];
            String srpCode = arr[2];
            for (int i = 113, j = 0; i < 123; i++, j++) {
                if (StringUtil.isEmpty(arr[i])) {
                    continue;
                }
                String sourceDesc = arr[i].toLowerCase();
                String redisKey = propCode + "_" + srpCode + "_" + j;
                String sourceDescRedis = redisClientTemplate.get(redisKey);
                if (StringUtil.isEmpty(sourceDescRedis) || !sourceDesc.equals(sourceDescRedis)) {
                    ShopPropertyInfo shopPropertyInfo = new ShopPropertyInfo();
                    shopPropertyInfo.setPropPode(propCode);
                    shopPropertyInfo.setRatePlanCode(srpCode);
                    shopPropertyInfo.setSourceDesc(sourceDesc);
                    trans.add(sourceDesc);
                    shopPropertyInfos.add(shopPropertyInfo);
                    redisClientTemplate.set(redisKey, sourceDesc);
                }
            }
        }

        /**
         * Saves all queued records in batches of at most {@link #BATCH_SIZE}.
         *
         * <p>Each batch is always taken from the head of the queue and removed once it has been
         * saved. (Skipping whenever more than one entry was queued, or computing the batch
         * offsets before entries are removed, would leave records unsaved or run out of range.)
         */
        private void transToDB() {
            while (!trans.isEmpty()) {
                int batchSize = Math.min(BATCH_SIZE, trans.size());
                List<String> cutList = trans.subList(0, batchSize);
                List<ShopPropertyInfo> cutBeanList = shopPropertyInfos.subList(0, batchSize);

                // Throttle per submitted batch; blocks while the permitted rate is exceeded.
                limiter.acquire();
                // The translation request is disabled for now. When enabled, the batch would be
                // sent with transByGoole("en", "zh-CN", cutList) and the JSON answer parsed into
                // TransData with Gson before the records are stored.
                saveBatch(cutBeanList);

                // Clearing the sub-list views removes the saved entries from the backing queues.
                cutList.clear();
                cutBeanList.clear();
            }
        }

        /** Writes the description of every record of the batch to Redis and saves the record. */
        private void saveBatch(List<ShopPropertyInfo> batch) {
            for (int j = 0; j < batch.size(); j++) {
                ShopPropertyInfo shopPropertyInfo = batch.get(j);
                shopPropertyInfo.setDescription("暂无");
                // The batch index j is appended to the key; per the original note it is used
                // when the shop query reads the values back from Redis and keeps entries of the
                // same rate plan code from overwriting each other.
                String code = !StringUtil.isEmpty(shopPropertyInfo.getRoomTypeCode())
                        ? shopPropertyInfo.getRoomTypeCode()
                        : shopPropertyInfo.getRatePlanCode();
                if (!StringUtil.isEmpty(shopPropertyInfo.getDescription())) {
                    redisClientTemplate.set(shopPropertyInfo.getPropPode() + "_" + code + "_zh" + j,
                            shopPropertyInfo.getDescription());
                }
                shopPropertyInfo.setCreatedTime(new Date());
                shopPropertyInfoRepository.save(shopPropertyInfo);
            }
        }
    }

    /**
     * Sends the texts to the Google Translation v2 REST endpoint.
     *
     * @param source source language code
     * @param target target language code
     * @param trans  texts to translate
     * @return the response body converted to a string ({@code "null"} when there is no body)
     */
    public String transByGoole(String source, String target, List<String> trans) {
        Map<String, Object> params = new HashMap<>();
        params.put("q", trans);
        params.put("target", target);
        params.put("format", "text");
        params.put("source", source);
        params.put("model", "nmt");
        ResponseEntity<?> googleResponse = exchange("https://translation.googleapis.com/language/translate/v2?key=yourKey", HttpMethod.POST, params, String.class, null);
        return googleResponse.getBody() + "";
    }

    /** Manual smoke test: performs one translation request and prints the first translated text. */
    public static void main(String args[]) {
        String body = new HiltonTransFile().transByGoole("", "", null);
        System.out.println(new Gson().fromJson(body, TransData.class).getData().getTranslations().get(0).getTranslatedText());
    }

}



阅读更多
想对作者说点什么?

博主推荐

换一批

没有更多推荐了,返回首页