I recently took on a bulk data-processing task: importing data from text files and storing it in a search server. To improve throughput I used LinkedBlockingDeque, the blocking deque from Java's java.util.concurrent package. A producer thread reads and parses the data; consumer threads take documents off the queue and submit them to the search engine. The two classes are listed below, followed by a sketch of how they are wired together.
package com.lubanec.cache.model;
import java.util.concurrent.LinkedBlockingDeque;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.log4j.Logger;
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.common.SolrInputDocument;
import com.lubanec.factory.muticore.SinglerSolrServerFactory;
/**
 * @author 胡慧超
 * Consumer queue thread:
 * takes documents off the queue and submits them to the search server in batches.
 */
public class LogConsumerThread extends Thread {

    private static Logger log = Logger.getLogger(LogConsumerThread.class);
    private SolrServer server = SinglerSolrServerFactory.getInstance().getHotwordSolrServer();
    // externally visible stop flag; volatile so writes from other threads are seen
    public volatile boolean running = true;
    // producer/consumer in-memory queue
    private LinkedBlockingDeque<SolrInputDocument> linkedBlockingDeque;
    private static final AtomicInteger commitNum = new AtomicInteger();
    private static final AtomicInteger memoryNum = new AtomicInteger();
    private static final AtomicInteger totalNum = new AtomicInteger();

    public LogConsumerThread(LinkedBlockingDeque<SolrInputDocument> linkedBlockingDeque) {
        this.linkedBlockingDeque = linkedBlockingDeque;
    }
    @Override
    public void run() {
        try {
            LogDataLock lock = LogDataLock.getInstance();
            SolrInputDocument doc = null;
            // poll() retrieves and removes the head of the deque, waiting up to the
            // given timeout for an element; it returns null if the timeout elapses.
            // running is checked first so a polled document is never silently dropped.
            while (running && (doc = linkedBlockingDeque.poll(5, TimeUnit.SECONDS)) != null) {
                totalNum.incrementAndGet();
                // server.add(doc) by itself would be thread-safe, but the list shared
                // through the singleton is not, so guard it with a lock
                synchronized (lock) {
                    memoryNum.incrementAndGet();
                    lock.doclist.add(doc);
                    // flush to the search server in batches of 20000 documents
                    if (memoryNum.get() >= 20000) {
                        server.add(lock.doclist);
                        // after submitting, reset the list
                        lock.doclist.clear();
                        // and reset the in-memory counter
                        memoryNum.set(0);
                        commitNum.incrementAndGet();
                    }
                }
            }
            // flush whatever is still buffered
            synchronized (lock) {
                if (lock.doclist.size() > 0) {
                    server.add(lock.doclist);
                    // after submitting, reset the list
                    lock.doclist.clear();
                    memoryNum.set(0);
                    commitNum.incrementAndGet();
                }
            }
            log.info("consumer " + Thread.currentThread().getName() + " exiting, doc: " + doc
                    + ", running is " + running + ", commits to search engine: " + commitNum.get()
                    + ", docs still buffered: " + memoryNum.get()
                    + ", total docs processed: " + totalNum.get());
            // reset commitNum, memoryNum and totalNum before the thread dies,
            // so the counters start fresh for the next one-off web task
            commitNum.set(0);
            memoryNum.set(0);
            totalNum.set(0);
        } catch (Exception e) {
            log.error("Consumer thread error.", e);
        }
    }
}
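The LogDataLock class referenced above is not part of the original listing. From its usage (a getInstance() singleton whose shared doclist every consumer thread synchronizes on), a minimal sketch could look like the following; everything beyond getInstance() and doclist is an assumption:

package com.lubanec.cache.model;

import java.util.ArrayList;
import java.util.List;
import org.apache.solr.common.SolrInputDocument;

/**
 * Hypothetical sketch of the LogDataLock singleton: the consumer threads only
 * rely on getInstance() and the shared doclist buffer, synchronizing on the
 * instance itself before touching the list.
 */
public class LogDataLock {
    private static final LogDataLock INSTANCE = new LogDataLock();
    // shared buffer; always accessed inside synchronized (lock) blocks
    public final List<SolrInputDocument> doclist = new ArrayList<SolrInputDocument>();

    private LogDataLock() {
    }

    public static LogDataLock getInstance() {
        return INSTANCE;
    }
}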
package com.lubanec.cache.model;
import java.io.File;
import java.io.IOException;
import java.util.Map;
import java.util.concurrent.LinkedBlockingDeque;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Pattern;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.LineIterator;
import org.apache.log4j.Logger;
import org.apache.solr.common.SolrInputDocument;
import com.lubanec.utils.Constants;
import com.lubanec.utils.JsonWs;
/**
 * @author 胡慧超
 * Producer queue thread:
 * reads the log files and assembles the documents placed on the queue.
 */
@SuppressWarnings("rawtypes")
public class LogProductThread extends Thread {

    private static Logger log = Logger.getLogger(LogProductThread.class);
    private final static Pattern FILTERS = Pattern.compile(Constants.USER_SEARCH_LOG_DEFAULT);
    private static final AtomicInteger lognum = new AtomicInteger();
    // total number of log entries imported across all runs
    private static final AtomicInteger historyall = new AtomicInteger();
    // producer/consumer in-memory queue
    private LinkedBlockingDeque<SolrInputDocument> linkedBlockingDeque;
    // directory holding the log files
    private File dir;
    private String logname;

    public LogProductThread(LinkedBlockingDeque<SolrInputDocument> linkedBlockingDeque, File dir, String logname) {
        this.linkedBlockingDeque = linkedBlockingDeque;
        this.dir = dir;
        this.logname = logname;
    }
    @Override
    public void run() {
        // read every log file under the directory
        File[] files = dir.listFiles();
        if (files == null) {
            log.error("Directory not readable: " + dir.getAbsolutePath());
            return;
        }
        for (File file : files) {
            if (!file.isFile()) {
                log.error("Not a regular file: " + file.getName());
                continue;
            }
            // unless logname is "all", only read files whose name matches it
            if (!"all".equals(logname) && !file.getName().endsWith(logname)) {
                continue;
            }
            LineIterator lineIterator = null;
            try {
                lineIterator = FileUtils.lineIterator(file, "UTF-8");
                while (lineIterator.hasNext()) {
                    String line = lineIterator.nextLine();
                    // skip lines that do not match the expected log pattern
                    if (!FILTERS.matcher(line).matches()) {
                        continue;
                    }
                    String timetemp = line.split("\\|")[0];
                    String searchinfo = line.substring(timetemp.length() + 1);
                    Map map = JsonWs.parseStringToMap(searchinfo);
                    SolrInputDocument doc = new SolrInputDocument();
                    doc.addField("timetemp", timetemp.replaceAll("-", "").replaceAll(" ", "").replaceAll(":", "").substring(0, 14));
                    doc.addField("keyword", map.get("keyword") != null ? map.get("keyword") : "");
                    doc.addField("cate_id", map.get("cate_id") != null ? map.get("cate_id") : "");
                    doc.addField("brand", map.get("brand") != null ? map.get("brand") : "");
                    lognum.incrementAndGet();
                    historyall.incrementAndGet();
                    // add() throws if the deque is bounded and full; fine as long as
                    // the queue is created unbounded (as in the driver sketch below)
                    linkedBlockingDeque.add(doc);
                }
            } catch (IOException e) {
                log.error("File reading line error." + e.toString(), e);
            } finally {
                LineIterator.closeQuietly(lineIterator);
            }
        }
        // note: historyall.get(), not incrementAndGet(), so logging does not skew the count
        log.info("producer exiting; finished reading directory " + dir.getName()
                + ". Log entries read from this directory: " + lognum.get()
                + "; total entries imported so far: " + historyall.get());
        lognum.set(0);
    }
}
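For reference, the parsing logic above implies one entry per line: a timestamp, a "|" separator, and a JSON payload, roughly like this hypothetical line:

2014-05-06 12:30:45|{"keyword":"phone","cate_id":"123","brand":"acme"}

Below is a minimal sketch of how the two threads might be wired together. The directory path, the single-producer/four-consumer split, and the class name LogImportDriver are assumptions for illustration, not part of the original code:

package com.lubanec.cache.model;

import java.io.File;
import java.util.concurrent.LinkedBlockingDeque;
import org.apache.solr.common.SolrInputDocument;

public class LogImportDriver {
    public static void main(String[] args) throws InterruptedException {
        // unbounded deque, because the producer uses add(), which would throw
        // IllegalStateException on a full bounded deque
        LinkedBlockingDeque<SolrInputDocument> queue = new LinkedBlockingDeque<SolrInputDocument>();

        // consumers drain the queue into the search server
        LogConsumerThread[] consumers = new LogConsumerThread[4];
        for (int i = 0; i < consumers.length; i++) {
            consumers[i] = new LogConsumerThread(queue);
            consumers[i].start();
        }

        // one producer reads every log file in the directory ("all" = no name filter)
        LogProductThread producer = new LogProductThread(queue, new File("/data/search-logs"), "all");
        producer.start();
        producer.join();

        // consumers exit on their own once poll() has timed out on the empty queue
        for (LogConsumerThread c : consumers) {
            c.join();
        }
    }
}

Note that the consumers give up after the queue has stayed empty for five seconds, so they should not be started long before the producer begins filling the queue; otherwise the poll timeout needs to be raised.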