最近有这样一个需求,需要修改一个字段的mapping和要添加一个字段,新增字段和老的字段value要一样,也就是要复制一个字段的值到这个新增字段上来,保持两个内容一致,新增字段做不分词处理,用来精确匹配搜索。但elasticsearch的mapping是不能修改的,所以只好新建一个索引,重新定义mapping之后再导入数据。数据量比较大,有1500W+数据。
之前有人写过一个导入数据的程序。代码如下:
首先创建一个工厂类ClientFactory.class
:
package com.dimonho.es.commons;
import java.lang.reflect.Constructor;
import org.apache.log4j.Logger;
import org.elasticsearch.client.Client;
import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.transport.InetSocketTransportAddress;
/**
* ES客户端工厂类
*/
/**
 * Factory for Elasticsearch {@link TransportClient} instances.
 *
 * <p>Two usage modes: a shared client configured via {@link #setAddress}/
 * {@link #setClusterName} and built by {@link #init()}, or ad-hoc single-node
 * clients created per call via {@link #createClient}.
 */
public class ClientFactory {
    private static final Logger LOGGER = Logger.getLogger(ClientFactory.class);
    /** Comma-separated ES addresses, each "host" or "host:port". */
    private String address;
    /** ES cluster name. */
    private String clusterName;
    /** Shared client built by {@link #init()}; null until then. */
    private TransportClient client;
    /** Transport port used when an address entry omits ":port". */
    public static final int DEFAULT_ES_PORT = 9300;

    /**
     * Builds the shared client from the configured address list.
     *
     * <p>FIX: the original looked up {@code TransportClient(Settings)} via
     * reflection and called {@code setAccessible(true)}. That constructor is
     * public (it is invoked directly in
     * {@link #createClient(Settings, String, Integer)}), so the reflection
     * indirection was dead weight and has been removed.
     *
     * @throws RuntimeException if a port in the address list is malformed
     */
    public void init() {
        LOGGER.info(String.format("初始化ES链接:%s(%s)", address, clusterName));
        try {
            client = new TransportClient(buildSettings(clusterName));
            for (String entry : address.split(",")) {
                String[] items = entry.split(":");
                if (items.length == 2) {
                    client.addTransportAddress(
                            new InetSocketTransportAddress(items[0], Integer.parseInt(items[1])));
                } else if (items.length == 1) {
                    client.addTransportAddress(
                            new InetSocketTransportAddress(items[0], DEFAULT_ES_PORT));
                }
            }
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Shared settings used by both {@link #init()} and
     * {@link #createClient(String, String, Integer)} — the original duplicated
     * this builder chain in two places.
     */
    private static Settings buildSettings(String clusterName) {
        return ImmutableSettings.settingsBuilder()
                .put("client.transport.sniff", false)
                .put("client.transport.ignore_cluster_name", true)
                .put("index.similarity.default.type", "default")
                .put("cluster.name", clusterName)
                .build();
    }

    /**
     * @return the shared client built by {@link #init()} (null before init)
     */
    public Client getTransportClient() {
        return client;
    }

    /** Creates a NEW single-node client; does not touch the shared one. */
    public Client getTransportClient(String clusterName, String ip, Integer port) {
        return createClient(clusterName, ip, port);
    }

    /**
     * Creates a single-node client with the default settings.
     *
     * @param clusterName cluster name to put into the settings
     * @param ip          node host
     * @param port        node transport port
     * @return a new client; the caller is responsible for closing it
     */
    public Client createClient(String clusterName, String ip, Integer port) {
        return createClient(buildSettings(clusterName), ip, port);
    }

    /** Creates a single-node client with explicit settings. Caller owns it. */
    public Client createClient(Settings settings, String ip, Integer port) {
        TransportClient newClient = new TransportClient(settings);
        newClient.addTransportAddress(new InetSocketTransportAddress(ip, port));
        return newClient;
    }

    public String getAddress() {
        return address;
    }

    public void setAddress(String address) {
        this.address = address;
    }

    public String getClusterName() {
        return clusterName;
    }

    public void setClusterName(String clusterName) {
        this.clusterName = clusterName;
    }
}
封装一些查询方法类ScanQuery.class
:
package com.dimonho.es.commons;
import java.util.Iterator;
import org.elasticsearch.action.search.SearchRequestBuilder;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.action.search.SearchType;
import org.elasticsearch.client.Client;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.index.query.FilterBuilder;
import org.elasticsearch.index.query.QueryBuilder;
import org.elasticsearch.search.SearchHit;
/**
 * Scan/scroll query helper: starts a SCAN search and exposes the hits as an
 * {@link Iterator} that transparently fetches successive scroll pages.
 */
public class ScanQuery {
    /**
     * Iterator over scroll pages.
     *
     * <p>NOTE: {@link #hasNext()} performs the network fetch when the current
     * page is exhausted, so it has side effects; this mirrors the original
     * design.
     */
    public class BulkInterator implements Iterator<SearchHit> {
        /** Hits of the current scroll page. */
        private SearchHit[] hits;
        /** Current scroll id; refreshed from every response. */
        private String scrollId;
        /** Number of hits in the current page. */
        private int count = 0;
        /** Cursor inside the current page. */
        private int index = 0;
        /** Total hits reported by the initial search. */
        private long total = 0;
        /** Number of hits handed out via {@link #next()}. */
        private long counter = 0;
        /** Scroll keep-alive passed on every fetch. */
        private TimeValue time;
        private Client client;

        public BulkInterator(Client client, String scrollId, long total, TimeValue time) {
            this.client = client;
            this.scrollId = scrollId;
            this.total = total;
            this.time = time;
        }

        @Override
        public boolean hasNext() {
            if (index < count) {
                return true;
            }
            SearchResponse searchResponse = client.prepareSearchScroll(scrollId)
                    .setScroll(time)
                    .execute().actionGet();
            // FIX: the original never refreshed the scroll id. Elasticsearch
            // may return a new id with each scroll response, and the latest
            // one must be used for the next round trip.
            scrollId = searchResponse.getScrollId();
            hits = searchResponse.getHits().hits();
            count = hits.length;
            index = 0;
            return count > 0;
        }

        @Override
        public SearchHit next() {
            // FIX: honor the Iterator contract — the original threw an
            // ArrayIndexOutOfBoundsException/NPE when called past the end.
            if (!hasNext()) {
                throw new java.util.NoSuchElementException();
            }
            counter++;
            return hits[index++];
        }

        @Override
        public void remove() {
            throw new UnsupportedOperationException();
        }

        public long getTotal() {
            return total;
        }

        public long getCounter() {
            return counter;
        }
    }

    public Iterator<SearchHit> query(Client client, String indexName) {
        return query(client, indexName, null, null, null);
    }

    public Iterator<SearchHit> query(Client client, String indexName, String[] types) {
        return query(client, indexName, types, null, null);
    }

    public Iterator<SearchHit> query(Client client, String indexName, String[] types,
            QueryBuilder queryBuilder, FilterBuilder filterBuilder) {
        return query(client, indexName, types, queryBuilder, filterBuilder,
                TimeValue.timeValueSeconds(1), TimeValue.timeValueMinutes(8));
    }

    /**
     * Starts a SCAN search (page size 100) and returns an iterator over all
     * matching hits.
     *
     * @param scanTime  keep-alive for the initial scan request
     * @param scollTime keep-alive used by the iterator for each scroll fetch
     */
    public Iterator<SearchHit> query(Client client, String indexName, String[] types,
            QueryBuilder queryBuilder, FilterBuilder filterBuilder,
            TimeValue scanTime, TimeValue scollTime) {
        SearchRequestBuilder builder = client.prepareSearch(indexName).setSize(100);
        if (queryBuilder != null) {
            builder.setQuery(queryBuilder);
        }
        if (filterBuilder != null) {
            builder.setPostFilter(filterBuilder);
        }
        if (types != null && types.length > 0) {
            builder.setTypes(types);
        }
        builder.setSearchType(SearchType.SCAN).setScroll(scanTime);
        SearchResponse searchResponse = builder.execute().actionGet();
        return new BulkInterator(client, searchResponse.getScrollId(),
                searchResponse.getHits().getTotalHits(), scollTime);
    }
}
批量插入处理工具类BulkUtils.class
package com.dimonho.es.utils;
import java.util.List;
import java.util.Map;
import org.elasticsearch.action.bulk.BulkProcessor;
import org.elasticsearch.action.bulk.BulkRequest;
import org.elasticsearch.action.bulk.BulkResponse;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.action.index.IndexRequestBuilder;
import org.elasticsearch.client.Client;
/**
* 批量插入处理类
*
*/
/**
 * Bulk-indexing helpers built on {@link BulkProcessor}.
 */
public class BulkUtils {
    /**
     * Listener base class that tracks how many requests have been flushed.
     * NOTE(review): {@code total} is updated from bulk-completion callbacks;
     * the increment is not atomic under concurrent flushes — acceptable for
     * the progress logging it is used for here.
     */
    public static abstract class AbstractBulkListener implements BulkProcessor.Listener {
        protected int total;

        @Override
        public void beforeBulk(long executionId, BulkRequest request) {
        }

        @Override
        public void afterBulk(long executionId, BulkRequest request, BulkResponse response) {
            total += request.requests().size();
        }

        @Override
        public void afterBulk(long executionId, BulkRequest request, Throwable failure) {
        }

        public int getTotal() {
            return total;
        }

        public void setTotal(int total) {
            this.total = total;
        }
    }

    /**
     * Builds a BulkProcessor that flushes every {@code requestSize} actions.
     *
     * <p>FIX: the original passed {@code requestSize} to
     * {@code setConcurrentRequests()}, so a call like
     * {@code buildBulk(client, 2000, ...)} permitted 2000 bulk requests in
     * flight at once while documents kept accumulating — the JVM memory
     * blow-up described in the surrounding write-up. The value is now used
     * as the per-bulk action count, with a single concurrent flush.
     */
    public static BulkProcessor buildBulk(Client client, int requestSize, AbstractBulkListener listener) {
        return BulkProcessor.builder(client, listener)
                .setBulkActions(requestSize)
                .setConcurrentRequests(1)
                .build();
    }

    /**
     * Queues one document on the processor. If the source map carries an
     * "_id" entry, it is removed from the document and used as the doc id.
     */
    public static void insert(Client client, BulkProcessor processor, String indexName,
            String type, Map<String, Object> source) {
        // FIX: the original set OpType.INDEX here only to overwrite it with
        // OpType.CREATE a few lines later; the redundant call is removed.
        IndexRequestBuilder indexRequestBuilder = client.prepareIndex(indexName, type);
        if (source != null && null != source.get("_id")) {
            String id = (String) source.remove("_id");
            indexRequestBuilder.setId(id);
        }
        indexRequestBuilder.setSource(source);
        // CREATE: fail (rather than overwrite) if the id already exists.
        indexRequestBuilder.setOpType(IndexRequest.OpType.CREATE);
        processor.add(indexRequestBuilder.request());
    }

    /** Queues every document in {@code sources} via the single-map overload. */
    public static void insert(Client client, BulkProcessor processor, String indexName,
            String type, List<Map<String, Object>> sources) {
        for (Map<String, Object> source : sources) {
            insert(client, processor, indexName, type, source);
        }
    }
}
数据复制测试类BulkUtilsTest.class
package com.dimonho.es.commons;
import java.util.Date;
import java.util.Map;
import org.elasticsearch.action.bulk.BulkItemResponse;
import org.elasticsearch.action.bulk.BulkProcessor;
import org.elasticsearch.action.bulk.BulkRequest;
import org.elasticsearch.action.bulk.BulkResponse;
import org.elasticsearch.client.Client;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import com.dimonho.es.utils.BulkUtils;
import com.dimonho.es.utils.BulkUtils.AbstractBulkListener;
import com.dimonho.es.commons.ScanQuery.BulkInterator;
/**
* 批量插入测试
*/
/**
 * Bulk copy test: scans the source index and re-indexes every hit into the
 * target index, duplicating the "jornal" field into "jornalName".
 */
public class BulkUtilsTest {
    // Target index (on the target cluster).
    private String targetIndex = "wos_source";
    // Source index (on the source cluster).
    private String sourceIndex = "wos_source";
    private Client sourceClient;
    private Client targetClient;

    @Before
    public void before() {
        ClientFactory clientFactory = new ClientFactory();
        sourceClient = clientFactory.getTransportClient("our_es", "192.168.1.75", 9300);
        targetClient = clientFactory.getTransportClient("our_es", "www.dimonho.com", 9300);
    }

    @After
    public void after() {
        //IndexUtils.deleteIndex(targetClient, targetIndex);
    }

    @Test
    public void testBulkInsert() {
        ScanQuery query = new ScanQuery();
        BulkInterator ite = (BulkInterator) query.query(sourceClient, sourceIndex,
                new String[]{"periodical"}, null, null);
        System.out.println("共查询到数据:" + ite.getTotal() + "条");
        BulkProcessor processor = BulkUtils.buildBulk(targetClient, 2000, new AbstractBulkListener() {
            @Override
            public void afterBulk(long executionId, BulkRequest request, BulkResponse response) {
                total += request.requests().size();
                String dateStr = (new Date()).toString();
                System.out.println(" |-已导入[" + total + "]条数据! - " + dateStr + " - index:["
                        + targetIndex + "] - type:[periodical]");
                if (response.hasFailures()) {
                    for (BulkItemResponse item : response.getItems()) {
                        if (item.isFailed()) {
                            System.out.println("失败信息:[periodical],id:[" + item.getId() + "]" + item.getFailureMessage());
                        }
                    }
                }
            }

            @Override
            public void afterBulk(long executionId, BulkRequest request, Throwable failure) {
                StringBuilder stringBuilder = new StringBuilder();
                stringBuilder.append("\r\n").append(">>>>>>>>>>>>>>>><<<<<<<<<<<<<<<");
                stringBuilder.append("\r\nbulk操作异常,异常信息: ").append("\r\n")
                        .append(failure.getMessage()).append("\r\n")
                        .append(failure.getLocalizedMessage()).append("\r\n")
                        .append(failure.getCause());
                stringBuilder.append("\r\n请求数:").append(request.numberOfActions());
                stringBuilder.append("\r\n").append(">>>>>>>>>>>>>>>><<<<<<<<<<<<<<<")
                        .append("\r\n");
                System.out.println(stringBuilder.toString());
                System.out.println("处理出错:" + failure);
            }
        });
        Map<String, Object> result = null;
        while (ite.hasNext()) {
            result = ite.next().getSource();
            // Copy "jornal" into the new non-analyzed "jornalName" field.
            // NOTE(review): spelling differs from MoveDataTest, which uses
            // "journal"/"journalName" — confirm which names the mapping uses.
            result.put("jornalName", result.get("jornal"));
            // FIX: the original passed sourceClient here, so the index request
            // was built against the source cluster's client while being queued
            // on the target cluster's processor. Use targetClient consistently.
            BulkUtils.insert(targetClient, processor, targetIndex, "periodical", result);
            if (ite.getCounter() % 2000 == 0L) {
                System.out.println(ite.getCounter());
            }
            if (ite.getCounter() == ite.getTotal()) {
                break;
            }
        }
        // FIX: close the processor so buffered requests are flushed before the
        // test returns; otherwise the tail of the data is silently dropped.
        processor.close();
    }
}
但是当用junit执行testBulkInsert()
方法进行数据复制的时候。。。
回到ES服务器去查看日志:
因为是远程复制,源和目标index不在同一个局域网,有时候网络不佳,客户端和服务端连接超时时会报“no node available”的错误。于是我把超时时间设置得长一点,从之前的20S改成了60S,依然报错,只不过这次是GC错误。我想应该是多线程一边不停地往BulkRequest中添加数据,一边execute得又比较慢,导致BulkRequest中积压的数据越来越多,最后JVM内存不够用了。
最终只好自己重新写了一个,代码如下:
package com.dimonho.es.commons;
import java.util.Map;
import org.elasticsearch.action.bulk.BulkItemResponse;
import org.elasticsearch.action.bulk.BulkRequestBuilder;
import org.elasticsearch.action.bulk.BulkResponse;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.action.index.IndexRequestBuilder;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.action.search.SearchType;
import org.elasticsearch.client.Client;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.index.query.QueryBuilder;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.SearchHit;
import org.junit.Before;
import org.junit.Test;
/**
* 数据迁移测试
*
*/
/**
 * Single-threaded scan/scroll migration: reads batches of 2000 documents from
 * the source index and bulk-inserts them into the target index, duplicating
 * the "journal" field into the new non-analyzed "journalName" field.
 * Remote-call failures are retried after a fixed delay so the job can run
 * unattended.
 */
public class MoveDataTest {
    private String sourceClusterName = "our_es";
    private String targetClusterName = "our_es";
    // Source index.
    private String sourceIndex = "wos_source";
    // Target index.
    private String targetIndex = "wos_source";
    private String sourceType = "periodical";
    private String targetType = "periodical";
    private Client sourceClient;
    private Client targetClient;
    /** Running count of failed bulk items across all batches. */
    private static int errcount = 0;
    /** Pause between retries after a failed remote call, in milliseconds. */
    private static final long RETRY_DELAY_MS = 10000;

    @Before
    public void before() {
        ClientFactory clientFactory = new ClientFactory();
        sourceClient = clientFactory.getTransportClient(sourceClusterName, "192.168.1.75", 9300);
        targetClient = clientFactory.getTransportClient(targetClusterName, "www.dimonho.com", 9300);
    }

    @Test
    public void bulkMove() {
        QueryBuilder queryBuilder = QueryBuilders.matchAllQuery();
        SearchResponse sourceDatas = sourceClient.prepareSearch(sourceIndex)
                .setTypes(sourceType)
                .setQuery(queryBuilder)
                .setSize(2000)
                .setScroll(TimeValue.timeValueSeconds(2))
                .setSearchType(SearchType.SCAN)
                .execute()
                .actionGet();
        long sumCount = sourceDatas.getHits().getTotalHits();
        System.out.println("========共查询出" + sumCount + "条数据===========");
        String scrollId = sourceDatas.getScrollId();
        System.out.println("scrollId:" + scrollId);
        while (true) {
            SearchResponse scrollResponse = sourceClient.prepareSearchScroll(scrollId)
                    .setScroll(TimeValue.timeValueMinutes(8)).execute().actionGet();
            // FIX: refresh the scroll id on every round trip — Elasticsearch
            // may hand back a new id, and the latest one must be reused.
            scrollId = scrollResponse.getScrollId();
            SearchHit[] hits = scrollResponse.getHits().hits();
            int count = hits.length;
            if (count == 0) {
                break;
            }
            sumCount -= count;
            BulkRequestBuilder bulkRequest = getBulkRequestBuilder();
            for (SearchHit hit : hits) {
                Map<String, Object> source = hit.getSource();
                // Copy "journal" into the new non-analyzed "journalName" field.
                source.put("journalName", hit.getSource().get("journal"));
                IndexRequestBuilder indexRequesBuider = getIindexRequesBuider();
                // If the document carries an "_id" entry, strip it out of the
                // body and use it as the target document id.
                if (source != null && null != source.get("_id")) {
                    String id = (String) source.remove("_id");
                    indexRequesBuider.setId(id);
                }
                // CREATE: fail (rather than overwrite) on a duplicate id.
                indexRequesBuider.setOpType(IndexRequest.OpType.CREATE);
                bulkRequest.add(indexRequesBuider.setSource(source));
            }
            insert(bulkRequest);
            System.out.println("还剩" + sumCount + "条数据待导入。。。。。。。。。。。。。。。");
        }
    }

    /**
     * Executes one bulk request, retrying after a delay on transport failures.
     *
     * <p>FIX: the original retried by calling itself recursively, which can
     * overflow the stack during a long network outage; the retry is now an
     * iterative loop.
     */
    private void insert(BulkRequestBuilder bulkRequest) {
        // Nothing buffered — nothing to execute.
        if (bulkRequest.request().requests().size() == 0) {
            return;
        }
        while (true) {
            try {
                // BulkResponse carries the per-item failure information.
                BulkResponse bulkResponse = bulkRequest.execute().actionGet();
                if (bulkResponse.hasFailures()) {
                    int succes = 0;
                    for (BulkItemResponse item : bulkResponse.getItems()) {
                        if (item.isFailed()) {
                            errcount++;
                        } else {
                            succes++;
                        }
                    }
                    System.out.println();
                    System.out.println("失败" + errcount + "条\n" + bulkRequest.request().requests().size() + "》》成功:" + succes + "条");
                }
                return;
            } catch (Exception e) {
                e.printStackTrace();
                System.out.println("连接目标服务器失败!重新连接。。。。");
                sleepBeforeRetry();
            }
        }
    }

    /**
     * Sleeps RETRY_DELAY_MS between retries.
     *
     * <p>FIX: the original swallowed InterruptedException and kept retrying;
     * now the interrupt flag is restored and the job aborts, per the standard
     * interruption contract.
     */
    private static void sleepBeforeRetry() {
        try {
            Thread.sleep(RETRY_DELAY_MS);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            throw new RuntimeException("interrupted while waiting to retry", e);
        }
    }

    /** Obtains an index-request builder for the target, retrying on failure. */
    public IndexRequestBuilder getIindexRequesBuider() {
        while (true) {
            try {
                return targetClient.prepareIndex(targetIndex, targetType);
            } catch (Exception e) {
                e.printStackTrace();
                System.out.println("获取IindexRequesBuider实例失败,重新获取。。。。。");
                sleepBeforeRetry();
            }
        }
    }

    /** Obtains a bulk-request builder for the target, retrying on failure. */
    public BulkRequestBuilder getBulkRequestBuilder() {
        while (true) {
            try {
                return targetClient.prepareBulk();
            } catch (Exception e) {
                e.printStackTrace();
                System.out.println("获取BulkRequestBuilder实例失败,重新获取。。。。。");
                sleepBeforeRetry();
            }
        }
    }
}
这样虽然比多线程慢了一点,不过如果连接超时它会自动再去尝试连接,可以无人值守放在那运行一晚上了。