Importing MySQL data (full load) into Elasticsearch

Approach 1: Logstash

The first idea was to implement this with Logstash from the ELK stack. The Elasticsearch version used here is 7.6.2. The setup is as follows.

1. sync.conf

input {
    jdbc {
      # time zone
      jdbc_default_timezone => "Asia/Shanghai"
      # MySQL connection string; mydatabase is the database name
      jdbc_connection_string => "jdbc:mysql://mysql_ip:3306/mydatabase"
      # user name and password
      jdbc_user => "user"
      jdbc_password => "pass"
      # JDBC driver jar
      jdbc_driver_library => "/home/deployer/zhanggf/test/mysql/mysql-connector-java-5.1.47.jar"
      # driver class name
      jdbc_driver_class => "com.mysql.jdbc.Driver"
      jdbc_paging_enabled => "true"
      jdbc_page_size => "50000"
      #last_run_metadata_path => "/Users/hzllb/Desktop/devtool/logstash-7.3.0/bin/mysql/last_value_meta"
      # path of the SQL file to execute
      statement_filepath => "/home/deployer/zhanggf/test/mysql/sync.sql"
      # polling schedule (cron-like); fields from left to right: minute, hour, day of month, month, day of week.
      # All "*" means the statement is run every minute.
      schedule => "* * * * *"
    }
}





output {
    elasticsearch {
      # ES node addresses and ports
      hosts => ["es1:9200","es2:9200","es3:9200"]
      # index name
      index => "sys_log_202005"
      # document_type is deprecated in ES 7.x; "_doc" is the only accepted value
      document_type => "_doc"
      # use the table's id column as the document _id, so re-running the job overwrites instead of duplicating
      document_id => "%{id}"
    }
    stdout {
      # print each event as JSON, useful for debugging
      codec => json_lines
    }
}


2. Business-logic SQL: sync.sql

select id,face_id,staff_num,snap_shot_id,snap_shot_time,mac,similar_percent,created_time,updated_time,open_flag,temperature,masks_status,temperature_status,device_name,access_type,check_type,device_num,domain 
from sys_identify_log ;

3. Start Logstash with the config file
bin/logstash -f sync.conf        (on Windows: bin\logstash.bat -f sync.conf)

This ran into a problem: the MySQL database and the ES cluster sit on networks that cannot reach each other, so the Logstash sync failed. The next attempt is to move the data from MySQL to ES with a program.

Approach 2: a Java program reads the MySQL data and writes it into ES

The code is as follows.

1. Define the DBHelper class

public class DBHelper {
	// JDBC connection settings; replace host, port, database name and credentials with your own
	public static final String url = "jdbc:mysql://ip:3600/mydatabase?useSSL=false";
	public static final String name = "com.mysql.jdbc.Driver";
	public static final String user = "root";
	public static final String password = "123456";

	public static Connection conn = null;
	public static Connection getConn() {
		try {
			Class.forName(name);
			conn = DriverManager.getConnection(url, user, password);
		} catch (Exception e) {
			e.printStackTrace();
		}
		return conn;
	}
}
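
Before kicking off the full import, it is worth confirming that the settings in DBHelper actually connect. A minimal check along these lines (not part of the original post; DBHelperTest is a made-up name, and it assumes the sys_identify_log_back table used below):

import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.Statement;

public class DBHelperTest {
	public static void main(String[] args) throws Exception {
		// count the rows the import will have to move; fails fast if the URL or credentials are wrong
		try (Connection conn = DBHelper.getConn();
				Statement st = conn.createStatement();
				ResultSet rs = st.executeQuery("SELECT COUNT(*) FROM sys_identify_log_back")) {
			if (rs.next()) {
				System.out.println("rows to import: " + rs.getLong(1));
			}
		}
	}
}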

 

2. BulkProcessImpl: read the MySQL data and bulk-write it into ES

public class BulkProcessImpl {
	
	private static final Logger logger = LogManager.getLogger(BulkProcessImpl.class);

	public static void main(String[] args) {
		try {
//			test();
			long startTime = System.currentTimeMillis();
			String tableName = "sys_identify_log_back";
			createIndex(tableName);
			BulkProcessImpl bulk = new BulkProcessImpl();
			bulk.writeMysqlDataToES(tableName);

			logger.info(" use time: " + (System.currentTimeMillis() - startTime) / 1000 + "s");
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

//	create the index
	public static void createIndex(String indexName) throws IOException {
		RestHighLevelClient client = new RestHighLevelClient(RestClient.builder(new HttpHost("es1", 9200, "http")));// initialize the client
		// ES index names must be lower case, so convert the table name
		CreateIndexRequest requestIndex = new CreateIndexRequest(indexName.toLowerCase());// build the create-index request
		// Note: for a large bulk load, setting index.number_of_replicas to 0 and index.refresh_interval to -1
		// helps indexing throughput considerably. The code below keeps 2 replicas, so drop it to 0 for the full benefit,
		// and restore refresh_interval (and replicas) once the load is done; a sketch follows this method.
		requestIndex.settings(Settings.builder().put("index.number_of_shards", 3)
				.put("index.number_of_replicas", 2)
				.put("index.refresh_interval", "-1"));
		CreateIndexResponse createIndexResponse = client.indices().create(requestIndex, RequestOptions.DEFAULT);
		logger.info("isAcknowledged:" + createIndexResponse.isAcknowledged());
		logger.info("isShardsAcknowledged:" + createIndexResponse.isShardsAcknowledged());

		client.close();

	}
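
	// A minimal sketch, not in the original post: after the bulk load finishes, restore the settings that were
	// relaxed for indexing speed. UpdateSettingsRequest lives in org.elasticsearch.action.admin.indices.settings.put
	// and AcknowledgedResponse in org.elasticsearch.action.support.master; both work with the high-level REST client.
	public static void restoreIndexSettings(String indexName) throws IOException {
		RestHighLevelClient client = new RestHighLevelClient(RestClient.builder(new HttpHost("es1", 9200, "http")));
		UpdateSettingsRequest request = new UpdateSettingsRequest(indexName.toLowerCase());
		// re-enable periodic refresh so the imported documents become searchable; adjust replicas as needed
		request.settings(Settings.builder()
				.put("index.refresh_interval", "1s")
				.put("index.number_of_replicas", 2));
		AcknowledgedResponse response = client.indices().putSettings(request, RequestOptions.DEFAULT);
		logger.info("index settings restored, acknowledged: " + response.isAcknowledged());
		client.close();
	}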

	/**
	 * Query the MySQL data, convert each row into the map format ES expects,
	 * and write it to ES in batches through the BulkProcessor.
	 */
	public void writeMysqlDataToES(String tableName1) {

		String tableNameSql = tableName1; // source table in MySQL
		String tableName = tableName1;    // target index name in ES

		RestHighLevelClient client = new RestHighLevelClient(RestClient.builder(
				new HttpHost("es1", 9200, "http"),
				new HttpHost("es2", 9200, "http"),
				new HttpHost("es3", 9200, "http")
		));
		BulkProcessor bulkProcessor = getBulkProcessor(client);

		Connection conn = null;
		PreparedStatement ps = null;
		ResultSet rs = null;

		try {
			conn = DBHelper.getConn();
			logger.info("Start handle data :" + tableNameSql);

			String sql = "SELECT * FROM " + tableNameSql + " WHERE 1=1";

			// TYPE_FORWARD_ONLY + CONCUR_READ_ONLY with fetchSize(Integer.MIN_VALUE) makes the MySQL driver
			// stream rows instead of loading the whole result set into memory
			ps = conn.prepareStatement(sql, ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY);
			ps.setFetchSize(Integer.MIN_VALUE);
			rs = ps.executeQuery();

			ResultSetMetaData colData = rs.getMetaData();

			ArrayList<HashMap<String, String>> dataList = new ArrayList<HashMap<String, String>>();

			// The BulkProcessor only accepts a few source formats; its API supports map key/value pairs,
			// so each row is converted into a HashMap here.
			HashMap<String, String> map = null;
			int count = 0;
			String c = null;
			String v = null;
			while (rs.next()) {
				count++;
				map = new HashMap<String, String>(100);
				for (int i = 1; i <= colData.getColumnCount(); i++) {
					c = colData.getColumnName(i);
					v = rs.getString(c);
					map.put(c, v);
				}
				dataList.add(map);
				// submit a batch every 50,000 rows; whatever does not fill a batch is submitted after the loop
				if (count % 50000 == 0) {
					logger.info("Mysql handle data number : " + count);
					// hand the batch over to the bulkProcessor
					for (HashMap<String, String> hashMap2 : dataList) {
						bulkProcessor.add(new IndexRequest(tableName.toLowerCase(), "_doc", hashMap2.get("id")).source(hashMap2));
					}
					// clear the buffered rows after each submission
					map.clear();
					dataList.clear();
				}
			}
			// submit whatever is left over from the last incomplete batch
			for (HashMap<String, String> hashMap2 : dataList) {
				bulkProcessor.add(new IndexRequest(tableName.toLowerCase(), "_doc", hashMap2.get("id")).source(hashMap2));
			}

			logger.info("-------------------------- Finally insert number total : " + count);
			// Flush the remaining buffered requests to ES. The documents are not searchable right away:
			// the index was created with refresh_interval = -1, so re-enable (or trigger) a refresh afterwards.
			bulkProcessor.flush();
			
		} catch (Exception e) {
			logger.error(e.getMessage());
		} finally {
			try {
				if (rs != null) rs.close();
				if (ps != null) ps.close();
				if (conn != null) conn.close();
				// wait for in-flight bulk requests to complete before closing the client
				boolean terminatedFlag = bulkProcessor.awaitClose(150L, TimeUnit.SECONDS);
				client.close();
				logger.info(terminatedFlag);
			} catch (Exception e) {
				logger.error(e.getMessage());
			}
		}
	}
//	create the BulkProcessor and initialize it
	private static BulkProcessor getBulkProcessor(RestHighLevelClient client) {

		BulkProcessor bulkProcessor = null;
		try {

			BulkProcessor.Listener listener = new BulkProcessor.Listener() {
				@Override
				public void beforeBulk(long executionId, BulkRequest request) {
					logger.info("Try to insert data number : " + request.numberOfActions());
				}

				@Override
				public void afterBulk(long executionId, BulkRequest request, BulkResponse response) {
					logger.info("************** Success insert data number : " + request.numberOfActions() + " , id: "
							+ executionId);
				}

				@Override
				public void afterBulk(long executionId, BulkRequest request, Throwable failure) {
					logger.error("Bulk is unsuccess : " + failure + ", executionId: " + executionId);
				}
			};

			BiConsumer<BulkRequest, ActionListener<BulkResponse>> bulkConsumer = (request, bulkListener) -> client
					.bulkAsync(request, RequestOptions.DEFAULT, bulkListener);
			BulkProcessor.Builder builder = BulkProcessor.builder(bulkConsumer, listener);
			builder.setBulkActions(10000);
			builder.setBulkSize(new ByteSizeValue(300L, ByteSizeUnit.MB));
			builder.setConcurrentRequests(10);
			builder.setFlushInterval(TimeValue.timeValueSeconds(100L));
			builder.setBackoffPolicy(BackoffPolicy.constantBackoff(TimeValue.timeValueSeconds(1L), 3));
			// Caveat: this felt like a pitfall. The official sample does not show this step separately, and I carelessly
			// missed it at first; only while debugging did I notice that the properties set on the builder above had not taken effect.
			bulkProcessor = builder.build();

		} catch (Exception e) {
			e.printStackTrace();
			try {
				if (bulkProcessor != null) {
					bulkProcessor.awaitClose(100L, TimeUnit.SECONDS);
				}
				client.close();
			} catch (Exception e1) {
				logger.error(e1.getMessage());
			}

		}
		return bulkProcessor;
	}
}
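
Once the job finishes, it is worth checking that the number of documents in ES matches the row count in MySQL. A small sketch for that (not part of the original code; the class name ImportCheck is made up, and it assumes the same index name and ES hosts used above):

import org.apache.http.HttpHost;
import org.elasticsearch.client.RequestOptions;
import org.elasticsearch.client.RestClient;
import org.elasticsearch.client.RestHighLevelClient;
import org.elasticsearch.client.core.CountRequest;
import org.elasticsearch.client.core.CountResponse;

public class ImportCheck {
	public static void main(String[] args) throws Exception {
		RestHighLevelClient client = new RestHighLevelClient(RestClient.builder(
				new HttpHost("es1", 9200, "http")));
		// count all documents in the freshly loaded index; with refresh disabled the count only
		// reflects what has already been refreshed, so run this after restoring refresh_interval
		CountResponse response = client.count(new CountRequest("sys_identify_log_back"), RequestOptions.DEFAULT);
		System.out.println("documents in index: " + response.getCount());
		client.close();
	}
}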
