Demo script: writing Elasticsearch data into Hive


    Notes: the design approach behind this script and how to use it.


    How it works: read the data from Elasticsearch -> convert it to CSV -> load the CSV into HDFS (the directory backing a Hive staging table) -> load it into the target table via SQL.
    When reading data with this script, it is best to query a single index. Matching many indices with the * wildcard risks very slow throughput and excessive memory use on the machine running the script; a lower-memory streaming alternative is sketched right after this note.
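
If the result set is large, one lower-memory alternative (a sketch, not part of the original script) is to stream hits with the elasticsearch library's helpers.scan generator and write each row to the CSV as it arrives, instead of accumulating every page in a list. The index name, query body, and key list are assumed to match the ones used in the script below.

import csv
from elasticsearch import helpers

def dump_to_csv_streaming(es, index, body, keys, out_path='emp_data.csv'):
    # helpers.scan wraps the scroll API and yields hits one at a time,
    # so only a single page of results is held in memory.
    with open(out_path, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        for hit in helpers.scan(es, query=body, index=index, scroll='5m', size=1000):
            src = hit['_source']
            writer.writerow([src.get(key, 'null') for key in keys])

Called as dump_to_csv_streaming(conn2es(), 'index_name', body, keys_data), it could replace the page-accumulation loop inside query().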


import csv
from pyhive import hive
from hdfs import InsecureClient
from elasticsearch import Elasticsearch

def conn2es():
    # cluster address and credentials are placeholders; fill in the real values
    es = Elasticsearch(["xxx.xxx.xxx.xxx"], port=9200, http_auth=('*', '*'))
    return es

# Query ES with the scroll API and collect the selected fields into emp_data.csv
def query():
    es = conn2es()
    body = {
        "query": {
            "range": {
                "logoutTime": {
                    "gte": "2021-01-11T00:00:00.000+08:00",
                    "lte": "2021-01-12T00:59:59.999+08:00"
                }
            }
        }
    }
    query = es.search(index="index_name", doc_type='type_name', body=body, scroll='5m', size=1000)
    res = query['hits']['hits']  # first page of results
    total = query['hits']['total']  # total hit count (an int on ES 6.x; a dict with a 'value' key on 7.x)
    scroll_id = query['_scroll_id']  # scroll cursor used to page through the remaining results
    for i in range(0, int(total / 1000) + 1):
        # the scroll parameter must be passed, otherwise the call raises an error
        query_scroll = es.scroll(scroll_id=scroll_id, scroll='5m')['hits']['hits']
        res += query_scroll
    emp_data = []
    # mapping keys to pull from each hit, in CSV column order
    keys_data = ['onlineDetailUuid', 'userId', 'operatorsConfigUuid', 'operatorsUserId',
                 'operatorsName', 'gatewayStrategy', 'userMac', 'usergroupId', 'userIpv4',
                 'userIpv6', 'userIpv6LocalLink', 'ipv6Num', 'userVlan', 'ceVlan', 'peVlan',
                 'domainName', 'netMask', 'gateway', 'ipv6Gateway', 'dns', 'nasIp', 'nasIpv6',
                 'nasPort', 'nasCommunity', 'nasName', 'nasLocation', 'nasType', 'nasTypeNum',
                 'serviceId', 'policyId', 'accountId', 'wpNasIp', 'wpNasPort', 'proxyName',
                 'accessCtrl', 'loginTime', 'logoutTime', 'onlineSec', 'terminateCause',
                 'tunnelClient', 'tunnelServer', 'accessType', 'apMac', 'ssid', 'isRoaming',
                 'areaName', 'acctSessionId', 'totalTraffic', 'userTemplateUuid', 'createTime',
                 'lastUpdateTime', 'createHost', 'lastUpdateHost', 'packageName', 'serviceSuffix',
                 'accountingRule', 'timesegmentId', 'suConnVersion', 'suVersion', 'accessEquipmentType',
                 'terminalOperationSystem', 'terminalTypeDes', 'visitorAuthType', 'isOperatorsAuth',
                 'hasReset', 'operatorsSessionId', 'passThroughType', 'packetUuid', 'accessDeviceIp',
                 'accessDeviceType', 'accessDevicePort', 'accessDeviceDescription', 'accessDeviceInterface',
                 'operatorFirstLog']

    for hit in res:
        data = hit['_source']
        csv_data = []
        for key_data in keys_data:
            if key_data in data:
                csv_data.append(data[key_data])
            else:
                csv_data.append('null')  # placeholder so every row keeps the same column count
        emp_data.append(csv_data)
    # newline='' prevents the csv module from writing blank rows on Windows
    with open('emp_data.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerows(emp_data)

def hdfsWriteFile():
    conn = hive.Connection(host='xxx.xxx.xxx.xxx', port=10000, database='test')
    cursor = conn.cursor()
    hdfs_client = InsecureClient('http://ns1:50070')
    tmp_dir = '/user/hive/warehouse/test.db/ods__ck__sam_es_di_new_tmp/'
    emp_files = hdfs_client.list(hdfs_path=tmp_dir)
    # if the staging directory already contains files, delete them first
    for emp_file in emp_files:
        hdfs_client.delete(hdfs_path=tmp_dir + emp_file)

    remote_path = hdfs_client.upload(hdfs_path=tmp_dir,
                                     local_path='emp_data.csv', n_threads=2, chunk_size=2 ** 16)
    if remote_path is not None:
        cursor.execute("insert overwrite table test.ods__ck__sam_es_di_new partition(pt='20210204') select * from test.ods__ck__sam_es_di_new_tmp")
    else:
        cursor.close()
        conn.close()
        return 'file upload failed'
    cursor.close()
    conn.close()
    return remote_path


def main():
    query()
    hdfs_path = hdfsWriteFile()
    print(hdfs_path)

if __name__ == '__main__':
    main()
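
For the final SQL step to work, the script assumes two tables already exist in Hive: a comma-delimited staging table whose location is the HDFS directory the CSV is uploaded to, and a target table with the same columns plus a pt partition. The DDL below is only a sketch of that assumption, issued through the same pyhive connection; just the first two columns are spelled out, and in practice the tables need one STRING column per entry in keys_data, in the same order.

from pyhive import hive

conn = hive.Connection(host='xxx.xxx.xxx.xxx', port=10000, database='test')
cursor = conn.cursor()

# Staging table: plain CSV rows, backed by the directory hdfsWriteFile() uploads to.
cursor.execute("""
    CREATE TABLE IF NOT EXISTS test.ods__ck__sam_es_di_new_tmp (
        onlineDetailUuid STRING,
        userId STRING
    )
    ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
    STORED AS TEXTFILE
    LOCATION '/user/hive/warehouse/test.db/ods__ck__sam_es_di_new_tmp'
""")

# Target table: same columns, partitioned by pt, matching the
# "insert overwrite ... partition(pt='...')" statement in hdfsWriteFile().
cursor.execute("""
    CREATE TABLE IF NOT EXISTS test.ods__ck__sam_es_di_new (
        onlineDetailUuid STRING,
        userId STRING
    )
    PARTITIONED BY (pt STRING)
""")

cursor.close()
conn.close()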
