Description: the design approach and usage of this script.
How it works: read the data from Elasticsearch -> convert it to CSV -> load the CSV into HDFS (the directory backing a Hive temp table) -> load it into the target table via SQL.
When reading data with this script, it is best to query a single index. Matching many indices with a wildcard (*) risks very slow throughput and excessive memory use on the machine running the script.
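The final SQL load assumes a comma-delimited, text-format Hive temp table whose storage location is the HDFS directory the script uploads into. Below is a minimal sketch of such a DDL, run through pyhive and assuming the table and database names used later in the script (the column list is shortened for illustration; the real table needs one STRING column per entry in keys_data):

from pyhive import hive

conn = hive.Connection(host='xxx.xxx.xxx.xxx', port=10000, database='test')
cursor = conn.cursor()
# illustrative DDL: only the first two CSV columns are shown
cursor.execute("""
    CREATE TABLE IF NOT EXISTS test.ods__ck__sam_es_di_new_tmp (
        onlineDetailUuid STRING,
        userId STRING
    )
    ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
    STORED AS TEXTFILE
""")
cursor.close()
conn.close()

Note that csv.writer quotes any field that contains a comma, and a plain DELIMITED table will not strip those quotes; if the source fields can contain commas, Hive's OpenCSVSerde is the safer row format.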
import csv
from pyhive import hive
from hdfs import InsecureClient
from elasticsearch import Elasticsearch
def conn2es():
    # connect to the ES cluster; the host and credentials are placeholders
    es = Elasticsearch(["xxx.xxx.xxx.xxx"], port=9200, http_auth=('*', '*'))
    return es
# query ES and page through all matching documents with the scroll API
def query():
    es = conn2es()
    body = {
        "query": {
            "range": {
                "logoutTime": {
                    "gte": "2021-01-11T00:00:00.000+08:00",
                    "lte": "2021-01-12T00:59:59.999+08:00"
                }
            }
        }
    }
    page = es.search(index="index_name", doc_type='type_name', body=body, scroll='5m', size=1000)
    res = page['hits']['hits']        # first page of results
    total = page['hits']['total']     # total hit count (an int on ES 6.x and earlier)
    scroll_id = page['_scroll_id']    # scroll cursor used to fetch the remaining pages
    for i in range(0, int(total / 1000) + 1):
        # the scroll parameter must be passed again, otherwise the call raises an error
        resp = es.scroll(scroll_id=scroll_id, scroll='5m')
        scroll_id = resp['_scroll_id']  # the scroll id can change between calls
        res += resp['hits']['hits']
    emp_data = []
    # mapping keys to extract from each hit; this also fixes the CSV column order
    keys_data = ['onlineDetailUuid', 'userId', 'operatorsConfigUuid', 'operatorsUserId',
                 'operatorsName', 'gatewayStrategy', 'userMac', 'usergroupId', 'userIpv4',
                 'userIpv6', 'userIpv6LocalLink', 'ipv6Num', 'userVlan', 'ceVlan', 'peVlan',
                 'domainName', 'netMask', 'gateway', 'ipv6Gateway', 'dns', 'nasIp', 'nasIpv6',
                 'nasPort', 'nasCommunity', 'nasName', 'nasLocation', 'nasType', 'nasTypeNum',
                 'serviceId', 'policyId', 'accountId', 'wpNasIp', 'wpNasPort', 'proxyName',
                 'accessCtrl', 'loginTime', 'logoutTime', 'onlineSec', 'terminateCause',
                 'tunnelClient', 'tunnelServer', 'accessType', 'apMac', 'ssid', 'isRoaming',
                 'areaName', 'acctSessionId', 'totalTraffic', 'userTemplateUuid', 'createTime',
                 'lastUpdateTime', 'createHost', 'lastUpdateHost', 'packageName', 'serviceSuffix',
                 'accountingRule', 'timesegmentId', 'suConnVersion', 'suVersion', 'accessEquipmentType',
                 'terminalOperationSystem', 'terminalTypeDes', 'visitorAuthType', 'isOperatorsAuth',
                 'hasReset', 'operatorsSessionId', 'passThroughType', 'packetUuid', 'accessDeviceIp',
                 'accessDeviceType', 'accessDevicePort', 'accessDeviceDescription', 'accessDeviceInterface',
                 'operatorFirstLog']
    for hit in res:
        data = hit['_source']
        csv_data = []
        for key_data in keys_data:
            # fill missing fields with the literal 'null' so every row has the same width
            if key_data in data:
                csv_data.append(data[key_data])
            else:
                csv_data.append('null')
        emp_data.append(csv_data)
    # newline='' keeps csv.writer from inserting blank rows on some platforms
    with open('emp_data.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerows(emp_data)
def hdfsWriteFile():
    conn = hive.Connection(host='xxx.xxx.xxx.xxx', port=10000, database='test')
    cursor = conn.cursor()
    hdfs_client = InsecureClient('http://ns1:50070')
    tmp_dir = '/user/hive/warehouse/test.db/ods__ck__sam_es_di_new_tmp/'
    emp_files = hdfs_client.list(hdfs_path=tmp_dir)
    # if the temp table's directory already contains files, delete them first
    for emp_file in emp_files:
        hdfs_client.delete(hdfs_path=tmp_dir + emp_file)
    # upload the CSV into the temp table's directory
    remote_path = hdfs_client.upload(hdfs_path=tmp_dir, local_path='emp_data.csv',
                                     n_threads=2, chunk_size=2 ** 16)
    if remote_path is None:
        cursor.close()
        conn.close()
        return 'file upload failed'
    # load the temp table's contents into the partitioned target table
    cursor.execute("insert overwrite table test.ods__ck__sam_es_di_new partition(pt='20210204') "
                   "select * from test.ods__ck__sam_es_di_new_tmp")
    cursor.close()
    conn.close()
    return remote_path
def main():
    query()
    hdfs_path = hdfsWriteFile()
    print(hdfs_path)

if __name__ == '__main__':
    main()
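The query window and the target partition are hard-coded above (2021-01-11/12 and pt='20210204'). For scheduled runs, both would normally be derived from a single date argument. A minimal sketch, where build_range_body is a hypothetical helper and not part of the original script:

import argparse
from datetime import datetime, timedelta

def build_range_body(day):
    # hypothetical helper: ES range query covering the single day `day`
    start = datetime(day.year, day.month, day.day)
    end = start + timedelta(days=1)
    fmt = '%Y-%m-%dT%H:%M:%S.000+08:00'
    return {"query": {"range": {"logoutTime": {
        "gte": start.strftime(fmt),
        "lte": end.strftime(fmt)}}}}

parser = argparse.ArgumentParser()
parser.add_argument('--date', required=True, help='day to export, e.g. 2021-01-11')
args = parser.parse_args()
day = datetime.strptime(args.date, '%Y-%m-%d').date()
body = build_range_body(day)   # would replace the hard-coded body in query()
pt = day.strftime('%Y%m%d')    # would replace the hard-coded value in partition(pt=...)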