Description: the design approach and usage of this script.
How it works: read the data from Elasticsearch -> convert it to CSV -> load the CSV into HDFS (the directory backing a Hive temp table) -> load it into the target table via SQL.
When reading data with this script, it is best to query a single index. Matching many indices with a wildcard (*) risks very slow throughput and excessive memory use on the machine running the script.
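The final SQL load assumes a comma-delimited, text-format Hive temp table whose storage location is the HDFS directory the script uploads into. Below is a minimal sketch of such a DDL, run through pyhive and assuming the table and database names used later in the script (the column list is shortened for illustration; the real table needs one STRING column per entry in keys_data):

from pyhive import hive

conn = hive.Connection(host='xxx.xxx.xxx.xxx', port=10000, database='test')
cursor = conn.cursor()
# illustrative DDL: only the first two CSV columns are shown
cursor.execute("""
    CREATE TABLE IF NOT EXISTS test.ods__ck__sam_es_di_new_tmp (
        onlineDetailUuid STRING,
        userId STRING
    )
    ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
    STORED AS TEXTFILE
""")
cursor.close()
conn.close()

Note that csv.writer quotes any field that contains a comma, and a plain DELIMITED table will not strip those quotes; if the source fields can contain commas, Hive's OpenCSVSerde is the safer row format.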
import csv
from pyhive import hive
from hdfs import InsecureClient
from elasticsearch import Elasticsearch
def conn2es():
    # connect to the ES cluster; the host and credentials are placeholders
    es = Elasticsearch(["xxx.xxx.xxx.xxx"], port=9200, http_auth=('*', '*'))
    return es
# query ES and page through all matching documents with the scroll API
def query():
    es = conn2es()
    body = {
        "query": {
            "range": {
                "logoutTime": {
                    "gte": "2021-01-11T00:00:00.000+08:00",
                    "lte": "2021-01-12T00:59:59.999+08:00"
                }
            }
        }
    }
    page = es.search(index="index_name", doc_type='type_name', body=body, scroll='5m', size=1000)
    res = page['hits']['hits']        # first page of results
    total = page['hits']['total']     # total hit count (an int on ES 6.x and earlier)
    scroll_id = page['_scroll_id']    # scroll cursor used to fetch the remaining pages
    for i in range(0, int(total / 1000) + 1):
        # the scroll parameter must be passed again, otherwise the call raises an error
        resp = es.scroll(scroll_id=scroll_id, scroll='5m')
        scroll_id = resp['_scroll_id']  # the scroll id can change between calls
        res += resp['hits']['hits']
    emp_data = []
    # mapping keys to extract from each hit; this also fixes the CSV column order
    keys_data = ['onlineDetailUuid', 'userId', 'operatorsConfigUuid', 'operatorsUserId',
                 'operatorsName', 'gatewayStrategy', 'userMac', 'usergroupId', 'userIpv4',
                 'userIpv6', 'userIpv6LocalLink', 'ipv6Num', 'userVlan', 'ceVlan', 'peVlan',
                 'domainName', 'netMask', 'gateway', 'ipv6Gateway', 'dns', 'nasIp', 'nasIpv6',
                 'nasPort', 'nasCommunity', 'nasName', 'nasLocation', 'nasType', 'nasTypeNum',
                 'serviceId', 'policyId', 'accountId', 'wpNasIp', 'wpNasPort', 'proxyName',
                 'accessCtrl', 'loginTime', 'logoutTime', 'onlineSec', 'terminateCause',
                 'tunnelClient', 'tunnelServer', 'accessType', 'apMac', 'ssid', 'isRoaming',
                 'areaName', 'acctSessionId', 'totalTraffic', 'userTemplateUuid', 'createTime',
                 'lastUpdateTime', 'createHost', 'lastUpdateHost', 'packageName', 'serviceSuffix',
                 'accountingRule', 'timesegmentId', 'suConnVersion', 'suVersion', 'accessEquipmentType',
                 'terminalOperationSystem', 'terminalTypeDes', 'visitorAuthType', 'isOperatorsAuth',
                 'hasReset', 'operatorsSessionId', 'passThroughType', 'packetUuid', 'accessDeviceIp',
                 'accessDeviceType', 'accessDevicePort', 'accessDeviceDescription', 'accessDeviceInterface',
                 'operatorFirstLog']
    for hit in res:
        data = hit['_source']
        csv_data = []
        for key_data in keys_data:
            # fill missing fields with the literal 'null' so every row has the same width
            if key_data in data:
                csv_data.append(data[key_data])
            else:
                csv_data.append('null')
        emp_data.append(csv_data)
    # newline='' keeps csv.writer from inserting blank rows on some platforms
    with open('emp_data.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerows(emp_data)
def hdfsWriteFile():
    conn = hive.Connection(host='xxx.xxx.xxx.xxx', port=10000, database='test')
    cursor = conn.cursor()
    hdfs_client = InsecureClient('http://ns1:50070')
    tmp_dir = '/user/hive/warehouse/test.db/ods__ck__sam_es_di_new_tmp/'
    emp_files = hdfs_client.list(hdfs_path=tmp_dir)
    # if the temp table's directory already contains files, delete them first
    for emp_file in emp_files:
        hdfs_client.delete(hdfs_path=tmp_dir + emp_file)
    # upload the CSV into the temp table's directory
    remote_path = hdfs_client.upload(hdfs_path=tmp_dir, local_path='emp_data.csv',
                                     n_threads=2, chunk_size=2 ** 16)
    if remote_path is None:
        cursor.close()
        conn.close()
        return 'file upload failed'
    # load the temp table's contents into the partitioned target table
    cursor.execute("insert overwrite table test.ods__ck__sam_es_di_new partition(pt='20210204') "
                   "select * from test.ods__ck__sam_es_di_new_tmp")
    cursor.close()
    conn.close()
    return remote_path
def main():
    query()
    hdfs_path = hdfsWriteFile()
    print(hdfs_path)

if __name__ == '__main__':
    main()
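The query window and the target partition are hard-coded above (2021-01-11/12 and pt='20210204'). For scheduled runs, both would normally be derived from a single date argument. A minimal sketch, where build_range_body is a hypothetical helper and not part of the original script:

import argparse
from datetime import datetime, timedelta

def build_range_body(day):
    # hypothetical helper: ES range query covering the single day `day`
    start = datetime(day.year, day.month, day.day)
    end = start + timedelta(days=1)
    fmt = '%Y-%m-%dT%H:%M:%S.000+08:00'
    return {"query": {"range": {"logoutTime": {
        "gte": start.strftime(fmt),
        "lte": end.strftime(fmt)}}}}

parser = argparse.ArgumentParser()
parser.add_argument('--date', required=True, help='day to export, e.g. 2021-01-11')
args = parser.parse_args()
day = datetime.strptime(args.date, '%Y-%m-%d').date()
body = build_range_body(day)   # would replace the hard-coded body in query()
pt = day.strftime('%Y%m%d')    # would replace the hard-coded value in partition(pt=...)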