大数据处理过程

记录一次python处理impala数据的过程:

# country表和FTA表结合
import time

from impala.dbapi import connect
import numpy as np

start_time = time.time()

# 定义impala连接
def connect():
    host = '192.168.0.139'
    port = 21050
    conn = connect(host=host, port=port)
    return conn

conn = connect()
cursor = conn.cursor()

# 定义日期拆分函数
def split_date(date_str):
    if date_str == '1900/01/01':
        return '', ''
    else:
        date_parts = date_str.split('/')
        return date_parts[0], date_parts[1]

def exec_impala(sql):
    cursor.execute(sql)
    result = cursor.fetchall()
    return result

# def data_list:
country_list = {"Southern African Customs Union (SACU)":"THE KINGDOM OF ESWATINI;BOTSWANA;SOUTH AFRICA;LESOTHO;NAMIBIA","ASEAN Free Trade Area (AFTA)":"LAOS;MYANMAR;CAMBODIA;PHILIPPINES;MALAYSIA;THAILAND;BRUNEI;SINGAPORE;INDONESIA;VIETNAM","European Free Trade Association (EFTA)":"AUSTRIA;DENMARK;NORWAY;PORTUGAL;SWEDEN;SWITZERLAND;UNITED KINGDOM","Gulf Cooperation Council (GCC)":"SAUDI ARABIA;OMAN;KUWAIT;QATAR;BAHRAIN;YEMEN;UNITED ARAB EMIRATES","European Union":"IRELAND;ESTONIA;AUSTRIA;BULGARIA;BELGIUM;POLAND;DENMARK;GERMANY;FRANCE;FINLAND;NETHERLANDS;CZECH REPUBLIC;CROATIA;LATVIA;LITHUANIA;LUXEMBOURG;ROMANIA;MALTA;PORTUGAL;SWEDEN;CYPRUS;SLOVAKIA;SLOVENIA;SPAIN;GREECE;HUNGARY;ITALY","Central American Common Market (CACM)":"NICARAGUA;HONDURAS;COSTA RICA;EL SALVADOR;GUATEMALA","Eurasian Economic Union (EAEU)":"KYRGYZSTAN;BELARUS;RUSSIA;KAZAKHSTAN;TAJIKISTAN"}


sql = 'select rta_id,rta_name,current_signatories,original_signatories,status,date_of_entry_into_force_g,inactive_date from tradeflow.allrta_filter'
# 获取allrta_filter表数据
result = exec_impala(sql)


sql1 = 'select countryid,country_en,country_cn from tradeflow.country'
# 获取country表数据
result1 = exec_impala(sql1)

dataList = []
# 第一步:将data中的包含country_list的数据进行替换
result_data = []

for item in result:
    new_item = list(item)
    for alliance, countries in country_list.items():
        if alliance in new_item[2] or alliance in new_item[3]:
            new_item[2] = new_item[2].replace(alliance,countries).replace("; ",";").rstrip()
            new_item[3] = new_item[3].replace(alliance,countries).replace("; ",";").rstrip()
        else:
            new_item[2] = new_item[2].replace("; ",";").rstrip()
            new_item[3] = new_item[3].replace("; ",";").rstrip()

    new_item.append(split_date(new_item[5])[0])
    new_item.append(split_date(new_item[5])[1])
    new_item.append(split_date(new_item[6])[0])
    new_item.append(split_date(new_item[6])[1])
    result_data.append(new_item)

final_data = []

for item in result_data:
    entry1 = item.copy()
    # 情况一:元素三包含元素四,则按元素二表示,并且再增加一个元素YES
    country_upper_list3 = entry1[3].upper()
    country_upper_list2 = entry1[2].upper()
    # 情况一:元素二包含元素三,则按元素二表示,并且在增加一个元素YES(0)
    # 情况二:元素二为空,意味着都是所有国家都是NO(1)
    # 情况三:元素二与元素三,除去相同的,元素三多余的国家,增加元素设置为NO(1),相同的则设置为YES(0)
    if not country_upper_list2:
        countries = country_upper_list3.split(";")
        for country_info in result1:
            for country in countries:
                if country == country_info[1]:
                    new_tuple = entry1[:2] + list(country_info) + entry1[4:]
                    new_tuple.append(1)
                    dataList.append(new_tuple)
    else:
        list3 = country_upper_list3.split(";")
        list2 = country_upper_list2.split(";")
        res = np.setdiff1d(list3, list2)
        if set(list3).issubset(set(list2)):
            for country_info in result1:
                for country in list2:
                    if country == country_info[1]:
                        new_tuple = entry1[:2] + list(country_info) + entry1[4:]
                        new_tuple.append(0)
                        dataList.append(new_tuple)
        elif len(res) > 0:
            country_set = list(set(list3+list2))
            for country_info in result1:
                if country_info[1] in res:
                    new_tuple = entry1[:2] + list(country_info) + entry1[4:] + [1]
                    dataList.append(new_tuple)
                elif country_info[1] in country_set:
                    new_tuple = entry1[:2] + list(country_info) + entry1[4:] + [0]
                    dataList.append(new_tuple)

insert_sql = f'''INSERT INTO tradeflow.allrta_country_test
(rta_id,rta_name,gtf_id,country_en,country_cn,date_of_entry_into_force_g,inactive_date,
status,y_year_start,m_month_start,y_year_end,m_month_end,is_suspect)
VALUES'''
for data in dataList:
    insert_sql = insert_sql + f'''({data[0]}, "{data[1]}", {data[2]}, "{data[3]}", '{data[4]}', '{data[6]}', '{data[7]}','{data[5]}','{data[8]}', '{data[9]}', '{data[10]}', '{data[11]}', {data[12]}),'''
insert_sql = insert_sql[:-1]

cursor.execute(insert_sql)

conn.commit()
cursor.close()

end_time = time.time()
print(end_time-start_time)

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值