全域数据处理

#!/usr/bin/env python
# coding: utf-8

# In[6]:


# encoding=utf-8
# encoding=utf-8
import pandas as pd
import numpy as np
import re
import time
import subprocess,sys
#import logging
from operator import itemgetter
import os
import logging
'''
定义一个类,用于下文进行加载,从而多线程进行数据连接hive
'''
class Get_data:
    """Small helper that runs shell commands (used below to call the hive CLI)."""

    def run_cmd(self,cmdstr,encoding = 'utf-8'):
        """Run ``cmdstr`` through the shell, echoing and collecting its output.

        stderr is merged into stdout. Every output line is decoded with
        ``encoding``, stripped and collected; non-blank lines are also
        printed as they arrive.

        Args:
            cmdstr: Command line passed to the shell (``shell=True``).
            encoding: Codec used to decode the command's output bytes.

        Returns:
            ``[returncode, output]`` where ``output`` is the stripped lines
            joined with ``'\\n'``.

        Exits:
            Calls ``sys.exit(returncode)`` when the command exits non-zero.
        """
        # The context manager closes the pipe and waits for the child, so
        # ``returncode`` is always populated once the block is left.
        with subprocess.Popen(cmdstr,shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) as res:
            print('===========res============%s'%res)
            results = []
            # Iterating the pipe blocks until a line or EOF is available.
            # Polling with readline() instead would spin after EOF while the
            # child is still exiting and collect spurious empty strings.
            for raw in res.stdout:
                line = raw.decode(encoding)
                stripped = line.strip()
                results.append(stripped)
                if stripped != "":
                    print(line)
        if res.returncode != 0:
            print("执行cmd失败,cmd语句为:%s" % cmdstr)
            sys.exit(res.returncode)
        return [res.returncode, '\n'.join(results)]
'''
第二步骤:获取数据
'''
def load_data(yesterday):
    """Export the source hive table to ``./data/data.csv`` (with a header row).

    Args:
        yesterday: Accepted for symmetry with the other steps; not used here.
    """
    columns = [
        'user_log_account',
        'union_id',
        'master_time',
        'master_source_type',
        'master_account_type',
        'slave_time',
        'slave_source_type',
        'slave_account_type',
    ]
    sql_jd = "select " + ",".join(columns) + " from app.app_user_source_jdwx_join "
    # The shell redirect, not Python, writes the hive CLI output to the file.
    cmdstr_jd = 'hive -e " set hive.cli.print.header=true;' + sql_jd + '" > ./data/data.csv'
    Get_data().run_cmd(cmdstr_jd)
def get_data(filename):
    """Read the tab-separated export and return its rows as a list of lists.

    Args:
        filename: Path of the tab-separated file; it must have a header row
            containing the eight expected column names.

    Returns:
        One list per data row, restricted to the selected columns.
    """
    wanted_columns = [
        'user_log_account',
        'union_id',
        'master_time',
        'master_source_type',
        'master_account_type',
        'slave_time',
        'slave_source_type',
        'slave_account_type',
    ]
    frame = pd.read_csv(
        filename,
        usecols=wanted_columns,
        sep='\t',
        encoding="utf-8",
        low_memory=False,
    )
    return frame.values.tolist()
'''
第三步:处理数据
'''
def process_data(data_list):
    """Return the first two fields of every row as a new list of lists."""
    return [row[0:2] for row in data_list]
def process_data_nw(data_list):
    """Group the first-two-field values of the rows into connected components.

    The first two fields of each row (as returned by ``process_data``) are
    treated as graph nodes joined by an edge; the connected components of
    that graph are returned.

    Args:
        data_list: Rows as lists; only the first two fields are used.

    Returns:
        A list of lists, one per connected component, each holding the
        nodes of that component.
    """
    import networkx as nx
    pairs = process_data(data_list)
    G = nx.Graph()
    # Add nodes in row order. A generator flattens in linear time, whereas
    # sum(pairs, []) re-copies the accumulated list on every step.
    G.add_nodes_from(node for pair in pairs for node in pair)
    # Link consecutive values of each row; rows with fewer than two values
    # contribute no edge.
    for pair in pairs:
        G.add_edges_from(zip(pair, pair[1:]))
    return [list(component) for component in nx.connected_components(G)]
def p_data(data_list,result):
    """Bucket rows by the component that contains both of their first two fields.

    Args:
        data_list: Rows as lists; ``row[0]`` and ``row[1]`` are looked up.
        result: List of node groups (e.g. connected components). Nodes must
            be hashable.

    Returns:
        Dict mapping a group's index in ``result`` to the list of rows
        (the same list objects as in ``data_list``) whose first two fields
        both belong to that group. When several groups qualify, the lowest
        index wins. Rows matching no group are left out.
    """
    # Index every node once instead of scanning every group for every row.
    membership = {}
    for j, group in enumerate(result):
        for node in group:
            groups = membership.setdefault(node, [])
            # Indices are appended in ascending order; skip repeats caused
            # by a node listed twice in the same group.
            if not groups or groups[-1] != j:
                groups.append(j)

    r = {}
    for row in data_list:
        first = membership.get(row[0])
        second = membership.get(row[1])
        if not first or not second:
            continue
        second_set = set(second)
        # The lowest shared index is the group a front-to-back scan of
        # ``result`` would have hit first.
        j = next((idx for idx in first if idx in second_set), None)
        if j is not None:
            r.setdefault(j, []).append(row)
    return r
def write_csv(result_csv,r,yesterday):
    """Write the grouped rows to a tab-separated file without header or index.

    Each output row is the original row followed by its group key and
    ``yesterday``.

    Args:
        result_csv: Destination file path.
        r: Dict mapping a group key to the list of rows in that group.
        yesterday: Value appended as the last field of every row.

    Returns:
        ``result_csv``, unchanged.
    """
    # Build new rows rather than appending to the caller's lists, so the
    # rows held in ``r`` are not modified as a side effect of writing.
    h = [list(row) + [k, yesterday] for k, rows in r.items() for row in rows]
    # The rows still pass through np.array before DataFrame, as before, so
    # the text written to the file is unchanged.
    h = np.array(h)
    pd.DataFrame(h).to_csv(result_csv,index=False ,header=None,sep='\t')
    return result_csv
'''
第四步:数据加载回大数据平台
'''
def Upload_Data(yesterday):
    """Load ``./data/result.csv`` into the hive result table for one partition.

    Args:
        yesterday: Value used for the ``dt`` partition of the target table.
    """
    # Push the local result file to the big-data platform, overwriting the
    # partition.
    sql2 = "load data local inpath './data/result.csv' overwrite into table app.xcx_app_land_data_result partition (dt='{}')".format(yesterday)
    cmdstr2 = 'hive -e "' + sql2 + '" '
    Get_data().run_cmd(cmdstr2)
def main(yesterday):
    """Run the whole pipeline: export, read, group, write, upload.

    Args:
        yesterday: Value written into every result row and used as the
            ``dt`` partition on upload.
    """
    source_path = './data/data.csv'
    output_path = './data/result.csv'

    load_data(yesterday)
    rows = get_data(source_path)
    components = process_data_nw(rows)
    grouped = p_data(rows, components)
    write_csv(output_path, grouped, yesterday)
    Upload_Data(yesterday)
    
if __name__ == '__main__':
    # The first command-line argument is passed through as ``yesterday``.
    main(str(sys.argv[1]))



jd_45a74c8a80fae oCwKwuA--nUcBf-AjibsvsIDqeFk
jd_TyTPaUnfTyuy oCwKwuH_fqvHNPESGLUzWUkchluk
rib6lg oCwKwuHk1udYCo82sUtEmxSVX6_0
jd_UKsFoffVBvtB oCwKwuHk1udYCo82sUtEmxSVX6_0
rib6lg oCwKwuMEkxKZIxFpAXZwei-2axMY
jd_RlVQwU8z2b6Lqbz oCwKwuMEkxKZIxFpAXZwei-2axMY
jd_TyTPaUnfTyuy oCwKwuMEkxKZIxFpAXZwei-2axMY
cm870321 oCwKwuMEkxKZIxFpAXZwei-2axMY
jd_UKsFoffVBvtB oCwKwuMEkxKZIxFpAXZwei-2axMY
rib6lg oCwKwuOPk1ldt-yFjqlI7qpziyGw
wyl166246 oCwKwuA--yFZ6imDGIBuNx0iU6ZU
jd_aLDqFGhChTWE oCwKwuA--yFZ6imDGIBuNx0iU6ZU
jd_UKsFoffVBvtB oCwKwuOPk1ldt-yFjqlI7qpziyGw
jd_ukDdpAAgqHcs oCwKwuA--kXG85Khc5cJQv4j0f2o

   

  • 3
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值