pyspark定时统计hive的数据存入mysql

1 pyspark定时统计hive的数据存入mysql

在做指标系统的时候,我们很多时候都需要统计,hive上面的数据,然后存入mysql 便于反复查询. 代码很简单. 还支持补数.

2 代码

# -*- coding: utf-8 -*-

# created by say  2021-04-12
from pyhive import hive
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from pyspark.sql import HiveContext
import datetime
import logging
import pymysql
import sys, os

sys.path.append(os.getcwd())

isPrd = False
MP_DB_CONNECT_INFO = \
    {'host': '运维填写', 'port': 3306, 'user': '运维填写', 'password': '运维填写', 'database': 'db_oneplatform',
     'charset': 'utf8'} if (isPrd) else \
        {'host': '192.168.1.17', 'port': 3306, 'user': 'root', 'password': '123456', 'database': 'dev_test',
         'charset': 'utf8'}
# hive链接信息
HIVE_CONNECT_INFO = \
    {'host': '192.168.1.10', 'port': 10000, 'user': 'say', 'password': 'say', 'database': 'ods'} \
        if (isPrd) else \
        {'host': '192.168.1.12', 'port': 10000, 'user': 'say', 'password': 'say', 'database': 'ods'}
# 初始化
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
# 生成日志句柄
logger = logging.getLogger("label-count")


class dataMp(object):

    def __init__(self):
        # hive
        self.conn = hive.connect(host=HIVE_CONNECT_INFO.get('host'),
                                 port=HIVE_CONNECT_INFO.get('port'),
                                 username=HIVE_CONNECT_INFO.get('user'),
                                 database=HIVE_CONNECT_INFO.get('database')
                                 )

        self.pyconf = SparkConf()
        self.pyconf.set("spark.sql.caseSensitive", "true")
        self.spark = SparkContext(conf=self.pyconf)
        self.sc = HiveContext(self.spark)
        # 连接mysql
        self._conn = pymysql.connect(host=MP_DB_CONNECT_INFO.get('host'),
                                     port=MP_DB_CONNECT_INFO.get('port'),
                                     user=MP_DB_CONNECT_INFO.get('user'),
                                     passwd=MP_DB_CONNECT_INFO.get('password'),
                                     db=MP_DB_CONNECT_INFO.get('database'),
                                     charset=MP_DB_CONNECT_INFO.get('charset')
                                     )
        self._cursor = self._conn.cursor()

    def query(self, sql):
        logger.info(sql)
        try:
            self._cursor.execute(sql)
            self._conn.commit()
            result = self._cursor.fetchall()
        except pymysql.Error as e:
            logger.info(e)
            result = None
        print(result)
        return result

    # 每天运行查询的数据
    def getLabelCount(self, dt):
        sql = """
                select plan_id,count(DISTINCT user_id) as visit_user_count from (
            SELECT user_id,get_json_object(properties,'$.planIds') as id FROM dwd.dwd_user_push_visit_dtl 
            where day={day} and event_id='time_label' ) a
            lateral view explode(split(a.id,',')) num as plan_id group by plan_id
            """.format(day=dt)
        logger.info(sql)
        result = self.sc.sql(sql).collect()
        size = len(result)
        logger.info("数据的数量:"+str(size))
        if size == 0:
            return 0
        return result

    def realTimeLabelDay(self, day, dt):
        global plan_id, visit_user_count
        resultCount = self.getLabelCount(day)
        logger.info("统计结果:"+ str(resultCount))
        if resultCount == 0 :
            logger.info("查询无数据~")
            return 0
        for i in range(0,len(resultCount)):
            logger.info(resultCount[i])
            plan_id = str(resultCount[i][0])
            visit_user_count = resultCount[i][1]
            query_sql = """
                    select id from plan_count 
                where plan_id = '{plan_id}' and substring(count_date,1,10) = substring('{dt}',1,10) limit 1
            """.format(plan_id=plan_id, dt=dt)
            insert_sql = """
                    insert into plan_count (plan_id, plan_type, visit_user_count, count_date) 
                values ('{plan_id}', 1, {visit_user_count}, '{dt}')
            """.format(plan_id=plan_id, visit_user_count=visit_user_count, dt=dt)
            query_result = self.query(query_sql)

            if query_result is None or query_result==():
                logger.info("inset语句: " + insert_sql)
                self._cursor.execute(insert_sql)
                self._conn.commit()
                logger.info("inset影响行: " + str(self._cursor.lastrowid))
            else:
                id = query_result[0][0]
                update_sql = """
                            update plan_count set plan_id='{plan_id}', plan_type=1, 
                        visit_user_count={visit_user_count}, count_date='{dt}' 
                        where id = {id}
                """.format(plan_id=plan_id, visit_user_count=visit_user_count, dt=dt, id=id)
                logger.info("update语句: " + update_sql)
                self._cursor.execute(update_sql)
                self._conn.commit()
                logger.info("update影响行: " + str(self._cursor.rowcount))  # 影响行数
        self._cursor.close()


if __name__ == '__main__':
    db = dataMp()
    if len(sys.argv) > 1:
        dt = sys.argv[1]  # 补数 传入补数的时间 yyyy-mm-dd 00:00:00
        day = dt[0:10].replace('-', '')
        logger.info("运行日期:" + day + ", 运行时间:" + dt)
        db.realTimeLabelDay(day, dt)
    else:
        dt = (datetime.datetime.now()+datetime.timedelta(hours=-1)).strftime("%Y-%m-%d %H:%M:%S")
        day = (datetime.datetime.now()+datetime.timedelta(hours=-1)).strftime("%Y%m%d")
        logger.info("运行日期:" + day + ", 运行时间:" + dt)
        db.realTimeLabelDay(day, dt)
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值