1 pyspark定时统计hive的数据存入mysql
在做指标系统的时候,我们很多时候都需要统计,hive上面的数据,然后存入mysql 便于反复查询. 代码很简单. 还支持补数.
2 代码
# -*- coding: utf-8 -*-
# created by say 2021-04-12
from pyhive import hive
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from pyspark.sql import HiveContext
import datetime
import logging
import pymysql
import sys, os
sys.path.append(os.getcwd())
isPrd = False
MP_DB_CONNECT_INFO = \
{'host': '运维填写', 'port': 3306, 'user': '运维填写', 'password': '运维填写', 'database': 'db_oneplatform',
'charset': 'utf8'} if (isPrd) else \
{'host': '192.168.1.17', 'port': 3306, 'user': 'root', 'password': '123456', 'database': 'dev_test',
'charset': 'utf8'}
# hive链接信息
HIVE_CONNECT_INFO = \
{'host': '192.168.1.10', 'port': 10000, 'user': 'say', 'password': 'say', 'database': 'ods'} \
if (isPrd) else \
{'host': '192.168.1.12', 'port': 10000, 'user': 'say', 'password': 'say', 'database': 'ods'}
# 初始化
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
# 生成日志句柄
logger = logging.getLogger("label-count")
class dataMp(object):
def __init__(self):
# hive
self.conn = hive.connect(host=HIVE_CONNECT_INFO.get('host'),
port=HIVE_CONNECT_INFO.get('port'),
username=HIVE_CONNECT_INFO.get('user'),
database=HIVE_CONNECT_INFO.get('database')
)
self.pyconf = SparkConf()
self.pyconf.set("spark.sql.caseSensitive", "true")
self.spark = SparkContext(conf=self.pyconf)
self.sc = HiveContext(self.spark)
# 连接mysql
self._conn = pymysql.connect(host=MP_DB_CONNECT_INFO.get('host'),
port=MP_DB_CONNECT_INFO.get('port'),
user=MP_DB_CONNECT_INFO.get('user'),
passwd=MP_DB_CONNECT_INFO.get('password'),
db=MP_DB_CONNECT_INFO.get('database'),
charset=MP_DB_CONNECT_INFO.get('charset')
)
self._cursor = self._conn.cursor()
def query(self, sql):
logger.info(sql)
try:
self._cursor.execute(sql)
self._conn.commit()
result = self._cursor.fetchall()
except pymysql.Error as e:
logger.info(e)
result = None
print(result)
return result
# 每天运行查询的数据
def getLabelCount(self, dt):
sql = """
select plan_id,count(DISTINCT user_id) as visit_user_count from (
SELECT user_id,get_json_object(properties,'$.planIds') as id FROM dwd.dwd_user_push_visit_dtl
where day={day} and event_id='time_label' ) a
lateral view explode(split(a.id,',')) num as plan_id group by plan_id
""".format(day=dt)
logger.info(sql)
result = self.sc.sql(sql).collect()
size = len(result)
logger.info("数据的数量:"+str(size))
if size == 0:
return 0
return result
def realTimeLabelDay(self, day, dt):
global plan_id, visit_user_count
resultCount = self.getLabelCount(day)
logger.info("统计结果:"+ str(resultCount))
if resultCount == 0 :
logger.info("查询无数据~")
return 0
for i in range(0,len(resultCount)):
logger.info(resultCount[i])
plan_id = str(resultCount[i][0])
visit_user_count = resultCount[i][1]
query_sql = """
select id from plan_count
where plan_id = '{plan_id}' and substring(count_date,1,10) = substring('{dt}',1,10) limit 1
""".format(plan_id=plan_id, dt=dt)
insert_sql = """
insert into plan_count (plan_id, plan_type, visit_user_count, count_date)
values ('{plan_id}', 1, {visit_user_count}, '{dt}')
""".format(plan_id=plan_id, visit_user_count=visit_user_count, dt=dt)
query_result = self.query(query_sql)
if query_result is None or query_result==():
logger.info("inset语句: " + insert_sql)
self._cursor.execute(insert_sql)
self._conn.commit()
logger.info("inset影响行: " + str(self._cursor.lastrowid))
else:
id = query_result[0][0]
update_sql = """
update plan_count set plan_id='{plan_id}', plan_type=1,
visit_user_count={visit_user_count}, count_date='{dt}'
where id = {id}
""".format(plan_id=plan_id, visit_user_count=visit_user_count, dt=dt, id=id)
logger.info("update语句: " + update_sql)
self._cursor.execute(update_sql)
self._conn.commit()
logger.info("update影响行: " + str(self._cursor.rowcount)) # 影响行数
self._cursor.close()
if __name__ == '__main__':
db = dataMp()
if len(sys.argv) > 1:
dt = sys.argv[1] # 补数 传入补数的时间 yyyy-mm-dd 00:00:00
day = dt[0:10].replace('-', '')
logger.info("运行日期:" + day + ", 运行时间:" + dt)
db.realTimeLabelDay(day, dt)
else:
dt = (datetime.datetime.now()+datetime.timedelta(hours=-1)).strftime("%Y-%m-%d %H:%M:%S")
day = (datetime.datetime.now()+datetime.timedelta(hours=-1)).strftime("%Y%m%d")
logger.info("运行日期:" + day + ", 运行时间:" + dt)
db.realTimeLabelDay(day, dt)