I. Connecting Python to Hive
1. Cluster parameters you need to know
host: host address
port: port number
database: database name
user: username
auth_mechanism: depends on the hive-site.xml configuration
password: password (optional)
2. Connecting and usage:
# -*- coding: utf-8 -*-
import pandas as pd
from impala import dbapi
import numpy as np
# Connection parameters
hive_connection = {'host': '10.xxx.x.x',
                   'port': 8990,
                   'user': 'security_hive',
                   'auth_mechanism': 'PLAIN',
                   'database': 'security_sh'}
# Hive job parameters (extend as needed)
hive_config = {'mapreduce.reduce.memory.mb': '8192',
               'mapreduce.reduce.java.opts': '-Xmx7373m',
               'mapreduce.map.memory.mb': '4096',
               'mapreduce.map.java.opts': '-Xmx3276m'}
# Connect to Hive
conn_hive = dbapi.connect(**hive_connection)
cur_hive = conn_hive.cursor()
# SQL statement and execution
hive_sql = "SELECT userid,count(*) AS pv " \
"FROM %s.%s " \
"WHERE d = %s " \
"GROUP BY userid" % (security_sh, security_user_pv_detail, day)
cur_hive.execute(hive_sql, configuration=hive_config)
df_get = cur_hive.fetchall()
# Note: df_get is a list of tuples (the DB-API fetchall() result), not a DataFrame; see the conversion sketch below
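To keep working with the result in pandas, the fetched tuples can be wrapped into a DataFrame. A minimal sketch, assuming the cursor above has just run the query (column names are taken from cursor.description, part of the DB-API):
# Build a pandas DataFrame from the fetched rows; column names come from the cursor metadata
columns = [col[0] for col in cur_hive.description]
df_hive = pd.DataFrame(df_get, columns=columns)
# Release the Hive session once the data is local
cur_hive.close()
conn_hive.close()
impyla also ships impala.util.as_pandas(cursor), which builds the DataFrame in one call if you would rather not fetch manually.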
II. Connecting Python to Spark
Connecting Python to Spark is relatively straightforward: once the local machine is connected to the cluster, you can work with it directly.
Connecting and usage:
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
# Build the SparkSession
# master("yarn"): submit via YARN; appName: the job name shown on the cluster
spark = SparkSession.builder \
    .master("yarn") \
    .appName("python_spark") \
    .enableHiveSupport() \
    .getOrCreate()
# Note: master must be set to yarn; otherwise, if the table is large, reading the data will be very slow
# Run SQL through Spark
spark_sql = "SELECT userid, count(*) AS pv " \
            "FROM %s.%s " \
            "WHERE d = %s " \
            "GROUP BY userid" % ('security_sh', 'security_user_pv_detail', day)
df_get = spark.sql(spark_sql)  # df_get is a Spark DataFrame at this point, so it needs converting
df_get = df_get.toPandas()     # convert to a pandas DataFrame that Python can work with directly
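If the pandas result needs to go back into Spark SQL later (for example to join it against another Hive table), it can be re-registered as a temporary view. A minimal sketch; the view name user_pv is only an illustrative choice:
# Convert the pandas DataFrame back into a Spark DataFrame and expose it to Spark SQL
spark_df = spark.createDataFrame(df_get)
spark_df.createOrReplaceTempView("user_pv")
spark.sql("SELECT count(*) FROM user_pv").show()
# Release the cluster resources once the job is done
spark.stop()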
III. Connecting Python to MySQL
Python can connect to MySQL for both reads and writes.
Connecting and usage:
# -*- coding: utf-8 -*-
import pandas as pd
import pymysql
import numpy as np
import datetime
import sys
import os
# Connection parameters
mysql_connection = {'host': '10.xxx.xxx.xx',
                    'user': 'security_user',
                    'passwd': 'xxx',
                    'db': 'security_sh',
                    'port': 6806,
                    'charset': 'utf8',
                    'connect_timeout': 60}
# Connect to MySQL and run the query
mysql_sql = """
    SELECT userid, count(*) AS pv
    FROM security_sh.security_user_pv_detail
    WHERE d = %s
    GROUP BY userid
"""
conn_mysql = pymysql.connect(**mysql_connection)
cur_mysql = conn_mysql.cursor()
cur_mysql.execute(mysql_sql, (day,))  # day is passed as a bound parameter
get_data = cur_mysql.fetchall()
conn_mysql.close()
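As with Hive, the fetched tuples can be wrapped into a pandas DataFrame, or pandas can run the query itself via read_sql. A minimal sketch, assuming the same connection parameters (pandas may warn about using a raw pymysql connection instead of SQLAlchemy, but the query still runs):
# Option 1: wrap the fetched tuples manually, naming the columns after the SQL aliases
df_mysql = pd.DataFrame(get_data, columns=['userid', 'pv'])
# Option 2: let pandas execute the query and build the DataFrame in one step
conn_mysql = pymysql.connect(**mysql_connection)
df_mysql = pd.read_sql(mysql_sql, conn_mysql, params=(day,))
conn_mysql.close()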
##################################################################
# Write data to MySQL with Python
##################################################################
# df is a DataFrame holding the source data
data = np.array(df).tolist()
# Connect to MySQL and write the data
print(f"{datetime.datetime.now()} : Insert into MySQL...")
conn_mysql = pymysql.connect(**mysql_connection)
cur_mysql = conn_mysql.cursor()
mysql_sql= ("INSERT INTO security_user_pv_detail (userid, pv) VALUES (%s,%s) "
"ON DUPLICATE KEY UPDATE "
"userid = values(userid), pv = values(pv)")
cur_mysql.executemany(mysql_sql, data)
conn_mysql.commit()
print(f"{datetime.datetime.now()} : Insert into MySQL done!")
That covers connecting to and using Hive, Spark, and MySQL from Python. In practice there are also flows such as writing Hive data into MySQL and loading MySQL data into Hive tables; I will add those later.