Spark SQL 主要实现了 select 功能,不具备 insert、update 功能。本文实现用 Spark + psycopg2 对 PostgreSQL 数据库进行 insert、update 操作,代码部分已进行了详细的说明:
import psycopg2
import psycopg2.extras
from pyspark import SparkContext,SparkConf
from pyspark.sql import SQLContext
from pyspark.sql.types import Row, StructField, StructType, StringType, IntegerType
# opera: map one RDD record (id, name, v2, v3, ...) to a Row whose
# `val` field is the sum of the record's third and fourth elements.
def opera(x):
    record_id, record_name = x[0], x[1]
    combined = x[2] + x[3]
    return Row(id=record_id, name=record_name, val=combined)
#save_df_to_db:对rdd中的每一个partition进行处理
def save_df_to_db(records):
db_conn = psycopg2.connect(database="testdb", user="postgres", password="password", host="127.0.0.1", port="5432")
dict_cursor=db_conn.cursor()
upsert_query = "INSERT INTO t3 (id, name, val) VALUES (%(id)s, %(name)s, %(val)s) ON CONFLICT (