在写 PySpark 时,如果把类的实例方法传给 RDD 的 transformation/action(即函数会被发送到 worker 执行),而该类实例(self)持有 SparkContext 相关的对象,就需要把这个方法标注为 @staticmethod,使其不再引用 self。
否则会报错,log参考文章最后。
原理是:传给 transformation/action 的函数会被 cloudpickle 序列化后传输给 Spark worker 所在的机器执行;如果它是实例方法,序列化时会连带捕获整个 self,而 self 中持有的 SparkContext 无法被序列化——SparkContext 只能在 driver 上使用(见 SPARK-5063),因此抛出 PicklingError。标注为 @staticmethod 后函数不再捕获 self,即可正常序列化。
参考:
1、http://spark.apache.org/docs/latest/rdd-programming-guide.html(其中 "Passing Functions to Spark" 一节)
报错记录:
File "spark01_helloworld.py", line 177, in gen_net self.writeInfoToHive(rdd_result.toDF().withColumn("dt", lit(today)), tablename) File "/data8/hadoop/yarn/nm-local-dir/usercache/hadoop-mt-hids/appcache/application_1619689474933_8491239/container_e32_1619689474933_8491239_01_000001/pyspark.zip/pyspark/sql/session.py", line 57, in toDF File "/data8/hadoop/yarn/nm-local-dir/usercache/hadoop-mt-hids/appcache/application_1619689474933_8491239/container_e32_1619689474933_8491239_01_000001/pyspark.zip/pyspark/sql/session.py", line 535, in createDataFrame File "/data8/hadoop/yarn/nm-local-dir/usercache/hadoop-mt-hids/appcache/application_1619689474933_8491239/container_e32_1619689474933_8491239_01_000001/pyspark.zip/pyspark/sql/session.py", line 375, in _createFromRDD File "/data8/hadoop/yarn/nm-local-dir/usercache/hadoop-mt-hids/appcache/application_1619689474933_8491239/container_e32_1619689474933_8491239_01_000001/pyspark.zip/pyspark/sql/session.py", line 346, in _inferSchema File "/data8/hadoop/yarn/nm-local-dir/usercache/hadoop-mt-hids/appcache/application_1619689474933_8491239/container_e32_1619689474933_8491239_01_000001/pyspark.zip/pyspark/rdd.py", line 1361, in first File "/data8/hadoop/yarn/nm-local-dir/usercache/hadoop-mt-hids/appcache/application_1619689474933_8491239/container_e32_1619689474933_8491239_01_000001/pyspark.zip/pyspark/rdd.py", line 1343, in take File "/data8/hadoop/yarn/nm-local-dir/usercache/hadoop-mt-hids/appcache/application_1619689474933_8491239/container_e32_1619689474933_8491239_01_000001/pyspark.zip/pyspark/context.py", line 992, in runJob File "/data8/hadoop/yarn/nm-local-dir/usercache/hadoop-mt-hids/appcache/application_1619689474933_8491239/container_e32_1619689474933_8491239_01_000001/pyspark.zip/pyspark/rdd.py", line 2455, in _jrdd File "/data8/hadoop/yarn/nm-local-dir/usercache/hadoop-mt-hids/appcache/application_1619689474933_8491239/container_e32_1619689474933_8491239_01_000001/pyspark.zip/pyspark/rdd.py", line 2388, in 
_wrap_function File "/data8/hadoop/yarn/nm-local-dir/usercache/hadoop-mt-hids/appcache/application_1619689474933_8491239/container_e32_1619689474933_8491239_01_000001/pyspark.zip/pyspark/rdd.py", line 2374, in _prepare_for_python_RDD File "/data8/hadoop/yarn/nm-local-dir/usercache/hadoop-mt-hids/appcache/application_1619689474933_8491239/container_e32_1619689474933_8491239_01_000001/pyspark.zip/pyspark/serializers.py", line 460, in dumps File "/data8/hadoop/yarn/nm-local-dir/usercache/hadoop-mt-hids/appcache/application_1619689474933_8491239/container_e32_1619689474933_8491239_01_000001/pyspark.zip/pyspark/cloudpickle.py", line 704, in dumps File "/data8/hadoop/yarn/nm-local-dir/usercache/hadoop-mt-hids/appcache/application_1619689474933_8491239/container_e32_1619689474933_8491239_01_000001/pyspark.zip/pyspark/cloudpickle.py", line 162, in dump pickle.PicklingError: Could not serialize object: Exception: It appears that you are attempting to reference SparkContext from a broadcast variable, action, or transformation. SparkContext can only be used on the driver, not in code that it run on workers. For more information, see SPARK-5063.
Log Type: syslog