from pyspark import SparkContext, SparkConf
import os
from tqdm import tqdm
from utils2 import convert_date_2_chenyings_format
# Point Hadoop at the winutils.exe shim required on Windows.
# Raw string avoids the invalid "\s"/"\w"/"\h" escape sequences that the
# original plain string triggered (SyntaxWarning on Python 3.12+); the
# resulting value is byte-identical.
os.environ['HADOOP_HOME'] = r'D:\software\spark\winutils\hadoop-common-2.2.0-bin'
# Python interpreter the cluster workers are expected to run with.
# NOTE(review): a Windows HADOOP_HOME combined with a macOS-style venv path
# suggests copy-paste from two environments — confirm both for your setup.
os.environ["PYSPARK_PYTHON"] = "/Users/user/Python_Source/venv/python3.7"
# Standalone master URL (LAN IP of the Spark master).
conf = SparkConf().setAppName('AppName').setMaster('spark://0.0.0.0:7077')
sc = SparkContext(conf=conf)
# Ship every local .py file to the cluster so worker tasks can import them.
for f in tqdm(os.listdir()):
    # Fixes two defects in the original:
    #  1. the for/if bodies were not indented (SyntaxError as written);
    #  2. `f.find('.py') != -1` also matched '.pyc', '.pyx', 'x.py.bak' —
    #     endswith('.py') selects only real Python source files.
    if f.endswith('.py'):
        sc.addPyFile(f)
# Smoke test: confirm the cluster executes simple RDD transformations.
# (The original also defined `data2 = {1: '1s', 2: '2s', 3: '3s'}`, which was
# never used anywhere in the file — removed as dead code.)
data = [1, 2, 3, 4, 5]
dist_data = sc.parallelize(data)
dist_data.collect()
# Increment each element to verify tasks actually run on the workers.
data_test = dist_data.map(lambda k: k + 1)
data_test.collect()
# Identity map — swap the lambda for functions from your own shipped .py files.
data_test = dist_data.map(lambda k: k)
data_test.collect()
sc.stop()
# Source note: "Spark pitfalls — a must-read for Spark newcomers and Python
# Spark users; this cost me nearly three weeks" (blog article this script was
# adapted from; last updated 2024-08-01 10:54:58).