# %% Setup: create the SparkContext and SparkSession
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
sc = SparkContext("local")
spark = SparkSession(sc)
# %%
# count() returns the number of elements in the dataset
rdd = sc.parallelize([1,2,3,4,5])
rdd.count()
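# %%
# A small sketch (not in the original): a few other built-in numeric
# actions that work alongside count() on this RDD.
print(rdd.sum())   # 15
print(rdd.max())   # 5
print(rdd.mean())  # 3.0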
# %%
# first() returns the first element; take(n) returns the first n elements
print(rdd.first())  # 1
print(rdd.take(3))  # [1, 2, 3]
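# %%
# A related sketch (not in the original): top(n) returns the n largest
# elements and takeOrdered(n) the n smallest; both are standard RDD actions.
print(rdd.top(3))         # [5, 4, 3]
print(rdd.takeOrdered(3)) # [1, 2, 3]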
# %%
rdd.reduce(lambda a, b: a + b)  # reduce() combines elements pairwise; here it sums them
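# %%
# A minimal sketch (not in the original): fold() is like reduce() but takes
# a zero value, so it also works on an empty RDD.
rdd.fold(0, lambda a, b: a + b)  # 15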
# %%
rdd.collect()  # collect() returns all elements to the driver as a list
# %%
rdd.foreach(lambda elem: print(elem))  # print each element; with a local master the output appears in this console
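# %%
# A sketch (not in the original): foreachPartition() calls the function once
# per partition with an iterator, which is cheaper when per-call setup
# (e.g. opening a connection) is expensive.
rdd.foreachPartition(lambda part: print(list(part)))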
# %%
rdd = sc.textFile("/root/pythonlearn/word.txt")  # load a text file as an RDD of lines
# %%
lineslens = rdd.map(lambda line: len(line))  # length of each line (don't reuse "rdd" as the lambda parameter)
totallens = lineslens.reduce(lambda a, b: a + b)  # total characters across all lines
print(totallens)
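# %%
# A follow-on sketch (not in the original): word count over the same file,
# using flatMap to split lines into words and reduceByKey to sum the counts.
wordcounts = (rdd.flatMap(lambda line: line.split())
                 .map(lambda word: (word, 1))
                 .reduceByKey(lambda a, b: a + b))
print(wordcounts.collect())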
# %%
list = ["hadoop","spark","hive"]
rdd = sc.parallelize(list)
print(rdd.count())
print(",".join(rdd.collect()))
# %%
list = ["hadoop","spark","hive"]
rdd = sc.parallelize(list)
rdd.cache()#<==>rdd.persist(MEMORY_ONLY)#持久化操作
print(rdd.count())
print(",".join(rdd.collect()))#拼接
rdd.unpersist()
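# %%
# A sketch (not in the original): persist() with an explicit StorageLevel,
# here MEMORY_AND_DISK, which spills partitions to disk if memory runs out.
from pyspark import StorageLevel
rdd.persist(StorageLevel.MEMORY_AND_DISK)
print(rdd.count())
rdd.unpersist()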
# %%
print(len(rdd.glom().collect()))  # glom() groups elements by partition, so the length is the partition count
# %%
rdd2 = rdd.repartition(2)  # redistribute the data across 2 partitions
print(len(rdd2.glom().collect()))  # now 2 partitions
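# %%
# A simpler sketch (not in the original): getNumPartitions() reports the
# partition count directly, without collecting data to the driver the way
# glom().collect() does.
print(rdd2.getNumPartitions())  # 2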
# %%
# Only one SparkContext may run at a time, so stop the existing one first.
from pyspark import SparkContext, SparkConf
sc.stop()
conf = SparkConf().setAppName("myapp").setMaster("local")
sc = SparkContext(conf=conf)
# %%
textfile = sc.textFile("/root/pythonlearn/word.txt")
print(textfile.first())  # first line of the file
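# %%
# A sketch (not in the original): filter() keeps the lines for which the
# predicate is true; here, lines containing "spark".
print(textfile.filter(lambda line: "spark" in line).count())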
# %%
textfile.saveAsTextFile("/root/pythonlearn/writeback_word")  # writes a directory of part-* files; fails if the path already exists
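# %%
# A read-back sketch (not in the original): textFile() accepts the output
# directory written above and reads all of its part files.
readback = sc.textFile("/root/pythonlearn/writeback_word")
print(readback.collect())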