# Example of reading and writing LZO-compressed files with PySpark
from pyspark import SparkContext
from pyspark import SparkConf
conf = SparkConf().setAppName("lzo_test")
sc = SparkContext(conf=conf)

# Read the LZO file through the new Hadoop API. Records arrive as
# (byte offset: LongWritable, line text: Text) pairs.
filerdd = sc.newAPIHadoopFile(
    "s3n://20160707/tag-20160707-32-00002.lzo",
    "com.hadoop.mapreduce.LzoTextInputFormat",
    "org.apache.hadoop.io.LongWritable",
    "org.apache.hadoop.io.Text",
)


def _first_two_columns(line):
    """Return the first two tab-separated fields of *line* as a (key, value) pair."""
    # Split once instead of twice per record (the original called
    # line.split("\t") for each field).
    fields = line.split("\t")
    return (fields[0], fields[1])


# Drop the byte-offset key, keep the first two tab-separated columns, and
# write the result back out LZO-compressed via the old mapred output API
# (saveAsHadoopFile expects the org.apache.hadoop.mapred.* format classes).
# NOTE(review): the original codec class name literal was broken across two
# source lines ("com.hadoo" + "p.compression.lzo.LzopCodec"), which was a
# syntax error — reassembled here.
(
    filerdd
    .map(lambda pair: pair[1])
    .map(_first_two_columns)
    .saveAsHadoopFile(
        "/tmp/tag-20160707-32-00002-new.lzo",
        "org.apache.hadoop.mapred.TextOutputFormat",
        compressionCodecClass="com.hadoop.compression.lzo.LzopCodec",
    )
)

sc.stop()