数据初始化
# Sub-directories of `output` (defined elsewhere) that hold the image and
# label data sets respectively.
output_images = f"{output}/images"
output_labels = f"{output}/labels"

# Placeholders; one of the loading snippets assigns the real RDDs.
imageRDD = labelRDD = None
读取CSV数据
def fromCSV(s):
    """Convert one line of CSV text into a vector (list of floats).

    Args:
        s: A single comma-separated line, e.g. ``"0.0,1.5,3"``.

    Returns:
        A list with one float per non-blank field. An empty or
        whitespace-only line yields ``[]``.

    Raises:
        ValueError: If a non-blank field is not a valid float literal.
    """
    # The guard has to look at each field, not at the whole line: a check on
    # the line is loop-invariant, so input such as "1,2," would still reach
    # float("") and raise ValueError.
    return [float(field) for field in s.split(",") if field.strip()]
# Read both CSV directories as text and map every line through fromCSV,
# giving one parsed row per RDD element.
imageRDD, labelRDD = (
    sc.textFile(csv_dir).map(fromCSV)
    for csv_dir in (output_images, output_labels)
)
读取pickle数据
# Load the image and label RDDs from their pickle-file directories.
imageRDD, labelRDD = map(sc.pickleFile, (output_images, output_labels))
读取tfrecord数据
# Read the TFRecord files under `output` through the Hadoop input format.
# Each element is a (key, value) pair: the key class is BytesWritable and
# the value class is NullWritable.
tfRDD = sc.newAPIHadoopFile(
    output,
    inputFormatClass="org.tensorflow.hadoop.io.TFRecordFileInputFormat",
    keyClass="org.apache.hadoop.io.BytesWritable",
    valueClass="org.apache.hadoop.io.NullWritable",
)

# Convert each record's key to `bytes` and decode it with fromTFExample
# (defined elsewhere). Note that this snippet does not assign labelRDD.
imageRDD = tfRDD.map(lambda record: fromTFExample(bytes(record[0])))
数据转化的相关内容请参见另外一篇博客。