pyspark读取数据
方法一:从hdfs读取
# -*- coding: utf-8 -*-
from pyspark.sql import SparkSession, HiveContext, DataFrameWriter
import argparse
import time
import numpy as np
import pandas as pd

# Entry point: a Hive-enabled SparkSession shared by the examples below.
spark = SparkSession.builder.enableHiveSupport().appName("test").getOrCreate()
# Wall-clock start, presumably to time the load below — the matching
# elapsed-time print is not visible in this chunk.
start = time.time()

### Loading method 1: read Parquet-format data from HDFS.
# NOTE(review): `input` shadows the builtin input(); kept for backward
# compatibility with any later code in this file, but consider renaming
# to `input_path`.
input = "/aaa/bbb/ccc"  # HDFS path to the Parquet dataset (placeholder)
data = spark.read.parquet(input)
data.show(5)  # preview the first 5 rows (output pasted below)
+-------------------+------+--------------------+
| START_TIME|amount| payerCode|
+-------------------+------+--------------------+
|2019-06-28 21:04:37| 10.7|692200000XXXXXXX|
|2018-11-24 20:15:40| 19.9|602200000XXXXXXX|
|2019-06-19 12:33:14| 2.0|692200000XXXXXXX|
|2019-07-03 23:04:12| 5.27|622200000XXXXXXX|