1. Creating a DataFrame via JDBC
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.DataFrameReader;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;
public class JDBC2MySQL {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf();
        conf.setAppName("JDBC2MySQL").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        SQLContext sqlContext = new SQLContext(sc);
        /* 1. format("jdbc") tells Spark SQL that the data source is accessed through JDBC;
         *    the JDBC backend is usually a database such as MySQL or Oracle.
         * 2. The connection information is passed in through DataFrameReader's option method.
         * 3. url: the JDBC connection URL of the database.
         * 4. dbtable: the table to load from the database.
         * 5. driver: the fully qualified class name of the JDBC driver that Spark SQL uses to access the database.
         * 6. The JDBC driver jar can be placed in Spark's library directory, or specified when the job is
         *    submitted with spark-submit (it is not needed when compiling and packaging the code).
         */
        DataFrameReader reader = sqlContext.read().format("jdbc");
        reader.option("url", "jdbc:mysql://SparkMaster:3306");
        reader.option("dbtable", "dt_spark");
        reader.option("driver", "com.mysql.jdbc.Driver");
        reader.option("user", "root");
        reader.option("password", "123");
        DataFrame mysqlDataSourceDF = reader.load();
        // The same reader can be pointed at another table and loaded again
        reader.option("dbtable", "dthadoop");
        DataFrame DFFromMySQL = reader.load();
        // The options can also be passed in all at once as a Map
        Map<String, String> options = new HashMap<String, String>();
        options.put("url", "jdbc:mysql://SparkMaster:3306/testdb");
        options.put("dbtable", "student_infos");
        options.put("user", "root");
        options.put("password", "123");
        DataFrame studentInfosDF = sqlContext.read().format("jdbc").options(options).load();
        options.put("dbtable", "student_scores");
        DataFrame studentScoresDF = sqlContext.read().format("jdbc").options(options).load();
        List<Row> listRow = studentScoresDF.javaRDD().collect();
        for (Row row : listRow) {
            System.out.println(row);
        }
    }
}
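The same connection options can be used to write a DataFrame back to MySQL through the DataFrameWriter API (available from Spark 1.4 onward). Below is a minimal sketch continuing from studentScoresDF above; the target table student_scores_copy is a hypothetical name, and java.util.Properties and org.apache.spark.sql.SaveMode need to be imported:

Properties connProps = new Properties();
connProps.setProperty("user", "root");
connProps.setProperty("password", "123");
studentScoresDF.write()
        .mode(SaveMode.Append) // append to the table instead of failing if it already exists
        .jdbc("jdbc:mysql://SparkMaster:3306/testdb", "student_scores_copy", connProps); // hypothetical table name

When the job is launched with spark-submit, the MySQL driver jar mentioned in the comments above is typically passed with the --jars option.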
2. Creating a DataFrame from a case class (JavaBean) by reflection
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.Row;
/*
 * Convert an RDD into a DataFrame by reflection
 */
public class CaseClassDataFrame {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("RDD2DataFrame").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        SQLContext sqlContext = new SQLContext(sc); // obtain the SQLContext
        JavaRDD<String> lines = sc.textFile("C://Users//Jason Shu//Desktop//persons.txt");
        // Convert the RDD<String> into an RDD<Person>
        JavaRDD<Person> persons = lines.map(new Function<String, Person>() {
            public Person call(String line) throws Exception {
                String[] splited = line.split(" ");
                Person p = new Person();
                p.setId(Integer.valueOf(splited[0].trim()));
                p.setName(splited[1]);
                p.setAge(Integer.valueOf(splited[2].trim()));
                return p;
            }
        });
        // Create a DataFrame from the RDD; createDataFrame takes a JavaRDD<?> and the bean's Class<?>
        DataFrame df = sqlContext.createDataFrame(persons, Person.class);
        df.registerTempTable("persons"); // register a temporary table
        // Project the columns explicitly so their positions are well defined (0 = id, 1 = name, 2 = age);
        // with "select *" the column order of a bean-derived DataFrame may not match the declaration order
        DataFrame bigData = sqlContext.sql("select id, name, age from persons where age >= 6");
        JavaRDD<Row> bigDataRDD = bigData.javaRDD(); // convert the DataFrame back to an RDD
        // bigDataRDD is an RDD<Row> and result is an RDD<Person>, so this map is essentially a type conversion
        JavaRDD<Person> result = bigDataRDD.map(new Function<Row, Person>() {
            public Person call(Row row) throws Exception {
                Person p = new Person();
                p.setId(row.getInt(0));
                p.setName(row.getString(1));
                p.setAge(row.getInt(2));
                return p;
            }
        });
        List<Person> personList = result.collect();
        for (Person p : personList) {
            System.out.println(p);
        }
    }
}
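The parsing code above assumes persons.txt contains one person per line, with the id, name, and age separated by single spaces. An illustrative input (not from the original source) would be:

1 Mike 25
2 John 30
3 Anna 6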
The Person class (it implements java.io.Serializable so that Spark can serialize Person objects across tasks):
import java.io.Serializable;
public class Person implements Serializable {
    private static final long serialVersionUID = 1L;
    private int id;
    private String name;
    private int age;
    @Override
    public String toString() {
        return "Person [id=" + id + ", name=" + name + ", age=" + age + "]";
    }
    public int getId() {
        return id;
    }
    public void setId(int id) {
        this.id = id;
    }
    public String getName() {
        return name;
    }
    public void setName(String name) {
        this.name = name;
    }
    public int getAge() {
        return age;
    }
    public void setAge(int age) {
        this.age = age;
    }
}
3. Creating a DataFrame from JSON
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.DataFrame;
public class JSONDataFrame {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("DataFrame").setMaster("spark://SparkMaster:7077");
        JavaSparkContext sc = new JavaSparkContext(conf);
        SQLContext sqlContext = new SQLContext(sc);
        // A DataFrame can be thought of, informally, as a table
        DataFrame dataFrame = sqlContext.read().json("hdfs://SparkMaster:9000/data/people.json");
        dataFrame.show();
    }
}
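Note that sqlContext.read().json() expects each line of the input file to be a complete, self-contained JSON object (the JSON Lines format) rather than one large pretty-printed JSON array. A people.json in the style of the file shipped with the Spark examples looks like this:

{"name":"Michael"}
{"name":"Andy", "age":30}
{"name":"Justin", "age":19}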
4. Creating a DataFrame from Parquet
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;
public class ParquetDataFrame {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf();
        conf.setAppName("ParquetDataFrame").setMaster("spark://SparkMaster:7077");
        JavaSparkContext sc = new JavaSparkContext(conf);
        SQLContext sqlContext = new SQLContext(sc);
        DataFrame df = sqlContext.read().parquet("/input/people.parquet");
        df.registerTempTable("users");
        DataFrame result = sqlContext.sql("select name from users");
        List<Row> listRow = result.javaRDD().collect();
        for (Row row : listRow) {
            System.out.println(row);
        }
    }
}
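Going in the other direction, a DataFrame can also be saved in Parquet format through the DataFrameWriter API (available from Spark 1.4 onward). A minimal sketch continuing from the result DataFrame above; the output path is an assumption, not from the original, and org.apache.spark.sql.SaveMode needs to be imported:

result.write()
      .mode(SaveMode.Overwrite)                 // replace the output directory if it already exists
      .parquet("/output/people_names.parquet"); // hypothetical output path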
5. Creating a DataFrame by specifying the schema programmatically
import java.util.ArrayList;
import java.util.List;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;
public class SchemaDataFrame {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf();
        conf.setAppName("SchemaDataFrame").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        SQLContext sqlContext = new SQLContext(sc);
        // sc.textFile(path) also has a two-argument overload, textFile(path, minPartitions),
        // which controls how many partitions the input is split into
        JavaRDD<String> lines = sc.textFile("C://Users//Jason Shu//Desktop");
        // Convert the JavaRDD<String> into a JavaRDD<Row>
        JavaRDD<Row> personsRDD = lines.map(new Function<String, Row>() {
            public Row call(String line) throws Exception {
                String[] splited = line.split(",");
                return RowFactory.create(Integer.valueOf(splited[0]), splited[1], Integer.valueOf(splited[2]));
            }
        });
        // Build the schema: a StructType composed of StructFields
        List<StructField> structFields = new ArrayList<StructField>();
        structFields.add(DataTypes.createStructField("id", DataTypes.IntegerType, true));
        structFields.add(DataTypes.createStructField("name", DataTypes.StringType, true));
        structFields.add(DataTypes.createStructField("age", DataTypes.IntegerType, true));
        StructType structType = DataTypes.createStructType(structFields);
        // Apply the schema to the RDD<Row> to obtain a DataFrame
        DataFrame personsDF = sqlContext.createDataFrame(personsRDD, structType);
        personsDF.show();
    }
}
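As in the reflection-based example, the resulting DataFrame can be registered as a temporary table and queried with SQL. A minimal sketch continuing from personsDF above; the table name and the age filter are illustrative, not from the original:

personsDF.registerTempTable("persons_schema");
DataFrame adults = sqlContext.sql("select id, name from persons_schema where age >= 18");
for (Row row : adults.javaRDD().collect()) {
    System.out.println(row);
}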