import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;
import spark.mapreduce.com.SparkUitl_Java2;
import java.util.List;
/**
* Created by student on 2017/8/23.
*/
public class SparkSqlByText_Java {
    public static void main(String[] args) {
        final String textInput = "C:\\Users\\student\\modules\\datas\\person.txt";
        final String tableName = "person";
        System.setProperty("hadoop.home.dir", "C:\\Users\\student\\modules\\hadoop-2.6.0-cdh5.8.5");
        SparkConf conf = new SparkConf().setAppName("SparkSqlByText_Java").setMaster(SparkUitl_Java2.master);
        JavaSparkContext jsc = new JavaSparkContext(conf);
        SQLContext sqlCon = new SQLContext(jsc);
        // read the input as a plain text file
        JavaRDD<String> lines = jsc.textFile(textInput);
        // map each line to a Person
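        // assumed input format, inferred from the field parsing below: id,name,age,sex,addr
        // e.g. "1,Tom,25,M,Beijing" (the example values are hypothetical)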
        JavaRDD<Person> persons = lines.map(new Function<String, Person>() {
            @Override
            public Person call(String v1) throws Exception {
                String[] strs = v1.split(",");
                // create a fresh Person per record rather than reusing a shared field,
                // so records are not overwritten by later calls
                Person person = new Person();
                person.setId(Integer.parseInt(strs[0]));
                person.setName(strs[1]);
                person.setAge(Integer.parseInt(strs[2]));
                person.setSex(strs[3]);
                person.setAddr(strs[4]);
                return person;
            }
        });
        // create a DataFrame from the Person RDD
        DataFrame df = sqlCon.createDataFrame(persons, Person.class);
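        // note: Spark orders columns derived from a JavaBean alphabetically (addr, age, id, name, sex),
        // so rows are read back by field name below rather than by position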
        // register the DataFrame as a temporary table
        df.registerTempTable(tableName);
        // query the registered table
        String sql = "select * from " + tableName;
        DataFrame dfSql = sqlCon.sql(sql);
        JavaRDD<Row> rowRDD = dfSql.javaRDD();
        // map each Row back to a Person, reading each column by name
        JavaRDD<Person> personResult = rowRDD.map(new Function<Row, Person>() {
            @Override
            public Person call(Row v1) throws Exception {
                Person person = new Person();
                person.setId(v1.getInt(v1.fieldIndex("id")));
                person.setName(v1.getString(v1.fieldIndex("name")));
                person.setAge(v1.getInt(v1.fieldIndex("age")));
                person.setSex(v1.getString(v1.fieldIndex("sex")));
                person.setAddr(v1.getString(v1.fieldIndex("addr")));
                return person;
            }
        });
        // collect the results to the driver and print them
        List<Person> result = personResult.collect();
        System.out.println(result);
        jsc.stop();
    }
}
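
The Person JavaBean referenced above is not shown in the listing; a minimal sketch, with field names inferred from the setters used in the code, might look like this (it must be a public Serializable bean for createDataFrame to work):

public class Person implements java.io.Serializable {
    private int id;
    private String name;
    private int age;
    private String sex;
    private String addr;

    public int getId() { return id; }
    public void setId(int id) { this.id = id; }
    public String getName() { return name; }
    public void setName(String name) { this.name = name; }
    public int getAge() { return age; }
    public void setAge(int age) { this.age = age; }
    public String getSex() { return sex; }
    public void setSex(String sex) { this.sex = sex; }
    public String getAddr() { return addr; }
    public void setAddr(String addr) { this.addr = addr; }

    @Override
    public String toString() {
        return id + "," + name + "," + age + "," + sex + "," + addr;
    }
}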
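
SparkUitl_Java2 is the author's own utility class and is also not shown; for this local-file demo it only needs to supply the master URL. A hypothetical stand-in is below (the "local[*]" value is an assumption for running on a single machine):

package spark.mapreduce.com;

public class SparkUitl_Java2 {
    // assumed master URL for a local run; point this at your cluster instead if needed
    public static final String master = "local[*]";
}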