1. 环境配置
IDEA 2019
Maven项目
pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>Spark</groupId>
<artifactId>Spark</artifactId>
<version>1.0-SNAPSHOT</version>
<properties>
<scala.version>2.11.8</scala.version>
<hadoop.version>2.6.5</hadoop.version>
<spark.version>2.3.3</spark.version>
</properties>
<dependencies>
<!-- Spark core (Scala 2.11 build, matching scala.version above) -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<!-- Scala standard library -->
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>${scala.version}</version>
</dependency>
<!-- Hadoop client, needed for direct HDFS access (FileSystem / Path) -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>${hadoop.version}</version>
</dependency>
</dependencies>
</project>
2. 基于Scala语言的应用开发
package scala
import java.net.URI
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object WordCount {
  /**
   * Word count over a text file on HDFS.
   *
   * Reads `hdfs://master001:9000//wordcount.txt`, counts occurrences of each
   * space-separated word and writes the `(word, count)` pairs back to HDFS
   * under `//spark//output`. Any pre-existing output directory is removed
   * first, because `saveAsTextFile` fails if the target already exists.
   */
  def main(args: Array[String]): Unit = {
    // Run as the "hadoop" user so HDFS writes are not rejected with
    // AccessControlException (the target directory is owned by hadoop).
    System.setProperty("HADOOP_USER_NAME", "hadoop")
    val conf: SparkConf = new SparkConf()
    conf.setAppName("Local Scala Spark RDD")
    conf.setMaster("local")
    val sc: SparkContext = new SparkContext(conf)
    try {
      // One partition is enough for a small demo file.
      val fp: RDD[String] = sc.textFile("hdfs://master001:9000//wordcount.txt", 1)
      val wordList: RDD[String] = fp.flatMap(_.split(" "))
      val tupleWordList: RDD[(String, Int)] = wordList.map(word => (word, 1))
      val tupleWordGroupList: RDD[(String, Int)] = tupleWordList.reduceByKey(_ + _)
      // Delete a pre-existing output directory, if any, before saving.
      val path = new Path("hdfs://master001:9000//spark//output")
      val hdfs = FileSystem.get(
        new URI("hdfs://master001:9000//spark//output"), new Configuration()
      )
      if (hdfs.exists(path)) {
        println("delete")
        hdfs.delete(path, true)
      }
      tupleWordGroupList.saveAsTextFile("hdfs://master001:9000//spark//output")
    } finally {
      // BUG FIX: the original never stopped the SparkContext, leaking the
      // local Spark runtime; stop it even when the job throws.
      sc.stop()
    }
  }
}
3. 基于Java语言的应用开发
package test;
import java.io.IOException;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;
import javax.swing.*;
import java.io.File;
import java.net.URISyntaxException;
import java.util.Arrays;
import java.util.Iterator;
import static java.sql.DriverManager.println;
public class WordCount {
    /**
     * Recursively deletes a local file or directory tree rooted at {@code f}.
     * Safe against {@code listFiles()} returning {@code null} (plain file,
     * nonexistent path, or I/O error) — the original implementation threw a
     * NullPointerException in those cases before reaching its delete call.
     *
     * @param f root of the tree to remove
     */
    private static void deleteDir(File f){
        File[] children = f.listFiles();
        if (children != null) {
            // Directory: remove contents depth-first before the directory itself.
            for (File child : children) {
                deleteDir(child);
            }
        }
        f.delete();
    }
    /**
     * Word count over a text file on HDFS, Java API version.
     *
     * Reads {@code hdfs://master001:9000//wordcount.txt}, counts occurrences
     * of each space-separated word and writes the (word, count) pairs to
     * {@code //spark//output2}, removing a pre-existing output directory first
     * (saveAsTextFile fails if the target already exists).
     */
    public static void main(String[] args) throws URISyntaxException {
        // Run as the "hadoop" user so HDFS writes are not rejected with
        // AccessControlException (the target directory is owned by hadoop).
        System.setProperty("HADOOP_USER_NAME", "hadoop");
        SparkConf conf = new SparkConf();
        // BUG FIX: the app name was empty, making the job anonymous in the
        // Spark UI and logs.
        conf.setAppName("Local Java Spark RDD");
        conf.setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        try {
            // One partition is enough for a small demo file.
            JavaRDD<String> lineRdd = sc.textFile("hdfs://master001:9000//wordcount.txt", 1);
            // Split each line into words.
            JavaRDD<String> wordRdd = lineRdd.flatMap(new FlatMapFunction<String, String>() {
                public Iterator<String> call(String line) throws Exception {
                    String[] wordArr = line.split(" ");
                    return Arrays.asList(wordArr).iterator();
                }
            });
            // Pair each word with an initial count of 1.
            JavaPairRDD<String, Integer> wordTupleList = wordRdd.mapToPair(new PairFunction<String, String, Integer>() {
                public Tuple2<String, Integer> call(String word) throws Exception {
                    return new Tuple2<String, Integer>(word, 1);
                }
            });
            // Sum the counts per word.
            JavaPairRDD<String, Integer> wordGroupList = wordTupleList.reduceByKey(new Function2<Integer, Integer, Integer>() {
                public Integer call(Integer v1, Integer v2) throws Exception {
                    return v1 + v2;
                }
            });
            // Delete a pre-existing output directory, if any, before saving.
            Path path = new Path("hdfs://master001:9000//spark//output2");
            FileSystem hdfs = null;
            try {
                hdfs = FileSystem.get(
                        new URI("hdfs://master001:9000//spark//output2"), new Configuration()
                );
                if (hdfs.exists(path)) {
                    // BUG FIX: the original statically imported
                    // java.sql.DriverManager.println, which writes to the JDBC
                    // log writer (normally unset) — i.e. prints nothing.
                    // Use stdout explicitly.
                    System.out.println("delete");
                    hdfs.delete(path, true);
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
            wordGroupList.saveAsTextFile("hdfs://master001:9000//spark//output2");
        } finally {
            // BUG FIX: the original never stopped the JavaSparkContext,
            // leaking the local Spark runtime; stop it even when the job throws.
            sc.stop();
        }
    }
}
4. 出现的BUG
Exception in thread "main" org.apache.hadoop.security.AccessControlException: Permission denied: user=11429, access=WRITE, inode="/spark":hadoop:supergroup:drwxr-xr-x
at org.apache.hadoop.hdfs.server.namenode.FSPermissionChecker.checkFsPermission(FSPermissionChecker.java:271)
at org.apache.hadoop.hdfs.server.namenode.FSPermissionChecker.check(FSPermissionChecker.java:257)
at org.apache.hadoop.hdfs.server.namenode.FSPermissionChecker.check(FSPermissionChecker.java:238)
at org.apache.hadoop.hdfs.server.namenode.FSPermissionChecker.checkPermission(FSPermissionChecker.java:179)
at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.checkPermission(FSNamesystem.java:6547)
at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.checkPermission(FSNamesystem.java:6529)
解决方法：HDFS 目录 /spark 的属主是 hadoop 用户（权限 drwxr-xr-x），而程序以本机用户（user=11429）身份写入被拒绝。在创建 SparkContext 之前指定访问用户即可：
//指定访问用户
System.setProperty("HADOOP_USER_NAME", "hadoop")
20/07/13 17:36:47 ERROR Executor: Exception in task 0.0 in stage 0.0 (TID 0)
java.lang.UnsatisfiedLinkError: org.apache.hadoop.util.NativeCrc32.nativeComputeChunkedSums(IILjava/nio/ByteBuffer;ILjava/nio/ByteBuffer;IILjava/lang/String;JZ)V
at org.apache.hadoop.util.NativeCrc32.nativeComputeChunkedSums(Native Method)
at org.apache.hadoop.util.NativeCrc32.verifyChunkedSums(NativeCrc32.java:59)
at org.apache.hadoop.util.DataChecksum.verifyChunkedSums(DataChecksum.java:301)
at org.apache.hadoop.hdfs.RemoteBlockReader2.readNextPacket(RemoteBlockReader2.java:216)
at org.apache.hadoop.hdfs.RemoteBlockReader2.read(RemoteBlockReader2.java:146)
at org.apache.hadoop.hdfs.DFSInputStream$ByteArrayStrategy.doRead(DFSInputStream.java:734)
at org.apache.hadoop.hdfs.DFSInputStream.readBuffer(DFSInputStream.java:790)