Project Overview
//pom.xml
<properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <maven.compiler.target>1.8</maven.compiler.target>
    <maven.compiler.source>1.8</maven.compiler.source>
</properties>
<repositories>
    <repository>
        <id>maven-ali</id>
        <url>http://maven.aliyun.com/nexus/content/groups/public/</url>
        <releases>
            <enabled>true</enabled>
        </releases>
        <snapshots>
            <enabled>true</enabled>
            <updatePolicy>always</updatePolicy>
            <checksumPolicy>fail</checksumPolicy>
        </snapshots>
    </repository>
</repositories>
<dependencies>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-core_2.11</artifactId>
        <version>2.4.4</version>
    </dependency>
    <dependency>
        <groupId>org.slf4j</groupId>
        <artifactId>slf4j-simple</artifactId>
        <version>1.7.21</version>
    </dependency>
</dependencies>
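One dependency note: Spark.java below calls org.apache.commons.io.FileUtils. With spark-core 2.4.4 that class is normally available transitively through the Hadoop client libraries, but if your build cannot resolve it, declaring Commons IO explicitly is a safe fallback (the version below is an assumption, not something the original project pins):

<dependency>
    <groupId>commons-io</groupId>
    <artifactId>commons-io</artifactId>
    <version>2.4</version>
</dependency>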
//Spark.java
import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

import java.io.File;
import java.io.IOException;
import java.util.Arrays;

public class Spark {
    public static void main(String[] args) {
        // point hadoop.home.dir at a folder containing bin\winutils.exe (see the notes below)
        System.setProperty("hadoop.home.dir", new File(".").getAbsolutePath());
        // delete the output folder up front so reruns don't fail with "output directory already exists"
        try {
            FileUtils.deleteDirectory(new File("output"));
        } catch (IOException e) {
            e.printStackTrace();
        }
        SparkConf sparkConf = new SparkConf().setMaster("local").setAppName("Word Counter");
        JavaSparkContext sparkContext = new JavaSparkContext(sparkConf);
        JavaRDD<String> file = sparkContext.textFile("test.txt");
        // Spark 2.x flatMap takes a function that must return an Iterator, hence the .iterator()
        JavaRDD<String> wordsFromFile = file.flatMap(content -> Arrays.asList(content.split(" ")).iterator());
        // typing the pair RDD as <String, Integer> avoids the raw-type casts (int) x + (int) y
        JavaPairRDD<String, Integer> countData = wordsFromFile
                .mapToPair(t -> new Tuple2<>(t, 1))
                .reduceByKey((x, y) -> x + y);
        countData.saveAsTextFile("output");
        sparkContext.close();
    }
}
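saveAsTextFile writes part-0000N files under output/, one (word,count) tuple per line. To eyeball the counts without touching the file system, the last line could instead collect and print (a sketch for local runs only, since collect() pulls all results to the driver):

// instead of countData.saveAsTextFile("output"):
countData.collect().forEach(pair ->
        System.out.println(pair._1() + " : " + pair._2()));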
//test.txt
Hello, my name is am and I am Dev . Dev is a great website to read
great lessons about Java, Big Data, Python and many more Programming languages.
Big Data lessons are difficult to find but at Dev , you can find some excellent
pieces of lessons written on Big Data.
On Windows you need winutils.exe; otherwise the job fails with:
java.io.IOException: Could not locate executable null\bin\winutils.exe in the Hadoop binaries
The cause: Spark (via Hadoop) shells out for some file operations, and that shell layer is OS-dependent. On Windows the system APIs are reached through winutils.exe, while on Linux the code above runs as-is.
For background, see: What exactly is winutils and why do we need it?
A winutils.exe that tested as working on 64-bit Windows 10:
Could not locate executable null\bin\winutils.exe in the Hadoop binaries. spark Eclipse on windows 7
winutils.exe
If the links above are unavailable: winutils.exe for Spark development on 64-bit Windows 10 (CSDN)
Place the binary at ANY_DIRECTORY/bin/winutils.exe and point hadoop.home.dir at ANY_DIRECTORY:
System.setProperty("hadoop.home.dir", "ANY_DIRECTORY");
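Since only Windows needs this workaround, a small OS guard keeps the same code portable. The sketch below assumes winutils.exe sits under bin\ in the working directory, and the class and method names are mine, not from the original. Setting the HADOOP_HOME environment variable to ANY_DIRECTORY before launching the JVM also works, because Hadoop falls back to it when hadoop.home.dir is unset.

import java.io.File;

public class HadoopHome {
    // call once, before creating the JavaSparkContext
    static void configure() {
        // Linux/macOS can shell out directly, so winutils.exe is not needed there
        if (System.getProperty("os.name").toLowerCase().contains("win")) {
            // expects bin\winutils.exe under the current working directory (assumption)
            System.setProperty("hadoop.home.dir", new File(".").getAbsolutePath());
        }
    }
}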
Iterator error in flatMap:
In Spark 2.x, the function passed to file.flatMap must return an Iterator rather than an Iterable, so the list needs a trailing list.iterator(). Word-count code copied verbatim from older tutorials therefore fails to compile.
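For comparison, this is the one-line difference that breaks old examples (the Spark 1.x form is commented out; against spark-core 2.4.4 it no longer compiles):

// Spark 1.x: FlatMapFunction.call returned an Iterable, so this used to compile:
// JavaRDD<String> words = file.flatMap(line -> Arrays.asList(line.split(" ")));
// Spark 2.x: FlatMapFunction.call returns an Iterator, so end with .iterator():
JavaRDD<String> words = file.flatMap(line -> Arrays.asList(line.split(" ")).iterator());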