1. Data Deduplication
First, download IDEA and the JDK; there are plenty of guides online for setting up the environment.
On the host machine, download the same Hadoop version as the one on the virtual machine and replace its bin directory; this part is covered in detail in:
How to connect IDEA to Hadoop
You only need the part about connecting to HDFS; the testing after the connection can be skipped.
Note: once connected, a circular refresh-style icon appears in the upper-right corner of the editor; click it to import the dependencies (it shows up roughly at the position marked in the figure; I forgot to take a screenshot at the time).
Code reference
My project structure
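In case that reference ever becomes unavailable, the core of the deduplication job is small enough to sketch here. This is my own minimal version, not necessarily identical to the referenced code, and the class name Dedup is arbitrary:

import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

public class Dedup {
    // Each whole input line becomes the key; the value carries no information.
    public static class DedupMapper extends Mapper<Object, Text, Text, Text> {
        private static final Text EMPTY = new Text("");
        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            context.write(value, EMPTY);
        }
    }
    // Identical lines are grouped together by the shuffle, so writing each key once removes duplicates.
    public static class DedupReducer extends Reducer<Text, Text, Text, Text> {
        public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            context.write(key, new Text(""));
        }
    }
}

The driver has the same shape as the calcGPA main method further down, except that the output value class is Text.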
Upload file1.txt and file2.txt on the virtual machine; after the previous experiment you should know how to create a directory and upload files to the cluster.
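If you prefer to do the upload from IDEA rather than from the virtual machine's shell, the HDFS Java API can do the same thing. A sketch (the local Windows paths are made up; the NameNode address is the hdfs://master:9000 used throughout this post):

import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class UploadInput {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(URI.create("hdfs://master:9000"), conf);
        fs.mkdirs(new Path("/input1"));                        // create the input directory
        fs.copyFromLocalFile(new Path("D:/data/file1.txt"),    // hypothetical local paths
                new Path("/input1/file1.txt"));
        fs.copyFromLocalFile(new Path("D:/data/file2.txt"),
                new Path("/input1/file2.txt"));
        fs.close();
    }
}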
Note that the code has been modified; the input and output paths are specified like this:
FileInputFormat.addInputPath(job, new Path("hdfs://master:9000/input1/"));
FileOutputFormat.setOutputPath(job, new Path("hdfs://master:9000/output1/"));
Run it, and you get an error:
Permission denied: user=root, access=WRITE, inode="/user":hdfs:supergroup:drwxr-xr-x
Add the following to hdfs-site.xml in the Hadoop configuration:
<property>
    <name>dfs.permissions</name>
    <value>false</value>
</property>
I forget whether it's the hdfs-site.xml used on the IDEA side or the one on the virtual machine that needs this; changing both does no harm anyway. (Permission checking is enforced by the NameNode, so the cluster-side file on the virtual machine should be the one that actually matters, and the NameNode has to be restarted for the change to take effect. In newer Hadoop versions the property is called dfs.permissions.enabled; the old name still works but is deprecated.)
Now it works.
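As an aside, an alternative workaround that I did not test in this experiment is to leave permission checking on and instead have the client identify itself as the directory owner (hdfs in the error above) before the job is set up:

// Pretend to be the hdfs user so that writes under /user are allowed.
// This only applies to clusters without Kerberos, as in this experiment.
System.setProperty("HADOOP_USER_NAME", "hdfs");
Configuration conf = new Configuration();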
2. Data Sorting
Code reference
My project structure
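Again, in case the link goes stale: the usual version of this exercise reads one integer per line and outputs rank and value in ascending order. A minimal sketch of the mapper and reducer (my own version, not necessarily what the referenced code does):

import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

public class Sort {
    // The number itself becomes the key, so the shuffle sorts the data for us.
    public static class SortMapper extends Mapper<Object, Text, IntWritable, IntWritable> {
        private static final IntWritable ONE = new IntWritable(1);
        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString().trim();
            if (!line.isEmpty()) {
                context.write(new IntWritable(Integer.parseInt(line)), ONE);
            }
        }
    }
    // With a single reduce task the keys arrive in ascending order; a running
    // counter turns that order into the rank column of the output.
    public static class SortReducer extends Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {
        private int rank = 1;
        public void reduce(IntWritable key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            for (IntWritable ignored : values) {
                context.write(new IntWritable(rank), key);
                rank++;
            }
        }
    }
}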
3. Average Score
package lab1.task3;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class calcGPA {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String fileAddress1 = "hdfs://master:9000/input3/";
        String fileAddress2 = "hdfs://master:9000/";
        // Instead of reading paths from the command line, the three subject files
        // and the output directory are hard-coded here.
        //String[] otherArgs = (new GenericOptionsParser(conf, args)).getRemainingArgs();
        String[] otherArgs = new String[]{fileAddress1 + "math.txt", fileAddress1 + "china.txt",
                fileAddress1 + "english.txt", fileAddress2 + "output3"};
        if (otherArgs.length < 2) {
            System.err.println("Usage: calcGPA <in> [<in>...] <out>");
            System.exit(2);
        }

        Job job = Job.getInstance(conf, "calc GPA");
        job.setJarByClass(calcGPA.class);
        job.setMapperClass(calcGPA.TokenizerMapper.class);
        // Careful: reusing the reducer as a combiner only happens to work here because
        // each map task sees at most one score per student; in general, averaging
        // partial averages does not give the overall average.
        job.setCombinerClass(calcGPA.IntSumReducer.class);
        job.setReducerClass(calcGPA.IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // Every path except the last one is an input; the last one is the output directory.
        for (int i = 0; i < otherArgs.length - 1; ++i) {
            FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
        }
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    // Sums every score seen for a student and writes the integer average.
    public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            int count = 0;
            for (IntWritable val : values) {
                sum += val.get();
                count++;
            }
            int average = sum / count;   // integer division, as the exercise expects
            context.write(key, new IntWritable(average));
        }
    }

    // Splits each input line into "name score" and emits (name, score).
    public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            StringTokenizer itr = new StringTokenizer(value.toString(), "\n");
            while (itr.hasMoreTokens()) {
                StringTokenizer iitr = new StringTokenizer(itr.nextToken());
                String name = iitr.nextToken();
                String score = iitr.nextToken();
                context.write(new Text(name), new IntWritable(Integer.parseInt(score)));
            }
        }
    }
}
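One gotcha when re-running any of these jobs: FileOutputFormat refuses to start if the output directory already exists, so a second run against output3 fails with an "Output directory ... already exists" error. Either change the output path each time or delete the old directory first, for example with a snippet like this near the top of main (it needs the org.apache.hadoop.fs.FileSystem and java.net.URI imports):

FileSystem fs = FileSystem.get(URI.create("hdfs://master:9000"), conf);
Path out = new Path("/output3");
if (fs.exists(out)) {
    fs.delete(out, true);   // true = delete recursively
}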
4. Single-Table Join
package lab1.task4;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class STjoin {

    public static int time = 0;

    public static class Map extends Mapper<Object, Text, Text, Text> {
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            String relationtype;
            String line = value.toString();
            System.out.println("mapper...............");

            // Split the line into the child and parent columns.
            int i = 0;
            String[] values = new String[10];
            StringTokenizer itr = new StringTokenizer(line);
            while (itr.hasMoreTokens()) {
                values[i] = itr.nextToken();
                i = i + 1;
            }
            System.out.println("child:" + values[0] + " parent:" + values[1]);

            // Skip the header line, whose first column is literally "child"
            // (compareTo returns 0 for the header, non-zero otherwise).
            if (values[0].compareTo("child") != 0) {
                // Emit the record twice: under the parent's name tagged "1" (the value is a child),
                // and under the child's name tagged "2" (the value is a parent).
                relationtype = "1";
                context.write(new Text(values[1]), new Text(relationtype + "+" + values[0]));
                System.out.println("key:" + values[1] + " value: " + relationtype + "+" + values[0]);
                relationtype = "2";
                context.write(new Text(values[0]), new Text(relationtype + "+" + values[1]));
                System.out.println("key:" + values[0] + " value: " + relationtype + "+" + values[1]);
            }
        }
    }

    public static class Reduce extends Reducer<Text, Text, Text, Text> {
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            System.out.println("reduce.....................");
            System.out.println("key:" + key + " values:" + values);

            // Write the header of the output table exactly once.
            if (time == 0) {
                context.write(new Text("grandchild"), new Text("grandparent"));
                time++;
            }

            int grandchildnum = 0;
            String[] grandchild = new String[10];
            int grandparentnum = 0;
            String[] grandparent = new String[10];
            String name;

            // Sort the grouped values into grandchildren (tag '1') and grandparents (tag '2').
            for (Text val : values) {
                String record = val.toString();
                System.out.println("record: " + record);
                char relationtype = record.charAt(0);
                name = record.substring(2);
                System.out.println("name: " + name);
                if (relationtype == '1') {
                    grandchild[grandchildnum] = name;
                    grandchildnum++;
                } else {
                    grandparent[grandparentnum] = name;
                    grandparentnum++;
                }
            }

            // Pair every grandchild with every grandparent found under this key.
            if (grandparentnum != 0 && grandchildnum != 0) {
                for (int m = 0; m < grandchildnum; m++) {
                    for (int n = 0; n < grandparentnum; n++) {
                        context.write(new Text(grandchild[m]), new Text(grandparent[n]));
                        System.out.println("grandchild: " + grandchild[m] + " grandparent: " + grandparent[n]);
                    }
                }
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "single table join");
        job.setJarByClass(STjoin.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path("hdfs://master:9000/input4/"));
        FileOutputFormat.setOutputPath(job, new Path("hdfs://master:9000/output4/"));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
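To see how the "1+" / "2+" tags drive the join, take a hypothetical input4 (the data is invented for illustration; the first line is the header that the mapper skips):

child parent
Tom Lucy
Lucy Mary

For "Tom Lucy" the mapper emits key Lucy with value 1+Tom and key Tom with value 2+Lucy; for "Lucy Mary" it emits key Mary with value 1+Lucy and key Lucy with value 2+Mary. The reducer for key Lucy therefore sees both a grandchild list [Tom] (tag 1) and a grandparent list [Mary] (tag 2), and the nested loops at the end write the pair Tom Mary: Tom's grandparent is Mary.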