目录结构
pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor for the WordCount project: declares the Hadoop dependencies and the Java 1.8 / UTF-8 compiler settings. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.zdy</groupId>
<artifactId>WordCount</artifactId>
<version>1.0-SNAPSHOT</version>
<properties>
<hadoop.version>2.8.5</hadoop.version>
<hadoop-core.version>1.2.1</hadoop-core.version>
<!-- Compile for Java 1.8 (source and target) and read source files as UTF-8 -->
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
</properties>
<!-- Hadoop dependencies -->
<!-- NOTE(review): hadoop-core 1.2.1 and hadoop-common 2.8.5 come from different Hadoop major lines; confirm that mixing them on one classpath is intended. -->
<dependencies>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-core</artifactId>
<version>${hadoop-core.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>${hadoop.version}</version>
</dependency>
</dependencies>
<!-- Additional repository labelled "apache" (the original comment calls it a mirror source) -->
<!-- NOTE(review): this URL uses plain http and looks like the Maven project site rather than an artifact repository; verify it actually resolves artifacts. -->
<repositories>
<repository>
<id>apache</id>
<url>http://maven.apache.org</url>
</repository>
</repositories>
</project>
WordCount.java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.util.StringTokenizer;
public class WordCount {
public static class TokenizerMapper
extends Mapper<Object, Text, Text, IntWritable> {
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
String regEx="[\n`~!@#$%^&*()+=|{}':;'\",\\[\\].<>/?~!@#¥%……&*()——+|{}【】‘;:”“’。, 、?]";
public void map(Object key, Text value, Context context
) throws IOException, InterruptedException {
StringTokenizer itr = new StringTokenizer(value.toString());
while (itr.hasMoreTokens()) {
word.set(itr.nextToken());
String temp = word.toString().replaceAll(regEx,"");//去除标点符号
Text new_word= new Text(temp.toLowerCase());//忽略大小写
context.write(new_word, one);
}
}
}
public static class IntSumReducer
extends Reducer<Text, IntWritable, Text, IntWritable> {
private IntWritable result = new IntWritable();
public void reduce(Text key, Iterable<IntWritable> values,
Context context
) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
result.set(sum);
context.write(key, result);
}
}
public static void getWordCount(String index) throws Exception {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "word count");
job.setJarByClass(WordCount.class);
job.setMapperClass(TokenizerMapper.class);
job.setCombinerClass(IntSumReducer.class);
job.setReducerClass(IntSumReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job, new Path("./input/file"+index+".txt"));
FileOutputFormat.setOutputPath(job, new Path("./output/file"+index+"/"));
//System.exit(job.waitForCompletion(true) ? 0 : 1);
job.waitForCompletion(true);
}
}
bow.java
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
/**
 * Builds a bag-of-words vector for each of the ten input files: the frequency
 * of every word in a fixed word list, taken from the MapReduce word-count output.
 */
public class bow {
    /** The fixed vocabulary; one vector component per entry, in this order. */
    private final static String[] top100Word = { "the", "be", "to", "of", "and", "a", "in", "that", "have", "i",
            "it", "for", "not", "on", "with", "he", "as", "you", "do", "at", "this", "but", "his", "by", "from", "they",
            "we", "say", "her", "she", "or", "an", "will", "my", "one", "all", "would", "there", "their", "what", "so",
            "up", "out", "if", "about", "who", "get", "which", "go", "me", "when", "make", "can", "like", "time", "no",
            "just", "him", "know", "take", "people", "into", "year", "your", "good", "some", "could", "them", "see",
            "other", "than", "then", "now", "look", "only", "come", "its", "over", "think", "also", "back", "after",
            "use", "two", "how", "our", "work", "first", "well", "way", "even", "new", "want", "because", "any",
            "these", "give", "day", "most", "us" };

    /**
     * Runs the word count for file01..file10, derives one vector per file and
     * writes them, one per line, to {@code ./output/bow_result.txt}. Does
     * nothing if that result file already exists.
     *
     * @param args unused
     * @throws Exception if a word-count job fails or a file cannot be read or written
     */
    public static void main(String[] args) throws Exception {
        File file = new File("./output/bow_result.txt");
        if (file.exists()) {
            System.out.println("bow_result already exists, see output");
            return;
        }
        // Count word frequencies with MapReduce.
        for (int i = 1; i <= 10; i++) {
            WordCount.getWordCount(String.format("%02d", i));
        }
        // Look up the frequency of each vocabulary word per file.
        List<StringBuffer> list = new ArrayList<>();
        for (int i = 1; i <= 10; i++) {
            list.add(generateVector(String.format("%02d", i)));
        }
        // Write the result; the writer is opened only after every vector was built
        // so that a failure above does not leave an empty result file behind.
        try (BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream(file), StandardCharsets.UTF_8))) {
            for (StringBuffer sb : list) {
                bw.write(sb.toString());
                bw.newLine();
            }
        }
    }

    /**
     * Reads {@code ./output/file<index>/part-r-00000} and returns the line
     * {@code file<index>.txt<TAB>c1,c2,...} where {@code ck} is the count of the
     * k-th vocabulary word (0 if the word does not occur). The line is also
     * printed to standard output.
     *
     * @param index two-digit file number as used in the file names, e.g. {@code "01"}
     * @return the formatted vector line
     * @throws IOException if the word-count output cannot be read or contains a
     *                     line without a tab-separated count
     */
    public static StringBuffer generateVector(String index) throws IOException {
        String path = "./output/file" + index + "/part-r-00000";
        HashMap<String, Integer> counts = new HashMap<>();
        try (BufferedReader br = new BufferedReader(new InputStreamReader(
                new FileInputStream(path), StandardCharsets.UTF_8))) {
            String line;
            while ((line = br.readLine()) != null) {
                // Each line is "<word>TAB<count>".
                String[] fields = line.split("\t");
                if (fields.length < 2) {
                    throw new IOException("Malformed word-count line in " + path + ": " + line);
                }
                counts.put(fields[0], Integer.parseInt(fields[1]));
            }
        }
        StringBuffer sb = new StringBuffer("file" + index + ".txt\t");
        for (int i = 0; i < top100Word.length; i++) {
            if (i > 0) {
                sb.append(',');
            }
            sb.append(counts.getOrDefault(top100Word[i], 0));
        }
        System.out.println(sb);
        return sb;
    }
}
file01.txt 525,62,269,276,288,253,173,127,79,298,163,57,69,31,84,123,74,151,29,67,36,61,122,38,51,16,37,7,48,71,23,38,38,98,30,33,22,35,9,34,29,33,29,27,11,30,4,62,4,55,35,8,10,6,10,17,15,27,22,7,4,33,2,50,10,17,27,10,17,14,13,37,19,9,9,13,7,8,11,5,9,7,5,11,23,13,5,3,12,6,7,4,2,5,6,4,4,5,15,8
file02.txt 543,64,284,250,340,275,180,184,89,298,168,104,69,51,88,187,102,150,33,74,44,73,111,24,43,34,76,12,14,14,27,39,24,71,31,47,51,51,10,50,49,32,32,24,25,24,6,61,11,71,38,11,17,12,19,32,22,48,20,8,0,20,3,49,15,23,32,22,27,20,23,41,15,6,18,23,16,24,9,4,10,19,6,20,11,26,8,18,30,12,5,6,5,2,19,5,6,13,13,30
file03.txt 609,57,275,290,320,268,148,212,94,265,196,72,90,41,86,177,101,149,36,86,48,69,158,34,54,24,43,12,54,41,25,30,24,58,38,32,38,45,6,42,49,22,32,35,21,25,8,78,14,50,34,1,10,8,16,46,8,65,21,6,4,20,2,35,5,36,33,15,27,26,14,36,19,6,23,15,2,18,20,8,17,7,3,17,15,12,7,7,13,14,8,2,1,2,10,9,4,4,6,19
file04.txt 658,64,267,325,344,245,216,168,102,292,169,62,59,37,84,177,95,127,34,75,52,60,153,39,73,22,44,5,20,20,29,33,31,112,39,43,31,58,19,39,42,32,31,22,16,27,3,100,8,61,32,6,22,12,20,44,5,51,13,12,1,14,5,25,4,38,27,18,23,8,18,31,26,2,10,32,17,19,16,4,13,8,4,15,14,18,3,13,23,14,9,4,4,2,9,11,10,14,6,25
file05.txt 600,59,250,298,283,306,186,170,103,245,161,67,54,43,88,180,92,132,19,63,52,69,136,43,52,11,41,6,33,30,11,35,18,101,41,32,26,58,10,34,37,32,38,27,15,28,10,85,7,43,29,6,29,5,10,57,9,57,19,7,0,29,0,24,10,29,25,7,26,15,18,35,27,7,13,14,14,13,13,10,17,7,3,15,11,8,2,5,22,13,7,1,3,1,13,8,4,17,7,14
file06.txt 581,51,260,288,298,264,182,157,87,276,155,87,46,33,85,114,68,177,21,81,63,68,103,44,46,12,65,9,55,47,13,26,31,123,43,34,34,47,10,40,41,27,35,23,21,34,11,92,11,69,21,4,19,10,18,38,18,36,23,11,5,44,3,43,12,22,18,14,28,14,14,42,38,5,16,10,20,21,18,4,16,5,2,14,12,33,3,6,28,12,5,3,4,2,12,9,8,6,5,19
file07.txt 635,56,267,286,328,308,168,186,74,352,201,65,49,41,76,120,78,128,22,96,47,60,88,31,64,17,95,9,36,30,15,35,19,130,44,41,35,57,6,33,49,38,37,23,14,19,6,101,14,70,35,7,15,13,23,33,11,21,9,9,2,33,0,44,15,27,36,8,26,13,14,38,20,7,17,21,18,18,17,6,15,9,5,21,16,30,7,5,10,11,11,5,4,1,17,11,4,4,6,26
file08.txt 531,73,282,242,290,268,183,185,99,288,154,74,54,40,66,107,81,135,19,66,59,58,92,36,44,25,50,15,73,64,13,39,31,87,39,48,22,53,7,35,50,28,27,19,16,23,5,53,6,66,36,12,14,11,19,46,19,37,12,8,2,26,4,30,14,26,16,19,17,13,13,44,20,6,18,11,4,25,26,3,16,17,1,6,14,25,4,12,16,13,4,3,2,4,11,9,7,9,8,14
file09.txt 530,77,317,221,274,252,181,199,132,404,197,86,85,33,103,197,78,198,42,83,47,78,142,30,33,16,24,16,46,62,20,32,29,122,31,42,35,44,5,48,58,30,33,33,15,39,7,81,12,86,42,11,17,8,8,35,7,64,13,7,2,36,1,59,10,14,42,16,22,17,22,35,30,10,21,8,6,18,28,5,12,9,1,11,14,29,3,8,12,10,12,1,1,3,18,7,7,2,10,10
file10.txt 575,82,283,289,301,239,196,161,68,276,143,75,80,41,112,82,91,195,44,88,81,56,54,46,50,25,56,3,50,47,97,30,24,104,34,51,33,54,15,32,43,30,25,46,20,25,2,61,11,63,21,9,25,9,7,48,11,27,18,4,7,20,5,45,11,13,30,17,18,21,10,26,17,15,17,15,18,13,16,4,13,11,13,13,10,15,45,9,19,11,4,2,2,2,49,10,9,11,15,20
dist.java
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.*;
/**
 * Lists the words starting with "ex" and their frequencies, per input file and
 * in total, based on the MapReduce word-count output of file01..file10.
 */
public class dist {
    /** Only words beginning with this prefix are reported. */
    private static final String WORD_PREFIX = "ex";
    /** Number of input files, numbered 01..FILE_COUNT. */
    private static final int FILE_COUNT = 10;
    private static final String RESULT_PATH = "./output/dist_result.txt";

    /**
     * Writes one line per input file plus a final "total" line to
     * {@code ./output/dist_result.txt}. Does nothing if that file already exists.
     *
     * @param args unused
     * @throws IOException if a word-count output cannot be read or the result cannot be written
     */
    public static void main(String[] args) throws IOException {
        if (new File(RESULT_PATH).exists()) {
            System.out.println("dist_result already exists, see output");
            return;
        }
        List<StringBuffer> lines = new ArrayList<>();
        for (int i = 1; i <= FILE_COUNT; i++) {
            lines.add(generateVector(String.format("%02d", i)));
        }
        lines.add(generateTotalVector());
        // Open the writer only after all lines were built so that a failure above
        // does not leave an empty result file behind.
        try (BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream(RESULT_PATH), StandardCharsets.UTF_8))) {
            for (StringBuffer line : lines) {
                bw.write(line.toString());
                bw.newLine();
            }
        }
    }

    /**
     * Returns the line {@code file<index>.txt<TAB>w1,c1,w2,c2,...} for the
     * "ex" words of one file, ordered by descending count and then by word.
     * The line is also printed to standard output. If no word matches, the
     * line consists of the label and the tab only.
     *
     * @param index two-digit file number as used in the file names, e.g. {@code "01"}
     * @return the formatted line
     * @throws IOException if the word-count output cannot be read or a matching
     *                     line has no tab-separated count
     */
    public static StringBuffer generateVector(String index) throws IOException {
        Map<String, Integer> counts = new HashMap<>();
        addPrefixedCounts(index, counts);
        return formatVector("file" + index + ".txt", counts);
    }

    /**
     * Returns the line {@code total<TAB>w1,c1,w2,c2,...} with the counts of the
     * "ex" words summed over all input files, ordered by descending count and
     * then by word. The line is also printed to standard output.
     *
     * @return the formatted line
     * @throws IOException if a word-count output cannot be read or a matching
     *                     line has no tab-separated count
     */
    public static StringBuffer generateTotalVector() throws IOException {
        Map<String, Integer> totals = new HashMap<>();
        for (int i = 1; i <= FILE_COUNT; i++) {
            addPrefixedCounts(String.format("%02d", i), totals);
        }
        return formatVector("total", totals);
    }

    /** Adds the counts of all prefixed words in one word-count output file to {@code counts}. */
    private static void addPrefixedCounts(String index, Map<String, Integer> counts) throws IOException {
        String path = "./output/file" + index + "/part-r-00000";
        try (BufferedReader br = new BufferedReader(new InputStreamReader(
                new FileInputStream(path), StandardCharsets.UTF_8))) {
            String line;
            while ((line = br.readLine()) != null) {
                // Each line is "<word>TAB<count>".
                String[] fields = line.split("\t");
                if (!fields[0].startsWith(WORD_PREFIX)) {
                    continue;
                }
                if (fields.length < 2) {
                    throw new IOException("Malformed word-count line in " + path + ": " + line);
                }
                counts.merge(fields[0], Integer.parseInt(fields[1]), Integer::sum);
            }
        }
    }

    /** Formats {@code label<TAB>word,count,...} sorted by descending count, then word; prints and returns it. */
    private static StringBuffer formatVector(String label, Map<String, Integer> counts) {
        List<Map.Entry<String, Integer>> entries = new ArrayList<>(counts.entrySet());
        entries.sort((a, b) -> {
            int byCount = b.getValue().compareTo(a.getValue());
            return byCount != 0 ? byCount : a.getKey().compareTo(b.getKey());
        });
        StringBuffer sb = new StringBuffer(label).append('\t');
        // Separator-first joining: nothing to trim afterwards, so the tab after the
        // label survives even when there are no entries.
        String separator = "";
        for (Map.Entry<String, Integer> entry : entries) {
            sb.append(separator).append(entry.getKey()).append(',').append(entry.getValue());
            separator = ",";
        }
        System.out.println(sb);
        return sb;
    }
}
file01.txt extraordinary,3,examined,2,example,2,excellent,2,excuse,2,expected,2,expression,2,extreme,2,exactly,1,exaggerated,1,exalted,1,except,1,exchange,1,excitedly,1,excuses,1,expenses,1,experiences,1,explain,1,explained,1,explaining,1,expostulating,1,extended,1,extending,1
file02.txt experience,4,excuse,3,expected,3,extraordinary,3,extreme,3,example,2,exceedingly,2,existence,2,expect,2,expedition,2,exact,1,exactness,1,examine,1,except,1,excitement,1,exciting,1,exclaimed,1,expectancy,1,expenditure,1,expense,1,expensive,1,experienced,1,explain,1,explained,1,explanation,1,explore,1,expression,1,extra,1,extremely,1
file03.txt extraordinary,3,example,2,except,2,excited,2,explain,2,ex-australian,1,exact,1,exacted,1,exactly,1,examination,1,examined,1,examining,1,exceedingly,1,excitable,1,excitement,1,expected,1,expense,1,experience,1,expired,1,expiring,1,explanation,1,expressed,1,extent,1,extremely,1
file04.txt examination,2,exclaimed,2,excuse,2,expected,2,extreme,2,ex-confederate,1,examined,1,excellent,1,except,1,exception,1,exceptional,1,exchange,1,exercise,1,exhibited,1,expect,1,expedition,1,experience,1,explanations,1,exposed,1,expound,1,expression,1,extending,1,extremely,1,extremity,1
file05.txt examination,4,exceedingly,4,excellent,3,experience,3,examined,2,exceptional,2,explain,2,exposure,2,extremely,2,exactly,1,example,1,exception,1,excited,1,exclamation,1,execution,1,existence,1,exit,1,expected,1,expecting,1,expensive,1,explained,1,exposed,1
file06.txt excellent,4,examined,2,excitement,2,excuse,2,extended,2,exact,1,examine,1,exchanging,1,excursion,1,existence,1,expect,1,expected,1,expenses,1,explain,1,explained,1,exposure,1,expression,1,expressly,1,extent,1
file07.txt examine,4,examined,4,examining,4,experience,4,example,3,excuse,3,explain,3,exactly,2,examination,2,exchanged,2,expect,2,explained,2,exact,1,exacted,1,excavating,1,exceeding,1,exceedingly,1,excellent,1,exceptionally,1,exclamation,1,exercising,1,expected,1,experiences,1,explaining,1,explanation,1,expression,1,extinguished,1,extraordinary,1,extreme,1
file08.txt explain,4,example,2,exceedingly,2,excitement,2,explained,2,explanation,2,exactly,1,exalted,1,examination,1,excellent,1,except,1,excited,1,exclude,1,exclusion,1,excuse,1,existence,1,expect,1,expectancies,1,expected,1,expensive,1,experience,1,explains,1,expression,1,expressions,1,expressive,1,exquisite,1,extend,1,extended,1,extra,1,extracts,1,extraordinary,1,extremely,1
file09.txt extraordinary,4,examined,3,excellent,3,experience,3,excuse,2,expedition,2,expense,2,explain,2,explanation,2,exactly,1,exalted,1,examination,1,exceeded,1,exceedingly,1,exceptionally,1,excessive,1,excluded,1,exercise,1,expensive,1,explained,1,exposed,1,exposure,1,expressive,1,extinguishes,1,extreme,1
file10.txt explanation,4,extraordinary,4,excellent,3,except,3,expenses,3,experience,3,exempt,2,expense,2,exacting,1,exactly,1,exaggerated,1,examined,1,examining,1,example,1,exceptional,1,exclusion,1,executive,1,exert,1,exhilarating,1,exists,1,expect,1,expected,1,expend,1,explanations,1,exporting,1,express,1,expressed,1,expression,1,extent,1,extremely,1
total experience,20,extraordinary,19,excellent,18,examined,16,explain,16,excuse,15,example,13,expected,13,examination,11,exceedingly,11,explanation,11,except,9,explained,9,extreme,9,exactly,8,expect,8,expression,8,extremely,7,examine,6,examining,6,excitement,6,expense,6,existence,5,expedition,5,expenses,5,exact,4,exceptional,4,excited,4,expensive,4,exposure,4,extended,4,exalted,3,exclaimed,3,exposed,3,extent,3,exacted,2,exaggerated,2,exception,2,exceptionally,2,exchange,2,exchanged,2,exclamation,2,exclusion,2,exempt,2,exercise,2,experiences,2,explaining,2,explanations,2,expressed,2,expressive,2,extending,2,extra,2,ex-australian,1,ex-confederate,1,exacting,1,exactness,1,excavating,1,exceeded,1,exceeding,1,excessive,1,exchanging,1,excitable,1,excitedly,1,exciting,1,exclude,1,excluded,1,excursion,1,excuses,1,execution,1,executive,1,exercising,1,exert,1,exhibited,1,exhilarating,1,exists,1,exit,1,expectancies,1,expectancy,1,expecting,1,expend,1,expenditure,1,experienced,1,expired,1,expiring,1,explains,1,explore,1,exporting,1,expostulating,1,expound,1,express,1,expressions,1,expressly,1,exquisite,1,extend,1,extinguished,1,extinguishes,1,extracts,1,extremity,1