HDFS Read/Write Operations
public class HdfsClient {
    Configuration configuration = null;
    FileSystem fs = null;

    @Before
    public void init() throws URISyntaxException, IOException, InterruptedException {
        configuration = new Configuration();
        /* connect to the NameNode as user "root" */
        fs = FileSystem.get(new URI("hdfs://test1.bigload.com:9000"), configuration, "root");
    }

    @After
    public void tearDown() throws IOException {
        fs.close();
    }

    /* create a directory */
    @Test
    public void test1() throws IOException {
        fs.mkdirs(new Path("/test3"));
    }
    /* upload a file */
    @Test
    public void test2() throws IOException {
        fs.copyFromLocalFile(new Path("G:\\upload\\bigload.txt"), new Path("/test3/bigload.txt"));
    }
    /* download a file */
    @Test
    public void download() throws IOException {
        fs.copyToLocalFile(new Path("/test3/bigload.txt"), new Path("G:\\download\\bigload.txt"));
    }
    /* delete a file or directory (true = recursive) */
    @Test
    public void delete() throws IOException {
        fs.delete(new Path("/lagou/bigdata/bigload.txt"), true);
    }
    /* list file name, permission, length and block information */
    @Test
    public void catDir() throws IOException {
        /* all files, recursively */
        RemoteIterator<LocatedFileStatus> locateFiles = fs.listFiles(new Path("/"), true);
        while (locateFiles.hasNext()) {
            LocatedFileStatus file = locateFiles.next();
            /* file path (use file.getPath().getName() for the bare name) */
            System.out.println(file.getPath());
            /* permission */
            System.out.println(file.getPermission());
            /* length */
            System.out.println(file.getLen());
            /* blocks */
            BlockLocation[] blockLocations = file.getBlockLocations();
            for (BlockLocation block : blockLocations) {
                /* hosts that hold a replica of this block */
                String[] hosts = block.getHosts();
                for (String host : hosts) {
                    System.out.println(host);
                }
            }
            System.out.println("--------------------------------------------------");
        }
    }
    /* recursively distinguish directories from files */
    public void checkDir(Path dir) throws IOException {
        FileStatus[] fileStatuses = fs.listStatus(dir);
        for (FileStatus fileStatus : fileStatuses) {
            if (fileStatus.isDirectory()) {
                System.out.println("directory: " + fileStatus.getPath());
                this.checkDir(fileStatus.getPath());
            } else {
                System.out.println("file: " + fileStatus.getPath());
            }
        }
    }

    /* test the directory check, starting from the root */
    @Test
    public void isDirTest() throws IOException {
        this.checkDir(new Path("/"));
    }
    /* upload via raw streams */
    @Test
    public void streamUpload() throws IOException {
        FileInputStream location = new FileInputStream(new File("G:\\upload\\bigload.txt"));
        FSDataOutputStream out = fs.create(new Path("/lagou/bigdata/bigload.txt"), new Progressable() {
            @Override
            public void progress() {
                /* called roughly once per 64 KB written; it also fires once when the
                 * write pipeline is first set up, so data flows from the second call on */
                System.out.println("%%%");
            }
        });
        /* copyBytes(in, out, conf) closes both streams when it finishes */
        IOUtils.copyBytes(location, out, configuration);
    }
    /* download via raw streams */
    @Test
    public void streamDownLoad() throws IOException {
        FSDataInputStream dataInputStream = fs.open(new Path("/lagou/bigdata/hadoop.txt"));
        FileOutputStream outputStream = new FileOutputStream(new File("G:\\download\\hadoop.txt"));
        IOUtils.copyBytes(dataInputStream, outputStream, configuration);
    }
    /* seek back to the start and print the file twice */
    @Test
    public void seekTest() throws IOException {
        FSDataInputStream dataInputStream = fs.open(new Path("/lagou/bigdata/hadoop.txt"));
        IOUtils.copyBytes(dataInputStream, System.out, 4096, false);
        /* rewind to offset 0 and copy again */
        dataInputStream.seek(0);
        IOUtils.copyBytes(dataInputStream, System.out, 4096, false);
        IOUtils.closeStream(dataInputStream);
    }
}
The MapReduce idea shows up everywhere in daily life, and most of us have applied it in some form. Its core is divide and conquer, which takes full advantage of parallel processing.
A MapReduce job runs in two stages:
- Map stage: its job is to "divide", breaking a complex task into a number of simple tasks that are processed in parallel. These tasks can run concurrently because they have no dependencies on each other.
- Reduce stage: its job is to "combine", globally aggregating the results of the Map stage (see the word-count sketch below).
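The classic word count shows both stages in miniature. A minimal sketch against the standard Hadoop API (class names are illustrative; imports omitted as elsewhere in these notes):
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    private final Text word = new Text();
    private final IntWritable one = new IntWritable(1);

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        /* Map = "divide": each mapper independently turns its split's lines into (word, 1) pairs */
        for (String w : value.toString().split("\\s+")) {
            if (w.isEmpty()) {
                continue; /* guard against leading whitespace */
            }
            word.set(w);
            context.write(word, one);
        }
    }
}

public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        /* Reduce = "combine": sum the partial counts for one word from all mappers */
        int sum = 0;
        for (IntWritable v : values) {
            sum += v.get();
        }
        context.write(key, new IntWritable(sum));
    }
}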
The speakBean object (a custom Writable carrying self time, other time, and their sum)
public class speakBean implements Writable {
    private Long selfTime;
    private Long otherTime;
    private Long sumTime;

    public speakBean() {
    }

    public speakBean(Long selfTime, Long otherTime) {
        setSelfTime(selfTime);
        setOtherTime(otherTime);
        setSumTime(selfTime + otherTime);
    }

    public Long getSelfTime() {
        return selfTime;
    }

    public void setSelfTime(Long selfTime) {
        this.selfTime = selfTime;
    }

    public Long getOtherTime() {
        return otherTime;
    }

    public void setOtherTime(Long otherTime) {
        this.otherTime = otherTime;
    }

    public Long getSumTime() {
        return sumTime;
    }

    public void setSumTime(Long sumTime) {
        this.sumTime = sumTime;
    }

    @Override
    public String toString() {
        return selfTime + "\t" + otherTime + "\t" + sumTime;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(selfTime);
        out.writeLong(otherTime);
        out.writeLong(sumTime);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.setSelfTime(in.readLong());
        this.setOtherTime(in.readLong());
        this.setSumTime(in.readLong());
    }
}
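Note that readFields() must read the fields in exactly the order write() serialized them: the framework hands the deserializer a bare byte stream, so a mismatched order silently corrupts every field after the first.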
speakMapper
public class speakMapper extends Mapper<LongWritable, Text, Text, speakBean> {
    Text k = new Text();
    speakBean v = new speakBean();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        String[] datas = line.split("\t");
        v.setSelfTime(Long.parseLong(datas[datas.length - 3]));
        v.setOtherTime(Long.parseLong(datas[datas.length - 2]));
        v.setSumTime(Long.parseLong(datas[datas.length - 3]) + Long.parseLong(datas[datas.length - 2]));
        k.set(datas[1]);
        context.write(k, v);
    }
}
speakReducer
public class speakReducer extends Reducer<Text, speakBean, Text, speakBean> {
    speakBean sb = new speakBean();

    @Override
    protected void reduce(Text key, Iterable<speakBean> values, Context context) throws IOException, InterruptedException {
        Long selfTime = 0L;
        Long otherTime = 0L;
        for (speakBean value : values) {
            selfTime += value.getSelfTime();
            otherTime += value.getOtherTime();
        }
        sb.setSelfTime(selfTime);
        sb.setOtherTime(otherTime);
        sb.setSumTime(selfTime + otherTime);
        context.write(key, sb);
    }
}
speakDriver
public class speakDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        /* load the configuration and create the job */
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        /* jar containing the job classes */
        job.setJarByClass(speakDriver.class);
        /* mapper class */
        job.setMapperClass(speakMapper.class);
        /* reducer class */
        job.setReducerClass(speakReducer.class);
        /* map output key/value types */
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(speakBean.class);
        /* final output key/value types */
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(speakBean.class);
        /* combiner for map-side pre-aggregation */
        job.setCombinerClass(speakReducer.class);
        /* input and output paths */
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        /* submit the job and wait for completion */
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
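A note on the combiner: reusing speakReducer for map-side pre-aggregation is safe here only because the reduce logic is a plain sum, which is associative and commutative. A combiner runs on partial map output and must never change the final result.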
ComparableBean (a WritableComparable that sorts by total time, descending)
public class ComparableBean implements WritableComparable<ComparableBean> {
    private String appKey;
    private Long selfTime;
    private Long otherTime;
    private Long sumTime;

    public ComparableBean() {
    }

    public ComparableBean(String appKey, Long selfTime, Long otherTime, Long sumTime) {
        this.appKey = appKey;
        this.selfTime = selfTime;
        this.otherTime = otherTime;
        this.sumTime = sumTime;
    }

    @Override
    public int compareTo(ComparableBean o) {
        /* negate the natural order to sort by sumTime descending */
        return -sumTime.compareTo(o.sumTime);
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(appKey);
        out.writeLong(selfTime);
        out.writeLong(otherTime);
        out.writeLong(sumTime);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        appKey = in.readUTF();
        selfTime = in.readLong();
        otherTime = in.readLong();
        sumTime = in.readLong();
    }

    @Override
    public String toString() {
        return appKey + "\t" + selfTime + "\t" + otherTime + "\t" + sumTime;
    }
}
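compareTo() negates the natural order, so when ComparableBean is emitted as the map output key, the shuffle sorts records by sumTime descending. A minimal mapper sketch under an assumed input layout of appKey, selfTime, otherTime separated by tabs (the layout is not shown in the original):
public class ComparableSortMapper extends Mapper<LongWritable, Text, ComparableBean, NullWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        /* assumed column layout: appKey \t selfTime \t otherTime */
        String[] fields = value.toString().split("\t");
        long self = Long.parseLong(fields[1]);
        long other = Long.parseLong(fields[2]);
        /* the bean is the key, so the shuffle orders output by sumTime descending */
        context.write(new ComparableBean(fields[0], self, other, self + other), NullWritable.get());
    }
}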
OrderBean. The next example finds the highest-priced item in each order: OrderBean sorts by orderId and then by money descending, and a grouping comparator collapses each order into a single reduce call.
public class OrderBean implements WritableComparable<OrderBean> {
    private String orderId;
    private Double money;

    public OrderBean() {
    }

    public OrderBean(String orderId, Double money) {
        this.orderId = orderId;
        this.money = money;
    }

    /* getters/setters are required by OrderMapper and OrderGroupingComparator */
    public String getOrderId() {
        return orderId;
    }

    public void setOrderId(String orderId) {
        this.orderId = orderId;
    }

    public Double getMoney() {
        return money;
    }

    public void setMoney(Double money) {
        this.money = money;
    }

    @Override
    public int compareTo(OrderBean o) {
        /* primary sort: orderId ascending; secondary sort: money descending */
        int compare = orderId.compareTo(o.getOrderId());
        if (compare == 0) {
            return -money.compareTo(o.getMoney());
        }
        return compare;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(orderId);
        out.writeDouble(money);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        orderId = in.readUTF();
        money = in.readDouble();
    }

    @Override
    public String toString() {
        return orderId + "\t" + money;
    }
}
comparator
public class OrderGroupingComparator extends WritableComparator {
    public OrderGroupingComparator() {
        /* true: instantiate OrderBean objects for the comparison */
        super(OrderBean.class, true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        /* group solely by orderId, ignoring money */
        OrderBean c = (OrderBean) a;
        OrderBean d = (OrderBean) b;
        return c.getOrderId().compareTo(d.getOrderId());
    }
}
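During the shuffle, this comparator decides which consecutive keys belong to the same reduce() call. Because map output was already sorted by compareTo() (orderId ascending, money descending), grouping on orderId alone means each reduce() call sees the records of one order, with the highest-money record as the key.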
Partitioner
public class OrderPartition extends Partitioner<OrderBean, NullWritable> {
    @Override
    public int getPartition(OrderBean orderBean, NullWritable nullWritable, int numPartitions) {
        /* partition by orderId so all records of one order reach the same reduce task;
         * & Integer.MAX_VALUE masks the sign bit, keeping the index non-negative */
        return (orderBean.getOrderId().hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}
mapper
public class OrderMapper extends Mapper<LongWritable, Text, OrderBean, NullWritable> {
    OrderBean bean = new OrderBean();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] fields = value.toString().split("\t");
        bean.setOrderId(fields[0]);
        bean.setMoney(Double.parseDouble(fields[2]));
        context.write(bean, NullWritable.get());
    }
}
reducer
public class OrderReducer extends Reducer<OrderBean, NullWritable, OrderBean, NullWritable> {
    @Override
    protected void reduce(OrderBean key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
        context.write(key, NullWritable.get());
    }
}
driver
public class OrderDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "OrderDriver");
        job.setJarByClass(OrderDriver.class);
        job.setMapperClass(OrderMapper.class);
        job.setReducerClass(OrderReducer.class);
        job.setMapOutputKeyClass(OrderBean.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(OrderBean.class);
        job.setOutputValueClass(NullWritable.class);
        /* custom partitioner: all records of one order go to the same reduce task */
        job.setPartitionerClass(OrderPartition.class);
        /* number of reduce tasks */
        job.setNumReduceTasks(2);
        /* grouping comparator: group keys by orderId only */
        job.setGroupingComparatorClass(OrderGroupingComparator.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
FileInputformat
public class CustomFileInputformat extends FileInputFormat<Text, BytesWritable> {
    @Override
    protected boolean isSplitable(JobContext context, Path filename) {
        /* treat each file as a single unsplittable unit */
        return false;
    }

    @Override
    public RecordReader<Text, BytesWritable> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
        /* hand back our custom RecordReader */
        return new CustomRecordReader();
    }
}
RecordReader
public class CustomRecordReader extends RecordReader<Text, BytesWritable> {
    Configuration configuration = null;
    FileSplit split = null;
    /* the key/value pair handed to the mapper */
    Text text = new Text();
    BytesWritable value = new BytesWritable();
    /* true while the (single) record has not been read yet */
    Boolean isProgress = true;

    @Override
    public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
        /* grab the file split and the job configuration */
        this.split = (FileSplit) split;
        configuration = context.getConfiguration();
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        /* return true to keep reading, false once the split is exhausted */
        if (isProgress) {
            byte[] contents = new byte[(int) split.getLength()];
            FSDataInputStream fis = null;
            try {
                /* open the file system that owns this split */
                Path path = split.getPath();
                FileSystem fs = path.getFileSystem(configuration);
                /* read the whole file into the buffer */
                fis = fs.open(path);
                IOUtils.readFully(fis, contents, 0, contents.length);
                /* key = file path, value = file content */
                value.set(contents, 0, contents.length);
                text.set(path.toString());
            } finally {
                /* close only the input stream; the FileSystem instance is shared and cached */
                IOUtils.closeStream(fis);
            }
            isProgress = false;
            return true;
        }
        return false;
    }

    @Override
    public Text getCurrentKey() throws IOException, InterruptedException {
        return text;
    }

    @Override
    public BytesWritable getCurrentValue() throws IOException, InterruptedException {
        return value;
    }

    @Override
    public float getProgress() throws IOException, InterruptedException {
        return isProgress ? 0f : 1f;
    }

    @Override
    public void close() throws IOException {
    }
}
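Because nextKeyValue() emits the entire file as one (path, content) record, pairing this reader with SequenceFileOutputFormat, as the driver below does, packs many small files into a single SequenceFile: the classic remedy for HDFS's small-file problem.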
Mapper
public class SequenceFileMapper extends Mapper<Text, BytesWritable, Text, BytesWritable> {
    @Override
    protected void map(Text key, BytesWritable value, Context context) throws IOException, InterruptedException {
        /* pass the (path, content) pair straight through */
        context.write(key, value);
    }
}
Reducer
public class SequenceFileReducer extends Reducer<Text, BytesWritable, Text, BytesWritable> {
    @Override
    protected void reduce(Text key, Iterable<BytesWritable> values, Context context) throws IOException, InterruptedException {
        /* each key is a unique file path, so there is exactly one value */
        context.write(key, values.iterator().next());
    }
}
Driver
public class SequenceFileDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(SequenceFileDriver.class);
        job.setMapperClass(SequenceFileMapper.class);
        job.setReducerClass(SequenceFileReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(BytesWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(BytesWritable.class);
        /* read whole files with the custom input format, write a SequenceFile */
        job.setInputFormatClass(CustomFileInputformat.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
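To spot-check the result, the SequenceFile can be read back outside MapReduce. A small sketch (the path argument would be the part-r-00000 file the job produced):
public class SequenceFileViewer {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        /* args[0]: path to the SequenceFile produced by the job above */
        SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(new Path(args[0])));
        Text key = new Text();
        BytesWritable value = new BytesWritable();
        while (reader.next(key, value)) {
            /* print each packed file's original path and size */
            System.out.println(key + " -> " + value.getLength() + " bytes");
        }
        reader.close();
    }
}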
FileOutputFormat
public class OutputFileOutputFormat extends FileOutputFormat<Text, NullWritable> {
    @Override
    public RecordWriter<Text, NullWritable> getRecordWriter(TaskAttemptContext job) throws IOException, InterruptedException {
        FileSystem fs = FileSystem.get(job.getConfiguration());
        /* note: these output paths are hard-coded to the local disk */
        FSDataOutputStream lagouOut = fs.create(new Path("G:\\click_log\\out\\lagou"));
        FSDataOutputStream otherOut = fs.create(new Path("G:\\click_log\\out\\other"));
        return new OutputRecordWriter(lagouOut, otherOut);
    }
}
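Hard-coding G:\ paths ties the writer to a single Windows machine. A variant (not in the original) derives both files from the job's configured output directory, so the format also works on HDFS; the file names here are illustrative:
@Override
public RecordWriter<Text, NullWritable> getRecordWriter(TaskAttemptContext job) throws IOException, InterruptedException {
    /* args[1] from the driver, via FileOutputFormat.setOutputPath() */
    Path outDir = FileOutputFormat.getOutputPath(job);
    FileSystem fs = outDir.getFileSystem(job.getConfiguration());
    return new OutputRecordWriter(
            fs.create(new Path(outDir, "lagou.log")),
            fs.create(new Path(outDir, "other.log")));
}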
RecordWriter
public class OutputRecordWriter extends RecordWriter<Text, NullWritable> {
    FSDataOutputStream lagou = null;
    FSDataOutputStream other = null;

    public OutputRecordWriter(FSDataOutputStream lagouOut, FSDataOutputStream otherOut) {
        this.lagou = lagouOut;
        this.other = otherOut;
    }

    @Override
    public void write(Text key, NullWritable value) throws IOException, InterruptedException {
        /* route lines containing "lagou" to one file, everything else to the other */
        String s = key.toString();
        if (s.contains("lagou")) {
            lagou.write(s.getBytes());
            lagou.write("\r\n".getBytes());
        } else {
            other.write(s.getBytes());
            other.write("\r\n".getBytes());
        }
    }

    @Override
    public void close(TaskAttemptContext context) throws IOException, InterruptedException {
        IOUtils.closeStream(lagou);
        IOUtils.closeStream(other);
    }
}
mapper, reducer
public class OutputMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        context.write(value, NullWritable.get());
    }
}

public class OutputReducer extends Reducer<Text, NullWritable, Text, NullWritable> {
    @Override
    protected void reduce(Text key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
        context.write(key, NullWritable.get());
    }
}
Driver
public class OutputDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(OutputDriver.class);
        job.setMapperClass(OutputMapper.class);
        job.setReducerClass(OutputReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        /* plug in the custom output format */
        job.setOutputFormatClass(OutputFileOutputFormat.class);
        /* args[1] still receives the _SUCCESS marker even though data goes to the custom paths */
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
Compression algorithms supported in Hadoop
Enable compression for map-stage output:
Configuration configuration = new Configuration();
configuration.set("mapreduce.map.output.compress", "true");
configuration.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec");
Enable compression for reduce-stage (final) output:
configuration.set("mapreduce.output.fileoutputformat.compress", "true");
configuration.set("mapreduce.output.fileoutputformat.compress.type", "RECORD");
configuration.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec");
The same reduce-stage settings can also be made cluster-wide, e.g. in mapred-site.xml:
<property>
    <name>mapreduce.output.fileoutputformat.compress</name>
    <value>true</value>
</property>
<property>
    <name>mapreduce.output.fileoutputformat.compress.type</name>
    <value>RECORD</value>
</property>
<property>
    <name>mapreduce.output.fileoutputformat.compress.codec</name>
    <value>org.apache.hadoop.io.compress.SnappyCodec</value>
</property>
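Equivalently, the same compression settings can be made in the driver; a sketch using FileOutputFormat's helper methods (Snappy assumes native library support on the cluster):
Configuration conf = new Configuration();
/* map-stage (intermediate) compression */
conf.setBoolean("mapreduce.map.output.compress", true);
conf.setClass("mapreduce.map.output.compress.codec", SnappyCodec.class, CompressionCodec.class);
Job job = Job.getInstance(conf);
/* reduce-stage (final output) compression */
FileOutputFormat.setCompressOutput(job, true);
FileOutputFormat.setOutputCompressorClass(job, SnappyCodec.class);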