一:OutputFormat 接口实现类

1.1 概述

OutputFormat是MapReduce输出的基类,所有实现MapReduce输出都实现了 OutputFormat 接口。下面我们介绍几种常见的OutputFormat实现类。

1.2 OutputFormat实现类

1.3 默认输出格式TextOutputFormat

1.4 自定义OutputFormat

(1) 应用场景

例如:输出数据到MySQL/HBase/Elasticsearch等存储框架中。

(2)自定义OutputFormat步骤

  • 自定义一个类继承FileOutputFormat
  • 改写RecordWriter,具体改写输出数据的方法write()

二:自定义 OutputFormat 案例实操

2.1 需求

过滤输入的 log 日志,包含 aiyingke 的网站输出到 e:/aiyingke.log,不包含 aiyingke 的网站输出到 e:/other.log;

(1)输入数据

  • log.txt

(2)期望输出数据

  • aiyingke.log
  • other.log

2.2 需求分析

  • 需求
  • 输入数据
  • 输出数据
  • 自定义一个 OutputFormat 类
    • 创建一个类LogRecordWriter继承RecordWriter
    • 创建两个文件的输出流:aiyingke,other
    • 如果输入数据包含aiyingke,输出到aiyingkeOut流 如果不包含aiyingke,输出到otherOut流
  • 驱动类 Driver
    • 要将自定义的输出格式组件设置到job中
    • job.setOutputFormatClass(LogOutputFormat.class) ;

2.3 代码实现

(1)编写 LogMapper 类

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
package cn.aiyingke.mapreduce.outputformat;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
* Author: Rupert Tears
* Date: Created in 10:08 2022/12/19
* Description: Thought is already is late, exactly is the earliest time.
*/
public class LogMapper extends Mapper<Object, Text, Text, NullWritable> {
@Override
protected void map(Object key, Text value, Mapper<Object, Text, Text, NullWritable>.Context context) throws IOException, InterruptedException {

// https://aiyingke.cn
// 互换 KV 位置
context.write(value, NullWritable.get());
}
}

(2)编写 LogReducer 类

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
package cn.aiyingke.mapreduce.outputformat;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
* Author: Rupert Tears
* Date: Created in 10:11 2022/12/19
* Description: Thought is already is late, exactly is the earliest time.
*/
public class LogReducer extends Reducer<Text, NullWritable, Text, NullWritable> {
@Override
protected void reduce(Text key, Iterable<NullWritable> values, Reducer<Text, NullWritable, Text, NullWritable>.Context context) throws IOException, InterruptedException {

// key 相同情况,进入同一个reduce,遍历结果
for (NullWritable value : values) {
// 写出
context.write(key, NullWritable.get());
}
}
}

(3)自定义一个 LogOutputFormat 类

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
package cn.aiyingke.mapreduce.outputformat;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
* Author: Rupert Tears
* Date: Created in 10:15 2022/12/19
* Description: Thought is already is late, exactly is the earliest time.
*/
public class LogOutputFormat extends FileOutputFormat<Text, NullWritable> {

@Override
public RecordWriter<Text, NullWritable> getRecordWriter(TaskAttemptContext job) throws IOException, InterruptedException {
// 创建自定义的 LogRecordWriter
LogRecordWriter lrw = new LogRecordWriter(job);
return lrw;
}
}

(4)编写 LogRecordWriter 类

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
package cn.aiyingke.mapreduce.outputformat;

import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

import java.io.IOException;

/**
* Author: Rupert Tears
* Date: Created in 10:44 2022/12/19
* Description: Thought is already is late, exactly is the earliest time.
*/
public class LogRecordWriter extends RecordWriter<Text, NullWritable> {

private final FSDataOutputStream otherOut;
private final FSDataOutputStream aiyingkeOut;

public LogRecordWriter(TaskAttemptContext job) {
// 获取文件系统
try {
final FileSystem fileSystem = FileSystem.get(job.getConfiguration());
// 用文件系统对象创建两个输出流对应不同的目录
aiyingkeOut = fileSystem.create(new Path("Y:\\Temp\\logOut\\aiyingke\\aiyingke.log"));
otherOut = fileSystem.create(new Path("Y:\\Temp\\logOut\\other\\other.log"));
} catch (IOException e) {
throw new RuntimeException(e);
}
}

@Override
public void write(Text text, NullWritable nullWritable) throws IOException, InterruptedException {
final String line = text.toString();
// 根据当前行是否包含 aiyingke 决定采用哪个流输出
if (line.contains("aiyingke")) {
aiyingkeOut.writeBytes(line + "\n");
} else {
otherOut.writeBytes(line + "\n");
}
}

@Override
public void close(TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
IOUtils.closeStream(aiyingkeOut);
IOUtils.closeStream(otherOut);
}
}

(5)编写 LogDriver 类

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
package cn.aiyingke.mapreduce.outputformat;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
* Author: Rupert Tears
* Date: Created in 11:04 2022/12/19
* Description: Thought is already is late, exactly is the earliest time.
*/
public class LogDriver {
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {


/*
* 1.拿到配置环境变量
*/
Configuration configuration = new Configuration();

/*
* 2.拿到job作业对象
*/
Job job = Job.getInstance(configuration, "Log");
/*
* 3.设置主类,(即:告知集群入口jar包在哪)
*/
job.setJarByClass(LogDriver.class);
/*
* 4.设置 mapper reducer
*/
job.setMapperClass(LogMapper.class);
job.setReducerClass(LogReducer.class);
/*
* 5.设置输出 key value的类型
*/
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);

// 设置自定义的 OutputFormat
job.setOutputFormatClass(LogOutputFormat.class);


/*
* 6.设置输入、输出的数据路径
* 虽然我们定义了 OutputFormat ,但是因为我们的 OutputFormat 继承自 FileOutputFormat
* 而 FileOutputFormat 要输出一个 _SUCCESS 的标志文件,所以这里换得指定一个输出目录
*/
FileInputFormat.addInputPath(job, new Path("Y:\\Temp\\log\\"));
FileOutputFormat.setOutputPath(job, new Path("Y:\\Temp\\logOut\\"));
/*
* 7.提交执行
*/
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}