一：OutputFormat 接口实现类

1.1 概述

OutputFormat是MapReduce输出的基类，所有实现MapReduce输出都实现了 OutputFormat 接口。下面我们介绍几种常见的OutputFormat实现类。

1.2 OutputFormat实现类

1.3 默认输出格式TextOutputFormat

1.4 自定义OutputFormat

（1）应用场景

例如：输出数据到MySQL/HBase/Elasticsearch等存储框架中。

（2）自定义OutputFormat步骤

自定义一个类继承FileOutputFormat
改写RecordWriter，具体改写输出数据的方法write()

二：自定义 OutputFormat 案例实操

2.1 需求

过滤输入的 log 日志，包含 aiyingke 的网站输出到 e:/aiyingke.log，不包含 aiyingke 的网站输出到 e:/other.log；

（1）输入数据

log.txt

（2）期望输出数据

aiyingke.log
other.log

2.2 需求分析

需求
输入数据
输出数据
自定义一个 OutputFormat 类
- 创建一个类LogRecordWriter继承RecordWriter
- 创建两个文件的输出流：aiyingke，other
- 如果输入数据包含aiyingke，输出到aiyingkeOut流如果不包含aiyingke，输出到otherOut流
驱动类 Driver
- 要将自定义的输出格式组件设置到job中
- job.setOutputFormatClass(LogOutputFormat.class) ;

2.3 代码实现

（1）编写 LogMapper 类

package cn.aiyingke.mapreduce.outputformat;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Author: Rupert Tears
 * Date: Created in 10:08 2022/12/19
 * Description: Thought is already is late, exactly is the earliest time.
 */
public class LogMapper extends Mapper<Object, Text, Text, NullWritable> {
    @Override
    protected void map(Object key, Text value, Mapper<Object, Text, Text, NullWritable>.Context context) throws IOException, InterruptedException {

        // https://aiyingke.cn
        // 互换 KV 位置
        context.write(value, NullWritable.get());
    }
}

（2）编写 LogReducer 类

package cn.aiyingke.mapreduce.outputformat;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Author: Rupert Tears
 * Date: Created in 10:11 2022/12/19
 * Description: Thought is already is late, exactly is the earliest time.
 */
public class LogReducer extends Reducer<Text, NullWritable, Text, NullWritable> {
    @Override
    protected void reduce(Text key, Iterable<NullWritable> values, Reducer<Text, NullWritable, Text, NullWritable>.Context context) throws IOException, InterruptedException {

        // key 相同情况,进入同一个reduce,遍历结果
        for (NullWritable value : values) {
            // 写出
            context.write(key, NullWritable.get());
        }
    }
}

（3）自定义一个 LogOutputFormat 类

package cn.aiyingke.mapreduce.outputformat;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * Author: Rupert Tears
 * Date: Created in 10:15 2022/12/19
 * Description: Thought is already is late, exactly is the earliest time.
 */
public class LogOutputFormat extends FileOutputFormat<Text, NullWritable> {

    @Override
    public RecordWriter<Text, NullWritable> getRecordWriter(TaskAttemptContext job) throws IOException, InterruptedException {
        // 创建自定义的 LogRecordWriter
        LogRecordWriter lrw = new LogRecordWriter(job);
        return lrw;
    }
}

（4）编写 LogRecordWriter 类

package cn.aiyingke.mapreduce.outputformat;

import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

import java.io.IOException;

/**
 * Author: Rupert Tears
 * Date: Created in 10:44 2022/12/19
 * Description: Thought is already is late, exactly is the earliest time.
 */
public class LogRecordWriter extends RecordWriter<Text, NullWritable> {

    private final FSDataOutputStream otherOut;
    private final FSDataOutputStream aiyingkeOut;

    public LogRecordWriter(TaskAttemptContext job) {
        // 获取文件系统
        try {
            final FileSystem fileSystem = FileSystem.get(job.getConfiguration());
            // 用文件系统对象创建两个输出流对应不同的目录
            aiyingkeOut = fileSystem.create(new Path("Y:\\Temp\\logOut\\aiyingke\\aiyingke.log"));
            otherOut = fileSystem.create(new Path("Y:\\Temp\\logOut\\other\\other.log"));
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    @Override
    public void write(Text text, NullWritable nullWritable) throws IOException, InterruptedException {
        final String line = text.toString();
        // 根据当前行是否包含 aiyingke 决定采用哪个流输出
        if (line.contains("aiyingke")) {
            aiyingkeOut.writeBytes(line + "\n");
        } else {
            otherOut.writeBytes(line + "\n");
        }
    }

    @Override
    public void close(TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
        IOUtils.closeStream(aiyingkeOut);
        IOUtils.closeStream(otherOut);
    }
}

（5）编写 LogDriver 类

package cn.aiyingke.mapreduce.outputformat;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * Author: Rupert Tears
 * Date: Created in 11:04 2022/12/19
 * Description: Thought is already is late, exactly is the earliest time.
 */
public class LogDriver {
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {


        /*
         *       1.拿到配置环境变量
         */
        Configuration configuration = new Configuration();

        /*
         *       2.拿到job作业对象
         */
        Job job = Job.getInstance(configuration, "Log");
        /*
         *       3.设置主类，（即：告知集群入口jar包在哪）
         */
        job.setJarByClass(LogDriver.class);
        /*
         *      4.设置 mapper  reducer
         */
        job.setMapperClass(LogMapper.class);
        job.setReducerClass(LogReducer.class);
        /*
         *       5.设置输出 key value的类型
         */
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        // 设置自定义的 OutputFormat
        job.setOutputFormatClass(LogOutputFormat.class);


        /*
         *       6.设置输入、输出的数据路径
         *         虽然我们定义了 OutputFormat ,但是因为我们的 OutputFormat 继承自 FileOutputFormat
         *         而 FileOutputFormat 要输出一个 _SUCCESS 的标志文件,所以这里换得指定一个输出目录
         */
        FileInputFormat.addInputPath(job, new Path("Y:\\Temp\\log\\"));
        FileOutputFormat.setOutputPath(job, new Path("Y:\\Temp\\logOut\\"));
        /*
         *       7.提交执行
         */
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}