
    [Original] Reading and Writing the Hive ORC Data Format with MapReduce

    Posted by liuzhoulong on 2016-07-27 18:22:45

    1. The MapReduce code is as follows:

    package com.test.hadoop;

    import java.io.IOException;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.orc.TypeDescription;
    import org.apache.orc.mapred.OrcStruct;
    import org.apache.orc.mapreduce.OrcInputFormat;
    import org.apache.orc.mapreduce.OrcOutputFormat;

    public class ORCSample {

        // OrcInputFormat presents each ORC row as an OrcStruct value with a
        // NullWritable key. The source table t_test_orc has the columns
        // (siteid, name, mobile), so field 1 is name and field 2 is mobile;
        // siteid (field 0) is dropped here.
        public static class ORCMapper extends
                Mapper<NullWritable, OrcStruct, Text, Text> {
            public void map(NullWritable key, OrcStruct value, Context output)
                    throws IOException, InterruptedException {
                output.write((Text) value.getFieldValue(1),
                        (Text) value.getFieldValue(2));
            }
        }

        // The reducer emits (NullWritable, OrcStruct) pairs; OrcOutputFormat
        // writes the OrcStruct values out as ORC rows with the schema below.
        public static class ORCReducer extends
                Reducer<Text, Text, NullWritable, OrcStruct> {
            private TypeDescription schema = TypeDescription
                    .fromString("struct<name:string,mobile:string>");
            // Reuse a single OrcStruct instance across all output rows.
            private OrcStruct pair = (OrcStruct) OrcStruct.createValue(schema);

            private final NullWritable nw = NullWritable.get();

            public void reduce(Text key, Iterable<Text> values, Context output)
                    throws IOException, InterruptedException {
                for (Text val : values) {
                    pair.setFieldValue(0, key);
                    pair.setFieldValue(1, val);
                    output.write(nw, pair);
                }
            }
        }

        public static void main(String args[]) throws Exception {

            Configuration conf = new Configuration();
            // OrcOutputFormat reads the output schema from this property.
            conf.set("orc.mapred.output.schema", "struct<name:string,mobile:string>");
            Job job = Job.getInstance(conf, "ORC Test");
            job.setJarByClass(ORCSample.class);
            job.setMapperClass(ORCMapper.class);
            job.setReducerClass(ORCReducer.class);
            job.setInputFormatClass(OrcInputFormat.class);
            job.setOutputFormatClass(OrcOutputFormat.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(Text.class);
            job.setOutputKeyClass(NullWritable.class);
            job.setOutputValueClass(OrcStruct.class);
            FileInputFormat.addInputPath(job, new Path(args[0]));   // input: ORC files of t_test_orc
            FileOutputFormat.setOutputPath(job, new Path(args[1])); // output: a new directory of ORC files
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        }
    }
    
    2. Add the dependencies to pom.xml (based on Hadoop 2.7.1):

    <dependencies>
      <dependency>
        <groupId>org.apache.orc</groupId>
        <artifactId>orc-mapreduce</artifactId>
        <version>1.1.0</version>
      </dependency>
      <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-mapreduce-client-core</artifactId>
        <version>2.7.1</version>
      </dependency>
    </dependencies>
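
    The run command in step 4 uses a fat jar named MRTest-1.0-jar-with-dependencies.jar, so the build also needs a plugin that bundles the dependencies into the jar. A minimal sketch using the standard maven-assembly-plugin jar-with-dependencies descriptor (the artifactId MRTest and version 1.0 are inferred from the run command; adjust to your project):

    <build>
      <plugins>
        <plugin>
          <groupId>org.apache.maven.plugins</groupId>
          <artifactId>maven-assembly-plugin</artifactId>
          <configuration>
            <descriptorRefs>
              <!-- produces *-jar-with-dependencies.jar during mvn package -->
              <descriptorRef>jar-with-dependencies</descriptorRef>
            </descriptorRefs>
          </configuration>
          <executions>
            <execution>
              <phase>package</phase>
              <goals>
                <goal>single</goal>
              </goals>
            </execution>
          </executions>
        </plugin>
      </plugins>
    </build>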


    3. Create the tables in Hive, then insert 3 rows of test data into t_test_orc (a sample INSERT is shown after the DDL below).

    CREATE TABLE `t_test_orc`(
      `siteid` string,
      `name` string,
      `mobile` string)
    STORED AS ORC;

    CREATE TABLE `t_test_orc_new`(
      `name` string,
      `mobile` string)
    ROW FORMAT SERDE
      'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
    STORED AS INPUTFORMAT
      'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
    OUTPUTFORMAT
      'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
    LOCATION
      'hdfs://namenode:9000/user/testorc3';
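
    A minimal sketch of loading the 3 test rows, assuming Hive 0.14+ (where INSERT ... VALUES is supported); the siteid/name/mobile values are hypothetical, any values will do:

    INSERT INTO TABLE t_test_orc VALUES
      ('s001', 'alice', '13800000001'),
      ('s002', 'bob',   '13800000002'),
      ('s003', 'carol', '13800000003');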
    


    4. Package the jar and run the job. The input path below is the HDFS warehouse directory of t_test_orc; the output path matches the LOCATION of t_test_orc_new:

    hadoop jar MRTest-1.0-jar-with-dependencies.jar com.test.hadoop.ORCSample /hive/warehouse/mytest.db/t_test_orc /user/testorc3


    5. After the job finishes, you can inspect the output files with hive --orcfiledump -d.
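
    For example (the part file name here is hypothetical; list the output directory to find the actual name, typically part-r-00000 with a single reducer):

    hive --orcfiledump -d /user/testorc3/part-r-00000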



    You can also open hive and query the ORC-format t_test_orc_new table, whose LOCATION is the job's output directory, to see the data.
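
    For example, a quick sanity check:

    SELECT name, mobile FROM t_test_orc_new;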

    For more information, see https://orc.apache.org/.


