大数据Hadoop系列之Map-ETL实战开发

1.  输入数据（实际日志中每条记录为一行，字段之间以 ^A 分隔；此处为便于阅读按字段换行显示）

122.245.205.218^A
1450572279.254^A
hadoop-master.volitation.com^A
/BEIfeng.gif?
u_nu=1&
u_sd=DFBFABA3-9F0B-451F-B47C-782EDBFB5D90&
c_time=1450572272695&
ver=1&
en=e_l&
pl=website&
sdk=js&
b_rst=1440*900&
u_ud=DE9CBECE-D062-4486-A3A6-DFB2A04A3D28&
b_iev=Mozilla%2F5.0%20(Windows%20NT%206.1)%20AppleWebKit%2F537.36%20(KHTML%2C%20like%20Gecko)%20Chrome%2F31.0.1650.63%20Safari%2F537.36&
l=zh-CN

2.  Mapper

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Mapper that flattens one raw access-log line into a comma-separated record.
 *
 * <p>Each input line is expected to have four fields separated by the literal
 * two-character sequence {@code ^A}: client IP, server time, nginx host name and
 * the request URL. The query-string values of the request URL are URL-decoded
 * and appended, in their original order, after the first three fields. The
 * resulting record is emitted as the output key with a {@link NullWritable}
 * value.
 *
 * <p>Lines that do not have four fields, or whose request URL has no query
 * string, are skipped and counted under the {@code ETL/MALFORMED_RECORDS}
 * counter instead of failing the task.
 *
 * <p>NOTE(review): decoded values may themselves contain commas (for example a
 * browser user-agent string), so the output is not strictly parseable as CSV.
 * The separator is kept as-is to preserve the existing output format.
 */
public class EtlMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

	/** Regex matching the literal two-character field delimiter "^A". */
	private static final String FIELD_DELIMITER_REGEX = "\\^A";

	/** Number of delimiter-separated fields a well-formed line must contain. */
	private static final int EXPECTED_FIELDS = 4;

	private static final String COUNTER_GROUP = "ETL";
	private static final String MALFORMED_COUNTER = "MALFORMED_RECORDS";

	/** Reused across calls to avoid allocating a new Text per record. */
	private final Text keyOut = new Text();

	/**
	 * Converts one log line and writes it to the context.
	 *
	 * @param key     input key supplied by the input format (unused)
	 * @param value   one raw log line
	 * @param context task context used to emit the record and update counters
	 * @throws IOException          if writing the record fails
	 * @throws InterruptedException if the task is interrupted while writing
	 */
	@Override
	protected void map(LongWritable key, Text value, Context context)
			throws IOException, InterruptedException {
		String record = toCsvRecord(value.toString());
		if (record == null) {
			// Skip bad input rather than killing the whole task on one line.
			context.getCounter(COUNTER_GROUP, MALFORMED_COUNTER).increment(1);
			return;
		}
		keyOut.set(record);
		context.write(keyOut, NullWritable.get());
	}

	/**
	 * Builds the comma-separated record for a single log line.
	 *
	 * @param line raw log line
	 * @return the flattened record, or {@code null} if the line is malformed
	 * @throws UnsupportedEncodingException if the "utf-8" charset is unavailable
	 */
	private static String toCsvRecord(String line) throws UnsupportedEncodingException {
		String[] fields = line.split(FIELD_DELIMITER_REGEX);
		if (fields.length < EXPECTED_FIELDS) {
			return null;
		}

		String requestUrl = fields[3];
		// Everything after the first '?' is the query string; a later '?' is
		// legal inside a query and must not truncate it.
		int queryStart = requestUrl.indexOf('?');
		if (queryStart < 0) {
			return null;
		}

		StringBuilder record = new StringBuilder();
		record.append(fields[0]); // client IP
		record.append(',').append(fields[1]); // server time
		record.append(',').append(fields[2]); // nginx host name

		String query = requestUrl.substring(queryStart + 1);
		for (String pair : query.split("&")) {
			if (pair.isEmpty()) {
				continue; // e.g. "a=1&&b=2" or an empty query string
			}
			// Limit 2 keeps any '=' that is part of the value.
			String[] kv = pair.split("=", 2);
			String rawValue = kv.length > 1 ? kv[1] : "";
			record.append(',').append(decode(rawValue));
		}
		return record.toString();
	}

	/**
	 * URL-decodes a query-string value, falling back to the raw text when it
	 * contains an invalid percent-escape.
	 */
	private static String decode(String rawValue) throws UnsupportedEncodingException {
		try {
			return URLDecoder.decode(rawValue, "utf-8");
		} catch (IllegalArgumentException ignored) {
			// Malformed escape such as "%zz": keep the undecoded text so the
			// column count of the record stays intact.
			return rawValue;
		}
	}

}

3.  EtlExec

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Driver that configures and submits the log ETL job using {@link EtlMapper}.
 *
 * <p>Usage: {@code EtlExec [inputPath [outputPath]]}. When an argument is
 * omitted the corresponding default HDFS path below is used.
 */
public class EtlExec {

	private static final String DEFAULT_INPUT = "hdfs://master:8020/datas/dsLogFiles.txt";
	private static final String DEFAULT_OUTPUT = "hdfs://master:8020/outDatas/dsLogFiles";

	/**
	 * Submits the job, waits for it to finish and exits with 0 on success or 1
	 * on failure.
	 *
	 * @param args optional input path followed by optional output path
	 * @throws IOException            if the job cannot be configured or submitted
	 * @throws ClassNotFoundException if a job class cannot be resolved
	 * @throws InterruptedException   if interrupted while waiting for completion
	 */
	public static void main(String[] args) throws IOException,
			ClassNotFoundException, InterruptedException {
		String inputPath = args.length > 0 ? args[0] : DEFAULT_INPUT;
		String outputPath = args.length > 1 ? args[1] : DEFAULT_OUTPUT;

		Configuration conf = new Configuration();

		Job job = Job.getInstance(conf);
		job.setJarByClass(EtlExec.class); // locate the jar through the driver class

		job.setMapperClass(EtlMapper.class);
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(NullWritable.class);

		// Declare the final output types too, so they match what is written
		// instead of relying on the framework defaults.
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(NullWritable.class);

		FileInputFormat.setInputPaths(job, new Path(inputPath));
		FileOutputFormat.setOutputPath(job, new Path(outputPath));

		boolean succeeded = job.waitForCompletion(true);
		// Process exit-code convention: 0 means success, non-zero means failure.
		System.exit(succeeded ? 0 : 1);
	}

}

4.  输出文件（每条记录输出为一行，字段之间以逗号分隔；此处为便于阅读按字段换行显示）

122.245.205.218,
1450572279.254,
hadoop-master.volitation.com,
1,
DFBFABA3-9F0B-451F-B47C-782EDBFB5D90,
1450572272695,
1,
e_l,
website,
js,
1440*900,
DE9CBECE-D062-4486-A3A6-DFB2A04A3D28,
Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36,
zh-CN

 

已标记关键词 清除标记
©️2020 CSDN 皮肤主题: 编程工作室 设计师:CSDN官方博客 返回首页