Source code:
TokenizerMapper.java:
package com.bazhangkeji.hadoop;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Splits each input line into whitespace-separated tokens, strips every
 * character that is not an ASCII letter or digit, and emits each cleaned
 * word with a count of 1.
 */
public class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {
    private final IntWritable one = new IntWritable(1);
    private final Text word = new Text();

    @Override
    public void map(Object key, Text value, Context context)
            throws IOException, InterruptedException {
        StringTokenizer itr = new StringTokenizer(value.toString());
        StringBuilder strOut = new StringBuilder();
        while (itr.hasMoreTokens()) {
            strOut.setLength(0);
            String token = itr.nextToken();
            // Keep only ASCII letters and digits; drop punctuation and other symbols.
            for (int i = 0; i < token.length(); i++) {
                char c = token.charAt(i);
                if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')) {
                    strOut.append(c);
                }
            }
            // Skip tokens that contain no letters or digits, so no empty word is emitted.
            if (strOut.length() > 0) {
                word.set(strOut.toString());
                context.write(word, one);
            }
        }
    }
}
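The mapper above is only half of the job: it still has to be wired into a Job by a driver class, which is not shown in this section. The following is a minimal sketch of such a driver; the class name WordCount and the reducer IntSumReducer (a reducer that sums the IntWritable 1s emitted per word) are assumptions for illustration, not code from this project.

package com.bazhangkeji.hadoop;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// Hypothetical driver class; the project's real driver may differ.
public class WordCount {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "word count");
        job.setJarByClass(WordCount.class);
        job.setMapperClass(TokenizerMapper.class);
        // IntSumReducer is assumed: it sums the counts for each word.
        // Using it as a combiner pre-aggregates counts on the map side.
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));   // input path in HDFS
        FileOutputFormat.setOutputPath(job, new Path(args[1])); // output path (must not already exist)
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

A job built this way would typically be packaged into a jar and launched with something like: hadoop jar wordcount.jar com.bazhangkeji.hadoop.WordCount /input /output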