CHAPTER 3: Filtering Patterns
There are a couple of reasons why map-only jobs are efficient.
• Since no reducers are needed, data never has to be transmitted between the map
and reduce phase. Most of the map tasks pull data off of their locally attached disks
and then write back out to that node.
• Since there are no reducers, both the sort phase and the reduce phase are cut out.
This usually doesn’t take very long, but every little bit helps.
Distributed grep
/**
 * Map-only "distributed grep": writes out every input line that matches, in
 * its entirety, the regular expression stored in the job configuration under
 * the key "mapregex". Matching lines are emitted as the value with a null key.
 */
public static class GrepMapper
        extends Mapper<Object, Text, NullWritable, Text> {

    // Compiled once per task in setup(); calling String.matches() in map()
    // would recompile the same expression for every single record.
    private java.util.regex.Pattern mapPattern = null;

    /**
     * Reads the regular expression from the configuration and compiles it.
     *
     * @throws IllegalStateException if "mapregex" is not set
     */
    public void setup(Context context) throws IOException,
            InterruptedException {
        String mapRegex = context.getConfiguration().get("mapregex");
        if (mapRegex == null) {
            throw new IllegalStateException(
                    "Configuration key \"mapregex\" is not set");
        }
        mapPattern = java.util.regex.Pattern.compile(mapRegex);
    }

    /**
     * Emits the line unchanged when the whole line matches the expression
     * (same full-match semantics as String.matches).
     */
    public void map(Object key, Text value, Context context)
            throws IOException, InterruptedException {
        if (mapPattern.matcher(value.toString()).matches()) {
            context.write(NullWritable.get(), value);
        }
    }
}
/**
 * Simple random sampling mapper: each input record is kept independently
 * with the probability configured under "filter_percentage".
 */
public static class SRSMapper
        extends Mapper<Object, Text, NullWritable, Text> {

    private Random rands = new Random();

    // Stored as a fraction in [0, 1]. A primitive avoids unboxing a Double
    // on every call to map().
    private double percentage;

    /**
     * Reads the sampling percentage from the configuration.
     *
     * @throws IllegalStateException if "filter_percentage" is not set
     */
    protected void setup(Context context) throws IOException,
            InterruptedException {
        // Retrieve the percentage that is passed in via the configuration.
        // Configuration values are strings, so set it like this:
        //   conf.set("filter_percentage", ".5");
        // for .5%
        String strPercentage = context.getConfiguration()
                .get("filter_percentage");
        if (strPercentage == null) {
            throw new IllegalStateException(
                    "Configuration key \"filter_percentage\" is not set");
        }
        // The configured value is a percentage; convert it to a fraction.
        percentage = Double.parseDouble(strPercentage) / 100.0;
    }

    /** Emits the record when a uniform random draw falls below the fraction. */
    public void map(Object key, Text value, Context context)
            throws IOException, InterruptedException {
        if (rands.nextDouble() < percentage) {
            context.write(NullWritable.get(), value);
        }
    }
}
Bloom Filtering
Bloom filter training.
/**
 * Trains a Bloom filter from the gzip-compressed text files found under an
 * input path (one member per line) and serializes the filter to a file.
 *
 * Arguments: input path, approximate number of members, desired false
 * positive rate, output path for the serialized filter.
 */
public class BloomFilterDriver {
    public static void main(String[] args) throws Exception {
        if (args.length < 4) {
            System.err.println("Usage: BloomFilterDriver <input> <numMembers>"
                    + " <falsePosRate> <bloomFilterFile>");
            System.exit(1);
        }

        // Parse command line arguments
        Path inputFile = new Path(args[0]);
        int numMembers = Integer.parseInt(args[1]);
        float falsePosRate = Float.parseFloat(args[2]);
        Path bfFile = new Path(args[3]);

        // Calculate our vector size and optimal K value based on approximations
        int vectorSize = getOptimalBloomFilterSize(numMembers, falsePosRate);
        int nbHash = getOptimalK(numMembers, vectorSize);

        // Create new Bloom filter
        BloomFilter filter = new BloomFilter(vectorSize, nbHash,
                Hash.MURMUR_HASH);
        System.out.println("Training Bloom filter of size " + vectorSize
                + " with " + nbHash + " hash functions, " + numMembers
                + " approximate number of records, and " + falsePosRate
                + " false positive rate");

        // Read every file under the input path and add each line to the filter
        int numElements = 0;
        FileSystem fs = FileSystem.get(new Configuration());
        for (FileStatus status : fs.listStatus(inputFile)) {
            java.io.InputStream raw = fs.open(status.getPath());
            BufferedReader rdr = null;
            try {
                rdr = new BufferedReader(new InputStreamReader(
                        new GZIPInputStream(raw)));
                System.out.println("Reading " + status.getPath());
                String line;
                while ((line = rdr.readLine()) != null) {
                    // NOTE(review): getBytes() uses the platform default
                    // charset; whatever tests membership later must encode
                    // its keys the same way.
                    filter.add(new Key(line.getBytes()));
                    ++numElements;
                }
            } finally {
                // Close even when reading fails. If wrapping the raw stream
                // failed there is no reader yet, so close the raw stream.
                if (rdr != null) {
                    rdr.close();
                } else {
                    raw.close();
                }
            }
        }
        System.out.println("Trained Bloom filter with " + numElements
                + " entries.");

        System.out.println("Serializing Bloom filter to HDFS at " + bfFile);
        FSDataOutputStream strm = fs.create(bfFile);
        try {
            filter.write(strm);
            strm.flush();
        } finally {
            // Release the output stream even if serialization throws
            strm.close();
        }
        System.exit(0);
    }
}
/**
 * Keeps a record when at least one whitespace-separated word of its "Text"
 * attribute tests positive against a pre-trained Bloom filter. The filter is
 * deserialized from the first file registered in the DistributedCache.
 */
public static class BloomFilteringMapper extends
        Mapper<Object, Text, Text, NullWritable> {

    private BloomFilter filter = new BloomFilter();

    /**
     * Loads the serialized Bloom filter.
     *
     * @throws IOException if no cache file is registered or it cannot be read
     */
    protected void setup(Context context) throws IOException,
            InterruptedException {
        // Get file from the DistributedCache
        URI[] files = DistributedCache.getCacheFiles(context
                .getConfiguration());
        if (files == null || files.length == 0) {
            throw new IOException(
                    "Bloom filter file is not set in the DistributedCache");
        }
        System.out.println("Reading Bloom filter from: "
                + files[0].getPath());

        // Open local file for read.
        DataInputStream strm = new DataInputStream(new FileInputStream(
                files[0].getPath()));
        try {
            // Read into our Bloom filter.
            filter.readFields(strm);
        } finally {
            // Do not leak the file handle if deserialization fails
            strm.close();
        }
    }

    /** Emits the whole record once, on the first word found in the filter. */
    public void map(Object key, Text value, Context context)
            throws IOException, InterruptedException {
        Map<String, String> parsed = transformXmlToMap(value.toString());

        // Get the value for the comment
        String comment = parsed.get("Text");
        if (comment == null) {
            // Record has no comment text; nothing to test
            return;
        }

        StringTokenizer tokenizer = new StringTokenizer(comment);
        // For each word in the comment
        while (tokenizer.hasMoreTokens()) {
            // If the word is in the filter, output the record and break
            String word = tokenizer.nextToken();
            if (filter.membershipTest(new Key(word.getBytes()))) {
                context.write(value, NullWritable.get());
                break;
            }
        }
    }
}
This Bloom filter was trained with all user IDs that have a reputation of at least 1,500.
/**
 * Keeps a record when its "UserId" tests positive against a pre-trained Bloom
 * filter AND the reputation stored for that user in the HBase table
 * "user_table" is at least 1500. The HBase lookup weeds out the filter's
 * false positives.
 */
public static class BloomFilteringMapper extends
        Mapper<Object, Text, Text, NullWritable> {

    private BloomFilter filter = new BloomFilter();
    private HTable table = null;

    /**
     * Loads the serialized Bloom filter from the first DistributedCache file
     * and opens the HBase user table.
     *
     * @throws IOException if no cache file is registered or it cannot be read
     */
    protected void setup(Context context) throws IOException,
            InterruptedException {
        // Get file from the Distributed Cache
        URI[] files = DistributedCache.getCacheFiles(context
                .getConfiguration());
        if (files == null || files.length == 0) {
            throw new IOException(
                    "Bloom filter file is not set in the DistributedCache");
        }
        System.out.println("Reading Bloom filter from: "
                + files[0].getPath());

        // Open local file for read.
        DataInputStream strm = new DataInputStream(new FileInputStream(
                files[0].getPath()));
        try {
            // Read into our Bloom filter.
            filter.readFields(strm);
        } finally {
            // Do not leak the file handle if deserialization fails
            strm.close();
        }

        // Get HBase table of user info
        Configuration hconf = HBaseConfiguration.create();
        table = new HTable(hconf, "user_table");
    }

    /** Emits the record only for users confirmed to have reputation >= 1500. */
    public void map(Object key, Text value, Context context)
            throws IOException, InterruptedException {
        Map<String, String> parsed = transformXmlToMap(value.toString());

        // Get the value for the user ID
        String userid = parsed.get("UserId");
        if (userid == null) {
            // Record carries no user ID; nothing to look up
            return;
        }

        // If this user ID is in the set
        if (filter.membershipTest(new Key(userid.getBytes()))) {
            // Get the reputation from the HBase table
            Result r = table.get(new Get(userid.getBytes()));
            byte[] rawReputation = r.getValue(
                    "attr".getBytes(), "Reputation".getBytes());
            if (rawReputation == null) {
                // No reputation cell for this user: treat as not qualifying
                return;
            }
            int reputation = Integer.parseInt(new String(rawReputation));

            // If the reputation is at least 1500,
            // write the record to the file system
            if (reputation >= 1500) {
                context.write(value, NullWritable.get());
            }
        }
    }

    /** Releases the HBase table opened in setup(). */
    protected void cleanup(Context context) throws IOException,
            InterruptedException {
        if (table != null) {
            table.close();
        }
    }
}
Top Ten
class mapper:
setup():
initialize top ten sorted list
map(key, record):
insert record into top ten sorted list
if length of list is greater than 10 then
truncate list to a length of 10
cleanup():
for record in top ten sorted list:
emit null,record
class reducer:
setup():
initialize top ten sorted list
reduce(key, records):
sort records
truncate records to top 10
for record in records:
emit record
Top Ten
map阶段用于产生每个数据块的top ten:利用TreeMap按reputation排序,超过十条记录时移除reputation最小的那一条,并在cleanup阶段写出TreeMap中的value
/**
 * Finds the (up to) ten records with the highest "Reputation" seen by this
 * map task and emits them, with a null key, once all input has been read.
 *
 * Limitation: the map is keyed by reputation, so two records with the same
 * reputation overwrite each other and only the later one is kept.
 */
public static class TopTenMapper extends
        Mapper<Object, Text, NullWritable, Text> {

    // Stores a map of user reputation to the record
    private TreeMap<Integer, Text> repToRecordMap = new TreeMap<Integer, Text>();

    /** Adds the record to the running top ten, evicting the lowest if needed. */
    public void map(Object key, Text value, Context context)
            throws IOException, InterruptedException {
        Map<String, String> parsed = transformXmlToMap(value.toString());
        String reputation = parsed.get("Reputation");
        if (reputation == null) {
            // Record has no reputation attribute; it cannot be ranked
            return;
        }

        // Add this record to our map with the reputation as the key.
        // The value is copied because the framework reuses the Text object.
        repToRecordMap.put(Integer.parseInt(reputation), new Text(value));

        // If we have more than ten records, remove the one with the lowest
        // rep. A TreeMap keeps its keys in ascending order, so the lowest
        // reputation is the first key.
        if (repToRecordMap.size() > 10) {
            repToRecordMap.remove(repToRecordMap.firstKey());
        }
    }

    /** Emits the retained records (in ascending reputation order). */
    protected void cleanup(Context context) throws IOException,
            InterruptedException {
        // Output our ten records to the reducers with a null key
        for (Text t : repToRecordMap.values()) {
            context.write(NullWritable.get(), t);
        }
    }
}
reduce阶段从values中解析出每条记录的得分(reputation),然后同样利用TreeMap只保留前十条,与map阶段类似
// Reputation -> record. A TreeMap keeps its keys in ascending order, so the
// first entry is always the lowest reputation currently held.
private TreeMap<Integer, Text> repToRecordMap = new TreeMap<Integer, Text>();

/**
 * Merges the per-mapper candidates into a global top ten and writes them,
 * highest reputation first, with a null key.
 */
public void reduce(NullWritable key, Iterable<Text> values,
        Context context) throws IOException, InterruptedException {
    for (Text record : values) {
        Map<String, String> attributes = transformXmlToMap(record.toString());
        int rep = Integer.parseInt(attributes.get("Reputation"));
        repToRecordMap.put(rep, new Text(record));

        // Keep at most ten entries: drop the lowest reputation (first entry)
        if (repToRecordMap.size() > 10) {
            repToRecordMap.pollFirstEntry();
        }
    }

    // Walk the keys from highest to lowest and write out each record
    for (Integer rep : repToRecordMap.descendingKeySet()) {
        context.write(NullWritable.get(), repToRecordMap.get(rep));
    }
}
}
Distinct
map(key, record):
emit record,null
reduce(key, records):
emit key
The mapper takes each record and extracts the data fields for which we want unique values. In our HTTP logs example, this means extracting the user, the web browser, and the device values. The mapper outputs the record as the key, and null as the value.
/**
 * Emits the "UserId" attribute of each record as the output key with a null
 * value, so that the shuffle groups identical user IDs together.
 */
public static class DistinctUserMapper extends
        Mapper<Object, Text, Text, NullWritable> {

    // Reused across calls to avoid allocating a Text per record
    private Text outUserId = new Text();

    public void map(Object key, Text value, Context context)
            throws IOException, InterruptedException {
        Map<String, String> parsed = transformXmlToMap(value.toString());

        // Get the value for the UserId attribute
        String userId = parsed.get("UserId");
        if (userId == null) {
            // Record has no user ID; Text.set(null) would throw
            return;
        }

        // Set our output key to the user's id
        outUserId.set(userId);

        // Write the user's id with a null value
        context.write(outUserId, NullWritable.get());
    }
}
/**
 * Collapses each group of identical user IDs into a single output line:
 * the key is written exactly once per group and the values are ignored.
 */
public static class DistinctUserReducer extends
        Reducer<Text, NullWritable, Text, NullWritable> {

    public void reduce(Text key, Iterable<NullWritable> values,
            Context context) throws IOException, InterruptedException {
        // One line per distinct user id; the value carries no information
        final NullWritable nothing = NullWritable.get();
        context.write(key, nothing);
    }
}
Combiner optimization. A combiner can and should be used in the distinct pattern. Duplicate keys will be removed from each local map’s output, thus reducing the amount of network I/O required. The same code for the reducer can be used in the combiner.