24. // word count
class Combiner
method Combine(string t, counts [c1, c2, . . .])
sum ← 0
for all count c ∈ counts [c1, c2, . . .] do
sum ← sum + c
Emit(string t, count sum)
25.
26.
27. reduce: (k2, [v2]) → [(k3, v3)]
//word count
class Reducer
method Reduce(term t, counts [c1, c2, . . .])
sum ← 0
for all count c ∈ counts [c1,c2,...] do
sum ← sum + c
Emit(term t, count sum)
36. package com.philippeadjiman.hadooptraining;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Partitioner;
public class MyPartitioner implements Partitioner<IntWritable,Text> {
" @Override
" public int getPartition(IntWritable key, Text value, int numPartitions) {
" " /* Pretty ugly hard coded partitioning function. Don't do that in practice,
it is just for the sake of understanding. */
" " int nbOccurences = key.get();
" " if( nbOccurences < 3 )
" " " return 0;
" " else
" " " return 1;
" }
" @Override
" public void configure(JobConf arg0) {
" }
} Hadoop Tutorial Series, Issue #2: Getting Started With (Customized) Partitioning
37. x + y = y + x (commutativity of addition)
x · y = y · x (commutativity of multiplication)
(x + y) + z = x + (y + z) (associativity of addition)
(x · y) · z = x · (y · z) (associativity of multiplication)
38.
39.
40.
41.
42. class Mapper {
buffer
init() {
buffer = HashMap.new
}
map(id, data) {
elements = process(data)
for each element {
....
check_and_put(buffer, k2, v2)
}
} // Designing algorithms for Map Reduce
43. check_and_put(buffer, k2, v2) {
if buffer.full {
for each k in buffer.keys {
emit(k, buffer[k])
}
buffer.clear() # flush before inserting, otherwise (k2, v2) would be dropped
}
buffer.incrby(k2, v2) // H[k2]+=v2
}
close() {
for each k2 in buffer.keys {
emit(k2, buffer[k2])
}
}
} Designing algorithms for Map Reduce
44. ! ! ! !"# ! ! ! 2,%
6"#
"# "# "# !"#$%&'
" " " $%&'(%)*'+
5,%
'( '( '( ()*+*,-./0$1/
! ! !#! !+(,+ ! ! !
$ $ 2,%$ :*8+"*6$+/
!#!"#
6"#
!"#$%&'
" " " $%&'(%)*'+ -(.*#/0 "# "# "#
5,%
# #
(+2*3+
'( '( '( ()*+*,-./0$1/
,33"/3,+*#)9+"//
!" !" 2/"3/
$ $ !+(,+ 1+2*3+ $ $ $ :*8+"*6$+/
!# !# "% "% !"#$%&'
% -(.*#/0 4.5&*6+(
# #% & & 4#56*)/
(+2*3+
,33"/3,+*#)9+"//
!" !" 2/"3/
1+2*3+
Figure 1: Distributed execution plan for MapReduce
when reduce cannot be decomposed to perform partial
aggregation.

With this user-defined function, and merge and
grouping operators provided by the system, it is pos-
sible to execute a simple distributed computation as
shown in Figure 1. The computation has exactly
two phases: the first phase executes a Map function
on the inputs to extract keys and records, then per-
forms a partitioning of these outputs based on the
keys of the records. The second phase collects and

Figure 2: Distributed execution plan for MapReduce
when reduce supports partial aggregation. The imple-
mentation of GroupBy in the first stage may be different to
45.
46. Def. 1
x: data items; x1 ⊕ x2: concatenation of x1 and x2.
H is decomposable if there exist two functions I and C satisfying:
1) ∀x1, x2 : H(x1 ⊕ x2) = C(I(x1 ⊕ x2)) = C(I(x1) ⊕ I(x2))
2) ∀x1, x2 : I(x1 ⊕ x2) = I(x2 ⊕ x1)
3) ∀x1, x2 : C(x1 ⊕ x2) = C(x2 ⊕ x1)
Def. 2
H is associative-decomposable if it is decomposable (conditions
1–3 of Def. 1) and C additionally satisfies:
4) ∀x1, x2, x3 : C(C(x1 ⊕ x2) ⊕ x3) = C(x1 ⊕ C(x2 ⊕ x3))
(i.e. C is associative)
47.
48.
49. class Combiner {
share_space
init(share_space_info) {
share_space = conn(share_space_info)
}
combine(key, elements) {
sum = 0
for each element {
...
sum += v
} //
50. share_space.incrby(key, sum)
emit(key, share_space_info)
} // end combine()
}
class Reducer {
reduce(key, list_of_share_space_info) {
for each share_space_info {
share_space = conn(share_space_info)
sum = 0
elements = share_space.hget(key)
for each element {
...
}
}
}
51. partition(key) {
range = (KEY_MAX - KEY_MIN) / NUM_OF_REDUCERS
reducer_no = (key - KEY_MIN) / range
# clamp: key == KEY_MAX would otherwise map to NUM_OF_REDUCERS, out of range
return min(reducer_no, NUM_OF_REDUCERS - 1)
} Designing algorithms for Map Reduce
66. # Call at each hit record
map(k1, hitRecord) {
site = hitRecord.site
# key(=site) slice
slice = lookupSlice(site)
if (slice.time - now > 60.minutes) {
# Notify reducer whole slice of site is sent
advance(site, slice)
slice = lookupSlice(site)
}
emitIntermediate(site, slice, 1)
} Map Reduce and Stream Processing
67. combine(site, slice, countList) {
hitCount = 0
for count in countList {
hitCount += count
}
# Send the message to the downstream node
emitIntermediate(site, slice, hitCount)
} Map Reduce and Stream Processing