RHadoop, Hadoop for R
[Chart: "Scholarly Activity 05-09 change" -- percent change in scholarly activity (-37.5% to 50%) for R, SAS, SPSS, S-Plus, Stata, shown alongside CRAN package counts on a log scale (1 to 10000) over 2002-2010. Source: http://r4stats.com/popularity]
David Champagne, CTO
The RHadoop packages: rhdfs, rhbase, rmr

rmr
sapply(data, function)

mapreduce(data, map = function)
library(rmr)

mapreduce(…)
[Diagram: the Hadoop programming landscape; tools grouped by whether they expose or hide MapReduce]

Expose MR:  rmr, Rhipe, Dumbo, Pydoop, Hadoopy;  Java, C++
Hide MR:    Hive, Pig;  Cascalog, Scalding, Scrunch;  Cascading, Crunch
mapreduce(input, output, map, reduce)
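A minimal invocation, per the speaker notes: input can be as simple as a path, and output can be omitted, in which case mapreduce returns a handle to a managed temporary location. A hedged sketch; the path is hypothetical:

library(rmr)
out = mapreduce(input = "/user/me/pv_users",        # hypothetical HDFS path
                map = function(k, v) keyval(k, v))  # identity map
from.dfs(out)                                       # inspect the result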
map = function(k, v) if (hash(k) %% 10 == 0) keyval(k, v)

reduce = function(k, vv) keyval(k, length(vv))
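The two functions assembled into a runnable job, as a hedged sketch: hash() is not part of rmr's API, so any deterministic integer hash of the key works as a stand-in, and input is assumed to be an existing DFS object or path.

hash = function(k) sum(utf8ToInt(as.character(k)))  # stand-in hash, not rmr API

out = mapreduce(
  input = input,                                    # assumed: existing DFS data
  map = function(k, v)
    if (hash(k) %% 10 == 0) keyval(k, v),           # keep roughly 1 key in 10
  reduce = function(k, vv) keyval(k, length(vv)))   # count surviving values per key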
condition = function(x) x > 10


out = mapreduce(
        input = input,
        map = function(k,v)
                 if (condition(v)) keyval(k,v))
x = from.dfs(hdfs.object)

hdfs.object = to.dfs(x)
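
The memory-DFS bridge in a concrete round trip; a sketch assuming rmr 1.x semantics, where from.dfs returns key-value pairs:

hdfs.object = to.dfs(mtcars)    # push an in-memory data set out to the DFS
x = from.dfs(hdfs.object)       # read it back into memory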
INSERT OVERWRITE TABLE pv_gender_sum
SELECT pv_users.gender, count (DISTINCT pv_users.userid)
FROM pv_users
GROUP BY pv_users.gender;

mapreduce(input =
    mapreduce(input = "pv_users",
      map = function(k, v) keyval(v['userid'], v['gender']),
      reduce = function(uid, genders)
        lapply(unique(genders), function(g) keyval(NULL, g))),
  output = "pv_gender_sum",
  map = function(x, gender) keyval(gender, 1),
  reduce = function(gender, counts)
    keyval(gender, sum(unlist(counts))))
kmeans =
  function(points, ncenters, iterations = 10,
           distfun = function(a,b) norm(as.matrix(a-b), type = 'F')) {
    newCenters = kmeans.iter(points, distfun, ncenters = ncenters)
    for(i in 1:iterations) {
      newCenters = kmeans.iter(points, distfun, centers = newCenters)}
    newCenters}


kmeans.iter =
  function(points, distfun, ncenters = dim(centers)[1], centers = NULL) {
    from.dfs(
      mapreduce(
        input = points,
        map = if (is.null(centers)) {
                 function(k,v) keyval(sample(1:ncenters,1),v)}
              else {
                 function(k,v) {
                   distances = apply(centers, 1, function(c) distfun(c,v))
                   keyval(centers[which.min(distances),], v)}},
        reduce = function(k,vv) keyval(NULL, apply(do.call(rbind, vv), 2, mean))),
      to.data.frame = T)}
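A usage sketch for the kmeans above, assuming rmr 1.x, where to.dfs writes a list of key-value pairs and each value is one point. The Pig/Python/Java version of the same algorithm follows for comparison; notice the three different languages.

set.seed(1)
pts = lapply(1:100, function(i) keyval(i, rnorm(2, mean = i %% 3)))  # 3 loose clusters
clusters = kmeans(to.dfs(pts), ncenters = 3, iterations = 5)
clusters    # a data frame of final centers, one row per center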
#!/usr/bin/python
import sys
from math import fabs
from org.apache.pig.scripting import Pig

filename = "student.txt"
k = 4
tolerance = 0.01

MAX_SCORE = 4
MIN_SCORE = 0
MAX_ITERATION = 100

# initial centroid, equally divide the space
initial_centroids = ""
last_centroids = [None] * k
for i in range(k):
    last_centroids[i] = MIN_SCORE + float(i)/k*(MAX_SCORE-MIN_SCORE)
    initial_centroids = initial_centroids + str(last_centroids[i])
    if i!=k-1:
        initial_centroids = initial_centroids + ":"

P = Pig.compile("""register udf.jar
                   DEFINE find_centroid FindCentroid('$centroids');
                   raw = load 'student.txt' as (name:chararray, age:int, gpa:double);
                   centroided = foreach raw generate gpa, find_centroid(gpa) as centroid;
                   grouped = group centroided by centroid;
                   result = foreach grouped generate group, AVG(centroided.gpa);
                   store result into 'output';
                """)

converged = False
iter_num = 0
while iter_num < MAX_ITERATION:
    Q = P.bind({'centroids': initial_centroids})
    results = Q.runSingle()
    if not results.isSuccessful():
        raise Exception("Pig job failed")
    iter = results.result("result").iterator()
    centroids = [None] * k
    distance_move = 0
    # get the new centroids of this iteration, calculate the distance moved since the last iteration
    for i in range(k):
        tuple = iter.next()
        centroids[i] = float(str(tuple.get(1)))
        distance_move = distance_move + fabs(last_centroids[i]-centroids[i])
    distance_move = distance_move / k
    Pig.fs("rmr output")
    print("iteration " + str(iter_num))
    print("average distance moved: " + str(distance_move))
    if distance_move < tolerance:
        sys.stdout.write("k-means converged at centroids: [")
        sys.stdout.write(",".join(str(v) for v in centroids))
        sys.stdout.write("]\n")
        converged = True
        break
    last_centroids = centroids[:]
    initial_centroids = ""
    for i in range(k):
        initial_centroids = initial_centroids + str(last_centroids[i])
        if i!=k-1:
            initial_centroids = initial_centroids + ":"
    iter_num += 1

if not converged:
    print("did not converge after " + str(iter_num) + " iterations")
    sys.stdout.write("last centroids: [")
    sys.stdout.write(",".join(str(v) for v in last_centroids))
    sys.stdout.write("]\n")
import java.io.IOException;

import org.apache.pig.EvalFunc;
import org.apache.pig.data.Tuple;


public class FindCentroid extends EvalFunc<Double> {
    double[] centroids;
    public FindCentroid(String initialCentroid) {
        String[] centroidStrings = initialCentroid.split(":");
        centroids = new double[centroidStrings.length];
        for (int i=0;i<centroidStrings.length;i++)
            centroids[i] = Double.parseDouble(centroidStrings[i]);
    }
    @Override
    public Double exec(Tuple input) throws IOException {
        double min_distance = Double.MAX_VALUE;
        double closest_centroid = 0;
        for (double centroid : centroids) {
            double distance = Math.abs(centroid - (Double)input.get(0));
            if (distance < min_distance) {
                min_distance = distance;
                closest_centroid = centroid;
            }
        }
        return closest_centroid;
    }

}
mapreduce(mapreduce(…

mapreduce(input = c(input1, input2), …)

equijoin = function(
    left.input, right.input, input,
    output,
    outer,
    map.left, map.right,
    reduce, reduce.all)
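
A hedged equijoin sketch: join two key-value data sets on their keys. The inputs here are hypothetical, only the parameter names shown in the signature above are used, and defaults are assumed for the rest.

left  = to.dfs(lapply(1:3, function(i) keyval(i, i * 10)))
right = to.dfs(lapply(2:4, function(i) keyval(i, i * 100)))
joined = equijoin(left.input = left, right.input = right)  # match on shared keys
from.dfs(joined)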
out1 = mapreduce(…)
mapreduce(input = out1, <xyz>)
mapreduce(input = out1, <abc>)

abstract.job = function(input, output, …) {
   …
   result = mapreduce(input = input,
                      output = output)
   …
   result}
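
Calling such a wrapper is then ordinary R; a sketch, since the body above is elided, with hypothetical paths:

result = abstract.job(input = "/tmp/in", output = "/tmp/out")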
input.format, output.format, format
combine
reduce.on.data.frame
local, hadoop backends
backend.parameters
profiling
verbose
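
The backends in particular make debugging pleasant: the same job can run in-process. A hedged sketch, assuming the option setter from early rmr releases (later versions renamed it):

rmr.options.set(backend = "local")    # run mapreduce() in-process, no cluster needed
# ... develop and test ...
rmr.options.set(backend = "hadoop")   # switch back to the cluster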
RHADOOP USER
ONE FAT CLUSTER AVE.
HYDROPOWER CITY, OR 0x0000

RHADOOP@REVOLUTIONANALYTICS.COM

Editor's Notes

  1. What is R; what is RHadoop: an open source project started by Revolution Analytics that aims to make R and Hadoop work together; what is Revolution.
  5. Faster, assured builds; large-data extensions; web deployments; tech support; consulting services; training.
  7. Hadoop brings horizontal scalability, R brings sophisticated analytics; the combination could be powerful.
  8-24. Hadoop is one project but also a family of projects. We started the integration path with three projects targeting three members of the Hadoop family. rhdfs provides access to the HDFS file system and can be divided into two sub-APIs: file level and byte level.
  25. A way to access big data sets.
  26. A simple way to write parallel programs; everyone will have to.
  27. Very R-like, building on the functional characteristics of R.
  28. Just a library.
  29-35. Much simpler than writing Java. Not as simple as Hive or Pig at what they do, but more general. Great for prototyping; can transition to production by optimizing instead of rewriting. Lower risk, always executable.
  36-43. mapreduce is the first and most important element of the API. Input can be as simple as a path; output the same, or skip it to get managed space with stubs. map and reduce are simple R functions, as opposed to Rhipe.
  44-56. Simple map example: filtering. Reduce example: counting.
  57-62. Easy to parametrize jobs.
  63. The second pillar of the API: the memory-HDFS bridge.
  64-65. A language like Hive makes a class of problems easy to solve, but it is not a general tool. The cost of doing the same operation in rmr is modest, and it provides a broader set of capabilities.
  66-69. kmeans implementation in two simple functions; note how easy it is to get data in and out of the cluster.
  70. Skip quickly to the other slides; notice three different languages.
  73-75. More things you can do by combining the elements of the API.