Spark Workshop
Basics and Streaming
Wojciech Pituła
June 29, 2015
Grupa Wirtualna Polska
0
Agenda
Scala
Spark
Development
Architecture
Spark SQL
Spark Streaming
1
Scala
Vals, vars and defs
[wpitula@wpitula-e540 tmp]$ sbt console
...
Welcome to Scala version 2.10.4 (OpenJDK 64-Bit Server VM, Java 1.8.0_45).
Type in expressions to have them evaluated.
Type :help for more information.
scala> var foo = 1
foo: Int = 1
scala> def fooMultipliedBy(x: Double) = foo*x
fooMultipliedBy: (x: Double)Double
scala> val result = fooMultipliedBy(2)
result: Double = 2.0
scala> result = fooMultipliedBy(3)
<console>:10: error: reassignment to val
scala> foo = 2
foo: Int = 2
scala> fooMultipliedBy(2)
res1: Double = 4.0
3
> pl.wp.sparkworkshop.scala.exercise1
4
Classes and Objects
scala> class Person(age:Int = 22) {
| def canDrink(limit:Int = 18) = age >= limit //public by default
| }
defined class Person
scala> (new Person).canDrink()
res2: Boolean = true
scala> (new Person(18)).canDrink(21)
res3: Boolean = false
scala> object Person {
| def inAgeRange(from: Int, to: Int) = new Person(from + scala.util.Random.nextInt(to - from))
| }
defined object Person
scala> Person.inAgeRange(15, 17).canDrink()
res4: Boolean = false
5
Classes and Objects 2
∙ case classes can be seen as plain, immutable data-holding objects that
should depend exclusively on their constructor arguments.
∙ case class = class + factory method + pattern matching + equals/hashCode +
toString + copy
scala> case class Rational(n: Int, d: Int = 1)
defined class Rational
scala> val (a, b, c) = (Rational(1,2), Rational(3,4), Rational(1,2))
a: Rational = Rational(1,2)
b: Rational = Rational(3,4)
c: Rational = Rational(1,2)
scala> a == c
res0: Boolean = true
scala> a.copy(d = 3)
res1: Rational = Rational(1,3)
6
> pl.wp.sparkworkshop.scala.exercise2
7
Higher order functions
scala> def add1(x:Int, y:Int) = x+y // "method"
add1: (x: Int, y: Int)Int
scala> val add2 = add1 _ // converted method
add2: (Int, Int) => Int = <function2>
scala> val add3 = (x:Int, y:Int) => x+y // function literal
add3: (Int, Int) => Int = <function2>
scala> def magic(func: (Int, Int) => Int) = func(4,3)
magic: (func: (Int, Int) => Int)Int
scala> magic(add1)
res0: Int = 7
8
Higher order functions 2
scala> def transformer(x:Int)(func: ((Int,Int) => Int)) = (y:Int) => func(x, y)
transformer: (x: Int)(func: (Int, Int) => Int)Int => Int
scala> transformer _
res0: Int => (((Int, Int) => Int) => (Int => Int)) = <function1>
9
Higher order functions 3
scala> def transformer(x:Int)(func: ((Int,Int) => Int)) = (y:Int) => func(x, y)
transformer: (x: Int)(func: (Int, Int) => Int)Int => Int
scala> transformer _
res0: Int => (((Int, Int) => Int) => (Int => Int)) = <function1>
scala> val five = transformer(5) _
five: ((Int, Int) => Int) => (Int => Int) = <function1>
scala> val fivePlus = five(_+_)
fivePlus: Int => Int = <function1>
scala> val fivePlusThree = fivePlus(3)
fivePlusThree: Int = 8
scala> transformer(5)(_+_)(3)
res1: Int = 8
10
Type params
class Value[T](x: T) {
  def map[V](func: T => V): Value[V] = new Value(func(x))
}

case class Vector2[T : Numeric](x: T, y: T) {
  val num = implicitly[Numeric[T]]
  import num._
  def transform[V : Numeric](func: T => V) = Vector2(func(x), func(y))
  def join(other: Vector2[T], joinFunc: (T, T) => T) = ???
  def +(other: Vector2[T]) = join(other, _+_)
  def -(other: Vector2[T]) = join(other, _-_)
  // [2,3] ^ 2 = [4, 9]
  def ^(exp: Int): Vector2[T] = ???
}
> pl.wp.sparkworkshop.scala.exercise3
11
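For reference, a sketch (not the exercise3 reference solution) of how the ??? placeholders above could be filled in, assuming element-wise semantics, shown as a self-contained variant:

case class Vector2[T : Numeric](x: T, y: T) {
  val num = implicitly[Numeric[T]]
  import num._
  def join(other: Vector2[T], joinFunc: (T, T) => T) =
    Vector2(joinFunc(x, other.x), joinFunc(y, other.y)) // combine component-wise
  def +(other: Vector2[T]) = join(other, _ + _)
  def -(other: Vector2[T]) = join(other, _ - _)
  // [2,3] ^ 2 = [4, 9]: raise each component to the given power
  def ^(exp: Int): Vector2[T] =
    Vector2(List.fill(exp)(x).product, List.fill(exp)(y).product)
}

Vector2(2, 3) ^ 2 // Vector2(4, 9)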
Collections
1. scala.collection, scala.collection.immutable, and
scala.collection.mutable
2. immutable imported by default
3. (List, ListBuffer), (Array, ArrayBuffer), (String, StringBuilder), Set,
Map (a quick sketch of the mutable variants follows below)
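The next slide uses the default immutable variants; as a quick sketch, the mutable counterparts listed above are updated in place and can be converted back:

import scala.collection.mutable.{ArrayBuffer, ListBuffer}

val buf = ListBuffer(1, 2, 3)
buf += 4                // in-place append
val asList = buf.toList // List(1, 2, 3, 4), back to the immutable variant

val arr = ArrayBuffer("a", "b")
arr(0) = "z"            // in-place update
arr += "c"              // ArrayBuffer(z, b, c)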
12
Collections 2
scala> List(1,2,3,4,5,6) // alternatively (1 to 6).toList
res0: List[Int] = List(1, 2, 3, 4, 5, 6)
scala> res0.map(_*3)
res1: List[Int] = List(3, 6, 9, 12, 15, 18)
scala> res1.filter(_%2 == 0)
res3: List[Int] = List(6, 12, 18)
scala> res3.foldLeft(0)(_+_)
res4: Int = 36
scala> res3.foreach(println)
6
12
18
scala> for(x <- res3; y <- res1 if y%2==1) yield (x,y)
res7: List[(Int, Int)] = List((6,3), (6,9), (6,15), (12,3), (12,9), (12,15), (18,3),
(18,9), (18,15))
13
> pl.wp.sparkworkshop.scala.exercise4
14
Pattern Matching
scala> case class Foo(foo: Any, bar: Any)
scala> def recognize(obj: Any) = {
| obj match {
| case str: String => s"string $str"
| case Foo(Some(1), Foo(_, _)) => "some very complicated case"
| case (x, y) => s"tuple of $x and $y"
| case _ => "Boring"
| }
| }
scala> recognize(1)
res0: String = Boring
scala> recognize("something")
res1: String = string something
scala> recognize(Foo(Some(1), Foo("","")))
res3: String = some very complicated case
scala> recognize((1,2))
res4: String = tuple of 1 and 2
15
> pl.wp.sparkworkshop.scala.exercise5
16
Sbt
val sparkVersion = "1.2.1"
lazy val root = (project in file("."))
  .settings(
    name := "spark-streaming-app",
    organization := "pl.wp.sparkworkshop",
    version := "1.0-SNAPSHOT",
    scalaVersion := "2.11.5",
    libraryDependencies ++= Seq(
      "org.apache.spark" %% "spark-core" % sparkVersion % "provided",
      "org.apache.spark" %% "spark-streaming" % sparkVersion % "provided",
      "org.scalatest" %% "scalatest" % "2.2.1" % "test",
      "org.mockito" % "mockito-core" % "1.10.19" % "test"
    ),
    resolvers ++= Seq(
      "My Repo" at "http://repo/url"
    ))
  .settings(
    publishMavenStyle := true,
    publishArtifact in Test := false,
    pomIncludeRepository := { _ => false },
    publishTo := {
      val repo = "http://repo/url"
      if (isSnapshot.value)
        Some("snapshots" at repo + "content/repositories/snapshots")
      else
        Some("releases" at repo + "content/repositories/releases")
    })
17
Exercise
"A prime number (or a prime) is a natural number which has
exactly two distinct natural number divisors: 1 and itself. Your
task is to test whether the given number is a prime number."
def isPrime(x: Int): Boolean
> pl.wp.sparkworkshop.scala.exercise6
18
Exercise - Solution
implicit class PotentiallyPrime(x: Int) {
  def isPrime(): Boolean = {
    // keep the divisors of x and check that they are exactly 1 and x
    (1 to x).filter(x % _ == 0) == List(1, x)
  }
}
val is5Prime = 5.isPrime
19
Spark
Development
21
RDD
An RDD is an immutable, deterministically re-computable,
distributed dataset.
Each RDD remembers the lineage of deterministic operations
that were used on a fault-tolerant input dataset to create it.
Each RDD can be operated on in parallel.
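What "remembers the lineage" means in practice, as a small sketch (assumes an existing SparkContext sc and a local data.txt):

val base     = sc.textFile("data.txt")
val nonEmpty = base.filter(_.nonEmpty)
val lengths  = nonEmpty.map(_.length)

// prints the chain of parent RDDs (MapPartitionsRDD <- ... <- HadoopRDD)
// that Spark would replay to recompute `lengths` after a failure
println(lengths.toDebugString)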
22
Sources
val conf = new SparkConf().setAppName("Simple Application")
val sc = new SparkContext(conf)
∙ Parallelized Collections
val data = Array(1, 2, 3, 4, 5)
val distData = sc.parallelize(data)
∙ External Datasets: Any storage source supported by Hadoop:
local file system, HDFS, Cassandra, HBase, Amazon S3, etc.
Spark supports text files, SequenceFiles, and any other
Hadoop InputFormat.
scala> val distFile = sc.textFile("data.txt")
distFile: RDD[String] = MappedRDD@1d4cee08
23
Transformations and Actions
RDDs support two types of operations:
∙ transformations, which create a new dataset from an
existing one
∙ actions, which return a value to the driver program after
running a computation on the dataset.
All transformations in Spark are lazy, in that they do not
compute their results right away. Instead, they just remember
the transformations applied to some base dataset (e.g. a file).
The transformations are only computed when an action
requires a result to be returned to the driver program.
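For example (a sketch assuming an existing SparkContext sc): the first three lines only record the lineage; nothing runs until the action on the last line.

val lines    = sc.textFile("data.txt")      // transformation: nothing is read yet
val words    = lines.flatMap(_.split(" "))  // transformation: still nothing computed
val nonEmpty = words.filter(_.nonEmpty)     // transformation: the lineage keeps growing
val n        = nonEmpty.count()             // action: now the whole chain executes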
24
Transformations
map[U](f: (T) => U): RDD[U]
Return a new distributed dataset formed by passing each element of
the source through a function func.
filter(f: (T) => Boolean): RDD[T]
Return a new dataset formed by selecting those elements of the
source on which func returns true.
union(other: RDD[T]): RDD[T]
Return a new dataset that contains the union of the elements in the
source dataset and the argument.
intersection(other: RDD[T]): RDD[T]
Return a new RDD that contains the intersection of elements in the
source dataset and the argument.
groupByKey(): RDD[(K, Iterable[V])]
When called on a dataset of (K, V) pairs, returns a dataset of (K,
Iterable<V>) pairs.
and much more
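A short sketch exercising the transformations above (assumes an existing SparkContext sc; all of these are lazy):

val a = sc.parallelize(Seq(1, 2, 3, 4))
val b = sc.parallelize(Seq(3, 4, 5, 6))

val doubled  = a.map(_ * 2)                         // 2, 4, 6, 8
val even     = a.filter(_ % 2 == 0)                 // 2, 4
val both     = a.union(b)                           // 1, 2, 3, 4, 3, 4, 5, 6
val common   = a.intersection(b)                    // 3, 4
val byParity = a.map(x => (x % 2, x)).groupByKey()  // (0, [2, 4]), (1, [1, 3])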
25
Actions
reduce(f: (T, T) => T): T
Aggregate the elements of the dataset using a function func (which
takes two arguments and returns one)
collect(): Array[T]
Return all the elements of the dataset as an array at the driver
program.
count(): Long
Return the number of elements in the dataset.
foreach(f: (T) => Unit): Unit
Run a function func on each element of the dataset.
and much more
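And a matching sketch of the actions, which trigger the computation and bring results back to the driver (again assuming sc):

val nums = sc.parallelize(1 to 10)

val sum = nums.reduce(_ + _)  // 55, aggregated on the executors, returned to the driver
val all = nums.collect()      // Array(1, 2, ..., 10), materialized at the driver
val n   = nums.count()        // 10
nums.foreach(println)         // runs on the executors, so output lands in the executor logs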
26
spark-shell
Just like Scala REPL but with SparkContext
> ./bin/spark-shell --master "local[4]"
Spark assembly has been built with Hive, including Datanucleus jars on classpath
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version 1.3.0
      /_/

Using Scala version 2.10.4 (Java HotSpot(TM) 64-Bit Server VM, Java 1.8.0_31)
Type in expressions to have them evaluated.
Type :help for more information.
Spark context available as sc.
SQL context available as sqlContext.
scala> sc.parallelize(List("Hello world")).foreach(println)
Hello world
27
> pl.wp.sparkworkshop.spark.core.exercise1
FirstCharsCount
28
spark-submit
Application jar
A jar containing the user's Spark application. Users should
create an "uber jar" containing their application along with its
dependencies. The user's jar should never include Hadoop or
Spark libraries; these will be added at runtime.
./bin/spark-submit \
  --class org.apache.spark.examples.SparkPi \
  --master spark://10.0.0.1:7077,10.0.0.2:7077 \
  --executor-memory 20G \
  --total-executor-cores 100 \
  /path/to/examples.jar \
  1000
29
> pl.wp.sparkworkshop.spark.core.exercise2
LettersCount
30
Shared variables
∙ Broadcast Variables
scala> val broadcastVar = sc.broadcast(Array(1, 2, 3))
broadcastVar: org.apache.spark.broadcast.Broadcast[Array[Int]] = Broadcast(0)
scala> broadcastVar.value
res0: Array[Int] = Array(1, 2, 3)
∙ Accumulators
scala> val accum = sc.accumulator(0, "My Accumulator")
accum: spark.Accumulator[Int] = 0
scala> sc.parallelize(Array(1, 2, 3, 4)).foreach(x => accum += x)
...
10/09/29 18:41:08 INFO SparkContext: Tasks finished in 0.317106 s
scala> accum.value
res2: Int = 10
31
Underlying Akka
"Akka is a toolkit and runtime for building highly concurrent,
distributed, and resilient message-driven applications on the
JVM."
case class Greeting(who: String)
class GreetingActor extends Actor with ActorLogging {
  def receive = {
    case Greeting(who) => log.info("Hello " + who)
  }
}
val system = ActorSystem("MySystem")
val greeter = system.actorOf(Props[GreetingActor], name = "greeter")
greeter ! Greeting("Charlie Parker")
32
Architecture
33
Clusters
∙ Standalone
∙ Apache Mesos
∙ Hadoop YARN
∙ local[*]
34
Master, Worker, Executor and Driver
Driver program
The process running the main() function of the
application and creating the SparkContext
Cluster manager
An external service for acquiring resources on the
cluster (e.g. standalone manager, Mesos, YARN)
Worker node
Any node that can run application code in the cluster
Executor
A process launched for an application on a worker
node, that runs tasks and keeps data in memory or disk
storage across them. Each application has its own
executors.
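To make the roles concrete, a minimal sketch of a driver program: whatever process runs this main() is the driver, the master URL selects the cluster manager, and the executor settings describe the processes launched for this application on the worker nodes (URL and sizes are illustrative assumptions).

import org.apache.spark.{SparkConf, SparkContext}

object MyApp {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("MyApp")
      .setMaster("spark://10.0.0.1:7077") // standalone cluster manager
      .set("spark.executor.memory", "2g") // per-executor resources
    val sc = new SparkContext(conf)       // created by the driver

    try println(sc.parallelize(1 to 100).reduce(_ + _))
    finally sc.stop()
  }
}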
35
Running a standalone cluster
Master
./sbin/start-master.sh
# OR
./bin/spark-class org.apache.spark.deploy.master.Master --ip `hostname` --port 7077 \
  --webui-port 8080
Worker
./bin/spark-class org.apache.spark.deploy.worker.Worker \
  spark://10.0.0.1:7077,10.0.0.2:7077
36
Job, Stage, Task
Job
A parallel computation consisting of multiple tasks that
gets spawned in response to a Spark action (e.g. save,
collect).
Stage
Each job gets divided into smaller sets of tasks called
stages that depend on each other (similar to the map and
reduce stages in MapReduce); you’ll see this term used in
the driver’s logs.
Task
A unit of work that will be sent to one executor
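For example, in the sketch below (assuming sc) the single collect() spawns one job; the shuffle introduced by reduceByKey splits it into two stages, and each stage runs as one task per partition.

val counts = sc.textFile("data.txt")    // stage 1: read, flatMap, map (narrow dependencies)
  .flatMap(_.split(" "))
  .map(word => (word, 1))
  .reduceByKey(_ + _)                   // shuffle boundary -> stage 2
counts.collect()                        // the action that triggers the job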
37
SparkUI
38
> src/main/scala/pl/wp/sparkworkshop/
spark/core/exercise3/submit.sh
39
Configuration - spark-defaults.conf
spark.eventLog.enabled true
spark.eventLog.dir hdfs://some/path/on/hdfs
spark.serializer org.apache.spark.serializer.KryoSerializer
spark.rdd.compress true
spark.executor.extraJavaOptions
-Dlog4j.loghost.Prefix=hadoop-spark-poc-display-executor
-Dlog4j.localRollingFile.FileName=spark-poc-display-executor.log
spark.driver.extraJavaOptions -Dlog4j.loghost.Prefix=hadoop-spark-poc-display-driver
-Dlog4j.localRollingFile.FileName=spark-poc-display-driver.log
spark.streaming.unpersist true
spark.task.maxFailures 8
spark.executor.logs.rolling.strategy time
40
Configuration - spark-env.sh
HADOOP_CONF_DIR=/etc/hadoop
SPARK_SUBMIT_CLASSPATH="/some/libs/to/put/on/classpath/"
SPARK_LOCAL_DIRS=/tmp/dir
SPARK_WORKER_CORES=8
SPARK_WORKER_MEMORY=3g
SPARK_WORKER_OPTS="-Dlog4j.loghost.Prefix=node-spark-worker
-Dlog4j.localRollingFile.FileName=spark-worker.log"
SPARK_DAEMON_JAVA_OPTS="-Dspark.deploy.recoveryMode=ZOOKEEPER
-Dspark.deploy.zookeeper.url=zookeeper-1:2181,zookeeper-2:2181,zookeeper-3:2181"
41
Spark SQL
DataFrame
A DataFrame is a distributed collection of data organized into
named columns.
DataFrame ≈ RDD[Row] ≈ RDD[String] + schema
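A sketch of that equivalence: an RDD of Rows plus an explicit schema can be turned into a DataFrame by hand (assumes the sc and sqlContext from the next slide).

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}

val rows = sc.parallelize(Seq(Row("Michael", null), Row("Andy", 30L), Row("Justin", 19L)))
val schema = StructType(Seq(
  StructField("name", StringType, nullable = true),
  StructField("age", LongType, nullable = true)))

val df = sqlContext.createDataFrame(rows, schema) // named, typed columns on top of Row data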
43
DataFrame Operations
val sc: SparkContext // An existing SparkContext.
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
// Create the DataFrame
val df = sqlContext.read.json("examples/src/main/resources/people.json")
// Show the content of the DataFrame
df.show()
// age name
// null Michael
// 30 Andy
// 19 Justin
// Print the schema in a tree format
df.printSchema()
// root
// |-- age: long (nullable = true)
// |-- name: string (nullable = true)
// Select only the "name" column
df.select("name").show()
// name
// Michael
// Andy
// Justin
44
DataFrame Operations 2
// Select everybody, but increment the age by 1
df.select(df("name"), df("age") + 1).show()
// name (age + 1)
// Michael null
// Andy 31
// Justin 20
// Select people older than 21
df.filter(df("age") > 21).show()
// age name
// 30 Andy
// Count people by age
df.groupBy("age").count().show()
// age count
// null 1
// 19 1
// 30 1
45
SQL Queries
case class Person(name: String, age: Int)
// Create an RDD of Person objects and register it as a table.
import sqlContext.implicits._ // needed for the .toDF() conversion
val people = sc.textFile("examples/src/main/resources/people.txt")
  .map(_.split(",")).map(p => Person(p(0), p(1).trim.toInt)).toDF()
people.registerTempTable("people")
// SQL statements can be run by using the sql methods provided by sqlContext.
val teenagers =
  sqlContext.sql("SELECT name, age FROM people WHERE age >= 13 AND age <= 19")
val hc = new org.apache.spark.sql.hive.HiveContext(sc)
val negativesQuery = s"""select event
  |from scoring.display_balanced_events lateral view explode(events) e as event
  |where event.label=0""".stripMargin
val negatives = hc.sql(negativesQuery).limit(maxCount)
46
Spark Streaming
Overview
48
DStream
A DStream (discretized stream) is a sequence of RDDs, one RDD per batch interval.
49
Receivers
∙ Directory
∙ Actors
∙ Custom
∙ Kafka
∙ Flume
∙ Kinesis
∙ Twitter
50
Transformations
All are lazy!
map, filter, flatMap, count
updateStateByKey(func), reduceByKey, join
window(windowLength, slideInterval), countByWindow, reduceByWindow
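A sketch of the stateful and windowed variants, built on the (word, 1) pairs DStream from the Example slide below; the window sizes are illustrative, and updateStateByKey additionally requires checkpointing to be enabled.

import org.apache.spark.streaming.Seconds

// running total per word across all batches (needs ssc.checkpoint(...) to be set)
val runningCounts = pairs.updateStateByKey[Int] { (newValues: Seq[Int], state: Option[Int]) =>
  Some(state.getOrElse(0) + newValues.sum)
}

// counts over the last 60 seconds, recomputed every 20 seconds
val windowedCounts = pairs.window(Seconds(60), Seconds(20)).reduceByKey(_ + _)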
51
Outputs
∙ print
∙ saveAsTextFiles, saveAsObjectFiles, saveAsHadoopFiles
∙ foreachRDD
52
Example
> pl.wp.sparkworkshop.spark.streaming.exercise1.SocketWordsCount
val conf = new SparkConf().setAppName("Example")
val ssc = new StreamingContext(conf, Seconds(10))
// Create a DStream that will connect to hostname:port, like localhost:9999
val lines = ssc.socketTextStream("localhost", 9999)
// Split each line into words
val words = lines.flatMap(_.split(" "))
val pairs = words.map(word => (word, 1))
val wordCounts = pairs.reduceByKey(_ + _)
// Print the first ten elements of each RDD generated in this DStream to the console
wordCounts.print()
// Start the computation
ssc.start()
ssc.awaitTermination() // Wait for the computation to terminate
53
ForeachRDD
import org.apache.spark.streaming.dstream.DStream
val dstream: DStream[(String, String)] = ???
// we're at the driver
dstream.foreachRDD(rdd =>
  // still at the driver
  rdd.foreachPartition(partition =>
    // now we're at the worker
    // anything has to be serializable or static to get here
    partition.foreach(elem =>
      // still at the worker
      println(elem)
    )
  )
)
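A common way to use this is to open one connection per partition on the worker, instead of one per element or one at the driver; createConnection and send are hypothetical helpers, not Spark API.

dstream.foreachRDD { rdd =>
  rdd.foreachPartition { partition =>
    val connection = createConnection() // hypothetical: opened on the worker, once per partition
    partition.foreach(record => send(connection, record)) // hypothetical sink call
    connection.close()
  }
}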
54
Checkpoints
∙ Metadata checkpointing
∙ Configuration
∙ DStream operations
∙ Incomplete batches
∙ Data checkpointing - Saving of the generated RDDs to
reliable storage. In stateful transformations, the generated
RDDs depend on RDDs of previous batches, which causes
the length of the dependency chain to keep increasing with
time.
55
Checkpoints - example
def createStreamingContext(): StreamingContext = {
  val ssc = new StreamingContext(...) // new context
  ssc.checkpoint(checkpointDirectory) // set checkpoint directory
  val lines = ssc.socketTextStream(...) // create DStreams
  lines.checkpoint(Seconds(120))
  ...
  ssc
}
// Get StreamingContext from checkpoint data or create a new one
val context = StreamingContext.getOrCreate(checkpointDirectory,
  createStreamingContext _)
// Start the context
context.start()
context.awaitTermination()
56
>
pl.wp.sparkworkshop.spark.streaming.exercise2
StreamLettersCount
57
Tuning
∙ Reducing the processing time of each batch of data by
efficiently using cluster resources.
∙ Level of Parallelism in Data Receiving
∙ Level of Parallelism in Data Processing
∙ Data Serialization
∙ Setting the right batch size such that the batches of data can
be processed as fast as they are received (that is, data
processing keeps up with the data ingestion).
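A configuration sketch touching each of these knobs; the concrete values and the socket source are illustrative assumptions, not recommendations.

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

val conf = new SparkConf()
  .setAppName("TunedStreamingApp")
  .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") // data serialization
  .set("spark.default.parallelism", "32")                                // processing parallelism

// batch size: large enough that processing keeps up with ingestion
val ssc = new StreamingContext(conf, Seconds(5))

// receiving parallelism: several receivers, unioned into a single stream
val streams = (1 to 3).map(_ => ssc.socketTextStream("localhost", 9999))
val merged = ssc.union(streams)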
58
Further reading
∙ Programming guides (core, SQL, streaming)
∙ Integration guides (Kafka, Flume, etc.)
∙ API docs
∙ Mailing list
59
Questions?
60
