-
Notifications
You must be signed in to change notification settings - Fork 286
Expand file tree
/
Copy pathBasic.scala
More file actions
53 lines (38 loc) · 1.38 KB
/
Basic.scala
File metadata and controls
53 lines (38 loc) · 1.38 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
package dataset
import org.apache.spark.sql.SparkSession
//
// Create Datasets of primitive type and tuple type and show simple operations.
//
object Basic {
  /**
   * Demonstrates basic Dataset operations: building Datasets from Scala
   * collections of primitives and tuples, inspecting columns/types/schema,
   * and filtering/selecting with column expressions.
   */
  def main(args: Array[String]): Unit = {
    val spark =
      SparkSession.builder()
        .appName("Dataset-Basic")
        .master("local[4]")
        .getOrCreate()

    // Needed for toDS() and the $"col" column-reference syntax below.
    import spark.implicits._

    // Create a tiny Dataset of integers.
    val s = Seq(10, 11, 12, 13, 14, 15)
    val ds = s.toDS()

    println("*** only one column, and it always has the same name")
    ds.columns.foreach(println(_))

    println("*** column types")
    ds.dtypes.foreach(println(_))

    println("*** schema as if it was a DataFrame")
    ds.printSchema()

    println("*** values > 12")
    ds.where($"value" > 12).show()

    // This seems to be the best way to get a range that's actually a Seq and
    // thus easy to convert to a Dataset, rather than a Range, which isn't.
    val s2 = Seq.range(1, 100)
    println("*** size of the range")
    println(s2.size)

    val tuples = Seq((1, "one", "un"), (2, "two", "deux"), (3, "three", "trois"))
    val tupleDS = tuples.toDS()

    println("*** Tuple Dataset types")
    tupleDS.dtypes.foreach(println(_))

    // The tuple columns have unfriendly names (_1, _2, ...), but you can
    // still use them to query.
    println("*** filter by one column and fetch another")
    tupleDS.where($"_1" > 2).select($"_2", $"_3").show()

    // Release the local Spark context's resources before exiting.
    spark.stop()
  }
}