forked from szilard/benchm-ml
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path5xb-spark-trainpred--sp20.txt
28 lines (16 loc) · 1006 Bytes
/
5xb-spark-trainpred--sp20.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
spark-2.0.0-bin-hadoop2.7/bin/spark-shell --driver-memory 20G --executor-memory 20G
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.RandomForestClassifier
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
// Read the train and test sets from Parquet and mark both for caching.
val trainData = spark.read.parquet("spark1hot-train-1m.parquet").cache()
val testData = spark.read.parquet("spark1hot-test-1m.parquet").cache()
// Count both sets before training (presumably so the cached data is
// materialized before the timed section starts -- confirm if timing matters).
(trainData.count(), testData.count())
// Random forest settings: 100 trees, max depth 20, 100 bins, sqrt feature
// subsetting, entropy impurity. Trailing dots keep the chain paste-safe in the REPL.
val forest = new RandomForestClassifier().
  setLabelCol("label").
  setFeaturesCol("features").
  setNumTrees(100).
  setMaxDepth(20).
  setMaxBins(100).
  setFeatureSubsetStrategy("sqrt").
  setImpurity("entropy")
// Single-stage pipeline wrapping the forest.
val trainingPipeline = new Pipeline().setStages(Array(forest))
// Time only the fit call; the nanosecond difference is converted to seconds.
val startNanos = System.nanoTime()
val fitted = trainingPipeline.fit(trainData)
val fitSeconds = (System.nanoTime() - startNanos) / 1e9
fitSeconds
// Score the test set and report area under the ROC curve, reading the
// "probability" column as the evaluator's raw-prediction input.
val scored = fitted.transform(testData)
val aucEvaluator = new BinaryClassificationEvaluator().
  setLabelCol("label").
  setRawPredictionCol("probability").
  setMetricName("areaUnderROC")
aucEvaluator.evaluate(scored)