Skip to content

Commit 58cb9e1

Browse files
authored
Fix miscs (#35)
* fix miscs
* small default changes
1 parent 885cc85 commit 58cb9e1

File tree

12 files changed

+42
-141
lines changed

12 files changed

+42
-141
lines changed

build.sbt

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,8 @@ lazy val docs = project
5151
unidocProjectFilter in (ScalaUnidoc, unidoc) := inProjects(core),
5252
target in (ScalaUnidoc, unidoc) := (baseDirectory in LocalRootProject).value / "website" / "static" / "api",
5353
cleanFiles += (target in (ScalaUnidoc, unidoc)).value,
54-
docusaurusCreateSite := docusaurusCreateSite.dependsOn(unidoc in Compile).value
54+
docusaurusCreateSite := docusaurusCreateSite.dependsOn(unidoc in Compile).value,
55+
docusaurusPublishGhpages := docusaurusPublishGhpages.dependsOn(unidoc in Compile).value
5556
)
56-
.dependsOn(core)
5757
.enablePlugins(MdocPlugin, DocusaurusPlugin, ScalaUnidocPlugin)
58+
.dependsOn(core)

core/src/main/scala/org/apache/spark/ml/boosting/BoostingParams.scala

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -43,11 +43,11 @@ private[ml] trait BoostingParams
4343
with HasTol
4444
with HasNumRound {
4545

46-
setDefault(numRound -> 5)
46+
setDefault(numRound -> 2)
4747
setDefault(numBaseLearners -> 10)
4848
setDefault(tol -> 1e-6)
4949

50-
def evaluateOnValidation(
50+
protected def evaluateOnValidation(
5151
weights: Array[Double],
5252
boosters: Array[EnsemblePredictionModelType],
5353
labelColName: String,
@@ -66,7 +66,7 @@ private[ml] trait BoostingParams
6666
}
6767
}
6868

69-
def evaluateOnValidation(
69+
protected def evaluateOnValidation(
7070
numClasses: Int,
7171
weights: Array[Double],
7272
boosters: Array[EnsemblePredictionModelType],
@@ -87,7 +87,7 @@ private[ml] trait BoostingParams
8787
}
8888
}
8989

90-
def probabilize(
90+
protected def probabilize(
9191
boostWeightColName: String,
9292
boostProbaColName: String,
9393
poissonProbaColName: String)(df: DataFrame): DataFrame = {
@@ -98,7 +98,7 @@ private[ml] trait BoostingParams
9898
.withColumn(poissonProbaColName, col(boostProbaColName) * numLines)
9999
}
100100

101-
def updateWeights(
101+
protected def updateWeights(
102102
boostWeightColName: String,
103103
lossColName: String,
104104
beta: Double,
@@ -108,25 +108,26 @@ private[ml] trait BoostingParams
108108
col(boostWeightColName) * pow(lit(beta), lit(1) - col(lossColName)))
109109
}
110110

111-
def avgLoss(lossColName: String, boostProbaColName: String)(df: DataFrame): Double = {
111+
protected def avgLoss(lossColName: String, boostProbaColName: String)(df: DataFrame): Double = {
112112
df.agg(sum(col(lossColName) * col(boostProbaColName)))
113113
.first()
114114
.getDouble(0)
115115
}
116116

117-
def beta(avgl: Double, numClasses: Int = 2): Double = {
117+
protected def beta(avgl: Double, numClasses: Int = 2): Double = {
118118
avgl / ((1 - avgl) * (numClasses - 1))
119119
}
120120

121-
def weight(beta: Double): Double = {
121+
protected def weight(beta: Double): Double = {
122122
if (beta == 0.0) {
123123
1.0
124124
} else {
125125
math.log(1 / beta)
126126
}
127127
}
128128

129-
def extractBoostedBag(poissonProbaColName: String, seed: Long)(df: DataFrame): DataFrame = {
129+
protected def extractBoostedBag(poissonProbaColName: String, seed: Long)(
130+
df: DataFrame): DataFrame = {
130131

131132
val poissonProbaColIndex = df.schema.fieldIndex(poissonProbaColName)
132133

@@ -149,7 +150,7 @@ private[ml] trait BoostingParams
149150

150151
}
151152

152-
def terminateVal(
153+
protected def terminateVal(
153154
withValidation: Boolean,
154155
error: Double,
155156
verror: Double,
@@ -178,7 +179,7 @@ private[ml] trait BoostingParams
178179

179180
}
180181

181-
def terminate(
182+
protected def terminate(
182183
avgl: Double,
183184
withValidation: Boolean,
184185
error: Double,

core/src/main/scala/org/apache/spark/ml/boosting/GBMParams.scala

Lines changed: 8 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ private[ml] trait GBMParams
5757
with HasSubBag {
5858

5959
setDefault(learningRate -> 0.1)
60-
setDefault(numBaseLearners -> 100)
60+
setDefault(numBaseLearners -> 10)
6161
setDefault(tol -> 1e-6)
6262
setDefault(maxIter -> 100)
6363

@@ -77,7 +77,7 @@ private[ml] trait GBMParams
7777

7878
setDefault(optimizedWeights -> false)
7979

80-
def findOptimizedWeight(
80+
protected def findOptimizedWeight(
8181
labelColName: String,
8282
currentPredictionColName: String,
8383
boosterPredictionColName: String,
@@ -126,7 +126,7 @@ private[ml] trait GBMParams
126126
optimized(0)
127127
}
128128

129-
def findOptimizedWeight(
129+
protected def findOptimizedWeight(
130130
labelColName: String,
131131
currentPredictionColName: String,
132132
boosterPredictionColName: String,
@@ -199,57 +199,7 @@ private[ml] trait GBMParams
199199

200200
}
201201

202-
// def findOptimizedConst(
203-
// labelColName: String,
204-
// loss: (Array[Double], Array[Double]) => Double,
205-
// grad: (Array[Double], Array[Double]) => Array[Double],
206-
// numClasses: Int,
207-
// maxIter: Int,
208-
// tol: Double)(df: DataFrame): Array[Double] = {
209-
210-
// val transformed = df
211-
// .select(col(labelColName))
212-
// .cache()
213-
214-
// val cdf = new CachedDiffFunction[BreezeDV[Double]](new DiffFunction[BreezeDV[Double]] {
215-
// override def calculate(denseVector: BreezeDV[Double]): (Double, BreezeDV[Double]) = {
216-
// val x = denseVector.toArray
217-
// val df = transformed
218-
// val l = loss
219-
// val ludf =
220-
// udf[Double, Array[Double]]((label: Array[Double]) => l(label, x))
221-
// val g = grad
222-
// val gudf =
223-
// udf[Array[Double], Array[Double]]((label: Array[Double]) => g(label, x))
224-
// val lcn = labelColName
225-
// var agg = Seq.empty[Column]
226-
// var k = 0
227-
// while (k < numClasses) {
228-
// agg = agg :+ sum(element_at(gudf(col(lcn)), k + 1))
229-
// k += 1
230-
// }
231-
// val res = df.agg(sum(ludf(col(lcn))), agg: _*).first()
232-
// (
233-
// res.getDouble(0),
234-
// BreezeDV[Double](Array.range(0, numClasses).map(k => res.getDouble(k + 1))))
235-
236-
// }
237-
// })
238-
239-
// val lbfgsb =
240-
// new BreezeLBFGSB(
241-
// BreezeDV.fill(numClasses)(Double.NegativeInfinity),
242-
// BreezeDV.fill(numClasses)(Double.PositiveInfinity),
243-
// maxIter = maxIter,
244-
// tolerance = tol,
245-
// m = 10)
246-
// val optimized =
247-
// lbfgsb.minimize(cdf, BreezeDV.zeros(numClasses))
248-
249-
// optimized.toArray
250-
// }
251-
252-
def findOptimizedConst(
202+
protected def findOptimizedConst(
253203
labelColName: String,
254204
loss: (Double, Double) => Double,
255205
grad: (Double, Double) => Double,
@@ -289,7 +239,7 @@ private[ml] trait GBMParams
289239
optimized(0)
290240
}
291241

292-
def evaluateOnValidation(
242+
protected def evaluateOnValidation(
293243
model: GBMRegressionModel,
294244
labelColName: String,
295245
loss: (Double, Double) => Double)(df: DataFrame): Double = {
@@ -305,7 +255,7 @@ private[ml] trait GBMParams
305255
}
306256
}
307257

308-
def evaluateOnValidation(
258+
protected def evaluateOnValidation(
309259
model: GBMClassificationModel,
310260
labelColName: String,
311261
loss: (Vector, Vector) => Double)(df: DataFrame): Double = {
@@ -323,7 +273,7 @@ private[ml] trait GBMParams
323273
}
324274
}
325275

326-
def terminate(
276+
protected def terminate(
327277
weights: Array[Double],
328278
learningRate: Double,
329279
withValidation: Boolean,
@@ -343,7 +293,7 @@ private[ml] trait GBMParams
343293
}
344294
}
345295

346-
def terminate(
296+
protected def terminate(
347297
weight: Double,
348298
learningRate: Double,
349299
withValidation: Boolean,

core/src/main/scala/org/apache/spark/ml/classification/BaggingClassifier.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -164,7 +164,7 @@ class BaggingClassifier(override val uid: String)
164164
val bagged = df.transform(
165165
withBag(getReplacement, getSampleRatio, getNumBaseLearners, getSeed, bagColName))
166166

167-
val numFeatures = getNumFeatures(df, getFeaturesCol)
167+
val numFeatures = MetadataUtils.getNumFeatures(df, getFeaturesCol)
168168

169169
val futureModels = Array
170170
.range(0, getNumBaseLearners)

core/src/main/scala/org/apache/spark/ml/classification/GBMClassifier.scala

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ private[ml] trait GBMClassifierParams
9595

9696
setDefault(instanceTrimmingRatio -> 1.0)
9797

98-
def trim(instanceTrimmingRatio: Double, negGradColName: String, tol: Double)(
98+
protected def trim(instanceTrimmingRatio: Double, negGradColName: String, tol: Double)(
9999
df: DataFrame): DataFrame = {
100100
val instanceWeightColName = "gbm$instance-weight" + UUID.randomUUID().toString
101101
val instanced = df
@@ -295,7 +295,7 @@ class GBMClassifier(override val uid: String)
295295
val bagged = train.transform(
296296
withBag(getReplacement, getSampleRatio, getNumBaseLearners, getSeed, bagColName))
297297

298-
val numFeatures = getNumFeatures(train, getFeaturesCol)
298+
val numFeatures = MetadataUtils.getNumFeatures(train, getFeaturesCol)
299299

300300
val numClasses = getNumClasses(train, maxNumClasses = numFeatures)
301301
instr.logNumClasses(numClasses)

core/src/main/scala/org/apache/spark/ml/ensemble/HasSubBag.scala

Lines changed: 9 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,11 @@ import org.apache.spark.ml.feature.VectorSlicer
2323
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}
2424
import org.apache.spark.ml.param._
2525
import org.apache.spark.ml.param.shared.HasSeed
26-
import org.apache.spark.ml.util.BaggingMetadataUtils
2726
import org.apache.spark.sql.bfunctions._
2827
import org.apache.spark.sql.functions._
2928
import org.apache.spark.sql.{DataFrame, Row}
3029
import org.apache.spark.util.random.XORShiftRandom
30+
import org.apache.spark.ml.util.MetadataUtils
3131

3232
private[ml] trait HasSubBag extends Params with HasSeed {
3333

@@ -78,7 +78,7 @@ private[ml] trait HasSubBag extends Params with HasSeed {
7878

7979
setDefault(subspaceRatio -> 1)
8080

81-
def withBag(
81+
protected def withBag(
8282
withReplacement: Boolean,
8383
sampleRatio: Double,
8484
numberSamples: Int,
@@ -87,7 +87,7 @@ private[ml] trait HasSubBag extends Params with HasSeed {
8787
df.withColumn(bagColName, bag(withReplacement, sampleRatio, numberSamples, seed))
8888
}
8989

90-
def mkSubspace(sampleRatio: Double, numFeatures: Int, seed: Long): SubSpace = {
90+
protected def mkSubspace(sampleRatio: Double, numFeatures: Int, seed: Long): SubSpace = {
9191

9292
val range = Array.range(0, numFeatures)
9393

@@ -105,8 +105,11 @@ private[ml] trait HasSubBag extends Params with HasSeed {
105105

106106
}
107107

108-
def extractSubBag(bagColName: String, index: Int, featuresColName: String, subspace: SubSpace)(
109-
df: DataFrame): DataFrame = {
108+
protected def extractSubBag(
109+
bagColName: String,
110+
index: Int,
111+
featuresColName: String,
112+
subspace: SubSpace)(df: DataFrame): DataFrame = {
110113

111114
val tmpColName = "bag$tmp" + UUID.randomUUID().toString
112115
val replicated = df
@@ -125,27 +128,11 @@ private[ml] trait HasSubBag extends Params with HasSeed {
125128

126129
}
127130

128-
def slicer(subspace: SubSpace): Vector => Vector = {
131+
protected def slicer(subspace: SubSpace): Vector => Vector = {
129132
case features: DenseVector => Vectors.dense(subspace.map(features.apply))
130133
case features: SparseVector => features.slice(subspace)
131134
}
132135

133-
def getNumFeatures(dataset: DataFrame, featuresCol: String): Int = {
134-
BaggingMetadataUtils.getNumFeatures(dataset.schema(featuresCol)) match {
135-
case Some(n: Int) => n
136-
case None =>
137-
// Get number of classes from dataset itself.
138-
val numFeaturesUDF = udf((features: Vector) => features.size)
139-
val sizeFeaturesCol: Array[Row] = dataset.select(numFeaturesUDF(col(featuresCol))).take(1)
140-
if (sizeFeaturesCol.isEmpty || sizeFeaturesCol(0).get(0) == null) {
141-
throw new SparkException("ML algorithm was given empty dataset.")
142-
}
143-
val sizeArrayFeatures: Int = sizeFeaturesCol.head.getInt(0)
144-
val numFeatures = sizeArrayFeatures.toInt
145-
numFeatures
146-
}
147-
}
148-
149136
}
150137

151138
private[ml] object HasSubBag {

core/src/main/scala/org/apache/spark/ml/ensemble/ensembleParams.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ private[ml] trait HasBaseLearner extends Params {
9696
/** @group getParam */
9797
def getBaseLearner: EnsemblePredictorType = $(baseLearner)
9898

99-
def fitBaseLearner(
99+
protected def fitBaseLearner(
100100
baseLearner: EnsemblePredictorType,
101101
labelColName: String,
102102
featuresColName: String,

core/src/main/scala/org/apache/spark/ml/regression/BaggingRegressor.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -164,7 +164,7 @@ class BaggingRegressor(override val uid: String)
164164
val bagged = df.transform(
165165
withBag(getReplacement, getSampleRatio, getNumBaseLearners, getSeed, bagColName))
166166

167-
val numFeatures = getNumFeatures(df, getFeaturesCol)
167+
val numFeatures = MetadataUtils.getNumFeatures(df, getFeaturesCol)
168168

169169
val futureModels = Array
170170
.range(0, getNumBaseLearners)

core/src/main/scala/org/apache/spark/ml/regression/GBMRegressor.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -250,7 +250,7 @@ class GBMRegressor(override val uid: String)
250250
val bagged = train.transform(
251251
withBag(getReplacement, getSampleRatio, getNumBaseLearners, getSeed, bagColName))
252252

253-
val numFeatures = getNumFeatures(train, getFeaturesCol)
253+
val numFeatures = MetadataUtils.getNumFeatures(train, getFeaturesCol)
254254

255255
@tailrec
256256
def trainBoosters(

core/src/main/scala/org/apache/spark/ml/util/BaggingMetadataUtils.scala

Lines changed: 0 additions & 39 deletions
This file was deleted.

0 commit comments

Comments (0)