scala, apache-spark, machine-learning, naivebayes, apache-spark-ml

How to get the probability vector from spark.ml NaiveBayes instead of the 0/1 class in Spark?


I'm working on a NaiveBayes classifier. I can predict the class for a single data point with the trained model, but I also want the probability value.

The data is classified into two classes only, and the predict function returns 0 or 1.

import org.apache.log4j.{Level, Logger}
import org.apache.spark.ml.classification.{NaiveBayes, NaiveBayesModel}
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.SparkSession

object Test {
  def main(args: Array[String]): Unit = {
    Logger.getLogger("org").setLevel(Level.OFF)
    Logger.getLogger("akka").setLevel(Level.OFF)
    val spark = SparkSession.builder.appName("Test").master("local[4]").getOrCreate
    val dataset = spark.read.option("inferSchema", "true").csv("data/labelled.csv").toDF()

    import spark.sqlContext.implicits._
    val output = dataset.map(row => {
      LabeledPoint(row.getInt(2), Vectors.dense( row.getInt(0) , row.getInt(1)))
    })
    val Array(training, test) =  output.randomSplit(Array(0.7, 0.3),seed = 11L)
    training.cache()

    val model : NaiveBayesModel = new NaiveBayes().fit(training)
    val speed = 110
    val hour  = 11
    val label1 : Double =  model.predict(Vectors.dense(speed,hour))
    // UPDATE: try to get the probability vector
    val label = model.predictProbability(Vectors.dense(speed,hour)) // does not compile, raises error [1]
  }
}

[1] The error raised when calling model.predictProbability:

Error:(24, 23) method predictProbability in class ProbabilisticClassificationModel cannot be accessed in org.apache.spark.ml.classification.NaiveBayesModel Access to protected method predictProbability not permitted because enclosing object Test is not a subclass of class ProbabilisticClassificationModel in package classification where target is defined val label = model.predictProbability(Vectors.dense(speed,hour))
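
As I understand it, the error means that predictProbability is declared protected in ProbabilisticClassificationModel, so it can only be called from inside that class or one of its subclasses, not from an unrelated object such as Test. Below is a minimal, non-Spark sketch of that Scala visibility rule; Classifier, MyClassifier and Demo are made-up names used only for illustration.

// Made-up classes illustrating Scala's protected-member rule (not Spark classes)
class Classifier {
  // protected: callable only from Classifier itself or from a subclass
  protected def predictProbability(x: Double): Double = x / 100.0
}

class MyClassifier extends Classifier {
  // OK: MyClassifier is a subclass of Classifier
  def probability(x: Double): Double = predictProbability(x)
}

object Demo {
  val model = new Classifier
  // model.predictProbability(110.0)             // does not compile: protected member
  val p = new MyClassifier().probability(110.0)  // compiles: goes through the subclass
}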


Solution

  • After a lot of research I couldn't find this feature exposed in the spark.ml library, but I was able to do it with spark.mllib, and the code should be modified as follows:

    import org.apache.log4j.{Level, Logger}
    // Import NaiveBayes and NaiveBayesModel from spark.mllib
    import org.apache.spark.mllib.classification.{NaiveBayes, NaiveBayesModel}
    // Import LabeledPoint and Vectors from spark.mllib to build the dataset
    import org.apache.spark.mllib.regression.LabeledPoint
    import org.apache.spark.mllib.linalg.Vectors
    import org.apache.spark.sql.SparkSession
    
    object Test {
      def main(args: Array[String]): Unit = {
        Logger.getLogger("org").setLevel(Level.OFF)
        Logger.getLogger("akka").setLevel(Level.OFF)
        val spark = SparkSession.builder.appName("Test").master("local[4]").getOrCreate
        val dataset = spark.read.option("inferSchema","true").csv("data/labelled.csv").toDF()
    
        import spark.sqlContext.implicits._
        // Use mllib.regression.LabeledPoint and mllib.linalg.Vectors, then convert the DataFrame to a JavaRDD
        val output = dataset.map(row => {
          LabeledPoint(row.getInt(2), Vectors.dense(row.getInt(0), row.getInt(1)))
        }).toJavaRDD

        val Array(training, test) = output.randomSplit(Array(0.7, 0.3), seed = 11L)
        training.cache()
        // Use run instead of the fit method
        val model: NaiveBayesModel = new NaiveBayes().run(training)
        val speed = 110
        val hour  = 11
        // Returns the predicted class (0.0 or 1.0)
        val label1: Double = model.predict(Vectors.dense(speed, hour))
        // Returns the probability vector (one probability per class)
        val testLabel = model.predictProbabilities(Vectors.dense(speed, hour))
      }
    }
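
  • To inspect the result, here is a small follow-up sketch, assumed to be placed inside main right after the code above. As far as I know, predictProbabilities returns one posterior probability per class in the same order as model.labels, but treat that ordering as an assumption and verify it on your own data:

    // Illustrative follow-up (hypothetical addition, not in the original answer):
    // pair each class label with its probability and print it
    model.labels.zip(testLabel.toArray).foreach { case (classLabel, p) =>
      println(s"P(class = $classLabel) = $p")
    }
    // The predicted class is the one with the highest probability,
    // i.e. label1 should equal model.labels(testLabel.argmax)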