Scala RandomForestClassifier类代码示例

OStack程序员社区-中国程序员成长平台 › 门户 › 编程› Scala›Scala教程

原作者: [db:作者] 来自: [db:来源] 收藏邀请

本文整理汇总了Scala中org.apache.spark.ml.classification.RandomForestClassifier类的典型用法代码示例。如果您正苦于以下问题：Scala RandomForestClassifier类的具体用法？Scala RandomForestClassifier怎么用？Scala RandomForestClassifier使用的例子？那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。

在下文中一共展示了RandomForestClassifier类的4个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于我们的系统推荐出更棒的Scala代码示例。

示例1: RandomForestClassification

//设置package包名称以及导入依赖的类
package com.databricks.spark.sql.perf.mllib.classification

import org.apache.spark.ml.Estimator
import org.apache.spark.ml.classification.RandomForestClassifier

import com.databricks.spark.sql.perf.mllib._
import com.databricks.spark.sql.perf.mllib.OptionImplicits._


object RandomForestClassification extends TreeOrForestClassification {

  override def getEstimator(ctx: MLBenchContext): Estimator[_] = {
    import ctx.params._
    // TODO: subsamplingRate, featureSubsetStrategy
    // TODO: cacheNodeIds, checkpoint?
    new RandomForestClassifier()
      .setMaxDepth(depth)
      .setNumTrees(maxIter)
      .setSeed(ctx.seed())
  }
}

开发者ID:summerDG，项目名称:spark-sql-perf，代码行数:22，代码来源:RandomForestClassification.scala

示例2: RandomForestPipeline

//设置package包名称以及导入依赖的类
package org.stumbleuponclassifier

import org.apache.log4j.Logger
import org.apache.spark.ml.classification.RandomForestClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.sql.DataFrame

import scala.collection.mutable


object RandomForestPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def randomForestPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)

    // Set up Pipeline
    val stages = new mutable.ArrayBuffer[PipelineStage]()

    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
    stages += labelIndexer

    val rf = new RandomForestClassifier()
      .setFeaturesCol(vectorAssembler.getOutputCol)
      .setLabelCol("indexedLabel")
      .setNumTrees(20)
      .setMaxDepth(5)
      .setMaxBins(32)
      .setMinInstancesPerNode(1)
      .setMinInfoGain(0.0)
      .setCacheNodeIds(false)
      .setCheckpointInterval(10)

    stages += vectorAssembler
    stages += rf
    val pipeline = new Pipeline().setStages(stages.toArray)

    // Fit the Pipeline
    val startTime = System.nanoTime()
    //val model = pipeline.fit(training)
    val model = pipeline.fit(dataFrame)
    val elapsedTime = (System.nanoTime() - startTime) / 1e9
    println(s"Training time: $elapsedTime seconds")

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction","label")

    // Select (prediction, true label) and compute test error
    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
    val mAccuracy = evaluator.evaluate(holdout)
    println("Test set accuracy = " + mAccuracy)

  }
}

开发者ID:PacktPublishing，项目名称:Machine-Learning-with-Spark-Second-Edition，代码行数:62，代码来源:RandomForestPipeline.scala

示例3: RandomForestPipeline

//设置package包名称以及导入依赖的类
package org.sparksamples.classification.stumbleupon

import org.apache.log4j.Logger
import org.apache.spark.ml.classification.RandomForestClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.sql.DataFrame

import scala.collection.mutable


object RandomForestPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def randomForestPipeline(vectorAssembler: VectorAssembler, dataFrame: DataFrame) = {
    val Array(training, test) = dataFrame.randomSplit(Array(0.9, 0.1), seed = 12345)

    // Set up Pipeline
    val stages = new mutable.ArrayBuffer[PipelineStage]()

    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
    stages += labelIndexer

    val rf = new RandomForestClassifier()
      .setFeaturesCol(vectorAssembler.getOutputCol)
      .setLabelCol("indexedLabel")
      .setNumTrees(20)
      .setMaxDepth(5)
      .setMaxBins(32)
      .setMinInstancesPerNode(1)
      .setMinInfoGain(0.0)
      .setCacheNodeIds(false)
      .setCheckpointInterval(10)

    stages += vectorAssembler
    stages += rf
    val pipeline = new Pipeline().setStages(stages.toArray)

    // Fit the Pipeline
    val startTime = System.nanoTime()
    //val model = pipeline.fit(training)
    val model = pipeline.fit(dataFrame)
    val elapsedTime = (System.nanoTime() - startTime) / 1e9
    println(s"Training time: $elapsedTime seconds")

    //val holdout = model.transform(test).select("prediction","label")
    val holdout = model.transform(dataFrame).select("prediction","label")

    // Select (prediction, true label) and compute test error
    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
    val mAccuracy = evaluator.evaluate(holdout)
    println("Test set accuracy = " + mAccuracy)

  }
}

开发者ID:PacktPublishing，项目名称:Machine-Learning-with-Spark-Second-Edition，代码行数:62，代码来源:RandomForestPipeline.scala

示例4: Test

//设置package包名称以及导入依赖的类
package org.apache.spark.test

import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf
import org.apache.spark.ml.feature.StringIndexer

object Test {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("Simple Application")
    val sc = new SparkContext(conf)
      
    val sqlContext = new org.apache.spark.sql.SQLContext(sc)

    //KMEANS
    val npart = 216
    
    def time[A](a: => A) = {
    	val now = System.nanoTime
    	val result = a
    	val sec = (System.nanoTime - now) * 1e-9
    	println("Total time (secs): " + sec)
    	result
    }
    
    val file = "hdfs://hadoop-master:8020/user/spark/datasets/higgs/HIGGS.csv"
    val df = sqlContext.read.format("com.databricks.spark.csv").option("header", "false")
      .option("inferSchema", "true").load(file).repartition(npart)
    
    
    import org.apache.spark.ml.feature.VectorAssembler 
    val featureAssembler = new VectorAssembler().setInputCols(df.columns.drop(1)).setOutputCol("features")
    val processedDf = featureAssembler.transform(df).cache()
    
    print("Num. elements: " + processedDf.count)
    
    // Trains a k-means model.
    import org.apache.spark.ml.clustering.KMeans
    val kmeans = new KMeans().setSeed(1L)
    val cmodel = time(kmeans.fit(processedDf.select("features")))    
    
    //RANDOM FOREST
    import org.apache.spark.ml.classification.RandomForestClassifier
    val labelCol = df.columns.head
    
    val indexer = new StringIndexer().setInputCol(labelCol).setOutputCol("labelIndexed")
    val imodel = indexer.fit(processedDf)
    val indexedDF = imodel.transform(processedDf)
    
    val rf = new RandomForestClassifier().setFeaturesCol("features").setLabelCol("labelIndexed")
    val model = time(rf.fit(indexedDF))
  }
}

开发者ID:sramirez，项目名称:scalabilityTestSpark，代码行数:54，代码来源:Test.scala

注：本文中的org.apache.spark.ml.classification.RandomForestClassifier类示例整理自Github/MSDocs等源码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。