Scala KMeans类代码示例

OStack程序员社区-中国程序员成长平台 › 门户 › 编程 › Scala › Scala教程

原作者: [db:作者] 来自: [db:来源] 收藏邀请

本文整理汇总了Scala中org.apache.spark.mllib.clustering.KMeans类的典型用法代码示例。如果您正苦于以下问题：Scala KMeans类的具体用法？Scala KMeans怎么用？Scala KMeans使用的例子？那么恭喜您，这里精选的类代码示例或许可以为您提供帮助。

在下文中一共展示了KMeans类的7个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于我们的系统推荐出更棒的Scala代码示例。

示例1: KmeansExample

//设置package包名称以及导入依赖的类
import org.apache.spark.mllib.clustering.{KMeans, KMeansModel}
import org.apache.spark.mllib.linalg.Vectors

import scala.util.Try



/** Clusters tab-separated vibration readings with MLlib k-means, prints the clustering cost,
  * round-trips the model through disk and shows predictions for the held-out split.
  *
  * `sc` and `spark` are not defined here; they are expected to come from `ApplicationContext._`.
  */
object KmeansExample {

  import ApplicationContext._

  def main(args: Array[String]): Unit = {
    val modelPath = "target/org/apache/spark/KMeansExample/KMeansModel"

    try {
      val vibrationText = sc.textFile("src/main/resources/PamarcoVibration.txt")

      // Columns 1-3 of each row become the feature vector. Rows that are too short or
      // non-numeric are dropped: a failed Try yields None, which flatMap discards.
      // The parsed vectors are persisted because they feed both splits and the cost computation.
      val vibrationVector = vibrationText
        .map(_.split("\t"))
        .flatMap { row =>
          Try(Vectors.dense(row(1).toDouble, row(2).toDouble, row(3).toDouble)).toOption
        }
        .persist()

      // 60% of the vectors train the model, the remaining 40% are only predicted on.
      val Array(trainRDD, testRDD) = vibrationVector.randomSplit(Array(0.6, 0.4))

      // Cluster the data into two classes using KMeans
      val numClusters = 2
      val numIterations = 20
      val clusters = KMeans.train(trainRDD, numClusters, numIterations)

      // Evaluate clustering by computing Within Set Sum of Squared Errors (over all vectors,
      // not only the training split).
      val WSSSE = clusters.computeCost(vibrationVector)
      println("Within Set Sum of Squared Errors = " + WSSSE)

      // Save and load model
      clusters.save(sc, modelPath)
      val sameModel = KMeansModel.load(sc, modelPath)

      import spark.implicits._
      val predictions = sameModel.predict(testRDD)
      predictions.toDF().show()
    } finally {
      // Release the context even when parsing, training or model I/O fails.
      sc.stop()
    }
  }
}

开发者ID:shiv4nsh，项目名称:spark-kmeans-timeseries-data-example，代码行数:46，代码来源:KmeansExample.scala

示例2: KMeansCases

//设置package包名称以及导入依赖的类
import org.apache.spark.SparkContext
import org.apache.spark.mllib.clustering.{KMeans, KMeansModel}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.log4j.{Level, Logger}

/** Runs MLlib k-means over one input file with three different initialisation strategies,
  * printing how long each took and how many points landed in each cluster.
  *
  * @param sc            Spark context used to read `dataFile`
  * @param dataFile      path of the input; every line is parsed as tab-separated doubles
  * @param numOfCenters  number of clusters (k)
  * @param maxIterations iteration cap handed to k-means
  */
class KMeansCases(sc: SparkContext, dataFile: String, numOfCenters: Int, maxIterations:Int) {
  //hide logger from console
  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  val data = sc.textFile(dataFile)
  val parsedData = data.map(s => Vectors.dense(s.split('\t').map(_.toDouble))).cache()

  /** Seconds elapsed since `startNanos` (a `System.nanoTime()` reading). */
  private def secondsSince(startNanos: Long): Double = (System.nanoTime() - startNanos) / 1e9

  /** Prints one line per cluster with the number of points assigned to it.
    *
    * The counts are brought back to the driver first: printing inside `RDD.foreach` would
    * write to the executors' stdout. Lines are emitted in ascending cluster-id order.
    */
  private def printClusterSizes(model: KMeansModel): Unit =
    model.predict(parsedData)
      .countByValue()
      .toSeq
      .sortBy { case (clusterId, _) => clusterId }
      .foreach { case (_, size) => println(size) }

  /** Trains on `parsedData` with the given MLlib initialisation mode.
    *
    * Uses the builder API rather than the `KMeans.train` overload that takes a `runs`
    * argument, so no run count is passed at all.
    */
  private def trainWith(initializationMode: String): KMeansModel =
    new KMeans()
      .setK(numOfCenters)
      .setMaxIterations(maxIterations)
      .setInitializationMode(initializationMode)
      .run(parsedData)

  /** Seeds k-means with the centers returned by `KMeansInitialization` and reports the time
    * of the center search and of the clustering separately.
    */
  def KMeansInitialCenters(): Unit = {
    val centersStart = System.nanoTime()
    // KMeansInitialization is declared elsewhere in the project; its result is handed to the
    // KMeansModel constructor as the initial centers.
    val centers = new KMeansInitialization().run(sc, dataFile, numOfCenters)
    println(f"Initialization to find centers took ${secondsSince(centersStart)}%.3f seconds.")

    val trainStart = System.nanoTime()
    val clusterModel = new KMeans()
      .setK(numOfCenters)
      .setMaxIterations(maxIterations)
      .setInitialModel(new KMeansModel(centers))
      .run(parsedData)
    println(f"Initialization with custom took ${secondsSince(trainStart)}%.3f seconds.")

    println("\nnumber of points per cluster")
    printClusterSizes(clusterModel)
  }

  /** Clusters with the `KMeans.K_MEANS_PARALLEL` initialisation mode and reports timing. */
  def KMeansParallel(): Unit = {
    val start = System.nanoTime()
    val clusterModel = trainWith(KMeans.K_MEANS_PARALLEL)
    println(f"Initialization with KMeansParaller took ${secondsSince(start)}%.3f seconds.")
    println("number of points per cluster")
    printClusterSizes(clusterModel)
  }

  /** Clusters with the `KMeans.RANDOM` initialisation mode and reports timing. */
  def KMeansRandom(): Unit = {
    val start = System.nanoTime()
    val clusterModel = trainWith(KMeans.RANDOM)
    println(f"Initialization with KMeasRandom took ${secondsSince(start)}%.3f seconds.")
    println("number of points per cluster")
    printClusterSizes(clusterModel)
  }
}

开发者ID:AndyFou，项目名称:kmeans_contributions，代码行数:52，代码来源:KMeansCases.scala

示例3: PCAClustering

//设置package包名称以及导入依赖的类
package graph

import org.apache.spark.SparkContext
import org.apache.spark.graphx.{EdgeDirection, Edge, Graph}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.{DenseVector, Vector, Matrix, Vectors}
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.rdd.RDD
import scala.collection.mutable


/** Clusters the vertices of a graph by running k-means on the principal components of its
  * row-normalised out-neighbour matrix.
  */
class PCAClustering {

  /** Turns a local matrix into an RDD holding one dense vector per matrix row.
    *
    * @param sc context used to parallelise the rows
    * @param m  matrix to convert; `toArray` is read as column-major and transposed
    * @return the rows of `m` as an RDD
    */
  def matrixToRDD(sc:SparkContext, m: Matrix): RDD[Vector] = {
    val columns = m.toArray.grouped(m.numRows)
    val rows = columns.toSeq.transpose // Skip this if you want a column-major RDD.
    val vectors = rows.map(row => Vectors.dense(row.toArray))
    sc.parallelize(vectors)
  }

  /** Assigns every vertex of `inputGraph` to one of `clusterNum` clusters.
    *
    * @param inputGraph graph to cluster; vertex and edge attributes are not inspected
    * @param clusterNum number of k-means clusters
    * @param eigsNum    number of principal components to keep
    * @param sc         context used to parallelise the principal-component rows
    * @return a graph with the original vertex ids and edges whose vertex attribute is the
    *         cluster index
    */
  def run(inputGraph: Graph[Any, Any], clusterNum: Int, eigsNum: Int,sc:SparkContext ): Graph[Int, Any] = {
    // Position in this array is the dense 0-based index of a vertex; the array itself is the
    // reverse lookup (dense index -> original id).
    val vertexIds = inputGraph.vertices.map(_._1).collect()
    val numNode = vertexIds.length
    val mapping: Map[Long, Int] = vertexIds.zipWithIndex.toMap

    //reindex the vertices from 0 to the num of nodes
    val nVertices = inputGraph.vertices.map { case (id, attr) => (mapping(id).toLong, attr) }
    val nEdges = inputGraph.edges.map(e => Edge(mapping(e.srcId).toLong, mapping(e.dstId).toLong, e.attr))
    val ngraph = Graph(nVertices, nEdges)

    // One sparse row per vertex: weight 1/outDegree on every out-neighbour column.
    // collectNeighborIds gives no ordering guarantee, while sparse vectors expect strictly
    // increasing indices, so the indices are sorted (all weights in a row are equal, so the
    // values need no reordering).
    val spvec = ngraph.collectNeighborIds(EdgeDirection.Out).mapValues { neighbours =>
      val indices = neighbours.map(_.toInt).sorted
      Vectors.sparse(numNode, indices, Array.fill(indices.length)(1.0 / indices.length))
    }
    val mat = new RowMatrix(spvec.map(_._2))

    // The matrix has numNode columns, so the principal-component matrix has one row per
    // dense vertex index.
    val pc = mat.computePrincipalComponents(eigsNum)

    val pcRDD = matrixToRDD(sc, pc)
    val clusters = KMeans.train(pcRDD, clusterNum, 100)

    // clusterArray(i) is the cluster of the vertex with dense index i.
    val clusterArray = pcRDD.map(p => clusters.predict(p)).collect()
    val origVertexRDD = spvec.map { case (denseId, _) =>
      (vertexIds(denseId.toInt), clusterArray(denseId.toInt))
    }
    Graph(origVertexRDD, inputGraph.edges)
  }

}

开发者ID:HPCL，项目名称:GalacticSpark，代码行数:56，代码来源:PCAClustering.scala

示例4: VT_sample_label_rdd_class

//设置package包名称以及导入依赖的类
import org.apache.spark.mllib.feature.PCA
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.clustering.KMeans
import PreProcessingConfig._

// Output record: sample hash plus the k-means cluster index stored as a Double.
case class VT_sample_label_rdd_class(sha256:String, label:Double)

// Load the signature rows: column 0 is rendered as the hash string, column 1 is read as a sequence
// of doubles. Row.getSeq is used instead of a raw asInstanceOf cast on the untyped column value.
// NOTE(review): VT_sample_signatures_final_array_rdd_class, `spark` and the *_file paths are not
// defined in this script; they are presumably provided by PreProcessingConfig / the shell session.
val VT_sample_signatures_final_array_rdd = spark.read.format("parquet").load(VT_sample_signatures_final_array_file).rdd.map(row => new VT_sample_signatures_final_array_rdd_class(row(0).toString,row.getSeq[Double](1).toArray))
val VT_sample_signatures_with_sha_rddvector = VT_sample_signatures_final_array_rdd.map(x=>(x.sha256,Vectors.dense(x.array_results)))
val VT_sample_signatures_rddvector = VT_sample_signatures_with_sha_rddvector.map(x=>x._2)

// Project every signature vector onto its first two principal components.
val pca = new PCA(2).fit(VT_sample_signatures_rddvector)
val VT_sample_pca_with_sha_rddvector = VT_sample_signatures_with_sha_rddvector.map(x => (x._1,pca.transform(x._2)))
val VT_sample_pca_rddvector = (VT_sample_pca_with_sha_rddvector.map(x=>x._2)).cache()

// k = 5 clusters, at most 5 iterations. The run-count argument is no longer passed.
val KMeans_Model = KMeans.train(VT_sample_pca_rddvector,5,5)

// (sha256, (pc1, pc2, cluster index)) for every sample, then reduced to (sha256, label).
val VT_sample_pca_label_with_sha_rdd = VT_sample_pca_with_sha_rddvector.map(x=>(x._1,(x._2.toArray(0),x._2.toArray(1),KMeans_Model.predict(x._2))))
val VT_sample_label_rdd = VT_sample_pca_label_with_sha_rdd.map(x=>VT_sample_label_rdd_class(x._1,x._2._3.toDouble))

VT_sample_label_rdd.toDF().write.format("parquet").save(VT_sample_label_file)

开发者ID:HolmesProcessing，项目名称:gsoc_relationship，代码行数:18，代码来源:get_labels_from_VT_signatures.scala

示例5: KMeansExample

//设置package包名称以及导入依赖的类
package com.learn.spark.mllib

import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.{SparkContext, SparkConf}


/** Runs MLlib k-means with default settings on a comma-separated dataset and prints the
  * resulting cluster centers.
  */
object KMeansExample {

    /** Dataset location used when no path is given on the command line. */
    private val DefaultDataPath = "/Users/xiaojie/Downloads/kddcup.data"

    /** Entry point.
      *
      * @param args optional; `args(0)` overrides the input path, otherwise `DefaultDataPath` is read
      */
    def main(args: Array[String]): Unit = {
        val conf = new SparkConf().setMaster("local[4]").setAppName("Kmeans example").set("spark.driver.host", "localhost")
        val sc = new SparkContext(conf)
        try {
            val rawData = sc.textFile(args.headOption.getOrElse(DefaultDataPath))

            val labelsAndData = rawData.map { line =>
                val fields = line.split(",")
                require(fields.length >= 4, s"expected at least 4 comma-separated fields, got ${fields.length}")
                // Columns 1-3 are dropped; of what remains, the last column is the label and
                // everything before it is parsed as a numeric feature.
                val kept = fields.take(1) ++ fields.drop(4)
                val label = kept.last
                val vector = Vectors.dense(kept.init.map(_.toDouble))
                (label, vector)
            }
            val data = labelsAndData.values.cache()

            val kmeans = new KMeans()
            val model = kmeans.run(data)
            model.clusterCenters.foreach(println)
        } finally {
            // The context is stopped even when reading or training fails.
            sc.stop()
        }
    }

}

开发者ID:xiaoJacky，项目名称:sparkLearning，代码行数:30，代码来源:KMeansExample.scala

示例6: KMeansExample

//设置package包名称以及导入依赖的类
package com.stulsoft.pspark.mllib

import com.stulsoft.pspark.util.PSparkUtil
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.{SparkConf, SparkContext}


/** Clusters a comma-separated numeric file into two groups with MLlib k-means and prints the
  * cost, the centers and the cluster assigned to every input line.
  *
  * Written with an explicit `main` rather than `extends App`: Spark's documentation advises
  * against `scala.App` for applications.
  */
object KMeansExample {
  def main(args: Array[String]): Unit = {
    println("==>Start")

    val conf = new SparkConf().setAppName("KMeansExample").setMaster("local[*]")
    val sc = new SparkContext(conf)

    try {
      // Load and parse the data
      val data = sc.textFile(PSparkUtil.getResourceFilePath("data/mllib/kmeans_data.txt"))
      val parsedData = data.map(s => Vectors.dense(s.split(',').map(_.toDouble))).cache()

      // Cluster the data into two classes using KMeans
      val numClusters = 2
      val numIterations = 20
      val clusters = KMeans.train(parsedData, numClusters, numIterations)

      // Evaluate clustering by computing Within Set Sum of Squared Errors
      val WSSSE = clusters.computeCost(parsedData)
      println("Within Set Sum of Squared Errors = " + WSSSE)

      println()
      println()
      println()
      clusters.clusterCenters.foreach(println)

      // Cluster index of every input line, fetched to the driver once and reused for both listings.
      val clusterOfItem = clusters.predict(parsedData).collect()
      clusterOfItem.foreach(println)
      println("(cluster,item)")
      clusterOfItem.zipWithIndex.sorted.foreach(println)
    } finally {
      // Stop the context even when loading or training fails.
      sc.stop()
    }

    println("==>End")
  }
}

开发者ID:ysden123，项目名称:poc，代码行数:42，代码来源:KMeansExample.scala

示例7: KMeansTest

//设置package包名称以及导入依赖的类
package cn.edu.bjtu

import org.apache.spark.SparkConf
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.sql.SparkSession

/** Trains a two-cluster MLlib k-means model on a LibSVM file and scores the cluster indices
  * against the file's labels as if they were class predictions.
  */
object KMeansTest {
  def main(args: Array[String]): Unit = {

    val sparkConf = new SparkConf()
      .setAppName("KMeansTest")
      .setMaster("spark://master:7077")
      .setJars(Array("/home/hadoop/KMeans.jar"))

    val spark = SparkSession.builder()
      .config(sparkConf)
      .getOrCreate()

    spark.sparkContext.setLogLevel("WARN")

    try {
      // Load and parse the data
      val data = MLUtils.loadLibSVMFile(spark.sparkContext, "hdfs://master:9000/sample_formatted.txt")
      val parsedData = data.map(_.features).cache()

      // Cluster the data into two classes using KMeans
      val numClusters = 2
      val numIterations = 20
      val clusters = KMeans.train(parsedData, numClusters, numIterations)

      // Occurrences of every (predicted cluster, label) pair, gathered in a single pass so
      // the metrics below need no further Spark jobs.
      val pairCounts = data
        .map(s => (clusters.predict(s.features), s.label))
        .countByValue()

      // Number of points whose (prediction, label) pair satisfies `keep`.
      def tally(keep: (Int, Double) => Boolean): Double =
        pairCounts.foldLeft(0L) { case (sum, ((prediction, label), n)) =>
          if (keep(prediction, label)) sum + n else sum
        }.toDouble

      // Treat the cluster index as the predicted class and compare it with the label.
      // An empty denominator yields NaN, as floating-point division does.
      val sensitivity = tally((p, l) => p == l && p == 1) / tally((_, l) => l == 1.0)
      val specificity = tally((p, l) => p == l && p == 0) / tally((_, l) => l == 0.0)
      val accuracy = tally((p, l) => p == l) / tally((_, _) => true)

      println("Sensitivity = " + sensitivity)
      println("Specificity = " + specificity)
      println("Accuracy = " + accuracy)
    } finally {
      // Release the session even when loading or training fails.
      spark.stop()
    }
  }
}

开发者ID:XiaoyuGuo，项目名称:DataFusionClass，代码行数:40，代码来源:KMeansTest.scala

注：本文中的org.apache.spark.mllib.clustering.KMeans类示例整理自Github/MSDocs等源码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。