• 设为首页
  • 点击收藏
  • 手机版
    手机扫一扫访问
    迪恩网络手机版
  • 关注官方公众号
    微信扫一扫关注
    公众号

Scala Vectors类代码示例

原作者: [db:作者] 来自: [db:来源] 收藏 邀请

本文整理汇总了Scala中org.apache.spark.mllib.linalg.Vectors的典型用法代码示例。如果您正苦于以下问题：Scala Vectors类的具体用法？Scala Vectors怎么用？Scala Vectors使用的例子？那么恭喜您，这里精选的类代码示例或许可以为您提供帮助。



在下文中一共展示了Vectors类的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Scala代码示例。

示例1: SimpleApp

//设置package包名称以及导入依赖的类
import org.apache.spark.SparkContext
import org.apache.spark.SparkConf

import org.apache.spark.ml.clustering.LDA
import org.apache.spark.mllib.linalg.{VectorUDT, Vectors}
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.sql.types.{StructField, StructType}


/** Demo application that fits an LDA topic model to numeric rows read from a text file. */
object SimpleApp {

  /**
   * Reads `/tmp/lda_data.txt` (one space-separated numeric vector per non-empty line),
   * trains a 10-topic LDA model on it and prints the model metrics, the three
   * highest-weighted terms per topic and the transformed dataset.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("Simple Application").set("spark.ui.enabled", "false")
    val sc = new SparkContext(conf)
    // Stop the context even when loading or training fails.
    try {
      val sqlContext = new SQLContext(sc)

      // Loads data: each non-empty line becomes a dense vector wrapped in a one-column Row.
      val rowRDD = sc.textFile("/tmp/lda_data.txt").filter(_.nonEmpty)
        .map(_.split(" ").map(_.toDouble)).map(Vectors.dense).map(Row(_))
      val schema = StructType(Array(StructField("name", new VectorUDT, false)))
      val dataset = sqlContext.createDataFrame(rowRDD, schema)
      dataset.show()

      val lda = new LDA()
        .setK(10)
        .setMaxIter(10)
        .setFeaturesCol("name")
      val model = lda.fit(dataset)
      val transformed = model.transform(dataset)

      // These metrics each require a pass over the data; report them instead of discarding them.
      val ll = model.logLikelihood(dataset)
      val lp = model.logPerplexity(dataset)
      println(s"Log likelihood: $ll")
      println(s"Log perplexity: $lp")

      // describeTopics
      val topics = model.describeTopics(3)

      // Shows the result
      topics.show(false)
      transformed.show(false)
    } finally {
      sc.stop()
    }
  }
}
开发者ID:mykumar,项目名称:SparkScalaInternalExperiements,代码行数:41,代码来源:SimpleApp.scala


示例2: KMeansClusteringApp

//设置package包名称以及导入依赖的类
package org.apress.prospark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.clustering.StreamingKMeans
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
import org.apache.spark.rdd.RDD.doubleRDDToDoubleRDDFunctions
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext

/** Streaming k-means over space-separated records received from a text socket. */
object KMeansClusteringApp {

  /**
   * Entry point.
   *
   * @param args exactly four values: application name, batch interval in seconds,
   *             socket host name and socket port; anything else prints the usage
   *             line to stderr and exits with status 1
   */
  def main(args: Array[String]): Unit = {
    if (args.length != 4) {
      System.err.println(
        "Usage: KMeansClusteringApp <appname> <batchInterval> <hostname> <port>")
      System.exit(1)
    }
    val Array(appName, batchInterval, hostname, port) = args

    val sparkConf = new SparkConf()
    sparkConf.setAppName(appName)
    sparkConf.setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(sparkConf, Seconds(batchInterval.toInt))

    // Drop records mentioning NaN, tokenize, and drop records whose second field is "0".
    val substream = ssc.socketTextStream(hostname, port.toInt)
      .filter(line => !line.contains("NaN"))
      .map(line => line.split(" "))
      .filter(fields => fields(1) != "0")

    // Column positions kept from every record: the first becomes the label, the other 18 the features.
    val selectedColumns = Seq(1, 4, 5, 6, 10, 11, 12, 20, 21, 22, 26, 27, 28, 36, 37, 38, 42, 43, 44)
    val acceptedLabels = Set(1.0, 2.0, 3.0)

    val orientationStream = substream
      .map(fields => selectedColumns.map(idx => fields(idx)).toArray)
      .map(selected => selected.map(_.toDouble))
      .filter(values => acceptedLabels.contains(values.head))
      .map(values => LabeledPoint(values.head, Vectors.dense(values.tail)))

    // Roughly 30% of each batch is held out for prediction; the remainder is used for training.
    val test = orientationStream.transform(batch => batch.randomSplit(Array(0.3, 0.7))(0))
    val train = orientationStream
      .transformWith(test, (all: RDD[LabeledPoint], heldOut: RDD[LabeledPoint]) => all.subtract(heldOut))
      .cache()

    val model = new StreamingKMeans()
    model.setK(3)
    model.setDecayFactor(0)
    model.setRandomCenters(18, 0.0)

    model.trainOn(train.map(_.features))
    val prediction = model.predictOnValues(test.map(point => (point.label, point.features)))

    ssc.start()
    ssc.awaitTermination()
  }

}
开发者ID:ZubairNabi,项目名称:prosparkstreaming,代码行数:54,代码来源:L9-10KMeans.scala


示例3: Consumer

//设置package包名称以及导入依赖的类
import org.apache.spark.streaming.kafka010.KafkaUtils
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.mllib.classification.SVMModel
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.sql.SparkSession

/** Scores comma-separated records arriving on a Kafka topic with a previously saved SVM model. */
object Consumer {

  /**
   * Subscribes to the `streaming` topic, and for every micro-batch collects the
   * records to the driver and prints the model prediction for each one.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {

    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "localhost:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "use_a_separate_group_id_for_each_stream",
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> java.lang.Boolean.FALSE
    )
    val topics = Array("streaming")

    val sparkConf = new SparkConf().setMaster("local[8]").setAppName("KafkaTest")
    val streamingContext = new StreamingContext(sparkConf, Seconds(1))

    // Direct (receiver-less) input stream over the subscribed topics.
    val kafkaStream = KafkaUtils.createDirectStream[String, String](
      streamingContext,
      PreferConsistent,
      Subscribe[String, String](topics, kafkaParams)
    )

    val sc = SparkSession.builder().master("local[8]").appName("KafkaTest").getOrCreate()
    val model = SVMModel.load(sc.sparkContext, "/home/xiaoyu/model")

    val keyedRecords = kafkaStream.map(record => (record.key, record.value))
    keyedRecords.foreachRDD { batch =>
      batch.collect().foreach { case (_, payload) =>
        // Every field is parsed as a number; the first parsed value is then dropped
        // and the remaining values form the feature vector.
        val features = payload.split(',').map(_.toDouble).tail
        println(model.predict(Vectors.dense(features)))
      }
    }

    streamingContext.start()
    streamingContext.awaitTermination()
  }
}
开发者ID:XiaoyuGuo,项目名称:DataFusionClass,代码行数:55,代码来源:Consumer.scala


示例4: KmeansExample

//设置package包名称以及导入依赖的类
import org.apache.spark.mllib.clustering.{KMeans, KMeansModel}
import org.apache.spark.mllib.linalg.Vectors

import scala.util.Try



/** Clusters three-column vibration readings with k-means and prints predictions for a held-out split. */
object KmeansExample {

  import ApplicationContext._

  /**
   * Reads tab-separated vibration records, builds vectors from columns 1 to 3,
   * trains a two-cluster k-means model on 60% of them, prints the within-set sum
   * of squared errors, saves and reloads the model, and shows the cluster index
   * predicted for the remaining 40%.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    // Stop the context even when a step fails.
    try {
      val vibrationRows = sc.textFile("src/main/resources/PamarcoVibration.txt").map(_.split("\t"))

      // Rows with missing or non-numeric columns are skipped. The vector RDD feeds both
      // splits and the cost computation, so it is the one worth caching.
      val vibrationVector = vibrationRows.flatMap { row =>
        Try(Vectors.dense(row(1).toDouble, row(2).toDouble, row(3).toDouble)).toOption
      }.cache()

      val Array(trainRDD, testRDD) = vibrationVector.randomSplit(Array(0.6, 0.4))

      // Cluster the data into two classes using KMeans
      val numClusters = 2
      val numIterations = 20
      val clusters = KMeans.train(trainRDD, numClusters, numIterations)

      // Evaluate clustering by computing Within Set Sum of Squared Errors
      val WSSSE = clusters.computeCost(vibrationVector)
      println("Within Set Sum of Squared Errors = " + WSSSE)

      // Save and load model
      clusters.save(sc, "target/org/apache/spark/KMeansExample/KMeansModel")
      val sameModel = KMeansModel.load(sc, "target/org/apache/spark/KMeansExample/KMeansModel")

      import spark.implicits._
      val foo = sameModel.predict(testRDD)
      foo.toDF.show
    } finally {
      sc.stop()
    }
  }
}
开发者ID:shiv4nsh,项目名称:spark-kmeans-timeseries-data-example,代码行数:46,代码来源:KmeansExample.scala


示例5: KMeansCases

//设置package包名称以及导入依赖的类
import org.apache.spark.SparkContext
import org.apache.spark.mllib.clustering.{KMeans, KMeansModel}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.log4j.{Level, Logger}

/**
 * Runs k-means on a tab-separated numeric file with three different initialization
 * strategies and prints, for each, the elapsed time and the number of points per cluster.
 *
 * @param sc            Spark context used to read the data
 * @param dataFile      path of the input file; every line is parsed as tab-separated doubles
 * @param numOfCenters  number of clusters
 * @param maxIterations iteration limit passed to k-means
 */
class KMeansCases(sc: SparkContext, dataFile: String, numOfCenters: Int, maxIterations:Int) {
  //hide logger from console
  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  val data = sc.textFile(dataFile)
  val parsedData = data.map(s => Vectors.dense(s.split('\t').map(_.toDouble))).cache()

  /** Evaluates `body`, prints `label` followed by the elapsed seconds, and returns the result. */
  private def timed[A](label: String)(body: => A): A = {
    val start = System.nanoTime()
    val result = body
    val seconds = (System.nanoTime() - start) / 1e9
    println(label + "%.3f".format(seconds) + " seconds.")
    result
  }

  /**
   * Prints the size of every cluster of `model` over `parsedData`. The counts are
   * collected first so the lines appear on the driver console instead of in
   * executor logs.
   */
  private def printClusterSizes(model: KMeansModel): Unit =
    model.predict(parsedData)
      .map(cluster => (cluster, 1))
      .reduceByKey(_ + _)
      .collect()
      .foreach { case (_, size) => println(size) }

  /** Seeds k-means with centers produced by `KMeansInitialization` and reports timings and cluster sizes. */
  def KMeansInitialCenters(): Unit = {
    val centers = timed("Initialization to find centers took ") {
      new KMeansInitialization().run(sc, dataFile, numOfCenters)
    }

    val clusterModel = timed("Initialization with custom took ") {
      val model = new KMeansModel(centers)
      new KMeans().setK(numOfCenters).setMaxIterations(maxIterations).setInitialModel(model).run(parsedData)
    }

    println("\nnumber of points per cluster")
    printClusterSizes(clusterModel)
  }

  /** Trains with the `KMeans.K_MEANS_PARALLEL` initialization mode and reports timing and cluster sizes. */
  def KMeansParallel(): Unit = {
    val clusterModel = timed("Initialization with KMeansParaller took ") {
      KMeans.train(parsedData, numOfCenters, maxIterations, 1, KMeans.K_MEANS_PARALLEL)
    }
    println("number of points per cluster")
    printClusterSizes(clusterModel)
  }

  /** Trains with the `KMeans.RANDOM` initialization mode and reports timing and cluster sizes. */
  def KMeansRandom(): Unit = {
    val clusterModel = timed("Initialization with KMeasRandom took ") {
      KMeans.train(parsedData, numOfCenters, maxIterations, 1, KMeans.RANDOM)
    }
    println("number of points per cluster")
    printClusterSizes(clusterModel)
  }
}
开发者ID:AndyFou,项目名称:kmeans_contributions,代码行数:52,代码来源:KMeansCases.scala


示例6: SparkSGD

//设置package包名称以及导入依赖的类
package linalg.sgd
import scala.util.Random
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.optimization.GradientDescent
import org.apache.spark.mllib.optimization.SquaredL2Updater
import org.apache.spark.mllib.optimization.LogisticGradient
import org.apache.spark.SparkContext



/** Small driver that runs MLlib mini-batch SGD with a logistic gradient on random data. */
object SparkSGD {

  /**
   * Generates four random dense points of 200000 dimensions (all labelled 1.0),
   * runs two SGD iterations from a zero weight vector, and prints the first
   * weight and the first recorded loss.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val numPoints = 4
    val dimension = 200000
    val sc = new SparkContext("local[2]", "")

    // One generator per partition, seeded with the partition index.
    val points = sc
      .parallelize(0 until numPoints, 2)
      .mapPartitionsWithIndex { (partitionId, indices) =>
        val rng = new Random(partitionId)
        indices.map(_ => (1.0, Vectors.dense(Array.fill(dimension)(rng.nextDouble()))))
      }
      .cache()

    val stepSize = 0.1
    val numIterations = 2
    val regParam = 1.0
    val miniBatchFraction = 1.0

    val result = GradientDescent.runMiniBatchSGD(
      points,
      new LogisticGradient,
      new SquaredL2Updater,
      stepSize,
      numIterations,
      regParam,
      miniBatchFraction,
      Vectors.dense(new Array[Double](dimension)))

    println("w:" + result._1(0))
    println("loss:" + result._2(0))
    sc.stop()
  }
}
开发者ID:PacktPublishing,项目名称:Machine-Learning-with-Spark-Second-Edition,代码行数:35,代码来源:SparkSGD.scala


示例7: StandardScalarSample

//设置package包名称以及导入依赖的类
import org.apache.spark.mllib.feature.{StandardScaler, StandardScalerModel}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.{SparkConf, SparkContext}

/** Demonstrates MLlib `StandardScaler` with and without mean centering on a LibSVM sample file. */
object StandardScalarSample {

  /**
   * Fits two scalers on the feature vectors of the sample data and prints the
   * first transformed record produced by each.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local").setAppName("Word2Vector"))
    val data = MLUtils.loadLibSVMFile(sc, "/home/ubuntu/work/spark-1.6.0-bin-hadoop2.6/data/mllib/sample_libsvm_data.txt")
    val features = data.map(_.features)

    val varianceScaler = new StandardScaler().fit(features)
    val centeringScaler = new StandardScaler(withMean = true, withStd = true).fit(features)
    // Built from the same std and mean as centeringScaler, so it yields the same transformations.
    val rebuiltScaler = new StandardScalerModel(centeringScaler.std, centeringScaler.mean)

    // Scaled with the default scaler (unit variance).
    val unitVariance = data.map(point => (point.label, varianceScaler.transform(point.features)))
    println(unitVariance.first())

    // Features are densified first: centering a sparse vector raises an exception.
    // The result has unit variance and zero mean.
    val standardized = data.map { point =>
      val dense = Vectors.dense(point.features.toArray)
      (point.label, centeringScaler.transform(dense))
    }
    println(standardized.first())
  }
}
开发者ID:PacktPublishing,项目名称:Machine-Learning-with-Spark-Second-Edition,代码行数:28,代码来源:StandardScalarSample.scala


示例8: SVMPipeline

//设置package包名称以及导入依赖的类
package org.stumbleuponclassifier

import org.apache.log4j.Logger
import org.apache.spark.SparkContext
import org.apache.spark.mllib.classification.SVMWithSGD
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint


/** Trains an SVM on the StumbleUpon training file and prints its accuracy on the training data. */
object SVMPipeline {
  @transient lazy val logger = Logger.getLogger(getClass.getName)

  /**
   * Loads the tab-separated training file, builds labeled points from columns 4
   * up to (but excluding) the last one, trains an SVM with SGD and prints the
   * fraction of training records whose predicted class equals the label.
   *
   * @param sc Spark context used to read the data
   */
  def svmPipeline(sc: SparkContext): Unit = {
    val records = sc.textFile("/home/ubuntu/work/ml-resources/spark-ml/train_noheader.tsv").map(line => line.split("\t"))

    // The RDD is scanned by every SGD iteration and again for the accuracy, so cache it.
    val data = records.map { r =>
      val trimmed = r.map(_.replaceAll("\"", ""))
      val label = trimmed(r.size - 1).toInt
      // "?" marks a missing value and is treated as 0.0.
      val features = trimmed.slice(4, r.size - 1).map(d => if (d == "?") 0.0 else d.toDouble)
      LabeledPoint(label, Vectors.dense(features))
    }.cache()

    // params for SVM
    val numIterations = 10

    // Run training algorithm to build the model
    val svmModel = SVMWithSGD.train(data, numIterations)

    // The threshold is deliberately left in place: once cleared, predict() returns the
    // raw margin rather than a 0/1 class, and the equality test below would be meaningless.
    val svmTotalCorrect = data.map { point =>
      if (svmModel.predict(point.features) == point.label) 1 else 0
    }.sum()

    // calculate accuracy
    val svmAccuracy = svmTotalCorrect / data.count()
    println(svmAccuracy)
  }

}
开发者ID:PacktPublishing,项目名称:Machine-Learning-with-Spark-Second-Edition,代码行数:42,代码来源:SVMPipeline.scala


示例9: PCAClustering

//设置package包名称以及导入依赖的类
package graph

import org.apache.spark.SparkContext
import org.apache.spark.graphx.{EdgeDirection, Edge, Graph}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.{DenseVector, Vector, Matrix, Vectors}
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.rdd.RDD
import scala.collection.mutable


/** Clusters graph vertices by running k-means on the principal components of a neighbour matrix. */
class PCAClustering {

  /**
   * Converts a local matrix into an RDD holding one dense vector per matrix row.
   *
   * @param sc Spark context used to parallelize the rows
   * @param m  matrix to convert; its values are read through `toArray` and grouped by `numRows`
   * @return the rows of `m` as an RDD
   */
  def matrixToRDD(sc:SparkContext, m: Matrix): RDD[Vector] = {
    val columns = m.toArray.grouped(m.numRows)
    val rows = columns.toSeq.transpose // Skip this if you want a column-major RDD.
    val vectors = rows.map(row => new DenseVector(row.toArray))
    sc.parallelize(vectors)
  }

  /**
   * Assigns a cluster index to every vertex of `inputGraph`.
   *
   * @param inputGraph graph to cluster
   * @param clusterNum number of k-means clusters
   * @param eigsNum    number of principal components to compute
   * @param sc         Spark context
   * @return a graph with the original edges whose vertex attribute is the cluster index
   */
  def run(inputGraph: Graph[Any, Any], clusterNum: Int, eigsNum: Int,sc:SparkContext ): Graph[Int, Any] = {
    // Dense re-indexing of the vertex ids: position in verticeIds <-> original id.
    val verticeIds = inputGraph.vertices.map(_._1).collect()
    val numNode = verticeIds.length
    val mapping: Map[Long, Int] = verticeIds.zipWithIndex.toMap

    // Re-index the vertices from 0 to the number of nodes.
    val nVertices = inputGraph.vertices.map(u => (mapping(u._1).toLong, u._2))
    val nEdges = inputGraph.edges.map(e => Edge(mapping(e.srcId).toLong, mapping(e.dstId).toLong, e.attr))
    val ngraph = Graph(nVertices, nEdges)

    val output = ngraph.collectNeighborIds(EdgeDirection.Out)
    val spvec = output.mapValues { neighbours =>
      // Every neighbour contributes 1/outDegree. Neighbour ids may arrive unsorted and
      // repeated (parallel edges), while a sparse vector needs strictly increasing
      // indices, so repeated ids are merged and the (index, value) overload does the sorting.
      val weight = 1.0 / neighbours.length
      val entries = neighbours
        .groupBy(_.toInt)
        .map { case (index, occurrences) => (index, occurrences.length * weight) }
        .toSeq
      Vectors.sparse(numNode, entries)
    }
    val rows = spvec.map(_._2)
    val order = spvec.map(_._1)
    val mat = new RowMatrix(rows)

    val pc = mat.computePrincipalComponents(eigsNum)

    val pcRDD = matrixToRDD(sc, pc)
    val clusters = KMeans.train(pcRDD, clusterNum, 100)

    val clusterArray = pcRDD.map(p => clusters.predict(p)).collect()
    val assignedClusters = order.map(o => (o, clusterArray(o.toInt)))
    // Translate the dense indices back to the original vertex ids.
    val origVerextRDD = assignedClusters.map { case (vid, value) => (verticeIds(vid.toInt), value) }
    Graph(origVerextRDD, inputGraph.edges)
  }

}
开发者ID:HPCL,项目名称:GalacticSpark,代码行数:56,代码来源:PCAClustering.scala


示例10: SimpleApp

//设置package包名称以及导入依赖的类
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.mllib.linalg.{Vector, Vectors}

/** Demo of MLlib column statistics and correlation on tiny in-memory datasets. */
object SimpleApp {

  /**
   * Prints mean, max and min of the single-column dataset 1, 2, 3, then prints
   * the correlation matrix of three students' exam scores.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("Simple Application")
    val sc = new SparkContext(conf)

    val data = Array(1,2,3)
    val distData = sc.parallelize(data)
    val vectorData = distData.map(x => Vectors.dense(x))

    val summary = Statistics.colStats(vectorData)

    println("mean is: %s".format(summary.mean))
    println("max is: %s".format(summary.max))
    println("min is: %s".format(summary.min))

    //find correlation
    // student, exam1, exam2, exam3
    // Named differently from `data` above: two vals with the same name in one scope do not compile.
    val examScores = sc.parallelize(Array("111, 60, 65, 73", "222, 98,95,88", "333, 56,67,62"))
    // Drop the student id and keep the three scores as a vector.
    val vectorRdd = examScores
      .map((line: String) => line.split(",").drop(1).map((ele: String) => ele.toDouble))
      .map(scores => Vectors.dense(scores))
    val corrMatrix = Statistics.corr(vectorRdd)
    println("correlation matrix is:\n%s".format(corrMatrix))
  }
}
开发者ID:mykumar,项目名称:SparkScalaInternalExperiements,代码行数:30,代码来源:SimpleApp.scala


示例11: VT_sample_label_rdd_class

//设置package包名称以及导入依赖的类
import org.apache.spark.mllib.feature.PCA
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.clustering.KMeans
import PreProcessingConfig._

// Output record: a sample's sha256 together with the label assigned to it below.
case class VT_sample_label_rdd_class(sha256:String, label:Double)
// Load the signature arrays from parquet: column 0 is read as a string and column 1 is cast to a sequence of doubles.
// NOTE(review): VT_sample_signatures_final_array_rdd_class is not defined in this snippet — presumably declared elsewhere with fields (sha256, array_results); confirm.
val VT_sample_signatures_final_array_rdd = spark.read.format("parquet").load(VT_sample_signatures_final_array_file).rdd.map(row => new VT_sample_signatures_final_array_rdd_class(row(0).toString,row(1).asInstanceOf[Seq[Double]].toArray))
// Pair each sha256 with a dense vector built from its signature array.
val VT_sample_signatures_with_sha_rddvector = VT_sample_signatures_final_array_rdd.map(x=>(x.sha256,Vectors.dense(x.array_results)))
// Vectors only, used to fit the PCA model.
val VT_sample_signatures_rddvector = VT_sample_signatures_with_sha_rddvector.map(x=>x._2)
// Fit a PCA model that keeps 2 components.
val pca = new PCA(2).fit(VT_sample_signatures_rddvector)
// Project every signature vector with the fitted PCA model, keeping the sha256 alongside.
val VT_sample_pca_with_sha_rddvector = VT_sample_signatures_with_sha_rddvector.map(x => (x._1,pca.transform(x._2)))
// Projected vectors only; cached before being handed to KMeans.train.
val VT_sample_pca_rddvector = (VT_sample_pca_with_sha_rddvector.map(x=>x._2)).cache()
// NOTE(review): arguments are presumably k = 5, maxIterations = 5, runs = 1 — confirm against the MLlib KMeans.train overload in use.
val KMeans_Model = KMeans.train(VT_sample_pca_rddvector,5,5,1)
// For every sample: (sha256, (first component, second component, predicted cluster index)).
val VT_sample_pca_label_with_sha_rdd = VT_sample_pca_with_sha_rddvector.map(x=>(x._1,(x._2.toArray(0),x._2.toArray(1),KMeans_Model.predict(x._2))))
// Keep only the sha256 and the cluster index (converted to Double) as the label.
val VT_sample_label_rdd = VT_sample_pca_label_with_sha_rdd.map(x=>new VT_sample_label_rdd_class(x._1,x._2._3.toDouble))

// Write the labels as parquet to VT_sample_label_file.
// NOTE(review): toDF() needs the session implicits in scope — presumably provided by the shell/notebook; confirm.
VT_sample_label_rdd.toDF().write.format("parquet").save(VT_sample_label_file) 
开发者ID:HolmesProcessing,项目名称:gsoc_relationship,代码行数:18,代码来源:get_labels_from_VT_signatures.scala


示例12: Event

//设置package包名称以及导入依赖的类
package fraud.main

import org.apache.spark.mllib.linalg.Vectors
import spray.json.DefaultJsonProtocol
import java.util.UUID._

/**
 * A single interaction event; every field is carried as plain text.
 *
 * @param id        identifier of the event
 * @param user      name of the user
 * @param item      name of the item
 * @param action    name of the action
 * @param timestamp textual timestamp of the event
 */
case class Event(
  id: String,
  user: String,
  item: String,
  action: String,
  timestamp: String
)

/** spray-json protocol providing the JSON format for [[Event]]. */
object EventJsonProtocol extends DefaultJsonProtocol {
  // Implicit members get an explicit type so that implicit resolution does not
  // depend on the inferred type or on definition order.
  implicit val EventFormat: spray.json.RootJsonFormat[Event] = jsonFormat5(Event)
}

/** Fixed vocabulary of items, actions and users, plus the numeric encoding of an event. */
object Domain {
  val items = Seq("Toy Banner", "Suit Banner", "Skirt Banner")
  // Each item name maps to its position in `items`.
  val itemIds = items.zipWithIndex.toMap

  val actions = Seq("Click", "View", "Loaded")

  val users = Seq("Billy", "John", "Mary")
  // Each user name maps to its position in `users`.
  val userIds = users.zipWithIndex.toMap

  /** Encodes an event as the two-element dense vector (item id, user id). */
  def features(e: Event) = Vectors.dense(itemId(e).toDouble, userId(e).toDouble)

  /** Numeric id of the event's item; the lookup fails for an item name not listed in `items`. */
  def itemId(e: Event): Int = itemIds(e.item)

  /** Numeric id of the event's user; the lookup fails for a user name not listed in `users`. */
  def userId(e: Event): Int = userIds(e.user)
}

/** Factory for events built from the `Domain` vocabulary. */
object RandomEvent {
  val rnd = new scala.util.Random()

  /** Event with a fresh UUID that always uses the first user, the second item and the first action. */
  def randomFraudEvent() = Event(
    id = randomUUID.toString,
    user = Domain.users.head,
    item = Domain.items(1),
    action = Domain.actions.head,
    timestamp = timestamp()
  )

  /** Constructs an event with a fresh UUID and randomly chosen user, item and action. */
  def apply(): Event = Event(
    id = randomUUID.toString,
    user = randomUser(),
    item = randomItem(),
    action = randomAction(),
    timestamp = timestamp()
  )

  /** A randomly chosen entry of `Domain.items`. */
  def randomItem() = {
    val index = rnd.nextInt(Domain.items.size)
    Domain.items(index)
  }

  /** A randomly chosen entry of `Domain.users`. */
  def randomUser() = {
    val index = rnd.nextInt(Domain.users.size)
    Domain.users(index)
  }

  /** A randomly chosen entry of `Domain.actions`. */
  def randomAction() = {
    val index = rnd.nextInt(Domain.actions.size)
    Domain.actions(index)
  }

  /** Current date and time rendered with `java.util.Date.toString`. */
  def timestamp() = new java.util.Date().toString()
}
开发者ID:bbiletskyy,项目名称:fraud-dm,代码行数:37,代码来源:Domain.scala


示例13: DataLoader

//设置package包名称以及导入依赖的类
package ca.jakegreene.iris

import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.DataFrame
import org.apache.spark.mllib.linalg.Vectors

/** Mixin that loads comma-separated iris records into a two-column DataFrame. */
trait DataLoader {

  /** Name of the DataFrame column holding the feature vector. */
  def irisFeatureColumn = "iris-features"

  /** Name of the DataFrame column holding the iris type. */
  def irisTypeColumn = "iris-type"

  /**
   * Reads `filePath` and keeps every record that has exactly five comma-separated
   * fields: the first four are parsed as doubles into a dense vector and the fifth
   * is kept as text. Records with any other number of fields are skipped.
   *
   * @param filePath   location of the text file
   * @param sqlContext SQL context used to read the file and build the DataFrame
   * @return a DataFrame with columns `irisFeatureColumn` and `irisTypeColumn`
   */
  def loadIris(filePath: String)(implicit sqlContext: SQLContext): DataFrame = {
    val lines = sqlContext.sparkContext.textFile(filePath)
    val irisData = lines.flatMap { text =>
      text.split("\n").toList.flatMap { record =>
        record.split(",") match {
          case Array(sepalLength, sepalWidth, petalLength, petalWidth, irisType) =>
            val measurements =
              Vectors.dense(sepalLength.toDouble, sepalWidth.toDouble, petalLength.toDouble, petalWidth.toDouble)
            Some((measurements, irisType))
          case _ =>
            None
        }
      }
    }
    val frame = sqlContext.createDataFrame(irisData)
    frame.toDF(irisFeatureColumn, irisTypeColumn)
  }
}
开发者ID:JakeGreene,项目名称:iris-ml,代码行数:23,代码来源:DataLoader.scala


示例14: KMeansExample

//设置package包名称以及导入依赖的类
package com.learn.spark.mllib

import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.{SparkContext, SparkConf}


/** Runs k-means with default parameters on the KDD Cup data file and prints the cluster centers. */
object KMeansExample {

    /**
     * Parses every comma-separated line into a (label, feature vector) pair,
     * clusters the feature vectors and prints each resulting center.
     *
     * @param args command-line arguments (unused)
     */
    def main(args: Array[String]): Unit = {
        val conf = new SparkConf()
        conf.setMaster("local[4]")
        conf.setAppName(s"Kmeans example")
        conf.set("spark.driver.host", "localhost")
        val sc = new SparkContext(conf)
        val rawData = sc.textFile("/Users/xiaojie/Downloads/kddcup.data")

        val labelsAndData = rawData.map { line =>
            val fields = line.split(",").toBuffer
            // Drop the three fields at positions 1 to 3.
            fields.remove(1, 3)
            // The last remaining field is the label; everything else is numeric.
            val label = fields.remove(fields.length - 1)
            val numeric = fields.map(_.toDouble).toArray
            (label, Vectors.dense(numeric))
        }
        val data = labelsAndData.map(_._2).cache()

        val model = new KMeans().run(data)
        for (center <- model.clusterCenters) println(center)
    }

}
开发者ID:xiaoJacky,项目名称:sparkLearning,代码行数:30,代码来源:KMeansExample.scala


示例15: Utils

//设置package包名称以及导入依赖的类
package com.github.aadamson.spark_glove

import org.apache.spark.{SparkConf, SparkContext};
import org.apache.spark.mllib.linalg.{Vector, Vectors, Matrix, Matrices, DenseMatrix};
import org.apache.spark.mllib.linalg.distributed.{CoordinateMatrix, BlockMatrix, RowMatrix, MatrixEntry, IndexedRow, IndexedRowMatrix};
import org.apache.spark.rdd.RDD;

/** Helpers for building and combining MLlib distributed matrices. */
object Utils {
  /** RDD of matrix cells keyed by (row, column). */
  type CoordinateRDD[T] = RDD[((Long, Long), T)]

  /**
   * Implicitly converts an RDD of float cells into a `CoordinateMatrix`.
   *
   * @param a cells keyed by (row, column)
   * @return coordinate matrix holding one entry per cell
   */
  implicit def CoordinateRDD2CoordinateMatrix(a: CoordinateRDD[Float]): CoordinateMatrix = {
    val entries: RDD[MatrixEntry] = a.map { case ((i, j), value) => new MatrixEntry(i, j, value) }
    new CoordinateMatrix(entries)
  }

  /**
   * Builds a matrix whose rows 0 until `numRows` all hold the same vector.
   *
   * @param v       vector repeated on every row
   * @param numRows number of rows to generate
   * @param sc      Spark context used to create the rows
   * @return indexed row matrix with `numRows` copies of `v`
   */
  def broadcastVector(v: Vector, numRows: Int, sc: SparkContext): IndexedRowMatrix = {
    val rows: RDD[IndexedRow] = sc.parallelize(0 until numRows).map(i => new IndexedRow(i, v))
    new IndexedRowMatrix(rows)
  }

  /**
   * Element-wise (Hadamard) product of two block matrices or of two vectors.
   *
   * @param a left operand, a `BlockMatrix` or a `Vector`
   * @param b right operand, of the same kind as `a`
   * @tparam T operand and result type
   * @return the element-wise product
   * @throws MatchError               when the operands are neither two block matrices nor two vectors
   * @throws IllegalArgumentException when two vectors differ in size
   */
  def elementwiseProduct[T](a: T, b: T): T = (a, b) match {
    case (x: BlockMatrix, y: BlockMatrix) =>
      val aIRM = x.toIndexedRowMatrix()
      val bIRM = y.toIndexedRowMatrix()
      // NOTE(review): zip pairs rows by position and assumes both row RDDs have the same
      // partitioning and ordering — confirm this holds for the matrices passed in.
      val rows = aIRM.rows.zip(bIRM.rows).map {
        case (aRow: IndexedRow, bRow: IndexedRow) =>
          new IndexedRow(aRow.index, elementwiseProduct(aRow.vector, bRow.vector))
      }
      new IndexedRowMatrix(rows).toBlockMatrix().asInstanceOf[T]

    case (x: Vector, y: Vector) =>
      require(x.size == y.size, s"vectors must have the same size, got ${x.size} and ${y.size}")
      // Multiply matching elements (the previous implementation added them).
      val product = x.toArray.zip(y.toArray).map { case (left, right) => left * right }
      Vectors.dense(product).asInstanceOf[T]
  }
}
开发者ID:aadamson,项目名称:spark-glove,代码行数:38,代码来源:Utils.scala


示例16: ZFLSH

//设置package包名称以及导入依赖的类
package AccurateML.lsh

import breeze.linalg.DenseMatrix
import org.apache.spark.mllib.linalg
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import AccurateML.blas.ZFBLAS


/**
 * Random-projection sketch: every vector is hashed to a string of `n` bits, one
 * per column of a random `m` x `n` matrix.
 *
 * The class is `Serializable` because `hashVector` is called inside RDD closures,
 * which have to ship the instance to executors.
 * NOTE(review): serialization also requires the Breeze members (`normal01`, `nmat`)
 * to be serializable — confirm for the Breeze version in use.
 *
 * @param n number of bits in the sketch (columns of the projection matrix)
 * @param m dimension of the hashed vectors (rows of the projection matrix)
 */
class ZFLSH(
             n: Int,
             m: Int) extends Serializable {
  val normal01 = breeze.stats.distributions.Gaussian(0, 1)
  val nmat = DenseMatrix.rand(m, n, normal01)

  /**
   * Computes the sketch of `vector`.
   *
   * @param vector vector to hash
   * @return a string of `n` characters; position i is "1" when the dot product of
   *         `vector` with column i of `nmat` is positive and "0" otherwise
   */
  def hashVector(vector: linalg.Vector): String = {
    val bits = (0 until n).map { i =>
      val column = nmat(::, i)
      if (ZFBLAS.dot(vector, Vectors.dense(column.toArray)) > 0) 1 else 0
    }
    bits.mkString("")
  }

  /** Usage sketch: hashes every vector of an object file and keys it by its sketch. */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("test lsh")
    val sc = new SparkContext(conf)
    val numBits = 4
    val numFeatures = 1000
    val lsh = new ZFLSH(numBits, numFeatures)
    val data: RDD[Vector] = sc.objectFile("") //eg, the first element in data is dfirst=Vector(1.0,2.0,...,1000.0)
    val mapData:RDD[(String,Vector)]=data.map(vec=>(lsh.hashVector(vec),vec))
    //eg,the first element in mapData mdfirst=Tuple2("1010",Vector(1.0,2.0,...,100.0))
    //"1010" is the sketch of dfirst=Vector(1.0,2.0,...,1000.0)
    //the instances with the same sketch will belong to the same cluster

  }

}
开发者ID:harryandlina,项目名称:AccurateML,代码行数:44,代码来源:ZFLSH.scala


示例17: ZFCaculateEuclideanMatrix

//设置package包名称以及导入依赖的类
package AccurateML.blas

import scala.collection.mutable.ArrayBuffer
import scala.io.Source
import org.apache.spark.mllib.linalg.Vectors


/** Prints the pairwise Euclidean distance matrix of the vectors stored in a CSV file. */
object ZFCaculateEuclideanMatrix {

  /**
   * Reads one comma-separated numeric vector per line, computes the distance
   * between every pair of vectors and prints the symmetric matrix, one
   * comma-separated row per line.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val fileName="/Users/zhangfan/Downloads/cancel"

    // Read everything eagerly so the file handle can be released right away.
    val source = Source.fromFile(fileName)
    val lines = try source.getLines().toArray finally source.close()

    val vecs = lines.map(line => Vectors.dense(line.split(",").map(_.toDouble)))
    val n = vecs.length
    val matrix = Array.ofDim[Double](n, n)

    // Only the lower triangle is computed; each value is mirrored across the
    // diagonal, which stays 0.0.
    for (i <- 0 until n; j <- 0 until i) {
      val distance = ZFUtils.zfEuclideanDistance(vecs(i), vecs(j))
      matrix(i)(j) = distance
      matrix(j)(i) = distance
    }

    matrix.foreach(row => println(row.mkString(",")))
  }

}
开发者ID:harryandlina,项目名称:AccurateML,代码行数:33,代码来源:ZFCaculateEuclideanMatrix.scala


示例18: TitanicBayes

//设置package包名称以及导入依赖的类
import org.apache.spark.mllib.classification.{NaiveBayes, NaiveBayesModel}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row}


/** Naive Bayes classifier over Titanic passenger rows. */
object TitanicBayes {

  /** Model produced by the latest call to `train`; null until `train` has run. */
  var naiveBayesModel: NaiveBayesModel = null

  /**
   * Builds the feature vector of a passenger row. Shared by `train` and `predict`
   * so both always use the same columns in the same order.
   */
  private def featureVector(row: Row): org.apache.spark.mllib.linalg.Vector =
    Vectors.dense(
      row.getAs[Double]("Fare"), row.getAs[Int]("Pclass"), row.getAs[Double]("Age"), row.getAs[Int]("Sex"),
      row.getAs[Int]("Parch"), row.getAs[Int]("SibSp"), row.getAs[Int]("Embarked"))

  /**
   * Trains a multinomial naive Bayes model and stores it in `naiveBayesModel`.
   *
   * @param df rows holding the `Survived` label and the feature columns
   */
  def train(df: DataFrame): Unit = {
    val labledData = df.map { row =>
      LabeledPoint(row.getAs[Int]("Survived"), featureVector(row))
    }
    naiveBayesModel = NaiveBayes.train(labledData, lambda = 1.0, modelType = "multinomial")
  }

  /**
   * Predicts the class of every row.
   *
   * @param df rows holding `PassengerId` and the feature columns
   * @return rows of (PassengerId, predicted class as Int)
   * @throws IllegalStateException when called before `train`
   */
  def predict(df: DataFrame): RDD[Row] = {
    // Copy the model into a local value so the closure serializes the model itself.
    // Reading the object's var inside the closure would resolve to the executor-side
    // singleton, where the var is still null.
    val model = naiveBayesModel
    if (model == null) {
      throw new IllegalStateException("TitanicBayes.predict called before train")
    }

    df.map { row =>
      val result = model.predict(featureVector(row))
      Row.fromTuple((row.getAs[Int]("PassengerId"), result.toInt))
    }
  }

}
开发者ID:digital-thinking,项目名称:spark-titanic,代码行数:37,代码来源:TitanicBayes.scala


示例19: StreamingSimpleModel

//设置package包名称以及导入依赖的类
package com.bigchange.streaming

import breeze.linalg.DenseVector
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.{LabeledPoint, StreamingLinearRegressionWithSGD}
import org.apache.spark.streaming.{Seconds, StreamingContext}


/** Online linear regression over events received from a text socket. */
object StreamingSimpleModel {

  /**
   * Trains a streaming linear regression model on events formatted as
   * `label<TAB>f1,f2,...` and prints the MSE and RMSE of every batch.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {

    val ssc = new StreamingContext("local","test",Seconds(10))
    val stream = ssc.socketTextStream("localhost",9999)

    // The model starts from an all-zero weight vector of 100 features.
    val numberFeatures = 100
    val zeroVector = DenseVector.zeros[Double](numberFeatures)
    val model = new StreamingLinearRegressionWithSGD()
    model.setInitialWeights(Vectors.dense(zeroVector.data))
    model.setNumIterations(1)
    model.setStepSize(0.01)

    // Parse every event into a labeled point.
    val labeledStream = stream.map { event =>
      val parts = event.split("\t")
      val featureValues = parts(1).split(",").map(_.toDouble)
      LabeledPoint(label = parts(0).toDouble, features = Vectors.dense(featureValues))
    }

    model.trainOn(labeledStream)

    // For every batch, compute prediction minus label using the latest model.
    val predictAndTrue = labeledStream.transform { rdd =>
      val latestModel = model.latestModel()
      rdd.map(point => latestModel.predict(point.features) - point.label)
    }

    // Report the mean squared error and its square root per batch.
    predictAndTrue.foreachRDD { rdd =>
      val mse = rdd.map(x => x * x).mean()
      val rmse = math.sqrt(mse)
      println(s"current batch, MSE: $mse, RMSE:$rmse")
    }

    ssc.start()
    ssc.awaitTermination()
  }
}
开发者ID:bigchange,项目名称:AI,代码行数:51,代码来源:StreamingSimpleModel.scala


示例20: transform

//设置package包名称以及导入依赖的类
package it.agilelab.bigdata.wasp.consumers

import it.agilelab.bigdata.wasp.consumers.MlModels.TransformerWithInfo
import it.agilelab.bigdata.wasp.consumers.strategies.{ReaderKey, Strategy}
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.param.Params
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.sql.{SQLContext, DataFrame}


  /**
   * Applies the broadcast ML model to the data frame registered under the
   * ("topic", `MlModelsSpecBatchModelMaker.nameDF`) reader key.
   *
   * The `featuresArray` column is converted to an MLlib dense vector and added as
   * a `features` column before the model's transformer is applied.
   *
   * @param dataFrames input data frames by reader key
   * @return the output of the model's transformer
   */
  override def transform(dataFrames: Map[ReaderKey, DataFrame]): DataFrame = {
    import org.apache.spark.sql.functions._
    import org.apache.spark.mllib.linalg.{Vector, Vectors}

    assert(sparkContext.isEmpty)

    val inputKey = ReaderKey("topic", MlModelsSpecBatchModelMaker.nameDF)
    val input = dataFrames.get(inputKey).get

    val modelInfo = mlModelsBroadcast.get(MlModelsSpecBatchModelMaker.nameModel, MlModelsSpecBatchModelMaker.versionModel)
    assert(modelInfo.isDefined)

    // UDF turning a sequence of doubles into a dense vector.
    val arrayToVector = udf[Vector, Seq[Double]](values => Vectors.dense(values.toArray))
    val featuresColumn = arrayToVector(input.col("featuresArray"))

    modelInfo.get.transformer.transform(input.withColumn("features", featuresColumn))
  }
} 
开发者ID:agile-lab-dev,项目名称:wasp,代码行数:32,代码来源:MlModelsSpecBatchModelMaker.scala



注:本文中的org.apache.spark.mllib.linalg.Vectors类示例整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。


鲜花

握手

雷人

路过

鸡蛋
该文章已有0人参与评论

请发表评论

全部评论

专题导读
上一篇:
Scala Seconds类代码示例发布时间:2022-05-23
下一篇:
Scala ExecutionContextExecutor类代码示例发布时间:2022-05-23
热门推荐
热门话题
阅读排行榜

扫描微信二维码

查看手机版网站

随时了解更新最新资讯

139-2527-9053

在线客服(服务时间 9:00~18:00)

在线QQ客服
地址:深圳市南山区西丽大学城创智工业园
电邮:jeky_zhao#qq.com
移动电话:139-2527-9053

Powered by 互联科技 X3.4© 2001-2213 极客世界.|Sitemap