Scala Vector Class Code Examples


This article collects typical usage examples of the Scala class org.apache.spark.mllib.linalg.Vector. If you have been wondering what the Scala Vector class is for, how to use it, or where to find working examples, the curated code samples below should help.



The sections below present 17 code examples of the Vector class, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Scala code samples.

Example 1: Predict

// Set the package name and import the dependent classes
package com.databricks.apps.twitterClassifier

import org.apache.spark.SparkContext
import org.apache.spark.mllib.clustering.KMeansModel
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.streaming.twitter._
import org.apache.spark.streaming.{Seconds, StreamingContext}

object Predict extends App {
  import SparkSetup._

  val options = PredictOptions.parse(args)
  val ssc = new StreamingContext(sc, Seconds(options.intervalInSecs))
  Predictor.doIt(options, sc, ssc)
}


object Predictor {
  def doIt(options: PredictOptions, sc: SparkContext, ssc: StreamingContext) {
    println("Initializing the the KMeans model...")
    val model: KMeansModel = new KMeansModel(sc.objectFile[Vector](options.modelDirectory.getCanonicalPath).collect)

    println("Materializing Twitter stream...")
    TwitterUtils.createStream(ssc, maybeTwitterAuth)
      .map(_.getText)
      .foreachRDD { rdd =>
        rdd.filter(t => model.predict(featurize(t)) == options.clusterNumber)
           .foreach(print)  // register DStream as an output stream and materialize it
      }
    println("Initialization complete, starting streaming computation.")
    ssc.start()
    ssc.awaitTermination()
  }
} 
Developer: krish121, Project: Spark-reference-applications, Lines: 35, Source: Predict.scala
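
For context, Predict expects options.modelDirectory to contain the KMeans cluster centers saved as a Spark object file. Below is a minimal training-and-saving sketch, assuming a live SparkContext named sc, the featurize helper from the twitterClassifier package object (see Example 6), and hypothetical paths and parameter values:

import com.databricks.apps.twitterClassifier._
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.rdd.RDD

// Vectorize the training texts with the same featurize() used at prediction time.
val texts: RDD[String] = sc.textFile("/tmp/tweets.txt")            // hypothetical input path
val trainingVectors: RDD[Vector] = texts.map(featurize).cache()
val model = KMeans.train(trainingVectors, 10, 20)                  // k and iteration count are made up
// Persist only the cluster centers; Predict rebuilds the KMeansModel from this object file.
sc.parallelize(model.clusterCenters).saveAsObjectFile("/tmp/kmeans-model")  // hypothetical path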


Example 2: PCAClustering

// Set the package name and import the dependent classes
package graph

import org.apache.spark.SparkContext
import org.apache.spark.graphx.{EdgeDirection, Edge, Graph}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.{DenseVector, Vector, Matrix, Vectors}
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.rdd.RDD
import scala.collection.mutable


class PCAClustering {
  def matrixToRDD(sc:SparkContext, m: Matrix): RDD[Vector] = {
    val columns = m.toArray.grouped(m.numRows)
    val rows = columns.toSeq.transpose // Skip this if you want a column-major RDD.
    val vectors = rows.map(row => new DenseVector(row.toArray))
    sc.parallelize(vectors)
  }

  def run(inputGraph: Graph[Any, Any], clusterNum: Int, eigsNum: Int,sc:SparkContext ): Graph[Int, Any] = {
    val numNode = inputGraph.numVertices.toInt
    val mapping = new mutable.HashMap[Long,Int]()
    val revMapping = new mutable.HashMap[Int, Long]()

    val verticeIds = inputGraph.vertices.map( u => u._1 ).collect()
    for(i<-0 to numNode - 1) {
      mapping.put(verticeIds.apply(i), i)
      revMapping.put(i, verticeIds.apply(i))
    }

    //reindex the vertices from 0 to the number of nodes
    val nVertices = inputGraph.vertices.map( u=> (mapping.apply(u._1).toLong, u._2))
    val nEdges = inputGraph.edges.map(e=> Edge(mapping.apply(e.srcId).toLong, mapping.apply(e.dstId).toLong, e.attr))
    val ngraph = Graph(nVertices, nEdges)

    val output = ngraph.collectNeighborIds(EdgeDirection.Out)
    val spvec = output.mapValues(r => Vectors.sparse( numNode,  r.map(e=>e.toInt) , r.map(e=> 1.0/r.length )))
    val rows = spvec.map(v=>v._2)
    val order = spvec.map(v=>v._1)
    val mat = new RowMatrix(rows)

    val pc = mat.computePrincipalComponents(eigsNum)


    val pcRDD = matrixToRDD(sc, pc)
    val clusters = KMeans.train(pcRDD, clusterNum, 100)

    val clusterArray = pcRDD.map(p=> clusters.predict(p) ).collect()
    val assignedClusters = order.map( o => (o, clusterArray.apply(o.toInt)))
    val origVerextRDD = assignedClusters.map{case (vid, value)=> (revMapping.apply(vid.toInt), value)}
    Graph(origVerextRDD, inputGraph.edges)

  }

} 
Developer: HPCL, Project: GalacticSpark, Lines: 56, Source: PCAClustering.scala
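
A minimal local driver for PCAClustering, as a sketch: the demo object name, the tiny four-node graph, and the cluster/eigenvector counts are all made up for illustration, and the class above is assumed to be on the classpath.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.graphx.{Edge, Graph}
import org.apache.spark.rdd.RDD
import graph.PCAClustering

object PCAClusteringDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("PCAClusteringDemo"))
    // A tiny directed cycle; vertex and edge attributes are not used by PCAClustering.
    val vertices: RDD[(Long, Any)] = sc.parallelize(Seq(1L, 2L, 3L, 4L).map(id => (id, id: Any)))
    val edges: RDD[Edge[Any]] = sc.parallelize(Seq(
      Edge(1L, 2L, 1: Any), Edge(2L, 3L, 1: Any), Edge(3L, 4L, 1: Any), Edge(4L, 1L, 1: Any)))
    val clustered = new PCAClustering().run(Graph(vertices, edges), 2, 2, sc)  // 2 clusters, 2 components
    clustered.vertices.collect().foreach(println)  // (original vertex id, assigned cluster) pairs
    sc.stop()
  }
}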


Example 3: RatePredictor

// Set the package name and import the dependent classes
package com.ferhtaydn.rater

import akka.actor.ActorSystem
import com.ferhtaydn.models.PatientInfo
import org.apache.spark.ml.feature.StringIndexerModel
import org.apache.spark.ml.tuning.CrossValidatorModel
import org.apache.spark.mllib.linalg.{ Matrix, Vector }
import org.apache.spark.sql.{ Row, SQLContext }

import scala.concurrent.{ ExecutionContextExecutor, Future }

class RatePredictor(system: ActorSystem, sqlContext: SQLContext,
    indexModel: StringIndexerModel, cvModel: CrossValidatorModel,
    confusionMatrix: String) {

  private val decimalFormatter = new java.text.DecimalFormat("##.##")
  private val blockingDispatcher: ExecutionContextExecutor = system.dispatchers.lookup("ml.predictor.dispatcher")

  def confusionMatrixString: Future[String] = {
    Future {
      confusionMatrix
    }(blockingDispatcher)
  }

  def predict(patientInfo: PatientInfo): Future[Either[String, Double]] = {

    Future {

      val df = sqlContext.createDataFrame(Seq(patientInfo.toRecord))
      val indexedJobDF = indexModel.transform(df)

      val result = cvModel
        .transform(indexedJobDF)
        .select("prediction", "probability").map {
          case Row(prediction: Double, probability: Vector) =>
            (probability, prediction)
        }

      result.collect().headOption match {
        case Some((prob, _)) => Right(decimalFormatter.format(prob(1)).toDouble)
        case None            => Left("No result can be predicted for the patient")
      }

    }(blockingDispatcher)
  }

} 
Developer: ferhtaydn, Project: canceRater, Lines: 48, Source: RatePredictor.scala


Example 4: SimpleApp

// Set the package name and import the dependent classes
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.mllib.linalg.{Vector, Vectors}

object SimpleApp {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("Simple Application")
    val sc = new SparkContext(conf)

    val data = Array(1,2,3)
    val distData = sc.parallelize(data)
    val vectorData = distData.map(x => Vectors.dense(x))
    
    val summary = Statistics.colStats(vectorData)

    println("mean is: %s".format(summary.mean))
    println("max is: %s".format(summary.max))
    println("min is: %s".format(summary.min))


    // find the correlation between the exam columns
    // student, exam1, exam2, exam3
    val examData = sc.parallelize(Array("111, 60, 65, 73", "222, 98,95,88", "333, 56,67,62"))
    val vectorRdd = examData.map((line: String) => line.split(",").drop(1).map((ele: String) => ele.toDouble)).map(Vectors.dense)
    val corrMatrix = Statistics.corr(vectorRdd)
    println("correlation matrix: " + corrMatrix)
  }
} 
Developer: mykumar, Project: SparkScalaInternalExperiements, Lines: 30, Source: SimpleApp.scala


Example 5: GenerateScalingData

// Set the package name and import the dependent classes
package com.highperformancespark.examples.tools

import com.highperformancespark.examples.dataframe.RawPanda

import org.apache.spark._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.mllib.random.RandomRDDs
import org.apache.spark.mllib.linalg.Vector

object GenerateScalingData {
  
  def generateGoldilocks(sc: SparkContext, rows: Long, numCols: Int):
      RDD[RawPanda] = {
    val zipRDD = RandomRDDs.exponentialRDD(sc, mean = 1000,  size = rows)
      .map(_.toInt.toString)
    val valuesRDD = RandomRDDs.normalVectorRDD(
      sc, numRows = rows, numCols = numCols)
    zipRDD.zip(valuesRDD).map{case (z, v) =>
      RawPanda(1, z, "giant", v(0) > 0.5, v.toArray)
    }
  }
  // end::MAGIC_PANDA[]
} 
Developer: gourimahapatra, Project: high-performance-spark, Lines: 25, Source: GenerateScalingData.scala


Example 6: SparkSetup

// Set the package name and import the dependent classes
package com.databricks.apps

package twitterClassifier {
  import org.apache.spark.SparkContext
  import org.apache.spark.sql.SparkSession

  object SparkSetup {
    val spark = SparkSession
      .builder
      .appName(getClass.getSimpleName.replace("$", ""))
      .getOrCreate()

    val sqlContext = spark.sqlContext

    val sc: SparkContext = spark.sparkContext
    // Suppress "WARN BlockManager: Block input-0-1478266015800 replicated to only 0 peer(s) instead of 1 peers" messages
    sc.setLogLevel("ERROR")
  }
}

package object twitterClassifier {
  import org.apache.spark.mllib.linalg.Vector
  import org.apache.spark.mllib.feature.HashingTF
  import twitter4j.auth.OAuthAuthorization
  import twitter4j.conf.ConfigurationBuilder

  val numFeatures = 1000
  val tf = new HashingTF(numFeatures)

  def maybeTwitterAuth: Some[OAuthAuthorization] = Some(new OAuthAuthorization(new ConfigurationBuilder().build))

  
  def featurize(s: String): Vector = tf.transform(s.sliding(2).toSeq)
} 
Developer: krish121, Project: Spark-reference-applications, Lines: 35, Source: package.scala
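
As a quick check of featurize (a local sketch; spark-shell with this package object on the classpath is enough, since HashingTF.transform does not need a SparkContext):

import com.databricks.apps.twitterClassifier._

val v = featurize("spark streaming")
println(v.size)  // 1000, the HashingTF dimension configured above
println(v)       // a sparse vector of hashed character-bigram term frequencies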


Example 7: ZFLSH

// Set the package name and import the dependent classes
package AccurateML.lsh

import breeze.linalg.DenseMatrix
import org.apache.spark.mllib.linalg
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import AccurateML.blas.ZFBLAS


class ZFLSH(
             n: Int,
             m: Int) {
  val normal01 = breeze.stats.distributions.Gaussian(0, 1)
  val nmat = DenseMatrix.rand(m, n, normal01)

  def hashVector(vector: linalg.Vector): String = {

    val r = new Array[Int](n)
    for (i <- 0 until n) {
      val mc = nmat(::, (i))
      val ans = ZFBLAS.dot(vector, Vectors.dense(mc.toArray))
      if (ans > 0)
        r(i) = 1
    }
    r.mkString("")
  }

  // Usage sketch; note that, being defined inside the class rather than a companion
  // object, this main is not a runnable entry point as written.
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("test lsh")
    val sc = new SparkContext(conf)
    val numBits = 4
    val numFeatures = 1000
    val lsh = new ZFLSH(numBits, numFeatures)
    val data: RDD[Vector] = sc.objectFile("") //eg, the first element in data is dfirst=Vector(1.0,2.0,...,1000.0)
    val mapData:RDD[(String,Vector)]=data.map(vec=>(lsh.hashVector(vec),vec))
    //eg,the first element in mapData mdfirst=Tuple2("1010",Vector(1.0,2.0,...,100.0))
    //"1010" is the sketch of dfirst=Vector(1.0,2.0,...,1000.0)
    //the instances with the same sketch will belong to the same cluster

  }

} 
Developer: harryandlina, Project: AccurateML, Lines: 44, Source: ZFLSH.scala
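
A small local sketch of the hashing step on its own, assuming ZFLSH and ZFBLAS from this project are on the classpath; the bit count, dimensionality, and input vectors are made up:

import AccurateML.lsh.ZFLSH
import org.apache.spark.mllib.linalg.Vectors

val lsh = new ZFLSH(4, 3)       // 4 hash bits over 3-dimensional input vectors
val a = Vectors.dense(1.0, 0.0, 2.0)
val b = Vectors.dense(1.1, 0.1, 2.1)
println(lsh.hashVector(a))      // a 4-character sketch such as "1010"
println(lsh.hashVector(b))      // nearby vectors tend to receive the same sketch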


Example 8: BMRMSuite

// Set the package name and import the dependent classes
package org.apache.spark.mllib.optimization.bmrm

import org.scalatest.FunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.linalg.{Vectors, Vector}

import scala.util.Random



object BMRMSuite {
  def generateSubInput(nPoint: Int, dim: Int,seed: Int):(Array[Vector], Array[Double], Vector) = {
    val rnd = new Random(seed)
    val label = Array.fill[Double](nPoint)(rnd.nextInt(5)+1.0)
    val testData = Array.fill[Vector](nPoint)(Vectors.dense(Array.fill(dim)(rnd.nextInt(10)+1.0)))
    val initWeights = Vectors.dense(Array.fill(dim)(rnd.nextInt(10)+1.0))
    (testData, label, initWeights)
  }
}

class BMRMSuite extends FunSuite with MLlibTestSparkContext {

  test("Test the loss and gradient of first iteration") {
    val subGrad = new NdcgSubGradient()
    val (testData, label, initWeights) = BMRMSuite.generateSubInput(100, 100, 45)
    val (gradient, loss) = subGrad.compute(testData, label, initWeights)
    println(gradient)
    println(loss)
  }

  test("Test the update of the weights of first iteration") {
    val subGrad = new NdcgSubGradient()
    val (testData, label, initWeights) = BMRMSuite.generateSubInput(100, 1000, 45)
    val (gradient, loss) = subGrad.compute(testData, label, initWeights)
    val subUpdater = new DaiFletcherUpdater()
    val (newWeights, objval) = subUpdater.compute(initWeights, gradient, loss, 1.0)
    println(initWeights)
    println(loss)
    println(newWeights)
    println(objval)
  }

  test("Test the BMRM optimization") {
    val subGrad = new NdcgSubGradient()
    val subUpdater = new DaiFletcherUpdater()
    val bmrm = new BMRM(subGrad, subUpdater)
    val (testData, label, initWeights) = BMRMSuite.generateSubInput(100, 10, 45)
    println(initWeights)
    val newWeights = bmrm.optimize(testData, label, initWeights)
    println(newWeights)
  }
} 
Developer: YaozhengWang, Project: spark-cofirank, Lines: 53, Source: BMRMSuite.scala


Example 9: FeaturesParser

// Set the package name and import the dependent classes
package com.evrenio.ml

import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD



object FeaturesParser{
  def parseFeatures(rawdata: RDD[String]): RDD[Vector] = {
    val rdd: RDD[Array[Double]] = rawdata.map(_.split(",").map(_.toDouble))
    val vectors: RDD[Vector] = rdd.map(arrDouble => Vectors.dense(arrDouble))
    vectors
  }

  def parseFeaturesWithLabel(cvData: RDD[String]): RDD[LabeledPoint] = {
    val rdd: RDD[Array[Double]] = cvData.map(_.split(",").map(_.toDouble))
    val labeledPoints = rdd.map(arrDouble => new LabeledPoint(arrDouble(0), Vectors.dense(arrDouble.slice(1, arrDouble.length))))
    labeledPoints
  }
} 
Developer: weburnit, Project: anomaly, Lines: 22, Source: FeaturesParser.scala
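
A short usage sketch, assuming a live SparkContext named sc (for example in spark-shell) and the object above on the classpath; the sample CSV lines are made up:

import com.evrenio.ml.FeaturesParser

val raw = sc.parallelize(Seq("1.0,2.0,3.0", "4.0,5.0,6.0"))
FeaturesParser.parseFeatures(raw).collect().foreach(println)   // [1.0,2.0,3.0] and [4.0,5.0,6.0]

val labelled = sc.parallelize(Seq("1.0,0.5,0.7", "0.0,0.1,0.2"))
FeaturesParser.parseFeaturesWithLabel(labelled).collect().foreach(println)
// (1.0,[0.5,0.7]) -- the first column is the label, the remaining columns become the feature vector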


Example 10: SparkSVDExampleOne

// Set the package name and import the dependent classes
package linalg.svd

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.mllib.linalg.{Matrix, SingularValueDecomposition, Vector, Vectors}
object SparkSVDExampleOne {

  def main(args: Array[String]) {
    val denseData = Seq(
      Vectors.dense(0.0, 1.0, 2.0, 1.0, 5.0, 3.3, 2.1),
      Vectors.dense(3.0, 4.0, 5.0, 3.1, 4.5, 5.1, 3.3),
      Vectors.dense(6.0, 7.0, 8.0, 2.1, 6.0, 6.7, 6.8),
      Vectors.dense(9.0, 0.0, 1.0, 3.4, 4.3, 1.0, 1.0)
    )
    val spConfig = (new SparkConf).setMaster("local").setAppName("SparkSVDDemo")
    val sc = new SparkContext(spConfig)
    val mat: RowMatrix = new RowMatrix(sc.parallelize(denseData, 2))

    // Compute the top 7 singular values and corresponding singular vectors.
    val svd: SingularValueDecomposition[RowMatrix, Matrix] = mat.computeSVD(7, computeU = true)
    val U: RowMatrix = svd.U // The U factor is a RowMatrix.
    val s: Vector = svd.s // The singular values are stored in a local dense vector.
    val V: Matrix = svd.V // The V factor is a local dense matrix.
    println("U:" + U)
    println("s:" + s)
    println("V:" + V)
    sc.stop()
  }
} 
Developer: PacktPublishing, Project: Machine-Learning-with-Spark-Second-Edition, Lines: 30, Source: SparkSVDExampleOne.scala


Example 11: Util

// Set the package name and import the dependent classes
package org.sparksamples

import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.sql._
import org.apache.spark.sql.types.{StringType, StructField, StructType}


object Util {
  val PATH = "/home/ubuntu/work/spark-2.0.0-bin-hadoop2.7/"
  val DATA_PATH= "../../../data/ml-100k"
  val PATH_MOVIES = DATA_PATH + "/u.item"

  // Collapse a vector into an "x,y" string: x is the sum of the first half
  // of the elements and y is the sum of the second half.
  def reduceDimension2(x: Vector): String = {
    val l = x.size
    val l_2 = l / 2
    var x_ = 0.0
    var y_ = 0.0

    for (i <- 0 until l_2) {
      x_ += x(i)
    }
    for (i <- l_2 until l) {
      y_ += x(i)
    }
    x_ + "," + y_
  }

  def getMovieDataDF(spark : SparkSession) : DataFrame = {

    //1|Toy Story (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)
    // |0|0|0|1|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0
    val customSchema = StructType(Array(
      StructField("id", StringType, true),
      StructField("name", StringType, true),
      StructField("date", StringType, true),
      StructField("url", StringType, true)));
    val movieDf = spark.read.format("com.databricks.spark.csv")
      .option("delimiter", "|").schema(customSchema)
      .load(PATH_MOVIES)
    return movieDf
  }

} 
Developer: PacktPublishing, Project: Machine-Learning-with-Spark-Second-Edition, Lines: 46, Source: Util.scala
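
A quick look at what reduceDimension2 produces, as a sketch assuming the object above is on the classpath (for example in spark-shell); the input vector is arbitrary:

import org.apache.spark.mllib.linalg.Vectors
import org.sparksamples.Util

val v = Vectors.dense(1.0, 2.0, 3.0, 4.0)
println(Util.reduceDimension2(v))   // "3.0,7.0" -- the sums of the first and second halves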


Example 12: SyntheticFeatures

// Set the package name and import the dependent classes
package org.template.classification

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.Vector

object SyntheticFeatures {

  // Synthesize averages for feature combinations.
  // Three features become seven:
  // f1, f2, f3, (f1+f2)/2, (f2+f3)/2, (f3+f1)/2, (f1+f2+f3)/3
  // These are averages of each pair and all three.
  def transform(vector:Vector):Vector = {
    val realFeatures: Array[Double] = vector.toArray

    var synthFeatures: Array[Double] = Array()
    var sumRealFeatures:Double = 0
    var i = 0
    var v:Double = 0

    v = realFeatures.apply(0)
    sumRealFeatures = sumRealFeatures + v
    synthFeatures = synthFeatures :+ (v + realFeatures.apply(1))/2

    v = realFeatures.apply(1)
    sumRealFeatures = sumRealFeatures + v
    synthFeatures = synthFeatures :+ (v + realFeatures.apply(2))/2

    v = realFeatures.apply(2)
    sumRealFeatures = sumRealFeatures + v
    synthFeatures = synthFeatures :+ (v + realFeatures.apply(0))/2

    // average of all features
    synthFeatures = synthFeatures :+ sumRealFeatures/3

    Vectors.dense(realFeatures ++ synthFeatures)
  }

} 
Developer: wmfongsf, Project: predictionio-engine-classification, Lines: 39, Source: SyntheticFeatures.scala
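
A quick check of the feature synthesis, runnable in any REPL with spark-mllib on the classpath; the three input values are arbitrary:

import org.apache.spark.mllib.linalg.Vectors
import org.template.classification.SyntheticFeatures

val expanded = SyntheticFeatures.transform(Vectors.dense(1.0, 2.0, 3.0))
println(expanded)   // [1.0,2.0,3.0,1.5,2.5,2.0,2.0]: the originals, the three pairwise averages, and the overall average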


Example 13: Utils

// Set the package name and import the dependent classes
package mapper.utils

import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.linalg.{ Vector, DenseVector, SparseVector }
import breeze.linalg.{ DenseVector => BDV, SparseVector => BSV, Vector => BV }
import org.apache.spark.mllib.linalg.distributed.{ IndexedRowMatrix, IndexedRow, BlockMatrix }

object Utils {
  def toBlockMatrix(x: RDD[Vector], rowsPerBlock: Int = 1024, colsPerBlock: Int = 1024): BlockMatrix = {
    new IndexedRowMatrix(
      x.zipWithIndex().map({ xi => IndexedRow(xi._2, xi._1) })
    ).toBlockMatrix(rowsPerBlock, colsPerBlock)
  }

  def toBreeze(v: Vector): BV[Double] = v match {
    case DenseVector(values) => new BDV[Double](values)
    case SparseVector(size, indices, values) => new BSV[Double](indices, values, size)
  }

  def toSpark(bv: BV[Double]): Vector = bv match {
    case v: BDV[Double] => new DenseVector(v.toArray)
    case v: BSV[Double] => new SparseVector(v.length, v.index, v.data)
  }

  def cartesian[A](xs: Traversable[Traversable[A]]): Seq[Seq[A]] =
    xs.foldLeft(Seq(Seq.empty[A])) { (x, y) => for (a <- x; b <- y) yield a :+ b }

} 
Developer: log0ymxm, Project: spark-mapper, Lines: 29, Source: Utils.scala
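
A round-trip sketch for the converters above, assuming a live SparkContext named sc and this object on the classpath:

import mapper.utils.Utils
import org.apache.spark.mllib.linalg.Vectors

val dense  = Vectors.dense(1.0, 0.0, 3.0)
val sparse = Vectors.sparse(3, Array(0, 2), Array(1.0, 3.0))
assert(Utils.toSpark(Utils.toBreeze(dense)) == dense)    // the Breeze round trip preserves the values
assert(Utils.toSpark(Utils.toBreeze(sparse)) == sparse)

val block = Utils.toBlockMatrix(sc.parallelize(Seq(dense, dense)))
println(block.numRows() + " x " + block.numCols())       // 2 x 3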


Example 14: MapperSpec

// Set the package name and import the dependent classes
package com.github.log0ymxm.mapper

import org.scalatest._
import com.holdenkarau.spark.testing.SharedSparkContext
import org.apache.spark.sql.{ SparkSession, Row }
import org.apache.spark.mllib.linalg.distributed.{ CoordinateMatrix, IndexedRow, IndexedRowMatrix, MatrixEntry }
import org.apache.spark.mllib.linalg.{ DenseVector, Vector, Vectors }

class MapperSpec extends FunSuite with SharedSparkContext {

  test("simple mapper on noisy circle") {
    val spark = SparkSession.builder().getOrCreate()

    val fileLoc = getClass.getClassLoader.getResource("circles.csv").getPath()
    val circle = spark.read
      .option("header", false)
      .option("inferSchema", true)
      .csv(fileLoc)

    assert(circle.count == 400)

    val indexedRDD = circle.rdd.zipWithIndex.map {
      case (Row(x: Double, y: Double), i) =>
        val v: Vector = new DenseVector(Array(x, y))
        IndexedRow(i, v)
    }
    val matrix = new IndexedRowMatrix(indexedRDD)
    val similarities = matrix.toCoordinateMatrix
      .transpose()
      .toIndexedRowMatrix()
      .columnSimilarities()
    val distances = new CoordinateMatrix(
      similarities
        .entries
        .map((entry) => new MatrixEntry(entry.i, entry.j, 1 - entry.value))
    )

    val filtration = new IndexedRowMatrix(indexedRDD.map({ row =>
      IndexedRow(row.index, new DenseVector(Array(
        Vectors.norm(row.vector, 2)
      )))
    }))

    val graph = Mapper.mapper(sc, distances, filtration, 100, 2.0)
    //Mapper.writeAsJson(graph, "mapper-vis/circle-graph.json")

    assert(graph.vertices.count == 160)
    assert(graph.edges.count == 327)
  }
} 
Developer: log0ymxm, Project: spark-mapper, Lines: 51, Source: MapperSpec.scala


Example 15: FeaturesParser

// Set the package name and import the dependent classes
package com.micvog.ml

import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD



object FeaturesParser{
  def parseFeatures(rawdata: RDD[String]): RDD[Vector] = {
    val rdd: RDD[Array[Double]] = rawdata.map(_.split(",").map(_.toDouble))
    val vectors: RDD[Vector] = rdd.map(arrDouble => Vectors.dense(arrDouble))
    vectors
  }

  def parseFeaturesWithLabel(cvData: RDD[String]): RDD[LabeledPoint] = {
    val rdd: RDD[Array[Double]] = cvData.map(_.split(",").map(_.toDouble))
    val labeledPoints = rdd.map(arrDouble => new LabeledPoint(arrDouble(0), Vectors.dense(arrDouble.slice(1, arrDouble.length))))
    labeledPoints
  }
} 
Developer: mvogiatzis, Project: spark-anomaly-detection, Lines: 22, Source: FeaturesParser.scala


Example 16: Predictor

// Set the package name and import the dependent classes
package core

import kafka.utils.Json
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.tree.model.RandomForestModel
import org.apache.spark.streaming.StreamingContext

import scala.collection.parallel.mutable.ParSeq

object Predictor {


  var models: ParSeq[(String, RandomForestModel)] = null

  def setUpModels(ssc: StreamingContext, models: ParSeq[(String, RandomForestModel)]) = {
    this.models=models
  }

  def getPredictions(v: Vector) = {
    models.map(model => {
      val pred = model._2.predict(v)
      (
        model._1,
        pred,
        Json.encode(Map("modelName"->model._1,"prediction"->pred)),
        Json.encode(
        Map(
          "modelName" -> model._1,
          "numTrees" -> model._2.numTrees,
          "totalNodes" -> model._2.totalNumNodes,
          "prediction" -> pred,
          "trees" -> model._2.trees.par.map(tree =>
            Map("nodes" -> tree.numNodes, "prediction" -> tree.predict(v))).toArray
        )
        )
        )
    }
    )
  }
} 
Developer: jandion, Project: SparkOFP, Lines: 41, Source: Predictor.scala
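
A minimal wiring sketch for Predictor, assuming a live SparkContext sc and StreamingContext ssc, the Kafka client used by Json.encode on the classpath, and made-up training data, parameters, and model name:

import core.Predictor
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.RandomForest
import scala.collection.parallel.mutable.ParSeq

val training = sc.parallelize(Seq(
  LabeledPoint(0.0, Vectors.dense(0.0, 1.0)),
  LabeledPoint(0.0, Vectors.dense(0.1, 0.9)),
  LabeledPoint(0.0, Vectors.dense(0.2, 0.8)),
  LabeledPoint(1.0, Vectors.dense(1.0, 0.0)),
  LabeledPoint(1.0, Vectors.dense(0.9, 0.1)),
  LabeledPoint(1.0, Vectors.dense(0.8, 0.2))
))
val model = RandomForest.trainClassifier(training, numClasses = 2,
  categoricalFeaturesInfo = Map[Int, Int](), numTrees = 3,
  featureSubsetStrategy = "auto", impurity = "gini", maxDepth = 4, maxBins = 32, seed = 42)

Predictor.setUpModels(ssc, ParSeq(("demoModel", model)))
Predictor.getPredictions(Vectors.dense(0.5, 0.5)).foreach(println)  // (name, prediction, json, detailed json)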


Example 17: SparkReader

// Set the package name and import the dependent classes
package org.altic.spark.clustering.utils

import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.{Vectors, Vector}
import org.apache.spark.rdd.RDD


object SparkReader {
  def parse(sc: SparkContext, filePath: String, splitRegex: String): RDD[NamedVector] = {
    sc.textFile(filePath).map { line =>
      val arrayDouble = line.split(splitRegex).map(_.toDouble)
      new NamedVector(arrayDouble.dropRight(1), arrayDouble.last.toInt)
    }
  }

  def parseForKMeans(sc: SparkContext, filePath: String): RDD[Vector] = {
    val data = sc.textFile(filePath)
    data.map {
      line =>
        val arrayDouble = line.split(' ').map(_.toDouble)
        Vectors.dense(arrayDouble.dropRight(1))
    }
  }

  private def parseSIFTLine(line: String): LabelVector = {

    val parts = line.split(',')
    print("\n parts is: " + parts.mkString(","))
    val feature = parts(parts.length - 1).trim
    val filepath = parts(0)//parts(0).split("/")
    print("\n feature is: "+feature)

    val features = feature.split(' ').map(_.trim.toDouble)
    val label = filepath//filepath(filepath.length - 2)
    print("\n label is: "+label)
    new LabelVector(features, label)
  }

  private def parseClass(line: String): (String, Array[Double]) = {
    val parts = line.split(',')
    val feature = parts(parts.length - 1).trim
    val filepath = parts(0).split("/")
    // print(feature)

    val features = feature.split(' ').map(_.trim.toDouble)
    val label = filepath(filepath.length - 2)
    (label, features)
  }


  def parseSIFTClass(sc: SparkContext, filePath: String): RDD[(String, Array[Double])] = {
    val data = sc.textFile(filePath).filter(_.length > 1)
    data.map(parseClass)
  }


  def parseSIFT(sc: SparkContext, filePath: String): RDD[LabelVector] = {
    val data = sc.textFile(filePath).filter(_.length > 1)
    data.map(parseSIFTLine)
  }
} 
Developer: GuruTeja, Project: iHear_SignatureGeneration, Lines: 62, Source: SparkReader.scala
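
A short usage sketch for parseForKMeans, as an assumption-heavy illustration: it presumes a live SparkContext sc, the full project (including the NamedVector and LabelVector classes referenced above) on the classpath, and a hypothetical space-separated input file whose last column is a numeric class label:

import org.altic.spark.clustering.utils.SparkReader

val points = SparkReader.parseForKMeans(sc, "/tmp/train.data")  // hypothetical path
points.take(3).foreach(println)  // dense feature vectors with the trailing label column dropped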



Note: the org.apache.spark.mllib.linalg.Vector class examples in this article were collected from GitHub, MSDocs, and other source-code and documentation platforms. The code snippets were selected from open-source projects contributed by various developers; copyright remains with the original authors, and distribution or use should follow each project's license. Do not reproduce without permission.

