
Scala Broadcast Class Code Examples


This article collects typical usage examples of the Scala class org.apache.spark.broadcast.Broadcast. If you are wondering what the Broadcast class does, how to use it, or what real-world usage looks like, the curated examples below should help.



The following presents 20 code examples of the Broadcast class, drawn from open-source projects and ordered by popularity.
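Before the collected examples, here is a minimal sketch of the pattern they all share: create a read-only value once on the driver, broadcast it, and read it on executors through .value inside a closure. The SparkSession setup and the lookup table below are illustrative assumptions, not taken from any of the examples.

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.sql.SparkSession

object BroadcastSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.appName("broadcast-sketch").master("local[*]").getOrCreate()
    val sc = spark.sparkContext

    // Ship a small lookup table to every executor once, instead of
    // serializing it into every task closure.
    val countryNames: Broadcast[Map[String, String]] =
      sc.broadcast(Map("ES" -> "Spain", "FR" -> "France"))

    val resolved = sc.parallelize(Seq("ES", "FR", "ES"))
      .map(code => countryNames.value.getOrElse(code, "unknown")) // executors read .value
      .collect()

    println(resolved.mkString(", "))
    countryNames.destroy() // release the broadcast state once it is no longer needed
    spark.stop()
  }
}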

Example 1: GladLogLikelihoodAggregator

// Package declaration and imported dependencies
package com.enriquegrodrigo.spark.crowd.aggregators

import com.enriquegrodrigo.spark.crowd.types.GladPartial
import com.enriquegrodrigo.spark.crowd.types.GladParams
import com.enriquegrodrigo.spark.crowd.utils.Functions
import org.apache.spark.sql.{Encoder,Encoders}
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.broadcast.Broadcast

import scala.math.log

private[crowd] class GladLogLikelihoodAggregator(params: Broadcast[GladParams]) 
  extends Aggregator[GladPartial, GladLogLikelihoodAggregatorBuffer, Double]{

  def zero: GladLogLikelihoodAggregatorBuffer = GladLogLikelihoodAggregatorBuffer(0,-1)

  def reduce(b: GladLogLikelihoodAggregatorBuffer, a: GladPartial) : GladLogLikelihoodAggregatorBuffer = {
    val alphaVal = params.value.alpha(a.annotator.toInt)
    val betaVal = a.beta
    val sig = Functions.sigmoid(alphaVal*betaVal) 
    val p0 = 1-a.est
    val p1 = a.est
    val k0 = if (a.value == 0) sig else 1-sig 
    val k1 = if (a.value == 1) sig else 1-sig 
    GladLogLikelihoodAggregatorBuffer(b.agg + Functions.prodlog(p0,k0) 
                                          + Functions.prodlog(p1,k1), p1) 
  }

  def merge(b1: GladLogLikelihoodAggregatorBuffer, b2: GladLogLikelihoodAggregatorBuffer) : GladLogLikelihoodAggregatorBuffer = { 
    GladLogLikelihoodAggregatorBuffer(b1.agg + b2.agg, if (b1.classProb == -1) b2.classProb else b1.classProb)
  }

  def finish(reduction: GladLogLikelihoodAggregatorBuffer) = {
    val w = params.value.w
    reduction.agg + Functions.prodlog(1 - reduction.classProb, w(0)) +
                    Functions.prodlog(reduction.classProb, w(1))
  }


  def bufferEncoder: Encoder[GladLogLikelihoodAggregatorBuffer] = Encoders.product[GladLogLikelihoodAggregatorBuffer]

  def outputEncoder: Encoder[Double] = Encoders.scalaDouble
}

private[crowd] case class GladLogLikelihoodAggregatorBuffer(agg: Double, classProb: Double) 
Author: enriquegrodrigo | Project: spark-crowd | Lines: 49 | Source: GladLogLikelihoodAggregator.scala
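The aggregator above is driven by a Broadcast[GladParams] that it reads inside reduce and finish. As a hedged, self-contained illustration of that pattern (with made-up types, not spark-crowd's), the sketch below closes an Aggregator over a broadcast array of weights and applies it to a typed Dataset via toColumn:

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
import org.apache.spark.sql.expressions.Aggregator

case class Obs(weightIdx: Int, x: Double)

// Minimal sketch: a weighted sum whose weights arrive via a Broadcast.
class WeightedSum(weights: Broadcast[Array[Double]]) extends Aggregator[Obs, Double, Double] {
  def zero: Double = 0.0
  def reduce(acc: Double, o: Obs): Double = acc + weights.value(o.weightIdx) * o.x // read broadcast on executors
  def merge(a: Double, b: Double): Double = a + b
  def finish(acc: Double): Double = acc
  def bufferEncoder: Encoder[Double] = Encoders.scalaDouble
  def outputEncoder: Encoder[Double] = Encoders.scalaDouble
}

object WeightedSumApp {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.master("local[*]").getOrCreate()
    import spark.implicits._
    val w = spark.sparkContext.broadcast(Array(0.5, 2.0))
    val total = Seq(Obs(0, 4.0), Obs(1, 3.0)).toDS()
      .select(new WeightedSum(w).toColumn)
      .head() // 0.5 * 4.0 + 2.0 * 3.0 = 8.0
    println(total)
    spark.stop()
  }
}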


Example 2: PDF

// Package declaration and imported dependencies
package org.hammerlab.coverage.two_sample

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.hammerlab.coverage
import org.hammerlab.coverage.histogram.JointHistogram.Depth
import spire.algebra.Monoid

case class PDF[C: Monoid](rdd: RDD[((Depth, Depth), C)],
                          filtersBroadcast: Broadcast[(Set[Depth], Set[Depth])],
                          maxDepth1: Depth,
                          maxDepth2: Depth)
  extends coverage.PDF[C]
    with CanDownSampleRDD[C]

case class CDF[C: Monoid](rdd: RDD[((Depth, Depth), C)],
                          filtersBroadcast: Broadcast[(Set[Depth], Set[Depth])])
  extends coverage.CDF[C]
    with CanDownSampleRDD[C] 
Author: hammerlab | Project: coverage-depth | Lines: 20 | Source: PDF.scala


Example 3: rdd

// Package declaration and imported dependencies
package org.hammerlab.coverage.two_sample

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.hammerlab.coverage.histogram.JointHistogram.Depth

trait CanDownSampleRDD[V] {
  def rdd: RDD[((Depth, Depth), V)]
  def filtersBroadcast: Broadcast[(Set[Depth], Set[Depth])]
  @transient lazy val filtered = filterDistribution(filtersBroadcast)

  private def filterDistribution(filtersBroadcast: Broadcast[(Set[Int], Set[Int])]): Array[((Depth, Depth), V)] =
    (for {
      ((d1, d2), value) <- rdd
      (d1Filter, d2Filter) = filtersBroadcast.value
      if d1Filter(d1) && d2Filter(d2)
    } yield
      (d1, d2) -> value
    )
    .collect
    .sortBy(_._1)
} 
Author: hammerlab | Project: coverage-depth | Lines: 23 | Source: CanDownSampleRDD.scala


Example 4: Configuration

// Package declaration and imported dependencies
package org.hammerlab.hadoop

import java.io.{ ObjectInputStream, ObjectOutputStream }

import com.esotericsoftware.kryo.Kryo
import org.apache.hadoop.conf
import org.apache.hadoop.conf.{ Configuration => HadoopConfiguration }
import org.apache.spark.SparkContext
import org.apache.spark.broadcast.Broadcast
import org.hammerlab.hadoop.kryo.WritableSerializer
import org.hammerlab.kryo.serializeAs

class Configuration(@transient var value: HadoopConfiguration)
  extends Serializable {
  private def writeObject(out: ObjectOutputStream): Unit = {
    value.write(out)
  }

  private def readObject(in: ObjectInputStream): Unit = {
    value = new HadoopConfiguration(false)
    value.readFields(in)
  }
}

object Configuration {

  def apply(loadDefaults: Boolean = true): Configuration =
    new HadoopConfiguration(loadDefaults)

  def apply(conf: HadoopConfiguration): Configuration =
    new Configuration(conf)

  implicit def wrapConfiguration(conf: HadoopConfiguration): Configuration =
    apply(conf)

  implicit def unwrapConfiguration(conf: Configuration): HadoopConfiguration =
    conf.value

  implicit def unwrapConfigurationBroadcast(confBroadcast: Broadcast[Configuration]): Configuration =
    confBroadcast.value

  implicit def sparkContextToHadoopConfiguration(sc: SparkContext): Configuration =
    sc.hadoopConfiguration

  implicit class ConfWrapper(val conf: HadoopConfiguration) extends AnyVal {
    def serializable: Configuration =
      Configuration(conf)
  }

  def register(kryo: Kryo): Unit = {
    kryo.register(
      classOf[conf.Configuration],
      new WritableSerializer[conf.Configuration]
    )

    kryo.register(
      classOf[Configuration],
      serializeAs[Configuration, conf.Configuration]
    )
  }
} 
Author: hammerlab | Project: spark-util | Lines: 62 | Source: Configuration.scala
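A quick sketch of why this wrapper exists: a raw Hadoop Configuration is Writable but not Serializable, so it cannot be broadcast directly; the wrapper's custom writeObject/readObject make the broadcast legal, and the implicits unwrap it transparently. The job body below is invented for illustration and assumes an existing SparkContext sc:

import org.hammerlab.hadoop.Configuration
import org.hammerlab.hadoop.Configuration._

// ConfWrapper supplies .serializable on the plain Hadoop Configuration
val confBroadcast = sc.broadcast(sc.hadoopConfiguration.serializable)

sc.parallelize(1 to 4).map { i =>
  // unwrapConfigurationBroadcast turns the Broadcast back into a Configuration,
  // and .value yields the plain Hadoop Configuration on the executor
  val conf: Configuration = confBroadcast
  conf.value.get("mapreduce.input.fileinputformat.split.maxsize", "unset")
}.collect()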


Example 5: ShortestPathLengthsFromCSV

// Package declaration and imported dependencies
package ml.sparkling.graph.examples

import ml.sparkling.graph.api.operators.algorithms.shortestpaths.ShortestPathsTypes
import ml.sparkling.graph.api.operators.algorithms.shortestpaths.ShortestPathsTypes._
import ml.sparkling.graph.operators.algorithms.shortestpaths.ShortestPathsAlgorithm
import ml.sparkling.graph.operators.algorithms.shortestpaths.pathprocessors.fastutils.FastUtilWithDistance.DataMap
import ml.sparkling.graph.operators.predicates.AllPathPredicate
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.graphx.{Graph, VertexId}

import scala.collection.JavaConversions._

object ShortestPathLengthsFromCSV extends ExampleApp {
  def body() = {
    val shortestPaths = if (bucketSize == -1L)
      ShortestPathsAlgorithm.computeShortestPathsLengths(partitionedGraph, AllPathPredicate, treatAsUndirected)
    else
      ShortestPathsAlgorithm.computeShortestPathsLengthsIterative(partitionedGraph, (g: Graph[_, _]) => bucketSize, treatAsUndirected)
    val size: Broadcast[VertexId] = ctx.broadcast(partitionedGraph.numVertices)
    partitionedGraph.outerJoinVertices(shortestPaths.vertices)(Util.dataTransformFunction(size) _).vertices.values.saveAsTextFile(out)
    ctx.stop()
  }
}


private object Util{
  def dataTransformFunction(size: Broadcast[VertexId])(vId: VertexId,oldValue: String,pathsOption: Option[_ >: DataMap <: JMap[JLong, JDouble]])={
    pathsOption.flatMap((paths)=>{
      var entries=paths.entrySet().toList.sortBy(_.getKey)
      val out=new StringBuilder()
      out++=s"${oldValue},"
      var a = 0L
      while (a < size.value) {
        if (entries.nonEmpty && a == entries.head.getKey) {
          out ++= s"${entries.head.getValue},"
          entries = entries.drop(1)
        }
        else {
          out ++= "0,"
        }
        a += 1L
      }
      out.setLength(out.length - 1)
      Option(out.toString())
    }).getOrElse(oldValue)
  }
} 
Author: sparkling-graph | Project: sparkling-graph | Lines: 48 | Source: ShortestPathLengthsFromCSV.scala


Example 6: TFIDF

// Package declaration and imported dependencies
package io.elegans.exercises

import org.apache.spark.broadcast.Broadcast

import scala.math.log

class TFIDF {

  def getTfIDFAnnotatedVector(annotated_sentence: Map[String, Long],
                              dictionary: => Broadcast[Map[String, (Long, Long)]], num_of_documents: Long) :
  Map[String, Tuple4[Long, Long, Long, Double]] = {
    val freq_annotations = annotated_sentence
    val max_term_raw_freq: Long = freq_annotations.values.foldLeft(0L)((m, v) => math.max(m, v)) // max raw frequency over the sentence's terms
    val tfidf_annotated_terms: Map[String, Tuple4[Long, Long, Long, Double]] = // map on terms of the sentence
      freq_annotations.map(term => { // maps on term annotations
        val term_string : String = term._1 // the term string
        val term_raw_freq : Long = term._2 // term raw frequency in the sentence
        val dictionary_term: Tuple2[Long, Long] = dictionary.value.getOrElse(term_string, (0L, 0L)) // (unique_id, num_docs_with_terms) from the broadcast dictionary
        val term_id : Long = dictionary_term._1 // the term's unique id
        val num_docs_with_terms : Long = dictionary_term._2 // documents containing the term
        // One common TF-IDF variant (an assumption; the original left this as an exercise):
        // normalized term frequency times smoothed inverse document frequency
        val term_tfidf : Double =
          (term_raw_freq.toDouble / math.max(max_term_raw_freq, 1L)) *
            log(num_of_documents.toDouble / (1.0 + num_docs_with_terms))
        val dictionary_term_with_tfidf = (term_string, (term_raw_freq, term_id, num_docs_with_terms, term_tfidf))
        dictionary_term_with_tfidf
      })
    tfidf_annotated_terms
  }
}
Author: elegans-io | Project: spark-sbt-skel | Lines: 27 | Source: TFIDF.scala
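A hedged usage sketch for the exercise above (names and data invented): the dictionary of term statistics is built once on the driver, broadcast, and handed to getTfIDFAnnotatedVector per sentence.

// Hypothetical driver code, assuming an existing SparkContext sc:
val dictionary = sc.broadcast(Map(
  "spark"     -> (0L, 120L), // term -> (unique_id, num_docs_with_terms)
  "broadcast" -> (1L, 35L)
))
val sentence = Map("spark" -> 3L, "broadcast" -> 1L) // term -> raw frequency in the sentence
val vector = new TFIDF().getTfIDFAnnotatedVector(sentence, dictionary, num_of_documents = 1000L)
vector.foreach { case (term, (freq, id, docs, tfidf)) => println(s"$term: tfidf=$tfidf") }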


Example 7: SKGGenerator

// Package declaration and imported dependencies
package kr.acon.generator.skg

import scala.Iterator

import org.apache.spark.SparkContext
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD

import it.unimi.dsi.fastutil.longs.LongOpenHashBigSet
import kr.acon.generator.BaseGenerator
import kr.acon.util.Util.RangePartitionFromDegreeRDD

object SKGGenerator extends BaseGenerator {
  override val appName = "TrillionG: A Trillion-scale Synthetic Graph Generator using a Recursive Vector Model"

  implicit class RecVecGenClass(self: RDD[Long]) extends Serializable {
    def doRecVecGen(bskg: Broadcast[_ <: SKG], rng: Long) = {
      self.mapPartitions {
        case partitions =>
          val skg = bskg.value
          partitions.flatMap {
            case u =>
              val random = new SKG.randomClass(rng + u)
              val degree = skg.getDegree(u, random)
              if (degree < 1)
                Iterator.empty
              else {
                val recVec = skg.getRecVec(u)
                val sigmas = skg.getSigmas(recVec)
                val p = recVec.last
                val adjacency = new LongOpenHashBigSet(degree)
                var i = 0
                while (i < degree) {
                  adjacency.add(skg.determineEdge(u, recVec, sigmas, random))
                  i += 1
                }
                Iterator((u, adjacency))
              }
          }
      }
    }
  }

  override def run(sc: SparkContext): RDD[(Long, LongOpenHashBigSet)] = {
    val verticesRDD = largeVertices(sc)
    val bskg = sc.broadcast(SKG.constructFrom(parser))
    val degreeRDD = verticesRDD.map(vid => (vid, bskg.value.getExpectedDegree(vid)))
    val partitionedVertices = degreeRDD.rangePartition(parser.machine, parser.n, parser.e)
    val edges = partitionedVertices.doRecVecGen(bskg, parser.rng)
    edges
  }
} 
Author: chan150 | Project: TrillionG | Lines: 53 | Source: SKGGenerator.scala


Example 8: TransformContext

// Package declaration and imported dependencies
package com.microsoft.partnercatalyst.fortis.spark.transformcontext

import com.microsoft.partnercatalyst.fortis.spark.dto.{BlacklistedTerm, SiteSettings}
import com.microsoft.partnercatalyst.fortis.spark.transforms.image.ImageAnalyzer
import com.microsoft.partnercatalyst.fortis.spark.transforms.language.LanguageDetector
import com.microsoft.partnercatalyst.fortis.spark.transforms.locations.LocationsExtractorFactory
import com.microsoft.partnercatalyst.fortis.spark.transforms.sentiment.SentimentDetectorAuth
import com.microsoft.partnercatalyst.fortis.spark.transforms.topic.KeywordExtractor
import org.apache.spark.broadcast.Broadcast

case class TransformContext(
  siteSettings: SiteSettings = null,
  langToKeywordExtractor: Broadcast[Map[String, KeywordExtractor]] = null,
  blacklist: Broadcast[Seq[BlacklistedTerm]] = null,
  locationsExtractorFactory: Broadcast[LocationsExtractorFactory] = null,

  // The following objects have small serialized forms, so we don't bother to broadcast them
  // (instead, they're serialized into each task that uses them).
  imageAnalyzer: ImageAnalyzer = null,
  languageDetector: LanguageDetector = null,
  sentimentDetectorAuth: SentimentDetectorAuth = null
) 
Author: CatalystCode | Project: project-fortis-spark | Lines: 23 | Source: TransformContext.scala


Example 9: DawidSkenePartialModel

// Package declaration and imported dependencies
package com.enriquegrodrigo.spark.crowd.types

import org.apache.spark.sql.Dataset
import org.apache.spark.broadcast.Broadcast


private[crowd] case class DawidSkenePartialModel(dataset: Dataset[DawidSkenePartial], params: Broadcast[DawidSkeneParams], 
                                  logLikelihood: Double, improvement: Double, nClasses: Int, 
                                  nAnnotators: Long) {

  def modify(nDataset: Dataset[DawidSkenePartial] =dataset, 
      nParams: Broadcast[DawidSkeneParams] =params, 
      nLogLikelihood: Double =logLikelihood, 
      nImprovement: Double =improvement, 
      nNClasses: Int =nClasses, 
      nNAnnotators: Long =nAnnotators) = 
        new DawidSkenePartialModel(nDataset, nParams, 
                                                nLogLikelihood, 
                                                nImprovement, 
                                                nNClasses, 
                                                nNAnnotators)
} 
Author: enriquegrodrigo | Project: spark-crowd | Lines: 23 | Source: DawidSkenePartialModel.scala


Example 10: GladPartialModel

// Package declaration and imported dependencies
package com.enriquegrodrigo.spark.crowd.types

import org.apache.spark.sql.Dataset
import org.apache.spark.broadcast.Broadcast


private[crowd] case class GladPartialModel(dataset: Dataset[GladPartial], params: Broadcast[GladParams], 
                                  logLikelihood: Double, improvement: Double, 
                                  nAnnotators: Int) {

  def modify(nDataset: Dataset[GladPartial] =dataset, 
      nParams: Broadcast[GladParams] =params, 
      nLogLikelihood: Double =logLikelihood, 
      nImprovement: Double =improvement, 
      nNAnnotators: Int =nAnnotators) = 
        new GladPartialModel(nDataset, nParams, 
                                                nLogLikelihood, 
                                                nImprovement, 
                                                nNAnnotators)
} 
Author: enriquegrodrigo | Project: spark-crowd | Lines: 21 | Source: GladPartialModel.scala


Example 11: RaykarBinaryPartialModel

// Package declaration and imported dependencies
package com.enriquegrodrigo.spark.crowd.types

import org.apache.spark.sql.Dataset
import org.apache.spark.sql.DataFrame
import org.apache.spark.broadcast.Broadcast



private[crowd] case class RaykarBinaryPartialModel(dataset: DataFrame, annotatorData: Dataset[BinaryAnnotation], 
                                    mu: Dataset[BinarySoftLabel], dataStatistics: Dataset[RaykarBinaryStatistics],
                                    params: Broadcast[RaykarBinaryParams], logLikelihood: Double, 
                                    improvement: Double, nAnnotators: Int, nFeatures: Int) {

  def modify(nDataset: DataFrame =dataset, 
      nAnnotatorData: Dataset[BinaryAnnotation] =annotatorData, 
      nMu: Dataset[BinarySoftLabel] =mu, 
      nDataStatistics: Dataset[RaykarBinaryStatistics] = dataStatistics, 
      nParams: Broadcast[RaykarBinaryParams] =params, 
      nLogLikelihood: Double =logLikelihood, 
      nImprovement: Double =improvement, 
      nNAnnotators: Int =nAnnotators, 
      nNFeatures: Int =nFeatures) = 
        new RaykarBinaryPartialModel(nDataset, nAnnotatorData, nMu, nDataStatistics, 
          nParams, nLogLikelihood, nImprovement, nNAnnotators, nNFeatures)
} 
Author: enriquegrodrigo | Project: spark-crowd | Lines: 26 | Source: RaykarBinaryPartialModel.scala


Example 12: KajinoPartialModel

// Package declaration and imported dependencies
package com.enriquegrodrigo.spark.crowd.types

import org.apache.spark.sql.Dataset
import org.apache.spark.sql.DataFrame
import org.apache.spark.broadcast.Broadcast


private[crowd] case class KajinoPartialModel(dataset: DataFrame, annotatorData: Dataset[BinaryAnnotation], 
                                    w0: Broadcast[Array[Double]], w: Array[Array[Double]], priors: Broadcast[KajinoPriors], 
                                    variation: Double, nAnnotators: Int, nFeatures: Int) {

  def modify(nDataset: DataFrame =dataset, 
      nAnnotatorData: Dataset[BinaryAnnotation] =annotatorData, 
      nW0: Broadcast[Array[Double]] =w0, 
      nW: Array[Array[Double]] = w, 
      nPriors: Broadcast[KajinoPriors] =priors, 
      nVariation: Double =variation, 
      nNAnnotators: Int =nAnnotators, 
      nNFeatures: Int =nFeatures) = 
      new KajinoPartialModel(nDataset, nAnnotatorData, nW0, nW, nPriors,
        nVariation, nNAnnotators, nNFeatures)
} 
Author: enriquegrodrigo | Project: spark-crowd | Lines: 23 | Source: KajinoPartialModel.scala


Example 13: GladAlphaAggregator

// Package declaration and imported dependencies
package com.enriquegrodrigo.spark.crowd.aggregators

import com.enriquegrodrigo.spark.crowd.types.GladPartial
import com.enriquegrodrigo.spark.crowd.types.GladParams
import com.enriquegrodrigo.spark.crowd.utils.Functions
import org.apache.spark.sql.{Encoder,Encoders}
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.broadcast.Broadcast

private[crowd] class GladAlphaAggregator(params: Broadcast[GladParams], learningRate: Double) 
  extends Aggregator[GladPartial, GladAlphaAggregatorBuffer, Double]{

  def zero: GladAlphaAggregatorBuffer = GladAlphaAggregatorBuffer(0,-1) 
  
  def reduce(b: GladAlphaAggregatorBuffer, a: GladPartial) : GladAlphaAggregatorBuffer = {
    val al = params.value.alpha(a.annotator)
    val sigmoidValue = Functions.sigmoid(al * a.beta)
    val p = if (a.value == 1) a.est else 1 - a.est
    val term = (p - sigmoidValue) * a.beta // gradient contribution of this annotation to alpha(annotator)
    GladAlphaAggregatorBuffer(b.agg + term, al)
  }

  def merge(b1: GladAlphaAggregatorBuffer, b2: GladAlphaAggregatorBuffer) : GladAlphaAggregatorBuffer = { 
    GladAlphaAggregatorBuffer(b1.agg + b2.agg, if (b1.alpha == -1) b2.alpha else b1.alpha )
  }

  def finish(reduction: GladAlphaAggregatorBuffer) = {
    reduction.alpha + learningRate * reduction.agg
  }

  def bufferEncoder: Encoder[GladAlphaAggregatorBuffer] = Encoders.product[GladAlphaAggregatorBuffer]

  def outputEncoder: Encoder[Double] = Encoders.scalaDouble
}

private[crowd] case class GladAlphaAggregatorBuffer(agg: Double, alpha: Double) 
Author: enriquegrodrigo | Project: spark-crowd | Lines: 40 | Source: GladAlphaAggregator.scala


Example 14: GladBetaAggregator

// Package declaration and imported dependencies
package com.enriquegrodrigo.spark.crowd.aggregators

import com.enriquegrodrigo.spark.crowd.types.GladPartial
import com.enriquegrodrigo.spark.crowd.types.GladParams
import com.enriquegrodrigo.spark.crowd.utils.Functions
import org.apache.spark.sql.{Encoder,Encoders}
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.broadcast.Broadcast

private[crowd] class GladBetaAggregator(params: Broadcast[GladParams], learningRate: Double) 
  extends Aggregator[GladPartial, GladBetaAggregatorBuffer, Double]{

  def zero: GladBetaAggregatorBuffer = GladBetaAggregatorBuffer(0,-1) 
  
  def reduce(b: GladBetaAggregatorBuffer, a: GladPartial) : GladBetaAggregatorBuffer = {
    val al = params.value.alpha(a.annotator)
    val sigmoidValue = Functions.sigmoid(al * a.beta)
    val p = if (a.value == 1) a.est else 1 - a.est
    val term = (p - sigmoidValue) * al // gradient contribution of this annotation to beta(example)
    GladBetaAggregatorBuffer(b.agg + term, a.beta)
  }

  def merge(b1: GladBetaAggregatorBuffer, b2: GladBetaAggregatorBuffer) : GladBetaAggregatorBuffer = { 
    GladBetaAggregatorBuffer(b1.agg + b2.agg, if (b1.beta == -1) b2.beta else b1.beta)
  }

  def finish(reduction: GladBetaAggregatorBuffer) = {
    reduction.beta + learningRate * reduction.agg
  }

  def bufferEncoder: Encoder[GladBetaAggregatorBuffer] = Encoders.product[GladBetaAggregatorBuffer]

  def outputEncoder: Encoder[Double] = Encoders.scalaDouble
}

private[crowd] case class GladBetaAggregatorBuffer(agg: Double, beta: Double) 
Author: enriquegrodrigo | Project: spark-crowd | Lines: 40 | Source: GladBetaAggregator.scala


Example 15: RaykarBinaryStatisticsAggregator

// Package declaration and imported dependencies
package com.enriquegrodrigo.spark.crowd.aggregators

import com.enriquegrodrigo.spark.crowd.types.RaykarBinaryPartial
import com.enriquegrodrigo.spark.crowd.types.RaykarBinaryParams
import com.enriquegrodrigo.spark.crowd.utils.Functions
import org.apache.spark.sql.{Encoder,Encoders}
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.broadcast.Broadcast

private[crowd] class RaykarBinaryStatisticsAggregator(params: Broadcast[RaykarBinaryParams]) 
  extends Aggregator[RaykarBinaryPartial, RaykarBinaryStatisticsAggregatorBuffer, (Double,Double)]{

  def zero: RaykarBinaryStatisticsAggregatorBuffer = RaykarBinaryStatisticsAggregatorBuffer(1,1) //Binary
  
  def reduce(b: RaykarBinaryStatisticsAggregatorBuffer, a: RaykarBinaryPartial) : RaykarBinaryStatisticsAggregatorBuffer = {
    val alphaValue = params.value.alpha(a.annotator)
    val alphaTerm = if (a.value == 1) alphaValue else 1-alphaValue
    val betaValue = params.value.beta(a.annotator)
    val betaTerm = if (a.value == 0) betaValue else 1-betaValue 
    RaykarBinaryStatisticsAggregatorBuffer(b.a * alphaTerm, b.b * betaTerm)
  }

  def merge(b1: RaykarBinaryStatisticsAggregatorBuffer, b2: RaykarBinaryStatisticsAggregatorBuffer) : RaykarBinaryStatisticsAggregatorBuffer = { 
    RaykarBinaryStatisticsAggregatorBuffer(b1.a * b2.a, b1.b*b2.b)
  }

  def finish(reduction: RaykarBinaryStatisticsAggregatorBuffer) = {
    (reduction.a,reduction.b)
  }

  def bufferEncoder: Encoder[RaykarBinaryStatisticsAggregatorBuffer] = Encoders.product[RaykarBinaryStatisticsAggregatorBuffer]

  def outputEncoder: Encoder[(Double,Double)] = Encoders.product[(Double,Double)]
}

private[crowd] case class RaykarBinaryStatisticsAggregatorBuffer(a: Double, b: Double) 
Author: enriquegrodrigo | Project: spark-crowd | Lines: 37 | Source: RaykarBinaryStatisticsAggregator.scala


Example 16: DawidSkeneLogLikelihoodAggregator

// Package declaration and imported dependencies
package com.enriquegrodrigo.spark.crowd.aggregators

import com.enriquegrodrigo.spark.crowd.types.DawidSkenePartial
import com.enriquegrodrigo.spark.crowd.types.DawidSkeneParams
import org.apache.spark.sql.{Encoder,Encoders}
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.broadcast.Broadcast

import scala.math.log

private[crowd] class DawidSkeneLogLikelihoodAggregator(params: Broadcast[DawidSkeneParams]) 
  extends Aggregator[DawidSkenePartial, DawidSkeneLogLikelihoodAggregatorBuffer, Double]{

  private def sumKey(map: Map[Int,Double], pair: (Int,Double)) = {
      val key = pair._1
      val value = pair._2
      val new_value = map.get(key) match {
        case Some(x) => x + value
        case None => value 
      }
      map.updated(key, new_value)
  }

  def zero: DawidSkeneLogLikelihoodAggregatorBuffer = DawidSkeneLogLikelihoodAggregatorBuffer(0, -1)

  def reduce(b: DawidSkeneLogLikelihoodAggregatorBuffer, a: DawidSkenePartial) : DawidSkeneLogLikelihoodAggregatorBuffer = {
    val pival = params.value.pi(a.annotator.toInt)(a.est)(a.value)
    DawidSkeneLogLikelihoodAggregatorBuffer(b.agg + log(pival), a.est) 
  }

  def merge(b1: DawidSkeneLogLikelihoodAggregatorBuffer, b2: DawidSkeneLogLikelihoodAggregatorBuffer) : DawidSkeneLogLikelihoodAggregatorBuffer = { 
    DawidSkeneLogLikelihoodAggregatorBuffer(b1.agg + b2.agg, if (b1.predClass == -1) b2.predClass else b1.predClass) 
  }

  def finish(reduction: DawidSkeneLogLikelihoodAggregatorBuffer) =  {
    reduction.agg + log(params.value.w(reduction.predClass))
  }


  def bufferEncoder: Encoder[DawidSkeneLogLikelihoodAggregatorBuffer] = Encoders.product[DawidSkeneLogLikelihoodAggregatorBuffer]

  def outputEncoder: Encoder[Double] = Encoders.scalaDouble
}

private[crowd] case class DawidSkeneLogLikelihoodAggregatorBuffer(agg: Double, predClass: Int) 
Author: enriquegrodrigo | Project: spark-crowd | Lines: 46 | Source: DawidSkeneLogLikelihoodAggregator.scala


Example 17: DawidSkeneEAggregator

// Package declaration and imported dependencies
package com.enriquegrodrigo.spark.crowd.aggregators

import com.enriquegrodrigo.spark.crowd.types.DawidSkenePartial
import com.enriquegrodrigo.spark.crowd.types.DawidSkeneParams
import org.apache.spark.sql.{Encoder,Encoders}
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.broadcast.Broadcast

private[crowd] class DawidSkeneEAggregator(params: Broadcast[DawidSkeneParams], nClasses: Int) 
  extends Aggregator[DawidSkenePartial, DawidSkeneAggregatorBuffer, Int]{

  def zero: DawidSkeneAggregatorBuffer = DawidSkeneAggregatorBuffer(Vector.fill(nClasses)(1))
  
  def reduce(b: DawidSkeneAggregatorBuffer, a: DawidSkenePartial) : DawidSkeneAggregatorBuffer = {
    val pi = params.value.pi 
    val classCondi = Vector.range(0,nClasses).map( c => pi(a.annotator.toInt)(c)(a.value))
    val newVect = classCondi.zip(b.aggVect).map(x => x._1 * x._2)
    DawidSkeneAggregatorBuffer(newVect) 
  }

  def merge(b1: DawidSkeneAggregatorBuffer, b2: DawidSkeneAggregatorBuffer) : DawidSkeneAggregatorBuffer = { 
    val buf = DawidSkeneAggregatorBuffer(b1.aggVect.zip(b2.aggVect).map(x => x._1 * x._2))
    buf
  }

  def finish(reduction: DawidSkeneAggregatorBuffer) = {
    val result = reduction.aggVect.zipWithIndex.maxBy(x => x._1*params.value.w(x._2))._2
    result
  }

  def bufferEncoder: Encoder[DawidSkeneAggregatorBuffer] = Encoders.product[DawidSkeneAggregatorBuffer]

  def outputEncoder: Encoder[Int] = Encoders.scalaInt
}

private[crowd] case class DawidSkeneAggregatorBuffer(aggVect: scala.collection.Seq[Double]) 
Author: enriquegrodrigo | Project: spark-crowd | Lines: 37 | Source: DawidSkeneEAggregator.scala


Example 18: GladEAggregator

// Package declaration and imported dependencies
package com.enriquegrodrigo.spark.crowd.aggregators

import com.enriquegrodrigo.spark.crowd.types.GladPartial
import com.enriquegrodrigo.spark.crowd.types.GladParams
import com.enriquegrodrigo.spark.crowd.utils.Functions
import org.apache.spark.sql.{Encoder,Encoders}
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.broadcast.Broadcast

import scala.math.{log,exp}

private[crowd] class GladEAggregator(params: Broadcast[GladParams]) 
  extends Aggregator[GladPartial, GladEAggregatorBuffer, Double]{

  def zero: GladEAggregatorBuffer = GladEAggregatorBuffer(Vector.fill(2)(1)) //Binary
  
  def reduce(b: GladEAggregatorBuffer, a: GladPartial) : GladEAggregatorBuffer = {
    val alpha = params.value.alpha
    val sigmoidValue = Functions.sigmoid(alpha(a.annotator)*a.beta)
    val p0 = if (a.value == 0) sigmoidValue else (1 - sigmoidValue)
    val p1 = if (a.value == 1) sigmoidValue else (1 - sigmoidValue) 
    GladEAggregatorBuffer(Vector(Functions.logLim(p0) + b.aggVect(0), Functions.logLim(p1) + b.aggVect(1)))
  }

  def merge(b1: GladEAggregatorBuffer, b2: GladEAggregatorBuffer) : GladEAggregatorBuffer = { 
    GladEAggregatorBuffer(b1.aggVect.zip(b2.aggVect).map(x => x._1 * x._2))
  }

  def finish(reduction: GladEAggregatorBuffer) = {
    val w = params.value.w
    val negative = exp(reduction.aggVect(0) + Functions.logLim(w(0)))
    val positive = exp(reduction.aggVect(1) + Functions.logLim(w(1)))
    val norm = negative + positive
    positive/norm
  }

  def bufferEncoder: Encoder[GladEAggregatorBuffer] = Encoders.product[GladEAggregatorBuffer]

  def outputEncoder: Encoder[Double] = Encoders.scalaDouble
}

private[crowd] case class GladEAggregatorBuffer(aggVect: scala.collection.Seq[Double]) 
Author: enriquegrodrigo | Project: spark-crowd | Lines: 43 | Source: GladEAggregator.scala


Example 19: BlockedWordcountAnalysis

// Package declaration and imported dependencies
package com.gu.commentsanalysis.analysis.words

import com.gu.commentsanalysis.dao.DataStore
import com.gu.commentsanalysis.{CommentWasBlocked, Cleanup, Util}
import org.apache.spark.SparkContext
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD


object BlockedWordcountAnalysis {

  def run()(implicit dataStore: DataStore, sc: SparkContext, cleanup: Cleanup) {
    val broadcastCleanup = sc.broadcast(cleanup)
    val allComments = dataStore.comments
    val blockedComments = allComments.filter(_._2.wasBlocked)

    val allWordcounts = getWordcounts(allComments, broadcastCleanup).filter(_._2._1 > 100)
    val blockedWordcounts = getWordcounts(blockedComments, broadcastCleanup)

    val joined = allWordcounts.join(blockedWordcounts)
      .map{case (word, ((allCount, allExample), (blockedCount, blockedExample)))
      => (word, blockedCount/allCount, blockedCount, allExample, blockedExample)}
    Util.save(joined, "blockedWordcount")
  }

  def getWordcounts(comments: RDD[(String, CommentWasBlocked)], broadcastCleanup: Broadcast[Cleanup])
                   (implicit dataStore: DataStore): RDD[(String, (Double, String))] = {
    comments.map(_._2.comment)
      .flatMap{body => broadcastCleanup.value.cleanupWords(body)}
      .reduceByKey(Util.addIntString)
      .map{case (word, (intCount, example)) => (word, (intCount.toDouble, example))}
  }
} 
Author: guardian | Project: online-abuse-analysis | Lines: 34 | Source: BlockedWordcountAnalysis.scala


Example 20: BlockedPhraseCountAnalysis

// Package declaration and imported dependencies
package com.gu.commentsanalysis.analysis.words

import com.gu.commentsanalysis.dao.DataStore
import com.gu.commentsanalysis.{CommentWasBlocked, Cleanup, Util}
import org.apache.spark.SparkContext
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD

object BlockedPhraseCountAnalysis {

  def run()(implicit dataStore: DataStore, sc: SparkContext, cleanup: Cleanup) {
    val broadcastCleanup = sc.broadcast(cleanup)
    val allComments = dataStore.comments
    val blockedComments = allComments.filter(_._2.wasBlocked)

    val allWordcounts = getWordcounts(allComments, broadcastCleanup).filter(_._2._1 > 50)
    val blockedWordcounts = getWordcounts(blockedComments, broadcastCleanup)

    val joined = allWordcounts.join(blockedWordcounts)
      .map{case (word, ((allCount, allExample), (blockedCount, blockedExample)))
      => (word, blockedCount/allCount, blockedCount, allExample, blockedExample)}
    Util.save(joined, "blockedPhraseWordcount")

  }

  def getWordcounts(comments: RDD[(String, CommentWasBlocked)], broadcastCleanup: Broadcast[Cleanup])
                   (implicit dataStore: DataStore): RDD[((String, String), (Double, String))] = {
    comments.map(_._2.comment)
      .flatMap{body => broadcastCleanup.value.cleanupPhrases(body)}
      .reduceByKey(Util.addIntString)
      .map{case (phrase, (intCount, example)) => (phrase, (intCount.toDouble, example))}
  }
} 
Author: guardian | Project: online-abuse-analysis | Lines: 34 | Source: BlockedPhraseCountAnalysis.scala



Note: the org.apache.spark.broadcast.Broadcast class examples in this article were collected from source-code and documentation platforms such as GitHub and MSDocs. The snippets are drawn from contributors' open-source projects, and copyright remains with the original authors; consult each project's license before using or redistributing the code. Do not reproduce without permission.

