This article compiles typical usage examples of the Scala class org.apache.spark.broadcast.Broadcast. If you have been wondering what the Broadcast class is for, how to use it, or what it looks like in real code, the hand-picked class examples below should help.
The following presents 20 code examples of the Broadcast class, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Scala code examples.
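Before the examples, here is a minimal, self-contained sketch of the basic pattern they all build on: a read-only value is shipped to the executors once with SparkContext.broadcast, read on the workers through .value, and released with destroy when it is no longer needed. The names used here (lookup, enriched) are illustrative only and do not come from the examples below.

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.sql.SparkSession

object BroadcastBasics {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.appName("broadcast-basics").master("local[*]").getOrCreate()
    val sc = spark.sparkContext
    // Ship a small lookup table to every executor exactly once.
    val lookup: Broadcast[Map[Int, String]] = sc.broadcast(Map(1 -> "one", 2 -> "two"))
    // Tasks read the broadcast value instead of capturing the map in every closure.
    val enriched = sc.parallelize(Seq(1, 2, 3)).map(k => (k, lookup.value.getOrElse(k, "unknown")))
    enriched.collect().foreach(println)
    // Release the executor-side copies once the value is no longer needed.
    lookup.destroy()
    spark.stop()
  }
}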
Example 1: GladLogLikelihoodAggregator
// Package declaration and imports of the required classes
package com.enriquegrodrigo.spark.crowd.aggregators
import com.enriquegrodrigo.spark.crowd.types.GladPartial
import com.enriquegrodrigo.spark.crowd.types.GladParams
import com.enriquegrodrigo.spark.crowd.utils.Functions
import org.apache.spark.sql.{Encoder,Encoders}
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.broadcast.Broadcast
import scala.math.log
private[crowd] class GladLogLikelihoodAggregator(params: Broadcast[GladParams])
extends Aggregator[GladPartial, GladLogLikelihoodAggregatorBuffer, Double]{
def zero: GladLogLikelihoodAggregatorBuffer = GladLogLikelihoodAggregatorBuffer(0,-1)
def reduce(b: GladLogLikelihoodAggregatorBuffer, a: GladPartial) : GladLogLikelihoodAggregatorBuffer = {
val alphaVal = params.value.alpha(a.annotator.toInt)
val betaVal = a.beta
val sig = Functions.sigmoid(alphaVal*betaVal)
val p0 = 1-a.est
val p1 = a.est
val k0 = if (a.value == 0) sig else 1-sig
val k1 = if (a.value == 1) sig else 1-sig
GladLogLikelihoodAggregatorBuffer(b.agg + Functions.prodlog(p0,k0)
+ Functions.prodlog(p1,k1), p1)
}
def merge(b1: GladLogLikelihoodAggregatorBuffer, b2: GladLogLikelihoodAggregatorBuffer) : GladLogLikelihoodAggregatorBuffer = {
GladLogLikelihoodAggregatorBuffer(b1.agg + b2.agg, if (b1.classProb == -1) b2.classProb else b1.classProb)
}
def finish(reduction: GladLogLikelihoodAggregatorBuffer) = {
val agg = reduction.agg
val w0 = params.value.w(0)
val w1 = params.value.w(1)
val lastVal = reduction.agg + Functions.prodlog((1-reduction.classProb),params.value.w(0)) +
Functions.prodlog(reduction.classProb,params.value.w(1))
lastVal
}
def bufferEncoder: Encoder[GladLogLikelihoodAggregatorBuffer] = Encoders.product[GladLogLikelihoodAggregatorBuffer]
def outputEncoder: Encoder[Double] = Encoders.scalaDouble
}
private[crowd] case class GladLogLikelihoodAggregatorBuffer(agg: Double, classProb: Double)
Developer: enriquegrodrigo, Project: spark-crowd, Lines of code: 49, Source file: GladLogLikelihoodAggregator.scala
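A hedged sketch of how an Aggregator such as the one above is typically applied: Aggregator.toColumn turns it into a TypedColumn that Spark SQL evaluates over a Dataset[GladPartial]. The helper object and parameter names below are assumptions for illustration; they are not part of spark-crowd's public API (the aggregator itself is private[crowd]).

package com.enriquegrodrigo.spark.crowd.aggregators

import com.enriquegrodrigo.spark.crowd.types.{GladParams, GladPartial}
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.sql.Dataset

// Hypothetical helper, not part of the spark-crowd code base.
private[crowd] object GladLogLikelihoodUsage {
  // Evaluates the aggregator over the whole partial dataset and returns the scalar log-likelihood.
  def logLikelihood(data: Dataset[GladPartial], params: Broadcast[GladParams]): Double =
    data.select(new GladLogLikelihoodAggregator(params).toColumn).first()
}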
Example 2: PDF
// Package declaration and imports of the required classes
package org.hammerlab.coverage.two_sample
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.hammerlab.coverage
import org.hammerlab.coverage.histogram.JointHistogram.Depth
import spire.algebra.Monoid
case class PDF[C: Monoid](rdd: RDD[((Depth, Depth), C)],
filtersBroadcast: Broadcast[(Set[Depth], Set[Depth])],
maxDepth1: Depth,
maxDepth2: Depth)
extends coverage.PDF[C]
with CanDownSampleRDD[C]
case class CDF[C: Monoid](rdd: RDD[((Depth, Depth), C)],
filtersBroadcast: Broadcast[(Set[Depth], Set[Depth])])
extends coverage.CDF[C]
with CanDownSampleRDD[C]
Developer: hammerlab, Project: coverage-depth, Lines of code: 20, Source file: PDF.scala
Example 3: rdd
// Package declaration and imports of the required classes
package org.hammerlab.coverage.two_sample
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.hammerlab.coverage.histogram.JointHistogram.Depth
trait CanDownSampleRDD[V] {
def rdd: RDD[((Depth, Depth), V)]
def filtersBroadcast: Broadcast[(Set[Depth], Set[Depth])]
@transient lazy val filtered = filterDistribution(filtersBroadcast)
private def filterDistribution(filtersBroadcast: Broadcast[(Set[Int], Set[Int])]): Array[((Depth, Depth), V)] =
(for {
((d1, d2), value) ← rdd
(d1Filter, d2Filter) = filtersBroadcast.value
if d1Filter(d1) && d2Filter(d2)
} yield
(d1, d2) → value
)
.collect
.sortBy(_._1)
}
Developer: hammerlab, Project: coverage-depth, Lines of code: 23, Source file: CanDownSampleRDD.scala
Example 4: Configuration
// Package declaration and imports of the required classes
package org.hammerlab.hadoop
import java.io.{ ObjectInputStream, ObjectOutputStream }
import com.esotericsoftware.kryo.Kryo
import org.apache.hadoop.conf
import org.apache.hadoop.conf.{ Configuration ⇒ HadoopConfiguration }
import org.apache.spark.SparkContext
import org.apache.spark.broadcast.Broadcast
import org.hammerlab.hadoop.kryo.WritableSerializer
import org.hammerlab.kryo.serializeAs
class Configuration(@transient var value: HadoopConfiguration)
extends Serializable {
private def writeObject(out: ObjectOutputStream): Unit = {
value.write(out)
}
private def readObject(in: ObjectInputStream): Unit = {
value = new HadoopConfiguration(false)
value.readFields(in)
}
}
object Configuration {
def apply(loadDefaults: Boolean = true): Configuration =
new HadoopConfiguration(loadDefaults)
def apply(conf: HadoopConfiguration): Configuration =
new Configuration(conf)
implicit def wrapConfiguration(conf: HadoopConfiguration): Configuration =
apply(conf)
implicit def unwrapConfiguration(conf: Configuration): HadoopConfiguration =
conf.value
implicit def unwrapConfigurationBroadcast(confBroadcast: Broadcast[Configuration]): Configuration =
confBroadcast.value
implicit def sparkContextToHadoopConfiguration(sc: SparkContext): Configuration =
sc.hadoopConfiguration
implicit class ConfWrapper(val conf: HadoopConfiguration) extends AnyVal {
def serializable: Configuration =
Configuration(conf)
}
def register(kryo: Kryo): Unit = {
kryo.register(
classOf[conf.Configuration],
new WritableSerializer[conf.Configuration]
)
kryo.register(
classOf[Configuration],
serializeAs[Configuration, conf.Configuration]
)
}
}
Developer: hammerlab, Project: spark-util, Lines of code: 62, Source file: Configuration.scala
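A hedged sketch (not from the repository) of the intent behind this wrapper: Hadoop's Configuration is not java.io.Serializable, so it cannot be broadcast directly; wrapping it as above, together with the implicit conversions in the companion object, lets sc.broadcast work transparently. The object and function names below are assumptions.

import org.apache.spark.SparkContext
import org.apache.spark.broadcast.Broadcast
import org.hammerlab.hadoop.Configuration

object ConfigurationBroadcastSketch {
  def broadcastConf(sc: SparkContext): Broadcast[Configuration] = {
    // wrapConfiguration (implicit in Configuration's companion object) converts the
    // raw Hadoop configuration into the serializable wrapper defined above.
    val conf: Configuration = sc.hadoopConfiguration
    sc.broadcast(conf)
  }
}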
Example 5: ShortestPathLengthsFromCSV
// Package declaration and imports of the required classes
package ml.sparkling.graph.examples
import ml.sparkling.graph.api.operators.algorithms.shortestpaths.ShortestPathsTypes
import ml.sparkling.graph.api.operators.algorithms.shortestpaths.ShortestPathsTypes._
import ml.sparkling.graph.operators.algorithms.shortestpaths.ShortestPathsAlgorithm
import ml.sparkling.graph.operators.algorithms.shortestpaths.pathprocessors.fastutils.FastUtilWithDistance.DataMap
import ml.sparkling.graph.operators.predicates.AllPathPredicate
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.graphx.{Graph, VertexId}
import scala.collection.JavaConversions._
object ShortestPathLengthsFromCSV extends ExampleApp {
def body()={
val shortestPaths =if(bucketSize == -1l)
ShortestPathsAlgorithm.computeShortestPathsLengths(partitionedGraph,AllPathPredicate,treatAsUndirected)
else
ShortestPathsAlgorithm.computeShortestPathsLengthsIterative(partitionedGraph,(g:Graph[_,_])=>bucketSize,treatAsUndirected)
val size: Broadcast[VertexId] =ctx.broadcast(partitionedGraph.numVertices)
partitionedGraph.outerJoinVertices(shortestPaths.vertices)(Util.dataTransformFunction(size) _).vertices.values.saveAsTextFile(out)
ctx.stop()
}
}
private object Util{
def dataTransformFunction(size: Broadcast[VertexId])(vId: VertexId,oldValue: String,pathsOption: Option[_ >: DataMap <: JMap[JLong, JDouble]])={
pathsOption.flatMap((paths)=>{
var entries=paths.entrySet().toList.sortBy(_.getKey)
val out=new StringBuilder()
out++=s"${oldValue},"
var a = 0l
while (a < size.value) {
if (entries.size > 0 && a == entries.head.getKey) {
out ++= s"${entries.head.getValue},"
entries = entries.drop(1)
}
else {
out ++= "0,"
}
a += 1l
}
out.setLength(out.length - 1)
Option(out.toString())
}).getOrElse(oldValue)
}
}
Developer: sparkling-graph, Project: sparkling-graph, Lines of code: 48, Source file: ShortestPathLengthsFromCSV.scala
Example 6: TFIDF
// Package declaration and imports of the required classes
package io.elegans.exercises
import org.apache.spark.broadcast.Broadcast
class TFIDF {
def getTfIDFAnnotatedVector(annotated_sentence: Map[String, Long],
dictionary: => Broadcast[Map[String, (Long, Long)]], num_of_documents: Long) :
Map[String, Tuple4[Long, Long, Long, Double]] = {
val freq_annotations = annotated_sentence
val max_term_raw_freq: Long = 0 //TODO: get the max raw frequency from the list of terms
val tfidf_annotated_terms: Map[String, Tuple4[Long, Long, Long, Double]] = // map on terms of the sentence
freq_annotations.map(term => { // maps on terms annotations
val term_string : String = "" //TODO: complete with the term string
val term_raw_freq : Long = 0 //TODO: complete with term raw frequency in sentence
val dictionary_term: Tuple2[Long, Long] = (0: Long, 0: Long) //TODO: get the pair (unique_id, num_docs_with_terms) from dictionary
val term_id : Long = 0 //TODO: the term unique id
val num_docs_with_terms : Long = 0 //TODO: the term num_docs_with_terms
val term_tfidf : Double = 0 //TODO: the tfidf
val dictionary_term_with_tfidf = (term_string, (term_raw_freq, term_id, num_docs_with_terms, term_tfidf))
dictionary_term_with_tfidf
})
tfidf_annotated_terms
}
}
Developer: elegans-io, Project: spark-sbt-skel, Lines of code: 27, Source file: TFIDF.scala
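The TODOs above are intentional exercise stubs. One possible way they could be filled in, using the usual tf-idf definitions (raw frequency normalised by the most frequent term in the sentence, idf = log(N / num_docs_with_term)), is sketched below; this is an assumption-based completion, not the exercise's reference solution.

package io.elegans.exercises

import org.apache.spark.broadcast.Broadcast
import scala.math.log

class TFIDFSolved {
  def getTfIDFAnnotatedVector(annotated_sentence: Map[String, Long],
                              dictionary: => Broadcast[Map[String, (Long, Long)]],
                              num_of_documents: Long): Map[String, (Long, Long, Long, Double)] = {
    // Highest raw frequency in the sentence, used to normalise term frequencies.
    val max_term_raw_freq: Long = if (annotated_sentence.isEmpty) 1L else annotated_sentence.values.max
    annotated_sentence.map { case (term_string, term_raw_freq) =>
      // (unique_id, num_docs_with_term) for the term; unknown terms fall back to (0, 0).
      val (term_id, num_docs_with_terms) = dictionary.value.getOrElse(term_string, (0L, 0L))
      val tf = term_raw_freq.toDouble / max_term_raw_freq
      val idf = if (num_docs_with_terms > 0) log(num_of_documents.toDouble / num_docs_with_terms) else 0.0
      (term_string, (term_raw_freq, term_id, num_docs_with_terms, tf * idf))
    }
  }
}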
Example 7: SKGGenerator
// Package declaration and imports of the required classes
package kr.acon.generator.skg
import scala.Iterator
import org.apache.spark.SparkContext
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import it.unimi.dsi.fastutil.longs.LongOpenHashBigSet
import kr.acon.generator.BaseGenerator
import kr.acon.util.Util.RangePartitionFromDegreeRDD
object SKGGenerator extends BaseGenerator {
override val appName = "TrillionG: A Trillion-scale Synthetic Graph Generator using a Recursive Vector Model"
implicit class RecVecGenClass(self: RDD[Long]) extends Serializable {
def doRecVecGen(bskg: Broadcast[_ <: SKG], rng: Long) = {
self.mapPartitions {
case partitions =>
val skg = bskg.value
partitions.flatMap {
case u =>
val random = new SKG.randomClass(rng + u)
val degree = skg.getDegree(u, random)
if (degree < 1)
Iterator.empty
else {
val recVec = skg.getRecVec(u)
val sigmas = skg.getSigmas(recVec)
val p = recVec.last
val adjacency = new LongOpenHashBigSet(degree)
var i = 0
while (i < degree) {
adjacency.add(skg.determineEdge(u, recVec, sigmas, random))
i += 1
}
Iterator((u, adjacency))
}
}
}
}
}
override def run(sc: SparkContext): RDD[(Long, LongOpenHashBigSet)] = {
val verticesRDD = largeVertices(sc)
val bskg = sc.broadcast(SKG.constructFrom(parser))
val degreeRDD = verticesRDD.map(vid => (vid, bskg.value.getExpectedDegree(vid)))
val partitionedVertices = degreeRDD.rangePartition(parser.machine, parser.n, parser.e)
val edges = partitionedVertices.doRecVecGen(bskg, parser.rng)
edges
}
}
Developer: chan150, Project: TrillionG, Lines of code: 53, Source file: SKGGenerator.scala
Example 8: TransformContext
// Package declaration and imports of the required classes
package com.microsoft.partnercatalyst.fortis.spark.transformcontext
import com.microsoft.partnercatalyst.fortis.spark.dto.{BlacklistedTerm, SiteSettings}
import com.microsoft.partnercatalyst.fortis.spark.transforms.image.ImageAnalyzer
import com.microsoft.partnercatalyst.fortis.spark.transforms.language.LanguageDetector
import com.microsoft.partnercatalyst.fortis.spark.transforms.locations.LocationsExtractorFactory
import com.microsoft.partnercatalyst.fortis.spark.transforms.sentiment.SentimentDetectorAuth
import com.microsoft.partnercatalyst.fortis.spark.transforms.topic.KeywordExtractor
import org.apache.spark.broadcast.Broadcast
case class TransformContext(
siteSettings: SiteSettings = null,
langToKeywordExtractor: Broadcast[Map[String, KeywordExtractor]] = null,
blacklist: Broadcast[Seq[BlacklistedTerm]] = null,
locationsExtractorFactory: Broadcast[LocationsExtractorFactory] = null,
// The following objects have a small serialized forms. Consequently, we don't bother to broadcast them
// (instead, they're serialized into each task that uses them).
imageAnalyzer: ImageAnalyzer = null,
languageDetector: LanguageDetector = null,
sentimentDetectorAuth: SentimentDetectorAuth = null
)
Developer: CatalystCode, Project: project-fortis-spark, Lines of code: 23, Source file: TransformContext.scala
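A hedged sketch of how a driver program might populate the broadcast fields of such a context, following the comment in the case class: the large shared objects are broadcast once, while the small analyzers are passed by value. The builder object and its parameters are assumptions for illustration, not code from project-fortis-spark.

import com.microsoft.partnercatalyst.fortis.spark.dto.{BlacklistedTerm, SiteSettings}
import com.microsoft.partnercatalyst.fortis.spark.transformcontext.TransformContext
import com.microsoft.partnercatalyst.fortis.spark.transforms.image.ImageAnalyzer
import com.microsoft.partnercatalyst.fortis.spark.transforms.language.LanguageDetector
import com.microsoft.partnercatalyst.fortis.spark.transforms.locations.LocationsExtractorFactory
import com.microsoft.partnercatalyst.fortis.spark.transforms.sentiment.SentimentDetectorAuth
import com.microsoft.partnercatalyst.fortis.spark.transforms.topic.KeywordExtractor
import org.apache.spark.SparkContext

object TransformContextBuilderSketch {
  // Broadcast the large shared objects once; pass the small analyzers by value.
  def build(sc: SparkContext,
            siteSettings: SiteSettings,
            extractors: Map[String, KeywordExtractor],
            blacklist: Seq[BlacklistedTerm],
            locationsFactory: LocationsExtractorFactory,
            imageAnalyzer: ImageAnalyzer,
            languageDetector: LanguageDetector,
            sentimentAuth: SentimentDetectorAuth): TransformContext =
    TransformContext(
      siteSettings = siteSettings,
      langToKeywordExtractor = sc.broadcast(extractors),
      blacklist = sc.broadcast(blacklist),
      locationsExtractorFactory = sc.broadcast(locationsFactory),
      imageAnalyzer = imageAnalyzer,
      languageDetector = languageDetector,
      sentimentDetectorAuth = sentimentAuth
    )
}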
Example 9: DawidSkenePartialModel
// Package declaration and imports of the required classes
package com.enriquegrodrigo.spark.crowd.types
import org.apache.spark.sql.Dataset
import org.apache.spark.broadcast.Broadcast
private[crowd] case class DawidSkenePartialModel(dataset: Dataset[DawidSkenePartial], params: Broadcast[DawidSkeneParams],
logLikelihood: Double, improvement: Double, nClasses: Int,
nAnnotators: Long) {
def modify(nDataset: Dataset[DawidSkenePartial] =dataset,
nParams: Broadcast[DawidSkeneParams] =params,
nLogLikelihood: Double =logLikelihood,
nImprovement: Double =improvement,
nNClasses: Int =nClasses,
nNAnnotators: Long =nAnnotators) =
new DawidSkenePartialModel(nDataset, nParams,
nLogLikelihood,
nImprovement,
nNClasses,
nNAnnotators)
}
Developer: enriquegrodrigo, Project: spark-crowd, Lines of code: 23, Source file: DawidSkenePartialModel.scala
Example 10: GladPartialModel
// Package declaration and imports of the required classes
package com.enriquegrodrigo.spark.crowd.types
import org.apache.spark.sql.Dataset
import org.apache.spark.broadcast.Broadcast
private[crowd] case class GladPartialModel(dataset: Dataset[GladPartial], params: Broadcast[GladParams],
logLikelihood: Double, improvement: Double,
nAnnotators: Int) {
def modify(nDataset: Dataset[GladPartial] =dataset,
nParams: Broadcast[GladParams] =params,
nLogLikelihood: Double =logLikelihood,
nImprovement: Double =improvement,
nNAnnotators: Int =nAnnotators) =
new GladPartialModel(nDataset, nParams,
nLogLikelihood,
nImprovement,
nNAnnotators)
}
Developer: enriquegrodrigo, Project: spark-crowd, Lines of code: 21, Source file: GladPartialModel.scala
Example 11: RaykarBinaryPartialModel
// Package declaration and imports of the required classes
package com.enriquegrodrigo.spark.crowd.types
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.DataFrame
import org.apache.spark.broadcast.Broadcast
private[crowd] case class RaykarBinaryPartialModel(dataset: DataFrame, annotatorData: Dataset[BinaryAnnotation],
mu: Dataset[BinarySoftLabel], dataStatistics: Dataset[RaykarBinaryStatistics],
params: Broadcast[RaykarBinaryParams], logLikelihood: Double,
improvement: Double, nAnnotators: Int, nFeatures: Int) {
def modify(nDataset: DataFrame =dataset,
nAnnotatorData: Dataset[BinaryAnnotation] =annotatorData,
nMu: Dataset[BinarySoftLabel] =mu,
nDataStatistics: Dataset[RaykarBinaryStatistics] = dataStatistics,
nParams: Broadcast[RaykarBinaryParams] =params,
nLogLikelihood: Double =logLikelihood,
nImprovement: Double =improvement,
nNAnnotators: Int =nAnnotators,
nNFeatures: Int =nFeatures) =
new RaykarBinaryPartialModel(nDataset, nAnnotatorData, nMu, nDataStatistics,
nParams, nLogLikelihood, nImprovement, nNAnnotators, nNFeatures)
}
Developer: enriquegrodrigo, Project: spark-crowd, Lines of code: 26, Source file: RaykarBinaryPartialModel.scala
Example 12: KajinoPartialModel
// Package declaration and imports of the required classes
package com.enriquegrodrigo.spark.crowd.types
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.DataFrame
import org.apache.spark.broadcast.Broadcast
private[crowd] case class KajinoPartialModel(dataset: DataFrame, annotatorData: Dataset[BinaryAnnotation],
w0: Broadcast[Array[Double]], w: Array[Array[Double]], priors: Broadcast[KajinoPriors],
variation: Double, nAnnotators: Int, nFeatures: Int) {
def modify(nDataset: DataFrame =dataset,
nAnnotatorData: Dataset[BinaryAnnotation] =annotatorData,
nW0: Broadcast[Array[Double]] =w0,
nW: Array[Array[Double]] = w,
nPriors: Broadcast[KajinoPriors] =priors,
nVariation: Double =variation,
nNAnnotators: Int =nAnnotators,
nNFeatures: Int =nFeatures) =
new KajinoPartialModel(nDataset, nAnnotatorData, nW0, nW, nPriors,
nVariation, nNAnnotators, nNFeatures)
}
Developer: enriquegrodrigo, Project: spark-crowd, Lines of code: 23, Source file: KajinoPartialModel.scala
Example 13: GladAlphaAggregator
// Package declaration and imports of the required classes
package com.enriquegrodrigo.spark.crowd.aggregators
import com.enriquegrodrigo.spark.crowd.types.GladPartial
import com.enriquegrodrigo.spark.crowd.types.GladParams
import com.enriquegrodrigo.spark.crowd.utils.Functions
import org.apache.spark.sql.{Encoder,Encoders}
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.broadcast.Broadcast
private[crowd] class GladAlphaAggregator(params: Broadcast[GladParams], learningRate: Double)
extends Aggregator[GladPartial, GladAlphaAggregatorBuffer, Double]{
def zero: GladAlphaAggregatorBuffer = GladAlphaAggregatorBuffer(0,-1)
def reduce(b: GladAlphaAggregatorBuffer, a: GladPartial) : GladAlphaAggregatorBuffer = {
val alpha = params.value.alpha
val al = alpha(a.annotator)
val bet = a.beta
val aest = a.est
val sigmoidValue = Functions.sigmoid(alpha(a.annotator)*a.beta)
val p = if (a.value == 1) a.est else (1-a.est)
val term = (p - sigmoidValue)*bet
GladAlphaAggregatorBuffer(b.agg + term, al)
}
def merge(b1: GladAlphaAggregatorBuffer, b2: GladAlphaAggregatorBuffer) : GladAlphaAggregatorBuffer = {
GladAlphaAggregatorBuffer(b1.agg + b2.agg, if (b1.alpha == -1) b2.alpha else b1.alpha )
}
def finish(reduction: GladAlphaAggregatorBuffer) = {
reduction.alpha + learningRate * reduction.agg
}
def bufferEncoder: Encoder[GladAlphaAggregatorBuffer] = Encoders.product[GladAlphaAggregatorBuffer]
def outputEncoder: Encoder[Double] = Encoders.scalaDouble
}
private[crowd] case class GladAlphaAggregatorBuffer(agg: Double, alpha: Double)
Developer: enriquegrodrigo, Project: spark-crowd, Lines of code: 40, Source file: GladAlphaAggregator.scala
Example 14: GladBetaAggregator
// Package declaration and imports of the required classes
package com.enriquegrodrigo.spark.crowd.aggregators
import com.enriquegrodrigo.spark.crowd.types.GladPartial
import com.enriquegrodrigo.spark.crowd.types.GladParams
import com.enriquegrodrigo.spark.crowd.utils.Functions
import org.apache.spark.sql.{Encoder,Encoders}
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.broadcast.Broadcast
private[crowd] class GladBetaAggregator(params: Broadcast[GladParams], learningRate: Double)
extends Aggregator[GladPartial, GladBetaAggregatorBuffer, Double]{
def zero: GladBetaAggregatorBuffer = GladBetaAggregatorBuffer(0,-1)
def reduce(b: GladBetaAggregatorBuffer, a: GladPartial) : GladBetaAggregatorBuffer = {
val alpha = params.value.alpha
val al = alpha(a.annotator)
val bet = a.beta
val aest = a.est
val sigmoidValue = Functions.sigmoid(alpha(a.annotator)*a.beta)
val p = if (a.value == 1) a.est else (1-a.est)
val term = (p - sigmoidValue)*alpha(a.annotator)
GladBetaAggregatorBuffer(b.agg + term, a.beta)
}
def merge(b1: GladBetaAggregatorBuffer, b2: GladBetaAggregatorBuffer) : GladBetaAggregatorBuffer = {
GladBetaAggregatorBuffer(b1.agg + b2.agg, if (b1.beta == -1) b2.beta else b1.beta)
}
def finish(reduction: GladBetaAggregatorBuffer) = {
reduction.beta + learningRate * reduction.agg
}
def bufferEncoder: Encoder[GladBetaAggregatorBuffer] = Encoders.product[GladBetaAggregatorBuffer]
def outputEncoder: Encoder[Double] = Encoders.scalaDouble
}
private[crowd] case class GladBetaAggregatorBuffer(agg: Double, beta: Double)
Developer: enriquegrodrigo, Project: spark-crowd, Lines of code: 40, Source file: GladBetaAggregator.scala
Example 15: RaykarBinaryStatisticsAggregator
// Package declaration and imports of the required classes
package com.enriquegrodrigo.spark.crowd.aggregators
import com.enriquegrodrigo.spark.crowd.types.RaykarBinaryPartial
import com.enriquegrodrigo.spark.crowd.types.RaykarBinaryParams
import com.enriquegrodrigo.spark.crowd.utils.Functions
import org.apache.spark.sql.{Encoder,Encoders}
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.broadcast.Broadcast
private[crowd] class RaykarBinaryStatisticsAggregator(params: Broadcast[RaykarBinaryParams])
extends Aggregator[RaykarBinaryPartial, RaykarBinaryStatisticsAggregatorBuffer, (Double,Double)]{
def zero: RaykarBinaryStatisticsAggregatorBuffer = RaykarBinaryStatisticsAggregatorBuffer(1,1) //Binary
def reduce(b: RaykarBinaryStatisticsAggregatorBuffer, a: RaykarBinaryPartial) : RaykarBinaryStatisticsAggregatorBuffer = {
val alphaValue = params.value.alpha(a.annotator)
val alphaTerm = if (a.value == 1) alphaValue else 1-alphaValue
val betaValue = params.value.beta(a.annotator)
val betaTerm = if (a.value == 0) betaValue else 1-betaValue
RaykarBinaryStatisticsAggregatorBuffer(b.a * alphaTerm, b.b * betaTerm)
}
def merge(b1: RaykarBinaryStatisticsAggregatorBuffer, b2: RaykarBinaryStatisticsAggregatorBuffer) : RaykarBinaryStatisticsAggregatorBuffer = {
RaykarBinaryStatisticsAggregatorBuffer(b1.a * b2.a, b1.b*b2.b)
}
def finish(reduction: RaykarBinaryStatisticsAggregatorBuffer) = {
(reduction.a,reduction.b)
}
def bufferEncoder: Encoder[RaykarBinaryStatisticsAggregatorBuffer] = Encoders.product[RaykarBinaryStatisticsAggregatorBuffer]
def outputEncoder: Encoder[(Double,Double)] = Encoders.product[(Double,Double)]
}
private[crowd] case class RaykarBinaryStatisticsAggregatorBuffer(a: Double, b: Double)
Developer: enriquegrodrigo, Project: spark-crowd, Lines of code: 37, Source file: RaykarBinaryStatisticsAggregator.scala
Example 16: DawidSkeneLogLikelihoodAggregator
// Package declaration and imports of the required classes
package com.enriquegrodrigo.spark.crowd.aggregators
import com.enriquegrodrigo.spark.crowd.types.DawidSkenePartial
import com.enriquegrodrigo.spark.crowd.types.DawidSkeneParams
import org.apache.spark.sql.{Encoder,Encoders}
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.broadcast.Broadcast
import scala.math.log
private[crowd] class DawidSkeneLogLikelihoodAggregator(params: Broadcast[DawidSkeneParams])
extends Aggregator[DawidSkenePartial, DawidSkeneLogLikelihoodAggregatorBuffer, Double]{
private def sumKey(map: Map[Int,Double], pair: (Int,Double)) = {
val key = pair._1
val value = pair._2
val new_value = map.get(key) match {
case Some(x) => x + value
case None => value
}
map.updated(key, new_value)
}
def zero: DawidSkeneLogLikelihoodAggregatorBuffer = DawidSkeneLogLikelihoodAggregatorBuffer(0, -1)
def reduce(b: DawidSkeneLogLikelihoodAggregatorBuffer, a: DawidSkenePartial) : DawidSkeneLogLikelihoodAggregatorBuffer = {
val pival = params.value.pi(a.annotator.toInt)(a.est)(a.value)
DawidSkeneLogLikelihoodAggregatorBuffer(b.agg + log(pival), a.est)
}
def merge(b1: DawidSkeneLogLikelihoodAggregatorBuffer, b2: DawidSkeneLogLikelihoodAggregatorBuffer) : DawidSkeneLogLikelihoodAggregatorBuffer = {
DawidSkeneLogLikelihoodAggregatorBuffer(b1.agg + b2.agg, if (b1.predClass == -1) b2.predClass else b1.predClass)
}
def finish(reduction: DawidSkeneLogLikelihoodAggregatorBuffer) = {
reduction.agg + log(params.value.w(reduction.predClass))
}
def bufferEncoder: Encoder[DawidSkeneLogLikelihoodAggregatorBuffer] = Encoders.product[DawidSkeneLogLikelihoodAggregatorBuffer]
def outputEncoder: Encoder[Double] = Encoders.scalaDouble
}
private[crowd] case class DawidSkeneLogLikelihoodAggregatorBuffer(agg: Double, predClass: Int)
Developer: enriquegrodrigo, Project: spark-crowd, Lines of code: 46, Source file: DawidSkeneLogLikelihoodAggregator.scala
Example 17: DawidSkeneEAggregator
// Package declaration and imports of the required classes
package com.enriquegrodrigo.spark.crowd.aggregators
import com.enriquegrodrigo.spark.crowd.types.DawidSkenePartial
import com.enriquegrodrigo.spark.crowd.types.DawidSkeneParams
import org.apache.spark.sql.{Encoder,Encoders}
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.broadcast.Broadcast
private[crowd] class DawidSkeneEAggregator(params: Broadcast[DawidSkeneParams], nClasses: Int)
extends Aggregator[DawidSkenePartial, DawidSkeneAggregatorBuffer, Int]{
def zero: DawidSkeneAggregatorBuffer = DawidSkeneAggregatorBuffer(Vector.fill(nClasses)(1))
def reduce(b: DawidSkeneAggregatorBuffer, a: DawidSkenePartial) : DawidSkeneAggregatorBuffer = {
val pi = params.value.pi
val classCondi = Vector.range(0,nClasses).map( c => pi(a.annotator.toInt)(c)(a.value))
val newVect = classCondi.zip(b.aggVect).map(x => x._1 * x._2)
DawidSkeneAggregatorBuffer(newVect)
}
def merge(b1: DawidSkeneAggregatorBuffer, b2: DawidSkeneAggregatorBuffer) : DawidSkeneAggregatorBuffer = {
val buf = DawidSkeneAggregatorBuffer(b1.aggVect.zip(b2.aggVect).map(x => x._1 * x._2))
buf
}
def finish(reduction: DawidSkeneAggregatorBuffer) = {
val result = reduction.aggVect.zipWithIndex.maxBy(x => x._1*params.value.w(x._2))._2
result
}
def bufferEncoder: Encoder[DawidSkeneAggregatorBuffer] = Encoders.product[DawidSkeneAggregatorBuffer]
def outputEncoder: Encoder[Int] = Encoders.scalaInt
}
private[crowd] case class DawidSkeneAggregatorBuffer(aggVect: scala.collection.Seq[Double])
Developer: enriquegrodrigo, Project: spark-crowd, Lines of code: 37, Source file: DawidSkeneEAggregator.scala
Example 18: GladEAggregator
// Package declaration and imports of the required classes
package com.enriquegrodrigo.spark.crowd.aggregators
import com.enriquegrodrigo.spark.crowd.types.GladPartial
import com.enriquegrodrigo.spark.crowd.types.GladParams
import com.enriquegrodrigo.spark.crowd.utils.Functions
import org.apache.spark.sql.{Encoder,Encoders}
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.broadcast.Broadcast
import scala.math.{log,exp}
private[crowd] class GladEAggregator(params: Broadcast[GladParams])
extends Aggregator[GladPartial, GladEAggregatorBuffer, Double]{
def zero: GladEAggregatorBuffer = GladEAggregatorBuffer(Vector.fill(2)(1)) //Binary
def reduce(b: GladEAggregatorBuffer, a: GladPartial) : GladEAggregatorBuffer = {
val alpha = params.value.alpha
val sigmoidValue = Functions.sigmoid(alpha(a.annotator)*a.beta)
val p0 = if (a.value == 0) sigmoidValue else (1 - sigmoidValue)
val p1 = if (a.value == 1) sigmoidValue else (1 - sigmoidValue)
GladEAggregatorBuffer(Vector(Functions.logLim(p0) + b.aggVect(0), Functions.logLim(p1) + b.aggVect(1)))
}
def merge(b1: GladEAggregatorBuffer, b2: GladEAggregatorBuffer) : GladEAggregatorBuffer = {
GladEAggregatorBuffer(b1.aggVect.zip(b2.aggVect).map(x => x._1 * x._2))
}
def finish(reduction: GladEAggregatorBuffer) = {
val w = params.value.w
val negative = exp(reduction.aggVect(0) + Functions.logLim(w(0)))
val positive = exp(reduction.aggVect(1) + Functions.logLim(w(1)))
val norm = negative + positive
positive/norm
}
def bufferEncoder: Encoder[GladEAggregatorBuffer] = Encoders.product[GladEAggregatorBuffer]
def outputEncoder: Encoder[Double] = Encoders.scalaDouble
}
private[crowd] case class GladEAggregatorBuffer(aggVect: scala.collection.Seq[Double])
Developer: enriquegrodrigo, Project: spark-crowd, Lines of code: 43, Source file: GladEAggregator.scala
Example 19: BlockedWordcountAnalysis
// Package declaration and imports of the required classes
package com.gu.commentsanalysis.analysis.words
import com.gu.commentsanalysis.dao.DataStore
import com.gu.commentsanalysis.{CommentWasBlocked, Cleanup, Util}
import org.apache.spark.SparkContext
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
object BlockedWordcountAnalysis {
def run()(implicit dataStore: DataStore, sc: SparkContext, cleanup: Cleanup) {
val broadcastCleanup = sc.broadcast(cleanup)
val allComments = dataStore.comments
val blockedComments = allComments.filter(_._2.wasBlocked)
val allWordcounts = getWordcounts(allComments, broadcastCleanup).filter(_._2._1 > 100)
val blockedWordcounts = getWordcounts(blockedComments, broadcastCleanup)
val joined = allWordcounts.join(blockedWordcounts)
.map{case (word, ((allCount, allExample), (blockedCount, blockedExample)))
=> (word, blockedCount/allCount, blockedCount, allExample, blockedExample)}
Util.save(joined, "blockedWordcount")
}
def getWordcounts(comments: RDD[(String, CommentWasBlocked)], broadcastCleanup: Broadcast[Cleanup])
(implicit dataStore: DataStore): RDD[(String, (Double, String))] = {
comments.map(_._2.comment)
.flatMap{body => broadcastCleanup.value.cleanupWords(body)}
.reduceByKey(Util.addIntString)
.map{case (word, (intCount, example)) => (word, (intCount.toDouble, example))}
}
}
Developer: guardian, Project: online-abuse-analysis, Lines of code: 34, Source file: BlockedWordcountAnalysis.scala
Example 20: BlockedPhraseCountAnalysis
// Package declaration and imports of the required classes
package com.gu.commentsanalysis.analysis.words
import com.gu.commentsanalysis.dao.DataStore
import com.gu.commentsanalysis.{CommentWasBlocked, Cleanup, Util}
import org.apache.spark.SparkContext
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
object BlockedPhraseCountAnalysis {
def run()(implicit dataStore: DataStore, sc: SparkContext, cleanup: Cleanup) {
val broadcastCleanup = sc.broadcast(cleanup)
val allComments = dataStore.comments
val blockedComments = allComments.filter(_._2.wasBlocked)
val allWordcounts = getWordcounts(allComments, broadcastCleanup).filter(_._2._1 > 50)
val blockedWordcounts = getWordcounts(blockedComments, broadcastCleanup)
val joined = allWordcounts.join(blockedWordcounts)
.map{case (word, ((allCount, allExample), (blockedCount, blockedExample)))
=> (word, blockedCount/allCount, blockedCount, allExample, blockedExample)}
Util.save(joined, "blockedPhraseWordcount")
}
def getWordcounts(comments: RDD[(String, CommentWasBlocked)], broadcastCleanup: Broadcast[Cleanup])
(implicit dataStore: DataStore): RDD[((String, String), (Double, String))] = {
comments.map(_._2.comment)
.flatMap{body => broadcastCleanup.value.cleanupPhrases(body)}
.reduceByKey(Util.addIntString)
.map{case (phrase, (intCount, example)) => (phrase, (intCount.toDouble, example))}
}
}
Developer: guardian, Project: online-abuse-analysis, Lines of code: 34, Source file: BlockedPhraseCountAnalysis.scala
Note: The org.apache.spark.broadcast.Broadcast class examples in this article were compiled from source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by their respective developers; copyright of the source code remains with the original authors. Please consult each project's License before distributing or reusing the code, and do not republish without permission.