
Scala Dataset Class Code Examples


This article collects typical usage examples of the org.apache.spark.sql.Dataset class in Scala. If you have been wondering what the Dataset class is for, how to use it, or what real code that uses it looks like, the hand-picked examples below should help.



A total of 20 code examples of the Dataset class are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Scala code examples.
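
Before the examples, here is a minimal, self-contained sketch of how a typed Dataset is normally obtained from a case class and a local collection. The SparkSession settings and the Person case class are illustrative assumptions, not taken from any example below.

import org.apache.spark.sql.{Dataset, SparkSession}

object DatasetQuickStart {
  // Illustrative element type; any case class with an implicit Encoder works.
  case class Person(name: String, age: Int)

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("dataset-quickstart").master("local[*]").getOrCreate()
    import spark.implicits._ // brings the Encoders required by createDataset/toDS into scope

    // Build a typed Dataset[Person] from a local collection.
    val people: Dataset[Person] = spark.createDataset(Seq(Person("ada", 36), Person("bob", 42)))

    // Typed transformations keep the element type at compile time.
    val adults: Dataset[Person] = people.filter(_.age >= 18)
    adults.show()

    spark.stop()
  }
}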

Example 1: apply

// Set the package name and import the required classes
package org.dama.datasynth.runtime.spark.operators

import org.apache.spark.sql.{Dataset, SparkSession}
import org.dama.datasynth.executionplan.ExecutionPlan.EdgeTable
import org.dama.datasynth.runtime.spark.SparkRuntime

import scala.util.Random


object EdgeTableOperator { // enclosing object restored; the name follows the source file EdgeTableOperator.scala

  def apply(node: EdgeTable): Dataset[(Long, Long, Long)] = {
    val sparkSession = SparkRuntime.getSparkSession()
    import sparkSession.implicits._
    val generator = SparkRuntime.instantiateStructureGeneratorOperator( node.structure )
    val size = SparkRuntime.evalValueOperator(node.size).asInstanceOf[Long]
    val random : Random = new Random()
    val id : Int = random.nextInt()
    val path : String = s"/tmp/${id}"
    val sparkContext = sparkSession.sparkContext
    generator.run(size, sparkContext.hadoopConfiguration,"hdfs://"+path)
    val edgesRDD = sparkContext.textFile(path)
                               .map( s => s.split("\t"))
                               .map( l => (l(0).toLong, l(1).toLong))
                               .zipWithIndex().map( { case ((tail,head), id) =>  (id, tail, head)})
    sparkSession.createDataset(edgesRDD)
  }

} 
Developer: DAMA-UPC | Project: DataSynth | Lines of code: 28 | Source file: EdgeTableOperator.scala


Example 2: PrecipSource

// Set the package name and import the required classes
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Dataset, SparkSession}


case class PrecipSource(sourceId: Int,
                        name: String,
                        countryCode: String,
                        latitude: String,
                        longitude: String,
                        elevation: Int,
                        elementId: String,
                        beginDate: String,
                        endDate: String,
                        participantId: Int,
                        participantName: String
                       )

case class Precipication(stationId: Int,
                         sourceId: Int,
                         date: String,
                         amount: Int,
                         quality: Int
                        )

class Mappers() {

  
  def precipicationDF(spark: SparkSession, sourceFilPath: String): Dataset[Precipication] = {
    import spark.implicits._

    var sourceFile: RDD[String] = spark.sparkContext.textFile(sourceFilPath)

    val header = spark.sparkContext.parallelize(sourceFile.take(20))
    sourceFile = sourceFile.subtract(header)
    header.unpersist()

    var precipitionDF: Dataset[Precipication] = sourceFile
      .map(s => s.split(",")
        .map(_.trim()))
      .map(fields => Precipication(
        stationId = fields(0).toInt,
        sourceId = fields(1).toInt,
        date = fields(2),
        amount = fields(3).toInt,
        quality = fields(4).toInt
      ))
      .toDS()

    precipitionDF.show(false)
    precipitionDF
  }

} 
Developer: luxinator | Project: RainyDay | Lines of code: 54 | Source file: Mappers.scala
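
A quick usage sketch for the mapper above. The SparkSession setup and the CSV path are assumptions made for illustration; the input file is expected to carry 20 header lines followed by comma-separated records, as the parser implies.

import org.apache.spark.sql.SparkSession

object RainyDayExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("rainy-day").master("local[*]").getOrCreate()

    // precipicationDF drops the first 20 header lines, splits the remaining lines on
    // commas and maps the fields onto the Precipication case class.
    val precipitation = new Mappers().precipicationDF(spark, "data/precipitation.csv")
    println(s"records parsed: ${precipitation.count()}")

    spark.stop()
  }
}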


Example 3: DataFrameFunctions

// Set the package name and import the required classes
package com.bloomberg.sparkflow.dc

import org.apache.spark.sql.{Column, Dataset, Row}


class DataFrameFunctions(self: DC[Row]) {

    def join(right: DC[Row]): DC[Row] = {
      val f = (left: Dataset[_], right: Dataset[_]) => {
        left.join(right)
      }
      val hashTarget = Seq("join")
      new MultiDatasetTransformDC(self, right, f, hashTarget)
    }

    def join(right: DC[Row], usingColumn: String): DC[Row] = {
      val f = (left: Dataset[_], right: Dataset[_]) => {
        left.join(right, usingColumn)
      }
      val hashTarget = Seq("join", usingColumn)
      new MultiDatasetTransformDC(self, right, f, hashTarget)
    }

    def join(right: DC[Row], joinExprs: Column): DC[Row] = join(right, joinExprs, "inner")

    def join(right: DC[Row], joinExprs: Column, joinType: String): DC[Row] = {
      val f = (left: Dataset[_], right: Dataset[_]) => {
        left.join(right, joinExprs, joinType)
      }
      val hashTarget = Seq("join", joinType, joinExprs.toString())
      new MultiDatasetTransformDC(self, right, f, hashTarget)
    }


} 
Developer: bloomberg | Project: spark-flow | Lines of code: 36 | Source file: DataFrameFunctions.scala


Example 4: MiniPanda

// Set the package name and import the required classes
package com.highperformancespark.examples.ml

import com.highperformancespark.examples.dataframe.HappyPandas.{PandaInfo, Pandas}

import com.holdenkarau.spark.testing._

import org.apache.spark.ml._
import org.apache.spark.ml.feature._
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Dataset, Row, SQLContext}
import org.scalatest.Matchers._
import org.scalatest.FunSuite

case class MiniPanda(happy: Double, fuzzy: Double, old: Double)

class SimpleNaiveBayesSuite extends FunSuite with DataFrameSuiteBase {
  val miniPandasList = List(
    MiniPanda(1.0, 1.0, 1.0),
    MiniPanda(1.0, 1.0, 0.0),
    MiniPanda(1.0, 1.0, 0.0),
    MiniPanda(0.0, 0.0, 1.0),
    MiniPanda(0.0, 0.0, 0.0))

  test("simple sanity test") {
    val session = spark
    import session.implicits._
    val ds: Dataset[MiniPanda] = session.createDataset(miniPandasList)
    val assembler = new VectorAssembler()
    assembler.setInputCols(Array("fuzzy", "old"))
    assembler.setOutputCol("features")
    val snb = new SimpleNaiveBayes()
    snb.setLabelCol("happy")
    snb.setFeaturesCol("features")
    val pipeline = new Pipeline().setStages(Array(assembler, snb))
    val model = pipeline.fit(ds)
    val test = ds.select("fuzzy", "old")
    val predicted = model.transform(test)
    println(predicted.collect())
  }
} 
Developer: gourimahapatra | Project: high-performance-spark | Lines of code: 41 | Source file: SimpleNaiveBayes.scala


Example 5: Partition

// Set the package name and import the required classes
package br.ufmg.cs.lib.privacy.kanonymity

import org.apache.spark.sql.{Dataset, Row, SparkSession}
import org.apache.spark.sql.functions._


case class Partition(
    member: Dataset[Row],
    memberCount: Long,
    low: Array[Int],
    high: Array[Int],
    allow: Array[Int]) {

  override def toString: String = {
    s"Partition(memberlen=${memberCount}" +
    s", low=${low.mkString("[", ", ", "]")}" +
    s", high=${high.mkString("[", ",", "]")}" +
    s", allow=${allow.mkString("[", ",", "]")})"
  }
}

object Partition {
  def apply(member: Dataset[Row], memberCount: Long,
      low: Array[Int], high: Array[Int]): Partition = {
    Partition(member, memberCount, low, high, Array.fill(low.length)(1))
  }
} 
Developer: eubr-bigsea | Project: k-anonymity-mondrian | Lines of code: 28 | Source file: Partition.scala


Example 6: cogroup

// Set the package name and import the required classes
package com.qunar.spark.tungsten.base

import org.apache.spark.sql.Dataset
import com.qunar.spark.tungsten.base.CommonEncoders._

import scala.reflect.runtime.universe.TypeTag


object CoreJoinOperators { // enclosing object restored; the name follows the source file CoreJoinOperators.scala

  def cogroup[T: TypeTag, K: TypeTag](leftDataset: Dataset[T], rightDataset: Dataset[T], genJoinKey: T => K): Dataset[(Seq[T], Seq[T])] = {
    // Group leftDataset by the join key and collect each group into a Seq
    val thisKeyValueSet = leftDataset.groupByKey(data => genJoinKey(data))
    val thisCogroupSet = thisKeyValueSet.mapGroups((key, dataIter) => {
      val builder = Seq.newBuilder[T]
      for (data <- dataIter) {
        builder += data
      }
      (key, builder.result)
    }).toDF("_1", "_2").as[(K, Seq[T])]

    // Group rightDataset by the join key and collect each group into a Seq
    val anotherKeyValueSet = rightDataset.groupByKey(data => genJoinKey(data))
    val anotherCogroupSet = anotherKeyValueSet.mapGroups((key, dataIter) => {
      val builder = Seq.newBuilder[T]
      for (data <- dataIter) {
        builder += data
      }
      (key, builder.result)
    }).toDF("_1", "_2").as[(K, Seq[T])]

    val resultDataFrame = thisCogroupSet.join(anotherCogroupSet, thisCogroupSet("_1") === anotherCogroupSet("_1"), "outer")
    resultDataFrame.as[(Seq[T], Seq[T])]
  }

} 
Developer: spark-bypass-common | Project: common-tungsten | Lines of code: 35 | Source file: CoreJoinOperators.scala
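
A usage sketch for cogroup, assuming the enclosing object is named CoreJoinOperators after its source file and that the project's CommonEncoders object supplies the generic encoders the method needs; the Click case class and the sample data are illustrative.

import org.apache.spark.sql.SparkSession

object CogroupExample {
  case class Click(userId: Long, page: String) // illustrative element type

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("cogroup-example").master("local[*]").getOrCreate()
    import spark.implicits._

    val left = spark.createDataset(Seq(Click(1L, "/home"), Click(1L, "/cart"), Click(2L, "/home")))
    val right = spark.createDataset(Seq(Click(1L, "/pay"), Click(3L, "/home")))

    // Pairs the left-side and right-side groups that share the same userId (outer join).
    val cogrouped = CoreJoinOperators.cogroup(left, right, (c: Click) => c.userId)
    cogrouped.show(false)

    spark.stop()
  }
}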


Example 7: OpenIE

// Set the package name and import the required classes
package com.knoldus

import edu.stanford.nlp.simple.{Document, Sentence}
import edu.stanford.nlp.util.Quadruple
import org.apache.spark.sql.{Dataset, SparkSession}
import org.apache.spark.sql.functions.udf

import scala.collection.JavaConverters._


private case class OpenIE(subject: String, relation: String, target: String, confidence: Double) {
  def this(quadruple: Quadruple[String, String, String, java.lang.Double]) =
    this(quadruple.first, quadruple.second, quadruple.third, quadruple.fourth)
}

object StartApplication extends App{

  val spark = SparkSession.builder().appName("spark-nlp-starter").master("local[*]").getOrCreate()
  val sc = spark.sparkContext
  val readPdfFile: Dataset[String] = spark.read.textFile("test")
  readPdfFile.show(false)

  def openie = udf { sentence: String =>
    new Sentence(sentence).openie().asScala.map(q => new OpenIE(q)).toSeq
  }

  val res = readPdfFile.select(openie(readPdfFile("value")))
  res.show(false)
} 
Developer: shiv4nsh | Project: scala-spark-nlp-starter-kit | Lines of code: 30 | Source file: StartApplication.scala


Example 8: RowProfiler

// Set the package name and import the required classes
package io.gzet.profilers.raw

import org.apache.spark.sql.Dataset

case class RowProfiler() {

  def profile(df: Dataset[String]): Dataset[RowReport] = {
    import df.sparkSession.implicits._
    val report = RowReport(df.count().toDouble)
    df.sparkSession.createDataset[RowReport](
      Seq(report)
    )
  }
}

case class RowReport(
                      metricValue: Double
                    ) 
Developer: PacktPublishing | Project: Mastering-Spark-for-Data-Science | Lines of code: 19 | Source file: RowProfiler.scala
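
A usage sketch for the profiler above; the in-memory sample lines and the SparkSession setup are illustrative assumptions.

import org.apache.spark.sql.{Dataset, SparkSession}

object RowProfilerExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("row-profiler").master("local[*]").getOrCreate()
    import spark.implicits._

    // Any Dataset[String] will do; a small in-memory sample stands in for a real file here.
    val lines: Dataset[String] = spark.createDataset(Seq("a,1", "b,2", "c,3"))

    // profile() returns a single-row Dataset[RowReport] whose metricValue is the record count.
    RowProfiler().profile(lines).show()

    spark.stop()
  }
}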


Example 9: EmptinessProfiler

// Set the package name and import the required classes
package io.gzet.profilers.field

import io.gzet.profilers.Utils
import org.apache.commons.lang3.StringUtils
import org.apache.spark.sql.Dataset

import scalaz.Scalaz._

case class EmptinessProfiler() {

  def profile(df: Dataset[Array[String]]): Dataset[EmptinessReport] = {

    import df.sparkSession.implicits._

    val features = Utils.buildColumns(df)

    features.map(f => (f.idx, StringUtils.isNotEmpty(f.value))).groupByKey({ case (column, isNotEmpty) =>
      (column, isNotEmpty)
    }).count().map({ case ((column, isNotEmpty), count) =>
      (column, Map(isNotEmpty -> count))
    }).groupByKey({ case (column, map) =>
      column
    }).reduceGroups({ (v1, v2) =>
      (v1._1, v1._2 |+| v2._2)
    }).map({ case (col, (_, map)) =>
      val emptiness = map.getOrElse(false, 0L) / (map.getOrElse(true, 0L) + map.getOrElse(false, 0L)).toDouble
      EmptinessReport(
        col,
        emptiness
      )
    })

  }

}

case class EmptinessReport(
                            field: Int,
                            metricValue: Double
                          ) 
Developer: PacktPublishing | Project: Mastering-Spark-for-Data-Science | Lines of code: 41 | Source file: EmptinessProfiler.scala


Example 10: CountSentencesByLanguage

// Set the package name and import the required classes
package biz.meetmatch.modules

import biz.meetmatch.model.{Sentence, SentenceCountByLanguage}
import org.apache.spark.sql.{Dataset, SparkSession}
import org.rogach.scallop.Scallop

object CountSentencesByLanguage extends Module with ParquetExtensions[SentenceCountByLanguage] {
  override val parquetFile = "SentenceCountsByLanguage"

  override def execute(scallopts: Scallop)(implicit sparkSession: SparkSession): Unit = {
    val sentenceDS = DetectLanguage.loadResultsFromParquet

    val sentenceCountByLanguageDS = calc(sentenceDS)

    saveResultsToParquet(sentenceCountByLanguageDS)
  }

  
  def calc(sentenceDS: Dataset[Sentence])(implicit sparkSession: SparkSession): Dataset[SentenceCountByLanguage] = {
    import sparkSession.implicits._
    sparkSession.sparkContext.setJobGroup(this.getClass.getName, this.getClass.getName)

    sparkSession.sparkContext.setJobDescription("Count the sentences by language")

    // TASK 2: count how many sentences exist for each detected language and save the results in the SentenceCountByLanguage case class

    // when finished coding:
    // - package, deploy and submit the spark application and verify the results using spark shell or a notebook (see https://github.com/tolomaus/languagedetector section Quick start - usage)
    // - verify the logs of the executed module in the language detector UI

    // solution:
    sentenceDS
      .groupByKey(_.detectedLanguage)
      .count
      .map { case (language, count) => SentenceCountByLanguage(language, count) }
  }

  def loadResultsFromParquet(implicit module: Class[_] = this.getClass, sparkSession: SparkSession): Dataset[SentenceCountByLanguage] = {
    import sparkSession.implicits._
    loadResultsFromParquetAsDF(module, sparkSession).as[SentenceCountByLanguage]
  }
} 
Developer: tolomaus | Project: languagedetector | Lines of code: 43 | Source file: CountSentencesByLanguage.scala


Example 11: SQLDataProvider

// Set the package name and import the required classes
package org.ieee.codemeow.geometric.spark.data

import com.vividsolutions.jts.geom.Geometry
import org.apache.spark.sql.{Dataset, Encoders, SparkSession}
import org.ieee.codemeow.geometric.Feature
import org.ieee.codemeow.geometric.spark.LayerConfiguration



class SQLDataProvider(_spark: SparkSession, _layer: LayerConfiguration) extends AbstractDataProvider(_spark, _layer){

  val url = layer.kwargs.get("url").get.asInstanceOf[String]
  val dbtables = layer.kwargs.get("dbtables").get.asInstanceOf[Map[String, String]]
  val user = layer.kwargs.get("user").get.asInstanceOf[String]
  val password = layer.kwargs.get("password").get.asInstanceOf[String]
  val zoomConfig = layer.kwargs.get("zooms").get.asInstanceOf[Map[String, String]]

  // load all tables
  dbtables.foreach(tuple => {
    val sparkTableName = tuple._1
    val realTableName = tuple._2

    val mapDataFrame = spark.read.format("jdbc")
      .option("url", url)
      .option("user", user)
      .option("password", password)
      .option("dbtable", realTableName).load

    mapDataFrame.createOrReplaceTempView(sparkTableName)
  })

  override def getFeatures(layerName: String, zoom: Long): Option[Dataset[Feature]] ={
    // Ref http://stackoverflow.com/questions/38664972/why-is-unable-to-find-encoder-for-type-stored-in-a-dataset-when-creating-a-dat
    import spark.implicits._
    // Ref http://stackoverflow.com/questions/36648128/how-to-store-custom-objects-in-dataset
    implicit val featureEncoder = Encoders.kryo[Feature]

    val natureSQL = zoomConfig.get(zoom.toString)
    if(natureSQL.isEmpty){
      return None
    }

    val rawDF = spark.sql(natureSQL.get)
    val featureCollection = rawDF.map(row => {
      val id = row.getAs[Long]("__id__")
      val geom = row.getAs[Geometry]("__geometry__")
      val fields = row.schema.filter(field => {
        !Seq("__id__", "__geometry__").contains(field.name)
      }).map(field => field.name)
      val props = row.getValuesMap[String](fields)
      Feature(id, geom, props)
    })
    Some(featureCollection)
  }
} 
Developer: codemeow5 | Project: Vector-Tile-Spark-Process | Lines of code: 56 | Source file: SQLDataProvider.scala


Example 12: FortisTargetTablename

// Set the package name and import the required classes
package com.microsoft.partnercatalyst.fortis.spark.sinks.cassandra.aggregators

import com.microsoft.partnercatalyst.fortis.spark.sinks.cassandra.dto.{AggregationRecord, Event, EventBatchEntry}
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}

trait FortisAggregator {
 protected val KeyspaceName = "fortis"
 protected val CassandraFormat = "org.apache.spark.sql.cassandra"
 protected val AggregateFunctions = "sum(mentioncount) as mentioncountagg, SentimentWeightedAvg(IF(IsNull(avgsentiment), 0, avgsentiment), IF(IsNull(mentioncount), 0, mentioncount)) as avgsentimentagg"
 //protected val IncrementalUpdateMentionsUDF = "SumMentions(a.mentioncountagg, IF(IsNull(b.mentioncount), 0, b.mentioncount)) as mentioncount"
 //protected val IncrementalUpdateSentimentUDF = "MeanAverage(a.avgsentimentagg, a.mentioncountagg, IF(IsNull(b.avgsentiment), 0, b.avgsentiment), IF(IsNull(b.mentioncount), 0, b.mentioncount)) as avgsentiment"
 protected val IncrementalUpdateMentionsUDF = "a.mentioncountagg as mentioncount"
 protected val IncrementalUpdateSentimentUDF = "MeanAverage(a.avgsentimentagg, a.mentioncountagg) as avgsentimentnumerator"
 protected val DataFrameNameFlattenedEvents = "flattenedEventsDF"
 protected val DataFrameNameComputed = "computedDF"

 def FortisTargetTablename: String
 def DfTableNameFlattenedEvents: String
 def DfTableNameComputedAggregates: String

 def FortisTargetTableDataFrame(session:SparkSession): DataFrame
 def flattenEvents(session: SparkSession, eventDS: Dataset[Event]): DataFrame
 def IncrementalUpdate(session:SparkSession, aggregatedDS: DataFrame): DataFrame
 def AggregateEventBatches(session: SparkSession, flattenedEvents: DataFrame): DataFrame
}

abstract class FortisAggregatorBase extends FortisAggregator {
  override def DfTableNameFlattenedEvents: String = s"$DataFrameNameFlattenedEvents$FortisTargetTablename"

  override def DfTableNameComputedAggregates: String = s"$DataFrameNameComputed$FortisTargetTablename"
} 
Developer: CatalystCode | Project: project-fortis-spark | Lines of code: 32 | Source file: FortisAggregator.scala


Example 13: Xref

// Set the package name and import the required classes
package com.nextgendata.app.source.cif

import com.nextgendata.framework.Job
import org.apache.spark.sql.Dataset


object Xref {
  def getXref: Dataset[XrefRow] = {
    val file = Job.sc.textFile("examples/spark_repl_demo/cif_xref.txt")

    val sqlContext = Job.sqlContext
    // this is used to implicitly convert an RDD to a DataFrame or Dataset.
    import sqlContext.implicits._

    //Have to wrap this Spark code in a block so that the header val is only scoped to this call
    //not the entire class.  If header val was class level, then the closure would try to serialize
    //this entire class and fail since it contains non-serializable objects (SQL/SparkContext)
    val fileRdd = {
      val header = file.first()

      file
        .filter(line => line != header)
        .map(_.split("\t"))
        .map(p => XrefRow(p(0), p(1), p(2).toInt))
      //.toDS()
    }

    fileRdd.toDS
  }
}

case class XrefRow (XrefSystem: String,
                    XrefId: String,
                    CIFId: Int) 
Developer: craigjar | Project: nextgendata | Lines of code: 35 | Source file: Xref.scala


Example 14: Customer

// Set the package name and import the required classes
package com.nextgendata.app.source.cif

import com.nextgendata.framework.Job
import org.apache.spark.sql.Dataset


object Customer {
  def getCustomers: Dataset[CustomerRow] = {
    val file = Job.sc.textFile("examples/spark_repl_demo/cif_customer.txt")

    val sqlContext = Job.sqlContext
    // this is used to implicitly convert an RDD to a DataFrame or Dataset.
    import sqlContext.implicits._

    //Have to wrap this Spark code in a block so that the header val is only scoped to this call
    //not the entire class.  If header val was class level, then the closure would try to serialize
    //this entire class and fail since it contains non-serializable objects (SQL/SparkContext)
    val fileRdd = {
      val header = file.first()

      file
        .filter(line => line != header)
        .map(_.split("\t"))
        .map(p => CustomerRow(p(0), p(1), p(2), p(3), p(4), p(5).toInt))
      //.toDS()
    }

    fileRdd.toDS
  }
}

case class CustomerRow (Name: String,
                    Address: String,
                    City:String,
                    PostalCode:String,
                    Phone: String,
                    CIFId: Int) 
Developer: craigjar | Project: nextgendata | Lines of code: 38 | Source file: Customer.scala


Example 15: Customer

// Set the package name and import the required classes
package com.nextgendata.app.target

import java.io.File

import org.apache.commons.io.FileUtils
import org.apache.spark.sql.Dataset


object Customer {
  def insert(customers: Dataset[CustomerRow]): Unit ={
    FileUtils.deleteDirectory(new File("target/Customer.txt"))
    customers.rdd.saveAsTextFile("target/Customer.txt")
  }
}

object BadCustomer {
  def insert(customers: Dataset[BadCustomerRow]): Unit ={
    FileUtils.deleteDirectory(new File("target/BadCustomer.txt"))
    customers.rdd.saveAsTextFile("target/BadCustomer.txt")
  }
}

case class CustomerRow(email: String, provinceCode: String, provinceName:String, countryName:String, postal: String, CIFId: Int)

case class BadCustomerRow(email: String, postal: String, CIFId: Int) 
Developer: craigjar | Project: nextgendata | Lines of code: 26 | Source file: Customer.scala


Example 16: dimensionTableName

// Set the package name and import the required classes
package org.alghimo.spark.dimensionalModelling

import org.apache.spark.sql.{Column, Dataset, Encoder, Encoders}


// Enclosing trait restored from the source file name; the type bound mirrors the companion
// trait in Example 17 and is an assumption. The `spark` session, `isCurrentColumnName` and the
// implicit Encoder[DIM] used below are declared elsewhere in the original trait and omitted here.
trait DimensionTableProvider[DIM <: (Product with Serializable)] {

  def dimensionTableName: String

  def tmpDimensionTableName: String = {
    val Array(dbName, tableName) = dimensionTableName.split('.')

    s"${dbName}.tmp_${tableName}"
  }

  def maxPartitionsInDimensionTable: Int = 400

  def dimensionTable(refresh: Boolean = false): Dataset[DIM] = {
    if (refresh) {
      spark.catalog.refreshTable(dimensionTableName)
    }

    spark.table(dimensionTableName).as[DIM]
  }

  def currentDimensions(refresh: Boolean = false): Dataset[DIM] = dimensionTable(refresh).filter(s"${isCurrentColumnName}")
  def notCurrentDimensions(refresh: Boolean = false): Dataset[DIM] = dimensionTable(refresh).filter(s"NOT ${isCurrentColumnName}")

  def save(ds: Dataset[DIM], useTempTable: Boolean = true): Dataset[DIM] = {
    println("Saving dimensions..")
    val toSave = if (useTempTable) {
      ds
        .coalesce(maxPartitionsInDimensionTable)
        .write
        .mode("overwrite")
        .saveAsTable(tmpDimensionTableName)

      spark.table(tmpDimensionTableName)
    } else {
      ds
    }

    toSave
      .coalesce(maxPartitionsInDimensionTable)
      .write
      .mode("overwrite")
      .saveAsTable(dimensionTableName)

    if (useTempTable) {
      spark.sql(s"DROP TABLE ${tmpDimensionTableName} PURGE")
    }

    dimensionTable(refresh = true)
  }
} 
Developer: alghimo | Project: spark-dimensional-modelling | Lines of code: 54 | Source file: DimensionTableProvider.scala


Example 17: enrichedWithDimensionEncoder

// Set the package name and import the required classes
package org.alghimo.spark.dimensionalModelling

import org.apache.spark.sql.{Dataset, Encoder}


trait EnrichedDimensionWithDimensionProvider[ENRICHED_DIM <: (Product with Serializable), DIM <: (Product with Serializable)] {
  type EnrichedWithDimension = (ENRICHED_DIM, DIM)
  type DimensionWithEnriched = (DIM, ENRICHED_DIM)
  type JoinEnrichedWithDimension = Dataset[EnrichedWithDimension]
  type JoinDimensionWithEnriched = Dataset[DimensionWithEnriched]

  implicit def enrichedWithDimensionEncoder: Encoder[(ENRICHED_DIM, DIM)]
  implicit def dimensionWithEnrichedEncoder: Encoder[(DIM, ENRICHED_DIM)]

  implicit def enrichedWithDimensionToDimensionWithEnriched(e: EnrichedWithDimension): DimensionWithEnriched = e.swap
  implicit def dimensionWithEnrichedToEnrichedWithDimension(d: DimensionWithEnriched): EnrichedWithDimension = d.swap
  implicit def joinEnrichedWithDimensionToJoinDimensionWithEnriched(e: JoinEnrichedWithDimension): JoinDimensionWithEnriched = e.map(_.swap)
  implicit def joinDimensionWithEnrichedToJoinEnrichedWithDimension(d: JoinDimensionWithEnriched): JoinEnrichedWithDimension = d.map(_.swap)
} 
Developer: alghimo | Project: spark-dimensional-modelling | Lines of code: 20 | Source file: EnrichedDimensionWithDimensionProvider.scala


Example 18: GloVe

// Set the package name and import the required classes
package org.apache.spark.ml.feature

import org.apache.spark.ml.Estimator
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.{DefaultParamsWritable, Identifiable}
import org.apache.spark.mllib.feature
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types.StructType

final class GloVe(override val uid: String)
  extends Estimator[GloVeModel] with GloVeBase with DefaultParamsWritable {

  def this() = this(Identifiable.randomUID("glove"))

  def setInputCol(value: String): this.type = set(inputCol, value)

  def setOutputCol(value: String): this.type = set(outputCol, value)

  def setDim(value: Int): this.type = set(dim, value)

  def setAlpha(value: Double): this.type = set(alpha, value)

  def setWindow(value: Int): this.type = set(window, value)

  def setStepSize(value: Double): this.type = set(stepSize, value)

  def setMaxIter(value: Int): this.type = set(maxIter, value)

  def setSeed(value: Long): this.type = set(seed, value)

  def setMinCount(value: Int): this.type = set(minCount, value)

  override def fit(dataset: Dataset[_]): GloVeModel = {
    transformSchema(dataset.schema, logging = true)
    val input = dataset.select($(inputCol)).rdd.map(_.getAs[Seq[String]](0))
    val wordVectors = new feature.GloVe()
      .setLearningRate($(stepSize))
      .setMinCount($(minCount))
      .setNumIterations($(maxIter))
      .setSeed($(seed))
      .setDim($(dim))
      .fit(input)
    copyValues(new GloVeModel(uid, wordVectors).setParent(this))
  }

  override def transformSchema(schema: StructType): StructType = {
    validateAndTransformSchema(schema)
  }

  override def copy(extra: ParamMap): GloVe = defaultCopy(extra)
} 
Developer: mdymczyk | Project: spark-miner | Lines of code: 52 | Source file: GloVe.scala
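
A usage sketch for the estimator above, assuming it is available under org.apache.spark.ml.feature as shown and that the accompanying GloVeModel behaves like any other spark.ml model; the token data and parameter values are illustrative.

import org.apache.spark.ml.feature.GloVe
import org.apache.spark.sql.SparkSession

object GloVeExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("glove-example").master("local[*]").getOrCreate()
    import spark.implicits._

    // Each row carries one tokenized sentence in the input column.
    val sentences = Seq(
      Seq("spark", "makes", "big", "data", "simple"),
      Seq("glove", "learns", "word", "vectors")
    ).toDF("tokens")

    val glove = new GloVe()
      .setInputCol("tokens")
      .setOutputCol("vectors")
      .setDim(50)
      .setMaxIter(10)

    val model = glove.fit(sentences) // trains and returns a GloVeModel
    model.transform(sentences).show(false)

    spark.stop()
  }
}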


Example 19: DawidSkenePartialModel

// Set the package name and import the required classes
package com.enriquegrodrigo.spark.crowd.types

import org.apache.spark.sql.Dataset
import org.apache.spark.broadcast.Broadcast


private[crowd] case class DawidSkenePartialModel(dataset: Dataset[DawidSkenePartial], params: Broadcast[DawidSkeneParams], 
                                  logLikelihood: Double, improvement: Double, nClasses: Int, 
                                  nAnnotators: Long) {

  def modify(nDataset: Dataset[DawidSkenePartial] =dataset, 
      nParams: Broadcast[DawidSkeneParams] =params, 
      nLogLikelihood: Double =logLikelihood, 
      nImprovement: Double =improvement, 
      nNClasses: Int =nClasses, 
      nNAnnotators: Long =nAnnotators) = 
        new DawidSkenePartialModel(nDataset, nParams, 
                                                nLogLikelihood, 
                                                nImprovement, 
                                                nNClasses, 
                                                nNAnnotators)
} 
Developer: enriquegrodrigo | Project: spark-crowd | Lines of code: 23 | Source file: DawidSkenePartialModel.scala


Example 20: GladPartialModel

// Set the package name and import the required classes
package com.enriquegrodrigo.spark.crowd.types

import org.apache.spark.sql.Dataset
import org.apache.spark.broadcast.Broadcast


private[crowd] case class GladPartialModel(dataset: Dataset[GladPartial], params: Broadcast[GladParams], 
                                  logLikelihood: Double, improvement: Double, 
                                  nAnnotators: Int) {

  def modify(nDataset: Dataset[GladPartial] =dataset, 
      nParams: Broadcast[GladParams] =params, 
      nLogLikelihood: Double =logLikelihood, 
      nImprovement: Double =improvement, 
      nNAnnotators: Int =nAnnotators) = 
        new GladPartialModel(nDataset, nParams, 
                                                nLogLikelihood, 
                                                nImprovement, 
                                                nNAnnotators)
} 
Developer: enriquegrodrigo | Project: spark-crowd | Lines of code: 21 | Source file: GladPartialModel.scala



Note: The org.apache.spark.sql.Dataset class examples in this article were collected from GitHub, MSDocs, and other source-code and documentation platforms. The snippets were selected from open-source projects contributed by many developers; copyright remains with the original authors. Please consult each project's license before distributing or using the code, and do not republish this article without permission.

