This article collects typical usage examples of the Scala class org.apache.spark.sql.Dataset. If you are wondering what the Dataset class is for, how to use it, or what real-world usage looks like, the curated examples below should help.
The sections below present 20 code examples of the Dataset class, sorted by popularity by default.
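Before the real-world excerpts, here is a minimal, self-contained sketch of the two most common ways to obtain a Dataset: spark.createDataset on a local collection, and the toDS() implicit brought in by spark.implicits._. The session, case class, and values below are illustrative only and are not taken from any of the projects that follow.
import org.apache.spark.sql.{Dataset, SparkSession}

// A simple case class; spark.implicits._ derives an Encoder for it automatically.
case class Person(name: String, age: Int)

object DatasetQuickStart {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("dataset-quick-start").master("local[*]").getOrCreate()
    import spark.implicits._

    // 1) Build a typed Dataset directly from a local collection.
    val people: Dataset[Person] = spark.createDataset(Seq(Person("Ann", 32), Person("Bo", 41)))

    // 2) Or convert a Seq with the toDS() implicit.
    val ages: Dataset[Int] = Seq(31, 42).toDS()

    // Typed transformations keep the element type; show() prints the contents.
    people.filter(_.age > 35).show(false)
    ages.map(_ + 1).show(false)

    spark.stop()
  }
}
The examples that follow exercise the same building blocks (createDataset, toDS, encoders, and typed map/filter/groupByKey) in larger, production-style code.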
Example 1: apply
// Set up the package name and import the required classes
package org.dama.datasynth.runtime.spark.operators
import org.apache.spark.sql.{Dataset, SparkSession}
import org.dama.datasynth.executionplan.ExecutionPlan.EdgeTable
import org.dama.datasynth.runtime.spark.SparkRuntime
import scala.util.Random
object EdgeTableOperator { // enclosing object name assumed from the source file name
def apply(node: EdgeTable): Dataset[(Long, Long, Long)] = {
val sparkSession = SparkRuntime.getSparkSession()
import sparkSession.implicits._
val generator = SparkRuntime.instantiateStructureGeneratorOperator( node.structure )
val size = SparkRuntime.evalValueOperator(node.size).asInstanceOf[Long]
val random : Random = new Random()
val id : Int = random.nextInt()
val path : String = s"/tmp/${id}"
val sparkContext = sparkSession.sparkContext
generator.run(size, sparkContext.hadoopConfiguration,"hdfs://"+path)
val edgesRDD = sparkContext.textFile(path)
.map( s => s.split("\t"))
.map( l => (l(0).toLong, l(1).toLong))
.zipWithIndex().map( { case ((tail,head), id) => (id, tail, head)})
sparkSession.createDataset(edgesRDD)
}
}
Developer: DAMA-UPC, Project: DataSynth, Lines of code: 28, Source file: EdgeTableOperator.scala
Example 2: PrecipSource
// Set up the package name and import the required classes
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Dataset, SparkSession}
case class PrecipSource(sourceId: Int,
name: String,
countryCode: String,
latitude: String,
longitude: String,
elevation: Int,
elementId: String,
beginDate: String,
endDate: String,
participantId: Int,
participantName: String
)
case class Precipication(stationId: Int,
sourceId: Int,
date: String,
amount: Int,
quality: Int
)
class Mappers() {
def precipicationDF(spark: SparkSession, sourceFilPath: String): Dataset[Precipication] = {
import spark.implicits._
var sourceFile: RDD[String] = spark.sparkContext.textFile(sourceFilPath)
val header = spark.sparkContext.parallelize(sourceFile.take(20))
sourceFile = sourceFile.subtract(header)
header.unpersist()
val precipitionDF: Dataset[Precipication] = sourceFile
.map(s => s.split(",")
.map(_.trim()))
.map(fields => Precipication(
stationId = fields(0).toInt,
sourceId = fields(1).toInt,
date = fields(2),
amount = fields(3).toInt,
quality = fields(4).toInt
))
.toDS()
precipitionDF.show(false)
precipitionDF
}
}
Developer: luxinator, Project: RainyDay, Lines of code: 54, Source file: Mappers.scala
Example 3: DataFrameFunctions
// Set up the package name and import the required classes
package com.bloomberg.sparkflow.dc
import org.apache.spark.sql.{Column, Dataset, Row}
class DataFrameFunctions(self: DC[Row]) {
def join(right: DC[Row]): DC[Row] = {
val f = (left: Dataset[_], right: Dataset[_]) => {
left.join(right)
}
val hashTarget = Seq("join")
new MultiDatasetTransformDC(self, right, f, hashTarget)
}
def join(right: DC[Row], usingColumn: String): DC[Row] = {
val f = (left: Dataset[_], right: Dataset[_]) => {
left.join(right, usingColumn)
}
val hashTarget = Seq("join", usingColumn)
new MultiDatasetTransformDC(self, right, f, hashTarget)
}
def join(right: DC[Row], joinExprs: Column): DC[Row] = join(right, joinExprs, "inner")
def join(right: DC[Row], joinExprs: Column, joinType: String): DC[Row] = {
val f = (left: Dataset[_], right: Dataset[_]) => {
left.join(right, joinExprs, joinType)
}
val hashTarget = Seq("join", joinType, joinExprs.toString())
new MultiDatasetTransformDC(self, right, f, hashTarget)
}
}
Developer: bloomberg, Project: spark-flow, Lines of code: 36, Source file: DataFrameFunctions.scala
Example 4: MiniPanda
// Set up the package name and import the required classes
package com.highperformancespark.examples.ml
import com.highperformancespark.examples.dataframe.HappyPandas.{PandaInfo, Pandas}
import com.holdenkarau.spark.testing._
import org.apache.spark.ml._
import org.apache.spark.ml.feature._
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Dataset, Row, SQLContext}
import org.scalatest.Matchers._
import org.scalatest.FunSuite
case class MiniPanda(happy: Double, fuzzy: Double, old: Double)
class SimpleNaiveBayesSuite extends FunSuite with DataFrameSuiteBase {
val miniPandasList = List(
MiniPanda(1.0, 1.0, 1.0),
MiniPanda(1.0, 1.0, 0.0),
MiniPanda(1.0, 1.0, 0.0),
MiniPanda(0.0, 0.0, 1.0),
MiniPanda(0.0, 0.0, 0.0))
test("simple sanity test") {
val session = spark
import session.implicits._
val ds: Dataset[MiniPanda] = session.createDataset(miniPandasList)
val assembler = new VectorAssembler()
assembler.setInputCols(Array("fuzzy", "old"))
assembler.setOutputCol("features")
val snb = new SimpleNaiveBayes()
snb.setLabelCol("happy")
snb.setFeaturesCol("features")
val pipeline = new Pipeline().setStages(Array(assembler, snb))
val model = pipeline.fit(ds)
val test = ds.select("fuzzy", "old")
val predicted = model.transform(test)
predicted.collect().foreach(println)
}
}
Developer: gourimahapatra, Project: high-performance-spark, Lines of code: 41, Source file: SimpleNaiveBayes.scala
Example 5: Partition
// Set up the package name and import the required classes
package br.ufmg.cs.lib.privacy.kanonymity
import org.apache.spark.sql.{Dataset, Row, SparkSession}
import org.apache.spark.sql.functions._
case class Partition(
member: Dataset[Row],
memberCount: Long,
low: Array[Int],
high: Array[Int],
allow: Array[Int]) {
override def toString: String = {
s"Partition(memberlen=${memberCount}" +
s", low=${low.mkString("[", ", ", "]")}" +
s", high=${high.mkString("[", ",", "]")}" +
s", allow=${allow.mkString("[", ",", "]")})"
}
}
object Partition {
def apply(member: Dataset[Row], memberCount: Long,
low: Array[Int], high: Array[Int]): Partition = {
Partition(member, memberCount, low, high, Array.fill(low.length)(1))
}
}
Developer: eubr-bigsea, Project: k-anonymity-mondrian, Lines of code: 28, Source file: Partition.scala
Example 6: cogroup
// Set up the package name and import the required classes
package com.qunar.spark.tungsten.base
import org.apache.spark.sql.Dataset
import com.qunar.spark.tungsten.base.CommonEncoders._
import scala.reflect.runtime.universe.TypeTag
object CoreJoinOperators { // enclosing object name assumed from the source file name
def cogroup[T: TypeTag, K: TypeTag](leftDataset: Dataset[T], rightDataset: Dataset[T], genJoinKey: T => K): Dataset[(Seq[T], Seq[T])] = {
// group leftDataset by the join key and collect each group into a Seq
val thisKeyValueSet = leftDataset.groupByKey(data => genJoinKey(data))
val thisCogroupSet = thisKeyValueSet.mapGroups((key, dataIter) => {
val builder = Seq.newBuilder[T]
for (data <- dataIter) {
builder += data
}
(key, builder.result)
}).toDF("_1", "_2").as[(K, Seq[T])]
// group rightDataset by the join key in the same way
val anotherKeyValueSet = rightDataset.groupByKey(data => genJoinKey(data))
val anotherCogroupSet = anotherKeyValueSet.mapGroups((key, dataIter) => {
val builder = Seq.newBuilder[T]
for (data <- dataIter) {
builder += data
}
(key, builder.result)
}).toDF("_1", "_2").as[(K, Seq[T])]
val resultDataFrame = thisCogroupSet.join(anotherCogroupSet, thisCogroupSet("_1") === anotherCogroupSet("_1"), "outer")
resultDataFrame.as[(Seq[T], Seq[T])]
}
}
Developer: spark-bypass-common, Project: common-tungsten, Lines of code: 35, Source file: CoreJoinOperators.scala
Example 7: OpenIE
// Set up the package name and import the required classes
package com.knoldus
import edu.stanford.nlp.simple.{Document, Sentence}
import edu.stanford.nlp.util.Quadruple
import org.apache.spark.sql.{Dataset, SparkSession}
import org.apache.spark.sql.functions.udf
import scala.collection.JavaConverters._
private case class OpenIE(subject: String, relation: String, target: String, confidence: Double) {
def this(quadruple: Quadruple[String, String, String, java.lang.Double]) =
this(quadruple.first, quadruple.second, quadruple.third, quadruple.fourth)
}
object StartApplication extends App{
val spark = SparkSession.builder().appName("spark-nlp-starter").master("local[*]").getOrCreate()
val sc = spark.sparkContext
val readPdfFile: Dataset[String] = spark.read.textFile("test")
readPdfFile.show(false)
def openie = udf { sentence: String =>
new Sentence(sentence).openie().asScala.map(q => new OpenIE(q)).toSeq
}
val res = readPdfFile.select(openie(readPdfFile("value")))
res.show(false)
}
Developer: shiv4nsh, Project: scala-spark-nlp-starter-kit, Lines of code: 30, Source file: StartApplication.scala
Example 8: RowProfiler
// Set up the package name and import the required classes
package io.gzet.profilers.raw
import org.apache.spark.sql.Dataset
case class RowProfiler() {
def profile(df: Dataset[String]): Dataset[RowReport] = {
import df.sparkSession.implicits._
val report = RowReport(df.count().toDouble)
df.sparkSession.createDataset[RowReport](
Seq(report)
)
}
}
case class RowReport(
metricValue: Double
)
Developer: PacktPublishing, Project: Mastering-Spark-for-Data-Science, Lines of code: 19, Source file: RowProfiler.scala
Example 9: EmptinessProfiler
// Set up the package name and import the required classes
package io.gzet.profilers.field
import io.gzet.profilers.Utils
import org.apache.commons.lang3.StringUtils
import org.apache.spark.sql.Dataset
import scalaz.Scalaz._
case class EmptinessProfiler() {
def profile(df: Dataset[Array[String]]): Dataset[EmptinessReport] = {
import df.sparkSession.implicits._
val features = Utils.buildColumns(df)
features.map(f => (f.idx, StringUtils.isNotEmpty(f.value))).groupByKey({ case (column, isNotEmpty) =>
(column, isNotEmpty)
}).count().map({ case ((column, isNotEmpty), count) =>
(column, Map(isNotEmpty -> count))
}).groupByKey({ case (column, map) =>
column
}).reduceGroups({ (v1, v2) =>
(v1._1, v1._2 |+| v2._2)
}).map({ case (col, (_, map)) =>
val emptiness = map.getOrElse(false, 0L) / (map.getOrElse(true, 0L) + map.getOrElse(false, 0L)).toDouble
EmptinessReport(
col,
emptiness
)
})
}
}
case class EmptinessReport(
field: Int,
metricValue: Double
)
Developer: PacktPublishing, Project: Mastering-Spark-for-Data-Science, Lines of code: 41, Source file: EmptinessProfiler.scala
Example 10: CountSentencesByLanguage
// Set up the package name and import the required classes
package biz.meetmatch.modules
import biz.meetmatch.model.{Sentence, SentenceCountByLanguage}
import org.apache.spark.sql.{Dataset, SparkSession}
import org.rogach.scallop.Scallop
object CountSentencesByLanguage extends Module with ParquetExtensions[SentenceCountByLanguage] {
override val parquetFile = "SentenceCountsByLanguage"
override def execute(scallopts: Scallop)(implicit sparkSession: SparkSession): Unit = {
val sentenceDS = DetectLanguage.loadResultsFromParquet
val sentenceCountByLanguageDS = calc(sentenceDS)
saveResultsToParquet(sentenceCountByLanguageDS)
}
def calc(sentenceDS: Dataset[Sentence])(implicit sparkSession: SparkSession): Dataset[SentenceCountByLanguage] = {
import sparkSession.implicits._
sparkSession.sparkContext.setJobGroup(this.getClass.getName, this.getClass.getName)
sparkSession.sparkContext.setJobDescription("Count the sentences by language")
// TASK 2: count how many sentences exist for each detected language and save the results in the SentenceCountByLanguage case class
// when finished coding:
// - package, deploy and submit the spark application and verify the results using spark shell or a notebook (see https://github.com/tolomaus/languagedetector section Quick start - usage)
// - verify the logs of the executed module in the language detector UI
// solution:
sentenceDS
.groupByKey(_.detectedLanguage)
.count
.map { case (language, count) => SentenceCountByLanguage(language, count) }
}
def loadResultsFromParquet(implicit module: Class[_] = this.getClass, sparkSession: SparkSession): Dataset[SentenceCountByLanguage] = {
import sparkSession.implicits._
loadResultsFromParquetAsDF(module, sparkSession).as[SentenceCountByLanguage]
}
}
Developer: tolomaus, Project: languagedetector, Lines of code: 43, Source file: CountSentencesByLanguage.scala
Example 11: SQLDataProvider
// Set up the package name and import the required classes
package org.ieee.codemeow.geometric.spark.data
import com.vividsolutions.jts.geom.Geometry
import org.apache.spark.sql.{Dataset, Encoders, SparkSession}
import org.ieee.codemeow.geometric.Feature
import org.ieee.codemeow.geometric.spark.LayerConfiguration
class SQLDataProvider(_spark: SparkSession, _layer: LayerConfiguration) extends AbstractDataProvider(_spark, _layer){
val url = layer.kwargs.get("url").get.asInstanceOf[String]
val dbtables = layer.kwargs.get("dbtables").get.asInstanceOf[Map[String, String]]
val user = layer.kwargs.get("user").get.asInstanceOf[String]
val password = layer.kwargs.get("password").get.asInstanceOf[String]
val zoomConfig = layer.kwargs.get("zooms").get.asInstanceOf[Map[String, String]]
// load all tables
dbtables.foreach(tuple => {
val sparkTableName = tuple._1
val realTableName = tuple._2
val mapDataFrame = spark.read.format("jdbc")
.option("url", url)
.option("user", user)
.option("password", password)
.option("dbtable", realTableName).load
mapDataFrame.createOrReplaceTempView(sparkTableName)
})
override def getFeatures(layerName: String, zoom: Long): Option[Dataset[Feature]] ={
// Ref http://stackoverflow.com/questions/38664972/why-is-unable-to-find-encoder-for-type-stored-in-a-dataset-when-creating-a-dat
import spark.implicits._
// Ref http://stackoverflow.com/questions/36648128/how-to-store-custom-objects-in-dataset
implicit val featureEncoder = Encoders.kryo[Feature]
val natureSQL = zoomConfig.get(zoom.toString)
if(natureSQL.isEmpty){
return None
}
val rawDF = spark.sql(natureSQL.get)
val featureCollection = rawDF.map(row => {
val id = row.getAs[Long]("__id__")
val geom = row.getAs[Geometry]("__geometry__")
val fields = row.schema.filter(field => {
!Seq("__id__", "__geometry__").contains(field.name)
}).map(field => field.name)
val props = row.getValuesMap[String](fields)
Feature(id, geom, props)
})
Some(featureCollection)
}
}
Developer: codemeow5, Project: Vector-Tile-Spark-Process, Lines of code: 56, Source file: SQLDataProvider.scala
Example 12: FortisTargetTablename
// Set up the package name and import the required classes
package com.microsoft.partnercatalyst.fortis.spark.sinks.cassandra.aggregators
import com.microsoft.partnercatalyst.fortis.spark.sinks.cassandra.dto.{AggregationRecord, Event, EventBatchEntry}
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
trait FortisAggregator {
protected val KeyspaceName = "fortis"
protected val CassandraFormat = "org.apache.spark.sql.cassandra"
protected val AggregateFunctions = "sum(mentioncount) as mentioncountagg, SentimentWeightedAvg(IF(IsNull(avgsentiment), 0, avgsentiment), IF(IsNull(mentioncount), 0, mentioncount)) as avgsentimentagg"
//protected val IncrementalUpdateMentionsUDF = "SumMentions(a.mentioncountagg, IF(IsNull(b.mentioncount), 0, b.mentioncount)) as mentioncount"
//protected val IncrementalUpdateSentimentUDF = "MeanAverage(a.avgsentimentagg, a.mentioncountagg, IF(IsNull(b.avgsentiment), 0, b.avgsentiment), IF(IsNull(b.mentioncount), 0, b.mentioncount)) as avgsentiment"
protected val IncrementalUpdateMentionsUDF = "a.mentioncountagg as mentioncount"
protected val IncrementalUpdateSentimentUDF = "MeanAverage(a.avgsentimentagg, a.mentioncountagg) as avgsentimentnumerator"
protected val DataFrameNameFlattenedEvents = "flattenedEventsDF"
protected val DataFrameNameComputed = "computedDF"
def FortisTargetTablename: String
def DfTableNameFlattenedEvents: String
def DfTableNameComputedAggregates: String
def FortisTargetTableDataFrame(session:SparkSession): DataFrame
def flattenEvents(session: SparkSession, eventDS: Dataset[Event]): DataFrame
def IncrementalUpdate(session:SparkSession, aggregatedDS: DataFrame): DataFrame
def AggregateEventBatches(session: SparkSession, flattenedEvents: DataFrame): DataFrame
}
abstract class FortisAggregatorBase extends FortisAggregator {
override def DfTableNameFlattenedEvents: String = s"$DataFrameNameFlattenedEvents$FortisTargetTablename"
override def DfTableNameComputedAggregates: String = s"$DataFrameNameComputed$FortisTargetTablename"
}
Developer: CatalystCode, Project: project-fortis-spark, Lines of code: 32, Source file: FortisAggregator.scala
Example 13: Xref
// Set up the package name and import the required classes
package com.nextgendata.app.source.cif
import com.nextgendata.framework.Job
import org.apache.spark.sql.Dataset
object Xref {
def getXref: Dataset[XrefRow] = {
val file = Job.sc.textFile("examples/spark_repl_demo/cif_xref.txt")
val sqlContext = Job.sqlContext
// this is used to implicitly convert an RDD to a DataFrame or Dataset.
import sqlContext.implicits._
//Have to wrap this Spark code in a block so that the header val is only scoped to this call
//not the entire class. If header val was class level, then the closure would try to serialize
//this entire class and fail since it contains non-serializable objects (SQL/SparkContext)
val fileRdd = {
val header = file.first()
file
.filter(line => line != header)
.map(_.split("\t"))
.map(p => XrefRow(p(0), p(1), p(2).toInt))
//.toDS()
}
fileRdd.toDS
}
}
case class XrefRow (XrefSystem: String,
XrefId: String,
CIFId: Int)
Developer: craigjar, Project: nextgendata, Lines of code: 35, Source file: Xref.scala
Example 14: Customer
// Set up the package name and import the required classes
package com.nextgendata.app.source.cif
import com.nextgendata.framework.Job
import org.apache.spark.sql.Dataset
object Customer {
def getCustomers: Dataset[CustomerRow] = {
val file = Job.sc.textFile("examples/spark_repl_demo/cif_customer.txt")
val sqlContext = Job.sqlContext
// this is used to implicitly convert an RDD to a DataFrame or Dataset.
import sqlContext.implicits._
//Have to wrap this Spark code in a block so that the header val is only scoped to this call
//not the entire class. If header val was class level, then the closure would try to serialize
//this entire class and fail since it contains non-serializable objects (SQL/SparkContext)
val fileRdd = {
val header = file.first()
file
.filter(line => line != header)
.map(_.split("\t"))
.map(p => CustomerRow(p(0), p(1), p(2), p(3), p(4), p(5).toInt))
//.toDS()
}
fileRdd.toDS
}
}
case class CustomerRow (Name: String,
Address: String,
City:String,
PostalCode:String,
Phone: String,
CIFId: Int)
Developer: craigjar, Project: nextgendata, Lines of code: 38, Source file: Customer.scala
Example 15: Customer
// Set up the package name and import the required classes
package com.nextgendata.app.target
import java.io.File
import org.apache.commons.io.FileUtils
import org.apache.spark.sql.Dataset
object Customer {
def insert(customers: Dataset[CustomerRow]): Unit ={
FileUtils.deleteDirectory(new File("target/Customer.txt"))
customers.rdd.saveAsTextFile("target/Customer.txt")
}
}
object BadCustomer {
def insert(customers: Dataset[BadCustomerRow]): Unit ={
FileUtils.deleteDirectory(new File("target/BadCustomer.txt"))
customers.rdd.saveAsTextFile("target/BadCustomer.txt")
}
}
case class CustomerRow(email: String, provinceCode: String, provinceName:String, countryName:String, postal: String, CIFId: Int)
case class BadCustomerRow(email: String, postal: String, CIFId: Int)
Developer: craigjar, Project: nextgendata, Lines of code: 26, Source file: Customer.scala
Example 16: dimensionTableName
// Set up the package name and import the required classes
package org.alghimo.spark.dimensionalModelling
import org.apache.spark.sql.{Column, Dataset, Encoder, Encoders, SparkSession}
trait DimensionTableProvider[DIM] { // enclosing trait assumed from the source file name; the three members below stand in for definitions elsewhere in the original code
protected def spark: SparkSession
protected def isCurrentColumnName: String
protected implicit def dimEncoder: Encoder[DIM]
def dimensionTableName: String
def tmpDimensionTableName: String = {
val Array(dbName, tableName) = dimensionTableName.split('.')
s"${dbName}.tmp_${tableName}"
}
def maxPartitionsInDimensionTable: Int = 400
def dimensionTable(refresh: Boolean = false): Dataset[DIM] = {
if (refresh) {
spark.catalog.refreshTable(dimensionTableName)
}
spark.table(dimensionTableName).as[DIM]
}
def currentDimensions(refresh: Boolean = false): Dataset[DIM] = dimensionTable(refresh).filter(s"${isCurrentColumnName}")
def notCurrentDimensions(refresh: Boolean = false): Dataset[DIM] = dimensionTable(refresh).filter(s"NOT ${isCurrentColumnName}")
def save(ds: Dataset[DIM], useTempTable: Boolean = true): Dataset[DIM] = {
println("Saving dimensions..")
val toSave = if (useTempTable) {
ds
.coalesce(maxPartitionsInDimensionTable)
.write
.mode("overwrite")
.saveAsTable(tmpDimensionTableName)
spark.table(tmpDimensionTableName)
} else {
ds
}
toSave
.coalesce(maxPartitionsInDimensionTable)
.write
.mode("overwrite")
.saveAsTable(dimensionTableName)
if (useTempTable) {
spark.sql(s"DROP TABLE ${tmpDimensionTableName} PURGE")
}
dimensionTable(refresh = true)
}
}
Developer: alghimo, Project: spark-dimensional-modelling, Lines of code: 54, Source file: DimensionTableProvider.scala
Example 17: enrichedWithDimensionEncoder
// Set up the package name and import the required classes
package org.alghimo.spark.dimensionalModelling
import org.apache.spark.sql.{Dataset, Encoder}
trait EnrichedDimensionWithDimensionProvider[ENRICHED_DIM <: (Product with Serializable), DIM <: (Product with Serializable)] {
type EnrichedWithDimension = (ENRICHED_DIM, DIM)
type DimensionWithEnriched = (DIM, ENRICHED_DIM)
type JoinEnrichedWithDimension = Dataset[EnrichedWithDimension]
type JoinDimensionWithEnriched = Dataset[DimensionWithEnriched]
implicit def enrichedWithDimensionEncoder: Encoder[(ENRICHED_DIM, DIM)]
implicit def dimensionWithEnrichedEncoder: Encoder[(DIM, ENRICHED_DIM)]
implicit def enrichedWithDimensionToDimensionWithEnriched(e: EnrichedWithDimension): DimensionWithEnriched = e.swap
implicit def dimensionWithEnrichedToEnrichedWithDimension(d: DimensionWithEnriched): EnrichedWithDimension = d.swap
implicit def joinEnrichedWithDimensionToJoinDimensionWithEnriched(e: JoinEnrichedWithDimension): JoinDimensionWithEnriched = e.map(_.swap)
implicit def joinDimensionWithEnrichedToJoinEnrichedWithDimension(d: JoinDimensionWithEnriched): JoinEnrichedWithDimension = d.map(_.swap)
}
Developer: alghimo, Project: spark-dimensional-modelling, Lines of code: 20, Source file: EnrichedDimensionWithDimensionProvider.scala
Example 18: GloVe
// Set up the package name and import the required classes
package org.apache.spark.ml.feature
import org.apache.spark.ml.Estimator
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.{DefaultParamsWritable, Identifiable}
import org.apache.spark.mllib.feature
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types.StructType
final class GloVe(override val uid: String)
extends Estimator[GloVeModel] with GloVeBase with DefaultParamsWritable {
def this() = this(Identifiable.randomUID("glove"))
def setInputCol(value: String): this.type = set(inputCol, value)
def setOutputCol(value: String): this.type = set(outputCol, value)
def setDim(value: Int): this.type = set(dim, value)
def setAlpha(value: Double): this.type = set(alpha, value)
def setWindow(value: Int): this.type = set(window, value)
def setStepSize(value: Double): this.type = set(stepSize, value)
def setMaxIter(value: Int): this.type = set(maxIter, value)
def setSeed(value: Long): this.type = set(seed, value)
def setMinCount(value: Int): this.type = set(minCount, value)
override def fit(dataset: Dataset[_]): GloVeModel = {
transformSchema(dataset.schema, logging = true)
val input = dataset.select($(inputCol)).rdd.map(_.getAs[Seq[String]](0))
val wordVectors = new feature.GloVe()
.setLearningRate($(stepSize))
.setMinCount($(minCount))
.setNumIterations($(maxIter))
.setSeed($(seed))
.setDim($(dim))
.fit(input)
copyValues(new GloVeModel(uid, wordVectors).setParent(this))
}
override def transformSchema(schema: StructType): StructType = {
validateAndTransformSchema(schema)
}
override def copy(extra: ParamMap): GloVe = defaultCopy(extra)
}
Developer: mdymczyk, Project: spark-miner, Lines of code: 52, Source file: GloVe.scala
Example 19: DawidSkenePartialModel
// Set up the package name and import the required classes
package com.enriquegrodrigo.spark.crowd.types
import org.apache.spark.sql.Dataset
import org.apache.spark.broadcast.Broadcast
private[crowd] case class DawidSkenePartialModel(dataset: Dataset[DawidSkenePartial], params: Broadcast[DawidSkeneParams],
logLikelihood: Double, improvement: Double, nClasses: Int,
nAnnotators: Long) {
def modify(nDataset: Dataset[DawidSkenePartial] =dataset,
nParams: Broadcast[DawidSkeneParams] =params,
nLogLikelihood: Double =logLikelihood,
nImprovement: Double =improvement,
nNClasses: Int =nClasses,
nNAnnotators: Long =nAnnotators) =
new DawidSkenePartialModel(nDataset, nParams,
nLogLikelihood,
nImprovement,
nNClasses,
nNAnnotators)
}
Developer: enriquegrodrigo, Project: spark-crowd, Lines of code: 23, Source file: DawidSkenePartialModel.scala
Example 20: GladPartialModel
// Set up the package name and import the required classes
package com.enriquegrodrigo.spark.crowd.types
import org.apache.spark.sql.Dataset
import org.apache.spark.broadcast.Broadcast
private[crowd] case class GladPartialModel(dataset: Dataset[GladPartial], params: Broadcast[GladParams],
logLikelihood: Double, improvement: Double,
nAnnotators: Int) {
def modify(nDataset: Dataset[GladPartial] =dataset,
nParams: Broadcast[GladParams] =params,
nLogLikelihood: Double =logLikelihood,
nImprovement: Double =improvement,
nNAnnotators: Int =nAnnotators) =
new GladPartialModel(nDataset, nParams,
nLogLikelihood,
nImprovement,
nNAnnotators)
}
Developer: enriquegrodrigo, Project: spark-crowd, Lines of code: 21, Source file: GladPartialModel.scala
Note: The org.apache.spark.sql.Dataset examples in this article were collected from GitHub, MSDocs, and other source-code and documentation platforms. The snippets were selected from open-source projects contributed by their respective authors; copyright remains with the original authors, and any redistribution or reuse should follow each project's license. Do not republish without permission.