本文整理汇总了Scala中org.apache.spark.sql.Row类的典型用法代码示例。如果您正苦于以下问题:Scala Row类的具体用法?Scala Row怎么用?Scala Row使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了Row类的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Scala代码示例。
示例1: SimpleApp
//设置package包名称以及导入依赖的类
import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import org.apache.spark.ml.clustering.LDA
import org.apache.spark.mllib.linalg.{VectorUDT, Vectors}
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.sql.types.{StructField, StructType}
object SimpleApp {
def main(args: Array[String]) {
val conf = new SparkConf().setAppName("Simple Application").set("spark.ui.enabled", "false")
val sc = new SparkContext(conf)
val sqlContext = new SQLContext(sc)
// Loads data
val rowRDD = sc.textFile("/tmp/lda_data.txt").filter(_.nonEmpty)
.map(_.split(" ").map(_.toDouble)).map(Vectors.dense).map(Row(_))
val schema = StructType(Array(StructField("name", new VectorUDT, false)))
val dataset = sqlContext.createDataFrame(rowRDD, schema)
dataset.show()
val lda = new LDA()
.setK(10)
.setMaxIter(10)
.setFeaturesCol("name")
val model = lda.fit(dataset)
val transformed = model.transform(dataset)
val ll = model.logLikelihood(dataset)
val lp = model.logPerplexity(dataset)
// describeTopics
val topics = model.describeTopics(3)
// Shows the result
topics.show(false)
transformed.show(false)
}
}
开发者ID:mykumar,项目名称:SparkScalaInternalExperiements,代码行数:41,代码来源:SimpleApp.scala
示例2: LRCV
//设置package包名称以及导入依赖的类
package com.ferhtaydn.rater
import org.apache.spark.SparkContext
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.feature.{ StringIndexerModel, VectorAssembler }
import org.apache.spark.ml.tuning.{ CrossValidator, CrossValidatorModel, ParamGridBuilder }
import org.apache.spark.mllib.linalg.Matrix
import org.apache.spark.sql.{ DataFrame, Row, SQLContext }
class LRCV(sc: SparkContext) {
implicit val sqlContext = new SQLContext(sc)
val lr = new LogisticRegression().setMaxIter(10).setFeaturesCol("scaledFeatures")
val paramGrid = new ParamGridBuilder()
.addGrid(lr.regParam, Array(0.1, 0.01))
.build()
val assembler = new VectorAssembler()
.setInputCols(Array("gender", "age", "weight", "height", "indexedJob"))
.setOutputCol("features")
val pipeline = new Pipeline()
.setStages(Array(assembler, standardScaler("features"), lr))
val cv = new CrossValidator()
.setEstimator(pipeline)
.setEvaluator(new BinaryClassificationEvaluator)
.setEstimatorParamMaps(paramGrid)
.setNumFolds(10)
def train(df: DataFrame): (StringIndexerModel, CrossValidatorModel, Matrix) = {
// need to index strings on all data to not missing the job fields.
// other alternative can be manually assign values for each job like gender.
val indexerModel = stringIndexer("job").fit(df)
val indexed = indexerModel.transform(df)
val splits = indexed.randomSplit(Array(0.8, 0.2))
val training = splits(0).cache()
val test = splits(1)
val cvModel = cv.fit(training)
val predictionAndLabels = cvModel
.transform(test)
.select("label", "prediction").map {
case Row(label: Double, prediction: Double) ?
(prediction, label)
}
printBinaryMetrics(predictionAndLabels)
(indexerModel, cvModel, confusionMatrix(predictionAndLabels))
}
}
开发者ID:ferhtaydn,项目名称:canceRater,代码行数:62,代码来源:LRCV.scala
示例3: RealEstateData
//设置package包名称以及导入依赖的类
package fr.grislain
import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import org.apache.spark.sql.{ SQLContext, DataFrame, Row }
import org.apache.spark.sql.types._
object RealEstateData {
println("Starting real_estate_price")
val conf = new SparkConf().setAppName("real_estate_price").setMaster("local")
val context = new SparkContext(conf)
val sqlContext = new SQLContext(context)
def dataFrame: DataFrame = {
val input = context.textFile("../data/insee_notaires.csv")
sqlContext.createDataFrame(input mapPartitions { _.drop(1) } map {
line =>
Row.fromSeq(line.split(",").view.zipWithIndex filter { e => e._2 > 0 } flatMap {
e =>
e match {
case (t, 1) => Seq(t.take(4).toInt, t.drop(5).toInt)
case (p, _) => Seq(p.toDouble)
}
})
},
StructType(StructField("year", IntegerType) ::
StructField("quarter", IntegerType) ::
StructField("75001", DoubleType) ::
StructField("75002", DoubleType) ::
StructField("75003", DoubleType) ::
StructField("75004", DoubleType) ::
StructField("75005", DoubleType) ::
StructField("75006", DoubleType) ::
StructField("75007", DoubleType) ::
StructField("75008", DoubleType) ::
StructField("75009", DoubleType) ::
StructField("75010", DoubleType) ::
StructField("75011", DoubleType) ::
StructField("75012", DoubleType) ::
StructField("75013", DoubleType) ::
StructField("75014", DoubleType) ::
StructField("75015", DoubleType) ::
StructField("75016", DoubleType) ::
StructField("75017", DoubleType) ::
StructField("75018", DoubleType) ::
StructField("75019", DoubleType) ::
StructField("75020", DoubleType) :: Nil))
}
}
开发者ID:ngrislain,项目名称:french_real_estate,代码行数:50,代码来源:RealEstateData.scala
示例4: StudyRDD
//设置package包名称以及导入依赖的类
package com.study.spark.datasource
import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{Row, SQLContext}
class StudyRDD(sqlContext: SQLContext, schema: StructType) extends RDD[Row](sqlContext.sparkContext, deps=Nil) {
@DeveloperApi
override def compute(split: Partition, context: TaskContext): Iterator[Row] = new StudyReader(context, schema, split)
// ??? ?? ????? 2?? ???? ??? ????.
// ? Executor? ???? ??? ????. ???? ???? 2? ??? ???, ??? ??? ? ?? Executor? ?? 2???.
override protected def getPartitions: Array[Partition] = {
val arr: Array[Partition] = new Array[Partition](2)
arr.update(0, new Partition() {
override def index: Int = 0
})
arr.update(1, new Partition() {
override def index: Int = 1
})
arr
}
}
开发者ID:hackpupu,项目名称:LML,代码行数:27,代码来源:StudyRDD.scala
示例5: DataFrameFunctions
//设置package包名称以及导入依赖的类
package com.bloomberg.sparkflow.dc
import org.apache.spark.sql.{Column, Dataset, Row}
class DataFrameFunctions(self: DC[Row]) {
def join(right: DC[Row]): DC[Row] = {
val f = (left: Dataset[_], right: Dataset[_]) => {
left.join(right)
}
val hashTarget = Seq("join")
new MultiDatasetTransformDC(self, right, f, hashTarget)
}
def join(right: DC[Row], usingColumn: String): DC[Row] = {
val f = (left: Dataset[_], right: Dataset[_]) => {
left.join(right, usingColumn)
}
val hashTarget = Seq("join", usingColumn)
new MultiDatasetTransformDC(self, right, f, hashTarget)
}
def join(right: DC[Row], joinExprs: Column): DC[Row] = join(right, joinExprs, "inner")
def join(right: DC[Row], joinExprs: Column, joinType: String): DC[Row] = {
val f = (left: Dataset[_], right: Dataset[_]) => {
left.join(right, joinExprs)
}
val hashTarget = Seq("join", joinType, joinExprs.toString())
new MultiDatasetTransformDC(self, right, f, hashTarget)
}
}
开发者ID:bloomberg,项目名称:spark-flow,代码行数:36,代码来源:DataFrameFunctions.scala
示例6: RatePredictor
//设置package包名称以及导入依赖的类
package com.ferhtaydn.rater
import akka.actor.ActorSystem
import com.ferhtaydn.models.PatientInfo
import org.apache.spark.ml.feature.StringIndexerModel
import org.apache.spark.ml.tuning.CrossValidatorModel
import org.apache.spark.mllib.linalg.{ Matrix, Vector }
import org.apache.spark.sql.{ Row, SQLContext }
import scala.concurrent.{ ExecutionContextExecutor, Future }
class RatePredictor(system: ActorSystem, sqlContext: SQLContext,
indexModel: StringIndexerModel, cvModel: CrossValidatorModel,
confusionMatrix: String) {
private val decimalFormatter = new java.text.DecimalFormat("##.##")
private val blockingDispatcher: ExecutionContextExecutor = system.dispatchers.lookup("ml.predictor.dispatcher")
def confusionMatrixString: Future[String] = {
Future {
confusionMatrix
}(blockingDispatcher)
}
def predict(patientInfo: PatientInfo): Future[Either[String, Double]] = {
Future {
val df = sqlContext.createDataFrame(Seq(patientInfo.toRecord))
val indexedJobDF = indexModel.transform(df)
val result = cvModel
.transform(indexedJobDF)
.select("prediction", "probability").map {
case Row(prediction: Double, probability: Vector) ?
(probability, prediction)
}
result.collect().headOption match {
case Some((prob, _)) ? Right(decimalFormatter.format(prob(1)).toDouble)
case None ? Left(s"No result can be predicted for the patient")
}
}(blockingDispatcher)
}
}
开发者ID:ferhtaydn,项目名称:canceRater,代码行数:48,代码来源:RatePredictor.scala
示例7: TikaMetadataRelation
//设置package包名称以及导入依赖的类
package com.jasonfeist.spark.tika
import org.apache.spark.input.PortableDataStream
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.sql.sources.{BaseRelation, TableScan}
import org.apache.spark.sql.types.{StructType}
import org.slf4j.LoggerFactory
class TikaMetadataRelation protected[tika] (path: String,
userSchema: StructType,
metadataExtractor: MetadataExtractor,
fieldDataExtractor: FieldDataExtractor)
(@transient val sqlContext: SQLContext)
extends BaseRelation with TableScan with Serializable {
val logger = LoggerFactory.getLogger(classOf[TikaMetadataRelation])
override def schema: StructType = this.userSchema
override def buildScan(): RDD[Row] = {
val rdd = sqlContext
.sparkContext.binaryFiles(path)
rdd.map(extractFunc(_))
}
def extractFunc(
file: (String, PortableDataStream)
) : Row =
{
val extractedData = metadataExtractor.extract(file)
val rowArray = new Array[Any](schema.fields.length)
var index = 0
while (index < schema.fields.length) {
val field = schema(index)
val fieldData = fieldDataExtractor.matchedField(field.name,
field.dataType, extractedData._1, file._1, extractedData._2,
extractedData._3)
rowArray(index) = fieldData
index = index + 1
}
Row.fromSeq(rowArray)
}
}
开发者ID:jasonfeist,项目名称:tika-spark-datasource,代码行数:47,代码来源:TikaMetadataRelation.scala
示例8: GenerateScalingData
//设置package包名称以及导入依赖的类
package com.highperformancespark.examples.tools
import com.highperformancespark.examples.dataframe.RawPanda
import org.apache.spark._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.mllib.random.RandomRDDs
import org.apache.spark.mllib.linalg.Vector
object GenerateScalingData {
def generateGoldilocks(sc: SparkContext, rows: Long, numCols: Int):
RDD[RawPanda] = {
val zipRDD = RandomRDDs.exponentialRDD(sc, mean = 1000, size = rows)
.map(_.toInt.toString)
val valuesRDD = RandomRDDs.normalVectorRDD(
sc, numRows = rows, numCols = numCols)
zipRDD.zip(valuesRDD).map{case (z, v) =>
RawPanda(1, z, "giant", v(0) > 0.5, v.toArray)
}
}
// end::MAGIC_PANDA[]
}
开发者ID:gourimahapatra,项目名称:high-performance-spark,代码行数:25,代码来源:GenerateScalingData.scala
示例9: StudyRelation
//设置package包名称以及导入依赖的类
package com.study.spark.datasource
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.sources.{BaseRelation, TableScan}
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.sql.{Row, SQLContext}
class StudyRelation(parameters: Map[String, String])(@transient val sqlContext: SQLContext)
extends BaseRelation with TableScan {
override def schema: StructType = {
// ??? ?? ?????, ?? ??? ???? ????. ???? ?????? ???? ??????, ???? ?? ??? ????
val fields: Array[StructField] = new Array[StructField](3)
fields.update(0, new StructField("field1", StringType))
fields.update(1, new StructField("field2", StringType))
fields.update(2, new StructField("field2", StringType))
new StructType(fields.asInstanceOf[Array[StructField]])
}
// RDD[Row]? ???? StudyRDD? ???.
override def buildScan(): RDD[Row] = new StudyRDD(sqlContext, schema)
}
开发者ID:hackpupu,项目名称:LML,代码行数:24,代码来源:StudyRelation.scala
示例10: StudyReader
//设置package包名称以及导入依赖的类
package com.study.spark.datasource
import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.StructType
class StudyReader(context: TaskContext, schema: StructType, split: Partition) extends Iterator[Row] {
private[this] var counter: Int = 0
// Task? ???? ???? close? ????? ??.
if(context != null) {
context.addTaskCompletionListener(context => close())
}
// 100?? Row? ??? ??
override def hasNext: Boolean = {
if(counter < 100) {
true
} else {
false
}
}
// 1?? Row? ????.
override def next(): Row = {
if(!hasNext) {
throw new NoSuchElementException("End of stream")
}
counter += 1
Row(split.index + " field1 " + counter, "field2 " + counter, "field3: " + counter)
}
// close?? ? ??? ??? ??? close??.
def close() = println("closed")
}
开发者ID:hackpupu,项目名称:LML,代码行数:37,代码来源:StudyReader.scala
示例11: StoreFormat
//设置package包名称以及导入依赖的类
package com.sasaki.utils
import org.apache.spark.sql.types._
import org.apache.spark.sql.{Row, SQLContext, SaveMode}
import org.apache.spark.rdd.RDD
import com.sasaki.discretization._
object StoreFormat {
def rdd2DF(rdd : RDD[Row], sqlContext : SQLContext) = {
val schema = StructType(
StructField("role", StringType, nullable = false) ::
StructField("mark", StringType, nullable = false) ::
StructField("seqs", ArrayType(StringType), nullable = false) ::
Nil)
sqlContext.createDataFrame(rdd, schema)
}
def saveAsJSON(rdd : RDD[Row],
path : String, sqlContext : SQLContext) = {
val df = rdd2DF(rdd, sqlContext)
val saveOptions = Map("header" -> "false", "path" -> path)
df.write.format("json").mode(SaveMode.Ignore).options(saveOptions).save
}
}
开发者ID:sasakigao,项目名称:log-discrete,代码行数:28,代码来源:StoreFormat.scala
示例12: sample
//设置package包名称以及导入依赖的类
package com.rishabh.spark.datasource.s3
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.sources.{BaseRelation, TableScan}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{Row, SQLContext}
case class sample(id: Integer)
class S3Relation(accesskey: String, secretkey: String, fileType: String, bucket: String, path:
String, write: Boolean)
(@transient
val sqlContext: SQLContext) extends BaseRelation with TableScan {
import sqlContext.implicits._
val dummyData = Seq(sample(1))
var df = sqlContext.sparkContext.parallelize(dummyData, 4).toDF()
val s3Path = "s3a://" + bucket + path
val hadoopConf = sqlContext.sparkContext.hadoopConfiguration
hadoopConf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
hadoopConf.set("fs.s3a.access.key", accesskey)
hadoopConf.set("fs.s3a.secret.key", secretkey)
override def schema: StructType = {
fileType match {
case "json" =>
df = sqlContext.read.json(s3Path)
case "csv" =>
df = sqlContext.read.format("com.databricks.spark.csv").load(s3Path)
case "parquet" =>
df = sqlContext.read.parquet(s3Path)
}
df.schema
}
override def buildScan(): RDD[Row] = {
df.rdd
}
}
开发者ID:rishabhbhardwaj,项目名称:spark-datasource-s3,代码行数:44,代码来源:S3Relation.scala
示例13: ScorePredictor
//设置package包名称以及导入依赖的类
package org.wikimedia.research.recommendation.job.translation
import java.io.File
import org.apache.log4j.{LogManager, Logger}
import org.apache.spark.ml.regression.RandomForestRegressionModel
import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row, SaveMode, SparkSession}
import scala.collection.parallel.mutable.ParArray
object ScorePredictor {
val log: Logger = LogManager.getLogger(ScorePredictor.getClass)
def predictScores(spark: SparkSession,
modelsInputDir: File,
predictionsOutputDir: Option[File],
sites: ParArray[String],
featureData: DataFrame): Unit = {
log.info("Scoring items")
val predictions: Array[DataFrame] = sites.map(target => {
try {
log.info("Scoring for " + target)
log.info("Getting work data for " + target)
val workData: DataFrame = Utils.getWorkData(spark, featureData, target, exists = false)
log.info("Loading model for " + target)
val model = RandomForestRegressionModel.load(
new File(modelsInputDir, target).getAbsolutePath)
log.info("Scoring data for " + target)
val predictions = model
.setPredictionCol(target)
.transform(workData)
.select("id", target)
predictions
} catch {
case unknown: Throwable =>
log.error("Score for " + target + " failed", unknown)
val schema = StructType(Seq(
StructField("id", StringType, nullable = false),
StructField(target, DoubleType, nullable = true)))
spark.createDataFrame(spark.sparkContext.emptyRDD[Row], schema)
}
}).toArray
val predictedScores = predictions.reduce((left, right) => left.join(right, Seq("id"), "outer"))
log.info("Saving predictions")
predictionsOutputDir.foreach(f = o =>
predictedScores.coalesce(1)
.write
.mode(SaveMode.ErrorIfExists)
.option("header", value = true)
.option("compression", "bzip2")
.csv(new File(o, "allPredictions").getAbsolutePath))
}
}
开发者ID:schana,项目名称:recommendation-translation,代码行数:59,代码来源:ScorePredictor.scala
示例14: prepareData
//设置package包名称以及导入依赖的类
package net.koseburak.recommendation.util
import net.koseburak.recommendation.constant.Field.PlaylistField
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
trait DataUtils extends Serializable{
def prepareData(path: String)(implicit spark: SparkSession): DataFrame = {
import spark.implicits._
spark.read.text(path)
.map(parseRow)
.filter(_.length > 1)
.map(Tuple1.apply)
.toDF(PlaylistField)
.cache
}
private def parseRow(row: Row) = row.mkString.split(" ").toList
}
object DataUtils extends DataUtils
开发者ID:burakkose,项目名称:word2vec-playlist-generation,代码行数:21,代码来源:DataUtils.scala
示例15: ManyToManyNormalJoin
//设置package包名称以及导入依赖的类
package com.malaska.spark.training.manytomany
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.{Row, SparkSession}
import scala.collection.mutable
object ManyToManyNormalJoin {
Logger.getLogger("org").setLevel(Level.OFF)
Logger.getLogger("akka").setLevel(Level.OFF)
def main(args:Array[String]): Unit = {
val jsonPath = args(0)
val sparkSession = SparkSession.builder
.master("local")
.appName("my-spark-app")
.config("spark.some.config.option", "config-value")
.config("spark.driver.host","127.0.0.1")
.getOrCreate()
val jsonDf = sparkSession.read.json(jsonPath)
val nGramWordCount = jsonDf.rdd.flatMap(r => {
val actions = r.getAs[mutable.WrappedArray[Row]]("actions")
val resultList = new mutable.MutableList[((Long, Long), Int)]
actions.foreach(a => {
val aValue = a.getAs[Long]("action")
actions.foreach(b => {
val bValue = b.getAs[Long]("action")
if (aValue < bValue) {
resultList.+=(((aValue, bValue), 1))
}
})
})
resultList.toSeq
}).reduceByKey(_ + _)
nGramWordCount.collect().foreach(r => {
println(" - " + r)
})
}
}
开发者ID:TedBear42,项目名称:spark_training,代码行数:46,代码来源:ManyToManyNormalJoin.scala
示例16: Generators
//设置package包名称以及导入依赖的类
package com.bla.bla.utils
import Models.{Client, Order}
import org.apache.spark.rdd.RDD
import SparkSession._
import com.bla.bla.utils.Models.{Client, Order}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{DataFrame, Row}
import scala.util.Random
object Generators {
val nbOfClients = 2000
val nbOfOrders = 10000
val maxAge = 100
val maxNbOfPieces = 5
val maxPrice = 10000
def generateClients: Seq[Client] = {
(0 to nbOfClients).map {
i => Client(i, s"Name$i", s"Forname$i", Random.nextInt(1 + maxAge))
}
}
def generateOrders(clientList: Seq[Client]): Seq[Order] = {
val clientIdList = clientList.map(_.id)
(0 to nbOfOrders).map {
i => Order(
id = i,
clientId = clientIdList(Random.nextInt(clientIdList.length)),
numberOfPieces = Random.nextInt(1 + maxNbOfPieces),
price = Random.nextDouble() * 1000 % maxPrice)
}
}
def generateClientsRDD: RDD[Client] = sc.parallelize(generateClients)
def generateOrdersRDD(clientsRDD: RDD[Client]): RDD[Order] = sc.parallelize(generateOrders(clientsRDD.collect()))
def generateClientsDataFrame: DataFrame = sqlContext.createDataFrame(generateClients)
def generateOrdersDataFrame(clientsDF: DataFrame): DataFrame = {
val clientsRDD = clientsDF.rdd.map {
case Row(id: Int, name: String, forname: String, age: Int) => Client(id, name, forname, age)
}
sqlContext.createDataFrame(generateOrdersRDD(clientsRDD))
}
def injectNaAgeValues(df: DataFrame): DataFrame = {
def randomNullify(age: Int) = {
if(Random.nextDouble() <= 0.1) None else Some(age)
}
df.withColumn("ageWithNa", udf(randomNullify _).apply(col("age")))
}
}
开发者ID:mmenestret,项目名称:spark-best-practices,代码行数:60,代码来源:Generators.scala
示例17: Dictionary
//设置package包名称以及导入依赖的类
package com.github.uosdmlab.nkp
import org.apache.spark.sql.{SparkSession, Row}
import org.apache.spark.sql.types.{StringType, StructField, StructType}
object Dictionary {
import org.bitbucket.eunjeon.seunjeon.{Analyzer => EunjeonAnalyzer}
private var words = Seq.empty[String]
private def chain(fn: => Any): this.type = {
fn
this
}
private def syncDictionary(): Unit = EunjeonAnalyzer.setUserDict(words.toIterator)
def addWords(word: String, words: String*): this.type = addWords(word +: words)
def addWords(words: Traversable[String]): this.type = chain {
this.words = this.words ++ words
syncDictionary()
}
def reset(): this.type = chain {
EunjeonAnalyzer.resetUserDict()
words = Seq.empty[String]
}
def addWordsFromCSV(path: String, paths: String*): this.type =
addWordsFromCSV(path +: paths)
def addWordsFromCSV(paths: Traversable[String]): this.type = chain {
val spark = SparkSession.builder().getOrCreate()
import spark.implicits._
val schema = StructType(Array(
StructField("word", StringType, nullable = false),
StructField("cost", StringType, nullable = true)))
val df = spark.read
.option("sep", ",")
.option("inferSchema", value = false)
.option("header", value = false)
.schema(schema)
.csv(paths.toSeq: _*)
val words = df.map {
case Row(word: String, cost: String) =>
s"$word,$cost"
case Row(word: String, null) =>
word
}.collect()
addWords(words)
}
}
开发者ID:uosdmlab,项目名称:spark-nkp,代码行数:62,代码来源:Dictionary.scala
示例18: EmptyRDFGraphDataFrame
//设置package包名称以及导入依赖的类
package net.sansa_stack.ml.spark.mining.amieSpark
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row, SQLContext}
/**
* @author Lorenz Buehmann
*/
object EmptyRDFGraphDataFrame {
def get(sqlContext: SQLContext): DataFrame = {
// convert RDD to DataFrame
val schemaString = "subject predicate object"
// generate the schema based on the string of schema
val schema = StructType(schemaString.split(" ").map(fieldName => StructField(fieldName, StringType, true)))
// convert triples RDD to rows
val rowRDD = sqlContext.sparkContext.emptyRDD[Row]
// apply the schema to the RDD
val triplesDataFrame = sqlContext.createDataFrame(rowRDD, schema)
// register the DataFrame as a table
triplesDataFrame.createOrReplaceTempView("TRIPLES")
triplesDataFrame
}
}
开发者ID:SANSA-Stack,项目名称:SANSA-ML,代码行数:30,代码来源:EmptyRDFGraphDataFrame.scala
示例19: TitanicBayes
//设置package包名称以及导入依赖的类
import org.apache.spark.mllib.classification.{NaiveBayes, NaiveBayesModel}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row}
object TitanicBayes {
var naiveBayesModel: NaiveBayesModel = null
def train(df: DataFrame): Unit = {
val mappedDf = df.map(row =>
(row.getAs[Int]("Survived"), row.getAs[Double]("Fare"), row.getAs[Int]("Pclass"), row.getAs[Double]("Age")
,row.getAs[Int]("Sex"), row.getAs[Int]("Parch"), row.getAs[Int]("SibSp"),row.getAs[Int]("Embarked")))
val labledData = mappedDf.map { case (survived, fare, pclass, age, sex, parch, sibsp, embarked) =>
LabeledPoint(survived, Vectors.dense(fare, pclass, age, sex, parch, sibsp, embarked))
}
naiveBayesModel = NaiveBayes.train(labledData, lambda = 1.0, modelType = "multinomial")
}
def predict(df: DataFrame): RDD[Row] = {
val resultDf = df.map { row =>
val denseVecor = Vectors.dense(row.getAs[Double]("Fare"), row.getAs[Int]("Pclass"), row.getAs[Double]("Age"),row.getAs[Int]("Sex"),
row.getAs[Int]("Parch"), row.getAs[Int]("SibSp"), row.getAs[Int]("Embarked") )
val result = naiveBayesModel.predict(denseVecor)
Row.fromTuple((row.getAs[Int]("PassengerId"), result.toInt))
}
resultDf
}
}
开发者ID:digital-thinking,项目名称:spark-titanic,代码行数:37,代码来源:TitanicBayes.scala
示例20: EdgeProviders
//设置package包名称以及导入依赖的类
package ml.sparkling.graph.loaders.csv.providers
import ml.sparkling.graph.loaders.csv.types.CSVTypes.EdgeAttributeExtractor
import ml.sparkling.graph.loaders.csv.types.Types.ToVertexId
import ml.sparkling.graph.loaders.csv.types.{CSVTypes, Types}
import ml.sparkling.graph.loaders.csv.utils.DefaultTransformers
import ml.sparkling.graph.loaders.csv.utils.DefaultTransformers.{defaultEdgeAttribute, numberToVertexId}
import org.apache.spark.graphx.Edge
import org.apache.spark.sql.Row
import scala.reflect.ClassTag
object EdgeProviders {
type TwoColumnsMakeEdgeProvider[VD,ED]=(Int,Int,Row, ToVertexId[VD], EdgeAttributeExtractor[ED])=>Seq[Edge[ED]]
def twoColumnsMakesEdge[VD:ClassTag,ED:ClassTag](id1:Int,
id2:Int,row:Row,
columnToId:ToVertexId[VD],
edgeAttributeProvider:EdgeAttributeExtractor[ED]):Seq[Edge[ED]]={
Seq(Edge(columnToId(row.getAs(id1)),columnToId(row.getAs(id2)),edgeAttributeProvider(row)))
}
def twoColumnsMakesEdge[VD:ClassTag](id1:Int,
id2:Int,
row:Row):Seq[Edge[Double]]={
twoColumnsMakesEdge(id1,id2,row,numberToVertexId _,defaultEdgeAttribute _)
}
}
开发者ID:sparkling-graph,项目名称:sparkling-graph,代码行数:32,代码来源:EdgeProviders.scala
注:本文中的org.apache.spark.sql.Row类示例整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。 |
请发表评论