
Scala SparkSession Class Code Examples


This article collects and summarizes typical usage examples of the Scala class org.apache.spark.sql.SparkSession. If you are wondering how to use the SparkSession class in Scala, what it is for, or what working SparkSession code looks like, the curated class examples below should help.



The following presents 20 code examples of the SparkSession class, sorted by popularity by default.
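Nearly all of the examples below follow the same entry-point pattern: obtain a SparkSession through its builder, do the work, then stop the session. As a quick orientation, here is a minimal sketch of that shared pattern (the application name, master and sample data are placeholders, not taken from any example below):

import org.apache.spark.sql.SparkSession

object MinimalSessionSketch {
  def main(args: Array[String]): Unit = {
    // getOrCreate() returns the active session if one exists, otherwise it
    // builds a new one from this configuration.
    val spark = SparkSession
      .builder()
      .appName("minimal-example")   // placeholder application name
      .master("local[*]")           // placeholder master; usually omitted when using spark-submit
      .getOrCreate()

    import spark.implicits._        // enables .toDF / .toDS conversions

    val df = Seq((1, "a"), (2, "b")).toDF("id", "value")
    df.show()

    spark.stop()                    // release resources when the job is done
  }
}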

Example 1: movies

// Set the package name and import the required classes
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.NaiveBayes
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.sql.SparkSession

object movies {

  case class Sentence(sentence: String,label: Double)

  def main(args:Array[String]) {

    val spark = SparkSession
      .builder
      .appName("Movies Reviews")
      .config("spark.master", "local")
      .getOrCreate()


    // Prepare training documents from the negative and positive review text files (labels 0.0 / 1.0).
    val neg = spark.sparkContext.textFile("file:///data/train/neg/").repartition(4)
      .map(w => Sentence(w, 0.0))

    val pos = spark.sparkContext.textFile("file:///data/train/pos/").repartition(4)
      .map(w => Sentence(w, 1.0))

    val test = spark.sparkContext.wholeTextFiles("file:///data/test/").repartition(4)
      .map({case(file,sentence) => (file.split("/").last.split("\\.")(0),sentence)})


    val training = neg.union(pos)
    val trainingDF = spark.createDataFrame(training)
    // Name the test columns explicitly: the Tokenizer expects "sentence" and the
    // final select expects "file"; a plain tuple RDD would only yield "_1"/"_2".
    val testDF = spark.createDataFrame(test).toDF("file", "sentence")

    // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and Naive Bayes
    val tokenizer = new Tokenizer()
      .setInputCol("sentence")
      .setOutputCol("words")
    val hashingTF = new HashingTF()
      .setInputCol(tokenizer.getOutputCol)
      .setOutputCol("features")
    val nb = new NaiveBayes()

    val pipeline = new Pipeline()
      .setStages(Array(tokenizer, hashingTF, nb))

    // Fit the pipeline to training documents.
    val model = pipeline.fit(trainingDF)

    // Make predictions on test documents.
    model.transform(testDF).repartition(1)
      .select("file", "prediction")
      .write.format("csv")
      .option("header","true")
      .option("delimiter","\t")
      .save("/tmp/spark-prediction")
    spark.stop()
  }
}
Developer: evaliotiri, Project: NaiveBayes, Lines: 59, Source: naiveBayes.scala
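The example above trains on all of the labelled reviews and writes raw predictions for an unlabelled test set. If a rough accuracy figure is also wanted, one common approach is to hold out part of the labelled data and score it with MulticlassClassificationEvaluator; a sketch building on the trainingDF and pipeline defined above (the split ratio and seed are arbitrary):

import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator

// Hold out 20% of the labelled reviews purely for evaluation.
val Array(trainPart, holdout) = trainingDF.randomSplit(Array(0.8, 0.2), seed = 42L)
val heldOutModel = pipeline.fit(trainPart)

val evaluator = new MulticlassClassificationEvaluator()
  .setLabelCol("label")
  .setPredictionCol("prediction")
  .setMetricName("accuracy")

val accuracy = evaluator.evaluate(heldOutModel.transform(holdout))
println(s"Held-out accuracy: $accuracy")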


Example 2: apply

// Set the package name and import the required classes
package org.dama.datasynth.runtime.spark.operators

import org.apache.spark.sql.{Dataset, SparkSession}
import org.dama.datasynth.executionplan.ExecutionPlan.EdgeTable
import org.dama.datasynth.runtime.spark.SparkRuntime

import scala.util.Random


object EdgeTableOperator { // enclosing object restored here; its name is inferred from the source file EdgeTableOperator.scala

  def apply(node: EdgeTable): Dataset[(Long, Long, Long)] = {
    val sparkSession = SparkRuntime.getSparkSession()
    import sparkSession.implicits._
    val generator = SparkRuntime.instantiateStructureGeneratorOperator( node.structure )
    val size = SparkRuntime.evalValueOperator(node.size).asInstanceOf[Long]
    val random : Random = new Random()
    val id : Int = random.nextInt()
    val path : String = s"/tmp/${id}"
    val sparkContext = sparkSession.sparkContext
    generator.run(size, sparkContext.hadoopConfiguration,"hdfs://"+path)
    val edgesRDD = sparkContext.textFile(path)
                               .map( s => s.split("\t"))
                               .map( l => (l(0).toLong, l(1).toLong))
                               .zipWithIndex().map( { case ((tail,head), id) =>  (id, tail, head)})
    sparkSession.createDataset(edgesRDD)
  }

} 
Developer: DAMA-UPC, Project: DataSynth, Lines: 28, Source: EdgeTableOperator.scala


Example 3: WithCalcLogging

// Set the package name and import the required classes
package biz.meetmatch.decorators

import biz.meetmatch.logging.BusinessLogger
import org.apache.spark.sql.SparkSession
import org.rogach.scallop.Scallop

import scala.util.{Failure, Success, Try}

object WithCalcLogging {
  def apply[B](f: => B)(implicit module: Class[_]): B = apply(module.getName)(f)

  def apply[B](scallopts: Scallop, sparkSession: SparkSession)(f: => B)(implicit module: Class[_] = this.getClass): B = apply(module.getName, Some(scallopts), Some(sparkSession))(f)

  def apply[B](module: String)(f: => B): B = apply(module, None, None)(f)

  def apply[B](module: String, scallopts: Scallop)(f: => B)(): B = apply(module, Some(scallopts), None)(f)

  def apply[B](module: String, scallopts: Scallop, sparkSession: SparkSession)(f: => B): B = apply(module, Some(scallopts), Some(sparkSession))(f)

  def apply[B](module: String, scalloptsO: Option[Scallop], sparkSessionO: Option[SparkSession])(f: => B): B = {
    val businessLogger = new BusinessLogger(module)

    val optsString = scalloptsO
      .map { scallopts =>
        scallopts.opts
          .map { opt => opt.name + " = " + scallopts.get(opt.name)(opt.converter.tag).getOrElse("(empty)") }
          .mkString(",")
      }
      .getOrElse("")

    val sparkAppId = sparkSessionO.map(_.sparkContext.applicationId).getOrElse("")

    businessLogger.calcStarted(optsString, sparkAppId)
    val attempt = Try(WithStopwatch(f))

    attempt match {
      case Success(result) =>
        businessLogger.calcStopped("SUCCESS")
        result
      case Failure(exception) =>
        businessLogger.calcStopped("FAILURE")
        throw exception
    }
  }
} 
Developer: tolomaus, Project: languagedetector, Lines: 46, Source: WithCalcLogging.scala
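Based on the overloads above, a caller simply wraps its computation in one of the apply variants and gets the block's result back on success (on failure the exception is rethrown after a FAILURE entry is logged). A hypothetical usage sketch; the job name, input path and computation are made up for illustration:

import biz.meetmatch.decorators.WithCalcLogging
import org.apache.spark.sql.SparkSession

object WordCountJob {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("WordCount").master("local[*]").getOrCreate()

    // Pass the session so the Spark application id ends up in the business log.
    val lineCount: Long = WithCalcLogging("WordCountJob", None, Some(spark)) {
      spark.read.textFile("/data/input.txt").count()  // hypothetical input path
    }

    println(s"lines counted: $lineCount")
    spark.stop()
  }
}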


Example 4: PCASampleDemo

// Set the package name and import the required classes
package com.chapter11.SparkMachineLearning

import org.apache.spark.ml.feature.PCA
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.SparkSession

object PCASampleDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .master("local[4]")
      .appName("PCAExample")
      .getOrCreate()

    val data = Array(
       Vectors.dense(3.5, 2.0, 5.0, 6.3, 5.60, 2.4),
       Vectors.dense(4.40, 0.10, 3.0, 9.0, 7.0, 8.75),
       Vectors.dense(3.20, 2.40, 0.0, 6.0, 7.4, 3.34)
    )
    val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features")
    df.show(false)

    val pca = new PCA()
      .setInputCol("features")
      .setOutputCol("pcaFeatures")
      .setK(4)
      .fit(df)

    val result = pca.transform(df).select("pcaFeatures")
    result.show(false)

    spark.stop()
  }
} 
Developer: PacktPublishing, Project: Scala-and-Spark-for-Big-Data-Analytics, Lines: 35, Source: PCAExample.scala


Example 5: MafExample

// Set the package name and import the required classes
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.Row

object MafExample {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("MAF Example")
    val sc = new SparkContext(conf)
    val spark = SparkSession
      .builder()
      .getOrCreate()
    val sqlContext = new SQLContext(sc)

    val df = sqlContext.read.format("com.databricks.spark.csv")
      .option("header", "true")
      .option("inferSchema", "true")
      .option("delimiter", "\t")
      .option("delimiter", "\t")
      .option("comment", "#")
      .load("TCGA.ACC.mutect.abbe72a5-cb39-48e4-8df5-5fd2349f2bb2.somatic.maf")

    df.createOrReplaceTempView("mutations")

    val topTwenty = spark.sql("SELECT Hugo_Symbol, count(*) FROM mutations GROUP BY Hugo_symbol ORDER BY count(*) DESC LIMIT 20")

    val topTwentyMissense = spark.sql("SELECT Hugo_Symbol, count(*) FROM mutations WHERE Variant_Classification='Missense_Mutation' GROUP BY Hugo_symbol ORDER BY count(*) DESC LIMIT 20")

    val fat4 = spark.sql("SELECT Chromosome, Start_Position, End_Position, Strand, Variant_Classification, Variant_Type, Tumor_Sample_Barcode FROM mutations WHERE Hugo_Symbol='FAT4'")

    topTwenty.coalesce(1).write.format("com.databricks.spark.csv").save("results/topTwenty")

    topTwentyMissense.coalesce(1).write.format("com.databricks.spark.csv").save("results/topTwentyMissense")

    fat4.coalesce(1).write.format("com.databricks.spark.csv").save("results/fat4")
  }
} 
Developer: allisonheath, Project: sparktoys, Lines: 40, Source: MafExample.scala


Example 6: VeChallengeIngest

// Set the package name and import the required classes
package io.github.adrianulbona.ve

import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.streaming.twitter.TwitterUtils
import org.apache.spark.streaming.{Minutes, Seconds, StreamingContext}
import twitter4j.{GeoLocation, Place, Status}


object VeChallengeIngest {

  case class Location(latitude: Double, longitude: Double)

  case class Tweet(time: Long, text: String, user: String, isRetweet: Boolean, country: String, location: Location)

  def main(args: Array[String]) {

    val spark = SparkSession.builder
      .master("local[*]")
      .appName("ve-challenge")
      .getOrCreate()

    import spark.sqlContext.implicits._

    val ssc = new StreamingContext(spark.sparkContext, Minutes(2))
    val stream = TwitterUtils.createStream(ssc, None, Seq("challenge"))

    stream.map(extract).map(normalize).foreachRDD((batch, time) => {
      val batchDF: DataFrame = batch.toDF.cache
      batchDF.groupBy($"country").count().toDF("country", "count").orderBy($"count".desc).show(6)
      batchDF.coalesce(1).write.parquet("tweets/batch=" + time.milliseconds)
      batchDF.unpersist()
    })

    ssc.start()
    ssc.awaitTermination()

    spark.stop()
  }

  def extract(status: Status): (Long, String, String, Boolean, Option[Place], Option[GeoLocation]) = {
    (status.getCreatedAt.getTime,
      status.getText,
      status.getUser.getName,
      status.isRetweet,
      Option(status.getPlace),
      Option(status.getGeoLocation))
  }

  def normalize(extract: (Long, String, String, Boolean, Option[Place], Option[GeoLocation])): Tweet = extract match {
    case (time: Long, text: String, user: String, isRetweet: Boolean, Some(place: Place), Some(geoLoc: GeoLocation)) =>
      Tweet(time, text, user, isRetweet, place.getCountryCode, Location(geoLoc.getLatitude, geoLoc.getLongitude))
    case (time: Long, text: String, user: String, isRetweet: Boolean, Some(place: Place), None) =>
      Tweet(time, text, user, isRetweet, place.getCountryCode, Location(Double.NaN, Double.NaN))
    case (time: Long, text: String, user: String, isRetweet: Boolean, None, Some(geoLoc: GeoLocation)) =>
      Tweet(time, text, user, isRetweet, "unknown", Location(geoLoc.getLatitude, geoLoc.getLongitude))
    case (time: Long, text: String, user: String, isRetweet: Boolean, None, None) =>
      Tweet(time, text, user, isRetweet, "unknown", Location(Double.NaN, Double.NaN))
  }
} 
Developer: adrianulbona, Project: ve-challenge, Lines: 60, Source: VeChallengeIngest.scala


Example 7: PrintMetrics

// Set the package name and import the required classes
package com.github.jongwook

import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

object PrintMetrics extends App {
  val (prediction, labels) = RankingDataProvider(MovieLensLoader.load())

  val spark = SparkSession.builder().master(new SparkConf().get("spark.master", "local[8]")).getOrCreate()

  val metrics = new SparkRankingMetrics(spark.createDataFrame(prediction), spark.createDataFrame(labels), itemCol = "product", predictionCol = "rating")

  val ats = Seq(5, 10, 20, 100, Integer.MAX_VALUE)
  val toPrint = Map[String, SparkRankingMetrics => Seq[Int] => Seq[Double]](
    "Precision" -> { m => k => m.precisionAt(k) },
    "Recall" -> { m => k => m.recallAt(k) },
    "F1" -> { m => k => m.f1At(k) },
    "NDCG" -> { m => k => m.ndcgAt(k) },
    "MAP" -> { m => k => m.mapAt(k) },
    "MRR" -> { m => k => m.mrrAt(k) }
  )

  for ((metric, calculator) <- toPrint) {
    printf("%12s", metric)
    val f = calculator(metrics)
    for (x <- f(ats)) {
      printf("%12.8f", x)
    }
    println()
  }

} 
Developer: jongwook, Project: spark-ranking-metrics, Lines: 33, Source: PrintMetrics.scala


Example 8: RestService

// Set the package name and import the required classes
package com.shashank.akkahttp.project

import java.util.concurrent.ConcurrentHashMap

import akka.actor.ActorSystem
import akka.http.scaladsl.server.Directives._
import akka.stream.ActorMaterializer
import akka.http.scaladsl.marshallers.sprayjson.SprayJsonSupport._
import com.shashank.akkahttp.project.Models.{LoadRequest, ServiceJsonProtoocol}

import scala.collection.JavaConverters._
import spray.json.{DefaultJsonProtocol, JsArray, pimpAny}
import spray.json.DefaultJsonProtocol._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql._


trait RestService {
  implicit val system: ActorSystem
  implicit val materializer: ActorMaterializer
  implicit val sparkSession: SparkSession
  val datasetMap = new ConcurrentHashMap[String, Dataset[Row]]()

  import ServiceJsonProtoocol._

  val route =
    pathSingleSlash {
      get {
        complete {
          "welcome to rest service"
        }
      }
    } ~
      path("load") {
        post {
          entity(as[LoadRequest]) {
            loadRequest => complete {
              val id = "" + System.nanoTime()
              val dataset = sparkSession.read.format("csv")
                .option("header", "true")
                .load(loadRequest.path)
              datasetMap.put(id, dataset)
              id
            }
          }
        }
      } ~
      path("view" / """[\w[0-9]-_]+""".r) { id =>
        get {
          complete {
            val dataset = datasetMap.get(id)
            dataset.take(10).map(row => row.toString())
          }
        }
      }
} 
Developer: shashankgowdal, Project: introduction-to-akkahttp, Lines: 58, Source: RestService.scala


Example 9: CountVectorizerDemo

// Set the package name and import the required classes
package com.chapter11.SparkMachineLearning
import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.feature.{ CountVectorizer, CountVectorizerModel }

object CountVectorizerDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/")
      .appName(s"OneVsRestExample")
      .getOrCreate()

    val df = spark.createDataFrame(
      Seq((0, Array("Jason", "David")),
        (1, Array("David", "Martin")),
        (2, Array("Martin", "Jason")),
        (3, Array("Jason", "Daiel")),
        (4, Array("Daiel", "Martin")),
        (5, Array("Moahmed", "Jason")),
        (6, Array("David", "David")),
        (7, Array("Jason", "Martin")))).toDF("id", "name")

    df.show(false)

    // fit a CountVectorizerModel from the corpus
    val cvModel: CountVectorizerModel = new CountVectorizer()
      .setInputCol("name")
      .setOutputCol("features")
      .setVocabSize(3)
      .setMinDF(2)
      .fit(df)

    val feature = cvModel.transform(df)
    feature.show(false)

    spark.stop()
  }
} 
Developer: PacktPublishing, Project: Scala-and-Spark-for-Big-Data-Analytics, Lines: 40, Source: CountVectorizerDemo.scala
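A fitted CountVectorizerModel also exposes the vocabulary it has learned, which is useful for interpreting the indices in the resulting sparse feature vectors; a small follow-up sketch using the cvModel from the example above:

// Terms are ordered by corpus-wide frequency; the array index of a term is the
// position it occupies in the "features" vectors produced by transform().
cvModel.vocabulary.zipWithIndex.foreach { case (term, index) =>
  println(s"$index -> $term")
}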


Example 10: OneHotEncoderExample

// Set the package name and import the required classes
package org.sparksamples.regression.bikesharing

import org.apache.spark.sql.SparkSession


object OneHotEncoderExample {

  def main(args: Array[String]): Unit = {
    import org.apache.spark.ml.feature.{OneHotEncoder, StringIndexer}

    val spark = SparkSession
      .builder()
      .appName("Spark SQL basic example").master("local[1]")
      .config("spark.some.config.option", "some-value")
      .getOrCreate()

    // For implicit conversions like converting RDDs to DataFrames
    val df = spark.createDataFrame(Seq(
      (0, 3),
      (1, 2),
      (2, 4),
      (3, 3),
      (4, 3),
      (5, 4)
    )).toDF("id", "category")

    val indexer = new StringIndexer()
      .setInputCol("category")
      .setOutputCol("categoryIndex")
      .fit(df)
    val indexed = indexer.transform(df)

    val encoder = new OneHotEncoder()
      .setInputCol("categoryIndex")
      .setOutputCol("categoryVec")
    val encoded = encoder.transform(indexed)
    encoded.select("id", "categoryVec").show()
  }

} 
Developer: PacktPublishing, Project: Machine-Learning-with-Spark-Second-Edition, Lines: 41, Source: OneHotEncoderExample.scala
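One caveat: the OneHotEncoder transformer used above was deprecated in Spark 2.3 and removed in Spark 3.x, where OneHotEncoder became an estimator that has to be fit before transforming. On a newer Spark the equivalent step looks roughly like this (column names match the example above):

import org.apache.spark.ml.feature.OneHotEncoder  // in Spark 3.x this is an Estimator

val encoder3x = new OneHotEncoder()
  .setInputCols(Array("categoryIndex"))
  .setOutputCols(Array("categoryVec"))

// fit() is required because the estimator first determines the category sizes.
val encoded3x = encoder3x.fit(indexed).transform(indexed)
encoded3x.select("id", "categoryVec").show()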


Example 11: DocumentClassificationLibSVM

// Set the package name and import the required classes
package org.apache.spark.examples.ml

import org.apache.spark.SparkConf
import org.apache.spark.ml.classification.NaiveBayes
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator

import org.apache.spark.sql.SparkSession

object DocumentClassificationLibSVM {
  def main(args: Array[String]): Unit = {

    val spConfig = (new SparkConf).setMaster("local").setAppName("SparkApp")
    val spark = SparkSession
      .builder()
      .appName("SparkRatingData").config(spConfig)
      .getOrCreate()

    val data = spark.read.format("libsvm").load("./output/20news-by-date-train-libsvm/part-combined")

    val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3), seed = 1L)

    // Train a NaiveBayes model.
    val model = new NaiveBayes()
      .fit(trainingData)
    val predictions = model.transform(testData)
    predictions.show()

    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
    val accuracy = evaluator.evaluate(predictions)
    println("Test set accuracy = " + accuracy)
    spark.stop()
  }
} 
Developer: PacktPublishing, Project: Machine-Learning-with-Spark-Second-Edition, Lines: 37, Source: DocumentClassificationLibSVM.scala


Example 12: TrackApp

// Set the package name and import the required classes
package com.esri

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{max, min}

object TrackApp extends App {

  val spark = SparkSession
    .builder()
    .appName("Path SOM")
    .master("local[*]")
    .config("spark.ui.enabled", "false")
    .getOrCreate()

  import spark.implicits._

  try {

    val df = spark
      .read
      .json("Paths")
      .as[TrackCells]
      .cache()

    val qrAgg = df
      .flatMap(_.cells)
      .distinct()
      .agg(min("q").as("qmin"), max("q").alias("qmax"), min("r").alias("rmin"), max("r").alias("rmax"))
      .as[QRMinMax]
      .head

    val qrMin = Cell(qrAgg.qmin, qrAgg.rmin)
    val qrMax = Cell(qrAgg.qmax, qrAgg.rmax)
    val qrDel = (qrMax - qrMin) + 1
    val qrSize = qrDel.size

    val trainingArr = df
      .rdd
      .map(trackCells => trackCells.toBreeze(qrMin, qrDel, qrSize))
      .collect()

    val rnd = new java.security.SecureRandom()
    val somSize = 3
    val nodes = for {
      q <- 0 until somSize
      r <- 0 until somSize
    } yield Node(q, r, trainingArr(rnd.nextInt(trainingArr.length)))


    val som = SOM(nodes)
    val epochMax = trainingArr.length * 400
    implicit val progressBar = TerminalProgressBar(epochMax)
    som.train(trainingArr, epochMax, 2.5, initialAlpha = 0.4)
    som.saveAsFig("/tmp/fig.png", qrDel)

  } finally {
    spark.stop()
  }
} 
Developer: mraad, Project: spark-som-path, Lines: 60, Source: TrackApp.scala


Example 13: CarMileageTest

// Set the package name and import the required classes
package org.dele.misc.bookFastDS

import org.apache.spark.sql.SparkSession


object CarMileageTest extends App {

  val spark = SparkSession.builder()
    .appName("car mileage")
    .master("local[*]")
    .config("logConf", "true")
    .getOrCreate()

  val milData = spark.read.option("header", "true").option("inferSchema", "true").csv("res/data/car-milage.csv")

  println(milData.count())
  milData.show(5)
  milData.printSchema()
  milData.describe("mpg", "hp", "torque", "automatic").show()

  milData.groupBy("automatic").avg("mpg", "hp", "torque").show()
  milData.groupBy().avg("mpg", "hp", "torque").show()

  import org.apache.spark.sql.functions._
  milData.agg(stddev(milData("mpg")), avg(milData("torque"))).show()

  val cor = milData.stat.corr("hp", "weight")
  println(f"'hp' to 'weight' correlation: $cor%.4f")
  val cov = milData.stat.cov("hp", "weight")
  println(f"'hp' to 'weight' covariance: $cov%.4f")

  val crosstab = milData.stat.crosstab("automatic", "NoOfSpeed")
  crosstab.show()
  val crosstab2 = milData.stat.crosstab("hp", "weight")
  crosstab2.show()

  spark.close()
} 
Developer: new2scala, Project: text-util, Lines: 39, Source: CarMileageTest.scala
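Beyond corr, cov and crosstab, the same df.stat handle also provides approximate quantiles, which complement the describe() summary above; a small sketch (the probabilities and relative error are arbitrary choices):

// Approximate 25th/50th/75th percentiles of "mpg"; the last argument is the
// relative error (0.0 would compute exact quantiles at a higher cost).
val Array(q25, median, q75) = milData.stat.approxQuantile("mpg", Array(0.25, 0.5, 0.75), 0.01)
println(f"mpg quartiles: $q25%.2f / $median%.2f / $q75%.2f")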


Example 14: GroupByTest

// Set the package name and import the required classes
package org.dele.misc.bookMasterSpark2.scalaExamples

import org.apache.spark.sql.SparkSession

import scala.util.Random


object GroupByTest {
  def main(args:Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("GroupBy test")
      .master("local[*]")
      .getOrCreate()

    val numMappers = 2
    val numKVPairs = 1000
    val valSize = 1000
    val numReducers = numMappers

    val pairs1 = spark.sparkContext.parallelize(
      0 until numMappers,
      numMappers
    ).flatMap{ p =>
      val ranGen = new Random()
      val arr = new Array[(Int, Array[Byte])](numKVPairs)
      (0 until numKVPairs).foreach{ idx =>
        val byteArr = new Array[Byte](valSize)
        ranGen.nextBytes(byteArr)
        arr(idx) = (math.abs(ranGen.nextInt()) % 500, byteArr)
      }
      arr
    }.cache()

    pairs1.count()

    val groups = pairs1.groupByKey(numReducers)
    val groupCount = groups.count()
    println(groupCount)

    val keyedGroups = groups.take(groupCount.toInt)

    println(keyedGroups.length)

    spark.stop()

  }
} 
Developer: new2scala, Project: text-util, Lines: 48, Source: GroupByTest.scala


Example 15: Job

// Set the package name and import the required classes
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

object Job {

  val AppName = "ECAD_JSON_Converter"
  val sparkMaster = "local[3]"
  //  val sparkMaster = "spark://node0.local:7077"
  val HDFSDataDir = "hdfs://node0.local:9000/ECAD_Data/"
  val HDFSNameNode = "hdfs://node0.local:9000"

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName(AppName).setMaster(sparkMaster)
    val spark = SparkSession
      .builder()
      .config(conf)
      .getOrCreate()

    val sc = spark.sparkContext

    val hadoopConf = sc.hadoopConfiguration
    hadoopConf.set("fs.defaultFS", HDFSNameNode)

    val mapper = new Mappers()
    val sourceDF = mapper.genSourceDF(spark, HDFSDataDir + "sources.txt")
    val precipDF = mapper.precipicationDF(spark, HDFSDataDir + "RR_SOUID100014.txt")
  }
} 
Developer: luxinator, Project: RainyDay, Lines: 29, Source: Job.scala


Example 16: ManchesterSyntaxOWLAxiomsDatasetBuilder

// Set the package name and import the required classes
package net.sansa_stack.owl.spark.dataset

import com.typesafe.scalalogging.Logger
import net.sansa_stack.owl.common.parsing.ManchesterSyntaxParsing
import org.apache.spark.sql.{Encoders, SparkSession}
import org.semanticweb.owlapi.io.OWLParserException
import org.semanticweb.owlapi.model.{OWLAxiom, OWLRuntimeException}


object ManchesterSyntaxOWLAxiomsDatasetBuilder extends ManchesterSyntaxParsing {
  private val logger = Logger(this.getClass)

  def build(spark: SparkSession, filePath: String): OWLAxiomsDataset = {
    val res = ManchesterSyntaxOWLExpressionsDatasetBuilder.buildAndGetDefaultPrefix(spark, filePath)
    val expressionsDataset = res._1
    val defaultPrefix = res._2
    build(expressionsDataset, defaultPrefix)
  }

  // FIXME: It has to be ensured that the expressionsDataset is in functional syntax
  def build(expressionsDataset: OWLExpressionsDataset, defaultPrefix: String): OWLAxiomsDataset = {
    implicit val encoder = Encoders.kryo[OWLAxiom]

    expressionsDataset.filter(!_.startsWith("Annotations")).flatMap(frame => {
      try makeAxioms(frame, defaultPrefix)
      catch {
        case exception: OWLParserException => {
          val msg = exception.getMessage
          logger.warn("Parser error for frame\n" + frame + "\n\n" + msg)
//          exception.printStackTrace()
          Set.empty[OWLAxiom]
        }
        case exception: OWLRuntimeException => {
          val msg = exception.getMessage
          logger.warn("Parser error for frame\n" + frame + "\n\n" + msg)
          exception.printStackTrace()
          Set.empty[OWLAxiom]
        }
      }
    })
  }
} 
Developer: SANSA-Stack, Project: SANSA-OWL, Lines: 43, Source: ManchesterSyntaxOWLAxiomsDatasetBuilder.scala


Example 17: FunctionalSyntaxOWLAxiomsDatasetBuilder

// Set the package name and import the required classes
package net.sansa_stack.owl.spark.dataset

import net.sansa_stack.owl.common.parsing.FunctionalSyntaxParsing
import org.apache.spark.sql.{Encoders, SparkSession}
import org.semanticweb.owlapi.model.OWLAxiom


object FunctionalSyntaxOWLAxiomsDatasetBuilder extends FunctionalSyntaxParsing {
  def build(spark: SparkSession, filePath: String): OWLAxiomsDataset = {
    build(FunctionalSyntaxOWLExpressionsDatasetBuilder.build(spark, filePath))
  }

  // FIXME: It has to be ensured that the expressionsDataset is in functional syntax
  def build(expressionsDataset: OWLExpressionsDataset): OWLAxiomsDataset = {
    implicit val encoder = Encoders.kryo[OWLAxiom]
    expressionsDataset.map(expression => makeAxiom(expression)).
      filter(axiom => axiom != null)
  }
} 
Developer: SANSA-Stack, Project: SANSA-OWL, Lines: 20, Source: FunctionalSyntaxOWLAxiomsDatasetBuilder.scala


Example 18: FunctionalSyntaxOWLExpressionsDatasetBuilderTest

// Set the package name and import the required classes
package net.sansa_stack.owl.spark.dataset

import com.holdenkarau.spark.testing.SharedSparkContext
import org.apache.spark.sql.SparkSession
import org.scalatest.FunSuite


class FunctionalSyntaxOWLExpressionsDatasetBuilderTest extends FunSuite with SharedSparkContext {
  lazy val spark = SparkSession.builder().appName(sc.appName).master(sc.master).getOrCreate()
  var _dataset: OWLExpressionsDataset = null
  def dataset: OWLExpressionsDataset = {
    if (_dataset == null) {
      _dataset = FunctionalSyntaxOWLExpressionsDatasetBuilder.build(
        spark, "src/test/resources/ont_functional.owl")
      _dataset.cache()
    }
    _dataset
  }

  test("There should be three annotation lines with full URIs") {
    val res = dataset.filter(line => line.startsWith("Annotation(")).collectAsList()
    val expected = List(
      "Annotation(<http://ex.com/foo#hasName> \"Name\")",
      "Annotation(<http://ex.com/bar#hasTitle> \"Title\")",
      """Annotation(<http://ex.com/default#description> "A longer
description running over
several lines")""")
    assert(res.size() == 3)
    for (e <- expected) {
      assert(res.contains(e))
    }
  }

  
  //  test("There should be an import statement") {
  //    val res = rdd.filter(line => line.startsWith("Import")).collect()
  //    assert(res.length == 1)
  //    assert(res(0) == "Import(<http://www.example.com/my/2.0>)")
  //  }

  test("There should not be any empty lines") {
    val res = dataset.filter(line => line.trim.isEmpty)
    assert(res.count() == 0)
  }

  test("There should not be any comment lines") {
    val res = dataset.filter(line => line.trim.startsWith("#"))
    assert(res.count() == 0)
  }

  test("There should be a DisjointObjectProperties axiom") {
    val res = dataset.filter(line => line.trim.startsWith("DisjointObjectProperties"))
    assert(res.count() == 1)
  }

  test("The total number of axioms should be correct") {
    val total = 70 // = 71 - uncommented Import(...)
    assert(dataset.count() == total)
  }
} 
Developer: SANSA-Stack, Project: SANSA-OWL, Lines: 61, Source: FunctionalSyntaxOWLExpressionsDatasetBuilderTest.scala


Example 19: TestPerfectScores

// Set the package name and import the required classes
package com.github.jongwook

import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.scalactic.{Equality, TolerantNumerics}
import org.scalatest.{FlatSpec, Matchers}


class TestPerfectScores extends FlatSpec with Matchers {
  import TestFixture._
  implicit val doubleEquality: Equality[Double] = TolerantNumerics.tolerantDoubleEquality(eps)

  val perfectScores: Seq[(String, Map[Metric, Seq[Double]])] = {
    val spark = SparkSession.builder().master(new SparkConf().get("spark.master", "local[8]")).getOrCreate()

    import spark.implicits._

    val predictionDF = spark.createDataset(prediction)
    val groundTruthDF = spark.createDataset(groundTruth)

    for ((name, df) <- Seq("prediction" -> predictionDF, "ground truth" -> groundTruthDF)) yield {
      val metrics = new SparkRankingMetrics(df, df, itemCol = "product", predictionCol = "rating")

      name -> Map[Metric, Seq[Double]](
        NDCG -> metrics.ndcgAt(ats),
        MAP -> metrics.mapAt(ats),
        Precision -> metrics.precisionAt(Seq(Integer.MAX_VALUE)),
        Recall -> metrics.recallAt(Seq(Integer.MAX_VALUE))
      )
    }
  }

  for ((name, scores) <- perfectScores) {
    for (metric <- Seq(NDCG, MAP, Precision, Recall)) {
      s"In $name dataset, our $metric implementation" should s"return 1.0 for the perfect prediction" in {
        for (score <- scores(metric)) {
          score should equal(1.0)
        }
      }
    }
  }
} 
Developer: jongwook, Project: spark-ranking-metrics, Lines: 43, Source: TestPerfectScores.scala


Example 20: CouchbasePipeline

// Set the package name and import the required classes
package com.foobar

import java.util.UUID

import com.couchbase.spark.sql._
import com.typesafe.config.ConfigFactory
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

object CouchbasePipeline extends App {

  val config = ConfigFactory.load()
  //Couchbase Configuration
  val bucketName = config.getString("couchbase.bucketName")
  val couchbaseHost = config.getString("couchbase.host")
  
  //Spark Configuration
  val sparkIp = config.getString("spark.url")


  //Cassandra Configuration
  val keyspaceName = config.getString("cassandra.keyspaceName")
  val tableName = config.getString("cassandra.tableName")
  val idFeild = config.getString("cassandra.idFeild")
  val cassandraHost = config.getString("cassandra.host")
  val cassandraPort = config.getInt("cassandra.port")

  val conf = new SparkConf()
    .setAppName(s"CouchbaseCassandraTransferPlugin")
    .setMaster(sparkIp)
    .set(s"com.couchbase.bucket.$bucketName", "")
    .set("com.couchbase.nodes", couchbaseHost)
    .set("spark.cassandra.connection.host", cassandraHost)
    .set("spark.cassandra.connection.port", cassandraPort.toString)
  val spark = SparkSession.builder().config(conf).getOrCreate()
  val sc = spark.sparkContext

  val cassandraRDD = spark.read
    .format("org.apache.spark.sql.cassandra")
    .options(Map("table" -> tableName, "keyspace" -> keyspaceName))
    .load()


  import org.apache.spark.sql.functions._

  val uuidUDF = udf(CouchbaseHelper.getUUID _)
  val rddToBeWritten = if (cassandraRDD.columns.contains(idFeild)) {
    cassandraRDD.withColumn("META_ID", cassandraRDD(idFeild))
  } else {
    cassandraRDD.withColumn("META_ID", uuidUDF())
  }
  rddToBeWritten.write.couchbase()
}

object CouchbaseHelper {

  def getUUID: String = UUID.randomUUID().toString
} 
Developer: shiv4nsh, Project: cassandra-couchbase-transfer-plugin, Lines: 59, Source: CouchbasePipeline.scala



Note: The org.apache.spark.sql.SparkSession class examples in this article were collected from GitHub, MSDocs and other source-code and documentation platforms. The code snippets were selected from open-source projects contributed by their authors, and the copyright of the source code belongs to the original authors; please consult the corresponding project's license before redistributing or reusing it. Do not reproduce without permission.

