
Scala HiveContext Class Code Examples


This article collects typical usage examples of org.apache.spark.sql.hive.HiveContext in Scala. If you are wondering what the Scala HiveContext class is for, how to use it, or what working examples look like, the curated class code examples below may help.



A total of 20 code examples of the HiveContext class are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Scala code examples.
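
Before the individual examples, here is a minimal, self-contained sketch of the pattern most of them share: build a SparkContext, wrap it in a HiveContext, and issue HiveQL through it. The application name and table name below are placeholders for illustration only, and the sketch assumes the Spark 1.x API used throughout this article (in Spark 2.0 and later, HiveContext is deprecated in favor of SparkSession with Hive support enabled).

// Minimal HiveContext usage sketch (placeholder names; Spark 1.x API)
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}

object HiveContextMinimalExample {
  def main(args: Array[String]): Unit = {
    // Local master for illustration; on a cluster, pass the master via spark-submit instead
    val conf = new SparkConf().setAppName("HiveContextMinimalExample").setMaster("local[*]")
    val sc = new SparkContext(conf)

    // HiveContext extends SQLContext with HiveQL support and access to the Hive metastore
    val hiveContext = new HiveContext(sc)

    // Query a table registered in the metastore ("some_table" is a placeholder)
    val df = hiveContext.sql("SELECT * FROM some_table LIMIT 10")
    df.show()

    sc.stop()
  }
}

The Kudu examples that follow use exactly this pattern, only replacing the metastore table with a DataFrame loaded through the Kudu data source and registered as a temporary table.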

Example 1: KuduAccountMartSimpleSums

// Package name and imported dependency classes
package com.hadooparchitecturebook.taxi360.sql.kudu

import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}

object KuduAccountMartSimpleSums {
  def main(args: Array[String]): Unit = {

    if (args.length < 3) {
      println("Args: <runLocal> <kuduMaster> " +
        "<kuduAccountMartTableName> ")
      return
    }

    val runLocal = args(0).equalsIgnoreCase("l")
    val kuduMaster = args(1)
    val kuduAccountMartTableName = args(2)

    val sc: SparkContext = if (runLocal) {
      val sparkConfig = new SparkConf()
      sparkConfig.set("spark.broadcast.compress", "false")
      sparkConfig.set("spark.shuffle.compress", "false")
      sparkConfig.set("spark.shuffle.spill.compress", "false")
      new SparkContext("local", "TableStatsSinglePathMain", sparkConfig)
    } else {
      val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain")
      new SparkContext(sparkConfig)
    }

    val hiveContext = new HiveContext(sc)

    val kuduOptions = Map(
      "kudu.table" -> kuduAccountMartTableName,
      "kudu.master" -> kuduMaster)

    hiveContext.read.options(kuduOptions).format("org.kududb.spark.kudu").load.
      registerTempTable("account_mart_tmp")


    println("------------")
    val values = hiveContext.sql("select account_id, sum(win_count) from account_mart_tmp group by account_id").
      take(100)
    println("------------")

    values.foreach(println)
    println("------------")

    sc.stop()
  }
} 
Developer: hadooparchitecturebook | Project: Taxi360 | Lines of code: 51 | Source: KuduAccountMartSimpleSums.scala


Example 2: KuduAppEventSimpleSums

// Package name and imported dependency classes
package com.hadooparchitecturebook.taxi360.sql.kudu

import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}


object KuduAppEventSimpleSums {
  def main(args: Array[String]): Unit = {

    if (args.length < 3) {
      println("Args: <runLocal> <kuduMaster> " +
        "<kuduAppEventTableName> ")
      return
    }

    val runLocal = args(0).equalsIgnoreCase("l")
    val kuduMaster = args(1)
    val kuduAppEventTableName = args(2)

    val sc: SparkContext = if (runLocal) {
      val sparkConfig = new SparkConf()
      sparkConfig.set("spark.broadcast.compress", "false")
      sparkConfig.set("spark.shuffle.compress", "false")
      sparkConfig.set("spark.shuffle.spill.compress", "false")
      new SparkContext("local", "TableStatsSinglePathMain", sparkConfig)
    } else {
      val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain")
      new SparkContext(sparkConfig)
    }

    val hiveContext = new HiveContext(sc)

    val kuduOptions = Map(
      "kudu.table" -> kuduAppEventTableName,
      "kudu.master" -> kuduMaster)

    hiveContext.read.options(kuduOptions).format("org.kududb.spark.kudu").load.
      registerTempTable("app_event_tmp")


    println("------------")
    val values = hiveContext.sql("select account_id, sum(purchase) from app_event_tmp group by account_id").
      take(100)
    println("------------")

    values.foreach(println)
    println("------------")

    sc.stop()
  }
} 
Developer: hadooparchitecturebook | Project: Taxi360 | Lines of code: 52 | Source: KuduAppEventSimpleSums.scala


Example 3: KuduToHDFS

// Package name and imported dependency classes
package com.hadooparchitecturebook.taxi360.sql.kudu

import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}

object KuduToHDFS {
  def main(args: Array[String]): Unit = {

    if (args.length < 4) {
      println("Args: <runLocal> <kuduMaster> " +
        "<kuduTaxiTripTableName> " +
        "<hdfsTaxiTripTableName> " +
        "<numOfCenters> " +
        "<numOfIterations> ")
      return
    }

    val runLocal = args(0).equalsIgnoreCase("l")
    val kuduMaster = args(1)
    val kuduTaxiTripTableName = args(2)
    val hdfsTaxiTripTableName = args(3)

    val sc: SparkContext = if (runLocal) {
      val sparkConfig = new SparkConf()
      sparkConfig.set("spark.broadcast.compress", "false")
      sparkConfig.set("spark.shuffle.compress", "false")
      sparkConfig.set("spark.shuffle.spill.compress", "false")
      new SparkContext("local", "TableStatsSinglePathMain", sparkConfig)
    } else {
      val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain")
      new SparkContext(sparkConfig)
    }

    val hiveContext = new HiveContext(sc)

    val kuduOptions = Map(
      "kudu.table" -> kuduTaxiTripTableName,
      "kudu.master" -> kuduMaster)

    hiveContext.read.options(kuduOptions).format("org.kududb.spark.kudu").load.
      registerTempTable("kuduTaxiTripTableName")

    hiveContext.sql("CREATE TABLE " + hdfsTaxiTripTableName + " " +
      " AS SELECT * FROM kuduTaxiTripTableName " +
      " STORED AS PARQUET")

    sc.stop()
  }
} 
Developer: hadooparchitecturebook | Project: Taxi360 | Lines of code: 50 | Source: KuduToHDFS.scala


Example 4: KuduAppEventSimpleSums

// Package name and imported dependency classes
package com.cloudera.sa.apptrans.sql.kudu

import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}


object KuduAppEventSimpleSums {
  def main(args: Array[String]): Unit = {

    if (args.length < 3) {
      println("Args: <runLocal> <kuduMaster> " +
        "<kuduAppEventTableName> ")
      return
    }

    val runLocal = args(0).equalsIgnoreCase("l")
    val kuduMaster = args(1)
    val kuduAppEventTableName = args(2)

    val sc: SparkContext = if (runLocal) {
      val sparkConfig = new SparkConf()
      sparkConfig.set("spark.broadcast.compress", "false")
      sparkConfig.set("spark.shuffle.compress", "false")
      sparkConfig.set("spark.shuffle.spill.compress", "false")
      new SparkContext("local", "TableStatsSinglePathMain", sparkConfig)
    } else {
      val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain")
      new SparkContext(sparkConfig)
    }

    val hiveContext = new HiveContext(sc)

    val kuduOptions = Map(
      "kudu.table" -> kuduAppEventTableName,
      "kudu.master" -> kuduMaster)

    hiveContext.read.options(kuduOptions).format("org.kududb.spark.kudu").load.
      registerTempTable("app_event_tmp")


    println("------------")
    val values = hiveContext.sql("select account_id, sum(purchase) from app_event_tmp group by account_id").
      take(100)
    println("------------")

    values.foreach(println)
    println("------------")

    sc.stop()
  }
} 
Developer: tmalaska | Project: AppTrans | Lines of code: 52 | Source: KuduAppEventSimpleSums.scala


Example 5: KuduToHDFS

// Package name and imported dependency classes
package com.cloudera.sa.apptrans.sql.kudu

import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}

object KuduToHDFS {
  def main(args: Array[String]): Unit = {

    if (args.length < 6) {
      println("Args: <runLocal> <kuduMaster> " +
        "<kuduAccountMartTableName> " +
        "<hdfsAccountMartTableName> " +
        "<numOfCenters> " +
        "<numOfIterations> ")
      return
    }

    val runLocal = args(0).equalsIgnoreCase("l")
    val kuduMaster = args(1)
    val kuduAccountMartTableName = args(2)
    val hdfsAccountMartTableName = args(3)
    val numOfCenters = args(4).toInt
    val numOfIterations = args(5).toInt

    val sc: SparkContext = if (runLocal) {
      val sparkConfig = new SparkConf()
      sparkConfig.set("spark.broadcast.compress", "false")
      sparkConfig.set("spark.shuffle.compress", "false")
      sparkConfig.set("spark.shuffle.spill.compress", "false")
      new SparkContext("local", "TableStatsSinglePathMain", sparkConfig)
    } else {
      val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain")
      new SparkContext(sparkConfig)
    }

    val hiveContext = new HiveContext(sc)

    val kuduOptions = Map(
      "kudu.table" -> kuduAccountMartTableName,
      "kudu.master" -> kuduMaster)

    hiveContext.read.options(kuduOptions).format("org.kududb.spark.kudu").load.
      registerTempTable("account_mart_tmp")

    hiveContext.sql("CREATE TABLE " + hdfsAccountMartTableName + " AS SELECT * FROM account_mart_tmp")

    sc.stop()
  }
} 
Developer: tmalaska | Project: AppTrans | Lines of code: 50 | Source: KuduToHDFS.scala


Example 6: KuduAccountMartSimpleSums

// Package name and imported dependency classes
package com.cloudera.sa.apptrans.sql.kudu

import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}

object KuduAccountMartSimpleSums {
  def main(args: Array[String]): Unit = {

    if (args.length < 3) {
      println("Args: <runLocal> <kuduMaster> " +
        "<kuduAccountMartTableName> ")
      return
    }

    val runLocal = args(0).equalsIgnoreCase("l")
    val kuduMaster = args(1)
    val kuduAccountMartTableName = args(2)

    val sc: SparkContext = if (runLocal) {
      val sparkConfig = new SparkConf()
      sparkConfig.set("spark.broadcast.compress", "false")
      sparkConfig.set("spark.shuffle.compress", "false")
      sparkConfig.set("spark.shuffle.spill.compress", "false")
      new SparkContext("local", "TableStatsSinglePathMain", sparkConfig)
    } else {
      val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain")
      new SparkContext(sparkConfig)
    }

    val hiveContext = new HiveContext(sc)

    val kuduOptions = Map(
      "kudu.table" -> kuduAccountMartTableName,
      "kudu.master" -> kuduMaster)

    hiveContext.read.options(kuduOptions).format("org.kududb.spark.kudu").load.
      registerTempTable("account_mart_tmp")


    println("------------")
    val values = hiveContext.sql("select account_id, sum(win_count) from account_mart_tmp group by account_id").
      take(100)
    println("------------")

    values.foreach(println)
    println("------------")

    sc.stop()
  }
} 
Developer: tmalaska | Project: AppTrans | Lines of code: 51 | Source: KuduAccountMartSimpleSums.scala


Example 7: Main

// Package name and imported dependency classes
package com.microsoft.netalyzer.loader

import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}

object Main {
  val settings = new Settings()
  val conf = new SparkConf()
  val sc = new SparkContext(conf)
  val sqlContext = new HiveContext(sc)

  def main(args: Array[String]): Unit = {
    sqlContext.setConf("spark.sql.orc.filterPushdown", "true")
    sqlContext.setConf("spark.sql.shuffle.partitions", "200")

    Utils.initializeDb(settings.cookedData, sqlContext)
    Utils.importCsvData(settings.rawData, sqlContext)
  }
} 
Developer: bitvector2 | Project: netalyzer-loader | Lines of code: 20 | Source: Main.scala


Example 8: Average

// Package name and imported dependency classes
package nl.techdays.bigdataprocessing.demo03

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{SQLContext, SaveMode}
import org.apache.spark.sql.hive.HiveContext

case class Average(dimension: String, average: Double)

object Program {
  def main(args: Array[String]) = {
    val conf = new SparkConf().setAppName("adl-sample-app")
    val sc = new SparkContext(conf)
    val sqlContext = new HiveContext(sc)

    import sqlContext.implicits._

    val measurements = sqlContext.sql("SELECT * FROM measurements")

    measurements
      .map(x => (x.getAs[String]("dimension"), x.getAs[Double]("value")))
      .reduceByKey((left, right) => (left + right) / 2)
      .map { case (dimension, average) => Average(dimension,average) }
      .toDF()
      .write.mode(SaveMode.Append).saveAsTable("averages")
  }
} 
Developer: wmeints | Project: techdays2016 | Lines of code: 27 | Source: Program.scala


Example 9: LiveConverter

// Package name and imported dependency classes
package com.paypal.risk.madmen20

import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}


object LiveConverter {

  def StringToDouble(s:String): Any = {
    try {
      return s.toDouble
    } catch {
      case ex: NumberFormatException => null
      case ex: NullPointerException =>  null
    }
  }

  def convertNvar(m: Map[String, Any]): Map[String, java.lang.Double] = {
    m.map {
      case (k, v: java.lang.Double) => (k,v)
      case (k, v: String) if v.isEmpty || v.toLowerCase == "null" => (k, null)
//      case (k, v: String) if v.isEmpty || v.toLowerCase == "null" || v.toUpperCase == "NO_DATA" => (k, null)
//      case (k, v: String) if ! """\d+\.\d+""".r.unapplySeq(v).isDefined => (k, "123321.0".toDouble)
//      case (k, v: String) if ! """\d+\.\d+""".r.unapplySeq(v).isDefined => (k, null)
      case (k, v: String) => (k, StringToDouble(v))
      case (k, null) => (k, null)
    }.asInstanceOf[Map[String, java.lang.Double]]
  }

  def main (args: Array[String]) {
    if(args.length != 1) {
      throw new RuntimeException("Expect 1 argument: <month:yyyy-MM|day:yyyy-MM-dd>")
    }

    var month = args.apply(0)
    month = month.replace("/","-")
    val sc = new SparkContext(new SparkConf().setAppName("Live Converter").setMaster("yarn-client"))
    @transient val hc = new HiveContext(sc)
    hc.udf.register("convertNvar", convertNvar _)

    val df_orig = hc.read.parquet("/apps/risk/det/madmen20/archive/source=live_with_nvar_type_issue/date=" + month + "*")
    val df_new = df_orig.selectExpr("mid", "source", "date", "meta", "convertNvar(nvar) as nvar", "cvar")
//    df_new.write.partitionBy("date").format("parquet").mode("error")
//      .save("/apps/risk/det/madmen20/bre/source=live_convert_" + month)
    df_new.write.format("parquet").mode(saveMode = "overwrite")
      .save("/apps/risk/det/madmen20/bre/source=live/" + month)

  }



} 
Developer: yanlzhang8936 | Project: madmen20 | Lines of code: 53 | Source: LiveConverter.scala


Example 10: Example

// Package name and imported dependency classes
package com.paypal.risk.madmen20.example

import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}


object Example {
  def splitArray (args: Array[String]) {
    args.map((arg: String) => println("One arg is: " + arg + "!"))
  }

  def main(args: Array[String]) {
    implicit val sc = new SparkContext(new SparkConf().setAppName("Example").setMaster("yarn-client"))
    implicit val hc = new HiveContext(sc)
//    implicit val sqlc = new SQLContext(sc)
    val df = hc.parquetFile("/apps/risk/det/madmen20/bre/source=live")
    df.printSchema()
    df.take(1).foreach(println)
    splitArray(args)

  }
} 
Developer: yanlzhang8936 | Project: madmen20 | Lines of code: 24 | Source: Example.scala


Example 11: SparkTermCandidatesWeighter

// Package name and imported dependency classes
package ru.ispras.atr.rank

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.functions.desc
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}
import ru.ispras.atr.datamodel.{DSDataset, TermCandidate}
import ru.ispras.atr.features.FeatureConfig


abstract class SparkTermCandidatesWeighter(docsToShow:Int) extends TermCandidatesWeighter {
  val termDFName = "Term"

  def allFeatures: Seq[FeatureConfig]

  def convert2FeatureSpace(candidates: Seq[TermCandidate], dataset: DSDataset):Seq[Seq[Double]] = {
    val resByFeatures: Seq[Seq[Double]] = allFeatures.map(f => {
      //iterate by features first, because it lets to estimate time per feature and (maybe) it is faster due to caching
      log.info(s"Initializing feature ${f.id}...")
      val featureComputer = f.build(candidates, dataset)
      log.info(s"Computing feature ${f.id}...")
      featureComputer.compute(candidates)
    })
    log.info(s"${allFeatures.size} features have been computed")
    resByFeatures.transpose
    }

  def convertToDF(termNames: Seq[String], featureNames: Seq[String], resByTerms: Seq[Seq[Double]]): DataFrame = {
    val header = StructField(termDFName, StringType) +: featureNames.map(f => StructField(f, DoubleType))
    val schema = StructType(header)
    val rows = termNames.zip(resByTerms).map(a => Row.fromSeq(a._1 +: a._2))
    val rowsRDD: RDD[Row] = SparkConfigs.sc.parallelize(rows)
    val df = SparkConfigs.sqlc.createDataFrame(rowsRDD, schema)
    df
  }

  def weightAndSort(candidates: Seq[TermCandidate], dataset: DSDataset): Iterable[(String, Double)] = {
    val featureValues = convert2FeatureSpace(candidates, dataset)
    val initDF = convertToDF(candidates.map(_.verboseRepr(docsToShow)), allFeatures.map(_.id), featureValues)
    val weightedDF = weight(initDF)
    val termNamesDF = weightedDF.select(termDFName,id).sort(desc(id))
    val weightColId: String = id //for serialization
    val termColId: String = termDFName
    val terms = termNamesDF.rdd.map(r => (r.getAs[String](termColId), r.getAs[Double](weightColId))).collect()
    terms
  }

  def weight(df: DataFrame) : DataFrame
}

object SparkConfigs {
  val sparkConf = new SparkConf()
    .setAppName("ATR Evaluation System")
    .setMaster("local[16]")
    .set("spark.driver.memory", "1g")
  val sc = new SparkContext(sparkConf)
  val sqlc = new HiveContext(sc)
} 
Developer: ispras | Project: atr4s | Lines of code: 61 | Source: SparkTermCandidatesWeighter.scala


Example 12: NamedContext

// Package name and imported dependency classes
package io.hydrosphere.mist.worker

import java.io.File

import io.hydrosphere.mist.api.{CentralLoggingConf, RuntimeJobInfo, SetupConfiguration}
import org.apache.spark.api.java.JavaSparkContext
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.streaming.Duration
import org.apache.spark.{SparkConf, SparkContext}

import scala.collection.mutable

class NamedContext(
  val sparkContext: SparkContext,
  val namespace: String,
  streamingDuration: Duration = Duration(40 * 1000),
  loggingConf: Option[CentralLoggingConf] = None
) {

  private val jars = mutable.Buffer.empty[String]

  def addJar(jarPath: String): Unit = {
    val jarAbsolutePath = new File(jarPath).getAbsolutePath
    if (!jars.contains(jarAbsolutePath)) {
      sparkContext.addJar(jarPath)
      jars += jarAbsolutePath
    }
  }

  def setupConfiguration(jobId: String): SetupConfiguration = {
    SetupConfiguration(
      context = sparkContext,
      streamingDuration = streamingDuration,
      info = RuntimeJobInfo(jobId, namespace),
      loggingConf = loggingConf
    )
  }

  //TODO: can we call that inside python directly using setupConfiguration?
  // python support
  def sparkConf: SparkConf = sparkContext.getConf

  // python support
  def javaContext: JavaSparkContext = new JavaSparkContext(sparkContext)

  // python support
  def sqlContext: SQLContext = new SQLContext(sparkContext)

  // python support
  def hiveContext: HiveContext = new HiveContext(sparkContext)

  def stop(): Unit = {
    sparkContext.stop()
  }

} 
Developer: Hydrospheredata | Project: mist | Lines of code: 58 | Source: NamedContext.scala


Example 13: HiveContextFactory

// Package name and imported dependency classes
package spark.jobserver.context

import com.typesafe.config.Config
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.hive.HiveContext
import spark.jobserver.{api, ContextLike, SparkHiveJob}
import spark.jobserver.util.SparkJobUtils

class HiveContextFactory extends ScalaContextFactory {
  type C = HiveContext with ContextLike

  def isValidJob(job: api.SparkJobBase): Boolean = job.isInstanceOf[SparkHiveJob]

  def makeContext(sparkConf: SparkConf, config: Config,  contextName: String): C = {
    contextFactory(sparkConf)
  }

  protected def contextFactory(conf: SparkConf): C = {
    new HiveContext(new SparkContext(conf)) with HiveContextLike
  }
}

private[jobserver] trait HiveContextLike extends ContextLike {
  def stop() { this.sparkContext.stop() }
} 
Developer: TruenoDB | Project: trueno-compute-server | Lines of code: 26 | Source: HiveContextFactory.scala


Example 14: HiveOperationTest

// Package name and imported dependency classes
package cn.com.warlock.sql

import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}

object HiveOperationTest {
  def main(args: Array[String]): Unit = {
    if (args.length < 1) {
      System.err.println("Usage: <inpath>")
      System.exit(1)
    }

    val inputFile = args(0)

    val conf = new SparkConf().setAppName("HiveOperationTest")
    val sc = new SparkContext(conf)
    val sqlContext = new HiveContext(sc)

    // create table
    sqlContext.sql("CREATE TABLE IF NOT EXISTS weather (date STRING, city STRING, minTem Int, maxTem Int) row format delimited fields terminated by '\t'")
    sqlContext.sql(s"LOAD DATA INPATH '${inputFile}' INTO TABLE weather")

    // Queries are expressed in HiveQL
    sqlContext.sql("select city, avg(minTem) from weather group by city").collect().foreach(println)

    // register a UDF
    sqlContext.udf.register("class", (s: Int) => if (s <= 20) "lower" else "high")

    sqlContext.sql("select city, maxTem, class(maxTem) from weather").collect().foreach(println)

    sc.stop()
  }
} 
Developer: warlock-china | Project: spark-meepo | Lines of code: 34 | Source: HiveOperationTest.scala


Example 15: RandomSampling

// Package name and imported dependency classes
package com.burness.algorithm.preprocess

import breeze.numerics.abs
import com.burness.utils.AbstractParams
import org.apache.spark.SparkContext
import org.apache.spark.sql.hive.HiveContext
import scopt.OptionParser


class RandomSampling(sc: SparkContext) {



  case class Params(samplingRatio: Double = 1.0,
                    inputTableName: String = null,
                    outputTableName: String = null)
    extends AbstractParams[Params]


  def parseParams(args: Array[String]): Params = {
    val defaultParams = Params()
    val parser = new OptionParser[Params]("RandomSampling") {
      head("Random Sampling Params parse")
      opt[String]("inputTableName")
        .text("data input path")
        .action((x, c) => c.copy(inputTableName = x))
      opt[String]("outputTableName")
        .text("data output path")
        .action((x, c) => c.copy(outputTableName = x))
      opt[Double]("samplingRatio")
        .text("random sampling ratio")
        .action((x, c) => c.copy(samplingRatio = x))
    }

      parser.parse(args, defaultParams) match {
        case Some(params) =>
          params
        case None =>
          defaultParams
      }

    }
  def run(params: Params): Unit ={
    val hiveContext = new HiveContext(sc)
    import hiveContext.implicits._
    import hiveContext.sql
    // Sample the input table without replacement at the configured ratio (e.g. a ratio of 0.7 keeps roughly 70% of the rows)
    val result = sql(s"select * from ${params.inputTableName}").sample(withReplacement = false, params.samplingRatio)
    val r = scala.util.Random
    r.setSeed(System.currentTimeMillis())
    val tempNum = abs(r.nextInt())
    val tempName = "random_"+tempNum.toString+"_sample_table"
    result.registerTempTable(tempName)
    sql(s"create table ${params.outputTableName} as select * from $tempName")

  }


} 
Developer: spark-mler | Project: algorithmEngine | Lines of code: 60 | Source: RandomSampling.scala


Example 16: LoadHive

// Package name and imported dependency classes
package com.git.huanghaifeng.spark.load

import org.apache.spark._
import org.apache.spark.sql.hive.HiveContext

object LoadHive {
    def main(args: Array[String]) {
        if (args.length < 2) {
            println("Usage: [sparkmaster] [tablename]")
            sys.exit(1)
        }
        val master = args(0)
        val tableName = args(1)
        val sc = new SparkContext(master, "LoadHive", System.getenv("SPARK_HOME"))
        val hiveCtx = new HiveContext(sc)
        //val input = hiveCtx.sql("FROM src SELECT key, value")
        val input = hiveCtx.sql("show tables")

        val data = input.map(_.getString(0)) // "show tables" returns string columns
        println(data.collect().toList)
    }
} 
Developer: prucehuang | Project: quickly-start-spark | Lines of code: 23 | Source: LoadHive.scala


Example 17: BuildDatabaseJob

// Package name and imported dependency classes
package com.gmail.katezer.bigdata.jobs

import com.gmail.katezer.bigdata.utils.MultipleUtils._
import org.apache.spark.SparkContext
import org.apache.spark.sql.hive.HiveContext


object BuildDatabaseJob extends Job[BuildDatabaseJobLogic]

class BuildDatabaseJobLogic extends SparkApp {

  def name = "IFRS9 - Build Database"
  override def date = "Not required for this job"

  def run(sc: SparkContext, hc: HiveContext): Unit = {
    info("BuildDatabaseJobLogic > run :: running this job!")

    // some database and table configurations
    hc.sql("SET PARQUET_COMPRESSION_CODEC=snappy") // Compression SNAPPY
    // creating tables and views
    executeAndIgnoreHiveError(hc, "DROP TABLE IF EXISTS accounts")
    hc.sql("CREATE TABLE accounts (account_reference STRING, exposure_class STRING, gross_carrying_amount DOUBLE" +
      ", currency STRING, business_unit STRING, company_code STRING, country STRING)" +
      " PARTITIONED BY (reporting_date STRING) STORED AS PARQUET")
    executeAndIgnoreHiveError(hc, "DROP TABLE IF EXISTS tenors")
    hc.sql("CREATE EXTERNAL TABLE tenors (account_reference STRING, tenor INT, lgd DOUBLE, ead DOUBLE, pd DOUBLE)" +
      " PARTITIONED BY (reporting_date STRING) STORED AS PARQUET")
    executeAndIgnoreHiveError(hc, "DROP TABLE IF EXISTS calculation")
    hc.sql("CREATE TABLE calculation (account_reference STRING, 12_month_expected_loss DOUBLE" +
      ", lifetime_expected_loss DOUBLE, impairment_charge DOUBLE)" +
      " PARTITIONED BY (reporting_date STRING) STORED AS PARQUET")
    executeAndIgnoreHiveError(hc, "DROP TABLE IF EXISTS reporting")
    hc.sql("CREATE TABLE reporting (account_reference INT, exposure_class STRING, gross_carrying_amount DOUBLE" +
      ", currency STRING, business_unit STRING, 12_month_expected_loss DOUBLE, lifetime_expected_loss DOUBLE" +
      ", impairment_charge DOUBLE, company_code STRING, country STRING)" +
      " PARTITIONED BY (reporting_date STRING) STORED AS PARQUET")
    executeAndIgnoreHiveError(hc, "DROP VIEW IF EXISTS reporting_layer")
    hc.sql("CREATE VIEW reporting_layer AS SELECT reporting_date, business_unit, 12_month_expected_loss" +
      ", lifetime_expected_loss, impairment_charge, company_code, country FROM reporting")

    info("BuildDatabaseJobLogic > run :: job ended!")
  }

} 
Developer: katezer | Project: spark-calculation-engine | Lines of code: 45 | Source: BuildDatabaseJob.scala


Example 18: run

// Package name and imported dependency classes
package com.gmail.katezer.bigdata

import com.gmail.katezer.bigdata.utils.HasLogger
import com.gmail.katezer.bigdata.utils.MultipleUtils.{getFirstArg, getSecondArg}
import org.apache.spark.SparkContext
import org.apache.spark.sql.hive.HiveContext


package object jobs {

  trait SparkApp extends JobApp {
    def run(sc: SparkContext, hc: HiveContext): Unit
    def run() = {}
  }

  trait JobApp extends HasLogger {
    def name: String
    def date: String = getFirstArg(args)
    def local: Boolean = getSecondArg(args)
    private var _args: Array[String] = _
    def args = _args
    def setArgs(args: Array[String]) = _args = args
    def run(): Unit
  }

} 
Developer: katezer | Project: spark-calculation-engine | Lines of code: 27 | Source: package.scala


Example 19: LoadJob

// Package name and imported dependency classes
package com.gmail.katezer.bigdata.jobs

import org.apache.spark.SparkContext
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types.{StructType}
import com.gmail.katezer.bigdata.utils.Schemas._


object LoadJob extends Job[LoadJobLogic]

class LoadJobLogic extends SparkApp {

  def name = "IFRS9 - Load"

  def run(sc: SparkContext, hc: HiveContext): Unit = {
    info("LoadJobLogic > run :: running this job!")

    // setup path to HDFS or local filesystem
    lazy val accountsCSVPath = this.local match {
      case true => getClass.getResource("/csv/accounts_" + date + ".csv").getPath
      case false => "spark-calc-data/accounts_" + date + ".csv"
    }
    lazy val tenorsCSVPath = this.local match {
      case true => getClass.getResource("/csv/tenors_" + date + ".csv").getPath
      case false => "spark-calc-data/tenors_" + date + ".csv"
    }

    // load the CSV to a DataFrame
    val accountsDF = csvToDF(hc, accountsCSVPath, accountSchema)
    val tenorsDF = csvToDF(hc, tenorsCSVPath, tenorSchema)

    // save DataFrame as temporal table and insert the data to the corresponding partition
    accountsDF.registerTempTable("accounts_tmp_" + date)
    tenorsDF.registerTempTable("tenors_tmp_" + date)
    hc.sql("INSERT OVERWRITE TABLE accounts PARTITION (reporting_date = '" + date + "') SELECT * FROM accounts_tmp_" + date)
    hc.sql("INSERT OVERWRITE TABLE tenors PARTITION (reporting_date = '" + date + "') SELECT * FROM tenors_tmp_" + date)

    info("LoadJobLogic > run :: job ended!")
  }

  def csvToDF(hc: HiveContext, csvPath: String, customSchema: StructType): DataFrame = hc
    .load(
      "com.databricks.spark.csv",
      schema = customSchema,
      Map("path" -> csvPath, "header" -> "true"))
    .toDF()
    .cache()
} 
Developer: katezer | Project: spark-calculation-engine | Lines of code: 50 | Source: LoadJob.scala


Example 20: MultipleUtils

// Package name and imported dependency classes
package com.gmail.katezer.bigdata.utils

import org.apache.hadoop.hive.metastore.api.NoSuchObjectException
import org.apache.hadoop.hive.ql.metadata.HiveException
import org.apache.spark.sql.execution.QueryExecutionException
import org.apache.spark.sql.hive.HiveContext


object MultipleUtils {

  def getFirstArg(args: Array[String]): String = args.isEmpty match {
    case true => throw new IllegalArgumentException("First argument on launching the job is required and must be the date of the file in YYYYMMDD format")
    case false => args.head
  }

  def getSecondArg(args: Array[String]): Boolean = args.isEmpty match {
    case true => throw new IllegalArgumentException("First argument on launching the job is required and must be the date of the file in YYYYMMDD format")
    case false =>
      args.tail.isEmpty match {
        case true => false
        case false => args.tail.head match {
          case "local" => true
          case _ => false
        }
      }
  }

  def executeAndIgnoreHiveError(hc: HiveContext, query: String): Unit = {
    try {
      hc.sql(query)
    } catch {
      case _ : QueryExecutionException | _ : HiveException | _: NoSuchObjectException => // expected, do nothing
      case e: Exception => throw e
    }
  }

} 
Developer: katezer | Project: spark-calculation-engine | Lines of code: 38 | Source: MultipleUtils.scala



Note: The org.apache.spark.sql.hive.HiveContext class examples in this article were compiled from source code and documentation platforms such as GitHub and MSDocs. The code snippets were selected from open-source projects contributed by various developers; copyright of the source code remains with the original authors. Please refer to each project's license before distributing or using the code, and do not reproduce this article without permission.

