This article collects typical usage examples of the Scala class org.apache.spark.sql.hive.HiveContext. If you are wondering what the HiveContext class does, how to use it, or simply want to see it in real code, the curated examples here should help.
Below are 20 code examples of the HiveContext class, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Scala code examples.
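Almost every example follows the same pattern: build a SparkContext, wrap it in a HiveContext, register a DataFrame as a temporary table, and query it with HiveQL. A minimal sketch of that shared pattern is shown below; the app name, input path, and table name are illustrative placeholders (Spark 1.x API).

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.hive.HiveContext

object HiveContextSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("HiveContextSketch"))
    val hiveContext = new HiveContext(sc)

    // load any data source into a DataFrame and expose it as a temp table
    val df = hiveContext.read.json("people.json") // hypothetical input file
    df.registerTempTable("people")

    // queries are written in HiveQL and return DataFrames
    hiveContext.sql("select name, count(*) from people group by name")
      .take(10)
      .foreach(println)

    sc.stop()
  }
}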
Example 1: KuduAccountMartSimpleSums
// Package declaration and required imports
package com.hadooparchitecturebook.taxi360.sql.kudu
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}
object KuduAccountMartSimpleSums {
def main(args: Array[String]): Unit = {
if (args.length == 0) {
println("Args: <runLocal> <kuduMaster> " +
"<kuduAccountMartTableName> ")
return
}
val runLocal = args(0).equalsIgnoreCase("l")
val kuduMaster = args(1)
val kuduAccountMartTableName = args(2)
val sc: SparkContext = if (runLocal) {
val sparkConfig = new SparkConf()
sparkConfig.set("spark.broadcast.compress", "false")
sparkConfig.set("spark.shuffle.compress", "false")
sparkConfig.set("spark.shuffle.spill.compress", "false")
new SparkContext("local", "TableStatsSinglePathMain", sparkConfig)
} else {
val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain")
new SparkContext(sparkConfig)
}
val hiveContext = new HiveContext(sc)
val kuduOptions = Map(
"kudu.table" -> kuduAccountMartTableName,
"kudu.master" -> kuduMaster)
hiveContext.read.options(kuduOptions).format("org.kududb.spark.kudu").load.
registerTempTable("account_mart_tmp")
println("------------")
val values = hiveContext.sql("select account_id, sum(win_count) from account_mart_tmp group by account_id").
take(100)
println("------------")
values.foreach(println)
println("------------")
sc.stop()
}
}
Author: hadooparchitecturebook, Project: Taxi360, Lines: 51, Source: KuduAccountMartSimpleSums.scala
Example 2: KuduAppEventSimpleSums
// Package declaration and required imports
package com.hadooparchitecturebook.taxi360.sql.kudu
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}
object KuduAppEventSimpleSums {
def main(args: Array[String]): Unit = {
if (args.length == 0) {
println("Args: <runLocal> <kuduMaster> " +
"<kuduAppEventTableName> ")
return
}
val runLocal = args(0).equalsIgnoreCase("l")
val kuduMaster = args(1)
val kuduAppEventTableName = args(2)
val sc: SparkContext = if (runLocal) {
val sparkConfig = new SparkConf()
sparkConfig.set("spark.broadcast.compress", "false")
sparkConfig.set("spark.shuffle.compress", "false")
sparkConfig.set("spark.shuffle.spill.compress", "false")
new SparkContext("local", "TableStatsSinglePathMain", sparkConfig)
} else {
val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain")
new SparkContext(sparkConfig)
}
val hiveContext = new HiveContext(sc)
val kuduOptions = Map(
"kudu.table" -> kuduAppEventTableName,
"kudu.master" -> kuduMaster)
hiveContext.read.options(kuduOptions).format("org.kududb.spark.kudu").load.
registerTempTable("app_event_tmp")
println("------------")
val values = hiveContext.sql("select account_id, sum(purchase) from app_event_tmp group by account_id").
take(100)
println("------------")
values.foreach(println)
println("------------")
sc.stop()
}
}
Author: hadooparchitecturebook, Project: Taxi360, Lines: 52, Source: KuduAppEventSimpleSums.scala
Example 3: KuduToHDFS
// Package declaration and required imports
package com.hadooparchitecturebook.taxi360.sql.kudu
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}
object KuduToHDFS {
def main(args: Array[String]): Unit = {
if (args.length == 0) {
println("Args: <runLocal> <kuduMaster> " +
"<kuduTaxiTripTableName> " +
"<hdfsTaxiTripTableName> " +
"<numOfCenters> " +
"<numOfIterations> ")
return
}
val runLocal = args(0).equalsIgnoreCase("l")
val kuduMaster = args(1)
val kuduTaxiTripTableName = args(2)
val hdfsTaxiTripTableName = args(3)
val sc: SparkContext = if (runLocal) {
val sparkConfig = new SparkConf()
sparkConfig.set("spark.broadcast.compress", "false")
sparkConfig.set("spark.shuffle.compress", "false")
sparkConfig.set("spark.shuffle.spill.compress", "false")
new SparkContext("local", "TableStatsSinglePathMain", sparkConfig)
} else {
val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain")
new SparkContext(sparkConfig)
}
val hiveContext = new HiveContext(sc)
val kuduOptions = Map(
"kudu.table" -> kuduTaxiTripTableName,
"kudu.master" -> kuduMaster)
hiveContext.read.options(kuduOptions).format("org.kududb.spark.kudu").load.
registerTempTable("kuduTaxiTripTableName")
hiveContext.sql("CREATE TABLE " + hdfsTaxiTripTableName + " " +
" AS SELECT * FROM kuduTaxiTripTableName " +
" STORED AS PARQUET")
sc.stop()
}
}
Author: hadooparchitecturebook, Project: Taxi360, Lines: 50, Source: KuduToHDFS.scala
Example 4: KuduAppEventSimpleSums
// Package declaration and required imports
package com.cloudera.sa.apptrans.sql.kudu
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}
object KuduAppEventSimpleSums {
def main(args: Array[String]): Unit = {
if (args.length == 0) {
println("Args: <runLocal> <kuduMaster> " +
"<kuduAppEventTableName> ")
return
}
val runLocal = args(0).equalsIgnoreCase("l")
val kuduMaster = args(1)
val kuduAppEventTableName = args(2)
val sc: SparkContext = if (runLocal) {
val sparkConfig = new SparkConf()
sparkConfig.set("spark.broadcast.compress", "false")
sparkConfig.set("spark.shuffle.compress", "false")
sparkConfig.set("spark.shuffle.spill.compress", "false")
new SparkContext("local", "TableStatsSinglePathMain", sparkConfig)
} else {
val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain")
new SparkContext(sparkConfig)
}
val hiveContext = new HiveContext(sc)
val kuduOptions = Map(
"kudu.table" -> kuduAppEventTableName,
"kudu.master" -> kuduMaster)
hiveContext.read.options(kuduOptions).format("org.kududb.spark.kudu").load.
registerTempTable("app_event_tmp")
println("------------")
val values = hiveContext.sql("select account_id, sum(purchase) from app_event_tmp group by account_id").
take(100)
println("------------")
values.foreach(println)
println("------------")
sc.stop()
}
}
Author: tmalaska, Project: AppTrans, Lines: 52, Source: KuduAppEventSimpleSums.scala
Example 5: KuduToHDFS
// Package declaration and required imports
package com.cloudera.sa.apptrans.sql.kudu
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}
object KuduToHDFS {
def main(args: Array[String]): Unit = {
if (args.length == 0) {
println("Args: <runLocal> <kuduMaster> " +
"<kuduAccountMartTableName> " +
"<hdfsAccountMartTableName> " +
"<numOfCenters> " +
"<numOfIterations> ")
return
}
val runLocal = args(0).equalsIgnoreCase("l")
val kuduMaster = args(1)
val kuduAccountMartTableName = args(2)
val hdfsAccountMartTableName = args(3)
val numOfCenters = args(4).toInt
val numOfIterations = args(5).toInt
val sc: SparkContext = if (runLocal) {
val sparkConfig = new SparkConf()
sparkConfig.set("spark.broadcast.compress", "false")
sparkConfig.set("spark.shuffle.compress", "false")
sparkConfig.set("spark.shuffle.spill.compress", "false")
new SparkContext("local", "TableStatsSinglePathMain", sparkConfig)
} else {
val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain")
new SparkContext(sparkConfig)
}
val hiveContext = new HiveContext(sc)
val kuduOptions = Map(
"kudu.table" -> kuduAccountMartTableName,
"kudu.master" -> kuduMaster)
hiveContext.read.options(kuduOptions).format("org.kududb.spark.kudu").load.
registerTempTable("account_mart_tmp")
hiveContext.sql("CREATE TABLE " + hdfsAccountMartTableName + " AS SELECT * FROM account_mart_tmp")
sc.stop()
}
}
Author: tmalaska, Project: AppTrans, Lines: 50, Source: KuduToHDFS.scala
Example 6: KuduAccountMartSimpleSums
// Package declaration and required imports
package com.cloudera.sa.apptrans.sql.kudu
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}
object KuduAccountMartSimpleSums {
def main(args: Array[String]): Unit = {
if (args.length == 0) {
println("Args: <runLocal> <kuduMaster> " +
"<kuduAccountMartTableName> ")
return
}
val runLocal = args(0).equalsIgnoreCase("l")
val kuduMaster = args(1)
val kuduAccountMartTableName = args(2)
val sc: SparkContext = if (runLocal) {
val sparkConfig = new SparkConf()
sparkConfig.set("spark.broadcast.compress", "false")
sparkConfig.set("spark.shuffle.compress", "false")
sparkConfig.set("spark.shuffle.spill.compress", "false")
new SparkContext("local", "TableStatsSinglePathMain", sparkConfig)
} else {
val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain")
new SparkContext(sparkConfig)
}
val hiveContext = new HiveContext(sc)
val kuduOptions = Map(
"kudu.table" -> kuduAccountMartTableName,
"kudu.master" -> kuduMaster)
hiveContext.read.options(kuduOptions).format("org.kududb.spark.kudu").load.
registerTempTable("account_mart_tmp")
println("------------")
val values = hiveContext.sql("select account_id, sum(win_count) from account_mart_tmp group by account_id").
take(100)
println("------------")
values.foreach(println)
println("------------")
sc.stop()
}
}
Author: tmalaska, Project: AppTrans, Lines: 51, Source: KuduAccountMartSimpleSums.scala
Example 7: Main
// Package declaration and required imports
package com.microsoft.netalyzer.loader
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}
object Main {
val settings = new Settings()
val conf = new SparkConf()
val sc = new SparkContext(conf)
val sqlContext = new HiveContext(sc)
def main(args: Array[String]): Unit = {
sqlContext.setConf("spark.sql.orc.filterPushdown", "true")
sqlContext.setConf("spark.sql.shuffle.partitions", "200")
Utils.initializeDb(settings.cookedData, sqlContext)
Utils.importCsvData(settings.rawData, sqlContext)
}
}
Author: bitvector2, Project: netalyzer-loader, Lines: 20, Source: Main.scala
Example 8: Average
// Package declaration and required imports
package nl.techdays.bigdataprocessing.demo03
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{SQLContext, SaveMode}
import org.apache.spark.sql.hive.HiveContext
case class Average(dimension: String, average: Double)
object Program {
def main(args: Array[String]) = {
val conf = new SparkConf().setAppName("adl-sample-app")
val sc = new SparkContext(conf)
val sqlContext = new HiveContext(sc)
import sqlContext.implicits._
val measurements = sqlContext.sql("SELECT * FROM measurements")
measurements
.map(x => (x.getAs[String]("dimension"), x.getAs[Double]("value")))
// accumulate (sum, count) per dimension so the result is a true mean
// rather than a pairwise-halved combination
.aggregateByKey((0.0, 0L))(
(acc, v) => (acc._1 + v, acc._2 + 1L),
(a, b) => (a._1 + b._1, a._2 + b._2))
.map { case (dimension, (sum, count)) => Average(dimension, sum / count) }
.toDF()
.write.mode(SaveMode.Append).saveAsTable("averages")
}
}
Author: wmeints, Project: techdays2016, Lines: 27, Source: Program.scala
Example 9: LiveConverter
// Package declaration and required imports
package com.paypal.risk.madmen20
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}
object LiveConverter {
def StringToDouble(s:String): Any = {
try {
return s.toDouble
} catch {
case ex: NumberFormatException => null
case ex: NullPointerException => null
}
}
def convertNvar(m: Map[String, Any]): Map[String, java.lang.Double] = {
m.map {
case (k, v: java.lang.Double) => (k,v)
case (k, v: String) if v.isEmpty || v.toLowerCase == "null" => (k, null)
// case (k, v: String) if v.isEmpty || v.toLowerCase == "null" || v.toUpperCase == "NO_DATA" => (k, null)
// case (k, v: String) if ! """\d+\.\d+""".r.unapplySeq(v).isDefined => (k, "123321.0".toDouble)
// case (k, v: String) if ! """\d+\.\d+""".r.unapplySeq(v).isDefined => (k, null)
case (k, v: String) => (k, StringToDouble(v))
case (k, null) => (k, null)
}.asInstanceOf[Map[String, java.lang.Double]]
}
def main (args: Array[String]) {
if(args.length != 1) {
throw new RuntimeException("Expect 1 argument: <month:yyyy-MM|day:yyyy-MM-dd>")
}
var month = args.apply(0)
month = month.replace("/","-")
val sc = new SparkContext(new SparkConf().setAppName("Live Converter").setMaster("yarn-client"))
@transient val hc = new HiveContext(sc)
hc.udf.register("convertNvar", convertNvar _)
val df_orig = hc.read.parquet("/apps/risk/det/madmen20/archive/source=live_with_nvar_type_issue/date=" + month + "*")
val df_new = df_orig.selectExpr("mid", "source", "date", "meta", "convertNvar(nvar) as nvar", "cvar")
// df_new.write.partitionBy("date").format("parquet").mode("error")
// .save("/apps/risk/det/madmen20/bre/source=live_convert_" + month)
df_new.write.format("parquet").mode(saveMode = "overwrite")
.save("/apps/risk/det/madmen20/bre/source=live/" + month)
}
}
Author: yanlzhang8936, Project: madmen20, Lines: 53, Source: LiveConverter.scala
Example 10: Example
// Package declaration and required imports
package com.paypal.risk.madmen20.example
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}
object Example {
def splitArray (args: Array[String]) {
args.map((arg: String) => println("One arg is: " + arg + "!"))
}
def main(args: Array[String]) {
implicit val sc = new SparkContext(new SparkConf().setAppName("Example").setMaster("yarn-client"))
implicit val hc = new HiveContext(sc)
// implicit val sqlc = new SQLContext(sc)
val df = hc.parquetFile("/apps/risk/det/madmen20/bre/source=live")
df.printSchema()
df.take(1).foreach(println)
splitArray(args)
}
}
Author: yanlzhang8936, Project: madmen20, Lines: 24, Source: Example.scala
Example 11: SparkTermCandidatesWeighter
// Package declaration and required imports
package ru.ispras.atr.rank
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.functions.desc
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}
import ru.ispras.atr.datamodel.{DSDataset, TermCandidate}
import ru.ispras.atr.features.FeatureConfig
abstract class SparkTermCandidatesWeighter(docsToShow:Int) extends TermCandidatesWeighter {
val termDFName = "Term"
def allFeatures: Seq[FeatureConfig]
def convert2FeatureSpace(candidates: Seq[TermCandidate], dataset: DSDataset):Seq[Seq[Double]] = {
val resByFeatures: Seq[Seq[Double]] = allFeatures.map(f => {
//iterate by features first, because it lets to estimate time per feature and (maybe) it is faster due to caching
log.info(s"Initializing feature ${f.id}...")
val featureComputer = f.build(candidates, dataset)
log.info(s"Computing feature ${f.id}...")
featureComputer.compute(candidates)
})
log.info(s"${allFeatures.size} features have been computed")
resByFeatures.transpose
}
def convertToDF(termNames: Seq[String], featureNames: Seq[String], resByTerms: Seq[Seq[Double]]): DataFrame = {
val header = StructField(termDFName, StringType) +: featureNames.map(f => StructField(f, DoubleType))
val schema = StructType(header)
val rows = termNames.zip(resByTerms).map(a => Row.fromSeq(a._1 +: a._2))
val rowsRDD: RDD[Row] = SparkConfigs.sc.parallelize(rows)
val df = SparkConfigs.sqlc.createDataFrame(rowsRDD, schema)
df
}
def weightAndSort(candidates: Seq[TermCandidate], dataset: DSDataset): Iterable[(String, Double)] = {
val featureValues = convert2FeatureSpace(candidates, dataset)
val initDF = convertToDF(candidates.map(_.verboseRepr(docsToShow)), allFeatures.map(_.id), featureValues)
val weightedDF = weight(initDF)
val termNamesDF = weightedDF.select(termDFName,id).sort(desc(id))
val weightColId: String = id //for serialization
val termColId: String = termDFName
val terms = termNamesDF.rdd.map(r => (r.getAs[String](termColId), r.getAs[Double](weightColId))).collect()
terms
}
def weight(df: DataFrame) : DataFrame
}
object SparkConfigs {
val sparkConf = new SparkConf()
.setAppName("ATR Evaluation System")
.setMaster("local[16]")
.set("spark.driver.memory", "1g")
val sc = new SparkContext(sparkConf)
val sqlc = new HiveContext(sc)
}
Author: ispras, Project: atr4s, Lines: 61, Source: SparkTermCandidatesWeighter.scala
Example 12: NamedContext
// Package declaration and required imports
package io.hydrosphere.mist.worker
import java.io.File
import io.hydrosphere.mist.api.{CentralLoggingConf, RuntimeJobInfo, SetupConfiguration}
import org.apache.spark.api.java.JavaSparkContext
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.streaming.Duration
import org.apache.spark.{SparkConf, SparkContext}
import scala.collection.mutable
class NamedContext(
val sparkContext: SparkContext,
val namespace: String,
streamingDuration: Duration = Duration(40 * 1000),
loggingConf: Option[CentralLoggingConf] = None
) {
private val jars = mutable.Buffer.empty[String]
def addJar(jarPath: String): Unit = {
val jarAbsolutePath = new File(jarPath).getAbsolutePath
if (!jars.contains(jarAbsolutePath)) {
sparkContext.addJar(jarPath)
jars += jarAbsolutePath
}
}
def setupConfiguration(jobId: String): SetupConfiguration = {
SetupConfiguration(
context = sparkContext,
streamingDuration = streamingDuration,
info = RuntimeJobInfo(jobId, namespace),
loggingConf = loggingConf
)
}
//TODO: can we call that inside python directly using setupConfiguration?
// python support
def sparkConf: SparkConf = sparkContext.getConf
// python support
def javaContext: JavaSparkContext = new JavaSparkContext(sparkContext)
// python support
def sqlContext: SQLContext = new SQLContext(sparkContext)
// python support
def hiveContext: HiveContext = new HiveContext(sparkContext)
def stop(): Unit = {
sparkContext.stop()
}
}
Author: Hydrospheredata, Project: mist, Lines: 58, Source: NamedContext.scala
Example 13: HiveContextFactory
// Package declaration and required imports
package spark.jobserver.context
import com.typesafe.config.Config
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.hive.HiveContext
import spark.jobserver.{api, ContextLike, SparkHiveJob}
import spark.jobserver.util.SparkJobUtils
class HiveContextFactory extends ScalaContextFactory {
type C = HiveContext with ContextLike
def isValidJob(job: api.SparkJobBase): Boolean = job.isInstanceOf[SparkHiveJob]
def makeContext(sparkConf: SparkConf, config: Config, contextName: String): C = {
contextFactory(sparkConf)
}
protected def contextFactory(conf: SparkConf): C = {
new HiveContext(new SparkContext(conf)) with HiveContextLike
}
}
private[jobserver] trait HiveContextLike extends ContextLike {
def stop() { this.sparkContext.stop() }
}
Author: TruenoDB, Project: trueno-compute-server, Lines: 26, Source: HiveContextFactory.scala
Example 14: HiveOperationTest
// Package declaration and required imports
package cn.com.warlock.sql
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}
object HiveOperationTest {
def main(args: Array[String]): Unit = {
if (args.length < 1) {
System.err.println("Usage: <inpath>")
System.exit(1)
}
val inputFile = args(0)
val conf = new SparkConf().setAppName("HiveOperationTest")
val sc = new SparkContext(conf)
val sqlContext = new HiveContext(sc)
// create table
sqlContext.sql("CREATE TABLE IF NOT EXISTS weather (date STRING, city STRING, minTem Int, maxTem Int) row format delimited fields terminated by '\t'")
sqlContext.sql(s"LOAD DATA INPATH '${inputFile}' INTO TABLE weather")
// Queries are expressed in HiveQL
sqlContext.sql("select city, avg(minTem) from weather group by city").collect().foreach(println)
// register a UDF that buckets the max temperature into "lower" or "high"
sqlContext.udf.register("class", (s: Int) => if (s <= 20) "lower" else "high")
sqlContext.sql("select city, maxTem, class(maxTem) from weather").collect().foreach(println)
sc.stop()
}
}
Author: warlock-china, Project: spark-meepo, Lines: 34, Source: HiveOperationTest.scala
Example 15: RandomSampling
// Package declaration and required imports
package com.burness.algorithm.preprocess
import breeze.numerics.abs
import com.burness.utils.AbstractParams
import org.apache.spark.SparkContext
import org.apache.spark.sql.hive.HiveContext
import scopt.OptionParser
class RandomSampling(sc: SparkContext) {
case class Params(samplingRatio: Double =1.0,
inputTableName: String = null,
outputTableName: String = null)
extends AbstractParams[Params]
def parseParams(args: Array[String]): Params = {
val defaultParams = Params()
val parser = new OptionParser[Params]("RandomSampling") {
head("Random Sampling Params parse")
opt[String]("inputTableName")
.text("data input path")
.action((x, c) => c.copy(inputTableName = x))
opt[String]("outputTableName")
.text("data output path")
.action((x, c) => c.copy(outputTableName = x))
opt[Double]("samplingRatio")
.text("random sampling ratio")
.action((x, c) => c.copy(samplingRatio = x))
}
parser.parse(args, defaultParams) match {
case Some(params) =>
params
case None =>
defaultParams
}
}
def run(params: Params): Unit ={
val hiveContext = new HiveContext(sc)
import hiveContext.implicits._
import hiveContext.sql
// sample the input table without replacement at the configured ratio (e.g. 0.7 keeps roughly 70% of the rows)
val result = sql(s"select * from ${params.inputTableName}").sample(withReplacement = false, params.samplingRatio)
val r = scala.util.Random
r.setSeed(System.currentTimeMillis())
val tempNum = abs(r.nextInt())
val tempName = "random_"+tempNum.toString+"_sample_table"
result.registerTempTable(tempName)
sql(s"create table ${params.outputTableName} as select * from $tempName")
}
}
Author: spark-mler, Project: algorithmEngine, Lines: 60, Source: RandomSampling.scala
Example 16: LoadHive
// Package declaration and required imports
package com.git.huanghaifeng.spark.load
import org.apache.spark._
import org.apache.spark.sql.hive.HiveContext
object LoadHive {
def main(args: Array[String]) {
if (args.length < 2) {
println("Usage: [sparkmaster] [tablename]")
sys.exit(1)
}
val master = args(0)
val tableName = args(1)
val sc = new SparkContext(master, "LoadHive", System.getenv("SPARK_HOME"))
val hiveCtx = new HiveContext(sc)
//val input = hiveCtx.sql("FROM src SELECT key, value")
val input = hiveCtx.sql("show tables")
val data = input.map(_.getString(0)) // "show tables" returns table names as strings
println(data.collect().toList)
}
}
Author: prucehuang, Project: quickly-start-spark, Lines: 23, Source: LoadHive.scala
Example 17: BuildDatabaseJob
// Package declaration and required imports
package com.gmail.katezer.bigdata.jobs
import com.gmail.katezer.bigdata.utils.MultipleUtils._
import org.apache.spark.SparkContext
import org.apache.spark.sql.hive.HiveContext
object BuildDatabaseJob extends Job[BuildDatabaseJobLogic]
class BuildDatabaseJobLogic extends SparkApp {
def name = "IFRS9 - Build Database"
override def date = "Not required for this job"
def run(sc: SparkContext, hc: HiveContext): Unit = {
info("BuildDatabaseJobLogic > run :: running this job!")
// some database and table configurations
hc.sql("SET PARQUET_COMPRESSION_CODEC=snappy") // Compression SNAPPY
// creating tables and views
executeAndIgnoreHiveError(hc, "DROP TABLE IF EXISTS accounts")
hc.sql("CREATE TABLE accounts (account_reference STRING, exposure_class STRING, gross_carrying_amount DOUBLE" +
", currency STRING, business_unit STRING, company_code STRING, country STRING)" +
" PARTITIONED BY (reporting_date STRING) STORED AS PARQUET")
executeAndIgnoreHiveError(hc, "DROP TABLE IF EXISTS tenors")
hc.sql("CREATE EXTERNAL TABLE tenors (account_reference STRING, tenor INT, lgd DOUBLE, ead DOUBLE, pd DOUBLE)" +
" PARTITIONED BY (reporting_date STRING) STORED AS PARQUET")
executeAndIgnoreHiveError(hc, "DROP TABLE IF EXISTS calculation")
hc.sql("CREATE TABLE calculation (account_reference STRING, 12_month_expected_loss DOUBLE" +
", lifetime_expected_loss DOUBLE, impairment_charge DOUBLE)" +
" PARTITIONED BY (reporting_date STRING) STORED AS PARQUET")
executeAndIgnoreHiveError(hc, "DROP TABLE IF EXISTS reporting")
hc.sql("CREATE TABLE reporting (account_reference INT, exposure_class STRING, gross_carrying_amount DOUBLE" +
", currency STRING, business_unit STRING, 12_month_expected_loss DOUBLE, lifetime_expected_loss DOUBLE" +
", impairment_charge DOUBLE, company_code STRING, country STRING)" +
" PARTITIONED BY (reporting_date STRING) STORED AS PARQUET")
executeAndIgnoreHiveError(hc, "DROP VIEW IF EXISTS reporting_layer")
hc.sql("CREATE VIEW reporting_layer AS SELECT reporting_date, business_unit, 12_month_expected_loss" +
", lifetime_expected_loss, impairment_charge, company_code, country FROM reporting")
info("BuildDatabaseJobLogic > run :: job ended!")
}
}
Author: katezer, Project: spark-calculation-engine, Lines: 45, Source: BuildDatabaseJob.scala
Example 18: run
// Package declaration and required imports
package com.gmail.katezer.bigdata
import com.gmail.katezer.bigdata.utils.HasLogger
import com.gmail.katezer.bigdata.utils.MultipleUtils.{getFirstArg, getSecondArg}
import org.apache.spark.SparkContext
import org.apache.spark.sql.hive.HiveContext
package object jobs {
trait SparkApp extends JobApp {
def run(sc: SparkContext, hc: HiveContext): Unit
def run() = {}
}
trait JobApp extends HasLogger {
def name: String
def date: String = getFirstArg(args)
def local: Boolean = getSecondArg(args)
private var _args: Array[String] = _
def args = _args
def setArgs(args: Array[String]) = _args = args
def run(): Unit
}
}
Author: katezer, Project: spark-calculation-engine, Lines: 27, Source: package.scala
Example 19: LoadJob
// Package declaration and required imports
package com.gmail.katezer.bigdata.jobs
import org.apache.spark.SparkContext
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types.{StructType}
import com.gmail.katezer.bigdata.utils.Schemas._
object LoadJob extends Job[LoadJobLogic]
class LoadJobLogic extends SparkApp {
def name = "IFRS9 - Load"
def run(sc: SparkContext, hc: HiveContext): Unit = {
info("LoadJobLogic > run :: running this job!")
// setup path to HDFS or local filesystem
lazy val accountsCSVPath = this.local match {
case true => getClass.getResource("/csv/accounts_" + date + ".csv").getPath
case false => "spark-calc-data/accounts_" + date + ".csv"
}
lazy val tenorsCSVPath = this.local match {
case true => getClass.getResource("/csv/tenors_" + date + ".csv").getPath
case false => "spark-calc-data/tenors_" + date + ".csv"
}
// load the CSV to a DataFrame
val accountsDF = csvToDF(hc, accountsCSVPath, accountSchema)
val tenorsDF = csvToDF(hc, tenorsCSVPath, tenorSchema)
// save DataFrame as temporal table and insert the data to the corresponding partition
accountsDF.registerTempTable("accounts_tmp_" + date)
tenorsDF.registerTempTable("tenors_tmp_" + date)
hc.sql("INSERT OVERWRITE TABLE accounts PARTITION (reporting_date = '" + date + "') SELECT * FROM accounts_tmp_" + date)
hc.sql("INSERT OVERWRITE TABLE tenors PARTITION (reporting_date = '" + date + "') SELECT * FROM tenors_tmp_" + date)
info("LoadJobLogic > run :: job ended!")
}
def csvToDF(hc: HiveContext, csvPath: String, customSchema: StructType): DataFrame = hc
.load(
"com.databricks.spark.csv",
schema = customSchema,
Map("path" -> csvPath, "header" -> "true"))
.toDF()
.cache()
}
Author: katezer, Project: spark-calculation-engine, Lines: 50, Source: LoadJob.scala
Example 20: MultipleUtils
// Package declaration and required imports
package com.gmail.katezer.bigdata.utils
import org.apache.hadoop.hive.metastore.api.NoSuchObjectException
import org.apache.hadoop.hive.ql.metadata.HiveException
import org.apache.spark.sql.execution.QueryExecutionException
import org.apache.spark.sql.hive.HiveContext
object MultipleUtils {
def getFirstArg(args: Array[String]): String = args.isEmpty match {
case true => throw new IllegalArgumentException("First argument on launching the job is required and must be the date of the file in YYYYMMDD format")
case false => args.head
}
def getSecondArg(args: Array[String]): Boolean = args.isEmpty match {
case true => throw new IllegalArgumentException("First argument on launching the job is required and must be the date of the file in YYYYMMDD format")
case false =>
args.tail.isEmpty match {
case true => false
case false => args.tail.head match {
case "local" => true
case _ => false
}
}
}
def executeAndIgnoreHiveError(hc: HiveContext, query: String): Unit = {
try {
hc.sql(query)
} catch {
case _ : QueryExecutionException | _ : HiveException | _: NoSuchObjectException => // expected, do nothing
case e: Exception => throw e
}
}
}
Author: katezer, Project: spark-calculation-engine, Lines: 38, Source: MultipleUtils.scala
Note: the org.apache.spark.sql.hive.HiveContext examples in this article were collected from GitHub, MSDocs, and other source-code and documentation platforms. The snippets come from open-source projects contributed by many developers; copyright of the source code remains with the original authors, and distribution and use are governed by each project's License. Do not republish without permission.