本文整理汇总了Scala中org.apache.spark.sql.DataFrame类的典型用法代码示例。如果您正苦于以下问题:Scala DataFrame类的具体用法?Scala DataFrame怎么用?Scala DataFrame使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了DataFrame类的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Scala代码示例。
示例1: LRCV
//设置package包名称以及导入依赖的类
package com.ferhtaydn.rater
import org.apache.spark.SparkContext
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.feature.{ StringIndexerModel, VectorAssembler }
import org.apache.spark.ml.tuning.{ CrossValidator, CrossValidatorModel, ParamGridBuilder }
import org.apache.spark.mllib.linalg.Matrix
import org.apache.spark.sql.{ DataFrame, Row, SQLContext }
/**
 * Logistic-regression pipeline whose regularisation parameter is chosen by 10-fold cross validation.
 *
 * NOTE(review): `standardScaler`, `stringIndexer`, `printBinaryMetrics` and `confusionMatrix` are
 * not defined in this snippet; presumably they come from the enclosing package -- confirm.
 *
 * @param sc Spark context used to create the implicit SQLContext
 */
class LRCV(sc: SparkContext) {

  implicit val sqlContext = new SQLContext(sc)

  // The classifier reads the "scaledFeatures" column, not the raw assembled "features" column.
  val lr = new LogisticRegression().setMaxIter(10).setFeaturesCol("scaledFeatures")

  // Candidate regularisation strengths tried by the cross validator.
  val paramGrid = new ParamGridBuilder()
    .addGrid(lr.regParam, Array(0.1, 0.01))
    .build()

  val assembler = new VectorAssembler()
    .setInputCols(Array("gender", "age", "weight", "height", "indexedJob"))
    .setOutputCol("features")

  val pipeline = new Pipeline()
    .setStages(Array(assembler, standardScaler("features"), lr))

  val cv = new CrossValidator()
    .setEstimator(pipeline)
    .setEvaluator(new BinaryClassificationEvaluator)
    .setEstimatorParamMaps(paramGrid)
    .setNumFolds(10)

  /**
   * Indexes the "job" column, fits the cross-validated pipeline on an 80% split and
   * scores the remaining 20%.
   *
   * @param df input rows; must contain the columns consumed by the assembler plus "job" and "label"
   * @return the fitted string indexer, the fitted cross-validator model and the matrix
   *         produced by `confusionMatrix` for the held-out predictions
   */
  def train(df: DataFrame): (StringIndexerModel, CrossValidatorModel, Matrix) = {
    // need to index strings on all data to not missing the job fields.
    // other alternative can be manually assign values for each job like gender.
    val indexerModel = stringIndexer("job").fit(df)
    val indexed = indexerModel.transform(df)

    val splits = indexed.randomSplit(Array(0.8, 0.2))
    val training = splits(0).cache()
    val test = splits(1)

    val cvModel = cv.fit(training)

    val predictionAndLabels = cvModel
      .transform(test)
      .select("label", "prediction").map {
        // The scraped source had a garbled '?' here; the case arrow is required for this to compile.
        case Row(label: Double, prediction: Double) =>
          (prediction, label)
      }

    printBinaryMetrics(predictionAndLabels)

    (indexerModel, cvModel, confusionMatrix(predictionAndLabels))
  }
}
开发者ID:ferhtaydn,项目名称:canceRater,代码行数:62,代码来源:LRCV.scala
示例2: TikaLanguageAggregationExample
//设置package包名称以及导入依赖的类
package com.jasonfeist.spark.tika.example
import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.{SparkConf, SparkContext}
/** Counts the documents under the path given as the first argument, grouped by "Language". */
object TikaLanguageAggregationExample {

  /**
   * Entry point. Does nothing when no input path is supplied.
   *
   * @param args command-line arguments; `args(0)` is the path handed to the Tika data source
   */
  def main(args: Array[String]): Unit = {
    val hasInputPath = args.length != 0 && args(0) != null
    if (hasInputPath) {
      val sparkConf = new SparkConf().setAppName("Tika Language Aggregation Example")
      val sparkContext: SparkContext = new SparkContext(sparkConf)
      val sql: SQLContext = new SQLContext(sparkContext)

      // Load through the custom Tika format, then aggregate per language.
      val documents = sql.read
        .format("com.jasonfeist.spark.tika")
        .load(args(0))
      val languageCounts: DataFrame = documents.groupBy("Language").count()

      languageCounts.show
    }
  }
}
开发者ID:jasonfeist,项目名称:tika-spark-datasource,代码行数:24,代码来源:TikaLanguageAggregationExample.scala
示例3: LinearRegressionPipeline
//设置package包名称以及导入依赖的类
package org.sparksamples.regression.bikesharing
import org.apache.log4j.Logger
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.{VectorAssembler, VectorIndexer}
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.sql.{DataFrame, SparkSession}
/** Two linear-regression walkthroughs: a feature pipeline over a DataFrame and a libsvm-file variant. */
object LinearRegressionPipeline {

  @transient lazy val logger = Logger.getLogger(getClass.getName)

  /**
   * Fits assembler -> indexer -> linear regression on 80% of `dataFrame` and prints the RMSE
   * measured on the remaining 20%.
   *
   * @param vectorAssembler stage that builds the feature vector
   * @param vectorIndexer   stage that indexes the feature vector
   * @param dataFrame       input data; the regression reads "features" and "label"
   */
  def linearRegressionWithVectorFormat(vectorAssembler: VectorAssembler, vectorIndexer: VectorIndexer, dataFrame: DataFrame): Unit = {
    val regression = new LinearRegression()
      .setFeaturesCol("features")
      .setLabelCol("label")
      .setRegParam(0.1)
      .setElasticNetParam(1.0)
      .setMaxIter(10)

    val pipeline = new Pipeline().setStages(Array(vectorAssembler, vectorIndexer, regression))

    // Fixed seed keeps the train/test split reproducible.
    val splits = dataFrame.randomSplit(Array(0.8, 0.2), seed = 12345)
    val trainingSet = splits(0)
    val testSet = splits(1)

    val fitted = pipeline.fit(trainingSet)

    // Cached because the scored frame is read twice below.
    val scored = fitted.transform(testSet).cache()
    val predicted = scored.select("prediction").rdd.map(row => row.getDouble(0))
    val actual = scored.select("label").rdd.map(row => row.getDouble(0))

    val metrics = new RegressionMetrics(predicted.zip(actual))
    val RMSE = metrics.rootMeanSquaredError
    println(s"  Root mean squared error (RMSE): $RMSE")
  }

  /**
   * Trains a linear regression on the bundled libsvm file and prints the coefficients
   * and the training summary.
   *
   * @param spark session used to read the libsvm data
   */
  def linearRegressionWithSVMFormat(spark: SparkSession): Unit = {
    // Load training data
    val trainingData = spark.read
      .format("libsvm")
      .load("./src/main/scala/org/sparksamples/regression/dataset/BikeSharing/lsvmHours.txt")

    val regression = new LinearRegression()
      .setMaxIter(10)
      .setRegParam(0.3)
      .setElasticNetParam(0.8)

    // Fit the model
    val fittedModel = regression.fit(trainingData)

    // Print the coefficients and intercept for linear regression
    println(s"Coefficients: ${fittedModel.coefficients} Intercept: ${fittedModel.intercept}")

    // Summarize the model over the training set and print out some metrics
    val summary = fittedModel.summary
    println(s"numIterations: ${summary.totalIterations}")
    println(s"objectiveHistory: ${summary.objectiveHistory.toList}")
    summary.residuals.show()
    println(s"RMSE: ${summary.rootMeanSquaredError}")
    println(s"r2: ${summary.r2}")
  }
}
开发者ID:PacktPublishing,项目名称:Machine-Learning-with-Spark-Second-Edition,代码行数:61,代码来源:LinearRegressionPipeline.scala
示例4: SimpleApp
//设置package包名称以及导入依赖的类
import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import org.apache.spark.sql.{DataFrame, SQLContext}
/** Reads messages from MySQL, counts distinct-per-message words and writes the counts back. */
object SimpleApp {

  val url = "jdbc:mysql://bigdata-master:3306/nlp"
  val driver = "com.mysql.jdbc.Driver"
  // Credentials are taken from the environment.
  val user = System.getenv("MYSQL_USERNAME")
  val pwd = System.getenv("MYSQL_PASSWORD")

  /**
   * Loads table "msg", explodes each message into cleaned words, prints the number of
   * distinct words and saves the word counts to "msg_word_count".
   */
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("Simple Application")
    val sparkContext = new SparkContext(sparkConf)
    val sqlContext = new SQLContext(sparkContext)

    val messages = sqlContext.read
      .format("jdbc")
      .option("url", url)
      .option("driver", driver)
      .option("dbtable", "msg")
      .option("user", user)
      .option("password", pwd)
      .load()
    messages.registerTempTable("t_msg")

    val msgDF = sqlContext.sql("select message from t_msg")
    msgDF.printSchema()

    // Lower-case, split on single spaces, strip non-alphanumerics, de-duplicate within the message.
    val cleaner = (msg: String) =>
      msg.toLowerCase.split(" ").map(_.replaceAll("[^a-zA-Z0-9]", "")).distinct

    val wordDF = msgDF.explode("message", "word")((r: String) => cleaner(r))
    wordDF.registerTempTable("words")

    val wordCount = sqlContext.sql("select word, count(1) as cnt from words group by word order by cnt desc")
    println(wordCount.count())

    save(wordCount, "msg_word_count")
  }

  /**
   * Writes `dataFrame` to `table` over JDBC using the connection settings of this object.
   *
   * @param dataFrame rows to persist
   * @param table     destination table name
   */
  def save(dataFrame: DataFrame, table: String): Unit = {
    val connectionProps = new java.util.Properties()
    connectionProps.setProperty("user", user)
    connectionProps.setProperty("password", pwd)
    connectionProps.setProperty("driver", driver)

    // create and save in table
    dataFrame.write.jdbc(url, table, connectionProps)
  }
}
开发者ID:mykumar,项目名称:SparkScalaInternalExperiements,代码行数:50,代码来源:SimpleApp.scala
示例5: VeChallengeIngest
//设置package包名称以及导入依赖的类
package io.github.adrianulbona.ve
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.streaming.twitter.TwitterUtils
import org.apache.spark.streaming.{Minutes, Seconds, StreamingContext}
import twitter4j.{GeoLocation, Place, Status}
/** Streams tweets matching "challenge", prints per-country counts and stores each batch as Parquet. */
object VeChallengeIngest {

  case class Location(latitude: Double, longitude: Double)

  case class Tweet(time: Long, text: String, user: String, isRetweet: Boolean, country: String, location: Location)

  /** Starts the streaming job with 2-minute batches and blocks until it terminates. */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder
      .master("local[*]")
      .appName("ve-challenge")
      .getOrCreate()

    import spark.sqlContext.implicits._

    val ssc = new StreamingContext(spark.sparkContext, Minutes(2))
    val tweets = TwitterUtils.createStream(ssc, None, Seq("challenge"))
      .map(extract)
      .map(normalize)

    tweets.foreachRDD { (batch, time) =>
      // Cached because the batch is both aggregated and written out.
      val batchDF: DataFrame = batch.toDF.cache
      val perCountry = batchDF.groupBy($"country").count().toDF("country", "count")
      perCountry.orderBy($"count".desc).show(6)
      batchDF.coalesce(1).write.parquet("tweets/batch=" + time.milliseconds)
      batchDF.unpersist()
    }

    ssc.start()
    ssc.awaitTermination()

    spark.stop()
  }

  /**
   * Pulls the fields of interest out of a twitter4j status; place and geolocation
   * are wrapped in Option because they may be null.
   */
  def extract(status: Status): (Long, String, String, Boolean, Option[Place], Option[GeoLocation]) = {
    val createdAt = status.getCreatedAt.getTime
    val userName = status.getUser.getName
    (createdAt, status.getText, userName, status.isRetweet, Option(status.getPlace), Option(status.getGeoLocation))
  }

  /**
   * Converts an extracted tuple into a Tweet. A missing place becomes country "unknown";
   * a missing geolocation becomes Location(NaN, NaN).
   */
  def normalize(extract: (Long, String, String, Boolean, Option[Place], Option[GeoLocation])): Tweet = {
    val (time, text, user, isRetweet, place, geoLocation) = extract
    val country = place.map(_.getCountryCode).getOrElse("unknown")
    val location = geoLocation
      .map(geo => Location(geo.getLatitude, geo.getLongitude))
      .getOrElse(Location(Double.NaN, Double.NaN))
    Tweet(time, text, user, isRetweet, country, location)
  }
}
开发者ID:adrianulbona,项目名称:ve-challenge,代码行数:60,代码来源:VeChallengeIngest.scala
示例6: BasicMetricsCalculator
//设置package包名称以及导入依赖的类
import org.joda.time._
import org.apache.spark.sql.{DataFrame, SparkSession}
/** Computes CAGR, total return and the share of non-negative return periods for a price series. */
object BasicMetricsCalculator {

  /**
   * @param sparkSession session used to run the SQL aggregations
   * @param df           price data; after `RiskUtils.dfWithReturnColumn` it is queried for the
   *                     columns `date`, `price` and `return`
   * @return (CAGR in percent, total return in percent, percentage of periods with return >= 0)
   */
  def calculate(sparkSession: SparkSession, df: DataFrame): (Double, Double, Double) = {
    val dfWithReturn = RiskUtils.dfWithReturnColumn(df)
    dfWithReturn.createOrReplaceTempView("BasicMetricsCalculator")

    // NOTE(review): FIRST/LAST carry no ORDER BY, so they depend on the physical row order -- confirm
    // the input is already sorted by date.
    val dfWithDateOrder = sparkSession.sql("SELECT FIRST(date) AS first_date, FIRST(price) AS first_price, " +
      "LAST(date) AS last_date, LAST(price) AS last_price " +
      "FROM BasicMetricsCalculator ")

    // Fetch the single result row once: every `first` call is a separate Spark action, and four
    // independent evaluations of an unordered FIRST/LAST query are not guaranteed to agree.
    val bounds = dfWithDateOrder.first()
    val firstDate = new DateTime(bounds.getDate(0))
    val firstPrice = bounds.getDouble(1)
    val lastDate = new DateTime(bounds.getDate(2))
    val lastPrice = bounds.getDouble(3)

    val days = Days.daysBetween(firstDate, lastDate).getDays
    // NOTE(review): `days` is the Joda day count between the two dates, yet it is divided by 252
    // (trading days per year) -- confirm this mix is intended.
    val years = days.toDouble / 252 // 252 trading days translates to 365 calendar days

    val cagr = if (years == 0) 0.0 else (math.pow(lastPrice / firstPrice, 1 / years) - 1) * 100
    val returnPct = ((lastPrice - firstPrice) / firstPrice) * 100

    val dfWithPositivePeriodPct = sparkSession.sql("SELECT " +
      "(SUM(CASE WHEN return >= 0 THEN 1 ELSE 0 END) / COUNT(1)) * 100 AS positive_period_pct " +
      "FROM BasicMetricsCalculator " +
      "WHERE return IS NOT NULL")
    val positivePeriodPct = dfWithPositivePeriodPct.first().getDouble(0)

    (cagr, returnPct, positivePeriodPct)
  }
}
开发者ID:tibkiss,项目名称:spark-risk-explorer,代码行数:32,代码来源:BasicMetricsCalculator.scala
示例7: DrawdownCalculator
//设置package包名称以及导入依赖的类
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.functions.max
/** Computes the maximum drawdown percentage of a price series. */
object DrawdownCalculator {

  /**
   * @param sparkSession session used to run the final SQL projection
   * @param df           price data with `date` and `price` columns
   * @return the largest value of rolling drawdown divided by rolling maximum price
   */
  def calculate(sparkSession: SparkSession, df: DataFrame): Double = {
    // Window covering every row from the start of the series up to the current row.
    val windowUptoCurrentRow = Window.orderBy("date").rowsBetween(Long.MinValue, 0)

    val dfWithRollingMaxPrice = df.withColumn("rolling_max_price",
      max(df("price")).over(windowUptoCurrentRow))

    val dfWithRollingDrawdowns = dfWithRollingMaxPrice.withColumn("rolling_dd",
      max(dfWithRollingMaxPrice("rolling_max_price") - dfWithRollingMaxPrice("price")).over(windowUptoCurrentRow))

    dfWithRollingDrawdowns.createOrReplaceTempView("DrawdownCalculation")

    // Sort descending: the drawdown ratio is never negative (the rolling max includes the current
    // row), so an ascending sort would put the smallest drawdown first instead of the maximum.
    val dfWithOrderedDrawndowns = sparkSession.sql("SELECT date, price, rolling_dd, rolling_max_price, " +
      "(rolling_dd / rolling_max_price) as drawdown_pct " +
      "FROM DrawdownCalculation ORDER BY drawdown_pct DESC")

    dfWithOrderedDrawndowns.show()

    // Column 4 is drawdown_pct; a single `first()` avoids re-running the query per field.
    val maxDrawdownPct = dfWithOrderedDrawndowns.first().getDouble(4)

    maxDrawdownPct
  }
}
开发者ID:tibkiss,项目名称:spark-risk-explorer,代码行数:30,代码来源:DrawdownCalculator.scala
示例8: ParquetWriter
//设置package包名称以及导入依赖的类
import org.apache.spark.sql.DataFrame
/** Abstraction over persisting a DataFrame to a path. */
trait ParquetWriter {
  /**
   * @param df   data to persist
   * @param path destination path
   */
  def write(df: DataFrame, path: String): Unit
}

/** Factory for the default ParquetWriter implementation. */
object ParquetWriter {

  /** Writer that saves the DataFrame in the "parquet" format. */
  class RealParquetWriter extends ParquetWriter {
    override def write(df: DataFrame, path: String): Unit =
      df.write.format("parquet").save(path)
  }

  /** Creates a new RealParquetWriter. */
  def apply(): ParquetWriter = new RealParquetWriter
}
开发者ID:DanteLore,项目名称:bdd-spark,代码行数:18,代码来源:ParquetWriter.scala
示例9: CSVUtils
//设置package包名称以及导入依赖的类
package edu.gatech.cse8803.ioutils
import com.databricks.spark.csv.CsvContext
import org.apache.spark.sql.{DataFrame, SQLContext}
/** Helpers for loading CSV files and registering them as temporary tables. */
object CSVUtils {

  // Last run of word characters in the path, optionally followed by ".csv", at the end of the string.
  private val pattern = "(\\w+)(\\.csv)?$".r.unanchored

  /** Loads `path` and registers it under the table name inferred from the path. */
  def loadCSVAsTable(sqlContext: SQLContext, path: String): DataFrame =
    loadCSVAsTable(sqlContext, path, inferTableNameFromPath(path))

  /**
   * Loads `path` as a DataFrame and registers it as temp table `tableName`.
   *
   * @return the loaded DataFrame
   */
  def loadCSVAsTable(sqlContext: SQLContext, path: String, tableName: String): DataFrame = {
    val table = sqlContext.csvFile(path)
    table.registerTempTable(tableName)
    table
  }

  /**
   * Derives a table name from a file path: the trailing file name without the ".csv" suffix,
   * or the path itself when the pattern does not match.
   */
  def inferTableNameFromPath(path: String) =
    // The first capture group is the file name; the second (the extension) is ignored.
    pattern.unapplySeq(path).map(groups => groups.head).getOrElse(path)
}
开发者ID:powersj,项目名称:spark4achilles,代码行数:25,代码来源:CSVUtils.scala
示例10: ALSModeling
//设置package包名称以及导入依赖的类
package com.spark.recommendation
import java.util
import com.spark.recommendation.FeatureExtraction.{Rating, parseRating}
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.recommendation.ALS
import org.apache.spark.sql.{Row, DataFrame, DataFrameWriter}
/** Trains an ALS recommender on the extracted ratings and reports its RMSE on a held-out split. */
object ALSModeling {

  /**
   * Splits the ratings 80/20, fits ALS on the training part, prints factor counts and the
   * prediction schema, and prints the root-mean-square error on the test part.
   */
  def createALSModel(): Unit = {
    val ratings = FeatureExtraction.getFeatures()

    val Array(training, test) = ratings.randomSplit(Array(0.8, 0.2))
    println(training.first())

    // Build the recommendation model using ALS on the training data
    val als = new ALS()
      .setMaxIter(5)
      .setRegParam(0.01)
      .setUserCol("userId")
      .setItemCol("movieId")
      .setRatingCol("rating")

    val model = als.fit(training)
    println(model.userFactors.count())
    println(model.itemFactors.count())

    val predictions = model.transform(test)
    // printSchema() prints by itself and returns Unit; wrapping it in println emitted an extra "()".
    predictions.printSchema()

    val evaluator = new RegressionEvaluator()
      .setMetricName("rmse")
      .setLabelCol("rating")
      .setPredictionCol("prediction")
    val rmse = evaluator.evaluate(predictions)
    println(s"Root-mean-square error = $rmse")
  }

  /** Entry point; arguments are ignored. */
  def main(args: Array[String]): Unit = {
    createALSModel()
  }
}
开发者ID:PacktPublishing,项目名称:Machine-Learning-with-Spark-Second-Edition,代码行数:48,代码来源:ALSModeling.scala
示例11: Titanic
//设置package包名称以及导入依赖的类
package fr.ippon.spark.ml
import org.apache.spark.sql.types.DoubleType
import org.apache.spark.sql.{functions, Column, DataFrame, SQLContext}
/** Helpers for loading the Titanic data set and filling in missing ages. */
object Titanic {

  /** Loads a Titanic CSV file (header row, inferred schema) into a DataFrame. */
  def dataframeFromTitanicFile(sqlc: SQLContext, file: String): DataFrame = {
    val csvReader = sqlc.read
      .format("com.databricks.spark.csv")
      .option("header", "true")
      .option("inferSchema", "true")
    csvReader.load(file)
  }

  /** Computes the average of column `inputCol`. */
  def calcMeanAge(df: DataFrame, inputCol: String): Double = {
    val averaged = df.agg(functions.avg(df(inputCol)))
    averaged.head.getDouble(0)
  }

  /**
   * Adds column `outputCol` holding the value of `inputCol` when it is a Double and
   * `replacementValue` otherwise.
   */
  def fillMissingAge(df: DataFrame, inputCol: String, outputCol: String, replacementValue: Double): DataFrame = {
    // Anything that is not a Double (including null) falls back to the replacement value.
    val ageOrDefault: (Any) => Double = {
      case age: Double => age
      case _ => replacementValue
    }
    val filled = functions.callUDF(ageOrDefault, DoubleType, df(inputCol))
    df.withColumn(outputCol, filled)
  }
}
开发者ID:ippontech,项目名称:spark-bbl-prez,代码行数:31,代码来源:Titanic.scala
示例12: DbSaver
//设置package包名称以及导入依赖的类
package org.myutils
import java.util.Properties
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils
/**
 * Saves DataFrames to a JDBC database.
 *
 * @param _url      JDBC connection URL
 * @param _username database user
 * @param _password database password
 * @param _driver   JDBC driver class name
 */
class DbSaver(_url: String, _username: String, _password: String, _driver: String) {

  val url = _url
  val username = _username
  val password = _password
  val driver = _driver

  // Builds a fresh Properties object on every call, so callers never share mutable state.
  private def getProps = {
    val connectionProps = new Properties()
    Seq("user" -> username, "password" -> password, "driver" -> driver).foreach {
      case (key, value) => connectionProps.setProperty(key, value)
    }
    connectionProps
  }

  /** Writes `df` to `table` through the DataFrame JDBC writer. */
  def createAndSave(df: DataFrame, table: String): Unit =
    df.write.jdbc(url, table, getProps)

  /** Saves the rows of `df` into `table` via `JdbcUtils.saveTable`. */
  def append(df: DataFrame, table: String): Unit =
    JdbcUtils.saveTable(df, url, table, getProps)
}
开发者ID:mykumar,项目名称:SparkScalaInternalExperiements,代码行数:31,代码来源:DbSaver.scala
示例13: GeoCityDataTest
//设置package包名称以及导入依赖的类
package org.dele.misc.bookFastDS
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
import org.apache.spark.sql.catalyst.ScalaReflection
// NOTE(review): this snippet is a fragment -- `spark`, `GeoLocEntry` and the enclosing definition
// are not visible here.

// Derive the CSV schema from the GeoLocEntry type instead of inferring it from the data.
val customSchema = ScalaReflection.schemaFor[GeoLocEntry].dataType.asInstanceOf[StructType]
val df: DataFrame = spark.read
  .option("header", "true")
  .schema(customSchema)
  .csv("/home/dele/tmp/GeoLiteCity_20170207/GeoLiteCity-Location.csv")
  //.csv("res/data/g1.csv")
val ds = df.as[GeoLocEntry]
println(ds.count())

// Assign every distinct country a dense integer index.
val countries = ds.map(_.country).distinct().collect()
val countryMap = countries.indices.map(idx => countries(idx) -> idx).toMap

// Keep the broadcast handle and read the map through it; discarding the handle (as before) meant
// the closure captured `countryMap` directly and the broadcast had no effect.
val countryMapBroadcast = spark.sparkContext.broadcast(countryMap)

val mapped = ds.map { geo =>
  val idx = countryMapBroadcast.value(geo.country)
  idx -> geo
}.collect()

// NOTE(review): `mapped` is an Array, so this prints the array's default toString rather than
// its elements -- confirm whether that is intended.
println(mapped)
spark.close()
}
开发者ID:new2scala,项目名称:text-util,代码行数:32,代码来源:GeoCityDataTest.scala
示例14: SourceConfig
//设置package包名称以及导入依赖的类
package it.agilelab.bigdata.DataQuality.sources
import it.agilelab.bigdata.DataQuality.configs.GenStructField
import org.apache.spark.sql.DataFrame
// Common contract for data-source configurations.
trait SourceConfig{
// Returns the source type as a string (HdfsFile returns "HDFS").
def getType: String //TODO enum
}
/**
 * Configuration of a file-based source whose type is reported as "HDFS".
 *
 * @param id           identifier of the source
 * @param path         location of the file
 * @param fileType     file format name
 * @param separator    optional field separator
 * @param header       header flag (presumably whether the file has a header row -- confirm)
 * @param date         date associated with the source, as a string
 * @param dependencies ids this source depends on; empty by default
 * @param schema       optional explicit schema; none by default
 */
case class HdfsFile(
    id: String,
    path: String,
    fileType: String,
    separator: Option[String],
    header: Boolean,
    date: String,
    dependencies: List[String] = Nil,
    schema: Option[List[GenStructField]] = None
) extends SourceConfig {

  override def getType: String = "HDFS"
}
// Pairs a source identifier with a DataFrame.
case class Source(
id: String,
df: DataFrame
)
开发者ID:agile-lab-dev,项目名称:DataQuality,代码行数:31,代码来源:Source.scala
示例15: SharpeCalculator
//设置package包名称以及导入依赖的类
import org.apache.spark.sql.{DataFrame, SparkSession}
/** Computes the Sharpe ratio of a return series. */
object SharpeCalculator {

  /**
   * @param sparkSession session used to run the SQL aggregation
   * @param df           price data; `RiskUtils.dfWithReturnColumn` is applied before querying `return`
   * @param riskFreeRate rate subtracted from every return before averaging; defaults to 0
   * @return average excess return divided by the standard deviation of returns
   */
  def calculate(sparkSession: SparkSession, df: DataFrame, riskFreeRate: Double = 0): Double = {
    RiskUtils.dfWithReturnColumn(df).createOrReplaceTempView("SharpeCalculation")

    // Rows without a return value are excluded from both the average and the deviation.
    val sharpeQuery = "SELECT " +
      s"AVG(return - $riskFreeRate) / STDDEV(return) AS sharpe " +
      "FROM SharpeCalculation " +
      "WHERE return IS NOT NULL"

    sparkSession.sql(sharpeQuery).first.getDouble(0)
  }
}
开发者ID:tibkiss,项目名称:spark-risk-explorer,代码行数:18,代码来源:SharpeCalculator.scala
示例16: HousePrices
//设置package包名称以及导入依赖的类
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._
/** Yearly house-price aggregations over tables registered in the shared Spark session. */
object HousePrices {
  import Spark._

  // Year is taken as the first four characters of the date string.
  private def toYear = udf((s : String) => s.substring(0, 4))

  /** Reads table `input` and adds a "year" column derived from its "date" column. */
  def getDataWithYear(input : String) : DataFrame = {
    spark.sql(s"select * from $input")
      .withColumn("year", toYear(col("date")))
  }

  /** Returns the number of rows in `tableName`. */
  def countRows(tableName: String): Long = {
    spark
      .sql(s"select count(*) from $tableName")
      .collect
      .head
      .getLong(0)
  }

  /**
   * Registers view `output` with average, minimum and maximum price per year for one postcode.
   *
   * @param input    name of the source table
   * @param postcode postcode to filter on
   * @param output   name of the view to create or replace
   */
  def doItByPostcode(input: String, postcode: String, output: String): Unit = {
    val withYear = getDataWithYear(input)
    // Still registered so existing users of the "with_year" view keep working.
    withYear.createOrReplaceTempView("with_year")

    // The postcode is compared as a value through the Column API instead of being spliced into
    // SQL text, so a quote inside it can neither break nor alter the query.
    withYear
      .where(col("postcode") === postcode)
      .groupBy("year")
      .agg(
        avg("price").as("averageHousePrice"),
        min("price").as("minHousePrice"),
        max("price").as("maxHousePrice"))
      .createOrReplaceTempView(output)
  }

  /** Registers view `output` with average, minimum and maximum price per year. */
  def doItWithMinMax(input: String, output: String): Unit = {
    getDataWithYear(input).createOrReplaceTempView("with_year")
    spark.sql(s"select year, avg(price) as averageHousePrice, min(price) as minHousePrice, max(price) as maxHousePrice from with_year group by year")
      .createOrReplaceTempView(output)
  }

  /** Registers view `output` with the average price per year. */
  def justDoIt(input: String, output: String): Unit = {
    getDataWithYear(input).createOrReplaceTempView("with_year")
    spark.sql(s"select year,avg(price) as averageHousePrice from with_year group by year")
      .createOrReplaceTempView(output)
  }
}
开发者ID:DanteLore,项目名称:bdd-spark,代码行数:43,代码来源:HousePrices.scala
示例17: DefaultSource
//设置package包名称以及导入依赖的类
package com.rishabh.spark.datasource.s3
import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, RelationProvider}
import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode}
/** Data-source entry point that reads from and writes to S3 through `S3Relation`. */
class DefaultSource extends RelationProvider with CreatableRelationProvider {

  /**
   * Creates a read relation.
   *
   * Required parameters: "accesskey", "secretkey", "type", "path" and "bucketName";
   * a missing one fails through `sys.error`.
   */
  override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]):
  BaseRelation = {
    val accessKey = parameters.getOrElse("accesskey", sys.error("accesskey is required"))
    val secretKey = parameters.getOrElse("secretkey", sys.error("secretkey is required"))
    val fileType = parameters.getOrElse("type", sys.error("filetype is required"))
    val path = parameters.getOrElse("path", sys.error("path is required"))
    val bucket = parameters.getOrElse("bucketName", sys.error("bucket is required"))
    new S3Relation(accessKey, secretKey, fileType, bucket, path, false)(sqlContext)
  }

  /**
   * Writes `data` to "s3a://" + bucket + path in the requested format and returns a relation for it.
   *
   * @param mode       save mode applied to the write
   * @param parameters same required keys as the read variant; "type" must be json, parquet or csv
   * @param data       DataFrame to persist
   */
  override def createRelation(sqlContext: SQLContext, mode: SaveMode, parameters: Map[String,
    String], data: DataFrame): BaseRelation = {
    val accesskey = parameters.getOrElse("accesskey",sys.error("accesskey is required"))
    val secretkey = parameters.getOrElse("secretkey", sys.error("secretkey is required"))
    val bucket = parameters.getOrElse("bucketName", sys.error("bucket is required"))
    val fileType = parameters.getOrElse("type", sys.error("filetype is required"))
    val path = parameters.getOrElse("path", sys.error("path is required"))

    // Reject unsupported formats before touching the Hadoop configuration.
    val supported = List("json", "parquet", "csv")
    if (!supported.contains(fileType)) {
      sys.error("fileType " + fileType + " not supported.")
    }

    val hadoopConf = sqlContext.sparkContext.hadoopConfiguration
    hadoopConf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    hadoopConf.set("fs.s3a.access.key", accesskey)
    hadoopConf.set("fs.s3a.secret.key", secretkey)

    val s3Path = "s3a://" + bucket + path
    doSave(fileType, data, s3Path, mode)

    new S3Relation(accesskey, secretkey, fileType, bucket, path, true)(sqlContext)
  }

  // Writes the frame in the given format. The caller's SaveMode is applied to the writer;
  // previously it was ignored, so every write used the writer's default mode.
  private def doSave(fileType: String, dataFrame: DataFrame, path: String, mode: SaveMode) = {
    val writer = dataFrame.write.mode(mode)
    fileType match {
      case "json" =>
        writer.json(path)
      case "parquet" =>
        writer.parquet(path)
      case "csv" =>
        writer.format("com.databricks.spark.csv").save(path)
    }
  }
}
开发者ID:rishabhbhardwaj,项目名称:spark-datasource-s3,代码行数:51,代码来源:DefaultSource.scala
示例18: TreeUtils
//设置package包名称以及导入依赖的类
package org.apache.spark.ml
import org.apache.spark.ml.attribute.{AttributeGroup, NominalAttribute, NumericAttribute}
import org.apache.spark.sql.DataFrame
/** Utilities for attaching ML attribute metadata to a features column. */
object TreeUtils {

  /**
   * Returns a DataFrame containing only the features column, re-aliased with attribute metadata.
   *
   * @param data            input DataFrame
   * @param featuresColName name of the features column to select and annotate
   * @param featureArity    per-feature arity; a value above 0 marks a nominal feature with that
   *                        many values, anything else a numeric feature
   */
  def setMetadata(
      data: DataFrame,
      featuresColName: String,
      featureArity: Array[Int]): DataFrame = {
    val attributes = featureArity.zipWithIndex.map {
      case (arity, position) if arity > 0 =>
        NominalAttribute.defaultAttr.withIndex(position).withNumValues(arity)
      case (_, position) =>
        NumericAttribute.defaultAttr.withIndex(position)
    }
    val metadata = new AttributeGroup("features", attributes).toMetadata()
    val annotated = data(featuresColName).as(featuresColName, metadata)
    data.select(annotated)
  }
}
开发者ID:summerDG,项目名称:spark-sql-perf,代码行数:23,代码来源:TreeUtils.scala
示例19: Dataframe2Json
//设置package包名称以及导入依赖的类
package onextent.utils
import org.apache.spark.sql.DataFrame
import spray.json._
import DefaultJsonProtocol._
/** Renders a DataFrame as one pretty-printed JSON array, optionally writing it to a file. */
object Dataframe2Json {

  /**
   * Writes `content` to the file at `location`; the writer is closed even if the write fails.
   */
  def printToFile(content: String, location: String): Unit = {
    val writer = new java.io.PrintWriter(location)
    try writer.write(content) finally writer.close()
  }

  /**
   * Collects the rows of `df` as JSON objects and returns them as a pretty-printed JSON array
   * followed by a newline.
   *
   * @return always a `Some`; the Option is kept for interface compatibility
   */
  def apply(df: DataFrame): Option[String] = {
    // Join the per-row JSON documents with explicit separators instead of regex-patching "}\n"
    // after the fact, which depended on that sequence never occurring inside a row.
    val json = df.toJSON.coalesce(1).collect().mkString("[", ",\n", "]")
    val pretty = json.parseJson.prettyPrint
    Some(s"$pretty\n")
  }

  /**
   * Same as `apply(df)` but also writes the rendered JSON to `location`.
   *
   * @return the rendered JSON
   */
  def apply(df: DataFrame, location: String): Option[String] = {
    val content = apply(df)
    content.foreach(text => printToFile(text, location))
    content
  }
}
开发者ID:navicore,项目名称:Dataframe2Json,代码行数:26,代码来源:Dataframe2Json.scala
示例20: ModelBuilder
//设置package包名称以及导入依赖的类
package org.wikimedia.research.recommendation.job.translation
import java.io.File
import org.apache.log4j.{LogManager, Logger}
import org.apache.spark.sql.{DataFrame, SparkSession}
import scala.collection.parallel.mutable.ParArray
/** Trains, optionally saves and evaluates one regression model per target site. */
object ModelBuilder {
  val log: Logger = LogManager.getLogger(ModelBuilder.getClass)

  /**
   * Builds a model for every site. A non-fatal failure for one site is logged and does not stop
   * the remaining sites.
   *
   * @param spark           active session
   * @param modelsOutputDir directory to write models into; nothing is written when empty
   * @param sites           target sites, processed as a parallel collection
   * @param featureData     features shared by all sites
   */
  def buildModels(spark: SparkSession,
                  modelsOutputDir: Option[File],
                  sites: ParArray[String],
                  featureData: DataFrame): Unit = {
    log.info("Building Models")
    sites.foreach { target =>
      try {
        log.info("Building model for " + target)
        log.info("Getting work data for " + target)
        val workData: DataFrame = Utils.getWorkData(spark, featureData, target)
        val Array(trainingData, testData) = workData.randomSplit(Array(0.7, 0.3))

        log.info("Training model for " + target)
        val model = Utils.REGRESSOR.fit(trainingData)

        log.info("Writing model to file for " + target)
        modelsOutputDir.foreach(o => model.write.save(new File(o, target).getAbsolutePath))

        log.info("Testing model for " + target)
        val predictions = model.transform(testData)
        val rmse = Utils.EVALUATOR.evaluate(predictions)
        log.info("Root Mean Squared Error (RMSE) on test data for " + target + " = " + rmse)
      } catch {
        // Recover only from non-fatal errors; fatal ones (OutOfMemoryError, InterruptedException,
        // ...) must propagate instead of being logged and swallowed.
        case scala.util.control.NonFatal(failure) =>
          log.error("Build model for " + target + " failed", failure)
      }
    }
  }
}
开发者ID:schana,项目名称:recommendation-translation,代码行数:41,代码来源:ModelBuilder.scala
注:本文中的org.apache.spark.sql.DataFrame类示例整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。
请发表评论