本文整理汇总了Scala中org.apache.spark.sql.types.StructField类的典型用法代码示例。如果您正苦于以下问题:Scala StructField类的具体用法?Scala StructField怎么用?Scala StructField使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了StructField类的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Scala代码示例。
示例1: SimpleApp
//设置package包名称以及导入依赖的类
import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import org.apache.spark.ml.clustering.LDA
import org.apache.spark.mllib.linalg.{VectorUDT, Vectors}
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.sql.types.{StructField, StructType}
/**
 * Small LDA demo: reads space-separated numeric vectors from a text file,
 * fits a 10-topic LDA model on them and prints the topics and the transformed data.
 */
object SimpleApp {

  /**
   * Program entry point.
   *
   * @param args command-line arguments (not used)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("Simple Application")
      .set("spark.ui.enabled", "false")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // Loads data: every non-empty line becomes one dense vector wrapped in a single-column Row.
    val rowRDD = sc.textFile("/tmp/lda_data.txt")
      .filter(_.nonEmpty)
      .map(_.split(" ").map(_.toDouble))
      .map(Vectors.dense)
      .map(Row(_))
    val schema = StructType(Array(StructField("name", new VectorUDT, nullable = false)))
    val dataset = sqlContext.createDataFrame(rowRDD, schema)
    dataset.show()

    // The single vector column is called "name", so it is passed as the features column.
    val lda = new LDA()
      .setK(10)
      .setMaxIter(10)
      .setFeaturesCol("name")
    val model = lda.fit(dataset)
    val transformed = model.transform(dataset)

    // Report the evaluation metrics; previously they were computed and then discarded.
    val ll = model.logLikelihood(dataset)
    val lp = model.logPerplexity(dataset)
    println(s"logLikelihood: $ll")
    println(s"logPerplexity: $lp")

    // describeTopics: keep the 3 highest-weighted terms per topic.
    val topics = model.describeTopics(3)

    // Shows the result without truncating cell contents.
    topics.show(false)
    transformed.show(false)
  }
}
开发者ID:mykumar,项目名称:SparkScalaInternalExperiements,代码行数:41,代码来源:SimpleApp.scala
示例2: User
//设置package包名称以及导入依赖的类
package services.users
import java.io.File
import java.nio.charset.Charset
import java.sql.Timestamp
import org.apache.commons.io.FileUtils
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.streaming.StreamingContext
import services.events.EventStream
import services.Util
/** One user record: id, the time the test was finished, nickname and gender. */
case class User(
  userId: String,
  testFinishTime: Timestamp,
  nickname: String,
  gender: String
)

/**
 * Holds the user feed and wires it to the test-finish event stream.
 * `initialize` must be called before `usersFeedDF` / `usersMap` are first touched,
 * because both lazily read the `sql` session that `initialize` assigns.
 */
object User {
  val DELIMITER = ','
  val USER_FEED = "/Users/mahesh/data/affinitas/feeds/users/"
  val USER_DATA = "/Users/mahesh/data/affinitas/tables/users/"

  // Assigned by initialize(); null until then.
  var ssc: StreamingContext = null
  var sql: SparkSession = null

  /** Header-less CSV user feed read with three string columns: userId, nickname, gender. */
  lazy val usersFeedDF = {
    val feedSchema = StructType(Array(
      StructField("userId", StringType, true),
      StructField("nickname", StringType, true),
      StructField("gender", StringType, true)
    ))
    sql.read
      .format("com.databricks.spark.csv")
      .option("header", false)
      .schema(feedSchema)
      .load(USER_FEED)
  }

  //EventStream.testFinishStream.print()

  /** The feed keyed by userId, with (nickname, gender) as the value. */
  lazy val usersMap = usersFeedDF.rdd.map { row =>
    row.getString(0) -> (row.getString(1), row.getString(2))
  }

  /**
   * Stores the contexts, creates the feed and table directories if needed, and
   * registers a handler that joins every batch of test-finish events with the
   * user feed and writes the joined records through `Util.writeCsvToDir`.
   *
   * @param sscO streaming context to keep
   * @param sqlO Spark session used to read the user feed
   */
  def initialize(sscO: StreamingContext, sqlO: SparkSession) = {
    ssc = sscO
    sql = sqlO
    new File(USER_FEED).mkdirs()
    new File(USER_DATA).mkdirs()

    EventStream.testFinishStream.foreachRDD { rdd =>
      val finishTimes = rdd.map(event => (event.userId, event.timestamp))
      val userData = finishTimes
        .join(usersMap)
        .map { case (userId, (finishTime, (nickname, gender))) =>
          Array(userId, finishTime, nickname, gender)
        }
        .collect()
      Util.writeCsvToDir(userData, DELIMITER.toString, USER_DATA)
    }
  }
}
开发者ID:f13mash,项目名称:spark_log_contact,代码行数:58,代码来源:User.scala
示例3: ColumnsTest
//设置package包名称以及导入依赖的类
package com.drakeconsulting.big_data_maker
import org.scalatest.FunSuite
import com.holdenkarau.spark.testing.SharedSparkContext
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.types.{StructField, StringType, LongType, DoubleType}
/**
 * Checks the value generators (StringConstant, RandomLong, RandomDouble, Categorical):
 * each must produce values in its expected range and report the expected StructField.
 */
class ColumnsTest extends FunSuite with SharedSparkContext {
  // How many values are sampled from each random generator.
  val numLoops = 100

  test("test StringConstant") {
    val column = new StringConstant("f1", "abc")
    assert("abc" === column.getValue(1))
    assert(StructField("f1", StringType, false) == column.getStructField)
  }

  test("test RandomLong") {
    val column = new RandomLong("f1", 666666L)
    (1 to numLoops).foreach { _ =>
      assert(column.getValue(1) >= 0)
      assert(column.getValue(1) <= 666666L)
    }
    assert(StructField("f1", LongType, false) == column.getStructField)
  }

  test("test RandomDouble") {
    val column = new RandomDouble("f1", 666666.00)
    (1 to numLoops).foreach { _ =>
      assert(column.getValue(1) >= 0)
      assert(column.getValue(1) <= 666666.00)
    }
    assert(StructField("f1", DoubleType, false) == column.getStructField)
  }

  test("test Categorical") {
    val categories = List("a", "b", "c", "d")
    val column = new Categorical("f1", categories)
    (1 to numLoops).foreach { _ =>
      val value = column.getValue(1)
      // The generated value must contain at least one of the configured categories.
      assert(categories.exists(key => value.contains(key)))
    }
    assert(StructField("f1", StringType, false) == column.getStructField)
  }
}
开发者ID:dondrake,项目名称:BigDataMaker,代码行数:45,代码来源:TestColumns.scala
示例4: StudyRelation
//设置package包名称以及导入依赖的类
package com.study.spark.datasource
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.sources.{BaseRelation, TableScan}
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.sql.{Row, SQLContext}
/**
 * Example data-source relation with a fixed schema of three string columns,
 * scanned through `StudyRDD`.
 *
 * @param parameters data-source options (not read by this class)
 * @param sqlContext SQL context handed to the scan RDD
 */
class StudyRelation(parameters: Map[String, String])(@transient val sqlContext: SQLContext)
  extends BaseRelation with TableScan {

  /**
   * The relation's schema: `field1`, `field2`, `field3`, all nullable strings.
   * The third column used to repeat the name "field2", which produced two
   * columns with the same name.
   */
  override def schema: StructType =
    StructType(Array(
      StructField("field1", StringType),
      StructField("field2", StringType),
      StructField("field3", StringType)))

  /** Builds the full-table scan: a `StudyRDD` created from the context and the schema. */
  override def buildScan(): RDD[Row] = new StudyRDD(sqlContext, schema)
}
开发者ID:hackpupu,项目名称:LML,代码行数:24,代码来源:StudyRelation.scala
示例5: ScorePredictor
//设置package包名称以及导入依赖的类
package org.wikimedia.research.recommendation.job.translation
import java.io.File
import org.apache.log4j.{LogManager, Logger}
import org.apache.spark.ml.regression.RandomForestRegressionModel
import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row, SaveMode, SparkSession}
import scala.collection.parallel.mutable.ParArray
/** Scores work items per target site with saved random-forest models and writes the joined result. */
object ScorePredictor {

  import scala.util.control.NonFatal

  val log: Logger = LogManager.getLogger(ScorePredictor.getClass)

  /**
   * Scores `featureData` once per site and optionally saves all predictions as a single CSV.
   *
   * For every site the matching model is loaded from `modelsInputDir/<site>` and applied to
   * the site's work data. A site whose scoring fails with a non-fatal error is logged and
   * contributes an empty (id, site) frame, so the other sites are still joined and saved.
   * Fatal errors (e.g. OutOfMemoryError) are no longer swallowed.
   *
   * @param spark                active Spark session
   * @param modelsInputDir       directory containing one saved model per site
   * @param predictionsOutputDir where to write "allPredictions"; nothing is written when None
   * @param sites                target sites, processed in parallel
   * @param featureData          features to score
   */
  def predictScores(spark: SparkSession,
                    modelsInputDir: File,
                    predictionsOutputDir: Option[File],
                    sites: ParArray[String],
                    featureData: DataFrame): Unit = {
    log.info("Scoring items")
    val predictions: Array[DataFrame] = sites.map { target =>
      try {
        log.info("Scoring for " + target)
        log.info("Getting work data for " + target)
        val workData: DataFrame = Utils.getWorkData(spark, featureData, target, exists = false)
        log.info("Loading model for " + target)
        val model = RandomForestRegressionModel.load(
          new File(modelsInputDir, target).getAbsolutePath)
        log.info("Scoring data for " + target)
        // The prediction column is named after the site so the frames can be joined on "id".
        model
          .setPredictionCol(target)
          .transform(workData)
          .select("id", target)
      } catch {
        case NonFatal(e) =>
          log.error("Score for " + target + " failed", e)
          emptyPredictions(spark, target)
      }
    }.toArray

    // reduceOption instead of reduce: an empty `sites` must not throw.
    predictions.reduceOption((left, right) => left.join(right, Seq("id"), "outer")) match {
      case None =>
        log.warn("No sites were given; there are no predictions to save")
      case Some(predictedScores) =>
        log.info("Saving predictions")
        predictionsOutputDir.foreach { dir =>
          predictedScores.coalesce(1)
            .write
            .mode(SaveMode.ErrorIfExists)
            .option("header", value = true)
            .option("compression", "bzip2")
            .csv(new File(dir, "allPredictions").getAbsolutePath)
        }
    }
  }

  /** Empty frame with the (id, target) columns a successful scoring run would have produced. */
  private def emptyPredictions(spark: SparkSession, target: String): DataFrame = {
    val schema = StructType(Seq(
      StructField("id", StringType, nullable = false),
      StructField(target, DoubleType, nullable = true)))
    spark.createDataFrame(spark.sparkContext.emptyRDD[Row], schema)
  }
}
开发者ID:schana,项目名称:recommendation-translation,代码行数:59,代码来源:ScorePredictor.scala
示例6: Dictionary
//设置package包名称以及导入依赖的类
package com.github.uosdmlab.nkp
import org.apache.spark.sql.{SparkSession, Row}
import org.apache.spark.sql.types.{StringType, StructField, StructType}
/**
 * Fluent facade over the Eunjeon analyzer's user dictionary.
 * All public methods return `Dictionary` itself so calls can be chained.
 */
object Dictionary {

  import org.bitbucket.eunjeon.seunjeon.{Analyzer => EunjeonAnalyzer}

  // Every user word registered so far; the whole list is handed to the analyzer on each sync.
  private var words = Seq.empty[String]

  /** Evaluates `fn` for its side effects and returns this object. */
  private def chain(fn: => Any): this.type = {
    fn
    this
  }

  /** Pushes the current word list into the analyzer. */
  private def syncDictionary(): Unit = EunjeonAnalyzer.setUserDict(words.iterator)

  /** Adds one or more words. */
  def addWords(word: String, words: String*): this.type = addWords(word +: words)

  /** Appends `words` to the registered words and syncs the analyzer. */
  def addWords(words: Traversable[String]): this.type = chain {
    this.words ++= words
    syncDictionary()
  }

  /** Clears the analyzer's user dictionary and forgets all registered words. */
  def reset(): this.type = chain {
    EunjeonAnalyzer.resetUserDict()
    words = Seq.empty[String]
  }

  /** Adds the words found in one or more CSV files. */
  def addWordsFromCSV(path: String, paths: String*): this.type =
    addWordsFromCSV(path +: paths)

  /**
   * Reads header-less, comma-separated (word, cost) files and adds every row as a
   * dictionary entry: "word,cost" when a cost is present, just "word" otherwise.
   */
  def addWordsFromCSV(paths: Traversable[String]): this.type = chain {
    val spark = SparkSession.builder().getOrCreate()

    import spark.implicits._

    val wordCostSchema = StructType(Array(
      StructField("word", StringType, nullable = false),
      StructField("cost", StringType, nullable = true)))

    val rows = spark.read
      .option("header", value = false)
      .option("inferSchema", value = false)
      .option("sep", ",")
      .schema(wordCostSchema)
      .csv(paths.toSeq: _*)

    val entries = rows.map {
      case Row(word: String, cost: String) => s"$word,$cost"
      case Row(word: String, null) => word
    }.collect()

    addWords(entries)
  }
}
开发者ID:uosdmlab,项目名称:spark-nkp,代码行数:62,代码来源:Dictionary.scala
示例7: EmptyRDFGraphDataFrame
//设置package包名称以及导入依赖的类
package net.sansa_stack.ml.spark.mining.amieSpark
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row, SQLContext}
/**
 * Factory for an empty triples DataFrame.
 *
 * @author Lorenz Buehmann
 */
object EmptyRDFGraphDataFrame {

  /**
   * Creates an empty DataFrame with the nullable string columns
   * `subject`, `predicate` and `object`, and registers it as the temp view "TRIPLES".
   *
   * @param sqlContext context used to create the DataFrame
   * @return the empty triples DataFrame
   */
  def get(sqlContext: SQLContext): DataFrame = {
    val columnNames = "subject predicate object".split(" ")
    val fields = columnNames.map(name => StructField(name, StringType, true))

    // No rows at all: only the schema carries information.
    val noRows = sqlContext.sparkContext.emptyRDD[Row]
    val emptyTriples = sqlContext.createDataFrame(noRows, StructType(fields))

    emptyTriples.createOrReplaceTempView("TRIPLES")
    emptyTriples
  }
}
开发者ID:SANSA-Stack,项目名称:SANSA-ML,代码行数:30,代码来源:EmptyRDFGraphDataFrame.scala
示例8: Document
//设置package包名称以及导入依赖的类
package sparknlp
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{MapType, StringType, StructField, StructType}
/**
 * A text document with free-form string metadata.
 *
 * @param id       document identifier
 * @param text     document content
 * @param metadata key/value metadata, empty by default
 */
case class Document(
  id: String,
  text: String,
  metadata: scala.collection.Map[String, String] = Map()
)

object Document {

  /** Builds a Document from a row laid out as (id, text, metadata), read by position. */
  def apply(row: Row): Document =
    new Document(
      id = row.getString(0),
      text = row.getString(1),
      metadata = row.getMap[String, String](2))

  /** Spark SQL struct type matching the layout read by `apply(row)`; every field is nullable. */
  val DocumentDataType: StructType = {
    val stringToString = MapType(StringType, StringType, valueContainsNull = true)
    StructType(Array(
      StructField("id", StringType, nullable = true),
      StructField("text", StringType, nullable = true),
      StructField("metadata", stringToString, nullable = true)
    ))
  }
}
开发者ID:alexander-n-thomas,项目名称:sparknlp,代码行数:23,代码来源:Document.scala
示例9: UserData
//设置package包名称以及导入依赖的类
package org.sparksamples.df
//import org.apache.spark.sql.SQLContext
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType};
/**
 * Reads the pipe-delimited user file from a hard-coded path and prints a few
 * summary counts plus the number of users per occupation.
 */
package object UserData {

  /**
   * Program entry point.
   *
   * @param args command-line arguments (not used)
   */
  def main(args: Array[String]): Unit = {
    val customSchema = StructType(Array(
      StructField("no", IntegerType, nullable = true),
      StructField("age", StringType, nullable = true),
      StructField("gender", StringType, nullable = true),
      StructField("occupation", StringType, nullable = true),
      StructField("zipCode", StringType, nullable = true)))

    val spConfig = (new SparkConf).setMaster("local").setAppName("SparkApp")
    val spark = SparkSession
      .builder()
      .appName("SparkUserData").config(spConfig)
      .getOrCreate()

    // Stop the session even when reading or counting fails.
    try {
      val user_df = spark.read.format("com.databricks.spark.csv")
        .option("delimiter", "|").schema(customSchema)
        .load("/home/ubuntu/work/ml-resources/spark-ml/data/ml-100k/u.user")

      val first = user_df.first()
      println(s"First Record : $first")

      // Number of distinct values = number of groups.
      val num_genders = user_df.groupBy("gender").count().count()
      val num_occupations = user_df.groupBy("occupation").count().count()
      val num_zipcodes = user_df.groupBy("zipCode").count().count()

      println(s"num_users : ${user_df.count()}")
      println(s"num_genders : $num_genders")
      println(s"num_occupations : $num_occupations")
      println(s"num_zipcodes: $num_zipcodes")

      println("Distribution by Occupation")
      // show() prints the table itself and returns Unit; wrapping it in println
      // used to print a stray "()" line after the table.
      user_df.groupBy("occupation").count().show()
    } finally {
      spark.stop()
    }
  }
}
开发者ID:PacktPublishing,项目名称:Machine-Learning-with-Spark-Second-Edition,代码行数:40,代码来源:UserData.scala
示例10: CountByRatingChart
//设置package包名称以及导入依赖的类
package org.sparksamples
import java.awt.Font
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}
import org.jfree.chart.axis.CategoryLabelPositions
import scalax.chart.module.ChartFactories
// NOTE(review): this excerpt has no enclosing `object`/`def main` header, although the
// two closing braces that follow it suggest one existed — confirm against the original file.

// Schema of the tab-delimited ratings file: four nullable integer columns.
val customSchema = StructType(Array(
StructField("user_id", IntegerType, true),
StructField("movie_id", IntegerType, true),
StructField("rating", IntegerType, true),
StructField("timestamp", IntegerType, true)))
val spConfig = (new SparkConf).setMaster("local").setAppName("SparkApp")
val spark = SparkSession
.builder()
.appName("SparkRatingData").config(spConfig)
.getOrCreate()
val rating_df = spark.read.format("com.databricks.spark.csv")
.option("delimiter", "\t").schema(customSchema)
.load("../../data/ml-100k/u.data")
// Number of rows per rating value, ordered by the rating value itself.
val rating_df_count = rating_df.groupBy("rating").count().sort("rating")
//val rating_df_count_sorted = rating_df_count.sort("count")
rating_df_count.show()
// Bring the (rating, count) rows to the driver for charting.
val rating_df_count_collection = rating_df_count.collect()
val ds = new org.jfree.data.category.DefaultCategoryDataset
// NOTE(review): `mx` is never read in the visible code.
val mx = scala.collection.immutable.ListMap()
// One bar per rating value: column 0 of each row is the rating, column 1 its count.
for( x <- 0 until rating_df_count_collection.length) {
val occ = rating_df_count_collection(x)(0)
val count = Integer.parseInt(rating_df_count_collection(x)(1).toString)
// NOTE(review): the series key is "UserAges" although the values are rating counts — confirm.
ds.addValue(count,"UserAges", occ.toString)
}
//val sorted = ListMap(ratings_count.toSeq.sortBy(_._1):_*)
//val ds = new org.jfree.data.category.DefaultCategoryDataset
//sorted.foreach{ case (k,v) => ds.addValue(v,"Rating Values", k)}
val chart = ChartFactories.BarChart(ds)
val font = new Font("Dialog", Font.PLAIN,5);
// Rotate the category labels on the domain axis (UP_90).
chart.peer.getCategoryPlot.getDomainAxis().
setCategoryLabelPositions(CategoryLabelPositions.UP_90);
chart.peer.getCategoryPlot.getDomainAxis.setLabelFont(font)
chart.show()
// NOTE(review): stops `Util.sc`, not the `spark` session created above — verify `Util.sc` exists.
Util.sc.stop()
}
}
开发者ID:PacktPublishing,项目名称:Machine-Learning-with-Spark-Second-Edition,代码行数:59,代码来源:CountByRatingChart.scala
示例11: UserRatingsChart
//设置package包名称以及导入依赖的类
package org.sparksamples
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}
import scalax.chart.module.ChartFactories
/** Plots, as a bar chart, how many ratings each user has given. */
object UserRatingsChart {

  /**
   * Program entry point: loads the tab-delimited ratings file, counts ratings per
   * user, prints the full table and shows the counts as a bar chart.
   *
   * @param args command-line arguments (not used)
   */
  def main(args: Array[String]): Unit = {
    val customSchema = StructType(Array(
      StructField("user_id", IntegerType, nullable = true),
      StructField("movie_id", IntegerType, nullable = true),
      StructField("rating", IntegerType, nullable = true),
      StructField("timestamp", IntegerType, nullable = true)))

    val spConfig = (new SparkConf).setMaster("local").setAppName("SparkApp")
    val spark = SparkSession
      .builder()
      .appName("SparkRatingData").config(spConfig)
      .getOrCreate()

    val rating_df = spark.read.format("com.databricks.spark.csv")
      .option("delimiter", "\t").schema(customSchema)
      .load("../../data/ml-100k/u.data")

    val rating_nos_by_user = rating_df.groupBy("user_id").count().sort("count")

    // Collect once and reuse the result both for the row count passed to show()
    // and for filling the chart; the job used to be executed twice.
    val rating_nos_by_user_collect = rating_nos_by_user.collect()
    rating_nos_by_user.show(rating_nos_by_user_collect.length)

    // The unused histogram-bin locals (mx, min, max, bins, step) were removed:
    // they were built but never read.
    val ds = new org.jfree.data.category.DefaultCategoryDataset
    rating_nos_by_user_collect.foreach { row =>
      val user_id = Integer.parseInt(row(0).toString)
      val count = Integer.parseInt(row(1).toString)
      ds.addValue(count, "Ratings", user_id)
    }

    val chart = ChartFactories.BarChart(ds)
    // One bar per user: hide the domain axis.
    chart.peer.getCategoryPlot.getDomainAxis().setVisible(false)
    chart.show()

    // NOTE(review): stops `Util.sc`, not the `spark` session created above — verify `Util.sc` exists.
    Util.sc.stop()
  }
}
开发者ID:PacktPublishing,项目名称:Machine-Learning-with-Spark-Second-Edition,代码行数:58,代码来源:UserRatingsChart.scala
示例12: Util
//设置package包名称以及导入依赖的类
package org.sparksamples
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.sql._
import org.apache.spark.sql.types.{StringType, StructField, StructType}
/** Shared paths and helpers for the samples. */
object Util {
  val PATH = "/home/ubuntu/work/spark-2.0.0-bin-hadoop2.7/"
  val DATA_PATH= "../../../data/ml-100k"
  val PATH_MOVIES = DATA_PATH + "/u.item"

  /**
   * Collapses a vector into two numbers: the sum of its first half and the sum of
   * its second half, formatted as "firstSum,secondSum".
   *
   * The split point is `size / 2`; for an odd size the middle element belongs to the
   * second half. The second loop used to start at `size / 2 + 1`, so the element at
   * index `size / 2` was left out of both sums.
   *
   * @param x vector to reduce
   * @return the two sums separated by a comma
   */
  def reduceDimension2(x: Vector): String = {
    val values = x.toArray
    val (firstHalf, secondHalf) = values.splitAt(values.length / 2)
    val firstHalfSum = firstHalf.sum
    val secondHalfSum = secondHalf.sum
    s"$firstHalfSum,$secondHalfSum"
  }

  /**
   * Loads the pipe-delimited movie file at `PATH_MOVIES` with four string columns:
   * id, name, date, url.
   *
   * @param spark active Spark session
   * @return the movie DataFrame
   */
  def getMovieDataDF(spark: SparkSession): DataFrame = {
    // Sample row:
    //1|Toy Story (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)
    // |0|0|0|1|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0
    // NOTE(review): in the sample row the 4th field is empty and the URL is the 5th,
    // so the 4th schema column "url" may not line up with the URL — confirm.
    val customSchema = StructType(Array(
      StructField("id", StringType, nullable = true),
      StructField("name", StringType, nullable = true),
      StructField("date", StringType, nullable = true),
      StructField("url", StringType, nullable = true)))

    spark.read.format("com.databricks.spark.csv")
      .option("delimiter", "|").schema(customSchema)
      .load(PATH_MOVIES)
  }
}
开发者ID:PacktPublishing,项目名称:Machine-Learning-with-Spark-Second-Edition,代码行数:46,代码来源:Util.scala
示例13: NTriplesRelation
//设置package包名称以及导入依赖的类
package net.sansa_stack.inference.spark.data.loader.sql
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.sql.sources.{BaseRelation, TableScan}
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import net.sansa_stack.inference.utils.NTriplesStringToRDFTriple
/**
 * Relation over an N-Triples text file, exposed as rows of (s, p, o).
 *
 * @param location   path handed to `SparkContext.textFile`
 * @param userSchema schema to report; may be null, in which case a default is used
 * @param sqlContext SQL context providing the Spark context
 */
class NTriplesRelation(location: String, userSchema: StructType)
                      (@transient val sqlContext: SQLContext)
  extends BaseRelation
    with TableScan
    with Serializable {

  /** The user-supplied schema, or three nullable string columns `s`, `p`, `o` when none was given. */
  override def schema: StructType =
    Option(this.userSchema).getOrElse {
      StructType(
        Seq("s", "p", "o").map(name => StructField(name, StringType, true)))
    }

  /** Reads the file line by line, converts each line to triples and emits one row per triple. */
  override def buildScan(): RDD[Row] = {
    val lines = sqlContext.sparkContext.textFile(location)
    val converter = new NTriplesStringToRDFTriple()

    lines
      .flatMap(line => converter.apply(line))
      .map(triple => Row(triple.s, triple.p, triple.o))
  }
}
开发者ID:SANSA-Stack,项目名称:SANSA-Inference,代码行数:40,代码来源:NTriplesRelation.scala
示例14: TestData
//设置package包名称以及导入依赖的类
package be.dataminded.wharlord.test
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
/** Helpers that build small DataFrames for tests. */
object TestData {

  /** One non-nullable integer column named "column", one row per number. */
  def makeIntegerDf(spark: SparkSession, numbers: Seq[Int]): DataFrame = {
    val schema = StructType(List(StructField("column", IntegerType, nullable = false)))
    val rows = spark.sparkContext.makeRDD(numbers.map(n => Row(n)))
    spark.createDataFrame(rows, schema)
  }

  /** One nullable string column named "column", one row per string. */
  def makeNullableStringDf(spark: SparkSession, strings: Seq[String]): DataFrame = {
    val schema = StructType(List(StructField("column", StringType, nullable = true)))
    val rows = spark.sparkContext.makeRDD(strings.map(s => Row(s)))
    spark.createDataFrame(rows, schema)
  }

  /**
   * Several non-nullable integer columns named "column1", "column2", ...
   * The number of columns is taken from the first row.
   */
  def makeIntegersDf(spark: SparkSession, row1: Seq[Int], rowN: Seq[Int]*): DataFrame = {
    val allRows = (row1 :: rowN.toList).map(values => Row(values: _*))
    val fields = (1 to row1.size).map { idx =>
      StructField(s"column$idx", IntegerType, nullable = false)
    }
    spark.createDataFrame(spark.sparkContext.makeRDD(allRows), StructType(fields))
  }
}
开发者ID:datamindedbe,项目名称:wharlord,代码行数:26,代码来源:TestData.scala
示例15: SparkTermCandidatesWeighter
//设置package包名称以及导入依赖的类
package ru.ispras.atr.rank
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.functions.desc
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}
import ru.ispras.atr.datamodel.{DSDataset, TermCandidate}
import ru.ispras.atr.features.FeatureConfig
/**
 * Base class for weighters that rank term candidates through a Spark DataFrame:
 * the features are computed, put into a DataFrame, weighted by `weight`, and the
 * terms are returned sorted by the weight column.
 * `id` and `log` are not defined here — presumably inherited from TermCandidatesWeighter.
 *
 * @param docsToShow passed to `TermCandidate.verboseRepr` when naming terms
 */
abstract class SparkTermCandidatesWeighter(docsToShow:Int) extends TermCandidatesWeighter {
  val termDFName = "Term"

  /** The features used to describe every candidate. */
  def allFeatures: Seq[FeatureConfig]

  /**
   * Computes all features for all candidates.
   *
   * @return one inner sequence per candidate, holding its value for each feature
   */
  def convert2FeatureSpace(candidates: Seq[TermCandidate], dataset: DSDataset):Seq[Seq[Double]] = {
    // Features are the outer loop: this allows timing each feature and may benefit from caching.
    val valuesPerFeature: Seq[Seq[Double]] = allFeatures.map { feature =>
      log.info(s"Initializing feature ${feature.id}...")
      val computer = feature.build(candidates, dataset)
      log.info(s"Computing feature ${feature.id}...")
      computer.compute(candidates)
    }
    log.info(s"${allFeatures.size} features have been computed")
    // Flip from per-feature to per-candidate layout.
    valuesPerFeature.transpose
  }

  /**
   * Builds a DataFrame with a string term column followed by one double column per feature.
   *
   * @param termNames    term label for every row
   * @param featureNames names of the feature columns
   * @param resByTerms   feature values, one inner sequence per term
   */
  def convertToDF(termNames: Seq[String], featureNames: Seq[String], resByTerms: Seq[Seq[Double]]): DataFrame = {
    val termField = StructField(termDFName, StringType)
    val featureFields = featureNames.map(name => StructField(name, DoubleType))
    val schema = StructType(termField +: featureFields)

    val rows = termNames.zip(resByTerms).map { case (term, values) =>
      Row.fromSeq(term +: values)
    }
    val rowsRDD: RDD[Row] = SparkConfigs.sc.parallelize(rows)
    SparkConfigs.sqlc.createDataFrame(rowsRDD, schema)
  }

  /**
   * Weights the candidates and returns (term, weight) pairs ordered by descending weight.
   */
  def weightAndSort(candidates: Seq[TermCandidate], dataset: DSDataset): Iterable[(String, Double)] = {
    val featureValues = convert2FeatureSpace(candidates, dataset)
    val termNames = candidates.map(_.verboseRepr(docsToShow))
    val initDF = convertToDF(termNames, allFeatures.map(_.id), featureValues)

    val ranked = weight(initDF).select(termDFName, id).sort(desc(id))

    // Column names are copied into locals for serialization — presumably so the
    // closure below does not capture this instance.
    val weightColumn: String = id
    val termColumn: String = termDFName
    ranked.rdd
      .map(row => (row.getAs[String](termColumn), row.getAs[Double](weightColumn)))
      .collect()
  }

  /** Adds the weight column (named `id`) to the feature DataFrame. */
  def weight(df: DataFrame) : DataFrame
}
/**
 * Process-wide Spark handles: a local 16-thread context and a Hive context on top of it.
 * The values are created when this object is first accessed.
 */
object SparkConfigs {
  val sparkConf: SparkConf = {
    val base = new SparkConf()
    base
      .setMaster("local[16]")
      .setAppName("ATR Evaluation System")
      .set("spark.driver.memory", "1g")
  }

  val sc: SparkContext = new SparkContext(sparkConf)

  val sqlc: HiveContext = new HiveContext(sc)
}
开发者ID:ispras,项目名称:atr4s,代码行数:61,代码来源:SparkTermCandidatesWeighter.scala
示例16: ConversionsSuite
//设置package包名称以及导入依赖的类
package org.apache.spark.orientdb.documents
import com.orientechnologies.orient.core.metadata.schema.OType
import com.orientechnologies.orient.core.record.impl.ODocument
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.scalatest.FunSuite
/** Tests for the conversions between Spark SQL rows/types and OrientDB documents/types. */
class ConversionsSuite extends FunSuite {

  test("Spark datatype to OrientDB datatype test") {
    val orientDBType = Conversions.sparkDTtoOrientDBDT(StringType)
    assert(orientDBType === OType.STRING)
  }

  test("Convert Spark Row to Orient DB ODocument") {
    val expectedData = new ODocument()
    expectedData.field("key", 1, OType.INTEGER)
    expectedData.field("value", "Spark datasource for Orient DB", OType.STRING)

    val conf = new SparkConf().setAppName("ConversionsSuite").setMaster("local[*]")
    val sc = new SparkContext(conf)
    // Stop the context in `finally`: a failing assertion used to skip sc.stop()
    // and leave the context running.
    try {
      val sqlContext = new SQLContext(sc)
      val rows = sqlContext.createDataFrame(
        sc.parallelize(Seq(Row(1, "Spark datasource for Orient DB"))),
        StructType(Array(
          StructField("key", IntegerType, true),
          StructField("value", StringType, true)))).collect()

      val actualData = Conversions.convertRowsToODocuments(rows(0))
      assert(expectedData.field[Int]("key") == actualData.field[Int]("key"))
      assert(expectedData.field[String]("value") == actualData.field[String]("value"))
    } finally {
      sc.stop()
    }
  }

  test("Convert OrientDB ODocument to Spark Row") {
    val oDocument = new ODocument()
    oDocument.field("key", 1, OType.INTEGER)
    oDocument.field("value", "Orient DB ODocument to Spark Row", OType.STRING)

    val schema = StructType(Array(
      StructField("key", IntegerType),
      StructField("value", StringType)))

    val expectedData = Row(1, "Orient DB ODocument to Spark Row")
    val actualData = Conversions.convertODocumentsToRows(oDocument, schema)
    assert(expectedData === actualData)
  }

  test("Return field of correct type") {
    val field = Conversions.orientDBDTtoSparkDT(IntegerType, "1")
    assert(field.isInstanceOf[Int])
  }
}
开发者ID:orientechnologies,项目名称:spark-orientdb,代码行数:55,代码来源:ConversionsSuite.scala
示例17: SchemaToMongo
//设置package包名称以及导入依赖的类
package nsmc.conversion.types
import com.mongodb.casbah.Imports._
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{StructType, IntegerType, StringType, StructField}
/** Converts Spark SQL rows into MongoDB objects, driven by the row's schema. */
object SchemaToMongo {

  /**
   * Builds a MongoDB object with one entry per schema field, pairing fields and
   * row values by position.
   *
   * @param schema fields describing the row
   * @param r      row to convert
   * @return the MongoDB object
   */
  def getMongoRecord(schema: Seq[StructField], r: Row) : DBObject = {
    val converted = schema.zip(r.toSeq).map(toMongo)
    MongoDBObject(converted:_*)
  }

  /**
   * Converts one (field, value) pair into a (name, value) entry.
   * Strings and integers are passed through unchanged; a struct is converted
   * recursively. Any other data type still fails with a MatchError.
   */
  private def toMongo(p:(StructField, Any)) : (String, Any) = {
    val (sf, a) = p
    sf.dataType match {
      // TODO: leaving out some of the atomic types
      case StringType => (sf.name, a)
      case IntegerType => (sf.name, a)
      case StructType(s) =>
        // A null nested struct is stored as null, like null strings and integers are;
        // recursing into it used to throw a NullPointerException.
        val nested = if (a == null) null else getMongoRecord(s, a.asInstanceOf[Row])
        (sf.name, nested)
    }
  }
}
开发者ID:baank,项目名称:spark-mongodb-connector,代码行数:27,代码来源:SchemaToMongo.scala
示例18: Locus
//设置package包名称以及导入依赖的类
package org.broadinstitute.hail.variant
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.broadinstitute.hail.check.Gen
import org.json4s._
object Locus {
  /** Contig names "1" to "22" followed by "X", "Y" and "MT". */
  val simpleContigs: Seq[String] = {
    val numbered = (1 to 22).map(_.toString)
    numbered ++ Seq("X", "Y", "MT")
  }

  /** Spark SQL representation of a locus: non-nullable contig and position. */
  val schema: StructType = {
    val contigField = StructField("contig", StringType, nullable = false)
    val positionField = StructField("position", IntegerType, nullable = false)
    StructType(Array(contigField, positionField))
  }

  /** Generator of loci whose contig is drawn from `contigs` and whose position comes from `Gen.posInt`. */
  def gen(contigs: Seq[String]): Gen[Locus] = {
    val contigAndPosition = Gen.zip(Gen.oneOfSeq(contigs), Gen.posInt)
    contigAndPosition.map(pair => Locus(pair._1, pair._2))
  }

  /** Generator of loci over `simpleContigs`. */
  def gen: Gen[Locus] = gen(simpleContigs)
}

/**
 * A genomic location: a contig name and a position on it.
 * Ordered by contig (using `Contig.compare`) and then by position.
 */
case class Locus(contig: String, position: Int) extends Ordered[Locus] {

  def compare(that: Locus): Int = {
    val byContig = Contig.compare(contig, that.contig)
    // Position only breaks ties between loci on the same contig.
    if (byContig != 0) byContig
    else position.compare(that.position)
  }

  /** JSON object with the fields "contig" and "position". */
  def toJSON: JValue = JObject(
    "contig" -> JString(contig),
    "position" -> JInt(position))

  /** Formats the locus as "contig:position". */
  override def toString: String = s"$contig:$position"
}
开发者ID:Sun-shan,项目名称:Hail_V2,代码行数:37,代码来源:Locus.scala
示例19: EmailModel
//设置package包名称以及导入依赖的类
package com.hugolinton.model
import java.util.UUID
import com.pff.PSTMessage
import org.apache.spark.sql.types.{ArrayType, StringType, StructField, StructType}
/**
 * One e-mail extracted from a PST archive.
 *
 * @param id        generated unique id
 * @param emailBody message body ("" when missing)
 * @param sentTo    cleaned, lower-cased "to" recipients
 * @param ccTo      cleaned, lower-cased "cc" recipients
 */
case class EmailModel (id : String, emailBody : String, sentTo : Array[String], ccTo : Array[String]) extends Product

object EmailModel {

  /** Spark SQL schema matching the case-class fields. */
  final val schema = StructType(Array(
    StructField("id", StringType, false),
    StructField("emailBody", StringType, false),
    StructField("sentTo", ArrayType(StringType)),
    StructField("ccTo", ArrayType(StringType))
  ))

  // Punctuation and whitespace stripped from each recipient entry.
  private val RecipientNoise = """[.,\/#!$%\^&\*;:{}=\-_`~()\s]"""

  /**
   * Converts a PST message into an EmailModel with a freshly generated random id.
   *
   * @param pst message to convert
   */
  def pstToModel(pst : PSTMessage) = {
    val uniqueID = UUID.randomUUID().toString
    val to = splitRecipients(pst.getDisplayTo)
    val cc = splitRecipients(pst.getDisplayCC)
    EmailModel(uniqueID, safeGet(pst.getBody), to, cc)
  }

  /**
   * Splits a ';'-separated display list into cleaned, lower-cased entries.
   * The split happens BEFORE the cleaning: the cleaning pattern also removes ';',
   * so cleaning first (as was done before) left nothing to split on and the whole
   * list came back as a single entry.
   */
  private def splitRecipients(displayList : String) : Array[String] =
    safeGet(displayList)
      .split(";")
      .map(_.replaceAll(RecipientNoise, "").toLowerCase)

  /**
   * Returns `field`, or "" when it is null or empty.
   * The null check comes first; the previous order called `isEmpty` on a null
   * reference and threw a NullPointerException.
   */
  def safeGet(field : String) : String =
    if (field == null || field.isEmpty) "" else field
}
开发者ID:HugoLinton,项目名称:PST-to-Parquet,代码行数:34,代码来源:EmailModel.scala
示例20: SparkSchemaFnsTest
//设置package包名称以及导入依赖的类
package io.eels.spark
import io.eels.schema.{DateType, DecimalType, Field, LongType, Precision, Scale, StringType, StructType}
import org.apache.spark.sql.types.StructField
import org.scalatest.{FunSuite, Matchers}
/** Verifies that SparkSchemaFns converts between eel schemas and Spark schemas in both directions. */
class SparkSchemaFnsTest extends FunSuite with Matchers {

  // The Spark type names clash with the imported eel ones, so they are reached through an alias.
  import org.apache.spark.sql.{types => sparkTypes}

  // The same four-column schema expressed in eel terms ...
  private val eelSchema = StructType(
    Field("a", StringType),
    Field("b", DecimalType(Precision(14), Scale(4))),
    Field("c", LongType.Signed),
    Field("d", DateType)
  )

  // ... and in Spark terms, with every field nullable.
  private val sparkSchema = sparkTypes.StructType(
    Seq(
      StructField("a", sparkTypes.StringType, true),
      StructField("b", sparkTypes.DecimalType(14, 4), true),
      StructField("c", sparkTypes.LongType, true),
      StructField("d", sparkTypes.DateType, true)
    )
  )

  test("eel schema to spark") {
    SparkSchemaFns.toSparkSchema(eelSchema) shouldBe sparkSchema
  }

  test("spark schema to eel") {
    SparkSchemaFns.fromSparkSchema(sparkSchema) shouldBe eelSchema
  }
}
开发者ID:51zero,项目名称:eel-sdk,代码行数:45,代码来源:SparkSchemaFnsTest.scala
注:本文中的org.apache.spark.sql.types.StructField类示例整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。
请发表评论