• 设为首页
  • 点击收藏
  • 手机版
    手机扫一扫访问
    迪恩网络手机版
  • 关注官方公众号
    微信扫一扫关注
    公众号

Scala StringType类代码示例

原作者: [db:作者] 来自: [db:来源] 收藏 邀请

本文整理汇总了Scala中org.apache.spark.sql.types.StringType的典型用法代码示例。如果您正苦于以下问题:Scala StringType类的具体用法?Scala StringType怎么用?Scala StringType使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。



在下文中一共展示了StringType类的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Scala代码示例。

示例1: User

//设置package包名称以及导入依赖的类
package services.users

import java.io.File
import java.nio.charset.Charset
import java.sql.Timestamp

import org.apache.commons.io.FileUtils
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.streaming.StreamingContext
import services.events.EventStream
import services.Util


case class User(userId: String, testFinishTime: Timestamp, nickname: String, gender: String)

object User {
  val DELIMITER = ','
  val USER_FEED = "/Users/mahesh/data/affinitas/feeds/users/"
  val USER_DATA = "/Users/mahesh/data/affinitas/tables/users/"

  var ssc: StreamingContext = null
  var sql: SparkSession = null


  lazy val usersFeedDF = sql.read
    .format("com.databricks.spark.csv")
    .option("header", false)
    .schema(StructType(Array(
      StructField("userId", StringType, true),
      StructField("nickname", StringType, true),
      StructField("gender", StringType, true)
    )
    )).load(User.USER_FEED)

  //EventStream.testFinishStream.print()
  lazy val usersMap = usersFeedDF.rdd.map(record => (record.getString(0), (record.getString(1), record.getString(2))))


  def initialize(sscO: StreamingContext, sqlO: SparkSession) = {
    ssc = sscO
    sql = sqlO

    new File(USER_FEED).mkdirs()
    new File(USER_DATA).mkdirs()

    EventStream.testFinishStream.foreachRDD( {
      rdd => {
        val testFinishMap = rdd.map(record => (record.userId, record.timestamp))
        val userData = testFinishMap.join(usersMap)
          .map(record => Array(record._1, record._2._1, record._2._2._1, record._2._2._2))
          .collect()
        Util.writeCsvToDir(userData, DELIMITER.toString, USER_DATA)
      }
    })
  }
} 
开发者ID:f13mash,项目名称:spark_log_contact,代码行数:58,代码来源:User.scala


示例2: ColumnsTest

//设置package包名称以及导入依赖的类
package com.drakeconsulting.big_data_maker

import org.scalatest.FunSuite
import com.holdenkarau.spark.testing.SharedSparkContext
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.types.{StructField, StringType, LongType, DoubleType}

class ColumnsTest extends FunSuite with SharedSparkContext {
  val numLoops = 100

  test("test StringConstant") {
    val s1 = new StringConstant("f1", "abc")
    assert("abc" === s1.getValue(1))
    assert(StructField("f1", StringType, false) == s1.getStructField)
  }

  test("test RandomLong") {
    val s1 = new RandomLong("f1", 666666L)
    for (x <- 1 to numLoops) {
      assert(s1.getValue(1) >= 0)
      assert(s1.getValue(1) <= 666666L)
    }
    assert(StructField("f1", LongType, false) == s1.getStructField)
  }

  test("test RandomDouble") {
    val s1 = new RandomDouble("f1", 666666.00)
    for (x <- 1 to numLoops) {
      assert(s1.getValue(1) >= 0)
      assert(s1.getValue(1) <= 666666.00)
    }
    assert(StructField("f1", DoubleType, false) == s1.getStructField)
  }

  test("test Categorical") {
    val list = List("a", "b", "c", "d")
    val s1 = new Categorical("f1", list)
    for (x <- 1 to numLoops) {
      val v = s1.getValue(1)
      assert(list.exists(key => v.contains(key)))
    }
    assert(StructField("f1", StringType, false) == s1.getStructField)
  }
} 
开发者ID:dondrake,项目名称:BigDataMaker,代码行数:45,代码来源:TestColumns.scala


示例3: SchemaTest

//设置package包名称以及导入依赖的类
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.types.IntegerType
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.Row
import com.gxq.learn.recontool.utils.SparkContextFactory

object SchemaTest {
  def main(args: Array[String]): Unit = {
    val (sc, ss) = SparkContextFactory.getSparkContext("local")
    val data = sc.parallelize(Seq("Bern;10;12")) // mock for real data

    val schema = new StructType()
    .add("city", StringType, true)
    .add("female", IntegerType, true)
    .add("male", IntegerType, true)

    val cities = data.map(line => {
      val Array(city, female, male) = line.split(";")
      Row(
        city,
        female.toInt,
        male.toInt)
    })

    val citiesDF = ss.createDataFrame(cities, schema)
    citiesDF.show
  }
} 
开发者ID:gong1989313,项目名称:spark-bigdata,代码行数:30,代码来源:SchemaTest.scala


示例4: MedicineProcess

//设置package包名称以及导入依赖的类
package cn.com.warlock.practice.ml

import java.io.BufferedReader
import java.nio.charset.StandardCharsets
import java.nio.file.{Files, Paths}

import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.types.{ArrayType, DataType, StringType}

import scala.collection.mutable.Set

class MedicineProcess(override val uid: String, private val dict: String)
  extends UnaryTransformer[Seq[String], Seq[String], MedicineProcess] {

  def this(dict: String) = this(Identifiable.randomUID("med"), dict)

  // ?????????
  private val wordsSet = loadDict

  // ????
  private def loadDict: Set[String] = {
    val br: BufferedReader = Files.newBufferedReader(Paths.get(dict), StandardCharsets.UTF_8)
    val words = Set[String]()

    var count = 0

    while (br.ready()) {
      words += br.readLine()
      count += 1
    }

    println(s"load med words: $count")

    words
  }

  override protected def createTransformFunc: Seq[String] => Seq[String] = (words: Seq[String]) => {
    // ?? "???", arr ?????????, c ??????? word
    words.foldLeft(List[String]())((arr, c) => {
      val newC = wordsSet.contains(c) match {
        case true => List(c, "_MED_")
        case false => List(c)
      }
      arr ++ newC
    })
  }

  override protected def validateInputType(inputType: DataType): Unit = {
    require(inputType.isInstanceOf[ArrayType],
      s"The input column must be ArrayType, but got $inputType.")
  }

  override protected def outputDataType: DataType = new ArrayType(StringType, true)

  override def copy(extra: ParamMap): MedicineProcess = defaultCopy(extra)
} 
开发者ID:warlock-china,项目名称:spark-meepo,代码行数:59,代码来源:MedicineProcess.scala


示例5: StudyRelation

//设置package包名称以及导入依赖的类
package com.study.spark.datasource

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.sources.{BaseRelation, TableScan}
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.sql.{Row, SQLContext}


class StudyRelation(parameters: Map[String, String])(@transient val sqlContext: SQLContext)
  extends BaseRelation with TableScan {

  override def schema: StructType = {
    // ??? ?? ?????, ?? ??? ???? ????. ???? ?????? ???? ??????, ???? ?? ??? ????
    val fields: Array[StructField] = new Array[StructField](3)
    fields.update(0, new StructField("field1", StringType))
    fields.update(1, new StructField("field2", StringType))
    fields.update(2, new StructField("field2", StringType))
    new StructType(fields.asInstanceOf[Array[StructField]])
  }

  // RDD[Row]? ???? StudyRDD? ???.
  override def buildScan(): RDD[Row] = new StudyRDD(sqlContext, schema)
} 
开发者ID:hackpupu,项目名称:LML,代码行数:24,代码来源:StudyRelation.scala


示例6: ScorePredictor

//设置package包名称以及导入依赖的类
package org.wikimedia.research.recommendation.job.translation

import java.io.File

import org.apache.log4j.{LogManager, Logger}
import org.apache.spark.ml.regression.RandomForestRegressionModel
import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row, SaveMode, SparkSession}

import scala.collection.parallel.mutable.ParArray

object ScorePredictor {
  val log: Logger = LogManager.getLogger(ScorePredictor.getClass)

  def predictScores(spark: SparkSession,
                    modelsInputDir: File,
                    predictionsOutputDir: Option[File],
                    sites: ParArray[String],
                    featureData: DataFrame): Unit = {
    log.info("Scoring items")

    val predictions: Array[DataFrame] = sites.map(target => {
      try {
        log.info("Scoring for " + target)
        log.info("Getting work data for " + target)
        val workData: DataFrame = Utils.getWorkData(spark, featureData, target, exists = false)
        log.info("Loading model for " + target)
        val model = RandomForestRegressionModel.load(
          new File(modelsInputDir, target).getAbsolutePath)
        log.info("Scoring data for " + target)
        val predictions = model
          .setPredictionCol(target)
          .transform(workData)
          .select("id", target)

        predictions
      } catch {
        case unknown: Throwable =>
          log.error("Score for " + target + " failed", unknown)
          val schema = StructType(Seq(
            StructField("id", StringType, nullable = false),
            StructField(target, DoubleType, nullable = true)))
          spark.createDataFrame(spark.sparkContext.emptyRDD[Row], schema)
      }
    }).toArray

    val predictedScores = predictions.reduce((left, right) => left.join(right, Seq("id"), "outer"))

    log.info("Saving predictions")
    predictionsOutputDir.foreach(f = o =>
      predictedScores.coalesce(1)
        .write
        .mode(SaveMode.ErrorIfExists)
        .option("header", value = true)
        .option("compression", "bzip2")
        .csv(new File(o, "allPredictions").getAbsolutePath))
  }
} 
开发者ID:schana,项目名称:recommendation-translation,代码行数:59,代码来源:ScorePredictor.scala


示例7: Dictionary

//设置package包名称以及导入依赖的类
package com.github.uosdmlab.nkp

import org.apache.spark.sql.{SparkSession, Row}
import org.apache.spark.sql.types.{StringType, StructField, StructType}



object Dictionary {

  import org.bitbucket.eunjeon.seunjeon.{Analyzer => EunjeonAnalyzer}

  private var words = Seq.empty[String]

  private def chain(fn: => Any): this.type = {
    fn
    this
  }

  private def syncDictionary(): Unit = EunjeonAnalyzer.setUserDict(words.toIterator)

  def addWords(word: String, words: String*): this.type = addWords(word +: words)

  def addWords(words: Traversable[String]): this.type = chain {
    this.words = this.words ++ words
    syncDictionary()
  }

  def reset(): this.type = chain {
    EunjeonAnalyzer.resetUserDict()
    words = Seq.empty[String]
  }

  def addWordsFromCSV(path: String, paths: String*): this.type =
    addWordsFromCSV(path +: paths)

  def addWordsFromCSV(paths: Traversable[String]): this.type = chain {
    val spark = SparkSession.builder().getOrCreate()

    import spark.implicits._

    val schema = StructType(Array(
      StructField("word", StringType, nullable = false),
      StructField("cost", StringType, nullable = true)))

    val df = spark.read
      .option("sep", ",")
      .option("inferSchema", value = false)
      .option("header", value = false)
      .schema(schema)
      .csv(paths.toSeq: _*)

    val words = df.map {
      case Row(word: String, cost: String) =>
        s"$word,$cost"
      case Row(word: String, null) =>
        word
    }.collect()

    addWords(words)
  }
} 
开发者ID:uosdmlab,项目名称:spark-nkp,代码行数:62,代码来源:Dictionary.scala


示例8: EmptyRDFGraphDataFrame

//设置package包名称以及导入依赖的类
package net.sansa_stack.ml.spark.mining.amieSpark

import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row, SQLContext}

/**
  * @author Lorenz Buehmann
  */
object EmptyRDFGraphDataFrame {

  def get(sqlContext: SQLContext): DataFrame = {
    // convert RDD to DataFrame
    val schemaString = "subject predicate object"

    // generate the schema based on the string of schema
    val schema = StructType(schemaString.split(" ").map(fieldName => StructField(fieldName, StringType, true)))

    // convert triples RDD to rows
    val rowRDD = sqlContext.sparkContext.emptyRDD[Row]

    // apply the schema to the RDD
    val triplesDataFrame = sqlContext.createDataFrame(rowRDD, schema)

    // register the DataFrame as a table
    triplesDataFrame.createOrReplaceTempView("TRIPLES")

    triplesDataFrame
  }
} 
开发者ID:SANSA-Stack,项目名称:SANSA-ML,代码行数:30,代码来源:EmptyRDFGraphDataFrame.scala


示例9: FourDimRec

//设置package包名称以及导入依赖的类
package com.lavsurgut.vizr.vizrec.test

import com.lavsurgut.vizr.specs.data.{DataMap, Schema, SchemaField}
import org.scalatest._
import com.lavsurgut.vizr.specs.vizrec._
import com.lavsurgut.vizr.vizrec.rec._
import org.apache.spark.sql.types.StringType


class FourDimRec extends FlatSpec {

  "it" should "return string" in {

    val spec = VizRecSpec(
      mark = Mark.?,
      fields = Seq(
        Field(
          channel = Channel.ROW,
          field = Some("field1"),
          `type` = Type.NOMINAL
        ),
        Field(
          channel = Channel.COLUMN,
          field = Some("field2"),
          `type` = Type.NOMINAL
        ),
        Field(
          channel = Channel.?,
          field = Some("field3"),
          //aggregate = Some(Aggregate.COUNT),
          `type` = Type.QUANTITATIVE
        ),
        Field(
          channel = Channel.?,
          field = Some("field4"),
          //aggregate = Some(Aggregate.BIN),
          `type` = Type.QUANTITATIVE
        )
      )
    )

    info(spec.toJSON)
    //get the specification part of the spec

    val rankedSet = Recommender.recommend(spec, DataMap(), Schema(Map("test" -> SchemaField("test", StringType, 10))))
    rankedSet.toList.sortWith(
      _.rankingScore(RecommendationType.EFFECTIVENESS) > _.rankingScore(RecommendationType.EFFECTIVENESS)).foreach(
      f => {
        info(f.rankingScore(RecommendationType.EFFECTIVENESS) + " - " + f.spec)
      })

    assert(spec.toString.isInstanceOf[String])
  }

} 
开发者ID:lavsurgut,项目名称:vizr,代码行数:56,代码来源:FourDimRec.scala


示例10: ModelConstraints

//设置package包名称以及导入依赖的类
package com.lavsurgut.vizr.vizrec.test

import com.lavsurgut.vizr.specs.data.{DataMap, Schema, SchemaField}
import com.lavsurgut.vizr.specs.vizrec._
import com.lavsurgut.vizr.vizrec.rec.{Config, WildCardModel}
import com.lavsurgut.vizr.vizrec.rec.constraint.{ConstraintChecker, Constraints => RecConstraints}
import org.apache.spark.sql.types.StringType
import org.scalatest._


class ModelConstraints extends FlatSpec {

  "it" should "return string" in {

    val field1 = Field(channel = Channel.ROW,
      `type` = Type.ORDINAL,
      aggregate = None,
      timeUnit = None,
      field = Some("Cylinders"))

    val field2 = Field(
      channel = Channel.X,
      `type` = Type.QUANTITATIVE,
      aggregate = Some(Aggregate.COUNT),
      field = Some("*"))

    val spec = VizRecSpec(
      mark = Mark.?,
      fields = Seq(field1, field2)
    )

    val model = WildCardModel(
      spec = spec,
      schema = Schema(Map("test" -> SchemaField("test", StringType, 10))),
      config = Config()
    )

    model.wildCardIdx = Seq(Property.CHANNEL)

    val res = ConstraintChecker.checkWildCardModelConstraints(model)._1
    info(res.toString)

    assert(res.isInstanceOf[Boolean])
  }

} 
开发者ID:lavsurgut,项目名称:vizr,代码行数:47,代码来源:ModelConstraints.scala


示例11: Document

//设置package包名称以及导入依赖的类
package sparknlp

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{MapType, StringType, StructField, StructType}

case class Document(
  id: String,
  text: String,
  metadata: scala.collection.Map[String, String] = Map()
)

object Document {
  def apply(row: Row): Document = {
    Document(row.getString(0), row.getString(1), row.getMap[String, String](2))
  }

  val DocumentDataType: StructType = StructType(Array(
    StructField("id",StringType,nullable = true),
    StructField("text",StringType,nullable = true),
    StructField("metadata",MapType(StringType,StringType,valueContainsNull = true),nullable = true)
  ))
} 
开发者ID:alexander-n-thomas,项目名称:sparknlp,代码行数:23,代码来源:Document.scala


示例12: main

//设置package包名称以及导入依赖的类
package org.sparksamples.df
//import org.apache.spark.sql.SQLContext
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType};
package object UserData {
  def main(args: Array[String]): Unit = {
    val customSchema = StructType(Array(
      StructField("no", IntegerType, true),
      StructField("age", StringType, true),
      StructField("gender", StringType, true),
      StructField("occupation", StringType, true),
      StructField("zipCode", StringType, true)));
    val spConfig = (new SparkConf).setMaster("local").setAppName("SparkApp")
    val spark = SparkSession
      .builder()
      .appName("SparkUserData").config(spConfig)
      .getOrCreate()

    val user_df = spark.read.format("com.databricks.spark.csv")
      .option("delimiter", "|").schema(customSchema)
      .load("/home/ubuntu/work/ml-resources/spark-ml/data/ml-100k/u.user")
    val first = user_df.first()
    println("First Record : " + first)

    val num_genders = user_df.groupBy("gender").count().count()
    val num_occupations = user_df.groupBy("occupation").count().count()
    val num_zipcodes = user_df.groupBy("zipCode").count().count()

    println("num_users : " + user_df.count())
    println("num_genders : "+ num_genders)
    println("num_occupations : "+ num_occupations)
    println("num_zipcodes: " + num_zipcodes)
    println("Distribution by Occupation")
    println(user_df.groupBy("occupation").count().show())

  }
} 
开发者ID:PacktPublishing,项目名称:Machine-Learning-with-Spark-Second-Edition,代码行数:40,代码来源:UserData.scala


示例13: Util

//设置package包名称以及导入依赖的类
package org.sparksamples

import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.sql._
import org.apache.spark.sql.types.{StringType, StructField, StructType}


object Util {
  val PATH = "/home/ubuntu/work/spark-2.0.0-bin-hadoop2.7/"
  val DATA_PATH= "../../../data/ml-100k"
  val PATH_MOVIES = DATA_PATH + "/u.item"

  def reduceDimension2(x: Vector) : String= {
    var i = 0
    var l = x.toArray.size
    var l_2 = l/2.toInt
    var x_ = 0.0
    var y_ = 0.0

    for(i <- 0 until l_2) {
      x_ += x(i).toDouble
    }
    for(i <- (l_2 + 1) until l) {
      y_ += x(i).toDouble
    }
    var t = x_ + "," + y_
    return t
  }

  def getMovieDataDF(spark : SparkSession) : DataFrame = {

    //1|Toy Story (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)
    // |0|0|0|1|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0
    val customSchema = StructType(Array(
      StructField("id", StringType, true),
      StructField("name", StringType, true),
      StructField("date", StringType, true),
      StructField("url", StringType, true)));
    val movieDf = spark.read.format("com.databricks.spark.csv")
      .option("delimiter", "|").schema(customSchema)
      .load(PATH_MOVIES)
    return movieDf
  }

} 
开发者ID:PacktPublishing,项目名称:Machine-Learning-with-Spark-Second-Edition,代码行数:46,代码来源:Util.scala


示例14: NTriplesRelation

//设置package包名称以及导入依赖的类
package net.sansa_stack.inference.spark.data.loader.sql

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.sql.sources.{BaseRelation, TableScan}
import org.apache.spark.sql.types.{StringType, StructField, StructType}

import net.sansa_stack.inference.utils.NTriplesStringToRDFTriple

class NTriplesRelation(location: String, userSchema: StructType)
                      (@transient val sqlContext: SQLContext)
    extends BaseRelation
      with TableScan
      with Serializable {
    override def schema: StructType = {
      if (this.userSchema != null) {
        this.userSchema
      }
      else {
        StructType(
          Seq(
            StructField("s", StringType, true),
            StructField("p", StringType, true),
            StructField("o", StringType, true)
        ))
      }
    }
    override def buildScan(): RDD[Row] = {
      val rdd = sqlContext
        .sparkContext
        .textFile(location)

      val converter = new NTriplesStringToRDFTriple()

      val rows = rdd.flatMap(x => converter.apply(x)).map(t => Row.fromSeq(Seq(t.s, t.p, t.o)))

      rows
    }
  } 
开发者ID:SANSA-Stack,项目名称:SANSA-Inference,代码行数:40,代码来源:NTriplesRelation.scala


示例15: TestData

//设置package包名称以及导入依赖的类
package be.dataminded.wharlord.test

import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}

object TestData {

  def makeIntegerDf(spark: SparkSession, numbers: Seq[Int]): DataFrame =
    spark.createDataFrame(
      spark.sparkContext.makeRDD(numbers.map(Row(_))),
      StructType(List(StructField("column", IntegerType, nullable = false)))
    )

  def makeNullableStringDf(spark: SparkSession, strings: Seq[String]): DataFrame =
    spark.createDataFrame(spark.sparkContext.makeRDD(strings.map(Row(_))), StructType(List(StructField("column", StringType, nullable = true))))

  def makeIntegersDf(spark: SparkSession, row1: Seq[Int], rowN: Seq[Int]*): DataFrame = {
    val rows = row1 :: rowN.toList
    val numCols = row1.size
    val rdd = spark.sparkContext.makeRDD(rows.map(Row(_:_*)))
    val schema = StructType((1 to numCols).map(idx => StructField("column" + idx, IntegerType, nullable = false)))
    spark.createDataFrame(rdd, schema)
  }

} 
开发者ID:datamindedbe,项目名称:wharlord,代码行数:26,代码来源:TestData.scala


示例16: SparkTermCandidatesWeighter

//设置package包名称以及导入依赖的类
package ru.ispras.atr.rank

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.functions.desc
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}
import ru.ispras.atr.datamodel.{DSDataset, TermCandidate}
import ru.ispras.atr.features.FeatureConfig


abstract class SparkTermCandidatesWeighter(docsToShow:Int) extends TermCandidatesWeighter {
  val termDFName = "Term"

  def allFeatures: Seq[FeatureConfig]

  def convert2FeatureSpace(candidates: Seq[TermCandidate], dataset: DSDataset):Seq[Seq[Double]] = {
    val resByFeatures: Seq[Seq[Double]] = allFeatures.map(f => {
      //iterate by features first, because it lets to estimate time per feature and (maybe) it is faster due to caching
      log.info(s"Initializing feature ${f.id}...")
      val featureComputer = f.build(candidates, dataset)
      log.info(s"Computing feature ${f.id}...")
      featureComputer.compute(candidates)
    })
    log.info(s"${allFeatures.size} features have been computed")
    resByFeatures.transpose
    }

  def convertToDF(termNames: Seq[String], featureNames: Seq[String], resByTerms: Seq[Seq[Double]]): DataFrame = {
    val header = StructField(termDFName, StringType) +: featureNames.map(f => StructField(f, DoubleType))
    val schema = StructType(header)
    val rows = termNames.zip(resByTerms).map(a => Row.fromSeq(a._1 +: a._2))
    val rowsRDD: RDD[Row] = SparkConfigs.sc.parallelize(rows)
    val df = SparkConfigs.sqlc.createDataFrame(rowsRDD, schema)
    df
  }

  def weightAndSort(candidates: Seq[TermCandidate], dataset: DSDataset): Iterable[(String, Double)] = {
    val featureValues = convert2FeatureSpace(candidates, dataset)
    val initDF = convertToDF(candidates.map(_.verboseRepr(docsToShow)), allFeatures.map(_.id), featureValues)
    val weightedDF = weight(initDF)
    val termNamesDF = weightedDF.select(termDFName,id).sort(desc(id))
    val weightColId: String = id //for serialization
    val termColId: String = termDFName
    val terms = termNamesDF.rdd.map(r => (r.getAs[String](termColId), r.getAs[Double](weightColId))).collect()
    terms
  }

  def weight(df: DataFrame) : DataFrame
}

object SparkConfigs {
  val sparkConf = new SparkConf()
    .setAppName("ATR Evaluation System")
    .setMaster("local[16]")
    .set("spark.driver.memory", "1g")
  val sc = new SparkContext(sparkConf)
  val sqlc = new HiveContext(sc)
} 
开发者ID:ispras,项目名称:atr4s,代码行数:61,代码来源:SparkTermCandidatesWeighter.scala


示例17: ConversionsSuite

//设置package包名称以及导入依赖的类
package org.apache.spark.orientdb.documents

import com.orientechnologies.orient.core.metadata.schema.OType
import com.orientechnologies.orient.core.record.impl.ODocument
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.scalatest.FunSuite

class ConversionsSuite extends FunSuite {

  test("Spark datatype to OrientDB datatype test") {
    val orientDBType = Conversions.sparkDTtoOrientDBDT(StringType)
    assert(orientDBType === OType.STRING)
  }

  test("Convert Spark Row to Orient DB ODocument") {
    val expectedData = new ODocument()
    expectedData.field("key", 1, OType.INTEGER)
    expectedData.field("value", "Spark datasource for Orient DB", OType.STRING)

    val conf = new SparkConf().setAppName("ConversionsSuite").setMaster("local[*]")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    val rows = sqlContext.createDataFrame(sc.parallelize(Seq(Row(1, "Spark datasource for Orient DB"))),
      StructType(Array(StructField("key", IntegerType, true),
        StructField("value", StringType, true)))).collect()

    val actualData = Conversions.convertRowsToODocuments(rows(0))
    assert(expectedData.field[Int]("key") == actualData.field[Int]("key"))
    assert(expectedData.field[String]("value") == actualData.field[String]("value"))
    sc.stop()
  }

  test("Convert OrientDB ODocument to Spark Row") {
    val oDocument = new ODocument()
    oDocument.field("key", 1, OType.INTEGER)
    oDocument.field("value", "Orient DB ODocument to Spark Row", OType.STRING)

    val schema = StructType(Array(StructField("key", IntegerType),
      StructField("value", StringType)))

    val expectedData = Row(1, "Orient DB ODocument to Spark Row")
    val actualData = Conversions.convertODocumentsToRows(oDocument, schema)

    assert(expectedData === actualData)
  }

  test("Return field of correct type") {
    val field = Conversions.orientDBDTtoSparkDT(IntegerType, "1")
    assert(field.isInstanceOf[Int])
  }
} 
开发者ID:orientechnologies,项目名称:spark-orientdb,代码行数:55,代码来源:ConversionsSuite.scala


示例18: SchemaToMongo

//设置package包名称以及导入依赖的类
package nsmc.conversion.types

import com.mongodb.casbah.Imports._
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{StructType, IntegerType, StringType, StructField}

object SchemaToMongo {
  def getMongoRecord(schema: Seq[StructField], r: Row) : DBObject = {
    val converted = schema.zip(r.toSeq).map(toMongo)
    MongoDBObject(converted:_*)
  }

  private def toMongo(p:(StructField, Any)) : (String, Any) = {
    p match {
      case (sf, a) =>
        sf.dataType match {
          // TODO: leaving out some of the atomic types
          case StringType => (sf.name, a)
          case IntegerType => (sf.name, a)
          case StructType(s) => (sf.name, getMongoRecord(s, a.asInstanceOf[Row]))
        }
    }
  }


} 
开发者ID:baank,项目名称:spark-mongodb-connector,代码行数:27,代码来源:SchemaToMongo.scala


示例19: Locus

//设置package包名称以及导入依赖的类
package org.broadinstitute.hail.variant

import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.broadinstitute.hail.check.Gen
import org.json4s._

object Locus {
  val simpleContigs: Seq[String] = (1 to 22).map(_.toString) ++ Seq("X", "Y", "MT")

  val schema: StructType =
    StructType(Array(
      StructField("contig", StringType, nullable = false),
      StructField("position", IntegerType, nullable = false)))

  def gen(contigs: Seq[String]): Gen[Locus] =
    Gen.zip(Gen.oneOfSeq(contigs), Gen.posInt)
      .map { case (contig, pos) => Locus(contig, pos) }

  def gen: Gen[Locus] = gen(simpleContigs)
}

case class Locus(contig: String, position: Int) extends Ordered[Locus] {
  def compare(that: Locus): Int = {
    var c = Contig.compare(contig, that.contig)
    if (c != 0)
      return c

    position.compare(that.position)
  }

  def toJSON: JValue = JObject(
    ("contig", JString(contig)),
    ("position", JInt(position)))

  override def toString: String = s"$contig:$position"
} 
开发者ID:Sun-shan,项目名称:Hail_V2,代码行数:37,代码来源:Locus.scala


示例20: EmailModel

//设置package包名称以及导入依赖的类
package com.hugolinton.model

import java.util.UUID

import com.pff.PSTMessage
import org.apache.spark.sql.types.{ArrayType, StringType, StructField, StructType}


case class EmailModel (id : String, emailBody : String, sentTo : Array[String], ccTo : Array[String]) extends Product

object EmailModel {

  final val schema = StructType(Array(StructField("id", StringType, false),
    (StructField("emailBody", StringType, false)),
    (StructField("sentTo", ArrayType(StringType))),
    (StructField("ccTo", ArrayType(StringType)))
  ))

  def pstToModel(pst : PSTMessage) = {
    val uniqueID = UUID.randomUUID().toString();
    val to = pst.getDisplayTo.replaceAll("""[.,\/#!$%\^&\*;:{}=\-_`~()\s]""", "").toLowerCase.split(";")
    val cc = pst.getDisplayCC.replaceAll("""[.,\/#!$%\^&\*;:{}=\-_`~()\s]""", "").toLowerCase.split(";")
    EmailModel(uniqueID,safeGet(pst.getBody),to,cc)
  }


  def safeGet(field : String) : String = {
    if(field.isEmpty || field == null){
      return ""
    }
    field
  }
} 
开发者ID:HugoLinton,项目名称:PST-to-Parquet,代码行数:34,代码来源:EmailModel.scala



注:本文中的org.apache.spark.sql.types.StringType类示例整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。


鲜花

握手

雷人

路过

鸡蛋
该文章已有0人参与评论

请发表评论

全部评论

专题导读
上一篇:
Scala Callable类代码示例发布时间:2022-05-23
下一篇:
Scala StaticAnnotation类代码示例发布时间:2022-05-23
热门推荐
热门话题
阅读排行榜

扫描微信二维码

查看手机版网站

随时了解更新最新资讯

139-2527-9053

在线客服(服务时间 9:00~18:00)

在线QQ客服
地址:深圳市南山区西丽大学城创智工业园
电邮:jeky_zhao#qq.com
移动电话:139-2527-9053

Powered by 互联科技 X3.4© 2001-2213 极客世界.|Sitemap