• 设为首页
  • 点击收藏
  • 手机版
    手机扫一扫访问
    迪恩网络手机版
  • 关注官方公众号
    微信扫一扫关注
    公众号

Scala RegexTokenizer类代码示例

原作者: [db:作者] 来自: [db:来源] 收藏 邀请

本文整理汇总了Scala中org.apache.spark.ml.feature.RegexTokenizer的典型用法代码示例。如果您正苦于以下问题:Scala RegexTokenizer类的具体用法?Scala RegexTokenizer怎么用?Scala RegexTokenizer使用的例子?那么恭喜您,这里精选的类代码示例或许可以为您提供帮助。



在下文中一共展示了RegexTokenizer类的3个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Scala代码示例。

示例1: CooccurrenceTokenizer

//设置package包名称以及导入依赖的类
package com.indix.ml2npy.text

import org.apache.spark.ml.feature.{RegexTokenizer, StopWordsRemover}


/**
 * A [[RegexTokenizer]] that emits token co-occurrence pairs instead of plain tokens.
 *
 * Each input string is tokenized by the parent [[RegexTokenizer]], de-duplicated,
 * stripped of the default English stop words, and then every unordered pair of the
 * remaining tokens is emitted once as a single string of the form `first_second`.
 */
class CooccurrenceTokenizer extends RegexTokenizer {

  /**
   * Builds the per-row transform function.
   *
   * @return a function mapping an input string to the co-occurrence pairs of its
   *         distinct, non-stop-word tokens; empty when fewer than two tokens remain
   */
  protected override def createTransformFunc: (String) => Seq[String] = {
    // Both values are independent of the input row, so they are built once per
    // transform function instead of once per input string.
    val stopWordSet = StopWordsRemover.loadDefaultStopWords("english").toSet
    val tokenize = super.createTransformFunc

    input => {
      val uniqueTokens: Array[String] = tokenize(input).toSet.toArray
      val tokens = uniqueTokens.filterNot(stopWordSet.contains)

      // NOTE(review): which token comes first inside a pair follows the iteration
      // order of the intermediate Set, not any ordering of the tokens themselves —
      // confirm that downstream consumers do not rely on a canonical pair order.
      for {
        i <- tokens.indices
        j <- (i + 1) until tokens.length
      } yield s"${tokens(i)}_${tokens(j)}"
    }
  }
}
开发者ID:indix,项目名称:ml2npy,代码行数:21,代码来源:CooccurrenceTokenizer.scala


示例2: StopWordsRemoverExample

//设置package包名称以及导入依赖的类
package com.chapter11.SparkMachineLearning

import org.apache.spark.ml.feature.{ RegexTokenizer, Tokenizer }
import org.apache.spark.sql.functions._
import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.feature.StopWordsRemover

/**
 * Example: tokenizes a few sentences with a [[RegexTokenizer]] and then removes
 * stop words with a [[StopWordsRemover]], printing the filtered tokens per row.
 */
object StopWordsRemoverExample {

  /**
   * Runs the example on a local Spark session and prints the `id` and `filtered`
   * columns to standard output.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/")
      .appName("StopWordsRemoverExample")
      .getOrCreate()

    // Release the local Spark context even if the pipeline below fails.
    try {
      val sentence = spark.createDataFrame(Seq(
        (0, "Tokenization,is the process of enchanting words,from the raw text"),
        (1, " If you want,to have more advance tokenization,RegexTokenizer,is a good option"),
        (2, " Here,will provide a sample example on how to tockenize sentences"),
        (3, "This way,you can find all matching occurrences"))).toDF("id", "sentence")

      // With gaps = true the pattern describes the separators: split on runs of
      // non-word characters.
      val regexTokenizer = new RegexTokenizer()
        .setInputCol("sentence")
        .setOutputCol("words")
        .setPattern("\\W+")
        .setGaps(true)

      val regexTokenized = regexTokenizer.transform(sentence)

      val remover = new StopWordsRemover()
        .setInputCol("words")
        .setOutputCol("filtered")

      val newDF = remover.transform(regexTokenized)
      newDF.select("id", "filtered").show(false)
    } finally {
      spark.stop()
    }
  }
}
开发者ID:PacktPublishing,项目名称:Scala-and-Spark-for-Big-Data-Analytics,代码行数:41,代码来源:StopWordsRemoverExample.scala


示例3: TockenizerExample

//设置package包名称以及导入依赖的类
package com.chapter11.SparkMachineLearning
import org.apache.spark.ml.feature.{ RegexTokenizer, Tokenizer }
import org.apache.spark.sql.functions._
import org.apache.spark.sql.SparkSession

/**
 * Example: compares the plain [[Tokenizer]] with a [[RegexTokenizer]] on the same
 * sentences, printing each sentence, its tokens and the token count.
 */
object TockenizerExample {

  /**
   * Runs both tokenizers on a local Spark session and prints their results to
   * standard output.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/")
      .appName("TockenizerExample")
      .getOrCreate()

    // Release the local Spark context even if the pipeline below fails.
    try {
      val sentence = spark.createDataFrame(Seq(
        (0, "Tokenization,is the process of enchanting words,from the raw text"),
        (1, " If you want,to have more advance tokenization,RegexTokenizer,is a good option"),
        (2, " Here,will provide a sample example on how to tockenize sentences"),
        (3, "This way,you can find all matching occurrences"))).toDF("id", "sentence")

      val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")

      // With gaps = true the pattern describes the separators: split on runs of
      // non-word characters.
      val regexTokenizer = new RegexTokenizer()
        .setInputCol("sentence")
        .setOutputCol("words")
        .setPattern("\\W+")
        .setGaps(true)

      showTokenCounts(tokenizer.transform(sentence))
      showTokenCounts(regexTokenizer.transform(sentence))
    } finally {
      spark.stop()
    }
  }

  /**
   * Prints the `sentence` and `words` columns of `tokenized` together with a
   * `tokens` column holding the number of entries in `words`.
   *
   * @param tokenized a data frame containing `sentence` and `words` columns
   */
  private def showTokenCounts(tokenized: org.apache.spark.sql.DataFrame): Unit = {
    val countTokens = udf { (words: Seq[String]) => words.length }

    tokenized.select("sentence", "words")
      .withColumn("tokens", countTokens(col("words")))
      .show(false)
  }
}
开发者ID:PacktPublishing,项目名称:Scala-and-Spark-for-Big-Data-Analytics,代码行数:43,代码来源:TockenizerExample.scala



注:本文中的org.apache.spark.ml.feature.RegexTokenizer类示例整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。


鲜花

握手

雷人

路过

鸡蛋
该文章已有0人参与评论

请发表评论

全部评论

专题导读
上一篇:
Scala MinMaxScaler类代码示例发布时间:2022-05-23
下一篇:
Scala ILoop类代码示例发布时间:2022-05-23
热门推荐
热门话题
阅读排行榜

扫描微信二维码

查看手机版网站

随时了解更新最新资讯

139-2527-9053

在线客服(服务时间 9:00~18:00)

在线QQ客服
地址:深圳市南山区西丽大学城创智工业园
电邮:jeky_zhao#qq.com
移动电话:139-2527-9053

Powered by 互联科技 X3.4© 2001-2213 极客世界.|Sitemap