This article collects typical usage examples of the Java class org.apache.spark.sql.Encoders. If you have been wondering what exactly Encoders does and how to use it, the curated code examples below should help.
The Encoders class belongs to the org.apache.spark.sql package. Twenty code examples for the class are shown below, sorted by popularity by default.
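Before the examples, a quick orientation: an Encoder tells Spark SQL how to convert JVM objects to and from its internal binary row format, and Encoders is the factory class for the common ones (primitives, strings, tuples, JavaBeans). A minimal self-contained sketch of the basic pattern (class and app names here are illustrative only):

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;

import java.util.Arrays;

public class EncodersQuickStart {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .appName("EncodersQuickStart")
        .master("local")
        .getOrCreate();

    // Encoders.STRING() describes how to serialize java.lang.String
    // into Spark's internal row format.
    Dataset<String> ds = spark.createDataset(
        Arrays.asList("alpha", "beta", "gamma"), Encoders.STRING());

    ds.show();
    spark.stop();
  }
}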
Example 1: toJson
import org.apache.spark.sql.Encoders; // import the required package/class
/**
 * Converts a set of FHIR resources to JSON.
 *
 * @param dataset a dataset containing FHIR resources
 * @param resourceType the FHIR resource type
 * @return a dataset of JSON strings for the FHIR resources
 */
public static Dataset<String> toJson(Dataset<?> dataset, String resourceType) {

  Dataset<IBaseResource> resourceDataset =
      dataset.as(FhirEncoders.forStu3()
          .getOrCreate()
          .of(resourceType));

  return resourceDataset.map(new ToJson(), Encoders.STRING());
}
Developer: cerner, Project: bunsen, Lines: 17, Source: Functions.java
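The ToJson function referenced above is defined elsewhere in Bunsen and is not shown here. A hedged sketch of what such a map function could look like, built on HAPI FHIR's JSON parser; the lazy per-executor parser initialization is an assumption, not necessarily Bunsen's actual implementation:

import ca.uhn.fhir.context.FhirContext;
import ca.uhn.fhir.parser.IParser;
import org.apache.spark.api.java.function.MapFunction;
import org.hl7.fhir.instance.model.api.IBaseResource;

// Hypothetical sketch: converts each FHIR resource to its JSON form.
// FhirContext is not serializable, so it is created lazily per executor.
class ToJson implements MapFunction<IBaseResource, String> {

  private transient IParser parser;

  @Override
  public String call(IBaseResource resource) {
    if (parser == null) {
      parser = FhirContext.forDstu3().newJsonParser();
    }
    return parser.encodeResourceToString(resource);
  }
}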
Example 2: main
import org.apache.spark.sql.Encoders; // import the required package/class
public static void main(String[] args) {
  SparkSession spark = SparkSession
      .builder()
      .appName("Dataset-JavaBean")
      .master("local[4]")
      .getOrCreate();

  //
  // The Java API requires you to explicitly instantiate an encoder for
  // any JavaBean you want to use for schema inference
  //
  Encoder<Number> numberEncoder = Encoders.bean(Number.class);

  //
  // Create a container of the JavaBean instances
  //
  List<Number> data = Arrays.asList(
      new Number(1, "one", "un"),
      new Number(2, "two", "deux"),
      new Number(3, "three", "trois"));

  //
  // Use the encoder and the container of JavaBean instances to create a
  // Dataset
  //
  Dataset<Number> ds = spark.createDataset(data, numberEncoder);

  System.out.println("*** here is the schema inferred from the bean");
  ds.printSchema();

  System.out.println("*** here is the data");
  ds.show();

  // Use the convenient bean-inferred column names to query
  System.out.println("*** filter by one column and fetch others");
  ds.where(col("i").gt(2)).select(col("english"), col("french")).show();

  spark.stop();
}
Developer: spirom, Project: learning-spark-with-java, Lines: 38, Source: JavaBean.java
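The Number JavaBean is not shown in this snippet. Judging from the constructor calls and the queried columns (i, english, french), it could look roughly like the following hedged reconstruction; the bean encoder needs the getters/setters and a no-arg constructor:

import java.io.Serializable;

// Hypothetical reconstruction of the Number JavaBean assumed by Example 2.
public class Number implements Serializable {
  private int i;
  private String english;
  private String french;

  public Number() {}  // no-arg constructor required for bean encoding

  public Number(int i, String english, String french) {
    this.i = i;
    this.english = english;
    this.french = french;
  }

  public int getI() { return i; }
  public void setI(int i) { this.i = i; }
  public String getEnglish() { return english; }
  public void setEnglish(String english) { this.english = english; }
  public String getFrench() { return french; }
  public void setFrench(String french) { this.french = french; }
}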
Example 3: readText
import org.apache.spark.sql.Encoders; // import the required package/class
private Dataset<Row> readText(String path) throws Exception {
  Dataset<Row> lines = Contexts.getSparkSession().read().text(path);

  if (config.hasPath("translator")) {
    Dataset<Tuple2<String, String>> keyedLines = lines.map(
        new PrepareLineForTranslationFunction(), Encoders.tuple(Encoders.STRING(), Encoders.STRING()));

    TranslateFunction<String, String> translateFunction = new TranslateFunction<>(config.getConfig("translator"));

    return keyedLines.flatMap(translateFunction, RowEncoder.apply(translateFunction.getSchema()));
  }
  else {
    return lines;
  }
}
Developer: cloudera-labs, Project: envelope, Lines: 16, Source: FileSystemInput.java
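The key detail here is Encoders.tuple(Encoders.STRING(), Encoders.STRING()), which composes an encoder for a Scala Tuple2<String, String> so keyed pairs can be carried through a Dataset without a bean class. A minimal standalone illustration of the same pattern (names are illustrative only):

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import scala.Tuple2;

import java.util.Arrays;

public class TupleEncoderDemo {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().master("local").getOrCreate();

    // Encode key-value pairs as a Dataset of Tuple2 without a bean class.
    Dataset<Tuple2<String, String>> pairs = spark.createDataset(
        Arrays.asList(new Tuple2<>("k1", "v1"), new Tuple2<>("k2", "v2")),
        Encoders.tuple(Encoders.STRING(), Encoders.STRING()));

    pairs.show();  // columns are named _1 and _2 by default
    spark.stop();
  }
}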
Example 4: getMaxInt
import org.apache.spark.sql.Encoders; // import the required package/class
public static Dataset getMaxInt(Dataset ds, final String columnName) {
  Encoder<Integer> integerEncoder = Encoders.INT();
  log.debug("getMaxInt on " + columnName);

  Dataset dso = ds.mapPartitions(new MapPartitionsFunction() {
    List<Integer> result = new ArrayList<>();

    @Override
    public Iterator call(Iterator input) throws Exception {
      int curMax = -1;
      while (input.hasNext()) {
        Integer wInt = ((Row) input.next()).getAs(columnName);
        // only add if we found a larger value;
        // think of this as a reduce before the partition reduce
        log.debug("wInt " + wInt.intValue());
        log.debug("curMax " + curMax);
        log.debug("Checking max int");
        if (wInt.intValue() > curMax) {
          result.add(wInt);
          curMax = wInt.intValue();
        }
      }
      return result.iterator();
    }
  }, integerEncoder);

  return dso.toDF(columnName).agg(max(columnName));
}
Developer: kineticadb, Project: kinetica-connector-spark, Lines: 37, Source: TypeIntProcessor.java
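The hand-written partition scan above mainly pre-reduces the data shipped to the final aggregation; Spark's built-in aggregate computes partition-local maxima internally and gives the same answer. A minimal hedged equivalent:

import static org.apache.spark.sql.functions.max;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

public class MaxIntSimple {
  // Equivalent result via the built-in aggregate: Spark computes the
  // partition-local maxima and the final max internally.
  public static Dataset<Row> getMaxInt(Dataset<Row> ds, String columnName) {
    return ds.agg(max(columnName));
  }
}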
Example 5: start
import org.apache.spark.sql.Encoders; // import the required package/class
private void start() {
  SparkSession spark = SparkSession.builder()
      .appName("Array to Dataset")
      // .master("local")
      .master("spark://10.0.100.81:7077")
      .getOrCreate();

  String[] l = new String[] { "a", "b", "c", "d" };
  List<String> data = Arrays.asList(l);
  Dataset<String> df = spark.createDataset(data, Encoders.STRING());
  df.show();
}
Developer: jgperrin, Project: net.jgp.labs.spark, Lines: 13, Source: ArrayToDatasetApp.java
Example 6: start
import org.apache.spark.sql.Encoders; // import the required package/class
private void start() {
  SparkSession spark = SparkSession.builder()
      .appName("Array to Dataframe")
      .master("local")
      .getOrCreate();

  String[] l = new String[] { "a", "b", "c", "d" };
  List<String> data = Arrays.asList(l);
  Dataset<String> ds = spark.createDataset(data, Encoders.STRING());
  Dataset<Row> df = ds.toDF();
  df.show();
}
Developer: jgperrin, Project: net.jgp.labs.spark, Lines: 13, Source: ArrayToDataframeApp.java
Example 7: start
import org.apache.spark.sql.Encoders; // import the required package/class
private void start() {
  SparkSession spark = SparkSession.builder().master("local").getOrCreate();

  List<Integer> data = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
  Dataset<Integer> df = spark.createDataset(data, Encoders.INT());
  df.show();
  df.printSchema();

  Integer sumByReduce = df.reduce(new SumByReduce());
  System.out.println("Sum should be 55 and it is... " + sumByReduce);
}
Developer: jgperrin, Project: net.jgp.labs.spark, Lines: 11, Source: ReducerApp.java
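SumByReduce is defined elsewhere in the project; given how Dataset.reduce is called, it is presumably a ReduceFunction<Integer> along these lines (a hedged sketch):

import org.apache.spark.api.java.function.ReduceFunction;

// Hypothetical sketch of the reducer assumed by Example 7:
// pairwise-sums the elements of the Dataset<Integer>.
class SumByReduce implements ReduceFunction<Integer> {
  @Override
  public Integer call(Integer a, Integer b) {
    return a + b;
  }
}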
Example 8: start
import org.apache.spark.sql.Encoders; // import the required package/class
private void start() {
  SparkSession spark = SparkSession.builder().appName("CSV to Dataset<Book>").master("local").getOrCreate();

  String filename = "data/books.csv";
  Dataset<Row> df = spark.read().format("csv")
      .option("inferSchema", "true")
      .option("header", "true")
      .load(filename);
  df.show();

  Dataset<Book> bookDf = df.map(new BookMapper(), Encoders.bean(Book.class));
  bookDf.show();
}
Developer: jgperrin, Project: net.jgp.labs.spark, Lines: 12, Source: CsvToDatasetBook.java
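Neither Book nor BookMapper appears in this snippet. A plausible shape for both, inferred only from how they are used here; the field names are assumptions, since the real class presumably mirrors the columns of data/books.csv:

import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Row;

import java.io.Serializable;

// Hypothetical Book bean; the real class likely has more fields
// matching the columns of data/books.csv.
public class Book implements Serializable {
  private String title;
  private String author;

  public String getTitle() { return title; }
  public void setTitle(String title) { this.title = title; }
  public String getAuthor() { return author; }
  public void setAuthor(String author) { this.author = author; }
}

// Hypothetical row-to-bean mapper assumed by Example 8.
class BookMapper implements MapFunction<Row, Book> {
  @Override
  public Book call(Row row) {
    Book book = new Book();
    book.setTitle(row.getAs("title"));
    book.setAuthor(row.getAs("author"));
    return book;
  }
}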
Example 9: start
import org.apache.spark.sql.Encoders; // import the required package/class
private void start() {
  SparkSession spark = SparkSession.builder().appName("CSV to Dataset<Book> as JSON").master("local").getOrCreate();

  String filename = "data/books.csv";
  Dataset<Row> df = spark.read().format("csv")
      .option("inferSchema", "true")
      .option("header", "true")
      .load(filename);
  df.show();

  Dataset<String> bookDf = df.map(new BookMapper(), Encoders.STRING());
  bookDf.show(20, 132);

  Dataset<Row> bookAsJsonDf = spark.read().json(bookDf);
  bookAsJsonDf.show();
}
Developer: jgperrin, Project: net.jgp.labs.spark, Lines: 15, Source: CsvToDatasetBookAsJson.java
Example 10: start
import org.apache.spark.sql.Encoders; // import the required package/class
private void start() {
  SparkSession spark = SparkSession.builder().appName("Book URL Builder").master("local").getOrCreate();

  String filename = "data/books.csv";
  Dataset<Row> df = spark.read().format("csv")
      .option("inferSchema", "true")
      .option("header", "true")
      .load(filename);
  df.show();

  Dataset<String> ds = df.map(new BookUrlBuilder(), Encoders.STRING());
  ds.printSchema();
  ds.show(20, 80);
}
Developer: jgperrin, Project: net.jgp.labs.spark, Lines: 13, Source: BookUrlBuilderApp.java
Example 11: test0FailOnIndexCreationDisabled
import org.apache.spark.sql.Encoders; // import the required package/class
@Test(expected = EsHadoopIllegalArgumentException.class)
public void test0FailOnIndexCreationDisabled() throws Exception {
  String target = wrapIndex("test-nonexisting/data");
  JavaStreamingQueryTestHarness<RecordBean> test =
      new JavaStreamingQueryTestHarness<>(spark, Encoders.bean(RecordBean.class));

  RecordBean doc1 = new RecordBean();
  doc1.setId(1);
  doc1.setName("Spark");

  RecordBean doc2 = new RecordBean();
  doc2.setId(2);
  doc2.setName("Hadoop");

  RecordBean doc3 = new RecordBean();
  doc3.setId(3);
  doc3.setName("YARN");

  Dataset<RecordBean> dataset = test
      .withInput(doc1)
      .withInput(doc2)
      .withInput(doc3)
      .expectingToThrow(EsHadoopIllegalArgumentException.class)
      .stream();

  test.run(
      dataset.writeStream()
          .option("checkpointLocation", checkpoint(target))
          .option(ES_INDEX_AUTO_CREATE, "no")
          .format("es"),
      target
  );

  assertTrue(!RestUtils.exists(target));
}
Developer: elastic, Project: elasticsearch-hadoop, Lines: 35, Source: AbstractJavaEsSparkStructuredStreamingTest.java
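Examples 11 through 15 all stream a RecordBean encoded with Encoders.bean. The bean itself is not shown; from the setters used, it is presumably as simple as this hedged reconstruction:

import java.io.Serializable;

// Hypothetical reconstruction of the RecordBean used by Examples 11-15.
public class RecordBean implements Serializable {
  private int id;
  private String name;

  public int getId() { return id; }
  public void setId(int id) { this.id = id; }
  public String getName() { return name; }
  public void setName(String name) { this.name = name; }
}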
Example 12: test1BasicWrite
import org.apache.spark.sql.Encoders; // import the required package/class
@Test
public void test1BasicWrite() throws Exception {
  String target = wrapIndex("test-write/data");
  JavaStreamingQueryTestHarness<RecordBean> test =
      new JavaStreamingQueryTestHarness<>(spark, Encoders.bean(RecordBean.class));

  RecordBean doc1 = new RecordBean();
  doc1.setId(1);
  doc1.setName("Spark");

  RecordBean doc2 = new RecordBean();
  doc2.setId(2);
  doc2.setName("Hadoop");

  RecordBean doc3 = new RecordBean();
  doc3.setId(3);
  doc3.setName("YARN");

  Dataset<RecordBean> dataset = test
      .withInput(doc1)
      .withInput(doc2)
      .withInput(doc3)
      .stream();

  test.run(
      dataset.writeStream()
          .option("checkpointLocation", checkpoint(target))
          .format("es"),
      target
  );

  assertTrue(RestUtils.exists(target));
  assertThat(RestUtils.get(target + "/_search?"), containsString("Spark"));
  assertThat(RestUtils.get(target + "/_search?"), containsString("Hadoop"));
  assertThat(RestUtils.get(target + "/_search?"), containsString("YARN"));
}
Developer: elastic, Project: elasticsearch-hadoop, Lines: 36, Source: AbstractJavaEsSparkStructuredStreamingTest.java
Example 13: test1WriteWithMappingId
import org.apache.spark.sql.Encoders; // import the required package/class
@Test
public void test1WriteWithMappingId() throws Exception {
  String target = wrapIndex("test-write-id/data");
  JavaStreamingQueryTestHarness<RecordBean> test =
      new JavaStreamingQueryTestHarness<>(spark, Encoders.bean(RecordBean.class));

  RecordBean doc1 = new RecordBean();
  doc1.setId(1);
  doc1.setName("Spark");

  RecordBean doc2 = new RecordBean();
  doc2.setId(2);
  doc2.setName("Hadoop");

  RecordBean doc3 = new RecordBean();
  doc3.setId(3);
  doc3.setName("YARN");

  Dataset<RecordBean> dataset = test
      .withInput(doc1)
      .withInput(doc2)
      .withInput(doc3)
      .stream();

  test.run(
      dataset.writeStream()
          .option("checkpointLocation", checkpoint(target))
          .option("es.mapping.id", "id")
          .format("es"),
      target
  );

  assertEquals(3, JavaEsSpark.esRDD(new JavaSparkContext(spark.sparkContext()), target).count());
  assertTrue(RestUtils.exists(target + "/1"));
  assertTrue(RestUtils.exists(target + "/2"));
  assertTrue(RestUtils.exists(target + "/3"));
  assertThat(RestUtils.get(target + "/_search?"), containsString("Spark"));
}
Developer: elastic, Project: elasticsearch-hadoop, Lines: 39, Source: AbstractJavaEsSparkStructuredStreamingTest.java
Example 14: test1WriteWithMappingExclude
import org.apache.spark.sql.Encoders; // import the required package/class
@Test
public void test1WriteWithMappingExclude() throws Exception {
  String target = wrapIndex("test-mapping-exclude/data");
  JavaStreamingQueryTestHarness<RecordBean> test =
      new JavaStreamingQueryTestHarness<>(spark, Encoders.bean(RecordBean.class));

  RecordBean doc1 = new RecordBean();
  doc1.setId(1);
  doc1.setName("Spark");

  RecordBean doc2 = new RecordBean();
  doc2.setId(2);
  doc2.setName("Hadoop");

  RecordBean doc3 = new RecordBean();
  doc3.setId(3);
  doc3.setName("YARN");

  Dataset<RecordBean> dataset = test
      .withInput(doc1)
      .withInput(doc2)
      .withInput(doc3)
      .stream();

  test.run(
      dataset.writeStream()
          .option("checkpointLocation", checkpoint(target))
          .option(ES_MAPPING_EXCLUDE, "name")
          .format("es"),
      target
  );

  assertTrue(RestUtils.exists(target));
  assertThat(RestUtils.get(target + "/_search?"), not(containsString("Spark")));
  assertThat(RestUtils.get(target + "/_search?"), not(containsString("Hadoop")));
  assertThat(RestUtils.get(target + "/_search?"), not(containsString("YARN")));
}
Developer: elastic, Project: elasticsearch-hadoop, Lines: 37, Source: AbstractJavaEsSparkStructuredStreamingTest.java
Example 15: test1MultiIndexWrite
import org.apache.spark.sql.Encoders; // import the required package/class
@Test
public void test1MultiIndexWrite() throws Exception {
  String target = wrapIndex("test-write-tech-{name}/data");
  JavaStreamingQueryTestHarness<RecordBean> test =
      new JavaStreamingQueryTestHarness<>(spark, Encoders.bean(RecordBean.class));

  RecordBean doc1 = new RecordBean();
  doc1.setId(1);
  doc1.setName("spark");

  RecordBean doc2 = new RecordBean();
  doc2.setId(2);
  doc2.setName("hadoop");

  Dataset<RecordBean> dataset = test
      .withInput(doc1)
      .withInput(doc2)
      .stream();

  test.run(
      dataset.writeStream()
          .option("checkpointLocation", checkpoint(target))
          .format("es"),
      target
  );

  assertTrue(RestUtils.exists(wrapIndex("test-write-tech-spark/data")));
  assertTrue(RestUtils.exists(wrapIndex("test-write-tech-hadoop/data")));
  assertThat(RestUtils.get(wrapIndex("test-write-tech-spark/data/_search?")), containsString("\"name\":\"spark\""));
  assertThat(RestUtils.get(wrapIndex("test-write-tech-hadoop/data/_search?")), containsString("\"name\":\"hadoop\""));
}
Developer: elastic, Project: elasticsearch-hadoop, Lines: 32, Source: AbstractJavaEsSparkStructuredStreamingTest.java
Example 16: main
import org.apache.spark.sql.Encoders; // import the required package/class
public static void main(String[] args) {
  // Windows-specific property if Hadoop is not installed or HADOOP_HOME is not set
  System.setProperty("hadoop.home.dir", "E:\\hadoop");

  // Build a Spark Session
  SparkSession sparkSession = SparkSession
      .builder()
      .master("local")
      .config("spark.sql.warehouse.dir", "file:///E:/hadoop/warehouse")
      .appName("EdgeBuilder")
      .getOrCreate();
  Logger rootLogger = LogManager.getRootLogger();
  rootLogger.setLevel(Level.WARN);

  // Read the CSV data
  Dataset<Row> emp_ds = sparkSession.read()
      .format("com.databricks.spark.csv")
      .option("header", "true")
      .option("inferSchema", "true")
      .load("src/main/resources/employee.txt");

  UDF2 calcDays = new CalcDaysUDF();
  // Register the UDF in the Spark Session created above
  sparkSession.udf().register("calcDays", calcDays, DataTypes.LongType);

  emp_ds.createOrReplaceTempView("emp_ds");
  emp_ds.printSchema();
  emp_ds.show();

  sparkSession.sql("select calcDays(hiredate,'dd-MM-yyyy') from emp_ds").show();

  // Instantiate the UDAF
  AverageUDAF calcAvg = new AverageUDAF();
  // Register the UDAF with the SparkSession
  sparkSession.udf().register("calAvg", calcAvg);
  // Use the UDAF
  sparkSession.sql("select deptno, calAvg(salary) from emp_ds group by deptno").show();

  // Type-safe UDAF
  TypeSafeUDAF typeSafeUDAF = new TypeSafeUDAF();
  Dataset<Employee> emf = emp_ds.as(Encoders.bean(Employee.class));
  emf.printSchema();
  emf.show();

  TypedColumn<Employee, Double> averageSalary = typeSafeUDAF.toColumn().name("averageTypeSafe");
  Dataset<Double> result = emf.select(averageSalary);
  result.show();
}
Developer: PacktPublishing, Project: Apache-Spark-2x-for-Java-Developers, Lines: 51, Source: UDFExample.java
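The Employee bean that Encoders.bean(Employee.class) maps onto is defined elsewhere. From the columns the queries reference (deptno, salary, hiredate), a minimal hedged sketch; the exact field types in the real class may differ:

import java.io.Serializable;

// Hypothetical Employee bean; the real class likely mirrors
// every column of employee.txt.
public class Employee implements Serializable {
  private int deptno;
  private double salary;
  private String hiredate;  // parsed by the calcDays UDF as dd-MM-yyyy

  public int getDeptno() { return deptno; }
  public void setDeptno(int deptno) { this.deptno = deptno; }
  public double getSalary() { return salary; }
  public void setSalary(double salary) { this.salary = salary; }
  public String getHiredate() { return hiredate; }
  public void setHiredate(String hiredate) { this.hiredate = hiredate; }
}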
Example 17: bufferEncoder
import org.apache.spark.sql.Encoders; // import the required package/class
public Encoder<Average> bufferEncoder() {
  return Encoders.bean(Average.class);
}
Developer: PacktPublishing, Project: Apache-Spark-2x-for-Java-Developers, Lines: 4, Source: TypeSafeUDAF.java
Example 18: outputEncoder
import org.apache.spark.sql.Encoders; // import the required package/class
public Encoder<Double> outputEncoder() {
  return Encoders.DOUBLE();
}
Developer: PacktPublishing, Project: Apache-Spark-2x-for-Java-Developers, Lines: 4, Source: TypeSafeUDAF.java
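Examples 17 and 18 are fragments of the same TypeSafeUDAF class. In Spark 2.x a typed UDAF extends org.apache.spark.sql.expressions.Aggregator, and the two encoder methods slot in as shown in this hedged skeleton; the Average buffer class (a sum/count pair) and the aggregation logic are assumptions:

import org.apache.spark.sql.Encoder;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.expressions.Aggregator;

// Hypothetical skeleton showing where bufferEncoder/outputEncoder live.
public class TypeSafeUDAF extends Aggregator<Employee, Average, Double> {

  @Override
  public Average zero() {
    return new Average(0.0, 0L);  // assumed (sum, count) buffer
  }

  @Override
  public Average reduce(Average buffer, Employee employee) {
    buffer.setSum(buffer.getSum() + employee.getSalary());
    buffer.setCount(buffer.getCount() + 1);
    return buffer;
  }

  @Override
  public Average merge(Average b1, Average b2) {
    b1.setSum(b1.getSum() + b2.getSum());
    b1.setCount(b1.getCount() + b2.getCount());
    return b1;
  }

  @Override
  public Double finish(Average reduction) {
    return reduction.getSum() / reduction.getCount();
  }

  @Override
  public Encoder<Average> bufferEncoder() {
    return Encoders.bean(Average.class);
  }

  @Override
  public Encoder<Double> outputEncoder() {
    return Encoders.DOUBLE();
  }
}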
Example 19: main
import org.apache.spark.sql.Encoders; // import the required package/class
public static void main(String[] args) throws Exception {
  // Read properties
  Properties prop = PropertyFileReader.readPropertyFile();

  // SparkSession
  SparkSession spark = SparkSession
      .builder()
      .appName("VideoStreamProcessor")
      .master(prop.getProperty("spark.master.url"))
      .getOrCreate();

  // directory to save image files with motion detected
  final String processedImageDir = prop.getProperty("processed.output.dir");
  logger.warn("Output directory for saving processed images is set to " + processedImageDir
      + ". This is configured in the processed.output.dir key of the property file.");

  // create the schema for the JSON message
  StructType schema = DataTypes.createStructType(new StructField[] {
      DataTypes.createStructField("cameraId", DataTypes.StringType, true),
      DataTypes.createStructField("timestamp", DataTypes.TimestampType, true),
      DataTypes.createStructField("rows", DataTypes.IntegerType, true),
      DataTypes.createStructField("cols", DataTypes.IntegerType, true),
      DataTypes.createStructField("type", DataTypes.IntegerType, true),
      DataTypes.createStructField("data", DataTypes.StringType, true)
  });

  // create a Dataset from the stream of Kafka messages
  Dataset<VideoEventData> ds = spark
      .readStream()
      .format("kafka")
      .option("kafka.bootstrap.servers", prop.getProperty("kafka.bootstrap.servers"))
      .option("subscribe", prop.getProperty("kafka.topic"))
      .option("kafka.max.partition.fetch.bytes", prop.getProperty("kafka.max.partition.fetch.bytes"))
      .option("kafka.max.poll.records", prop.getProperty("kafka.max.poll.records"))
      .load()
      .selectExpr("CAST(value AS STRING) as message")
      .select(functions.from_json(functions.col("message"), schema).as("json"))
      .select("json.*")
      .as(Encoders.bean(VideoEventData.class));

  // key-value pairs of cameraId to VideoEventData
  KeyValueGroupedDataset<String, VideoEventData> kvDataset = ds.groupByKey(new MapFunction<VideoEventData, String>() {
    @Override
    public String call(VideoEventData value) throws Exception {
      return value.getCameraId();
    }
  }, Encoders.STRING());

  // process each camera's events with per-group state
  Dataset<VideoEventData> processedDataset = kvDataset.mapGroupsWithState(
      new MapGroupsWithStateFunction<String, VideoEventData, VideoEventData, VideoEventData>() {
        @Override
        public VideoEventData call(String key, Iterator<VideoEventData> values,
            GroupState<VideoEventData> state) throws Exception {
          logger.warn("CameraId=" + key + " PartitionId=" + TaskContext.getPartitionId());
          VideoEventData existing = null;
          // check the previous state
          if (state.exists()) {
            existing = state.get();
          }
          // detect motion
          VideoEventData processed = VideoMotionDetector.detectMotion(key, values, processedImageDir, existing);
          // update the last processed event
          if (processed != null) {
            state.update(processed);
          }
          return processed;
        }
      }, Encoders.bean(VideoEventData.class), Encoders.bean(VideoEventData.class));

  // start the query
  StreamingQuery query = processedDataset.writeStream()
      .outputMode("update")
      .format("console")
      .start();

  // await termination
  query.awaitTermination();
}
Developer: baghelamit, Project: video-stream-analytics, Lines: 78, Source: VideoStreamProcessor.java
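VideoEventData must be a JavaBean whose properties line up with the StructType built above; a hedged reconstruction (the meaning of the data field is an assumption):

import java.io.Serializable;
import java.sql.Timestamp;

// Hypothetical VideoEventData bean matching the schema in Example 19.
public class VideoEventData implements Serializable {
  private String cameraId;
  private Timestamp timestamp;
  private int rows;
  private int cols;
  private int type;
  private String data;  // assumed: encoded image payload for the motion detector

  public String getCameraId() { return cameraId; }
  public void setCameraId(String cameraId) { this.cameraId = cameraId; }
  public Timestamp getTimestamp() { return timestamp; }
  public void setTimestamp(Timestamp timestamp) { this.timestamp = timestamp; }
  public int getRows() { return rows; }
  public void setRows(int rows) { this.rows = rows; }
  public int getCols() { return cols; }
  public void setCols(int cols) { this.cols = cols; }
  public int getType() { return type; }
  public void setType(int type) { this.type = type; }
  public String getData() { return data; }
  public void setData(String data) { this.data = data; }
}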
Example 20: main
import org.apache.spark.sql.Encoders; // import the required package/class
public static void main(String[] args) throws StreamingQueryException {
  // set log4j levels programmatically
  LogManager.getLogger("org.apache.spark").setLevel(Level.WARN);
  LogManager.getLogger("akka").setLevel(Level.ERROR);

  // configure Spark
  SparkConf conf = new SparkConf()
      .setAppName("kafka-structured")
      .setMaster("local[*]");

  // initialize the Spark session
  SparkSession sparkSession = SparkSession
      .builder()
      .config(conf)
      .getOrCreate();

  // reduce the number of shuffle partitions
  sparkSession.sqlContext().setConf("spark.sql.shuffle.partitions", "3");

  // data stream from Kafka
  Dataset<Row> ds1 = sparkSession
      .readStream()
      .format("kafka")
      .option("kafka.bootstrap.servers", "localhost:9092")
      .option("subscribe", "mytopic")
      .option("startingOffsets", "earliest")
      .load();

  // register the UDF that deserializes the Avro payload
  sparkSession.udf().register("deserialize", (byte[] data) -> {
    GenericRecord record = recordInjection.invert(data).get();
    return RowFactory.create(record.get("str1").toString(), record.get("str2").toString(), record.get("int1"));
  }, DataTypes.createStructType(type.fields()));

  ds1.printSchema();

  Dataset<Row> ds2 = ds1
      .select("value").as(Encoders.BINARY())
      .selectExpr("deserialize(value) as rows")
      .select("rows.*");
  ds2.printSchema();

  // start the streaming query
  StreamingQuery query1 = ds2
      .groupBy("str1")
      .count()
      .writeStream()
      .queryName("Test query")
      .outputMode("complete")
      .format("console")
      .start();

  query1.awaitTermination();
}
Developer: Neuw84, Project: structured-streaming-avro-demo, Lines: 55, Source: StructuredDemo.java
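This demo relies on two statics defined elsewhere in the class: recordInjection, a Twitter Bijection Injection<GenericRecord, byte[]> for Avro, and type, the matching Spark StructType. A hedged sketch of how they could be declared; the Avro record name and exact schema are assumptions, with the field names taken from the deserialize UDF above:

import com.twitter.bijection.Injection;
import com.twitter.bijection.avro.GenericAvroCodecs;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

// Hypothetical declarations assumed by Example 20.
public class StructuredDemoStatics {

  private static final String AVRO_SCHEMA = "{"
      + "\"type\":\"record\",\"name\":\"myrecord\",\"fields\":["
      + "{\"name\":\"str1\",\"type\":\"string\"},"
      + "{\"name\":\"str2\",\"type\":\"string\"},"
      + "{\"name\":\"int1\",\"type\":\"int\"}]}";

  private static final Schema schema = new Schema.Parser().parse(AVRO_SCHEMA);

  // Bijection handles GenericRecord <-> byte[] (de)serialization.
  static final Injection<GenericRecord, byte[]> recordInjection =
      GenericAvroCodecs.toBinary(schema);

  // Matching Spark SQL schema used to type the UDF's returned Row.
  static final StructType type = DataTypes.createStructType(new StructField[] {
      DataTypes.createStructField("str1", DataTypes.StringType, false),
      DataTypes.createStructField("str2", DataTypes.StringType, false),
      DataTypes.createStructField("int1", DataTypes.IntegerType, false)
  });
}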
Note: The org.apache.spark.sql.Encoders examples in this article were collected from open-source projects hosted on GitHub, MSDocs, and similar platforms. The snippets were selected from projects contributed by various open-source authors; copyright of the source code remains with the original authors, and any use or redistribution should follow the corresponding project's license. Do not repost without permission.