本文整理汇总了Java中org.apache.spark.ml.linalg.VectorUDT类的典型用法代码示例。如果您正苦于以下问题:Java VectorUDT类的具体用法?Java VectorUDT怎么用?Java VectorUDT使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。
VectorUDT类属于org.apache.spark.ml.linalg包,在下文中一共展示了VectorUDT类的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Java代码示例。
示例1: convertToStructField
import org.apache.spark.ml.linalg.VectorUDT; //导入依赖的package包/类
/**
* StructField,
*
* @return
* @throws CantConverException
*/
public static StructField convertToStructField(FieldInfo info) throws CantConverException {
if (info.getIndex() != -1) {
return DataTypes.createStructField(info.getName(), sparkDataType(info.getDataType()), info.isNullable());
} else {
switch (info.getDataType()) {
case FieldInfo.STRING_DATATYPE: {
return new StructField(info.getName(), DataTypes.createArrayType(DataTypes.StringType), info.isNullable(), Metadata.empty());
}
case FieldInfo.DOUBLE_DATATYPE:
case FieldInfo.INTEGER_DATATYPE:
case FieldInfo.LONG_DATATYPE: {
return new StructField(info.getName(), new VectorUDT(), info.isNullable(), Metadata.empty());
}
default:
throw new CantConverException("不合法类型");
}
}
}
开发者ID:hays2hong,项目名称:stonk,代码行数:25,代码来源:SparkDataFileConverter.java
示例2: binaryBlockToDataFrame
import org.apache.spark.ml.linalg.VectorUDT; //导入依赖的package包/类
public static Dataset<Row> binaryBlockToDataFrame(SparkSession sparkSession,
JavaPairRDD<MatrixIndexes, MatrixBlock> in, MatrixCharacteristics mc, boolean toVector)
{
if( !mc.colsKnown() )
throw new RuntimeException("Number of columns needed to convert binary block to data frame.");
//slice blocks into rows, align and convert into data frame rows
JavaRDD<Row> rowsRDD = in
.flatMapToPair(new SliceBinaryBlockToRowsFunction(mc.getRowsPerBlock()))
.groupByKey().map(new ConvertRowBlocksToRows((int)mc.getCols(), mc.getColsPerBlock(), toVector));
//create data frame schema
List<StructField> fields = new ArrayList<>();
fields.add(DataTypes.createStructField(DF_ID_COLUMN, DataTypes.DoubleType, false));
if( toVector )
fields.add(DataTypes.createStructField("C1", new VectorUDT(), false));
else { // row
for(int i = 1; i <= mc.getCols(); i++)
fields.add(DataTypes.createStructField("C"+i, DataTypes.DoubleType, false));
}
//rdd to data frame conversion
return sparkSession.createDataFrame(rowsRDD.rdd(), DataTypes.createStructType(fields));
}
开发者ID:apache,项目名称:systemml,代码行数:25,代码来源:RDDConverterUtils.java
示例3: testDataFrameSumDMLVectorWithIDColumn
import org.apache.spark.ml.linalg.VectorUDT; //导入依赖的package包/类
@Test
public void testDataFrameSumDMLVectorWithIDColumn() {
System.out.println("MLContextTest - DataFrame sum DML, vector with ID column");
List<Tuple2<Double, Vector>> list = new ArrayList<Tuple2<Double, Vector>>();
list.add(new Tuple2<Double, Vector>(1.0, Vectors.dense(1.0, 2.0, 3.0)));
list.add(new Tuple2<Double, Vector>(2.0, Vectors.dense(4.0, 5.0, 6.0)));
list.add(new Tuple2<Double, Vector>(3.0, Vectors.dense(7.0, 8.0, 9.0)));
JavaRDD<Tuple2<Double, Vector>> javaRddTuple = sc.parallelize(list);
JavaRDD<Row> javaRddRow = javaRddTuple.map(new DoubleVectorRow());
List<StructField> fields = new ArrayList<StructField>();
fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
fields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
StructType schema = DataTypes.createStructType(fields);
Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);
MatrixMetadata mm = new MatrixMetadata(MatrixFormat.DF_VECTOR_WITH_INDEX);
Script script = dml("print('sum: ' + sum(M));").in("M", dataFrame, mm);
setExpectedStdOut("sum: 45.0");
ml.execute(script);
}
开发者ID:apache,项目名称:systemml,代码行数:24,代码来源:MLContextTest.java
示例4: testDataFrameSumPYDMLVectorWithIDColumn
import org.apache.spark.ml.linalg.VectorUDT; //导入依赖的package包/类
@Test
public void testDataFrameSumPYDMLVectorWithIDColumn() {
System.out.println("MLContextTest - DataFrame sum PYDML, vector with ID column");
List<Tuple2<Double, Vector>> list = new ArrayList<Tuple2<Double, Vector>>();
list.add(new Tuple2<Double, Vector>(1.0, Vectors.dense(1.0, 2.0, 3.0)));
list.add(new Tuple2<Double, Vector>(2.0, Vectors.dense(4.0, 5.0, 6.0)));
list.add(new Tuple2<Double, Vector>(3.0, Vectors.dense(7.0, 8.0, 9.0)));
JavaRDD<Tuple2<Double, Vector>> javaRddTuple = sc.parallelize(list);
JavaRDD<Row> javaRddRow = javaRddTuple.map(new DoubleVectorRow());
List<StructField> fields = new ArrayList<StructField>();
fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
fields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
StructType schema = DataTypes.createStructType(fields);
Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);
MatrixMetadata mm = new MatrixMetadata(MatrixFormat.DF_VECTOR_WITH_INDEX);
Script script = pydml("print('sum: ' + sum(M))").in("M", dataFrame, mm);
setExpectedStdOut("sum: 45.0");
ml.execute(script);
}
开发者ID:apache,项目名称:systemml,代码行数:24,代码来源:MLContextTest.java
示例5: testDataFrameSumDMLMllibVectorWithIDColumn
import org.apache.spark.ml.linalg.VectorUDT; //导入依赖的package包/类
@Test
public void testDataFrameSumDMLMllibVectorWithIDColumn() {
System.out.println("MLContextTest - DataFrame sum DML, mllib vector with ID column");
List<Tuple2<Double, org.apache.spark.mllib.linalg.Vector>> list = new ArrayList<Tuple2<Double, org.apache.spark.mllib.linalg.Vector>>();
list.add(new Tuple2<Double, org.apache.spark.mllib.linalg.Vector>(1.0,
org.apache.spark.mllib.linalg.Vectors.dense(1.0, 2.0, 3.0)));
list.add(new Tuple2<Double, org.apache.spark.mllib.linalg.Vector>(2.0,
org.apache.spark.mllib.linalg.Vectors.dense(4.0, 5.0, 6.0)));
list.add(new Tuple2<Double, org.apache.spark.mllib.linalg.Vector>(3.0,
org.apache.spark.mllib.linalg.Vectors.dense(7.0, 8.0, 9.0)));
JavaRDD<Tuple2<Double, org.apache.spark.mllib.linalg.Vector>> javaRddTuple = sc.parallelize(list);
JavaRDD<Row> javaRddRow = javaRddTuple.map(new DoubleMllibVectorRow());
List<StructField> fields = new ArrayList<StructField>();
fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
fields.add(DataTypes.createStructField("C1", new org.apache.spark.mllib.linalg.VectorUDT(), true));
StructType schema = DataTypes.createStructType(fields);
Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);
MatrixMetadata mm = new MatrixMetadata(MatrixFormat.DF_VECTOR_WITH_INDEX);
Script script = dml("print('sum: ' + sum(M));").in("M", dataFrame, mm);
setExpectedStdOut("sum: 45.0");
ml.execute(script);
}
开发者ID:apache,项目名称:systemml,代码行数:27,代码来源:MLContextTest.java
示例6: testDataFrameSumPYDMLMllibVectorWithIDColumn
import org.apache.spark.ml.linalg.VectorUDT; //导入依赖的package包/类
@Test
public void testDataFrameSumPYDMLMllibVectorWithIDColumn() {
System.out.println("MLContextTest - DataFrame sum PYDML, mllib vector with ID column");
List<Tuple2<Double, org.apache.spark.mllib.linalg.Vector>> list = new ArrayList<Tuple2<Double, org.apache.spark.mllib.linalg.Vector>>();
list.add(new Tuple2<Double, org.apache.spark.mllib.linalg.Vector>(1.0,
org.apache.spark.mllib.linalg.Vectors.dense(1.0, 2.0, 3.0)));
list.add(new Tuple2<Double, org.apache.spark.mllib.linalg.Vector>(2.0,
org.apache.spark.mllib.linalg.Vectors.dense(4.0, 5.0, 6.0)));
list.add(new Tuple2<Double, org.apache.spark.mllib.linalg.Vector>(3.0,
org.apache.spark.mllib.linalg.Vectors.dense(7.0, 8.0, 9.0)));
JavaRDD<Tuple2<Double, org.apache.spark.mllib.linalg.Vector>> javaRddTuple = sc.parallelize(list);
JavaRDD<Row> javaRddRow = javaRddTuple.map(new DoubleMllibVectorRow());
List<StructField> fields = new ArrayList<StructField>();
fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
fields.add(DataTypes.createStructField("C1", new org.apache.spark.mllib.linalg.VectorUDT(), true));
StructType schema = DataTypes.createStructType(fields);
Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);
MatrixMetadata mm = new MatrixMetadata(MatrixFormat.DF_VECTOR_WITH_INDEX);
Script script = pydml("print('sum: ' + sum(M))").in("M", dataFrame, mm);
setExpectedStdOut("sum: 45.0");
ml.execute(script);
}
开发者ID:apache,项目名称:systemml,代码行数:27,代码来源:MLContextTest.java
示例7: testDataFrameSumDMLVectorWithNoIDColumn
import org.apache.spark.ml.linalg.VectorUDT; //导入依赖的package包/类
@Test
public void testDataFrameSumDMLVectorWithNoIDColumn() {
System.out.println("MLContextTest - DataFrame sum DML, vector with no ID column");
List<Vector> list = new ArrayList<Vector>();
list.add(Vectors.dense(1.0, 2.0, 3.0));
list.add(Vectors.dense(4.0, 5.0, 6.0));
list.add(Vectors.dense(7.0, 8.0, 9.0));
JavaRDD<Vector> javaRddVector = sc.parallelize(list);
JavaRDD<Row> javaRddRow = javaRddVector.map(new VectorRow());
List<StructField> fields = new ArrayList<StructField>();
fields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
StructType schema = DataTypes.createStructType(fields);
Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);
MatrixMetadata mm = new MatrixMetadata(MatrixFormat.DF_VECTOR);
Script script = dml("print('sum: ' + sum(M));").in("M", dataFrame, mm);
setExpectedStdOut("sum: 45.0");
ml.execute(script);
}
开发者ID:apache,项目名称:systemml,代码行数:23,代码来源:MLContextTest.java
示例8: testDataFrameSumPYDMLVectorWithNoIDColumn
import org.apache.spark.ml.linalg.VectorUDT; //导入依赖的package包/类
@Test
public void testDataFrameSumPYDMLVectorWithNoIDColumn() {
System.out.println("MLContextTest - DataFrame sum PYDML, vector with no ID column");
List<Vector> list = new ArrayList<Vector>();
list.add(Vectors.dense(1.0, 2.0, 3.0));
list.add(Vectors.dense(4.0, 5.0, 6.0));
list.add(Vectors.dense(7.0, 8.0, 9.0));
JavaRDD<Vector> javaRddVector = sc.parallelize(list);
JavaRDD<Row> javaRddRow = javaRddVector.map(new VectorRow());
List<StructField> fields = new ArrayList<StructField>();
fields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
StructType schema = DataTypes.createStructType(fields);
Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);
MatrixMetadata mm = new MatrixMetadata(MatrixFormat.DF_VECTOR);
Script script = pydml("print('sum: ' + sum(M))").in("M", dataFrame, mm);
setExpectedStdOut("sum: 45.0");
ml.execute(script);
}
开发者ID:apache,项目名称:systemml,代码行数:23,代码来源:MLContextTest.java
示例9: testDataFrameSumDMLMllibVectorWithNoIDColumn
import org.apache.spark.ml.linalg.VectorUDT; //导入依赖的package包/类
@Test
public void testDataFrameSumDMLMllibVectorWithNoIDColumn() {
System.out.println("MLContextTest - DataFrame sum DML, mllib vector with no ID column");
List<org.apache.spark.mllib.linalg.Vector> list = new ArrayList<org.apache.spark.mllib.linalg.Vector>();
list.add(org.apache.spark.mllib.linalg.Vectors.dense(1.0, 2.0, 3.0));
list.add(org.apache.spark.mllib.linalg.Vectors.dense(4.0, 5.0, 6.0));
list.add(org.apache.spark.mllib.linalg.Vectors.dense(7.0, 8.0, 9.0));
JavaRDD<org.apache.spark.mllib.linalg.Vector> javaRddVector = sc.parallelize(list);
JavaRDD<Row> javaRddRow = javaRddVector.map(new MllibVectorRow());
List<StructField> fields = new ArrayList<StructField>();
fields.add(DataTypes.createStructField("C1", new org.apache.spark.mllib.linalg.VectorUDT(), true));
StructType schema = DataTypes.createStructType(fields);
Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);
MatrixMetadata mm = new MatrixMetadata(MatrixFormat.DF_VECTOR);
Script script = dml("print('sum: ' + sum(M));").in("M", dataFrame, mm);
setExpectedStdOut("sum: 45.0");
ml.execute(script);
}
开发者ID:apache,项目名称:systemml,代码行数:23,代码来源:MLContextTest.java
示例10: testDataFrameSumPYDMLMllibVectorWithNoIDColumn
import org.apache.spark.ml.linalg.VectorUDT; //导入依赖的package包/类
@Test
public void testDataFrameSumPYDMLMllibVectorWithNoIDColumn() {
System.out.println("MLContextTest - DataFrame sum PYDML, mllib vector with no ID column");
List<org.apache.spark.mllib.linalg.Vector> list = new ArrayList<org.apache.spark.mllib.linalg.Vector>();
list.add(org.apache.spark.mllib.linalg.Vectors.dense(1.0, 2.0, 3.0));
list.add(org.apache.spark.mllib.linalg.Vectors.dense(4.0, 5.0, 6.0));
list.add(org.apache.spark.mllib.linalg.Vectors.dense(7.0, 8.0, 9.0));
JavaRDD<org.apache.spark.mllib.linalg.Vector> javaRddVector = sc.parallelize(list);
JavaRDD<Row> javaRddRow = javaRddVector.map(new MllibVectorRow());
List<StructField> fields = new ArrayList<StructField>();
fields.add(DataTypes.createStructField("C1", new org.apache.spark.mllib.linalg.VectorUDT(), true));
StructType schema = DataTypes.createStructType(fields);
Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);
MatrixMetadata mm = new MatrixMetadata(MatrixFormat.DF_VECTOR);
Script script = pydml("print('sum: ' + sum(M))").in("M", dataFrame, mm);
setExpectedStdOut("sum: 45.0");
ml.execute(script);
}
开发者ID:apache,项目名称:systemml,代码行数:23,代码来源:MLContextTest.java
示例11: testDataFrameSumDMLVectorWithIDColumnNoFormatSpecified
import org.apache.spark.ml.linalg.VectorUDT; //导入依赖的package包/类
@Test
public void testDataFrameSumDMLVectorWithIDColumnNoFormatSpecified() {
System.out.println("MLContextTest - DataFrame sum DML, vector with ID column, no format specified");
List<Tuple2<Double, Vector>> list = new ArrayList<Tuple2<Double, Vector>>();
list.add(new Tuple2<Double, Vector>(1.0, Vectors.dense(1.0, 2.0, 3.0)));
list.add(new Tuple2<Double, Vector>(2.0, Vectors.dense(4.0, 5.0, 6.0)));
list.add(new Tuple2<Double, Vector>(3.0, Vectors.dense(7.0, 8.0, 9.0)));
JavaRDD<Tuple2<Double, Vector>> javaRddTuple = sc.parallelize(list);
JavaRDD<Row> javaRddRow = javaRddTuple.map(new DoubleVectorRow());
List<StructField> fields = new ArrayList<StructField>();
fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
fields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
StructType schema = DataTypes.createStructType(fields);
Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);
Script script = dml("print('sum: ' + sum(M));").in("M", dataFrame);
setExpectedStdOut("sum: 45.0");
ml.execute(script);
}
开发者ID:apache,项目名称:systemml,代码行数:22,代码来源:MLContextTest.java
示例12: testDataFrameSumPYDMLVectorWithIDColumnNoFormatSpecified
import org.apache.spark.ml.linalg.VectorUDT; //导入依赖的package包/类
@Test
public void testDataFrameSumPYDMLVectorWithIDColumnNoFormatSpecified() {
System.out.println("MLContextTest - DataFrame sum PYDML, vector with ID column, no format specified");
List<Tuple2<Double, Vector>> list = new ArrayList<Tuple2<Double, Vector>>();
list.add(new Tuple2<Double, Vector>(1.0, Vectors.dense(1.0, 2.0, 3.0)));
list.add(new Tuple2<Double, Vector>(2.0, Vectors.dense(4.0, 5.0, 6.0)));
list.add(new Tuple2<Double, Vector>(3.0, Vectors.dense(7.0, 8.0, 9.0)));
JavaRDD<Tuple2<Double, Vector>> javaRddTuple = sc.parallelize(list);
JavaRDD<Row> javaRddRow = javaRddTuple.map(new DoubleVectorRow());
List<StructField> fields = new ArrayList<StructField>();
fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
fields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
StructType schema = DataTypes.createStructType(fields);
Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);
Script script = dml("print('sum: ' + sum(M))").in("M", dataFrame);
setExpectedStdOut("sum: 45.0");
ml.execute(script);
}
开发者ID:apache,项目名称:systemml,代码行数:22,代码来源:MLContextTest.java
示例13: testDataFrameSumDMLVectorWithNoIDColumnNoFormatSpecified
import org.apache.spark.ml.linalg.VectorUDT; //导入依赖的package包/类
@Test
public void testDataFrameSumDMLVectorWithNoIDColumnNoFormatSpecified() {
System.out.println("MLContextTest - DataFrame sum DML, vector with no ID column, no format specified");
List<Vector> list = new ArrayList<Vector>();
list.add(Vectors.dense(1.0, 2.0, 3.0));
list.add(Vectors.dense(4.0, 5.0, 6.0));
list.add(Vectors.dense(7.0, 8.0, 9.0));
JavaRDD<Vector> javaRddVector = sc.parallelize(list);
JavaRDD<Row> javaRddRow = javaRddVector.map(new VectorRow());
List<StructField> fields = new ArrayList<StructField>();
fields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
StructType schema = DataTypes.createStructType(fields);
Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);
Script script = dml("print('sum: ' + sum(M));").in("M", dataFrame);
setExpectedStdOut("sum: 45.0");
ml.execute(script);
}
开发者ID:apache,项目名称:systemml,代码行数:21,代码来源:MLContextTest.java
示例14: testDataFrameSumPYDMLVectorWithNoIDColumnNoFormatSpecified
import org.apache.spark.ml.linalg.VectorUDT; //导入依赖的package包/类
@Test
public void testDataFrameSumPYDMLVectorWithNoIDColumnNoFormatSpecified() {
System.out.println("MLContextTest - DataFrame sum PYDML, vector with no ID column, no format specified");
List<Vector> list = new ArrayList<Vector>();
list.add(Vectors.dense(1.0, 2.0, 3.0));
list.add(Vectors.dense(4.0, 5.0, 6.0));
list.add(Vectors.dense(7.0, 8.0, 9.0));
JavaRDD<Vector> javaRddVector = sc.parallelize(list);
JavaRDD<Row> javaRddRow = javaRddVector.map(new VectorRow());
List<StructField> fields = new ArrayList<StructField>();
fields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
StructType schema = DataTypes.createStructType(fields);
Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);
Script script = dml("print('sum: ' + sum(M))").in("M", dataFrame);
setExpectedStdOut("sum: 45.0");
ml.execute(script);
}
开发者ID:apache,项目名称:systemml,代码行数:21,代码来源:MLContextTest.java
示例15: testMinMaxScaler
import org.apache.spark.ml.linalg.VectorUDT; //导入依赖的package包/类
@Test
public void testMinMaxScaler() {
//prepare data
JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
RowFactory.create(1.0, Vectors.dense(data[0])),
RowFactory.create(2.0, Vectors.dense(data[1])),
RowFactory.create(3.0, Vectors.dense(data[2])),
RowFactory.create(4.0, Vectors.dense(data[3]))
));
StructType schema = new StructType(new StructField[]{
new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
new StructField("features", new VectorUDT(), false, Metadata.empty())
});
Dataset<Row> df = spark.createDataFrame(jrdd, schema);
//train model in spark
MinMaxScalerModel sparkModel = new MinMaxScaler()
.setInputCol("features")
.setOutputCol("scaled")
.setMin(-5)
.setMax(5)
.fit(df);
//Export model, import it back and get transformer
byte[] exportedModel = ModelExporter.export(sparkModel);
final Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);
//compare predictions
List<Row> sparkOutput = sparkModel.transform(df).orderBy("label").select("features", "scaled").collectAsList();
assertCorrectness(sparkOutput, expected, transformer);
}
开发者ID:flipkart-incubator,项目名称:spark-transformers,代码行数:35,代码来源:MinMaxScalerBridgeTest.java
示例16: testStandardScaler
import org.apache.spark.ml.linalg.VectorUDT; //导入依赖的package包/类
@Test
public void testStandardScaler() {
JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
RowFactory.create(1.0, Vectors.dense(data[0])),
RowFactory.create(2.0, Vectors.dense(data[1])),
RowFactory.create(3.0, Vectors.dense(data[2]))
));
StructType schema = new StructType(new StructField[]{
new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
new StructField("features", new VectorUDT(), false, Metadata.empty())
});
Dataset<Row> df = spark.createDataFrame(jrdd, schema);
//train model in spark
StandardScalerModel sparkModelNone = new StandardScaler()
.setInputCol("features")
.setOutputCol("scaledOutput")
.setWithMean(false)
.setWithStd(false)
.fit(df);
StandardScalerModel sparkModelWithMean = new StandardScaler()
.setInputCol("features")
.setOutputCol("scaledOutput")
.setWithMean(true)
.setWithStd(false)
.fit(df);
StandardScalerModel sparkModelWithStd = new StandardScaler()
.setInputCol("features")
.setOutputCol("scaledOutput")
.setWithMean(false)
.setWithStd(true)
.fit(df);
StandardScalerModel sparkModelWithBoth = new StandardScaler()
.setInputCol("features")
.setOutputCol("scaledOutput")
.setWithMean(true)
.setWithStd(true)
.fit(df);
//Export model, import it back and get transformer
byte[] exportedModel = ModelExporter.export(sparkModelNone);
final Transformer transformerNone = ModelImporter.importAndGetTransformer(exportedModel);
exportedModel = ModelExporter.export(sparkModelWithMean);
final Transformer transformerWithMean = ModelImporter.importAndGetTransformer(exportedModel);
exportedModel = ModelExporter.export(sparkModelWithStd);
final Transformer transformerWithStd = ModelImporter.importAndGetTransformer(exportedModel);
exportedModel = ModelExporter.export(sparkModelWithBoth);
final Transformer transformerWithBoth = ModelImporter.importAndGetTransformer(exportedModel);
//compare predictions
List<Row> sparkNoneOutput = sparkModelNone.transform(df).orderBy("label").select("features", "scaledOutput").collectAsList();
assertCorrectness(sparkNoneOutput, data, transformerNone);
List<Row> sparkWithMeanOutput = sparkModelWithMean.transform(df).orderBy("label").select("features", "scaledOutput").collectAsList();
assertCorrectness(sparkWithMeanOutput, resWithMean, transformerWithMean);
List<Row> sparkWithStdOutput = sparkModelWithStd.transform(df).orderBy("label").select("features", "scaledOutput").collectAsList();
assertCorrectness(sparkWithStdOutput, resWithStd, transformerWithStd);
List<Row> sparkWithBothOutput = sparkModelWithBoth.transform(df).orderBy("label").select("features", "scaledOutput").collectAsList();
assertCorrectness(sparkWithBothOutput, resWithBoth, transformerWithBoth);
}
开发者ID:flipkart-incubator,项目名称:spark-transformers,代码行数:76,代码来源:StandardScalerBridgeTest.java
示例17: determineMatrixFormatIfNeeded
import org.apache.spark.ml.linalg.VectorUDT; //导入依赖的package包/类
/**
* If the MatrixFormat of the DataFrame has not been explicitly specified,
* attempt to determine the proper MatrixFormat.
*
* @param dataFrame
* the Spark {@code DataFrame}
* @param matrixMetadata
* the matrix metadata, if available
*/
public static void determineMatrixFormatIfNeeded(Dataset<Row> dataFrame, MatrixMetadata matrixMetadata) {
if (matrixMetadata == null) {
return;
}
MatrixFormat matrixFormat = matrixMetadata.getMatrixFormat();
if (matrixFormat != null) {
return;
}
StructType schema = dataFrame.schema();
boolean hasID = false;
try {
schema.fieldIndex(RDDConverterUtils.DF_ID_COLUMN);
hasID = true;
} catch (IllegalArgumentException iae) {
}
StructField[] fields = schema.fields();
MatrixFormat mf = null;
if (hasID) {
if (fields[1].dataType() instanceof VectorUDT) {
mf = MatrixFormat.DF_VECTOR_WITH_INDEX;
} else {
mf = MatrixFormat.DF_DOUBLES_WITH_INDEX;
}
} else {
if (fields[0].dataType() instanceof VectorUDT) {
mf = MatrixFormat.DF_VECTOR;
} else {
mf = MatrixFormat.DF_DOUBLES;
}
}
if (mf == null) {
throw new MLContextException("DataFrame format not recognized as an accepted SystemML MatrixFormat");
}
matrixMetadata.setMatrixFormat(mf);
}
开发者ID:apache,项目名称:systemml,代码行数:47,代码来源:MLContextConversionUtil.java
示例18: getColVectFromDFSchema
import org.apache.spark.ml.linalg.VectorUDT; //导入依赖的package包/类
/**
* Obtain column vector from DataFrame schema
*
* @param dfschema schema as StructType
* @param containsID if true, contains ID column
* @return 0-based column index of vector column, -1 if no vector.
*/
private static int getColVectFromDFSchema(StructType dfschema, boolean containsID) {
int off = containsID ? 1 : 0;
for( int i=off; i<dfschema.fields().length; i++ ) {
StructField structType = dfschema.apply(i);
if(structType.dataType() instanceof VectorUDT)
return i-off;
}
return -1;
}
开发者ID:apache,项目名称:systemml,代码行数:18,代码来源:FrameRDDConverterUtils.java
示例19: testOutputDataFrameOfVectorsDML
import org.apache.spark.ml.linalg.VectorUDT; //导入依赖的package包/类
@Test
public void testOutputDataFrameOfVectorsDML() {
System.out.println("MLContextTest - output DataFrame of vectors DML");
String s = "m=matrix('1 2 3 4',rows=2,cols=2);";
Script script = dml(s).out("m");
MLResults results = ml.execute(script);
Dataset<Row> df = results.getDataFrame("m", true);
Dataset<Row> sortedDF = df.sort(RDDConverterUtils.DF_ID_COLUMN);
// verify column types
StructType schema = sortedDF.schema();
StructField[] fields = schema.fields();
StructField idColumn = fields[0];
StructField vectorColumn = fields[1];
Assert.assertTrue(idColumn.dataType() instanceof DoubleType);
Assert.assertTrue(vectorColumn.dataType() instanceof VectorUDT);
List<Row> list = sortedDF.collectAsList();
Row row1 = list.get(0);
Assert.assertEquals(1.0, row1.getDouble(0), 0.0);
Vector v1 = (DenseVector) row1.get(1);
double[] arr1 = v1.toArray();
Assert.assertArrayEquals(new double[] { 1.0, 2.0 }, arr1, 0.0);
Row row2 = list.get(1);
Assert.assertEquals(2.0, row2.getDouble(0), 0.0);
Vector v2 = (DenseVector) row2.get(1);
double[] arr2 = v2.toArray();
Assert.assertArrayEquals(new double[] { 3.0, 4.0 }, arr2, 0.0);
}
开发者ID:apache,项目名称:systemml,代码行数:33,代码来源:MLContextTest.java
示例20: main
import org.apache.spark.ml.linalg.VectorUDT; //导入依赖的package包/类
public static void main(String[] args) {
SparkSession spark = SparkSession
.builder().master("local").config("spark.sql.warehouse.dir", "file:///C:/Users/sumit.kumar/Downloads/bin/warehouse")
.appName("JavaEstimatorTransformerParamExample")
.getOrCreate();
Logger rootLogger = LogManager.getRootLogger();
rootLogger.setLevel(Level.WARN);
// $example on$
// Prepare training data.
List<Row> dataTraining = Arrays.asList(
RowFactory.create(1.0, Vectors.dense(0.0, 1.1, 0.1)),
RowFactory.create(0.0, Vectors.dense(2.0, 1.0, -1.0)),
RowFactory.create(0.0, Vectors.dense(2.0, 1.3, 1.0)),
RowFactory.create(1.0, Vectors.dense(0.0, 1.2, -0.5))
);
StructType schema = new StructType(new StructField[]{
new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
new StructField("features", new VectorUDT(), false, Metadata.empty())
});
Dataset<Row> training = spark.createDataFrame(dataTraining, schema);
// Create a LogisticRegression instance. This instance is an Estimator.
LogisticRegression lr = new LogisticRegression();
// Print out the parameters, documentation, and any default values.
System.out.println("LogisticRegression parameters:\n" + lr.explainParams() + "\n");
// We may set parameters using setter methods.
lr.setMaxIter(10).setRegParam(0.01);
// Learn a LogisticRegression model. This uses the parameters stored in lr.
LogisticRegressionModel model1 = lr.fit(training);
// Since model1 is a Model (i.e., a Transformer produced by an Estimator),
// we can view the parameters it used during fit().
// This prints the parameter (name: value) pairs, where names are unique IDs for this
// LogisticRegression instance.
System.out.println("Model 1 was fit using parameters: " + model1.parent().extractParamMap());
// We may alternatively specify parameters using a ParamMap.
ParamMap paramMap = new ParamMap()
.put(lr.maxIter().w(20)) // Specify 1 Param.
.put(lr.maxIter(), 30) // This overwrites the original maxIter.
.put(lr.regParam().w(0.1), lr.threshold().w(0.55)); // Specify multiple Params.
// One can also combine ParamMaps.
ParamMap paramMap2 = new ParamMap()
.put(lr.probabilityCol().w("myProbability")); // Change output column name
ParamMap paramMapCombined = paramMap.$plus$plus(paramMap2);
// Now learn a new model using the paramMapCombined parameters.
// paramMapCombined overrides all parameters set earlier via lr.set* methods.
LogisticRegressionModel model2 = lr.fit(training, paramMapCombined);
System.out.println("Model 2 was fit using parameters: " + model2.parent().extractParamMap());
// Prepare test documents.
List<Row> dataTest = Arrays.asList(
RowFactory.create(1.0, Vectors.dense(-1.0, 1.5, 1.3)),
RowFactory.create(0.0, Vectors.dense(3.0, 2.0, -0.1)),
RowFactory.create(1.0, Vectors.dense(0.0, 2.2, -1.5))
);
Dataset<Row> test = spark.createDataFrame(dataTest, schema);
// Make predictions on test documents using the Transformer.transform() method.
// LogisticRegression.transform will only use the 'features' column.
// Note that model2.transform() outputs a 'myProbability' column instead of the usual
// 'probability' column since we renamed the lr.probabilityCol parameter previously.
Dataset<Row> results = model2.transform(test);
Dataset<Row> rows = results.select("features", "label", "myProbability", "prediction");
for (Row r: rows.collectAsList()) {
System.out.println("(" + r.get(0) + ", " + r.get(1) + ") -> prob=" + r.get(2)
+ ", prediction=" + r.get(3));
}
// $example off$
spark.stop();
}
开发者ID:PacktPublishing,项目名称:Apache-Spark-2x-for-Java-Developers,代码行数:76,代码来源:JavaEstimatorTransformerParamExample.java
注:本文中的org.apache.spark.ml.linalg.VectorUDT类示例整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。 |
请发表评论