This article collects typical usage examples of the Java class org.apache.mahout.math.hadoop.DistributedRowMatrix. If you have been asking yourself what exactly the DistributedRowMatrix class does, how to use it, or where to find usage examples, then the curated code samples below may help.
The DistributedRowMatrix class belongs to the package org.apache.mahout.math.hadoop. This article presents 19 code examples of the class, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Java code examples.
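Before working through the examples, it helps to see the basic lifecycle that nearly all of them share: construct the matrix over an existing SequenceFile of IntWritable/VectorWritable rows, attach a Hadoop Configuration via setConf, and then query or iterate it. The following is a minimal sketch based only on API calls that appear in the examples below; the paths and dimensions are placeholders rather than values from any of the cited projects.
import java.util.Iterator;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.mahout.math.MatrixSlice;
import org.apache.mahout.math.hadoop.DistributedRowMatrix;

public class DrmQuickStart {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // placeholder paths: rowPath must already hold <IntWritable, VectorWritable> rows
    Path rowPath = new Path("/tmp/drm/rows");
    Path tmpPath = new Path("/tmp/drm/tmp"); // scratch space for intermediate jobs
    DistributedRowMatrix matrix = new DistributedRowMatrix(rowPath, tmpPath, 1000, 50);
    matrix.setConf(conf); // required before any operation touches HDFS
    System.out.println(matrix.numRows() + " x " + matrix.numCols());
    // rows can be streamed without loading the whole matrix into memory
    Iterator<MatrixSlice> slices = matrix.iterateAll();
    while (slices.hasNext()) {
      MatrixSlice slice = slices.next();
      System.out.println("row " + slice.index() + ": " + slice.vector());
    }
  }
}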
Example 1: computeYtXandXtX
import org.apache.mahout.math.hadoop.DistributedRowMatrix; // import the required package/class
/**
* Refer to {@link CompositeJob} for a job description. In short, it does
*
* X = Y * MEM
*
* XtX = (X - Xm)' * (X - Xm)
*
* YtX = (Y - Ym)' * (X - Xm)
*
* @param distMatrixY the input matrix Y
* @param inMemMatrix the in memory matrix MEM
* @param ym the mean vector of Y
* @param xm = ym * MEM
* @param tmpPath the temporary path in HDFS
* @param conf the configuration
* @param id the unique id for HDFS output directory
* @return the XtX and YtX wrapped in a CompositeResult object
* @throws IOException
* @throws InterruptedException
* @throws ClassNotFoundException
*/
public void computeYtXandXtX(
DistributedRowMatrix distMatrixY, DistributedRowMatrix inMemMatrix,
Vector ym, Vector xm, Path tmpPath, Configuration conf, String id) throws IOException,
InterruptedException, ClassNotFoundException {
if (distMatrixY.numCols() != inMemMatrix.numRows()) {
throw new CardinalityException(distMatrixY.numCols(), inMemMatrix.numRows());
}
Path outPath = new Path(tmpPath, "Composite"+id);
Path ymPath = PCACommon.toDistributedVector(ym,
tmpPath, "ym-compositeJob" + id, conf);
Path xmPath = PCACommon.toDistributedVector(xm,
tmpPath, "xm-compositeJob" + id, conf);
FileSystem fs = FileSystem.get(outPath.toUri(), conf);
if (!fs.exists(outPath)) {
run(conf, distMatrixY.getRowPath(), inMemMatrix.getRowPath()
.toString(), inMemMatrix.numRows(), inMemMatrix.numCols(), ymPath
.toString(), xmPath.toString(), outPath);
} else {
log.warn("----------- Skip Compositejob - already exists: " + outPath);
}
loadXtX(ymPath, inMemMatrix.numCols(), conf);
loadYtX(outPath, tmpPath, inMemMatrix.numRows(), inMemMatrix.numCols(), conf);
}
Developer ID: SiddharthMalhotra, Project: sPCA, Lines: 44, Source file: CompositeJob.java
Example 2: setup
import org.apache.mahout.math.hadoop.DistributedRowMatrix; // import the required package/class
@Override
public void setup(Context context) throws IOException {
Configuration conf = context.getConfiguration();
Path inMemMatrixPath = new Path(conf.get(MATRIXINMEMORY));
int inMemMatrixNumRows = conf.getInt(MATRIXINMEMORYROWS, 0);
int inMemMatrixNumCols = conf.getInt(MATRIXINMEMORYCOLS, 0);
Path ymPath = new Path(conf.get(YMPATH));
Path xmPath = new Path(conf.get(XMPATH));
try {
ym = PCACommon.toDenseVector(ymPath, conf);
xm = PCACommon.toDenseVector(xmPath, conf);
} catch (IOException e) {
e.printStackTrace();
}
// TODO: add an argument for temp path
Path tmpPath = inMemMatrixPath.getParent();
DistributedRowMatrix distMatrix = new DistributedRowMatrix(
inMemMatrixPath, tmpPath, inMemMatrixNumRows, inMemMatrixNumCols);
distMatrix.setConf(conf);
inMemMatrix = PCACommon.toDenseMatrix(distMatrix);
}
Developer ID: SiddharthMalhotra, Project: sPCA, Lines: 22, Source file: CompositeJob.java
Example 3: performEigenDecomposition
import org.apache.mahout.math.hadoop.DistributedRowMatrix; // import the required package/class
/**
* Does most of the heavy lifting in setting up Paths, configuring return
* values, and generally performing the tedious administrative tasks involved
* in an eigen-decomposition and running the verifier
*/
public static DistributedRowMatrix performEigenDecomposition(Configuration conf,
DistributedRowMatrix input,
LanczosState state,
int numEigenVectors,
int overshoot,
Path tmp) throws IOException {
DistributedLanczosSolver solver = new DistributedLanczosSolver();
Path seqFiles = new Path(tmp, "eigendecomp-" + (System.nanoTime() & 0xFF));
solver.runJob(conf,
state,
overshoot,
true,
seqFiles.toString());
// now run the verifier to trim down the number of eigenvectors
EigenVerificationJob verifier = new EigenVerificationJob();
Path verifiedEigens = new Path(tmp, "verifiedeigens");
verifier.runJob(conf, seqFiles, input.getRowPath(), verifiedEigens, false, 1.0, numEigenVectors);
Path cleanedEigens = verifier.getCleanedEigensPath();
return new DistributedRowMatrix(cleanedEigens, new Path(cleanedEigens, "tmp"), numEigenVectors, input.numRows());
}
Developer ID: saradelrio, Project: Chi-FRBCS-BigDataCS, Lines: 27, Source file: EigencutsDriver.java
Example 4: reduce
import org.apache.mahout.math.hadoop.DistributedRowMatrix; // import the required package/class
@Override
protected void reduce(IntWritable row, Iterable<DistributedRowMatrix.MatrixEntryWritable> values, Context context)
throws IOException, InterruptedException {
int size = context.getConfiguration().getInt(EigencutsKeys.AFFINITY_DIMENSIONS, Integer.MAX_VALUE);
RandomAccessSparseVector out = new RandomAccessSparseVector(size, 100);
for (DistributedRowMatrix.MatrixEntryWritable element : values) {
out.setQuick(element.getCol(), element.getVal());
if (log.isDebugEnabled()) {
log.debug("(DEBUG - REDUCE) Row[{}], Column[{}], Value[{}]",
new Object[] {row.get(), element.getCol(), element.getVal()});
}
}
SequentialAccessSparseVector output = new SequentialAccessSparseVector(out);
context.write(row, new VectorWritable(output));
}
Developer ID: saradelrio, Project: Chi-FRBCS-BigDataCS, Lines: 17, Source file: AffinityMatrixInputReducer.java
Example 5: reduce
import org.apache.mahout.math.hadoop.DistributedRowMatrix; // import the required package/class
@Override
protected void reduce(SimilarityMatrixEntryKey key, Iterable<DistributedRowMatrix.MatrixEntryWritable> entries,
Context ctx) throws IOException, InterruptedException
{
RandomAccessSparseVector temporaryVector = new RandomAccessSparseVector(Integer.MAX_VALUE,
maxSimilaritiesPerRow);
int similaritiesSet = 0;
for (DistributedRowMatrix.MatrixEntryWritable entry : entries)
{
temporaryVector.setQuick(entry.getCol(), entry.getVal());
if (++similaritiesSet == maxSimilaritiesPerRow)
{
break;
}
}
SequentialAccessSparseVector vector = new SequentialAccessSparseVector(temporaryVector);
ctx.write(new IntWritable(key.getRow()), new VectorWritable(vector));
}
Developer ID: beeldengeluid, Project: zieook, Lines: 19, Source file: RowSimilarityZieOok.java
Example 6: loadXtX
import org.apache.mahout.math.hadoop.DistributedRowMatrix; // import the required package/class
public void loadXtX(Path ymPath, int inMemMatrixNumCols,
Configuration conf) {
if (xtx != null)
return;
Path xtxOutputPath = getXtXPathBasedOnYm(ymPath);
DistributedRowMatrix xtxDistMtx = new DistributedRowMatrix(xtxOutputPath,
xtxOutputPath.getParent(), inMemMatrixNumCols, inMemMatrixNumCols);
xtxDistMtx.setConf(conf);
xtx = PCACommon.toDenseMatrix(xtxDistMtx);
}
Developer ID: SiddharthMalhotra, Project: sPCA, Lines: 11, Source file: CompositeJob.java
Example 7: loadYtX
import org.apache.mahout.math.hadoop.DistributedRowMatrix; // import the required package/class
public void loadYtX(Path outPath, Path tmpPath, int numRows, int numCols,
Configuration conf) {
if (ytx != null)
return;
DistributedRowMatrix out = new DistributedRowMatrix(outPath,
tmpPath, numRows,
numCols);
out.setConf(conf);
ytx = PCACommon.toDenseMatrix(out);
}
Developer ID: SiddharthMalhotra, Project: sPCA, Lines: 11, Source file: CompositeJob.java
Example 8: sample
import org.apache.mahout.math.hadoop.DistributedRowMatrix; // import the required package/class
static Matrix sample(DistributedRowMatrix bigMatrix) {
setSampleRate(bigMatrix.numRows(), bigMatrix.numCols());
Matrix sampleMatrix = new DenseMatrix(
(int) (bigMatrix.numRows() * SAMPLE_RATE), bigMatrix.numCols());
sample(bigMatrix, sampleMatrix);
return sampleMatrix;
}
Developer ID: SiddharthMalhotra, Project: sPCA, Lines: 8, Source file: SPCADriver.java
Example 9: toDenseMatrix
import org.apache.mahout.math.hadoop.DistributedRowMatrix; // import the required package/class
/**
* If the matrix is small, we can convert it to an in-memory representation
* and then run efficient centralized operations.
*
* @param origMtx the distributed matrix to load into memory
* @return a dense matrix containing the data
*/
static DenseMatrix toDenseMatrix(DistributedRowMatrix origMtx) {
DenseMatrix mtx = new DenseMatrix(origMtx.numRows(), origMtx.numCols());
Iterator<MatrixSlice> sliceIterator = origMtx.iterateAll();
while (sliceIterator.hasNext()) {
MatrixSlice slice = sliceIterator.next();
mtx.viewRow(slice.index()).assign(slice.vector());
}
return mtx;
}
Developer ID: SiddharthMalhotra, Project: sPCA, Lines: 17, Source file: PCACommon.java
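A usage note on the helper above: because it materializes every row on the heap, a caller would normally guard on the matrix size first. Below is a minimal sketch of such a guard, written as if it sat next to toDenseMatrix inside PCACommon; the one-million-cell threshold is an arbitrary illustration, not a value taken from sPCA.
// Hypothetical guard before centralizing a distributed matrix;
// the threshold is illustrative only.
static final long MAX_IN_MEMORY_CELLS = 1000000L;

static DenseMatrix centralizeIfSmall(DistributedRowMatrix distMtx) {
  long cells = (long) distMtx.numRows() * distMtx.numCols();
  if (cells > MAX_IN_MEMORY_CELLS) {
    throw new IllegalArgumentException("Matrix too large to centralize: " + cells + " cells");
  }
  return toDenseMatrix(distMtx); // the helper shown above
}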
Example 10: toDistributedRowMatrix
import org.apache.mahout.math.hadoop.DistributedRowMatrix; // import the required package/class
/**
* Convert an in-memory representation of a matrix to a distributed version,
* so that it can then be used in distributed jobs.
*
* @param origMatrix the in-memory matrix to distribute
* @param outPath the HDFS directory under which the matrix files are written
* @param tmpPath the temporary path for the resulting DistributedRowMatrix
* @param label a label used to name the output directory
* @return a DistributedRowMatrix backed by the written files
* @throws IOException
*/
static DistributedRowMatrix toDistributedRowMatrix(Matrix origMatrix,
Path outPath, Path tmpPath, String label) throws IOException {
Configuration conf = new Configuration();
Path outputDir = new Path(outPath, label + origMatrix.numRows() + "x"
+ origMatrix.numCols());
FileSystem fs = FileSystem.get(outputDir.toUri(), conf);
if (!fs.exists(outputDir)) {
Path outputFile = new Path(outputDir, "singleSliceMatrix");
SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf,
outputFile, IntWritable.class, VectorWritable.class);
VectorWritable vectorWritable = new VectorWritable();
try {
for (int r = 0; r < origMatrix.numRows(); r++) {
Vector vector = origMatrix.viewRow(r);
vectorWritable.set(vector);
writer.append(new IntWritable(r), vectorWritable);
}
} finally {
writer.close();
}
} else {
log.warn("----------- Skip matrix " + outputDir + " - already exists");
}
DistributedRowMatrix dMatrix = new DistributedRowMatrix(outputDir, tmpPath,
origMatrix.numRows(), origMatrix.numCols());
dMatrix.setConf(conf);
return dMatrix;
}
Developer ID: SiddharthMalhotra, Project: sPCA, Lines: 37, Source file: PCACommon.java
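The helpers in Examples 9 and 10 are inverses of each other, so a round trip makes a convenient smoke test. The sketch below assumes it is compiled into the same package as PCACommon, since both helpers are package-private; the paths and matrix values are illustrative only.
import org.apache.hadoop.fs.Path;
import org.apache.mahout.math.DenseMatrix;
import org.apache.mahout.math.Matrix;
import org.apache.mahout.math.hadoop.DistributedRowMatrix;

public class RoundTripCheck {
  public static void main(String[] args) throws Exception {
    Matrix original = new DenseMatrix(new double[][] { { 1, 2 }, { 3, 4 } });
    Path outPath = new Path("/tmp/pca-out"); // placeholder
    Path tmpPath = new Path("/tmp/pca-tmp"); // placeholder
    DistributedRowMatrix distributed =
        PCACommon.toDistributedRowMatrix(original, outPath, tmpPath, "roundTrip");
    DenseMatrix restored = PCACommon.toDenseMatrix(distributed);
    // restored should now equal original, row by row
    System.out.println("rows match: " + (restored.numRows() == original.numRows()));
  }
}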
Example 11: crossTestIterationOfMapReducePPCASequentialPPCA
import org.apache.mahout.math.hadoop.DistributedRowMatrix; // import the required package/class
@Test
public void crossTestIterationOfMapReducePPCASequentialPPCA() throws Exception {
Matrix C_central = PCACommon.randomMatrix(D, d);
double ss = PCACommon.randSS();
InitialValues initValSeq = new InitialValues(C_central, ss);
InitialValues initValMR = new InitialValues(C_central.clone(), ss);
//1. run sequential
Matrix Ye_central = new DenseMatrix(N, D);
int row = 0;
for (VectorWritable vw : new SequenceFileDirValueIterable<VectorWritable>(
input, PathType.LIST, null, conf)) {
Ye_central.assignRow(row, vw.get());
row++;
}
double bishopSeqErr = ppcaDriver.runSequential(conf, Ye_central, initValSeq, 1);
//2. run mapreduce
DistributedRowMatrix Ye = new DistributedRowMatrix(input, tmp, N, D);
Ye.setConf(conf);
double bishopMRErr = ppcaDriver.runMapReduce(conf, Ye, initValMR, output, N, D, d, 1, 1, 1, 1);
Assert.assertEquals(
"ss value is different in sequential and mapreduce PCA", initValSeq.ss,
initValMR.ss, EPSILON);
double seqCTrace = PCACommon.trace(initValSeq.C);
double mrCTrace = PCACommon.trace(initValMR.C);
Assert.assertEquals(
"C value is different in sequential and mapreduce PCA", seqCTrace,
mrCTrace, EPSILON);
Assert.assertEquals(
"The PPCA error between sequntial and mapreduce methods is too different: "
+ bishopSeqErr + "!= " + bishopMRErr, bishopSeqErr, bishopMRErr, EPSILON);
}
Developer ID: SiddharthMalhotra, Project: sPCA, Lines: 35, Source file: PCATest.java
Example 12: setup
import org.apache.mahout.math.hadoop.DistributedRowMatrix; // import the required package/class
@Before
public void setup() throws Exception {
conf = new Configuration();
long currTime = System.currentTimeMillis();
Path outputDir = new Path("/tmp/" + currTime);
FileSystem fs;
try {
fs = FileSystem.get(outputDir.toUri(), conf);
fs.mkdirs(outputDir);
fs.deleteOnExit(outputDir);
} catch (IOException e) {
e.printStackTrace();
Assert.fail("Error in creating output direcoty " + outputDir);
return;
}
ym = computeMean(inputVectors);
double[] xm = new double[xsize];
times(ym, y2xVectors, xm);
double[] zm = new double[cols];
timesTranspose(xm, cVectors, zm);
for (int c = 0; c < cols; c++)
zm[c] -= ym[c];
ymPath = PCACommon.toDistributedVector(new DenseVector(ym), outputDir,
"ym", conf);
zmPath = PCACommon.toDistributedVector(new DenseVector(zm), outputDir,
"zm", conf);
DistributedRowMatrix distMatrix = PCACommon.toDistributedRowMatrix(
new DenseMatrix(y2xVectors), outputDir, outputDir, "y2xMatrix");
y2xMatrixPath = distMatrix.getRowPath();
distMatrix = PCACommon.toDistributedRowMatrix(
new DenseMatrix(cVectors), outputDir, outputDir, "cMatrix");
cMatrixPath = distMatrix.getRowPath();
computeError(inputVectors);
}
Developer ID: SiddharthMalhotra, Project: sPCA, Lines: 35, Source file: ReconstructionErrJobTest.java
Example 13: setup
import org.apache.mahout.math.hadoop.DistributedRowMatrix; // import the required package/class
@Before
public void setup() throws Exception {
conf = new Configuration();
long currTime = System.currentTimeMillis();
Path outputDir = new Path("/tmp/" + currTime);
FileSystem fs;
try {
fs = FileSystem.get(outputDir.toUri(), conf);
fs.mkdirs(outputDir);
fs.deleteOnExit(outputDir);
} catch (IOException e) {
e.printStackTrace();
Assert.fail("Error in creating output direcoty " + outputDir);
return;
}
ym = computeMean(inputVectors);
double[] xm = new double[xsize];
times(ym, y2xVectors, xm);
ymPath = PCACommon.toDistributedVector(new DenseVector(ym), outputDir,
"ym", conf);
xmPath = PCACommon.toDistributedVector(new DenseVector(xm), outputDir,
"xm", conf);
DistributedRowMatrix distMatrix = PCACommon.toDistributedRowMatrix(
new DenseMatrix(y2xVectors), outputDir, outputDir, "y2xMatrix");
y2xMatrixPath = distMatrix.getRowPath();
distMatrix = PCACommon.toDistributedRowMatrix(
new DenseMatrix(cVectors), outputDir, outputDir, "cMatrix");
cMatrixPath = distMatrix.getRowPath();
}
Developer ID: SiddharthMalhotra, Project: sPCA, Lines: 30, Source file: VarianceJobTest.java
Example 14: runJob
import org.apache.mahout.math.hadoop.DistributedRowMatrix; // import the required package/class
/**
* Initializes and executes the job of reading the documents containing
* the data of the affinity matrix in (x_i, x_j, value) format.
*/
public static void runJob(Path input, Path output, int rows, int cols)
throws IOException, InterruptedException, ClassNotFoundException {
Configuration conf = new Configuration();
HadoopUtil.delete(conf, output);
conf.setInt(EigencutsKeys.AFFINITY_DIMENSIONS, rows);
Job job = new Job(conf, "AffinityMatrixInputJob: " + input + " -> M/R -> " + output);
job.setMapOutputKeyClass(IntWritable.class);
job.setMapOutputValueClass(DistributedRowMatrix.MatrixEntryWritable.class);
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(VectorWritable.class);
job.setOutputFormatClass(SequenceFileOutputFormat.class);
job.setMapperClass(AffinityMatrixInputMapper.class);
job.setReducerClass(AffinityMatrixInputReducer.class);
FileInputFormat.addInputPath(job, input);
FileOutputFormat.setOutputPath(job, output);
job.setJarByClass(AffinityMatrixInputJob.class);
boolean succeeded = job.waitForCompletion(true);
if (!succeeded) {
throw new IllegalStateException("Job failed!");
}
}
Developer ID: saradelrio, Project: Chi-FRBCS-BigDataCS, Lines: 31, Source file: AffinityMatrixInputJob.java
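For reference, this job consumes plain text lines in (i,j,value) form, one matrix entry per line, exactly as the mapper in the next example validates. The sketch below prepares a tiny input file and launches the job against the local filesystem; all paths and values are illustrative, and AffinityMatrixInputJob is assumed to be imported from wherever this project places it.
import java.io.File;
import java.io.PrintWriter;
import org.apache.hadoop.fs.Path;

public class AffinityInputDemo {
  public static void main(String[] args) throws Exception {
    // write a 3-node affinity graph as (i,j,value) text entries
    new File("/tmp/affinity").mkdirs(); // placeholder directory
    PrintWriter w = new PrintWriter("/tmp/affinity/part-00000");
    try {
      w.println("0,1,0.5");
      w.println("1,2,0.25");
      w.println("0,2,0.75");
    } finally {
      w.close();
    }
    AffinityMatrixInputJob.runJob(new Path("/tmp/affinity"),
        new Path("/tmp/affinity-matrix"), 3, 3);
  }
}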
Example 15: map
import org.apache.mahout.math.hadoop.DistributedRowMatrix; // import the required package/class
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] elements = COMMA_PATTERN.split(value.toString());
log.debug("(DEBUG - MAP) Key[{}], Value[{}]", key.get(), value);
// enforce well-formed textual representation of the graph
if (elements.length != 3) {
throw new IOException("Expected input of length 3, received "
+ elements.length + ". Please make sure you adhere to "
+ "the structure of (i,j,value) for representing a graph in text. "
+ "Input line was: '" + value + "'.");
}
if (elements[0].isEmpty() || elements[1].isEmpty() || elements[2].isEmpty()) {
throw new IOException("Found an element of 0 length. Please be sure you adhere to the structure of "
+ "(i,j,value) for representing a graph in text.");
}
// parse the line of text into a DistributedRowMatrix entry,
// making the row (elements[0]) the key to the Reducer, and
// setting the column (elements[1]) in the entry itself
DistributedRowMatrix.MatrixEntryWritable toAdd = new DistributedRowMatrix.MatrixEntryWritable();
IntWritable row = new IntWritable(Integer.valueOf(elements[0]));
toAdd.setRow(-1); // already set as the Reducer's key
toAdd.setCol(Integer.valueOf(elements[1]));
toAdd.setVal(Double.valueOf(elements[2]));
context.write(row, toAdd);
}
Developer ID: saradelrio, Project: Chi-FRBCS-BigDataCS, Lines: 29, Source file: AffinityMatrixInputMapper.java
Example 16: runJob
import org.apache.mahout.math.hadoop.DistributedRowMatrix; // import the required package/class
public static DistributedRowMatrix runJob(Path markovPath, Vector diag, Path outputPath, Path tmpPath)
throws IOException, ClassNotFoundException, InterruptedException {
// set up the serialization of the diagonal vector
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(markovPath.toUri(), conf);
markovPath = fs.makeQualified(markovPath);
outputPath = fs.makeQualified(outputPath);
Path vectorOutputPath = new Path(outputPath.getParent(), "vector");
VectorCache.save(new IntWritable(EigencutsKeys.DIAGONAL_CACHE_INDEX), diag, vectorOutputPath, conf);
// set up the job itself
Job job = new Job(conf, "VectorMatrixMultiplication");
job.setInputFormatClass(SequenceFileInputFormat.class);
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(VectorWritable.class);
job.setOutputFormatClass(SequenceFileOutputFormat.class);
job.setMapperClass(VectorMatrixMultiplicationMapper.class);
job.setNumReduceTasks(0);
FileInputFormat.addInputPath(job, markovPath);
FileOutputFormat.setOutputPath(job, outputPath);
job.setJarByClass(VectorMatrixMultiplicationJob.class);
boolean succeeded = job.waitForCompletion(true);
if (!succeeded) {
throw new IllegalStateException("Job failed!");
}
// build the resulting DRM from the results
return new DistributedRowMatrix(outputPath, tmpPath,
diag.size(), diag.size());
}
Developer ID: saradelrio, Project: Chi-FRBCS-BigDataCS, Lines: 35, Source file: VectorMatrixMultiplicationJob.java
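A hedged invocation sketch for the job above: given a Markov transition matrix already stored on HDFS and its diagonal as an in-memory vector, the returned DRM wraps the job's output. The paths are placeholders, and VectorMatrixMultiplicationJob is assumed to be imported from wherever this project places it.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.hadoop.DistributedRowMatrix;

public class VectorMatrixDemo {
  public static void main(String[] args) throws Exception {
    Vector diag = new DenseVector(new double[] { 0.5, 1.0, 2.0 });
    DistributedRowMatrix scaled = VectorMatrixMultiplicationJob.runJob(
        new Path("/tmp/markov"),        // placeholder: existing matrix rows
        diag,
        new Path("/tmp/markov-scaled"), // job output
        new Path("/tmp/markov-tmp"));   // scratch space for the returned DRM
    scaled.setConf(new Configuration());
    System.out.println("result: " + scaled.numRows() + " x " + scaled.numCols());
  }
}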
Example 17: runMapReduce
import org.apache.mahout.math.hadoop.DistributedRowMatrix; // import the required package/class
/**
* Run sPCA
*
* @param conf
* the configuration
* @param input
* the path to the input matrix Y
* @param output
* the path to the output (currently for normalization output)
* @param nRows
* number of rows in input matrix
* @param nCols
* number of columns in input matrix
* @param nPCs
* number of desired principal components
* @param splitFactor
* divide the block size by this number to increase parallelism
* @param errSampleRate
* the sampling rate used when estimating the reconstruction error
* @param maxIterations
* the maximum number of iterations to run
* @param normalize
* whether to normalize the input matrix before running PCA
* @return the error
* @throws Exception
*/
double runMapReduce(Configuration conf, Path input, Path output, final int nRows,
final int nCols, final int nPCs, final int splitFactor, final float errSampleRate, final int maxIterations, final int normalize) throws Exception {
Matrix centC = PCACommon.randomMatrix(nCols, nPCs);
double ss = PCACommon.randSS();
InitialValues initVal = new InitialValues(centC, ss);
DistributedRowMatrix distY = new DistributedRowMatrix(input,
getTempPath(), nRows, nCols);
distY.setConf(conf);
/**
* Here we can control the number of iterations as well as the input size.
* Can be used to improve initVal by first running on a sample, e.g.:
* runMapReduce(conf, distY, initVal, ..., 1, 10, 0.001);
* runMapReduce(conf, distY, initVal, ..., 11, 13, 0.01);
* runMapReduce(conf, distY, initVal, ..., 14, 1, 1);
*/
double error = runMapReduce(conf, distY, initVal, output, nRows, nCols, nPCs,
splitFactor, errSampleRate, maxIterations, normalize);
return error;
}
Developer ID: SiddharthMalhotra, Project: sPCA, Lines: 40, Source file: SPCADriver.java
Example 18: computeVariance
import org.apache.mahout.math.hadoop.DistributedRowMatrix; // import the required package/class
/**
* Refer to {@link VarianceJob} for the job description. In short, it computes:
* for i in 1:N: sum += (xi - xm) * C' * (yi - ym)'
*
* @param matrixY
* the input matrix Y
* @param ym
* the column mean of Y
* @param matrixY2X
* the matrix to generate X
* @param xm
* = ym * Y2X
* @param matrixC
* the matrix of principal components
* @param tmpPath
* the temporary path in HDFS
* @param conf
* the configuration
* @param id
* the unique id to name files in HDFS
* @return the computed variance sum
* @throws IOException
* @throws InterruptedException
* @throws ClassNotFoundException
*/
public double computeVariance(DistributedRowMatrix matrixY, Vector ym,
DistributedRowMatrix matrixY2X, Vector xm, DistributedRowMatrix matrixC,
Path tmpPath, Configuration conf, String id) throws IOException,
InterruptedException, ClassNotFoundException {
Path xmPath = PCACommon.toDistributedVector(xm, tmpPath, "Xm-varianceJob"
+ id, conf);
Path ymPath = PCACommon.toDistributedVector(ym, tmpPath, "Ym-varianceJob"
+ id, conf);
Path outPath = new Path(tmpPath, "Variance"+id);
FileSystem fs = FileSystem.get(outPath.toUri(), conf);
if (!fs.exists(outPath)) {
run(conf, matrixY.getRowPath(), ymPath.toString(), matrixY2X.getRowPath()
.toString(), xmPath.toString(), matrixC.getRowPath().toString(),
outPath);
} else {
log.warn("---------- Skip variance - already exists: " + outPath);
}
loadResult(outPath, conf);
return finalSum;// finalNumber;
}
Developer ID: SiddharthMalhotra, Project: sPCA, Lines: 47, Source file: VarianceJob.java
Example 19: run
import org.apache.mahout.math.hadoop.DistributedRowMatrix; // import the required package/class
@Override
public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException
{
addInputOption();
addOutputOption();
addOption("numberOfColumns", "r", "Number of columns in the input matrix");
addOption("similarityClassname", "s", "Name of distributed similarity class to instantiate, alternatively use "
+ "one of the predefined similarities (" + SimilarityType.listEnumNames() + ')');
addOption("maxSimilaritiesPerRow", "m", "Number of maximum similarities per row (default: "
+ DEFAULT_MAX_SIMILARITIES_PER_ROW + ')', String.valueOf(DEFAULT_MAX_SIMILARITIES_PER_ROW));
Map<String, String> parsedArgs = parseArguments(args);
if (parsedArgs == null)
{
return -1;
}
int numberOfColumns = Integer.parseInt(parsedArgs.get("--numberOfColumns"));
String similarityClassnameArg = parsedArgs.get("--similarityClassname");
String distributedSimilarityClassname;
try
{
distributedSimilarityClassname = SimilarityType.valueOf(similarityClassnameArg)
.getSimilarityImplementationClassName();
}
catch (IllegalArgumentException iae)
{
distributedSimilarityClassname = similarityClassnameArg;
}
int maxSimilaritiesPerRow = Integer.parseInt(parsedArgs.get("--maxSimilaritiesPerRow"));
Path inputPath = getInputPath();
Path outputPath = getOutputPath();
Path tempDirPath = new Path(parsedArgs.get("--tempDir"));
Path weightsPath = new Path(tempDirPath, "weights");
Path pairwiseSimilarityPath = new Path(tempDirPath, "pairwiseSimilarity");
AtomicInteger currentPhase = new AtomicInteger();
if (shouldRunNextPhase(parsedArgs, currentPhase))
{
Job weights = prepareJob(inputPath, weightsPath, SequenceFileInputFormat.class, RowWeightMapper.class,
VarIntWritable.class, WeightedOccurrence.class, WeightedOccurrencesPerColumnReducer.class,
VarIntWritable.class, WeightedOccurrenceArray.class, SequenceFileOutputFormat.class);
weights.getConfiguration().set(DISTRIBUTED_SIMILARITY_CLASSNAME, distributedSimilarityClassname);
weights.waitForCompletion(true);
}
if (shouldRunNextPhase(parsedArgs, currentPhase))
{
Job pairwiseSimilarity = prepareJob(weightsPath, pairwiseSimilarityPath, SequenceFileInputFormat.class,
CooccurrencesMapper.class, WeightedRowPair.class, Cooccurrence.class, SimilarityReducer.class,
SimilarityMatrixEntryKey.class, DistributedRowMatrix.MatrixEntryWritable.class,
SequenceFileOutputFormat.class);
Configuration pairwiseConf = pairwiseSimilarity.getConfiguration();
pairwiseConf.set(DISTRIBUTED_SIMILARITY_CLASSNAME, distributedSimilarityClassname);
pairwiseConf.setInt(NUMBER_OF_COLUMNS, numberOfColumns);
pairwiseSimilarity.waitForCompletion(true);
}
if (shouldRunNextPhase(parsedArgs, currentPhase))
{
Job asMatrix = prepareJob(pairwiseSimilarityPath, outputPath, SequenceFileInputFormat.class, Mapper.class,
SimilarityMatrixEntryKey.class, DistributedRowMatrix.MatrixEntryWritable.class,
EntriesToVectorsReducer.class, IntWritable.class, VectorWritable.class, SequenceFileOutputFormat.class);
asMatrix.setPartitionerClass(HashPartitioner.class);
asMatrix.setGroupingComparatorClass(SimilarityMatrixEntryKey.SimilarityMatrixEntryKeyGroupingComparator.class);
asMatrix.getConfiguration().setInt(MAX_SIMILARITIES_PER_ROW, maxSimilaritiesPerRow);
asMatrix.waitForCompletion(true);
}
return 0;
}
Developer ID: beeldengeluid, Project: zieook, Lines: 79, Source file: RowSimilarityZieOok.java
Note: The org.apache.mahout.math.hadoop.DistributedRowMatrix examples in this article were collected from GitHub, MSDocs, and other source-code and documentation platforms. The code snippets are selected from open-source projects contributed by various developers; copyright of the source code remains with the original authors, and distribution and use should follow each project's license. Do not reproduce without permission.