This article collects typical usage examples of the Java class parquet.hadoop.metadata.ParquetMetadata. If you have been wondering what the ParquetMetadata class is for, how to use it, or where to find examples of it in practice, the hand-picked code samples below should help.
The ParquetMetadata class belongs to the parquet.hadoop.metadata package. Twenty code examples of the class are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Java code examples.
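Before the examples, a quick orientation: the most common way to obtain a ParquetMetadata instance is to read a file's footer with ParquetFileReader. The following minimal sketch (the file path is a placeholder) reads a footer and walks its row groups and column chunks, using only accessors that also appear in the examples below.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import parquet.format.converter.ParquetMetadataConverter;
import parquet.hadoop.ParquetFileReader;
import parquet.hadoop.metadata.BlockMetaData;
import parquet.hadoop.metadata.ColumnChunkMetaData;
import parquet.hadoop.metadata.ParquetMetadata;

public class FooterInspector {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path file = new Path("/tmp/example.parquet"); // placeholder path
        // Read only the footer (metadata), not the data pages.
        ParquetMetadata footer = ParquetFileReader.readFooter(conf, file, ParquetMetadataConverter.NO_FILTER);
        System.out.println("created by: " + footer.getFileMetaData().getCreatedBy());
        System.out.println("schema:\n" + footer.getFileMetaData().getSchema());
        // Each block in the footer describes one row group.
        for (BlockMetaData block : footer.getBlocks()) {
            System.out.println("row group with " + block.getRowCount() + " rows");
            for (ColumnChunkMetaData column : block.getColumns()) {
                System.out.println("  " + column.getPath() + ": " + column.getTotalSize() + " bytes");
            }
        }
    }
}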
Example 1: getRowGroupNumbersFromFileSplit
import parquet.hadoop.metadata.ParquetMetadata; // import the required package/class
/**
* Get the list of row group numbers for a given file input split. The logic used here is the same as how Hive's
* Parquet input format finds the row group numbers for an input split.
*/
private List<Integer> getRowGroupNumbersFromFileSplit(final FileSplit split,
final ParquetMetadata footer) throws IOException {
final List<BlockMetaData> blocks = footer.getBlocks();
final long splitStart = split.getStart();
final long splitLength = split.getLength();
final List<Integer> rowGroupNums = Lists.newArrayList();
int i = 0;
for (final BlockMetaData block : blocks) {
final long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
if (firstDataPage >= splitStart && firstDataPage < splitStart + splitLength) {
rowGroupNums.add(i);
}
i++;
}
return rowGroupNums;
}
Author: skhalifa, Project: QDrill, Lines: 25, Source: HiveDrillNativeScanBatchCreator.java
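For context, a hedged sketch of how a helper like this could be driven outside of Drill: read the footer of the split's file, then keep the row groups whose first data page starts inside the split's byte range. The file path and split boundaries below are hypothetical placeholders.

import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import parquet.format.converter.ParquetMetadataConverter;
import parquet.hadoop.ParquetFileReader;
import parquet.hadoop.metadata.BlockMetaData;
import parquet.hadoop.metadata.ParquetMetadata;

public class RowGroupsInRange {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path file = new Path("/tmp/example.parquet"); // placeholder path
        long splitStart = 0L;                         // assumed split boundaries
        long splitLength = 128L * 1024 * 1024;

        ParquetMetadata footer = ParquetFileReader.readFooter(conf, file, ParquetMetadataConverter.NO_FILTER);

        // Same rule as above: a row group belongs to the split if its first data page starts inside it.
        List<Integer> rowGroupNums = new ArrayList<Integer>();
        int i = 0;
        for (BlockMetaData block : footer.getBlocks()) {
            long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
            if (firstDataPage >= splitStart && firstDataPage < splitStart + splitLength) {
                rowGroupNums.add(i);
            }
            i++;
        }
        System.out.println("row groups in range: " + rowGroupNums);
    }
}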
Example 2: getSchema
import parquet.hadoop.metadata.ParquetMetadata; // import the required package/class
@Override
public DatasetJsonRecord getSchema(Path path) throws IOException {
DatasetJsonRecord record = null;
if (!fs.exists(path))
LOG.error("file path : {} not in hdfs", path);
else {
try {
ParquetMetadata readFooter = ParquetFileReader.readFooter(fs.getConf(), path, ParquetMetadataConverter.NO_FILTER);
Map<String, String> schema = readFooter.getFileMetaData().getKeyValueMetaData();
String allFields = schema.get("org.apache.spark.sql.parquet.row.metadata");
FileStatus status = fs.getFileStatus(path);
String storage = STORAGE_TYPE;
String abstractPath = path.toUri().getPath();
String codec = "parquet.codec";
record = new DatasetJsonRecord(allFields, abstractPath, status.getModificationTime(), status.getOwner(), status.getGroup(),
status.getPermission().toString(), codec, storage, "");
LOG.info("parquetfileanalyzer parse path :{},schema is {}", path.toUri().getPath(), record.toCsvString());
} catch (Exception e) {
LOG.error("path : {} content " + " is not Parquet File format content ", path.toUri().getPath());
LOG.info(e.getStackTrace().toString());
}
}
return record;
}
Author: thomas-young-2013, Project: wherehowsX, Lines: 27, Source: ParquetFileAnalyzer.java
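The example above depends on key/value metadata that writers such as Spark store in the footer. A minimal sketch (the file path is a placeholder) that simply dumps every key/value pair is a quick way to check whether a key like org.apache.spark.sql.parquet.row.metadata is actually present before relying on it.

import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import parquet.format.converter.ParquetMetadataConverter;
import parquet.hadoop.ParquetFileReader;
import parquet.hadoop.metadata.ParquetMetadata;

public class DumpKeyValueMetadata {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path file = new Path("/tmp/example.parquet"); // placeholder path
        ParquetMetadata footer = ParquetFileReader.readFooter(conf, file, ParquetMetadataConverter.NO_FILTER);
        // Writers like Spark store their logical schema here as ordinary string metadata.
        Map<String, String> keyValues = footer.getFileMetaData().getKeyValueMetaData();
        for (Map.Entry<String, String> entry : keyValues.entrySet()) {
            System.out.println(entry.getKey() + " = " + entry.getValue());
        }
    }
}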
Example 3: ParquetRecordReader
import parquet.hadoop.metadata.ParquetMetadata; // import the required package/class
public ParquetRecordReader(
FragmentContext fragmentContext,
long batchSize,
String path,
int rowGroupIndex,
FileSystem fs,
DirectCodecFactory codecFactory,
ParquetMetadata footer,
List<SchemaPath> columns) throws ExecutionSetupException {
this.hadoopPath = new Path(path);
this.fileSystem = fs;
this.codecFactory = codecFactory;
this.rowGroupIndex = rowGroupIndex;
this.batchSize = batchSize;
this.footer = footer;
this.fragmentContext = fragmentContext;
setColumns(columns);
}
Author: skhalifa, Project: QDrill, Lines: 19, Source: ParquetRecordReader.java
Example 4: add
import parquet.hadoop.metadata.ParquetMetadata; // import the required package/class
private static void add(ParquetMetadata footer) {
for (BlockMetaData blockMetaData : footer.getBlocks()) {
++blockCount;
MessageType schema = footer.getFileMetaData().getSchema();
recordCount += blockMetaData.getRowCount();
List<ColumnChunkMetaData> columns = blockMetaData.getColumns();
for (ColumnChunkMetaData columnMetaData : columns) {
ColumnDescriptor desc = schema.getColumnDescription(columnMetaData.getPath().toArray());
add(
desc,
columnMetaData.getValueCount(),
columnMetaData.getTotalSize(),
columnMetaData.getTotalUncompressedSize(),
columnMetaData.getEncodings(),
columnMetaData.getStatistics());
}
}
}
Author: grokcoder, Project: pbase, Lines: 19, Source: PrintFooter.java
Example 5: mergeFooters
import parquet.hadoop.metadata.ParquetMetadata; // import the required package/class
static ParquetMetadata mergeFooters(Path root, List<Footer> footers) {
String rootPath = root.toUri().getPath();
GlobalMetaData fileMetaData = null;
List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();
for (Footer footer : footers) {
String footerPath = footer.getFile().toUri().getPath();
if (!footerPath.startsWith(rootPath)) {
throw new ParquetEncodingException(footerPath + " invalid: all the files must be contained in the root " + root);
}
footerPath = footerPath.substring(rootPath.length());
while (footerPath.startsWith("/")) {
footerPath = footerPath.substring(1);
}
fileMetaData = mergeInto(footer.getParquetMetadata().getFileMetaData(), fileMetaData);
for (BlockMetaData block : footer.getParquetMetadata().getBlocks()) {
block.setPath(footerPath);
blocks.add(block);
}
}
return new ParquetMetadata(fileMetaData.merge(), blocks);
}
Author: grokcoder, Project: pbase, Lines: 22, Source: ParquetFileWriter.java
Example 6: toParquetMetadata
import parquet.hadoop.metadata.ParquetMetadata; // import the required package/class
public FileMetaData toParquetMetadata(int currentVersion, ParquetMetadata parquetMetadata) {
List<BlockMetaData> blocks = parquetMetadata.getBlocks();
List<RowGroup> rowGroups = new ArrayList<RowGroup>();
long numRows = 0; // use long so the row count does not overflow for very large files
for (BlockMetaData block : blocks) {
numRows += block.getRowCount();
addRowGroup(parquetMetadata, rowGroups, block);
}
FileMetaData fileMetaData = new FileMetaData(
currentVersion,
toParquetSchema(parquetMetadata.getFileMetaData().getSchema()),
numRows,
rowGroups);
Set<Entry<String, String>> keyValues = parquetMetadata.getFileMetaData().getKeyValueMetaData().entrySet();
for (Entry<String, String> keyValue : keyValues) {
addKeyValue(fileMetaData, keyValue.getKey(), keyValue.getValue());
}
fileMetaData.setCreated_by(parquetMetadata.getFileMetaData().getCreatedBy());
return fileMetaData;
}
Author: grokcoder, Project: pbase, Lines: 23, Source: ParquetMetadataConverter.java
Example 7: readSchema
import parquet.hadoop.metadata.ParquetMetadata; // import the required package/class
@Override
public SchemaDescription readSchema( String file ) throws Exception {
return inClassloader( () -> {
ConfigurationProxy conf = new ConfigurationProxy();
FileSystem fs = FileSystem.get( new URI( file ), conf );
FileStatus fileStatus = fs.getFileStatus( new Path( file ) );
List<Footer> footers = ParquetFileReader.readFooters( conf, fileStatus, true );
if ( footers.isEmpty() ) {
return new SchemaDescription();
} else {
ParquetMetadata meta = footers.get( 0 ).getParquetMetadata();
MessageType schema = meta.getFileMetaData().getSchema();
return ParquetConverter.createSchemaDescription( schema );
}
} );
}
Author: pentaho, Project: pentaho-hadoop-shims, Lines: 17, Source: PentahoParquetInputFormat.java
Example 8: convertParquetSchemaToKettleWithTwoValidRows
import parquet.hadoop.metadata.ParquetMetadata; // import the required package/class
@Test
public void convertParquetSchemaToKettleWithTwoValidRows() throws Exception {
int pentahoValueMetaTypeFirstRow = 2;
boolean allowNullFirstRow = false;
int pentahoValueMetaTypeSecondRow = 5;
boolean allowNullSecondRow = false;
String expectedKettleSchema = ParquetUtils
.createSchema( pentahoValueMetaTypeFirstRow, allowNullFirstRow, pentahoValueMetaTypeSecondRow,
allowNullSecondRow ).marshall();
urlTestResources = Thread.currentThread().getContextClassLoader().getResource( PARQUET_FILE );
ConfigurationProxy conf = new ConfigurationProxy();
conf.set( "fs.defaultFS", "file:///" );
ParquetMetadata meta = ParquetFileReader
.readFooter( conf, new Path( Paths.get( urlTestResources.toURI() ).toString() ),
ParquetMetadataConverter.NO_FILTER );
MessageType schema = meta.getFileMetaData().getSchema();
SchemaDescription kettleSchema = ParquetConverter.createSchemaDescription( schema );
String marshallKettleSchema = kettleSchema.marshall();
Assert.assertEquals( marshallKettleSchema, expectedKettleSchema );
}
Author: pentaho, Project: pentaho-hadoop-shims, Lines: 26, Source: ParquetConverterTest.java
Example 9: execute
import parquet.hadoop.metadata.ParquetMetadata; // import the required package/class
@Override
public void execute(CommandLine options) throws Exception {
super.execute(options);
String[] args = options.getArgs();
String input = args[0];
Configuration conf = new Configuration();
ParquetMetadata metaData = ParquetFileReader.readFooter(conf, new Path(input));
PrettyPrintWriter out = PrettyPrintWriter.stdoutPrettyPrinter()
.withAutoColumn()
.withAutoCrop()
.withWhitespaceHandler(WhiteSpaceHandler.COLLAPSE_WHITESPACE)
.withColumnPadding(1)
.build();
MetadataUtils.showDetails(out, metaData);
out.flushColumns();
}
Author: wesleypeck, Project: parquet-tools, Lines: 21, Source: ShowMetaCommand.java
Example 10: getMetadatas
import parquet.hadoop.metadata.ParquetMetadata; // import the required package/class
public static ParquetMetadata[] getMetadatas (FileStatus[] fileStatuses, Configuration conf) throws IOException
{
ParquetMetadata[] res = new ParquetMetadata[fileStatuses.length];
for (int i = 0; i < fileStatuses.length; ++i)
{
res[i] = ParquetFileReader.readFooter(conf, fileStatuses[i].getPath(), NO_FILTER);
}
return res;
}
Author: dbiir, Project: rainbow, Lines: 10, Source: LocalParquetEvaluator.java
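A hedged usage sketch for a helper like this: list the Parquet files in a directory (the directory path is a placeholder), read each footer, and aggregate the total row count. Hidden and summary files are skipped because they are not data files.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import parquet.format.converter.ParquetMetadataConverter;
import parquet.hadoop.ParquetFileReader;
import parquet.hadoop.metadata.BlockMetaData;
import parquet.hadoop.metadata.ParquetMetadata;

public class CountRowsInDirectory {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path dir = new Path("/tmp/parquet-table"); // placeholder directory
        FileSystem fs = dir.getFileSystem(conf);

        long totalRows = 0;
        for (FileStatus status : fs.listStatus(dir)) {
            String name = status.getPath().getName();
            if (!status.isFile() || name.startsWith("_") || name.startsWith(".")) {
                continue; // skip sub-directories, _metadata/_SUCCESS and hidden files
            }
            ParquetMetadata footer = ParquetFileReader.readFooter(conf, status.getPath(), ParquetMetadataConverter.NO_FILTER);
            for (BlockMetaData block : footer.getBlocks()) {
                totalRows += block.getRowCount();
            }
        }
        System.out.println("total rows: " + totalRows);
    }
}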
Example 11: DrillParquetReader
import parquet.hadoop.metadata.ParquetMetadata; // import the required package/class
public DrillParquetReader(FragmentContext fragmentContext, ParquetMetadata footer, RowGroupReadEntry entry,
List<SchemaPath> columns, DrillFileSystem fileSystem) {
this.footer = footer;
this.fileSystem = fileSystem;
this.entry = entry;
setColumns(columns);
this.fragmentContext = fragmentContext;
fillLevelCheckFrequency = this.fragmentContext.getOptions().getOption(ExecConstants.PARQUET_VECTOR_FILL_CHECK_THRESHOLD).num_val.intValue();
fillLevelCheckThreshold = this.fragmentContext.getOptions().getOption(ExecConstants.PARQUET_VECTOR_FILL_THRESHOLD).num_val.intValue();
}
Author: skhalifa, Project: QDrill, Lines: 11, Source: DrillParquetReader.java
Example 12: readFooter
import parquet.hadoop.metadata.ParquetMetadata; // import the required package/class
/**
* An updated footer reader that tries to read the entire footer without knowing the length.
* This should reduce the number of seek/read round trips in most workloads.
* @param config the Hadoop configuration used to resolve the file system
* @param status the file status of the Parquet file whose footer should be read
* @return the footer (metadata) of the given file
* @throws IOException
*/
public static Footer readFooter(final Configuration config, final FileStatus status) throws IOException {
final FileSystem fs = status.getPath().getFileSystem(config);
try(FSDataInputStream file = fs.open(status.getPath())) {
final long fileLength = status.getLen();
Preconditions.checkArgument(fileLength >= MIN_FILE_SIZE, "%s is not a Parquet file (too small)", status.getPath());
int len = (int) Math.min( fileLength, (long) DEFAULT_READ_SIZE);
byte[] footerBytes = new byte[len];
readFully(file, fileLength - len, footerBytes, 0, len);
checkMagicBytes(status, footerBytes, footerBytes.length - ParquetFileWriter.MAGIC.length);
final int size = BytesUtils.readIntLittleEndian(footerBytes, footerBytes.length - FOOTER_METADATA_SIZE);
if(size > footerBytes.length - FOOTER_METADATA_SIZE){
// if the footer is larger than our initial read, we need to read the rest.
byte[] origFooterBytes = footerBytes;
int origFooterRead = origFooterBytes.length - FOOTER_METADATA_SIZE;
footerBytes = new byte[size];
readFully(file, fileLength - size - FOOTER_METADATA_SIZE, footerBytes, 0, size - origFooterRead);
System.arraycopy(origFooterBytes, 0, footerBytes, size - origFooterRead, origFooterRead);
}else{
int start = footerBytes.length - (size + FOOTER_METADATA_SIZE);
footerBytes = ArrayUtils.subarray(footerBytes, start, start + size);
}
ParquetMetadata metadata = ParquetFormatPlugin.parquetMetadataConverter.readParquetMetadata(new ByteArrayInputStream(footerBytes));
Footer footer = new Footer(status.getPath(), metadata);
return footer;
}
}
Author: skhalifa, Project: QDrill, Lines: 42, Source: FooterGatherer.java
Example 13: isComplex
import parquet.hadoop.metadata.ParquetMetadata; // import the required package/class
private static boolean isComplex(ParquetMetadata footer) {
MessageType schema = footer.getFileMetaData().getSchema();
for (Type type : schema.getFields()) {
if (!type.isPrimitive()) {
return true;
}
}
for (ColumnDescriptor col : schema.getColumns()) {
if (col.getMaxRepetitionLevel() > 0) {
return true;
}
}
return false;
}
Author: skhalifa, Project: QDrill, Lines: 16, Source: ParquetScanBatchCreator.java
Example 14: validateFooters
import parquet.hadoop.metadata.ParquetMetadata; // import the required package/class
private void validateFooters(final List<Footer> metadata) {
logger.debug(metadata.toString());
assertEquals(3, metadata.size());
for (Footer footer : metadata) {
final File file = new File(footer.getFile().toUri());
assertTrue(file.getName(), file.getName().startsWith("part"));
assertTrue(file.getPath(), file.exists());
final ParquetMetadata parquetMetadata = footer.getParquetMetadata();
assertEquals(2, parquetMetadata.getBlocks().size());
final Map<String, String> keyValueMetaData = parquetMetadata.getFileMetaData().getKeyValueMetaData();
assertEquals("bar", keyValueMetaData.get("foo"));
assertEquals(footer.getFile().getName(), keyValueMetaData.get(footer.getFile().getName()));
}
}
Author: skhalifa, Project: QDrill, Lines: 15, Source: ParquetRecordReaderTest.java
Example 15: getDatasetDescriptorFromParquetFile
import parquet.hadoop.metadata.ParquetMetadata; // import the required package/class
private DatasetDescriptor getDatasetDescriptorFromParquetFile(Job job, FileSystem fs, String uri)
throws IOException {
ArrayList<FileStatus> files = new ArrayList<FileStatus>();
FileStatus[] dirs;
dirs = fs.globStatus(fs.makeQualified(getInputPath()));
for (int i = 0; (dirs != null && i < dirs.length); i++) {
files.addAll(Arrays.asList(fs.listStatus(dirs[i].getPath(), HIDDEN_FILES_PATH_FILTER)));
// We only check one file, so exit the loop when we have at least
// one.
if (files.size() > 0) {
break;
}
}
ParquetMetadata parquetMetadata;
try {
parquetMetadata =
ParquetFileReader.readFooter(job.getConfiguration(),
fs.makeQualified(files.get(0).getPath()));
} catch (IOException e) {
LOG.error("Wrong file format. Please check the export file's format.", e);
throw e;
}
MessageType schema = parquetMetadata.getFileMetaData().getSchema();
Schema avroSchema = new AvroSchemaConverter().convert(schema);
DatasetDescriptor descriptor =
new DatasetDescriptor.Builder().schema(avroSchema).format(Formats.PARQUET)
.compressionType(ParquetJob.getCompressionType(job.getConfiguration())).build();
return descriptor;
}
Author: aliyun, Project: aliyun-maxcompute-data-collectors, Lines: 32, Source: HdfsOdpsImportJob.java
Example 16: readSummaryMetadata
import parquet.hadoop.metadata.ParquetMetadata; // import the required package/class
static ParquetMetadata readSummaryMetadata(Configuration configuration, Path basePath, boolean skipRowGroups) throws IOException {
Path metadataFile = new Path(basePath, PARQUET_METADATA_FILE);
Path commonMetaDataFile = new Path(basePath, PARQUET_COMMON_METADATA_FILE);
FileSystem fileSystem = basePath.getFileSystem(configuration);
if (skipRowGroups && fileSystem.exists(commonMetaDataFile)) {
// reading the summary file that does not contain the row groups
if (Log.INFO) LOG.info("reading summary file: " + commonMetaDataFile);
return readFooter(configuration, commonMetaDataFile, filter(skipRowGroups));
} else if (fileSystem.exists(metadataFile)) {
if (Log.INFO) LOG.info("reading summary file: " + metadataFile);
return readFooter(configuration, metadataFile, filter(skipRowGroups));
} else {
return null;
}
}
Author: grokcoder, Project: pbase, Lines: 16, Source: ParquetFileReader.java
Example 17: readFooter
import parquet.hadoop.metadata.ParquetMetadata; // import the required package/class
/**
* Reads the meta data block in the footer of the file
*
* @param configuration the Hadoop configuration used to access the file system
* @param file the parquet File
* @param filter the filter to apply to row groups
* @return the metadata blocks in the footer
* @throws IOException if an error occurs while reading the file
*/
public static final ParquetMetadata readFooter(Configuration configuration, FileStatus file, MetadataFilter filter) throws IOException {
FileSystem fileSystem = file.getPath().getFileSystem(configuration);
FSDataInputStream f = fileSystem.open(file.getPath());
try {
long l = file.getLen();
if (Log.DEBUG) LOG.debug("File length " + l);
int FOOTER_LENGTH_SIZE = 4;
if (l < MAGIC.length + FOOTER_LENGTH_SIZE + MAGIC.length) { // MAGIC + data + footer + footerIndex + MAGIC
throw new RuntimeException(file.getPath() + " is not a Parquet file (too small)");
}
long footerLengthIndex = l - FOOTER_LENGTH_SIZE - MAGIC.length;
if (Log.DEBUG) LOG.debug("reading footer index at " + footerLengthIndex);
f.seek(footerLengthIndex);
int footerLength = readIntLittleEndian(f);
byte[] magic = new byte[MAGIC.length];
f.readFully(magic);
if (!Arrays.equals(MAGIC, magic)) {
throw new RuntimeException(file.getPath() + " is not a Parquet file. expected magic number at tail " + Arrays.toString(MAGIC) + " but found " + Arrays.toString(magic));
}
long footerIndex = footerLengthIndex - footerLength;
if (Log.DEBUG) LOG.debug("read footer length: " + footerLength + ", footer index: " + footerIndex);
if (footerIndex < MAGIC.length || footerIndex >= footerLengthIndex) {
throw new RuntimeException("corrupted file: the footer index is not within the file");
}
f.seek(footerIndex);
return converter.readParquetMetadata(f, filter);
} finally {
f.close();
}
}
Author: grokcoder, Project: pbase, Lines: 41, Source: ParquetFileReader.java
Example 18: end
import parquet.hadoop.metadata.ParquetMetadata; // import the required package/class
/**
* ends a file once all blocks have been written.
* closes the file.
*
* @param extraMetaData the extra meta data to write in the footer
* @throws IOException
*/
public void end(Map<String, String> extraMetaData) throws IOException {
state = state.end();
if (DEBUG) LOG.debug(out.getPos() + ": end");
ParquetMetadata footer = new ParquetMetadata(new FileMetaData(schema, extraMetaData, Version.FULL_VERSION), blocks);
serializeFooter(footer, out);
out.close();
}
Author: grokcoder, Project: pbase, Lines: 15, Source: ParquetFileWriter.java
Example 19: serializeFooter
import parquet.hadoop.metadata.ParquetMetadata; // import the required package/class
private static void serializeFooter(ParquetMetadata footer, FSDataOutputStream out) throws IOException {
long footerIndex = out.getPos();
parquet.format.FileMetaData parquetMetadata = new ParquetMetadataConverter().toParquetMetadata(CURRENT_VERSION, footer);
writeFileMetaData(parquetMetadata, out);
if (DEBUG) LOG.debug(out.getPos() + ": footer length = " + (out.getPos() - footerIndex));
BytesUtils.writeIntLittleEndian(out, (int) (out.getPos() - footerIndex));
out.write(MAGIC);
}
Author: grokcoder, Project: pbase, Lines: 9, Source: ParquetFileWriter.java
Example 20: writeMetadataFile
import parquet.hadoop.metadata.ParquetMetadata; // import the required package/class
/**
* writes a _metadata and _common_metadata file
*
* @param configuration the configuration to use to get the FileSystem
* @param outputPath the directory to write the _metadata file to
* @param footers the list of footers to merge
* @throws IOException
*/
public static void writeMetadataFile(Configuration configuration, Path outputPath, List<Footer> footers) throws IOException {
ParquetMetadata metadataFooter = mergeFooters(outputPath, footers);
FileSystem fs = outputPath.getFileSystem(configuration);
outputPath = outputPath.makeQualified(fs);
writeMetadataFile(outputPath, metadataFooter, fs, PARQUET_METADATA_FILE);
metadataFooter.getBlocks().clear();
writeMetadataFile(outputPath, metadataFooter, fs, PARQUET_COMMON_METADATA_FILE);
}
Author: grokcoder, Project: pbase, Lines: 17, Source: ParquetFileWriter.java
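A hedged sketch of calling this from application code: gather the footers of all files under an output directory and write the _metadata and _common_metadata summary files next to them. The directory path is a placeholder, and the exact readFooters overload may differ between parquet-mr versions.

import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import parquet.hadoop.Footer;
import parquet.hadoop.ParquetFileReader;
import parquet.hadoop.ParquetFileWriter;

public class WriteSummaryMetadata {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path outputDir = new Path("/tmp/parquet-table"); // placeholder directory
        FileSystem fs = outputDir.getFileSystem(conf);
        FileStatus dirStatus = fs.getFileStatus(outputDir);

        // Read the footers of all part files; keep row groups so _metadata can include them.
        List<Footer> footers = ParquetFileReader.readFooters(conf, dirStatus, false);

        // Merges the footers and writes _metadata and _common_metadata into outputDir.
        ParquetFileWriter.writeMetadataFile(conf, outputDir, footers);
    }
}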
Note: the parquet.hadoop.metadata.ParquetMetadata examples in this article were collected from source-code and documentation hosting platforms such as GitHub/MSDocs. The code snippets were selected from open-source projects contributed by their respective authors; copyright remains with the original authors, and distribution and use are subject to each project's license. Do not reproduce without permission.