This article collects typical usage examples of the Java class org.apache.parquet.hadoop.ParquetFileWriter. If you are wondering what ParquetFileWriter is for, or how it is used in practice, the curated class examples below may help.
The ParquetFileWriter class belongs to the org.apache.parquet.hadoop package. A total of 14 code examples of the class are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Java code examples.
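Before the individual examples, here is a minimal, self-contained sketch of the most common way ParquetFileWriter shows up in application code: its Mode enum is handed to a high-level ParquetWriter builder rather than the class being driven directly. The file path and schema below are invented for illustration only.
import org.apache.hadoop.fs.Path;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.SimpleGroupFactory;
import org.apache.parquet.hadoop.ParquetFileWriter;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.example.ExampleParquetWriter;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class ParquetFileWriterIntroSketch {
  public static void main(String[] args) throws Exception {
    MessageType schema = MessageTypeParser.parseMessageType(
        "message example { required int64 id; required binary name (UTF8); }");
    Path path = new Path("/tmp/intro-example.parquet"); // hypothetical output location
    SimpleGroupFactory factory = new SimpleGroupFactory(schema);
    try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(path)
        .withType(schema)
        .withWriteMode(ParquetFileWriter.Mode.OVERWRITE) // replace the file instead of failing as Mode.CREATE would
        .build()) {
      writer.write(factory.newGroup().append("id", 1L).append("name", "alice"));
    }
  }
}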
Example 1: isDirReadable
import org.apache.parquet.hadoop.ParquetFileWriter; // import the required package/class
boolean isDirReadable(DrillFileSystem fs, FileStatus dir) {
  Path p = new Path(dir.getPath(), ParquetFileWriter.PARQUET_METADATA_FILE);
  try {
    if (fs.exists(p)) {
      return true;
    } else {
      if (metaDataFileExists(fs, dir)) {
        return true;
      }
      List<FileStatus> statuses = DrillFileSystemUtil.listFiles(fs, dir.getPath(), false);
      return !statuses.isEmpty() && super.isFileReadable(fs, statuses.get(0));
    }
  } catch (IOException e) {
    logger.info("Failure while attempting to check for Parquet metadata file.", e);
    return false;
  }
}
Developer: axbaretto, Project: drill, Lines of code: 19, Source: ParquetFormatPlugin.java
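Note: ParquetFileWriter.PARQUET_METADATA_FILE is the name of the optional summary file ("_metadata") that some writers place next to the data files in a directory. The check above treats a directory as readable Parquet if either that summary file or at least one readable data file is present.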
Example 2: mergeOutput
import org.apache.parquet.hadoop.ParquetFileWriter; // import the required package/class
@Override
protected boolean mergeOutput(FileSystem fs, String sourceFolder, String targetFile) {
  try {
    FileStatus[] sourceStatuses = FileSystemUtil.listSubFiles(fs, sourceFolder);
    List<Path> sourceFiles = new ArrayList<>();
    for (FileStatus sourceStatus : sourceStatuses) {
      sourceFiles.add(sourceStatus.getPath());
    }
    FileMetaData mergedMeta = ParquetFileWriter.mergeMetadataFiles(sourceFiles, fs.getConf()).getFileMetaData();
    ParquetFileWriter writer = new ParquetFileWriter(fs.getConf(), mergedMeta.getSchema(), new Path(targetFile),
        ParquetFileWriter.Mode.CREATE);
    writer.start();
    for (Path input : sourceFiles) {
      writer.appendFile(fs.getConf(), input);
    }
    writer.end(mergedMeta.getKeyValueMetaData());
  } catch (Exception e) {
    LOG.error("Error when merging files in {}.\n{}", sourceFolder, e.getMessage());
    return false;
  }
  return true;
}
Developer: Talend, Project: components, Lines of code: 23, Source: ParquetHdfsFileSink.java
Example 3: createDataFile
import org.apache.parquet.hadoop.ParquetFileWriter; // import the required package/class
private static Path createDataFile() throws IOException {
  File parquetFile = File.createTempFile("test-", "." + FILE_EXTENSION);
  readerSchema = new Schema.Parser().parse(
      ParquetFileReaderTest.class.getResourceAsStream("/file/reader/schemas/people.avsc"));
  projectionSchema = new Schema.Parser().parse(
      ParquetFileReaderTest.class.getResourceAsStream("/file/reader/schemas/people_projection.avsc"));
  try (ParquetWriter writer = AvroParquetWriter.<GenericRecord>builder(new Path(parquetFile.toURI()))
      .withConf(fs.getConf()).withWriteMode(ParquetFileWriter.Mode.OVERWRITE).withSchema(readerSchema).build()) {
    IntStream.range(0, NUM_RECORDS).forEach(index -> {
      GenericRecord datum = new GenericData.Record(readerSchema);
      datum.put(FIELD_INDEX, index);
      datum.put(FIELD_NAME, String.format("%d_name_%s", index, UUID.randomUUID()));
      datum.put(FIELD_SURNAME, String.format("%d_surname_%s", index, UUID.randomUUID()));
      try {
        OFFSETS_BY_INDEX.put(index, Long.valueOf(index));
        writer.write(datum);
      } catch (IOException ioe) {
        throw new RuntimeException(ioe);
      }
    });
  }
  Path path = new Path(new Path(fsUri), parquetFile.getName());
  fs.moveFromLocalFile(new Path(parquetFile.getAbsolutePath()), path);
  return path;
}
Developer: mmolimar, Project: kafka-connect-fs, Lines of code: 28, Source: ParquetFileReaderTest.java
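A hedged companion sketch for reading such a file back, assuming imports for org.apache.parquet.avro.AvroParquetReader and org.apache.parquet.hadoop.ParquetReader, and that path and fs are the values used above:
try (ParquetReader<GenericRecord> reader =
    AvroParquetReader.<GenericRecord>builder(path).withConf(fs.getConf()).build()) {
  GenericRecord record;
  while ((record = reader.read()) != null) { // read() returns null at end of file
    System.out.println(record);
  }
}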
Example 4: getColNameToSchemaElementMapping
import org.apache.parquet.hadoop.ParquetFileWriter; // import the required package/class
public static Map<String, SchemaElement> getColNameToSchemaElementMapping(ParquetMetadata footer) {
  HashMap<String, SchemaElement> schemaElements = new HashMap<>();
  FileMetaData fileMetaData = new ParquetMetadataConverter().toParquetMetadata(ParquetFileWriter.CURRENT_VERSION, footer);
  for (SchemaElement se : fileMetaData.getSchema()) {
    schemaElements.put(se.getName(), se);
  }
  return schemaElements;
}
Developer: dremio, Project: dremio-oss, Lines of code: 9, Source: ParquetReaderUtility.java
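A hypothetical caller sketch (the file path and column name are invented; ParquetFileReader.readFooter(Configuration, Path) is the classic, now-deprecated entry point for obtaining a footer):
Configuration conf = new Configuration();
ParquetMetadata footer = ParquetFileReader.readFooter(conf, new Path("/tmp/data.parquet"));
Map<String, SchemaElement> byName = getColNameToSchemaElementMapping(footer);
SchemaElement idColumn = byName.get("id"); // thrift-level schema element for a column named "id"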
Example 5: checkMagicBytes
import org.apache.parquet.hadoop.ParquetFileWriter; // import the required package/class
private static void checkMagicBytes(FileStatus status, byte[] data, int offset) throws IOException {
  for (int i = 0, v = offset; i < MAGIC_LENGTH; i++, v++) {
    if (ParquetFileWriter.MAGIC[i] != data[v]) {
      byte[] magic = ArrayUtils.subarray(data, offset, offset + MAGIC_LENGTH);
      throw new IOException(status.getPath() + " is not a Parquet file. expected magic number at tail " + Arrays.toString(ParquetFileWriter.MAGIC) + " but found " + Arrays.toString(magic));
    }
  }
}
Developer: dremio, Project: dremio-oss, Lines of code: 9, Source: SingletonParquetFooterCache.java
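For context, ParquetFileWriter.MAGIC holds the four ASCII bytes of the string "PAR1", which every Parquet file carries at both its head and its tail; MAGIC_LENGTH in this snippet is presumably that array's length, 4.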
Example 6: readFooter
import org.apache.parquet.hadoop.ParquetFileWriter; // import the required package/class
/**
 * An updated footer reader that tries to read the entire footer without knowing its length.
 * This should reduce the number of seek/read round trips in most workloads.
 * @param fs
 * @param status
 * @return
 * @throws IOException
 */
public static ParquetMetadata readFooter(
    final FileSystem fs,
    final FileStatus status,
    ParquetMetadataConverter.MetadataFilter filter) throws IOException {
  try (FSDataInputStream file = fs.open(status.getPath())) {
    final long fileLength = status.getLen();
    Preconditions.checkArgument(fileLength >= MIN_FILE_SIZE, "%s is not a Parquet file (too small)", status.getPath());
    int len = (int) Math.min(fileLength, (long) DEFAULT_READ_SIZE);
    byte[] footerBytes = new byte[len];
    readFully(file, fileLength - len, footerBytes, 0, len);
    checkMagicBytes(status, footerBytes, footerBytes.length - ParquetFileWriter.MAGIC.length);
    final int size = BytesUtils.readIntLittleEndian(footerBytes, footerBytes.length - FOOTER_METADATA_SIZE);
    if (size > footerBytes.length - FOOTER_METADATA_SIZE) {
      // if the footer is larger than our initial read, we need to read the rest.
      byte[] origFooterBytes = footerBytes;
      int origFooterRead = origFooterBytes.length - FOOTER_METADATA_SIZE;
      footerBytes = new byte[size];
      readFully(file, fileLength - size - FOOTER_METADATA_SIZE, footerBytes, 0, size - origFooterRead);
      System.arraycopy(origFooterBytes, 0, footerBytes, size - origFooterRead, origFooterRead);
    } else {
      int start = footerBytes.length - (size + FOOTER_METADATA_SIZE);
      footerBytes = ArrayUtils.subarray(footerBytes, start, start + size);
    }
    return ParquetFormatPlugin.parquetMetadataConverter.readParquetMetadata(new ByteArrayInputStream(footerBytes), filter);
  }
}
Developer: dremio, Project: dremio-oss, Lines of code: 41, Source: SingletonParquetFooterCache.java
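The arithmetic above relies on the standard Parquet tail layout: the serialized footer is followed by a 4-byte little-endian footer length and the 4-byte "PAR1" magic, so FOOTER_METADATA_SIZE in this snippet is presumably 8 (length field plus magic). The first speculative read grabs the last DEFAULT_READ_SIZE bytes of the file; only when the footer turns out to be larger than that does a second read become necessary.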
Example 7: getFooters
import org.apache.parquet.hadoop.ParquetFileWriter; // import the required package/class
public static List<Footer> getFooters(final Configuration conf, List<FileStatus> statuses, int parallelism) throws IOException {
  final List<TimedRunnable<Footer>> readers = Lists.newArrayList();
  List<Footer> foundFooters = Lists.newArrayList();
  for (FileStatus status : statuses) {
    if (status.isDirectory()) {
      // first we check for summary file.
      FileSystem fs = status.getPath().getFileSystem(conf);
      final Path summaryPath = new Path(status.getPath(), ParquetFileWriter.PARQUET_METADATA_FILE);
      if (fs.exists(summaryPath)) {
        FileStatus summaryStatus = fs.getFileStatus(summaryPath);
        foundFooters.addAll(ParquetFileReader.readSummaryFile(conf, summaryStatus));
        continue;
      }
      // else we handle as normal file.
      for (FileStatus inStatus : DrillFileSystemUtil.listFiles(fs, status.getPath(), false)) {
        readers.add(new FooterReader(conf, inStatus));
      }
    } else {
      readers.add(new FooterReader(conf, status));
    }
  }
  if (!readers.isEmpty()) {
    foundFooters.addAll(TimedRunnable.run("Fetch Parquet Footers", logger, readers, parallelism));
  }
  return foundFooters;
}
Developer: axbaretto, Project: drill, Lines of code: 33, Source: FooterGatherer.java
Example 8: readFooter
import org.apache.parquet.hadoop.ParquetFileWriter; // import the required package/class
/**
 * An updated footer reader that tries to read the entire footer without knowing its length.
 * This should reduce the number of seek/read round trips in most workloads.
 * @param fs
 * @param status
 * @return
 * @throws IOException
 */
public static Footer readFooter(final Configuration config, final FileStatus status) throws IOException {
  final FileSystem fs = status.getPath().getFileSystem(config);
  try (FSDataInputStream file = fs.open(status.getPath())) {
    final long fileLength = status.getLen();
    Preconditions.checkArgument(fileLength >= MIN_FILE_SIZE, "%s is not a Parquet file (too small)", status.getPath());
    int len = (int) Math.min(fileLength, (long) DEFAULT_READ_SIZE);
    byte[] footerBytes = new byte[len];
    readFully(file, fileLength - len, footerBytes, 0, len);
    checkMagicBytes(status, footerBytes, footerBytes.length - ParquetFileWriter.MAGIC.length);
    final int size = BytesUtils.readIntLittleEndian(footerBytes, footerBytes.length - FOOTER_METADATA_SIZE);
    if (size > footerBytes.length - FOOTER_METADATA_SIZE) {
      // if the footer is larger than our initial read, we need to read the rest.
      byte[] origFooterBytes = footerBytes;
      int origFooterRead = origFooterBytes.length - FOOTER_METADATA_SIZE;
      footerBytes = new byte[size];
      readFully(file, fileLength - size - FOOTER_METADATA_SIZE, footerBytes, 0, size - origFooterRead);
      System.arraycopy(origFooterBytes, 0, footerBytes, size - origFooterRead, origFooterRead);
    } else {
      int start = footerBytes.length - (size + FOOTER_METADATA_SIZE);
      footerBytes = ArrayUtils.subarray(footerBytes, start, start + size);
    }
    ParquetMetadata metadata = ParquetFormatPlugin.parquetMetadataConverter.readParquetMetadata(new ByteArrayInputStream(footerBytes));
    Footer footer = new Footer(status.getPath(), metadata);
    return footer;
  }
}
Developer: axbaretto, Project: drill, Lines of code: 42, Source: FooterGatherer.java
Example 9: initWriter
import org.apache.parquet.hadoop.ParquetFileWriter; // import the required package/class
public static ParquetWriter<Group> initWriter(MessageType schema, String fileName) throws IOException {
  GroupWriteSupport.setSchema(schema, conf);
  ParquetWriter<Group> writer =
      new ParquetWriter<Group>(initFile(fileName),
          ParquetFileWriter.Mode.OVERWRITE,
          new GroupWriteSupport(),
          CompressionCodecName.SNAPPY,
          1024,  // block (row group) size
          1024,  // page size
          512,   // dictionary page size
          true,  // enable dictionary encoding
          false, // disable validation
          ParquetProperties.WriterVersion.PARQUET_1_0, conf
      );
  return writer;
}
Developer: axbaretto, Project: drill, Lines of code: 20, Source: ParquetSimpleTestFileGenerator.java
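This positional constructor is deprecated in newer parquet-mr releases. A hedged, roughly equivalent builder-based version, assuming the same initFile helper and conf field plus an import for org.apache.parquet.hadoop.example.ExampleParquetWriter, might look like this (defaults for unset options may differ slightly between the two paths):
ParquetWriter<Group> writer = ExampleParquetWriter.builder(initFile(fileName))
    .withWriteMode(ParquetFileWriter.Mode.OVERWRITE)
    .withCompressionCodec(CompressionCodecName.SNAPPY)
    .withRowGroupSize(1024)
    .withPageSize(1024)
    .withDictionaryPageSize(512)
    .withDictionaryEncoding(true)
    .withValidation(false)
    .withWriterVersion(ParquetProperties.WriterVersion.PARQUET_1_0)
    .withConf(conf)
    .withType(schema)
    .build();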
Example 10: createWriter
import org.apache.parquet.hadoop.ParquetFileWriter; // import the required package/class
private ParquetWriter<PageReader> createWriter(PluginTask task, Schema schema, int processorIndex)
{
  final TimestampFormatter[] timestampFormatters = Timestamps.newTimestampColumnFormatters(task, schema, task.getColumnOptions());
  final Path path = new Path(buildPath(task, processorIndex));
  final CompressionCodecName codec = CompressionCodecName.valueOf(task.getCompressionCodec());
  final int blockSize = task.getBlockSize();
  final int pageSize = task.getPageSize();
  final Configuration conf = createConfiguration(task.getExtraConfigurations());
  final boolean overwrite = task.getOverwrite();
  ParquetWriter<PageReader> writer = null;
  try {
    EmbulkWriterBuilder builder = new EmbulkWriterBuilder(path, schema, timestampFormatters)
        .withCompressionCodec(codec)
        .withRowGroupSize(blockSize)
        .withPageSize(pageSize)
        .withDictionaryPageSize(pageSize)
        .withConf(conf);
    if (overwrite) {
      builder.withWriteMode(ParquetFileWriter.Mode.OVERWRITE);
    }
    writer = builder.build();
  }
  catch (IOException e) {
    Throwables.propagate(e);
  }
  return writer;
}
Developer: choplin, Project: embulk-output-parquet, Lines of code: 32, Source: ParquetOutputPlugin.java
Example 11: execute
import org.apache.parquet.hadoop.ParquetFileWriter; // import the required package/class
@Override
public void execute(CommandLine options) throws Exception {
  // Prepare arguments
  List<String> args = options.getArgList();
  List<Path> inputFiles = getInputFiles(args.subList(0, args.size() - 1));
  Path outputFile = new Path(args.get(args.size() - 1));
  // Merge schema and extraMeta
  FileMetaData mergedMeta = mergedMetadata(inputFiles);
  PrintWriter out = new PrintWriter(Main.out, true);
  // Merge data
  ParquetFileWriter writer = new ParquetFileWriter(conf,
      mergedMeta.getSchema(), outputFile, ParquetFileWriter.Mode.CREATE);
  writer.start();
  boolean tooSmallFilesMerged = false;
  for (Path input : inputFiles) {
    if (input.getFileSystem(conf).getFileStatus(input).getLen() < TOO_SMALL_FILE_THRESHOLD) {
      out.format("Warning: file %s is too small, length: %d\n",
          input,
          input.getFileSystem(conf).getFileStatus(input).getLen());
      tooSmallFilesMerged = true;
    }
    writer.appendFile(HadoopInputFile.fromPath(input, conf));
  }
  if (tooSmallFilesMerged) {
    out.println("Warning: you merged too small files. " +
        "Although the size of the merged file is bigger, it STILL contains small row groups, thus you don't have the advantage of big row groups, " +
        "which usually leads to bad query performance!");
  }
  writer.end(mergedMeta.getKeyValueMetaData());
}
Developer: apache, Project: parquet-mr, Lines of code: 35, Source: MergeCommand.java
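Design note: ParquetFileWriter.appendFile copies the source files' row groups into the target without re-encoding them, which is why the command warns that merging many small files still leaves many small row groups in the merged output.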
Example 12: createDataFile
import org.apache.parquet.hadoop.ParquetFileWriter; // import the required package/class
@Before
public void createDataFile() throws Exception {
  File file = temp.newFile("test.parquet");
  this.path = new Path(file.toString());
  MessageType type = Types.buildMessage()
      .required(INT64).named("id")
      .required(BINARY).as(UTF8).named("data")
      .named("test");
  SimpleGroupFactory factory = new SimpleGroupFactory(type);
  ParquetWriter<Group> writer = ExampleParquetWriter.builder(path)
      .withWriteMode(ParquetFileWriter.Mode.OVERWRITE)
      .withType(type)
      .build();
  try {
    for (long i = 0; i < 1000; i += 1) {
      Group g = factory.newGroup();
      g.add(0, i);
      g.add(1, "data-" + i);
      writer.write(g);
    }
  } finally {
    writer.close();
  }
}
Developer: apache, Project: parquet-mr, Lines of code: 29, Source: TestFiltersWithMissingColumns.java
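A hedged read-back sketch for the file produced above, assuming imports for org.apache.parquet.hadoop.ParquetReader and org.apache.parquet.hadoop.example.GroupReadSupport, and that path is the field set in the test:
try (ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), path).build()) {
  Group g;
  while ((g = reader.read()) != null) { // read() returns null at end of file
    long id = g.getLong("id", 0);       // field name, repetition index
    String data = g.getString("data", 0);
  }
}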
Example 13: mergedMetadata
import org.apache.parquet.hadoop.ParquetFileWriter; // import the required package/class
private FileMetaData mergedMetadata(List<Path> inputFiles) throws IOException {
  return ParquetFileWriter.mergeMetadataFiles(inputFiles, conf).getFileMetaData();
}
Developer: apache, Project: parquet-mr, Lines of code: 4, Source: MergeCommand.java
Example 14: run
import org.apache.parquet.hadoop.ParquetFileWriter; // import the required package/class
@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
  Preconditions.checkArgument(targets != null && targets.size() == 1,
      "CSV path is required.");
  if (header != null) {
    // if a header is given on the command line, don't assume one is in the file
    noHeader = true;
  }
  CSVProperties props = new CSVProperties.Builder()
      .delimiter(delimiter)
      .escape(escape)
      .quote(quote)
      .header(header)
      .hasHeader(!noHeader)
      .linesToSkip(linesToSkip)
      .charset(charsetName)
      .build();
  String source = targets.get(0);
  Schema csvSchema;
  if (avroSchemaFile != null) {
    csvSchema = Schemas.fromAvsc(open(avroSchemaFile));
  } else {
    Set<String> required = ImmutableSet.of();
    if (requiredFields != null) {
      required = ImmutableSet.copyOf(requiredFields);
    }
    String filename = new File(source).getName();
    String recordName;
    if (filename.contains(".")) {
      recordName = filename.substring(0, filename.indexOf("."));
    } else {
      recordName = filename;
    }
    csvSchema = AvroCSV.inferNullableSchema(
        recordName, open(source), props, required);
  }
  long count = 0;
  try (AvroCSVReader<Record> reader = new AvroCSVReader<>(
      open(source), props, csvSchema, Record.class, true)) {
    CompressionCodecName codec = Codecs.parquetCodec(compressionCodecName);
    try (ParquetWriter<Record> writer = AvroParquetWriter
        .<Record>builder(qualifiedPath(outputPath))
        .withWriterVersion(v2 ? PARQUET_2_0 : PARQUET_1_0)
        .withWriteMode(overwrite ?
            ParquetFileWriter.Mode.OVERWRITE : ParquetFileWriter.Mode.CREATE)
        .withCompressionCodec(codec)
        .withDictionaryEncoding(true)
        .withDictionaryPageSize(dictionaryPageSize)
        .withPageSize(pageSize)
        .withRowGroupSize(rowGroupSize)
        .withDataModel(GenericData.get())
        .withConf(getConf())
        .withSchema(csvSchema)
        .build()) {
      for (Record record : reader) {
        writer.write(record);
      }
    } catch (RuntimeException e) {
      throw new RuntimeException("Failed on record " + count, e);
    }
  }
  return 0;
}
Developer: apache, Project: parquet-mr, Lines of code: 73, Source: ConvertCSVCommand.java
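Note on the write-mode choice: ParquetFileWriter.Mode.CREATE makes the writer fail if the output file already exists, while Mode.OVERWRITE replaces it, which is why the command maps its overwrite option onto that enum.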
Note: The org.apache.parquet.hadoop.ParquetFileWriter class examples in this article were collected from source-code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by various developers, and copyright of the source code remains with the original authors; when redistributing or using them, please follow the corresponding project's License. Do not repost without permission.