本文整理汇总了Java中org.apache.parquet.hadoop.ParquetReader类的典型用法代码示例。如果您正苦于以下问题:Java ParquetReader类的具体用法?Java ParquetReader怎么用?Java ParquetReader使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。
ParquetReader类属于org.apache.parquet.hadoop包,在下文中一共展示了ParquetReader类的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Java代码示例。
示例1: read
import org.apache.parquet.hadoop.ParquetReader; //导入依赖的package包/类
/**
 * Writes the configured dataset to a temporary Parquet file via Spark and
 * reads every record back as a MessagePack {@link Value}.
 *
 * @return all records of the written file, in read order
 * @throws IOException if the Parquet file cannot be read
 */
public List<Value> read() throws IOException
{
    // Honor the requested legacy-format flag before writing.
    spark.conf().set(SQLConf$.MODULE$.PARQUET_WRITE_LEGACY_FORMAT().key(), isLegacyFormat);
    Dataset<Row> frame = spark.createDataFrame(data, schema).repartition(1);
    File outputFile = new File(SparkTestBase.this.tempFolder.getRoot(), name);
    frame.write().options(options).parquet(outputFile.getPath());

    List<Value> values = new ArrayList<>();
    try (ParquetReader<Value> reader = ParquetReader
            .builder(new MessagePackReadSupport(), new Path(outputFile.getPath()))
            .build()) {
        for (Value value = reader.read(); value != null; value = reader.read()) {
            values.add(value);
        }
    }
    return values;
}
开发者ID:CyberAgent,项目名称:embulk-input-parquet_hadoop,代码行数:20,代码来源:SparkTestBase.java
示例2: openParquetReader
import org.apache.parquet.hadoop.ParquetReader; //导入依赖的package包/类
/**
 * Opens a new Parquet reader for {@code filePath}, attaching the configured
 * filter when one is present.
 *
 * @return a reader producing {@link Element}s from the file
 * @throws IOException if the reader cannot be opened
 */
private ParquetReader<Element> openParquetReader() throws IOException {
    final boolean isEntity = schemaUtils.getEntityGroups().contains(group);
    final GafferGroupObjectConverter converter = schemaUtils.getConverter(group);
    LOGGER.debug("Opening a new Parquet reader for file: {}", filePath);
    // Build the common configuration once; only the optional filter differs,
    // instead of duplicating the whole builder chain in both branches.
    final ParquetElementReader.Builder<Element> builder = new ParquetElementReader.Builder<Element>(filePath)
            .isEntity(isEntity)
            .usingConverter(converter);
    if (null != filter) {
        builder.withFilter(FilterCompat.get(filter));
    }
    return builder.build();
}
开发者ID:gchq,项目名称:Gaffer,代码行数:18,代码来源:RetrieveElementsFromFile.java
示例3: getSchema
import org.apache.parquet.hadoop.ParquetReader; //导入依赖的package包/类
/**
 * Derives the dataset schema for a Parquet file by reading its first record
 * and extracting the embedded Avro schema, then combines it with HDFS file
 * metadata (owner, group, permissions, modification time).
 *
 * @param targetFilePath HDFS path of the Parquet file to inspect
 * @return a {@link DatasetJsonRecord} describing the schema and file metadata
 * @throws IOException if the file cannot be read or contains no records
 */
@Override
public DatasetJsonRecord getSchema(Path targetFilePath)
    throws IOException {
  System.out.println("parquet file path : " + targetFilePath.toUri().getPath());
  // try-with-resources closes both handles even when reading fails
  // (the original leaked them on any exception and NPE'd on an empty file).
  try (SeekableInput sin = new FsInput(targetFilePath, fs.getConf());
       ParquetReader<GenericRecord> reader =
           AvroParquetReader.<GenericRecord>builder(targetFilePath).build()) {
    GenericRecord firstRecord = reader.read();
    if (firstRecord == null) {
      throw new IOException("no records found in parquet file: " + targetFilePath);
    }
    String schemaString = firstRecord.getSchema().toString();
    String storage = STORAGE_TYPE;
    String abstractPath = targetFilePath.toUri().getPath();
    FileStatus fstat = fs.getFileStatus(targetFilePath);
    // TODO set codec
    return new DatasetJsonRecord(schemaString, abstractPath, fstat.getModificationTime(),
        fstat.getOwner(), fstat.getGroup(), fstat.getPermission().toString(), null, storage, "");
  }
}
开发者ID:linkedin,项目名称:WhereHows,代码行数:22,代码来源:ParquetFileAnalyzer.java
示例4: getSampleData
import org.apache.parquet.hadoop.ParquetReader; //导入依赖的package包/类
/**
 * Samples up to 10 records from a Parquet file for preview purposes.
 * <p>
 * The original wrapped a single {@code reader.read()} in a one-element
 * iterator, so despite the {@code count < 10} bound it could never return
 * more than one record, and the reader was never closed. This version reads
 * directly from the reader and releases it via try-with-resources.
 *
 * @param targetFilePath HDFS path of the Parquet file to sample
 * @return a {@link SampleDataRecord} holding up to 10 sanitized record strings
 * @throws IOException if the file cannot be read
 */
@Override
public SampleDataRecord getSampleData(Path targetFilePath)
    throws IOException {
  List<Object> list = new ArrayList<Object>();
  try (ParquetReader<GenericRecord> reader =
           AvroParquetReader.<GenericRecord>builder(targetFilePath).build()) {
    int count = 0;
    GenericRecord record;
    // TODO handle out of memory error
    while (count < 10 && (record = reader.read()) != null) {
      // Strip newlines/control characters and escape double quotes so the
      // sample embeds safely in JSON.
      list.add(record.toString().replaceAll("[\\n\\r\\p{C}]", "").replaceAll("\"", "\\\""));
      count++;
    }
  }
  return new SampleDataRecord(targetFilePath.toUri().getPath(), list);
}
开发者ID:linkedin,项目名称:WhereHows,代码行数:19,代码来源:ParquetFileAnalyzer.java
示例5: validateParquetFile
import org.apache.parquet.hadoop.ParquetReader; //导入依赖的package包/类
/**
 * Asserts that the generated Parquet file contains exactly {@code recordCount}
 * rows whose values for keys b, s, l, l100 and s100 follow the deterministic
 * pattern the generator wrote (b = parity of i, s = i as string, etc.).
 *
 * @param parquetFile path of the Parquet file to validate
 * @param recordCount expected number of rows
 * @throws IOException if the file cannot be read
 */
public void validateParquetFile(Path parquetFile, long recordCount) throws IOException {
  // Typed reader + try-with-resources: the original used a raw ParquetReader
  // and never closed it.
  try (ParquetReader<GenericData.Record> reader =
           AvroParquetReader.<GenericData.Record>builder(parquetFile).build()) {
    for (long i = 0; i < recordCount; i++) {
      GenericData.Record actualRow = reader.read();
      Assert.assertNotNull("Can't read row " + i, actualRow);
      Assert.assertEquals("Value different in row " + i + " for key b", actualRow.get("b"), i % 2 == 0);
      Assert.assertEquals("Value different in row " + i + " for key s", actualRow.get("s"), new Utf8(String.valueOf(i)));
      Assert.assertEquals("Value different in row " + i + " for key l", actualRow.get("l"), i);
      Assert.assertEquals("Value different in row " + i + " for key l100", actualRow.get("l100"), i % 100);
      Assert.assertEquals("Value different in row " + i + " for key s100", actualRow.get("s100"), new Utf8(String.valueOf(i % 100)));
    }
    Assert.assertNull("Parquet file contains more than expected rows", reader.read());
  }
}
开发者ID:streamsets,项目名称:datacollector,代码行数:18,代码来源:LargeInputFileIT.java
示例6: validateParquetFile
import org.apache.parquet.hadoop.ParquetReader; //导入依赖的package包/类
/**
 * Asserts that the Parquet file contains exactly the rows described by
 * {@code data}, comparing every expected key/value pair per row.
 * <p>
 * Fixes of the original: {@code position} was never incremented (every failure
 * message reported row 0), the reader used a raw type and was never closed.
 *
 * @param parquetFile path of the Parquet file to validate
 * @param data expected rows, in file order, as key → value maps
 * @throws IOException if the file cannot be read
 */
public void validateParquetFile(Path parquetFile, List<Map<String, Object>> data) throws IOException {
  try (ParquetReader<GenericData.Record> reader =
           AvroParquetReader.<GenericData.Record>builder(parquetFile).build()) {
    int position = 0;
    for (Map<String, Object> expectedRow : data) {
      GenericData.Record actualRow = reader.read();
      Assert.assertNotNull("Can't read row " + position, actualRow);
      for (Map.Entry<String, Object> entry : expectedRow.entrySet()) {
        Object value = actualRow.get(entry.getKey());
        Assert.assertEquals("Different value on row " + position + " for key " + entry.getKey(),
            entry.getValue(), value);
      }
      position++;
    }
    Assert.assertNull("Parquet file contains more than expected rows", reader.read());
  }
}
开发者ID:streamsets,项目名称:datacollector,代码行数:18,代码来源:BaseAvroParquetConvertIT.java
示例7: read
import org.apache.parquet.hadoop.ParquetReader; //导入依赖的package包/类
/**
 * Benchmark helper: reads {@code nRows} rows and feeds every field to the
 * JMH blackhole so the JIT cannot eliminate the reads.
 *
 * @param parquetFile file to read
 * @param nRows number of rows to consume
 * @param blackhole JMH sink preventing dead-code elimination
 * @throws IOException if reading fails
 */
private void read(Path parquetFile, int nRows, Blackhole blackhole) throws IOException
{
    // try-with-resources: the original leaked the reader if read() threw mid-loop.
    try (ParquetReader<Group> reader = ParquetReader
            .builder(new GroupReadSupport(), parquetFile)
            .withConf(configuration)
            .build()) {
        for (int i = 0; i < nRows; i++) {
            Group group = reader.read();
            blackhole.consume(group.getBinary("binary_field", 0));
            blackhole.consume(group.getInteger("int32_field", 0));
            blackhole.consume(group.getLong("int64_field", 0));
            blackhole.consume(group.getBoolean("boolean_field", 0));
            blackhole.consume(group.getFloat("float_field", 0));
            blackhole.consume(group.getDouble("double_field", 0));
            blackhole.consume(group.getBinary("flba_field", 0));
            blackhole.consume(group.getInt96("int96_field", 0));
        }
    }
}
开发者ID:apache,项目名称:parquet-mr,代码行数:17,代码来源:ReadBenchmarks.java
示例8: countFilteredRecords
import org.apache.parquet.hadoop.ParquetReader; //导入依赖的package包/类
/**
 * Counts the records of {@code path} that satisfy the filter predicate.
 *
 * @param path Parquet file to scan
 * @param pred predicate applied while reading
 * @return number of records passing the filter
 * @throws IOException if the file cannot be read
 */
public static long countFilteredRecords(Path path, FilterPredicate pred) throws IOException{
  long matches = 0;
  try (ParquetReader<Group> reader = ParquetReader
      .builder(new GroupReadSupport(), path)
      .withFilter(FilterCompat.get(pred))
      .build()) {
    while (reader.read() != null) {
      matches += 1;
    }
  }
  return matches;
}
开发者ID:apache,项目名称:parquet-mr,代码行数:17,代码来源:TestFiltersWithMissingColumns.java
示例9: readFile
import org.apache.parquet.hadoop.ParquetReader; //导入依赖的package包/类
/**
 * Reads all records of file {@code f} that pass {@code filter}.
 *
 * @param f local Parquet file to read
 * @param filter record filter applied while reading
 * @return every matching record, in file order
 * @throws IOException if the file cannot be read
 */
public static List<Group> readFile(File f, Filter filter) throws IOException {
  Configuration conf = new Configuration();
  GroupWriteSupport.setSchema(schema, conf);
  List<Group> users = new ArrayList<Group>();
  // try-with-resources: the original never closed the reader.
  try (ParquetReader<Group> reader =
      ParquetReader.builder(new GroupReadSupport(), new Path(f.getAbsolutePath()))
          .withConf(conf)
          .withFilter(filter)
          .build()) {
    Group current;
    while ((current = reader.read()) != null) {
      users.add(current);
    }
  }
  return users;
}
开发者ID:apache,项目名称:parquet-mr,代码行数:22,代码来源:PhoneBookWriter.java
示例10: testWriteFile
import org.apache.parquet.hadoop.ParquetReader; //导入依赖的package包/类
/**
 * Round-trip test: writes a one-person AddressBook through the Thrift →
 * Parquet writer and verifies exactly one matching record can be read back.
 */
@Test
public void testWriteFile() throws IOException, InterruptedException, TException {
  final AddressBook a = new AddressBook(
      Arrays.asList(
          new Person(
              new Name("Bob", "Roberts"),
              0,
              "[email protected]",
              Arrays.asList(new PhoneNumber("1234567890")))));
  final Path fileToCreate = createFile(a);
  int recordsRead = 0;
  // try-with-resources: the original never closed the reader.
  try (ParquetReader<Group> reader = createRecordReader(fileToCreate)) {
    Group g;
    while ((g = reader.read()) != null) {
      assertEquals(a.persons.size(), g.getFieldRepetitionCount("persons"));
      assertEquals(a.persons.get(0).email, g.getGroup("persons", 0).getGroup(0, 0).getString("email", 0));
      // just some sanity check, we're testing the various layers somewhere else
      ++recordsRead;
    }
  }
  assertEquals("read 1 record", 1, recordsRead);
}
开发者ID:apache,项目名称:parquet-mr,代码行数:26,代码来源:TestThriftToParquetFileWriter.java
示例11: testWriteFileListOfMap
import org.apache.parquet.hadoop.ParquetReader; //导入依赖的package包/类
/**
 * Verifies that a Thrift list-of-maps field survives the Thrift → Parquet
 * round trip: the outer list's element count and each inner map's entry
 * count must match the original Thrift object.
 */
@Test
public void testWriteFileListOfMap() throws IOException, InterruptedException, TException {
  Map<String, String> map1 = new HashMap<String,String>();
  map1.put("key11", "value11");
  map1.put("key12", "value12");
  Map<String, String> map2 = new HashMap<String,String>();
  map2.put("key21", "value21");
  // A list containing two maps of different sizes, so both repetition counts differ.
  final TestMapInList listMap = new TestMapInList("listmap",
      Arrays.asList(map1, map2));
  final Path fileToCreate = createFile(listMap);
  // NOTE(review): the reader is never closed; consider try-with-resources.
  ParquetReader<Group> reader = createRecordReader(fileToCreate);
  Group g = null;
  while((g = reader.read()) != null) {
    // Outer list size: number of "names_tuple" repetitions under "names".
    assertEquals(listMap.names.size(),
        g.getGroup("names", 0).getFieldRepetitionCount("names_tuple"));
    // Entry counts of each inner map: "map" repetitions per tuple.
    assertEquals(listMap.names.get(0).size(),
        g.getGroup("names", 0).getGroup("names_tuple", 0).getFieldRepetitionCount("map"));
    assertEquals(listMap.names.get(1).size(),
        g.getGroup("names", 0).getGroup("names_tuple", 1).getFieldRepetitionCount("map"));
  }
}
开发者ID:apache,项目名称:parquet-mr,代码行数:25,代码来源:TestThriftToParquetFileWriter.java
示例12: testWriteFileMapOfList
import org.apache.parquet.hadoop.ParquetReader; //导入依赖的package包/类
/**
 * Verifies that a Thrift map-of-lists field survives the Thrift → Parquet
 * round trip: the map key and the value-list's element count must match.
 */
@Test
public void testWriteFileMapOfList() throws IOException, InterruptedException, TException {
  Map<String, List<String>> map = new HashMap<String,List<String>>();
  map.put("key", Arrays.asList("val1","val2"));
  final TestListInMap mapList = new TestListInMap("maplist", map);
  final Path fileToCreate = createFile(mapList);
  // NOTE(review): the reader is never closed; consider try-with-resources.
  ParquetReader<Group> reader = createRecordReader(fileToCreate);
  Group g = null;
  while((g = reader.read()) != null) {
    // The single map entry's key round-trips as a UTF-8 binary.
    assertEquals("key",
        g.getGroup("names", 0).getGroup("map",0).getBinary("key", 0).toStringUsingUTF8());
    // The value list's length equals the repetition count of the value group.
    assertEquals(map.get("key").size(),
        g.getGroup("names", 0).getGroup("map",0).getGroup("value", 0).getFieldRepetitionCount(0));
  }
}
开发者ID:apache,项目名称:parquet-mr,代码行数:18,代码来源:TestThriftToParquetFileWriter.java
示例13: testWriteFileMapOfLists
import org.apache.parquet.hadoop.ParquetReader; //导入依赖的package包/类
/**
 * Verifies that a Thrift map whose keys AND values are lists survives the
 * Thrift → Parquet round trip: every element of both the key list and the
 * value list must be readable from the nested group structure.
 */
@Test
public void testWriteFileMapOfLists() throws IOException, InterruptedException, TException {
  Map<List<String>, List<String>> map = new HashMap<List<String>,List<String>>();
  map.put(Arrays.asList("key1","key2"), Arrays.asList("val1","val2"));
  final TestListsInMap mapList = new TestListsInMap("maplists", map);
  final Path fileToCreate = createFile(mapList);
  // NOTE(review): the reader is never closed; consider try-with-resources.
  ParquetReader<Group> reader = createRecordReader(fileToCreate);
  Group g = null;
  while((g = reader.read()) != null) {
    // Key list elements are stored as "key_tuple" repetitions under "key".
    assertEquals("key1",
        g.getGroup("names", 0).getGroup("map",0).getGroup("key", 0).getBinary("key_tuple", 0).toStringUsingUTF8());
    assertEquals("key2",
        g.getGroup("names", 0).getGroup("map",0).getGroup("key", 0).getBinary("key_tuple", 1).toStringUsingUTF8());
    // Value list elements are stored as "value_tuple" repetitions under "value".
    assertEquals("val1",
        g.getGroup("names", 0).getGroup("map",0).getGroup("value", 0).getBinary("value_tuple", 0).toStringUsingUTF8());
    assertEquals("val2",
        g.getGroup("names", 0).getGroup("map",0).getGroup("value", 0).getBinary("value_tuple", 1).toStringUsingUTF8());
  }
}
开发者ID:apache,项目名称:parquet-mr,代码行数:22,代码来源:TestThriftToParquetFileWriter.java
示例14: read
import org.apache.parquet.hadoop.ParquetReader; //导入依赖的package包/类
/**
 * Reads every datum of {@code file} using the given Avro data model,
 * projecting and decoding with {@code schema}.
 *
 * @param model Avro data model (generic/specific/reflect)
 * @param schema read schema, also used as the requested projection
 * @param file Parquet file to read
 * @return all records of the file, in read order
 * @throws IOException if the file cannot be read
 */
public static <D> List<D> read(GenericData model, Schema schema, File file) throws IOException {
  Configuration conf = new Configuration(false);
  AvroReadSupport.setRequestedProjection(conf, schema);
  AvroReadSupport.setAvroReadSchema(conf, schema);
  List<D> records = new ArrayList<D>();
  try (ParquetReader<D> reader = AvroParquetReader
      .<D>builder(new Path(file.toString()))
      .withDataModel(model) // reflect disables compatibility
      .withConf(conf)
      .build()) {
    for (D datum = reader.read(); datum != null; datum = reader.read()) {
      records.add(datum);
    }
  }
  return records;
}
开发者ID:apache,项目名称:parquet-mr,代码行数:23,代码来源:AvroTestUtil.java
示例15: testFilterOnSubAttribute
import org.apache.parquet.hadoop.ParquetReader; //导入依赖的package包/类
/**
 * Verifies filtering on nested (dotted) attributes of the Avro record:
 * each sub-attribute predicate must match exactly one car.
 */
@Test
public void testFilterOnSubAttribute() throws IOException {
  Path path = writeCarsToParquetFile(1, CompressionCodecName.UNCOMPRESSED, false);
  // Each reader is closed via try-with-resources (the original leaked all three).
  try (ParquetReader<Car> reader = new AvroParquetReader<Car>(testConf, path, column("engine.type", equalTo(EngineType.DIESEL)))) {
    assertEquals(reader.read().toString(), getVwPassat().toString());
    assertNull(reader.read());
  }
  try (ParquetReader<Car> reader = new AvroParquetReader<Car>(testConf, path, column("engine.capacity", equalTo(1.4f)))) {
    assertEquals(getVwPolo().toString(), reader.read().toString());
    assertNull(reader.read());
  }
  try (ParquetReader<Car> reader = new AvroParquetReader<Car>(testConf, path, column("engine.hasTurboCharger", equalTo(true)))) {
    assertEquals(getBmwMini().toString(), reader.read().toString());
    assertNull(reader.read());
  }
}
开发者ID:apache,项目名称:parquet-mr,代码行数:17,代码来源:TestSpecificReadWrite.java
示例16: testAvroReadSchema
import org.apache.parquet.hadoop.ParquetReader; //导入依赖的package包/类
/**
 * Verifies that an evolved Avro read schema ({@code NewCar}) can be applied
 * to an existing file: retained fields are populated, the dropped field is
 * null, and the new field gets its default value (5).
 */
@Test
public void testAvroReadSchema() throws IOException {
  Path path = writeCarsToParquetFile(1, CompressionCodecName.UNCOMPRESSED, false);
  Configuration conf = new Configuration(testConf);
  AvroReadSupport.setAvroReadSchema(conf, NewCar.SCHEMA$);
  // try-with-resources: the original never closed the reader.
  try (ParquetReader<NewCar> reader = new AvroParquetReader<NewCar>(conf, path)) {
    for (NewCar car = reader.read(); car != null; car = reader.read()) {
      assertEquals(true, car.getEngine() != null);
      assertEquals(true, car.getBrand() != null);
      assertEquals(true, car.getYear() != null);
      assertEquals(true, car.getVin() != null);
      // "description" does not exist in the new schema, so it reads as null.
      assertEquals(true, car.getDescription() == null);
      // "opt" is new; its schema default (5) must be filled in.
      assertEquals(true, car.getOpt() == 5);
    }
  }
}
开发者ID:apache,项目名称:parquet-mr,代码行数:17,代码来源:TestSpecificReadWrite.java
示例17: testCompatStringCompatibility
import org.apache.parquet.hadoop.ParquetReader; //导入依赖的package包/类
/**
 * Verifies backward compatibility with files written by older Parquet
 * versions that stored the schema under {@code avro.schema} and did not
 * annotate binary columns as UTF8.
 */
@Test
public void testCompatStringCompatibility() throws IOException {
  // some older versions of Parquet used avro.schema instead of
  // parquet.avro.schema and didn't annotate binary with UTF8 when the type
  // was converted from an Avro string. this validates that the old read
  // schema is recognized and used to read the file as expected.
  Path testFile = new Path(Resources.getResource("strings-2.parquet").getFile());
  Configuration conf = new Configuration();
  // try-with-resources: the original never closed the reader.
  try (ParquetReader<GenericRecord> reader = AvroParquetReader
      .builder(new AvroReadSupport<GenericRecord>(), testFile)
      .withConf(conf)
      .build()) {
    GenericRecord r;
    while ((r = reader.read()) != null) {
      Assert.assertTrue("Should read value into a String",
          r.get("text") instanceof String);
    }
  }
}
开发者ID:apache,项目名称:parquet-mr,代码行数:19,代码来源:TestBackwardCompatibility.java
示例18: initReader
import org.apache.parquet.hadoop.ParquetReader; //导入依赖的package包/类
/**
 * Builds an Avro Parquet reader for the current file, applying the optional
 * read schema and column projection when configured.
 *
 * @return a configured reader of {@link GenericRecord}s
 * @throws IOException if the reader cannot be created
 */
private ParquetReader<GenericRecord> initReader() throws IOException {
    Configuration configuration = getFs().getConf();
    if (this.schema != null) {
        AvroReadSupport.setAvroReadSchema(configuration, this.schema);
    }
    if (this.projection != null) {
        AvroReadSupport.setRequestedProjection(configuration, this.projection);
    }
    // Typed builder call removes the raw ParquetReader local and the
    // unchecked assignment of the original.
    return AvroParquetReader.<GenericRecord>builder(getFilePath())
            .withConf(configuration)
            .build();
}
开发者ID:mmolimar,项目名称:kafka-connect-fs,代码行数:13,代码来源:ParquetFileReader.java
示例19: getSchema
import org.apache.parquet.hadoop.ParquetReader; //导入依赖的package包/类
/**
 * Derives the Connect schema of a Parquet file from the Avro schema of its
 * records. The whole file is scanned, so the schema of the LAST record wins
 * (original behaviour preserved); an empty file yields {@code null}.
 *
 * @param conf Hadoop configuration used to open the file
 * @param path Parquet file to inspect
 * @return the Connect schema, or null if the file has no records
 * @throws IOException if the file cannot be read
 */
@Override
public Schema getSchema(Configuration conf, Path path) throws IOException {
    AvroReadSupport<GenericRecord> readSupport = new AvroReadSupport<>();
    ParquetReader.Builder<GenericRecord> builder = ParquetReader.builder(readSupport, path);
    Schema schema = null;
    // try-with-resources: the original leaked the reader when read() threw.
    try (ParquetReader<GenericRecord> parquetReader = builder.withConf(conf).build()) {
        GenericRecord record;
        while ((record = parquetReader.read()) != null) {
            schema = avroData.toConnectSchema(record.getSchema());
        }
    }
    return schema;
}
开发者ID:jiangxiluning,项目名称:kafka-connect-hdfs,代码行数:14,代码来源:ParquetFileReader.java
示例20: readData
import org.apache.parquet.hadoop.ParquetReader; //导入依赖的package包/类
@Override
public Collection<Object> readData(Configuration conf, Path path) throws IOException {
Collection<Object> result = new ArrayList<>();
AvroReadSupport<GenericRecord> readSupport = new AvroReadSupport<>();
ParquetReader.Builder<GenericRecord> builder = ParquetReader.builder(readSupport, path);
ParquetReader<GenericRecord> parquetReader = builder.withConf(conf).build();
GenericRecord record;
while ((record = parquetReader.read()) != null) {
result.add(record);
}
parquetReader.close();
return result;
}
开发者ID:jiangxiluning,项目名称:kafka-connect-hdfs,代码行数:14,代码来源:ParquetFileReader.java
注:本文中的org.apache.parquet.hadoop.ParquetReader类示例整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。
请发表评论