1
0

[HUDI-764] [HUDI-765] ORC reader writer Implementation (#2999)

Co-authored-by: Qingyun (Teresa) Kang <kteresa@uber.com>
This commit is contained in:
Jintao Guan
2021-06-15 15:21:43 -07:00
committed by GitHub
parent cb642ceb75
commit b8fe5b91d5
29 changed files with 2268 additions and 91 deletions

View File

@@ -0,0 +1,76 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.common.util;
import static org.apache.hudi.common.testutils.HoodieTestDataGenerator.AVRO_SCHEMA;
import java.util.Arrays;
import java.util.List;
import org.apache.avro.Schema;
import org.apache.hudi.common.testutils.HoodieCommonTestHarness;
import org.apache.orc.TypeDescription;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.Arguments;
import org.junit.jupiter.params.provider.MethodSource;
import static org.junit.jupiter.api.Assertions.assertEquals;
public class TestAvroOrcUtils extends HoodieCommonTestHarness {

  /**
   * Supplies (Avro schema, expected ORC schema) pairs for {@link #testCreateOrcSchema}.
   */
  public static List<Arguments> testCreateOrcSchemaArgs() {
    // Case 1: the shared test-harness AVRO_SCHEMA. The expected ORC schema lists fields
    // in the same order as AVRO_SCHEMA:
    // TRIP_SCHEMA_PREFIX, EXTRA_TYPE_SCHEMA, MAP_TYPE_SCHEMA, FARE_NESTED_SCHEMA, TIP_NESTED_SCHEMA, TRIP_SCHEMA_SUFFIX
    // Covers the types:
    // DATE, DECIMAL, LONG, INT, BYTES, ARRAY, RECORD, MAP, STRING, FLOAT, DOUBLE
    TypeDescription expectedTripSchema = TypeDescription.fromString("struct<"
        + "timestamp:bigint,_row_key:string,rider:string,driver:string,begin_lat:double,"
        + "begin_lon:double,end_lat:double,end_lon:double,"
        + "distance_in_meters:int,seconds_since_epoch:bigint,weight:float,nation:binary,"
        + "current_date:date,current_ts:bigint,height:decimal(10,6),"
        + "city_to_state:map<string,string>,"
        + "fare:struct<amount:double,currency:string>,"
        + "tip_history:array<struct<amount:double,currency:string>>,"
        + "_hoodie_is_deleted:boolean>");

    // Case 2: a hand-written Avro schema that additionally covers FIXED and UNION.
    String fareRecordJson = "{\"type\":\"record\", \"name\":\"fare\",\"fields\": "
        + "[{\"name\": \"amount\",\"type\": \"double\"},{\"name\": \"currency\", \"type\": \"string\"}]}";
    Schema avroSchemaWithMoreTypes = new Schema.Parser().parse(
        "{\"type\": \"record\"," + "\"name\": \"triprec\"," + "\"fields\": [ "
            + "{\"name\" : \"age\", \"type\":{\"type\": \"fixed\", \"size\": 16, \"name\": \"fixedField\" }},"
            + "{\"name\" : \"height\", \"type\": [\"int\", \"null\"] },"
            + "{\"name\" : \"id\", \"type\": [\"int\", \"string\"] },"
            + "{\"name\" : \"fare\", \"type\": [" + fareRecordJson + ", \"null\"] }]}");
    // FIXED maps to binary; a [T, null] union collapses to nullable T; a multi-type
    // union maps to uniontype.
    TypeDescription expectedExtendedSchema = TypeDescription.fromString(
        "struct<age:binary,height:int,id:uniontype<int,string>,fare:struct<amount:double,currency:string>>");

    return Arrays.asList(
        Arguments.of(AVRO_SCHEMA, expectedTripSchema),
        Arguments.of(avroSchemaWithMoreTypes, expectedExtendedSchema)
    );
  }

  /**
   * Verifies that Avro-to-ORC schema conversion produces the expected ORC TypeDescription.
   */
  @ParameterizedTest
  @MethodSource("testCreateOrcSchemaArgs")
  public void testCreateOrcSchema(Schema avroSchema, TypeDescription orcSchema) {
    TypeDescription convertedSchema = AvroOrcUtils.createOrcSchema(avroSchema);
    assertEquals(orcSchema, convertedSchema);
  }
}

View File

@@ -0,0 +1,92 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.common.util;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.orc.CompressionKind;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.RecordReader;
import org.apache.orc.TypeDescription;
import org.apache.orc.Writer;
import org.apache.orc.storage.ql.exec.vector.BytesColumnVector;
import org.apache.orc.storage.ql.exec.vector.LongColumnVector;
import org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import java.io.File;
import java.nio.charset.StandardCharsets;
import java.util.Iterator;
import static org.apache.hudi.common.testutils.SchemaTestUtil.getSchemaFromResource;
import static org.junit.jupiter.api.Assertions.assertEquals;
public class TestOrcReaderIterator {
  // Temp ORC file written and read back by the test; removed before and after each test
  // so a leftover file from an earlier run cannot affect results.
  private final Path filePath = new Path(System.getProperty("java.io.tmpdir") + "/f1_1-0-1_000.orc");

  @BeforeEach
  @AfterEach
  public void clearTempFile() {
    File file = new File(filePath.toString());
    // Check the delete result instead of ignoring it; fall back to deleteOnExit so a
    // failed delete (e.g. a handle still open on Windows) does not leak the file.
    if (file.exists() && !file.delete()) {
      file.deleteOnExit();
    }
  }

  /**
   * Writes 5 rows into an ORC file and verifies that OrcReaderIterator reads them back
   * as Avro GenericRecords with the expected field values, in order.
   */
  @Test
  public void testOrcIteratorReadData() throws Exception {
    final Configuration conf = new Configuration();
    Schema avroSchema = getSchemaFromResource(TestOrcReaderIterator.class, "/simple-test.avsc");
    TypeDescription orcSchema = AvroOrcUtils.createOrcSchema(avroSchema);

    // Write phase: try-with-resources guarantees the writer is closed (and the ORC
    // footer flushed) even if populating the batch throws.
    OrcFile.WriterOptions options = OrcFile.writerOptions(conf).setSchema(orcSchema).compress(CompressionKind.ZLIB);
    try (Writer writer = OrcFile.createWriter(filePath, options)) {
      VectorizedRowBatch batch = orcSchema.createRowBatch();
      BytesColumnVector nameColumns = (BytesColumnVector) batch.cols[0];
      LongColumnVector numberColumns = (LongColumnVector) batch.cols[1];
      BytesColumnVector colorColumns = (BytesColumnVector) batch.cols[2];
      for (int r = 0; r < 5; ++r) {
        int row = batch.size++;
        byte[] name = ("name" + r).getBytes(StandardCharsets.UTF_8);
        nameColumns.setVal(row, name);
        byte[] color = ("color" + r).getBytes(StandardCharsets.UTF_8);
        colorColumns.setVal(row, color);
        numberColumns.vector[row] = r;
      }
      writer.addRowBatch(batch);
    }

    // Read phase: the original left Reader and RecordReader unclosed, leaking file
    // handles; both are Closeable, so scope them with try-with-resources. The
    // RecordReader must stay open while the iterator is consumed.
    try (Reader reader = OrcFile.createReader(filePath, OrcFile.readerOptions(conf));
         RecordReader recordReader = reader.rows(new Reader.Options(conf).schema(orcSchema))) {
      Iterator<GenericRecord> iterator = new OrcReaderIterator<>(recordReader, avroSchema, orcSchema);
      int recordCount = 0;
      while (iterator.hasNext()) {
        GenericRecord record = iterator.next();
        assertEquals("name" + recordCount, record.get("name").toString());
        assertEquals("color" + recordCount, record.get("favorite_color").toString());
        assertEquals(recordCount, record.get("favorite_number"));
        recordCount++;
      }
      assertEquals(5, recordCount);
    }
  }
}

View File

@@ -44,11 +44,16 @@ public class TestHoodieFileReaderFactory {
HoodieFileReader<IndexedRecord> parquetReader = HoodieFileReaderFactory.getFileReader(hadoopConf, parquetPath);
assertTrue(parquetReader instanceof HoodieParquetReader);
// other file format exception.
// log file format.
final Path logPath = new Path("/partition/path/f.b51192a8-574b-4a85-b246-bcfec03ac8bf_100.log.2_1-0-1");
final Throwable thrown = assertThrows(UnsupportedOperationException.class, () -> {
HoodieFileReader<IndexedRecord> logWriter = HoodieFileReaderFactory.getFileReader(hadoopConf, logPath);
}, "should fail since log storage reader is not supported yet.");
assertTrue(thrown.getMessage().contains("format not supported yet."));
// Orc file format.
final Path orcPath = new Path("/partition/path/f1_1-0-1_000.orc");
HoodieFileReader<IndexedRecord> orcReader = HoodieFileReaderFactory.getFileReader(hadoopConf, orcPath);
assertTrue(orcReader instanceof HoodieOrcReader);
}
}