1
0

[HUDI-3396] Make sure BaseFileOnlyViewRelation only reads projected columns (#4818)

NOTE: This change is first part of the series to clean up Hudi's Spark DataSource related implementations, making sure there's minimal code duplication among them, implementations are consistent and performant

This PR is making sure that BaseFileOnlyViewRelation only reads projected columns as well as avoiding unnecessary serde from Row to InternalRow

Brief change log
- Introduced HoodieBaseRDD as a base for all custom RDD impls
- Extracted common fields/methods to HoodieBaseRelation
- Cleaned up and streamlined HoodieBaseFileViewOnlyRelation
- Fixed all of the Relations to avoid superfluous Row <> InternalRow conversions
This commit is contained in:
Alexey Kudinkin
2022-03-09 18:45:25 -08:00
committed by GitHub
parent ca0b8fccee
commit 034addaef5
25 changed files with 751 additions and 264 deletions

View File

@@ -19,6 +19,13 @@
package org.apache.hudi.testutils;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.AvroConversionUtils;
import org.apache.hudi.client.HoodieReadClient;
import org.apache.hudi.client.SparkRDDWriteClient;
import org.apache.hudi.client.WriteStatus;
@@ -28,6 +35,7 @@ import org.apache.hudi.common.model.HoodieAvroPayload;
import org.apache.hudi.common.model.HoodieBaseFile;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.common.table.HoodieTableConfig;
import org.apache.hudi.common.table.HoodieTableMetaClient;
@@ -42,6 +50,7 @@ import org.apache.hudi.config.HoodieIndexConfig;
import org.apache.hudi.config.HoodieStorageConfig;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.data.HoodieJavaRDD;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.hudi.index.HoodieIndex;
import org.apache.hudi.keygen.SimpleKeyGenerator;
import org.apache.hudi.table.HoodieSparkTable;
@@ -50,14 +59,11 @@ import org.apache.hudi.testutils.providers.HoodieMetaClientProvider;
import org.apache.hudi.testutils.providers.HoodieWriteClientProvider;
import org.apache.hudi.testutils.providers.SparkProvider;
import org.apache.hudi.timeline.service.TimelineService;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll;
@@ -69,6 +75,7 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import static org.apache.hudi.common.model.HoodieTableType.COPY_ON_WRITE;
@@ -348,6 +355,21 @@ public class SparkClientFunctionalTestHarness implements SparkProvider, HoodieMe
.withRollbackUsingMarkers(rollbackUsingMarkers);
}
protected Dataset<Row> toDataset(List<HoodieRecord> records, Schema schema) {
List<GenericRecord> avroRecords = records.stream()
.map(r -> {
HoodieRecordPayload payload = (HoodieRecordPayload) r.getData();
try {
return (GenericRecord) payload.getInsertValue(schema).get();
} catch (IOException e) {
throw new HoodieIOException("Failed to extract Avro payload", e);
}
})
.collect(Collectors.toList());
JavaRDD<GenericRecord> jrdd = jsc.parallelize(avroRecords, 2);
return AvroConversionUtils.createDataFrame(jrdd.rdd(), schema.toString(), spark);
}
protected int incrementTimelineServicePortToUse() {
// Increment the timeline service port for each individual test
// to avoid port reuse causing failures