1
0

[HUDI-2439] Replace RDD with HoodieData in HoodieSparkTable and commit executors (#4856)

- Adopt HoodieData in Spark action commit executors
- Make DeleteHelper, WriteHelper, and MergeHelper Spark-independent in hudi-client-common
- Make HoodieTable in WriteClient APIs a raw type to decouple it from the client's generic types
This commit is contained in:
Raymond Xu
2022-03-17 19:17:56 +08:00
committed by GitHub
parent bf191f8d46
commit 7446ff95a7
69 changed files with 723 additions and 769 deletions

View File

@@ -18,8 +18,6 @@
package org.apache.hudi.client;
import com.codahale.metrics.Timer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.client.common.HoodieJavaEngineContext;
import org.apache.hudi.client.embedded.EmbeddedTimelineService;
import org.apache.hudi.common.data.HoodieList;
@@ -43,6 +41,9 @@ import org.apache.hudi.table.HoodieTable;
import org.apache.hudi.table.action.HoodieWriteMetadata;
import org.apache.hudi.table.upgrade.JavaUpgradeDowngradeHelper;
import com.codahale.metrics.Timer;
import org.apache.hadoop.conf.Configuration;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
@@ -88,9 +89,9 @@ public class HoodieJavaWriteClient<T extends HoodieRecordPayload> extends
}
@Override
protected HoodieTable<T, List<HoodieRecord<T>>, List<HoodieKey>, List<WriteStatus>> createTable(HoodieWriteConfig config,
Configuration hadoopConf,
boolean refreshTimeline) {
protected HoodieTable createTable(HoodieWriteConfig config,
Configuration hadoopConf,
boolean refreshTimeline) {
return HoodieJavaTable.create(config, context);
}
@@ -152,7 +153,7 @@ public class HoodieJavaWriteClient<T extends HoodieRecordPayload> extends
@Override
public List<WriteStatus> bulkInsert(List<HoodieRecord<T>> records,
String instantTime,
Option<BulkInsertPartitioner<List<HoodieRecord<T>>>> userDefinedBulkInsertPartitioner) {
Option<BulkInsertPartitioner> userDefinedBulkInsertPartitioner) {
throw new HoodieNotSupportedException("BulkInsert is not supported in HoodieJavaClient");
}
@@ -166,7 +167,7 @@ public class HoodieJavaWriteClient<T extends HoodieRecordPayload> extends
@Override
public List<WriteStatus> bulkInsertPreppedRecords(List<HoodieRecord<T>> preppedRecords,
String instantTime,
Option<BulkInsertPartitioner<List<HoodieRecord<T>>>> bulkInsertPartitioner) {
Option<BulkInsertPartitioner> bulkInsertPartitioner) {
HoodieTable<T, List<HoodieRecord<T>>, List<HoodieKey>, List<WriteStatus>> table =
initTable(WriteOperationType.BULK_INSERT_PREPPED, Option.ofNullable(instantTime));
table.validateInsertSchema();
@@ -188,7 +189,7 @@ public class HoodieJavaWriteClient<T extends HoodieRecordPayload> extends
@Override
protected List<WriteStatus> postWrite(HoodieWriteMetadata<List<WriteStatus>> result,
String instantTime,
HoodieTable<T, List<HoodieRecord<T>>, List<HoodieKey>, List<WriteStatus>> hoodieTable) {
HoodieTable hoodieTable) {
if (result.getIndexLookupDuration().isPresent()) {
metrics.updateIndexMetrics(getOperationType().name(), result.getIndexUpdateDuration().get().toMillis());
}
@@ -215,7 +216,7 @@ public class HoodieJavaWriteClient<T extends HoodieRecordPayload> extends
@Override
protected void completeCompaction(HoodieCommitMetadata metadata,
HoodieTable<T, List<HoodieRecord<T>>, List<HoodieKey>, List<WriteStatus>> table,
HoodieTable table,
String compactionCommitTime) {
throw new HoodieNotSupportedException("CompleteCompaction is not supported in HoodieJavaClient");
}
@@ -232,7 +233,7 @@ public class HoodieJavaWriteClient<T extends HoodieRecordPayload> extends
}
@Override
protected HoodieTable<T, List<HoodieRecord<T>>, List<HoodieKey>, List<WriteStatus>> doInitTable(HoodieTableMetaClient metaClient, Option<String> instantTime) {
protected HoodieTable doInitTable(HoodieTableMetaClient metaClient, Option<String> instantTime) {
// new JavaUpgradeDowngrade(metaClient, config, context).run(metaClient, HoodieTableVersion.current(), config, context, instantTime);
// Create a Hoodie table which encapsulated the commits and files visible

View File

@@ -114,7 +114,7 @@ public class HoodieJavaCopyOnWriteTable<T extends HoodieRecordPayload>
public HoodieWriteMetadata<List<WriteStatus>> bulkInsert(HoodieEngineContext context,
String instantTime,
List<HoodieRecord<T>> records,
Option<BulkInsertPartitioner<List<HoodieRecord<T>>>> bulkInsertPartitioner) {
Option<BulkInsertPartitioner> bulkInsertPartitioner) {
return new JavaBulkInsertCommitActionExecutor((HoodieJavaEngineContext) context, config,
this, instantTime, records, bulkInsertPartitioner).execute();
}
@@ -152,7 +152,7 @@ public class HoodieJavaCopyOnWriteTable<T extends HoodieRecordPayload>
public HoodieWriteMetadata<List<WriteStatus>> bulkInsertPrepped(HoodieEngineContext context,
String instantTime,
List<HoodieRecord<T>> preppedRecords,
Option<BulkInsertPartitioner<List<HoodieRecord<T>>>> bulkInsertPartitioner) {
Option<BulkInsertPartitioner> bulkInsertPartitioner) {
return new JavaBulkInsertPreppedCommitActionExecutor((HoodieJavaEngineContext) context, config,
this, instantTime, preppedRecords, bulkInsertPartitioner).execute();
}

View File

@@ -61,7 +61,7 @@ public class HoodieJavaMergeOnReadTable<T extends HoodieRecordPayload> extends H
public HoodieWriteMetadata<List<WriteStatus>> bulkInsertPrepped(HoodieEngineContext context,
String instantTime,
List<HoodieRecord<T>> preppedRecords,
Option<BulkInsertPartitioner<List<HoodieRecord<T>>>> bulkInsertPartitioner) {
Option<BulkInsertPartitioner> bulkInsertPartitioner) {
return new JavaBulkInsertPreppedCommitActionExecutor((HoodieJavaEngineContext) context, config,
this, instantTime, preppedRecords, bulkInsertPartitioner).execute();
}

View File

@@ -36,17 +36,17 @@ import java.util.Map;
public class JavaBulkInsertCommitActionExecutor<T extends HoodieRecordPayload<T>> extends BaseJavaCommitActionExecutor<T> {
private final List<HoodieRecord<T>> inputRecords;
private final Option<BulkInsertPartitioner<List<HoodieRecord<T>>>> bulkInsertPartitioner;
private final Option<BulkInsertPartitioner> bulkInsertPartitioner;
public JavaBulkInsertCommitActionExecutor(HoodieJavaEngineContext context, HoodieWriteConfig config, HoodieTable table,
String instantTime, List<HoodieRecord<T>> inputRecords,
Option<BulkInsertPartitioner<List<HoodieRecord<T>>>> bulkInsertPartitioner) {
Option<BulkInsertPartitioner> bulkInsertPartitioner) {
this(context, config, table, instantTime, inputRecords, bulkInsertPartitioner, Option.empty());
}
public JavaBulkInsertCommitActionExecutor(HoodieJavaEngineContext context, HoodieWriteConfig config, HoodieTable table,
String instantTime, List<HoodieRecord<T>> inputRecords,
Option<BulkInsertPartitioner<List<HoodieRecord<T>>>> bulkInsertPartitioner,
Option<BulkInsertPartitioner> bulkInsertPartitioner,
Option<Map<String, String>> extraMetadata) {
super(context, config, table, instantTime, WriteOperationType.BULK_INSERT, extraMetadata);
this.inputRecords = inputRecords;

View File

@@ -65,7 +65,7 @@ public class JavaBulkInsertHelper<T extends HoodieRecordPayload, R> extends Base
final HoodieWriteConfig config,
final BaseCommitActionExecutor<T, List<HoodieRecord<T>>, List<HoodieKey>, List<WriteStatus>, R> executor,
final boolean performDedupe,
final Option<BulkInsertPartitioner<List<HoodieRecord<T>>>> userDefinedBulkInsertPartitioner) {
final Option<BulkInsertPartitioner> userDefinedBulkInsertPartitioner) {
HoodieWriteMetadata result = new HoodieWriteMetadata();
// It's possible the transition to inflight could have already happened.
@@ -89,7 +89,7 @@ public class JavaBulkInsertHelper<T extends HoodieRecordPayload, R> extends Base
HoodieTable<T, List<HoodieRecord<T>>, List<HoodieKey>, List<WriteStatus>> table,
HoodieWriteConfig config,
boolean performDedupe,
Option<BulkInsertPartitioner<List<HoodieRecord<T>>>> userDefinedBulkInsertPartitioner,
Option<BulkInsertPartitioner> userDefinedBulkInsertPartitioner,
boolean useWriterSchema,
int parallelism,
WriteHandleFactory writeHandleFactory) {
@@ -106,6 +106,7 @@ public class JavaBulkInsertHelper<T extends HoodieRecordPayload, R> extends Base
BulkInsertPartitioner partitioner = userDefinedBulkInsertPartitioner.isPresent()
? userDefinedBulkInsertPartitioner.get()
: JavaBulkInsertInternalPartitionerFactory.get(config.getBulkInsertSortMode());
// Only List is supported for the Java partitioner, but the BulkInsertPartitioner API does not enforce this, hence the cast below. TODO HUDI-3463: improve this.
repartitionedRecords = (List<HoodieRecord<T>>) partitioner.repartitionRecords(dedupedRecords, parallelism);
FileIdPrefixProvider fileIdPrefixProvider = (FileIdPrefixProvider) ReflectionUtils.loadClass(

View File

@@ -36,12 +36,12 @@ public class JavaBulkInsertPreppedCommitActionExecutor<T extends HoodieRecordPay
extends BaseJavaCommitActionExecutor<T> {
private final List<HoodieRecord<T>> preppedInputRecord;
private final Option<BulkInsertPartitioner<List<HoodieRecord<T>>>> userDefinedBulkInsertPartitioner;
private final Option<BulkInsertPartitioner> userDefinedBulkInsertPartitioner;
public JavaBulkInsertPreppedCommitActionExecutor(HoodieJavaEngineContext context,
HoodieWriteConfig config, HoodieTable table,
String instantTime, List<HoodieRecord<T>> preppedInputRecord,
Option<BulkInsertPartitioner<List<HoodieRecord<T>>>> userDefinedBulkInsertPartitioner) {
Option<BulkInsertPartitioner> userDefinedBulkInsertPartitioner) {
super(context, config, table, instantTime, WriteOperationType.BULK_INSERT);
this.preppedInputRecord = preppedInputRecord;
this.userDefinedBulkInsertPartitioner = userDefinedBulkInsertPartitioner;