1
0

[HUDI-2394] Implement Kafka Sink Protocol for Hudi for Ingesting Immutable Data (#3592)

- Fixing packaging, naming of classes
- Use of log4j over slf4j for uniformity
- More follow-on fixes
- Added a version to control/coordinator events.
- Eliminated the config added to write config
- Fixed fetching of checkpoints based on table type
- Clean up of naming, code placement

Co-authored-by: Rajesh Mahindra <rmahindra@Rajeshs-MacBook-Pro.local>
Co-authored-by: Vinoth Chandar <vinoth@apache.org>
This commit is contained in:
rmahindra123
2021-09-10 18:20:26 -07:00
committed by GitHub
parent bd1d2d4952
commit e528dd798a
51 changed files with 4710 additions and 22 deletions

View File

@@ -46,6 +46,7 @@ import org.apache.hudi.keygen.SimpleAvroKeyGenerator;
import org.apache.hudi.keygen.constant.KeyGeneratorType;
import org.apache.hudi.metrics.MetricsReporterType;
import org.apache.hudi.metrics.datadog.DatadogHttpClient.ApiSite;
import org.apache.hudi.table.RandomFileIdPrefixProvider;
import org.apache.hudi.table.action.compact.CompactionTriggerStrategy;
import org.apache.hudi.table.action.compact.strategy.CompactionStrategy;
@@ -413,6 +414,12 @@ public class HoodieWriteConfig extends HoodieConfig {
.withDocumentation("Whether to include '_hoodie_operation' in the metadata fields. "
+ "Once enabled, all the changes of a record are persisted to the delta log directly without merge");
public static final ConfigProperty<String> FILEID_PREFIX_PROVIDER_CLASS = ConfigProperty
.key("hoodie.fileid.prefix.provider.class")
.defaultValue(RandomFileIdPrefixProvider.class.getName())
.sinceVersion("0.10.0")
.withDocumentation("File Id Prefix provider class, that implements `org.apache.hudi.fileid.FileIdPrefixProvider`");
private ConsistencyGuardConfig consistencyGuardConfig;
// Hoodie Write Client transparently rewrites File System View config when embedded mode is enabled
@@ -1748,6 +1755,10 @@ public class HoodieWriteConfig extends HoodieConfig {
return getBooleanOrDefault(ALLOW_OPERATION_METADATA_FIELD);
}
/**
 * Returns the fully-qualified class name of the file id prefix provider to use,
 * as configured by {@code hoodie.fileid.prefix.provider.class}
 * ({@code FILEID_PREFIX_PROVIDER_CLASS}).
 */
public String getFileIdPrefixProviderClassName() {
return getString(FILEID_PREFIX_PROVIDER_CLASS);
}
public static class Builder {
protected final HoodieWriteConfig writeConfig = new HoodieWriteConfig();
@@ -2079,6 +2090,11 @@ public class HoodieWriteConfig extends HoodieConfig {
return this;
}
/**
 * Sets the file id prefix provider implementation class
 * ({@code hoodie.fileid.prefix.provider.class}).
 *
 * @param fileIdPrefixProviderClassName fully-qualified name of the provider class
 * @return this builder for chaining
 */
public Builder withFileIdPrefixProviderClassName(String fileIdPrefixProviderClassName) {
writeConfig.setValue(FILEID_PREFIX_PROVIDER_CLASS, fileIdPrefixProviderClassName);
return this;
}
public Builder withProperties(Properties properties) {
this.writeConfig.getProps().putAll(properties);
return this;

View File

@@ -0,0 +1,36 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.table;
import java.util.Properties;
/**
 * Base class for pluggable strategies that produce the prefix used when creating
 * new file ids. Implementations are looked up by class name from the write config
 * and instantiated with the writer {@link Properties}.
 */
public abstract class FileIdPrefixProvider {

  /** Writer configuration properties supplied at construction time. */
  private final Properties properties;

  public FileIdPrefixProvider(Properties properties) {
    this.properties = properties;
  }

  /** Returns the properties this provider was constructed with. */
  public Properties getProps() {
    return properties;
  }

  /**
   * Creates a file id prefix for a new file in the given partition.
   *
   * @param partitionPath partition path the new file belongs to
   * @return the prefix to use for the new file id
   */
  public abstract String createFilePrefix(String partitionPath);
}

View File

@@ -0,0 +1,35 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.table;
import org.apache.hudi.common.fs.FSUtils;
import java.util.Properties;
/**
 * Default {@code FileIdPrefixProvider} that ignores the partition path and
 * delegates to {@link FSUtils#createNewFileIdPfx()} for a random prefix.
 */
public class RandomFileIdPrefixProvider extends FileIdPrefixProvider {
public RandomFileIdPrefixProvider(Properties props) {
super(props);
}
@Override
public String createFilePrefix(String partitionPath) {
// partitionPath is intentionally unused: the prefix is random, not partition-derived.
return FSUtils.createNewFileIdPfx();
}
}

View File

@@ -18,8 +18,6 @@
package org.apache.hudi.client;
import com.codahale.metrics.Timer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.client.common.HoodieJavaEngineContext;
import org.apache.hudi.client.embedded.EmbeddedTimelineService;
import org.apache.hudi.common.engine.HoodieEngineContext;
@@ -30,6 +28,7 @@ import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.common.model.HoodieWriteStat;
import org.apache.hudi.common.model.WriteOperationType;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.config.HoodieWriteConfig;
@@ -41,6 +40,9 @@ import org.apache.hudi.table.HoodieJavaTable;
import org.apache.hudi.table.HoodieTable;
import org.apache.hudi.table.action.HoodieWriteMetadata;
import com.codahale.metrics.Timer;
import org.apache.hadoop.conf.Configuration;
import java.io.IOException;
import java.util.List;
import java.util.Map;
@@ -153,11 +155,23 @@ public class HoodieJavaWriteClient<T extends HoodieRecordPayload> extends
throw new HoodieNotSupportedException("BulkInsert is not supported in HoodieJavaClient");
}
/**
 * Transitions the requested commit instant for {@code instantTime} to the
 * inflight state on the table's active timeline.
 *
 * @param instantTime instant time whose requested instant should become inflight
 */
public void transitionInflight(String instantTime) {
  HoodieTableMetaClient client = createMetaClient(true);
  HoodieInstant requested =
      new HoodieInstant(HoodieInstant.State.REQUESTED, client.getCommitActionType(), instantTime);
  client.getActiveTimeline()
      .transitionRequestedToInflight(requested, Option.empty(), config.shouldAllowMultiWriteOnSameInstant());
}
/**
 * Bulk inserts the given pre-prepped records (records already tagged with their
 * target location) as of the given instant time.
 *
 * @param preppedRecords        records already prepared for writing
 * @param instantTime           instant time of the write action
 * @param bulkInsertPartitioner optional partitioner to lay out records before writing
 * @return write statuses for the performed writes
 */
@Override
public List<WriteStatus> bulkInsertPreppedRecords(List<HoodieRecord<T>> preppedRecords,
                                                  String instantTime,
                                                  Option<BulkInsertPartitioner<List<HoodieRecord<T>>>> bulkInsertPartitioner) {
  // NOTE(review): removed a stale `throw new HoodieNotSupportedException(...)` that preceded
  // this implementation — it made every following statement unreachable.
  HoodieTable<T, List<HoodieRecord<T>>, List<HoodieKey>, List<WriteStatus>> table =
      getTableAndInitCtx(WriteOperationType.BULK_INSERT_PREPPED, instantTime);
  table.validateInsertSchema();
  preWrite(instantTime, WriteOperationType.BULK_INSERT_PREPPED, table.getMetaClient());
  HoodieWriteMetadata<List<WriteStatus>> result =
      table.bulkInsertPrepped(context, instantTime, preppedRecords, bulkInsertPartitioner);
  return postWrite(result, instantTime, table);
}
@Override

View File

@@ -19,17 +19,18 @@
package org.apache.hudi.table.action.commit;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.ReflectionUtils;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.execution.JavaLazyInsertIterable;
import org.apache.hudi.execution.bulkinsert.JavaBulkInsertInternalPartitionerFactory;
import org.apache.hudi.io.CreateHandleFactory;
import org.apache.hudi.table.BulkInsertPartitioner;
import org.apache.hudi.table.FileIdPrefixProvider;
import org.apache.hudi.table.HoodieTable;
import org.apache.hudi.table.action.HoodieWriteMetadata;
@@ -66,10 +67,14 @@ public class JavaBulkInsertHelper<T extends HoodieRecordPayload, R> extends Abst
final Option<BulkInsertPartitioner<T>> userDefinedBulkInsertPartitioner) {
HoodieWriteMetadata result = new HoodieWriteMetadata();
//transition bulk_insert state to inflight
table.getActiveTimeline().transitionRequestedToInflight(new HoodieInstant(HoodieInstant.State.REQUESTED,
table.getMetaClient().getCommitActionType(), instantTime), Option.empty(),
config.shouldAllowMultiWriteOnSameInstant());
// It's possible the transition to inflight could have already happened.
if (!table.getActiveTimeline().filterInflights().containsInstant(instantTime)) {
table.getActiveTimeline().transitionRequestedToInflight(
new HoodieInstant(HoodieInstant.State.REQUESTED, table.getMetaClient().getCommitActionType(), instantTime),
Option.empty(),
config.shouldAllowMultiWriteOnSameInstant());
}
// write new files
List<WriteStatus> writeStatuses = bulkInsert(inputRecords, instantTime, table, config, performDedupe, userDefinedBulkInsertPartitioner, false, config.getBulkInsertShuffleParallelism(), false);
//update index
@@ -102,12 +107,16 @@ public class JavaBulkInsertHelper<T extends HoodieRecordPayload, R> extends Abst
: JavaBulkInsertInternalPartitionerFactory.get(config.getBulkInsertSortMode());
repartitionedRecords = (List<HoodieRecord<T>>) partitioner.repartitionRecords(dedupedRecords, parallelism);
String idPfx = FSUtils.createNewFileIdPfx();
FileIdPrefixProvider fileIdPrefixProvider = (FileIdPrefixProvider) ReflectionUtils.loadClass(
config.getFileIdPrefixProviderClassName(),
config.getProps());
List<WriteStatus> writeStatuses = new ArrayList<>();
new JavaLazyInsertIterable<>(repartitionedRecords.iterator(), true, config, instantTime, table, idPfx,
table.getTaskContextSupplier(), new CreateHandleFactory<>()).forEachRemaining(writeStatuses::addAll);
new JavaLazyInsertIterable<>(repartitionedRecords.iterator(), true,
config, instantTime, table,
fileIdPrefixProvider.createFilePrefix(""), table.getTaskContextSupplier(),
new CreateHandleFactory<>()).forEachRemaining(writeStatuses::addAll);
return writeStatuses;
}