1
0

[HUDI-3365] Make sure Metadata Table records are updated appropriately on HDFS (#4739)

- This change makes sure MT records are updated appropriately on HDFS: previously, after Log File append operations, MT records were updated with just the size of the deltas being appended to the original files. This was found to be the cause of issues during Rollbacks, which were instead updating the MT with records bearing the full file size.

- To hedge against similar issues going forward, this PR eliminates this discrepancy and streamlines the flow so that the MT always ingests records bearing full file sizes.
This commit is contained in:
Alexey Kudinkin
2022-03-07 12:38:27 -08:00
committed by GitHub
parent f0bcee3c01
commit a66fd40692
18 changed files with 415 additions and 255 deletions

View File

@@ -0,0 +1,38 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.functional;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.config.HoodieWriteConfig;
/**
 * Test-only subclass of {@link org.apache.hudi.client.SparkRDDWriteClient} whose sole purpose is to
 * provide tests in this package access to an API that is otherwise inaccessible.
 *
 * <p>While this is certainly not a great pattern, eliminating such access to an internal API would
 * require substantial test restructuring, so it is considered acceptable given the very limited
 * scope (within the current package).
 */
class SparkRDDWriteClientOverride extends org.apache.hudi.client.SparkRDDWriteClient {

  /**
   * Delegates directly to the superclass constructor; this class adds no state of its own.
   *
   * @param context      engine context used by the write client
   * @param clientConfig write configuration passed through to the superclass
   */
  public SparkRDDWriteClientOverride(HoodieEngineContext context, HoodieWriteConfig clientConfig) {
    super(context, clientConfig);
  }

  // Re-declared solely to widen visibility so tests can invoke it directly; the body just
  // delegates to the superclass. NOTE(review): presumably the inherited method is protected —
  // confirm against the superclass declaration.
  @Override
  public void rollbackFailedBootstrap() {
    super.rollbackFailedBootstrap();
  }
}

View File

@@ -20,7 +20,6 @@ package org.apache.hudi.functional;
import org.apache.hudi.DataSourceWriteOptions;
import org.apache.hudi.avro.model.HoodieFileStatus;
import org.apache.hudi.client.SparkRDDWriteClient;
import org.apache.hudi.client.bootstrap.BootstrapMode;
import org.apache.hudi.client.bootstrap.FullRecordBootstrapDataProvider;
import org.apache.hudi.client.bootstrap.selector.BootstrapModeSelector;
@@ -253,7 +252,8 @@ public class TestBootstrap extends HoodieClientTestBase {
.withBootstrapParallelism(3)
.withBootstrapModeSelector(bootstrapModeSelectorClass).build())
.build();
SparkRDDWriteClient client = new SparkRDDWriteClient(context, config);
SparkRDDWriteClientOverride client = new SparkRDDWriteClientOverride(context, config);
client.bootstrap(Option.empty());
checkBootstrapResults(totalRecords, schema, bootstrapCommitInstantTs, checkNumRawFiles, numInstantsAfterBootstrap,
numInstantsAfterBootstrap, timestamp, timestamp, deltaCommit, bootstrapInstants, true);
@@ -272,7 +272,7 @@ public class TestBootstrap extends HoodieClientTestBase {
assertFalse(index.useIndex());
// Run bootstrap again
client = new SparkRDDWriteClient(context, config);
client = new SparkRDDWriteClientOverride(context, config);
client.bootstrap(Option.empty());
metaClient.reloadActiveTimeline();

View File

@@ -20,7 +20,6 @@ package org.apache.hudi.functional;
import org.apache.hudi.DataSourceWriteOptions;
import org.apache.hudi.avro.model.HoodieFileStatus;
import org.apache.hudi.client.SparkRDDWriteClient;
import org.apache.hudi.client.bootstrap.BootstrapMode;
import org.apache.hudi.client.bootstrap.FullRecordBootstrapDataProvider;
import org.apache.hudi.client.bootstrap.selector.BootstrapModeSelector;
@@ -245,7 +244,8 @@ public class TestOrcBootstrap extends HoodieClientTestBase {
.withBootstrapParallelism(3)
.withBootstrapModeSelector(bootstrapModeSelectorClass).build())
.build();
SparkRDDWriteClient client = new SparkRDDWriteClient(context, config);
SparkRDDWriteClientOverride client = new SparkRDDWriteClientOverride(context, config);
client.bootstrap(Option.empty());
checkBootstrapResults(totalRecords, schema, bootstrapCommitInstantTs, checkNumRawFiles, numInstantsAfterBootstrap,
numInstantsAfterBootstrap, timestamp, timestamp, deltaCommit, bootstrapInstants, true);
@@ -266,7 +266,7 @@ public class TestOrcBootstrap extends HoodieClientTestBase {
assertFalse(index.useIndex());
// Run bootstrap again
client = new SparkRDDWriteClient(context, config);
client = new SparkRDDWriteClientOverride(context, config);
client.bootstrap(Option.empty());
metaClient.reloadActiveTimeline();