1
0

[HUDI-2711] Fall back to full-table scan for IncrementalRelation if underlying files have been cleared or moved by cleaner (#3946)

Co-authored-by: sivabalan <n.siva.b@gmail.com>
This commit is contained in:
jsbali
2022-02-01 09:33:18 +05:30
committed by GitHub
parent 4b388c104e
commit 7ce0f4522b
5 changed files with 206 additions and 27 deletions

View File

@@ -19,6 +19,7 @@
package org.apache.hudi.utilities.functional;
import org.apache.hudi.AvroConversionUtils;
import org.apache.hudi.DataSourceReadOptions;
import org.apache.hudi.DataSourceWriteOptions;
import org.apache.hudi.client.SparkRDDWriteClient;
import org.apache.hudi.common.config.DFSPropertiesConfiguration;
@@ -1739,6 +1740,54 @@ public class TestHoodieDeltaStreamer extends HoodieDeltaStreamerTestBase {
}
}
@Test
public void testHoodieIncrFallback() throws Exception {
  String upstreamPath = dfsBasePath + "/incr_test_table";
  String downstreamPath = dfsBasePath + "/incr_test_downstream_table";

  // Seed the upstream table with one bulk-insert commit, then run a first
  // incremental sync into the downstream table; this one succeeds.
  insertInTable(upstreamPath, 1, WriteOperationType.BULK_INSERT);
  HoodieDeltaStreamer.Config downstreamCfg =
      TestHelpers.makeConfigForHudiIncrSrc(upstreamPath, downstreamPath,
          WriteOperationType.BULK_INSERT, true, null);
  new HoodieDeltaStreamer(downstreamCfg, jsc).sync();

  // Push enough upserts upstream that the cleaner removes the file slices
  // the downstream checkpoint still references.
  insertInTable(upstreamPath, 9, WriteOperationType.UPSERT);

  // Without the fallback flag the incremental read fails on the missing
  // path, and the downstream table is left unchanged.
  assertThrows(org.apache.spark.sql.AnalysisException.class,
      () -> new HoodieDeltaStreamer(downstreamCfg, jsc).sync());
  TestHelpers.assertRecordCount(1000, downstreamPath + "/*/*", sqlContext);

  if (downstreamCfg.configs == null) {
    downstreamCfg.configs = new ArrayList<>();
  }
  // Enable the full-table-scan fallback, and widen the instant window so a
  // single sync can consume all pending upstream commits.
  downstreamCfg.configs.add(DataSourceReadOptions.INCREMENTAL_FALLBACK_TO_FULL_TABLE_SCAN_FOR_NON_EXISTING_FILES().key() + "=true");
  downstreamCfg.configs.add("hoodie.deltastreamer.source.hoodieincr.num_instants=10");
  downstreamCfg.operation = WriteOperationType.UPSERT;
  new HoodieDeltaStreamer(downstreamCfg, jsc).sync();
  new HoodieDeltaStreamer(downstreamCfg, jsc).sync();

  // With the fallback active the downstream table catches up exactly.
  long upstreamCount = sqlContext.read().format("org.apache.hudi").load(upstreamPath + "/*/*.parquet").count();
  long downstreamCount = sqlContext.read().format("org.apache.hudi").load(downstreamPath + "/*/*.parquet").count();
  assertEquals(upstreamCount, downstreamCount);
}
/**
 * Runs {@code count} delta-streamer syncs against {@code tableBasePath}, using
 * aggressive cleaner/archival retention so older file versions are removed
 * after only a few commits.
 *
 * @param tableBasePath base path of the target table
 * @param count         number of sync rounds to execute
 * @param operationType write operation to use for every round
 */
private void insertInTable(String tableBasePath, int count, WriteOperationType operationType) throws Exception {
  HoodieDeltaStreamer.Config cfg = TestHelpers.makeConfig(tableBasePath, operationType,
      Collections.singletonList(SqlQueryBasedTransformer.class.getName()), PROPS_FILENAME_TEST_SOURCE, false);
  if (cfg.configs == null) {
    cfg.configs = new ArrayList<>();
  }
  // Tight retention settings: the cleaner kicks in quickly, which is what
  // lets the fallback scenario reproduce missing incremental files.
  cfg.configs.add("hoodie.cleaner.commits.retained=3");
  cfg.configs.add("hoodie.keep.min.commits=4");
  cfg.configs.add("hoodie.keep.max.commits=5");
  cfg.configs.add("hoodie.test.source.generate.inserts=true");
  int remaining = count;
  while (remaining-- > 0) {
    new HoodieDeltaStreamer(cfg, jsc).sync();
  }
}
@Test
public void testInsertOverwrite() throws Exception {
testDeltaStreamerWithSpecifiedOperation(dfsBasePath + "/insert_overwrite", WriteOperationType.INSERT_OVERWRITE);