1
0

[HUDI-2837] Add support for using database name in incremental query (#4083)

This commit is contained in:
董可伦
2022-01-23 14:11:27 +08:00
committed by GitHub
parent 64b1426005
commit 56cd8ffae0
19 changed files with 330 additions and 63 deletions

View File

@@ -22,6 +22,7 @@ import org.apache.hudi.avro.model.HoodieCompactionPlan;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.HoodieCommitMetadata;
import org.apache.hudi.common.model.HoodieFileFormat;
import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.common.model.HoodieWriteStat;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
@@ -232,9 +233,90 @@ public class TestHoodieHFileInputFormat {
InputFormatTestUtil.setupIncremental(jobConf, "100", 1);
HoodieTableMetaClient metaClient = HoodieTestUtils.init(HoodieTestUtils.getDefaultHadoopConf(), basePath.toString(),
HoodieTableType.COPY_ON_WRITE, baseFileFormat);
assertEquals(null, metaClient.getTableConfig().getDatabaseName(),
"When hoodie.database.name is not set, it should default to null");
FileStatus[] files = inputFormat.listStatus(jobConf);
assertEquals(0, files.length,
"We should exclude commit 100 when returning incremental pull with start commit time as 100");
InputFormatTestUtil.setupIncremental(jobConf, "100", 1, true);
files = inputFormat.listStatus(jobConf);
assertEquals(0, files.length,
"We should exclude commit 100 when returning incremental pull with start commit time as 100");
metaClient = HoodieTestUtils.init(HoodieTestUtils.getDefaultHadoopConf(), basePath.toString(), HoodieTableType.COPY_ON_WRITE,
baseFileFormat, HoodieTestUtils.HOODIE_DATABASE);
assertEquals(HoodieTestUtils.HOODIE_DATABASE, metaClient.getTableConfig().getDatabaseName(),
String.format("The hoodie.database.name should be %s ", HoodieTestUtils.HOODIE_DATABASE));
files = inputFormat.listStatus(jobConf);
assertEquals(10, files.length,
"When hoodie.incremental.use.database is true and hoodie.database.name is not null or empty"
+ " and the incremental database name is not set, then the incremental query will not take effect");
}
/**
 * Verifies incremental-query behavior when the consume-mode/start-commit/max-commit
 * properties are keyed by "&lt;database&gt;.&lt;table&gt;": the database-qualified keys only take
 * effect when {@code hoodie.incremental.use.database} is true AND the table config's
 * {@code hoodie.database.name} is non-null and non-empty; otherwise a full scan
 * (all 10 files of commit 100) is returned.
 */
@Test
public void testIncrementalWithDatabaseName() throws IOException {
  // initial commit
  File partitionDir = InputFormatTestUtil.prepareTable(basePath, baseFileFormat, 10, "100");
  createCommitFile(basePath, "100", "2016/05/01");

  // Add the paths
  FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());

  // Incremental properties keyed by "<database>.<table>", with use-database enabled.
  InputFormatTestUtil.setupIncremental(jobConf, "100", 1, HoodieTestUtils.HOODIE_DATABASE, true);

  // Table config without a database name -> qualified keys do not match -> full scan.
  HoodieTableMetaClient metaClient = HoodieTestUtils.init(HoodieTestUtils.getDefaultHadoopConf(), basePath.toString(),
      HoodieTableType.COPY_ON_WRITE, baseFileFormat);
  assertEquals(null, metaClient.getTableConfig().getDatabaseName(),
      "When hoodie.database.name is not set, it should default to null");
  FileStatus[] files = inputFormat.listStatus(jobConf);
  assertEquals(10, files.length,
      "When hoodie.database.name is null, then the incremental query will not take effect");

  // Empty database name behaves the same as null: incremental mode is not applied.
  metaClient = HoodieTestUtils.init(HoodieTestUtils.getDefaultHadoopConf(), basePath.toString(), HoodieTableType.COPY_ON_WRITE,
      baseFileFormat, "");
  assertEquals("", metaClient.getTableConfig().getDatabaseName(),
      "The hoodie.database.name should be empty");
  files = inputFormat.listStatus(jobConf);
  assertEquals(10, files.length,
      "When hoodie.database.name is empty, then the incremental query will not take effect");

  // Matching database name -> incremental mode applies and commit 100 is excluded.
  metaClient = HoodieTestUtils.init(HoodieTestUtils.getDefaultHadoopConf(), basePath.toString(), HoodieTableType.COPY_ON_WRITE,
      baseFileFormat, HoodieTestUtils.HOODIE_DATABASE);
  assertEquals(HoodieTestUtils.HOODIE_DATABASE, metaClient.getTableConfig().getDatabaseName(),
      String.format("The hoodie.database.name should be %s ", HoodieTestUtils.HOODIE_DATABASE));
  files = inputFormat.listStatus(jobConf);
  assertEquals(0, files.length,
      "We should exclude commit 100 when returning incremental pull with start commit time as 100");

  // use-database disabled -> the database-qualified keys are ignored -> full scan.
  InputFormatTestUtil.setupIncremental(jobConf, "100", 1, HoodieTestUtils.HOODIE_DATABASE, false);
  files = inputFormat.listStatus(jobConf);
  assertEquals(10, files.length,
      // NOTE: a separating space is required here; without it the message reads "set,then".
      "When hoodie.incremental.use.database is false and the incremental database name is set, "
          + "then the incremental query will not take effect");

  // The configuration with and without database name exists together
  InputFormatTestUtil.setupIncremental(jobConf, "1", 1, true);
  files = inputFormat.listStatus(jobConf);
  assertEquals(0, files.length,
      "When hoodie.incremental.use.database is true, "
          + "We should exclude commit 100 because the returning incremental pull with start commit time is 100");
  InputFormatTestUtil.setupIncremental(jobConf, "1", 1, false);
  files = inputFormat.listStatus(jobConf);
  assertEquals(10, files.length,
      "When hoodie.incremental.use.database is false, "
          + "We should include commit 100 because the returning incremental pull with start commit time is 1");
}
private void createCommitFile(java.nio.file.Path basePath, String commitNumber, String partitionPath)
@@ -316,7 +398,7 @@ public class TestHoodieHFileInputFormat {
ensureFilesInCommit("Pulling all commits from 100, should get us the 1 files from 200 commit", files, "200", 1);
}
// TODO enable this after enabling predicate pushdown
// TODO enable this after enabling predicate push down
public void testPredicatePushDown() throws IOException {
// initial commit
Schema schema = getSchemaFromResource(TestHoodieHFileInputFormat.class, "/sample1.avsc");
@@ -337,7 +419,7 @@ public class TestHoodieHFileInputFormat {
// check whether we have 2 records at this point
ensureRecordsInCommit("We need to have 2 records that was modified at commit " + commit2 + " and no more", commit2,
2, 2);
// Make sure we have the 10 records if we roll back the stattime
// Make sure we have the 10 records if we roll back the start time
InputFormatTestUtil.setupIncremental(jobConf, "0", 2);
ensureRecordsInCommit("We need to have 8 records that was modified at commit " + commit1 + " and no more", commit1,
8, 10);
@@ -347,19 +429,19 @@ public class TestHoodieHFileInputFormat {
@Test
public void testGetIncrementalTableNames() throws IOException {
String[] expectedincrTables = {"db1.raw_trips", "db2.model_trips", "db3.model_trips"};
String[] expectedIncrTables = {"db1.raw_trips", "db2.model_trips", "db3.model_trips"};
JobConf conf = new JobConf();
String incrementalMode1 = String.format(HoodieHiveUtils.HOODIE_CONSUME_MODE_PATTERN, expectedincrTables[0]);
String incrementalMode1 = String.format(HoodieHiveUtils.HOODIE_CONSUME_MODE_PATTERN, expectedIncrTables[0]);
conf.set(incrementalMode1, HoodieHiveUtils.INCREMENTAL_SCAN_MODE);
String incrementalMode2 = String.format(HoodieHiveUtils.HOODIE_CONSUME_MODE_PATTERN, expectedincrTables[1]);
String incrementalMode2 = String.format(HoodieHiveUtils.HOODIE_CONSUME_MODE_PATTERN, expectedIncrTables[1]);
conf.set(incrementalMode2,HoodieHiveUtils.INCREMENTAL_SCAN_MODE);
String incrementalMode3 = String.format(HoodieHiveUtils.HOODIE_CONSUME_MODE_PATTERN, "db3.model_trips");
conf.set(incrementalMode3, HoodieHiveUtils.INCREMENTAL_SCAN_MODE.toLowerCase());
String defaultmode = String.format(HoodieHiveUtils.HOODIE_CONSUME_MODE_PATTERN, "db3.first_trips");
conf.set(defaultmode, HoodieHiveUtils.DEFAULT_SCAN_MODE);
List<String> actualincrTables = HoodieHiveUtils.getIncrementalTableNames(Job.getInstance(conf));
for (String expectedincrTable : expectedincrTables) {
assertTrue(actualincrTables.contains(expectedincrTable));
String defaultMode = String.format(HoodieHiveUtils.HOODIE_CONSUME_MODE_PATTERN, "db3.first_trips");
conf.set(defaultMode, HoodieHiveUtils.DEFAULT_SCAN_MODE);
List<String> actualIncrTables = HoodieHiveUtils.getIncrementalTableNames(Job.getInstance(conf));
for (String expectedIncrTable : expectedIncrTables) {
assertTrue(actualIncrTables.contains(expectedIncrTable));
}
}

View File

@@ -22,6 +22,7 @@ import org.apache.hudi.avro.model.HoodieCompactionPlan;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.HoodieCommitMetadata;
import org.apache.hudi.common.model.HoodieFileFormat;
import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.common.model.HoodieWriteStat;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
@@ -286,9 +287,90 @@ public class TestHoodieParquetInputFormat {
InputFormatTestUtil.setupIncremental(jobConf, "100", 1);
HoodieTableMetaClient metaClient = HoodieTestUtils.init(HoodieTestUtils.getDefaultHadoopConf(), basePath.toString(),
HoodieTableType.COPY_ON_WRITE, baseFileFormat);
assertEquals(null, metaClient.getTableConfig().getDatabaseName(),
"When hoodie.database.name is not set, it should default to null");
FileStatus[] files = inputFormat.listStatus(jobConf);
assertEquals(0, files.length,
"We should exclude commit 100 when returning incremental pull with start commit time as 100");
InputFormatTestUtil.setupIncremental(jobConf, "100", 1, true);
files = inputFormat.listStatus(jobConf);
assertEquals(0, files.length,
"We should exclude commit 100 when returning incremental pull with start commit time as 100");
metaClient = HoodieTestUtils.init(HoodieTestUtils.getDefaultHadoopConf(), basePath.toString(), HoodieTableType.COPY_ON_WRITE,
baseFileFormat, HoodieTestUtils.HOODIE_DATABASE);
assertEquals(HoodieTestUtils.HOODIE_DATABASE, metaClient.getTableConfig().getDatabaseName(),
String.format("The hoodie.database.name should be %s ", HoodieTestUtils.HOODIE_DATABASE));
files = inputFormat.listStatus(jobConf);
assertEquals(10, files.length,
"When hoodie.incremental.use.database is true and hoodie.database.name is not null or empty"
+ " and the incremental database name is not set, then the incremental query will not take effect");
}
/**
 * Verifies incremental-query behavior when the consume-mode/start-commit/max-commit
 * properties are keyed by "&lt;database&gt;.&lt;table&gt;": the database-qualified keys only take
 * effect when {@code hoodie.incremental.use.database} is true AND the table config's
 * {@code hoodie.database.name} is non-null and non-empty; otherwise a full scan
 * (all 10 files of commit 100) is returned.
 */
@Test
public void testIncrementalWithDatabaseName() throws IOException {
// initial commit
File partitionDir = InputFormatTestUtil.prepareTable(basePath, baseFileFormat, 10, "100");
createCommitFile(basePath, "100", "2016/05/01");
// Add the paths
FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());
// Incremental properties keyed by "<database>.<table>", with use-database enabled.
InputFormatTestUtil.setupIncremental(jobConf, "100", 1, HoodieTestUtils.HOODIE_DATABASE, true);
// Table config without a database name -> qualified keys do not match -> full scan.
HoodieTableMetaClient metaClient = HoodieTestUtils.init(HoodieTestUtils.getDefaultHadoopConf(), basePath.toString(),
HoodieTableType.COPY_ON_WRITE, baseFileFormat);
assertEquals(null, metaClient.getTableConfig().getDatabaseName(),
"When hoodie.database.name is not set, it should default to null");
FileStatus[] files = inputFormat.listStatus(jobConf);
assertEquals(10, files.length,
"When hoodie.database.name is null, then the incremental query will not take effect");
// Empty database name behaves the same as null: incremental mode is not applied.
metaClient = HoodieTestUtils.init(HoodieTestUtils.getDefaultHadoopConf(), basePath.toString(), HoodieTableType.COPY_ON_WRITE,
baseFileFormat, "");
assertEquals("", metaClient.getTableConfig().getDatabaseName(),
"The hoodie.database.name should be empty");
files = inputFormat.listStatus(jobConf);
assertEquals(10, files.length,
"When hoodie.database.name is empty, then the incremental query will not take effect");
// Matching database name -> incremental mode applies and commit 100 is excluded.
metaClient = HoodieTestUtils.init(HoodieTestUtils.getDefaultHadoopConf(), basePath.toString(), HoodieTableType.COPY_ON_WRITE,
baseFileFormat, HoodieTestUtils.HOODIE_DATABASE);
assertEquals(HoodieTestUtils.HOODIE_DATABASE, metaClient.getTableConfig().getDatabaseName(),
String.format("The hoodie.database.name should be %s ", HoodieTestUtils.HOODIE_DATABASE));
files = inputFormat.listStatus(jobConf);
assertEquals(0, files.length,
"We should exclude commit 100 when returning incremental pull with start commit time as 100");
// use-database disabled -> the database-qualified keys are ignored -> full scan.
InputFormatTestUtil.setupIncremental(jobConf, "100", 1, HoodieTestUtils.HOODIE_DATABASE, false);
files = inputFormat.listStatus(jobConf);
assertEquals(10, files.length,
"When hoodie.incremental.use.database is false and the incremental database name is set, "
+ "then the incremental query will not take effect");
// The configuration with and without database name exists together
InputFormatTestUtil.setupIncremental(jobConf, "1", 1, true);
files = inputFormat.listStatus(jobConf);
assertEquals(0, files.length,
"When hoodie.incremental.use.database is true, "
+ "We should exclude commit 100 because the returning incremental pull with start commit time is 100");
InputFormatTestUtil.setupIncremental(jobConf, "1", 1, false);
files = inputFormat.listStatus(jobConf);
assertEquals(10, files.length,
"When hoodie.incremental.use.database is false, "
+ "We should include commit 100 because the returning incremental pull with start commit time is 1");
}
@Test
@@ -429,7 +511,7 @@ public class TestHoodieParquetInputFormat {
ensureFilesInCommit("Pulling all commits from 100, should get us the 1 files from 200 commit", files, "200", 1);
}
@Disabled("enable this after enabling predicate pushdown")
@Disabled("enable this after enabling predicate push down")
@Test
public void testPredicatePushDown() throws IOException {
// initial commit
@@ -451,7 +533,7 @@ public class TestHoodieParquetInputFormat {
// check whether we have 2 records at this point
ensureRecordsInCommit("We need to have 2 records that was modified at commit " + commit2 + " and no more", commit2,
2, 2);
// Make sure we have the 10 records if we roll back the stattime
// Make sure we have the 10 records if we roll back the start time
InputFormatTestUtil.setupIncremental(jobConf, "0", 2);
ensureRecordsInCommit("We need to have 8 records that was modified at commit " + commit1 + " and no more", commit1,
8, 10);
@@ -461,19 +543,19 @@ public class TestHoodieParquetInputFormat {
@Test
public void testGetIncrementalTableNames() throws IOException {
String[] expectedincrTables = {"db1.raw_trips", "db2.model_trips", "db3.model_trips"};
String[] expectedIncrTables = {"db1.raw_trips", "db2.model_trips", "db3.model_trips"};
JobConf conf = new JobConf();
String incrementalMode1 = String.format(HoodieHiveUtils.HOODIE_CONSUME_MODE_PATTERN, expectedincrTables[0]);
String incrementalMode1 = String.format(HoodieHiveUtils.HOODIE_CONSUME_MODE_PATTERN, expectedIncrTables[0]);
conf.set(incrementalMode1, HoodieHiveUtils.INCREMENTAL_SCAN_MODE);
String incrementalMode2 = String.format(HoodieHiveUtils.HOODIE_CONSUME_MODE_PATTERN, expectedincrTables[1]);
String incrementalMode2 = String.format(HoodieHiveUtils.HOODIE_CONSUME_MODE_PATTERN, expectedIncrTables[1]);
conf.set(incrementalMode2, HoodieHiveUtils.INCREMENTAL_SCAN_MODE);
String incrementalMode3 = String.format(HoodieHiveUtils.HOODIE_CONSUME_MODE_PATTERN, "db3.model_trips");
conf.set(incrementalMode3, HoodieHiveUtils.INCREMENTAL_SCAN_MODE.toLowerCase());
String defaultmode = String.format(HoodieHiveUtils.HOODIE_CONSUME_MODE_PATTERN, "db3.first_trips");
conf.set(defaultmode, HoodieHiveUtils.DEFAULT_SCAN_MODE);
List<String> actualincrTables = HoodieHiveUtils.getIncrementalTableNames(Job.getInstance(conf));
for (String expectedincrTable : expectedincrTables) {
assertTrue(actualincrTables.contains(expectedincrTable));
String defaultMode = String.format(HoodieHiveUtils.HOODIE_CONSUME_MODE_PATTERN, "db3.first_trips");
conf.set(defaultMode, HoodieHiveUtils.DEFAULT_SCAN_MODE);
List<String> actualIncrTables = HoodieHiveUtils.getIncrementalTableNames(Job.getInstance(conf));
for (String expectedIncrTable : expectedIncrTables) {
assertTrue(actualIncrTables.contains(expectedIncrTable));
}
}

View File

@@ -130,6 +130,10 @@ public class InputFormatTestUtil {
}
/**
 * Configures {@code jobConf} for an incremental query on the default test table,
 * delegating to the four-argument overload with
 * {@code isIncrementalUseDatabase = false} (table-only property keys).
 */
public static void setupIncremental(JobConf jobConf, String startCommit, int numberOfCommitsToPull) {
setupIncremental(jobConf, startCommit, numberOfCommitsToPull, false);
}
public static void setupIncremental(JobConf jobConf, String startCommit, int numberOfCommitsToPull, boolean isIncrementalUseDatabase) {
String modePropertyName =
String.format(HoodieHiveUtils.HOODIE_CONSUME_MODE_PATTERN, HoodieTestUtils.RAW_TRIPS_TEST_NAME);
jobConf.set(modePropertyName, HoodieHiveUtils.INCREMENTAL_SCAN_MODE);
@@ -141,8 +145,26 @@ public class InputFormatTestUtil {
String maxCommitPulls =
String.format(HoodieHiveUtils.HOODIE_MAX_COMMIT_PATTERN, HoodieTestUtils.RAW_TRIPS_TEST_NAME);
jobConf.setInt(maxCommitPulls, numberOfCommitsToPull);
jobConf.setBoolean(HoodieHiveUtils.HOODIE_INCREMENTAL_USE_DATABASE, isIncrementalUseDatabase);
}
/**
 * Configures {@code jobConf} for an incremental query whose consume-mode, start-commit and
 * max-commit properties are keyed by the database-qualified table name
 * ({@code "<databaseName>." + HoodieTestUtils.RAW_TRIPS_TEST_NAME}), and sets the
 * {@code hoodie.incremental.use.database} flag.
 *
 * @param jobConf                  the job configuration to mutate
 * @param startCommit              start commit timestamp for the incremental pull
 * @param numberOfCommitsToPull    maximum number of commits to pull
 * @param databaseName             database name used to qualify the table in the property keys
 * @param isIncrementalUseDatabase value for {@code HoodieHiveUtils.HOODIE_INCREMENTAL_USE_DATABASE}
 */
public static void setupIncremental(JobConf jobConf, String startCommit, int numberOfCommitsToPull, String databaseName, boolean isIncrementalUseDatabase) {
  // Build the "<database>.<table>" key once instead of repeating the concatenation three times.
  String qualifiedTableName = databaseName + "." + HoodieTestUtils.RAW_TRIPS_TEST_NAME;
  String modePropertyName =
      String.format(HoodieHiveUtils.HOODIE_CONSUME_MODE_PATTERN, qualifiedTableName);
  jobConf.set(modePropertyName, HoodieHiveUtils.INCREMENTAL_SCAN_MODE);
  String startCommitTimestampName =
      String.format(HoodieHiveUtils.HOODIE_START_COMMIT_PATTERN, qualifiedTableName);
  jobConf.set(startCommitTimestampName, startCommit);
  String maxCommitPulls =
      String.format(HoodieHiveUtils.HOODIE_MAX_COMMIT_PATTERN, qualifiedTableName);
  jobConf.setInt(maxCommitPulls, numberOfCommitsToPull);
  jobConf.setBoolean(HoodieHiveUtils.HOODIE_INCREMENTAL_USE_DATABASE, isIncrementalUseDatabase);
}
public static void setupSnapshotIncludePendingCommits(JobConf jobConf, String instantTime) {
setupSnapshotScanMode(jobConf, true);
String validateTimestampName =