[HUDI-2176, 2178, 2179] Adding virtual key support to COW table (#3306)
This commit is contained in:
committed by
GitHub
parent
5353243449
commit
61148c1c43
@@ -24,6 +24,7 @@ import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hudi.avro.model.HoodieClusteringPlan;
|
||||
import org.apache.hudi.avro.model.HoodieRequestedReplaceMetadata;
|
||||
import org.apache.hudi.avro.model.HoodieCleanMetadata;
|
||||
import org.apache.hudi.common.config.TypedProperties;
|
||||
import org.apache.hudi.common.fs.ConsistencyGuardConfig;
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.model.FileSlice;
|
||||
@@ -66,6 +67,9 @@ import org.apache.hudi.exception.HoodieUpsertException;
|
||||
import org.apache.hudi.index.HoodieIndex;
|
||||
import org.apache.hudi.index.HoodieIndex.IndexType;
|
||||
import org.apache.hudi.io.HoodieMergeHandle;
|
||||
import org.apache.hudi.keygen.BaseKeyGenerator;
|
||||
import org.apache.hudi.keygen.KeyGenerator;
|
||||
import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory;
|
||||
import org.apache.hudi.table.HoodieSparkCopyOnWriteTable;
|
||||
import org.apache.hudi.table.HoodieSparkTable;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
@@ -120,6 +124,7 @@ import static org.apache.hudi.config.HoodieClusteringConfig.CLUSTERING_EXECUTION
|
||||
import static org.apache.hudi.testutils.Assertions.assertNoWriteErrors;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertFalse;
|
||||
import static org.junit.jupiter.api.Assertions.assertNull;
|
||||
import static org.junit.jupiter.api.Assertions.assertThrows;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
import static org.junit.jupiter.api.Assertions.fail;
|
||||
@@ -136,8 +141,30 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
|
||||
}
|
||||
};
|
||||
|
||||
private static Stream<Arguments> configParams() {
|
||||
return Arrays.stream(new Boolean[][] {{true},{false}}).map(Arguments::of);
|
||||
private static Stream<Arguments> smallInsertHandlingParams() {
|
||||
return Arrays.stream(new Boolean[][] {{true}, {false}}).map(Arguments::of);
|
||||
}
|
||||
|
||||
private static Stream<Arguments> populateMetaFieldsParams() {
|
||||
return Arrays.stream(new Boolean[][] {{true}, {false}}).map(Arguments::of);
|
||||
}
|
||||
|
||||
private static Stream<Arguments> rollbackFailedCommitsParams() {
|
||||
return Stream.of(
|
||||
Arguments.of(HoodieFailedWritesCleaningPolicy.LAZY, true),
|
||||
Arguments.of(HoodieFailedWritesCleaningPolicy.LAZY, false),
|
||||
Arguments.of(HoodieFailedWritesCleaningPolicy.NEVER, true),
|
||||
Arguments.of(HoodieFailedWritesCleaningPolicy.NEVER, false)
|
||||
);
|
||||
}
|
||||
|
||||
private static Stream<Arguments> rollbackAfterConsistencyCheckFailureParams() {
|
||||
return Stream.of(
|
||||
Arguments.of(true, true),
|
||||
Arguments.of(true, false),
|
||||
Arguments.of(false, true),
|
||||
Arguments.of(false, false)
|
||||
);
|
||||
}
|
||||
|
||||
private HoodieTestTable testTable;
|
||||
@@ -150,50 +177,56 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
|
||||
/**
|
||||
* Test Auto Commit behavior for HoodieWriteClient insert API.
|
||||
*/
|
||||
@Test
|
||||
public void testAutoCommitOnInsert() throws Exception {
|
||||
testAutoCommit(SparkRDDWriteClient::insert, false);
|
||||
@ParameterizedTest
|
||||
@MethodSource("populateMetaFieldsParams")
|
||||
public void testAutoCommitOnInsert(boolean populateMetaFields) throws Exception {
|
||||
testAutoCommit(SparkRDDWriteClient::insert, false, populateMetaFields);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test Auto Commit behavior for HoodieWriteClient insertPrepped API.
|
||||
*/
|
||||
@Test
|
||||
public void testAutoCommitOnInsertPrepped() throws Exception {
|
||||
testAutoCommit(SparkRDDWriteClient::insertPreppedRecords, true);
|
||||
@ParameterizedTest
|
||||
@MethodSource("populateMetaFieldsParams")
|
||||
public void testAutoCommitOnInsertPrepped(boolean populateMetaFields) throws Exception {
|
||||
testAutoCommit(SparkRDDWriteClient::insertPreppedRecords, true, populateMetaFields);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test Auto Commit behavior for HoodieWriteClient upsert API.
|
||||
*/
|
||||
@Test
|
||||
public void testAutoCommitOnUpsert() throws Exception {
|
||||
testAutoCommit(SparkRDDWriteClient::upsert, false);
|
||||
@ParameterizedTest
|
||||
@MethodSource("populateMetaFieldsParams")
|
||||
public void testAutoCommitOnUpsert(boolean populateMetaFields) throws Exception {
|
||||
testAutoCommit(SparkRDDWriteClient::upsert, false, populateMetaFields);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test Auto Commit behavior for HoodieWriteClient upsert Prepped API.
|
||||
*/
|
||||
@Test
|
||||
public void testAutoCommitOnUpsertPrepped() throws Exception {
|
||||
testAutoCommit(SparkRDDWriteClient::upsertPreppedRecords, true);
|
||||
@ParameterizedTest
|
||||
@MethodSource("populateMetaFieldsParams")
|
||||
public void testAutoCommitOnUpsertPrepped(boolean populateMetaFields) throws Exception {
|
||||
testAutoCommit(SparkRDDWriteClient::upsertPreppedRecords, true, populateMetaFields);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test Auto Commit behavior for HoodieWriteClient bulk-insert API.
|
||||
*/
|
||||
@Test
|
||||
public void testAutoCommitOnBulkInsert() throws Exception {
|
||||
testAutoCommit(SparkRDDWriteClient::bulkInsert, false);
|
||||
@ParameterizedTest
|
||||
@MethodSource("populateMetaFieldsParams")
|
||||
public void testAutoCommitOnBulkInsert(boolean populateMetaFields) throws Exception {
|
||||
testAutoCommit(SparkRDDWriteClient::bulkInsert, false, populateMetaFields);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test Auto Commit behavior for HoodieWriteClient bulk-insert prepped API.
|
||||
*/
|
||||
@Test
|
||||
public void testAutoCommitOnBulkInsertPrepped() throws Exception {
|
||||
@ParameterizedTest
|
||||
@MethodSource("populateMetaFieldsParams")
|
||||
public void testAutoCommitOnBulkInsertPrepped(boolean populateMetaFields) throws Exception {
|
||||
testAutoCommit((writeClient, recordRDD, instantTime) -> writeClient.bulkInsertPreppedRecords(recordRDD, instantTime,
|
||||
Option.empty()), true);
|
||||
Option.empty()), true, populateMetaFields);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -203,15 +236,16 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
|
||||
* @throws Exception in case of failure
|
||||
*/
|
||||
private void testAutoCommit(Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieRecord>, String> writeFn,
|
||||
boolean isPrepped) throws Exception {
|
||||
boolean isPrepped, boolean populateMetaFields) throws Exception {
|
||||
// Set autoCommit false
|
||||
HoodieWriteConfig cfg = getConfigBuilder().withAutoCommit(false).build();
|
||||
try (SparkRDDWriteClient client = getHoodieWriteClient(cfg);) {
|
||||
HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder().withAutoCommit(false);
|
||||
addAppropriatePropsForPopulateMetaFields(cfgBuilder, populateMetaFields);
|
||||
try (SparkRDDWriteClient client = getHoodieWriteClient(cfgBuilder.build());) {
|
||||
|
||||
String prevCommitTime = "000";
|
||||
String newCommitTime = "001";
|
||||
int numRecords = 200;
|
||||
JavaRDD<WriteStatus> result = insertFirstBatch(cfg, client, newCommitTime, prevCommitTime, numRecords, writeFn,
|
||||
JavaRDD<WriteStatus> result = insertFirstBatch(cfgBuilder.build(), client, newCommitTime, prevCommitTime, numRecords, writeFn,
|
||||
isPrepped, false, numRecords);
|
||||
|
||||
assertFalse(testTable.commitExists(newCommitTime),
|
||||
@@ -225,25 +259,28 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
|
||||
/**
|
||||
* Test De-duplication behavior for HoodieWriteClient insert API.
|
||||
*/
|
||||
@Test
|
||||
public void testDeduplicationOnInsert() throws Exception {
|
||||
testDeduplication(SparkRDDWriteClient::insert);
|
||||
@ParameterizedTest
|
||||
@MethodSource("populateMetaFieldsParams")
|
||||
public void testDeduplicationOnInsert(boolean populateMetaFields) throws Exception {
|
||||
testDeduplication(SparkRDDWriteClient::insert, populateMetaFields);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test De-duplication behavior for HoodieWriteClient bulk-insert API.
|
||||
*/
|
||||
@Test
|
||||
public void testDeduplicationOnBulkInsert() throws Exception {
|
||||
testDeduplication(SparkRDDWriteClient::bulkInsert);
|
||||
@ParameterizedTest
|
||||
@MethodSource("populateMetaFieldsParams")
|
||||
public void testDeduplicationOnBulkInsert(boolean populateMetaFields) throws Exception {
|
||||
testDeduplication(SparkRDDWriteClient::bulkInsert, populateMetaFields);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test De-duplication behavior for HoodieWriteClient upsert API.
|
||||
*/
|
||||
@Test
|
||||
public void testDeduplicationOnUpsert() throws Exception {
|
||||
testDeduplication(SparkRDDWriteClient::upsert);
|
||||
@ParameterizedTest
|
||||
@MethodSource("populateMetaFieldsParams")
|
||||
public void testDeduplicationOnUpsert(boolean populateMetaFields) throws Exception {
|
||||
testDeduplication(SparkRDDWriteClient::upsert, populateMetaFields);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -253,7 +290,7 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
|
||||
* @throws Exception in case of failure
|
||||
*/
|
||||
private void testDeduplication(
|
||||
Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieRecord>, String> writeFn) throws Exception {
|
||||
Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieRecord>, String> writeFn, boolean populateMetaFields) throws Exception {
|
||||
String newCommitTime = "001";
|
||||
|
||||
String recordKey = UUID.randomUUID().toString();
|
||||
@@ -289,8 +326,11 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
|
||||
|
||||
// Perform write-action and check
|
||||
JavaRDD<HoodieRecord> recordList = jsc.parallelize(Arrays.asList(recordOne, recordTwo, recordThree), 1);
|
||||
try (SparkRDDWriteClient client = getHoodieWriteClient(getConfigBuilder(HoodieFailedWritesCleaningPolicy.LAZY)
|
||||
.combineInput(true, true).build());) {
|
||||
HoodieWriteConfig.Builder configBuilder = getConfigBuilder(HoodieFailedWritesCleaningPolicy.LAZY)
|
||||
.combineInput(true, true);
|
||||
addAppropriatePropsForPopulateMetaFields(configBuilder, populateMetaFields);
|
||||
|
||||
try (SparkRDDWriteClient client = getHoodieWriteClient(configBuilder.build());) {
|
||||
client.startCommitWithTime(newCommitTime);
|
||||
List<WriteStatus> statuses = writeFn.apply(client, recordList, newCommitTime).collect();
|
||||
assertNoWriteErrors(statuses);
|
||||
@@ -321,17 +361,23 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
|
||||
/**
|
||||
* Test Upsert API.
|
||||
*/
|
||||
@Test
|
||||
public void testUpserts() throws Exception {
|
||||
testUpsertsInternal(getConfig(), SparkRDDWriteClient::upsert, false);
|
||||
@ParameterizedTest
|
||||
@MethodSource("populateMetaFieldsParams")
|
||||
public void testUpserts(boolean populateMetaFields) throws Exception {
|
||||
HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder();
|
||||
addAppropriatePropsForPopulateMetaFields(cfgBuilder, populateMetaFields);
|
||||
testUpsertsInternal(cfgBuilder.build(), SparkRDDWriteClient::upsert, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test UpsertPrepped API.
|
||||
*/
|
||||
@Test
|
||||
public void testUpsertsPrepped() throws Exception {
|
||||
testUpsertsInternal(getConfig(), SparkRDDWriteClient::upsertPreppedRecords, true);
|
||||
@ParameterizedTest
|
||||
@MethodSource("populateMetaFieldsParams")
|
||||
public void testUpsertsPrepped(boolean populateMetaFields) throws Exception {
|
||||
HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder();
|
||||
addAppropriatePropsForPopulateMetaFields(cfgBuilder, populateMetaFields);
|
||||
testUpsertsInternal(cfgBuilder.build(), SparkRDDWriteClient::upsertPreppedRecords, true);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -348,10 +394,12 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
|
||||
HoodieWriteConfig hoodieWriteConfig = getConfigBuilder(HoodieFailedWritesCleaningPolicy.LAZY)
|
||||
.withProps(config.getProps()).withTimelineLayoutVersion(
|
||||
VERSION_0).build();
|
||||
|
||||
HoodieTableMetaClient.withPropertyBuilder()
|
||||
.fromMetaClient(metaClient)
|
||||
.setTimelineLayoutVersion(VERSION_0)
|
||||
.initTable(metaClient.getHadoopConf(), metaClient.getBasePath());
|
||||
.setPopulateMetaFields(config.populateMetaFields())
|
||||
.initTable(metaClient.getHadoopConf(), metaClient.getBasePath());
|
||||
|
||||
SparkRDDWriteClient client = getHoodieWriteClient(hoodieWriteConfig);
|
||||
|
||||
@@ -360,7 +408,7 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
|
||||
String initCommitTime = "000";
|
||||
int numRecords = 200;
|
||||
insertFirstBatch(hoodieWriteConfig, client, newCommitTime, initCommitTime, numRecords, SparkRDDWriteClient::insert,
|
||||
isPrepped, true, numRecords);
|
||||
isPrepped, true, numRecords, config.populateMetaFields());
|
||||
|
||||
// Write 2 (updates)
|
||||
String prevCommitTime = newCommitTime;
|
||||
@@ -369,7 +417,7 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
|
||||
String commitTimeBetweenPrevAndNew = "002";
|
||||
updateBatch(hoodieWriteConfig, client, newCommitTime, prevCommitTime,
|
||||
Option.of(Arrays.asList(commitTimeBetweenPrevAndNew)), initCommitTime, numRecords, writeFn, isPrepped, true,
|
||||
numRecords, 200, 2);
|
||||
numRecords, 200, 2, config.populateMetaFields());
|
||||
|
||||
// Delete 1
|
||||
prevCommitTime = newCommitTime;
|
||||
@@ -378,7 +426,7 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
|
||||
|
||||
deleteBatch(hoodieWriteConfig, client, newCommitTime, prevCommitTime,
|
||||
initCommitTime, numRecords, SparkRDDWriteClient::delete, isPrepped, true,
|
||||
0, 150);
|
||||
0, 150, config.populateMetaFields());
|
||||
|
||||
// Now simulate an upgrade and perform a restore operation
|
||||
HoodieWriteConfig newConfig = getConfigBuilder().withProps(config.getProps()).withTimelineLayoutVersion(
|
||||
@@ -440,7 +488,9 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
|
||||
|
||||
try {
|
||||
HoodieMergeHandle handle = new HoodieMergeHandle(cfg, instantTime, table, new HashMap<>(),
|
||||
partitionPath, FSUtils.getFileId(baseFilePath.getName()), baseFile, new SparkTaskContextSupplier());
|
||||
partitionPath, FSUtils.getFileId(baseFilePath.getName()), baseFile, new SparkTaskContextSupplier(),
|
||||
config.populateMetaFields() ? Option.empty() :
|
||||
Option.of((BaseKeyGenerator) HoodieSparkKeyGeneratorFactory.createKeyGenerator(new TypedProperties(config.getProps()))));
|
||||
WriteStatus writeStatus = new WriteStatus(false, 0.0);
|
||||
writeStatus.setStat(new HoodieWriteStat());
|
||||
writeStatus.getStat().setNumWrites(0);
|
||||
@@ -454,7 +504,9 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
|
||||
cfg.getProps().setProperty("hoodie.merge.data.validation.enabled", "true");
|
||||
HoodieWriteConfig cfg2 = HoodieWriteConfig.newBuilder().withProps(cfg.getProps()).build();
|
||||
HoodieMergeHandle handle = new HoodieMergeHandle(cfg2, newInstantTime, table, new HashMap<>(),
|
||||
partitionPath, FSUtils.getFileId(baseFilePath.getName()), baseFile, new SparkTaskContextSupplier());
|
||||
partitionPath, FSUtils.getFileId(baseFilePath.getName()), baseFile, new SparkTaskContextSupplier(),
|
||||
config.populateMetaFields() ? Option.empty() :
|
||||
Option.of((BaseKeyGenerator) HoodieSparkKeyGeneratorFactory.createKeyGenerator(new TypedProperties(config.getProps()))));
|
||||
WriteStatus writeStatus = new WriteStatus(false, 0.0);
|
||||
writeStatus.setStat(new HoodieWriteStat());
|
||||
writeStatus.getStat().setNumWrites(0);
|
||||
@@ -470,17 +522,23 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
|
||||
/**
|
||||
* Test Insert API for HoodieConcatHandle.
|
||||
*/
|
||||
@Test
|
||||
public void testInsertsWithHoodieConcatHandle() throws Exception {
|
||||
testHoodieConcatHandle(getConfig(), false);
|
||||
@ParameterizedTest
|
||||
@MethodSource("populateMetaFieldsParams")
|
||||
public void testInsertsWithHoodieConcatHandle(boolean populateMetaFields) throws Exception {
|
||||
HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder();
|
||||
addAppropriatePropsForPopulateMetaFields(cfgBuilder, populateMetaFields);
|
||||
testHoodieConcatHandle(cfgBuilder.build(), false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test InsertPrepped API for HoodieConcatHandle.
|
||||
*/
|
||||
@Test
|
||||
public void testInsertsPreppedWithHoodieConcatHandle() throws Exception {
|
||||
testHoodieConcatHandle(getConfig(), true);
|
||||
@ParameterizedTest
|
||||
@MethodSource("populateMetaFieldsParams")
|
||||
public void testInsertsPreppedWithHoodieConcatHandle(boolean populateMetaFields) throws Exception {
|
||||
HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder();
|
||||
addAppropriatePropsForPopulateMetaFields(cfgBuilder, populateMetaFields);
|
||||
testHoodieConcatHandle(cfgBuilder.build(), true);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -507,7 +565,7 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
|
||||
String initCommitTime = "000";
|
||||
int numRecords = 200;
|
||||
insertFirstBatch(hoodieWriteConfig, client, newCommitTime, initCommitTime, numRecords, SparkRDDWriteClient::insert,
|
||||
isPrepped, true, numRecords);
|
||||
isPrepped, true, numRecords, config.populateMetaFields());
|
||||
|
||||
// Write 2 (updates)
|
||||
String prevCommitTime = newCommitTime;
|
||||
@@ -520,15 +578,18 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
|
||||
|
||||
writeBatch(client, newCommitTime, prevCommitTime, Option.of(Arrays.asList(commitTimeBetweenPrevAndNew)), initCommitTime,
|
||||
numRecords, recordGenFunction, SparkRDDWriteClient::insert, true, numRecords, 300,
|
||||
2, false);
|
||||
2, false, config.populateMetaFields());
|
||||
}
|
||||
|
||||
/**
|
||||
* Tests deletion of records.
|
||||
*/
|
||||
@Test
|
||||
public void testDeletes() throws Exception {
|
||||
SparkRDDWriteClient client = getHoodieWriteClient(getConfigBuilder(HoodieFailedWritesCleaningPolicy.LAZY).build());
|
||||
@ParameterizedTest
|
||||
@MethodSource("populateMetaFieldsParams")
|
||||
public void testDeletes(boolean populateMetaFields) throws Exception {
|
||||
HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder(HoodieFailedWritesCleaningPolicy.LAZY);
|
||||
addAppropriatePropsForPopulateMetaFields(cfgBuilder, populateMetaFields);
|
||||
SparkRDDWriteClient client = getHoodieWriteClient(cfgBuilder.build());
|
||||
/**
|
||||
* Write 1 (inserts and deletes) Write actual 200 insert records and ignore 100 delete records
|
||||
*/
|
||||
@@ -547,7 +608,8 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
|
||||
};
|
||||
writeBatch(client, newCommitTime, initCommitTime, Option.empty(), initCommitTime,
|
||||
// unused as genFn uses hard-coded number of inserts/updates/deletes
|
||||
-1, recordGenFunction, SparkRDDWriteClient::upsert, true, 200, 200, 1, false);
|
||||
-1, recordGenFunction, SparkRDDWriteClient::upsert, true, 200, 200, 1, false,
|
||||
populateMetaFields);
|
||||
|
||||
/**
|
||||
* Write 2 (deletes+writes).
|
||||
@@ -564,7 +626,8 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
|
||||
return recordsInSecondBatch;
|
||||
};
|
||||
writeBatch(client, newCommitTime, prevCommitTime, Option.empty(), initCommitTime, 100, recordGenFunction,
|
||||
SparkRDDWriteClient::upsert, true, 50, 150, 2, false);
|
||||
SparkRDDWriteClient::upsert, true, 50, 150, 2, false,
|
||||
populateMetaFields);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -572,9 +635,12 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
|
||||
* not be available in read path.
|
||||
* @throws Exception
|
||||
*/
|
||||
@Test
|
||||
public void testDeletesForInsertsInSameBatch() throws Exception {
|
||||
SparkRDDWriteClient client = getHoodieWriteClient(getConfigBuilder(HoodieFailedWritesCleaningPolicy.LAZY).build());
|
||||
@ParameterizedTest
|
||||
@MethodSource("populateMetaFieldsParams")
|
||||
public void testDeletesForInsertsInSameBatch(boolean populateMetaFields) throws Exception {
|
||||
HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder(HoodieFailedWritesCleaningPolicy.LAZY);
|
||||
addAppropriatePropsForPopulateMetaFields(cfgBuilder, populateMetaFields);
|
||||
SparkRDDWriteClient client = getHoodieWriteClient(cfgBuilder.build());
|
||||
/**
|
||||
* Write 200 inserts and issue deletes to a subset(50) of inserts.
|
||||
*/
|
||||
@@ -593,7 +659,8 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
|
||||
};
|
||||
|
||||
writeBatch(client, newCommitTime, initCommitTime, Option.empty(), initCommitTime,
|
||||
-1, recordGenFunction, SparkRDDWriteClient::upsert, true, 150, 150, 1, false);
|
||||
-1, recordGenFunction, SparkRDDWriteClient::upsert, true, 150, 150, 1, false,
|
||||
populateMetaFields);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -793,7 +860,7 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
|
||||
Properties props = new Properties();
|
||||
props.setProperty(ASYNC_CLUSTERING_ENABLE_OPT_KEY.key(), "true");
|
||||
HoodieWriteConfig config = getSmallInsertWriteConfig(100,
|
||||
TRIP_EXAMPLE_SCHEMA, dataGen.getEstimatedFileSizeInBytes(150), props);
|
||||
TRIP_EXAMPLE_SCHEMA, dataGen.getEstimatedFileSizeInBytes(150), true, props);
|
||||
SparkRDDWriteClient client = getHoodieWriteClient(config);
|
||||
HoodieSparkCopyOnWriteTable table = (HoodieSparkCopyOnWriteTable) HoodieSparkTable.create(config, context, metaClient);
|
||||
|
||||
@@ -847,7 +914,10 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
|
||||
final String testPartitionPath = "2016/09/26";
|
||||
final int insertSplitLimit = 100;
|
||||
// setup the small file handling params
|
||||
HoodieWriteConfig config = getSmallInsertWriteConfig(insertSplitLimit); // hold upto 200 records max
|
||||
// hold upto 200 records max
|
||||
HoodieWriteConfig config = getSmallInsertWriteConfig(insertSplitLimit,
|
||||
TRIP_EXAMPLE_SCHEMA, dataGen.getEstimatedFileSizeInBytes(150));
|
||||
|
||||
dataGen = new HoodieTestDataGenerator(new String[] {testPartitionPath});
|
||||
SparkRDDWriteClient client = getHoodieWriteClient(config);
|
||||
BaseFileUtils fileUtils = BaseFileUtils.getInstance(metaClient);
|
||||
@@ -954,11 +1024,12 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
|
||||
* Test scenario of new file-group getting added during insert().
|
||||
*/
|
||||
@ParameterizedTest
|
||||
@MethodSource("configParams")
|
||||
@MethodSource("smallInsertHandlingParams")
|
||||
public void testSmallInsertHandlingForInserts(boolean mergeAllowDuplicateInserts) throws Exception {
|
||||
final String testPartitionPath = "2016/09/26";
|
||||
final int insertSplitLimit = 100;
|
||||
// setup the small file handling params
|
||||
|
||||
HoodieWriteConfig config = getSmallInsertWriteConfig(insertSplitLimit, false, mergeAllowDuplicateInserts); // hold upto 200 records max
|
||||
dataGen = new HoodieTestDataGenerator(new String[] {testPartitionPath});
|
||||
SparkRDDWriteClient client = getHoodieWriteClient(config);
|
||||
@@ -1039,7 +1110,9 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
|
||||
final String testPartitionPath = "2016/09/26";
|
||||
final int insertSplitLimit = 100;
|
||||
// setup the small file handling params
|
||||
HoodieWriteConfig config = getSmallInsertWriteConfig(insertSplitLimit); // hold upto 200 records max
|
||||
// hold upto 200 records max
|
||||
HoodieWriteConfig config = getSmallInsertWriteConfig(insertSplitLimit,
|
||||
TRIP_EXAMPLE_SCHEMA, dataGen.getEstimatedFileSizeInBytes(150));
|
||||
dataGen = new HoodieTestDataGenerator(new String[] {testPartitionPath});
|
||||
|
||||
SparkRDDWriteClient client = getHoodieWriteClient(config);
|
||||
@@ -1100,31 +1173,34 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
|
||||
testDeletes(client, updateBatch3.getRight(), 10, file1, "007", 140, keysSoFar);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSimpleClustering() throws Exception {
|
||||
@ParameterizedTest
|
||||
@MethodSource("populateMetaFieldsParams")
|
||||
public void testSimpleClustering(boolean populateMetaFields) throws Exception {
|
||||
// setup clustering config
|
||||
HoodieClusteringConfig clusteringConfig = HoodieClusteringConfig.newBuilder().withClusteringMaxNumGroups(10)
|
||||
.withClusteringTargetPartitions(0).withInlineClusteringNumCommits(1).build();
|
||||
testClustering(clusteringConfig);
|
||||
testClustering(clusteringConfig, populateMetaFields);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testClusteringWithSortColumns() throws Exception {
|
||||
@ParameterizedTest
|
||||
@MethodSource("populateMetaFieldsParams")
|
||||
public void testClusteringWithSortColumns(boolean populateMetaFields) throws Exception {
|
||||
// setup clustering config
|
||||
HoodieClusteringConfig clusteringConfig = HoodieClusteringConfig.newBuilder().withClusteringMaxNumGroups(10)
|
||||
.withClusteringSortColumns("_hoodie_record_key")
|
||||
.withClusteringSortColumns(populateMetaFields ? "_hoodie_record_key" : "_row_key")
|
||||
.withClusteringTargetPartitions(0).withInlineClusteringNumCommits(1).build();
|
||||
testClustering(clusteringConfig);
|
||||
testClustering(clusteringConfig, populateMetaFields);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testPendingClusteringRollback() throws Exception {
|
||||
@ParameterizedTest
|
||||
@MethodSource("populateMetaFieldsParams")
|
||||
public void testPendingClusteringRollback(boolean populateMetaFields) throws Exception {
|
||||
// setup clustering config
|
||||
HoodieClusteringConfig clusteringConfig = HoodieClusteringConfig.newBuilder().withClusteringMaxNumGroups(10)
|
||||
.withClusteringTargetPartitions(0).withInlineClusteringNumCommits(1).build();
|
||||
|
||||
// start clustering, but dont commit
|
||||
List<HoodieRecord> allRecords = testClustering(clusteringConfig, false);
|
||||
// start clustering, but don't commit
|
||||
List<HoodieRecord> allRecords = testClustering(clusteringConfig, populateMetaFields);
|
||||
HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(basePath).build();
|
||||
List<Pair<HoodieInstant, HoodieClusteringPlan>> pendingClusteringPlans =
|
||||
ClusteringUtils.getAllPendingClusteringPlans(metaClient).collect(Collectors.toList());
|
||||
@@ -1132,7 +1208,9 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
|
||||
HoodieInstant pendingClusteringInstant = pendingClusteringPlans.get(0).getLeft();
|
||||
|
||||
// complete another commit after pending clustering
|
||||
HoodieWriteConfig config = getConfigBuilder(HoodieFailedWritesCleaningPolicy.EAGER).build();
|
||||
HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder(HoodieFailedWritesCleaningPolicy.EAGER);
|
||||
addAppropriatePropsForPopulateMetaFields(cfgBuilder, populateMetaFields);
|
||||
HoodieWriteConfig config = cfgBuilder.build();
|
||||
SparkRDDWriteClient client = getHoodieWriteClient(config);
|
||||
dataGen = new HoodieTestDataGenerator();
|
||||
String commitTime = HoodieActiveTimeline.createNewInstantTime();
|
||||
@@ -1146,13 +1224,14 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
|
||||
assertEquals(0, ClusteringUtils.getAllPendingClusteringPlans(metaClient).count());
|
||||
}
|
||||
|
||||
private List<HoodieRecord> testClustering(HoodieClusteringConfig clusteringConfig) throws Exception {
|
||||
return testClustering(clusteringConfig, false);
|
||||
private List<HoodieRecord> testClustering(HoodieClusteringConfig clusteringConfig, boolean populateMetaFields) throws Exception {
|
||||
return testClustering(clusteringConfig, false, populateMetaFields);
|
||||
}
|
||||
|
||||
private List<HoodieRecord> testClustering(HoodieClusteringConfig clusteringConfig, boolean completeClustering) throws Exception {
|
||||
private List<HoodieRecord> testClustering(HoodieClusteringConfig clusteringConfig, boolean completeClustering, boolean populateMetaFields) throws Exception {
|
||||
// create config to not update small files.
|
||||
HoodieWriteConfig config = getSmallInsertWriteConfig(2000, false, 10);
|
||||
HoodieWriteConfig config = getSmallInsertWriteConfig(2000, TRIP_EXAMPLE_SCHEMA, 10, false, populateMetaFields,
|
||||
populateMetaFields ? new Properties() : getPropertiesForKeyGen());
|
||||
SparkRDDWriteClient client = getHoodieWriteClient(config);
|
||||
dataGen = new HoodieTestDataGenerator();
|
||||
String commitTime = HoodieActiveTimeline.createNewInstantTime();
|
||||
@@ -1170,14 +1249,15 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
|
||||
assertEquals(0, fileIdIntersection.size());
|
||||
|
||||
config = getConfigBuilder(HoodieFailedWritesCleaningPolicy.LAZY).withAutoCommit(completeClustering)
|
||||
.withClusteringConfig(clusteringConfig).build();
|
||||
.withClusteringConfig(clusteringConfig)
|
||||
.withProps(populateMetaFields ? new Properties() : getPropertiesForKeyGen()).build();
|
||||
|
||||
// create client with new config.
|
||||
client = getHoodieWriteClient(config);
|
||||
String clusteringCommitTime = client.scheduleClustering(Option.empty()).get().toString();
|
||||
HoodieWriteMetadata<JavaRDD<WriteStatus>> clusterMetadata = client.cluster(clusteringCommitTime, completeClustering);
|
||||
List<HoodieRecord> allRecords = Stream.concat(records1.stream(), records2.stream()).collect(Collectors.toList());
|
||||
verifyRecordsWritten(clusteringCommitTime, allRecords, clusterMetadata.getWriteStatuses().collect());
|
||||
verifyRecordsWritten(clusteringCommitTime, allRecords, clusterMetadata.getWriteStatuses().collect(), config);
|
||||
Set<HoodieFileGroupId> insertedFileIds = new HashSet<>();
|
||||
insertedFileIds.addAll(fileIds1);
|
||||
insertedFileIds.addAll(fileIds2);
|
||||
@@ -1197,25 +1277,28 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
|
||||
/**
|
||||
* Test scenario of writing more file groups than existing number of file groups in partition.
|
||||
*/
|
||||
@Test
|
||||
public void testInsertOverwritePartitionHandlingWithMoreRecords() throws Exception {
|
||||
verifyInsertOverwritePartitionHandling(1000, 3000);
|
||||
@ParameterizedTest
|
||||
@MethodSource("populateMetaFieldsParams")
|
||||
public void testInsertOverwritePartitionHandlingWithMoreRecords(boolean populateMetaFields) throws Exception {
|
||||
verifyInsertOverwritePartitionHandling(1000, 3000, populateMetaFields);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test scenario of writing fewer file groups than existing number of file groups in partition.
|
||||
*/
|
||||
@Test
|
||||
public void testInsertOverwritePartitionHandlingWithFewerRecords() throws Exception {
|
||||
verifyInsertOverwritePartitionHandling(3000, 1000);
|
||||
@ParameterizedTest
|
||||
@MethodSource("populateMetaFieldsParams")
|
||||
public void testInsertOverwritePartitionHandlingWithFewerRecords(boolean populateMetaFields) throws Exception {
|
||||
verifyInsertOverwritePartitionHandling(3000, 1000, populateMetaFields);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test scenario of writing similar number file groups in partition.
|
||||
*/
|
||||
@Test
|
||||
public void testInsertOverwritePartitionHandlingWithSimilarNumberOfRecords() throws Exception {
|
||||
verifyInsertOverwritePartitionHandling(3000, 3000);
|
||||
@ParameterizedTest
|
||||
@MethodSource("populateMetaFieldsParams")
|
||||
public void testInsertOverwritePartitionHandlingWithSimilarNumberOfRecords(boolean populateMetaFields) throws Exception {
|
||||
verifyInsertOverwritePartitionHandling(3000, 3000, populateMetaFields);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -1224,9 +1307,11 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
|
||||
*
|
||||
* Verify that all records in step1 are overwritten
|
||||
*/
|
||||
private void verifyInsertOverwritePartitionHandling(int batch1RecordsCount, int batch2RecordsCount) throws Exception {
|
||||
private void verifyInsertOverwritePartitionHandling(int batch1RecordsCount, int batch2RecordsCount, boolean populateMetaFields) throws Exception {
|
||||
final String testPartitionPath = "americas";
|
||||
HoodieWriteConfig config = getSmallInsertWriteConfig(2000, false);
|
||||
HoodieWriteConfig config = getSmallInsertWriteConfig(2000,
|
||||
TRIP_EXAMPLE_SCHEMA, dataGen.getEstimatedFileSizeInBytes(150), populateMetaFields, populateMetaFields
|
||||
? new Properties() : getPropertiesForKeyGen());
|
||||
SparkRDDWriteClient client = getHoodieWriteClient(config);
|
||||
dataGen = new HoodieTestDataGenerator(new String[] {testPartitionPath});
|
||||
|
||||
@@ -1247,7 +1332,7 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
|
||||
assertNoWriteErrors(statuses);
|
||||
|
||||
assertEquals(batch1Buckets, new HashSet<>(writeResult.getPartitionToReplaceFileIds().get(testPartitionPath)));
|
||||
verifyRecordsWritten(commitTime2, inserts2, statuses);
|
||||
verifyRecordsWritten(commitTime2, inserts2, statuses, config);
|
||||
}
|
||||
|
||||
private Set<String> getFileIdsFromWriteStatus(List<WriteStatus> statuses) {
|
||||
@@ -1257,35 +1342,38 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
|
||||
/**
|
||||
* Test scenario of writing fewer file groups for first partition than second an third partition.
|
||||
*/
|
||||
@Test
|
||||
public void verifyDeletePartitionsHandlingWithFewerRecordsFirstPartition() throws Exception {
|
||||
verifyDeletePartitionsHandling(1000, 3000, 3000);
|
||||
@ParameterizedTest
|
||||
@MethodSource("populateMetaFieldsParams")
|
||||
public void verifyDeletePartitionsHandlingWithFewerRecordsFirstPartition(boolean populateMetaFields) throws Exception {
|
||||
verifyDeletePartitionsHandling(1000, 3000, 3000, populateMetaFields);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test scenario of writing similar number file groups in partition.
|
||||
*/
|
||||
@Test
|
||||
public void verifyDeletePartitionsHandlingWithSimilarNumberOfRecords() throws Exception {
|
||||
verifyDeletePartitionsHandling(3000, 3000, 3000);
|
||||
@ParameterizedTest
|
||||
@MethodSource("populateMetaFieldsParams")
|
||||
public void verifyDeletePartitionsHandlingWithSimilarNumberOfRecords(boolean populateMetaFields) throws Exception {
|
||||
verifyDeletePartitionsHandling(3000, 3000, 3000, populateMetaFields);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test scenario of writing more file groups for first partition than second an third partition.
|
||||
*/
|
||||
@Test
|
||||
public void verifyDeletePartitionsHandlingHandlingWithFewerRecordsSecondThirdPartition() throws Exception {
|
||||
verifyDeletePartitionsHandling(3000, 1000, 1000);
|
||||
@ParameterizedTest
|
||||
@MethodSource("populateMetaFieldsParams")
|
||||
public void verifyDeletePartitionsHandlingHandlingWithFewerRecordsSecondThirdPartition(boolean populateMetaFields) throws Exception {
|
||||
verifyDeletePartitionsHandling(3000, 1000, 1000, populateMetaFields);
|
||||
}
|
||||
|
||||
private Set<String> insertPartitionRecordsWithCommit(SparkRDDWriteClient client, int recordsCount, String commitTime1, String partitionPath) {
|
||||
private Set<String> insertPartitionRecordsWithCommit(SparkRDDWriteClient client, int recordsCount, String commitTime1, String partitionPath) throws IOException {
|
||||
client.startCommitWithTime(commitTime1);
|
||||
List<HoodieRecord> inserts1 = dataGen.generateInsertsForPartition(commitTime1, recordsCount, partitionPath);
|
||||
JavaRDD<HoodieRecord> insertRecordsRDD1 = jsc.parallelize(inserts1, 2);
|
||||
List<WriteStatus> statuses = client.upsert(insertRecordsRDD1, commitTime1).collect();
|
||||
assertNoWriteErrors(statuses);
|
||||
Set<String> batchBuckets = statuses.stream().map(s -> s.getFileId()).collect(Collectors.toSet());
|
||||
verifyRecordsWritten(commitTime1, inserts1, statuses);
|
||||
verifyRecordsWritten(commitTime1, inserts1, statuses, client.config);
|
||||
return batchBuckets;
|
||||
}
|
||||
|
||||
@@ -1306,8 +1394,11 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
|
||||
* 5) delete second and third partition and check result.
|
||||
*
|
||||
*/
|
||||
private void verifyDeletePartitionsHandling(int batch1RecordsCount, int batch2RecordsCount, int batch3RecordsCount) throws Exception {
|
||||
HoodieWriteConfig config = getSmallInsertWriteConfig(2000, false);
|
||||
private void verifyDeletePartitionsHandling(int batch1RecordsCount, int batch2RecordsCount, int batch3RecordsCount,
|
||||
boolean populateMetaFields) throws Exception {
|
||||
HoodieWriteConfig config = getSmallInsertWriteConfig(2000,
|
||||
TRIP_EXAMPLE_SCHEMA, dataGen.getEstimatedFileSizeInBytes(150), populateMetaFields, populateMetaFields
|
||||
? new Properties() : getPropertiesForKeyGen());
|
||||
SparkRDDWriteClient client = getHoodieWriteClient(config);
|
||||
dataGen = new HoodieTestDataGenerator();
|
||||
|
||||
@@ -1360,7 +1451,8 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
|
||||
/**
|
||||
* Verify data in base files matches expected records and commit time.
|
||||
*/
|
||||
private void verifyRecordsWritten(String commitTime, List<HoodieRecord> expectedRecords, List<WriteStatus> allStatus) {
|
||||
private void verifyRecordsWritten(String commitTime, List<HoodieRecord> expectedRecords, List<WriteStatus> allStatus,
|
||||
HoodieWriteConfig config) throws IOException {
|
||||
List<GenericRecord> records = new ArrayList<>();
|
||||
for (WriteStatus status : allStatus) {
|
||||
Path filePath = new Path(basePath, status.getStat().getPath());
|
||||
@@ -1369,20 +1461,29 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
|
||||
|
||||
Set<String> expectedKeys = recordsToRecordKeySet(expectedRecords);
|
||||
assertEquals(records.size(), expectedKeys.size());
|
||||
for (GenericRecord record : records) {
|
||||
String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
|
||||
assertEquals(commitTime,
|
||||
record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString());
|
||||
assertTrue(expectedKeys.contains(recordKey));
|
||||
if (config.populateMetaFields()) {
|
||||
for (GenericRecord record : records) {
|
||||
String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
|
||||
assertEquals(commitTime,
|
||||
record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString());
|
||||
assertTrue(expectedKeys.contains(recordKey));
|
||||
}
|
||||
} else {
|
||||
KeyGenerator keyGenerator = HoodieSparkKeyGeneratorFactory.createKeyGenerator(new TypedProperties(config.getProps()));
|
||||
for (GenericRecord record : records) {
|
||||
String recordKey = keyGenerator.getKey(record).getRecordKey();
|
||||
assertNull(record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD));
|
||||
assertTrue(expectedKeys.contains(recordKey));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private List<WriteStatus> writeAndVerifyBatch(SparkRDDWriteClient client, List<HoodieRecord> inserts, String commitTime) {
|
||||
private List<WriteStatus> writeAndVerifyBatch(SparkRDDWriteClient client, List<HoodieRecord> inserts, String commitTime) throws IOException {
|
||||
client.startCommitWithTime(commitTime);
|
||||
JavaRDD<HoodieRecord> insertRecordsRDD1 = jsc.parallelize(inserts, 2);
|
||||
List<WriteStatus> statuses = client.upsert(insertRecordsRDD1, commitTime).collect();
|
||||
assertNoWriteErrors(statuses);
|
||||
verifyRecordsWritten(commitTime, inserts, statuses);
|
||||
verifyRecordsWritten(commitTime, inserts, statuses, client.config);
|
||||
return statuses;
|
||||
}
|
||||
|
||||
@@ -1449,12 +1550,15 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
|
||||
/**
|
||||
* Test delete with delete api.
|
||||
*/
|
||||
@Test
|
||||
public void testDeletesWithoutInserts() {
|
||||
@ParameterizedTest
|
||||
@MethodSource("populateMetaFieldsParams")
|
||||
public void testDeletesWithoutInserts(boolean populateMetaFields) {
|
||||
final String testPartitionPath = "2016/09/26";
|
||||
final int insertSplitLimit = 100;
|
||||
// setup the small file handling params
|
||||
HoodieWriteConfig config = getSmallInsertWriteConfig(insertSplitLimit, true); // hold upto 200 records max
|
||||
HoodieWriteConfig config = getSmallInsertWriteConfig(insertSplitLimit,
|
||||
TRIP_EXAMPLE_SCHEMA, dataGen.getEstimatedFileSizeInBytes(150), populateMetaFields, populateMetaFields
|
||||
? new Properties() : getPropertiesForKeyGen());
|
||||
dataGen = new HoodieTestDataGenerator(new String[] {testPartitionPath});
|
||||
SparkRDDWriteClient client = getHoodieWriteClient(config);
|
||||
|
||||
@@ -1473,13 +1577,15 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
|
||||
/**
|
||||
* Test to ensure commit metadata points to valid files.
|
||||
*/
|
||||
@Test
|
||||
public void testCommitWritesRelativePaths() throws Exception {
|
||||
@ParameterizedTest
|
||||
@MethodSource("populateMetaFieldsParams")
|
||||
public void testCommitWritesRelativePaths(boolean populateMetaFields) throws Exception {
|
||||
|
||||
HoodieWriteConfig cfg = getConfigBuilder().withAutoCommit(false).build();
|
||||
try (SparkRDDWriteClient client = getHoodieWriteClient(cfg);) {
|
||||
HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder().withAutoCommit(false);
|
||||
addAppropriatePropsForPopulateMetaFields(cfgBuilder, populateMetaFields);
|
||||
try (SparkRDDWriteClient client = getHoodieWriteClient(cfgBuilder.build());) {
|
||||
HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(basePath).build();
|
||||
HoodieSparkTable table = HoodieSparkTable.create(cfg, context, metaClient);
|
||||
HoodieSparkTable table = HoodieSparkTable.create(cfgBuilder.build(), context, metaClient);
|
||||
|
||||
String instantTime = "000";
|
||||
client.startCommitWithTime(instantTime);
|
||||
@@ -1518,9 +1624,12 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
|
||||
/**
|
||||
* Test to ensure commit metadata points to valid files.10.
|
||||
*/
|
||||
@Test
|
||||
public void testMetadataStatsOnCommit() throws Exception {
|
||||
HoodieWriteConfig cfg = getConfigBuilder().withAutoCommit(false).build();
|
||||
@ParameterizedTest
|
||||
@MethodSource("populateMetaFieldsParams")
|
||||
public void testMetadataStatsOnCommit(boolean populateMetaFields) throws Exception {
|
||||
HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder().withAutoCommit(false);
|
||||
addAppropriatePropsForPopulateMetaFields(cfgBuilder, populateMetaFields);
|
||||
HoodieWriteConfig cfg = cfgBuilder.build();
|
||||
SparkRDDWriteClient client = getHoodieWriteClient(cfg);
|
||||
|
||||
String instantTime0 = "000";
|
||||
@@ -1607,16 +1716,24 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
|
||||
}
|
||||
}
|
||||
|
||||
private void testRollbackAfterConsistencyCheckFailureUsingFileList(boolean rollbackUsingMarkers, boolean enableOptimisticConsistencyGuard) throws Exception {
|
||||
private void testRollbackAfterConsistencyCheckFailureUsingFileList(boolean rollbackUsingMarkers, boolean enableOptimisticConsistencyGuard,
|
||||
boolean populateMetaFields) throws Exception {
|
||||
String instantTime = "000";
|
||||
HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(basePath).build();
|
||||
|
||||
Properties properties = new Properties();
|
||||
if (!populateMetaFields) {
|
||||
properties = getPropertiesForKeyGen();
|
||||
}
|
||||
|
||||
HoodieWriteConfig cfg = !enableOptimisticConsistencyGuard ? getConfigBuilder().withRollbackUsingMarkers(rollbackUsingMarkers).withAutoCommit(false)
|
||||
.withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(true)
|
||||
.withMaxConsistencyCheckIntervalMs(1).withInitialConsistencyCheckIntervalMs(1).withEnableOptimisticConsistencyGuard(enableOptimisticConsistencyGuard).build()).build() :
|
||||
getConfigBuilder().withRollbackUsingMarkers(rollbackUsingMarkers).withAutoCommit(false)
|
||||
.withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder()
|
||||
.withConsistencyCheckEnabled(true)
|
||||
.withOptimisticConsistencyGuardSleepTimeMs(1).build()).build();
|
||||
.withOptimisticConsistencyGuardSleepTimeMs(1).build())
|
||||
.withProperties(properties).build();
|
||||
SparkRDDWriteClient client = getHoodieWriteClient(cfg);
|
||||
testConsistencyCheck(metaClient, instantTime, enableOptimisticConsistencyGuard);
|
||||
|
||||
@@ -1651,28 +1768,28 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
|
||||
}
|
||||
|
||||
@ParameterizedTest
|
||||
@ValueSource(booleans = {true, false})
|
||||
public void testRollbackAfterConsistencyCheckFailureUsingFileList(boolean enableOptimisticConsistencyGuard) throws Exception {
|
||||
testRollbackAfterConsistencyCheckFailureUsingFileList(false, enableOptimisticConsistencyGuard);
|
||||
@MethodSource("rollbackAfterConsistencyCheckFailureParams")
|
||||
public void testRollbackAfterConsistencyCheckFailureUsingFileList(boolean enableOptimisticConsistencyGuard, boolean populateMetCols) throws Exception {
|
||||
testRollbackAfterConsistencyCheckFailureUsingFileList(false, enableOptimisticConsistencyGuard, populateMetCols);
|
||||
}
|
||||
|
||||
@ParameterizedTest
|
||||
@ValueSource(booleans = {true, false})
|
||||
public void testRollbackAfterConsistencyCheckFailureUsingMarkers(boolean enableOptimisticConsistencyGuard) throws Exception {
|
||||
testRollbackAfterConsistencyCheckFailureUsingFileList(true, enableOptimisticConsistencyGuard);
|
||||
@MethodSource("rollbackAfterConsistencyCheckFailureParams")
|
||||
public void testRollbackAfterConsistencyCheckFailureUsingMarkers(boolean enableOptimisticConsistencyGuard, boolean populateMetCols) throws Exception {
|
||||
testRollbackAfterConsistencyCheckFailureUsingFileList(true, enableOptimisticConsistencyGuard, populateMetCols);
|
||||
}
|
||||
|
||||
@ParameterizedTest
|
||||
@EnumSource(value = HoodieFailedWritesCleaningPolicy.class, names = {"LAZY", "NEVER"})
|
||||
public void testRollbackFailedCommits(HoodieFailedWritesCleaningPolicy cleaningPolicy) throws Exception {
|
||||
@MethodSource("rollbackFailedCommitsParams")
|
||||
public void testRollbackFailedCommits(HoodieFailedWritesCleaningPolicy cleaningPolicy, boolean populateMetaFields) throws Exception {
|
||||
HoodieTestUtils.init(hadoopConf, basePath);
|
||||
// Perform 2 failed writes to table
|
||||
SparkRDDWriteClient client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy));
|
||||
SparkRDDWriteClient client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields));
|
||||
writeBatch(client, "100", "100", Option.of(Arrays.asList("100")), "100",
|
||||
100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 300,
|
||||
0, false);
|
||||
client.close();
|
||||
client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy));
|
||||
client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields));
|
||||
writeBatch(client, "200", "100", Option.of(Arrays.asList("200")), "200",
|
||||
100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 300,
|
||||
0, false);
|
||||
@@ -1680,7 +1797,7 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
|
||||
// refresh data generator to delete records generated from failed commits
|
||||
dataGen = new HoodieTestDataGenerator();
|
||||
// Perform 1 successful write
|
||||
client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy));
|
||||
client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields));
|
||||
writeBatch(client, "300", "200", Option.of(Arrays.asList("300")), "300",
|
||||
100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 300,
|
||||
0, true);
|
||||
@@ -1696,7 +1813,7 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
|
||||
conditionMet = client.getHeartbeatClient().isHeartbeatExpired("200");
|
||||
Thread.sleep(2000);
|
||||
}
|
||||
client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy));
|
||||
client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields));
|
||||
// Perform 1 successful write
|
||||
writeBatch(client, "400", "300", Option.of(Arrays.asList("400")), "400",
|
||||
100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 300,
|
||||
@@ -1732,11 +1849,12 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRollbackFailedCommitsToggleCleaningPolicy() throws Exception {
|
||||
@ParameterizedTest
|
||||
@MethodSource("populateMetaFieldsParams")
|
||||
public void testRollbackFailedCommitsToggleCleaningPolicy(boolean populateMetaFields) throws Exception {
|
||||
HoodieTestUtils.init(hadoopConf, basePath);
|
||||
HoodieFailedWritesCleaningPolicy cleaningPolicy = HoodieFailedWritesCleaningPolicy.EAGER;
|
||||
SparkRDDWriteClient client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy));
|
||||
SparkRDDWriteClient client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields));
|
||||
// Perform 1 failed writes to table
|
||||
writeBatch(client, "100", "100", Option.of(Arrays.asList("100")), "100",
|
||||
100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 300,
|
||||
@@ -1745,12 +1863,12 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
|
||||
// Toggle cleaning policy to LAZY
|
||||
cleaningPolicy = HoodieFailedWritesCleaningPolicy.LAZY;
|
||||
// Perform 2 failed writes to table
|
||||
client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy));
|
||||
client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields));
|
||||
writeBatch(client, "200", "100", Option.of(Arrays.asList("200")), "200",
|
||||
100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 300,
|
||||
0, false);
|
||||
client.close();
|
||||
client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy));
|
||||
client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields));
|
||||
writeBatch(client, "300", "200", Option.of(Arrays.asList("300")), "300",
|
||||
100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 300,
|
||||
0, false);
|
||||
@@ -1766,19 +1884,19 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
|
||||
assertTrue(timeline.getTimelineOfActions(
|
||||
CollectionUtils.createSet(HoodieTimeline.ROLLBACK_ACTION)).countInstants() == 3);
|
||||
// Perform 2 failed commits
|
||||
client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy));
|
||||
client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields));
|
||||
writeBatch(client, "400", "300", Option.of(Arrays.asList("400")), "400",
|
||||
100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 300,
|
||||
0, false);
|
||||
client.close();
|
||||
client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy));
|
||||
client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields));
|
||||
writeBatch(client, "500", "400", Option.of(Arrays.asList("500")), "500",
|
||||
100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 300,
|
||||
0, false);
|
||||
client.close();
|
||||
// Toggle cleaning policy to EAGER
|
||||
cleaningPolicy = HoodieFailedWritesCleaningPolicy.EAGER;
|
||||
client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy));
|
||||
client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields));
|
||||
client.startCommit();
|
||||
timeline = metaClient.getActiveTimeline().reload();
|
||||
assertTrue(timeline.getTimelineOfActions(
|
||||
@@ -1786,18 +1904,19 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
|
||||
assertTrue(timeline.getCommitsTimeline().filterCompletedInstants().countInstants() == 0);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testParallelInsertAndCleanPreviousFailedCommits() throws Exception {
|
||||
@ParameterizedTest
|
||||
@MethodSource("populateMetaFieldsParams")
|
||||
public void testParallelInsertAndCleanPreviousFailedCommits(boolean populateMetaFields) throws Exception {
|
||||
HoodieFailedWritesCleaningPolicy cleaningPolicy = HoodieFailedWritesCleaningPolicy.LAZY;
|
||||
ExecutorService service = Executors.newFixedThreadPool(2);
|
||||
HoodieTestUtils.init(hadoopConf, basePath);
|
||||
// Perform 2 failed writes to table
|
||||
SparkRDDWriteClient client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy));
|
||||
SparkRDDWriteClient client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields));
|
||||
writeBatch(client, "100", "100", Option.of(Arrays.asList("100")), "100",
|
||||
100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 100,
|
||||
0, false);
|
||||
client.close();
|
||||
client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy));
|
||||
client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields));
|
||||
writeBatch(client, "200", "200", Option.of(Arrays.asList("200")), "200",
|
||||
100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 100,
|
||||
0, false);
|
||||
@@ -1805,7 +1924,7 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
|
||||
// refresh data generator to delete records generated from failed commits
|
||||
dataGen = new HoodieTestDataGenerator();
|
||||
// Create a succesful commit
|
||||
Future<JavaRDD<WriteStatus>> commit3 = service.submit(() -> writeBatch(new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy)),
|
||||
Future<JavaRDD<WriteStatus>> commit3 = service.submit(() -> writeBatch(new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields)),
|
||||
"300", "200", Option.of(Arrays.asList("300")), "200", 100, dataGen::generateInserts,
|
||||
SparkRDDWriteClient::bulkInsert, false, 100, 100, 0, true));
|
||||
commit3.get();
|
||||
@@ -1815,17 +1934,17 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
|
||||
CollectionUtils.createSet(HoodieTimeline.ROLLBACK_ACTION)).countInstants() == 0);
|
||||
assertTrue(metaClient.getActiveTimeline().filterInflights().countInstants() == 2);
|
||||
assertTrue(metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants().countInstants() == 1);
|
||||
client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy));
|
||||
client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields));
|
||||
// Await till enough time passes such that the first 2 failed commits heartbeats are expired
|
||||
boolean conditionMet = false;
|
||||
while (!conditionMet) {
|
||||
conditionMet = client.getHeartbeatClient().isHeartbeatExpired("200");
|
||||
Thread.sleep(2000);
|
||||
}
|
||||
Future<JavaRDD<WriteStatus>> commit4 = service.submit(() -> writeBatch(new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy)),
|
||||
Future<JavaRDD<WriteStatus>> commit4 = service.submit(() -> writeBatch(new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields)),
|
||||
"400", "300", Option.of(Arrays.asList("400")), "400", 100, dataGen::generateInserts,
|
||||
SparkRDDWriteClient::bulkInsert, false, 100, 100, 0, true));
|
||||
Future<HoodieCleanMetadata> clean1 = service.submit(() -> new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy)).clean());
|
||||
Future<HoodieCleanMetadata> clean1 = service.submit(() -> new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields)).clean());
|
||||
commit4.get();
|
||||
clean1.get();
|
||||
HoodieActiveTimeline timeline = metaClient.getActiveTimeline().reload();
|
||||
@@ -1878,11 +1997,13 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
|
||||
return Pair.of(markerFilePath, result);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMultiOperationsPerCommit() throws IOException {
|
||||
HoodieWriteConfig cfg = getConfigBuilder().withAutoCommit(false)
|
||||
.withAllowMultiWriteOnSameInstant(true)
|
||||
.build();
|
||||
@ParameterizedTest
|
||||
@MethodSource("populateMetaFieldsParams")
|
||||
public void testMultiOperationsPerCommit(boolean populateMetaFields) throws IOException {
|
||||
HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder().withAutoCommit(false)
|
||||
.withAllowMultiWriteOnSameInstant(true);
|
||||
addAppropriatePropsForPopulateMetaFields(cfgBuilder, populateMetaFields);
|
||||
HoodieWriteConfig cfg = cfgBuilder.build();
|
||||
SparkRDDWriteClient client = getHoodieWriteClient(cfg);
|
||||
String firstInstantTime = "0000";
|
||||
client.startCommitWithTime(firstInstantTime);
|
||||
@@ -1957,16 +2078,19 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
|
||||
}
|
||||
|
||||
private HoodieWriteConfig getSmallInsertWriteConfig(int insertSplitSize, String schemaStr, long smallFileSize, boolean mergeAllowDuplicateInserts) {
|
||||
return getSmallInsertWriteConfig(insertSplitSize, schemaStr, smallFileSize, mergeAllowDuplicateInserts, new Properties());
|
||||
return getSmallInsertWriteConfig(insertSplitSize, schemaStr, smallFileSize, mergeAllowDuplicateInserts, true, new Properties());
|
||||
}
|
||||
|
||||
private HoodieWriteConfig getSmallInsertWriteConfig(int insertSplitSize, String schemaStr, long smallFileSize, Properties props) {
|
||||
return getSmallInsertWriteConfig(insertSplitSize, schemaStr, smallFileSize, false, props);
|
||||
private HoodieWriteConfig getSmallInsertWriteConfig(int insertSplitSize, String schemaStr, long smallFileSize, boolean populateMetaFields, Properties props) {
|
||||
return getSmallInsertWriteConfig(insertSplitSize, schemaStr, smallFileSize, false, populateMetaFields, props);
|
||||
}
|
||||
|
||||
private HoodieWriteConfig getSmallInsertWriteConfig(int insertSplitSize, String schemaStr, long smallFileSize, boolean mergeAllowDuplicateInserts,
|
||||
Properties props) {
|
||||
boolean populateMetaFields, Properties props) {
|
||||
HoodieWriteConfig.Builder builder = getConfigBuilder(schemaStr);
|
||||
if (!populateMetaFields) {
|
||||
builder.withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(IndexType.SIMPLE).build());
|
||||
}
|
||||
return builder
|
||||
.withCompactionConfig(
|
||||
HoodieCompactionConfig.newBuilder()
|
||||
@@ -1994,7 +2118,7 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
|
||||
return clusteringInstant;
|
||||
}
|
||||
|
||||
private HoodieWriteConfig getParallelWritingWriteConfig(HoodieFailedWritesCleaningPolicy cleaningPolicy) {
|
||||
private HoodieWriteConfig getParallelWritingWriteConfig(HoodieFailedWritesCleaningPolicy cleaningPolicy, boolean populateMetaFields) {
|
||||
return getConfigBuilder()
|
||||
.withEmbeddedTimelineServerEnabled(false)
|
||||
.withCompactionConfig(HoodieCompactionConfig.newBuilder()
|
||||
@@ -2002,7 +2126,8 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
|
||||
.withAutoClean(false).build())
|
||||
.withTimelineLayoutVersion(1)
|
||||
.withHeartbeatIntervalInMs(3 * 1000)
|
||||
.withAutoCommit(false).build();
|
||||
.withAutoCommit(false)
|
||||
.withProperties(populateMetaFields ? new Properties() : getPropertiesForKeyGen()).build();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -26,6 +26,7 @@ import org.apache.hudi.common.table.timeline.HoodieTimeline;
|
||||
import org.apache.hudi.common.testutils.HoodieTestUtils;
|
||||
import org.apache.hudi.common.testutils.RawTripTestPayload;
|
||||
import org.apache.hudi.common.util.BaseFileUtils;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.exception.HoodieUpsertException;
|
||||
import org.apache.hudi.io.HoodieCreateHandle;
|
||||
@@ -121,7 +122,7 @@ public class TestUpdateSchemaEvolution extends HoodieClientTestHarness {
|
||||
jsc.parallelize(Arrays.asList(1)).map(x -> {
|
||||
Executable executable = () -> {
|
||||
HoodieMergeHandle mergeHandle = new HoodieMergeHandle(updateTable.getConfig(), "101", updateTable,
|
||||
updateRecords.iterator(), updateRecords.get(0).getPartitionPath(), insertResult.getFileId(), supplier);
|
||||
updateRecords.iterator(), updateRecords.get(0).getPartitionPath(), insertResult.getFileId(), supplier, Option.empty());
|
||||
List<GenericRecord> oldRecords = BaseFileUtils.getInstance(updateTable.getBaseFileFormat())
|
||||
.readAvroRecords(updateTable.getHadoopConf(),
|
||||
new Path(updateTable.getConfig().getBasePath() + "/" + insertResult.getStat().getPath()),
|
||||
|
||||
@@ -23,10 +23,12 @@ import org.apache.hudi.common.fs.ConsistencyGuardConfig;
|
||||
import org.apache.hudi.common.model.EmptyHoodieRecordPayload;
|
||||
import org.apache.hudi.common.model.HoodieKey;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieTableType;
|
||||
import org.apache.hudi.common.table.HoodieTableMetaClient;
|
||||
import org.apache.hudi.common.table.view.FileSystemViewStorageConfig;
|
||||
import org.apache.hudi.common.table.view.FileSystemViewStorageType;
|
||||
import org.apache.hudi.common.testutils.HoodieTestDataGenerator;
|
||||
import org.apache.hudi.common.testutils.HoodieTestUtils;
|
||||
import org.apache.hudi.common.testutils.RawTripTestPayload;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.common.util.collection.Pair;
|
||||
@@ -47,8 +49,10 @@ import org.apache.hadoop.fs.Path;
|
||||
import org.apache.spark.api.java.JavaPairRDD;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.params.ParameterizedTest;
|
||||
import org.junit.jupiter.params.provider.EnumSource;
|
||||
import org.junit.jupiter.params.provider.Arguments;
|
||||
import org.junit.jupiter.params.provider.MethodSource;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
@@ -56,8 +60,10 @@ import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Properties;
|
||||
import java.util.Random;
|
||||
import java.util.UUID;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import scala.Tuple2;
|
||||
|
||||
@@ -69,16 +75,34 @@ import static org.junit.jupiter.api.Assertions.fail;
|
||||
|
||||
public class TestHoodieIndex extends HoodieClientTestHarness {
|
||||
|
||||
private static Stream<Arguments> indexTypeParams() {
|
||||
Object[][] data = new Object[][] {
|
||||
{IndexType.BLOOM, true},
|
||||
{IndexType.GLOBAL_BLOOM, true},
|
||||
{IndexType.SIMPLE, true},
|
||||
{IndexType.GLOBAL_SIMPLE, true},
|
||||
{IndexType.SIMPLE, false},
|
||||
{IndexType.GLOBAL_SIMPLE, false}
|
||||
};
|
||||
return Stream.of(data).map(Arguments::of);
|
||||
}
|
||||
|
||||
private static final Schema SCHEMA = getSchemaFromResource(TestHoodieIndex.class, "/exampleSchema.avsc", true);
|
||||
private final Random random = new Random();
|
||||
private IndexType indexType;
|
||||
private HoodieIndex index;
|
||||
private HoodieWriteConfig config;
|
||||
|
||||
private void setUp(IndexType indexType) throws Exception {
|
||||
private void setUp(IndexType indexType, boolean populateMetaFields) throws Exception {
|
||||
this.indexType = indexType;
|
||||
initResources();
|
||||
initPath();
|
||||
initSparkContexts();
|
||||
initTestDataGenerator();
|
||||
initFileSystem();
|
||||
metaClient = HoodieTestUtils.init(hadoopConf, basePath, HoodieTableType.COPY_ON_WRITE, populateMetaFields ? new Properties()
|
||||
: getPropertiesForKeyGen());
|
||||
config = getConfigBuilder()
|
||||
.withProperties(populateMetaFields ? new Properties() : getPropertiesForKeyGen())
|
||||
.withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(indexType)
|
||||
.build()).withAutoCommit(false).build();
|
||||
writeClient = getHoodieWriteClient(config);
|
||||
@@ -91,9 +115,9 @@ public class TestHoodieIndex extends HoodieClientTestHarness {
|
||||
}
|
||||
|
||||
@ParameterizedTest
|
||||
@EnumSource(value = IndexType.class, names = {"BLOOM", "GLOBAL_BLOOM", "SIMPLE", "GLOBAL_SIMPLE"})
|
||||
public void testSimpleTagLocationAndUpdate(IndexType indexType) throws Exception {
|
||||
setUp(indexType);
|
||||
@MethodSource("indexTypeParams")
|
||||
public void testSimpleTagLocationAndUpdate(IndexType indexType, boolean populateMetaFields) throws Exception {
|
||||
setUp(indexType, populateMetaFields);
|
||||
String newCommitTime = "001";
|
||||
int totalRecords = 10 + random.nextInt(20);
|
||||
List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, totalRecords);
|
||||
@@ -141,9 +165,9 @@ public class TestHoodieIndex extends HoodieClientTestHarness {
|
||||
}
|
||||
|
||||
@ParameterizedTest
|
||||
@EnumSource(value = IndexType.class, names = {"BLOOM", "GLOBAL_BLOOM", "SIMPLE", "GLOBAL_SIMPLE"})
|
||||
public void testTagLocationAndDuplicateUpdate(IndexType indexType) throws Exception {
|
||||
setUp(indexType);
|
||||
@MethodSource("indexTypeParams")
|
||||
public void testTagLocationAndDuplicateUpdate(IndexType indexType, boolean populateMetaFields) throws Exception {
|
||||
setUp(indexType, populateMetaFields);
|
||||
String newCommitTime = "001";
|
||||
int totalRecords = 10 + random.nextInt(20);
|
||||
List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, totalRecords);
|
||||
@@ -191,9 +215,9 @@ public class TestHoodieIndex extends HoodieClientTestHarness {
|
||||
}
|
||||
|
||||
@ParameterizedTest
|
||||
@EnumSource(value = IndexType.class, names = {"BLOOM", "GLOBAL_BLOOM", "SIMPLE", "GLOBAL_SIMPLE"})
|
||||
public void testSimpleTagLocationAndUpdateWithRollback(IndexType indexType) throws Exception {
|
||||
setUp(indexType);
|
||||
@MethodSource("indexTypeParams")
|
||||
public void testSimpleTagLocationAndUpdateWithRollback(IndexType indexType, boolean populateMetaFields) throws Exception {
|
||||
setUp(indexType, populateMetaFields);
|
||||
String newCommitTime = writeClient.startCommit();
|
||||
int totalRecords = 20 + random.nextInt(20);
|
||||
List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, totalRecords);
|
||||
@@ -242,10 +266,18 @@ public class TestHoodieIndex extends HoodieClientTestHarness {
|
||||
assert (javaRDD.filter(record -> record.getCurrentLocation() != null).collect().size() == 0);
|
||||
}
|
||||
|
||||
private static Stream<Arguments> regularIndexTypeParams() {
|
||||
Object[][] data = new Object[][] {
|
||||
{IndexType.BLOOM, true},
|
||||
{IndexType.SIMPLE, true}
|
||||
};
|
||||
return Stream.of(data).map(Arguments::of);
|
||||
}
|
||||
|
||||
@ParameterizedTest
|
||||
@EnumSource(value = IndexType.class, names = {"BLOOM", "SIMPLE",})
|
||||
public void testTagLocationAndFetchRecordLocations(IndexType indexType) throws Exception {
|
||||
setUp(indexType);
|
||||
@MethodSource("regularIndexTypeParams")
|
||||
public void testTagLocationAndFetchRecordLocations(IndexType indexType, boolean populateMetaFields) throws Exception {
|
||||
setUp(indexType, populateMetaFields);
|
||||
String p1 = "2016/01/31";
|
||||
String p2 = "2015/01/31";
|
||||
String rowKey1 = UUID.randomUUID().toString();
|
||||
@@ -325,10 +357,9 @@ public class TestHoodieIndex extends HoodieClientTestHarness {
|
||||
}
|
||||
}
|
||||
|
||||
@ParameterizedTest
|
||||
@EnumSource(value = IndexType.class, names = {"GLOBAL_SIMPLE"})
|
||||
public void testSimpleGlobalIndexTagLocationWhenShouldUpdatePartitionPath(IndexType indexType) throws Exception {
|
||||
setUp(indexType);
|
||||
@Test
|
||||
public void testSimpleGlobalIndexTagLocationWhenShouldUpdatePartitionPath() throws Exception {
|
||||
setUp(IndexType.GLOBAL_SIMPLE, true);
|
||||
config = getConfigBuilder()
|
||||
.withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(indexType)
|
||||
.withGlobalSimpleIndexUpdatePartitionPath(true)
|
||||
|
||||
@@ -18,20 +18,26 @@
|
||||
|
||||
package org.apache.hudi.io;
|
||||
|
||||
import org.apache.hudi.common.config.TypedProperties;
|
||||
import org.apache.hudi.common.engine.HoodieEngineContext;
|
||||
import org.apache.hudi.common.fs.ConsistencyGuardConfig;
|
||||
import org.apache.hudi.common.model.HoodieBaseFile;
|
||||
import org.apache.hudi.common.model.HoodieKey;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieRecordLocation;
|
||||
import org.apache.hudi.common.model.HoodieTableType;
|
||||
import org.apache.hudi.common.table.view.FileSystemViewStorageConfig;
|
||||
import org.apache.hudi.common.table.view.FileSystemViewStorageType;
|
||||
import org.apache.hudi.common.testutils.HoodieTestUtils;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.common.util.collection.Pair;
|
||||
import org.apache.hudi.config.HoodieCompactionConfig;
|
||||
import org.apache.hudi.config.HoodieIndexConfig;
|
||||
import org.apache.hudi.config.HoodieStorageConfig;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.index.HoodieIndexUtils;
|
||||
import org.apache.hudi.keygen.BaseKeyGenerator;
|
||||
import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory;
|
||||
import org.apache.hudi.table.HoodieSparkTable;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
import org.apache.hudi.testutils.HoodieClientTestHarness;
|
||||
@@ -40,7 +46,8 @@ import org.apache.hudi.testutils.MetadataMergeWriteStatus;
|
||||
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.params.ParameterizedTest;
|
||||
import org.junit.jupiter.params.provider.ValueSource;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
@@ -48,6 +55,7 @@ import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Properties;
|
||||
|
||||
import scala.Tuple2;
|
||||
|
||||
@@ -71,10 +79,6 @@ public class TestHoodieKeyLocationFetchHandle extends HoodieClientTestHarness {
|
||||
initPath();
|
||||
initTestDataGenerator();
|
||||
initFileSystem();
|
||||
initMetaClient();
|
||||
config = getConfigBuilder()
|
||||
.withIndexConfig(HoodieIndexConfig.newBuilder()
|
||||
.build()).build();
|
||||
}
|
||||
|
||||
@AfterEach
|
||||
@@ -82,8 +86,15 @@ public class TestHoodieKeyLocationFetchHandle extends HoodieClientTestHarness {
|
||||
cleanupResources();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFetchHandle() throws Exception {
|
||||
@ParameterizedTest
|
||||
@ValueSource(booleans = {true, false})
|
||||
public void testFetchHandle(boolean populateMetaFields) throws Exception {
|
||||
metaClient = HoodieTestUtils.init(hadoopConf, basePath, HoodieTableType.COPY_ON_WRITE, populateMetaFields ? new Properties() : getPropertiesForKeyGen());
|
||||
config = getConfigBuilder()
|
||||
.withProperties(getPropertiesForKeyGen())
|
||||
.withIndexConfig(HoodieIndexConfig.newBuilder()
|
||||
.build()).build();
|
||||
|
||||
List<HoodieRecord> records = dataGen.generateInserts(makeNewCommitTime(), 100);
|
||||
Map<String, List<HoodieRecord>> partitionRecordsMap = recordsToPartitionRecordsMap(records);
|
||||
HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient);
|
||||
@@ -93,8 +104,11 @@ public class TestHoodieKeyLocationFetchHandle extends HoodieClientTestHarness {
|
||||
|
||||
List<Tuple2<String, HoodieBaseFile>> partitionPathFileIdPairs = loadAllFilesForPartitions(new ArrayList<>(partitionRecordsMap.keySet()), context, hoodieTable);
|
||||
|
||||
BaseKeyGenerator keyGenerator = (BaseKeyGenerator) HoodieSparkKeyGeneratorFactory.createKeyGenerator(new TypedProperties(getPropertiesForKeyGen()));
|
||||
|
||||
for (Tuple2<String, HoodieBaseFile> entry : partitionPathFileIdPairs) {
|
||||
HoodieKeyLocationFetchHandle fetcherHandle = new HoodieKeyLocationFetchHandle(config, hoodieTable, Pair.of(entry._1, entry._2));
|
||||
HoodieKeyLocationFetchHandle fetcherHandle = new HoodieKeyLocationFetchHandle(config, hoodieTable, Pair.of(entry._1, entry._2),
|
||||
populateMetaFields ? Option.empty() : Option.of(keyGenerator));
|
||||
Iterator<Pair<HoodieKey, HoodieRecordLocation>> result = fetcherHandle.locations().iterator();
|
||||
List<Tuple2<HoodieKey, HoodieRecordLocation>> actualList = new ArrayList<>();
|
||||
result.forEachRemaining(x -> actualList.add(new Tuple2<>(x.getLeft(), x.getRight())));
|
||||
|
||||
@@ -43,6 +43,7 @@ import org.apache.hudi.common.table.view.HoodieTableFileSystemView;
|
||||
import org.apache.hudi.common.table.view.TableFileSystemView;
|
||||
import org.apache.hudi.common.testutils.HoodieTestDataGenerator;
|
||||
import org.apache.hudi.common.testutils.HoodieTestTable;
|
||||
import org.apache.hudi.common.testutils.HoodieTestUtils;
|
||||
import org.apache.hudi.common.util.HoodieTimer;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.config.HoodieCompactionConfig;
|
||||
@@ -60,10 +61,10 @@ import org.apache.log4j.Logger;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.api.io.TempDir;
|
||||
import org.junit.jupiter.params.ParameterizedTest;
|
||||
import org.junit.jupiter.params.provider.EnumSource;
|
||||
import org.junit.jupiter.params.provider.Arguments;
|
||||
import org.junit.jupiter.params.provider.MethodSource;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
@@ -71,7 +72,9 @@ import java.nio.file.Paths;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Properties;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import static org.apache.hudi.common.testutils.HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
@@ -91,16 +94,15 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
|
||||
|
||||
private HoodieTableType tableType;
|
||||
|
||||
public void init(HoodieTableType tableType) throws IOException {
|
||||
public void init(HoodieTableType tableType, boolean populateMetaFields) throws IOException {
|
||||
this.tableType = tableType;
|
||||
initPath();
|
||||
initSparkContexts("TestHoodieMetadata");
|
||||
initFileSystem();
|
||||
fs.mkdirs(new Path(basePath));
|
||||
initMetaClient(tableType);
|
||||
metaClient = HoodieTestUtils.init(hadoopConf, basePath, tableType, populateMetaFields ? new Properties() : getPropertiesForKeyGen());
|
||||
initTestDataGenerator();
|
||||
metadataTableBasePath = HoodieTableMetadata.getMetadataTableBasePath(basePath);
|
||||
|
||||
}
|
||||
|
||||
@AfterEach
|
||||
@@ -108,12 +110,25 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
|
||||
cleanupResources();
|
||||
}
|
||||
|
||||
private static Stream<Arguments> populateMetaFieldsParams() {
|
||||
return Arrays.stream(new Boolean[][] {{true}, {false}}).map(Arguments::of);
|
||||
}
|
||||
|
||||
private static Stream<Arguments> tableTypePopulateMetaFieldsParams() {
|
||||
return Stream.of(
|
||||
Arguments.of(HoodieTableType.COPY_ON_WRITE, true),
|
||||
Arguments.of(HoodieTableType.COPY_ON_WRITE, false),
|
||||
Arguments.of(HoodieTableType.MERGE_ON_READ, true)
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Metadata Table bootstrap scenarios.
|
||||
*/
|
||||
@Test
|
||||
public void testMetadataTableBootstrap() throws Exception {
|
||||
init(HoodieTableType.COPY_ON_WRITE);
|
||||
@ParameterizedTest
|
||||
@MethodSource("populateMetaFieldsParams")
|
||||
public void testMetadataTableBootstrap(boolean populateMetaFields) throws Exception {
|
||||
init(HoodieTableType.COPY_ON_WRITE, populateMetaFields);
|
||||
HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
|
||||
|
||||
// Metadata table should not exist until created for the first time
|
||||
@@ -122,7 +137,7 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
|
||||
|
||||
// Metadata table is not created if disabled by config
|
||||
String firstCommitTime = HoodieActiveTimeline.createNewInstantTime();
|
||||
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, false))) {
|
||||
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, false, populateMetaFields))) {
|
||||
client.startCommitWithTime(firstCommitTime);
|
||||
client.insert(jsc.parallelize(dataGen.generateInserts(firstCommitTime, 5)), firstCommitTime);
|
||||
assertFalse(fs.exists(new Path(metadataTableBasePath)), "Metadata table should not be created");
|
||||
@@ -131,7 +146,7 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
|
||||
|
||||
// Metadata table should not be created if any non-complete instants are present
|
||||
String secondCommitTime = HoodieActiveTimeline.createNewInstantTime();
|
||||
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(false, true), true)) {
|
||||
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(false, true, populateMetaFields), true)) {
|
||||
client.startCommitWithTime(secondCommitTime);
|
||||
client.insert(jsc.parallelize(dataGen.generateUpdates(secondCommitTime, 2)), secondCommitTime);
|
||||
// AutoCommit is false so no bootstrap
|
||||
@@ -144,7 +159,7 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
|
||||
|
||||
// Metadata table created when enabled by config & sync is called
|
||||
secondCommitTime = HoodieActiveTimeline.createNewInstantTime();
|
||||
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true), true)) {
|
||||
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true, populateMetaFields), true)) {
|
||||
client.startCommitWithTime(secondCommitTime);
|
||||
client.insert(jsc.parallelize(dataGen.generateUpdates(secondCommitTime, 2)), secondCommitTime);
|
||||
client.syncTableMetadata();
|
||||
@@ -167,7 +182,7 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
|
||||
});
|
||||
|
||||
String thirdCommitTime = HoodieActiveTimeline.createNewInstantTime();
|
||||
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true), true)) {
|
||||
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true, populateMetaFields), true)) {
|
||||
client.startCommitWithTime(thirdCommitTime);
|
||||
client.insert(jsc.parallelize(dataGen.generateUpdates(thirdCommitTime, 2)), thirdCommitTime);
|
||||
client.syncTableMetadata();
|
||||
@@ -184,10 +199,11 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
|
||||
/**
|
||||
* Only valid partition directories are added to the metadata.
|
||||
*/
|
||||
@Test
|
||||
public void testOnlyValidPartitionsAdded() throws Exception {
|
||||
@ParameterizedTest
|
||||
@MethodSource("populateMetaFieldsParams")
|
||||
public void testOnlyValidPartitionsAdded(boolean populateMetaFields) throws Exception {
|
||||
// This test requires local file system
|
||||
init(HoodieTableType.COPY_ON_WRITE);
|
||||
init(HoodieTableType.COPY_ON_WRITE, populateMetaFields);
|
||||
HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
|
||||
|
||||
// Create an empty directory which is not a partition directory (lacks partition metadata)
|
||||
@@ -207,7 +223,7 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
|
||||
.addCommit("002").withBaseFilesInPartition("p1", 10).withBaseFilesInPartition("p2", 10, 10, 10);
|
||||
|
||||
final HoodieWriteConfig writeConfig =
|
||||
getWriteConfigBuilder(HoodieFailedWritesCleaningPolicy.NEVER, true, true, false)
|
||||
getWriteConfigBuilder(HoodieFailedWritesCleaningPolicy.NEVER, true, true, false, populateMetaFields)
|
||||
.withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(true).withDirectoryFilterRegex(filterDirRegex).build()).build();
|
||||
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, writeConfig)) {
|
||||
client.startCommitWithTime("005");
|
||||
@@ -237,12 +253,12 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
|
||||
* Test various table operations sync to Metadata Table correctly.
|
||||
*/
|
||||
@ParameterizedTest
|
||||
@EnumSource(HoodieTableType.class)
|
||||
public void testTableOperations(HoodieTableType tableType) throws Exception {
|
||||
init(tableType);
|
||||
@MethodSource("tableTypePopulateMetaFieldsParams")
|
||||
public void testTableOperations(HoodieTableType tableType, boolean populateMetaFields) throws Exception {
|
||||
init(tableType, populateMetaFields);
|
||||
HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
|
||||
|
||||
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true))) {
|
||||
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true, populateMetaFields))) {
|
||||
|
||||
// Write 1 (Bulk insert)
|
||||
String newCommitTime = "001";
|
||||
@@ -325,12 +341,12 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
|
||||
* Test rollback of various table operations sync to Metadata Table correctly.
|
||||
*/
|
||||
@ParameterizedTest
|
||||
@EnumSource(HoodieTableType.class)
|
||||
public void testRollbackOperations(HoodieTableType tableType) throws Exception {
|
||||
init(tableType);
|
||||
@MethodSource("tableTypePopulateMetaFieldsParams")
|
||||
public void testRollbackOperations(HoodieTableType tableType, boolean populateMetaFields) throws Exception {
|
||||
init(tableType, populateMetaFields);
|
||||
HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
|
||||
|
||||
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true))) {
|
||||
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true, populateMetaFields))) {
|
||||
// Write 1 (Bulk insert)
|
||||
String newCommitTime = HoodieActiveTimeline.createNewInstantTime();
|
||||
List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 20);
|
||||
@@ -403,7 +419,7 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
|
||||
|
||||
// Rollback of partial commits
|
||||
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext,
|
||||
getWriteConfigBuilder(false, true, false).withRollbackUsingMarkers(false).build())) {
|
||||
getWriteConfigBuilder(false, true, false, populateMetaFields).withRollbackUsingMarkers(false).build())) {
|
||||
// Write updates and inserts
|
||||
String newCommitTime = HoodieActiveTimeline.createNewInstantTime();
|
||||
client.startCommitWithTime(newCommitTime);
|
||||
@@ -417,7 +433,7 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
|
||||
|
||||
// Marker based rollback of partial commits
|
||||
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext,
|
||||
getWriteConfigBuilder(false, true, false).withRollbackUsingMarkers(true).build())) {
|
||||
getWriteConfigBuilder(false, true, false, populateMetaFields).withRollbackUsingMarkers(true).build())) {
|
||||
// Write updates and inserts
|
||||
String newCommitTime = HoodieActiveTimeline.createNewInstantTime();
|
||||
client.startCommitWithTime(newCommitTime);
|
||||
@@ -435,12 +451,12 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
|
||||
* Once explicit sync is called, metadata should match.
|
||||
*/
|
||||
@ParameterizedTest
|
||||
@EnumSource(HoodieTableType.class)
|
||||
public void testRollbackUnsyncedCommit(HoodieTableType tableType) throws Exception {
|
||||
init(tableType);
|
||||
@MethodSource("tableTypePopulateMetaFieldsParams")
|
||||
public void testRollbackUnsyncedCommit(HoodieTableType tableType, boolean populateMetaFields) throws Exception {
|
||||
init(tableType, populateMetaFields);
|
||||
HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
|
||||
|
||||
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true))) {
|
||||
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true, populateMetaFields))) {
|
||||
// Initialize table with metadata
|
||||
String newCommitTime = HoodieActiveTimeline.createNewInstantTime();
|
||||
List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 20);
|
||||
@@ -450,7 +466,7 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
|
||||
validateMetadata(client);
|
||||
}
|
||||
String newCommitTime = HoodieActiveTimeline.createNewInstantTime();
|
||||
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, false))) {
|
||||
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, false, populateMetaFields))) {
|
||||
// Commit with metadata disabled
|
||||
client.startCommitWithTime(newCommitTime);
|
||||
List<HoodieRecord> records = dataGen.generateUpdates(newCommitTime, 10);
|
||||
@@ -459,7 +475,7 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
|
||||
client.rollback(newCommitTime);
|
||||
}
|
||||
|
||||
try (SparkRDDWriteClient client = new SparkRDDWriteClient<>(engineContext, getWriteConfig(true, true))) {
|
||||
try (SparkRDDWriteClient client = new SparkRDDWriteClient<>(engineContext, getWriteConfig(true, true, populateMetaFields))) {
|
||||
assertFalse(metadata(client).isInSync());
|
||||
client.syncTableMetadata();
|
||||
validateMetadata(client);
|
||||
@@ -470,10 +486,10 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
|
||||
* Test sync of table operations.
|
||||
*/
|
||||
@ParameterizedTest
|
||||
@EnumSource(HoodieTableType.class)
|
||||
@MethodSource("tableTypePopulateMetaFieldsParams")
|
||||
@Disabled
|
||||
public void testSync(HoodieTableType tableType) throws Exception {
|
||||
init(tableType);
|
||||
public void testSync(HoodieTableType tableType, boolean populateMetaFields) throws Exception {
|
||||
init(tableType, populateMetaFields);
|
||||
HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
|
||||
|
||||
String newCommitTime;
|
||||
@@ -481,7 +497,7 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
|
||||
List<WriteStatus> writeStatuses;
|
||||
|
||||
// Initial commits without metadata table enabled
|
||||
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, false))) {
|
||||
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, false, populateMetaFields))) {
|
||||
newCommitTime = HoodieActiveTimeline.createNewInstantTime();
|
||||
records = dataGen.generateInserts(newCommitTime, 5);
|
||||
client.startCommitWithTime(newCommitTime);
|
||||
@@ -496,7 +512,7 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
|
||||
}
|
||||
|
||||
// Enable metadata table so it initialized by listing from file system
|
||||
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true))) {
|
||||
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true, populateMetaFields))) {
|
||||
// inserts
|
||||
newCommitTime = HoodieActiveTimeline.createNewInstantTime();
|
||||
client.startCommitWithTime(newCommitTime);
|
||||
@@ -512,7 +528,7 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
|
||||
String restoreToInstant;
|
||||
String inflightActionTimestamp;
|
||||
String beforeInflightActionTimestamp;
|
||||
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, false))) {
|
||||
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, false, populateMetaFields))) {
|
||||
// updates
|
||||
newCommitTime = HoodieActiveTimeline.createNewInstantTime();
|
||||
client.startCommitWithTime(newCommitTime);
|
||||
@@ -584,7 +600,7 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
|
||||
Path inflightCleanPath = new Path(metaClient.getMetaPath(), HoodieTimeline.makeInflightCleanerFileName(inflightActionTimestamp));
|
||||
fs.create(inflightCleanPath).close();
|
||||
|
||||
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true))) {
|
||||
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true, populateMetaFields))) {
|
||||
// Restore cannot be done until the metadata table is in sync. See HUDI-1502 for details
|
||||
client.syncTableMetadata();
|
||||
|
||||
@@ -613,7 +629,7 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
|
||||
}
|
||||
|
||||
// Enable metadata table and ensure it is synced
|
||||
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true))) {
|
||||
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true, populateMetaFields))) {
|
||||
client.restoreToInstant(restoreToInstant);
|
||||
assertFalse(metadata(client).isInSync());
|
||||
|
||||
@@ -629,13 +645,14 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
|
||||
/**
|
||||
* Instants on Metadata Table should be archived as per config. Metadata Table should be automatically compacted as per config.
|
||||
*/
|
||||
@Test
|
||||
public void testCleaningArchivingAndCompaction() throws Exception {
|
||||
init(HoodieTableType.COPY_ON_WRITE);
|
||||
@ParameterizedTest
|
||||
@MethodSource("populateMetaFieldsParams")
|
||||
public void testCleaningArchivingAndCompaction(boolean populateMetaFields) throws Exception {
|
||||
init(HoodieTableType.COPY_ON_WRITE, populateMetaFields);
|
||||
HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
|
||||
|
||||
final int maxDeltaCommitsBeforeCompaction = 4;
|
||||
HoodieWriteConfig config = getWriteConfigBuilder(true, true, false)
|
||||
HoodieWriteConfig config = getWriteConfigBuilder(true, true, false, populateMetaFields)
|
||||
.withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(true)
|
||||
.archiveCommitsWith(6, 8).retainCommits(1)
|
||||
.withMaxNumDeltaCommitsBeforeCompaction(maxDeltaCommitsBeforeCompaction).build())
|
||||
@@ -676,14 +693,15 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
|
||||
/**
|
||||
* Test various error scenarios.
|
||||
*/
|
||||
@Test
|
||||
public void testErrorCases() throws Exception {
|
||||
init(HoodieTableType.COPY_ON_WRITE);
|
||||
@ParameterizedTest
|
||||
@MethodSource("populateMetaFieldsParams")
|
||||
public void testErrorCases(boolean populateMetaFields) throws Exception {
|
||||
init(HoodieTableType.COPY_ON_WRITE, populateMetaFields);
|
||||
HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
|
||||
|
||||
// TESTCASE: If commit on the metadata table succeeds but fails on the dataset, then on next init the metadata table
|
||||
// should be rolled back to last valid commit.
|
||||
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true), true)) {
|
||||
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true, populateMetaFields), true)) {
|
||||
String newCommitTime = HoodieActiveTimeline.createNewInstantTime();
|
||||
client.startCommitWithTime(newCommitTime);
|
||||
List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 10);
|
||||
@@ -704,7 +722,7 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
|
||||
commitInstantFileName), false));
|
||||
}
|
||||
|
||||
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true), true)) {
|
||||
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true, populateMetaFields), true)) {
|
||||
String newCommitTime = client.startCommit();
|
||||
// Next insert
|
||||
List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 5);
|
||||
@@ -721,11 +739,11 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
|
||||
*/
|
||||
//@Test
|
||||
public void testNonPartitioned() throws Exception {
|
||||
init(HoodieTableType.COPY_ON_WRITE);
|
||||
init(HoodieTableType.COPY_ON_WRITE, true);
|
||||
HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
|
||||
|
||||
HoodieTestDataGenerator nonPartitionedGenerator = new HoodieTestDataGenerator(new String[] {""});
|
||||
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true))) {
|
||||
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true, true))) {
|
||||
// Write 1 (Bulk insert)
|
||||
String newCommitTime = "001";
|
||||
List<HoodieRecord> records = nonPartitionedGenerator.generateInserts(newCommitTime, 10);
|
||||
@@ -741,12 +759,13 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
|
||||
/**
|
||||
* Test various metrics published by metadata table.
|
||||
*/
|
||||
@Test
|
||||
public void testMetadataMetrics() throws Exception {
|
||||
init(HoodieTableType.COPY_ON_WRITE);
|
||||
@ParameterizedTest
|
||||
@MethodSource("populateMetaFieldsParams")
|
||||
public void testMetadataMetrics(boolean populateMetaFields) throws Exception {
|
||||
init(HoodieTableType.COPY_ON_WRITE, populateMetaFields);
|
||||
HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
|
||||
|
||||
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfigBuilder(true, true, true).build())) {
|
||||
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfigBuilder(true, true, true, populateMetaFields).build())) {
|
||||
// Write
|
||||
String newCommitTime = HoodieActiveTimeline.createNewInstantTime();
|
||||
List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 20);
|
||||
@@ -769,15 +788,16 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
|
||||
/**
|
||||
* Test when reading from metadata table which is out of sync with dataset that results are still consistent.
|
||||
*/
|
||||
@Test
|
||||
public void testMetadataOutOfSync() throws Exception {
|
||||
init(HoodieTableType.COPY_ON_WRITE);
|
||||
@ParameterizedTest
|
||||
@MethodSource("populateMetaFieldsParams")
|
||||
public void testMetadataOutOfSync(boolean populateMetaFields) throws Exception {
|
||||
init(HoodieTableType.COPY_ON_WRITE, populateMetaFields);
|
||||
HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
|
||||
|
||||
SparkRDDWriteClient unsyncedClient = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true));
|
||||
SparkRDDWriteClient unsyncedClient = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true, populateMetaFields));
|
||||
|
||||
// Enable metadata so table is initialized
|
||||
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true))) {
|
||||
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true, populateMetaFields))) {
|
||||
// Perform Bulk Insert
|
||||
String newCommitTime = "001";
|
||||
client.startCommitWithTime(newCommitTime);
|
||||
@@ -786,7 +806,7 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
|
||||
}
|
||||
|
||||
// Perform commit operations with metadata disabled
|
||||
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, false))) {
|
||||
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, false, populateMetaFields))) {
|
||||
// Perform Insert
|
||||
String newCommitTime = "002";
|
||||
client.startCommitWithTime(newCommitTime);
|
||||
@@ -811,7 +831,7 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
|
||||
validateMetadata(unsyncedClient);
|
||||
|
||||
// Perform clean operation with metadata disabled
|
||||
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, false))) {
|
||||
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, false, populateMetaFields))) {
|
||||
// One more commit needed to trigger clean so upsert and compact
|
||||
String newCommitTime = "005";
|
||||
client.startCommitWithTime(newCommitTime);
|
||||
@@ -833,7 +853,7 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
|
||||
validateMetadata(unsyncedClient);
|
||||
|
||||
// Perform restore with metadata disabled
|
||||
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, false))) {
|
||||
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, false, populateMetaFields))) {
|
||||
client.restoreToInstant("004");
|
||||
}
|
||||
|
||||
@@ -1008,18 +1028,20 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
|
||||
}
|
||||
}
|
||||
|
||||
private HoodieWriteConfig getWriteConfig(boolean autoCommit, boolean useFileListingMetadata) {
|
||||
return getWriteConfigBuilder(autoCommit, useFileListingMetadata, false).build();
|
||||
private HoodieWriteConfig getWriteConfig(boolean autoCommit, boolean useFileListingMetadata, boolean populateMetaFields) {
|
||||
return getWriteConfigBuilder(autoCommit, useFileListingMetadata, false, populateMetaFields).build();
|
||||
}
|
||||
|
||||
private HoodieWriteConfig.Builder getWriteConfigBuilder(boolean autoCommit, boolean useFileListingMetadata, boolean enableMetrics) {
|
||||
return getWriteConfigBuilder(HoodieFailedWritesCleaningPolicy.EAGER, autoCommit, useFileListingMetadata, enableMetrics);
|
||||
private HoodieWriteConfig.Builder getWriteConfigBuilder(boolean autoCommit, boolean useFileListingMetadata, boolean enableMetrics, boolean populateMetaFields) {
|
||||
return getWriteConfigBuilder(HoodieFailedWritesCleaningPolicy.EAGER, autoCommit, useFileListingMetadata, enableMetrics, populateMetaFields);
|
||||
}
|
||||
|
||||
private HoodieWriteConfig.Builder getWriteConfigBuilder(HoodieFailedWritesCleaningPolicy policy, boolean autoCommit, boolean useFileListingMetadata, boolean enableMetrics) {
|
||||
private HoodieWriteConfig.Builder getWriteConfigBuilder(HoodieFailedWritesCleaningPolicy policy, boolean autoCommit, boolean useFileListingMetadata,
|
||||
boolean enableMetrics, boolean populateMetaFields) {
|
||||
return HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(TRIP_EXAMPLE_SCHEMA)
|
||||
.withParallelism(2, 2).withDeleteParallelism(2).withRollbackParallelism(2).withFinalizeWriteParallelism(2)
|
||||
.withAutoCommit(autoCommit)
|
||||
.withProperties(populateMetaFields ? new Properties() : getPropertiesForKeyGen())
|
||||
.withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024 * 1024)
|
||||
.withInlineCompaction(false).withMaxNumDeltaCommitsBeforeCompaction(1)
|
||||
.withFailedWritesCleaningPolicy(policy)
|
||||
@@ -1028,7 +1050,7 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
|
||||
.withEmbeddedTimelineServerEnabled(true).forTable("test-trip-table")
|
||||
.withFileSystemViewConfig(new FileSystemViewStorageConfig.Builder()
|
||||
.withEnableBackupForRemoteFileSystemView(false).build())
|
||||
.withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build())
|
||||
.withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(populateMetaFields ? HoodieIndex.IndexType.BLOOM : HoodieIndex.IndexType.SIMPLE).build())
|
||||
.withMetadataConfig(HoodieMetadataConfig.newBuilder()
|
||||
.enable(useFileListingMetadata)
|
||||
.enableMetrics(enableMetrics).build())
|
||||
|
||||
@@ -301,6 +301,13 @@ public class HoodieClientTestBase extends HoodieClientTestHarness {
|
||||
}
|
||||
}
|
||||
|
||||
public JavaRDD<WriteStatus> insertFirstBatch(HoodieWriteConfig writeConfig, SparkRDDWriteClient client, String newCommitTime,
|
||||
String initCommitTime, int numRecordsInThisCommit,
|
||||
Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieRecord>, String> writeFn, boolean isPreppedAPI,
|
||||
boolean assertForCommit, int expRecordsInThisCommit) throws Exception {
|
||||
return insertFirstBatch(writeConfig, client, newCommitTime, initCommitTime, numRecordsInThisCommit, writeFn, isPreppedAPI, assertForCommit, expRecordsInThisCommit, true);
|
||||
}
|
||||
|
||||
/**
|
||||
* Helper to insert first batch of records and do regular assertions on the state after successful completion.
|
||||
*
|
||||
@@ -319,12 +326,12 @@ public class HoodieClientTestBase extends HoodieClientTestHarness {
|
||||
public JavaRDD<WriteStatus> insertFirstBatch(HoodieWriteConfig writeConfig, SparkRDDWriteClient client, String newCommitTime,
|
||||
String initCommitTime, int numRecordsInThisCommit,
|
||||
Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieRecord>, String> writeFn, boolean isPreppedAPI,
|
||||
boolean assertForCommit, int expRecordsInThisCommit) throws Exception {
|
||||
boolean assertForCommit, int expRecordsInThisCommit, boolean filterForCommitTimeWithAssert) throws Exception {
|
||||
final Function2<List<HoodieRecord>, String, Integer> recordGenFunction =
|
||||
generateWrapRecordsFn(isPreppedAPI, writeConfig, dataGen::generateInserts);
|
||||
|
||||
return writeBatch(client, newCommitTime, initCommitTime, Option.empty(), initCommitTime, numRecordsInThisCommit,
|
||||
recordGenFunction, writeFn, assertForCommit, expRecordsInThisCommit, expRecordsInThisCommit, 1, false);
|
||||
recordGenFunction, writeFn, assertForCommit, expRecordsInThisCommit, expRecordsInThisCommit, 1, false, filterForCommitTimeWithAssert);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -355,6 +362,15 @@ public class HoodieClientTestBase extends HoodieClientTestHarness {
|
||||
recordGenFunction, writeFn, assertForCommit, expRecordsInThisCommit, expTotalRecords, expTotalCommits, false);
|
||||
}
|
||||
|
||||
public JavaRDD<WriteStatus> updateBatch(HoodieWriteConfig writeConfig, SparkRDDWriteClient client, String newCommitTime,
|
||||
String prevCommitTime, Option<List<String>> commitTimesBetweenPrevAndNew, String initCommitTime,
|
||||
int numRecordsInThisCommit,
|
||||
Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieRecord>, String> writeFn, boolean isPreppedAPI,
|
||||
boolean assertForCommit, int expRecordsInThisCommit, int expTotalRecords, int expTotalCommits) throws Exception {
|
||||
return updateBatch(writeConfig, client, newCommitTime, prevCommitTime, commitTimesBetweenPrevAndNew, initCommitTime, numRecordsInThisCommit, writeFn,
|
||||
isPreppedAPI, assertForCommit, expRecordsInThisCommit, expTotalRecords, expTotalCommits, true);
|
||||
}
|
||||
|
||||
/**
|
||||
* Helper to upsert batch of records and do regular assertions on the state after successful completion.
|
||||
*
|
||||
@@ -378,13 +394,23 @@ public class HoodieClientTestBase extends HoodieClientTestHarness {
|
||||
String prevCommitTime, Option<List<String>> commitTimesBetweenPrevAndNew, String initCommitTime,
|
||||
int numRecordsInThisCommit,
|
||||
Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieRecord>, String> writeFn, boolean isPreppedAPI,
|
||||
boolean assertForCommit, int expRecordsInThisCommit, int expTotalRecords, int expTotalCommits) throws Exception {
|
||||
boolean assertForCommit, int expRecordsInThisCommit, int expTotalRecords, int expTotalCommits,
|
||||
boolean filterForCommitTimeWithAssert) throws Exception {
|
||||
final Function2<List<HoodieRecord>, String, Integer> recordGenFunction =
|
||||
generateWrapRecordsFn(isPreppedAPI, writeConfig, dataGen::generateUniqueUpdates);
|
||||
|
||||
return writeBatch(client, newCommitTime, prevCommitTime, commitTimesBetweenPrevAndNew, initCommitTime,
|
||||
numRecordsInThisCommit, recordGenFunction, writeFn, assertForCommit, expRecordsInThisCommit, expTotalRecords,
|
||||
expTotalCommits, false);
|
||||
expTotalCommits, false, filterForCommitTimeWithAssert);
|
||||
}
|
||||
|
||||
public JavaRDD<WriteStatus> deleteBatch(HoodieWriteConfig writeConfig, SparkRDDWriteClient client, String newCommitTime,
|
||||
String prevCommitTime, String initCommitTime,
|
||||
int numRecordsInThisCommit,
|
||||
Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieKey>, String> deleteFn, boolean isPreppedAPI,
|
||||
boolean assertForCommit, int expRecordsInThisCommit, int expTotalRecords) throws Exception {
|
||||
return deleteBatch(writeConfig, client, newCommitTime, prevCommitTime, initCommitTime, numRecordsInThisCommit, deleteFn, isPreppedAPI,
|
||||
assertForCommit, expRecordsInThisCommit, expTotalRecords, true);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -408,13 +434,22 @@ public class HoodieClientTestBase extends HoodieClientTestHarness {
|
||||
String prevCommitTime, String initCommitTime,
|
||||
int numRecordsInThisCommit,
|
||||
Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieKey>, String> deleteFn, boolean isPreppedAPI,
|
||||
boolean assertForCommit, int expRecordsInThisCommit, int expTotalRecords) throws Exception {
|
||||
boolean assertForCommit, int expRecordsInThisCommit, int expTotalRecords, boolean filterForCommitTimeWithAssert) throws Exception {
|
||||
final Function<Integer, List<HoodieKey>> keyGenFunction =
|
||||
generateWrapDeleteKeysFn(isPreppedAPI, writeConfig, dataGen::generateUniqueDeletes);
|
||||
|
||||
return deleteBatch(client, newCommitTime, prevCommitTime, initCommitTime, numRecordsInThisCommit,
|
||||
keyGenFunction,
|
||||
deleteFn, assertForCommit, expRecordsInThisCommit, expTotalRecords);
|
||||
deleteFn, assertForCommit, expRecordsInThisCommit, expTotalRecords, filterForCommitTimeWithAssert);
|
||||
}
|
||||
|
||||
public JavaRDD<WriteStatus> writeBatch(SparkRDDWriteClient client, String newCommitTime, String prevCommitTime,
|
||||
Option<List<String>> commitTimesBetweenPrevAndNew, String initCommitTime, int numRecordsInThisCommit,
|
||||
Function2<List<HoodieRecord>, String, Integer> recordGenFunction,
|
||||
Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieRecord>, String> writeFn,
|
||||
boolean assertForCommit, int expRecordsInThisCommit, int expTotalRecords, int expTotalCommits, boolean doCommit) throws Exception {
|
||||
return writeBatch(client, newCommitTime, prevCommitTime, commitTimesBetweenPrevAndNew, initCommitTime, numRecordsInThisCommit, recordGenFunction,
|
||||
writeFn, assertForCommit, expRecordsInThisCommit, expTotalRecords, expTotalCommits, doCommit, true);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -439,7 +474,8 @@ public class HoodieClientTestBase extends HoodieClientTestHarness {
|
||||
Option<List<String>> commitTimesBetweenPrevAndNew, String initCommitTime, int numRecordsInThisCommit,
|
||||
Function2<List<HoodieRecord>, String, Integer> recordGenFunction,
|
||||
Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieRecord>, String> writeFn,
|
||||
boolean assertForCommit, int expRecordsInThisCommit, int expTotalRecords, int expTotalCommits, boolean doCommit) throws Exception {
|
||||
boolean assertForCommit, int expRecordsInThisCommit, int expTotalRecords, int expTotalCommits, boolean doCommit,
|
||||
boolean filterForCommitTimeWithAssert) throws Exception {
|
||||
|
||||
// Write 1 (only inserts)
|
||||
client.startCommitWithTime(newCommitTime);
|
||||
@@ -466,8 +502,10 @@ public class HoodieClientTestBase extends HoodieClientTestHarness {
|
||||
"Expecting " + expTotalCommits + " commits.");
|
||||
assertEquals(newCommitTime, timeline.lastInstant().get().getTimestamp(),
|
||||
"Latest commit should be " + newCommitTime);
|
||||
assertEquals(expRecordsInThisCommit, HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count(),
|
||||
"Must contain " + expRecordsInThisCommit + " records");
|
||||
if (filterForCommitTimeWithAssert) { // when meta cols are disabled, we can't really do per commit assertion.
|
||||
assertEquals(expRecordsInThisCommit, HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count(),
|
||||
"Must contain " + expRecordsInThisCommit + " records");
|
||||
}
|
||||
|
||||
// Check the entire dataset has all records still
|
||||
String[] fullPartitionPaths = new String[dataGen.getPartitionPaths().length];
|
||||
@@ -477,16 +515,18 @@ public class HoodieClientTestBase extends HoodieClientTestHarness {
|
||||
assertEquals(expTotalRecords, HoodieClientTestUtils.read(jsc, basePath, sqlContext, fs, fullPartitionPaths).count(),
|
||||
"Must contain " + expTotalRecords + " records");
|
||||
|
||||
// Check that the incremental consumption from prevCommitTime
|
||||
assertEquals(HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count(),
|
||||
HoodieClientTestUtils.countRecordsSince(jsc, basePath, sqlContext, timeline, prevCommitTime),
|
||||
"Incremental consumption from " + prevCommitTime + " should give all records in latest commit");
|
||||
if (commitTimesBetweenPrevAndNew.isPresent()) {
|
||||
commitTimesBetweenPrevAndNew.get().forEach(ct -> {
|
||||
assertEquals(HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count(),
|
||||
HoodieClientTestUtils.countRecordsSince(jsc, basePath, sqlContext, timeline, ct),
|
||||
"Incremental consumption from " + ct + " should give all records in latest commit");
|
||||
});
|
||||
if (filterForCommitTimeWithAssert) {
|
||||
// Check that the incremental consumption from prevCommitTime
|
||||
assertEquals(HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count(),
|
||||
HoodieClientTestUtils.countRecordsSince(jsc, basePath, sqlContext, timeline, prevCommitTime),
|
||||
"Incremental consumption from " + prevCommitTime + " should give all records in latest commit");
|
||||
if (commitTimesBetweenPrevAndNew.isPresent()) {
|
||||
commitTimesBetweenPrevAndNew.get().forEach(ct -> {
|
||||
assertEquals(HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count(),
|
||||
HoodieClientTestUtils.countRecordsSince(jsc, basePath, sqlContext, timeline, ct),
|
||||
"Incremental consumption from " + ct + " should give all records in latest commit");
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
return result;
|
||||
@@ -510,7 +550,7 @@ public class HoodieClientTestBase extends HoodieClientTestHarness {
|
||||
String initCommitTime, int numRecordsInThisCommit,
|
||||
Function<Integer, List<HoodieKey>> keyGenFunction,
|
||||
Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieKey>, String> deleteFn,
|
||||
boolean assertForCommit, int expRecordsInThisCommit, int expTotalRecords) throws Exception {
|
||||
boolean assertForCommit, int expRecordsInThisCommit, int expTotalRecords, boolean filerForCommitTimeWithAssert) throws Exception {
|
||||
|
||||
// Delete 1 (only deletes)
|
||||
client.startCommitWithTime(newCommitTime);
|
||||
@@ -534,8 +574,10 @@ public class HoodieClientTestBase extends HoodieClientTestHarness {
|
||||
"Expecting 3 commits.");
|
||||
assertEquals(newCommitTime, timeline.lastInstant().get().getTimestamp(),
|
||||
"Latest commit should be " + newCommitTime);
|
||||
assertEquals(expRecordsInThisCommit, HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count(),
|
||||
"Must contain " + expRecordsInThisCommit + " records");
|
||||
if (filerForCommitTimeWithAssert) { // if meta cols are disabled, we can't do assertion based on assertion time
|
||||
assertEquals(expRecordsInThisCommit, HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count(),
|
||||
"Must contain " + expRecordsInThisCommit + " records");
|
||||
}
|
||||
|
||||
// Check the entire dataset has all records still
|
||||
String[] fullPartitionPaths = new String[dataGen.getPartitionPaths().length];
|
||||
@@ -545,11 +587,13 @@ public class HoodieClientTestBase extends HoodieClientTestHarness {
|
||||
assertEquals(expTotalRecords, HoodieClientTestUtils.read(jsc, basePath, sqlContext, fs, fullPartitionPaths).count(),
|
||||
"Must contain " + expTotalRecords + " records");
|
||||
|
||||
// Check that the incremental consumption from prevCommitTime
|
||||
assertEquals(HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count(),
|
||||
HoodieClientTestUtils.countRecordsSince(jsc, basePath, sqlContext, timeline, prevCommitTime),
|
||||
"Incremental consumption from " + prevCommitTime + " should give no records in latest commit,"
|
||||
+ " since it is a delete operation");
|
||||
if (filerForCommitTimeWithAssert) {
|
||||
// Check that the incremental consumption from prevCommitTime
|
||||
assertEquals(HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count(),
|
||||
HoodieClientTestUtils.countRecordsSince(jsc, basePath, sqlContext, timeline, prevCommitTime),
|
||||
"Incremental consumption from " + prevCommitTime + " should give no records in latest commit,"
|
||||
+ " since it is a delete operation");
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
@@ -32,6 +32,7 @@ import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieRecordLocation;
|
||||
import org.apache.hudi.common.model.HoodieTableType;
|
||||
import org.apache.hudi.common.table.HoodieTableConfig;
|
||||
import org.apache.hudi.common.table.HoodieTableMetaClient;
|
||||
import org.apache.hudi.common.table.timeline.HoodieTimeline;
|
||||
import org.apache.hudi.common.table.view.HoodieTableFileSystemView;
|
||||
@@ -40,7 +41,9 @@ import org.apache.hudi.common.testutils.HoodieTestUtils;
|
||||
import org.apache.hudi.common.testutils.minicluster.HdfsTestService;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.common.util.collection.Pair;
|
||||
import org.apache.hudi.config.HoodieIndexConfig;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.index.HoodieIndex;
|
||||
import org.apache.hudi.table.WorkloadStat;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
@@ -56,6 +59,7 @@ import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.Properties;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.Executors;
|
||||
|
||||
@@ -225,6 +229,21 @@ public abstract class HoodieClientTestHarness extends HoodieCommonTestHarness im
|
||||
metaClient = HoodieTestUtils.init(hadoopConf, basePath, tableType);
|
||||
}
|
||||
|
||||
protected Properties getPropertiesForKeyGen() {
|
||||
Properties properties = new Properties();
|
||||
properties.put(HoodieTableConfig.HOODIE_POPULATE_META_FIELDS.key(), "false");
|
||||
properties.put("hoodie.datasource.write.recordkey.field","_row_key");
|
||||
properties.put("hoodie.datasource.write.partitionpath.field","partition_path");
|
||||
return properties;
|
||||
}
|
||||
|
||||
protected void addAppropriatePropsForPopulateMetaFields(HoodieWriteConfig.Builder configBuilder, boolean populateMetaFields) {
|
||||
if (!populateMetaFields) {
|
||||
configBuilder.withProperties(getPropertiesForKeyGen())
|
||||
.withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.SIMPLE).build());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Cleanups hoodie clients.
|
||||
*/
|
||||
|
||||
@@ -25,6 +25,9 @@
|
||||
}, {
|
||||
"name" : "_row_key",
|
||||
"type" : "string"
|
||||
}, {
|
||||
"name" : "partition_path",
|
||||
"type" : "string"
|
||||
}, {
|
||||
"name" : "rider",
|
||||
"type" : "string"
|
||||
|
||||
Reference in New Issue
Block a user