|
|
|
|
@@ -43,6 +43,7 @@ import org.apache.hudi.common.table.timeline.HoodieInstant;
|
|
|
|
|
import org.apache.hudi.common.table.timeline.HoodieTimeline;
|
|
|
|
|
import org.apache.hudi.common.testutils.HoodieTestDataGenerator;
|
|
|
|
|
import org.apache.hudi.common.testutils.HoodieTestUtils;
|
|
|
|
|
import org.apache.hudi.common.util.CollectionUtils;
|
|
|
|
|
import org.apache.hudi.common.util.Option;
|
|
|
|
|
import org.apache.hudi.common.util.StringUtils;
|
|
|
|
|
import org.apache.hudi.config.HoodieClusteringConfig;
|
|
|
|
|
@@ -103,6 +104,7 @@ import org.apache.spark.sql.types.StructField;
|
|
|
|
|
import org.junit.jupiter.api.AfterAll;
|
|
|
|
|
import org.junit.jupiter.api.AfterEach;
|
|
|
|
|
import org.junit.jupiter.api.BeforeEach;
|
|
|
|
|
import org.junit.jupiter.api.Disabled;
|
|
|
|
|
import org.junit.jupiter.api.Test;
|
|
|
|
|
import org.junit.jupiter.params.ParameterizedTest;
|
|
|
|
|
import org.junit.jupiter.params.provider.Arguments;
|
|
|
|
|
@@ -580,32 +582,32 @@ public class TestHoodieDeltaStreamer extends HoodieDeltaStreamerTestBase {
|
|
|
|
|
// Initial bulk insert
|
|
|
|
|
HoodieDeltaStreamer.Config cfg = TestHelpers.makeConfig(tableBasePath, WriteOperationType.BULK_INSERT);
|
|
|
|
|
new HoodieDeltaStreamer(cfg, jsc).sync();
|
|
|
|
|
TestHelpers.assertRecordCount(1000, tableBasePath + "/*/*.parquet", sqlContext);
|
|
|
|
|
TestHelpers.assertDistanceCount(1000, tableBasePath + "/*/*.parquet", sqlContext);
|
|
|
|
|
TestHelpers.assertRecordCount(1000, tableBasePath, sqlContext);
|
|
|
|
|
TestHelpers.assertDistanceCount(1000, tableBasePath, sqlContext);
|
|
|
|
|
TestHelpers.assertCommitMetadata("00000", tableBasePath, dfs, 1);
|
|
|
|
|
|
|
|
|
|
// No new data => no commits.
|
|
|
|
|
cfg.sourceLimit = 0;
|
|
|
|
|
new HoodieDeltaStreamer(cfg, jsc).sync();
|
|
|
|
|
TestHelpers.assertRecordCount(1000, tableBasePath + "/*/*.parquet", sqlContext);
|
|
|
|
|
TestHelpers.assertDistanceCount(1000, tableBasePath + "/*/*.parquet", sqlContext);
|
|
|
|
|
TestHelpers.assertRecordCount(1000, tableBasePath, sqlContext);
|
|
|
|
|
TestHelpers.assertDistanceCount(1000, tableBasePath, sqlContext);
|
|
|
|
|
TestHelpers.assertCommitMetadata("00000", tableBasePath, dfs, 1);
|
|
|
|
|
|
|
|
|
|
// upsert() #1
|
|
|
|
|
cfg.sourceLimit = 2000;
|
|
|
|
|
cfg.operation = WriteOperationType.UPSERT;
|
|
|
|
|
new HoodieDeltaStreamer(cfg, jsc).sync();
|
|
|
|
|
TestHelpers.assertRecordCount(1950, tableBasePath + "/*/*.parquet", sqlContext);
|
|
|
|
|
TestHelpers.assertDistanceCount(1950, tableBasePath + "/*/*.parquet", sqlContext);
|
|
|
|
|
TestHelpers.assertRecordCount(1950, tableBasePath, sqlContext);
|
|
|
|
|
TestHelpers.assertDistanceCount(1950, tableBasePath, sqlContext);
|
|
|
|
|
TestHelpers.assertCommitMetadata("00001", tableBasePath, dfs, 2);
|
|
|
|
|
List<Row> counts = TestHelpers.countsPerCommit(tableBasePath + "/*/*.parquet", sqlContext);
|
|
|
|
|
List<Row> counts = TestHelpers.countsPerCommit(tableBasePath, sqlContext);
|
|
|
|
|
assertEquals(1950, counts.stream().mapToLong(entry -> entry.getLong(1)).sum());
|
|
|
|
|
|
|
|
|
|
// Perform bootstrap with tableBasePath as source
|
|
|
|
|
String bootstrapSourcePath = dfsBasePath + "/src_bootstrapped";
|
|
|
|
|
Dataset<Row> sourceDf = sqlContext.read()
|
|
|
|
|
.format("org.apache.hudi")
|
|
|
|
|
.load(tableBasePath + "/*/*.parquet");
|
|
|
|
|
.load(tableBasePath);
|
|
|
|
|
sourceDf.write().format("parquet").save(bootstrapSourcePath);
|
|
|
|
|
|
|
|
|
|
String newDatasetBasePath = dfsBasePath + "/test_dataset_bootstrapped";
|
|
|
|
|
@@ -615,11 +617,11 @@ public class TestHoodieDeltaStreamer extends HoodieDeltaStreamerTestBase {
|
|
|
|
|
cfg.configs.add("hoodie.bootstrap.parallelism=5");
|
|
|
|
|
cfg.targetBasePath = newDatasetBasePath;
|
|
|
|
|
new HoodieDeltaStreamer(cfg, jsc).sync();
|
|
|
|
|
Dataset<Row> res = sqlContext.read().format("org.apache.hudi").load(newDatasetBasePath + "/*.parquet");
|
|
|
|
|
Dataset<Row> res = sqlContext.read().format("org.apache.hudi").load(newDatasetBasePath);
|
|
|
|
|
LOG.info("Schema :");
|
|
|
|
|
res.printSchema();
|
|
|
|
|
|
|
|
|
|
TestHelpers.assertRecordCount(1950, newDatasetBasePath + "/*.parquet", sqlContext);
|
|
|
|
|
TestHelpers.assertRecordCount(1950, newDatasetBasePath, sqlContext);
|
|
|
|
|
res.registerTempTable("bootstrapped");
|
|
|
|
|
assertEquals(1950, sqlContext.sql("select distinct _hoodie_record_key from bootstrapped").count());
|
|
|
|
|
|
|
|
|
|
@@ -646,7 +648,7 @@ public class TestHoodieDeltaStreamer extends HoodieDeltaStreamerTestBase {
|
|
|
|
|
cfg.configs.add(SparkAvroPostProcessor.Config.SPARK_AVRO_POST_PROCESSOR_PROP_ENABLE + "=false");
|
|
|
|
|
}
|
|
|
|
|
new HoodieDeltaStreamer(cfg, jsc).sync();
|
|
|
|
|
TestHelpers.assertRecordCount(1000, tableBasePath + "/*/*", sqlContext);
|
|
|
|
|
TestHelpers.assertRecordCount(1000, tableBasePath, sqlContext);
|
|
|
|
|
TestHelpers.assertCommitMetadata("00000", tableBasePath, dfs, 1);
|
|
|
|
|
|
|
|
|
|
// Upsert data produced with Schema B, pass Schema B
|
|
|
|
|
@@ -660,12 +662,12 @@ public class TestHoodieDeltaStreamer extends HoodieDeltaStreamerTestBase {
|
|
|
|
|
}
|
|
|
|
|
new HoodieDeltaStreamer(cfg, jsc).sync();
|
|
|
|
|
// out of 1000 new records, 500 are inserts, 450 are updates and 50 are deletes.
|
|
|
|
|
TestHelpers.assertRecordCount(1450, tableBasePath + "/*/*", sqlContext);
|
|
|
|
|
TestHelpers.assertRecordCount(1450, tableBasePath, sqlContext);
|
|
|
|
|
TestHelpers.assertCommitMetadata("00001", tableBasePath, dfs, 2);
|
|
|
|
|
List<Row> counts = TestHelpers.countsPerCommit(tableBasePath + "/*/*", sqlContext);
|
|
|
|
|
List<Row> counts = TestHelpers.countsPerCommit(tableBasePath, sqlContext);
|
|
|
|
|
assertEquals(1450, counts.stream().mapToLong(entry -> entry.getLong(1)).sum());
|
|
|
|
|
|
|
|
|
|
sqlContext.read().format("org.apache.hudi").load(tableBasePath + "/*/*").createOrReplaceTempView("tmp_trips");
|
|
|
|
|
sqlContext.read().format("org.apache.hudi").load(tableBasePath).createOrReplaceTempView("tmp_trips");
|
|
|
|
|
long recordCount =
|
|
|
|
|
sqlContext.sparkSession().sql("select * from tmp_trips where evoluted_optional_union_field is not NULL").count();
|
|
|
|
|
assertEquals(950, recordCount);
|
|
|
|
|
@@ -686,9 +688,9 @@ public class TestHoodieDeltaStreamer extends HoodieDeltaStreamerTestBase {
|
|
|
|
|
cfg.configs.add(DataSourceWriteOptions.RECONCILE_SCHEMA().key() + "=true");
|
|
|
|
|
new HoodieDeltaStreamer(cfg, jsc).sync();
|
|
|
|
|
// again, 1000 new records, 500 are inserts, 450 are updates and 50 are deletes.
|
|
|
|
|
TestHelpers.assertRecordCount(1900, tableBasePath + "/*/*", sqlContext);
|
|
|
|
|
TestHelpers.assertRecordCount(1900, tableBasePath, sqlContext);
|
|
|
|
|
TestHelpers.assertCommitMetadata("00002", tableBasePath, dfs, 3);
|
|
|
|
|
counts = TestHelpers.countsPerCommit(tableBasePath + "/*/*", sqlContext);
|
|
|
|
|
counts = TestHelpers.countsPerCommit(tableBasePath, sqlContext);
|
|
|
|
|
assertEquals(1900, counts.stream().mapToLong(entry -> entry.getLong(1)).sum());
|
|
|
|
|
|
|
|
|
|
TableSchemaResolver tableSchemaResolver = new TableSchemaResolver(HoodieTableMetaClient.builder().setBasePath(tableBasePath).setConf(dfs.getConf()).build());
|
|
|
|
|
@@ -736,8 +738,8 @@ public class TestHoodieDeltaStreamer extends HoodieDeltaStreamerTestBase {
|
|
|
|
|
} else {
|
|
|
|
|
TestHelpers.assertAtleastNCompactionCommits(5, tableBasePath, dfs);
|
|
|
|
|
}
|
|
|
|
|
TestHelpers.assertRecordCount(totalRecords, tableBasePath + "/*/*.parquet", sqlContext);
|
|
|
|
|
TestHelpers.assertDistanceCount(totalRecords, tableBasePath + "/*/*.parquet", sqlContext);
|
|
|
|
|
TestHelpers.assertRecordCount(totalRecords, tableBasePath, sqlContext);
|
|
|
|
|
TestHelpers.assertDistanceCount(totalRecords, tableBasePath, sqlContext);
|
|
|
|
|
return true;
|
|
|
|
|
});
|
|
|
|
|
}
|
|
|
|
|
@@ -1011,7 +1013,7 @@ public class TestHoodieDeltaStreamer extends HoodieDeltaStreamerTestBase {
|
|
|
|
|
// There should be 4 commits, one of which should be a replace commit
|
|
|
|
|
TestHelpers.assertAtLeastNCommits(4, tableBasePath, dfs);
|
|
|
|
|
TestHelpers.assertAtLeastNReplaceCommits(1, tableBasePath, dfs);
|
|
|
|
|
TestHelpers.assertDistinctRecordCount(totalRecords, tableBasePath + "/*/*.parquet", sqlContext);
|
|
|
|
|
TestHelpers.assertDistinctRecordCount(totalRecords, tableBasePath, sqlContext);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
@@ -1039,7 +1041,7 @@ public class TestHoodieDeltaStreamer extends HoodieDeltaStreamerTestBase {
|
|
|
|
|
// There should be 4 commits, one of which should be a replace commit
|
|
|
|
|
TestHelpers.assertAtLeastNCommits(4, tableBasePath, dfs);
|
|
|
|
|
TestHelpers.assertAtLeastNReplaceCommits(1, tableBasePath, dfs);
|
|
|
|
|
TestHelpers.assertDistinctRecordCount(1900, tableBasePath + "/*/*.parquet", sqlContext);
|
|
|
|
|
TestHelpers.assertDistinctRecordCount(1900, tableBasePath, sqlContext);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@Test
|
|
|
|
|
@@ -1062,7 +1064,7 @@ public class TestHoodieDeltaStreamer extends HoodieDeltaStreamerTestBase {
|
|
|
|
|
// There should be 4 commits, one of which should be a replace commit
|
|
|
|
|
TestHelpers.assertAtLeastNCommits(4, tableBasePath, dfs);
|
|
|
|
|
TestHelpers.assertAtLeastNReplaceCommits(1, tableBasePath, dfs);
|
|
|
|
|
TestHelpers.assertDistinctRecordCount(totalRecords, tableBasePath + "/*/*.parquet", sqlContext);
|
|
|
|
|
TestHelpers.assertDistinctRecordCount(totalRecords, tableBasePath, sqlContext);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@ParameterizedTest
|
|
|
|
|
@@ -1168,15 +1170,15 @@ public class TestHoodieDeltaStreamer extends HoodieDeltaStreamerTestBase {
|
|
|
|
|
String tableBasePath = dfsBasePath + "/test_table2";
|
|
|
|
|
String downstreamTableBasePath = dfsBasePath + "/test_downstream_table2";
|
|
|
|
|
|
|
|
|
|
HiveSyncConfig hiveSyncConfig = getHiveSyncConfig(tableBasePath, "hive_trips");
|
|
|
|
|
|
|
|
|
|
// Initial bulk insert to ingest to first hudi table
|
|
|
|
|
HoodieDeltaStreamer.Config cfg = TestHelpers.makeConfig(tableBasePath, WriteOperationType.BULK_INSERT,
|
|
|
|
|
Collections.singletonList(SqlQueryBasedTransformer.class.getName()), PROPS_FILENAME_TEST_SOURCE, true);
|
|
|
|
|
// NOTE: We should not have need to set below config, 'datestr' should have assumed date partitioning
|
|
|
|
|
cfg.configs.add("hoodie.datasource.hive_sync.partition_fields=year,month,day");
|
|
|
|
|
new HoodieDeltaStreamer(cfg, jsc, dfs, hiveServer.getHiveConf()).sync();
|
|
|
|
|
TestHelpers.assertRecordCount(1000, tableBasePath + "/*/*.parquet", sqlContext);
|
|
|
|
|
TestHelpers.assertDistanceCount(1000, tableBasePath + "/*/*.parquet", sqlContext);
|
|
|
|
|
TestHelpers.assertDistanceCountWithExactValue(1000, tableBasePath + "/*/*.parquet", sqlContext);
|
|
|
|
|
TestHelpers.assertRecordCount(1000, tableBasePath, sqlContext);
|
|
|
|
|
TestHelpers.assertDistanceCount(1000, tableBasePath, sqlContext);
|
|
|
|
|
TestHelpers.assertDistanceCountWithExactValue(1000, tableBasePath, sqlContext);
|
|
|
|
|
String lastInstantForUpstreamTable = TestHelpers.assertCommitMetadata("00000", tableBasePath, dfs, 1);
|
|
|
|
|
|
|
|
|
|
// Now incrementally pull from the above hudi table and ingest to second table
|
|
|
|
|
@@ -1184,17 +1186,17 @@ public class TestHoodieDeltaStreamer extends HoodieDeltaStreamerTestBase {
|
|
|
|
|
TestHelpers.makeConfigForHudiIncrSrc(tableBasePath, downstreamTableBasePath, WriteOperationType.BULK_INSERT,
|
|
|
|
|
true, null);
|
|
|
|
|
new HoodieDeltaStreamer(downstreamCfg, jsc, dfs, hiveServer.getHiveConf()).sync();
|
|
|
|
|
TestHelpers.assertRecordCount(1000, downstreamTableBasePath + "/*/*.parquet", sqlContext);
|
|
|
|
|
TestHelpers.assertDistanceCount(1000, downstreamTableBasePath + "/*/*.parquet", sqlContext);
|
|
|
|
|
TestHelpers.assertDistanceCountWithExactValue(1000, downstreamTableBasePath + "/*/*.parquet", sqlContext);
|
|
|
|
|
TestHelpers.assertRecordCount(1000, downstreamTableBasePath, sqlContext);
|
|
|
|
|
TestHelpers.assertDistanceCount(1000, downstreamTableBasePath, sqlContext);
|
|
|
|
|
TestHelpers.assertDistanceCountWithExactValue(1000, downstreamTableBasePath, sqlContext);
|
|
|
|
|
TestHelpers.assertCommitMetadata(lastInstantForUpstreamTable, downstreamTableBasePath, dfs, 1);
|
|
|
|
|
|
|
|
|
|
// No new data => no commits for upstream table
|
|
|
|
|
cfg.sourceLimit = 0;
|
|
|
|
|
new HoodieDeltaStreamer(cfg, jsc, dfs, hiveServer.getHiveConf()).sync();
|
|
|
|
|
TestHelpers.assertRecordCount(1000, tableBasePath + "/*/*.parquet", sqlContext);
|
|
|
|
|
TestHelpers.assertDistanceCount(1000, tableBasePath + "/*/*.parquet", sqlContext);
|
|
|
|
|
TestHelpers.assertDistanceCountWithExactValue(1000, tableBasePath + "/*/*.parquet", sqlContext);
|
|
|
|
|
TestHelpers.assertRecordCount(1000, tableBasePath, sqlContext);
|
|
|
|
|
TestHelpers.assertDistanceCount(1000, tableBasePath, sqlContext);
|
|
|
|
|
TestHelpers.assertDistanceCountWithExactValue(1000, tableBasePath, sqlContext);
|
|
|
|
|
TestHelpers.assertCommitMetadata("00000", tableBasePath, dfs, 1);
|
|
|
|
|
|
|
|
|
|
// with no change in upstream table, no change in downstream too when pulled.
|
|
|
|
|
@@ -1202,20 +1204,20 @@ public class TestHoodieDeltaStreamer extends HoodieDeltaStreamerTestBase {
|
|
|
|
|
TestHelpers.makeConfigForHudiIncrSrc(tableBasePath, downstreamTableBasePath,
|
|
|
|
|
WriteOperationType.BULK_INSERT, true, DummySchemaProvider.class.getName());
|
|
|
|
|
new HoodieDeltaStreamer(downstreamCfg1, jsc).sync();
|
|
|
|
|
TestHelpers.assertRecordCount(1000, downstreamTableBasePath + "/*/*.parquet", sqlContext);
|
|
|
|
|
TestHelpers.assertDistanceCount(1000, downstreamTableBasePath + "/*/*.parquet", sqlContext);
|
|
|
|
|
TestHelpers.assertDistanceCountWithExactValue(1000, downstreamTableBasePath + "/*/*.parquet", sqlContext);
|
|
|
|
|
TestHelpers.assertRecordCount(1000, downstreamTableBasePath, sqlContext);
|
|
|
|
|
TestHelpers.assertDistanceCount(1000, downstreamTableBasePath, sqlContext);
|
|
|
|
|
TestHelpers.assertDistanceCountWithExactValue(1000, downstreamTableBasePath, sqlContext);
|
|
|
|
|
TestHelpers.assertCommitMetadata(lastInstantForUpstreamTable, downstreamTableBasePath, dfs, 1);
|
|
|
|
|
|
|
|
|
|
// upsert() #1 on upstream hudi table
|
|
|
|
|
cfg.sourceLimit = 2000;
|
|
|
|
|
cfg.operation = WriteOperationType.UPSERT;
|
|
|
|
|
new HoodieDeltaStreamer(cfg, jsc, dfs, hiveServer.getHiveConf()).sync();
|
|
|
|
|
TestHelpers.assertRecordCount(1950, tableBasePath + "/*/*.parquet", sqlContext);
|
|
|
|
|
TestHelpers.assertDistanceCount(1950, tableBasePath + "/*/*.parquet", sqlContext);
|
|
|
|
|
TestHelpers.assertDistanceCountWithExactValue(1950, tableBasePath + "/*/*.parquet", sqlContext);
|
|
|
|
|
TestHelpers.assertRecordCount(1950, tableBasePath, sqlContext);
|
|
|
|
|
TestHelpers.assertDistanceCount(1950, tableBasePath, sqlContext);
|
|
|
|
|
TestHelpers.assertDistanceCountWithExactValue(1950, tableBasePath, sqlContext);
|
|
|
|
|
lastInstantForUpstreamTable = TestHelpers.assertCommitMetadata("00001", tableBasePath, dfs, 2);
|
|
|
|
|
List<Row> counts = TestHelpers.countsPerCommit(tableBasePath + "/*/*.parquet", sqlContext);
|
|
|
|
|
List<Row> counts = TestHelpers.countsPerCommit(tableBasePath, sqlContext);
|
|
|
|
|
assertEquals(1950, counts.stream().mapToLong(entry -> entry.getLong(1)).sum());
|
|
|
|
|
|
|
|
|
|
// Incrementally pull changes in upstream hudi table and apply to downstream table
|
|
|
|
|
@@ -1224,18 +1226,20 @@ public class TestHoodieDeltaStreamer extends HoodieDeltaStreamerTestBase {
|
|
|
|
|
false, null);
|
|
|
|
|
downstreamCfg.sourceLimit = 2000;
|
|
|
|
|
new HoodieDeltaStreamer(downstreamCfg, jsc).sync();
|
|
|
|
|
TestHelpers.assertRecordCount(2000, downstreamTableBasePath + "/*/*.parquet", sqlContext);
|
|
|
|
|
TestHelpers.assertDistanceCount(2000, downstreamTableBasePath + "/*/*.parquet", sqlContext);
|
|
|
|
|
TestHelpers.assertDistanceCountWithExactValue(2000, downstreamTableBasePath + "/*/*.parquet", sqlContext);
|
|
|
|
|
TestHelpers.assertRecordCount(2000, downstreamTableBasePath, sqlContext);
|
|
|
|
|
TestHelpers.assertDistanceCount(2000, downstreamTableBasePath, sqlContext);
|
|
|
|
|
TestHelpers.assertDistanceCountWithExactValue(2000, downstreamTableBasePath, sqlContext);
|
|
|
|
|
String finalInstant =
|
|
|
|
|
TestHelpers.assertCommitMetadata(lastInstantForUpstreamTable, downstreamTableBasePath, dfs, 2);
|
|
|
|
|
counts = TestHelpers.countsPerCommit(downstreamTableBasePath + "/*/*.parquet", sqlContext);
|
|
|
|
|
counts = TestHelpers.countsPerCommit(downstreamTableBasePath, sqlContext);
|
|
|
|
|
assertEquals(2000, counts.stream().mapToLong(entry -> entry.getLong(1)).sum());
|
|
|
|
|
|
|
|
|
|
// Test Hive integration
|
|
|
|
|
HiveSyncConfig hiveSyncConfig = getHiveSyncConfig(tableBasePath, "hive_trips");
|
|
|
|
|
hiveSyncConfig.partitionFields = CollectionUtils.createImmutableList("year", "month", "day");
|
|
|
|
|
HoodieHiveClient hiveClient = new HoodieHiveClient(hiveSyncConfig, hiveServer.getHiveConf(), dfs);
|
|
|
|
|
assertTrue(hiveClient.doesTableExist(hiveSyncConfig.tableName), "Table " + hiveSyncConfig.tableName + " should exist");
|
|
|
|
|
assertEquals(1, hiveClient.scanTablePartitions(hiveSyncConfig.tableName).size(),
|
|
|
|
|
assertEquals(3, hiveClient.scanTablePartitions(hiveSyncConfig.tableName).size(),
|
|
|
|
|
"Table partitions should match the number of partitions we wrote");
|
|
|
|
|
assertEquals(lastInstantForUpstreamTable,
|
|
|
|
|
hiveClient.getLastCommitTimeSynced(hiveSyncConfig.tableName).get(),
|
|
|
|
|
@@ -1259,14 +1263,14 @@ public class TestHoodieDeltaStreamer extends HoodieDeltaStreamerTestBase {
|
|
|
|
|
public void testPayloadClassUpdate() throws Exception {
|
|
|
|
|
String dataSetBasePath = dfsBasePath + "/test_dataset_mor";
|
|
|
|
|
HoodieDeltaStreamer.Config cfg = TestHelpers.makeConfig(dataSetBasePath, WriteOperationType.BULK_INSERT,
|
|
|
|
|
Collections.singletonList(SqlQueryBasedTransformer.class.getName()), PROPS_FILENAME_TEST_SOURCE, true,
|
|
|
|
|
Collections.singletonList(SqlQueryBasedTransformer.class.getName()), PROPS_FILENAME_TEST_SOURCE, false,
|
|
|
|
|
true, false, null, "MERGE_ON_READ");
|
|
|
|
|
new HoodieDeltaStreamer(cfg, jsc, dfs, hiveServer.getHiveConf()).sync();
|
|
|
|
|
TestHelpers.assertRecordCount(1000, dataSetBasePath + "/*/*.parquet", sqlContext);
|
|
|
|
|
TestHelpers.assertRecordCount(1000, dataSetBasePath, sqlContext);
|
|
|
|
|
|
|
|
|
|
//now create one more deltaStreamer instance and update payload class
|
|
|
|
|
cfg = TestHelpers.makeConfig(dataSetBasePath, WriteOperationType.BULK_INSERT,
|
|
|
|
|
Collections.singletonList(SqlQueryBasedTransformer.class.getName()), PROPS_FILENAME_TEST_SOURCE, true,
|
|
|
|
|
Collections.singletonList(SqlQueryBasedTransformer.class.getName()), PROPS_FILENAME_TEST_SOURCE, false,
|
|
|
|
|
true, true, DummyAvroPayload.class.getName(), "MERGE_ON_READ");
|
|
|
|
|
new HoodieDeltaStreamer(cfg, jsc, dfs, hiveServer.getHiveConf());
|
|
|
|
|
|
|
|
|
|
@@ -1285,14 +1289,14 @@ public class TestHoodieDeltaStreamer extends HoodieDeltaStreamerTestBase {
|
|
|
|
|
public void testPayloadClassUpdateWithCOWTable() throws Exception {
|
|
|
|
|
String dataSetBasePath = dfsBasePath + "/test_dataset_cow";
|
|
|
|
|
HoodieDeltaStreamer.Config cfg = TestHelpers.makeConfig(dataSetBasePath, WriteOperationType.BULK_INSERT,
|
|
|
|
|
Collections.singletonList(SqlQueryBasedTransformer.class.getName()), PROPS_FILENAME_TEST_SOURCE, true,
|
|
|
|
|
Collections.singletonList(SqlQueryBasedTransformer.class.getName()), PROPS_FILENAME_TEST_SOURCE, false,
|
|
|
|
|
true, false, null, null);
|
|
|
|
|
new HoodieDeltaStreamer(cfg, jsc, dfs, hiveServer.getHiveConf()).sync();
|
|
|
|
|
TestHelpers.assertRecordCount(1000, dataSetBasePath + "/*/*.parquet", sqlContext);
|
|
|
|
|
TestHelpers.assertRecordCount(1000, dataSetBasePath, sqlContext);
|
|
|
|
|
|
|
|
|
|
//now create one more deltaStreamer instance and update payload class
|
|
|
|
|
cfg = TestHelpers.makeConfig(dataSetBasePath, WriteOperationType.BULK_INSERT,
|
|
|
|
|
Collections.singletonList(SqlQueryBasedTransformer.class.getName()), PROPS_FILENAME_TEST_SOURCE, true,
|
|
|
|
|
Collections.singletonList(SqlQueryBasedTransformer.class.getName()), PROPS_FILENAME_TEST_SOURCE, false,
|
|
|
|
|
true, true, DummyAvroPayload.class.getName(), null);
|
|
|
|
|
new HoodieDeltaStreamer(cfg, jsc, dfs, hiveServer.getHiveConf());
|
|
|
|
|
|
|
|
|
|
@@ -1314,7 +1318,7 @@ public class TestHoodieDeltaStreamer extends HoodieDeltaStreamerTestBase {
|
|
|
|
|
// Initial bulk insert
|
|
|
|
|
HoodieDeltaStreamer.Config cfg = TestHelpers.makeConfig(tableBasePath, WriteOperationType.BULK_INSERT);
|
|
|
|
|
new HoodieDeltaStreamer(cfg, jsc).sync();
|
|
|
|
|
TestHelpers.assertRecordCount(1000, tableBasePath + "/*/*.parquet", sqlContext);
|
|
|
|
|
TestHelpers.assertRecordCount(1000, tableBasePath, sqlContext);
|
|
|
|
|
TestHelpers.assertCommitMetadata("00000", tableBasePath, dfs, 1);
|
|
|
|
|
|
|
|
|
|
// Generate the same 1000 records + 1000 new ones for upsert
|
|
|
|
|
@@ -1322,10 +1326,10 @@ public class TestHoodieDeltaStreamer extends HoodieDeltaStreamerTestBase {
|
|
|
|
|
cfg.sourceLimit = 2000;
|
|
|
|
|
cfg.operation = WriteOperationType.INSERT;
|
|
|
|
|
new HoodieDeltaStreamer(cfg, jsc).sync();
|
|
|
|
|
TestHelpers.assertRecordCount(2000, tableBasePath + "/*/*.parquet", sqlContext);
|
|
|
|
|
TestHelpers.assertRecordCount(2000, tableBasePath, sqlContext);
|
|
|
|
|
TestHelpers.assertCommitMetadata("00001", tableBasePath, dfs, 2);
|
|
|
|
|
// 1000 records for commit 00000 & 1000 for commit 00001
|
|
|
|
|
List<Row> counts = TestHelpers.countsPerCommit(tableBasePath + "/*/*.parquet", sqlContext);
|
|
|
|
|
List<Row> counts = TestHelpers.countsPerCommit(tableBasePath, sqlContext);
|
|
|
|
|
assertEquals(1000, counts.get(0).getLong(1));
|
|
|
|
|
assertEquals(1000, counts.get(1).getLong(1));
|
|
|
|
|
|
|
|
|
|
@@ -1394,7 +1398,7 @@ public class TestHoodieDeltaStreamer extends HoodieDeltaStreamerTestBase {
|
|
|
|
|
private void prepareParquetDFSSource(boolean useSchemaProvider, boolean hasTransformer, String sourceSchemaFile, String targetSchemaFile,
|
|
|
|
|
String propsFileName, String parquetSourceRoot, boolean addCommonProps) throws IOException {
|
|
|
|
|
prepareParquetDFSSource(useSchemaProvider, hasTransformer, sourceSchemaFile, targetSchemaFile, propsFileName, parquetSourceRoot, addCommonProps,
|
|
|
|
|
"not_there");
|
|
|
|
|
"partition_path");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
private void prepareParquetDFSSource(boolean useSchemaProvider, boolean hasTransformer, String sourceSchemaFile, String targetSchemaFile,
|
|
|
|
|
@@ -1434,7 +1438,7 @@ public class TestHoodieDeltaStreamer extends HoodieDeltaStreamerTestBase {
|
|
|
|
|
transformerClassNames, PROPS_FILENAME_TEST_PARQUET, false,
|
|
|
|
|
useSchemaProvider, 100000, false, null, null, "timestamp", null), jsc);
|
|
|
|
|
deltaStreamer.sync();
|
|
|
|
|
TestHelpers.assertRecordCount(PARQUET_NUM_RECORDS, tableBasePath + "/*/*.parquet", sqlContext);
|
|
|
|
|
TestHelpers.assertRecordCount(PARQUET_NUM_RECORDS, tableBasePath, sqlContext);
|
|
|
|
|
testNum++;
|
|
|
|
|
|
|
|
|
|
if (testEmptyBatch) {
|
|
|
|
|
@@ -1443,7 +1447,7 @@ public class TestHoodieDeltaStreamer extends HoodieDeltaStreamerTestBase {
|
|
|
|
|
TestParquetDFSSourceEmptyBatch.returnEmptyBatch = true;
|
|
|
|
|
deltaStreamer.sync();
|
|
|
|
|
// since we mimic'ed empty batch, total records should be same as first sync().
|
|
|
|
|
TestHelpers.assertRecordCount(PARQUET_NUM_RECORDS, tableBasePath + "/*/*.parquet", sqlContext);
|
|
|
|
|
TestHelpers.assertRecordCount(PARQUET_NUM_RECORDS, tableBasePath, sqlContext);
|
|
|
|
|
HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setBasePath(tableBasePath).setConf(jsc.hadoopConfiguration()).build();
|
|
|
|
|
|
|
|
|
|
// validate table schema fetches valid schema from last but one commit.
|
|
|
|
|
@@ -1460,7 +1464,7 @@ public class TestHoodieDeltaStreamer extends HoodieDeltaStreamerTestBase {
|
|
|
|
|
orcProps.setProperty("include", "base.properties");
|
|
|
|
|
orcProps.setProperty("hoodie.embed.timeline.server", "false");
|
|
|
|
|
orcProps.setProperty("hoodie.datasource.write.recordkey.field", "_row_key");
|
|
|
|
|
orcProps.setProperty("hoodie.datasource.write.partitionpath.field", "not_there");
|
|
|
|
|
orcProps.setProperty("hoodie.datasource.write.partitionpath.field", "partition_path");
|
|
|
|
|
if (useSchemaProvider) {
|
|
|
|
|
orcProps.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.file", dfsBasePath + "/" + "source.avsc");
|
|
|
|
|
if (transformerClassNames != null) {
|
|
|
|
|
@@ -1476,7 +1480,7 @@ public class TestHoodieDeltaStreamer extends HoodieDeltaStreamerTestBase {
|
|
|
|
|
transformerClassNames, PROPS_FILENAME_TEST_ORC, false,
|
|
|
|
|
useSchemaProvider, 100000, false, null, null, "timestamp", null), jsc);
|
|
|
|
|
deltaStreamer.sync();
|
|
|
|
|
TestHelpers.assertRecordCount(ORC_NUM_RECORDS, tableBasePath + "/*/*.parquet", sqlContext);
|
|
|
|
|
TestHelpers.assertRecordCount(ORC_NUM_RECORDS, tableBasePath, sqlContext);
|
|
|
|
|
testNum++;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@@ -1487,7 +1491,7 @@ public class TestHoodieDeltaStreamer extends HoodieDeltaStreamerTestBase {
|
|
|
|
|
props.setProperty("include", "base.properties");
|
|
|
|
|
props.setProperty("hoodie.embed.timeline.server", "false");
|
|
|
|
|
props.setProperty("hoodie.datasource.write.recordkey.field", "_row_key");
|
|
|
|
|
props.setProperty("hoodie.datasource.write.partitionpath.field", "not_there");
|
|
|
|
|
props.setProperty("hoodie.datasource.write.partitionpath.field", "partition_path");
|
|
|
|
|
props.setProperty("hoodie.deltastreamer.source.dfs.root", JSON_KAFKA_SOURCE_ROOT);
|
|
|
|
|
props.setProperty("hoodie.deltastreamer.source.kafka.topic", topicName);
|
|
|
|
|
props.setProperty("hoodie.deltastreamer.source.kafka.checkpoint.type", kafkaCheckpointType);
|
|
|
|
|
@@ -1640,7 +1644,7 @@ public class TestHoodieDeltaStreamer extends HoodieDeltaStreamerTestBase {
|
|
|
|
|
TypedProperties csvProps = new TypedProperties();
|
|
|
|
|
csvProps.setProperty("include", "base.properties");
|
|
|
|
|
csvProps.setProperty("hoodie.datasource.write.recordkey.field", recordKeyField);
|
|
|
|
|
csvProps.setProperty("hoodie.datasource.write.partitionpath.field", "not_there");
|
|
|
|
|
csvProps.setProperty("hoodie.datasource.write.partitionpath.field", "partition_path");
|
|
|
|
|
if (useSchemaProvider) {
|
|
|
|
|
csvProps.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.file", dfsBasePath + "/source-flattened.avsc");
|
|
|
|
|
if (hasTransformer) {
|
|
|
|
|
@@ -1681,7 +1685,7 @@ public class TestHoodieDeltaStreamer extends HoodieDeltaStreamerTestBase {
|
|
|
|
|
transformerClassNames, PROPS_FILENAME_TEST_CSV, false,
|
|
|
|
|
useSchemaProvider, 1000, false, null, null, sourceOrderingField, null), jsc);
|
|
|
|
|
deltaStreamer.sync();
|
|
|
|
|
TestHelpers.assertRecordCount(CSV_NUM_RECORDS, tableBasePath + "/*/*.parquet", sqlContext);
|
|
|
|
|
TestHelpers.assertRecordCount(CSV_NUM_RECORDS, tableBasePath, sqlContext);
|
|
|
|
|
testNum++;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@@ -1775,7 +1779,7 @@ public class TestHoodieDeltaStreamer extends HoodieDeltaStreamerTestBase {
|
|
|
|
|
sqlSourceProps.setProperty("include", "base.properties");
|
|
|
|
|
sqlSourceProps.setProperty("hoodie.embed.timeline.server", "false");
|
|
|
|
|
sqlSourceProps.setProperty("hoodie.datasource.write.recordkey.field", "_row_key");
|
|
|
|
|
sqlSourceProps.setProperty("hoodie.datasource.write.partitionpath.field", "not_there");
|
|
|
|
|
sqlSourceProps.setProperty("hoodie.datasource.write.partitionpath.field", "partition_path");
|
|
|
|
|
sqlSourceProps.setProperty("hoodie.deltastreamer.source.sql.sql.query","select * from test_sql_table");
|
|
|
|
|
|
|
|
|
|
UtilitiesTestBase.Helpers.savePropsToDFS(sqlSourceProps, dfs, dfsBasePath + "/" + PROPS_FILENAME_TEST_SQL_SOURCE);
|
|
|
|
|
@@ -1801,9 +1805,10 @@ public class TestHoodieDeltaStreamer extends HoodieDeltaStreamerTestBase {
|
|
|
|
|
Collections.emptyList(), PROPS_FILENAME_TEST_SQL_SOURCE, false,
|
|
|
|
|
false, 1000, false, null, null, "timestamp", null, true), jsc);
|
|
|
|
|
deltaStreamer.sync();
|
|
|
|
|
TestHelpers.assertRecordCount(SQL_SOURCE_NUM_RECORDS, tableBasePath + "/*/*.parquet", sqlContext);
|
|
|
|
|
TestHelpers.assertRecordCount(SQL_SOURCE_NUM_RECORDS, tableBasePath, sqlContext);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@Disabled
|
|
|
|
|
@Test
|
|
|
|
|
public void testJdbcSourceIncrementalFetchInContinuousMode() {
|
|
|
|
|
try (Connection connection = DriverManager.getConnection("jdbc:h2:mem:test_mem", "test", "jdbc")) {
|
|
|
|
|
@@ -1818,7 +1823,7 @@ public class TestHoodieDeltaStreamer extends HoodieDeltaStreamerTestBase {
|
|
|
|
|
|
|
|
|
|
props.setProperty("hoodie.datasource.write.keygenerator.class", SimpleKeyGenerator.class.getName());
|
|
|
|
|
props.setProperty("hoodie.datasource.write.recordkey.field", "ID");
|
|
|
|
|
props.setProperty("hoodie.datasource.write.partitionpath.field", "not_there");
|
|
|
|
|
props.setProperty("hoodie.datasource.write.partitionpath.field", "partition_path");
|
|
|
|
|
|
|
|
|
|
UtilitiesTestBase.Helpers.savePropsToDFS(props, dfs, dfsBasePath + "/test-jdbc-source.properties");
|
|
|
|
|
|
|
|
|
|
@@ -1835,7 +1840,7 @@ public class TestHoodieDeltaStreamer extends HoodieDeltaStreamerTestBase {
|
|
|
|
|
HoodieDeltaStreamer deltaStreamer = new HoodieDeltaStreamer(cfg, jsc);
|
|
|
|
|
deltaStreamerTestRunner(deltaStreamer, cfg, (r) -> {
|
|
|
|
|
TestHelpers.assertAtleastNCompactionCommits(numRecords / sourceLimit + ((numRecords % sourceLimit == 0) ? 0 : 1), tableBasePath, dfs);
|
|
|
|
|
TestHelpers.assertRecordCount(numRecords, tableBasePath + "/*/*.parquet", sqlContext);
|
|
|
|
|
TestHelpers.assertRecordCount(numRecords, tableBasePath, sqlContext);
|
|
|
|
|
return true;
|
|
|
|
|
});
|
|
|
|
|
} catch (Exception e) {
|
|
|
|
|
@@ -1857,7 +1862,7 @@ public class TestHoodieDeltaStreamer extends HoodieDeltaStreamerTestBase {
|
|
|
|
|
insertInTable(tableBasePath, 9, WriteOperationType.UPSERT);
|
|
|
|
|
//No change as this fails with Path not exist error
|
|
|
|
|
assertThrows(org.apache.spark.sql.AnalysisException.class, () -> new HoodieDeltaStreamer(downstreamCfg, jsc).sync());
|
|
|
|
|
TestHelpers.assertRecordCount(1000, downstreamTableBasePath + "/*/*", sqlContext);
|
|
|
|
|
TestHelpers.assertRecordCount(1000, downstreamTableBasePath, sqlContext);
|
|
|
|
|
|
|
|
|
|
if (downstreamCfg.configs == null) {
|
|
|
|
|
downstreamCfg.configs = new ArrayList<>();
|
|
|
|
|
@@ -1870,8 +1875,8 @@ public class TestHoodieDeltaStreamer extends HoodieDeltaStreamerTestBase {
|
|
|
|
|
new HoodieDeltaStreamer(downstreamCfg, jsc).sync();
|
|
|
|
|
new HoodieDeltaStreamer(downstreamCfg, jsc).sync();
|
|
|
|
|
|
|
|
|
|
long baseTableRecords = sqlContext.read().format("org.apache.hudi").load(tableBasePath + "/*/*.parquet").count();
|
|
|
|
|
long downStreamTableRecords = sqlContext.read().format("org.apache.hudi").load(downstreamTableBasePath + "/*/*.parquet").count();
|
|
|
|
|
long baseTableRecords = sqlContext.read().format("org.apache.hudi").load(tableBasePath).count();
|
|
|
|
|
long downStreamTableRecords = sqlContext.read().format("org.apache.hudi").load(downstreamTableBasePath).count();
|
|
|
|
|
assertEquals(baseTableRecords, downStreamTableRecords);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@@ -1930,8 +1935,8 @@ public class TestHoodieDeltaStreamer extends HoodieDeltaStreamerTestBase {
|
|
|
|
|
// Initial insert
|
|
|
|
|
HoodieDeltaStreamer.Config cfg = TestHelpers.makeConfig(tableBasePath, WriteOperationType.BULK_INSERT);
|
|
|
|
|
new HoodieDeltaStreamer(cfg, jsc).sync();
|
|
|
|
|
TestHelpers.assertRecordCount(1000, tableBasePath + "/*/*.parquet", sqlContext);
|
|
|
|
|
TestHelpers.assertDistanceCount(1000, tableBasePath + "/*/*.parquet", sqlContext);
|
|
|
|
|
TestHelpers.assertRecordCount(1000, tableBasePath, sqlContext);
|
|
|
|
|
TestHelpers.assertDistanceCount(1000, tableBasePath, sqlContext);
|
|
|
|
|
TestHelpers.assertCommitMetadata("00000", tableBasePath, dfs, 1);
|
|
|
|
|
|
|
|
|
|
// setting the operationType
|
|
|
|
|
@@ -1939,14 +1944,14 @@ public class TestHoodieDeltaStreamer extends HoodieDeltaStreamerTestBase {
|
|
|
|
|
// No new data => no commits.
|
|
|
|
|
cfg.sourceLimit = 0;
|
|
|
|
|
new HoodieDeltaStreamer(cfg, jsc).sync();
|
|
|
|
|
TestHelpers.assertRecordCount(1000, tableBasePath + "/*/*.parquet", sqlContext);
|
|
|
|
|
TestHelpers.assertDistanceCount(1000, tableBasePath + "/*/*.parquet", sqlContext);
|
|
|
|
|
TestHelpers.assertRecordCount(1000, tableBasePath, sqlContext);
|
|
|
|
|
TestHelpers.assertDistanceCount(1000, tableBasePath, sqlContext);
|
|
|
|
|
TestHelpers.assertCommitMetadata("00000", tableBasePath, dfs, 1);
|
|
|
|
|
|
|
|
|
|
cfg.sourceLimit = 1000;
|
|
|
|
|
new HoodieDeltaStreamer(cfg, jsc).sync();
|
|
|
|
|
TestHelpers.assertRecordCount(1950, tableBasePath + "/*/*.parquet", sqlContext);
|
|
|
|
|
TestHelpers.assertDistanceCount(1950, tableBasePath + "/*/*.parquet", sqlContext);
|
|
|
|
|
TestHelpers.assertRecordCount(1950, tableBasePath, sqlContext);
|
|
|
|
|
TestHelpers.assertDistanceCount(1950, tableBasePath, sqlContext);
|
|
|
|
|
TestHelpers.assertCommitMetadata("00001", tableBasePath, dfs, 2);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|