[HUDI-377] Adding Delete() support to DeltaStreamer (#1073)
- Provides ability to perform hard deletes by writing delete marker records into the source data - if the record contains a special field _hoodie_delete_marker set to true, deletes are performed
This commit is contained in:
committed by
vinoth chandar
parent
726ae47ce2
commit
7031445eb3
@@ -179,7 +179,7 @@ public class TestHoodieDeltaStreamer extends UtilitiesTestBase {
|
||||
}
|
||||
|
||||
static HoodieDeltaStreamer.Config makeConfig(String basePath, Operation op, String transformerClassName,
|
||||
String propsFilename, boolean enableHiveSync, boolean useSchemaProviderClass) {
|
||||
String propsFilename, boolean enableHiveSync, boolean useSchemaProviderClass) {
|
||||
HoodieDeltaStreamer.Config cfg = new HoodieDeltaStreamer.Config();
|
||||
cfg.targetBasePath = basePath;
|
||||
cfg.targetTableName = "hoodie_trips";
|
||||
@@ -198,7 +198,7 @@ public class TestHoodieDeltaStreamer extends UtilitiesTestBase {
|
||||
}
|
||||
|
||||
static HoodieDeltaStreamer.Config makeConfigForHudiIncrSrc(String srcBasePath, String basePath, Operation op,
|
||||
boolean addReadLatestOnMissingCkpt, String schemaProviderClassName) {
|
||||
boolean addReadLatestOnMissingCkpt, String schemaProviderClassName) {
|
||||
HoodieDeltaStreamer.Config cfg = new HoodieDeltaStreamer.Config();
|
||||
cfg.targetBasePath = basePath;
|
||||
cfg.targetTableName = "hoodie_trips_copy";
|
||||
@@ -352,11 +352,11 @@ public class TestHoodieDeltaStreamer extends UtilitiesTestBase {
|
||||
cfg.sourceLimit = 2000;
|
||||
cfg.operation = Operation.UPSERT;
|
||||
new HoodieDeltaStreamer(cfg, jsc).sync();
|
||||
TestHelpers.assertRecordCount(2000, datasetBasePath + "/*/*.parquet", sqlContext);
|
||||
TestHelpers.assertDistanceCount(2000, datasetBasePath + "/*/*.parquet", sqlContext);
|
||||
TestHelpers.assertRecordCount(1950, datasetBasePath + "/*/*.parquet", sqlContext);
|
||||
TestHelpers.assertDistanceCount(1950, datasetBasePath + "/*/*.parquet", sqlContext);
|
||||
TestHelpers.assertCommitMetadata("00001", datasetBasePath, dfs, 2);
|
||||
List<Row> counts = TestHelpers.countsPerCommit(datasetBasePath + "/*/*.parquet", sqlContext);
|
||||
assertEquals(2000, counts.get(0).getLong(1));
|
||||
assertEquals(1950, counts.stream().mapToLong(entry -> entry.getLong(1)).sum());
|
||||
}
|
||||
|
||||
@Test
|
||||
@@ -396,8 +396,8 @@ public class TestHoodieDeltaStreamer extends UtilitiesTestBase {
|
||||
} else {
|
||||
TestHelpers.assertAtleastNCompactionCommits(5, datasetBasePath, dfs);
|
||||
}
|
||||
TestHelpers.assertRecordCount(totalRecords, datasetBasePath + "/*/*.parquet", sqlContext);
|
||||
TestHelpers.assertDistanceCount(totalRecords, datasetBasePath + "/*/*.parquet", sqlContext);
|
||||
TestHelpers.assertRecordCount(totalRecords + 200, datasetBasePath + "/*/*.parquet", sqlContext);
|
||||
TestHelpers.assertDistanceCount(totalRecords + 200, datasetBasePath + "/*/*.parquet", sqlContext);
|
||||
return true;
|
||||
}, 180);
|
||||
ds.shutdownGracefully();
|
||||
@@ -457,12 +457,12 @@ public class TestHoodieDeltaStreamer extends UtilitiesTestBase {
|
||||
cfg.sourceLimit = 2000;
|
||||
cfg.operation = Operation.UPSERT;
|
||||
new HoodieDeltaStreamer(cfg, jsc, dfs, hiveServer.getHiveConf()).sync();
|
||||
TestHelpers.assertRecordCount(2000, datasetBasePath + "/*/*.parquet", sqlContext);
|
||||
TestHelpers.assertDistanceCount(2000, datasetBasePath + "/*/*.parquet", sqlContext);
|
||||
TestHelpers.assertDistanceCountWithExactValue(2000, datasetBasePath + "/*/*.parquet", sqlContext);
|
||||
TestHelpers.assertRecordCount(1950, datasetBasePath + "/*/*.parquet", sqlContext);
|
||||
TestHelpers.assertDistanceCount(1950, datasetBasePath + "/*/*.parquet", sqlContext);
|
||||
TestHelpers.assertDistanceCountWithExactValue(1950, datasetBasePath + "/*/*.parquet", sqlContext);
|
||||
lastInstantForUpstreamTable = TestHelpers.assertCommitMetadata("00001", datasetBasePath, dfs, 2);
|
||||
List<Row> counts = TestHelpers.countsPerCommit(datasetBasePath + "/*/*.parquet", sqlContext);
|
||||
assertEquals(2000, counts.get(0).getLong(1));
|
||||
assertEquals(1950, counts.stream().mapToLong(entry -> entry.getLong(1)).sum());
|
||||
|
||||
// Incrementally pull changes in upstream hudi table and apply to downstream table
|
||||
downstreamCfg =
|
||||
@@ -476,7 +476,7 @@ public class TestHoodieDeltaStreamer extends UtilitiesTestBase {
|
||||
String finalInstant =
|
||||
TestHelpers.assertCommitMetadata(lastInstantForUpstreamTable, downstreamDatasetBasePath, dfs, 2);
|
||||
counts = TestHelpers.countsPerCommit(downstreamDatasetBasePath + "/*/*.parquet", sqlContext);
|
||||
assertEquals(2000, counts.get(0).getLong(1));
|
||||
assertEquals(2000, counts.stream().mapToLong(entry -> entry.getLong(1)).sum());
|
||||
|
||||
// Test Hive integration
|
||||
HoodieHiveClient hiveClient = new HoodieHiveClient(hiveSyncConfig, hiveServer.getHiveConf(), dfs);
|
||||
@@ -566,12 +566,11 @@ public class TestHoodieDeltaStreamer extends UtilitiesTestBase {
|
||||
|
||||
/**
|
||||
* Returns some random number as distance between the points.
|
||||
*
|
||||
*
|
||||
* @param lat1 Latitiude of source
|
||||
* @param lat2 Latitude of destination
|
||||
* @param lon1 Longitude of source
|
||||
* @param lon2 Longitude of destination
|
||||
* @return
|
||||
*/
|
||||
@Override
|
||||
public Double call(Double lat1, Double lat2, Double lon1, Double lon2) {
|
||||
@@ -586,7 +585,7 @@ public class TestHoodieDeltaStreamer extends UtilitiesTestBase {
|
||||
|
||||
@Override
|
||||
public Dataset<Row> apply(JavaSparkContext jsc, SparkSession sparkSession, Dataset<Row> rowDataset,
|
||||
TypedProperties properties) {
|
||||
TypedProperties properties) {
|
||||
rowDataset.sqlContext().udf().register("distance_udf", new DistanceUDF(), DataTypes.DoubleType);
|
||||
return rowDataset.withColumn("haversine_distance", functions.callUDF("distance_udf", functions.col("begin_lat"),
|
||||
functions.col("end_lat"), functions.col("begin_lon"), functions.col("end_lat")));
|
||||
@@ -607,7 +606,7 @@ public class TestHoodieDeltaStreamer extends UtilitiesTestBase {
|
||||
|
||||
@Override
|
||||
public Dataset apply(JavaSparkContext jsc, SparkSession sparkSession, Dataset<Row> rowDataset,
|
||||
TypedProperties properties) {
|
||||
TypedProperties properties) {
|
||||
System.out.println("DropAllTransformer called !!");
|
||||
return sparkSession.createDataFrame(jsc.emptyRDD(), rowDataset.schema());
|
||||
}
|
||||
|
||||
@@ -76,12 +76,12 @@ public abstract class AbstractBaseTestSource extends AvroSource {
|
||||
}
|
||||
|
||||
protected AbstractBaseTestSource(TypedProperties props, JavaSparkContext sparkContext, SparkSession sparkSession,
|
||||
SchemaProvider schemaProvider) {
|
||||
SchemaProvider schemaProvider) {
|
||||
super(props, sparkContext, sparkSession, schemaProvider);
|
||||
}
|
||||
|
||||
protected static Stream<GenericRecord> fetchNextBatch(TypedProperties props, int sourceLimit, String commitTime,
|
||||
int partition) {
|
||||
int partition) {
|
||||
int maxUniqueKeys =
|
||||
props.getInteger(TestSourceConfig.MAX_UNIQUE_RECORDS_PROP, TestSourceConfig.DEFAULT_MAX_UNIQUE_RECORDS);
|
||||
|
||||
@@ -94,10 +94,12 @@ public abstract class AbstractBaseTestSource extends AvroSource {
|
||||
int numUpdates = Math.min(numExistingKeys, sourceLimit / 2);
|
||||
int numInserts = sourceLimit - numUpdates;
|
||||
LOG.info("Before adjustments => numInserts=" + numInserts + ", numUpdates=" + numUpdates);
|
||||
boolean reachedMax = false;
|
||||
|
||||
if (numInserts + numExistingKeys > maxUniqueKeys) {
|
||||
// Limit inserts so that maxUniqueRecords is maintained
|
||||
numInserts = Math.max(0, maxUniqueKeys - numExistingKeys);
|
||||
reachedMax = true;
|
||||
}
|
||||
|
||||
if ((numInserts + numUpdates) < sourceLimit) {
|
||||
@@ -105,16 +107,25 @@ public abstract class AbstractBaseTestSource extends AvroSource {
|
||||
numUpdates = Math.min(numExistingKeys, sourceLimit - numInserts);
|
||||
}
|
||||
|
||||
LOG.info("NumInserts=" + numInserts + ", NumUpdates=" + numUpdates + ", maxUniqueRecords=" + maxUniqueKeys);
|
||||
Stream<GenericRecord> deleteStream = Stream.empty();
|
||||
Stream<GenericRecord> updateStream;
|
||||
long memoryUsage1 = Runtime.getRuntime().totalMemory() - Runtime.getRuntime().freeMemory();
|
||||
LOG.info("Before DataGen. Memory Usage=" + memoryUsage1 + ", Total Memory=" + Runtime.getRuntime().totalMemory()
|
||||
+ ", Free Memory=" + Runtime.getRuntime().freeMemory());
|
||||
|
||||
Stream<GenericRecord> updateStream = dataGenerator.generateUniqueUpdatesStream(commitTime, numUpdates)
|
||||
.map(hr -> AbstractBaseTestSource.toGenericRecord(hr, dataGenerator));
|
||||
if (!reachedMax && numUpdates >= 50) {
|
||||
LOG.info("After adjustments => NumInserts=" + numInserts + ", NumUpdates=" + (numUpdates - 50) + ", NumDeletes=50, maxUniqueRecords="
|
||||
+ maxUniqueKeys);
|
||||
// if we generate update followed by deletes -> some keys in update batch might be picked up for deletes. Hence generating delete batch followed by updates
|
||||
deleteStream = dataGenerator.generateUniqueDeleteRecordStream(commitTime, 50).map(hr -> AbstractBaseTestSource.toGenericRecord(hr, dataGenerator));
|
||||
updateStream = dataGenerator.generateUniqueUpdatesStream(commitTime, numUpdates - 50).map(hr -> AbstractBaseTestSource.toGenericRecord(hr, dataGenerator));
|
||||
} else {
|
||||
LOG.info("After adjustments => NumInserts=" + numInserts + ", NumUpdates=" + numUpdates + ", maxUniqueRecords=" + maxUniqueKeys);
|
||||
updateStream = dataGenerator.generateUniqueUpdatesStream(commitTime, numUpdates)
|
||||
.map(hr -> AbstractBaseTestSource.toGenericRecord(hr, dataGenerator));
|
||||
}
|
||||
Stream<GenericRecord> insertStream = dataGenerator.generateInsertsStream(commitTime, numInserts)
|
||||
.map(hr -> AbstractBaseTestSource.toGenericRecord(hr, dataGenerator));
|
||||
return Stream.concat(updateStream, insertStream);
|
||||
return Stream.concat(deleteStream, Stream.concat(updateStream, insertStream));
|
||||
}
|
||||
|
||||
private static GenericRecord toGenericRecord(HoodieRecord hoodieRecord, HoodieTestDataGenerator dataGenerator) {
|
||||
|
||||
@@ -43,9 +43,15 @@
|
||||
}, {
|
||||
"name" : "end_lon",
|
||||
"type" : "double"
|
||||
}, {
|
||||
},
|
||||
{
|
||||
"name" : "fare",
|
||||
"type" : "double"
|
||||
},
|
||||
{
|
||||
"name" : "_hoodie_is_deleted",
|
||||
"type" : "boolean",
|
||||
"default" : false
|
||||
} ]
|
||||
}
|
||||
|
||||
|
||||
@@ -16,4 +16,4 @@
|
||||
# limitations under the License.
|
||||
###
|
||||
include=base.properties
|
||||
hoodie.deltastreamer.transformer.sql=SELECT a.timestamp, a._row_key, a.rider, a.driver, a.begin_lat, a.begin_lon, a.end_lat, a.end_lon, a.fare, CAST(1.0 AS DOUBLE) AS haversine_distance FROM <SRC> a
|
||||
hoodie.deltastreamer.transformer.sql=SELECT a.timestamp, a._row_key, a.rider, a.driver, a.begin_lat, a.begin_lon, a.end_lat, a.end_lon, a.fare, a.`_hoodie_is_deleted`, CAST(1.0 AS DOUBLE) AS haversine_distance FROM <SRC> a
|
||||
|
||||
@@ -46,9 +46,15 @@
|
||||
}, {
|
||||
"name" : "fare",
|
||||
"type" : "double"
|
||||
}, {
|
||||
"name" : "haversine_distance",
|
||||
"type" : "double"
|
||||
},
|
||||
{
|
||||
"name" : "_hoodie_is_deleted",
|
||||
"type" : "boolean",
|
||||
"default" : false
|
||||
},
|
||||
{
|
||||
"name" : "haversine_distance",
|
||||
"type" : "double"
|
||||
}]
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user