From 3a0044216cb2f707639d48e2869f4ee6f25cfc19 Mon Sep 17 00:00:00 2001 From: Balaji Varadarajan Date: Wed, 10 Oct 2018 10:31:34 -0700 Subject: [PATCH] New Features in DeltaStreamer : (1) Apply transformation when using delta-streamer to ingest data. (2) Add Hudi Incremental Source for Delta Streamer (3) Allow delta-streamer config-property to be passed as command-line (4) Add Hive Integration to Delta-Streamer and address Review comments (5) Ensure MultiPartKeysValueExtractor handle hive style partition description (6) Reuse same spark session on both source and transformer (7) Support extracting partition fields from _hoodie_partition_path for HoodieIncrSource (8) Reuse Binary Avro coders (9) Add push down filter for Incremental source (10) Add Hoodie DeltaStreamer metrics to track total time taken --- docs/incremental_processing.md | 8 +- docs/quickstart.md | 4 +- .../cli/commands/ArchivedCommitsCommand.java | 88 +++++++ hoodie-client/pom.xml | 42 +--- .../uber/hoodie/metrics/HoodieMetrics.java | 16 +- .../java/com/uber/hoodie/metrics/Metrics.java | 16 ++ .../common/HoodieTestDataGenerator.java | 5 +- .../uber/hoodie/io/TestHoodieCompactor.java | 3 +- .../hoodie/metrics/TestHoodieMetrics.java | 3 +- .../hoodie/table/TestMergeOnReadTable.java | 4 +- .../src/main/avro/HoodieCommitMetadata.avsc | 42 ++-- .../hoodie/common/model/HoodieRecord.java | 10 + .../util/DFSPropertiesConfiguration.java | 18 +- .../hoodie/common/util/HoodieAvroUtils.java | 12 +- .../hoodie/common/util/TypedProperties.java | 10 + .../util/collection/TestDiskBasedMap.java | 209 +++++++++++++++ hoodie-hadoop-mr/pom.xml | 70 ++---- hoodie-hive/pom.xml | 84 ++----- .../uber/hoodie/hive/HoodieHiveClient.java | 26 +- .../hive/MultiPartKeysValueExtractor.java | 12 +- .../hoodie/hive/PartitionValueExtractor.java | 3 +- ...lashEncodedDayPartitionValueExtractor.java | 11 +- hoodie-spark/pom.xml | 87 ++----- .../java/com/uber/hoodie/DataSourceUtils.java | 40 +++ 
.../com/uber/hoodie/AvroConversionUtils.scala | 206 ++++++++++++++- .../com/uber/hoodie/DataSourceOptions.scala | 9 +- .../com/uber/hoodie/IncrementalRelation.scala | 37 ++- .../src/test/scala/DataSourceTest.scala | 1 - hoodie-utilities/pom.xml | 138 +++++----- .../uber/hoodie/utilities/HoodieCleaner.java | 115 +++++++++ .../uber/hoodie/utilities/UtilHelpers.java | 33 ++- .../deltastreamer/HoodieDeltaStreamer.java | 230 ++++++++++++++--- .../HoodieDeltaStreamerMetrics.java | 61 +++++ .../deltastreamer/SourceFormatAdapter.java | 112 +++++++++ .../schema/RowBasedSchemaProvider.java | 25 ++ .../schema/SchemaRegistryProvider.java | 28 ++- .../utilities/sources/AvroDFSSource.java | 26 +- .../utilities/sources/AvroKafkaSource.java | 42 +++- .../hoodie/utilities/sources/AvroSource.java | 36 +++ .../utilities/sources/HiveIncrPullSource.java | 25 +- .../utilities/sources/HoodieIncrSource.java | 144 +++++++++++ .../hoodie/utilities/sources/InputBatch.java | 54 ++++ .../utilities/sources/JsonDFSSource.java | 28 ++- .../utilities/sources/JsonKafkaSource.java | 37 ++- .../hoodie/utilities/sources/JsonSource.java | 35 +++ .../hoodie/utilities/sources/RowSource.java | 51 ++++ .../uber/hoodie/utilities/sources/Source.java | 69 +++-- .../sources/{ => helpers}/AvroConvertor.java | 6 +- .../DFSPathSelector.java} | 33 +-- .../sources/helpers/IncrSourceHelper.java | 88 +++++++ .../KafkaOffsetGen.java} | 62 ++--- .../transform/IdentityTransformer.java | 37 +++ .../transform/SqlQueryBasedTransformer.java | 66 +++++ .../utilities/transform/Transformer.java | 43 ++++ .../utilities/TestHoodieDeltaStreamer.java | 238 +++++++++++++++++- .../hoodie/utilities/UtilitiesTestBase.java | 60 +++++ .../utilities/sources/TestDFSSource.java | 53 ++-- .../utilities/sources/TestDataSource.java | 16 +- .../utilities/sources/TestKafkaSource.java | 51 ++-- .../sql-transformer.properties | 19 ++ .../delta-streamer-config/target.avsc | 37 +++ packaging/hoodie-hadoop-mr-bundle/pom.xml | 154 ++++-------- 
packaging/hoodie-hive-bundle/pom.xml | 89 ++----- packaging/hoodie-spark-bundle/pom.xml | 89 ++----- pom.xml | 157 ++++-------- 65 files changed, 2752 insertions(+), 911 deletions(-) create mode 100644 hoodie-common/src/test/java/com/uber/hoodie/common/util/collection/TestDiskBasedMap.java create mode 100644 hoodie-utilities/src/main/java/com/uber/hoodie/utilities/HoodieCleaner.java create mode 100644 hoodie-utilities/src/main/java/com/uber/hoodie/utilities/deltastreamer/HoodieDeltaStreamerMetrics.java create mode 100644 hoodie-utilities/src/main/java/com/uber/hoodie/utilities/deltastreamer/SourceFormatAdapter.java create mode 100644 hoodie-utilities/src/main/java/com/uber/hoodie/utilities/schema/RowBasedSchemaProvider.java create mode 100644 hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/AvroSource.java create mode 100644 hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/HoodieIncrSource.java create mode 100644 hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/InputBatch.java create mode 100644 hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/JsonSource.java create mode 100644 hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/RowSource.java rename hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/{ => helpers}/AvroConvertor.java (93%) rename hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/{DFSSource.java => helpers/DFSPathSelector.java} (76%) create mode 100644 hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/helpers/IncrSourceHelper.java rename hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/{KafkaSource.java => helpers/KafkaOffsetGen.java} (84%) create mode 100644 hoodie-utilities/src/main/java/com/uber/hoodie/utilities/transform/IdentityTransformer.java create mode 100644 hoodie-utilities/src/main/java/com/uber/hoodie/utilities/transform/SqlQueryBasedTransformer.java create mode 100644 
hoodie-utilities/src/main/java/com/uber/hoodie/utilities/transform/Transformer.java create mode 100644 hoodie-utilities/src/test/resources/delta-streamer-config/sql-transformer.properties create mode 100644 hoodie-utilities/src/test/resources/delta-streamer-config/target.avsc diff --git a/docs/incremental_processing.md b/docs/incremental_processing.md index 973875a0b..9a65ccfe1 100644 --- a/docs/incremental_processing.md +++ b/docs/incremental_processing.md @@ -85,8 +85,12 @@ Usage:
[options] exist first time around. If exists, expected to be a hoodie dataset) * --target-table name of the target table in Hive - - + --transformer-class + subclass of com.uber.hoodie.utilities.transform.Transformer. UDF to + transform raw source dataset to a target dataset (conforming to target + schema) before writing. Default : Not set. E:g - + com.uber.hoodie.utilities.transform.SqlQueryBasedTransformer (which + allows a SQL query template to be passed as a transformation function) ``` diff --git a/docs/quickstart.md b/docs/quickstart.md index 41ec9a96d..bb7be6edd 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -33,7 +33,7 @@ Hoodie requires Java 8 to be installed. Hoodie works with Spark-2.x versions. We | Hadoop | Hive | Spark | Instructions to Build Hoodie | | ---- | ----- | ---- | ---- | -| 2.6.0-cdh5.7.2 | 1.1.0-cdh5.7.2 | spark-2.[1-3].x | Use "mvn clean install -DskipTests -Dhive11". Jars will have ".hive11" as suffix | +| 2.6.0-cdh5.7.2 | 1.1.0-cdh5.7.2 | spark-2.[1-3].x | Use "mvn clean install -DskipTests -Dhadoop.version=2.6.0-cdh5.7.2 -Dhive.version=1.1.0-cdh5.7.2" | | Apache hadoop-2.8.4 | Apache hive-2.3.3 | spark-2.[1-3].x | Use "mvn clean install -DskipTests" | | Apache hadoop-2.7.3 | Apache hive-1.2.1 | spark-2.[1-3].x | Use "mvn clean install -DskipTests" | @@ -1244,4 +1244,4 @@ cd docker [INFO] Finished at: 2018-09-10T17:47:37-07:00 [INFO] Final Memory: 236M/1848M [INFO] ------------------------------------------------------------------------ -``` \ No newline at end of file +``` diff --git a/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/ArchivedCommitsCommand.java b/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/ArchivedCommitsCommand.java index 5f4d5de0d..49fcc5919 100644 --- a/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/ArchivedCommitsCommand.java +++ b/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/ArchivedCommitsCommand.java @@ -17,6 +17,7 @@ package com.uber.hoodie.cli.commands; import 
com.uber.hoodie.avro.model.HoodieArchivedMetaEntry; +import com.uber.hoodie.avro.model.HoodieCommitMetadata; import com.uber.hoodie.cli.HoodieCLI; import com.uber.hoodie.cli.HoodiePrintHelper; import com.uber.hoodie.cli.TableHeader; @@ -32,6 +33,7 @@ import java.util.List; import java.util.stream.Collectors; import org.apache.avro.generic.GenericRecord; import org.apache.avro.generic.IndexedRecord; +import org.apache.avro.specific.SpecificData; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.Path; import org.springframework.shell.core.CommandMarker; @@ -48,6 +50,92 @@ public class ArchivedCommitsCommand implements CommandMarker { return HoodieCLI.tableMetadata != null; } + @CliCommand(value = "show archived commit stats", help = "Read commits from archived files and show details") + public String showArchivedCommits( + @CliOption(key = {"archiveFolderPattern"}, help = "Archive Folder", unspecifiedDefaultValue = "") String folder, + @CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit, + @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField, + @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending, + @CliOption(key = { + "headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") final boolean headerOnly) + throws IOException { + System.out.println("===============> Showing only " + limit + " archived commits <==============="); + String basePath = HoodieCLI.tableMetadata.getBasePath(); + Path archivePath = new Path(basePath + "/.hoodie/.commits_.archive*"); + if (folder != null && !folder.isEmpty()) { + archivePath = new Path(basePath + "/.hoodie/" + folder); + } + FileStatus[] fsStatuses = FSUtils.getFs(basePath, HoodieCLI.conf).globStatus(archivePath); + List allStats = new ArrayList<>(); + for (FileStatus fs : fsStatuses) { + //read the archived file + 
HoodieLogFormat.Reader reader = HoodieLogFormat.newReader(FSUtils.getFs(basePath, HoodieCLI.conf), + new HoodieLogFile(fs.getPath()), HoodieArchivedMetaEntry.getClassSchema()); + + List readRecords = new ArrayList<>(); + //read the avro blocks + while (reader.hasNext()) { + HoodieAvroDataBlock blk = (HoodieAvroDataBlock) reader.next(); + List records = blk.getRecords(); + readRecords.addAll(records); + } + List readCommits = readRecords.stream().map(r -> (GenericRecord) r) + .filter(r -> r.get("actionType").toString().equals(HoodieTimeline.COMMIT_ACTION) + || r.get("actionType").toString().equals(HoodieTimeline.DELTA_COMMIT_ACTION)) + .flatMap(r -> { + HoodieCommitMetadata metadata = + (HoodieCommitMetadata) SpecificData.get().deepCopy(HoodieCommitMetadata.SCHEMA$, + r.get("hoodieCommitMetadata")); + final String instantTime = r.get("commitTime").toString(); + final String action = r.get("actionType").toString(); + return metadata.getPartitionToWriteStats().values().stream().flatMap(hoodieWriteStats -> { + return hoodieWriteStats.stream().map(hoodieWriteStat -> { + List row = new ArrayList<>(); + row.add(action); + row.add(instantTime); + row.add(hoodieWriteStat.getPartitionPath()); + row.add(hoodieWriteStat.getFileId()); + row.add(hoodieWriteStat.getPrevCommit()); + row.add(hoodieWriteStat.getNumWrites()); + row.add(hoodieWriteStat.getNumInserts()); + row.add(hoodieWriteStat.getNumDeletes()); + row.add(hoodieWriteStat.getNumUpdateWrites()); + row.add(hoodieWriteStat.getTotalLogFiles()); + row.add(hoodieWriteStat.getTotalLogBlocks()); + row.add(hoodieWriteStat.getTotalCorruptLogBlock()); + row.add(hoodieWriteStat.getTotalRollbackBlocks()); + row.add(hoodieWriteStat.getTotalLogRecords()); + row.add(hoodieWriteStat.getTotalUpdatedRecordsCompacted()); + row.add(hoodieWriteStat.getTotalWriteBytes()); + row.add(hoodieWriteStat.getTotalWriteErrors()); + return row; + }); + }).map(rowList -> rowList.toArray(new Comparable[0])); + }).collect(Collectors.toList()); + 
allStats.addAll(readCommits); + reader.close(); + } + TableHeader header = new TableHeader().addTableHeaderField("action") + .addTableHeaderField("instant") + .addTableHeaderField("partition") + .addTableHeaderField("file_id") + .addTableHeaderField("prev_instant") + .addTableHeaderField("num_writes") + .addTableHeaderField("num_inserts") + .addTableHeaderField("num_deletes") + .addTableHeaderField("num_update_writes") + .addTableHeaderField("total_log_files") + .addTableHeaderField("total_log_blocks") + .addTableHeaderField("total_corrupt_log_blocks") + .addTableHeaderField("total_rollback_blocks") + .addTableHeaderField("total_log_records") + .addTableHeaderField("total_updated_records_compacted") + .addTableHeaderField("total_write_bytes") + .addTableHeaderField("total_write_errors"); + + return HoodiePrintHelper.print(header, new HashMap<>(), sortByField, descending, limit, headerOnly, allStats); + } + @CliCommand(value = "show archived commits", help = "Read commits from archived files and show details") public String showCommits( @CliOption(key = {"skipMetadata"}, help = "Skip displaying commit metadata", unspecifiedDefaultValue = "true") diff --git a/hoodie-client/pom.xml b/hoodie-client/pom.xml index eccfa802c..c813ef24a 100644 --- a/hoodie-client/pom.xml +++ b/hoodie-client/pom.xml @@ -171,6 +171,13 @@ spark-sql_2.11 + + ${hive.groupid} + hive-exec + ${hive.version} + test + + org.apache.hbase hbase-client @@ -218,39 +225,4 @@ - - - hive12 - - - !hive11 - - - - - ${hive12.groupid} - hive-exec - ${hive12.version} - test - - - - - hive11 - - - hive11 - - - - - ${hive11.groupid} - hive-exec - ${hive11.version} - test - - - - - diff --git a/hoodie-client/src/main/java/com/uber/hoodie/metrics/HoodieMetrics.java b/hoodie-client/src/main/java/com/uber/hoodie/metrics/HoodieMetrics.java index 5adf45fe0..625e73313 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/metrics/HoodieMetrics.java +++ 
b/hoodie-client/src/main/java/com/uber/hoodie/metrics/HoodieMetrics.java @@ -16,8 +16,8 @@ package com.uber.hoodie.metrics; -import com.codahale.metrics.Gauge; -import com.codahale.metrics.MetricRegistry; +import static com.uber.hoodie.metrics.Metrics.registerGauge; + import com.codahale.metrics.Timer; import com.google.common.annotations.VisibleForTesting; import com.uber.hoodie.common.model.HoodieCommitMetadata; @@ -177,18 +177,6 @@ public class HoodieMetrics { return config == null ? null : String.format("%s.%s.%s", tableName, action, metric); } - void registerGauge(String metricName, final long value) { - try { - MetricRegistry registry = Metrics.getInstance().getRegistry(); - registry.register(metricName, (Gauge) () -> value); - } catch (Exception e) { - // Here we catch all exception, so the major upsert pipeline will not be affected if the - // metrics system - // has some issues. - logger.error("Failed to send metrics: ", e); - } - } - /** * By default, the timer context returns duration with nano seconds. Convert it to millisecond. */ diff --git a/hoodie-client/src/main/java/com/uber/hoodie/metrics/Metrics.java b/hoodie-client/src/main/java/com/uber/hoodie/metrics/Metrics.java index 924257493..e3fcc98c0 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/metrics/Metrics.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/metrics/Metrics.java @@ -16,17 +16,21 @@ package com.uber.hoodie.metrics; +import com.codahale.metrics.Gauge; import com.codahale.metrics.MetricRegistry; import com.google.common.io.Closeables; import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.exception.HoodieException; import java.io.Closeable; import org.apache.commons.configuration.ConfigurationException; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; /** * This is the main class of the metrics system. 
*/ public class Metrics { + private static Logger logger = LogManager.getLogger(Metrics.class); private static volatile boolean initialized = false; private static Metrics metrics = null; @@ -72,6 +76,18 @@ public class Metrics { initialized = true; } + public static void registerGauge(String metricName, final long value) { + try { + MetricRegistry registry = Metrics.getInstance().getRegistry(); + registry.register(metricName, (Gauge) () -> value); + } catch (Exception e) { + // Here we catch all exception, so the major upsert pipeline will not be affected if the + // metrics system + // has some issues. + logger.error("Failed to send metrics: ", e); + } + } + public MetricRegistry getRegistry() { return registry; } diff --git a/hoodie-client/src/test/java/com/uber/hoodie/common/HoodieTestDataGenerator.java b/hoodie-client/src/test/java/com/uber/hoodie/common/HoodieTestDataGenerator.java index 06d66aeb4..d1713c5e4 100644 --- a/hoodie-client/src/test/java/com/uber/hoodie/common/HoodieTestDataGenerator.java +++ b/hoodie-client/src/test/java/com/uber/hoodie/common/HoodieTestDataGenerator.java @@ -75,7 +75,9 @@ public class HoodieTestDataGenerator { + "{\"name\": \"end_lat\", \"type\": \"double\"}," + "{\"name\": \"end_lon\", \"type\": \"double\"}," + "{\"name\":\"fare\",\"type\": \"double\"}]}"; - public static Schema avroSchema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(TRIP_EXAMPLE_SCHEMA)); + public static Schema avroSchema = new Schema.Parser().parse(TRIP_EXAMPLE_SCHEMA); + public static Schema avroSchemaWithMetadataFields = HoodieAvroUtils.addMetadataFields(avroSchema); + private static Random rand = new Random(46474747); private List existingKeysList = new ArrayList<>(); @@ -100,7 +102,6 @@ public class HoodieTestDataGenerator { */ public static TestRawTripPayload generateRandomValue(HoodieKey key, String commitTime) throws IOException { GenericRecord rec = generateGenericRecord(key.getRecordKey(), "rider-" + commitTime, "driver-" + 
commitTime, 0.0); - HoodieAvroUtils.addCommitMetadataToRecord(rec, commitTime, "-1"); return new TestRawTripPayload(rec.toString(), key.getRecordKey(), key.getPartitionPath(), TRIP_EXAMPLE_SCHEMA); } diff --git a/hoodie-client/src/test/java/com/uber/hoodie/io/TestHoodieCompactor.java b/hoodie-client/src/test/java/com/uber/hoodie/io/TestHoodieCompactor.java index e1a6d66dc..ac171d4dc 100644 --- a/hoodie-client/src/test/java/com/uber/hoodie/io/TestHoodieCompactor.java +++ b/hoodie-client/src/test/java/com/uber/hoodie/io/TestHoodieCompactor.java @@ -160,7 +160,8 @@ public class TestHoodieCompactor { // Write them to corresponding avro logfiles HoodieTestUtils - .writeRecordsToLogFiles(fs, metaClient.getBasePath(), HoodieTestDataGenerator.avroSchema, updatedRecords); + .writeRecordsToLogFiles(fs, metaClient.getBasePath(), HoodieTestDataGenerator.avroSchemaWithMetadataFields, + updatedRecords); // Verify that all data file has one log file metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath); diff --git a/hoodie-client/src/test/java/com/uber/hoodie/metrics/TestHoodieMetrics.java b/hoodie-client/src/test/java/com/uber/hoodie/metrics/TestHoodieMetrics.java index cb1a43969..c562b308a 100644 --- a/hoodie-client/src/test/java/com/uber/hoodie/metrics/TestHoodieMetrics.java +++ b/hoodie-client/src/test/java/com/uber/hoodie/metrics/TestHoodieMetrics.java @@ -16,6 +16,7 @@ package com.uber.hoodie.metrics; +import static com.uber.hoodie.metrics.Metrics.registerGauge; import static org.junit.Assert.assertTrue; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; @@ -39,7 +40,7 @@ public class TestHoodieMetrics { @Test public void testRegisterGauge() { - metrics.registerGauge("metric1", 123L); + registerGauge("metric1", 123L); assertTrue(Metrics.getInstance().getRegistry().getGauges().get("metric1").getValue().toString().equals("123")); } } diff --git a/hoodie-client/src/test/java/com/uber/hoodie/table/TestMergeOnReadTable.java 
b/hoodie-client/src/test/java/com/uber/hoodie/table/TestMergeOnReadTable.java index f651b8ef1..93227c49c 100644 --- a/hoodie-client/src/test/java/com/uber/hoodie/table/TestMergeOnReadTable.java +++ b/hoodie-client/src/test/java/com/uber/hoodie/table/TestMergeOnReadTable.java @@ -671,8 +671,8 @@ public class TestMergeOnReadTable { // Write them to corresponding avro logfiles HoodieTestUtils - .writeRecordsToLogFiles(metaClient.getFs(), metaClient.getBasePath(), HoodieTestDataGenerator.avroSchema, - updatedRecords); + .writeRecordsToLogFiles(metaClient.getFs(), metaClient.getBasePath(), + HoodieTestDataGenerator.avroSchemaWithMetadataFields, updatedRecords); // Verify that all data file has one log file metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath); diff --git a/hoodie-common/src/main/avro/HoodieCommitMetadata.avsc b/hoodie-common/src/main/avro/HoodieCommitMetadata.avsc index 3f4473200..c46825cdc 100644 --- a/hoodie-common/src/main/avro/HoodieCommitMetadata.avsc +++ b/hoodie-common/src/main/avro/HoodieCommitMetadata.avsc @@ -15,47 +15,58 @@ "fields":[ { "name":"fileId", - "type":["null","string"] + "type":["null","string"], + "default" : null }, { "name":"path", - "type":["null","string"] + "type":["null","string"], + "default" : null }, { "name":"prevCommit", - "type":["null","string"] + "type":["null","string"], + "default" : null }, { "name":"numWrites", - "type":["null","long"] + "type":["null","long"], + "default" : null }, { "name":"numDeletes", - "type":["null","long"] + "type":["null","long"], + "default" : null }, { "name":"numUpdateWrites", - "type":["null","long"] + "type":["null","long"], + "default" : null }, { "name":"totalWriteBytes", - "type":["null","long"] + "type":["null","long"], + "default" : null }, { "name":"totalWriteErrors", - "type":["null","long"] + "type":["null","long"], + "default" : null }, { "name":"partitionPath", - "type":["null","string"] + "type":["null","string"], + "default" : null }, { 
"name":"totalLogRecords", - "type":["null","long"] + "type":["null","long"], + "default" : null }, { "name":"totalLogFiles", - "type":["null","long"] + "type":["null","long"], + "default" : null }, { "name":"totalUpdatedRecordsCompacted", @@ -69,15 +80,18 @@ }, { "name":"totalLogBlocks", - "type":["null","long"] + "type":["null","long"], + "default" : null }, { "name":"totalCorruptLogBlock", - "type":["null","long"] + "type":["null","long"], + "default" : null }, { "name":"totalRollbackBlocks", - "type":["null","long"] + "type":["null","long"], + "default" : null } ] } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieRecord.java b/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieRecord.java index cef6e60a2..857dcaaa9 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieRecord.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieRecord.java @@ -17,7 +17,9 @@ package com.uber.hoodie.common.model; import com.google.common.base.Objects; +import com.google.common.collect.ImmutableList; import java.io.Serializable; +import java.util.List; import java.util.Optional; /** @@ -31,6 +33,14 @@ public class HoodieRecord implements Serializable public static String PARTITION_PATH_METADATA_FIELD = "_hoodie_partition_path"; public static String FILENAME_METADATA_FIELD = "_hoodie_file_name"; + public static final List HOODIE_META_COLUMNS = + new ImmutableList.Builder().add(COMMIT_TIME_METADATA_FIELD) + .add(COMMIT_SEQNO_METADATA_FIELD) + .add(RECORD_KEY_METADATA_FIELD) + .add(PARTITION_PATH_METADATA_FIELD) + .add(FILENAME_METADATA_FIELD) + .build(); + /** * Identifies the record across the table */ diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/util/DFSPropertiesConfiguration.java b/hoodie-common/src/main/java/com/uber/hoodie/common/util/DFSPropertiesConfiguration.java index 758ba0fca..f8dad81b2 100644 --- 
a/hoodie-common/src/main/java/com/uber/hoodie/common/util/DFSPropertiesConfiguration.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/util/DFSPropertiesConfiguration.java @@ -73,6 +73,20 @@ public class DFSPropertiesConfiguration { } visitedFiles.add(file.getName()); BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(file))); + addProperties(reader); + } catch (IOException ioe) { + log.error("Error reading in properies from dfs", ioe); + throw new IllegalArgumentException("Cannot read properties from dfs", ioe); + } + } + + /** + * Add properties from input stream + * @param reader Buffered Reader + * @throws IOException + */ + public void addProperties(BufferedReader reader) throws IOException { + try { String line; while ((line = reader.readLine()) != null) { if (line.startsWith("#") || line.equals("") || !line.contains("=")) { @@ -85,10 +99,8 @@ public class DFSPropertiesConfiguration { props.setProperty(split[0], split[1]); } } + } finally { reader.close(); - } catch (IOException ioe) { - log.error("Error reading in properies from dfs", ioe); - throw new IllegalArgumentException("Cannot read properties from dfs", ioe); } } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/util/HoodieAvroUtils.java b/hoodie-common/src/main/java/com/uber/hoodie/common/util/HoodieAvroUtils.java index ae92b00f9..b76b4aeb1 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/util/HoodieAvroUtils.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/util/HoodieAvroUtils.java @@ -37,8 +37,8 @@ import org.apache.avro.generic.GenericData; import org.apache.avro.generic.GenericDatumReader; import org.apache.avro.generic.GenericDatumWriter; import org.apache.avro.generic.GenericRecord; +import org.apache.avro.io.BinaryDecoder; import org.apache.avro.io.BinaryEncoder; -import org.apache.avro.io.Decoder; import org.apache.avro.io.DecoderFactory; import org.apache.avro.io.EncoderFactory; import org.codehaus.jackson.JsonNode; 
@@ -48,6 +48,10 @@ import org.codehaus.jackson.JsonNode; */ public class HoodieAvroUtils { + private static ThreadLocal reuseEncoder = ThreadLocal.withInitial(() -> null); + + private static ThreadLocal reuseDecoder = ThreadLocal.withInitial(() -> null); + // All metadata fields are optional strings. private static final Schema METADATA_FIELD_SCHEMA = Schema.createUnion(Arrays.asList( Schema.create(Schema.Type.NULL), @@ -62,7 +66,8 @@ public class HoodieAvroUtils { GenericDatumWriter writer = new GenericDatumWriter<>(record.getSchema()); ByteArrayOutputStream out = new ByteArrayOutputStream(); - BinaryEncoder encoder = EncoderFactory.get().binaryEncoder(out, null); + BinaryEncoder encoder = EncoderFactory.get().binaryEncoder(out, reuseEncoder.get()); + reuseEncoder.set(encoder); writer.write(record, encoder); encoder.flush(); out.close(); @@ -73,7 +78,8 @@ public class HoodieAvroUtils { * Convert serialized bytes back into avro record */ public static GenericRecord bytesToAvro(byte[] bytes, Schema schema) throws IOException { - Decoder decoder = DecoderFactory.get().binaryDecoder(bytes, null); + BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(bytes, reuseDecoder.get()); + reuseDecoder.set(decoder); GenericDatumReader reader = new GenericDatumReader(schema); return reader.read(null, decoder); } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/util/TypedProperties.java b/hoodie-common/src/main/java/com/uber/hoodie/common/util/TypedProperties.java index 5674d2382..5acca156a 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/util/TypedProperties.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/util/TypedProperties.java @@ -19,7 +19,10 @@ package com.uber.hoodie.common.util; import java.io.Serializable; +import java.util.Arrays; +import java.util.List; import java.util.Properties; +import java.util.stream.Collectors; /** * Type-aware extension of {@link java.util.Properties} @@ -49,6 +52,13 @@ public class 
TypedProperties extends Properties implements Serializable { return containsKey(property) ? getProperty(property) : defaultValue; } + public List getStringList(String property, String delimiter, List defaultVal) { + if (!containsKey(property)) { + return defaultVal; + } + return Arrays.stream(getProperty(property).split(delimiter)).map(String::trim).collect(Collectors.toList()); + } + public int getInteger(String property) { checkKey(property); return Integer.valueOf(getProperty(property)); diff --git a/hoodie-common/src/test/java/com/uber/hoodie/common/util/collection/TestDiskBasedMap.java b/hoodie-common/src/test/java/com/uber/hoodie/common/util/collection/TestDiskBasedMap.java new file mode 100644 index 000000000..9a288af82 --- /dev/null +++ b/hoodie-common/src/test/java/com/uber/hoodie/common/util/collection/TestDiskBasedMap.java @@ -0,0 +1,209 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.uber.hoodie.common.util.collection; + +import static com.uber.hoodie.common.util.SchemaTestUtil.getSimpleSchema; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import com.uber.hoodie.common.model.AvroBinaryTestPayload; +import com.uber.hoodie.common.model.HoodieAvroPayload; +import com.uber.hoodie.common.model.HoodieKey; +import com.uber.hoodie.common.model.HoodieRecord; +import com.uber.hoodie.common.model.HoodieRecordPayload; +import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline; +import com.uber.hoodie.common.util.HoodieAvroUtils; +import com.uber.hoodie.common.util.HoodieRecordSizeEstimator; +import com.uber.hoodie.common.util.SchemaTestUtil; +import com.uber.hoodie.common.util.SpillableMapTestUtils; +import com.uber.hoodie.common.util.SpillableMapUtils; +import java.io.IOException; +import java.io.UncheckedIOException; +import java.net.URISyntaxException; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Optional; +import java.util.Set; +import java.util.UUID; +import java.util.stream.Collectors; +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; +import org.junit.Ignore; +import org.junit.Test; + +public class TestDiskBasedMap { + + private static final String BASE_OUTPUT_PATH = "/tmp/"; + + @Test + public void testSimpleInsert() throws IOException, URISyntaxException { + Schema schema = HoodieAvroUtils.addMetadataFields(getSimpleSchema()); + String payloadClazz = HoodieAvroPayload.class.getName(); + + DiskBasedMap records = new DiskBasedMap<>(BASE_OUTPUT_PATH); + List iRecords = SchemaTestUtil.generateHoodieTestRecords(0, 100); + ((GenericRecord) iRecords.get(0)).get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString(); + List recordKeys = SpillableMapTestUtils.upsertRecords(iRecords, records); + + // make sure records have spilled 
to disk + assertTrue(records.sizeOfFileOnDiskInBytes() > 0); + Iterator> itr = records.iterator(); + List oRecords = new ArrayList<>(); + while (itr.hasNext()) { + HoodieRecord rec = itr.next(); + oRecords.add(rec); + assert recordKeys.contains(rec.getRecordKey()); + } + } + + @Test + public void testSimpleInsertWithoutHoodieMetadata() throws IOException, URISyntaxException { + Schema schema = getSimpleSchema(); + String payloadClazz = HoodieAvroPayload.class.getName(); + + DiskBasedMap records = new DiskBasedMap<>(BASE_OUTPUT_PATH); + List hoodieRecords = SchemaTestUtil + .generateHoodieTestRecordsWithoutHoodieMetadata(0, 1000); + Set recordKeys = new HashSet<>(); + // insert generated records into the map + hoodieRecords.stream().forEach(r -> { + records.put(r.getRecordKey(), r); + recordKeys.add(r.getRecordKey()); + }); + // make sure records have spilled to disk + assertTrue(records.sizeOfFileOnDiskInBytes() > 0); + Iterator> itr = records.iterator(); + List oRecords = new ArrayList<>(); + while (itr.hasNext()) { + HoodieRecord rec = itr.next(); + oRecords.add(rec); + assert recordKeys.contains(rec.getRecordKey()); + } + } + + @Test + public void testSimpleUpsert() throws IOException, URISyntaxException { + + Schema schema = HoodieAvroUtils.addMetadataFields(getSimpleSchema()); + String payloadClazz = HoodieAvroPayload.class.getName(); + + DiskBasedMap records = new DiskBasedMap<>(BASE_OUTPUT_PATH); + List iRecords = SchemaTestUtil.generateHoodieTestRecords(0, 100); + + // perform some inserts + List recordKeys = SpillableMapTestUtils.upsertRecords(iRecords, records); + + long fileSize = records.sizeOfFileOnDiskInBytes(); + // make sure records have spilled to disk + assertTrue(fileSize > 0); + + // generate updates from inserts + List updatedRecords = + SchemaTestUtil + .updateHoodieTestRecords(recordKeys, SchemaTestUtil.generateHoodieTestRecords(0, 100), + HoodieActiveTimeline.createNewCommitTime()); + String newCommitTime = ((GenericRecord) 
updatedRecords.get(0)) + .get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString(); + + // perform upserts + recordKeys = SpillableMapTestUtils.upsertRecords(updatedRecords, records); + + // upserts should be appended to the existing file, hence increasing the sizeOfFile on disk + assertTrue(records.sizeOfFileOnDiskInBytes() > fileSize); + + // Upserted records (on disk) should have the latest commit time + Iterator> itr = records.iterator(); + while (itr.hasNext()) { + HoodieRecord rec = itr.next(); + assert recordKeys.contains(rec.getRecordKey()); + try { + IndexedRecord indexedRecord = (IndexedRecord) rec.getData().getInsertValue(schema).get(); + String latestCommitTime = ((GenericRecord) indexedRecord) + .get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString(); + assertEquals(latestCommitTime, newCommitTime); + } catch (IOException io) { + throw new UncheckedIOException(io); + } + } + } + + @Test + public void testSizeEstimator() throws IOException, URISyntaxException { + Schema schema = SchemaTestUtil.getSimpleSchema(); + + // Test sizeEstimator without hoodie metadata fields + List hoodieRecords = SchemaTestUtil.generateHoodieTestRecords(0, 1, schema); + + long payloadSize = SpillableMapUtils.computePayloadSize(hoodieRecords.remove(0), + new HoodieRecordSizeEstimator(schema)); + assertTrue(payloadSize > 0); + + // Test sizeEstimator with hoodie metadata fields + schema = HoodieAvroUtils.addMetadataFields(schema); + hoodieRecords = SchemaTestUtil.generateHoodieTestRecords(0, 1, schema); + payloadSize = SpillableMapUtils.computePayloadSize(hoodieRecords.remove(0), + new HoodieRecordSizeEstimator(schema)); + assertTrue(payloadSize > 0); + + // Following tests payloads without an Avro Schema in the Record + + // Test sizeEstimator without hoodie metadata fields and without schema object in the payload + schema = SchemaTestUtil.getSimpleSchema(); + List indexedRecords = SchemaTestUtil.generateHoodieTestRecords(0, 1); + hoodieRecords = indexedRecords.stream() + 
.map(r -> new HoodieRecord(new HoodieKey(UUID.randomUUID().toString(), "0000/00/00"), + new AvroBinaryTestPayload(Optional.of((GenericRecord) r)))).collect(Collectors.toList()); + payloadSize = SpillableMapUtils.computePayloadSize(hoodieRecords.remove(0), + new HoodieRecordSizeEstimator(schema)); + assertTrue(payloadSize > 0); + + // Test sizeEstimator with hoodie metadata fields and without schema object in the payload + final Schema simpleSchemaWithMetadata = HoodieAvroUtils + .addMetadataFields(SchemaTestUtil.getSimpleSchema()); + indexedRecords = SchemaTestUtil.generateHoodieTestRecords(0, 1); + hoodieRecords = indexedRecords.stream() + .map(r -> new HoodieRecord(new HoodieKey(UUID.randomUUID().toString(), "0000/00/00"), + new AvroBinaryTestPayload(Optional + .of(HoodieAvroUtils.rewriteRecord((GenericRecord) r, simpleSchemaWithMetadata))))) + .collect(Collectors.toList()); + payloadSize = SpillableMapUtils.computePayloadSize(hoodieRecords.remove(0), + new HoodieRecordSizeEstimator(schema)); + assertTrue(payloadSize > 0); + } + + /** + * @na: Leaving this test here for a quick performance test + */ + @Ignore + @Test + public void testSizeEstimatorPerformance() throws IOException, URISyntaxException { + // Test sizeEstimatorPerformance with simpleSchema + Schema schema = SchemaTestUtil.getSimpleSchema(); + List hoodieRecords = SchemaTestUtil.generateHoodieTestRecords(0, 1, schema); + HoodieRecordSizeEstimator sizeEstimator = + new HoodieRecordSizeEstimator(schema); + HoodieRecord record = hoodieRecords.remove(0); + long startTime = System.currentTimeMillis(); + SpillableMapUtils.computePayloadSize(record, sizeEstimator); + long timeTaken = System.currentTimeMillis() - startTime; + System.out.println("Time taken :" + timeTaken); + assertTrue(timeTaken < 100); + } +} diff --git a/hoodie-hadoop-mr/pom.xml b/hoodie-hadoop-mr/pom.xml index fe1df4396..6ef41a32d 100644 --- a/hoodie-hadoop-mr/pom.xml +++ b/hoodie-hadoop-mr/pom.xml @@ -60,6 +60,22 @@ org.apache.hadoop 
hadoop-hdfs + + ${hive.groupid} + hive-jdbc + ${hive.version} + + + commons-logging + commons-logging + + + + + ${hive.groupid} + hive-exec + ${hive.version} + commons-logging commons-logging @@ -105,58 +121,4 @@ - - - hive12 - - - !hive11 - - - - - ${hive12.groupid} - hive-jdbc - ${hive12.version} - - - commons-logging - commons-logging - - - - - ${hive12.groupid} - hive-exec - ${hive12.version} - - - - - hive11 - - - hive11 - - - - - ${hive11.groupid} - hive-jdbc - ${hive11.version} - - - commons-logging - commons-logging - - - - - ${hive11.groupid} - hive-exec - ${hive11.version} - - - - diff --git a/hoodie-hive/pom.xml b/hoodie-hive/pom.xml index f74e5ea83..da4e9a0d7 100644 --- a/hoodie-hive/pom.xml +++ b/hoodie-hive/pom.xml @@ -99,6 +99,27 @@ junit junit + + ${hive.groupid} + hive-service + ${hive.version} + + + ${hive.groupid} + hive-jdbc + ${hive.version} + + + ${hive.groupid} + hive-metastore + ${hive.version} + + + ${hive.groupid} + hive-common + ${hive.version} + + org.apache.hadoop hadoop-hdfs @@ -175,67 +196,4 @@ - - - - hive12 - - - !hive11 - - - - - ${hive12.groupid} - hive-service - ${hive12.version} - - - ${hive12.groupid} - hive-jdbc - ${hive12.version} - - - ${hive12.groupid} - hive-metastore - ${hive12.version} - - - ${hive12.groupid} - hive-common - ${hive12.version} - - - - - hive11 - - - hive11 - - - - - org.apache.hive - hive-service - ${hive11.version} - - - org.apache.hive - hive-jdbc - ${hive11.version} - - - org.apache.hive - hive-metastore - ${hive11.version} - - - org.apache.hive - hive-common - ${hive11.version} - - - - diff --git a/hoodie-hive/src/main/java/com/uber/hoodie/hive/HoodieHiveClient.java b/hoodie-hive/src/main/java/com/uber/hoodie/hive/HoodieHiveClient.java index e3f9cb0fe..d93939198 100644 --- a/hoodie-hive/src/main/java/com/uber/hoodie/hive/HoodieHiveClient.java +++ b/hoodie-hive/src/main/java/com/uber/hoodie/hive/HoodieHiveClient.java @@ -90,7 +90,7 @@ public class HoodieHiveClient { private Connection connection; 
private HoodieTimeline activeTimeline; - HoodieHiveClient(HiveSyncConfig cfg, HiveConf configuration, FileSystem fs) { + public HoodieHiveClient(HiveSyncConfig cfg, HiveConf configuration, FileSystem fs) { this.syncConfig = cfg; this.fs = fs; this.metaClient = new HoodieTableMetaClient(fs.getConf(), cfg.basePath, true); @@ -231,7 +231,7 @@ public class HoodieHiveClient { /** * Scan table partitions */ - List scanTablePartitions() throws TException { + public List scanTablePartitions() throws TException { return client.listPartitions(syncConfig.databaseName, syncConfig.tableName, (short) -1); } @@ -268,7 +268,7 @@ public class HoodieHiveClient { /** * Get the table schema */ - Map getTableSchema() { + public Map getTableSchema() { if (!doesTableExist()) { throw new IllegalArgumentException( "Failed to get schema for table " + syncConfig.tableName + " does not exist"); @@ -435,7 +435,7 @@ public class HoodieHiveClient { /** * @return true if the configured table exists */ - boolean doesTableExist() { + public boolean doesTableExist() { try { return client.tableExists(syncConfig.databaseName, syncConfig.tableName); } catch (TException e) { @@ -449,7 +449,7 @@ public class HoodieHiveClient { * * @param s SQL to execute */ - void updateHiveSQL(String s) { + public void updateHiveSQL(String s) { Statement stmt = null; try { stmt = connection.createStatement(); @@ -468,8 +468,10 @@ public class HoodieHiveClient { BasicDataSource ds = new HiveDataSource(); ds.setDriverClassName(HiveDriver.class.getCanonicalName()); ds.setUrl(getHiveJdbcUrlWithDefaultDBName()); - ds.setUsername(syncConfig.hiveUser); - ds.setPassword(syncConfig.hivePass); + if (syncConfig.hiveUser != null) { + ds.setUsername(syncConfig.hiveUser); + ds.setPassword(syncConfig.hivePass); + } LOG.info("Getting Hive Connection from Datasource " + ds); try { this.connection = ds.getConnection(); @@ -520,7 +522,7 @@ public class HoodieHiveClient { return fs; } - Optional getLastCommitTimeSynced() { + public 
Optional getLastCommitTimeSynced() { // Get the last commit time from the TBLproperties try { Table database = client.getTable(syncConfig.databaseName, syncConfig.tableName); @@ -532,7 +534,7 @@ public class HoodieHiveClient { } } - void close() { + public void close() { try { if (connection != null) { connection.close(); @@ -548,7 +550,7 @@ public class HoodieHiveClient { @SuppressWarnings("OptionalUsedAsFieldOrParameterType") List getPartitionsWrittenToSince(Optional lastCommitTimeSynced) { if (!lastCommitTimeSynced.isPresent()) { - LOG.info("Last commit time synced is not known, listing all partitions"); + LOG.info("Last commit time synced is not known, listing all partitions in " + syncConfig.basePath + ",FS :" + fs); try { return FSUtils.getAllPartitionPaths(fs, syncConfig.basePath, syncConfig.assumeDatePartitioning); @@ -573,6 +575,10 @@ public class HoodieHiveClient { } } + List getAllTables(String db) throws Exception { + return client.getAllTables(db); + } + void updateLastCommitTimeSynced() { // Set the last commit time from the TBLproperties String lastCommitSynced = activeTimeline.lastInstant().get().getTimestamp(); diff --git a/hoodie-hive/src/main/java/com/uber/hoodie/hive/MultiPartKeysValueExtractor.java b/hoodie-hive/src/main/java/com/uber/hoodie/hive/MultiPartKeysValueExtractor.java index 00475e1e9..b0bcc59f4 100644 --- a/hoodie-hive/src/main/java/com/uber/hoodie/hive/MultiPartKeysValueExtractor.java +++ b/hoodie-hive/src/main/java/com/uber/hoodie/hive/MultiPartKeysValueExtractor.java @@ -16,8 +16,10 @@ package com.uber.hoodie.hive; +import com.google.common.base.Preconditions; import java.util.Arrays; import java.util.List; +import java.util.stream.Collectors; /** * Partition Key extractor treating each value delimited by slash as separate key. 
@@ -27,6 +29,14 @@ public class MultiPartKeysValueExtractor implements PartitionValueExtractor { @Override public List extractPartitionValuesInPath(String partitionPath) { String[] splits = partitionPath.split("/"); - return Arrays.asList(splits); + return Arrays.stream(splits).map(s -> { + if (s.contains("=")) { + String[] moreSplit = s.split("="); + Preconditions.checkArgument(moreSplit.length == 2, + "Partition Field (" + s + ") not in expected format"); + return moreSplit[1]; + } + return s; + }).collect(Collectors.toList()); } } \ No newline at end of file diff --git a/hoodie-hive/src/main/java/com/uber/hoodie/hive/PartitionValueExtractor.java b/hoodie-hive/src/main/java/com/uber/hoodie/hive/PartitionValueExtractor.java index 73776c461..b32f7cf0c 100644 --- a/hoodie-hive/src/main/java/com/uber/hoodie/hive/PartitionValueExtractor.java +++ b/hoodie-hive/src/main/java/com/uber/hoodie/hive/PartitionValueExtractor.java @@ -18,6 +18,7 @@ package com.uber.hoodie.hive; +import java.io.Serializable; import java.util.List; /** @@ -28,7 +29,7 @@ import java.util.List; * e.g. 
Hive table partitioned by datestr=yyyy-mm-dd and hdfs path * /app/hoodie/dataset1/YYYY=[yyyy]/MM=[mm]/DD=[dd] */ -public interface PartitionValueExtractor { +public interface PartitionValueExtractor extends Serializable { List extractPartitionValuesInPath(String partitionPath); } diff --git a/hoodie-hive/src/main/java/com/uber/hoodie/hive/SlashEncodedDayPartitionValueExtractor.java b/hoodie-hive/src/main/java/com/uber/hoodie/hive/SlashEncodedDayPartitionValueExtractor.java index 893b61e4c..771f2771f 100644 --- a/hoodie-hive/src/main/java/com/uber/hoodie/hive/SlashEncodedDayPartitionValueExtractor.java +++ b/hoodie-hive/src/main/java/com/uber/hoodie/hive/SlashEncodedDayPartitionValueExtractor.java @@ -33,12 +33,19 @@ import org.joda.time.format.DateTimeFormatter; */ public class SlashEncodedDayPartitionValueExtractor implements PartitionValueExtractor { - private final DateTimeFormatter dtfOut; + private transient DateTimeFormatter dtfOut; public SlashEncodedDayPartitionValueExtractor() { this.dtfOut = DateTimeFormat.forPattern("yyyy-MM-dd"); } + private DateTimeFormatter getDtfOut() { + if (dtfOut == null) { + dtfOut = DateTimeFormat.forPattern("yyyy-MM-dd"); + } + return dtfOut; + } + @Override public List extractPartitionValuesInPath(String partitionPath) { // partition path is expected to be in this format yyyy/mm/dd @@ -52,6 +59,6 @@ public class SlashEncodedDayPartitionValueExtractor implements PartitionValueExt int mm = Integer.parseInt(splits[1]); int dd = Integer.parseInt(splits[2]); DateTime dateTime = new DateTime(year, mm, dd, 0, 0); - return Lists.newArrayList(dtfOut.print(dateTime)); + return Lists.newArrayList(getDtfOut().print(dateTime)); } } diff --git a/hoodie-spark/pom.xml b/hoodie-spark/pom.xml index c4869f3b2..d800c1a72 100644 --- a/hoodie-spark/pom.xml +++ b/hoodie-spark/pom.xml @@ -221,6 +221,30 @@ commons-configuration2 + + ${hive.groupid} + hive-service + ${hive.version} + + + + ${hive.groupid} + hive-jdbc + ${hive.version} + + + + 
${hive.groupid} + hive-metastore + ${hive.version} + + + + ${hive.groupid} + hive-common + ${hive.version} + + com.uber.hoodie hoodie-client @@ -264,67 +288,4 @@ test - - - hive12 - - - !hive11 - - - - - ${hive12.groupid} - hive-service - ${hive12.version} - - - ${hive12.groupid} - hive-jdbc - ${hive12.version} - - - ${hive12.groupid} - hive-metastore - ${hive12.version} - - - ${hive12.groupid} - hive-common - ${hive12.version} - - - - - hive11 - - - hive11 - - - - - ${hive11.groupid} - hive-service - ${hive11.version} - - - ${hive11.groupid} - hive-jdbc - ${hive11.version} - - - ${hive11.groupid} - hive-metastore - ${hive11.version} - - - ${hive11.groupid} - hive-common - ${hive11.version} - - - - - diff --git a/hoodie-spark/src/main/java/com/uber/hoodie/DataSourceUtils.java b/hoodie-spark/src/main/java/com/uber/hoodie/DataSourceUtils.java index b02c36675..429c43953 100644 --- a/hoodie-spark/src/main/java/com/uber/hoodie/DataSourceUtils.java +++ b/hoodie-spark/src/main/java/com/uber/hoodie/DataSourceUtils.java @@ -29,8 +29,13 @@ import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.exception.DatasetNotFoundException; import com.uber.hoodie.exception.HoodieException; import com.uber.hoodie.exception.HoodieNotSupportedException; +import com.uber.hoodie.hive.HiveSyncConfig; +import com.uber.hoodie.hive.PartitionValueExtractor; +import com.uber.hoodie.hive.SlashEncodedDayPartitionValueExtractor; import com.uber.hoodie.index.HoodieIndex; import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import java.util.Map; import java.util.stream.Collectors; @@ -86,6 +91,17 @@ public class DataSourceUtils { } } + /** + * Create a partition value extractor class via reflection, passing in any configs needed + */ + public static PartitionValueExtractor createPartitionExtractor(String partitionExtractorClass) { + try { + return (PartitionValueExtractor) ReflectionUtils.loadClass(partitionExtractorClass); + } catch 
(Throwable e) { + throw new HoodieException("Could not load partition extractor class " + partitionExtractorClass, e); + } + } + /** * Create a payload class via reflection, passing in an ordering/precombine value. */ @@ -169,4 +185,28 @@ public class DataSourceUtils { .withProps(parameters).build(); return dropDuplicates(jssc, incomingHoodieRecords, writeConfig); } + + public static HiveSyncConfig buildHiveSyncConfig(TypedProperties props, String basePath) { + checkRequiredProperties(props, Arrays.asList(DataSourceWriteOptions.HIVE_TABLE_OPT_KEY())); + HiveSyncConfig hiveSyncConfig = new HiveSyncConfig(); + hiveSyncConfig.basePath = basePath; + hiveSyncConfig.assumeDatePartitioning = + props.getBoolean(DataSourceWriteOptions.HIVE_ASSUME_DATE_PARTITION_OPT_KEY(), + Boolean.valueOf(DataSourceWriteOptions.DEFAULT_HIVE_ASSUME_DATE_PARTITION_OPT_VAL())); + hiveSyncConfig.databaseName = props.getString(DataSourceWriteOptions.HIVE_DATABASE_OPT_KEY(), + DataSourceWriteOptions.DEFAULT_HIVE_DATABASE_OPT_VAL()); + hiveSyncConfig.tableName = props.getString(DataSourceWriteOptions.HIVE_TABLE_OPT_KEY()); + hiveSyncConfig.hiveUser = props.getString(DataSourceWriteOptions.HIVE_USER_OPT_KEY(), + DataSourceWriteOptions.DEFAULT_HIVE_USER_OPT_VAL()); + hiveSyncConfig.hivePass = props.getString(DataSourceWriteOptions.HIVE_PASS_OPT_KEY(), + DataSourceWriteOptions.DEFAULT_HIVE_PASS_OPT_VAL()); + hiveSyncConfig.jdbcUrl = props.getString(DataSourceWriteOptions.HIVE_URL_OPT_KEY(), + DataSourceWriteOptions.DEFAULT_HIVE_URL_OPT_VAL()); + hiveSyncConfig.partitionFields = + props.getStringList(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY(), ",", new ArrayList<>()); + hiveSyncConfig.partitionValueExtractorClass = + props.getString(DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY(), + SlashEncodedDayPartitionValueExtractor.class.getName()); + return hiveSyncConfig; + } } diff --git a/hoodie-spark/src/main/scala/com/uber/hoodie/AvroConversionUtils.scala 
b/hoodie-spark/src/main/scala/com/uber/hoodie/AvroConversionUtils.scala index 75d13e7cf..df3f96438 100644 --- a/hoodie-spark/src/main/scala/com/uber/hoodie/AvroConversionUtils.scala +++ b/hoodie-spark/src/main/scala/com/uber/hoodie/AvroConversionUtils.scala @@ -22,13 +22,18 @@ import java.sql.{Date, Timestamp} import java.util import com.databricks.spark.avro.SchemaConverters -import org.apache.avro.generic.GenericData.Record -import org.apache.avro.generic.GenericRecord +import com.databricks.spark.avro.SchemaConverters.IncompatibleSchemaException +import org.apache.avro.Schema.Type._ +import org.apache.avro.generic.GenericData.{Fixed, Record} +import org.apache.avro.generic.{GenericData, GenericRecord} import org.apache.avro.{Schema, SchemaBuilder} import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.encoders.RowEncoder +import org.apache.spark.sql.catalyst.expressions.GenericRow import org.apache.spark.sql.types._ -import org.apache.spark.sql.{DataFrame, Row} +import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} + +import scala.collection.JavaConverters._ object AvroConversionUtils { @@ -46,6 +51,22 @@ object AvroConversionUtils { } } + def createDataFrame(rdd: RDD[GenericRecord], schemaStr: String, ss : SparkSession): Dataset[Row] = { + if (rdd.isEmpty()) { + ss.emptyDataFrame + } else { + ss.createDataFrame(rdd.mapPartitions { records => + if (records.isEmpty) Iterator.empty + else { + val schema = Schema.parse(schemaStr) + val dataType = convertAvroSchemaToStructType(schema) + val convertor = createConverterToRow(schema, dataType) + records.map { x => convertor(x).asInstanceOf[Row] } + } + }, convertAvroSchemaToStructType(Schema.parse(schemaStr))).asInstanceOf[Dataset[Row]] + } + } + def getNewRecordNamespace(elementDataType: DataType, currentRecordNamespace: String, elementName: String): String = { @@ -56,6 +77,185 @@ object AvroConversionUtils { } } + /** + * NOTE : This part of code is copied from 
com.databricks.spark.avro.SchemaConverters.scala (133:310) (spark-avro) + * + * Returns a converter function to convert row in avro format to GenericRow of catalyst. + * + * @param sourceAvroSchema Source schema before conversion inferred from avro file by passed in + * by user. + * @param targetSqlType Target catalyst sql type after the conversion. + * @return returns a converter function to convert row in avro format to GenericRow of catalyst. + */ + def createConverterToRow(sourceAvroSchema: Schema, + targetSqlType: DataType): AnyRef => AnyRef = { + + def createConverter(avroSchema: Schema, + sqlType: DataType, path: List[String]): AnyRef => AnyRef = { + val avroType = avroSchema.getType + (sqlType, avroType) match { + // Avro strings are in Utf8, so we have to call toString on them + case (StringType, STRING) | (StringType, ENUM) => + (item: AnyRef) => if (item == null) null else item.toString + // Byte arrays are reused by avro, so we have to make a copy of them. + case (IntegerType, INT) | (BooleanType, BOOLEAN) | (DoubleType, DOUBLE) | + (FloatType, FLOAT) | (LongType, LONG) => + identity + case (BinaryType, FIXED) => + (item: AnyRef) => + if (item == null) { + null + } else { + item.asInstanceOf[Fixed].bytes().clone() + } + case (BinaryType, BYTES) => + (item: AnyRef) => + if (item == null) { + null + } else { + val byteBuffer = item.asInstanceOf[ByteBuffer] + val bytes = new Array[Byte](byteBuffer.remaining) + byteBuffer.get(bytes) + bytes + } + + case (struct: StructType, RECORD) => + val length = struct.fields.length + val converters = new Array[AnyRef => AnyRef](length) + val avroFieldIndexes = new Array[Int](length) + var i = 0 + while (i < length) { + val sqlField = struct.fields(i) + val avroField = avroSchema.getField(sqlField.name) + if (avroField != null) { + val converter = createConverter(avroField.schema(), sqlField.dataType, + path :+ sqlField.name) + converters(i) = converter + avroFieldIndexes(i) = avroField.pos() + } else if 
(!sqlField.nullable) { + throw new IncompatibleSchemaException( + s"Cannot find non-nullable field ${sqlField.name} at path ${path.mkString(".")} " + + "in Avro schema\n" + + s"Source Avro schema: $sourceAvroSchema.\n" + + s"Target Catalyst type: $targetSqlType") + } + i += 1 + } + + (item: AnyRef) => { + if (item == null) { + null + } else { + val record = item.asInstanceOf[GenericRecord] + + val result = new Array[Any](length) + var i = 0 + while (i < converters.length) { + if (converters(i) != null) { + val converter = converters(i) + result(i) = converter(record.get(avroFieldIndexes(i))) + } + i += 1 + } + new GenericRow(result) + } + } + case (arrayType: ArrayType, ARRAY) => + val elementConverter = createConverter(avroSchema.getElementType, arrayType.elementType, + path) + val allowsNull = arrayType.containsNull + (item: AnyRef) => { + if (item == null) { + null + } else { + item.asInstanceOf[java.lang.Iterable[AnyRef]].asScala.map { element => + if (element == null && !allowsNull) { + throw new RuntimeException(s"Array value at path ${path.mkString(".")} is not " + + "allowed to be null") + } else { + elementConverter(element) + } + } + } + } + case (mapType: MapType, MAP) if mapType.keyType == StringType => + val valueConverter = createConverter(avroSchema.getValueType, mapType.valueType, path) + val allowsNull = mapType.valueContainsNull + (item: AnyRef) => { + if (item == null) { + null + } else { + item.asInstanceOf[java.util.Map[AnyRef, AnyRef]].asScala.map { x => + if (x._2 == null && !allowsNull) { + throw new RuntimeException(s"Map value at path ${path.mkString(".")} is not " + + "allowed to be null") + } else { + (x._1.toString, valueConverter(x._2)) + } + }.toMap + } + } + case (sqlType, UNION) => + if (avroSchema.getTypes.asScala.exists(_.getType == NULL)) { + val remainingUnionTypes = avroSchema.getTypes.asScala.filterNot(_.getType == NULL) + if (remainingUnionTypes.size == 1) { + createConverter(remainingUnionTypes.head, sqlType, path) + } else 
{ + createConverter(Schema.createUnion(remainingUnionTypes.asJava), sqlType, path) + } + } else avroSchema.getTypes.asScala.map(_.getType) match { + case Seq(t1) => createConverter(avroSchema.getTypes.get(0), sqlType, path) + case Seq(a, b) if Set(a, b) == Set(INT, LONG) && sqlType == LongType => + (item: AnyRef) => { + item match { + case null => null + case l: java.lang.Long => l + case i: java.lang.Integer => new java.lang.Long(i.longValue()) + } + } + case Seq(a, b) if Set(a, b) == Set(FLOAT, DOUBLE) && sqlType == DoubleType => + (item: AnyRef) => { + item match { + case null => null + case d: java.lang.Double => d + case f: java.lang.Float => new java.lang.Double(f.doubleValue()) + } + } + case other => + sqlType match { + case t: StructType if t.fields.length == avroSchema.getTypes.size => + val fieldConverters = t.fields.zip(avroSchema.getTypes.asScala).map { + case (field, schema) => + createConverter(schema, field.dataType, path :+ field.name) + } + + (item: AnyRef) => if (item == null) { + null + } else { + val i = GenericData.get().resolveUnion(avroSchema, item) + val converted = new Array[Any](fieldConverters.length) + converted(i) = fieldConverters(i)(item) + new GenericRow(converted) + } + case _ => throw new IncompatibleSchemaException( + s"Cannot convert Avro schema to catalyst type because schema at path " + + s"${path.mkString(".")} is not compatible " + + s"(avroType = $other, sqlType = $sqlType). \n" + + s"Source Avro schema: $sourceAvroSchema.\n" + + s"Target Catalyst type: $targetSqlType") + } + } + case (left, right) => + throw new IncompatibleSchemaException( + s"Cannot convert Avro schema to catalyst type because schema at path " + + s"${path.mkString(".")} is not compatible (avroType = $left, sqlType = $right). 
\n" + + s"Source Avro schema: $sourceAvroSchema.\n" + + s"Target Catalyst type: $targetSqlType") + } + } + createConverter(sourceAvroSchema, targetSqlType, List.empty[String]) + } + def createConverterToAvro(dataType: DataType, structName: String, recordNamespace: String): Any => Any = { diff --git a/hoodie-spark/src/main/scala/com/uber/hoodie/DataSourceOptions.scala b/hoodie-spark/src/main/scala/com/uber/hoodie/DataSourceOptions.scala index 5df7118bd..9973e4bce 100644 --- a/hoodie-spark/src/main/scala/com/uber/hoodie/DataSourceOptions.scala +++ b/hoodie-spark/src/main/scala/com/uber/hoodie/DataSourceOptions.scala @@ -43,7 +43,7 @@ object DataSourceReadOptions { val VIEW_TYPE_INCREMENTAL_OPT_VAL = "incremental" val VIEW_TYPE_REALTIME_OPT_VAL = "realtime" val DEFAULT_VIEW_TYPE_OPT_VAL = VIEW_TYPE_READ_OPTIMIZED_OPT_VAL - + val DEFAULTPUSH_DOWN_FILTERS_OPT_VAL = "" /** * Instant time to start incrementally pulling data from. The instanttime here need not @@ -64,6 +64,13 @@ object DataSourceReadOptions { * */ val END_INSTANTTIME_OPT_KEY = "hoodie.datasource.read.end.instanttime" + + /** + * For use-cases like DeltaStreamer which reads from Hoodie Incremental table and applies opaque map functions, + * filters appearing late in the sequence of transformations cannot be automatically pushed down. 
+ * This option allows setting filters directly on Hoodie Source + */ + val PUSH_DOWN_INCR_FILTERS_OPT_KEY = "hoodie.datasource.read.incr.filters" } /** diff --git a/hoodie-spark/src/main/scala/com/uber/hoodie/IncrementalRelation.scala b/hoodie-spark/src/main/scala/com/uber/hoodie/IncrementalRelation.scala index 1f7b47682..0f13c9348 100644 --- a/hoodie-spark/src/main/scala/com/uber/hoodie/IncrementalRelation.scala +++ b/hoodie-spark/src/main/scala/com/uber/hoodie/IncrementalRelation.scala @@ -64,21 +64,33 @@ class IncrementalRelation(val sqlContext: SQLContext, throw new HoodieException(s"Specify the begin instant time to pull from using " + s"option ${DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY}") } + + val lastInstant = commitTimeline.lastInstant().get() + val commitsToReturn = commitTimeline.findInstantsInRange( optParams(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY), - optParams.getOrElse(DataSourceReadOptions.END_INSTANTTIME_OPT_KEY, - commitTimeline.lastInstant().get().getTimestamp)) + optParams.getOrElse(DataSourceReadOptions.END_INSTANTTIME_OPT_KEY, lastInstant.getTimestamp)) .getInstants.iterator().toList // use schema from a file produced in the latest instant val latestSchema = { + // use last instant if instant range is empty + val instant = commitsToReturn.lastOption.getOrElse(lastInstant) val latestMeta = HoodieCommitMetadata - .fromBytes(commitTimeline.getInstantDetails(commitsToReturn.last).get, classOf[HoodieCommitMetadata]) + .fromBytes(commitTimeline.getInstantDetails(instant).get, classOf[HoodieCommitMetadata]) val metaFilePath = latestMeta.getFileIdAndFullPaths(basePath).values().iterator().next() AvroConversionUtils.convertAvroSchemaToStructType(ParquetUtils.readAvroSchema( sqlContext.sparkContext.hadoopConfiguration, new Path(metaFilePath))) } + val filters = { + if (optParams.contains(DataSourceReadOptions.PUSH_DOWN_INCR_FILTERS_OPT_KEY)) { + val filterStr = 
optParams.get(DataSourceReadOptions.PUSH_DOWN_INCR_FILTERS_OPT_KEY).getOrElse("") + filterStr.split(",").filter(!_.isEmpty) + } + Array[String]() + } + override def schema: StructType = latestSchema override def buildScan(): RDD[Row] = { @@ -92,12 +104,17 @@ class IncrementalRelation(val sqlContext: SQLContext, // will filter out all the files incorrectly. sqlContext.sparkContext.hadoopConfiguration.unset("mapreduce.input.pathFilter.class") val sOpts = optParams.filter(p => !p._1.equalsIgnoreCase("path")) - sqlContext.read.options(sOpts) - .schema(latestSchema) // avoid AnalysisException for empty input - .parquet(fileIdToFullPath.values.toList: _*) - .filter(String.format("%s >= '%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitsToReturn.head.getTimestamp)) - .filter(String.format("%s <= '%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitsToReturn.last.getTimestamp)) - .toDF().rdd - + if (fileIdToFullPath.isEmpty) { + sqlContext.sparkContext.emptyRDD[Row] + } else { + log.info("Additional Filters to be applied to incremental source are :" + filters) + filters.foldLeft(sqlContext.read.options(sOpts) + .schema(latestSchema) + .parquet(fileIdToFullPath.values.toList: _*) + .filter(String.format("%s >= '%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitsToReturn.head.getTimestamp)) + .filter(String.format("%s <= '%s'", + HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitsToReturn.last.getTimestamp)))((e, f) => e.filter(f)) + .toDF().rdd + } } } diff --git a/hoodie-spark/src/test/scala/DataSourceTest.scala b/hoodie-spark/src/test/scala/DataSourceTest.scala index 2f34beb4e..42def5bf1 100644 --- a/hoodie-spark/src/test/scala/DataSourceTest.scala +++ b/hoodie-spark/src/test/scala/DataSourceTest.scala @@ -100,7 +100,6 @@ class DataSourceTest extends AssertionsForJUnit { .load(basePath + "/*/*/*/*"); assertEquals(100, hoodieROViewDF2.count()) // still 100, since we only updated - // Read Incremental View // we have 2 commits, try pulling the first commit (which is 
not the latest) val firstCommit = HoodieDataSourceHelpers.listCommitsSince(fs, basePath, "000").get(0); diff --git a/hoodie-utilities/pom.xml b/hoodie-utilities/pom.xml index 8a9feac7f..3cbf468d7 100644 --- a/hoodie-utilities/pom.xml +++ b/hoodie-utilities/pom.xml @@ -53,9 +53,10 @@ ${project.build.directory}/dependency-reduced-pom.xml - true + commons-dbcp:commons-dbcp + commons-pool:commons-pool com.uber.hoodie:hoodie-common com.uber.hoodie:hoodie-client com.uber.hoodie:hoodie-spark @@ -76,8 +77,50 @@ com.yammer.metrics:metrics-core com.101tec:zkclient org.apache.kafka:kafka-clients + org.apache.hive:hive-common + org.apache.hive:hive-service + org.apache.hive:hive-metastore + org.apache.hive:hive-jdbc + + + org.apache.commons.dbcp. + com.uber.hoodie.org.apache.commons.dbcp. + + + org.apache.commons.pool. + com.uber.hoodie.org.apache.commons.pool. + + + org.apache.hive.jdbc. + com.uber.hoodie.org.apache.hive.jdbc. + + + org.apache.hadoop.hive.metastore. + com.uber.hoodie.org.apache.hadoop_hive.metastore. + + + org.apache.hive.common. + com.uber.hoodie.org.apache.hive.common. + + + org.apache.hadoop.hive.common. + com.uber.hoodie.org.apache.hadoop_hive.common. + + + org.apache.hadoop.hive.conf. + com.uber.hoodie.org.apache.hadoop_hive.conf. + + + org.apache.hive.service. + com.uber.hoodie.org.apache.hive.service. + + + org.apache.hadoop.hive.service. + com.uber.hoodie.org.apache.hadoop_hive.service. 
+ + @@ -123,6 +166,15 @@ test + + com.uber.hoodie + hoodie-hive + ${project.version} + tests + test-jar + test + + com.uber.hoodie hoodie-spark @@ -154,6 +206,30 @@ + + ${hive.groupid} + hive-exec + ${hive.version} + test + + + + ${hive.groupid} + hive-jdbc + ${hive.version} + standalone + + + org.slf4j + slf4j-api + + + javax.servlet + servlet-api + + + + com.uber.hoodie hoodie-hive @@ -185,6 +261,11 @@ commons-dbcp commons-dbcp + + commons-pool + commons-pool + + org.apache.httpcomponents httpcore @@ -303,59 +384,4 @@ - - - - hive12 - - - !hive11 - - - - - ${hive12.groupid} - hive-jdbc - ${hive12.version} - standalone - - - org.slf4j - slf4j-api - - - javax.servlet - servlet-api - - - - - - - hive11 - - - hive11 - - - - - org.apache.hive - hive-jdbc - ${hive11.version} - standalone - - - org.slf4j - slf4j-api - - - javax.servlet - servlet-api - - - - - - diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/HoodieCleaner.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/HoodieCleaner.java new file mode 100644 index 000000000..7ebca042c --- /dev/null +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/HoodieCleaner.java @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2017 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.uber.hoodie.utilities; + +import com.beust.jcommander.JCommander; +import com.beust.jcommander.Parameter; +import com.uber.hoodie.HoodieWriteClient; +import com.uber.hoodie.common.util.FSUtils; +import com.uber.hoodie.common.util.TypedProperties; +import com.uber.hoodie.config.HoodieWriteConfig; +import com.uber.hoodie.utilities.deltastreamer.HoodieDeltaStreamer; +import java.io.IOException; +import java.io.Serializable; +import java.util.ArrayList; +import java.util.List; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.spark.api.java.JavaSparkContext; + +public class HoodieCleaner { + + private static volatile Logger log = LogManager.getLogger(HoodieDeltaStreamer.class); + + /** + * Config for Cleaner + */ + private final Config cfg; + + /** + * Filesystem used + */ + private transient FileSystem fs; + + /** + * Spark context + */ + private transient JavaSparkContext jssc; + + /** + * Bag of properties with source, hoodie client, key generator etc. 
+ */ + TypedProperties props; + + public HoodieCleaner(Config cfg, JavaSparkContext jssc) throws IOException { + this.cfg = cfg; + this.jssc = jssc; + this.fs = FSUtils.getFs(cfg.basePath, jssc.hadoopConfiguration()); + + this.props = UtilHelpers.readConfig(fs, new Path(cfg.propsFilePath), cfg.configs).getConfig(); + log.info("Creating Cleaner with configs : " + props.toString()); + } + + public void run() throws Exception { + HoodieWriteConfig hoodieCfg = getHoodieClientConfig(); + HoodieWriteClient client = new HoodieWriteClient<>(jssc, hoodieCfg, false); + client.clean(); + } + + private HoodieWriteConfig getHoodieClientConfig() throws Exception { + return HoodieWriteConfig.newBuilder().combineInput(true, true).withPath(cfg.basePath) + .withAutoCommit(false) + .withProps(props).build(); + } + + public static class Config implements Serializable { + + @Parameter(names = {"--target-base-path"}, description = "base path for the hoodie dataset to be cleaner.", + required = true) + public String basePath; + + @Parameter(names = {"--props"}, description = "path to properties file on localfs or dfs, with configurations for " + + "hoodie client for cleaning") + public String propsFilePath = + "file://" + System.getProperty("user.dir") + "/src/test/resources/delta-streamer-config/dfs-source.properties"; + + @Parameter(names = {"--hoodie-conf"}, description = "Any configuration that can be set in the properties file " + + "(using the CLI parameter \"--propsFilePath\") can also be passed command line using this parameter") + public List configs = new ArrayList<>(); + + @Parameter(names = {"--spark-master"}, description = "spark master to use.") + public String sparkMaster = "local[2]"; + + @Parameter(names = {"--help", "-h"}, help = true) + public Boolean help = false; + } + + public static void main(String[] args) throws Exception { + final Config cfg = new Config(); + JCommander cmd = new JCommander(cfg, args); + if (cfg.help || args.length == 0) { + cmd.usage(); + 
System.exit(1); + } + + String dirName = new Path(cfg.basePath).getName(); + JavaSparkContext jssc = UtilHelpers.buildSparkContext("hoodie-cleaner-" + dirName, cfg.sparkMaster); + new HoodieCleaner(cfg, jssc).run(); + } +} diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/UtilHelpers.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/UtilHelpers.java index b5008cf31..b24e3a277 100644 --- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/UtilHelpers.java +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/UtilHelpers.java @@ -30,9 +30,13 @@ import com.uber.hoodie.exception.HoodieException; import com.uber.hoodie.index.HoodieIndex; import com.uber.hoodie.utilities.schema.SchemaProvider; import com.uber.hoodie.utilities.sources.Source; +import com.uber.hoodie.utilities.transform.Transformer; +import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; +import java.io.StringReader; import java.nio.ByteBuffer; +import java.util.List; import java.util.Optional; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileSystem; @@ -43,6 +47,7 @@ import org.apache.spark.Accumulator; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.SparkSession; /** * Bunch of helper methods @@ -51,12 +56,12 @@ public class UtilHelpers { private static Logger logger = LogManager.getLogger(UtilHelpers.class); public static Source createSource(String sourceClass, TypedProperties cfg, - JavaSparkContext jssc, SchemaProvider schemaProvider) + JavaSparkContext jssc, SparkSession sparkSession, SchemaProvider schemaProvider) throws IOException { try { return (Source) ReflectionUtils.loadClass(sourceClass, - new Class[]{TypedProperties.class, JavaSparkContext.class, SchemaProvider.class}, - cfg, jssc, schemaProvider); + new Class[]{TypedProperties.class, JavaSparkContext.class, 
SparkSession.class, SchemaProvider.class}, + cfg, jssc, sparkSession, schemaProvider); } catch (Throwable e) { throw new IOException("Could not load source class " + sourceClass, e); } @@ -65,17 +70,31 @@ public class UtilHelpers { public static SchemaProvider createSchemaProvider(String schemaProviderClass, TypedProperties cfg, JavaSparkContext jssc) throws IOException { try { - return (SchemaProvider) ReflectionUtils.loadClass(schemaProviderClass, cfg, jssc); + return schemaProviderClass == null ? null : + (SchemaProvider) ReflectionUtils.loadClass(schemaProviderClass, cfg, jssc); } catch (Throwable e) { throw new IOException("Could not load schema provider class " + schemaProviderClass, e); } } + public static Transformer createTransformer(String transformerClass) throws IOException { + try { + return transformerClass == null ? null : (Transformer) ReflectionUtils.loadClass(transformerClass); + } catch (Throwable e) { + throw new IOException("Could not load transformer class " + transformerClass, e); + } + } + /** */ - public static DFSPropertiesConfiguration readConfig(FileSystem fs, Path cfgPath) { + public static DFSPropertiesConfiguration readConfig(FileSystem fs, Path cfgPath, List overriddenProps) { try { - return new DFSPropertiesConfiguration(fs, cfgPath); + DFSPropertiesConfiguration conf = new DFSPropertiesConfiguration(fs, cfgPath); + if (!overriddenProps.isEmpty()) { + logger.info("Adding overridden properties to file properties."); + conf.addProperties(new BufferedReader(new StringReader(String.join("\n", overriddenProps)))); + } + return conf; } catch (Exception e) { throw new HoodieException("Unable to read props file at :" + cfgPath, e); } @@ -109,7 +128,7 @@ public class UtilHelpers { sparkConf.set("spark.eventLog.overwrite", "true"); sparkConf.set("spark.eventLog.enabled", "true"); } - sparkConf.set("spark.driver.maxResultSize", "2g"); + sparkConf.setIfMissing("spark.driver.maxResultSize", "2g"); sparkConf.set("spark.serializer", 
"org.apache.spark.serializer.KryoSerializer"); sparkConf.set("spark.hadoop.mapred.output.compress", "true"); sparkConf.set("spark.hadoop.mapred.output.compression.codec", "true"); diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/deltastreamer/HoodieDeltaStreamer.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/deltastreamer/HoodieDeltaStreamer.java index c3573fdeb..665fcff98 100644 --- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/deltastreamer/HoodieDeltaStreamer.java +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/deltastreamer/HoodieDeltaStreamer.java @@ -18,10 +18,15 @@ package com.uber.hoodie.utilities.deltastreamer; +import static com.uber.hoodie.utilities.schema.RowBasedSchemaProvider.HOODIE_RECORD_NAMESPACE; +import static com.uber.hoodie.utilities.schema.RowBasedSchemaProvider.HOODIE_RECORD_STRUCT_NAME; + import com.beust.jcommander.IStringConverter; import com.beust.jcommander.JCommander; import com.beust.jcommander.Parameter; import com.beust.jcommander.ParameterException; +import com.codahale.metrics.Timer; +import com.uber.hoodie.AvroConversionUtils; import com.uber.hoodie.DataSourceUtils; import com.uber.hoodie.HoodieWriteClient; import com.uber.hoodie.KeyGenerator; @@ -36,32 +41,40 @@ import com.uber.hoodie.common.table.HoodieTimeline; import com.uber.hoodie.common.table.timeline.HoodieInstant; import com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.common.util.TypedProperties; -import com.uber.hoodie.common.util.collection.Pair; import com.uber.hoodie.config.HoodieCompactionConfig; import com.uber.hoodie.config.HoodieIndexConfig; import com.uber.hoodie.config.HoodieWriteConfig; +import com.uber.hoodie.hive.HiveSyncConfig; +import com.uber.hoodie.hive.HiveSyncTool; import com.uber.hoodie.index.HoodieIndex; import com.uber.hoodie.utilities.HiveIncrementalPuller; import com.uber.hoodie.utilities.UtilHelpers; import 
com.uber.hoodie.utilities.exception.HoodieDeltaStreamerException; -import com.uber.hoodie.utilities.schema.FilebasedSchemaProvider; +import com.uber.hoodie.utilities.schema.RowBasedSchemaProvider; import com.uber.hoodie.utilities.schema.SchemaProvider; +import com.uber.hoodie.utilities.sources.InputBatch; import com.uber.hoodie.utilities.sources.JsonDFSSource; -import com.uber.hoodie.utilities.sources.Source; +import com.uber.hoodie.utilities.transform.Transformer; import java.io.IOException; import java.io.Serializable; +import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.List; import java.util.Optional; import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.conf.HiveConf; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SparkSession; import scala.collection.JavaConversions; /** @@ -81,7 +94,7 @@ public class HoodieDeltaStreamer implements Serializable { /** * Source to pull deltas from */ - private transient Source source; + private transient SourceFormatAdapter formatAdapter; /** * Schema provider that supplies the command for reading the input and writing out the target @@ -89,6 +102,11 @@ public class HoodieDeltaStreamer implements Serializable { */ private transient SchemaProvider schemaProvider; + /** + * Allows transforming source to target dataset before writing + */ + private transient Transformer transformer; + /** * Extract the key for the target dataset */ @@ -109,16 +127,30 @@ public class HoodieDeltaStreamer implements Serializable { */ private transient JavaSparkContext jssc; + /** + * Spark Session + */ + private transient 
SparkSession sparkSession; + + /** + * Hive Config + */ + private transient HiveConf hiveConf; /** * Bag of properties with source, hoodie client, key generator etc. */ TypedProperties props; - public HoodieDeltaStreamer(Config cfg, JavaSparkContext jssc) throws IOException { + this(cfg, jssc, FSUtils.getFs(cfg.targetBasePath, jssc.hadoopConfiguration()), + getDefaultHiveConf(jssc.hadoopConfiguration())); + } + + public HoodieDeltaStreamer(Config cfg, JavaSparkContext jssc, FileSystem fs, HiveConf hiveConf) throws IOException { this.cfg = cfg; this.jssc = jssc; + this.sparkSession = SparkSession.builder().config(jssc.getConf()).getOrCreate(); this.fs = FSUtils.getFs(cfg.targetBasePath, jssc.hadoopConfiguration()); if (fs.exists(new Path(cfg.targetBasePath))) { @@ -129,19 +161,28 @@ public class HoodieDeltaStreamer implements Serializable { this.commitTimelineOpt = Optional.empty(); } - this.props = UtilHelpers.readConfig(fs, new Path(cfg.propsFilePath)).getConfig(); + this.props = UtilHelpers.readConfig(fs, new Path(cfg.propsFilePath), cfg.configs).getConfig(); log.info("Creating delta streamer with configs : " + props.toString()); this.schemaProvider = UtilHelpers.createSchemaProvider(cfg.schemaProviderClassName, props, jssc); + this.transformer = UtilHelpers.createTransformer(cfg.transformerClassName); this.keyGenerator = DataSourceUtils.createKeyGenerator(cfg.keyGeneratorClass, props); - this.source = UtilHelpers.createSource(cfg.sourceClassName, props, jssc, schemaProvider); - // register the schemas, so that shuffle does not serialize the full schemas - List schemas = Arrays.asList(schemaProvider.getSourceSchema(), - schemaProvider.getTargetSchema()); - jssc.sc().getConf().registerAvroSchemas(JavaConversions.asScalaBuffer(schemas).toList()); + this.formatAdapter = + new SourceFormatAdapter(UtilHelpers.createSource(cfg.sourceClassName, props, jssc, sparkSession, + schemaProvider)); + + this.hiveConf = hiveConf; + } + + private static HiveConf 
getDefaultHiveConf(Configuration cfg) { + HiveConf hiveConf = new HiveConf(); + hiveConf.addResource(cfg); + return hiveConf; } public void sync() throws Exception { + HoodieDeltaStreamerMetrics metrics = new HoodieDeltaStreamerMetrics(getHoodieClientConfig(null)); + Timer.Context overallTimerContext = metrics.getOverallTimerContext(); // Retrieve the previous round checkpoints, if any Optional resumeCheckpointStr = Optional.empty(); if (commitTimelineOpt.isPresent()) { @@ -163,16 +204,42 @@ public class HoodieDeltaStreamer implements Serializable { } log.info("Checkpoint to resume from : " + resumeCheckpointStr); - // Pull the data from the source & prepare the write - Pair>, String> dataAndCheckpoint = source.fetchNewData( - resumeCheckpointStr, cfg.sourceLimit); + final Optional> avroRDDOptional; + final String checkpointStr; + final SchemaProvider schemaProvider; + if (transformer != null) { + // Transformation is needed. Fetch New rows in Row Format, apply transformation and then convert them + // to generic records for writing + InputBatch> dataAndCheckpoint = formatAdapter.fetchNewDataInRowFormat( + resumeCheckpointStr, cfg.sourceLimit); - if (!dataAndCheckpoint.getKey().isPresent()) { + Optional> transformed = + dataAndCheckpoint.getBatch().map(data -> transformer.apply(jssc, sparkSession, data, props)); + checkpointStr = dataAndCheckpoint.getCheckpointForNextBatch(); + avroRDDOptional = transformed.map(t -> + AvroConversionUtils.createRdd(t, HOODIE_RECORD_STRUCT_NAME, HOODIE_RECORD_NAMESPACE).toJavaRDD() + ); + // Use Transformed Row's schema if not overridden + schemaProvider = + this.schemaProvider == null ? 
transformed.map(r -> (SchemaProvider)new RowBasedSchemaProvider(r.schema())) + .orElse(dataAndCheckpoint.getSchemaProvider()) : this.schemaProvider; + } else { + // Pull the data from the source & prepare the write + InputBatch> dataAndCheckpoint = + formatAdapter.fetchNewDataInAvroFormat(resumeCheckpointStr, cfg.sourceLimit); + avroRDDOptional = dataAndCheckpoint.getBatch(); + checkpointStr = dataAndCheckpoint.getCheckpointForNextBatch(); + schemaProvider = dataAndCheckpoint.getSchemaProvider(); + } + + if ((!avroRDDOptional.isPresent()) || (avroRDDOptional.get().isEmpty())) { log.info("No new data, nothing to commit.. "); return; } - JavaRDD avroRDD = dataAndCheckpoint.getKey().get(); + registerAvroSchemas(schemaProvider); + + JavaRDD avroRDD = avroRDDOptional.get(); JavaRDD records = avroRDD.map(gr -> { HoodieRecordPayload payload = DataSourceUtils.createPayload(cfg.payloadClassName, gr, (Comparable) gr.get(cfg.sourceOrderingField)); @@ -180,20 +247,20 @@ public class HoodieDeltaStreamer implements Serializable { }); // filter dupes if needed - HoodieWriteConfig hoodieCfg = getHoodieClientConfig(); + HoodieWriteConfig hoodieCfg = getHoodieClientConfig(schemaProvider); if (cfg.filterDupes) { // turn upserts to insert cfg.operation = cfg.operation == Operation.UPSERT ? Operation.INSERT : cfg.operation; records = DataSourceUtils.dropDuplicates(jssc, records, hoodieCfg); - } - if (records.isEmpty()) { - log.info("No new data, nothing to commit.. "); - return; + if (records.isEmpty()) { + log.info("No new data, nothing to commit.. "); + return; + } } // Perform the write - HoodieWriteClient client = new HoodieWriteClient<>(jssc, hoodieCfg); + HoodieWriteClient client = new HoodieWriteClient<>(jssc, hoodieCfg, true); String commitTime = client.startCommit(); log.info("Starting commit : " + commitTime); @@ -210,7 +277,7 @@ public class HoodieDeltaStreamer implements Serializable { // Simply commit for now. 
TODO(vc): Support better error handlers later on HashMap checkpointCommitMetadata = new HashMap<>(); - checkpointCommitMetadata.put(CHECKPOINT_KEY, dataAndCheckpoint.getValue()); + checkpointCommitMetadata.put(CHECKPOINT_KEY, checkpointStr); boolean success = client.commit(commitTime, writeStatusRDD, Optional.of(checkpointCommitMetadata)); @@ -220,17 +287,54 @@ public class HoodieDeltaStreamer implements Serializable { } else { log.info("Commit " + commitTime + " failed!"); } + + // Sync to hive if enabled + Timer.Context hiveSyncContext = metrics.getHiveSyncTimerContext(); + syncHive(); + long hiveSyncTimeMs = hiveSyncContext != null ? hiveSyncContext.stop() : 0; + client.close(); + long overallTimeMs = overallTimerContext != null ? overallTimerContext.stop() : 0; + + // Send DeltaStreamer Metrics + metrics.updateDeltaStreamerMetrics(overallTimeMs, hiveSyncTimeMs); } - private HoodieWriteConfig getHoodieClientConfig() throws Exception { - return HoodieWriteConfig.newBuilder().combineInput(true, true).withPath(cfg.targetBasePath) - .withAutoCommit(false) - .withSchema(schemaProvider.getTargetSchema().toString()) - .withCompactionConfig(HoodieCompactionConfig.newBuilder().withPayloadClass(cfg.payloadClassName).build()) - .forTable(cfg.targetTableName) - .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()) - .withProps(props).build(); + public void syncHive() { + if (cfg.enableHiveSync) { + HiveSyncConfig hiveSyncConfig = DataSourceUtils.buildHiveSyncConfig(props, cfg.targetBasePath); + log.info("Syncing target hoodie table with hive table(" + hiveSyncConfig.tableName + + "). 
Hive metastore URL :" + hiveSyncConfig.jdbcUrl + ", basePath :" + cfg.targetBasePath); + + new HiveSyncTool(hiveSyncConfig, hiveConf, fs).syncHoodieTable(); + } + } + + /** + * Register Avro Schemas + * @param schemaProvider Schema Provider + */ + private void registerAvroSchemas(SchemaProvider schemaProvider) { + // register the schemas, so that shuffle does not serialize the full schemas + if (null != schemaProvider) { + List schemas = Arrays.asList(schemaProvider.getSourceSchema(), schemaProvider.getTargetSchema()); + log.info("Registering Schema :" + schemas); + jssc.sc().getConf().registerAvroSchemas(JavaConversions.asScalaBuffer(schemas).toList()); + } + } + + private HoodieWriteConfig getHoodieClientConfig(SchemaProvider schemaProvider) throws Exception { + HoodieWriteConfig.Builder builder = + HoodieWriteConfig.newBuilder().combineInput(true, true).withPath(cfg.targetBasePath) + .withAutoCommit(false) + .withCompactionConfig(HoodieCompactionConfig.newBuilder().withPayloadClass(cfg.payloadClassName).build()) + .forTable(cfg.targetTableName) + .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()) + .withProps(props); + if (null != schemaProvider) { + builder = builder.withSchema(schemaProvider.getTargetSchema().toString()); + } + return builder.build(); } public enum Operation { @@ -266,6 +370,10 @@ public class HoodieDeltaStreamer implements Serializable { public String propsFilePath = "file://" + System.getProperty("user.dir") + "/src/test/resources/delta-streamer-config/dfs-source.properties"; + @Parameter(names = {"--hoodie-conf"}, description = "Any configuration that can be set in the properties file " + + "(using the CLI parameter \"--propsFilePath\") can also be passed command line using this parameter") + public List configs = new ArrayList<>(); + @Parameter(names = {"--source-class"}, description = "Subclass of com.uber.hoodie.utilities.sources to read data. 
" + "Built-in options: com.uber.hoodie.utilities.sources.{JsonDFSSource (default), AvroDFSSource, " + "JsonKafkaSource, AvroKafkaSource, HiveIncrPullSource}") @@ -285,11 +393,22 @@ public class HoodieDeltaStreamer implements Serializable { public String payloadClassName = OverwriteWithLatestAvroPayload.class.getName(); @Parameter(names = {"--schemaprovider-class"}, description = "subclass of com.uber.hoodie.utilities.schema" - + ".SchemaProvider to attach schemas to input & target table data, built in options: FilebasedSchemaProvider") - public String schemaProviderClassName = FilebasedSchemaProvider.class.getName(); + + ".SchemaProvider to attach schemas to input & target table data, built in options: " + + "com.uber.hoodie.utilities.schema.FilebasedSchemaProvider." + + "Source (See com.uber.hoodie.utilities.sources.Source) implementation can implement their own SchemaProvider." + + " For Sources that return Dataset, the schema is obtained implicitly. " + + "However, this CLI option allows overriding the schemaprovider returned by Source.") + public String schemaProviderClassName = null; + + @Parameter(names = {"--transformer-class"}, + description = "subclass of com.uber.hoodie.utilities.transform.Transformer" + + ". Allows transforming raw source dataset to a target dataset (conforming to target schema) before writing." + + " Default : Not set. E:g - com.uber.hoodie.utilities.transform.SqlQueryBasedTransformer (which allows" + + "a SQL query templated to be passed as a transformation function)") + public String transformerClassName = null; @Parameter(names = {"--source-limit"}, description = "Maximum amount of data to read from source. 
" - + "Default: No limit For e.g: DFSSource => max bytes to read, KafkaSource => max events to read") + + "Default: No limit For e.g: DFS-Source => max bytes to read, Kafka-Source => max events to read") public long sourceLimit = Long.MAX_VALUE; @Parameter(names = {"--op"}, description = "Takes one of these values : UPSERT (default), INSERT (use when input " @@ -301,6 +420,9 @@ public class HoodieDeltaStreamer implements Serializable { + "before insert/bulk-insert") public Boolean filterDupes = false; + @Parameter(names = {"--enable-hive-sync"}, description = "Enable syncing to hive") + public Boolean enableHiveSync = false; + @Parameter(names = {"--spark-master"}, description = "spark master to use.") public String sparkMaster = "local[2]"; @@ -319,4 +441,44 @@ public class HoodieDeltaStreamer implements Serializable { JavaSparkContext jssc = UtilHelpers.buildSparkContext("delta-streamer-" + cfg.targetTableName, cfg.sparkMaster); new HoodieDeltaStreamer(cfg, jssc).sync(); } + + public SourceFormatAdapter getFormatAdapter() { + return formatAdapter; + } + + public SchemaProvider getSchemaProvider() { + return schemaProvider; + } + + public Transformer getTransformer() { + return transformer; + } + + public KeyGenerator getKeyGenerator() { + return keyGenerator; + } + + public FileSystem getFs() { + return fs; + } + + public Optional getCommitTimelineOpt() { + return commitTimelineOpt; + } + + public JavaSparkContext getJssc() { + return jssc; + } + + public SparkSession getSparkSession() { + return sparkSession; + } + + public HiveConf getHiveConf() { + return hiveConf; + } + + public TypedProperties getProps() { + return props; + } } diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/deltastreamer/HoodieDeltaStreamerMetrics.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/deltastreamer/HoodieDeltaStreamerMetrics.java new file mode 100644 index 000000000..2fc2f81a3 --- /dev/null +++ 
b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/deltastreamer/HoodieDeltaStreamerMetrics.java @@ -0,0 +1,61 @@ +package com.uber.hoodie.utilities.deltastreamer; + +import static com.uber.hoodie.metrics.Metrics.registerGauge; + +import com.codahale.metrics.Timer; +import com.uber.hoodie.config.HoodieWriteConfig; +import com.uber.hoodie.metrics.Metrics; + +public class HoodieDeltaStreamerMetrics { + + private HoodieWriteConfig config = null; + private String tableName = null; + + public String overallTimerName = null; + public String hiveSyncTimerName = null; + private Timer overallTimer = null; + public Timer hiveSyncTimer = null; + + public HoodieDeltaStreamerMetrics(HoodieWriteConfig config) { + this.config = config; + this.tableName = config.getTableName(); + if (config.isMetricsOn()) { + Metrics.init(config); + this.overallTimerName = getMetricsName("timer", "deltastreamer"); + this.hiveSyncTimerName = getMetricsName("timer", "deltastreamerHiveSync"); + } + } + + public Timer.Context getOverallTimerContext() { + if (config.isMetricsOn() && overallTimer == null) { + overallTimer = createTimer(overallTimerName); + } + return overallTimer == null ? null : overallTimer.time(); + } + + public Timer.Context getHiveSyncTimerContext() { + if (config.isMetricsOn() && hiveSyncTimer == null) { + hiveSyncTimer = createTimer(hiveSyncTimerName); + } + return hiveSyncTimer == null ? null : hiveSyncTimer.time(); + } + + private Timer createTimer(String name) { + return config.isMetricsOn() ? Metrics.getInstance().getRegistry().timer(name) : null; + } + + String getMetricsName(String action, String metric) { + return config == null ? 
null : String.format("%s.%s.%s", tableName, action, metric); + } + + public void updateDeltaStreamerMetrics(long durationInNs, long hiveSyncNs) { + if (config.isMetricsOn()) { + registerGauge(getMetricsName("deltastreamer", "duration"), getDurationInMs(durationInNs)); + registerGauge(getMetricsName("deltastreamer", "hiveSyncDuration"), getDurationInMs(hiveSyncNs)); + } + } + + public long getDurationInMs(long ctxDuration) { + return ctxDuration / 1000000; + } +} diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/deltastreamer/SourceFormatAdapter.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/deltastreamer/SourceFormatAdapter.java new file mode 100644 index 000000000..8214f260b --- /dev/null +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/deltastreamer/SourceFormatAdapter.java @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2018 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * + */ + +package com.uber.hoodie.utilities.deltastreamer; + +import static com.uber.hoodie.utilities.schema.RowBasedSchemaProvider.HOODIE_RECORD_NAMESPACE; +import static com.uber.hoodie.utilities.schema.RowBasedSchemaProvider.HOODIE_RECORD_STRUCT_NAME; + +import com.uber.hoodie.AvroConversionUtils; +import com.uber.hoodie.utilities.sources.AvroSource; +import com.uber.hoodie.utilities.sources.InputBatch; +import com.uber.hoodie.utilities.sources.JsonSource; +import com.uber.hoodie.utilities.sources.RowSource; +import com.uber.hoodie.utilities.sources.Source; +import com.uber.hoodie.utilities.sources.helpers.AvroConvertor; +import java.util.Optional; +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.types.StructType; + +/** + * Adapts data-format provided by the source to the data-format required by the client (DeltaStreamer) + */ +public final class SourceFormatAdapter { + + private final Source source; + + + public SourceFormatAdapter(Source source) { + this.source = source; + } + + /** + * Fetch new data in avro format. 
If the source provides data in different format, they are translated + * to Avro format + * @param lastCkptStr + * @param sourceLimit + * @return + */ + public InputBatch> fetchNewDataInAvroFormat(Optional lastCkptStr, + long sourceLimit) { + switch (source.getSourceType()) { + case AVRO: + return ((AvroSource)source).fetchNext(lastCkptStr, sourceLimit); + case JSON: { + InputBatch> r = ((JsonSource)source).fetchNext(lastCkptStr, sourceLimit); + AvroConvertor convertor = new AvroConvertor(r.getSchemaProvider().getSourceSchema()); + return new InputBatch<>(Optional.ofNullable( + r.getBatch().map(rdd -> rdd.map(convertor::fromJson)) + .orElse(null)), r.getCheckpointForNextBatch(), r.getSchemaProvider()); + } + case ROW: { + InputBatch> r = ((RowSource)source).fetchNext(lastCkptStr, sourceLimit); + return new InputBatch<>(Optional.ofNullable(r.getBatch().map( + rdd -> (AvroConversionUtils.createRdd(rdd, HOODIE_RECORD_STRUCT_NAME, HOODIE_RECORD_NAMESPACE).toJavaRDD())) + .orElse(null)), r.getCheckpointForNextBatch(), r.getSchemaProvider()); + } + default: + throw new IllegalArgumentException("Unknown source type (" + source.getSourceType() + ")"); + } + } + + /** + * Fetch new data in row format. 
If the source provides data in different format, they are translated + * to Row format + * @param lastCkptStr + * @param sourceLimit + * @return + */ + public InputBatch> fetchNewDataInRowFormat(Optional lastCkptStr, long sourceLimit) { + switch (source.getSourceType()) { + case ROW: + return ((RowSource)source).fetchNext(lastCkptStr, sourceLimit); + case AVRO: { + InputBatch> r = ((AvroSource)source).fetchNext(lastCkptStr, sourceLimit); + Schema sourceSchema = r.getSchemaProvider().getSourceSchema(); + return new InputBatch<>(Optional.ofNullable( + r.getBatch().map(rdd -> AvroConversionUtils.createDataFrame(JavaRDD.toRDD(rdd), + sourceSchema.toString(), source.getSparkSession())) + .orElse(null)), r.getCheckpointForNextBatch(), r.getSchemaProvider()); + } + case JSON: { + InputBatch> r = ((JsonSource)source).fetchNext(lastCkptStr, sourceLimit); + Schema sourceSchema = r.getSchemaProvider().getSourceSchema(); + StructType dataType = AvroConversionUtils.convertAvroSchemaToStructType(sourceSchema); + return new InputBatch<>(Optional.ofNullable( + r.getBatch().map(rdd -> source.getSparkSession().read().schema(dataType).json(rdd)) + .orElse(null)), r.getCheckpointForNextBatch(), r.getSchemaProvider()); + } + default: + throw new IllegalArgumentException("Unknown source type (" + source.getSourceType() + ")"); + } + } +} diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/schema/RowBasedSchemaProvider.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/schema/RowBasedSchemaProvider.java new file mode 100644 index 000000000..0a9d7c616 --- /dev/null +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/schema/RowBasedSchemaProvider.java @@ -0,0 +1,25 @@ +package com.uber.hoodie.utilities.schema; + +import com.uber.hoodie.AvroConversionUtils; +import org.apache.avro.Schema; +import org.apache.spark.sql.types.StructType; + +public class RowBasedSchemaProvider extends SchemaProvider { + + // Used in GenericRecord conversions + public 
static final String HOODIE_RECORD_NAMESPACE = "hoodie.source"; + public static final String HOODIE_RECORD_STRUCT_NAME = "hoodie_source"; + + private StructType rowStruct; + + public RowBasedSchemaProvider(StructType rowStruct) { + super(null, null); + this.rowStruct = rowStruct; + } + + @Override + public Schema getSourceSchema() { + return AvroConversionUtils.convertStructTypeToAvroSchema(rowStruct, HOODIE_RECORD_STRUCT_NAME, + HOODIE_RECORD_NAMESPACE); + } +} diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/schema/SchemaRegistryProvider.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/schema/SchemaRegistryProvider.java index 3312db5aa..84d8f94fe 100644 --- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/schema/SchemaRegistryProvider.java +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/schema/SchemaRegistryProvider.java @@ -42,12 +42,15 @@ public class SchemaRegistryProvider extends SchemaProvider { */ public static class Config { - private static final String SCHEMA_REGISTRY_URL_PROP = "hoodie.deltastreamer.schemaprovider.registry.url"; + private static final String SRC_SCHEMA_REGISTRY_URL_PROP = "hoodie.deltastreamer.schemaprovider.registry.url"; + private static final String TARGET_SCHEMA_REGISTRY_URL_PROP = + "hoodie.deltastreamer.schemaprovider.registry.targetUrl"; } private final Schema schema; + private final Schema targetSchema; - private String fetchSchemaFromRegistry(String registryUrl) throws IOException { + private static String fetchSchemaFromRegistry(String registryUrl) throws IOException { URL registry = new URL(registryUrl); ObjectMapper mapper = new ObjectMapper(); JsonNode node = mapper.readTree(registry.openStream()); @@ -56,17 +59,32 @@ public class SchemaRegistryProvider extends SchemaProvider { public SchemaRegistryProvider(TypedProperties props, JavaSparkContext jssc) { super(props, jssc); - DataSourceUtils.checkRequiredProperties(props, 
Collections.singletonList(Config.SCHEMA_REGISTRY_URL_PROP)); - String registryUrl = props.getString(Config.SCHEMA_REGISTRY_URL_PROP); + DataSourceUtils.checkRequiredProperties(props, Collections.singletonList(Config.SRC_SCHEMA_REGISTRY_URL_PROP)); + String registryUrl = props.getString(Config.SRC_SCHEMA_REGISTRY_URL_PROP); + String targetRegistryUrl = props.getString(Config.TARGET_SCHEMA_REGISTRY_URL_PROP, registryUrl); try { - this.schema = new Schema.Parser().parse(fetchSchemaFromRegistry(registryUrl)); + this.schema = getSchema(registryUrl); + if (!targetRegistryUrl.equals(registryUrl)) { + this.targetSchema = getSchema(targetRegistryUrl); + } else { + this.targetSchema = schema; + } } catch (IOException ioe) { throw new HoodieIOException("Error reading schema from registry :" + registryUrl, ioe); } } + private static Schema getSchema(String registryUrl) throws IOException { + return new Schema.Parser().parse(fetchSchemaFromRegistry(registryUrl)); + } + @Override public Schema getSourceSchema() { return schema; } + + @Override + public Schema getTargetSchema() { + return targetSchema; + } } diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/AvroDFSSource.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/AvroDFSSource.java index 335d06a9b..2f21f3253 100644 --- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/AvroDFSSource.java +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/AvroDFSSource.java @@ -19,7 +19,10 @@ package com.uber.hoodie.utilities.sources; import com.uber.hoodie.common.util.TypedProperties; +import com.uber.hoodie.common.util.collection.Pair; import com.uber.hoodie.utilities.schema.SchemaProvider; +import com.uber.hoodie.utilities.sources.helpers.DFSPathSelector; +import java.util.Optional; import org.apache.avro.generic.GenericRecord; import org.apache.avro.mapred.AvroKey; import org.apache.avro.mapreduce.AvroKeyInputFormat; @@ -27,18 +30,33 @@ import 
org.apache.hadoop.io.NullWritable; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.SparkSession; /** * DFS Source that reads avro data */ -public class AvroDFSSource extends DFSSource { +public class AvroDFSSource extends AvroSource { - public AvroDFSSource(TypedProperties props, JavaSparkContext sparkContext, SchemaProvider schemaProvider) { - super(props, sparkContext, schemaProvider); + private final DFSPathSelector pathSelector; + + public AvroDFSSource(TypedProperties props, JavaSparkContext sparkContext, SparkSession sparkSession, + SchemaProvider schemaProvider) { + super(props, sparkContext, sparkSession, schemaProvider); + this.pathSelector = new DFSPathSelector(props, sparkContext.hadoopConfiguration()); } @Override - protected JavaRDD fromFiles(AvroConvertor convertor, String pathStr) { + protected InputBatch> fetchNewData(Optional lastCkptStr, + long sourceLimit) { + Pair, String> selectPathsWithMaxModificationTime = + pathSelector.getNextFilePathsAndMaxModificationTime(lastCkptStr, sourceLimit); + return selectPathsWithMaxModificationTime.getLeft().map(pathStr -> new InputBatch<>( + Optional.of(fromFiles(pathStr)), + selectPathsWithMaxModificationTime.getRight())) + .orElseGet(() -> new InputBatch<>(Optional.empty(), selectPathsWithMaxModificationTime.getRight())); + } + + private JavaRDD fromFiles(String pathStr) { JavaPairRDD avroRDD = sparkContext.newAPIHadoopFile(pathStr, AvroKeyInputFormat.class, AvroKey.class, NullWritable.class, sparkContext.hadoopConfiguration()); diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/AvroKafkaSource.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/AvroKafkaSource.java index 4e1471413..f1e51e0ea 100644 --- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/AvroKafkaSource.java +++ 
b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/AvroKafkaSource.java @@ -20,27 +20,55 @@ package com.uber.hoodie.utilities.sources; import com.uber.hoodie.common.util.TypedProperties; import com.uber.hoodie.utilities.schema.SchemaProvider; +import com.uber.hoodie.utilities.sources.helpers.KafkaOffsetGen; +import com.uber.hoodie.utilities.sources.helpers.KafkaOffsetGen.CheckpointUtils; import io.confluent.kafka.serializers.KafkaAvroDecoder; +import java.util.Optional; import kafka.serializer.StringDecoder; import org.apache.avro.generic.GenericRecord; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.SparkSession; import org.apache.spark.streaming.kafka.KafkaUtils; import org.apache.spark.streaming.kafka.OffsetRange; /** * Reads avro serialized Kafka data, based on the confluent schema-registry */ -public class AvroKafkaSource extends KafkaSource { +public class AvroKafkaSource extends AvroSource { - public AvroKafkaSource(TypedProperties props, JavaSparkContext sparkContext, SchemaProvider schemaProvider) { - super(props, sparkContext, schemaProvider); + private static Logger log = LogManager.getLogger(AvroKafkaSource.class); + + private final KafkaOffsetGen offsetGen; + + public AvroKafkaSource(TypedProperties props, JavaSparkContext sparkContext, SparkSession sparkSession, + SchemaProvider schemaProvider) { + super(props, sparkContext, sparkSession, schemaProvider); + offsetGen = new KafkaOffsetGen(props); } @Override - protected JavaRDD toAvroRDD(OffsetRange[] offsetRanges, AvroConvertor avroConvertor) { - return KafkaUtils - .createRDD(sparkContext, String.class, Object.class, StringDecoder.class, KafkaAvroDecoder.class, kafkaParams, - offsetRanges).values().map(obj -> (GenericRecord) obj); + protected InputBatch> fetchNewData(Optional lastCheckpointStr, + long sourceLimit) { + OffsetRange[] 
offsetRanges = offsetGen.getNextOffsetRanges(lastCheckpointStr, sourceLimit); + long totalNewMsgs = CheckpointUtils.totalNewMessages(offsetRanges); + if (totalNewMsgs <= 0) { + return new InputBatch<>(Optional.empty(), + lastCheckpointStr.isPresent() ? lastCheckpointStr.get() : ""); + } else { + log.info("About to read " + totalNewMsgs + " from Kafka for topic :" + offsetGen.getTopicName()); + } + JavaRDD newDataRDD = toRDD(offsetRanges); + return new InputBatch<>(Optional.of(newDataRDD), + KafkaOffsetGen.CheckpointUtils.offsetsToStr(offsetRanges)); + } + + private JavaRDD toRDD(OffsetRange[] offsetRanges) { + JavaRDD recordRDD = KafkaUtils + .createRDD(sparkContext, String.class, Object.class, StringDecoder.class, KafkaAvroDecoder.class, + offsetGen.getKafkaParams(), offsetRanges).values().map(obj -> (GenericRecord) obj); + return recordRDD; } } diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/AvroSource.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/AvroSource.java new file mode 100644 index 000000000..ba767ad62 --- /dev/null +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/AvroSource.java @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2018 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * + */ + +package com.uber.hoodie.utilities.sources; + +import com.uber.hoodie.common.util.TypedProperties; +import com.uber.hoodie.utilities.schema.SchemaProvider; +import org.apache.avro.generic.GenericRecord; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.SparkSession; + +public abstract class AvroSource extends Source> { + + public AvroSource(TypedProperties props, + JavaSparkContext sparkContext, + SparkSession sparkSession, + SchemaProvider schemaProvider) { + super(props, sparkContext, sparkSession, schemaProvider, SourceType.AVRO); + } +} diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/HiveIncrPullSource.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/HiveIncrPullSource.java index 3d6af40fa..0f5e78746 100644 --- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/HiveIncrPullSource.java +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/HiveIncrPullSource.java @@ -21,8 +21,6 @@ package com.uber.hoodie.utilities.sources; import com.uber.hoodie.DataSourceUtils; import com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.common.util.TypedProperties; -import com.uber.hoodie.common.util.collection.ImmutablePair; -import com.uber.hoodie.common.util.collection.Pair; import com.uber.hoodie.exception.HoodieIOException; import com.uber.hoodie.utilities.schema.SchemaProvider; import java.io.IOException; @@ -44,19 +42,20 @@ import org.apache.log4j.Logger; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.SparkSession; /** - * Source to read deltas produced by {@link com.uber.hoodie.utilities.HiveIncrementalPuller}, commit - * by commit and apply to the target table + * Source to read deltas produced by {@link com.uber.hoodie.utilities.HiveIncrementalPuller}, commit by commit 
and apply + * to the target table *

* The general idea here is to have commits sync across the data pipeline. *

- * [Source Tables(s)] ====> HiveIncrementalScanner ==> incrPullRootPath ==> targetTable - * {c1,c2,c3,...} {c1,c2,c3,...} {c1,c2,c3,...} + * [Source Table(s)] ====> HiveIncrementalScanner ==> incrPullRootPath ==> targetTable {c1,c2,c3,...} + * {c1,c2,c3,...} {c1,c2,c3,...} *

* This produces beautiful causality, that makes data issues in ETLs very easy to debug */ -public class HiveIncrPullSource extends Source { +public class HiveIncrPullSource extends AvroSource { private static volatile Logger log = LogManager.getLogger(HiveIncrPullSource.class); @@ -73,9 +72,9 @@ public class HiveIncrPullSource extends Source { private static final String ROOT_INPUT_PATH_PROP = "hoodie.deltastreamer.source.incrpull.root"; } - public HiveIncrPullSource(TypedProperties props, JavaSparkContext sparkContext, + public HiveIncrPullSource(TypedProperties props, JavaSparkContext sparkContext, SparkSession sparkSession, SchemaProvider schemaProvider) { - super(props, sparkContext, schemaProvider); + super(props, sparkContext, sparkSession, schemaProvider); DataSourceUtils.checkRequiredProperties(props, Collections.singletonList(Config.ROOT_INPUT_PATH_PROP)); this.incrPullRootPath = props.getString(Config.ROOT_INPUT_PATH_PROP); this.fs = FSUtils.getFs(incrPullRootPath, sparkContext.hadoopConfiguration()); @@ -113,15 +112,15 @@ public class HiveIncrPullSource extends Source { } @Override - public Pair>, String> fetchNewData( + protected InputBatch> fetchNewData( Optional lastCheckpointStr, long sourceLimit) { try { // find the source commit to pull Optional commitToPull = findCommitToPull(lastCheckpointStr); if (!commitToPull.isPresent()) { - return new ImmutablePair<>(Optional.empty(), - lastCheckpointStr.orElse("")); + return new InputBatch<>(Optional.empty(), + lastCheckpointStr.isPresent() ? lastCheckpointStr.get() : ""); } // read the files out. 
@@ -132,7 +131,7 @@ public class HiveIncrPullSource extends Source { JavaPairRDD avroRDD = sparkContext.newAPIHadoopFile(pathStr, AvroKeyInputFormat.class, AvroKey.class, NullWritable.class, sparkContext.hadoopConfiguration()); - return new ImmutablePair<>(Optional.of(avroRDD.keys().map(r -> ((GenericRecord) r.datum()))), + return new InputBatch<>(Optional.of(avroRDD.keys().map(r -> ((GenericRecord) r.datum()))), String.valueOf(commitToPull.get())); } catch (IOException ioe) { throw new HoodieIOException( diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/HoodieIncrSource.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/HoodieIncrSource.java new file mode 100644 index 000000000..430eb1e1e --- /dev/null +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/HoodieIncrSource.java @@ -0,0 +1,144 @@ +package com.uber.hoodie.utilities.sources; + +import com.uber.hoodie.DataSourceReadOptions; +import com.uber.hoodie.DataSourceUtils; +import com.uber.hoodie.common.model.HoodieRecord; +import com.uber.hoodie.common.util.TypedProperties; +import com.uber.hoodie.common.util.collection.Pair; +import com.uber.hoodie.hive.SlashEncodedDayPartitionValueExtractor; +import com.uber.hoodie.utilities.schema.SchemaProvider; +import com.uber.hoodie.utilities.sources.helpers.IncrSourceHelper; +import java.util.Arrays; +import java.util.Optional; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.DataFrameReader; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SparkSession; + +public class HoodieIncrSource extends RowSource { + + /** + * Configs supported + */ + protected static class Config { + + /** + * {@value #HOODIE_SRC_BASE_PATH} is the base-path for the source Hoodie table + */ + private static final String HOODIE_SRC_BASE_PATH = "hoodie.deltastreamer.source.hoodieincr.path"; + + /** + * {@value #NUM_INSTANTS_PER_FETCH} allows the max 
number of instants whose changes can be incrementally fetched + */ + private static final String NUM_INSTANTS_PER_FETCH = "hoodie.deltastreamer.source.hoodieincr.num_instants"; + private static final Integer DEFAULT_NUM_INSTANTS_PER_FETCH = 1; + + /** + * {@value #HOODIE_SRC_PARTITION_FIELDS} specifies partition fields that needs to be added to source table after + * parsing _hoodie_partition_path + */ + private static final String HOODIE_SRC_PARTITION_FIELDS = "hoodie.deltastreamer.source.hoodieincr.partition.fields"; + + /** + * {@value #HOODIE_SRC_PARTITION_EXTRACTORCLASS} PartitionValueExtractor class to extract partition fields from + * _hoodie_partition_path + */ + private static final String HOODIE_SRC_PARTITION_EXTRACTORCLASS = + "hoodie.deltastreamer.source.hoodieincr.partition.extractor.class"; + private static final String DEFAULT_HOODIE_SRC_PARTITION_EXTRACTORCLASS = + SlashEncodedDayPartitionValueExtractor.class.getCanonicalName(); + + /** + * {@value #READ_LATEST_INSTANT_ON_MISSING_CKPT} allows delta-streamer to incrementally fetch from latest committed + * instant when checkpoint is not provided. 
+ */ + private static final String READ_LATEST_INSTANT_ON_MISSING_CKPT = + "hoodie.deltastreamer.source.hoodieincr.read_latest_on_missing_ckpt"; + private static final Boolean DEFAULT_READ_LATEST_INSTANT_ON_MISSING_CKPT = false; + } + + public HoodieIncrSource(TypedProperties props, + JavaSparkContext sparkContext, SparkSession sparkSession, + SchemaProvider schemaProvider) { + super(props, sparkContext, sparkSession, schemaProvider); + } + + @Override + public Pair>, String> fetchNextBatch(Optional lastCkptStr, long sourceLimit) { + + DataSourceUtils.checkRequiredProperties(props, Arrays.asList(Config.HOODIE_SRC_BASE_PATH)); + + /** + DataSourceUtils.checkRequiredProperties(props, Arrays.asList(Config.HOODIE_SRC_BASE_PATH, + Config.HOODIE_SRC_PARTITION_FIELDS)); + List partitionFields = props.getStringList(Config.HOODIE_SRC_PARTITION_FIELDS, ",", + new ArrayList<>()); + PartitionValueExtractor extractor = DataSourceUtils.createPartitionExtractor(props.getString( + Config.HOODIE_SRC_PARTITION_EXTRACTORCLASS, Config.DEFAULT_HOODIE_SRC_PARTITION_EXTRACTORCLASS)); + **/ + String srcPath = props.getString(Config.HOODIE_SRC_BASE_PATH); + int numInstantsPerFetch = props.getInteger(Config.NUM_INSTANTS_PER_FETCH, Config.DEFAULT_NUM_INSTANTS_PER_FETCH); + boolean readLatestOnMissingCkpt = props.getBoolean(Config.READ_LATEST_INSTANT_ON_MISSING_CKPT, + Config.DEFAULT_READ_LATEST_INSTANT_ON_MISSING_CKPT); + + // Use begin Instant if set and non-empty + Optional beginInstant = + lastCkptStr.isPresent() ? lastCkptStr.get().isEmpty() ? Optional.empty() : lastCkptStr : Optional.empty(); + + Pair instantEndpts = IncrSourceHelper.calculateBeginAndEndInstants(sparkContext, srcPath, + numInstantsPerFetch, beginInstant, readLatestOnMissingCkpt); + + if (instantEndpts.getKey().equals(instantEndpts.getValue())) { + log.warn("Already caught up. Begin Checkpoint was :" + instantEndpts.getKey()); + return Pair.of(Optional.empty(), instantEndpts.getKey()); + } + + // Do Incr pull. 
Set end instant if available + DataFrameReader reader = sparkSession.read().format("com.uber.hoodie") + .option(DataSourceReadOptions.VIEW_TYPE_OPT_KEY(), DataSourceReadOptions.VIEW_TYPE_INCREMENTAL_OPT_VAL()) + .option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY(), instantEndpts.getLeft()) + .option(DataSourceReadOptions.END_INSTANTTIME_OPT_KEY(), instantEndpts.getRight()); + + Dataset source = reader.load(srcPath); + + /** + log.info("Partition Fields are : (" + partitionFields + "). Initial Source Schema :" + source.schema()); + + StructType newSchema = new StructType(source.schema().fields()); + for (String field : partitionFields) { + newSchema = newSchema.add(field, DataTypes.StringType, true); + } + + /** + * Validates if the commit time is sane and also generates Partition fields from _hoodie_partition_path if + * configured + * + Dataset validated = source.map((MapFunction) (Row row) -> { + // _hoodie_instant_time + String instantTime = row.getString(0); + IncrSourceHelper.validateInstantTime(row, instantTime, instantEndpts.getKey(), instantEndpts.getValue()); + if (!partitionFields.isEmpty()) { + // _hoodie_partition_path + String hoodiePartitionPath = row.getString(3); + List partitionVals = extractor.extractPartitionValuesInPath(hoodiePartitionPath).stream() + .map(o -> (Object) o).collect(Collectors.toList()); + Preconditions.checkArgument(partitionVals.size() == partitionFields.size(), + "#partition-fields != #partition-values-extracted"); + List rowObjs = new ArrayList<>(scala.collection.JavaConversions.seqAsJavaList(row.toSeq())); + rowObjs.addAll(partitionVals); + return RowFactory.create(rowObjs.toArray()); + } + return row; + }, RowEncoder.apply(newSchema)); + + log.info("Validated Source Schema :" + validated.schema()); + **/ + + // Remove Hoodie meta columns except partition path from input source + final Dataset src = source.drop(HoodieRecord.HOODIE_META_COLUMNS.stream() + .filter(x -> 
!x.equals(HoodieRecord.PARTITION_PATH_METADATA_FIELD)).toArray(String[]::new)); + //log.info("Final Schema from Source is :" + src.schema()); + return Pair.of(Optional.of(src), instantEndpts.getRight()); + } +} diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/InputBatch.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/InputBatch.java new file mode 100644 index 000000000..9139057b9 --- /dev/null +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/InputBatch.java @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2018 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * + */ + +package com.uber.hoodie.utilities.sources; + +import com.uber.hoodie.utilities.schema.SchemaProvider; +import java.util.Optional; + +public class InputBatch { + + private final Optional batch; + private final String checkpointForNextBatch; + private final SchemaProvider schemaProvider; + + public InputBatch(Optional batch, String checkpointForNextBatch, + SchemaProvider schemaProvider) { + this.batch = batch; + this.checkpointForNextBatch = checkpointForNextBatch; + this.schemaProvider = schemaProvider; + } + + public InputBatch(Optional batch, String checkpointForNextBatch) { + this.batch = batch; + this.checkpointForNextBatch = checkpointForNextBatch; + this.schemaProvider = null; + } + + public Optional getBatch() { + return batch; + } + + public String getCheckpointForNextBatch() { + return checkpointForNextBatch; + } + + public SchemaProvider getSchemaProvider() { + return schemaProvider; + } +} diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/JsonDFSSource.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/JsonDFSSource.java index 6b1018e15..bbf985ba0 100644 --- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/JsonDFSSource.java +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/JsonDFSSource.java @@ -19,22 +19,38 @@ package com.uber.hoodie.utilities.sources; import com.uber.hoodie.common.util.TypedProperties; +import com.uber.hoodie.common.util.collection.Pair; import com.uber.hoodie.utilities.schema.SchemaProvider; -import org.apache.avro.generic.GenericRecord; +import com.uber.hoodie.utilities.sources.helpers.DFSPathSelector; +import java.util.Optional; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.SparkSession; /** * DFS Source that reads json data */ -public class JsonDFSSource extends DFSSource { +public class JsonDFSSource extends JsonSource { - public 
JsonDFSSource(TypedProperties props, JavaSparkContext sparkContext, SchemaProvider schemaProvider) { - super(props, sparkContext, schemaProvider); + private final DFSPathSelector pathSelector; + + public JsonDFSSource(TypedProperties props, JavaSparkContext sparkContext, SparkSession sparkSession, + SchemaProvider schemaProvider) { + super(props, sparkContext, sparkSession, schemaProvider); + this.pathSelector = new DFSPathSelector(props, sparkContext.hadoopConfiguration()); } @Override - protected JavaRDD fromFiles(AvroConvertor convertor, String pathStr) { - return sparkContext.textFile(pathStr).map(convertor::fromJson); + protected InputBatch> fetchNewData(Optional lastCkptStr, + long sourceLimit) { + Pair, String> selPathsWithMaxModificationTime = + pathSelector.getNextFilePathsAndMaxModificationTime(lastCkptStr, sourceLimit); + return selPathsWithMaxModificationTime.getLeft().map(pathStr -> new InputBatch<>( + Optional.of(fromFiles(pathStr)), selPathsWithMaxModificationTime.getRight())) + .orElse(new InputBatch<>(Optional.empty(), selPathsWithMaxModificationTime.getRight())); + } + + private JavaRDD fromFiles(String pathStr) { + return sparkContext.textFile(pathStr); } } diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/JsonKafkaSource.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/JsonKafkaSource.java index b271e3704..339c37355 100644 --- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/JsonKafkaSource.java +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/JsonKafkaSource.java @@ -20,26 +20,49 @@ package com.uber.hoodie.utilities.sources; import com.uber.hoodie.common.util.TypedProperties; import com.uber.hoodie.utilities.schema.SchemaProvider; +import com.uber.hoodie.utilities.sources.helpers.KafkaOffsetGen; +import com.uber.hoodie.utilities.sources.helpers.KafkaOffsetGen.CheckpointUtils; +import java.util.Optional; import kafka.serializer.StringDecoder; -import 
org.apache.avro.generic.GenericRecord; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.SparkSession; import org.apache.spark.streaming.kafka.KafkaUtils; import org.apache.spark.streaming.kafka.OffsetRange; /** * Read json kafka data */ -public class JsonKafkaSource extends KafkaSource { +public class JsonKafkaSource extends JsonSource { - public JsonKafkaSource(TypedProperties properties, JavaSparkContext sparkContext, SchemaProvider schemaProvider) { - super(properties, sparkContext, schemaProvider); + private static Logger log = LogManager.getLogger(JsonKafkaSource.class); + + private final KafkaOffsetGen offsetGen; + + public JsonKafkaSource(TypedProperties properties, JavaSparkContext sparkContext, SparkSession sparkSession, + SchemaProvider schemaProvider) { + super(properties, sparkContext, sparkSession, schemaProvider); + offsetGen = new KafkaOffsetGen(properties); } @Override - protected JavaRDD toAvroRDD(OffsetRange[] offsetRanges, AvroConvertor avroConvertor) { + protected InputBatch> fetchNewData(Optional lastCheckpointStr, + long sourceLimit) { + OffsetRange[] offsetRanges = offsetGen.getNextOffsetRanges(lastCheckpointStr, sourceLimit); + long totalNewMsgs = CheckpointUtils.totalNewMessages(offsetRanges); + if (totalNewMsgs <= 0) { + return new InputBatch<>(Optional.empty(), + lastCheckpointStr.isPresent() ? 
lastCheckpointStr.get() : ""); + } + log.info("About to read " + totalNewMsgs + " from Kafka for topic :" + offsetGen.getTopicName()); + JavaRDD newDataRDD = toRDD(offsetRanges); + return new InputBatch<>(Optional.of(newDataRDD), CheckpointUtils.offsetsToStr(offsetRanges)); + } + + private JavaRDD toRDD(OffsetRange[] offsetRanges) { return KafkaUtils.createRDD(sparkContext, String.class, String.class, StringDecoder.class, StringDecoder.class, - kafkaParams, offsetRanges) - .values().map(avroConvertor::fromJson); + offsetGen.getKafkaParams(), offsetRanges).values(); } } diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/JsonSource.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/JsonSource.java new file mode 100644 index 000000000..27ec5f3eb --- /dev/null +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/JsonSource.java @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2018 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * + */ + +package com.uber.hoodie.utilities.sources; + +import com.uber.hoodie.common.util.TypedProperties; +import com.uber.hoodie.utilities.schema.SchemaProvider; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.SparkSession; + +public abstract class JsonSource extends Source> { + + public JsonSource(TypedProperties props, + JavaSparkContext sparkContext, + SparkSession sparkSession, + SchemaProvider schemaProvider) { + super(props, sparkContext, sparkSession, schemaProvider, SourceType.JSON); + } +} diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/RowSource.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/RowSource.java new file mode 100644 index 000000000..708e55d04 --- /dev/null +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/RowSource.java @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2018 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * + */ + +package com.uber.hoodie.utilities.sources; + +import com.uber.hoodie.common.util.TypedProperties; +import com.uber.hoodie.common.util.collection.Pair; +import com.uber.hoodie.utilities.schema.RowBasedSchemaProvider; +import com.uber.hoodie.utilities.schema.SchemaProvider; +import java.util.Optional; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SparkSession; + +public abstract class RowSource extends Source> { + + public RowSource(TypedProperties props, + JavaSparkContext sparkContext, + SparkSession sparkSession, + SchemaProvider schemaProvider) { + super(props, sparkContext, sparkSession, schemaProvider, SourceType.ROW); + } + + protected abstract Pair>, String> fetchNextBatch(Optional lastCkptStr, + long sourceLimit); + + @Override + protected final InputBatch> fetchNewData(Optional lastCkptStr, long sourceLimit) { + Pair>, String> res = fetchNextBatch(lastCkptStr, sourceLimit); + return res.getKey().map(dsr -> { + SchemaProvider rowSchemaProvider = new RowBasedSchemaProvider(dsr.schema()); + return new InputBatch<>(res.getKey(), res.getValue(), rowSchemaProvider); + }).orElseGet(() -> new InputBatch<>(res.getKey(), res.getValue())); + } +} diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/Source.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/Source.java index 06b83e9b1..4744e9375 100644 --- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/Source.java +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/Source.java @@ -19,36 +19,67 @@ package com.uber.hoodie.utilities.sources; import com.uber.hoodie.common.util.TypedProperties; -import com.uber.hoodie.common.util.collection.Pair; import com.uber.hoodie.utilities.schema.SchemaProvider; import java.io.Serializable; import java.util.Optional; -import org.apache.avro.generic.GenericRecord; -import 
org.apache.spark.api.java.JavaRDD; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.SparkSession; /** * Represents a source from which we can tail data. Assumes a constructor that takes properties. */ -public abstract class Source implements Serializable { +public abstract class Source implements Serializable { + protected static volatile Logger log = LogManager.getLogger(Source.class); - protected transient TypedProperties props; - - protected transient JavaSparkContext sparkContext; - - protected transient SchemaProvider schemaProvider; - - - protected Source(TypedProperties props, JavaSparkContext sparkContext, SchemaProvider schemaProvider) { - this.props = props; - this.sparkContext = sparkContext; - this.schemaProvider = schemaProvider; + public enum SourceType { + JSON, + AVRO, + ROW } + protected transient TypedProperties props; + protected transient JavaSparkContext sparkContext; + protected transient SparkSession sparkSession; + private transient SchemaProvider overriddenSchemaProvider; + + private final SourceType sourceType; + + protected Source(TypedProperties props, JavaSparkContext sparkContext, SparkSession sparkSession, + SchemaProvider schemaProvider) { + this(props, sparkContext, sparkSession, schemaProvider, SourceType.AVRO); + } + + protected Source(TypedProperties props, JavaSparkContext sparkContext, SparkSession sparkSession, + SchemaProvider schemaProvider, SourceType sourceType) { + this.props = props; + this.sparkContext = sparkContext; + this.sparkSession = sparkSession; + this.overriddenSchemaProvider = schemaProvider; + this.sourceType = sourceType; + } + + protected abstract InputBatch fetchNewData(Optional lastCkptStr, long sourceLimit); + /** - * Fetches new data upto sourceLimit, from the provided checkpoint and returns an RDD of the - * data, as well as the checkpoint to be written as a result of that. 
+ * Main API called by Hoodie Delta Streamer to fetch records + * @param lastCkptStr Last Checkpoint + * @param sourceLimit Source Limit + * @return */ - public abstract Pair>, String> fetchNewData( - Optional lastCheckpointStr, long sourceLimit); + public final InputBatch fetchNext(Optional lastCkptStr, long sourceLimit) { + InputBatch batch = fetchNewData(lastCkptStr, sourceLimit); + // If overriddenSchemaProvider is passed in CLI, use it + return overriddenSchemaProvider == null ? batch : new InputBatch<>(batch.getBatch(), + batch.getCheckpointForNextBatch(), overriddenSchemaProvider); + } + + public SourceType getSourceType() { + return sourceType; + } + + public SparkSession getSparkSession() { + return sparkSession; + } } diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/AvroConvertor.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/helpers/AvroConvertor.java similarity index 93% rename from hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/AvroConvertor.java rename to hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/helpers/AvroConvertor.java index feb06d5b3..ef022b7a8 100644 --- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/AvroConvertor.java +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/helpers/AvroConvertor.java @@ -16,7 +16,7 @@ * */ -package com.uber.hoodie.utilities.sources; +package com.uber.hoodie.utilities.sources.helpers; import com.twitter.bijection.Injection; import com.twitter.bijection.avro.GenericAvroCodecs; @@ -55,6 +55,10 @@ public class AvroConvertor implements Serializable { this.schemaStr = schemaStr; } + public AvroConvertor(Schema schema) { + this.schemaStr = schema.toString(); + this.schema = schema; + } private void initSchema() { if (schema == null) { diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/DFSSource.java 
b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/helpers/DFSPathSelector.java similarity index 76% rename from hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/DFSSource.java rename to hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/helpers/DFSPathSelector.java index 6d962b276..2c5f9f292 100644 --- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/DFSSource.java +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/helpers/DFSPathSelector.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * Copyright (c) 2018 Uber Technologies, Inc. (hoodie-dev-group@uber.com) * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,7 +16,7 @@ * */ -package com.uber.hoodie.utilities.sources; +package com.uber.hoodie.utilities.sources.helpers; import com.uber.hoodie.DataSourceUtils; import com.uber.hoodie.common.util.FSUtils; @@ -24,45 +24,38 @@ import com.uber.hoodie.common.util.TypedProperties; import com.uber.hoodie.common.util.collection.ImmutablePair; import com.uber.hoodie.common.util.collection.Pair; import com.uber.hoodie.exception.HoodieIOException; -import com.uber.hoodie.utilities.schema.SchemaProvider; import java.io.IOException; import java.util.*; import java.util.stream.Collectors; -import org.apache.avro.generic.GenericRecord; +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.LocatedFileStatus; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.RemoteIterator; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -/** - * Source to read data from a given DFS directory structure, incrementally - */ -public abstract class DFSSource extends Source { +public class DFSPathSelector { /** * Configs 
supported */ static class Config { + private static final String ROOT_INPUT_PATH_PROP = "hoodie.deltastreamer.source.dfs.root"; } private static final List IGNORE_FILEPREFIX_LIST = Arrays.asList(".", "_"); private final transient FileSystem fs; + private final TypedProperties props; - public DFSSource(TypedProperties props, JavaSparkContext sparkContext, SchemaProvider schemaProvider) { - super(props, sparkContext, schemaProvider); - DataSourceUtils.checkRequiredProperties(props, Collections.singletonList(Config.ROOT_INPUT_PATH_PROP)); - this.fs = FSUtils.getFs(props.getString(Config.ROOT_INPUT_PATH_PROP), sparkContext.hadoopConfiguration()); + public DFSPathSelector(TypedProperties props, Configuration hadoopConf) { + DataSourceUtils.checkRequiredProperties(props, Arrays.asList(Config.ROOT_INPUT_PATH_PROP)); + this.props = props; + this.fs = FSUtils.getFs(props.getString(Config.ROOT_INPUT_PATH_PROP), hadoopConf); } - protected abstract JavaRDD fromFiles(final AvroConvertor convertor, String pathStr); - - @Override - public Pair>, String> fetchNewData( + public Pair, String> getNextFilePathsAndMaxModificationTime( Optional lastCheckpointStr, long sourceLimit) { try { @@ -111,11 +104,9 @@ public abstract class DFSSource extends Source { // read the files out. 
String pathStr = filteredFiles.stream().map(f -> f.getPath().toString()) .collect(Collectors.joining(",")); - String schemaStr = schemaProvider.getSourceSchema().toString(); - final AvroConvertor avroConvertor = new AvroConvertor(schemaStr); return new ImmutablePair<>( - Optional.of(fromFiles(avroConvertor, pathStr)), + Optional.ofNullable(pathStr), String.valueOf(maxModificationTime)); } catch (IOException ioe) { throw new HoodieIOException( diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/helpers/IncrSourceHelper.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/helpers/IncrSourceHelper.java new file mode 100644 index 000000000..93056012b --- /dev/null +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/helpers/IncrSourceHelper.java @@ -0,0 +1,88 @@ +package com.uber.hoodie.utilities.sources.helpers; + +import com.google.common.base.Preconditions; +import com.uber.hoodie.common.table.HoodieTableMetaClient; +import com.uber.hoodie.common.table.HoodieTimeline; +import com.uber.hoodie.common.table.timeline.HoodieInstant; +import com.uber.hoodie.common.util.collection.Pair; +import java.util.Optional; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.Row; + + +/** + * Helper for Hudi Incremental Source. Has APIs to + * (a) calculate begin and end instant time for incrementally pulling from Hudi source + * (b) Find max seen instant to be set as checkpoint for next fetch. 
+ */ +public class IncrSourceHelper { + + /** + * Get a timestamp which is the next value in a descending sequence + * + * @param timestamp Timestamp + */ + private static String getStrictlyLowerTimestamp(String timestamp) { + long ts = Long.parseLong(timestamp); + Preconditions.checkArgument(ts > 0, "Timestamp must be positive"); + Long lower = ts - 1; + return "" + lower; + } + + /** + * Find begin and end instants to be set for the next fetch + * + * @param jssc Java Spark Context + * @param srcBasePath Base path of Hudi source table + * @param numInstantsPerFetch Max Instants per fetch + * @param beginInstant Last Checkpoint String + * @param readLatestOnMissingBeginInstant when begin instant is missing, allow reading from latest committed instant + * @return begin and end instants + */ + public static Pair<String, String> calculateBeginAndEndInstants( + JavaSparkContext jssc, String srcBasePath, int numInstantsPerFetch, Optional<String> beginInstant, + boolean readLatestOnMissingBeginInstant) { + Preconditions.checkArgument(numInstantsPerFetch > 0, "Make sure the config" + + " hoodie.deltastreamer.source.hoodieincr.num_instants is set to a positive value"); + HoodieTableMetaClient srcMetaClient = new HoodieTableMetaClient(jssc.hadoopConfiguration(), + srcBasePath, true); + + final HoodieTimeline activeCommitTimeline = + srcMetaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants(); + + String beginInstantTime = beginInstant.orElseGet(() -> { + if (readLatestOnMissingBeginInstant) { + Optional<HoodieInstant> lastInstant = activeCommitTimeline.lastInstant(); + return lastInstant.map(hoodieInstant -> getStrictlyLowerTimestamp(hoodieInstant.getTimestamp())).orElse("000"); + } else { + throw new IllegalArgumentException("Missing begin instant for incremental pull. 
For reading from latest " + + "committed instant set hoodie.deltastreamer.source.hoodieincr.read_latest_on_missing_ckpt to true"); + } + }); + + Optional<HoodieInstant> nthInstant = + activeCommitTimeline.findInstantsAfter(beginInstantTime, numInstantsPerFetch).getInstants().reduce((x, y) -> y); + return Pair.of(beginInstantTime, nthInstant.map(instant -> instant.getTimestamp()).orElse(beginInstantTime)); + } + + /** + * Validate instant time seen in the incoming row + * + * @param row Input Row + * @param instantTime Hoodie Instant time of the row + * @param sinceInstant begin instant of the batch + * @param endInstant end instant of the batch + */ + public static void validateInstantTime(Row row, String instantTime, String sinceInstant, String endInstant) { + Preconditions.checkNotNull(instantTime); + Preconditions.checkArgument(HoodieTimeline.compareTimestamps(instantTime, + sinceInstant, HoodieTimeline.GREATER), + "Instant time(_hoodie_commit_time) in row (" + row + ") was : " + instantTime + + " but expected to be between " + sinceInstant + "(excl) - " + + endInstant + "(incl)"); + Preconditions.checkArgument(HoodieTimeline.compareTimestamps(instantTime, + endInstant, HoodieTimeline.LESSER_OR_EQUAL), + "Instant time(_hoodie_commit_time) in row (" + row + ") was : " + instantTime + + " but expected to be between " + sinceInstant + "(excl) - " + endInstant + "(incl)"); + } +} diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/KafkaSource.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/helpers/KafkaOffsetGen.java similarity index 84% rename from hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/KafkaSource.java rename to hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/helpers/KafkaOffsetGen.java index 4699fcaf3..947f3c48a 100644 --- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/KafkaSource.java +++ 
b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/helpers/KafkaOffsetGen.java @@ -16,24 +16,22 @@ * */ -package com.uber.hoodie.utilities.sources; +package com.uber.hoodie.utilities.sources.helpers; import com.uber.hoodie.DataSourceUtils; import com.uber.hoodie.common.util.TypedProperties; -import com.uber.hoodie.common.util.collection.ImmutablePair; -import com.uber.hoodie.common.util.collection.Pair; import com.uber.hoodie.exception.HoodieNotSupportedException; import com.uber.hoodie.utilities.exception.HoodieDeltaStreamerException; -import com.uber.hoodie.utilities.schema.SchemaProvider; - -import java.util.*; +import java.util.Arrays; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Optional; import java.util.stream.Collectors; import kafka.common.TopicAndPartition; -import org.apache.avro.generic.GenericRecord; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.streaming.kafka.KafkaCluster; import org.apache.spark.streaming.kafka.KafkaCluster.LeaderOffset; import org.apache.spark.streaming.kafka.OffsetRange; @@ -49,14 +47,13 @@ import scala.util.Either; /** * Source to read data from Kafka, incrementally */ -public abstract class KafkaSource extends Source { +public class KafkaOffsetGen { - private static volatile Logger log = LogManager.getLogger(KafkaSource.class); + private static volatile Logger log = LogManager.getLogger(KafkaOffsetGen.class); private static long DEFAULT_MAX_EVENTS_TO_READ = 1000000; // 1M events max - - static class CheckpointUtils { + public static class CheckpointUtils { /** * Reconstruct checkpoint from string. 
@@ -90,7 +87,6 @@ public abstract class KafkaSource extends Source { return sb.toString(); } - /** * Compute the offset ranges to read from Kafka, while handling newly added partitions, skews, event limits. * @@ -174,19 +170,18 @@ public abstract class KafkaSource extends Source { * Configs to be passed for this source. All standard Kafka consumer configs are also respected */ static class Config { + private static final String KAFKA_TOPIC_NAME = "hoodie.deltastreamer.source.kafka.topic"; private static final KafkaResetOffsetStrategies DEFAULT_AUTO_RESET_OFFSET = KafkaResetOffsetStrategies.LARGEST; } - - protected HashMap kafkaParams; - + private final HashMap kafkaParams; + private final TypedProperties props; protected final String topicName; - public KafkaSource(TypedProperties props, JavaSparkContext sparkContext, SchemaProvider schemaProvider) { - super(props, sparkContext, schemaProvider); - - kafkaParams = new HashMap<>(); + public KafkaOffsetGen(TypedProperties props) { + this.props = props; + kafkaParams = new HashMap(); for (Object prop : props.keySet()) { kafkaParams.put(prop.toString(), props.getString(prop.toString())); } @@ -194,11 +189,7 @@ public abstract class KafkaSource extends Source { topicName = props.getString(Config.KAFKA_TOPIC_NAME); } - protected abstract JavaRDD toAvroRDD(OffsetRange[] offsetRanges, AvroConvertor avroConvertor); - - @Override - public Pair>, String> fetchNewData( - Optional lastCheckpointStr, long sourceLimit) { + public OffsetRange[] getNextOffsetRanges(Optional lastCheckpointStr, long sourceLimit) { // Obtain current metadata for the topic KafkaCluster cluster = new KafkaCluster(ScalaHelpers.toScalaMap(kafkaParams)); @@ -240,16 +231,15 @@ public abstract class KafkaSource extends Source { // Come up with final set of OffsetRanges to read (account for new partitions, limit number of events) long numEvents = Math.min(DEFAULT_MAX_EVENTS_TO_READ, sourceLimit); OffsetRange[] offsetRanges = 
CheckpointUtils.computeOffsetRanges(fromOffsets, toOffsets, numEvents); - long totalNewMsgs = CheckpointUtils.totalNewMessages(offsetRanges); - if (totalNewMsgs <= 0) { - return new ImmutablePair<>(Optional.empty(), lastCheckpointStr.orElse("")); - } else { - log.info("About to read " + totalNewMsgs + " from Kafka for topic :" + topicName); - } - // Produce a RDD[GenericRecord] - final AvroConvertor avroConvertor = new AvroConvertor(schemaProvider.getSourceSchema().toString()); - JavaRDD newDataRDD = toAvroRDD(offsetRanges, avroConvertor); - return new ImmutablePair<>(Optional.of(newDataRDD), CheckpointUtils.offsetsToStr(offsetRanges)); + return offsetRanges; + } + + public String getTopicName() { + return topicName; + } + + public HashMap getKafkaParams() { + return kafkaParams; } } diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/transform/IdentityTransformer.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/transform/IdentityTransformer.java new file mode 100644 index 000000000..b454cdf1d --- /dev/null +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/transform/IdentityTransformer.java @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2018 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * + */ + +package com.uber.hoodie.utilities.transform; + +import com.uber.hoodie.common.util.TypedProperties; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SparkSession; + +/** + * Identity transformer + */ +public class IdentityTransformer implements Transformer { + + @Override + public Dataset<Row> apply(JavaSparkContext jsc, SparkSession sparkSession, + Dataset<Row> rowDataset, TypedProperties properties) { + return rowDataset; + } +} diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/transform/SqlQueryBasedTransformer.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/transform/SqlQueryBasedTransformer.java new file mode 100644 index 000000000..b967f45d1 --- /dev/null +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/transform/SqlQueryBasedTransformer.java @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2018 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ * + * + */ + +package com.uber.hoodie.utilities.transform; + +import com.uber.hoodie.common.util.TypedProperties; +import java.util.UUID; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SparkSession; + +/** + * A transformer that allows a sql-query template be used to transform the source before writing to Hudi data-set. + * + * The query should reference the source as a table named "<SRC>" + */ +public class SqlQueryBasedTransformer implements Transformer { + + private static volatile Logger log = LogManager.getLogger(SqlQueryBasedTransformer.class); + + private static final String SRC_PATTERN = "<SRC>"; + private static final String TMP_TABLE = "HOODIE_SRC_TMP_TABLE_"; + + /** + * Configs supported + */ + static class Config { + + private static final String TRANSFORMER_SQL = "hoodie.deltastreamer.transformer.sql"; + } + + @Override + public Dataset<Row> apply(JavaSparkContext jsc, SparkSession sparkSession, + Dataset<Row> rowDataset, TypedProperties properties) { + String transformerSQL = properties.getString(Config.TRANSFORMER_SQL); + if (null == transformerSQL) { + throw new IllegalArgumentException("Missing configuration : (" + Config.TRANSFORMER_SQL + ")"); + } + + // tmp table name doesn't like dashes + String tmpTable = TMP_TABLE.concat(UUID.randomUUID().toString().replace("-", "_")); + log.info("Registering tmp table : " + tmpTable); + rowDataset.registerTempTable(tmpTable); + String sqlStr = transformerSQL.replaceAll(SRC_PATTERN, tmpTable); + log.info("SQL Query for transformation : (" + sqlStr + ")"); + return sparkSession.sql(sqlStr); + } +} diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/transform/Transformer.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/transform/Transformer.java new file mode 100644 index 000000000..32e80facd --- /dev/null +++ 
b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/transform/Transformer.java @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2018 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * + */ + +package com.uber.hoodie.utilities.transform; + +import com.uber.hoodie.common.util.TypedProperties; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SparkSession; + +/** + * Transform source to target dataset before writing + */ +public interface Transformer { + + /** + * Transform source RDD to target RDD + * + * @param jsc JavaSparkContext + * @param rowDataset Source DataSet + * @param sparkSession Spark Session + * @param properties Config properties + * @return Transformed Dataset + */ + Dataset<Row> apply(JavaSparkContext jsc, SparkSession sparkSession, + Dataset<Row> rowDataset, TypedProperties properties); +} diff --git a/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/TestHoodieDeltaStreamer.java b/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/TestHoodieDeltaStreamer.java index 63d93b414..acce00451 100644 --- a/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/TestHoodieDeltaStreamer.java +++ b/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/TestHoodieDeltaStreamer.java @@ -19,8 +19,10 @@ package com.uber.hoodie.utilities; import static org.junit.Assert.assertEquals; +import 
static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; +import com.uber.hoodie.DataSourceWriteOptions; import com.uber.hoodie.common.model.HoodieCommitMetadata; import com.uber.hoodie.common.table.HoodieTableMetaClient; import com.uber.hoodie.common.table.HoodieTimeline; @@ -28,17 +30,31 @@ import com.uber.hoodie.common.table.timeline.HoodieInstant; import com.uber.hoodie.common.util.DFSPropertiesConfiguration; import com.uber.hoodie.common.util.TypedProperties; import com.uber.hoodie.exception.DatasetNotFoundException; +import com.uber.hoodie.hive.HiveSyncConfig; +import com.uber.hoodie.hive.HoodieHiveClient; +import com.uber.hoodie.hive.MultiPartKeysValueExtractor; import com.uber.hoodie.utilities.deltastreamer.HoodieDeltaStreamer; import com.uber.hoodie.utilities.deltastreamer.HoodieDeltaStreamer.Operation; +import com.uber.hoodie.utilities.schema.FilebasedSchemaProvider; +import com.uber.hoodie.utilities.sources.HoodieIncrSource; import com.uber.hoodie.utilities.sources.TestDataSource; +import com.uber.hoodie.utilities.transform.SqlQueryBasedTransformer; +import com.uber.hoodie.utilities.transform.Transformer; import java.io.IOException; +import java.util.ArrayList; import java.util.List; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.api.java.UDF4; +import org.apache.spark.sql.functions; +import org.apache.spark.sql.types.DataTypes; import org.junit.After; import org.junit.AfterClass; import org.junit.Before; @@ -55,17 +71,43 @@ public class TestHoodieDeltaStreamer extends UtilitiesTestBase { @BeforeClass public static void initClass() throws Exception { - UtilitiesTestBase.initClass(); + 
UtilitiesTestBase.initClass(true); // prepare the configs. UtilitiesTestBase.Helpers.copyToDFS("delta-streamer-config/base.properties", dfs, dfsBasePath + "/base.properties"); + UtilitiesTestBase.Helpers.copyToDFS("delta-streamer-config/sql-transformer.properties", dfs, + dfsBasePath + "/sql-transformer.properties"); UtilitiesTestBase.Helpers.copyToDFS("delta-streamer-config/source.avsc", dfs, dfsBasePath + "/source.avsc"); + UtilitiesTestBase.Helpers.copyToDFS("delta-streamer-config/target.avsc", dfs, dfsBasePath + "/target.avsc"); + TypedProperties props = new TypedProperties(); - props.setProperty("include", "base.properties"); + props.setProperty("include", "sql-transformer.properties"); props.setProperty("hoodie.datasource.write.recordkey.field", "_row_key"); props.setProperty("hoodie.datasource.write.partitionpath.field", "not_there"); props.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.file", dfsBasePath + "/source.avsc"); + props.setProperty("hoodie.deltastreamer.schemaprovider.target.schema.file", dfsBasePath + "/target.avsc"); + // Hive Configs + props.setProperty(DataSourceWriteOptions.HIVE_URL_OPT_KEY(), "jdbc:hive2://127.0.0.1:9999/"); + props.setProperty(DataSourceWriteOptions.HIVE_DATABASE_OPT_KEY(), "testdb1"); + props.setProperty(DataSourceWriteOptions.HIVE_TABLE_OPT_KEY(), "hive_trips"); + props.setProperty(DataSourceWriteOptions.HIVE_ASSUME_DATE_PARTITION_OPT_KEY(), "false"); + props.setProperty(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY(), "datestr"); + props.setProperty(DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY(), + MultiPartKeysValueExtractor.class.getName()); UtilitiesTestBase.Helpers.savePropsToDFS(props, dfs, dfsBasePath + "/test-source.properties"); + + // Properties used for the delta-streamer which incrementally pulls from upstream Hudi source table and writes to + // downstream hudi table + TypedProperties downstreamProps = new TypedProperties(); + downstreamProps.setProperty("include", 
"base.properties"); + downstreamProps.setProperty("hoodie.datasource.write.recordkey.field", "_row_key"); + downstreamProps.setProperty("hoodie.datasource.write.partitionpath.field", "not_there"); + + // Source schema is the target schema of upstream table + downstreamProps.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.file", dfsBasePath + "/target.avsc"); + downstreamProps.setProperty("hoodie.deltastreamer.schemaprovider.target.schema.file", dfsBasePath + "/target.avsc"); + UtilitiesTestBase.Helpers.savePropsToDFS(downstreamProps, dfs, + dfsBasePath + "/test-downstream-source.properties"); } @AfterClass @@ -86,17 +128,48 @@ public class TestHoodieDeltaStreamer extends UtilitiesTestBase { } static class TestHelpers { - static HoodieDeltaStreamer.Config makeConfig(String basePath, Operation op) { + return makeConfig(basePath, op, TripsWithDistanceTransformer.class.getName()); + } + + static HoodieDeltaStreamer.Config makeConfig(String basePath, Operation op, String transformerClassName) { + return makeConfig(basePath, op, transformerClassName, false); + } + + static HoodieDeltaStreamer.Config makeConfig(String basePath, Operation op, String transformerClassName, + boolean enableHiveSync) { HoodieDeltaStreamer.Config cfg = new HoodieDeltaStreamer.Config(); cfg.targetBasePath = basePath; cfg.targetTableName = "hoodie_trips"; cfg.storageType = "COPY_ON_WRITE"; cfg.sourceClassName = TestDataSource.class.getName(); + cfg.transformerClassName = transformerClassName; cfg.operation = op; + cfg.enableHiveSync = enableHiveSync; cfg.sourceOrderingField = "timestamp"; cfg.propsFilePath = dfsBasePath + "/test-source.properties"; cfg.sourceLimit = 1000; + cfg.schemaProviderClassName = FilebasedSchemaProvider.class.getName(); + return cfg; + } + + static HoodieDeltaStreamer.Config makeConfigForHudiIncrSrc(String srcBasePath, String basePath, Operation op, + boolean addReadLatestOnMissingCkpt) { + HoodieDeltaStreamer.Config cfg = new HoodieDeltaStreamer.Config(); + 
cfg.targetBasePath = basePath; + cfg.targetTableName = "hoodie_trips_copy"; + cfg.storageType = "COPY_ON_WRITE"; + cfg.sourceClassName = HoodieIncrSource.class.getName(); + cfg.operation = op; + cfg.sourceOrderingField = "timestamp"; + cfg.propsFilePath = dfsBasePath + "/test-downstream-source.properties"; + cfg.sourceLimit = 1000; + List cfgs = new ArrayList<>(); + cfgs.add("hoodie.deltastreamer.source.hoodieincr.read_latest_on_missing_ckpt=" + addReadLatestOnMissingCkpt); + cfgs.add("hoodie.deltastreamer.source.hoodieincr.path=" + srcBasePath); + // No partition + cfgs.add("hoodie.deltastreamer.source.hoodieincr.partition.fields=datestr"); + cfg.configs = cfgs; return cfg; } @@ -110,15 +183,30 @@ public class TestHoodieDeltaStreamer extends UtilitiesTestBase { .sort("_hoodie_commit_time").collectAsList(); } - static void assertCommitMetadata(String expected, String datasetPath, FileSystem fs, int totalCommits) + static void assertDistanceCount(long expected, String datasetPath, SQLContext sqlContext) { + sqlContext.read().format("com.uber.hoodie").load(datasetPath).registerTempTable("tmp_trips"); + long recordCount = + sqlContext.sparkSession().sql("select * from tmp_trips where haversine_distance is not NULL").count(); + assertEquals(expected, recordCount); + } + + static void assertDistanceCountWithExactValue(long expected, String datasetPath, SQLContext sqlContext) { + sqlContext.read().format("com.uber.hoodie").load(datasetPath).registerTempTable("tmp_trips"); + long recordCount = + sqlContext.sparkSession().sql("select * from tmp_trips where haversine_distance = 1.0").count(); + assertEquals(expected, recordCount); + } + + static String assertCommitMetadata(String expected, String datasetPath, FileSystem fs, int totalCommits) throws IOException { HoodieTableMetaClient meta = new HoodieTableMetaClient(fs.getConf(), datasetPath); HoodieTimeline timeline = meta.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(); - HoodieInstant lastCommit = 
timeline.lastInstant().get(); + HoodieInstant lastInstant = timeline.lastInstant().get(); HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes( - timeline.getInstantDetails(lastCommit).get(), HoodieCommitMetadata.class); + timeline.getInstantDetails(lastInstant).get(), HoodieCommitMetadata.class); assertEquals(totalCommits, timeline.countInstants()); assertEquals(expected, commitMetadata.getMetadata(HoodieDeltaStreamer.CHECKPOINT_KEY)); + return lastInstant.getTimestamp(); } } @@ -152,12 +240,14 @@ public class TestHoodieDeltaStreamer extends UtilitiesTestBase { HoodieDeltaStreamer.Config cfg = TestHelpers.makeConfig(datasetBasePath, Operation.BULK_INSERT); new HoodieDeltaStreamer(cfg, jsc).sync(); TestHelpers.assertRecordCount(1000, datasetBasePath + "/*/*.parquet", sqlContext); + TestHelpers.assertDistanceCount(1000, datasetBasePath + "/*/*.parquet", sqlContext); TestHelpers.assertCommitMetadata("00000", datasetBasePath, dfs, 1); // No new data => no commits. cfg.sourceLimit = 0; new HoodieDeltaStreamer(cfg, jsc).sync(); TestHelpers.assertRecordCount(1000, datasetBasePath + "/*/*.parquet", sqlContext); + TestHelpers.assertDistanceCount(1000, datasetBasePath + "/*/*.parquet", sqlContext); TestHelpers.assertCommitMetadata("00000", datasetBasePath, dfs, 1); // upsert() #1 @@ -165,11 +255,94 @@ public class TestHoodieDeltaStreamer extends UtilitiesTestBase { cfg.operation = Operation.UPSERT; new HoodieDeltaStreamer(cfg, jsc).sync(); TestHelpers.assertRecordCount(2000, datasetBasePath + "/*/*.parquet", sqlContext); + TestHelpers.assertDistanceCount(2000, datasetBasePath + "/*/*.parquet", sqlContext); TestHelpers.assertCommitMetadata("00001", datasetBasePath, dfs, 2); List counts = TestHelpers.countsPerCommit(datasetBasePath + "/*/*.parquet", sqlContext); assertEquals(2000, counts.get(0).getLong(1)); } + /** + * Test Bulk Insert and upserts with hive syncing. 
Tests Hudi incremental processing using a 2 step pipeline + * The first step involves using a SQL template to transform a source + * TEST-DATA-SOURCE ============================> HUDI TABLE 1 ===============> HUDI TABLE 2 + * (incr-pull with transform) (incr-pull) + * Hudi Table 1 is synced with Hive. + * @throws Exception + */ + @Test + public void testBulkInsertsAndUpsertsWithSQLBasedTransformerFor2StepPipeline() throws Exception { + String datasetBasePath = dfsBasePath + "/test_dataset2"; + String downstreamDatasetBasePath = dfsBasePath + "/test_downstream_dataset2"; + + HiveSyncConfig hiveSyncConfig = getHiveSyncConfig(datasetBasePath, "hive_trips"); + + // Initial bulk insert to ingest to first hudi table + HoodieDeltaStreamer.Config cfg = TestHelpers.makeConfig(datasetBasePath, Operation.BULK_INSERT, + SqlQueryBasedTransformer.class.getName(), true); + new HoodieDeltaStreamer(cfg, jsc, dfs, hiveServer.getHiveConf()).sync(); + TestHelpers.assertRecordCount(1000, datasetBasePath + "/*/*.parquet", sqlContext); + TestHelpers.assertDistanceCount(1000, datasetBasePath + "/*/*.parquet", sqlContext); + TestHelpers.assertDistanceCountWithExactValue(1000, datasetBasePath + "/*/*.parquet", sqlContext); + String lastInstantForUpstreamTable = TestHelpers.assertCommitMetadata("00000", datasetBasePath, dfs, 1); + + // Now incrementally pull from the above hudi table and ingest to second table + HoodieDeltaStreamer.Config downstreamCfg = + TestHelpers.makeConfigForHudiIncrSrc(datasetBasePath, downstreamDatasetBasePath, Operation.BULK_INSERT, true); + new HoodieDeltaStreamer(downstreamCfg, jsc, dfs, hiveServer.getHiveConf()).sync(); + TestHelpers.assertRecordCount(1000, downstreamDatasetBasePath + "/*/*.parquet", sqlContext); + TestHelpers.assertDistanceCount(1000, downstreamDatasetBasePath + "/*/*.parquet", sqlContext); + TestHelpers.assertDistanceCountWithExactValue(1000, downstreamDatasetBasePath + "/*/*.parquet", sqlContext); + 
TestHelpers.assertCommitMetadata(lastInstantForUpstreamTable, downstreamDatasetBasePath, dfs, 1); + + // No new data => no commits for upstream table + cfg.sourceLimit = 0; + new HoodieDeltaStreamer(cfg, jsc, dfs, hiveServer.getHiveConf()).sync(); + TestHelpers.assertRecordCount(1000, datasetBasePath + "/*/*.parquet", sqlContext); + TestHelpers.assertDistanceCount(1000, datasetBasePath + "/*/*.parquet", sqlContext); + TestHelpers.assertDistanceCountWithExactValue(1000, datasetBasePath + "/*/*.parquet", sqlContext); + TestHelpers.assertCommitMetadata("00000", datasetBasePath, dfs, 1); + + // with no change in upstream table, no change in downstream too when pulled. + new HoodieDeltaStreamer(downstreamCfg, jsc).sync(); + TestHelpers.assertRecordCount(1000, downstreamDatasetBasePath + "/*/*.parquet", sqlContext); + TestHelpers.assertDistanceCount(1000, downstreamDatasetBasePath + "/*/*.parquet", sqlContext); + TestHelpers.assertDistanceCountWithExactValue(1000, downstreamDatasetBasePath + "/*/*.parquet", sqlContext); + TestHelpers.assertCommitMetadata(lastInstantForUpstreamTable, downstreamDatasetBasePath, dfs, 1); + + // upsert() #1 on upstream hudi table + cfg.sourceLimit = 2000; + cfg.operation = Operation.UPSERT; + new HoodieDeltaStreamer(cfg, jsc, dfs, hiveServer.getHiveConf()).sync(); + TestHelpers.assertRecordCount(2000, datasetBasePath + "/*/*.parquet", sqlContext); + TestHelpers.assertDistanceCount(2000, datasetBasePath + "/*/*.parquet", sqlContext); + TestHelpers.assertDistanceCountWithExactValue(2000, datasetBasePath + "/*/*.parquet", sqlContext); + lastInstantForUpstreamTable = TestHelpers.assertCommitMetadata("00001", datasetBasePath, dfs, 2); + List counts = TestHelpers.countsPerCommit(datasetBasePath + "/*/*.parquet", sqlContext); + assertEquals(2000, counts.get(0).getLong(1)); + + // Incrementally pull changes in upstream hudi table and apply to downstream table + downstreamCfg = + TestHelpers.makeConfigForHudiIncrSrc(datasetBasePath, 
downstreamDatasetBasePath, Operation.UPSERT, false); + downstreamCfg.sourceLimit = 2000; + new HoodieDeltaStreamer(downstreamCfg, jsc).sync(); + TestHelpers.assertRecordCount(2000, downstreamDatasetBasePath + "/*/*.parquet", sqlContext); + TestHelpers.assertDistanceCount(2000, downstreamDatasetBasePath + "/*/*.parquet", sqlContext); + TestHelpers.assertDistanceCountWithExactValue(2000, downstreamDatasetBasePath + "/*/*.parquet", sqlContext); + String finalInstant = + TestHelpers.assertCommitMetadata(lastInstantForUpstreamTable, downstreamDatasetBasePath, dfs, 2); + counts = TestHelpers.countsPerCommit(downstreamDatasetBasePath + "/*/*.parquet", sqlContext); + assertEquals(2000, counts.get(0).getLong(1)); + + // Test Hive integration + HoodieHiveClient hiveClient = new HoodieHiveClient(hiveSyncConfig, hiveServer.getHiveConf(), dfs); + assertTrue("Table " + hiveSyncConfig.tableName + " should exist", + hiveClient.doesTableExist()); + assertEquals("Table partitions should match the number of partitions we wrote", 1, + hiveClient.scanTablePartitions().size()); + assertEquals("The last commit that was synced should be updated in the TBLPROPERTIES", + lastInstantForUpstreamTable, hiveClient.getLastCommitTimeSynced().get()); + } + @Test public void testFilterDupes() throws Exception { String datasetBasePath = dfsBasePath + "/test_dupes_dataset"; @@ -192,4 +365,57 @@ public class TestHoodieDeltaStreamer extends UtilitiesTestBase { assertEquals(1000, counts.get(0).getLong(1)); assertEquals(1000, counts.get(1).getLong(1)); } + + /** + * UDF to calculate Haversine distance + */ + public static class DistanceUDF implements UDF4 { + + /** + * + * Taken from https://stackoverflow.com/questions/3694380/calculating-distance-between-two-points-using-latitude- + * longitude-what-am-i-doi + * Calculate distance between two points in latitude and longitude taking + * into account height difference. If you are not interested in height + * difference pass 0.0. 
Uses Haversine method as its base. + * + * lat1, lon1 Start point lat2, lon2 End point el1 Start altitude in meters + * el2 End altitude in meters + * @return Distance in Meters + */ + @Override + public Double call(Double lat1, Double lat2, Double lon1, Double lon2) { + + final int R = 6371; // Radius of the earth + + double latDistance = Math.toRadians(lat2 - lat1); + double lonDistance = Math.toRadians(lon2 - lon1); + double a = Math.sin(latDistance / 2) * Math.sin(latDistance / 2) + + Math.cos(Math.toRadians(lat1)) * Math.cos(Math.toRadians(lat2)) + * Math.sin(lonDistance / 2) * Math.sin(lonDistance / 2); + double c = 2 * Math.atan2(Math.sqrt(a), Math.sqrt(1 - a)); + double distance = R * c * 1000; // convert to meters + + double height = 0; + + distance = Math.pow(distance, 2) + Math.pow(height, 2); + + return Math.sqrt(distance); + } + } + + /** + * Adds a new field "haversine_distance" to the row + */ + public static class TripsWithDistanceTransformer implements Transformer { + + @Override + public Dataset apply(JavaSparkContext jsc, SparkSession sparkSession, + Dataset rowDataset, TypedProperties properties) { + rowDataset.sqlContext().udf().register("distance_udf", new DistanceUDF(), DataTypes.DoubleType); + return rowDataset.withColumn("haversine_distance", + functions.callUDF("distance_udf", functions.col("begin_lat"), + functions.col("end_lat"), functions.col("begin_lon"), functions.col("end_lon"))); + } + } } diff --git a/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/UtilitiesTestBase.java b/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/UtilitiesTestBase.java index eb2d00a65..a5bba7343 100644 --- a/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/UtilitiesTestBase.java +++ b/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/UtilitiesTestBase.java @@ -18,10 +18,16 @@ package com.uber.hoodie.utilities; +import com.google.common.collect.ImmutableList; import com.uber.hoodie.common.TestRawTripPayload; import 
com.uber.hoodie.common.minicluster.HdfsTestService; import com.uber.hoodie.common.model.HoodieRecord; +import com.uber.hoodie.common.model.HoodieTableType; +import com.uber.hoodie.common.table.HoodieTableMetaClient; import com.uber.hoodie.common.util.TypedProperties; +import com.uber.hoodie.hive.HiveSyncConfig; +import com.uber.hoodie.hive.HoodieHiveClient; +import com.uber.hoodie.hive.util.HiveTestService; import com.uber.hoodie.utilities.sources.TestDataSource; import java.io.BufferedReader; import java.io.IOException; @@ -32,8 +38,11 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hdfs.DistributedFileSystem; import org.apache.hadoop.hdfs.MiniDFSCluster; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hive.service.server.HiveServer2; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; import org.junit.After; import org.junit.AfterClass; import org.junit.Before; @@ -51,15 +60,26 @@ public class UtilitiesTestBase { protected static MiniDFSCluster dfsCluster; protected static DistributedFileSystem dfs; protected transient JavaSparkContext jsc = null; + protected transient SparkSession sparkSession = null; protected transient SQLContext sqlContext; + protected static HiveServer2 hiveServer; @BeforeClass public static void initClass() throws Exception { + initClass(false); + } + + static void initClass(boolean startHiveService) throws Exception { hdfsTestService = new HdfsTestService(); dfsCluster = hdfsTestService.start(true); dfs = dfsCluster.getFileSystem(); dfsBasePath = dfs.getWorkingDirectory().toString(); dfs.mkdirs(new Path(dfsBasePath)); + if (startHiveService) { + HiveTestService hiveService = new HiveTestService(hdfsTestService.getHadoopConf()); + hiveServer = hiveService.start(); + clearHiveDb(); + } } @AfterClass @@ -67,6 +87,9 @@ public class UtilitiesTestBase { if (hdfsTestService != null) { 
hdfsTestService.stop(); } + if (hiveServer != null) { + hiveServer.stop(); + } } @Before @@ -74,6 +97,7 @@ public class UtilitiesTestBase { TestDataSource.initDataGen(); jsc = UtilHelpers.buildSparkContext(this.getClass().getName() + "-hoodie", "local[2]"); sqlContext = new SQLContext(jsc); + sparkSession = SparkSession.builder().config(jsc.getConf()).getOrCreate(); } @After @@ -84,6 +108,42 @@ public class UtilitiesTestBase { } } + /** + * Helper to get hive sync config + * @param basePath + * @param tableName + * @return + */ + protected static HiveSyncConfig getHiveSyncConfig(String basePath, String tableName) { + HiveSyncConfig hiveSyncConfig = new HiveSyncConfig(); + hiveSyncConfig.jdbcUrl = "jdbc:hive2://127.0.0.1:9999/"; + hiveSyncConfig.hiveUser = ""; + hiveSyncConfig.hivePass = ""; + hiveSyncConfig.databaseName = "testdb1"; + hiveSyncConfig.tableName = tableName; + hiveSyncConfig.basePath = basePath; + hiveSyncConfig.assumeDatePartitioning = false; + hiveSyncConfig.partitionFields = new ImmutableList.Builder().add("datestr").build(); + return hiveSyncConfig; + } + + /** + * Initialize Hive DB + * @throws IOException + */ + private static void clearHiveDb() throws IOException { + HiveConf hiveConf = new HiveConf(); + // Create Dummy hive sync config + HiveSyncConfig hiveSyncConfig = getHiveSyncConfig("/dummy", "dummy"); + hiveConf.addResource(hiveServer.getHiveConf()); + HoodieTableMetaClient.initTableType(dfs.getConf(), hiveSyncConfig.basePath, HoodieTableType.COPY_ON_WRITE, + hiveSyncConfig.tableName, null); + HoodieHiveClient client = new HoodieHiveClient(hiveSyncConfig, hiveConf, dfs); + client.updateHiveSQL("drop database if exists " + hiveSyncConfig.databaseName); + client.updateHiveSQL("create database " + hiveSyncConfig.databaseName); + client.close(); + } + public static class Helpers { // to get hold of resources bundled with jar diff --git a/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/sources/TestDFSSource.java 
b/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/sources/TestDFSSource.java index d460e6ea8..828b78dd8 100644 --- a/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/sources/TestDFSSource.java +++ b/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/sources/TestDFSSource.java @@ -20,16 +20,20 @@ package com.uber.hoodie.utilities.sources; import static org.junit.Assert.assertEquals; +import com.uber.hoodie.AvroConversionUtils; import com.uber.hoodie.common.HoodieTestDataGenerator; import com.uber.hoodie.common.util.TypedProperties; -import com.uber.hoodie.common.util.collection.Pair; import com.uber.hoodie.utilities.UtilitiesTestBase; +import com.uber.hoodie.utilities.deltastreamer.SourceFormatAdapter; import com.uber.hoodie.utilities.schema.FilebasedSchemaProvider; import java.io.IOException; import java.util.Optional; import org.apache.avro.generic.GenericRecord; import org.apache.hadoop.fs.Path; import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SQLContext; import org.junit.After; import org.junit.AfterClass; import org.junit.Before; @@ -37,7 +41,7 @@ import org.junit.BeforeClass; import org.junit.Test; /** - * Basic tests against all subclasses of {@link DFSSource} + * Basic tests against all subclasses of {@link JsonDFSSource} */ public class TestDFSSource extends UtilitiesTestBase { @@ -71,34 +75,47 @@ public class TestDFSSource extends UtilitiesTestBase { TypedProperties props = new TypedProperties(); props.setProperty("hoodie.deltastreamer.source.dfs.root", dfsBasePath + "/jsonFiles"); - JsonDFSSource jsonSource = new JsonDFSSource(props, jsc, schemaProvider); + JsonDFSSource jsonDFSSource = new JsonDFSSource(props, jsc, sparkSession, schemaProvider); + SourceFormatAdapter jsonSource = new SourceFormatAdapter(jsonDFSSource); // 1. 
Extract without any checkpoint => get all the data, respecting sourceLimit - assertEquals(Optional.empty(), jsonSource.fetchNewData(Optional.empty(), Long.MAX_VALUE).getKey()); + assertEquals(Optional.empty(), jsonSource.fetchNewDataInAvroFormat(Optional.empty(), Long.MAX_VALUE).getBatch()); UtilitiesTestBase.Helpers.saveStringsToDFS( Helpers.jsonifyRecords(dataGenerator.generateInserts("000", 100)), dfs, dfsBasePath + "/jsonFiles/1.json"); - assertEquals(Optional.empty(), jsonSource.fetchNewData(Optional.empty(), 10).getKey()); - Pair>, String> fetch1 = jsonSource.fetchNewData(Optional.empty(), 1000000); - assertEquals(100, fetch1.getKey().get().count()); + assertEquals(Optional.empty(), jsonSource.fetchNewDataInAvroFormat(Optional.empty(), 10).getBatch()); + InputBatch> fetch1 = + jsonSource.fetchNewDataInAvroFormat(Optional.empty(), 1000000); + assertEquals(100, fetch1.getBatch().get().count()); + // Test json -> Row format + InputBatch> fetch1AsRows = + jsonSource.fetchNewDataInRowFormat(Optional.empty(), 1000000); + assertEquals(100, fetch1AsRows.getBatch().get().count()); + // Test Avro -> Row format + Dataset fetch1Rows = AvroConversionUtils.createDataFrame(JavaRDD.toRDD(fetch1.getBatch().get()), + schemaProvider.getSourceSchema().toString(), jsonDFSSource.getSparkSession()); + assertEquals(100, fetch1Rows.count()); // 2. Produce new data, extract new data UtilitiesTestBase.Helpers.saveStringsToDFS( Helpers.jsonifyRecords(dataGenerator.generateInserts("001", 10000)), dfs, dfsBasePath + "/jsonFiles/2.json"); - Pair>, String> fetch2 = jsonSource.fetchNewData( - Optional.of(fetch1.getValue()), Long.MAX_VALUE); - assertEquals(10000, fetch2.getKey().get().count()); + InputBatch> fetch2 = jsonSource.fetchNewDataInRowFormat( + Optional.of(fetch1.getCheckpointForNextBatch()), Long.MAX_VALUE); + assertEquals(10000, fetch2.getBatch().get().count()); // 3. 
Extract with previous checkpoint => gives same data back (idempotent) - Pair>, String> fetch3 = jsonSource.fetchNewData( - Optional.of(fetch1.getValue()), Long.MAX_VALUE); - assertEquals(10000, fetch3.getKey().get().count()); - assertEquals(fetch2.getValue(), fetch3.getValue()); + InputBatch> fetch3 = jsonSource.fetchNewDataInRowFormat( + Optional.of(fetch1.getCheckpointForNextBatch()), Long.MAX_VALUE); + assertEquals(10000, fetch3.getBatch().get().count()); + assertEquals(fetch2.getCheckpointForNextBatch(), fetch3.getCheckpointForNextBatch()); + fetch3.getBatch().get().registerTempTable("test_dfs_table"); + Dataset rowDataset = new SQLContext(jsc.sc()).sql("select * from test_dfs_table"); + assertEquals(10000, rowDataset.count()); // 4. Extract with latest checkpoint => no new data returned - Pair>, String> fetch4 = jsonSource.fetchNewData( - Optional.of(fetch2.getValue()), Long.MAX_VALUE); - assertEquals(Optional.empty(), fetch4.getKey()); + InputBatch> fetch4 = jsonSource.fetchNewDataInAvroFormat( + Optional.of(fetch2.getCheckpointForNextBatch()), Long.MAX_VALUE); + assertEquals(Optional.empty(), fetch4.getBatch()); } -} +} \ No newline at end of file diff --git a/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/sources/TestDataSource.java b/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/sources/TestDataSource.java index 57369de33..ad074149d 100644 --- a/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/sources/TestDataSource.java +++ b/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/sources/TestDataSource.java @@ -21,8 +21,6 @@ package com.uber.hoodie.utilities.sources; import com.uber.hoodie.common.HoodieTestDataGenerator; import com.uber.hoodie.common.model.HoodieRecord; import com.uber.hoodie.common.util.TypedProperties; -import com.uber.hoodie.common.util.collection.ImmutablePair; -import com.uber.hoodie.common.util.collection.Pair; import com.uber.hoodie.utilities.schema.SchemaProvider; import java.io.IOException; import 
java.util.ArrayList; @@ -35,11 +33,12 @@ import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.SparkSession; /** * An implementation of {@link Source}, that emits test upserts. */ -public class TestDataSource extends Source { +public class TestDataSource extends AvroSource { private static volatile Logger log = LogManager.getLogger(TestDataSource.class); @@ -54,8 +53,9 @@ public class TestDataSource extends Source { dataGenerator = null; } - public TestDataSource(TypedProperties props, JavaSparkContext sparkContext, SchemaProvider schemaProvider) { - super(props, sparkContext, schemaProvider); + public TestDataSource(TypedProperties props, JavaSparkContext sparkContext, SparkSession sparkSession, + SchemaProvider schemaProvider) { + super(props, sparkContext, sparkSession, schemaProvider); } private GenericRecord toGenericRecord(HoodieRecord hoodieRecord) { @@ -68,14 +68,14 @@ public class TestDataSource extends Source { } @Override - public Pair>, String> fetchNewData(Optional lastCheckpointStr, + protected InputBatch> fetchNewData(Optional lastCheckpointStr, long sourceLimit) { int nextCommitNum = lastCheckpointStr.map(s -> Integer.parseInt(s) + 1).orElse(0); String commitTime = String.format("%05d", nextCommitNum); // No new data. if (sourceLimit <= 0) { - return new ImmutablePair<>(Optional.empty(), commitTime); + return new InputBatch<>(Optional.empty(), commitTime); } // generate `sourceLimit` number of upserts each time. 
@@ -94,6 +94,6 @@ public class TestDataSource extends Source { } JavaRDD avroRDD = sparkContext.parallelize(records, 4); - return new ImmutablePair<>(Optional.of(avroRDD), commitTime); + return new InputBatch<>(Optional.of(avroRDD), commitTime); } } diff --git a/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/sources/TestKafkaSource.java b/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/sources/TestKafkaSource.java index 785e80569..1adcbc371 100644 --- a/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/sources/TestKafkaSource.java +++ b/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/sources/TestKafkaSource.java @@ -18,20 +18,23 @@ package com.uber.hoodie.utilities.sources; -import static com.uber.hoodie.utilities.sources.KafkaSource.CheckpointUtils; import static org.junit.Assert.assertEquals; +import com.uber.hoodie.AvroConversionUtils; import com.uber.hoodie.common.HoodieTestDataGenerator; import com.uber.hoodie.common.util.TypedProperties; -import com.uber.hoodie.common.util.collection.Pair; import com.uber.hoodie.utilities.UtilitiesTestBase; +import com.uber.hoodie.utilities.deltastreamer.SourceFormatAdapter; import com.uber.hoodie.utilities.schema.FilebasedSchemaProvider; +import com.uber.hoodie.utilities.sources.helpers.KafkaOffsetGen.CheckpointUtils; import java.io.IOException; import java.util.HashMap; import java.util.Optional; import kafka.common.TopicAndPartition; import org.apache.avro.generic.GenericRecord; import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; import org.apache.spark.streaming.kafka.KafkaCluster.LeaderOffset; import org.apache.spark.streaming.kafka.KafkaTestUtils; import org.apache.spark.streaming.kafka.OffsetRange; @@ -42,7 +45,7 @@ import org.junit.BeforeClass; import org.junit.Test; /** - * Tests against {@link KafkaSource} + * Tests against {@link AvroKafkaSource} */ public class TestKafkaSource extends UtilitiesTestBase { @@ -89,30 
+92,44 @@ public class TestKafkaSource extends UtilitiesTestBase { props.setProperty("key.serializer", "org.apache.kafka.common.serialization.StringSerializer"); props.setProperty("value.serializer", "org.apache.kafka.common.serialization.StringSerializer"); - Source kafkaSource = new JsonKafkaSource(props, jsc, schemaProvider); + Source jsonSource = new JsonKafkaSource(props, jsc, sparkSession, schemaProvider); + SourceFormatAdapter kafkaSource = new SourceFormatAdapter(jsonSource); // 1. Extract without any checkpoint => get all the data, respecting sourceLimit - assertEquals(Optional.empty(), kafkaSource.fetchNewData(Optional.empty(), Long.MAX_VALUE).getKey()); + assertEquals(Optional.empty(), kafkaSource.fetchNewDataInAvroFormat(Optional.empty(), Long.MAX_VALUE).getBatch()); testUtils.sendMessages(TEST_TOPIC_NAME, Helpers.jsonifyRecords(dataGenerator.generateInserts("000", 1000))); - Pair>, String> fetch1 = kafkaSource.fetchNewData(Optional.empty(), 900); - assertEquals(900, fetch1.getKey().get().count()); + InputBatch> fetch1 = kafkaSource.fetchNewDataInAvroFormat(Optional.empty(), 900); + assertEquals(900, fetch1.getBatch().get().count()); + // Test Avro To DataFrame path + Dataset fetch1AsRows = AvroConversionUtils.createDataFrame(JavaRDD.toRDD(fetch1.getBatch().get()), + schemaProvider.getSourceSchema().toString(), jsonSource.getSparkSession()); + assertEquals(900, fetch1AsRows.count()); // 2. Produce new data, extract new data testUtils.sendMessages(TEST_TOPIC_NAME, Helpers.jsonifyRecords(dataGenerator.generateInserts("001", 1000))); - Pair>, String> fetch2 = kafkaSource.fetchNewData( - Optional.of(fetch1.getValue()), Long.MAX_VALUE); - assertEquals(1100, fetch2.getKey().get().count()); + InputBatch> fetch2 = kafkaSource.fetchNewDataInRowFormat( + Optional.of(fetch1.getCheckpointForNextBatch()), Long.MAX_VALUE); + assertEquals(1100, fetch2.getBatch().get().count()); // 3. 
Extract with previous checkpoint => gives same data back (idempotent) - Pair>, String> fetch3 = kafkaSource.fetchNewData( - Optional.of(fetch1.getValue()), Long.MAX_VALUE); - assertEquals(fetch2.getKey().get().count(), fetch3.getKey().get().count()); - assertEquals(fetch2.getValue(), fetch3.getValue()); + InputBatch> fetch3 = kafkaSource.fetchNewDataInAvroFormat( + Optional.of(fetch1.getCheckpointForNextBatch()), Long.MAX_VALUE); + assertEquals(fetch2.getBatch().get().count(), fetch3.getBatch().get().count()); + assertEquals(fetch2.getCheckpointForNextBatch(), fetch3.getCheckpointForNextBatch()); + // Same using Row API + InputBatch> fetch3AsRows = + kafkaSource.fetchNewDataInRowFormat(Optional.of(fetch1.getCheckpointForNextBatch()), Long.MAX_VALUE); + assertEquals(fetch2.getBatch().get().count(), fetch3AsRows.getBatch().get().count()); + assertEquals(fetch2.getCheckpointForNextBatch(), fetch3AsRows.getCheckpointForNextBatch()); // 4. Extract with latest checkpoint => no new data returned - Pair>, String> fetch4 = kafkaSource.fetchNewData( - Optional.of(fetch2.getValue()), Long.MAX_VALUE); - assertEquals(Optional.empty(), fetch4.getKey()); + InputBatch> fetch4 = kafkaSource.fetchNewDataInAvroFormat( + Optional.of(fetch2.getCheckpointForNextBatch()), Long.MAX_VALUE); + assertEquals(Optional.empty(), fetch4.getBatch()); + // Same using Row API + InputBatch> fetch4AsRows = + kafkaSource.fetchNewDataInRowFormat(Optional.of(fetch2.getCheckpointForNextBatch()), Long.MAX_VALUE); + assertEquals(Optional.empty(), fetch4AsRows.getBatch()); } diff --git a/hoodie-utilities/src/test/resources/delta-streamer-config/sql-transformer.properties b/hoodie-utilities/src/test/resources/delta-streamer-config/sql-transformer.properties new file mode 100644 index 000000000..87038c36b --- /dev/null +++ b/hoodie-utilities/src/test/resources/delta-streamer-config/sql-transformer.properties @@ -0,0 +1,19 @@ +# +# Copyright (c) 2018 Uber Technologies, Inc. 
(hoodie-dev-group@uber.com) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# +# +include=base.properties +hoodie.deltastreamer.transformer.sql=SELECT a.timestamp, a._row_key, a.rider, a.driver, a.begin_lat, a.begin_lon, a.end_lat, a.end_lon, a.fare, CAST(1.0 AS DOUBLE) AS haversine_distance FROM a \ No newline at end of file diff --git a/hoodie-utilities/src/test/resources/delta-streamer-config/target.avsc b/hoodie-utilities/src/test/resources/delta-streamer-config/target.avsc new file mode 100644 index 000000000..d2d410363 --- /dev/null +++ b/hoodie-utilities/src/test/resources/delta-streamer-config/target.avsc @@ -0,0 +1,37 @@ +{ + "type" : "record", + "name" : "triprec", + "fields" : [ + { + "name" : "timestamp", + "type" : "double" + }, { + "name" : "_row_key", + "type" : "string" + }, { + "name" : "rider", + "type" : "string" + }, { + "name" : "driver", + "type" : "string" + }, { + "name" : "begin_lat", + "type" : "double" + }, { + "name" : "begin_lon", + "type" : "double" + }, { + "name" : "end_lat", + "type" : "double" + }, { + "name" : "end_lon", + "type" : "double" + }, { + "name" : "fare", + "type" : "double" + }, { + "name" : "haversine_distance", + "type" : "double" + }] +} + diff --git a/packaging/hoodie-hadoop-mr-bundle/pom.xml b/packaging/hoodie-hadoop-mr-bundle/pom.xml index 79a4a757a..4e2e72d47 100644 --- a/packaging/hoodie-hadoop-mr-bundle/pom.xml +++ b/packaging/hoodie-hadoop-mr-bundle/pom.xml @@ -66,6 +66,48 @@ hadoop-auth + + ${hive.groupid} + 
hive-jdbc + ${hive.version} + + + commons-logging + commons-logging + + + + + ${hive.groupid} + hive-exec + ${hive.version} + + + ${hive.groupid} + hive-service + ${hive.version} + + + ${hive.groupid} + hive-shims + ${hive.version} + + + ${hive.groupid} + hive-serde + ${hive.version} + + + ${hive.groupid} + hive-metastore + ${hive.version} + + + ${hive.groupid} + hive-common + ${hive.version} + + org.apache.hadoop hadoop-hdfs @@ -182,116 +224,4 @@ true - - - - hive12 - - - !hive11 - - - - - - - - ${hive12.groupid} - hive-jdbc - ${hive12.version} - - - commons-logging - commons-logging - - - - - ${hive12.groupid} - hive-exec - ${hive12.version} - - - ${hive12.groupid} - hive-service - ${hive12.version} - - - ${hive12.groupid} - hive-shims - ${hive12.version} - - - ${hive12.groupid} - hive-serde - ${hive12.version} - - - ${hive12.groupid} - hive-metastore - ${hive12.version} - - - ${hive12.groupid} - hive-common - ${hive12.version} - - - - - hive11 - - - hive11 - - - - .hive11 - - - - ${hive11.groupid} - hive-service - ${hive11.version} - - - ${hive11.groupid} - hive-shims - ${hive11.version} - - - ${hive11.groupid} - hive-jdbc - ${hive11.version} - - - commons-logging - commons-logging - - - - - ${hive11.groupid} - hive-serde - ${hive11.version} - - - ${hive11.groupid} - hive-metastore - ${hive11.version} - - - ${hive11.groupid} - hive-common - ${hive11.version} - - - ${hive11.groupid} - hive-exec - ${hive11.version} - - - - - diff --git a/packaging/hoodie-hive-bundle/pom.xml b/packaging/hoodie-hive-bundle/pom.xml index eafad2592..ee43709fa 100644 --- a/packaging/hoodie-hive-bundle/pom.xml +++ b/packaging/hoodie-hive-bundle/pom.xml @@ -44,6 +44,26 @@ org.apache.hadoop hadoop-auth + + ${hive.groupid} + hive-service + ${hive.version} + + + ${hive.groupid} + hive-jdbc + ${hive.version} + + + ${hive.groupid} + hive-metastore + ${hive.version} + + + ${hive.groupid} + hive-common + ${hive.version} + com.google.guava guava @@ -195,73 +215,4 @@ true - - - - hive12 - - - 
!hive11 - - - - - - - - ${hive12.groupid} - hive-service - ${hive12.version} - - - ${hive12.groupid} - hive-jdbc - ${hive12.version} - - - ${hive12.groupid} - hive-metastore - ${hive12.version} - - - ${hive12.groupid} - hive-common - ${hive12.version} - - - - - hive11 - - - hive11 - - - - .hive11 - - - - org.apache.hive - hive-service - ${hive11.version} - - - org.apache.hive - hive-jdbc - ${hive11.version} - - - org.apache.hive - hive-metastore - ${hive11.version} - - - org.apache.hive - hive-common - ${hive11.version} - - - - diff --git a/packaging/hoodie-spark-bundle/pom.xml b/packaging/hoodie-spark-bundle/pom.xml index 95b4aedd4..5c932e59c 100644 --- a/packaging/hoodie-spark-bundle/pom.xml +++ b/packaging/hoodie-spark-bundle/pom.xml @@ -239,6 +239,26 @@ org.apache.avro avro + + ${hive.groupid} + hive-service + ${hive.version} + + + ${hive.groupid} + hive-jdbc + ${hive.version} + + + ${hive.groupid} + hive-metastore + ${hive.version} + + + ${hive.groupid} + hive-common + ${hive.version} + org.apache.commons commons-configuration2 @@ -269,74 +289,5 @@ ${project.version} - - - - hive12 - - - !hive11 - - - - - - - - ${hive12.groupid} - hive-service - ${hive12.version} - - - ${hive12.groupid} - hive-jdbc - ${hive12.version} - - - ${hive12.groupid} - hive-metastore - ${hive12.version} - - - ${hive12.groupid} - hive-common - ${hive12.version} - - - - - hive11 - - - hive11 - - - - .hive11 - - - - ${hive11.groupid} - hive-service - ${hive11.version} - - - ${hive11.groupid} - hive-jdbc - ${hive11.version} - - - ${hive11.groupid} - hive-metastore - ${hive11.version} - - - ${hive11.groupid} - hive-common - ${hive11.version} - - - - diff --git a/pom.xml b/pom.xml index 88196084c..a0ec312c5 100644 --- a/pom.xml +++ b/pom.xml @@ -129,10 +129,8 @@ 1.2.17 2.9.9 2.7.3 - org.apache.hive - 1.2.1 - org.apache.hive - 1.1.1 + org.apache.hive + 1.2.1 3.1.1 2.1.0 1.7.7 @@ -589,6 +587,11 @@ commons-dbcp 1.4 + + commons-pool + commons-pool + 1.4 + org.apache.httpcomponents httpcore @@ 
-656,7 +659,48 @@ jackson-mapper-asl 1.9.13 - + + ${hive.groupid} + hive-service + ${hive.version} + provided + + + ${hive.groupid} + hive-shims + ${hive.version} + provided + + + ${hive.groupid} + hive-jdbc + ${hive.version} + provided + + + ${hive.groupid} + hive-serde + ${hive.version} + provided + + + ${hive.groupid} + hive-metastore + ${hive.version} + provided + + + ${hive.groupid} + hive-common + ${hive.version} + provided + + + ${hive.groupid} + hive-exec + ${hive.version} + provided + org.apache.hadoop hadoop-hdfs @@ -708,109 +752,6 @@ - - hive12 - - - !hive11 - - - - - ${hive12.groupid} - hive-service - ${hive12.version} - provided - - - ${hive12.groupid} - hive-shims - ${hive12.version} - provided - - - ${hive12.groupid} - hive-jdbc - ${hive12.version} - provided - - - ${hive12.groupid} - hive-serde - ${hive12.version} - provided - - - ${hive12.groupid} - hive-metastore - ${hive12.version} - provided - - - ${hive12.groupid} - hive-common - ${hive12.version} - provided - - - ${hive12.groupid} - hive-exec - ${hive12.version} - provided - - - - - hive11 - - - hive11 - - - - - org.apache.hive - hive-service - ${hive11.version} - - - org.apache.hive - hive-shims - ${hive11.version} - provided - - - org.apache.hive - hive-jdbc - ${hive11.version} - provided - - - org.apache.hive - hive-serde - ${hive11.version} - provided - - - org.apache.hive - hive-metastore - ${hive11.version} - provided - - - org.apache.hive - hive-common - ${hive11.version} - provided - - - org.apache.hive - hive-exec - ${hive11.version} - provided - - - release