Split insert() into insert() & bulkInsert() (#69)

- Behaviour change for existing insert() users - Made the current insert() implementation, as something to use for bulkInsert() - Normal inserts now share a lot of code with upsert, which provides benefits like small file handling - Refactored/Cleaned up code in HoodieWriteClient for reuse - Added a unit test, switching few tests to call bulkInsert() and few to call insert()
2017-01-27 10:51:00 -08:00
parent 54409b07ea
commit 8e72ed69b4
6 changed files with 309 additions and 159 deletions
--- a/hoodie-client/src/main/java/com/uber/hoodie/HoodieWriteClient.java
+++ b/hoodie-client/src/main/java/com/uber/hoodie/HoodieWriteClient.java
@@ -31,7 +31,7 @@ import com.uber.hoodie.exception.HoodieIOException;
 import com.uber.hoodie.exception.HoodieInsertException;
 import com.uber.hoodie.exception.HoodieRollbackException;
 import com.uber.hoodie.exception.HoodieUpsertException;
-import com.uber.hoodie.func.InsertMapFunction;
+import com.uber.hoodie.func.BulkInsertMapFunction;
 import com.uber.hoodie.index.HoodieIndex;
 import com.uber.hoodie.io.HoodieCleaner;
 import com.uber.hoodie.io.HoodieCommitArchiveLog;
@@ -72,12 +72,11 @@ import scala.Option;
 import scala.Tuple2;

 /**
- * Hoodie Write Client helps you build datasets on HDFS [insert()] and then
- * perform efficient mutations on a HDFS dataset [upsert()]
- *
- * Note that, at any given time, there can only be one Spark job performing
- * these operatons on a Hoodie dataset.
+ * Hoodie Write Client helps you build datasets on HDFS [insert()] and then perform efficient
+ * mutations on a HDFS dataset [upsert()]
 *
+ * Note that, at any given time, there can only be one Spark job performing these operatons on a
+ * Hoodie dataset.
 */
 public class HoodieWriteClient<T extends HoodieRecordPayload> implements Serializable {

@@ -142,72 +141,18 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
     */
    public JavaRDD<WriteStatus> upsert(JavaRDD<HoodieRecord<T>> records, final String commitTime) {
        final HoodieTableMetadata metadata =
-            new HoodieTableMetadata(fs, config.getBasePath(), config.getTableName());
+                new HoodieTableMetadata(fs, config.getBasePath(), config.getTableName());
        writeContext = metrics.getCommitCtx();
-        final HoodieTable table =
-            HoodieTable.getHoodieTable(metadata.getTableType(), commitTime, config, metadata);

        try {
            // De-dupe/merge if needed
            JavaRDD<HoodieRecord<T>> dedupedRecords =
-                combineOnCondition(config.shouldCombineBeforeUpsert(), records,
-                    config.getUpsertShuffleParallelism());
+                    combineOnCondition(config.shouldCombineBeforeUpsert(), records,
+                            config.getUpsertShuffleParallelism());

            // perform index loop up to get existing location of records
            JavaRDD<HoodieRecord<T>> taggedRecords = index.tagLocation(dedupedRecords, metadata);
-
-            // Cache the tagged records, so we don't end up computing both
-            taggedRecords.persist(StorageLevel.MEMORY_AND_DISK_SER());
-
-
-            WorkloadProfile profile = null;
-            if (table.isWorkloadProfileNeeded()) {
-                profile = new WorkloadProfile(taggedRecords);
-                logger.info("Workload profile :" + profile);
-            }
-
-            // obtain the upsert partitioner, and the run the tagger records through that & get a partitioned RDD.
-            final Partitioner upsertPartitioner = table.getUpsertPartitioner(profile);
-            JavaRDD<HoodieRecord<T>> partitionedRecords = taggedRecords.mapToPair(
-                new PairFunction<HoodieRecord<T>, Tuple2<HoodieKey, Option<HoodieRecordLocation>>, HoodieRecord<T>>() {
-                    @Override
-                    public Tuple2<Tuple2<HoodieKey, Option<HoodieRecordLocation>>, HoodieRecord<T>> call(
-                        HoodieRecord<T> record) throws Exception {
-                        return new Tuple2<>(new Tuple2<>(record.getKey(),
-                            Option.apply(record.getCurrentLocation())), record);
-                    }
-                }).partitionBy(upsertPartitioner).map(
-                new Function<Tuple2<Tuple2<HoodieKey, Option<HoodieRecordLocation>>, HoodieRecord<T>>, HoodieRecord<T>>() {
-                    @Override
-                    public HoodieRecord<T> call(
-                        Tuple2<Tuple2<HoodieKey, Option<HoodieRecordLocation>>, HoodieRecord<T>> tuple)
-                        throws Exception {
-                        return tuple._2();
-                    }
-                });
-
-
-            // Perform the actual writing.
-            JavaRDD<WriteStatus> upsertStatusRDD = partitionedRecords.mapPartitionsWithIndex(
-                new Function2<Integer, Iterator<HoodieRecord<T>>, Iterator<List<WriteStatus>>>() {
-                    @Override
-                    public Iterator<List<WriteStatus>> call(Integer partition,
-                        Iterator<HoodieRecord<T>> recordItr) throws Exception {
-                        return table.handleUpsertPartition(partition, recordItr, upsertPartitioner);
-                    }
-                }, true).flatMap(new FlatMapFunction<List<WriteStatus>, WriteStatus>() {
-                @Override
-                public Iterable<WriteStatus> call(List<WriteStatus> writeStatuses)
-                    throws Exception {
-                    return writeStatuses;
-                }
-            });
-
-            // Update the index back.
-            JavaRDD<WriteStatus> resultRDD = index.updateLocation(upsertStatusRDD, metadata);
-            resultRDD = resultRDD.persist(config.getWriteStatusStorageLevel());
-            commitOnAutoCommit(commitTime, resultRDD);
-            return resultRDD;
+            return upsertRecordsInternal(taggedRecords, commitTime, metadata, true);
        } catch (Throwable e) {
            if (e instanceof HoodieUpsertException) {
                throw (HoodieUpsertException) e;
@@ -216,8 +161,38 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
        }
    }

+    /**
+     * Inserts the given HoodieRecords, into the table. This API is intended to be used for normal
+     * writes.
+     *
+     * This implementation skips the index check & is able to leverage benefits such as
+     * small file handling/blocking alignment, as with upsert(), by profiling the workload
+     *
+     * @param records    HoodieRecords to insert
+     * @param commitTime Commit Time handle
+     * @return JavaRDD[WriteStatus] - RDD of WriteStatus to inspect errors and counts
+     */
+    public JavaRDD<WriteStatus> insert(JavaRDD<HoodieRecord<T>> records, final String commitTime) {
+        final HoodieTableMetadata metadata =
+                new HoodieTableMetadata(fs, config.getBasePath(), config.getTableName());
+        writeContext = metrics.getCommitCtx();
+        try {
+            // De-dupe/merge if needed
+            JavaRDD<HoodieRecord<T>> dedupedRecords =
+                    combineOnCondition(config.shouldCombineBeforeInsert(), records,
+                            config.getInsertShuffleParallelism());
+
+            return upsertRecordsInternal(dedupedRecords, commitTime, metadata, false);
+        } catch (Throwable e) {
+            if (e instanceof HoodieInsertException) {
+                throw e;
+            }
+            throw new HoodieInsertException("Failed to insert for commit time " + commitTime, e);
+        }
+    }
+
    private void commitOnAutoCommit(String commitTime, JavaRDD<WriteStatus> resultRDD) {
-        if(config.shouldAutoCommit()) {
+        if (config.shouldAutoCommit()) {
            logger.info("Auto commit enabled: Committing " + commitTime);
            boolean commitResult = commit(commitTime, resultRDD);
            if (!commitResult) {
@@ -229,64 +204,146 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
    }

    private JavaRDD<HoodieRecord<T>> combineOnCondition(boolean condition,
-        JavaRDD<HoodieRecord<T>> records, int parallelism) {
-        if(condition) {
+                                                        JavaRDD<HoodieRecord<T>> records, int parallelism) {
+        if (condition) {
            return deduplicateRecords(records, parallelism);
        }
        return records;
    }

+    private JavaRDD<HoodieRecord<T>> partition(JavaRDD<HoodieRecord<T>> dedupedRecords, Partitioner partitioner) {
+        return dedupedRecords.mapToPair(
+                new PairFunction<HoodieRecord<T>, Tuple2<HoodieKey, Option<HoodieRecordLocation>>, HoodieRecord<T>>() {
+                    @Override
+                    public Tuple2<Tuple2<HoodieKey, Option<HoodieRecordLocation>>, HoodieRecord<T>> call(
+                            HoodieRecord<T> record) throws Exception {
+                        return new Tuple2<>(new Tuple2<>(record.getKey(),
+                                Option.apply(record.getCurrentLocation())), record);
+                    }
+                }).partitionBy(partitioner).map(
+                new Function<Tuple2<Tuple2<HoodieKey, Option<HoodieRecordLocation>>, HoodieRecord<T>>, HoodieRecord<T>>() {
+                    @Override
+                    public HoodieRecord<T> call(
+                            Tuple2<Tuple2<HoodieKey, Option<HoodieRecordLocation>>, HoodieRecord<T>> tuple)
+                            throws Exception {
+                        return tuple._2();
+                    }
+                });
+    }
+
+    private Partitioner getPartitioner(HoodieTable table, boolean isUpsert, WorkloadProfile profile) {
+        if (isUpsert) {
+            return table.getUpsertPartitioner(profile);
+        } else {
+            return table.getInsertPartitioner(profile);
+        }
+    }
+
+    private JavaRDD<WriteStatus> updateIndexAndCommitIfNeeded(JavaRDD<WriteStatus> writeStatusRDD,
+                                                              HoodieTableMetadata metadata,
+                                                              String commitTime) {
+        // Update the index back
+        JavaRDD<WriteStatus> statuses = index.updateLocation(writeStatusRDD, metadata);
+        // Trigger the insert and collect statuses
+        statuses = statuses.persist(config.getWriteStatusStorageLevel());
+        commitOnAutoCommit(commitTime, statuses);
+        return statuses;
+    }
+
+    private JavaRDD<WriteStatus> upsertRecordsInternal(JavaRDD<HoodieRecord<T>> preppedRecords,
+                                                       String commitTime,
+                                                       HoodieTableMetadata metadata,
+                                                       final boolean isUpsert) {
+
+        final HoodieTable table =
+                HoodieTable.getHoodieTable(metadata.getTableType(), commitTime, config, metadata);
+
+        // Cache the tagged records, so we don't end up computing both
+        preppedRecords.persist(StorageLevel.MEMORY_AND_DISK_SER());
+
+        WorkloadProfile profile = null;
+        if (table.isWorkloadProfileNeeded()) {
+            profile = new WorkloadProfile(preppedRecords);
+            logger.info("Workload profile :" + profile);
+        }
+
+        // partition using the insert partitioner
+        final Partitioner partitioner = getPartitioner(table, isUpsert, profile);
+        JavaRDD<HoodieRecord<T>> partitionedRecords = partition(preppedRecords, partitioner);
+        JavaRDD<WriteStatus> writeStatusRDD = partitionedRecords.mapPartitionsWithIndex(
+                new Function2<Integer, Iterator<HoodieRecord<T>>, Iterator<List<WriteStatus>>>() {
+                    @Override
+                    public Iterator<List<WriteStatus>> call(Integer partition,
+                                                            Iterator<HoodieRecord<T>> recordItr) throws Exception {
+                        if (isUpsert) {
+                            return table.handleUpsertPartition(partition, recordItr, partitioner);
+                        } else {
+                            return table.handleInsertPartition(partition, recordItr, partitioner);
+                        }
+                    }
+                }, true).flatMap(new FlatMapFunction<List<WriteStatus>, WriteStatus>() {
+            @Override
+            public Iterable<WriteStatus> call(List<WriteStatus> writeStatuses)
+                    throws Exception {
+                return writeStatuses;
+            }
+        });
+
+        return updateIndexAndCommitIfNeeded(writeStatusRDD, metadata, commitTime);
+    }
+
+
    /**
-     * Loads the given HoodieRecords, as inserts into the table.
-     * (This implementation uses sortBy and attempts to control the numbers of files with less memory)
+     * Loads the given HoodieRecords, as inserts into the table. This is suitable for doing big bulk
+     * loads into a Hoodie table for the very first time (e.g: converting an existing dataset to
+     * Hoodie).
     *
-     * @param records HoodieRecords to insert
+     * This implementation uses sortBy (which does range partitioning based on reservoir sampling) &
+     * attempts to control the numbers of files with less memory compared to the {@link
+     * HoodieWriteClient#insert(JavaRDD, String)}
+     *
+     * @param records    HoodieRecords to insert
     * @param commitTime Commit Time handle
     * @return JavaRDD[WriteStatus] - RDD of WriteStatus to inspect errors and counts
-     *
     */
-    public JavaRDD<WriteStatus> insert(JavaRDD<HoodieRecord<T>> records, final String commitTime) {
+    public JavaRDD<WriteStatus> bulkInsert(JavaRDD<HoodieRecord<T>> records, final String commitTime) {
        final HoodieTableMetadata metadata =
-            new HoodieTableMetadata(fs, config.getBasePath(), config.getTableName());
+                new HoodieTableMetadata(fs, config.getBasePath(), config.getTableName());
        writeContext = metrics.getCommitCtx();
        try {
            // De-dupe/merge if needed
            JavaRDD<HoodieRecord<T>> dedupedRecords =
-                combineOnCondition(config.shouldCombineBeforeInsert(), records,
-                    config.getInsertShuffleParallelism());
+                    combineOnCondition(config.shouldCombineBeforeInsert(), records,
+                            config.getInsertShuffleParallelism());

            // Now, sort the records and line them up nicely for loading.
            JavaRDD<HoodieRecord<T>> sortedRecords =
-                dedupedRecords.sortBy(new Function<HoodieRecord<T>, String>() {
-                    @Override
-                    public String call(HoodieRecord<T> record) {
-                        // Let's use "partitionPath + key" as the sort key. Spark, will ensure
-                        // the records split evenly across RDD partitions, such that small partitions fit
-                        // into 1 RDD partition, while big ones spread evenly across multiple RDD partitions
-                        return String
-                            .format("%s+%s", record.getPartitionPath(), record.getRecordKey());
-                    }
-                }, true, config.getInsertShuffleParallelism());
+                    dedupedRecords.sortBy(new Function<HoodieRecord<T>, String>() {
+                        @Override
+                        public String call(HoodieRecord<T> record) {
+                            // Let's use "partitionPath + key" as the sort key. Spark, will ensure
+                            // the records split evenly across RDD partitions, such that small partitions fit
+                            // into 1 RDD partition, while big ones spread evenly across multiple RDD partitions
+                            return String
+                                    .format("%s+%s", record.getPartitionPath(), record.getRecordKey());
+                        }
+                    }, true, config.getInsertShuffleParallelism());
            JavaRDD<WriteStatus> writeStatusRDD = sortedRecords
-                .mapPartitionsWithIndex(new InsertMapFunction<T>(commitTime, config, metadata),
-                    true).flatMap(new FlatMapFunction<List<WriteStatus>, WriteStatus>() {
-                    @Override
-                    public Iterable<WriteStatus> call(List<WriteStatus> writeStatuses)
-                        throws Exception {
-                        return writeStatuses;
-                    }
-                });
-            // Update the index back
-            JavaRDD<WriteStatus> statuses = index.updateLocation(writeStatusRDD, metadata);
-            // Trigger the insert and collect statuses
-            statuses = statuses.persist(config.getWriteStatusStorageLevel());
-            commitOnAutoCommit(commitTime, statuses);
-            return statuses;
+                    .mapPartitionsWithIndex(new BulkInsertMapFunction<T>(commitTime, config, metadata),
+                            true).flatMap(new FlatMapFunction<List<WriteStatus>, WriteStatus>() {
+                        @Override
+                        public Iterable<WriteStatus> call(List<WriteStatus> writeStatuses)
+                                throws Exception {
+                            return writeStatuses;
+                        }
+                    });
+
+            return updateIndexAndCommitIfNeeded(writeStatusRDD, metadata, commitTime);
        } catch (Throwable e) {
            if (e instanceof HoodieInsertException) {
                throw e;
            }
-            throw new HoodieInsertException("Failed to insert for commit time " + commitTime, e);
+            throw new HoodieInsertException("Failed to bulk insert for commit time " + commitTime, e);
        }
    }

@@ -296,7 +353,7 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
    public boolean commit(String commitTime, JavaRDD<WriteStatus> writeStatuses) {
        logger.info("Comitting " + commitTime);
        Path commitFile =
-            new Path(config.getBasePath() + "/.hoodie/" + FSUtils.makeCommitFileName(commitTime));
+                new Path(config.getBasePath() + "/.hoodie/" + FSUtils.makeCommitFileName(commitTime));
        try {

            if (fs.exists(commitFile)) {
@@ -304,13 +361,13 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
            }

            List<Tuple2<String, HoodieWriteStat>> stats =
-                writeStatuses.mapToPair(new PairFunction<WriteStatus, String, HoodieWriteStat>() {
-                    @Override
-                    public Tuple2<String, HoodieWriteStat> call(WriteStatus writeStatus)
-                        throws Exception {
-                        return new Tuple2<>(writeStatus.getPartitionPath(), writeStatus.getStat());
-                    }
-                }).collect();
+                    writeStatuses.mapToPair(new PairFunction<WriteStatus, String, HoodieWriteStat>() {
+                        @Override
+                        public Tuple2<String, HoodieWriteStat> call(WriteStatus writeStatus)
+                                throws Exception {
+                            return new Tuple2<>(writeStatus.getPartitionPath(), writeStatus.getStat());
+                        }
+                    }).collect();

            HoodieCommitMetadata metadata = new HoodieCommitMetadata();
            for (Tuple2<String, HoodieWriteStat> stat : stats) {
@@ -319,10 +376,10 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali

            // open a new file and write the commit metadata in
            Path inflightCommitFile = new Path(config.getBasePath() + "/.hoodie/" + FSUtils
-                .makeInflightCommitFileName(commitTime));
+                    .makeInflightCommitFileName(commitTime));
            FSDataOutputStream fsout = fs.create(inflightCommitFile, true);
            fsout.writeBytes(new String(metadata.toJsonString().getBytes(StandardCharsets.UTF_8),
-                StandardCharsets.UTF_8));
+                    StandardCharsets.UTF_8));
            fsout.close();

            boolean success = fs.rename(inflightCommitFile, commitFile);
@@ -331,10 +388,10 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
                archiveLog.archiveIfRequired();
                // Call clean to cleanup if there is anything to cleanup after the commit,
                clean();
-                if(writeContext != null) {
+                if (writeContext != null) {
                    long durationInMs = metrics.getDurationInMs(writeContext.stop());
                    metrics.updateCommitMetrics(FORMATTER.parse(commitTime).getTime(), durationInMs,
-                        metadata);
+                            metadata);
                    writeContext = null;
                }
            }
@@ -342,21 +399,18 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
            return success;
        } catch (IOException e) {
            throw new HoodieCommitException(
-                "Failed to commit " + config.getBasePath() + " at time " + commitTime, e);
+                    "Failed to commit " + config.getBasePath() + " at time " + commitTime, e);
        } catch (ParseException e) {
            throw new HoodieCommitException(
-                "Commit time is not of valid format.Failed to commit " + config.getBasePath()
-                    + " at time " + commitTime, e);
+                    "Commit time is not of valid format.Failed to commit " + config.getBasePath()
+                            + " at time " + commitTime, e);
        }
    }

    /**
-     * Rollback the (inflight/committed) record changes with the given commit time.
-     * Three steps:
-     * (0) Obtain the commit or rollback file
-     * (1) clean indexing data,
-     * (2) clean new generated parquet files.
-     * (3) Finally delete .commit or .inflight file,
+     * Rollback the (inflight/committed) record changes with the given commit time. Three steps: (0)
+     * Obtain the commit or rollback file (1) clean indexing data, (2) clean new generated parquet
+     * files. (3) Finally delete .commit or .inflight file,
     */
    public boolean rollback(final String commitTime) throws HoodieRollbackException {

@@ -450,7 +504,7 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
    /**
     * Clean up any stale/old files/data lying around (either on file storage or index storage)
     */
-    private void clean() throws HoodieIOException  {
+    private void clean() throws HoodieIOException {
        try {
            logger.info("Cleaner started");
            final Timer.Context context = metrics.getCleanCtx();
@@ -459,26 +513,26 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
            // shuffle to distribute cleaning work across partitions evenly
            Collections.shuffle(partitionsToClean);
            logger.info("Partitions to clean up : " + partitionsToClean + ", with policy " + config.getCleanerPolicy());
-            if(partitionsToClean.isEmpty()) {
+            if (partitionsToClean.isEmpty()) {
                logger.info("Nothing to clean here mom. It is already clean");
                return;
            }

            int cleanerParallelism = Math.min(partitionsToClean.size(), config.getCleanerParallelism());
            int numFilesDeleted = jsc.parallelize(partitionsToClean, cleanerParallelism)
-                .map(new Function<String, Integer>() {
-                    @Override
-                    public Integer call(String partitionPathToClean) throws Exception {
-                        FileSystem fs = FSUtils.getFs();
-                        HoodieCleaner cleaner = new HoodieCleaner(metadata, config, fs);
-                        return cleaner.clean(partitionPathToClean);
-                    }
-                }).reduce(new Function2<Integer, Integer, Integer>() {
-                    @Override
-                    public Integer call(Integer v1, Integer v2) throws Exception {
-                        return v1 + v2;
-                    }
-                });
+                    .map(new Function<String, Integer>() {
+                        @Override
+                        public Integer call(String partitionPathToClean) throws Exception {
+                            FileSystem fs = FSUtils.getFs();
+                            HoodieCleaner cleaner = new HoodieCleaner(metadata, config, fs);
+                            return cleaner.clean(partitionPathToClean);
+                        }
+                    }).reduce(new Function2<Integer, Integer, Integer>() {
+                        @Override
+                        public Integer call(Integer v1, Integer v2) throws Exception {
+                            return v1 + v2;
+                        }
+                    });
            logger.info("Cleaned " + numFilesDeleted + " files");
            // Emit metrics (duration, numFilesDeleted) if needed
            if (context != null) {
@@ -504,18 +558,18 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
        logger.info("Generate a new commit time " + commitTime);
        // Create the in-flight commit file
        Path inflightCommitFilePath = new Path(
-            config.getBasePath() + "/.hoodie/" + FSUtils.makeInflightCommitFileName(commitTime));
+                config.getBasePath() + "/.hoodie/" + FSUtils.makeInflightCommitFileName(commitTime));
        try {
            if (fs.createNewFile(inflightCommitFilePath)) {
                logger.info("Create an inflight commit file " + inflightCommitFilePath);
                return;
            }
            throw new HoodieCommitException(
-                "Failed to create the inflight commit file " + inflightCommitFilePath);
+                    "Failed to create the inflight commit file " + inflightCommitFilePath);
        } catch (IOException e) {
            // handled below
            throw new HoodieCommitException(
-                "Failed to create the inflight commit file " + inflightCommitFilePath, e);
+                    "Failed to create the inflight commit file " + inflightCommitFilePath, e);
        }
    }

@@ -552,7 +606,6 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali

    /**
     * Cleanup all inflight commits
-     * @throws IOException
     */
    private void rollbackInflightCommits() {
        final HoodieTableMetadata metadata = new HoodieTableMetadata(fs, config.getBasePath(), config.getTableName());