[MINOR] Fix typos in Spark client related classes (#4781)
This commit is contained in:
@@ -65,7 +65,7 @@ public class HoodieReadClient<T extends HoodieRecordPayload<T>> implements Seria
|
||||
|
||||
/**
|
||||
* TODO: We need to persist the index type into hoodie.properties and be able to access the index just with a simple
|
||||
* basepath pointing to the table. Until, then just always assume a BloomIndex
|
||||
* base path pointing to the table. Until then, just always assume a BloomIndex
|
||||
*/
|
||||
private final transient HoodieIndex<?, ?> index;
|
||||
private HoodieTable<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> hoodieTable;
|
||||
|
||||
@@ -504,7 +504,7 @@ public class SparkRDDWriteClient<T extends HoodieRecordPayload> extends
|
||||
@Override
|
||||
protected void preCommit(HoodieInstant inflightInstant, HoodieCommitMetadata metadata) {
|
||||
// Create a Hoodie table after startTxn which encapsulated the commits and files visible.
|
||||
// Important to create this after the lock to ensure latest commits show up in the timeline without need for reload
|
||||
// Important to create this after the lock to ensure the latest commits show up in the timeline without need for reload
|
||||
HoodieTable table = createTable(config, hadoopConf);
|
||||
TransactionUtils.resolveWriteConflictIfAny(table, this.txnManager.getCurrentTransactionOwner(),
|
||||
Option.of(metadata), config, txnManager.getLastCompletedTransactionOwner());
|
||||
|
||||
@@ -87,7 +87,7 @@ public class SparkSizeBasedClusteringPlanStrategy<T extends HoodieRecordPayload<
|
||||
|
||||
// Add to the current file-group
|
||||
currentGroup.add(currentSlice);
|
||||
// assume each filegroup size is ~= parquet.max.file.size
|
||||
// assume each file group size is ~= parquet.max.file.size
|
||||
totalSizeSoFar += currentSlice.getBaseFile().isPresent() ? currentSlice.getBaseFile().get().getFileSize() : writeConfig.getParquetMaxFileSize();
|
||||
}
|
||||
|
||||
@@ -118,7 +118,7 @@ public class SparkSizeBasedClusteringPlanStrategy<T extends HoodieRecordPayload<
|
||||
@Override
|
||||
protected Stream<FileSlice> getFileSlicesEligibleForClustering(final String partition) {
|
||||
return super.getFileSlicesEligibleForClustering(partition)
|
||||
// Only files that have basefile size smaller than small file size are eligible.
|
||||
// Only files that have base file size smaller than small file size are eligible.
|
||||
.filter(slice -> slice.getBaseFile().map(HoodieBaseFile::getFileSize).orElse(0L) < getWriteConfig().getClusteringSmallFileLimit());
|
||||
}
|
||||
|
||||
|
||||
@@ -37,7 +37,7 @@ import java.util.Set;
|
||||
|
||||
/**
|
||||
* Update strategy based on following.
|
||||
* if some file group have update record, throw exception
|
||||
* if some file groups have update record, throw exception
|
||||
*/
|
||||
public class SparkRejectUpdateStrategy<T extends HoodieRecordPayload<T>> extends UpdateStrategy<T, JavaRDD<HoodieRecord<T>>> {
|
||||
private static final Logger LOG = LogManager.getLogger(SparkRejectUpdateStrategy.class);
|
||||
|
||||
@@ -31,13 +31,13 @@ import org.apache.hudi.table.HoodieSparkTable;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
import org.apache.hudi.table.action.HoodieWriteMetadata;
|
||||
import org.apache.hudi.table.action.commit.BaseSparkCommitActionExecutor;
|
||||
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Row;
|
||||
import org.apache.spark.sql.SQLContext;
|
||||
import scala.collection.JavaConverters;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
@@ -47,6 +47,8 @@ import java.util.concurrent.CompletableFuture;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import scala.collection.JavaConverters;
|
||||
|
||||
/**
|
||||
* Spark validator utils to verify and run any precommit validators configured.
|
||||
*/
|
||||
@@ -97,7 +99,7 @@ public class SparkValidatorUtils {
|
||||
}
|
||||
|
||||
/**
|
||||
* Run validators in a separate threadpool for parallelism. Each of validator can submit a distributed spark job if needed.
|
||||
* Run validators in a separate thread pool for parallelism. Each validator can submit a distributed spark job if needed.
|
||||
*/
|
||||
private static CompletableFuture<Boolean> runValidatorAsync(SparkPreCommitValidator validator, HoodieWriteMetadata writeMetadata,
|
||||
Dataset<Row> beforeState, Dataset<Row> afterState, String instantTime) {
|
||||
|
||||
@@ -34,11 +34,11 @@ import org.apache.spark.sql.Row;
|
||||
import org.apache.spark.sql.SQLContext;
|
||||
|
||||
/**
|
||||
* Validator to run sql query and compare table state
|
||||
* Validator to run sql query and compare table state
|
||||
* 1) before new commit started.
|
||||
* 2) current inflight commit (if successful).
|
||||
*
|
||||
* Expects query results dont match.
|
||||
* <p>
|
||||
* Expects query results do not match.
|
||||
*/
|
||||
public class SqlQueryInequalityPreCommitValidator<T extends HoodieRecordPayload, I, K, O extends JavaRDD<WriteStatus>> extends SqlQueryPreCommitValidator<T, I, K, O> {
|
||||
private static final Logger LOG = LogManager.getLogger(SqlQueryInequalityPreCommitValidator.class);
|
||||
@@ -66,7 +66,7 @@ public class SqlQueryInequalityPreCommitValidator<T extends HoodieRecordPayload,
|
||||
LOG.info("Completed Inequality Validation, datasets equal? " + areDatasetsEqual);
|
||||
if (areDatasetsEqual) {
|
||||
LOG.error("query validation failed. See stdout for sample query results. Query: " + query);
|
||||
System.out.println("Expected query results to be inequal, but they are same. Result (sample records only):");
|
||||
System.out.println("Expected query results to be different, but they are same. Result (sample records only):");
|
||||
prevRows.show();
|
||||
throw new HoodieValidationException("Query validation failed for '" + query
|
||||
+ "'. Expected " + prevRows.count() + " rows, Found " + newRows.count());
|
||||
|
||||
@@ -35,9 +35,9 @@ import org.apache.spark.sql.SQLContext;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Validator to run sql queries on new table state and expects a single result. If the result doesnt match expected result,
|
||||
* throw validation error.
|
||||
*
|
||||
* Validator to run sql queries on new table state and expects a single result. If the result does not match expected result,
|
||||
* throw validation error.
|
||||
* <p>
|
||||
* Example configuration: "query1#expectedResult1;query2#expectedResult2;"
|
||||
*/
|
||||
public class SqlQuerySingleResultPreCommitValidator<T extends HoodieRecordPayload, I, K, O extends JavaRDD<WriteStatus>> extends SqlQueryPreCommitValidator<T, I, K, O> {
|
||||
|
||||
@@ -45,7 +45,7 @@ import java.io.Serializable;
|
||||
import java.util.concurrent.atomic.AtomicLong;
|
||||
|
||||
/**
|
||||
* Create handle with InternalRow for datasource implemention of bulk insert.
|
||||
* Create handle with InternalRow for datasource implementation of bulk insert.
|
||||
*/
|
||||
public class HoodieRowCreateHandle implements Serializable {
|
||||
|
||||
|
||||
@@ -18,9 +18,10 @@
|
||||
|
||||
package org.apache.hudi.keygen;
|
||||
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
import org.apache.hudi.common.config.TypedProperties;
|
||||
import org.apache.hudi.keygen.constant.KeyGeneratorOptions;
|
||||
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
import org.apache.spark.sql.Row;
|
||||
import org.apache.spark.sql.catalyst.InternalRow;
|
||||
import org.apache.spark.sql.types.StructType;
|
||||
@@ -31,7 +32,7 @@ import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
/**
|
||||
* Simple Key generator for unpartitioned Hive Tables.
|
||||
* Simple Key generator for non-partitioned Hive Tables.
|
||||
*/
|
||||
public class NonpartitionedKeyGenerator extends BuiltinKeyGenerator {
|
||||
|
||||
|
||||
@@ -40,9 +40,9 @@ import java.util.stream.IntStream;
|
||||
|
||||
import scala.Option;
|
||||
|
||||
import static org.apache.hudi.keygen.KeyGenUtils.HUDI_DEFAULT_PARTITION_PATH;
|
||||
import static org.apache.hudi.keygen.KeyGenUtils.DEFAULT_PARTITION_PATH_SEPARATOR;
|
||||
import static org.apache.hudi.keygen.KeyGenUtils.EMPTY_RECORDKEY_PLACEHOLDER;
|
||||
import static org.apache.hudi.keygen.KeyGenUtils.HUDI_DEFAULT_PARTITION_PATH;
|
||||
import static org.apache.hudi.keygen.KeyGenUtils.NULL_RECORDKEY_PLACEHOLDER;
|
||||
|
||||
/**
|
||||
@@ -230,9 +230,10 @@ public class RowKeyGeneratorHelper {
|
||||
|
||||
/**
|
||||
* Generate the tree style positions for the field requested for as per the defined struct type.
|
||||
* @param structType schema of interest
|
||||
* @param field field of interest for which the positions are requested for
|
||||
* @param isRecordKey {@code true} if the field requested for is a record key. {@code false} incase of a partition path.
|
||||
*
|
||||
* @param structType schema of interest
|
||||
* @param field field of interest for which the positions are requested for
|
||||
* @param isRecordKey {@code true} if the field requested for is a record key. {@code false} in case of a partition path.
|
||||
* @return the positions of the field as per the struct type.
|
||||
*/
|
||||
public static List<Integer> getNestedFieldIndices(StructType structType, String field, boolean isRecordKey) {
|
||||
|
||||
@@ -18,7 +18,6 @@
|
||||
|
||||
package org.apache.hudi.metadata;
|
||||
|
||||
import org.apache.avro.specific.SpecificRecordBase;
|
||||
import org.apache.hudi.client.SparkRDDWriteClient;
|
||||
import org.apache.hudi.client.WriteStatus;
|
||||
import org.apache.hudi.client.common.HoodieSparkEngineContext;
|
||||
@@ -35,6 +34,7 @@ import org.apache.hudi.data.HoodieJavaRDD;
|
||||
import org.apache.hudi.exception.HoodieMetadataException;
|
||||
import org.apache.hudi.metrics.DistributedRegistry;
|
||||
|
||||
import org.apache.avro.specific.SpecificRecordBase;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
@@ -51,8 +51,8 @@ public class SparkHoodieBackedTableMetadataWriter extends HoodieBackedTableMetad
|
||||
/**
|
||||
* Return a Spark based implementation of {@code HoodieTableMetadataWriter} which can be used to
|
||||
* write to the metadata table.
|
||||
*
|
||||
* If the metadata table does not exist, an attempt is made to bootstrap it but there is no guarantted that
|
||||
* <p>
|
||||
* If the metadata table does not exist, an attempt is made to bootstrap it but there is no guarantee that
|
||||
* table will end up bootstrapping at this time.
|
||||
*
|
||||
* @param conf
|
||||
|
||||
@@ -26,10 +26,11 @@ import org.apache.hudi.keygen.KeyGeneratorInterface;
|
||||
*/
|
||||
public interface BootstrapMetadataHandler {
|
||||
/**
|
||||
* Execute bootstrap with only metatata.
|
||||
* Execute bootstrap with only metadata.
|
||||
*
|
||||
* @param srcPartitionPath source partition path.
|
||||
* @param partitionPath destination partition path.
|
||||
* @param keyGenerator key generator to use.
|
||||
* @param partitionPath destination partition path.
|
||||
* @param keyGenerator key generator to use.
|
||||
* @return the {@link BootstrapWriteStatus} which has the result of execution.
|
||||
*/
|
||||
BootstrapWriteStatus runMetadataBootstrap(String srcPartitionPath, String partitionPath, KeyGeneratorInterface keyGenerator);
|
||||
|
||||
@@ -113,9 +113,9 @@ public class SparkBootstrapCommitActionExecutor<T extends HoodieRecordPayload<T>
|
||||
validate();
|
||||
try {
|
||||
HoodieTableMetaClient metaClient = table.getMetaClient();
|
||||
Option<HoodieInstant> completetedInstant =
|
||||
Option<HoodieInstant> completedInstant =
|
||||
metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants().lastInstant();
|
||||
ValidationUtils.checkArgument(!completetedInstant.isPresent(),
|
||||
ValidationUtils.checkArgument(!completedInstant.isPresent(),
|
||||
"Active Timeline is expected to be empty for bootstrap to be performed. "
|
||||
+ "If you want to re-bootstrap, please rollback bootstrap first !!");
|
||||
Map<BootstrapMode, List<Pair<String, List<HoodieFileStatus>>>> partitionSelections = listAndProcessSourcePartitions();
|
||||
|
||||
@@ -116,7 +116,7 @@ public class SparkExecuteClusteringCommitActionExecutor<T extends HoodieRecordPa
|
||||
protected Map<String, List<String>> getPartitionToReplacedFileIds(HoodieWriteMetadata<JavaRDD<WriteStatus>> writeMetadata) {
|
||||
Set<HoodieFileGroupId> newFilesWritten = writeMetadata.getWriteStats().get().stream()
|
||||
.map(s -> new HoodieFileGroupId(s.getPartitionPath(), s.getFileId())).collect(Collectors.toSet());
|
||||
// for the below execution strategy, new filegroup id would be same as old filegroup id
|
||||
// for the below execution strategy, new file group id would be same as old file group id
|
||||
if (SparkSingleFileSortExecutionStrategy.class.getName().equals(config.getClusteringExecutionStrategyClass())) {
|
||||
return ClusteringUtils.getFileGroupsFromClusteringPlan(clusteringPlan)
|
||||
.collect(Collectors.groupingBy(fg -> fg.getPartitionPath(), Collectors.mapping(fg -> fg.getFileId(), Collectors.toList())));
|
||||
|
||||
@@ -20,16 +20,16 @@ package org.apache.hudi.table.action.commit;
|
||||
|
||||
import org.apache.hudi.client.WriteStatus;
|
||||
import org.apache.hudi.client.utils.SparkMemoryUtils;
|
||||
import org.apache.hudi.common.config.TypedProperties;
|
||||
import org.apache.hudi.client.utils.SparkValidatorUtils;
|
||||
import org.apache.hudi.common.config.TypedProperties;
|
||||
import org.apache.hudi.common.engine.HoodieEngineContext;
|
||||
import org.apache.hudi.common.model.HoodieCommitMetadata;
|
||||
import org.apache.hudi.common.model.HoodieFileGroupId;
|
||||
import org.apache.hudi.common.model.HoodieKey;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieRecordLocation;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.common.model.HoodieWriteStat;
|
||||
import org.apache.hudi.common.model.HoodieFileGroupId;
|
||||
import org.apache.hudi.common.model.WriteOperationType;
|
||||
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
|
||||
import org.apache.hudi.common.table.timeline.HoodieInstant;
|
||||
@@ -44,9 +44,9 @@ import org.apache.hudi.exception.HoodieIOException;
|
||||
import org.apache.hudi.exception.HoodieUpsertException;
|
||||
import org.apache.hudi.execution.SparkLazyInsertIterable;
|
||||
import org.apache.hudi.io.CreateHandleFactory;
|
||||
import org.apache.hudi.io.HoodieConcatHandle;
|
||||
import org.apache.hudi.io.HoodieMergeHandle;
|
||||
import org.apache.hudi.io.HoodieSortedMergeHandle;
|
||||
import org.apache.hudi.io.HoodieConcatHandle;
|
||||
import org.apache.hudi.keygen.BaseKeyGenerator;
|
||||
import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory;
|
||||
import org.apache.hudi.table.HoodieSparkTable;
|
||||
@@ -55,27 +55,29 @@ import org.apache.hudi.table.WorkloadProfile;
|
||||
import org.apache.hudi.table.WorkloadStat;
|
||||
import org.apache.hudi.table.action.HoodieWriteMetadata;
|
||||
import org.apache.hudi.table.action.cluster.strategy.UpdateStrategy;
|
||||
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.spark.Partitioner;
|
||||
import org.apache.spark.api.java.JavaPairRDD;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.storage.StorageLevel;
|
||||
import scala.Tuple2;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.time.Duration;
|
||||
import java.time.Instant;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import scala.Tuple2;
|
||||
|
||||
import static org.apache.hudi.common.util.ClusteringUtils.getAllFileGroupsInPendingClusteringPlans;
|
||||
|
||||
@@ -126,7 +128,7 @@ public abstract class BaseSparkCommitActionExecutor<T extends HoodieRecordPayloa
|
||||
if (fileGroupsWithUpdatesAndPendingClustering.isEmpty()) {
|
||||
return recordsAndPendingClusteringFileGroups.getLeft();
|
||||
}
|
||||
// there are filegroups pending clustering and receiving updates, so rollback the pending clustering instants
|
||||
// there are file groups pending clustering and receiving updates, so rollback the pending clustering instants
|
||||
// there could be race condition, for example, if the clustering completes after instants are fetched but before rollback completed
|
||||
if (config.isRollbackPendingClustering()) {
|
||||
Set<HoodieInstant> pendingClusteringInstantsToRollback = getAllFileGroupsInPendingClusteringPlans(table.getMetaClient()).entrySet().stream()
|
||||
|
||||
@@ -22,6 +22,7 @@ import org.apache.hudi.common.engine.HoodieEngineContext;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
import org.apache.hudi.table.WorkloadProfile;
|
||||
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
@@ -44,7 +45,7 @@ public class SparkInsertOverwritePartitioner extends UpsertPartitioner {
|
||||
* Returns a list of small files in the given partition path.
|
||||
*/
|
||||
protected List<SmallFile> getSmallFiles(String partitionPath) {
|
||||
// for overwrite, we ignore all existing files. So dont consider any file to be smallFiles
|
||||
// for overwrite, we ignore all existing files. So do not consider any file to be smallFiles
|
||||
return Collections.emptyList();
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user