[HUDI-1089] Refactor hudi-client to support multi-engine (#1827)
- This change breaks `hudi-client` into `hudi-client-common` and `hudi-spark-client` modules - Simple usages of Spark using jsc.parallelize() has been redone using EngineContext#map, EngineContext#flatMap etc - Code changes in the PR, break classes into `BaseXYZ` parent classes with no spark dependencies living in `hudi-client-common` - Classes on `hudi-spark-client` are named `SparkXYZ` extending the parent classes with all the Spark dependencies - To simplify/cleanup, HoodieIndex#fetchRecordLocation has been removed and its usages in tests replaced with alternatives Co-authored-by: Vinoth Chandar <vinoth@apache.org>
This commit is contained in:
@@ -0,0 +1,36 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.async;
|
||||
|
||||
import org.apache.hudi.client.AbstractCompactor;
|
||||
import org.apache.hudi.client.AbstractHoodieWriteClient;
|
||||
import org.apache.hudi.client.HoodieSparkCompactor;
|
||||
import org.apache.hudi.client.common.HoodieEngineContext;
|
||||
|
||||
public class SparkAsyncCompactService extends AsyncCompactService {
|
||||
|
||||
public SparkAsyncCompactService(HoodieEngineContext context, AbstractHoodieWriteClient client) {
|
||||
super(context, client);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected AbstractCompactor createCompactor(AbstractHoodieWriteClient client) {
|
||||
return new HoodieSparkCompactor(client);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,208 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.client;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hudi.avro.model.HoodieCompactionPlan;
|
||||
import org.apache.hudi.client.common.HoodieSparkEngineContext;
|
||||
import org.apache.hudi.common.model.HoodieBaseFile;
|
||||
import org.apache.hudi.common.model.HoodieKey;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.common.table.HoodieTableMetaClient;
|
||||
import org.apache.hudi.common.util.CompactionUtils;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.common.util.collection.Pair;
|
||||
import org.apache.hudi.config.HoodieIndexConfig;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.exception.HoodieIndexException;
|
||||
import org.apache.hudi.index.HoodieIndex;
|
||||
import org.apache.hudi.index.SparkHoodieIndex;
|
||||
import org.apache.hudi.table.HoodieSparkTable;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaPairRDD;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Row;
|
||||
import org.apache.spark.sql.SQLContext;
|
||||
import org.apache.spark.sql.types.StructType;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import scala.Tuple2;
|
||||
|
||||
/**
|
||||
* Provides an RDD based API for accessing/filtering Hoodie tables, based on keys.
|
||||
*/
|
||||
public class HoodieReadClient<T extends HoodieRecordPayload> implements Serializable {
|
||||
|
||||
private static final long serialVersionUID = 1L;
|
||||
|
||||
/**
|
||||
* TODO: We need to persist the index type into hoodie.properties and be able to access the index just with a simple
|
||||
* basepath pointing to the table. Until, then just always assume a BloomIndex
|
||||
*/
|
||||
private final transient HoodieIndex<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> index;
|
||||
private HoodieTable<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> hoodieTable;
|
||||
private transient Option<SQLContext> sqlContextOpt;
|
||||
private final transient HoodieSparkEngineContext context;
|
||||
private final transient Configuration hadoopConf;
|
||||
|
||||
/**
|
||||
* @param basePath path to Hoodie table
|
||||
*/
|
||||
public HoodieReadClient(HoodieSparkEngineContext context, String basePath) {
|
||||
this(context, HoodieWriteConfig.newBuilder().withPath(basePath)
|
||||
// by default we use HoodieBloomIndex
|
||||
.withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()).build());
|
||||
}
|
||||
|
||||
/**
|
||||
* @param context
|
||||
* @param basePath
|
||||
* @param sqlContext
|
||||
*/
|
||||
public HoodieReadClient(HoodieSparkEngineContext context, String basePath, SQLContext sqlContext) {
|
||||
this(context, basePath);
|
||||
this.sqlContextOpt = Option.of(sqlContext);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param clientConfig instance of HoodieWriteConfig
|
||||
*/
|
||||
public HoodieReadClient(HoodieSparkEngineContext context, HoodieWriteConfig clientConfig) {
|
||||
this.context = context;
|
||||
this.hadoopConf = context.getHadoopConf().get();
|
||||
final String basePath = clientConfig.getBasePath();
|
||||
// Create a Hoodie table which encapsulated the commits and files visible
|
||||
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(hadoopConf, basePath, true);
|
||||
this.hoodieTable = HoodieSparkTable.create(clientConfig, context, metaClient);
|
||||
this.index = SparkHoodieIndex.createIndex(clientConfig);
|
||||
this.sqlContextOpt = Option.empty();
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds support for accessing Hoodie built tables from SparkSQL, as you normally would.
|
||||
*
|
||||
* @return SparkConf object to be used to construct the SparkContext by caller
|
||||
*/
|
||||
public static SparkConf addHoodieSupport(SparkConf conf) {
|
||||
conf.set("spark.sql.hive.convertMetastoreParquet", "false");
|
||||
return conf;
|
||||
}
|
||||
|
||||
private void assertSqlContext() {
|
||||
if (!sqlContextOpt.isPresent()) {
|
||||
throw new IllegalStateException("SQLContext must be set, when performing dataframe operations");
|
||||
}
|
||||
}
|
||||
|
||||
private Option<String> convertToDataFilePath(Option<Pair<String, String>> partitionPathFileIDPair) {
|
||||
if (partitionPathFileIDPair.isPresent()) {
|
||||
HoodieBaseFile dataFile = hoodieTable.getBaseFileOnlyView()
|
||||
.getLatestBaseFile(partitionPathFileIDPair.get().getLeft(), partitionPathFileIDPair.get().getRight()).get();
|
||||
return Option.of(dataFile.getPath());
|
||||
} else {
|
||||
return Option.empty();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Given a bunch of hoodie keys, fetches all the individual records out as a data frame.
|
||||
*
|
||||
* @return a dataframe
|
||||
*/
|
||||
public Dataset<Row> readROView(JavaRDD<HoodieKey> hoodieKeys, int parallelism) {
|
||||
assertSqlContext();
|
||||
JavaPairRDD<HoodieKey, Option<Pair<String, String>>> lookupResultRDD = checkExists(hoodieKeys);
|
||||
JavaPairRDD<HoodieKey, Option<String>> keyToFileRDD =
|
||||
lookupResultRDD.mapToPair(r -> new Tuple2<>(r._1, convertToDataFilePath(r._2)));
|
||||
List<String> paths = keyToFileRDD.filter(keyFileTuple -> keyFileTuple._2().isPresent())
|
||||
.map(keyFileTuple -> keyFileTuple._2().get()).collect();
|
||||
|
||||
// record locations might be same for multiple keys, so need a unique list
|
||||
Set<String> uniquePaths = new HashSet<>(paths);
|
||||
Dataset<Row> originalDF = sqlContextOpt.get().read().parquet(uniquePaths.toArray(new String[uniquePaths.size()]));
|
||||
StructType schema = originalDF.schema();
|
||||
JavaPairRDD<HoodieKey, Row> keyRowRDD = originalDF.javaRDD().mapToPair(row -> {
|
||||
HoodieKey key = new HoodieKey(row.getAs(HoodieRecord.RECORD_KEY_METADATA_FIELD),
|
||||
row.getAs(HoodieRecord.PARTITION_PATH_METADATA_FIELD));
|
||||
return new Tuple2<>(key, row);
|
||||
});
|
||||
|
||||
// Now, we need to further filter out, for only rows that match the supplied hoodie keys
|
||||
JavaRDD<Row> rowRDD = keyRowRDD.join(keyToFileRDD, parallelism).map(tuple -> tuple._2()._1());
|
||||
return sqlContextOpt.get().createDataFrame(rowRDD, schema);
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if the given [Keys] exists in the hoodie table and returns [Key, Option[FullFilePath]] If the optional
|
||||
* FullFilePath value is not present, then the key is not found. If the FullFilePath value is present, it is the path
|
||||
* component (without scheme) of the URI underlying file
|
||||
*/
|
||||
public JavaPairRDD<HoodieKey, Option<Pair<String, String>>> checkExists(JavaRDD<HoodieKey> hoodieKeys) {
|
||||
return index.tagLocation(hoodieKeys.map(k -> new HoodieRecord<>(k, null)), context, hoodieTable)
|
||||
.mapToPair(hr -> new Tuple2<>(hr.getKey(), hr.isCurrentLocationKnown()
|
||||
? Option.of(Pair.of(hr.getPartitionPath(), hr.getCurrentLocation().getFileId()))
|
||||
: Option.empty())
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Filter out HoodieRecords that already exists in the output folder. This is useful in deduplication.
|
||||
*
|
||||
* @param hoodieRecords Input RDD of Hoodie records.
|
||||
* @return A subset of hoodieRecords RDD, with existing records filtered out.
|
||||
*/
|
||||
public JavaRDD<HoodieRecord<T>> filterExists(JavaRDD<HoodieRecord<T>> hoodieRecords) {
|
||||
JavaRDD<HoodieRecord<T>> recordsWithLocation = tagLocation(hoodieRecords);
|
||||
return recordsWithLocation.filter(v1 -> !v1.isCurrentLocationKnown());
|
||||
}
|
||||
|
||||
/**
|
||||
* Looks up the index and tags each incoming record with a location of a file that contains the row (if it is actually
|
||||
* present). Input RDD should contain no duplicates if needed.
|
||||
*
|
||||
* @param hoodieRecords Input RDD of Hoodie records
|
||||
* @return Tagged RDD of Hoodie records
|
||||
*/
|
||||
public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> hoodieRecords) throws HoodieIndexException {
|
||||
return index.tagLocation(hoodieRecords, context, hoodieTable);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return all pending compactions with instant time for clients to decide what to compact next.
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public List<Pair<String, HoodieCompactionPlan>> getPendingCompactions() {
|
||||
HoodieTableMetaClient metaClient =
|
||||
new HoodieTableMetaClient(hadoopConf, hoodieTable.getMetaClient().getBasePath(), true);
|
||||
return CompactionUtils.getAllPendingCompactionPlans(metaClient).stream()
|
||||
.map(
|
||||
instantWorkloadPair -> Pair.of(instantWorkloadPair.getKey().getTimestamp(), instantWorkloadPair.getValue()))
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,55 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.client;
|
||||
|
||||
import org.apache.hudi.common.model.HoodieKey;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.common.table.timeline.HoodieInstant;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.exception.HoodieException;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
public class HoodieSparkCompactor<T extends HoodieRecordPayload> extends AbstractCompactor<T,
|
||||
JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> {
|
||||
private static final Logger LOG = LogManager.getLogger(HoodieSparkCompactor.class);
|
||||
|
||||
public HoodieSparkCompactor(AbstractHoodieWriteClient<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> compactionClient) {
|
||||
super(compactionClient);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void compact(HoodieInstant instant) throws IOException {
|
||||
LOG.info("Compactor executing compaction " + instant);
|
||||
JavaRDD<WriteStatus> res = compactionClient.compact(instant.getTimestamp());
|
||||
long numWriteErrors = res.collect().stream().filter(WriteStatus::hasErrors).count();
|
||||
if (numWriteErrors != 0) {
|
||||
// We treat even a single error in compaction as fatal
|
||||
LOG.error("Compaction for instant (" + instant + ") failed with write errors. Errors :" + numWriteErrors);
|
||||
throw new HoodieException(
|
||||
"Compaction for instant (" + instant + ") failed with write errors. Errors :" + numWriteErrors);
|
||||
}
|
||||
// Commit compaction
|
||||
compactionClient.commitCompaction(instant.getTimestamp(), res, Option.empty());
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,68 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.client;
|
||||
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Result of a write operation.
|
||||
*/
|
||||
public class HoodieWriteResult implements Serializable {
|
||||
|
||||
private JavaRDD<WriteStatus> writeStatuses;
|
||||
private Map<String, List<String>> partitionToReplaceFileIds;
|
||||
|
||||
public HoodieWriteResult(JavaRDD<WriteStatus> writeStatuses) {
|
||||
this(writeStatuses, Collections.emptyMap());
|
||||
}
|
||||
|
||||
public HoodieWriteResult(JavaRDD<WriteStatus> writeStatuses, Map<String, List<String>> partitionToReplaceFileIds) {
|
||||
this.writeStatuses = writeStatuses;
|
||||
this.partitionToReplaceFileIds = partitionToReplaceFileIds;
|
||||
}
|
||||
|
||||
public JavaRDD<WriteStatus> getWriteStatuses() {
|
||||
return this.writeStatuses;
|
||||
}
|
||||
|
||||
public void setWriteStatuses(final JavaRDD<WriteStatus> writeStatuses) {
|
||||
this.writeStatuses = writeStatuses;
|
||||
}
|
||||
|
||||
public Map<String, List<String>> getPartitionToReplaceFileIds() {
|
||||
return this.partitionToReplaceFileIds;
|
||||
}
|
||||
|
||||
public void setPartitionToReplaceFileIds(final Map<String, List<String>> partitionToReplaceFileIds) {
|
||||
this.partitionToReplaceFileIds = partitionToReplaceFileIds;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "HoodieWriteResult{"
|
||||
+ "writeStatuses=" + writeStatuses
|
||||
+ ", partitionToReplaceFileIds=" + partitionToReplaceFileIds
|
||||
+ '}';
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,319 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.client;
|
||||
|
||||
import org.apache.hudi.client.common.HoodieEngineContext;
|
||||
import org.apache.hudi.client.common.HoodieSparkEngineContext;
|
||||
import org.apache.hudi.client.embedded.EmbeddedTimelineService;
|
||||
import org.apache.hudi.common.model.HoodieCommitMetadata;
|
||||
import org.apache.hudi.common.model.HoodieKey;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.common.model.HoodieWriteStat;
|
||||
import org.apache.hudi.common.model.WriteOperationType;
|
||||
import org.apache.hudi.common.table.HoodieTableMetaClient;
|
||||
import org.apache.hudi.common.table.HoodieTableVersion;
|
||||
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
|
||||
import org.apache.hudi.common.table.timeline.HoodieInstant;
|
||||
import org.apache.hudi.common.table.timeline.HoodieTimeline;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.exception.HoodieCommitException;
|
||||
import org.apache.hudi.index.HoodieIndex;
|
||||
import org.apache.hudi.index.SparkHoodieIndex;
|
||||
import org.apache.hudi.table.BulkInsertPartitioner;
|
||||
import org.apache.hudi.table.HoodieSparkTable;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
import org.apache.hudi.table.action.HoodieWriteMetadata;
|
||||
import org.apache.hudi.table.action.compact.SparkCompactHelpers;
|
||||
import org.apache.hudi.table.upgrade.SparkUpgradeDowngrade;
|
||||
|
||||
import com.codahale.metrics.Timer;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.text.ParseException;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
@SuppressWarnings("checkstyle:LineLength")
|
||||
public class SparkRDDWriteClient<T extends HoodieRecordPayload> extends
|
||||
AbstractHoodieWriteClient<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> {
|
||||
|
||||
private static final Logger LOG = LogManager.getLogger(SparkRDDWriteClient.class);
|
||||
|
||||
public SparkRDDWriteClient(HoodieEngineContext context, HoodieWriteConfig clientConfig) {
|
||||
super(context, clientConfig);
|
||||
}
|
||||
|
||||
public SparkRDDWriteClient(HoodieEngineContext context, HoodieWriteConfig writeConfig, boolean rollbackPending) {
|
||||
super(context, writeConfig, rollbackPending);
|
||||
}
|
||||
|
||||
public SparkRDDWriteClient(HoodieEngineContext context, HoodieWriteConfig writeConfig, boolean rollbackPending,
|
||||
Option<EmbeddedTimelineService> timelineService) {
|
||||
super(context, writeConfig, rollbackPending, timelineService);
|
||||
}
|
||||
|
||||
/**
|
||||
* Register hudi classes for Kryo serialization.
|
||||
*
|
||||
* @param conf instance of SparkConf
|
||||
* @return SparkConf
|
||||
*/
|
||||
public static SparkConf registerClasses(SparkConf conf) {
|
||||
conf.registerKryoClasses(new Class[]{HoodieWriteConfig.class, HoodieRecord.class, HoodieKey.class});
|
||||
return conf;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected HoodieIndex<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> createIndex(HoodieWriteConfig writeConfig) {
|
||||
return SparkHoodieIndex.createIndex(config);
|
||||
}
|
||||
|
||||
/**
|
||||
* Complete changes performed at the given instantTime marker with specified action.
|
||||
*/
|
||||
@Override
|
||||
public boolean commit(String instantTime, JavaRDD<WriteStatus> writeStatuses, Option<Map<String, String>> extraMetadata,
|
||||
String commitActionType, Map<String, List<String>> partitionToReplacedFileIds) {
|
||||
List<HoodieWriteStat> writeStats = writeStatuses.map(WriteStatus::getStat).collect();
|
||||
return commitStats(instantTime, writeStats, extraMetadata, commitActionType, partitionToReplacedFileIds);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected HoodieTable<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> createTable(HoodieWriteConfig config,
|
||||
Configuration hadoopConf) {
|
||||
return HoodieSparkTable.create(config, context);
|
||||
}
|
||||
|
||||
@Override
|
||||
public JavaRDD<HoodieRecord<T>> filterExists(JavaRDD<HoodieRecord<T>> hoodieRecords) {
|
||||
// Create a Hoodie table which encapsulated the commits and files visible
|
||||
HoodieSparkTable<T> table = HoodieSparkTable.create(config, context);
|
||||
Timer.Context indexTimer = metrics.getIndexCtx();
|
||||
JavaRDD<HoodieRecord<T>> recordsWithLocation = getIndex().tagLocation(hoodieRecords, context, table);
|
||||
metrics.updateIndexMetrics(LOOKUP_STR, metrics.getDurationInMs(indexTimer == null ? 0L : indexTimer.stop()));
|
||||
return recordsWithLocation.filter(v1 -> !v1.isCurrentLocationKnown());
|
||||
}
|
||||
|
||||
/**
|
||||
* Main API to run bootstrap to hudi.
|
||||
*/
|
||||
@Override
|
||||
public void bootstrap(Option<Map<String, String>> extraMetadata) {
|
||||
if (rollbackPending) {
|
||||
rollBackInflightBootstrap();
|
||||
}
|
||||
getTableAndInitCtx(WriteOperationType.UPSERT, HoodieTimeline.METADATA_BOOTSTRAP_INSTANT_TS).bootstrap(context, extraMetadata);
|
||||
}
|
||||
|
||||
@Override
|
||||
public JavaRDD<WriteStatus> upsert(JavaRDD<HoodieRecord<T>> records, String instantTime) {
|
||||
HoodieTable<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> table =
|
||||
getTableAndInitCtx(WriteOperationType.UPSERT, instantTime);
|
||||
table.validateUpsertSchema();
|
||||
setOperationType(WriteOperationType.UPSERT);
|
||||
this.asyncCleanerService = AsyncCleanerService.startAsyncCleaningIfEnabled(this, instantTime);
|
||||
HoodieWriteMetadata<JavaRDD<WriteStatus>> result = table.upsert(context, instantTime, records);
|
||||
if (result.getIndexLookupDuration().isPresent()) {
|
||||
metrics.updateIndexMetrics(LOOKUP_STR, result.getIndexLookupDuration().get().toMillis());
|
||||
}
|
||||
return postWrite(result, instantTime, table);
|
||||
}
|
||||
|
||||
@Override
|
||||
public JavaRDD<WriteStatus> upsertPreppedRecords(JavaRDD<HoodieRecord<T>> preppedRecords, String instantTime) {
|
||||
HoodieTable<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> table =
|
||||
getTableAndInitCtx(WriteOperationType.UPSERT_PREPPED, instantTime);
|
||||
table.validateUpsertSchema();
|
||||
setOperationType(WriteOperationType.UPSERT_PREPPED);
|
||||
this.asyncCleanerService = AsyncCleanerService.startAsyncCleaningIfEnabled(this, instantTime);
|
||||
HoodieWriteMetadata<JavaRDD<WriteStatus>> result = table.upsertPrepped(context,instantTime, preppedRecords);
|
||||
return postWrite(result, instantTime, table);
|
||||
}
|
||||
|
||||
@Override
|
||||
public JavaRDD<WriteStatus> insert(JavaRDD<HoodieRecord<T>> records, String instantTime) {
|
||||
HoodieTable<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> table =
|
||||
getTableAndInitCtx(WriteOperationType.INSERT, instantTime);
|
||||
table.validateInsertSchema();
|
||||
setOperationType(WriteOperationType.INSERT);
|
||||
this.asyncCleanerService = AsyncCleanerService.startAsyncCleaningIfEnabled(this, instantTime);
|
||||
HoodieWriteMetadata<JavaRDD<WriteStatus>> result = table.insert(context,instantTime, records);
|
||||
return postWrite(result, instantTime, table);
|
||||
}
|
||||
|
||||
@Override
|
||||
public JavaRDD<WriteStatus> insertPreppedRecords(JavaRDD<HoodieRecord<T>> preppedRecords, String instantTime) {
|
||||
HoodieTable<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> table =
|
||||
getTableAndInitCtx(WriteOperationType.INSERT_PREPPED, instantTime);
|
||||
table.validateInsertSchema();
|
||||
setOperationType(WriteOperationType.INSERT_PREPPED);
|
||||
this.asyncCleanerService = AsyncCleanerService.startAsyncCleaningIfEnabled(this, instantTime);
|
||||
HoodieWriteMetadata<JavaRDD<WriteStatus>> result = table.insertPrepped(context,instantTime, preppedRecords);
|
||||
return postWrite(result, instantTime, table);
|
||||
}
|
||||
|
||||
/**
|
||||
* Removes all existing records from the partitions affected and inserts the given HoodieRecords, into the table.
|
||||
|
||||
* @param records HoodieRecords to insert
|
||||
* @param instantTime Instant time of the commit
|
||||
* @return JavaRDD[WriteStatus] - RDD of WriteStatus to inspect errors and counts
|
||||
*/
|
||||
public HoodieWriteResult insertOverwrite(JavaRDD<HoodieRecord<T>> records, final String instantTime) {
|
||||
HoodieTable table = getTableAndInitCtx(WriteOperationType.INSERT_OVERWRITE, instantTime);
|
||||
table.validateInsertSchema();
|
||||
setOperationType(WriteOperationType.INSERT_OVERWRITE);
|
||||
this.asyncCleanerService = AsyncCleanerService.startAsyncCleaningIfEnabled(this, instantTime);
|
||||
HoodieWriteMetadata result = table.insertOverwrite(context, instantTime, records);
|
||||
return new HoodieWriteResult(postWrite(result, instantTime, table), result.getPartitionToReplaceFileIds());
|
||||
}
|
||||
|
||||
@Override
|
||||
public JavaRDD<WriteStatus> bulkInsert(JavaRDD<HoodieRecord<T>> records, String instantTime) {
|
||||
return bulkInsert(records, instantTime, Option.empty());
|
||||
}
|
||||
|
||||
@Override
|
||||
public JavaRDD<WriteStatus> bulkInsert(JavaRDD<HoodieRecord<T>> records, String instantTime, Option<BulkInsertPartitioner<JavaRDD<HoodieRecord<T>>>> userDefinedBulkInsertPartitioner) {
|
||||
HoodieTable<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> table =
|
||||
getTableAndInitCtx(WriteOperationType.BULK_INSERT, instantTime);
|
||||
table.validateInsertSchema();
|
||||
setOperationType(WriteOperationType.BULK_INSERT);
|
||||
this.asyncCleanerService = AsyncCleanerService.startAsyncCleaningIfEnabled(this, instantTime);
|
||||
HoodieWriteMetadata<JavaRDD<WriteStatus>> result = table.bulkInsert(context,instantTime, records, userDefinedBulkInsertPartitioner);
|
||||
return postWrite(result, instantTime, table);
|
||||
}
|
||||
|
||||
@Override
|
||||
public JavaRDD<WriteStatus> bulkInsertPreppedRecords(JavaRDD<HoodieRecord<T>> preppedRecords, String instantTime, Option<BulkInsertPartitioner<JavaRDD<HoodieRecord<T>>>> bulkInsertPartitioner) {
|
||||
HoodieTable<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> table =
|
||||
getTableAndInitCtx(WriteOperationType.BULK_INSERT_PREPPED, instantTime);
|
||||
table.validateInsertSchema();
|
||||
setOperationType(WriteOperationType.BULK_INSERT_PREPPED);
|
||||
this.asyncCleanerService = AsyncCleanerService.startAsyncCleaningIfEnabled(this, instantTime);
|
||||
HoodieWriteMetadata<JavaRDD<WriteStatus>> result = table.bulkInsertPrepped(context,instantTime, preppedRecords, bulkInsertPartitioner);
|
||||
return postWrite(result, instantTime, table);
|
||||
}
|
||||
|
||||
@Override
|
||||
public JavaRDD<WriteStatus> delete(JavaRDD<HoodieKey> keys, String instantTime) {
|
||||
HoodieTable<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> table = getTableAndInitCtx(WriteOperationType.DELETE, instantTime);
|
||||
setOperationType(WriteOperationType.DELETE);
|
||||
HoodieWriteMetadata<JavaRDD<WriteStatus>> result = table.delete(context,instantTime, keys);
|
||||
return postWrite(result, instantTime, table);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected JavaRDD<WriteStatus> postWrite(HoodieWriteMetadata<JavaRDD<WriteStatus>> result,
|
||||
String instantTime,
|
||||
HoodieTable<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> hoodieTable) {
|
||||
if (result.getIndexLookupDuration().isPresent()) {
|
||||
metrics.updateIndexMetrics(getOperationType().name(), result.getIndexUpdateDuration().get().toMillis());
|
||||
}
|
||||
if (result.isCommitted()) {
|
||||
// Perform post commit operations.
|
||||
if (result.getFinalizeDuration().isPresent()) {
|
||||
metrics.updateFinalizeWriteMetrics(result.getFinalizeDuration().get().toMillis(),
|
||||
result.getWriteStats().get().size());
|
||||
}
|
||||
|
||||
postCommit(hoodieTable, result.getCommitMetadata().get(), instantTime, Option.empty());
|
||||
|
||||
emitCommitMetrics(instantTime, result.getCommitMetadata().get(), hoodieTable.getMetaClient().getCommitActionType());
|
||||
}
|
||||
return result.getWriteStatuses();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void commitCompaction(String compactionInstantTime, JavaRDD<WriteStatus> writeStatuses, Option<Map<String, String>> extraMetadata) throws IOException {
|
||||
HoodieSparkTable<T> table = HoodieSparkTable.create(config, context);
|
||||
HoodieCommitMetadata metadata = SparkCompactHelpers.newInstance().createCompactionMetadata(
|
||||
table, compactionInstantTime, writeStatuses, config.getSchema());
|
||||
extraMetadata.ifPresent(m -> m.forEach(metadata::addMetadata));
|
||||
completeCompaction(metadata, writeStatuses, table, compactionInstantTime);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void completeCompaction(HoodieCommitMetadata metadata, JavaRDD<WriteStatus> writeStatuses,
|
||||
HoodieTable<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> table,
|
||||
String compactionCommitTime) {
|
||||
List<HoodieWriteStat> writeStats = writeStatuses.map(WriteStatus::getStat).collect();
|
||||
finalizeWrite(table, compactionCommitTime, writeStats);
|
||||
LOG.info("Committing Compaction " + compactionCommitTime + ". Finished with result " + metadata);
|
||||
SparkCompactHelpers.newInstance().completeInflightCompaction(table, compactionCommitTime, metadata);
|
||||
|
||||
if (compactionTimer != null) {
|
||||
long durationInMs = metrics.getDurationInMs(compactionTimer.stop());
|
||||
try {
|
||||
metrics.updateCommitMetrics(HoodieActiveTimeline.COMMIT_FORMATTER.parse(compactionCommitTime).getTime(),
|
||||
durationInMs, metadata, HoodieActiveTimeline.COMPACTION_ACTION);
|
||||
} catch (ParseException e) {
|
||||
throw new HoodieCommitException("Commit time is not of valid format. Failed to commit compaction "
|
||||
+ config.getBasePath() + " at time " + compactionCommitTime, e);
|
||||
}
|
||||
}
|
||||
LOG.info("Compacted successfully on commit " + compactionCommitTime);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected JavaRDD<WriteStatus> compact(String compactionInstantTime, boolean shouldComplete) {
|
||||
HoodieSparkTable<T> table = HoodieSparkTable.create(config, context);
|
||||
HoodieTimeline pendingCompactionTimeline = table.getActiveTimeline().filterPendingCompactionTimeline();
|
||||
HoodieInstant inflightInstant = HoodieTimeline.getCompactionInflightInstant(compactionInstantTime);
|
||||
if (pendingCompactionTimeline.containsInstant(inflightInstant)) {
|
||||
rollbackInflightCompaction(inflightInstant, table);
|
||||
table.getMetaClient().reloadActiveTimeline();
|
||||
}
|
||||
compactionTimer = metrics.getCompactionCtx();
|
||||
HoodieWriteMetadata<JavaRDD<WriteStatus>> compactionMetadata = table.compact(context, compactionInstantTime);
|
||||
JavaRDD<WriteStatus> statuses = compactionMetadata.getWriteStatuses();
|
||||
if (shouldComplete && compactionMetadata.getCommitMetadata().isPresent()) {
|
||||
completeCompaction(compactionMetadata.getCommitMetadata().get(), statuses, table, compactionInstantTime);
|
||||
}
|
||||
return statuses;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected HoodieTable<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> getTableAndInitCtx(WriteOperationType operationType, String instantTime) {
|
||||
HoodieTableMetaClient metaClient = createMetaClient(true);
|
||||
new SparkUpgradeDowngrade(metaClient, config, context).run(metaClient, HoodieTableVersion.current(), config, context, instantTime);
|
||||
return getTableAndInitCtx(metaClient, operationType);
|
||||
}
|
||||
|
||||
private HoodieTable<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> getTableAndInitCtx(HoodieTableMetaClient metaClient, WriteOperationType operationType) {
|
||||
if (operationType == WriteOperationType.DELETE) {
|
||||
setWriteSchemaForDeletes(metaClient);
|
||||
}
|
||||
// Create a Hoodie table which encapsulated the commits and files visible
|
||||
HoodieSparkTable<T> table = HoodieSparkTable.create(config, (HoodieSparkEngineContext) context, metaClient);
|
||||
if (table.getMetaClient().getCommitActionType().equals(HoodieTimeline.COMMIT_ACTION)) {
|
||||
writeTimer = metrics.getCommitCtx();
|
||||
} else {
|
||||
writeTimer = metrics.getDeltaCommitCtx();
|
||||
}
|
||||
return table;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,84 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.client;
|
||||
|
||||
import org.apache.hudi.client.common.EngineProperty;
|
||||
import org.apache.hudi.client.common.TaskContextSupplier;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.exception.HoodieException;
|
||||
|
||||
import org.apache.spark.SparkEnv;
|
||||
import org.apache.spark.TaskContext;
|
||||
import org.apache.spark.util.Utils;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.function.Supplier;
|
||||
|
||||
/**
|
||||
* Spark task context supplier.
|
||||
*/
|
||||
public class SparkTaskContextSupplier extends TaskContextSupplier implements Serializable {
|
||||
|
||||
@Override
|
||||
public Supplier<Integer> getPartitionIdSupplier() {
|
||||
return TaskContext::getPartitionId;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Supplier<Integer> getStageIdSupplier() {
|
||||
return () -> TaskContext.get().stageId();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Supplier<Long> getAttemptIdSupplier() {
|
||||
return () -> TaskContext.get().taskAttemptId();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Option<String> getProperty(EngineProperty prop) {
|
||||
if (prop == EngineProperty.TOTAL_MEMORY_AVAILABLE) {
|
||||
// This is hard-coded in spark code {@link
|
||||
// https://github.com/apache/spark/blob/576c43fb4226e4efa12189b41c3bc862019862c6/core/src/main/scala/org/apache/
|
||||
// spark/SparkContext.scala#L471} so have to re-define this here
|
||||
final String DEFAULT_SPARK_EXECUTOR_MEMORY_MB = "1024"; // in MB
|
||||
final String SPARK_EXECUTOR_MEMORY_PROP = "spark.executor.memory";
|
||||
if (SparkEnv.get() != null) {
|
||||
// 1 GB is the default conf used by Spark, look at SparkContext.scala
|
||||
return Option.ofNullable(String.valueOf(Utils.memoryStringToMb(SparkEnv.get().conf()
|
||||
.get(SPARK_EXECUTOR_MEMORY_PROP, DEFAULT_SPARK_EXECUTOR_MEMORY_MB)) * 1024 * 1024L));
|
||||
}
|
||||
return Option.empty();
|
||||
} else if (prop == EngineProperty.MEMORY_FRACTION_IN_USE) {
|
||||
// This is hard-coded in spark code {@link
|
||||
// https://github.com/apache/spark/blob/576c43fb4226e4efa12189b41c3bc862019862c6/core/src/main/scala/org/apache/
|
||||
// spark/memory/UnifiedMemoryManager.scala#L231} so have to re-define this here
|
||||
final String DEFAULT_SPARK_EXECUTOR_MEMORY_FRACTION = "0.6";
|
||||
final String SPARK_EXECUTOR_MEMORY_FRACTION_PROP = "spark.memory.fraction";
|
||||
if (SparkEnv.get() != null) {
|
||||
// 0.6 is the default value used by Spark,
|
||||
// look at {@link
|
||||
// https://github.com/apache/spark/blob/master/core/src/main/scala/org/apache/spark/SparkConf.scala#L507}
|
||||
return Option.ofNullable(SparkEnv.get().conf()
|
||||
.get(SPARK_EXECUTOR_MEMORY_FRACTION_PROP, DEFAULT_SPARK_EXECUTOR_MEMORY_FRACTION));
|
||||
}
|
||||
return Option.empty();
|
||||
}
|
||||
throw new HoodieException("Unknown engine property :" + prop);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,68 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.client.bootstrap;
|
||||
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hudi.avro.HoodieAvroUtils;
|
||||
import org.apache.hudi.avro.model.HoodieFileStatus;
|
||||
import org.apache.hudi.client.common.HoodieEngineContext;
|
||||
import org.apache.hudi.common.bootstrap.FileStatusUtils;
|
||||
import org.apache.hudi.common.util.ParquetUtils;
|
||||
import org.apache.hudi.common.util.collection.Pair;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.exception.HoodieException;
|
||||
import org.apache.parquet.schema.MessageType;
|
||||
import org.apache.spark.sql.avro.SchemaConverters;
|
||||
import org.apache.spark.sql.execution.datasources.parquet.ParquetToSparkSchemaConverter;
|
||||
import org.apache.spark.sql.internal.SQLConf;
|
||||
import org.apache.spark.sql.types.StructType;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
|
||||
public class HoodieSparkBootstrapSchemaProvider extends HoodieBootstrapSchemaProvider {
|
||||
public HoodieSparkBootstrapSchemaProvider(HoodieWriteConfig writeConfig) {
|
||||
super(writeConfig);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Schema getBootstrapSourceSchema(HoodieEngineContext context, List<Pair<String, List<HoodieFileStatus>>> partitions) {
|
||||
MessageType parquetSchema = partitions.stream().flatMap(p -> p.getValue().stream()).map(fs -> {
|
||||
try {
|
||||
Path filePath = FileStatusUtils.toPath(fs.getPath());
|
||||
return ParquetUtils.readSchema(context.getHadoopConf().get(), filePath);
|
||||
} catch (Exception ex) {
|
||||
return null;
|
||||
}
|
||||
}).filter(Objects::nonNull).findAny()
|
||||
.orElseThrow(() -> new HoodieException("Could not determine schema from the data files."));
|
||||
|
||||
|
||||
ParquetToSparkSchemaConverter converter = new ParquetToSparkSchemaConverter(
|
||||
Boolean.parseBoolean(SQLConf.PARQUET_BINARY_AS_STRING().defaultValueString()),
|
||||
Boolean.parseBoolean(SQLConf.PARQUET_INT96_AS_TIMESTAMP().defaultValueString()));
|
||||
StructType sparkSchema = converter.convert(parquetSchema);
|
||||
String tableName = HoodieAvroUtils.sanitizeName(writeConfig.getTableName());
|
||||
String structName = tableName + "_record";
|
||||
String recordNamespace = "hoodie." + tableName;
|
||||
|
||||
return SchemaConverters.toAvroType(sparkSchema, false, structName, recordNamespace);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,112 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.client.common;
|
||||
|
||||
import org.apache.hudi.client.SparkTaskContextSupplier;
|
||||
import org.apache.hudi.common.config.SerializableConfiguration;
|
||||
import org.apache.hudi.client.common.function.SerializableConsumer;
|
||||
import org.apache.hudi.client.common.function.SerializableFunction;
|
||||
import org.apache.hudi.client.common.function.SerializablePairFunction;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.exception.HoodieException;
|
||||
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.sql.SQLContext;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
/**
|
||||
* A Spark engine implementation of HoodieEngineContext.
|
||||
*/
|
||||
public class HoodieSparkEngineContext extends HoodieEngineContext {
|
||||
|
||||
private final JavaSparkContext javaSparkContext;
|
||||
private SQLContext sqlContext;
|
||||
|
||||
public HoodieSparkEngineContext(JavaSparkContext jsc) {
|
||||
super(new SerializableConfiguration(jsc.hadoopConfiguration()), new SparkTaskContextSupplier());
|
||||
this.javaSparkContext = jsc;
|
||||
this.sqlContext = SQLContext.getOrCreate(jsc.sc());
|
||||
}
|
||||
|
||||
public void setSqlContext(SQLContext sqlContext) {
|
||||
this.sqlContext = sqlContext;
|
||||
}
|
||||
|
||||
public JavaSparkContext getJavaSparkContext() {
|
||||
return javaSparkContext;
|
||||
}
|
||||
|
||||
public SQLContext getSqlContext() {
|
||||
return sqlContext;
|
||||
}
|
||||
|
||||
public static JavaSparkContext getSparkContext(HoodieEngineContext context) {
|
||||
return ((HoodieSparkEngineContext) context).getJavaSparkContext();
|
||||
}
|
||||
|
||||
@Override
|
||||
public <I, O> List<O> map(List<I> data, SerializableFunction<I, O> func, int parallelism) {
|
||||
return javaSparkContext.parallelize(data, parallelism).map(func::apply).collect();
|
||||
}
|
||||
|
||||
@Override
|
||||
public <I, O> List<O> flatMap(List<I> data, SerializableFunction<I, Stream<O>> func, int parallelism) {
|
||||
return javaSparkContext.parallelize(data, parallelism).flatMap(x -> func.apply(x).iterator()).collect();
|
||||
}
|
||||
|
||||
@Override
|
||||
public <I> void foreach(List<I> data, SerializableConsumer<I> consumer, int parallelism) {
|
||||
javaSparkContext.parallelize(data, parallelism).foreach(consumer::accept);
|
||||
}
|
||||
|
||||
@Override
|
||||
public <I, K, V> Map<K, V> mapToPair(List<I> data, SerializablePairFunction<I, K, V> func, Integer parallelism) {
|
||||
if (Objects.nonNull(parallelism)) {
|
||||
return javaSparkContext.parallelize(data, parallelism).mapToPair(func::call).collectAsMap();
|
||||
} else {
|
||||
return javaSparkContext.parallelize(data).mapToPair(func::call).collectAsMap();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setProperty(EngineProperty key, String value) {
|
||||
if (key == EngineProperty.COMPACTION_POOL_NAME) {
|
||||
javaSparkContext.setLocalProperty("spark.scheduler.pool", value);
|
||||
} else {
|
||||
throw new HoodieException("Unknown engine property :" + key);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Option<String> getProperty(EngineProperty key) {
|
||||
if (key == EngineProperty.EMBEDDED_SERVER_HOST) {
|
||||
return Option.ofNullable(javaSparkContext.getConf().get("spark.driver.host", null));
|
||||
}
|
||||
throw new HoodieException("Unknown engine property :" + key);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setJobStatus(String activeModule, String activityDescription) {
|
||||
javaSparkContext.setJobGroup(activeModule, activityDescription);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,243 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.client.model;
|
||||
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.spark.sql.catalyst.InternalRow;
|
||||
import org.apache.spark.sql.catalyst.util.ArrayData;
|
||||
import org.apache.spark.sql.catalyst.util.MapData;
|
||||
import org.apache.spark.sql.types.DataType;
|
||||
import org.apache.spark.sql.types.Decimal;
|
||||
import org.apache.spark.unsafe.types.CalendarInterval;
|
||||
import org.apache.spark.unsafe.types.UTF8String;
|
||||
|
||||
/**
|
||||
* Internal Row implementation for Hoodie Row. It wraps an {@link InternalRow} and keeps meta columns locally. But the {@link InternalRow}
|
||||
* does include the meta columns as well just that {@link HoodieInternalRow} will intercept queries for meta columns and serve from its
|
||||
* copy rather than fetching from {@link InternalRow}.
|
||||
*/
|
||||
public class HoodieInternalRow extends InternalRow {
|
||||
|
||||
private String commitTime;
|
||||
private String commitSeqNumber;
|
||||
private String recordKey;
|
||||
private String partitionPath;
|
||||
private String fileName;
|
||||
private InternalRow row;
|
||||
|
||||
public HoodieInternalRow(String commitTime, String commitSeqNumber, String recordKey, String partitionPath,
|
||||
String fileName, InternalRow row) {
|
||||
this.commitTime = commitTime;
|
||||
this.commitSeqNumber = commitSeqNumber;
|
||||
this.recordKey = recordKey;
|
||||
this.partitionPath = partitionPath;
|
||||
this.fileName = fileName;
|
||||
this.row = row;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int numFields() {
|
||||
return row.numFields();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setNullAt(int i) {
|
||||
if (i < HoodieRecord.HOODIE_META_COLUMNS.size()) {
|
||||
switch (i) {
|
||||
case 0: {
|
||||
this.commitTime = null;
|
||||
break;
|
||||
}
|
||||
case 1: {
|
||||
this.commitSeqNumber = null;
|
||||
break;
|
||||
}
|
||||
case 2: {
|
||||
this.recordKey = null;
|
||||
break;
|
||||
}
|
||||
case 3: {
|
||||
this.partitionPath = null;
|
||||
break;
|
||||
}
|
||||
case 4: {
|
||||
this.fileName = null;
|
||||
break;
|
||||
}
|
||||
default: throw new IllegalArgumentException("Not expected");
|
||||
}
|
||||
} else {
|
||||
row.setNullAt(i);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void update(int i, Object value) {
|
||||
if (i < HoodieRecord.HOODIE_META_COLUMNS.size()) {
|
||||
switch (i) {
|
||||
case 0: {
|
||||
this.commitTime = value.toString();
|
||||
break;
|
||||
}
|
||||
case 1: {
|
||||
this.commitSeqNumber = value.toString();
|
||||
break;
|
||||
}
|
||||
case 2: {
|
||||
this.recordKey = value.toString();
|
||||
break;
|
||||
}
|
||||
case 3: {
|
||||
this.partitionPath = value.toString();
|
||||
break;
|
||||
}
|
||||
case 4: {
|
||||
this.fileName = value.toString();
|
||||
break;
|
||||
}
|
||||
default: throw new IllegalArgumentException("Not expected");
|
||||
}
|
||||
} else {
|
||||
row.update(i, value);
|
||||
}
|
||||
}
|
||||
|
||||
private String getMetaColumnVal(int ordinal) {
|
||||
switch (ordinal) {
|
||||
case 0: {
|
||||
return commitTime;
|
||||
}
|
||||
case 1: {
|
||||
return commitSeqNumber;
|
||||
}
|
||||
case 2: {
|
||||
return recordKey;
|
||||
}
|
||||
case 3: {
|
||||
return partitionPath;
|
||||
}
|
||||
case 4: {
|
||||
return fileName;
|
||||
}
|
||||
default: throw new IllegalArgumentException("Not expected");
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isNullAt(int ordinal) {
|
||||
if (ordinal < HoodieRecord.HOODIE_META_COLUMNS.size()) {
|
||||
return null == getMetaColumnVal(ordinal);
|
||||
}
|
||||
return row.isNullAt(ordinal);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean getBoolean(int ordinal) {
|
||||
return row.getBoolean(ordinal);
|
||||
}
|
||||
|
||||
@Override
|
||||
public byte getByte(int ordinal) {
|
||||
return row.getByte(ordinal);
|
||||
}
|
||||
|
||||
@Override
|
||||
public short getShort(int ordinal) {
|
||||
return row.getShort(ordinal);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getInt(int ordinal) {
|
||||
return row.getInt(ordinal);
|
||||
}
|
||||
|
||||
@Override
|
||||
public long getLong(int ordinal) {
|
||||
return row.getLong(ordinal);
|
||||
}
|
||||
|
||||
@Override
|
||||
public float getFloat(int ordinal) {
|
||||
return row.getFloat(ordinal);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getDouble(int ordinal) {
|
||||
return row.getDouble(ordinal);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Decimal getDecimal(int ordinal, int precision, int scale) {
|
||||
return row.getDecimal(ordinal, precision, scale);
|
||||
}
|
||||
|
||||
@Override
|
||||
public UTF8String getUTF8String(int ordinal) {
|
||||
if (ordinal < HoodieRecord.HOODIE_META_COLUMNS.size()) {
|
||||
return UTF8String.fromBytes(getMetaColumnVal(ordinal).getBytes());
|
||||
}
|
||||
return row.getUTF8String(ordinal);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getString(int ordinal) {
|
||||
if (ordinal < HoodieRecord.HOODIE_META_COLUMNS.size()) {
|
||||
return new String(getMetaColumnVal(ordinal).getBytes());
|
||||
}
|
||||
return row.getString(ordinal);
|
||||
}
|
||||
|
||||
@Override
|
||||
public byte[] getBinary(int ordinal) {
|
||||
return row.getBinary(ordinal);
|
||||
}
|
||||
|
||||
@Override
|
||||
public CalendarInterval getInterval(int ordinal) {
|
||||
return row.getInterval(ordinal);
|
||||
}
|
||||
|
||||
@Override
|
||||
public InternalRow getStruct(int ordinal, int numFields) {
|
||||
return row.getStruct(ordinal, numFields);
|
||||
}
|
||||
|
||||
@Override
|
||||
public ArrayData getArray(int ordinal) {
|
||||
return row.getArray(ordinal);
|
||||
}
|
||||
|
||||
@Override
|
||||
public MapData getMap(int ordinal) {
|
||||
return row.getMap(ordinal);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object get(int ordinal, DataType dataType) {
|
||||
if (ordinal < HoodieRecord.HOODIE_META_COLUMNS.size()) {
|
||||
return UTF8String.fromBytes(getMetaColumnVal(ordinal).getBytes());
|
||||
}
|
||||
return row.get(ordinal, dataType);
|
||||
}
|
||||
|
||||
@Override
|
||||
public InternalRow copy() {
|
||||
return new HoodieInternalRow(commitTime, commitSeqNumber, recordKey, partitionPath, fileName, row.copy());
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,44 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.client.utils;
|
||||
|
||||
import org.apache.hudi.config.HoodieIndexConfig;
|
||||
|
||||
import org.apache.spark.storage.StorageLevel;
|
||||
|
||||
import java.util.Properties;
|
||||
|
||||
import static org.apache.hudi.config.HoodieWriteConfig.WRITE_STATUS_STORAGE_LEVEL;
|
||||
|
||||
/**
|
||||
* Spark config utils.
|
||||
*/
|
||||
public class SparkMemoryUtils {
|
||||
public static StorageLevel getWriteStatusStorageLevel(Properties properties) {
|
||||
return StorageLevel.fromString(properties.getProperty(WRITE_STATUS_STORAGE_LEVEL));
|
||||
}
|
||||
|
||||
public static StorageLevel getBloomIndexInputStorageLevel(Properties properties) {
|
||||
return StorageLevel.fromString(properties.getProperty(HoodieIndexConfig.BLOOM_INDEX_INPUT_STORAGE_LEVEL));
|
||||
}
|
||||
|
||||
public static StorageLevel getSimpleIndexInputStorageLevel(Properties properties) {
|
||||
return StorageLevel.fromString(properties.getProperty(HoodieIndexConfig.SIMPLE_INDEX_INPUT_STORAGE_LEVEL));
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,56 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.execution;
|
||||
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.common.util.queue.BoundedInMemoryExecutor;
|
||||
import org.apache.hudi.common.util.queue.BoundedInMemoryQueueConsumer;
|
||||
import org.apache.hudi.common.util.queue.BoundedInMemoryQueueProducer;
|
||||
import org.apache.hudi.common.util.queue.IteratorBasedQueueProducer;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
|
||||
import org.apache.spark.TaskContext;
|
||||
import org.apache.spark.TaskContext$;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.function.Function;
|
||||
|
||||
public class SparkBoundedInMemoryExecutor<I, O, E> extends BoundedInMemoryExecutor<I, O, E> {
|
||||
|
||||
// Need to set current spark thread's TaskContext into newly launched thread so that new thread can access
|
||||
// TaskContext properties.
|
||||
final TaskContext sparkThreadTaskContext;
|
||||
|
||||
public SparkBoundedInMemoryExecutor(final HoodieWriteConfig hoodieConfig, final Iterator<I> inputItr,
|
||||
BoundedInMemoryQueueConsumer<O, E> consumer, Function<I, O> bufferedIteratorTransform) {
|
||||
this(hoodieConfig, new IteratorBasedQueueProducer<>(inputItr), consumer, bufferedIteratorTransform);
|
||||
}
|
||||
|
||||
public SparkBoundedInMemoryExecutor(final HoodieWriteConfig hoodieConfig, BoundedInMemoryQueueProducer<I> producer,
|
||||
BoundedInMemoryQueueConsumer<O, E> consumer, Function<I, O> bufferedIteratorTransform) {
|
||||
super(hoodieConfig.getWriteBufferLimitBytes(), producer, Option.of(consumer), bufferedIteratorTransform);
|
||||
this.sparkThreadTaskContext = TaskContext.get();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void preExecute() {
|
||||
// Passing parent thread's TaskContext to newly launched thread for it to access original TaskContext properties.
|
||||
TaskContext$.MODULE$.setTaskContext(sparkThreadTaskContext);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,78 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.execution;
|
||||
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.hudi.client.WriteStatus;
|
||||
import org.apache.hudi.client.common.TaskContextSupplier;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.common.util.queue.BoundedInMemoryExecutor;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.exception.HoodieException;
|
||||
import org.apache.hudi.io.WriteHandleFactory;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
public class SparkLazyInsertIterable<T extends HoodieRecordPayload> extends HoodieLazyInsertIterable<T> {
|
||||
|
||||
public SparkLazyInsertIterable(Iterator<HoodieRecord<T>> recordItr,
|
||||
boolean areRecordsSorted,
|
||||
HoodieWriteConfig config,
|
||||
String instantTime,
|
||||
HoodieTable hoodieTable,
|
||||
String idPrefix,
|
||||
TaskContextSupplier taskContextSupplier) {
|
||||
super(recordItr, areRecordsSorted, config, instantTime, hoodieTable, idPrefix, taskContextSupplier);
|
||||
}
|
||||
|
||||
public SparkLazyInsertIterable(Iterator<HoodieRecord<T>> recordItr,
|
||||
boolean areRecordsSorted,
|
||||
HoodieWriteConfig config,
|
||||
String instantTime,
|
||||
HoodieTable hoodieTable,
|
||||
String idPrefix,
|
||||
TaskContextSupplier taskContextSupplier,
|
||||
WriteHandleFactory writeHandleFactory) {
|
||||
super(recordItr, areRecordsSorted, config, instantTime, hoodieTable, idPrefix, taskContextSupplier, writeHandleFactory);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<WriteStatus> computeNext() {
|
||||
// Executor service used for launching writer thread.
|
||||
BoundedInMemoryExecutor<HoodieRecord<T>, HoodieInsertValueGenResult<HoodieRecord>, List<WriteStatus>> bufferedIteratorExecutor =
|
||||
null;
|
||||
try {
|
||||
final Schema schema = new Schema.Parser().parse(hoodieConfig.getSchema());
|
||||
bufferedIteratorExecutor =
|
||||
new SparkBoundedInMemoryExecutor<>(hoodieConfig, inputItr, getInsertHandler(), getTransformFunction(schema));
|
||||
final List<WriteStatus> result = bufferedIteratorExecutor.execute();
|
||||
assert result != null && !result.isEmpty() && !bufferedIteratorExecutor.isRemaining();
|
||||
return result;
|
||||
} catch (Exception e) {
|
||||
throw new HoodieException(e);
|
||||
} finally {
|
||||
if (null != bufferedIteratorExecutor) {
|
||||
bufferedIteratorExecutor.shutdownNow();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,42 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.execution.bulkinsert;
|
||||
|
||||
import org.apache.hudi.exception.HoodieException;
|
||||
import org.apache.hudi.table.BulkInsertPartitioner;
|
||||
|
||||
/**
|
||||
* A factory to generate built-in partitioner to repartition input records into at least
|
||||
* expected number of output spark partitions for bulk insert operation.
|
||||
*/
|
||||
public abstract class BulkInsertInternalPartitionerFactory {
|
||||
|
||||
public static BulkInsertPartitioner get(BulkInsertSortMode sortMode) {
|
||||
switch (sortMode) {
|
||||
case NONE:
|
||||
return new NonSortPartitioner();
|
||||
case GLOBAL_SORT:
|
||||
return new GlobalSortPartitioner();
|
||||
case PARTITION_SORT:
|
||||
return new RDDPartitionSortPartitioner();
|
||||
default:
|
||||
throw new HoodieException("The bulk insert sort mode \"" + sortMode.name() + "\" is not supported.");
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,60 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.execution.bulkinsert;
|
||||
|
||||
import org.apache.hudi.client.WriteStatus;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.execution.SparkLazyInsertIterable;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
|
||||
import org.apache.spark.api.java.function.Function2;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Map function that handles a stream of HoodieRecords.
|
||||
*/
|
||||
public class BulkInsertMapFunction<T extends HoodieRecordPayload>
|
||||
implements Function2<Integer, Iterator<HoodieRecord<T>>, Iterator<List<WriteStatus>>> {
|
||||
|
||||
private String instantTime;
|
||||
private boolean areRecordsSorted;
|
||||
private HoodieWriteConfig config;
|
||||
private HoodieTable hoodieTable;
|
||||
private List<String> fileIDPrefixes;
|
||||
|
||||
public BulkInsertMapFunction(String instantTime, boolean areRecordsSorted,
|
||||
HoodieWriteConfig config, HoodieTable hoodieTable,
|
||||
List<String> fileIDPrefixes) {
|
||||
this.instantTime = instantTime;
|
||||
this.areRecordsSorted = areRecordsSorted;
|
||||
this.config = config;
|
||||
this.hoodieTable = hoodieTable;
|
||||
this.fileIDPrefixes = fileIDPrefixes;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Iterator<List<WriteStatus>> call(Integer partition, Iterator<HoodieRecord<T>> recordItr) {
|
||||
return new SparkLazyInsertIterable<>(recordItr, areRecordsSorted, config, instantTime, hoodieTable,
|
||||
fileIDPrefixes.get(partition), hoodieTable.getTaskContextSupplier());
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,57 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.execution.bulkinsert;
|
||||
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.table.BulkInsertPartitioner;
|
||||
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
|
||||
/**
|
||||
* A built-in partitioner that does global sorting for the input records across partitions
|
||||
* after repartition for bulk insert operation, corresponding to the
|
||||
* {@code BulkInsertSortMode.GLOBAL_SORT} mode.
|
||||
*
|
||||
* @param <T> HoodieRecordPayload type
|
||||
*/
|
||||
public class GlobalSortPartitioner<T extends HoodieRecordPayload>
|
||||
implements BulkInsertPartitioner<JavaRDD<HoodieRecord<T>>> {
|
||||
|
||||
@Override
|
||||
public JavaRDD<HoodieRecord<T>> repartitionRecords(JavaRDD<HoodieRecord<T>> records,
|
||||
int outputSparkPartitions) {
|
||||
// Now, sort the records and line them up nicely for loading.
|
||||
return records.sortBy(record -> {
|
||||
// Let's use "partitionPath + key" as the sort key. Spark, will ensure
|
||||
// the records split evenly across RDD partitions, such that small partitions fit
|
||||
// into 1 RDD partition, while big ones spread evenly across multiple RDD partitions
|
||||
return new StringBuilder()
|
||||
.append(record.getPartitionPath())
|
||||
.append("+")
|
||||
.append(record.getRecordKey())
|
||||
.toString();
|
||||
}, true, outputSparkPartitions);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean arePartitionRecordsSorted() {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,46 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.execution.bulkinsert;
|
||||
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.table.BulkInsertPartitioner;
|
||||
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
|
||||
/**
|
||||
* A built-in partitioner that only does coalesce for input records for bulk insert operation,
|
||||
* corresponding to the {@code BulkInsertSortMode.NONE} mode.
|
||||
*
|
||||
* @param <T> HoodieRecordPayload type
|
||||
*/
|
||||
public class NonSortPartitioner<T extends HoodieRecordPayload>
|
||||
implements BulkInsertPartitioner<JavaRDD<HoodieRecord<T>>> {
|
||||
|
||||
@Override
|
||||
public JavaRDD<HoodieRecord<T>> repartitionRecords(JavaRDD<HoodieRecord<T>> records,
|
||||
int outputSparkPartitions) {
|
||||
return records.coalesce(outputSparkPartitions);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean arePartitionRecordsSorted() {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,69 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.execution.bulkinsert;
|
||||
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.table.BulkInsertPartitioner;
|
||||
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
import scala.Tuple2;
|
||||
|
||||
/**
|
||||
* A built-in partitioner that does local sorting for each RDD partition
|
||||
* after coalesce for bulk insert operation, corresponding to the
|
||||
* {@code BulkInsertSortMode.PARTITION_SORT} mode.
|
||||
*
|
||||
* @param <T> HoodieRecordPayload type
|
||||
*/
|
||||
public class RDDPartitionSortPartitioner<T extends HoodieRecordPayload>
|
||||
implements BulkInsertPartitioner<JavaRDD<HoodieRecord<T>>> {
|
||||
|
||||
@Override
|
||||
public JavaRDD<HoodieRecord<T>> repartitionRecords(JavaRDD<HoodieRecord<T>> records,
|
||||
int outputSparkPartitions) {
|
||||
return records.coalesce(outputSparkPartitions)
|
||||
.mapToPair(record ->
|
||||
new Tuple2<>(
|
||||
new StringBuilder()
|
||||
.append(record.getPartitionPath())
|
||||
.append("+")
|
||||
.append(record.getRecordKey())
|
||||
.toString(), record))
|
||||
.mapPartitions(partition -> {
|
||||
// Sort locally in partition
|
||||
List<Tuple2<String, HoodieRecord<T>>> recordList = new ArrayList<>();
|
||||
for (; partition.hasNext(); ) {
|
||||
recordList.add(partition.next());
|
||||
}
|
||||
Collections.sort(recordList, (o1, o2) -> o1._1.compareTo(o2._1));
|
||||
return recordList.stream().map(e -> e._2).iterator();
|
||||
});
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean arePartitionRecordsSorted() {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,85 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.index;
|
||||
|
||||
import org.apache.hudi.ApiMaturityLevel;
|
||||
import org.apache.hudi.PublicAPIMethod;
|
||||
import org.apache.hudi.client.WriteStatus;
|
||||
import org.apache.hudi.client.common.HoodieEngineContext;
|
||||
import org.apache.hudi.common.model.HoodieKey;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.common.util.ReflectionUtils;
|
||||
import org.apache.hudi.common.util.StringUtils;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.exception.HoodieIndexException;
|
||||
import org.apache.hudi.index.bloom.SparkHoodieBloomIndex;
|
||||
import org.apache.hudi.index.bloom.SparkHoodieGlobalBloomIndex;
|
||||
import org.apache.hudi.index.hbase.SparkHoodieHBaseIndex;
|
||||
import org.apache.hudi.index.simple.SparkHoodieGlobalSimpleIndex;
|
||||
import org.apache.hudi.index.simple.SparkHoodieSimpleIndex;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
|
||||
@SuppressWarnings("checkstyle:LineLength")
|
||||
public abstract class SparkHoodieIndex<T extends HoodieRecordPayload> extends HoodieIndex<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> {
|
||||
protected SparkHoodieIndex(HoodieWriteConfig config) {
|
||||
super(config);
|
||||
}
|
||||
|
||||
public static SparkHoodieIndex createIndex(HoodieWriteConfig config) {
|
||||
// first use index class config to create index.
|
||||
if (!StringUtils.isNullOrEmpty(config.getIndexClass())) {
|
||||
Object instance = ReflectionUtils.loadClass(config.getIndexClass(), config);
|
||||
if (!(instance instanceof HoodieIndex)) {
|
||||
throw new HoodieIndexException(config.getIndexClass() + " is not a subclass of HoodieIndex");
|
||||
}
|
||||
return (SparkHoodieIndex) instance;
|
||||
}
|
||||
switch (config.getIndexType()) {
|
||||
case HBASE:
|
||||
return new SparkHoodieHBaseIndex<>(config);
|
||||
case INMEMORY:
|
||||
return new SparkInMemoryHashIndex(config);
|
||||
case BLOOM:
|
||||
return new SparkHoodieBloomIndex<>(config);
|
||||
case GLOBAL_BLOOM:
|
||||
return new SparkHoodieGlobalBloomIndex<>(config);
|
||||
case SIMPLE:
|
||||
return new SparkHoodieSimpleIndex(config);
|
||||
case GLOBAL_SIMPLE:
|
||||
return new SparkHoodieGlobalSimpleIndex(config);
|
||||
default:
|
||||
throw new HoodieIndexException("Index type unspecified, set " + config.getIndexType());
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
@PublicAPIMethod(maturity = ApiMaturityLevel.STABLE)
|
||||
public abstract JavaRDD<WriteStatus> updateLocation(JavaRDD<WriteStatus> writeStatusRDD,
|
||||
HoodieEngineContext context,
|
||||
HoodieTable<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> hoodieTable) throws HoodieIndexException;
|
||||
|
||||
@Override
|
||||
@PublicAPIMethod(maturity = ApiMaturityLevel.STABLE)
|
||||
public abstract JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> records,
|
||||
HoodieEngineContext context,
|
||||
HoodieTable<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> hoodieTable) throws HoodieIndexException;
|
||||
}
|
||||
@@ -0,0 +1,135 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.index;
|
||||
|
||||
import org.apache.hudi.client.WriteStatus;
|
||||
import org.apache.hudi.client.common.HoodieEngineContext;
|
||||
import org.apache.hudi.common.model.HoodieKey;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieRecordLocation;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.function.Function2;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.ConcurrentMap;
|
||||
|
||||
/**
|
||||
* Hoodie Index implementation backed by an in-memory Hash map.
|
||||
* <p>
|
||||
* ONLY USE FOR LOCAL TESTING
|
||||
*/
|
||||
@SuppressWarnings("checkstyle:LineLength")
|
||||
public class SparkInMemoryHashIndex<T extends HoodieRecordPayload> extends SparkHoodieIndex<T> {
|
||||
|
||||
private static ConcurrentMap<HoodieKey, HoodieRecordLocation> recordLocationMap;
|
||||
|
||||
public SparkInMemoryHashIndex(HoodieWriteConfig config) {
|
||||
super(config);
|
||||
synchronized (SparkInMemoryHashIndex.class) {
|
||||
if (recordLocationMap == null) {
|
||||
recordLocationMap = new ConcurrentHashMap<>();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> recordRDD, HoodieEngineContext context,
|
||||
HoodieTable<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> hoodieTable) {
|
||||
return recordRDD.mapPartitionsWithIndex(this.new LocationTagFunction(), true);
|
||||
}
|
||||
|
||||
@Override
|
||||
public JavaRDD<WriteStatus> updateLocation(JavaRDD<WriteStatus> writeStatusRDD,
|
||||
HoodieEngineContext context,
|
||||
HoodieTable<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> hoodieTable) {
|
||||
return writeStatusRDD.map(writeStatus -> {
|
||||
for (HoodieRecord record : writeStatus.getWrittenRecords()) {
|
||||
if (!writeStatus.isErrored(record.getKey())) {
|
||||
HoodieKey key = record.getKey();
|
||||
Option<HoodieRecordLocation> newLocation = record.getNewLocation();
|
||||
if (newLocation.isPresent()) {
|
||||
recordLocationMap.put(key, newLocation.get());
|
||||
} else {
|
||||
// Delete existing index for a deleted record
|
||||
recordLocationMap.remove(key);
|
||||
}
|
||||
}
|
||||
}
|
||||
return writeStatus;
|
||||
});
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean rollbackCommit(String instantTime) {
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Only looks up by recordKey.
|
||||
*/
|
||||
@Override
|
||||
public boolean isGlobal() {
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Mapping is available in HBase already.
|
||||
*/
|
||||
@Override
|
||||
public boolean canIndexLogFiles() {
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Index needs to be explicitly updated after storage write.
|
||||
*/
|
||||
@Override
|
||||
public boolean isImplicitWithStorage() {
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Function that tags each HoodieRecord with an existing location, if known.
|
||||
*/
|
||||
class LocationTagFunction implements Function2<Integer, Iterator<HoodieRecord<T>>, Iterator<HoodieRecord<T>>> {
|
||||
|
||||
@Override
|
||||
public Iterator<HoodieRecord<T>> call(Integer partitionNum, Iterator<HoodieRecord<T>> hoodieRecordIterator) {
|
||||
List<HoodieRecord<T>> taggedRecords = new ArrayList<>();
|
||||
while (hoodieRecordIterator.hasNext()) {
|
||||
HoodieRecord<T> rec = hoodieRecordIterator.next();
|
||||
if (recordLocationMap.containsKey(rec.getKey())) {
|
||||
rec.unseal();
|
||||
rec.setCurrentLocation(recordLocationMap.get(rec.getKey()));
|
||||
rec.seal();
|
||||
}
|
||||
taggedRecords.add(rec);
|
||||
}
|
||||
return taggedRecords.iterator();
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,156 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.index.bloom;
|
||||
|
||||
import org.apache.hudi.common.util.NumericUtils;
|
||||
import org.apache.hudi.common.util.collection.Pair;
|
||||
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.spark.Partitioner;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
||||
/**
|
||||
* Partitions bloom filter checks by spreading out comparisons across buckets of work.
|
||||
*
|
||||
* Each bucket incurs the following cost
|
||||
*
|
||||
* <pre>
|
||||
* 1) Read bloom filter from file footer
|
||||
* 2) Check keys against bloom filter
|
||||
* 3) [Conditional] If any key had a hit, open file and check
|
||||
* </pre>
|
||||
*
|
||||
* The partitioner performs a two phase bin packing algorithm, to pack enough work into each bucket such that cost of
|
||||
* (1) & (3) is amortized. Also, avoids any skews in the sort based approach, by directly partitioning by the file to be
|
||||
* checked against and ensuring each partition has similar number of buckets. Performance tests show that this approach
|
||||
* could bound the amount of skew to std_dev(numberOfBucketsPerPartition) * cost of (3), lower than sort partitioning.
|
||||
*
|
||||
* Approach has two goals :
|
||||
*
|
||||
* <pre>
|
||||
* 1) Pack as many buckets from same file group into same partition, to amortize cost of (1) and (2) further
|
||||
* 2) Spread buckets across partitions evenly to achieve skew reduction
|
||||
* </pre>
|
||||
*/
|
||||
public class BucketizedBloomCheckPartitioner extends Partitioner {
|
||||
|
||||
private static final Logger LOG = LogManager.getLogger(BucketizedBloomCheckPartitioner.class);
|
||||
|
||||
private int partitions;
|
||||
|
||||
/**
|
||||
* Stores the final mapping of a file group to a list of partitions for its keys.
|
||||
*/
|
||||
private Map<String, List<Integer>> fileGroupToPartitions;
|
||||
|
||||
/**
|
||||
* Create a partitioner that computes a plan based on provided workload characteristics.
|
||||
*
|
||||
* @param targetPartitions maximum number of partitions to target
|
||||
* @param fileGroupToComparisons number of expected comparisons per file group
|
||||
* @param keysPerBucket maximum number of keys to pack in a single bucket
|
||||
*/
|
||||
public BucketizedBloomCheckPartitioner(int targetPartitions, Map<String, Long> fileGroupToComparisons,
|
||||
int keysPerBucket) {
|
||||
this.fileGroupToPartitions = new HashMap<>();
|
||||
|
||||
Map<String, Integer> bucketsPerFileGroup = new HashMap<>();
|
||||
// Compute the buckets needed per file group, using simple uniform distribution
|
||||
fileGroupToComparisons.forEach((f, c) -> bucketsPerFileGroup.put(f, (int) Math.ceil((c * 1.0) / keysPerBucket)));
|
||||
int totalBuckets = bucketsPerFileGroup.values().stream().mapToInt(i -> i).sum();
|
||||
// If totalBuckets > targetPartitions, no need to have extra partitions
|
||||
this.partitions = Math.min(targetPartitions, totalBuckets);
|
||||
|
||||
// PHASE 1 : start filling upto minimum number of buckets into partitions, taking all but one bucket from each file
|
||||
// This tries to first optimize for goal 1 above, with knowledge that each partition needs a certain minimum number
|
||||
// of buckets and assigns buckets in the same order as file groups. If we were to simply round robin, then buckets
|
||||
// for a file group is more or less guaranteed to be placed on different partitions all the time.
|
||||
int minBucketsPerPartition = Math.max((int) Math.floor((1.0 * totalBuckets) / partitions), 1);
|
||||
LOG.info(String.format("TotalBuckets %d, min_buckets/partition %d", totalBuckets, minBucketsPerPartition));
|
||||
int[] bucketsFilled = new int[partitions];
|
||||
Map<String, AtomicInteger> bucketsFilledPerFileGroup = new HashMap<>();
|
||||
int partitionIndex = 0;
|
||||
for (Map.Entry<String, Integer> e : bucketsPerFileGroup.entrySet()) {
|
||||
for (int b = 0; b < Math.max(1, e.getValue() - 1); b++) {
|
||||
// keep filled counts upto date
|
||||
bucketsFilled[partitionIndex]++;
|
||||
AtomicInteger cnt = bucketsFilledPerFileGroup.getOrDefault(e.getKey(), new AtomicInteger(0));
|
||||
cnt.incrementAndGet();
|
||||
bucketsFilledPerFileGroup.put(e.getKey(), cnt);
|
||||
|
||||
// mark this partition against the file group
|
||||
List<Integer> partitionList = this.fileGroupToPartitions.getOrDefault(e.getKey(), new ArrayList<>());
|
||||
partitionList.add(partitionIndex);
|
||||
this.fileGroupToPartitions.put(e.getKey(), partitionList);
|
||||
|
||||
// switch to new partition if needed
|
||||
if (bucketsFilled[partitionIndex] >= minBucketsPerPartition) {
|
||||
partitionIndex = (partitionIndex + 1) % partitions;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// PHASE 2 : for remaining unassigned buckets, round robin over partitions once. Since we withheld 1 bucket from
|
||||
// each file group uniformly, this remaining is also an uniform mix across file groups. We just round robin to
|
||||
// optimize for goal 2.
|
||||
for (Map.Entry<String, Integer> e : bucketsPerFileGroup.entrySet()) {
|
||||
int remaining = e.getValue() - bucketsFilledPerFileGroup.get(e.getKey()).intValue();
|
||||
for (int r = 0; r < remaining; r++) {
|
||||
// mark this partition against the file group
|
||||
this.fileGroupToPartitions.get(e.getKey()).add(partitionIndex);
|
||||
bucketsFilled[partitionIndex]++;
|
||||
partitionIndex = (partitionIndex + 1) % partitions;
|
||||
}
|
||||
}
|
||||
|
||||
if (LOG.isDebugEnabled()) {
|
||||
LOG.debug("Partitions assigned per file groups :" + fileGroupToPartitions);
|
||||
StringBuilder str = new StringBuilder();
|
||||
for (int i = 0; i < bucketsFilled.length; i++) {
|
||||
str.append("p" + i + " : " + bucketsFilled[i] + ",");
|
||||
}
|
||||
LOG.debug("Num buckets assigned per file group :" + str);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public int numPartitions() {
|
||||
return partitions;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getPartition(Object key) {
|
||||
final Pair<String, String> parts = (Pair<String, String>) key;
|
||||
final long hashOfKey = NumericUtils.getMessageDigestHash("MD5", parts.getRight());
|
||||
final List<Integer> candidatePartitions = fileGroupToPartitions.get(parts.getLeft());
|
||||
final int idx = (int) Math.floorMod((int) hashOfKey, candidatePartitions.size());
|
||||
assert idx >= 0;
|
||||
return candidatePartitions.get(idx);
|
||||
}
|
||||
|
||||
Map<String, List<Integer>> getFileGroupToPartitions() {
|
||||
return fileGroupToPartitions;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,120 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.index.bloom;
|
||||
|
||||
import org.apache.hudi.client.utils.LazyIterableIterator;
|
||||
import org.apache.hudi.common.model.HoodieKey;
|
||||
import org.apache.hudi.common.util.collection.Pair;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.exception.HoodieException;
|
||||
import org.apache.hudi.exception.HoodieIndexException;
|
||||
import org.apache.hudi.io.HoodieKeyLookupHandle;
|
||||
import org.apache.hudi.io.HoodieKeyLookupHandle.KeyLookupResult;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
|
||||
import org.apache.spark.api.java.function.Function2;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
import scala.Tuple2;
|
||||
|
||||
/**
|
||||
* Function performing actual checking of RDD partition containing (fileId, hoodieKeys) against the actual files.
|
||||
*/
|
||||
public class HoodieBloomIndexCheckFunction
|
||||
implements Function2<Integer, Iterator<Tuple2<String, HoodieKey>>, Iterator<List<KeyLookupResult>>> {
|
||||
|
||||
private final HoodieTable hoodieTable;
|
||||
|
||||
private final HoodieWriteConfig config;
|
||||
|
||||
public HoodieBloomIndexCheckFunction(HoodieTable hoodieTable, HoodieWriteConfig config) {
|
||||
this.hoodieTable = hoodieTable;
|
||||
this.config = config;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Iterator<List<KeyLookupResult>> call(Integer partition,
|
||||
Iterator<Tuple2<String, HoodieKey>> fileParitionRecordKeyTripletItr) {
|
||||
return new LazyKeyCheckIterator(fileParitionRecordKeyTripletItr);
|
||||
}
|
||||
|
||||
class LazyKeyCheckIterator extends LazyIterableIterator<Tuple2<String, HoodieKey>, List<KeyLookupResult>> {
|
||||
|
||||
private HoodieKeyLookupHandle keyLookupHandle;
|
||||
|
||||
LazyKeyCheckIterator(Iterator<Tuple2<String, HoodieKey>> filePartitionRecordKeyTripletItr) {
|
||||
super(filePartitionRecordKeyTripletItr);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void start() {
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<HoodieKeyLookupHandle.KeyLookupResult> computeNext() {
|
||||
|
||||
List<HoodieKeyLookupHandle.KeyLookupResult> ret = new ArrayList<>();
|
||||
try {
|
||||
// process one file in each go.
|
||||
while (inputItr.hasNext()) {
|
||||
Tuple2<String, HoodieKey> currentTuple = inputItr.next();
|
||||
String fileId = currentTuple._1;
|
||||
String partitionPath = currentTuple._2.getPartitionPath();
|
||||
String recordKey = currentTuple._2.getRecordKey();
|
||||
Pair<String, String> partitionPathFilePair = Pair.of(partitionPath, fileId);
|
||||
|
||||
// lazily init state
|
||||
if (keyLookupHandle == null) {
|
||||
keyLookupHandle = new HoodieKeyLookupHandle(config, hoodieTable, partitionPathFilePair);
|
||||
}
|
||||
|
||||
// if continue on current file
|
||||
if (keyLookupHandle.getPartitionPathFilePair().equals(partitionPathFilePair)) {
|
||||
keyLookupHandle.addKey(recordKey);
|
||||
} else {
|
||||
// do the actual checking of file & break out
|
||||
ret.add(keyLookupHandle.getLookupResult());
|
||||
keyLookupHandle = new HoodieKeyLookupHandle(config, hoodieTable, partitionPathFilePair);
|
||||
keyLookupHandle.addKey(recordKey);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// handle case, where we ran out of input, close pending work, update return val
|
||||
if (!inputItr.hasNext()) {
|
||||
ret.add(keyLookupHandle.getLookupResult());
|
||||
}
|
||||
} catch (Throwable e) {
|
||||
if (e instanceof HoodieException) {
|
||||
throw e;
|
||||
}
|
||||
throw new HoodieIndexException("Error checking bloom filter index. ", e);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void end() {
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,298 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.index.bloom;
|
||||
|
||||
import org.apache.hudi.client.WriteStatus;
|
||||
import org.apache.hudi.client.utils.SparkMemoryUtils;
|
||||
import org.apache.hudi.client.common.HoodieEngineContext;
|
||||
import org.apache.hudi.common.model.HoodieKey;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieRecordLocation;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.common.util.collection.Pair;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.exception.MetadataNotFoundException;
|
||||
import org.apache.hudi.index.HoodieIndexUtils;
|
||||
import org.apache.hudi.index.SparkHoodieIndex;
|
||||
import org.apache.hudi.io.HoodieRangeInfoHandle;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.spark.Partitioner;
|
||||
import org.apache.spark.api.java.JavaPairRDD;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.storage.StorageLevel;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import scala.Tuple2;
|
||||
|
||||
import static java.util.stream.Collectors.groupingBy;
|
||||
import static java.util.stream.Collectors.mapping;
|
||||
import static java.util.stream.Collectors.toList;
|
||||
import static org.apache.hudi.index.HoodieIndexUtils.getLatestBaseFilesForAllPartitions;
|
||||
|
||||
/**
|
||||
* Indexing mechanism based on bloom filter. Each parquet file includes its row_key bloom filter in its metadata.
|
||||
*/
|
||||
@SuppressWarnings("checkstyle:LineLength")
|
||||
public class SparkHoodieBloomIndex<T extends HoodieRecordPayload> extends SparkHoodieIndex<T> {
|
||||
|
||||
private static final Logger LOG = LogManager.getLogger(SparkHoodieBloomIndex.class);
|
||||
|
||||
public SparkHoodieBloomIndex(HoodieWriteConfig config) {
|
||||
super(config);
|
||||
}
|
||||
|
||||
@Override
|
||||
public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> recordRDD, HoodieEngineContext context,
|
||||
HoodieTable<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> hoodieTable) {
|
||||
|
||||
// Step 0: cache the input record RDD
|
||||
if (config.getBloomIndexUseCaching()) {
|
||||
recordRDD.persist(SparkMemoryUtils.getBloomIndexInputStorageLevel(config.getProps()));
|
||||
}
|
||||
|
||||
// Step 1: Extract out thinner JavaPairRDD of (partitionPath, recordKey)
|
||||
JavaPairRDD<String, String> partitionRecordKeyPairRDD =
|
||||
recordRDD.mapToPair(record -> new Tuple2<>(record.getPartitionPath(), record.getRecordKey()));
|
||||
|
||||
// Lookup indexes for all the partition/recordkey pair
|
||||
JavaPairRDD<HoodieKey, HoodieRecordLocation> keyFilenamePairRDD =
|
||||
lookupIndex(partitionRecordKeyPairRDD, context, hoodieTable);
|
||||
|
||||
// Cache the result, for subsequent stages.
|
||||
if (config.getBloomIndexUseCaching()) {
|
||||
keyFilenamePairRDD.persist(StorageLevel.MEMORY_AND_DISK_SER());
|
||||
}
|
||||
if (LOG.isDebugEnabled()) {
|
||||
long totalTaggedRecords = keyFilenamePairRDD.count();
|
||||
LOG.debug("Number of update records (ones tagged with a fileID): " + totalTaggedRecords);
|
||||
}
|
||||
|
||||
// Step 4: Tag the incoming records, as inserts or updates, by joining with existing record keys
|
||||
// Cost: 4 sec.
|
||||
JavaRDD<HoodieRecord<T>> taggedRecordRDD = tagLocationBacktoRecords(keyFilenamePairRDD, recordRDD);
|
||||
|
||||
if (config.getBloomIndexUseCaching()) {
|
||||
recordRDD.unpersist(); // unpersist the input Record RDD
|
||||
keyFilenamePairRDD.unpersist();
|
||||
}
|
||||
return taggedRecordRDD;
|
||||
}
|
||||
|
||||
/**
|
||||
* Lookup the location for each record key and return the pair<record_key,location> for all record keys already
|
||||
* present and drop the record keys if not present.
|
||||
*/
|
||||
private JavaPairRDD<HoodieKey, HoodieRecordLocation> lookupIndex(
|
||||
JavaPairRDD<String, String> partitionRecordKeyPairRDD, final HoodieEngineContext context,
|
||||
final HoodieTable hoodieTable) {
|
||||
// Obtain records per partition, in the incoming records
|
||||
Map<String, Long> recordsPerPartition = partitionRecordKeyPairRDD.countByKey();
|
||||
List<String> affectedPartitionPathList = new ArrayList<>(recordsPerPartition.keySet());
|
||||
|
||||
// Step 2: Load all involved files as <Partition, filename> pairs
|
||||
List<Tuple2<String, BloomIndexFileInfo>> fileInfoList =
|
||||
loadInvolvedFiles(affectedPartitionPathList, context, hoodieTable);
|
||||
final Map<String, List<BloomIndexFileInfo>> partitionToFileInfo =
|
||||
fileInfoList.stream().collect(groupingBy(Tuple2::_1, mapping(Tuple2::_2, toList())));
|
||||
|
||||
// Step 3: Obtain a RDD, for each incoming record, that already exists, with the file id,
|
||||
// that contains it.
|
||||
Map<String, Long> comparisonsPerFileGroup =
|
||||
computeComparisonsPerFileGroup(recordsPerPartition, partitionToFileInfo, partitionRecordKeyPairRDD);
|
||||
int inputParallelism = partitionRecordKeyPairRDD.partitions().size();
|
||||
int joinParallelism = Math.max(inputParallelism, config.getBloomIndexParallelism());
|
||||
LOG.info("InputParallelism: ${" + inputParallelism + "}, IndexParallelism: ${"
|
||||
+ config.getBloomIndexParallelism() + "}");
|
||||
return findMatchingFilesForRecordKeys(partitionToFileInfo, partitionRecordKeyPairRDD, joinParallelism, hoodieTable,
|
||||
comparisonsPerFileGroup);
|
||||
}
|
||||
|
||||
/**
|
||||
* Compute the estimated number of bloom filter comparisons to be performed on each file group.
|
||||
*/
|
||||
private Map<String, Long> computeComparisonsPerFileGroup(final Map<String, Long> recordsPerPartition,
|
||||
final Map<String, List<BloomIndexFileInfo>> partitionToFileInfo,
|
||||
JavaPairRDD<String, String> partitionRecordKeyPairRDD) {
|
||||
|
||||
Map<String, Long> fileToComparisons;
|
||||
if (config.getBloomIndexPruneByRanges()) {
|
||||
// we will just try exploding the input and then count to determine comparisons
|
||||
// FIX(vc): Only do sampling here and extrapolate?
|
||||
fileToComparisons = explodeRecordRDDWithFileComparisons(partitionToFileInfo, partitionRecordKeyPairRDD)
|
||||
.mapToPair(t -> t).countByKey();
|
||||
} else {
|
||||
fileToComparisons = new HashMap<>();
|
||||
partitionToFileInfo.forEach((key, value) -> {
|
||||
for (BloomIndexFileInfo fileInfo : value) {
|
||||
// each file needs to be compared against all the records coming into the partition
|
||||
fileToComparisons.put(fileInfo.getFileId(), recordsPerPartition.get(key));
|
||||
}
|
||||
});
|
||||
}
|
||||
return fileToComparisons;
|
||||
}
|
||||
|
||||
/**
|
||||
* Load all involved files as <Partition, filename> pair RDD.
|
||||
*/
|
||||
List<Tuple2<String, BloomIndexFileInfo>> loadInvolvedFiles(List<String> partitions, final HoodieEngineContext context,
|
||||
final HoodieTable hoodieTable) {
|
||||
|
||||
// Obtain the latest data files from all the partitions.
|
||||
List<Pair<String, String>> partitionPathFileIDList = getLatestBaseFilesForAllPartitions(partitions, context, hoodieTable).stream()
|
||||
.map(pair -> Pair.of(pair.getKey(), pair.getValue().getFileId()))
|
||||
.collect(toList());
|
||||
|
||||
if (config.getBloomIndexPruneByRanges()) {
|
||||
// also obtain file ranges, if range pruning is enabled
|
||||
context.setJobStatus(this.getClass().getName(), "Obtain key ranges for file slices (range pruning=on)");
|
||||
return context.map(partitionPathFileIDList, pf -> {
|
||||
try {
|
||||
HoodieRangeInfoHandle rangeInfoHandle = new HoodieRangeInfoHandle(config, hoodieTable, pf);
|
||||
String[] minMaxKeys = rangeInfoHandle.getMinMaxKeys();
|
||||
return new Tuple2<>(pf.getKey(), new BloomIndexFileInfo(pf.getValue(), minMaxKeys[0], minMaxKeys[1]));
|
||||
} catch (MetadataNotFoundException me) {
|
||||
LOG.warn("Unable to find range metadata in file :" + pf);
|
||||
return new Tuple2<>(pf.getKey(), new BloomIndexFileInfo(pf.getValue()));
|
||||
}
|
||||
}, Math.max(partitionPathFileIDList.size(), 1));
|
||||
} else {
|
||||
return partitionPathFileIDList.stream()
|
||||
.map(pf -> new Tuple2<>(pf.getKey(), new BloomIndexFileInfo(pf.getValue()))).collect(toList());
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean rollbackCommit(String instantTime) {
|
||||
// Nope, don't need to do anything.
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* This is not global, since we depend on the partitionPath to do the lookup.
|
||||
*/
|
||||
@Override
|
||||
public boolean isGlobal() {
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* No indexes into log files yet.
|
||||
*/
|
||||
@Override
|
||||
public boolean canIndexLogFiles() {
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Bloom filters are stored, into the same data files.
|
||||
*/
|
||||
@Override
|
||||
public boolean isImplicitWithStorage() {
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* For each incoming record, produce N output records, 1 each for each file against which the record's key needs to be
|
||||
* checked. For tables, where the keys have a definite insert order (e.g: timestamp as prefix), the number of files
|
||||
* to be compared gets cut down a lot from range pruning.
|
||||
* <p>
|
||||
* Sub-partition to ensure the records can be looked up against files & also prune file<=>record comparisons based on
|
||||
* recordKey ranges in the index info.
|
||||
*/
|
||||
JavaRDD<Tuple2<String, HoodieKey>> explodeRecordRDDWithFileComparisons(
|
||||
final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo,
|
||||
JavaPairRDD<String, String> partitionRecordKeyPairRDD) {
|
||||
IndexFileFilter indexFileFilter =
|
||||
config.useBloomIndexTreebasedFilter() ? new IntervalTreeBasedIndexFileFilter(partitionToFileIndexInfo)
|
||||
: new ListBasedIndexFileFilter(partitionToFileIndexInfo);
|
||||
|
||||
return partitionRecordKeyPairRDD.map(partitionRecordKeyPair -> {
|
||||
String recordKey = partitionRecordKeyPair._2();
|
||||
String partitionPath = partitionRecordKeyPair._1();
|
||||
|
||||
return indexFileFilter.getMatchingFilesAndPartition(partitionPath, recordKey).stream()
|
||||
.map(partitionFileIdPair -> new Tuple2<>(partitionFileIdPair.getRight(),
|
||||
new HoodieKey(recordKey, partitionPath)))
|
||||
.collect(Collectors.toList());
|
||||
}).flatMap(List::iterator);
|
||||
}
|
||||
|
||||
/**
|
||||
* Find out <RowKey, filename> pair. All workload grouped by file-level.
|
||||
* <p>
|
||||
* Join PairRDD(PartitionPath, RecordKey) and PairRDD(PartitionPath, File) & then repartition such that each RDD
|
||||
* partition is a file, then for each file, we do (1) load bloom filter, (2) load rowKeys, (3) Tag rowKey
|
||||
* <p>
|
||||
* Make sure the parallelism is atleast the groupby parallelism for tagging location
|
||||
*/
|
||||
JavaPairRDD<HoodieKey, HoodieRecordLocation> findMatchingFilesForRecordKeys(
|
||||
final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo,
|
||||
JavaPairRDD<String, String> partitionRecordKeyPairRDD, int shuffleParallelism, HoodieTable hoodieTable,
|
||||
Map<String, Long> fileGroupToComparisons) {
|
||||
JavaRDD<Tuple2<String, HoodieKey>> fileComparisonsRDD =
|
||||
explodeRecordRDDWithFileComparisons(partitionToFileIndexInfo, partitionRecordKeyPairRDD);
|
||||
|
||||
if (config.useBloomIndexBucketizedChecking()) {
|
||||
Partitioner partitioner = new BucketizedBloomCheckPartitioner(shuffleParallelism, fileGroupToComparisons,
|
||||
config.getBloomIndexKeysPerBucket());
|
||||
|
||||
fileComparisonsRDD = fileComparisonsRDD.mapToPair(t -> new Tuple2<>(Pair.of(t._1, t._2.getRecordKey()), t))
|
||||
.repartitionAndSortWithinPartitions(partitioner).map(Tuple2::_2);
|
||||
} else {
|
||||
fileComparisonsRDD = fileComparisonsRDD.sortBy(Tuple2::_1, true, shuffleParallelism);
|
||||
}
|
||||
|
||||
return fileComparisonsRDD.mapPartitionsWithIndex(new HoodieBloomIndexCheckFunction(hoodieTable, config), true)
|
||||
.flatMap(List::iterator).filter(lr -> lr.getMatchingRecordKeys().size() > 0)
|
||||
.flatMapToPair(lookupResult -> lookupResult.getMatchingRecordKeys().stream()
|
||||
.map(recordKey -> new Tuple2<>(new HoodieKey(recordKey, lookupResult.getPartitionPath()),
|
||||
new HoodieRecordLocation(lookupResult.getBaseInstantTime(), lookupResult.getFileId())))
|
||||
.collect(Collectors.toList()).iterator());
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Tag the <rowKey, filename> back to the original HoodieRecord RDD.
|
||||
*/
|
||||
protected JavaRDD<HoodieRecord<T>> tagLocationBacktoRecords(
|
||||
JavaPairRDD<HoodieKey, HoodieRecordLocation> keyFilenamePairRDD, JavaRDD<HoodieRecord<T>> recordRDD) {
|
||||
JavaPairRDD<HoodieKey, HoodieRecord<T>> keyRecordPairRDD =
|
||||
recordRDD.mapToPair(record -> new Tuple2<>(record.getKey(), record));
|
||||
// Here as the recordRDD might have more data than rowKeyRDD (some rowKeys' fileId is null),
|
||||
// so we do left outer join.
|
||||
return keyRecordPairRDD.leftOuterJoin(keyFilenamePairRDD).values()
|
||||
.map(v1 -> HoodieIndexUtils.getTaggedRecord(v1._1, Option.ofNullable(v1._2.orNull())));
|
||||
}
|
||||
|
||||
@Override
|
||||
public JavaRDD<WriteStatus> updateLocation(JavaRDD<WriteStatus> writeStatusRDD, HoodieEngineContext context,
|
||||
HoodieTable<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> hoodieTable) {
|
||||
return writeStatusRDD;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,150 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.index.bloom;
|
||||
|
||||
import org.apache.hudi.client.common.HoodieEngineContext;
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.model.EmptyHoodieRecordPayload;
|
||||
import org.apache.hudi.common.model.HoodieKey;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieRecordLocation;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.common.table.HoodieTableMetaClient;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.exception.HoodieIOException;
|
||||
import org.apache.hudi.index.HoodieIndexUtils;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
|
||||
import org.apache.spark.api.java.JavaPairRDD;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.Optional;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import scala.Tuple2;
|
||||
|
||||
/**
|
||||
* This filter will only work with hoodie table since it will only load partitions with .hoodie_partition_metadata
|
||||
* file in it.
|
||||
*/
|
||||
public class SparkHoodieGlobalBloomIndex<T extends HoodieRecordPayload> extends SparkHoodieBloomIndex<T> {
|
||||
|
||||
public SparkHoodieGlobalBloomIndex(HoodieWriteConfig config) {
|
||||
super(config);
|
||||
}
|
||||
|
||||
/**
|
||||
* Load all involved files as <Partition, filename> pair RDD from all partitions in the table.
|
||||
*/
|
||||
@Override
|
||||
List<Tuple2<String, BloomIndexFileInfo>> loadInvolvedFiles(List<String> partitions, final HoodieEngineContext context,
|
||||
final HoodieTable hoodieTable) {
|
||||
HoodieTableMetaClient metaClient = hoodieTable.getMetaClient();
|
||||
try {
|
||||
List<String> allPartitionPaths = FSUtils.getAllPartitionPaths(metaClient.getFs(), metaClient.getBasePath(),
|
||||
config.shouldAssumeDatePartitioning());
|
||||
return super.loadInvolvedFiles(allPartitionPaths, context, hoodieTable);
|
||||
} catch (IOException e) {
|
||||
throw new HoodieIOException("Failed to load all partitions", e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* For each incoming record, produce N output records, 1 each for each file against which the record's key needs to be
|
||||
* checked. For tables, where the keys have a definite insert order (e.g: timestamp as prefix), the number of files
|
||||
* to be compared gets cut down a lot from range pruning.
|
||||
* <p>
|
||||
* Sub-partition to ensure the records can be looked up against files & also prune file<=>record comparisons based on
|
||||
* recordKey ranges in the index info. the partition path of the incoming record (partitionRecordKeyPairRDD._2()) will
|
||||
* be ignored since the search scope should be bigger than that
|
||||
*/
|
||||
|
||||
@Override
|
||||
JavaRDD<Tuple2<String, HoodieKey>> explodeRecordRDDWithFileComparisons(
|
||||
final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo,
|
||||
JavaPairRDD<String, String> partitionRecordKeyPairRDD) {
|
||||
|
||||
IndexFileFilter indexFileFilter =
|
||||
config.useBloomIndexTreebasedFilter() ? new IntervalTreeBasedGlobalIndexFileFilter(partitionToFileIndexInfo)
|
||||
: new ListBasedGlobalIndexFileFilter(partitionToFileIndexInfo);
|
||||
|
||||
return partitionRecordKeyPairRDD.map(partitionRecordKeyPair -> {
|
||||
String recordKey = partitionRecordKeyPair._2();
|
||||
String partitionPath = partitionRecordKeyPair._1();
|
||||
|
||||
return indexFileFilter.getMatchingFilesAndPartition(partitionPath, recordKey).stream()
|
||||
.map(partitionFileIdPair -> new Tuple2<>(partitionFileIdPair.getRight(),
|
||||
new HoodieKey(recordKey, partitionFileIdPair.getLeft())))
|
||||
.collect(Collectors.toList());
|
||||
}).flatMap(List::iterator);
|
||||
}
|
||||
|
||||
/**
|
||||
* Tagging for global index should only consider the record key.
|
||||
*/
|
||||
@Override
|
||||
protected JavaRDD<HoodieRecord<T>> tagLocationBacktoRecords(
|
||||
JavaPairRDD<HoodieKey, HoodieRecordLocation> keyLocationPairRDD, JavaRDD<HoodieRecord<T>> recordRDD) {
|
||||
|
||||
JavaPairRDD<String, HoodieRecord<T>> incomingRowKeyRecordPairRDD =
|
||||
recordRDD.mapToPair(record -> new Tuple2<>(record.getRecordKey(), record));
|
||||
|
||||
JavaPairRDD<String, Tuple2<HoodieRecordLocation, HoodieKey>> existingRecordKeyToRecordLocationHoodieKeyMap =
|
||||
keyLocationPairRDD.mapToPair(p -> new Tuple2<>(p._1.getRecordKey(), new Tuple2<>(p._2, p._1)));
|
||||
|
||||
// Here as the recordRDD might have more data than rowKeyRDD (some rowKeys' fileId is null), so we do left outer join.
|
||||
return incomingRowKeyRecordPairRDD.leftOuterJoin(existingRecordKeyToRecordLocationHoodieKeyMap).values().flatMap(record -> {
|
||||
final HoodieRecord<T> hoodieRecord = record._1;
|
||||
final Optional<Tuple2<HoodieRecordLocation, HoodieKey>> recordLocationHoodieKeyPair = record._2;
|
||||
if (recordLocationHoodieKeyPair.isPresent()) {
|
||||
// Record key matched to file
|
||||
if (config.getBloomIndexUpdatePartitionPath()
|
||||
&& !recordLocationHoodieKeyPair.get()._2.getPartitionPath().equals(hoodieRecord.getPartitionPath())) {
|
||||
// Create an empty record to delete the record in the old partition
|
||||
HoodieRecord<T> deleteRecord = new HoodieRecord(recordLocationHoodieKeyPair.get()._2,
|
||||
new EmptyHoodieRecordPayload());
|
||||
deleteRecord.setCurrentLocation(recordLocationHoodieKeyPair.get()._1());
|
||||
deleteRecord.seal();
|
||||
// Tag the incoming record for inserting to the new partition
|
||||
HoodieRecord<T> insertRecord = HoodieIndexUtils.getTaggedRecord(hoodieRecord, Option.empty());
|
||||
return Arrays.asList(deleteRecord, insertRecord).iterator();
|
||||
} else {
|
||||
// Ignore the incoming record's partition, regardless of whether it differs from its old partition or not.
|
||||
// When it differs, the record will still be updated at its old partition.
|
||||
return Collections.singletonList(
|
||||
(HoodieRecord<T>) HoodieIndexUtils.getTaggedRecord(new HoodieRecord<>(recordLocationHoodieKeyPair.get()._2, hoodieRecord.getData()),
|
||||
Option.ofNullable(recordLocationHoodieKeyPair.get()._1))).iterator();
|
||||
}
|
||||
} else {
|
||||
return Collections.singletonList((HoodieRecord<T>) HoodieIndexUtils.getTaggedRecord(hoodieRecord, Option.empty())).iterator();
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isGlobal() {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,495 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.index.hbase;
|
||||
|
||||
import org.apache.hudi.client.WriteStatus;
|
||||
import org.apache.hudi.client.common.HoodieEngineContext;
|
||||
import org.apache.hudi.client.common.HoodieSparkEngineContext;
|
||||
import org.apache.hudi.client.utils.SparkMemoryUtils;
|
||||
import org.apache.hudi.common.model.HoodieKey;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieRecordLocation;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.common.table.HoodieTableMetaClient;
|
||||
import org.apache.hudi.common.table.timeline.HoodieTimeline;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.common.util.ReflectionUtils;
|
||||
import org.apache.hudi.config.HoodieHBaseIndexConfig;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.exception.HoodieDependentSystemUnavailableException;
|
||||
import org.apache.hudi.exception.HoodieIndexException;
|
||||
import org.apache.hudi.index.SparkHoodieIndex;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.hbase.HBaseConfiguration;
|
||||
import org.apache.hadoop.hbase.HRegionLocation;
|
||||
import org.apache.hadoop.hbase.TableName;
|
||||
import org.apache.hadoop.hbase.client.BufferedMutator;
|
||||
import org.apache.hadoop.hbase.client.Connection;
|
||||
import org.apache.hadoop.hbase.client.ConnectionFactory;
|
||||
import org.apache.hadoop.hbase.client.Delete;
|
||||
import org.apache.hadoop.hbase.client.Get;
|
||||
import org.apache.hadoop.hbase.client.HTable;
|
||||
import org.apache.hadoop.hbase.client.Mutation;
|
||||
import org.apache.hadoop.hbase.client.Put;
|
||||
import org.apache.hadoop.hbase.client.RegionLocator;
|
||||
import org.apache.hadoop.hbase.client.Result;
|
||||
import org.apache.hadoop.hbase.util.Bytes;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaPairRDD;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.api.java.function.Function2;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
import scala.Tuple2;
|
||||
|
||||
/**
|
||||
* Hoodie Index implementation backed by HBase.
|
||||
*/
|
||||
public class SparkHoodieHBaseIndex<T extends HoodieRecordPayload> extends SparkHoodieIndex<T> {
|
||||
|
||||
public static final String DEFAULT_SPARK_EXECUTOR_INSTANCES_CONFIG_NAME = "spark.executor.instances";
|
||||
public static final String DEFAULT_SPARK_DYNAMIC_ALLOCATION_ENABLED_CONFIG_NAME = "spark.dynamicAllocation.enabled";
|
||||
public static final String DEFAULT_SPARK_DYNAMIC_ALLOCATION_MAX_EXECUTORS_CONFIG_NAME =
|
||||
"spark.dynamicAllocation.maxExecutors";
|
||||
|
||||
private static final byte[] SYSTEM_COLUMN_FAMILY = Bytes.toBytes("_s");
|
||||
private static final byte[] COMMIT_TS_COLUMN = Bytes.toBytes("commit_ts");
|
||||
private static final byte[] FILE_NAME_COLUMN = Bytes.toBytes("file_name");
|
||||
private static final byte[] PARTITION_PATH_COLUMN = Bytes.toBytes("partition_path");
|
||||
private static final int SLEEP_TIME_MILLISECONDS = 100;
|
||||
|
||||
private static final Logger LOG = LogManager.getLogger(SparkHoodieHBaseIndex.class);
|
||||
private static Connection hbaseConnection = null;
|
||||
private HBaseIndexQPSResourceAllocator hBaseIndexQPSResourceAllocator = null;
|
||||
private float qpsFraction;
|
||||
private int maxQpsPerRegionServer;
|
||||
/**
|
||||
* multiPutBatchSize will be computed and re-set in updateLocation if
|
||||
* {@link HoodieHBaseIndexConfig#HBASE_PUT_BATCH_SIZE_AUTO_COMPUTE_PROP} is set to true.
|
||||
*/
|
||||
private Integer multiPutBatchSize;
|
||||
private Integer numRegionServersForTable;
|
||||
private final String tableName;
|
||||
private HBasePutBatchSizeCalculator putBatchSizeCalculator;
|
||||
|
||||
public SparkHoodieHBaseIndex(HoodieWriteConfig config) {
|
||||
super(config);
|
||||
this.tableName = config.getHbaseTableName();
|
||||
addShutDownHook();
|
||||
init(config);
|
||||
}
|
||||
|
||||
private void init(HoodieWriteConfig config) {
|
||||
this.multiPutBatchSize = config.getHbaseIndexGetBatchSize();
|
||||
this.qpsFraction = config.getHbaseIndexQPSFraction();
|
||||
this.maxQpsPerRegionServer = config.getHbaseIndexMaxQPSPerRegionServer();
|
||||
this.putBatchSizeCalculator = new HBasePutBatchSizeCalculator();
|
||||
this.hBaseIndexQPSResourceAllocator = createQPSResourceAllocator(this.config);
|
||||
}
|
||||
|
||||
public HBaseIndexQPSResourceAllocator createQPSResourceAllocator(HoodieWriteConfig config) {
|
||||
try {
|
||||
LOG.info("createQPSResourceAllocator :" + config.getHBaseQPSResourceAllocatorClass());
|
||||
return (HBaseIndexQPSResourceAllocator) ReflectionUtils
|
||||
.loadClass(config.getHBaseQPSResourceAllocatorClass(), config);
|
||||
} catch (Exception e) {
|
||||
LOG.warn("error while instantiating HBaseIndexQPSResourceAllocator", e);
|
||||
}
|
||||
return new DefaultHBaseQPSResourceAllocator(config);
|
||||
}
|
||||
|
||||
private Connection getHBaseConnection() {
|
||||
Configuration hbaseConfig = HBaseConfiguration.create();
|
||||
String quorum = config.getHbaseZkQuorum();
|
||||
hbaseConfig.set("hbase.zookeeper.quorum", quorum);
|
||||
String zkZnodeParent = config.getHBaseZkZnodeParent();
|
||||
if (zkZnodeParent != null) {
|
||||
hbaseConfig.set("zookeeper.znode.parent", zkZnodeParent);
|
||||
}
|
||||
String port = String.valueOf(config.getHbaseZkPort());
|
||||
hbaseConfig.set("hbase.zookeeper.property.clientPort", port);
|
||||
try {
|
||||
return ConnectionFactory.createConnection(hbaseConfig);
|
||||
} catch (IOException e) {
|
||||
throw new HoodieDependentSystemUnavailableException(HoodieDependentSystemUnavailableException.HBASE,
|
||||
quorum + ":" + port);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Since we are sharing the HBaseConnection across tasks in a JVM, make sure the HBaseConnection is closed when JVM
|
||||
* exits.
|
||||
*/
|
||||
private void addShutDownHook() {
|
||||
Runtime.getRuntime().addShutdownHook(new Thread(() -> {
|
||||
try {
|
||||
hbaseConnection.close();
|
||||
} catch (Exception e) {
|
||||
// fail silently for any sort of exception
|
||||
}
|
||||
}));
|
||||
}
|
||||
|
||||
/**
|
||||
* Ensure that any resources used for indexing are released here.
|
||||
*/
|
||||
@Override
|
||||
public void close() {
|
||||
this.hBaseIndexQPSResourceAllocator.releaseQPSResources();
|
||||
}
|
||||
|
||||
private Get generateStatement(String key) throws IOException {
|
||||
return new Get(Bytes.toBytes(key)).setMaxVersions(1).addColumn(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN)
|
||||
.addColumn(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN).addColumn(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN);
|
||||
}
|
||||
|
||||
private boolean checkIfValidCommit(HoodieTableMetaClient metaClient, String commitTs) {
|
||||
HoodieTimeline commitTimeline = metaClient.getCommitsTimeline().filterCompletedInstants();
|
||||
// Check if the last commit ts for this row is 1) present in the timeline or
|
||||
// 2) is less than the first commit ts in the timeline
|
||||
return !commitTimeline.empty()
|
||||
&& commitTimeline.containsOrBeforeTimelineStarts(commitTs);
|
||||
}
|
||||
|
||||
/**
|
||||
* Function that tags each HoodieRecord with an existing location, if known.
|
||||
*/
|
||||
private Function2<Integer, Iterator<HoodieRecord<T>>, Iterator<HoodieRecord<T>>> locationTagFunction(
|
||||
HoodieTableMetaClient metaClient) {
|
||||
|
||||
return (Function2<Integer, Iterator<HoodieRecord<T>>, Iterator<HoodieRecord<T>>>) (partitionNum,
|
||||
hoodieRecordIterator) -> {
|
||||
|
||||
int multiGetBatchSize = config.getHbaseIndexGetBatchSize();
|
||||
|
||||
// Grab the global HBase connection
|
||||
synchronized (SparkHoodieHBaseIndex.class) {
|
||||
if (hbaseConnection == null || hbaseConnection.isClosed()) {
|
||||
hbaseConnection = getHBaseConnection();
|
||||
}
|
||||
}
|
||||
List<HoodieRecord<T>> taggedRecords = new ArrayList<>();
|
||||
try (HTable hTable = (HTable) hbaseConnection.getTable(TableName.valueOf(tableName))) {
|
||||
List<Get> statements = new ArrayList<>();
|
||||
List<HoodieRecord> currentBatchOfRecords = new LinkedList<>();
|
||||
// Do the tagging.
|
||||
while (hoodieRecordIterator.hasNext()) {
|
||||
HoodieRecord rec = hoodieRecordIterator.next();
|
||||
statements.add(generateStatement(rec.getRecordKey()));
|
||||
currentBatchOfRecords.add(rec);
|
||||
// iterator till we reach batch size
|
||||
if (statements.size() >= multiGetBatchSize || !hoodieRecordIterator.hasNext()) {
|
||||
// get results for batch from Hbase
|
||||
Result[] results = doGet(hTable, statements);
|
||||
// clear statements to be GC'd
|
||||
statements.clear();
|
||||
for (Result result : results) {
|
||||
// first, attempt to grab location from HBase
|
||||
HoodieRecord currentRecord = currentBatchOfRecords.remove(0);
|
||||
if (result.getRow() != null) {
|
||||
String keyFromResult = Bytes.toString(result.getRow());
|
||||
String commitTs = Bytes.toString(result.getValue(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN));
|
||||
String fileId = Bytes.toString(result.getValue(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN));
|
||||
String partitionPath = Bytes.toString(result.getValue(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN));
|
||||
|
||||
if (checkIfValidCommit(metaClient, commitTs)) {
|
||||
currentRecord = new HoodieRecord(new HoodieKey(currentRecord.getRecordKey(), partitionPath),
|
||||
currentRecord.getData());
|
||||
currentRecord.unseal();
|
||||
currentRecord.setCurrentLocation(new HoodieRecordLocation(commitTs, fileId));
|
||||
currentRecord.seal();
|
||||
taggedRecords.add(currentRecord);
|
||||
// the key from Result and the key being processed should be same
|
||||
assert (currentRecord.getRecordKey().contentEquals(keyFromResult));
|
||||
} else { // if commit is invalid, treat this as a new taggedRecord
|
||||
taggedRecords.add(currentRecord);
|
||||
}
|
||||
} else {
|
||||
taggedRecords.add(currentRecord);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw new HoodieIndexException("Failed to Tag indexed locations because of exception with HBase Client", e);
|
||||
}
|
||||
return taggedRecords.iterator();
|
||||
};
|
||||
}
|
||||
|
||||
private Result[] doGet(HTable hTable, List<Get> keys) throws IOException {
|
||||
sleepForTime(SLEEP_TIME_MILLISECONDS);
|
||||
return hTable.get(keys);
|
||||
}
|
||||
|
||||
@Override
|
||||
public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> recordRDD,
|
||||
HoodieEngineContext context,
|
||||
HoodieTable<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> hoodieTable) {
|
||||
return recordRDD.mapPartitionsWithIndex(locationTagFunction(hoodieTable.getMetaClient()), true);
|
||||
}
|
||||
|
||||
private Function2<Integer, Iterator<WriteStatus>, Iterator<WriteStatus>> updateLocationFunction() {
|
||||
|
||||
return (Function2<Integer, Iterator<WriteStatus>, Iterator<WriteStatus>>) (partition, statusIterator) -> {
|
||||
|
||||
List<WriteStatus> writeStatusList = new ArrayList<>();
|
||||
// Grab the global HBase connection
|
||||
synchronized (SparkHoodieHBaseIndex.class) {
|
||||
if (hbaseConnection == null || hbaseConnection.isClosed()) {
|
||||
hbaseConnection = getHBaseConnection();
|
||||
}
|
||||
}
|
||||
try (BufferedMutator mutator = hbaseConnection.getBufferedMutator(TableName.valueOf(tableName))) {
|
||||
while (statusIterator.hasNext()) {
|
||||
WriteStatus writeStatus = statusIterator.next();
|
||||
List<Mutation> mutations = new ArrayList<>();
|
||||
try {
|
||||
for (HoodieRecord rec : writeStatus.getWrittenRecords()) {
|
||||
if (!writeStatus.isErrored(rec.getKey())) {
|
||||
Option<HoodieRecordLocation> loc = rec.getNewLocation();
|
||||
if (loc.isPresent()) {
|
||||
if (rec.getCurrentLocation() != null) {
|
||||
// This is an update, no need to update index
|
||||
continue;
|
||||
}
|
||||
Put put = new Put(Bytes.toBytes(rec.getRecordKey()));
|
||||
put.addColumn(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN, Bytes.toBytes(loc.get().getInstantTime()));
|
||||
put.addColumn(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN, Bytes.toBytes(loc.get().getFileId()));
|
||||
put.addColumn(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN, Bytes.toBytes(rec.getPartitionPath()));
|
||||
mutations.add(put);
|
||||
} else {
|
||||
// Delete existing index for a deleted record
|
||||
Delete delete = new Delete(Bytes.toBytes(rec.getRecordKey()));
|
||||
mutations.add(delete);
|
||||
}
|
||||
}
|
||||
if (mutations.size() < multiPutBatchSize) {
|
||||
continue;
|
||||
}
|
||||
doMutations(mutator, mutations);
|
||||
}
|
||||
// process remaining puts and deletes, if any
|
||||
doMutations(mutator, mutations);
|
||||
} catch (Exception e) {
|
||||
Exception we = new Exception("Error updating index for " + writeStatus, e);
|
||||
LOG.error(we);
|
||||
writeStatus.setGlobalError(we);
|
||||
}
|
||||
writeStatusList.add(writeStatus);
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw new HoodieIndexException("Failed to Update Index locations because of exception with HBase Client", e);
|
||||
}
|
||||
return writeStatusList.iterator();
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Helper method to facilitate performing mutations (including puts and deletes) in Hbase.
|
||||
*/
|
||||
private void doMutations(BufferedMutator mutator, List<Mutation> mutations) throws IOException {
|
||||
if (mutations.isEmpty()) {
|
||||
return;
|
||||
}
|
||||
mutator.mutate(mutations);
|
||||
mutator.flush();
|
||||
mutations.clear();
|
||||
sleepForTime(SLEEP_TIME_MILLISECONDS);
|
||||
}
|
||||
|
||||
private static void sleepForTime(int sleepTimeMs) {
|
||||
try {
|
||||
Thread.sleep(sleepTimeMs);
|
||||
} catch (InterruptedException e) {
|
||||
LOG.error("Sleep interrupted during throttling", e);
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public JavaRDD<WriteStatus> updateLocation(JavaRDD<WriteStatus> writeStatusRDD, HoodieEngineContext context,
|
||||
HoodieTable<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> hoodieTable) {
|
||||
final HBaseIndexQPSResourceAllocator hBaseIndexQPSResourceAllocator = createQPSResourceAllocator(this.config);
|
||||
setPutBatchSize(writeStatusRDD, hBaseIndexQPSResourceAllocator, context);
|
||||
LOG.info("multiPutBatchSize: before hbase puts" + multiPutBatchSize);
|
||||
JavaRDD<WriteStatus> writeStatusJavaRDD = writeStatusRDD.mapPartitionsWithIndex(updateLocationFunction(), true);
|
||||
// caching the index updated status RDD
|
||||
writeStatusJavaRDD = writeStatusJavaRDD.persist(SparkMemoryUtils.getWriteStatusStorageLevel(config.getProps()));
|
||||
return writeStatusJavaRDD;
|
||||
}
|
||||
|
||||
private void setPutBatchSize(JavaRDD<WriteStatus> writeStatusRDD,
|
||||
HBaseIndexQPSResourceAllocator hBaseIndexQPSResourceAllocator, final HoodieEngineContext context) {
|
||||
if (config.getHbaseIndexPutBatchSizeAutoCompute()) {
|
||||
JavaSparkContext jsc = HoodieSparkEngineContext.getSparkContext(context);
|
||||
SparkConf conf = jsc.getConf();
|
||||
int maxExecutors = conf.getInt(DEFAULT_SPARK_EXECUTOR_INSTANCES_CONFIG_NAME, 1);
|
||||
if (conf.getBoolean(DEFAULT_SPARK_DYNAMIC_ALLOCATION_ENABLED_CONFIG_NAME, false)) {
|
||||
maxExecutors =
|
||||
Math.max(maxExecutors, conf.getInt(DEFAULT_SPARK_DYNAMIC_ALLOCATION_MAX_EXECUTORS_CONFIG_NAME, 1));
|
||||
}
|
||||
|
||||
/*
|
||||
* Each writeStatus represents status information from a write done in one of the IOHandles. If a writeStatus has
|
||||
* any insert, it implies that the corresponding task contacts HBase for doing puts, since we only do puts for
|
||||
* inserts from HBaseIndex.
|
||||
*/
|
||||
final Tuple2<Long, Integer> numPutsParallelismTuple = getHBasePutAccessParallelism(writeStatusRDD);
|
||||
final long numPuts = numPutsParallelismTuple._1;
|
||||
final int hbasePutsParallelism = numPutsParallelismTuple._2;
|
||||
this.numRegionServersForTable = getNumRegionServersAliveForTable();
|
||||
final float desiredQPSFraction =
|
||||
hBaseIndexQPSResourceAllocator.calculateQPSFractionForPutsTime(numPuts, this.numRegionServersForTable);
|
||||
LOG.info("Desired QPSFraction :" + desiredQPSFraction);
|
||||
LOG.info("Number HBase puts :" + numPuts);
|
||||
LOG.info("Hbase Puts Parallelism :" + hbasePutsParallelism);
|
||||
final float availableQpsFraction =
|
||||
hBaseIndexQPSResourceAllocator.acquireQPSResources(desiredQPSFraction, numPuts);
|
||||
LOG.info("Allocated QPS Fraction :" + availableQpsFraction);
|
||||
multiPutBatchSize = putBatchSizeCalculator.getBatchSize(numRegionServersForTable, maxQpsPerRegionServer,
|
||||
hbasePutsParallelism, maxExecutors, SLEEP_TIME_MILLISECONDS, availableQpsFraction);
|
||||
LOG.info("multiPutBatchSize :" + multiPutBatchSize);
|
||||
}
|
||||
}
|
||||
|
||||
public Tuple2<Long, Integer> getHBasePutAccessParallelism(final JavaRDD<WriteStatus> writeStatusRDD) {
|
||||
final JavaPairRDD<Long, Integer> insertOnlyWriteStatusRDD = writeStatusRDD
|
||||
.filter(w -> w.getStat().getNumInserts() > 0).mapToPair(w -> new Tuple2<>(w.getStat().getNumInserts(), 1));
|
||||
return insertOnlyWriteStatusRDD.fold(new Tuple2<>(0L, 0), (w, c) -> new Tuple2<>(w._1 + c._1, w._2 + c._2));
|
||||
}
|
||||
|
||||
public static class HBasePutBatchSizeCalculator implements Serializable {
|
||||
|
||||
private static final int MILLI_SECONDS_IN_A_SECOND = 1000;
|
||||
private static final Logger LOG = LogManager.getLogger(HBasePutBatchSizeCalculator.class);
|
||||
|
||||
/**
|
||||
* Calculate putBatch size so that sum of requests across multiple jobs in a second does not exceed
|
||||
* maxQpsPerRegionServer for each Region Server. Multiplying qpsFraction to reduce the aggregate load on common RS
|
||||
* across topics. Assumption here is that all tables have regions across all RS, which is not necessarily true for
|
||||
* smaller tables. So, they end up getting a smaller share of QPS than they deserve, but it might be ok.
|
||||
* <p>
|
||||
* Example: int putBatchSize = batchSizeCalculator.getBatchSize(10, 16667, 1200, 200, 100, 0.1f)
|
||||
* </p>
|
||||
* <p>
|
||||
* Expected batchSize is 8 because in that case, total request sent to a Region Server in one second is:
|
||||
*
|
||||
* 8 (batchSize) * 200 (parallelism) * 10 (maxReqsInOneSecond) * 10 (numRegionServers) * 0.1 (qpsFraction)) =>
|
||||
* 16000. We assume requests get distributed to Region Servers uniformly, so each RS gets 1600 requests which
|
||||
* happens to be 10% of 16667 (maxQPSPerRegionServer), as expected.
|
||||
* </p>
|
||||
* <p>
|
||||
* Assumptions made here
|
||||
* <li>In a batch, writes get evenly distributed to each RS for that table. Since we do writes only in the case of
|
||||
* inserts and not updates, for this assumption to fail, inserts would have to be skewed towards few RS, likelihood
|
||||
* of which is less if Hbase table is pre-split and rowKeys are UUIDs (random strings). If this assumption fails,
|
||||
* then it is possible for some RS to receive more than maxQpsPerRegionServer QPS, but for simplicity, we are going
|
||||
* ahead with this model, since this is meant to be a lightweight distributed throttling mechanism without
|
||||
* maintaining a global context. So if this assumption breaks, we are hoping the HBase Master relocates hot-spot
|
||||
* regions to new Region Servers.
|
||||
*
|
||||
* </li>
|
||||
* <li>For Region Server stability, throttling at a second level granularity is fine. Although, within a second, the
|
||||
* sum of queries might be within maxQpsPerRegionServer, there could be peaks at some sub second intervals. So, the
|
||||
* assumption is that these peaks are tolerated by the Region Server (which at max can be maxQpsPerRegionServer).
|
||||
* </li>
|
||||
* </p>
|
||||
*/
|
||||
public int getBatchSize(int numRegionServersForTable, int maxQpsPerRegionServer, int numTasksDuringPut,
|
||||
int maxExecutors, int sleepTimeMs, float qpsFraction) {
|
||||
int maxReqPerSec = (int) (qpsFraction * numRegionServersForTable * maxQpsPerRegionServer);
|
||||
int maxParallelPuts = Math.max(1, Math.min(numTasksDuringPut, maxExecutors));
|
||||
int maxReqsSentPerTaskPerSec = MILLI_SECONDS_IN_A_SECOND / sleepTimeMs;
|
||||
int multiPutBatchSize = Math.max(1, maxReqPerSec / (maxParallelPuts * maxReqsSentPerTaskPerSec));
|
||||
LOG.info("HbaseIndexThrottling: qpsFraction :" + qpsFraction);
|
||||
LOG.info("HbaseIndexThrottling: numRSAlive :" + numRegionServersForTable);
|
||||
LOG.info("HbaseIndexThrottling: maxReqPerSec :" + maxReqPerSec);
|
||||
LOG.info("HbaseIndexThrottling: numTasks :" + numTasksDuringPut);
|
||||
LOG.info("HbaseIndexThrottling: maxExecutors :" + maxExecutors);
|
||||
LOG.info("HbaseIndexThrottling: maxParallelPuts :" + maxParallelPuts);
|
||||
LOG.info("HbaseIndexThrottling: maxReqsSentPerTaskPerSec :" + maxReqsSentPerTaskPerSec);
|
||||
LOG.info("HbaseIndexThrottling: numRegionServersForTable :" + numRegionServersForTable);
|
||||
LOG.info("HbaseIndexThrottling: multiPutBatchSize :" + multiPutBatchSize);
|
||||
return multiPutBatchSize;
|
||||
}
|
||||
}
|
||||
|
||||
private Integer getNumRegionServersAliveForTable() {
|
||||
// This is being called in the driver, so there is only one connection
|
||||
// from the driver, so ok to use a local connection variable.
|
||||
if (numRegionServersForTable == null) {
|
||||
try (Connection conn = getHBaseConnection()) {
|
||||
RegionLocator regionLocator = conn.getRegionLocator(TableName.valueOf(tableName));
|
||||
numRegionServersForTable = Math
|
||||
.toIntExact(regionLocator.getAllRegionLocations().stream().map(HRegionLocation::getServerName).distinct().count());
|
||||
return numRegionServersForTable;
|
||||
} catch (IOException e) {
|
||||
LOG.error(e);
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
return numRegionServersForTable;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean rollbackCommit(String instantTime) {
|
||||
// Rollback in HbaseIndex is managed via method {@link #checkIfValidCommit()}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Only looks up by recordKey.
|
||||
*/
|
||||
@Override
|
||||
public boolean isGlobal() {
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Mapping is available in HBase already.
|
||||
*/
|
||||
@Override
|
||||
public boolean canIndexLogFiles() {
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Index needs to be explicitly updated after storage write.
|
||||
*/
|
||||
@Override
|
||||
public boolean isImplicitWithStorage() {
|
||||
return false;
|
||||
}
|
||||
|
||||
public void setHbaseConnection(Connection hbaseConnection) {
|
||||
SparkHoodieHBaseIndex.hbaseConnection = hbaseConnection;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,160 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.index.simple;
|
||||
|
||||
import org.apache.hudi.client.WriteStatus;
|
||||
import org.apache.hudi.client.common.HoodieEngineContext;
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.model.EmptyHoodieRecordPayload;
|
||||
import org.apache.hudi.common.model.HoodieBaseFile;
|
||||
import org.apache.hudi.common.model.HoodieKey;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieRecordLocation;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.common.table.HoodieTableMetaClient;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.common.util.collection.Pair;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.exception.HoodieIOException;
|
||||
import org.apache.hudi.index.HoodieIndexUtils;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
import org.apache.spark.api.java.JavaPairRDD;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import scala.Tuple2;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
import static org.apache.hudi.index.HoodieIndexUtils.getLatestBaseFilesForAllPartitions;
|
||||
|
||||
/**
|
||||
* A global simple index which reads interested fields(record key and partition path) from base files and
|
||||
* joins with incoming records to find the tagged location.
|
||||
*
|
||||
* @param <T>
|
||||
*/
|
||||
@SuppressWarnings("checkstyle:LineLength")
|
||||
public class SparkHoodieGlobalSimpleIndex<T extends HoodieRecordPayload> extends SparkHoodieSimpleIndex<T> {
|
||||
|
||||
public SparkHoodieGlobalSimpleIndex(HoodieWriteConfig config) {
|
||||
super(config);
|
||||
}
|
||||
|
||||
@Override
|
||||
public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> recordRDD, HoodieEngineContext context,
|
||||
HoodieTable<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> hoodieTable) {
|
||||
return tagLocationInternal(recordRDD, context, hoodieTable);
|
||||
}
|
||||
|
||||
/**
|
||||
* Tags records location for incoming records.
|
||||
*
|
||||
* @param inputRecordRDD {@link JavaRDD} of incoming records
|
||||
* @param context instance of {@link HoodieEngineContext} to use
|
||||
* @param hoodieTable instance of {@link HoodieTable} to use
|
||||
* @return {@link JavaRDD} of records with record locations set
|
||||
*/
|
||||
@Override
|
||||
protected JavaRDD<HoodieRecord<T>> tagLocationInternal(JavaRDD<HoodieRecord<T>> inputRecordRDD, HoodieEngineContext context,
|
||||
HoodieTable<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> hoodieTable) {
|
||||
|
||||
JavaPairRDD<String, HoodieRecord<T>> keyedInputRecordRDD = inputRecordRDD.mapToPair(entry -> new Tuple2<>(entry.getRecordKey(), entry));
|
||||
JavaPairRDD<HoodieKey, HoodieRecordLocation> allRecordLocationsInTable = fetchAllRecordLocations(context, hoodieTable,
|
||||
config.getGlobalSimpleIndexParallelism());
|
||||
return getTaggedRecords(keyedInputRecordRDD, allRecordLocationsInTable);
|
||||
}
|
||||
|
||||
/**
|
||||
* Fetch record locations for passed in {@link HoodieKey}s.
|
||||
*
|
||||
* @param context instance of {@link HoodieEngineContext} to use
|
||||
* @param hoodieTable instance of {@link HoodieTable} of interest
|
||||
* @param parallelism parallelism to use
|
||||
* @return {@link JavaPairRDD} of {@link HoodieKey} and {@link HoodieRecordLocation}
|
||||
*/
|
||||
protected JavaPairRDD<HoodieKey, HoodieRecordLocation> fetchAllRecordLocations(HoodieEngineContext context,
|
||||
HoodieTable<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> hoodieTable,
|
||||
int parallelism) {
|
||||
List<Pair<String, HoodieBaseFile>> latestBaseFiles = getAllBaseFilesInTable(context, hoodieTable);
|
||||
return fetchRecordLocations(context, hoodieTable, parallelism, latestBaseFiles);
|
||||
}
|
||||
|
||||
/**
|
||||
* Load all files for all partitions as <Partition, filename> pair RDD.
|
||||
*/
|
||||
protected List<Pair<String, HoodieBaseFile>> getAllBaseFilesInTable(final HoodieEngineContext context,
|
||||
final HoodieTable<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> hoodieTable) {
|
||||
HoodieTableMetaClient metaClient = hoodieTable.getMetaClient();
|
||||
try {
|
||||
List<String> allPartitionPaths = FSUtils.getAllPartitionPaths(metaClient.getFs(), metaClient.getBasePath(), config.shouldAssumeDatePartitioning());
|
||||
// Obtain the latest data files from all the partitions.
|
||||
return getLatestBaseFilesForAllPartitions(allPartitionPaths, context, hoodieTable);
|
||||
} catch (IOException e) {
|
||||
throw new HoodieIOException("Failed to load all partitions", e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Tag records with right {@link HoodieRecordLocation}.
|
||||
*
|
||||
* @param incomingRecords incoming {@link HoodieRecord}s
|
||||
* @param existingRecords existing records with {@link HoodieRecordLocation}s
|
||||
* @return {@link JavaRDD} of {@link HoodieRecord}s with tagged {@link HoodieRecordLocation}s
|
||||
*/
|
||||
private JavaRDD<HoodieRecord<T>> getTaggedRecords(JavaPairRDD<String, HoodieRecord<T>> incomingRecords, JavaPairRDD<HoodieKey, HoodieRecordLocation> existingRecords) {
|
||||
JavaPairRDD<String, Pair<String, HoodieRecordLocation>> existingRecordByRecordKey = existingRecords
|
||||
.mapToPair(entry -> new Tuple2<>(entry._1.getRecordKey(), Pair.of(entry._1.getPartitionPath(), entry._2)));
|
||||
|
||||
return incomingRecords.leftOuterJoin(existingRecordByRecordKey).values()
|
||||
.flatMap(entry -> {
|
||||
HoodieRecord<T> inputRecord = entry._1;
|
||||
Option<Pair<String, HoodieRecordLocation>> partitionPathLocationPair = Option.ofNullable(entry._2.orNull());
|
||||
List<HoodieRecord<T>> taggedRecords;
|
||||
|
||||
if (partitionPathLocationPair.isPresent()) {
|
||||
String partitionPath = partitionPathLocationPair.get().getKey();
|
||||
HoodieRecordLocation location = partitionPathLocationPair.get().getRight();
|
||||
if (config.getGlobalSimpleIndexUpdatePartitionPath() && !(inputRecord.getPartitionPath().equals(partitionPath))) {
|
||||
// Create an empty record to delete the record in the old partition
|
||||
HoodieRecord<T> deleteRecord = new HoodieRecord(new HoodieKey(inputRecord.getRecordKey(), partitionPath), new EmptyHoodieRecordPayload());
|
||||
deleteRecord.setCurrentLocation(location);
|
||||
deleteRecord.seal();
|
||||
// Tag the incoming record for inserting to the new partition
|
||||
HoodieRecord<T> insertRecord = (HoodieRecord<T>) HoodieIndexUtils.getTaggedRecord(inputRecord, Option.empty());
|
||||
taggedRecords = Arrays.asList(deleteRecord, insertRecord);
|
||||
} else {
|
||||
// Ignore the incoming record's partition, regardless of whether it differs from its old partition or not.
|
||||
// When it differs, the record will still be updated at its old partition.
|
||||
HoodieRecord<T> newRecord = new HoodieRecord<>(new HoodieKey(inputRecord.getRecordKey(), partitionPath), inputRecord.getData());
|
||||
taggedRecords = Collections.singletonList((HoodieRecord<T>) HoodieIndexUtils.getTaggedRecord(newRecord, Option.ofNullable(location)));
|
||||
}
|
||||
} else {
|
||||
taggedRecords = Collections.singletonList((HoodieRecord<T>) HoodieIndexUtils.getTaggedRecord(inputRecord, Option.empty()));
|
||||
}
|
||||
return taggedRecords.iterator();
|
||||
});
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isGlobal() {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,152 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.index.simple;
|
||||
|
||||
import org.apache.hudi.client.WriteStatus;
|
||||
import org.apache.hudi.client.utils.SparkMemoryUtils;
|
||||
import org.apache.hudi.client.common.HoodieEngineContext;
|
||||
import org.apache.hudi.client.common.HoodieSparkEngineContext;
|
||||
import org.apache.hudi.common.model.HoodieBaseFile;
|
||||
import org.apache.hudi.common.model.HoodieKey;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieRecordLocation;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.common.util.collection.Pair;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.index.HoodieIndexUtils;
|
||||
import org.apache.hudi.index.SparkHoodieIndex;
|
||||
import org.apache.hudi.io.HoodieKeyLocationFetchHandle;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
|
||||
import org.apache.spark.api.java.JavaPairRDD;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import scala.Tuple2;
|
||||
|
||||
import static org.apache.hudi.index.HoodieIndexUtils.getLatestBaseFilesForAllPartitions;
|
||||
|
||||
/**
|
||||
* A simple index which reads interested fields(record key and partition path) from base files and
|
||||
* joins with incoming records to find the tagged location.
|
||||
*
|
||||
* @param <T>
|
||||
*/
|
||||
@SuppressWarnings("checkstyle:LineLength")
|
||||
public class SparkHoodieSimpleIndex<T extends HoodieRecordPayload> extends SparkHoodieIndex<T> {
|
||||
|
||||
public SparkHoodieSimpleIndex(HoodieWriteConfig config) {
|
||||
super(config);
|
||||
}
|
||||
|
||||
@Override
|
||||
public JavaRDD<WriteStatus> updateLocation(JavaRDD<WriteStatus> writeStatusRDD, HoodieEngineContext context,
|
||||
HoodieTable<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> hoodieTable) {
|
||||
return writeStatusRDD;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean rollbackCommit(String commitTime) {
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isGlobal() {
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean canIndexLogFiles() {
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isImplicitWithStorage() {
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> recordRDD,
|
||||
HoodieEngineContext context,
|
||||
HoodieTable<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> hoodieTable) {
|
||||
return tagLocationInternal(recordRDD, context, hoodieTable);
|
||||
}
|
||||
|
||||
/**
|
||||
* Tags records location for incoming records.
|
||||
*
|
||||
* @param inputRecordRDD {@link JavaRDD} of incoming records
|
||||
* @param context instance of {@link HoodieEngineContext} to use
|
||||
* @param hoodieTable instance of {@link HoodieTable} to use
|
||||
* @return {@link JavaRDD} of records with record locations set
|
||||
*/
|
||||
protected JavaRDD<HoodieRecord<T>> tagLocationInternal(JavaRDD<HoodieRecord<T>> inputRecordRDD, HoodieEngineContext context,
|
||||
HoodieTable<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> hoodieTable) {
|
||||
if (config.getSimpleIndexUseCaching()) {
|
||||
inputRecordRDD.persist(SparkMemoryUtils.getSimpleIndexInputStorageLevel(config.getProps()));
|
||||
}
|
||||
|
||||
JavaPairRDD<HoodieKey, HoodieRecord<T>> keyedInputRecordRDD = inputRecordRDD.mapToPair(record -> new Tuple2<>(record.getKey(), record));
|
||||
JavaPairRDD<HoodieKey, HoodieRecordLocation> existingLocationsOnTable = fetchRecordLocationsForAffectedPartitions(keyedInputRecordRDD.keys(), context, hoodieTable,
|
||||
config.getSimpleIndexParallelism());
|
||||
|
||||
JavaRDD<HoodieRecord<T>> taggedRecordRDD = keyedInputRecordRDD.leftOuterJoin(existingLocationsOnTable)
|
||||
.map(entry -> {
|
||||
final HoodieRecord<T> untaggedRecord = entry._2._1;
|
||||
final Option<HoodieRecordLocation> location = Option.ofNullable(entry._2._2.orNull());
|
||||
return HoodieIndexUtils.getTaggedRecord(untaggedRecord, location);
|
||||
});
|
||||
|
||||
if (config.getSimpleIndexUseCaching()) {
|
||||
inputRecordRDD.unpersist();
|
||||
}
|
||||
return taggedRecordRDD;
|
||||
}
|
||||
|
||||
/**
|
||||
* Fetch record locations for passed in {@link HoodieKey}s.
|
||||
*
|
||||
* @param hoodieKeys {@link JavaRDD} of {@link HoodieKey}s for which locations are fetched
|
||||
* @param context instance of {@link HoodieEngineContext} to use
|
||||
* @param hoodieTable instance of {@link HoodieTable} of interest
|
||||
* @param parallelism parallelism to use
|
||||
* @return {@link JavaPairRDD} of {@link HoodieKey} and {@link HoodieRecordLocation}
|
||||
*/
|
||||
protected JavaPairRDD<HoodieKey, HoodieRecordLocation> fetchRecordLocationsForAffectedPartitions(JavaRDD<HoodieKey> hoodieKeys,
|
||||
HoodieEngineContext context,
|
||||
HoodieTable<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> hoodieTable,
|
||||
int parallelism) {
|
||||
List<String> affectedPartitionPathList = hoodieKeys.map(HoodieKey::getPartitionPath).distinct().collect();
|
||||
List<Pair<String, HoodieBaseFile>> latestBaseFiles = getLatestBaseFilesForAllPartitions(affectedPartitionPathList, context, hoodieTable);
|
||||
return fetchRecordLocations(context, hoodieTable, parallelism, latestBaseFiles);
|
||||
}
|
||||
|
||||
protected JavaPairRDD<HoodieKey, HoodieRecordLocation> fetchRecordLocations(HoodieEngineContext context,
|
||||
HoodieTable<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> hoodieTable,
|
||||
int parallelism,
|
||||
List<Pair<String, HoodieBaseFile>> baseFiles) {
|
||||
JavaSparkContext jsc = HoodieSparkEngineContext.getSparkContext(context);
|
||||
int fetchParallelism = Math.max(1, Math.max(baseFiles.size(), parallelism));
|
||||
return jsc.parallelize(baseFiles, fetchParallelism)
|
||||
.flatMapToPair(partitionPathBaseFile -> new HoodieKeyLocationFetchHandle(config, hoodieTable, partitionPathBaseFile).locations());
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,204 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.io;
|
||||
|
||||
import org.apache.hudi.client.HoodieInternalWriteStatus;
|
||||
import org.apache.hudi.client.model.HoodieInternalRow;
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.model.HoodiePartitionMetadata;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieWriteStat;
|
||||
import org.apache.hudi.common.model.IOType;
|
||||
import org.apache.hudi.common.table.HoodieTableConfig;
|
||||
import org.apache.hudi.common.util.HoodieTimer;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.exception.HoodieIOException;
|
||||
import org.apache.hudi.exception.HoodieInsertException;
|
||||
import org.apache.hudi.io.storage.HoodieInternalRowFileWriter;
|
||||
import org.apache.hudi.io.storage.HoodieInternalRowFileWriterFactory;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
import org.apache.hudi.table.MarkerFiles;
|
||||
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.spark.sql.catalyst.InternalRow;
|
||||
import org.apache.spark.sql.types.StructType;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.util.concurrent.atomic.AtomicLong;
|
||||
|
||||
/**
|
||||
* Create handle with InternalRow for datasource implemention of bulk insert.
|
||||
*/
|
||||
public class HoodieRowCreateHandle implements Serializable {
|
||||
|
||||
private static final long serialVersionUID = 1L;
|
||||
private static final Logger LOG = LogManager.getLogger(HoodieRowCreateHandle.class);
|
||||
private static final AtomicLong SEQGEN = new AtomicLong(1);
|
||||
|
||||
private final String instantTime;
|
||||
private final int taskPartitionId;
|
||||
private final long taskId;
|
||||
private final long taskEpochId;
|
||||
private final HoodieTable table;
|
||||
private final HoodieWriteConfig writeConfig;
|
||||
private final HoodieInternalRowFileWriter fileWriter;
|
||||
private final String partitionPath;
|
||||
private final Path path;
|
||||
private final String fileId;
|
||||
private final FileSystem fs;
|
||||
private final HoodieInternalWriteStatus writeStatus;
|
||||
private final HoodieTimer currTimer;
|
||||
|
||||
public HoodieRowCreateHandle(HoodieTable table, HoodieWriteConfig writeConfig, String partitionPath, String fileId,
|
||||
String instantTime, int taskPartitionId, long taskId, long taskEpochId,
|
||||
StructType structType) {
|
||||
this.partitionPath = partitionPath;
|
||||
this.table = table;
|
||||
this.writeConfig = writeConfig;
|
||||
this.instantTime = instantTime;
|
||||
this.taskPartitionId = taskPartitionId;
|
||||
this.taskId = taskId;
|
||||
this.taskEpochId = taskEpochId;
|
||||
this.fileId = fileId;
|
||||
this.currTimer = new HoodieTimer();
|
||||
this.currTimer.startTimer();
|
||||
this.fs = table.getMetaClient().getFs();
|
||||
this.path = makeNewPath(partitionPath);
|
||||
this.writeStatus = new HoodieInternalWriteStatus(!table.getIndex().isImplicitWithStorage(),
|
||||
writeConfig.getWriteStatusFailureFraction());
|
||||
writeStatus.setPartitionPath(partitionPath);
|
||||
writeStatus.setFileId(fileId);
|
||||
try {
|
||||
HoodiePartitionMetadata partitionMetadata =
|
||||
new HoodiePartitionMetadata(
|
||||
fs,
|
||||
instantTime,
|
||||
new Path(writeConfig.getBasePath()),
|
||||
FSUtils.getPartitionPath(writeConfig.getBasePath(), partitionPath));
|
||||
partitionMetadata.trySave(taskPartitionId);
|
||||
createMarkerFile(partitionPath, FSUtils.makeDataFileName(this.instantTime, getWriteToken(), this.fileId, table.getBaseFileExtension()));
|
||||
this.fileWriter = createNewFileWriter(path, table, writeConfig, structType);
|
||||
} catch (IOException e) {
|
||||
throw new HoodieInsertException("Failed to initialize file writer for path " + path, e);
|
||||
}
|
||||
LOG.info("New handle created for partition :" + partitionPath + " with fileId " + fileId);
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes an {@link InternalRow} to the underlying HoodieInternalRowFileWriter. Before writing, value for meta columns are computed as required
|
||||
* and wrapped in {@link HoodieInternalRow}. {@link HoodieInternalRow} is what gets written to HoodieInternalRowFileWriter.
|
||||
* @param record instance of {@link InternalRow} that needs to be written to the fileWriter.
|
||||
* @throws IOException
|
||||
*/
|
||||
public void write(InternalRow record) throws IOException {
|
||||
try {
|
||||
String partitionPath = record.getUTF8String(HoodieRecord.HOODIE_META_COLUMNS_NAME_TO_POS.get(
|
||||
HoodieRecord.PARTITION_PATH_METADATA_FIELD)).toString();
|
||||
String seqId = HoodieRecord.generateSequenceId(instantTime, taskPartitionId, SEQGEN.getAndIncrement());
|
||||
String recordKey = record.getUTF8String(HoodieRecord.HOODIE_META_COLUMNS_NAME_TO_POS.get(
|
||||
HoodieRecord.RECORD_KEY_METADATA_FIELD)).toString();
|
||||
HoodieInternalRow internalRow = new HoodieInternalRow(instantTime, seqId, recordKey, partitionPath, path.getName(),
|
||||
record);
|
||||
try {
|
||||
fileWriter.writeRow(recordKey, internalRow);
|
||||
writeStatus.markSuccess(recordKey);
|
||||
} catch (Throwable t) {
|
||||
writeStatus.markFailure(recordKey, t);
|
||||
}
|
||||
} catch (Throwable ge) {
|
||||
writeStatus.setGlobalError(ge);
|
||||
throw ge;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @returns {@code true} if this handle can take in more writes. else {@code false}.
|
||||
*/
|
||||
public boolean canWrite() {
|
||||
return fileWriter.canWrite();
|
||||
}
|
||||
|
||||
/**
|
||||
* Closes the {@link HoodieRowCreateHandle} and returns an instance of {@link HoodieInternalWriteStatus} containing the stats and
|
||||
* status of the writes to this handle.
|
||||
* @return the {@link HoodieInternalWriteStatus} containing the stats and status of the writes to this handle.
|
||||
* @throws IOException
|
||||
*/
|
||||
public HoodieInternalWriteStatus close() throws IOException {
|
||||
fileWriter.close();
|
||||
HoodieWriteStat stat = new HoodieWriteStat();
|
||||
stat.setPartitionPath(partitionPath);
|
||||
stat.setNumWrites(writeStatus.getTotalRecords());
|
||||
stat.setNumDeletes(0);
|
||||
stat.setNumInserts(writeStatus.getTotalRecords());
|
||||
stat.setPrevCommit(HoodieWriteStat.NULL_COMMIT);
|
||||
stat.setFileId(fileId);
|
||||
stat.setPath(new Path(writeConfig.getBasePath()), path);
|
||||
long fileSizeInBytes = FSUtils.getFileSize(table.getMetaClient().getFs(), path);
|
||||
stat.setTotalWriteBytes(fileSizeInBytes);
|
||||
stat.setFileSizeInBytes(fileSizeInBytes);
|
||||
stat.setTotalWriteErrors(writeStatus.getFailedRowsSize());
|
||||
HoodieWriteStat.RuntimeStats runtimeStats = new HoodieWriteStat.RuntimeStats();
|
||||
runtimeStats.setTotalCreateTime(currTimer.endTimer());
|
||||
stat.setRuntimeStats(runtimeStats);
|
||||
writeStatus.setStat(stat);
|
||||
return writeStatus;
|
||||
}
|
||||
|
||||
public String getFileName() {
|
||||
return path.getName();
|
||||
}
|
||||
|
||||
private Path makeNewPath(String partitionPath) {
|
||||
Path path = FSUtils.getPartitionPath(writeConfig.getBasePath(), partitionPath);
|
||||
try {
|
||||
fs.mkdirs(path); // create a new partition as needed.
|
||||
} catch (IOException e) {
|
||||
throw new HoodieIOException("Failed to make dir " + path, e);
|
||||
}
|
||||
HoodieTableConfig tableConfig = table.getMetaClient().getTableConfig();
|
||||
return new Path(path.toString(), FSUtils.makeDataFileName(instantTime, getWriteToken(), fileId,
|
||||
tableConfig.getBaseFileFormat().getFileExtension()));
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates an empty marker file corresponding to storage writer path.
|
||||
*
|
||||
* @param partitionPath Partition path
|
||||
*/
|
||||
private void createMarkerFile(String partitionPath, String dataFileName) {
|
||||
MarkerFiles markerFiles = new MarkerFiles(table, instantTime);
|
||||
markerFiles.create(partitionPath, dataFileName, IOType.CREATE);
|
||||
}
|
||||
|
||||
private String getWriteToken() {
|
||||
return taskPartitionId + "-" + taskId + "-" + taskEpochId;
|
||||
}
|
||||
|
||||
private HoodieInternalRowFileWriter createNewFileWriter(
|
||||
Path path, HoodieTable hoodieTable, HoodieWriteConfig config, StructType schema)
|
||||
throws IOException {
|
||||
return HoodieInternalRowFileWriterFactory.getInternalRowFileWriter(
|
||||
path, hoodieTable, config, schema);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,46 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.io.storage;
|
||||
|
||||
import org.apache.spark.sql.catalyst.InternalRow;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* Abstraction to assist in writing {@link InternalRow}s to be used in datasource implementation.
|
||||
*/
|
||||
public interface HoodieInternalRowFileWriter {
|
||||
|
||||
/**
|
||||
* @returns {@code true} if this RowFileWriter can take in more writes. else {@code false}.
|
||||
*/
|
||||
boolean canWrite();
|
||||
|
||||
/**
|
||||
* Writes an {@link InternalRow} to the HoodieInternalRowFileWriter.
|
||||
*
|
||||
* @throws IOException on any exception while writing.
|
||||
*/
|
||||
void writeRow(String key, InternalRow row) throws IOException;
|
||||
|
||||
/**
|
||||
* Closes the {@link HoodieInternalRowFileWriter} and may not take in any more writes.
|
||||
*/
|
||||
void close() throws IOException;
|
||||
}
|
||||
@@ -0,0 +1,79 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.io.storage;
|
||||
|
||||
import org.apache.hudi.common.bloom.BloomFilter;
|
||||
import org.apache.hudi.common.bloom.BloomFilterFactory;
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.spark.sql.types.StructType;
|
||||
|
||||
import static org.apache.hudi.common.model.HoodieFileFormat.PARQUET;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* Factory to assist in instantiating a new {@link HoodieInternalRowFileWriter}.
|
||||
*/
|
||||
public class HoodieInternalRowFileWriterFactory {
|
||||
|
||||
/**
|
||||
* Factory method to assist in instantiating an instance of {@link HoodieInternalRowFileWriter}.
|
||||
* @param path path of the RowFileWriter.
|
||||
* @param hoodieTable instance of {@link HoodieTable} in use.
|
||||
* @param config instance of {@link HoodieWriteConfig} to use.
|
||||
* @param schema schema of the dataset in use.
|
||||
* @return the instantiated {@link HoodieInternalRowFileWriter}.
|
||||
* @throws IOException if format is not supported or if any exception during instantiating the RowFileWriter.
|
||||
*
|
||||
*/
|
||||
public static HoodieInternalRowFileWriter getInternalRowFileWriter(
|
||||
Path path, HoodieTable hoodieTable, HoodieWriteConfig config, StructType schema)
|
||||
throws IOException {
|
||||
final String extension = FSUtils.getFileExtension(path.getName());
|
||||
if (PARQUET.getFileExtension().equals(extension)) {
|
||||
return newParquetInternalRowFileWriter(path, config, schema, hoodieTable);
|
||||
}
|
||||
throw new UnsupportedOperationException(extension + " format not supported yet.");
|
||||
}
|
||||
|
||||
private static HoodieInternalRowFileWriter newParquetInternalRowFileWriter(
|
||||
Path path, HoodieWriteConfig writeConfig, StructType structType, HoodieTable table)
|
||||
throws IOException {
|
||||
BloomFilter filter = BloomFilterFactory.createBloomFilter(
|
||||
writeConfig.getBloomFilterNumEntries(),
|
||||
writeConfig.getBloomFilterFPP(),
|
||||
writeConfig.getDynamicBloomFilterMaxNumEntries(),
|
||||
writeConfig.getBloomFilterType());
|
||||
HoodieRowParquetWriteSupport writeSupport =
|
||||
new HoodieRowParquetWriteSupport(table.getHadoopConf(), structType, filter);
|
||||
return new HoodieInternalRowParquetWriter(
|
||||
path, new HoodieRowParquetConfig(
|
||||
writeSupport,
|
||||
writeConfig.getParquetCompressionCodec(),
|
||||
writeConfig.getParquetBlockSize(),
|
||||
writeConfig.getParquetPageSize(),
|
||||
writeConfig.getParquetMaxFileSize(),
|
||||
writeSupport.getHadoopConf(),
|
||||
writeConfig.getParquetCompressionRatio()));
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,72 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.io.storage;
|
||||
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.fs.HoodieWrapperFileSystem;
|
||||
|
||||
import org.apache.parquet.hadoop.ParquetFileWriter;
|
||||
import org.apache.parquet.hadoop.ParquetWriter;
|
||||
import org.apache.spark.sql.catalyst.InternalRow;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* Parquet's impl of {@link HoodieInternalRowFileWriter} to write {@link InternalRow}s.
|
||||
*/
|
||||
public class HoodieInternalRowParquetWriter extends ParquetWriter<InternalRow>
|
||||
implements HoodieInternalRowFileWriter {
|
||||
|
||||
private final Path file;
|
||||
private final HoodieWrapperFileSystem fs;
|
||||
private final long maxFileSize;
|
||||
private final HoodieRowParquetWriteSupport writeSupport;
|
||||
|
||||
public HoodieInternalRowParquetWriter(Path file, HoodieRowParquetConfig parquetConfig)
|
||||
throws IOException {
|
||||
super(HoodieWrapperFileSystem.convertToHoodiePath(file, parquetConfig.getHadoopConf()),
|
||||
ParquetFileWriter.Mode.CREATE, parquetConfig.getWriteSupport(), parquetConfig.getCompressionCodecName(),
|
||||
parquetConfig.getBlockSize(), parquetConfig.getPageSize(), parquetConfig.getPageSize(),
|
||||
DEFAULT_IS_DICTIONARY_ENABLED, DEFAULT_IS_VALIDATING_ENABLED,
|
||||
DEFAULT_WRITER_VERSION, FSUtils.registerFileSystem(file, parquetConfig.getHadoopConf()));
|
||||
this.file = HoodieWrapperFileSystem.convertToHoodiePath(file, parquetConfig.getHadoopConf());
|
||||
this.fs = (HoodieWrapperFileSystem) this.file.getFileSystem(FSUtils.registerFileSystem(file,
|
||||
parquetConfig.getHadoopConf()));
|
||||
this.maxFileSize = parquetConfig.getMaxFileSize()
|
||||
+ Math.round(parquetConfig.getMaxFileSize() * parquetConfig.getCompressionRatio());
|
||||
this.writeSupport = parquetConfig.getWriteSupport();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean canWrite() {
|
||||
return fs.getBytesWritten(file) < maxFileSize;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void writeRow(String key, InternalRow row) throws IOException {
|
||||
super.write(row);
|
||||
writeSupport.add(key);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
super.close();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,34 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.io.storage;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
|
||||
|
||||
/**
|
||||
* ParquetConfig for datasource implementation with {@link org.apache.hudi.client.model.HoodieInternalRow}.
|
||||
*/
|
||||
public class HoodieRowParquetConfig extends HoodieBaseParquetConfig<HoodieRowParquetWriteSupport> {
|
||||
|
||||
public HoodieRowParquetConfig(HoodieRowParquetWriteSupport writeSupport, CompressionCodecName compressionCodecName,
|
||||
int blockSize, int pageSize, long maxFileSize, Configuration hadoopConf,
|
||||
double compressionRatio) {
|
||||
super(writeSupport, compressionCodecName, blockSize, pageSize, maxFileSize, hadoopConf, compressionRatio);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,89 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.io.storage;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hudi.common.bloom.BloomFilter;
|
||||
import org.apache.hudi.common.bloom.HoodieDynamicBoundedBloomFilter;
|
||||
import org.apache.parquet.hadoop.api.WriteSupport;
|
||||
import org.apache.spark.sql.execution.datasources.parquet.ParquetWriteSupport;
|
||||
import org.apache.spark.sql.types.StructType;
|
||||
|
||||
import java.util.HashMap;
|
||||
|
||||
import static org.apache.hudi.avro.HoodieAvroWriteSupport.HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY;
|
||||
import static org.apache.hudi.avro.HoodieAvroWriteSupport.HOODIE_BLOOM_FILTER_TYPE_CODE;
|
||||
import static org.apache.hudi.avro.HoodieAvroWriteSupport.HOODIE_MAX_RECORD_KEY_FOOTER;
|
||||
import static org.apache.hudi.avro.HoodieAvroWriteSupport.HOODIE_MIN_RECORD_KEY_FOOTER;
|
||||
|
||||
/**
|
||||
* Hoodie Write Support for directly writing Row to Parquet.
|
||||
*/
|
||||
public class HoodieRowParquetWriteSupport extends ParquetWriteSupport {
|
||||
|
||||
private Configuration hadoopConf;
|
||||
private BloomFilter bloomFilter;
|
||||
private String minRecordKey;
|
||||
private String maxRecordKey;
|
||||
|
||||
public HoodieRowParquetWriteSupport(Configuration conf, StructType structType, BloomFilter bloomFilter) {
|
||||
super();
|
||||
Configuration hadoopConf = new Configuration(conf);
|
||||
hadoopConf.set("spark.sql.parquet.writeLegacyFormat", "false");
|
||||
hadoopConf.set("spark.sql.parquet.outputTimestampType", "TIMESTAMP_MILLIS");
|
||||
this.hadoopConf = hadoopConf;
|
||||
setSchema(structType, hadoopConf);
|
||||
this.bloomFilter = bloomFilter;
|
||||
}
|
||||
|
||||
public Configuration getHadoopConf() {
|
||||
return hadoopConf;
|
||||
}
|
||||
|
||||
@Override
|
||||
public WriteSupport.FinalizedWriteContext finalizeWrite() {
|
||||
HashMap<String, String> extraMetaData = new HashMap<>();
|
||||
if (bloomFilter != null) {
|
||||
extraMetaData.put(HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY, bloomFilter.serializeToString());
|
||||
if (minRecordKey != null && maxRecordKey != null) {
|
||||
extraMetaData.put(HOODIE_MIN_RECORD_KEY_FOOTER, minRecordKey);
|
||||
extraMetaData.put(HOODIE_MAX_RECORD_KEY_FOOTER, maxRecordKey);
|
||||
}
|
||||
if (bloomFilter.getBloomFilterTypeCode().name().contains(HoodieDynamicBoundedBloomFilter.TYPE_CODE_PREFIX)) {
|
||||
extraMetaData.put(HOODIE_BLOOM_FILTER_TYPE_CODE, bloomFilter.getBloomFilterTypeCode().name());
|
||||
}
|
||||
}
|
||||
return new WriteSupport.FinalizedWriteContext(extraMetaData);
|
||||
}
|
||||
|
||||
public void add(String recordKey) {
|
||||
this.bloomFilter.add(recordKey);
|
||||
if (minRecordKey != null) {
|
||||
minRecordKey = minRecordKey.compareTo(recordKey) <= 0 ? minRecordKey : recordKey;
|
||||
} else {
|
||||
minRecordKey = recordKey;
|
||||
}
|
||||
|
||||
if (maxRecordKey != null) {
|
||||
maxRecordKey = maxRecordKey.compareTo(recordKey) >= 0 ? maxRecordKey : recordKey;
|
||||
} else {
|
||||
maxRecordKey = recordKey;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,31 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.keygen;
|
||||
|
||||
import org.apache.spark.sql.Row;
|
||||
|
||||
/**
|
||||
* Spark key generator interface.
|
||||
*/
|
||||
public interface SparkKeyGeneratorInterface extends KeyGeneratorInterface {
|
||||
|
||||
String getRecordKey(Row row);
|
||||
|
||||
String getPartitionPath(Row row);
|
||||
}
|
||||
@@ -0,0 +1,215 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.table;
|
||||
|
||||
import org.apache.hudi.avro.model.HoodieCleanMetadata;
|
||||
import org.apache.hudi.avro.model.HoodieCompactionPlan;
|
||||
import org.apache.hudi.avro.model.HoodieRestoreMetadata;
|
||||
import org.apache.hudi.avro.model.HoodieRollbackMetadata;
|
||||
import org.apache.hudi.avro.model.HoodieSavepointMetadata;
|
||||
import org.apache.hudi.client.WriteStatus;
|
||||
import org.apache.hudi.client.common.HoodieEngineContext;
|
||||
import org.apache.hudi.client.common.HoodieSparkEngineContext;
|
||||
import org.apache.hudi.common.model.HoodieBaseFile;
|
||||
import org.apache.hudi.common.model.HoodieKey;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.common.table.HoodieTableMetaClient;
|
||||
import org.apache.hudi.common.table.timeline.HoodieInstant;
|
||||
import org.apache.hudi.common.table.timeline.HoodieTimeline;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.exception.HoodieNotSupportedException;
|
||||
import org.apache.hudi.exception.HoodieUpsertException;
|
||||
import org.apache.hudi.io.HoodieCreateHandle;
|
||||
import org.apache.hudi.io.HoodieMergeHandle;
|
||||
import org.apache.hudi.io.HoodieSortedMergeHandle;
|
||||
import org.apache.hudi.table.action.HoodieWriteMetadata;
|
||||
import org.apache.hudi.table.action.bootstrap.HoodieBootstrapWriteMetadata;
|
||||
import org.apache.hudi.table.action.bootstrap.SparkBootstrapCommitActionExecutor;
|
||||
import org.apache.hudi.table.action.clean.SparkCleanActionExecutor;
|
||||
import org.apache.hudi.table.action.commit.SparkInsertOverwriteCommitActionExecutor;
|
||||
import org.apache.hudi.table.action.commit.SparkBulkInsertCommitActionExecutor;
|
||||
import org.apache.hudi.table.action.commit.SparkBulkInsertPreppedCommitActionExecutor;
|
||||
import org.apache.hudi.table.action.commit.SparkDeleteCommitActionExecutor;
|
||||
import org.apache.hudi.table.action.commit.SparkInsertCommitActionExecutor;
|
||||
import org.apache.hudi.table.action.commit.SparkInsertPreppedCommitActionExecutor;
|
||||
import org.apache.hudi.table.action.commit.SparkMergeHelper;
|
||||
import org.apache.hudi.table.action.commit.SparkUpsertCommitActionExecutor;
|
||||
import org.apache.hudi.table.action.commit.SparkUpsertPreppedCommitActionExecutor;
|
||||
import org.apache.hudi.table.action.restore.SparkCopyOnWriteRestoreActionExecutor;
|
||||
import org.apache.hudi.table.action.rollback.SparkCopyOnWriteRollbackActionExecutor;
|
||||
import org.apache.hudi.table.action.savepoint.SavepointActionExecutor;
|
||||
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Collections;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Implementation of a very heavily read-optimized Hoodie Table where, all data is stored in base files, with
|
||||
* zero read amplification.
|
||||
* <p>
|
||||
* INSERTS - Produce new files, block aligned to desired size (or) Merge with the smallest existing file, to expand it
|
||||
* <p>
|
||||
* UPDATES - Produce a new version of the file, just replacing the updated records with new values
|
||||
*/
|
||||
public class HoodieSparkCopyOnWriteTable<T extends HoodieRecordPayload> extends HoodieSparkTable<T> {
|
||||
|
||||
private static final Logger LOG = LogManager.getLogger(HoodieSparkCopyOnWriteTable.class);
|
||||
|
||||
public HoodieSparkCopyOnWriteTable(HoodieWriteConfig config, HoodieEngineContext context, HoodieTableMetaClient metaClient) {
|
||||
super(config, context, metaClient);
|
||||
}
|
||||
|
||||
@Override
|
||||
public HoodieWriteMetadata<JavaRDD<WriteStatus>> upsert(HoodieEngineContext context, String instantTime, JavaRDD<HoodieRecord<T>> records) {
|
||||
return new SparkUpsertCommitActionExecutor<>((HoodieSparkEngineContext) context, config, this, instantTime, records).execute();
|
||||
}
|
||||
|
||||
@Override
|
||||
public HoodieWriteMetadata<JavaRDD<WriteStatus>> insert(HoodieEngineContext context, String instantTime, JavaRDD<HoodieRecord<T>> records) {
|
||||
return new SparkInsertCommitActionExecutor<>((HoodieSparkEngineContext)context, config, this, instantTime, records).execute();
|
||||
}
|
||||
|
||||
@Override
|
||||
public HoodieWriteMetadata<JavaRDD<WriteStatus>> bulkInsert(HoodieEngineContext context, String instantTime, JavaRDD<HoodieRecord<T>> records,
|
||||
Option<BulkInsertPartitioner<JavaRDD<HoodieRecord<T>>>> userDefinedBulkInsertPartitioner) {
|
||||
return new SparkBulkInsertCommitActionExecutor((HoodieSparkEngineContext) context, config,
|
||||
this, instantTime, records, userDefinedBulkInsertPartitioner).execute();
|
||||
}
|
||||
|
||||
@Override
|
||||
public HoodieWriteMetadata<JavaRDD<WriteStatus>> delete(HoodieEngineContext context, String instantTime, JavaRDD<HoodieKey> keys) {
|
||||
return new SparkDeleteCommitActionExecutor<>((HoodieSparkEngineContext) context, config, this, instantTime, keys).execute();
|
||||
}
|
||||
|
||||
@Override
|
||||
public HoodieWriteMetadata<JavaRDD<WriteStatus>> upsertPrepped(HoodieEngineContext context, String instantTime,
|
||||
JavaRDD<HoodieRecord<T>> preppedRecords) {
|
||||
return new SparkUpsertPreppedCommitActionExecutor<>((HoodieSparkEngineContext) context, config, this, instantTime, preppedRecords).execute();
|
||||
}
|
||||
|
||||
@Override
|
||||
public HoodieWriteMetadata<JavaRDD<WriteStatus>> insertPrepped(HoodieEngineContext context, String instantTime,
|
||||
JavaRDD<HoodieRecord<T>> preppedRecords) {
|
||||
return new SparkInsertPreppedCommitActionExecutor<>((HoodieSparkEngineContext) context, config, this, instantTime, preppedRecords).execute();
|
||||
}
|
||||
|
||||
@Override
|
||||
public HoodieWriteMetadata<JavaRDD<WriteStatus>> bulkInsertPrepped(HoodieEngineContext context, String instantTime,
|
||||
JavaRDD<HoodieRecord<T>> preppedRecords, Option<BulkInsertPartitioner<JavaRDD<HoodieRecord<T>>>> userDefinedBulkInsertPartitioner) {
|
||||
return new SparkBulkInsertPreppedCommitActionExecutor((HoodieSparkEngineContext) context, config,
|
||||
this, instantTime, preppedRecords, userDefinedBulkInsertPartitioner).execute();
|
||||
}
|
||||
|
||||
@Override
|
||||
public HoodieWriteMetadata insertOverwrite(HoodieEngineContext context, String instantTime, JavaRDD<HoodieRecord<T>> records) {
|
||||
return new SparkInsertOverwriteCommitActionExecutor(context, config, this, instantTime, records).execute();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Option<HoodieCompactionPlan> scheduleCompaction(HoodieEngineContext context, String instantTime, Option<Map<String, String>> extraMetadata) {
|
||||
throw new HoodieNotSupportedException("Compaction is not supported on a CopyOnWrite table");
|
||||
}
|
||||
|
||||
@Override
|
||||
public HoodieWriteMetadata<JavaRDD<WriteStatus>> compact(HoodieEngineContext context, String compactionInstantTime) {
|
||||
throw new HoodieNotSupportedException("Compaction is not supported on a CopyOnWrite table");
|
||||
}
|
||||
|
||||
@Override
|
||||
public HoodieBootstrapWriteMetadata<JavaRDD<WriteStatus>> bootstrap(HoodieEngineContext context, Option<Map<String, String>> extraMetadata) {
|
||||
return new SparkBootstrapCommitActionExecutor((HoodieSparkEngineContext) context, config, this, extraMetadata).execute();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void rollbackBootstrap(HoodieEngineContext context, String instantTime) {
|
||||
new SparkCopyOnWriteRestoreActionExecutor((HoodieSparkEngineContext) context, config, this, instantTime, HoodieTimeline.INIT_INSTANT_TS).execute();
|
||||
}
|
||||
|
||||
public Iterator<List<WriteStatus>> handleUpdate(String instantTime, String partitionPath, String fileId,
|
||||
Map<String, HoodieRecord<T>> keyToNewRecords, HoodieBaseFile oldDataFile) throws IOException {
|
||||
// these are updates
|
||||
HoodieMergeHandle upsertHandle = getUpdateHandle(instantTime, partitionPath, fileId, keyToNewRecords, oldDataFile);
|
||||
return handleUpdateInternal(upsertHandle, instantTime, fileId);
|
||||
}
|
||||
|
||||
protected Iterator<List<WriteStatus>> handleUpdateInternal(HoodieMergeHandle upsertHandle, String instantTime,
|
||||
String fileId) throws IOException {
|
||||
if (upsertHandle.getOldFilePath() == null) {
|
||||
throw new HoodieUpsertException(
|
||||
"Error in finding the old file path at commit " + instantTime + " for fileId: " + fileId);
|
||||
} else {
|
||||
SparkMergeHelper.newInstance().runMerge(this, upsertHandle);
|
||||
}
|
||||
|
||||
// TODO(vc): This needs to be revisited
|
||||
if (upsertHandle.getWriteStatus().getPartitionPath() == null) {
|
||||
LOG.info("Upsert Handle has partition path as null " + upsertHandle.getOldFilePath() + ", "
|
||||
+ upsertHandle.getWriteStatus());
|
||||
}
|
||||
return Collections.singletonList(Collections.singletonList(upsertHandle.getWriteStatus())).iterator();
|
||||
}
|
||||
|
||||
protected HoodieMergeHandle getUpdateHandle(String instantTime, String partitionPath, String fileId,
|
||||
Map<String, HoodieRecord<T>> keyToNewRecords, HoodieBaseFile dataFileToBeMerged) {
|
||||
if (requireSortedRecords()) {
|
||||
return new HoodieSortedMergeHandle<>(config, instantTime, this, keyToNewRecords, partitionPath, fileId,
|
||||
dataFileToBeMerged, taskContextSupplier);
|
||||
} else {
|
||||
return new HoodieMergeHandle<>(config, instantTime, this, keyToNewRecords, partitionPath, fileId,
|
||||
dataFileToBeMerged,taskContextSupplier);
|
||||
}
|
||||
}
|
||||
|
||||
public Iterator<List<WriteStatus>> handleInsert(String instantTime, String partitionPath, String fileId,
|
||||
Map<String, HoodieRecord<? extends HoodieRecordPayload>> recordMap) {
|
||||
HoodieCreateHandle createHandle =
|
||||
new HoodieCreateHandle(config, instantTime, this, partitionPath, fileId, recordMap, taskContextSupplier);
|
||||
createHandle.write();
|
||||
return Collections.singletonList(Collections.singletonList(createHandle.close())).iterator();
|
||||
}
|
||||
|
||||
@Override
|
||||
public HoodieCleanMetadata clean(HoodieEngineContext context, String cleanInstantTime) {
|
||||
return new SparkCleanActionExecutor((HoodieSparkEngineContext)context, config, this, cleanInstantTime).execute();
|
||||
}
|
||||
|
||||
@Override
|
||||
public HoodieRollbackMetadata rollback(HoodieEngineContext context, String rollbackInstantTime, HoodieInstant commitInstant, boolean deleteInstants) {
|
||||
return new SparkCopyOnWriteRollbackActionExecutor((HoodieSparkEngineContext) context, config, this, rollbackInstantTime, commitInstant, deleteInstants).execute();
|
||||
}
|
||||
|
||||
@Override
|
||||
public HoodieSavepointMetadata savepoint(HoodieEngineContext context, String instantToSavepoint, String user, String comment) {
|
||||
return new SavepointActionExecutor(context, config, this, instantToSavepoint, user, comment).execute();
|
||||
}
|
||||
|
||||
@Override
|
||||
public HoodieRestoreMetadata restore(HoodieEngineContext context, String restoreInstantTime, String instantToRestore) {
|
||||
return new SparkCopyOnWriteRestoreActionExecutor((HoodieSparkEngineContext) context, config, this, restoreInstantTime, instantToRestore).execute();
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,163 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.table;
|
||||
|
||||
import org.apache.hudi.avro.model.HoodieCompactionPlan;
|
||||
import org.apache.hudi.avro.model.HoodieRestoreMetadata;
|
||||
import org.apache.hudi.avro.model.HoodieRollbackMetadata;
|
||||
import org.apache.hudi.client.WriteStatus;
|
||||
import org.apache.hudi.client.common.HoodieEngineContext;
|
||||
import org.apache.hudi.client.common.HoodieSparkEngineContext;
|
||||
import org.apache.hudi.common.model.HoodieKey;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.common.model.HoodieWriteStat;
|
||||
import org.apache.hudi.common.table.HoodieTableMetaClient;
|
||||
import org.apache.hudi.common.table.timeline.HoodieInstant;
|
||||
|
||||
import org.apache.hudi.common.table.timeline.HoodieTimeline;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.exception.HoodieIOException;
|
||||
import org.apache.hudi.table.action.HoodieWriteMetadata;
|
||||
import org.apache.hudi.table.action.bootstrap.SparkBootstrapDeltaCommitActionExecutor;
|
||||
import org.apache.hudi.table.action.bootstrap.HoodieBootstrapWriteMetadata;
|
||||
import org.apache.hudi.table.action.compact.SparkRunCompactionActionExecutor;
|
||||
import org.apache.hudi.table.action.compact.SparkScheduleCompactionActionExecutor;
|
||||
import org.apache.hudi.table.action.deltacommit.SparkBulkInsertDeltaCommitActionExecutor;
|
||||
import org.apache.hudi.table.action.deltacommit.SparkBulkInsertPreppedDeltaCommitActionExecutor;
|
||||
import org.apache.hudi.table.action.deltacommit.SparkDeleteDeltaCommitActionExecutor;
|
||||
import org.apache.hudi.table.action.deltacommit.SparkInsertDeltaCommitActionExecutor;
|
||||
import org.apache.hudi.table.action.deltacommit.SparkInsertPreppedDeltaCommitActionExecutor;
|
||||
import org.apache.hudi.table.action.deltacommit.SparkUpsertDeltaCommitActionExecutor;
|
||||
import org.apache.hudi.table.action.deltacommit.SparkUpsertPreppedDeltaCommitActionExecutor;
|
||||
import org.apache.hudi.table.action.compact.BaseScheduleCompactionActionExecutor;
|
||||
import org.apache.hudi.table.action.restore.SparkMergeOnReadRestoreActionExecutor;
|
||||
import org.apache.hudi.table.action.rollback.SparkMergeOnReadRollbackActionExecutor;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Implementation of a more real-time Hoodie Table the provides tradeoffs on read and write cost/amplification.
|
||||
*
|
||||
* <p>
|
||||
* INSERTS - Same as HoodieCopyOnWriteTable - Produce new files, block aligned to desired size (or) Merge with the
|
||||
* smallest existing file, to expand it
|
||||
* </p>
|
||||
* <p>
|
||||
* UPDATES - Appends the changes to a rolling log file maintained per file Id. Compaction merges the log file into the
|
||||
* base file.
|
||||
* </p>
|
||||
* <p>
|
||||
* WARNING - MOR table type does not support nested rollbacks, every rollback must be followed by an attempted commit
|
||||
* action
|
||||
* </p>
|
||||
*/
|
||||
public class HoodieSparkMergeOnReadTable<T extends HoodieRecordPayload> extends HoodieSparkCopyOnWriteTable<T> {
|
||||
|
||||
HoodieSparkMergeOnReadTable(HoodieWriteConfig config, HoodieEngineContext context, HoodieTableMetaClient metaClient) {
|
||||
super(config, context, metaClient);
|
||||
}
|
||||
|
||||
@Override
|
||||
public HoodieWriteMetadata<JavaRDD<WriteStatus>> upsert(HoodieEngineContext context, String instantTime, JavaRDD<HoodieRecord<T>> records) {
|
||||
return new SparkUpsertDeltaCommitActionExecutor<>((HoodieSparkEngineContext) context, config, this, instantTime, records).execute();
|
||||
}
|
||||
|
||||
@Override
|
||||
public HoodieWriteMetadata<JavaRDD<WriteStatus>> insert(HoodieEngineContext context, String instantTime, JavaRDD<HoodieRecord<T>> records) {
|
||||
return new SparkInsertDeltaCommitActionExecutor<>((HoodieSparkEngineContext) context, config, this, instantTime, records).execute();
|
||||
}
|
||||
|
||||
@Override
|
||||
public HoodieWriteMetadata<JavaRDD<WriteStatus>> bulkInsert(HoodieEngineContext context, String instantTime, JavaRDD<HoodieRecord<T>> records,
|
||||
Option<BulkInsertPartitioner<JavaRDD<HoodieRecord<T>>>> userDefinedBulkInsertPartitioner) {
|
||||
return new SparkBulkInsertDeltaCommitActionExecutor((HoodieSparkEngineContext) context, config,
|
||||
this, instantTime, records, userDefinedBulkInsertPartitioner).execute();
|
||||
}
|
||||
|
||||
@Override
|
||||
public HoodieWriteMetadata<JavaRDD<WriteStatus>> delete(HoodieEngineContext context, String instantTime, JavaRDD<HoodieKey> keys) {
|
||||
return new SparkDeleteDeltaCommitActionExecutor<>((HoodieSparkEngineContext) context, config, this, instantTime, keys).execute();
|
||||
}
|
||||
|
||||
@Override
|
||||
public HoodieWriteMetadata<JavaRDD<WriteStatus>> upsertPrepped(HoodieEngineContext context, String instantTime,
|
||||
JavaRDD<HoodieRecord<T>> preppedRecords) {
|
||||
return new SparkUpsertPreppedDeltaCommitActionExecutor<>((HoodieSparkEngineContext) context, config, this, instantTime, preppedRecords).execute();
|
||||
}
|
||||
|
||||
@Override
|
||||
public HoodieWriteMetadata<JavaRDD<WriteStatus>> insertPrepped(HoodieEngineContext context, String instantTime,
|
||||
JavaRDD<HoodieRecord<T>> preppedRecords) {
|
||||
return new SparkInsertPreppedDeltaCommitActionExecutor<>((HoodieSparkEngineContext) context, config, this, instantTime, preppedRecords).execute();
|
||||
}
|
||||
|
||||
@Override
|
||||
public HoodieWriteMetadata<JavaRDD<WriteStatus>> bulkInsertPrepped(HoodieEngineContext context, String instantTime,
|
||||
JavaRDD<HoodieRecord<T>> preppedRecords, Option<BulkInsertPartitioner<JavaRDD<HoodieRecord<T>>>> userDefinedBulkInsertPartitioner) {
|
||||
return new SparkBulkInsertPreppedDeltaCommitActionExecutor((HoodieSparkEngineContext) context, config,
|
||||
this, instantTime, preppedRecords, userDefinedBulkInsertPartitioner).execute();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Option<HoodieCompactionPlan> scheduleCompaction(HoodieEngineContext context, String instantTime, Option<Map<String, String>> extraMetadata) {
|
||||
BaseScheduleCompactionActionExecutor scheduleCompactionExecutor = new SparkScheduleCompactionActionExecutor(
|
||||
context, config, this, instantTime, extraMetadata);
|
||||
return scheduleCompactionExecutor.execute();
|
||||
}
|
||||
|
||||
@Override
|
||||
public HoodieWriteMetadata<JavaRDD<WriteStatus>> compact(HoodieEngineContext context, String compactionInstantTime) {
|
||||
SparkRunCompactionActionExecutor compactionExecutor = new SparkRunCompactionActionExecutor((HoodieSparkEngineContext) context, config, this, compactionInstantTime);
|
||||
return compactionExecutor.execute();
|
||||
}
|
||||
|
||||
@Override
|
||||
public HoodieBootstrapWriteMetadata<JavaRDD<WriteStatus>> bootstrap(HoodieEngineContext context, Option<Map<String, String>> extraMetadata) {
|
||||
return new SparkBootstrapDeltaCommitActionExecutor((HoodieSparkEngineContext) context, config, this, extraMetadata).execute();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void rollbackBootstrap(HoodieEngineContext context, String instantTime) {
|
||||
new SparkMergeOnReadRestoreActionExecutor((HoodieSparkEngineContext) context, config, this, instantTime, HoodieTimeline.INIT_INSTANT_TS).execute();
|
||||
}
|
||||
|
||||
@Override
|
||||
public HoodieRollbackMetadata rollback(HoodieEngineContext context,
|
||||
String rollbackInstantTime,
|
||||
HoodieInstant commitInstant,
|
||||
boolean deleteInstants) {
|
||||
return new SparkMergeOnReadRollbackActionExecutor(context, config, this, rollbackInstantTime, commitInstant, deleteInstants).execute();
|
||||
}
|
||||
|
||||
@Override
|
||||
public HoodieRestoreMetadata restore(HoodieEngineContext context, String restoreInstantTime, String instantToRestore) {
|
||||
return new SparkMergeOnReadRestoreActionExecutor((HoodieSparkEngineContext) context, config, this, restoreInstantTime, instantToRestore).execute();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void finalizeWrite(HoodieEngineContext context, String instantTs, List<HoodieWriteStat> stats)
|
||||
throws HoodieIOException {
|
||||
// delegate to base class for MOR tables
|
||||
super.finalizeWrite(context, instantTs, stats);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,72 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.table;
|
||||
|
||||
import org.apache.hudi.client.WriteStatus;
|
||||
import org.apache.hudi.client.common.HoodieEngineContext;
|
||||
import org.apache.hudi.client.common.HoodieSparkEngineContext;
|
||||
import org.apache.hudi.common.model.HoodieKey;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.common.table.HoodieTableMetaClient;
|
||||
import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.exception.HoodieException;
|
||||
import org.apache.hudi.index.HoodieIndex;
|
||||
import org.apache.hudi.index.SparkHoodieIndex;
|
||||
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
|
||||
public abstract class HoodieSparkTable<T extends HoodieRecordPayload>
|
||||
extends HoodieTable<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> {
|
||||
|
||||
protected HoodieSparkTable(HoodieWriteConfig config, HoodieEngineContext context, HoodieTableMetaClient metaClient) {
|
||||
super(config, context, metaClient);
|
||||
}
|
||||
|
||||
public static <T extends HoodieRecordPayload> HoodieSparkTable<T> create(HoodieWriteConfig config, HoodieEngineContext context) {
|
||||
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(
|
||||
context.getHadoopConf().get(),
|
||||
config.getBasePath(),
|
||||
true,
|
||||
config.getConsistencyGuardConfig(),
|
||||
Option.of(new TimelineLayoutVersion(config.getTimelineLayoutVersion()))
|
||||
);
|
||||
return HoodieSparkTable.create(config, (HoodieSparkEngineContext) context, metaClient);
|
||||
}
|
||||
|
||||
public static <T extends HoodieRecordPayload> HoodieSparkTable<T> create(HoodieWriteConfig config,
|
||||
HoodieSparkEngineContext context,
|
||||
HoodieTableMetaClient metaClient) {
|
||||
switch (metaClient.getTableType()) {
|
||||
case COPY_ON_WRITE:
|
||||
return new HoodieSparkCopyOnWriteTable<>(config, context, metaClient);
|
||||
case MERGE_ON_READ:
|
||||
return new HoodieSparkMergeOnReadTable<>(config, context, metaClient);
|
||||
default:
|
||||
throw new HoodieException("Unsupported table type :" + metaClient.getTableType());
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
protected HoodieIndex<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> getIndex(HoodieWriteConfig config) {
|
||||
return SparkHoodieIndex.createIndex(config);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,404 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.table.action.bootstrap;
|
||||
|
||||
import org.apache.hudi.avro.HoodieAvroUtils;
|
||||
import org.apache.hudi.avro.model.HoodieFileStatus;
|
||||
import org.apache.hudi.client.WriteStatus;
|
||||
import org.apache.hudi.client.bootstrap.HoodieBootstrapSchemaProvider;
|
||||
import org.apache.hudi.client.bootstrap.BootstrapMode;
|
||||
import org.apache.hudi.client.bootstrap.BootstrapRecordPayload;
|
||||
import org.apache.hudi.client.bootstrap.BootstrapWriteStatus;
|
||||
import org.apache.hudi.client.bootstrap.FullRecordBootstrapDataProvider;
|
||||
import org.apache.hudi.client.bootstrap.HoodieSparkBootstrapSchemaProvider;
|
||||
import org.apache.hudi.client.bootstrap.selector.BootstrapModeSelector;
|
||||
import org.apache.hudi.client.bootstrap.translator.BootstrapPartitionPathTranslator;
|
||||
import org.apache.hudi.client.common.HoodieSparkEngineContext;
|
||||
import org.apache.hudi.client.utils.SparkMemoryUtils;
|
||||
import org.apache.hudi.common.bootstrap.FileStatusUtils;
|
||||
import org.apache.hudi.common.bootstrap.index.BootstrapIndex;
|
||||
import org.apache.hudi.common.config.TypedProperties;
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.model.BootstrapFileMapping;
|
||||
import org.apache.hudi.common.model.HoodieCommitMetadata;
|
||||
import org.apache.hudi.common.model.HoodieKey;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.common.model.HoodieWriteStat;
|
||||
import org.apache.hudi.common.model.WriteOperationType;
|
||||
import org.apache.hudi.common.table.HoodieTableMetaClient;
|
||||
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
|
||||
import org.apache.hudi.common.table.timeline.HoodieInstant;
|
||||
import org.apache.hudi.common.table.timeline.HoodieInstant.State;
|
||||
import org.apache.hudi.common.table.timeline.HoodieTimeline;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.common.util.ParquetReaderIterator;
|
||||
import org.apache.hudi.common.util.ReflectionUtils;
|
||||
import org.apache.hudi.common.util.ValidationUtils;
|
||||
import org.apache.hudi.common.util.collection.Pair;
|
||||
import org.apache.hudi.common.util.queue.BoundedInMemoryExecutor;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.exception.HoodieCommitException;
|
||||
import org.apache.hudi.exception.HoodieException;
|
||||
import org.apache.hudi.exception.HoodieIOException;
|
||||
import org.apache.hudi.execution.SparkBoundedInMemoryExecutor;
|
||||
import org.apache.hudi.io.HoodieBootstrapHandle;
|
||||
import org.apache.hudi.keygen.KeyGeneratorInterface;
|
||||
import org.apache.hudi.table.HoodieSparkTable;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
import org.apache.hudi.table.action.HoodieWriteMetadata;
|
||||
import org.apache.hudi.table.action.commit.BaseSparkCommitActionExecutor;
|
||||
import org.apache.hudi.table.action.commit.BaseCommitActionExecutor;
|
||||
import org.apache.hudi.table.action.commit.SparkBulkInsertCommitActionExecutor;
|
||||
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.avro.generic.GenericData;
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
import org.apache.avro.generic.IndexedRecord;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.parquet.avro.AvroParquetReader;
|
||||
import org.apache.parquet.avro.AvroReadSupport;
|
||||
import org.apache.parquet.avro.AvroSchemaConverter;
|
||||
import org.apache.parquet.format.converter.ParquetMetadataConverter;
|
||||
import org.apache.parquet.hadoop.ParquetFileReader;
|
||||
import org.apache.parquet.hadoop.ParquetReader;
|
||||
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
|
||||
import org.apache.parquet.schema.MessageType;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.time.Duration;
|
||||
import java.time.Instant;
|
||||
import java.util.Collection;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
public class SparkBootstrapCommitActionExecutor<T extends HoodieRecordPayload<T>>
|
||||
extends BaseCommitActionExecutor<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>, HoodieBootstrapWriteMetadata> {
|
||||
|
||||
private static final Logger LOG = LogManager.getLogger(SparkBootstrapCommitActionExecutor.class);
|
||||
protected String bootstrapSchema = null;
|
||||
private transient FileSystem bootstrapSourceFileSystem;
|
||||
|
||||
public SparkBootstrapCommitActionExecutor(HoodieSparkEngineContext context,
|
||||
HoodieWriteConfig config,
|
||||
HoodieTable<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> table,
|
||||
Option<Map<String, String>> extraMetadata) {
|
||||
super(context, new HoodieWriteConfig.Builder().withProps(config.getProps())
|
||||
.withAutoCommit(true).withWriteStatusClass(BootstrapWriteStatus.class)
|
||||
.withBulkInsertParallelism(config.getBootstrapParallelism())
|
||||
.build(), table, HoodieTimeline.METADATA_BOOTSTRAP_INSTANT_TS, WriteOperationType.BOOTSTRAP,
|
||||
extraMetadata);
|
||||
bootstrapSourceFileSystem = FSUtils.getFs(config.getBootstrapSourceBasePath(), hadoopConf);
|
||||
}
|
||||
|
||||
private void validate() {
|
||||
ValidationUtils.checkArgument(config.getBootstrapSourceBasePath() != null,
|
||||
"Ensure Bootstrap Source Path is set");
|
||||
ValidationUtils.checkArgument(config.getBootstrapModeSelectorClass() != null,
|
||||
"Ensure Bootstrap Partition Selector is set");
|
||||
ValidationUtils.checkArgument(config.getBootstrapKeyGeneratorClass() != null,
|
||||
"Ensure bootstrap key generator class is set");
|
||||
}
|
||||
|
||||
@Override
|
||||
public HoodieBootstrapWriteMetadata execute() {
|
||||
validate();
|
||||
try {
|
||||
HoodieTableMetaClient metaClient = table.getMetaClient();
|
||||
Option<HoodieInstant> completetedInstant =
|
||||
metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants().lastInstant();
|
||||
ValidationUtils.checkArgument(!completetedInstant.isPresent(),
|
||||
"Active Timeline is expected to be empty for bootstrap to be performed. "
|
||||
+ "If you want to re-bootstrap, please rollback bootstrap first !!");
|
||||
Map<BootstrapMode, List<Pair<String, List<HoodieFileStatus>>>> partitionSelections = listAndProcessSourcePartitions();
|
||||
|
||||
// First run metadata bootstrap which will auto commit
|
||||
Option<HoodieWriteMetadata> metadataResult = metadataBootstrap(partitionSelections.get(BootstrapMode.METADATA_ONLY));
|
||||
// if there are full bootstrap to be performed, perform that too
|
||||
Option<HoodieWriteMetadata> fullBootstrapResult = fullBootstrap(partitionSelections.get(BootstrapMode.FULL_RECORD));
|
||||
return new HoodieBootstrapWriteMetadata(metadataResult, fullBootstrapResult);
|
||||
} catch (IOException ioe) {
|
||||
throw new HoodieIOException(ioe.getMessage(), ioe);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
protected String getSchemaToStoreInCommit() {
|
||||
return bootstrapSchema;
|
||||
}
|
||||
|
||||
/**
|
||||
* Perform Metadata Bootstrap.
|
||||
* @param partitionFilesList List of partitions and files within that partitions
|
||||
*/
|
||||
protected Option<HoodieWriteMetadata> metadataBootstrap(List<Pair<String, List<HoodieFileStatus>>> partitionFilesList) {
|
||||
if (null == partitionFilesList || partitionFilesList.isEmpty()) {
|
||||
return Option.empty();
|
||||
}
|
||||
|
||||
HoodieTableMetaClient metaClient = table.getMetaClient();
|
||||
metaClient.getActiveTimeline().createNewInstant(
|
||||
new HoodieInstant(State.REQUESTED, metaClient.getCommitActionType(),
|
||||
HoodieTimeline.METADATA_BOOTSTRAP_INSTANT_TS));
|
||||
|
||||
table.getActiveTimeline().transitionRequestedToInflight(new HoodieInstant(State.REQUESTED,
|
||||
metaClient.getCommitActionType(), HoodieTimeline.METADATA_BOOTSTRAP_INSTANT_TS), Option.empty());
|
||||
|
||||
JavaRDD<BootstrapWriteStatus> bootstrapWriteStatuses = runMetadataBootstrap(partitionFilesList);
|
||||
|
||||
HoodieWriteMetadata<JavaRDD<WriteStatus>> result = new HoodieWriteMetadata<>();
|
||||
updateIndexAndCommitIfNeeded(bootstrapWriteStatuses.map(w -> w), result);
|
||||
return Option.of(result);
|
||||
}
|
||||
|
||||
private void updateIndexAndCommitIfNeeded(JavaRDD<WriteStatus> writeStatusRDD, HoodieWriteMetadata<JavaRDD<WriteStatus>> result) {
|
||||
// cache writeStatusRDD before updating index, so that all actions before this are not triggered again for future
|
||||
// RDD actions that are performed after updating the index.
|
||||
writeStatusRDD = writeStatusRDD.persist(SparkMemoryUtils.getWriteStatusStorageLevel(config.getProps()));
|
||||
Instant indexStartTime = Instant.now();
|
||||
// Update the index back
|
||||
JavaRDD<WriteStatus> statuses = table.getIndex().updateLocation(writeStatusRDD, context,
|
||||
table);
|
||||
result.setIndexUpdateDuration(Duration.between(indexStartTime, Instant.now()));
|
||||
result.setWriteStatuses(statuses);
|
||||
commitOnAutoCommit(result);
|
||||
}
|
||||
|
||||
@Override
|
||||
public HoodieWriteMetadata<JavaRDD<WriteStatus>> execute(JavaRDD<HoodieRecord<T>> inputRecords) {
|
||||
// NO_OP
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void commit(Option<Map<String, String>> extraMetadata, HoodieWriteMetadata<JavaRDD<WriteStatus>> result) {
|
||||
// Perform bootstrap index write and then commit. Make sure both record-key and bootstrap-index
|
||||
// is all done in a single job DAG.
|
||||
Map<String, List<Pair<BootstrapFileMapping, HoodieWriteStat>>> bootstrapSourceAndStats =
|
||||
result.getWriteStatuses().collect().stream()
|
||||
.map(w -> {
|
||||
BootstrapWriteStatus ws = (BootstrapWriteStatus) w;
|
||||
return Pair.of(ws.getBootstrapSourceFileMapping(), ws.getStat());
|
||||
}).collect(Collectors.groupingBy(w -> w.getKey().getPartitionPath()));
|
||||
HoodieTableMetaClient metaClient = table.getMetaClient();
|
||||
try (BootstrapIndex.IndexWriter indexWriter = BootstrapIndex.getBootstrapIndex(metaClient)
|
||||
.createWriter(metaClient.getTableConfig().getBootstrapBasePath().get())) {
|
||||
LOG.info("Starting to write bootstrap index for source " + config.getBootstrapSourceBasePath() + " in table "
|
||||
+ config.getBasePath());
|
||||
indexWriter.begin();
|
||||
bootstrapSourceAndStats.forEach((key, value) -> indexWriter.appendNextPartition(key,
|
||||
value.stream().map(Pair::getKey).collect(Collectors.toList())));
|
||||
indexWriter.finish();
|
||||
LOG.info("Finished writing bootstrap index for source " + config.getBootstrapSourceBasePath() + " in table "
|
||||
+ config.getBasePath());
|
||||
}
|
||||
|
||||
commit(extraMetadata, result, bootstrapSourceAndStats.values().stream()
|
||||
.flatMap(f -> f.stream().map(Pair::getValue)).collect(Collectors.toList()));
|
||||
LOG.info("Committing metadata bootstrap !!");
|
||||
}
|
||||
|
||||
protected void commit(Option<Map<String, String>> extraMetadata, HoodieWriteMetadata<JavaRDD<WriteStatus>> result, List<HoodieWriteStat> stats) {
|
||||
String actionType = table.getMetaClient().getCommitActionType();
|
||||
LOG.info("Committing " + instantTime + ", action Type " + actionType);
|
||||
// Create a Hoodie table which encapsulated the commits and files visible
|
||||
HoodieSparkTable table = HoodieSparkTable.create(config, context);
|
||||
|
||||
HoodieActiveTimeline activeTimeline = table.getActiveTimeline();
|
||||
HoodieCommitMetadata metadata = new HoodieCommitMetadata();
|
||||
|
||||
result.setCommitted(true);
|
||||
stats.forEach(stat -> metadata.addWriteStat(stat.getPartitionPath(), stat));
|
||||
result.setWriteStats(stats);
|
||||
|
||||
// Finalize write
|
||||
finalizeWrite(instantTime, stats, result);
|
||||
|
||||
// add in extra metadata
|
||||
if (extraMetadata.isPresent()) {
|
||||
extraMetadata.get().forEach(metadata::addMetadata);
|
||||
}
|
||||
metadata.addMetadata(HoodieCommitMetadata.SCHEMA_KEY, getSchemaToStoreInCommit());
|
||||
metadata.setOperationType(operationType);
|
||||
|
||||
try {
|
||||
activeTimeline.saveAsComplete(new HoodieInstant(true, actionType, instantTime),
|
||||
Option.of(metadata.toJsonString().getBytes(StandardCharsets.UTF_8)));
|
||||
LOG.info("Committed " + instantTime);
|
||||
} catch (IOException e) {
|
||||
throw new HoodieCommitException("Failed to complete commit " + config.getBasePath() + " at time " + instantTime,
|
||||
e);
|
||||
}
|
||||
result.setCommitMetadata(Option.of(metadata));
|
||||
}
|
||||
|
||||
/**
|
||||
* Perform Full Bootstrap.
|
||||
* @param partitionFilesList List of partitions and files within that partitions
|
||||
*/
|
||||
protected Option<HoodieWriteMetadata> fullBootstrap(List<Pair<String, List<HoodieFileStatus>>> partitionFilesList) {
|
||||
if (null == partitionFilesList || partitionFilesList.isEmpty()) {
|
||||
return Option.empty();
|
||||
}
|
||||
TypedProperties properties = new TypedProperties();
|
||||
properties.putAll(config.getProps());
|
||||
FullRecordBootstrapDataProvider inputProvider =
|
||||
(FullRecordBootstrapDataProvider) ReflectionUtils.loadClass(config.getFullBootstrapInputProvider(),
|
||||
properties, context);
|
||||
JavaRDD<HoodieRecord> inputRecordsRDD =
|
||||
(JavaRDD<HoodieRecord>) inputProvider.generateInputRecords("bootstrap_source", config.getBootstrapSourceBasePath(),
|
||||
partitionFilesList);
|
||||
// Start Full Bootstrap
|
||||
final HoodieInstant requested = new HoodieInstant(State.REQUESTED, table.getMetaClient().getCommitActionType(),
|
||||
HoodieTimeline.FULL_BOOTSTRAP_INSTANT_TS);
|
||||
table.getActiveTimeline().createNewInstant(requested);
|
||||
|
||||
// Setup correct schema and run bulk insert.
|
||||
return Option.of(getBulkInsertActionExecutor(inputRecordsRDD).execute());
|
||||
}
|
||||
|
||||
protected BaseSparkCommitActionExecutor<T> getBulkInsertActionExecutor(JavaRDD<HoodieRecord> inputRecordsRDD) {
|
||||
return new SparkBulkInsertCommitActionExecutor((HoodieSparkEngineContext) context, new HoodieWriteConfig.Builder().withProps(config.getProps())
|
||||
.withSchema(bootstrapSchema).build(), table, HoodieTimeline.FULL_BOOTSTRAP_INSTANT_TS,
|
||||
inputRecordsRDD, extraMetadata);
|
||||
}
|
||||
|
||||
private BootstrapWriteStatus handleMetadataBootstrap(String srcPartitionPath, String partitionPath,
|
||||
HoodieFileStatus srcFileStatus, KeyGeneratorInterface keyGenerator) {
|
||||
|
||||
Path sourceFilePath = FileStatusUtils.toPath(srcFileStatus.getPath());
|
||||
HoodieBootstrapHandle bootstrapHandle = new HoodieBootstrapHandle(config, HoodieTimeline.METADATA_BOOTSTRAP_INSTANT_TS,
|
||||
table, partitionPath, FSUtils.createNewFileIdPfx(), table.getTaskContextSupplier());
|
||||
Schema avroSchema = null;
|
||||
try {
|
||||
ParquetMetadata readFooter = ParquetFileReader.readFooter(table.getHadoopConf(), sourceFilePath,
|
||||
ParquetMetadataConverter.NO_FILTER);
|
||||
MessageType parquetSchema = readFooter.getFileMetaData().getSchema();
|
||||
avroSchema = new AvroSchemaConverter().convert(parquetSchema);
|
||||
Schema recordKeySchema = HoodieAvroUtils.generateProjectionSchema(avroSchema,
|
||||
keyGenerator.getRecordKeyFieldNames());
|
||||
LOG.info("Schema to be used for reading record Keys :" + recordKeySchema);
|
||||
AvroReadSupport.setAvroReadSchema(table.getHadoopConf(), recordKeySchema);
|
||||
AvroReadSupport.setRequestedProjection(table.getHadoopConf(), recordKeySchema);
|
||||
|
||||
BoundedInMemoryExecutor<GenericRecord, HoodieRecord, Void> wrapper = null;
|
||||
try (ParquetReader<IndexedRecord> reader =
|
||||
AvroParquetReader.<IndexedRecord>builder(sourceFilePath).withConf(table.getHadoopConf()).build()) {
|
||||
wrapper = new SparkBoundedInMemoryExecutor<GenericRecord, HoodieRecord, Void>(config,
|
||||
new ParquetReaderIterator(reader), new BootstrapRecordConsumer(bootstrapHandle), inp -> {
|
||||
String recKey = keyGenerator.getKey(inp).getRecordKey();
|
||||
GenericRecord gr = new GenericData.Record(HoodieAvroUtils.RECORD_KEY_SCHEMA);
|
||||
gr.put(HoodieRecord.RECORD_KEY_METADATA_FIELD, recKey);
|
||||
BootstrapRecordPayload payload = new BootstrapRecordPayload(gr);
|
||||
HoodieRecord rec = new HoodieRecord(new HoodieKey(recKey, partitionPath), payload);
|
||||
return rec;
|
||||
});
|
||||
wrapper.execute();
|
||||
} catch (Exception e) {
|
||||
throw new HoodieException(e);
|
||||
} finally {
|
||||
bootstrapHandle.close();
|
||||
if (null != wrapper) {
|
||||
wrapper.shutdownNow();
|
||||
}
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw new HoodieIOException(e.getMessage(), e);
|
||||
}
|
||||
BootstrapWriteStatus writeStatus = (BootstrapWriteStatus)bootstrapHandle.getWriteStatus();
|
||||
BootstrapFileMapping bootstrapFileMapping = new BootstrapFileMapping(
|
||||
config.getBootstrapSourceBasePath(), srcPartitionPath, partitionPath,
|
||||
srcFileStatus, writeStatus.getFileId());
|
||||
writeStatus.setBootstrapSourceFileMapping(bootstrapFileMapping);
|
||||
return writeStatus;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return Bootstrap Mode selections for partitions listed and figure out bootstrap Schema.
|
||||
* @return
|
||||
* @throws IOException
|
||||
*/
|
||||
private Map<BootstrapMode, List<Pair<String, List<HoodieFileStatus>>>> listAndProcessSourcePartitions() throws IOException {
|
||||
List<Pair<String, List<HoodieFileStatus>>> folders = BootstrapUtils.getAllLeafFoldersWithFiles(
|
||||
table.getMetaClient(), bootstrapSourceFileSystem, config.getBootstrapSourceBasePath(), context);
|
||||
|
||||
LOG.info("Fetching Bootstrap Schema !!");
|
||||
HoodieBootstrapSchemaProvider sourceSchemaProvider = new HoodieSparkBootstrapSchemaProvider(config);
|
||||
bootstrapSchema = sourceSchemaProvider.getBootstrapSchema(context, folders).toString();
|
||||
LOG.info("Bootstrap Schema :" + bootstrapSchema);
|
||||
|
||||
BootstrapModeSelector selector =
|
||||
(BootstrapModeSelector) ReflectionUtils.loadClass(config.getBootstrapModeSelectorClass(), config);
|
||||
|
||||
Map<BootstrapMode, List<String>> result = selector.select(folders);
|
||||
Map<String, List<HoodieFileStatus>> partitionToFiles = folders.stream().collect(
|
||||
Collectors.toMap(Pair::getKey, Pair::getValue));
|
||||
|
||||
// Ensure all partitions are accounted for
|
||||
ValidationUtils.checkArgument(partitionToFiles.keySet().equals(
|
||||
result.values().stream().flatMap(Collection::stream).collect(Collectors.toSet())));
|
||||
|
||||
return result.entrySet().stream().map(e -> Pair.of(e.getKey(), e.getValue().stream()
|
||||
.map(p -> Pair.of(p, partitionToFiles.get(p))).collect(Collectors.toList())))
|
||||
.collect(Collectors.toMap(Pair::getKey, Pair::getValue));
|
||||
}
|
||||
|
||||
private JavaRDD<BootstrapWriteStatus> runMetadataBootstrap(List<Pair<String, List<HoodieFileStatus>>> partitions) {
|
||||
JavaSparkContext jsc = HoodieSparkEngineContext.getSparkContext(context);
|
||||
if (null == partitions || partitions.isEmpty()) {
|
||||
return jsc.emptyRDD();
|
||||
}
|
||||
|
||||
TypedProperties properties = new TypedProperties();
|
||||
properties.putAll(config.getProps());
|
||||
KeyGeneratorInterface keyGenerator = (KeyGeneratorInterface) ReflectionUtils.loadClass(config.getBootstrapKeyGeneratorClass(),
|
||||
properties);
|
||||
BootstrapPartitionPathTranslator translator = (BootstrapPartitionPathTranslator) ReflectionUtils.loadClass(
|
||||
config.getBootstrapPartitionPathTranslatorClass(), properties);
|
||||
|
||||
List<Pair<String, Pair<String, HoodieFileStatus>>> bootstrapPaths = partitions.stream()
|
||||
.flatMap(p -> {
|
||||
String translatedPartitionPath = translator.getBootstrapTranslatedPath(p.getKey());
|
||||
return p.getValue().stream().map(f -> Pair.of(p.getKey(), Pair.of(translatedPartitionPath, f)));
|
||||
})
|
||||
.collect(Collectors.toList());
|
||||
|
||||
return jsc.parallelize(bootstrapPaths, config.getBootstrapParallelism())
|
||||
.map(partitionFsPair -> handleMetadataBootstrap(partitionFsPair.getLeft(), partitionFsPair.getRight().getLeft(),
|
||||
partitionFsPair.getRight().getRight(), keyGenerator));
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Iterator<List<WriteStatus>> handleInsert(String idPfx, Iterator<HoodieRecord<T>> recordItr) {
|
||||
throw new UnsupportedOperationException("Should not called in bootstrap code path");
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Iterator<List<WriteStatus>> handleUpdate(String partitionPath, String fileId, Iterator<HoodieRecord<T>> recordItr) {
|
||||
throw new UnsupportedOperationException("Should not called in bootstrap code path");
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,49 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.table.action.bootstrap;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.hudi.client.common.HoodieSparkEngineContext;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.common.table.timeline.HoodieTimeline;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
import org.apache.hudi.table.action.commit.BaseSparkCommitActionExecutor;
|
||||
import org.apache.hudi.table.action.deltacommit.SparkBulkInsertDeltaCommitActionExecutor;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
|
||||
public class SparkBootstrapDeltaCommitActionExecutor<T extends HoodieRecordPayload<T>>
|
||||
extends SparkBootstrapCommitActionExecutor<T> {
|
||||
|
||||
public SparkBootstrapDeltaCommitActionExecutor(HoodieSparkEngineContext context,
|
||||
HoodieWriteConfig config, HoodieTable table,
|
||||
Option<Map<String, String>> extraMetadata) {
|
||||
super(context, config, table, extraMetadata);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected BaseSparkCommitActionExecutor<T> getBulkInsertActionExecutor(JavaRDD<HoodieRecord> inputRecordsRDD) {
|
||||
return new SparkBulkInsertDeltaCommitActionExecutor((HoodieSparkEngineContext) context, new HoodieWriteConfig.Builder().withProps(config.getProps())
|
||||
.withSchema(bootstrapSchema).build(), table, HoodieTimeline.FULL_BOOTSTRAP_INSTANT_TS,
|
||||
inputRecordsRDD, extraMetadata);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,134 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.table.action.clean;
|
||||
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hudi.avro.model.HoodieActionInstant;
|
||||
import org.apache.hudi.avro.model.HoodieCleanerPlan;
|
||||
import org.apache.hudi.client.WriteStatus;
|
||||
import org.apache.hudi.common.HoodieCleanStat;
|
||||
import org.apache.hudi.client.common.HoodieEngineContext;
|
||||
import org.apache.hudi.client.common.HoodieSparkEngineContext;
|
||||
import org.apache.hudi.common.model.CleanFileInfo;
|
||||
import org.apache.hudi.common.model.HoodieKey;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.common.table.timeline.HoodieInstant;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.api.java.function.PairFlatMapFunction;
|
||||
import scala.Tuple2;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@SuppressWarnings("checkstyle:LineLength")
|
||||
public class SparkCleanActionExecutor<T extends HoodieRecordPayload> extends
|
||||
BaseCleanActionExecutor<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> {
|
||||
|
||||
private static final Logger LOG = LogManager.getLogger(SparkCleanActionExecutor.class);
|
||||
|
||||
public SparkCleanActionExecutor(HoodieSparkEngineContext context,
|
||||
HoodieWriteConfig config,
|
||||
HoodieTable<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> table,
|
||||
String instantTime) {
|
||||
super(context, config, table, instantTime);
|
||||
}
|
||||
|
||||
private static PairFlatMapFunction<Iterator<Tuple2<String, CleanFileInfo>>, String, PartitionCleanStat>
|
||||
deleteFilesFunc(HoodieTable table) {
|
||||
return (PairFlatMapFunction<Iterator<Tuple2<String, CleanFileInfo>>, String, PartitionCleanStat>) iter -> {
|
||||
Map<String, PartitionCleanStat> partitionCleanStatMap = new HashMap<>();
|
||||
FileSystem fs = table.getMetaClient().getFs();
|
||||
while (iter.hasNext()) {
|
||||
Tuple2<String, CleanFileInfo> partitionDelFileTuple = iter.next();
|
||||
String partitionPath = partitionDelFileTuple._1();
|
||||
Path deletePath = new Path(partitionDelFileTuple._2().getFilePath());
|
||||
String deletePathStr = deletePath.toString();
|
||||
Boolean deletedFileResult = deleteFileAndGetResult(fs, deletePathStr);
|
||||
if (!partitionCleanStatMap.containsKey(partitionPath)) {
|
||||
partitionCleanStatMap.put(partitionPath, new PartitionCleanStat(partitionPath));
|
||||
}
|
||||
boolean isBootstrapBasePathFile = partitionDelFileTuple._2().isBootstrapBaseFile();
|
||||
PartitionCleanStat partitionCleanStat = partitionCleanStatMap.get(partitionPath);
|
||||
if (isBootstrapBasePathFile) {
|
||||
// For Bootstrap Base file deletions, store the full file path.
|
||||
partitionCleanStat.addDeleteFilePatterns(deletePath.toString(), true);
|
||||
partitionCleanStat.addDeletedFileResult(deletePath.toString(), deletedFileResult, true);
|
||||
} else {
|
||||
partitionCleanStat.addDeleteFilePatterns(deletePath.getName(), false);
|
||||
partitionCleanStat.addDeletedFileResult(deletePath.getName(), deletedFileResult, false);
|
||||
}
|
||||
}
|
||||
return partitionCleanStatMap.entrySet().stream().map(e -> new Tuple2<>(e.getKey(), e.getValue()))
|
||||
.collect(Collectors.toList()).iterator();
|
||||
};
|
||||
}
|
||||
|
||||
@Override
|
||||
List<HoodieCleanStat> clean(HoodieEngineContext context, HoodieCleanerPlan cleanerPlan) {
|
||||
JavaSparkContext jsc = HoodieSparkEngineContext.getSparkContext(context);
|
||||
int cleanerParallelism = Math.min(
|
||||
(int) (cleanerPlan.getFilePathsToBeDeletedPerPartition().values().stream().mapToInt(List::size).count()),
|
||||
config.getCleanerParallelism());
|
||||
LOG.info("Using cleanerParallelism: " + cleanerParallelism);
|
||||
|
||||
context.setJobStatus(this.getClass().getSimpleName(), "Perform cleaning of partitions");
|
||||
List<Tuple2<String, PartitionCleanStat>> partitionCleanStats = jsc
|
||||
.parallelize(cleanerPlan.getFilePathsToBeDeletedPerPartition().entrySet().stream()
|
||||
.flatMap(x -> x.getValue().stream().map(y -> new Tuple2<>(x.getKey(),
|
||||
new CleanFileInfo(y.getFilePath(), y.getIsBootstrapBaseFile()))))
|
||||
.collect(Collectors.toList()), cleanerParallelism)
|
||||
.mapPartitionsToPair(deleteFilesFunc(table))
|
||||
.reduceByKey(PartitionCleanStat::merge).collect();
|
||||
|
||||
Map<String, PartitionCleanStat> partitionCleanStatsMap = partitionCleanStats.stream()
|
||||
.collect(Collectors.toMap(Tuple2::_1, Tuple2::_2));
|
||||
|
||||
// Return PartitionCleanStat for each partition passed.
|
||||
return cleanerPlan.getFilePathsToBeDeletedPerPartition().keySet().stream().map(partitionPath -> {
|
||||
PartitionCleanStat partitionCleanStat = partitionCleanStatsMap.containsKey(partitionPath)
|
||||
? partitionCleanStatsMap.get(partitionPath)
|
||||
: new PartitionCleanStat(partitionPath);
|
||||
HoodieActionInstant actionInstant = cleanerPlan.getEarliestInstantToRetain();
|
||||
return HoodieCleanStat.newBuilder().withPolicy(config.getCleanerPolicy()).withPartitionPath(partitionPath)
|
||||
.withEarliestCommitRetained(Option.ofNullable(
|
||||
actionInstant != null
|
||||
? new HoodieInstant(HoodieInstant.State.valueOf(actionInstant.getState()),
|
||||
actionInstant.getAction(), actionInstant.getTimestamp())
|
||||
: null))
|
||||
.withDeletePathPattern(partitionCleanStat.deletePathPatterns())
|
||||
.withSuccessfulDeletes(partitionCleanStat.successDeleteFiles())
|
||||
.withFailedDeletes(partitionCleanStat.failedDeleteFiles())
|
||||
.withDeleteBootstrapBasePathPatterns(partitionCleanStat.getDeleteBootstrapBasePathPatterns())
|
||||
.withSuccessfulDeleteBootstrapBaseFiles(partitionCleanStat.getSuccessfulDeleteBootstrapBaseFiles())
|
||||
.withFailedDeleteBootstrapBaseFiles(partitionCleanStat.getFailedDeleteBootstrapBaseFiles())
|
||||
.build();
|
||||
}).collect(Collectors.toList());
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,342 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.table.action.commit;
|
||||
|
||||
import org.apache.hudi.client.WriteStatus;
|
||||
import org.apache.hudi.client.utils.SparkMemoryUtils;
|
||||
import org.apache.hudi.client.common.HoodieEngineContext;
|
||||
import org.apache.hudi.common.model.HoodieBaseFile;
|
||||
import org.apache.hudi.common.model.HoodieCommitMetadata;
|
||||
import org.apache.hudi.common.model.HoodieKey;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieRecordLocation;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.common.model.HoodieWriteStat;
|
||||
import org.apache.hudi.common.model.WriteOperationType;
|
||||
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
|
||||
import org.apache.hudi.common.table.timeline.HoodieInstant;
|
||||
import org.apache.hudi.common.util.CommitUtils;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.common.util.collection.Pair;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.exception.HoodieCommitException;
|
||||
import org.apache.hudi.exception.HoodieUpsertException;
|
||||
import org.apache.hudi.execution.SparkLazyInsertIterable;
|
||||
import org.apache.hudi.io.CreateHandleFactory;
|
||||
import org.apache.hudi.io.HoodieMergeHandle;
|
||||
import org.apache.hudi.io.HoodieSortedMergeHandle;
|
||||
import org.apache.hudi.table.HoodieSparkTable;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
import org.apache.hudi.table.WorkloadProfile;
|
||||
import org.apache.hudi.table.WorkloadStat;
|
||||
import org.apache.hudi.table.action.HoodieWriteMetadata;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.spark.Partitioner;
|
||||
import org.apache.spark.api.java.JavaPairRDD;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.storage.StorageLevel;
|
||||
import scala.Tuple2;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.time.Duration;
|
||||
import java.time.Instant;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
public abstract class BaseSparkCommitActionExecutor<T extends HoodieRecordPayload> extends
|
||||
BaseCommitActionExecutor<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>, HoodieWriteMetadata> {
|
||||
|
||||
private static final Logger LOG = LogManager.getLogger(BaseSparkCommitActionExecutor.class);
|
||||
|
||||
public BaseSparkCommitActionExecutor(HoodieEngineContext context,
|
||||
HoodieWriteConfig config,
|
||||
HoodieTable table,
|
||||
String instantTime,
|
||||
WriteOperationType operationType) {
|
||||
super(context, config, table, instantTime, operationType, Option.empty());
|
||||
}
|
||||
|
||||
public BaseSparkCommitActionExecutor(HoodieEngineContext context,
|
||||
HoodieWriteConfig config,
|
||||
HoodieTable table,
|
||||
String instantTime,
|
||||
WriteOperationType operationType,
|
||||
Option extraMetadata) {
|
||||
super(context, config, table, instantTime, operationType, extraMetadata);
|
||||
}
|
||||
|
||||
@Override
|
||||
public HoodieWriteMetadata<JavaRDD<WriteStatus>> execute(JavaRDD<HoodieRecord<T>> inputRecordsRDD) {
|
||||
HoodieWriteMetadata<JavaRDD<WriteStatus>> result = new HoodieWriteMetadata<>();
|
||||
// Cache the tagged records, so we don't end up computing both
|
||||
// TODO: Consistent contract in HoodieWriteClient regarding preppedRecord storage level handling
|
||||
if (inputRecordsRDD.getStorageLevel() == StorageLevel.NONE()) {
|
||||
inputRecordsRDD.persist(StorageLevel.MEMORY_AND_DISK_SER());
|
||||
} else {
|
||||
LOG.info("RDD PreppedRecords was persisted at: " + inputRecordsRDD.getStorageLevel());
|
||||
}
|
||||
|
||||
WorkloadProfile profile = null;
|
||||
if (isWorkloadProfileNeeded()) {
|
||||
profile = new WorkloadProfile(buildProfile(inputRecordsRDD));
|
||||
LOG.info("Workload profile :" + profile);
|
||||
saveWorkloadProfileMetadataToInflight(profile, instantTime);
|
||||
}
|
||||
|
||||
// partition using the insert partitioner
|
||||
final Partitioner partitioner = getPartitioner(profile);
|
||||
JavaRDD<HoodieRecord<T>> partitionedRecords = partition(inputRecordsRDD, partitioner);
|
||||
JavaRDD<WriteStatus> writeStatusRDD = partitionedRecords.mapPartitionsWithIndex((partition, recordItr) -> {
|
||||
if (WriteOperationType.isChangingRecords(operationType)) {
|
||||
return handleUpsertPartition(instantTime, partition, recordItr, partitioner);
|
||||
} else {
|
||||
return handleInsertPartition(instantTime, partition, recordItr, partitioner);
|
||||
}
|
||||
}, true).flatMap(List::iterator);
|
||||
|
||||
updateIndexAndCommitIfNeeded(writeStatusRDD, result);
|
||||
return result;
|
||||
}
|
||||
|
||||
private Pair<HashMap<String, WorkloadStat>, WorkloadStat> buildProfile(JavaRDD<HoodieRecord<T>> inputRecordsRDD) {
|
||||
HashMap<String, WorkloadStat> partitionPathStatMap = new HashMap<>();
|
||||
WorkloadStat globalStat = new WorkloadStat();
|
||||
|
||||
// group the records by partitionPath + currentLocation combination, count the number of
|
||||
// records in each partition
|
||||
Map<Tuple2<String, Option<HoodieRecordLocation>>, Long> partitionLocationCounts = inputRecordsRDD
|
||||
.mapToPair(record -> new Tuple2<>(
|
||||
new Tuple2<>(record.getPartitionPath(), Option.ofNullable(record.getCurrentLocation())), record))
|
||||
.countByKey();
|
||||
|
||||
// count the number of both inserts and updates in each partition, update the counts to workLoadStats
|
||||
for (Map.Entry<Tuple2<String, Option<HoodieRecordLocation>>, Long> e : partitionLocationCounts.entrySet()) {
|
||||
String partitionPath = e.getKey()._1();
|
||||
Long count = e.getValue();
|
||||
Option<HoodieRecordLocation> locOption = e.getKey()._2();
|
||||
|
||||
if (!partitionPathStatMap.containsKey(partitionPath)) {
|
||||
partitionPathStatMap.put(partitionPath, new WorkloadStat());
|
||||
}
|
||||
|
||||
if (locOption.isPresent()) {
|
||||
// update
|
||||
partitionPathStatMap.get(partitionPath).addUpdates(locOption.get(), count);
|
||||
globalStat.addUpdates(locOption.get(), count);
|
||||
} else {
|
||||
// insert
|
||||
partitionPathStatMap.get(partitionPath).addInserts(count);
|
||||
globalStat.addInserts(count);
|
||||
}
|
||||
}
|
||||
return Pair.of(partitionPathStatMap, globalStat);
|
||||
}
|
||||
|
||||
protected Partitioner getPartitioner(WorkloadProfile profile) {
|
||||
if (WriteOperationType.isChangingRecords(operationType)) {
|
||||
return getUpsertPartitioner(profile);
|
||||
} else {
|
||||
return getInsertPartitioner(profile);
|
||||
}
|
||||
}
|
||||
|
||||
private JavaRDD<HoodieRecord<T>> partition(JavaRDD<HoodieRecord<T>> dedupedRecords, Partitioner partitioner) {
|
||||
JavaPairRDD<Tuple2, HoodieRecord<T>> mappedRDD = dedupedRecords.mapToPair(
|
||||
record -> new Tuple2<>(new Tuple2<>(record.getKey(), Option.ofNullable(record.getCurrentLocation())), record));
|
||||
|
||||
JavaPairRDD<Tuple2, HoodieRecord<T>> partitionedRDD;
|
||||
if (table.requireSortedRecords()) {
|
||||
// Partition and sort within each partition as a single step. This is faster than partitioning first and then
|
||||
// applying a sort.
|
||||
Comparator<Tuple2> comparator = (Comparator<Tuple2> & Serializable)(t1, t2) -> {
|
||||
HoodieKey key1 = (HoodieKey) t1._1;
|
||||
HoodieKey key2 = (HoodieKey) t2._1;
|
||||
return key1.getRecordKey().compareTo(key2.getRecordKey());
|
||||
};
|
||||
|
||||
partitionedRDD = mappedRDD.repartitionAndSortWithinPartitions(partitioner, comparator);
|
||||
} else {
|
||||
// Partition only
|
||||
partitionedRDD = mappedRDD.partitionBy(partitioner);
|
||||
}
|
||||
|
||||
return partitionedRDD.map(Tuple2::_2);
|
||||
}
|
||||
|
||||
protected void updateIndexAndCommitIfNeeded(JavaRDD<WriteStatus> writeStatusRDD, HoodieWriteMetadata result) {
|
||||
// cache writeStatusRDD before updating index, so that all actions before this are not triggered again for future
|
||||
// RDD actions that are performed after updating the index.
|
||||
writeStatusRDD = writeStatusRDD.persist(SparkMemoryUtils.getWriteStatusStorageLevel(config.getProps()));
|
||||
Instant indexStartTime = Instant.now();
|
||||
// Update the index back
|
||||
JavaRDD<WriteStatus> statuses = table.getIndex().updateLocation(writeStatusRDD, context, table);
|
||||
result.setIndexUpdateDuration(Duration.between(indexStartTime, Instant.now()));
|
||||
result.setWriteStatuses(statuses);
|
||||
result.setPartitionToReplaceFileIds(getPartitionToReplacedFileIds(statuses));
|
||||
commitOnAutoCommit(result);
|
||||
}
|
||||
|
||||
protected String getCommitActionType() {
|
||||
return table.getMetaClient().getCommitActionType();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void commit(Option<Map<String, String>> extraMetadata, HoodieWriteMetadata<JavaRDD<WriteStatus>> result) {
|
||||
commit(extraMetadata, result, result.getWriteStatuses().map(WriteStatus::getStat).collect());
|
||||
}
|
||||
|
||||
protected void commit(Option<Map<String, String>> extraMetadata, HoodieWriteMetadata<JavaRDD<WriteStatus>> result, List<HoodieWriteStat> writeStats) {
|
||||
String actionType = getCommitActionType();
|
||||
LOG.info("Committing " + instantTime + ", action Type " + actionType);
|
||||
result.setCommitted(true);
|
||||
result.setWriteStats(writeStats);
|
||||
// Finalize write
|
||||
finalizeWrite(instantTime, writeStats, result);
|
||||
|
||||
try {
|
||||
LOG.info("Committing " + instantTime + ", action Type " + getCommitActionType());
|
||||
HoodieActiveTimeline activeTimeline = table.getActiveTimeline();
|
||||
HoodieCommitMetadata metadata = CommitUtils.buildMetadata(writeStats, result.getPartitionToReplaceFileIds(),
|
||||
extraMetadata, operationType, getSchemaToStoreInCommit(), getCommitActionType());
|
||||
|
||||
activeTimeline.saveAsComplete(new HoodieInstant(true, getCommitActionType(), instantTime),
|
||||
Option.of(metadata.toJsonString().getBytes(StandardCharsets.UTF_8)));
|
||||
LOG.info("Committed " + instantTime);
|
||||
result.setCommitMetadata(Option.of(metadata));
|
||||
} catch (IOException e) {
|
||||
throw new HoodieCommitException("Failed to complete commit " + config.getBasePath() + " at time " + instantTime,
|
||||
e);
|
||||
}
|
||||
}
|
||||
|
||||
protected Map<String, List<String>> getPartitionToReplacedFileIds(JavaRDD<WriteStatus> writeStatuses) {
|
||||
return Collections.emptyMap();
|
||||
}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
protected Iterator<List<WriteStatus>> handleUpsertPartition(String instantTime, Integer partition, Iterator recordItr,
|
||||
Partitioner partitioner) {
|
||||
UpsertPartitioner upsertPartitioner = (UpsertPartitioner) partitioner;
|
||||
BucketInfo binfo = upsertPartitioner.getBucketInfo(partition);
|
||||
BucketType btype = binfo.bucketType;
|
||||
try {
|
||||
if (btype.equals(BucketType.INSERT)) {
|
||||
return handleInsert(binfo.fileIdPrefix, recordItr);
|
||||
} else if (btype.equals(BucketType.UPDATE)) {
|
||||
return handleUpdate(binfo.partitionPath, binfo.fileIdPrefix, recordItr);
|
||||
} else {
|
||||
throw new HoodieUpsertException("Unknown bucketType " + btype + " for partition :" + partition);
|
||||
}
|
||||
} catch (Throwable t) {
|
||||
String msg = "Error upserting bucketType " + btype + " for partition :" + partition;
|
||||
LOG.error(msg, t);
|
||||
throw new HoodieUpsertException(msg, t);
|
||||
}
|
||||
}
|
||||
|
||||
protected Iterator<List<WriteStatus>> handleInsertPartition(String instantTime, Integer partition, Iterator recordItr,
|
||||
Partitioner partitioner) {
|
||||
return handleUpsertPartition(instantTime, partition, recordItr, partitioner);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Iterator<List<WriteStatus>> handleUpdate(String partitionPath, String fileId,
|
||||
Iterator<HoodieRecord<T>> recordItr)
|
||||
throws IOException {
|
||||
// This is needed since sometimes some buckets are never picked in getPartition() and end up with 0 records
|
||||
if (!recordItr.hasNext()) {
|
||||
LOG.info("Empty partition with fileId => " + fileId);
|
||||
return Collections.singletonList((List<WriteStatus>) Collections.EMPTY_LIST).iterator();
|
||||
}
|
||||
// these are updates
|
||||
HoodieMergeHandle upsertHandle = getUpdateHandle(partitionPath, fileId, recordItr);
|
||||
return handleUpdateInternal(upsertHandle, fileId);
|
||||
}
|
||||
|
||||
public Iterator<List<WriteStatus>> handleUpdate(String partitionPath, String fileId,
|
||||
Map<String, HoodieRecord<T>> keyToNewRecords,
|
||||
HoodieBaseFile oldDataFile) throws IOException {
|
||||
// these are updates
|
||||
HoodieMergeHandle upsertHandle = getUpdateHandle(partitionPath, fileId, keyToNewRecords, oldDataFile);
|
||||
return handleUpdateInternal(upsertHandle, fileId);
|
||||
}
|
||||
|
||||
protected Iterator<List<WriteStatus>> handleUpdateInternal(HoodieMergeHandle upsertHandle, String fileId)
|
||||
throws IOException {
|
||||
if (upsertHandle.getOldFilePath() == null) {
|
||||
throw new HoodieUpsertException(
|
||||
"Error in finding the old file path at commit " + instantTime + " for fileId: " + fileId);
|
||||
} else {
|
||||
SparkMergeHelper.newInstance().runMerge(table, upsertHandle);
|
||||
}
|
||||
|
||||
// TODO(vc): This needs to be revisited
|
||||
if (upsertHandle.getWriteStatus().getPartitionPath() == null) {
|
||||
LOG.info("Upsert Handle has partition path as null " + upsertHandle.getOldFilePath() + ", "
|
||||
+ upsertHandle.getWriteStatus());
|
||||
}
|
||||
return Collections.singletonList(Collections.singletonList(upsertHandle.getWriteStatus())).iterator();
|
||||
}
|
||||
|
||||
protected HoodieMergeHandle getUpdateHandle(String partitionPath, String fileId, Iterator<HoodieRecord<T>> recordItr) {
|
||||
if (table.requireSortedRecords()) {
|
||||
return new HoodieSortedMergeHandle<>(config, instantTime, (HoodieSparkTable) table, recordItr, partitionPath, fileId, taskContextSupplier);
|
||||
} else {
|
||||
return new HoodieMergeHandle<>(config, instantTime, table, recordItr, partitionPath, fileId, taskContextSupplier);
|
||||
}
|
||||
}
|
||||
|
||||
protected HoodieMergeHandle getUpdateHandle(String partitionPath, String fileId,
|
||||
Map<String, HoodieRecord<T>> keyToNewRecords,
|
||||
HoodieBaseFile dataFileToBeMerged) {
|
||||
return new HoodieMergeHandle<>(config, instantTime, table, keyToNewRecords,
|
||||
partitionPath, fileId, dataFileToBeMerged, taskContextSupplier);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Iterator<List<WriteStatus>> handleInsert(String idPfx, Iterator<HoodieRecord<T>> recordItr)
|
||||
throws Exception {
|
||||
// This is needed since sometimes some buckets are never picked in getPartition() and end up with 0 records
|
||||
if (!recordItr.hasNext()) {
|
||||
LOG.info("Empty partition");
|
||||
return Collections.singletonList((List<WriteStatus>) Collections.EMPTY_LIST).iterator();
|
||||
}
|
||||
return new SparkLazyInsertIterable(recordItr, true, config, instantTime, table, idPfx,
|
||||
taskContextSupplier, new CreateHandleFactory<>());
|
||||
}
|
||||
|
||||
public Partitioner getUpsertPartitioner(WorkloadProfile profile) {
|
||||
if (profile == null) {
|
||||
throw new HoodieUpsertException("Need workload profile to construct the upsert partitioner.");
|
||||
}
|
||||
return new UpsertPartitioner(profile, context, table, config);
|
||||
}
|
||||
|
||||
public Partitioner getInsertPartitioner(WorkloadProfile profile) {
|
||||
return getUpsertPartitioner(profile);
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,68 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.table.action.commit;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.hudi.client.WriteStatus;
|
||||
import org.apache.hudi.client.common.HoodieSparkEngineContext;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.common.model.WriteOperationType;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.exception.HoodieInsertException;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
import org.apache.hudi.table.BulkInsertPartitioner;
|
||||
|
||||
import org.apache.hudi.table.action.HoodieWriteMetadata;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
|
||||
public class SparkBulkInsertCommitActionExecutor<T extends HoodieRecordPayload<T>> extends BaseSparkCommitActionExecutor<T> {
|
||||
|
||||
private final JavaRDD<HoodieRecord<T>> inputRecordsRDD;
|
||||
private final Option<BulkInsertPartitioner<T>> bulkInsertPartitioner;
|
||||
|
||||
public SparkBulkInsertCommitActionExecutor(HoodieSparkEngineContext context, HoodieWriteConfig config, HoodieTable table,
|
||||
String instantTime, JavaRDD<HoodieRecord<T>> inputRecordsRDD,
|
||||
Option<BulkInsertPartitioner<T>> bulkInsertPartitioner) {
|
||||
this(context, config, table, instantTime, inputRecordsRDD, bulkInsertPartitioner, Option.empty());
|
||||
}
|
||||
|
||||
public SparkBulkInsertCommitActionExecutor(HoodieSparkEngineContext context, HoodieWriteConfig config, HoodieTable table,
|
||||
String instantTime, JavaRDD<HoodieRecord<T>> inputRecordsRDD,
|
||||
Option<BulkInsertPartitioner<T>> bulkInsertPartitioner,
|
||||
Option<Map<String, String>> extraMetadata) {
|
||||
super(context, config, table, instantTime, WriteOperationType.BULK_INSERT, extraMetadata);
|
||||
this.inputRecordsRDD = inputRecordsRDD;
|
||||
this.bulkInsertPartitioner = bulkInsertPartitioner;
|
||||
}
|
||||
|
||||
@Override
|
||||
public HoodieWriteMetadata<JavaRDD<WriteStatus>> execute() {
|
||||
try {
|
||||
return SparkBulkInsertHelper.newInstance().bulkInsert(inputRecordsRDD, instantTime, table, config,
|
||||
this, true, bulkInsertPartitioner);
|
||||
} catch (HoodieInsertException ie) {
|
||||
throw ie;
|
||||
} catch (Throwable e) {
|
||||
throw new HoodieInsertException("Failed to bulk insert for commit time " + instantTime, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,102 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.table.action.commit;
|
||||
|
||||
import org.apache.hudi.client.WriteStatus;
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.model.HoodieKey;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.common.table.timeline.HoodieInstant;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.execution.bulkinsert.BulkInsertInternalPartitionerFactory;
|
||||
import org.apache.hudi.execution.bulkinsert.BulkInsertMapFunction;
|
||||
import org.apache.hudi.table.BulkInsertPartitioner;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
import org.apache.hudi.table.action.HoodieWriteMetadata;
|
||||
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.IntStream;
|
||||
|
||||
/**
|
||||
* A spark implementation of {@link AbstractBulkInsertHelper}.
|
||||
*
|
||||
* @param <T>
|
||||
*/
|
||||
@SuppressWarnings("checkstyle:LineLength")
|
||||
public class SparkBulkInsertHelper<T extends HoodieRecordPayload, R> extends AbstractBulkInsertHelper<T, JavaRDD<HoodieRecord<T>>,
|
||||
JavaRDD<HoodieKey>, JavaRDD<WriteStatus>, R> {
|
||||
|
||||
private SparkBulkInsertHelper() {
|
||||
}
|
||||
|
||||
private static class BulkInsertHelperHolder {
|
||||
private static final SparkBulkInsertHelper SPARK_BULK_INSERT_HELPER = new SparkBulkInsertHelper();
|
||||
}
|
||||
|
||||
public static SparkBulkInsertHelper newInstance() {
|
||||
return BulkInsertHelperHolder.SPARK_BULK_INSERT_HELPER;
|
||||
}
|
||||
|
||||
@Override
|
||||
public HoodieWriteMetadata<JavaRDD<WriteStatus>> bulkInsert(JavaRDD<HoodieRecord<T>> inputRecords,
|
||||
String instantTime,
|
||||
HoodieTable<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> table,
|
||||
HoodieWriteConfig config,
|
||||
BaseCommitActionExecutor<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>, R> executor,
|
||||
boolean performDedupe,
|
||||
Option<BulkInsertPartitioner<T>> userDefinedBulkInsertPartitioner) {
|
||||
HoodieWriteMetadata result = new HoodieWriteMetadata();
|
||||
|
||||
// De-dupe/merge if needed
|
||||
JavaRDD<HoodieRecord<T>> dedupedRecords = inputRecords;
|
||||
|
||||
if (performDedupe) {
|
||||
dedupedRecords = (JavaRDD<HoodieRecord<T>>) SparkWriteHelper.newInstance().combineOnCondition(config.shouldCombineBeforeInsert(), inputRecords,
|
||||
config.getBulkInsertShuffleParallelism(), table);
|
||||
}
|
||||
|
||||
final JavaRDD<HoodieRecord<T>> repartitionedRecords;
|
||||
final int parallelism = config.getBulkInsertShuffleParallelism();
|
||||
BulkInsertPartitioner partitioner = userDefinedBulkInsertPartitioner.isPresent()
|
||||
? userDefinedBulkInsertPartitioner.get()
|
||||
: BulkInsertInternalPartitionerFactory.get(config.getBulkInsertSortMode());
|
||||
repartitionedRecords = (JavaRDD<HoodieRecord<T>>) partitioner.repartitionRecords(dedupedRecords, parallelism);
|
||||
|
||||
// generate new file ID prefixes for each output partition
|
||||
final List<String> fileIDPrefixes =
|
||||
IntStream.range(0, parallelism).mapToObj(i -> FSUtils.createNewFileIdPfx()).collect(Collectors.toList());
|
||||
|
||||
table.getActiveTimeline().transitionRequestedToInflight(new HoodieInstant(HoodieInstant.State.REQUESTED,
|
||||
table.getMetaClient().getCommitActionType(), instantTime), Option.empty(),
|
||||
config.shouldAllowMultiWriteOnSameInstant());
|
||||
|
||||
JavaRDD<WriteStatus> writeStatusRDD = repartitionedRecords
|
||||
.mapPartitionsWithIndex(new BulkInsertMapFunction<T>(instantTime,
|
||||
partitioner.arePartitionRecordsSorted(), config, table, fileIDPrefixes), true)
|
||||
.flatMap(List::iterator);
|
||||
|
||||
((BaseSparkCommitActionExecutor) executor).updateIndexAndCommitIfNeeded(writeStatusRDD, result);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,63 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.table.action.commit;
|
||||
|
||||
import org.apache.hudi.client.WriteStatus;
|
||||
import org.apache.hudi.client.common.HoodieSparkEngineContext;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.common.model.WriteOperationType;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.exception.HoodieInsertException;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
import org.apache.hudi.table.BulkInsertPartitioner;
|
||||
|
||||
import org.apache.hudi.table.action.HoodieWriteMetadata;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
|
||||
public class SparkBulkInsertPreppedCommitActionExecutor<T extends HoodieRecordPayload<T>>
|
||||
extends BaseSparkCommitActionExecutor<T> {
|
||||
|
||||
private final JavaRDD<HoodieRecord<T>> preppedInputRecordRdd;
|
||||
private final Option<BulkInsertPartitioner<T>> userDefinedBulkInsertPartitioner;
|
||||
|
||||
public SparkBulkInsertPreppedCommitActionExecutor(HoodieSparkEngineContext context,
|
||||
HoodieWriteConfig config, HoodieTable table,
|
||||
String instantTime, JavaRDD<HoodieRecord<T>> preppedInputRecordRdd,
|
||||
Option<BulkInsertPartitioner<T>> userDefinedBulkInsertPartitioner) {
|
||||
super(context, config, table, instantTime, WriteOperationType.BULK_INSERT);
|
||||
this.preppedInputRecordRdd = preppedInputRecordRdd;
|
||||
this.userDefinedBulkInsertPartitioner = userDefinedBulkInsertPartitioner;
|
||||
}
|
||||
|
||||
@Override
|
||||
public HoodieWriteMetadata<JavaRDD<WriteStatus>> execute() {
|
||||
try {
|
||||
return SparkBulkInsertHelper.newInstance().bulkInsert(preppedInputRecordRdd, instantTime, table, config,
|
||||
this, false, userDefinedBulkInsertPartitioner);
|
||||
} catch (Throwable e) {
|
||||
if (e instanceof HoodieInsertException) {
|
||||
throw e;
|
||||
}
|
||||
throw new HoodieInsertException("Failed to bulk insert for commit time " + instantTime, e);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,48 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.table.action.commit;
|
||||
|
||||
import org.apache.hudi.client.WriteStatus;
|
||||
import org.apache.hudi.client.common.HoodieSparkEngineContext;
|
||||
import org.apache.hudi.common.model.HoodieKey;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.common.model.WriteOperationType;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
|
||||
import org.apache.hudi.table.action.HoodieWriteMetadata;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
|
||||
public class SparkDeleteCommitActionExecutor<T extends HoodieRecordPayload<T>>
|
||||
extends BaseSparkCommitActionExecutor<T> {
|
||||
|
||||
private final JavaRDD<HoodieKey> keys;
|
||||
|
||||
public SparkDeleteCommitActionExecutor(HoodieSparkEngineContext context,
|
||||
HoodieWriteConfig config, HoodieTable table,
|
||||
String instantTime, JavaRDD<HoodieKey> keys) {
|
||||
super(context, config, table, instantTime, WriteOperationType.DELETE);
|
||||
this.keys = keys;
|
||||
}
|
||||
|
||||
@Override
|
||||
public HoodieWriteMetadata<JavaRDD<WriteStatus>> execute() {
|
||||
return SparkDeleteHelper.newInstance().execute(instantTime, keys, context, config, table, this);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,123 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.table.action.commit;
|
||||
|
||||
import org.apache.hudi.client.WriteStatus;
|
||||
import org.apache.hudi.client.common.HoodieEngineContext;
|
||||
import org.apache.hudi.client.common.HoodieSparkEngineContext;
|
||||
import org.apache.hudi.common.model.EmptyHoodieRecordPayload;
|
||||
import org.apache.hudi.common.model.HoodieKey;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.common.util.collection.Pair;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.exception.HoodieUpsertException;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
import org.apache.hudi.table.WorkloadProfile;
|
||||
import org.apache.hudi.table.WorkloadStat;
|
||||
import org.apache.hudi.table.action.HoodieWriteMetadata;
|
||||
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
|
||||
import java.time.Duration;
|
||||
import java.time.Instant;
|
||||
import java.util.HashMap;
|
||||
|
||||
/**
|
||||
* A spark implementation of {@link AbstractDeleteHelper}.
|
||||
*
|
||||
* @param <T>
|
||||
*/
|
||||
@SuppressWarnings("checkstyle:LineLength")
|
||||
public class SparkDeleteHelper<T extends HoodieRecordPayload,R> extends
|
||||
AbstractDeleteHelper<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>, R> {
|
||||
private SparkDeleteHelper() {
|
||||
}
|
||||
|
||||
private static class DeleteHelperHolder {
|
||||
private static final SparkDeleteHelper SPARK_DELETE_HELPER = new SparkDeleteHelper();
|
||||
}
|
||||
|
||||
public static SparkDeleteHelper newInstance() {
|
||||
return DeleteHelperHolder.SPARK_DELETE_HELPER;
|
||||
}
|
||||
|
||||
@Override
|
||||
public JavaRDD<HoodieKey> deduplicateKeys(JavaRDD<HoodieKey> keys, HoodieTable<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> table, int parallelism) {
|
||||
boolean isIndexingGlobal = table.getIndex().isGlobal();
|
||||
if (isIndexingGlobal) {
|
||||
return keys.keyBy(HoodieKey::getRecordKey)
|
||||
.reduceByKey((key1, key2) -> key1, parallelism)
|
||||
.values();
|
||||
} else {
|
||||
return keys.distinct(parallelism);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public HoodieWriteMetadata<JavaRDD<WriteStatus>> execute(String instantTime,
|
||||
JavaRDD<HoodieKey> keys,
|
||||
HoodieEngineContext context,
|
||||
HoodieWriteConfig config,
|
||||
HoodieTable<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> table,
|
||||
BaseCommitActionExecutor<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>, R> deleteExecutor) {
|
||||
JavaSparkContext jsc = HoodieSparkEngineContext.getSparkContext(context);
|
||||
|
||||
try {
|
||||
HoodieWriteMetadata result = null;
|
||||
JavaRDD<HoodieKey> dedupedKeys = keys;
|
||||
final int parallelism = config.getDeleteShuffleParallelism();
|
||||
if (config.shouldCombineBeforeDelete()) {
|
||||
// De-dupe/merge if needed
|
||||
dedupedKeys = deduplicateKeys(keys, table, parallelism);
|
||||
} else if (!keys.partitions().isEmpty()) {
|
||||
dedupedKeys = keys.repartition(parallelism);
|
||||
}
|
||||
|
||||
JavaRDD<HoodieRecord<T>> dedupedRecords =
|
||||
dedupedKeys.map(key -> new HoodieRecord(key, new EmptyHoodieRecordPayload()));
|
||||
Instant beginTag = Instant.now();
|
||||
// perform index loop up to get existing location of records
|
||||
JavaRDD<HoodieRecord<T>> taggedRecords =
|
||||
table.getIndex().tagLocation(dedupedRecords, context, table);
|
||||
Duration tagLocationDuration = Duration.between(beginTag, Instant.now());
|
||||
|
||||
// filter out non existent keys/records
|
||||
JavaRDD<HoodieRecord<T>> taggedValidRecords = taggedRecords.filter(HoodieRecord::isCurrentLocationKnown);
|
||||
if (!taggedValidRecords.isEmpty()) {
|
||||
result = deleteExecutor.execute(taggedValidRecords);
|
||||
result.setIndexLookupDuration(tagLocationDuration);
|
||||
} else {
|
||||
// if entire set of keys are non existent
|
||||
deleteExecutor.saveWorkloadProfileMetadataToInflight(new WorkloadProfile(Pair.of(new HashMap<>(), new WorkloadStat())), instantTime);
|
||||
result = new HoodieWriteMetadata();
|
||||
result.setWriteStatuses(jsc.emptyRDD());
|
||||
deleteExecutor.commitOnAutoCommit(result);
|
||||
}
|
||||
return result;
|
||||
} catch (Throwable e) {
|
||||
if (e instanceof HoodieUpsertException) {
|
||||
throw (HoodieUpsertException) e;
|
||||
}
|
||||
throw new HoodieUpsertException("Failed to delete for commit time " + instantTime, e);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,49 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.table.action.commit;
|
||||
|
||||
import org.apache.hudi.client.WriteStatus;
|
||||
import org.apache.hudi.client.common.HoodieSparkEngineContext;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.common.model.WriteOperationType;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
|
||||
import org.apache.hudi.table.action.HoodieWriteMetadata;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
|
||||
public class SparkInsertCommitActionExecutor<T extends HoodieRecordPayload<T>>
|
||||
extends BaseSparkCommitActionExecutor<T> {
|
||||
|
||||
private final JavaRDD<HoodieRecord<T>> inputRecordsRDD;
|
||||
|
||||
public SparkInsertCommitActionExecutor(HoodieSparkEngineContext context,
|
||||
HoodieWriteConfig config, HoodieTable table,
|
||||
String instantTime, JavaRDD<HoodieRecord<T>> inputRecordsRDD) {
|
||||
super(context, config, table, instantTime, WriteOperationType.INSERT);
|
||||
this.inputRecordsRDD = inputRecordsRDD;
|
||||
}
|
||||
|
||||
@Override
|
||||
public HoodieWriteMetadata<JavaRDD<WriteStatus>> execute() {
|
||||
return SparkWriteHelper.newInstance().write(instantTime, inputRecordsRDD, context, table,
|
||||
config.shouldCombineBeforeInsert(), config.getInsertShuffleParallelism(), this, false);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,80 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.table.action.commit;
|
||||
|
||||
import org.apache.hudi.client.WriteStatus;
|
||||
import org.apache.hudi.client.common.HoodieEngineContext;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.common.model.WriteOperationType;
|
||||
import org.apache.hudi.common.table.timeline.HoodieTimeline;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
import org.apache.hudi.table.WorkloadProfile;
|
||||
import org.apache.hudi.table.action.HoodieWriteMetadata;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.spark.Partitioner;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import scala.Tuple2;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
public class SparkInsertOverwriteCommitActionExecutor<T extends HoodieRecordPayload<T>>
|
||||
extends BaseSparkCommitActionExecutor<T> {
|
||||
|
||||
private static final Logger LOG = LogManager.getLogger(SparkInsertOverwriteCommitActionExecutor.class);
|
||||
|
||||
private final JavaRDD<HoodieRecord<T>> inputRecordsRDD;
|
||||
|
||||
public SparkInsertOverwriteCommitActionExecutor(HoodieEngineContext context,
|
||||
HoodieWriteConfig config, HoodieTable table,
|
||||
String instantTime, JavaRDD<HoodieRecord<T>> inputRecordsRDD) {
|
||||
super(context, config, table, instantTime, WriteOperationType.INSERT_OVERWRITE);
|
||||
this.inputRecordsRDD = inputRecordsRDD;
|
||||
}
|
||||
|
||||
@Override
|
||||
public HoodieWriteMetadata execute() {
|
||||
return SparkWriteHelper.newInstance().write(instantTime, inputRecordsRDD, context, table,
|
||||
config.shouldCombineBeforeInsert(), config.getInsertShuffleParallelism(), this, false);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Partitioner getPartitioner(WorkloadProfile profile) {
|
||||
return new SparkInsertOverwritePartitioner(profile, context, table, config);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected String getCommitActionType() {
|
||||
return HoodieTimeline.REPLACE_COMMIT_ACTION;
|
||||
}
|
||||
|
||||
protected Map<String, List<String>> getPartitionToReplacedFileIds(JavaRDD<WriteStatus> writeStatuses) {
|
||||
return writeStatuses.map(status -> status.getStat().getPartitionPath()).distinct().mapToPair(partitionPath ->
|
||||
new Tuple2<>(partitionPath, getAllExistingFileIds(partitionPath))).collectAsMap();
|
||||
}
|
||||
|
||||
private List<String> getAllExistingFileIds(String partitionPath) {
|
||||
// because new commit is not complete. it is safe to mark all existing file Ids as old files
|
||||
return table.getSliceView().getLatestFileSlices(partitionPath).map(fg -> fg.getFileId()).distinct().collect(Collectors.toList());
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,50 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.table.action.commit;
|
||||
|
||||
import org.apache.hudi.client.common.HoodieEngineContext;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
import org.apache.hudi.table.WorkloadProfile;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Packs incoming records to be inserted into buckets (1 bucket = 1 RDD partition).
|
||||
*/
|
||||
public class SparkInsertOverwritePartitioner extends UpsertPartitioner {
|
||||
|
||||
private static final Logger LOG = LogManager.getLogger(SparkInsertOverwritePartitioner.class);
|
||||
|
||||
public SparkInsertOverwritePartitioner(WorkloadProfile profile, HoodieEngineContext context, HoodieTable table,
|
||||
HoodieWriteConfig config) {
|
||||
super(profile, context, table, config);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a list of small files in the given partition path.
|
||||
*/
|
||||
protected List<SmallFile> getSmallFiles(String partitionPath) {
|
||||
// for overwrite, we ignore all existing files. So dont consider any file to be smallFiles
|
||||
return Collections.emptyList();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,48 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.table.action.commit;
|
||||
|
||||
import org.apache.hudi.client.WriteStatus;
|
||||
import org.apache.hudi.client.common.HoodieSparkEngineContext;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.common.model.WriteOperationType;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
|
||||
import org.apache.hudi.table.action.HoodieWriteMetadata;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
|
||||
public class SparkInsertPreppedCommitActionExecutor<T extends HoodieRecordPayload<T>>
|
||||
extends BaseSparkCommitActionExecutor<T> {
|
||||
|
||||
private final JavaRDD<HoodieRecord<T>> preppedRecords;
|
||||
|
||||
public SparkInsertPreppedCommitActionExecutor(HoodieSparkEngineContext context,
|
||||
HoodieWriteConfig config, HoodieTable table,
|
||||
String instantTime, JavaRDD<HoodieRecord<T>> preppedRecords) {
|
||||
super(context, config, table, instantTime, WriteOperationType.INSERT_PREPPED);
|
||||
this.preppedRecords = preppedRecords;
|
||||
}
|
||||
|
||||
@Override
|
||||
public HoodieWriteMetadata<JavaRDD<WriteStatus>> execute() {
|
||||
return super.execute(preppedRecords);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,110 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.table.action.commit;
|
||||
|
||||
import org.apache.hudi.client.WriteStatus;
|
||||
import org.apache.hudi.common.model.HoodieBaseFile;
|
||||
import org.apache.hudi.common.model.HoodieKey;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.common.util.queue.BoundedInMemoryExecutor;
|
||||
import org.apache.hudi.exception.HoodieException;
|
||||
import org.apache.hudi.execution.SparkBoundedInMemoryExecutor;
|
||||
import org.apache.hudi.io.HoodieMergeHandle;
|
||||
import org.apache.hudi.io.storage.HoodieFileReader;
|
||||
import org.apache.hudi.io.storage.HoodieFileReaderFactory;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.avro.generic.GenericDatumReader;
|
||||
import org.apache.avro.generic.GenericDatumWriter;
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
import org.apache.avro.io.BinaryDecoder;
|
||||
import org.apache.avro.io.BinaryEncoder;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Iterator;
|
||||
|
||||
public class SparkMergeHelper<T extends HoodieRecordPayload> extends AbstractMergeHelper<T, JavaRDD<HoodieRecord<T>>,
|
||||
JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> {
|
||||
|
||||
private static class MergeHelperHolder {
|
||||
private static final SparkMergeHelper SPARK_MERGE_HELPER = new SparkMergeHelper();
|
||||
}
|
||||
|
||||
public static SparkMergeHelper newInstance() {
|
||||
return SparkMergeHelper.MergeHelperHolder.SPARK_MERGE_HELPER;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void runMerge(HoodieTable<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> table,
|
||||
HoodieMergeHandle<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> upsertHandle) throws IOException {
|
||||
final boolean externalSchemaTransformation = table.getConfig().shouldUseExternalSchemaTransformation();
|
||||
Configuration cfgForHoodieFile = new Configuration(table.getHadoopConf());
|
||||
HoodieMergeHandle<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> mergeHandle = upsertHandle;
|
||||
HoodieBaseFile baseFile = mergeHandle.baseFileForMerge();
|
||||
|
||||
final GenericDatumWriter<GenericRecord> gWriter;
|
||||
final GenericDatumReader<GenericRecord> gReader;
|
||||
Schema readSchema;
|
||||
if (externalSchemaTransformation || baseFile.getBootstrapBaseFile().isPresent()) {
|
||||
readSchema = HoodieFileReaderFactory.getFileReader(table.getHadoopConf(), mergeHandle.getOldFilePath()).getSchema();
|
||||
gWriter = new GenericDatumWriter<>(readSchema);
|
||||
gReader = new GenericDatumReader<>(readSchema, mergeHandle.getWriterSchemaWithMetafields());
|
||||
} else {
|
||||
gReader = null;
|
||||
gWriter = null;
|
||||
readSchema = mergeHandle.getWriterSchemaWithMetafields();
|
||||
}
|
||||
|
||||
BoundedInMemoryExecutor<GenericRecord, GenericRecord, Void> wrapper = null;
|
||||
HoodieFileReader<GenericRecord> reader = HoodieFileReaderFactory.<T, GenericRecord>getFileReader(cfgForHoodieFile, mergeHandle.getOldFilePath());
|
||||
try {
|
||||
final Iterator<GenericRecord> readerIterator;
|
||||
if (baseFile.getBootstrapBaseFile().isPresent()) {
|
||||
readerIterator = getMergingIterator(table, mergeHandle, baseFile, reader, readSchema, externalSchemaTransformation);
|
||||
} else {
|
||||
readerIterator = reader.getRecordIterator(readSchema);
|
||||
}
|
||||
|
||||
ThreadLocal<BinaryEncoder> encoderCache = new ThreadLocal<>();
|
||||
ThreadLocal<BinaryDecoder> decoderCache = new ThreadLocal<>();
|
||||
wrapper = new SparkBoundedInMemoryExecutor(table.getConfig(), readerIterator,
|
||||
new UpdateHandler(mergeHandle), record -> {
|
||||
if (!externalSchemaTransformation) {
|
||||
return record;
|
||||
}
|
||||
return transformRecordBasedOnNewSchema(gReader, gWriter, encoderCache, decoderCache, (GenericRecord) record);
|
||||
});
|
||||
wrapper.execute();
|
||||
} catch (Exception e) {
|
||||
throw new HoodieException(e);
|
||||
} finally {
|
||||
if (reader != null) {
|
||||
reader.close();
|
||||
}
|
||||
mergeHandle.close();
|
||||
if (null != wrapper) {
|
||||
wrapper.shutdownNow();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,49 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.table.action.commit;
|
||||
|
||||
import org.apache.hudi.client.WriteStatus;
|
||||
import org.apache.hudi.client.common.HoodieSparkEngineContext;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.common.model.WriteOperationType;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
|
||||
import org.apache.hudi.table.action.HoodieWriteMetadata;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
|
||||
public class SparkUpsertCommitActionExecutor<T extends HoodieRecordPayload<T>>
|
||||
extends BaseSparkCommitActionExecutor<T> {
|
||||
|
||||
private JavaRDD<HoodieRecord<T>> inputRecordsRDD;
|
||||
|
||||
public SparkUpsertCommitActionExecutor(HoodieSparkEngineContext context,
|
||||
HoodieWriteConfig config, HoodieTable table,
|
||||
String instantTime, JavaRDD<HoodieRecord<T>> inputRecordsRDD) {
|
||||
super(context, config, table, instantTime, WriteOperationType.UPSERT);
|
||||
this.inputRecordsRDD = inputRecordsRDD;
|
||||
}
|
||||
|
||||
@Override
|
||||
public HoodieWriteMetadata<JavaRDD<WriteStatus>> execute() {
|
||||
return SparkWriteHelper.newInstance().write(instantTime, inputRecordsRDD, context, table,
|
||||
config.shouldCombineBeforeUpsert(), config.getUpsertShuffleParallelism(), this, true);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,48 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.table.action.commit;
|
||||
|
||||
import org.apache.hudi.client.WriteStatus;
|
||||
import org.apache.hudi.client.common.HoodieSparkEngineContext;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.common.model.WriteOperationType;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
|
||||
import org.apache.hudi.table.action.HoodieWriteMetadata;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
|
||||
public class SparkUpsertPreppedCommitActionExecutor<T extends HoodieRecordPayload<T>>
|
||||
extends BaseSparkCommitActionExecutor<T> {
|
||||
|
||||
private final JavaRDD<HoodieRecord<T>> preppedRecords;
|
||||
|
||||
public SparkUpsertPreppedCommitActionExecutor(HoodieSparkEngineContext context,
|
||||
HoodieWriteConfig config, HoodieTable table,
|
||||
String instantTime, JavaRDD<HoodieRecord<T>> preppedRecords) {
|
||||
super(context, config, table, instantTime, WriteOperationType.UPSERT_PREPPED);
|
||||
this.preppedRecords = preppedRecords;
|
||||
}
|
||||
|
||||
@Override
|
||||
public HoodieWriteMetadata<JavaRDD<WriteStatus>> execute() {
|
||||
return super.execute(preppedRecords);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,69 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.table.action.commit;
|
||||
|
||||
import org.apache.hudi.client.WriteStatus;
|
||||
import org.apache.hudi.common.model.HoodieKey;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.index.HoodieIndex;
|
||||
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
|
||||
import scala.Tuple2;
|
||||
|
||||
/**
|
||||
* A spark implementation of {@link AbstractWriteHelper}.
|
||||
*
|
||||
* @param <T>
|
||||
*/
|
||||
public class SparkWriteHelper<T extends HoodieRecordPayload,R> extends AbstractWriteHelper<T, JavaRDD<HoodieRecord<T>>,
|
||||
JavaRDD<HoodieKey>, JavaRDD<WriteStatus>, R> {
|
||||
private SparkWriteHelper() {
|
||||
}
|
||||
|
||||
private static class WriteHelperHolder {
|
||||
private static final SparkWriteHelper SPARK_WRITE_HELPER = new SparkWriteHelper();
|
||||
}
|
||||
|
||||
public static SparkWriteHelper newInstance() {
|
||||
return WriteHelperHolder.SPARK_WRITE_HELPER;
|
||||
}
|
||||
|
||||
@Override
|
||||
public JavaRDD<HoodieRecord<T>> deduplicateRecords(JavaRDD<HoodieRecord<T>> records,
|
||||
HoodieIndex<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> index,
|
||||
int parallelism) {
|
||||
boolean isIndexingGlobal = index.isGlobal();
|
||||
return records.mapToPair(record -> {
|
||||
HoodieKey hoodieKey = record.getKey();
|
||||
// If index used is global, then records are expected to differ in their partitionPath
|
||||
Object key = isIndexingGlobal ? hoodieKey.getRecordKey() : hoodieKey;
|
||||
return new Tuple2<>(key, record);
|
||||
}).reduceByKey((rec1, rec2) -> {
|
||||
@SuppressWarnings("unchecked")
|
||||
T reducedData = (T) rec1.getData().preCombine(rec2.getData());
|
||||
// we cannot allow the user to change the key or partitionPath, since that will affect
|
||||
// everything
|
||||
// so pick it from one of the records.
|
||||
return new HoodieRecord<T>(rec1.getKey(), reducedData);
|
||||
}, parallelism).map(Tuple2::_2);
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,328 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.table.action.commit;
|
||||
|
||||
import org.apache.hudi.client.common.HoodieEngineContext;
|
||||
import org.apache.hudi.client.common.HoodieSparkEngineContext;
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.model.HoodieBaseFile;
|
||||
import org.apache.hudi.common.model.HoodieCommitMetadata;
|
||||
import org.apache.hudi.common.model.HoodieKey;
|
||||
import org.apache.hudi.common.model.HoodieRecordLocation;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.common.table.timeline.HoodieInstant;
|
||||
import org.apache.hudi.common.table.timeline.HoodieTimeline;
|
||||
import org.apache.hudi.common.util.NumericUtils;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.common.util.collection.Pair;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
import org.apache.hudi.table.WorkloadProfile;
|
||||
import org.apache.hudi.table.WorkloadStat;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.spark.Partitioner;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.api.java.function.PairFunction;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Map.Entry;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import scala.Tuple2;
|
||||
|
||||
/**
|
||||
* Packs incoming records to be upserted, into buckets (1 bucket = 1 RDD partition).
|
||||
*/
|
||||
public class UpsertPartitioner<T extends HoodieRecordPayload<T>> extends Partitioner {
|
||||
|
||||
private static final Logger LOG = LogManager.getLogger(UpsertPartitioner.class);
|
||||
|
||||
/**
|
||||
* List of all small files to be corrected.
|
||||
*/
|
||||
protected List<SmallFile> smallFiles = new ArrayList<>();
|
||||
/**
|
||||
* Total number of RDD partitions, is determined by total buckets we want to pack the incoming workload into.
|
||||
*/
|
||||
private int totalBuckets = 0;
|
||||
/**
|
||||
* Stat for the current workload. Helps in determining inserts, upserts etc.
|
||||
*/
|
||||
private WorkloadProfile profile;
|
||||
/**
|
||||
* Helps decide which bucket an incoming update should go to.
|
||||
*/
|
||||
private HashMap<String, Integer> updateLocationToBucket;
|
||||
/**
|
||||
* Helps us pack inserts into 1 or more buckets depending on number of incoming records.
|
||||
*/
|
||||
private HashMap<String, List<InsertBucketCumulativeWeightPair>> partitionPathToInsertBucketInfos;
|
||||
/**
|
||||
* Remembers what type each bucket is for later.
|
||||
*/
|
||||
private HashMap<Integer, BucketInfo> bucketInfoMap;
|
||||
|
||||
protected final HoodieTable table;
|
||||
|
||||
protected final HoodieWriteConfig config;
|
||||
|
||||
public UpsertPartitioner(WorkloadProfile profile, HoodieEngineContext context, HoodieTable table,
|
||||
HoodieWriteConfig config) {
|
||||
updateLocationToBucket = new HashMap<>();
|
||||
partitionPathToInsertBucketInfos = new HashMap<>();
|
||||
bucketInfoMap = new HashMap<>();
|
||||
this.profile = profile;
|
||||
this.table = table;
|
||||
this.config = config;
|
||||
assignUpdates(profile);
|
||||
assignInserts(profile, context);
|
||||
|
||||
LOG.info("Total Buckets :" + totalBuckets + ", buckets info => " + bucketInfoMap + ", \n"
|
||||
+ "Partition to insert buckets => " + partitionPathToInsertBucketInfos + ", \n"
|
||||
+ "UpdateLocations mapped to buckets =>" + updateLocationToBucket);
|
||||
}
|
||||
|
||||
private void assignUpdates(WorkloadProfile profile) {
|
||||
// each update location gets a partition
|
||||
Set<Entry<String, WorkloadStat>> partitionStatEntries = profile.getPartitionPathStatMap().entrySet();
|
||||
for (Map.Entry<String, WorkloadStat> partitionStat : partitionStatEntries) {
|
||||
for (Map.Entry<String, Pair<String, Long>> updateLocEntry :
|
||||
partitionStat.getValue().getUpdateLocationToCount().entrySet()) {
|
||||
addUpdateBucket(partitionStat.getKey(), updateLocEntry.getKey());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private int addUpdateBucket(String partitionPath, String fileIdHint) {
|
||||
int bucket = totalBuckets;
|
||||
updateLocationToBucket.put(fileIdHint, bucket);
|
||||
BucketInfo bucketInfo = new BucketInfo();
|
||||
bucketInfo.bucketType = BucketType.UPDATE;
|
||||
bucketInfo.fileIdPrefix = fileIdHint;
|
||||
bucketInfo.partitionPath = partitionPath;
|
||||
bucketInfoMap.put(totalBuckets, bucketInfo);
|
||||
totalBuckets++;
|
||||
return bucket;
|
||||
}
|
||||
|
||||
private void assignInserts(WorkloadProfile profile, HoodieEngineContext context) {
|
||||
// for new inserts, compute buckets depending on how many records we have for each partition
|
||||
Set<String> partitionPaths = profile.getPartitionPaths();
|
||||
long averageRecordSize =
|
||||
averageBytesPerRecord(table.getMetaClient().getActiveTimeline().getCommitTimeline().filterCompletedInstants(),
|
||||
config);
|
||||
LOG.info("AvgRecordSize => " + averageRecordSize);
|
||||
|
||||
Map<String, List<SmallFile>> partitionSmallFilesMap =
|
||||
getSmallFilesForPartitions(new ArrayList<String>(partitionPaths), context);
|
||||
|
||||
for (String partitionPath : partitionPaths) {
|
||||
WorkloadStat pStat = profile.getWorkloadStat(partitionPath);
|
||||
if (pStat.getNumInserts() > 0) {
|
||||
|
||||
List<SmallFile> smallFiles = partitionSmallFilesMap.get(partitionPath);
|
||||
this.smallFiles.addAll(smallFiles);
|
||||
|
||||
LOG.info("For partitionPath : " + partitionPath + " Small Files => " + smallFiles);
|
||||
|
||||
long totalUnassignedInserts = pStat.getNumInserts();
|
||||
List<Integer> bucketNumbers = new ArrayList<>();
|
||||
List<Long> recordsPerBucket = new ArrayList<>();
|
||||
|
||||
// first try packing this into one of the smallFiles
|
||||
for (SmallFile smallFile : smallFiles) {
|
||||
long recordsToAppend = Math.min((config.getParquetMaxFileSize() - smallFile.sizeBytes) / averageRecordSize,
|
||||
totalUnassignedInserts);
|
||||
if (recordsToAppend > 0 && totalUnassignedInserts > 0) {
|
||||
// create a new bucket or re-use an existing bucket
|
||||
int bucket;
|
||||
if (updateLocationToBucket.containsKey(smallFile.location.getFileId())) {
|
||||
bucket = updateLocationToBucket.get(smallFile.location.getFileId());
|
||||
LOG.info("Assigning " + recordsToAppend + " inserts to existing update bucket " + bucket);
|
||||
} else {
|
||||
bucket = addUpdateBucket(partitionPath, smallFile.location.getFileId());
|
||||
LOG.info("Assigning " + recordsToAppend + " inserts to new update bucket " + bucket);
|
||||
}
|
||||
bucketNumbers.add(bucket);
|
||||
recordsPerBucket.add(recordsToAppend);
|
||||
totalUnassignedInserts -= recordsToAppend;
|
||||
}
|
||||
}
|
||||
|
||||
// if we have anything more, create new insert buckets, like normal
|
||||
if (totalUnassignedInserts > 0) {
|
||||
long insertRecordsPerBucket = config.getCopyOnWriteInsertSplitSize();
|
||||
if (config.shouldAutoTuneInsertSplits()) {
|
||||
insertRecordsPerBucket = config.getParquetMaxFileSize() / averageRecordSize;
|
||||
}
|
||||
|
||||
int insertBuckets = (int) Math.ceil((1.0 * totalUnassignedInserts) / insertRecordsPerBucket);
|
||||
LOG.info("After small file assignment: unassignedInserts => " + totalUnassignedInserts
|
||||
+ ", totalInsertBuckets => " + insertBuckets + ", recordsPerBucket => " + insertRecordsPerBucket);
|
||||
for (int b = 0; b < insertBuckets; b++) {
|
||||
bucketNumbers.add(totalBuckets);
|
||||
recordsPerBucket.add(totalUnassignedInserts / insertBuckets);
|
||||
BucketInfo bucketInfo = new BucketInfo();
|
||||
bucketInfo.bucketType = BucketType.INSERT;
|
||||
bucketInfo.partitionPath = partitionPath;
|
||||
bucketInfo.fileIdPrefix = FSUtils.createNewFileIdPfx();
|
||||
bucketInfoMap.put(totalBuckets, bucketInfo);
|
||||
totalBuckets++;
|
||||
}
|
||||
}
|
||||
|
||||
// Go over all such buckets, and assign weights as per amount of incoming inserts.
|
||||
List<InsertBucketCumulativeWeightPair> insertBuckets = new ArrayList<>();
|
||||
double curentCumulativeWeight = 0;
|
||||
for (int i = 0; i < bucketNumbers.size(); i++) {
|
||||
InsertBucket bkt = new InsertBucket();
|
||||
bkt.bucketNumber = bucketNumbers.get(i);
|
||||
bkt.weight = (1.0 * recordsPerBucket.get(i)) / pStat.getNumInserts();
|
||||
curentCumulativeWeight += bkt.weight;
|
||||
insertBuckets.add(new InsertBucketCumulativeWeightPair(bkt, curentCumulativeWeight));
|
||||
}
|
||||
LOG.info("Total insert buckets for partition path " + partitionPath + " => " + insertBuckets);
|
||||
partitionPathToInsertBucketInfos.put(partitionPath, insertBuckets);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private Map<String, List<SmallFile>> getSmallFilesForPartitions(List<String> partitionPaths, HoodieEngineContext context) {
|
||||
JavaSparkContext jsc = HoodieSparkEngineContext.getSparkContext(context);
|
||||
Map<String, List<SmallFile>> partitionSmallFilesMap = new HashMap<>();
|
||||
if (partitionPaths != null && partitionPaths.size() > 0) {
|
||||
context.setJobStatus(this.getClass().getSimpleName(), "Getting small files from partitions");
|
||||
JavaRDD<String> partitionPathRdds = jsc.parallelize(partitionPaths, partitionPaths.size());
|
||||
partitionSmallFilesMap = partitionPathRdds.mapToPair((PairFunction<String, String, List<SmallFile>>)
|
||||
partitionPath -> new Tuple2<>(partitionPath, getSmallFiles(partitionPath))).collectAsMap();
|
||||
}
|
||||
|
||||
return partitionSmallFilesMap;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a list of small files in the given partition path.
|
||||
*/
|
||||
protected List<SmallFile> getSmallFiles(String partitionPath) {
|
||||
|
||||
// smallFiles only for partitionPath
|
||||
List<SmallFile> smallFileLocations = new ArrayList<>();
|
||||
|
||||
HoodieTimeline commitTimeline = table.getMetaClient().getCommitsTimeline().filterCompletedInstants();
|
||||
|
||||
if (!commitTimeline.empty()) { // if we have some commits
|
||||
HoodieInstant latestCommitTime = commitTimeline.lastInstant().get();
|
||||
List<HoodieBaseFile> allFiles = table.getBaseFileOnlyView()
|
||||
.getLatestBaseFilesBeforeOrOn(partitionPath, latestCommitTime.getTimestamp()).collect(Collectors.toList());
|
||||
|
||||
for (HoodieBaseFile file : allFiles) {
|
||||
if (file.getFileSize() < config.getParquetSmallFileLimit()) {
|
||||
String filename = file.getFileName();
|
||||
SmallFile sf = new SmallFile();
|
||||
sf.location = new HoodieRecordLocation(FSUtils.getCommitTime(filename), FSUtils.getFileId(filename));
|
||||
sf.sizeBytes = file.getFileSize();
|
||||
smallFileLocations.add(sf);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return smallFileLocations;
|
||||
}
|
||||
|
||||
public BucketInfo getBucketInfo(int bucketNumber) {
|
||||
return bucketInfoMap.get(bucketNumber);
|
||||
}
|
||||
|
||||
public List<InsertBucketCumulativeWeightPair> getInsertBuckets(String partitionPath) {
|
||||
return partitionPathToInsertBucketInfos.get(partitionPath);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int numPartitions() {
|
||||
return totalBuckets;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getPartition(Object key) {
|
||||
Tuple2<HoodieKey, Option<HoodieRecordLocation>> keyLocation =
|
||||
(Tuple2<HoodieKey, Option<HoodieRecordLocation>>) key;
|
||||
if (keyLocation._2().isPresent()) {
|
||||
HoodieRecordLocation location = keyLocation._2().get();
|
||||
return updateLocationToBucket.get(location.getFileId());
|
||||
} else {
|
||||
String partitionPath = keyLocation._1().getPartitionPath();
|
||||
List<InsertBucketCumulativeWeightPair> targetBuckets = partitionPathToInsertBucketInfos.get(partitionPath);
|
||||
// pick the target bucket to use based on the weights.
|
||||
final long totalInserts = Math.max(1, profile.getWorkloadStat(partitionPath).getNumInserts());
|
||||
final long hashOfKey = NumericUtils.getMessageDigestHash("MD5", keyLocation._1().getRecordKey());
|
||||
final double r = 1.0 * Math.floorMod(hashOfKey, totalInserts) / totalInserts;
|
||||
|
||||
int index = Collections.binarySearch(targetBuckets, new InsertBucketCumulativeWeightPair(new InsertBucket(), r));
|
||||
|
||||
if (index >= 0) {
|
||||
return targetBuckets.get(index).getKey().bucketNumber;
|
||||
}
|
||||
|
||||
if ((-1 * index - 1) < targetBuckets.size()) {
|
||||
return targetBuckets.get((-1 * index - 1)).getKey().bucketNumber;
|
||||
}
|
||||
|
||||
// return first one, by default
|
||||
return targetBuckets.get(0).getKey().bucketNumber;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Obtains the average record size based on records written during previous commits. Used for estimating how many
|
||||
* records pack into one file.
|
||||
*/
|
||||
protected static long averageBytesPerRecord(HoodieTimeline commitTimeline, HoodieWriteConfig hoodieWriteConfig) {
|
||||
long avgSize = hoodieWriteConfig.getCopyOnWriteRecordSizeEstimate();
|
||||
long fileSizeThreshold = (long) (hoodieWriteConfig.getRecordSizeEstimationThreshold() * hoodieWriteConfig.getParquetSmallFileLimit());
|
||||
try {
|
||||
if (!commitTimeline.empty()) {
|
||||
// Go over the reverse ordered commits to get a more recent estimate of average record size.
|
||||
Iterator<HoodieInstant> instants = commitTimeline.getReverseOrderedInstants().iterator();
|
||||
while (instants.hasNext()) {
|
||||
HoodieInstant instant = instants.next();
|
||||
HoodieCommitMetadata commitMetadata = HoodieCommitMetadata
|
||||
.fromBytes(commitTimeline.getInstantDetails(instant).get(), HoodieCommitMetadata.class);
|
||||
long totalBytesWritten = commitMetadata.fetchTotalBytesWritten();
|
||||
long totalRecordsWritten = commitMetadata.fetchTotalRecordsWritten();
|
||||
if (totalBytesWritten > fileSizeThreshold && totalRecordsWritten > 0) {
|
||||
avgSize = (long) Math.ceil((1.0 * totalBytesWritten) / totalRecordsWritten);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (Throwable t) {
|
||||
// make this fail safe.
|
||||
LOG.error("Error trying to compute average bytes/record ", t);
|
||||
}
|
||||
return avgSize;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,243 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.table.action.compact;
|
||||
|
||||
import org.apache.hudi.avro.HoodieAvroUtils;
|
||||
import org.apache.hudi.avro.model.HoodieCompactionOperation;
|
||||
import org.apache.hudi.avro.model.HoodieCompactionPlan;
|
||||
import org.apache.hudi.client.SparkTaskContextSupplier;
|
||||
import org.apache.hudi.client.WriteStatus;
|
||||
import org.apache.hudi.client.common.HoodieEngineContext;
|
||||
import org.apache.hudi.client.common.HoodieSparkEngineContext;
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.model.CompactionOperation;
|
||||
import org.apache.hudi.common.model.HoodieBaseFile;
|
||||
import org.apache.hudi.common.model.HoodieFileGroupId;
|
||||
import org.apache.hudi.common.model.HoodieKey;
|
||||
import org.apache.hudi.common.model.HoodieLogFile;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.common.model.HoodieTableType;
|
||||
import org.apache.hudi.common.model.HoodieWriteStat.RuntimeStats;
|
||||
import org.apache.hudi.common.table.HoodieTableMetaClient;
|
||||
import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner;
|
||||
import org.apache.hudi.common.table.timeline.HoodieTimeline;
|
||||
import org.apache.hudi.common.table.view.TableFileSystemView.SliceView;
|
||||
import org.apache.hudi.common.util.CollectionUtils;
|
||||
import org.apache.hudi.common.util.CompactionUtils;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.common.util.ValidationUtils;
|
||||
import org.apache.hudi.common.util.collection.Pair;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.io.IOUtils;
|
||||
import org.apache.hudi.table.HoodieSparkCopyOnWriteTable;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
import org.apache.hudi.table.action.compact.strategy.CompactionStrategy;
|
||||
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.util.AccumulatorV2;
|
||||
import org.apache.spark.util.LongAccumulator;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.StreamSupport;
|
||||
|
||||
import static java.util.stream.Collectors.toList;
|
||||
|
||||
/**
|
||||
* Compacts a hoodie table with merge on read storage. Computes all possible compactions,
|
||||
* passes it through a CompactionFilter and executes all the compactions and writes a new version of base files and make
|
||||
* a normal commit
|
||||
*
|
||||
*/
|
||||
@SuppressWarnings("checkstyle:LineLength")
|
||||
public class HoodieSparkMergeOnReadTableCompactor<T extends HoodieRecordPayload> implements HoodieCompactor<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> {
|
||||
|
||||
private static final Logger LOG = LogManager.getLogger(HoodieSparkMergeOnReadTableCompactor.class);
|
||||
// Accumulator to keep track of total log files for a table
|
||||
private AccumulatorV2<Long, Long> totalLogFiles;
|
||||
// Accumulator to keep track of total log file slices for a table
|
||||
private AccumulatorV2<Long, Long> totalFileSlices;
|
||||
|
||||
@Override
|
||||
public JavaRDD<WriteStatus> compact(HoodieEngineContext context, HoodieCompactionPlan compactionPlan,
|
||||
HoodieTable hoodieTable, HoodieWriteConfig config, String compactionInstantTime) throws IOException {
|
||||
JavaSparkContext jsc = HoodieSparkEngineContext.getSparkContext(context);
|
||||
if (compactionPlan == null || (compactionPlan.getOperations() == null)
|
||||
|| (compactionPlan.getOperations().isEmpty())) {
|
||||
return jsc.emptyRDD();
|
||||
}
|
||||
HoodieTableMetaClient metaClient = hoodieTable.getMetaClient();
|
||||
// Compacting is very similar to applying updates to existing file
|
||||
HoodieSparkCopyOnWriteTable table = new HoodieSparkCopyOnWriteTable(config, context, metaClient);
|
||||
List<CompactionOperation> operations = compactionPlan.getOperations().stream()
|
||||
.map(CompactionOperation::convertFromAvroRecordInstance).collect(toList());
|
||||
LOG.info("Compactor compacting " + operations + " files");
|
||||
|
||||
context.setJobStatus(this.getClass().getSimpleName(), "Compacting file slices");
|
||||
return jsc.parallelize(operations, operations.size())
|
||||
.map(s -> compact(table, metaClient, config, s, compactionInstantTime)).flatMap(List::iterator);
|
||||
}
|
||||
|
||||
private List<WriteStatus> compact(HoodieSparkCopyOnWriteTable hoodieCopyOnWriteTable, HoodieTableMetaClient metaClient,
|
||||
HoodieWriteConfig config, CompactionOperation operation, String instantTime) throws IOException {
|
||||
FileSystem fs = metaClient.getFs();
|
||||
|
||||
Schema readerSchema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getSchema()));
|
||||
LOG.info("Compacting base " + operation.getDataFileName() + " with delta files " + operation.getDeltaFileNames()
|
||||
+ " for commit " + instantTime);
|
||||
// TODO - FIX THIS
|
||||
// Reads the entire avro file. Always only specific blocks should be read from the avro file
|
||||
// (failure recover).
|
||||
// Load all the delta commits since the last compaction commit and get all the blocks to be
|
||||
// loaded and load it using CompositeAvroLogReader
|
||||
// Since a DeltaCommit is not defined yet, reading all the records. revisit this soon.
|
||||
String maxInstantTime = metaClient
|
||||
.getActiveTimeline().getTimelineOfActions(CollectionUtils.createSet(HoodieTimeline.COMMIT_ACTION,
|
||||
HoodieTimeline.ROLLBACK_ACTION, HoodieTimeline.DELTA_COMMIT_ACTION))
|
||||
.filterCompletedInstants().lastInstant().get().getTimestamp();
|
||||
long maxMemoryPerCompaction = IOUtils.getMaxMemoryPerCompaction(new SparkTaskContextSupplier(), config.getProps());
|
||||
LOG.info("MaxMemoryPerCompaction => " + maxMemoryPerCompaction);
|
||||
|
||||
List<String> logFiles = operation.getDeltaFileNames().stream().map(
|
||||
p -> new Path(FSUtils.getPartitionPath(metaClient.getBasePath(), operation.getPartitionPath()), p).toString())
|
||||
.collect(toList());
|
||||
HoodieMergedLogRecordScanner scanner = new HoodieMergedLogRecordScanner(fs, metaClient.getBasePath(), logFiles,
|
||||
readerSchema, maxInstantTime, maxMemoryPerCompaction, config.getCompactionLazyBlockReadEnabled(),
|
||||
config.getCompactionReverseLogReadEnabled(), config.getMaxDFSStreamBufferSize(),
|
||||
config.getSpillableMapBasePath());
|
||||
if (!scanner.iterator().hasNext()) {
|
||||
return new ArrayList<>();
|
||||
}
|
||||
|
||||
Option<HoodieBaseFile> oldDataFileOpt =
|
||||
operation.getBaseFile(metaClient.getBasePath(), operation.getPartitionPath());
|
||||
|
||||
// Compacting is very similar to applying updates to existing file
|
||||
Iterator<List<WriteStatus>> result;
|
||||
// If the dataFile is present, perform updates else perform inserts into a new base file.
|
||||
if (oldDataFileOpt.isPresent()) {
|
||||
result = hoodieCopyOnWriteTable.handleUpdate(instantTime, operation.getPartitionPath(),
|
||||
operation.getFileId(), scanner.getRecords(),
|
||||
oldDataFileOpt.get());
|
||||
} else {
|
||||
result = hoodieCopyOnWriteTable.handleInsert(instantTime, operation.getPartitionPath(), operation.getFileId(),
|
||||
scanner.getRecords());
|
||||
}
|
||||
Iterable<List<WriteStatus>> resultIterable = () -> result;
|
||||
return StreamSupport.stream(resultIterable.spliterator(), false).flatMap(Collection::stream).peek(s -> {
|
||||
s.getStat().setTotalUpdatedRecordsCompacted(scanner.getNumMergedRecordsInLog());
|
||||
s.getStat().setTotalLogFilesCompacted(scanner.getTotalLogFiles());
|
||||
s.getStat().setTotalLogRecords(scanner.getTotalLogRecords());
|
||||
s.getStat().setPartitionPath(operation.getPartitionPath());
|
||||
s.getStat()
|
||||
.setTotalLogSizeCompacted(operation.getMetrics().get(CompactionStrategy.TOTAL_LOG_FILE_SIZE).longValue());
|
||||
s.getStat().setTotalLogBlocks(scanner.getTotalLogBlocks());
|
||||
s.getStat().setTotalCorruptLogBlock(scanner.getTotalCorruptBlocks());
|
||||
s.getStat().setTotalRollbackBlocks(scanner.getTotalRollbacks());
|
||||
RuntimeStats runtimeStats = new RuntimeStats();
|
||||
runtimeStats.setTotalScanTime(scanner.getTotalTimeTakenToReadAndMergeBlocks());
|
||||
s.getStat().setRuntimeStats(runtimeStats);
|
||||
}).collect(toList());
|
||||
}
|
||||
|
||||
@Override
|
||||
public HoodieCompactionPlan generateCompactionPlan(HoodieEngineContext context,
|
||||
HoodieTable<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> hoodieTable,
|
||||
HoodieWriteConfig config, String compactionCommitTime, Set<HoodieFileGroupId> fgIdsInPendingCompactions)
|
||||
throws IOException {
|
||||
JavaSparkContext jsc = HoodieSparkEngineContext.getSparkContext(context);
|
||||
totalLogFiles = new LongAccumulator();
|
||||
totalFileSlices = new LongAccumulator();
|
||||
jsc.sc().register(totalLogFiles);
|
||||
jsc.sc().register(totalFileSlices);
|
||||
|
||||
ValidationUtils.checkArgument(hoodieTable.getMetaClient().getTableType() == HoodieTableType.MERGE_ON_READ,
|
||||
"Can only compact table of type " + HoodieTableType.MERGE_ON_READ + " and not "
|
||||
+ hoodieTable.getMetaClient().getTableType().name());
|
||||
|
||||
// TODO : check if maxMemory is not greater than JVM or spark.executor memory
|
||||
// TODO - rollback any compactions in flight
|
||||
HoodieTableMetaClient metaClient = hoodieTable.getMetaClient();
|
||||
LOG.info("Compacting " + metaClient.getBasePath() + " with commit " + compactionCommitTime);
|
||||
List<String> partitionPaths = FSUtils.getAllPartitionPaths(metaClient.getFs(), metaClient.getBasePath(),
|
||||
config.shouldAssumeDatePartitioning());
|
||||
|
||||
// filter the partition paths if needed to reduce list status
|
||||
partitionPaths = config.getCompactionStrategy().filterPartitionPaths(config, partitionPaths);
|
||||
|
||||
if (partitionPaths.isEmpty()) {
|
||||
// In case no partitions could be picked, return no compaction plan
|
||||
return null;
|
||||
}
|
||||
|
||||
SliceView fileSystemView = hoodieTable.getSliceView();
|
||||
LOG.info("Compaction looking for files to compact in " + partitionPaths + " partitions");
|
||||
context.setJobStatus(this.getClass().getSimpleName(), "Looking for files to compact");
|
||||
|
||||
List<HoodieCompactionOperation> operations = context.flatMap(partitionPaths, partitionPath -> {
|
||||
return fileSystemView
|
||||
.getLatestFileSlices(partitionPath)
|
||||
.filter(slice -> !fgIdsInPendingCompactions.contains(slice.getFileGroupId()))
|
||||
.map(s -> {
|
||||
List<HoodieLogFile> logFiles =
|
||||
s.getLogFiles().sorted(HoodieLogFile.getLogFileComparator()).collect(Collectors.toList());
|
||||
totalLogFiles.add((long) logFiles.size());
|
||||
totalFileSlices.add(1L);
|
||||
// Avro generated classes are not inheriting Serializable. Using CompactionOperation POJO
|
||||
// for spark Map operations and collecting them finally in Avro generated classes for storing
|
||||
// into meta files.
|
||||
Option<HoodieBaseFile> dataFile = s.getBaseFile();
|
||||
return new CompactionOperation(dataFile, partitionPath, logFiles,
|
||||
config.getCompactionStrategy().captureMetrics(config, dataFile, partitionPath, logFiles));
|
||||
})
|
||||
.filter(c -> !c.getDeltaFileNames().isEmpty());
|
||||
}, partitionPaths.size()).stream().map(CompactionUtils::buildHoodieCompactionOperation).collect(toList());
|
||||
|
||||
LOG.info("Total of " + operations.size() + " compactions are retrieved");
|
||||
LOG.info("Total number of latest files slices " + totalFileSlices.value());
|
||||
LOG.info("Total number of log files " + totalLogFiles.value());
|
||||
LOG.info("Total number of file slices " + totalFileSlices.value());
|
||||
// Filter the compactions with the passed in filter. This lets us choose most effective
|
||||
// compactions only
|
||||
HoodieCompactionPlan compactionPlan = config.getCompactionStrategy().generateCompactionPlan(config, operations,
|
||||
CompactionUtils.getAllPendingCompactionPlans(metaClient).stream().map(Pair::getValue).collect(toList()));
|
||||
ValidationUtils.checkArgument(
|
||||
compactionPlan.getOperations().stream().noneMatch(
|
||||
op -> fgIdsInPendingCompactions.contains(new HoodieFileGroupId(op.getPartitionPath(), op.getFileId()))),
|
||||
"Bad Compaction Plan. FileId MUST NOT have multiple pending compactions. "
|
||||
+ "Please fix your strategy implementation. FileIdsWithPendingCompactions :" + fgIdsInPendingCompactions
|
||||
+ ", Selected workload :" + compactionPlan);
|
||||
if (compactionPlan.getOperations().isEmpty()) {
|
||||
LOG.warn("After filtering, Nothing to compact for " + metaClient.getBasePath());
|
||||
}
|
||||
return compactionPlan;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,75 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.table.action.compact;
|
||||
|
||||
import org.apache.hudi.avro.model.HoodieCompactionPlan;
|
||||
import org.apache.hudi.client.WriteStatus;
|
||||
import org.apache.hudi.common.model.HoodieCommitMetadata;
|
||||
import org.apache.hudi.common.model.HoodieKey;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.common.model.HoodieWriteStat;
|
||||
import org.apache.hudi.common.table.timeline.HoodieTimeline;
|
||||
import org.apache.hudi.common.table.timeline.TimelineMetadataUtils;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* A spark implementation of {@link AbstractCompactHelpers}.
|
||||
*
|
||||
* @param <T>
|
||||
*/
|
||||
public class SparkCompactHelpers<T extends HoodieRecordPayload> extends
|
||||
AbstractCompactHelpers<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> {
|
||||
|
||||
private SparkCompactHelpers() {
|
||||
}
|
||||
|
||||
private static class CompactHelperHolder {
|
||||
private static final SparkCompactHelpers SPARK_COMPACT_HELPERS = new SparkCompactHelpers();
|
||||
}
|
||||
|
||||
public static SparkCompactHelpers newInstance() {
|
||||
return CompactHelperHolder.SPARK_COMPACT_HELPERS;
|
||||
}
|
||||
|
||||
@Override
|
||||
public HoodieCommitMetadata createCompactionMetadata(HoodieTable<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> table,
|
||||
String compactionInstantTime,
|
||||
JavaRDD<WriteStatus> writeStatuses,
|
||||
String schema) throws IOException {
|
||||
byte[] planBytes = table.getActiveTimeline().readCompactionPlanAsBytes(
|
||||
HoodieTimeline.getCompactionRequestedInstant(compactionInstantTime)).get();
|
||||
HoodieCompactionPlan compactionPlan = TimelineMetadataUtils.deserializeCompactionPlan(planBytes);
|
||||
List<HoodieWriteStat> updateStatusMap = writeStatuses.map(WriteStatus::getStat).collect();
|
||||
org.apache.hudi.common.model.HoodieCommitMetadata metadata = new org.apache.hudi.common.model.HoodieCommitMetadata(true);
|
||||
for (HoodieWriteStat stat : updateStatusMap) {
|
||||
metadata.addWriteStat(stat.getPartitionPath(), stat);
|
||||
}
|
||||
metadata.addMetadata(org.apache.hudi.common.model.HoodieCommitMetadata.SCHEMA_KEY, schema);
|
||||
if (compactionPlan.getExtraMetadata() != null) {
|
||||
compactionPlan.getExtraMetadata().forEach(metadata::addMetadata);
|
||||
}
|
||||
return metadata;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,95 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.table.action.compact;
|
||||
|
||||
import org.apache.hudi.avro.model.HoodieCompactionPlan;
|
||||
import org.apache.hudi.client.WriteStatus;
|
||||
import org.apache.hudi.client.common.HoodieSparkEngineContext;
|
||||
import org.apache.hudi.client.utils.SparkMemoryUtils;
|
||||
import org.apache.hudi.common.model.HoodieCommitMetadata;
|
||||
import org.apache.hudi.common.model.HoodieKey;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.common.model.HoodieWriteStat;
|
||||
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
|
||||
import org.apache.hudi.common.table.timeline.HoodieInstant;
|
||||
import org.apache.hudi.common.table.timeline.HoodieTimeline;
|
||||
import org.apache.hudi.common.util.CompactionUtils;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.exception.HoodieCompactionException;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
import org.apache.hudi.table.action.BaseActionExecutor;
|
||||
import org.apache.hudi.table.action.HoodieWriteMetadata;
|
||||
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
|
||||
@SuppressWarnings("checkstyle:LineLength")
|
||||
public class SparkRunCompactionActionExecutor<T extends HoodieRecordPayload> extends
|
||||
BaseActionExecutor<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>, HoodieWriteMetadata<JavaRDD<WriteStatus>>> {
|
||||
|
||||
public SparkRunCompactionActionExecutor(HoodieSparkEngineContext context,
|
||||
HoodieWriteConfig config,
|
||||
HoodieTable<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> table,
|
||||
String instantTime) {
|
||||
super(context, config, table, instantTime);
|
||||
}
|
||||
|
||||
@Override
|
||||
public HoodieWriteMetadata<JavaRDD<WriteStatus>> execute() {
|
||||
HoodieInstant instant = HoodieTimeline.getCompactionRequestedInstant(instantTime);
|
||||
HoodieTimeline pendingCompactionTimeline = table.getActiveTimeline().filterPendingCompactionTimeline();
|
||||
if (!pendingCompactionTimeline.containsInstant(instant)) {
|
||||
throw new IllegalStateException(
|
||||
"No Compaction request available at " + instantTime + " to run compaction");
|
||||
}
|
||||
|
||||
HoodieWriteMetadata<JavaRDD<WriteStatus>> compactionMetadata = new HoodieWriteMetadata<>();
|
||||
try {
|
||||
HoodieActiveTimeline timeline = table.getActiveTimeline();
|
||||
HoodieCompactionPlan compactionPlan =
|
||||
CompactionUtils.getCompactionPlan(table.getMetaClient(), instantTime);
|
||||
// Mark instant as compaction inflight
|
||||
timeline.transitionCompactionRequestedToInflight(instant);
|
||||
table.getMetaClient().reloadActiveTimeline();
|
||||
|
||||
HoodieSparkMergeOnReadTableCompactor compactor = new HoodieSparkMergeOnReadTableCompactor();
|
||||
JavaRDD<WriteStatus> statuses = compactor.compact(context, compactionPlan, table, config, instantTime);
|
||||
|
||||
statuses.persist(SparkMemoryUtils.getWriteStatusStorageLevel(config.getProps()));
|
||||
List<HoodieWriteStat> updateStatusMap = statuses.map(WriteStatus::getStat).collect();
|
||||
HoodieCommitMetadata metadata = new HoodieCommitMetadata(true);
|
||||
for (HoodieWriteStat stat : updateStatusMap) {
|
||||
metadata.addWriteStat(stat.getPartitionPath(), stat);
|
||||
}
|
||||
metadata.addMetadata(HoodieCommitMetadata.SCHEMA_KEY, config.getSchema());
|
||||
|
||||
compactionMetadata.setWriteStatuses(statuses);
|
||||
compactionMetadata.setCommitted(false);
|
||||
compactionMetadata.setCommitMetadata(Option.of(metadata));
|
||||
} catch (IOException e) {
|
||||
throw new HoodieCompactionException("Could not compact " + config.getBasePath(), e);
|
||||
}
|
||||
|
||||
return compactionMetadata;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,88 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.table.action.compact;
|
||||
|
||||
import org.apache.hudi.avro.model.HoodieCompactionPlan;
|
||||
import org.apache.hudi.client.WriteStatus;
|
||||
import org.apache.hudi.client.common.HoodieEngineContext;
|
||||
import org.apache.hudi.common.model.HoodieKey;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.common.table.timeline.HoodieInstant;
|
||||
import org.apache.hudi.common.table.view.SyncableFileSystemView;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.exception.HoodieCompactionException;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@SuppressWarnings("checkstyle:LineLength")
|
||||
public class SparkScheduleCompactionActionExecutor<T extends HoodieRecordPayload> extends
|
||||
BaseScheduleCompactionActionExecutor<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> {
|
||||
|
||||
private static final Logger LOG = LogManager.getLogger(SparkScheduleCompactionActionExecutor.class);
|
||||
|
||||
public SparkScheduleCompactionActionExecutor(HoodieEngineContext context,
|
||||
HoodieWriteConfig config,
|
||||
HoodieTable<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> table,
|
||||
String instantTime,
|
||||
Option<Map<String, String>> extraMetadata) {
|
||||
super(context, config, table, instantTime, extraMetadata);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected HoodieCompactionPlan scheduleCompaction() {
|
||||
LOG.info("Checking if compaction needs to be run on " + config.getBasePath());
|
||||
Option<HoodieInstant> lastCompaction = table.getActiveTimeline().getCommitTimeline()
|
||||
.filterCompletedInstants().lastInstant();
|
||||
String lastCompactionTs = "0";
|
||||
if (lastCompaction.isPresent()) {
|
||||
lastCompactionTs = lastCompaction.get().getTimestamp();
|
||||
}
|
||||
|
||||
int deltaCommitsSinceLastCompaction = table.getActiveTimeline().getDeltaCommitTimeline()
|
||||
.findInstantsAfter(lastCompactionTs, Integer.MAX_VALUE).countInstants();
|
||||
if (config.getInlineCompactDeltaCommitMax() > deltaCommitsSinceLastCompaction) {
|
||||
LOG.info("Not scheduling compaction as only " + deltaCommitsSinceLastCompaction
|
||||
+ " delta commits was found since last compaction " + lastCompactionTs + ". Waiting for "
|
||||
+ config.getInlineCompactDeltaCommitMax());
|
||||
return new HoodieCompactionPlan();
|
||||
}
|
||||
|
||||
LOG.info("Generating compaction plan for merge on read table " + config.getBasePath());
|
||||
HoodieSparkMergeOnReadTableCompactor compactor = new HoodieSparkMergeOnReadTableCompactor();
|
||||
try {
|
||||
return compactor.generateCompactionPlan(context, table, config, instantTime,
|
||||
((SyncableFileSystemView) table.getSliceView()).getPendingCompactionOperations()
|
||||
.map(instantTimeOpPair -> instantTimeOpPair.getValue().getFileGroupId())
|
||||
.collect(Collectors.toSet()));
|
||||
|
||||
} catch (IOException e) {
|
||||
throw new HoodieCompactionException("Could not schedule compaction " + config.getBasePath(), e);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,102 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.table.action.deltacommit;
|
||||
|
||||
import java.util.Map;
|
||||
import org.apache.hudi.client.WriteStatus;
|
||||
import org.apache.hudi.client.common.HoodieSparkEngineContext;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.common.model.WriteOperationType;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.exception.HoodieUpsertException;
|
||||
import org.apache.hudi.execution.SparkLazyInsertIterable;
|
||||
import org.apache.hudi.io.AppendHandleFactory;
|
||||
import org.apache.hudi.io.HoodieAppendHandle;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
import org.apache.hudi.table.WorkloadProfile;
|
||||
import org.apache.hudi.table.action.commit.BaseSparkCommitActionExecutor;
|
||||
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.spark.Partitioner;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Collections;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
public abstract class AbstractSparkDeltaCommitActionExecutor<T extends HoodieRecordPayload<T>>
|
||||
extends BaseSparkCommitActionExecutor<T> {
|
||||
private static final Logger LOG = LogManager.getLogger(AbstractSparkDeltaCommitActionExecutor.class);
|
||||
|
||||
// UpsertPartitioner for MergeOnRead table type
|
||||
private SparkUpsertDeltaCommitPartitioner mergeOnReadUpsertPartitioner;
|
||||
|
||||
public AbstractSparkDeltaCommitActionExecutor(HoodieSparkEngineContext context, HoodieWriteConfig config, HoodieTable table,
|
||||
String instantTime, WriteOperationType operationType) {
|
||||
this(context, config, table, instantTime, operationType, Option.empty());
|
||||
}
|
||||
|
||||
public AbstractSparkDeltaCommitActionExecutor(HoodieSparkEngineContext context, HoodieWriteConfig config, HoodieTable table,
|
||||
String instantTime, WriteOperationType operationType,
|
||||
Option<Map<String, String>> extraMetadata) {
|
||||
super(context, config, table, instantTime, operationType, extraMetadata);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Partitioner getUpsertPartitioner(WorkloadProfile profile) {
|
||||
if (profile == null) {
|
||||
throw new HoodieUpsertException("Need workload profile to construct the upsert partitioner.");
|
||||
}
|
||||
mergeOnReadUpsertPartitioner = new SparkUpsertDeltaCommitPartitioner(profile, (HoodieSparkEngineContext) context, table, config);
|
||||
return mergeOnReadUpsertPartitioner;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Iterator<List<WriteStatus>> handleUpdate(String partitionPath, String fileId,
|
||||
Iterator<HoodieRecord<T>> recordItr) throws IOException {
|
||||
LOG.info("Merging updates for commit " + instantTime + " for file " + fileId);
|
||||
|
||||
if (!table.getIndex().canIndexLogFiles() && mergeOnReadUpsertPartitioner.getSmallFileIds().contains(fileId)) {
|
||||
LOG.info("Small file corrections for updates for commit " + instantTime + " for file " + fileId);
|
||||
return super.handleUpdate(partitionPath, fileId, recordItr);
|
||||
} else {
|
||||
HoodieAppendHandle appendHandle = new HoodieAppendHandle<>(config, instantTime, table,
|
||||
partitionPath, fileId, recordItr, taskContextSupplier);
|
||||
appendHandle.doAppend();
|
||||
appendHandle.close();
|
||||
return Collections.singletonList(Collections.singletonList(appendHandle.getWriteStatus())).iterator();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Iterator<List<WriteStatus>> handleInsert(String idPfx, Iterator<HoodieRecord<T>> recordItr)
|
||||
throws Exception {
|
||||
// If canIndexLogFiles, write inserts to log files else write inserts to base files
|
||||
if (table.getIndex().canIndexLogFiles()) {
|
||||
return new SparkLazyInsertIterable<>(recordItr, true, config, instantTime, table,
|
||||
idPfx, taskContextSupplier, new AppendHandleFactory<>());
|
||||
} else {
|
||||
return super.handleInsert(idPfx, recordItr);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,70 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.table.action.deltacommit;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.hudi.client.WriteStatus;
|
||||
import org.apache.hudi.client.common.HoodieSparkEngineContext;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.common.model.WriteOperationType;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.exception.HoodieInsertException;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
import org.apache.hudi.table.BulkInsertPartitioner;
|
||||
|
||||
import org.apache.hudi.table.action.HoodieWriteMetadata;
|
||||
import org.apache.hudi.table.action.commit.SparkBulkInsertHelper;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
|
||||
public class SparkBulkInsertDeltaCommitActionExecutor<T extends HoodieRecordPayload<T>>
|
||||
extends AbstractSparkDeltaCommitActionExecutor<T> {
|
||||
|
||||
private final JavaRDD<HoodieRecord<T>> inputRecordsRDD;
|
||||
private final Option<BulkInsertPartitioner<T>> bulkInsertPartitioner;
|
||||
|
||||
public SparkBulkInsertDeltaCommitActionExecutor(HoodieSparkEngineContext context, HoodieWriteConfig config, HoodieTable table,
|
||||
String instantTime, JavaRDD<HoodieRecord<T>> inputRecordsRDD,
|
||||
Option<BulkInsertPartitioner<T>> bulkInsertPartitioner) {
|
||||
this(context, config, table, instantTime, inputRecordsRDD, bulkInsertPartitioner, Option.empty());
|
||||
}
|
||||
|
||||
public SparkBulkInsertDeltaCommitActionExecutor(HoodieSparkEngineContext context, HoodieWriteConfig config, HoodieTable table,
|
||||
String instantTime, JavaRDD<HoodieRecord<T>> inputRecordsRDD,
|
||||
Option<BulkInsertPartitioner<T>> bulkInsertPartitioner,
|
||||
Option<Map<String, String>> extraMetadata) {
|
||||
super(context, config, table, instantTime, WriteOperationType.BULK_INSERT, extraMetadata);
|
||||
this.inputRecordsRDD = inputRecordsRDD;
|
||||
this.bulkInsertPartitioner = bulkInsertPartitioner;
|
||||
}
|
||||
|
||||
@Override
|
||||
public HoodieWriteMetadata<JavaRDD<WriteStatus>> execute() {
|
||||
try {
|
||||
return SparkBulkInsertHelper.newInstance().bulkInsert(inputRecordsRDD, instantTime, table, config,
|
||||
this, true, bulkInsertPartitioner);
|
||||
} catch (HoodieInsertException ie) {
|
||||
throw ie;
|
||||
} catch (Throwable e) {
|
||||
throw new HoodieInsertException("Failed to bulk insert for commit time " + instantTime, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,64 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.table.action.deltacommit;
|
||||
|
||||
import org.apache.hudi.client.WriteStatus;
|
||||
import org.apache.hudi.client.common.HoodieSparkEngineContext;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.common.model.WriteOperationType;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.exception.HoodieInsertException;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
import org.apache.hudi.table.BulkInsertPartitioner;
|
||||
|
||||
import org.apache.hudi.table.action.HoodieWriteMetadata;
|
||||
import org.apache.hudi.table.action.commit.SparkBulkInsertHelper;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
|
||||
public class SparkBulkInsertPreppedDeltaCommitActionExecutor<T extends HoodieRecordPayload<T>>
|
||||
extends AbstractSparkDeltaCommitActionExecutor<T> {
|
||||
|
||||
private final JavaRDD<HoodieRecord<T>> preppedInputRecordRdd;
|
||||
private final Option<BulkInsertPartitioner<T>> bulkInsertPartitioner;
|
||||
|
||||
public SparkBulkInsertPreppedDeltaCommitActionExecutor(HoodieSparkEngineContext context,
|
||||
HoodieWriteConfig config, HoodieTable table,
|
||||
String instantTime, JavaRDD<HoodieRecord<T>> preppedInputRecordRdd,
|
||||
Option<BulkInsertPartitioner<T>> bulkInsertPartitioner) {
|
||||
super(context, config, table, instantTime, WriteOperationType.BULK_INSERT);
|
||||
this.preppedInputRecordRdd = preppedInputRecordRdd;
|
||||
this.bulkInsertPartitioner = bulkInsertPartitioner;
|
||||
}
|
||||
|
||||
@Override
|
||||
public HoodieWriteMetadata<JavaRDD<WriteStatus>> execute() {
|
||||
try {
|
||||
return SparkBulkInsertHelper.newInstance().bulkInsert(preppedInputRecordRdd, instantTime, table, config,
|
||||
this, false, bulkInsertPartitioner);
|
||||
} catch (Throwable e) {
|
||||
if (e instanceof HoodieInsertException) {
|
||||
throw e;
|
||||
}
|
||||
throw new HoodieInsertException("Failed to bulk insert for commit time " + instantTime, e);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,49 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.table.action.deltacommit;
|
||||
|
||||
import org.apache.hudi.client.WriteStatus;
|
||||
import org.apache.hudi.client.common.HoodieSparkEngineContext;
|
||||
import org.apache.hudi.common.model.HoodieKey;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.common.model.WriteOperationType;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
|
||||
import org.apache.hudi.table.action.HoodieWriteMetadata;
|
||||
import org.apache.hudi.table.action.commit.SparkDeleteHelper;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
|
||||
public class SparkDeleteDeltaCommitActionExecutor<T extends HoodieRecordPayload<T>>
|
||||
extends AbstractSparkDeltaCommitActionExecutor<T> {
|
||||
|
||||
private final JavaRDD<HoodieKey> keys;
|
||||
|
||||
public SparkDeleteDeltaCommitActionExecutor(HoodieSparkEngineContext context,
|
||||
HoodieWriteConfig config, HoodieTable table,
|
||||
String instantTime, JavaRDD<HoodieKey> keys) {
|
||||
super(context, config, table, instantTime, WriteOperationType.DELETE);
|
||||
this.keys = keys;
|
||||
}
|
||||
|
||||
@Override
|
||||
public HoodieWriteMetadata<JavaRDD<WriteStatus>> execute() {
|
||||
return SparkDeleteHelper.newInstance().execute(instantTime, keys, context, config, table, this);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,50 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.table.action.deltacommit;
|
||||
|
||||
import org.apache.hudi.client.WriteStatus;
|
||||
import org.apache.hudi.client.common.HoodieSparkEngineContext;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.common.model.WriteOperationType;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
|
||||
import org.apache.hudi.table.action.HoodieWriteMetadata;
|
||||
import org.apache.hudi.table.action.commit.SparkWriteHelper;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
|
||||
public class SparkInsertDeltaCommitActionExecutor<T extends HoodieRecordPayload<T>>
|
||||
extends AbstractSparkDeltaCommitActionExecutor<T> {
|
||||
|
||||
private final JavaRDD<HoodieRecord<T>> inputRecordsRDD;
|
||||
|
||||
public SparkInsertDeltaCommitActionExecutor(HoodieSparkEngineContext context,
|
||||
HoodieWriteConfig config, HoodieTable table,
|
||||
String instantTime, JavaRDD<HoodieRecord<T>> inputRecordsRDD) {
|
||||
super(context, config, table, instantTime, WriteOperationType.INSERT);
|
||||
this.inputRecordsRDD = inputRecordsRDD;
|
||||
}
|
||||
|
||||
@Override
|
||||
public HoodieWriteMetadata<JavaRDD<WriteStatus>> execute() {
|
||||
return SparkWriteHelper.newInstance().write(instantTime, inputRecordsRDD, context, table,
|
||||
config.shouldCombineBeforeInsert(), config.getInsertShuffleParallelism(),this, false);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,47 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.table.action.deltacommit;
|
||||
|
||||
import org.apache.hudi.client.WriteStatus;
|
||||
import org.apache.hudi.client.common.HoodieSparkEngineContext;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.common.model.WriteOperationType;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
import org.apache.hudi.table.action.HoodieWriteMetadata;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
|
||||
public class SparkInsertPreppedDeltaCommitActionExecutor<T extends HoodieRecordPayload<T>>
|
||||
extends AbstractSparkDeltaCommitActionExecutor<T> {
|
||||
|
||||
private final JavaRDD<HoodieRecord<T>> preppedRecords;
|
||||
|
||||
public SparkInsertPreppedDeltaCommitActionExecutor(HoodieSparkEngineContext context,
|
||||
HoodieWriteConfig config, HoodieTable table,
|
||||
String instantTime, JavaRDD<HoodieRecord<T>> preppedRecords) {
|
||||
super(context, config, table, instantTime, WriteOperationType.INSERT_PREPPED);
|
||||
this.preppedRecords = preppedRecords;
|
||||
}
|
||||
|
||||
@Override
|
||||
public HoodieWriteMetadata<JavaRDD<WriteStatus>> execute() {
|
||||
return super.execute(preppedRecords);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,49 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.table.action.deltacommit;
|
||||
|
||||
import org.apache.hudi.client.common.HoodieSparkEngineContext;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.common.model.WriteOperationType;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
|
||||
import org.apache.hudi.table.action.HoodieWriteMetadata;
|
||||
import org.apache.hudi.table.action.commit.SparkWriteHelper;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
|
||||
public class SparkUpsertDeltaCommitActionExecutor<T extends HoodieRecordPayload<T>>
|
||||
extends AbstractSparkDeltaCommitActionExecutor<T> {
|
||||
|
||||
private JavaRDD<HoodieRecord<T>> inputRecordsRDD;
|
||||
|
||||
public SparkUpsertDeltaCommitActionExecutor(HoodieSparkEngineContext context,
|
||||
HoodieWriteConfig config, HoodieTable table,
|
||||
String instantTime, JavaRDD<HoodieRecord<T>> inputRecordsRDD) {
|
||||
super(context, config, table, instantTime, WriteOperationType.UPSERT);
|
||||
this.inputRecordsRDD = inputRecordsRDD;
|
||||
}
|
||||
|
||||
@Override
|
||||
public HoodieWriteMetadata execute() {
|
||||
return SparkWriteHelper.newInstance().write(instantTime, inputRecordsRDD, context, table,
|
||||
config.shouldCombineBeforeUpsert(), config.getUpsertShuffleParallelism(),this, true);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,142 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.table.action.deltacommit;
|
||||
|
||||
import org.apache.hudi.client.common.HoodieSparkEngineContext;
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.model.FileSlice;
|
||||
import org.apache.hudi.common.model.HoodieLogFile;
|
||||
import org.apache.hudi.common.model.HoodieRecordLocation;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.common.table.timeline.HoodieInstant;
|
||||
import org.apache.hudi.common.table.timeline.HoodieTimeline;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
import org.apache.hudi.table.WorkloadProfile;
|
||||
|
||||
import org.apache.hudi.table.action.commit.SmallFile;
|
||||
import org.apache.hudi.table.action.commit.UpsertPartitioner;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
/**
|
||||
* UpsertPartitioner for MergeOnRead table type, this allows auto correction of small parquet files to larger ones
|
||||
* without the need for an index in the logFile.
|
||||
*/
|
||||
public class SparkUpsertDeltaCommitPartitioner<T extends HoodieRecordPayload<T>> extends UpsertPartitioner<T> {
|
||||
|
||||
SparkUpsertDeltaCommitPartitioner(WorkloadProfile profile, HoodieSparkEngineContext context, HoodieTable table,
|
||||
HoodieWriteConfig config) {
|
||||
super(profile, context, table, config);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<SmallFile> getSmallFiles(String partitionPath) {
|
||||
|
||||
// smallFiles only for partitionPath
|
||||
List<SmallFile> smallFileLocations = new ArrayList<>();
|
||||
|
||||
// Init here since this class (and member variables) might not have been initialized
|
||||
HoodieTimeline commitTimeline = table.getCompletedCommitsTimeline();
|
||||
|
||||
// Find out all eligible small file slices
|
||||
if (!commitTimeline.empty()) {
|
||||
HoodieInstant latestCommitTime = commitTimeline.lastInstant().get();
|
||||
// find smallest file in partition and append to it
|
||||
List<FileSlice> allSmallFileSlices = new ArrayList<>();
|
||||
// If we cannot index log files, then we choose the smallest parquet file in the partition and add inserts to
|
||||
// it. Doing this overtime for a partition, we ensure that we handle small file issues
|
||||
if (!table.getIndex().canIndexLogFiles()) {
|
||||
// TODO : choose last N small files since there can be multiple small files written to a single partition
|
||||
// by different spark partitions in a single batch
|
||||
Option<FileSlice> smallFileSlice = Option.fromJavaOptional(table.getSliceView()
|
||||
.getLatestFileSlicesBeforeOrOn(partitionPath, latestCommitTime.getTimestamp(), false)
|
||||
.filter(
|
||||
fileSlice -> fileSlice.getLogFiles().count() < 1 && fileSlice.getBaseFile().get().getFileSize() < config
|
||||
.getParquetSmallFileLimit())
|
||||
.min((FileSlice left, FileSlice right) ->
|
||||
left.getBaseFile().get().getFileSize() < right.getBaseFile().get().getFileSize() ? -1 : 1));
|
||||
if (smallFileSlice.isPresent()) {
|
||||
allSmallFileSlices.add(smallFileSlice.get());
|
||||
}
|
||||
} else {
|
||||
// If we can index log files, we can add more inserts to log files for fileIds including those under
|
||||
// pending compaction.
|
||||
List<FileSlice> allFileSlices =
|
||||
table.getSliceView().getLatestFileSlicesBeforeOrOn(partitionPath, latestCommitTime.getTimestamp(), true)
|
||||
.collect(Collectors.toList());
|
||||
for (FileSlice fileSlice : allFileSlices) {
|
||||
if (isSmallFile(fileSlice)) {
|
||||
allSmallFileSlices.add(fileSlice);
|
||||
}
|
||||
}
|
||||
}
|
||||
// Create SmallFiles from the eligible file slices
|
||||
for (FileSlice smallFileSlice : allSmallFileSlices) {
|
||||
SmallFile sf = new SmallFile();
|
||||
if (smallFileSlice.getBaseFile().isPresent()) {
|
||||
// TODO : Move logic of file name, file id, base commit time handling inside file slice
|
||||
String filename = smallFileSlice.getBaseFile().get().getFileName();
|
||||
sf.location = new HoodieRecordLocation(FSUtils.getCommitTime(filename), FSUtils.getFileId(filename));
|
||||
sf.sizeBytes = getTotalFileSize(smallFileSlice);
|
||||
smallFileLocations.add(sf);
|
||||
} else {
|
||||
HoodieLogFile logFile = smallFileSlice.getLogFiles().findFirst().get();
|
||||
sf.location = new HoodieRecordLocation(FSUtils.getBaseCommitTimeFromLogPath(logFile.getPath()),
|
||||
FSUtils.getFileIdFromLogPath(logFile.getPath()));
|
||||
sf.sizeBytes = getTotalFileSize(smallFileSlice);
|
||||
smallFileLocations.add(sf);
|
||||
}
|
||||
}
|
||||
}
|
||||
return smallFileLocations;
|
||||
}
|
||||
|
||||
public List<String> getSmallFileIds() {
|
||||
return (List<String>) smallFiles.stream().map(smallFile -> ((SmallFile) smallFile).location.getFileId())
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
private long getTotalFileSize(FileSlice fileSlice) {
|
||||
if (!fileSlice.getBaseFile().isPresent()) {
|
||||
return convertLogFilesSizeToExpectedParquetSize(fileSlice.getLogFiles().collect(Collectors.toList()));
|
||||
} else {
|
||||
return fileSlice.getBaseFile().get().getFileSize()
|
||||
+ convertLogFilesSizeToExpectedParquetSize(fileSlice.getLogFiles().collect(Collectors.toList()));
|
||||
}
|
||||
}
|
||||
|
||||
private boolean isSmallFile(FileSlice fileSlice) {
|
||||
long totalSize = getTotalFileSize(fileSlice);
|
||||
return totalSize < config.getParquetMaxFileSize();
|
||||
}
|
||||
|
||||
// TODO (NA) : Make this static part of utility
|
||||
public long convertLogFilesSizeToExpectedParquetSize(List<HoodieLogFile> hoodieLogFiles) {
|
||||
long totalSizeOfLogFiles = hoodieLogFiles.stream().map(HoodieLogFile::getFileSize)
|
||||
.filter(size -> size > 0).reduce(Long::sum).orElse(0L);
|
||||
// Here we assume that if there is no base parquet file, all log files contain only inserts.
|
||||
// We can then just get the parquet equivalent size of these log files, compare that with
|
||||
// {@link config.getParquetMaxFileSize()} and decide if there is scope to insert more rows
|
||||
return (long) (totalSizeOfLogFiles * config.getLogFileToParquetCompressionRatio());
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,47 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.table.action.deltacommit;
|
||||
|
||||
import org.apache.hudi.client.WriteStatus;
|
||||
import org.apache.hudi.client.common.HoodieSparkEngineContext;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.common.model.WriteOperationType;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
import org.apache.hudi.table.action.HoodieWriteMetadata;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
|
||||
public class SparkUpsertPreppedDeltaCommitActionExecutor<T extends HoodieRecordPayload<T>>
|
||||
extends AbstractSparkDeltaCommitActionExecutor<T> {
|
||||
|
||||
private final JavaRDD<HoodieRecord<T>> preppedRecords;
|
||||
|
||||
public SparkUpsertPreppedDeltaCommitActionExecutor(HoodieSparkEngineContext context,
|
||||
HoodieWriteConfig config, HoodieTable table,
|
||||
String instantTime, JavaRDD<HoodieRecord<T>> preppedRecords) {
|
||||
super(context, config, table, instantTime, WriteOperationType.UPSERT_PREPPED);
|
||||
this.preppedRecords = preppedRecords;
|
||||
}
|
||||
|
||||
@Override
|
||||
public HoodieWriteMetadata<JavaRDD<WriteStatus>> execute() {
|
||||
return super.execute(preppedRecords);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,67 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.table.action.restore;
|
||||
|
||||
import org.apache.hudi.avro.model.HoodieRollbackMetadata;
|
||||
import org.apache.hudi.client.WriteStatus;
|
||||
import org.apache.hudi.client.common.HoodieSparkEngineContext;
|
||||
import org.apache.hudi.common.model.HoodieKey;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
|
||||
import org.apache.hudi.common.table.timeline.HoodieInstant;
|
||||
import org.apache.hudi.common.table.timeline.HoodieTimeline;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.exception.HoodieRollbackException;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
import org.apache.hudi.table.action.rollback.SparkCopyOnWriteRollbackActionExecutor;
|
||||
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
|
||||
@SuppressWarnings("checkstyle:LineLength")
|
||||
public class SparkCopyOnWriteRestoreActionExecutor<T extends HoodieRecordPayload> extends
|
||||
BaseRestoreActionExecutor<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> {
|
||||
|
||||
public SparkCopyOnWriteRestoreActionExecutor(HoodieSparkEngineContext context,
|
||||
HoodieWriteConfig config,
|
||||
HoodieTable table,
|
||||
String instantTime,
|
||||
String restoreInstantTime) {
|
||||
super(context, config, table, instantTime, restoreInstantTime);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected HoodieRollbackMetadata rollbackInstant(HoodieInstant instantToRollback) {
|
||||
table.getMetaClient().reloadActiveTimeline();
|
||||
SparkCopyOnWriteRollbackActionExecutor rollbackActionExecutor = new SparkCopyOnWriteRollbackActionExecutor(
|
||||
(HoodieSparkEngineContext) context,
|
||||
config,
|
||||
table,
|
||||
HoodieActiveTimeline.createNewInstantTime(),
|
||||
instantToRollback,
|
||||
true,
|
||||
true,
|
||||
false);
|
||||
if (!instantToRollback.getAction().equals(HoodieTimeline.COMMIT_ACTION)
|
||||
&& !instantToRollback.getAction().equals(HoodieTimeline.REPLACE_COMMIT_ACTION)) {
|
||||
throw new HoodieRollbackException("Unsupported action in rollback instant:" + instantToRollback);
|
||||
}
|
||||
return rollbackActionExecutor.execute();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,74 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.table.action.restore;
|
||||
|
||||
import org.apache.hudi.avro.model.HoodieRollbackMetadata;
|
||||
import org.apache.hudi.client.WriteStatus;
|
||||
import org.apache.hudi.client.common.HoodieSparkEngineContext;
|
||||
import org.apache.hudi.common.model.HoodieKey;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
|
||||
import org.apache.hudi.common.table.timeline.HoodieInstant;
|
||||
import org.apache.hudi.common.table.timeline.HoodieTimeline;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
import org.apache.hudi.table.action.rollback.SparkMergeOnReadRollbackActionExecutor;
|
||||
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
|
||||
@SuppressWarnings("checkstyle:LineLength")
|
||||
public class SparkMergeOnReadRestoreActionExecutor<T extends HoodieRecordPayload> extends
|
||||
BaseRestoreActionExecutor<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> {
|
||||
|
||||
public SparkMergeOnReadRestoreActionExecutor(HoodieSparkEngineContext context,
|
||||
HoodieWriteConfig config,
|
||||
HoodieTable table,
|
||||
String instantTime,
|
||||
String restoreInstantTime) {
|
||||
super(context, config, table, instantTime, restoreInstantTime);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected HoodieRollbackMetadata rollbackInstant(HoodieInstant instantToRollback) {
|
||||
table.getMetaClient().reloadActiveTimeline();
|
||||
SparkMergeOnReadRollbackActionExecutor rollbackActionExecutor = new SparkMergeOnReadRollbackActionExecutor(
|
||||
context,
|
||||
config,
|
||||
table,
|
||||
HoodieActiveTimeline.createNewInstantTime(),
|
||||
instantToRollback,
|
||||
true,
|
||||
true,
|
||||
false);
|
||||
|
||||
switch (instantToRollback.getAction()) {
|
||||
case HoodieTimeline.COMMIT_ACTION:
|
||||
case HoodieTimeline.DELTA_COMMIT_ACTION:
|
||||
case HoodieTimeline.COMPACTION_ACTION:
|
||||
case HoodieTimeline.REPLACE_COMMIT_ACTION:
|
||||
// TODO : Get file status and create a rollback stat and file
|
||||
// TODO : Delete the .aux files along with the instant file, okay for now since the archival process will
|
||||
// delete these files when it does not see a corresponding instant file under .hoodie
|
||||
return rollbackActionExecutor.execute();
|
||||
default:
|
||||
throw new IllegalArgumentException("invalid action name " + instantToRollback.getAction());
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,239 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.table.action.rollback;
|
||||
|
||||
import org.apache.hudi.client.common.HoodieEngineContext;
|
||||
import org.apache.hudi.common.HoodieRollbackStat;
|
||||
import org.apache.hudi.client.common.HoodieSparkEngineContext;
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.model.HoodieLogFile;
|
||||
import org.apache.hudi.common.table.HoodieTableMetaClient;
|
||||
import org.apache.hudi.common.table.log.HoodieLogFormat;
|
||||
import org.apache.hudi.common.table.log.HoodieLogFormat.Writer;
|
||||
import org.apache.hudi.common.table.log.block.HoodieCommandBlock;
|
||||
import org.apache.hudi.common.table.log.block.HoodieCommandBlock.HoodieCommandBlockTypeEnum;
|
||||
import org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType;
|
||||
import org.apache.hudi.common.table.timeline.HoodieInstant;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.exception.HoodieIOException;
|
||||
import org.apache.hudi.exception.HoodieRollbackException;
|
||||
|
||||
import org.apache.hadoop.fs.FileStatus;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.PathFilter;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.spark.api.java.JavaPairRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
|
||||
import scala.Tuple2;
|
||||
|
||||
/**
|
||||
* Performs Rollback of Hoodie Tables.
|
||||
*/
|
||||
public class ListingBasedRollbackHelper implements Serializable {
|
||||
|
||||
private static final Logger LOG = LogManager.getLogger(ListingBasedRollbackHelper.class);
|
||||
|
||||
private final HoodieTableMetaClient metaClient;
|
||||
private final HoodieWriteConfig config;
|
||||
|
||||
public ListingBasedRollbackHelper(HoodieTableMetaClient metaClient, HoodieWriteConfig config) {
|
||||
this.metaClient = metaClient;
|
||||
this.config = config;
|
||||
}
|
||||
|
||||
/**
|
||||
* Performs all rollback actions that we have collected in parallel.
|
||||
*/
|
||||
public List<HoodieRollbackStat> performRollback(HoodieEngineContext context, HoodieInstant instantToRollback, List<ListingBasedRollbackRequest> rollbackRequests) {
|
||||
int sparkPartitions = Math.max(Math.min(rollbackRequests.size(), config.getRollbackParallelism()), 1);
|
||||
context.setJobStatus(this.getClass().getSimpleName(), "Perform rollback actions");
|
||||
JavaPairRDD<String, HoodieRollbackStat> partitionPathRollbackStatsPairRDD = maybeDeleteAndCollectStats(context, instantToRollback, rollbackRequests, sparkPartitions, true);
|
||||
return partitionPathRollbackStatsPairRDD.reduceByKey(RollbackUtils::mergeRollbackStat).map(Tuple2::_2).collect();
|
||||
}
|
||||
|
||||
/**
|
||||
* Collect all file info that needs to be rollbacked.
|
||||
*/
|
||||
public List<HoodieRollbackStat> collectRollbackStats(HoodieEngineContext context, HoodieInstant instantToRollback, List<ListingBasedRollbackRequest> rollbackRequests) {
|
||||
int sparkPartitions = Math.max(Math.min(rollbackRequests.size(), config.getRollbackParallelism()), 1);
|
||||
context.setJobStatus(this.getClass().getSimpleName(), "Collect rollback stats for upgrade/downgrade");
|
||||
JavaPairRDD<String, HoodieRollbackStat> partitionPathRollbackStatsPairRDD = maybeDeleteAndCollectStats(context, instantToRollback, rollbackRequests, sparkPartitions, false);
|
||||
return partitionPathRollbackStatsPairRDD.map(Tuple2::_2).collect();
|
||||
}
|
||||
|
||||
/**
|
||||
* May be delete interested files and collect stats or collect stats only.
|
||||
*
|
||||
* @param context instance of {@link HoodieEngineContext} to use.
|
||||
* @param instantToRollback {@link HoodieInstant} of interest for which deletion or collect stats is requested.
|
||||
* @param rollbackRequests List of {@link ListingBasedRollbackRequest} to be operated on.
|
||||
* @param sparkPartitions number of spark partitions to use for parallelism.
|
||||
* @param doDelete {@code true} if deletion has to be done. {@code false} if only stats are to be collected w/o performing any deletes.
|
||||
* @return stats collected with or w/o actual deletions.
|
||||
*/
|
||||
JavaPairRDD<String, HoodieRollbackStat> maybeDeleteAndCollectStats(HoodieEngineContext context, HoodieInstant instantToRollback, List<ListingBasedRollbackRequest> rollbackRequests,
|
||||
int sparkPartitions, boolean doDelete) {
|
||||
JavaSparkContext jsc = HoodieSparkEngineContext.getSparkContext(context);
|
||||
return jsc.parallelize(rollbackRequests, sparkPartitions).mapToPair(rollbackRequest -> {
|
||||
switch (rollbackRequest.getType()) {
|
||||
case DELETE_DATA_FILES_ONLY: {
|
||||
final Map<FileStatus, Boolean> filesToDeletedStatus = deleteBaseFiles(metaClient, config, instantToRollback.getTimestamp(),
|
||||
rollbackRequest.getPartitionPath(), doDelete);
|
||||
return new Tuple2<>(rollbackRequest.getPartitionPath(),
|
||||
HoodieRollbackStat.newBuilder().withPartitionPath(rollbackRequest.getPartitionPath())
|
||||
.withDeletedFileResults(filesToDeletedStatus).build());
|
||||
}
|
||||
case DELETE_DATA_AND_LOG_FILES: {
|
||||
final Map<FileStatus, Boolean> filesToDeletedStatus = deleteBaseAndLogFiles(metaClient, config, instantToRollback.getTimestamp(), rollbackRequest.getPartitionPath(), doDelete);
|
||||
return new Tuple2<>(rollbackRequest.getPartitionPath(),
|
||||
HoodieRollbackStat.newBuilder().withPartitionPath(rollbackRequest.getPartitionPath())
|
||||
.withDeletedFileResults(filesToDeletedStatus).build());
|
||||
}
|
||||
case APPEND_ROLLBACK_BLOCK: {
|
||||
Writer writer = null;
|
||||
try {
|
||||
writer = HoodieLogFormat.newWriterBuilder()
|
||||
.onParentPath(FSUtils.getPartitionPath(metaClient.getBasePath(), rollbackRequest.getPartitionPath()))
|
||||
.withFileId(rollbackRequest.getFileId().get())
|
||||
.overBaseCommit(rollbackRequest.getLatestBaseInstant().get()).withFs(metaClient.getFs())
|
||||
.withFileExtension(HoodieLogFile.DELTA_EXTENSION).build();
|
||||
|
||||
// generate metadata
|
||||
if (doDelete) {
|
||||
Map<HeaderMetadataType, String> header = generateHeader(instantToRollback.getTimestamp());
|
||||
// if update belongs to an existing log file
|
||||
writer = writer.appendBlock(new HoodieCommandBlock(header));
|
||||
}
|
||||
} catch (IOException | InterruptedException io) {
|
||||
throw new HoodieRollbackException("Failed to rollback for instant " + instantToRollback, io);
|
||||
} finally {
|
||||
try {
|
||||
if (writer != null) {
|
||||
writer.close();
|
||||
}
|
||||
} catch (IOException io) {
|
||||
throw new HoodieIOException("Error appending rollback block..", io);
|
||||
}
|
||||
}
|
||||
|
||||
// This step is intentionally done after writer is closed. Guarantees that
|
||||
// getFileStatus would reflect correct stats and FileNotFoundException is not thrown in
|
||||
// cloud-storage : HUDI-168
|
||||
Map<FileStatus, Long> filesToNumBlocksRollback = Collections.singletonMap(
|
||||
metaClient.getFs().getFileStatus(Objects.requireNonNull(writer).getLogFile().getPath()),
|
||||
1L
|
||||
);
|
||||
return new Tuple2<>(rollbackRequest.getPartitionPath(),
|
||||
HoodieRollbackStat.newBuilder().withPartitionPath(rollbackRequest.getPartitionPath())
|
||||
.withRollbackBlockAppendResults(filesToNumBlocksRollback).build());
|
||||
}
|
||||
default:
|
||||
throw new IllegalStateException("Unknown Rollback action " + rollbackRequest);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Common method used for cleaning out base files under a partition path during rollback of a set of commits.
|
||||
*/
|
||||
private Map<FileStatus, Boolean> deleteBaseAndLogFiles(HoodieTableMetaClient metaClient, HoodieWriteConfig config,
|
||||
String commit, String partitionPath, boolean doDelete) throws IOException {
|
||||
LOG.info("Cleaning path " + partitionPath);
|
||||
String basefileExtension = metaClient.getTableConfig().getBaseFileFormat().getFileExtension();
|
||||
SerializablePathFilter filter = (path) -> {
|
||||
if (path.toString().endsWith(basefileExtension)) {
|
||||
String fileCommitTime = FSUtils.getCommitTime(path.getName());
|
||||
return commit.equals(fileCommitTime);
|
||||
} else if (FSUtils.isLogFile(path)) {
|
||||
// Since the baseCommitTime is the only commit for new log files, it's okay here
|
||||
String fileCommitTime = FSUtils.getBaseCommitTimeFromLogPath(path);
|
||||
return commit.equals(fileCommitTime);
|
||||
}
|
||||
return false;
|
||||
};
|
||||
|
||||
final Map<FileStatus, Boolean> results = new HashMap<>();
|
||||
FileSystem fs = metaClient.getFs();
|
||||
FileStatus[] toBeDeleted = fs.listStatus(FSUtils.getPartitionPath(config.getBasePath(), partitionPath), filter);
|
||||
for (FileStatus file : toBeDeleted) {
|
||||
if (doDelete) {
|
||||
boolean success = fs.delete(file.getPath(), false);
|
||||
results.put(file, success);
|
||||
LOG.info("Delete file " + file.getPath() + "\t" + success);
|
||||
} else {
|
||||
results.put(file, true);
|
||||
}
|
||||
}
|
||||
return results;
|
||||
}
|
||||
|
||||
/**
|
||||
* Common method used for cleaning out base files under a partition path during rollback of a set of commits.
|
||||
*/
|
||||
private Map<FileStatus, Boolean> deleteBaseFiles(HoodieTableMetaClient metaClient, HoodieWriteConfig config,
|
||||
String commit, String partitionPath, boolean doDelete) throws IOException {
|
||||
final Map<FileStatus, Boolean> results = new HashMap<>();
|
||||
LOG.info("Cleaning path " + partitionPath);
|
||||
FileSystem fs = metaClient.getFs();
|
||||
String basefileExtension = metaClient.getTableConfig().getBaseFileFormat().getFileExtension();
|
||||
PathFilter filter = (path) -> {
|
||||
if (path.toString().contains(basefileExtension)) {
|
||||
String fileCommitTime = FSUtils.getCommitTime(path.getName());
|
||||
return commit.equals(fileCommitTime);
|
||||
}
|
||||
return false;
|
||||
};
|
||||
FileStatus[] toBeDeleted = fs.listStatus(FSUtils.getPartitionPath(config.getBasePath(), partitionPath), filter);
|
||||
for (FileStatus file : toBeDeleted) {
|
||||
if (doDelete) {
|
||||
boolean success = fs.delete(file.getPath(), false);
|
||||
results.put(file, success);
|
||||
LOG.info("Delete file " + file.getPath() + "\t" + success);
|
||||
} else {
|
||||
results.put(file, true);
|
||||
}
|
||||
}
|
||||
return results;
|
||||
}
|
||||
|
||||
private Map<HeaderMetadataType, String> generateHeader(String commit) {
|
||||
// generate metadata
|
||||
Map<HeaderMetadataType, String> header = new HashMap<>(3);
|
||||
header.put(HeaderMetadataType.INSTANT_TIME, metaClient.getActiveTimeline().lastInstant().get().getTimestamp());
|
||||
header.put(HeaderMetadataType.TARGET_INSTANT_TIME, commit);
|
||||
header.put(HeaderMetadataType.COMMAND_BLOCK_TYPE,
|
||||
String.valueOf(HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK.ordinal()));
|
||||
return header;
|
||||
}
|
||||
|
||||
public interface SerializablePathFilter extends PathFilter, Serializable {
|
||||
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,73 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.table.action.rollback;
|
||||
|
||||
import org.apache.hudi.client.WriteStatus;
|
||||
import org.apache.hudi.client.common.HoodieSparkEngineContext;
|
||||
import org.apache.hudi.common.HoodieRollbackStat;
|
||||
import org.apache.hudi.common.model.HoodieKey;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.common.table.timeline.HoodieInstant;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
@SuppressWarnings("checkstyle:LineLength")
|
||||
public class SparkCopyOnWriteRollbackActionExecutor<T extends HoodieRecordPayload> extends
|
||||
BaseCopyOnWriteRollbackActionExecutor<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> {
|
||||
public SparkCopyOnWriteRollbackActionExecutor(HoodieSparkEngineContext context,
|
||||
HoodieWriteConfig config,
|
||||
HoodieTable<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> table,
|
||||
String instantTime,
|
||||
HoodieInstant commitInstant,
|
||||
boolean deleteInstants) {
|
||||
super(context, config, table, instantTime, commitInstant, deleteInstants);
|
||||
}
|
||||
|
||||
public SparkCopyOnWriteRollbackActionExecutor(HoodieSparkEngineContext context,
|
||||
HoodieWriteConfig config,
|
||||
HoodieTable<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> table,
|
||||
String instantTime,
|
||||
HoodieInstant commitInstant,
|
||||
boolean deleteInstants,
|
||||
boolean skipTimelinePublish,
|
||||
boolean useMarkerBasedStrategy) {
|
||||
super(context, config, table, instantTime, commitInstant, deleteInstants, skipTimelinePublish, useMarkerBasedStrategy);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected BaseRollbackActionExecutor.RollbackStrategy getRollbackStrategy() {
|
||||
if (useMarkerBasedStrategy) {
|
||||
return new SparkMarkerBasedRollbackStrategy(table, context, config, instantTime);
|
||||
} else {
|
||||
return this::executeRollbackUsingFileListing;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<HoodieRollbackStat> executeRollbackUsingFileListing(HoodieInstant instantToRollback) {
|
||||
List<ListingBasedRollbackRequest> rollbackRequests = RollbackUtils.generateRollbackRequestsByListingCOW(table.getMetaClient().getFs(), table.getMetaClient().getBasePath(),
|
||||
config.shouldAssumeDatePartitioning());
|
||||
return new ListingBasedRollbackHelper(table.getMetaClient(), config).performRollback(context, instantToRollback, rollbackRequests);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,77 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.table.action.rollback;
|
||||
|
||||
import org.apache.hudi.client.WriteStatus;
|
||||
import org.apache.hudi.client.common.HoodieEngineContext;
|
||||
import org.apache.hudi.client.common.HoodieSparkEngineContext;
|
||||
import org.apache.hudi.common.HoodieRollbackStat;
|
||||
import org.apache.hudi.common.model.HoodieKey;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.common.model.IOType;
|
||||
import org.apache.hudi.common.table.timeline.HoodieInstant;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.exception.HoodieRollbackException;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
import org.apache.hudi.table.MarkerFiles;
|
||||
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import scala.Tuple2;
|
||||
|
||||
@SuppressWarnings("checkstyle:LineLength")
|
||||
public class SparkMarkerBasedRollbackStrategy<T extends HoodieRecordPayload> extends AbstractMarkerBasedRollbackStrategy<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> {
|
||||
public SparkMarkerBasedRollbackStrategy(HoodieTable<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> table, HoodieEngineContext context, HoodieWriteConfig config, String instantTime) {
|
||||
super(table, context, config, instantTime);
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<HoodieRollbackStat> execute(HoodieInstant instantToRollback) {
|
||||
JavaSparkContext jsc = HoodieSparkEngineContext.getSparkContext(context);
|
||||
try {
|
||||
MarkerFiles markerFiles = new MarkerFiles(table, instantToRollback.getTimestamp());
|
||||
List<String> markerFilePaths = markerFiles.allMarkerFilePaths();
|
||||
int parallelism = Math.max(Math.min(markerFilePaths.size(), config.getRollbackParallelism()), 1);
|
||||
return jsc.parallelize(markerFilePaths, parallelism)
|
||||
.map(markerFilePath -> {
|
||||
String typeStr = markerFilePath.substring(markerFilePath.lastIndexOf(".") + 1);
|
||||
IOType type = IOType.valueOf(typeStr);
|
||||
switch (type) {
|
||||
case MERGE:
|
||||
return undoMerge(MarkerFiles.stripMarkerSuffix(markerFilePath));
|
||||
case APPEND:
|
||||
return undoAppend(MarkerFiles.stripMarkerSuffix(markerFilePath), instantToRollback);
|
||||
case CREATE:
|
||||
return undoCreate(MarkerFiles.stripMarkerSuffix(markerFilePath));
|
||||
default:
|
||||
throw new HoodieRollbackException("Unknown marker type, during rollback of " + instantToRollback);
|
||||
}
|
||||
})
|
||||
.mapToPair(rollbackStat -> new Tuple2<>(rollbackStat.getPartitionPath(), rollbackStat))
|
||||
.reduceByKey(RollbackUtils::mergeRollbackStat)
|
||||
.map(Tuple2::_2).collect();
|
||||
} catch (Exception e) {
|
||||
throw new HoodieRollbackException("Error rolling back using marker files written for " + instantToRollback, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,82 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.table.action.rollback;
|
||||
|
||||
import org.apache.hudi.client.WriteStatus;
|
||||
import org.apache.hudi.client.common.HoodieEngineContext;
|
||||
import org.apache.hudi.client.common.HoodieSparkEngineContext;
|
||||
import org.apache.hudi.common.HoodieRollbackStat;
|
||||
import org.apache.hudi.common.model.HoodieKey;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.common.table.timeline.HoodieInstant;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.exception.HoodieIOException;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
|
||||
@SuppressWarnings("checkstyle:LineLength")
|
||||
public class SparkMergeOnReadRollbackActionExecutor<T extends HoodieRecordPayload> extends
|
||||
BaseMergeOnReadRollbackActionExecutor<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> {
|
||||
public SparkMergeOnReadRollbackActionExecutor(HoodieEngineContext context,
|
||||
HoodieWriteConfig config,
|
||||
HoodieTable<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> table,
|
||||
String instantTime,
|
||||
HoodieInstant commitInstant,
|
||||
boolean deleteInstants) {
|
||||
super(context, config, table, instantTime, commitInstant, deleteInstants);
|
||||
}
|
||||
|
||||
public SparkMergeOnReadRollbackActionExecutor(HoodieEngineContext context,
|
||||
HoodieWriteConfig config,
|
||||
HoodieTable<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> table,
|
||||
String instantTime,
|
||||
HoodieInstant commitInstant,
|
||||
boolean deleteInstants,
|
||||
boolean skipTimelinePublish,
|
||||
boolean useMarkerBasedStrategy) {
|
||||
super(context, config, table, instantTime, commitInstant, deleteInstants, skipTimelinePublish, useMarkerBasedStrategy);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected BaseRollbackActionExecutor.RollbackStrategy getRollbackStrategy() {
|
||||
if (useMarkerBasedStrategy) {
|
||||
return new SparkMarkerBasedRollbackStrategy(table, context, config, instantTime);
|
||||
} else {
|
||||
return this::executeRollbackUsingFileListing;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<HoodieRollbackStat> executeRollbackUsingFileListing(HoodieInstant resolvedInstant) {
|
||||
List<ListingBasedRollbackRequest> rollbackRequests;
|
||||
JavaSparkContext jsc = HoodieSparkEngineContext.getSparkContext(context);
|
||||
try {
|
||||
rollbackRequests = RollbackUtils.generateRollbackRequestsUsingFileListingMOR(resolvedInstant, table, context);
|
||||
} catch (IOException e) {
|
||||
throw new HoodieIOException("Error generating rollback requests by file listing.", e);
|
||||
}
|
||||
return new ListingBasedRollbackHelper(table.getMetaClient(), config).performRollback(context, resolvedInstant, rollbackRequests);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,48 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.table.upgrade;
|
||||
|
||||
import org.apache.hudi.client.common.HoodieEngineContext;
|
||||
import org.apache.hudi.common.table.timeline.HoodieInstant;
|
||||
import org.apache.hudi.common.table.timeline.HoodieTimeline;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.table.HoodieSparkTable;
|
||||
import org.apache.hudi.table.MarkerFiles;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
/**
|
||||
* Downgrade handle to assist in downgrading hoodie table from version 1 to 0.
|
||||
*/
|
||||
public class OneToZeroDowngradeHandler implements DowngradeHandler {
|
||||
|
||||
@Override
|
||||
public void downgrade(HoodieWriteConfig config, HoodieEngineContext context, String instantTime) {
|
||||
// fetch pending commit info
|
||||
HoodieSparkTable table = HoodieSparkTable.create(config, context);
|
||||
HoodieTimeline inflightTimeline = table.getMetaClient().getCommitsTimeline().filterPendingExcludingCompaction();
|
||||
List<HoodieInstant> commits = inflightTimeline.getReverseOrderedInstants().collect(Collectors.toList());
|
||||
for (HoodieInstant commitInstant : commits) {
|
||||
// delete existing marker files
|
||||
MarkerFiles markerFiles = new MarkerFiles(table, commitInstant.getTimestamp());
|
||||
markerFiles.quietDeleteMarkerDir(context, config.getMarkersDeleteParallelism());
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,66 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.table.upgrade;
|
||||
|
||||
import org.apache.hudi.client.common.HoodieEngineContext;
|
||||
import org.apache.hudi.common.table.HoodieTableMetaClient;
|
||||
import org.apache.hudi.common.table.HoodieTableVersion;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.exception.HoodieUpgradeDowngradeException;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
public class SparkUpgradeDowngrade extends AbstractUpgradeDowngrade {
|
||||
|
||||
public SparkUpgradeDowngrade(HoodieTableMetaClient metaClient, HoodieWriteConfig config, HoodieEngineContext context) {
|
||||
super(metaClient, config, context);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void run(HoodieTableMetaClient metaClient,
|
||||
HoodieTableVersion toVersion,
|
||||
HoodieWriteConfig config,
|
||||
HoodieEngineContext context,
|
||||
String instantTime) {
|
||||
try {
|
||||
new SparkUpgradeDowngrade(metaClient, config, context).run(toVersion, instantTime);
|
||||
} catch (IOException e) {
|
||||
throw new HoodieUpgradeDowngradeException("Error during upgrade/downgrade to version:" + toVersion, e);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void upgrade(HoodieTableVersion fromVersion, HoodieTableVersion toVersion, String instantTime) {
|
||||
if (fromVersion == HoodieTableVersion.ZERO && toVersion == HoodieTableVersion.ONE) {
|
||||
new ZeroToOneUpgradeHandler().upgrade(config, context, instantTime);
|
||||
} else {
|
||||
throw new HoodieUpgradeDowngradeException(fromVersion.versionCode(), toVersion.versionCode(), true);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void downgrade(HoodieTableVersion fromVersion, HoodieTableVersion toVersion, String instantTime) {
|
||||
if (fromVersion == HoodieTableVersion.ONE && toVersion == HoodieTableVersion.ZERO) {
|
||||
new OneToZeroDowngradeHandler().downgrade(config, context, instantTime);
|
||||
} else {
|
||||
throw new HoodieUpgradeDowngradeException(fromVersion.versionCode(), toVersion.versionCode(), false);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,136 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.table.upgrade;
|
||||
|
||||
import org.apache.hudi.client.common.HoodieEngineContext;
|
||||
import org.apache.hudi.common.HoodieRollbackStat;
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.model.HoodieTableType;
|
||||
import org.apache.hudi.common.model.IOType;
|
||||
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
|
||||
import org.apache.hudi.common.table.timeline.HoodieInstant;
|
||||
import org.apache.hudi.common.table.timeline.HoodieTimeline;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.exception.HoodieRollbackException;
|
||||
import org.apache.hudi.table.HoodieSparkTable;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
import org.apache.hudi.table.MarkerFiles;
|
||||
import org.apache.hudi.table.action.rollback.ListingBasedRollbackHelper;
|
||||
import org.apache.hudi.table.action.rollback.ListingBasedRollbackRequest;
|
||||
|
||||
import org.apache.hadoop.fs.FileStatus;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hudi.table.action.rollback.RollbackUtils;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
/**
|
||||
* Upgrade handle to assist in upgrading hoodie table from version 0 to 1.
|
||||
*/
|
||||
public class ZeroToOneUpgradeHandler implements UpgradeHandler {
|
||||
|
||||
@Override
|
||||
public void upgrade(HoodieWriteConfig config, HoodieEngineContext context, String instantTime) {
|
||||
// fetch pending commit info
|
||||
HoodieSparkTable table = HoodieSparkTable.create(config, context);
|
||||
HoodieTimeline inflightTimeline = table.getMetaClient().getCommitsTimeline().filterPendingExcludingCompaction();
|
||||
List<String> commits = inflightTimeline.getReverseOrderedInstants().map(HoodieInstant::getTimestamp)
|
||||
.collect(Collectors.toList());
|
||||
if (commits.size() > 0 && instantTime != null) {
|
||||
// ignore the latest inflight commit since a new commit would have been started and we need to fix any pending commits from previous launch
|
||||
commits.remove(instantTime);
|
||||
}
|
||||
for (String commit : commits) {
|
||||
// for every pending commit, delete old marker files and re-create marker files in new format
|
||||
recreateMarkerFiles(commit, table, context, config.getMarkersDeleteParallelism());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Recreate marker files in new format.
|
||||
* Step1: Delete existing marker files
|
||||
* Step2: Collect all rollback file info.
|
||||
* Step3: recreate marker files for all interested files.
|
||||
*
|
||||
* @param commitInstantTime instant of interest for which marker files need to be recreated.
|
||||
* @param table instance of {@link HoodieSparkTable} to use
|
||||
* @param context instance of {@link HoodieEngineContext} to use
|
||||
* @throws HoodieRollbackException on any exception during upgrade.
|
||||
*/
|
||||
private static void recreateMarkerFiles(final String commitInstantTime,
|
||||
HoodieSparkTable table,
|
||||
HoodieEngineContext context,
|
||||
int parallelism) throws HoodieRollbackException {
|
||||
try {
|
||||
// fetch hoodie instant
|
||||
Option<HoodieInstant> commitInstantOpt = Option.fromJavaOptional(table.getActiveTimeline().getCommitsTimeline().getInstants()
|
||||
.filter(instant -> HoodieActiveTimeline.EQUALS.test(instant.getTimestamp(), commitInstantTime))
|
||||
.findFirst());
|
||||
if (commitInstantOpt.isPresent()) {
|
||||
// delete existing marker files
|
||||
MarkerFiles markerFiles = new MarkerFiles(table, commitInstantTime);
|
||||
markerFiles.quietDeleteMarkerDir(context, parallelism);
|
||||
|
||||
// generate rollback stats
|
||||
List<ListingBasedRollbackRequest> rollbackRequests;
|
||||
if (table.getMetaClient().getTableType() == HoodieTableType.COPY_ON_WRITE) {
|
||||
rollbackRequests = RollbackUtils.generateRollbackRequestsByListingCOW(table.getMetaClient().getFs(), table.getMetaClient().getBasePath(),
|
||||
table.getConfig().shouldAssumeDatePartitioning());
|
||||
} else {
|
||||
rollbackRequests = RollbackUtils.generateRollbackRequestsUsingFileListingMOR(commitInstantOpt.get(), table, context);
|
||||
}
|
||||
List<HoodieRollbackStat> rollbackStats = new ListingBasedRollbackHelper(table.getMetaClient(), table.getConfig())
|
||||
.collectRollbackStats(context, commitInstantOpt.get(), rollbackRequests);
|
||||
|
||||
// recreate marker files adhering to marker based rollback
|
||||
for (HoodieRollbackStat rollbackStat : rollbackStats) {
|
||||
for (String path : rollbackStat.getSuccessDeleteFiles()) {
|
||||
String dataFileName = path.substring(path.lastIndexOf("/") + 1);
|
||||
// not feasible to differentiate MERGE from CREATE. hence creating with MERGE IOType for all base files.
|
||||
markerFiles.create(rollbackStat.getPartitionPath(), dataFileName, IOType.MERGE);
|
||||
}
|
||||
for (FileStatus fileStatus : rollbackStat.getCommandBlocksCount().keySet()) {
|
||||
markerFiles.create(rollbackStat.getPartitionPath(), getFileNameForMarkerFromLogFile(fileStatus.getPath().toString(), table), IOType.APPEND);
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
throw new HoodieRollbackException("Exception thrown while upgrading Hoodie Table from version 0 to 1", e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Curates file name for marker from existing log file path.
|
||||
* log file format : partitionpath/.fileid_baseInstant.log.writetoken
|
||||
* marker file format : partitionpath/fileId_writetoken_baseinstant.basefileExtn.marker.APPEND
|
||||
*
|
||||
* @param logFilePath log file path for which marker file name needs to be generated.
|
||||
* @return the marker file name thus curated.
|
||||
*/
|
||||
private static String getFileNameForMarkerFromLogFile(String logFilePath, HoodieTable table) {
|
||||
Path logPath = new Path(table.getMetaClient().getBasePath(), logFilePath);
|
||||
String fileId = FSUtils.getFileIdFromLogPath(logPath);
|
||||
String baseInstant = FSUtils.getBaseCommitTimeFromLogPath(logPath);
|
||||
String writeToken = FSUtils.getWriteTokenFromLogPath(logPath);
|
||||
|
||||
return FSUtils.makeDataFileName(baseInstant, writeToken, fileId, table.getBaseFileFormat().getFileExtension());
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user