
[HUDI-242] Support for RFC-12/Bootstrapping of external datasets to hudi (#1876)

- [HUDI-418] Bootstrap Index Implementation using HFile with unit-test
 - [HUDI-421] FileSystem View Changes to support Bootstrap with unit-tests
 - [HUDI-424] Implement Query Side Integration for querying tables containing bootstrap file slices
 - [HUDI-423] Implement upsert functionality for handling updates to these bootstrap file slices
 - [HUDI-421] Bootstrap Write Client with tests
 - [HUDI-425] Added HoodieDeltaStreamer support
 - [HUDI-899] Add a knob to change partition-path style while performing metadata bootstrap
 - [HUDI-900] Metadata Bootstrap Key Generator needs to handle complex keys correctly
 - [HUDI-424] Simplify Record reader implementation
 - [HUDI-420] Hoodie demo working with Hive and Spark SQL; Hoodie CLI also working with bootstrap tables

Co-authored-by: Mehrotra <uditme@amazon.com>
Co-authored-by: Vinoth Chandar <vinoth@apache.org>
Co-authored-by: Balaji Varadarajan <varadarb@uber.com>
vinoth chandar
2020-08-03 20:19:21 -07:00
committed by GitHub
parent 266bce12b3
commit 539621bd33
175 changed files with 7540 additions and 779 deletions
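
As a rough sketch of how the pieces listed in the commit message above fit together: a write config carrying the new bootstrap options might be assembled as below and handed to the write client (see the HoodieWriteClient hunk further down). The paths, table name, and key generator class are illustrative placeholders; only the withBootstrapConfig/HoodieBootstrapConfig builder methods are taken from this commit.

import org.apache.hudi.config.HoodieBootstrapConfig;
import org.apache.hudi.config.HoodieWriteConfig;

// Paths, table name and key generator class below are placeholders, not part of this commit.
HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder()
    .withPath("/user/hive/warehouse/stock_ticks_cow_bs")                       // target Hudi table base path
    .forTable("stock_ticks_cow_bs")
    .withBootstrapConfig(HoodieBootstrapConfig.newBuilder()
        .withBootstrapBasePath("/user/hive/warehouse/stock_ticks_cow_bs_src")  // existing parquet dataset
        .withBootstrapKeyGenClass("org.apache.hudi.keygen.SimpleKeyGenerator") // assumed key generator class
        .withBootstrapParallelism(2)
        .build())
    .build();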

View File

@@ -206,6 +206,7 @@
This product includes code from Apache Hive.
* org.apache.hadoop.hive.ql.io.CombineHiveInputFormat copied to org.apache.hudi.hadoop.hive.HoodieCombineHiveInputFormat
* org.apache.hadoop.hive.serde2.ColumnProjectionUtils copied and modified to org.apache.hudi.hadoop.HoodieColumnProjectionUtils
Copyright: 2011-2019 The Apache Software Foundation
Home page: http://hive.apache.org/

View File

@@ -19,4 +19,7 @@ connect --path /user/hive/warehouse/stock_ticks_mor
compactions show all
compaction schedule --hoodieConfigs hoodie.compact.inline.max.delta.commits=1
compaction run --parallelism 2 --sparkMemory 1G --schemaFilePath /var/demo/config/schema.avsc --retry 1
connect --path /user/hive/warehouse/stock_ticks_mor_bs
compactions show all
compaction schedule --hoodieConfigs hoodie.compact.inline.max.delta.commits=1
compaction run --parallelism 2 --sparkMemory 1G --schemaFilePath /var/demo/config/schema.avsc --retry 1

View File

@@ -25,4 +25,12 @@ select symbol, ts, volume, open, close from stock_ticks_cow where symbol = 'GO
select symbol, ts, volume, open, close from stock_ticks_mor_ro where symbol = 'GOOG';
select symbol, ts, volume, open, close from stock_ticks_mor_rt where symbol = 'GOOG';
select symbol, max(ts) from stock_ticks_cow_bs group by symbol HAVING symbol = 'GOOG';
select symbol, max(ts) from stock_ticks_mor_bs_ro group by symbol HAVING symbol = 'GOOG';
select symbol, max(ts) from stock_ticks_mor_bs_rt group by symbol HAVING symbol = 'GOOG';
select symbol, ts, volume, open, close from stock_ticks_cow_bs where symbol = 'GOOG';
select symbol, ts, volume, open, close from stock_ticks_mor_bs_ro where symbol = 'GOOG';
select symbol, ts, volume, open, close from stock_ticks_mor_bs_rt where symbol = 'GOOG';
!quit

View File

@@ -23,4 +23,10 @@ select symbol, max(ts) from stock_ticks_mor_rt group by symbol HAVING symbol = '
select symbol, ts, volume, open, close from stock_ticks_mor_ro where symbol = 'GOOG';
select symbol, ts, volume, open, close from stock_ticks_mor_rt where symbol = 'GOOG';
select symbol, max(ts) from stock_ticks_mor_bs_ro group by symbol HAVING symbol = 'GOOG';
select symbol, max(ts) from stock_ticks_mor_bs_rt group by symbol HAVING symbol = 'GOOG';
select symbol, ts, volume, open, close from stock_ticks_mor_bs_ro where symbol = 'GOOG';
select symbol, ts, volume, open, close from stock_ticks_mor_bs_rt where symbol = 'GOOG';
!quit

View File

@@ -23,5 +23,11 @@ set hoodie.stock_ticks_cow.consume.start.timestamp='${min.commit.time}';
select symbol, ts, volume, open, close from stock_ticks_cow where symbol = 'GOOG' and `_hoodie_commit_time` > '${min.commit.time}';
set hoodie.stock_ticks_cow_bs.consume.mode=INCREMENTAL;
set hoodie.stock_ticks_cow_bs.consume.max.commits=3;
set hoodie.stock_ticks_cow_bs.consume.start.timestamp='00000000000001';
select symbol, ts, volume, open, close from stock_ticks_cow_bs where symbol = 'GOOG' and `_hoodie_commit_time` > '00000000000001';
!quit

View File

@@ -23,5 +23,11 @@ set hoodie.stock_ticks_mor.consume.start.timestamp='${min.commit.time}';
select symbol, ts, volume, open, close from stock_ticks_mor_ro where symbol = 'GOOG' and `_hoodie_commit_time` > '${min.commit.time}';
set hoodie.stock_ticks_mor_bs.consume.mode=INCREMENTAL;
set hoodie.stock_ticks_mor_bs.consume.max.commits=3;
set hoodie.stock_ticks_mor_bs.consume.start.timestamp='00000000000001';
select symbol, ts, volume, open, close from stock_ticks_mor_bs_ro where symbol = 'GOOG' and `_hoodie_commit_time` > '00000000000001';
!quit

View File

@@ -23,5 +23,11 @@ set hoodie.stock_ticks_mor.consume.start.timestamp='${min.commit.time}';
select symbol, ts, volume, open, close from stock_ticks_mor_rt where symbol = 'GOOG' and `_hoodie_commit_time` > '${min.commit.time}';
set hoodie.stock_ticks_mor_bs.consume.mode=INCREMENTAL;
set hoodie.stock_ticks_mor_bs.consume.max.commits=3;
set hoodie.stock_ticks_mor_bs.consume.start.timestamp='00000000000001';
select symbol, ts, volume, open, close from stock_ticks_mor_bs_rt where symbol = 'GOOG' and `_hoodie_commit_time` > '00000000000001';
!quit

View File

@@ -27,4 +27,14 @@ spark.sql("select symbol, max(ts) from stock_ticks_mor_rt group by symbol HAVING
spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor_ro where symbol = 'GOOG'").show(100, false)
spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor_rt where symbol = 'GOOG'").show(100, false)
// Bootstrapped Copy-On-Write table
spark.sql("select symbol, max(ts) from stock_ticks_cow_bs group by symbol HAVING symbol = 'GOOG'").show(100, false)
spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_cow_bs where symbol = 'GOOG'").show(100, false)
// Bootstrapped Merge-On-Read table
spark.sql("select symbol, max(ts) from stock_ticks_mor_bs_ro group by symbol HAVING symbol = 'GOOG'").show(100, false)
spark.sql("select symbol, max(ts) from stock_ticks_mor_bs_rt group by symbol HAVING symbol = 'GOOG'").show(100, false)
spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor_bs_ro where symbol = 'GOOG'").show(100, false)
spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor_bs_rt where symbol = 'GOOG'").show(100, false)
System.exit(0)

View File

@@ -26,4 +26,14 @@ spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from s
spark.sql("select symbol, max(ts) from stock_ticks_mor_rt group by symbol HAVING symbol = 'GOOG'").show(100, false)
spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor_rt where symbol = 'GOOG'").show(100, false)
// Copy-On-Write Bootstrapped table
spark.sql("select symbol, max(ts) from stock_ticks_cow_bs group by symbol HAVING symbol = 'GOOG'").show(100, false)
spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_cow_bs where symbol = 'GOOG'").show(100, false)
// Merge-On-Read table Bootstrapped Table
spark.sql("select symbol, max(ts) from stock_ticks_mor_bs_ro group by symbol HAVING symbol = 'GOOG'").show(100, false)
spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor_bs_ro where symbol = 'GOOG'").show(100, false)
spark.sql("select symbol, max(ts) from stock_ticks_mor_bs_rt group by symbol HAVING symbol = 'GOOG'").show(100, false)
spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor_bs_rt where symbol = 'GOOG'").show(100, false)
System.exit(0)

View File

@@ -0,0 +1,22 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.spark.sql.functions.col
val df = spark.read.format("org.apache.hudi").load("/user/hive/warehouse/stock_ticks_cow/*/*/*").drop("_hoodie_commit_time", "_hoodie_record_key", "_hoodie_file_name", "_hoodie_commit_seqno", "_hoodie_partition_path")
df.write.format("parquet").save("/user/hive/warehouse/stock_ticks_cow_bs_src/2018/08/31/")
System.exit(0)

View File

@@ -52,8 +52,38 @@ spark.sql("select key, `_hoodie_partition_path` as datestr, symbol, ts, open, cl
mode(SaveMode.Overwrite).
save("/user/hive/warehouse/stock_ticks_derived_mor");
spark.sql("show tables").show(20, false)
spark.sql("select count(*) from stock_ticks_derived_mor_ro").show(20, false)
spark.sql("select count(*) from stock_ticks_derived_mor_rt").show(20, false)
val hoodieIncQueryBsDF = spark.read.format("org.apache.hudi").
option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL).
option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY, "00000000000001").
load("/user/hive/warehouse/stock_ticks_cow_bs");
hoodieIncQueryBsDF.registerTempTable("stock_ticks_cow_bs_incr")
spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_cow_bs_incr where symbol = 'GOOG'").show(100, false);
spark.sql("select key, `_hoodie_partition_path` as datestr, symbol, ts, open, close from stock_ticks_cow_bs_incr").
write.format("org.apache.hudi").
option("hoodie.insert.shuffle.parallelism", "2").
option("hoodie.upsert.shuffle.parallelism","2").
option(DataSourceWriteOptions.TABLE_TYPE_OPT_KEY, DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL).
option(DataSourceWriteOptions.OPERATION_OPT_KEY, DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL).
option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY, "key").
option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY, "datestr").
option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY, "ts").
option(HoodieWriteConfig.TABLE_NAME, "stock_ticks_derived_mor_bs").
option(DataSourceWriteOptions.HIVE_TABLE_OPT_KEY, "stock_ticks_derived_mor_bs").
option(DataSourceWriteOptions.HIVE_DATABASE_OPT_KEY, "default").
option(DataSourceWriteOptions.HIVE_URL_OPT_KEY, "jdbc:hive2://hiveserver:10000").
option(DataSourceWriteOptions.HIVE_USER_OPT_KEY, "hive").
option(DataSourceWriteOptions.HIVE_PASS_OPT_KEY, "hive").
option(DataSourceWriteOptions.HIVE_SYNC_ENABLED_OPT_KEY, "true").
option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY, "datestr").
mode(SaveMode.Overwrite).
save("/user/hive/warehouse/stock_ticks_derived_mor_bs");
spark.sql("show tables").show(20, false)
spark.sql("select count(*) from stock_ticks_derived_mor_bs_ro").show(20, false)
spark.sql("select count(*) from stock_ticks_derived_mor_bs_rt").show(20, false)
System.exit(0);

View File

@@ -25,4 +25,7 @@ if [ -z "$CLIENT_JAR" ]; then
echo "Client jar location not set, please set it in conf/hudi-env.sh"
fi
java -cp ${HADOOP_CONF_DIR}:${SPARK_CONF_DIR}:$DIR/target/lib/*:$HOODIE_JAR:${CLIENT_JAR} -DSPARK_CONF_DIR=${SPARK_CONF_DIR} -DHADOOP_CONF_DIR=${HADOOP_CONF_DIR} org.springframework.shell.Bootstrap $@
OTHER_JARS=`ls ${DIR}/target/lib/* | grep -v 'hudi-[^/]*jar' | tr '\n' ':'`
echo "Running : java -cp ${HADOOP_CONF_DIR}:${SPARK_CONF_DIR}:${HOODIE_JAR}:${OTHER_JARS}:${CLIENT_JAR} -DSPARK_CONF_DIR=${SPARK_CONF_DIR} -DHADOOP_CONF_DIR=${HADOOP_CONF_DIR} org.springframework.shell.Bootstrap $@"
java -cp ${HADOOP_CONF_DIR}:${SPARK_CONF_DIR}:${HOODIE_JAR}:${OTHER_JARS}:${CLIENT_JAR} -DSPARK_CONF_DIR=${SPARK_CONF_DIR} -DHADOOP_CONF_DIR=${HADOOP_CONF_DIR} org.springframework.shell.Bootstrap $@

View File

@@ -139,26 +139,6 @@
</dependency>
<!-- Hoodie -->
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-client</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-common</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-hive-sync</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-utilities_${scala.binary.version}</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-common</artifactId>
@@ -198,6 +178,12 @@
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-utilities-bundle_${scala.binary.version}</artifactId>
<version>${project.version}</version>
</dependency>
<!-- Logging -->
<dependency>
<groupId>log4j</groupId>

View File

@@ -213,7 +213,7 @@ public class CompactionCommand implements CommandMarker {
if (exitCode != 0) {
return "Failed to run compaction for " + compactionInstantTime;
}
return "Compaction successfully completed for " + compactionInstantTime;
return "Attempted to schedule compaction for " + compactionInstantTime;
}
@CliCommand(value = "compaction run", help = "Run Compaction for given instant time")

View File

@@ -22,6 +22,7 @@ import org.apache.hudi.cli.HoodieCLI;
import org.apache.hudi.cli.HoodiePrintHelper;
import org.apache.hudi.cli.HoodieTableHeaderFields;
import org.apache.hudi.cli.TableHeader;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.FileSlice;
import org.apache.hudi.common.model.HoodieLogFile;
import org.apache.hudi.common.table.HoodieTableMetaClient;
@@ -239,7 +240,7 @@ public class FileSystemViewCommand implements CommandMarker {
new HoodieTableMetaClient(client.getHadoopConf(), client.getBasePath(), true);
FileSystem fs = HoodieCLI.fs;
String globPath = String.format("%s/%s/*", client.getBasePath(), globRegex);
FileStatus[] statuses = fs.globStatus(new Path(globPath));
List<FileStatus> statuses = FSUtils.getGlobStatusExcludingMetaFolder(fs, new Path(globPath));
Stream<HoodieInstant> instantsStream;
HoodieTimeline timeline;
@@ -269,6 +270,6 @@ public class FileSystemViewCommand implements CommandMarker {
HoodieTimeline filteredTimeline = new HoodieDefaultTimeline(instantsStream,
(Function<HoodieInstant, Option<byte[]>> & Serializable) metaClient.getActiveTimeline()::getInstantDetails);
return new HoodieTableFileSystemView(metaClient, filteredTimeline, statuses);
return new HoodieTableFileSystemView(metaClient, filteredTimeline, statuses.toArray(new FileStatus[0]));
}
}

View File

@@ -22,6 +22,7 @@ import org.apache.hudi.cli.HoodieCLI;
import org.apache.hudi.cli.HoodiePrintHelper;
import org.apache.hudi.cli.HoodieTableHeaderFields;
import org.apache.hudi.cli.TableHeader;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.HoodieLogFile;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordPayload;
@@ -53,7 +54,6 @@ import org.springframework.stereotype.Component;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
@@ -82,7 +82,7 @@ public class HoodieLogFileCommand implements CommandMarker {
throws IOException {
FileSystem fs = HoodieCLI.getTableMetaClient().getFs();
List<String> logFilePaths = Arrays.stream(fs.globStatus(new Path(logFilePathPattern)))
List<String> logFilePaths = FSUtils.getGlobStatusExcludingMetaFolder(fs, new Path(logFilePathPattern)).stream()
.map(status -> status.getPath().toString()).collect(Collectors.toList());
Map<String, List<Tuple3<HoodieLogBlockType, Tuple2<Map<HeaderMetadataType, String>, Map<HeaderMetadataType, String>>, Integer>>> commitCountAndMetadata =
new HashMap<>();
@@ -175,7 +175,7 @@ public class HoodieLogFileCommand implements CommandMarker {
HoodieTableMetaClient client = HoodieCLI.getTableMetaClient();
FileSystem fs = client.getFs();
List<String> logFilePaths = Arrays.stream(fs.globStatus(new Path(logFilePathPattern)))
List<String> logFilePaths = FSUtils.getGlobStatusExcludingMetaFolder(fs, new Path(logFilePathPattern)).stream()
.map(status -> status.getPath().toString()).sorted(Comparator.reverseOrder())
.collect(Collectors.toList());

View File

@@ -118,7 +118,7 @@ public class StatsCommand implements CommandMarker {
FileSystem fs = HoodieCLI.fs;
String globPath = String.format("%s/%s/*", HoodieCLI.getTableMetaClient().getBasePath(), globRegex);
FileStatus[] statuses = fs.globStatus(new Path(globPath));
List<FileStatus> statuses = FSUtils.getGlobStatusExcludingMetaFolder(fs, new Path(globPath));
// max, min, #small files < 10MB, 50th, avg, 95th
Histogram globalHistogram = new Histogram(new UniformReservoir(MAX_FILES));

View File

@@ -88,8 +88,7 @@ public class TableCommand implements CommandMarker {
@CliOption(key = {"archiveLogFolder"}, help = "Folder Name for storing archived timeline") String archiveFolder,
@CliOption(key = {"layoutVersion"}, help = "Specific Layout Version to use") Integer layoutVersion,
@CliOption(key = {"payloadClass"}, unspecifiedDefaultValue = "org.apache.hudi.common.model.HoodieAvroPayload",
help = "Payload Class") final String payloadClass)
throws IOException {
help = "Payload Class") final String payloadClass) throws IOException {
boolean initialized = HoodieCLI.initConf();
HoodieCLI.initFS(initialized);

View File

@@ -62,6 +62,7 @@ public class TestArchivedCommitsCommand extends AbstractShellIntegrationTest {
// Create table and connect
String tableName = "test_table";
tablePath = basePath + File.separator + tableName;
new TableCommand().createTable(
tablePath, tableName,
"COPY_ON_WRITE", "", 1, "org.apache.hudi.common.model.HoodieAvroPayload");

View File

@@ -149,6 +149,35 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> extends AbstractHo
return recordsWithLocation.filter(v1 -> !v1.isCurrentLocationKnown());
}
/**
* Main API to run bootstrap to hudi.
*/
public void bootstrap(Option<Map<String, String>> extraMetadata) {
if (rollbackPending) {
rollBackInflightBootstrap();
}
HoodieTable<T> table = getTableAndInitCtx(WriteOperationType.UPSERT);
table.bootstrap(jsc, extraMetadata);
}
/**
* Main API to rollback pending bootstrap.
*/
protected void rollBackInflightBootstrap() {
LOG.info("Rolling back pending bootstrap if present");
HoodieTable<T> table = HoodieTable.create(config, hadoopConf);
HoodieTimeline inflightTimeline = table.getMetaClient().getCommitsTimeline().filterPendingExcludingCompaction();
Option<String> instant = Option.fromJavaOptional(
inflightTimeline.getReverseOrderedInstants().map(HoodieInstant::getTimestamp).findFirst());
if (instant.isPresent() && HoodieTimeline.compareTimestamps(instant.get(), HoodieTimeline.LESSER_THAN_OR_EQUALS,
HoodieTimeline.FULL_BOOTSTRAP_INSTANT_TS)) {
LOG.info("Found pending bootstrap instants. Rolling them back");
table.rollbackBootstrap(jsc, HoodieActiveTimeline.createNewInstantTime());
LOG.info("Finished rolling back pending bootstrap");
}
}
/**
* Upsert a batch of new records into Hoodie table at the supplied instantTime.
*
@@ -671,7 +700,13 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> extends AbstractHo
List<String> commits = inflightTimeline.getReverseOrderedInstants().map(HoodieInstant::getTimestamp)
.collect(Collectors.toList());
for (String commit : commits) {
rollback(commit);
if (HoodieTimeline.compareTimestamps(commit, HoodieTimeline.LESSER_THAN_OR_EQUALS,
HoodieTimeline.FULL_BOOTSTRAP_INSTANT_TS)) {
rollBackInflightBootstrap();
break;
} else {
rollback(commit);
}
}
}
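
A minimal usage sketch for the API above, assuming an existing JavaSparkContext (jsc) and a HoodieWriteConfig with bootstrap options set (see the config sketch near the top); the two-argument client constructor and try-with-resources usage are assumptions, not part of this diff:

import org.apache.hudi.common.model.HoodieAvroPayload;
import org.apache.hudi.common.util.Option;

// jsc and writeConfig are assumed to exist; bootstrap() first rolls back any bootstrap left inflight.
try (HoodieWriteClient<HoodieAvroPayload> client = new HoodieWriteClient<>(jsc, writeConfig)) {
  client.bootstrap(Option.empty()); // no extra commit metadata
}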

View File

@@ -0,0 +1,35 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.client.bootstrap;
/**
* Identifies different types of bootstrap.
*/
public enum BootstrapMode {
/**
* In this mode, record level metadata is generated for each source record and both original record and metadata
* for each record copied.
*/
FULL_RECORD,
/**
* In this mode, record level metadata alone is generated for each source record and stored in new bootstrap location.
*/
METADATA_ONLY
}

View File

@@ -0,0 +1,51 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.client.bootstrap;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.common.util.Option;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.IndexedRecord;
public class BootstrapRecordPayload implements HoodieRecordPayload<BootstrapRecordPayload> {
private final GenericRecord record;
public BootstrapRecordPayload(GenericRecord record) {
this.record = record;
}
@Override
public BootstrapRecordPayload preCombine(BootstrapRecordPayload another) {
return this;
}
@Override
public Option<IndexedRecord> combineAndGetUpdateValue(IndexedRecord currentValue, Schema schema) {
return Option.ofNullable(record);
}
@Override
public Option<IndexedRecord> getInsertValue(Schema schema) {
return Option.ofNullable(record);
}
}

View File

@@ -0,0 +1,77 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.client.bootstrap;
import org.apache.hudi.avro.model.HoodieFileStatus;
import org.apache.hudi.common.bootstrap.FileStatusUtils;
import org.apache.hudi.common.util.ParquetUtils;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.avro.Schema;
import org.apache.hadoop.fs.Path;
import java.util.List;
/**
* Bootstrap Schema Provider. Schema provided in config is used. If not available, use schema from Parquet
*/
public class BootstrapSchemaProvider {
protected final HoodieWriteConfig writeConfig;
public BootstrapSchemaProvider(HoodieWriteConfig writeConfig) {
this.writeConfig = writeConfig;
}
/**
* Main API to select avro schema for bootstrapping.
* @param jsc Java Spark Context
* @param partitions List of partitions with files within them
* @return Avro Schema
*/
public final Schema getBootstrapSchema(JavaSparkContext jsc, List<Pair<String, List<HoodieFileStatus>>> partitions) {
if (writeConfig.getSchema() != null) {
// Use schema specified by user if set
return Schema.parse(writeConfig.getSchema());
}
return getBootstrapSourceSchema(jsc, partitions);
}
/**
* Select a random file to be used to generate avro schema.
* Override this method to get custom schema selection.
* @param jsc Java Spark Context
* @param partitions List of partitions with files within them
* @return Avro Schema
*/
protected Schema getBootstrapSourceSchema(JavaSparkContext jsc,
List<Pair<String, List<HoodieFileStatus>>> partitions) {
return partitions.stream().flatMap(p -> p.getValue().stream())
.map(fs -> {
try {
Path filePath = FileStatusUtils.toPath(fs.getPath());
return ParquetUtils.readAvroSchema(jsc.hadoopConfiguration(), filePath);
} catch (Exception ex) {
return null;
}
}).filter(x -> x != null).findAny().get();
}
}
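
getBootstrapSourceSchema is the documented override point for custom schema selection. A hypothetical subclass that pins a fixed Avro schema instead of sampling a source Parquet file could look like the following (class name and schema are invented for illustration):

import java.util.List;

import org.apache.avro.Schema;
import org.apache.hudi.avro.model.HoodieFileStatus;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.spark.api.java.JavaSparkContext;

// Hypothetical provider: always returns a fixed Avro schema instead of reading one from Parquet.
public class StaticBootstrapSchemaProvider extends BootstrapSchemaProvider {

  private static final String SCHEMA_JSON =
      "{\"type\":\"record\",\"name\":\"stock_tick\",\"fields\":["
          + "{\"name\":\"symbol\",\"type\":\"string\"},{\"name\":\"ts\",\"type\":\"string\"}]}";

  public StaticBootstrapSchemaProvider(HoodieWriteConfig writeConfig) {
    super(writeConfig);
  }

  @Override
  protected Schema getBootstrapSourceSchema(JavaSparkContext jsc,
      List<Pair<String, List<HoodieFileStatus>>> partitions) {
    return new Schema.Parser().parse(SCHEMA_JSON);
  }
}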

View File

@@ -0,0 +1,48 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.client.bootstrap;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.model.BootstrapFileMapping;
import org.apache.hudi.common.model.HoodieWriteStat;
import org.apache.hudi.common.util.collection.Pair;
/**
* WriteStatus for Bootstrap.
*/
public class BootstrapWriteStatus extends WriteStatus {
private BootstrapFileMapping sourceFileMapping;
public BootstrapWriteStatus(Boolean trackSuccessRecords, Double failureFraction) {
super(trackSuccessRecords, failureFraction);
}
public BootstrapFileMapping getBootstrapSourceFileMapping() {
return sourceFileMapping;
}
public Pair<BootstrapFileMapping, HoodieWriteStat> getBootstrapSourceAndWriteStat() {
return Pair.of(getBootstrapSourceFileMapping(), getStat());
}
public void setBootstrapSourceFileMapping(BootstrapFileMapping sourceFileMapping) {
this.sourceFileMapping = sourceFileMapping;
}
}

View File

@@ -0,0 +1,58 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.client.bootstrap;
import java.io.Serializable;
import org.apache.hudi.avro.model.HoodieFileStatus;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import java.util.List;
/**
* Creates RDD of Hoodie Records with complete record data, given a list of partitions to be bootstrapped.
*/
public abstract class FullRecordBootstrapDataProvider implements Serializable {
protected static final Logger LOG = LogManager.getLogger(FullRecordBootstrapDataProvider.class);
protected final TypedProperties props;
protected final transient JavaSparkContext jsc;
public FullRecordBootstrapDataProvider(TypedProperties props, JavaSparkContext jsc) {
this.props = props;
this.jsc = jsc;
}
/**
* Generates a list of input partition and files and returns a RDD representing source.
* @param tableName Hudi Table Name
* @param sourceBasePath Source Base Path
* @param partitionPaths Partition Paths
* @return JavaRDD of input records
*/
public abstract JavaRDD<HoodieRecord> generateInputRecordRDD(String tableName,
String sourceBasePath, List<Pair<String, List<HoodieFileStatus>>> partitionPaths);
}
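
For FULL_RECORD bootstrapping an implementation has to materialize the source rows as HoodieRecords. The sketch below is purely hypothetical: the class name, the assumption of a "key" field in each record, and the plain AvroParquetReader-based file reading are illustrative choices, not the provider shipped with Hudi.

import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;

import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.avro.model.HoodieFileStatus;
import org.apache.hudi.client.bootstrap.BootstrapRecordPayload;
import org.apache.hudi.common.bootstrap.FileStatusUtils;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.parquet.avro.AvroParquetReader;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

// Hypothetical provider: reads each source parquet file with AvroParquetReader and keys records
// by an assumed "key" field. Paths and field names are illustrative only.
public class ParquetFullRecordBootstrapDataProvider extends FullRecordBootstrapDataProvider {

  public ParquetFullRecordBootstrapDataProvider(TypedProperties props, JavaSparkContext jsc) {
    super(props, jsc);
  }

  @Override
  public JavaRDD<HoodieRecord> generateInputRecordRDD(String tableName, String sourceBasePath,
      List<Pair<String, List<HoodieFileStatus>>> partitionPaths) {
    // Flatten (partition, file) pairs on the driver, then read each file on the executors.
    List<Pair<String, String>> partitionAndFile = partitionPaths.stream()
        .flatMap(p -> p.getValue().stream()
            .map(fs -> Pair.of(p.getKey(), FileStatusUtils.toPath(fs.getPath()).toString())))
        .collect(Collectors.toList());
    return jsc.parallelize(partitionAndFile, Math.max(1, partitionAndFile.size())).flatMap(pf -> {
      List<HoodieRecord> records = new ArrayList<>();
      try (ParquetReader<GenericRecord> reader =
          AvroParquetReader.<GenericRecord>builder(new Path(pf.getValue())).build()) {
        GenericRecord rec;
        while ((rec = reader.read()) != null) {
          String key = rec.get("key").toString(); // assumed record key field
          records.add(new HoodieRecord<>(new HoodieKey(key, pf.getKey()), new BootstrapRecordPayload(rec)));
        }
      }
      return records.iterator();
    });
  }
}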

View File

@@ -0,0 +1,48 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.client.bootstrap.selector;
import org.apache.hudi.avro.model.HoodieFileStatus;
import org.apache.hudi.client.bootstrap.BootstrapMode;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.config.HoodieWriteConfig;
import java.io.Serializable;
import java.util.List;
import java.util.Map;
/**
* Pluggable Partition Selector for selecting partitions to perform full or metadata-only bootstrapping.
*/
public abstract class BootstrapModeSelector implements Serializable {
protected final HoodieWriteConfig writeConfig;
public BootstrapModeSelector(HoodieWriteConfig writeConfig) {
this.writeConfig = writeConfig;
}
/**
* Classify partitions for the purpose of bootstrapping. For a non-partitioned source, input list will be one entry.
*
* @param partitions List of partitions with files present in each partitions
* @return a partitions grouped by bootstrap mode
*/
public abstract Map<BootstrapMode, List<String>> select(List<Pair<String, List<HoodieFileStatus>>> partitions);
}
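
A custom selector only needs to implement select(). As a hypothetical example (the threshold and class name are invented), the selector below fully rewrites small partitions and leaves larger ones metadata-only:

import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

import org.apache.hudi.avro.model.HoodieFileStatus;
import org.apache.hudi.client.bootstrap.BootstrapMode;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.config.HoodieWriteConfig;

// Hypothetical selector: FULL_RECORD for partitions with at most two source files, METADATA_ONLY otherwise.
public class SmallPartitionFullRecordSelector extends BootstrapModeSelector {

  public SmallPartitionFullRecordSelector(HoodieWriteConfig writeConfig) {
    super(writeConfig);
  }

  @Override
  public Map<BootstrapMode, List<String>> select(List<Pair<String, List<HoodieFileStatus>>> partitions) {
    return partitions.stream()
        .map(p -> Pair.of(p.getValue().size() <= 2 ? BootstrapMode.FULL_RECORD : BootstrapMode.METADATA_ONLY,
            p.getKey()))
        .collect(Collectors.groupingBy(Pair::getKey, Collectors.mapping(Pair::getValue, Collectors.toList())));
  }
}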

View File

@@ -0,0 +1,56 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.client.bootstrap.selector;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.apache.hudi.avro.model.HoodieFileStatus;
import org.apache.hudi.client.bootstrap.BootstrapMode;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
public class BootstrapRegexModeSelector extends BootstrapModeSelector {
private static final long serialVersionUID = 1L;
private static final Logger LOG = LogManager.getLogger(BootstrapRegexModeSelector.class);
private final Pattern pattern;
private final BootstrapMode bootstrapModeOnMatch;
private final BootstrapMode defaultMode;
public BootstrapRegexModeSelector(HoodieWriteConfig writeConfig) {
super(writeConfig);
this.pattern = Pattern.compile(writeConfig.getBootstrapModeSelectorRegex());
this.bootstrapModeOnMatch = writeConfig.getBootstrapModeForRegexMatch();
this.defaultMode = BootstrapMode.FULL_RECORD.equals(bootstrapModeOnMatch)
? BootstrapMode.METADATA_ONLY : BootstrapMode.FULL_RECORD;
LOG.info("Default Mode :" + defaultMode + ", on Match Mode :" + bootstrapModeOnMatch);
}
@Override
public Map<BootstrapMode, List<String>> select(List<Pair<String, List<HoodieFileStatus>>> partitions) {
return partitions.stream()
.map(p -> Pair.of(pattern.matcher(p.getKey()).matches() ? bootstrapModeOnMatch : defaultMode, p.getKey()))
.collect(Collectors.groupingBy(Pair::getKey, Collectors.mapping(Pair::getValue, Collectors.toList())));
}
}

View File

@@ -0,0 +1,29 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.client.bootstrap.selector;
import org.apache.hudi.client.bootstrap.BootstrapMode;
import org.apache.hudi.config.HoodieWriteConfig;
public class FullRecordBootstrapModeSelector extends UniformBootstrapModeSelector {
public FullRecordBootstrapModeSelector(HoodieWriteConfig bootstrapConfig) {
super(bootstrapConfig, BootstrapMode.FULL_RECORD);
}
}

View File

@@ -0,0 +1,29 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.client.bootstrap.selector;
import org.apache.hudi.client.bootstrap.BootstrapMode;
import org.apache.hudi.config.HoodieWriteConfig;
public class MetadataOnlyBootstrapModeSelector extends UniformBootstrapModeSelector {
public MetadataOnlyBootstrapModeSelector(HoodieWriteConfig bootstrapConfig) {
super(bootstrapConfig, BootstrapMode.METADATA_ONLY);
}
}

View File

@@ -0,0 +1,48 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.client.bootstrap.selector;
import org.apache.hudi.avro.model.HoodieFileStatus;
import org.apache.hudi.client.bootstrap.BootstrapMode;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.config.HoodieWriteConfig;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
/**
* A bootstrap selector which employs same bootstrap mode for all partitions.
*/
public abstract class UniformBootstrapModeSelector extends BootstrapModeSelector {
private final BootstrapMode bootstrapMode;
public UniformBootstrapModeSelector(HoodieWriteConfig bootstrapConfig, BootstrapMode bootstrapMode) {
super(bootstrapConfig);
this.bootstrapMode = bootstrapMode;
}
@Override
public Map<BootstrapMode, List<String>> select(List<Pair<String, List<HoodieFileStatus>>> partitions) {
return partitions.stream().map(p -> Pair.of(bootstrapMode, p))
.collect(Collectors.groupingBy(Pair::getKey, Collectors.mapping(x -> x.getValue().getKey(),
Collectors.toList())));
}
}

View File

@@ -0,0 +1,39 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.client.bootstrap.translator;
import java.io.Serializable;
import org.apache.hudi.common.config.TypedProperties;
public abstract class BootstrapPartitionPathTranslator implements Serializable {
private final TypedProperties properties;
public BootstrapPartitionPathTranslator(TypedProperties properties) {
this.properties = properties;
}
/**
* Given a bootstrap partition path, return the translated partition path.
*
* @param bootStrapPartitionPath bootstrap Partition Path
* @return Translated Path
*/
public abstract String getBootstrapTranslatedPath(String bootStrapPartitionPath);
}
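
This translator is the hook behind the HUDI-899 partition-path knob. As a hypothetical sketch (class name and target format are invented), a translator converting slash-separated date partitions into a Hive-style path could be:

import org.apache.hudi.common.config.TypedProperties;

// Hypothetical translator: "2018/08/31" in the source becomes "datestr=2018-08-31" in the Hudi table.
public class HiveStylePartitionPathTranslator extends BootstrapPartitionPathTranslator {

  public HiveStylePartitionPathTranslator(TypedProperties properties) {
    super(properties);
  }

  @Override
  public String getBootstrapTranslatedPath(String bootStrapPartitionPath) {
    return "datestr=" + bootStrapPartitionPath.replace('/', '-');
  }
}

It would be wired in through hoodie.bootstrap.partitionpath.translator.class (withBootstrapPartitionPathTranslatorClass on HoodieBootstrapConfig).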

View File

@@ -0,0 +1,36 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.client.bootstrap.translator;
import org.apache.hudi.common.config.TypedProperties;
/**
* Return same path as bootstrap partition path.
*/
public class IdentityBootstrapPartitionPathTranslator extends BootstrapPartitionPathTranslator {
public IdentityBootstrapPartitionPathTranslator(TypedProperties properties) {
super(properties);
}
@Override
public String getBootstrapTranslatedPath(String bootStrapPartitionPath) {
return bootStrapPartitionPath;
}
}

View File

@@ -0,0 +1,52 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.client.utils;
import java.util.Iterator;
import java.util.function.Function;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.common.util.ValidationUtils;
import org.apache.hudi.common.util.collection.Pair;
public class MergingIterator<T extends GenericRecord> implements Iterator<T> {
private final Iterator<T> leftIterator;
private final Iterator<T> rightIterator;
private final Function<Pair<T,T>, T> mergeFunction;
public MergingIterator(Iterator<T> leftIterator, Iterator<T> rightIterator, Function<Pair<T,T>, T> mergeFunction) {
this.leftIterator = leftIterator;
this.rightIterator = rightIterator;
this.mergeFunction = mergeFunction;
}
@Override
public boolean hasNext() {
boolean leftHasNext = leftIterator.hasNext();
boolean rightHasNext = rightIterator.hasNext();
ValidationUtils.checkArgument(leftHasNext == rightHasNext);
return leftHasNext;
}
@Override
public T next() {
return mergeFunction.apply(Pair.of(leftIterator.next(), rightIterator.next()));
}
}

View File

@@ -0,0 +1,135 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.config;
import org.apache.hudi.client.bootstrap.BootstrapMode;
import org.apache.hudi.client.bootstrap.selector.MetadataOnlyBootstrapModeSelector;
import org.apache.hudi.client.bootstrap.translator.IdentityBootstrapPartitionPathTranslator;
import org.apache.hudi.common.config.DefaultHoodieConfig;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.Properties;
/**
* Bootstrap specific configs.
*/
public class HoodieBootstrapConfig extends DefaultHoodieConfig {
public static final String BOOTSTRAP_BASE_PATH_PROP = "hoodie.bootstrap.base.path";
public static final String BOOTSTRAP_MODE_SELECTOR = "hoodie.bootstrap.mode.selector";
public static final String FULL_BOOTSTRAP_INPUT_PROVIDER = "hoodie.bootstrap.full.input.provider";
public static final String BOOTSTRAP_KEYGEN_CLASS = "hoodie.bootstrap.keygen.class";
public static final String BOOTSTRAP_PARTITION_PATH_TRANSLATOR_CLASS =
"hoodie.bootstrap.partitionpath.translator.class";
public static final String DEFAULT_BOOTSTRAP_PARTITION_PATH_TRANSLATOR_CLASS =
IdentityBootstrapPartitionPathTranslator.class.getName();
public static final String BOOTSTRAP_PARALLELISM = "hoodie.bootstrap.parallelism";
public static final String DEFAULT_BOOTSTRAP_PARALLELISM = "1500";
// Used By BootstrapRegexModeSelector class. When a partition path matches the regex, the corresponding
// mode will be used. Otherwise, the alternative mode will be used.
public static final String BOOTSTRAP_MODE_SELECTOR_REGEX = "hoodie.bootstrap.mode.selector.regex";
public static final String BOOTSTRAP_MODE_SELECTOR_REGEX_MODE = "hoodie.bootstrap.mode.selector.regex.mode";
public static final String DEFAULT_BOOTSTRAP_MODE_SELECTOR_REGEX = ".*";
public static final String DEFAULT_BOOTSTRAP_MODE_SELECTOR_REGEX_MODE = BootstrapMode.METADATA_ONLY.name();
public HoodieBootstrapConfig(Properties props) {
super(props);
}
public static Builder newBuilder() {
return new Builder();
}
public static class Builder {
private final Properties props = new Properties();
public Builder fromFile(File propertiesFile) throws IOException {
try (FileReader reader = new FileReader(propertiesFile)) {
this.props.load(reader);
return this;
}
}
public Builder withBootstrapBasePath(String basePath) {
props.setProperty(BOOTSTRAP_BASE_PATH_PROP, basePath);
return this;
}
public Builder withBootstrapModeSelector(String partitionSelectorClass) {
props.setProperty(BOOTSTRAP_MODE_SELECTOR, partitionSelectorClass);
return this;
}
public Builder withFullBootstrapInputProvider(String partitionSelectorClass) {
props.setProperty(FULL_BOOTSTRAP_INPUT_PROVIDER, partitionSelectorClass);
return this;
}
public Builder withBootstrapKeyGenClass(String keyGenClass) {
props.setProperty(BOOTSTRAP_KEYGEN_CLASS, keyGenClass);
return this;
}
public Builder withBootstrapPartitionPathTranslatorClass(String partitionPathTranslatorClass) {
props.setProperty(BOOTSTRAP_PARTITION_PATH_TRANSLATOR_CLASS, partitionPathTranslatorClass);
return this;
}
public Builder withBootstrapParallelism(int parallelism) {
props.setProperty(BOOTSTRAP_PARALLELISM, String.valueOf(parallelism));
return this;
}
public Builder withBootstrapModeSelectorRegex(String regex) {
props.setProperty(BOOTSTRAP_MODE_SELECTOR_REGEX, regex);
return this;
}
public Builder withBootstrapModeForRegexMatch(BootstrapMode modeForRegexMatch) {
props.setProperty(BOOTSTRAP_MODE_SELECTOR_REGEX_MODE, modeForRegexMatch.name());
return this;
}
public Builder fromProperties(Properties props) {
this.props.putAll(props);
return this;
}
public HoodieBootstrapConfig build() {
HoodieBootstrapConfig config = new HoodieBootstrapConfig(props);
setDefaultOnCondition(props, !props.containsKey(BOOTSTRAP_PARALLELISM), BOOTSTRAP_PARALLELISM,
DEFAULT_BOOTSTRAP_PARALLELISM);
setDefaultOnCondition(props, !props.containsKey(BOOTSTRAP_PARTITION_PATH_TRANSLATOR_CLASS),
BOOTSTRAP_PARTITION_PATH_TRANSLATOR_CLASS, DEFAULT_BOOTSTRAP_PARTITION_PATH_TRANSLATOR_CLASS);
setDefaultOnCondition(props, !props.containsKey(BOOTSTRAP_MODE_SELECTOR), BOOTSTRAP_MODE_SELECTOR,
MetadataOnlyBootstrapModeSelector.class.getCanonicalName());
setDefaultOnCondition(props, !props.containsKey(BOOTSTRAP_MODE_SELECTOR_REGEX), BOOTSTRAP_MODE_SELECTOR_REGEX,
DEFAULT_BOOTSTRAP_MODE_SELECTOR_REGEX);
setDefaultOnCondition(props, !props.containsKey(BOOTSTRAP_MODE_SELECTOR_REGEX_MODE),
BOOTSTRAP_MODE_SELECTOR_REGEX_MODE, DEFAULT_BOOTSTRAP_MODE_SELECTOR_REGEX_MODE);
BootstrapMode.valueOf(props.getProperty(BOOTSTRAP_MODE_SELECTOR_REGEX_MODE));
return config;
}
}
}
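
Putting the regex knobs above together: a hypothetical configuration that fully rewrites matching partitions while keeping all others metadata-only (regex and source path are illustrative values):

import org.apache.hudi.client.bootstrap.BootstrapMode;
import org.apache.hudi.client.bootstrap.selector.BootstrapRegexModeSelector;

// Partitions matching the regex get FULL_RECORD; everything else falls back to METADATA_ONLY.
HoodieBootstrapConfig bootstrapConfig = HoodieBootstrapConfig.newBuilder()
    .withBootstrapBasePath("/data/source_table")                     // placeholder source path
    .withBootstrapModeSelector(BootstrapRegexModeSelector.class.getName())
    .withBootstrapModeSelectorRegex("2020/0[78]/.*")
    .withBootstrapModeForRegexMatch(BootstrapMode.FULL_RECORD)
    .build();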

View File

@@ -20,6 +20,7 @@ package org.apache.hudi.config;
import org.apache.hudi.client.HoodieWriteClient;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.client.bootstrap.BootstrapMode;
import org.apache.hudi.common.config.DefaultHoodieConfig;
import org.apache.hudi.common.fs.ConsistencyGuardConfig;
import org.apache.hudi.common.model.HoodieCleaningPolicy;
@@ -129,6 +130,9 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
"_.hoodie.allow.multi.write.on.same.instant";
public static final String DEFAULT_ALLOW_MULTI_WRITE_ON_SAME_INSTANT = "false";
public static final String EXTERNAL_RECORD_AND_SCHEMA_TRANSFORMATION = AVRO_SCHEMA + ".externalTransformation";
public static final String DEFAULT_EXTERNAL_RECORD_AND_SCHEMA_TRANSFORMATION = "false";
private ConsistencyGuardConfig consistencyGuardConfig;
// Hoodie Write Client transparently rewrites File System View config when embedded mode is enabled
@@ -136,7 +140,7 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
private final FileSystemViewStorageConfig clientSpecifiedViewStorageConfig;
private FileSystemViewStorageConfig viewStorageConfig;
private HoodieWriteConfig(Properties props) {
protected HoodieWriteConfig(Properties props) {
super(props);
Properties newProps = new Properties();
newProps.putAll(props);
@@ -180,6 +184,10 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
return Boolean.parseBoolean(props.getProperty(HOODIE_ASSUME_DATE_PARTITIONING_PROP));
}
public boolean shouldUseExternalSchemaTransformation() {
return Boolean.parseBoolean(props.getProperty(EXTERNAL_RECORD_AND_SCHEMA_TRANSFORMATION));
}
public Integer getTimelineLayoutVersion() {
return Integer.parseInt(props.getProperty(TIMELINE_LAYOUT_VERSION));
}
@@ -675,13 +683,46 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
return props.getProperty(HoodieWriteCommitCallbackConfig.CALLBACK_CLASS_PROP);
}
public String getBootstrapSourceBasePath() {
return props.getProperty(HoodieBootstrapConfig.BOOTSTRAP_BASE_PATH_PROP);
}
public String getBootstrapModeSelectorClass() {
return props.getProperty(HoodieBootstrapConfig.BOOTSTRAP_MODE_SELECTOR);
}
public String getFullBootstrapInputProvider() {
return props.getProperty(HoodieBootstrapConfig.FULL_BOOTSTRAP_INPUT_PROVIDER);
}
public String getBootstrapKeyGeneratorClass() {
return props.getProperty(HoodieBootstrapConfig.BOOTSTRAP_KEYGEN_CLASS);
}
public String getBootstrapModeSelectorRegex() {
return props.getProperty(HoodieBootstrapConfig.BOOTSTRAP_MODE_SELECTOR_REGEX);
}
public BootstrapMode getBootstrapModeForRegexMatch() {
return BootstrapMode.valueOf(props.getProperty(HoodieBootstrapConfig.BOOTSTRAP_MODE_SELECTOR_REGEX_MODE));
}
public String getBootstrapPartitionPathTranslatorClass() {
return props.getProperty(HoodieBootstrapConfig.BOOTSTRAP_PARTITION_PATH_TRANSLATOR_CLASS);
}
public int getBootstrapParallelism() {
return Integer.parseInt(props.getProperty(HoodieBootstrapConfig.BOOTSTRAP_PARALLELISM));
}
public static class Builder {
private final Properties props = new Properties();
protected final Properties props = new Properties();
private boolean isIndexConfigSet = false;
private boolean isStorageConfigSet = false;
private boolean isCompactionConfigSet = false;
private boolean isMetricsConfigSet = false;
private boolean isBootstrapConfigSet = false;
private boolean isMemoryConfigSet = false;
private boolean isViewConfigSet = false;
private boolean isConsistencyGuardSet = false;
@@ -805,6 +846,12 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
return this;
}
public Builder withBootstrapConfig(HoodieBootstrapConfig bootstrapConfig) {
props.putAll(bootstrapConfig.getProps());
isBootstrapConfigSet = true;
return this;
}
public Builder withAutoCommit(boolean autoCommit) {
props.setProperty(HOODIE_AUTO_COMMIT_PROP, String.valueOf(autoCommit));
return this;
@@ -863,7 +910,17 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
return this;
}
public HoodieWriteConfig build() {
public Builder withExternalSchemaTrasformation(boolean enabled) {
props.setProperty(EXTERNAL_RECORD_AND_SCHEMA_TRANSFORMATION, String.valueOf(enabled));
return this;
}
public Builder withProperties(Properties properties) {
this.props.putAll(properties);
return this;
}
protected void setDefaults() {
// Check for mandatory properties
setDefaultOnCondition(props, !props.containsKey(INSERT_PARALLELISM), INSERT_PARALLELISM, DEFAULT_PARALLELISM);
setDefaultOnCondition(props, !props.containsKey(BULKINSERT_PARALLELISM), BULKINSERT_PARALLELISM,
@@ -916,6 +973,8 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
setDefaultOnCondition(props, !isCompactionConfigSet,
HoodieCompactionConfig.newBuilder().fromProperties(props).build());
setDefaultOnCondition(props, !isMetricsConfigSet, HoodieMetricsConfig.newBuilder().fromProperties(props).build());
setDefaultOnCondition(props, !isBootstrapConfigSet,
HoodieBootstrapConfig.newBuilder().fromProperties(props).build());
setDefaultOnCondition(props, !isMemoryConfigSet, HoodieMemoryConfig.newBuilder().fromProperties(props).build());
setDefaultOnCondition(props, !isViewConfigSet,
FileSystemViewStorageConfig.newBuilder().fromProperties(props).build());
@@ -924,15 +983,24 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
setDefaultOnCondition(props, !isCallbackConfigSet,
HoodieWriteCommitCallbackConfig.newBuilder().fromProperties(props).build());
setDefaultOnCondition(props, !props.containsKey(EXTERNAL_RECORD_AND_SCHEMA_TRANSFORMATION),
EXTERNAL_RECORD_AND_SCHEMA_TRANSFORMATION, DEFAULT_EXTERNAL_RECORD_AND_SCHEMA_TRANSFORMATION);
setDefaultOnCondition(props, !props.containsKey(TIMELINE_LAYOUT_VERSION), TIMELINE_LAYOUT_VERSION,
String.valueOf(TimelineLayoutVersion.CURR_VERSION));
}
private void validate() {
String layoutVersion = props.getProperty(TIMELINE_LAYOUT_VERSION);
// Ensure Layout Version is good
new TimelineLayoutVersion(Integer.parseInt(layoutVersion));
Objects.requireNonNull(props.getProperty(BASE_PATH_PROP));
}
public HoodieWriteConfig build() {
setDefaults();
validate();
// Build WriteConfig at the end
HoodieWriteConfig config = new HoodieWriteConfig(props);
Objects.requireNonNull(config.getBasePath());
return config;
}
}

View File

@@ -165,7 +165,7 @@ public class HoodieAppendHandle<T extends HoodieRecordPayload> extends HoodieWri
private Option<IndexedRecord> getIndexedRecord(HoodieRecord<T> hoodieRecord) {
Option recordMetadata = hoodieRecord.getData().getMetadata();
try {
Option<IndexedRecord> avroRecord = hoodieRecord.getData().getInsertValue(originalSchema);
Option<IndexedRecord> avroRecord = hoodieRecord.getData().getInsertValue(writerSchema);
if (avroRecord.isPresent()) {
// Convert GenericRecord to GenericRecord with hoodie commit metadata in schema
avroRecord = Option.of(rewriteRecord((GenericRecord) avroRecord.get()));
@@ -212,7 +212,7 @@ public class HoodieAppendHandle<T extends HoodieRecordPayload> extends HoodieWri
private void doAppend(Map<HeaderMetadataType, String> header) {
try {
header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, instantTime);
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, writerSchema.toString());
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, writerSchemaWithMetafields.toString());
if (recordList.size() > 0) {
writer = writer.appendBlock(HoodieDataBlock.getBlock(hoodieTable.getLogDataBlockFormat(), recordList, header));
recordList.clear();

View File

@@ -0,0 +1,49 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.io;
import org.apache.hudi.avro.HoodieAvroUtils;
import org.apache.hudi.client.SparkTaskContextSupplier;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.table.HoodieTable;
/**
* This class is essentially same as Create Handle but overrides two things
* 1) Schema : Metadata bootstrap writes only metadata fields as part of write. So, setup the writer schema accordingly.
* 2) canWrite is overridden to always return true so that skeleton file and bootstrap file is aligned and we don't end up
* writing more than 1 skeleton file for the same bootstrap file.
* @param <T> HoodieRecordPayload
*/
public class HoodieBootstrapHandle<T extends HoodieRecordPayload> extends HoodieCreateHandle<T> {
public HoodieBootstrapHandle(HoodieWriteConfig config, String commitTime, HoodieTable<T> hoodieTable,
String partitionPath, String fileId, SparkTaskContextSupplier sparkTaskContextSupplier) {
super(config, commitTime, hoodieTable, partitionPath, fileId,
Pair.of(HoodieAvroUtils.RECORD_KEY_SCHEMA,
HoodieAvroUtils.addMetadataFields(HoodieAvroUtils.RECORD_KEY_SCHEMA)), sparkTaskContextSupplier);
}
@Override
public boolean canWrite(HoodieRecord record) {
return true;
}
}
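To make the "metadata fields only" write concrete, here is a small standalone sketch (plain Avro; the schema below is a stand-in for HoodieAvroUtils.RECORD_KEY_SCHEMA, and the class/main are illustrative, not part of this commit) of the skeleton record a bootstrap handle ends up persisting, carrying nothing but the record key while the original columns stay in the bootstrap source file.

import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;

public class SkeletonRecordSketch {
  public static void main(String[] args) {
    // Stand-in for HoodieAvroUtils.RECORD_KEY_SCHEMA: only the record key meta column.
    Schema recordKeyOnly = SchemaBuilder.record("HoodieRecordKey").fields()
        .requiredString("_hoodie_record_key")
        .endRecord();
    GenericRecord skeleton = new GenericData.Record(recordKeyOnly);
    skeleton.put("_hoodie_record_key", "uuid-0001");
    System.out.println(skeleton); // {"_hoodie_record_key": "uuid-0001"} -- all a skeleton file carries
  }
}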

View File

@@ -18,6 +18,7 @@
package org.apache.hudi.io;
import org.apache.avro.Schema;
import org.apache.hudi.client.SparkTaskContextSupplier;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.fs.FSUtils;
@@ -28,6 +29,7 @@ import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.common.model.HoodieWriteStat;
import org.apache.hudi.common.model.HoodieWriteStat.RuntimeStats;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieInsertException;
import org.apache.hudi.io.storage.HoodieFileWriter;
@@ -56,8 +58,16 @@ public class HoodieCreateHandle<T extends HoodieRecordPayload> extends HoodieWri
private boolean useWriterSchema = false;
public HoodieCreateHandle(HoodieWriteConfig config, String instantTime, HoodieTable<T> hoodieTable,
String partitionPath, String fileId, SparkTaskContextSupplier sparkTaskContextSupplier) {
super(config, instantTime, partitionPath, fileId, hoodieTable, sparkTaskContextSupplier);
String partitionPath, String fileId, SparkTaskContextSupplier sparkTaskContextSupplier) {
this(config, instantTime, hoodieTable, partitionPath, fileId, getWriterSchemaIncludingAndExcludingMetadataPair(config),
sparkTaskContextSupplier);
}
public HoodieCreateHandle(HoodieWriteConfig config, String instantTime, HoodieTable<T> hoodieTable,
String partitionPath, String fileId, Pair<Schema, Schema> writerSchemaIncludingAndExcludingMetadataPair,
SparkTaskContextSupplier sparkTaskContextSupplier) {
super(config, instantTime, partitionPath, fileId, hoodieTable, writerSchemaIncludingAndExcludingMetadataPair,
sparkTaskContextSupplier);
writeStatus.setFileId(fileId);
writeStatus.setPartitionPath(partitionPath);
@@ -68,8 +78,7 @@ public class HoodieCreateHandle<T extends HoodieRecordPayload> extends HoodieWri
new Path(config.getBasePath()), FSUtils.getPartitionPath(config.getBasePath(), partitionPath));
partitionMetadata.trySave(getPartitionId());
createMarkerFile(partitionPath, FSUtils.makeDataFileName(this.instantTime, this.writeToken, this.fileId, hoodieTable.getBaseFileExtension()));
this.fileWriter =
HoodieFileWriterFactory.getFileWriter(instantTime, path, hoodieTable, config, writerSchema, this.sparkTaskContextSupplier);
this.fileWriter = HoodieFileWriterFactory.getFileWriter(instantTime, path, hoodieTable, config, writerSchemaWithMetafields, this.sparkTaskContextSupplier);
} catch (IOException e) {
throw new HoodieInsertException("Failed to initialize HoodieStorageWriter for path " + path, e);
}
@@ -132,9 +141,9 @@ public class HoodieCreateHandle<T extends HoodieRecordPayload> extends HoodieWri
while (recordIterator.hasNext()) {
HoodieRecord<T> record = recordIterator.next();
if (useWriterSchema) {
write(record, record.getData().getInsertValue(writerSchema));
write(record, record.getData().getInsertValue(writerSchemaWithMetafields));
} else {
write(record, record.getData().getInsertValue(originalSchema));
write(record, record.getData().getInsertValue(writerSchema));
}
}
} catch (IOException io) {

View File

@@ -67,6 +67,7 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieWrit
private long updatedRecordsWritten = 0;
private long insertRecordsWritten = 0;
private boolean useWriterSchema;
private HoodieBaseFile baseFileToMerge;
public HoodieMergeHandle(HoodieWriteConfig config, String instantTime, HoodieTable<T> hoodieTable,
Iterator<HoodieRecord<T>> recordItr, String partitionPath, String fileId, SparkTaskContextSupplier sparkTaskContextSupplier) {
@@ -88,6 +89,10 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieWrit
}
@Override
public Schema getWriterSchemaWithMetafields() {
return writerSchemaWithMetafields;
}
public Schema getWriterSchema() {
return writerSchema;
}
@@ -95,12 +100,13 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieWrit
/**
* Extract old file path, initialize StorageWriter and WriteStatus.
*/
private void init(String fileId, String partitionPath, HoodieBaseFile dataFileToBeMerged) {
private void init(String fileId, String partitionPath, HoodieBaseFile baseFileToMerge) {
LOG.info("partitionPath:" + partitionPath + ", fileId to be merged:" + fileId);
this.baseFileToMerge = baseFileToMerge;
this.writtenRecordKeys = new HashSet<>();
writeStatus.setStat(new HoodieWriteStat());
try {
String latestValidFilePath = dataFileToBeMerged.getFileName();
String latestValidFilePath = baseFileToMerge.getFileName();
writeStatus.getStat().setPrevCommit(FSUtils.getCommitTime(latestValidFilePath));
HoodiePartitionMetadata partitionMetadata = new HoodiePartitionMetadata(fs, instantTime,
@@ -126,8 +132,7 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieWrit
createMarkerFile(partitionPath, newFileName);
// Create the writer for writing the new version file
fileWriter = createNewFileWriter(instantTime, newFilePath, hoodieTable, config, writerSchema, sparkTaskContextSupplier);
fileWriter = createNewFileWriter(instantTime, newFilePath, hoodieTable, config, writerSchemaWithMetafields, sparkTaskContextSupplier);
} catch (IOException io) {
LOG.error("Error in update task at commit " + instantTime, io);
writeStatus.setGlobalError(io);
@@ -145,7 +150,7 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieWrit
long memoryForMerge = SparkConfigUtils.getMaxMemoryPerPartitionMerge(config.getProps());
LOG.info("MaxMemoryPerPartitionMerge => " + memoryForMerge);
this.keyToNewRecords = new ExternalSpillableMap<>(memoryForMerge, config.getSpillableMapBasePath(),
new DefaultSizeEstimator(), new HoodieRecordSizeEstimator(originalSchema));
new DefaultSizeEstimator(), new HoodieRecordSizeEstimator(writerSchema));
} catch (IOException io) {
throw new HoodieIOException("Cannot instantiate an ExternalSpillableMap", io);
}
@@ -216,7 +221,7 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieWrit
HoodieRecord<T> hoodieRecord = new HoodieRecord<>(keyToNewRecords.get(key));
try {
Option<IndexedRecord> combinedAvroRecord =
hoodieRecord.getData().combineAndGetUpdateValue(oldRecord, useWriterSchema ? writerSchema : originalSchema);
hoodieRecord.getData().combineAndGetUpdateValue(oldRecord, useWriterSchema ? writerSchemaWithMetafields : writerSchema);
if (writeUpdateRecord(hoodieRecord, combinedAvroRecord)) {
/*
* ONLY WHEN 1) we have an update for this key AND 2) We are able to successfully write the combined new
@@ -241,7 +246,7 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieWrit
fileWriter.writeAvro(key, oldRecord);
} catch (ClassCastException e) {
LOG.error("Schema mismatch when rewriting old record " + oldRecord + " from file " + getOldFilePath()
+ " to file " + newFilePath + " with writerSchema " + writerSchema.toString(true));
+ " to file " + newFilePath + " with writerSchema " + writerSchemaWithMetafields.toString(true));
throw new HoodieUpsertException(errMsg, e);
} catch (IOException e) {
LOG.error("Failed to merge old record into new file for key " + key + " from old file " + getOldFilePath()
@@ -262,9 +267,9 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieWrit
HoodieRecord<T> hoodieRecord = newRecordsItr.next();
if (!writtenRecordKeys.contains(hoodieRecord.getRecordKey())) {
if (useWriterSchema) {
writeRecord(hoodieRecord, hoodieRecord.getData().getInsertValue(writerSchema));
writeRecord(hoodieRecord, hoodieRecord.getData().getInsertValue(writerSchemaWithMetafields));
} else {
writeRecord(hoodieRecord, hoodieRecord.getData().getInsertValue(originalSchema));
writeRecord(hoodieRecord, hoodieRecord.getData().getInsertValue(writerSchema));
}
insertRecordsWritten++;
}
@@ -312,4 +317,8 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieWrit
public IOType getIOType() {
return IOType.MERGE;
}
public HoodieBaseFile baseFileForMerge() {
return baseFileToMerge;
}
}

View File

@@ -27,6 +27,7 @@ import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.common.util.HoodieTimer;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.ReflectionUtils;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.hudi.io.storage.HoodieFileWriter;
@@ -51,8 +52,8 @@ public abstract class HoodieWriteHandle<T extends HoodieRecordPayload> extends H
private static final Logger LOG = LogManager.getLogger(HoodieWriteHandle.class);
protected final Schema originalSchema;
protected final Schema writerSchema;
protected final Schema writerSchemaWithMetafields;
protected HoodieTimer timer;
protected final WriteStatus writeStatus;
protected final String partitionPath;
@@ -62,11 +63,18 @@ public abstract class HoodieWriteHandle<T extends HoodieRecordPayload> extends H
public HoodieWriteHandle(HoodieWriteConfig config, String instantTime, String partitionPath,
String fileId, HoodieTable<T> hoodieTable, SparkTaskContextSupplier sparkTaskContextSupplier) {
this(config, instantTime, partitionPath, fileId, hoodieTable,
getWriterSchemaIncludingAndExcludingMetadataPair(config), sparkTaskContextSupplier);
}
protected HoodieWriteHandle(HoodieWriteConfig config, String instantTime, String partitionPath, String fileId,
HoodieTable<T> hoodieTable, Pair<Schema, Schema> writerSchemaIncludingAndExcludingMetadataPair,
SparkTaskContextSupplier sparkTaskContextSupplier) {
super(config, instantTime, hoodieTable);
this.partitionPath = partitionPath;
this.fileId = fileId;
this.originalSchema = new Schema.Parser().parse(config.getSchema());
this.writerSchema = HoodieAvroUtils.createHoodieWriteSchema(originalSchema);
this.writerSchema = writerSchemaIncludingAndExcludingMetadataPair.getKey();
this.writerSchemaWithMetafields = writerSchemaIncludingAndExcludingMetadataPair.getValue();
this.timer = new HoodieTimer().startTimer();
this.writeStatus = (WriteStatus) ReflectionUtils.loadClass(config.getWriteStatusClassName(),
!hoodieTable.getIndex().isImplicitWithStorage(), config.getWriteStatusFailureFraction());
@@ -74,6 +82,19 @@ public abstract class HoodieWriteHandle<T extends HoodieRecordPayload> extends H
this.writeToken = makeWriteToken();
}
/**
* Returns a pair of writer schemas:
* (a) the writer schema supplied by the client, and
* (b) the same schema with the hoodie metadata fields added.
* @param config Write Config
* @return pair of (writer schema, writer schema with metadata fields)
*/
protected static Pair<Schema, Schema> getWriterSchemaIncludingAndExcludingMetadataPair(HoodieWriteConfig config) {
Schema originalSchema = new Schema.Parser().parse(config.getSchema());
Schema hoodieSchema = HoodieAvroUtils.addMetadataFields(originalSchema);
return Pair.of(originalSchema, hoodieSchema);
}
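As a rough illustration of what the returned pair holds, the toy schemas below show (a) a client writer schema and (b) the same columns with the five Hoodie metadata fields prepended, which is what addMetadataFields() hands back. The "trip"/"uuid"/"fare" names are made up for the sketch.

import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;

public class WriterSchemaPairSketch {
  public static void main(String[] args) {
    // (a) Writer schema as supplied by the client.
    Schema writer = SchemaBuilder.record("trip").fields()
        .requiredString("uuid")
        .requiredDouble("fare")
        .endRecord();
    // (b) Same columns with the Hoodie metadata fields prepended.
    Schema writerWithMetafields = SchemaBuilder.record("trip").fields()
        .optionalString("_hoodie_commit_time")
        .optionalString("_hoodie_commit_seqno")
        .optionalString("_hoodie_record_key")
        .optionalString("_hoodie_partition_path")
        .optionalString("_hoodie_file_name")
        .requiredString("uuid")
        .requiredDouble("fare")
        .endRecord();
    System.out.println(writer.getFields().size());               // 2
    System.out.println(writerWithMetafields.getFields().size()); // 7
  }
}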
/**
* Generate a write token based on the currently running spark task and its place in the spark dag.
*/
@@ -103,8 +124,8 @@ public abstract class HoodieWriteHandle<T extends HoodieRecordPayload> extends H
markerFiles.create(partitionPath, dataFileName, getIOType());
}
public Schema getWriterSchema() {
return writerSchema;
public Schema getWriterSchemaWithMetafields() {
return writerSchemaWithMetafields;
}
/**
@@ -142,7 +163,7 @@ public abstract class HoodieWriteHandle<T extends HoodieRecordPayload> extends H
* Rewrite the GenericRecord with the Schema containing the Hoodie Metadata fields.
*/
protected GenericRecord rewriteRecord(GenericRecord record) {
return HoodieAvroUtils.rewriteRecord(record, writerSchema);
return HoodieAvroUtils.rewriteRecord(record, writerSchemaWithMetafields);
}
public abstract WriteStatus close();

View File

@@ -0,0 +1,88 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.keygen;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.exception.HoodieKeyException;
import org.apache.avro.generic.GenericRecord;
import java.util.List;
import java.util.stream.Collectors;
/**
* Base class for all the built-in key generators. Contains methods structured for
* code reuse amongst them.
*/
public abstract class BuiltinKeyGenerator extends KeyGenerator {
protected BuiltinKeyGenerator(TypedProperties config) {
super(config);
}
/**
* Generate a record Key out of provided generic record.
*/
public abstract String getRecordKey(GenericRecord record);
/**
* Generate a partition path out of provided generic record.
*/
public abstract String getPartitionPath(GenericRecord record);
/**
* Generate a Hoodie Key out of provided generic record.
*/
public final HoodieKey getKey(GenericRecord record) {
if (getRecordKeyFields() == null || getPartitionPathFields() == null) {
throw new HoodieKeyException("Unable to find field names for record key or partition path in cfg");
}
return new HoodieKey(getRecordKey(record), getPartitionPath(record));
}
/**
* Return the fields that constitute the record key. Used by metadata bootstrap.
* A base implementation is provided in order to avoid forcing custom KeyGenerator implementations
* to implement this method.
* @return list of record key fields
*/
public List<String> getRecordKeyFields() {
throw new IllegalStateException("This method is expected to be overridden by subclasses");
}
/**
* Return the fields that constitute the partition path. Used by metadata bootstrap.
* A base implementation is provided in order to avoid forcing custom KeyGenerator implementations
* to implement this method.
* @return list of partition path fields
*/
public List<String> getPartitionPathFields() {
throw new IllegalStateException("This method is expected to be overridden by subclasses");
}
@Override
public final List<String> getRecordKeyFieldNames() {
// For nested columns, pick top level column name
return getRecordKeyFields().stream().map(k -> {
int idx = k.indexOf('.');
return idx > 0 ? k.substring(0, idx) : k;
}).collect(Collectors.toList());
}
}
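For example, the projection above turns a nested key field into its top-level column so bootstrap can request it from the source file; a tiny standalone run with made-up field names:

import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

public class TopLevelFieldSketch {
  public static void main(String[] args) {
    List<String> recordKeyFields = Arrays.asList("driver.id", "timestamp");
    // Same transform as getRecordKeyFieldNames(): keep only the top-level column name.
    List<String> topLevel = recordKeyFields.stream().map(k -> {
      int idx = k.indexOf('.');
      return idx > 0 ? k.substring(0, idx) : k;
    }).collect(Collectors.toList());
    System.out.println(topLevel); // [driver, timestamp]
  }
}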

View File

@@ -0,0 +1,110 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.keygen;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.List;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.avro.HoodieAvroUtils;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.exception.HoodieKeyException;
public class KeyGenUtils {
protected static final String NULL_RECORDKEY_PLACEHOLDER = "__null__";
protected static final String EMPTY_RECORDKEY_PLACEHOLDER = "__empty__";
protected static final String DEFAULT_PARTITION_PATH = "default";
protected static final String DEFAULT_PARTITION_PATH_SEPARATOR = "/";
public static String getRecordKey(GenericRecord record, List<String> recordKeyFields) {
boolean keyIsNullEmpty = true;
StringBuilder recordKey = new StringBuilder();
for (String recordKeyField : recordKeyFields) {
String recordKeyValue = HoodieAvroUtils.getNestedFieldValAsString(record, recordKeyField, true);
if (recordKeyValue == null) {
recordKey.append(recordKeyField + ":" + NULL_RECORDKEY_PLACEHOLDER + ",");
} else if (recordKeyValue.isEmpty()) {
recordKey.append(recordKeyField + ":" + EMPTY_RECORDKEY_PLACEHOLDER + ",");
} else {
recordKey.append(recordKeyField + ":" + recordKeyValue + ",");
keyIsNullEmpty = false;
}
}
recordKey.deleteCharAt(recordKey.length() - 1);
if (keyIsNullEmpty) {
throw new HoodieKeyException("recordKey values: \"" + recordKey + "\" for fields: "
+ recordKeyFields.toString() + " cannot be entirely null or empty.");
}
return recordKey.toString();
}
public static String getRecordPartitionPath(GenericRecord record, List<String> partitionPathFields,
boolean hiveStylePartitioning, boolean encodePartitionPath) {
StringBuilder partitionPath = new StringBuilder();
for (String partitionPathField : partitionPathFields) {
String fieldVal = HoodieAvroUtils.getNestedFieldValAsString(record, partitionPathField, true);
if (fieldVal == null || fieldVal.isEmpty()) {
partitionPath.append(hiveStylePartitioning ? partitionPathField + "=" + DEFAULT_PARTITION_PATH
: DEFAULT_PARTITION_PATH);
} else {
if (encodePartitionPath) {
try {
fieldVal = URLEncoder.encode(fieldVal, StandardCharsets.UTF_8.toString());
} catch (UnsupportedEncodingException uoe) {
throw new HoodieException(uoe.getMessage(), uoe);
}
}
partitionPath.append(hiveStylePartitioning ? partitionPathField + "=" + fieldVal : fieldVal);
}
partitionPath.append(DEFAULT_PARTITION_PATH_SEPARATOR);
}
partitionPath.deleteCharAt(partitionPath.length() - 1);
return partitionPath.toString();
}
public static String getRecordKey(GenericRecord record, String recordKeyField) {
String recordKey = HoodieAvroUtils.getNestedFieldValAsString(record, recordKeyField, true);
if (recordKey == null || recordKey.isEmpty()) {
throw new HoodieKeyException("recordKey value: \"" + recordKey + "\" for field: \"" + recordKeyField + "\" cannot be null or empty.");
}
return recordKey;
}
public static String getPartitionPath(GenericRecord record, String partitionPathField,
boolean hiveStylePartitioning, boolean encodePartitionPath) {
String partitionPath = HoodieAvroUtils.getNestedFieldValAsString(record, partitionPathField, true);
if (partitionPath == null || partitionPath.isEmpty()) {
partitionPath = DEFAULT_PARTITION_PATH;
}
if (encodePartitionPath) {
try {
partitionPath = URLEncoder.encode(partitionPath, StandardCharsets.UTF_8.toString());
} catch (UnsupportedEncodingException uoe) {
throw new HoodieException(uoe.getMessage(), uoe);
}
}
if (hiveStylePartitioning) {
partitionPath = partitionPathField + "=" + partitionPath;
}
return partitionPath;
}
}
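A standalone sketch (toy values, plain strings instead of a GenericRecord) of the formats these helpers produce: composite record keys become comma-separated field:value pairs with __null__/__empty__ placeholders, and partition paths optionally get hive-style field=value prefixes and URL encoding.

import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.LinkedHashMap;
import java.util.Map;

public class KeyGenFormatSketch {
  public static void main(String[] args) throws Exception {
    // Composite record key: field:value pairs joined by commas, with placeholders for missing values.
    Map<String, String> keyValues = new LinkedHashMap<>();
    keyValues.put("driver", "driver-213");
    keyValues.put("rider", "");   // becomes __empty__
    keyValues.put("tip", null);   // becomes __null__
    StringBuilder recordKey = new StringBuilder();
    for (Map.Entry<String, String> e : keyValues.entrySet()) {
      String v = e.getValue() == null ? "__null__" : e.getValue().isEmpty() ? "__empty__" : e.getValue();
      recordKey.append(e.getKey()).append(':').append(v).append(',');
    }
    recordKey.deleteCharAt(recordKey.length() - 1);
    System.out.println(recordKey); // driver:driver-213,rider:__empty__,tip:__null__

    // Hive-style, URL-encoded partition path for a single partition field.
    String field = "datestr";
    String value = URLEncoder.encode("2020/08/01", StandardCharsets.UTF_8.toString());
    System.out.println(field + "=" + value); // datestr=2020%2F08%2F01
  }
}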

View File

@@ -24,6 +24,7 @@ import org.apache.hudi.common.model.HoodieKey;
import org.apache.avro.generic.GenericRecord;
import java.io.Serializable;
import java.util.List;
/**
* Abstract class to extend for plugging in extraction of {@link HoodieKey} from an Avro record.
@@ -40,4 +41,14 @@ public abstract class KeyGenerator implements Serializable {
* Generate a Hoodie Key out of provided generic record.
*/
public abstract HoodieKey getKey(GenericRecord record);
/**
* Used during bootstrap to project out only the record key fields from the bootstrap source dataset.
*
* @return list of field names, when concatenated make up the record key.
*/
public List<String> getRecordKeyFieldNames() {
throw new UnsupportedOperationException("Bootstrap not supported for key generator. "
+ "Please override this method in your custom key generator.");
}
}
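Because the base implementation throws, a custom key generator has to override getRecordKeyFieldNames() before metadata bootstrap can project the key columns out of the source files. A hypothetical minimal subclass (class and field names invented for illustration) might look like:

import java.util.Collections;
import java.util.List;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.keygen.KeyGenerator;

public class OrderIdKeyGenerator extends KeyGenerator {

  public OrderIdKeyGenerator(TypedProperties config) {
    super(config);
  }

  @Override
  public HoodieKey getKey(GenericRecord record) {
    return new HoodieKey(String.valueOf(record.get("order_id")),
        String.valueOf(record.get("order_date")));
  }

  @Override
  public List<String> getRecordKeyFieldNames() {
    // Lets bootstrap read just this column from the source parquet files.
    return Collections.singletonList("order_id");
  }
}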

View File

@@ -18,9 +18,6 @@
package org.apache.hudi.table;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.avro.model.HoodieCleanMetadata;
import org.apache.hudi.avro.model.HoodieCompactionPlan;
import org.apache.hudi.avro.model.HoodieRestoreMetadata;
@@ -33,30 +30,32 @@ import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.queue.BoundedInMemoryExecutor;
import org.apache.hudi.common.util.queue.BoundedInMemoryQueueConsumer;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.exception.HoodieNotSupportedException;
import org.apache.hudi.exception.HoodieUpsertException;
import org.apache.hudi.execution.SparkBoundedInMemoryExecutor;
import org.apache.hudi.io.HoodieCreateHandle;
import org.apache.hudi.io.HoodieMergeHandle;
import org.apache.hudi.io.storage.HoodieFileReader;
import org.apache.hudi.io.storage.HoodieFileReaderFactory;
import org.apache.hudi.table.action.clean.CleanActionExecutor;
import org.apache.hudi.table.action.HoodieWriteMetadata;
import org.apache.hudi.table.action.bootstrap.BootstrapCommitActionExecutor;
import org.apache.hudi.table.action.bootstrap.HoodieBootstrapWriteMetadata;
import org.apache.hudi.table.action.clean.CleanActionExecutor;
import org.apache.hudi.table.action.commit.BulkInsertCommitActionExecutor;
import org.apache.hudi.table.action.commit.BulkInsertPreppedCommitActionExecutor;
import org.apache.hudi.table.action.commit.DeleteCommitActionExecutor;
import org.apache.hudi.table.action.commit.InsertCommitActionExecutor;
import org.apache.hudi.table.action.commit.InsertPreppedCommitActionExecutor;
import org.apache.hudi.table.action.commit.MergeHelper;
import org.apache.hudi.table.action.commit.UpsertCommitActionExecutor;
import org.apache.hudi.table.action.commit.UpsertPreppedCommitActionExecutor;
import org.apache.hudi.table.action.restore.CopyOnWriteRestoreActionExecutor;
import org.apache.hudi.table.action.rollback.CopyOnWriteRollbackActionExecutor;
import org.apache.hudi.table.action.savepoint.SavepointActionExecutor;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.api.java.JavaRDD;
@@ -97,7 +96,7 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
@Override
public HoodieWriteMetadata bulkInsert(JavaSparkContext jsc, String instantTime, JavaRDD<HoodieRecord<T>> records,
Option<BulkInsertPartitioner> userDefinedBulkInsertPartitioner) {
return new BulkInsertCommitActionExecutor<>(jsc, config,
return new BulkInsertCommitActionExecutor(jsc, config,
this, instantTime, records, userDefinedBulkInsertPartitioner).execute();
}
@@ -121,7 +120,7 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
@Override
public HoodieWriteMetadata bulkInsertPrepped(JavaSparkContext jsc, String instantTime,
JavaRDD<HoodieRecord<T>> preppedRecords, Option<BulkInsertPartitioner> userDefinedBulkInsertPartitioner) {
return new BulkInsertPreppedCommitActionExecutor<>(jsc, config,
return new BulkInsertPreppedCommitActionExecutor(jsc, config,
this, instantTime, preppedRecords, userDefinedBulkInsertPartitioner).execute();
}
@@ -135,6 +134,16 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
throw new HoodieNotSupportedException("Compaction is not supported on a CopyOnWrite table");
}
@Override
public HoodieBootstrapWriteMetadata bootstrap(JavaSparkContext jsc, Option<Map<String, String>> extraMetadata) {
return new BootstrapCommitActionExecutor(jsc, config, this, extraMetadata).execute();
}
@Override
public void rollbackBootstrap(JavaSparkContext jsc, String instantTime) {
new CopyOnWriteRestoreActionExecutor(jsc, config, this, instantTime, HoodieTimeline.INIT_INSTANT_TS).execute();
}
public Iterator<List<WriteStatus>> handleUpdate(String instantTime, String partitionPath, String fileId,
Map<String, HoodieRecord<T>> keyToNewRecords, HoodieBaseFile oldDataFile) throws IOException {
// these are updates
@@ -148,25 +157,10 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
throw new HoodieUpsertException(
"Error in finding the old file path at commit " + instantTime + " for fileId: " + fileId);
} else {
BoundedInMemoryExecutor<GenericRecord, GenericRecord, Void> wrapper = null;
HoodieFileReader<IndexedRecord> storageReader =
HoodieFileReaderFactory.getFileReader(getHadoopConf(), upsertHandle.getOldFilePath());
try {
wrapper =
new SparkBoundedInMemoryExecutor(config, storageReader.getRecordIterator(upsertHandle.getWriterSchema()),
new UpdateHandler(upsertHandle), x -> x);
wrapper.execute();
} catch (Exception e) {
throw new HoodieException(e);
} finally {
upsertHandle.close();
if (null != wrapper) {
wrapper.shutdownNow();
}
}
MergeHelper.runMerge(this, upsertHandle);
}
// TODO(vc): This needs to be revisited
if (upsertHandle.getWriteStatus().getPartitionPath() == null) {
LOG.info("Upsert Handle has partition path as null " + upsertHandle.getOldFilePath() + ", "

View File

@@ -28,10 +28,14 @@ import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.common.model.HoodieWriteStat;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.hudi.table.action.HoodieWriteMetadata;
import org.apache.hudi.table.action.bootstrap.BootstrapDeltaCommitActionExecutor;
import org.apache.hudi.table.action.bootstrap.HoodieBootstrapWriteMetadata;
import org.apache.hudi.table.action.compact.RunCompactionActionExecutor;
import org.apache.hudi.table.action.deltacommit.BulkInsertDeltaCommitActionExecutor;
import org.apache.hudi.table.action.deltacommit.BulkInsertPreppedDeltaCommitActionExecutor;
@@ -84,7 +88,7 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends Hoodi
@Override
public HoodieWriteMetadata bulkInsert(JavaSparkContext jsc, String instantTime, JavaRDD<HoodieRecord<T>> records,
Option<BulkInsertPartitioner> userDefinedBulkInsertPartitioner) {
return new BulkInsertDeltaCommitActionExecutor<>(jsc, config,
return new BulkInsertDeltaCommitActionExecutor(jsc, config,
this, instantTime, records, userDefinedBulkInsertPartitioner).execute();
}
@@ -108,7 +112,7 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends Hoodi
@Override
public HoodieWriteMetadata bulkInsertPrepped(JavaSparkContext jsc, String instantTime,
JavaRDD<HoodieRecord<T>> preppedRecords, Option<BulkInsertPartitioner> userDefinedBulkInsertPartitioner) {
return new BulkInsertPreppedDeltaCommitActionExecutor<>(jsc, config,
return new BulkInsertPreppedDeltaCommitActionExecutor(jsc, config,
this, instantTime, preppedRecords, userDefinedBulkInsertPartitioner).execute();
}
@@ -125,6 +129,16 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends Hoodi
return compactionExecutor.execute();
}
@Override
public HoodieBootstrapWriteMetadata bootstrap(JavaSparkContext jsc, Option<Map<String, String>> extraMetadata) {
return new BootstrapDeltaCommitActionExecutor(jsc, config, this, extraMetadata).execute();
}
@Override
public void rollbackBootstrap(JavaSparkContext jsc, String instantTime) {
new MergeOnReadRestoreActionExecutor(jsc, config, this, instantTime, HoodieTimeline.INIT_INSTANT_TS).execute();
}
@Override
public HoodieRollbackMetadata rollback(JavaSparkContext jsc,
String rollbackInstantTime,

View File

@@ -60,6 +60,7 @@ import org.apache.hudi.exception.HoodieInsertException;
import org.apache.hudi.exception.HoodieUpsertException;
import org.apache.hudi.index.HoodieIndex;
import org.apache.hudi.table.action.HoodieWriteMetadata;
import org.apache.hudi.table.action.bootstrap.HoodieBootstrapWriteMetadata;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.api.java.JavaRDD;
@@ -331,6 +332,20 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
public abstract HoodieWriteMetadata compact(JavaSparkContext jsc,
String compactionInstantTime);
/**
* Perform metadata/full bootstrap of a Hudi table.
* @param jsc JavaSparkContext
* @param extraMetadata Additional Metadata for storing in commit file.
* @return HoodieBootstrapWriteMetadata
*/
public abstract HoodieBootstrapWriteMetadata bootstrap(JavaSparkContext jsc, Option<Map<String, String>> extraMetadata);
/**
* Perform rollback of bootstrap of a Hudi table.
* @param jsc JavaSparkContext
* @param instantTime Instant time of the bootstrap commit to roll back
*/
public abstract void rollbackBootstrap(JavaSparkContext jsc, String instantTime);
/**
* Executes a new clean action.
*

View File

@@ -0,0 +1,356 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.table.action.bootstrap;
import org.apache.hudi.avro.HoodieAvroUtils;
import org.apache.hudi.avro.model.HoodieFileStatus;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.client.bootstrap.BootstrapMode;
import org.apache.hudi.client.bootstrap.BootstrapRecordPayload;
import org.apache.hudi.client.bootstrap.BootstrapSchemaProvider;
import org.apache.hudi.client.bootstrap.BootstrapWriteStatus;
import org.apache.hudi.client.bootstrap.FullRecordBootstrapDataProvider;
import org.apache.hudi.client.bootstrap.selector.BootstrapModeSelector;
import org.apache.hudi.client.bootstrap.translator.BootstrapPartitionPathTranslator;
import org.apache.hudi.common.bootstrap.FileStatusUtils;
import org.apache.hudi.common.bootstrap.index.BootstrapIndex;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.BootstrapFileMapping;
import org.apache.hudi.common.model.HoodieFileFormat;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.common.model.HoodieWriteStat;
import org.apache.hudi.common.model.WriteOperationType;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.timeline.HoodieInstant.State;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.ParquetReaderIterator;
import org.apache.hudi.common.util.ReflectionUtils;
import org.apache.hudi.common.util.ValidationUtils;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.common.util.queue.BoundedInMemoryExecutor;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.hudi.execution.SparkBoundedInMemoryExecutor;
import org.apache.hudi.io.HoodieBootstrapHandle;
import org.apache.hudi.keygen.KeyGenerator;
import org.apache.hudi.table.HoodieTable;
import org.apache.hudi.table.WorkloadProfile;
import org.apache.hudi.table.action.HoodieWriteMetadata;
import org.apache.hudi.table.action.commit.BaseCommitActionExecutor;
import org.apache.hudi.table.action.commit.BulkInsertCommitActionExecutor;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.table.action.commit.CommitActionExecutor;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.parquet.avro.AvroParquetReader;
import org.apache.parquet.avro.AvroReadSupport;
import org.apache.parquet.avro.AvroSchemaConverter;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.schema.MessageType;
import org.apache.spark.Partitioner;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import java.io.IOException;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
public class BootstrapCommitActionExecutor<T extends HoodieRecordPayload<T>>
extends BaseCommitActionExecutor<T, HoodieBootstrapWriteMetadata> {
private static final Logger LOG = LogManager.getLogger(BootstrapCommitActionExecutor.class);
protected String bootstrapSchema = null;
private transient FileSystem bootstrapSourceFileSystem;
public BootstrapCommitActionExecutor(JavaSparkContext jsc, HoodieWriteConfig config, HoodieTable<?> table,
Option<Map<String, String>> extraMetadata) {
super(jsc, new HoodieWriteConfig.Builder().withProps(config.getProps())
.withAutoCommit(true).withWriteStatusClass(BootstrapWriteStatus.class)
.withBulkInsertParallelism(config.getBootstrapParallelism())
.build(), table, HoodieTimeline.METADATA_BOOTSTRAP_INSTANT_TS, WriteOperationType.BOOTSTRAP,
extraMetadata);
bootstrapSourceFileSystem = FSUtils.getFs(config.getBootstrapSourceBasePath(), hadoopConf);
}
private void validate() {
ValidationUtils.checkArgument(config.getBootstrapSourceBasePath() != null,
"Ensure Bootstrap Source Path is set");
ValidationUtils.checkArgument(config.getBootstrapModeSelectorClass() != null,
"Ensure Bootstrap Partition Selector is set");
ValidationUtils.checkArgument(config.getBootstrapKeyGeneratorClass() != null,
"Ensure bootstrap key generator class is set");
}
@Override
public HoodieBootstrapWriteMetadata execute() {
validate();
try {
HoodieTableMetaClient metaClient = table.getMetaClient();
Option<HoodieInstant> completedInstant =
metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants().lastInstant();
ValidationUtils.checkArgument(!completedInstant.isPresent(),
"Active Timeline is expected to be empty for bootstrap to be performed. "
+ "If you want to re-bootstrap, please rollback bootstrap first !!");
Map<BootstrapMode, List<Pair<String, List<HoodieFileStatus>>>> partitionSelections = listAndProcessSourcePartitions();
// First run metadata bootstrap which will auto commit
Option<HoodieWriteMetadata> metadataResult = metadataBootstrap(partitionSelections.get(BootstrapMode.METADATA_ONLY));
// If there is a full-record bootstrap to be performed, run that too
Option<HoodieWriteMetadata> fullBootstrapResult = fullBootstrap(partitionSelections.get(BootstrapMode.FULL_RECORD));
return new HoodieBootstrapWriteMetadata(metadataResult, fullBootstrapResult);
} catch (IOException ioe) {
throw new HoodieIOException(ioe.getMessage(), ioe);
}
}
protected String getSchemaToStoreInCommit() {
return bootstrapSchema;
}
/**
* Perform Metadata Bootstrap.
* @param partitionFilesList List of partitions and the files within those partitions
*/
protected Option<HoodieWriteMetadata> metadataBootstrap(List<Pair<String, List<HoodieFileStatus>>> partitionFilesList) {
if (null == partitionFilesList || partitionFilesList.isEmpty()) {
return Option.empty();
}
HoodieTableMetaClient metaClient = table.getMetaClient();
metaClient.getActiveTimeline().createNewInstant(
new HoodieInstant(State.REQUESTED, metaClient.getCommitActionType(),
HoodieTimeline.METADATA_BOOTSTRAP_INSTANT_TS));
table.getActiveTimeline().transitionRequestedToInflight(new HoodieInstant(State.REQUESTED,
metaClient.getCommitActionType(), HoodieTimeline.METADATA_BOOTSTRAP_INSTANT_TS), Option.empty());
JavaRDD<BootstrapWriteStatus> bootstrapWriteStatuses = runMetadataBootstrap(partitionFilesList);
HoodieWriteMetadata result = new HoodieWriteMetadata();
updateIndexAndCommitIfNeeded(bootstrapWriteStatuses.map(w -> w), result);
return Option.of(result);
}
@Override
protected void commit(Option<Map<String, String>> extraMetadata, HoodieWriteMetadata result) {
// Perform the bootstrap index write and then commit. Make sure both the record-key writes and the
// bootstrap index are done in a single job DAG.
Map<String, List<Pair<BootstrapFileMapping, HoodieWriteStat>>> bootstrapSourceAndStats =
result.getWriteStatuses().collect().stream()
.map(w -> {
BootstrapWriteStatus ws = (BootstrapWriteStatus) w;
return Pair.of(ws.getBootstrapSourceFileMapping(), ws.getStat());
}).collect(Collectors.groupingBy(w -> w.getKey().getPartitionPath()));
HoodieTableMetaClient metaClient = table.getMetaClient();
try (BootstrapIndex.IndexWriter indexWriter = BootstrapIndex.getBootstrapIndex(metaClient)
.createWriter(metaClient.getTableConfig().getBootstrapBasePath().get())) {
LOG.info("Starting to write bootstrap index for source " + config.getBootstrapSourceBasePath() + " in table "
+ config.getBasePath());
indexWriter.begin();
bootstrapSourceAndStats.forEach((key, value) -> indexWriter.appendNextPartition(key,
value.stream().map(Pair::getKey).collect(Collectors.toList())));
indexWriter.finish();
LOG.info("Finished writing bootstrap index for source " + config.getBootstrapSourceBasePath() + " in table "
+ config.getBasePath());
}
super.commit(extraMetadata, result, bootstrapSourceAndStats.values().stream()
.flatMap(f -> f.stream().map(Pair::getValue)).collect(Collectors.toList()));
LOG.info("Committing metadata bootstrap !!");
}
/**
* Perform Full Record Bootstrap.
* @param partitionFilesList List of partitions and the files within those partitions
*/
protected Option<HoodieWriteMetadata> fullBootstrap(List<Pair<String, List<HoodieFileStatus>>> partitionFilesList) {
if (null == partitionFilesList || partitionFilesList.isEmpty()) {
return Option.empty();
}
TypedProperties properties = new TypedProperties();
properties.putAll(config.getProps());
FullRecordBootstrapDataProvider inputProvider =
(FullRecordBootstrapDataProvider) ReflectionUtils.loadClass(config.getFullBootstrapInputProvider(),
properties, jsc);
JavaRDD<HoodieRecord> inputRecordsRDD =
inputProvider.generateInputRecordRDD("bootstrap_source", config.getBootstrapSourceBasePath(),
partitionFilesList);
// Start Full Bootstrap
final HoodieInstant requested = new HoodieInstant(State.REQUESTED, table.getMetaClient().getCommitActionType(),
HoodieTimeline.FULL_BOOTSTRAP_INSTANT_TS);
table.getActiveTimeline().createNewInstant(requested);
// Setup correct schema and run bulk insert.
return Option.of(getBulkInsertActionExecutor(inputRecordsRDD).execute());
}
protected CommitActionExecutor<T> getBulkInsertActionExecutor(JavaRDD<HoodieRecord> inputRecordsRDD) {
return new BulkInsertCommitActionExecutor(jsc, new HoodieWriteConfig.Builder().withProps(config.getProps())
.withSchema(bootstrapSchema).build(), table, HoodieTimeline.FULL_BOOTSTRAP_INSTANT_TS,
inputRecordsRDD, extraMetadata);
}
private BootstrapWriteStatus handleMetadataBootstrap(String srcPartitionPath, String partitionPath,
HoodieFileStatus srcFileStatus, KeyGenerator keyGenerator) {
Path sourceFilePath = FileStatusUtils.toPath(srcFileStatus.getPath());
HoodieBootstrapHandle bootstrapHandle = new HoodieBootstrapHandle(config, HoodieTimeline.METADATA_BOOTSTRAP_INSTANT_TS,
table, partitionPath, FSUtils.createNewFileIdPfx(), table.getSparkTaskContextSupplier());
Schema avroSchema = null;
try {
ParquetMetadata readFooter = ParquetFileReader.readFooter(table.getHadoopConf(), sourceFilePath,
ParquetMetadataConverter.NO_FILTER);
MessageType parquetSchema = readFooter.getFileMetaData().getSchema();
avroSchema = new AvroSchemaConverter().convert(parquetSchema);
Schema recordKeySchema = HoodieAvroUtils.generateProjectionSchema(avroSchema,
keyGenerator.getRecordKeyFieldNames());
LOG.info("Schema to be used for reading record Keys :" + recordKeySchema);
AvroReadSupport.setAvroReadSchema(table.getHadoopConf(), recordKeySchema);
AvroReadSupport.setRequestedProjection(table.getHadoopConf(), recordKeySchema);
BoundedInMemoryExecutor<GenericRecord, HoodieRecord, Void> wrapper = null;
try (ParquetReader<IndexedRecord> reader =
AvroParquetReader.<IndexedRecord>builder(sourceFilePath).withConf(table.getHadoopConf()).build()) {
wrapper = new SparkBoundedInMemoryExecutor<GenericRecord, HoodieRecord, Void>(config,
new ParquetReaderIterator(reader), new BootstrapRecordConsumer(bootstrapHandle), inp -> {
String recKey = keyGenerator.getKey(inp).getRecordKey();
GenericRecord gr = new GenericData.Record(HoodieAvroUtils.RECORD_KEY_SCHEMA);
gr.put(HoodieRecord.RECORD_KEY_METADATA_FIELD, recKey);
BootstrapRecordPayload payload = new BootstrapRecordPayload(gr);
HoodieRecord rec = new HoodieRecord(new HoodieKey(recKey, partitionPath), payload);
return rec;
});
wrapper.execute();
} catch (Exception e) {
throw new HoodieException(e);
} finally {
bootstrapHandle.close();
if (null != wrapper) {
wrapper.shutdownNow();
}
}
} catch (IOException e) {
throw new HoodieIOException(e.getMessage(), e);
}
BootstrapWriteStatus writeStatus = (BootstrapWriteStatus)bootstrapHandle.getWriteStatus();
BootstrapFileMapping bootstrapFileMapping = new BootstrapFileMapping(
config.getBootstrapSourceBasePath(), srcPartitionPath, partitionPath,
srcFileStatus, writeStatus.getFileId());
writeStatus.setBootstrapSourceFileMapping(bootstrapFileMapping);
return writeStatus;
}
/**
* Return bootstrap mode selections for the listed partitions and determine the bootstrap schema.
* @return map from bootstrap mode to the (partition path, file statuses) pairs selected for it
* @throws IOException
*/
private Map<BootstrapMode, List<Pair<String, List<HoodieFileStatus>>>> listAndProcessSourcePartitions() throws IOException {
List<Pair<String, List<HoodieFileStatus>>> folders =
BootstrapUtils.getAllLeafFoldersWithFiles(bootstrapSourceFileSystem,
config.getBootstrapSourceBasePath(), path -> {
// TODO: Needs to be abstracted out when supporting different formats
// TODO: Remove hoodieFilter
return path.getName().endsWith(HoodieFileFormat.PARQUET.getFileExtension());
});
LOG.info("Fetching Bootstrap Schema !!");
BootstrapSchemaProvider sourceSchemaProvider = new BootstrapSchemaProvider(config);
bootstrapSchema = sourceSchemaProvider.getBootstrapSchema(jsc, folders).toString();
LOG.info("Bootstrap Schema :" + bootstrapSchema);
BootstrapModeSelector selector =
(BootstrapModeSelector) ReflectionUtils.loadClass(config.getBootstrapModeSelectorClass(), config);
Map<BootstrapMode, List<String>> result = selector.select(folders);
Map<String, List<HoodieFileStatus>> partitionToFiles = folders.stream().collect(
Collectors.toMap(Pair::getKey, Pair::getValue));
// Ensure all partitions are accounted for
ValidationUtils.checkArgument(partitionToFiles.keySet().equals(
result.values().stream().flatMap(Collection::stream).collect(Collectors.toSet())));
return result.entrySet().stream().map(e -> Pair.of(e.getKey(), e.getValue().stream()
.map(p -> Pair.of(p, partitionToFiles.get(p))).collect(Collectors.toList())))
.collect(Collectors.toMap(Pair::getKey, Pair::getValue));
}
private JavaRDD<BootstrapWriteStatus> runMetadataBootstrap(List<Pair<String, List<HoodieFileStatus>>> partitions) {
if (null == partitions || partitions.isEmpty()) {
return jsc.emptyRDD();
}
TypedProperties properties = new TypedProperties();
properties.putAll(config.getProps());
KeyGenerator keyGenerator = (KeyGenerator) ReflectionUtils.loadClass(config.getBootstrapKeyGeneratorClass(),
properties);
BootstrapPartitionPathTranslator translator = (BootstrapPartitionPathTranslator) ReflectionUtils.loadClass(
config.getBootstrapPartitionPathTranslatorClass(), properties);
List<Pair<String, Pair<String, HoodieFileStatus>>> bootstrapPaths = partitions.stream()
.flatMap(p -> {
String translatedPartitionPath = translator.getBootstrapTranslatedPath(p.getKey());
return p.getValue().stream().map(f -> Pair.of(p.getKey(), Pair.of(translatedPartitionPath, f)));
})
.collect(Collectors.toList());
return jsc.parallelize(bootstrapPaths, config.getBootstrapParallelism())
.map(partitionFsPair -> handleMetadataBootstrap(partitionFsPair.getLeft(), partitionFsPair.getRight().getLeft(),
partitionFsPair.getRight().getRight(), keyGenerator));
}
//TODO: Once we decouple commit protocol, we should change the class hierarchy to avoid doing this.
@Override
protected Partitioner getUpsertPartitioner(WorkloadProfile profile) {
throw new UnsupportedOperationException("Should not called in bootstrap code path");
}
@Override
protected Partitioner getInsertPartitioner(WorkloadProfile profile) {
throw new UnsupportedOperationException("Should not called in bootstrap code path");
}
@Override
protected Iterator<List<WriteStatus>> handleInsert(String idPfx, Iterator<HoodieRecord<T>> recordItr) {
throw new UnsupportedOperationException("Should not called in bootstrap code path");
}
@Override
protected Iterator<List<WriteStatus>> handleUpdate(String partitionPath, String fileId, Iterator<HoodieRecord<T>> recordItr) {
throw new UnsupportedOperationException("Should not called in bootstrap code path");
}
}
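The heart of handleMetadataBootstrap() above is reading only the record-key columns out of the source parquet file by registering an Avro projection before opening the reader. A minimal sketch of just that projection step, assuming a single key column named "uuid" (the real code derives the projection from the parquet footer schema and keyGenerator.getRecordKeyFieldNames() via HoodieAvroUtils.generateProjectionSchema):

import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;
import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.avro.AvroReadSupport;

public class RecordKeyProjectionSketch {
  public static void main(String[] args) {
    // Only the record key column(s) named by the key generator; everything else is skipped on read.
    Schema recordKeySchema = SchemaBuilder.record("projection").fields()
        .requiredString("uuid")
        .endRecord();
    Configuration conf = new Configuration();
    AvroReadSupport.setAvroReadSchema(conf, recordKeySchema);
    AvroReadSupport.setRequestedProjection(conf, recordKeySchema);
    // An AvroParquetReader built with this conf materializes just the "uuid" column,
    // which is all the skeleton file needs.
    System.out.println(conf.get("parquet.avro.projection"));
  }
}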

View File

@@ -0,0 +1,47 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.table.action.bootstrap;
import java.util.Map;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.table.HoodieTable;
import org.apache.hudi.table.action.commit.CommitActionExecutor;
import org.apache.hudi.table.action.deltacommit.BulkInsertDeltaCommitActionExecutor;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
public class BootstrapDeltaCommitActionExecutor<T extends HoodieRecordPayload<T>>
extends BootstrapCommitActionExecutor<T> {
public BootstrapDeltaCommitActionExecutor(JavaSparkContext jsc,
HoodieWriteConfig config, HoodieTable<?> table,
Option<Map<String, String>> extraMetadata) {
super(jsc, config, table, extraMetadata);
}
protected CommitActionExecutor<T> getBulkInsertActionExecutor(JavaRDD<HoodieRecord> inputRecordsRDD) {
return new BulkInsertDeltaCommitActionExecutor(jsc, new HoodieWriteConfig.Builder().withProps(config.getProps())
.withSchema(bootstrapSchema).build(), table, HoodieTimeline.FULL_BOOTSTRAP_INSTANT_TS,
inputRecordsRDD, extraMetadata);
}
}

View File

@@ -0,0 +1,55 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.table.action.bootstrap;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.util.queue.BoundedInMemoryQueueConsumer;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.hudi.io.HoodieBootstrapHandle;
import java.io.IOException;
/**
* Consumer that dequeues records from the queue and sends them to the bootstrap handle for writing.
*/
public class BootstrapRecordConsumer extends BoundedInMemoryQueueConsumer<HoodieRecord, Void> {
private final HoodieBootstrapHandle bootstrapHandle;
public BootstrapRecordConsumer(HoodieBootstrapHandle bootstrapHandle) {
this.bootstrapHandle = bootstrapHandle;
}
@Override
protected void consumeOneRecord(HoodieRecord record) {
try {
bootstrapHandle.write(record, record.getData().getInsertValue(bootstrapHandle.getWriterSchemaWithMetafields()));
} catch (IOException e) {
throw new HoodieIOException(e.getMessage(), e);
}
}
@Override
protected void finish() {}
@Override
protected Void getResult() {
return null;
}
}
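The consumer above sits on the draining side of a bounded in-memory queue: a producer iterates the source parquet records while this class writes each transformed record through the bootstrap handle. A rough, Hudi-free sketch of that pattern (JDK BlockingQueue standing in for Hudi's BoundedInMemoryExecutor; the names and EOF sentinel are invented):

import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;

public class BoundedQueueSketch {
  public static void main(String[] args) throws InterruptedException {
    BlockingQueue<String> queue = new ArrayBlockingQueue<>(16); // bounded buffer caps memory use
    Thread producer = new Thread(() -> {
      try {
        for (int i = 0; i < 100; i++) {
          queue.put("record-" + i); // blocks when the buffer is full
        }
        queue.put("EOF");
      } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
      }
    });
    producer.start();
    String rec;
    while (!"EOF".equals(rec = queue.take())) {
      // In Hudi this is where BootstrapRecordConsumer calls bootstrapHandle.write(...).
      System.out.println("writing " + rec);
    }
    producer.join();
  }
}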

View File

@@ -0,0 +1,77 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.table.action.bootstrap;
import org.apache.hudi.avro.model.HoodieFileStatus;
import org.apache.hudi.common.bootstrap.FileStatusUtils;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.OptionalInt;
import java.util.stream.Collectors;
public class BootstrapUtils {
/**
* Returns leaf folders with files under a path.
* @param fs File System
* @param basePathStr Base Path to look for leaf folders
* @param filePathFilter Filters to skip directories/paths
* @return list of partition paths with files under them.
* @throws IOException
*/
public static List<Pair<String, List<HoodieFileStatus>>> getAllLeafFoldersWithFiles(FileSystem fs, String basePathStr,
PathFilter filePathFilter) throws IOException {
final Path basePath = new Path(basePathStr);
final Map<Integer, List<String>> levelToPartitions = new HashMap<>();
final Map<String, List<HoodieFileStatus>> partitionToFiles = new HashMap<>();
FSUtils.processFiles(fs, basePathStr, (status) -> {
if (status.isFile() && filePathFilter.accept(status.getPath())) {
String relativePath = FSUtils.getRelativePartitionPath(basePath, status.getPath().getParent());
List<HoodieFileStatus> statusList = partitionToFiles.get(relativePath);
if (null == statusList) {
Integer level = (int) relativePath.chars().filter(ch -> ch == '/').count();
List<String> dirs = levelToPartitions.get(level);
if (null == dirs) {
dirs = new ArrayList<>();
levelToPartitions.put(level, dirs);
}
dirs.add(relativePath);
statusList = new ArrayList<>();
partitionToFiles.put(relativePath, statusList);
}
statusList.add(FileStatusUtils.fromFileStatus(status));
}
return true;
}, true);
OptionalInt maxLevelOpt = levelToPartitions.keySet().stream().mapToInt(x -> x).max();
int maxLevel = maxLevelOpt.orElse(-1);
return maxLevel >= 0 ? levelToPartitions.get(maxLevel).stream()
.map(d -> Pair.of(d, partitionToFiles.get(d))).collect(Collectors.toList()) : new ArrayList<>();
}
}
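A standalone sketch of the leaf-folder selection rule above, applied to plain relative paths: files are grouped by their partition directory, directories are bucketed by depth (number of '/'), and only the deepest level is treated as the partition folders. The path strings are invented for illustration.

import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

public class LeafFolderSketch {
  public static void main(String[] args) {
    List<String> files = Arrays.asList(
        "2020/08/01/part-0001.parquet",
        "2020/08/02/part-0002.parquet",
        "2020/_meta.parquet"); // shallower folder, dropped
    Map<String, List<String>> partitionToFiles = files.stream()
        .collect(Collectors.groupingBy(f -> f.substring(0, f.lastIndexOf('/'))));
    Map<Long, List<String>> levelToPartitions = partitionToFiles.keySet().stream()
        .collect(Collectors.groupingBy(p -> p.chars().filter(ch -> ch == '/').count()));
    long maxLevel = levelToPartitions.keySet().stream().mapToLong(x -> x).max().orElse(-1);
    System.out.println(levelToPartitions.get(maxLevel)); // [2020/08/01, 2020/08/02] (order may vary)
  }
}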

View File

@@ -0,0 +1,45 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.table.action.bootstrap;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.table.action.HoodieWriteMetadata;
/**
* Write Result for a bootstrap operation.
*/
public class HoodieBootstrapWriteMetadata {
private final Option<HoodieWriteMetadata> metadataBootstrapResult;
private final Option<HoodieWriteMetadata> fullBootstrapResult;
public HoodieBootstrapWriteMetadata(Option<HoodieWriteMetadata> metadataBootstrapResult,
Option<HoodieWriteMetadata> fullBootstrapResult) {
this.metadataBootstrapResult = metadataBootstrapResult;
this.fullBootstrapResult = fullBootstrapResult;
}
public Option<HoodieWriteMetadata> getMetadataBootstrapResult() {
return metadataBootstrapResult;
}
public Option<HoodieWriteMetadata> getFullBootstrapResult() {
return fullBootstrapResult;
}
}

View File

@@ -18,6 +18,13 @@
package org.apache.hudi.table.action.commit;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.time.Duration;
import java.time.Instant;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.hudi.client.SparkTaskContextSupplier;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.client.utils.SparkConfigUtils;
@@ -38,43 +45,31 @@ import org.apache.hudi.table.HoodieTable;
import org.apache.hudi.table.WorkloadProfile;
import org.apache.hudi.table.WorkloadStat;
import org.apache.hudi.table.action.BaseActionExecutor;
import org.apache.hudi.table.action.HoodieWriteMetadata;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.Partitioner;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.storage.StorageLevel;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.time.Duration;
import java.time.Instant;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import scala.Tuple2;
public abstract class BaseCommitActionExecutor<T extends HoodieRecordPayload<T>>
extends BaseActionExecutor<HoodieWriteMetadata> {
public abstract class BaseCommitActionExecutor<T extends HoodieRecordPayload<T>, R>
extends BaseActionExecutor<R> {
private static final Logger LOG = LogManager.getLogger(BaseCommitActionExecutor.class);
protected final Option<Map<String, String>> extraMetadata;
private final WriteOperationType operationType;
protected final SparkTaskContextSupplier sparkTaskContextSupplier = new SparkTaskContextSupplier();
public BaseCommitActionExecutor(JavaSparkContext jsc, HoodieWriteConfig config,
HoodieTable table, String instantTime, WriteOperationType operationType) {
this(jsc, config, table, instantTime, operationType, null);
}
public BaseCommitActionExecutor(JavaSparkContext jsc, HoodieWriteConfig config,
HoodieTable table, String instantTime, WriteOperationType operationType,
JavaRDD<HoodieRecord<T>> inputRecordsRDD) {
Option<Map<String, String>> extraMetadata) {
super(jsc, config, table, instantTime);
this.operationType = operationType;
this.extraMetadata = extraMetadata;
}
public HoodieWriteMetadata execute(JavaRDD<HoodieRecord<T>> inputRecordsRDD) {
@@ -173,13 +168,17 @@ public abstract class BaseCommitActionExecutor<T extends HoodieRecordPayload<T>>
protected void commitOnAutoCommit(HoodieWriteMetadata result) {
if (config.shouldAutoCommit()) {
LOG.info("Auto commit enabled: Committing " + instantTime);
commit(Option.empty(), result);
commit(extraMetadata, result);
} else {
LOG.info("Auto commit disabled for " + instantTime);
}
}
private void commit(Option<Map<String, String>> extraMetadata, HoodieWriteMetadata result) {
protected void commit(Option<Map<String, String>> extraMetadata, HoodieWriteMetadata result) {
commit(extraMetadata, result, result.getWriteStatuses().map(WriteStatus::getStat).collect());
}
protected void commit(Option<Map<String, String>> extraMetadata, HoodieWriteMetadata result, List<HoodieWriteStat> stats) {
String actionType = table.getMetaClient().getCommitActionType();
LOG.info("Committing " + instantTime + ", action Type " + actionType);
// Create a Hoodie table which encapsulated the commits and files visible
@@ -189,7 +188,6 @@ public abstract class BaseCommitActionExecutor<T extends HoodieRecordPayload<T>>
HoodieCommitMetadata metadata = new HoodieCommitMetadata();
result.setCommitted(true);
List<HoodieWriteStat> stats = result.getWriteStatuses().map(WriteStatus::getStat).collect();
stats.forEach(stat -> metadata.addWriteStat(stat.getPartitionPath(), stat));
result.setWriteStats(stats);
@@ -200,7 +198,7 @@ public abstract class BaseCommitActionExecutor<T extends HoodieRecordPayload<T>>
if (extraMetadata.isPresent()) {
extraMetadata.get().forEach(metadata::addMetadata);
}
metadata.addMetadata(HoodieCommitMetadata.SCHEMA_KEY, config.getSchema());
metadata.addMetadata(HoodieCommitMetadata.SCHEMA_KEY, getSchemaToStoreInCommit());
metadata.setOperationType(operationType);
try {
@@ -229,6 +227,13 @@ public abstract class BaseCommitActionExecutor<T extends HoodieRecordPayload<T>>
}
}
/**
* By default, return the writer schema in Write Config for storing in commit.
*/
protected String getSchemaToStoreInCommit() {
return config.getSchema();
}
protected boolean isWorkloadProfileNeeded() {
return true;
}
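For illustration only, a subclass can override the getSchemaToStoreInCommit hook above to persist a schema other than the one in the write config; the executor name and schema source below are hypothetical:
// Hypothetical subclass, shown only to illustrate the getSchemaToStoreInCommit hook.
// It stores a schema resolved at runtime instead of config.getSchema().
public abstract class ExampleCommitActionExecutor<T extends HoodieRecordPayload<T>>
    extends CommitActionExecutor<T> {
  private final String resolvedSchema;
  public ExampleCommitActionExecutor(JavaSparkContext jsc, HoodieWriteConfig config, HoodieTable table,
                                     String instantTime, WriteOperationType operationType, String resolvedSchema) {
    super(jsc, config, table, instantTime, operationType);
    this.resolvedSchema = resolvedSchema;
  }
  @Override
  protected String getSchemaToStoreInCommit() {
    // Written under HoodieCommitMetadata.SCHEMA_KEY when the commit is finalized.
    return resolvedSchema;
  }
}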

View File

@@ -18,6 +18,7 @@
package org.apache.hudi.table.action.commit;
import java.util.Map;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.common.model.WriteOperationType;
@@ -31,30 +32,34 @@ import org.apache.hudi.table.action.HoodieWriteMetadata;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
public class BulkInsertCommitActionExecutor<T extends HoodieRecordPayload<T>>
extends CommitActionExecutor<T> {
public class BulkInsertCommitActionExecutor<T extends HoodieRecordPayload<T>> extends CommitActionExecutor<T> {
private final JavaRDD<HoodieRecord<T>> inputRecordsRDD;
private final Option<BulkInsertPartitioner> userDefinedBulkInsertPartitioner;
private final Option<BulkInsertPartitioner<T>> bulkInsertPartitioner;
public BulkInsertCommitActionExecutor(JavaSparkContext jsc,
HoodieWriteConfig config, HoodieTable table,
String instantTime, JavaRDD<HoodieRecord<T>> inputRecordsRDD,
Option<BulkInsertPartitioner> userDefinedBulkInsertPartitioner) {
super(jsc, config, table, instantTime, WriteOperationType.BULK_INSERT);
public BulkInsertCommitActionExecutor(JavaSparkContext jsc, HoodieWriteConfig config, HoodieTable table,
String instantTime, JavaRDD<HoodieRecord<T>> inputRecordsRDD,
Option<BulkInsertPartitioner<T>> bulkInsertPartitioner) {
this(jsc, config, table, instantTime, inputRecordsRDD, bulkInsertPartitioner, Option.empty());
}
public BulkInsertCommitActionExecutor(JavaSparkContext jsc, HoodieWriteConfig config, HoodieTable table,
String instantTime, JavaRDD<HoodieRecord<T>> inputRecordsRDD,
Option<BulkInsertPartitioner<T>> bulkInsertPartitioner,
Option<Map<String, String>> extraMetadata) {
super(jsc, config, table, instantTime, WriteOperationType.BULK_INSERT, extraMetadata);
this.inputRecordsRDD = inputRecordsRDD;
this.userDefinedBulkInsertPartitioner = userDefinedBulkInsertPartitioner;
this.bulkInsertPartitioner = bulkInsertPartitioner;
}
@Override
public HoodieWriteMetadata execute() {
try {
return BulkInsertHelper.bulkInsert(inputRecordsRDD, instantTime, (HoodieTable<T>) table, config,
this, true, userDefinedBulkInsertPartitioner);
this, true, bulkInsertPartitioner);
} catch (HoodieInsertException ie) {
throw ie;
} catch (Throwable e) {
if (e instanceof HoodieInsertException) {
throw e;
}
throw new HoodieInsertException("Failed to bulk insert for commit time " + instantTime, e);
}
}

View File

@@ -39,11 +39,10 @@ import java.util.stream.IntStream;
public class BulkInsertHelper<T extends HoodieRecordPayload<T>> {
public static <T extends HoodieRecordPayload<T>> HoodieWriteMetadata bulkInsert(
JavaRDD<HoodieRecord<T>> inputRecords, String instantTime,
HoodieTable<T> table, HoodieWriteConfig config,
CommitActionExecutor<T> executor, boolean performDedupe,
Option<BulkInsertPartitioner> userDefinedBulkInsertPartitioner) {
public static <T extends HoodieRecordPayload<T>> HoodieWriteMetadata bulkInsert(JavaRDD<HoodieRecord<T>> inputRecords, String instantTime,
HoodieTable<T> table, HoodieWriteConfig config,
CommitActionExecutor<T> executor, boolean performDedupe,
Option<BulkInsertPartitioner<T>> userDefinedBulkInsertPartitioner) {
HoodieWriteMetadata result = new HoodieWriteMetadata();
// De-dupe/merge if needed

View File

@@ -35,12 +35,12 @@ public class BulkInsertPreppedCommitActionExecutor<T extends HoodieRecordPayload
extends CommitActionExecutor<T> {
private final JavaRDD<HoodieRecord<T>> preppedInputRecordRdd;
private final Option<BulkInsertPartitioner> userDefinedBulkInsertPartitioner;
private final Option<BulkInsertPartitioner<T>> userDefinedBulkInsertPartitioner;
public BulkInsertPreppedCommitActionExecutor(JavaSparkContext jsc,
HoodieWriteConfig config, HoodieTable table,
String instantTime, JavaRDD<HoodieRecord<T>> preppedInputRecordRdd,
Option<BulkInsertPartitioner> userDefinedBulkInsertPartitioner) {
Option<BulkInsertPartitioner<T>> userDefinedBulkInsertPartitioner) {
super(jsc, config, table, instantTime, WriteOperationType.BULK_INSERT);
this.preppedInputRecordRdd = preppedInputRecordRdd;
this.userDefinedBulkInsertPartitioner = userDefinedBulkInsertPartitioner;

View File

@@ -23,21 +23,15 @@ import org.apache.hudi.common.model.HoodieBaseFile;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.common.model.WriteOperationType;
import org.apache.hudi.common.util.queue.BoundedInMemoryExecutor;
import org.apache.hudi.common.util.queue.BoundedInMemoryQueueConsumer;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.exception.HoodieUpsertException;
import org.apache.hudi.execution.LazyInsertIterable;
import org.apache.hudi.execution.SparkBoundedInMemoryExecutor;
import org.apache.hudi.io.HoodieMergeHandle;
import org.apache.hudi.io.storage.HoodieFileReader;
import org.apache.hudi.io.storage.HoodieFileReaderFactory;
import org.apache.hudi.table.HoodieTable;
import org.apache.hudi.table.WorkloadProfile;
import org.apache.hudi.table.action.HoodieWriteMetadata;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.IndexedRecord;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.Partitioner;
@@ -50,14 +44,19 @@ import java.util.List;
import java.util.Map;
public abstract class CommitActionExecutor<T extends HoodieRecordPayload<T>>
extends BaseCommitActionExecutor<T> {
extends BaseCommitActionExecutor<T, HoodieWriteMetadata> {
private static final Logger LOG = LogManager.getLogger(CommitActionExecutor.class);
public CommitActionExecutor(JavaSparkContext jsc,
HoodieWriteConfig config, HoodieTable table,
String instantTime, WriteOperationType operationType) {
super(jsc, config, table, instantTime, operationType);
public CommitActionExecutor(JavaSparkContext jsc, HoodieWriteConfig config, HoodieTable table,
String instantTime, WriteOperationType operationType) {
this(jsc, config, table, instantTime, operationType, Option.empty());
}
public CommitActionExecutor(JavaSparkContext jsc, HoodieWriteConfig config, HoodieTable table,
String instantTime, WriteOperationType operationType,
Option<Map<String, String>> extraMetadata) {
super(jsc, config, table, instantTime, operationType, extraMetadata);
}
@Override
@@ -87,22 +86,7 @@ public abstract class CommitActionExecutor<T extends HoodieRecordPayload<T>>
throw new HoodieUpsertException(
"Error in finding the old file path at commit " + instantTime + " for fileId: " + fileId);
} else {
BoundedInMemoryExecutor<GenericRecord, GenericRecord, Void> wrapper = null;
try {
HoodieFileReader<IndexedRecord> storageReader =
HoodieFileReaderFactory.getFileReader(table.getHadoopConf(), upsertHandle.getOldFilePath());
wrapper =
new SparkBoundedInMemoryExecutor(config, storageReader.getRecordIterator(upsertHandle.getWriterSchema()),
new UpdateHandler(upsertHandle), x -> x);
wrapper.execute();
} catch (Exception e) {
throw new HoodieException(e);
} finally {
upsertHandle.close();
if (null != wrapper) {
wrapper.shutdownNow();
}
}
MergeHelper.runMerge(table, upsertHandle);
}
// TODO(vc): This needs to be revisited
@@ -147,29 +131,4 @@ public abstract class CommitActionExecutor<T extends HoodieRecordPayload<T>>
public Partitioner getInsertPartitioner(WorkloadProfile profile) {
return getUpsertPartitioner(profile);
}
/**
* Consumer that dequeues records from queue and sends to Merge Handle.
*/
private static class UpdateHandler extends BoundedInMemoryQueueConsumer<GenericRecord, Void> {
private final HoodieMergeHandle upsertHandle;
private UpdateHandler(HoodieMergeHandle upsertHandle) {
this.upsertHandle = upsertHandle;
}
@Override
protected void consumeOneRecord(GenericRecord record) {
upsertHandle.write(record);
}
@Override
protected void finish() {}
@Override
protected Void getResult() {
return null;
}
}
}

View File

@@ -0,0 +1,185 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.table.action.commit;
import java.io.ByteArrayOutputStream;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.io.BinaryDecoder;
import org.apache.avro.io.BinaryEncoder;
import org.apache.avro.io.DecoderFactory;
import org.apache.avro.io.EncoderFactory;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.avro.HoodieAvroUtils;
import org.apache.hudi.client.utils.MergingIterator;
import org.apache.hudi.common.model.HoodieBaseFile;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.common.util.queue.BoundedInMemoryExecutor;
import org.apache.hudi.common.util.queue.BoundedInMemoryQueueConsumer;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.execution.SparkBoundedInMemoryExecutor;
import org.apache.hudi.io.HoodieMergeHandle;
import org.apache.hudi.io.storage.HoodieFileReader;
import org.apache.hudi.io.storage.HoodieFileReaderFactory;
import org.apache.hudi.table.HoodieTable;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import java.io.IOException;
import java.util.Iterator;
/**
* Helper to read records from the previous base file version and run the merge.
*/
public class MergeHelper {
/**
* Read records from the previous version of the base file and merge.
* @param table Hoodie Table
* @param upsertHandle Merge Handle
* @param <T>
* @throws IOException in case of error
*/
public static <T extends HoodieRecordPayload<T>> void runMerge(HoodieTable<T> table, HoodieMergeHandle<T> upsertHandle) throws IOException {
final boolean externalSchemaTransformation = table.getConfig().shouldUseExternalSchemaTransformation();
Configuration cfgForHoodieFile = new Configuration(table.getHadoopConf());
HoodieBaseFile baseFile = upsertHandle.baseFileForMerge();
final GenericDatumWriter<GenericRecord> gWriter;
final GenericDatumReader<GenericRecord> gReader;
Schema readSchema;
if (externalSchemaTransformation || baseFile.getBootstrapBaseFile().isPresent()) {
readSchema = HoodieFileReaderFactory.getFileReader(table.getHadoopConf(), upsertHandle.getOldFilePath()).getSchema();
gWriter = new GenericDatumWriter<>(readSchema);
gReader = new GenericDatumReader<>(readSchema, upsertHandle.getWriterSchemaWithMetafields());
} else {
gReader = null;
gWriter = null;
readSchema = upsertHandle.getWriterSchemaWithMetafields();
}
BoundedInMemoryExecutor<GenericRecord, GenericRecord, Void> wrapper = null;
HoodieFileReader<GenericRecord> reader = HoodieFileReaderFactory.<T, GenericRecord>getFileReader(cfgForHoodieFile, upsertHandle.getOldFilePath());
try {
final Iterator<GenericRecord> readerIterator;
if (baseFile.getBootstrapBaseFile().isPresent()) {
readerIterator = getMergingIterator(table, upsertHandle, baseFile, reader, readSchema, externalSchemaTransformation);
} else {
readerIterator = reader.getRecordIterator(readSchema);
}
ThreadLocal<BinaryEncoder> encoderCache = new ThreadLocal<>();
ThreadLocal<BinaryDecoder> decoderCache = new ThreadLocal<>();
wrapper = new SparkBoundedInMemoryExecutor(table.getConfig(), readerIterator,
new UpdateHandler(upsertHandle), record -> {
if (!externalSchemaTransformation) {
return record;
}
return transformRecordBasedOnNewSchema(gReader, gWriter, encoderCache, decoderCache, (GenericRecord) record);
});
wrapper.execute();
} catch (Exception e) {
throw new HoodieException(e);
} finally {
if (reader != null) {
reader.close();
}
upsertHandle.close();
if (null != wrapper) {
wrapper.shutdownNow();
}
}
}
private static GenericRecord transformRecordBasedOnNewSchema(GenericDatumReader<GenericRecord> gReader, GenericDatumWriter<GenericRecord> gWriter,
ThreadLocal<BinaryEncoder> encoderCache, ThreadLocal<BinaryDecoder> decoderCache,
GenericRecord gRec) {
ByteArrayOutputStream inStream = null;
try {
inStream = new ByteArrayOutputStream();
BinaryEncoder encoder = EncoderFactory.get().binaryEncoder(inStream, encoderCache.get());
encoderCache.set(encoder);
gWriter.write(gRec, encoder);
encoder.flush();
BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(inStream.toByteArray(), decoderCache.get());
decoderCache.set(decoder);
GenericRecord transformedRec = gReader.read(null, decoder);
return transformedRec;
} catch (IOException e) {
throw new HoodieException(e);
} finally {
try {
inStream.close();
} catch (IOException ioe) {
throw new HoodieException(ioe.getMessage(), ioe);
}
}
}
/**
* Create a Parquet record iterator that provides a stitched view of records read from the skeleton and bootstrap files.
* The skeleton file is a representation of the bootstrap file inside the table, with just the bare-bones fields needed
* for indexing, writing and other functionality.
*
*/
private static <T extends HoodieRecordPayload<T>> Iterator<GenericRecord> getMergingIterator(HoodieTable<T> table, HoodieMergeHandle<T> mergeHandle,
HoodieBaseFile baseFile, HoodieFileReader<GenericRecord> reader,
Schema readSchema, boolean externalSchemaTransformation) throws IOException {
Path externalFilePath = new Path(baseFile.getBootstrapBaseFile().get().getPath());
Configuration bootstrapFileConfig = new Configuration(table.getHadoopConf());
HoodieFileReader<GenericRecord> bootstrapReader = HoodieFileReaderFactory.<T, GenericRecord>getFileReader(bootstrapFileConfig, externalFilePath);
Schema bootstrapReadSchema;
if (externalSchemaTransformation) {
bootstrapReadSchema = bootstrapReader.getSchema();
} else {
bootstrapReadSchema = mergeHandle.getWriterSchema();
}
return new MergingIterator<>(reader.getRecordIterator(readSchema), bootstrapReader.getRecordIterator(bootstrapReadSchema),
(inputRecordPair) -> HoodieAvroUtils.stitchRecords(inputRecordPair.getLeft(), inputRecordPair.getRight(), mergeHandle.getWriterSchemaWithMetafields()));
}
/**
* Consumer that dequeues records from queue and sends to Merge Handle.
*/
private static class UpdateHandler extends BoundedInMemoryQueueConsumer<GenericRecord, Void> {
private final HoodieMergeHandle upsertHandle;
private UpdateHandler(HoodieMergeHandle upsertHandle) {
this.upsertHandle = upsertHandle;
}
@Override
protected void consumeOneRecord(GenericRecord record) {
upsertHandle.write(record);
}
@Override
protected void finish() {}
@Override
protected Void getResult() {
return null;
}
}
}
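A minimal sketch of the stitching described above, with hypothetical skeleton/data schemas and values (only HoodieAvroUtils.stitchRecords is from this change; the rest is illustrative and assumes the usual org.apache.avro and java.util imports):
// Illustration only: combine a skeleton record (metadata columns) with a
// bootstrap source record (data columns) into one stitched record.
static GenericRecord stitchExample() {
  Schema skeletonSchema = SchemaBuilder.record("Skeleton").fields()
      .optionalString("_hoodie_record_key").optionalString("_hoodie_partition_path").endRecord();
  Schema dataSchema = SchemaBuilder.record("Data").fields()
      .optionalString("symbol").optionalDouble("close").endRecord();
  List<Schema.Field> stitchedFields = new ArrayList<>();
  for (Schema.Field f : skeletonSchema.getFields()) {
    stitchedFields.add(new Schema.Field(f.name(), f.schema(), f.doc(), f.defaultVal()));
  }
  for (Schema.Field f : dataSchema.getFields()) {
    stitchedFields.add(new Schema.Field(f.name(), f.schema(), f.doc(), f.defaultVal()));
  }
  Schema stitchedSchema = Schema.createRecord("Stitched", null, "example", false);
  stitchedSchema.setFields(stitchedFields);
  GenericRecord skeleton = new GenericData.Record(skeletonSchema);
  skeleton.put("_hoodie_record_key", "key-0001");
  skeleton.put("_hoodie_partition_path", "2020/05/01");
  GenericRecord data = new GenericData.Record(dataSchema);
  data.put("symbol", "GOOG");
  data.put("close", 1230.5d);
  // Copies left fields, then right fields, into a record of the stitched schema.
  return HoodieAvroUtils.stitchRecords(skeleton, data, stitchedSchema);
}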

View File

@@ -56,22 +56,23 @@ public class ScheduleCompactionActionExecutor extends BaseActionExecutor<Option<
private HoodieCompactionPlan scheduleCompaction() {
LOG.info("Checking if compaction needs to be run on " + config.getBasePath());
Option<HoodieInstant> lastCompaction = table.getActiveTimeline().getCommitTimeline().filterCompletedInstants().lastInstant();
String deltaCommitsSinceTs = "0";
Option<HoodieInstant> lastCompaction = table.getActiveTimeline().getCommitTimeline()
.filterCompletedInstants().lastInstant();
String lastCompactionTs = "0";
if (lastCompaction.isPresent()) {
deltaCommitsSinceTs = lastCompaction.get().getTimestamp();
lastCompactionTs = lastCompaction.get().getTimestamp();
}
int deltaCommitsSinceLastCompaction = table.getActiveTimeline().getDeltaCommitTimeline()
.findInstantsAfter(deltaCommitsSinceTs, Integer.MAX_VALUE).countInstants();
.findInstantsAfter(lastCompactionTs, Integer.MAX_VALUE).countInstants();
if (config.getInlineCompactDeltaCommitMax() > deltaCommitsSinceLastCompaction) {
LOG.info("Not scheduling compaction as only " + deltaCommitsSinceLastCompaction
+ " delta commits was found since last compaction " + deltaCommitsSinceTs + ". Waiting for "
+ " delta commits was found since last compaction " + lastCompactionTs + ". Waiting for "
+ config.getInlineCompactDeltaCommitMax());
return new HoodieCompactionPlan();
}
LOG.info("Compacting merge on read table " + config.getBasePath());
LOG.info("Generating compaction plan for merge on read table " + config.getBasePath());
HoodieMergeOnReadTableCompactor compactor = new HoodieMergeOnReadTableCompactor();
try {
return compactor.generateCompactionPlan(jsc, table, config, instantTime,

View File

@@ -18,6 +18,7 @@
package org.apache.hudi.table.action.deltacommit;
import java.util.Map;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.common.model.WriteOperationType;
@@ -36,13 +37,19 @@ public class BulkInsertDeltaCommitActionExecutor<T extends HoodieRecordPayload<T
extends DeltaCommitActionExecutor<T> {
private final JavaRDD<HoodieRecord<T>> inputRecordsRDD;
private final Option<BulkInsertPartitioner> bulkInsertPartitioner;
private final Option<BulkInsertPartitioner<T>> bulkInsertPartitioner;
public BulkInsertDeltaCommitActionExecutor(JavaSparkContext jsc,
HoodieWriteConfig config, HoodieTable table,
String instantTime, JavaRDD<HoodieRecord<T>> inputRecordsRDD,
Option<BulkInsertPartitioner> bulkInsertPartitioner) {
super(jsc, config, table, instantTime, WriteOperationType.BULK_INSERT);
public BulkInsertDeltaCommitActionExecutor(JavaSparkContext jsc, HoodieWriteConfig config, HoodieTable table,
String instantTime, JavaRDD<HoodieRecord<T>> inputRecordsRDD,
Option<BulkInsertPartitioner<T>> bulkInsertPartitioner) {
this(jsc, config, table, instantTime, inputRecordsRDD, bulkInsertPartitioner, Option.empty());
}
public BulkInsertDeltaCommitActionExecutor(JavaSparkContext jsc, HoodieWriteConfig config, HoodieTable table,
String instantTime, JavaRDD<HoodieRecord<T>> inputRecordsRDD,
Option<BulkInsertPartitioner<T>> bulkInsertPartitioner,
Option<Map<String, String>> extraMetadata) {
super(jsc, config, table, instantTime, WriteOperationType.BULK_INSERT, extraMetadata);
this.inputRecordsRDD = inputRecordsRDD;
this.bulkInsertPartitioner = bulkInsertPartitioner;
}
@@ -52,10 +59,9 @@ public class BulkInsertDeltaCommitActionExecutor<T extends HoodieRecordPayload<T
try {
return BulkInsertHelper.bulkInsert(inputRecordsRDD, instantTime, (HoodieTable<T>) table, config,
this, true, bulkInsertPartitioner);
} catch (HoodieInsertException ie) {
throw ie;
} catch (Throwable e) {
if (e instanceof HoodieInsertException) {
throw e;
}
throw new HoodieInsertException("Failed to bulk insert for commit time " + instantTime, e);
}
}

View File

@@ -36,12 +36,12 @@ public class BulkInsertPreppedDeltaCommitActionExecutor<T extends HoodieRecordPa
extends DeltaCommitActionExecutor<T> {
private final JavaRDD<HoodieRecord<T>> preppedInputRecordRdd;
private final Option<BulkInsertPartitioner> bulkInsertPartitioner;
private final Option<BulkInsertPartitioner<T>> bulkInsertPartitioner;
public BulkInsertPreppedDeltaCommitActionExecutor(JavaSparkContext jsc,
HoodieWriteConfig config, HoodieTable table,
String instantTime, JavaRDD<HoodieRecord<T>> preppedInputRecordRdd,
Option<BulkInsertPartitioner> bulkInsertPartitioner) {
Option<BulkInsertPartitioner<T>> bulkInsertPartitioner) {
super(jsc, config, table, instantTime, WriteOperationType.BULK_INSERT);
this.preppedInputRecordRdd = preppedInputRecordRdd;
this.bulkInsertPartitioner = bulkInsertPartitioner;

View File

@@ -18,10 +18,12 @@
package org.apache.hudi.table.action.deltacommit;
import java.util.Map;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.common.model.WriteOperationType;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieUpsertException;
import org.apache.hudi.execution.LazyInsertIterable;
@@ -48,10 +50,15 @@ public abstract class DeltaCommitActionExecutor<T extends HoodieRecordPayload<T>
// UpsertPartitioner for MergeOnRead table type
private UpsertDeltaCommitPartitioner mergeOnReadUpsertPartitioner;
public DeltaCommitActionExecutor(JavaSparkContext jsc,
HoodieWriteConfig config, HoodieTable table,
String instantTime, WriteOperationType operationType) {
super(jsc, config, table, instantTime, operationType);
public DeltaCommitActionExecutor(JavaSparkContext jsc, HoodieWriteConfig config, HoodieTable table,
String instantTime, WriteOperationType operationType) {
this(jsc, config, table, instantTime, operationType, Option.empty());
}
public DeltaCommitActionExecutor(JavaSparkContext jsc, HoodieWriteConfig config, HoodieTable table,
String instantTime, WriteOperationType operationType,
Option<Map<String, String>> extraMetadata) {
super(jsc, config, table, instantTime, operationType, extraMetadata);
}
@Override

View File

@@ -20,6 +20,7 @@ package org.apache.hudi.table.action.rollback;
import org.apache.hudi.avro.model.HoodieRollbackMetadata;
import org.apache.hudi.common.HoodieRollbackStat;
import org.apache.hudi.common.bootstrap.index.BootstrapIndex;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
import org.apache.hudi.common.table.timeline.HoodieInstant;
@@ -224,4 +225,11 @@ public abstract class BaseRollbackActionExecutor extends BaseActionExecutor<Hood
LOG.warn("Rollback finished without deleting inflight instant file. Instant=" + instantToBeDeleted);
}
}
protected void dropBootstrapIndexIfNeeded(HoodieInstant instantToRollback) {
if (HoodieTimeline.compareTimestamps(instantToRollback.getTimestamp(), HoodieTimeline.EQUALS, HoodieTimeline.METADATA_BOOTSTRAP_INSTANT_TS)) {
LOG.info("Dropping bootstrap index as metadata bootstrap commit is getting rolled back !!");
BootstrapIndex.getBootstrapIndex(table.getMetaClient()).dropIndex();
}
}
}

View File

@@ -82,6 +82,9 @@ public class CopyOnWriteRollbackActionExecutor extends BaseRollbackActionExecuto
LOG.info("Clean out all base files generated for commit: " + resolvedInstant);
stats = getRollbackStrategy().execute(resolvedInstant);
}
dropBootstrapIndexIfNeeded(instantToRollback);
// Delete Inflight instant if enabled
deleteInflightAndRequestedInstant(deleteInstants, activeTimeline, resolvedInstant);
LOG.info("Time(in ms) taken to finish rollback " + rollbackTimer.endTimer());

View File

@@ -98,6 +98,8 @@ public class MergeOnReadRollbackActionExecutor extends BaseRollbackActionExecuto
allRollbackStats = getRollbackStrategy().execute(resolvedInstant);
}
dropBootstrapIndexIfNeeded(resolvedInstant);
// Delete Inflight instants if enabled
deleteInflightAndRequestedInstant(deleteInstants, table.getActiveTimeline(), resolvedInstant);
LOG.info("Time(in ms) taken to finish rollback " + rollbackTimer.endTimer());

View File

@@ -477,8 +477,8 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase {
.withBloomIndexUpdatePartitionPath(true)
.withGlobalSimpleIndexUpdatePartitionPath(true)
.build()).withTimelineLayoutVersion(VERSION_0).build();
HoodieTableMetaClient.initTableType(metaClient.getHadoopConf(), metaClient.getBasePath(), metaClient.getTableType(),
metaClient.getTableConfig().getTableName(), metaClient.getArchivePath(),
HoodieTableMetaClient.initTableType(metaClient.getHadoopConf(), metaClient.getBasePath(),
metaClient.getTableType(), metaClient.getTableConfig().getTableName(), metaClient.getArchivePath(),
metaClient.getTableConfig().getPayloadClass(), VERSION_0);
HoodieWriteClient client = getHoodieWriteClient(hoodieWriteConfig, false);

View File

@@ -122,7 +122,7 @@ public class TestUpdateSchemaEvolution extends HoodieClientTestHarness {
HoodieMergeHandle mergeHandle = new HoodieMergeHandle(config2, "101", table2,
updateRecords.iterator(), record1.getPartitionPath(), fileId, supplier);
Configuration conf = new Configuration();
AvroReadSupport.setAvroReadSchema(conf, mergeHandle.getWriterSchema());
AvroReadSupport.setAvroReadSchema(conf, mergeHandle.getWriterSchemaWithMetafields());
List<GenericRecord> oldRecords = ParquetUtils.readAvroRecords(conf,
new Path(config2.getBasePath() + "/" + insertResult.getStat().getPath()));
for (GenericRecord rec : oldRecords) {

View File

@@ -0,0 +1,75 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.client.bootstrap;
import org.apache.hudi.avro.model.HoodieFileStatus;
import org.apache.hudi.client.bootstrap.selector.BootstrapRegexModeSelector;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.config.HoodieBootstrapConfig;
import org.apache.hudi.config.HoodieWriteConfig;
import org.junit.jupiter.api.Test;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
public class TestBootstrapRegexModeSelector {
private HoodieWriteConfig getConfig(String regex, BootstrapMode selectedMode) {
return HoodieWriteConfig.newBuilder().withPath("")
.withBootstrapConfig(HoodieBootstrapConfig.newBuilder()
.withBootstrapModeSelectorRegex(regex)
.withBootstrapModeForRegexMatch(selectedMode).build())
.forTable("test-trip-table").build();
}
@Test
public void testModeSelector() {
List<String> partitionPaths = Arrays.asList("2020/05/01", "2020/05/02", "2020/05/10", "2020/05/11");
List<Pair<String, List<HoodieFileStatus>>> input = partitionPaths.stream()
.map(p -> Pair.<String, List<HoodieFileStatus>>of(p, new ArrayList<>())).collect(Collectors.toList());
String regex = "2020/05/1[0-9]";
BootstrapRegexModeSelector regexModeSelector = new BootstrapRegexModeSelector(getConfig(regex,
BootstrapMode.FULL_RECORD));
Map<BootstrapMode, List<String>> result = regexModeSelector.select(input);
assertTrue(result.get(BootstrapMode.METADATA_ONLY).contains("2020/05/01"));
assertTrue(result.get(BootstrapMode.METADATA_ONLY).contains("2020/05/02"));
assertTrue(result.get(BootstrapMode.FULL_RECORD).contains("2020/05/10"));
assertTrue(result.get(BootstrapMode.FULL_RECORD).contains("2020/05/11"));
assertEquals(2, result.get(BootstrapMode.METADATA_ONLY).size());
assertEquals(2, result.get(BootstrapMode.FULL_RECORD).size());
regexModeSelector = new BootstrapRegexModeSelector(getConfig(regex,
BootstrapMode.METADATA_ONLY));
result = regexModeSelector.select(input);
assertTrue(result.get(BootstrapMode.FULL_RECORD).contains("2020/05/01"));
assertTrue(result.get(BootstrapMode.FULL_RECORD).contains("2020/05/02"));
assertTrue(result.get(BootstrapMode.METADATA_ONLY).contains("2020/05/10"));
assertTrue(result.get(BootstrapMode.METADATA_ONLY).contains("2020/05/11"));
assertEquals(2, result.get(BootstrapMode.METADATA_ONLY).size());
assertEquals(2, result.get(BootstrapMode.FULL_RECORD).size());
}
}

View File

@@ -0,0 +1,66 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.client.bootstrap;
import org.apache.hudi.avro.model.HoodieFileStatus;
import org.apache.hudi.client.bootstrap.selector.FullRecordBootstrapModeSelector;
import org.apache.hudi.client.bootstrap.selector.MetadataOnlyBootstrapModeSelector;
import org.apache.hudi.client.bootstrap.selector.UniformBootstrapModeSelector;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.config.HoodieWriteConfig;
import org.junit.jupiter.api.Test;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
public class TestUniformBootstrapModeSelector {
@Test
public void testFullBootstrapModeSelector() {
FullRecordBootstrapModeSelector modeSelector = new FullRecordBootstrapModeSelector(
HoodieWriteConfig.newBuilder().withPath("").build());
testModeSelector(modeSelector, BootstrapMode.FULL_RECORD);
}
@Test
public void testMetadataOnlyBootstrapModeSelector() {
MetadataOnlyBootstrapModeSelector modeSelector = new MetadataOnlyBootstrapModeSelector(
HoodieWriteConfig.newBuilder().withPath("").build());
testModeSelector(modeSelector, BootstrapMode.METADATA_ONLY);
}
private void testModeSelector(UniformBootstrapModeSelector modeSelector, BootstrapMode mode) {
List<String> partitionPaths = Arrays.asList("2020/05/01", "2020/05/02", "2020/05/10", "2020/05/11");
List<Pair<String, List<HoodieFileStatus>>> input = partitionPaths.stream()
.map(p -> Pair.<String, List<HoodieFileStatus>>of(p, new ArrayList<>())).collect(Collectors.toList());
Map<BootstrapMode, List<String>> result = modeSelector.select(input);
assertTrue(result.get(mode).contains("2020/05/01"));
assertTrue(result.get(mode).contains("2020/05/02"));
assertTrue(result.get(mode).contains("2020/05/10"));
assertTrue(result.get(mode).contains("2020/05/11"));
assertEquals(4, result.get(mode).size());
}
}

View File

@@ -0,0 +1,85 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.table.action.bootstrap;
import org.apache.hudi.avro.model.HoodieFileStatus;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.testutils.HoodieClientTestBase;
import org.apache.hadoop.fs.Path;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import static org.junit.jupiter.api.Assertions.assertEquals;
public class TestBootstrapUtils extends HoodieClientTestBase {
@Test
public void testAllLeafFoldersWithFiles() throws IOException {
// All directories including marker dirs.
List<String> folders = Arrays.asList("2016/04/15", "2016/05/16", "2016/05/17");
folders.forEach(f -> {
try {
metaClient.getFs().mkdirs(new Path(new Path(basePath), f));
} catch (IOException e) {
throw new HoodieException(e);
}
});
// Files inside partitions and marker directories
List<String> files = Arrays.asList(
"2016/04/15/1_1-0-1_20190528120000.parquet",
"2016/04/15/2_1-0-1_20190528120000.parquet",
"2016/05/16/3_1-0-1_20190528120000.parquet",
"2016/05/16/4_1-0-1_20190528120000.parquet",
"2016/04/17/5_1-0-1_20190528120000.parquet",
"2016/04/17/6_1-0-1_20190528120000.parquet");
files.forEach(f -> {
try {
metaClient.getFs().create(new Path(new Path(basePath), f));
} catch (IOException e) {
throw new HoodieException(e);
}
});
List<Pair<String, List<HoodieFileStatus>>> collected =
BootstrapUtils.getAllLeafFoldersWithFiles(metaClient.getFs(), basePath, (status) -> {
return true;
});
assertEquals(3, collected.size());
collected.stream().forEach(k -> {
assertEquals(2, k.getRight().size());
});
// Simulate reading from un-partitioned dataset
collected =
BootstrapUtils.getAllLeafFoldersWithFiles(metaClient.getFs(), basePath + "/" + folders.get(0), (status) -> {
return true;
});
assertEquals(1, collected.size());
collected.stream().forEach(k -> {
assertEquals(2, k.getRight().size());
});
}
}

View File

@@ -19,6 +19,7 @@
package org.apache.hudi.table.action.compact.strategy;
import org.apache.hudi.avro.model.HoodieCompactionOperation;
import org.apache.hudi.common.model.BaseFile;
import org.apache.hudi.common.model.HoodieBaseFile;
import org.apache.hudi.common.model.HoodieLogFile;
import org.apache.hudi.common.util.Option;
@@ -259,7 +260,9 @@ public class TestHoodieCompactionStrategy {
operations.add(new HoodieCompactionOperation(df.getCommitTime(),
logFiles.stream().map(s -> s.getPath().toString()).collect(Collectors.toList()), df.getPath(), df.getFileId(),
partitionPath,
config.getCompactionStrategy().captureMetrics(config, Option.of(df), partitionPath, logFiles)));
config.getCompactionStrategy().captureMetrics(config, Option.of(df), partitionPath, logFiles),
df.getBootstrapBaseFile().map(BaseFile::getPath).orElse(null))
);
});
return operations;
}

View File

@@ -22,14 +22,17 @@ import org.apache.hudi.avro.HoodieAvroUtils;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.testutils.HoodieTestDataGenerator;
import org.apache.hudi.common.testutils.HoodieTestUtils;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils;
import org.apache.avro.Schema;
import org.apache.avro.Schema.Field;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.GenericRecordBuilder;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants;
import org.apache.hadoop.hive.ql.io.IOConstants;
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
import org.apache.hadoop.io.ArrayWritable;
import org.apache.hadoop.io.Writable;
@@ -48,22 +51,39 @@ import java.util.stream.Collectors;
* Utility methods to aid in testing MergeOnRead (workaround for HoodieReadClient for MOR).
*/
public class HoodieMergeOnReadTestUtils {
public static List<GenericRecord> getRecordsUsingInputFormat(Configuration conf, List<String> inputPaths,
String basePath) {
return getRecordsUsingInputFormat(conf, inputPaths, basePath, new JobConf(conf), true);
}
public static List<GenericRecord> getRecordsUsingInputFormat(Configuration conf, List<String> inputPaths,
String basePath,
JobConf jobConf,
boolean realtime) {
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(conf, basePath);
FileInputFormat inputFormat = HoodieInputFormatUtils.getInputFormat(metaClient.getTableConfig().getBaseFileFormat(),
realtime, jobConf);
String basePath, JobConf jobConf, boolean realtime) {
Schema schema = new Schema.Parser().parse(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA);
return getRecordsUsingInputFormat(conf, inputPaths, basePath, jobConf, realtime, schema,
HoodieTestDataGenerator.TRIP_HIVE_COLUMN_TYPES, false, new ArrayList<>());
}
public static List<GenericRecord> getRecordsUsingInputFormat(Configuration conf, List<String> inputPaths, String basePath, JobConf jobConf, boolean realtime, Schema rawSchema,
String rawHiveColumnTypes, boolean projectCols, List<String> projectedColumns) {
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(conf, basePath);
FileInputFormat inputFormat = HoodieInputFormatUtils.getInputFormat(metaClient.getTableConfig().getBaseFileFormat(), realtime, jobConf);
Schema schema = HoodieAvroUtils.addMetadataFields(rawSchema);
String hiveColumnTypes = HoodieAvroUtils.addMetadataColumnTypes(rawHiveColumnTypes);
setPropsForInputFormat(inputFormat, jobConf, schema, hiveColumnTypes, projectCols, projectedColumns);
final List<Field> fields;
if (projectCols) {
fields = schema.getFields().stream().filter(f -> projectedColumns.contains(f.name()))
.collect(Collectors.toList());
} else {
fields = schema.getFields();
}
final Schema projectedSchema = Schema.createRecord(fields.stream()
.map(f -> new Schema.Field(f.name(), f.schema(), f.doc(), f.defaultVal()))
.collect(Collectors.toList()));
Schema schema = HoodieAvroUtils.addMetadataFields(
new Schema.Parser().parse(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA));
setPropsForInputFormat(inputFormat, jobConf, schema, basePath);
return inputPaths.stream().map(path -> {
setInputPath(jobConf, path);
List<GenericRecord> records = new ArrayList<>();
@@ -71,17 +91,18 @@ public class HoodieMergeOnReadTestUtils {
List<InputSplit> splits = Arrays.asList(inputFormat.getSplits(jobConf, 1));
for (InputSplit split : splits) {
RecordReader recordReader = inputFormat.getRecordReader(split, jobConf, null);
Void key = (Void) recordReader.createKey();
Object key = recordReader.createKey();
ArrayWritable writable = (ArrayWritable) recordReader.createValue();
while (recordReader.next(key, writable)) {
GenericRecordBuilder newRecord = new GenericRecordBuilder(schema);
GenericRecordBuilder newRecord = new GenericRecordBuilder(projectedSchema);
// writable returns an array with [field1, field2, _hoodie_commit_time,
// _hoodie_commit_seqno]
Writable[] values = writable.get();
assert schema.getFields().size() <= values.length;
schema.getFields().forEach(field -> {
newRecord.set(field, values[field.pos()]);
});
schema.getFields().stream()
.filter(f -> !projectCols || projectedColumns.contains(f.name()))
.map(f -> Pair.of(projectedSchema.getFields().stream()
.filter(p -> f.name().equals(p.name())).findFirst().get(), f))
.forEach(fieldsPair -> newRecord.set(fieldsPair.getKey(), values[fieldsPair.getValue().pos()]));
records.add(newRecord.build());
}
}
@@ -95,29 +116,40 @@ public class HoodieMergeOnReadTestUtils {
}).orElse(new ArrayList<>());
}
private static void setPropsForInputFormat(FileInputFormat inputFormat, JobConf jobConf, Schema schema,
String basePath) {
private static void setPropsForInputFormat(FileInputFormat inputFormat, JobConf jobConf, Schema schema, String hiveColumnTypes, boolean projectCols, List<String> projectedCols) {
List<Schema.Field> fields = schema.getFields();
String names = fields.stream().map(f -> f.name().toString()).collect(Collectors.joining(","));
String postions = fields.stream().map(f -> String.valueOf(f.pos())).collect(Collectors.joining(","));
Configuration conf = HoodieTestUtils.getDefaultHadoopConf();
final List<String> projectedColNames;
if (!projectCols) {
projectedColNames = fields.stream().map(Field::name).collect(Collectors.toList());
} else {
projectedColNames = projectedCols;
}
String hiveColumnNames = fields.stream().filter(field -> !field.name().equalsIgnoreCase("datestr"))
String names = fields.stream()
.filter(f -> projectedColNames.contains(f.name()))
.map(f -> f.name()).collect(Collectors.joining(","));
String positions = fields.stream()
.filter(f -> projectedColNames.contains(f.name()))
.map(f -> String.valueOf(f.pos())).collect(Collectors.joining(","));
String hiveColumnNames = fields.stream()
.filter(field -> !field.name().equalsIgnoreCase("datestr"))
.map(Schema.Field::name).collect(Collectors.joining(","));
hiveColumnNames = hiveColumnNames + ",datestr";
String hiveColumnTypes = HoodieAvroUtils.addMetadataColumnTypes(HoodieTestDataGenerator.TRIP_HIVE_COLUMN_TYPES);
hiveColumnTypes = hiveColumnTypes + ",string";
Configuration conf = HoodieTestUtils.getDefaultHadoopConf();
String hiveColumnTypesWithDatestr = hiveColumnTypes + ",string";
jobConf.set(hive_metastoreConstants.META_TABLE_COLUMNS, hiveColumnNames);
jobConf.set(hive_metastoreConstants.META_TABLE_COLUMN_TYPES, hiveColumnTypes);
jobConf.set(hive_metastoreConstants.META_TABLE_COLUMN_TYPES, hiveColumnTypesWithDatestr);
jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names);
jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, postions);
jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, positions);
jobConf.set(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS, "datestr");
conf.set(hive_metastoreConstants.META_TABLE_COLUMNS, hiveColumnNames);
conf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names);
conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, postions);
conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, positions);
conf.set(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS, "datestr");
conf.set(hive_metastoreConstants.META_TABLE_COLUMN_TYPES, hiveColumnTypes);
conf.set(hive_metastoreConstants.META_TABLE_COLUMN_TYPES, hiveColumnTypesWithDatestr);
conf.set(IOConstants.COLUMNS, hiveColumnNames);
conf.set(IOConstants.COLUMNS_TYPES, hiveColumnTypesWithDatestr);
// Hoodie Input formats are also configurable
Configurable configurable = (Configurable)inputFormat;

View File

@@ -17,6 +17,7 @@
###
log4j.rootLogger=WARN, CONSOLE
log4j.logger.org.apache.hudi=DEBUG
log4j.logger.org.apache.hadoop.hbase=ERROR
# CONSOLE is set to be a ConsoleAppender.
log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender

View File

@@ -18,6 +18,7 @@
log4j.rootLogger=WARN, CONSOLE
log4j.logger.org.apache=INFO
log4j.logger.org.apache.hudi=DEBUG
log4j.logger.org.apache.hadoop.hbase=ERROR
# A1 is set to be a ConsoleAppender.
log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender

View File

@@ -75,6 +75,12 @@
<import>${basedir}/src/main/avro/HoodieRollbackMetadata.avsc</import>
<import>${basedir}/src/main/avro/HoodieRestoreMetadata.avsc</import>
<import>${basedir}/src/main/avro/HoodieArchivedMetaEntry.avsc</import>
<import>${basedir}/src/main/avro/HoodiePath.avsc</import>
<import>${basedir}/src/main/avro/HoodieFSPermission.avsc</import>
<import>${basedir}/src/main/avro/HoodieFileStatus.avsc</import>
<import>${basedir}/src/main/avro/HoodieBootstrapSourceFilePartitionInfo.avsc</import>
<import>${basedir}/src/main/avro/HoodieBootstrapIndexInfo.avsc</import>
<import>${basedir}/src/main/avro/HoodieBootstrapMetadata.avsc</import>
</imports>
</configuration>
</plugin>
@@ -197,12 +203,31 @@
<version>${hbase.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-server</artifactId>
<version>${hbase.version}</version>
<scope>test</scope>
<!-- Unfortunately, HFile is packaged ONLY under hbase-server -->
<scope>compile</scope>
<exclusions>
<exclusion>
<groupId>javax.servlet</groupId>
<artifactId>*</artifactId>
</exclusion>
<exclusion>
<groupId>org.codehaus.jackson</groupId>
<artifactId>*</artifactId>
</exclusion>
<exclusion>
<groupId>org.mortbay.jetty</groupId>
<artifactId>*</artifactId>
</exclusion>
<exclusion>
<groupId>tomcat</groupId>
<artifactId>*</artifactId>
</exclusion>
</exclusions>
</dependency>
</dependencies>
</project>

View File

@@ -0,0 +1,44 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
{
"namespace":"org.apache.hudi.avro.model",
"type":"record",
"name":"HoodieBootstrapFilePartitionInfo",
"fields":[
{
"name":"version",
"type":["int", "null"],
"default": 1
},
{
"name":"bootstrapPartitionPath",
"type":["null", "string"],
"default" : null
},
{
"name":"bootstrapFileStatus",
"type":["null", "HoodieFileStatus"],
"default" : null
},
{
"name":"partitionPath",
"type":["null", "string"],
"default" : null
}
]
}

View File

@@ -0,0 +1,44 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
{
"namespace":"org.apache.hudi.avro.model",
"type":"record",
"name":"HoodieBootstrapIndexInfo",
"fields":[
{
"name":"version",
"type":["int", "null"],
"default": 1
},
{
"name":"bootstrapBasePath",
"type":["null", "string"],
"default" : null
},
{
"name":"createdTimestamp",
"type":["null", "long"],
"default" : null
},
{
"name":"numKeys",
"type":["null", "int"],
"default" : null
}
]
}

View File

@@ -0,0 +1,47 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
{
"namespace":"org.apache.hudi.avro.model",
"type":"record",
"name":"HoodieBootstrapPartitionMetadata",
"fields":[
{
"name":"version",
"type":["int", "null"],
"default": 1
},
{
"name":"bootstrapPartitionPath",
"type":["null", "string"],
"default" : null
},
{
"name":"partitionPath",
"type":["null", "string"],
"default" : null
},
{
"name":"fileIdToBootstrapFile",
"type":["null", {
"type":"map",
"values": "HoodieFileStatus"
}],
"default": null
}
]
}

View File

@@ -61,6 +61,11 @@
"values":"double"
}],
"default": null
},
{
"name":"bootstrapFilePath",
"type":["null", "string"],
"default": null
}
]
}

View File

@@ -0,0 +1,49 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
{
"namespace":"org.apache.hudi.avro.model",
"type":"record",
"name":"HoodieFSPermission",
"fields":[
{
"name":"version",
"type":["int", "null"],
"default": 1
},
{
"name":"userAction",
"type":[ "null", "string" ],
"default": "null"
},
{
"name":"groupAction",
"type":[ "null", "string" ],
"default": "null"
},
{
"name":"otherAction",
"type":[ "null", "string" ],
"default": "null"
},
{
"name":"stickyBit",
"type":[ "null", "boolean" ],
"default": "null"
}
]
}

View File

@@ -0,0 +1,84 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
{
"namespace":"org.apache.hudi.avro.model",
"type":"record",
"name":"HoodieFileStatus",
"fields":[
{
"name":"version",
"type":["int", "null"],
"default": 1
},
{
"name":"path",
"type":["null", "HoodiePath"],
"default" : null
},
{
"name":"length",
"type":["null", "long"],
"default" : null
},
{
"name":"isDir",
"type":["null", "boolean"],
"default" : null
},
{
"name":"blockReplication",
"type":["null", "int"],
"default" : null
},
{
"name":"blockSize",
"type":["null", "long"],
"default" : null
},
{
"name":"modificationTime",
"type":["null", "long"],
"default" : null
},
{
"name":"accessTime",
"type":["null", "long"],
"default" : null
},
{
"name":"permission",
"type":["null", "HoodieFSPermission"],
"default" : null
},
{
"name":"owner",
"type":["null", "string"],
"default" : null
},
{
"name":"group",
"type":["null", "string"],
"default" : null
},
{
"name":"symlink",
"type":["null", "HoodiePath"],
"default" : null
}
]
}
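As an illustration of how these records are populated from a source file listing (assuming the standard builders that Avro generates for these schemas; the path and sizes below are made up):
// Illustration only: describe a source parquet file with the Avro-generated builders.
static HoodieFileStatus exampleFileStatus() {
  HoodiePath sourcePath = HoodiePath.newBuilder()
      .setUri("file:///data/source/2020/05/01/part-00000.parquet")
      .build();
  return HoodieFileStatus.newBuilder()
      .setPath(sourcePath)
      .setLength(1024L)
      .setIsDir(false)
      .setModificationTime(System.currentTimeMillis())
      .build();
}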

View File

@@ -0,0 +1,34 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
{
"namespace":"org.apache.hudi.avro.model",
"type":"record",
"name":"HoodiePath",
"fields":[
{
"name":"version",
"type":["int", "null"],
"default": 1
},
{
"name":"uri",
"type":["null", "string"],
"default" : null
}
]
}

View File

@@ -19,7 +19,13 @@
package org.apache.hudi.avro;
import org.apache.avro.JsonProperties;
import java.time.LocalDate;
import org.apache.avro.LogicalTypes;
import org.apache.avro.generic.GenericData.Record;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.hudi.exception.SchemaCompatabilityException;
@@ -64,10 +70,10 @@ public class HoodieAvroUtils {
private static ThreadLocal<BinaryDecoder> reuseDecoder = ThreadLocal.withInitial(() -> null);
// All metadata fields are optional strings.
static final Schema METADATA_FIELD_SCHEMA =
public static final Schema METADATA_FIELD_SCHEMA =
Schema.createUnion(Arrays.asList(Schema.create(Schema.Type.NULL), Schema.create(Schema.Type.STRING)));
private static final Schema RECORD_KEY_SCHEMA = initRecordKeySchema();
public static final Schema RECORD_KEY_SCHEMA = initRecordKeySchema();
/**
* Convert a given avro record to bytes.
@@ -251,6 +257,17 @@ public class HoodieAvroUtils {
return record;
}
public static GenericRecord stitchRecords(GenericRecord left, GenericRecord right, Schema stitchedSchema) {
GenericRecord result = new Record(stitchedSchema);
for (Schema.Field f : left.getSchema().getFields()) {
result.put(f.name(), left.get(f.name()));
}
for (Schema.Field f : right.getSchema().getFields()) {
result.put(f.name(), right.get(f.name()));
}
return result;
}
/**
* Given a avro record with a given schema, rewrites it into the new schema while setting fields only from the old
* schema.
@@ -326,4 +343,105 @@ public class HoodieAvroUtils {
throw new HoodieIOException("IOException while decompressing text", e);
}
}
/**
* Generate a reader schema off the provided writeSchema, to just project out the provided columns.
*/
public static Schema generateProjectionSchema(Schema originalSchema, List<String> fieldNames) {
Map<String, Field> schemaFieldsMap = originalSchema.getFields().stream()
.map(r -> Pair.of(r.name().toLowerCase(), r)).collect(Collectors.toMap(Pair::getLeft, Pair::getRight));
List<Schema.Field> projectedFields = new ArrayList<>();
for (String fn : fieldNames) {
Schema.Field field = schemaFieldsMap.get(fn.toLowerCase());
if (field == null) {
throw new HoodieException("Field " + fn + " not found in log schema. Query cannot proceed! "
+ "Derived Schema Fields: " + new ArrayList<>(schemaFieldsMap.keySet()));
} else {
projectedFields.add(new Schema.Field(field.name(), field.schema(), field.doc(), field.defaultValue()));
}
}
Schema projectedSchema = Schema.createRecord(originalSchema.getName(), originalSchema.getDoc(),
originalSchema.getNamespace(), originalSchema.isError());
projectedSchema.setFields(projectedFields);
return projectedSchema;
}
/**
* Obtain the value of the provided field as a string; nested fields are denoted by dot notation, e.g. a.b.c
*/
public static String getNestedFieldValAsString(GenericRecord record, String fieldName, boolean returnNullIfNotFound) {
Object obj = getNestedFieldVal(record, fieldName, returnNullIfNotFound);
return StringUtils.objToString(obj);
}
/**
* Obtain the value of the provided field; nested fields are denoted by dot notation, e.g. a.b.c
*/
public static Object getNestedFieldVal(GenericRecord record, String fieldName, boolean returnNullIfNotFound) {
String[] parts = fieldName.split("\\.");
GenericRecord valueNode = record;
int i = 0;
for (; i < parts.length; i++) {
String part = parts[i];
Object val = valueNode.get(part);
if (val == null) {
break;
}
// return, if last part of name
if (i == parts.length - 1) {
Schema fieldSchema = valueNode.getSchema().getField(part).schema();
return convertValueForSpecificDataTypes(fieldSchema, val);
} else {
// VC: Need a test here
if (!(val instanceof GenericRecord)) {
throw new HoodieException("Cannot find a record at part value :" + part);
}
valueNode = (GenericRecord) val;
}
}
if (returnNullIfNotFound) {
return null;
} else {
throw new HoodieException(
fieldName + "(Part -" + parts[i] + ") field not found in record. Acceptable fields were :"
+ valueNode.getSchema().getFields().stream().map(Field::name).collect(Collectors.toList()));
}
}
/**
* This method converts values for fields with certain Avro/Parquet data types that require special handling.
*
* The logical Date type is converted to an actual date value instead of the epoch-day integer used to
* represent/store it in Parquet.
*
* @param fieldSchema avro field schema
* @param fieldValue avro field value
* @return field value either converted (for certain data types) or as it is.
*/
private static Object convertValueForSpecificDataTypes(Schema fieldSchema, Object fieldValue) {
if (fieldSchema == null) {
return fieldValue;
}
if (isLogicalTypeDate(fieldSchema)) {
return LocalDate.ofEpochDay(Long.parseLong(fieldValue.toString()));
}
return fieldValue;
}
/**
* Given an Avro field schema checks whether the field is of Logical Date Type or not.
*
* @param fieldSchema avro field schema
* @return boolean indicating whether fieldSchema is of Avro's Date Logical Type
*/
private static boolean isLogicalTypeDate(Schema fieldSchema) {
if (fieldSchema.getType() == Schema.Type.UNION) {
return fieldSchema.getTypes().stream().anyMatch(schema -> schema.getLogicalType() == LogicalTypes.date());
}
return fieldSchema.getLogicalType() == LogicalTypes.date();
}
}
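Below is a minimal illustrative sketch (not part of this commit) of how the new projection and nested-field helpers in HoodieAvroUtils might be exercised; the record schema, field names and values are made up for the example.

import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.avro.HoodieAvroUtils;

import java.util.Arrays;

public class NestedFieldExample {
  public static void main(String[] args) {
    // Hypothetical writer schema with a nested record.
    Schema address = SchemaBuilder.record("Address").fields().requiredString("city").endRecord();
    Schema person = SchemaBuilder.record("Person").fields()
        .requiredString("name")
        .name("address").type(address).noDefault()
        .endRecord();

    GenericRecord addr = new GenericData.Record(address);
    addr.put("city", "San Francisco");
    GenericRecord rec = new GenericData.Record(person);
    rec.put("name", "alice");
    rec.put("address", addr);

    // Dot notation walks nested records: "address.city" resolves through the Address record.
    System.out.println(HoodieAvroUtils.getNestedFieldValAsString(rec, "address.city", false));

    // Project only the "name" column out of the writer schema.
    Schema projected = HoodieAvroUtils.generateProjectionSchema(person, Arrays.asList("name"));
    System.out.println(projected.getFields());
  }
}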

View File

@@ -0,0 +1,123 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.common.bootstrap;
import org.apache.hudi.avro.model.HoodieFSPermission;
import org.apache.hudi.avro.model.HoodieFileStatus;
import org.apache.hudi.avro.model.HoodiePath;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsAction;
import org.apache.hadoop.fs.permission.FsPermission;
import java.io.IOException;
/**
* Helper functions around FileStatus and HoodieFileStatus.
*/
public class FileStatusUtils {
public static Path toPath(HoodiePath path) {
if (null == path) {
return null;
}
return new Path(path.getUri());
}
public static HoodiePath fromPath(Path path) {
if (null == path) {
return null;
}
return HoodiePath.newBuilder().setUri(path.toString()).build();
}
public static FsPermission toFSPermission(HoodieFSPermission fsPermission) {
if (null == fsPermission) {
return null;
}
FsAction userAction = fsPermission.getUserAction() != null ? FsAction.valueOf(fsPermission.getUserAction()) : null;
FsAction grpAction = fsPermission.getGroupAction() != null ? FsAction.valueOf(fsPermission.getGroupAction()) : null;
FsAction otherAction =
fsPermission.getOtherAction() != null ? FsAction.valueOf(fsPermission.getOtherAction()) : null;
boolean stickyBit = fsPermission.getStickyBit() != null ? fsPermission.getStickyBit() : false;
return new FsPermission(userAction, grpAction, otherAction, stickyBit);
}
public static HoodieFSPermission fromFSPermission(FsPermission fsPermission) {
if (null == fsPermission) {
return null;
}
String userAction = fsPermission.getUserAction() != null ? fsPermission.getUserAction().name() : null;
String grpAction = fsPermission.getGroupAction() != null ? fsPermission.getGroupAction().name() : null;
String otherAction = fsPermission.getOtherAction() != null ? fsPermission.getOtherAction().name() : null;
return HoodieFSPermission.newBuilder().setUserAction(userAction).setGroupAction(grpAction)
.setOtherAction(otherAction).setStickyBit(fsPermission.getStickyBit()).build();
}
public static FileStatus toFileStatus(HoodieFileStatus fileStatus) {
if (null == fileStatus) {
return null;
}
return new FileStatus(fileStatus.getLength(), fileStatus.getIsDir() == null ? false : fileStatus.getIsDir(),
fileStatus.getBlockReplication(), fileStatus.getBlockSize(), fileStatus.getModificationTime(),
fileStatus.getAccessTime(), toFSPermission(fileStatus.getPermission()), fileStatus.getOwner(),
fileStatus.getGroup(), toPath(fileStatus.getSymlink()), toPath(fileStatus.getPath()));
}
public static HoodieFileStatus fromFileStatus(FileStatus fileStatus) {
if (null == fileStatus) {
return null;
}
HoodieFileStatus fStatus = new HoodieFileStatus();
try {
fStatus.setPath(fromPath(fileStatus.getPath()));
fStatus.setLength(fileStatus.getLen());
fStatus.setIsDir(fileStatus.isDirectory());
fStatus.setBlockReplication((int) fileStatus.getReplication());
fStatus.setBlockSize(fileStatus.getBlockSize());
fStatus.setModificationTime(fileStatus.getModificationTime());
fStatus.setAccessTime(fileStatus.getModificationTime());
fStatus.setSymlink(fileStatus.isSymlink() ? fromPath(fileStatus.getSymlink()) : null);
safeReadAndSetMetadata(fStatus, fileStatus);
} catch (IOException ioe) {
throw new HoodieIOException(ioe.getMessage(), ioe);
}
return fStatus;
}
/**
* Used to safely handle FileStatus calls which might fail on some FileSystem implementations
* (e.g. DeprecatedLocalFileSystem).
*/
private static void safeReadAndSetMetadata(HoodieFileStatus fStatus, FileStatus fileStatus) {
try {
fStatus.setOwner(fileStatus.getOwner());
fStatus.setGroup(fileStatus.getGroup());
fStatus.setPermission(fromFSPermission(fileStatus.getPermission()));
} catch (IllegalArgumentException ie) {
// Deprecated File System (testing) does not work well with this call
// skipping
}
}
}
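A small sketch (not part of this commit) showing the intended round trip between Hadoop's FileStatus and the Avro HoodieFileStatus model that is persisted in the bootstrap index; the local path used is hypothetical.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.avro.model.HoodieFileStatus;
import org.apache.hudi.common.bootstrap.FileStatusUtils;

public class FileStatusRoundTrip {
  public static void main(String[] args) throws Exception {
    FileSystem fs = FileSystem.getLocal(new Configuration());
    FileStatus status = fs.getFileStatus(new Path("/tmp"));          // hypothetical path

    // Convert to the Avro model that gets stored in the bootstrap index ...
    HoodieFileStatus avroStatus = FileStatusUtils.fromFileStatus(status);
    // ... and back to a Hadoop FileStatus when serving reads.
    FileStatus restored = FileStatusUtils.toFileStatus(avroStatus);

    System.out.println(restored.getPath() + " len=" + restored.getLen());
  }
}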

View File

@@ -0,0 +1,161 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.common.bootstrap.index;
import org.apache.hudi.common.model.BootstrapFileMapping;
import org.apache.hudi.common.model.HoodieFileGroupId;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import java.io.Serializable;
import java.util.List;
import java.util.Map;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.util.ReflectionUtils;
/**
* Bootstrap Index Interface.
*/
public abstract class BootstrapIndex implements Serializable {
protected static final long serialVersionUID = 1L;
protected final HoodieTableMetaClient metaClient;
public BootstrapIndex(HoodieTableMetaClient metaClient) {
this.metaClient = metaClient;
}
/**
* Create Bootstrap Index Reader.
* @return Index Reader
*/
public abstract IndexReader createReader();
/**
* Create Bootstrap Index Writer.
* @param sourceBasePath Source Base Path
* @return Index Writer
*/
public abstract IndexWriter createWriter(String sourceBasePath);
/**
* Drop bootstrap index.
*/
public abstract void dropIndex();
/**
* Returns true if valid metadata bootstrap is present.
* @return
*/
public final boolean useIndex() {
boolean validInstantTime = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants().lastInstant()
.map(i -> HoodieTimeline.compareTimestamps(i.getTimestamp(), HoodieTimeline.GREATER_THAN_OR_EQUALS,
HoodieTimeline.METADATA_BOOTSTRAP_INSTANT_TS)).orElse(false);
return validInstantTime && metaClient.getTableConfig().getBootstrapBasePath().isPresent() && isPresent();
}
/**
* Check if the bootstrap index is present and readable.
*/
protected abstract boolean isPresent();
/**
* Bootstrap Index Reader Interface.
*/
public abstract static class IndexReader implements Serializable, AutoCloseable {
protected final HoodieTableMetaClient metaClient;
public IndexReader(HoodieTableMetaClient metaClient) {
this.metaClient = metaClient;
}
/**
* Return the source base path.
* @return the source base path backing this bootstrap index
*/
public abstract String getBootstrapBasePath();
/**
* Return the list of indexed partitions.
* @return indexed partition paths
*/
public abstract List<String> getIndexedPartitionPaths();
/**
* Return the list of indexed file-ids.
* @return indexed file-ids
*/
public abstract List<String> getIndexedFileIds();
/**
* Look up the bootstrap index by partition.
* @param partition Partition to look up
* @return bootstrap file mappings for the partition
*/
public abstract List<BootstrapFileMapping> getSourceFileMappingForPartition(String partition);
/**
* Look up the bootstrap index by file-group ids.
* @param ids File group ids
* @return mapping from each found file-group id to its bootstrap source file
*/
public abstract Map<HoodieFileGroupId, BootstrapFileMapping> getSourceFileMappingForFileIds(
List<HoodieFileGroupId> ids);
public abstract void close();
}
/**
* Bootstrap Index Writer Interface.
*/
public abstract static class IndexWriter implements AutoCloseable {
protected final HoodieTableMetaClient metaClient;
public IndexWriter(HoodieTableMetaClient metaClient) {
this.metaClient = metaClient;
}
/**
* The writer calls this method before it begins indexing partitions.
*/
public abstract void begin();
/**
* Append bootstrap index entries for the next partition; partitions are expected in sorted order.
* @param partitionPath Partition Path
* @param bootstrapFileMappings Bootstrap Source File to File Id mapping
*/
public abstract void appendNextPartition(String partitionPath,
List<BootstrapFileMapping> bootstrapFileMappings);
/**
* Writer calls this method after appending all partitions to be indexed.
*/
public abstract void finish();
public abstract void close();
}
public static BootstrapIndex getBootstrapIndex(HoodieTableMetaClient metaClient) {
return ((BootstrapIndex)(ReflectionUtils.loadClass(
metaClient.getTableConfig().getBootstrapIndexClass(), metaClient)));
}
}
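For orientation, a hedged usage sketch (not part of this commit) of how a caller might enumerate bootstrap mappings through this interface; the table base path is hypothetical and the HoodieTableMetaClient(Configuration, String) constructor is assumed to be the existing one.

import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.common.bootstrap.index.BootstrapIndex;
import org.apache.hudi.common.model.BootstrapFileMapping;
import org.apache.hudi.common.table.HoodieTableMetaClient;

import java.util.List;

public class BootstrapIndexReadExample {
  public static void main(String[] args) {
    HoodieTableMetaClient metaClient =
        new HoodieTableMetaClient(new Configuration(), "/tmp/hudi_table");   // hypothetical table
    BootstrapIndex index = BootstrapIndex.getBootstrapIndex(metaClient);
    if (index.useIndex()) {
      try (BootstrapIndex.IndexReader reader = index.createReader()) {
        for (String partition : reader.getIndexedPartitionPaths()) {
          List<BootstrapFileMapping> mappings = reader.getSourceFileMappingForPartition(partition);
          System.out.println(partition + " -> " + mappings.size() + " bootstrap file mappings");
        }
      }
    }
  }
}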

View File

@@ -0,0 +1,534 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.common.bootstrap.index;
import org.apache.hudi.avro.model.HoodieBootstrapFilePartitionInfo;
import org.apache.hudi.avro.model.HoodieBootstrapIndexInfo;
import org.apache.hudi.avro.model.HoodieBootstrapPartitionMetadata;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.BootstrapFileMapping;
import org.apache.hudi.common.model.HoodieFileFormat;
import org.apache.hudi.common.model.HoodieFileGroupId;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.table.timeline.TimelineMetadataUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.io.hfile.CacheConfig;
import org.apache.hadoop.hbase.io.hfile.HFile;
import org.apache.hadoop.hbase.io.hfile.HFileContext;
import org.apache.hadoop.hbase.io.hfile.HFileContextBuilder;
import org.apache.hadoop.hbase.io.hfile.HFileScanner;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
/**
* Maintains mapping from skeleton file id to external bootstrap file.
* It maintains 2 physical indices.
* (a) At partition granularity to lookup all indices for each partition.
* (b) At file-group granularity to lookup bootstrap mapping for an individual file-group.
*
* This implementation uses HFile as the physical storage for the index. For the initial run, the bootstrap
* mapping for the entire dataset resides in a single file but care has been taken in naming
* the index files in the same way as Hudi data files so that we can reuse file-system abstraction
* on these index files to manage multiple file-groups.
*/
public class HFileBootstrapIndex extends BootstrapIndex {
protected static final long serialVersionUID = 1L;
private static final Logger LOG = LogManager.getLogger(HFileBootstrapIndex.class);
public static final String BOOTSTRAP_INDEX_FILE_ID = "00000000-0000-0000-0000-000000000000-0";
// Additional Metadata written to HFiles.
public static final byte[] INDEX_INFO_KEY = Bytes.toBytes("INDEX_INFO");
private final boolean isPresent;
public HFileBootstrapIndex(HoodieTableMetaClient metaClient) {
super(metaClient);
Path indexByPartitionPath = partitionIndexPath(metaClient);
Path indexByFilePath = fileIdIndexPath(metaClient);
try {
FileSystem fs = metaClient.getFs();
isPresent = fs.exists(indexByPartitionPath) && fs.exists(indexByFilePath);
} catch (IOException ioe) {
throw new HoodieIOException(ioe.getMessage(), ioe);
}
}
private static String getPartitionKey(String partition) {
return "part=" + partition;
}
private static String getFileGroupKey(HoodieFileGroupId fileGroupId) {
return "part=" + fileGroupId.getPartitionPath() + ";fileid=" + fileGroupId.getFileId();
}
private static Path partitionIndexPath(HoodieTableMetaClient metaClient) {
return new Path(metaClient.getBootstrapIndexByPartitionFolderPath(),
FSUtils.makeBootstrapIndexFileName(HoodieTimeline.METADATA_BOOTSTRAP_INSTANT_TS, BOOTSTRAP_INDEX_FILE_ID,
HoodieFileFormat.HFILE.getFileExtension()));
}
private static Path fileIdIndexPath(HoodieTableMetaClient metaClient) {
return new Path(metaClient.getBootstrapIndexByFileIdFolderNameFolderPath(),
FSUtils.makeBootstrapIndexFileName(HoodieTimeline.METADATA_BOOTSTRAP_INSTANT_TS, BOOTSTRAP_INDEX_FILE_ID,
HoodieFileFormat.HFILE.getFileExtension()));
}
/**
* Helper method to create HFile Reader.
*
* @param hFilePath File Path
* @param conf Configuration
* @param fileSystem File System
*/
private static HFile.Reader createReader(String hFilePath, Configuration conf, FileSystem fileSystem) {
try {
LOG.info("Opening HFile for reading :" + hFilePath);
HFile.Reader reader = HFile.createReader(fileSystem, new HFilePathForReader(hFilePath),
new CacheConfig(conf), conf);
return reader;
} catch (IOException ioe) {
throw new HoodieIOException(ioe.getMessage(), ioe);
}
}
@Override
public BootstrapIndex.IndexReader createReader() {
return new HFileBootstrapIndexReader(metaClient);
}
@Override
public BootstrapIndex.IndexWriter createWriter(String bootstrapBasePath) {
return new HFileBootstrapIndexWriter(bootstrapBasePath, metaClient);
}
@Override
public void dropIndex() {
try {
Path[] indexPaths = new Path[]{partitionIndexPath(metaClient), fileIdIndexPath(metaClient)};
for (Path indexPath : indexPaths) {
if (metaClient.getFs().exists(indexPath)) {
LOG.info("Dropping bootstrap index. Deleting file : " + indexPath);
metaClient.getFs().delete(indexPath);
}
}
} catch (IOException ioe) {
throw new HoodieIOException(ioe.getMessage(), ioe);
}
}
@Override
protected boolean isPresent() {
return isPresent;
}
/**
* HFile Based Index Reader.
*/
public static class HFileBootstrapIndexReader extends BootstrapIndex.IndexReader {
// Base Path of external files.
private final String bootstrapBasePath;
// Well Known Paths for indices
private final String indexByPartitionPath;
private final String indexByFileIdPath;
// Index Readers
private transient HFile.Reader indexByPartitionReader;
private transient HFile.Reader indexByFileIdReader;
// Bootstrap Index Info
private transient HoodieBootstrapIndexInfo bootstrapIndexInfo;
public HFileBootstrapIndexReader(HoodieTableMetaClient metaClient) {
super(metaClient);
Path indexByPartitionPath = partitionIndexPath(metaClient);
Path indexByFilePath = fileIdIndexPath(metaClient);
this.indexByPartitionPath = indexByPartitionPath.toString();
this.indexByFileIdPath = indexByFilePath.toString();
initIndexInfo();
this.bootstrapBasePath = bootstrapIndexInfo.getBootstrapBasePath();
LOG.info("Loaded HFileBasedBootstrapIndex with source base path :" + bootstrapBasePath);
}
private void initIndexInfo() {
synchronized (this) {
if (null == bootstrapIndexInfo) {
try {
bootstrapIndexInfo = fetchBootstrapIndexInfo();
} catch (IOException ioe) {
throw new HoodieException(ioe.getMessage(), ioe);
}
}
}
}
private HoodieBootstrapIndexInfo fetchBootstrapIndexInfo() throws IOException {
return TimelineMetadataUtils.deserializeAvroMetadata(
partitionIndexReader().loadFileInfo().get(INDEX_INFO_KEY),
HoodieBootstrapIndexInfo.class);
}
private HFile.Reader partitionIndexReader() {
if (null == indexByPartitionReader) {
synchronized (this) {
if (null == indexByPartitionReader) {
LOG.info("Opening partition index :" + indexByPartitionPath);
this.indexByPartitionReader =
createReader(indexByPartitionPath, metaClient.getHadoopConf(), metaClient.getFs());
}
}
}
return indexByPartitionReader;
}
private HFile.Reader fileIdIndexReader() {
if (null == indexByFileIdReader) {
synchronized (this) {
if (null == indexByFileIdReader) {
LOG.info("Opening fileId index :" + indexByFileIdPath);
this.indexByFileIdReader =
createReader(indexByFileIdPath, metaClient.getHadoopConf(), metaClient.getFs());
}
}
}
return indexByFileIdReader;
}
@Override
public List<String> getIndexedPartitionPaths() {
HFileScanner scanner = partitionIndexReader().getScanner(true, true);
return getAllKeys(scanner);
}
@Override
public List<String> getIndexedFileIds() {
HFileScanner scanner = fileIdIndexReader().getScanner(true, true);
return getAllKeys(scanner);
}
private List<String> getAllKeys(HFileScanner scanner) {
List<String> keys = new ArrayList<>();
try {
boolean available = scanner.seekTo();
while (available) {
keys.add(CellUtil.getCellKeyAsString(scanner.getKeyValue()));
available = scanner.next();
}
} catch (IOException ioe) {
throw new HoodieIOException(ioe.getMessage(), ioe);
}
return keys;
}
@Override
public List<BootstrapFileMapping> getSourceFileMappingForPartition(String partition) {
try {
HFileScanner scanner = partitionIndexReader().getScanner(true, true);
KeyValue keyValue = new KeyValue(Bytes.toBytes(getPartitionKey(partition)), new byte[0], new byte[0],
HConstants.LATEST_TIMESTAMP, KeyValue.Type.Put, new byte[0]);
if (scanner.seekTo(keyValue) == 0) {
ByteBuffer readValue = scanner.getValue();
byte[] valBytes = Bytes.toBytes(readValue);
HoodieBootstrapPartitionMetadata metadata =
TimelineMetadataUtils.deserializeAvroMetadata(valBytes, HoodieBootstrapPartitionMetadata.class);
return metadata.getFileIdToBootstrapFile().entrySet().stream()
.map(e -> new BootstrapFileMapping(bootstrapBasePath, metadata.getBootstrapPartitionPath(),
partition, e.getValue(), e.getKey())).collect(Collectors.toList());
} else {
LOG.warn("No value found for partition key (" + partition + ")");
return new ArrayList<>();
}
} catch (IOException ioe) {
throw new HoodieIOException(ioe.getMessage(), ioe);
}
}
@Override
public String getBootstrapBasePath() {
return bootstrapBasePath;
}
@Override
public Map<HoodieFileGroupId, BootstrapFileMapping> getSourceFileMappingForFileIds(
List<HoodieFileGroupId> ids) {
Map<HoodieFileGroupId, BootstrapFileMapping> result = new HashMap<>();
// Arrange input keys in sorted order for a single-pass scan
List<HoodieFileGroupId> fileGroupIds = new ArrayList<>(ids);
Collections.sort(fileGroupIds);
try {
HFileScanner scanner = fileIdIndexReader().getScanner(true, true);
for (HoodieFileGroupId fileGroupId : fileGroupIds) {
KeyValue keyValue = new KeyValue(Bytes.toBytes(getFileGroupKey(fileGroupId)), new byte[0], new byte[0],
HConstants.LATEST_TIMESTAMP, KeyValue.Type.Put, new byte[0]);
if (scanner.seekTo(keyValue) == 0) {
ByteBuffer readValue = scanner.getValue();
byte[] valBytes = Bytes.toBytes(readValue);
HoodieBootstrapFilePartitionInfo fileInfo = TimelineMetadataUtils.deserializeAvroMetadata(valBytes,
HoodieBootstrapFilePartitionInfo.class);
BootstrapFileMapping mapping = new BootstrapFileMapping(bootstrapBasePath,
fileInfo.getBootstrapPartitionPath(), fileInfo.getPartitionPath(), fileInfo.getBootstrapFileStatus(),
fileGroupId.getFileId());
result.put(fileGroupId, mapping);
}
}
} catch (IOException ioe) {
throw new HoodieIOException(ioe.getMessage(), ioe);
}
return result;
}
@Override
public void close() {
try {
if (indexByPartitionReader != null) {
indexByPartitionReader.close(true);
indexByPartitionReader = null;
}
if (indexByFileIdReader != null) {
indexByFileIdReader.close(true);
indexByFileIdReader = null;
}
} catch (IOException ioe) {
throw new HoodieIOException(ioe.getMessage(), ioe);
}
}
}
/**
* Bootstrap index writer used to build the bootstrap index.
*/
public static class HFileBootstrapIndexWriter extends BootstrapIndex.IndexWriter {
private final String bootstrapBasePath;
private final Path indexByPartitionPath;
private final Path indexByFileIdPath;
private HFile.Writer indexByPartitionWriter;
private HFile.Writer indexByFileIdWriter;
private boolean closed = false;
private int numPartitionKeysAdded = 0;
private int numFileIdKeysAdded = 0;
private final Map<String, List<BootstrapFileMapping>> sourceFileMappings = new HashMap<>();
private HFileBootstrapIndexWriter(String bootstrapBasePath, HoodieTableMetaClient metaClient) {
super(metaClient);
try {
metaClient.initializeBootstrapDirsIfNotExists();
this.bootstrapBasePath = bootstrapBasePath;
this.indexByPartitionPath = partitionIndexPath(metaClient);
this.indexByFileIdPath = fileIdIndexPath(metaClient);
if (metaClient.getFs().exists(indexByPartitionPath) || metaClient.getFs().exists(indexByFileIdPath)) {
String errMsg = "Previous version of bootstrap index exists. Partition Index Path :" + indexByPartitionPath
+ ", FileId index Path :" + indexByFileIdPath;
LOG.info(errMsg);
throw new HoodieException(errMsg);
}
} catch (IOException ioe) {
throw new HoodieIOException(ioe.getMessage(), ioe);
}
}
/**
* Write bootstrap index entries for the next partition; partitions are written in sorted order.
* @param partitionPath Hudi Partition Path
* @param bootstrapPartitionPath Source Partition Path
* @param bootstrapFileMappings Bootstrap Source File to Hudi File Id mapping
*/
private void writeNextPartition(String partitionPath, String bootstrapPartitionPath,
List<BootstrapFileMapping> bootstrapFileMappings) {
try {
LOG.info("Adding bootstrap partition Index entry for partition :" + partitionPath
+ ", bootstrap Partition :" + bootstrapPartitionPath + ", Num Entries :" + bootstrapFileMappings.size());
LOG.info("ADDING entries :" + bootstrapFileMappings);
HoodieBootstrapPartitionMetadata bootstrapPartitionMetadata = new HoodieBootstrapPartitionMetadata();
bootstrapPartitionMetadata.setBootstrapPartitionPath(bootstrapPartitionPath);
bootstrapPartitionMetadata.setPartitionPath(partitionPath);
bootstrapPartitionMetadata.setFileIdToBootstrapFile(
bootstrapFileMappings.stream().map(m -> Pair.of(m.getFileId(),
m.getBoostrapFileStatus())).collect(Collectors.toMap(Pair::getKey, Pair::getValue)));
Option<byte[]> bytes = TimelineMetadataUtils.serializeAvroMetadata(bootstrapPartitionMetadata, HoodieBootstrapPartitionMetadata.class);
if (bytes.isPresent()) {
indexByPartitionWriter
.append(new KeyValue(Bytes.toBytes(getPartitionKey(partitionPath)), new byte[0], new byte[0],
HConstants.LATEST_TIMESTAMP, KeyValue.Type.Put, bytes.get()));
numPartitionKeysAdded++;
}
} catch (IOException e) {
throw new HoodieIOException(e.getMessage(), e);
}
}
/**
* Write the next source-file to Hudi file-id mapping. Entries are expected to be appended in Hudi file-group id
* order.
* @param mapping bootstrap source file mapping.
*/
private void writeNextSourceFileMapping(BootstrapFileMapping mapping) {
try {
HoodieBootstrapFilePartitionInfo srcFilePartitionInfo = new HoodieBootstrapFilePartitionInfo();
srcFilePartitionInfo.setPartitionPath(mapping.getPartitionPath());
srcFilePartitionInfo.setBootstrapPartitionPath(mapping.getBootstrapPartitionPath());
srcFilePartitionInfo.setBootstrapFileStatus(mapping.getBoostrapFileStatus());
KeyValue kv = new KeyValue(getFileGroupKey(mapping.getFileGroupId()).getBytes(), new byte[0], new byte[0],
HConstants.LATEST_TIMESTAMP, KeyValue.Type.Put,
TimelineMetadataUtils.serializeAvroMetadata(srcFilePartitionInfo,
HoodieBootstrapFilePartitionInfo.class).get());
indexByFileIdWriter.append(kv);
numFileIdKeysAdded++;
} catch (IOException e) {
throw new HoodieIOException(e.getMessage(), e);
}
}
/**
* Commit bootstrap index entries. Appends Metadata and closes write handles.
*/
private void commit() {
try {
if (!closed) {
HoodieBootstrapIndexInfo partitionIndexInfo = HoodieBootstrapIndexInfo.newBuilder()
.setCreatedTimestamp(new Date().getTime())
.setNumKeys(numPartitionKeysAdded)
.setBootstrapBasePath(bootstrapBasePath)
.build();
LOG.info("Adding Partition FileInfo :" + partitionIndexInfo);
HoodieBootstrapIndexInfo fileIdIndexInfo = HoodieBootstrapIndexInfo.newBuilder()
.setCreatedTimestamp(new Date().getTime())
.setNumKeys(numFileIdKeysAdded)
.setBootstrapBasePath(bootstrapBasePath)
.build();
LOG.info("Appending FileId FileInfo :" + fileIdIndexInfo);
indexByPartitionWriter.appendFileInfo(INDEX_INFO_KEY,
TimelineMetadataUtils.serializeAvroMetadata(partitionIndexInfo, HoodieBootstrapIndexInfo.class).get());
indexByFileIdWriter.appendFileInfo(INDEX_INFO_KEY,
TimelineMetadataUtils.serializeAvroMetadata(fileIdIndexInfo, HoodieBootstrapIndexInfo.class).get());
close();
}
} catch (IOException ioe) {
throw new HoodieIOException(ioe.getMessage(), ioe);
}
}
/**
* Close Writer Handles.
*/
public void close() {
try {
if (!closed) {
indexByPartitionWriter.close();
indexByFileIdWriter.close();
closed = true;
}
} catch (IOException ioe) {
throw new HoodieIOException(ioe.getMessage(), ioe);
}
}
@Override
public void begin() {
try {
HFileContext meta = new HFileContextBuilder().build();
this.indexByPartitionWriter = HFile.getWriterFactory(metaClient.getHadoopConf(),
new CacheConfig(metaClient.getHadoopConf())).withPath(metaClient.getFs(), indexByPartitionPath)
.withFileContext(meta).withComparator(new HoodieKVComparator()).create();
this.indexByFileIdWriter = HFile.getWriterFactory(metaClient.getHadoopConf(),
new CacheConfig(metaClient.getHadoopConf())).withPath(metaClient.getFs(), indexByFileIdPath)
.withFileContext(meta).withComparator(new HoodieKVComparator()).create();
} catch (IOException ioe) {
throw new HoodieIOException(ioe.getMessage(), ioe);
}
}
@Override
public void appendNextPartition(String partitionPath, List<BootstrapFileMapping> bootstrapFileMappings) {
sourceFileMappings.put(partitionPath, bootstrapFileMappings);
}
@Override
public void finish() {
// Sort and write
List<String> partitions = sourceFileMappings.keySet().stream().sorted().collect(Collectors.toList());
partitions.forEach(p -> writeNextPartition(p, sourceFileMappings.get(p).get(0).getBootstrapPartitionPath(),
sourceFileMappings.get(p)));
sourceFileMappings.values().stream().flatMap(Collection::stream).sorted()
.forEach(this::writeNextSourceFileMapping);
commit();
}
}
/**
* IMPORTANT :
* HFile Readers use HFile name (instead of path) as cache key. This could be fine as long
* as file names are UUIDs. For bootstrap, we are using well-known index names.
* Hence this hacky workaround of returning the full path string from a Path subclass and passing it to the reader.
* The other option is to disable the block cache for bootstrap, which again involves custom code
*/
private static class HFilePathForReader extends Path {
public HFilePathForReader(String pathString) throws IllegalArgumentException {
super(pathString);
}
@Override
public String getName() {
return toString();
}
}
/**
* This class is explicitly used as the key comparator to work around hard-coded
* legacy format class names inside HBase. Otherwise we would face issues with shading.
*/
public static class HoodieKVComparator extends KeyValue.KVComparator {
}
}
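A hedged sketch (not part of this commit) of how the writer side might be driven through the generic BootstrapIndex API; all paths and the file id below are made up, and the HoodieTableMetaClient(Configuration, String) constructor is assumed.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.avro.model.HoodieFileStatus;
import org.apache.hudi.common.bootstrap.FileStatusUtils;
import org.apache.hudi.common.bootstrap.index.BootstrapIndex;
import org.apache.hudi.common.model.BootstrapFileMapping;
import org.apache.hudi.common.table.HoodieTableMetaClient;

import java.util.Collections;

public class BootstrapIndexWriteExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    HoodieTableMetaClient metaClient = new HoodieTableMetaClient(conf, "/tmp/hudi_table");
    FileSystem fs = FileSystem.get(conf);

    // Describe one source parquet file that a skeleton file-group points back to (hypothetical path).
    HoodieFileStatus srcFile = FileStatusUtils.fromFileStatus(
        fs.getFileStatus(new Path("/data/source_parquet/2020/08/01/part-00000.parquet")));
    BootstrapFileMapping mapping = new BootstrapFileMapping(
        "/data/source_parquet", "2020/08/01", "2020/08/01", srcFile, "f1-0000-0000");

    BootstrapIndex index = BootstrapIndex.getBootstrapIndex(metaClient);
    try (BootstrapIndex.IndexWriter writer = index.createWriter("/data/source_parquet")) {
      writer.begin();                                               // opens both HFile writers
      writer.appendNextPartition("2020/08/01", Collections.singletonList(mapping));
      writer.finish();                                              // sorts, writes entries, appends INDEX_INFO
    }
  }
}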

View File

@@ -18,6 +18,7 @@
package org.apache.hudi.common.fs;
import org.apache.hudi.common.config.SerializableConfiguration;
import org.apache.hudi.common.model.HoodieFileFormat;
import org.apache.hudi.common.model.HoodieLogFile;
import org.apache.hudi.common.model.HoodiePartitionMetadata;
@@ -45,6 +46,7 @@ import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map.Entry;
import java.util.Objects;
@@ -52,6 +54,7 @@ import java.util.UUID;
import java.util.function.Function;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.Stream;
/**
@@ -114,6 +117,10 @@ public class FSUtils {
return String.format("%s_%s_%s%s", fileId, writeToken, instantTime, fileExtension);
}
public static String makeBootstrapIndexFileName(String instantTime, String fileId, String ext) {
return String.format("%s_%s_%s%s", fileId, "1-0-1", instantTime, ext);
}
public static String maskWithoutFileId(String instantTime, int taskPartitionId) {
return String.format("*_%s_%s%s", taskPartitionId, instantTime, HoodieFileFormat.PARQUET.getFileExtension());
}
@@ -225,8 +232,8 @@ public class FSUtils {
public static String getFileExtension(String fullName) {
Objects.requireNonNull(fullName);
String fileName = (new File(fullName)).getName();
int dotIndex = fileName.indexOf('.');
String fileName = new File(fullName).getName();
int dotIndex = fileName.lastIndexOf('.');
return dotIndex == -1 ? "" : fileName.substring(dotIndex);
}
@@ -235,10 +242,6 @@ public class FSUtils {
return (path) -> !path.toString().contains(HoodieTableMetaClient.METAFOLDER_NAME);
}
public static String getInstantTime(String name) {
return name.replace(getFileExtension(name), "");
}
/**
* Returns a new unique prefix for creating a file group.
*/
@@ -516,4 +519,33 @@ public class FSUtils {
return returnConf;
}
/**
* Get the FS implementation for this table.
* @param path Path String
* @param hadoopConf Serializable Hadoop Configuration
* @param consistencyGuardConfig Consistency Guard Config
* @return HoodieWrapperFileSystem
*/
public static HoodieWrapperFileSystem getFs(String path, SerializableConfiguration hadoopConf,
ConsistencyGuardConfig consistencyGuardConfig) {
FileSystem fileSystem = FSUtils.getFs(path, hadoopConf.newCopy());
return new HoodieWrapperFileSystem(fileSystem,
consistencyGuardConfig.isConsistencyCheckEnabled()
? new FailSafeConsistencyGuard(fileSystem, consistencyGuardConfig)
: new NoOpConsistencyGuard());
}
/**
* Helper to filter out paths under metadata folder when running fs.globStatus.
* @param fs File System
* @param globPath Glob Path
* @return matching file statuses, excluding paths under the metadata folder
* @throws IOException
*/
public static List<FileStatus> getGlobStatusExcludingMetaFolder(FileSystem fs, Path globPath) throws IOException {
FileStatus[] statuses = fs.globStatus(globPath);
return Arrays.stream(statuses)
.filter(fileStatus -> !fileStatus.getPath().toString().contains(HoodieTableMetaClient.METAFOLDER_NAME))
.collect(Collectors.toList());
}
}
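An illustrative sketch (not part of this commit) of the file name shape produced by the new bootstrap index naming helper; the instant time and file id values are only examples.

import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.HoodieFileFormat;

public class BootstrapIndexFileNameExample {
  public static void main(String[] args) {
    String name = FSUtils.makeBootstrapIndexFileName(
        "00000000000001",                                 // example bootstrap instant time
        "00000000-0000-0000-0000-000000000000-0",         // well-known bootstrap index file id
        HoodieFileFormat.HFILE.getFileExtension());
    // Prints fileId_1-0-1_instantTime.hfile, mirroring regular Hudi data file naming.
    System.out.println(name);
  }
}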

View File

@@ -0,0 +1,101 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.common.model;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import java.io.Serializable;
import java.util.Objects;
/**
* Represents common metadata about a base file.
* A base file can be a Hudi base file or an external (non-Hudi) base file.
*/
public class BaseFile implements Serializable {
private static final long serialVersionUID = 1L;
private transient FileStatus fileStatus;
private final String fullPath;
private long fileLen;
public BaseFile(BaseFile dataFile) {
this.fileStatus = dataFile.fileStatus;
this.fullPath = dataFile.fullPath;
this.fileLen = dataFile.fileLen;
}
public BaseFile(FileStatus fileStatus) {
this.fileStatus = fileStatus;
this.fullPath = fileStatus.getPath().toString();
this.fileLen = fileStatus.getLen();
}
public BaseFile(String filePath) {
this.fileStatus = null;
this.fullPath = filePath;
this.fileLen = -1;
}
public String getPath() {
return fullPath;
}
public String getFileName() {
return new Path(fullPath).getName();
}
public FileStatus getFileStatus() {
return fileStatus;
}
public long getFileSize() {
return fileLen;
}
public void setFileLen(long fileLen) {
this.fileLen = fileLen;
}
public long getFileLen() {
return fileLen;
}
@Override
public boolean equals(Object o) {
if (this == o) {
return true;
}
if (o == null || getClass() != o.getClass()) {
return false;
}
BaseFile dataFile = (BaseFile) o;
return Objects.equals(fullPath, dataFile.fullPath);
}
@Override
public int hashCode() {
return Objects.hash(fullPath);
}
@Override
public String toString() {
return "BaseFile{fullPath=" + fullPath + ", fileLen=" + fileLen + '}';
}
}

View File

@@ -0,0 +1,54 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.common.model;
import java.io.Serializable;
import org.apache.hudi.avro.model.HoodieFileStatus;
import org.apache.hudi.common.bootstrap.FileStatusUtils;
/**
* POJO storing (partitionPath, hoodieFileId) -> external base file path.
*/
public class BootstrapBaseFileMapping implements Serializable {
private final HoodieFileGroupId fileGroupId;
private final HoodieFileStatus bootstrapFileStatus;
public BootstrapBaseFileMapping(HoodieFileGroupId fileGroupId, HoodieFileStatus bootstrapFileStatus) {
this.fileGroupId = fileGroupId;
this.bootstrapFileStatus = bootstrapFileStatus;
}
public HoodieFileGroupId getFileGroupId() {
return fileGroupId;
}
public BaseFile getBootstrapBaseFile() {
return new BaseFile(FileStatusUtils.toFileStatus(bootstrapFileStatus));
}
@Override
public String toString() {
return "BootstrapBaseFileMapping{"
+ "fileGroupId=" + fileGroupId
+ ", bootstrapFileStatus=" + bootstrapFileStatus
+ '}';
}
}

View File

@@ -0,0 +1,110 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.common.model;
import java.io.Serializable;
import java.util.Objects;
import org.apache.hudi.avro.model.HoodieFileStatus;
/**
* Value stored in the bootstrap index.
*/
public class BootstrapFileMapping implements Serializable, Comparable<BootstrapFileMapping> {
private final String bootstrapBasePath;
private final String bootstrapPartitionPath;
private final HoodieFileStatus boostrapFileStatus;
private final String partitionPath;
private final String fileId;
public BootstrapFileMapping(String bootstrapBasePath, String bootstrapPartitionPath, String partitionPath,
HoodieFileStatus boostrapFileStatus, String fileId) {
this.bootstrapBasePath = bootstrapBasePath;
this.bootstrapPartitionPath = bootstrapPartitionPath;
this.partitionPath = partitionPath;
this.boostrapFileStatus = boostrapFileStatus;
this.fileId = fileId;
}
@Override
public String toString() {
return "BootstrapFileMapping{"
+ "bootstrapBasePath='" + bootstrapBasePath + '\''
+ ", bootstrapPartitionPath='" + bootstrapPartitionPath + '\''
+ ", boostrapFileStatus=" + boostrapFileStatus
+ ", partitionPath='" + partitionPath + '\''
+ ", fileId='" + fileId + '\''
+ '}';
}
@Override
public boolean equals(Object o) {
if (this == o) {
return true;
}
if (o == null || getClass() != o.getClass()) {
return false;
}
BootstrapFileMapping mapping = (BootstrapFileMapping) o;
return Objects.equals(bootstrapBasePath, mapping.bootstrapBasePath)
&& Objects.equals(bootstrapPartitionPath, mapping.bootstrapPartitionPath)
&& Objects.equals(partitionPath, mapping.partitionPath)
&& Objects.equals(boostrapFileStatus, mapping.boostrapFileStatus)
&& Objects.equals(fileId, mapping.fileId);
}
@Override
public int hashCode() {
return Objects.hash(bootstrapBasePath, bootstrapPartitionPath, partitionPath, boostrapFileStatus, fileId);
}
public String getBootstrapBasePath() {
return bootstrapBasePath;
}
public String getBootstrapPartitionPath() {
return bootstrapPartitionPath;
}
public String getPartitionPath() {
return partitionPath;
}
public HoodieFileStatus getBoostrapFileStatus() {
return boostrapFileStatus;
}
public String getFileId() {
return fileId;
}
public HoodieFileGroupId getFileGroupId() {
return new HoodieFileGroupId(partitionPath, fileId);
}
@Override
public int compareTo(BootstrapFileMapping o) {
int ret = partitionPath.compareTo(o.partitionPath);
if (ret == 0) {
ret = fileId.compareTo(o.fileId);
}
return ret;
}
}

View File

@@ -44,18 +44,20 @@ public class CompactionOperation implements Serializable {
private Option<String> dataFileName;
private HoodieFileGroupId id;
private Map<String, Double> metrics;
private Option<String> bootstrapFilePath;
// Only for serialization/de-serialization
@Deprecated
public CompactionOperation() {}
public CompactionOperation(String fileId, String partitionPath, String baseInstantTime,
Option<String> dataFileCommitTime, List<String> deltaFileNames, Option<String> dataFileName,
Map<String, Double> metrics) {
Option<String> dataFileCommitTime, List<String> deltaFileNames, Option<String> dataFileName,
Option<String> bootstrapFilePath, Map<String, Double> metrics) {
this.baseInstantTime = baseInstantTime;
this.dataFileCommitTime = dataFileCommitTime;
this.deltaFileNames = deltaFileNames;
this.dataFileName = dataFileName;
this.bootstrapFilePath = bootstrapFilePath;
this.id = new HoodieFileGroupId(partitionPath, fileId);
this.metrics = metrics;
}
@@ -67,14 +69,15 @@ public class CompactionOperation implements Serializable {
this.dataFileName = Option.of(dataFile.get().getFileName());
this.id = new HoodieFileGroupId(partitionPath, dataFile.get().getFileId());
this.dataFileCommitTime = Option.of(dataFile.get().getCommitTime());
this.bootstrapFilePath = dataFile.get().getBootstrapBaseFile().map(BaseFile::getPath);
} else {
assert logFiles.size() > 0;
this.dataFileName = Option.empty();
this.baseInstantTime = FSUtils.getBaseCommitTimeFromLogPath(logFiles.get(0).getPath());
this.id = new HoodieFileGroupId(partitionPath, FSUtils.getFileIdFromLogPath(logFiles.get(0).getPath()));
this.dataFileCommitTime = Option.empty();
this.bootstrapFilePath = Option.empty();
}
this.deltaFileNames = logFiles.stream().map(s -> s.getPath().getName()).collect(Collectors.toList());
this.metrics = metrics;
}
@@ -111,9 +114,17 @@ public class CompactionOperation implements Serializable {
return id;
}
public Option<String> getBootstrapFilePath() {
return bootstrapFilePath;
}
public Option<HoodieBaseFile> getBaseFile(String basePath, String partitionPath) {
Option<BaseFile> externalBaseFile = bootstrapFilePath.map(BaseFile::new);
Path dirPath = FSUtils.getPartitionPath(basePath, partitionPath);
return dataFileName.map(df -> new HoodieBaseFile(new Path(dirPath, df).toString()));
return dataFileName.map(df -> {
return externalBaseFile.map(ext -> new HoodieBaseFile(new Path(dirPath, df).toString(), ext))
.orElseGet(() -> new HoodieBaseFile(new Path(dirPath, df).toString()));
});
}
/**
@@ -130,6 +141,7 @@ public class CompactionOperation implements Serializable {
op.deltaFileNames = new ArrayList<>(operation.getDeltaFilePaths());
op.id = new HoodieFileGroupId(operation.getPartitionPath(), operation.getFileId());
op.metrics = operation.getMetrics() == null ? new HashMap<>() : new HashMap<>(operation.getMetrics());
op.bootstrapFilePath = Option.ofNullable(operation.getBootstrapFilePath());
return op;
}
@@ -137,7 +149,7 @@ public class CompactionOperation implements Serializable {
public String toString() {
return "CompactionOperation{baseInstantTime='" + baseInstantTime + '\'' + ", dataFileCommitTime="
+ dataFileCommitTime + ", deltaFileNames=" + deltaFileNames + ", dataFileName=" + dataFileName + ", id='" + id
+ '\'' + ", metrics=" + metrics + '}';
+ '\'' + ", metrics=" + metrics + ", bootstrapFilePath=" + bootstrapFilePath + '}';
}
@Override

View File

@@ -52,6 +52,14 @@ public class FileSlice implements Serializable {
*/
private final TreeSet<HoodieLogFile> logFiles;
public FileSlice(FileSlice fileSlice) {
this.baseInstantTime = fileSlice.baseInstantTime;
this.baseFile = fileSlice.baseFile != null ? new HoodieBaseFile(fileSlice.baseFile) : null;
this.fileGroupId = fileSlice.fileGroupId;
this.logFiles = new TreeSet<>(HoodieLogFile.getReverseLogFileComparator());
fileSlice.logFiles.forEach(lf -> this.logFiles.add(new HoodieLogFile(lf)));
}
public FileSlice(String partitionPath, String baseInstantTime, String fileId) {
this(new HoodieFileGroupId(partitionPath, fileId), baseInstantTime);
}

View File

@@ -19,33 +19,39 @@
package org.apache.hudi.common.model;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import java.io.Serializable;
import java.util.Objects;
/**
* Hoodie base file.
* Hoodie base file - represents metadata about a Hudi file in DFS.
* Supports APIs to get the Hudi file id, commit time and bootstrap file (if any).
*/
public class HoodieBaseFile implements Serializable {
public class HoodieBaseFile extends BaseFile {
private static final long serialVersionUID = 1L;
private transient FileStatus fileStatus;
private final String fullPath;
private long fileLen;
private Option<BaseFile> bootstrapBaseFile;
public HoodieBaseFile(HoodieBaseFile dataFile) {
super(dataFile);
this.bootstrapBaseFile = dataFile.bootstrapBaseFile;
}
public HoodieBaseFile(FileStatus fileStatus) {
this.fileStatus = fileStatus;
this.fullPath = fileStatus.getPath().toString();
this.fileLen = fileStatus.getLen();
this(fileStatus, null);
}
public HoodieBaseFile(FileStatus fileStatus, BaseFile bootstrapBaseFile) {
super(fileStatus);
this.bootstrapBaseFile = Option.ofNullable(bootstrapBaseFile);
}
public HoodieBaseFile(String filePath) {
this.fileStatus = null;
this.fullPath = filePath;
this.fileLen = -1;
this(filePath, null);
}
public HoodieBaseFile(String filePath, BaseFile bootstrapBaseFile) {
super(filePath);
this.bootstrapBaseFile = Option.ofNullable(bootstrapBaseFile);
}
public String getFileId() {
@@ -56,49 +62,17 @@ public class HoodieBaseFile implements Serializable {
return FSUtils.getCommitTime(getFileName());
}
public String getPath() {
return fullPath;
public Option<BaseFile> getBootstrapBaseFile() {
return bootstrapBaseFile;
}
public String getFileName() {
return new Path(fullPath).getName();
}
public FileStatus getFileStatus() {
return fileStatus;
}
public long getFileSize() {
return fileLen;
}
public void setFileLen(long fileLen) {
this.fileLen = fileLen;
}
public long getFileLen() {
return fileLen;
}
@Override
public boolean equals(Object o) {
if (this == o) {
return true;
}
if (o == null || getClass() != o.getClass()) {
return false;
}
HoodieBaseFile dataFile = (HoodieBaseFile) o;
return Objects.equals(fullPath, dataFile.fullPath);
}
@Override
public int hashCode() {
return Objects.hash(fullPath);
public void setBootstrapBaseFile(BaseFile bootstrapBaseFile) {
this.bootstrapBaseFile = Option.ofNullable(bootstrapBaseFile);
}
@Override
public String toString() {
return "HoodieDataFile{fullPath=" + fullPath + ", fileLen=" + fileLen + '}';
return "HoodieBaseFile{fullPath=" + getPath() + ", fileLen=" + getFileLen()
+ ", BootstrapBaseFile=" + bootstrapBaseFile.orElse(null) + '}';
}
}
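A brief sketch (not part of this commit) of pairing a skeleton base file with its external bootstrap source file; both paths are hypothetical.

import org.apache.hudi.common.model.BaseFile;
import org.apache.hudi.common.model.HoodieBaseFile;

public class SkeletonFilePairingExample {
  public static void main(String[] args) {
    HoodieBaseFile skeleton = new HoodieBaseFile(
        "/tmp/hudi_table/2020/08/01/f1-0000-0000_1-0-1_00000000000001.parquet",
        new BaseFile("/data/source_parquet/2020/08/01/part-00000.parquet"));
    if (skeleton.getBootstrapBaseFile().isPresent()) {
      // Queries stitch column data from the source file with Hudi metadata from the skeleton.
      System.out.println("reads columns from " + skeleton.getBootstrapBaseFile().get().getPath());
    }
  }
}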

View File

@@ -22,7 +22,9 @@ package org.apache.hudi.common.model;
* Hoodie file format.
*/
public enum HoodieFileFormat {
PARQUET(".parquet"), HOODIE_LOG(".log");
PARQUET(".parquet"),
HOODIE_LOG(".log"),
HFILE(".hfile");
private final String extension;

View File

@@ -21,12 +21,14 @@ package org.apache.hudi.common.model;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.Pair;
import java.io.Serializable;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.stream.Collectors;
import java.util.stream.Stream;
/**
@@ -58,6 +60,15 @@ public class HoodieFileGroup implements Serializable {
*/
private final Option<HoodieInstant> lastInstant;
public HoodieFileGroup(HoodieFileGroup fileGroup) {
this.timeline = fileGroup.timeline;
this.fileGroupId = fileGroup.fileGroupId;
this.fileSlices = new TreeMap<>(fileGroup.fileSlices.entrySet().stream()
.map(e -> Pair.of(e.getKey(), new FileSlice(e.getValue())))
.collect(Collectors.toMap(Pair::getKey, Pair::getValue)));
this.lastInstant = fileGroup.lastInstant;
}
public HoodieFileGroup(String partitionPath, String id, HoodieTimeline timeline) {
this(new HoodieFileGroupId(partitionPath, id), timeline);
}

View File

@@ -24,7 +24,7 @@ import java.util.Objects;
/**
* Unique ID to identify a file-group in a data-set.
*/
public class HoodieFileGroupId implements Serializable {
public class HoodieFileGroupId implements Serializable, Comparable<HoodieFileGroupId> {
private final String partitionPath;
@@ -64,4 +64,13 @@ public class HoodieFileGroupId implements Serializable {
public String toString() {
return "HoodieFileGroupId{partitionPath='" + partitionPath + '\'' + ", fileId='" + fileId + '\'' + '}';
}
@Override
public int compareTo(HoodieFileGroupId o) {
int ret = partitionPath.compareTo(o.partitionPath);
if (ret == 0) {
ret = fileId.compareTo(o.fileId);
}
return ret;
}
}

View File

@@ -45,6 +45,12 @@ public class HoodieLogFile implements Serializable {
private final String pathStr;
private long fileLen;
public HoodieLogFile(HoodieLogFile logFile) {
this.fileStatus = logFile.fileStatus;
this.pathStr = logFile.pathStr;
this.fileLen = logFile.fileLen;
}
public HoodieLogFile(FileStatus fileStatus) {
this.fileStatus = fileStatus;
this.pathStr = fileStatus.getPath().toString();

View File

@@ -37,6 +37,7 @@ public enum WriteOperationType {
BULK_INSERT_PREPPED("bulk_insert_prepped"),
// delete
DELETE("delete"),
BOOTSTRAP("bootstrap"),
// used for old version
UNKNOWN("unknown");

View File

@@ -18,6 +18,7 @@
package org.apache.hudi.common.table;
import org.apache.hudi.common.bootstrap.index.HFileBootstrapIndex;
import org.apache.hudi.common.model.HoodieFileFormat;
import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.common.model.OverwriteWithLatestAvroPayload;
@@ -63,11 +64,15 @@ public class HoodieTableConfig implements Serializable {
public static final String HOODIE_TIMELINE_LAYOUT_VERSION = "hoodie.timeline.layout.version";
public static final String HOODIE_PAYLOAD_CLASS_PROP_NAME = "hoodie.compaction.payload.class";
public static final String HOODIE_ARCHIVELOG_FOLDER_PROP_NAME = "hoodie.archivelog.folder";
public static final String HOODIE_BOOTSTRAP_INDEX_CLASS_PROP_NAME = "hoodie.bootstrap.index.class";
public static final String HOODIE_BOOTSTRAP_BASE_PATH = "hoodie.bootstrap.base.path";
public static final HoodieTableType DEFAULT_TABLE_TYPE = HoodieTableType.COPY_ON_WRITE;
public static final HoodieFileFormat DEFAULT_BASE_FILE_FORMAT = HoodieFileFormat.PARQUET;
public static final HoodieFileFormat DEFAULT_LOG_FILE_FORMAT = HoodieFileFormat.HOODIE_LOG;
public static final String DEFAULT_PAYLOAD_CLASS = OverwriteWithLatestAvroPayload.class.getName();
public static final String DEFAULT_BOOTSTRAP_INDEX_CLASS = HFileBootstrapIndex.class.getName();
public static final Integer DEFAULT_TIMELINE_LAYOUT_VERSION = TimelineLayoutVersion.VERSION_0;
public static final String DEFAULT_ARCHIVELOG_FOLDER = "";
private Properties props;
@@ -131,6 +136,10 @@ public class HoodieTableConfig implements Serializable {
// Use latest Version as default unless forced by client
properties.setProperty(HOODIE_TIMELINE_LAYOUT_VERSION, TimelineLayoutVersion.CURR_VERSION.toString());
}
if (properties.containsKey(HOODIE_BOOTSTRAP_BASE_PATH) && !properties.containsKey(HOODIE_BOOTSTRAP_INDEX_CLASS_PROP_NAME)) {
// Use the default bootstrap index class.
properties.setProperty(HOODIE_BOOTSTRAP_INDEX_CLASS_PROP_NAME, DEFAULT_BOOTSTRAP_INDEX_CLASS);
}
properties.store(outputStream, "Properties saved on " + new Date(System.currentTimeMillis()));
}
}
@@ -161,6 +170,19 @@ public class HoodieTableConfig implements Serializable {
"org.apache.hudi");
}
/**
* Read the bootstrap index class for this table from the table properties.
*/
public String getBootstrapIndexClass() {
// Fall back to the default HFile-based bootstrap index when the property is not set
return props.getProperty(HOODIE_BOOTSTRAP_INDEX_CLASS_PROP_NAME, DEFAULT_BOOTSTRAP_INDEX_CLASS);
}
public Option<String> getBootstrapBasePath() {
return Option.ofNullable(props.getProperty(HOODIE_BOOTSTRAP_BASE_PATH));
}
/**
* Read the table name.
*/

View File

@@ -74,6 +74,12 @@ public class HoodieTableMetaClient implements Serializable {
public static final String TEMPFOLDER_NAME = METAFOLDER_NAME + File.separator + ".temp";
public static final String AUXILIARYFOLDER_NAME = METAFOLDER_NAME + File.separator + ".aux";
public static final String BOOTSTRAP_INDEX_ROOT_FOLDER_PATH = AUXILIARYFOLDER_NAME + File.separator + ".bootstrap";
public static final String BOOTSTRAP_INDEX_BY_PARTITION_FOLDER_PATH = BOOTSTRAP_INDEX_ROOT_FOLDER_PATH
+ File.separator + ".partitions";
public static final String BOOTSTRAP_INDEX_BY_FILE_ID_FOLDER_PATH = BOOTSTRAP_INDEX_ROOT_FOLDER_PATH + File.separator
+ ".fileids";
public static final String MARKER_EXTN = ".marker";
private String basePath;
@@ -209,6 +215,20 @@ public class HoodieTableMetaClient implements Serializable {
return basePath + File.separator + AUXILIARYFOLDER_NAME;
}
/**
* @return Bootstrap Index By Partition Folder
*/
public String getBootstrapIndexByPartitionFolderPath() {
return basePath + File.separator + BOOTSTRAP_INDEX_BY_PARTITION_FOLDER_PATH;
}
/**
* @return Bootstrap Index By Hudi File Id Folder
*/
public String getBootstrapIndexByFileIdFolderNameFolderPath() {
return basePath + File.separator + BOOTSTRAP_INDEX_BY_FILE_ID_FOLDER_PATH;
}
/**
* @return path where archived timeline is stored
*/
@@ -300,44 +320,45 @@ public class HoodieTableMetaClient implements Serializable {
return archivedTimeline;
}
/**
* Helper method to initialize a table, with given basePath, tableType, name, archiveFolder, payloadClass.
*/
public static HoodieTableMetaClient initTableType(Configuration hadoopConf, String basePath, String tableType,
String tableName, String archiveLogFolder, String payloadClassName) throws IOException {
return initTableType(hadoopConf, basePath, HoodieTableType.valueOf(tableType), tableName,
archiveLogFolder, payloadClassName, null, null);
}
/**
* Helper method to initialize a table, with given basePath, tableType, name, archiveFolder, payloadClass and
* base file format.
*/
public static HoodieTableMetaClient initTableType(Configuration hadoopConf, String basePath, String tableType,
String tableName, String archiveLogFolder, String payloadClassName, String baseFileFormat) throws IOException {
return initTableType(hadoopConf, basePath, HoodieTableType.valueOf(tableType), tableName,
archiveLogFolder, payloadClassName, null, baseFileFormat);
public static HoodieTableMetaClient initTableTypeWithBootstrap(Configuration hadoopConf, String basePath, HoodieTableType tableType,
String tableName, String archiveLogFolder, String payloadClassName,
String baseFileFormat, String bootstrapIndexClass,
String bootstrapBasePath) throws IOException {
return initTableType(hadoopConf, basePath, tableType, tableName,
archiveLogFolder, payloadClassName, null, baseFileFormat, bootstrapIndexClass, bootstrapBasePath);
}
public static HoodieTableMetaClient initTableType(Configuration hadoopConf, String basePath, HoodieTableType tableType,
String tableName, String archiveLogFolder, String payloadClassName,
String baseFileFormat) throws IOException {
return initTableType(hadoopConf, basePath, tableType, tableName,
archiveLogFolder, payloadClassName, null, baseFileFormat, null, null);
}
/**
* Helper method to initialize a given path, as a given type and table name.
* Used primarily by tests, examples.
*/
public static HoodieTableMetaClient initTableType(Configuration hadoopConf, String basePath,
HoodieTableType tableType, String tableName, String payloadClassName) throws IOException {
return initTableType(hadoopConf, basePath, tableType, tableName, null, payloadClassName, null, null);
public static HoodieTableMetaClient initTableType(Configuration hadoopConf, String basePath, HoodieTableType tableType,
String tableName, String payloadClassName) throws IOException {
return initTableType(hadoopConf, basePath, tableType, tableName, null, payloadClassName,
null, null, null, null);
}
public static HoodieTableMetaClient initTableType(Configuration hadoopConf, String basePath,
HoodieTableType tableType, String tableName,
String archiveLogFolder, String payloadClassName,
public static HoodieTableMetaClient initTableType(Configuration hadoopConf, String basePath, HoodieTableType tableType,
String tableName, String archiveLogFolder, String payloadClassName,
Integer timelineLayoutVersion) throws IOException {
return initTableType(hadoopConf, basePath, tableType, tableName, archiveLogFolder, payloadClassName,
timelineLayoutVersion, null);
timelineLayoutVersion, null, null, null);
}
public static HoodieTableMetaClient initTableType(Configuration hadoopConf, String basePath,
HoodieTableType tableType, String tableName, String archiveLogFolder, String payloadClassName,
Integer timelineLayoutVersion, String baseFileFormat) throws IOException {
private static HoodieTableMetaClient initTableType(Configuration hadoopConf, String basePath, HoodieTableType tableType,
String tableName, String archiveLogFolder, String payloadClassName,
Integer timelineLayoutVersion, String baseFileFormat,
String bootstrapIndexClass, String bootstrapBasePath) throws IOException {
Properties properties = new Properties();
properties.setProperty(HoodieTableConfig.HOODIE_TABLE_NAME_PROP_NAME, tableName);
properties.setProperty(HoodieTableConfig.HOODIE_TABLE_TYPE_PROP_NAME, tableType.name());
@@ -356,6 +377,15 @@ public class HoodieTableMetaClient implements Serializable {
if (null != baseFileFormat) {
properties.setProperty(HoodieTableConfig.HOODIE_BASE_FILE_FORMAT_PROP_NAME, baseFileFormat.toUpperCase());
}
if (null != bootstrapIndexClass) {
properties.put(HoodieTableConfig.HOODIE_BOOTSTRAP_INDEX_CLASS_PROP_NAME, bootstrapIndexClass);
}
if (null != bootstrapBasePath) {
properties.put(HoodieTableConfig.HOODIE_BOOTSTRAP_BASE_PATH, bootstrapBasePath);
}
return HoodieTableMetaClient.initTableAndGetMetaClient(hadoopConf, basePath, properties);
}
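// Reference sketch (the key names below are assumptions; the actual keys are whatever
// HoodieTableConfig.HOODIE_BOOTSTRAP_INDEX_CLASS_PROP_NAME and HOODIE_BOOTSTRAP_BASE_PATH resolve to):
// when bootstrapIndexClass and bootstrapBasePath are supplied, the generated hoodie.properties is
// expected to additionally carry entries along the lines of
//   hoodie.bootstrap.index.class=<bootstrapIndexClass>
//   hoodie.bootstrap.base.path=<bootstrapBasePath>
// alongside the usual table name/type/version properties written above.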
@@ -399,6 +429,7 @@ public class HoodieTableMetaClient implements Serializable {
fs.mkdirs(auxiliaryFolder);
}
initializeBootstrapDirsIfNotExists(hadoopConf, basePath, fs);
HoodieTableConfig.createHoodieProperties(fs, metaPathDir, props);
// We should not use fs.getConf as this might be different from the original configuration
// used to create the fs in unit tests
@@ -407,6 +438,25 @@ public class HoodieTableMetaClient implements Serializable {
return metaClient;
}
public static void initializeBootstrapDirsIfNotExists(Configuration hadoopConf,
String basePath, FileSystem fs) throws IOException {
// Create bootstrap index by partition folder if it does not exist
final Path bootstrap_index_folder_by_partition =
new Path(basePath, HoodieTableMetaClient.BOOTSTRAP_INDEX_BY_PARTITION_FOLDER_PATH);
if (!fs.exists(bootstrap_index_folder_by_partition)) {
fs.mkdirs(bootstrap_index_folder_by_partition);
}
// Create bootstrap index by file-id folder if it does not exist
final Path bootstrap_index_folder_by_fileids =
new Path(basePath, HoodieTableMetaClient.BOOTSTRAP_INDEX_BY_FILE_ID_FOLDER_PATH);
if (!fs.exists(bootstrap_index_folder_by_fileids)) {
fs.mkdirs(bootstrap_index_folder_by_fileids);
}
}
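// Layout sketch after the call above, assuming METAFOLDER_NAME resolves to ".hoodie" (mkdirs is only
// issued for folders that do not exist yet, so repeated calls are harmless):
//   <basePath>/.hoodie/.aux/.bootstrap/.partitions   (bootstrap index keyed by source partition)
//   <basePath>/.hoodie/.aux/.bootstrap/.fileids      (bootstrap index keyed by Hudi file id)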
/**
* Helper method to scan all hoodie-instant metafiles.
*
@@ -512,7 +562,7 @@ public class HoodieTableMetaClient implements Serializable {
HoodieTableMetaClient
.scanFiles(getFs(), timelinePath, path -> {
// Include only the meta files with extensions that needs to be included
String extension = FSUtils.getFileExtension(path.getName());
String extension = HoodieInstant.getTimelineFileExtension(path.getName());
return includedExtensions.contains(extension);
})).map(HoodieInstant::new);
@@ -549,6 +599,10 @@ public class HoodieTableMetaClient implements Serializable {
return sb.toString();
}
public void initializeBootstrapDirsIfNotExists() throws IOException {
initializeBootstrapDirsIfNotExists(getHadoopConf(), basePath, getFs());
}
public void setBasePath(String basePath) {
this.basePath = basePath;
}

View File

@@ -18,9 +18,7 @@
package org.apache.hudi.common.table.timeline;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.util.CollectionUtils;
import org.apache.hadoop.fs.FileStatus;
import java.io.Serializable;
@@ -53,6 +51,12 @@ public class HoodieInstant implements Serializable, Comparable<HoodieInstant> {
return COMPARABLE_ACTIONS.getOrDefault(action, action);
}
public static String getTimelineFileExtension(String fileName) {
Objects.requireNonNull(fileName);
int dotIndex = fileName.indexOf('.');
return dotIndex == -1 ? "" : fileName.substring(dotIndex);
}
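// Behavior sketch (file names illustrative): the extension is everything from the first dot, so
// compound timeline suffixes are kept whole:
//   getTimelineFileExtension("20170101193025.commit")               -> ".commit"
//   getTimelineFileExtension("20170101193025.compaction.requested") -> ".compaction.requested"
//   getTimelineFileExtension("20170101193025")                      -> ""
// HoodieInstant(FileStatus) below relies on this to strip the full suffix and recover the
// timestamp via fileName.replace(fileExtension, "").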
/**
* Instant State.
*/
@@ -77,7 +81,7 @@ public class HoodieInstant implements Serializable, Comparable<HoodieInstant> {
public HoodieInstant(FileStatus fileStatus) {
// First read the instant timestamp. [==>20170101193025<==].commit
String fileName = fileStatus.getPath().getName();
String fileExtension = FSUtils.getFileExtension(fileName);
String fileExtension = getTimelineFileExtension(fileName);
timestamp = fileName.replace(fileExtension, "");
// Next read the action for this marker

Some files were not shown because too many files have changed in this diff.