1
0

[HUDI-808] Support cleaning bootstrap source data (#1870)

Co-authored-by: Wenning Ding <wenningd@amazon.com>
Co-authored-by: Balaji Varadarajan <vbalaji@apache.org>
This commit is contained in:
wenningd
2020-08-11 01:43:46 -07:00
committed by GitHub
parent 626f78f6f6
commit 8b928e9bca
23 changed files with 772 additions and 173 deletions

View File

@@ -24,23 +24,22 @@
{"name": "totalFilesDeleted", "type": "int"},
{"name": "earliestCommitToRetain", "type": "string"},
{"name": "partitionMetadata", "type": {
"type" : "map", "values" : {
"type": "record",
"name": "HoodieCleanPartitionMetadata",
"fields": [
{"name": "partitionPath", "type": "string"},
{"name": "policy", "type": "string"},
{"name": "deletePathPatterns", "type": {"type": "array", "items": "string"}},
{"name": "successDeleteFiles", "type": {"type": "array", "items": "string"}},
{"name": "failedDeleteFiles", "type": {"type": "array", "items": "string"}}
]
}
}
"type" : "map", "values" : "HoodieCleanPartitionMetadata"
}
},
{
"name":"version",
"type":["int", "null"],
"default": 1
},
{
"name": "bootstrapPartitionMetadata",
"type": [ "null", {
"type" : "map",
"values" : "HoodieCleanPartitionMetadata",
"default" : null
}],
"default" : null
}
]
}

View File

@@ -0,0 +1,29 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
{
"namespace": "org.apache.hudi.avro.model",
"type": "record",
"name": "HoodieCleanPartitionMetadata",
"fields": [
{"name": "partitionPath", "type": "string"},
{"name": "policy", "type": "string"},
{"name": "deletePathPatterns", "type": {"type": "array", "items": "string"}},
{"name": "successDeleteFiles", "type": {"type": "array", "items": "string"}},
{"name": "failedDeleteFiles", "type": {"type": "array", "items": "string"}}
]
}

View File

@@ -47,6 +47,7 @@
"type": "string"
},
{
/** This is deprecated and replaced by the field filePathsToBeDeletedPerPartition **/
"name": "filesToBeDeletedPerPartition",
"type": [
"null", {
@@ -64,6 +65,33 @@
"name":"version",
"type":["int", "null"],
"default": 1
},
{
"name": "filePathsToBeDeletedPerPartition",
"doc": "This field replaces the field filesToBeDeletedPerPartition",
"type": [
"null", {
"type":"map",
"values": {
"type":"array",
"items":{
"name":"HoodieCleanFileInfo",
"type": "record",
"fields":[
{
"name":"filePath",
"type":["null","string"],
"default": null
},
{
"name":"isBootstrapBaseFile",
"type":["null","boolean"],
"default": null
}
]
}
}}],
"default" : null
}
]
}

View File

@@ -20,6 +20,7 @@ package org.apache.hudi.common;
import org.apache.hudi.common.model.HoodieCleaningPolicy;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.util.CollectionUtils;
import org.apache.hudi.common.util.Option;
import java.io.Serializable;
@@ -39,17 +40,35 @@ public class HoodieCleanStat implements Serializable {
private final List<String> successDeleteFiles;
// Files that could not be deleted
private final List<String> failedDeleteFiles;
// Bootstrap Base Path patterns that were generated for the delete operation
private final List<String> deleteBootstrapBasePathPatterns;
private final List<String> successDeleteBootstrapBaseFiles;
// Files that could not be deleted
private final List<String> failedDeleteBootstrapBaseFiles;
// Earliest commit that was retained in this clean
private final String earliestCommitToRetain;
/**
 * Constructs a clean stat without any bootstrap base-file information; delegates to the
 * full constructor with empty immutable lists for the three bootstrap fields.
 *
 * @param policy cleaning policy used for this clean
 * @param partitionPath partition this stat applies to
 * @param deletePathPatterns path patterns generated for the delete operation
 * @param successDeleteFiles files successfully deleted
 * @param failedDeleteFiles files that could not be deleted
 * @param earliestCommitToRetain earliest commit retained by this clean
 */
public HoodieCleanStat(HoodieCleaningPolicy policy, String partitionPath, List<String> deletePathPatterns,
List<String> successDeleteFiles, List<String> failedDeleteFiles, String earliestCommitToRetain) {
this(policy, partitionPath, deletePathPatterns, successDeleteFiles, failedDeleteFiles, earliestCommitToRetain,
CollectionUtils.createImmutableList(), CollectionUtils.createImmutableList(),
CollectionUtils.createImmutableList());
}
/**
 * Constructs a clean stat including bootstrap base-file delete information.
 *
 * @param policy cleaning policy used for this clean
 * @param partitionPath partition this stat applies to
 * @param deletePathPatterns path patterns generated for the delete operation
 * @param successDeleteFiles files successfully deleted
 * @param failedDeleteFiles files that could not be deleted
 * @param earliestCommitToRetain earliest commit retained by this clean
 * @param deleteBootstrapBasePathPatterns bootstrap base path patterns generated for the delete operation
 * @param successDeleteBootstrapBaseFiles bootstrap base files successfully deleted
 * @param failedDeleteBootstrapBaseFiles bootstrap base files that could not be deleted
 */
public HoodieCleanStat(HoodieCleaningPolicy policy, String partitionPath, List<String> deletePathPatterns,
List<String> successDeleteFiles, List<String> failedDeleteFiles,
String earliestCommitToRetain, List<String> deleteBootstrapBasePathPatterns,
List<String> successDeleteBootstrapBaseFiles,
List<String> failedDeleteBootstrapBaseFiles) {
this.policy = policy;
this.partitionPath = partitionPath;
this.deletePathPatterns = deletePathPatterns;
this.successDeleteFiles = successDeleteFiles;
this.failedDeleteFiles = failedDeleteFiles;
this.earliestCommitToRetain = earliestCommitToRetain;
this.deleteBootstrapBasePathPatterns = deleteBootstrapBasePathPatterns;
this.successDeleteBootstrapBaseFiles = successDeleteBootstrapBaseFiles;
this.failedDeleteBootstrapBaseFiles = failedDeleteBootstrapBaseFiles;
}
public HoodieCleaningPolicy getPolicy() {
@@ -72,6 +91,18 @@ public class HoodieCleanStat implements Serializable {
return failedDeleteFiles;
}
// Bootstrap base path patterns that were generated for the delete operation.
public List<String> getDeleteBootstrapBasePathPatterns() {
return deleteBootstrapBasePathPatterns;
}
// Bootstrap base files that were successfully deleted.
public List<String> getSuccessDeleteBootstrapBaseFiles() {
return successDeleteBootstrapBaseFiles;
}
// Bootstrap base files that could not be deleted.
public List<String> getFailedDeleteBootstrapBaseFiles() {
return failedDeleteBootstrapBaseFiles;
}
// Earliest commit that was retained in this clean.
public String getEarliestCommitToRetain() {
return earliestCommitToRetain;
}
@@ -91,6 +122,9 @@ public class HoodieCleanStat implements Serializable {
private List<String> failedDeleteFiles;
private String partitionPath;
private String earliestCommitToRetain;
private List<String> deleteBootstrapBasePathPatterns;
private List<String> successDeleteBootstrapBaseFiles;
private List<String> failedDeleteBootstrapBaseFiles;
public Builder withPolicy(HoodieCleaningPolicy policy) {
this.policy = policy;
@@ -112,6 +146,21 @@ public class HoodieCleanStat implements Serializable {
return this;
}
// Sets the bootstrap base path patterns generated for the delete operation.
public Builder withDeleteBootstrapBasePathPatterns(List<String> deletePathPatterns) {
this.deleteBootstrapBasePathPatterns = deletePathPatterns;
return this;
}
// Sets the bootstrap base files that were successfully deleted.
public Builder withSuccessfulDeleteBootstrapBaseFiles(List<String> successDeleteFiles) {
this.successDeleteBootstrapBaseFiles = successDeleteFiles;
return this;
}
// Sets the bootstrap base files that could not be deleted.
public Builder withFailedDeleteBootstrapBaseFiles(List<String> failedDeleteFiles) {
this.failedDeleteBootstrapBaseFiles = failedDeleteFiles;
return this;
}
// Sets the partition path this clean stat applies to.
public Builder withPartitionPath(String partitionPath) {
this.partitionPath = partitionPath;
return this;
@@ -125,7 +174,8 @@ public class HoodieCleanStat implements Serializable {
public HoodieCleanStat build() {
return new HoodieCleanStat(policy, partitionPath, deletePathPatterns, successDeleteFiles, failedDeleteFiles,
earliestCommitToRetain);
earliestCommitToRetain, deleteBootstrapBasePathPatterns, successDeleteBootstrapBaseFiles,
failedDeleteBootstrapBaseFiles);
}
}
@@ -137,7 +187,10 @@ public class HoodieCleanStat implements Serializable {
+ ", deletePathPatterns=" + deletePathPatterns
+ ", successDeleteFiles=" + successDeleteFiles
+ ", failedDeleteFiles=" + failedDeleteFiles
+ ", earliestCommitToRetain='" + earliestCommitToRetain + '\''
+ ", earliestCommitToRetain='" + earliestCommitToRetain
+ ", deleteBootstrapBasePathPatterns=" + deleteBootstrapBasePathPatterns
+ ", successDeleteBootstrapBaseFiles=" + successDeleteBootstrapBaseFiles
+ ", failedDeleteBootstrapBaseFiles=" + failedDeleteBootstrapBaseFiles + '\''
+ '}';
}
}

View File

@@ -0,0 +1,47 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.common.model;
import org.apache.hudi.avro.model.HoodieCleanFileInfo;
import java.io.Serializable;
/**
 * Immutable value holder for a single file targeted by a clean operation: the file's path
 * plus a flag marking whether it is a bootstrap base file.
 */
public class CleanFileInfo implements Serializable {

  private final String filePath;
  private final boolean bootstrapBaseFile;

  /**
   * @param filePath path of the file to be cleaned
   * @param isBootstrapBaseFile whether this file is a bootstrap base file
   */
  public CleanFileInfo(String filePath, boolean isBootstrapBaseFile) {
    this.filePath = filePath;
    this.bootstrapBaseFile = isBootstrapBaseFile;
  }

  /** @return path of the file to be cleaned */
  public String getFilePath() {
    return filePath;
  }

  /** @return true if this file is a bootstrap base file */
  public boolean isBootstrapBaseFile() {
    return bootstrapBaseFile;
  }

  /** Converts this instance to its avro-generated {@code HoodieCleanFileInfo} counterpart. */
  public HoodieCleanFileInfo toHoodieFileCleanInfo() {
    return new HoodieCleanFileInfo(filePath, bootstrapBaseFile);
  }
}

View File

@@ -29,7 +29,7 @@ public class CleanMetadataMigrator extends MetadataMigrator<HoodieCleanMetadata>
public CleanMetadataMigrator(HoodieTableMetaClient metaClient) {
super(metaClient,
Arrays
.asList(new CleanV1MigrationHandler(metaClient),
new CleanV2MigrationHandler(metaClient)));
.asList(new CleanMetadataV1MigrationHandler(metaClient),
new CleanMetadataV2MigrationHandler(metaClient)));
}
}

View File

@@ -31,11 +31,11 @@ import org.apache.hadoop.fs.Path;
import java.util.Map;
import java.util.stream.Collectors;
public class CleanV1MigrationHandler extends AbstractMigratorBase<HoodieCleanMetadata> {
public class CleanMetadataV1MigrationHandler extends AbstractMigratorBase<HoodieCleanMetadata> {
public static final Integer VERSION = 1;
public CleanV1MigrationHandler(HoodieTableMetaClient metaClient) {
public CleanMetadataV1MigrationHandler(HoodieTableMetaClient metaClient) {
super(metaClient);
}

View File

@@ -31,11 +31,11 @@ import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
public class CleanV2MigrationHandler extends AbstractMigratorBase<HoodieCleanMetadata> {
public class CleanMetadataV2MigrationHandler extends AbstractMigratorBase<HoodieCleanMetadata> {
public static final Integer VERSION = 2;
public CleanV2MigrationHandler(HoodieTableMetaClient metaClient) {
public CleanMetadataV2MigrationHandler(HoodieTableMetaClient metaClient) {
super(metaClient);
}

View File

@@ -0,0 +1,36 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.common.table.timeline.versioning.clean;
import org.apache.hudi.avro.model.HoodieCleanerPlan;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.versioning.MetadataMigrator;
import java.util.Arrays;
/**
 * Handles version upgrades/downgrades of cleaner plan metadata by delegating to the
 * per-version migration handlers (V1 and V2).
 */
public class CleanPlanMigrator extends MetadataMigrator<HoodieCleanerPlan> {

  public CleanPlanMigrator(HoodieTableMetaClient metaClient) {
    super(metaClient, Arrays.asList(
        new CleanPlanV1MigrationHandler(metaClient),
        new CleanPlanV2MigrationHandler(metaClient)));
  }
}

View File

@@ -0,0 +1,66 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.common.table.timeline.versioning.clean;
import java.util.HashMap;
import org.apache.hudi.avro.model.HoodieCleanerPlan;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.versioning.AbstractMigratorBase;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hadoop.fs.Path;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
/**
 * Migration handler for the version-1 cleaner plan format. V1 plans track files to delete
 * as plain relative file names per partition (filesToBeDeletedPerPartition); they cannot
 * represent bootstrap base files, which live outside the table base path.
 */
public class CleanPlanV1MigrationHandler extends AbstractMigratorBase<HoodieCleanerPlan> {

  public static final Integer VERSION = 1;

  public CleanPlanV1MigrationHandler(HoodieTableMetaClient metaClient) {
    super(metaClient);
  }

  @Override
  public Integer getManagedVersion() {
    return VERSION;
  }

  /**
   * V1 is the oldest plan format; there is no lower version to upgrade from.
   *
   * @throws IllegalArgumentException always
   */
  @Override
  public HoodieCleanerPlan upgradeFrom(HoodieCleanerPlan plan) {
    throw new IllegalArgumentException(
        "This is the lowest version. Plan cannot be any lower version");
  }

  /**
   * Downgrades a newer plan to V1 by reducing each absolute file path to its bare file name.
   *
   * @param plan the newer-version plan to downgrade
   * @return an equivalent V1 plan
   * @throws IllegalArgumentException if the table has a bootstrap base path, since V1 plans
   *         cannot reference bootstrap base files
   */
  @Override
  public HoodieCleanerPlan downgradeFrom(HoodieCleanerPlan plan) {
    if (metaClient.getTableConfig().getBootstrapBasePath().isPresent()) {
      // Fixed grammar in the user-facing message ("do not" -> "does not").
      throw new IllegalArgumentException(
          "This version does not support METADATA_ONLY bootstrapped tables. Failed to downgrade.");
    }
    // Strip each absolute path down to its file name, which is the V1 representation.
    Map<String, List<String>> filesPerPartition = plan.getFilePathsToBeDeletedPerPartition().entrySet().stream()
        .map(e -> Pair.of(e.getKey(), e.getValue().stream().map(v -> new Path(v.getFilePath()).getName())
            .collect(Collectors.toList())))
        .collect(Collectors.toMap(Pair::getKey, Pair::getValue));
    return new HoodieCleanerPlan(plan.getEarliestInstantToRetain(), plan.getPolicy(), filesPerPartition, VERSION,
        new HashMap<>());
  }
}

View File

@@ -0,0 +1,64 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.common.table.timeline.versioning.clean;
import org.apache.hudi.avro.model.HoodieCleanFileInfo;
import org.apache.hudi.avro.model.HoodieCleanerPlan;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.versioning.AbstractMigratorBase;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hadoop.fs.Path;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
/**
 * Migration handler for the version-2 cleaner plan format. V2 plans track files to delete
 * as HoodieCleanFileInfo entries (full path plus a bootstrap base-file flag) under
 * filePathsToBeDeletedPerPartition.
 */
public class CleanPlanV2MigrationHandler extends AbstractMigratorBase<HoodieCleanerPlan> {

  public static final Integer VERSION = 2;

  public CleanPlanV2MigrationHandler(HoodieTableMetaClient metaClient) {
    super(metaClient);
  }

  @Override
  public Integer getManagedVersion() {
    return VERSION;
  }

  /**
   * Upgrades a V1 plan by expanding every relative file name into a full path under its
   * partition and wrapping it in a HoodieCleanFileInfo (the bootstrap flag is always false,
   * as V1 plans cannot reference bootstrap base files).
   */
  @Override
  public HoodieCleanerPlan upgradeFrom(HoodieCleanerPlan plan) {
    Map<String, List<HoodieCleanFileInfo>> filePathsPerPartition = new HashMap<>();
    plan.getFilesToBeDeletedPerPartition().forEach((partition, fileNames) -> {
      List<HoodieCleanFileInfo> fileInfos = fileNames.stream()
          .map(fileName -> new HoodieCleanFileInfo(
              new Path(FSUtils.getPartitionPath(metaClient.getBasePath(), partition), fileName).toString(), false))
          .collect(Collectors.toList());
      filePathsPerPartition.put(partition, fileInfos);
    });
    return new HoodieCleanerPlan(plan.getEarliestInstantToRetain(), plan.getPolicy(), new HashMap<>(), VERSION,
        filePathsPerPartition);
  }

  /**
   * V2 is the newest plan format; there is no higher version to downgrade from.
   *
   * @throws IllegalArgumentException always
   */
  @Override
  public HoodieCleanerPlan downgradeFrom(HoodieCleanerPlan input) {
    throw new IllegalArgumentException(
        "This is the current highest version. Plan cannot be any higher version");
  }
}

View File

@@ -252,7 +252,8 @@ public abstract class IncrementalTimelineSyncFileSystemView extends AbstractTabl
}
/**
* Add newly found clean instant.
* Add newly found clean instant. Note that cleaner metadata (.clean.completed)
* contains only relative paths unlike clean plans (.clean.requested) which contains absolute paths.
*
* @param timeline Timeline
* @param instant Clean instant

View File

@@ -18,16 +18,20 @@
package org.apache.hudi.common.util;
import java.util.stream.Collectors;
import org.apache.hudi.avro.model.HoodieCleanFileInfo;
import org.apache.hudi.avro.model.HoodieCleanMetadata;
import org.apache.hudi.avro.model.HoodieCleanPartitionMetadata;
import org.apache.hudi.avro.model.HoodieCleanerPlan;
import org.apache.hudi.common.HoodieCleanStat;
import org.apache.hudi.common.model.CleanFileInfo;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.timeline.TimelineMetadataUtils;
import org.apache.hudi.common.table.timeline.versioning.clean.CleanMetadataMigrator;
import org.apache.hudi.common.table.timeline.versioning.clean.CleanV1MigrationHandler;
import org.apache.hudi.common.table.timeline.versioning.clean.CleanV2MigrationHandler;
import org.apache.hudi.common.table.timeline.versioning.clean.CleanMetadataV1MigrationHandler;
import org.apache.hudi.common.table.timeline.versioning.clean.CleanMetadataV2MigrationHandler;
import org.apache.hudi.common.table.timeline.versioning.clean.CleanPlanMigrator;
import java.io.IOException;
import java.util.HashMap;
@@ -35,14 +39,16 @@ import java.util.List;
import java.util.Map;
public class CleanerUtils {
public static final Integer CLEAN_METADATA_VERSION_1 = CleanV1MigrationHandler.VERSION;
public static final Integer CLEAN_METADATA_VERSION_2 = CleanV2MigrationHandler.VERSION;
public static final Integer CLEAN_METADATA_VERSION_1 = CleanMetadataV1MigrationHandler.VERSION;
public static final Integer CLEAN_METADATA_VERSION_2 = CleanMetadataV2MigrationHandler.VERSION;
public static final Integer LATEST_CLEAN_METADATA_VERSION = CLEAN_METADATA_VERSION_2;
public static HoodieCleanMetadata convertCleanMetadata(String startCleanTime,
Option<Long> durationInMs,
List<HoodieCleanStat> cleanStats) {
Map<String, HoodieCleanPartitionMetadata> partitionMetadataMap = new HashMap<>();
Map<String, HoodieCleanPartitionMetadata> partitionBootstrapMetadataMap = new HashMap<>();
int totalDeleted = 0;
String earliestCommitToRetain = null;
for (HoodieCleanStat stat : cleanStats) {
@@ -50,6 +56,13 @@ public class CleanerUtils {
new HoodieCleanPartitionMetadata(stat.getPartitionPath(), stat.getPolicy().name(),
stat.getDeletePathPatterns(), stat.getSuccessDeleteFiles(), stat.getFailedDeleteFiles());
partitionMetadataMap.put(stat.getPartitionPath(), metadata);
if ((null != stat.getDeleteBootstrapBasePathPatterns())
&& (!stat.getDeleteBootstrapBasePathPatterns().isEmpty())) {
HoodieCleanPartitionMetadata bootstrapMetadata = new HoodieCleanPartitionMetadata(stat.getPartitionPath(),
stat.getPolicy().name(), stat.getDeleteBootstrapBasePathPatterns(), stat.getSuccessDeleteBootstrapBaseFiles(),
stat.getFailedDeleteBootstrapBaseFiles());
partitionBootstrapMetadataMap.put(stat.getPartitionPath(), bootstrapMetadata);
}
totalDeleted += stat.getSuccessDeleteFiles().size();
if (earliestCommitToRetain == null) {
// This will be the same for all partitions
@@ -57,8 +70,8 @@ public class CleanerUtils {
}
}
return new HoodieCleanMetadata(startCleanTime,
durationInMs.orElseGet(() -> -1L), totalDeleted, earliestCommitToRetain, partitionMetadataMap, CLEAN_METADATA_VERSION_2);
return new HoodieCleanMetadata(startCleanTime, durationInMs.orElseGet(() -> -1L), totalDeleted,
earliestCommitToRetain, partitionMetadataMap, CLEAN_METADATA_VERSION_2, partitionBootstrapMetadataMap);
}
/**
@@ -77,7 +90,7 @@ public class CleanerUtils {
}
/**
* Get Cleaner Plan corresponding to a clean instant.
* Get Latest version of cleaner plan corresponding to a clean instant.
* @param metaClient Hoodie Table Meta Client
* @param cleanInstant Instant referring to clean action
* @return Cleaner plan corresponding to clean instant
@@ -85,7 +98,18 @@ public class CleanerUtils {
*/
public static HoodieCleanerPlan getCleanerPlan(HoodieTableMetaClient metaClient, HoodieInstant cleanInstant)
throws IOException {
return TimelineMetadataUtils.deserializeAvroMetadata(metaClient.getActiveTimeline().readCleanerInfoAsBytes(cleanInstant).get(),
HoodieCleanerPlan.class);
CleanPlanMigrator cleanPlanMigrator = new CleanPlanMigrator(metaClient);
HoodieCleanerPlan cleanerPlan = TimelineMetadataUtils.deserializeAvroMetadata(
metaClient.getActiveTimeline().readCleanerInfoAsBytes(cleanInstant).get(), HoodieCleanerPlan.class);
return cleanPlanMigrator.upgradeToLatest(cleanerPlan, cleanerPlan.getVersion());
}
/**
 * Converts a list of {@link CleanFileInfo} instances to their avro-generated
 * {@link HoodieCleanFileInfo} equivalents via {@code CleanFileInfo#toHoodieFileCleanInfo}.
 *
 * @param cleanFileInfoList list of CleanFileInfo instances to convert
 * @return list of avro-generated HoodieCleanFileInfo instances, in the same order
 */
public static List<HoodieCleanFileInfo> convertToHoodieCleanFileInfoList(List<CleanFileInfo> cleanFileInfoList) {
return cleanFileInfoList.stream().map(CleanFileInfo::toHoodieFileCleanInfo).collect(Collectors.toList());
}
}