1
0
Files
hudi/hoodie-client/src/main/java/com/uber/hoodie/io/ConsistencyCheck.java
vinothchandar 9ca6f91e97 Perform consistency checks during write finalize
- Check to ensure written files are listable on storage
 - Docs reflected to capture how this helps with s3 storage
 - Unit tests added, corrections to existing tests
 - Fix DeltaStreamer to manage archived commits in a separate folder
2018-09-28 08:04:41 +05:30

113 lines
3.9 KiB
Java

/*
* Copyright (c) 2018 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.uber.hoodie.io;
import com.google.common.annotations.VisibleForTesting;
import com.uber.hoodie.common.SerializableConfiguration;
import com.uber.hoodie.common.util.FSUtils;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.api.java.JavaSparkContext;
/**
* Checks if all the written paths have their metadata consistent on storage and thus be listable to
* queries. This is important for cloud, stores like AWS S3 which are eventually consistent with
* their metadata. Without such checks, we may proceed to commit the written data, without the
* written data being made available to queries. In cases like incremental pull this can lead to
* downstream readers failing to ever see some data.
*/
public class ConsistencyCheck implements Serializable {
private static final transient Logger log = LogManager.getLogger(ConsistencyCheck.class);
private String basePath;
private List<String> relPaths;
private transient JavaSparkContext jsc;
private SerializableConfiguration hadoopConf;
private int parallelism;
public ConsistencyCheck(String basePath, List<String> relPaths, JavaSparkContext jsc,
int parallelism) {
this.basePath = basePath;
this.relPaths = relPaths;
this.jsc = jsc;
this.hadoopConf = new SerializableConfiguration(jsc.hadoopConfiguration());
this.parallelism = parallelism;
}
@VisibleForTesting
void sleepSafe(long waitMs) {
try {
Thread.sleep(waitMs);
} catch (InterruptedException e) {
// ignore & continue next attempt
}
}
/**
* Repeatedly lists the filesystem on the paths, with exponential backoff and marks paths found as
* passing the check.
*
* @return list of (relative) paths failing the check
*/
public List<String> check(int maxAttempts, long initalDelayMs) {
long waitMs = initalDelayMs;
int attempt = 0;
List<String> remainingPaths = new ArrayList<>(relPaths);
while (attempt++ < maxAttempts) {
remainingPaths = jsc.parallelize(remainingPaths, parallelism)
.groupBy(p -> new Path(basePath, p).getParent()) // list by partition
.map(pair -> {
FileSystem fs = FSUtils.getFs(basePath, hadoopConf.get());
// list the partition path and obtain all file paths present
Set<String> fileNames = Arrays.stream(fs.listStatus(pair._1()))
.map(s -> s.getPath().getName())
.collect(Collectors.toSet());
// only return paths that can't be found
return StreamSupport.stream(pair._2().spliterator(), false)
.filter(p -> !fileNames.contains(new Path(basePath, p).getName()))
.collect(Collectors.toList());
})
.flatMap(itr -> itr.iterator()).collect();
if (remainingPaths.size() == 0) {
break; // we are done.
}
log.info("Consistency check, waiting for " + waitMs + " ms , after attempt :" + attempt);
sleepSafe(waitMs);
waitMs = waitMs * 2; // double check interval every attempt
}
return remainingPaths;
}
}