[HUDI-2950] Addressing performance traps in Bulk Insert/Layout Optimization (#4234)
* Cleaned up Z-curve/Hilbert ordering seqs: - Streamlined flow - Removed unnecessary operations (double-mapping, boxing, etc) Updated `CollectionUtils::combine` to avoid AL resizing * Tidying up * Reducing small objects churn due to Scala/Java conversions by re-using `RowFactory`, passing `Object[]` * Fixing name resolution (disambiguation overloads) * `lint` * Replaced `OverwriteAvroPayloadRecord` w/ `RewriteRecordPayload` to avoid unnecessary Avro ser/de loop * Added `PathCachingFileName` to avoid fetching substrings every time file-name is fetched; Inject `PathCachingFileName` into `HoodieWrapperFileSystem.convertPathWithScheme` * Drastically reducing size of the `ArrayDeque` allocated by `ObjectSizeCalculator` * XXX * Missing license * Fixed refs (after rebase) * Fixing compilation failure in Scala 2.11 * `PathCachingFileName` > `FileNameCachingPath` * Tidying up
This commit is contained in:
@@ -596,6 +596,7 @@ public class HoodieAvroUtils {
|
||||
if (columns.length == 1) {
|
||||
return HoodieAvroUtils.getNestedFieldVal(genericRecord, columns[0], true, consistentLogicalTimestampEnabled);
|
||||
} else {
|
||||
// TODO this is inefficient, instead we can simply return array of Comparable
|
||||
StringBuilder sb = new StringBuilder();
|
||||
for (String col : columns) {
|
||||
sb.append(HoodieAvroUtils.getNestedFieldValAsString(genericRecord, col, true, consistentLogicalTimestampEnabled));
|
||||
|
||||
@@ -48,6 +48,7 @@ import org.apache.hadoop.fs.permission.FsPermission;
|
||||
import org.apache.hadoop.security.Credentials;
|
||||
import org.apache.hadoop.security.token.Token;
|
||||
import org.apache.hadoop.util.Progressable;
|
||||
import org.apache.hudi.hadoop.FileNameCachingPath;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.URI;
|
||||
@@ -141,7 +142,7 @@ public class HoodieWrapperFileSystem extends FileSystem {
|
||||
try {
|
||||
newURI = new URI(newScheme, oldURI.getUserInfo(), oldURI.getHost(), oldURI.getPort(), oldURI.getPath(),
|
||||
oldURI.getQuery(), oldURI.getFragment());
|
||||
return new Path(newURI);
|
||||
return new FileNameCachingPath(newURI);
|
||||
} catch (URISyntaxException e) {
|
||||
// TODO - Better Exception handling
|
||||
throw new RuntimeException(e);
|
||||
|
||||
@@ -20,6 +20,7 @@ package org.apache.hudi.common.util;
|
||||
|
||||
import org.apache.hudi.common.util.collection.Pair;
|
||||
|
||||
import java.lang.reflect.Array;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
@@ -37,11 +38,35 @@ public class CollectionUtils {
|
||||
|
||||
public static final Properties EMPTY_PROPERTIES = new Properties();
|
||||
|
||||
/**
|
||||
* Combines provided arrays into one
|
||||
*/
|
||||
@SuppressWarnings("unchecked")
|
||||
public static <T> T[] combine(T[] one, T[] another) {
|
||||
T[] combined = (T[]) Array.newInstance(one.getClass().getComponentType(), one.length + another.length);
|
||||
System.arraycopy(one, 0, combined, 0, one.length);
|
||||
System.arraycopy(another, 0, combined, one.length, another.length);
|
||||
return combined;
|
||||
}
|
||||
|
||||
/**
|
||||
* Combines provided array and an element into a new array
|
||||
*/
|
||||
@SuppressWarnings("unchecked")
|
||||
public static <T> T[] append(T[] array, T elem) {
|
||||
T[] combined = (T[]) Array.newInstance(array.getClass().getComponentType(), array.length + 1);
|
||||
System.arraycopy(array, 0, combined, 0, array.length);
|
||||
combined[array.length] = elem;
|
||||
return combined;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Combines provided {@link List}s into one
|
||||
*/
|
||||
public static <E> List<E> combine(List<E> one, List<E> another) {
|
||||
ArrayList<E> combined = new ArrayList<>(one);
|
||||
ArrayList<E> combined = new ArrayList<>(one.size() + another.size());
|
||||
combined.addAll(one);
|
||||
combined.addAll(another);
|
||||
return combined;
|
||||
}
|
||||
|
||||
@@ -90,7 +90,7 @@ public class ObjectSizeCalculator {
|
||||
private final Map<Class<?>, ClassSizeInfo> classSizeInfos = new IdentityHashMap<>();
|
||||
|
||||
private final Set<Object> alreadyVisited = Collections.newSetFromMap(new IdentityHashMap<>());
|
||||
private final Deque<Object> pending = new ArrayDeque<>(16 * 1024);
|
||||
private final Deque<Object> pending = new ArrayDeque<>(64);
|
||||
private long size;
|
||||
|
||||
/**
|
||||
|
||||
@@ -0,0 +1,48 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.hadoop;
|
||||
|
||||
import org.apache.hadoop.fs.Path;
|
||||
|
||||
import java.net.URI;
|
||||
|
||||
/**
|
||||
* NOTE: This class is thread-safe
|
||||
*/
|
||||
public class FileNameCachingPath extends Path {
|
||||
|
||||
// NOTE: volatile keyword is redundant here and put mostly for reader notice, since all
|
||||
// reads/writes to references are always atomic (including 64-bit JVMs)
|
||||
// https://docs.oracle.com/javase/specs/jls/se8/html/jls-17.html#jls-17.7
|
||||
private volatile String fileName;
|
||||
|
||||
public FileNameCachingPath(URI aUri) {
|
||||
super(aUri);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getName() {
|
||||
// This value could be overwritten concurrently and that's okay, since
|
||||
// {@code Path} is immutable
|
||||
if (fileName == null) {
|
||||
fileName = super.getName();
|
||||
}
|
||||
return fileName;
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user