1
0

[HUDI-2950] Addressing performance traps in Bulk Insert/Layout Optimization (#4234)

* Cleaned up Z-curve/Hilbert ordering seqs:
  - Streamlined flow
  - Removed unnecessary operations (double-mapping, boxing, etc)
Updated `CollectionUtils::combine` to avoid `ArrayList` resizing

* Tidying up

* Reducing small objects churn due to Scala/Java conversions by re-using `RowFactory`, passing `Object[]`

* Fixing name resolution (disambiguation overloads)

* `lint`

* Replaced `OverwriteAvroPayloadRecord` w/ `RewriteRecordPayload` to avoid unnecessary Avro ser/de loop

* Added `PathCachingFileName` to avoid fetching substrings every time file-name is fetched;
Inject `PathCachingFileName` into `HoodieWrapperFileSystem.convertPathWithScheme`

* Drastically reducing size of the `ArrayDeque` allocated by `ObjectSizeCalculator`

* XXX

* Missing license

* Fixed refs (after rebase)

* Fixing compilation failure in Scala 2.11

* `PathCachingFileName` > `FileNameCachingPath`

* Tidying up
This commit is contained in:
Alexey Kudinkin
2022-01-10 18:23:22 -08:00
committed by GitHub
parent c8df9b09d7
commit f1e3762a94
7 changed files with 182 additions and 95 deletions

View File

@@ -596,6 +596,7 @@ public class HoodieAvroUtils {
if (columns.length == 1) {
return HoodieAvroUtils.getNestedFieldVal(genericRecord, columns[0], true, consistentLogicalTimestampEnabled);
} else {
// TODO this is inefficient, instead we can simply return array of Comparable
StringBuilder sb = new StringBuilder();
for (String col : columns) {
sb.append(HoodieAvroUtils.getNestedFieldValAsString(genericRecord, col, true, consistentLogicalTimestampEnabled));

View File

@@ -48,6 +48,7 @@ import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.security.Credentials;
import org.apache.hadoop.security.token.Token;
import org.apache.hadoop.util.Progressable;
import org.apache.hudi.hadoop.FileNameCachingPath;
import java.io.IOException;
import java.net.URI;
@@ -141,7 +142,7 @@ public class HoodieWrapperFileSystem extends FileSystem {
try {
newURI = new URI(newScheme, oldURI.getUserInfo(), oldURI.getHost(), oldURI.getPort(), oldURI.getPath(),
oldURI.getQuery(), oldURI.getFragment());
return new Path(newURI);
return new FileNameCachingPath(newURI);
} catch (URISyntaxException e) {
// TODO - Better Exception handling
throw new RuntimeException(e);

View File

@@ -20,6 +20,7 @@ package org.apache.hudi.common.util;
import org.apache.hudi.common.util.collection.Pair;
import java.lang.reflect.Array;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
@@ -37,11 +38,35 @@ public class CollectionUtils {
public static final Properties EMPTY_PROPERTIES = new Properties();
/**
 * Concatenates two arrays into a newly allocated one: all elements of {@code one}
 * come first, followed by all elements of {@code another}.
 *
 * <p>The runtime component type of the result is taken from {@code one}.
 */
@SuppressWarnings("unchecked")
public static <T> T[] combine(T[] one, T[] another) {
  final int headLength = one.length;
  final int tailLength = another.length;
  final Class<?> elementType = one.getClass().getComponentType();

  final T[] result = (T[]) Array.newInstance(elementType, headLength + tailLength);
  // Head occupies [0, headLength), tail is placed right after it
  System.arraycopy(one, 0, result, 0, headLength);
  System.arraycopy(another, 0, result, headLength, tailLength);
  return result;
}
/**
 * Returns a new array holding every element of {@code array} followed by {@code elem}.
 *
 * <p>The runtime component type of the result is taken from {@code array}; the
 * input array itself is left untouched.
 */
@SuppressWarnings("unchecked")
public static <T> T[] append(T[] array, T elem) {
  final int originalLength = array.length;
  final Class<?> elementType = array.getClass().getComponentType();

  final T[] extended = (T[]) Array.newInstance(elementType, originalLength + 1);
  System.arraycopy(array, 0, extended, 0, originalLength);
  // The extra slot at the very end receives the appended element
  extended[originalLength] = elem;
  return extended;
}
/**
 * Combines provided {@link List}s into one
 *
 * The result is a new {@link ArrayList} holding the elements of {@code one}
 * followed by the elements of {@code another}; neither input is modified here.
 */
public static <E> List<E> combine(List<E> one, List<E> another) {
// NOTE(review): this line and the next both declare `combined` — this looks like the
// removed/added pair of a diff rendered without +/- markers; presumably only the
// pre-sized variant below belongs to the resulting file — confirm
ArrayList<E> combined = new ArrayList<>(one);
// Allocate capacity for both inputs up-front, so that the addAll calls below
// do not have to grow the list
ArrayList<E> combined = new ArrayList<>(one.size() + another.size());
combined.addAll(one);
combined.addAll(another);
return combined;
}

View File

@@ -90,7 +90,7 @@ public class ObjectSizeCalculator {
private final Map<Class<?>, ClassSizeInfo> classSizeInfos = new IdentityHashMap<>();
private final Set<Object> alreadyVisited = Collections.newSetFromMap(new IdentityHashMap<>());
private final Deque<Object> pending = new ArrayDeque<>(16 * 1024);
private final Deque<Object> pending = new ArrayDeque<>(64);
private long size;
/**

View File

@@ -0,0 +1,48 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.hadoop;
import org.apache.hadoop.fs.Path;
import java.net.URI;
/**
* NOTE: This class is thread-safe
*/
public class FileNameCachingPath extends Path {
// NOTE: volatile keyword is redundant here and put mostly for reader notice, since all
// reads/writes to references are always atomic (including 64-bit JVMs)
// https://docs.oracle.com/javase/specs/jls/se8/html/jls-17.html#jls-17.7
private volatile String fileName;
public FileNameCachingPath(URI aUri) {
super(aUri);
}
@Override
public String getName() {
// This value could be overwritten concurrently and that's okay, since
// {@code Path} is immutable
if (fileName == null) {
fileName = super.getName();
}
return fileName;
}
}