[HUDI-2778] Optimize statistics collection code, add some docs for z-order, and fix some bugs (#4013)

* [HUDI-2778] Optimize statistics collection code and add more docs for z-order. * Add test code for multi-threaded parquet footer reads
2021-11-23 13:46:02 +08:00
parent c88c2af8bf
commit 9de9951348
6 changed files with 175 additions and 39 deletions
--- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieClusteringConfig.java
+++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieClusteringConfig.java
@@ -142,6 +142,17 @@ public class HoodieClusteringConfig extends HoodieConfig {
      .sinceVersion("0.9.0")
      .withDocumentation("When rewriting data, preserves existing hoodie_commit_time");

+  /**
+   * Uses space-filling curves to optimize the layout of the table and boost query performance.
+   * Table data that is sorted by a space-filling curve is better aggregated;
+   * combined with min-max filtering, it can achieve a good performance improvement.
+   *
+   * Notice:
+   * When using this feature, the sort columns need to be specified.
+   * The more columns involved in sorting, the worse the aggregation, and the smaller the query performance improvement.
+   * Choose the filter columns that are commonly used in query SQL as the sort columns.
+   * It is recommended that 2 ~ 4 columns participate in sorting.
+   */
  public static final ConfigProperty LAYOUT_OPTIMIZE_ENABLE = ConfigProperty
      .key(LAYOUT_OPTIMIZE_PARAM_PREFIX + "enable")
      .defaultValue(false)
--- a/hudi-client/hudi-spark-client/src/main/java/org/apache/spark/ZCurveOptimizeHelper.java
+++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/spark/ZCurveOptimizeHelper.java
@@ -230,18 +230,14 @@ public class ZCurveOptimizeHelper {
            rows.add(currentColRangeMetaData.getMinValue());
            rows.add(currentColRangeMetaData.getMaxValue());
          } else if (colType instanceof StringType) {
-            String minString = new String(((Binary)currentColRangeMetaData.getMinValue()).getBytes());
-            String maxString = new String(((Binary)currentColRangeMetaData.getMaxValue()).getBytes());
-            rows.add(minString);
-            rows.add(maxString);
+            rows.add(currentColRangeMetaData.getMinValueAsString());
+            rows.add(currentColRangeMetaData.getMaxValueAsString());
          } else if (colType instanceof DecimalType) {
-            Double minDecimal = Double.parseDouble(currentColRangeMetaData.getStringifier().stringify(Long.valueOf(currentColRangeMetaData.getMinValue().toString())));
-            Double maxDecimal = Double.parseDouble(currentColRangeMetaData.getStringifier().stringify(Long.valueOf(currentColRangeMetaData.getMaxValue().toString())));
-            rows.add(BigDecimal.valueOf(minDecimal));
-            rows.add(BigDecimal.valueOf(maxDecimal));
+            rows.add(new BigDecimal(currentColRangeMetaData.getMinValueAsString()));
+            rows.add(new BigDecimal(currentColRangeMetaData.getMaxValueAsString()));
          } else if (colType instanceof DateType) {
-            rows.add(java.sql.Date.valueOf(currentColRangeMetaData.getStringifier().stringify((int)currentColRangeMetaData.getMinValue())));
-            rows.add(java.sql.Date.valueOf(currentColRangeMetaData.getStringifier().stringify((int)currentColRangeMetaData.getMaxValue())));
+            rows.add(java.sql.Date.valueOf(currentColRangeMetaData.getMinValueAsString()));
+            rows.add(java.sql.Date.valueOf(currentColRangeMetaData.getMaxValueAsString()));
          } else if (colType instanceof LongType) {
            rows.add(currentColRangeMetaData.getMinValue());
            rows.add(currentColRangeMetaData.getMaxValue());
@@ -344,6 +340,8 @@ public class ZCurveOptimizeHelper {
          List columns = Arrays.asList(statisticsDF.schema().fieldNames());
          spark.sql(HoodieSparkUtils$
              .MODULE$.createMergeSql(originalTable, updateTable, JavaConversions.asScalaBuffer(columns))).repartition(1).write().save(savePath.toString());
+        } else {
+          statisticsDF.repartition(1).write().mode("overwrite").save(savePath.toString());
        }
      } else {
        statisticsDF.repartition(1).write().mode("overwrite").save(savePath.toString());