[HUDI-508] Standardizing on "Table" instead of "Dataset" across code (#1197)
- Docs were talking about storage types before; cWiki moved to "Table"
- Most of the code already has HoodieTable, HoodieTableMetaClient - correct naming
- Replacing remaining use of dataset across code/comments
- Few usages in comments and use of Spark SQL Dataset remain unscathed
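The user-visible edge of the rename is the exception type: InvalidDatasetException becomes InvalidTableException. A minimal sketch of the adjustment downstream callers need, using only names that appear in this diff (the wrapper class itself is illustrative, and package names are assumed from the imports shown below):

import org.apache.hudi.exception.InvalidTableException;
import org.apache.hudi.hive.HoodieHiveClient;
import org.apache.parquet.schema.MessageType;

public class SchemaProbe {
  // Before this commit the catch below would have named
  // InvalidDatasetException; the failure mode (basePath holds no valid
  // hoodie table, or no commit to read a schema from) is unchanged.
  static MessageType schemaOrNull(HoodieHiveClient client) {
    try {
      return client.getDataSchema();
    } catch (InvalidTableException e) {
      return null;
    }
  }
}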
@@ -44,7 +44,7 @@ public class HiveSyncConfig implements Serializable {
   @Parameter(names = {"--jdbc-url"}, description = "Hive jdbc connect url", required = true)
   public String jdbcUrl;

-  @Parameter(names = {"--base-path"}, description = "Basepath of hoodie dataset to sync", required = true)
+  @Parameter(names = {"--base-path"}, description = "Basepath of hoodie table to sync", required = true)
   public String basePath;

   @Parameter(names = "--partitioned-by", description = "Fields in the schema partitioned by")
@@ -20,7 +20,7 @@ package org.apache.hudi.hive;

 import org.apache.hudi.common.util.FSUtils;
 import org.apache.hudi.common.util.Option;
-import org.apache.hudi.exception.InvalidDatasetException;
+import org.apache.hudi.exception.InvalidTableException;
 import org.apache.hudi.hadoop.HoodieParquetInputFormat;
 import org.apache.hudi.hadoop.realtime.HoodieParquetRealtimeInputFormat;
 import org.apache.hudi.hive.HoodieHiveClient.PartitionEvent;
@@ -43,7 +43,7 @@ import java.util.Map;
 import java.util.stream.Collectors;

 /**
- * Tool to sync a hoodie HDFS dataset with a hive metastore table. Either use it as a api
+ * Tool to sync a hoodie HDFS table with a hive metastore table. Either use it as a api
  * HiveSyncTool.syncHoodieTable(HiveSyncConfig) or as a command line java -cp hoodie-hive.jar HiveSyncTool [args]
  * <p>
  * This utility will get the schema from the latest commit and will sync hive table schema Also this will sync the
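Per the javadoc above, the tool runs either programmatically or via java -cp hoodie-hive.jar HiveSyncTool [args]. A hedged sketch of the programmatic path, built from the constructor and config fields visible elsewhere in this diff (the Hadoop/Hive setup and the path are illustrative):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hudi.hive.HiveSyncConfig;
import org.apache.hudi.hive.HiveSyncTool;

public class SyncExample {
  public static void main(String[] args) throws Exception {
    HiveSyncConfig cfg = new HiveSyncConfig();
    cfg.jdbcUrl = "jdbc:hive2://localhost:10000"; // --jdbc-url
    cfg.basePath = "/tmp/hoodie/sample_table";    // --base-path (hypothetical)
    // other required fields (database, table, credentials) omitted here

    FileSystem fs = FileSystem.get(new Configuration());
    // same (config, HiveConf, FileSystem) constructor the tests below use
    HiveSyncTool tool = new HiveSyncTool(cfg, new HiveConf(), fs);
    tool.syncHoodieTable(); // one-shot sync
  }
}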
@@ -80,7 +80,7 @@ public class HiveSyncTool {
           break;
         default:
           LOG.error("Unknown table type " + hoodieHiveClient.getTableType());
-          throw new InvalidDatasetException(hoodieHiveClient.getBasePath());
+          throw new InvalidTableException(hoodieHiveClient.getBasePath());
       }
     } catch (RuntimeException re) {
       LOG.error("Got runtime exception when hive syncing", re);
@@ -95,7 +95,7 @@ public class HiveSyncTool {

     // Check if the necessary table exists
     boolean tableExists = hoodieHiveClient.doesTableExist();
-    // Get the parquet schema for this dataset looking at the latest commit
+    // Get the parquet schema for this table looking at the latest commit
     MessageType schema = hoodieHiveClient.getDataSchema();
     // Sync schema if needed
     syncSchema(tableExists, isRealTime, schema);
@@ -146,7 +146,7 @@ public class HiveSyncTool {
             ParquetHiveSerDe.class.getName());
       }
     } else {
-      // Check if the dataset schema has evolved
+      // Check if the table schema has evolved
       Map<String, String> tableSchema = hoodieHiveClient.getTableSchema();
       SchemaDifference schemaDiff = SchemaUtil.getSchemaDifference(schema, tableSchema, cfg.partitionFields);
       if (!schemaDiff.isEmpty()) {
@@ -30,7 +30,7 @@ import org.apache.hudi.common.util.FSUtils;
 import org.apache.hudi.common.util.Option;
 import org.apache.hudi.common.util.collection.Pair;
 import org.apache.hudi.exception.HoodieIOException;
-import org.apache.hudi.exception.InvalidDatasetException;
+import org.apache.hudi.exception.InvalidTableException;
 import org.apache.hudi.hive.util.SchemaUtil;

 import com.google.common.base.Preconditions;
@@ -337,10 +337,10 @@ public class HoodieHiveClient {
   }

   /**
-   * Gets the schema for a hoodie dataset. Depending on the type of table, read from any file written in the latest
+   * Gets the schema for a hoodie table. Depending on the type of table, read from any file written in the latest
    * commit. We will assume that the schema has not changed within a single atomic write.
    *
-   * @return Parquet schema for this dataset
+   * @return Parquet schema for this table
    */
   @SuppressWarnings("WeakerAccess")
   public MessageType getDataSchema() {
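getDataSchema() is what the sync flow compares against Hive's current view of the table. A small sketch of that comparison, built only from calls appearing in this diff (the package for SchemaDifference is assumed):

import java.util.Map;

import org.apache.hudi.hive.HiveSyncConfig;
import org.apache.hudi.hive.HoodieHiveClient;
import org.apache.hudi.hive.SchemaDifference; // package assumed
import org.apache.hudi.hive.util.SchemaUtil;
import org.apache.parquet.schema.MessageType;

public class SchemaDriftProbe {
  // Sketch of the comparison the HiveSyncTool hunk above performs: the
  // storage schema read from the latest commit vs. the schema Hive
  // currently has for the table, diffed field by field.
  static void printSchemaDrift(HoodieHiveClient client, HiveSyncConfig cfg) {
    MessageType storageSchema = client.getDataSchema();       // latest commit
    Map<String, String> hiveSchema = client.getTableSchema(); // metastore view
    SchemaDifference diff = SchemaUtil.getSchemaDifference(storageSchema, hiveSchema, cfg.partitionFields);
    if (!diff.isEmpty()) {
      System.out.println("Schema drift detected: " + diff);
    }
  }
}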
@@ -350,12 +350,12 @@ public class HoodieHiveClient {
         // If this is COW, get the last commit and read the schema from a file written in the
         // last commit
         HoodieInstant lastCommit =
-            activeTimeline.lastInstant().orElseThrow(() -> new InvalidDatasetException(syncConfig.basePath));
+            activeTimeline.lastInstant().orElseThrow(() -> new InvalidTableException(syncConfig.basePath));
         HoodieCommitMetadata commitMetadata = HoodieCommitMetadata
             .fromBytes(activeTimeline.getInstantDetails(lastCommit).get(), HoodieCommitMetadata.class);
         String filePath = commitMetadata.getFileIdAndFullPaths(metaClient.getBasePath()).values().stream().findAny()
             .orElseThrow(() -> new IllegalArgumentException("Could not find any data file written for commit "
-                + lastCommit + ", could not get schema for dataset " + metaClient.getBasePath() + ", Metadata :"
+                + lastCommit + ", could not get schema for table " + metaClient.getBasePath() + ", Metadata :"
                 + commitMetadata));
         return readSchemaFromDataFile(new Path(filePath));
       case MERGE_ON_READ:
@@ -390,7 +390,7 @@ public class HoodieHiveClient {
               .filter(s -> s.contains((metaClient.getTableConfig().getROFileFormat().getFileExtension())))
               .findAny().map(f -> Pair.of(f, HoodieFileFormat.PARQUET)).orElseThrow(() -> {
                 return new IllegalArgumentException("Could not find any data file written for commit "
-                    + lastDeltaInstant + ", could not get schema for dataset " + metaClient.getBasePath()
+                    + lastDeltaInstant + ", could not get schema for table " + metaClient.getBasePath()
                     + ", CommitMetadata :" + commitMetadata);
               });
         });
@@ -408,10 +408,10 @@ public class HoodieHiveClient {
         }
       default:
         LOG.error("Unknown table type " + tableType);
-        throw new InvalidDatasetException(syncConfig.basePath);
+        throw new InvalidTableException(syncConfig.basePath);
       }
     } catch (IOException e) {
-      throw new HoodieHiveSyncException("Failed to get dataset schema for " + syncConfig.tableName, e);
+      throw new HoodieHiveSyncException("Failed to get table schema for " + syncConfig.tableName, e);
     }
   }

@@ -428,7 +428,7 @@ public class HoodieHiveClient {
         .fromBytes(activeTimeline.getInstantDetails(lastCompactionCommit).get(), HoodieCommitMetadata.class);
     String filePath = compactionMetadata.getFileIdAndFullPaths(metaClient.getBasePath()).values().stream().findAny()
         .orElseThrow(() -> new IllegalArgumentException("Could not find any data file written for compaction "
-            + lastCompactionCommit + ", could not get schema for dataset " + metaClient.getBasePath()));
+            + lastCompactionCommit + ", could not get schema for table " + metaClient.getBasePath()));
     return readSchemaFromDataFile(new Path(filePath));
   }

@@ -95,7 +95,7 @@ public class SchemaUtil {
       expectedType = expectedType.replaceAll("`", "");

       if (!tableColumnType.equalsIgnoreCase(expectedType)) {
-        // check for incremental datasets, the schema type change is allowed as per evolution
+        // check for incremental queries, the schema type change is allowed as per evolution
         // rules
         if (!isSchemaTypeUpdateAllowed(tableColumnType, expectedType)) {
           throw new HoodieHiveSyncException("Could not convert field Type from " + tableColumnType + " to "
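For context on the evolution rules the comment refers to: the check permits a column's Hive type to widen rather than change arbitrarily. A hedged illustration, assuming isSchemaTypeUpdateAllowed is publicly accessible and that the classic Hive-safe promotions (int to bigint, float to double) are the allowed ones; the authoritative list is whatever SchemaUtil implements:

import org.apache.hudi.hive.util.SchemaUtil;

public class EvolutionRuleDemo {
  public static void main(String[] args) {
    // Expected outcomes under the assumed widening-only rule set:
    System.out.println(SchemaUtil.isSchemaTypeUpdateAllowed("int", "bigint"));   // widening: expected true
    System.out.println(SchemaUtil.isSchemaTypeUpdateAllowed("float", "double")); // widening: expected true
    System.out.println(SchemaUtil.isSchemaTypeUpdateAllowed("bigint", "int"));   // narrowing: expected false
    // A rejected change is what surfaces as the HoodieHiveSyncException thrown above.
  }
}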
@@ -154,7 +154,7 @@ public class TestHiveSyncTool {
   public void testBasicSync() throws Exception {
     TestUtil.hiveSyncConfig.useJdbc = this.useJdbc;
     String commitTime = "100";
-    TestUtil.createCOWDataset(commitTime, 5);
+    TestUtil.createCOWTable(commitTime, 5);
     HoodieHiveClient hiveClient =
         new HoodieHiveClient(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem);
     assertFalse("Table " + TestUtil.hiveSyncConfig.tableName + " should not exist initially",
@@ -164,7 +164,7 @@ public class TestHiveSyncTool {
     tool.syncHoodieTable();
     assertTrue("Table " + TestUtil.hiveSyncConfig.tableName + " should exist after sync completes",
         hiveClient.doesTableExist());
-    assertEquals("Hive Schema should match the dataset schema + partition field", hiveClient.getTableSchema().size(),
+    assertEquals("Hive Schema should match the table schema + partition field", hiveClient.getTableSchema().size(),
         hiveClient.getDataSchema().getColumns().size() + 1);
     assertEquals("Table partitions should match the number of partitions we wrote", 5,
         hiveClient.scanTablePartitions().size());
@@ -176,7 +176,7 @@ public class TestHiveSyncTool {
   public void testSyncIncremental() throws Exception {
     TestUtil.hiveSyncConfig.useJdbc = this.useJdbc;
     String commitTime1 = "100";
-    TestUtil.createCOWDataset(commitTime1, 5);
+    TestUtil.createCOWTable(commitTime1, 5);
     HoodieHiveClient hiveClient =
         new HoodieHiveClient(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem);
     // Lets do the sync
@@ -214,7 +214,7 @@ public class TestHiveSyncTool {
   public void testSyncIncrementalWithSchemaEvolution() throws Exception {
     TestUtil.hiveSyncConfig.useJdbc = this.useJdbc;
     String commitTime1 = "100";
-    TestUtil.createCOWDataset(commitTime1, 5);
+    TestUtil.createCOWTable(commitTime1, 5);
     HoodieHiveClient hiveClient =
         new HoodieHiveClient(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem);
     // Lets do the sync
@@ -250,7 +250,7 @@ public class TestHiveSyncTool {
     TestUtil.hiveSyncConfig.useJdbc = this.useJdbc;
     String commitTime = "100";
     String deltaCommitTime = "101";
-    TestUtil.createMORDataset(commitTime, deltaCommitTime, 5);
+    TestUtil.createMORTable(commitTime, deltaCommitTime, 5);
     HoodieHiveClient hiveClient =
         new HoodieHiveClient(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem);
     assertFalse("Table " + TestUtil.hiveSyncConfig.tableName + " should not exist initially",
@@ -261,7 +261,7 @@ public class TestHiveSyncTool {

     assertTrue("Table " + TestUtil.hiveSyncConfig.tableName + " should exist after sync completes",
         hiveClient.doesTableExist());
-    assertEquals("Hive Schema should match the dataset schema + partition field", hiveClient.getTableSchema().size(),
+    assertEquals("Hive Schema should match the table schema + partition field", hiveClient.getTableSchema().size(),
         SchemaTestUtil.getSimpleSchema().getFields().size() + 1);
     assertEquals("Table partitions should match the number of partitions we wrote", 5,
         hiveClient.scanTablePartitions().size());
@@ -280,11 +280,11 @@ public class TestHiveSyncTool {
     tool.syncHoodieTable();
     hiveClient = new HoodieHiveClient(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem);

-    assertEquals("Hive Schema should match the evolved dataset schema + partition field",
+    assertEquals("Hive Schema should match the evolved table schema + partition field",
         hiveClient.getTableSchema().size(), SchemaTestUtil.getEvolvedSchema().getFields().size() + 1);
     // Sync should add the one partition
     assertEquals("The 2 partitions we wrote should be added to hive", 6, hiveClient.scanTablePartitions().size());
-    assertEquals("The last commit that was sycned should be 103", deltaCommitTime2,
+    assertEquals("The last commit that was synced should be 103", deltaCommitTime2,
         hiveClient.getLastCommitTimeSynced().get());
   }

@@ -295,7 +295,7 @@ public class TestHiveSyncTool {
     String deltaCommitTime = "101";
     String roTablename = TestUtil.hiveSyncConfig.tableName;
     TestUtil.hiveSyncConfig.tableName = TestUtil.hiveSyncConfig.tableName + HiveSyncTool.SUFFIX_REALTIME_TABLE;
-    TestUtil.createMORDataset(commitTime, deltaCommitTime, 5);
+    TestUtil.createMORTable(commitTime, deltaCommitTime, 5);
     HoodieHiveClient hiveClientRT =
         new HoodieHiveClient(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem);

@@ -309,11 +309,11 @@ public class TestHiveSyncTool {
     assertTrue("Table " + TestUtil.hiveSyncConfig.tableName + HiveSyncTool.SUFFIX_REALTIME_TABLE
         + " should exist after sync completes", hiveClientRT.doesTableExist());

-    assertEquals("Hive Schema should match the dataset schema + partition field", hiveClientRT.getTableSchema().size(),
+    assertEquals("Hive Schema should match the table schema + partition field", hiveClientRT.getTableSchema().size(),
         SchemaTestUtil.getSimpleSchema().getFields().size() + 1);
     assertEquals("Table partitions should match the number of partitions we wrote", 5,
         hiveClientRT.scanTablePartitions().size());
-    assertEquals("The last commit that was sycned should be updated in the TBLPROPERTIES", deltaCommitTime,
+    assertEquals("The last commit that was synced should be updated in the TBLPROPERTIES", deltaCommitTime,
         hiveClientRT.getLastCommitTimeSynced().get());

     // Now lets create more parititions and these are the only ones which needs to be synced
@@ -328,7 +328,7 @@ public class TestHiveSyncTool {
     tool.syncHoodieTable();
     hiveClientRT = new HoodieHiveClient(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem);

-    assertEquals("Hive Schema should match the evolved dataset schema + partition field",
+    assertEquals("Hive Schema should match the evolved table schema + partition field",
         hiveClientRT.getTableSchema().size(), SchemaTestUtil.getEvolvedSchema().getFields().size() + 1);
     // Sync should add the one partition
     assertEquals("The 2 partitions we wrote should be added to hive", 6, hiveClientRT.scanTablePartitions().size());
@@ -341,7 +341,7 @@ public class TestHiveSyncTool {
   public void testMultiPartitionKeySync() throws Exception {
     TestUtil.hiveSyncConfig.useJdbc = this.useJdbc;
     String commitTime = "100";
-    TestUtil.createCOWDataset(commitTime, 5);
+    TestUtil.createCOWTable(commitTime, 5);

     HiveSyncConfig hiveSyncConfig = HiveSyncConfig.copy(TestUtil.hiveSyncConfig);
     hiveSyncConfig.partitionValueExtractorClass = MultiPartKeysValueExtractor.class.getCanonicalName();
@@ -355,7 +355,7 @@ public class TestHiveSyncTool {
     HiveSyncTool tool = new HiveSyncTool(hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem);
     tool.syncHoodieTable();
     assertTrue("Table " + hiveSyncConfig.tableName + " should exist after sync completes", hiveClient.doesTableExist());
-    assertEquals("Hive Schema should match the dataset schema + partition fields", hiveClient.getTableSchema().size(),
+    assertEquals("Hive Schema should match the table schema + partition fields", hiveClient.getTableSchema().size(),
         hiveClient.getDataSchema().getColumns().size() + 3);
     assertEquals("Table partitions should match the number of partitions we wrote", 5,
         hiveClient.scanTablePartitions().size());
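The multi-partition-key test above swaps in MultiPartKeysValueExtractor; a hedged sketch of the same configuration outside the test harness (field and class names come from this diff, the three partition columns are hypothetical, and partitionFields is assumed to be a list of strings):

import java.util.Arrays;

import org.apache.hudi.hive.HiveSyncConfig;
import org.apache.hudi.hive.MultiPartKeysValueExtractor;

public class MultiKeySyncConfig {
  public static void main(String[] args) {
    HiveSyncConfig cfg = new HiveSyncConfig();
    // Extractor that derives one value per partition column from the partition path.
    cfg.partitionValueExtractorClass = MultiPartKeysValueExtractor.class.getCanonicalName();
    // Hypothetical columns; with three of them the synced Hive schema is
    // data columns + 3, matching the assertion in the test above.
    cfg.partitionFields = Arrays.asList("year", "month", "day");
  }
}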
@@ -152,7 +152,7 @@ public class TestUtil {
     }
   }

-  static void createCOWDataset(String commitTime, int numberOfPartitions)
+  static void createCOWTable(String commitTime, int numberOfPartitions)
       throws IOException, InitializationError, URISyntaxException {
     Path path = new Path(hiveSyncConfig.basePath);
     FileIOUtils.deleteDirectory(new File(hiveSyncConfig.basePath));
@@ -166,7 +166,7 @@ public class TestUtil {
     createCommitFile(commitMetadata, commitTime);
   }

-  static void createMORDataset(String commitTime, String deltaCommitTime, int numberOfPartitions)
+  static void createMORTable(String commitTime, String deltaCommitTime, int numberOfPartitions)
       throws IOException, InitializationError, URISyntaxException, InterruptedException {
     Path path = new Path(hiveSyncConfig.basePath);
     FileIOUtils.deleteDirectory(new File(hiveSyncConfig.basePath));