diff --git a/hoodie-client/src/main/java/com/uber/hoodie/HoodieReadClient.java b/hoodie-client/src/main/java/com/uber/hoodie/HoodieReadClient.java index 09aa2dbc5..42f5bf6c3 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/HoodieReadClient.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/HoodieReadClient.java @@ -60,7 +60,7 @@ import scala.Tuple2; * Provides first class support for accessing Hoodie tables for data processing via Apache Spark. * * - * TODO: Need to move all read operations here, since Hoodie is a single writer & multiple reader + * TODO: Need to move all read operations here, since Hoodie is a single writer and multiple reader */ public class HoodieReadClient implements Serializable { @@ -247,7 +247,7 @@ public class HoodieReadClient implements Serializable { /** * Checks if the given [Keys] exists in the hoodie table and returns [Key, - * Optional] If the optional FullFilePath value is not present, then the key is + * Optional[FullFilePath]] If the optional FullFilePath value is not present, then the key is * not found. 
If the FullFilePath value is present, it is the path component (without scheme) of * the URI underlying file */ diff --git a/hoodie-client/src/main/java/com/uber/hoodie/HoodieWriteClient.java b/hoodie-client/src/main/java/com/uber/hoodie/HoodieWriteClient.java index edc1e8162..f0a6df3d6 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/HoodieWriteClient.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/HoodieWriteClient.java @@ -105,7 +105,6 @@ public class HoodieWriteClient implements Seriali * @param jsc * @param clientConfig * @param rollbackInFlight - * @throws Exception */ public HoodieWriteClient(JavaSparkContext jsc, HoodieWriteConfig clientConfig, boolean rollbackInFlight) { this.fs = FSUtils.getFs(); @@ -234,7 +233,7 @@ public class HoodieWriteClient implements Seriali * * @param records HoodieRecords to insert * @param commitTime Commit Time handle - * @return JavaRDD - RDD of WriteStatus to inspect errors and counts + * @return JavaRDD[WriteStatus] - RDD of WriteStatus to inspect errors and counts * */ public JavaRDD insert(JavaRDD> records, final String commitTime) { diff --git a/hoodie-client/src/main/java/com/uber/hoodie/index/HoodieIndex.java b/hoodie-client/src/main/java/com/uber/hoodie/index/HoodieIndex.java index 1df62ca4e..17a1d26ad 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/index/HoodieIndex.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/index/HoodieIndex.java @@ -33,7 +33,7 @@ import java.io.Serializable; /** * Base class for different types of indexes to determine the mapping from uuid - *

+ * * TODO(vc): need methods for recovery and rollback */ public abstract class HoodieIndex implements Serializable { @@ -53,7 +53,7 @@ public abstract class HoodieIndex implements Seri } /** - * Checks if the given [Keys] exists in the hoodie table and returns [Key, Optional] + * Checks if the given [Keys] exists in the hoodie table and returns [Key, Optional[FullFilePath]] * If the optional FullFilePath value is not present, then the key is not found. If the FullFilePath * value is present, it is the path component (without scheme) of the URI underlying file * @@ -74,7 +74,7 @@ public abstract class HoodieIndex implements Seri /** * Extracts the location of written records, and updates the index. - *

+ * * TODO(vc): We may need to propagate the record as well in a WriteStatus class */ public abstract JavaRDD updateLocation(JavaRDD writeStatusRDD, diff --git a/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieInsertHandle.java b/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieInsertHandle.java index 9b2ac3b6d..cf1eaabbe 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieInsertHandle.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieInsertHandle.java @@ -64,9 +64,9 @@ public class HoodieInsertHandle extends HoodieIOH /** * Determines whether we can accept the incoming records, into the current file, depending on - *

+ * * - Whether it belongs to the same partitionPath as existing records - * - Whether the current file written bytes < max file size + * - Whether the current file written bytes &lt; max file size * * @return */ diff --git a/hoodie-client/src/main/java/com/uber/hoodie/metrics/Metrics.java b/hoodie-client/src/main/java/com/uber/hoodie/metrics/Metrics.java index 4158814c6..337d21c16 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/metrics/Metrics.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/metrics/Metrics.java @@ -26,11 +26,7 @@ import org.apache.commons.configuration.ConfigurationException; import java.io.Closeable; /** - * This is the main class of the metrics system. To use it, - * users need to call the {@link #init(HoodieMetricsConfig) init} method to initialize the system. - * Input for {@link #init(HoodieMetricsConfig) init} includes a configuration object, where - * users can specify the reporter type, and special configs for that reporter. - * Refer to {@see MetricsConfiguration} for more configurable fields. + * This is the main class of the metrics system. 
*/ public class Metrics { private static volatile boolean initialized = false; diff --git a/hoodie-client/src/main/java/com/uber/hoodie/table/HoodieCopyOnWriteTable.java b/hoodie-client/src/main/java/com/uber/hoodie/table/HoodieCopyOnWriteTable.java index 43455d208..23bf11065 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/table/HoodieCopyOnWriteTable.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/table/HoodieCopyOnWriteTable.java @@ -115,7 +115,7 @@ public class HoodieCopyOnWriteTable extends Hoodi } /** - * Helper class for a bucket's type (INSERT & UPDATE) and its file location + * Helper class for a bucket's type (INSERT and UPDATE) and its file location */ class BucketInfo implements Serializable { BucketType bucketType; diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/file/HoodieAppendLog.java b/hoodie-common/src/main/java/com/uber/hoodie/common/file/HoodieAppendLog.java index 6eab1be12..5effd47fa 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/file/HoodieAppendLog.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/file/HoodieAppendLog.java @@ -62,147 +62,6 @@ import org.apache.hadoop.util.MergeSort; import org.apache.hadoop.util.PriorityQueue; import org.apache.hadoop.util.Time; -/** - * SequenceFiles are flat files consisting of binary key/value - * pairs. - * - *

SequenceFile provides {@link HoodieAppendLog.Writer}, - * {@link HoodieAppendLog.Reader} and {@link Sorter} classes for writing, - * reading and sorting respectively.

- * - * There are three SequenceFile Writers based on the - * {@link CompressionType} used to compress key/value pairs: - *
    - *
  1. - * Writer : Uncompressed records. - *
  2. - *
  3. - * RecordCompressWriter : Record-compressed files, only compress - * values. - *
  4. - *
  5. - * BlockCompressWriter : Block-compressed files, both keys & - * values are collected in 'blocks' - * separately and compressed. The size of - * the 'block' is configurable. - *
- * - *

The actual compression algorithm used to compress key and/or values can be - * specified by using the appropriate {@link CompressionCodec}.

- * - *

The recommended way is to use the static createWriter methods - * provided by the SequenceFile to chose the preferred format.

- * - *

The {@link HoodieAppendLog.Reader} acts as the bridge and can read any of the - * above SequenceFile formats.

- * - *

SequenceFile Formats

- * - *

Essentially there are 3 different formats for SequenceFiles - * depending on the CompressionType specified. All of them share a - * common header described below. - * - *

- *
    - *
  • - * version - 3 bytes of magic header SEQ, followed by 1 byte of actual - * version number (e.g. SEQ4 or SEQ6) - *
  • - *
  • - * keyClassName -key class - *
  • - *
  • - * valueClassName - value class - *
  • - *
  • - * compression - A boolean which specifies if compression is turned on for - * keys/values in this file. - *
  • - *
  • - * blockCompression - A boolean which specifies if block-compression is - * turned on for keys/values in this file. - *
  • - *
  • - * compression codec - CompressionCodec class which is used for - * compression of keys and/or values (if compression is - * enabled). - *
  • - *
  • - * metadata - {@link Metadata} for this file. - *
  • - *
  • - * sync - A sync marker to denote end of the header. - *
  • - *
- * - *
Uncompressed SequenceFile Format
- *
    - *
  • - * Header - *
  • - *
  • - * Record - *
      - *
    • Record length
    • - *
    • Key length
    • - *
    • Key
    • - *
    • Value
    • - *
    - *
  • - *
  • - * A sync-marker every few 100 bytes or so. - *
  • - *
- * - *
Record-Compressed SequenceFile Format
- *
    - *
  • - * Header - *
  • - *
  • - * Record - *
      - *
    • Record length
    • - *
    • Key length
    • - *
    • Key
    • - *
    • Compressed Value
    • - *
    - *
  • - *
  • - * A sync-marker every few 100 bytes or so. - *
  • - *
- * - *
Block-Compressed SequenceFile Format
- *
    - *
  • - * Header - *
  • - *
  • - * Record Block - *
      - *
    • Uncompressed number of records in the block
    • - *
    • Compressed key-lengths block-size
    • - *
    • Compressed key-lengths block
    • - *
    • Compressed keys block-size
    • - *
    • Compressed keys block
    • - *
    • Compressed value-lengths block-size
    • - *
    • Compressed value-lengths block
    • - *
    • Compressed values block-size
    • - *
    • Compressed values block
    • - *
    - *
  • - *
  • - * A sync-marker every block. - *
  • - *
- * - *

The compressed blocks of key lengths and value lengths consist of the - * actual lengths of individual keys/values encoded in ZeroCompressedInteger - * format.

- * - * @see CompressionCodec - */ @InterfaceAudience.Public @InterfaceStability.Stable public class HoodieAppendLog { @@ -1930,7 +1789,7 @@ public class HoodieAppendLog { * @param fs The file system used to open the file. * @param file The file being read. * @param bufferSize The buffer size used to read the file. - * @param length The length being read if it is >= 0. Otherwise, + * @param length The length being read if it is &gt;= 0. Otherwise, * the length is not available. * @return The opened stream. * @throws IOException diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieCommits.java b/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieCommits.java index 99ea51364..4094f78c2 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieCommits.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieCommits.java @@ -128,7 +128,7 @@ public class HoodieCommits implements Serializable { } /** - * Returns the nth commit from the latest commit such that lastCommit(0) => lastCommit() + * Returns the nth commit from the latest commit such that lastCommit(0) == lastCommit() */ public String lastCommit(int n) { if (commitList.size() < n + 1) { diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieTableMetadata.java b/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieTableMetadata.java index b87fd3fd4..bf91a0f97 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieTableMetadata.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/model/HoodieTableMetadata.java @@ -77,7 +77,6 @@ public class HoodieTableMetadata implements Serializable { * @param fs * @param basePath * @param tableName - * @throws IOException */ public HoodieTableMetadata(FileSystem fs, String basePath, String tableName) { this(fs, basePath, tableName, true); @@ -87,7 +86,6 @@ /** * Constructor which loads the 
hoodie table metadata, It requires the meta-data to be present already * @param fs * @param basePath - * @throws IOException */ public HoodieTableMetadata(FileSystem fs, String basePath) { this(fs, basePath, null, false); @@ -137,8 +135,7 @@ public class HoodieTableMetadata implements Serializable { * Returns all the commit metadata for this table. Reads all the commit files from HDFS. * Expensive operation, use with caution. * - * @return SortedMap of CommitTime,HoodieCommitMetadata - * @throws IOException + * @return SortedMap of CommitTime,HoodieCommitMetadata */ public SortedMap getAllCommitMetadata() { try { @@ -169,7 +166,7 @@ public class HoodieTableMetadata implements Serializable { /** * Lookup the file name for specified HoodieRecord - *

+ * * TODO(vc): This metadata needs to be cached in each executor, statically, and used across, if * we need to be nicer to the NameNode */ @@ -200,7 +197,7 @@ /** - * Get only the latest file in the partition with precondition commitTime(file) < maxCommitTime + * Get only the latest file in the partition with precondition commitTime(file) &lt; maxCommitTime * * @param fs * @param partitionPathStr diff --git a/hoodie-hive/src/main/java/com/uber/hoodie/hive/client/SchemaUtil.java b/hoodie-hive/src/main/java/com/uber/hoodie/hive/client/SchemaUtil.java index e11aeaf99..77d4580dd 100644 --- a/hoodie-hive/src/main/java/com/uber/hoodie/hive/client/SchemaUtil.java +++ b/hoodie-hive/src/main/java/com/uber/hoodie/hive/client/SchemaUtil.java @@ -138,7 +138,7 @@ public class SchemaUtil { * Returns equivalent Hive table schema read from a parquet file * * @param messageType : Parquet Schema - * @return : Hive Table schema read from parquet file MAP + * @return : Hive Table schema read from parquet file MAP[String,String] * @throws IOException */ public static Map convertParquetSchemaToHiveSchema(MessageType messageType) diff --git a/pom.xml b/pom.xml index 35b1ae84b..368a6379d 100644 --- a/pom.xml +++ b/pom.xml @@ -22,6 +22,10 @@ hoodie pom 0.2.7-SNAPSHOT + Hoodie is an Apache Spark library that provides the ability to efficiently do incremental processing on datasets in HDFS + https://github.com/uber/hoodie + Hoodie + hoodie-common hoodie-client @@ -498,10 +502,6 @@ - - false - true - org.apache.maven.plugins