Docker Container Build and Run setup with foundations for adding docker integration tests. Docker images built with Hadoop 2.8.4 Hive 2.3.3 and Spark 2.3.1 and published to docker-hub

Look at the quickstart document for how to set up docker and run the demo
2018-08-21 22:54:57 -07:00
parent 9710b5a3a6
commit f3418e4718
63 changed files with 8952 additions and 9 deletions
--- a/docker/build_local_docker_images.sh
+++ b/docker/build_local_docker_images.sh
@@ -0,0 +1,13 @@
 #!/bin/bash
 # Interactive helper that builds the Hudi docker images locally through Maven.
 # The user must confirm first; answering "no" exits without building.
 while true; do
    # -r keeps backslashes literal. If stdin is closed (EOF) read fails; bail out
    # instead of printing the "Please answer" line forever.
    read -r -p "Docker images can be downloaded from docker hub and seamlessly mounted with latest HUDI jars. Do you still want to build docker images from scratch ?" yn || exit 1
    case $yn in
        # NOTE(review): `make install` looks like a leftover from a copied prompt
        # snippet; confirm a Makefile exists here, otherwise drop the call.
        [Yy]* ) make install; break;;
        [Nn]* ) exit;;
        * ) echo "Please answer yes or no.";;
    esac
 done
 # The Maven build runs from the parent directory; stop if we cannot get there
 # so that mvn is never launched from the wrong place.
 pushd ../ || exit 1
 mvn clean pre-integration-test -DskipTests -Ddocker.compose.skip=true -Ddocker.build.skip=false
 popd
--- a/docker/compose/docker-compose_hadoop284_hive233_spark231.yml
+++ b/docker/compose/docker-compose_hadoop284_hive233_spark231.yml
@@ -0,0 +1,217 @@
 version: "3.3"
 services:
  # HDFS NameNode. Publishes the HTTP port (50070) and the filesystem port (8020) to the host.
  namenode:
    image: 'varadarb/hudi-hadoop_2.8.4-namenode:latest'
    container_name: namenode
    hostname: namenode
    env_file:
      - './hadoop.env'
    environment:
      # NOTE(review): "hive232" differs from the Hive 2.3.3 images used below; the same
      # value is set on datanode1 and historyserver, so change all three together if at all.
      CLUSTER_NAME: hudi_hadoop284_hive232_spark231
    ports:
      - '50070:50070'
      - '8020:8020'
    volumes:
      # Name directory is bind-mounted from the host.
      - '/tmp/hadoop_name:/hadoop/dfs/name'
    healthcheck:
      test:
        - CMD
        - curl
        - '-f'
        - 'http://namenode:50070'
      interval: 30s
      timeout: 10s
      retries: 3
  # HDFS DataNode; started after the namenode container.
  datanode1:
    image: 'varadarb/hudi-hadoop_2.8.4-datanode:latest'
    hostname: datanode1
    container_name: datanode1
    depends_on:
      - namenode
    links:
      - namenode
      - historyserver
    env_file:
      - './hadoop.env'
    environment:
      CLUSTER_NAME: hudi_hadoop284_hive232_spark231
    ports:
      - '50075:50075'
      - '50010:50010'
    volumes:
      # Block storage is bind-mounted from the host.
      - '/tmp/hadoop_data:/hadoop/dfs/data'
    healthcheck:
      test:
        - CMD
        - curl
        - '-f'
        - 'http://datanode1:50075'
      interval: 30s
      timeout: 10s
      retries: 3
  # YARN history/timeline server; container port 8188 is published on host port 58188.
  historyserver:
    image: 'varadarb/hudi-hadoop_2.8.4-history:latest'
    container_name: historyserver
    hostname: historyserver
    depends_on:
      - namenode
    links:
      - namenode
    env_file:
      - './hadoop.env'
    environment:
      CLUSTER_NAME: hudi_hadoop284_hive232_spark231
    ports:
      - '58188:8188'
    volumes:
      # Named volume declared at the bottom of this file.
      - 'historyserver:/hadoop/yarn/timeline'
    healthcheck:
      test:
        - CMD
        - curl
        - '-f'
        - 'http://historyserver:8188'
      interval: 30s
      timeout: 10s
      retries: 3
  # PostgreSQL database backing the Hive metastore.
  hive-metastore-postgresql:
    image: 'bde2020/hive-metastore-postgresql:2.3.0'
    container_name: hive-metastore-postgresql
    hostname: hive-metastore-postgresql
    volumes:
      - 'hive-metastore-postgresql:/var/lib/postgresql'
  # Hive metastore service (thrift on 9083), started after the namenode and postgres containers.
  hivemetastore:
    image: 'varadarb/hudi-hadoop_2.8.4-hive_2.3.3:latest'
    container_name: hivemetastore
    hostname: hivemetastore
    command: '/opt/hive/bin/hive --service metastore'
    depends_on:
      - hive-metastore-postgresql
      - namenode
    links:
      - hive-metastore-postgresql
      - namenode
    env_file:
      - './hadoop.env'
    environment:
      - 'SERVICE_PRECONDITION=namenode:50070 hive-metastore-postgresql:5432'
    ports:
      - '9083:9083'
    healthcheck:
      test:
        - CMD
        - nc
        - '-z'
        - hivemetastore
        - '9083'
      interval: 30s
      timeout: 10s
      retries: 3
  # HiveServer (port 10000); uses the image's default command and mounts the Hudi workspace.
  hiveserver:
    image: 'varadarb/hudi-hadoop_2.8.4-hive_2.3.3:latest'
    container_name: hiveserver
    hostname: hiveserver
    depends_on:
      - hivemetastore
    links:
      - hivemetastore
      - hive-metastore-postgresql
      - namenode
    env_file:
      - './hadoop.env'
    environment:
      - 'SERVICE_PRECONDITION=hivemetastore:9083'
    ports:
      - '10000:10000'
    volumes:
      # HUDI_WS must be set in the environment that runs docker-compose.
      - '${HUDI_WS}:/var/hoodie/ws'
  # Spark master; publishes 8080 and 7077 (7077 is the spark:// port the workers connect to).
  sparkmaster:
    image: 'varadarb/hudi-hadoop_2.8.4-hive_2.3.3-sparkmaster_2.3.1:latest'
    container_name: sparkmaster
    hostname: sparkmaster
    links:
      - hivemetastore
      - hiveserver
      - hive-metastore-postgresql
      - namenode
    env_file:
      - './hadoop.env'
    environment:
      INIT_DAEMON_STEP: setup_spark
    ports:
      - '8080:8080'
      - '7077:7077'
  # Spark worker that registers with the sparkmaster service.
  spark-worker-1:
    image: 'varadarb/hudi-hadoop_2.8.4-hive_2.3.3-sparkworker_2.3.1:latest'
    container_name: spark-worker-1
    hostname: spark-worker-1
    depends_on:
      - sparkmaster
    links:
      - hivemetastore
      - hiveserver
      - hive-metastore-postgresql
      - namenode
    env_file:
      - './hadoop.env'
    environment:
      SPARK_MASTER: 'spark://sparkmaster:7077'
    ports:
      - '8081:8081'
  # ZooKeeper used by the kafka service below.
  zookeeper:
    image: "bitnami/zookeeper:3.4.12-r68"
    container_name: zookeeper
    hostname: zookeeper
    environment:
      # Quoted so the value stays the string "yes" rather than a YAML boolean.
      ALLOW_ANONYMOUS_LOGIN: "yes"
    ports:
      - "2181:2181"
  # Kafka broker. The container is reachable as "kafkabroker", which is the host name
  # the demo's kafka-source.properties uses in metadata.broker.list.
  kafka:
    image: 'bitnami/kafka:2.0.0'
    hostname: kafkabroker
    container_name: kafkabroker
    # The broker connects to zookeeper:2181 on startup, so start that container first.
    # depends_on only orders container start; it does not wait for ZooKeeper to be ready.
    depends_on:
      - zookeeper
    ports:
      - '9092:9092'
    environment:
      - KAFKA_ZOOKEEPER_CONNECT=zookeeper:2181
      - ALLOW_PLAINTEXT_LISTENER=yes
  # First ad-hoc client container: Hudi workspace mounted, port 4040 published.
  adhoc-1:
    image: 'varadarb/hudi-hadoop_2.8.4-hive_2.3.3-sparkadhoc_2.3.1:latest'
    container_name: adhoc-1
    hostname: adhoc-1
    depends_on:
      - sparkmaster
    links:
      - hivemetastore
      - hiveserver
      - hive-metastore-postgresql
      - namenode
    env_file:
      - './hadoop.env'
    environment:
      SPARK_MASTER: 'spark://sparkmaster:7077'
    ports:
      - "4040:4040"
    volumes:
      - '${HUDI_WS}:/var/hoodie/ws'
  # Second ad-hoc client container; same setup as adhoc-1 but with no published ports.
  adhoc-2:
    image: 'varadarb/hudi-hadoop_2.8.4-hive_2.3.3-sparkadhoc_2.3.1:latest'
    container_name: adhoc-2
    hostname: adhoc-2
    depends_on:
      - sparkmaster
    links:
      - hivemetastore
      - hiveserver
      - hive-metastore-postgresql
      - namenode
    env_file:
      - './hadoop.env'
    environment:
      SPARK_MASTER: 'spark://sparkmaster:7077'
    volumes:
      - '${HUDI_WS}:/var/hoodie/ws'
 # Named volumes. historyserver and hive-metastore-postgresql are mounted by the
 # services of the same name. The namenode service bind-mounts /tmp/hadoop_name
 # instead, so the "namenode" volume looks unused.
 # NOTE(review): confirm nothing else relies on it before removing.
 volumes:
  namenode:
  historyserver:
  hive-metastore-postgresql:
 # Declares the default network with no custom options.
 networks:
  default:
--- a/docker/compose/hadoop.env
+++ b/docker/compose/hadoop.env
@@ -0,0 +1,33 @@
 # Environment loaded into the Hadoop/Hive/Spark containers through env_file in the compose file.
 # Each variable is <PREFIX>_<encoded property name>=<value>. The base image's entrypoint.sh
 # decodes the name by turning "___" into "-", "__" into "_" and a single "_" into ".",
 # then writes the property into the matching *-site.xml (CORE_CONF, HDFS_CONF, YARN_CONF, ...).
 #
 # Hive metastore settings.
 # NOTE(review): the HIVE_SITE_CONF prefix is presumably handled by the hive image's own entrypoint - confirm.
 HIVE_SITE_CONF_javax_jdo_option_ConnectionURL=jdbc:postgresql://hive-metastore-postgresql/metastore
 HIVE_SITE_CONF_javax_jdo_option_ConnectionDriverName=org.postgresql.Driver
 HIVE_SITE_CONF_javax_jdo_option_ConnectionUserName=hive
 HIVE_SITE_CONF_javax_jdo_option_ConnectionPassword=hive
 HIVE_SITE_CONF_datanucleus_autoCreateSchema=false
 HIVE_SITE_CONF_hive_metastore_uris=thrift://hivemetastore:9083
 # hdfs-site.xml settings
 HDFS_CONF_dfs_namenode_datanode_registration_ip___hostname___check=false
 HDFS_CONF_dfs_webhdfs_enabled=true
 HDFS_CONF_dfs_permissions_enabled=false
 #HDFS_CONF_dfs_client_use_datanode_hostname=true
 #HDFS_CONF_dfs_namenode_use_datanode_hostname=true
 # core-site.xml settings; the default filesystem points at the namenode container
 CORE_CONF_fs_defaultFS=hdfs://namenode:8020
 CORE_CONF_hadoop_http_staticuser_user=root
 CORE_CONF_hadoop_proxyuser_hue_hosts=*
 CORE_CONF_hadoop_proxyuser_hue_groups=*
 # yarn-site.xml settings
 # NOTE(review): these reference a "resourcemanager" host, but the compose file shown with
 # this change defines no such service - confirm it is provided elsewhere or unused.
 YARN_CONF_yarn_log___aggregation___enable=true
 YARN_CONF_yarn_resourcemanager_recovery_enabled=true
 YARN_CONF_yarn_resourcemanager_store_class=org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore
 YARN_CONF_yarn_resourcemanager_fs_state___store_uri=/rmstate
 YARN_CONF_yarn_nodemanager_remote___app___log___dir=/app-logs
 YARN_CONF_yarn_log_server_url=http://historyserver:8188/applicationhistory/logs/
 YARN_CONF_yarn_timeline___service_enabled=true
 YARN_CONF_yarn_timeline___service_generic___application___history_enabled=true
 YARN_CONF_yarn_resourcemanager_system___metrics___publisher_enabled=true
 YARN_CONF_yarn_resourcemanager_hostname=resourcemanager
 YARN_CONF_yarn_timeline___service_hostname=historyserver
 YARN_CONF_yarn_resourcemanager_address=resourcemanager:8032
 YARN_CONF_yarn_resourcemanager_scheduler_address=resourcemanager:8030
 YARN_CONF_yarn_resourcemanager_resource___tracker_address=resourcemanager:8031
 YARN_CONF_yarn_nodemanager_vmem___check___enabled=false
--- a/docker/demo/config/base.properties
+++ b/docker/demo/config/base.properties
@@ -0,0 +1,21 @@
 #
 #  Copyright (c) 2017 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 #  You may obtain a copy of the License at
 #
 #           http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
 #
 #
 # Common hoodie client configs
 # Shuffle parallelism for upsert, insert and bulk-insert is set to 2 for each operation.
 hoodie.upsert.shuffle.parallelism=2
 hoodie.insert.shuffle.parallelism=2
 hoodie.bulkinsert.shuffle.parallelism=2
--- a/docker/demo/config/kafka-source.properties
+++ b/docker/demo/config/kafka-source.properties
@@ -0,0 +1,29 @@
 #
 #  Copyright (c) 2017 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 #  You may obtain a copy of the License at
 #
 #           http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
 #
 #
 # Pull in the common client settings from base.properties
 include=base.properties
 # Key fields, for kafka example ("key" and "date" are fields of the stock_ticks schema in schema.avsc)
 hoodie.datasource.write.recordkey.field=key
 hoodie.datasource.write.partitionpath.field=date
 # Schema provider props (change to absolute path based on your installation)
 # NOTE(review): /var/demo/config is where setup_demo_container.sh copies the demo config
 # with `hadoop fs -copyFromLocal`, so these are presumably HDFS paths - confirm.
 hoodie.deltastreamer.schemaprovider.source.schema.file=/var/demo/config/schema.avsc
 hoodie.deltastreamer.schemaprovider.target.schema.file=/var/demo/config/schema.avsc
 # Kafka Source
 hoodie.deltastreamer.source.kafka.topic=stock_ticks
 # Kafka props; kafkabroker is the hostname of the kafka service in the compose file
 metadata.broker.list=kafkabroker:9092
 auto.offset.reset=smallest
--- a/docker/demo/config/schema.avsc
+++ b/docker/demo/config/schema.avsc
@@ -0,0 +1,41 @@
 {
  "type":"record",
  "name":"stock_ticks",
  "fields":[{
     "name": "volume",
     "type": "long"
  }, {
     "name": "ts", 
     "type": "string"
  }, {
     "name": "symbol", 
     "type": "string"
  },{
     "name": "year", 
     "type": "int"
  },{
     "name": "month", 
     "type": "string"
  },{
     "name": "high", 
     "type": "double"
  },{
     "name": "low", 
     "type": "double"
  },{
     "name": "key", 
     "type": "string"
  },{
     "name": "date", 
     "type":"string"
  }, {
     "name": "close", 
     "type": "double"
  }, {
     "name": "open", 
     "type": "double"
  }, {
     "name": "day", 
     "type":"string"
  }
 ]}
--- a/docker/demo/config/spark-defaults.conf
+++ b/docker/demo/config/spark-defaults.conf
@@ -0,0 +1,26 @@
 #
 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
 # this work for additional information regarding copyright ownership.
 # The ASF licenses this file to You under the Apache License, Version 2.0
 # (the "License"); you may not use this file except in compliance with
 # the License.  You may obtain a copy of the License at
 #
 #    http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
 # Default system properties included when running spark-submit.
 # This is useful for setting default environmental settings.
 # Settings used by the docker demo (setup_demo_container.sh copies this file into $SPARK_CONF_DIR):
 spark.master                     local[3]
 # setup_demo_container.sh creates this directory in HDFS.
 # NOTE(review): spark.eventLog.enabled is not set in this file - confirm whether event logging is meant to be on.
 spark.eventLog.dir               hdfs://namenode:8020/tmp/spark-events
 spark.serializer                 org.apache.spark.serializer.KryoSerializer
 #spark.executor.memory            4g
 # spark.executor.extraJavaOptions  -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three"
--- a/docker/demo/data/batch_1.json
+++ b/docker/demo/data/batch_1.json
--- a/docker/demo/data/batch_2.json
+++ b/docker/demo/data/batch_2.json
--- a/docker/demo/setup_demo_container.sh
+++ b/docker/demo/setup_demo_container.sh
@@ -0,0 +1,6 @@
 #!/bin/bash
 # Prepares a container for the demo: installs the demo spark-defaults.conf, creates the
 # HDFS directories the demo uses, uploads the demo config to HDFS and makes the hive
 # sync tool executable. Expects the Hudi workspace to be mounted at /var/hoodie/ws.
 echo "Copying spark default config and setting up configs"
 # Fail loudly when SPARK_CONF_DIR is unset or empty; unquoted and unset, the original
 # target expanded to "/." and the file was silently copied into the root directory.
 cp /var/hoodie/ws/docker/demo/config/spark-defaults.conf "${SPARK_CONF_DIR:?SPARK_CONF_DIR must be set}/."
 hadoop fs -mkdir -p /var/demo/
 # Directory referenced by spark.eventLog.dir in spark-defaults.conf
 hadoop fs -mkdir -p /tmp/spark-events
 hadoop fs -copyFromLocal -f /var/hoodie/ws/docker/demo/config /var/demo/.
 chmod +x /var/hoodie/ws/hoodie-hive/run_sync_tool.sh
--- a/docker/hoodie/hadoop/base/Dockerfile
+++ b/docker/hoodie/hadoop/base/Dockerfile
@@ -0,0 +1,45 @@
 # Base image for all Hadoop-derived images: Alpine + Oracle JDK 8 + a Hadoop distribution,
 # with an entrypoint that turns *_CONF_* environment variables into Hadoop site properties.
 FROM frolvlad/alpine-oraclejdk8
 # MAINTAINER is deprecated in favour of a label
 LABEL maintainer="Hoodie"
 USER root
 # Default to UTF-8 file.encoding
 ENV LANG C.UTF-8
 # Updating & Installing packages
 # --no-cache fetches a fresh package index for this command and leaves no cache in the layer
 RUN apk add --no-cache net-tools curl bash perl procps
 ARG HADOOP_VERSION=2.8.4
 ARG HADOOP_URL=https://www.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz
 # Re-exported as ENV so derived images and running containers can read them
 ENV HADOOP_VERSION ${HADOOP_VERSION}
 ENV HADOOP_URL ${HADOOP_URL}
 # NOTE(review): the .asc signature is downloaded but never verified before it is removed;
 # either verify it with gpg or drop the download.
 RUN set -x \
    && echo "Fetch URL2 is : ${HADOOP_URL}" \
    && curl -fSL "${HADOOP_URL}" -o /tmp/hadoop.tar.gz \
    && curl -fSL "${HADOOP_URL}.asc" -o /tmp/hadoop.tar.gz.asc \
    && mkdir -p /opt/hadoop-$HADOOP_VERSION/logs \
    && tar -xvf /tmp/hadoop.tar.gz -C /opt/ \
    && rm /tmp/hadoop.tar.gz* \
    && ln -s /opt/hadoop-$HADOOP_VERSION/etc/hadoop /etc/hadoop \
    && cp /etc/hadoop/mapred-site.xml.template /etc/hadoop/mapred-site.xml \
    && mkdir /hadoop-data
 ENV HADOOP_PREFIX=/opt/hadoop-$HADOOP_VERSION
 ENV HADOOP_CONF_DIR=/etc/hadoop
 # Read by entrypoint.sh: "1" makes it add the 0.0.0.0 bind-host properties
 ENV MULTIHOMED_NETWORK=1
 ENV HADOOP_HOME=${HADOOP_PREFIX}
 ENV HADOOP_INSTALL=${HADOOP_HOME}
 ENV USER=root
 ENV PATH /usr/bin:/bin:$HADOOP_PREFIX/bin/:$PATH
 # Exposing a union of ports across hadoop versions
 # Well known ports including ssh
 EXPOSE 0-1024 4040 7000-10100 5000-5100 50000-50200 58188 58088 58042
 # COPY is preferred over ADD for plain local files
 COPY entrypoint.sh /entrypoint.sh
 COPY export_container_ip.sh /usr/bin/
 RUN chmod a+x /usr/bin/export_container_ip.sh \
    && chmod a+x /entrypoint.sh
 ENTRYPOINT ["/bin/bash", "/entrypoint.sh"]
--- a/docker/hoodie/hadoop/base/entrypoint.sh
+++ b/docker/hoodie/hadoop/base/entrypoint.sh
@@ -0,0 +1,91 @@
 #!/bin/bash
 #######################################################################################
 ##            COPIED FROM                                                            ##
 ##  https://github.com/big-data-europe/docker-hadoop/blob/master/base/entrypoint.sh  ##
 #                                                                                    ##
 #######################################################################################
 # Sensible default: when no default filesystem is supplied (unset or empty),
 # point it at this container's fully-qualified host name on port 8020.
 : "${CORE_CONF_fs_defaultFS:=hdfs://$(hostname -f):8020}"
 export CORE_CONF_fs_defaultFS
 # Insert a <property> element into a Hadoop-style XML config file, on its own line
 # directly before the line containing </configuration>.
 #   $1 - path of the XML file to edit in place
 #   $2 - property name
 #   $3 - property value
 # The value is written as-is (no XML escaping); multi-line values are not supported.
 function addProperty() {
  local path=$1
  local name=$2
  local value=$3
  local entry="<property><name>$name</name><value>${value}</value></property>"
  # Escape every character that is special in a sed replacement: "\", "/" and "&".
  # Escaping only "/" let "&" in a value (e.g. a URL query string) expand to the
  # matched </configuration> line and corrupt the file.
  local escapedEntry
  escapedEntry=$(printf '%s\n' "$entry" | sed 's|[\\/&]|\\&|g')
  # "&" at the end of the replacement re-emits the original </configuration> line.
  sed -i "/<\/configuration>/ s/.*/${escapedEntry}\n&/" "$path"
 }
 # Translate environment variables of the form <envPrefix>_<encoded name>=<value> into
 # properties of a Hadoop site file.
 #   $1 - path of the site XML file to update
 #   $2 - module name, used for the log line only
 #   $3 - environment variable prefix to scan for (e.g. CORE_CONF)
 # Name decoding: "___" becomes "-", "__" becomes "_", and a single "_" becomes ".".
 function configure() {
    local path=$1
    local module=$2
    local envPrefix=$3
    # Loop variables are declared local so they do not leak into the calling script.
    local c
    local name
    local var
    local value
    echo "Configuring $module"
    # perl prints the suffix (text between "<envPrefix>_" and "=") of every matching variable
    for c in $(printenv | perl -sne 'print "$1 " if m/^${envPrefix}_(.+?)=.*/' -- -envPrefix="$envPrefix"); do
        # "__" is parked as "@" so the single-underscore rule does not touch it
        name=$(echo "${c}" | perl -pe 's/___/-/g; s/__/@/g; s/_/./g; s/@/_/g;')
        var="${envPrefix}_${c}"
        value=${!var}
        echo " - Setting $name=$value"
        # Honour the path argument; it was previously ignored in favour of a path rebuilt
        # from the module name (the two are identical for the calls in this script).
        addProperty "$path" "$name" "$value"
    done
 }
 # Apply *_CONF_* environment variables to the corresponding Hadoop site files
 configure /etc/hadoop/core-site.xml core CORE_CONF
 configure /etc/hadoop/hdfs-site.xml hdfs HDFS_CONF
 configure /etc/hadoop/yarn-site.xml yarn YARN_CONF
 configure /etc/hadoop/httpfs-site.xml httpfs HTTPFS_CONF
 configure /etc/hadoop/kms-site.xml kms KMS_CONF
 # On a multihomed network, bind the daemons to all interfaces and address datanodes by hostname
 if [ "$MULTIHOMED_NETWORK" = "1" ]; then
    echo "Configuring for multihomed network"
    # HDFS
    addProperty /etc/hadoop/hdfs-site.xml dfs.namenode.rpc-bind-host 0.0.0.0
    addProperty /etc/hadoop/hdfs-site.xml dfs.namenode.servicerpc-bind-host 0.0.0.0
    addProperty /etc/hadoop/hdfs-site.xml dfs.namenode.http-bind-host 0.0.0.0
    addProperty /etc/hadoop/hdfs-site.xml dfs.namenode.https-bind-host 0.0.0.0
    addProperty /etc/hadoop/hdfs-site.xml dfs.client.use.datanode.hostname true
    addProperty /etc/hadoop/hdfs-site.xml dfs.datanode.use.datanode.hostname true
    # YARN (yarn.nodemanager.bind-host used to be added twice; once is enough)
    addProperty /etc/hadoop/yarn-site.xml yarn.resourcemanager.bind-host 0.0.0.0
    addProperty /etc/hadoop/yarn-site.xml yarn.nodemanager.bind-host 0.0.0.0
    addProperty /etc/hadoop/yarn-site.xml yarn.timeline-service.bind-host 0.0.0.0
    # MAPRED
    # NOTE(review): this writes a yarn.* property into mapred-site.xml - confirm it is intended
    addProperty /etc/hadoop/mapred-site.xml yarn.nodemanager.bind-host 0.0.0.0
 fi
 # When a Ganglia host is given, keep the stock metrics files as *.orig and generate
 # replacements that send every listed context/daemon to $GANGLIA_HOST:8649.
 if [ -n "$GANGLIA_HOST" ]; then
    mv /etc/hadoop/hadoop-metrics.properties /etc/hadoop/hadoop-metrics.properties.orig
    mv /etc/hadoop/hadoop-metrics2.properties /etc/hadoop/hadoop-metrics2.properties.orig
    # metrics v1: three lines per context
    for ctx in mapred jvm rpc ugi; do
        printf '%s\n' \
            "$ctx.class=org.apache.hadoop.metrics.ganglia.GangliaContext31" \
            "$ctx.period=10" \
            "$ctx.servers=$GANGLIA_HOST:8649"
    done > /etc/hadoop/hadoop-metrics.properties
    # metrics v2: one ganglia sink per daemon
    for daemon in namenode datanode resourcemanager nodemanager mrappmaster jobhistoryserver; do
        printf '%s\n' \
            "$daemon.sink.ganglia.class=org.apache.hadoop.metrics2.sink.ganglia.GangliaSink31" \
            "$daemon.sink.ganglia.period=10" \
            "$daemon.sink.ganglia.supportsparse=true" \
            "$daemon.sink.ganglia.slope=jvm.metrics.gcCount=zero,jvm.metrics.memHeapUsedM=both" \
            "$daemon.sink.ganglia.dmax=jvm.metrics.threadsBlocked=70,jvm.metrics.memHeapUsedM=40" \
            "$daemon.sink.ganglia.servers=$GANGLIA_HOST:8649"
    done > /etc/hadoop/hadoop-metrics2.properties
 fi
 # Save Container IP in ENV variable.
 # The helper has to be sourced: run as a child process, its `export MY_CONTAINER_IP`
 # only affects that child and the command exec'd below never sees the variable.
 . /usr/bin/export_container_ip.sh
 # Replace this shell with the container command so it receives signals directly
 exec "$@"
--- a/docker/hoodie/hadoop/base/export_container_ip.sh
+++ b/docker/hoodie/hadoop/base/export_container_ip.sh
@@ -0,0 +1,13 @@
 #!/bin/bash
 # Finds the first non-loopback IPv4 address on en0 or eth0 (tried in that order) and
 # exports it as MY_CONTAINER_IP. Source this file if the caller needs the variable:
 # executing it only sets the variable inside the child process.
 interfaces=( "en0" "eth0" )
 ipAddr=""
 for interface in "${interfaces[@]}"
 do
  # Accepts both "inet 1.2.3.4" and "inet addr:1.2.3.4" ifconfig formats.
  # head -n 1 keeps a single address; a bare `head` kept up to ten lines, which gave a
  # multi-line value on interfaces with several addresses.
  ipAddr=$(ifconfig "$interface" | grep -Eo 'inet (addr:)?([0-9]+\.){3}[0-9]+' | grep -Eo '([0-9]+\.){3}[0-9]+' | grep -v '127.0.0.1' | head -n 1)
  if [ -n "$ipAddr" ]; then
    break
  fi
 done
 echo "Container IP is set to : $ipAddr"
 export MY_CONTAINER_IP=$ipAddr
--- a/docker/hoodie/hadoop/base/pom.xml
+++ b/docker/hoodie/hadoop/base/pom.xml
@@ -0,0 +1,90 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <!--
  ~ Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
  ~
  ~ Licensed under the Apache License, Version 2.0 (the "License");
  ~ you may not use this file except in compliance with the License.
  ~ You may obtain a copy of the License at
  ~
  ~          http://www.apache.org/licenses/LICENSE-2.0
  ~
  ~ Unless required by applicable law or agreed to in writing, software
  ~ distributed under the License is distributed on an "AS IS" BASIS,
  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  ~ See the License for the specific language governing permissions and
  ~ limitations under the License.
  -->
 <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <!-- Builds and tags the Hadoop base docker image (repository varadarb/hudi-hadoop_<hadoop version>-base). -->
  <parent>
    <artifactId>hoodie-hadoop-docker</artifactId>
    <groupId>com.uber.hoodie</groupId>
    <version>0.4.5-SNAPSHOT</version>
  </parent>
  <modelVersion>4.0.0</modelVersion>
  <!-- pom packaging: this module produces no jar, only the docker image -->
  <packaging>pom</packaging>
  <artifactId>hoodie-hadoop-base-docker</artifactId>
  <description>Base Docker Image with Hoodie</description>
  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <checkstyle.skip>true</checkstyle.skip>
  </properties>
  <dependencies>
    <!-- NOTE(review): "import" scope is normally only honoured inside dependencyManagement;
         presumably this entry only exists to order the reactor build - confirm. -->
    <dependency>
      <groupId>com.uber.hoodie</groupId>
      <artifactId>hoodie-hadoop-docker</artifactId>
      <version>${project.version}</version>
      <type>pom</type>
      <scope>import</scope>
    </dependency>
  </dependencies>
  <build>
    <finalName>hoodie</finalName>
    <plugins>
      <!-- Build Docker image -->
      <!-- Two executions in the pre-integration-test phase: one tags the image "latest",
           the other tags it with the project version. Both are skipped when
           docker.build.skip is true; pushing is left commented out. -->
      <plugin>
        <groupId>com.spotify</groupId>
        <artifactId>dockerfile-maven-plugin</artifactId>
        <version>${dockerfile.maven.version}</version>
        <executions>
          <execution>
            <id>tag-latest</id>
            <phase>pre-integration-test</phase>
            <goals>
              <goal>build</goal>
              <goal>tag</goal>
              <!-- <goal>push</goal> -->
            </goals>
            <configuration>
              <skip>${docker.build.skip}</skip>
              <pullNewerImage>false</pullNewerImage>
              <repository>varadarb/hudi-hadoop_${docker.hadoop.version}-base</repository>
              <forceTags>true</forceTags>
              <tag>latest</tag>
            </configuration>
          </execution>
          <execution>
            <id>tag-version</id>
            <phase>pre-integration-test</phase>
            <goals>
              <goal>build</goal>
              <goal>tag</goal>
              <!-- <goal>push</goal> -->
            </goals>
            <configuration>
              <skip>${docker.build.skip}</skip>
              <pullNewerImage>false</pullNewerImage>
              <repository>varadarb/hudi-hadoop_${docker.hadoop.version}-base</repository>
              <forceTags>true</forceTags>
              <tag>${project.version}</tag>
            </configuration>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>
 </project>
--- a/docker/hoodie/hadoop/datanode/Dockerfile
+++ b/docker/hoodie/hadoop/datanode/Dockerfile
@@ -0,0 +1,14 @@
 # HDFS datanode image built on top of the Hadoop base image.
 ARG HADOOP_VERSION=2.8.4
 ARG HADOOP_DN_PORT=50075
 FROM varadarb/hudi-hadoop_${HADOOP_VERSION}-base:latest
 # An ARG declared before FROM is only visible to the FROM line itself. Redeclare it
 # (without a value) inside the build stage so the default reaches the ENV below;
 # without this HADOOP_DN_PORT was set to an empty string.
 ARG HADOOP_DN_PORT
 ENV HADOOP_DN_PORT ${HADOOP_DN_PORT}
 # Picked up by the base entrypoint as dfs.datanode.data.dir, and by run_dn.sh for its directory check
 ENV HDFS_CONF_dfs_datanode_data_dir=file:///hadoop/dfs/data
 RUN mkdir -p /hadoop/dfs/data
 VOLUME /hadoop/dfs/data
 COPY run_dn.sh /run_dn.sh
 RUN chmod a+x /run_dn.sh
 CMD ["/run_dn.sh"]
--- a/docker/hoodie/hadoop/datanode/pom.xml
+++ b/docker/hoodie/hadoop/datanode/pom.xml
@@ -0,0 +1,89 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <!--
  ~ Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
  ~
  ~ Licensed under the Apache License, Version 2.0 (the "License");
  ~ you may not use this file except in compliance with the License.
  ~ You may obtain a copy of the License at
  ~
  ~          http://www.apache.org/licenses/LICENSE-2.0
  ~
  ~ Unless required by applicable law or agreed to in writing, software
  ~ distributed under the License is distributed on an "AS IS" BASIS,
  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  ~ See the License for the specific language governing permissions and
  ~ limitations under the License.
  -->
 <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <!-- Builds and tags the HDFS datanode docker image (repository varadarb/hudi-hadoop_<hadoop version>-datanode). -->
  <parent>
    <artifactId>hoodie-hadoop-docker</artifactId>
    <groupId>com.uber.hoodie</groupId>
    <version>0.4.5-SNAPSHOT</version>
  </parent>
  <modelVersion>4.0.0</modelVersion>
  <!-- pom packaging: this module produces no jar, only the docker image -->
  <packaging>pom</packaging>
  <artifactId>hoodie-hadoop-datanode-docker</artifactId>
  <!-- Description previously duplicated the base module's text -->
  <description>Hadoop Datanode Docker Image with Hoodie</description>
  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <checkstyle.skip>true</checkstyle.skip>
  </properties>
  <dependencies>
    <!-- The datanode Dockerfile starts FROM the base image built by this module.
         NOTE(review): "import" scope is normally only honoured inside dependencyManagement;
         presumably this entry only exists to order the reactor build - confirm. -->
    <dependency>
      <groupId>com.uber.hoodie</groupId>
      <artifactId>hoodie-hadoop-base-docker</artifactId>
      <version>${project.version}</version>
      <type>pom</type>
      <scope>import</scope>
    </dependency>
  </dependencies>
  <build>
    <plugins>
      <!-- Build Docker image -->
      <!-- Two executions in the pre-integration-test phase: one tags the image "latest",
           the other tags it with the project version. Both are skipped when
           docker.build.skip is true; pushing is left commented out. -->
      <plugin>
        <groupId>com.spotify</groupId>
        <artifactId>dockerfile-maven-plugin</artifactId>
        <version>${dockerfile.maven.version}</version>
        <executions>
          <execution>
            <id>tag-latest</id>
            <phase>pre-integration-test</phase>
            <goals>
              <goal>build</goal>
              <goal>tag</goal>
              <!-- <goal>push</goal> -->
            </goals>
            <configuration>
              <skip>${docker.build.skip}</skip>
              <pullNewerImage>false</pullNewerImage>
              <repository>varadarb/hudi-hadoop_${docker.hadoop.version}-datanode</repository>
              <forceTags>true</forceTags>
              <tag>latest</tag>
            </configuration>
          </execution>
          <execution>
            <id>tag-version</id>
            <phase>pre-integration-test</phase>
            <goals>
              <goal>build</goal>
              <goal>tag</goal>
              <!-- <goal>push</goal> -->
            </goals>
            <configuration>
              <skip>${docker.build.skip}</skip>
              <pullNewerImage>false</pullNewerImage>
              <repository>varadarb/hudi-hadoop_${docker.hadoop.version}-datanode</repository>
              <forceTags>true</forceTags>
              <tag>${project.version}</tag>
            </configuration>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>
 </project>
--- a/docker/hoodie/hadoop/datanode/run_dn.sh
+++ b/docker/hoodie/hadoop/datanode/run_dn.sh
@@ -0,0 +1,9 @@
 #!/bin/bash
 # Container command for the datanode image: checks that the configured data directory
 # exists (exit code 2 otherwise), then runs the HDFS datanode.
 # Strip the file:// scheme from the configured data dir to get a local path
 datadir=$(echo "$HDFS_CONF_dfs_datanode_data_dir" | perl -pe 's#file://##')
 # Quoted on purpose: with an empty value the unquoted test collapsed to `[ ! -d ]`,
 # which is false, so the missing-directory check was silently skipped.
 if [ ! -d "$datadir" ]; then
  echo "Datanode data directory not found: $datadir"
  exit 2
 fi
 # exec so the datanode replaces this shell and receives container signals directly
 exec "$HADOOP_PREFIX/bin/hdfs" --config "$HADOOP_CONF_DIR" datanode
--- a/docker/hoodie/hadoop/historyserver/Dockerfile
+++ b/docker/hoodie/hadoop/historyserver/Dockerfile
@@ -0,0 +1,14 @@
 # YARN history (timeline) server image built on top of the Hadoop base image.
 ARG HADOOP_VERSION=2.8.4
 ARG HADOOP_HISTORY_PORT=8188
 FROM varadarb/hudi-hadoop_${HADOOP_VERSION}-base:latest
 # An ARG declared before FROM is only visible to the FROM line itself. Redeclare it
 # (without a value) inside the build stage so the default reaches the ENV below;
 # without this HADOOP_HISTORY_PORT was set to an empty string.
 ARG HADOOP_HISTORY_PORT
 ENV HADOOP_HISTORY_PORT ${HADOOP_HISTORY_PORT}
 # Picked up by the base entrypoint as yarn.timeline-service.leveldb-timeline-store.path
 ENV YARN_CONF_yarn_timeline___service_leveldb___timeline___store_path=/hadoop/yarn/timeline
 RUN mkdir -p /hadoop/yarn/timeline
 VOLUME /hadoop/yarn/timeline
 COPY run_history.sh /run_history.sh
 RUN chmod a+x /run_history.sh
 CMD ["/run_history.sh"]
--- a/docker/hoodie/hadoop/historyserver/pom.xml
+++ b/docker/hoodie/hadoop/historyserver/pom.xml
@@ -0,0 +1,89 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <!--
  ~ Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
  ~
  ~ Licensed under the Apache License, Version 2.0 (the "License");
  ~ you may not use this file except in compliance with the License.
  ~ You may obtain a copy of the License at
  ~
  ~          http://www.apache.org/licenses/LICENSE-2.0
  ~
  ~ Unless required by applicable law or agreed to in writing, software
  ~ distributed under the License is distributed on an "AS IS" BASIS,
  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  ~ See the License for the specific language governing permissions and
  ~ limitations under the License.
  -->
 <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <!-- Builds and tags the YARN history server docker image (repository varadarb/hudi-hadoop_<hadoop version>-history). -->
  <parent>
    <artifactId>hoodie-hadoop-docker</artifactId>
    <groupId>com.uber.hoodie</groupId>
    <version>0.4.5-SNAPSHOT</version>
  </parent>
  <modelVersion>4.0.0</modelVersion>
  <!-- pom packaging: this module produces no jar, only the docker image -->
  <packaging>pom</packaging>
  <artifactId>hoodie-hadoop-history-docker</artifactId>
  <!-- Description previously duplicated the base module's text -->
  <description>Hadoop History Server Docker Image with Hoodie</description>
  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <checkstyle.skip>true</checkstyle.skip>
  </properties>
  <dependencies>
    <!-- The history server Dockerfile starts FROM the base image built by this module.
         NOTE(review): "import" scope is normally only honoured inside dependencyManagement;
         presumably this entry only exists to order the reactor build - confirm. -->
    <dependency>
      <groupId>com.uber.hoodie</groupId>
      <artifactId>hoodie-hadoop-base-docker</artifactId>
      <version>${project.version}</version>
      <type>pom</type>
      <scope>import</scope>
    </dependency>
  </dependencies>
  <build>
    <plugins>
      <!-- Build Docker image -->
      <!-- Two executions in the pre-integration-test phase: one tags the image "latest",
           the other tags it with the project version. Both are skipped when
           docker.build.skip is true; pushing is left commented out. -->
      <plugin>
        <groupId>com.spotify</groupId>
        <artifactId>dockerfile-maven-plugin</artifactId>
        <version>${dockerfile.maven.version}</version>
        <executions>
          <execution>
            <id>tag-latest</id>
            <phase>pre-integration-test</phase>
            <goals>
              <goal>build</goal>
              <goal>tag</goal>
              <!-- <goal>push</goal> -->
            </goals>
            <configuration>
              <skip>${docker.build.skip}</skip>
              <pullNewerImage>false</pullNewerImage>
              <repository>varadarb/hudi-hadoop_${docker.hadoop.version}-history</repository>
              <forceTags>true</forceTags>
              <tag>latest</tag>
            </configuration>
          </execution>
          <execution>
            <id>tag-version</id>
            <phase>pre-integration-test</phase>
            <goals>
              <goal>build</goal>
              <goal>tag</goal>
              <!-- <goal>push</goal> -->
            </goals>
            <configuration>
              <skip>${docker.build.skip}</skip>
              <pullNewerImage>false</pullNewerImage>
              <repository>varadarb/hudi-hadoop_${docker.hadoop.version}-history</repository>
              <forceTags>true</forceTags>
              <tag>${project.version}</tag>
            </configuration>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>
 </project>
--- a/docker/hoodie/hadoop/historyserver/run_history.sh
+++ b/docker/hoodie/hadoop/historyserver/run_history.sh
@@ -0,0 +1,3 @@
 #!/bin/bash
 # Container command for the history server image: runs the YARN history server
 # with the configuration found in $HADOOP_CONF_DIR.
 # exec so the server replaces this shell and receives container signals directly,
 # instead of leaving a bash wrapper between Docker and the server process.
 exec "$HADOOP_PREFIX/bin/yarn" --config "$HADOOP_CONF_DIR" historyserver
--- a/docker/hoodie/hadoop/hive_base/Dockerfile
+++ b/docker/hoodie/hadoop/hive_base/Dockerfile
@@ -0,0 +1,51 @@
 # Hive image layered on the Hadoop base image; also carries the Hoodie bundle jars from target/.
 ARG HADOOP_VERSION=2.8.4
 FROM varadarb/hudi-hadoop_${HADOOP_VERSION}-base:latest
 ENV HIVE_HOME /opt/hive
 ENV PATH $HIVE_HOME/bin:$PATH
 # HADOOP_VERSION here resolves from the ENV set in the base image
 # (the ARG declared before FROM is not visible past the FROM line)
 ENV HADOOP_HOME /opt/hadoop-$HADOOP_VERSION
 WORKDIR /opt
 ARG HIVE_VERSION=2.3.3
 ARG HIVE_URL=https://archive.apache.org/dist/hive/hive-$HIVE_VERSION/apache-hive-$HIVE_VERSION-bin.tar.gz
 ENV HIVE_VERSION ${HIVE_VERSION}
 ENV HIVE_URL ${HIVE_URL}
 #Install Hive MySQL, PostgreSQL JDBC
 # NOTE(review): nothing visible here installs /usr/share/java/mysql-connector-java.jar,
 # so the MySQL symlink presumably dangles - confirm.
 RUN echo "Hive URL is :${HIVE_URL}" && wget "${HIVE_URL}" -O hive.tar.gz && \
     tar -xzvf hive.tar.gz && mv *hive*-bin hive && \
     ln -s /usr/share/java/mysql-connector-java.jar $HIVE_HOME/lib/mysql-connector-java.jar && \
     wget https://jdbc.postgresql.org/download/postgresql-9.4.1212.jar -O $HIVE_HOME/lib/postgresql-jdbc.jar && \
     rm hive.tar.gz && mkdir -p /var/hoodie/ws/docker/hoodie/hadoop/hive_base/target/
 #Spark should be compiled with Hive to be able to use it
 #hive-site.xml should be copied to $SPARK_HOME/conf folder
 #Custom configuration goes here
 # COPY is preferred over ADD for plain local files; destinations are unchanged
 COPY conf/hive-site.xml $HADOOP_CONF_DIR
 COPY conf/beeline-log4j2.properties $HIVE_HOME/conf
 COPY conf/hive-env.sh $HIVE_HOME/conf
 COPY conf/hive-exec-log4j2.properties $HIVE_HOME/conf
 COPY conf/hive-log4j2.properties $HIVE_HOME/conf
 COPY conf/ivysettings.xml $HIVE_HOME/conf
 COPY conf/llap-daemon-log4j2.properties $HIVE_HOME/conf
 # Setup Hoodie Library jars
 COPY target/ /var/hoodie/ws/docker/hoodie/hadoop/hive_base/target/
 ENV HUDI_HADOOP_BUNDLE=/var/hoodie/ws/docker/hoodie/hadoop/hive_base/target/hoodie-hadoop-mr-bundle.jar
 ENV HUDI_HIVE_BUNDLE=/var/hoodie/ws/docker/hoodie/hadoop/hive_base/target/hoodie-hive-bundle.jar
 ENV HUDI_SPARK_BUNDLE=/var/hoodie/ws/docker/hoodie/hadoop/hive_base/target/hoodie-spark-bundle.jar
 ENV HUDI_UTILITIES_BUNDLE=/var/hoodie/ws/docker/hoodie/hadoop/hive_base/target/hoodie-utilities.jar
 COPY startup.sh /usr/local/bin/
 RUN chmod +x /usr/local/bin/startup.sh
 COPY entrypoint.sh /usr/local/bin/
 RUN chmod +x /usr/local/bin/entrypoint.sh
 # $HIVE_HOME/bin is already on PATH (set above), so the second, duplicate ENV PATH was dropped.
 # entrypoint.sh and startup.sh are resolved through PATH.
 ENTRYPOINT ["entrypoint.sh"]
 # NOTE(review): shell-form CMD reaches the entrypoint as `/bin/sh -c startup.sh`; kept as-is
 CMD startup.sh
--- a/docker/hoodie/hadoop/hive_base/conf/beeline-log4j2.properties
+++ b/docker/hoodie/hadoop/hive_base/conf/beeline-log4j2.properties
@@ -0,0 +1,45 @@
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 status = INFO
 name = BeelineLog4j2
 packages = org.apache.hadoop.hive.ql.log
 # list of properties
 property.hive.log.level = WARN
 property.hive.root.logger = console
 # list of all appenders
 appenders = console
 # console appender
 appender.console.type = Console
 appender.console.name = console
 appender.console.target = SYSTEM_ERR
 appender.console.layout.type = PatternLayout
 appender.console.layout.pattern = %d{yy/MM/dd HH:mm:ss} [%t]: %p %c{2}: %m%n
 # list of all loggers
 loggers = HiveConnection
 # HiveConnection logs useful info for dynamic service discovery
 logger.HiveConnection.name = org.apache.hive.jdbc.HiveConnection
 logger.HiveConnection.level = INFO
 # root logger
 rootLogger.level = ${sys:hive.log.level}
 rootLogger.appenderRefs = root
 rootLogger.appenderRef.root.ref = ${sys:hive.root.logger}
--- a/docker/hoodie/hadoop/hive_base/conf/hive-env.sh
+++ b/docker/hoodie/hadoop/hive_base/conf/hive-env.sh
@@ -0,0 +1,54 @@
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # Set Hive and Hadoop environment variables here. These variables can be used
 # to control the execution of Hive. It should be used by admins to configure
 # the Hive installation (so that users do not have to set environment variables
 # or set command line parameters to get correct behavior).
 #
 # The hive service being invoked (CLI/HWI etc.) is available via the environment
 # variable SERVICE
 # Hive Client memory usage can be an issue if a large number of clients
 # are running at the same time. The flags below have been useful in 
 # reducing memory usage:
 #
 # if [ "$SERVICE" = "cli" ]; then
 #   if [ -z "$DEBUG" ]; then
 #     export HADOOP_OPTS="$HADOOP_OPTS -XX:NewRatio=12 -Xms10m -XX:MaxHeapFreeRatio=40 -XX:MinHeapFreeRatio=15 -XX:+UseParNewGC -XX:-UseGCOverheadLimit"
 #   else
 #     export HADOOP_OPTS="$HADOOP_OPTS -XX:NewRatio=12 -Xms10m -XX:MaxHeapFreeRatio=40 -XX:MinHeapFreeRatio=15 -XX:-UseGCOverheadLimit"
 #   fi
 # fi
 # The heap size of the JVM started by the hive shell script can be controlled via:
 #
 # export HADOOP_HEAPSIZE=1024
 #
 # Larger heap size may be required when running queries over large number of files or partitions. 
 # By default hive shell scripts use a heap size of 256 (MB).  Larger heap size would also be 
 # appropriate for hive server (hwi etc).
 # Set HADOOP_HOME to point to a specific hadoop install directory
 # HADOOP_HOME=${bin}/../../hadoop
 # Hive Configuration Directory can be controlled by:
 # export HIVE_CONF_DIR=
 # Folder containing extra libraries required for hive compilation/execution can be controlled by:
 # export HIVE_AUX_JARS_PATH=
--- a/docker/hoodie/hadoop/hive_base/conf/hive-exec-log4j2.properties
+++ b/docker/hoodie/hadoop/hive_base/conf/hive-exec-log4j2.properties
@@ -0,0 +1,66 @@
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 status = INFO
 name = HiveExecLog4j2
 packages = org.apache.hadoop.hive.ql.log
 # list of properties
 property.hive.log.level = INFO
 property.hive.root.logger = FA
 property.hive.query.id = hadoop
 property.hive.log.dir = ${sys:java.io.tmpdir}/${sys:user.name}
 property.hive.log.file = ${sys:hive.query.id}.log
 # list of all appenders
 appenders = console, FA
 # console appender
 appender.console.type = Console
 appender.console.name = console
 appender.console.target = SYSTEM_ERR
 appender.console.layout.type = PatternLayout
 appender.console.layout.pattern = %d{yy/MM/dd HH:mm:ss} [%t]: %p %c{2}: %m%n
 # simple file appender
 appender.FA.type = File
 appender.FA.name = FA
 appender.FA.fileName = ${sys:hive.log.dir}/${sys:hive.log.file}
 appender.FA.layout.type = PatternLayout
 appender.FA.layout.pattern = %d{ISO8601} %-5p [%t]: %c{2} (%F:%M(%L)) - %m%n
 # list of all loggers
 loggers = NIOServerCnxn, ClientCnxnSocketNIO, DataNucleus, Datastore, JPOX
 logger.NIOServerCnxn.name = org.apache.zookeeper.server.NIOServerCnxn
 logger.NIOServerCnxn.level = WARN
 logger.ClientCnxnSocketNIO.name = org.apache.zookeeper.ClientCnxnSocketNIO
 logger.ClientCnxnSocketNIO.level = WARN
 logger.DataNucleus.name = DataNucleus
 logger.DataNucleus.level = ERROR
 logger.Datastore.name = Datastore
 logger.Datastore.level = ERROR
 logger.JPOX.name = JPOX
 logger.JPOX.level = ERROR
 # root logger
 rootLogger.level = ${sys:hive.log.level}
 rootLogger.appenderRefs = root
 rootLogger.appenderRef.root.ref = ${sys:hive.root.logger}
--- a/docker/hoodie/hadoop/hive_base/conf/hive-log4j2.properties
+++ b/docker/hoodie/hadoop/hive_base/conf/hive-log4j2.properties
@@ -0,0 +1,73 @@
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 status = INFO
 name = HiveLog4j2
 packages = org.apache.hadoop.hive.ql.log
 # list of properties
 property.hive.log.level = INFO
 property.hive.root.logger = DRFA
 property.hive.log.dir = ${sys:java.io.tmpdir}/${sys:user.name}
 property.hive.log.file = hive.log
 # list of all appenders
 appenders = console, DRFA
 # console appender
 appender.console.type = Console
 appender.console.name = console
 appender.console.target = SYSTEM_ERR
 appender.console.layout.type = PatternLayout
 appender.console.layout.pattern = %d{yy/MM/dd HH:mm:ss} [%t]: %p %c{2}: %m%n
 # daily rolling file appender
 appender.DRFA.type = RollingFile
 appender.DRFA.name = DRFA
 appender.DRFA.fileName = ${sys:hive.log.dir}/${sys:hive.log.file}
 # Use %pid in the filePattern to append <process-id>@<host-name> to the filename if you want separate log files for different CLI sessions
 appender.DRFA.filePattern = ${sys:hive.log.dir}/${sys:hive.log.file}.%d{yyyy-MM-dd}
 appender.DRFA.layout.type = PatternLayout
 appender.DRFA.layout.pattern = %d{ISO8601} %-5p [%t]: %c{2} (%F:%M(%L)) - %m%n
 appender.DRFA.policies.type = Policies
 appender.DRFA.policies.time.type = TimeBasedTriggeringPolicy
 appender.DRFA.policies.time.interval = 1
 appender.DRFA.policies.time.modulate = true
 appender.DRFA.strategy.type = DefaultRolloverStrategy
 appender.DRFA.strategy.max = 30
 # list of all loggers
 loggers = NIOServerCnxn, ClientCnxnSocketNIO, DataNucleus, Datastore, JPOX
 logger.NIOServerCnxn.name = org.apache.zookeeper.server.NIOServerCnxn
 logger.NIOServerCnxn.level = WARN
 logger.ClientCnxnSocketNIO.name = org.apache.zookeeper.ClientCnxnSocketNIO
 logger.ClientCnxnSocketNIO.level = WARN
 logger.DataNucleus.name = DataNucleus
 logger.DataNucleus.level = ERROR
 logger.Datastore.name = Datastore
 logger.Datastore.level = ERROR
 logger.JPOX.name = JPOX
 logger.JPOX.level = ERROR
 # root logger
 rootLogger.level = ${sys:hive.log.level}
 rootLogger.appenderRefs = root
 rootLogger.appenderRef.root.ref = ${sys:hive.root.logger}
--- a/docker/hoodie/hadoop/hive_base/conf/hive-site.xml
+++ b/docker/hoodie/hadoop/hive_base/conf/hive-site.xml
@@ -0,0 +1,18 @@
 <?xml version="1.0" encoding="UTF-8" standalone="no"?>
 <?xml-stylesheet type="text/xsl" href="configuration.xsl"?><!--
   Licensed to the Apache Software Foundation (ASF) under one or more
   contributor license agreements.  See the NOTICE file distributed with
   this work for additional information regarding copyright ownership.
   The ASF licenses this file to You under the Apache License, Version 2.0
   (the "License"); you may not use this file except in compliance with
   the License.  You may obtain a copy of the License at
       http://www.apache.org/licenses/LICENSE-2.0
   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
 --><configuration>
 </configuration>
--- a/docker/hoodie/hadoop/hive_base/conf/ivysettings.xml
+++ b/docker/hoodie/hadoop/hive_base/conf/ivysettings.xml
@@ -0,0 +1,45 @@
 <!--
   Licensed to the Apache Software Foundation (ASF) under one or more
   contributor license agreements.  See the NOTICE file distributed with
   this work for additional information regarding copyright ownership.
   The ASF licenses this file to You under the Apache License, Version 2.0
   (the "License"); you may not use this file except in compliance with
   the License.  You may obtain a copy of the License at
       http://www.apache.org/licenses/LICENSE-2.0
   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
   -->
 <!--This file is used by grapes to download dependencies from a maven repository.
    This is just a template and can be edited to add more repositories.
 -->
 <ivysettings>
  <!--name of the defaultResolver should always be 'downloadGrapes'. -->
  <settings defaultResolver="downloadGrapes"/>
  <!-- Only set maven.local.repository if not already set -->
  <property name="maven.local.repository" value="${user.home}/.m2/repository" override="false" />
  <property name="m2-pattern"
            value="file:${maven.local.repository}/[organisation]/[module]/[revision]/[module]-[revision](-[classifier]).[ext]"
            override="false"/>
  <resolvers>
    <!-- more resolvers can be added here -->
    <chain name="downloadGrapes">
      <!-- This resolver uses ibiblio to find artifacts, compatible with maven2 repository -->
      <ibiblio name="central" m2compatible="true"/>
      <url name="local-maven2" m2compatible="true">
        <artifact pattern="${m2-pattern}"/>
      </url>
      <!-- File resolver to add jars from the local system. -->
      <filesystem name="test" checkmodified="true">
        <artifact pattern="/tmp/[module]-[revision](-[classifier]).jar"/>
      </filesystem>
    </chain>
  </resolvers>
 </ivysettings>
--- a/docker/hoodie/hadoop/hive_base/conf/llap-daemon-log4j2.properties
+++ b/docker/hoodie/hadoop/hive_base/conf/llap-daemon-log4j2.properties
@@ -0,0 +1,93 @@
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 status = INFO
 name = LlapDaemonLog4j2
 packages = org.apache.hadoop.hive.ql.log
 # list of properties
 property.llap.daemon.log.level = INFO
 property.llap.daemon.root.logger = console
 property.llap.daemon.log.dir = .
 property.llap.daemon.log.file = llapdaemon.log
 property.llap.daemon.historylog.file = llapdaemon_history.log
 property.llap.daemon.log.maxfilesize = 256MB
 property.llap.daemon.log.maxbackupindex = 20
 # list of all appenders
 appenders = console, RFA, HISTORYAPPENDER
 # console appender
 appender.console.type = Console
 appender.console.name = console
 appender.console.target = SYSTEM_ERR
 appender.console.layout.type = PatternLayout
 appender.console.layout.pattern = %d{yy/MM/dd HH:mm:ss} [%t%x] %p %c{2} : %m%n
 # rolling file appender
 appender.RFA.type = RollingFile
 appender.RFA.name = RFA
 appender.RFA.fileName = ${sys:llap.daemon.log.dir}/${sys:llap.daemon.log.file}
 appender.RFA.filePattern = ${sys:llap.daemon.log.dir}/${sys:llap.daemon.log.file}_%i
 appender.RFA.layout.type = PatternLayout
 appender.RFA.layout.pattern = %d{ISO8601} %-5p [%t%x]: %c{2} (%F:%M(%L)) - %m%n
 appender.RFA.policies.type = Policies
 appender.RFA.policies.size.type = SizeBasedTriggeringPolicy
 appender.RFA.policies.size.size = ${sys:llap.daemon.log.maxfilesize}
 appender.RFA.strategy.type = DefaultRolloverStrategy
 appender.RFA.strategy.max = ${sys:llap.daemon.log.maxbackupindex}
 # history file appender
 appender.HISTORYAPPENDER.type = RollingFile
 appender.HISTORYAPPENDER.name = HISTORYAPPENDER
 appender.HISTORYAPPENDER.fileName = ${sys:llap.daemon.log.dir}/${sys:llap.daemon.historylog.file}
 appender.HISTORYAPPENDER.filePattern = ${sys:llap.daemon.log.dir}/${sys:llap.daemon.historylog.file}_%i
 appender.HISTORYAPPENDER.layout.type = PatternLayout
 appender.HISTORYAPPENDER.layout.pattern = %m%n
 appender.HISTORYAPPENDER.policies.type = Policies
 appender.HISTORYAPPENDER.policies.size.type = SizeBasedTriggeringPolicy
 appender.HISTORYAPPENDER.policies.size.size = ${sys:llap.daemon.log.maxfilesize}
 appender.HISTORYAPPENDER.strategy.type = DefaultRolloverStrategy
 appender.HISTORYAPPENDER.strategy.max = ${sys:llap.daemon.log.maxbackupindex}
 # list of all loggers
 loggers = NIOServerCnxn, ClientCnxnSocketNIO, DataNucleus, Datastore, JPOX, HistoryLogger
 logger.NIOServerCnxn.name = org.apache.zookeeper.server.NIOServerCnxn
 logger.NIOServerCnxn.level = WARN
 logger.ClientCnxnSocketNIO.name = org.apache.zookeeper.ClientCnxnSocketNIO
 logger.ClientCnxnSocketNIO.level = WARN
 logger.DataNucleus.name = DataNucleus
 logger.DataNucleus.level = ERROR
 logger.Datastore.name = Datastore
 logger.Datastore.level = ERROR
 logger.JPOX.name = JPOX
 logger.JPOX.level = ERROR
 logger.HistoryLogger.name = org.apache.hadoop.hive.llap.daemon.HistoryLogger
 logger.HistoryLogger.level = INFO
 logger.HistoryLogger.additivity = false
 logger.HistoryLogger.appenderRefs = HistoryAppender
 logger.HistoryLogger.appenderRef.HistoryAppender.ref = HISTORYAPPENDER
 # root logger
 rootLogger.level = ${sys:llap.daemon.log.level}
 rootLogger.appenderRefs = root
 rootLogger.appenderRef.root.ref = ${sys:llap.daemon.root.logger}
--- a/docker/hoodie/hadoop/hive_base/entrypoint.sh
+++ b/docker/hoodie/hadoop/hive_base/entrypoint.sh
@@ -0,0 +1,118 @@
 #!/bin/bash
 # Default the filesystem URI to hdfs://<this host's FQDN>:8020 unless the
 # caller already supplied a non-empty CORE_CONF_fs_defaultFS.
 : "${CORE_CONF_fs_defaultFS:=hdfs://$(hostname -f):8020}"
 export CORE_CONF_fs_defaultFS
 # Inserts a <property> element for name/value into a Hadoop-style XML config
 # file, on a new line directly before the line containing </configuration>.
 #   $1 path  - XML file to edit in place
 #   $2 name  - property name
 #   $3 value - property value (written verbatim, no XML escaping is applied)
 function addProperty() {
  local path=$1
  local name=$2
  local value=$3
  local entry="<property><name>$name</name><value>${value}</value></property>"
  local escapedEntry
  # The entry is used as a sed replacement, so every character sed treats
  # specially there must be escaped: the "/" delimiter, "&" (which would expand
  # to the matched line, corrupting e.g. JDBC URLs) and "\" itself.
  # printf with a quoted argument avoids the word splitting and globbing that
  # an unquoted echo would apply to the value.
  escapedEntry=$(printf '%s\n' "$entry" | sed -e 's|[\\/&]|\\&|g')
  sed -i "/<\/configuration>/ s/.*/${escapedEntry}\n&/" "$path"
 }
 function configure() {
    local path=$1
    local module=$2
    local envPrefix=$3
    local var
    local value
    echo "Configuring $module"
    for c in `printenv | perl -sne 'print "$1 " if m/^${envPrefix}_(.+?)=.*/' -- -envPrefix=$envPrefix`; do 
        name=`echo ${c} | perl -pe 's/___/-/g; s/__/_/g; s/_/./g'`
        var="${envPrefix}_${c}"
        value=${!var}
        echo " - Setting $name=$value"
        addProperty $path $name "$value"
    done
 }
 # Populate each /etc/hadoop/<site>-site.xml from its <PREFIX>_* environment
 # variables. Entries are "<site>:<PREFIX>" and are processed in this order.
 for site_and_prefix in core:CORE_CONF hdfs:HDFS_CONF yarn:YARN_CONF httpfs:HTTPFS_CONF \
                        kms:KMS_CONF mapred:MAPRED_CONF hive:HIVE_SITE_CONF; do
    site=${site_and_prefix%%:*}
    configure "/etc/hadoop/${site}-site.xml" "$site" "${site_and_prefix#*:}"
 done
 unset site site_and_prefix
 # When MULTIHOMED_NETWORK=1, set the bind-host of the HDFS and YARN daemons to
 # 0.0.0.0 and switch HDFS clients/datanodes to hostname-based addressing.
 if [ "$MULTIHOMED_NETWORK" = "1" ]; then
    echo "Configuring for multihomed network"
    # HDFS
    addProperty /etc/hadoop/hdfs-site.xml dfs.namenode.rpc-bind-host 0.0.0.0
    addProperty /etc/hadoop/hdfs-site.xml dfs.namenode.servicerpc-bind-host 0.0.0.0
    addProperty /etc/hadoop/hdfs-site.xml dfs.namenode.http-bind-host 0.0.0.0
    addProperty /etc/hadoop/hdfs-site.xml dfs.namenode.https-bind-host 0.0.0.0
    addProperty /etc/hadoop/hdfs-site.xml dfs.client.use.datanode.hostname true
    addProperty /etc/hadoop/hdfs-site.xml dfs.datanode.use.datanode.hostname true
    # YARN (each property is written once; a repeated
    # yarn.nodemanager.bind-host entry only produced a duplicate element)
    addProperty /etc/hadoop/yarn-site.xml yarn.resourcemanager.bind-host 0.0.0.0
    addProperty /etc/hadoop/yarn-site.xml yarn.nodemanager.bind-host 0.0.0.0
    addProperty /etc/hadoop/yarn-site.xml yarn.timeline-service.bind-host 0.0.0.0
    # MAPRED
    # NOTE(review): this writes a YARN nodemanager key into mapred-site.xml;
    # kept as-is, confirm it is intended.
    addProperty /etc/hadoop/mapred-site.xml yarn.nodemanager.bind-host 0.0.0.0
 fi
 # Ganglia reporting: only when GANGLIA_HOST is set. The existing metrics
 # property files are renamed to *.orig and regenerated so that every listed
 # module reports to $GANGLIA_HOST on port 8649.
 if [ -n "$GANGLIA_HOST" ]; then
    mv /etc/hadoop/hadoop-metrics.properties /etc/hadoop/hadoop-metrics.properties.orig
    mv /etc/hadoop/hadoop-metrics2.properties /etc/hadoop/hadoop-metrics2.properties.orig

    # hadoop-metrics.properties: one GangliaContext31 stanza per module.
    {
        for module in mapred jvm rpc ugi; do
            printf '%s\n' \
                "$module.class=org.apache.hadoop.metrics.ganglia.GangliaContext31" \
                "$module.period=10" \
                "$module.servers=$GANGLIA_HOST:8649"
        done
    } > /etc/hadoop/hadoop-metrics.properties

    # hadoop-metrics2.properties: one GangliaSink31 stanza per module.
    {
        for module in namenode datanode resourcemanager nodemanager mrappmaster jobhistoryserver; do
            printf '%s\n' \
                "$module.sink.ganglia.class=org.apache.hadoop.metrics2.sink.ganglia.GangliaSink31" \
                "$module.sink.ganglia.period=10" \
                "$module.sink.ganglia.supportsparse=true" \
                "$module.sink.ganglia.slope=jvm.metrics.gcCount=zero,jvm.metrics.memHeapUsedM=both" \
                "$module.sink.ganglia.dmax=jvm.metrics.threadsBlocked=70,jvm.metrics.memHeapUsedM=40" \
                "$module.sink.ganglia.servers=$GANGLIA_HOST:8649"
        done
    } > /etc/hadoop/hadoop-metrics2.properties
 fi
 # Blocks until a TCP port accepts connections, probing with "nc -z".
 #   $1 serviceport - target in "host:port" form
 # Probes up to 100 times, sleeping 5 seconds between attempts, and logs
 # progress as "[attempt/max] ...". Exits the whole script with status 1 when
 # the last attempt fails.
 function wait_for_it()
 {
    local serviceport=$1
    local service=${serviceport%%:*}
    local port=${serviceport#*:}
    local retry_seconds=5
    local max_try=100
    # The attempt counter is local: a global "i" here would overwrite the loop
    # variable of a caller iterating with "for i in ...".
    local attempt=1

    until nc -z "$service" "$port"; do
      echo "[$attempt/$max_try] check for ${service}:${port}..."
      echo "[$attempt/$max_try] ${service}:${port} is not available yet"
      if (( attempt == max_try )); then
        echo "[$attempt/$max_try] ${service}:${port} is still not available; giving up after ${max_try} tries. :/"
        exit 1
      fi

      echo "[$attempt/$max_try] try in ${retry_seconds}s once again ..."
      attempt=$((attempt + 1))
      sleep $retry_seconds
    done

    echo "[$attempt/$max_try] $service:${port} is available."
 }
 # Wait for every "host:port" entry in SERVICE_PRECONDITION. The expansion is
 # left unquoted on purpose so a space-separated string splits into entries.
 for precondition in ${SERVICE_PRECONDITION[@]}
 do
    wait_for_it "${precondition}"
 done
 # Replace this shell with the container command. "$@" is quoted so that
 # arguments containing whitespace or glob characters reach the command intact.
 exec "$@"
--- a/docker/hoodie/hadoop/hive_base/pom.xml
+++ b/docker/hoodie/hadoop/hive_base/pom.xml
@@ -0,0 +1,113 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <!--
  ~ Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
  ~
  ~ Licensed under the Apache License, Version 2.0 (the "License");
  ~ you may not use this file except in compliance with the License.
  ~ You may obtain a copy of the License at
  ~
  ~          http://www.apache.org/licenses/LICENSE-2.0
  ~
  ~ Unless required by applicable law or agreed to in writing, software
  ~ distributed under the License is distributed on an "AS IS" BASIS,
  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  ~ See the License for the specific language governing permissions and
  ~ limitations under the License.
  -->
 <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <parent>
    <artifactId>hoodie-hadoop-docker</artifactId>
    <groupId>com.uber.hoodie</groupId>
    <version>0.4.5-SNAPSHOT</version>
  </parent>
  <modelVersion>4.0.0</modelVersion>
  <packaging>pom</packaging>
  <artifactId>hoodie-hadoop-hive-docker</artifactId>
  <description>Base Docker Image with Hoodie</description>
  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <checkstyle.skip>true</checkstyle.skip>
  </properties>
  <!-- Declares a dependency on the POM of the base Docker image module. -->
  <!-- NOTE(review): Maven documents the "import" scope only for dependencies of type pom
       inside <dependencyManagement>; confirm it has the intended effect in a plain
       <dependencies> block. -->
  <dependencies>
    <dependency>
      <groupId>com.uber.hoodie</groupId>
      <artifactId>hoodie-hadoop-base-docker</artifactId>
      <version>${project.version}</version>
      <type>pom</type>
      <scope>import</scope>
    </dependency>
  </dependencies>
  <build>
    <plugins>
      <!-- During the package phase, copy the Hudi bundle jars (hadoop-mr, hive, spark) and the
           utilities jar from their modules' target directories into this module's target/
           directory, dropping the version from each file name so the copies have fixed names. -->
      <plugin>
        <artifactId>maven-antrun-plugin</artifactId>
        <version>1.7</version>
        <executions>
          <execution>
            <phase>package</phase>
            <configuration>
              <tasks>
                <copy file="${project.basedir}/../../../../packaging/hoodie-hadoop-mr-bundle/target/hoodie-hadoop-mr-bundle-${project.version}.jar"
                      tofile="target/hoodie-hadoop-mr-bundle.jar" />
                <copy file="${project.basedir}/../../../../packaging/hoodie-hive-bundle/target/hoodie-hive-bundle-${project.version}.jar"
                      tofile="target/hoodie-hive-bundle.jar" />
                <copy file="${project.basedir}/../../../../packaging/hoodie-spark-bundle/target/hoodie-spark-bundle-${project.version}.jar"
                      tofile="target/hoodie-spark-bundle.jar" />
                <copy file="${project.basedir}/../../../../hoodie-utilities/target/hoodie-utilities-${project.version}.jar"
                      tofile="target/hoodie-utilities.jar" />
              </tasks>
            </configuration>
            <goals>
              <goal>run</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
      <!-- Build Docker image -->
      <plugin>
        <groupId>com.spotify</groupId>
        <artifactId>dockerfile-maven-plugin</artifactId>
        <version>${dockerfile.maven.version}</version>
        <executions>
          <execution>
            <id>tag-latest</id>
            <phase>pre-integration-test</phase>
            <goals>
              <goal>build</goal>
              <goal>tag</goal>
              <!-- <goal>push</goal> -->
            </goals>
            <configuration>
              <skip>${docker.build.skip}</skip>
              <pullNewerImage>false</pullNewerImage>
              <repository>varadarb/hudi-hadoop_${docker.hadoop.version}-hive_${docker.hive.version}</repository>
              <forceTags>true</forceTags>
              <tag>latest</tag>
            </configuration>
          </execution>
          <execution>
            <id>tag-version</id>
            <phase>pre-integration-test</phase>
            <goals>
              <goal>build</goal>
              <goal>tag</goal>
              <!-- <goal>push</goal> -->
            </goals>
            <configuration>
              <skip>${docker.build.skip}</skip>
              <pullNewerImage>false</pullNewerImage>
              <repository>varadarb/hudi-hadoop_${docker.hadoop.version}-hive_${docker.hive.version}</repository>
              <forceTags>true</forceTags>
              <tag>${project.version}</tag>
            </configuration>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>
 </project>
--- a/docker/hoodie/hadoop/hive_base/startup.sh
+++ b/docker/hoodie/hadoop/hive_base/startup.sh
@@ -0,0 +1,10 @@
 #!/bin/bash
 # Creates the HDFS directories used by Hive, makes them group-writable, then
 # runs HiveServer2 in the foreground with the Hudi hadoop-mr bundle supplied
 # through AUX_CLASSPATH and hive.aux.jars.path.

 # -p on both mkdir calls keeps this step idempotent: a plain "mkdir /tmp"
 # reports an error whenever the directory already exists.
 hadoop fs -mkdir -p    /tmp
 hadoop fs -mkdir -p    /user/hive/warehouse
 hadoop fs -chmod g+w   /tmp
 hadoop fs -chmod g+w   /user/hive/warehouse

 # Stop here instead of running ./hiveserver2 from the wrong directory when
 # HIVE_HOME is unset or does not exist.
 cd "$HIVE_HOME/bin" || exit 1
 export AUX_CLASSPATH=file://${HUDI_HADOOP_BUNDLE}
 ./hiveserver2 --hiveconf hive.server2.enable.doAs=false  --hiveconf hive.aux.jars.path=file://${HUDI_HADOOP_BUNDLE}
--- a/docker/hoodie/hadoop/namenode/Dockerfile
+++ b/docker/hoodie/hadoop/namenode/Dockerfile
@@ -0,0 +1,14 @@
 # Namenode image, layered on the hudi-hadoop base image.
 # ARGs declared before FROM are only visible to the FROM line itself.
 ARG HADOOP_VERSION=2.8.4
 ARG HADOOP_WEBHDFS_PORT=50070
 FROM varadarb/hudi-hadoop_${HADOOP_VERSION}-base:latest

 # Re-declare the ARG inside the build stage (without a value, so it inherits
 # the default above); otherwise it is out of scope for the ENV below.
 ARG HADOOP_WEBHDFS_PORT
 ENV HADOOP_WEBHDFS_PORT=${HADOOP_WEBHDFS_PORT}

 # Name directory: exported as configuration and declared as a volume.
 ENV HDFS_CONF_dfs_namenode_name_dir=file:///hadoop/dfs/name
 RUN mkdir -p /hadoop/dfs/name
 VOLUME /hadoop/dfs/name

 # COPY is sufficient for a plain local file; ADD's extra features are not needed.
 COPY run_nn.sh /run_nn.sh
 RUN chmod a+x /run_nn.sh
 CMD ["/run_nn.sh"]
--- a/docker/hoodie/hadoop/namenode/pom.xml
+++ b/docker/hoodie/hadoop/namenode/pom.xml
@@ -0,0 +1,89 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <!--
  ~ Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
  ~
  ~ Licensed under the Apache License, Version 2.0 (the "License");
  ~ you may not use this file except in compliance with the License.
  ~ You may obtain a copy of the License at
  ~
  ~          http://www.apache.org/licenses/LICENSE-2.0
  ~
  ~ Unless required by applicable law or agreed to in writing, software
  ~ distributed under the License is distributed on an "AS IS" BASIS,
  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  ~ See the License for the specific language governing permissions and
  ~ limitations under the License.
  -->
 <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <parent>
    <artifactId>hoodie-hadoop-docker</artifactId>
    <groupId>com.uber.hoodie</groupId>
    <version>0.4.5-SNAPSHOT</version>
  </parent>
  <modelVersion>4.0.0</modelVersion>
  <packaging>pom</packaging>
  <artifactId>hoodie-hadoop-namenode-docker</artifactId>
  <description>Base Docker Image with Hoodie</description>
  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <checkstyle.skip>true</checkstyle.skip>
  </properties>
  <!-- Declares a dependency on the POM of the base Docker image module. -->
  <!-- NOTE(review): Maven documents the "import" scope only for dependencies of type pom
       inside <dependencyManagement>; confirm it has the intended effect in a plain
       <dependencies> block. -->
  <dependencies>
    <dependency>
      <groupId>com.uber.hoodie</groupId>
      <artifactId>hoodie-hadoop-base-docker</artifactId>
      <version>${project.version}</version>
      <type>pom</type>
      <scope>import</scope>
    </dependency>
  </dependencies>
  <build>
    <plugins>
      <!-- Build Docker image -->
      <plugin>
        <groupId>com.spotify</groupId>
        <artifactId>dockerfile-maven-plugin</artifactId>
        <version>${dockerfile.maven.version}</version>
        <executions>
          <execution>
            <id>tag-latest</id>
            <phase>pre-integration-test</phase>
            <goals>
              <goal>build</goal>
              <goal>tag</goal>
              <!-- <goal>push</goal> -->
            </goals>
            <configuration>
              <skip>${docker.build.skip}</skip>
              <pullNewerImage>false</pullNewerImage>
              <repository>varadarb/hudi-hadoop_${docker.hadoop.version}-namenode</repository>
              <forceTags>true</forceTags>
              <tag>latest</tag>
            </configuration>
          </execution>
          <execution>
            <id>tag-version</id>
            <phase>pre-integration-test</phase>
            <goals>
              <goal>build</goal>
              <goal>tag</goal>
              <!-- <goal>push</goal> -->
            </goals>
            <configuration>
              <skip>${docker.build.skip}</skip>
              <pullNewerImage>false</pullNewerImage>
              <repository>varadarb/hudi-hadoop_${docker.hadoop.version}-namenode</repository>
              <forceTags>true</forceTags>
              <tag>${project.version}</tag>
            </configuration>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>
 </project>
--- a/docker/hoodie/hadoop/namenode/run_nn.sh
+++ b/docker/hoodie/hadoop/namenode/run_nn.sh
@@ -0,0 +1,19 @@
 #!/bin/bash
 # Starts the HDFS namenode, formatting the name directory first when it is
 # empty. Reads HDFS_CONF_dfs_namenode_name_dir, CLUSTER_NAME, HADOOP_PREFIX
 # and HADOOP_CONF_DIR from the environment. Exits with status 2 when the name
 # directory is missing or CLUSTER_NAME is not set.

 # Drop the file:// scheme to obtain a plain filesystem path.
 namedir=`echo "$HDFS_CONF_dfs_namenode_name_dir" | perl -pe 's#file://##'`

 # Quoting matters: with an empty value an unquoted test collapses to
 # "[ ! -d ]", which is false, so the script would carry on without a directory.
 if [ ! -d "$namedir" ]; then
  echo "Namenode name directory not found: $namedir"
  exit 2
 fi

 if [ -z "$CLUSTER_NAME" ]; then
  echo "Cluster name not specified"
  exit 2
 fi

 # An empty listing means the directory has never been formatted.
 if [ -z "$(ls -A "$namedir")" ]; then
  echo "Formatting namenode name directory: $namedir"
  "$HADOOP_PREFIX/bin/hdfs" --config "$HADOOP_CONF_DIR" namenode -format "$CLUSTER_NAME"
 fi

 "$HADOOP_PREFIX/bin/hdfs" --config "$HADOOP_CONF_DIR" namenode
--- a/docker/hoodie/hadoop/pom.xml
+++ b/docker/hoodie/hadoop/pom.xml
@@ -0,0 +1,78 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <!--
  ~ Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
  ~
  ~ Licensed under the Apache License, Version 2.0 (the "License");
  ~ you may not use this file except in compliance with the License.
  ~ You may obtain a copy of the License at
  ~
  ~          http://www.apache.org/licenses/LICENSE-2.0
  ~
  ~ Unless required by applicable law or agreed to in writing, software
  ~ distributed under the License is distributed on an "AS IS" BASIS,
  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  ~ See the License for the specific language governing permissions and
  ~ limitations under the License.
  -->
 <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
   <modelVersion>4.0.0</modelVersion>
   <!-- Parent/aggregator POM for the Docker image modules listed under <modules>. -->
   <parent>
     <groupId>com.uber.hoodie</groupId>
     <artifactId>hoodie</artifactId>
     <version>0.4.5-SNAPSHOT</version>
     <relativePath>../../../pom.xml</relativePath>
   </parent>
   <artifactId>hoodie-hadoop-docker</artifactId>
   <packaging>pom</packaging>
   <properties>
     <!-- Versions interpolated into the image repository names of the child modules. -->
     <docker.hadoop.version>2.8.4</docker.hadoop.version>
     <docker.hive.version>2.3.3</docker.hive.version>
     <docker.spark.version>2.3.1</docker.spark.version>
     <dockerfile.maven.version>1.4.3</dockerfile.maven.version>
     <!-- Image builds are off by default; enable with -Ddocker.build.skip=false. -->
     <docker.build.skip>true</docker.build.skip>
     <skipITs>false</skipITs>
     <checkstyle.skip>true</checkstyle.skip>
   </properties>
   <modules>
     <module>base</module>
     <module>namenode</module>
     <module>datanode</module>
     <module>historyserver</module>
     <module>hive_base</module>
     <module>spark_base</module>
     <module>sparkmaster</module>
     <module>sparkworker</module>
     <module>sparkadhoc</module>
   </modules>
   <dependencies>
     <dependency>
       <groupId>com.uber.hoodie</groupId>
       <artifactId>hoodie-spark-bundle</artifactId>
       <version>${project.version}</version>
     </dependency>
   </dependencies>
   <build>
     <extensions>
       <extension>
         <groupId>com.spotify</groupId>
         <artifactId>dockerfile-maven-extension</artifactId>
         <version>${dockerfile.maven.version}</version>
       </extension>
     </extensions>
     <plugins>
       <!-- This aggregator has no image of its own, so the plugin is skipped here;
            child modules configure their own executions. -->
       <plugin>
         <groupId>com.spotify</groupId>
         <artifactId>dockerfile-maven-plugin</artifactId>
         <version>${dockerfile.maven.version}</version>
         <configuration>
           <skip>true</skip>
         </configuration>
       </plugin>
     </plugins>
   </build>
 </project>
--- a/docker/hoodie/hadoop/spark_base/Dockerfile
+++ b/docker/hoodie/hadoop/spark_base/Dockerfile
@@ -0,0 +1,46 @@
 # Spark base image: installs a Spark binary distribution under /opt/spark on top of
 # the Hudi Hadoop + Hive image. Other images in this build use this image as their FROM.
 ARG HADOOP_VERSION=2.8.4
 ARG HIVE_VERSION=2.3.3
 FROM varadarb/hudi-hadoop_${HADOOP_VERSION}-hive_${HIVE_VERSION}
 # Settings read by wait-for-step.sh / execute-step.sh / finish-step.sh.
 ENV ENABLE_INIT_DAEMON true
 ENV INIT_DAEMON_BASE_URI http://identifier/init-daemon
 ENV INIT_DAEMON_STEP spark_master_init
 ARG SPARK_VERSION=2.3.1
 ARG SPARK_HADOOP_VERSION=2.7
 ENV SPARK_VERSION ${SPARK_VERSION}
 # NOTE: from here on HADOOP_VERSION holds the Hadoop profile of the Spark binary
 # (SPARK_HADOOP_VERSION), which is what the download file name below needs.
 ENV HADOOP_VERSION ${SPARK_HADOOP_VERSION}
 COPY wait-for-step.sh /
 COPY execute-step.sh /
 COPY finish-step.sh /
 # Fetch from the Apache release archive: regular mirrors only keep current releases,
 # so a pinned older Spark version disappears from them over time.
 RUN echo "Installing Spark-version (${SPARK_VERSION})" \
      && wget https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz \
      && tar -xvzf spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz \
      && mv spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION} /opt/spark \
      && rm spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz
 #Give permission to execute scripts
 RUN chmod +x /wait-for-step.sh /execute-step.sh /finish-step.sh
 # Fix the value of PYTHONHASHSEED
 # Note: this is needed when you use Python 3.3 or greater
 ENV PYTHONHASHSEED 1
 ENV SPARK_HOME /opt/spark
 ENV SPARK_INSTALL ${SPARK_HOME}
 ENV SPARK_CONF_DIR ${SPARK_HOME}/conf
 ENV PATH $SPARK_INSTALL/bin:$PATH
 ENV SPARK_DRIVER_PORT 5001
 ENV SPARK_UI_PORT 5002
 ENV SPARK_BLOCKMGR_PORT 5003
 EXPOSE $SPARK_DRIVER_PORT $SPARK_UI_PORT $SPARK_BLOCKMGR_PORT
 # Without this spark-shell fails - Download if it is not already there in $SPARK_INSTALL
 # The existence check is explicit instead of relying on wget's -nc together with -O,
 # and Maven Central is addressed over HTTPS because it no longer serves plain HTTP.
 RUN [ -f "${SPARK_INSTALL}/jars/jersey-bundle-1.19.4.jar" ] \
      || wget -q -O "${SPARK_INSTALL}/jars/jersey-bundle-1.19.4.jar" "https://repo1.maven.org/maven2/com/sun/jersey/jersey-bundle/1.19.4/jersey-bundle-1.19.4.jar"
--- a/docker/hoodie/hadoop/spark_base/execute-step.sh
+++ b/docker/hoodie/hadoop/spark_base/execute-step.sh
@@ -0,0 +1,14 @@
 #!/bin/bash
 # Tells the init daemon that step INIT_DAEMON_STEP is executing.
 #
 # Does nothing unless ENABLE_INIT_DAEMON is exactly "true". Otherwise it PUTs to
 # $INIT_DAEMON_BASE_URI/execute?step=... every 5 seconds until the response code is 204.
 #
 # Expansions are quoted so an unset ENABLE_INIT_DAEMON does not make `[` fail with a
 # syntax error, and so the `?` in the URL is never treated as a glob character.
 if [ "${ENABLE_INIT_DAEMON}" = "true" ]; then
     echo "Execute step ${INIT_DAEMON_STEP} in pipeline"
     while true; do
         sleep 5
         echo -n '.'
         # -w prints only the HTTP status code; the response body is discarded.
         string=$(curl -sL -w "%{http_code}" -X PUT "${INIT_DAEMON_BASE_URI}/execute?step=${INIT_DAEMON_STEP}" -o /dev/null)
         [ "$string" = "204" ] && break
     done
     echo "Notified execution of step ${INIT_DAEMON_STEP}"
 fi
--- a/docker/hoodie/hadoop/spark_base/finish-step.sh
+++ b/docker/hoodie/hadoop/spark_base/finish-step.sh
@@ -0,0 +1,16 @@
 #!/bin/bash
 # Tells the init daemon that step INIT_DAEMON_STEP has finished.
 #
 # Does nothing unless ENABLE_INIT_DAEMON is exactly "true". Otherwise it PUTs to
 # $INIT_DAEMON_BASE_URI/finish?step=... every 5 seconds until the response code is 204.
 #
 # Expansions are quoted so an unset ENABLE_INIT_DAEMON does not make `[` fail with a
 # syntax error, and so the `?` in the URL is never treated as a glob character.
 if [ "${ENABLE_INIT_DAEMON}" = "true" ]; then
     echo "Finish step ${INIT_DAEMON_STEP} in pipeline"
     while true; do
         sleep 5
         echo -n '.'
         # -w prints only the HTTP status code; the response body is discarded.
         string=$(curl -sL -w "%{http_code}" -X PUT "${INIT_DAEMON_BASE_URI}/finish?step=${INIT_DAEMON_STEP}" -o /dev/null)
         [ "$string" = "204" ] && break
     done
     echo "Notified finish of step ${INIT_DAEMON_STEP}"
 fi
--- a/docker/hoodie/hadoop/spark_base/pom.xml
+++ b/docker/hoodie/hadoop/spark_base/pom.xml
@@ -0,0 +1,89 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <!--
  ~ Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
  ~
  ~ Licensed under the Apache License, Version 2.0 (the "License");
  ~ you may not use this file except in compliance with the License.
  ~ You may obtain a copy of the License at
  ~
  ~          http://www.apache.org/licenses/LICENSE-2.0
  ~
  ~ Unless required by applicable law or agreed to in writing, software
  ~ distributed under the License is distributed on an "AS IS" BASIS,
  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  ~ See the License for the specific language governing permissions and
  ~ limitations under the License.
  -->
 <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
   <modelVersion>4.0.0</modelVersion>
   <parent>
     <groupId>com.uber.hoodie</groupId>
     <artifactId>hoodie-hadoop-docker</artifactId>
     <version>0.4.5-SNAPSHOT</version>
   </parent>
   <artifactId>hoodie-hadoop-sparkbase-docker</artifactId>
   <packaging>pom</packaging>
   <description>Base Docker Image with Hoodie</description>
   <properties>
     <checkstyle.skip>true</checkstyle.skip>
     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
   </properties>
   <dependencies>
     <!-- Declares the Hive image module as a dependency of this module. -->
     <dependency>
       <groupId>com.uber.hoodie</groupId>
       <artifactId>hoodie-hadoop-hive-docker</artifactId>
       <version>${project.version}</version>
       <type>pom</type>
       <scope>import</scope>
     </dependency>
   </dependencies>
   <build>
     <plugins>
       <!-- Builds the image in pre-integration-test and tags it twice:
            once as "latest" and once with the project version. -->
       <plugin>
         <groupId>com.spotify</groupId>
         <artifactId>dockerfile-maven-plugin</artifactId>
         <version>${dockerfile.maven.version}</version>
         <executions>
           <execution>
             <id>tag-latest</id>
             <phase>pre-integration-test</phase>
             <goals>
               <goal>build</goal>
               <goal>tag</goal>
               <!-- <goal>push</goal> -->
             </goals>
             <configuration>
               <repository>varadarb/hudi-hadoop_${docker.hadoop.version}-hive_${docker.hive.version}-sparkbase_${docker.spark.version}</repository>
               <tag>latest</tag>
               <forceTags>true</forceTags>
               <pullNewerImage>false</pullNewerImage>
               <skip>${docker.build.skip}</skip>
             </configuration>
           </execution>
           <execution>
             <id>tag-version</id>
             <phase>pre-integration-test</phase>
             <goals>
               <goal>build</goal>
               <goal>tag</goal>
               <!-- <goal>push</goal> -->
             </goals>
             <configuration>
               <repository>varadarb/hudi-hadoop_${docker.hadoop.version}-hive_${docker.hive.version}-sparkbase_${docker.spark.version}</repository>
               <tag>${project.version}</tag>
               <forceTags>true</forceTags>
               <pullNewerImage>false</pullNewerImage>
               <skip>${docker.build.skip}</skip>
             </configuration>
           </execution>
         </executions>
       </plugin>
     </plugins>
   </build>
 </project>
--- a/docker/hoodie/hadoop/spark_base/wait-for-step.sh
+++ b/docker/hoodie/hadoop/spark_base/wait-for-step.sh
@@ -0,0 +1,13 @@
 #!/bin/bash
 # Blocks until the init daemon reports that step INIT_DAEMON_STEP may start.
 #
 # Does nothing unless ENABLE_INIT_DAEMON is exactly "true". Otherwise it GETs
 # $INIT_DAEMON_BASE_URI/canStart?step=... every 5 seconds until the body is "true".
 #
 # Expansions are quoted so an unset ENABLE_INIT_DAEMON does not make `[` fail with a
 # syntax error, and so the `?` in the URL is never treated as a glob character.
 if [ "${ENABLE_INIT_DAEMON}" = "true" ]; then
     echo "Validating if step ${INIT_DAEMON_STEP} can start in pipeline"
     while true; do
         sleep 5
         echo -n '.'
         string=$(curl -s "${INIT_DAEMON_BASE_URI}/canStart?step=${INIT_DAEMON_STEP}")
         [ "$string" = "true" ] && break
     done
     echo "Can start step ${INIT_DAEMON_STEP}"
 fi
--- a/docker/hoodie/hadoop/sparkadhoc/Dockerfile
+++ b/docker/hoodie/hadoop/sparkadhoc/Dockerfile
@@ -0,0 +1,12 @@
 # Ad-hoc Spark image layered on the shared Spark base image.
 ARG HADOOP_VERSION=2.8.4
 ARG HIVE_VERSION=2.3.3
 ARG SPARK_VERSION=2.3.1
 FROM varadarb/hudi-hadoop_${HADOOP_VERSION}-hive_${HIVE_VERSION}-sparkbase_${SPARK_VERSION}
 # Worker/master settings exported into the container environment.
 ENV SPARK_MASTER="spark://spark-master:7077" \
     SPARK_WORKER_LOG=/spark/logs \
     SPARK_WORKER_WEBUI_PORT=8081
 # adhoc.sh is the script run by CMD below.
 COPY adhoc.sh /opt/spark
 CMD ["/bin/bash", "/opt/spark/adhoc.sh"]
--- a/docker/hoodie/hadoop/sparkadhoc/adhoc.sh
+++ b/docker/hoodie/hadoop/sparkadhoc/adhoc.sh
@@ -0,0 +1,13 @@
 #!/bin/bash
 # Container command for the ad-hoc Spark image: loads the Spark environment, prints
 # the time and SPARK_HOME, then blocks forever so the container stays up.
 #
 # The Spark scripts are sourced from /opt/spark, which is where the base image
 # installs Spark and what master.sh uses; the previous /spark/... paths do not match
 # that install location. SPARK_HOME is exported first so the sourced scripts see it.
 export SPARK_HOME=/opt/spark
 . "${SPARK_HOME}/sbin/spark-config.sh"
 . "${SPARK_HOME}/bin/load-spark-env.sh"
 date
 echo "SPARK HOME is : $SPARK_HOME"
 # Keep the container's main process alive.
 tail -f /dev/null
--- a/docker/hoodie/hadoop/sparkadhoc/pom.xml
+++ b/docker/hoodie/hadoop/sparkadhoc/pom.xml
@@ -0,0 +1,89 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <!--
  ~ Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
  ~
  ~ Licensed under the Apache License, Version 2.0 (the "License");
  ~ you may not use this file except in compliance with the License.
  ~ You may obtain a copy of the License at
  ~
  ~          http://www.apache.org/licenses/LICENSE-2.0
  ~
  ~ Unless required by applicable law or agreed to in writing, software
  ~ distributed under the License is distributed on an "AS IS" BASIS,
  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  ~ See the License for the specific language governing permissions and
  ~ limitations under the License.
  -->
 <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
   <parent>
     <artifactId>hoodie-hadoop-docker</artifactId>
     <groupId>com.uber.hoodie</groupId>
     <version>0.4.5-SNAPSHOT</version>
   </parent>
   <modelVersion>4.0.0</modelVersion>
   <packaging>pom</packaging>
   <artifactId>hoodie-hadoop-sparkadhoc-docker</artifactId>
   <!-- Description now names this module's image instead of repeating the base image text. -->
   <description>Spark Adhoc Docker Image with Hoodie</description>
   <properties>
     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
     <checkstyle.skip>true</checkstyle.skip>
   </properties>
   <dependencies>
     <!-- Declares the Spark base image module as a dependency of this module.
          NOTE(review): the "import" scope is documented for dependencyManagement
          only; confirm it is intended in a plain dependencies section. -->
     <dependency>
       <groupId>com.uber.hoodie</groupId>
       <artifactId>hoodie-hadoop-sparkbase-docker</artifactId>
       <version>${project.version}</version>
       <type>pom</type>
       <scope>import</scope>
     </dependency>
   </dependencies>
   <build>
     <plugins>
       <!-- Build Docker image: built in pre-integration-test and tagged both as
            "latest" and with the project version. Skipped while docker.build.skip is true. -->
       <plugin>
         <groupId>com.spotify</groupId>
         <artifactId>dockerfile-maven-plugin</artifactId>
         <version>${dockerfile.maven.version}</version>
         <executions>
           <execution>
             <id>tag-latest</id>
             <phase>pre-integration-test</phase>
             <goals>
               <goal>build</goal>
               <goal>tag</goal>
               <!-- <goal>push</goal> -->
             </goals>
             <configuration>
               <skip>${docker.build.skip}</skip>
               <pullNewerImage>false</pullNewerImage>
               <repository>varadarb/hudi-hadoop_${docker.hadoop.version}-hive_${docker.hive.version}-sparkadhoc_${docker.spark.version}</repository>
               <forceTags>true</forceTags>
               <tag>latest</tag>
             </configuration>
           </execution>
           <execution>
             <id>tag-version</id>
             <phase>pre-integration-test</phase>
             <goals>
               <goal>build</goal>
               <goal>tag</goal>
               <!-- <goal>push</goal> -->
             </goals>
             <configuration>
               <skip>${docker.build.skip}</skip>
               <pullNewerImage>false</pullNewerImage>
               <repository>varadarb/hudi-hadoop_${docker.hadoop.version}-hive_${docker.hive.version}-sparkadhoc_${docker.spark.version}</repository>
               <forceTags>true</forceTags>
               <tag>${project.version}</tag>
             </configuration>
           </execution>
         </executions>
       </plugin>
     </plugins>
   </build>
 </project>
--- a/docker/hoodie/hadoop/sparkmaster/Dockerfile
+++ b/docker/hoodie/hadoop/sparkmaster/Dockerfile
@@ -0,0 +1,14 @@
 # Spark master image layered on the shared Spark base image.
 ARG HADOOP_VERSION=2.8.4
 ARG HIVE_VERSION=2.3.3
 ARG SPARK_VERSION=2.3.1
 FROM varadarb/hudi-hadoop_${HADOOP_VERSION}-hive_${HIVE_VERSION}-sparkbase_${SPARK_VERSION}
 # Ports and log directory read by master.sh.
 ENV SPARK_MASTER_PORT=7077 \
     SPARK_MASTER_WEBUI_PORT=8080 \
     SPARK_MASTER_LOG=/opt/spark/logs
 # master.sh is the script run by CMD below.
 COPY master.sh /opt/spark
 EXPOSE ${SPARK_MASTER_WEBUI_PORT} ${SPARK_MASTER_PORT} 6066
 CMD ["/bin/bash", "/opt/spark/master.sh"]
--- a/docker/hoodie/hadoop/sparkmaster/master.sh
+++ b/docker/hoodie/hadoop/sparkmaster/master.sh
@@ -0,0 +1,16 @@
 #!/bin/bash
 # Runs a Spark standalone Master in the foreground, bound to this container's
 # hostname, with its output appended to a log file that is symlinked to stdout.
 export SPARK_MASTER_HOST=$(hostname)
 source "/opt/spark/sbin/spark-config.sh"
 source "/opt/spark/bin/load-spark-env.sh"
 export SPARK_HOME=/opt/spark
 # Point the master's log file at the container's stdout.
 master_out=${SPARK_MASTER_LOG}/spark-master.out
 mkdir -p ${SPARK_MASTER_LOG}
 ln -sf /dev/stdout ${master_out}
 cd /opt/spark/bin && /opt/spark/sbin/../bin/spark-class org.apache.spark.deploy.master.Master \
    --ip ${SPARK_MASTER_HOST} --port ${SPARK_MASTER_PORT} --webui-port ${SPARK_MASTER_WEBUI_PORT} >> ${master_out}
--- a/docker/hoodie/hadoop/sparkmaster/pom.xml
+++ b/docker/hoodie/hadoop/sparkmaster/pom.xml
@@ -0,0 +1,89 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <!--
  ~ Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
  ~
  ~ Licensed under the Apache License, Version 2.0 (the "License");
  ~ you may not use this file except in compliance with the License.
  ~ You may obtain a copy of the License at
  ~
  ~          http://www.apache.org/licenses/LICENSE-2.0
  ~
  ~ Unless required by applicable law or agreed to in writing, software
  ~ distributed under the License is distributed on an "AS IS" BASIS,
  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  ~ See the License for the specific language governing permissions and
  ~ limitations under the License.
  -->
 <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
   <parent>
     <artifactId>hoodie-hadoop-docker</artifactId>
     <groupId>com.uber.hoodie</groupId>
     <version>0.4.5-SNAPSHOT</version>
   </parent>
   <modelVersion>4.0.0</modelVersion>
   <packaging>pom</packaging>
   <artifactId>hoodie-hadoop-sparkmaster-docker</artifactId>
   <!-- Description now names this module's image instead of repeating the base image text. -->
   <description>Spark Master Docker Image with Hoodie</description>
   <properties>
     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
     <checkstyle.skip>true</checkstyle.skip>
   </properties>
   <dependencies>
     <!-- Declares the Spark base image module as a dependency of this module.
          NOTE(review): the "import" scope is documented for dependencyManagement
          only; confirm it is intended in a plain dependencies section. -->
     <dependency>
       <groupId>com.uber.hoodie</groupId>
       <artifactId>hoodie-hadoop-sparkbase-docker</artifactId>
       <version>${project.version}</version>
       <type>pom</type>
       <scope>import</scope>
     </dependency>
   </dependencies>
   <build>
     <plugins>
       <!-- Build Docker image: built in pre-integration-test and tagged both as
            "latest" and with the project version. Skipped while docker.build.skip is true. -->
       <plugin>
         <groupId>com.spotify</groupId>
         <artifactId>dockerfile-maven-plugin</artifactId>
         <version>${dockerfile.maven.version}</version>
         <executions>
           <execution>
             <id>tag-latest</id>
             <phase>pre-integration-test</phase>
             <goals>
               <goal>build</goal>
               <goal>tag</goal>
               <!-- <goal>push</goal> -->
             </goals>
             <configuration>
               <skip>${docker.build.skip}</skip>
               <pullNewerImage>false</pullNewerImage>
               <repository>varadarb/hudi-hadoop_${docker.hadoop.version}-hive_${docker.hive.version}-sparkmaster_${docker.spark.version}</repository>
               <forceTags>true</forceTags>
               <tag>latest</tag>
             </configuration>
           </execution>
           <execution>
             <id>tag-version</id>
             <phase>pre-integration-test</phase>
             <goals>
               <goal>build</goal>
               <goal>tag</goal>
               <!-- <goal>push</goal> -->
             </goals>
             <configuration>
               <skip>${docker.build.skip}</skip>
               <pullNewerImage>false</pullNewerImage>
               <repository>varadarb/hudi-hadoop_${docker.hadoop.version}-hive_${docker.hive.version}-sparkmaster_${docker.spark.version}</repository>
               <forceTags>true</forceTags>
               <tag>${project.version}</tag>
             </configuration>
           </execution>
         </executions>
       </plugin>
     </plugins>
   </build>
 </project>
--- a/docker/hoodie/hadoop/sparkworker/Dockerfile
+++ b/docker/hoodie/hadoop/sparkworker/Dockerfile
@@ -0,0 +1,14 @@
 # Spark worker image layered on the shared Spark base image.
 ARG HADOOP_VERSION=2.8.4
 ARG HIVE_VERSION=2.3.3
 ARG SPARK_VERSION=2.3.1
 FROM varadarb/hudi-hadoop_${HADOOP_VERSION}-hive_${HIVE_VERSION}-sparkbase_${SPARK_VERSION}
 # Web UI port, log directory and master URL read by worker.sh.
 ENV SPARK_WORKER_WEBUI_PORT=8081 \
     SPARK_WORKER_LOG=/spark/logs \
     SPARK_MASTER="spark://spark-master:7077"
 # worker.sh is the script run by CMD below.
 COPY worker.sh /opt/spark
 EXPOSE ${SPARK_WORKER_WEBUI_PORT}
 CMD ["/bin/bash", "/opt/spark/worker.sh"]
--- a/docker/hoodie/hadoop/sparkworker/pom.xml
+++ b/docker/hoodie/hadoop/sparkworker/pom.xml
@@ -0,0 +1,89 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <!--
  ~ Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
  ~
  ~ Licensed under the Apache License, Version 2.0 (the "License");
  ~ you may not use this file except in compliance with the License.
  ~ You may obtain a copy of the License at
  ~
  ~          http://www.apache.org/licenses/LICENSE-2.0
  ~
  ~ Unless required by applicable law or agreed to in writing, software
  ~ distributed under the License is distributed on an "AS IS" BASIS,
  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  ~ See the License for the specific language governing permissions and
  ~ limitations under the License.
  -->
 <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
   <parent>
     <artifactId>hoodie-hadoop-docker</artifactId>
     <groupId>com.uber.hoodie</groupId>
     <version>0.4.5-SNAPSHOT</version>
   </parent>
   <modelVersion>4.0.0</modelVersion>
   <packaging>pom</packaging>
   <artifactId>hoodie-hadoop-sparkworker-docker</artifactId>
   <!-- Description now names this module's image instead of repeating the base image text. -->
   <description>Spark Worker Docker Image with Hoodie</description>
   <properties>
     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
     <checkstyle.skip>true</checkstyle.skip>
   </properties>
   <dependencies>
     <!-- Declares the Spark base image module as a dependency of this module.
          NOTE(review): the "import" scope is documented for dependencyManagement
          only; confirm it is intended in a plain dependencies section. -->
     <dependency>
       <groupId>com.uber.hoodie</groupId>
       <artifactId>hoodie-hadoop-sparkbase-docker</artifactId>
       <version>${project.version}</version>
       <type>pom</type>
       <scope>import</scope>
     </dependency>
   </dependencies>
   <build>
     <plugins>
       <!-- Build Docker image: built in pre-integration-test and tagged both as
            "latest" and with the project version. Skipped while docker.build.skip is true. -->
       <plugin>
         <groupId>com.spotify</groupId>
         <artifactId>dockerfile-maven-plugin</artifactId>
         <version>${dockerfile.maven.version}</version>
         <executions>
           <execution>
             <id>tag-latest</id>
             <phase>pre-integration-test</phase>
             <goals>
               <goal>build</goal>
               <goal>tag</goal>
               <!--<goal>push</goal> -->
             </goals>
             <configuration>
               <skip>${docker.build.skip}</skip>
               <pullNewerImage>false</pullNewerImage>
               <repository>varadarb/hudi-hadoop_${docker.hadoop.version}-hive_${docker.hive.version}-sparkworker_${docker.spark.version}</repository>
               <forceTags>true</forceTags>
               <tag>latest</tag>
             </configuration>
           </execution>
           <execution>
             <id>tag-version</id>
             <phase>pre-integration-test</phase>
             <goals>
               <goal>build</goal>
               <goal>tag</goal>
               <!--<goal>push</goal> -->
             </goals>
             <configuration>
               <skip>${docker.build.skip}</skip>
               <pullNewerImage>false</pullNewerImage>
               <repository>varadarb/hudi-hadoop_${docker.hadoop.version}-hive_${docker.hive.version}-sparkworker_${docker.spark.version}</repository>
               <forceTags>true</forceTags>
               <tag>${project.version}</tag>
             </configuration>
           </execution>
         </executions>
       </plugin>
     </plugins>
   </build>
 </project>
--- a/docker/hoodie/hadoop/sparkworker/worker.sh
+++ b/docker/hoodie/hadoop/sparkworker/worker.sh
@@ -0,0 +1,16 @@
 #!/bin/bash
 . "/spark/sbin/spark-config.sh"
 . "/spark/bin/load-spark-env.sh"
 mkdir -p $SPARK_WORKER_LOG
 export SPARK_HOME=/opt/spark
 ln -sf /dev/stdout $SPARK_WORKER_LOG/spark-worker.out
 date
 echo "SPARK HOME is : $SPARK_HOME"
 /opt/spark/sbin/../bin/spark-class org.apache.spark.deploy.worker.Worker \
    --webui-port $SPARK_WORKER_WEBUI_PORT $SPARK_MASTER >> $SPARK_WORKER_LOG/spark-worker.out
--- a/docker/setup_demo.sh
+++ b/docker/setup_demo.sh
@@ -0,0 +1,16 @@
 #!/bin/bash
 # Recreates the demo cluster from scratch and runs the per-container demo setup.
 #
 # Must be run from the docker/ directory: the compose file path is relative and the
 # workspace root (exported to docker-compose as HUDI_WS) is taken as the parent of $PWD.
 # WARNING: deletes everything under /tmp/hadoop_data and /tmp/hadoop_name.
 compose_yml=compose/docker-compose_hadoop284_hive233_spark231.yml
 # Create host mount directory and copy
 mkdir -p /tmp/hadoop_name
 mkdir -p /tmp/hadoop_data
 # Quoted so a workspace path containing spaces survives word splitting.
 WS_ROOT=$(dirname "$PWD")
 # restart cluster
 HUDI_WS="${WS_ROOT}" docker-compose -f "${compose_yml}" down
 HUDI_WS="${WS_ROOT}" docker-compose -f "${compose_yml}" pull
 # Wipe the host-mounted HDFS state left behind by the previous cluster.
 rm -rf /tmp/hadoop_data/*
 rm -rf /tmp/hadoop_name/*
 sleep 5
 HUDI_WS="${WS_ROOT}" docker-compose -f "${compose_yml}" up -d
 # Fixed pause before running commands inside the freshly started containers.
 sleep 15
 docker exec -it adhoc-1 /bin/bash /var/hoodie/ws/docker/demo/setup_demo_container.sh
 docker exec -it adhoc-2 /bin/bash /var/hoodie/ws/docker/demo/setup_demo_container.sh
--- a/docs/quickstart.md
+++ b/docs/quickstart.md
@@ -14,11 +14,11 @@ Check out code and pull it into Intellij as a normal maven project.
 Normally build the maven project, from command line
 ```
-$ mvn clean install -DskipTests
+$ mvn clean install -DskipTests -DskipITs
 To work with older version of Hive (pre Hive-1.2.1), use
-$ mvn clean install -DskipTests -Dhive11
+$ mvn clean install -DskipTests -DskipITs -Dhive11
 ```
@@ -293,6 +293,947 @@ hive>
 {% include note.html content="This is only supported for Read-optimized tables for now." %}
 ## A Demo using docker containers
 Lets use a real world example to see how hudi works end to end. For this purpose, a self contained
 data infrastructure is brought up in a local docker cluster within your computer. 
 The steps assume you are using Mac laptop
 ### Prerequisites
  * Docker Setup : For Mac, please follow the steps as defined in <https://docs.docker.com/v17.12/docker-for-mac/install/>. For running Spark-SQL queries, please ensure at least 6 GB and 4 CPUs are allocated to Docker (see Docker -> Preferences -> Advanced). Otherwise, Spark-SQL queries could be killed because of memory issues.
  * kafkacat : A command-line utility to publish/consume from kafka topics. Use `brew install kafkacat` to install kafkacat
  * /etc/hosts : The demo references many services running in container by the hostname. Add the following settings to /etc/hosts
  ```
   127.0.0.1 adhoc-1
   127.0.0.1 adhoc-2
   127.0.0.1 namenode
   127.0.0.1 datanode1
   127.0.0.1 hiveserver
   127.0.0.1 hivemetastore
   127.0.0.1 kafkabroker
   127.0.0.1 sparkmaster
   127.0.0.1 zookeeper
  ```
 ### Setting up Docker Cluster
 #### Build Hoodie
 The first step is to build hoodie
 ```
 cd <HUDI_WORKSPACE>
 mvn package -DskipTests
 ```
 #### Bringing up Demo Cluster
 The next step is to run the docker compose script and setup configs for bringing up the cluster.
 This should pull the docker images from docker hub and setup docker cluster.
 ```
 cd docker
 ./setup_demo.sh
 ....
 ....
 ....
 Stopping spark-worker-1            ... done
 Stopping hiveserver                ... done
 Stopping hivemetastore             ... done
 Stopping historyserver             ... done
 .......
 ......
 Creating network "hudi_demo" with the default driver
 Creating hive-metastore-postgresql ... done
 Creating namenode                  ... done
 Creating zookeeper                 ... done
 Creating kafkabroker               ... done
 Creating hivemetastore             ... done
 Creating historyserver             ... done
 Creating hiveserver                ... done
 Creating datanode1                 ... done
 Creating sparkmaster               ... done
 Creating adhoc-1                   ... done
 Creating adhoc-2                   ... done
 Creating spark-worker-1            ... done
 Copying spark default config and setting up configs
 Copying spark default config and setting up configs
 Copying spark default config and setting up configs
 varadarb-C02SG7Q3G8WP:docker varadarb$ docker ps
 ```
 At this point, the docker cluster will be up and running. The demo cluster brings up the following services
   * HDFS Services (NameNode, DataNode)
   * Spark Master and Worker
   * Hive Services (Metastore, HiveServer2 along with PostgresDB)
   * Kafka Broker and a Zookeeper Node (Kafka will be used as upstream source for the demo)
   * Adhoc containers to run Hudi/Hive CLI commands
 ### Demo
 Stock Tracker data will be used to showcase both different Hudi Views and the effects of Compaction. 
 Take a look at the directory `docker/demo/data`. There are 2 batches of stock data - each at 1 minute granularity. 
 The first batch contains stock tracker data for some stock symbols during the first hour of trading window 
 (9:30 a.m to 10:30 a.m). The second batch contains tracker data for next 30 mins (10:30 - 11 a.m). Hudi will
 be used to ingest these batches to a dataset which will contain the latest stock tracker data at hour level granularity.
 The batches are windowed intentionally so that the second batch contains updates to some of the rows in the first batch.
 #### Step 1 : Publish the first batch to Kafka
 Upload the first batch to the Kafka topic `stock_ticks`
 ```
 cat docker/demo/data/batch_1.json | kafkacat -b kafkabroker -t stock_ticks -P 
 To check if the new topic shows up, use
 kafkacat -b kafkabroker -L -J | jq .
 {
  "originating_broker": {
    "id": 1001,
    "name": "kafkabroker:9092/1001"
  },
  "query": {
    "topic": "*"
  },
  "brokers": [
    {
      "id": 1001,
      "name": "kafkabroker:9092"
    }
  ],
  "topics": [
    {
      "topic": "stock_ticks",
      "partitions": [
        {
          "partition": 0,
          "leader": 1001,
          "replicas": [
            {
              "id": 1001
            }
          ],
          "isrs": [
            {
              "id": 1001
            }
          ]
        }
      ]
    }
  ]
 }
 ```
 #### Step 2: Incrementally ingest data from Kafka topic
 Hudi comes with a tool named DeltaStreamer. This tool can connect to variety of data sources (including Kafka) to
 pull changes and apply to Hudi dataset using upsert/insert primitives. Here, we will use the tool to download
 json data from kafka topic and ingest to both COW and MOR tables we initialized in the previous step. This tool 
 automatically initializes the datasets in the file-system if they do not exist yet.
 ```
 docker exec -it adhoc-2 /bin/bash
 # Run the following spark-submit command to execute the delta-streamer and ingest to stock_ticks_cow dataset in HDFS
 spark-submit --class com.uber.hoodie.utilities.deltastreamer.HoodieDeltaStreamer $HUDI_UTILITIES_BUNDLE --storage-type COPY_ON_WRITE --source-class com.uber.hoodie.utilities.sources.JsonKafkaSource --source-ordering-field ts  --target-base-path /user/hive/warehouse/stock_ticks_cow --target-table stock_ticks_cow --props /var/demo/config/kafka-source.properties
 ....
 ....
 2018-09-24 22:20:00 INFO  OutputCommitCoordinator$OutputCommitCoordinatorEndpoint:54 - OutputCommitCoordinator stopped!
 2018-09-24 22:20:00 INFO  SparkContext:54 - Successfully stopped SparkContext
 # Run the following spark-submit command to execute the delta-streamer and ingest to stock_ticks_mor dataset in HDFS
 spark-submit --class com.uber.hoodie.utilities.deltastreamer.HoodieDeltaStreamer $HUDI_UTILITIES_BUNDLE --storage-type MERGE_ON_READ --source-class com.uber.hoodie.utilities.sources.JsonKafkaSource --source-ordering-field ts  --target-base-path /user/hive/warehouse/stock_ticks_mor --target-table stock_ticks_mor --props /var/demo/config/kafka-source.properties
 ....
 2018-09-24 22:22:01 INFO  OutputCommitCoordinator$OutputCommitCoordinatorEndpoint:54 - OutputCommitCoordinator stopped!
 2018-09-24 22:22:01 INFO  SparkContext:54 - Successfully stopped SparkContext
 ....
 # As part of the setup (look at setup_demo.sh), the configs needed for DeltaStreamer are uploaded to HDFS. The configs
 # contain mostly Kafka connectivity settings, the avro-schema to be used for ingesting along with key and partitioning fields.
 exit
 ```
 You can use HDFS web-browser to look at the datasets 
 `http://namenode:50070/explorer.html#/user/hive/warehouse/stock_ticks_cow`. 
 You can explore the new partition folder created in the dataset along with a "deltacommit"
 file under .hoodie which signals a successful commit.
 There will be a similar setup when you browse the MOR dataset
 `http://namenode:50070/explorer.html#/user/hive/warehouse/stock_ticks_mor`
 #### Step 3: Sync with Hive
 At this step, the datasets are available in HDFS. We need to sync with Hive to create new Hive tables and add partitions
 in order to run Hive queries against those datasets.
 ```
 docker exec -it adhoc-2 /bin/bash
 # This command takes in the HiveServer URL and the COW Hudi dataset location in HDFS and syncs the HDFS state to Hive
 /var/hoodie/ws/hoodie-hive/run_sync_tool.sh  --jdbc-url jdbc:hive2://hiveserver:10000 --user hive --pass hive --partitioned-by dt --base-path /user/hive/warehouse/stock_ticks_cow --database default --table stock_ticks_cow
 .....
 2018-09-24 22:22:45,568 INFO  [main] hive.HiveSyncTool (HiveSyncTool.java:syncHoodieTable(112)) - Sync complete for stock_ticks_cow
 .....
 # Now run hive-sync for the second data-set in HDFS using Merge-On-Read (MOR storage)
 /var/hoodie/ws/hoodie-hive/run_sync_tool.sh  --jdbc-url jdbc:hive2://hiveserver:10000 --user hive --pass hive --partitioned-by dt --base-path /user/hive/warehouse/stock_ticks_mor --database default --table stock_ticks_mor
 ...
 2018-09-24 22:23:09,171 INFO  [main] hive.HiveSyncTool (HiveSyncTool.java:syncHoodieTable(112)) - Sync complete for stock_ticks_mor
 ...
 2018-09-24 22:23:09,559 INFO  [main] hive.HiveSyncTool (HiveSyncTool.java:syncHoodieTable(112)) - Sync complete for stock_ticks_mor_rt
 ....
 exit
 ```
 After executing the above command, you will notice 
 1. A hive table named `stock_ticks_cow` created which provides Read-Optimized view for the Copy On Write dataset.
 2. Two new tables `stock_ticks_mor` and `stock_ticks_mor_rt` created for the Merge On Read dataset. The former
 provides the Read-Optimized view for the Hudi dataset and the latter provides the realtime-view for the dataset.
 #### Step 4 (a): Run Hive Queries
 Run a hive query to find the latest timestamp ingested for stock symbol 'GOOG'. You will notice that both read-optimized
 (for both COW and MOR dataset) and realtime views (for MOR dataset) give the same value "10:29 a.m" as Hudi creates a
 parquet file for the first batch of data.
 ```
 docker exec -it adhoc-2 /bin/bash
 beeline -u jdbc:hive2://hiveserver:10000 --hiveconf hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat --hiveconf hive.stats.autogather=false
 # List Tables
 0: jdbc:hive2://hiveserver:10000> show tables;
 +---------------------+--+
 |      tab_name       |
 +---------------------+--+
 | stock_ticks_cow     |
 | stock_ticks_mor     |
 | stock_ticks_mor_rt  |
 +---------------------+--+
 3 rows selected (0.801 seconds)
 0: jdbc:hive2://hiveserver:10000>
 # Look at partitions that were added
 0: jdbc:hive2://hiveserver:10000> show partitions stock_ticks_mor_rt;
 +----------------+--+
 |   partition    |
 +----------------+--+
 | dt=2018-08-31  |
 +----------------+--+
 1 row selected (0.24 seconds)
 # COPY-ON-WRITE Queries:
 =========================
 0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) from stock_ticks_cow group by symbol HAVING symbol = 'GOOG';
 +---------+----------------------+--+
 | symbol  |         _c1          |
 +---------+----------------------+--+
 | GOOG    | 2018-08-31 10:29:00  |
 +---------+----------------------+--+
 Now, run a projection query:
 0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close  from stock_ticks_cow where  symbol = 'GOOG';
 +----------------------+---------+----------------------+---------+------------+-----------+--+
 | _hoodie_commit_time  | symbol  |          ts          | volume  |    open    |   close   |
 +----------------------+---------+----------------------+---------+------------+-----------+--+
 | 20180924221953       | GOOG    | 2018-08-31 09:59:00  | 6330    | 1230.5     | 1230.02   |
 | 20180924221953       | GOOG    | 2018-08-31 10:29:00  | 3391    | 1230.1899  | 1230.085  |
 +----------------------+---------+----------------------+---------+------------+-----------+--+
 # Merge-On-Read Queries:
 ==========================
 Let's run similar queries against M-O-R dataset. Let's look at both
 ReadOptimized and Realtime views supported by M-O-R dataset.
 # Run against ReadOptimized View. Notice that the latest timestamp is 10:29
 0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) from stock_ticks_mor group by symbol HAVING symbol = 'GOOG';
 WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases.
 +---------+----------------------+--+
 | symbol  |         _c1          |
 +---------+----------------------+--+
 | GOOG    | 2018-08-31 10:29:00  |
 +---------+----------------------+--+
 1 row selected (6.326 seconds)
 # Run against Realtime View. Notice that the latest timestamp is again 10:29
 0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) from stock_ticks_mor_rt group by symbol HAVING symbol = 'GOOG';
 WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases.
 +---------+----------------------+--+
 | symbol  |         _c1          |
 +---------+----------------------+--+
 | GOOG    | 2018-08-31 10:29:00  |
 +---------+----------------------+--+
 1 row selected (1.606 seconds)
 # Run projection query against Read Optimized and Realtime tables
 0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close  from stock_ticks_mor where  symbol = 'GOOG';
 +----------------------+---------+----------------------+---------+------------+-----------+--+
 | _hoodie_commit_time  | symbol  |          ts          | volume  |    open    |   close   |
 +----------------------+---------+----------------------+---------+------------+-----------+--+
 | 20180924222155       | GOOG    | 2018-08-31 09:59:00  | 6330    | 1230.5     | 1230.02   |
 | 20180924222155       | GOOG    | 2018-08-31 10:29:00  | 3391    | 1230.1899  | 1230.085  |
 +----------------------+---------+----------------------+---------+------------+-----------+--+
 0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close  from stock_ticks_mor_rt where  symbol = 'GOOG';
 +----------------------+---------+----------------------+---------+------------+-----------+--+
 | _hoodie_commit_time  | symbol  |          ts          | volume  |    open    |   close   |
 +----------------------+---------+----------------------+---------+------------+-----------+--+
 | 20180924222155       | GOOG    | 2018-08-31 09:59:00  | 6330    | 1230.5     | 1230.02   |
 | 20180924222155       | GOOG    | 2018-08-31 10:29:00  | 3391    | 1230.1899  | 1230.085  |
 +----------------------+---------+----------------------+---------+------------+-----------+--+
 exit
 exit
 ```
 #### Step 4 (b): Run Spark-SQL Queries
 Hudi supports Spark as a query processor just like Hive. Here are the same hive queries
 running in spark-sql.
 ```
 docker exec -it adhoc-1 /bin/bash
 $SPARK_INSTALL/bin/spark-shell --jars $HUDI_SPARK_BUNDLE --master local[2] --driver-class-path $HADOOP_CONF_DIR --conf spark.sql.hive.convertMetastoreParquet=false --deploy-mode client  --driver-memory 1G --executor-memory 3G --num-executors 1  --packages com.databricks:spark-avro_2.11:4.0.0
 ...
 Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version 2.3.1
      /_/
 Using Scala version 2.11.8 (Java HotSpot(TM) 64-Bit Server VM, Java 1.8.0_181)
 Type in expressions to have them evaluated.
 Type :help for more information.
 scala>
 scala> spark.sql("show tables").show(100, false)
 +--------+------------------+-----------+
 |database|tableName         |isTemporary|
 +--------+------------------+-----------+
 |default |stock_ticks_cow   |false      |
 |default |stock_ticks_mor   |false      |
 |default |stock_ticks_mor_rt|false      |
 +--------+------------------+-----------+
 # Copy-On-Write Table
 ## Run max timestamp query against COW table
 scala> spark.sql("select symbol, max(ts) from stock_ticks_cow group by symbol HAVING symbol = 'GOOG'").show(100, false)
 [Stage 0:>                                                          (0 + 1) / 1]SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
 SLF4J: Defaulting to no-operation (NOP) logger implementation
 SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.
 +------+-------------------+
 |symbol|max(ts)            |
 +------+-------------------+
 |GOOG  |2018-08-31 10:29:00|
 +------+-------------------+
 ## Projection Query
 scala> spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close  from stock_ticks_cow where  symbol = 'GOOG'").show(100, false)
 +-------------------+------+-------------------+------+---------+--------+
 |_hoodie_commit_time|symbol|ts                 |volume|open     |close   |
 +-------------------+------+-------------------+------+---------+--------+
 |20180924221953     |GOOG  |2018-08-31 09:59:00|6330  |1230.5   |1230.02 |
 |20180924221953     |GOOG  |2018-08-31 10:29:00|3391  |1230.1899|1230.085|
 +-------------------+------+-------------------+------+---------+--------+
 # Merge-On-Read Queries:
 ==========================
 Let's run similar queries against M-O-R dataset. Let's look at both
 ReadOptimized and Realtime views supported by M-O-R dataset.
 # Run against ReadOptimized View. Notice that the latest timestamp is 10:29
 scala> spark.sql("select symbol, max(ts) from stock_ticks_mor group by symbol HAVING symbol = 'GOOG'").show(100, false)
 +------+-------------------+
 |symbol|max(ts)            |
 +------+-------------------+
 |GOOG  |2018-08-31 10:29:00|
 +------+-------------------+
 # Run against Realtime View. Notice that the latest timestamp is again 10:29
 scala> spark.sql("select symbol, max(ts) from stock_ticks_mor_rt group by symbol HAVING symbol = 'GOOG'").show(100, false)
 +------+-------------------+
 |symbol|max(ts)            |
 +------+-------------------+
 |GOOG  |2018-08-31 10:29:00|
 +------+-------------------+
 # Run projection query against Read Optimized and Realtime tables
 scala> spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close  from stock_ticks_mor where  symbol = 'GOOG'").show(100, false)
 +-------------------+------+-------------------+------+---------+--------+
 |_hoodie_commit_time|symbol|ts                 |volume|open     |close   |
 +-------------------+------+-------------------+------+---------+--------+
 |20180924222155     |GOOG  |2018-08-31 09:59:00|6330  |1230.5   |1230.02 |
 |20180924222155     |GOOG  |2018-08-31 10:29:00|3391  |1230.1899|1230.085|
 +-------------------+------+-------------------+------+---------+--------+
 scala> spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close  from stock_ticks_mor_rt where  symbol = 'GOOG'").show(100, false)
 +-------------------+------+-------------------+------+---------+--------+
 |_hoodie_commit_time|symbol|ts                 |volume|open     |close   |
 +-------------------+------+-------------------+------+---------+--------+
 |20180924222155     |GOOG  |2018-08-31 09:59:00|6330  |1230.5   |1230.02 |
 |20180924222155     |GOOG  |2018-08-31 10:29:00|3391  |1230.1899|1230.085|
 +-------------------+------+-------------------+------+---------+--------+
 ```
 #### Step 5: Upload second batch to Kafka and run DeltaStreamer to ingest
 Upload the second batch of data and ingest this batch using delta-streamer. As this batch does not bring in any new
 partitions, there is no need to run hive-sync.
 ```
 cat docker/demo/data/batch_2.json | kafkacat -b kafkabroker -t stock_ticks -P 
 # Within Docker container, run the ingestion command
 docker exec -it adhoc-2 /bin/bash
 # Run the following spark-submit command to execute the delta-streamer and ingest to stock_ticks_cow dataset in HDFS
 spark-submit --class com.uber.hoodie.utilities.deltastreamer.HoodieDeltaStreamer $HUDI_UTILITIES_BUNDLE --storage-type COPY_ON_WRITE --source-class com.uber.hoodie.utilities.sources.JsonKafkaSource --source-ordering-field ts  --target-base-path /user/hive/warehouse/stock_ticks_cow --target-table stock_ticks_cow --props /var/demo/config/kafka-source.properties
 # Run the following spark-submit command to execute the delta-streamer and ingest to stock_ticks_mor dataset in HDFS
 spark-submit --class com.uber.hoodie.utilities.deltastreamer.HoodieDeltaStreamer $HUDI_UTILITIES_BUNDLE --storage-type MERGE_ON_READ --source-class com.uber.hoodie.utilities.sources.JsonKafkaSource --source-ordering-field ts  --target-base-path /user/hive/warehouse/stock_ticks_mor --target-table stock_ticks_mor --props /var/demo/config/kafka-source.properties
 exit
 ```
 With Copy-On-Write table, the second ingestion by DeltaStreamer resulted in a new version of Parquet file getting created.
 See `http://namenode:50070/explorer.html#/user/hive/warehouse/stock_ticks_cow/2018/08/31`
 With Merge-On-Read table, the second ingestion merely appended the batch to an unmerged delta (log) file. 
 Take a look at the HDFS filesystem to get an idea: `http://namenode:50070/explorer.html#/user/hive/warehouse/stock_ticks_mor/2018/08/31`
 #### Step 6(a): Run Hive Queries
 With Copy-On-Write table, the read-optimized view immediately sees the changes as part of second batch once the batch 
 got committed as each ingestion creates newer versions of parquet files. 
 With Merge-On-Read table, the second ingestion merely appended the batch to an unmerged delta (log) file. 
 This is the time, when ReadOptimized and Realtime views will provide different results. ReadOptimized view will still
 return "10:29 am" as it will only read from the Parquet file. Realtime View will do on-the-fly merge and return
 latest committed data which is "10:59 a.m".
 ```
 docker exec -it adhoc-2 /bin/bash
 beeline -u jdbc:hive2://hiveserver:10000 --hiveconf hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat --hiveconf hive.stats.autogather=false
 # Copy On Write Table:
 0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) from stock_ticks_cow group by symbol HAVING symbol = 'GOOG';
 WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases.
 +---------+----------------------+--+
 | symbol  |         _c1          |
 +---------+----------------------+--+
 | GOOG    | 2018-08-31 10:59:00  |
 +---------+----------------------+--+
 1 row selected (1.932 seconds)
 0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close  from stock_ticks_cow where  symbol = 'GOOG';
 +----------------------+---------+----------------------+---------+------------+-----------+--+
 | _hoodie_commit_time  | symbol  |          ts          | volume  |    open    |   close   |
 +----------------------+---------+----------------------+---------+------------+-----------+--+
 | 20180924221953       | GOOG    | 2018-08-31 09:59:00  | 6330    | 1230.5     | 1230.02   |
 | 20180924224524       | GOOG    | 2018-08-31 10:59:00  | 9021    | 1227.1993  | 1227.215  |
 +----------------------+---------+----------------------+---------+------------+-----------+--+
 As you can notice, the above queries now reflect the changes that came as part of ingesting second batch.
 # Merge On Read Table: 
 # Read Optimized View
 0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) from stock_ticks_mor group by symbol HAVING symbol = 'GOOG';
 WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases.
 +---------+----------------------+--+
 | symbol  |         _c1          |
 +---------+----------------------+--+
 | GOOG    | 2018-08-31 10:29:00  |
 +---------+----------------------+--+
 1 row selected (1.6 seconds)
 0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close  from stock_ticks_mor where  symbol = 'GOOG';
 +----------------------+---------+----------------------+---------+------------+-----------+--+
 | _hoodie_commit_time  | symbol  |          ts          | volume  |    open    |   close   |
 +----------------------+---------+----------------------+---------+------------+-----------+--+
 | 20180924222155       | GOOG    | 2018-08-31 09:59:00  | 6330    | 1230.5     | 1230.02   |
 | 20180924222155       | GOOG    | 2018-08-31 10:29:00  | 3391    | 1230.1899  | 1230.085  |
 +----------------------+---------+----------------------+---------+------------+-----------+--+
 # Realtime View
 0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) from stock_ticks_mor_rt group by symbol HAVING symbol = 'GOOG';
 WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases.
 +---------+----------------------+--+
 | symbol  |         _c1          |
 +---------+----------------------+--+
 | GOOG    | 2018-08-31 10:59:00  |
 +---------+----------------------+--+
 0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close  from stock_ticks_mor_rt where  symbol = 'GOOG';
 +----------------------+---------+----------------------+---------+------------+-----------+--+
 | _hoodie_commit_time  | symbol  |          ts          | volume  |    open    |   close   |
 +----------------------+---------+----------------------+---------+------------+-----------+--+
 | 20180924222155       | GOOG    | 2018-08-31 09:59:00  | 6330    | 1230.5     | 1230.02   |
 | 20180924224537       | GOOG    | 2018-08-31 10:59:00  | 9021    | 1227.1993  | 1227.215  |
 +----------------------+---------+----------------------+---------+------------+-----------+--+
 exit
 exit
 ```
 #### Step 6(b): Run Spark SQL Queries
 Running the same queries in Spark-SQL:
 ```
 docker exec -it adhoc-1 /bin/bash
 bash-4.4# $SPARK_INSTALL/bin/spark-shell --jars $HUDI_SPARK_BUNDLE --driver-class-path $HADOOP_CONF_DIR --conf spark.sql.hive.convertMetastoreParquet=false --deploy-mode client  --driver-memory 1G --master local[2] --executor-memory 3G --num-executors 1  --packages com.databricks:spark-avro_2.11:4.0.0
 # Copy On Write Table:
 scala> spark.sql("select symbol, max(ts) from stock_ticks_cow group by symbol HAVING symbol = 'GOOG'").show(100, false)
 +------+-------------------+
 |symbol|max(ts)            |
 +------+-------------------+
 |GOOG  |2018-08-31 10:59:00|
 +------+-------------------+
 scala> spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close  from stock_ticks_cow where  symbol = 'GOOG'").show(100, false)
 +----------------------+---------+----------------------+---------+------------+-----------+--+
 | _hoodie_commit_time  | symbol  |          ts          | volume  |    open    |   close   |
 +----------------------+---------+----------------------+---------+------------+-----------+--+
 | 20180924221953       | GOOG    | 2018-08-31 09:59:00  | 6330    | 1230.5     | 1230.02   |
 | 20180924224524       | GOOG    | 2018-08-31 10:59:00  | 9021    | 1227.1993  | 1227.215  |
 +----------------------+---------+----------------------+---------+------------+-----------+--+
 As you can notice, the above queries now reflect the changes that came as part of ingesting second batch.
 # Merge On Read Table: 
 # Read Optimized View
 scala> spark.sql("select symbol, max(ts) from stock_ticks_mor group by symbol HAVING symbol = 'GOOG'").show(100, false)
 +---------+----------------------+--+
 | symbol  |         _c1          |
 +---------+----------------------+--+
 | GOOG    | 2018-08-31 10:29:00  |
 +---------+----------------------+--+
 1 row selected (1.6 seconds)
 scala> spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close  from stock_ticks_mor where  symbol = 'GOOG'").show(100, false)
 +----------------------+---------+----------------------+---------+------------+-----------+--+
 | _hoodie_commit_time  | symbol  |          ts          | volume  |    open    |   close   |
 +----------------------+---------+----------------------+---------+------------+-----------+--+
 | 20180924222155       | GOOG    | 2018-08-31 09:59:00  | 6330    | 1230.5     | 1230.02   |
 | 20180924222155       | GOOG    | 2018-08-31 10:29:00  | 3391    | 1230.1899  | 1230.085  |
 +----------------------+---------+----------------------+---------+------------+-----------+--+
 # Realtime View
 scala> spark.sql("select symbol, max(ts) from stock_ticks_mor_rt group by symbol HAVING symbol = 'GOOG'").show(100, false)
 +---------+----------------------+--+
 | symbol  |         _c1          |
 +---------+----------------------+--+
 | GOOG    | 2018-08-31 10:59:00  |
 +---------+----------------------+--+
 scala> spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close  from stock_ticks_mor_rt where  symbol = 'GOOG'").show(100, false)
 +----------------------+---------+----------------------+---------+------------+-----------+--+
 | _hoodie_commit_time  | symbol  |          ts          | volume  |    open    |   close   |
 +----------------------+---------+----------------------+---------+------------+-----------+--+
 | 20180924222155       | GOOG    | 2018-08-31 09:59:00  | 6330    | 1230.5     | 1230.02   |
 | 20180924224537       | GOOG    | 2018-08-31 10:59:00  | 9021    | 1227.1993  | 1227.215  |
 +----------------------+---------+----------------------+---------+------------+-----------+--+
 exit
 exit
 ```
 #### Step 7 : Incremental Query for COPY-ON-WRITE Table
 With 2 batches of data ingested, let's showcase the support for incremental queries in Hudi Copy-On-Write datasets.
 Let's take the same projection query example:
 ```
 docker exec -it adhoc-2 /bin/bash
 beeline -u jdbc:hive2://hiveserver:10000 --hiveconf hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat --hiveconf hive.stats.autogather=false
 0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close  from stock_ticks_cow where  symbol = 'GOOG';
 +----------------------+---------+----------------------+---------+------------+-----------+--+
 | _hoodie_commit_time  | symbol  |          ts          | volume  |    open    |   close   |
 +----------------------+---------+----------------------+---------+------------+-----------+--+
 | 20180924064621       | GOOG    | 2018-08-31 09:59:00  | 6330    | 1230.5     | 1230.02   |
 | 20180924065039       | GOOG    | 2018-08-31 10:59:00  | 9021    | 1227.1993  | 1227.215  |
 +----------------------+---------+----------------------+---------+------------+-----------+--+
 ```
 As you notice from the above queries, there are 2 commits - 20180924064621 and 20180924065039 in timeline order.
 When you follow the steps, you will be getting different timestamps for commits. Substitute them
 in place of the above timestamps.
 To show the effects of incremental-query, let us assume that a reader has already seen the changes as part of
 ingesting first batch. Now, for the reader to see the effect of the second batch, he/she has to set the start timestamp to
 the commit time of the first batch (20180924064621) and run an incremental query.
 `Hudi incremental mode` provides efficient scanning for incremental queries by filtering out files that do not have any 
 candidate rows using hudi-managed metadata. 
 ```
 docker exec -it adhoc-2 /bin/bash
 beeline -u jdbc:hive2://hiveserver:10000 --hiveconf hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat --hiveconf hive.stats.autogather=false
 0: jdbc:hive2://hiveserver:10000> set hoodie.stock_ticks_cow.consume.mode=INCREMENTAL;
 No rows affected (0.009 seconds)
 0: jdbc:hive2://hiveserver:10000>  set hoodie.stock_ticks_cow.consume.max.commits=3;
 No rows affected (0.009 seconds)
 0: jdbc:hive2://hiveserver:10000> set hoodie.stock_ticks_cow.consume.start.timestamp=20180924064621;
 # With the above setting, file-ids that do not have any updates from the commit 20180924065039 are filtered out without scanning.
 # Here is the incremental query :
 0: jdbc:hive2://hiveserver:10000>
 0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close  from stock_ticks_cow where  symbol = 'GOOG' and `_hoodie_commit_time` > '20180924064621';
 +----------------------+---------+----------------------+---------+------------+-----------+--+
 | _hoodie_commit_time  | symbol  |          ts          | volume  |    open    |   close   |
 +----------------------+---------+----------------------+---------+------------+-----------+--+
 | 20180924065039       | GOOG    | 2018-08-31 10:59:00  | 9021    | 1227.1993  | 1227.215  |
 +----------------------+---------+----------------------+---------+------------+-----------+--+
 1 row selected (0.83 seconds)
 0: jdbc:hive2://hiveserver:10000>
 ```
 ##### Incremental Query with Spark SQL:
 ```
 docker exec -it adhoc-1 /bin/bash
 bash-4.4# $SPARK_INSTALL/bin/spark-shell --jars $HUDI_SPARK_BUNDLE --driver-class-path $HADOOP_CONF_DIR --conf spark.sql.hive.convertMetastoreParquet=false --deploy-mode client  --driver-memory 1G --master local[2] --executor-memory 3G --num-executors 1  --packages com.databricks:spark-avro_2.11:4.0.0
 Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version 2.3.1
      /_/
 Using Scala version 2.11.8 (Java HotSpot(TM) 64-Bit Server VM, Java 1.8.0_181)
 Type in expressions to have them evaluated.
 Type :help for more information.
 scala> import com.uber.hoodie.DataSourceReadOptions
 import com.uber.hoodie.DataSourceReadOptions
 # In the below query, 20180924064621 is the first commit's timestamp
 scala> val hoodieIncViewDF =  spark.read.format("com.uber.hoodie").option(DataSourceReadOptions.VIEW_TYPE_OPT_KEY, DataSourceReadOptions.VIEW_TYPE_INCREMENTAL_OPT_VAL).option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY, "20180924064621").load("/user/hive/warehouse/stock_ticks_cow")
 SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
 SLF4J: Defaulting to no-operation (NOP) logger implementation
 SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.
 hoodieIncViewDF: org.apache.spark.sql.DataFrame = [_hoodie_commit_time: string, _hoodie_commit_seqno: string ... 15 more fields]
 scala> hoodieIncViewDF.registerTempTable("stock_ticks_cow_incr_tmp1")
 warning: there was one deprecation warning; re-run with -deprecation for details
 scala> spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close  from stock_ticks_cow_incr_tmp1 where  symbol = 'GOOG'").show(100, false);
 +----------------------+---------+----------------------+---------+------------+-----------+--+
 | _hoodie_commit_time  | symbol  |          ts          | volume  |    open    |   close   |
 +----------------------+---------+----------------------+---------+------------+-----------+--+
 | 20180924065039       | GOOG    | 2018-08-31 10:59:00  | 9021    | 1227.1993  | 1227.215  |
 +----------------------+---------+----------------------+---------+------------+-----------+--+
 ```
 #### Step 8: Schedule and Run Compaction for Merge-On-Read dataset
 Let's schedule and run a compaction to create a new version of columnar file so that read-optimized readers will see fresher data.
 Again, you can use Hudi CLI to manually schedule and run compaction.
 ```
 docker exec -it adhoc-1 /bin/bash
 root@adhoc-1:/opt#   /var/hoodie/ws/hoodie-cli/hoodie-cli.sh
 ============================================
 *                                          *
 *     _    _                 _ _           *
 *    | |  | |               | (_)          *
 *    | |__| | ___   ___   __| |_  ___      *
 *    |  __  |/ _ \ / _ \ / _` | |/ _ \     *
 *    | |  | | (_) | (_) | (_| | |  __/     *
 *    |_|  |_|\___/ \___/ \__,_|_|\___|     *
 *                                          *
 ============================================
 Welcome to Hoodie CLI. Please type help if you are looking for help.
 hoodie->connect --path /user/hive/warehouse/stock_ticks_mor
 18/09/24 06:59:34 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
 18/09/24 06:59:35 INFO table.HoodieTableMetaClient: Loading HoodieTableMetaClient from /user/hive/warehouse/stock_ticks_mor
 18/09/24 06:59:35 INFO util.FSUtils: Hadoop Configuration: fs.defaultFS: [hdfs://namenode:8020], Config:[Configuration: core-default.xml, core-site.xml, mapred-default.xml, mapred-site.xml, yarn-default.xml, yarn-site.xml, hdfs-default.xml, hdfs-site.xml], FileSystem: [DFS[DFSClient[clientName=DFSClient_NONMAPREDUCE_-1261652683_11, ugi=root (auth:SIMPLE)]]]
 18/09/24 06:59:35 INFO table.HoodieTableConfig: Loading dataset properties from /user/hive/warehouse/stock_ticks_mor/.hoodie/hoodie.properties
 18/09/24 06:59:36 INFO table.HoodieTableMetaClient: Finished Loading Table of type MERGE_ON_READ from /user/hive/warehouse/stock_ticks_mor
 Metadata for table stock_ticks_mor loaded
 # Ensure no compactions are present
 hoodie:stock_ticks_mor->compactions show all
 18/09/24 06:59:54 INFO timeline.HoodieActiveTimeline: Loaded instants [[20180924064636__clean__COMPLETED], [20180924064636__deltacommit__COMPLETED], [20180924065057__clean__COMPLETED], [20180924065057__deltacommit__COMPLETED]]
    ___________________________________________________________________
    | Compaction Instant Time| State    | Total FileIds to be Compacted|
    |==================================================================|
 # Schedule a compaction. This will use Spark Launcher to schedule compaction
 hoodie:stock_ticks_mor->compaction schedule
 ....
 Compaction successfully completed for 20180924070031
 # Now refresh and check again. You will see that there is a new compaction requested
 hoodie:stock_ticks_mor->connect --path /user/hive/warehouse/stock_ticks_mor
 18/09/24 07:01:16 INFO table.HoodieTableMetaClient: Loading HoodieTableMetaClient from /user/hive/warehouse/stock_ticks_mor
 18/09/24 07:01:16 INFO util.FSUtils: Hadoop Configuration: fs.defaultFS: [hdfs://namenode:8020], Config:[Configuration: core-default.xml, core-site.xml, mapred-default.xml, mapred-site.xml, yarn-default.xml, yarn-site.xml, hdfs-default.xml, hdfs-site.xml], FileSystem: [DFS[DFSClient[clientName=DFSClient_NONMAPREDUCE_-1261652683_11, ugi=root (auth:SIMPLE)]]]
 18/09/24 07:01:16 INFO table.HoodieTableConfig: Loading dataset properties from /user/hive/warehouse/stock_ticks_mor/.hoodie/hoodie.properties
 18/09/24 07:01:16 INFO table.HoodieTableMetaClient: Finished Loading Table of type MERGE_ON_READ from /user/hive/warehouse/stock_ticks_mor
 Metadata for table stock_ticks_mor loaded
 hoodie:stock_ticks_mor->compactions show all
 18/09/24 06:34:12 INFO timeline.HoodieActiveTimeline: Loaded instants [[20180924041125__clean__COMPLETED], [20180924041125__deltacommit__COMPLETED], [20180924042735__clean__COMPLETED], [20180924042735__deltacommit__COMPLETED], [==>20180924063245__compaction__REQUESTED]]
    ___________________________________________________________________
    | Compaction Instant Time| State    | Total FileIds to be Compacted|
    |==================================================================|
    | 20180924070031         | REQUESTED| 1                            | 
 # Execute the compaction. The compaction instant value passed below must be the one displayed in the above "compactions show all" query
 hoodie:stock_ticks_mor->compaction run --compactionInstant  20180924070031 --parallelism 2 --sparkMemory 1G  --schemaFilePath /var/demo/config/schema.avsc --retry 1  
 ....
 Compaction successfully completed for 20180924070031
 ## Now check if compaction is completed
 hoodie:stock_ticks_mor->connect --path /user/hive/warehouse/stock_ticks_mor
 18/09/24 07:03:00 INFO table.HoodieTableMetaClient: Loading HoodieTableMetaClient from /user/hive/warehouse/stock_ticks_mor
 18/09/24 07:03:00 INFO util.FSUtils: Hadoop Configuration: fs.defaultFS: [hdfs://namenode:8020], Config:[Configuration: core-default.xml, core-site.xml, mapred-default.xml, mapred-site.xml, yarn-default.xml, yarn-site.xml, hdfs-default.xml, hdfs-site.xml], FileSystem: [DFS[DFSClient[clientName=DFSClient_NONMAPREDUCE_-1261652683_11, ugi=root (auth:SIMPLE)]]]
 18/09/24 07:03:00 INFO table.HoodieTableConfig: Loading dataset properties from /user/hive/warehouse/stock_ticks_mor/.hoodie/hoodie.properties
 18/09/24 07:03:00 INFO table.HoodieTableMetaClient: Finished Loading Table of type MERGE_ON_READ from /user/hive/warehouse/stock_ticks_mor
 Metadata for table stock_ticks_mor loaded
 hoodie:stock_ticks_mor->compactions show all
 18/09/24 07:03:15 INFO timeline.HoodieActiveTimeline: Loaded instants [[20180924064636__clean__COMPLETED], [20180924064636__deltacommit__COMPLETED], [20180924065057__clean__COMPLETED], [20180924065057__deltacommit__COMPLETED], [20180924070031__commit__COMPLETED]]
    ___________________________________________________________________
    | Compaction Instant Time| State    | Total FileIds to be Compacted|
    |==================================================================|
    | 20180924070031         | COMPLETED| 1                            |
 ``` 
 #### Step 9: Run Hive Queries including incremental queries
 You will see that both ReadOptimized and Realtime Views will show the latest committed data.
 Let's also run the incremental query for the MOR table.
 From the query output below, it will be clear that the first commit time for the MOR table is 20180924064636
 and the second commit time is 20180924070031.
 ```
 docker exec -it adhoc-2 /bin/bash
 beeline -u jdbc:hive2://hiveserver:10000 --hiveconf hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat --hiveconf hive.stats.autogather=false
 # Read Optimized View
 0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) from stock_ticks_mor group by symbol HAVING symbol = 'GOOG';
 WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases.
 +---------+----------------------+--+
 | symbol  |         _c1          |
 +---------+----------------------+--+
 | GOOG    | 2018-08-31 10:59:00  |
 +---------+----------------------+--+
 1 row selected (1.6 seconds)
 0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close  from stock_ticks_mor where  symbol = 'GOOG';
 +----------------------+---------+----------------------+---------+------------+-----------+--+
 | _hoodie_commit_time  | symbol  |          ts          | volume  |    open    |   close   |
 +----------------------+---------+----------------------+---------+------------+-----------+--+
 | 20180924064636       | GOOG    | 2018-08-31 09:59:00  | 6330    | 1230.5     | 1230.02   |
 | 20180924070031       | GOOG    | 2018-08-31 10:59:00  | 9021    | 1227.1993  | 1227.215  |
 +----------------------+---------+----------------------+---------+------------+-----------+--+
 # Realtime View
 0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) from stock_ticks_mor_rt group by symbol HAVING symbol = 'GOOG';
 WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases.
 +---------+----------------------+--+
 | symbol  |         _c1          |
 +---------+----------------------+--+
 | GOOG    | 2018-08-31 10:59:00  |
 +---------+----------------------+--+
 0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close  from stock_ticks_mor_rt where  symbol = 'GOOG';
 +----------------------+---------+----------------------+---------+------------+-----------+--+
 | _hoodie_commit_time  | symbol  |          ts          | volume  |    open    |   close   |
 +----------------------+---------+----------------------+---------+------------+-----------+--+
 | 20180924064636       | GOOG    | 2018-08-31 09:59:00  | 6330    | 1230.5     | 1230.02   |
 | 20180924070031       | GOOG    | 2018-08-31 10:59:00  | 9021    | 1227.1993  | 1227.215  |
 +----------------------+---------+----------------------+---------+------------+-----------+--+
 # Incremental View:
 0: jdbc:hive2://hiveserver:10000> set hoodie.stock_ticks_mor.consume.mode=INCREMENTAL;
 No rows affected (0.008 seconds)
 # Max-Commits covers both second batch and compaction commit
 0: jdbc:hive2://hiveserver:10000> set hoodie.stock_ticks_mor.consume.max.commits=3;
 No rows affected (0.007 seconds)
 0: jdbc:hive2://hiveserver:10000> set hoodie.stock_ticks_mor.consume.start.timestamp=20180924064636;
 No rows affected (0.013 seconds)
 # Query:
 0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close  from stock_ticks_mor where  symbol = 'GOOG' and `_hoodie_commit_time` > '20180924064636';
 +----------------------+---------+----------------------+---------+------------+-----------+--+
 | _hoodie_commit_time  | symbol  |          ts          | volume  |    open    |   close   |
 +----------------------+---------+----------------------+---------+------------+-----------+--+
 | 20180924070031       | GOOG    | 2018-08-31 10:59:00  | 9021    | 1227.1993  | 1227.215  |
 +----------------------+---------+----------------------+---------+------------+-----------+--+
 exit
 exit
 ```
 ##### Read Optimized and Realtime Views for MOR with Spark-SQL after compaction
 ```
 docker exec -it adhoc-1 /bin/bash
 bash-4.4# $SPARK_INSTALL/bin/spark-shell --jars $HUDI_SPARK_BUNDLE --driver-class-path $HADOOP_CONF_DIR --conf spark.sql.hive.convertMetastoreParquet=false --deploy-mode client  --driver-memory 1G --master local[2] --executor-memory 3G --num-executors 1  --packages com.databricks:spark-avro_2.11:4.0.0
 # Read Optimized View
 scala> spark.sql("select symbol, max(ts) from stock_ticks_mor group by symbol HAVING symbol = 'GOOG'").show(100, false)
 +---------+----------------------+--+
 | symbol  |         _c1          |
 +---------+----------------------+--+
 | GOOG    | 2018-08-31 10:59:00  |
 +---------+----------------------+--+
 1 row selected (1.6 seconds)
 scala> spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close  from stock_ticks_mor where  symbol = 'GOOG'").show(100, false)
 +----------------------+---------+----------------------+---------+------------+-----------+--+
 | _hoodie_commit_time  | symbol  |          ts          | volume  |    open    |   close   |
 +----------------------+---------+----------------------+---------+------------+-----------+--+
 | 20180924064636       | GOOG    | 2018-08-31 09:59:00  | 6330    | 1230.5     | 1230.02   |
 | 20180924070031       | GOOG    | 2018-08-31 10:59:00  | 9021    | 1227.1993  | 1227.215  |
 +----------------------+---------+----------------------+---------+------------+-----------+--+
 # Realtime View
 scala> spark.sql("select symbol, max(ts) from stock_ticks_mor_rt group by symbol HAVING symbol = 'GOOG'").show(100, false)
 +---------+----------------------+--+
 | symbol  |         _c1          |
 +---------+----------------------+--+
 | GOOG    | 2018-08-31 10:59:00  |
 +---------+----------------------+--+
 scala> spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close  from stock_ticks_mor_rt where  symbol = 'GOOG'").show(100, false)
 +----------------------+---------+----------------------+---------+------------+-----------+--+
 | _hoodie_commit_time  | symbol  |          ts          | volume  |    open    |   close   |
 +----------------------+---------+----------------------+---------+------------+-----------+--+
 | 20180924064636       | GOOG    | 2018-08-31 09:59:00  | 6330    | 1230.5     | 1230.02   |
 | 20180924070031       | GOOG    | 2018-08-31 10:59:00  | 9021    | 1227.1993  | 1227.215  |
 +----------------------+---------+----------------------+---------+------------+-----------+--+
 ```
 This brings the demo to an end.
 ## Testing Hoodie in Local Docker environment
 You can bring up a hadoop docker environment containing Hadoop, Hive and Spark services with support for hoodie.
 ```
 $ mvn pre-integration-test -DskipTests
 ```
 The above command builds docker images for all the services with 
 current hoodie source installed at /var/hoodie/ws and also brings up the services using a compose file. We 
 currently use Hadoop (v2.8.4), Hive (v2.3.3) and Spark (v2.3.1) in docker images. 
 To bring down the containers
 ```
 $ cd hoodie-integ-test
 $ mvn docker-compose:down
 ```
 If you want to bring up the docker containers, use
 ```
 $ cd hoodie-integ-test
 $  mvn docker-compose:up -DdetachedMode=true
 ```
 Hoodie is a library that is operated in a broader data analytics/ingestion environment
 involving Hadoop, Hive and Spark. Interoperability with all these systems is a key objective for us. We are
 actively adding integration-tests under __hoodie-integ-test/src/test/java__ that make use of this
 docker environment (See __hoodie-integ-test/src/test/java/com/uber/hoodie/integ/ITTestHoodieSanity.java__ )
 #### Building Local Docker Containers:
 The docker images required for the demo and for running integration tests are already in docker-hub. The docker images
 and compose scripts are carefully implemented so that they serve a dual purpose:
 1. The docker images have inbuilt hudi jar files with environment variables pointing to those jars (HUDI_HADOOP_BUNDLE, ...)
 2. For running integration-tests, we need the jars generated locally to be used for running services within docker. The
   docker-compose scripts (see `docker/compose/docker-compose_hadoop284_hive233_spark231.yml`) ensure local jars override
   inbuilt jars by mounting the local HUDI workspace over the docker location
 This helps avoid maintaining separate docker images and avoids the costly step of building HUDI docker images locally.
 But if users want to test hudi from locations with lower network bandwidth, they can still build the images locally:
 run the script `docker/build_local_docker_images.sh` to build local docker images before running `docker/setup_demo.sh`.
 Here are the commands:
 ```
 cd docker
 ./build_local_docker_images.sh
 .....
 [INFO] Reactor Summary:
 [INFO]
 [INFO] Hoodie ............................................. SUCCESS [  1.709 s]
 [INFO] hoodie-common ...................................... SUCCESS [  9.015 s]
 [INFO] hoodie-hadoop-mr ................................... SUCCESS [  1.108 s]
 [INFO] hoodie-client ...................................... SUCCESS [  4.409 s]
 [INFO] hoodie-hive ........................................ SUCCESS [  0.976 s]
 [INFO] hoodie-spark ....................................... SUCCESS [ 26.522 s]
 [INFO] hoodie-utilities ................................... SUCCESS [ 16.256 s]
 [INFO] hoodie-cli ......................................... SUCCESS [ 11.341 s]
 [INFO] hoodie-hadoop-mr-bundle ............................ SUCCESS [  1.893 s]
 [INFO] hoodie-hive-bundle ................................. SUCCESS [ 14.099 s]
 [INFO] hoodie-spark-bundle ................................ SUCCESS [ 58.252 s]
 [INFO] hoodie-hadoop-docker ............................... SUCCESS [  0.612 s]
 [INFO] hoodie-hadoop-base-docker .......................... SUCCESS [04:04 min]
 [INFO] hoodie-hadoop-namenode-docker ...................... SUCCESS [  6.142 s]
 [INFO] hoodie-hadoop-datanode-docker ...................... SUCCESS [  7.763 s]
 [INFO] hoodie-hadoop-history-docker ....................... SUCCESS [  5.922 s]
 [INFO] hoodie-hadoop-hive-docker .......................... SUCCESS [ 56.152 s]
 [INFO] hoodie-hadoop-sparkbase-docker ..................... SUCCESS [01:18 min]
 [INFO] hoodie-hadoop-sparkmaster-docker ................... SUCCESS [  2.964 s]
 [INFO] hoodie-hadoop-sparkworker-docker ................... SUCCESS [  3.032 s]
 [INFO] hoodie-hadoop-sparkadhoc-docker .................... SUCCESS [  2.764 s]
 [INFO] hoodie-integ-test .................................. SUCCESS [  1.785 s]
 [INFO] ------------------------------------------------------------------------
 [INFO] BUILD SUCCESS
 [INFO] ------------------------------------------------------------------------
 [INFO] Total time: 09:15 min
 [INFO] Finished at: 2018-09-10T17:47:37-07:00
 [INFO] Final Memory: 236M/1848M
 [INFO] ------------------------------------------------------------------------
 ```
--- a/hoodie-hive/run_sync_tool.sh
+++ b/hoodie-hive/run_sync_tool.sh
--- a/hoodie-hive/src/test/java/com/uber/hoodie/hive/util/MultiPartKeysValueExtractor.java
+++ b/hoodie-hive/src/test/java/com/uber/hoodie/hive/util/MultiPartKeysValueExtractor.java
@@ -14,9 +14,8 @@
 * limitations under the License.
 */
-package com.uber.hoodie.hive.util;
+package com.uber.hoodie.hive;
 import com.uber.hoodie.hive.PartitionValueExtractor;
 import java.util.Arrays;
 import java.util.List;
--- a/hoodie-hive/src/test/java/com/uber/hoodie/hive/HiveSyncToolTest.java
+++ b/hoodie-hive/src/test/java/com/uber/hoodie/hive/HiveSyncToolTest.java
@@ -26,7 +26,6 @@ import com.google.common.collect.Lists;
 import com.uber.hoodie.common.util.SchemaTestUtil;
 import com.uber.hoodie.hive.HoodieHiveClient.PartitionEvent;
 import com.uber.hoodie.hive.HoodieHiveClient.PartitionEvent.PartitionEventType;
 import com.uber.hoodie.hive.util.MultiPartKeysValueExtractor;
 import com.uber.hoodie.hive.util.SchemaUtil;
 import java.io.IOException;
 import java.net.URISyntaxException;
--- a/hoodie-integ-test/pom.xml
+++ b/hoodie-integ-test/pom.xml
@@ -0,0 +1,212 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <!--
  ~ Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
  ~
  ~ Licensed under the Apache License, Version 2.0 (the "License");
  ~ you may not use this file except in compliance with the License.
  ~ You may obtain a copy of the License at
  ~
  ~           http://www.apache.org/licenses/LICENSE-2.0
  ~
  ~ Unless required by applicable law or agreed to in writing, software
  ~ distributed under the License is distributed on an "AS IS" BASIS,
  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  ~ See the License for the specific language governing permissions and
  ~ limitations under the License.
  -->
 <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <parent>
    <artifactId>hoodie</artifactId>
    <groupId>com.uber.hoodie</groupId>
    <version>0.4.5-SNAPSHOT</version>
    <relativePath>../pom.xml</relativePath>
  </parent>
  <artifactId>hoodie-integ-test</artifactId>
  <modelVersion>4.0.0</modelVersion>
  <dependencies>
    <dependency>
      <groupId>org.glassfish.jersey.connectors</groupId>
      <artifactId>jersey-apache-connector</artifactId>
      <version>2.17</version>
    </dependency>
    <dependency>
      <groupId>org.glassfish.jersey.core</groupId>
      <artifactId>jersey-server</artifactId>
      <version>2.17</version>
    </dependency>
    <dependency>
      <groupId>org.glassfish.jersey.containers</groupId>
      <artifactId>jersey-container-servlet-core</artifactId>
      <version>2.17</version>
    </dependency>
    <!-- Hoodie Spark module (main artifact). The exclusion below is presumably meant to keep the glassfish/jersey
         artifacts pulled in transitively by hoodie-spark off this module's classpath, since this pom declares its
         own jersey 2.17 dependencies.
         NOTE(review): Maven exclusions appear to support only a bare `*` wildcard, so `org.glassfish.**` may be
         matched literally and exclude nothing - confirm with `mvn dependency:tree`. -->
    <dependency>
      <groupId>com.uber.hoodie</groupId>
      <artifactId>hoodie-spark</artifactId>
      <version>${project.version}</version>
      <exclusions>
        <exclusion>
          <groupId>org.glassfish.**</groupId>
          <artifactId>*</artifactId>
        </exclusion>
      </exclusions>
    </dependency>
    <dependency>
      <groupId>com.uber.hoodie</groupId>
      <artifactId>hoodie-common</artifactId>
      <version>${project.version}</version>
      <classifier>tests</classifier>
      <type>test-jar</type>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.awaitility</groupId>
      <artifactId>awaitility</artifactId>
      <version>3.1.2</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>com.uber.hoodie</groupId>
      <artifactId>hoodie-spark</artifactId>
      <version>${project.version}</version>
      <classifier>tests</classifier>
      <type>test-jar</type>
      <scope>test</scope>
      <exclusions>
        <exclusion>
          <groupId>org.glassfish.**</groupId>
          <artifactId>*</artifactId>
        </exclusion>
      </exclusions>
    </dependency>
    <dependency>
      <groupId>com.google.guava</groupId>
      <artifactId>guava</artifactId>
      <version>20.0</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>com.fasterxml.jackson.core</groupId>
      <artifactId>jackson-annotations</artifactId>
      <version>2.6.4</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>com.fasterxml.jackson.core</groupId>
      <artifactId>jackson-databind</artifactId>
      <version>2.6.4</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>com.fasterxml.jackson.datatype</groupId>
      <artifactId>jackson-datatype-guava</artifactId>
      <version>2.9.4</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>com.github.docker-java</groupId>
      <artifactId>docker-java</artifactId>
      <version>3.1.0-rc-3</version>
      <scope>test</scope>
      <exclusions>
        <exclusion>
          <groupId>org.glassfish.**</groupId>
          <artifactId>*</artifactId>
        </exclusion>
      </exclusions>
    </dependency>
    <!-- Dependency on the spark-worker docker image module, referenced as a pom-type artifact.
         NOTE(review): presumably declared so that the docker image modules are built before this module;
         `import` scope is normally only meaningful inside <dependencyManagement>, so Maven may warn about it
         here - confirm the intent. -->
    <dependency>
      <groupId>com.uber.hoodie</groupId>
      <artifactId>hoodie-hadoop-sparkworker-docker</artifactId>
      <version>${project.version}</version>
      <type>pom</type>
      <scope>import</scope>
    </dependency>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>${junit.version}</version>
      <scope>test</scope>
    </dependency>
  </dependencies>
  <properties>
    <!-- Env file handed to docker-compose; it is written by the "Setup HUDI_WS" exec execution in this pom -->
    <dockerCompose.envFile>${project.basedir}/compose_env</dockerCompose.envFile>
    <!-- Compose file describing the Hadoop 2.8.4 / Hive 2.3.3 / Spark 2.3.1 services used by the tests.
         NOTE(review): presumably also picked up by the docker-compose plugin when its goals are invoked
         directly from the command line - confirm against the plugin documentation. -->
    <dockerCompose.file>${project.basedir}/../docker/compose/docker-compose_hadoop284_hive233_spark231.yml</dockerCompose.file>
    <!-- Integration tests are enabled by default; override with -DskipITs=true -->
    <skipITs>false</skipITs>
    <!-- docker-compose up/down executions are skipped whenever integration tests are skipped -->
    <docker.compose.skip>${skipITs}</docker.compose.skip>
    <!-- Checkstyle is turned off for this module -->
    <checkstyle.skip>true</checkstyle.skip>
  </properties>
  <build>
    <plugins>
      <!-- Generates the docker-compose env file containing HUDI_WS=<parent directory of this module>,
           i.e. the workspace root that the compose file mounts into the containers. -->
      <plugin>
        <artifactId>exec-maven-plugin</artifactId>
        <groupId>org.codehaus.mojo</groupId>
        <executions>
          <execution><!-- setup HUDI_WS variable in docker compose env file -->
            <id>Setup HUDI_WS</id>
            <phase>generate-sources</phase>
            <goals>
              <goal>exec</goal>
            </goals>
            <configuration>
              <executable>/bin/bash</executable>
              <arguments>
                <argument>-c</argument>
                <argument>echo HUDI_WS=`dirname ${project.basedir}`</argument>
              </arguments>
              <!-- stdout of the command above is captured into the env file -->
              <outputFile>${dockerCompose.envFile}</outputFile>
            </configuration>
          </execution>
        </executions>
      </plugin>
      <!-- Runs the IT* test classes. The `verify` goal is required in addition to `integration-test`:
           `integration-test` only records results, `verify` is what fails the build when a test failed. -->
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-failsafe-plugin</artifactId>
        <version>2.22.0</version>
        <executions>
          <execution>
            <goals>
              <goal>integration-test</goal>
              <goal>verify</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
      <!-- Brings the docker-compose services up before the integration tests and tears them down afterwards. -->
      <plugin>
        <groupId>com.dkanejs.maven.plugins</groupId>
        <artifactId>docker-compose-maven-plugin</artifactId>
        <version>2.0.1</version>
        <executions>
          <execution>
            <id>up</id>
            <phase>pre-integration-test</phase>
            <goals>
              <goal>up</goal>
            </goals>
            <configuration>
              <skip>${docker.compose.skip}</skip>
              <host>unix:///var/run/docker.sock</host>
              <composeFile>${dockerCompose.file}</composeFile>
              <detachedMode>true</detachedMode>
              <envFile>${dockerCompose.envFile}</envFile>
            </configuration>
          </execution>
          <execution>
            <id>down</id>
            <!-- Bound to the same phase as failsafe's integration-test goal; because this plugin is declared
                 after failsafe, the containers are stopped once the tests have run and before `verify`. -->
            <phase>integration-test</phase>
            <goals>
              <goal>down</goal>
            </goals>
            <configuration>
              <skip>${docker.compose.skip}</skip>
              <composeFile>${dockerCompose.file}</composeFile>
              <removeVolumes>true</removeVolumes>
            </configuration>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>
 </project>
--- a/hoodie-integ-test/src/test/java/com/uber/hoodie/integ/ITTestBase.java
+++ b/hoodie-integ-test/src/test/java/com/uber/hoodie/integ/ITTestBase.java
@@ -0,0 +1,178 @@
 /*
 * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *           http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package com.uber.hoodie.integ;
 import static java.util.concurrent.TimeUnit.SECONDS;
 import static org.awaitility.Awaitility.await;
 import com.github.dockerjava.api.DockerClient;
 import com.github.dockerjava.api.command.DockerCmdExecFactory;
 import com.github.dockerjava.api.command.ExecCreateCmd;
 import com.github.dockerjava.api.command.ExecCreateCmdResponse;
 import com.github.dockerjava.api.model.Container;
 import com.github.dockerjava.core.DefaultDockerClientConfig;
 import com.github.dockerjava.core.DockerClientBuilder;
 import com.github.dockerjava.core.DockerClientConfig;
 import com.github.dockerjava.core.command.ExecStartResultCallback;
 import com.github.dockerjava.jaxrs.JerseyDockerCmdExecFactory;
 import com.google.common.collect.ImmutableList;
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.util.Arrays;
 import java.util.List;
 import java.util.Map;
 import java.util.stream.Collectors;
 import org.apache.commons.lang3.tuple.Pair;
 import org.apache.log4j.LogManager;
 import org.apache.log4j.Logger;
 import org.junit.Assert;
 import org.junit.Before;
 /**
 * Base class for integration tests that run commands inside the docker-compose managed containers.
 *
 * <p>Before every test it connects to the docker daemon (the DOCKER_HOST environment variable overrides the
 * default unix socket), waits until the listed containers report the "running" state and indexes them by
 * name so that subclasses can execute commands in a given container via {@link #executeCommandInDocker}.
 */
 public abstract class ITTestBase {

  public static final Logger LOG = LogManager.getLogger(ITTestBase.class);

  // Container names used as lookup keys; they match the first name reported by docker (leading '/').
  protected static final String SPARK_WORKER_CONTAINER = "/spark-worker-1";
  protected static final String ADHOC_1_CONTAINER = "/adhoc-1";
  protected static final String ADHOC_2_CONTAINER = "/adhoc-2";
  protected static final String HIVESERVER = "/hiveserver";

  // Paths as seen from inside the containers
  protected static final String HOODIE_WS_ROOT = "/var/hoodie/ws";
  protected static final String HOODIE_JAVA_APP = HOODIE_WS_ROOT + "/hoodie-spark/run_hoodie_app.sh";
  protected static final String HUDI_HADOOP_BUNDLE =
      HOODIE_WS_ROOT + "/docker/hoodie/hadoop/hive_base/target/hoodie-hadoop-mr-bundle.jar";
  protected static final String HUDI_HIVE_BUNDLE =
      HOODIE_WS_ROOT + "/docker/hoodie/hadoop/hive_base/target/hoodie-hive-bundle.jar";
  protected static final String HUDI_SPARK_BUNDLE =
      HOODIE_WS_ROOT + "/docker/hoodie/hadoop/hive_base/target/hoodie-spark-bundle.jar";

  protected static final String HIVE_SERVER_JDBC_URL = "jdbc:hive2://hiveserver:10000";
  // Skip these lines when capturing output from hive
  protected static final Integer SLF4J_WARNING_LINE_COUNT_IN_HIVE_CMD = 9;

  private static final String DEFAULT_DOCKER_HOST = "unix:///var/run/docker.sock";
  private static final String OVERRIDDEN_DOCKER_HOST = System.getenv("DOCKER_HOST");

  protected DockerClient dockerClient;
  // Populated by servicesUp(): first container name -> container
  protected Map<String, Container> runningContainers;

  /**
   * Builds the argument vector for running a statement through the hive CLI.
   *
   * <p>The statement is prefixed with an "add jar" of the hoodie hadoop-mr bundle and the whole script is
   * wrapped in double quotes before being passed to {@code hive -e}.
   *
   * @param rawCommand hive statement(s) to run
   * @return command and arguments, ready to be passed to {@link #executeCommandInDocker}
   */
  protected static String[] getHiveConsoleCommand(String rawCommand) {
    String jarCommand = "add jar " + HUDI_HADOOP_BUNDLE + ";";
    String fullCommand = jarCommand + rawCommand;
    // Typed builder instead of the raw ImmutableList.Builder to avoid an unchecked conversion
    List<String> cmd = ImmutableList.<String>builder()
        .add("hive")
        .add("--hiveconf")
        .add("hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat")
        .add("--hiveconf")
        .add("hive.stats.autogather=false")
        .add("-e")
        .add("\"" + fullCommand + "\"")
        .build();
    return cmd.toArray(new String[0]);
  }

  /**
   * Creates the docker client and blocks (for at most 60 seconds) until {@link #servicesUp()} succeeds.
   */
  @Before
  public void init() throws IOException {
    String dockerHost = (OVERRIDDEN_DOCKER_HOST != null) ? OVERRIDDEN_DOCKER_HOST : DEFAULT_DOCKER_HOST;
    //Assuming insecure docker engine
    DockerClientConfig config = DefaultDockerClientConfig.createDefaultConfigBuilder()
        .withDockerHost(dockerHost)
        .build();
    // using jaxrs/jersey implementation here (netty impl is also available)
    DockerCmdExecFactory dockerCmdExecFactory = new JerseyDockerCmdExecFactory()
        .withConnectTimeout(1000)
        .withMaxTotalConnections(100)
        .withMaxPerRouteConnections(10);
    dockerClient = DockerClientBuilder.getInstance(config)
        .withDockerCmdExecFactory(dockerCmdExecFactory)
        .build();
    await().atMost(60, SECONDS).until(this::servicesUp);
  }

  /**
   * Checks that every container returned by the docker daemon is in the "running" state and, if so,
   * refreshes {@link #runningContainers}.
   *
   * @return true when all listed containers are running, false otherwise
   */
  private boolean servicesUp() {
    // NOTE(review): an empty container list also yields true here - confirm whether that is acceptable.
    List<Container> containerList = dockerClient.listContainersCmd().exec();
    for (Container c : containerList) {
      if (!c.getState().equalsIgnoreCase("running")) {
        LOG.info("Container : " + Arrays.toString(c.getNames())
            + " not in running state,  Curr State :" + c.getState());
        return false;
      }
    }
    runningContainers = containerList.stream().map(c -> Pair.of(c.getNames()[0], c))
        .collect(Collectors.toMap(Pair::getLeft, Pair::getRight));
    return true;
  }

  /**
   * Runs a command inside the named container, waits for it to finish and asserts on its exit code.
   *
   * @param containerName     key into {@link #runningContainers}, e.g. {@link #ADHOC_1_CONTAINER}
   * @param command           command and arguments to execute
   * @param expectedToSucceed when true the exit code must be 0, otherwise it must be non-zero
   * @return callback holding the captured stdout and stderr of the command
   * @throws Exception if the command could not be executed or waiting for its completion failed
   */
  protected TestExecStartResultCallback executeCommandInDocker(String containerName, String[] command,
      boolean expectedToSucceed)
      throws Exception {
    LOG.info("Executing command (" + Arrays.toString(command) + ") in container " + containerName);
    Container container = runningContainers.get(containerName);
    // Fail with a readable message instead of a NullPointerException when the container is not running
    Assert.assertNotNull("Container " + containerName + " is not running. Running containers : "
        + runningContainers.keySet(), container);
    ExecCreateCmd cmd = dockerClient.execCreateCmd(container.getId())
        .withCmd(command).withAttachStdout(true).withAttachStderr(true);
    try {
      ExecCreateCmdResponse createCmdResponse = cmd.exec();
      TestExecStartResultCallback callback = new TestExecStartResultCallback(new ByteArrayOutputStream(),
          new ByteArrayOutputStream());
      dockerClient.execStartCmd(createCmdResponse.getId()).withDetach(false).withTty(false)
          .exec(callback).awaitCompletion();
      int exitCode = dockerClient.inspectExecCmd(createCmdResponse.getId()).exec().getExitCode();
      LOG.info("Exit code for command (" + Arrays.toString(command) + ") is " + exitCode);
      if (exitCode != 0) {
        LOG.error("Command (" + Arrays.toString(command) + ") failed.");
        LOG.error("Stdout is :" + callback.getStdout().toString());
        LOG.error("Stderr is :" + callback.getStderr().toString());
      }
      if (expectedToSucceed) {
        Assert.assertTrue("Command (" + Arrays.toString(command)
            + ") expected to succeed. Exit (" + exitCode + ")", exitCode == 0);
      } else {
        Assert.assertTrue("Command (" + Arrays.toString(command)
            + ") expected to fail. Exit (" + exitCode + ")", exitCode != 0);
      }
      return callback;
    } finally {
      // Release the command even when one of the assertions above fails
      cmd.close();
    }
  }

  /**
   * Exec callback that keeps references to the stdout/stderr buffers so tests can read the captured output.
   */
  public class TestExecStartResultCallback extends ExecStartResultCallback {

    // Storing the reference in subclass to expose to clients
    private final ByteArrayOutputStream stdout;
    private final ByteArrayOutputStream stderr;

    public TestExecStartResultCallback(ByteArrayOutputStream stdout, ByteArrayOutputStream stderr) {
      super(stdout, stderr);
      this.stdout = stdout;
      this.stderr = stderr;
    }

    /** Flushes both buffers once the command has completed. */
    @Override
    public void onComplete() {
      super.onComplete();
      LOG.info("onComplete called");
      try {
        stderr.flush();
        stdout.flush();
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
    }

    /** @return buffer holding everything the command wrote to stdout */
    public ByteArrayOutputStream getStdout() {
      return stdout;
    }

    /** @return buffer holding everything the command wrote to stderr */
    public ByteArrayOutputStream getStderr() {
      return stderr;
    }
  }
 }
--- a/hoodie-integ-test/src/test/java/com/uber/hoodie/integ/ITTestHoodieSanity.java
+++ b/hoodie-integ-test/src/test/java/com/uber/hoodie/integ/ITTestHoodieSanity.java
@@ -0,0 +1,139 @@
 /*
 * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *           http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package com.uber.hoodie.integ;
 import java.util.Arrays;
 import org.junit.Assert;
 import org.junit.Test;
 /**
 * Smoke tests to run as part of verification. They execute commands inside the docker containers provided
 * by {@link ITTestBase} and check the results through the hive console.
 */
 public class ITTestHoodieSanity extends ITTestBase {

  /**
   * Sanity check of the docker exec plumbing: runs echo in the adhoc-1 container and verifies the output.
   */
  @Test
  public void testRunEcho() throws Exception {
    String[] cmd = new String[]{"echo", "Happy Testing"};
    TestExecStartResultCallback callback = executeCommandInDocker(ADHOC_1_CONTAINER,
        cmd, true);
    String stdout = callback.getStdout().toString();
    String stderr = callback.getStderr().toString();
    LOG.info("Got output for (" + Arrays.toString(cmd) + ") :" + stdout);
    LOG.info("Got error output for (" + Arrays.toString(cmd) + ") :" + stderr);
    // The exit code alone does not prove that stdout was captured correctly
    Assert.assertEquals("Unexpected output of echo", "Happy Testing", stdout.trim());
  }

  /**
   * A basic integration test that runs HoodieJavaApp to create a sample COW Hoodie with single partition key
   * data-set and performs upserts on it. Hive integration and upsert functionality is checked by running a count
   * query in hive console.
   */
  @Test
  public void testRunHoodieJavaAppOnSinglePartitionKeyCOWTable() throws Exception {
    String hiveTableName = "docker_hoodie_single_partition_key_cow_test";
    testRunHoodieJavaAppOnCOWTable(hiveTableName, true);
  }

  /**
   * A basic integration test that runs HoodieJavaApp to create a sample COW Hoodie with multiple partition-keys
   * data-set and performs upserts on it. Hive integration and upsert functionality is checked by running a count
   * query in hive console.
   */
  @Test
  public void testRunHoodieJavaAppOnMultiPartitionKeysCOWTable() throws Exception {
    String hiveTableName = "docker_hoodie_multi_partition_key_cow_test";
    testRunHoodieJavaAppOnCOWTable(hiveTableName, false);
  }

  /**
   * A basic integration test that runs HoodieJavaApp to create a sample COW Hoodie
   * data-set and performs upserts on it. Hive integration and upsert functionality is checked by running a count
   * query in hive console.
   * TODO: Add spark-shell test-case
   *
   * @param hiveTableName      name of the hive table the app syncs to; dropped first if it already exists
   * @param singlePartitionKey when false the app is started with --use-multi-partition-keys
   */
  public void testRunHoodieJavaAppOnCOWTable(String hiveTableName, boolean singlePartitionKey) throws Exception {
    // Drop Table if it exists
    {
      String[] hiveDropCmd = getHiveConsoleCommand("drop table if exists " + hiveTableName);
      executeCommandInDocker(HIVESERVER, hiveDropCmd, true);
    }
    // Ensure table does not exist
    {
      String[] hiveTableCheck = getHiveConsoleCommand("show tables like '" + hiveTableName + "'");
      TestExecStartResultCallback callback =
          executeCommandInDocker(HIVESERVER, hiveTableCheck, true);
      String stderr = callback.getStderr().toString();
      String stdout = callback.getStdout().toString();
      LOG.info("Got output for (" + Arrays.toString(hiveTableCheck) + ") :" + stdout);
      LOG.info("Got error output for (" + Arrays.toString(hiveTableCheck) + ") :" + stderr);
      Assert.assertTrue("Result :" + stdout, stdout.trim().isEmpty());
    }
    // Run Hoodie Java App
    {
      String[] cmd = null;
      if (singlePartitionKey) {
        cmd = new String[]{
            HOODIE_JAVA_APP,
            "--hive-sync",
            "--hive-url", HIVE_SERVER_JDBC_URL,
            "--hive-table", hiveTableName
        };
      } else {
        cmd = new String[]{
            HOODIE_JAVA_APP,
            "--hive-sync",
            "--hive-url", HIVE_SERVER_JDBC_URL,
            "--use-multi-partition-keys",
            "--hive-table", hiveTableName
        };
      }
      TestExecStartResultCallback callback = executeCommandInDocker(ADHOC_1_CONTAINER,
          cmd, true);
      String stdout = callback.getStdout().toString().trim();
      String stderr = callback.getStderr().toString().trim();
      LOG.info("Got output for (" + Arrays.toString(cmd) + ") :" + stdout);
      LOG.info("Got error output for (" + Arrays.toString(cmd) + ") :" + stderr);
    }
    // Ensure table does exist
    {
      String[] hiveTableCheck = getHiveConsoleCommand("show tables like '" + hiveTableName + "'");
      TestExecStartResultCallback callback =
          executeCommandInDocker(HIVESERVER, hiveTableCheck, true);
      String stderr = callback.getStderr().toString().trim();
      String stdout = callback.getStdout().toString().trim();
      LOG.info("Got output for (" + Arrays.toString(hiveTableCheck) + ") : (" + stdout + ")");
      LOG.info("Got error output for (" + Arrays.toString(hiveTableCheck) + ") : (" + stderr + ")");
      Assert.assertEquals("Table exists", hiveTableName, stdout);
    }
    // Ensure row count is 100 (without duplicates)
    {
      String[] hiveTableCheck = getHiveConsoleCommand("select count(1) from " + hiveTableName);
      TestExecStartResultCallback callback =
          executeCommandInDocker(ADHOC_1_CONTAINER, hiveTableCheck, true);
      String stderr = callback.getStderr().toString().trim();
      String stdout = callback.getStdout().toString().trim();
      LOG.info("Got output for (" + Arrays.toString(hiveTableCheck) + ") : (" + stdout + ")");
      LOG.info("Got error output for (" + Arrays.toString(hiveTableCheck) + ") : (" + stderr + ")");
      // stdout is already trimmed above
      Assert.assertEquals("Expecting 100 rows to be present in the new table", 100,
          Integer.parseInt(stdout));
    }
  }
 }
--- a/hoodie-integ-test/src/test/resources/log4j-surefire.properties
+++ b/hoodie-integ-test/src/test/resources/log4j-surefire.properties
@@ -0,0 +1,23 @@
 #
 # Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #           http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
 # Logging configuration for the integration tests (log4j 1.x properties format).
 # Root logger: only WARN and above are emitted, routed to the appender named A1.
 log4j.rootLogger=WARN, A1
 # Loggers under com.uber are raised to INFO so their messages are visible.
 log4j.category.com.uber=INFO
 # Loggers under org.apache.parquet.hadoop are set explicitly to WARN (same as the root level).
 log4j.category.org.apache.parquet.hadoop=WARN
 # A1 is set to be a ConsoleAppender.
 log4j.appender.A1=org.apache.log4j.ConsoleAppender
 # A1 uses PatternLayout.
 log4j.appender.A1.layout=org.apache.log4j.PatternLayout
 # Line format: elapsed millis (%r), thread (%t), level (%p), logger name (%c),
 # nested diagnostic context (%x), then the message (%m) and a line separator (%n).
 log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n
--- a/hoodie-spark/pom.xml
+++ b/hoodie-spark/pom.xml
@@ -111,6 +111,21 @@
          </execution>
        </executions>
      </plugin>
      <plugin>
        <!--
          Runs the maven-jar-plugin "test-jar" goal during the test-compile phase,
          packaging this module's test classes into a separate test jar.
          NOTE(review): presumably needed so test-scoped classes such as
          HoodieJavaApp can be consumed outside this module - confirm.
        -->
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-jar-plugin</artifactId>
        <executions>
          <execution>
            <goals>
              <goal>test-jar</goal>
            </goals>
            <phase>test-compile</phase>
          </execution>
        </executions>
        <configuration>
          <!-- Explicitly keep jar generation enabled.
               NOTE(review): presumably overrides an inherited or property-driven skip - confirm. -->
          <skip>false</skip>
        </configuration>
      </plugin>
      <plugin>
        <groupId>org.apache.rat</groupId>
        <artifactId>apache-rat-plugin</artifactId>
--- a/hoodie-spark/run_hoodie_app.sh
+++ b/hoodie-spark/run_hoodie_app.sh
@@ -21,4 +21,4 @@ fi
 OTHER_JARS=`ls -1 $DIR/target/lib/*jar | grep -v '*avro*-1.' | tr '\n' ':'`
 #TODO - Need to move TestDataGenerator and HoodieJavaApp out of tests
 echo "Running command : java -cp $DIR/target/test-classes/:$DIR/../hoodie-client/target/test-classes/:${HADOOP_CONF_DIR}:$HOODIE_JAR:${CLIENT_JAR}:$OTHER_JARS HoodieJavaApp $@"
-java -cp $DIR/target/test-classes/:$DIR/../hoodie-client/target/test-classes/:${HADOOP_CONF_DIR}:$HOODIE_JAR:${CLIENT_JAR}:$OTHER_JARS HoodieJavaApp "$@"
+java -Xmx1G -cp $DIR/target/test-classes/:$DIR/../hoodie-client/target/test-classes/:${HADOOP_CONF_DIR}:$HOODIE_JAR:${CLIENT_JAR}:$OTHER_JARS HoodieJavaApp "$@"
--- a/hoodie-spark/src/main/scala/com/uber/hoodie/DataSourceOptions.scala
+++ b/hoodie-spark/src/main/scala/com/uber/hoodie/DataSourceOptions.scala
@@ -153,7 +153,7 @@ object DataSourceWriteOptions {
  val HIVE_TABLE_OPT_KEY = "hoodie.datasource.hive_sync.table"
  val HIVE_USER_OPT_KEY = "hoodie.datasource.hive_sync.username"
  val HIVE_PASS_OPT_KEY = "hoodie.datasource.hive_sync.password"
-  val HIVE_URL_OPT_KEY = "hoodie.datasource.hive_sync.jdbcUrl"
+  val HIVE_URL_OPT_KEY = "hoodie.datasource.hive_sync.jdbcurl"
  val HIVE_PARTITION_FIELDS_OPT_KEY = "hoodie.datasource.hive_sync.partition_fields"
  val HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY = "hoodie.datasource.hive_sync.partition_extractor_class"
  val HIVE_ASSUME_DATE_PARTITION_OPT_KEY = "hoodie.datasource.hive_sync.assume_date_partitioning"
--- a/hoodie-spark/src/test/java/HoodieJavaApp.java
+++ b/hoodie-spark/src/test/java/HoodieJavaApp.java
@@ -24,6 +24,7 @@ import com.uber.hoodie.HoodieDataSourceHelpers;
 import com.uber.hoodie.common.HoodieTestDataGenerator;
 import com.uber.hoodie.common.model.HoodieTableType;
 import com.uber.hoodie.config.HoodieWriteConfig;
 import com.uber.hoodie.hive.MultiPartKeysValueExtractor;
 import java.util.List;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.log4j.LogManager;
@@ -65,7 +66,10 @@ public class HoodieJavaApp {
  private String hivePass = "hive";
  @Parameter(names = {"--hive-url", "-hl"}, description = "hive JDBC URL")
-  private String hiveJdbcUrl = "jdbc:hive://localhost:10000";
+  private String hiveJdbcUrl = "jdbc:hive2://localhost:10000";
  @Parameter(names = {"--use-multi-partition-keys", "-mp"}, description = "Use Multiple Partition Keys")
  private Boolean useMultiPartitionKeys = false;
  @Parameter(names = {"--help", "-h"}, help = true)
  public Boolean help = false;
@@ -188,10 +192,16 @@ public class HoodieJavaApp {
      writer = writer.option(DataSourceWriteOptions.HIVE_TABLE_OPT_KEY(), hiveTable)
          .option(DataSourceWriteOptions.HIVE_DATABASE_OPT_KEY(), hiveDB)
          .option(DataSourceWriteOptions.HIVE_URL_OPT_KEY(), hiveJdbcUrl)
          .option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY(), "dateStr")
          .option(DataSourceWriteOptions.HIVE_USER_OPT_KEY(), hiveUser)
          .option(DataSourceWriteOptions.HIVE_PASS_OPT_KEY(), hivePass)
          .option(DataSourceWriteOptions.HIVE_SYNC_ENABLED_OPT_KEY(), "true");
      if (useMultiPartitionKeys) {
        writer = writer.option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY(), "year,month,day")
            .option(DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY(),
            MultiPartKeysValueExtractor.class.getCanonicalName());
      } else {
        writer = writer.option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY(), "dateStr");
      }
    }
    return writer;
  }
--- a/pom.xml
+++ b/pom.xml
@@ -39,6 +39,8 @@
    <module>packaging/hoodie-hadoop-mr-bundle</module>
    <module>packaging/hoodie-hive-bundle</module>
    <module>packaging/hoodie-spark-bundle</module>
    <module>docker/hoodie/hadoop</module>
    <module>hoodie-integ-test</module>
  </modules>
  <licenses>
@@ -0,0 +1,3 @@
 #!/bin/bash

 $HADOOP_PREFIX/bin/yarn --config $HADOOP_CONF_DIR historyserver