1
0

Docker Container Build and Run setup with foundations for adding docker integration tests. Docker images built with Hadoop 2.8.4 Hive 2.3.3 and Spark 2.3.1 and published to docker-hub

Look at quickstart document for how to setup docker and run demo
This commit is contained in:
Balaji Varadarajan
2018-08-21 22:54:57 -07:00
committed by vinoth chandar
parent 9710b5a3a6
commit f3418e4718
63 changed files with 8952 additions and 9 deletions

View File

@@ -0,0 +1,13 @@
#!/bin/bash
while true; do
read -p "Docker images can be downloaded from docker hub and seamlessly mounted with latest HUDI jars. Do you still want to build docker images from scratch ?" yn
case $yn in
[Yy]* ) make install; break;;
[Nn]* ) exit;;
* ) echo "Please answer yes or no.";;
esac
done
pushd ../
mvn clean pre-integration-test -DskipTests -Ddocker.compose.skip=true -Ddocker.build.skip=false
popd

View File

@@ -0,0 +1,217 @@
version: "3.3"
services:
namenode:
image: varadarb/hudi-hadoop_2.8.4-namenode:latest
hostname: namenode
container_name: namenode
volumes:
- /tmp/hadoop_name:/hadoop/dfs/name
environment:
- CLUSTER_NAME=hudi_hadoop284_hive232_spark231
ports:
- "50070:50070"
- "8020:8020"
env_file:
- ./hadoop.env
healthcheck:
test: ["CMD", "curl", "-f", "http://namenode:50070"]
interval: 30s
timeout: 10s
retries: 3
datanode1:
image: varadarb/hudi-hadoop_2.8.4-datanode:latest
container_name: datanode1
hostname: datanode1
environment:
- CLUSTER_NAME=hudi_hadoop284_hive232_spark231
env_file:
- ./hadoop.env
ports:
- "50075:50075"
- "50010:50010"
links:
- "namenode"
- "historyserver"
healthcheck:
test: ["CMD", "curl", "-f", "http://datanode1:50075"]
interval: 30s
timeout: 10s
retries: 3
depends_on:
- namenode
volumes:
- /tmp/hadoop_data:/hadoop/dfs/data
historyserver:
image: varadarb/hudi-hadoop_2.8.4-history:latest
hostname: historyserver
container_name: historyserver
environment:
- CLUSTER_NAME=hudi_hadoop284_hive232_spark231
depends_on:
- "namenode"
links:
- "namenode"
ports:
- "58188:8188"
healthcheck:
test: ["CMD", "curl", "-f", "http://historyserver:8188"]
interval: 30s
timeout: 10s
retries: 3
env_file:
- ./hadoop.env
volumes:
- historyserver:/hadoop/yarn/timeline
hive-metastore-postgresql:
image: bde2020/hive-metastore-postgresql:2.3.0
volumes:
- hive-metastore-postgresql:/var/lib/postgresql
hostname: hive-metastore-postgresql
container_name: hive-metastore-postgresql
hivemetastore:
image: varadarb/hudi-hadoop_2.8.4-hive_2.3.3:latest
hostname: hivemetastore
container_name: hivemetastore
links:
- "hive-metastore-postgresql"
- "namenode"
env_file:
- ./hadoop.env
command: /opt/hive/bin/hive --service metastore
environment:
SERVICE_PRECONDITION: "namenode:50070 hive-metastore-postgresql:5432"
ports:
- "9083:9083"
healthcheck:
test: ["CMD", "nc", "-z", "hivemetastore", "9083"]
interval: 30s
timeout: 10s
retries: 3
depends_on:
- "hive-metastore-postgresql"
- "namenode"
hiveserver:
image: varadarb/hudi-hadoop_2.8.4-hive_2.3.3:latest
hostname: hiveserver
container_name: hiveserver
env_file:
- ./hadoop.env
environment:
SERVICE_PRECONDITION: "hivemetastore:9083"
ports:
- "10000:10000"
depends_on:
- "hivemetastore"
links:
- "hivemetastore"
- "hive-metastore-postgresql"
- "namenode"
volumes:
- ${HUDI_WS}:/var/hoodie/ws
sparkmaster:
image: varadarb/hudi-hadoop_2.8.4-hive_2.3.3-sparkmaster_2.3.1:latest
hostname: sparkmaster
container_name: sparkmaster
env_file:
- ./hadoop.env
ports:
- "8080:8080"
- "7077:7077"
environment:
- INIT_DAEMON_STEP=setup_spark
links:
- "hivemetastore"
- "hiveserver"
- "hive-metastore-postgresql"
- "namenode"
spark-worker-1:
image: varadarb/hudi-hadoop_2.8.4-hive_2.3.3-sparkworker_2.3.1:latest
hostname: spark-worker-1
container_name: spark-worker-1
env_file:
- ./hadoop.env
depends_on:
- sparkmaster
ports:
- "8081:8081"
environment:
- "SPARK_MASTER=spark://sparkmaster:7077"
links:
- "hivemetastore"
- "hiveserver"
- "hive-metastore-postgresql"
- "namenode"
zookeeper:
image: 'bitnami/zookeeper:3.4.12-r68'
hostname: zookeeper
container_name: zookeeper
ports:
- '2181:2181'
environment:
- ALLOW_ANONYMOUS_LOGIN=yes
kafka:
image: 'bitnami/kafka:2.0.0'
hostname: kafkabroker
container_name: kafkabroker
ports:
- '9092:9092'
environment:
- KAFKA_ZOOKEEPER_CONNECT=zookeeper:2181
- ALLOW_PLAINTEXT_LISTENER=yes
adhoc-1:
image: varadarb/hudi-hadoop_2.8.4-hive_2.3.3-sparkadhoc_2.3.1:latest
hostname: adhoc-1
container_name: adhoc-1
env_file:
- ./hadoop.env
depends_on:
- sparkmaster
ports:
- '4040:4040'
environment:
- "SPARK_MASTER=spark://sparkmaster:7077"
links:
- "hivemetastore"
- "hiveserver"
- "hive-metastore-postgresql"
- "namenode"
volumes:
- ${HUDI_WS}:/var/hoodie/ws
adhoc-2:
image: varadarb/hudi-hadoop_2.8.4-hive_2.3.3-sparkadhoc_2.3.1:latest
hostname: adhoc-2
container_name: adhoc-2
env_file:
- ./hadoop.env
depends_on:
- sparkmaster
environment:
- "SPARK_MASTER=spark://sparkmaster:7077"
links:
- "hivemetastore"
- "hiveserver"
- "hive-metastore-postgresql"
- "namenode"
volumes:
- ${HUDI_WS}:/var/hoodie/ws
volumes:
namenode:
historyserver:
hive-metastore-postgresql:
networks:
default:

33
docker/compose/hadoop.env Normal file
View File

@@ -0,0 +1,33 @@
HIVE_SITE_CONF_javax_jdo_option_ConnectionURL=jdbc:postgresql://hive-metastore-postgresql/metastore
HIVE_SITE_CONF_javax_jdo_option_ConnectionDriverName=org.postgresql.Driver
HIVE_SITE_CONF_javax_jdo_option_ConnectionUserName=hive
HIVE_SITE_CONF_javax_jdo_option_ConnectionPassword=hive
HIVE_SITE_CONF_datanucleus_autoCreateSchema=false
HIVE_SITE_CONF_hive_metastore_uris=thrift://hivemetastore:9083
HDFS_CONF_dfs_namenode_datanode_registration_ip___hostname___check=false
HDFS_CONF_dfs_webhdfs_enabled=true
HDFS_CONF_dfs_permissions_enabled=false
#HDFS_CONF_dfs_client_use_datanode_hostname=true
#HDFS_CONF_dfs_namenode_use_datanode_hostname=true
CORE_CONF_fs_defaultFS=hdfs://namenode:8020
CORE_CONF_hadoop_http_staticuser_user=root
CORE_CONF_hadoop_proxyuser_hue_hosts=*
CORE_CONF_hadoop_proxyuser_hue_groups=*
YARN_CONF_yarn_log___aggregation___enable=true
YARN_CONF_yarn_resourcemanager_recovery_enabled=true
YARN_CONF_yarn_resourcemanager_store_class=org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore
YARN_CONF_yarn_resourcemanager_fs_state___store_uri=/rmstate
YARN_CONF_yarn_nodemanager_remote___app___log___dir=/app-logs
YARN_CONF_yarn_log_server_url=http://historyserver:8188/applicationhistory/logs/
YARN_CONF_yarn_timeline___service_enabled=true
YARN_CONF_yarn_timeline___service_generic___application___history_enabled=true
YARN_CONF_yarn_resourcemanager_system___metrics___publisher_enabled=true
YARN_CONF_yarn_resourcemanager_hostname=resourcemanager
YARN_CONF_yarn_timeline___service_hostname=historyserver
YARN_CONF_yarn_resourcemanager_address=resourcemanager:8032
YARN_CONF_yarn_resourcemanager_scheduler_address=resourcemanager:8030
YARN_CONF_yarn_resourcemanager_resource___tracker_address=resourcemanager:8031
YARN_CONF_yarn_nodemanager_vmem___check___enabled=false

View File

@@ -0,0 +1,21 @@
#
# Copyright (c) 2017 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
#
#
# Common hoodie client configs
hoodie.upsert.shuffle.parallelism=2
hoodie.insert.shuffle.parallelism=2
hoodie.bulkinsert.shuffle.parallelism=2

View File

@@ -0,0 +1,29 @@
#
# Copyright (c) 2017 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
#
#
include=base.properties
# Key fields, for kafka example
hoodie.datasource.write.recordkey.field=key
hoodie.datasource.write.partitionpath.field=date
# Schema provider props (change to absolute path based on your installation)
hoodie.deltastreamer.schemaprovider.source.schema.file=/var/demo/config/schema.avsc
hoodie.deltastreamer.schemaprovider.target.schema.file=/var/demo/config/schema.avsc
# Kafka Source
hoodie.deltastreamer.source.kafka.topic=stock_ticks
#Kafka props
metadata.broker.list=kafkabroker:9092
auto.offset.reset=smallest

View File

@@ -0,0 +1,41 @@
{
"type":"record",
"name":"stock_ticks",
"fields":[{
"name": "volume",
"type": "long"
}, {
"name": "ts",
"type": "string"
}, {
"name": "symbol",
"type": "string"
},{
"name": "year",
"type": "int"
},{
"name": "month",
"type": "string"
},{
"name": "high",
"type": "double"
},{
"name": "low",
"type": "double"
},{
"name": "key",
"type": "string"
},{
"name": "date",
"type":"string"
}, {
"name": "close",
"type": "double"
}, {
"name": "open",
"type": "double"
}, {
"name": "day",
"type":"string"
}
]}

View File

@@ -0,0 +1,26 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Default system properties included when running spark-submit.
# This is useful for setting default environmental settings.
# Example:
spark.master local[3]
spark.eventLog.dir hdfs://namenode:8020/tmp/spark-events
spark.serializer org.apache.spark.serializer.KryoSerializer
#spark.executor.memory 4g
# spark.executor.extraJavaOptions -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three"

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,6 @@
echo "Copying spark default config and setting up configs"
cp /var/hoodie/ws/docker/demo/config/spark-defaults.conf $SPARK_CONF_DIR/.
hadoop fs -mkdir -p /var/demo/
hadoop fs -mkdir -p /tmp/spark-events
hadoop fs -copyFromLocal -f /var/hoodie/ws/docker/demo/config /var/demo/.
chmod +x /var/hoodie/ws/hoodie-hive/run_sync_tool.sh

View File

@@ -0,0 +1,45 @@
FROM frolvlad/alpine-oraclejdk8
MAINTAINER Hoodie
USER root
# Default to UTF-8 file.encoding
ENV LANG C.UTF-8
# Updating & Installing packages
RUN apk add net-tools curl bash perl procps
ARG HADOOP_VERSION=2.8.4
ARG HADOOP_URL=https://www.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz
ENV HADOOP_VERSION ${HADOOP_VERSION}
ENV HADOOP_URL ${HADOOP_URL}
RUN set -x \
&& echo "Fetch URL2 is : ${HADOOP_URL}" \
&& curl -fSL "${HADOOP_URL}" -o /tmp/hadoop.tar.gz \
&& curl -fSL "${HADOOP_URL}.asc" -o /tmp/hadoop.tar.gz.asc \
&& mkdir -p /opt/hadoop-$HADOOP_VERSION/logs \
&& tar -xvf /tmp/hadoop.tar.gz -C /opt/ \
&& rm /tmp/hadoop.tar.gz* \
&& ln -s /opt/hadoop-$HADOOP_VERSION/etc/hadoop /etc/hadoop \
&& cp /etc/hadoop/mapred-site.xml.template /etc/hadoop/mapred-site.xml \
&& mkdir /hadoop-data
ENV HADOOP_PREFIX=/opt/hadoop-$HADOOP_VERSION
ENV HADOOP_CONF_DIR=/etc/hadoop
ENV MULTIHOMED_NETWORK=1
ENV HADOOP_HOME=${HADOOP_PREFIX}
ENV HADOOP_INSTALL=${HADOOP_HOME}
ENV USER=root
ENV PATH /usr/bin:/bin:$HADOOP_PREFIX/bin/:$PATH
# Exposing a union of ports across hadoop versions
# Well known ports including ssh
EXPOSE 0-1024 4040 7000-10100 5000-5100 50000-50200 58188 58088 58042
ADD entrypoint.sh /entrypoint.sh
ADD export_container_ip.sh /usr/bin/
RUN chmod a+x /usr/bin/export_container_ip.sh \
&& chmod a+x /entrypoint.sh
ENTRYPOINT ["/bin/bash", "/entrypoint.sh"]

View File

@@ -0,0 +1,91 @@
#!/bin/bash
#######################################################################################
## COPIED FROM ##
## https://github.com/big-data-europe/docker-hadoop/blob/master/base/entrypoint.sh ##
# ##
#######################################################################################
# Set some sensible defaults
export CORE_CONF_fs_defaultFS=${CORE_CONF_fs_defaultFS:-hdfs://`hostname -f`:8020}
function addProperty() {
local path=$1
local name=$2
local value=$3
local entry="<property><name>$name</name><value>${value}</value></property>"
local escapedEntry=$(echo $entry | sed 's/\//\\\//g')
sed -i "/<\/configuration>/ s/.*/${escapedEntry}\n&/" $path
}
function configure() {
local path=$1
local module=$2
local envPrefix=$3
local var
local value
echo "Configuring $module"
for c in `printenv | perl -sne 'print "$1 " if m/^${envPrefix}_(.+?)=.*/' -- -envPrefix=$envPrefix`; do
name=`echo ${c} | perl -pe 's/___/-/g; s/__/@/g; s/_/./g; s/@/_/g;'`
var="${envPrefix}_${c}"
value=${!var}
echo " - Setting $name=$value"
addProperty /etc/hadoop/$module-site.xml $name "$value"
done
}
configure /etc/hadoop/core-site.xml core CORE_CONF
configure /etc/hadoop/hdfs-site.xml hdfs HDFS_CONF
configure /etc/hadoop/yarn-site.xml yarn YARN_CONF
configure /etc/hadoop/httpfs-site.xml httpfs HTTPFS_CONF
configure /etc/hadoop/kms-site.xml kms KMS_CONF
if [ "$MULTIHOMED_NETWORK" = "1" ]; then
echo "Configuring for multihomed network"
# HDFS
addProperty /etc/hadoop/hdfs-site.xml dfs.namenode.rpc-bind-host 0.0.0.0
addProperty /etc/hadoop/hdfs-site.xml dfs.namenode.servicerpc-bind-host 0.0.0.0
addProperty /etc/hadoop/hdfs-site.xml dfs.namenode.http-bind-host 0.0.0.0
addProperty /etc/hadoop/hdfs-site.xml dfs.namenode.https-bind-host 0.0.0.0
addProperty /etc/hadoop/hdfs-site.xml dfs.client.use.datanode.hostname true
addProperty /etc/hadoop/hdfs-site.xml dfs.datanode.use.datanode.hostname true
# YARN
addProperty /etc/hadoop/yarn-site.xml yarn.resourcemanager.bind-host 0.0.0.0
addProperty /etc/hadoop/yarn-site.xml yarn.nodemanager.bind-host 0.0.0.0
addProperty /etc/hadoop/yarn-site.xml yarn.nodemanager.bind-host 0.0.0.0
addProperty /etc/hadoop/yarn-site.xml yarn.timeline-service.bind-host 0.0.0.0
# MAPRED
addProperty /etc/hadoop/mapred-site.xml yarn.nodemanager.bind-host 0.0.0.0
fi
if [ -n "$GANGLIA_HOST" ]; then
mv /etc/hadoop/hadoop-metrics.properties /etc/hadoop/hadoop-metrics.properties.orig
mv /etc/hadoop/hadoop-metrics2.properties /etc/hadoop/hadoop-metrics2.properties.orig
for module in mapred jvm rpc ugi; do
echo "$module.class=org.apache.hadoop.metrics.ganglia.GangliaContext31"
echo "$module.period=10"
echo "$module.servers=$GANGLIA_HOST:8649"
done > /etc/hadoop/hadoop-metrics.properties
for module in namenode datanode resourcemanager nodemanager mrappmaster jobhistoryserver; do
echo "$module.sink.ganglia.class=org.apache.hadoop.metrics2.sink.ganglia.GangliaSink31"
echo "$module.sink.ganglia.period=10"
echo "$module.sink.ganglia.supportsparse=true"
echo "$module.sink.ganglia.slope=jvm.metrics.gcCount=zero,jvm.metrics.memHeapUsedM=both"
echo "$module.sink.ganglia.dmax=jvm.metrics.threadsBlocked=70,jvm.metrics.memHeapUsedM=40"
echo "$module.sink.ganglia.servers=$GANGLIA_HOST:8649"
done > /etc/hadoop/hadoop-metrics2.properties
fi
# Save Container IP in ENV variable
/usr/bin/export_container_ip.sh
exec "$@"

View File

@@ -0,0 +1,13 @@
interfaces=( "en0" "eth0" )
ipAddr=""
for interface in "${interfaces[@]}"
do
ipAddr=`ifconfig $interface | grep -Eo 'inet (addr:)?([0-9]+\.){3}[0-9]+' | grep -Eo '([0-9]+\.){3}[0-9]+' | grep -v '127.0.0.1' | head`
if [ -n "$ipAddr" ]; then
break
fi
done
echo "Container IP is set to : $ipAddr"
export MY_CONTAINER_IP=$ipAddr

View File

@@ -0,0 +1,90 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
~ Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
~
~ Licensed under the Apache License, Version 2.0 (the "License");
~ you may not use this file except in compliance with the License.
~ You may obtain a copy of the License at
~
~ http://www.apache.org/licenses/LICENSE-2.0
~
~ Unless required by applicable law or agreed to in writing, software
~ distributed under the License is distributed on an "AS IS" BASIS,
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
~ See the License for the specific language governing permissions and
~ limitations under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>hoodie-hadoop-docker</artifactId>
<groupId>com.uber.hoodie</groupId>
<version>0.4.5-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<packaging>pom</packaging>
<artifactId>hoodie-hadoop-base-docker</artifactId>
<description>Base Docker Image with Hoodie</description>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<checkstyle.skip>true</checkstyle.skip>
</properties>
<dependencies>
<dependency>
<groupId>com.uber.hoodie</groupId>
<artifactId>hoodie-hadoop-docker</artifactId>
<version>${project.version}</version>
<type>pom</type>
<scope>import</scope>
</dependency>
</dependencies>
<build>
<finalName>hoodie</finalName>
<plugins>
<!-- Build Docker image -->
<plugin>
<groupId>com.spotify</groupId>
<artifactId>dockerfile-maven-plugin</artifactId>
<version>${dockerfile.maven.version}</version>
<executions>
<execution>
<id>tag-latest</id>
<phase>pre-integration-test</phase>
<goals>
<goal>build</goal>
<goal>tag</goal>
<!-- <goal>push</goal> -->
</goals>
<configuration>
<skip>${docker.build.skip}</skip>
<pullNewerImage>false</pullNewerImage>
<repository>varadarb/hudi-hadoop_${docker.hadoop.version}-base</repository>
<forceTags>true</forceTags>
<tag>latest</tag>
</configuration>
</execution>
<execution>
<id>tag-version</id>
<phase>pre-integration-test</phase>
<goals>
<goal>build</goal>
<goal>tag</goal>
<!-- <goal>push</goal> -->
</goals>
<configuration>
<skip>${docker.build.skip}</skip>
<pullNewerImage>false</pullNewerImage>
<repository>varadarb/hudi-hadoop_${docker.hadoop.version}-base</repository>
<forceTags>true</forceTags>
<tag>${project.version}</tag>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>

View File

@@ -0,0 +1,14 @@
ARG HADOOP_VERSION=2.8.4
ARG HADOOP_DN_PORT=50075
FROM varadarb/hudi-hadoop_${HADOOP_VERSION}-base:latest
ENV HADOOP_DN_PORT ${HADOOP_DN_PORT}
ENV HDFS_CONF_dfs_datanode_data_dir=file:///hadoop/dfs/data
RUN mkdir -p /hadoop/dfs/data
VOLUME /hadoop/dfs/data
ADD run_dn.sh /run_dn.sh
RUN chmod a+x /run_dn.sh
CMD ["/run_dn.sh"]

View File

@@ -0,0 +1,89 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
~ Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
~
~ Licensed under the Apache License, Version 2.0 (the "License");
~ you may not use this file except in compliance with the License.
~ You may obtain a copy of the License at
~
~ http://www.apache.org/licenses/LICENSE-2.0
~
~ Unless required by applicable law or agreed to in writing, software
~ distributed under the License is distributed on an "AS IS" BASIS,
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
~ See the License for the specific language governing permissions and
~ limitations under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>hoodie-hadoop-docker</artifactId>
<groupId>com.uber.hoodie</groupId>
<version>0.4.5-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<packaging>pom</packaging>
<artifactId>hoodie-hadoop-datanode-docker</artifactId>
<description>Base Docker Image with Hoodie</description>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<checkstyle.skip>true</checkstyle.skip>
</properties>
<dependencies>
<dependency>
<groupId>com.uber.hoodie</groupId>
<artifactId>hoodie-hadoop-base-docker</artifactId>
<version>${project.version}</version>
<type>pom</type>
<scope>import</scope>
</dependency>
</dependencies>
<build>
<plugins>
<!-- Build Docker image -->
<plugin>
<groupId>com.spotify</groupId>
<artifactId>dockerfile-maven-plugin</artifactId>
<version>${dockerfile.maven.version}</version>
<executions>
<execution>
<id>tag-latest</id>
<phase>pre-integration-test</phase>
<goals>
<goal>build</goal>
<goal>tag</goal>
<!-- <goal>push</goal> -->
</goals>
<configuration>
<skip>${docker.build.skip}</skip>
<pullNewerImage>false</pullNewerImage>
<repository>varadarb/hudi-hadoop_${docker.hadoop.version}-datanode</repository>
<forceTags>true</forceTags>
<tag>latest</tag>
</configuration>
</execution>
<execution>
<id>tag-version</id>
<phase>pre-integration-test</phase>
<goals>
<goal>build</goal>
<goal>tag</goal>
<!-- <goal>push</goal> -->
</goals>
<configuration>
<skip>${docker.build.skip}</skip>
<pullNewerImage>false</pullNewerImage>
<repository>varadarb/hudi-hadoop_${docker.hadoop.version}-datanode</repository>
<forceTags>true</forceTags>
<tag>${project.version}</tag>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>

View File

@@ -0,0 +1,9 @@
#!/bin/bash
datadir=`echo $HDFS_CONF_dfs_datanode_data_dir | perl -pe 's#file://##'`
if [ ! -d $datadir ]; then
echo "Datanode data directory not found: $datadir"
exit 2
fi
$HADOOP_PREFIX/bin/hdfs --config $HADOOP_CONF_DIR datanode

View File

@@ -0,0 +1,14 @@
ARG HADOOP_VERSION=2.8.4
ARG HADOOP_HISTORY_PORT=8188
FROM varadarb/hudi-hadoop_${HADOOP_VERSION}-base:latest
ENV HADOOP_HISTORY_PORT ${HADOOP_HISTORY_PORT}
ENV YARN_CONF_yarn_timeline___service_leveldb___timeline___store_path=/hadoop/yarn/timeline
RUN mkdir -p /hadoop/yarn/timeline
VOLUME /hadoop/yarn/timeline
ADD run_history.sh /run_history.sh
RUN chmod a+x /run_history.sh
CMD ["/run_history.sh"]

View File

@@ -0,0 +1,89 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
~ Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
~
~ Licensed under the Apache License, Version 2.0 (the "License");
~ you may not use this file except in compliance with the License.
~ You may obtain a copy of the License at
~
~ http://www.apache.org/licenses/LICENSE-2.0
~
~ Unless required by applicable law or agreed to in writing, software
~ distributed under the License is distributed on an "AS IS" BASIS,
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
~ See the License for the specific language governing permissions and
~ limitations under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>hoodie-hadoop-docker</artifactId>
<groupId>com.uber.hoodie</groupId>
<version>0.4.5-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<packaging>pom</packaging>
<artifactId>hoodie-hadoop-history-docker</artifactId>
<description>Base Docker Image with Hoodie</description>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<checkstyle.skip>true</checkstyle.skip>
</properties>
<dependencies>
<dependency>
<groupId>com.uber.hoodie</groupId>
<artifactId>hoodie-hadoop-base-docker</artifactId>
<version>${project.version}</version>
<type>pom</type>
<scope>import</scope>
</dependency>
</dependencies>
<build>
<plugins>
<!-- Build Docker image -->
<plugin>
<groupId>com.spotify</groupId>
<artifactId>dockerfile-maven-plugin</artifactId>
<version>${dockerfile.maven.version}</version>
<executions>
<execution>
<id>tag-latest</id>
<phase>pre-integration-test</phase>
<goals>
<goal>build</goal>
<goal>tag</goal>
<!-- <goal>push</goal> -->
</goals>
<configuration>
<skip>${docker.build.skip}</skip>
<pullNewerImage>false</pullNewerImage>
<repository>varadarb/hudi-hadoop_${docker.hadoop.version}-history</repository>
<forceTags>true</forceTags>
<tag>latest</tag>
</configuration>
</execution>
<execution>
<id>tag-version</id>
<phase>pre-integration-test</phase>
<goals>
<goal>build</goal>
<goal>tag</goal>
<!-- <goal>push</goal> -->
</goals>
<configuration>
<skip>${docker.build.skip}</skip>
<pullNewerImage>false</pullNewerImage>
<repository>varadarb/hudi-hadoop_${docker.hadoop.version}-history</repository>
<forceTags>true</forceTags>
<tag>${project.version}</tag>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>

View File

@@ -0,0 +1,3 @@
#!/bin/bash
$HADOOP_PREFIX/bin/yarn --config $HADOOP_CONF_DIR historyserver

View File

@@ -0,0 +1,51 @@
ARG HADOOP_VERSION=2.8.4
FROM varadarb/hudi-hadoop_${HADOOP_VERSION}-base:latest
ENV HIVE_HOME /opt/hive
ENV PATH $HIVE_HOME/bin:$PATH
ENV HADOOP_HOME /opt/hadoop-$HADOOP_VERSION
WORKDIR /opt
ARG HIVE_VERSION=2.3.3
ARG HIVE_URL=https://archive.apache.org/dist/hive/hive-$HIVE_VERSION/apache-hive-$HIVE_VERSION-bin.tar.gz
ENV HIVE_VERSION ${HIVE_VERSION}
ENV HIVE_URL ${HIVE_URL}
#Install Hive MySQL, PostgreSQL JDBC
RUN echo "Hive URL is :${HIVE_URL}" && wget ${HIVE_URL} -O hive.tar.gz && \
tar -xzvf hive.tar.gz && mv *hive*-bin hive && \
ln -s /usr/share/java/mysql-connector-java.jar $HIVE_HOME/lib/mysql-connector-java.jar && \
wget https://jdbc.postgresql.org/download/postgresql-9.4.1212.jar -O $HIVE_HOME/lib/postgresql-jdbc.jar && \
rm hive.tar.gz && mkdir -p /var/hoodie/ws/docker/hoodie/hadoop/hive_base/target/
#Spark should be compiled with Hive to be able to use it
#hive-site.xml should be copied to $SPARK_HOME/conf folder
#Custom configuration goes here
ADD conf/hive-site.xml $HADOOP_CONF_DIR
ADD conf/beeline-log4j2.properties $HIVE_HOME/conf
ADD conf/hive-env.sh $HIVE_HOME/conf
ADD conf/hive-exec-log4j2.properties $HIVE_HOME/conf
ADD conf/hive-log4j2.properties $HIVE_HOME/conf
ADD conf/ivysettings.xml $HIVE_HOME/conf
ADD conf/llap-daemon-log4j2.properties $HIVE_HOME/conf
# Setup Hoodie Library jars
ADD target/ /var/hoodie/ws/docker/hoodie/hadoop/hive_base/target/
ENV HUDI_HADOOP_BUNDLE=/var/hoodie/ws/docker/hoodie/hadoop/hive_base/target/hoodie-hadoop-mr-bundle.jar
ENV HUDI_HIVE_BUNDLE=/var/hoodie/ws/docker/hoodie/hadoop/hive_base/target/hoodie-hive-bundle.jar
ENV HUDI_SPARK_BUNDLE=/var/hoodie/ws/docker/hoodie/hadoop/hive_base/target/hoodie-spark-bundle.jar
ENV HUDI_UTILITIES_BUNDLE=/var/hoodie/ws/docker/hoodie/hadoop/hive_base/target/hoodie-utilities.jar
COPY startup.sh /usr/local/bin/
RUN chmod +x /usr/local/bin/startup.sh
COPY entrypoint.sh /usr/local/bin/
RUN chmod +x /usr/local/bin/entrypoint.sh
ENV PATH $HIVE_HOME/bin/:$PATH
ENTRYPOINT ["entrypoint.sh"]
CMD startup.sh

View File

@@ -0,0 +1,45 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
status = INFO
name = BeelineLog4j2
packages = org.apache.hadoop.hive.ql.log
# list of properties
property.hive.log.level = WARN
property.hive.root.logger = console
# list of all appenders
appenders = console
# console appender
appender.console.type = Console
appender.console.name = console
appender.console.target = SYSTEM_ERR
appender.console.layout.type = PatternLayout
appender.console.layout.pattern = %d{yy/MM/dd HH:mm:ss} [%t]: %p %c{2}: %m%n
# list of all loggers
loggers = HiveConnection
# HiveConnection logs useful info for dynamic service discovery
logger.HiveConnection.name = org.apache.hive.jdbc.HiveConnection
logger.HiveConnection.level = INFO
# root logger
rootLogger.level = ${sys:hive.log.level}
rootLogger.appenderRefs = root
rootLogger.appenderRef.root.ref = ${sys:hive.root.logger}

View File

@@ -0,0 +1,54 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Set Hive and Hadoop environment variables here. These variables can be used
# to control the execution of Hive. It should be used by admins to configure
# the Hive installation (so that users do not have to set environment variables
# or set command line parameters to get correct behavior).
#
# The hive service being invoked (CLI/HWI etc.) is available via the environment
# variable SERVICE
# Hive Client memory usage can be an issue if a large number of clients
# are running at the same time. The flags below have been useful in
# reducing memory usage:
#
# if [ "$SERVICE" = "cli" ]; then
# if [ -z "$DEBUG" ]; then
# export HADOOP_OPTS="$HADOOP_OPTS -XX:NewRatio=12 -Xms10m -XX:MaxHeapFreeRatio=40 -XX:MinHeapFreeRatio=15 -XX:+UseParNewGC -XX:-UseGCOverheadLimit"
# else
# export HADOOP_OPTS="$HADOOP_OPTS -XX:NewRatio=12 -Xms10m -XX:MaxHeapFreeRatio=40 -XX:MinHeapFreeRatio=15 -XX:-UseGCOverheadLimit"
# fi
# fi
# The heap size of the jvm stared by hive shell script can be controlled via:
#
# export HADOOP_HEAPSIZE=1024
#
# Larger heap size may be required when running queries over large number of files or partitions.
# By default hive shell scripts use a heap size of 256 (MB). Larger heap size would also be
# appropriate for hive server (hwi etc).
# Set HADOOP_HOME to point to a specific hadoop install directory
# HADOOP_HOME=${bin}/../../hadoop
# Hive Configuration Directory can be controlled by:
# export HIVE_CONF_DIR=
# Folder containing extra ibraries required for hive compilation/execution can be controlled by:
# export HIVE_AUX_JARS_PATH=

View File

@@ -0,0 +1,66 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
status = INFO
name = HiveExecLog4j2
packages = org.apache.hadoop.hive.ql.log
# list of properties
property.hive.log.level = INFO
property.hive.root.logger = FA
property.hive.query.id = hadoop
property.hive.log.dir = ${sys:java.io.tmpdir}/${sys:user.name}
property.hive.log.file = ${sys:hive.query.id}.log
# list of all appenders
appenders = console, FA
# console appender
appender.console.type = Console
appender.console.name = console
appender.console.target = SYSTEM_ERR
appender.console.layout.type = PatternLayout
appender.console.layout.pattern = %d{yy/MM/dd HH:mm:ss} [%t]: %p %c{2}: %m%n
# simple file appender
appender.FA.type = File
appender.FA.name = FA
appender.FA.fileName = ${sys:hive.log.dir}/${sys:hive.log.file}
appender.FA.layout.type = PatternLayout
appender.FA.layout.pattern = %d{ISO8601} %-5p [%t]: %c{2} (%F:%M(%L)) - %m%n
# list of all loggers
loggers = NIOServerCnxn, ClientCnxnSocketNIO, DataNucleus, Datastore, JPOX
logger.NIOServerCnxn.name = org.apache.zookeeper.server.NIOServerCnxn
logger.NIOServerCnxn.level = WARN
logger.ClientCnxnSocketNIO.name = org.apache.zookeeper.ClientCnxnSocketNIO
logger.ClientCnxnSocketNIO.level = WARN
logger.DataNucleus.name = DataNucleus
logger.DataNucleus.level = ERROR
logger.Datastore.name = Datastore
logger.Datastore.level = ERROR
logger.JPOX.name = JPOX
logger.JPOX.level = ERROR
# root logger
rootLogger.level = ${sys:hive.log.level}
rootLogger.appenderRefs = root
rootLogger.appenderRef.root.ref = ${sys:hive.root.logger}

View File

@@ -0,0 +1,73 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
status = INFO
name = HiveLog4j2
packages = org.apache.hadoop.hive.ql.log
# list of properties
property.hive.log.level = INFO
property.hive.root.logger = DRFA
property.hive.log.dir = ${sys:java.io.tmpdir}/${sys:user.name}
property.hive.log.file = hive.log
# list of all appenders
appenders = console, DRFA
# console appender
appender.console.type = Console
appender.console.name = console
appender.console.target = SYSTEM_ERR
appender.console.layout.type = PatternLayout
appender.console.layout.pattern = %d{yy/MM/dd HH:mm:ss} [%t]: %p %c{2}: %m%n
# daily rolling file appender
appender.DRFA.type = RollingFile
appender.DRFA.name = DRFA
appender.DRFA.fileName = ${sys:hive.log.dir}/${sys:hive.log.file}
# Use %pid in the filePattern to append <process-id>@<host-name> to the filename if you want separate log files for different CLI session
appender.DRFA.filePattern = ${sys:hive.log.dir}/${sys:hive.log.file}.%d{yyyy-MM-dd}
appender.DRFA.layout.type = PatternLayout
appender.DRFA.layout.pattern = %d{ISO8601} %-5p [%t]: %c{2} (%F:%M(%L)) - %m%n
appender.DRFA.policies.type = Policies
appender.DRFA.policies.time.type = TimeBasedTriggeringPolicy
appender.DRFA.policies.time.interval = 1
appender.DRFA.policies.time.modulate = true
appender.DRFA.strategy.type = DefaultRolloverStrategy
appender.DRFA.strategy.max = 30
# list of all loggers
loggers = NIOServerCnxn, ClientCnxnSocketNIO, DataNucleus, Datastore, JPOX
logger.NIOServerCnxn.name = org.apache.zookeeper.server.NIOServerCnxn
logger.NIOServerCnxn.level = WARN
logger.ClientCnxnSocketNIO.name = org.apache.zookeeper.ClientCnxnSocketNIO
logger.ClientCnxnSocketNIO.level = WARN
logger.DataNucleus.name = DataNucleus
logger.DataNucleus.level = ERROR
logger.Datastore.name = Datastore
logger.Datastore.level = ERROR
logger.JPOX.name = JPOX
logger.JPOX.level = ERROR
# root logger
rootLogger.level = ${sys:hive.log.level}
rootLogger.appenderRefs = root
rootLogger.appenderRef.root.ref = ${sys:hive.root.logger}

View File

@@ -0,0 +1,18 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?><!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
--><configuration>
</configuration>

View File

@@ -0,0 +1,45 @@
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!--This file is used by grapes to download dependencies from a maven repository.
This is just a template and can be edited to add more repositories.
-->
<ivysettings>
<!--name of the defaultResolver should always be 'downloadGrapes'. -->
<settings defaultResolver="downloadGrapes"/>
<!-- Only set maven.local.repository if not already set -->
<property name="maven.local.repository" value="${user.home}/.m2/repository" override="false" />
<property name="m2-pattern"
value="file:${maven.local.repository}/[organisation]/[module]/[revision]/[module]-[revision](-[classifier]).[ext]"
override="false"/>
<resolvers>
<!-- more resolvers can be added here -->
<chain name="downloadGrapes">
<!-- This resolver uses ibiblio to find artifacts, compatible with maven2 repository -->
<ibiblio name="central" m2compatible="true"/>
<url name="local-maven2" m2compatible="true">
<artifact pattern="${m2-pattern}"/>
</url>
<!-- File resolver to add jars from the local system. -->
<filesystem name="test" checkmodified="true">
<artifact pattern="/tmp/[module]-[revision](-[classifier]).jar"/>
</filesystem>
</chain>
</resolvers>
</ivysettings>

View File

@@ -0,0 +1,93 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
status = INFO
name = LlapDaemonLog4j2
packages = org.apache.hadoop.hive.ql.log
# list of properties
property.llap.daemon.log.level = INFO
property.llap.daemon.root.logger = console
property.llap.daemon.log.dir = .
property.llap.daemon.log.file = llapdaemon.log
property.llap.daemon.historylog.file = llapdaemon_history.log
property.llap.daemon.log.maxfilesize = 256MB
property.llap.daemon.log.maxbackupindex = 20
# list of all appenders
appenders = console, RFA, HISTORYAPPENDER
# console appender
appender.console.type = Console
appender.console.name = console
appender.console.target = SYSTEM_ERR
appender.console.layout.type = PatternLayout
appender.console.layout.pattern = %d{yy/MM/dd HH:mm:ss} [%t%x] %p %c{2} : %m%n
# rolling file appender
appender.RFA.type = RollingFile
appender.RFA.name = RFA
appender.RFA.fileName = ${sys:llap.daemon.log.dir}/${sys:llap.daemon.log.file}
appender.RFA.filePattern = ${sys:llap.daemon.log.dir}/${sys:llap.daemon.log.file}_%i
appender.RFA.layout.type = PatternLayout
appender.RFA.layout.pattern = %d{ISO8601} %-5p [%t%x]: %c{2} (%F:%M(%L)) - %m%n
appender.RFA.policies.type = Policies
appender.RFA.policies.size.type = SizeBasedTriggeringPolicy
appender.RFA.policies.size.size = ${sys:llap.daemon.log.maxfilesize}
appender.RFA.strategy.type = DefaultRolloverStrategy
appender.RFA.strategy.max = ${sys:llap.daemon.log.maxbackupindex}
# history file appender
appender.HISTORYAPPENDER.type = RollingFile
appender.HISTORYAPPENDER.name = HISTORYAPPENDER
appender.HISTORYAPPENDER.fileName = ${sys:llap.daemon.log.dir}/${sys:llap.daemon.historylog.file}
appender.HISTORYAPPENDER.filePattern = ${sys:llap.daemon.log.dir}/${sys:llap.daemon.historylog.file}_%i
appender.HISTORYAPPENDER.layout.type = PatternLayout
appender.HISTORYAPPENDER.layout.pattern = %m%n
appender.HISTORYAPPENDER.policies.type = Policies
appender.HISTORYAPPENDER.policies.size.type = SizeBasedTriggeringPolicy
appender.HISTORYAPPENDER.policies.size.size = ${sys:llap.daemon.log.maxfilesize}
appender.HISTORYAPPENDER.strategy.type = DefaultRolloverStrategy
appender.HISTORYAPPENDER.strategy.max = ${sys:llap.daemon.log.maxbackupindex}
# list of all loggers
loggers = NIOServerCnxn, ClientCnxnSocketNIO, DataNucleus, Datastore, JPOX, HistoryLogger
logger.NIOServerCnxn.name = org.apache.zookeeper.server.NIOServerCnxn
logger.NIOServerCnxn.level = WARN
logger.ClientCnxnSocketNIO.name = org.apache.zookeeper.ClientCnxnSocketNIO
logger.ClientCnxnSocketNIO.level = WARN
logger.DataNucleus.name = DataNucleus
logger.DataNucleus.level = ERROR
logger.Datastore.name = Datastore
logger.Datastore.level = ERROR
logger.JPOX.name = JPOX
logger.JPOX.level = ERROR
logger.HistoryLogger.name = org.apache.hadoop.hive.llap.daemon.HistoryLogger
logger.HistoryLogger.level = INFO
logger.HistoryLogger.additivity = false
logger.HistoryLogger.appenderRefs = HistoryAppender
logger.HistoryLogger.appenderRef.HistoryAppender.ref = HISTORYAPPENDER
# root logger
rootLogger.level = ${sys:llap.daemon.log.level}
rootLogger.appenderRefs = root
rootLogger.appenderRef.root.ref = ${sys:llap.daemon.root.logger}

View File

@@ -0,0 +1,118 @@
#!/bin/bash
# Set some sensible defaults
export CORE_CONF_fs_defaultFS=${CORE_CONF_fs_defaultFS:-hdfs://`hostname -f`:8020}
function addProperty() {
local path=$1
local name=$2
local value=$3
local entry="<property><name>$name</name><value>${value}</value></property>"
local escapedEntry=$(echo $entry | sed 's/\//\\\//g')
sed -i "/<\/configuration>/ s/.*/${escapedEntry}\n&/" $path
}
function configure() {
local path=$1
local module=$2
local envPrefix=$3
local var
local value
echo "Configuring $module"
for c in `printenv | perl -sne 'print "$1 " if m/^${envPrefix}_(.+?)=.*/' -- -envPrefix=$envPrefix`; do
name=`echo ${c} | perl -pe 's/___/-/g; s/__/_/g; s/_/./g'`
var="${envPrefix}_${c}"
value=${!var}
echo " - Setting $name=$value"
addProperty $path $name "$value"
done
}
configure /etc/hadoop/core-site.xml core CORE_CONF
configure /etc/hadoop/hdfs-site.xml hdfs HDFS_CONF
configure /etc/hadoop/yarn-site.xml yarn YARN_CONF
configure /etc/hadoop/httpfs-site.xml httpfs HTTPFS_CONF
configure /etc/hadoop/kms-site.xml kms KMS_CONF
configure /etc/hadoop/mapred-site.xml mapred MAPRED_CONF
configure /etc/hadoop/hive-site.xml hive HIVE_SITE_CONF
if [ "$MULTIHOMED_NETWORK" = "1" ]; then
echo "Configuring for multihomed network"
# HDFS
addProperty /etc/hadoop/hdfs-site.xml dfs.namenode.rpc-bind-host 0.0.0.0
addProperty /etc/hadoop/hdfs-site.xml dfs.namenode.servicerpc-bind-host 0.0.0.0
addProperty /etc/hadoop/hdfs-site.xml dfs.namenode.http-bind-host 0.0.0.0
addProperty /etc/hadoop/hdfs-site.xml dfs.namenode.https-bind-host 0.0.0.0
addProperty /etc/hadoop/hdfs-site.xml dfs.client.use.datanode.hostname true
addProperty /etc/hadoop/hdfs-site.xml dfs.datanode.use.datanode.hostname true
# YARN
addProperty /etc/hadoop/yarn-site.xml yarn.resourcemanager.bind-host 0.0.0.0
addProperty /etc/hadoop/yarn-site.xml yarn.nodemanager.bind-host 0.0.0.0
addProperty /etc/hadoop/yarn-site.xml yarn.nodemanager.bind-host 0.0.0.0
addProperty /etc/hadoop/yarn-site.xml yarn.timeline-service.bind-host 0.0.0.0
# MAPRED
addProperty /etc/hadoop/mapred-site.xml yarn.nodemanager.bind-host 0.0.0.0
fi
if [ -n "$GANGLIA_HOST" ]; then
mv /etc/hadoop/hadoop-metrics.properties /etc/hadoop/hadoop-metrics.properties.orig
mv /etc/hadoop/hadoop-metrics2.properties /etc/hadoop/hadoop-metrics2.properties.orig
for module in mapred jvm rpc ugi; do
echo "$module.class=org.apache.hadoop.metrics.ganglia.GangliaContext31"
echo "$module.period=10"
echo "$module.servers=$GANGLIA_HOST:8649"
done > /etc/hadoop/hadoop-metrics.properties
for module in namenode datanode resourcemanager nodemanager mrappmaster jobhistoryserver; do
echo "$module.sink.ganglia.class=org.apache.hadoop.metrics2.sink.ganglia.GangliaSink31"
echo "$module.sink.ganglia.period=10"
echo "$module.sink.ganglia.supportsparse=true"
echo "$module.sink.ganglia.slope=jvm.metrics.gcCount=zero,jvm.metrics.memHeapUsedM=both"
echo "$module.sink.ganglia.dmax=jvm.metrics.threadsBlocked=70,jvm.metrics.memHeapUsedM=40"
echo "$module.sink.ganglia.servers=$GANGLIA_HOST:8649"
done > /etc/hadoop/hadoop-metrics2.properties
fi
function wait_for_it()
{
local serviceport=$1
local service=${serviceport%%:*}
local port=${serviceport#*:}
local retry_seconds=5
local max_try=100
let i=1
nc -z $service $port
result=$?
until [ $result -eq 0 ]; do
echo "[$i/$max_try] check for ${service}:${port}..."
echo "[$i/$max_try] ${service}:${port} is not available yet"
if (( $i == $max_try )); then
echo "[$i/$max_try] ${service}:${port} is still not available; giving up after ${max_try} tries. :/"
exit 1
fi
echo "[$i/$max_try] try in ${retry_seconds}s once again ..."
let "i++"
sleep $retry_seconds
nc -z $service $port
result=$?
done
echo "[$i/$max_try] $service:${port} is available."
}
for i in ${SERVICE_PRECONDITION[@]}
do
wait_for_it ${i}
done
exec $@

View File

@@ -0,0 +1,113 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
~ Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
~
~ Licensed under the Apache License, Version 2.0 (the "License");
~ you may not use this file except in compliance with the License.
~ You may obtain a copy of the License at
~
~ http://www.apache.org/licenses/LICENSE-2.0
~
~ Unless required by applicable law or agreed to in writing, software
~ distributed under the License is distributed on an "AS IS" BASIS,
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
~ See the License for the specific language governing permissions and
~ limitations under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>hoodie-hadoop-docker</artifactId>
<groupId>com.uber.hoodie</groupId>
<version>0.4.5-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<packaging>pom</packaging>
<artifactId>hoodie-hadoop-hive-docker</artifactId>
<description>Base Docker Image with Hoodie</description>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<checkstyle.skip>true</checkstyle.skip>
</properties>
<dependencies>
<dependency>
<groupId>com.uber.hoodie</groupId>
<artifactId>hoodie-hadoop-base-docker</artifactId>
<version>${project.version}</version>
<type>pom</type>
<scope>import</scope>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<artifactId>maven-antrun-plugin</artifactId>
<version>1.7</version>
<executions>
<execution>
<phase>package</phase>
<configuration>
<tasks>
<copy file="${project.basedir}/../../../../packaging/hoodie-hadoop-mr-bundle/target/hoodie-hadoop-mr-bundle-${project.version}.jar"
tofile="target/hoodie-hadoop-mr-bundle.jar" />
<copy file="${project.basedir}/../../../../packaging/hoodie-hive-bundle/target/hoodie-hive-bundle-${project.version}.jar"
tofile="target/hoodie-hive-bundle.jar" />
<copy file="${project.basedir}/../../../../packaging/hoodie-spark-bundle/target/hoodie-spark-bundle-${project.version}.jar"
tofile="target/hoodie-spark-bundle.jar" />
<copy file="${project.basedir}/../../../../hoodie-utilities/target/hoodie-utilities-${project.version}.jar"
tofile="target/hoodie-utilities.jar" />
</tasks>
</configuration>
<goals>
<goal>run</goal>
</goals>
</execution>
</executions>
</plugin>
<!-- Build Docker image -->
<plugin>
<groupId>com.spotify</groupId>
<artifactId>dockerfile-maven-plugin</artifactId>
<version>${dockerfile.maven.version}</version>
<executions>
<execution>
<id>tag-latest</id>
<phase>pre-integration-test</phase>
<goals>
<goal>build</goal>
<goal>tag</goal>
<!-- <goal>push</goal> -->
</goals>
<configuration>
<skip>${docker.build.skip}</skip>
<pullNewerImage>false</pullNewerImage>
<repository>varadarb/hudi-hadoop_${docker.hadoop.version}-hive_${docker.hive.version}</repository>
<forceTags>true</forceTags>
<tag>latest</tag>
</configuration>
</execution>
<execution>
<id>tag-version</id>
<phase>pre-integration-test</phase>
<goals>
<goal>build</goal>
<goal>tag</goal>
<!-- <goal>push</goal> -->
</goals>
<configuration>
<skip>${docker.build.skip}</skip>
<pullNewerImage>false</pullNewerImage>
<repository>varadarb/hudi-hadoop_${docker.hadoop.version}-hive_${docker.hive.version}</repository>
<forceTags>true</forceTags>
<tag>${project.version}</tag>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>

View File

@@ -0,0 +1,10 @@
#!/bin/bash
hadoop fs -mkdir /tmp
hadoop fs -mkdir -p /user/hive/warehouse
hadoop fs -chmod g+w /tmp
hadoop fs -chmod g+w /user/hive/warehouse
cd $HIVE_HOME/bin
export AUX_CLASSPATH=file://${HUDI_HADOOP_BUNDLE}
./hiveserver2 --hiveconf hive.server2.enable.doAs=false --hiveconf hive.aux.jars.path=file://${HUDI_HADOOP_BUNDLE}

View File

@@ -0,0 +1,14 @@
ARG HADOOP_VERSION=2.8.4
ARG HADOOP_WEBHDFS_PORT=50070
FROM varadarb/hudi-hadoop_${HADOOP_VERSION}-base:latest
ENV HADOOP_WEBHDFS_PORT ${HADOOP_WEBHDFS_PORT}
ENV HDFS_CONF_dfs_namenode_name_dir=file:///hadoop/dfs/name
RUN mkdir -p /hadoop/dfs/name
VOLUME /hadoop/dfs/name
ADD run_nn.sh /run_nn.sh
RUN chmod a+x /run_nn.sh
CMD ["/run_nn.sh"]

View File

@@ -0,0 +1,89 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
~ Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
~
~ Licensed under the Apache License, Version 2.0 (the "License");
~ you may not use this file except in compliance with the License.
~ You may obtain a copy of the License at
~
~ http://www.apache.org/licenses/LICENSE-2.0
~
~ Unless required by applicable law or agreed to in writing, software
~ distributed under the License is distributed on an "AS IS" BASIS,
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
~ See the License for the specific language governing permissions and
~ limitations under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>hoodie-hadoop-docker</artifactId>
<groupId>com.uber.hoodie</groupId>
<version>0.4.5-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<packaging>pom</packaging>
<artifactId>hoodie-hadoop-namenode-docker</artifactId>
<description>Base Docker Image with Hoodie</description>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<checkstyle.skip>true</checkstyle.skip>
</properties>
<dependencies>
<dependency>
<groupId>com.uber.hoodie</groupId>
<artifactId>hoodie-hadoop-base-docker</artifactId>
<version>${project.version}</version>
<type>pom</type>
<scope>import</scope>
</dependency>
</dependencies>
<build>
<plugins>
<!-- Build Docker image -->
<plugin>
<groupId>com.spotify</groupId>
<artifactId>dockerfile-maven-plugin</artifactId>
<version>${dockerfile.maven.version}</version>
<executions>
<execution>
<id>tag-latest</id>
<phase>pre-integration-test</phase>
<goals>
<goal>build</goal>
<goal>tag</goal>
<!-- <goal>push</goal> -->
</goals>
<configuration>
<skip>${docker.build.skip}</skip>
<pullNewerImage>false</pullNewerImage>
<repository>varadarb/hudi-hadoop_${docker.hadoop.version}-namenode</repository>
<forceTags>true</forceTags>
<tag>latest</tag>
</configuration>
</execution>
<execution>
<id>tag-version</id>
<phase>pre-integration-test</phase>
<goals>
<goal>build</goal>
<goal>tag</goal>
<!-- <goal>push</goal> -->
</goals>
<configuration>
<skip>${docker.build.skip}</skip>
<pullNewerImage>false</pullNewerImage>
<repository>varadarb/hudi-hadoop_${docker.hadoop.version}-namenode</repository>
<forceTags>true</forceTags>
<tag>${project.version}</tag>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>

View File

@@ -0,0 +1,19 @@
#!/bin/bash
namedir=`echo $HDFS_CONF_dfs_namenode_name_dir | perl -pe 's#file://##'`
if [ ! -d $namedir ]; then
echo "Namenode name directory not found: $namedir"
exit 2
fi
if [ -z "$CLUSTER_NAME" ]; then
echo "Cluster name not specified"
exit 2
fi
if [ "`ls -A $namedir`" == "" ]; then
echo "Formatting namenode name directory: $namedir"
$HADOOP_PREFIX/bin/hdfs --config $HADOOP_CONF_DIR namenode -format $CLUSTER_NAME
fi
$HADOOP_PREFIX/bin/hdfs --config $HADOOP_CONF_DIR namenode

View File

@@ -0,0 +1,78 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
~ Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
~
~ Licensed under the Apache License, Version 2.0 (the "License");
~ you may not use this file except in compliance with the License.
~ You may obtain a copy of the License at
~
~ http://www.apache.org/licenses/LICENSE-2.0
~
~ Unless required by applicable law or agreed to in writing, software
~ distributed under the License is distributed on an "AS IS" BASIS,
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
~ See the License for the specific language governing permissions and
~ limitations under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>hoodie</artifactId>
<groupId>com.uber.hoodie</groupId>
<version>0.4.5-SNAPSHOT</version>
<relativePath>../../../pom.xml</relativePath>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>hoodie-hadoop-docker</artifactId>
<packaging>pom</packaging>
<modules>
<module>base</module>
<module>namenode</module>
<module>datanode</module>
<module>historyserver</module>
<module>hive_base</module>
<module>spark_base</module>
<module>sparkmaster</module>
<module>sparkworker</module>
<module>sparkadhoc</module>
</modules>
<dependencies>
<dependency>
<groupId>com.uber.hoodie</groupId>
<artifactId>hoodie-spark-bundle</artifactId>
<version>${project.version}</version>
</dependency>
</dependencies>
<properties>
<skipITs>false</skipITs>
<docker.build.skip>true</docker.build.skip>
<docker.spark.version>2.3.1</docker.spark.version>
<docker.hive.version>2.3.3</docker.hive.version>
<docker.hadoop.version>2.8.4</docker.hadoop.version>
<dockerfile.maven.version>1.4.3</dockerfile.maven.version>
<checkstyle.skip>true</checkstyle.skip>
</properties>
<build>
<extensions>
<extension>
<groupId>com.spotify</groupId>
<artifactId>dockerfile-maven-extension</artifactId>
<version>${dockerfile.maven.version}</version>
</extension>
</extensions>
<plugins>
<plugin>
<groupId>com.spotify</groupId>
<artifactId>dockerfile-maven-plugin</artifactId>
<version>${dockerfile.maven.version}</version>
<configuration>
<skip>true</skip>
</configuration>
</plugin>
</plugins>
</build>
</project>

View File

@@ -0,0 +1,46 @@
ARG HADOOP_VERSION=2.8.4
ARG HIVE_VERSION=2.3.3
FROM varadarb/hudi-hadoop_${HADOOP_VERSION}-hive_${HIVE_VERSION}
ENV ENABLE_INIT_DAEMON true
ENV INIT_DAEMON_BASE_URI http://identifier/init-daemon
ENV INIT_DAEMON_STEP spark_master_init
ARG SPARK_VERSION=2.3.1
ARG SPARK_HADOOP_VERSION=2.7
ENV SPARK_VERSION ${SPARK_VERSION}
ENV HADOOP_VERSION ${SPARK_HADOOP_VERSION}
COPY wait-for-step.sh /
COPY execute-step.sh /
COPY finish-step.sh /
RUN echo "Installing Spark-version (${SPARK_VERSION})" \
&& wget http://apache.mirror.iphh.net/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz \
&& tar -xvzf spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz \
&& mv spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION} /opt/spark \
&& rm spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz \
&& cd /
#Give permission to execute scripts
RUN chmod +x /wait-for-step.sh && chmod +x /execute-step.sh && chmod +x /finish-step.sh
# Fix the value of PYTHONHASHSEED
# Note: this is needed when you use Python 3.3 or greater
ENV PYTHONHASHSEED 1
ENV SPARK_HOME /opt/spark
ENV SPARK_INSTALL ${SPARK_HOME}
ENV SPARK_CONF_DIR ${SPARK_HOME}/conf
ENV PATH $SPARK_INSTALL/bin:$PATH
ENV SPARK_DRIVER_PORT 5001
ENV SPARK_UI_PORT 5002
ENV SPARK_BLOCKMGR_PORT 5003
EXPOSE $SPARK_DRIVER_PORT $SPARK_UI_PORT $SPARK_BLOCKMGR_PORT
# Without this spark-shell fails - Download if it is not already there in $SPARK_INSTALL
RUN wget -nc -q -O "${SPARK_INSTALL}/jars/jersey-bundle-1.19.4.jar" "http://repo1.maven.org/maven2/com/sun/jersey/jersey-bundle/1.19.4/jersey-bundle-1.19.4.jar"

View File

@@ -0,0 +1,14 @@
#!/bin/bash
if [ $ENABLE_INIT_DAEMON = "true" ]
then
echo "Execute step ${INIT_DAEMON_STEP} in pipeline"
while true; do
sleep 5
echo -n '.'
string=$(curl -sL -w "%{http_code}" -X PUT $INIT_DAEMON_BASE_URI/execute?step=$INIT_DAEMON_STEP -o /dev/null)
[ "$string" = "204" ] && break
done
echo "Notified execution of step ${INIT_DAEMON_STEP}"
fi

View File

@@ -0,0 +1,16 @@
#!/bin/bash
if [ $ENABLE_INIT_DAEMON = "true" ]
then
echo "Finish step ${INIT_DAEMON_STEP} in pipeline"
while true; do
sleep 5
echo -n '.'
string=$(curl -sL -w "%{http_code}" -X PUT $INIT_DAEMON_BASE_URI/finish?step=$INIT_DAEMON_STEP -o /dev/null)
[ "$string" = "204" ] && break
done
echo "Notified finish of step ${INIT_DAEMON_STEP}"
fi

View File

@@ -0,0 +1,89 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
~ Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
~
~ Licensed under the Apache License, Version 2.0 (the "License");
~ you may not use this file except in compliance with the License.
~ You may obtain a copy of the License at
~
~ http://www.apache.org/licenses/LICENSE-2.0
~
~ Unless required by applicable law or agreed to in writing, software
~ distributed under the License is distributed on an "AS IS" BASIS,
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
~ See the License for the specific language governing permissions and
~ limitations under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>hoodie-hadoop-docker</artifactId>
<groupId>com.uber.hoodie</groupId>
<version>0.4.5-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<packaging>pom</packaging>
<artifactId>hoodie-hadoop-sparkbase-docker</artifactId>
<description>Base Docker Image with Hoodie</description>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<checkstyle.skip>true</checkstyle.skip>
</properties>
<dependencies>
<dependency>
<groupId>com.uber.hoodie</groupId>
<artifactId>hoodie-hadoop-hive-docker</artifactId>
<version>${project.version}</version>
<type>pom</type>
<scope>import</scope>
</dependency>
</dependencies>
<build>
<plugins>
<!-- Build Docker image -->
<plugin>
<groupId>com.spotify</groupId>
<artifactId>dockerfile-maven-plugin</artifactId>
<version>${dockerfile.maven.version}</version>
<executions>
<execution>
<id>tag-latest</id>
<phase>pre-integration-test</phase>
<goals>
<goal>build</goal>
<goal>tag</goal>
<!-- <goal>push</goal> -->
</goals>
<configuration>
<skip>${docker.build.skip}</skip>
<pullNewerImage>false</pullNewerImage>
<repository>varadarb/hudi-hadoop_${docker.hadoop.version}-hive_${docker.hive.version}-sparkbase_${docker.spark.version}</repository>
<forceTags>true</forceTags>
<tag>latest</tag>
</configuration>
</execution>
<execution>
<id>tag-version</id>
<phase>pre-integration-test</phase>
<goals>
<goal>build</goal>
<goal>tag</goal>
<!-- <goal>push</goal> -->
</goals>
<configuration>
<skip>${docker.build.skip}</skip>
<pullNewerImage>false</pullNewerImage>
<repository>varadarb/hudi-hadoop_${docker.hadoop.version}-hive_${docker.hive.version}-sparkbase_${docker.spark.version}</repository>
<forceTags>true</forceTags>
<tag>${project.version}</tag>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>

View File

@@ -0,0 +1,13 @@
#!/bin/bash
if [ $ENABLE_INIT_DAEMON = "true" ]
then
echo "Validating if step ${INIT_DAEMON_STEP} can start in pipeline"
while true; do
sleep 5
echo -n '.'
string=$(curl -s $INIT_DAEMON_BASE_URI/canStart?step=$INIT_DAEMON_STEP)
[ "$string" = "true" ] && break
done
echo "Can start step ${INIT_DAEMON_STEP}"
fi

View File

@@ -0,0 +1,12 @@
ARG HADOOP_VERSION=2.8.4
ARG HIVE_VERSION=2.3.3
ARG SPARK_VERSION=2.3.1
FROM varadarb/hudi-hadoop_${HADOOP_VERSION}-hive_${HIVE_VERSION}-sparkbase_${SPARK_VERSION}
COPY adhoc.sh /opt/spark
ENV SPARK_WORKER_WEBUI_PORT 8081
ENV SPARK_WORKER_LOG /spark/logs
ENV SPARK_MASTER "spark://spark-master:7077"
CMD ["/bin/bash", "/opt/spark/adhoc.sh"]

View File

@@ -0,0 +1,13 @@
#!/bin/bash
. "/spark/sbin/spark-config.sh"
. "/spark/bin/load-spark-env.sh"
export SPARK_HOME=/opt/spark
date
echo "SPARK HOME is : $SPARK_HOME"
tail -f /dev/null

View File

@@ -0,0 +1,89 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
~ Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
~
~ Licensed under the Apache License, Version 2.0 (the "License");
~ you may not use this file except in compliance with the License.
~ You may obtain a copy of the License at
~
~ http://www.apache.org/licenses/LICENSE-2.0
~
~ Unless required by applicable law or agreed to in writing, software
~ distributed under the License is distributed on an "AS IS" BASIS,
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
~ See the License for the specific language governing permissions and
~ limitations under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>hoodie-hadoop-docker</artifactId>
<groupId>com.uber.hoodie</groupId>
<version>0.4.5-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<packaging>pom</packaging>
<artifactId>hoodie-hadoop-sparkadhoc-docker</artifactId>
<description>Base Docker Image with Hoodie</description>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<checkstyle.skip>true</checkstyle.skip>
</properties>
<dependencies>
<dependency>
<groupId>com.uber.hoodie</groupId>
<artifactId>hoodie-hadoop-sparkbase-docker</artifactId>
<version>${project.version}</version>
<type>pom</type>
<scope>import</scope>
</dependency>
</dependencies>
<build>
<plugins>
<!-- Build Docker image -->
<plugin>
<groupId>com.spotify</groupId>
<artifactId>dockerfile-maven-plugin</artifactId>
<version>${dockerfile.maven.version}</version>
<executions>
<execution>
<id>tag-latest</id>
<phase>pre-integration-test</phase>
<goals>
<goal>build</goal>
<goal>tag</goal>
<!-- <goal>push</goal> -->
</goals>
<configuration>
<skip>${docker.build.skip}</skip>
<pullNewerImage>false</pullNewerImage>
<repository>varadarb/hudi-hadoop_${docker.hadoop.version}-hive_${docker.hive.version}-sparkadhoc_${docker.spark.version}</repository>
<forceTags>true</forceTags>
<tag>latest</tag>
</configuration>
</execution>
<execution>
<id>tag-version</id>
<phase>pre-integration-test</phase>
<goals>
<goal>build</goal>
<goal>tag</goal>
<!-- <goal>push</goal> -->
</goals>
<configuration>
<skip>${docker.build.skip}</skip>
<pullNewerImage>false</pullNewerImage>
<repository>varadarb/hudi-hadoop_${docker.hadoop.version}-hive_${docker.hive.version}-sparkadhoc_${docker.spark.version}</repository>
<forceTags>true</forceTags>
<tag>${project.version}</tag>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>

View File

@@ -0,0 +1,14 @@
ARG HADOOP_VERSION=2.8.4
ARG HIVE_VERSION=2.3.3
ARG SPARK_VERSION=2.3.1
FROM varadarb/hudi-hadoop_${HADOOP_VERSION}-hive_${HIVE_VERSION}-sparkbase_${SPARK_VERSION}
COPY master.sh /opt/spark
ENV SPARK_MASTER_PORT 7077
ENV SPARK_MASTER_WEBUI_PORT 8080
ENV SPARK_MASTER_LOG /opt/spark/logs
EXPOSE 8080 7077 6066
CMD ["/bin/bash", "/opt/spark/master.sh"]

View File

@@ -0,0 +1,16 @@
#!/bin/bash
export SPARK_MASTER_HOST=`hostname`
. "/opt/spark/sbin/spark-config.sh"
. "/opt/spark/bin/load-spark-env.sh"
mkdir -p $SPARK_MASTER_LOG
export SPARK_HOME=/opt/spark
ln -sf /dev/stdout $SPARK_MASTER_LOG/spark-master.out
cd /opt/spark/bin && /opt/spark/sbin/../bin/spark-class org.apache.spark.deploy.master.Master \
--ip $SPARK_MASTER_HOST --port $SPARK_MASTER_PORT --webui-port $SPARK_MASTER_WEBUI_PORT >> $SPARK_MASTER_LOG/spark-master.out

View File

@@ -0,0 +1,89 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
~ Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
~
~ Licensed under the Apache License, Version 2.0 (the "License");
~ you may not use this file except in compliance with the License.
~ You may obtain a copy of the License at
~
~ http://www.apache.org/licenses/LICENSE-2.0
~
~ Unless required by applicable law or agreed to in writing, software
~ distributed under the License is distributed on an "AS IS" BASIS,
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
~ See the License for the specific language governing permissions and
~ limitations under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>hoodie-hadoop-docker</artifactId>
<groupId>com.uber.hoodie</groupId>
<version>0.4.5-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<packaging>pom</packaging>
<artifactId>hoodie-hadoop-sparkmaster-docker</artifactId>
<description>Base Docker Image with Hoodie</description>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<checkstyle.skip>true</checkstyle.skip>
</properties>
<dependencies>
<dependency>
<groupId>com.uber.hoodie</groupId>
<artifactId>hoodie-hadoop-sparkbase-docker</artifactId>
<version>${project.version}</version>
<type>pom</type>
<scope>import</scope>
</dependency>
</dependencies>
<build>
<plugins>
<!-- Build Docker image -->
<plugin>
<groupId>com.spotify</groupId>
<artifactId>dockerfile-maven-plugin</artifactId>
<version>${dockerfile.maven.version}</version>
<executions>
<execution>
<id>tag-latest</id>
<phase>pre-integration-test</phase>
<goals>
<goal>build</goal>
<goal>tag</goal>
<!-- <goal>push</goal> -->
</goals>
<configuration>
<skip>${docker.build.skip}</skip>
<pullNewerImage>false</pullNewerImage>
<repository>varadarb/hudi-hadoop_${docker.hadoop.version}-hive_${docker.hive.version}-sparkmaster_${docker.spark.version}</repository>
<forceTags>true</forceTags>
<tag>latest</tag>
</configuration>
</execution>
<execution>
<id>tag-version</id>
<phase>pre-integration-test</phase>
<goals>
<goal>build</goal>
<goal>tag</goal>
<!-- <goal>push</goal> -->
</goals>
<configuration>
<skip>${docker.build.skip}</skip>
<pullNewerImage>false</pullNewerImage>
<repository>varadarb/hudi-hadoop_${docker.hadoop.version}-hive_${docker.hive.version}-sparkmaster_${docker.spark.version}</repository>
<forceTags>true</forceTags>
<tag>${project.version}</tag>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>

View File

@@ -0,0 +1,14 @@
ARG HADOOP_VERSION=2.8.4
ARG HIVE_VERSION=2.3.3
ARG SPARK_VERSION=2.3.1
FROM varadarb/hudi-hadoop_${HADOOP_VERSION}-hive_${HIVE_VERSION}-sparkbase_${SPARK_VERSION}
COPY worker.sh /opt/spark
ENV SPARK_WORKER_WEBUI_PORT 8081
ENV SPARK_WORKER_LOG /spark/logs
ENV SPARK_MASTER "spark://spark-master:7077"
EXPOSE 8081
CMD ["/bin/bash", "/opt/spark/worker.sh"]

View File

@@ -0,0 +1,89 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
~ Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
~
~ Licensed under the Apache License, Version 2.0 (the "License");
~ you may not use this file except in compliance with the License.
~ You may obtain a copy of the License at
~
~ http://www.apache.org/licenses/LICENSE-2.0
~
~ Unless required by applicable law or agreed to in writing, software
~ distributed under the License is distributed on an "AS IS" BASIS,
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
~ See the License for the specific language governing permissions and
~ limitations under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>hoodie-hadoop-docker</artifactId>
<groupId>com.uber.hoodie</groupId>
<version>0.4.5-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<packaging>pom</packaging>
<artifactId>hoodie-hadoop-sparkworker-docker</artifactId>
<description>Base Docker Image with Hoodie</description>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<checkstyle.skip>true</checkstyle.skip>
</properties>
<dependencies>
<dependency>
<groupId>com.uber.hoodie</groupId>
<artifactId>hoodie-hadoop-sparkbase-docker</artifactId>
<version>${project.version}</version>
<type>pom</type>
<scope>import</scope>
</dependency>
</dependencies>
<build>
<plugins>
<!-- Build Docker image -->
<plugin>
<groupId>com.spotify</groupId>
<artifactId>dockerfile-maven-plugin</artifactId>
<version>${dockerfile.maven.version}</version>
<executions>
<execution>
<id>tag-latest</id>
<phase>pre-integration-test</phase>
<goals>
<goal>build</goal>
<goal>tag</goal>
<!--<goal>push</goal> -->
</goals>
<configuration>
<skip>${docker.build.skip}</skip>
<pullNewerImage>false</pullNewerImage>
<repository>varadarb/hudi-hadoop_${docker.hadoop.version}-hive_${docker.hive.version}-sparkworker_${docker.spark.version}</repository>
<forceTags>true</forceTags>
<tag>latest</tag>
</configuration>
</execution>
<execution>
<id>tag-version</id>
<phase>pre-integration-test</phase>
<goals>
<goal>build</goal>
<goal>tag</goal>
<!--<goal>push</goal> -->
</goals>
<configuration>
<skip>${docker.build.skip}</skip>
<pullNewerImage>false</pullNewerImage>
<repository>varadarb/hudi-hadoop_${docker.hadoop.version}-hive_${docker.hive.version}-sparkworker_${docker.spark.version}</repository>
<forceTags>true</forceTags>
<tag>${project.version}</tag>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>

View File

@@ -0,0 +1,16 @@
#!/bin/bash
. "/spark/sbin/spark-config.sh"
. "/spark/bin/load-spark-env.sh"
mkdir -p $SPARK_WORKER_LOG
export SPARK_HOME=/opt/spark
ln -sf /dev/stdout $SPARK_WORKER_LOG/spark-worker.out
date
echo "SPARK HOME is : $SPARK_HOME"
/opt/spark/sbin/../bin/spark-class org.apache.spark.deploy.worker.Worker \
--webui-port $SPARK_WORKER_WEBUI_PORT $SPARK_MASTER >> $SPARK_WORKER_LOG/spark-worker.out

16
docker/setup_demo.sh Executable file
View File

@@ -0,0 +1,16 @@
# Create host mount directory and copy
mkdir -p /tmp/hadoop_name
mkdir -p /tmp/hadoop_data
WS_ROOT=`dirname $PWD`
# restart cluster
HUDI_WS=${WS_ROOT} docker-compose -f compose/docker-compose_hadoop284_hive233_spark231.yml down
HUDI_WS=${WS_ROOT} docker-compose -f compose/docker-compose_hadoop284_hive233_spark231.yml pull
rm -rf /tmp/hadoop_data/*
rm -rf /tmp/hadoop_name/*
sleep 5
HUDI_WS=${WS_ROOT} docker-compose -f compose/docker-compose_hadoop284_hive233_spark231.yml up -d
sleep 15
docker exec -it adhoc-1 /bin/bash /var/hoodie/ws/docker/demo/setup_demo_container.sh
docker exec -it adhoc-2 /bin/bash /var/hoodie/ws/docker/demo/setup_demo_container.sh

View File

@@ -14,11 +14,11 @@ Check out code and pull it into Intellij as a normal maven project.
Normally build the maven project, from command line Normally build the maven project, from command line
``` ```
$ mvn clean install -DskipTests $ mvn clean install -DskipTests -DskipITs
To work with older version of Hive (pre Hive-1.2.1), use To work with older version of Hive (pre Hive-1.2.1), use
$ mvn clean install -DskipTests -Dhive11 $ mvn clean install -DskipTests -DskipITs -Dhive11
``` ```
@@ -293,6 +293,947 @@ hive>
{% include note.html content="This is only supported for Read-optimized tables for now." %} {% include note.html content="This is only supported for Read-optimized tables for now." %}
## A Demo using docker containers
Lets use a real world example to see how hudi works end to end. For this purpose, a self contained
data infrastructure is brought up in a local docker cluster within your computer.
The steps assume you are using Mac laptop
### Prerequisites
* Docker Setup : For Mac, Please follow the steps as defined in [https://docs.docker.com/v17.12/docker-for-mac/install/]. For running Spark-SQL queries, please ensure atleast 6 GB and 4 CPUs are allocated to Docker (See Docker -> Preferences -> Advanced). Otherwise, spark-SQL queries could be killed because of memory issues.
* kafkacat : A command-line utility to publish/consume from kafka topics. Use `brew install kafkacat` to install kafkacat
* /etc/hosts : The demo references many services running in container by the hostname. Add the following settings to /etc/hosts
```
127.0.0.1 adhoc-1
127.0.0.1 adhoc-2
127.0.0.1 namenode
127.0.0.1 datanode1
127.0.0.1 hiveserver
127.0.0.1 hivemetastore
127.0.0.1 kafkabroker
127.0.0.1 sparkmaster
127.0.0.1 zookeeper
```
### Setting up Docker Cluster
#### Build Hoodie
The first step is to build hoodie
```
cd <HUDI_WORKSPACE>
mvn package -DskipTests
```
#### Bringing up Demo Cluster
The next step is to run the docker compose script and setup configs for bringing up the cluster.
This should pull the docker images from docker hub and setup docker cluster.
```
cd docker
./setup_demo.sh
....
....
....
Stopping spark-worker-1 ... done
Stopping hiveserver ... done
Stopping hivemetastore ... done
Stopping historyserver ... done
.......
......
Creating network "hudi_demo" with the default driver
Creating hive-metastore-postgresql ... done
Creating namenode ... done
Creating zookeeper ... done
Creating kafkabroker ... done
Creating hivemetastore ... done
Creating historyserver ... done
Creating hiveserver ... done
Creating datanode1 ... done
Creating sparkmaster ... done
Creating adhoc-1 ... done
Creating adhoc-2 ... done
Creating spark-worker-1 ... done
Copying spark default config and setting up configs
Copying spark default config and setting up configs
Copying spark default config and setting up configs
varadarb-C02SG7Q3G8WP:docker varadarb$ docker ps
```
At this point, the docker cluster will be up and running. The demo cluster brings up the following services
* HDFS Services (NameNode, DataNode)
* Spark Master and Worker
* Hive Services (Metastore, HiveServer2 along with PostgresDB)
* Kafka Broker and a Zookeeper Node (Kakfa will be used as upstream source for the demo)
* Adhoc containers to run Hudi/Hive CLI commands
### Demo
Stock Tracker data will be used to showcase both different Hudi Views and the effects of Compaction.
Take a look at the directory `docker/demo/data`. There are 2 batches of stock data - each at 1 minute granularity.
The first batch contains stocker tracker data for some stock symbols during the first hour of trading window
(9:30 a.m to 10:30 a.m). The second batch contains tracker data for next 30 mins (10:30 - 11 a.m). Hudi will
be used to ingest these batches to a dataset which will contain the latest stock tracker data at hour level granularity.
The batches are windowed intentionally so that the second batch contains updates to some of the rows in the first batch.
#### Step 1 : Publish the first batch to Kafka
Upload the first batch to Kafka topic 'stock ticks'
```
cat docker/demo/data/batch_1.json | kafkacat -b kafkabroker -t stock_ticks -P
To check if the new topic shows up, use
kafkacat -b kafkabroker -L -J | jq .
{
"originating_broker": {
"id": 1001,
"name": "kafkabroker:9092/1001"
},
"query": {
"topic": "*"
},
"brokers": [
{
"id": 1001,
"name": "kafkabroker:9092"
}
],
"topics": [
{
"topic": "stock_ticks",
"partitions": [
{
"partition": 0,
"leader": 1001,
"replicas": [
{
"id": 1001
}
],
"isrs": [
{
"id": 1001
}
]
}
]
}
]
}
```
#### Step 2: Incrementally ingest data from Kafka topic
Hudi comes with a tool named DeltaStreamer. This tool can connect to variety of data sources (including Kafka) to
pull changes and apply to Hudi dataset using upsert/insert primitives. Here, we will use the tool to download
json data from kafka topic and ingest to both COW and MOR tables we initialized in the previous step. This tool
automatically initializes the datasets in the file-system if they do not exist yet.
```
docker exec -it adhoc-2 /bin/bash
# Run the following spark-submit command to execute the delta-streamer and ingest to stock_ticks_cow dataset in HDFS
spark-submit --class com.uber.hoodie.utilities.deltastreamer.HoodieDeltaStreamer $HUDI_UTILITIES_BUNDLE --storage-type COPY_ON_WRITE --source-class com.uber.hoodie.utilities.sources.JsonKafkaSource --source-ordering-field ts --target-base-path /user/hive/warehouse/stock_ticks_cow --target-table stock_ticks_cow --props /var/demo/config/kafka-source.properties
....
....
2018-09-24 22:20:00 INFO OutputCommitCoordinator$OutputCommitCoordinatorEndpoint:54 - OutputCommitCoordinator stopped!
2018-09-24 22:20:00 INFO SparkContext:54 - Successfully stopped SparkContext
# Run the following spark-submit command to execute the delta-streamer and ingest to stock_ticks_mor dataset in HDFS
spark-submit --class com.uber.hoodie.utilities.deltastreamer.HoodieDeltaStreamer $HUDI_UTILITIES_BUNDLE --storage-type MERGE_ON_READ --source-class com.uber.hoodie.utilities.sources.JsonKafkaSource --source-ordering-field ts --target-base-path /user/hive/warehouse/stock_ticks_mor --target-table stock_ticks_mor --props /var/demo/config/kafka-source.properties
....
2018-09-24 22:22:01 INFO OutputCommitCoordinator$OutputCommitCoordinatorEndpoint:54 - OutputCommitCoordinator stopped!
2018-09-24 22:22:01 INFO SparkContext:54 - Successfully stopped SparkContext
....
# As part of the setup (Look at setup_demo.sh), the configs needed for DeltaStreamer is uploaded to HDFS. The configs
# contain mostly Kafa connectivity settings, the avro-schema to be used for ingesting along with key and partitioning fields.
exit
```
You can use HDFS web-browser to look at the datasets
`http://namenode:50070/explorer.html#/user/hive/warehouse/stock_ticks_cow`.
You can explore the new partition folder created in the dataset along with a "deltacommit"
file under .hoodie which signals a successful commit.
There will be a similar setup when you browse the MOR dataset
`http://namenode:50070/explorer.html#/user/hive/warehouse/stock_ticks_mor`
#### Step 3: Sync with Hive
At this step, the datasets are available in HDFS. We need to sync with Hive to create new Hive tables and add partitions
inorder to run Hive queries against those datasets.
```
docker exec -it adhoc-2 /bin/bash
# THis command takes in HIveServer URL and COW Hudi Dataset location in HDFS and sync the HDFS state to Hive
/var/hoodie/ws/hoodie-hive/run_sync_tool.sh --jdbc-url jdbc:hive2://hiveserver:10000 --user hive --pass hive --partitioned-by dt --base-path /user/hive/warehouse/stock_ticks_cow --database default --table stock_ticks_cow
.....
2018-09-24 22:22:45,568 INFO [main] hive.HiveSyncTool (HiveSyncTool.java:syncHoodieTable(112)) - Sync complete for stock_ticks_cow
.....
# Now run hive-sync for the second data-set in HDFS using Merge-On-Read (MOR storage)
/var/hoodie/ws/hoodie-hive/run_sync_tool.sh --jdbc-url jdbc:hive2://hiveserver:10000 --user hive --pass hive --partitioned-by dt --base-path /user/hive/warehouse/stock_ticks_mor --database default --table stock_ticks_mor
...
2018-09-24 22:23:09,171 INFO [main] hive.HiveSyncTool (HiveSyncTool.java:syncHoodieTable(112)) - Sync complete for stock_ticks_mor
...
2018-09-24 22:23:09,559 INFO [main] hive.HiveSyncTool (HiveSyncTool.java:syncHoodieTable(112)) - Sync complete for stock_ticks_mor_rt
....
exit
```
After executing the above command, you will notice
1. A hive table named `stock_ticks_cow` created which provides Read-Optimized view for the Copy On Write dataset.
2. Two new tables `stock_ticks_mor` and `stock_ticks_mor_rt` created for the Merge On Read dataset. The former
provides the ReadOptimized view for the Hudi dataset and the later provides the realtime-view for the dataset.
#### Step 4 (a): Run Hive Queries
Run a hive query to find the latest timestamp ingested for stock symbol 'GOOG'. You will notice that both read-optimized
(for both COW and MOR dataset)and realtime views (for MOR dataset)give the same value "10:29 a.m" as Hudi create a
parquet file for the first batch of data.
```
docker exec -it adhoc-2 /bin/bash
beeline -u jdbc:hive2://hiveserver:10000 --hiveconf hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat --hiveconf hive.stats.autogather=false
# List Tables
0: jdbc:hive2://hiveserver:10000> show tables;
+---------------------+--+
| tab_name |
+---------------------+--+
| stock_ticks_cow |
| stock_ticks_mor |
| stock_ticks_mor_rt |
+---------------------+--+
2 rows selected (0.801 seconds)
0: jdbc:hive2://hiveserver:10000>
# Look at partitions that were added
0: jdbc:hive2://hiveserver:10000> show partitions stock_ticks_mor_rt;
+----------------+--+
| partition |
+----------------+--+
| dt=2018-08-31 |
+----------------+--+
1 row selected (0.24 seconds)
# COPY-ON-WRITE Queries:
=========================
0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) from stock_ticks_cow group by symbol HAVING symbol = 'GOOG';
+---------+----------------------+--+
| symbol | _c1 |
+---------+----------------------+--+
| GOOG | 2018-08-31 10:29:00 |
+---------+----------------------+--+
Now, run a projection query:
0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_cow where symbol = 'GOOG';
+----------------------+---------+----------------------+---------+------------+-----------+--+
| _hoodie_commit_time | symbol | ts | volume | open | close |
+----------------------+---------+----------------------+---------+------------+-----------+--+
| 20180924221953 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 |
| 20180924221953 | GOOG | 2018-08-31 10:29:00 | 3391 | 1230.1899 | 1230.085 |
+----------------------+---------+----------------------+---------+------------+-----------+--+
# Merge-On-Read Queries:
==========================
Lets run similar queries against M-O-R dataset. Lets look at both
ReadOptimized and Realtime views supported by M-O-R dataset
# Run against ReadOptimized View. Notice that the latest timestamp is 10:29
0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) from stock_ticks_mor group by symbol HAVING symbol = 'GOOG';
WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases.
+---------+----------------------+--+
| symbol | _c1 |
+---------+----------------------+--+
| GOOG | 2018-08-31 10:29:00 |
+---------+----------------------+--+
1 row selected (6.326 seconds)
# Run against Realtime View. Notice that the latest timestamp is again 10:29
0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) from stock_ticks_mor_rt group by symbol HAVING symbol = 'GOOG';
WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases.
+---------+----------------------+--+
| symbol | _c1 |
+---------+----------------------+--+
| GOOG | 2018-08-31 10:29:00 |
+---------+----------------------+--+
1 row selected (1.606 seconds)
# Run projection query against Read Optimized and Realtime tables
0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor where symbol = 'GOOG';
+----------------------+---------+----------------------+---------+------------+-----------+--+
| _hoodie_commit_time | symbol | ts | volume | open | close |
+----------------------+---------+----------------------+---------+------------+-----------+--+
| 20180924222155 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 |
| 20180924222155 | GOOG | 2018-08-31 10:29:00 | 3391 | 1230.1899 | 1230.085 |
+----------------------+---------+----------------------+---------+------------+-----------+--+
0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor_rt where symbol = 'GOOG';
+----------------------+---------+----------------------+---------+------------+-----------+--+
| _hoodie_commit_time | symbol | ts | volume | open | close |
+----------------------+---------+----------------------+---------+------------+-----------+--+
| 20180924222155 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 |
| 20180924222155 | GOOG | 2018-08-31 10:29:00 | 3391 | 1230.1899 | 1230.085 |
+----------------------+---------+----------------------+---------+------------+-----------+--+
exit
exit
```
#### Step 4 (b): Run Spark-SQL Queries
Hudi support Spark as query processor just like Hive. Here are the same hive queries
running in spark-sql
```
docker exec -it adhoc-1 /bin/bash
$SPARK_INSTALL/bin/spark-shell --jars $HUDI_SPARK_BUNDLE --master local[2] --driver-class-path $HADOOP_CONF_DIR --conf spark.sql.hive.convertMetastoreParquet=false --deploy-mode client --driver-memory 1G --executor-memory 3G --num-executors 1 --packages com.databricks:spark-avro_2.11:4.0.0
...
Welcome to
____ __
/ __/__ ___ _____/ /__
_\ \/ _ \/ _ `/ __/ '_/
/___/ .__/\_,_/_/ /_/\_\ version 2.3.1
/_/
Using Scala version 2.11.8 (Java HotSpot(TM) 64-Bit Server VM, Java 1.8.0_181)
Type in expressions to have them evaluated.
Type :help for more information.
scala>
scala> spark.sql("show tables").show(100, false)
+--------+------------------+-----------+
|database|tableName |isTemporary|
+--------+------------------+-----------+
|default |stock_ticks_cow |false |
|default |stock_ticks_mor |false |
|default |stock_ticks_mor_rt|false |
+--------+------------------+-----------+
# Copy-On-Write Table
## Run max timestamp query against COW table
scala> spark.sql("select symbol, max(ts) from stock_ticks_cow group by symbol HAVING symbol = 'GOOG'").show(100, false)
[Stage 0:> (0 + 1) / 1]SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.
+------+-------------------+
|symbol|max(ts) |
+------+-------------------+
|GOOG |2018-08-31 10:29:00|
+------+-------------------+
## Projection Query
scala> spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_cow where symbol = 'GOOG'").show(100, false)
+-------------------+------+-------------------+------+---------+--------+
|_hoodie_commit_time|symbol|ts |volume|open |close |
+-------------------+------+-------------------+------+---------+--------+
|20180924221953 |GOOG |2018-08-31 09:59:00|6330 |1230.5 |1230.02 |
|20180924221953 |GOOG |2018-08-31 10:29:00|3391 |1230.1899|1230.085|
+-------------------+------+-------------------+------+---------+--------+
# Merge-On-Read Queries:
==========================
Lets run similar queries against M-O-R dataset. Lets look at both
ReadOptimized and Realtime views supported by M-O-R dataset
# Run against ReadOptimized View. Notice that the latest timestamp is 10:29
scala> spark.sql("select symbol, max(ts) from stock_ticks_mor group by symbol HAVING symbol = 'GOOG'").show(100, false)
+------+-------------------+
|symbol|max(ts) |
+------+-------------------+
|GOOG |2018-08-31 10:29:00|
+------+-------------------+
# Run against Realtime View. Notice that the latest timestamp is again 10:29
scala> spark.sql("select symbol, max(ts) from stock_ticks_mor_rt group by symbol HAVING symbol = 'GOOG'").show(100, false)
+------+-------------------+
|symbol|max(ts) |
+------+-------------------+
|GOOG |2018-08-31 10:29:00|
+------+-------------------+
# Run projection query against Read Optimized and Realtime tables
scala> spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor where symbol = 'GOOG'").show(100, false)
+-------------------+------+-------------------+------+---------+--------+
|_hoodie_commit_time|symbol|ts |volume|open |close |
+-------------------+------+-------------------+------+---------+--------+
|20180924222155 |GOOG |2018-08-31 09:59:00|6330 |1230.5 |1230.02 |
|20180924222155 |GOOG |2018-08-31 10:29:00|3391 |1230.1899|1230.085|
+-------------------+------+-------------------+------+---------+--------+
scala> spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor_rt where symbol = 'GOOG'").show(100, false)
+-------------------+------+-------------------+------+---------+--------+
|_hoodie_commit_time|symbol|ts |volume|open |close |
+-------------------+------+-------------------+------+---------+--------+
|20180924222155 |GOOG |2018-08-31 09:59:00|6330 |1230.5 |1230.02 |
|20180924222155 |GOOG |2018-08-31 10:29:00|3391 |1230.1899|1230.085|
+-------------------+------+-------------------+------+---------+--------+
```
#### Step 5: Upload second batch to Kafka and run DeltaStreamer to ingest
Upload the second batch of data and ingest this batch using delta-streamer. As this batch does not bring in any new
partitions, there is no need to run hive-sync
```
cat docker/demo/data/batch_2.json | kafkacat -b kafkabroker -t stock_ticks -P
# Within Docker container, run the ingestion command
docker exec -it adhoc-2 /bin/bash
# Run the following spark-submit command to execute the delta-streamer and ingest to stock_ticks_cow dataset in HDFS
spark-submit --class com.uber.hoodie.utilities.deltastreamer.HoodieDeltaStreamer $HUDI_UTILITIES_BUNDLE --storage-type COPY_ON_WRITE --source-class com.uber.hoodie.utilities.sources.JsonKafkaSource --source-ordering-field ts --target-base-path /user/hive/warehouse/stock_ticks_cow --target-table stock_ticks_cow --props /var/demo/config/kafka-source.properties
# Run the following spark-submit command to execute the delta-streamer and ingest to stock_ticks_mor dataset in HDFS
spark-submit --class com.uber.hoodie.utilities.deltastreamer.HoodieDeltaStreamer $HUDI_UTILITIES_BUNDLE --storage-type MERGE_ON_READ --source-class com.uber.hoodie.utilities.sources.JsonKafkaSource --source-ordering-field ts --target-base-path /user/hive/warehouse/stock_ticks_mor --target-table stock_ticks_mor --props /var/demo/config/kafka-source.properties
exit
```
With Copy-On-Write table, the second ingestion by DeltaStreamer resulted in a new version of Parquet file getting created.
See `http://namenode:50070/explorer.html#/user/hive/warehouse/stock_ticks_cow/2018/08/31`
With Merge-On-Read table, the second ingestion merely appended the batch to an unmerged delta (log) file.
Take a look at the HDFS filesystem to get an idea: `http://namenode:50070/explorer.html#/user/hive/warehouse/stock_ticks_mor/2018/08/31`
#### Step 6(a): Run Hive Queries
With Copy-On-Write table, the read-optimized view immediately sees the changes as part of second batch once the batch
got committed as each ingestion creates newer versions of parquet files.
With Merge-On-Read table, the second ingestion merely appended the batch to an unmerged delta (log) file.
This is the time, when ReadOptimized and Realtime views will provide different results. ReadOptimized view will still
return "10:29 am" as it will only read from the Parquet file. Realtime View will do on-the-fly merge and return
latest committed data which is "10:59 a.m".
```
docker exec -it adhoc-2 /bin/bash
beeline -u jdbc:hive2://hiveserver:10000 --hiveconf hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat --hiveconf hive.stats.autogather=false
# Copy On Write Table:
0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) from stock_ticks_cow group by symbol HAVING symbol = 'GOOG';
WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases.
+---------+----------------------+--+
| symbol | _c1 |
+---------+----------------------+--+
| GOOG | 2018-08-31 10:59:00 |
+---------+----------------------+--+
1 row selected (1.932 seconds)
0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_cow where symbol = 'GOOG';
+----------------------+---------+----------------------+---------+------------+-----------+--+
| _hoodie_commit_time | symbol | ts | volume | open | close |
+----------------------+---------+----------------------+---------+------------+-----------+--+
| 20180924221953 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 |
| 20180924224524 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 |
+----------------------+---------+----------------------+---------+------------+-----------+--+
As you can notice, the above queries now reflect the changes that came as part of ingesting second batch.
# Merge On Read Table:
# Read Optimized View
0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) from stock_ticks_mor group by symbol HAVING symbol = 'GOOG';
WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases.
+---------+----------------------+--+
| symbol | _c1 |
+---------+----------------------+--+
| GOOG | 2018-08-31 10:29:00 |
+---------+----------------------+--+
1 row selected (1.6 seconds)
0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor where symbol = 'GOOG';
+----------------------+---------+----------------------+---------+------------+-----------+--+
| _hoodie_commit_time | symbol | ts | volume | open | close |
+----------------------+---------+----------------------+---------+------------+-----------+--+
| 20180924222155 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 |
| 20180924222155 | GOOG | 2018-08-31 10:29:00 | 3391 | 1230.1899 | 1230.085 |
+----------------------+---------+----------------------+---------+------------+-----------+--+
# Realtime View
0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) from stock_ticks_mor_rt group by symbol HAVING symbol = 'GOOG';
WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases.
+---------+----------------------+--+
| symbol | _c1 |
+---------+----------------------+--+
| GOOG | 2018-08-31 10:59:00 |
+---------+----------------------+--+
0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor_rt where symbol = 'GOOG';
+----------------------+---------+----------------------+---------+------------+-----------+--+
| _hoodie_commit_time | symbol | ts | volume | open | close |
+----------------------+---------+----------------------+---------+------------+-----------+--+
| 20180924222155 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 |
| 20180924224537 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 |
+----------------------+---------+----------------------+---------+------------+-----------+--+
exit
exit
```
#### Step 6(b): Run Spark SQL Queries
Running the same queries in Spark-SQL:
```
docker exec -it adhoc-1 /bin/bash
bash-4.4# $SPARK_INSTALL/bin/spark-shell --jars $HUDI_SPARK_BUNDLE --driver-class-path $HADOOP_CONF_DIR --conf spark.sql.hive.convertMetastoreParquet=false --deploy-mode client --driver-memory 1G --master local[2] --executor-memory 3G --num-executors 1 --packages com.databricks:spark-avro_2.11:4.0.0
# Copy On Write Table:
scala> spark.sql("select symbol, max(ts) from stock_ticks_cow group by symbol HAVING symbol = 'GOOG'").show(100, false)
+------+-------------------+
|symbol|max(ts) |
+------+-------------------+
|GOOG |2018-08-31 10:59:00|
+------+-------------------+
scala> spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_cow where symbol = 'GOOG'").show(100, false)
+----------------------+---------+----------------------+---------+------------+-----------+--+
| _hoodie_commit_time | symbol | ts | volume | open | close |
+----------------------+---------+----------------------+---------+------------+-----------+--+
| 20180924221953 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 |
| 20180924224524 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 |
+----------------------+---------+----------------------+---------+------------+-----------+--+
As you can notice, the above queries now reflect the changes that came as part of ingesting second batch.
# Merge On Read Table:
# Read Optimized View
scala> spark.sql("select symbol, max(ts) from stock_ticks_mor group by symbol HAVING symbol = 'GOOG'").show(100, false)
+---------+----------------------+--+
| symbol | _c1 |
+---------+----------------------+--+
| GOOG | 2018-08-31 10:29:00 |
+---------+----------------------+--+
1 row selected (1.6 seconds)
scala> spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor where symbol = 'GOOG'").show(100, false)
+----------------------+---------+----------------------+---------+------------+-----------+--+
| _hoodie_commit_time | symbol | ts | volume | open | close |
+----------------------+---------+----------------------+---------+------------+-----------+--+
| 20180924222155 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 |
| 20180924222155 | GOOG | 2018-08-31 10:29:00 | 3391 | 1230.1899 | 1230.085 |
+----------------------+---------+----------------------+---------+------------+-----------+--+
# Realtime View
scala> spark.sql("select symbol, max(ts) from stock_ticks_mor_rt group by symbol HAVING symbol = 'GOOG'").show(100, false)
+---------+----------------------+--+
| symbol | _c1 |
+---------+----------------------+--+
| GOOG | 2018-08-31 10:59:00 |
+---------+----------------------+--+
scala> spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor_rt where symbol = 'GOOG'").show(100, false)
+----------------------+---------+----------------------+---------+------------+-----------+--+
| _hoodie_commit_time | symbol | ts | volume | open | close |
+----------------------+---------+----------------------+---------+------------+-----------+--+
| 20180924222155 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 |
| 20180924224537 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 |
+----------------------+---------+----------------------+---------+------------+-----------+--+
exit
exit
```
#### Step 7 : Incremental Query for COPY-ON-WRITE Table
With 2 batches of data ingested, lets showcase the support for incremental queries in Hudi Copy-On-Write datasets
Lets take the same projection query example
```
docker exec -it adhoc-2 /bin/bash
beeline -u jdbc:hive2://hiveserver:10000 --hiveconf hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat --hiveconf hive.stats.autogather=false
0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_cow where symbol = 'GOOG';
+----------------------+---------+----------------------+---------+------------+-----------+--+
| _hoodie_commit_time | symbol | ts | volume | open | close |
+----------------------+---------+----------------------+---------+------------+-----------+--+
| 20180924064621 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 |
| 20180924065039 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 |
+----------------------+---------+----------------------+---------+------------+-----------+--+
```
As you notice from the above queries, there are 2 commits - 20180924064621 and 20180924065039 in timeline order.
When you follow the steps, you will be getting different timestamps for commits. Substitute them
in place of the above timestamps.
To show the effects of incremental-query, let us assume that a reader has already seen the changes as part of
ingesting first batch. Now, for the reader to see effect of the second batch, he/she has to keep the start timestamp to
the commit time of the first batch (20180924064621) and run incremental query
`Hudi incremental mode` provides efficient scanning for incremental queries by filtering out files that do not have any
candidate rows using hudi-managed metadata.
```
docker exec -it adhoc-2 /bin/bash
beeline -u jdbc:hive2://hiveserver:10000 --hiveconf hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat --hiveconf hive.stats.autogather=false
0: jdbc:hive2://hiveserver:10000> set hoodie.stock_ticks_cow.consume.mode=INCREMENTAL;
No rows affected (0.009 seconds)
0: jdbc:hive2://hiveserver:10000> set hoodie.stock_ticks_cow.consume.max.commits=3;
No rows affected (0.009 seconds)
0: jdbc:hive2://hiveserver:10000> set hoodie.stock_ticks_cow.consume.start.timestamp=20180924064621;
# With the above setting, file-ids that do not have any updates from the commit 20180924065039 is filtered out without scanning.
# Here is the incremental query :
0: jdbc:hive2://hiveserver:10000>
0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_cow where symbol = 'GOOG' and `_hoodie_commit_time` > '20180924064621';
+----------------------+---------+----------------------+---------+------------+-----------+--+
| _hoodie_commit_time | symbol | ts | volume | open | close |
+----------------------+---------+----------------------+---------+------------+-----------+--+
| 20180924065039 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 |
+----------------------+---------+----------------------+---------+------------+-----------+--+
1 row selected (0.83 seconds)
0: jdbc:hive2://hiveserver:10000>
```
##### Incremental Query with Spark SQL:
```
docker exec -it adhoc-1 /bin/bash
bash-4.4# $SPARK_INSTALL/bin/spark-shell --jars $HUDI_SPARK_BUNDLE --driver-class-path $HADOOP_CONF_DIR --conf spark.sql.hive.convertMetastoreParquet=false --deploy-mode client --driver-memory 1G --master local[2] --executor-memory 3G --num-executors 1 --packages com.databricks:spark-avro_2.11:4.0.0
Welcome to
____ __
/ __/__ ___ _____/ /__
_\ \/ _ \/ _ `/ __/ '_/
/___/ .__/\_,_/_/ /_/\_\ version 2.3.1
/_/
Using Scala version 2.11.8 (Java HotSpot(TM) 64-Bit Server VM, Java 1.8.0_181)
Type in expressions to have them evaluated.
Type :help for more information.
scala> import com.uber.hoodie.DataSourceReadOptions
import com.uber.hoodie.DataSourceReadOptions
# In the below query, 20180925045257 is the first commit's timestamp
scala> val hoodieIncViewDF = spark.read.format("com.uber.hoodie").option(DataSourceReadOptions.VIEW_TYPE_OPT_KEY, DataSourceReadOptions.VIEW_TYPE_INCREMENTAL_OPT_VAL).option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY, "20180924064621").load("/user/hive/warehouse/stock_ticks_cow")
SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.
hoodieIncViewDF: org.apache.spark.sql.DataFrame = [_hoodie_commit_time: string, _hoodie_commit_seqno: string ... 15 more fields]
scala> hoodieIncViewDF.registerTempTable("stock_ticks_cow_incr_tmp1")
warning: there was one deprecation warning; re-run with -deprecation for details
scala> spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_cow_incr_tmp1 where symbol = 'GOOG'").show(100, false);
+----------------------+---------+----------------------+---------+------------+-----------+--+
| _hoodie_commit_time | symbol | ts | volume | open | close |
+----------------------+---------+----------------------+---------+------------+-----------+--+
| 20180924065039 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 |
+----------------------+---------+----------------------+---------+------------+-----------+--+
```
#### Step 8: Schedule and Run Compaction for Merge-On-Read dataset
Lets schedule and run a compaction to create a new version of columnar file so that read-optimized readers will see fresher data.
Again, You can use Hudi CLI to manually schedule and run compaction
```
docker exec -it adhoc-1 /bin/bash
^[[Aroot@adhoc-1:/opt# /var/hoodie/ws/hoodie-cli/hoodie-cli.sh
============================================
* *
* _ _ _ _ *
* | | | | | (_) *
* | |__| | ___ ___ __| |_ ___ *
* | __ |/ _ \ / _ \ / _` | |/ _ \ *
* | | | | (_) | (_) | (_| | | __/ *
* |_| |_|\___/ \___/ \__,_|_|\___| *
* *
============================================
Welcome to Hoodie CLI. Please type help if you are looking for help.
hoodie->connect --path /user/hive/warehouse/stock_ticks_mor
18/09/24 06:59:34 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
18/09/24 06:59:35 INFO table.HoodieTableMetaClient: Loading HoodieTableMetaClient from /user/hive/warehouse/stock_ticks_mor
18/09/24 06:59:35 INFO util.FSUtils: Hadoop Configuration: fs.defaultFS: [hdfs://namenode:8020], Config:[Configuration: core-default.xml, core-site.xml, mapred-default.xml, mapred-site.xml, yarn-default.xml, yarn-site.xml, hdfs-default.xml, hdfs-site.xml], FileSystem: [DFS[DFSClient[clientName=DFSClient_NONMAPREDUCE_-1261652683_11, ugi=root (auth:SIMPLE)]]]
18/09/24 06:59:35 INFO table.HoodieTableConfig: Loading dataset properties from /user/hive/warehouse/stock_ticks_mor/.hoodie/hoodie.properties
18/09/24 06:59:36 INFO table.HoodieTableMetaClient: Finished Loading Table of type MERGE_ON_READ from /user/hive/warehouse/stock_ticks_mor
Metadata for table stock_ticks_mor loaded
# Ensure no compactions are present
hoodie:stock_ticks_mor->compactions show all
18/09/24 06:59:54 INFO timeline.HoodieActiveTimeline: Loaded instants [[20180924064636__clean__COMPLETED], [20180924064636__deltacommit__COMPLETED], [20180924065057__clean__COMPLETED], [20180924065057__deltacommit__COMPLETED]]
___________________________________________________________________
| Compaction Instant Time| State | Total FileIds to be Compacted|
|==================================================================|
# Schedule a compaction. This will use Spark Launcher to schedule compaction
hoodie:stock_ticks_mor->compaction schedule
....
Compaction successfully completed for 20180924070031
# Now refresh and check again. You will see that there is a new compaction requested
hoodie:stock_ticks->connect --path /user/hive/warehouse/stock_ticks_mor
18/09/24 07:01:16 INFO table.HoodieTableMetaClient: Loading HoodieTableMetaClient from /user/hive/warehouse/stock_ticks_mor
18/09/24 07:01:16 INFO util.FSUtils: Hadoop Configuration: fs.defaultFS: [hdfs://namenode:8020], Config:[Configuration: core-default.xml, core-site.xml, mapred-default.xml, mapred-site.xml, yarn-default.xml, yarn-site.xml, hdfs-default.xml, hdfs-site.xml], FileSystem: [DFS[DFSClient[clientName=DFSClient_NONMAPREDUCE_-1261652683_11, ugi=root (auth:SIMPLE)]]]
18/09/24 07:01:16 INFO table.HoodieTableConfig: Loading dataset properties from /user/hive/warehouse/stock_ticks_mor/.hoodie/hoodie.properties
18/09/24 07:01:16 INFO table.HoodieTableMetaClient: Finished Loading Table of type MERGE_ON_READ from /user/hive/warehouse/stock_ticks_mor
Metadata for table stock_ticks_mor loaded
hoodie:stock_ticks_mor->compactions show all
18/09/24 06:34:12 INFO timeline.HoodieActiveTimeline: Loaded instants [[20180924041125__clean__COMPLETED], [20180924041125__deltacommit__COMPLETED], [20180924042735__clean__COMPLETED], [20180924042735__deltacommit__COMPLETED], [==>20180924063245__compaction__REQUESTED]]
___________________________________________________________________
| Compaction Instant Time| State | Total FileIds to be Compacted|
|==================================================================|
| 20180924070031 | REQUESTED| 1 |
# Execute the compaction. The compaction instant value passed below must be the one displayed in the above "compactions show all" query
hoodie:stock_ticks_mor->compaction run --compactionInstant 20180924070031 --parallelism 2 --sparkMemory 1G --schemaFilePath /var/demo/config/schema.avsc --retry 1
....
Compaction successfully completed for 20180924070031
## Now check if compaction is completed
hoodie:stock_ticks_mor->connect --path /user/hive/warehouse/stock_ticks_mor
18/09/24 07:03:00 INFO table.HoodieTableMetaClient: Loading HoodieTableMetaClient from /user/hive/warehouse/stock_ticks_mor
18/09/24 07:03:00 INFO util.FSUtils: Hadoop Configuration: fs.defaultFS: [hdfs://namenode:8020], Config:[Configuration: core-default.xml, core-site.xml, mapred-default.xml, mapred-site.xml, yarn-default.xml, yarn-site.xml, hdfs-default.xml, hdfs-site.xml], FileSystem: [DFS[DFSClient[clientName=DFSClient_NONMAPREDUCE_-1261652683_11, ugi=root (auth:SIMPLE)]]]
18/09/24 07:03:00 INFO table.HoodieTableConfig: Loading dataset properties from /user/hive/warehouse/stock_ticks_mor/.hoodie/hoodie.properties
18/09/24 07:03:00 INFO table.HoodieTableMetaClient: Finished Loading Table of type MERGE_ON_READ from /user/hive/warehouse/stock_ticks_mor
Metadata for table stock_ticks_mor loaded
hoodie:stock_ticks->compactions show all
18/09/24 07:03:15 INFO timeline.HoodieActiveTimeline: Loaded instants [[20180924064636__clean__COMPLETED], [20180924064636__deltacommit__COMPLETED], [20180924065057__clean__COMPLETED], [20180924065057__deltacommit__COMPLETED], [20180924070031__commit__COMPLETED]]
___________________________________________________________________
| Compaction Instant Time| State | Total FileIds to be Compacted|
|==================================================================|
| 20180924070031 | COMPLETED| 1 |
```
#### Step 9: Run Hive Queries including incremental queries
You will see that both ReadOptimized and Realtime Views will show the latest committed data.
Lets also run the incremental query for MOR table.
From looking at the below query output, it will be clear that the fist commit time for the MOR table is 20180924064636
and the second commit time is 20180924070031
```
docker exec -it adhoc-2 /bin/bash
beeline -u jdbc:hive2://hiveserver:10000 --hiveconf hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat --hiveconf hive.stats.autogather=false
# Read Optimized View
0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) from stock_ticks_mor group by symbol HAVING symbol = 'GOOG';
WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases.
+---------+----------------------+--+
| symbol | _c1 |
+---------+----------------------+--+
| GOOG | 2018-08-31 10:59:00 |
+---------+----------------------+--+
1 row selected (1.6 seconds)
0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor where symbol = 'GOOG';
+----------------------+---------+----------------------+---------+------------+-----------+--+
| _hoodie_commit_time | symbol | ts | volume | open | close |
+----------------------+---------+----------------------+---------+------------+-----------+--+
| 20180924064636 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 |
| 20180924070031 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 |
+----------------------+---------+----------------------+---------+------------+-----------+--+
# Realtime View
0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) from stock_ticks_mor_rt group by symbol HAVING symbol = 'GOOG';
WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases.
+---------+----------------------+--+
| symbol | _c1 |
+---------+----------------------+--+
| GOOG | 2018-08-31 10:59:00 |
+---------+----------------------+--+
0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor_rt where symbol = 'GOOG';
+----------------------+---------+----------------------+---------+------------+-----------+--+
| _hoodie_commit_time | symbol | ts | volume | open | close |
+----------------------+---------+----------------------+---------+------------+-----------+--+
| 20180924064636 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 |
| 20180924070031 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 |
+----------------------+---------+----------------------+---------+------------+-----------+--+
# Incremental View:
0: jdbc:hive2://hiveserver:10000> set hoodie.stock_ticks_mor.consume.mode=INCREMENTAL;
No rows affected (0.008 seconds)
# Max-Commits covers both second batch and compaction commit
0: jdbc:hive2://hiveserver:10000> set hoodie.stock_ticks_mor.consume.max.commits=3;
No rows affected (0.007 seconds)
0: jdbc:hive2://hiveserver:10000> set hoodie.stock_ticks_mor.consume.start.timestamp=20180924064636;
No rows affected (0.013 seconds)
# Query:
0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor where symbol = 'GOOG' and `_hoodie_commit_time` > '20180924064636';
+----------------------+---------+----------------------+---------+------------+-----------+--+
| _hoodie_commit_time | symbol | ts | volume | open | close |
+----------------------+---------+----------------------+---------+------------+-----------+--+
| 20180924070031 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 |
+----------------------+---------+----------------------+---------+------------+-----------+--+
exit
exit
```
##### Read Optimized and Realtime Views for MOR with Spark-SQL after compaction
```
docker exec -it adhoc-1 /bin/bash
bash-4.4# $SPARK_INSTALL/bin/spark-shell --jars $HUDI_SPARK_BUNDLE --driver-class-path $HADOOP_CONF_DIR --conf spark.sql.hive.convertMetastoreParquet=false --deploy-mode client --driver-memory 1G --master local[2] --executor-memory 3G --num-executors 1 --packages com.databricks:spark-avro_2.11:4.0.0
# Read Optimized View
scala> spark.sql("select symbol, max(ts) from stock_ticks_mor group by symbol HAVING symbol = 'GOOG'").show(100, false)
+---------+----------------------+--+
| symbol | _c1 |
+---------+----------------------+--+
| GOOG | 2018-08-31 10:59:00 |
+---------+----------------------+--+
1 row selected (1.6 seconds)
scala> spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor where symbol = 'GOOG'").show(100, false)
+----------------------+---------+----------------------+---------+------------+-----------+--+
| _hoodie_commit_time | symbol | ts | volume | open | close |
+----------------------+---------+----------------------+---------+------------+-----------+--+
| 20180924064636 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 |
| 20180924070031 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 |
+----------------------+---------+----------------------+---------+------------+-----------+--+
# Realtime View
scala> spark.sql("select symbol, max(ts) from stock_ticks_mor_rt group by symbol HAVING symbol = 'GOOG'").show(100, false)
+---------+----------------------+--+
| symbol | _c1 |
+---------+----------------------+--+
| GOOG | 2018-08-31 10:59:00 |
+---------+----------------------+--+
scala> spark.sql("select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor_rt where symbol = 'GOOG'").show(100, false)
+----------------------+---------+----------------------+---------+------------+-----------+--+
| _hoodie_commit_time | symbol | ts | volume | open | close |
+----------------------+---------+----------------------+---------+------------+-----------+--+
| 20180924064636 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 |
| 20180924070031 | GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 |
+----------------------+---------+----------------------+---------+------------+-----------+--+
```
This brings the demo to an end.
## Testing Hoodie in Local Docker environment
You can bring up a hadoop docker environment containing Hadoop, Hive and Spark services with support for hoodie.
```
$ mvn pre-integration-test -DskipTests
```
The above command builds docker images for all the services with
current hoodie source installed at /var/hoodie/ws and also brings up the services using a compose file. We
currently use Hadoop (v2.8.4), Hive (v2.3.3) and Spark (v2.3.1) in docker images.
To bring down the containers
```
$ cd hoodie-integ-test
$ mvn docker-compose:down
```
If you want to bring up the docker containers, use
```
$ cd hoodie-integ-test
$ mvn docker-compose:up -DdetachedMode=true
```
Hoodie is a library that is operated in a broader data analytics/ingestion environment
involving Hadoop, Hive and Spark. Interoperability with all these systems is a key objective for us. We are
actively adding integration-tests under __hoodie-integ-test/src/test/java__ that makes use of this
docker environment (See __hoodie-integ-test/src/test/java/com/uber/hoodie/integ/ITTestHoodieSanity.java__ )
#### Building Local Docker Containers:
The docker images required for demo and running integration test are already in docker-hub. The docker images
and compose scripts are carefully implemented so that they serve dual-purpose
1. The docker images have inbuilt hudi jar files with environment variable pointing to those jars (HUDI_HADOOP_BUNDLE, ...)
2. For running integration-tests, we need the jars generated locally to be used for running services within docker. The
docker-compose scripts (see `docker/compose/docker-compose_hadoop284_hive233_spark231.yml`) ensures local jars override
inbuilt jars by mounting local HUDI workspace over the docker location
This helps avoid maintaining separate docker images and avoids the costly step of building HUDI docker images locally.
But if users want to test hudi from locations with lower network bandwidth, they can still build local images
run the script
`docker/build_local_docker_images.sh` to build local docker images before running `docker/setup_demo.sh`
Here are the commands:
```
cd docker
./build_local_docker_images.sh
.....
[INFO] Reactor Summary:
[INFO]
[INFO] Hoodie ............................................. SUCCESS [ 1.709 s]
[INFO] hoodie-common ...................................... SUCCESS [ 9.015 s]
[INFO] hoodie-hadoop-mr ................................... SUCCESS [ 1.108 s]
[INFO] hoodie-client ...................................... SUCCESS [ 4.409 s]
[INFO] hoodie-hive ........................................ SUCCESS [ 0.976 s]
[INFO] hoodie-spark ....................................... SUCCESS [ 26.522 s]
[INFO] hoodie-utilities ................................... SUCCESS [ 16.256 s]
[INFO] hoodie-cli ......................................... SUCCESS [ 11.341 s]
[INFO] hoodie-hadoop-mr-bundle ............................ SUCCESS [ 1.893 s]
[INFO] hoodie-hive-bundle ................................. SUCCESS [ 14.099 s]
[INFO] hoodie-spark-bundle ................................ SUCCESS [ 58.252 s]
[INFO] hoodie-hadoop-docker ............................... SUCCESS [ 0.612 s]
[INFO] hoodie-hadoop-base-docker .......................... SUCCESS [04:04 min]
[INFO] hoodie-hadoop-namenode-docker ...................... SUCCESS [ 6.142 s]
[INFO] hoodie-hadoop-datanode-docker ...................... SUCCESS [ 7.763 s]
[INFO] hoodie-hadoop-history-docker ....................... SUCCESS [ 5.922 s]
[INFO] hoodie-hadoop-hive-docker .......................... SUCCESS [ 56.152 s]
[INFO] hoodie-hadoop-sparkbase-docker ..................... SUCCESS [01:18 min]
[INFO] hoodie-hadoop-sparkmaster-docker ................... SUCCESS [ 2.964 s]
[INFO] hoodie-hadoop-sparkworker-docker ................... SUCCESS [ 3.032 s]
[INFO] hoodie-hadoop-sparkadhoc-docker .................... SUCCESS [ 2.764 s]
[INFO] hoodie-integ-test .................................. SUCCESS [ 1.785 s]
[INFO] ------------------------------------------------------------------------
[INFO] BUILD SUCCESS
[INFO] ------------------------------------------------------------------------
[INFO] Total time: 09:15 min
[INFO] Finished at: 2018-09-10T17:47:37-07:00
[INFO] Final Memory: 236M/1848M
[INFO] ------------------------------------------------------------------------
```

0
hoodie-hive/run_sync_tool.sh Normal file → Executable file
View File

View File

@@ -14,9 +14,8 @@
* limitations under the License. * limitations under the License.
*/ */
package com.uber.hoodie.hive.util; package com.uber.hoodie.hive;
import com.uber.hoodie.hive.PartitionValueExtractor;
import java.util.Arrays; import java.util.Arrays;
import java.util.List; import java.util.List;

View File

@@ -26,7 +26,6 @@ import com.google.common.collect.Lists;
import com.uber.hoodie.common.util.SchemaTestUtil; import com.uber.hoodie.common.util.SchemaTestUtil;
import com.uber.hoodie.hive.HoodieHiveClient.PartitionEvent; import com.uber.hoodie.hive.HoodieHiveClient.PartitionEvent;
import com.uber.hoodie.hive.HoodieHiveClient.PartitionEvent.PartitionEventType; import com.uber.hoodie.hive.HoodieHiveClient.PartitionEvent.PartitionEventType;
import com.uber.hoodie.hive.util.MultiPartKeysValueExtractor;
import com.uber.hoodie.hive.util.SchemaUtil; import com.uber.hoodie.hive.util.SchemaUtil;
import java.io.IOException; import java.io.IOException;
import java.net.URISyntaxException; import java.net.URISyntaxException;

212
hoodie-integ-test/pom.xml Normal file
View File

@@ -0,0 +1,212 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
~ Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
~
~ Licensed under the Apache License, Version 2.0 (the "License");
~ you may not use this file except in compliance with the License.
~ You may obtain a copy of the License at
~
~ http://www.apache.org/licenses/LICENSE-2.0
~
~ Unless required by applicable law or agreed to in writing, software
~ distributed under the License is distributed on an "AS IS" BASIS,
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
~ See the License for the specific language governing permissions and
~ limitations under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>hoodie</artifactId>
<groupId>com.uber.hoodie</groupId>
<version>0.4.5-SNAPSHOT</version>
<relativePath>../pom.xml</relativePath>
</parent>
<artifactId>hoodie-integ-test</artifactId>
<modelVersion>4.0.0</modelVersion>
<dependencies>
<dependency>
<groupId>org.glassfish.jersey.connectors</groupId>
<artifactId>jersey-apache-connector</artifactId>
<version>2.17</version>
</dependency>
<dependency>
<groupId>org.glassfish.jersey.core</groupId>
<artifactId>jersey-server</artifactId>
<version>2.17</version>
</dependency>
<dependency>
<groupId>org.glassfish.jersey.containers</groupId>
<artifactId>jersey-container-servlet-core</artifactId>
<version>2.17</version>
</dependency>
<dependency>
<groupId>com.uber.hoodie</groupId>
<artifactId>hoodie-spark</artifactId>
<version>${project.version}</version>
<exclusions>
<exclusion>
<groupId>org.glassfish.**</groupId>
<artifactId>*</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>com.uber.hoodie</groupId>
<artifactId>hoodie-common</artifactId>
<version>${project.version}</version>
<classifier>tests</classifier>
<type>test-jar</type>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.awaitility</groupId>
<artifactId>awaitility</artifactId>
<version>3.1.2</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>com.uber.hoodie</groupId>
<artifactId>hoodie-spark</artifactId>
<version>${project.version}</version>
<classifier>tests</classifier>
<type>test-jar</type>
<scope>test</scope>
<exclusions>
<exclusion>
<groupId>org.glassfish.**</groupId>
<artifactId>*</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>20.0</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-annotations</artifactId>
<version>2.6.4</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
<version>2.6.4</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.datatype</groupId>
<artifactId>jackson-datatype-guava</artifactId>
<version>2.9.4</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>com.github.docker-java</groupId>
<artifactId>docker-java</artifactId>
<version>3.1.0-rc-3</version>
<scope>test</scope>
<exclusions>
<exclusion>
<groupId>org.glassfish.**</groupId>
<artifactId>*</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>com.uber.hoodie</groupId>
<artifactId>hoodie-hadoop-sparkworker-docker</artifactId>
<version>${project.version}</version>
<type>pom</type>
<scope>import</scope>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>${junit.version}</version>
<scope>test</scope>
</dependency>
</dependencies>
<properties>
<dockerCompose.envFile>${project.basedir}/compose_env</dockerCompose.envFile>
<dockerCompose.file>${project.basedir}/../docker/compose/docker-compose_hadoop284_hive233_spark231.yml</dockerCompose.file>
<skipITs>false</skipITs>
<docker.compose.skip>${skipITs}</docker.compose.skip>
<checkstyle.skip>true</checkstyle.skip>
</properties>
<build>
<plugins>
<plugin>
<artifactId>exec-maven-plugin</artifactId>
<groupId>org.codehaus.mojo</groupId>
<executions>
<execution><!-- setup HUDI_WS variable in docker compose env file -->
<id>Setup HUDI_WS</id>
<phase>generate-sources</phase>
<goals>
<goal>exec</goal>
</goals>
<configuration>
<executable>/bin/bash</executable>
<arguments>
<argument> -c </argument>
<argument>echo HUDI_WS=`dirname ${project.basedir}`</argument>
</arguments>
<outputFile>${dockerCompose.envFile}</outputFile>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-failsafe-plugin</artifactId>
<version>2.22.0</version>
<executions>
<execution>
<goals>
<goal>integration-test</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>com.dkanejs.maven.plugins</groupId>
<artifactId>docker-compose-maven-plugin</artifactId>
<version>2.0.1</version>
<executions>
<execution>
<id>up</id>
<phase>pre-integration-test</phase>
<goals>
<goal>up</goal>
</goals>
<configuration>
<skip>${docker.compose.skip}</skip>
<host>unix:///var/run/docker.sock</host>
<composeFile>${project.basedir}/../docker/compose/docker-compose_hadoop284_hive233_spark231.yml</composeFile>
<detachedMode>true</detachedMode>
<envFile>${dockerCompose.envFile}</envFile>
</configuration>
</execution>
<execution>
<id>down</id>
<phase>integration-test</phase>
<goals>
<goal>down</goal>
</goals>
<configuration>
<skip>${docker.compose.skip}</skip>
<composeFile>${project.basedir}/../docker/compose/docker-compose_hadoop284_hive233_spark231.yml</composeFile>
<removeVolumes>true</removeVolumes>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>

View File

@@ -0,0 +1,178 @@
/*
* Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.uber.hoodie.integ;
import static java.util.concurrent.TimeUnit.SECONDS;
import static org.awaitility.Awaitility.await;
import com.github.dockerjava.api.DockerClient;
import com.github.dockerjava.api.command.DockerCmdExecFactory;
import com.github.dockerjava.api.command.ExecCreateCmd;
import com.github.dockerjava.api.command.ExecCreateCmdResponse;
import com.github.dockerjava.api.model.Container;
import com.github.dockerjava.core.DefaultDockerClientConfig;
import com.github.dockerjava.core.DockerClientBuilder;
import com.github.dockerjava.core.DockerClientConfig;
import com.github.dockerjava.core.command.ExecStartResultCallback;
import com.github.dockerjava.jaxrs.JerseyDockerCmdExecFactory;
import com.google.common.collect.ImmutableList;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.junit.Assert;
import org.junit.Before;
public abstract class ITTestBase {
public static final Logger LOG = LogManager.getLogger(ITTestBase.class);
protected static final String SPARK_WORKER_CONTAINER = "/spark-worker-1";
protected static final String ADHOC_1_CONTAINER = "/adhoc-1";
protected static final String ADHOC_2_CONTAINER = "/adhoc-2";
protected static final String HIVESERVER = "/hiveserver";
protected static final String HOODIE_WS_ROOT = "/var/hoodie/ws";
protected static final String HOODIE_JAVA_APP = HOODIE_WS_ROOT + "/hoodie-spark/run_hoodie_app.sh";
protected static final String HUDI_HADOOP_BUNDLE =
HOODIE_WS_ROOT + "/docker/hoodie/hadoop/hive_base/target/hoodie-hadoop-mr-bundle.jar";
protected static final String HUDI_HIVE_BUNDLE =
HOODIE_WS_ROOT + "/docker/hoodie/hadoop/hive_base/target/hoodie-hive-bundle.jar";
protected static final String HUDI_SPARK_BUNDLE =
HOODIE_WS_ROOT + "/docker/hoodie/hadoop/hive_base/target/hoodie-spark-bundle.jar";
protected static final String HIVE_SERVER_JDBC_URL = "jdbc:hive2://hiveserver:10000";
// Skip these lines when capturing output from hive
protected static final Integer SLF4J_WARNING_LINE_COUNT_IN_HIVE_CMD = 9;
private static final String DEFAULT_DOCKER_HOST = "unix:///var/run/docker.sock";
private static final String OVERRIDDEN_DOCKER_HOST = System.getenv("DOCKER_HOST");
protected DockerClient dockerClient;
protected Map<String, Container> runningContainers;
protected static String[] getHiveConsoleCommand(String rawCommand) {
String jarCommand = "add jar " + HUDI_HADOOP_BUNDLE + ";";
String fullCommand = jarCommand + rawCommand;
List<String> cmd = new ImmutableList.Builder().add("hive")
.add("--hiveconf")
.add("hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat")
.add("--hiveconf")
.add("hive.stats.autogather=false")
.add("-e")
.add("\"" + fullCommand + "\"")
.build();
return cmd.stream().toArray(String[]::new);
}
@Before
public void init() throws IOException {
String dockerHost = (OVERRIDDEN_DOCKER_HOST != null) ? OVERRIDDEN_DOCKER_HOST : DEFAULT_DOCKER_HOST;
//Assuming insecure docker engine
DockerClientConfig config = DefaultDockerClientConfig.createDefaultConfigBuilder()
.withDockerHost(dockerHost)
.build();
// using jaxrs/jersey implementation here (netty impl is also available)
DockerCmdExecFactory dockerCmdExecFactory = new JerseyDockerCmdExecFactory()
.withConnectTimeout(1000)
.withMaxTotalConnections(100)
.withMaxPerRouteConnections(10);
dockerClient = DockerClientBuilder.getInstance(config)
.withDockerCmdExecFactory(dockerCmdExecFactory)
.build();
await().atMost(60, SECONDS).until(this::servicesUp);
}
private boolean servicesUp() {
List<Container> containerList = dockerClient.listContainersCmd().exec();
for (Container c : containerList) {
if (!c.getState().equalsIgnoreCase("running")) {
System.out.println("Container : " + Arrays.toString(c.getNames())
+ "not in running state, Curr State :" + c.getState());
return false;
}
}
runningContainers = containerList.stream().map(c -> Pair.of(c.getNames()[0], c))
.collect(Collectors.toMap(Pair::getLeft, Pair::getRight));
return true;
}
protected TestExecStartResultCallback executeCommandInDocker(String containerName, String[] command,
boolean expectedToSucceed)
throws Exception {
LOG.info("Executing command (" + Arrays.toString(command) + ") in container " + containerName);
Container sparkWorkerContainer = runningContainers.get(containerName);
ExecCreateCmd cmd = dockerClient.execCreateCmd(sparkWorkerContainer.getId())
.withCmd(command).withAttachStdout(true).withAttachStderr(true);
ExecCreateCmdResponse createCmdResponse = cmd.exec();
TestExecStartResultCallback callback = new TestExecStartResultCallback(new ByteArrayOutputStream(),
new ByteArrayOutputStream());
dockerClient.execStartCmd(createCmdResponse.getId()).withDetach(false).withTty(false)
.exec(callback).awaitCompletion();
int exitCode = dockerClient.inspectExecCmd(createCmdResponse.getId()).exec().getExitCode();
LOG.info("Exit code for command (" + Arrays.toString(command) + ") is " + exitCode);
if (exitCode != 0) {
LOG.error("Command (" + Arrays.toString(command) + ") failed.");
LOG.error("Stdout is :" + callback.getStdout().toString());
LOG.error("Stderr is :" + callback.getStderr().toString());
}
if (expectedToSucceed) {
Assert.assertTrue("Command (" + Arrays.toString(command)
+ ") expected to succeed. Exit (" + exitCode + ")", exitCode == 0);
} else {
Assert.assertTrue("Command (" + Arrays.toString(command)
+ ") expected to fail. Exit (" + exitCode + ")", exitCode != 0);
}
cmd.close();
return callback;
}
public class TestExecStartResultCallback extends ExecStartResultCallback {
// Storing the reference in subclass to expose to clients
private final ByteArrayOutputStream stdout;
private final ByteArrayOutputStream stderr;
public TestExecStartResultCallback(ByteArrayOutputStream stdout, ByteArrayOutputStream stderr) {
super(stdout, stderr);
this.stdout = stdout;
this.stderr = stderr;
}
@Override
public void onComplete() {
super.onComplete();
LOG.info("onComplete called");
try {
stderr.flush();
stdout.flush();
} catch (IOException e) {
throw new RuntimeException(e);
}
}
public ByteArrayOutputStream getStdout() {
return stdout;
}
public ByteArrayOutputStream getStderr() {
return stderr;
}
}
}

View File

@@ -0,0 +1,139 @@
/*
* Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.uber.hoodie.integ;
import java.util.Arrays;
import org.junit.Assert;
import org.junit.Test;
/**
* Smoke tests to run as part of verification.
*/
public class ITTestHoodieSanity extends ITTestBase {
@Test
public void testRunEcho() throws Exception {
String[] cmd = new String[]{"echo", "Happy Testing"};
TestExecStartResultCallback callback = executeCommandInDocker(ADHOC_1_CONTAINER,
cmd, true);
String stdout = callback.getStdout().toString();
String stderr = callback.getStderr().toString();
LOG.info("Got output for (" + Arrays.toString(cmd) + ") :" + stdout);
LOG.info("Got error output for (" + Arrays.toString(cmd) + ") :" + stderr);
}
@Test
/**
* A basic integration test that runs HoodieJavaApp to create a sample COW Hoodie with single partition key
* data-set and performs upserts on it. Hive integration and upsert functionality is checked by running a count
* query in hive console.
*/
public void testRunHoodieJavaAppOnSinglePartitionKeyCOWTable() throws Exception {
String hiveTableName = "docker_hoodie_single_partition_key_cow_test";
testRunHoodieJavaAppOnCOWTable(hiveTableName, true);
}
@Test
/**
* A basic integration test that runs HoodieJavaApp to create a sample COW Hoodie with multiple partition-keys
* data-set and performs upserts on it. Hive integration and upsert functionality is checked by running a count
* query in hive console.
*/
public void testRunHoodieJavaAppOnMultiPartitionKeysCOWTable() throws Exception {
String hiveTableName = "docker_hoodie_multi_partition_key_cow_test";
testRunHoodieJavaAppOnCOWTable(hiveTableName, false);
}
/**
* A basic integration test that runs HoodieJavaApp to create a sample COW Hoodie
* data-set and performs upserts on it. Hive integration and upsert functionality is checked by running a count
* query in hive console.
* TODO: Add spark-shell test-case
*/
public void testRunHoodieJavaAppOnCOWTable(String hiveTableName, boolean singlePartitionKey) throws Exception {
// Drop Table if it exists
{
String[] hiveDropCmd = getHiveConsoleCommand("drop table if exists " + hiveTableName);
executeCommandInDocker(HIVESERVER, hiveDropCmd, true);
}
// Ensure table does not exist
{
String[] hiveTableCheck = getHiveConsoleCommand("show tables like '" + hiveTableName + "'");
TestExecStartResultCallback callback =
executeCommandInDocker(HIVESERVER, hiveTableCheck, true);
String stderr = callback.getStderr().toString();
String stdout = callback.getStdout().toString();
LOG.info("Got output for (" + Arrays.toString(hiveTableCheck) + ") :" + stdout);
LOG.info("Got error output for (" + Arrays.toString(hiveTableCheck) + ") :" + stderr);
Assert.assertTrue("Result :" + callback.getStdout().toString(), stdout.trim().isEmpty());
}
// Run Hoodie Java App
{
String[] cmd = null;
if (singlePartitionKey) {
cmd = new String[]{
HOODIE_JAVA_APP,
"--hive-sync",
"--hive-url", HIVE_SERVER_JDBC_URL,
"--hive-table", hiveTableName
};
} else {
cmd = new String[]{
HOODIE_JAVA_APP,
"--hive-sync",
"--hive-url", HIVE_SERVER_JDBC_URL,
"--use-multi-partition-keys",
"--hive-table", hiveTableName
};
}
TestExecStartResultCallback callback = executeCommandInDocker(ADHOC_1_CONTAINER,
cmd, true);
String stdout = callback.getStdout().toString().trim();
String stderr = callback.getStderr().toString().trim();
LOG.info("Got output for (" + Arrays.toString(cmd) + ") :" + stdout);
LOG.info("Got error output for (" + Arrays.toString(cmd) + ") :" + stderr);
}
// Ensure table does exist
{
String[] hiveTableCheck = getHiveConsoleCommand("show tables like '" + hiveTableName + "'");
TestExecStartResultCallback callback =
executeCommandInDocker(HIVESERVER, hiveTableCheck, true);
String stderr = callback.getStderr().toString().trim();
String stdout = callback.getStdout().toString().trim();
LOG.info("Got output for (" + Arrays.toString(hiveTableCheck) + ") : (" + stdout + ")");
LOG.info("Got error output for (" + Arrays.toString(hiveTableCheck) + ") : (" + stderr + ")");
Assert.assertEquals("Table exists", hiveTableName, stdout);
}
// Ensure row count is 100 (without duplicates)
{
String[] hiveTableCheck = getHiveConsoleCommand("select count(1) from " + hiveTableName);
TestExecStartResultCallback callback =
executeCommandInDocker(ADHOC_1_CONTAINER, hiveTableCheck, true);
String stderr = callback.getStderr().toString().trim();
String stdout = callback.getStdout().toString().trim();
LOG.info("Got output for (" + Arrays.toString(hiveTableCheck) + ") : (" + stdout + ")");
LOG.info("Got error output for (" + Arrays.toString(hiveTableCheck) + ") : (" + stderr + ")");
Assert.assertEquals("Expecting 100 rows to be present in the new table", 100,
Integer.parseInt(stdout.trim()));
}
}
}

View File

@@ -0,0 +1,23 @@
#
# Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
log4j.rootLogger=WARN, A1
log4j.category.com.uber=INFO
log4j.category.org.apache.parquet.hadoop=WARN
# A1 is set to be a ConsoleAppender.
log4j.appender.A1=org.apache.log4j.ConsoleAppender
# A1 uses PatternLayout.
log4j.appender.A1.layout=org.apache.log4j.PatternLayout
log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n

View File

@@ -111,6 +111,21 @@
</execution> </execution>
</executions> </executions>
</plugin> </plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<executions>
<execution>
<goals>
<goal>test-jar</goal>
</goals>
<phase>test-compile</phase>
</execution>
</executions>
<configuration>
<skip>false</skip>
</configuration>
</plugin>
<plugin> <plugin>
<groupId>org.apache.rat</groupId> <groupId>org.apache.rat</groupId>
<artifactId>apache-rat-plugin</artifactId> <artifactId>apache-rat-plugin</artifactId>

2
hoodie-spark/run_hoodie_app.sh Normal file → Executable file
View File

@@ -21,4 +21,4 @@ fi
OTHER_JARS=`ls -1 $DIR/target/lib/*jar | grep -v '*avro*-1.' | tr '\n' ':'` OTHER_JARS=`ls -1 $DIR/target/lib/*jar | grep -v '*avro*-1.' | tr '\n' ':'`
#TODO - Need to move TestDataGenerator and HoodieJavaApp out of tests #TODO - Need to move TestDataGenerator and HoodieJavaApp out of tests
echo "Running command : java -cp $DIR/target/test-classes/:$DIR/../hoodie-client/target/test-classes/:${HADOOP_CONF_DIR}:$HOODIE_JAR:${CLIENT_JAR}:$OTHER_JARS HoodieJavaApp $@" echo "Running command : java -cp $DIR/target/test-classes/:$DIR/../hoodie-client/target/test-classes/:${HADOOP_CONF_DIR}:$HOODIE_JAR:${CLIENT_JAR}:$OTHER_JARS HoodieJavaApp $@"
java -cp $DIR/target/test-classes/:$DIR/../hoodie-client/target/test-classes/:${HADOOP_CONF_DIR}:$HOODIE_JAR:${CLIENT_JAR}:$OTHER_JARS HoodieJavaApp "$@" java -Xmx1G -cp $DIR/target/test-classes/:$DIR/../hoodie-client/target/test-classes/:${HADOOP_CONF_DIR}:$HOODIE_JAR:${CLIENT_JAR}:$OTHER_JARS HoodieJavaApp "$@"

View File

@@ -153,7 +153,7 @@ object DataSourceWriteOptions {
val HIVE_TABLE_OPT_KEY = "hoodie.datasource.hive_sync.table" val HIVE_TABLE_OPT_KEY = "hoodie.datasource.hive_sync.table"
val HIVE_USER_OPT_KEY = "hoodie.datasource.hive_sync.username" val HIVE_USER_OPT_KEY = "hoodie.datasource.hive_sync.username"
val HIVE_PASS_OPT_KEY = "hoodie.datasource.hive_sync.password" val HIVE_PASS_OPT_KEY = "hoodie.datasource.hive_sync.password"
val HIVE_URL_OPT_KEY = "hoodie.datasource.hive_sync.jdbcUrl" val HIVE_URL_OPT_KEY = "hoodie.datasource.hive_sync.jdbcurl"
val HIVE_PARTITION_FIELDS_OPT_KEY = "hoodie.datasource.hive_sync.partition_fields" val HIVE_PARTITION_FIELDS_OPT_KEY = "hoodie.datasource.hive_sync.partition_fields"
val HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY = "hoodie.datasource.hive_sync.partition_extractor_class" val HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY = "hoodie.datasource.hive_sync.partition_extractor_class"
val HIVE_ASSUME_DATE_PARTITION_OPT_KEY = "hoodie.datasource.hive_sync.assume_date_partitioning" val HIVE_ASSUME_DATE_PARTITION_OPT_KEY = "hoodie.datasource.hive_sync.assume_date_partitioning"

View File

@@ -24,6 +24,7 @@ import com.uber.hoodie.HoodieDataSourceHelpers;
import com.uber.hoodie.common.HoodieTestDataGenerator; import com.uber.hoodie.common.HoodieTestDataGenerator;
import com.uber.hoodie.common.model.HoodieTableType; import com.uber.hoodie.common.model.HoodieTableType;
import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.hive.MultiPartKeysValueExtractor;
import java.util.List; import java.util.List;
import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileSystem;
import org.apache.log4j.LogManager; import org.apache.log4j.LogManager;
@@ -65,7 +66,10 @@ public class HoodieJavaApp {
private String hivePass = "hive"; private String hivePass = "hive";
@Parameter(names = {"--hive-url", "-hl"}, description = "hive JDBC URL") @Parameter(names = {"--hive-url", "-hl"}, description = "hive JDBC URL")
private String hiveJdbcUrl = "jdbc:hive://localhost:10000"; private String hiveJdbcUrl = "jdbc:hive2://localhost:10000";
@Parameter(names = {"--use-multi-partition-keys", "-mp"}, description = "Use Multiple Partition Keys")
private Boolean useMultiPartitionKeys = false;
@Parameter(names = {"--help", "-h"}, help = true) @Parameter(names = {"--help", "-h"}, help = true)
public Boolean help = false; public Boolean help = false;
@@ -188,10 +192,16 @@ public class HoodieJavaApp {
writer = writer.option(DataSourceWriteOptions.HIVE_TABLE_OPT_KEY(), hiveTable) writer = writer.option(DataSourceWriteOptions.HIVE_TABLE_OPT_KEY(), hiveTable)
.option(DataSourceWriteOptions.HIVE_DATABASE_OPT_KEY(), hiveDB) .option(DataSourceWriteOptions.HIVE_DATABASE_OPT_KEY(), hiveDB)
.option(DataSourceWriteOptions.HIVE_URL_OPT_KEY(), hiveJdbcUrl) .option(DataSourceWriteOptions.HIVE_URL_OPT_KEY(), hiveJdbcUrl)
.option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY(), "dateStr")
.option(DataSourceWriteOptions.HIVE_USER_OPT_KEY(), hiveUser) .option(DataSourceWriteOptions.HIVE_USER_OPT_KEY(), hiveUser)
.option(DataSourceWriteOptions.HIVE_PASS_OPT_KEY(), hivePass) .option(DataSourceWriteOptions.HIVE_PASS_OPT_KEY(), hivePass)
.option(DataSourceWriteOptions.HIVE_SYNC_ENABLED_OPT_KEY(), "true"); .option(DataSourceWriteOptions.HIVE_SYNC_ENABLED_OPT_KEY(), "true");
if (useMultiPartitionKeys) {
writer = writer.option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY(), "year,month,day")
.option(DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY(),
MultiPartKeysValueExtractor.class.getCanonicalName());
} else {
writer = writer.option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY(), "dateStr");
}
} }
return writer; return writer;
} }

View File

@@ -39,6 +39,8 @@
<module>packaging/hoodie-hadoop-mr-bundle</module> <module>packaging/hoodie-hadoop-mr-bundle</module>
<module>packaging/hoodie-hive-bundle</module> <module>packaging/hoodie-hive-bundle</module>
<module>packaging/hoodie-spark-bundle</module> <module>packaging/hoodie-spark-bundle</module>
<module>docker/hoodie/hadoop</module>
<module>hoodie-integ-test</module>
</modules> </modules>
<licenses> <licenses>