Skip to content

Commit 08f94a2

Browse files
committed
HIVE-29419: Provide a Hive-specific docker image for Tez AM
1 parent 423fc83 commit 08f94a2

File tree

8 files changed

+304
-39
lines changed

8 files changed

+304
-39
lines changed

packaging/src/docker/Dockerfile

Lines changed: 42 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -67,19 +67,25 @@ RUN tar -xzv \
6767
tar -xzv \
6868
--exclude="apache-tez-$TEZ_VERSION-bin/share" \
6969
-f /opt/apache-tez-$TEZ_VERSION-bin.tar.gz \
70-
-C /opt
70+
-C /opt; \
71+
mkdir -p /opt/tez-snapshot;
7172

7273
FROM eclipse-temurin:21-jdk-ubi9-minimal AS run
7374

7475
ARG UID=1000
7576
ARG HADOOP_VERSION
7677
ARG HIVE_VERSION
7778
ARG TEZ_VERSION
79+
ARG TEZ_SNAPSHOT_VERSION=
80+
ARG TEZ_SNAPSHOT_REPO_URL=https://repository.apache.org/content/repositories/snapshots
81+
82+
# When snapshot jars are included, client version must match the snapshot version.
83+
ENV TEZ_CLIENT_VERSION=${TEZ_SNAPSHOT_VERSION:-$TEZ_VERSION}
7884

7985
# Install dependencies
8086
RUN set -ex; \
8187
microdnf update -y; \
82-
microdnf -y install procps gettext; \
88+
microdnf -y install procps gettext wget xmlstarlet; \
8389
microdnf clean all; \
8490
useradd --no-create-home -s /sbin/nologin -c "" --uid $UID hive
8591

@@ -94,6 +100,38 @@ ENV PATH=$HIVE_HOME/bin:$HADOOP_HOME/bin:$PATH
94100
COPY --from=env --chown=hive /opt/hadoop-$HADOOP_VERSION $HADOOP_HOME
95101
COPY --from=env --chown=hive /opt/apache-hive-$HIVE_VERSION-bin $HIVE_HOME
96102
COPY --from=env --chown=hive /opt/apache-tez-$TEZ_VERSION-bin $TEZ_HOME
103+
COPY --from=env --chown=hive /opt/tez-snapshot /opt/tez-snapshot
104+
105+
# When TEZ_SNAPSHOT_VERSION is set, fetch Tez snapshot jars from the Maven snapshot repository
# and place them under /opt/tez-snapshot. At runtime, entrypoint.sh symlinks these into
# $HIVE_HOME/lib with a "0-" prefix so they sort first in bin/hive's classpath glob, ensuring
# snapshot classes take precedence over the Tez release jars bundled with Hive.
# Maven snapshot repositories publish jars under timestamped, build-numbered filenames
# (e.g. tez-api-1.0.0-20250101.123456-1.jar), so we fetch maven-metadata.xml first to resolve
# the exact filename before downloading the jar.
#
# Implementation notes:
#  - Shell-form RUN executes under /bin/sh, so only POSIX constructs are used ("[ ]", not "[[ ]]").
#  - "${TEZ_SNAPSHOT_VERSION:-}" keeps "set -u" from aborting if the build arg is not exported.
#  - A trailing "/" on TEZ_SNAPSHOT_REPO_URL is stripped so URLs never contain "//".
#  - wget runs with -nv (not -q) so HTTP/network errors show up in the build log.
#  - The metadata scratch directory is removed in this same layer, so it never reaches the image.
RUN set -eux; \
    download_dir=/opt/tez-snapshot-download; \
    mkdir -p "${download_dir}"; \
    if [ -n "${TEZ_SNAPSHOT_VERSION:-}" ]; then \
        base_url="${TEZ_SNAPSHOT_REPO_URL%/}/org/apache/tez"; \
        for artifact in tez-common tez-api tez-dag tez-mapreduce tez-runtime-internals tez-runtime-library; do \
            version_url="${base_url}/${artifact}/${TEZ_SNAPSHOT_VERSION}"; \
            metadata_url="${version_url}/maven-metadata.xml"; \
            metadata_file="${download_dir}/${artifact}-maven-metadata.xml"; \
            echo "metadata_url=${metadata_url}"; \
            wget -nv "${metadata_url}" -O "${metadata_file}"; \
            snapshot_value="$(xmlstarlet sel -t -v "string(/metadata/versioning/snapshotVersions/snapshotVersion[extension='jar' and not(classifier)]/value)" "${metadata_file}")"; \
            if [ -z "${snapshot_value}" ]; then \
                echo "ERROR: no unclassified jar snapshotVersion found in ${metadata_url}" >&2; \
                exit 1; \
            fi; \
            jar_file="${artifact}-${snapshot_value}.jar"; \
            jar_url="${version_url}/${jar_file}"; \
            echo "jar_url=${jar_url}"; \
            wget -nv "${jar_url}" -O "/opt/tez-snapshot/${jar_file}"; \
        done; \
        echo "Downloaded Tez snapshot jars under /opt/tez-snapshot:"; \
        ls -1 /opt/tez-snapshot/*.jar; \
    else \
        echo "TEZ_SNAPSHOT_VERSION not set. Skipping Tez snapshot download."; \
    fi; \
    rm -rf "${download_dir}"
134+
97135

98136
COPY --chown=hive entrypoint.sh /
99137
COPY --chown=hive conf $HIVE_HOME/conf
@@ -104,7 +142,8 @@ RUN chmod +x /entrypoint.sh && \
104142
mkdir -p $HIVE_HOME/scratch && \
105143
chown hive $HIVE_HOME/scratch && \
106144
mkdir -p /home/hive/.beeline && \
107-
chown hive /home/hive/.beeline
145+
chown hive /home/hive/.beeline && \
146+
chown -R hive /opt/tez-snapshot
108147

109148
USER hive
110149
WORKDIR $HIVE_HOME

packaging/src/docker/build.sh

Lines changed: 29 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,13 +20,15 @@ set -eux
2020
HIVE_VERSION=
2121
HADOOP_VERSION=
2222
TEZ_VERSION=
23+
TEZ_SNAPSHOT_VERSION=
2324
usage() {
2425
cat <<EOF 1>&2
25-
Usage: $0 [-h] [-hadoop <Hadoop version>] [-tez <Tez version>] [-hive <Hive version>] [-repo <Docker repo>]
26+
Usage: $0 [-h] [-hadoop <Hadoop version>] -tez <Tez release version> [-tez-snapshot [<Maven snapshot version>]] [-hive <Hive version>] [-repo <Docker repo>]
2627
Build the Hive Docker image (reused for LLAP too)
2728
-help Display help
28-
-hadoop Build image with the specified Hadoop version
29-
-tez Build image with the specified Tez version
29+
-hadoop Build image with the specified Hadoop version (default: from Maven pom)
30+
-tez Required. Tez release tarball version (apache-tez-\$TEZ_VERSION-bin.tar.gz from archive)
31+
-tez-snapshot <ver> Optional. When a snapshot version is given, fetch Tez Maven snapshot jars into the image. With no version, snapshot prefetch is skipped.
3032
-hive Build image with the specified Hive version
3133
-repo Docker repository
3234
EOF
@@ -48,6 +50,13 @@ while [ $# -gt 0 ]; do
4850
TEZ_VERSION=$1
4951
shift
5052
;;
53+
-tez-snapshot)
54+
shift
55+
if [ $# -gt 0 ] && [[ "$1" != -* ]]; then
56+
TEZ_SNAPSHOT_VERSION=$1
57+
shift
58+
fi
59+
;;
5160
-hive)
5261
shift
5362
HIVE_VERSION=$1
@@ -64,6 +73,12 @@ while [ $# -gt 0 ]; do
6473
esac
6574
done
6675

76+
if [ -z "${TEZ_VERSION}" ]; then
77+
echo "Error: -tez <Tez version> is required." >&2
78+
usage
79+
exit 1
80+
fi
81+
6782
SCRIPT_DIR=$(cd $(dirname $0); pwd)
6883
SOURCE_DIR=${SOURCE_DIR:-"$SCRIPT_DIR/../../.."}
6984
repo=${REPO:-apache}
@@ -123,12 +138,20 @@ cp "$CACHE_DIR/apache-tez-$TEZ_VERSION-bin.tar.gz" "$WORK_DIR/"
123138
cp -R "$SOURCE_DIR/packaging/src/docker/conf" "$WORK_DIR/"
124139
cp -R "$SOURCE_DIR/packaging/src/docker/entrypoint.sh" "$WORK_DIR/"
125140
cp "$SOURCE_DIR/packaging/src/docker/Dockerfile" "$WORK_DIR/"
141+
142+
DOCKER_BUILD_ARGS=(
143+
--build-arg "HIVE_VERSION=$HIVE_VERSION"
144+
--build-arg "HADOOP_VERSION=$HADOOP_VERSION"
145+
--build-arg "TEZ_VERSION=$TEZ_VERSION"
146+
)
147+
if [ -n "$TEZ_SNAPSHOT_VERSION" ]; then
148+
DOCKER_BUILD_ARGS+=(--build-arg "TEZ_SNAPSHOT_VERSION=$TEZ_SNAPSHOT_VERSION")
149+
fi
150+
126151
docker build \
127152
"$WORK_DIR" \
128153
-f "$WORK_DIR/Dockerfile" \
129154
-t "$repo/hive:$HIVE_VERSION" \
130-
--build-arg "HIVE_VERSION=$HIVE_VERSION" \
131-
--build-arg "HADOOP_VERSION=$HADOOP_VERSION" \
132-
--build-arg "TEZ_VERSION=$TEZ_VERSION"
155+
"${DOCKER_BUILD_ARGS[@]}"
133156

134157
rm -r "${WORK_DIR}"

packaging/src/docker/conf/hive-site.xml.template

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,4 +76,25 @@
7676
<name>hive.query.results.cache.directory</name>
7777
<value>${HIVE_QUERY_RESULTS_CACHE_DIRECTORY}</value>
7878
</property>
79+
<property>
80+
<name>hive.server2.tez.initialize.default.sessions</name>
81+
<value>false</value>
82+
</property>
83+
<property>
84+
<name>hive.server2.tez.use.external.sessions</name>
85+
<value>${HIVE_SERVER2_TEZ_USE_EXTERNAL_SESSIONS}</value>
86+
</property>
87+
<!--
88+
Tez external sessions are registered under a hardcoded registry namespace prefix (/tez-external-sessions).
89+
The actual tez.am.registry.namespace value is appended to this prefix.
90+
Once Hive can use the registry client that Tez provides (ZkAMRegistryClient), this property will be removed.
91+
-->
92+
<property>
93+
<name>hive.server2.tez.external.sessions.namespace</name>
94+
<value>/tez-external-sessions${TEZ_AM_REGISTRY_NAMESPACE}</value>
95+
</property>
96+
<property>
97+
<name>hive.server2.tez.external.sessions.registry.class</name>
98+
<value>org.apache.hadoop.hive.ql.exec.tez.ZookeeperExternalSessionsRegistryClient</value>
99+
</property>
79100
</configuration>
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
{
2+
"taskSchedulerDescriptors": [
3+
{
4+
"className": "org.apache.hadoop.hive.llap.tezplugins.LlapTaskSchedulerService",
5+
"entityName": "LLAP"
6+
}
7+
],
8+
"containerLauncherDescriptors": [
9+
{
10+
"className": "org.apache.hadoop.hive.llap.tezplugins.LlapContainerLauncher",
11+
"entityName": "LLAP"
12+
}
13+
],
14+
"taskCommunicatorDescriptors": [
15+
{
16+
"className": "org.apache.hadoop.hive.llap.tezplugins.LlapTaskCommunicator",
17+
"entityName": "LLAP"
18+
}
19+
],
20+
"enableContainers": false,
21+
"enableUber": false
22+
}
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
<?xml version="1.0"?>
2+
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
3+
<!--
4+
Licensed to the Apache Software Foundation (ASF) under one or more
5+
contributor license agreements. See the NOTICE file distributed with
6+
this work for additional information regarding copyright ownership.
7+
The ASF licenses this file to You under the Apache License, Version 2.0
8+
(the "License"); you may not use this file except in compliance with
9+
the License. You may obtain a copy of the License at
10+
11+
http://www.apache.org/licenses/LICENSE-2.0
12+
13+
Unless required by applicable law or agreed to in writing, software
14+
distributed under the License is distributed on an "AS IS" BASIS,
15+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16+
See the License for the specific language governing permissions and
17+
limitations under the License.
18+
-->
19+
<configuration>
20+
<property>
21+
<name>tez.am.mode.session</name>
22+
<value>true</value>
23+
</property>
24+
<property>
25+
<name>tez.am.framework.mode</name>
26+
<value>${TEZ_FRAMEWORK_MODE}</value>
27+
</property>
28+
<property>
29+
<name>tez.am.zookeeper.quorum</name>
30+
<value>${TEZ_AM_ZOOKEEPER_QUORUM}</value>
31+
</property>
32+
<property>
33+
<name>tez.am.registry.namespace</name>
34+
<value>${TEZ_AM_REGISTRY_NAMESPACE}</value>
35+
</property>
36+
<property>
37+
<name>tez.local.mode</name>
38+
<value>false</value>
39+
</property>
40+
<property>
41+
<name>tez.am.tez-ui.webservice.enable</name>
42+
<value>false</value>
43+
</property>
44+
<!-- The Tez AM should not time out in ZK mode -->
45+
<property>
46+
<name>tez.session.am.dag.submit.timeout.secs</name>
47+
<value>-1</value>
48+
</property>
49+
<property>
50+
<name>tez.ignore.lib.uris</name>
51+
<value>true</value>
52+
</property>
53+
54+
<!--
55+
In standalone AM mode with LLAP, the task scheduler (LlapTaskSchedulerService) and task
56+
communicator (LlapTaskCommunicator) are instantiated at AM startup from service_plugins_descriptor.json
57+
on the classpath — not from the DAG payload as in YARN-submitted mode. These plugins read their
58+
configuration from TezConfiguration, which is built exclusively from tez-site.xml on the classpath.
59+
Hive-specific properties must therefore appear here so the AM can find LLAP daemons and connect
60+
to them, even though they are conceptually Hive settings.
61+
-->
62+
<property>
63+
<name>hive.zookeeper.quorum</name>
64+
<value>${HIVE_ZOOKEEPER_QUORUM}</value>
65+
</property>
66+
<property>
67+
<name>hive.llap.daemon.service.hosts</name>
68+
<value>${HIVE_LLAP_DAEMON_SERVICE_HOSTS}</value>
69+
</property>
70+
<property>
71+
<name>hive.llap.daemon.umbilical.port</name>
72+
<value>33333</value>
73+
</property>
74+
</configuration>
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
log4j.rootLogger=INFO, console
2+
log4j.appender.console=org.apache.log4j.ConsoleAppender
3+
log4j.appender.console.Target=System.err
4+
log4j.appender.console.layout=org.apache.log4j.PatternLayout
5+
log4j.appender.console.layout.ConversionPattern=%d{ISO8601} %-5p %c{1} - %m%n

packaging/src/docker/docker-compose.yml

Lines changed: 29 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -124,14 +124,35 @@ services:
124124
- zookeeper_datalog:/datalog
125125
- zookeeper_logs:/logs
126126

127-
#TODO Tez AM container (in the meantime, the HS2(with local Tez AM) + LLAP daemon setup is working properly)
128-
# 1. Define and use a Tez AM image from HIVE-29419 or TEZ-4682
129-
# 2. Configure TezAM to use Zookeeper Llap Registry to discover the LLAP daemon
130-
# 3. Configure HiveServer2 to use the Tez AM Zookeeper Registry to discover the Tez AM
131-
# Prerequisites:
132-
# - tez-api 1.0.0-SNAPSHOT jar injected into HiveSever2 until Tez 1.0.0 is released
133-
# - make HIVE-29477 happen to let HiveServer2 use Tez external sessions
134-
# 4. Define hadoop components here to be used by all the containers (working example can be found at TEZ-4682), currently a local volume
127+
tezam:
128+
profiles:
129+
- llap
130+
image: apache/hive:${HIVE_VERSION}
131+
container_name: tezam
132+
hostname: tezam
133+
depends_on:
134+
- zookeeper
135+
restart: on-failure:3
136+
environment:
137+
USER: hive
138+
SERVICE_NAME: 'tezam'
139+
140+
TEZ_FRAMEWORK_MODE: STANDALONE_ZOOKEEPER
141+
TEZ_AM_ZOOKEEPER_QUORUM: zookeeper:2181
142+
143+
# LLAP daemon discovery
144+
HIVE_ZOOKEEPER_QUORUM: zookeeper:2181
145+
LLAP_SERVICE_HOSTS: '@llap0'
146+
147+
# Directories shared between HiveServer2 and LLAP daemon
148+
HIVE_SCRATCH_DIR: /opt/hive/scratch
149+
HIVE_QUERY_RESULTS_CACHE_DIRECTORY: /opt/hive/scratch/_resultscache_
150+
151+
volumes:
152+
- warehouse:/opt/hive/data/warehouse
153+
- scratch:/opt/hive/scratch
154+
networks:
155+
- hive
135156

136157
llapdaemon:
137158
profiles:

0 commit comments

Comments
 (0)