[GH-15470] Upgrade Hadoop Libraries in Main Standalone Jar (#15469)
* [PUBDEV-9089] Upgrade Hadoop Libraries in Main Standalone Jar

(cherry picked from commit a2e9f68)

* Add hadoop commons

* Add some transitive dependencies for hdfs

* Remove orc dependency for main

* Specify different version of hadoop for orc

* remove hive exec version spec from hive properties

* Change hadoop version in orc tests

* Fix usage of http client

* Add libs for s3 persist tests

* Update tests

* Update tests

* add hadoop-common to persist-s3 tests

* Refactor usage of hadoop-mapreduce-client-core

* update usage of hadoop-common

* try to relocate hadoop libraries

* Revert relocation

* Relocated hadoop libs

* add hadoop-common.jar to tests

* exclude org.apache.hadoop.net.DNSDomainNameResolver from relocation

* exclude org.apache.hadoop.security.JniBasedUnixGroupsMappingWithFallback

* include hive shims common

* remove hadoop common

* remove s3n filesystem from hadoop tests

* Upgrade hive shims common

* Exclude most of the Shim classes

* Add some comments to build.gradle about extensions

* Fix S3 tests
mn-mikke authored Aug 17, 2023
1 parent ac135bd commit d817ab9
Showing 14 changed files with 85 additions and 80 deletions.
8 changes: 1 addition & 7 deletions gradle.properties
@@ -18,9 +18,6 @@ doFindbugs=false
# Run animal sniffer to verify compatibility of API with actual Java version
doAnimalSniffer=false

# include ORC support inside default h2o.jar.
doIncludeOrc=false

# include MOJO Pipeline support inside default h2o.jar.
doIncludeMojoPipeline=false

@@ -55,12 +52,9 @@ httpClientVersion=4.5.2
defaultParquetVersion=1.12.3

# Default Hadoop client version
defaultHadoopVersion=2.8.4
defaultHadoopVersion=3.3.5
defaultHdfsDependency=hadoop-hdfs-client

# Default Hive version
defaultHiveExecVersion=1.1.0

defaultWebserverModule=h2o-jetty-9
# default module to be included in assemblies
defaultExtWebserverModule=h2o-jetty-9-ext
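For context: the entries above are ordinary Gradle project properties, so every module build script can reference them directly. A minimal sketch of a consuming module, assuming nothing beyond the properties defined in this file:

// Hypothetical module build.gradle consuming the shared version properties.
dependencies {
    api("org.apache.hadoop:${defaultHdfsDependency}:${defaultHadoopVersion}")
    api("org.apache.parquet:parquet-hadoop:${defaultParquetVersion}")
}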
33 changes: 30 additions & 3 deletions h2o-assemblies/main/build.gradle
@@ -17,12 +17,28 @@ dependencies {
api project(":h2o-persist-http")
api project(":h2o-persist-hdfs")
api project(":h2o-ext-krbstandalone")
if (project.hasProperty("doIncludeOrc") && project.doIncludeOrc == "true") {
api project(":h2o-orc-parser")
}
api project(":h2o-parquet-parser")
api("org.apache.hadoop:hadoop-mapreduce-client-core:${defaultHadoopVersion}") {
transitive = false
}
api project(":h2o-k8s-int")

api "org.apache.hadoop:hadoop-hdfs-client:${defaultHadoopVersion}"
implementation("org.apache.hadoop:hadoop-mapreduce-client-core:${defaultHadoopVersion}") {
transitive = false
}
api("org.apache.hadoop:hadoop-common:${defaultHadoopVersion}") {
exclude group: "com.sun.jersey"
exclude group: "javax.servlet"
exclude group: "org.apache.avro"
exclude group: "org.apache.curator"
exclude group: "org.apache.zookeeper"
exclude group: "org.eclipse.jetty"
}

// Need a newer org.apache.hadoop.hive.shims.ShimLoader to make older Hive JDBC drivers work on Hadoop 3.
implementation 'org.apache.hive.shims:hive-shims-common:2.3.9'

constraints {
api('com.fasterxml.jackson.core:jackson-databind:2.13.4.2') {
because 'Fixes CVE-2022-42003'
@@ -50,6 +66,17 @@ shadowJar {
exclude 'test.properties'
exclude 'cockpitlite.properties'
exclude 'devpay_products.properties'

// Need a newer org.apache.hadoop.hive.shims.ShimLoader to make older Hive JDBC drivers work on Hadoop 3.
// Exclude all other classes of org.apache.hive.shims:hive-shims-common.
exclude 'org/apache/hadoop/hive/thrift/**/*.*'
exclude 'org/apache/hadoop/hive/io/**/*.*'
exclude 'org/apache/hadoop/hive/upgrade/**/*.*'
exclude 'org/apache/hadoop/hive/shims/Utils.*'
exclude 'org/apache/hadoop/hive/shims/CombineHiveKey.*'
exclude 'org/apache/hadoop/hive/shims/*Shims*.*'
exclude 'org/apache/hadoop/hive/shims/HiveHarFileSystem.*'

manifest {
attributes 'Main-Class': 'water.H2OApp'
attributes 'Add-Opens': 'java.base/java.lang java.base/java.util java.base/java.lang.reflect'
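To confirm the excludes above leave only the ShimLoader classes in the shaded jar, a small verification task can list what survived. A minimal sketch, assuming a recent Gradle/Shadow version (the task name is made up):

// Hypothetical helper task: list the hive-shims classes bundled into the shaded jar.
tasks.register('listBundledShims') {
    dependsOn shadowJar
    doLast {
        def jar = shadowJar.archiveFile.get().asFile
        new java.util.zip.ZipFile(jar).entries().each { entry ->
            if (entry.name.startsWith('org/apache/hadoop/hive/shims/')) {
                println entry.name // expect ShimLoader and its helpers, little else
            }
        }
    }
}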
17 changes: 4 additions & 13 deletions h2o-assemblies/steam/build.gradle
@@ -25,15 +25,13 @@ dependencies {
api(project(":h2o-persist-s3")) {
exclude group: "org.apache.hadoop"
}
api(project(":h2o-persist-hdfs")) {
exclude group: "org.apache.hadoop"
}
api(project(":h2o-persist-hdfs"))
api(project(":h2o-parquet-parser")) {
exclude group: "org.apache.hadoop"
}
// Force the latest Hadoop version, with unused components excluded - we need Hadoop for Parquet and S3A export
api "org.apache.hadoop:hadoop-hdfs-client:3.3.5"
api("org.apache.hadoop:hadoop-common:3.3.5") {
api "org.apache.hadoop:hadoop-hdfs-client:${defaultHadoopVersion}"
api("org.apache.hadoop:hadoop-common:${defaultHadoopVersion}") {
exclude group: "com.sun.jersey"
exclude group: "javax.servlet"
exclude group: "org.apache.avro"
@@ -42,18 +40,11 @@ dependencies {
exclude group: "org.eclipse.jetty"
exclude group: "org.apache.hadoop.thirdparty", module: "hadoop-shaded-protobuf_3_7"
}
api("org.apache.hadoop:hadoop-aws:3.3.5") {
exclude group: "com.amazonaws", module: "aws-java-sdk-bundle"
}
// aws-java-sdk-dynamodb is required for S3A support; without it, S3A import throws NoClassDefFoundError (AmazonDynamoDBException)
api("com.amazonaws:aws-java-sdk-dynamodb:${awsJavaSdkVersion}") {
transitive = false
}
// Upgrade dependencies coming from Hadoop to address vulnerabilities
api "org.apache.commons:commons-compress:1.21"
// Force specific Parquet version to avoid dependency on vulnerable FasterXML jackson-mapper-asl
api "org.apache.parquet:parquet-hadoop:${defaultParquetVersion}"
api("org.apache.hadoop:hadoop-mapreduce-client-core:3.3.5") {
api("org.apache.hadoop:hadoop-mapreduce-client-core:${defaultHadoopVersion}") {
transitive = false
}
// Google OAuth force version
1 change: 1 addition & 0 deletions h2o-extensions/krbstandalone/build.gradle
@@ -3,6 +3,7 @@ description = "H2O Kerberos Standalone support"
dependencies {
api project(":h2o-core")
api project(":h2o-persist-hdfs")
compileOnly("org.apache.hadoop:hadoop-common:$defaultHadoopVersion")
api("org.apache.hadoop:hadoop-auth:$defaultHadoopVersion") {
// Pull in all dependencies to allow running directly from the IDE or command line
transitive = true
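The compileOnly line follows a pattern this commit applies across several modules: hadoop-common is needed to compile, but only the assembly jar should bundle it, so modules declare it compileOnly and tests add it back to their own classpath. A minimal sketch of the idiom, assuming a module that compiles against Hadoop:

dependencies {
    // Compile against Hadoop without forcing it into consumers' runtime classpath.
    compileOnly("org.apache.hadoop:hadoop-common:$defaultHadoopVersion")
    // Tests run standalone, so hadoop-common must be added back for them.
    testImplementation("org.apache.hadoop:hadoop-common:$defaultHadoopVersion")
}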
15 changes: 8 additions & 7 deletions h2o-parsers/h2o-orc-parser/build.gradle
@@ -12,8 +12,10 @@ configurations{
}

dependencies {
hadoopCommonExclude("org.apache.hadoop:hadoop-common:${defaultHadoopVersion}")
hiveExecExclude("org.apache.hive:hive-exec:$defaultHiveExecVersion"){
def hadoopVersion = "2.8.4"
def hiveExecVersion = "1.1.0"
hadoopCommonExclude("org.apache.hadoop:hadoop-common:$hadoopVersion")
hiveExecExclude("org.apache.hive:hive-exec:$hiveExecVersion"){
// this dependency needs to be excluded manually as Gradle can't find it in Maven Central
exclude group: 'org.pentaho', module: 'pentaho-aggdesigner-algorithm'
exclude group: 'eigenbase', module: 'eigenbase-properties'
@@ -23,14 +25,13 @@ dependencies {
api(project(":h2o-persist-hdfs")) {
exclude group: 'ai.h2o', module: 'h2o-core'
exclude group: 'net.java.dev.jets3t', module: 'jets3t'
exclude group: 'org.apache.hadoop', module: 'hadoop-client'
exclude group: 'org.apache.hadoop', module: 'hadoop-aws'
exclude group: 'org.apache.hadoop'
}

// Note: What is the connection between the hive-exec, Hadoop, and ORC versions?
// Note: In this case we are using a Hive version that is compatible with the Hadoop version pinned above
// Note: for newer versions it should be replaced by hive-orc
api("org.apache.hive:hive-exec:$defaultHiveExecVersion") {
api("org.apache.hive:hive-exec:$hiveExecVersion") {
// we can't use transitive=false so we need to exclude the dependencies manually
configurations.hiveExecExclude.getResolvedConfiguration().getResolvedArtifacts().each {
if (it.moduleVersion.id.group != "org.apache.hive" && it.moduleVersion.id.module.name != "hive-exec") {
@@ -40,7 +41,7 @@ dependencies {
exclude group: 'org.pentaho', module: 'pentaho-aggdesigner-algorithm'
}
// For compilation we need common
api("org.apache.hadoop:hadoop-common:$defaultHadoopVersion") {
api("org.apache.hadoop:hadoop-common:$hadoopVersion") {
// we can't use transitive=false so we need to exclude the dependencies manually
configurations.hadoopCommonExclude.getResolvedConfiguration().getResolvedArtifacts().each {
if (it.moduleVersion.id.group != "org.apache.hadoop" && it.moduleVersion.id.module.name != "hadoop-common") {
@@ -52,7 +53,7 @@ dependencies {
testImplementation project(":h2o-test-support")
testRuntimeOnly project(":${defaultWebserverModule}")
// We need the correct version of MapR Hadoop to run the JUnit tests
testRuntimeOnly("org.apache.hadoop:hadoop-client:$defaultHadoopVersion") {
testRuntimeOnly("org.apache.hadoop:hadoop-client:$hadoopVersion") {
exclude module: "jasper-runtime"
exclude module: "jasper-compiler"
exclude module: "curator-client"
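The hadoopCommonExclude/hiveExecExclude configurations above implement a manual-exclusion idiom: since (per the build comments) transitive = false can't be used here, a helper configuration is resolved first and each of its transitive artifacts is excluded from the real dependency one by one. A standalone sketch, with an illustrative configuration name and version:

configurations { probe } // throwaway configuration, resolved only to enumerate transitives
dependencies {
    probe("org.apache.hive:hive-exec:1.1.0")
    api("org.apache.hive:hive-exec:1.1.0") {
        configurations.probe.resolvedConfiguration.resolvedArtifacts.each { artifact ->
            def id = artifact.moduleVersion.id
            if (!(id.group == "org.apache.hive" && id.name == "hive-exec")) {
                exclude group: id.group, module: id.name
            }
        }
    }
}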
@@ -16,9 +16,7 @@ dependencies {
}
// Parquet support
api("org.apache.parquet:parquet-hadoop:1.7.0")
api("org.apache.hadoop:hadoop-common:${parquetHadoopVersion}") {
transitive = false
}
compileOnly("org.apache.hadoop:hadoop-common:${parquetHadoopVersion}")

testImplementation project(":h2o-test-support")
testImplementation project(":h2o-parquet-parser-tests")
@@ -28,6 +26,7 @@ dependencies {
testImplementation("org.apache.hadoop:hadoop-client:${parquetHadoopVersion}") {
exclude module: "servlet-api"
}
testImplementation("org.apache.hadoop:hadoop-common:${parquetHadoopVersion}")
}

apply from: "${rootDir}/gradle/dataCheck.gradle"
3 changes: 3 additions & 0 deletions h2o-parsers/h2o-parquet-parser/build.gradle
@@ -6,6 +6,9 @@ description = "H2O Parquet Parser"
dependencies {
testImplementation project(":h2o-test-support")
testImplementation project(":h2o-parquet-parser-tests")
testImplementation("org.apache.hadoop:hadoop-mapreduce-client-core:${defaultHadoopVersion}") {
transitive = false
}
testRuntimeOnly project(":${defaultWebserverModule}")
}

20 changes: 2 additions & 18 deletions h2o-parsers/h2o-parquet-parser/parquet_dependencies.gradle
@@ -1,14 +1,7 @@
def parquetHadoopVersion = binding.variables.get("hadoopVersion") ?
binding.variables.get("hadoopVersion") : defaultHadoopVersion

configurations{
// Configuration used to get all transitive dependencies for org.apache.hadoop:hadoop-common
hadoopCommonExclude
}

dependencies {
hadoopCommonExclude("org.apache.hadoop:hadoop-common:${parquetHadoopVersion}")

api project(":h2o-core")
api(project(":h2o-persist-hdfs")) {
exclude group: 'ai.h2o', module: 'h2o-core'
@@ -21,15 +14,6 @@ dependencies {
api("org.apache.parquet:parquet-hadoop:${defaultParquetVersion}")


api("org.apache.hadoop:hadoop-common:${parquetHadoopVersion}") {
// we can't use transitive=false so we need to exclude the dependencies manually
configurations.hadoopCommonExclude.getResolvedConfiguration().getResolvedArtifacts().each {
if (it.moduleVersion.id.group != "org.apache.hadoop" && it.moduleVersion.id.module.name != "hadoop-common") {
exclude group: it.moduleVersion.id.group, module: it.moduleVersion.id.module.name
}
}
}
implementation("org.apache.hadoop:hadoop-mapreduce-client-core:${parquetHadoopVersion}") {
transitive = false
}
compileOnly("org.apache.hadoop:hadoop-common:${parquetHadoopVersion}")
testImplementation("org.apache.hadoop:hadoop-common:${parquetHadoopVersion}")
}
12 changes: 11 additions & 1 deletion h2o-persist-hdfs/build.gradle
@@ -8,15 +8,25 @@ configurations {

dependencies {
api project(":h2o-core")
compileOnly("org.apache.hadoop:hadoop-common:$defaultHadoopVersion")
api("org.apache.hadoop:$defaultHdfsDependency:$defaultHadoopVersion") {
// Pull in all dependencies to allow running directly from the IDE or command line
transitive = true
}
api("org.apache.hadoop:hadoop-aws:$defaultHadoopVersion")
api("org.apache.hadoop:hadoop-aws:${defaultHadoopVersion}") {
exclude group: "com.amazonaws", module: "aws-java-sdk-bundle"
}
// aws-java-sdk-dynamodb is required for S3A support; without it, S3A import throws NoClassDefFoundError (AmazonDynamoDBException)
api("com.amazonaws:aws-java-sdk-dynamodb:${awsJavaSdkVersion}") {
transitive = false
}

api("com.nimbusds:nimbus-jose-jwt:9.11.3")

testImplementation project(":h2o-test-support")
testImplementation "org.apache.hadoop:hadoop-common:$defaultHadoopVersion"
testImplementation "com.amazonaws:aws-java-sdk-s3:${awsJavaSdkVersion}"
testImplementation "org.jets3t:jets3t:0.9.7"
testRuntimeOnly project(":${defaultWebserverModule}")
testRuntimeOnly project(":h2o-persist-s3")
}
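Because hadoop-aws now excludes the fat aws-java-sdk-bundle, the slim aws-java-sdk-dynamodb artifact has to supply the DynamoDB classes that S3A references. A throwaway Groovy check, run on the assembled classpath, can confirm both halves are present (class names are from Hadoop and AWS SDK v1; the check itself is illustrative):

// Fails with ClassNotFoundException if either dependency is missing.
['org.apache.hadoop.fs.s3a.S3AFileSystem',
 'com.amazonaws.services.dynamodbv2.model.AmazonDynamoDBException'].each { name ->
    Class.forName(name)
    println "found: ${name}"
}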
@@ -23,8 +23,10 @@
public class PersistHdfsTest extends TestUtil {

@Parameterized.Parameters(name = "{index}: scheme={0}")
public static Object[] schemes() {
return new Object[] { "s3n", "s3a" };
public static Object[] schemes() {
return new Object[] {
// "s3n", - s3n is not supported by hadoop-aws 3.0+
"s3a" };
}

@Parameterized.Parameter
@@ -2,11 +2,9 @@

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.s3.S3FileSystem;
import org.jets3t.service.S3Service;
import org.jets3t.service.model.S3Object;
import org.apache.hadoop.fs.s3a.S3AFileSystem;
import com.amazonaws.services.s3.model.S3Object;
import org.junit.BeforeClass;
import org.junit.Ignore;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.ExpectedException;
@@ -37,29 +35,14 @@ public void testPubDev5663() throws Exception { // Demonstrates that S3FileSyste

PersistHdfs hdfsPersist = (PersistHdfs) H2O.getPM().getPersistForURI(URI.create("hdfs://localhost/"));

String existing = "s3://" + bucket + "/" + key;
String existing = "s3a://" + bucket + "/" + key;
Path p = new Path(existing);

S3FileSystem fs = (S3FileSystem) FileSystem.get(p.toUri(), PersistHdfs.CONF);
// use crazy reflection to get to the actual S3 Service instance
S3Service s3Service = (S3Service) getValue(fs, "store", "h", "proxyDescriptor", "fpp", "proxy", "s3Service");

S3Object s3Object = s3Service.getObject(bucket, key);
S3AFileSystem fs = (S3AFileSystem) FileSystem.get(p.toUri(), PersistHdfs.CONF);
S3Object s3Object = fs.getAmazonS3ClientForTesting("testPubDev5663").getObject(bucket, key);

assertNotNull(s3Object); // The object exists
assertFalse(fs.exists(p)); // But FS says it doesn't => S3 is broken in Hadoop
assertFalse(hdfsPersist.exists(existing)); // Our persist gives the same result
}

private Object getValue(Object o, String... fieldNames) {
StringBuilder path = new StringBuilder(o.getClass().getName());
for (String f : fieldNames) {
path.append('.').append(f);
Object no = ReflectionUtils.getFieldValue(o, f);
if (no == null)
throw new IllegalStateException("Invalid path: " + path.toString() + ", object is instance of " + o.getClass());
o = no;
}
return o;
assert(fs.exists(p)); // But FS says it exists as well.
assert(hdfsPersist.exists(existing)); // Our persist gives the same result
}

}
5 changes: 5 additions & 0 deletions h2o-persist-s3/build.gradle
@@ -7,6 +7,7 @@ configurations {

dependencies {
api project(":h2o-core")
compileOnly("org.apache.hadoop:hadoop-common:$defaultHadoopVersion")
api "com.amazonaws:aws-java-sdk-s3:${awsJavaSdkVersion}"
api "com.amazonaws:aws-java-sdk-sts:${awsJavaSdkVersion}" // Required by WebIdentityTokenCredentialsProvider from AWS SDK
api "org.apache.httpcomponents:httpclient:${httpClientVersion}"
@@ -19,6 +20,10 @@ dependencies {
testRuntimeOnly project(":${defaultWebserverModule}")
testRuntimeOnly project(":h2o-parquet-parser")
testImplementation project(":h2o-persist-hdfs")
testImplementation "org.apache.hadoop:hadoop-common:$defaultHadoopVersion"
testImplementation("org.apache.hadoop:hadoop-mapreduce-client-core:$defaultHadoopVersion") {
transitive = false
}
}

apply from: "${rootDir}/gradle/dataCheck.gradle"
6 changes: 5 additions & 1 deletion scripts/jenkins/groovy/defineTestStages.groovy
@@ -589,7 +589,8 @@ def call(final pipelineContext) {
version: distribution.version,
commandFactory: 'h2o-3/scripts/jenkins/groovy/hadoopCommands.groovy',
ldapConfigPath: ldapConfigPath,
ldapConfigPathStandalone: 'scripts/jenkins/config/ldap-jetty-9.txt'
ldapConfigPathStandalone: 'scripts/jenkins/config/ldap-jetty-9.txt',
bundledS3FileSystems: 's3a,s3n'
],
pythonVersion: '3.6',
customDockerArgs: [ '--privileged' ],
@@ -599,6 +600,7 @@ def call(final pipelineContext) {
def standaloneStage = evaluate(stageTemplate.inspect())
standaloneStage.stageName = "${distribution.name.toUpperCase()} ${distribution.version} - STANDALONE"
standaloneStage.customData.mode = 'STANDALONE'
standaloneStage.customData.bundledS3FileSystems = 's3a'

def onHadoopStage = evaluate(stageTemplate.inspect())
onHadoopStage.stageName = "${distribution.name.toUpperCase()} ${distribution.version} - HADOOP"
@@ -672,10 +674,12 @@ def call(final pipelineContext) {
def standaloneStage = evaluate(stageTemplate.inspect())
standaloneStage.stageName = "${distribution.name.toUpperCase()} ${distribution.version} - STANDALONE"
standaloneStage.customData.mode = 'STANDALONE'
standaloneStage.customData.bundledS3FileSystems = 's3a'

def standaloneKeytabStage = evaluate(stageTemplate.inspect())
standaloneKeytabStage.stageName = "${distribution.name.toUpperCase()} ${distribution.version} - STANDALONE KEYTAB"
standaloneKeytabStage.customData.mode = 'STANDALONE_KEYTAB'
standaloneKeytabStage.customData.bundledS3FileSystems = 's3a'

def standaloneDriverKeytabStage = evaluate(stageTemplate.inspect())
standaloneDriverKeytabStage.stageName = "${distribution.name.toUpperCase()} ${distribution.version} - DRIVER KEYTAB"
1 change: 1 addition & 0 deletions scripts/jenkins/groovy/hadoopCommands.groovy
@@ -81,6 +81,7 @@ private GString getCommandStandalone(final stageConfig) {
fi
export CLOUD_IP=\$(hostname --ip-address)
export CLOUD_PORT=${defaultPort}
export HADOOP_S3_FILESYSTEMS=${stageConfig.customData.bundledS3FileSystems}
"""
}

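The new HADOOP_S3_FILESYSTEMS variable tells the test harness which S3 filesystems the jar under test bundles - the Jenkins stages above pass 's3a,s3n' for Hadoop runs and just 's3a' for standalone, since hadoop-aws 3.0+ dropped s3n. A sketch of how a test script might consume it (the variable name comes from the diff; the skip logic is illustrative):

// Skip scheme-specific tests the jar under test cannot serve.
def bundled = (System.getenv('HADOOP_S3_FILESYSTEMS') ?: 's3a').split(',') as List
['s3a', 's3n'].each { scheme ->
    if (!bundled.contains(scheme)) {
        println "Skipping ${scheme} tests: filesystem not bundled"
    }
}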
