From f94c3f6a38ba86d26f808d5c6576551e7fe09e62 Mon Sep 17 00:00:00 2001 From: Arunkumar Chacko Date: Tue, 27 May 2025 22:17:54 +0000 Subject: [PATCH 1/8] HADOOP-19343: GCS / Add implementation for create() API Closes #7656 Signed-off-by: Chris Nauroth --- hadoop-project/pom.xml | 7 +- .../dev-support/findbugs-exclude.xml | 29 + hadoop-tools/hadoop-gcp/pom.xml | 503 +++++++++++++++++ .../src/config/checkstyle-suppressions.xml | 26 + .../org/apache/hadoop/fs/gs/Constants.java | 31 ++ .../apache/hadoop/fs/gs/CreateOptions.java | 121 +++++ .../hadoop/fs/gs/ErrorTypeExtractor.java | 60 +++ .../org/apache/hadoop/fs/gs/FileInfo.java | 203 +++++++ .../hadoop/fs/gs/GoogleCloudStorage.java | 261 +++++++++ .../GoogleCloudStorageClientWriteChannel.java | 116 ++++ .../fs/gs/GoogleCloudStorageFileSystem.java | 89 +++ .../fs/gs/GoogleCloudStorageItemInfo.java | 423 +++++++++++++++ .../hadoop/fs/gs/GoogleHadoopFileSystem.java | 505 ++++++++++++++++++ .../GoogleHadoopFileSystemConfiguration.java | 75 +++ .../fs/gs/GoogleHadoopOutputStream.java | 124 +++++ .../fs/gs/HadoopConfigurationProperty.java | 90 ++++ .../hadoop/fs/gs/StorageResourceId.java | 328 ++++++++++++ .../org/apache/hadoop/fs/gs/StringPaths.java | 169 ++++++ .../org/apache/hadoop/fs/gs/UriPaths.java | 113 ++++ .../hadoop/fs/gs/VerificationAttributes.java | 68 +++ .../org/apache/hadoop/fs/gs/package-info.java | 23 + .../hadoop/fs/gs/TestStorageResourceId.java | 285 ++++++++++ .../apache/hadoop/fs/gs/TestStringPaths.java | 164 ++++++ .../org/apache/hadoop/fs/gs/TestUriPaths.java | 150 ++++++ .../org/apache/hadoop/fs/gs/package-info.java | 22 + hadoop-tools/pom.xml | 1 + 26 files changed, 3985 insertions(+), 1 deletion(-) create mode 100644 hadoop-tools/hadoop-gcp/dev-support/findbugs-exclude.xml create mode 100644 hadoop-tools/hadoop-gcp/pom.xml create mode 100644 hadoop-tools/hadoop-gcp/src/config/checkstyle-suppressions.xml create mode 100644 hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/Constants.java 
create mode 100644 hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/CreateOptions.java create mode 100644 hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/ErrorTypeExtractor.java create mode 100644 hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/FileInfo.java create mode 100644 hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorage.java create mode 100644 hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorageClientWriteChannel.java create mode 100644 hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorageFileSystem.java create mode 100644 hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorageItemInfo.java create mode 100644 hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleHadoopFileSystem.java create mode 100644 hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleHadoopFileSystemConfiguration.java create mode 100644 hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleHadoopOutputStream.java create mode 100644 hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/HadoopConfigurationProperty.java create mode 100644 hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/StorageResourceId.java create mode 100644 hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/StringPaths.java create mode 100644 hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/UriPaths.java create mode 100644 hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/VerificationAttributes.java create mode 100644 hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/package-info.java create mode 100644 hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/TestStorageResourceId.java create mode 100644 hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/TestStringPaths.java create mode 100644 
hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/TestUriPaths.java create mode 100644 hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/package-info.java diff --git a/hadoop-project/pom.xml b/hadoop-project/pom.xml index 35d71e583b0a5..a5bf2ed66947a 100644 --- a/hadoop-project/pom.xml +++ b/hadoop-project/pom.xml @@ -108,7 +108,7 @@ 3.0.5 3.6.1 - 27.0-jre + 33.1.0-jre 5.1.0 1.78.1 @@ -2157,6 +2157,11 @@ failsafe 2.4.4 + + com.google.cloud + google-cloud-storage + 2.44.1 + diff --git a/hadoop-tools/hadoop-gcp/dev-support/findbugs-exclude.xml b/hadoop-tools/hadoop-gcp/dev-support/findbugs-exclude.xml new file mode 100644 index 0000000000000..80be329bd6d16 --- /dev/null +++ b/hadoop-tools/hadoop-gcp/dev-support/findbugs-exclude.xml @@ -0,0 +1,29 @@ + + + + + + + + + + + + + + diff --git a/hadoop-tools/hadoop-gcp/pom.xml b/hadoop-tools/hadoop-gcp/pom.xml new file mode 100644 index 0000000000000..d5744f1f97c44 --- /dev/null +++ b/hadoop-tools/hadoop-gcp/pom.xml @@ -0,0 +1,503 @@ + + + + 4.0.0 + + org.apache.hadoop + hadoop-project + 3.5.0-SNAPSHOT + ../../hadoop-project + + hadoop-gcp + 3.5.0-SNAPSHOT + Apache Hadoop Google Cloud Platform support + + This module contains code to support integration with Google Cloud Platform. + It also declares the dependencies needed to work with Google Cloud Storage. 
+ + jar + + + UTF-8 + true + ${project.build.directory}/test + + + + + tests-off + + + src/test/resources/auth-keys.xml + + + + true + + + + tests-on + + + src/test/resources/auth-keys.xml + + + + false + + + + parallel-tests + + + parallel-tests + + + + + + org.apache.hadoop + hadoop-maven-plugins + + + parallel-tests-createdir + + parallel-tests-createdir + + + + + + org.apache.maven.plugins + maven-surefire-plugin + + ${testsThreadCount} + false + false + ${maven-surefire-plugin.argLine} -DminiClusterDedicatedDirs=true + + ${testsThreadCount} + ${test.build.data}/${surefire.forkNumber} + ${test.build.dir}/${surefire.forkNumber} + ${hadoop.tmp.dir}/${surefire.forkNumber} + job-${job.id}-fork-000${surefire.forkNumber} + + + + + org.apache.maven.plugins + maven-failsafe-plugin + + + default-integration-test + + integration-test + verify + + + ${testsThreadCount} + false + ${maven-surefire-plugin.argLine} -DminiClusterDedicatedDirs=true + ${fs.gs.scale.test.timeout} + false + + + true + ${test.build.data}/${surefire.forkNumber} + ${test.build.dir}/${surefire.forkNumber} + ${hadoop.tmp.dir}/${surefire.forkNumber} + + + + + + job-${job.id}-fork-000${surefire.forkNumber} + ${test.integration.timeout} + + + **/ITest*.java + + + + + sequential-integration-tests + + integration-test + verify + + + ${fs.gs.scale.test.timeout} + false + + + false + job-${job.id} + + + + + **/ITest*.java + + + + + + + + + + sequential-tests + + + !parallel-tests + + + + + + org.apache.maven.plugins + maven-failsafe-plugin + + + + integration-test + verify + + + false + + job-${job.id} + + + + + + + + + + + + + + maven-shade-plugin + + + package + + shade + + + + + + + + + + + com.google.auth:* + + ** + + + + io.grpc:* + + ** + + + + io.opencensus:* + + ** + + + + *:* + + *.json + google/** + grpc/** + + + + + + com.google.api + com.google.api-client + com.google.api.grpc + com.google.apis + com.google.auth + com.google.cloud + com.google.cloud.bigdataoss + com.google.cloud.grpc + 
com.google.cloud.http + com.google.flogger + com.google.code.gson + com.google.guava + com.google.http-client + com.google.oauth-client + com.google.protobuf + com.google.re2j + com.google.storage.v2 + com.lmax + io.grpc + io.opencensus + io.perfmark + org.apache.httpcomponents + org.threeten:threetenbp + + + true + + + com + com.google.cloud.hadoop.repackaged.ossgcs.com + + com.google.api.** + com.google.api.gax.** + com.google.auth.** + com.google.cloud.* + com.google.cloud.audit.** + com.google.cloud.grpc.** + com.google.cloud.hadoop.gcsio.** + com.google.cloud.hadoop.util.** + com.google.cloud.http.** + com.google.cloud.monitoring.** + com.google.cloud.spi.** + com.google.cloud.storage.** + com.google.common.** + com.google.geo.** + com.google.gson.** + com.google.google.storage.** + com.google.iam.** + com.google.logging.** + com.google.longrunning.** + com.google.monitoring.** + com.google.protobuf.** + com.google.re2j.** + com.google.rpc.** + com.google.storage.** + com.google.thirdparty.** + com.google.type.** + com.lmax.disruptor.** + + + com.google.cloud.hadoop.util.AccessTokenProvider + com.google.cloud.hadoop.util.AccessTokenProvider$AccessToken + com.google.cloud.hadoop.util.AccessTokenProvider$AccessTokenType + com.google.cloud.hadoop.util.AccessBoundary + com.google.cloud.hadoop.util.AccessBoundary$Action + com.google.cloud.hadoop.util.AutoValue_AccessBoundary + + + + org + com.google.cloud.hadoop.repackaged.ossgcs.org + + org.apache.http.** + org.threeten.** + + + + + io.grpc.netty.shaded + + com.google.cloud.hadoop.repackaged.ossgcs.io.grpc.netty.shaded + + + + io + com.google.cloud.hadoop.repackaged.ossgcs.io + + io.grpc.** + io.opencensus.** + io.perfmark.** + + + + META-INF/native/io_grpc_netty_shaded_ + + META-INF/native/com_google_cloud_hadoop_repackaged_gcs_io_grpc_netty_shaded_ + + + + META-INF/native/libio_grpc_netty_shaded_ + + META-INF/native/libcom_google_cloud_hadoop_repackaged_gcs_io_grpc_netty_shaded_ + + + + true + + + + + + + 
com.github.spotbugs + spotbugs-maven-plugin + + true + ${basedir}/dev-support/findbugs-exclude.xml + + Max + + + + org.apache.maven.plugins + maven-checkstyle-plugin + + src/config/checkstyle-suppressions.xml + + + + org.apache.maven.plugins + maven-surefire-plugin + + 3600 + + ${test.integration.timeout} + + + + + org.apache.maven.plugins + maven-enforcer-plugin + + + banned-illegal-imports + process-sources + + enforce + + + + + false + Restrict mapreduce imports to committer code + + + + + org.apache.hadoop.mapreduce.** + org.apache.hadoop.mapred.** + + + + false + Restrict encryption client imports to encryption client factory + + + + + + + + + + + + + + + + + + + + com.google.protobuf + protobuf-java + 3.25.5 + + + + + + + org.apache.hadoop + hadoop-common + provided + + + javax.servlet + servlet-api + + + javax.enterprise + cdi-api + + + + com.google.protobuf + protobuf-java + + + + + org.assertj + assertj-core + test + + + junit + junit + test + + + org.junit.platform + junit-platform-launcher + test + + + org.junit.vintage + junit-vintage-engine + test + + + com.google.cloud + google-cloud-storage + + + org.junit.jupiter + junit-jupiter-api + test + + + org.junit.jupiter + junit-jupiter-engine + test + + + org.junit.jupiter + junit-jupiter-params + test + + + + diff --git a/hadoop-tools/hadoop-gcp/src/config/checkstyle-suppressions.xml b/hadoop-tools/hadoop-gcp/src/config/checkstyle-suppressions.xml new file mode 100644 index 0000000000000..8c765bc97f92a --- /dev/null +++ b/hadoop-tools/hadoop-gcp/src/config/checkstyle-suppressions.xml @@ -0,0 +1,26 @@ + + + + + + + + + + diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/Constants.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/Constants.java new file mode 100644 index 0000000000000..34434b2859a06 --- /dev/null +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/Constants.java @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation 
(ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.gs; + +final class Constants { + private Constants() {} + + // URI scheme for GCS. + static final String SCHEME = "gs"; + static final String PATH_DELIMITER = "/"; + + static final String GCS_CONFIG_PREFIX = "fs.gs"; + + static final String BASE_KEY_PREFIX = "google.cloud"; +} diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/CreateOptions.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/CreateOptions.java new file mode 100644 index 0000000000000..c9b44a1a481b1 --- /dev/null +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/CreateOptions.java @@ -0,0 +1,121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.gs; + +import static org.apache.hadoop.thirdparty.com.google.common.base.Preconditions.checkArgument; + +import org.apache.hadoop.thirdparty.com.google.common.collect.ImmutableMap; + +import java.util.Map; +import javax.annotation.Nullable; + +/** + * Options that can be specified when creating a file in the {@link GoogleCloudStorageFileSystem}. + */ +final class CreateOptions { + private final ImmutableMap attributes; + private final String contentType; + private final long overwriteGenerationId; + private final WriteMode mode; + + private CreateOptions(CreateOperationOptionsBuilder builder) { + this.attributes = ImmutableMap.copyOf(builder.attributes); + this.contentType = builder.contentType; + this.overwriteGenerationId = builder.overwriteGenerationId; + this.mode = builder.writeMode; + } + + boolean isOverwriteExisting() { + return this.mode == WriteMode.OVERWRITE; + } + + enum WriteMode { + /** + * Creates a new file for write and fails if file already exists. + */ + CREATE_NEW, + /** + * Creates a new file for write or overwrites an existing file if it already exists. + */ + OVERWRITE + } + + static CreateOperationOptionsBuilder builder() { + return new CreateOperationOptionsBuilder(); + } + + /** + * Extended attributes to set when creating a file. + */ + ImmutableMap getAttributes() { + return attributes; + } + + /** + * Content-type to set when creating a file. + */ + @Nullable + String getContentType() { + return contentType; + } + + /** + * Whether to overwrite an existing file with the same name. 
+ */ + WriteMode getWriteMode() { + return mode; + } + + /** + * Generation of existing object to overwrite. Ignored if set to {@link + * StorageResourceId#UNKNOWN_GENERATION_ID}, but otherwise this is used instead of {@code + * overwriteExisting}, where 0 indicates no existing object, and otherwise an existing object will + * only be overwritten by the newly created file if its generation matches this provided + * generationId. + */ + long getOverwriteGenerationId() { + return overwriteGenerationId; + } + + static class CreateOperationOptionsBuilder { + private Map attributes = ImmutableMap.of(); + private String contentType = "application/octet-stream"; + private long overwriteGenerationId = StorageResourceId.UNKNOWN_GENERATION_ID; + private WriteMode writeMode = WriteMode.CREATE_NEW; + + CreateOperationOptionsBuilder setWriteMode(WriteMode mode) { + this.writeMode = mode; + return this; + } + + CreateOptions build() { + CreateOptions options = new CreateOptions(this); + + checkArgument(!options.getAttributes().containsKey("Content-Type"), + "The Content-Type attribute must be set via the contentType option"); + if (options.getWriteMode() != WriteMode.OVERWRITE) { + checkArgument(options.getOverwriteGenerationId() == StorageResourceId.UNKNOWN_GENERATION_ID, + "overwriteGenerationId is set to %s but it can be set only in OVERWRITE mode", + options.getOverwriteGenerationId()); + } + + return options; + } + } +} diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/ErrorTypeExtractor.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/ErrorTypeExtractor.java new file mode 100644 index 0000000000000..a4497734524e7 --- /dev/null +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/ErrorTypeExtractor.java @@ -0,0 +1,60 @@ +/* + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.gs; + +import io.grpc.Status; + +/** + * Implementation for {@link ErrorTypeExtractor} for exception specifically thrown from gRPC path. + */ +final class ErrorTypeExtractor { + + enum ErrorType { + NOT_FOUND, OUT_OF_RANGE, ALREADY_EXISTS, FAILED_PRECONDITION, INTERNAL, RESOURCE_EXHAUSTED, + UNAVAILABLE, UNKNOWN + } + + // public static final ErrorTypeExtractor INSTANCE = new ErrorTypeExtractor(); + + private static final String BUCKET_ALREADY_EXISTS_MESSAGE = + "FAILED_PRECONDITION: Your previous request to create the named bucket succeeded and you " + + "already own it."; + + private ErrorTypeExtractor() { + } + + static ErrorType getErrorType(Exception error) { + switch (Status.fromThrowable(error).getCode()) { + case NOT_FOUND: + return ErrorType.NOT_FOUND; + case OUT_OF_RANGE: + return ErrorType.OUT_OF_RANGE; + case ALREADY_EXISTS: + return ErrorType.ALREADY_EXISTS; + case FAILED_PRECONDITION: + return ErrorType.FAILED_PRECONDITION; + case RESOURCE_EXHAUSTED: + return ErrorType.RESOURCE_EXHAUSTED; + case INTERNAL: + return ErrorType.INTERNAL; + case UNAVAILABLE: + return ErrorType.UNAVAILABLE; + default: + return ErrorType.UNKNOWN; + } + } +} diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/FileInfo.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/FileInfo.java new file mode 100644 index 0000000000000..df8d63f5eecf2 --- /dev/null +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/FileInfo.java @@ -0,0 +1,203 @@ +/* + * Copyright 2013 Google Inc. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.gs; + +import java.net.URI; +import java.time.Instant; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.Objects; + +/** + * Contains information about a file or a directory. + * + *

Note: This class wraps GoogleCloudStorageItemInfo, adds file system specific information and + * hides bucket/object specific information. + */ +final class FileInfo { + + // Info about the root path. + static final FileInfo ROOT_INFO = + new FileInfo(GoogleCloudStorageFileSystem.GCSROOT, GoogleCloudStorageItemInfo.ROOT_INFO); + + // Path of this file or directory. + private final URI path; + + // Information about the underlying GCS item. + private final GoogleCloudStorageItemInfo itemInfo; + + /** + * Constructs an instance of FileInfo. + * + * @param itemInfo Information about the underlying item. + */ + private FileInfo(URI path, GoogleCloudStorageItemInfo itemInfo) { + this.itemInfo = itemInfo; + + // Construct the path once. + this.path = path; + } + + /** + * Gets the path of this file or directory. + */ + URI getPath() { + return path; + } + + /** + * Indicates whether this item is a directory. + */ + boolean isDirectory() { + return itemInfo.isDirectory(); + } + + /** + * Indicates whether this item is an inferred directory. + */ + boolean isInferredDirectory() { + return itemInfo.isInferredDirectory(); + } + + /** + * Indicates whether this instance has information about the unique, shared root of the underlying + * storage system. + */ + boolean isGlobalRoot() { + return itemInfo.isGlobalRoot(); + } + + /** + * Gets creation time of this item. + * + *

Time is expressed as milliseconds since January 1, 1970 UTC. + */ + long getCreationTime() { + return itemInfo.getCreationTime(); + } + + /** + * Gets the size of this file or directory. + * + *

For files, size is in number of bytes. For directories size is 0. For items that do not + * exist, size is -1. + */ + long getSize() { + return itemInfo.getSize(); + } + + /** + * Gets the modification time of this file if one is set, otherwise the value of {@link + * #getCreationTime()} is returned. + * + *

Time is expressed as milliseconds since January 1, 1970 UTC. + */ + long getModificationTime() { + return itemInfo.getModificationTime(); + } + + /** + * Retrieve file attributes for this file. + * + * @return A map of file attributes + */ + Map getAttributes() { + return itemInfo.getMetadata(); + } + + /** + * Indicates whether this file or directory exists. + */ + boolean exists() { + return itemInfo.exists(); + } + + /** + * Returns CRC32C checksum of the file or {@code null}. + */ + byte[] getCrc32cChecksum() { + VerificationAttributes verificationAttributes = itemInfo.getVerificationAttributes(); + return verificationAttributes == null ? null : verificationAttributes.getCrc32c(); + } + + /** + * Returns MD5 checksum of the file or {@code null}. + */ + byte[] getMd5Checksum() { + VerificationAttributes verificationAttributes = itemInfo.getVerificationAttributes(); + return verificationAttributes == null ? null : verificationAttributes.getMd5hash(); + } + + /** + * Gets information about the underlying item. + */ + GoogleCloudStorageItemInfo getItemInfo() { + return itemInfo; + } + + /** + * Gets string representation of this instance. + */ + @Override + public String toString() { + return getPath() + (exists() ? + ": created on: " + Instant.ofEpochMilli(getCreationTime()) : + ": exists: no"); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (!(o instanceof FileInfo)) { + return false; + } + FileInfo fileInfo = (FileInfo) o; + return Objects.equals(path, fileInfo.path) && Objects.equals(itemInfo, fileInfo.itemInfo); + } + + @Override + public int hashCode() { + return Objects.hash(path, itemInfo); + } + + /** + * Handy factory method for constructing a FileInfo from a GoogleCloudStorageItemInfo while + * potentially returning a singleton instead of really constructing an object for cases like ROOT. 
+ */ + static FileInfo fromItemInfo(GoogleCloudStorageItemInfo itemInfo) { + if (itemInfo.isRoot()) { + return ROOT_INFO; + } + URI path = UriPaths.fromResourceId(itemInfo.getResourceId(), /* allowEmptyObjectName= */ true); + return new FileInfo(path, itemInfo); + } + + /** + * Handy factory method for constructing a list of FileInfo from a list of + * GoogleCloudStorageItemInfo. + */ + static List fromItemInfos(List itemInfos) { + List fileInfos = new ArrayList<>(itemInfos.size()); + for (GoogleCloudStorageItemInfo itemInfo : itemInfos) { + fileInfos.add(fromItemInfo(itemInfo)); + } + return fileInfos; + } +} diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorage.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorage.java new file mode 100644 index 0000000000000..9c15962b7ef36 --- /dev/null +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorage.java @@ -0,0 +1,261 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.fs.gs; + +import static org.apache.hadoop.thirdparty.com.google.common.base.Preconditions.*; +import static org.apache.hadoop.thirdparty.com.google.common.base.Strings.isNullOrEmpty; + +import com.google.cloud.storage.*; +import org.apache.hadoop.thirdparty.com.google.common.collect.ImmutableList; +import org.apache.hadoop.thirdparty.com.google.common.collect.Maps; +import org.apache.hadoop.thirdparty.com.google.common.io.BaseEncoding; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.Nullable; +import java.io.IOException; +import java.nio.channels.WritableByteChannel; +import java.nio.file.FileAlreadyExistsException; +import java.util.List; +import java.util.Map; + +/** + * A wrapper around Google cloud storage + * client. + */ +class GoogleCloudStorage { + public static final Logger LOG = LoggerFactory.getLogger(GoogleHadoopFileSystem.class); + static final List BLOB_FIELDS = + ImmutableList.of(Storage.BlobField.BUCKET, Storage.BlobField.CONTENT_ENCODING, + Storage.BlobField.CONTENT_TYPE, Storage.BlobField.CRC32C, Storage.BlobField.GENERATION, + Storage.BlobField.METADATA, Storage.BlobField.MD5HASH, Storage.BlobField.METAGENERATION, + Storage.BlobField.NAME, Storage.BlobField.SIZE, Storage.BlobField.TIME_CREATED, + Storage.BlobField.UPDATED); + private final Storage storage; + private final GoogleHadoopFileSystemConfiguration configuration; + + /** + * Having an instance of gscImpl to redirect calls to Json client while new client implementation + * is in WIP. 
+ */ + GoogleCloudStorage(GoogleHadoopFileSystemConfiguration configuration) throws IOException { + // TODO: Set projectId + // TODO: Set credentials + this.storage = StorageOptions.newBuilder().build().getService(); + this.configuration = configuration; + } + + public WritableByteChannel create(final StorageResourceId resourceId, final CreateOptions options) + throws IOException { + LOG.trace("create({})", resourceId); + + checkArgument(resourceId.isStorageObject(), "Expected full StorageObject id, got %s", + resourceId); + // Update resourceId if generationId is missing + StorageResourceId resourceIdWithGeneration = resourceId; + if (!resourceId.hasGenerationId()) { + resourceIdWithGeneration = + new StorageResourceId(resourceId.getBucketName(), resourceId.getObjectName(), + getWriteGeneration(resourceId, options.isOverwriteExisting())); + } + + return new GoogleCloudStorageClientWriteChannel(storage, resourceIdWithGeneration, options); + } + + /** + * Gets the object generation for a write operation + * + *

making getItemInfo call even if overwrite is disabled to fail fast in case file is existing. + * + * @param resourceId object for which generation info is requested + * @param overwrite whether existing object should be overwritten + * @return the generation of the object + * @throws IOException if the object already exists and cannot be overwritten + */ + private long getWriteGeneration(StorageResourceId resourceId, boolean overwrite) + throws IOException { + LOG.trace("getWriteGeneration({}, {})", resourceId, overwrite); + GoogleCloudStorageItemInfo info = getItemInfo(resourceId); + if (!info.exists()) { + return 0L; + } + if (info.exists() && overwrite) { + long generation = info.getContentGeneration(); + checkState(generation != 0, "Generation should not be 0 for an existing item"); + return generation; + } + + throw new FileAlreadyExistsException(String.format("Object %s already exists.", resourceId)); + } + + public void close() { + try { + storage.close(); + } catch (Exception e) { + LOG.warn("Error occurred while closing the storage client", e); + } + } + + public GoogleCloudStorageItemInfo getItemInfo(StorageResourceId resourceId) throws IOException { + LOG.trace("getItemInfo({})", resourceId); + + // Handle ROOT case first. 
+ if (resourceId.isRoot()) { + return GoogleCloudStorageItemInfo.ROOT_INFO; + } + GoogleCloudStorageItemInfo itemInfo = null; + + if (resourceId.isBucket()) { + Bucket bucket = getBucket(resourceId.getBucketName()); + if (bucket != null) { + itemInfo = createItemInfoForBucket(resourceId, bucket); + } else { + LOG.debug("getBucket({}): not found", resourceId.getBucketName()); + } + } else { + Blob blob = getBlob(resourceId); + if (blob != null) { + itemInfo = createItemInfoForBlob(resourceId, blob); + } else { + LOG.debug("getObject({}): not found", resourceId); + } + } + + if (itemInfo == null) { + itemInfo = GoogleCloudStorageItemInfo.createNotFound(resourceId); + } + LOG.debug("getItemInfo: {}", itemInfo); + return itemInfo; + } + + /** + * Gets the bucket with the given name. + * + * @param bucketName name of the bucket to get + * @return the bucket with the given name or null if bucket not found + * @throws IOException if the bucket exists but cannot be accessed + */ + @Nullable + private Bucket getBucket(String bucketName) throws IOException { + LOG.debug("getBucket({})", bucketName); + checkArgument(!isNullOrEmpty(bucketName), "bucketName must not be null or empty"); + try { + return storage.get(bucketName); + } catch (StorageException e) { + if (ErrorTypeExtractor.getErrorType(e) == ErrorTypeExtractor.ErrorType.NOT_FOUND) { + return null; + } + throw new IOException("Error accessing Bucket " + bucketName, e); + } + } + + private static GoogleCloudStorageItemInfo createItemInfoForBlob(StorageResourceId resourceId, + Blob blob) { + checkArgument(resourceId != null, "resourceId must not be null"); + checkArgument(blob != null, "object must not be null"); + checkArgument(resourceId.isStorageObject(), + "resourceId must be a StorageObject. 
resourceId: %s", resourceId); + checkArgument(resourceId.getBucketName().equals(blob.getBucket()), + "resourceId.getBucketName() must equal object.getBucket(): '%s' vs '%s'", + resourceId.getBucketName(), blob.getBucket()); + checkArgument(resourceId.getObjectName().equals(blob.getName()), + "resourceId.getObjectName() must equal object.getName(): '%s' vs '%s'", + resourceId.getObjectName(), blob.getName()); + + Map decodedMetadata = + blob.getMetadata() == null ? null : decodeMetadata(blob.getMetadata()); + + byte[] md5Hash = null; + byte[] crc32c = null; + + if (!isNullOrEmpty(blob.getCrc32c())) { + crc32c = BaseEncoding.base64().decode(blob.getCrc32c()); + } + + if (!isNullOrEmpty(blob.getMd5())) { + md5Hash = BaseEncoding.base64().decode(blob.getMd5()); + } + + return GoogleCloudStorageItemInfo.createObject(resourceId, + blob.getCreateTimeOffsetDateTime() == null ? + 0 : + blob.getCreateTimeOffsetDateTime().toInstant().toEpochMilli(), + blob.getUpdateTimeOffsetDateTime() == null ? + 0 : + blob.getUpdateTimeOffsetDateTime().toInstant().toEpochMilli(), + blob.getSize() == null ? 0 : blob.getSize(), blob.getContentType(), + blob.getContentEncoding(), decodedMetadata, + blob.getGeneration() == null ? 0 : blob.getGeneration(), + blob.getMetageneration() == null ? 0 : blob.getMetageneration(), + new VerificationAttributes(md5Hash, crc32c)); + } + + static Map decodeMetadata(Map metadata) { + return Maps.transformValues(metadata, GoogleCloudStorage::decodeMetadataValues); + } + + @Nullable + private static byte[] decodeMetadataValues(String value) { + try { + return BaseEncoding.base64().decode(value); + } catch (IllegalArgumentException iae) { + LOG.error("Failed to parse base64 encoded attribute value {}", value, iae); + return null; + } + } + + /** + * Gets the object with the given resourceId. 
+ * + * @param resourceId identifies a StorageObject + * @return the object with the given name or null if object not found + * @throws IOException if the object exists but cannot be accessed + */ + @Nullable + Blob getBlob(StorageResourceId resourceId) throws IOException { + checkArgument(resourceId.isStorageObject(), "Expected full StorageObject id, got %s", + resourceId); + String bucketName = resourceId.getBucketName(); + String objectName = resourceId.getObjectName(); + Blob blob; + try { + blob = storage.get(BlobId.of(bucketName, objectName), + Storage.BlobGetOption.fields(BLOB_FIELDS.toArray(new Storage.BlobField[0]))); + } catch (StorageException e) { + throw new IOException("Error accessing " + resourceId, e); + } + return blob; + } + + private static GoogleCloudStorageItemInfo createItemInfoForBucket(StorageResourceId resourceId, + Bucket bucket) { + checkArgument(resourceId != null, "resourceId must not be null"); + checkArgument(bucket != null, "bucket must not be null"); + checkArgument(resourceId.isBucket(), "resourceId must be a Bucket. resourceId: %s", resourceId); + checkArgument(resourceId.getBucketName().equals(bucket.getName()), + "resourceId.getBucketName() must equal bucket.getName(): '%s' vs '%s'", + resourceId.getBucketName(), bucket.getName()); + + return GoogleCloudStorageItemInfo.createBucket(resourceId, + bucket.asBucketInfo().getCreateTimeOffsetDateTime().toInstant().toEpochMilli(), + bucket.asBucketInfo().getUpdateTimeOffsetDateTime().toInstant().toEpochMilli(), + bucket.getLocation(), + bucket.getStorageClass() == null ? 
null : bucket.getStorageClass().name()); + } +} diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorageClientWriteChannel.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorageClientWriteChannel.java new file mode 100644 index 0000000000000..7956b6f0a8276 --- /dev/null +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorageClientWriteChannel.java @@ -0,0 +1,116 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.gs; + +import com.google.cloud.storage.BlobId; +import com.google.cloud.storage.BlobInfo; +import com.google.cloud.storage.BlobWriteSession; +import com.google.cloud.storage.Storage; +import com.google.cloud.storage.Storage.BlobWriteOption; +import com.google.cloud.storage.StorageException; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.channels.WritableByteChannel; +import java.util.ArrayList; +import java.util.List; + +/** + * Implements WritableByteChannel to provide write access to GCS via java-storage client. 
+ */ +class GoogleCloudStorageClientWriteChannel implements WritableByteChannel { + private static final Logger LOG = + LoggerFactory.getLogger(GoogleCloudStorageClientWriteChannel.class); + + private final StorageResourceId resourceId; + private WritableByteChannel writableByteChannel; + + GoogleCloudStorageClientWriteChannel(final Storage storage, + final StorageResourceId resourceId, final CreateOptions createOptions) throws IOException { + this.resourceId = resourceId; + BlobWriteSession blobWriteSession = getBlobWriteSession(storage, resourceId, createOptions); + try { + this.writableByteChannel = blobWriteSession.open(); + } catch (StorageException e) { + throw new IOException(e); + } + } + + private static BlobInfo getBlobInfo(final StorageResourceId resourceId, + final CreateOptions createOptions) { + BlobInfo blobInfo = BlobInfo.newBuilder( + BlobId.of(resourceId.getBucketName(), resourceId.getObjectName(), + resourceId.getGenerationId())).setContentType(createOptions.getContentType()) + // .setMetadata(encodeMetadata(createOptions.getMetadata())) // TODO: + .build(); + return blobInfo; + } + + private static BlobWriteSession getBlobWriteSession(final Storage storage, + final StorageResourceId resourceId, final CreateOptions createOptions) { + return storage.blobWriteSession(getBlobInfo(resourceId, createOptions), + generateWriteOptions(createOptions)); + } + + private static BlobWriteOption[] generateWriteOptions(final CreateOptions createOptions) { + List blobWriteOptions = new ArrayList<>(); + + blobWriteOptions.add(BlobWriteOption.disableGzipContent()); + blobWriteOptions.add(BlobWriteOption.generationMatch()); + + //TODO: Enable KMS and checksum + return blobWriteOptions.toArray(new BlobWriteOption[blobWriteOptions.size()]); + } + + @Override + public boolean isOpen() { + return writableByteChannel != null && writableByteChannel.isOpen(); + } + + @Override + public void close() throws IOException { + try { + if (!isOpen()) { + return; + } + + 
writableByteChannel.close(); + } catch (Exception e) { + throw new IOException( + String.format("Upload failed for '%s'. reason=%s", resourceId, e.getMessage()), e); + } finally { + writableByteChannel = null; + } + } + + private int writeInternal(final ByteBuffer byteBuffer) throws IOException { + int bytesWritten = writableByteChannel.write(byteBuffer); + LOG.trace("{} bytes were written out of provided buffer of capacity {}", bytesWritten, + byteBuffer.limit()); + return bytesWritten; + } + + @Override + public int write(final ByteBuffer src) throws IOException { + return writeInternal(src); + } +} diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorageFileSystem.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorageFileSystem.java new file mode 100644 index 0000000000000..e411f22eb3994 --- /dev/null +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorageFileSystem.java @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.fs.gs; + +import static org.apache.hadoop.thirdparty.com.google.common.base.Preconditions.*; +import static org.apache.hadoop.fs.gs.Constants.SCHEME; + +import com.google.auth.Credentials; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.net.URI; +import java.nio.channels.WritableByteChannel; + +/** + * Provides FS semantics over GCS based on Objects API. + */ +class GoogleCloudStorageFileSystem { + private static final Logger LOG = LoggerFactory.getLogger(StorageResourceId.class); + + // URI of the root path. + static final URI GCSROOT = URI.create(SCHEME + ":/"); + + // GCS access instance. + private GoogleCloudStorage gcs; + + private static GoogleCloudStorage createCloudStorage( + final GoogleHadoopFileSystemConfiguration configuration, final Credentials credentials) + throws IOException { + checkNotNull(configuration, "configuration must not be null"); + + return new GoogleCloudStorage(configuration); + } + + GoogleCloudStorageFileSystem(final GoogleHadoopFileSystemConfiguration configuration, + final Credentials credentials) throws IOException { + gcs = createCloudStorage(configuration, credentials); + } + + WritableByteChannel create(final URI path, final CreateOptions createOptions) + throws IOException { + LOG.trace("create(path: {}, createOptions: {})", path, createOptions); + checkNotNull(path, "path could not be null"); + StorageResourceId resourceId = + StorageResourceId.fromUriPath(path, /* allowEmptyObjectName=*/ true); + + if (resourceId.isDirectory()) { + throw new IOException( + String.format("Cannot create a file whose name looks like a directory: '%s'", + resourceId)); + } + + if (createOptions.getOverwriteGenerationId() != StorageResourceId.UNKNOWN_GENERATION_ID) { + resourceId = new StorageResourceId(resourceId.getBucketName(), resourceId.getObjectName(), + createOptions.getOverwriteGenerationId()); + } + + return gcs.create(resourceId, createOptions); + } 
+ + void close() { + if (gcs == null) { + return; + } + LOG.trace("close()"); + try { + gcs.close(); + } finally { + gcs = null; + } + } +} diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorageItemInfo.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorageItemInfo.java new file mode 100644 index 0000000000000..887e68b05f98c --- /dev/null +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorageItemInfo.java @@ -0,0 +1,423 @@ +/* + * Copyright 2013 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.gs; + +import static org.apache.hadoop.thirdparty.com.google.common.base.Preconditions.checkArgument; +import static org.apache.hadoop.thirdparty.com.google.common.base.Preconditions.checkNotNull; + +import org.apache.hadoop.thirdparty.com.google.common.annotations.VisibleForTesting; +import org.apache.hadoop.thirdparty.com.google.common.collect.ImmutableMap; + +import java.time.Instant; +import java.util.Arrays; +import java.util.Map; +import java.util.Objects; + +/** + * Contains information about an item in Google Cloud Storage. + */ +final class GoogleCloudStorageItemInfo { + // Info about the root of GCS namespace. 
+ public static final GoogleCloudStorageItemInfo ROOT_INFO = + new GoogleCloudStorageItemInfo(StorageResourceId.ROOT, + /* creationTime= */ 0, + /* modificationTime= */ 0, + /* size= */ 0, + /* location= */ null, + /* storageClass= */ null, + /* contentType= */ null, + /* contentEncoding= */ null, + /* metadata= */ null, + /* contentGeneration= */ 0, + /* metaGeneration= */ 0, + /* verificationAttributes= */ null); + + /** + * Factory method for creating a GoogleCloudStorageItemInfo for a bucket. + * + * @param resourceId Resource ID that identifies a bucket + * @param creationTime Time when a bucket was created (milliseconds since January 1, 1970 + * UTC). + * @param modificationTime Time when a bucket was last modified (milliseconds since January 1, + * 1970 UTC). + * @param location Location of a bucket. + * @param storageClass Storage class of a bucket. + */ + static GoogleCloudStorageItemInfo createBucket(StorageResourceId resourceId, + long creationTime, long modificationTime, String location, String storageClass) { + checkNotNull(resourceId, "resourceId must not be null"); + checkArgument(resourceId.isBucket(), "expected bucket but got '%s'", resourceId); + return new GoogleCloudStorageItemInfo(resourceId, creationTime, modificationTime, + /* size= */ 0, location, storageClass, + /* contentType= */ null, + /* contentEncoding= */ null, + /* metadata= */ null, + /* contentGeneration= */ 0, + /* metaGeneration= */ 0, + /* verificationAttributes= */ null); + } + + /** + * Factory method for creating a GoogleCloudStorageItemInfo for an object. + * + * @param resourceId identifies either root, a Bucket, or a StorageObject + * @param creationTime Time when object was created (milliseconds since January 1, 1970 + * UTC). + * @param size Size of the given object (number of bytes) or -1 if the object + * does not exist. + * @param metadata User-supplied object metadata for this object. 
+ */ + static GoogleCloudStorageItemInfo createObject(StorageResourceId resourceId, + long creationTime, long modificationTime, long size, String contentType, + String contentEncoding, Map metadata, long contentGeneration, + long metaGeneration, VerificationAttributes verificationAttributes) { + checkNotNull(resourceId, "resourceId must not be null"); + checkArgument( + !resourceId.isRoot(), + "expected object or directory but got '%s'", resourceId); + checkArgument( + !resourceId.isBucket(), + "expected object or directory but got '%s'", resourceId); + return new GoogleCloudStorageItemInfo(resourceId, creationTime, modificationTime, size, + /* location= */ null, + /* storageClass= */ null, contentType, contentEncoding, metadata, contentGeneration, + metaGeneration, verificationAttributes); + } + + /** + * Factory method for creating a "found" GoogleCloudStorageItemInfo for an inferred directory. + * + * @param resourceId Resource ID that identifies an inferred directory + */ + static GoogleCloudStorageItemInfo createInferredDirectory(StorageResourceId resourceId) { + return new GoogleCloudStorageItemInfo(resourceId, + /* creationTime= */ 0, + /* modificationTime= */ 0, + /* size= */ 0, + /* location= */ null, + /* storageClass= */ null, + /* contentType= */ null, + /* contentEncoding= */ null, + /* metadata= */ null, + /* contentGeneration= */ 0, + /* metaGeneration= */ 0, + /* verificationAttributes= */ null); + } + + /** + * Factory method for creating a "not found" GoogleCloudStorageItemInfo for a bucket or an object. 
+ * + * @param resourceId Resource ID that identifies an inferred directory + */ + static GoogleCloudStorageItemInfo createNotFound(StorageResourceId resourceId) { + return new GoogleCloudStorageItemInfo(resourceId, + /* creationTime= */ 0, + /* modificationTime= */ 0, + /* size= */ -1, + /* location= */ null, + /* storageClass= */ null, + /* contentType= */ null, + /* contentEncoding= */ null, + /* metadata= */ null, + /* contentGeneration= */ 0, + /* metaGeneration= */ 0, + /* verificationAttributes= */ null); + } + + // The Bucket and maybe StorageObject names of the GCS "item" referenced by this object. Not + // null. + private final StorageResourceId resourceId; + + // Creation time of this item. + // Time is expressed as milliseconds since January 1, 1970 UTC. + private final long creationTime; + + // Modification time of this item. + // Time is expressed as milliseconds since January 1, 1970 UTC. + private final long modificationTime; + + // Size of an object (number of bytes). + // Size is -1 for items that do not exist. + private final long size; + + // Location of this item. + private final String location; + + // Storage class of this item. + private final String storageClass; + + // Content-Type of this item + private final String contentType; + + private final String contentEncoding; + + // User-supplied metadata. 
+ private final Map metadata; + + private final long contentGeneration; + + private final long metaGeneration; + + private final VerificationAttributes verificationAttributes; + + private GoogleCloudStorageItemInfo(StorageResourceId resourceId, long creationTime, + long modificationTime, long size, String location, String storageClass, String contentType, + String contentEncoding, Map metadata, long contentGeneration, + long metaGeneration, VerificationAttributes verificationAttributes) { + this.resourceId = checkNotNull(resourceId, "resourceId must not be null"); + this.creationTime = creationTime; + this.modificationTime = modificationTime; + this.size = size; + this.location = location; + this.storageClass = storageClass; + this.contentType = contentType; + this.contentEncoding = contentEncoding; + this.metadata = (metadata == null) ? ImmutableMap.of() : metadata; + this.contentGeneration = contentGeneration; + this.metaGeneration = metaGeneration; + this.verificationAttributes = verificationAttributes; + } + + /** + * Gets bucket name of this item. + */ + String getBucketName() { + return resourceId.getBucketName(); + } + + /** + * Gets object name of this item. + */ + String getObjectName() { + return resourceId.getObjectName(); + } + + /** + * Gets the resourceId that holds the (possibly null) bucketName and objectName of this object. + */ + StorageResourceId getResourceId() { + return resourceId; + } + + /** + * Gets creation time of this item. + * + *

Time is expressed as milliseconds since January 1, 1970 UTC. + */ + long getCreationTime() { + return creationTime; + } + + /** + * Gets modification time of this item. + * + *

Time is expressed as milliseconds since January 1, 1970 UTC. + */ + long getModificationTime() { + return modificationTime; + } + + /** + * Gets size of this item (number of bytes). Returns -1 if the object does not exist. + */ + long getSize() { + return size; + } + + /** + * Gets location of this item. + * + *

Note: Location is only supported for buckets. The value is always null for objects. + */ + String getLocation() { + return location; + } + + /** + * Gets storage class of this item. + * + *

Note: Storage-class is only supported for buckets. The value is always null for objects. + */ + String getStorageClass() { + return storageClass; + } + + /** + * Gets the content-type of this item, or null if unknown or inapplicable. + * + *

Note: content-type is only supported for objects, and will always be null for buckets. + */ + String getContentType() { + return contentType; + } + + /** + * Gets the content-encoding of this item, or null if unknown or inapplicable. + * + *

Note: content-encoding is only supported for objects, and will always be null for buckets. + */ + String getContentEncoding() { + return contentEncoding; + } + + /** + * Gets user-supplied metadata for this item. + * + *

Note: metadata is only supported for objects. This value is always an empty map for buckets. + */ + Map getMetadata() { + return metadata; + } + + /** + * Indicates whether this item is a bucket. Root is not considered to be a bucket. + */ + boolean isBucket() { + return resourceId.isBucket(); + } + + /** + * Indicates whether this item refers to the GCS root (gs://). + */ + boolean isRoot() { + return resourceId.isRoot(); + } + + /** + * Indicates whether this instance has information about the unique, shared root of the underlying + * storage system. + */ + boolean isGlobalRoot() { + return isRoot() && exists(); + } + + /** + * Indicates whether {@code itemInfo} is a directory. + */ + boolean isDirectory() { + return isGlobalRoot() || isBucket() || resourceId.isDirectory(); + } + + /** + * Indicates whether {@code itemInfo} is an inferred directory. + */ + boolean isInferredDirectory() { + return creationTime == 0 && modificationTime == 0 && size == 0 && contentGeneration == 0 + && metaGeneration == 0; + } + + /** + * Get the content generation of the object. + */ + long getContentGeneration() { + return contentGeneration; + } + + /** + * Get the meta generation of the object. + */ + long getMetaGeneration() { + return metaGeneration; + } + + /** + * Get object validation attributes. + */ + VerificationAttributes getVerificationAttributes() { + return verificationAttributes; + } + + /** + * Indicates whether this item exists. + */ + boolean exists() { + return size >= 0; + } + + /** + * Helper for checking logical equality of metadata maps, checking equality of keySet() between + * this.metadata and otherMetadata, and then using Arrays.equals to compare contents of + * corresponding byte arrays. + */ + @VisibleForTesting + public boolean metadataEquals(Map otherMetadata) { + if (metadata == otherMetadata) { + // Fast-path for common cases where the same actual default metadata instance may be + // used in + // multiple different item infos. 
+ return true; + } + // No need to check if other `metadata` is not null, + // because previous `if` checks if both of them are null. + if (metadata == null || otherMetadata == null) { + return false; + } + if (!metadata.keySet().equals(otherMetadata.keySet())) { + return false; + } + + // Compare each byte[] with Arrays.equals. + for (Map.Entry metadataEntry : metadata.entrySet()) { + if (!Arrays.equals(metadataEntry.getValue(), otherMetadata.get(metadataEntry.getKey()))) { + return false; + } + } + return true; + } + + /** + * Gets string representation of this instance. + */ + @Override + public String toString() { + return exists() ? + String.format("%s: created on: %s", resourceId, Instant.ofEpochMilli(creationTime)) : + String.format("%s: exists: no", resourceId); + } + + @Override + public boolean equals(Object obj) { + if (obj instanceof GoogleCloudStorageItemInfo) { + GoogleCloudStorageItemInfo other = (GoogleCloudStorageItemInfo) obj; + return resourceId.equals(other.resourceId) && creationTime == other.creationTime + && modificationTime == other.modificationTime && size == other.size && Objects.equals( + location, other.location) && Objects.equals(storageClass, other.storageClass) + && Objects.equals(verificationAttributes, other.verificationAttributes) + && metaGeneration == other.metaGeneration && contentGeneration == other.contentGeneration + && metadataEquals(other.getMetadata()); + } + return false; + } + + @Override + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + resourceId.hashCode(); + result = prime * result + (int) creationTime; + result = prime * result + (int) modificationTime; + result = prime * result + (int) size; + result = prime * result + Objects.hashCode(location); + result = prime * result + Objects.hashCode(storageClass); + result = prime * result + Objects.hashCode(verificationAttributes); + result = prime * result + (int) metaGeneration; + result = prime * result + (int) 
contentGeneration; + result = prime * result + metadata.entrySet().stream() + .mapToInt(e -> Objects.hash(e.getKey()) + Arrays.hashCode(e.getValue())).sum(); + return result; + } +} diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleHadoopFileSystem.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleHadoopFileSystem.java new file mode 100644 index 0000000000000..1c2fc19d2b5b1 --- /dev/null +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleHadoopFileSystem.java @@ -0,0 +1,505 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.fs.gs; + +import static org.apache.hadoop.fs.gs.Constants.GCS_CONFIG_PREFIX; +import static org.apache.hadoop.fs.gs.GoogleHadoopFileSystemConfiguration.GCS_WORKING_DIRECTORY; + +import static org.apache.hadoop.thirdparty.com.google.common.base.Preconditions.checkArgument; +import static org.apache.hadoop.thirdparty.com.google.common.base.Preconditions.checkNotNull; +import static org.apache.hadoop.thirdparty.com.google.common.base.Preconditions.checkState; +import static org.apache.hadoop.thirdparty.com.google.common.base.Strings.isNullOrEmpty; + +import com.google.auth.oauth2.GoogleCredentials; +import org.apache.hadoop.thirdparty.com.google.common.base.Ascii; + +import java.io.FileNotFoundException; +import java.io.IOException; +import java.net.URI; +import java.util.EnumSet; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.*; +import org.apache.hadoop.fs.permission.FsPermission; +import org.apache.hadoop.security.ProviderUtils; +import org.apache.hadoop.util.Progressable; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * GoogleHadoopFileSystem is rooted in a single bucket at initialization time; in this case, Hadoop + * paths no longer correspond directly to general GCS paths, and all Hadoop operations going through + * this FileSystem will never touch any GCS bucket other than the bucket on which this FileSystem is + * rooted. + * + *

This implementation sacrifices a small amount of cross-bucket interoperability in favor of + * more straightforward FileSystem semantics and compatibility with existing Hadoop applications. In + * particular, it is not subject to bucket-naming constraints, and files are allowed to be placed in + * root. + */ +public class GoogleHadoopFileSystem extends FileSystem { + + public static final Logger LOG = LoggerFactory.getLogger(GoogleHadoopFileSystem.class); + + /** + * URI scheme for GoogleHadoopFileSystem. + */ + private static final String SCHEME = Constants.SCHEME; + + /** + * Default value of replication factor. + */ + static final short REPLICATION_FACTOR_DEFAULT = 3; + + // TODO: Take this from config + private static final int PERMISSIONS_TO_REPORT = 700; + + /** + * The URI the File System is passed in initialize. + */ + private URI initUri; + + /** + * Default block size. Note that this is the size that is reported to Hadoop FS clients. It does + * not modify the actual block size of an underlying GCS object, because GCS JSON API does not + * allow modifying or querying the value. Modifying this value allows one to control how many + * mappers are used to process a given file. + */ + private long defaultBlockSize = GoogleHadoopFileSystemConfiguration.BLOCK_SIZE.getDefault(); + + // The bucket the file system is rooted in used for default values of: + // -- working directory + // -- user home directories (only for Hadoop purposes). + private Path fsRoot; + + /** + * Current working directory; overridden in initialize() if {@link + * GoogleHadoopFileSystemConfiguration#GCS_WORKING_DIRECTORY} is set. 
+ */ + private Path workingDirectory; + private GoogleCloudStorageFileSystem gcsFs; + private boolean isClosed; + private FsPermission reportedPermissions; + + public GoogleHadoopFileSystemConfiguration getFileSystemConfiguration() { + return fileSystemConfiguration; + } + + private GoogleHadoopFileSystemConfiguration fileSystemConfiguration; + + @Override + public void initialize(final URI path, Configuration config) throws IOException { + LOG.trace("initialize(path: {}, config: {})", path, config); + + checkArgument(path != null, "path must not be null"); + checkArgument(config != null, "config must not be null"); + checkArgument(path.getScheme() != null, "scheme of path must not be null"); + checkArgument(path.getScheme().equals(getScheme()), "URI scheme not supported: {}", path); + + config = + ProviderUtils.excludeIncompatibleCredentialProviders(config, GoogleHadoopFileSystem.class); + super.initialize(path, config); + + initUri = path; + + // Set this configuration as the default config for this instance; configure() + // will perform some file-system-specific adjustments, but the original should + // be sufficient (and is required) for the delegation token binding initialization. 
+ setConf(config); + + this.reportedPermissions = new FsPermission(PERMISSIONS_TO_REPORT); + + initializeFsRoot(); + + this.fileSystemConfiguration = new GoogleHadoopFileSystemConfiguration(config); + initializeWorkingDirectory(fileSystemConfiguration); + initializeGcsFs(fileSystemConfiguration); + } + + private void initializeFsRoot() { + String rootBucket = initUri.getAuthority(); + checkArgument(rootBucket != null, "No bucket specified in GCS URI: {}", initUri); + // Validate root bucket name + URI rootUri = UriPaths.fromStringPathComponents(rootBucket, /* objectName= */ + null, /* allowEmptyObjectName= */ true); + fsRoot = new Path(rootUri); + LOG.trace("Configured FS root: '{}'", fsRoot); + } + + private void initializeWorkingDirectory(final GoogleHadoopFileSystemConfiguration config) { + String configWorkingDirectory = config.getWorkingDirectory(); + if (isNullOrEmpty(configWorkingDirectory)) { + LOG.warn("No working directory configured, using default: '{}'", workingDirectory); + } + // Use the public method to ensure proper behavior of normalizing and resolving the new + // working directory relative to the initial filesystem-root directory. + setWorkingDirectory( + isNullOrEmpty(configWorkingDirectory) ? 
fsRoot : new Path(configWorkingDirectory)); + LOG.trace("Configured working directory: {} = {}", GCS_WORKING_DIRECTORY.getKey(), + getWorkingDirectory()); + } + + private synchronized void initializeGcsFs(final GoogleHadoopFileSystemConfiguration config) + throws IOException { + this.gcsFs = createGcsFs(config); + } + + private GoogleCloudStorageFileSystem createGcsFs(final GoogleHadoopFileSystemConfiguration config) + throws IOException { + GoogleCredentials credentials = getCredentials(config); + return new GoogleCloudStorageFileSystem(config, credentials); + } + + private GoogleCredentials getCredentials(GoogleHadoopFileSystemConfiguration config) + throws IOException { + return getCredentials(config, GCS_CONFIG_PREFIX); + } + + public static GoogleCredentials getCredentials(GoogleHadoopFileSystemConfiguration config, + String... keyPrefixesVararg) throws IOException { + return GoogleCredentials.getApplicationDefault(); // TODO: Add other Auth mechanisms + } + + @Override + protected void checkPath(final Path path) { + LOG.trace("checkPath(path: {})", path); + // Validate scheme + URI uri = path.toUri(); + + String scheme = uri.getScheme(); + if (scheme != null && !scheme.equalsIgnoreCase(getScheme())) { + throw new IllegalArgumentException( + String.format("Wrong scheme: {}, in path: {}, expected scheme: {}", scheme, path, + getScheme())); + } + + String bucket = uri.getAuthority(); + String rootBucket = fsRoot.toUri().getAuthority(); + + // Bucket-less URIs will be qualified later + if (bucket == null || bucket.equals(rootBucket)) { + return; + } + + throw new IllegalArgumentException( + String.format("Wrong bucket: {}, in path: {}, expected bucket: {}", bucket, path, + rootBucket)); + } + + /** + * Validates that GCS path belongs to this file system. The bucket must match the root bucket + * provided at initialization time. + */ + Path getHadoopPath(final URI gcsPath) { + LOG.trace("getHadoopPath(gcsPath: {})", gcsPath); + + // Handle root. 
Delegate to getGcsPath on "gs:/" to resolve the appropriate gs:// URI. + if (gcsPath.equals(getGcsPath(fsRoot))) { + return fsRoot; + } + + StorageResourceId resourceId = StorageResourceId.fromUriPath(gcsPath, true); + + checkArgument(!resourceId.isRoot(), "Missing authority in gcsPath '{}'", gcsPath); + String rootBucket = fsRoot.toUri().getAuthority(); + checkArgument(resourceId.getBucketName().equals(rootBucket), + "Authority of URI '{}' doesn't match root bucket '{}'", resourceId.getBucketName(), + rootBucket); + + Path hadoopPath = new Path(fsRoot, + new Path(/* schema= */ null, /* authority= */ null, resourceId.getObjectName())); + LOG.trace("getHadoopPath(gcsPath: {}): {}", gcsPath, hadoopPath); + return hadoopPath; + } + + /** + * Translates a "gs:/" style hadoopPath (or relative path which is not fully-qualified) into the + * appropriate GCS path which is compatible with the underlying GcsFs. + */ + URI getGcsPath(final Path hadoopPath) { + LOG.trace("getGcsPath(hadoopPath: {})", hadoopPath); + + // Convert to fully qualified absolute path; the Path object will call back to get our current + // workingDirectory as part of fully resolving the path. + Path resolvedPath = makeQualified(hadoopPath); + + String objectName = resolvedPath.toUri().getPath(); + if (objectName != null && resolvedPath.isAbsolute()) { + // Strip off leading '/' because GoogleCloudStorageFileSystem.getPath appends it explicitly + // between bucket and objectName. 
+      objectName = objectName.substring(1);
+    }
+
+    // Construct GCS path URI
+    String rootBucket = fsRoot.toUri().getAuthority();
+    URI gcsPath =
+        UriPaths.fromStringPathComponents(rootBucket, objectName, /* allowEmptyObjectName= */ true);
+    LOG.trace("getGcsPath(hadoopPath: {}): {}", hadoopPath, gcsPath);
+    return gcsPath;
+  }
+
+  @Override
+  public String getScheme() {
+    return SCHEME;
+  }
+
+  @Override
+  public FSDataInputStream open(final Path path, final int bufferSize) throws IOException {
+    LOG.trace("open({})", path);
+    throw new UnsupportedOperationException(path.toString());
+  }
+
+  @Override
+  public FSDataOutputStream create(Path hadoopPath, FsPermission permission, boolean overwrite,
+      int bufferSize, short replication, long blockSize, Progressable progress) throws IOException {
+    checkArgument(hadoopPath != null, "hadoopPath must not be null");
+    checkArgument(replication > 0, "replication must be a positive integer: %s", replication);
+    checkArgument(blockSize > 0, "blockSize must be a positive integer: %s", blockSize);
+
+    checkOpen();
+
+    LOG.trace("create(hadoopPath: {}, overwrite: {}, bufferSize: {} [ignored])", hadoopPath,
+        overwrite, bufferSize);
+
+    CreateOptions.WriteMode writeMode =
+        overwrite ?
CreateOptions.WriteMode.OVERWRITE : CreateOptions.WriteMode.CREATE_NEW; + FSDataOutputStream response = new FSDataOutputStream( + new GoogleHadoopOutputStream(this, getGcsPath(hadoopPath), + CreateOptions.builder().setWriteMode(writeMode).build(), statistics), statistics); + + return response; + } + + @Override + public FSDataOutputStream createNonRecursive(Path hadoopPath, FsPermission permission, + EnumSet flags, int bufferSize, short replication, long blockSize, + Progressable progress) throws IOException { + throw new UnsupportedOperationException(hadoopPath.toString()); + } + + @Override + public FSDataOutputStream append(final Path path, final int i, final Progressable progressable) + throws IOException { + throw new UnsupportedOperationException(path.toString()); + } + + @Override + public boolean rename(final Path path, final Path path1) throws IOException { + LOG.trace("rename({}, {})", path, path1); + throw new UnsupportedOperationException(path.toString()); + } + + @Override + public boolean delete(final Path path, final boolean recursive) throws IOException { + LOG.trace("delete({}, {})", path, recursive); + throw new UnsupportedOperationException(path.toString()); + } + + @Override + public FileStatus[] listStatus(final Path path) throws FileNotFoundException, IOException { + checkArgument(path != null, "hadoopPath must not be null"); + + checkOpen(); + + LOG.trace("listStatus(hadoopPath: {})", path); + throw new UnsupportedOperationException(path.toString()); + } + + /** + * Overridden to make root its own parent. This is POSIX compliant, but more importantly guards + * against poor directory accounting in the PathData class of Hadoop 2's FsShell. 
+   */
+  @Override
+  public Path makeQualified(final Path path) {
+    Path qualifiedPath = super.makeQualified(path);
+
+    URI uri = qualifiedPath.toUri();
+
+    checkState("".equals(uri.getPath()) || qualifiedPath.isAbsolute(),
+        "Path '%s' must be fully qualified.", qualifiedPath);
+
+    Path result;
+    String upath = uri.getPath();
+
+    // Strip initial '..'s to make root its own parent.
+    int i = 0;
+    while (upath.startsWith("/../", i)) {
+      // Leave a preceding slash, so path is still absolute.
+      i += 3;
+    }
+    if (i == upath.length() || upath.substring(i).equals("/..")) {
+      // Allow a Path of gs://someBucket to map to gs://someBucket/
+      result = new Path(uri.getScheme(), uri.getAuthority(), "/");
+    } else if (i == 0) {
+      result = qualifiedPath;
+    } else {
+      result = new Path(uri.getScheme(), uri.getAuthority(), upath.substring(i));
+    }
+
+    LOG.trace("makeQualified(path: {}): {}", path, result);
+    return result;
+  }
+
+  /**
+   * Returns a URI of the root of this FileSystem.
+   */
+  @Override
+  public URI getUri() {
+    return fsRoot.toUri();
+  }
+
+  /**
+   * The default port is listed as -1 as an indication that ports are not used.
+   */
+  @Override
+  protected int getDefaultPort() {
+    int result = -1;
+    LOG.trace("getDefaultPort(): {}", result);
+    return result;
+  }
+
+  @Override
+  public boolean hasPathCapability(final Path path, final String capability) {
+    checkNotNull(path, "path must not be null");
+    checkArgument(!isNullOrEmpty(capability), "capability must not be null or empty string for %s",
+        path);
+    switch (Ascii.toLowerCase(capability)) {
+    case CommonPathCapabilities.FS_APPEND:
+    case CommonPathCapabilities.FS_CONCAT:
+      return false;
+    default:
+      return false;
+    }
+  }
+
+  /**
+   * Gets the current working directory.
+   *
+   * @return The current working directory.
+ */ + @Override + public Path getWorkingDirectory() { + LOG.trace("getWorkingDirectory(): {}", workingDirectory); + return workingDirectory; + } + + @Override + public boolean mkdirs(final Path path, final FsPermission fsPermission) throws IOException { + LOG.trace("mkdirs({})", path); + throw new UnsupportedOperationException(path.toString()); + } + +// /** +// * Gets the default replication factor. +// */ +// @Override +// public short getDefaultReplication() { +// return REPLICATION_FACTOR_DEFAULT; +// } + + @Override + public FileStatus getFileStatus(final Path path) throws IOException { + checkArgument(path != null, "path must not be null"); + + checkOpen(); + + URI gcsPath = getGcsPath(path); + + LOG.trace("getFileStatus(): {}", gcsPath); + + throw new UnsupportedOperationException(path.toString()); + } + + /** + * Returns home directory of the current user. + * + *

Note: This directory is only used for Hadoop purposes. It is not the same as a user's OS + * home directory. + */ + @Override + public Path getHomeDirectory() { + Path result = new Path(fsRoot, "user/" + System.getProperty("user.name")); + LOG.trace("getHomeDirectory(): {}", result); + return result; + } + + /** + * {@inheritDoc} + * + *

Returns the service if delegation tokens are configured, otherwise, null. + */ + @Override + public String getCanonicalServiceName() { + // TODO: Add delegation token support + return null; + } + + /** + * Gets GCS FS instance. + */ + GoogleCloudStorageFileSystem getGcsFs() { + return gcsFs; + } + + /** + * Assert that the FileSystem has been initialized and not close()d. + */ + private void checkOpen() throws IOException { + if (isClosed) { + throw new IOException("GoogleHadoopFileSystem has been closed or not initialized."); + } + } + + @Override + public void close() throws IOException { + LOG.trace("close()"); + if (isClosed) { + return; + } + + super.close(); + + getGcsFs().close(); + + this.isClosed = true; + } + + @Override + public long getUsed() throws IOException { + long result = super.getUsed(); + LOG.trace("getUsed(): {}", result); + return result; + } + +// @Override +// public long getDefaultBlockSize() { +// LOG.trace("getDefaultBlockSize(): {}", defaultBlockSize); +// return defaultBlockSize; +// } + + @Override + public void setWorkingDirectory(final Path hadoopPath) { + checkArgument(hadoopPath != null, "hadoopPath must not be null"); + URI gcsPath = UriPaths.toDirectory(getGcsPath(hadoopPath)); + workingDirectory = getHadoopPath(gcsPath); + LOG.trace("setWorkingDirectory(hadoopPath: {}): {}", hadoopPath, workingDirectory); + } +} diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleHadoopFileSystemConfiguration.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleHadoopFileSystemConfiguration.java new file mode 100644 index 0000000000000..16d940b16f49c --- /dev/null +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleHadoopFileSystemConfiguration.java @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.gs; + +import static java.lang.Math.toIntExact; + +import org.apache.hadoop.conf.Configuration; + +/** + * This class provides a configuration for the {@link GoogleHadoopFileSystem} implementations. + */ +class GoogleHadoopFileSystemConfiguration { + /** + * Configuration key for default block size of a file. + * + *

Note that this is the size that is reported to Hadoop FS clients. It does not modify the + * actual block size of an underlying GCS object, because GCS JSON API does not allow modifying or + * querying the value. Modifying this value allows one to control how many mappers are used to + * process a given file. + */ + public static final HadoopConfigurationProperty BLOCK_SIZE = + new HadoopConfigurationProperty<>("fs.gs.block.size", 64 * 1024 * 1024L); + + /** + * Configuration key for GCS project ID. Default value: none + */ + public static final HadoopConfigurationProperty GCS_PROJECT_ID = + new HadoopConfigurationProperty<>("fs.gs.project.id"); + + /** + * Configuration key for initial working directory of a GHFS instance. Default value: '/' + */ + public static final HadoopConfigurationProperty GCS_WORKING_DIRECTORY = + new HadoopConfigurationProperty<>("fs.gs.working.dir", "/"); + + /** + * Configuration key for setting write buffer size. + */ + public static final HadoopConfigurationProperty GCS_OUTPUT_STREAM_BUFFER_SIZE = + new HadoopConfigurationProperty<>("fs.gs.outputstream.buffer.size", 8L * 1024 * 1024); + + private final String workingDirectory; + + public int getOutStreamBufferSize() { + return outStreamBufferSize; + } + + private final int outStreamBufferSize; + + GoogleHadoopFileSystemConfiguration(Configuration config) { + this.workingDirectory = GCS_WORKING_DIRECTORY.get(config, config::get); + this.outStreamBufferSize = + toIntExact(GCS_OUTPUT_STREAM_BUFFER_SIZE.get(config, config::getLongBytes)); + } + + public String getWorkingDirectory() { + return this.workingDirectory; + } +} diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleHadoopOutputStream.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleHadoopOutputStream.java new file mode 100644 index 0000000000000..747d9f001c517 --- /dev/null +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleHadoopOutputStream.java @@ -0,0 
+1,124 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.gs;
+
+import java.io.BufferedOutputStream;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.net.URI;
+import java.nio.channels.Channels;
+import java.nio.channels.ClosedChannelException;
+import java.nio.channels.WritableByteChannel;
+import javax.annotation.Nonnull;
+
+import org.apache.hadoop.fs.FileAlreadyExistsException;
+import org.apache.hadoop.fs.FileSystem;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+class GoogleHadoopOutputStream extends OutputStream {
+  private static final Logger LOG = LoggerFactory.getLogger(GoogleHadoopOutputStream.class);
+
+  private final GoogleHadoopFileSystem ghfs;
+
+  // Path of the file to write to.
+  private final URI dstGcsPath;
+
+  private OutputStream outputStream;
+
+  // Statistics tracker provided by the parent GoogleHadoopFileSystem for recording
+  // numbers of bytes written.
+  private final FileSystem.Statistics statistics;
+
+  /**
+   * Constructs an instance of GoogleHadoopOutputStream object.
+   *
+   * @param ghfs Instance of {@link GoogleHadoopFileSystem}.
+   * @param dstGcsPath Path of the file to write to.
+ * @param statistics File system statistics object. + * @param createFileOptions options for file creation + * @throws IOException if an IO error occurs. + */ + GoogleHadoopOutputStream(GoogleHadoopFileSystem ghfs, URI dstGcsPath, + CreateOptions createFileOptions, FileSystem.Statistics statistics) throws IOException { + LOG.trace("GoogleHadoopOutputStream(gcsPath: {}, createFileOptions: {})", dstGcsPath, + createFileOptions); + this.ghfs = ghfs; + this.dstGcsPath = dstGcsPath; + this.statistics = statistics; + + this.outputStream = createOutputStream(ghfs.getGcsFs(), dstGcsPath, createFileOptions, + ghfs.getFileSystemConfiguration()); + } + + private static OutputStream createOutputStream(GoogleCloudStorageFileSystem gcsfs, URI gcsPath, + CreateOptions options, GoogleHadoopFileSystemConfiguration fileSystemConfiguration) + throws IOException { + WritableByteChannel channel; + try { + channel = gcsfs.create(gcsPath, options); + } catch (java.nio.file.FileAlreadyExistsException e) { + + throw (FileAlreadyExistsException) new FileAlreadyExistsException( + String.format("'%s' already exists", gcsPath)).initCause(e); + } + OutputStream outputStream = Channels.newOutputStream(channel); + int bufferSize = fileSystemConfiguration.getOutStreamBufferSize(); + return bufferSize > 0 ? 
new BufferedOutputStream(outputStream, bufferSize) : outputStream; + } + + @Override + public void write(int b) throws IOException { + throwIfNotOpen(); + outputStream.write(b); + statistics.incrementBytesWritten(1); + statistics.incrementWriteOps(1); + } + + @Override + public void write(@Nonnull byte[] b, int offset, int len) throws IOException { + throwIfNotOpen(); + outputStream.write(b, offset, len); + statistics.incrementBytesWritten(len); + statistics.incrementWriteOps(1); + } + + @Override + public void close() throws IOException { + LOG.trace("close(): final destination: {}", dstGcsPath); + + if (outputStream == null) { + LOG.trace("close(): Ignoring; stream already closed."); + return; + } + + try { + outputStream.close(); + } finally { + outputStream = null; + } + } + + private void throwIfNotOpen() throws IOException { + if (outputStream == null) { + throw new ClosedChannelException(); + } + } +} diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/HadoopConfigurationProperty.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/HadoopConfigurationProperty.java new file mode 100644 index 0000000000000..9360290a09c5b --- /dev/null +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/HadoopConfigurationProperty.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.gs; + +import org.apache.hadoop.thirdparty.com.google.common.collect.ImmutableList; + +import java.util.List; +import java.util.function.BiFunction; + +import org.apache.hadoop.conf.Configuration; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Hadoop configuration property. + */ +class HadoopConfigurationProperty { + private static final Logger LOG = LoggerFactory.getLogger(HadoopConfigurationProperty.class); + + private final String key; + private final List deprecatedKeys; + private final T defaultValue; + + private List keyPrefixes = ImmutableList.of(""); + + HadoopConfigurationProperty(String key) { + this(key, null); + } + + HadoopConfigurationProperty(String key, T defaultValue, String... deprecatedKeys) { + this.key = key; + this.deprecatedKeys = + deprecatedKeys == null ? 
ImmutableList.of() : ImmutableList.copyOf(deprecatedKeys); + this.defaultValue = defaultValue; + } + + String getKey() { + return key; + } + + T getDefault() { + return defaultValue; + } + + T get(Configuration config, BiFunction getterFn) { + String lookupKey = getLookupKey(config, key, (c, k) -> c.get(k) != null); + return logProperty(lookupKey, getterFn.apply(lookupKey, defaultValue)); + } + + private String getLookupKey(Configuration config, String lookupKey, + BiFunction checkFn) { + for (String prefix : keyPrefixes) { + String prefixedKey = prefix + lookupKey; + if (checkFn.apply(config, prefixedKey)) { + return prefixedKey; + } + for (String deprecatedKey : deprecatedKeys) { + String prefixedDeprecatedKey = prefix + deprecatedKey; + if (checkFn.apply(config, prefixedDeprecatedKey)) { + LOG.warn("Using deprecated key '{}', use '{}' key instead.", prefixedDeprecatedKey, + prefixedKey); + return prefixedDeprecatedKey; + } + } + } + return keyPrefixes.get(0) + lookupKey; + } + + private static S logProperty(String key, S value) { + LOG.trace("{} = {}", key, value); + return value; + } +} diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/StorageResourceId.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/StorageResourceId.java new file mode 100644 index 0000000000000..5935564feedfa --- /dev/null +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/StorageResourceId.java @@ -0,0 +1,328 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.gs; + +import static org.apache.hadoop.thirdparty.com.google.common.base.Preconditions.checkArgument; +import static org.apache.hadoop.thirdparty.com.google.common.base.Preconditions.checkNotNull; +import static org.apache.hadoop.thirdparty.com.google.common.base.Strings.isNullOrEmpty; +import static org.apache.hadoop.fs.gs.Constants.SCHEME; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.net.URI; +import java.util.Objects; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * Data struct representing either a GCS StorageObject, a GCS Bucket or the GCS root (gs://). If + * both bucketName and objectName are null, the StorageResourceId refers to GCS root (gs://). If + * bucketName is non-null, and objectName is null, then this refers to a GCS Bucket. Otherwise, if + * bucketName and objectName are both non-null, this refers to a GCS StorageObject. + */ +class StorageResourceId { + + private static final Logger LOG = LoggerFactory.getLogger(StorageResourceId.class); + + // The generationId used to denote "unknown"; if given to a method expecting generationId + // constraints, the method may perform extra low-level GETs to determine an existing + // generationId + // if idempotency constraints require doing so. + static final long UNKNOWN_GENERATION_ID = -1L; + + // Pattern that parses out bucket and object names. 
+ // Given 'gs://foo-bucket/foo/bar/baz', matcher.group(x) will return: + // 0 = gs://foo-bucket/foo/bar/baz + // 1 = foo-bucket/foo/bar/baz + // 2 = foo-bucket + // 3 = /foo/bar/baz + // 4 = foo/bar/baz + // Groups 2 and 4 can be used to create an instance. + private static final Pattern GCS_PATH_PATTERN = Pattern.compile("gs://(([^/]+)(/(.+)?)?)?"); + + // The singleton instance identifying the GCS root (gs://). Both getObjectName() and + // getBucketName() will return null. + static final StorageResourceId ROOT = new StorageResourceId(); + + // Bucket name of this storage resource to be used with the Google Cloud Storage API. + private final String bucketName; + + // Object name of this storage resource to be used with the Google Cloud Storage API. + private final String objectName; + + // Human-readable String to be returned by toString(); kept as 'final' member for efficiency. + private final String stringPath; + + // The generationId to be used with precondition checks when using this StorageResourceId + // as an identifier for mutation requests. + private final long generationId; + + /** + * Constructor for a StorageResourceId that refers to the GCS root (gs://). Private because all + * external users should just use the singleton StorageResourceId.ROOT. + */ + private StorageResourceId() { + this.bucketName = null; + this.objectName = null; + this.stringPath = StringPaths.fromComponents(bucketName, objectName); + this.generationId = UNKNOWN_GENERATION_ID; + } + + /** + * Constructor for a StorageResourceId representing a Bucket; {@code getObjectName()} will return + * null for a StorageResourceId that represents a Bucket. + * + * @param bucketName The bucket name of the resource. Must be non-empty and non-null. 
+ */ + StorageResourceId(String bucketName) { + checkArgument(!isNullOrEmpty(bucketName), "bucketName must not be null or empty"); + + this.bucketName = bucketName; + this.objectName = null; + this.stringPath = StringPaths.fromComponents(bucketName, objectName); + this.generationId = UNKNOWN_GENERATION_ID; + } + + /** + * Constructor for a StorageResourceId representing a full StorageObject, including bucketName and + * objectName. + * + * @param bucketName The bucket name of the resource. Must be non-empty and non-null. + * @param objectName The object name of the resource. Must be non-empty and non-null. + */ + StorageResourceId(String bucketName, String objectName) { + checkArgument(!isNullOrEmpty(bucketName), "bucketName must not be null or empty"); + checkArgument(!isNullOrEmpty(objectName), "objectName must not be null or empty"); + + this.bucketName = bucketName; + this.objectName = objectName; + this.stringPath = StringPaths.fromComponents(bucketName, objectName); + this.generationId = UNKNOWN_GENERATION_ID; + } + + /** + * Constructor for a StorageResourceId representing a full StorageObject, including bucketName and + * objectName. + * + * @param bucketName The bucket name of the resource. Must be non-empty and non-null. + * @param objectName The object name of the resource. Must be non-empty and non-null. + * @param generationId The generationId to be used with precondition checks when using this + * StorageResourceId as an identifier for mutation requests. 
+ */ + StorageResourceId(String bucketName, String objectName, long generationId) { + checkArgument(!isNullOrEmpty(bucketName), "bucketName must not be null or empty"); + checkArgument(!isNullOrEmpty(objectName), "objectName must not be null or empty"); + + this.bucketName = bucketName; + this.objectName = objectName; + this.stringPath = StringPaths.fromComponents(bucketName, objectName); + this.generationId = generationId; + } + + /** + * Constructor for a StorageResourceId representing a full StorageObject, including bucketName and + * objectName. + * + * @param bucketName The bucket name of the resource. Must be non-empty and non-null. + * @param generationId The generationId to be used with precondition checks when using this + * StorageResourceId as an identifier for mutation requests. + */ + StorageResourceId(String bucketName, long generationId) { + checkArgument(!isNullOrEmpty(bucketName), "bucketName must not be null or empty"); + this.bucketName = bucketName; + this.objectName = null; + this.stringPath = StringPaths.fromComponents(bucketName, objectName); + this.generationId = generationId; + } + + /** + * Returns true if this StorageResourceId represents a GCS StorageObject; if true, both {@code + * getBucketName} and {@code getObjectName} will be non-empty and non-null. + */ + boolean isStorageObject() { + return bucketName != null && objectName != null; + } + + /** + * Returns true if this StorageResourceId represents a GCS Bucket; if true, then {@code + * getObjectName} will return null. + */ + boolean isBucket() { + return bucketName != null && objectName == null; + } + + /** + * Returns true if this StorageResourceId represents the GCS root (gs://); if true, then both + * {@code getBucketName} and {@code getObjectName} will be null. 
+ */ + boolean isRoot() { + return bucketName == null && objectName == null; + } + + /** + * Indicates if this StorageResourceId corresponds to a 'directory'; similar to {@link + * FileInfo#isDirectory} except deals entirely with pathnames instead of also checking for + * exists() to be true on a corresponding GoogleCloudStorageItemInfo. + */ + boolean isDirectory() { + return isRoot() || isBucket() || StringPaths.isDirectoryPath(objectName); + } + + /** + * Gets the bucket name component of this resource identifier. + */ + String getBucketName() { + return bucketName; + } + + /** + * Gets the object name component of this resource identifier. + */ + String getObjectName() { + return objectName; + } + + /** + * The generationId to be used with precondition checks when using this StorageResourceId as an + * identifier for mutation requests. The generationId is *not* used when determining equals() or + * hashCode(). + */ + long getGenerationId() { + return generationId; + } + + /** + * Returns true if generationId is not UNKNOWN_GENERATION_ID. + */ + boolean hasGenerationId() { + return generationId != UNKNOWN_GENERATION_ID; + } + + /** + * Returns a string of the form {@code gs:///}. + */ + @Override + public String toString() { + return stringPath; + } + + @Override + public boolean equals(Object obj) { + if (obj instanceof StorageResourceId) { + StorageResourceId other = (StorageResourceId) obj; + return Objects.equals(bucketName, other.bucketName) && Objects.equals(objectName, + other.objectName); + } + return false; + } + + @Override + public int hashCode() { + return stringPath.hashCode(); + } + + /** + * Converts StorageResourceId instance to look like a directory path. If the path already looks + * like a directory path then this call is a no-op. + * + * @return A resourceId with a directory path corresponding to the given resourceId. 
+ */ + StorageResourceId toDirectoryId() { + if (isStorageObject() && !StringPaths.isDirectoryPath(getObjectName())) { + return new StorageResourceId(getBucketName(), StringPaths.toDirectoryPath(getObjectName())); + } + return this; + } + + /** + * Parses {@link StorageResourceId} from specified string. + */ + static StorageResourceId fromStringPath(String path) { + return fromStringPath(path, UNKNOWN_GENERATION_ID); + } + + /** + * Parses {@link StorageResourceId} from specified string and generationId. + */ + static StorageResourceId fromStringPath(String path, long generationId) { + checkArgument(path != null, "path must not be null"); + + Matcher matcher = GCS_PATH_PATTERN.matcher(path); + checkArgument(matcher.matches(), "'%s' is not a valid GCS object name.", path); + + String bucketName = matcher.group(2); + String relativePath = matcher.group(4); + if (bucketName == null) { + checkArgument(generationId == UNKNOWN_GENERATION_ID, + "Cannot specify generationId '%s' for root object '%s'", generationId, path); + return ROOT; + } else if (relativePath != null) { + return new StorageResourceId(bucketName, relativePath, generationId); + } + checkArgument(generationId == UNKNOWN_GENERATION_ID, + "Cannot specify generationId '%s' for bucket '%s'", generationId, path); + return new StorageResourceId(bucketName); + } + + /** + * Validates the given URI and if valid, returns the associated StorageResourceId. + * + * @param path The GCS URI to validate. + * @param allowEmptyObjectName If true, a missing object name is not considered invalid. + * @return a StorageResourceId that may be the GCS root, a Bucket, or a StorageObject. + */ + static StorageResourceId fromUriPath(URI path, boolean allowEmptyObjectName) { + return fromUriPath(path, allowEmptyObjectName, UNKNOWN_GENERATION_ID); + } + + /** + * Validates the given URI and if valid, returns the associated StorageResourceId. + * + * @param path The GCS URI to validate. 
+ * @param allowEmptyObjectName If true, a missing object name is not considered invalid. + * @param generationId The generationId to be used with precondition checks when + * using this + * @return a StorageResourceId that may be the GCS root, a Bucket, or a StorageObject. + */ + static StorageResourceId fromUriPath(URI path, boolean allowEmptyObjectName, + long generationId) { + LOG.trace("fromUriPath('{}', {})", path, allowEmptyObjectName); + checkNotNull(path); + + if (!SCHEME.equals(path.getScheme())) { + throw new IllegalArgumentException( + String.format("GCS path supports only '%s' scheme, instead got '%s' from '%s'.", SCHEME, + path.getScheme(), path)); + } + + if (path.equals(GoogleCloudStorageFileSystem.GCSROOT)) { + return ROOT; + } + + String bucketName = StringPaths.validateBucketName(path.getAuthority()); + // Note that we're using getPath here instead of rawPath, etc. This is because it is assumed + // that the path was properly encoded in getPath (or another similar method): + String objectName = StringPaths.validateObjectName(path.getPath(), allowEmptyObjectName); + + return isNullOrEmpty(objectName) ? + new StorageResourceId(bucketName, generationId) : + new StorageResourceId(bucketName, objectName, generationId); + } +} diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/StringPaths.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/StringPaths.java new file mode 100644 index 0000000000000..80682c3ed2a31 --- /dev/null +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/StringPaths.java @@ -0,0 +1,169 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.gs;
+
+import static org.apache.hadoop.thirdparty.com.google.common.base.Strings.isNullOrEmpty;
+import static org.apache.hadoop.fs.gs.Constants.PATH_DELIMITER;
+
+import org.apache.hadoop.thirdparty.com.google.common.base.CharMatcher;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Utility methods for String GCS paths.
+ */
+final class StringPaths {
+
+  private static final Logger LOG = LoggerFactory.getLogger(StringPaths.class);
+
+  private StringPaths() {
+  }
+
+  // 14x faster (20ns vs 280ns) than "^[a-z0-9_.-]+$" regex
+  private static final CharMatcher BUCKET_NAME_CHAR_MATCHER = CharMatcher.ascii()
+      .and(CharMatcher.inRange('0', '9').or(CharMatcher.inRange('a', 'z'))
+          .or(CharMatcher.anyOf("_.-")))
+      .precomputed();
+
+  /**
+   * Validate the given bucket name to make sure that it can be used as a part of a file system
+   * path.
+   *
+   *
<p>
Note: this is not designed to duplicate the exact checks that GCS would perform on the + * server side. We make some checks that are relevant to using GCS as a file system. + * + * @param bucketName Bucket name to check. + */ + static String validateBucketName(String bucketName) { + // If the name ends with '/', remove it. + bucketName = toFilePath(bucketName); + + if (isNullOrEmpty(bucketName)) { + throw new IllegalArgumentException("GCS bucket name cannot be empty."); + } + + if (!BUCKET_NAME_CHAR_MATCHER.matchesAllOf(bucketName)) { + throw new IllegalArgumentException(String.format( + "Invalid GCS bucket name '%s': bucket name must contain only 'a-z0-9_.-' characters.", + bucketName)); + } + + return bucketName; + } + + /** + * Validate the given object name to make sure that it can be used as a part of a file system + * path. + * + *
<p>
Note: this is not designed to duplicate the exact checks that GCS would perform on the + * server side. We make some checks that are relevant to using GCS as a file system. + * + * @param objectName Object name to check. + * @param allowEmptyObjectName If true, a missing object name is not considered invalid. + */ + static String validateObjectName(String objectName, boolean allowEmptyObjectName) { + LOG.trace("validateObjectName('{}', {})", objectName, allowEmptyObjectName); + + if (isNullOrEmpty(objectName) || objectName.equals(PATH_DELIMITER)) { + if (allowEmptyObjectName) { + objectName = ""; + } else { + throw new IllegalArgumentException(String.format( + "GCS path must include non-empty object name [objectName='%s'," + + " allowEmptyObjectName=%s]", objectName, allowEmptyObjectName)); + } + } + + // We want objectName to look like a traditional file system path, + // therefore, disallow objectName with consecutive '/' chars. + for (int i = 0; i < (objectName.length() - 1); i++) { + if (objectName.charAt(i) == '/' && objectName.charAt(i + 1) == '/') { + throw new IllegalArgumentException( + String.format("GCS path must not have consecutive '/' characters: '%s'", objectName)); + } + } + + // Remove leading '/' if it exists. + if (objectName.startsWith(PATH_DELIMITER)) { + objectName = objectName.substring(1); + } + + LOG.trace("validateObjectName -> '{}'", objectName); + return objectName; + } + + /** + * Helper for standardizing the way various human-readable messages in logs/exceptions that refer + * to a bucket/object pair. + */ + public static String fromComponents(String bucketName, String objectName) { + if (bucketName == null && objectName != null) { + throw new IllegalArgumentException( + String.format("Invalid bucketName/objectName pair: gs://%s/%s", "", objectName)); + } + // TODO(user): Unify this method with other methods that convert bucketName/objectName + // to a URI; maybe use the single slash for compatibility. 
+ StringBuilder result = new StringBuilder("gs://"); + if (bucketName != null) { + result.append(bucketName); + } + if (objectName != null) { + result.append('/').append(objectName); + } + return result.toString(); + } + + /** + * Indicates whether the given object name looks like a directory path. + * + * @param path Name of the object to inspect. + * @return Whether the given object name looks like a directory path. + */ + public static boolean isDirectoryPath(String path) { + return !isNullOrEmpty(path) && path.endsWith(PATH_DELIMITER); + } + + /** + * Converts the given object name to look like a file path. If the object name already looks like + * a file path then this call is a no-op. + * + *
<p>
If the object name is null or empty, it is returned as-is. + * + * @param path Name of the object to inspect. + * @return File path for the given path. + */ + public static String toFilePath(String path) { + return !isNullOrEmpty(path) && isDirectoryPath(path) ? + path.substring(0, path.length() - 1) : + path; + } + + /** + * Converts the given object name to look like a directory path. If the object name already looks + * like a directory path then this call is a no-op. + * + *
<p>
If the object name is null or empty, it is returned as-is. + * + * @param path Name of the object to inspect. + * @return Directory path for the given path. + */ + static String toDirectoryPath(String path) { + return isNullOrEmpty(path) || isDirectoryPath(path) ? path : path + PATH_DELIMITER; + } +} diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/UriPaths.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/UriPaths.java new file mode 100644 index 0000000000000..30e13cb33cbf1 --- /dev/null +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/UriPaths.java @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.gs; + +import static org.apache.hadoop.thirdparty.com.google.common.base.Preconditions.checkNotNull; +import static org.apache.hadoop.fs.gs.Constants.PATH_DELIMITER; +import static org.apache.hadoop.fs.gs.Constants.SCHEME; + +import java.net.URI; +import java.net.URISyntaxException; + +/** + * Utility methods for URI GCS paths. + */ +final class UriPaths { + + private UriPaths() { + } + + /** + * Converts the given path to look like a directory path. 
If the path already looks like a + * directory path then this call is a no-op. + * + * @param path Path to convert. + * @return Directory path for the given path. + */ + public static URI toDirectory(URI path) { + StorageResourceId resourceId = StorageResourceId.fromUriPath(path, true); + + if (resourceId.isStorageObject() && !resourceId.isDirectory()) { + resourceId = resourceId.toDirectoryId(); + path = fromResourceId(resourceId, /* allowEmptyObjectName= */ false); + } + return path; + } + + /** + * Gets the parent directory of the given path. + * + * @param path Path to convert. + * @return Path of parent directory of the given item or null for root path. + */ + public static URI getParentPath(URI path) { + checkNotNull(path); + + // Root path has no parent. + if (path.equals(GoogleCloudStorageFileSystem.GCSROOT)) { + return null; + } + + StorageResourceId resourceId = StorageResourceId.fromUriPath(path, true); + + if (resourceId.isBucket()) { + return GoogleCloudStorageFileSystem.GCSROOT; + } + + String objectName = resourceId.getObjectName(); + int index = StringPaths.isDirectoryPath(objectName) ? + objectName.lastIndexOf(PATH_DELIMITER, objectName.length() - 2) : + objectName.lastIndexOf(PATH_DELIMITER); + return index < 0 ? + fromStringPathComponents(resourceId.getBucketName(), /* objectName= */ + null, /* allowEmptyObjectName= */ true) : + fromStringPathComponents(resourceId.getBucketName(), objectName.substring(0, index + 1), + /* allowEmptyObjectName= */ false); + } + + /** + * Constructs and returns full path for the given bucket and object names. + */ + public static URI fromResourceId(StorageResourceId resourceId, boolean allowEmptyObjectName) { + return fromStringPathComponents(resourceId.getBucketName(), resourceId.getObjectName(), + allowEmptyObjectName); + } + + /** + * Constructs and returns full path for the given bucket and object names. 
+ */ + public static URI fromStringPathComponents(String bucketName, String objectName, + boolean allowEmptyObjectName) { + if (allowEmptyObjectName && bucketName == null && objectName == null) { + return GoogleCloudStorageFileSystem.GCSROOT; + } + + String authority = StringPaths.validateBucketName(bucketName); + String path = PATH_DELIMITER + StringPaths.validateObjectName(objectName, allowEmptyObjectName); + + try { + return new URI(SCHEME, authority, path, + /* query= */ null, + /* fragment= */ null); + } catch (URISyntaxException e) { + throw new IllegalArgumentException( + String.format("Invalid bucket name (%s) or object name (%s)", bucketName, objectName), e); + } + } +} diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/VerificationAttributes.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/VerificationAttributes.java new file mode 100644 index 0000000000000..4155482fc7d33 --- /dev/null +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/VerificationAttributes.java @@ -0,0 +1,68 @@ +/* + * Copyright 2016 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.gs; + +import java.util.Arrays; +import javax.annotation.Nullable; + +/** + * GCS provided validation attributes for a single object. 
+ */ +class VerificationAttributes { + private final byte[] md5hash; + private final byte[] crc32c; + + VerificationAttributes(@Nullable byte[] md5hash, @Nullable byte[] crc32c) { + this.md5hash = md5hash; + this.crc32c = crc32c; + } + + /** + * MD5 hash of an object, if available. + */ + @Nullable + byte[] getMd5hash() { + return md5hash; + } + + /** + * CRC32c checksum of an object, if available. + */ + @Nullable + byte[] getCrc32c() { + return crc32c; + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (!(o instanceof VerificationAttributes)) { + return false; + } + VerificationAttributes that = (VerificationAttributes) o; + return Arrays.equals(md5hash, that.md5hash) && Arrays.equals(crc32c, that.crc32c); + } + + @Override + public int hashCode() { + int result = Arrays.hashCode(md5hash); + result = 31 * result + Arrays.hashCode(crc32c); + return result; + } +} diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/package-info.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/package-info.java new file mode 100644 index 0000000000000..eedfb7822acd7 --- /dev/null +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/package-info.java @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Google Cloud Storage Filesystem. + */ + +package org.apache.hadoop.fs.gs; \ No newline at end of file diff --git a/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/TestStorageResourceId.java b/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/TestStorageResourceId.java new file mode 100644 index 0000000000000..e0a39b2d7e403 --- /dev/null +++ b/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/TestStorageResourceId.java @@ -0,0 +1,285 @@ +/* + * Copyright 2013 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.fs.gs; + +import java.net.URI; + +import org.junit.Test; + +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotSame; +import static org.junit.jupiter.api.Assertions.assertSame; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +public class TestStorageResourceId { + @Test + public void testConstructorInvalid() { + assertThrows(IllegalArgumentException.class, () -> { + new StorageResourceId(null); + }); + + assertThrows(IllegalArgumentException.class, () -> { + new StorageResourceId(""); + }); + + assertThrows(IllegalArgumentException.class, () -> { + new StorageResourceId(null, null); + }); + + assertThrows(IllegalArgumentException.class, () -> { + new StorageResourceId("foo", null); + }); + + assertThrows(IllegalArgumentException.class, () -> { + new StorageResourceId("", null); + }); + + assertThrows(IllegalArgumentException.class, () -> { + new StorageResourceId(null, null, 0L); + }); + + assertThrows(IllegalArgumentException.class, () -> { + new StorageResourceId("foo", null, 0L); + }); + + assertThrows(IllegalArgumentException.class, () -> { + new StorageResourceId("", null, 0L); + }); + + assertThrows(IllegalArgumentException.class, () -> { + new StorageResourceId(null, 0L); + }); + + assertThrows(IllegalArgumentException.class, () -> { + new StorageResourceId("", 0L); + }); + } + + @Test + public void testFromStringPathInvalid() { + assertThrows(IllegalArgumentException.class, () -> { + StorageResourceId.fromStringPath(null); + }); + + assertThrows(IllegalArgumentException.class, () -> { + StorageResourceId.fromStringPath(""); + }); + + 
assertThrows(IllegalArgumentException.class, () -> { + StorageResourceId.fromStringPath("foo"); + }); + + assertThrows(IllegalArgumentException.class, () -> { + StorageResourceId.fromStringPath("/foo/bar"); + }); + + assertThrows(IllegalArgumentException.class, () -> { + StorageResourceId.fromStringPath("gs:///foo/bar"); + }); + } + + @Test + public void testConstructor() { + String bucketName = "testbucketname"; + String objectName = "a/b/c.txt"; + + verify(new StorageResourceId(bucketName), bucketName, + StorageResourceId.UNKNOWN_GENERATION_ID, null, false, + true, true, false, false); + + verify(new StorageResourceId(bucketName, objectName), bucketName, + StorageResourceId.UNKNOWN_GENERATION_ID, objectName, false, + false, false, true, false); + + long genId = System.currentTimeMillis(); + verify(new StorageResourceId(bucketName, objectName, genId), bucketName, + genId, objectName, true, + false, false, true, false); + + verify(new StorageResourceId(bucketName, genId), bucketName, + genId, null, true, + true, true, false, false); + } + + @Test + public void testEqualsBucket() { + StorageResourceId bucket1 = new StorageResourceId("test-bucket"); + StorageResourceId bucket2 = new StorageResourceId("test-bucket"); + assertTrue(bucket1.equals(bucket2)); + assertEquals(bucket1.hashCode(), bucket2.hashCode()); + } + + @Test + public void testEqualsObject() { + StorageResourceId obj1 = new StorageResourceId("test-bucket", "test-object"); + StorageResourceId obj2 = new StorageResourceId("test-bucket", "test-object"); + assertTrue(obj1.equals(obj2)); + assertEquals(obj1.hashCode(), obj2.hashCode()); + } + + @Test + public void testEqualsDifferentBucket() { + StorageResourceId bucket1 = new StorageResourceId("test-bucket"); + StorageResourceId bucket2 = new StorageResourceId("other-bucket"); + assertFalse(bucket1.equals(bucket2)); + } + + @Test + public void testEqualsDifferentObject() { + StorageResourceId obj1 = new StorageResourceId("test-bucket", "test-object"); + 
StorageResourceId obj2 = new StorageResourceId("test-bucket", "other-object"); + assertFalse(obj1.equals(obj2)); + } + + @Test + public void testToDirectoryIdFromFile() { + StorageResourceId fileId = new StorageResourceId("my-bucket", "path/to/file.txt"); + StorageResourceId dirId = fileId.toDirectoryId(); + + assertNotSame(fileId, dirId); // Should return a new instance + assertTrue(dirId.isDirectory()); + assertEquals("my-bucket", dirId.getBucketName()); + assertEquals("path/to/file.txt/", dirId.getObjectName()); + assertEquals(fileId.getGenerationId(), dirId.getGenerationId()); + } + + @Test + public void testToDirectoryIdFromDirectoryObject() { + StorageResourceId dirIdOriginal = new StorageResourceId("my-bucket", "path/to/dir/"); + StorageResourceId dirIdConverted = dirIdOriginal.toDirectoryId(); + + assertSame(dirIdOriginal, dirIdConverted); // Should return the same instance + assertTrue(dirIdConverted.isDirectory()); + assertEquals("path/to/dir/", dirIdConverted.getObjectName()); + } + + @Test + public void testToDirectoryIdFromBucket() { + StorageResourceId bucketId = new StorageResourceId("my-bucket"); + StorageResourceId convertedId = bucketId.toDirectoryId(); + assertSame(bucketId, convertedId); + assertTrue(convertedId.isBucket()); + } + + @Test + public void testFromStringPathRoot() { + StorageResourceId id = StorageResourceId.fromStringPath("gs://"); + assertTrue(id.isRoot()); + } + + @Test + public void testFromStringPathBucket() { + StorageResourceId id = StorageResourceId.fromStringPath("gs://my-bucket"); + assertTrue(id.isBucket()); + assertEquals("my-bucket", id.getBucketName()); + assertNull(id.getObjectName()); + assertEquals(StorageResourceId.UNKNOWN_GENERATION_ID, id.getGenerationId()); + } + + @ParameterizedTest + @ValueSource(strings = { + "gs://my-bucket/object", + "gs://my-bucket/folder/file.txt", + "gs://my-bucket/folder/" + }) + public void testFromStringPathObject(String path) { + String expectedBucket = path.split("/")[2]; + String 
expectedObject = + path.substring(path.indexOf(expectedBucket) + expectedBucket.length() + 1); + + StorageResourceId id = StorageResourceId.fromStringPath(path); + assertTrue(id.isStorageObject()); + assertEquals(expectedBucket, id.getBucketName()); + assertEquals(expectedObject, id.getObjectName()); + assertEquals(StorageResourceId.UNKNOWN_GENERATION_ID, id.getGenerationId()); + } + + @Test + public void testFromStringPathObjectWithGenerationId() { + long genId = 12345L; + StorageResourceId id = StorageResourceId.fromStringPath("gs://my-bucket/object.txt", genId); + assertTrue(id.isStorageObject()); + assertEquals("my-bucket", id.getBucketName()); + assertEquals("object.txt", id.getObjectName()); + assertEquals(genId, id.getGenerationId()); + assertTrue(id.hasGenerationId()); + } + + @Test + public void testFromUriPathBucket() throws Exception { + URI uri = new URI("gs://my-bucket"); + StorageResourceId id = StorageResourceId.fromUriPath(uri, true); + assertTrue(id.isBucket()); + assertEquals("my-bucket", id.getBucketName()); + assertNull(id.getObjectName()); + } + + @Test + public void testFromUriPathObject() throws Exception { + URI uri = new URI("gs://my-bucket/path/to/file.txt"); + StorageResourceId id = StorageResourceId.fromUriPath(uri, false); + assertTrue(id.isStorageObject()); + assertEquals("my-bucket", id.getBucketName()); + assertEquals("path/to/file.txt", id.getObjectName()); + } + + @Test + public void testFromUriPathObjectWithGenerationId() throws Exception { + URI uri = new URI("gs://my-bucket/object.txt"); + long genId = 54321L; + StorageResourceId id = StorageResourceId.fromUriPath(uri, false, genId); + assertTrue(id.isStorageObject()); + assertEquals("my-bucket", id.getBucketName()); + assertEquals("object.txt", id.getObjectName()); + assertEquals(genId, id.getGenerationId()); + assertTrue(id.hasGenerationId()); + } + + @Test + public void testFromUriPathBucketWithGenerationId() throws Exception { + assertThrows(IllegalArgumentException.class, 
() -> { + URI uri = new URI("gs://my-bucket"); + long genId = 54321L; + StorageResourceId.fromUriPath(uri, false, genId); + }); + } + + private static void verify( + StorageResourceId id, + String bucketName, + long generationId, + String objectName, + boolean hasGenerationId, + boolean isBucket, + boolean isDirectory, + boolean isStorageObject, + boolean isRoot) { + assertEquals(bucketName, id.getBucketName()); + assertEquals(generationId, id.getGenerationId()); + assertEquals(objectName, id.getObjectName()); + assertEquals(hasGenerationId, id.hasGenerationId()); + assertEquals(isBucket, id.isBucket()); + assertEquals(isDirectory, id.isDirectory()); + assertEquals(isStorageObject, id.isStorageObject()); + assertEquals(isRoot, id.isRoot()); + } +} diff --git a/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/TestStringPaths.java b/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/TestStringPaths.java new file mode 100644 index 0000000000000..16234e0ce1d57 --- /dev/null +++ b/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/TestStringPaths.java @@ -0,0 +1,164 @@ +/* + * Copyright 2013 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.fs.gs; + +import org.junit.Test; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.junit.jupiter.api.Assertions.assertFalse; + +public class TestStringPaths { + @Test + public void testValidateBucketNameValid() { + assertEquals("my-bucket", StringPaths.validateBucketName("my-bucket")); + assertEquals("my.bucket", StringPaths.validateBucketName("my.bucket")); + assertEquals("my_bucket", StringPaths.validateBucketName("my_bucket")); + assertEquals("bucket123", StringPaths.validateBucketName("bucket123")); + assertEquals("a", StringPaths.validateBucketName("a")); + assertEquals("long-bucket-name-with-numbers-123", + StringPaths.validateBucketName("long-bucket-name-with-numbers-123")); + } + + @Test + public void testValidateBucketNameEndsWithSlash() { + assertEquals("my-bucket", StringPaths.validateBucketName("my-bucket/")); + assertEquals("another-bucket", StringPaths.validateBucketName("another-bucket/")); + } + + @Test(expected = IllegalArgumentException.class) + public void testValidateBucketNameEmpty() { + StringPaths.validateBucketName(""); + } + + @Test(expected = IllegalArgumentException.class) + public void testValidateBucketNameNull() { + StringPaths.validateBucketName(null); + } + + @Test(expected = IllegalArgumentException.class) + public void testValidateBucketNameInvalidChars() { + StringPaths.validateBucketName("my bucket"); // Space + } + + @Test(expected = IllegalArgumentException.class) + public void testValidateBucketNameInvalidChars2() { + StringPaths.validateBucketName("my@bucket"); // @ symbol + } + + @Test(expected = IllegalArgumentException.class) + public void testValidateBucketNameUpperCase() { + StringPaths.validateBucketName("MyBucket"); // Uppercase + } + + @Test + public void testValidateObjectNameValid() { + assertEquals("path/to/object", + 
StringPaths.validateObjectName("path/to/object", false)); + assertEquals("object", StringPaths.validateObjectName("object", false)); + assertEquals("dir/", + StringPaths.validateObjectName("dir/", false)); // Still valid after validation + assertEquals("", StringPaths.validateObjectName("/", true)); // Slash becomes empty if allowed + assertEquals("", StringPaths.validateObjectName("", true)); + } + + @Test + public void testValidateObjectNameLeadingSlash() { + assertEquals("path/to/object", StringPaths.validateObjectName("/path/to/object", false)); + assertEquals("object", StringPaths.validateObjectName("/object", false)); + } + + @Test(expected = IllegalArgumentException.class) + public void testValidateObjectNameEmptyNotAllowed() { + StringPaths.validateObjectName("", false); + } + + @Test(expected = IllegalArgumentException.class) + public void testValidateObjectNameNullNotAllowed() { + StringPaths.validateObjectName(null, false); + } + + @Test + public void testValidateObjectNameEmptyAllowed() { + assertEquals("", StringPaths.validateObjectName("", true)); + assertEquals("", StringPaths.validateObjectName(null, true)); + assertEquals("", StringPaths.validateObjectName("/", true)); // Single slash becomes empty + } + + @Test(expected = IllegalArgumentException.class) + public void testValidateObjectNameConsecutiveSlashes() { + StringPaths.validateObjectName("path//to/object", false); + } + + @Test(expected = IllegalArgumentException.class) + public void testValidateObjectNameConsecutiveSlashesAtStart() { + StringPaths.validateObjectName("//path/to/object", false); + } + + @Test(expected = IllegalArgumentException.class) + public void testValidateObjectNameConsecutiveSlashesAtEnd() { + StringPaths.validateObjectName("path/to/object//", false); + } + + @Test + public void testFromComponentsValid() { + assertEquals("gs://my-bucket/path/to/object", + StringPaths.fromComponents("my-bucket", "path/to/object")); + assertEquals("gs://my-bucket/dir/", 
StringPaths.fromComponents("my-bucket", "dir/")); + assertEquals("gs://my-bucket/", StringPaths.fromComponents("my-bucket", "")); + } + + @Test(expected = IllegalArgumentException.class) + public void testFromComponentsNullBucketNonNullObject() { + StringPaths.fromComponents(null, "path/to/object"); + } + + @Test + public void testFromComponentsNullBucketAndObject() { + assertEquals("gs://", StringPaths.fromComponents(null, null)); + } + + @Test + public void testIsDirectoryPath() { + assertTrue(StringPaths.isDirectoryPath("dir/")); + assertTrue(StringPaths.isDirectoryPath("path/to/dir/")); + assertFalse(StringPaths.isDirectoryPath("file.txt")); + assertFalse(StringPaths.isDirectoryPath("path/to/file.txt")); + assertFalse(StringPaths.isDirectoryPath("")); + assertFalse(StringPaths.isDirectoryPath(null)); + } + + @Test + public void testToFilePath() { + assertEquals("path/to/file", StringPaths.toFilePath("path/to/file/")); + assertEquals("file.txt", StringPaths.toFilePath("file.txt")); + assertEquals("dir", StringPaths.toFilePath("dir/")); + assertEquals("", StringPaths.toFilePath("")); + assertNull(StringPaths.toFilePath(null)); + } + + // --- Tests for toDirectoryPath --- + + @Test + public void testToDirectoryPath() { + assertEquals("path/to/dir/", StringPaths.toDirectoryPath("path/to/dir")); + assertEquals("dir/", StringPaths.toDirectoryPath("dir/")); + assertEquals("file/", StringPaths.toDirectoryPath("file")); + assertEquals("", StringPaths.toDirectoryPath("")); + assertNull(StringPaths.toDirectoryPath(null)); + } +} \ No newline at end of file diff --git a/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/TestUriPaths.java b/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/TestUriPaths.java new file mode 100644 index 0000000000000..fe93a28dc435c --- /dev/null +++ b/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/TestUriPaths.java @@ -0,0 +1,150 @@ +/* + * Copyright 2013 Google Inc. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.gs; + +import java.net.URI; + +import org.junit.Test; +import static org.junit.jupiter.api.Assertions.assertEquals; + +public class TestUriPaths { + @Test + public void testToDirectoryFile() throws Exception { + URI fileUri = new URI("gs://my-bucket/path/to/file.txt"); + URI expectedDirUri = new URI("gs://my-bucket/path/to/file.txt/"); + // Temporarily override the behavior for testing purposes + // This is not a clean mocking strategy for static methods, but demonstrates the test intent. + // In a real environment, you'd use PowerMock or refactor. 
+ URI result = UriPaths.toDirectory(fileUri); + assertEquals(expectedDirUri, result); + } + + @Test + public void testToDirectoryAlreadyDirectory() throws Exception { + URI dirUri = new URI("gs://my-bucket/path/to/dir/"); + URI result = UriPaths.toDirectory(dirUri); + assertEquals(dirUri, result); + } + + @Test + public void testToDirectoryRootBucket() throws Exception { + URI bucketUri = new URI("gs://my-bucket"); + URI result = UriPaths.toDirectory(bucketUri); + assertEquals(bucketUri, result); // Buckets are implicitly directories + } + + @Test + public void testGetParentPathFile() throws Exception { + URI uri = new URI("gs://my-bucket/path/to/file.txt"); + URI expectedParent = new URI("gs://my-bucket/path/to/"); + assertEquals(expectedParent, UriPaths.getParentPath(uri)); + } + + @Test + public void testGetParentPathDirectory() throws Exception { + URI uri = new URI("gs://my-bucket/path/to/dir/"); + URI expectedParent = new URI("gs://my-bucket/path/to/"); + assertEquals(expectedParent, UriPaths.getParentPath(uri)); + } + + @Test + public void testGetParentPathObjectAtBucketRoot() throws Exception { + URI uri = new URI("gs://my-bucket/file.txt"); + URI expectedParent = new URI("gs://my-bucket/"); + assertEquals(expectedParent, UriPaths.getParentPath(uri)); + } + + @Test + public void testGetParentPathDirectoryAtBucketRoot() throws Exception { + URI uri = new URI("gs://my-bucket/dir/"); + URI expectedParent = new URI("gs://my-bucket/"); + assertEquals(expectedParent, UriPaths.getParentPath(uri)); + } + + @Test + public void testGetParentPathBucket() throws Exception { + URI uri = new URI("gs://my-bucket"); + assertEquals(GoogleCloudStorageFileSystem.GCSROOT, UriPaths.getParentPath(uri)); + } + + @Test + public void testFromResourceIdObject() throws Exception { + StorageResourceId resourceId = new StorageResourceId("my-bucket", "path/to/object"); + URI expectedUri = new URI("gs://my-bucket/path/to/object"); + assertEquals(expectedUri, 
UriPaths.fromResourceId(resourceId, false)); + } + + @Test + public void testFromResourceIdDirectory() throws Exception { + StorageResourceId resourceId = new StorageResourceId("my-bucket", "path/to/dir/"); + URI expectedUri = new URI("gs://my-bucket/path/to/dir/"); + assertEquals(expectedUri, UriPaths.fromResourceId(resourceId, false)); + } + + @Test + public void testFromResourceIdBucket() throws Exception { + StorageResourceId resourceId = new StorageResourceId("my-bucket"); + URI expectedUri = new URI("gs://my-bucket/"); + assertEquals(expectedUri, UriPaths.fromResourceId(resourceId, true)); + } + + @Test + public void testFromResourceIdEmptyObjectAllowed() throws Exception { + StorageResourceId resourceId = new StorageResourceId("my-bucket"); + URI expectedUri = new URI("gs://my-bucket/"); + assertEquals(expectedUri, UriPaths.fromResourceId(resourceId, true)); + } + + @Test + public void testFromResourceIdNullObjectAllowed() throws Exception { + StorageResourceId resourceId = new StorageResourceId("my-bucket"); + URI expectedUri = new URI("gs://my-bucket/"); + assertEquals(expectedUri, UriPaths.fromResourceId(resourceId, true)); + } + + @Test + public void testFromStringPathComponentsValid() throws Exception { + assertEquals(new URI("gs://my-bucket/path/to/object"), + UriPaths.fromStringPathComponents("my-bucket", "path/to/object", false)); + assertEquals(new URI("gs://my-bucket/path/to/dir/"), + UriPaths.fromStringPathComponents("my-bucket", "path/to/dir/", false)); + assertEquals(new URI("gs://my-bucket/"), + UriPaths.fromStringPathComponents("my-bucket", null, true)); + assertEquals(new URI("gs://my-bucket/"), + UriPaths.fromStringPathComponents("my-bucket", "", true)); + } + + @Test(expected = IllegalArgumentException.class) + public void testFromStringPathComponentsNullBucketNameNotAllowed() { + UriPaths.fromStringPathComponents(null, "object", false); + } + + @Test(expected = IllegalArgumentException.class) + public void 
testFromStringPathComponentsEmptyObjectNameNotAllowed() { + UriPaths.fromStringPathComponents("my-bucket", "", false); + } + + @Test(expected = IllegalArgumentException.class) + public void testFromStringPathComponentsConsecutiveSlashes() { + UriPaths.fromStringPathComponents("my-bucket", "path//to/object", false); + } + + @Test(expected = IllegalArgumentException.class) + public void testFromStringPathComponentsInvalidBucketName() { + UriPaths.fromStringPathComponents("MyBucket", "object", false); // Uppercase + } +} \ No newline at end of file diff --git a/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/package-info.java b/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/package-info.java new file mode 100644 index 0000000000000..fe289cc6d3dc2 --- /dev/null +++ b/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/package-info.java @@ -0,0 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Google Cloud Storage Filesystem tests. 
+ */ +package org.apache.hadoop.fs.gs; \ No newline at end of file diff --git a/hadoop-tools/pom.xml b/hadoop-tools/pom.xml index 8c1256a177cc4..70df9b02d0fa9 100644 --- a/hadoop-tools/pom.xml +++ b/hadoop-tools/pom.xml @@ -53,6 +53,7 @@ hadoop-fs2img hadoop-benchmark hadoop-compat-bench + hadoop-gcp From 0b104fdbf2901762d3a1ae11d71dd1c88d70fb40 Mon Sep 17 00:00:00 2001 From: Arunkumar Chacko Date: Tue, 10 Jun 2025 04:53:19 +0000 Subject: [PATCH 2/8] HADOOP-19343: Add support for mkdir() and getFileStatus() Closes #7721 Signed-off-by: Chris Nauroth --- hadoop-project/pom.xml | 2 +- hadoop-tools/hadoop-gcp/pom.xml | 11 + .../hadoop/fs/gs/ApiErrorExtractor.java | 327 +++++++++++++++ .../hadoop/fs/gs/CreateBucketOptions.java | 81 ++++ .../hadoop/fs/gs/CreateObjectOptions.java | 127 ++++++ .../hadoop/fs/gs/ErrorTypeExtractor.java | 47 ++- .../org/apache/hadoop/fs/gs/FileInfo.java | 14 +- .../hadoop/fs/gs/GoogleCloudStorage.java | 385 +++++++++++++++++- .../fs/gs/GoogleCloudStorageFileSystem.java | 285 +++++++++++++ .../fs/gs/GoogleCloudStorageItemInfo.java | 14 +- .../hadoop/fs/gs/GoogleHadoopFileSystem.java | 156 +++++-- .../GoogleHadoopFileSystemConfiguration.java | 18 +- .../hadoop/fs/gs/IoExceptionHelper.java | 83 ++++ .../apache/hadoop/fs/gs/ListFileOptions.java | 34 ++ .../hadoop/fs/gs/ListObjectOptions.java | 141 +++++++ .../hadoop/fs/gs/VerificationAttributes.java | 14 +- .../hadoop/fs/gs/TestConfiguration.java | 64 +++ .../hadoop/fs/gs/TestStorageResourceId.java | 14 +- .../apache/hadoop/fs/gs/TestStringPaths.java | 14 +- .../org/apache/hadoop/fs/gs/TestUriPaths.java | 14 +- .../hadoop/fs/gs/contract/GoogleContract.java | 44 ++ .../contract/ITestGoogleContractDelete.java | 37 ++ .../ITestGoogleContractGetFileStatus.java | 30 ++ .../gs/contract/ITestGoogleContractMkdir.java | 61 +++ .../hadoop/fs/gs/contract/package-info.java | 22 + 25 files changed, 1959 insertions(+), 80 deletions(-) create mode 100644 
hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/ApiErrorExtractor.java create mode 100644 hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/CreateBucketOptions.java create mode 100644 hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/CreateObjectOptions.java create mode 100644 hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/IoExceptionHelper.java create mode 100644 hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/ListFileOptions.java create mode 100644 hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/ListObjectOptions.java create mode 100644 hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/TestConfiguration.java create mode 100644 hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/GoogleContract.java create mode 100644 hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractDelete.java create mode 100644 hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractGetFileStatus.java create mode 100644 hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractMkdir.java create mode 100644 hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/package-info.java diff --git a/hadoop-project/pom.xml b/hadoop-project/pom.xml index a5bf2ed66947a..8ea97f8366a78 100644 --- a/hadoop-project/pom.xml +++ b/hadoop-project/pom.xml @@ -2160,7 +2160,7 @@ com.google.cloud google-cloud-storage - 2.44.1 + 2.52.0 diff --git a/hadoop-tools/hadoop-gcp/pom.xml b/hadoop-tools/hadoop-gcp/pom.xml index d5744f1f97c44..2da2881ab7935 100644 --- a/hadoop-tools/hadoop-gcp/pom.xml +++ b/hadoop-tools/hadoop-gcp/pom.xml @@ -261,6 +261,10 @@ com.lmax io.grpc io.opencensus + io.opentelemetry + io.opentelemetry.api + io.opentelemetry.contrib + io.opentelemetry.semconv io.perfmark org.apache.httpcomponents org.threeten:threetenbp @@ -282,6 +286,7 @@ com.google.cloud.hadoop.util.** 
com.google.cloud.http.** com.google.cloud.monitoring.** + com.google.cloud.opentelemetry.** com.google.cloud.spi.** com.google.cloud.storage.** com.google.common.** @@ -459,6 +464,12 @@ + + org.apache.hadoop + hadoop-common + test + test-jar + org.assertj assertj-core diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/ApiErrorExtractor.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/ApiErrorExtractor.java new file mode 100644 index 0000000000000..4fef41b1971ee --- /dev/null +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/ApiErrorExtractor.java @@ -0,0 +1,327 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.fs.gs; + +import com.google.api.client.googleapis.json.GoogleJsonError; +import com.google.api.client.googleapis.json.GoogleJsonError.ErrorInfo; +import com.google.api.client.googleapis.json.GoogleJsonResponseException; +import com.google.api.client.http.HttpResponseException; +import com.google.api.client.http.HttpStatusCodes; +import org.apache.hadoop.thirdparty.com.google.common.collect.ImmutableList; +import org.apache.hadoop.thirdparty.com.google.common.collect.Iterables; +import java.io.IOException; +import java.util.List; +import javax.annotation.Nullable; + +/** + * Translates exceptions from API calls into higher-level meaning, while allowing injectability for + * testing how API errors are handled. + */ +class ApiErrorExtractor { + + /** Singleton instance of the ApiErrorExtractor. */ + public static final ApiErrorExtractor INSTANCE = new ApiErrorExtractor(); + + public static final int STATUS_CODE_RANGE_NOT_SATISFIABLE = 416; + + public static final String GLOBAL_DOMAIN = "global"; + public static final String USAGE_LIMITS_DOMAIN = "usageLimits"; + + public static final String RATE_LIMITED_REASON = "rateLimitExceeded"; + public static final String USER_RATE_LIMITED_REASON = "userRateLimitExceeded"; + + public static final String QUOTA_EXCEEDED_REASON = "quotaExceeded"; + + // These come with "The account for ... has been disabled" message. + public static final String ACCOUNT_DISABLED_REASON = "accountDisabled"; + + // These come with "Project marked for deletion" message. + public static final String ACCESS_NOT_CONFIGURED_REASON = "accessNotConfigured"; + + // These are 400 error codes with "resource 'xyz' is not ready" message. + // These sometimes happens when create operation is still in-flight but resource + // representation is already available via get call. + public static final String RESOURCE_NOT_READY_REASON = "resourceNotReady"; + + // HTTP 413 with message "Value for field 'foo' is too large". 
+ public static final String FIELD_SIZE_TOO_LARGE_REASON = "fieldSizeTooLarge"; + + // HTTP 400 message for 'USER_PROJECT_MISSING' error. + public static final String USER_PROJECT_MISSING_MESSAGE = + "Bucket is a requester pays bucket but no user project provided."; + + // The debugInfo field present on Errors collection in GoogleJsonException + // as an unknown key. + private static final String DEBUG_INFO_FIELD = "debugInfo"; + + /** + * Determines if the given exception indicates intermittent request failure or failure caused by + * user error. + */ + public boolean requestFailure(IOException e) { + HttpResponseException httpException = getHttpResponseException(e); + return httpException != null + && (accessDenied(httpException) + || badRequest(httpException) + || internalServerError(httpException) + || rateLimited(httpException) + || IoExceptionHelper.isSocketError(httpException) + || unauthorized(httpException)); + } + + /** + * Determines if the given exception indicates 'access denied'. Recursively checks getCause() if + * outer exception isn't an instance of the correct class. + * + *

Warning: this method only checks for access denied status code, however this may include + * potentially recoverable reason codes such as rate limiting. For alternative, see {@link + * #accessDeniedNonRecoverable(IOException)}. + */ + public boolean accessDenied(IOException e) { + return recursiveCheckForCode(e, HttpStatusCodes.STATUS_CODE_FORBIDDEN); + } + + /** Determines if the given exception indicates bad request. */ + public boolean badRequest(IOException e) { + return recursiveCheckForCode(e, HttpStatusCodes.STATUS_CODE_BAD_REQUEST); + } + + /** + * Determines if the given exception indicates the request was unauthenticated. This can be caused + * by attaching invalid credentials to a request. + */ + public boolean unauthorized(IOException e) { + return recursiveCheckForCode(e, HttpStatusCodes.STATUS_CODE_UNAUTHORIZED); + } + + /** + * Determines if the exception is a non-recoverable access denied code (such as account closed or + * marked for deletion). + */ + public boolean accessDeniedNonRecoverable(IOException e) { + ErrorInfo errorInfo = getErrorInfo(e); + String reason = errorInfo != null ? errorInfo.getReason() : null; + return ACCOUNT_DISABLED_REASON.equals(reason) || ACCESS_NOT_CONFIGURED_REASON.equals(reason); + } + + /** Determines if the exception is a client error. */ + public boolean clientError(IOException e) { + HttpResponseException httpException = getHttpResponseException(e); + return httpException != null && getHttpStatusCode(httpException) / 100 == 4; + } + + /** Determines if the exception is an internal server error. */ + public boolean internalServerError(IOException e) { + HttpResponseException httpException = getHttpResponseException(e); + return httpException != null && getHttpStatusCode(httpException) / 100 == 5; + } + + /** + * Determines if the given exception indicates 'item already exists'. Recursively checks + * getCause() if outer exception isn't an instance of the correct class. 
+ */ + public boolean itemAlreadyExists(IOException e) { + return recursiveCheckForCode(e, HttpStatusCodes.STATUS_CODE_CONFLICT); + } + + /** + * Determines if the given exception indicates 'item not found'. Recursively checks getCause() if + * outer exception isn't an instance of the correct class. + */ + public boolean itemNotFound(IOException e) { + return recursiveCheckForCode(e, HttpStatusCodes.STATUS_CODE_NOT_FOUND); + } + + /** + * Determines if the given exception indicates 'field size too large'. Recursively checks + * getCause() if outer exception isn't an instance of the correct class. + */ + public boolean fieldSizeTooLarge(IOException e) { + ErrorInfo errorInfo = getErrorInfo(e); + return errorInfo != null && FIELD_SIZE_TOO_LARGE_REASON.equals(errorInfo.getReason()); + } + + /** + * Determines if the given exception indicates 'resource not ready'. Recursively checks getCause() + * if outer exception isn't an instance of the correct class. + */ + public boolean resourceNotReady(IOException e) { + ErrorInfo errorInfo = getErrorInfo(e); + return errorInfo != null && RESOURCE_NOT_READY_REASON.equals(errorInfo.getReason()); + } + + /** + * Determines if the given IOException indicates 'precondition not met' Recursively checks + * getCause() if outer exception isn't an instance of the correct class. + */ + public boolean preconditionNotMet(IOException e) { + return recursiveCheckForCode(e, HttpStatusCodes.STATUS_CODE_PRECONDITION_FAILED); + } + + /** + * Determines if the given exception indicates 'range not satisfiable'. Recursively checks + * getCause() if outer exception isn't an instance of the correct class. + */ + public boolean rangeNotSatisfiable(IOException e) { + return recursiveCheckForCode(e, STATUS_CODE_RANGE_NOT_SATISFIABLE); + } + + /** + * Determines if a given Throwable is caused by a rate limit being applied. Recursively checks + * getCause() if outer exception isn't an instance of the correct class. + * + * @param e The Throwable to check. 
+ * @return True if the Throwable is a result of rate limiting being applied. + */ + public boolean rateLimited(IOException e) { + ErrorInfo errorInfo = getErrorInfo(e); + if (errorInfo != null) { + String domain = errorInfo.getDomain(); + boolean isRateLimitedOrGlobalDomain = + USAGE_LIMITS_DOMAIN.equals(domain) || GLOBAL_DOMAIN.equals(domain); + String reason = errorInfo.getReason(); + boolean isRateLimitedReason = + RATE_LIMITED_REASON.equals(reason) || USER_RATE_LIMITED_REASON.equals(reason); + return isRateLimitedOrGlobalDomain && isRateLimitedReason; + } + return false; + } + + /** + * Determines if a given Throwable is caused by Quota Exceeded. Recursively checks getCause() if + * outer exception isn't an instance of the correct class. + */ + public boolean quotaExceeded(IOException e) { + ErrorInfo errorInfo = getErrorInfo(e); + return errorInfo != null && QUOTA_EXCEEDED_REASON.equals(errorInfo.getReason()); + } + + /** + * Determines if the given exception indicates that 'userProject' is missing in request. + * Recursively checks getCause() if outer exception isn't an instance of the correct class. + */ + public boolean userProjectMissing(IOException e) { + GoogleJsonError jsonError = getJsonError(e); + return jsonError != null + && jsonError.getCode() == HttpStatusCodes.STATUS_CODE_BAD_REQUEST + && USER_PROJECT_MISSING_MESSAGE.equals(jsonError.getMessage()); + } + + /** Extracts the error message. */ + public String getErrorMessage(IOException e) { + // Prefer to use message from GJRE. + GoogleJsonError jsonError = getJsonError(e); + return jsonError == null ? e.getMessage() : jsonError.getMessage(); + } + + /** + * Converts the exception to a user-presentable error message. Specifically, extracts message + * field for HTTP 4xx codes, and creates a generic "Internal Server Error" for HTTP 5xx codes. + * + * @param e the exception + * @param action the description of the action being performed at the time of error. 
+ * @see #toUserPresentableMessage(IOException, String) + */ + public IOException toUserPresentableException(IOException e, String action) throws IOException { + throw new IOException(toUserPresentableMessage(e, action), e); + } + + /** + * Converts the exception to a user-presentable error message. Specifically, extracts message + * field for HTTP 4xx codes, and creates a generic "Internal Server Error" for HTTP 5xx codes. + */ + public String toUserPresentableMessage(IOException e, @Nullable String action) { + String message = "Internal server error"; + if (clientError(e)) { + message = getErrorMessage(e); + } + return action == null + ? message + : String.format("Encountered an error while %s: %s", action, message); + } + + /** See {@link #toUserPresentableMessage(IOException, String)}. */ + public String toUserPresentableMessage(IOException e) { + return toUserPresentableMessage(e, null); + } + + @Nullable + public String getDebugInfo(IOException e) { + ErrorInfo errorInfo = getErrorInfo(e); + return errorInfo != null ? (String) errorInfo.getUnknownKeys().get(DEBUG_INFO_FIELD) : null; + } + + /** + * Returns HTTP status code from the given exception. + * + *

Note: GoogleJsonResponseException.getStatusCode() method is marked final therefore it cannot + * be mocked using Mockito. We use this helper so that we can override it in tests. + */ + protected int getHttpStatusCode(HttpResponseException e) { + return e.getStatusCode(); + } + + /** + * Get the first ErrorInfo from an IOException if it is an instance of + * GoogleJsonResponseException, otherwise return null. + */ + @Nullable + protected ErrorInfo getErrorInfo(IOException e) { + GoogleJsonError jsonError = getJsonError(e); + List errors = jsonError != null ? jsonError.getErrors() : ImmutableList.of(); + return errors != null ? Iterables.getFirst(errors, null) : null; + } + + /** If the exception is a GoogleJsonResponseException, get the error details, else return null. */ + @Nullable + protected GoogleJsonError getJsonError(IOException e) { + GoogleJsonResponseException jsonException = getJsonResponseException(e); + return jsonException == null ? null : jsonException.getDetails(); + } + + /** Recursively checks getCause() if outer exception isn't an instance of the correct class. 
*/ + protected boolean recursiveCheckForCode(IOException e, int code) { + HttpResponseException httpException = getHttpResponseException(e); + return httpException != null && getHttpStatusCode(httpException) == code; + } + + @Nullable + public static GoogleJsonResponseException getJsonResponseException(Throwable throwable) { + Throwable cause = throwable; + while (cause != null) { + if (cause instanceof GoogleJsonResponseException) { + return (GoogleJsonResponseException) cause; + } + cause = cause.getCause(); + } + return null; + } + + @Nullable + public static HttpResponseException getHttpResponseException(Throwable throwable) { + Throwable cause = throwable; + while (cause != null) { + if (cause instanceof HttpResponseException) { + return (HttpResponseException) cause; + } + cause = cause.getCause(); + } + return null; + } +} diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/CreateBucketOptions.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/CreateBucketOptions.java new file mode 100644 index 0000000000000..46cd2a7efbd34 --- /dev/null +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/CreateBucketOptions.java @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.gs; + +import java.time.Duration; + +final class CreateBucketOptions { + // TODO: Make sure the defaults have the setting matching the existing connector. + static final CreateBucketOptions DEFAULT = new Builder().build(); + private final String location; + private final String storageClass; + private final Duration ttl; + private final String projectId; + + private CreateBucketOptions(Builder builder) { + this.location = builder.location; + this.storageClass = builder.storageClass; + this.ttl = builder.ttl; + this.projectId = builder.projectId; + } + + public String getLocation() { + return location; + } + + public String getStorageClass() { + return storageClass; + } + + public Duration getTtl() { + return ttl; + } + + static class Builder { + private String location; + private String storageClass; + private Duration ttl; + private String projectId; + + public Builder withLocation(String loc) { + this.location = loc; + return this; + } + + public Builder withStorageClass(String sc) { + this.storageClass = sc; + return this; + } + + public Builder withTtl(Duration ttlDuration) { + this.ttl = ttlDuration; + return this; + } + + public Builder withProjectId(String pid) { + this.projectId = pid; + return this; + } + + public CreateBucketOptions build() { + return new CreateBucketOptions(this); + } + } +} + diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/CreateObjectOptions.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/CreateObjectOptions.java new file mode 100644 index 0000000000000..26c91fcae7bd2 --- /dev/null +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/CreateObjectOptions.java @@ -0,0 +1,127 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements.
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.gs; + +import org.apache.hadoop.thirdparty.com.google.common.collect.ImmutableMap; + +import java.util.HashMap; +import java.util.Map; + +/** Options that can be specified when creating a file in the {@link GoogleCloudStorage}. */ + +final class CreateObjectOptions { + static final CreateObjectOptions DEFAULT_OVERWRITE = builder().setOverwriteExisting(true).build(); + + private final String contentEncoding; + private final String contentType; + private final boolean ensureEmptyObjectsMetadataMatch; + private final String kmsKeyName; + private final ImmutableMap metadata; + private final boolean overwriteExisting; + + private CreateObjectOptions(Builder builder) { + this.contentEncoding = builder.contentEncoding; + this.contentType = builder.contentType; + this.ensureEmptyObjectsMetadataMatch = builder.ensureEmptyObjectsMetadataMatch; + this.kmsKeyName = builder.kmsKeyName; + this.metadata = ImmutableMap.copyOf(builder.metadata); + this.overwriteExisting = builder.overwriteExisting; + } + + public static Builder builder() { + return new Builder(); + } + + public String getContentEncoding() { + return contentEncoding; + } + + public String getContentType() { + return contentType; + } + + public boolean isEnsureEmptyObjectsMetadataMatch() { + 
return ensureEmptyObjectsMetadataMatch; + } + + public String getKmsKeyName() { + return kmsKeyName; + } + + public Map getMetadata() { + return metadata; + } + + public boolean isOverwriteExisting() { + return overwriteExisting; + } + + public Builder toBuilder() { + return builder().setContentEncoding(this.contentEncoding).setContentType(this.contentType) + .setEnsureEmptyObjectsMetadataMatch(this.ensureEmptyObjectsMetadataMatch) + .setKmsKeyName(this.kmsKeyName).setMetadata(this.metadata) + .setOverwriteExisting(this.overwriteExisting); + } + + static final class Builder { + private String contentEncoding; + private String contentType; + private boolean ensureEmptyObjectsMetadataMatch = false; + private String kmsKeyName; + private Map metadata = new HashMap<>(); + private boolean overwriteExisting = false; + + private Builder() { + } + + public Builder setContentEncoding(String ce) { + this.contentEncoding = ce; + return this; + } + + public Builder setContentType(String ct) { + this.contentType = ct; + return this; + } + + public Builder setEnsureEmptyObjectsMetadataMatch(boolean val) { + this.ensureEmptyObjectsMetadataMatch = val; + return this; + } + + public Builder setKmsKeyName(String key) { + this.kmsKeyName = key; + return this; + } + + public Builder setMetadata(Map m) { + this.metadata = m; + return this; + } + + public Builder setOverwriteExisting(boolean overwrite) { + this.overwriteExisting = overwrite; + return this; + } + + public CreateObjectOptions build() { + return new CreateObjectOptions(this); + } + } +} diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/ErrorTypeExtractor.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/ErrorTypeExtractor.java index a4497734524e7..547d855d1d649 100644 --- a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/ErrorTypeExtractor.java +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/ErrorTypeExtractor.java @@ -1,11 +1,13 @@ /* - * 
Copyright 2023 Google LLC + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -16,13 +18,46 @@ package org.apache.hadoop.fs.gs; +import javax.annotation.Nullable; + import io.grpc.Status; +import io.grpc.StatusRuntimeException; /** * Implementation for {@link ErrorTypeExtractor} for exception specifically thrown from gRPC path. */ final class ErrorTypeExtractor { + static boolean bucketAlreadyExists(Exception e) { + ErrorType errorType = getErrorType(e); + if (errorType == ErrorType.ALREADY_EXISTS) { + return true; + } else if (errorType == ErrorType.FAILED_PRECONDITION) { + // The gRPC API currently throws a FAILED_PRECONDITION status code instead of ALREADY_EXISTS, + // so we handle both these conditions in the interim. + StatusRuntimeException statusRuntimeException = getStatusRuntimeException(e); + return statusRuntimeException != null + && BUCKET_ALREADY_EXISTS_MESSAGE.equals(statusRuntimeException.getMessage()); + } + return false; + } + + @Nullable + static private StatusRuntimeException getStatusRuntimeException(Exception e) { + Throwable cause = e; + // Keeping a counter to break early from the loop to avoid infinite loop condition due to + // cyclic exception chains. 
+ int currentExceptionDepth = 0, maxChainDepth = 1000; + while (cause != null && currentExceptionDepth < maxChainDepth) { + if (cause instanceof StatusRuntimeException) { + return (StatusRuntimeException) cause; + } + cause = cause.getCause(); + currentExceptionDepth++; + } + return null; + } + enum ErrorType { NOT_FOUND, OUT_OF_RANGE, ALREADY_EXISTS, FAILED_PRECONDITION, INTERNAL, RESOURCE_EXHAUSTED, UNAVAILABLE, UNKNOWN diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/FileInfo.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/FileInfo.java index df8d63f5eecf2..3b9d9f475ae91 100644 --- a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/FileInfo.java +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/FileInfo.java @@ -1,11 +1,13 @@ /* - * Copyright 2013 Google Inc. + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorage.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorage.java index 9c15962b7ef36..d68eca6a8a5f3 100644 --- a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorage.java +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorage.java @@ -20,7 +20,12 @@ import static org.apache.hadoop.thirdparty.com.google.common.base.Preconditions.*; import static org.apache.hadoop.thirdparty.com.google.common.base.Strings.isNullOrEmpty; +import static java.lang.Math.toIntExact; +import com.google.api.client.util.BackOff; +import com.google.api.client.util.ExponentialBackOff; +import com.google.api.client.util.Sleeper; +import com.google.api.gax.paging.Page; import com.google.cloud.storage.*; import org.apache.hadoop.thirdparty.com.google.common.collect.ImmutableList; import org.apache.hadoop.thirdparty.com.google.common.collect.Maps; @@ -32,21 +37,34 @@ import java.io.IOException; import java.nio.channels.WritableByteChannel; import java.nio.file.FileAlreadyExistsException; +import java.time.Duration; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Set; /** * A wrapper around Google cloud storage * client. 
*/ class GoogleCloudStorage { - public static final Logger LOG = LoggerFactory.getLogger(GoogleHadoopFileSystem.class); + static final Logger LOG = LoggerFactory.getLogger(GoogleHadoopFileSystem.class); static final List BLOB_FIELDS = - ImmutableList.of(Storage.BlobField.BUCKET, Storage.BlobField.CONTENT_ENCODING, + ImmutableList.of( + Storage.BlobField.BUCKET, Storage.BlobField.CONTENT_ENCODING, Storage.BlobField.CONTENT_TYPE, Storage.BlobField.CRC32C, Storage.BlobField.GENERATION, Storage.BlobField.METADATA, Storage.BlobField.MD5HASH, Storage.BlobField.METAGENERATION, Storage.BlobField.NAME, Storage.BlobField.SIZE, Storage.BlobField.TIME_CREATED, Storage.BlobField.UPDATED); + + static final CreateObjectOptions EMPTY_OBJECT_CREATE_OPTIONS = + CreateObjectOptions.DEFAULT_OVERWRITE.toBuilder() + .setEnsureEmptyObjectsMetadataMatch(false) + .build(); + private final Storage storage; private final GoogleHadoopFileSystemConfiguration configuration; @@ -55,13 +73,20 @@ class GoogleCloudStorage { * is in WIP. 
*/ GoogleCloudStorage(GoogleHadoopFileSystemConfiguration configuration) throws IOException { - // TODO: Set projectId // TODO: Set credentials - this.storage = StorageOptions.newBuilder().build().getService(); + this.storage = createStorage(configuration.getProjectId()); this.configuration = configuration; } - public WritableByteChannel create(final StorageResourceId resourceId, final CreateOptions options) + private static Storage createStorage(String projectId) { + if (projectId != null) { + return StorageOptions.newBuilder().setProjectId(projectId).build().getService(); + } + + return StorageOptions.newBuilder().build().getService(); + } + + WritableByteChannel create(final StorageResourceId resourceId, final CreateOptions options) throws IOException { LOG.trace("create({})", resourceId); @@ -104,7 +129,7 @@ private long getWriteGeneration(StorageResourceId resourceId, boolean overwrite) throw new FileAlreadyExistsException(String.format("Object %s already exists.", resourceId)); } - public void close() { + void close() { try { storage.close(); } catch (Exception e) { @@ -112,7 +137,7 @@ public void close() { } } - public GoogleCloudStorageItemInfo getItemInfo(StorageResourceId resourceId) throws IOException { + GoogleCloudStorageItemInfo getItemInfo(StorageResourceId resourceId) throws IOException { LOG.trace("getItemInfo({})", resourceId); // Handle ROOT case first. @@ -258,4 +283,350 @@ private static GoogleCloudStorageItemInfo createItemInfoForBucket(StorageResourc bucket.getLocation(), bucket.getStorageClass() == null ? null : bucket.getStorageClass().name()); } + + List listObjectInfo( + String bucketName, + String objectNamePrefix, + ListObjectOptions listOptions) throws IOException { + try { + long maxResults = listOptions.getMaxResults() > 0 ? + listOptions.getMaxResults() + (listOptions.isIncludePrefix() ? 
0 : 1) : + listOptions.getMaxResults(); + + Storage.BlobListOption[] blobListOptions = + getBlobListOptions(objectNamePrefix, listOptions, maxResults); + Page blobs = storage.list(bucketName, blobListOptions); + ListOperationResult result = new ListOperationResult(maxResults); + for (Blob blob : blobs.iterateAll()) { + result.add(blob); + } + + return result.getItems(); + } catch (StorageException e) { + throw new IOException( + String.format("listing object '%s' failed.", BlobId.of(bucketName, objectNamePrefix)), + e); + } + } + + private Storage.BlobListOption[] getBlobListOptions( + String objectNamePrefix, ListObjectOptions listOptions, long maxResults) { + List options = new ArrayList<>(); + + options.add(Storage.BlobListOption.fields(BLOB_FIELDS.toArray(new Storage.BlobField[0]))); + options.add(Storage.BlobListOption.prefix(objectNamePrefix)); + // TODO: set max results as a BlobListOption + if ("/".equals(listOptions.getDelimiter())) { + options.add(Storage.BlobListOption.currentDirectory()); + } + + if (listOptions.getDelimiter() != null) { + options.add(Storage.BlobListOption.includeTrailingDelimiter()); + } + + return options.toArray(new Storage.BlobListOption[0]); + } + + private GoogleCloudStorageItemInfo createItemInfoForBlob(Blob blob) { + long generationId = blob.getGeneration() == null ? 
0L : blob.getGeneration(); + StorageResourceId resourceId = + new StorageResourceId(blob.getBucket(), blob.getName(), generationId); + return createItemInfoForBlob(resourceId, blob); + } + + void createBucket(String bucketName, CreateBucketOptions options) throws IOException { + LOG.trace("createBucket({})", bucketName); + checkArgument(!isNullOrEmpty(bucketName), "bucketName must not be null or empty"); + checkNotNull(options, "options must not be null"); + + BucketInfo.Builder bucketInfoBuilder = + BucketInfo.newBuilder(bucketName).setLocation(options.getLocation()); + + if (options.getStorageClass() != null) { + bucketInfoBuilder.setStorageClass( + StorageClass.valueOfStrict(options.getStorageClass().toUpperCase())); + } + if (options.getTtl() != null) { + bucketInfoBuilder.setLifecycleRules( + Collections.singletonList( + new BucketInfo.LifecycleRule( + BucketInfo.LifecycleRule.LifecycleAction.newDeleteAction(), + BucketInfo.LifecycleRule.LifecycleCondition.newBuilder() + .setAge(toIntExact(options.getTtl().toDays())) + .build()))); + } + try { + storage.create(bucketInfoBuilder.build()); + } catch (StorageException e) { + if (ErrorTypeExtractor.bucketAlreadyExists(e)) { + throw (FileAlreadyExistsException) + new FileAlreadyExistsException(String.format("Bucket '%s' already exists.", bucketName)) + .initCause(e); + } + throw new IOException(e); + } + } + + void createEmptyObject(StorageResourceId resourceId) throws IOException { + LOG.trace("createEmptyObject({})", resourceId); + checkArgument( + resourceId.isStorageObject(), "Expected full StorageObject id, got %s", resourceId); + createEmptyObject(resourceId, EMPTY_OBJECT_CREATE_OPTIONS); + } + + void createEmptyObject(StorageResourceId resourceId, CreateObjectOptions options) + throws IOException { + checkArgument( + resourceId.isStorageObject(), "Expected full StorageObject id, got %s", resourceId); + + try { + createEmptyObjectInternal(resourceId, options); + } catch (StorageException e) { + if 
(canIgnoreExceptionForEmptyObject(e, resourceId, options)) { + LOG.info( + "Ignoring exception of type {}; verified object already exists with desired state.", + e.getClass().getSimpleName()); + LOG.trace("Ignored exception while creating empty object: {}", resourceId, e); + } else { + if (ErrorTypeExtractor.getErrorType(e) == ErrorTypeExtractor.ErrorType.ALREADY_EXISTS) { + throw (FileAlreadyExistsException) + new FileAlreadyExistsException( + String.format("Object '%s' already exists.", resourceId) + ).initCause(e); + } + throw new IOException(e); + } + } + } + + /** + * Helper to check whether an empty object already exists with the expected metadata specified in + * {@code options}, to be used to determine whether it's safe to ignore an exception that was + * thrown when trying to create the object, {@code exceptionOnCreate}. + */ + private boolean canIgnoreExceptionForEmptyObject( + StorageException exceptionOnCreate, StorageResourceId resourceId, CreateObjectOptions options) + throws IOException { + ErrorTypeExtractor.ErrorType errorType = ErrorTypeExtractor.getErrorType(exceptionOnCreate); + if (shouldBackoff(resourceId, errorType)) { + GoogleCloudStorageItemInfo existingInfo; + Duration maxWaitTime = Duration.ofSeconds(3); // TODO: make this configurable + + BackOff backOff = + !maxWaitTime.isZero() && !maxWaitTime.isNegative() + ? new ExponentialBackOff.Builder() + .setMaxElapsedTimeMillis(toIntExact(maxWaitTime.toMillis())) + .setMaxIntervalMillis(500) + .setInitialIntervalMillis(100) + .setMultiplier(1.5) + .setRandomizationFactor(0.15) + .build() + : BackOff.STOP_BACKOFF; + long nextSleep = 0L; + do { + if (nextSleep > 0) { + try { + Sleeper.DEFAULT.sleep(nextSleep); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + nextSleep = BackOff.STOP; + } + } + existingInfo = getItemInfo(resourceId); + nextSleep = nextSleep == BackOff.STOP ? 
BackOff.STOP : backOff.nextBackOffMillis(); + } while (!existingInfo.exists() && nextSleep != BackOff.STOP); + + // Compare existence, size, and metadata; for 429 errors creating an empty object, + // we don't care about metaGeneration/contentGeneration as long as the metadata + // matches, since we don't know for sure whether our low-level request succeeded + // first or some other client succeeded first. + if (existingInfo.exists() && existingInfo.getSize() == 0) { + if (options.isEnsureEmptyObjectsMetadataMatch()) { + return existingInfo.metadataEquals(options.getMetadata()); + } + return true; + } + } + return false; + } + + private static boolean shouldBackoff(StorageResourceId resourceId, + ErrorTypeExtractor.ErrorType errorType) { + return errorType == ErrorTypeExtractor.ErrorType.RESOURCE_EXHAUSTED + || errorType == ErrorTypeExtractor.ErrorType.INTERNAL || + (resourceId.isDirectory() && errorType == ErrorTypeExtractor.ErrorType.FAILED_PRECONDITION); + } + + private void createEmptyObjectInternal( + StorageResourceId resourceId, CreateObjectOptions createObjectOptions) throws IOException { + Map rewrittenMetadata = encodeMetadata(createObjectOptions.getMetadata()); + + List blobTargetOptions = new ArrayList<>(); + blobTargetOptions.add(Storage.BlobTargetOption.disableGzipContent()); + if (resourceId.hasGenerationId()) { + blobTargetOptions.add(Storage.BlobTargetOption.generationMatch(resourceId.getGenerationId())); + } else if (resourceId.isDirectory() || !createObjectOptions.isOverwriteExisting()) { + blobTargetOptions.add(Storage.BlobTargetOption.doesNotExist()); + } + + try { + // TODO: Set encryption key and related properties + storage.create( + BlobInfo.newBuilder(BlobId.of(resourceId.getBucketName(), resourceId.getObjectName())) + .setMetadata(rewrittenMetadata) + .setContentEncoding(createObjectOptions.getContentEncoding()) + .setContentType(createObjectOptions.getContentType()) + .build(), + blobTargetOptions.toArray(new 
Storage.BlobTargetOption[0])); + } catch (StorageException e) { + throw new IOException(String.format("Creating empty object %s failed.", resourceId), e); + } + } + + private static Map encodeMetadata(Map metadata) { + return Maps.transformValues(metadata, GoogleCloudStorage::encodeMetadataValues); + } + + private static String encodeMetadataValues(byte[] bytes) { + return bytes == null ? null : BaseEncoding.base64().encode(bytes); + } + + List listDirectoryRecursive(String bucketName, String objectName) + throws IOException { + // TODO: Take delimiter from config + // TODO: Set specific fields + + try { + Page blobs = storage.list( + bucketName, + Storage.BlobListOption.prefix(objectName)); + + List result = new ArrayList<>(); + for (Blob blob : blobs.iterateAll()) { + result.add(createItemInfoForBlob(blob)); + } + + return result; + } catch (StorageException e) { + throw new IOException( + String.format("Listing '%s' failed", BlobId.of(bucketName, objectName)), e); + } + } + + void deleteObjects(List fullObjectNames) throws IOException { + LOG.trace("deleteObjects({})", fullObjectNames); + + if (fullObjectNames.isEmpty()) { + return; + } + + // Validate that all the elements represent StorageObjects. + for (StorageResourceId toDelete : fullObjectNames) { + checkArgument( + toDelete.isStorageObject(), + "Expected full StorageObject names only, got: %s", + toDelete); + } + + // TODO: Do this concurrently + // TODO: There is duplication. 
fix it + for (StorageResourceId toDelete : fullObjectNames) { + try { + LOG.trace("Deleting Object ({})", toDelete); + if (toDelete.hasGenerationId() && toDelete.getGenerationId() != 0) { + storage.delete( + BlobId.of(toDelete.getBucketName(), toDelete.getObjectName()), + Storage.BlobSourceOption.generationMatch(toDelete.getGenerationId())); + } else { + // TODO: Remove delete without generationId + storage.delete(BlobId.of(toDelete.getBucketName(), toDelete.getObjectName())); + + LOG.trace("Deleting Object without generationId ({})", toDelete); + } + } catch (StorageException e) { + throw new IOException(String.format("Deleting resource %s failed.", toDelete), e); + } + } + } + + List listBucketInfo() throws IOException { + List allBuckets = listBucketsInternal(); + List bucketInfos = new ArrayList<>(allBuckets.size()); + for (Bucket bucket : allBuckets) { + bucketInfos.add(createItemInfoForBucket(new StorageResourceId(bucket.getName()), bucket)); + } + return bucketInfos; + } + + + private List listBucketsInternal() throws IOException { + checkNotNull(configuration.getProjectId(), "projectId must not be null"); + List allBuckets = new ArrayList<>(); + try { + Page buckets = + storage.list( + Storage.BucketListOption.pageSize(configuration.getMaxListItemsPerCall()), + Storage.BucketListOption.fields( + Storage.BucketField.LOCATION, + Storage.BucketField.STORAGE_CLASS, + Storage.BucketField.TIME_CREATED, + Storage.BucketField.UPDATED)); + + // Loop to fetch all the items. + for (Bucket bucket : buckets.iterateAll()) { + allBuckets.add(bucket); + } + } catch (StorageException e) { + throw new IOException(e); + } + return allBuckets; + } + + // Helper class to capture the results of list operation. 
+ private class ListOperationResult { + private final Map prefixes = new HashMap<>(); + private final List objects = new ArrayList<>(); + + private final Set objectsSet = new HashSet<>(); + + private final long maxResults; + + ListOperationResult(long maxResults) { + this.maxResults = maxResults; + } + + void add(Blob blob) { + String path = blob.getBlobId().toGsUtilUri(); + if (blob.getGeneration() != null) { + prefixes.remove(path); + objects.add(blob); + + objectsSet.add(path); + } else if (!objectsSet.contains(path)) { + prefixes.put(path, blob); + } + } + + List getItems() { + List result = new ArrayList<>(prefixes.size() + objects.size()); + + for (Blob blob : objects) { + result.add(createItemInfoForBlob(blob)); + + if (result.size() == maxResults) { + return result; + } + } + + for (Blob blob : prefixes.values()) { + if (result.size() == maxResults) { + return result; + } + + result.add(createItemInfoForBlob(blob)); + } + + return result; + } + } } diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorageFileSystem.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorageFileSystem.java index e411f22eb3994..aa1617e4da687 100644 --- a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorageFileSystem.java +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorageFileSystem.java @@ -19,21 +19,57 @@ package org.apache.hadoop.fs.gs; import static org.apache.hadoop.thirdparty.com.google.common.base.Preconditions.*; +import static java.util.Comparator.comparing; +import static org.apache.hadoop.fs.gs.Constants.PATH_DELIMITER; import static org.apache.hadoop.fs.gs.Constants.SCHEME; import com.google.auth.Credentials; +import org.apache.hadoop.thirdparty.com.google.common.annotations.VisibleForTesting; +import org.apache.hadoop.thirdparty.com.google.common.collect.ImmutableList; +import 
org.apache.hadoop.thirdparty.com.google.common.collect.Iterables; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.FileNotFoundException; import java.io.IOException; import java.net.URI; import java.nio.channels.WritableByteChannel; +import java.nio.file.DirectoryNotEmptyException; +import java.nio.file.FileAlreadyExistsException; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.List; +import java.util.Objects; /** * Provides FS semantics over GCS based on Objects API. */ class GoogleCloudStorageFileSystem { private static final Logger LOG = LoggerFactory.getLogger(StorageResourceId.class); + // Comparator used for sorting paths. + // + // For some bulk operations, we need to operate on parent directories before + // we operate on their children. To achieve this, we sort paths such that + // shorter paths appear before longer paths. Also, we sort lexicographically + // within paths of the same length (this is not strictly required but helps when + // debugging/testing). + @VisibleForTesting + static final Comparator PATH_COMPARATOR = + comparing( + URI::toString, + (as, bs) -> + (as.length() == bs.length()) + ? as.compareTo(bs) + : Integer.compare(as.length(), bs.length())); + + static final Comparator FILE_INFO_PATH_COMPARATOR = + comparing(FileInfo::getPath, PATH_COMPARATOR); + + private static final ListObjectOptions GET_FILE_INFO_LIST_OPTIONS = + ListObjectOptions.DEFAULT.builder().setIncludePrefix(true).setMaxResults(1).build(); + + private static final ListObjectOptions LIST_FILE_INFO_LIST_OPTIONS = + ListObjectOptions.DEFAULT.builder().setIncludePrefix(true).build(); // URI of the root path. static final URI GCSROOT = URI.create(SCHEME + ":/"); @@ -86,4 +122,253 @@ void close() { gcs = null; } } + + public FileInfo getFileInfo(URI path) throws IOException { + checkArgument(path != null, "path must not be null"); + // Validate the given path. true == allow empty object name. 
+ // One should be able to get info about top level directory (== bucket), + // therefore we allow object name to be empty. + StorageResourceId resourceId = StorageResourceId.fromUriPath(path, true); + FileInfo fileInfo = + FileInfo.fromItemInfo( + getFileInfoInternal(resourceId, /* inferImplicitDirectories= */ true)); + LOG.trace("getFileInfo(path: {}): {}", path, fileInfo); + return fileInfo; + } + + private GoogleCloudStorageItemInfo getFileInfoInternal( + StorageResourceId resourceId, + boolean inferImplicitDirectories) + throws IOException { + if (resourceId.isRoot() || resourceId.isBucket()) { + return gcs.getItemInfo(resourceId); + } + + StorageResourceId dirId = resourceId.toDirectoryId(); + if (!resourceId.isDirectory()) { + GoogleCloudStorageItemInfo itemInfo = gcs.getItemInfo(resourceId); + if (itemInfo.exists()) { + return itemInfo; + } + + if (inferImplicitDirectories) { + // TODO: Set max result + List listDirResult = gcs.listObjectInfo( + resourceId.getBucketName(), + resourceId.getObjectName(), + GET_FILE_INFO_LIST_OPTIONS); + LOG.trace("List for getMetadata returned {}. {}", listDirResult.size(), listDirResult); + if (!listDirResult.isEmpty()) { + LOG.trace("Get metadata for directory returned non empty {}", listDirResult); + return GoogleCloudStorageItemInfo.createInferredDirectory(resourceId.toDirectoryId()); + } + } + } + + List listDirInfo = ImmutableList.of(gcs.getItemInfo(dirId)); + if (listDirInfo.isEmpty()) { + return GoogleCloudStorageItemInfo.createNotFound(resourceId); + } + checkState(listDirInfo.size() <= 2, "listed more than 2 objects: '%s'", listDirInfo); + GoogleCloudStorageItemInfo dirInfo = Iterables.get(listDirInfo, /* position= */ 0); + checkState( + dirInfo.getResourceId().equals(dirId) || !inferImplicitDirectories, + "listed wrong object '%s', but should be '%s'", + dirInfo.getResourceId(), + resourceId); + return dirInfo.getResourceId().equals(dirId) && dirInfo.exists() + ? 
dirInfo + : GoogleCloudStorageItemInfo.createNotFound(resourceId); + } + + public void mkdirs(URI path) throws IOException { + LOG.trace("mkdirs(path: {})", path); + checkNotNull(path, "path should not be null"); + + /* allowEmptyObjectName= */ + StorageResourceId resourceId = + StorageResourceId.fromUriPath(path, /* allowEmptyObjectName= */ true); + if (resourceId.isRoot()) { + // GCS_ROOT directory always exists, no need to go through the rest of the method. + return; + } + + // In case path is a bucket we just attempt to create it without additional checks + if (resourceId.isBucket()) { + try { + gcs.createBucket(resourceId.getBucketName(), CreateBucketOptions.DEFAULT); + } catch (FileAlreadyExistsException e) { + // This means that bucket already exist, and we do not need to do anything. + LOG.trace("mkdirs: {} already exists, ignoring creation failure", resourceId, e); + } + return; + } + + resourceId = resourceId.toDirectoryId(); + + // TODO: Before creating a leaf directory we need to check if there are no conflicting files + // TODO: with the same name as any subdirectory + + // Create only a leaf directory because subdirectories will be inferred + // if leaf directory exists + try { + gcs.createEmptyObject(resourceId); + } catch (FileAlreadyExistsException e) { + // This means that directory object already exist, and we do not need to do anything. + LOG.trace("mkdirs: {} already exists, ignoring creation failure", resourceId, e); + } + } + + void delete(URI path, boolean recursive) throws IOException { + checkNotNull(path, "path should not be null"); + checkArgument(!path.equals(GCSROOT), "Cannot delete root path (%s)", path); + + FileInfo fileInfo = getFileInfo(path); + if (!fileInfo.exists()) { + throw new FileNotFoundException("Item not found: " + path); + } + + List itemsToDelete; + // Delete sub-items if it is a directory. + if (fileInfo.isDirectory()) { + itemsToDelete = + recursive + ? 
listRecursive(fileInfo.getPath()) // TODO: Get only one result + : listDirectory(fileInfo.getPath()); + + if (!itemsToDelete.isEmpty() && !recursive) { + throw new DirectoryNotEmptyException("Cannot delete a non-empty directory. : " + path); + } + } else { + itemsToDelete = new ArrayList<>(); + } + + List bucketsToDelete = new ArrayList<>(); + (fileInfo.getItemInfo().isBucket() ? bucketsToDelete : itemsToDelete).add(fileInfo); + + deleteObjects(itemsToDelete, bucketsToDelete); + + StorageResourceId parentId = + StorageResourceId.fromUriPath(UriPaths.getParentPath(path), true); + GoogleCloudStorageItemInfo parentInfo = + getFileInfoInternal(parentId, /* inferImplicitDirectories= */ false); + + StorageResourceId resourceId = parentInfo.getResourceId(); + if (parentInfo.exists() + || resourceId.isRoot() + || resourceId.isBucket() + || PATH_DELIMITER.equals(resourceId.getObjectName())) { + return; + } + + // TODO: Keep the repair parent step behind a flag + gcs.createEmptyObject(parentId); + } + + private List listRecursive(URI prefix) throws IOException { + StorageResourceId prefixId = getPrefixId(prefix); + List itemInfos = + gcs.listDirectoryRecursive(prefixId.getBucketName(), prefixId.getObjectName()); + List fileInfos = FileInfo.fromItemInfos(itemInfos); + fileInfos.sort(FILE_INFO_PATH_COMPARATOR); + return fileInfos; + } + + private List listDirectory(URI prefix) throws IOException { + StorageResourceId prefixId = getPrefixId(prefix); + List itemInfos = gcs.listObjectInfo( + prefixId.getBucketName(), + prefixId.getObjectName(), + ListObjectOptions.DEFAULT_FLAT_LIST); + + List fileInfos = FileInfo.fromItemInfos(itemInfos); + fileInfos.sort(FILE_INFO_PATH_COMPARATOR); + return fileInfos; + } + + private StorageResourceId getPrefixId(URI prefix) { + checkNotNull(prefix, "prefix could not be null"); + + StorageResourceId prefixId = StorageResourceId.fromUriPath(prefix, true); + checkArgument(!prefixId.isRoot(), "prefix must not be global root, got '%s'", prefix); + + 
return prefixId; + } + + private void deleteObjects( + List itemsToDelete, List bucketsToDelete) + throws IOException { + LOG.trace("deleteInternalWithFolders; fileSize={} bucketSize={}", + itemsToDelete.size(), bucketsToDelete.size()); + deleteObjects(itemsToDelete); + deleteBucket(bucketsToDelete); + } + + private void deleteObjects(List itemsToDelete) throws IOException { + // Delete children before their parents. + // + // Note: we modify the input list, which is ok for current usage. + // We should make a copy in case that changes in future. + itemsToDelete.sort(FILE_INFO_PATH_COMPARATOR.reversed()); + + if (!itemsToDelete.isEmpty()) { + List objectsToDelete = new ArrayList<>(itemsToDelete.size()); + for (FileInfo fileInfo : itemsToDelete) { + if (!fileInfo.isInferredDirectory()) { + objectsToDelete.add( + new StorageResourceId( + fileInfo.getItemInfo().getBucketName(), + fileInfo.getItemInfo().getObjectName(), + fileInfo.getItemInfo().getContentGeneration())); + } + } + + gcs.deleteObjects(objectsToDelete); + } + } + + private void deleteBucket(List bucketsToDelete) throws IOException { + if (bucketsToDelete == null || bucketsToDelete.isEmpty()) { + return; + } + + // TODO: Add support for deleting bucket + throw new UnsupportedOperationException("deleteBucket is not supported."); + } + + public List listFileInfo(URI path, ListFileOptions listOptions) throws IOException { + checkNotNull(path, "path can not be null"); + LOG.trace("listStatus(path: {})", path); + + StorageResourceId pathId = + StorageResourceId.fromUriPath(path, /* allowEmptyObjectName= */ true); + + if (!pathId.isDirectory()) { + GoogleCloudStorageItemInfo pathInfo = gcs.getItemInfo(pathId); + if (pathInfo.exists()) { + List listedInfo = new ArrayList<>(); + listedInfo.add(FileInfo.fromItemInfo(pathInfo)); + + return listedInfo; + } + } + + StorageResourceId dirId = pathId.toDirectoryId(); + List dirItemInfos = dirId.isRoot() ? 
+ gcs.listBucketInfo() : + gcs.listObjectInfo( + dirId.getBucketName(), dirId.getObjectName(), LIST_FILE_INFO_LIST_OPTIONS); + + if (pathId.isStorageObject() && dirItemInfos.isEmpty()) { + throw new FileNotFoundException("Item not found: " + path); + } + + if (!dirItemInfos.isEmpty() && Objects.equals(dirItemInfos.get(0).getResourceId(), dirId)) { + dirItemInfos.remove(0); + } + + List fileInfos = FileInfo.fromItemInfos(dirItemInfos); + fileInfos.sort(FILE_INFO_PATH_COMPARATOR); + return fileInfos; + } } diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorageItemInfo.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorageItemInfo.java index 887e68b05f98c..83169b8d9213d 100644 --- a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorageItemInfo.java +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorageItemInfo.java @@ -1,11 +1,13 @@ /* - * Copyright 2013 Google Inc. + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleHadoopFileSystem.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleHadoopFileSystem.java index 1c2fc19d2b5b1..8831568a3560d 100644 --- a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleHadoopFileSystem.java +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleHadoopFileSystem.java @@ -32,12 +32,16 @@ import java.io.FileNotFoundException; import java.io.IOException; import java.net.URI; +import java.nio.file.DirectoryNotEmptyException; +import java.util.ArrayList; import java.util.EnumSet; +import java.util.List; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.*; import org.apache.hadoop.fs.permission.FsPermission; import org.apache.hadoop.security.ProviderUtils; +import org.apache.hadoop.security.UserGroupInformation; import org.apache.hadoop.util.Progressable; import org.slf4j.Logger; @@ -273,7 +277,6 @@ public FSDataOutputStream create(Path hadoopPath, FsPermission permission, boole checkArgument(replication > 0, "replication must be a positive integer: %s", replication); checkArgument(blockSize > 0, "blockSize must be a positive integer: %s", blockSize); - System.out.println(String.format("create(%s)", hadoopPath)); checkOpen(); LOG.trace("create(hadoopPath: {}, overwrite: {}, bufferSize: {} [ignored])", hadoopPath, @@ -289,10 +292,32 @@ public FSDataOutputStream create(Path hadoopPath, FsPermission permission, boole } @Override - public FSDataOutputStream createNonRecursive(Path hadoopPath, FsPermission permission, - EnumSet flags, int bufferSize, short replication, long blockSize, - Progressable progress) throws 
IOException { - throw new UnsupportedOperationException(hadoopPath.toString()); + public FSDataOutputStream createNonRecursive( + Path hadoopPath, + FsPermission permission, + EnumSet flags, + int bufferSize, + short replication, + long blockSize, + Progressable progress) + throws IOException { + URI gcsPath = getGcsPath(checkNotNull(hadoopPath, "hadoopPath must not be null")); + URI parentGcsPath = UriPaths.getParentPath(gcsPath); + if (!getGcsFs().getFileInfo(parentGcsPath).exists()) { + throw new FileNotFoundException( + String.format( + "Can not create '%s' file, because parent folder does not exist: %s", + gcsPath, parentGcsPath)); + } + + return create( + hadoopPath, + permission, + flags.contains(CreateFlag.OVERWRITE), + bufferSize, + replication, + blockSize, + progress); } @Override @@ -308,19 +333,57 @@ public boolean rename(final Path path, final Path path1) throws IOException { } @Override - public boolean delete(final Path path, final boolean recursive) throws IOException { - LOG.trace("delete({}, {})", path, recursive); - throw new UnsupportedOperationException(path.toString()); + public boolean delete(final Path hadoopPath, final boolean recursive) throws IOException { + LOG.trace("delete({}, {})", hadoopPath, recursive); + checkArgument(hadoopPath != null, "hadoopPath must not be null"); + + checkOpen(); + + URI gcsPath = getGcsPath(hadoopPath); + try { + getGcsFs().delete(gcsPath, recursive); + } catch (DirectoryNotEmptyException e) { + throw e; + } catch (IOException e) { + if (ApiErrorExtractor.INSTANCE.requestFailure(e)) { + throw e; + } + LOG.trace("delete(hadoopPath: {}, recursive: {}): false [failed]", hadoopPath, recursive, e); + return false; + } + + LOG.trace("delete(hadoopPath: %s, recursive: %b): true", hadoopPath, recursive); + return true; } @Override - public FileStatus[] listStatus(final Path path) throws FileNotFoundException, IOException { - checkArgument(path != null, "hadoopPath must not be null"); + public FileStatus[] 
listStatus(final Path hadoopPath) throws IOException { + checkArgument(hadoopPath != null, "hadoopPath must not be null"); checkOpen(); - LOG.trace("listStatus(hadoopPath: {})", path); - throw new UnsupportedOperationException(path.toString()); + LOG.trace("listStatus(hadoopPath: {})", hadoopPath); + + URI gcsPath = getGcsPath(hadoopPath); + List status; + + try { + List fileInfos = getGcsFs().listFileInfo(gcsPath, ListFileOptions.OBJECTFIELDS); + status = new ArrayList<>(fileInfos.size()); + String userName = getUgiUserName(); + for (FileInfo fileInfo : fileInfos) { + status.add(getFileStatus(fileInfo, userName)); + } + } catch (FileNotFoundException fnfe) { + throw (FileNotFoundException) + new FileNotFoundException( + String.format( + "listStatus(hadoopPath: %s): '%s' does not exist.", + hadoopPath, gcsPath)) + .initCause(fnfe); + } + + return status.toArray(new FileStatus[0]); } /** @@ -402,18 +465,29 @@ public Path getWorkingDirectory() { } @Override - public boolean mkdirs(final Path path, final FsPermission fsPermission) throws IOException { - LOG.trace("mkdirs({})", path); - throw new UnsupportedOperationException(path.toString()); - } + public boolean mkdirs(final Path hadoopPath, final FsPermission permission) throws IOException { + checkArgument(hadoopPath != null, "hadoopPath must not be null"); -// /** -// * Gets the default replication factor. -// */ -// @Override -// public short getDefaultReplication() { -// return REPLICATION_FACTOR_DEFAULT; -// } + LOG.trace( + "mkdirs(hadoopPath: {}, permission: {}): true", hadoopPath, permission); + + checkOpen(); + + URI gcsPath = getGcsPath(hadoopPath); + try { + getGcsFs().mkdirs(gcsPath); + } catch (java.nio.file.FileAlreadyExistsException faee) { + // Need to convert to the Hadoop flavor of FileAlreadyExistsException. 
+ throw (FileAlreadyExistsException) + new FileAlreadyExistsException( + String.format( + "mkdirs(hadoopPath: %s, permission: %s): failed", + hadoopPath, permission)) + .initCause(faee); + } + + return true; + } @Override public FileStatus getFileStatus(final Path path) throws IOException { @@ -423,9 +497,14 @@ public FileStatus getFileStatus(final Path path) throws IOException { URI gcsPath = getGcsPath(path); - LOG.trace("getFileStatus(): {}", gcsPath); - - throw new UnsupportedOperationException(path.toString()); + FileInfo fileInfo = getGcsFs().getFileInfo(gcsPath); + if (!fileInfo.exists()) { + throw new FileNotFoundException( + String.format( + "%s not found: %s", fileInfo.isDirectory() ? "Directory" : "File", path)); + } + String userName = getUgiUserName(); + return getFileStatus(fileInfo, userName); } /** @@ -502,4 +581,29 @@ public void setWorkingDirectory(final Path hadoopPath) { workingDirectory = getHadoopPath(gcsPath); LOG.trace("setWorkingDirectory(hadoopPath: {}): {}", hadoopPath, workingDirectory); } + + + private static String getUgiUserName() throws IOException { + UserGroupInformation ugi = UserGroupInformation.getCurrentUser(); + return ugi.getShortUserName(); + } + + private FileStatus getFileStatus(FileInfo fileInfo, String userName) { + checkNotNull(fileInfo, "fileInfo should not be null"); + // GCS does not provide modification time. It only provides creation time. + // It works for objects because they are immutable once created. 
+ FileStatus status = new FileStatus( + fileInfo.getSize(), + fileInfo.isDirectory(), + REPLICATION_FACTOR_DEFAULT, + defaultBlockSize, + fileInfo.getModificationTime(), + fileInfo.getModificationTime(), + reportedPermissions, + userName, + userName, + getHadoopPath(fileInfo.getPath())); + LOG.trace("FileStatus(path: {}, userName: {}): {}", fileInfo.getPath(), userName, status); + return status; + } } diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleHadoopFileSystemConfiguration.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleHadoopFileSystemConfiguration.java index 16d940b16f49c..a480a72e60bd2 100644 --- a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleHadoopFileSystemConfiguration.java +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleHadoopFileSystemConfiguration.java @@ -34,28 +34,29 @@ class GoogleHadoopFileSystemConfiguration { * querying the value. Modifying this value allows one to control how many mappers are used to * process a given file. */ - public static final HadoopConfigurationProperty BLOCK_SIZE = + static final HadoopConfigurationProperty BLOCK_SIZE = new HadoopConfigurationProperty<>("fs.gs.block.size", 64 * 1024 * 1024L); /** * Configuration key for GCS project ID. Default value: none */ - public static final HadoopConfigurationProperty GCS_PROJECT_ID = + static final HadoopConfigurationProperty GCS_PROJECT_ID = new HadoopConfigurationProperty<>("fs.gs.project.id"); /** * Configuration key for initial working directory of a GHFS instance. Default value: '/' */ - public static final HadoopConfigurationProperty GCS_WORKING_DIRECTORY = + static final HadoopConfigurationProperty GCS_WORKING_DIRECTORY = new HadoopConfigurationProperty<>("fs.gs.working.dir", "/"); /** * Configuration key for setting write buffer size. 
*/ - public static final HadoopConfigurationProperty GCS_OUTPUT_STREAM_BUFFER_SIZE = + static final HadoopConfigurationProperty GCS_OUTPUT_STREAM_BUFFER_SIZE = new HadoopConfigurationProperty<>("fs.gs.outputstream.buffer.size", 8L * 1024 * 1024); private final String workingDirectory; + private final String projectId; public int getOutStreamBufferSize() { return outStreamBufferSize; @@ -67,9 +68,18 @@ public int getOutStreamBufferSize() { this.workingDirectory = GCS_WORKING_DIRECTORY.get(config, config::get); this.outStreamBufferSize = toIntExact(GCS_OUTPUT_STREAM_BUFFER_SIZE.get(config, config::getLongBytes)); + this.projectId = GCS_PROJECT_ID.get(config, config::get); } public String getWorkingDirectory() { return this.workingDirectory; } + + String getProjectId() { + return this.projectId; + } + + public long getMaxListItemsPerCall() { + return 5000L; //TODO: Make this configurable + } } diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/IoExceptionHelper.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/IoExceptionHelper.java new file mode 100644 index 0000000000000..c68a6cac1a1c8 --- /dev/null +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/IoExceptionHelper.java @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.gs; + +import java.io.IOError; +import java.io.IOException; +import java.net.SocketException; +import java.net.SocketTimeoutException; +import javax.net.ssl.SSLException; + +/** + * Translates exceptions from API calls into higher-level meaning, while allowing injectability for + * testing how API errors are handled. + */ +public final class IoExceptionHelper { + + private IoExceptionHelper() {} + + /** + * Determines if a given {@link Throwable} is caused by an IO error. + * + *

Recursively checks {@code getCause()} if outer exception isn't an instance of the correct + * class. + * + * @param throwable The {@link Throwable} to check. + * @return True if the {@link Throwable} is a result of an IO error. + */ + public static boolean isIoError(Throwable throwable) { + if (throwable instanceof IOException || throwable instanceof IOError) { + return true; + } + Throwable cause = throwable.getCause(); + return cause != null && isIoError(cause); + } + + /** + * Determines if a given {@link Throwable} is caused by a socket error. + * + *

Recursively checks {@code getCause()} if outer exception isn't an instance of the correct + * class. + * + * @param throwable The {@link Throwable} to check. + * @return True if the {@link Throwable} is a result of a socket error. + */ + public static boolean isSocketError(Throwable throwable) { + if (throwable instanceof SocketException || throwable instanceof SocketTimeoutException) { + return true; + } + Throwable cause = throwable.getCause(); + // Subset of SSL exceptions that are caused by IO errors (e.g. SSLHandshakeException due to + // unexpected connection closure) is also a socket error. + if (throwable instanceof SSLException && cause != null && isIoError(cause)) { + return true; + } + return cause != null && isSocketError(cause); + } + + /** + * Determines if a given {@link IOException} is caused by a timed out read. + * + * @param e The {@link IOException} to check. + * @return True if the {@link IOException} is a result of a read timeout. + */ + public static boolean isReadTimedOut(IOException e) { + return e instanceof SocketTimeoutException && e.getMessage().equalsIgnoreCase("Read timed out"); + } +} diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/ListFileOptions.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/ListFileOptions.java new file mode 100644 index 0000000000000..2bc74c6fc2190 --- /dev/null +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/ListFileOptions.java @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.gs; + +import javax.annotation.Nonnull; + +final class ListFileOptions { + static final ListFileOptions OBJECTFIELDS = new ListFileOptions("bucket,name,size,updated"); + private final String fields; + + private ListFileOptions(@Nonnull String fields) { + this.fields = fields; + } + + String getFields() { + return fields; + } +} diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/ListObjectOptions.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/ListObjectOptions.java new file mode 100644 index 0000000000000..60ec409b5c7d8 --- /dev/null +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/ListObjectOptions.java @@ -0,0 +1,141 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.fs.gs; + +import javax.annotation.Nullable; + +import static org.apache.hadoop.fs.gs.Constants.PATH_DELIMITER; + +/** Options that can be specified when listing objects in the {@link GoogleCloudStorage}. */ +final class ListObjectOptions { + + /** List all objects in the directory. */ + public static final ListObjectOptions DEFAULT = new Builder().build(); + + /** List all objects with the prefix. */ + public static final ListObjectOptions DEFAULT_FLAT_LIST = + DEFAULT.builder().setDelimiter(null).build(); + + Builder builder() { + Builder result = new Builder(); + result.fields = fields; + result.delimiter = delimiter; + result.maxResults = maxResult; + result.includePrefix = includePrefix; + + return result; + } + + private final String delimiter; + private final boolean includePrefix; + private final long maxResult; + private final String fields; + + private ListObjectOptions(Builder builder) { + this.delimiter = builder.delimiter; + this.includePrefix = builder.includePrefix; + this.maxResult = builder.maxResults; + this.fields = builder.fields; + } + + /** Delimiter to use (typically {@code /}), otherwise {@code null}. */ + @Nullable + String getDelimiter() { + return delimiter; + } + + /** Whether to include prefix object in the result. */ + boolean isIncludePrefix() { + return includePrefix; + } + + /** Maximum number of results to return, unlimited if negative or zero. */ + long getMaxResults() { + return maxResult; + } + + /** + * Comma separated list of object fields to include in the list response. + * + *

See + * object resource for reference. + */ + @Nullable + String getFields() { + return fields; + } + + static class Builder { + private static final int MAX_RESULTS_UNLIMITED = -1; + + static final String OBJECT_FIELDS = + String.join( + /* delimiter= */ ",", + "bucket", + "name", + "timeCreated", + "updated", + "generation", + "metageneration", + "size", + "contentType", + "contentEncoding", + "md5Hash", + "crc32c", + "metadata"); + + private String delimiter; + private boolean includePrefix; + + private long maxResults; + + private String fields; + + Builder() { + this.delimiter = PATH_DELIMITER; + this.includePrefix = false; + this.maxResults = MAX_RESULTS_UNLIMITED; + this.fields = OBJECT_FIELDS; + } + public Builder setDelimiter(String d) { + this.delimiter = d; + return this; + } + + public Builder setIncludePrefix(boolean value) { + this.includePrefix = value; + return this; + } + + public Builder setMaxResults(long mr) { + this.maxResults = mr; + return this; + } + + public Builder setFields(String f) { + this.fields = f; + return this; + } + + public ListObjectOptions build() { + return new ListObjectOptions(this); + } + } +} diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/VerificationAttributes.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/VerificationAttributes.java index 4155482fc7d33..03de0a52e373e 100644 --- a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/VerificationAttributes.java +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/VerificationAttributes.java @@ -1,11 +1,13 @@ /* - * Copyright 2016 Google Inc. + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, diff --git a/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/TestConfiguration.java b/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/TestConfiguration.java new file mode 100644 index 0000000000000..f205276d37294 --- /dev/null +++ b/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/TestConfiguration.java @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.gs; + +/** Access to test configurations values. 
*/ +public abstract class TestConfiguration { + public static final String GCS_TEST_PROJECT_ID = "GCS_TEST_PROJECT_ID"; + public static final String GCS_TEST_JSON_KEYFILE = "GCS_TEST_JSON_KEYFILE"; + + public static final String GCS_TEST_DIRECT_PATH_PREFERRED = "GCS_TEST_DIRECT_PATH_PREFERRED"; + + /** Environment-based test configuration. */ + public static class EnvironmentBasedTestConfiguration extends TestConfiguration { + @Override + public String getProjectId() { + return System.getenv(GCS_TEST_PROJECT_ID); + } + + @Override + public String getServiceAccountJsonKeyFile() { + return System.getenv(GCS_TEST_JSON_KEYFILE); + } + + @Override + public boolean isDirectPathPreferred() { + String envVar = System.getenv(GCS_TEST_DIRECT_PATH_PREFERRED); + // if env variable is not configured default behaviour is to attempt directPath + if (envVar == null) { + return true; + } + return Boolean.parseBoolean(envVar); + } + } + + public static TestConfiguration getInstance() { + return LazyHolder.INSTANCE; + } + + private static class LazyHolder { + private static final TestConfiguration INSTANCE = new EnvironmentBasedTestConfiguration(); + } + + public abstract String getProjectId(); + + public abstract String getServiceAccountJsonKeyFile(); + + public abstract boolean isDirectPathPreferred(); +} diff --git a/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/TestStorageResourceId.java b/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/TestStorageResourceId.java index e0a39b2d7e403..e027c7b40914a 100644 --- a/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/TestStorageResourceId.java +++ b/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/TestStorageResourceId.java @@ -1,11 +1,13 @@ /* - * Copyright 2013 Google Inc. + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, diff --git a/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/TestStringPaths.java b/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/TestStringPaths.java index 16234e0ce1d57..a6b64ff7cffab 100644 --- a/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/TestStringPaths.java +++ b/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/TestStringPaths.java @@ -1,11 +1,13 @@ /* - * Copyright 2013 Google Inc. + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, diff --git a/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/TestUriPaths.java b/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/TestUriPaths.java index fe93a28dc435c..0325df52f9b72 100644 --- a/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/TestUriPaths.java +++ b/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/TestUriPaths.java @@ -1,11 +1,13 @@ /* - * Copyright 2013 Google Inc. + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, diff --git a/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/GoogleContract.java b/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/GoogleContract.java new file mode 100644 index 0000000000000..aa131981caf2b --- /dev/null +++ b/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/GoogleContract.java @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.gs.contract; + +import org.apache.hadoop.fs.gs.TestConfiguration; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.contract.AbstractBondedFSContract; + +/** Contract of GoogleHadoopFileSystem via scheme "gs". 
*/ +public class GoogleContract extends AbstractBondedFSContract { + private static final String CONTRACT_XML = "contract/gs.xml"; + + public GoogleContract(Configuration conf) { + super(conf); + addConfResource(CONTRACT_XML); + conf.set("fs.contract.test.fs.gs", "gs://arunchacko-oss-test-bucket"); // TODO: + + TestConfiguration testConf = TestConfiguration.getInstance(); + if (testConf.getProjectId() != null) { + conf.set("fs.gs.project.id", testConf.getProjectId()); + } + } + + @Override + public String getScheme() { + return "gs"; + } +} diff --git a/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractDelete.java b/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractDelete.java new file mode 100644 index 0000000000000..7ed3834025c3c --- /dev/null +++ b/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractDelete.java @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.fs.gs.contract; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.contract.AbstractContractDeleteTest; +import org.apache.hadoop.fs.contract.AbstractFSContract; +import org.apache.hadoop.fs.contract.ContractTestUtils; + +public class ITestGoogleContractDelete extends AbstractContractDeleteTest { + @Override + protected AbstractFSContract createContract(Configuration conf) { + return new GoogleContract(conf); + } + + @Override + public void testDeleteEmptyDirNonRecursive() { + // TODO: Enable this + ContractTestUtils.skip("Skipping the test. This will be enabled in a subsequent change"); + } +} diff --git a/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractGetFileStatus.java b/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractGetFileStatus.java new file mode 100644 index 0000000000000..aae16c2a410ef --- /dev/null +++ b/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractGetFileStatus.java @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.fs.gs.contract; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.contract.AbstractContractGetFileStatusTest; +import org.apache.hadoop.fs.contract.AbstractFSContract; + +public class ITestGoogleContractGetFileStatus extends AbstractContractGetFileStatusTest { + @Override + protected AbstractFSContract createContract(Configuration conf) { + return new GoogleContract(conf); + } +} diff --git a/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractMkdir.java b/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractMkdir.java new file mode 100644 index 0000000000000..26181f20385a3 --- /dev/null +++ b/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractMkdir.java @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.fs.gs.contract; + +import org.apache.hadoop.fs.contract.ContractTestUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.contract.AbstractContractMkdirTest; +import org.apache.hadoop.fs.contract.AbstractFSContract; + +public class ITestGoogleContractMkdir extends AbstractContractMkdirTest { + @Override + protected AbstractFSContract createContract(Configuration conf) { + return new GoogleContract(conf); + } + + @Override + public void testMkdirsDoesNotRemoveParentDirectories() { + // TODO: Enable this + ContractTestUtils.skip("Skipping the test. This will be enabled in a subsequent change"); + } + + @Override + public void testCreateDirWithExistingDir() { + // TODO: Enable this + ContractTestUtils.skip("Skipping the test. This will be enabled in a subsequent change"); + } + + @Override + public void testMkDirRmDir() { + // TODO: Enable this + ContractTestUtils.skip("Skipping the test. This will be enabled in a subsequent change"); + } + + @Override + public void testNoMkdirOverFile() { + // TODO: Enable this + ContractTestUtils.skip("Skipping the test. This will be enabled in a subsequent change"); + } + + @Override + public void testMkdirOverParentFile() { + // TODO: Enable this + ContractTestUtils.skip("Skipping the test. This will be enabled in a subsequent change"); + } +} diff --git a/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/package-info.java b/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/package-info.java new file mode 100644 index 0000000000000..8806dc9f45bf7 --- /dev/null +++ b/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/package-info.java @@ -0,0 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Google Cloud Storage Filesystem contract tests. + */ +package org.apache.hadoop.fs.gs.contract; \ No newline at end of file From 115a0e32cf6ed943cea3ff4b00d6ff564dd1c565 Mon Sep 17 00:00:00 2001 From: Arunkumar Chacko Date: Mon, 16 Jun 2025 19:48:13 +0000 Subject: [PATCH 3/8] HADOOP-19343: Add instruction for running tests Closes #7734 Signed-off-by: Chris Nauroth --- .../site/markdown/tools/hadoop-gcp/testing.md | 70 +++++++++++++++++++ .../hadoop/fs/gs/contract/GoogleContract.java | 7 -- .../src/test/resources/contract/gs.xml | 21 ++++++ 3 files changed, 91 insertions(+), 7 deletions(-) create mode 100644 hadoop-tools/hadoop-gcp/src/site/markdown/tools/hadoop-gcp/testing.md create mode 100644 hadoop-tools/hadoop-gcp/src/test/resources/contract/gs.xml diff --git a/hadoop-tools/hadoop-gcp/src/site/markdown/tools/hadoop-gcp/testing.md b/hadoop-tools/hadoop-gcp/src/site/markdown/tools/hadoop-gcp/testing.md new file mode 100644 index 0000000000000..a56d7e6c395f8 --- /dev/null +++ b/hadoop-tools/hadoop-gcp/src/site/markdown/tools/hadoop-gcp/testing.md @@ -0,0 +1,70 @@ + + +# Testing the GCS filesystem client and its features + + + +This module includes both unit tests, which can run in isolation without +connecting to the GCS service, and integration tests, which require a working +connection to GCS to interact with a bucket. 
Unit test suites follow the naming +convention `Test*.java`. Integration tests follow the naming convention +`ITest*.java`. + +## Setting up the tests + +To integration test the GCS filesystem client, you need to provide +`auth-keys.xml` which passes in authentication details to the test runner. + +It is a Hadoop XML configuration file, which must be placed into +`hadoop-tools/hadoop-gcp/src/test/resources`. + +### File `core-site.xml` + +This file pre-exists and sources the configurations created +under `auth-keys.xml`. + +For most purposes you will not need to edit this file unless you +need to apply a specific, non-default property change during the tests. + +### File `auth-keys.xml` + +The presence of this file triggers the testing of the GCS classes. + +Without this file, *none of the integration tests in this module will be +executed*. + +* `fs.contract.test.fs.gs` : the URL of the bucket for GCS filesystem contract tests + +Example: + +```xml + + + fs.contract.test.fs.gs + gs://your bucket name + + + +``` + +## Running the Tests + +After completing the configuration, execute the test run through Maven. + +This has to be run from a GCP VM. This limitation will be removed later. 
+ +```bash +mvn clean verify +``` \ No newline at end of file diff --git a/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/GoogleContract.java b/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/GoogleContract.java index aa131981caf2b..af65e2c516be8 100644 --- a/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/GoogleContract.java +++ b/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/GoogleContract.java @@ -18,7 +18,6 @@ package org.apache.hadoop.fs.gs.contract; -import org.apache.hadoop.fs.gs.TestConfiguration; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.contract.AbstractBondedFSContract; @@ -29,12 +28,6 @@ public class GoogleContract extends AbstractBondedFSContract { public GoogleContract(Configuration conf) { super(conf); addConfResource(CONTRACT_XML); - conf.set("fs.contract.test.fs.gs", "gs://arunchacko-oss-test-bucket"); // TODO: - - TestConfiguration testConf = TestConfiguration.getInstance(); - if (testConf.getProjectId() != null) { - conf.set("fs.gs.project.id", testConf.getProjectId()); - } } @Override diff --git a/hadoop-tools/hadoop-gcp/src/test/resources/contract/gs.xml b/hadoop-tools/hadoop-gcp/src/test/resources/contract/gs.xml new file mode 100644 index 0000000000000..1de34245a5d1a --- /dev/null +++ b/hadoop-tools/hadoop-gcp/src/test/resources/contract/gs.xml @@ -0,0 +1,21 @@ + + + + + From 4e18572a77e4cb6000ced14b4962d749977ac9f4 Mon Sep 17 00:00:00 2001 From: Arunkumar Chacko Date: Wed, 18 Jun 2025 04:21:07 +0000 Subject: [PATCH 4/8] HADOOP-19343: Add support for open() and rename() Closes #7742 Signed-off-by: Chris Nauroth --- .../java/org/apache/hadoop/fs/gs/Fadvise.java | 23 + .../fs/gs/FileAccessPatternManager.java | 184 ++++++ .../hadoop/fs/gs/GoogleCloudStorage.java | 289 +++++++++ .../GoogleCloudStorageClientReadChannel.java | 609 ++++++++++++++++++ .../fs/gs/GoogleCloudStorageExceptions.java | 58 ++ 
.../fs/gs/GoogleCloudStorageFileSystem.java | 322 ++++++++- .../fs/gs/GoogleHadoopFSInputStream.java | 187 ++++++ .../hadoop/fs/gs/GoogleHadoopFileSystem.java | 45 +- .../GoogleHadoopFileSystemConfiguration.java | 120 +++- .../main/java/org/apache/hadoop/fs/gs/Gs.java | 64 ++ .../apache/hadoop/fs/gs/ListFileOptions.java | 4 + .../gs/contract/ITestGoogleContractOpen.java | 31 + .../contract/ITestGoogleContractRename.java | 43 ++ .../gs/contract/ITestGoogleContractSeek.java | 30 + 14 files changed, 1991 insertions(+), 18 deletions(-) create mode 100644 hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/Fadvise.java create mode 100644 hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/FileAccessPatternManager.java create mode 100644 hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorageClientReadChannel.java create mode 100644 hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorageExceptions.java create mode 100644 hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleHadoopFSInputStream.java create mode 100644 hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/Gs.java create mode 100644 hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractOpen.java create mode 100644 hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractRename.java create mode 100644 hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractSeek.java diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/Fadvise.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/Fadvise.java new file mode 100644 index 0000000000000..6bc82324ddf7f --- /dev/null +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/Fadvise.java @@ -0,0 +1,23 @@ +package org.apache.hadoop.fs.gs; + +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor 
license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +enum Fadvise { + RANDOM, SEQUENTIAL, AUTO, AUTO_RANDOM +} diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/FileAccessPatternManager.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/FileAccessPatternManager.java new file mode 100644 index 0000000000000..e653e2d4bf19a --- /dev/null +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/FileAccessPatternManager.java @@ -0,0 +1,184 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package org.apache.hadoop.fs.gs;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Manages the access pattern of object being read from cloud storage. For adaptive fadvise
+ * configurations it computes the access pattern based on previous requests.
+ */
+class FileAccessPatternManager {
+  private static final Logger LOG = LoggerFactory.getLogger(FileAccessPatternManager.class);
+  private final StorageResourceId resourceId;
+  private final GoogleHadoopFileSystemConfiguration config;
+  private final Fadvise fadvise;
+  private boolean isPatternOverriden;
+  private boolean randomAccess;
+  // keeps track of any backward seek requested in lifecycle of InputStream
+  private boolean isBackwardSeekRequested = false;
+  // keeps track of any forward seek requested in lifecycle of InputStream
+  private boolean isForwardSeekRequested = false;
+  private long lastServedIndex = -1;
+  // Keeps track of distance between consecutive requests
+  private int consecutiveSequentialCount = 0;
+
+  FileAccessPatternManager(
+      StorageResourceId resourceId, GoogleHadoopFileSystemConfiguration configuration) {
+    this.isPatternOverriden = false;
+    this.resourceId = resourceId;
+    this.config = configuration;
+    this.fadvise = config.getFadvise();
+    this.randomAccess = fadvise == Fadvise.AUTO_RANDOM || fadvise == Fadvise.RANDOM;
+  }
+
+  void updateLastServedIndex(long position) {
+    this.lastServedIndex = position;
+  }
+
+  boolean shouldAdaptToRandomAccess() {
+    return randomAccess;
+  }
+
+  void updateAccessPattern(long currentPosition) {
+    if (isPatternOverriden) {
+      LOG.trace("Will bypass computing access pattern as it's overriden for resource :{}",
+          resourceId);
+      return;
+    }
+    updateSeekFlags(currentPosition);
+    if (fadvise == Fadvise.AUTO_RANDOM) {
+      if (randomAccess) {
+        if (shouldAdaptToSequential(currentPosition)) {
+          unsetRandomAccess();
+        }
+      } else {
+        if (shouldAdaptToRandomAccess(currentPosition)) {
+          setRandomAccess();
+        }
+      }
+    } else if (fadvise == 
Fadvise.AUTO) { + if (shouldAdaptToRandomAccess(currentPosition)) { + setRandomAccess(); + } + } + } + + /** + * This provides a way to override the access isRandomPattern, once overridden it will not be + * recomputed for adaptive fadvise types. + * + * @param isRandomPattern, true, to override with random access else false + */ + void overrideAccessPattern(boolean isRandomPattern) { + this.isPatternOverriden = true; + this.randomAccess = isRandomPattern; + LOG.trace( + "Overriding the random access pattern to %s for fadvise:%s for resource: %s ", + isRandomPattern, fadvise, resourceId); + } + + private boolean shouldAdaptToSequential(long currentPosition) { + if (lastServedIndex != -1) { + long distance = currentPosition - lastServedIndex; + if (distance < 0 || distance > config.getInplaceSeekLimit()) { + consecutiveSequentialCount = 0; + } else { + consecutiveSequentialCount++; + } + } + + if (!shouldDetectSequentialAccess()) { + return false; + } + + if (consecutiveSequentialCount < config.getFadviseRequestTrackCount()) { + return false; + } + LOG.trace( + "Detected {} consecutive read request within distance threshold {} with fadvise: {} " + + "switching to sequential IO for '{}'", + consecutiveSequentialCount, + config.getInplaceSeekLimit(), + fadvise, + resourceId); + return true; + } + + private boolean shouldAdaptToRandomAccess(long currentPosition) { + if (!shouldDetectRandomAccess()) { + return false; + } + if (lastServedIndex == -1) { + return false; + } + + if (isBackwardOrForwardSeekRequested()) { + LOG.trace( + "Backward or forward seek requested, isBackwardSeek: {}, isForwardSeek:{} for '{}'", + isBackwardSeekRequested, isForwardSeekRequested, resourceId); + return true; + } + return false; + } + + private boolean shouldDetectSequentialAccess() { + return randomAccess + && !isBackwardOrForwardSeekRequested() + && consecutiveSequentialCount >= config.getFadviseRequestTrackCount() + && fadvise == Fadvise.AUTO_RANDOM; + } + + private boolean 
shouldDetectRandomAccess() { + return !randomAccess && (fadvise == Fadvise.AUTO || fadvise == Fadvise.AUTO_RANDOM); + } + + private void setRandomAccess() { + randomAccess = true; + } + + private void unsetRandomAccess() { + randomAccess = false; + } + + private boolean isBackwardOrForwardSeekRequested() { + return isBackwardSeekRequested || isForwardSeekRequested; + } + + private void updateSeekFlags(long currentPosition) { + if (lastServedIndex == -1) { + return; + } + + if (currentPosition < lastServedIndex) { + isBackwardSeekRequested = true; + LOG.trace( + "Detected backward read from {} to {} position, updating to backwardSeek for '{}'", + lastServedIndex, currentPosition, resourceId); + + } else if (lastServedIndex + config.getInplaceSeekLimit() < currentPosition) { + isForwardSeekRequested = true; + LOG.trace( + "Detected forward read from {} to {} position over {} threshold," + + " updated to forwardSeek for '{}'", + lastServedIndex, currentPosition, config.getInplaceSeekLimit(), resourceId); + } + } +} diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorage.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorage.java index d68eca6a8a5f3..89a86eef8ff07 100644 --- a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorage.java +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorage.java @@ -21,6 +21,7 @@ import static org.apache.hadoop.thirdparty.com.google.common.base.Preconditions.*; import static org.apache.hadoop.thirdparty.com.google.common.base.Strings.isNullOrEmpty; import static java.lang.Math.toIntExact; +import static org.apache.hadoop.fs.gs.GoogleCloudStorageExceptions.createFileNotFoundException; import com.google.api.client.util.BackOff; import com.google.api.client.util.ExponentialBackOff; @@ -34,7 +35,9 @@ import org.slf4j.LoggerFactory; import javax.annotation.Nullable; +import java.io.FileNotFoundException; import 
java.io.IOException; +import java.nio.channels.SeekableByteChannel; import java.nio.channels.WritableByteChannel; import java.nio.file.FileAlreadyExistsException; import java.time.Duration; @@ -582,6 +585,292 @@ private List listBucketsInternal() throws IOException { return allBuckets; } + public SeekableByteChannel open(GoogleCloudStorageItemInfo itemInfo, + GoogleHadoopFileSystemConfiguration config) throws IOException { + LOG.trace("open({})", itemInfo); + checkNotNull(itemInfo, "itemInfo should not be null"); + + StorageResourceId resourceId = itemInfo.getResourceId(); + checkArgument( + resourceId.isStorageObject(), "Expected full StorageObject id, got %s", resourceId); + + return open(resourceId, itemInfo, config); + } + + private SeekableByteChannel open( + StorageResourceId resourceId, + GoogleCloudStorageItemInfo itemInfo, + GoogleHadoopFileSystemConfiguration config) + throws IOException { + return new GoogleCloudStorageClientReadChannel( + storage, + itemInfo == null ? getItemInfo(resourceId) : itemInfo, + config); + } + + public void move(Map sourceToDestinationObjectsMap) + throws IOException { + validateMoveArguments(sourceToDestinationObjectsMap); + + if (sourceToDestinationObjectsMap.isEmpty()) { + return; + } + + for (Map.Entry entry : + sourceToDestinationObjectsMap.entrySet()) { + StorageResourceId srcObject = entry.getKey(); + StorageResourceId dstObject = entry.getValue(); + // TODO: Do this concurrently + moveInternal( + srcObject.getBucketName(), + srcObject.getGenerationId(), + srcObject.getObjectName(), + dstObject.getGenerationId(), + dstObject.getObjectName()); + } + } + + private void moveInternal( + String srcBucketName, + long srcContentGeneration, + String srcObjectName, + long dstContentGeneration, + String dstObjectName) throws IOException { + Storage.MoveBlobRequest.Builder moveRequestBuilder = + createMoveRequestBuilder( + srcBucketName, + srcObjectName, + dstObjectName, + srcContentGeneration, + dstContentGeneration); + try { + 
String srcString = StringPaths.fromComponents(srcBucketName, srcObjectName); + String dstString = StringPaths.fromComponents(srcBucketName, dstObjectName); + + Blob movedBlob = storage.moveBlob(moveRequestBuilder.build()); + if (movedBlob != null) { + LOG.trace("Successfully moved {} to {}", srcString, dstString); + } + } catch (StorageException e) { + if (ErrorTypeExtractor.getErrorType(e) == ErrorTypeExtractor.ErrorType.NOT_FOUND) { + throw createFileNotFoundException(srcBucketName, srcObjectName, new IOException(e)); + } else { + throw + new IOException( + String.format( + "Error moving '%s'", + StringPaths.fromComponents(srcBucketName, srcObjectName)), + e); + } + } + } + + /** Creates a builder for a blob move request. */ + private Storage.MoveBlobRequest.Builder createMoveRequestBuilder( + String srcBucketName, + String srcObjectName, + String dstObjectName, + long srcContentGeneration, + long dstContentGeneration) { + + Storage.MoveBlobRequest.Builder moveRequestBuilder = + Storage.MoveBlobRequest.newBuilder().setSource(BlobId.of(srcBucketName, srcObjectName)); + moveRequestBuilder.setTarget(BlobId.of(srcBucketName, dstObjectName)); + + List blobTargetOptions = new ArrayList<>(); + List blobSourceOptions = new ArrayList<>(); + + if (srcContentGeneration != StorageResourceId.UNKNOWN_GENERATION_ID) { + blobSourceOptions.add(Storage.BlobSourceOption.generationMatch(srcContentGeneration)); + } + + if (dstContentGeneration != StorageResourceId.UNKNOWN_GENERATION_ID) { + blobTargetOptions.add(Storage.BlobTargetOption.generationMatch(dstContentGeneration)); + } + + // TODO: Add encryption support + + moveRequestBuilder.setSourceOptions(blobSourceOptions); + moveRequestBuilder.setTargetOptions(blobTargetOptions); + + return moveRequestBuilder; + } + + /** + * Validates basic argument constraints like non-null, non-empty Strings, using {@code + * Preconditions} in addition to checking for src/dst bucket equality. 
+ */ + public static void validateMoveArguments( + Map sourceToDestinationObjectsMap) throws IOException { + checkNotNull(sourceToDestinationObjectsMap, "srcObjects must not be null"); + + if (sourceToDestinationObjectsMap.isEmpty()) { + return; + } + + for (Map.Entry entry : + sourceToDestinationObjectsMap.entrySet()) { + StorageResourceId source = entry.getKey(); + StorageResourceId destination = entry.getValue(); + String srcBucketName = source.getBucketName(); + String dstBucketName = destination.getBucketName(); + // Avoid move across buckets. + if (!srcBucketName.equals(dstBucketName)) { + throw new UnsupportedOperationException( + "This operation is not supported across two different buckets."); + } + checkArgument( + !isNullOrEmpty(source.getObjectName()), "srcObjectName must not be null or empty"); + checkArgument( + !isNullOrEmpty(destination.getObjectName()), "dstObjectName must not be null or empty"); + if (srcBucketName.equals(dstBucketName) + && source.getObjectName().equals(destination.getObjectName())) { + throw new IllegalArgumentException( + String.format( + "Move destination must be different from source for %s.", + StringPaths.fromComponents(srcBucketName, source.getObjectName()))); + } + } + } + + void copy(Map sourceToDestinationObjectsMap) + throws IOException { + validateCopyArguments(sourceToDestinationObjectsMap, this); + + if (sourceToDestinationObjectsMap.isEmpty()) { + return; + } + + for (Map.Entry entry : + sourceToDestinationObjectsMap.entrySet()) { + StorageResourceId srcObject = entry.getKey(); + StorageResourceId dstObject = entry.getValue(); + // TODO: Do this concurrently + copyInternal( + srcObject.getBucketName(), + srcObject.getObjectName(), + dstObject.getGenerationId(), + dstObject.getBucketName(), + dstObject.getObjectName()); + } + } + + private void copyInternal( + String srcBucketName, + String srcObjectName, + long dstContentGeneration, + String dstBucketName, + String dstObjectName) throws IOException { + 
Storage.CopyRequest.Builder copyRequestBuilder = + Storage.CopyRequest.newBuilder().setSource(BlobId.of(srcBucketName, srcObjectName)); + if (dstContentGeneration != StorageResourceId.UNKNOWN_GENERATION_ID) { + copyRequestBuilder.setTarget( + BlobId.of(dstBucketName, dstObjectName), + Storage.BlobTargetOption.generationMatch(dstContentGeneration)); + } else { + copyRequestBuilder.setTarget(BlobId.of(dstBucketName, dstObjectName)); + } + + // TODO: Add support for encryption key + if (configuration.getMaxRewriteChunkSize() > 0) { + copyRequestBuilder.setMegabytesCopiedPerChunk( + // Convert raw byte size into Mib. + configuration.getMaxRewriteChunkSize() / (1024 * 1024)); + } + + String srcString = StringPaths.fromComponents(srcBucketName, srcObjectName); + String dstString = StringPaths.fromComponents(dstBucketName, dstObjectName); + + try { + CopyWriter copyWriter = storage.copy(copyRequestBuilder.build()); + while (!copyWriter.isDone()) { + copyWriter.copyChunk(); + LOG.trace( + "Copy ({} to {}) did not complete. 
Resuming...", srcString, dstString); + } + LOG.trace("Successfully copied {} to {}", srcString, dstString); + } catch (StorageException e) { + if (ErrorTypeExtractor.getErrorType(e) == ErrorTypeExtractor.ErrorType.NOT_FOUND) { + throw createFileNotFoundException(srcBucketName, srcObjectName, new IOException(e)); + } else { + throw new IOException(String.format("copy(%s->%s) failed.", srcString, dstString), e); + } + } + } + + public static void validateCopyArguments( + Map sourceToDestinationObjectsMap, + GoogleCloudStorage gcsImpl) + throws IOException { + checkNotNull(sourceToDestinationObjectsMap, "srcObjects must not be null"); + + if (sourceToDestinationObjectsMap.isEmpty()) { + return; + } + + Map bucketInfoCache = new HashMap<>(); + + for (Map.Entry entry : + sourceToDestinationObjectsMap.entrySet()) { + StorageResourceId source = entry.getKey(); + StorageResourceId destination = entry.getValue(); + String srcBucketName = source.getBucketName(); + String dstBucketName = destination.getBucketName(); + // Avoid copy across locations or storage classes. 
+ if (!srcBucketName.equals(dstBucketName)) { + StorageResourceId srcBucketResourceId = new StorageResourceId(srcBucketName); + GoogleCloudStorageItemInfo srcBucketInfo = + getGoogleCloudStorageItemInfo(gcsImpl, bucketInfoCache, srcBucketResourceId); + if (!srcBucketInfo.exists()) { + throw new FileNotFoundException("Bucket not found: " + srcBucketName); + } + + StorageResourceId dstBucketResourceId = new StorageResourceId(dstBucketName); + GoogleCloudStorageItemInfo dstBucketInfo = + getGoogleCloudStorageItemInfo(gcsImpl, bucketInfoCache, dstBucketResourceId); + if (!dstBucketInfo.exists()) { + throw new FileNotFoundException("Bucket not found: " + dstBucketName); + } + + // TODO: Restrict this only when copy-with-rewrite is enabled + if (!srcBucketInfo.getLocation().equals(dstBucketInfo.getLocation())) { + throw new UnsupportedOperationException( + "This operation is not supported across two different storage locations."); + } + + if (!srcBucketInfo.getStorageClass().equals(dstBucketInfo.getStorageClass())) { + throw new UnsupportedOperationException( + "This operation is not supported across two different storage classes."); + } + } + checkArgument( + !isNullOrEmpty(source.getObjectName()), "srcObjectName must not be null or empty"); + checkArgument( + !isNullOrEmpty(destination.getObjectName()), "dstObjectName must not be null or empty"); + if (srcBucketName.equals(dstBucketName) + && source.getObjectName().equals(destination.getObjectName())) { + throw new IllegalArgumentException( + String.format( + "Copy destination must be different from source for %s.", + StringPaths.fromComponents(srcBucketName, source.getObjectName()))); + } + } + } + + private static GoogleCloudStorageItemInfo getGoogleCloudStorageItemInfo( + GoogleCloudStorage gcsImpl, + Map bucketInfoCache, + StorageResourceId resourceId) + throws IOException { + GoogleCloudStorageItemInfo storageItemInfo = bucketInfoCache.get(resourceId); + if (storageItemInfo != null) { + return storageItemInfo; + } 
+ storageItemInfo = gcsImpl.getItemInfo(resourceId); + bucketInfoCache.put(resourceId, storageItemInfo); + return storageItemInfo; + } + // Helper class to capture the results of list operation. private class ListOperationResult { private final Map prefixes = new HashMap<>(); diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorageClientReadChannel.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorageClientReadChannel.java new file mode 100644 index 0000000000000..afe16c66701c4 --- /dev/null +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorageClientReadChannel.java @@ -0,0 +1,609 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.fs.gs; + +import static org.apache.hadoop.thirdparty.com.google.common.base.Preconditions.checkArgument; +import static org.apache.hadoop.thirdparty.com.google.common.base.Preconditions.checkNotNull; +import static org.apache.hadoop.thirdparty.com.google.common.base.Preconditions.checkState; +import static org.apache.hadoop.thirdparty.com.google.common.base.Strings.nullToEmpty; +import static java.lang.Math.max; +import static java.lang.Math.min; +import static java.lang.Math.toIntExact; +import static org.apache.hadoop.fs.gs.GoogleCloudStorageExceptions.createFileNotFoundException; + +import com.google.cloud.ReadChannel; +import com.google.cloud.storage.BlobId; +import com.google.cloud.storage.Storage; +import com.google.cloud.storage.Storage.BlobSourceOption; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.ByteArrayInputStream; +import java.io.EOFException; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.InputStream; +import java.nio.ByteBuffer; +import java.nio.channels.Channels; +import java.nio.channels.ClosedChannelException; +import java.nio.channels.ReadableByteChannel; +import java.nio.channels.SeekableByteChannel; +import java.util.ArrayList; +import java.util.List; +import javax.annotation.Nullable; + +/** Provides seekable read access to GCS via java-storage library. */ +class GoogleCloudStorageClientReadChannel implements SeekableByteChannel { + private static final Logger LOG = + LoggerFactory.getLogger(GoogleCloudStorageClientReadChannel.class); + private static final String GZIP_ENCODING = "gzip"; + + private final StorageResourceId resourceId; + private final Storage storage; + private final GoogleHadoopFileSystemConfiguration config; + + // The size of this object generation, in bytes. 
+ private long objectSize; + private ContentReadChannel contentReadChannel; + private boolean gzipEncoded = false; + private boolean open = true; + + // Current position in this channel, it could be different from contentChannelCurrentPosition if + // position(long) method calls were made without calls to read(ByteBuffer) method. + private long currentPosition = 0; + + GoogleCloudStorageClientReadChannel( + Storage storage, + GoogleCloudStorageItemInfo itemInfo, + GoogleHadoopFileSystemConfiguration config) + throws IOException { + validate(itemInfo); + this.storage = storage; + this.resourceId = + new StorageResourceId( + itemInfo.getBucketName(), itemInfo.getObjectName(), itemInfo.getContentGeneration()); + this.contentReadChannel = new ContentReadChannel(config, resourceId); + initMetadata(itemInfo.getContentEncoding(), itemInfo.getSize()); + this.config = config; + } + + protected void initMetadata(@Nullable String encoding, long sizeFromMetadata) throws IOException { + gzipEncoded = nullToEmpty(encoding).contains(GZIP_ENCODING); + if (gzipEncoded && !config.isGzipEncodingSupportEnabled()) { + throw new IOException( + "Cannot read GZIP encoded files - content encoding support is disabled."); + } + objectSize = gzipEncoded ? Long.MAX_VALUE : sizeFromMetadata; + } + + @Override + public int read(ByteBuffer dst) throws IOException { + throwIfNotOpen(); + + // Don't try to read if the buffer has no space. + if (dst.remaining() == 0) { + return 0; + } + LOG.trace( + "Reading {} bytes at {} position from '{}'", dst.remaining(), currentPosition, resourceId); + if (currentPosition == objectSize) { + return -1; + } + return contentReadChannel.readContent(dst); + } + + @Override + public int write(ByteBuffer src) throws IOException { + throw new UnsupportedOperationException("Cannot mutate read-only channel"); + } + + @Override + public long position() throws IOException { + return currentPosition; + } + + /** + * Sets this channel's position. + * + *

This method will throw an exception if {@code newPosition} is greater than object size, + * which contradicts {@link SeekableByteChannel#position(long) SeekableByteChannel} contract. + * TODO(user): decide if this needs to be fixed. + * + * @param newPosition the new position, counting the number of bytes from the beginning. + * @return this channel instance + * @throws FileNotFoundException if the underlying object does not exist. + * @throws IOException on IO error + */ + @Override + public SeekableByteChannel position(long newPosition) throws IOException { + throwIfNotOpen(); + + if (newPosition == currentPosition) { + return this; + } + + validatePosition(newPosition); + LOG.trace( + "Seek from {} to {} position for '{}'", currentPosition, newPosition, resourceId); + currentPosition = newPosition; + return this; + } + + @Override + public long size() throws IOException { + return objectSize; + } + + @Override + public SeekableByteChannel truncate(long size) throws IOException { + throw new UnsupportedOperationException("Cannot mutate read-only channel"); + } + + @Override + public boolean isOpen() { + return open; + } + + @Override + public void close() throws IOException { + if (open) { + try { + LOG.trace("Closing channel for '{}'", resourceId); + contentReadChannel.closeContentChannel(); + } catch (Exception e) { + throw new IOException( + String.format("Exception occurred while closing channel '%s'", resourceId), e); + } finally { + contentReadChannel = null; + open = false; + } + } + } + + /** + * This class own the responsibility of opening up contentChannel. It also implements the Fadvise, + * which helps in deciding the boundaries of content channel being opened and also caching the + * footer of an object. + */ + private class ContentReadChannel { + + // Size of buffer to allocate for skipping bytes in-place when performing in-place seeks. 
+ private static final int SKIP_BUFFER_SIZE = 8192; + private final BlobId blobId; + + // This is the actual current position in `contentChannel` from where read can happen. + // This remains unchanged of position(long) method call. + private long contentChannelCurrentPosition = -1; + private long contentChannelEnd = -1; + // Prefetched footer content. + private byte[] footerContent; + // Used as scratch space when reading bytes just to discard them when trying to perform small + // in-place seeks. + private byte[] skipBuffer = null; + private ReadableByteChannel byteChannel = null; + private final FileAccessPatternManager fileAccessManager; + + ContentReadChannel( + GoogleHadoopFileSystemConfiguration config, StorageResourceId resourceId) { + this.blobId = + BlobId.of( + resourceId.getBucketName(), resourceId.getObjectName(), resourceId.getGenerationId()); + this.fileAccessManager = new FileAccessPatternManager(resourceId, config); + if (gzipEncoded) { + fileAccessManager.overrideAccessPattern(false); + } + } + + int readContent(ByteBuffer dst) throws IOException { + + performPendingSeeks(); + + checkState( + contentChannelCurrentPosition == currentPosition || byteChannel == null, + "contentChannelCurrentPosition (%s) should be equal to currentPosition " + + "(%s) after lazy seek, if channel is open", + contentChannelCurrentPosition, + currentPosition); + + int totalBytesRead = 0; + // We read from a streaming source. We may not get all the bytes we asked for + // in the first read. Therefore, loop till we either read the required number of + // bytes or we reach end-of-stream. + while (dst.hasRemaining()) { + int remainingBeforeRead = dst.remaining(); + try { + if (byteChannel == null) { + byteChannel = openByteChannel(dst.remaining()); + // We adjust the start index of content channel in following cases + // 1. request range is in footer boundaries --> request the whole footer + // 2. requested content is gzip encoded -> request always from start of file. 
+ // Case(1) is handled with reading and caching the extra read bytes, for all other cases + // we need to skip all the unrequested bytes before start reading from current position. + if (currentPosition > contentChannelCurrentPosition) { + skipInPlace(); + } + // making sure that currentPosition is in alignment with currentReadPosition before + // actual read starts to avoid read discrepancies. + checkState( + contentChannelCurrentPosition == currentPosition, + "position of read offset isn't in alignment with channel's read offset"); + } + int bytesRead = byteChannel.read(dst); + + /* + As we are using the zero copy implementation of byteChannel, + it can return even zero bytes, + while reading, + we should not treat it as an error scenario anymore. + */ + if (bytesRead == 0) { + LOG.trace( + "Read {} from storage-client's byte channel at position: {} with channel " + + "ending at: {} for resourceId: {} of size: {}", + bytesRead, currentPosition, contentChannelEnd, resourceId, objectSize); + } + + if (bytesRead < 0) { + // Because we don't know decompressed object size for gzip-encoded objects, + // assume that this is an object end. + if (gzipEncoded) { + objectSize = currentPosition; + contentChannelEnd = currentPosition; + } + + if (currentPosition != contentChannelEnd && currentPosition != objectSize) { + throw new IOException( + String.format( + "Received end of stream result before all requestedBytes were received;" + + "EndOf stream signal received at offset: %d where as stream was " + + "suppose to end at: %d for resource: %s of size: %d", + currentPosition, contentChannelEnd, resourceId, objectSize)); + } + // If we have reached an end of a contentChannel but not an end of an object. + // then close contentChannel and continue reading an object if necessary. 
+ if (contentChannelEnd != objectSize && currentPosition == contentChannelEnd) { + closeContentChannel(); + continue; + } else { + break; + } + } + totalBytesRead += bytesRead; + currentPosition += bytesRead; + contentChannelCurrentPosition += bytesRead; + checkState( + contentChannelCurrentPosition == currentPosition, + "contentChannelPosition (%s) should be equal to currentPosition (%s)" + + " after successful read", + contentChannelCurrentPosition, + currentPosition); + } catch (Exception e) { + int partialBytes = partiallyReadBytes(remainingBeforeRead, dst); + currentPosition += partialBytes; + contentChannelCurrentPosition += partialBytes; + LOG.trace( + "Closing contentChannel after {} exception for '{}'.", e.getMessage(), resourceId); + closeContentChannel(); + throw convertError(e); + } + } + return totalBytesRead; + } + + private int partiallyReadBytes(int remainingBeforeRead, ByteBuffer dst) { + int partialReadBytes = 0; + if (remainingBeforeRead != dst.remaining()) { + partialReadBytes = remainingBeforeRead - dst.remaining(); + } + return partialReadBytes; + } + + private ReadableByteChannel openByteChannel(long bytesToRead) throws IOException { + checkArgument( + bytesToRead > 0, "bytesToRead should be greater than 0, but was %s", bytesToRead); + checkState( + byteChannel == null && contentChannelEnd < 0, + "contentChannel and contentChannelEnd should be not initialized yet for '%s'", + resourceId); + + if (footerContent != null && currentPosition >= objectSize - footerContent.length) { + return serveFooterContent(); + } + + // Should be updated only if content is not served from cached footer + fileAccessManager.updateAccessPattern(currentPosition); + + setChannelBoundaries(bytesToRead); + + ReadableByteChannel readableByteChannel = + getStorageReadChannel(contentChannelCurrentPosition, contentChannelEnd); + + if (contentChannelEnd == objectSize + && (contentChannelEnd - contentChannelCurrentPosition) + <= config.getMinRangeRequestSize()) { + + if 
(footerContent == null) { + cacheFooter(readableByteChannel); + } + return serveFooterContent(); + } + return readableByteChannel; + } + + private void setChannelBoundaries(long bytesToRead) { + contentChannelCurrentPosition = getRangeRequestStart(); + contentChannelEnd = getRangeRequestEnd(contentChannelCurrentPosition, bytesToRead); + checkState( + contentChannelEnd >= contentChannelCurrentPosition, + String.format( + "Start position should be <= endPosition startPosition:%d, endPosition: %d", + contentChannelCurrentPosition, contentChannelEnd)); + } + + private void cacheFooter(ReadableByteChannel readableByteChannel) throws IOException { + int footerSize = toIntExact(objectSize - contentChannelCurrentPosition); + footerContent = new byte[footerSize]; + try (InputStream footerStream = Channels.newInputStream(readableByteChannel)) { + int totalBytesRead = 0; + int bytesRead; + do { + bytesRead = footerStream.read(footerContent, totalBytesRead, footerSize - totalBytesRead); + if (bytesRead >= 0) { + totalBytesRead += bytesRead; + } + } while (bytesRead >= 0 && totalBytesRead < footerSize); + checkState( + bytesRead >= 0, + "footerStream shouldn't be empty before reading the footer of size %s, " + + "totalBytesRead %s, read via last call %s, for '%s'", + footerSize, + totalBytesRead, + bytesRead, + resourceId); + checkState( + totalBytesRead == footerSize, + "totalBytesRead (%s) should equal footerSize (%s) for '%s'", + totalBytesRead, + footerSize, + resourceId); + } catch (Exception e) { + footerContent = null; + throw e; + } + LOG.trace("Prefetched {} bytes footer for '{}'", footerContent.length, resourceId); + } + + private ReadableByteChannel serveFooterContent() { + contentChannelCurrentPosition = currentPosition; + int offset = toIntExact(currentPosition - (objectSize - footerContent.length)); + int length = footerContent.length - offset; + LOG.trace( + "Opened channel (prefetched footer) from {} position for '{}'", + currentPosition, resourceId); + return 
Channels.newChannel(new ByteArrayInputStream(footerContent, offset, length)); + } + + private long getRangeRequestStart() { + if (gzipEncoded) { + return 0; + } + if (config.getFadvise() != Fadvise.SEQUENTIAL + && isFooterRead() + && !config.isReadExactRequestedBytesEnabled()) { + // Prefetch footer and adjust start position to footerStart. + return max(0, objectSize - config.getMinRangeRequestSize()); + } + return currentPosition; + } + + private long getRangeRequestEnd(long startPosition, long bytesToRead) { + // Always read gzip-encoded files till the end - they do not support range reads. + if (gzipEncoded) { + return objectSize; + } + long endPosition = objectSize; + if (fileAccessManager.shouldAdaptToRandomAccess()) { + // opening a channel for whole object doesn't make sense as anyhow it will not be utilized + // for further reads. + endPosition = startPosition + max(bytesToRead, config.getMinRangeRequestSize()); + } else { + if (config.getFadvise() == Fadvise.AUTO_RANDOM) { + endPosition = min(startPosition + config.getBlockSize(), objectSize); + } + } + + if (footerContent != null) { + // If footer is cached, open just till footerStart. + // Remaining content will be served from cached footer itself.
+ endPosition = min(endPosition, objectSize - footerContent.length); + } + return endPosition; + } + + void closeContentChannel() { + if (byteChannel != null) { + LOG.trace("Closing internal contentChannel for '{}'", resourceId); + try { + byteChannel.close(); + } catch (Exception e) { + LOG.trace( + "Got an exception on contentChannel.close() for '{}'; ignoring it.", resourceId, e); + } finally { + byteChannel = null; + fileAccessManager.updateLastServedIndex(contentChannelCurrentPosition); + reset(); + } + } + } + + private void reset() { + checkState(byteChannel == null, "contentChannel should be null for '%s'", resourceId); + contentChannelCurrentPosition = -1; + contentChannelEnd = -1; + } + + private boolean isInRangeSeek() { + long seekDistance = currentPosition - contentChannelCurrentPosition; + if (byteChannel != null + && seekDistance > 0 + // for gzip encoded content always seek in place + && (gzipEncoded || seekDistance <= config.getInplaceSeekLimit()) + && currentPosition < contentChannelEnd) { + return true; + } + return false; + } + + private void skipInPlace() { + if (skipBuffer == null) { + skipBuffer = new byte[SKIP_BUFFER_SIZE]; + } + long seekDistance = currentPosition - contentChannelCurrentPosition; + while (seekDistance > 0 && byteChannel != null) { + try { + int bufferSize = toIntExact(min(skipBuffer.length, seekDistance)); + int bytesRead = byteChannel.read(ByteBuffer.wrap(skipBuffer, 0, bufferSize)); + if (bytesRead < 0) { + LOG.info( + "Somehow read {} bytes trying to skip {} bytes to seek to position {}, size: {}", + bytesRead, seekDistance, currentPosition, objectSize); + closeContentChannel(); + } else { + seekDistance -= bytesRead; + contentChannelCurrentPosition += bytesRead; + } + } catch (Exception e) { + LOG.info( + "Got an IO exception on contentChannel.read(), a lazy-seek will be pending for '{}'", + resourceId, e); + closeContentChannel(); + } + } + checkState( + byteChannel == null || contentChannelCurrentPosition == 
currentPosition, + "contentChannelPosition (%s) should be equal to currentPosition (%s)" + + " after successful in-place skip", + contentChannelCurrentPosition, + currentPosition); + } + + private void performPendingSeeks() { + + // Return quickly if there is no pending seek operation, i.e. position didn't change. + if (currentPosition == contentChannelCurrentPosition && byteChannel != null) { + return; + } + + LOG.trace( + "Performing lazySeek from {} to {} position '{}'", + contentChannelCurrentPosition, currentPosition, resourceId); + + if (isInRangeSeek()) { + skipInPlace(); + } else { + // close existing contentChannel as requested bytes can't be served from current + // contentChannel; + closeContentChannel(); + } + } + + private ReadableByteChannel getStorageReadChannel(long seek, long limit) throws IOException { + ReadChannel readChannel = storage.reader(blobId, generateReadOptions()); + try { + readChannel.seek(seek); + readChannel.limit(limit); + // bypass the storage-client caching layer hence eliminates the need to maintain a copy of + // chunk + readChannel.setChunkSize(0); + return readChannel; + } catch (Exception e) { + throw new IOException( + String.format( + "Unable to update the boundaries/Range of contentChannel %s", + resourceId.toString()), + e); + } + } + + private BlobSourceOption[] generateReadOptions() { + List blobReadOptions = new ArrayList<>(); + // To get decoded content + blobReadOptions.add(BlobSourceOption.shouldReturnRawInputStream(false)); + + if (blobId.getGeneration() != null) { + blobReadOptions.add(BlobSourceOption.generationMatch(blobId.getGeneration())); + } + + // TODO: Add support for encryptionKey + return blobReadOptions.toArray(new BlobSourceOption[blobReadOptions.size()]); + } + + private boolean isFooterRead() { + return objectSize - currentPosition <= config.getMinRangeRequestSize(); + } + } + + private static void validate(GoogleCloudStorageItemInfo itemInfo) throws IOException { + checkNotNull(itemInfo, "itemInfo 
cannot be null"); + StorageResourceId resourceId = itemInfo.getResourceId(); + checkArgument( + resourceId.isStorageObject(), "Can not open a non-file object for read: %s", resourceId); + if (!itemInfo.exists()) { + throw new FileNotFoundException(String.format("Item not found: %s", resourceId)); + } + } + + private IOException convertError(Exception error) { + String msg = String.format("Error reading '%s'", resourceId); + switch (ErrorTypeExtractor.getErrorType(error)) { + case NOT_FOUND: + return createFileNotFoundException( + resourceId.getBucketName(), resourceId.getObjectName(), new IOException(msg, error)); + case OUT_OF_RANGE: + return (IOException) new EOFException(msg).initCause(error); + default: + return new IOException(msg, error); + } + } + + /** Validates that the given position is valid for this channel. */ + private void validatePosition(long position) throws IOException { + if (position < 0) { + throw new EOFException( + String.format( + "Invalid seek offset: position value (%d) must be >= 0 for '%s'", + position, resourceId)); + } + + if (objectSize >= 0 && position >= objectSize) { + throw new EOFException( + String.format( + "Invalid seek offset: position value (%d) must be between 0 and %d for '%s'", + position, objectSize, resourceId)); + } + } + + /** Throws if this channel is not currently open. */ + private void throwIfNotOpen() throws IOException { + if (!isOpen()) { + throw new ClosedChannelException(); + } + } +} diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorageExceptions.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorageExceptions.java new file mode 100644 index 0000000000000..95f0e41617c74 --- /dev/null +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorageExceptions.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.gs; + +import static org.apache.hadoop.thirdparty.com.google.common.base.Preconditions.checkArgument; +import static org.apache.hadoop.thirdparty.com.google.common.base.Strings.isNullOrEmpty; +import static org.apache.hadoop.thirdparty.com.google.common.base.Strings.nullToEmpty; + +import java.io.FileNotFoundException; +import java.io.IOException; +import javax.annotation.Nullable; + +/** + * Miscellaneous helper methods for standardizing the types of exceptions thrown by the various + * GCS-based FileSystems. + */ +final class GoogleCloudStorageExceptions { + + private GoogleCloudStorageExceptions() {} + + /** Creates FileNotFoundException with suitable message for a GCS bucket or object. */ + static FileNotFoundException createFileNotFoundException( + String bucketName, String objectName, @Nullable IOException cause) { + checkArgument(!isNullOrEmpty(bucketName), "bucketName must not be null or empty"); + FileNotFoundException fileNotFoundException = + new FileNotFoundException( + String.format( + "Item not found: '%s'. 
Note, it is possible that the live version" + + " is still available but the requested generation is deleted.", + StringPaths.fromComponents(bucketName, nullToEmpty(objectName)))); + if (cause != null) { + fileNotFoundException.initCause(cause); + } + return fileNotFoundException; + } + + static FileNotFoundException createFileNotFoundException( + StorageResourceId resourceId, @Nullable IOException cause) { + return createFileNotFoundException( + resourceId.getBucketName(), resourceId.getObjectName(), cause); + } +} diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorageFileSystem.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorageFileSystem.java index aa1617e4da687..2b0c238eb02a0 100644 --- a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorageFileSystem.java +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorageFileSystem.java @@ -26,6 +26,7 @@ import com.google.auth.Credentials; import org.apache.hadoop.thirdparty.com.google.common.annotations.VisibleForTesting; import org.apache.hadoop.thirdparty.com.google.common.collect.ImmutableList; +import org.apache.hadoop.thirdparty.com.google.common.collect.ImmutableMap; import org.apache.hadoop.thirdparty.com.google.common.collect.Iterables; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -33,13 +34,20 @@ import java.io.FileNotFoundException; import java.io.IOException; import java.net.URI; +import java.nio.channels.SeekableByteChannel; import java.nio.channels.WritableByteChannel; import java.nio.file.DirectoryNotEmptyException; import java.nio.file.FileAlreadyExistsException; import java.util.ArrayList; +import java.util.Collections; import java.util.Comparator; +import java.util.HashMap; import java.util.List; +import java.util.Map; import java.util.Objects; +import java.util.TreeMap; +import java.util.regex.Pattern; +import javax.annotation.Nullable; /** * Provides FS 
semantics over GCS based on Objects API. @@ -73,6 +81,7 @@ class GoogleCloudStorageFileSystem { // URI of the root path. static final URI GCSROOT = URI.create(SCHEME + ":/"); + private final GoogleHadoopFileSystemConfiguration configuration; // GCS access instance. private GoogleCloudStorage gcs; @@ -87,6 +96,7 @@ private static GoogleCloudStorage createCloudStorage( GoogleCloudStorageFileSystem(final GoogleHadoopFileSystemConfiguration configuration, final Credentials credentials) throws IOException { + this.configuration = configuration; gcs = createCloudStorage(configuration, credentials); } @@ -157,9 +167,7 @@ private GoogleCloudStorageItemInfo getFileInfoInternal( resourceId.getBucketName(), resourceId.getObjectName(), GET_FILE_INFO_LIST_OPTIONS); - LOG.trace("List for getMetadata returned {}. {}", listDirResult.size(), listDirResult); if (!listDirResult.isEmpty()) { - LOG.trace("Get metadata for directory returned non empty {}", listDirResult); return GoogleCloudStorageItemInfo.createInferredDirectory(resourceId.toDirectoryId()); } } @@ -371,4 +379,314 @@ public List listFileInfo(URI path, ListFileOptions listOptions) throws fileInfos.sort(FILE_INFO_PATH_COMPARATOR); return fileInfos; } + + FileInfo getFileInfoObject(URI path) throws IOException { + checkArgument(path != null, "path must not be null"); + StorageResourceId resourceId = StorageResourceId.fromUriPath(path, true); + checkArgument( + !resourceId.isDirectory(), + String.format( + "path must be an object and not a directory, path: %s, resourceId: %s", + path, resourceId)); + FileInfo fileInfo = FileInfo.fromItemInfo(gcs.getItemInfo(resourceId)); + LOG.trace("getFileInfoObject(path: {}): {}", path, fileInfo); + return fileInfo; + } + + SeekableByteChannel open(FileInfo fileInfo, GoogleHadoopFileSystemConfiguration config) + throws IOException { + checkNotNull(fileInfo, "fileInfo should not be null"); + checkArgument( + !fileInfo.isDirectory(), "Cannot open a directory for reading: %s", 
fileInfo.getPath()); + + return gcs.open(fileInfo.getItemInfo(), config); + } + + void rename(URI src, URI dst) throws IOException { + LOG.trace("rename(src: {}, dst: {})", src, dst); + checkNotNull(src); + checkNotNull(dst); + checkArgument(!src.equals(GCSROOT), "Root path cannot be renamed."); + + // Parent of the destination path. + URI dstParent = UriPaths.getParentPath(dst); + + // Obtain info on source, destination and destination-parent. + List paths = new ArrayList<>(); + paths.add(src); + paths.add(dst); + if (dstParent != null) { + // dstParent is null if dst is GCS_ROOT. + paths.add(dstParent); + } + List fileInfos = getFileInfos(paths); + FileInfo srcInfo = fileInfos.get(0); + FileInfo dstInfo = fileInfos.get(1); + FileInfo dstParentInfo = dstParent == null ? null : fileInfos.get(2); + + // Throw if the source file does not exist. + if (!srcInfo.exists()) { + throw new FileNotFoundException("Item not found: " + src); + } + + // Make sure paths match what getFileInfo() returned (it can add / at the end). 
+ src = srcInfo.getPath(); + dst = getDstUri(srcInfo, dstInfo, dstParentInfo); + + // if src and dst are equal then do nothing + if (src.equals(dst)) { + return; + } + + if (srcInfo.isDirectory()) { + renameDirectoryInternal(srcInfo, dst); + } else { + renameObject(src, dst, srcInfo); + } + } + + private void renameObject(URI src, URI dst, FileInfo srcInfo) throws IOException { + StorageResourceId srcResourceId = + StorageResourceId.fromUriPath(src, /* allowEmptyObjectName= */ true); + StorageResourceId dstResourceId = StorageResourceId.fromUriPath( + dst, + /* allowEmptyObjectName= */ true, + /* generationId= */ 0L); + + if (srcResourceId.getBucketName().equals(dstResourceId.getBucketName())) { + gcs.move( + ImmutableMap.of( + new StorageResourceId( + srcInfo.getItemInfo().getBucketName(), + srcInfo.getItemInfo().getObjectName(), + srcInfo.getItemInfo().getContentGeneration()), + dstResourceId)); + } else { + gcs.copy(ImmutableMap.of(srcResourceId, dstResourceId)); + + gcs.deleteObjects( + ImmutableList.of( + new StorageResourceId( + srcInfo.getItemInfo().getBucketName(), + srcInfo.getItemInfo().getObjectName(), + srcInfo.getItemInfo().getContentGeneration()))); + } + } + + /** + * Renames given directory without checking any parameters. + * + *

GCS does not support atomic renames therefore rename is implemented as copying source + * metadata to destination and then deleting source metadata. Note that only the metadata is + * copied and not the content of any file. + */ + private void renameDirectoryInternal(FileInfo srcInfo, URI dst) throws IOException { + checkArgument(srcInfo.isDirectory(), "'%s' should be a directory", srcInfo); + checkArgument(dst.toString().endsWith(PATH_DELIMITER), "'%s' should be a directory", dst); + + URI src = srcInfo.getPath(); + + // Mapping from each src to its respective dst. + // Sort src items so that parent directories appear before their children. + // That allows us to copy parent directories before we copy their children. + Map srcToDstItemNames = new TreeMap<>(FILE_INFO_PATH_COMPARATOR); + Map srcToDstMarkerItemNames = new TreeMap<>(FILE_INFO_PATH_COMPARATOR); + + // List of individual paths to rename; + // we will try to carry out the copies in this list's order. + List srcItemInfos = + listFileInfoForPrefix(src, ListFileOptions.DELETE_RENAME_LIST_OPTIONS); + + // Create a list of sub-items to copy. 
+ Pattern markerFilePattern = configuration.getMarkerFilePattern(); + String prefix = src.toString(); + for (FileInfo srcItemInfo : srcItemInfos) { + String relativeItemName = srcItemInfo.getPath().toString().substring(prefix.length()); + URI dstItemName = dst.resolve(relativeItemName); + if (markerFilePattern != null && markerFilePattern.matcher(relativeItemName).matches()) { + srcToDstMarkerItemNames.put(srcItemInfo, dstItemName); + } else { + srcToDstItemNames.put(srcItemInfo, dstItemName); + } + } + + StorageResourceId srcResourceId = + StorageResourceId.fromUriPath(src, /* allowEmptyObjectName= */ true); + StorageResourceId dstResourceId = + StorageResourceId.fromUriPath( + dst, /* allowEmptyObjectName= */ true, /* generationId= */ 0L); + if (srcResourceId.getBucketName().equals(dstResourceId.getBucketName())) { + // First, move all items except marker items + moveInternal(srcToDstItemNames); + // Finally, move marker items (if any) to mark rename operation success + moveInternal(srcToDstMarkerItemNames); + + if (srcInfo.getItemInfo().isBucket()) { + deleteBucket(Collections.singletonList(srcInfo)); + } else { + // If src is a directory then srcItemInfos does not contain its own name, + // we delete item separately in the list. 
+ deleteObjects(Collections.singletonList(srcInfo)); + } + return; + } + + // TODO: Add support for across bucket moves + throw new UnsupportedOperationException(String.format( + "Moving object from bucket '%s' to '%s' is not supported", + srcResourceId.getBucketName(), + dstResourceId.getBucketName())); + } + + List listFileInfoForPrefix(URI prefix, ListFileOptions listOptions) + throws IOException { + LOG.trace("listAllFileInfoForPrefix(prefix: {})", prefix); + StorageResourceId prefixId = getPrefixId(prefix); + List itemInfos = + gcs.listObjectInfo( + prefixId.getBucketName(), + prefixId.getObjectName(), + updateListObjectOptions(ListObjectOptions.DEFAULT_FLAT_LIST, listOptions)); + List fileInfos = FileInfo.fromItemInfos(itemInfos); + fileInfos.sort(FILE_INFO_PATH_COMPARATOR); + return fileInfos; + } + + /** Moves items in given map that maps source items to destination items. */ + private void moveInternal(Map srcToDstItemNames) throws IOException { + if (srcToDstItemNames.isEmpty()) { + return; + } + + Map sourceToDestinationObjectsMap = new HashMap<>(); + + // Prepare list of items to move. + for (Map.Entry srcToDstItemName : srcToDstItemNames.entrySet()) { + StorageResourceId srcResourceId = srcToDstItemName.getKey().getItemInfo().getResourceId(); + + StorageResourceId dstResourceId = + StorageResourceId.fromUriPath(srcToDstItemName.getValue(), true); + sourceToDestinationObjectsMap.put(srcResourceId, dstResourceId); + } + + // Perform move. 
+ gcs.move(sourceToDestinationObjectsMap); + } + + private static ListObjectOptions updateListObjectOptions( + ListObjectOptions listObjectOptions, ListFileOptions listFileOptions) { + return listObjectOptions.builder().setFields(listFileOptions.getFields()).build(); + } + + private List getFileInfos(List paths) throws IOException { + List result = new ArrayList<>(paths.size()); + for (URI path : paths) { + // TODO: Do this concurrently + result.add(getFileInfo(path)); + } + + return result; + } + + private URI getDstUri(FileInfo srcInfo, FileInfo dstInfo, @Nullable FileInfo dstParentInfo) + throws IOException { + URI src = srcInfo.getPath(); + URI dst = dstInfo.getPath(); + + // Throw if src is a file and dst == GCS_ROOT + if (!srcInfo.isDirectory() && dst.equals(GCSROOT)) { + throw new IOException("A file cannot be created in root."); + } + + // Throw if the destination is a file that already exists, and it's not a source file. + if (dstInfo.exists() && !dstInfo.isDirectory() && (srcInfo.isDirectory() || !dst.equals(src))) { + throw new IOException("Cannot overwrite an existing file: " + dst); + } + + // Rename operation cannot be completed if parent of destination does not exist. + if (dstParentInfo != null && !dstParentInfo.exists()) { + throw new IOException( + "Cannot rename because path does not exist: " + dstParentInfo.getPath()); + } + + // Leaf item of the source path. + String srcItemName = getItemName(src); + + // Having taken care of the initial checks, apply the regular rules. + // After applying the rules, we will be left with 2 paths such that: + // -- either both are files or both are directories + // -- src exists and dst leaf does not exist + if (srcInfo.isDirectory()) { + // -- if src is a directory + // -- dst is an existing file => disallowed + // -- dst is a directory => rename the directory. + + // The first case (dst is an existing file) is already checked earlier. 
+ // If the destination path looks like a file, make it look like a + // directory path. This is because users often type 'mv foo bar' + // rather than 'mv foo bar/'. + if (!dstInfo.isDirectory()) { + dst = UriPaths.toDirectory(dst); + } + + // Throw if renaming directory to self - this is forbidden + if (src.equals(dst)) { + throw new IOException("Rename dir to self is forbidden"); + } + + URI dstRelativeToSrc = src.relativize(dst); + // Throw if dst URI relative to src is not equal to dst, + // because this means that src is a parent directory of dst + // and src cannot be "renamed" to its subdirectory + if (!dstRelativeToSrc.equals(dst)) { + throw new IOException("Rename to subdir is forbidden"); + } + + if (dstInfo.exists()) { + dst = + dst.equals(GCSROOT) + ? UriPaths.fromStringPathComponents( + srcItemName, /* objectName= */ null, /* allowEmptyObjectName= */ true) + : UriPaths.toDirectory(dst.resolve(srcItemName)); + } + } else { + // -- src is a file + // -- dst is a file => rename the file. + // -- dst is a directory => similar to the previous case after + // appending src file-name to dst + + if (dstInfo.isDirectory()) { + if (!dstInfo.exists()) { + throw new IOException("Cannot rename because path does not exist: " + dstInfo.getPath()); + } else { + dst = dst.resolve(srcItemName); + } + } + } + + return dst; + } + + @Nullable + static String getItemName(URI path) { + checkNotNull(path, "path can not be null"); + + // There is no leaf item for the root path. + if (path.equals(GCSROOT)) { + return null; + } + + StorageResourceId resourceId = StorageResourceId.fromUriPath(path, true); + + if (resourceId.isBucket()) { + return resourceId.getBucketName(); + } + + String objectName = resourceId.getObjectName(); + int index = + StringPaths.isDirectoryPath(objectName) + ? objectName.lastIndexOf(PATH_DELIMITER, objectName.length() - 2) + : objectName.lastIndexOf(PATH_DELIMITER); + return index < 0 ? 
objectName : objectName.substring(index + 1); + } } diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleHadoopFSInputStream.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleHadoopFSInputStream.java new file mode 100644 index 0000000000000..26629fc79b27e --- /dev/null +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleHadoopFSInputStream.java @@ -0,0 +1,187 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.fs.gs; + +import static org.apache.hadoop.thirdparty.com.google.common.base.Preconditions.checkNotNull; +import static org.apache.hadoop.thirdparty.com.google.common.base.Preconditions.checkState; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.net.URI; +import java.nio.ByteBuffer; +import java.nio.channels.ClosedChannelException; +import java.nio.channels.SeekableByteChannel; +import javax.annotation.Nonnull; +import org.apache.hadoop.fs.FSExceptionMessages; +import org.apache.hadoop.fs.FSInputStream; +import org.apache.hadoop.fs.FileSystem; + +final class GoogleHadoopFSInputStream extends FSInputStream { + public static final Logger LOG = LoggerFactory.getLogger(GoogleHadoopFSInputStream.class); + + // Used for single-byte reads. + private final byte[] singleReadBuf = new byte[1]; + + // Path of the file to read. + private final URI gcsPath; + // File Info of gcsPath, will be pre-populated in some cases i.e. when Json client is used and + // failFast is disabled. + + // All store IO access goes through this. + private final SeekableByteChannel channel; + // Number of bytes read through this channel. + private long totalBytesRead = 0; + + /** + * Closed bit. Volatile so reads are non-blocking. 
Updates must be in a synchronized block to + * guarantee an atomic check and set + */ + private volatile boolean closed; + + // Statistics tracker provided by the parent GoogleHadoopFileSystem for recording stats + private final FileSystem.Statistics statistics; + + static GoogleHadoopFSInputStream create( + GoogleHadoopFileSystem ghfs, URI gcsPath, FileSystem.Statistics statistics) + throws IOException { + LOG.trace("create(gcsPath: {})", gcsPath); + GoogleCloudStorageFileSystem gcsFs = ghfs.getGcsFs(); + FileInfo fileInfo = gcsFs.getFileInfoObject(gcsPath); + SeekableByteChannel channel = gcsFs.open(fileInfo, ghfs.getFileSystemConfiguration()); + return new GoogleHadoopFSInputStream(gcsPath, channel, statistics); + } + + private GoogleHadoopFSInputStream( + URI gcsPath, + SeekableByteChannel channel, + FileSystem.Statistics statistics) { + LOG.trace("GoogleHadoopFSInputStream(gcsPath: {})", gcsPath); + this.gcsPath = gcsPath; + this.channel = channel; + this.statistics = statistics; + } + + @Override + public synchronized int read() throws IOException { + checkNotClosed(); + int numRead = read(singleReadBuf, /* offset= */ 0, /* length= */ 1); + checkState( + numRead == -1 || numRead == 1, + "Read %s bytes using single-byte buffer for path %s ending in position %s", + numRead, + gcsPath, + channel.position()); + return numRead > 0 ? singleReadBuf[0] & 0xff : numRead; + } + + @Override + public synchronized int read(@Nonnull byte[] buf, int offset, int length) throws IOException { + checkNotClosed(); + checkNotNull(buf, "buf must not be null"); + if (offset < 0 || length < 0 || length > buf.length - offset) { + throw new IndexOutOfBoundsException(); + } + + // TODO(user): Wrap this in a while-loop if we ever introduce a non-blocking mode for + // the underlying channel. + int numRead = channel.read(ByteBuffer.wrap(buf, offset, length)); + if (numRead > 0) { + // -1 means we actually read 0 bytes, but requested at least one byte.
+ totalBytesRead += numRead; + statistics.incrementBytesRead(numRead); + statistics.incrementReadOps(1); + } + return numRead; + } + + @Override + public synchronized void seek(long pos) throws IOException { + checkNotClosed(); + LOG.trace("seek({})", pos); + try { + channel.position(pos); + } catch (IllegalArgumentException e) { + throw new IOException(e); + } + } + + @Override + public synchronized void close() throws IOException { + if (!closed) { + closed = true; + + LOG.trace("close(): {}", gcsPath); + try { + if (channel != null) { + LOG.trace( + "Closing '{}' file with {} total bytes read", gcsPath, totalBytesRead); + channel.close(); + } + } catch (Exception e) { + LOG.warn("Error while closing underneath read channel resources for path: {}", gcsPath, e); + } + } + } + + /** + * Gets the current position within the file being read. + * + * @return The current position within the file being read. + * @throws IOException if an IO error occurs. + */ + @Override + public synchronized long getPos() throws IOException { + checkNotClosed(); + long pos = channel.position(); + LOG.trace("getPos(): {}", pos); + return pos; + } + + /** + * Seeks a different copy of the data. Not supported. + * + * @return true if a new source is found, false otherwise. + */ + @Override + public boolean seekToNewSource(long targetPos) { + LOG.trace("seekToNewSource({}): false", targetPos); + return false; + } + + @Override + public int available() throws IOException { + if (!channel.isOpen()) { + throw new ClosedChannelException(); + } + return super.available(); + } + + /** + * Verify that the input stream is open. Non-blocking; this gives the last state of the volatile + * {@link #closed} field. + * + * @throws IOException if the connection is closed. 
+ */ + private void checkNotClosed() throws IOException { + if (closed) { + throw new IOException(gcsPath + ": " + FSExceptionMessages.STREAM_IS_CLOSED); + } + } +} diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleHadoopFileSystem.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleHadoopFileSystem.java index 8831568a3560d..8bf2f05772458 100644 --- a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleHadoopFileSystem.java +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleHadoopFileSystem.java @@ -265,9 +265,10 @@ public String getScheme() { } @Override - public FSDataInputStream open(final Path path, final int bufferSize) throws IOException { - LOG.trace("open({})", path); - throw new UnsupportedOperationException(path.toString()); + public FSDataInputStream open(final Path hadoopPath, final int bufferSize) throws IOException { + LOG.trace("open({})", hadoopPath); + URI gcsPath = getGcsPath(hadoopPath); + return new FSDataInputStream(GoogleHadoopFSInputStream.create(this, gcsPath, statistics)); } @Override @@ -327,9 +328,37 @@ public FSDataOutputStream append(final Path path, final int i, final Progressabl } @Override - public boolean rename(final Path path, final Path path1) throws IOException { - LOG.trace("rename({}, {})", path, path1); - throw new UnsupportedOperationException(path.toString()); + public boolean rename(final Path src, final Path dst) throws IOException { + LOG.trace("rename({}, {})", src, dst); + + checkArgument(src != null, "src must not be null"); + checkArgument(dst != null, "dst must not be null"); + + // Even though the underlying GCSFS will also throw an IAE if src is root, since our filesystem + // root happens to equal the global root, we want to explicitly check it here since derived + // classes may not have filesystem roots equal to the global root. 
+ if (this.makeQualified(src).equals(fsRoot)) { + LOG.trace("rename(src: {}, dst: {}): false [src is a root]", src, dst); + return false; + } + + try { + checkOpen(); + + URI srcPath = getGcsPath(src); + URI dstPath = getGcsPath(dst); + getGcsFs().rename(srcPath, dstPath); + + LOG.trace("rename(src: {}, dst: {}): true", src, dst); + } catch (IOException e) { + if (ApiErrorExtractor.INSTANCE.requestFailure(e)) { + throw e; + } + LOG.trace("rename(src: %s, dst: %s): false [failed]", src, dst, e); + return false; + } + + return true; } @Override @@ -468,8 +497,7 @@ public Path getWorkingDirectory() { public boolean mkdirs(final Path hadoopPath, final FsPermission permission) throws IOException { checkArgument(hadoopPath != null, "hadoopPath must not be null"); - LOG.trace( - "mkdirs(hadoopPath: {}, permission: {}): true", hadoopPath, permission); + LOG.trace("mkdirs(hadoopPath: {}, permission: {}): true", hadoopPath, permission); checkOpen(); @@ -582,7 +610,6 @@ public void setWorkingDirectory(final Path hadoopPath) { LOG.trace("setWorkingDirectory(hadoopPath: {}): {}", hadoopPath, workingDirectory); } - private static String getUgiUserName() throws IOException { UserGroupInformation ugi = UserGroupInformation.getCurrentUser(); return ugi.getShortUserName(); diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleHadoopFileSystemConfiguration.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleHadoopFileSystemConfiguration.java index a480a72e60bd2..20831885fe6dc 100644 --- a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleHadoopFileSystemConfiguration.java +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleHadoopFileSystemConfiguration.java @@ -18,6 +18,8 @@ package org.apache.hadoop.fs.gs; +import java.util.regex.Pattern; + import static java.lang.Math.toIntExact; import org.apache.hadoop.conf.Configuration; @@ -26,6 +28,8 @@ * This class provides a configuration for the {@link 
GoogleHadoopFileSystem} implementations. */ class GoogleHadoopFileSystemConfiguration { + private static final Long GCS_INPUT_STREAM_INPLACE_SEEK_LIMIT_DEFAULT = 8 * 1024 * 1024L; + /** * Configuration key for default block size of a file. * @@ -55,23 +59,79 @@ class GoogleHadoopFileSystemConfiguration { static final HadoopConfigurationProperty GCS_OUTPUT_STREAM_BUFFER_SIZE = new HadoopConfigurationProperty<>("fs.gs.outputstream.buffer.size", 8L * 1024 * 1024); + + /** + * If forward seeks are within this many bytes of the current position, seeks are performed by + * reading and discarding bytes in-place rather than opening a new underlying stream. + */ + public static final HadoopConfigurationProperty GCS_INPUT_STREAM_INPLACE_SEEK_LIMIT = + new HadoopConfigurationProperty<>( + "fs.gs.inputstream.inplace.seek.limit", + GCS_INPUT_STREAM_INPLACE_SEEK_LIMIT_DEFAULT); + + /** Tunes reading objects behavior to optimize HTTP GET requests for various use cases. */ + public static final HadoopConfigurationProperty GCS_INPUT_STREAM_FADVISE = + new HadoopConfigurationProperty<>("fs.gs.inputstream.fadvise", Fadvise.RANDOM); + + /** + * If false, reading a file with GZIP content encoding (HTTP header "Content-Encoding: gzip") will + * result in failure (IOException is thrown). + */ + public static final HadoopConfigurationProperty + GCS_INPUT_STREAM_SUPPORT_GZIP_ENCODING_ENABLE = + new HadoopConfigurationProperty<>( + "fs.gs.inputstream.support.gzip.encoding.enable", + false); + + /** + * Minimum size in bytes of the HTTP Range header set in GCS request when opening new stream to + * read an object. + */ + public static final HadoopConfigurationProperty GCS_INPUT_STREAM_MIN_RANGE_REQUEST_SIZE = + new HadoopConfigurationProperty<>( + "fs.gs.inputstream.min.range.request.size", + 2 * 1024 * 1024L); + + /** + * Configuration key for number of requests to track for adapting the access pattern i.e. fadvise: + * AUTO and AUTO_RANDOM. 
+ */ + public static final HadoopConfigurationProperty GCS_FADVISE_REQUEST_TRACK_COUNT = + new HadoopConfigurationProperty<>("fs.gs.fadvise.request.track.count", 3); + + /** + * Configuration key for specifying max number of bytes rewritten in a single rewrite request when + * fs.gs.copy.with.rewrite.enable is set to 'true'. + */ + public static final HadoopConfigurationProperty GCS_REWRITE_MAX_CHUNK_SIZE = + new HadoopConfigurationProperty<>( + "fs.gs.rewrite.max.chunk.size", + 512 * 1024 * 1024L); + + /** Configuration key for marker file pattern. Default value: none */ + public static final HadoopConfigurationProperty GCS_MARKER_FILE_PATTERN = + new HadoopConfigurationProperty<>("fs.gs.marker.file.pattern"); + private final String workingDirectory; private final String projectId; + private final Configuration config; + private Pattern fileMarkerFilePattern; - public int getOutStreamBufferSize() { + int getOutStreamBufferSize() { return outStreamBufferSize; } private final int outStreamBufferSize; - GoogleHadoopFileSystemConfiguration(Configuration config) { - this.workingDirectory = GCS_WORKING_DIRECTORY.get(config, config::get); + GoogleHadoopFileSystemConfiguration(Configuration conf) { + this.workingDirectory = GCS_WORKING_DIRECTORY.get(conf, conf::get); this.outStreamBufferSize = - toIntExact(GCS_OUTPUT_STREAM_BUFFER_SIZE.get(config, config::getLongBytes)); - this.projectId = GCS_PROJECT_ID.get(config, config::get); + toIntExact(GCS_OUTPUT_STREAM_BUFFER_SIZE.get(conf, conf::getLongBytes)); + this.projectId = GCS_PROJECT_ID.get(conf, conf::get); + this.config = conf; } - public String getWorkingDirectory() { + String getWorkingDirectory() { return this.workingDirectory; } @@ -79,7 +139,53 @@ String getProjectId() { return this.projectId; } - public long getMaxListItemsPerCall() { + long getMaxListItemsPerCall() { return 5000L; //TODO: Make this configurable } + + Fadvise getFadvise() { + return GCS_INPUT_STREAM_FADVISE.get(config, config::getEnum); + } + + 
long getInplaceSeekLimit() { + return GCS_INPUT_STREAM_INPLACE_SEEK_LIMIT.get(config, config::getLongBytes); + } + + public int getFadviseRequestTrackCount() { + return GCS_FADVISE_REQUEST_TRACK_COUNT.get(config, config::getInt); + } + + public boolean isGzipEncodingSupportEnabled() { + return GCS_INPUT_STREAM_SUPPORT_GZIP_ENCODING_ENABLE.get(config, config::getBoolean); + } + + public long getMinRangeRequestSize() { + return GCS_INPUT_STREAM_MIN_RANGE_REQUEST_SIZE.get(config, config::getLongBytes); + } + + public long getBlockSize() { + return BLOCK_SIZE.get(config, config::getLong); + } + + public boolean isReadExactRequestedBytesEnabled() { + return false; //TODO: Remove this option? + } + + public long getMaxRewriteChunkSize() { + return GCS_REWRITE_MAX_CHUNK_SIZE.get(config, config::getLong); + } + + public Pattern getMarkerFilePattern() { + String pattern = GCS_MARKER_FILE_PATTERN.get(config, config::get); + if (pattern == null) { + return null; + } + + if (fileMarkerFilePattern == null) { + // Caching the pattern since compile step can be expensive + fileMarkerFilePattern = Pattern.compile("^(.+/)?" + pattern + "$"); + } + + return fileMarkerFilePattern; + } } diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/Gs.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/Gs.java new file mode 100644 index 0000000000000..e0b5ec4acaf6d --- /dev/null +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/Gs.java @@ -0,0 +1,64 @@ +package org.apache.hadoop.fs.gs; + +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.net.URI; +import java.net.URISyntaxException; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.DelegateToFileSystem; + +/** + * GCS implementation of AbstractFileSystem. + * This impl delegates to the GoogleHadoopFileSystem + */ +@InterfaceAudience.Public +@InterfaceStability.Evolving +public class Gs extends DelegateToFileSystem { + public Gs(URI theUri, Configuration conf) throws IOException, URISyntaxException { + super(theUri, new GoogleHadoopFileSystem(), conf, + theUri.getScheme().isEmpty() ? Constants.SCHEME : theUri.getScheme(), false); + } + + @Override + public int getUriDefaultPort() { + return super.getUriDefaultPort(); + } + + @Override + public String toString() { + final StringBuilder sb = new StringBuilder("gs{"); + sb.append("URI =").append(fsImpl.getUri()); + sb.append("; fsImpl=").append(fsImpl); + sb.append('}'); + return sb.toString(); + } + + /** + * Close the file system; the FileContext API doesn't have an explicit close. 
+ */ + @Override + protected void finalize() throws Throwable { + fsImpl.close(); + super.finalize(); + } +} \ No newline at end of file diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/ListFileOptions.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/ListFileOptions.java index 2bc74c6fc2190..6ef7b7641f548 100644 --- a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/ListFileOptions.java +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/ListFileOptions.java @@ -22,6 +22,10 @@ final class ListFileOptions { static final ListFileOptions OBJECTFIELDS = new ListFileOptions("bucket,name,size,updated"); + + static final ListFileOptions DELETE_RENAME_LIST_OPTIONS = + new ListFileOptions("bucket,name,generation"); + private final String fields; private ListFileOptions(@Nonnull String fields) { diff --git a/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractOpen.java b/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractOpen.java new file mode 100644 index 0000000000000..c3d7c6ddac7d4 --- /dev/null +++ b/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractOpen.java @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.gs.contract; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.contract.AbstractContractOpenTest; +import org.apache.hadoop.fs.contract.AbstractFSContract; + +/** GCS contract tests covering file open. */ +public class ITestGoogleContractOpen extends AbstractContractOpenTest { + @Override + protected AbstractFSContract createContract(Configuration conf) { + return new GoogleContract(conf); + } +} diff --git a/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractRename.java b/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractRename.java new file mode 100644 index 0000000000000..5d9459cac19e9 --- /dev/null +++ b/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractRename.java @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.fs.gs.contract; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.contract.AbstractContractRenameTest; +import org.apache.hadoop.fs.contract.AbstractFSContract; +import org.apache.hadoop.fs.contract.ContractTestUtils; + +/** GCS contract tests covering file rename. */ +public class ITestGoogleContractRename extends AbstractContractRenameTest { + @Override + protected AbstractFSContract createContract(Configuration conf) { + return new GoogleContract(conf); + } + + @Override + public void testRenameWithNonEmptySubDir() { + // TODO: Enable this + ContractTestUtils.skip("Skipping the test. This will be enabled in a subsequent change"); + } + + @Override + public void testRenameNonexistentFile() { + // TODO: Enable this + ContractTestUtils.skip("Skipping the test. This will be enabled in a subsequent change"); + } +} diff --git a/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractSeek.java b/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractSeek.java new file mode 100644 index 0000000000000..651487bd64149 --- /dev/null +++ b/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractSeek.java @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.gs.contract; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.contract.AbstractContractSeekTest; +import org.apache.hadoop.fs.contract.AbstractFSContract; + +public class ITestGoogleContractSeek extends AbstractContractSeekTest { + @Override + protected AbstractFSContract createContract(Configuration conf) { + return new GoogleContract(conf); + } +} From 48b319879efb6ab2c4db6ebd868b160e62e171f1 Mon Sep 17 00:00:00 2001 From: Arunkumar Chacko Date: Wed, 25 Jun 2025 18:22:57 +0000 Subject: [PATCH 5/8] HADOOP-19343. Add support for hflush() Closes #7761 Co-authored-by: Chris Nauroth Signed-off-by: Chris Nauroth --- ...ateOptions.java => CreateFileOptions.java} | 28 +- .../hadoop/fs/gs/ErrorTypeExtractor.java | 27 +- .../hadoop/fs/gs/GoogleCloudStorage.java | 82 +++++- .../GoogleCloudStorageClientWriteChannel.java | 14 +- .../fs/gs/GoogleCloudStorageFileSystem.java | 115 +++++++- .../hadoop/fs/gs/GoogleHadoopFileSystem.java | 14 +- .../GoogleHadoopFileSystemConfiguration.java | 44 ++- .../fs/gs/GoogleHadoopOutputStream.java | 273 +++++++++++++++++- .../fs/gs/HadoopConfigurationProperty.java | 10 + .../contract/ITestGoogleContractCreate.java | 36 +++ .../gs/contract/ITestGoogleContractMkdir.java | 24 -- .../contract/ITestGoogleContractRename.java | 6 - .../src/test/resources/contract/gs.xml | 100 ++++++- 13 files changed, 679 insertions(+), 94 deletions(-) rename hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/{CreateOptions.java => CreateFileOptions.java} (81%) create mode 100644 hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractCreate.java diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/CreateOptions.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/CreateFileOptions.java similarity index 81% rename 
from hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/CreateOptions.java rename to hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/CreateFileOptions.java index c9b44a1a481b1..1c0e48ae14418 100644 --- a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/CreateOptions.java +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/CreateFileOptions.java @@ -28,24 +28,38 @@ /** * Options that can be specified when creating a file in the {@link GoogleCloudStorageFileSystem}. */ -final class CreateOptions { +final class CreateFileOptions { private final ImmutableMap attributes; private final String contentType; private final long overwriteGenerationId; private final WriteMode mode; + private final boolean ensureNoDirectoryConflict; - private CreateOptions(CreateOperationOptionsBuilder builder) { + private CreateFileOptions(CreateOperationOptionsBuilder builder) { this.attributes = ImmutableMap.copyOf(builder.attributes); this.contentType = builder.contentType; this.overwriteGenerationId = builder.overwriteGenerationId; this.mode = builder.writeMode; + this.ensureNoDirectoryConflict = builder.ensureNoDirectoryConflict; } boolean isOverwriteExisting() { return this.mode == WriteMode.OVERWRITE; } + boolean isEnsureNoDirectoryConflict() { + return ensureNoDirectoryConflict; + } + + CreateOperationOptionsBuilder toBuilder() { + return builder().setWriteMode(this.mode) + .setEnsureNoDirectoryConflict(ensureNoDirectoryConflict); + } + enum WriteMode { + /** Write new bytes to the end of the existing file rather than the beginning. */ + APPEND, + /** * Creates a new file for write and fails if file already exists. 
*/ @@ -98,14 +112,20 @@ static class CreateOperationOptionsBuilder { private String contentType = "application/octet-stream"; private long overwriteGenerationId = StorageResourceId.UNKNOWN_GENERATION_ID; private WriteMode writeMode = WriteMode.CREATE_NEW; + private boolean ensureNoDirectoryConflict = true; CreateOperationOptionsBuilder setWriteMode(WriteMode mode) { this.writeMode = mode; return this; } - CreateOptions build() { - CreateOptions options = new CreateOptions(this); + CreateOperationOptionsBuilder setEnsureNoDirectoryConflict(boolean ensure) { + this.ensureNoDirectoryConflict = ensure; + return this; + } + + CreateFileOptions build() { + CreateFileOptions options = new CreateFileOptions(this); checkArgument(!options.getAttributes().containsKey("Content-Type"), "The Content-Type attribute must be set via the contentType option"); diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/ErrorTypeExtractor.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/ErrorTypeExtractor.java index 547d855d1d649..a94156e68e699 100644 --- a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/ErrorTypeExtractor.java +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/ErrorTypeExtractor.java @@ -20,6 +20,8 @@ import javax.annotation.Nullable; +import com.google.api.client.http.HttpStatusCodes; +import com.google.cloud.storage.StorageException; import io.grpc.Status; import io.grpc.StatusRuntimeException; @@ -63,8 +65,6 @@ enum ErrorType { UNAVAILABLE, UNKNOWN } - // public static final ErrorTypeExtractor INSTANCE = new ErrorTypeExtractor(); - private static final String BUCKET_ALREADY_EXISTS_MESSAGE = "FAILED_PRECONDITION: Your previous request to create the named bucket succeeded and you " + "already own it."; @@ -89,7 +89,28 @@ static ErrorType getErrorType(Exception error) { case UNAVAILABLE: return ErrorType.UNAVAILABLE; default: - return ErrorType.UNKNOWN; + return getErrorTypeFromStorageException(error); 
} } + + private static ErrorType getErrorTypeFromStorageException(Exception error) { + if (error instanceof StorageException) { + StorageException se = (StorageException) error; + int httpCode = se.getCode(); + + if (httpCode == HttpStatusCodes.STATUS_CODE_PRECONDITION_FAILED) { + return ErrorType.FAILED_PRECONDITION; + } + + if (httpCode == HttpStatusCodes.STATUS_CODE_NOT_FOUND) { + return ErrorType.NOT_FOUND; + } + + if (httpCode == HttpStatusCodes.STATUS_CODE_SERVICE_UNAVAILABLE) { + return ErrorType.UNAVAILABLE; + } + } + + return ErrorType.UNKNOWN; + } } diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorage.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorage.java index 89a86eef8ff07..dcf5ff231f7bf 100644 --- a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorage.java +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorage.java @@ -48,6 +48,7 @@ import java.util.List; import java.util.Map; import java.util.Set; +import java.util.stream.Collectors; /** * A wrapper around Google cloud storage @@ -89,7 +90,7 @@ private static Storage createStorage(String projectId) { return StorageOptions.newBuilder().build().getService(); } - WritableByteChannel create(final StorageResourceId resourceId, final CreateOptions options) + WritableByteChannel create(final StorageResourceId resourceId, final CreateFileOptions options) throws IOException { LOG.trace("create({})", resourceId); @@ -402,6 +403,47 @@ void createEmptyObject(StorageResourceId resourceId, CreateObjectOptions options } } + + public GoogleCloudStorageItemInfo composeObjects( + List sources, StorageResourceId destination, CreateObjectOptions options) + throws IOException { + LOG.trace("composeObjects({}, {}, {})", sources, destination, options); + for (StorageResourceId inputId : sources) { + if (!destination.getBucketName().equals(inputId.getBucketName())) { + throw new 
IOException( + String.format( + "Bucket doesn't match for source '%s' and destination '%s'!", + inputId, destination)); + } + } + Storage.ComposeRequest request = + Storage.ComposeRequest.newBuilder() + .addSource( + sources.stream().map(StorageResourceId::getObjectName).collect(Collectors.toList())) + .setTarget( + BlobInfo.newBuilder(destination.getBucketName(), destination.getObjectName()) + .setContentType(options.getContentType()) + .setContentEncoding(options.getContentEncoding()) + .setMetadata(encodeMetadata(options.getMetadata())) + .build()) + .setTargetOptions( + Storage.BlobTargetOption.generationMatch( + destination.hasGenerationId() + ? destination.getGenerationId() + : getWriteGeneration(destination, true))) + .build(); + + Blob composedBlob; + try { + composedBlob = storage.compose(request); + } catch (StorageException e) { + throw new IOException(e); + } + GoogleCloudStorageItemInfo compositeInfo = createItemInfoForBlob(destination, composedBlob); + LOG.trace("composeObjects() done, returning: {}", compositeInfo); + return compositeInfo; + } + /** * Helper to check whether an empty object already exists with the expected metadata specified in * {@code options}, to be used to determine whether it's safe to ignore an exception that was @@ -450,6 +492,7 @@ private boolean canIgnoreExceptionForEmptyObject( return true; } } + return false; } @@ -472,18 +515,14 @@ private void createEmptyObjectInternal( blobTargetOptions.add(Storage.BlobTargetOption.doesNotExist()); } - try { - // TODO: Set encryption key and related properties - storage.create( - BlobInfo.newBuilder(BlobId.of(resourceId.getBucketName(), resourceId.getObjectName())) - .setMetadata(rewrittenMetadata) - .setContentEncoding(createObjectOptions.getContentEncoding()) - .setContentType(createObjectOptions.getContentType()) - .build(), - blobTargetOptions.toArray(new Storage.BlobTargetOption[0])); - } catch (StorageException e) { - throw new IOException(String.format("Creating empty object %s 
failed.", resourceId), e); - } + // TODO: Set encryption key and related properties + storage.create( + BlobInfo.newBuilder(BlobId.of(resourceId.getBucketName(), resourceId.getObjectName())) + .setMetadata(rewrittenMetadata) + .setContentEncoding(createObjectOptions.getContentEncoding()) + .setContentType(createObjectOptions.getContentType()) + .build(), + blobTargetOptions.toArray(new Storage.BlobTargetOption[0])); } private static Map encodeMetadata(Map metadata) { @@ -871,6 +910,23 @@ private static GoogleCloudStorageItemInfo getGoogleCloudStorageItemInfo( return storageItemInfo; } + List getItemInfos(List resourceIds) + throws IOException { + LOG.trace("getItemInfos({})", resourceIds); + + if (resourceIds.isEmpty()) { + return new ArrayList<>(); + } + + List result = new ArrayList<>(resourceIds.size()); + for (StorageResourceId resourceId : resourceIds) { + // TODO: Do this concurrently + result.add(getItemInfo(resourceId)); + } + + return result; + } + // Helper class to capture the results of list operation. 
private class ListOperationResult { private final Map prefixes = new HashMap<>(); diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorageClientWriteChannel.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorageClientWriteChannel.java index 7956b6f0a8276..438d8c040c940 100644 --- a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorageClientWriteChannel.java +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorageClientWriteChannel.java @@ -44,8 +44,12 @@ class GoogleCloudStorageClientWriteChannel implements WritableByteChannel { private final StorageResourceId resourceId; private WritableByteChannel writableByteChannel; - GoogleCloudStorageClientWriteChannel(final Storage storage, - final StorageResourceId resourceId, final CreateOptions createOptions) throws IOException { + private GoogleCloudStorageItemInfo completedItemInfo = null; + + GoogleCloudStorageClientWriteChannel( + final Storage storage, + final StorageResourceId resourceId, + final CreateFileOptions createOptions) throws IOException { this.resourceId = resourceId; BlobWriteSession blobWriteSession = getBlobWriteSession(storage, resourceId, createOptions); try { @@ -56,7 +60,7 @@ class GoogleCloudStorageClientWriteChannel implements WritableByteChannel { } private static BlobInfo getBlobInfo(final StorageResourceId resourceId, - final CreateOptions createOptions) { + final CreateFileOptions createOptions) { BlobInfo blobInfo = BlobInfo.newBuilder( BlobId.of(resourceId.getBucketName(), resourceId.getObjectName(), resourceId.getGenerationId())).setContentType(createOptions.getContentType()) @@ -66,12 +70,12 @@ private static BlobInfo getBlobInfo(final StorageResourceId resourceId, } private static BlobWriteSession getBlobWriteSession(final Storage storage, - final StorageResourceId resourceId, final CreateOptions createOptions) { + final StorageResourceId resourceId, final 
CreateFileOptions createOptions) { return storage.blobWriteSession(getBlobInfo(resourceId, createOptions), generateWriteOptions(createOptions)); } - private static BlobWriteOption[] generateWriteOptions(final CreateOptions createOptions) { + private static BlobWriteOption[] generateWriteOptions(final CreateFileOptions createOptions) { List blobWriteOptions = new ArrayList<>(); blobWriteOptions.add(BlobWriteOption.disableGzipContent()); diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorageFileSystem.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorageFileSystem.java index 2b0c238eb02a0..951c38f596667 100644 --- a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorageFileSystem.java +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorageFileSystem.java @@ -19,6 +19,8 @@ package org.apache.hadoop.fs.gs; import static org.apache.hadoop.thirdparty.com.google.common.base.Preconditions.*; +import static org.apache.hadoop.thirdparty.com.google.common.base.Strings.isNullOrEmpty; +import static org.apache.hadoop.thirdparty.com.google.common.collect.ImmutableList.toImmutableList; import static java.util.Comparator.comparing; import static org.apache.hadoop.fs.gs.Constants.PATH_DELIMITER; import static org.apache.hadoop.fs.gs.Constants.SCHEME; @@ -100,7 +102,7 @@ private static GoogleCloudStorage createCloudStorage( gcs = createCloudStorage(configuration, credentials); } - WritableByteChannel create(final URI path, final CreateOptions createOptions) + WritableByteChannel create(final URI path, final CreateFileOptions createOptions) throws IOException { LOG.trace("create(path: {}, createOptions: {})", path, createOptions); checkNotNull(path, "path could not be null"); @@ -113,6 +115,32 @@ WritableByteChannel create(final URI path, final CreateOptions createOptions) resourceId)); } + // Because create call should create parent directories too, 
before creating an actual file + // we need to check if there are no conflicting items in the directory tree: + // - if there are no conflicting files with the same name as any parent subdirectory + // - if there are no conflicting directory with the name as a file + // + // For example, for a new `gs://bucket/c/d/f` file: + // - files `gs://bucket/c` and `gs://bucket/c/d` should not exist + // - directory `gs://bucket/c/d/f/` should not exist + if (configuration.isEnsureNoConflictingItems()) { + // Check if a directory with the same name exists. + StorageResourceId dirId = resourceId.toDirectoryId(); + Boolean conflictingDirExist = false; + if (createOptions.isEnsureNoDirectoryConflict()) { + // TODO: Do this concurrently + conflictingDirExist = + getFileInfoInternal(dirId, /* inferImplicitDirectories */ true).exists(); + } + + checkNoFilesConflictingWithDirs(resourceId); + + // Check if a directory with the same name exists. + if (conflictingDirExist) { + throw new FileAlreadyExistsException("A directory with that name exists: " + path); + } + } + if (createOptions.getOverwriteGenerationId() != StorageResourceId.UNKNOWN_GENERATION_ID) { resourceId = new StorageResourceId(resourceId.getBucketName(), resourceId.getObjectName(), createOptions.getOverwriteGenerationId()); @@ -214,8 +242,11 @@ public void mkdirs(URI path) throws IOException { resourceId = resourceId.toDirectoryId(); - // TODO: Before creating a leaf directory we need to check if there are no conflicting files - // TODO: with the same name as any subdirectory + // Before creating a leaf directory we need to check if there are no conflicting files + // with the same name as any subdirectory + if (configuration.isEnsureNoConflictingItems()) { + checkNoFilesConflictingWithDirs(resourceId); + } // Create only a leaf directory because subdirectories will be inferred // if leaf directory exists @@ -689,4 +720,82 @@ static String getItemName(URI path) { : objectName.lastIndexOf(PATH_DELIMITER); return index < 
0 ? objectName : objectName.substring(index + 1); } + + static CreateObjectOptions objectOptionsFromFileOptions(CreateFileOptions options) { + checkArgument( + options.getWriteMode() == CreateFileOptions.WriteMode.CREATE_NEW + || options.getWriteMode() == CreateFileOptions.WriteMode.OVERWRITE, + "unsupported write mode: %s", + options.getWriteMode()); + return CreateObjectOptions.builder() + .setContentType(options.getContentType()) + .setMetadata(options.getAttributes()) + .setOverwriteExisting(options.getWriteMode() == CreateFileOptions.WriteMode.OVERWRITE) + .build(); + } + + GoogleHadoopFileSystemConfiguration getConfiguration() { + return configuration; + } + + GoogleCloudStorageItemInfo composeObjects(ImmutableList sources, + StorageResourceId dstId, CreateObjectOptions composeObjectOptions) throws IOException { + return gcs.composeObjects(sources, dstId, composeObjectOptions); + } + + void delete(List items) throws IOException { + gcs.deleteObjects(items); + } + + private void checkNoFilesConflictingWithDirs(StorageResourceId resourceId) throws IOException { + // Create a list of all files that can conflict with intermediate/subdirectory paths. + // For example: gs://foo/bar/zoo/ => (gs://foo/bar, gs://foo/bar/zoo) + List fileIds = + getDirs(resourceId.getObjectName()).stream() + .filter(subdir -> !isNullOrEmpty(subdir)) + .map( + subdir -> + new StorageResourceId( + resourceId.getBucketName(), StringPaths.toFilePath(subdir))) + .collect(toImmutableList()); + + // Each intermediate path must ensure that corresponding file does not exist + // + // If for any of the intermediate paths file already exists then bail out early. + // It is possible that the status of intermediate paths can change after + // we make this check therefore this is a good faith effort and not a guarantee. 
+ for (GoogleCloudStorageItemInfo fileInfo : gcs.getItemInfos(fileIds)) { + if (fileInfo.exists()) { + throw new FileAlreadyExistsException( + "Cannot create directories because of existing file: " + fileInfo.getResourceId()); + } + } + } + + /** + * For objects whose name looks like a path (foo/bar/zoo), returns all directory paths. + * + *

For example: + * + *

+ * + * @param objectName Name of an object. + * @return List of subdirectory like paths. + */ + static List getDirs(String objectName) { + if (isNullOrEmpty(objectName)) { + return ImmutableList.of(); + } + List dirs = new ArrayList<>(); + int index = 0; + while ((index = objectName.indexOf(PATH_DELIMITER, index)) >= 0) { + index = index + PATH_DELIMITER.length(); + dirs.add(objectName.substring(0, index)); + } + return dirs; + } } diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleHadoopFileSystem.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleHadoopFileSystem.java index 8bf2f05772458..e4db8146186a0 100644 --- a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleHadoopFileSystem.java +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleHadoopFileSystem.java @@ -283,11 +283,11 @@ public FSDataOutputStream create(Path hadoopPath, FsPermission permission, boole LOG.trace("create(hadoopPath: {}, overwrite: {}, bufferSize: {} [ignored])", hadoopPath, overwrite, bufferSize); - CreateOptions.WriteMode writeMode = - overwrite ? CreateOptions.WriteMode.OVERWRITE : CreateOptions.WriteMode.CREATE_NEW; + CreateFileOptions.WriteMode writeMode = + overwrite ? 
CreateFileOptions.WriteMode.OVERWRITE : CreateFileOptions.WriteMode.CREATE_NEW; FSDataOutputStream response = new FSDataOutputStream( new GoogleHadoopOutputStream(this, getGcsPath(hadoopPath), - CreateOptions.builder().setWriteMode(writeMode).build(), statistics), statistics); + CreateFileOptions.builder().setWriteMode(writeMode).build(), statistics), statistics); return response; } @@ -596,12 +596,6 @@ public long getUsed() throws IOException { return result; } -// @Override -// public long getDefaultBlockSize() { -// LOG.trace("getDefaultBlockSize(): {}", defaultBlockSize); -// return defaultBlockSize; -// } - @Override public void setWorkingDirectory(final Path hadoopPath) { checkArgument(hadoopPath != null, "hadoopPath must not be null"); @@ -633,4 +627,4 @@ private FileStatus getFileStatus(FileInfo fileInfo, String userName) { LOG.trace("FileStatus(path: {}, userName: {}): {}", fileInfo.getPath(), userName, status); return status; } -} +} \ No newline at end of file diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleHadoopFileSystemConfiguration.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleHadoopFileSystemConfiguration.java index 20831885fe6dc..4097b5e1f839f 100644 --- a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleHadoopFileSystemConfiguration.java +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleHadoopFileSystemConfiguration.java @@ -18,6 +18,7 @@ package org.apache.hadoop.fs.gs; +import java.time.Duration; import java.util.regex.Pattern; import static java.lang.Math.toIntExact; @@ -44,7 +45,7 @@ class GoogleHadoopFileSystemConfiguration { /** * Configuration key for GCS project ID. 
Default value: none */ - static final HadoopConfigurationProperty GCS_PROJECT_ID = + private static final HadoopConfigurationProperty GCS_PROJECT_ID = new HadoopConfigurationProperty<>("fs.gs.project.id"); /** @@ -56,7 +57,7 @@ class GoogleHadoopFileSystemConfiguration { /** * Configuration key for setting write buffer size. */ - static final HadoopConfigurationProperty GCS_OUTPUT_STREAM_BUFFER_SIZE = + private static final HadoopConfigurationProperty GCS_OUTPUT_STREAM_BUFFER_SIZE = new HadoopConfigurationProperty<>("fs.gs.outputstream.buffer.size", 8L * 1024 * 1024); @@ -64,20 +65,20 @@ class GoogleHadoopFileSystemConfiguration { * If forward seeks are within this many bytes of the current position, seeks are performed by * reading and discarding bytes in-place rather than opening a new underlying stream. */ - public static final HadoopConfigurationProperty GCS_INPUT_STREAM_INPLACE_SEEK_LIMIT = + private static final HadoopConfigurationProperty GCS_INPUT_STREAM_INPLACE_SEEK_LIMIT = new HadoopConfigurationProperty<>( "fs.gs.inputstream.inplace.seek.limit", GCS_INPUT_STREAM_INPLACE_SEEK_LIMIT_DEFAULT); /** Tunes reading objects behavior to optimize HTTP GET requests for various use cases. */ - public static final HadoopConfigurationProperty GCS_INPUT_STREAM_FADVISE = + private static final HadoopConfigurationProperty GCS_INPUT_STREAM_FADVISE = new HadoopConfigurationProperty<>("fs.gs.inputstream.fadvise", Fadvise.RANDOM); /** * If false, reading a file with GZIP content encoding (HTTP header "Content-Encoding: gzip") will * result in failure (IOException is thrown). 
*/ - public static final HadoopConfigurationProperty + private static final HadoopConfigurationProperty GCS_INPUT_STREAM_SUPPORT_GZIP_ENCODING_ENABLE = new HadoopConfigurationProperty<>( "fs.gs.inputstream.support.gzip.encoding.enable", @@ -87,7 +88,7 @@ class GoogleHadoopFileSystemConfiguration { * Minimum size in bytes of the HTTP Range header set in GCS request when opening new stream to * read an object. */ - public static final HadoopConfigurationProperty GCS_INPUT_STREAM_MIN_RANGE_REQUEST_SIZE = + private static final HadoopConfigurationProperty GCS_INPUT_STREAM_MIN_RANGE_REQUEST_SIZE = new HadoopConfigurationProperty<>( "fs.gs.inputstream.min.range.request.size", 2 * 1024 * 1024L); @@ -96,22 +97,39 @@ class GoogleHadoopFileSystemConfiguration { * Configuration key for number of request to track for adapting the access pattern i.e. fadvise: * AUTO & AUTO_RANDOM. */ - public static final HadoopConfigurationProperty GCS_FADVISE_REQUEST_TRACK_COUNT = + private static final HadoopConfigurationProperty GCS_FADVISE_REQUEST_TRACK_COUNT = new HadoopConfigurationProperty<>("fs.gs.fadvise.request.track.count", 3); /** * Configuration key for specifying max number of bytes rewritten in a single rewrite request when * fs.gs.copy.with.rewrite.enable is set to 'true'. */ - public static final HadoopConfigurationProperty GCS_REWRITE_MAX_CHUNK_SIZE = + private static final HadoopConfigurationProperty GCS_REWRITE_MAX_CHUNK_SIZE = new HadoopConfigurationProperty<>( "fs.gs.rewrite.max.chunk.size", 512 * 1024 * 1024L); /** Configuration key for marker file pattern. 
Default value: none */ - public static final HadoopConfigurationProperty GCS_MARKER_FILE_PATTERN = + private static final HadoopConfigurationProperty GCS_MARKER_FILE_PATTERN = new HadoopConfigurationProperty<>("fs.gs.marker.file.pattern"); + /** + * Configuration key for enabling check to ensure that conflicting directories do not exist when + * creating files and conflicting files do not exist when creating directories. + */ + private static final HadoopConfigurationProperty GCS_CREATE_ITEMS_CONFLICT_CHECK_ENABLE = + new HadoopConfigurationProperty<>( + "fs.gs.create.items.conflict.check.enable", + true); + + /** + * Configuration key for the minimal time interval between consecutive sync/hsync/hflush calls. + */ + private static final HadoopConfigurationProperty GCS_OUTPUT_STREAM_SYNC_MIN_INTERVAL = + new HadoopConfigurationProperty<>( + "fs.gs.outputstream.sync.min.interval", + 0L); + private final String workingDirectory; private final String projectId; private final Configuration config; @@ -188,4 +206,12 @@ public Pattern getMarkerFilePattern() { return fileMarkerFilePattern; } + + public boolean isEnsureNoConflictingItems() { + return GCS_CREATE_ITEMS_CONFLICT_CHECK_ENABLE.get(config, config::getBoolean); + } + + public Duration getMinSyncInterval() { + return GCS_OUTPUT_STREAM_SYNC_MIN_INTERVAL.getTimeDuration(config); + } } diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleHadoopOutputStream.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleHadoopOutputStream.java index 747d9f001c517..c41ce13edaeca 100644 --- a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleHadoopOutputStream.java +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleHadoopOutputStream.java @@ -25,23 +25,88 @@ import java.nio.channels.Channels; import java.nio.channels.ClosedChannelException; import java.nio.channels.WritableByteChannel; +import java.time.Duration; +import java.util.ArrayList; 
+import java.util.List; +import java.util.UUID; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; import javax.annotation.Nonnull; import org.apache.hadoop.fs.FileAlreadyExistsException; import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.StreamCapabilities; +import org.apache.hadoop.fs.Syncable; +import org.apache.hadoop.thirdparty.com.google.common.base.Ascii; +import org.apache.hadoop.thirdparty.com.google.common.collect.ImmutableList; +import org.apache.hadoop.thirdparty.com.google.common.util.concurrent.RateLimiter; +import org.apache.hadoop.thirdparty.com.google.common.util.concurrent.ThreadFactoryBuilder; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -class GoogleHadoopOutputStream extends OutputStream { - public static final Logger LOG = LoggerFactory.getLogger(StorageResourceId.class); +import static org.apache.hadoop.thirdparty.com.google.common.base.Preconditions.checkArgument; +import static org.apache.hadoop.thirdparty.com.google.common.base.Preconditions.checkState; +import static org.apache.hadoop.thirdparty.com.google.common.base.Strings.isNullOrEmpty; + +class GoogleHadoopOutputStream extends OutputStream + implements StreamCapabilities, Syncable { + private static final Logger LOG = LoggerFactory.getLogger(StorageResourceId.class); + + // Prefix used for all temporary files created by this stream. + private static final String TMP_FILE_PREFIX = "_GHFS_SYNC_TMP_FILE_"; + + // Temporary files don't need to contain the desired attributes of the final destination file + // since metadata settings get clobbered on final compose() anyways; additionally, due to + // the way we pick temp file names and already ensured directories for the destination file, + // we can optimize tempfile creation by skipping various directory checks. 
+ private static final CreateFileOptions TMP_FILE_CREATE_OPTIONS = + CreateFileOptions.builder().setEnsureNoDirectoryConflict(false).build(); + + // Deletion of temporary files occurs asynchronously for performance reasons, but in-flight + // deletions are awaited on close() so as long as all output streams are closed, there should + // be no remaining in-flight work occurring inside this threadpool. + private static final ExecutorService TMP_FILE_CLEANUP_THREADPOOL = + Executors.newCachedThreadPool( + new ThreadFactoryBuilder() + .setNameFormat("ghfs-output-stream-sync-cleanup-%d") + .setDaemon(true) + .build()); private final GoogleHadoopFileSystem ghfs; + private final CreateObjectOptions composeObjectOptions; + // Path of the file to write to. private final URI dstGcsPath; - private OutputStream outputStream; + /** + * The last known generationId of the {@link #dstGcsPath} file, or possibly {@link + * StorageResourceId#UNKNOWN_GENERATION_ID} if unknown. + */ + private long dstGenerationId; + + // GCS path pointing at the "tail" file which will be appended to the destination + // on hflush()/hsync() call. + private URI tmpGcsPath; + + /** + * Stores the component index corresponding to {@link #tmpGcsPath}. If close() is called, the + * total number of components in the {@link #dstGcsPath} will be {@code tmpIndex + 1}. + */ + private int tmpIndex; + + // OutputStream pointing at the "tail" file which will be appended to the destination + // on hflush()/hsync() call. + private OutputStream tmpOut; + + private final RateLimiter syncRateLimiter; + + // List of temporary file-deletion futures accrued during the lifetime of this output stream. + private final List> tmpDeletionFutures = new ArrayList<>(); // Statistics tracker provided by the parent GoogleHadoopFileSystem for recording // numbers of bytes written. @@ -57,19 +122,49 @@ class GoogleHadoopOutputStream extends OutputStream { * @throws IOException if an IO error occurs. 
*/ GoogleHadoopOutputStream(GoogleHadoopFileSystem ghfs, URI dstGcsPath, - CreateOptions createFileOptions, FileSystem.Statistics statistics) throws IOException { + CreateFileOptions createFileOptions, FileSystem.Statistics statistics) throws IOException { LOG.trace("GoogleHadoopOutputStream(gcsPath: {}, createFileOptions: {})", dstGcsPath, createFileOptions); this.ghfs = ghfs; this.dstGcsPath = dstGcsPath; this.statistics = statistics; - this.outputStream = createOutputStream(ghfs.getGcsFs(), dstGcsPath, createFileOptions, - ghfs.getFileSystemConfiguration()); + Duration minSyncInterval = ghfs.getFileSystemConfiguration().getMinSyncInterval(); + + this.syncRateLimiter = + minSyncInterval.isNegative() || minSyncInterval.isZero() + ? null + : RateLimiter.create(/* permitsPerSecond= */ 1_000.0 / minSyncInterval.toMillis()); + this.composeObjectOptions = + GoogleCloudStorageFileSystem.objectOptionsFromFileOptions( + createFileOptions.toBuilder() + // Set write mode to OVERWRITE because we use compose operation to append new data + // to an existing object + .setWriteMode(CreateFileOptions.WriteMode.OVERWRITE) + .build()); + + if (createFileOptions.getWriteMode() == CreateFileOptions.WriteMode.APPEND) { + // When appending first component has to go to new temporary file. + this.tmpGcsPath = getNextTmpPath(); + this.tmpIndex = 1; + } else { + // The first component of the stream will go straight to the destination filename to optimize + // the case where no hsync() or a single hsync() is called during the lifetime of the stream; + // committing the first component thus doesn't require any compose() call under the hood. + this.tmpGcsPath = dstGcsPath; + this.tmpIndex = 0; + } + + this.tmpOut = + createOutputStream( + ghfs.getGcsFs(), + tmpGcsPath, + tmpIndex == 0 ? 
createFileOptions : TMP_FILE_CREATE_OPTIONS); + this.dstGenerationId = StorageResourceId.UNKNOWN_GENERATION_ID; } - private static OutputStream createOutputStream(GoogleCloudStorageFileSystem gcsfs, URI gcsPath, - CreateOptions options, GoogleHadoopFileSystemConfiguration fileSystemConfiguration) + private OutputStream createOutputStream(GoogleCloudStorageFileSystem gcsfs, URI gcsPath, + CreateFileOptions options) throws IOException { WritableByteChannel channel; try { @@ -80,14 +175,14 @@ private static OutputStream createOutputStream(GoogleCloudStorageFileSystem gcsf String.format("'%s' already exists", gcsPath)).initCause(e); } OutputStream outputStream = Channels.newOutputStream(channel); - int bufferSize = fileSystemConfiguration.getOutStreamBufferSize(); + int bufferSize = gcsfs.getConfiguration().getOutStreamBufferSize(); return bufferSize > 0 ? new BufferedOutputStream(outputStream, bufferSize) : outputStream; } @Override public void write(int b) throws IOException { throwIfNotOpen(); - outputStream.write(b); + tmpOut.write(b); statistics.incrementBytesWritten(1); statistics.incrementWriteOps(1); } @@ -95,30 +190,176 @@ public void write(int b) throws IOException { @Override public void write(@Nonnull byte[] b, int offset, int len) throws IOException { throwIfNotOpen(); - outputStream.write(b, offset, len); + tmpOut.write(b, offset, len); statistics.incrementBytesWritten(len); statistics.incrementWriteOps(1); } + private void commitTempFile() throws IOException { + // TODO: return early when 0 bytes have been written in the temp files + tmpOut.close(); + + long tmpGenerationId = StorageResourceId.UNKNOWN_GENERATION_ID; + LOG.trace( + "tmpOut is an instance of {}; expected generationId {}.", + tmpOut.getClass(), tmpGenerationId); + + // On the first component, tmpGcsPath will equal finalGcsPath, and no compose() call is + // necessary. Otherwise, we compose in-place into the destination object and then delete + // the temporary object. 
+ if (dstGcsPath.equals(tmpGcsPath)) { + // First commit was direct to the destination; the generationId of the object we just + // committed will be used as the destination generation id for future compose calls. + dstGenerationId = tmpGenerationId; + } else { + StorageResourceId dstId = + StorageResourceId.fromUriPath( + dstGcsPath, /* allowEmptyObjectName= */ false, dstGenerationId); + StorageResourceId tmpId = + StorageResourceId.fromUriPath( + tmpGcsPath, /* allowEmptyObjectName= */ false, tmpGenerationId); + checkState( + dstId.getBucketName().equals(tmpId.getBucketName()), + "Destination bucket in path '%s' doesn't match temp file bucket in path '%s'", + dstGcsPath, + tmpGcsPath); + GoogleCloudStorageFileSystem gcs = ghfs.getGcsFs(); + GoogleCloudStorageItemInfo composedObject = + gcs.composeObjects(ImmutableList.of(dstId, tmpId), dstId, composeObjectOptions); + dstGenerationId = composedObject.getContentGeneration(); + tmpDeletionFutures.add( + TMP_FILE_CLEANUP_THREADPOOL.submit( + () -> { + gcs.delete(ImmutableList.of(tmpId)); + return null; + })); + } + } + @Override public void close() throws IOException { - LOG.trace("close(): final destination: {}", dstGcsPath); + LOG.trace( + "close(): temp tail file: %s final destination: {}", tmpGcsPath, dstGcsPath); - if (outputStream == null) { + if (tmpOut == null) { LOG.trace("close(): Ignoring; stream already closed."); return; } + commitTempFile(); + try { - outputStream.close(); + tmpOut.close(); } finally { - outputStream = null; + tmpOut = null; + } + tmpGcsPath = null; + tmpIndex = -1; + + LOG.trace("close(): Awaiting {} deletionFutures", tmpDeletionFutures.size()); + for (Future deletion : tmpDeletionFutures) { + try { + deletion.get(); + } catch (ExecutionException | InterruptedException e) { + if (e instanceof InterruptedException) { + Thread.currentThread().interrupt(); + } + throw new IOException( + String.format( + "Failed to delete temporary files while closing stream: '%s'", dstGcsPath), + e); + } 
} } private void throwIfNotOpen() throws IOException { - if (outputStream == null) { + if (tmpOut == null) { throw new ClosedChannelException(); } } + + @Override + public boolean hasCapability(String capability) { + checkArgument(!isNullOrEmpty(capability), "capability must not be null or empty string"); + switch (Ascii.toLowerCase(capability)) { + case StreamCapabilities.HFLUSH: + case StreamCapabilities.HSYNC: + return syncRateLimiter != null; + case StreamCapabilities.IOSTATISTICS: + return false; // TODO: Add support + default: + return false; + } + } + + /** + * There is no way to flush data to become available for readers without a full-fledged hsync(), + * If the output stream is only syncable, this method is a no-op. If the output stream is also + * flushable, this method will simply use the same implementation of hsync(). + * + *

If it is rate limited, unlike hsync(), which will try to acquire the permits and block, it + * will do nothing. + */ + @Override + public void hflush() throws IOException { + LOG.trace("hflush(): {}", dstGcsPath); + + long startMs = System.currentTimeMillis(); + throwIfNotOpen(); + // If rate limit not set or permit acquired than use hsync() + if (syncRateLimiter == null || syncRateLimiter.tryAcquire()) { + LOG.trace("hflush() uses hsyncInternal() for {}", dstGcsPath); + hsyncInternal(startMs); + return; + } + LOG.trace( + "hflush(): No-op due to rate limit ({}): readers will *not* yet see flushed data for {}", + syncRateLimiter, dstGcsPath); + + } + + @Override + public void hsync() throws IOException { + LOG.trace("hsync(): {}", dstGcsPath); + + long startMs = System.currentTimeMillis(); + throwIfNotOpen(); + if (syncRateLimiter != null) { + LOG.trace( + "hsync(): Rate limited ({}) with blocking permit acquisition for {}", + syncRateLimiter, dstGcsPath); + syncRateLimiter.acquire(); + } + hsyncInternal(startMs); + + } + + /** Internal implementation of hsync, can be reused by hflush() as well. */ + private void hsyncInternal(long startMs) throws IOException { + LOG.trace( + "hsyncInternal(): Committing tail file {} to final destination {}", tmpGcsPath, dstGcsPath); + commitTempFile(); + + // Use a different temporary path for each temporary component to reduce the possible avenues of + // race conditions in the face of low-level retries, etc. + ++tmpIndex; + tmpGcsPath = getNextTmpPath(); + + LOG.trace( + "hsync(): Opening next temporary tail file {} at {} index", tmpGcsPath, tmpIndex); + tmpOut = createOutputStream(ghfs.getGcsFs(), tmpGcsPath, TMP_FILE_CREATE_OPTIONS); + + long finishMs = System.currentTimeMillis(); + LOG.trace("Took {}ms to sync() for {}", finishMs - startMs, dstGcsPath); + } + /** Returns URI to be used for the next temp "tail" file in the series. 
*/ + private URI getNextTmpPath() { + Path basePath = ghfs.getHadoopPath(dstGcsPath); + Path tempPath = + new Path( + basePath.getParent(), + String.format( + "%s%s.%d.%s", TMP_FILE_PREFIX, basePath.getName(), tmpIndex, UUID.randomUUID())); + return ghfs.getGcsPath(tempPath); + } } diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/HadoopConfigurationProperty.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/HadoopConfigurationProperty.java index 9360290a09c5b..450459e6a8dbc 100644 --- a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/HadoopConfigurationProperty.java +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/HadoopConfigurationProperty.java @@ -20,6 +20,7 @@ import org.apache.hadoop.thirdparty.com.google.common.collect.ImmutableList; +import java.time.Duration; import java.util.List; import java.util.function.BiFunction; @@ -28,6 +29,8 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import static java.util.concurrent.TimeUnit.MILLISECONDS; + /** * Hadoop configuration property. */ @@ -64,6 +67,13 @@ T get(Configuration config, BiFunction getterFn) { return logProperty(lookupKey, getterFn.apply(lookupKey, defaultValue)); } + Duration getTimeDuration(Configuration config) { + String lookupKey = getLookupKey(config, key, (c, k) -> c.get(k) != null); + String defValStr = defaultValue == null ? 
null : String.valueOf(defaultValue); + return logProperty( + lookupKey, Duration.ofMillis(config.getTimeDuration(lookupKey, defValStr, MILLISECONDS))); + } + private String getLookupKey(Configuration config, String lookupKey, BiFunction checkFn) { for (String prefix : keyPrefixes) { diff --git a/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractCreate.java b/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractCreate.java new file mode 100644 index 0000000000000..56ae35b4ff472 --- /dev/null +++ b/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractCreate.java @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.fs.gs.contract; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.contract.AbstractContractCreateTest; +import org.apache.hadoop.fs.contract.AbstractFSContract; +import org.apache.hadoop.fs.contract.ContractTestUtils; + +public class ITestGoogleContractCreate extends AbstractContractCreateTest { + @Override + protected AbstractFSContract createContract(Configuration conf) { + return new GoogleContract(conf); + } + + @Override + public void testOverwriteEmptyDirectory() throws Throwable { + ContractTestUtils.skip("blobstores can't distinguish empty directories from files"); + } +} diff --git a/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractMkdir.java b/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractMkdir.java index 26181f20385a3..27acc015ab8ba 100644 --- a/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractMkdir.java +++ b/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractMkdir.java @@ -29,33 +29,9 @@ protected AbstractFSContract createContract(Configuration conf) { return new GoogleContract(conf); } - @Override - public void testMkdirsDoesNotRemoveParentDirectories() { - // TODO: Enable this - ContractTestUtils.skip("Skipping the test. This will be enabled in a subsequent change"); - } - - @Override - public void testCreateDirWithExistingDir() { - // TODO: Enable this - ContractTestUtils.skip("Skipping the test. This will be enabled in a subsequent change"); - } - @Override public void testMkDirRmDir() { // TODO: Enable this ContractTestUtils.skip("Skipping the test. This will be enabled in a subsequent change"); } - - @Override - public void testNoMkdirOverFile() { - // TODO: Enable this - ContractTestUtils.skip("Skipping the test. 
This will be enabled in a subsequent change"); - } - - @Override - public void testMkdirOverParentFile() { - // TODO: Enable this - ContractTestUtils.skip("Skipping the test. This will be enabled in a subsequent change"); - } } diff --git a/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractRename.java b/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractRename.java index 5d9459cac19e9..a159d46b0061f 100644 --- a/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractRename.java +++ b/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractRename.java @@ -34,10 +34,4 @@ public void testRenameWithNonEmptySubDir() { // TODO: Enable this ContractTestUtils.skip("Skipping the test. This will be enabled in a subsequent change"); } - - @Override - public void testRenameNonexistentFile() { - // TODO: Enable this - ContractTestUtils.skip("Skipping the test. 
This will be enabled in a subsequent change"); - } } diff --git a/hadoop-tools/hadoop-gcp/src/test/resources/contract/gs.xml b/hadoop-tools/hadoop-gcp/src/test/resources/contract/gs.xml index 1de34245a5d1a..542df5166b672 100644 --- a/hadoop-tools/hadoop-gcp/src/test/resources/contract/gs.xml +++ b/hadoop-tools/hadoop-gcp/src/test/resources/contract/gs.xml @@ -17,5 +17,103 @@ --> + + fs.contract.test.root-tests-enabled + true + - + + fs.contract.test.random-seek-count + 10 + + + + fs.contract.create-visibility-delayed + true + + + + fs.contract.is-blobstore + true + + + + fs.contract.is-case-sensitive + true + + + + fs.contract.rename-returns-false-if-source-missing + true + + + + fs.contract.rename-returns-false-if-dest-exists + true + + + + fs.contract.rename-remove-dest-if-empty-dir + false + + + + fs.contract.supports-append + true + + + + fs.contract.supports-atomic-directory-delete + false + + + + fs.contract.supports-atomic-rename + false + + + + fs.contract.supports-block-locality + false + + + + fs.contract.supports-concat + true + + + + fs.contract.supports-getfilestatus + true + + + + fs.contract.supports-seek + true + + + + fs.contract.supports-seek-on-closed-file + true + + + + fs.contract.rejects-seek-past-eof + true + + + + fs.contract.supports-strict-exceptions + true + + + + fs.contract.supports-unix-permissions + false + + + + fs.contract.rename-overwrites-dest + false + + \ No newline at end of file From cdf1c38365d68fb54fc444995b3c4b0c25ff32ee Mon Sep 17 00:00:00 2001 From: Arunkumar Chacko Date: Wed, 2 Jul 2025 17:58:18 +0000 Subject: [PATCH 6/8] HADOOP-19343. 
Add support for append(), compose(), concat() Closes #7773 Signed-off-by: Chris Nauroth --- .../org/apache/hadoop/fs/gs/Constants.java | 2 + .../hadoop/fs/gs/CreateFileOptions.java | 1 + .../apache/hadoop/fs/gs/GcsListOperation.java | 98 ++++++++++ .../hadoop/fs/gs/GoogleCloudStorage.java | 174 +++++++++++------- .../fs/gs/GoogleCloudStorageFileSystem.java | 147 ++++++--------- .../hadoop/fs/gs/GoogleHadoopFileSystem.java | 67 ++++++- .../hadoop/fs/gs/StorageResourceId.java | 5 + .../contract/ITestGoogleContractAppend.java | 35 ++++ .../contract/ITestGoogleContractConcat.java | 30 +++ .../contract/ITestGoogleContractDelete.java | 7 - .../gs/contract/ITestGoogleContractMkdir.java | 7 - .../contract/ITestGoogleContractRename.java | 7 - 12 files changed, 396 insertions(+), 184 deletions(-) create mode 100644 hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GcsListOperation.java create mode 100644 hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractAppend.java create mode 100644 hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractConcat.java diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/Constants.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/Constants.java index 34434b2859a06..61c1571c4dcde 100644 --- a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/Constants.java +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/Constants.java @@ -19,6 +19,8 @@ package org.apache.hadoop.fs.gs; final class Constants { + static final int MAX_COMPOSE_OBJECTS = 32; + private Constants() {} // URI scheme for GCS. 
diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/CreateFileOptions.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/CreateFileOptions.java index 1c0e48ae14418..e3d0631b501d3 100644 --- a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/CreateFileOptions.java +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/CreateFileOptions.java @@ -29,6 +29,7 @@ * Options that can be specified when creating a file in the {@link GoogleCloudStorageFileSystem}. */ final class CreateFileOptions { + static final CreateFileOptions DEFAULT = CreateFileOptions.builder().build(); private final ImmutableMap attributes; private final String contentType; private final long overwriteGenerationId; diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GcsListOperation.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GcsListOperation.java new file mode 100644 index 0000000000000..9cd4fdbb867e4 --- /dev/null +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GcsListOperation.java @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.fs.gs; + +import java.util.ArrayList; +import java.util.List; + +import com.google.cloud.storage.Blob; +import com.google.cloud.storage.Storage; + +import static org.apache.hadoop.thirdparty.com.google.common.base.Preconditions.checkArgument; + +final class GcsListOperation { + private static final int ALL = 0; + private final Storage.BlobListOption[] listOptions; + private final String bucketName; + private final Storage storage; + private final int limit; + + private GcsListOperation(Builder builder) { + this.listOptions = builder.blobListOptions + .toArray(new Storage.BlobListOption[builder.blobListOptions.size()]); + this.bucketName = builder.bucket; + this.storage = builder.storage; + this.limit = builder.limit; + } + + public List execute() { + List result = new ArrayList<>(); + for (Blob blob : storage.list(bucketName, listOptions).iterateAll()) { + result.add(blob); + + if (limit != ALL && result.size() >= limit) { + break; + } + } + + return result; + } + + static class Builder { + private final ArrayList blobListOptions = new ArrayList<>(); + private String prefix; + private final String bucket; + private final Storage storage; + private int limit = GcsListOperation.ALL; + + Builder(final String bucketName, final String thePrefix, Storage storage) { + this.storage = storage; + this.bucket = bucketName; + this.prefix = thePrefix; + } + + Builder forRecursiveListing() { + return this; + } + + GcsListOperation build() { + blobListOptions.add(Storage.BlobListOption.prefix(prefix)); + return new GcsListOperation(this); + } + + Builder forCurrentDirectoryListing() { + blobListOptions.add(Storage.BlobListOption.currentDirectory()); + blobListOptions.add(Storage.BlobListOption.includeTrailingDelimiter()); + return this; + } + + Builder forCurrentDirectoryListingWithLimit(int theLimit) { + checkArgument( + theLimit > 0, + "limit should be greater than 0. 
found %s; prefix=%s", theLimit, prefix); + + this.limit = theLimit; + prefix = StringPaths.toDirectoryPath(prefix); + + blobListOptions.add(Storage.BlobListOption.pageSize(1)); + forCurrentDirectoryListing(); + return this; + } + } +} diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorage.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorage.java index dcf5ff231f7bf..e8cafa57ef820 100644 --- a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorage.java +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorage.java @@ -288,49 +288,6 @@ private static GoogleCloudStorageItemInfo createItemInfoForBucket(StorageResourc bucket.getStorageClass() == null ? null : bucket.getStorageClass().name()); } - List listObjectInfo( - String bucketName, - String objectNamePrefix, - ListObjectOptions listOptions) throws IOException { - try { - long maxResults = listOptions.getMaxResults() > 0 ? - listOptions.getMaxResults() + (listOptions.isIncludePrefix() ? 
0 : 1) : - listOptions.getMaxResults(); - - Storage.BlobListOption[] blobListOptions = - getBlobListOptions(objectNamePrefix, listOptions, maxResults); - Page blobs = storage.list(bucketName, blobListOptions); - ListOperationResult result = new ListOperationResult(maxResults); - for (Blob blob : blobs.iterateAll()) { - result.add(blob); - } - - return result.getItems(); - } catch (StorageException e) { - throw new IOException( - String.format("listing object '%s' failed.", BlobId.of(bucketName, objectNamePrefix)), - e); - } - } - - private Storage.BlobListOption[] getBlobListOptions( - String objectNamePrefix, ListObjectOptions listOptions, long maxResults) { - List options = new ArrayList<>(); - - options.add(Storage.BlobListOption.fields(BLOB_FIELDS.toArray(new Storage.BlobField[0]))); - options.add(Storage.BlobListOption.prefix(objectNamePrefix)); - // TODO: set max results as a BlobListOption - if ("/".equals(listOptions.getDelimiter())) { - options.add(Storage.BlobListOption.currentDirectory()); - } - - if (listOptions.getDelimiter() != null) { - options.add(Storage.BlobListOption.includeTrailingDelimiter()); - } - - return options.toArray(new Storage.BlobListOption[0]); - } - private GoogleCloudStorageItemInfo createItemInfoForBlob(Blob blob) { long generationId = blob.getGeneration() == null ? 
0L : blob.getGeneration(); StorageResourceId resourceId = @@ -403,8 +360,7 @@ void createEmptyObject(StorageResourceId resourceId, CreateObjectOptions options } } - - public GoogleCloudStorageItemInfo composeObjects( + GoogleCloudStorageItemInfo composeObjects( List sources, StorageResourceId destination, CreateObjectOptions options) throws IOException { LOG.trace("composeObjects({}, {}, {})", sources, destination, options); @@ -538,13 +494,14 @@ List listDirectoryRecursive(String bucketName, Strin // TODO: Take delimiter from config // TODO: Set specific fields + checkArgument(objectName.endsWith("/"), String.format("%s should end with /", objectName)); try { - Page blobs = storage.list( - bucketName, - Storage.BlobListOption.prefix(objectName)); + List blobs = new GcsListOperation.Builder(bucketName, objectName, storage) + .forRecursiveListing().build() + .execute(); List result = new ArrayList<>(); - for (Blob blob : blobs.iterateAll()) { + for (Blob blob : blobs) { result.add(createItemInfoForBlob(blob)); } @@ -624,7 +581,7 @@ private List listBucketsInternal() throws IOException { return allBuckets; } - public SeekableByteChannel open(GoogleCloudStorageItemInfo itemInfo, + SeekableByteChannel open(GoogleCloudStorageItemInfo itemInfo, GoogleHadoopFileSystemConfiguration config) throws IOException { LOG.trace("open({})", itemInfo); checkNotNull(itemInfo, "itemInfo should not be null"); @@ -647,7 +604,7 @@ private SeekableByteChannel open( config); } - public void move(Map sourceToDestinationObjectsMap) + void move(Map sourceToDestinationObjectsMap) throws IOException { validateMoveArguments(sourceToDestinationObjectsMap); @@ -739,7 +696,7 @@ private Storage.MoveBlobRequest.Builder createMoveRequestBuilder( * Validates basic argument constraints like non-null, non-empty Strings, using {@code * Preconditions} in addition to checking for src/dst bucket equality. 
*/ - public static void validateMoveArguments( + static void validateMoveArguments( Map sourceToDestinationObjectsMap) throws IOException { checkNotNull(sourceToDestinationObjectsMap, "srcObjects must not be null"); @@ -837,7 +794,7 @@ private void copyInternal( } } - public static void validateCopyArguments( + static void validateCopyArguments( Map sourceToDestinationObjectsMap, GoogleCloudStorage gcsImpl) throws IOException { @@ -927,6 +884,103 @@ List getItemInfos(List resourceId return result; } + List listDirectory(String bucketName, String objectNamePrefix) + throws IOException { + checkArgument( + objectNamePrefix.endsWith("/"), + String.format("%s should end with /", objectNamePrefix)); + + try { + List blobs = new GcsListOperation.Builder(bucketName, objectNamePrefix, storage) + .forCurrentDirectoryListing().build() + .execute(); + + ListOperationResult result = new ListOperationResult(); + for (Blob blob : blobs) { + result.add(blob); + } + + return result.getItems(); + } catch (StorageException e) { + throw new IOException( + String.format("listing object '%s' failed.", BlobId.of(bucketName, objectNamePrefix)), + e); + } + } + + void compose( + String bucketName, List sources, String destination, String contentType) + throws IOException { + LOG.trace("compose({}, {}, {}, {})", bucketName, sources, destination, contentType); + List sourceIds = + sources.stream() + .map(objectName -> new StorageResourceId(bucketName, objectName)) + .collect(Collectors.toList()); + StorageResourceId destinationId = new StorageResourceId(bucketName, destination); + CreateObjectOptions options = + CreateObjectOptions.DEFAULT_OVERWRITE.toBuilder() + .setContentType(contentType) + .setEnsureEmptyObjectsMetadataMatch(false) + .build(); + composeObjects(sourceIds, destinationId, options); + } + + /** + * Get metadata for the given resourceId. The resourceId can be a file or a directory. + * + * For a resourceId gs://b/foo/a, it can be a file or a directory (gs:/b/foo/a/). 
+ * This method checks for both and returns the one that is found. "NotFound" is returned + * if not found. + */ + GoogleCloudStorageItemInfo getFileOrDirectoryInfo(StorageResourceId resourceId) { + BlobId blobId = resourceId.toBlobId(); + if (resourceId.isDirectory()) { + // Do not check for "file" for directory paths. + Blob blob = storage.get(blobId); + if (blob != null) { + return createItemInfoForBlob(blob); + } + } else { + BlobId dirId = resourceId.toDirectoryId().toBlobId(); + + // Check for both file and directory. + List blobs = storage.get(blobId, dirId); + for (Blob blob : blobs) { + if (blob != null) { + return createItemInfoForBlob(blob); + } + } + } + + return GoogleCloudStorageItemInfo.createNotFound(resourceId); + } + + /** + * Check if any "implicit" directory exists for the given resourceId. + * + * Note that GCS object store does not have a concept of directories for non-HNS buckets. + * For example, one could create an object gs://bucket/foo/bar/a.txt, without creating the + * parent directories (i.e. placeholder empty files ending with a /). In this case we might + * want to treat gs://bucket/foo/ and gs://bucket/foo/bar/ as directories. + * + * This method helps check if a given resourceId (e.g. gs://bucket/foo/bar/) is an "implicit" + * directory. + * + * Note that this will result in a list operation and is more expensive than "get metadata". + */ + GoogleCloudStorageItemInfo getImplicitDirectory(StorageResourceId resourceId) { + List blobs = new GcsListOperation + .Builder(resourceId.getBucketName(), resourceId.getObjectName(), storage) + .forCurrentDirectoryListingWithLimit(1).build() + .execute(); + + if (blobs.isEmpty()) { + return GoogleCloudStorageItemInfo.createNotFound(resourceId); + } + + return GoogleCloudStorageItemInfo.createInferredDirectory(resourceId.toDirectoryId()); + } + // Helper class to capture the results of list operation. 
private class ListOperationResult { private final Map prefixes = new HashMap<>(); @@ -934,12 +988,6 @@ private class ListOperationResult { private final Set objectsSet = new HashSet<>(); - private final long maxResults; - - ListOperationResult(long maxResults) { - this.maxResults = maxResults; - } - void add(Blob blob) { String path = blob.getBlobId().toGsUtilUri(); if (blob.getGeneration() != null) { @@ -957,17 +1005,9 @@ List getItems() { for (Blob blob : objects) { result.add(createItemInfoForBlob(blob)); - - if (result.size() == maxResults) { - return result; - } } for (Blob blob : prefixes.values()) { - if (result.size() == maxResults) { - return result; - } - result.add(createItemInfoForBlob(blob)); } diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorageFileSystem.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorageFileSystem.java index 951c38f596667..8cf11d009c8ed 100644 --- a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorageFileSystem.java +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorageFileSystem.java @@ -29,7 +29,6 @@ import org.apache.hadoop.thirdparty.com.google.common.annotations.VisibleForTesting; import org.apache.hadoop.thirdparty.com.google.common.collect.ImmutableList; import org.apache.hadoop.thirdparty.com.google.common.collect.ImmutableMap; -import org.apache.hadoop.thirdparty.com.google.common.collect.Iterables; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -49,6 +48,7 @@ import java.util.Objects; import java.util.TreeMap; import java.util.regex.Pattern; +import java.util.stream.Collectors; import javax.annotation.Nullable; /** @@ -161,7 +161,7 @@ void close() { } } - public FileInfo getFileInfo(URI path) throws IOException { + FileInfo getFileInfo(URI path) throws IOException { checkArgument(path != null, "path must not be null"); // Validate the given path. 
true == allow empty object name. // One should be able to get info about top level directory (== bucket), @@ -182,42 +182,17 @@ private GoogleCloudStorageItemInfo getFileInfoInternal( return gcs.getItemInfo(resourceId); } - StorageResourceId dirId = resourceId.toDirectoryId(); - if (!resourceId.isDirectory()) { - GoogleCloudStorageItemInfo itemInfo = gcs.getItemInfo(resourceId); - if (itemInfo.exists()) { - return itemInfo; - } - - if (inferImplicitDirectories) { - // TODO: Set max result - List listDirResult = gcs.listObjectInfo( - resourceId.getBucketName(), - resourceId.getObjectName(), - GET_FILE_INFO_LIST_OPTIONS); - if (!listDirResult.isEmpty()) { - return GoogleCloudStorageItemInfo.createInferredDirectory(resourceId.toDirectoryId()); - } - } + GoogleCloudStorageItemInfo dirOrObject = gcs.getFileOrDirectoryInfo(resourceId); + if (dirOrObject.exists() || !inferImplicitDirectories) { + return dirOrObject; } - List listDirInfo = ImmutableList.of(gcs.getItemInfo(dirId)); - if (listDirInfo.isEmpty()) { - return GoogleCloudStorageItemInfo.createNotFound(resourceId); - } - checkState(listDirInfo.size() <= 2, "listed more than 2 objects: '%s'", listDirInfo); - GoogleCloudStorageItemInfo dirInfo = Iterables.get(listDirInfo, /* position= */ 0); - checkState( - dirInfo.getResourceId().equals(dirId) || !inferImplicitDirectories, - "listed wrong object '%s', but should be '%s'", - dirInfo.getResourceId(), - resourceId); - return dirInfo.getResourceId().equals(dirId) && dirInfo.exists() - ? dirInfo - : GoogleCloudStorageItemInfo.createNotFound(resourceId); + // File does not exist; Explicit directory does not exist. Check for implicit directory. 
+ // This will result in a list operation, which is expensive + return gcs.getImplicitDirectory(resourceId); } - public void mkdirs(URI path) throws IOException { + void mkdirs(URI path) throws IOException { LOG.trace("mkdirs(path: {})", path); checkNotNull(path, "path should not be null"); @@ -313,18 +288,6 @@ private List listRecursive(URI prefix) throws IOException { return fileInfos; } - private List listDirectory(URI prefix) throws IOException { - StorageResourceId prefixId = getPrefixId(prefix); - List itemInfos = gcs.listObjectInfo( - prefixId.getBucketName(), - prefixId.getObjectName(), - ListObjectOptions.DEFAULT_FLAT_LIST); - - List fileInfos = FileInfo.fromItemInfos(itemInfos); - fileInfos.sort(FILE_INFO_PATH_COMPARATOR); - return fileInfos; - } - private StorageResourceId getPrefixId(URI prefix) { checkNotNull(prefix, "prefix could not be null"); @@ -375,42 +338,6 @@ private void deleteBucket(List bucketsToDelete) throws IOException { throw new UnsupportedOperationException("deleteBucket is not supported."); } - public List listFileInfo(URI path, ListFileOptions listOptions) throws IOException { - checkNotNull(path, "path can not be null"); - LOG.trace("listStatus(path: {})", path); - - StorageResourceId pathId = - StorageResourceId.fromUriPath(path, /* allowEmptyObjectName= */ true); - - if (!pathId.isDirectory()) { - GoogleCloudStorageItemInfo pathInfo = gcs.getItemInfo(pathId); - if (pathInfo.exists()) { - List listedInfo = new ArrayList<>(); - listedInfo.add(FileInfo.fromItemInfo(pathInfo)); - - return listedInfo; - } - } - - StorageResourceId dirId = pathId.toDirectoryId(); - List dirItemInfos = dirId.isRoot() ? 
- gcs.listBucketInfo() : - gcs.listObjectInfo( - dirId.getBucketName(), dirId.getObjectName(), LIST_FILE_INFO_LIST_OPTIONS); - - if (pathId.isStorageObject() && dirItemInfos.isEmpty()) { - throw new FileNotFoundException("Item not found: " + path); - } - - if (!dirItemInfos.isEmpty() && Objects.equals(dirItemInfos.get(0).getResourceId(), dirId)) { - dirItemInfos.remove(0); - } - - List fileInfos = FileInfo.fromItemInfos(dirItemInfos); - fileInfos.sort(FILE_INFO_PATH_COMPARATOR); - return fileInfos; - } - FileInfo getFileInfoObject(URI path) throws IOException { checkArgument(path != null, "path must not be null"); StorageResourceId resourceId = StorageResourceId.fromUriPath(path, true); @@ -574,10 +501,7 @@ List listFileInfoForPrefix(URI prefix, ListFileOptions listOptions) LOG.trace("listAllFileInfoForPrefix(prefix: {})", prefix); StorageResourceId prefixId = getPrefixId(prefix); List itemInfos = - gcs.listObjectInfo( - prefixId.getBucketName(), - prefixId.getObjectName(), - updateListObjectOptions(ListObjectOptions.DEFAULT_FLAT_LIST, listOptions)); + gcs.listDirectoryRecursive(prefixId.getBucketName(), prefixId.getObjectName()); List fileInfos = FileInfo.fromItemInfos(itemInfos); fileInfos.sort(FILE_INFO_PATH_COMPARATOR); return fileInfos; @@ -604,11 +528,6 @@ private void moveInternal(Map srcToDstItemNames) throws IOExcepti gcs.move(sourceToDestinationObjectsMap); } - private static ListObjectOptions updateListObjectOptions( - ListObjectOptions listObjectOptions, ListFileOptions listFileOptions) { - return listObjectOptions.builder().setFields(listFileOptions.getFields()).build(); - } - private List getFileInfos(List paths) throws IOException { List result = new ArrayList<>(paths.size()); for (URI path : paths) { @@ -798,4 +717,50 @@ static List getDirs(String objectName) { } return dirs; } + + List listDirectory(URI path) throws IOException { + checkNotNull(path, "path can not be null"); + LOG.trace("listStatus(path: {})", path); + + StorageResourceId pathId = 
+ StorageResourceId.fromUriPath(path, /* allowEmptyObjectName= */ true); + + if (!pathId.isDirectory()) { + GoogleCloudStorageItemInfo pathInfo = gcs.getItemInfo(pathId); + if (pathInfo.exists()) { + List listedInfo = new ArrayList<>(); + listedInfo.add(FileInfo.fromItemInfo(pathInfo)); + + return listedInfo; + } + } + + StorageResourceId dirId = pathId.toDirectoryId(); + List dirItemInfos = dirId.isRoot() ? + gcs.listBucketInfo() : + gcs.listDirectory( + dirId.getBucketName(), dirId.getObjectName()); + + if (pathId.isStorageObject() && dirItemInfos.isEmpty()) { + throw new FileNotFoundException("Item not found: " + path); + } + + if (!dirItemInfos.isEmpty() && Objects.equals(dirItemInfos.get(0).getResourceId(), dirId)) { + dirItemInfos.remove(0); + } + + List fileInfos = FileInfo.fromItemInfos(dirItemInfos); + fileInfos.sort(FILE_INFO_PATH_COMPARATOR); + return fileInfos; + } + + void compose(List sources, URI destination, String contentType) throws IOException { + StorageResourceId destResource = StorageResourceId.fromStringPath(destination.toString()); + List sourceObjects = + sources.stream() + .map(uri -> StorageResourceId.fromStringPath(uri.toString()).getObjectName()) + .collect(Collectors.toList()); + gcs.compose( + destResource.getBucketName(), sourceObjects, destResource.getObjectName(), contentType); + } } diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleHadoopFileSystem.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleHadoopFileSystem.java index e4db8146186a0..3c4e84ce1eb8c 100644 --- a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleHadoopFileSystem.java +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleHadoopFileSystem.java @@ -18,6 +18,7 @@ package org.apache.hadoop.fs.gs; +import static org.apache.hadoop.thirdparty.com.google.common.collect.ImmutableList.toImmutableList; import static org.apache.hadoop.fs.gs.Constants.GCS_CONFIG_PREFIX; import 
static org.apache.hadoop.fs.gs.GoogleHadoopFileSystemConfiguration.GCS_WORKING_DIRECTORY; @@ -34,6 +35,7 @@ import java.net.URI; import java.nio.file.DirectoryNotEmptyException; import java.util.ArrayList; +import java.util.Arrays; import java.util.EnumSet; import java.util.List; @@ -44,6 +46,8 @@ import org.apache.hadoop.security.UserGroupInformation; import org.apache.hadoop.util.Progressable; +import org.apache.hadoop.thirdparty.com.google.common.base.Preconditions; +import org.apache.hadoop.thirdparty.com.google.common.collect.Lists; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -321,10 +325,63 @@ public FSDataOutputStream createNonRecursive( progress); } + /** + * Appends to an existing file (optional operation). Not supported. + * + * @param hadoopPath The existing file to be appended. + * @param bufferSize The size of the buffer to be used. + * @param progress For reporting progress if it is not null. + * @return A writable stream. + * @throws IOException if an error occurs. + */ @Override - public FSDataOutputStream append(final Path path, final int i, final Progressable progressable) + public FSDataOutputStream append(Path hadoopPath, int bufferSize, Progressable progress) throws IOException { - throw new UnsupportedOperationException(path.toString()); + Preconditions.checkArgument(hadoopPath != null, "hadoopPath must not be null"); + LOG.trace("append(hadoopPath: {}, bufferSize: {} [ignored])", hadoopPath, bufferSize); + URI filePath = getGcsPath(hadoopPath); + return new FSDataOutputStream( + new GoogleHadoopOutputStream( + this, + filePath, + CreateFileOptions.builder() + .setWriteMode(CreateFileOptions.WriteMode.APPEND) + .build(), + statistics), + statistics); + } + + /** + * Concat existing files into one file. + * + * @param tgt the path to the target destination. + * @param srcs the paths to the sources to use for the concatenation. 
+ * @throws IOException IO failure + */ + @Override + public void concat(Path tgt, Path[] srcs) throws IOException { + LOG.trace("concat(tgt: {}, srcs.length: {})", tgt, srcs.length); + + Preconditions.checkArgument(srcs.length > 0, "srcs must have at least one source"); + + URI tgtPath = getGcsPath(tgt); + List srcPaths = Arrays.stream(srcs).map(this::getGcsPath).collect(toImmutableList()); + + Preconditions.checkArgument( + !srcPaths.contains(tgtPath), + "target must not be contained in sources"); + + List> partitions = + Lists.partition(srcPaths, Constants.MAX_COMPOSE_OBJECTS - 1); + LOG.trace("concat(tgt: {}, {} partitions: {})", tgt, partitions.size(), partitions); + for (List partition : partitions) { + // We need to include the target in the list of sources to compose since + // the GCS FS compose operation will overwrite the target, whereas the Hadoop + // concat operation appends to the target. + List sources = Lists.newArrayList(tgtPath); + sources.addAll(partition); + getGcsFs().compose(sources, tgtPath, CreateFileOptions.DEFAULT.getContentType()); + } } @Override @@ -377,6 +434,7 @@ public boolean delete(final Path hadoopPath, final boolean recursive) throws IOE if (ApiErrorExtractor.INSTANCE.requestFailure(e)) { throw e; } + LOG.trace("delete(hadoopPath: {}, recursive: {}): false [failed]", hadoopPath, recursive, e); return false; } @@ -397,7 +455,7 @@ public FileStatus[] listStatus(final Path hadoopPath) throws IOException { List status; try { - List fileInfos = getGcsFs().listFileInfo(gcsPath, ListFileOptions.OBJECTFIELDS); + List fileInfos = getGcsFs().listDirectory(gcsPath); status = new ArrayList<>(fileInfos.size()); String userName = getUgiUserName(); for (FileInfo fileInfo : fileInfos) { @@ -476,7 +534,7 @@ public boolean hasPathCapability(final Path path, final String capability) { switch (Ascii.toLowerCase(capability)) { case CommonPathCapabilities.FS_APPEND: case CommonPathCapabilities.FS_CONCAT: - return false; + return true; default: return 
false; } @@ -524,7 +582,6 @@ public FileStatus getFileStatus(final Path path) throws IOException { checkOpen(); URI gcsPath = getGcsPath(path); - FileInfo fileInfo = getGcsFs().getFileInfo(gcsPath); if (!fileInfo.exists()) { throw new FileNotFoundException( diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/StorageResourceId.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/StorageResourceId.java index 5935564feedfa..31c268a5d19b1 100644 --- a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/StorageResourceId.java +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/StorageResourceId.java @@ -23,6 +23,7 @@ import static org.apache.hadoop.thirdparty.com.google.common.base.Strings.isNullOrEmpty; import static org.apache.hadoop.fs.gs.Constants.SCHEME; +import com.google.cloud.storage.BlobId; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -325,4 +326,8 @@ static StorageResourceId fromUriPath(URI path, boolean allowEmptyObjectName, new StorageResourceId(bucketName, generationId) : new StorageResourceId(bucketName, objectName, generationId); } + + BlobId toBlobId() { + return BlobId.of(bucketName, objectName); + } } diff --git a/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractAppend.java b/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractAppend.java new file mode 100644 index 0000000000000..4ca0b4cd082b6 --- /dev/null +++ b/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractAppend.java @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.gs.contract; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.contract.AbstractContractAppendTest; +import org.apache.hadoop.fs.contract.AbstractFSContract; +import org.apache.hadoop.fs.contract.ContractTestUtils; + +public class ITestGoogleContractAppend extends AbstractContractAppendTest { + @Override + protected AbstractFSContract createContract(Configuration conf) { + return new GoogleContract(conf); + } + + @Override + public void testRenameFileBeingAppended() throws Throwable { + ContractTestUtils.skip("blobstores can not rename file that being appended"); + } +} diff --git a/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractConcat.java b/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractConcat.java new file mode 100644 index 0000000000000..3a21b66631fd1 --- /dev/null +++ b/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractConcat.java @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.gs.contract; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.contract.AbstractContractConcatTest; +import org.apache.hadoop.fs.contract.AbstractFSContract; + +/** GCS contract tests covering file concat. */ +public class ITestGoogleContractConcat extends AbstractContractConcatTest { + @Override + protected AbstractFSContract createContract(Configuration conf) { + return new GoogleContract(conf); + } +} diff --git a/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractDelete.java b/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractDelete.java index 7ed3834025c3c..dabe396f65d5c 100644 --- a/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractDelete.java +++ b/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractDelete.java @@ -21,17 +21,10 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.contract.AbstractContractDeleteTest; import org.apache.hadoop.fs.contract.AbstractFSContract; -import org.apache.hadoop.fs.contract.ContractTestUtils; public class ITestGoogleContractDelete extends AbstractContractDeleteTest { @Override protected AbstractFSContract createContract(Configuration conf) { return new GoogleContract(conf); } - - @Override - public void 
testDeleteEmptyDirNonRecursive() { - // TODO: Enable this - ContractTestUtils.skip("Skipping the test. This will be enabled in a subsequent change"); - } } diff --git a/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractMkdir.java b/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractMkdir.java index 27acc015ab8ba..4f846feb263b7 100644 --- a/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractMkdir.java +++ b/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractMkdir.java @@ -18,7 +18,6 @@ package org.apache.hadoop.fs.gs.contract; -import org.apache.hadoop.fs.contract.ContractTestUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.contract.AbstractContractMkdirTest; import org.apache.hadoop.fs.contract.AbstractFSContract; @@ -28,10 +27,4 @@ public class ITestGoogleContractMkdir extends AbstractContractMkdirTest { protected AbstractFSContract createContract(Configuration conf) { return new GoogleContract(conf); } - - @Override - public void testMkDirRmDir() { - // TODO: Enable this - ContractTestUtils.skip("Skipping the test. 
This will be enabled in a subsequent change"); - } } diff --git a/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractRename.java b/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractRename.java index a159d46b0061f..cd168da7b07c0 100644 --- a/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractRename.java +++ b/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractRename.java @@ -20,7 +20,6 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.contract.AbstractContractRenameTest; import org.apache.hadoop.fs.contract.AbstractFSContract; -import org.apache.hadoop.fs.contract.ContractTestUtils; /** GCS contract tests covering file rename. */ public class ITestGoogleContractRename extends AbstractContractRenameTest { @@ -28,10 +27,4 @@ public class ITestGoogleContractRename extends AbstractContractRenameTest { protected AbstractFSContract createContract(Configuration conf) { return new GoogleContract(conf); } - - @Override - public void testRenameWithNonEmptySubDir() { - // TODO: Enable this - ContractTestUtils.skip("Skipping the test. This will be enabled in a subsequent change"); - } } From d3892f234286e1fab7e68a6a299d6644fe2fa4a5 Mon Sep 17 00:00:00 2001 From: Arunkumar Chacko Date: Tue, 8 Jul 2025 23:18:42 +0000 Subject: [PATCH 7/8] HADOOP-19343. 
Add additional authentication support Closes #7779 Co-authored-by: Chris Nauroth Signed-off-by: Chris Nauroth --- .../dev-support/findbugs-exclude.xml | 5 + .../hadoop/fs/gs/GcsInstrumentation.java | 63 ++++ .../apache/hadoop/fs/gs/GcsListOperation.java | 20 +- .../apache/hadoop/fs/gs/GcsStatistics.java | 72 ++++ .../hadoop/fs/gs/GcsStorageStatistics.java | 52 +++ .../hadoop/fs/gs/GoogleCloudStorage.java | 54 ++- .../fs/gs/GoogleCloudStorageExceptions.java | 23 ++ .../fs/gs/GoogleCloudStorageFileSystem.java | 21 +- .../fs/gs/GoogleHadoopFSInputStream.java | 2 +- .../hadoop/fs/gs/GoogleHadoopFileSystem.java | 349 +++++++++++------- .../GoogleHadoopFileSystemConfiguration.java | 39 +- .../fs/gs/GoogleHadoopOutputStream.java | 2 +- .../fs/gs/HadoopConfigurationProperty.java | 21 ++ .../fs/gs/HadoopCredentialsConfiguration.java | 213 +++++++++++ .../apache/hadoop/fs/gs/RedactedString.java | 50 +++ .../hadoop/fs/gs/StatisticTypeEnum.java | 23 ++ .../org/apache/hadoop/fs/gs/StringPaths.java | 2 +- .../tools/hadoop-gcp/Configuration.md | 197 ++++++++++ .../site/markdown/tools/hadoop-gcp/testing.md | 25 +- .../ITestGoogleContractContentSummary.java | 29 ++ .../ITestGoogleContractRootDirectory.java | 31 ++ .../contract/ITestGoogleContractUnbuffer.java | 29 ++ 22 files changed, 1145 insertions(+), 177 deletions(-) create mode 100644 hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GcsInstrumentation.java create mode 100644 hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GcsStatistics.java create mode 100644 hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GcsStorageStatistics.java create mode 100644 hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/HadoopCredentialsConfiguration.java create mode 100644 hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/RedactedString.java create mode 100644 hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/StatisticTypeEnum.java create mode 100644 
hadoop-tools/hadoop-gcp/src/site/markdown/tools/hadoop-gcp/Configuration.md create mode 100644 hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractContentSummary.java create mode 100644 hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractRootDirectory.java create mode 100644 hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractUnbuffer.java diff --git a/hadoop-tools/hadoop-gcp/dev-support/findbugs-exclude.xml b/hadoop-tools/hadoop-gcp/dev-support/findbugs-exclude.xml index 80be329bd6d16..0063be022dbf4 100644 --- a/hadoop-tools/hadoop-gcp/dev-support/findbugs-exclude.xml +++ b/hadoop-tools/hadoop-gcp/dev-support/findbugs-exclude.xml @@ -26,4 +26,9 @@ + + + + + diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GcsInstrumentation.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GcsInstrumentation.java new file mode 100644 index 0000000000000..273d29d2f49de --- /dev/null +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GcsInstrumentation.java @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.fs.gs; + +import java.io.Closeable; +import java.io.IOException; +import java.util.EnumSet; + +import org.apache.hadoop.fs.statistics.IOStatisticsSource; +import org.apache.hadoop.fs.statistics.impl.IOStatisticsStore; +import org.apache.hadoop.fs.statistics.impl.IOStatisticsStoreBuilder; + +import static org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.iostatisticsStore; + +class GcsInstrumentation implements Closeable, IOStatisticsSource { + private final IOStatisticsStore instanceIOStatistics; + + GcsInstrumentation() { + IOStatisticsStoreBuilder storeBuilder = iostatisticsStore(); + + // declare all counter statistics + EnumSet.allOf(GcsStatistics.class).stream() + .filter(statistic -> + statistic.getType() == StatisticTypeEnum.TYPE_COUNTER) + .forEach(stat -> { + storeBuilder.withCounters(stat.getSymbol()); + }); + + EnumSet.allOf(GcsStatistics.class).stream() + .filter(statistic -> + statistic.getType() == StatisticTypeEnum.TYPE_DURATION) + .forEach(stat -> { + storeBuilder.withDurationTracking(stat.getSymbol()); + }); + + this.instanceIOStatistics = storeBuilder.build(); + } + + @Override + public void close() throws IOException { + } + + @Override + public IOStatisticsStore getIOStatistics() { + return instanceIOStatistics; + } +} diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GcsListOperation.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GcsListOperation.java index 9cd4fdbb867e4..c3de5dd1c136c 100644 --- a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GcsListOperation.java +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GcsListOperation.java @@ -24,8 +24,6 @@ import com.google.cloud.storage.Blob; import com.google.cloud.storage.Storage; -import static org.apache.hadoop.thirdparty.com.google.common.base.Preconditions.checkArgument; - final class GcsListOperation { private static final int ALL = 0; private final 
Storage.BlobListOption[] listOptions; @@ -72,7 +70,11 @@ Builder forRecursiveListing() { } GcsListOperation build() { - blobListOptions.add(Storage.BlobListOption.prefix(prefix)); + // Can be null while listing the root directory. + if (prefix != null) { + blobListOptions.add(Storage.BlobListOption.prefix(prefix)); + } + return new GcsListOperation(this); } @@ -82,13 +84,11 @@ Builder forCurrentDirectoryListing() { return this; } - Builder forCurrentDirectoryListingWithLimit(int theLimit) { - checkArgument( - theLimit > 0, - "limit should be greater than 0. found %d; prefix=%s", theLimit, prefix); - - this.limit = theLimit; - prefix = StringPaths.toDirectoryPath(prefix); + Builder forImplicitDirectoryCheck() { + this.limit = 1; + if (prefix != null) { + prefix = StringPaths.toDirectoryPath(prefix); + } blobListOptions.add(Storage.BlobListOption.pageSize(1)); forCurrentDirectoryListing(); diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GcsStatistics.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GcsStatistics.java new file mode 100644 index 0000000000000..3e78bd24accfd --- /dev/null +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GcsStatistics.java @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.gs; + +import org.apache.hadoop.fs.statistics.StoreStatisticNames; + +import static org.apache.hadoop.fs.gs.StatisticTypeEnum.TYPE_DURATION; + +enum GcsStatistics { + INVOCATION_GET_FILE_STATUS( + StoreStatisticNames.OP_GET_FILE_STATUS, + "Calls of getFileStatus()", + TYPE_DURATION), + INVOCATION_CREATE( + StoreStatisticNames.OP_CREATE, + "Calls of create()", + TYPE_DURATION), + INVOCATION_DELETE( + StoreStatisticNames.OP_DELETE, + "Calls of delete()", + TYPE_DURATION), + INVOCATION_RENAME( + StoreStatisticNames.OP_RENAME, + "Calls of rename()", + TYPE_DURATION), + INVOCATION_OPEN( + StoreStatisticNames.OP_OPEN, + "Calls of open()", + TYPE_DURATION), + INVOCATION_MKDIRS( + StoreStatisticNames.OP_MKDIRS, + "Calls of mkdirs()", + TYPE_DURATION), + INVOCATION_LIST_STATUS( + StoreStatisticNames.OP_LIST_STATUS, + "Calls of listStatus()", + TYPE_DURATION); + + private final String description; + private final StatisticTypeEnum type; + private final String symbol; + + StatisticTypeEnum getType() { + return this.type; + } + + String getSymbol() { + return this.symbol; + } + + GcsStatistics(String symbol, String description, StatisticTypeEnum type) { + this.symbol = symbol; + this.description = description; + this.type = type; + } +} diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GcsStorageStatistics.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GcsStorageStatistics.java new file mode 100644 index 0000000000000..394007817a82b --- /dev/null +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GcsStorageStatistics.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.gs; + +import java.util.Iterator; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; +import org.apache.hadoop.fs.statistics.IOStatistics; +import org.apache.hadoop.fs.statistics.impl.StorageStatisticsFromIOStatistics; + +@InterfaceAudience.Private +@InterfaceStability.Evolving +class GcsStorageStatistics + extends StorageStatisticsFromIOStatistics { + static final String NAME = "GhfsStorageStatistics"; + + GcsStorageStatistics(final IOStatistics ioStatistics) { + super(NAME, Constants.SCHEME, ioStatistics); + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + for (Iterator it = this.getLongStatistics(); it.hasNext();) { + LongStatistic statistic = it.next(); + + if (sb.length() != 0) { + sb.append(", "); + } + sb.append(String.format("%s=%s", statistic.getName(), statistic.getValue())); + } + + return String.format("[%s]", sb); + } +} diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorage.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorage.java index e8cafa57ef820..24addcdd33932 100644 --- a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorage.java +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorage.java @@ 
-18,6 +18,7 @@ package org.apache.hadoop.fs.gs; +import static org.apache.hadoop.thirdparty.com.google.common.base.Preconditions.checkArgument; import static org.apache.hadoop.thirdparty.com.google.common.base.Preconditions.*; import static org.apache.hadoop.thirdparty.com.google.common.base.Strings.isNullOrEmpty; import static java.lang.Math.toIntExact; @@ -27,7 +28,9 @@ import com.google.api.client.util.ExponentialBackOff; import com.google.api.client.util.Sleeper; import com.google.api.gax.paging.Page; +import com.google.auth.Credentials; import com.google.cloud.storage.*; +import org.apache.hadoop.thirdparty.com.google.common.base.Strings; import org.apache.hadoop.thirdparty.com.google.common.collect.ImmutableList; import org.apache.hadoop.thirdparty.com.google.common.collect.Maps; import org.apache.hadoop.thirdparty.com.google.common.io.BaseEncoding; @@ -55,7 +58,7 @@ * client. */ class GoogleCloudStorage { - static final Logger LOG = LoggerFactory.getLogger(GoogleHadoopFileSystem.class); + static final Logger LOG = LoggerFactory.getLogger(GoogleCloudStorage.class); static final List BLOB_FIELDS = ImmutableList.of( Storage.BlobField.BUCKET, Storage.BlobField.CONTENT_ENCODING, @@ -76,18 +79,19 @@ class GoogleCloudStorage { * Having an instance of gscImpl to redirect calls to Json client while new client implementation * is in WIP. 
*/ - GoogleCloudStorage(GoogleHadoopFileSystemConfiguration configuration) throws IOException { - // TODO: Set credentials - this.storage = createStorage(configuration.getProjectId()); + GoogleCloudStorage(GoogleHadoopFileSystemConfiguration configuration, Credentials credentials) + throws IOException { + this.storage = createStorage(configuration.getProjectId(), credentials); this.configuration = configuration; } - private static Storage createStorage(String projectId) { + private static Storage createStorage(String projectId, Credentials credentials) { + StorageOptions.Builder builder = StorageOptions.newBuilder(); if (projectId != null) { - return StorageOptions.newBuilder().setProjectId(projectId).build().getService(); + builder.setProjectId(projectId); } - return StorageOptions.newBuilder().build().getService(); + return builder.setCredentials(credentials).build().getService(); } WritableByteChannel create(final StorageResourceId resourceId, final CreateFileOptions options) @@ -494,7 +498,9 @@ List listDirectoryRecursive(String bucketName, Strin // TODO: Take delimiter from config // TODO: Set specific fields - checkArgument(objectName.endsWith("/"), String.format("%s should end with /", objectName)); + checkArgument( + objectName == null || objectName.endsWith("/"), + String.format("%s should end with /", objectName)); try { List blobs = new GcsListOperation.Builder(bucketName, objectName, storage) .forRecursiveListing().build() @@ -887,7 +893,7 @@ List getItemInfos(List resourceId List listDirectory(String bucketName, String objectNamePrefix) throws IOException { checkArgument( - objectNamePrefix.endsWith("/"), + objectNamePrefix == null || objectNamePrefix.endsWith("/"), String.format("%s should end with /", objectNamePrefix)); try { @@ -971,7 +977,7 @@ GoogleCloudStorageItemInfo getFileOrDirectoryInfo(StorageResourceId resourceId) GoogleCloudStorageItemInfo getImplicitDirectory(StorageResourceId resourceId) { List blobs = new GcsListOperation 
.Builder(resourceId.getBucketName(), resourceId.getObjectName(), storage) - .forCurrentDirectoryListingWithLimit(1).build() + .forImplicitDirectoryCheck().build() .execute(); if (blobs.isEmpty()) { @@ -981,6 +987,34 @@ GoogleCloudStorageItemInfo getImplicitDirectory(StorageResourceId resourceId) { return GoogleCloudStorageItemInfo.createInferredDirectory(resourceId.toDirectoryId()); } + public void deleteBuckets(List bucketNames) throws IOException { + LOG.trace("deleteBuckets({})", bucketNames); + + // Validate all the inputs first. + for (String bucketName : bucketNames) { + checkArgument(!Strings.isNullOrEmpty(bucketName), "bucketName must not be null or empty"); + } + + // Gather exceptions to wrap in a composite exception at the end. + List innerExceptions = new ArrayList<>(); + + for (String bucketName : bucketNames) { + try { + boolean isDeleted = storage.delete(bucketName); + if (!isDeleted) { + innerExceptions.add(createFileNotFoundException(bucketName, null, null)); + } + } catch (StorageException e) { + innerExceptions.add( + new IOException(String.format("Error deleting '%s' bucket", bucketName), e)); + } + } + + if (!innerExceptions.isEmpty()) { + throw GoogleCloudStorageExceptions.createCompositeException(innerExceptions); + } + } + // Helper class to capture the results of list operation. 
private class ListOperationResult { private final Map prefixes = new HashMap<>(); diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorageExceptions.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorageExceptions.java index 95f0e41617c74..db7ffa7eb9513 100644 --- a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorageExceptions.java +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorageExceptions.java @@ -18,12 +18,16 @@ package org.apache.hadoop.fs.gs; +import org.apache.hadoop.thirdparty.com.google.common.base.Preconditions; + import static org.apache.hadoop.thirdparty.com.google.common.base.Preconditions.checkArgument; import static org.apache.hadoop.thirdparty.com.google.common.base.Strings.isNullOrEmpty; import static org.apache.hadoop.thirdparty.com.google.common.base.Strings.nullToEmpty; import java.io.FileNotFoundException; import java.io.IOException; +import java.util.Collection; +import java.util.Iterator; import javax.annotation.Nullable; /** @@ -55,4 +59,23 @@ static FileNotFoundException createFileNotFoundException( return createFileNotFoundException( resourceId.getBucketName(), resourceId.getObjectName(), cause); } + + public static IOException createCompositeException(Collection innerExceptions) { + Preconditions.checkArgument( + innerExceptions != null && !innerExceptions.isEmpty(), + "innerExceptions (%s) must be not null and contain at least one element", + innerExceptions); + + Iterator innerExceptionIterator = innerExceptions.iterator(); + + if (innerExceptions.size() == 1) { + return innerExceptionIterator.next(); + } + + IOException combined = new IOException("Multiple IOExceptions."); + while (innerExceptionIterator.hasNext()) { + combined.addSuppressed(innerExceptionIterator.next()); + } + return combined; + } } diff --git 
a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorageFileSystem.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorageFileSystem.java index 8cf11d009c8ed..7da6a83c417e0 100644 --- a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorageFileSystem.java +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleCloudStorageFileSystem.java @@ -55,7 +55,7 @@ * Provides FS semantics over GCS based on Objects API. */ class GoogleCloudStorageFileSystem { - private static final Logger LOG = LoggerFactory.getLogger(StorageResourceId.class); + private static final Logger LOG = LoggerFactory.getLogger(GoogleCloudStorageFileSystem.class); // Comparator used for sorting paths. // // For some bulk operations, we need to operate on parent directories before @@ -93,7 +93,7 @@ private static GoogleCloudStorage createCloudStorage( throws IOException { checkNotNull(configuration, "configuration must not be null"); - return new GoogleCloudStorage(configuration); + return new GoogleCloudStorage(configuration, credentials); } GoogleCloudStorageFileSystem(final GoogleHadoopFileSystemConfiguration configuration, @@ -330,12 +330,19 @@ private void deleteObjects(List itemsToDelete) throws IOException { } private void deleteBucket(List bucketsToDelete) throws IOException { - if (bucketsToDelete == null || bucketsToDelete.isEmpty()) { - return; - } + if (!bucketsToDelete.isEmpty()) { + List bucketNames = new ArrayList<>(bucketsToDelete.size()); + for (FileInfo bucketInfo : bucketsToDelete) { + bucketNames.add(bucketInfo.getItemInfo().getResourceId().getBucketName()); + } - // TODO: Add support for deleting bucket - throw new UnsupportedOperationException("deleteBucket is not supported."); + if (configuration.isBucketDeleteEnabled()) { + gcs.deleteBuckets(bucketNames); + } else { + LOG.info("Skipping deletion of buckets because enableBucketDelete is false: {}", + bucketNames); + } + } } 
FileInfo getFileInfoObject(URI path) throws IOException { diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleHadoopFSInputStream.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleHadoopFSInputStream.java index 26629fc79b27e..79ffbcc6f3926 100644 --- a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleHadoopFSInputStream.java +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleHadoopFSInputStream.java @@ -72,7 +72,7 @@ private GoogleHadoopFSInputStream( URI gcsPath, SeekableByteChannel channel, FileSystem.Statistics statistics) { - LOG.trace("GoogleHadoopFSInputStream(gcsPath: %s)", gcsPath); + LOG.trace("GoogleHadoopFSInputStream(gcsPath: {})", gcsPath); this.gcsPath = gcsPath; this.channel = channel; this.statistics = statistics; diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleHadoopFileSystem.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleHadoopFileSystem.java index 3c4e84ce1eb8c..172d134758247 100644 --- a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleHadoopFileSystem.java +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleHadoopFileSystem.java @@ -19,15 +19,19 @@ package org.apache.hadoop.fs.gs; import static org.apache.hadoop.thirdparty.com.google.common.collect.ImmutableList.toImmutableList; +import static java.util.Objects.requireNonNull; import static org.apache.hadoop.fs.gs.Constants.GCS_CONFIG_PREFIX; import static org.apache.hadoop.fs.gs.GoogleHadoopFileSystemConfiguration.GCS_WORKING_DIRECTORY; +import static org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.trackDuration; import static org.apache.hadoop.thirdparty.com.google.common.base.Preconditions.checkArgument; import static org.apache.hadoop.thirdparty.com.google.common.base.Preconditions.checkNotNull; import static org.apache.hadoop.thirdparty.com.google.common.base.Preconditions.checkState; 
import static org.apache.hadoop.thirdparty.com.google.common.base.Strings.isNullOrEmpty; import com.google.auth.oauth2.GoogleCredentials; +import org.apache.hadoop.fs.statistics.IOStatistics; +import org.apache.hadoop.fs.statistics.IOStatisticsSource; import org.apache.hadoop.thirdparty.com.google.common.base.Ascii; import java.io.FileNotFoundException; @@ -48,6 +52,7 @@ import org.apache.hadoop.thirdparty.com.google.common.base.Preconditions; import org.apache.hadoop.thirdparty.com.google.common.collect.Lists; +import org.apache.hadoop.util.functional.CallableRaisingIOE; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -62,7 +67,7 @@ * particular, it is not subject to bucket-naming constraints, and files are allowed to be placed in * root. */ -public class GoogleHadoopFileSystem extends FileSystem { +public class GoogleHadoopFileSystem extends FileSystem implements IOStatisticsSource { public static final Logger LOG = LoggerFactory.getLogger(GoogleHadoopFileSystem.class); @@ -90,7 +95,7 @@ public class GoogleHadoopFileSystem extends FileSystem { * allow modifying or querying the value. Modifying this value allows one to control how many * mappers are used to process a given file. */ - private long defaultBlockSize = GoogleHadoopFileSystemConfiguration.BLOCK_SIZE.getDefault(); + private final long defaultBlockSize = GoogleHadoopFileSystemConfiguration.BLOCK_SIZE.getDefault(); // The bucket the file system is rooted in used for default values of: // -- working directory @@ -106,7 +111,21 @@ public class GoogleHadoopFileSystem extends FileSystem { private boolean isClosed; private FsPermission reportedPermissions; - public GoogleHadoopFileSystemConfiguration getFileSystemConfiguration() { + /** + * Setting this to static inorder to have a singleton instance. This will help us get the JVM + * level metrics. Note that we use this to generate Global Storage Statistics. 
If we make this + * an instance field, only the first filesystem instance metrics will be updated since while + * initializing GlobalStorageStatistics (refer initialize()) only the first instance will be + * registered. + * + * For filesystem instance level instrumentation, one more per instance object can be created + * and both be updated. + */ + private static GcsInstrumentation instrumentation = new GcsInstrumentation(); + private GcsStorageStatistics storageStatistics; + + + GoogleHadoopFileSystemConfiguration getFileSystemConfiguration() { return fileSystemConfiguration; } @@ -132,6 +151,9 @@ public void initialize(final URI path, Configuration config) throws IOException // be sufficient (and is required) for the delegation token binding initialization. setConf(config); + storageStatistics = createStorageStatistics( + requireNonNull(getIOStatistics())); + this.reportedPermissions = new FsPermission(PERMISSIONS_TO_REPORT); initializeFsRoot(); @@ -141,6 +163,12 @@ public void initialize(final URI path, Configuration config) throws IOException initializeGcsFs(fileSystemConfiguration); } + private static GcsStorageStatistics createStorageStatistics( + final IOStatistics ioStatistics) { + return (GcsStorageStatistics) GlobalStorageStatistics.INSTANCE + .put(GcsStorageStatistics.NAME, () -> new GcsStorageStatistics(ioStatistics)); + } + private void initializeFsRoot() { String rootBucket = initUri.getAuthority(); checkArgument(rootBucket != null, "No bucket specified in GCS URI: {}", initUri); @@ -180,9 +208,9 @@ private GoogleCredentials getCredentials(GoogleHadoopFileSystemConfiguration con return getCredentials(config, GCS_CONFIG_PREFIX); } - public static GoogleCredentials getCredentials(GoogleHadoopFileSystemConfiguration config, + static GoogleCredentials getCredentials(GoogleHadoopFileSystemConfiguration config, String... 
keyPrefixesVararg) throws IOException { - return GoogleCredentials.getApplicationDefault(); // TODO: Add other Auth mechanisms + return HadoopCredentialsConfiguration.getCredentials(config.getConfig(), keyPrefixesVararg); } @Override @@ -194,7 +222,7 @@ protected void checkPath(final Path path) { String scheme = uri.getScheme(); if (scheme != null && !scheme.equalsIgnoreCase(getScheme())) { throw new IllegalArgumentException( - String.format("Wrong scheme: {}, in path: {}, expected scheme: {}", scheme, path, + String.format("Wrong scheme: %s, in path: %s, expected scheme: %s", scheme, path, getScheme())); } @@ -207,7 +235,7 @@ protected void checkPath(final Path path) { } throw new IllegalArgumentException( - String.format("Wrong bucket: {}, in path: {}, expected bucket: {}", bucket, path, + String.format("Wrong bucket: %s, in path: %s, expected bucket: %s", bucket, path, rootBucket)); } @@ -270,30 +298,40 @@ public String getScheme() { @Override public FSDataInputStream open(final Path hadoopPath, final int bufferSize) throws IOException { - LOG.trace("open({})", hadoopPath); - URI gcsPath = getGcsPath(hadoopPath); - return new FSDataInputStream(GoogleHadoopFSInputStream.create(this, gcsPath, statistics)); + return runOperation( + GcsStatistics.INVOCATION_OPEN, + () -> { + LOG.trace("open({})", hadoopPath); + URI gcsPath = getGcsPath(hadoopPath); + return new FSDataInputStream(GoogleHadoopFSInputStream.create(this, gcsPath, statistics)); + }, + String.format("open(%s)", hadoopPath)); } @Override - public FSDataOutputStream create(Path hadoopPath, FsPermission permission, boolean overwrite, - int bufferSize, short replication, long blockSize, Progressable progress) throws IOException { - checkArgument(hadoopPath != null, "hadoopPath must not be null"); - checkArgument(replication > 0, "replication must be a positive integer: %s", replication); - checkArgument(blockSize > 0, "blockSize must be a positive integer: %s", blockSize); + public FSDataOutputStream create( 
+ Path hadoopPath, FsPermission permission, boolean overwrite, int bufferSize, + short replication, long blockSize, Progressable progress) throws IOException { + return runOperation( + GcsStatistics.INVOCATION_CREATE, + () -> { + checkArgument(hadoopPath != null, "hadoopPath must not be null"); + checkArgument(replication > 0, "replication must be a positive integer: %s", replication); + checkArgument(blockSize > 0, "blockSize must be a positive integer: %s", blockSize); - checkOpen(); + checkOpen(); - LOG.trace("create(hadoopPath: {}, overwrite: {}, bufferSize: {} [ignored])", hadoopPath, - overwrite, bufferSize); + LOG.trace("create(hadoopPath: {}, overwrite: {}, bufferSize: {} [ignored])", hadoopPath, + overwrite, bufferSize); - CreateFileOptions.WriteMode writeMode = - overwrite ? CreateFileOptions.WriteMode.OVERWRITE : CreateFileOptions.WriteMode.CREATE_NEW; - FSDataOutputStream response = new FSDataOutputStream( - new GoogleHadoopOutputStream(this, getGcsPath(hadoopPath), - CreateFileOptions.builder().setWriteMode(writeMode).build(), statistics), statistics); + CreateFileOptions.WriteMode writeMode = overwrite ? 
+ CreateFileOptions.WriteMode.OVERWRITE : CreateFileOptions.WriteMode.CREATE_NEW; - return response; + CreateFileOptions fileOptions = CreateFileOptions.builder().setWriteMode(writeMode).build(); + return new FSDataOutputStream(new GoogleHadoopOutputStream( + this, getGcsPath(hadoopPath), fileOptions, statistics), statistics); + }, + String.format("create(%s, %s)", hadoopPath, overwrite)); } @Override @@ -386,91 +424,112 @@ public void concat(Path tgt, Path[] srcs) throws IOException { @Override public boolean rename(final Path src, final Path dst) throws IOException { - LOG.trace("rename({}, {})", src, dst); - - checkArgument(src != null, "src must not be null"); - checkArgument(dst != null, "dst must not be null"); - - // Even though the underlying GCSFS will also throw an IAE if src is root, since our filesystem - // root happens to equal the global root, we want to explicitly check it here since derived - // classes may not have filesystem roots equal to the global root. - if (this.makeQualified(src).equals(fsRoot)) { - LOG.trace("rename(src: {}, dst: {}): false [src is a root]", src, dst); - return false; - } - - try { - checkOpen(); - - URI srcPath = getGcsPath(src); - URI dstPath = getGcsPath(dst); - getGcsFs().rename(srcPath, dstPath); - - LOG.trace("rename(src: {}, dst: {}): true", src, dst); - } catch (IOException e) { - if (ApiErrorExtractor.INSTANCE.requestFailure(e)) { - throw e; - } - LOG.trace("rename(src: %s, dst: %s): false [failed]", src, dst, e); - return false; - } - - return true; + return runOperation(GcsStatistics.INVOCATION_RENAME, + () -> { + LOG.trace("rename({}, {})", src, dst); + + checkArgument(src != null, "src must not be null"); + checkArgument(dst != null, "dst must not be null"); + + // Even though the underlying GCSFS will also throw an IAE if src is root, since our + // filesystem root happens to equal the global root, we want to explicitly check it + // here since derived classes may not have filesystem roots equal to the global 
root. + if (this.makeQualified(src).equals(fsRoot)) { + LOG.trace("rename(src: {}, dst: {}): false [src is a root]", src, dst); + return false; + } + + try { + checkOpen(); + + URI srcPath = getGcsPath(src); + URI dstPath = getGcsPath(dst); + getGcsFs().rename(srcPath, dstPath); + + LOG.trace("rename(src: {}, dst: {}): true", src, dst); + } catch (IOException e) { + if (ApiErrorExtractor.INSTANCE.requestFailure(e)) { + throw e; + } + LOG.trace("rename(src: {}, dst: {}): false [failed]", src, dst, e); + return false; + } + + return true; + }, + String.format("rename(%s, %s)", src, dst)); } @Override public boolean delete(final Path hadoopPath, final boolean recursive) throws IOException { - LOG.trace("delete({}, {})", hadoopPath, recursive); - checkArgument(hadoopPath != null, "hadoopPath must not be null"); + return runOperation(GcsStatistics.INVOCATION_DELETE, + () -> { + LOG.trace("delete({}, {})", hadoopPath, recursive); + checkArgument(hadoopPath != null, "hadoopPath must not be null"); - checkOpen(); + checkOpen(); - URI gcsPath = getGcsPath(hadoopPath); - try { - getGcsFs().delete(gcsPath, recursive); - } catch (DirectoryNotEmptyException e) { - throw e; - } catch (IOException e) { - if (ApiErrorExtractor.INSTANCE.requestFailure(e)) { - throw e; - } + URI gcsPath = getGcsPath(hadoopPath); + try { + getGcsFs().delete(gcsPath, recursive); + } catch (DirectoryNotEmptyException e) { + throw e; + } catch (IOException e) { + if (ApiErrorExtractor.INSTANCE.requestFailure(e)) { + throw e; + } - LOG.trace("delete(hadoopPath: {}, recursive: {}): false [failed]", hadoopPath, recursive, e); - return false; - } + LOG.trace("delete(hadoopPath: {}, recursive: {}): false [failed]", + hadoopPath, recursive, e); + return false; + } + + LOG.trace("delete(hadoopPath: {}, recursive: {}): true", + hadoopPath, recursive); + return true; + }, + String.format("delete(%s,%s", hadoopPath, recursive)); + } - LOG.trace("delete(hadoopPath: %s, recursive: %b): true", hadoopPath, 
recursive); - return true; + private B runOperation(GcsStatistics stat, CallableRaisingIOE operation, String context) + throws IOException { + LOG.trace("{}({})", stat, context); + return trackDuration(instrumentation.getIOStatistics(), stat.getSymbol(), operation); } @Override public FileStatus[] listStatus(final Path hadoopPath) throws IOException { - checkArgument(hadoopPath != null, "hadoopPath must not be null"); - - checkOpen(); - - LOG.trace("listStatus(hadoopPath: {})", hadoopPath); - - URI gcsPath = getGcsPath(hadoopPath); - List status; - - try { - List fileInfos = getGcsFs().listDirectory(gcsPath); - status = new ArrayList<>(fileInfos.size()); - String userName = getUgiUserName(); - for (FileInfo fileInfo : fileInfos) { - status.add(getFileStatus(fileInfo, userName)); - } - } catch (FileNotFoundException fnfe) { - throw (FileNotFoundException) - new FileNotFoundException( - String.format( - "listStatus(hadoopPath: %s): '%s' does not exist.", - hadoopPath, gcsPath)) - .initCause(fnfe); - } - - return status.toArray(new FileStatus[0]); + return runOperation( + GcsStatistics.INVOCATION_LIST_STATUS, + () -> { + checkArgument(hadoopPath != null, "hadoopPath must not be null"); + + checkOpen(); + + LOG.trace("listStatus(hadoopPath: {})", hadoopPath); + + URI gcsPath = getGcsPath(hadoopPath); + List status; + + try { + List fileInfos = getGcsFs().listDirectory(gcsPath); + status = new ArrayList<>(fileInfos.size()); + String userName = getUgiUserName(); + for (FileInfo fileInfo : fileInfos) { + status.add(getFileStatus(fileInfo, userName)); + } + } catch (FileNotFoundException fnfe) { + throw (FileNotFoundException) + new FileNotFoundException( + String.format( + "listStatus(hadoopPath: %s): '%s' does not exist.", + hadoopPath, gcsPath)) + .initCause(fnfe); + } + + return status.toArray(new FileStatus[0]); + }, + String.format("listStatus(%s", hadoopPath)); } /** @@ -522,7 +581,7 @@ public URI getUri() { @Override protected int getDefaultPort() { int result = 
-1; - LOG.trace("getDefaultPort(): %d", result); + LOG.trace("getDefaultPort(): {}", result); return result; } @@ -553,43 +612,53 @@ public Path getWorkingDirectory() { @Override public boolean mkdirs(final Path hadoopPath, final FsPermission permission) throws IOException { - checkArgument(hadoopPath != null, "hadoopPath must not be null"); + return runOperation( + GcsStatistics.INVOCATION_MKDIRS, + () -> { + checkArgument(hadoopPath != null, "hadoopPath must not be null"); - LOG.trace("mkdirs(hadoopPath: {}, permission: {}): true", hadoopPath, permission); - - checkOpen(); - - URI gcsPath = getGcsPath(hadoopPath); - try { - getGcsFs().mkdirs(gcsPath); - } catch (java.nio.file.FileAlreadyExistsException faee) { - // Need to convert to the Hadoop flavor of FileAlreadyExistsException. - throw (FileAlreadyExistsException) - new FileAlreadyExistsException( - String.format( - "mkdirs(hadoopPath: %s, permission: %s): failed", - hadoopPath, permission)) - .initCause(faee); - } + LOG.trace("mkdirs(hadoopPath: {}, permission: {}): true", hadoopPath, permission); - return true; + checkOpen(); + + URI gcsPath = getGcsPath(hadoopPath); + try { + getGcsFs().mkdirs(gcsPath); + } catch (java.nio.file.FileAlreadyExistsException faee) { + // Need to convert to the Hadoop flavor of FileAlreadyExistsException. + throw (FileAlreadyExistsException) + new FileAlreadyExistsException( + String.format( + "mkdirs(hadoopPath: %s, permission: %s): failed", + hadoopPath, permission)) + .initCause(faee); + } + + return true; + }, + String.format("mkdirs(%s)", hadoopPath)); } @Override public FileStatus getFileStatus(final Path path) throws IOException { - checkArgument(path != null, "path must not be null"); - - checkOpen(); - - URI gcsPath = getGcsPath(path); - FileInfo fileInfo = getGcsFs().getFileInfo(gcsPath); - if (!fileInfo.exists()) { - throw new FileNotFoundException( - String.format( - "%s not found: %s", fileInfo.isDirectory() ? 
"Directory" : "File", path)); - } - String userName = getUgiUserName(); - return getFileStatus(fileInfo, userName); + return runOperation( + GcsStatistics.INVOCATION_GET_FILE_STATUS, + () -> { + checkArgument(path != null, "path must not be null"); + + checkOpen(); + + URI gcsPath = getGcsPath(path); + FileInfo fileInfo = getGcsFs().getFileInfo(gcsPath); + if (!fileInfo.exists()) { + throw new FileNotFoundException( + String.format( + "%s not found: %s", fileInfo.isDirectory() ? "Directory" : "File", path)); + } + String userName = getUgiUserName(); + return getFileStatus(fileInfo, userName); + }, + String.format("getFileStatus(%s)", path)); } /** @@ -661,6 +730,26 @@ public void setWorkingDirectory(final Path hadoopPath) { LOG.trace("setWorkingDirectory(hadoopPath: {}): {}", hadoopPath, workingDirectory); } + /** + * Get the instrumentation's IOStatistics. + * @return statistics + */ + @Override + public IOStatistics getIOStatistics() { + return instrumentation != null + ? instrumentation.getIOStatistics() + : null; + } + + /** + * Get the storage statistics of this filesystem. 
+ * @return the storage statistics + */ + @Override + public GcsStorageStatistics getStorageStatistics() { + return this.storageStatistics; + } + private static String getUgiUserName() throws IOException { UserGroupInformation ugi = UserGroupInformation.getCurrentUser(); return ugi.getShortUserName(); @@ -684,4 +773,4 @@ private FileStatus getFileStatus(FileInfo fileInfo, String userName) { LOG.trace("FileStatus(path: {}, userName: {}): {}", fileInfo.getPath(), userName, status); return status; } -} \ No newline at end of file +} diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleHadoopFileSystemConfiguration.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleHadoopFileSystemConfiguration.java index 4097b5e1f839f..9d48747ad582f 100644 --- a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleHadoopFileSystemConfiguration.java +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleHadoopFileSystemConfiguration.java @@ -130,6 +130,19 @@ class GoogleHadoopFileSystemConfiguration { "fs.gs.outputstream.sync.min.interval", 0L); + /** + * If true, recursive delete on a path that refers to a GCS bucket itself ('/' for any + * bucket-rooted GoogleHadoopFileSystem) or delete on that path when it's empty will result in + * fully deleting the GCS bucket. If false, any operation that normally would have deleted the + * bucket will be ignored instead. Setting to 'false' preserves the typical behavior of "rm -rf /" + * which translates to deleting everything inside of root, but without clobbering the filesystem + * authority corresponding to that root path in the process. 
+ */ + static final HadoopConfigurationProperty GCE_BUCKET_DELETE_ENABLE = + new HadoopConfigurationProperty<>( + "fs.gs.bucket.delete.enable", + false); + private final String workingDirectory; private final String projectId; private final Configuration config; @@ -169,31 +182,31 @@ long getInplaceSeekLimit() { return GCS_INPUT_STREAM_INPLACE_SEEK_LIMIT.get(config, config::getLongBytes); } - public int getFadviseRequestTrackCount() { + int getFadviseRequestTrackCount() { return GCS_FADVISE_REQUEST_TRACK_COUNT.get(config, config::getInt); } - public boolean isGzipEncodingSupportEnabled() { + boolean isGzipEncodingSupportEnabled() { return GCS_INPUT_STREAM_SUPPORT_GZIP_ENCODING_ENABLE.get(config, config::getBoolean); } - public long getMinRangeRequestSize() { + long getMinRangeRequestSize() { return GCS_INPUT_STREAM_MIN_RANGE_REQUEST_SIZE.get(config, config::getLongBytes); } - public long getBlockSize() { + long getBlockSize() { return BLOCK_SIZE.get(config, config::getLong); } - public boolean isReadExactRequestedBytesEnabled() { + boolean isReadExactRequestedBytesEnabled() { return false; //TODO: Remove this option? 
} - public long getMaxRewriteChunkSize() { + long getMaxRewriteChunkSize() { return GCS_REWRITE_MAX_CHUNK_SIZE.get(config, config::getLong); } - public Pattern getMarkerFilePattern() { + Pattern getMarkerFilePattern() { String pattern = GCS_MARKER_FILE_PATTERN.get(config, config::get); if (pattern == null) { return null; @@ -207,11 +220,19 @@ public Pattern getMarkerFilePattern() { return fileMarkerFilePattern; } - public boolean isEnsureNoConflictingItems() { + boolean isEnsureNoConflictingItems() { return GCS_CREATE_ITEMS_CONFLICT_CHECK_ENABLE.get(config, config::getBoolean); } - public Duration getMinSyncInterval() { + Duration getMinSyncInterval() { return GCS_OUTPUT_STREAM_SYNC_MIN_INTERVAL.getTimeDuration(config); } + + Configuration getConfig() { + return config; + } + + boolean isBucketDeleteEnabled() { + return GCE_BUCKET_DELETE_ENABLE.get(config, config::getBoolean); + } } diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleHadoopOutputStream.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleHadoopOutputStream.java index c41ce13edaeca..e1a87915a0432 100644 --- a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleHadoopOutputStream.java +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/GoogleHadoopOutputStream.java @@ -54,7 +54,7 @@ class GoogleHadoopOutputStream extends OutputStream implements StreamCapabilities, Syncable { - private static final Logger LOG = LoggerFactory.getLogger(StorageResourceId.class); + private static final Logger LOG = LoggerFactory.getLogger(GoogleHadoopOutputStream.class); // Prefix used for all temporary files created by this stream. 
private static final String TMP_FILE_PREFIX = "_GHFS_SYNC_TMP_FILE_"; diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/HadoopConfigurationProperty.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/HadoopConfigurationProperty.java index 450459e6a8dbc..71bd729f62e53 100644 --- a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/HadoopConfigurationProperty.java +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/HadoopConfigurationProperty.java @@ -20,6 +20,7 @@ import org.apache.hadoop.thirdparty.com.google.common.collect.ImmutableList; +import java.io.IOException; import java.time.Duration; import java.util.List; import java.util.function.BiFunction; @@ -29,6 +30,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import static org.apache.hadoop.thirdparty.com.google.common.base.Preconditions.checkState; import static java.util.concurrent.TimeUnit.MILLISECONDS; /** @@ -74,6 +76,25 @@ Duration getTimeDuration(Configuration config) { lookupKey, Duration.ofMillis(config.getTimeDuration(lookupKey, defValStr, MILLISECONDS))); } + HadoopConfigurationProperty withPrefixes(List prefixes) { + this.keyPrefixes = ImmutableList.copyOf(prefixes); + return this; + } + + RedactedString getPassword(Configuration config) { + checkState(defaultValue == null || defaultValue instanceof String, "Not a string property"); + String lookupKey = getLookupKey(config, key, (c, k) -> c.get(k) != null); + char[] value; + try { + value = config.getPassword(lookupKey); + } catch (IOException e) { + throw new RuntimeException(e); + } + return logProperty( + lookupKey, + RedactedString.create(value == null ? 
(String) defaultValue : String.valueOf(value))); + } + private String getLookupKey(Configuration config, String lookupKey, BiFunction checkFn) { for (String prefix : keyPrefixes) { diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/HadoopCredentialsConfiguration.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/HadoopCredentialsConfiguration.java new file mode 100644 index 0000000000000..a0a36b7bfefbf --- /dev/null +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/HadoopCredentialsConfiguration.java @@ -0,0 +1,213 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.fs.gs; + +import com.google.auth.oauth2.ComputeEngineCredentials; +import com.google.auth.oauth2.ExternalAccountCredentials; +import com.google.auth.oauth2.GoogleCredentials; +import com.google.auth.oauth2.ServiceAccountCredentials; +import com.google.auth.oauth2.UserCredentials; +import org.apache.hadoop.thirdparty.com.google.common.annotations.VisibleForTesting; +import org.apache.hadoop.thirdparty.com.google.common.base.Strings; +import org.apache.hadoop.thirdparty.com.google.common.collect.ImmutableList; +import java.io.FileInputStream; +import java.io.IOException; +import java.net.URI; +import java.util.List; +import org.apache.hadoop.conf.Configuration; + +/** + * The Hadoop credentials configuration. + * + *

When reading configuration this class makes use of a list of key prefixes that are each + * applied to key suffixes to create a complete configuration key. There is a base prefix of + * 'google.cloud.' that is included by the builder for each configuration key suffix. When + * constructing, other prefixes can be specified. Prefixes specified later can be used to override + * the values of previously set values. In this way a set of global credentials can be specified for + * most connectors with an override specified for any connectors that need different credentials. + */ +final class HadoopCredentialsConfiguration { + + /** + * All instances constructed using the builder will use {@code google.cloud} as the first prefix + * checked. Other prefixes can be added and will override values in the {@code google.cloud} + * prefix. + */ + private static final String BASE_KEY_PREFIX = "google.cloud"; + private static final String CLOUD_PLATFORM_SCOPE = + "https://www.googleapis.com/auth/cloud-platform"; + /** Key suffix used to configure authentication type. */ + private static final HadoopConfigurationProperty AUTHENTICATION_TYPE_SUFFIX = + new HadoopConfigurationProperty<>(".auth.type", AuthenticationType.COMPUTE_ENGINE); + /** + * Key suffix used to configure the path to a JSON file containing a Service Account key and + * identifier (email). Technically, this could be a JSON containing a non-service account user, + * but this setting is only used in the service account flow and is namespaced as such. + */ + private static final HadoopConfigurationProperty SERVICE_ACCOUNT_JSON_KEYFILE_SUFFIX = + new HadoopConfigurationProperty<>(".auth.service.account.json.keyfile"); + /** + * Key suffix used to configure the path to a JSON file containing a workload identity federation, + * i.e. external account credential configuration. Technically, this could be a JSON containing an + * service account impersonation url and credential source. 
but this setting is only used in the + * workload identity federation flow and is namespaced as such. + */ + private static final HadoopConfigurationProperty + WORKLOAD_IDENTITY_FEDERATION_CREDENTIAL_CONFIG_FILE_SUFFIX = + new HadoopConfigurationProperty<>( + ".auth.workload.identity.federation.credential.config.file"); + + /** Key suffix for setting a token server URL to use to refresh OAuth token. */ + private static final HadoopConfigurationProperty TOKEN_SERVER_URL_SUFFIX = + new HadoopConfigurationProperty<>(".token.server.url"); + + private static final HadoopConfigurationProperty READ_TIMEOUT_SUFFIX = + new HadoopConfigurationProperty<>(".http.read-timeout", 5_000L); + /** + * Configuration key for defining the OAUth2 client ID. Required when the authentication type is + * USER_CREDENTIALS + */ + private static final HadoopConfigurationProperty AUTH_CLIENT_ID_SUFFIX = + new HadoopConfigurationProperty<>(".auth.client.id"); + /** + * Configuration key for defining the OAUth2 client secret. Required when the authentication type + * is USER_CREDENTIALS + */ + private static final HadoopConfigurationProperty AUTH_CLIENT_SECRET_SUFFIX = + new HadoopConfigurationProperty<>(".auth.client.secret"); + /** + * Configuration key for defining the OAuth2 refresh token. Required when the authentication type + * is USER_CREDENTIALS + */ + private static final HadoopConfigurationProperty AUTH_REFRESH_TOKEN_SUFFIX = + new HadoopConfigurationProperty<>(".auth.refresh.token"); + + private HadoopCredentialsConfiguration() {} + + /** + * Returns full list of config prefixes that will be resolved based on the order in returned list. + */ + static List getConfigKeyPrefixes(String... keyPrefixes) { + return ImmutableList.builder().add(keyPrefixes).add(BASE_KEY_PREFIX).build(); + } + + /** + * Get the credentials for the configured {@link AuthenticationType}. + * + * @throws IllegalStateException if configured {@link AuthenticationType} is not recognized. 
+ */ + static GoogleCredentials getCredentials(Configuration config, String... keyPrefixesVararg) + throws IOException { + List keyPrefixes = getConfigKeyPrefixes(keyPrefixesVararg); + return getCredentials(config, keyPrefixes); + } + + @VisibleForTesting + static GoogleCredentials getCredentials(Configuration config, List keyPrefixes) + throws IOException { + GoogleCredentials credentials = getCredentialsInternal(config, keyPrefixes); + return credentials == null ? null : configureCredentials(config, keyPrefixes, credentials); + } + + private static GoogleCredentials getCredentialsInternal( + Configuration config, List keyPrefixes) throws IOException { + AuthenticationType authenticationType = + AUTHENTICATION_TYPE_SUFFIX.withPrefixes(keyPrefixes).get(config, config::getEnum); + switch (authenticationType) { + case APPLICATION_DEFAULT: + return GoogleCredentials.getApplicationDefault(); + case COMPUTE_ENGINE: + return ComputeEngineCredentials.newBuilder().build(); + case SERVICE_ACCOUNT_JSON_KEYFILE: + String keyFile = SERVICE_ACCOUNT_JSON_KEYFILE_SUFFIX + .withPrefixes(keyPrefixes).get(config, config::get); + + if (Strings.isNullOrEmpty(keyFile)) { + throw new IllegalArgumentException(String.format( + "Missing keyfile property ('%s') for authentication type '%s'", + SERVICE_ACCOUNT_JSON_KEYFILE_SUFFIX.getKey(), + authenticationType)); + } + + try (FileInputStream fis = new FileInputStream(keyFile)) { + return ServiceAccountCredentials.fromStream(fis); + } + case USER_CREDENTIALS: + String clientId = AUTH_CLIENT_ID_SUFFIX.withPrefixes(keyPrefixes).get(config, config::get); + RedactedString clientSecret = + AUTH_CLIENT_SECRET_SUFFIX.withPrefixes(keyPrefixes).getPassword(config); + RedactedString refreshToken = + AUTH_REFRESH_TOKEN_SUFFIX.withPrefixes(keyPrefixes).getPassword(config); + + return UserCredentials.newBuilder() + .setClientId(clientId) + .setClientSecret(clientSecret.getValue()) + .setRefreshToken(refreshToken.getValue()) + .build(); + + case 
WORKLOAD_IDENTITY_FEDERATION_CREDENTIAL_CONFIG_FILE: + String configFile = + WORKLOAD_IDENTITY_FEDERATION_CREDENTIAL_CONFIG_FILE_SUFFIX + .withPrefixes(keyPrefixes) + .get(config, config::get); + try (FileInputStream fis = new FileInputStream(configFile)) { + return ExternalAccountCredentials.fromStream(fis); + } + case UNAUTHENTICATED: + return null; + default: + throw new IllegalArgumentException("Unknown authentication type: " + authenticationType); + } + } + + private static GoogleCredentials configureCredentials( + Configuration config, List keyPrefixes, GoogleCredentials credentials) { + credentials = credentials.createScoped(CLOUD_PLATFORM_SCOPE); + String tokenServerUrl = + TOKEN_SERVER_URL_SUFFIX.withPrefixes(keyPrefixes).get(config, config::get); + if (tokenServerUrl == null) { + return credentials; + } + if (credentials instanceof ServiceAccountCredentials) { + return ((ServiceAccountCredentials) credentials) + .toBuilder().setTokenServerUri(URI.create(tokenServerUrl)).build(); + } + if (credentials instanceof UserCredentials) { + return ((UserCredentials) credentials) + .toBuilder().setTokenServerUri(URI.create(tokenServerUrl)).build(); + } + return credentials; + } + + /** Enumerates all supported authentication types. */ + public enum AuthenticationType { + /** Configures Application Default Credentials authentication. */ + APPLICATION_DEFAULT, + /** Configures Google Compute Engine service account authentication. */ + COMPUTE_ENGINE, + /** Configures JSON keyfile service account authentication. */ + SERVICE_ACCOUNT_JSON_KEYFILE, + /** Configures workload identity pool key file. */ + WORKLOAD_IDENTITY_FEDERATION_CREDENTIAL_CONFIG_FILE, + /** Configures unauthenticated access. */ + UNAUTHENTICATED, + /** Configures user credentials authentication. 
*/ + USER_CREDENTIALS, + } +} diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/RedactedString.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/RedactedString.java new file mode 100644 index 0000000000000..db2d49de697bb --- /dev/null +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/RedactedString.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.gs; + +import static org.apache.hadoop.thirdparty.com.google.common.base.Strings.isNullOrEmpty; + +import javax.annotation.Nullable; + +/** + * Holder class for string values that should not be logged and displayed when {@code toString} + * method called. For example, it should be used for credentials. + */ +class RedactedString { + + private final String value; + + RedactedString(String value) { + this.value = value; + } + + @Nullable + static RedactedString create(@Nullable String value) { + return isNullOrEmpty(value) ? 
null : new RedactedString(value); + } + + String getValue() { + return value; + } + + @Override + public final String toString() { + return ""; + } +} diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/StatisticTypeEnum.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/StatisticTypeEnum.java new file mode 100644 index 0000000000000..4c203e6b687ba --- /dev/null +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/StatisticTypeEnum.java @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.fs.gs; + +enum StatisticTypeEnum { + TYPE_COUNTER, TYPE_DURATION +} diff --git a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/StringPaths.java b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/StringPaths.java index 80682c3ed2a31..fb4449e517c0a 100644 --- a/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/StringPaths.java +++ b/hadoop-tools/hadoop-gcp/src/main/java/org/apache/hadoop/fs/gs/StringPaths.java @@ -30,7 +30,7 @@ */ final class StringPaths { - public static final Logger LOG = LoggerFactory.getLogger(StorageResourceId.class); + public static final Logger LOG = LoggerFactory.getLogger(StringPaths.class); private StringPaths() { } diff --git a/hadoop-tools/hadoop-gcp/src/site/markdown/tools/hadoop-gcp/Configuration.md b/hadoop-tools/hadoop-gcp/src/site/markdown/tools/hadoop-gcp/Configuration.md new file mode 100644 index 0000000000000..d91e69f63432e --- /dev/null +++ b/hadoop-tools/hadoop-gcp/src/site/markdown/tools/hadoop-gcp/Configuration.md @@ -0,0 +1,197 @@ + + +## Configuration properties + +### General configuration + +* `fs.gs.project.id` (not set by default) + + Google Cloud Project ID with access to Google Cloud Storage buckets. + Required only for list buckets and create bucket operations. + +* `fs.gs.working.dir` (default: `/`) + + The directory that relative `gs:` URIs are resolved against inside the default bucket. + +* `fs.gs.rewrite.max.chunk.size` (default: `512m`) + + Maximum size of object chunk that will be rewritten in a single rewrite + request when `fs.gs.copy.with.rewrite.enable` is set to `true`. + +* `fs.gs.bucket.delete.enable` (default: `false`) + + If `true`, recursive delete on a path that refers to a Cloud Storage bucket + itself or delete on that path when it is empty will result in deletion of + the bucket itself. If `false`, any operation that normally would have + deleted the bucket will be ignored.
Setting to `false` preserves the typical + behavior of `rm -rf /` which translates to deleting everything inside of + root, but without clobbering the filesystem authority corresponding to that + root path in the process. + +* `fs.gs.block.size` (default: `64m`) + + The reported block size of the file system. This does not change any + behavior of the connector or the underlying Google Cloud Storage objects. + However, it will affect the number of splits Hadoop MapReduce uses for a + given input. + +* `fs.gs.create.items.conflict.check.enable` (default: `true`) + + Enables a check that ensures that conflicting directories do not exist when + creating files and conflicting files do not exist when creating directories. + +* `fs.gs.marker.file.pattern` (not set by default) + + If set, files that match the specified pattern are copied last during a folder + rename operation. + +### Authentication + +* `fs.gs.auth.type` (default: `COMPUTE_ENGINE`) + + What type of authentication mechanism to use for Google Cloud Storage + access. + + Valid values: + + * `APPLICATION_DEFAULT` - configures + [Application Default Credentials](https://javadoc.io/doc/com.google.auth/google-auth-library-oauth2-http/latest/com/google/auth/oauth2/GoogleCredentials.html) + authentication + + * `COMPUTE_ENGINE` - configures Google Compute Engine service account + authentication + + * `SERVICE_ACCOUNT_JSON_KEYFILE` - configures JSON keyfile service account + authentication + + * `UNAUTHENTICATED` - configures unauthenticated access + + * `USER_CREDENTIALS` - configures [user credentials](#user-credentials) + +* `fs.gs.auth.service.account.json.keyfile` (not set by default) + + The path to the JSON keyfile for the service account when `fs.gs.auth.type` + property is set to `SERVICE_ACCOUNT_JSON_KEYFILE`.
The file must exist at + the same path on all nodes. + +#### User credentials + +User credentials allow you to access Google resources on behalf of a user, with +the permissions associated with that user. + +To achieve this the connector will use the +[refresh token grant flow](https://oauth.net/2/grant-types/refresh-token/) to +retrieve new access tokens when necessary. + +In order to use this authentication type, you will first need to retrieve a +refresh token using the +[authorization code grant flow](https://oauth.net/2/grant-types/authorization-code) +and pass it to the connector with OAuth client ID and secret: + +* `fs.gs.auth.client.id` (not set by default) + + The OAuth2 client ID. + +* `fs.gs.auth.client.secret` (not set by default) + + The OAuth2 client secret. + +* `fs.gs.auth.refresh.token` (not set by default) + + The refresh token. + +### IO configuration + +* `fs.gs.inputstream.support.gzip.encoding.enable` (default: `false`) + + If set to `false` then reading files with GZIP content encoding (HTTP header + `Content-Encoding: gzip`) will result in failure (`IOException` is thrown). + + This feature is disabled by default because processing of + [GZIP encoded](https://cloud.google.com/storage/docs/transcoding#decompressive_transcoding) + files is inefficient and error-prone in Hadoop and Spark. + +* `fs.gs.outputstream.buffer.size` (default: `8m`) + + Write buffer size used by the file system API to send the data to be + uploaded to Cloud Storage upload thread via pipes. The various pipe types + are documented below. + +* `fs.gs.outputstream.sync.min.interval` (default: `0`) + + Output stream configuration that controls the minimum interval between + consecutive syncs. This helps avoid getting rate-limited by Google Cloud + Storage. Default is `0` - no wait between syncs. Note that `hflush()` will + be a no-op if called more frequently than the minimum sync interval and `hsync()` + will block until the end of the minimum sync interval.
+ +### Fadvise feature configuration + +* `fs.gs.inputstream.fadvise` (default: `AUTO`) + + Tunes object reading behavior to optimize HTTP GET requests for various use + cases. + + This property controls the fadvise feature that allows reading objects in + different modes: + + * `SEQUENTIAL` - in this mode connector sends a single streaming + (unbounded) Cloud Storage request to read object from a specified + position sequentially. + + * `RANDOM` - in this mode connector will send bounded Cloud Storage range + requests (specified through HTTP Range header) which are more efficient + in some cases (e.g. reading objects in row-columnar file formats like + ORC, Parquet, etc). + + Range request size is limited by whatever is greater, `fs.gs.io.buffer` + or read buffer size passed by a client. + + To avoid sending too small range requests (couple bytes) - could happen + if `fs.gs.io.buffer` is 0 and client passes very small read buffer, + minimum range request size is limited to 2 MB by default configurable + through `fs.gs.inputstream.min.range.request.size` property + + * `AUTO` - in this mode (adaptive range reads) connector starts to send + bounded range requests when reading non gzip-encoded objects instead of + streaming requests as soon as first backward read or forward read for + more than `fs.gs.inputstream.inplace.seek.limit` bytes was detected. + + * `AUTO_RANDOM` - It complements the `AUTO` mode which uses sequential + mode to start with and adapts to bounded range requests. `AUTO_RANDOM` + mode uses bounded channel initially and adapts to sequential requests if + consecutive requests are within `fs.gs.inputstream.min.range.request.size`. + gzip-encoded objects will bypass this adaptation; they will always use a + streaming (unbounded) channel. This helps in cases where egress limits are + being breached for a customer because `AUTO` mode will always lead to + one unbounded channel for a file. `AUTO_RANDOM` will avoid such unwanted + unbounded channels.
+ +* `fs.gs.fadvise.request.track.count` (default: `3`) + + Self adaptive fadvise mode uses distance between the served requests to + decide the access pattern. This property controls how many such requests + need to be tracked. It is used when `AUTO_RANDOM` is selected. + +* `fs.gs.inputstream.inplace.seek.limit` (default: `8m`) + + If forward seeks are within this many bytes of the current position, seeks + are performed by reading and discarding bytes in-place rather than opening a + new underlying stream. + +* `fs.gs.inputstream.min.range.request.size` (default: `2m`) + + Minimum size in bytes of the read range for Cloud Storage request when + opening a new stream to read an object. \ No newline at end of file diff --git a/hadoop-tools/hadoop-gcp/src/site/markdown/tools/hadoop-gcp/testing.md b/hadoop-tools/hadoop-gcp/src/site/markdown/tools/hadoop-gcp/testing.md index a56d7e6c395f8..e9c2c305e2e07 100644 --- a/hadoop-tools/hadoop-gcp/src/site/markdown/tools/hadoop-gcp/testing.md +++ b/hadoop-tools/hadoop-gcp/src/site/markdown/tools/hadoop-gcp/testing.md @@ -51,11 +51,22 @@ Example: ```xml - - fs.contract.test.fs.gs - gs://your bucket name - - + + fs.gs.auth.type + SERVICE_ACCOUNT_JSON_KEYFILE + + + fs.gs.auth.service.account.json.keyfile + YOUR_JSON_KEY_FILE + + + fs.gs.project.id + YOUR_PROJECT_ID_HERE + + + fs.contract.test.fs.gs + gs://your_bucket + ``` @@ -63,8 +74,6 @@ Example: After completing the configuration, execute the test run through Maven. -This has to be run from a GCP VM. This limitation will be removed later. 
- ```bash mvn clean verify -``` \ No newline at end of file +``` diff --git a/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractContentSummary.java b/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractContentSummary.java new file mode 100644 index 0000000000000..56c0938972a41 --- /dev/null +++ b/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractContentSummary.java @@ -0,0 +1,29 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.fs.gs.contract; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.contract.AbstractFSContract; +import org.apache.hadoop.fs.contract.AbstractContractContentSummaryTest; + +public class ITestGoogleContractContentSummary extends AbstractContractContentSummaryTest { + @Override + protected AbstractFSContract createContract(Configuration conf) { + return new GoogleContract(conf); + } +} diff --git a/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractRootDirectory.java b/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractRootDirectory.java new file mode 100644 index 0000000000000..e00b3f2fb68b4 --- /dev/null +++ b/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractRootDirectory.java @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.gs.contract; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.contract.AbstractContractRootDirectoryTest; +import org.apache.hadoop.fs.contract.AbstractFSContract; + +/** GCS contract tests covering file root directory. 
*/ +public class ITestGoogleContractRootDirectory extends AbstractContractRootDirectoryTest { + @Override + protected AbstractFSContract createContract(Configuration conf) { + return new GoogleContract(conf); + } +} diff --git a/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractUnbuffer.java b/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractUnbuffer.java new file mode 100644 index 0000000000000..0d520ac516cf2 --- /dev/null +++ b/hadoop-tools/hadoop-gcp/src/test/java/org/apache/hadoop/fs/gs/contract/ITestGoogleContractUnbuffer.java @@ -0,0 +1,29 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.fs.gs.contract; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.contract.AbstractContractUnbufferTest; +import org.apache.hadoop.fs.contract.AbstractFSContract; + +public class ITestGoogleContractUnbuffer extends AbstractContractUnbufferTest { + @Override + protected AbstractFSContract createContract(Configuration conf) { + return new GoogleContract(conf); + } +} From faff8a4012f57e3703626dbb168b72c7c2b01721 Mon Sep 17 00:00:00 2001 From: Arunkumar Chacko Date: Mon, 21 Jul 2025 22:33:18 +0000 Subject: [PATCH 8/8] HADOOP-19343. Add core-site.xml for testing Closes #7797 Signed-off-by: Chris Nauroth --- .../src/test/resources/core-site.xml | 55 +++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 hadoop-tools/hadoop-gcp/src/test/resources/core-site.xml diff --git a/hadoop-tools/hadoop-gcp/src/test/resources/core-site.xml b/hadoop-tools/hadoop-gcp/src/test/resources/core-site.xml new file mode 100644 index 0000000000000..eb12fe132f4ea --- /dev/null +++ b/hadoop-tools/hadoop-gcp/src/test/resources/core-site.xml @@ -0,0 +1,55 @@ + + + + + + + + + hadoop.tmp.dir + target/build/test + A base for other temporary directories. + true + + + + hadoop.security.authentication + simple + + + fs.gs.impl + org.apache.hadoop.fs.gs.GoogleHadoopFileSystem + + + fs.AbstractFileSystem.gs.impl + org.apache.hadoop.fs.gs.Gs + + + + + + + +