From 923dcc81fdf192397226d7828933a30384203217 Mon Sep 17 00:00:00 2001
From: Jia Ke
Date: Fri, 10 Jan 2025 12:51:09 -0800
Subject: [PATCH] fix(hadoop): Remove the schema for hdfs path when reading
 file (#11963)

Summary:
Although we support JVM libhdfs, Gluten's internal benchmark still uses
libhdfs3. We encountered a 'File Not Found' exception when reading an HDFS
path with libhdfs3:

```
Reason: Unable to get file path info for file: hdfs://b49691a74b48.jf.intel.com:8020/tpch_sf3000/lineitem/part-00281-3761d71a-87c6-4341-8f1c-db804f904130-c000.snappy.parquet. got error: FileNotFoundException: Path hdfs://b49691a74b48.jf.intel.com:8020/tpch_sf3000/lineitem/part-00281-3761d71a-87c6-4341-8f1c-db804f904130-c000.snappy.parquet does not exist.
Retriable: False
Context: Split [Hive: hdfs://b49691a74b48.jf.intel.com:8020/tpch_sf3000/lineitem/part-00281-3761d71a-87c6-4341-8f1c-db804f904130-c000.snappy.parquet 0 - 1489456566] Task Gluten_Stage_8_TID_842_VTID_27
Additional Context: Operator: TableScan[0] 0
Function: Impl
File: /home/sparkuser/workspace/workspace/Gluten_TPCH_Spark32_test/ep/build-velox/build/velox_ep/velox/connectors/hive/storage_adapters/hdfs/HdfsReadFile.cpp
Line: 79
```

This PR reverts some changes from a previous
[PR](https://github.com/facebookincubator/velox/pull/11811) to ensure
continued support for libhdfs3 reads in Velox.

Pull Request resolved: https://github.com/facebookincubator/velox/pull/11963

Reviewed By: xiaoxmeng

Differential Revision: D67996555

Pulled By: Yuhta

fbshipit-source-id: 29e8c0070bdb403609f3dee711ea3db8a011f8b3
---
 .../hive/storage_adapters/hdfs/HdfsFileSystem.cpp      | 7 +++++++
 .../storage_adapters/hdfs/tests/HdfsFileSystemTest.cpp | 2 +-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/velox/connectors/hive/storage_adapters/hdfs/HdfsFileSystem.cpp b/velox/connectors/hive/storage_adapters/hdfs/HdfsFileSystem.cpp
index 9479279ee54a..856f2b2526de 100644
--- a/velox/connectors/hive/storage_adapters/hdfs/HdfsFileSystem.cpp
+++ b/velox/connectors/hive/storage_adapters/hdfs/HdfsFileSystem.cpp
@@ -91,6 +91,13 @@ std::string HdfsFileSystem::name() const {
 std::unique_ptr<ReadFile> HdfsFileSystem::openFileForRead(
     std::string_view path,
     const FileOptions& /*unused*/) {
+  // Only remove the schema for hdfs path.
+  if (path.find(kScheme) == 0) {
+    path.remove_prefix(kScheme.length());
+    if (auto index = path.find('/')) {
+      path.remove_prefix(index);
+    }
+  }
   return std::make_unique<HdfsReadFile>(
       impl_->hdfsShim(), impl_->hdfsClient(), path);
 }
diff --git a/velox/connectors/hive/storage_adapters/hdfs/tests/HdfsFileSystemTest.cpp b/velox/connectors/hive/storage_adapters/hdfs/tests/HdfsFileSystemTest.cpp
index 168ca9abfb60..ec08af1b1ff1 100644
--- a/velox/connectors/hive/storage_adapters/hdfs/tests/HdfsFileSystemTest.cpp
+++ b/velox/connectors/hive/storage_adapters/hdfs/tests/HdfsFileSystemTest.cpp
@@ -274,7 +274,7 @@ TEST_F(HdfsFileSystemTest, missingFileViaFileSystem) {
       hdfsFileSystem->openFileForRead(
           "hdfs://localhost:7777/path/that/does/not/exist"),
       error_code::kFileNotFound,
-      "Unable to get file path info for file: hdfs://localhost:7777/path/that/does/not/exist. got error: FileNotFoundException: Path hdfs://localhost:7777/path/that/does/not/exist does not exist.");
+      "Unable to get file path info for file: /path/that/does/not/exist. got error: FileNotFoundException: Path /path/that/does/not/exist does not exist.");
 }
 
 TEST_F(HdfsFileSystemTest, missingHost) {
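
For context (not part of the patch): below is a minimal standalone sketch of the prefix-stripping logic the first hunk adds, assuming `kScheme` is the `hdfs://` constant used by `HdfsFileSystem`. The helper name `stripHdfsScheme` is hypothetical and exists only for this illustration; unlike the patched code, the sketch also guards against a path that has no '/' after the authority.

```cpp
// Standalone sketch of the scheme-stripping behavior added in this patch.
// Assumption: kScheme mirrors HdfsFileSystem's "hdfs://" scheme constant.
#include <cassert>
#include <string_view>

constexpr std::string_view kScheme = "hdfs://";

// Hypothetical helper: turns "hdfs://host:port/a/b" into "/a/b" so that
// libhdfs3 receives a plain filesystem path instead of a full URI.
std::string_view stripHdfsScheme(std::string_view path) {
  if (path.find(kScheme) == 0) {
    // Drop the "hdfs://" scheme.
    path.remove_prefix(kScheme.length());
    // Drop the "host:port" authority, keeping everything from the first '/'.
    if (auto index = path.find('/'); index != std::string_view::npos) {
      path.remove_prefix(index);
    }
  }
  return path;
}

int main() {
  assert(
      stripHdfsScheme("hdfs://localhost:7777/path/that/does/not/exist") ==
      "/path/that/does/not/exist");
  // Paths without the hdfs:// scheme pass through unchanged.
  assert(stripHdfsScheme("/already/plain/path") == "/already/plain/path");
  return 0;
}
```

This is also why the test expectation in the second hunk changes: once the scheme and authority are stripped before constructing `HdfsReadFile`, the error message reports the plain path `/path/that/does/not/exist` rather than the full `hdfs://localhost:7777/...` URI.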