From 8907a9020a4ff9e3361d6dcf84e4f12fb2f985c8 Mon Sep 17 00:00:00 2001 From: db Date: Fri, 6 Oct 2023 10:59:22 -0500 Subject: [PATCH 01/49] Cleanup of namespaces in parquet. The ::detail::parquet namespace has been changed to ::parquet::detail, ::parquet::gpu has been renamed to ::parquet::detail, and several detail-style files which were just using ::parquet have been moved into parquet::detail. --- cpp/include/cudf/io/detail/parquet.hpp | 8 +- cpp/include/cudf/io/parquet.hpp | 4 +- cpp/src/io/functions.cpp | 4 +- cpp/src/io/parquet/chunk_dict.cu | 12 +- .../io/parquet/compact_protocol_reader.cpp | 2 + .../io/parquet/compact_protocol_reader.hpp | 3 + .../io/parquet/compact_protocol_writer.cpp | 2 + .../io/parquet/compact_protocol_writer.hpp | 2 + cpp/src/io/parquet/decode_preprocess.cu | 4 +- cpp/src/io/parquet/delta_binary.cuh | 4 +- cpp/src/io/parquet/page_data.cu | 6 +- cpp/src/io/parquet/page_decode.cuh | 4 +- cpp/src/io/parquet/page_delta_decode.cu | 6 +- cpp/src/io/parquet/page_enc.cu | 16 +- cpp/src/io/parquet/page_hdr.cu | 7 +- cpp/src/io/parquet/page_string_decode.cu | 8 +- cpp/src/io/parquet/page_string_utils.cuh | 4 +- cpp/src/io/parquet/parquet.hpp | 3 + cpp/src/io/parquet/parquet_common.hpp | 3 + cpp/src/io/parquet/parquet_gpu.cuh | 4 +- cpp/src/io/parquet/parquet_gpu.hpp | 27 +- cpp/src/io/parquet/predicate_pushdown.cpp | 24 +- cpp/src/io/parquet/reader.cpp | 4 +- cpp/src/io/parquet/reader_impl.cpp | 36 +- cpp/src/io/parquet/reader_impl.hpp | 12 +- cpp/src/io/parquet/reader_impl_chunking.cu | 597 ++++++++++++++++++ cpp/src/io/parquet/reader_impl_helpers.cpp | 121 ++-- cpp/src/io/parquet/reader_impl_helpers.hpp | 21 +- cpp/src/io/parquet/reader_impl_preprocess.cu | 259 ++++---- cpp/src/io/parquet/rle_stream.cuh | 4 +- cpp/src/io/parquet/writer_impl.cu | 217 ++++--- cpp/src/io/parquet/writer_impl.hpp | 22 +- cpp/src/io/utilities/column_buffer.cpp | 10 +- cpp/tests/io/parquet_test.cpp | 207 +++--- 34 files changed, 1142 insertions(+), 525 deletions(-) create mode 100644 cpp/src/io/parquet/reader_impl_chunking.cu diff --git a/cpp/include/cudf/io/detail/parquet.hpp b/cpp/include/cudf/io/detail/parquet.hpp index 074f690d2c7..0b8ee9676de 100644 --- a/cpp/include/cudf/io/detail/parquet.hpp +++ b/cpp/include/cudf/io/detail/parquet.hpp @@ -38,7 +38,7 @@ class parquet_reader_options; class parquet_writer_options; class chunked_parquet_writer_options; -namespace detail::parquet { +namespace parquet::detail { /** * @brief Class to read Parquet dataset data into columns. @@ -186,7 +186,7 @@ class writer { */ explicit writer(std::vector> sinks, parquet_writer_options const& options, - single_write_mode mode, + cudf::io::detail::single_write_mode mode, rmm::cuda_stream_view stream); /** @@ -201,7 +201,7 @@ class writer { */ explicit writer(std::vector> sinks, chunked_parquet_writer_options const& options, - single_write_mode mode, + cudf::io::detail::single_write_mode mode, rmm::cuda_stream_view stream); /** @@ -250,5 +250,5 @@ class writer { * metadata. 
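 *
 * A hypothetical usage sketch, not part of this patch: most callers are
 * expected to reach this entry point through the public
 * cudf::io::read_parquet_metadata(source_info) wrapper rather than building
 * the datasource span by hand. The file name and the num_rows()/
 * num_rowgroups() accessors below are illustrative assumptions.
 * @code
 * auto const source = cudf::io::source_info{"example.parquet"};
 * auto const meta   = cudf::io::read_parquet_metadata(source);
 * auto const rows   = meta.num_rows();       // assumed accessor
 * auto const rgs    = meta.num_rowgroups();  // assumed accessor
 * @endcode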
*/ parquet_metadata read_parquet_metadata(host_span const> sources); -} // namespace detail::parquet +} // namespace parquet::detail } // namespace cudf::io diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index deaf23d405a..06ac9caac75 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -499,7 +499,7 @@ class chunked_parquet_reader { [[nodiscard]] table_with_metadata read_chunk() const; private: - std::unique_ptr reader; + std::unique_ptr reader; }; /** @} */ // end of group @@ -1750,7 +1750,7 @@ class parquet_chunked_writer { std::vector const& column_chunks_file_paths = {}); /// Unique pointer to impl writer class - std::unique_ptr writer; + std::unique_ptr writer; }; /** @} */ // end of group diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index 392a7850886..726442d752e 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -470,8 +470,8 @@ void orc_chunked_writer::close() writer->close(); } -using namespace cudf::io::detail::parquet; -namespace detail_parquet = cudf::io::detail::parquet; +using namespace cudf::io::parquet::detail; +namespace detail_parquet = cudf::io::parquet::detail; table_with_metadata read_parquet(parquet_reader_options const& options, rmm::mr::device_memory_resource* mr) diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu index 9ff1869edde..86678fe58d5 100644 --- a/cpp/src/io/parquet/chunk_dict.cu +++ b/cpp/src/io/parquet/chunk_dict.cu @@ -27,7 +27,7 @@ namespace cudf { namespace io { namespace parquet { -namespace gpu { +namespace detail { namespace { constexpr int DEFAULT_BLOCK_SIZE = 256; } @@ -101,7 +101,7 @@ struct map_find_fn { template __global__ void __launch_bounds__(block_size) - populate_chunk_hash_maps_kernel(cudf::detail::device_2dspan frags) + populate_chunk_hash_maps_kernel(cudf::detail::device_2dspan frags) { auto col_idx = blockIdx.y; auto block_x = blockIdx.x; @@ -226,7 +226,7 @@ __global__ void __launch_bounds__(block_size) template __global__ void __launch_bounds__(block_size) - get_dictionary_indices_kernel(cudf::detail::device_2dspan frags) + get_dictionary_indices_kernel(cudf::detail::device_2dspan frags) { auto col_idx = blockIdx.y; auto block_x = blockIdx.x; @@ -276,7 +276,7 @@ void initialize_chunk_hash_maps(device_span chunks, rmm::cuda_st <<>>(chunks); } -void populate_chunk_hash_maps(cudf::detail::device_2dspan frags, +void populate_chunk_hash_maps(cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream) { dim3 const dim_grid(frags.size().second, frags.size().first); @@ -290,14 +290,14 @@ void collect_map_entries(device_span chunks, rmm::cuda_stream_vi collect_map_entries_kernel<<>>(chunks); } -void get_dictionary_indices(cudf::detail::device_2dspan frags, +void get_dictionary_indices(cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream) { dim3 const dim_grid(frags.size().second, frags.size().first); get_dictionary_indices_kernel <<>>(frags); } -} // namespace gpu +} // namespace detail } // namespace parquet } // namespace io } // namespace cudf diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp index 5c7b8ca3f8c..bf4bdb47cec 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.cpp +++ b/cpp/src/io/parquet/compact_protocol_reader.cpp @@ -24,6 +24,7 @@ namespace cudf { namespace io { namespace parquet { +namespace detail { /** * @brief Base class for parquet field functors. 
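// Caller-side view of the rename described in the commit message -- a
// self-contained sketch with a stand-in reader type, not code from this
// patch; only the namespace layout and the detail_parquet alias (see the
// cpp/src/io/functions.cpp hunk above) mirror the actual change.
#include <memory>

namespace cudf::io::parquet::detail {
struct reader {
  bool has_next() const { return false; }  // stand-in for the real API
};
}  // namespace cudf::io::parquet::detail

// before this patch: namespace detail_parquet = cudf::io::detail::parquet;
namespace detail_parquet = cudf::io::parquet::detail;

int main()
{
  auto const r = std::make_unique<detail_parquet::reader>();
  return r->has_next() ? 1 : 0;
}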
@@ -870,6 +871,7 @@ int CompactProtocolReader::WalkSchema( } } +} // namespace detail } // namespace parquet } // namespace io } // namespace cudf diff --git a/cpp/src/io/parquet/compact_protocol_reader.hpp b/cpp/src/io/parquet/compact_protocol_reader.hpp index 619815db503..77f8232ab7d 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.hpp +++ b/cpp/src/io/parquet/compact_protocol_reader.hpp @@ -28,6 +28,8 @@ namespace cudf { namespace io { namespace parquet { +namespace detail { + /** * @brief Class for parsing Parquet's Thrift Compact Protocol encoded metadata * @@ -147,6 +149,7 @@ class CompactProtocolReader { friend class parquet_field_struct_blob; }; +} // namespace detail } // namespace parquet } // namespace io } // namespace cudf diff --git a/cpp/src/io/parquet/compact_protocol_writer.cpp b/cpp/src/io/parquet/compact_protocol_writer.cpp index 60bc8984d81..cca0ca83c25 100644 --- a/cpp/src/io/parquet/compact_protocol_writer.cpp +++ b/cpp/src/io/parquet/compact_protocol_writer.cpp @@ -19,6 +19,7 @@ namespace cudf { namespace io { namespace parquet { +namespace detail { /** * @brief Parquet CompactProtocolWriter class @@ -391,6 +392,7 @@ inline void CompactProtocolFieldWriter::set_current_field(int const& field) current_field_value = field; } +} // namespace detail } // namespace parquet } // namespace io } // namespace cudf diff --git a/cpp/src/io/parquet/compact_protocol_writer.hpp b/cpp/src/io/parquet/compact_protocol_writer.hpp index 26d66527aa5..345783e0451 100644 --- a/cpp/src/io/parquet/compact_protocol_writer.hpp +++ b/cpp/src/io/parquet/compact_protocol_writer.hpp @@ -28,6 +28,7 @@ namespace cudf { namespace io { namespace parquet { +namespace detail { /** * @brief Class for parsing Parquet's Thrift Compact Protocol encoded metadata @@ -115,6 +116,7 @@ class CompactProtocolFieldWriter { inline void set_current_field(int const& field); }; +} // namespace detail } // namespace parquet } // namespace io } // namespace cudf diff --git a/cpp/src/io/parquet/decode_preprocess.cu b/cpp/src/io/parquet/decode_preprocess.cu index 8de3702bc2e..6c2e435a1c3 100644 --- a/cpp/src/io/parquet/decode_preprocess.cu +++ b/cpp/src/io/parquet/decode_preprocess.cu @@ -26,7 +26,7 @@ namespace cudf { namespace io { namespace parquet { -namespace gpu { +namespace detail { namespace { @@ -411,7 +411,7 @@ void ComputePageSizes(cudf::detail::hostdevice_vector& pages, } } -} // namespace gpu +} // namespace detail } // namespace parquet } // namespace io } // namespace cudf diff --git a/cpp/src/io/parquet/delta_binary.cuh b/cpp/src/io/parquet/delta_binary.cuh index 2382e4aafdf..a513e6674b4 100644 --- a/cpp/src/io/parquet/delta_binary.cuh +++ b/cpp/src/io/parquet/delta_binary.cuh @@ -18,7 +18,7 @@ #include "page_decode.cuh" -namespace cudf::io::parquet::gpu { +namespace cudf::io::parquet::detail { // DELTA_XXX encoding support // @@ -291,4 +291,4 @@ struct delta_binary_decoder { } }; -} // namespace cudf::io::parquet::gpu +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 230834632dd..436b8177ced 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -26,7 +26,7 @@ namespace cudf { namespace io { namespace parquet { -namespace gpu { +namespace detail { namespace { @@ -624,7 +624,7 @@ uint32_t GetAggregatedDecodeKernelMask(cudf::detail::hostdevice_vector } /** - * @copydoc cudf::io::parquet::gpu::DecodePageData + * @copydoc cudf::io::parquet::detail::DecodePageData */ void __host__ 
DecodePageData(cudf::detail::hostdevice_vector& pages, cudf::detail::hostdevice_vector const& chunks, @@ -648,7 +648,7 @@ void __host__ DecodePageData(cudf::detail::hostdevice_vector& pages, } } -} // namespace gpu +} // namespace detail } // namespace parquet } // namespace io } // namespace cudf diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index cdc29197eb3..d794e14d98b 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -24,7 +24,7 @@ #include #include -namespace cudf::io::parquet::gpu { +namespace cudf::io::parquet::detail { struct page_state_s { constexpr page_state_s() noexcept {} @@ -1384,4 +1384,4 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, return true; } -} // namespace cudf::io::parquet::gpu +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/page_delta_decode.cu b/cpp/src/io/parquet/page_delta_decode.cu index 2b78dead205..d25684a59f3 100644 --- a/cpp/src/io/parquet/page_delta_decode.cu +++ b/cpp/src/io/parquet/page_delta_decode.cu @@ -23,7 +23,7 @@ #include #include -namespace cudf::io::parquet::gpu { +namespace cudf::io::parquet::detail { namespace { @@ -160,7 +160,7 @@ __global__ void __launch_bounds__(96) } // anonymous namespace /** - * @copydoc cudf::io::parquet::gpu::DecodeDeltaBinary + * @copydoc cudf::io::parquet::detail::DecodeDeltaBinary */ void __host__ DecodeDeltaBinary(cudf::detail::hostdevice_vector& pages, cudf::detail::hostdevice_vector const& chunks, @@ -184,4 +184,4 @@ void __host__ DecodeDeltaBinary(cudf::detail::hostdevice_vector& pages } } -} // namespace cudf::io::parquet::gpu +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index fe0dbb85124..533c55ef41a 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -44,7 +44,7 @@ namespace cudf { namespace io { namespace parquet { -namespace gpu { +namespace detail { namespace { @@ -329,7 +329,7 @@ __global__ void __launch_bounds__(128) // blockDim {128,1,1} __global__ void __launch_bounds__(128) gpuInitPages(device_2dspan chunks, - device_span pages, + device_span pages, device_span page_sizes, device_span comp_page_sizes, device_span col_desc, @@ -998,7 +998,7 @@ __device__ auto julian_days_with_time(int64_t v) // blockDim(128, 1, 1) template __global__ void __launch_bounds__(128, 8) - gpuEncodePages(device_span pages, + gpuEncodePages(device_span pages, device_span> comp_in, device_span> comp_out, device_span comp_results, @@ -1988,7 +1988,7 @@ __global__ void __launch_bounds__(128) // blockDim(1024, 1, 1) __global__ void __launch_bounds__(1024) - gpuGatherPages(device_span chunks, device_span pages) + gpuGatherPages(device_span chunks, device_span pages) { __shared__ __align__(8) EncColumnChunk ck_g; __shared__ __align__(8) EncPage page_g; @@ -2265,7 +2265,7 @@ void InitFragmentStatistics(device_span groups, } void InitEncoderPages(device_2dspan chunks, - device_span pages, + device_span pages, device_span page_sizes, device_span comp_page_sizes, device_span col_desc, @@ -2294,7 +2294,7 @@ void InitEncoderPages(device_2dspan chunks, write_v2_headers); } -void EncodePages(device_span pages, +void EncodePages(device_span pages, bool write_v2_headers, device_span> comp_in, device_span> comp_out, @@ -2328,7 +2328,7 @@ void EncodePageHeaders(device_span pages, } void GatherPages(device_span chunks, - device_span pages, + device_span pages, rmm::cuda_stream_view stream) { 
gpuGatherPages<<>>(chunks, pages); @@ -2343,7 +2343,7 @@ void EncodeColumnIndexes(device_span chunks, chunks, column_stats, column_index_truncate_length); } -} // namespace gpu +} // namespace detail } // namespace parquet } // namespace io } // namespace cudf diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu index 6f8b2f50443..839a75c31ff 100644 --- a/cpp/src/io/parquet/page_hdr.cu +++ b/cpp/src/io/parquet/page_hdr.cu @@ -23,7 +23,7 @@ namespace cudf { namespace io { namespace parquet { -namespace gpu { +namespace detail { // Minimal thrift implementation for parsing page headers // https://github.com/apache/thrift/blob/master/doc/specs/thrift-compact-protocol.md @@ -161,8 +161,7 @@ __device__ void skip_struct_field(byte_stream_s* bs, int field_type) * @param chunk Column chunk the page belongs to * @return `kernel_mask_bits` value for the given page */ -__device__ uint32_t kernel_mask_for_page(gpu::PageInfo const& page, - gpu::ColumnChunkDesc const& chunk) +__device__ uint32_t kernel_mask_for_page(PageInfo const& page, ColumnChunkDesc const& chunk) { if (page.flags & PAGEINFO_FLAGS_DICTIONARY) { return 0; } @@ -528,7 +527,7 @@ void __host__ BuildStringDictionaryIndex(ColumnChunkDesc* chunks, gpuBuildStringDictionaryIndex<<>>(chunks, num_chunks); } -} // namespace gpu +} // namespace detail } // namespace parquet } // namespace io } // namespace cudf diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index d79abe4a6d2..cb9461dc9ce 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -23,7 +23,7 @@ namespace cudf { namespace io { namespace parquet { -namespace gpu { +namespace detail { namespace { @@ -757,7 +757,7 @@ __global__ void __launch_bounds__(decode_block_size) } // anonymous namespace /** - * @copydoc cudf::io::parquet::gpu::ComputePageStringSizes + * @copydoc cudf::io::parquet::detail::ComputePageStringSizes */ void ComputePageStringSizes(cudf::detail::hostdevice_vector& pages, cudf::detail::hostdevice_vector const& chunks, @@ -778,7 +778,7 @@ void ComputePageStringSizes(cudf::detail::hostdevice_vector& pages, } /** - * @copydoc cudf::io::parquet::gpu::DecodeStringPageData + * @copydoc cudf::io::parquet::detail::DecodeStringPageData */ void __host__ DecodeStringPageData(cudf::detail::hostdevice_vector& pages, cudf::detail::hostdevice_vector const& chunks, @@ -802,7 +802,7 @@ void __host__ DecodeStringPageData(cudf::detail::hostdevice_vector& pa } } -} // namespace gpu +} // namespace detail } // namespace parquet } // namespace io } // namespace cudf diff --git a/cpp/src/io/parquet/page_string_utils.cuh b/cpp/src/io/parquet/page_string_utils.cuh index 9395599b3ff..a81d0a64466 100644 --- a/cpp/src/io/parquet/page_string_utils.cuh +++ b/cpp/src/io/parquet/page_string_utils.cuh @@ -18,7 +18,7 @@ #include -namespace cudf::io::parquet::gpu { +namespace cudf::io::parquet::detail { // stole this from cudf/strings/detail/gather.cuh. modified to run on a single string on one warp. // copies from src to dst in 16B chunks per thread. 
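// A host-side model of the copy pattern the comment above describes -- an
// illustrative sketch, not the device code in this header: 32 "lanes" each
// move 16-byte chunks at a stride of one warp, the shape a coalesced warp
// copy takes on the GPU.
#include <cstddef>
#include <cstdint>
#include <cstring>

inline void warp_style_copy(std::uint8_t* dst, std::uint8_t const* src, std::size_t len)
{
  constexpr std::size_t lanes      = 32;  // threads in a warp
  constexpr std::size_t chunk_size = 16;  // bytes per thread per iteration
  std::size_t const num_chunks = len / chunk_size;
  for (std::size_t lane = 0; lane < lanes; ++lane) {  // concurrent on the device
    for (std::size_t c = lane; c < num_chunks; c += lanes) {
      std::memcpy(dst + c * chunk_size, src + c * chunk_size, chunk_size);
    }
  }
  // any sub-16-byte tail is copied byte-wise
  for (std::size_t i = num_chunks * chunk_size; i < len; ++i) { dst[i] = src[i]; }
}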
@@ -107,4 +107,4 @@ __device__ void block_excl_sum(size_type* arr, size_type length, size_type initi } } -} // namespace cudf::io::parquet::gpu +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp index c2affc774c2..5a3bec9a185 100644 --- a/cpp/src/io/parquet/parquet.hpp +++ b/cpp/src/io/parquet/parquet.hpp @@ -28,6 +28,8 @@ namespace cudf { namespace io { namespace parquet { +namespace detail { + constexpr uint32_t parquet_magic = (('P' << 0) | ('A' << 8) | ('R' << 16) | ('1' << 24)); /** @@ -405,6 +407,7 @@ static inline int CountLeadingZeros32(uint32_t value) #endif } +} // namespace detail } // namespace parquet } // namespace io } // namespace cudf diff --git a/cpp/src/io/parquet/parquet_common.hpp b/cpp/src/io/parquet/parquet_common.hpp index 5a1716bb547..2ac2c4388f3 100644 --- a/cpp/src/io/parquet/parquet_common.hpp +++ b/cpp/src/io/parquet/parquet_common.hpp @@ -21,6 +21,8 @@ namespace cudf { namespace io { namespace parquet { +namespace detail { + // Max decimal precisions according to the parquet spec: // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#decimal auto constexpr MAX_DECIMAL32_PRECISION = 9; @@ -156,6 +158,7 @@ enum FieldType { ST_FLD_STRUCT = 12, }; +} // namespace detail } // namespace parquet } // namespace io } // namespace cudf diff --git a/cpp/src/io/parquet/parquet_gpu.cuh b/cpp/src/io/parquet/parquet_gpu.cuh index dc74bee1536..10e12ebb782 100644 --- a/cpp/src/io/parquet/parquet_gpu.cuh +++ b/cpp/src/io/parquet/parquet_gpu.cuh @@ -23,7 +23,7 @@ #include -namespace cudf::io::parquet::gpu { +namespace cudf::io::parquet::detail { auto constexpr KEY_SENTINEL = size_type{-1}; auto constexpr VALUE_SENTINEL = size_type{-1}; @@ -81,4 +81,4 @@ inline size_type __device__ row_to_value_idx(size_type idx, return idx; } -} // namespace cudf::io::parquet::gpu +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 51c862b376b..767668cc65e 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -35,7 +35,7 @@ #include -namespace cudf::io::parquet { +namespace cudf::io::parquet::detail { using cudf::io::detail::string_index_pair; @@ -88,8 +88,6 @@ struct input_column_info { auto nesting_depth() const { return nesting.size(); } }; -namespace gpu { - /** * @brief Enums for the flags in the page header */ @@ -347,7 +345,7 @@ struct file_intermediate_data { // all chunks from the selected row groups. We may end up reading these chunks progressively // instead of all at once - std::vector chunks{}; + std::vector chunks{}; // skip_rows/num_rows values for the entire file. these need to be adjusted per-pass because we // may not be visiting every row group that contains these bounds @@ -372,16 +370,16 @@ struct pass_intermediate_data { // rowgroup, chunk and page information for the current pass. 
std::vector row_groups{}; - cudf::detail::hostdevice_vector chunks{}; - cudf::detail::hostdevice_vector pages_info{}; - cudf::detail::hostdevice_vector page_nesting_info{}; - cudf::detail::hostdevice_vector page_nesting_decode_info{}; + cudf::detail::hostdevice_vector chunks{}; + cudf::detail::hostdevice_vector pages_info{}; + cudf::detail::hostdevice_vector page_nesting_info{}; + cudf::detail::hostdevice_vector page_nesting_decode_info{}; rmm::device_uvector page_keys{0, rmm::cuda_stream_default}; rmm::device_uvector page_index{0, rmm::cuda_stream_default}; rmm::device_uvector str_dict_index{0, rmm::cuda_stream_default}; - std::vector output_chunk_read_info; + std::vector output_chunk_read_info; std::size_t current_output_chunk{0}; rmm::device_buffer level_decode_data{}; @@ -739,7 +737,7 @@ void initialize_chunk_hash_maps(device_span chunks, rmm::cuda_st * @param frags Column fragments * @param stream CUDA stream to use */ -void populate_chunk_hash_maps(cudf::detail::device_2dspan frags, +void populate_chunk_hash_maps(cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream); /** @@ -762,7 +760,7 @@ void collect_map_entries(device_span chunks, rmm::cuda_stream_vi * @param frags Column fragments * @param stream CUDA stream to use */ -void get_dictionary_indices(cudf::detail::device_2dspan frags, +void get_dictionary_indices(cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream); /** @@ -781,7 +779,7 @@ void get_dictionary_indices(cudf::detail::device_2dspan * @param[in] stream CUDA stream to use */ void InitEncoderPages(cudf::detail::device_2dspan chunks, - device_span pages, + device_span pages, device_span page_sizes, device_span comp_page_sizes, device_span col_desc, @@ -847,7 +845,7 @@ void EncodePageHeaders(device_span pages, * @param[in] stream CUDA stream to use */ void GatherPages(device_span chunks, - device_span pages, + device_span pages, rmm::cuda_stream_view stream); /** @@ -863,5 +861,4 @@ void EncodeColumnIndexes(device_span chunks, int32_t column_index_truncate_length, rmm::cuda_stream_view stream); -} // namespace gpu -} // namespace cudf::io::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/predicate_pushdown.cpp b/cpp/src/io/parquet/predicate_pushdown.cpp index 805d082c71e..ceb4c660dbc 100644 --- a/cpp/src/io/parquet/predicate_pushdown.cpp +++ b/cpp/src/io/parquet/predicate_pushdown.cpp @@ -35,7 +35,7 @@ #include #include -namespace cudf::io::detail::parquet { +namespace cudf::io::parquet::detail { namespace { /** @@ -62,13 +62,17 @@ struct stats_caster { // uses storage type as T template () or cudf::is_nested())> - static T convert(uint8_t const* stats_val, size_t stats_size, cudf::io::parquet::Type const type) + static T convert(uint8_t const* stats_val, + size_t stats_size, + cudf::io::parquet::detail::Type const type) { CUDF_FAIL("unsupported type for stats casting"); } template ())> - static T convert(uint8_t const* stats_val, size_t stats_size, cudf::io::parquet::Type const type) + static T convert(uint8_t const* stats_val, + size_t stats_size, + cudf::io::parquet::detail::Type const type) { CUDF_EXPECTS(type == BOOLEAN, "Invalid type and stats combination"); return targetType(*reinterpret_cast(stats_val)); @@ -78,7 +82,9 @@ struct stats_caster { template () and !cudf::is_boolean()) or cudf::is_fixed_point() or cudf::is_chrono())> - static T convert(uint8_t const* stats_val, size_t stats_size, cudf::io::parquet::Type const type) + static T convert(uint8_t const* stats_val, + size_t stats_size, + 
cudf::io::parquet::detail::Type const type) { switch (type) { case INT32: return targetType(*reinterpret_cast(stats_val)); @@ -103,7 +109,9 @@ struct stats_caster { } template ())> - static T convert(uint8_t const* stats_val, size_t stats_size, cudf::io::parquet::Type const type) + static T convert(uint8_t const* stats_val, + size_t stats_size, + cudf::io::parquet::detail::Type const type) { switch (type) { case FLOAT: return targetType(*reinterpret_cast(stats_val)); @@ -113,7 +121,9 @@ struct stats_caster { } template )> - static T convert(uint8_t const* stats_val, size_t stats_size, cudf::io::parquet::Type const type) + static T convert(uint8_t const* stats_val, + size_t stats_size, + cudf::io::parquet::detail::Type const type) { switch (type) { case BYTE_ARRAY: [[fallthrough]]; @@ -527,4 +537,4 @@ named_to_reference_converter::visit_operands( return transformed_operands; } -} // namespace cudf::io::detail::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader.cpp b/cpp/src/io/parquet/reader.cpp index 1e87447006d..17d7c07bc91 100644 --- a/cpp/src/io/parquet/reader.cpp +++ b/cpp/src/io/parquet/reader.cpp @@ -16,7 +16,7 @@ #include "reader_impl.hpp" -namespace cudf::io::detail::parquet { +namespace cudf::io::parquet::detail { reader::reader() = default; @@ -59,4 +59,4 @@ bool chunked_reader::has_next() const { return _impl->has_next(); } table_with_metadata chunked_reader::read_chunk() const { return _impl->read_chunk(); } -} // namespace cudf::io::detail::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index ea40f29a070..34aa4f2201f 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -25,7 +25,7 @@ #include #include -namespace cudf::io::detail::parquet { +namespace cudf::io::parquet::detail { void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) { @@ -38,7 +38,7 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) CUDF_EXPECTS(pages.size() > 0, "There is no page to decode"); size_t const sum_max_depths = std::accumulate( - chunks.begin(), chunks.end(), 0, [&](size_t cursum, gpu::ColumnChunkDesc const& chunk) { + chunks.begin(), chunks.end(), 0, [&](size_t cursum, ColumnChunkDesc const& chunk) { return cursum + _metadata->get_output_nesting_depth(chunk.src_col_schema); }); @@ -51,10 +51,10 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) // doing a gather operation later on. // TODO: This step is somewhat redundant if size info has already been calculated (nested schema, // chunked reader). 
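// Hypothetical sketch of the offsets step referred to here (the real
// calculate_page_string_offsets() used below operates on the pass data):
// per-page string byte counts become output offsets via an exclusive scan,
// and their grand total is the chars-buffer size to allocate before the
// gather.
#include <cstddef>
#include <numeric>
#include <vector>

inline std::size_t offsets_from_sizes(std::vector<std::size_t>& page_str_bytes)
{
  std::size_t const total =
    std::accumulate(page_str_bytes.begin(), page_str_bytes.end(), std::size_t{0});
  std::exclusive_scan(
    page_str_bytes.begin(), page_str_bytes.end(), page_str_bytes.begin(), std::size_t{0});
  return total;  // bytes of string data this column will produce
}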
- auto const has_strings = (kernel_mask & gpu::KERNEL_MASK_STRING) != 0; + auto const has_strings = (kernel_mask & KERNEL_MASK_STRING) != 0; std::vector col_sizes(_input_columns.size(), 0L); if (has_strings) { - gpu::ComputePageStringSizes( + ComputePageStringSizes( pages, chunks, skip_rows, num_rows, _pass_itm_data->level_type_size, _stream); col_sizes = calculate_page_string_offsets(); @@ -176,19 +176,19 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) if (has_strings) { auto& stream = streams[s_idx++]; chunk_nested_str_data.host_to_device_async(stream); - gpu::DecodeStringPageData( + DecodeStringPageData( pages, chunks, num_rows, skip_rows, level_type_size, error_code.data(), stream); } // launch delta binary decoder - if ((kernel_mask & gpu::KERNEL_MASK_DELTA_BINARY) != 0) { - gpu::DecodeDeltaBinary( + if ((kernel_mask & KERNEL_MASK_DELTA_BINARY) != 0) { + DecodeDeltaBinary( pages, chunks, num_rows, skip_rows, level_type_size, error_code.data(), streams[s_idx++]); } // launch the catch-all page decoder - if ((kernel_mask & gpu::KERNEL_MASK_GENERAL) != 0) { - gpu::DecodePageData( + if ((kernel_mask & KERNEL_MASK_GENERAL) != 0) { + DecodePageData( pages, chunks, num_rows, skip_rows, level_type_size, error_code.data(), streams[s_idx++]); } @@ -248,13 +248,13 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) // update null counts in the final column buffers for (size_t idx = 0; idx < pages.size(); idx++) { - gpu::PageInfo* pi = &pages[idx]; - if (pi->flags & gpu::PAGEINFO_FLAGS_DICTIONARY) { continue; } - gpu::ColumnChunkDesc* col = &chunks[pi->chunk_idx]; + PageInfo* pi = &pages[idx]; + if (pi->flags & PAGEINFO_FLAGS_DICTIONARY) { continue; } + ColumnChunkDesc* col = &chunks[pi->chunk_idx]; input_column_info const& input_col = _input_columns[col->src_col_index]; - int index = pi->nesting_decode - page_nesting_decode.device_ptr(); - gpu::PageNestingDecodeInfo* pndi = &page_nesting_decode[index]; + int index = pi->nesting_decode - page_nesting_decode.device_ptr(); + PageNestingDecodeInfo* pndi = &page_nesting_decode[index]; auto* cols = &_output_buffers; for (size_t l_idx = 0; l_idx < input_col.nesting_depth(); l_idx++) { @@ -320,7 +320,7 @@ reader::impl::impl(std::size_t chunk_read_limit, // Save the states of the output buffers for reuse in `chunk_read()`. for (auto const& buff : _output_buffers) { - _output_buffers_template.emplace_back(inline_column_buffer::empty_like(buff)); + _output_buffers_template.emplace_back(cudf::io::detail::inline_column_buffer::empty_like(buff)); } } @@ -368,7 +368,7 @@ void reader::impl::prepare_data(int64_t skip_rows, // always create the pass struct, even if we end up with no passes. 
// this will also cause the previous pass information to be deleted - _pass_itm_data = std::make_unique(); + _pass_itm_data = std::make_unique(); if (_file_itm_data.global_num_rows > 0 && not _file_itm_data.row_groups.empty() && not _input_columns.empty() && _current_input_pass < num_passes) { @@ -521,7 +521,7 @@ table_with_metadata reader::impl::read_chunk() if (_chunk_count > 0) { _output_buffers.resize(0); for (auto const& buff : _output_buffers_template) { - _output_buffers.emplace_back(inline_column_buffer::empty_like(buff)); + _output_buffers.emplace_back(cudf::io::detail::inline_column_buffer::empty_like(buff)); } } @@ -571,4 +571,4 @@ parquet_metadata read_parquet_metadata(host_span con metadata.get_key_value_metadata()[0]}; } -} // namespace cudf::io::detail::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index 9445e4d1648..03990f1a1f3 100644 --- a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -35,7 +35,7 @@ #include #include -namespace cudf::io::detail::parquet { +namespace cudf::io::parquet::detail { /** * @brief Implementation for Parquet reader @@ -261,10 +261,10 @@ class reader::impl { std::vector _input_columns; // Buffers for generating output columns - std::vector _output_buffers; + std::vector _output_buffers; // Buffers copied from `_output_buffers` after construction for reuse - std::vector _output_buffers_template; + std::vector _output_buffers_template; // _output_buffers associated schema indices std::vector _output_column_schemas; @@ -285,8 +285,8 @@ class reader::impl { // Within a pass, we produce one or more chunks of output, whose maximum total // byte size is controlled by _output_chunk_read_limit. - cudf::io::parquet::gpu::file_intermediate_data _file_itm_data; - std::unique_ptr _pass_itm_data; + cudf::io::parquet::detail::file_intermediate_data _file_itm_data; + std::unique_ptr _pass_itm_data; // an array of offsets into _file_itm_data::global_chunks. Each pair of offsets represents // the start/end of the chunks to be loaded for a given pass. @@ -301,4 +301,4 @@ class reader::impl { bool _file_preprocessed{false}; }; -} // namespace cudf::io::detail::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu new file mode 100644 index 00000000000..2c1521e46db --- /dev/null +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -0,0 +1,597 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "reader_impl.hpp" + +#include +#include + +#include + +#include + +#include +#include +#include +#include + +namespace cudf::io::parquet::detail { + +namespace { + +struct cumulative_row_info { + size_t row_count; // cumulative row count + size_t size_bytes; // cumulative size in bytes + int key; // schema index +}; + +#if defined(PREPROCESS_DEBUG) +void print_cumulative_page_info(cudf::detail::hostdevice_vector& pages, + rmm::device_uvector const& page_index, + rmm::device_uvector const& c_info, + rmm::cuda_stream_view stream) +{ + pages.device_to_host_sync(stream); + + printf("------------\nCumulative sizes by page\n"); + + std::vector schemas(pages.size()); + std::vector h_page_index(pages.size()); + CUDF_CUDA_TRY(cudaMemcpy( + h_page_index.data(), page_index.data(), sizeof(int) * pages.size(), cudaMemcpyDefault)); + std::vector h_cinfo(pages.size()); + CUDF_CUDA_TRY(cudaMemcpy( + h_cinfo.data(), c_info.data(), sizeof(cumulative_row_info) * pages.size(), cudaMemcpyDefault)); + auto schema_iter = cudf::detail::make_counting_transform_iterator( + 0, [&](size_type i) { return pages[h_page_index[i]].src_col_schema; }); + thrust::copy(thrust::seq, schema_iter, schema_iter + pages.size(), schemas.begin()); + auto last = thrust::unique(thrust::seq, schemas.begin(), schemas.end()); + schemas.resize(last - schemas.begin()); + printf("Num schemas: %lu\n", schemas.size()); + + for (size_t idx = 0; idx < schemas.size(); idx++) { + printf("Schema %d\n", schemas[idx]); + for (size_t pidx = 0; pidx < pages.size(); pidx++) { + auto const& page = pages[h_page_index[pidx]]; + if (page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY || page.src_col_schema != schemas[idx]) { + continue; + } + printf("\tP: {%lu, %lu}\n", h_cinfo[pidx].row_count, h_cinfo[pidx].size_bytes); + } + } +} + +void print_cumulative_row_info( + host_span sizes, + std::string const& label, + std::optional> splits = std::nullopt) +{ + if (splits.has_value()) { + printf("------------\nSplits\n"); + for (size_t idx = 0; idx < splits->size(); idx++) { + printf("{%lu, %lu}\n", splits.value()[idx].skip_rows, splits.value()[idx].num_rows); + } + } + + printf("------------\nCumulative sizes %s\n", label.c_str()); + for (size_t idx = 0; idx < sizes.size(); idx++) { + printf("{%lu, %lu, %d}", sizes[idx].row_count, sizes[idx].size_bytes, sizes[idx].key); + if (splits.has_value()) { + // if we have a split at this row count and this is the last instance of this row count + auto start = thrust::make_transform_iterator( + splits->begin(), [](gpu::chunk_read_info const& i) { return i.skip_rows; }); + auto end = start + splits->size(); + auto split = std::find(start, end, sizes[idx].row_count); + auto const split_index = [&]() -> int { + if (split != end && + ((idx == sizes.size() - 1) || (sizes[idx + 1].row_count > sizes[idx].row_count))) { + return static_cast(std::distance(start, split)); + } + return idx == 0 ? 0 : -1; + }(); + if (split_index >= 0) { + printf(" <-- split {%lu, %lu}", + splits.value()[split_index].skip_rows, + splits.value()[split_index].num_rows); + } + } + printf("\n"); + } +} +#endif // PREPROCESS_DEBUG + +/** + * @brief Functor which reduces two cumulative_row_info structs of the same key. 
+ */ +struct cumulative_row_sum { + cumulative_row_info operator() + __device__(cumulative_row_info const& a, cumulative_row_info const& b) const + { + return cumulative_row_info{a.row_count + b.row_count, a.size_bytes + b.size_bytes, a.key}; + } +}; + +/** + * @brief Functor which computes the total data size for a given type of cudf column. + * + * In the case of strings, the return size does not include the chars themselves. That + * information is tracked separately (see PageInfo::str_bytes). + */ +struct row_size_functor { + __device__ size_t validity_size(size_t num_rows, bool nullable) + { + return nullable ? (cudf::util::div_rounding_up_safe(num_rows, size_t{32}) * 4) : 0; + } + + template + __device__ size_t operator()(size_t num_rows, bool nullable) + { + auto const element_size = sizeof(device_storage_type_t); + return (element_size * num_rows) + validity_size(num_rows, nullable); + } +}; + +template <> +__device__ size_t row_size_functor::operator()(size_t num_rows, bool nullable) +{ + auto const offset_size = sizeof(size_type); + // NOTE: Adding the + 1 offset here isn't strictly correct. There will only be 1 extra offset + // for the entire column, whereas this is adding an extra offset per page. So we will get a + // small over-estimate of the real size of the order : # of pages * 4 bytes. It seems better + // to overestimate size somewhat than to underestimate it and potentially generate chunks + // that are too large. + return (offset_size * (num_rows + 1)) + validity_size(num_rows, nullable); +} + +template <> +__device__ size_t row_size_functor::operator()(size_t num_rows, bool nullable) +{ + return validity_size(num_rows, nullable); +} + +template <> +__device__ size_t row_size_functor::operator()(size_t num_rows, bool nullable) +{ + // only returns the size of offsets and validity. the size of the actual string chars + // is tracked separately. + auto const offset_size = sizeof(size_type); + // see note about offsets in the list_view template. + return (offset_size * (num_rows + 1)) + validity_size(num_rows, nullable); +} + +/** + * @brief Functor which computes the total output cudf data size for all of + * the data in this page. + * + * Sums across all nesting levels. + */ +struct get_cumulative_row_info { + gpu::PageInfo const* const pages; + + __device__ cumulative_row_info operator()(size_type index) + { + auto const& page = pages[index]; + if (page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY) { + return cumulative_row_info{0, 0, page.src_col_schema}; + } + + // total nested size, not counting string data + auto iter = + cudf::detail::make_counting_transform_iterator(0, [page, index] __device__(size_type i) { + auto const& pni = page.nesting[i]; + return cudf::type_dispatcher( + data_type{pni.type}, row_size_functor{}, pni.size, pni.nullable); + }); + + size_t const row_count = static_cast(page.nesting[0].size); + return { + row_count, + thrust::reduce(thrust::seq, iter, iter + page.num_output_nesting_levels) + page.str_bytes, + page.src_col_schema}; + } +}; + +/** + * @brief Functor which computes the effective size of all input columns by page. + * + * For a given row, we want to find the cost of all pages for all columns involved + * in loading up to that row. The complication here is that not all pages are the + * same size between columns. 
Example: + * + * page row counts + * Column A: 0 <----> 100 <----> 200 + * Column B: 0 <---------------> 200 <--------> 400 + | + * if we decide to split at row 100, we don't really know the actual amount of bytes in column B + * at that point. So we have to proceed as if we are taking the bytes from all 200 rows of that + * page. Essentially, a conservative over-estimate of the real size. + */ +struct row_total_size { + cumulative_row_info const* c_info; + size_type const* key_offsets; + size_t num_keys; + + __device__ cumulative_row_info operator()(cumulative_row_info const& i) + { + // sum sizes for each input column at this row + size_t sum = 0; + for (int idx = 0; idx < num_keys; idx++) { + auto const start = key_offsets[idx]; + auto const end = key_offsets[idx + 1]; + auto iter = cudf::detail::make_counting_transform_iterator( + 0, [&] __device__(size_type i) { return c_info[i].row_count; }); + auto const page_index = + thrust::lower_bound(thrust::seq, iter + start, iter + end, i.row_count) - iter; + sum += c_info[page_index].size_bytes; + } + return {i.row_count, sum, i.key}; + } +}; + +/** + * @brief Given a vector of cumulative {row_count, byte_size} pairs and a chunk read + * limit, determine the set of splits. + * + * @param sizes Vector of cumulative {row_count, byte_size} pairs + * @param num_rows Total number of rows to read + * @param chunk_read_limit Limit on total number of bytes to be returned per read, for all columns + */ +std::vector find_splits(std::vector const& sizes, + size_t num_rows, + size_t chunk_read_limit) +{ + // now we have an array of {row_count, real output bytes}. just walk through it and generate + // splits. + // TODO: come up with a clever way to do this entirely in parallel. For now, as long as batch + // sizes are reasonably large, this shouldn't iterate too many times + std::vector splits; + { + size_t cur_pos = 0; + size_t cur_cumulative_size = 0; + size_t cur_row_count = 0; + auto start = thrust::make_transform_iterator(sizes.begin(), [&](cumulative_row_info const& i) { + return i.size_bytes - cur_cumulative_size; + }); + auto end = start + sizes.size(); + while (cur_row_count < num_rows) { + int64_t split_pos = + thrust::lower_bound(thrust::seq, start + cur_pos, end, chunk_read_limit) - start; + + // if we're past the end, or if the returned bucket is > than the chunk_read_limit, move back + // one. + if (static_cast(split_pos) >= sizes.size() || + (sizes[split_pos].size_bytes - cur_cumulative_size > chunk_read_limit)) { + split_pos--; + } + + // best-try. if we can't find something that'll fit, we have to go bigger. we're doing this in + // a loop because all of the cumulative sizes for all the pages are sorted into one big list. + // so if we had two columns, both of which had an entry {1000, 10000}, that entry would be in + // the list twice. so we have to iterate until we skip past all of them. The idea is that we + // either do this, or we have to call unique() on the input first. + while (split_pos < (static_cast(sizes.size()) - 1) && + (split_pos < 0 || sizes[split_pos].row_count == cur_row_count)) { + split_pos++; + } + + auto const start_row = cur_row_count; + cur_row_count = sizes[split_pos].row_count; + splits.push_back(gpu::chunk_read_info{start_row, cur_row_count - start_row}); + cur_pos = split_pos; + cur_cumulative_size = sizes[split_pos].size_bytes; + } + } + // print_cumulative_row_info(sizes, "adjusted", splits); + + return splits; +} + +/** + * @brief Converts cuDF units to Parquet units. 
+ * + * @return A tuple of Parquet type width, Parquet clock rate and Parquet decimal type. + */ +[[nodiscard]] std::tuple conversion_info(type_id column_type_id, + type_id timestamp_type_id, + parquet::Type physical, + int8_t converted, + int32_t length) +{ + int32_t type_width = (physical == parquet::FIXED_LEN_BYTE_ARRAY) ? length : 0; + int32_t clock_rate = 0; + if (column_type_id == type_id::INT8 or column_type_id == type_id::UINT8) { + type_width = 1; // I32 -> I8 + } else if (column_type_id == type_id::INT16 or column_type_id == type_id::UINT16) { + type_width = 2; // I32 -> I16 + } else if (column_type_id == type_id::INT32) { + type_width = 4; // str -> hash32 + } else if (is_chrono(data_type{column_type_id})) { + clock_rate = to_clockrate(timestamp_type_id); + } + + int8_t converted_type = converted; + if (converted_type == parquet::DECIMAL && column_type_id != type_id::FLOAT64 && + not cudf::is_fixed_point(data_type{column_type_id})) { + converted_type = parquet::UNKNOWN; // Not converting to float64 or decimal + } + return std::make_tuple(type_width, clock_rate, converted_type); +} + +/** + * @brief Return the required number of bits to store a value. + */ +template +[[nodiscard]] T required_bits(uint32_t max_level) +{ + return static_cast(CompactProtocolReader::NumRequiredBits(max_level)); +} + +struct row_count_compare { + __device__ bool operator()(cumulative_row_info const& a, cumulative_row_info const& b) + { + return a.row_count < b.row_count; + } +}; + +} // anonymous namespace + +void reader::impl::create_global_chunk_info() +{ + auto const num_rows = _file_itm_data.global_num_rows; + auto const& row_groups_info = _file_itm_data.row_groups; + auto& chunks = _file_itm_data.chunks; + + // Descriptors for all the chunks that make up the selected columns + auto const num_input_columns = _input_columns.size(); + auto const num_chunks = row_groups_info.size() * num_input_columns; + + // Initialize column chunk information + auto remaining_rows = num_rows; + for (auto const& rg : row_groups_info) { + auto const& row_group = _metadata->get_row_group(rg.index, rg.source_index); + auto const row_group_start = rg.start_row; + auto const row_group_rows = std::min(remaining_rows, row_group.num_rows); + + // generate ColumnChunkDesc objects for everything to be decoded (all input columns) + for (size_t i = 0; i < num_input_columns; ++i) { + auto col = _input_columns[i]; + // look up metadata + auto& col_meta = _metadata->get_column_metadata(rg.index, rg.source_index, col.schema_idx); + auto& schema = _metadata->get_schema(col.schema_idx); + + auto [type_width, clock_rate, converted_type] = + conversion_info(to_type_id(schema, _strings_to_categorical, _timestamp_type.id()), + _timestamp_type.id(), + schema.type, + schema.converted_type, + schema.type_length); + + chunks.push_back(gpu::ColumnChunkDesc(col_meta.total_compressed_size, + nullptr, + col_meta.num_values, + schema.type, + type_width, + row_group_start, + row_group_rows, + schema.max_definition_level, + schema.max_repetition_level, + _metadata->get_output_nesting_depth(col.schema_idx), + required_bits(schema.max_definition_level), + required_bits(schema.max_repetition_level), + col_meta.codec, + converted_type, + schema.logical_type, + schema.decimal_precision, + clock_rate, + i, + col.schema_idx)); + } + + remaining_rows -= row_group_rows; + } +} + +void reader::impl::compute_input_passes() +{ + // at this point, row_groups has already been filtered down to just the row groups we need to + // handle optional skip_rows/num_rows 
parameters. + auto const& row_groups_info = _file_itm_data.row_groups; + + // if the user hasn't specified an input size limit, read everything in a single pass. + if (_input_pass_read_limit == 0) { + _file_itm_data.input_pass_row_group_offsets.push_back(0); + _file_itm_data.input_pass_row_group_offsets.push_back(row_groups_info.size()); + return; + } + + // generate passes. make sure to account for the case where a single row group doesn't fit within + // + std::size_t const read_limit = + _input_pass_read_limit > 0 ? _input_pass_read_limit : std::numeric_limits::max(); + std::size_t cur_pass_byte_size = 0; + std::size_t cur_rg_start = 0; + std::size_t cur_row_count = 0; + _file_itm_data.input_pass_row_group_offsets.push_back(0); + _file_itm_data.input_pass_row_count.push_back(0); + + for (size_t cur_rg_index = 0; cur_rg_index < row_groups_info.size(); cur_rg_index++) { + auto const& rgi = row_groups_info[cur_rg_index]; + auto const& row_group = _metadata->get_row_group(rgi.index, rgi.source_index); + + // can we add this row group + if (cur_pass_byte_size + row_group.total_byte_size >= read_limit) { + // A single row group (the current one) is larger than the read limit: + // We always need to include at least one row group, so end the pass at the end of the current + // row group + if (cur_rg_start == cur_rg_index) { + _file_itm_data.input_pass_row_group_offsets.push_back(cur_rg_index + 1); + _file_itm_data.input_pass_row_count.push_back(cur_row_count + row_group.num_rows); + cur_rg_start = cur_rg_index + 1; + cur_pass_byte_size = 0; + } + // End the pass at the end of the previous row group + else { + _file_itm_data.input_pass_row_group_offsets.push_back(cur_rg_index); + _file_itm_data.input_pass_row_count.push_back(cur_row_count); + cur_rg_start = cur_rg_index; + cur_pass_byte_size = row_group.total_byte_size; + } + } else { + cur_pass_byte_size += row_group.total_byte_size; + } + cur_row_count += row_group.num_rows; + } + // add the last pass if necessary + if (_file_itm_data.input_pass_row_group_offsets.back() != row_groups_info.size()) { + _file_itm_data.input_pass_row_group_offsets.push_back(row_groups_info.size()); + _file_itm_data.input_pass_row_count.push_back(cur_row_count); + } +} + +void reader::impl::setup_next_pass() +{ + // this will also cause the previous pass information to be deleted + _pass_itm_data = std::make_unique(); + + // setup row groups to be loaded for this pass + auto const row_group_start = _file_itm_data.input_pass_row_group_offsets[_current_input_pass]; + auto const row_group_end = _file_itm_data.input_pass_row_group_offsets[_current_input_pass + 1]; + auto const num_row_groups = row_group_end - row_group_start; + _pass_itm_data->row_groups.resize(num_row_groups); + std::copy(_file_itm_data.row_groups.begin() + row_group_start, + _file_itm_data.row_groups.begin() + row_group_end, + _pass_itm_data->row_groups.begin()); + + auto const num_passes = _file_itm_data.input_pass_row_group_offsets.size() - 1; + CUDF_EXPECTS(_current_input_pass < num_passes, "Encountered an invalid read pass index"); + + auto const chunks_per_rowgroup = _input_columns.size(); + auto const num_chunks = chunks_per_rowgroup * num_row_groups; + + auto chunk_start = _file_itm_data.chunks.begin() + (row_group_start * chunks_per_rowgroup); + auto chunk_end = _file_itm_data.chunks.begin() + (row_group_end * chunks_per_rowgroup); + + _pass_itm_data->chunks = + cudf::detail::hostdevice_vector(num_chunks, _stream); + std::copy(chunk_start, chunk_end, _pass_itm_data->chunks.begin()); + + // 
adjust skip_rows and num_rows by what's available in the row groups we are processing + if (num_passes == 1) { + _pass_itm_data->skip_rows = _file_itm_data.global_skip_rows; + _pass_itm_data->num_rows = _file_itm_data.global_num_rows; + } else { + auto const global_start_row = _file_itm_data.global_skip_rows; + auto const global_end_row = global_start_row + _file_itm_data.global_num_rows; + auto const start_row = std::max(_file_itm_data.input_pass_row_count[_current_input_pass], global_start_row); + auto const end_row = std::min(_file_itm_data.input_pass_row_count[_current_input_pass + 1], global_end_row); + + // skip_rows is always global in the sense that it is relative to the first row of + // everything we will be reading, regardless of what pass we are on. + // num_rows is how many rows we are reading this pass. + _pass_itm_data->skip_rows = global_start_row + _file_itm_data.input_pass_row_count[_current_input_pass]; + _pass_itm_data->num_rows = end_row - start_row; + } +} + +void reader::impl::compute_splits_for_pass() +{ + auto const skip_rows = _pass_itm_data->skip_rows; + auto const num_rows = _pass_itm_data->num_rows; + + // simple case : no chunk size, no splits + if(_output_chunk_read_limit <= 0){ + _pass_itm_data->output_chunk_read_info = std::vector{{skip_rows, num_rows}}; + return; + } + + auto& pages = _pass_itm_data->pages_info; + + auto const& page_keys = _pass_itm_data->page_keys; + auto const& page_index = _pass_itm_data->page_index; + + // generate cumulative row counts and sizes + rmm::device_uvector c_info(page_keys.size(), _stream); + // convert PageInfo to cumulative_row_info + auto page_input = thrust::make_transform_iterator(page_index.begin(), + get_cumulative_row_info{pages.device_ptr()}); + thrust::inclusive_scan_by_key(rmm::exec_policy(_stream), + page_keys.begin(), + page_keys.end(), + page_input, + c_info.begin(), + thrust::equal_to{}, + cumulative_row_sum{}); + // print_cumulative_page_info(pages, page_index, c_info, stream); + + // sort by row count + rmm::device_uvector c_info_sorted{c_info, _stream}; + thrust::sort(rmm::exec_policy(_stream), + c_info_sorted.begin(), + c_info_sorted.end(), + row_count_compare{}); + + // std::vector h_c_info_sorted(c_info_sorted.size()); + // CUDF_CUDA_TRY(cudaMemcpy(h_c_info_sorted.data(), + // c_info_sorted.data(), + // sizeof(cumulative_row_info) * c_info_sorted.size(), + // cudaMemcpyDefault)); + // print_cumulative_row_info(h_c_info_sorted, "raw"); + + // generate key offsets (offsets to the start of each partition of keys). worst case is 1 page per + // key + rmm::device_uvector key_offsets(page_keys.size() + 1, _stream); + auto const key_offsets_end = thrust::reduce_by_key(rmm::exec_policy(_stream), + page_keys.begin(), + page_keys.end(), + thrust::make_constant_iterator(1), + thrust::make_discard_iterator(), + key_offsets.begin()) + .second; + size_t const num_unique_keys = key_offsets_end - key_offsets.begin(); + thrust::exclusive_scan( + rmm::exec_policy(_stream), key_offsets.begin(), key_offsets.end(), key_offsets.begin()); + + // adjust the cumulative info such that for each row count, the size includes any pages that span + // that row count. this is so that if we have this case: + // page row counts + // Column A: 0 <----> 100 <----> 200 + // Column B: 0 <---------------> 200 <--------> 400 + // | + // if we decide to split at row 100, we don't really know the actual amount of bytes in column B + // at that point. So we have to proceed as if we are taking the bytes from all 200 rows of that + // page. 
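// A simplified host-side analogue of that adjustment -- an illustrative
// sketch with stand-in types, not the device-side row_total_size functor
// applied just below: each column is charged the full cumulative size of the
// first page whose cumulative row count reaches the split row.
#include <algorithm>
#include <cstddef>
#include <vector>

struct page_cum { std::size_t row_count; std::size_t size_bytes; };  // sorted, cumulative

inline std::size_t total_size_at_row(
  std::vector<std::vector<page_cum>> const& columns, std::size_t row)
{
  std::size_t sum = 0;
  for (auto const& col : columns) {
    auto const it = std::lower_bound(
      col.begin(), col.end(), row,
      [](page_cum const& p, std::size_t r) { return p.row_count < r; });
    sum += (it != col.end()) ? it->size_bytes
                             : (col.empty() ? 0 : col.back().size_bytes);
  }
  return sum;
}
// With A = {{100, 1000}, {200, 2000}} and B = {{200, 4000}, {400, 8000}},
// total_size_at_row({A, B}, 100) charges all 4000 bytes of B's first page,
// exactly the conservative over-estimate in the diagram above.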
+ // + rmm::device_uvector aggregated_info(c_info.size(), _stream); + thrust::transform(rmm::exec_policy(_stream), + c_info_sorted.begin(), + c_info_sorted.end(), + aggregated_info.begin(), + row_total_size{c_info.data(), key_offsets.data(), num_unique_keys}); + + // bring back to the cpu + std::vector h_aggregated_info(aggregated_info.size()); + CUDF_CUDA_TRY(cudaMemcpyAsync(h_aggregated_info.data(), + aggregated_info.data(), + sizeof(cumulative_row_info) * c_info.size(), + cudaMemcpyDefault, + _stream.value())); + _stream.synchronize(); + + // generate the actual splits + _pass_itm_data->output_chunk_read_info = find_splits(h_aggregated_info, num_rows, _output_chunk_read_limit); +} + +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index fcaa610fbb7..05158c3d299 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -21,34 +21,34 @@ #include #include -namespace cudf::io::detail::parquet { +namespace cudf::io::parquet::detail { namespace { ConvertedType logical_type_to_converted_type(LogicalType const& logical) { if (logical.isset.STRING) { - return parquet::UTF8; + return UTF8; } else if (logical.isset.MAP) { - return parquet::MAP; + return MAP; } else if (logical.isset.LIST) { - return parquet::LIST; + return LIST; } else if (logical.isset.ENUM) { - return parquet::ENUM; + return ENUM; } else if (logical.isset.DECIMAL) { - return parquet::DECIMAL; // TODO set decimal values + return DECIMAL; // TODO set decimal values } else if (logical.isset.DATE) { - return parquet::DATE; + return DATE; } else if (logical.isset.TIME) { if (logical.TIME.unit.isset.MILLIS) - return parquet::TIME_MILLIS; + return TIME_MILLIS; else if (logical.TIME.unit.isset.MICROS) - return parquet::TIME_MICROS; + return TIME_MICROS; } else if (logical.isset.TIMESTAMP) { if (logical.TIMESTAMP.unit.isset.MILLIS) - return parquet::TIMESTAMP_MILLIS; + return TIMESTAMP_MILLIS; else if (logical.TIMESTAMP.unit.isset.MICROS) - return parquet::TIMESTAMP_MICROS; + return TIMESTAMP_MICROS; } else if (logical.isset.INTEGER) { switch (logical.INTEGER.bitWidth) { case 8: return logical.INTEGER.isSigned ? INT_8 : UINT_8; @@ -58,13 +58,13 @@ ConvertedType logical_type_to_converted_type(LogicalType const& logical) default: break; } } else if (logical.isset.UNKNOWN) { - return parquet::NA; + return NA; } else if (logical.isset.JSON) { - return parquet::JSON; + return JSON; } else if (logical.isset.BSON) { - return parquet::BSON; + return BSON; } - return parquet::UNKNOWN; + return UNKNOWN; } } // namespace @@ -76,39 +76,39 @@ type_id to_type_id(SchemaElement const& schema, bool strings_to_categorical, type_id timestamp_type_id) { - parquet::Type const physical = schema.type; - parquet::LogicalType const logical_type = schema.logical_type; - parquet::ConvertedType converted_type = schema.converted_type; - int32_t decimal_precision = schema.decimal_precision; + Type const physical = schema.type; + LogicalType const logical_type = schema.logical_type; + ConvertedType converted_type = schema.converted_type; + int32_t decimal_precision = schema.decimal_precision; // Logical type used for actual data interpretation; the legacy converted type // is superseded by 'logical' type whenever available. 
auto const inferred_converted_type = logical_type_to_converted_type(logical_type); - if (inferred_converted_type != parquet::UNKNOWN) { converted_type = inferred_converted_type; } - if (inferred_converted_type == parquet::DECIMAL) { + if (inferred_converted_type != UNKNOWN) { converted_type = inferred_converted_type; } + if (inferred_converted_type == DECIMAL) { decimal_precision = schema.logical_type.DECIMAL.precision; } switch (converted_type) { - case parquet::UINT_8: return type_id::UINT8; - case parquet::INT_8: return type_id::INT8; - case parquet::UINT_16: return type_id::UINT16; - case parquet::INT_16: return type_id::INT16; - case parquet::UINT_32: return type_id::UINT32; - case parquet::UINT_64: return type_id::UINT64; - case parquet::DATE: return type_id::TIMESTAMP_DAYS; - case parquet::TIME_MILLIS: return type_id::DURATION_MILLISECONDS; - case parquet::TIME_MICROS: return type_id::DURATION_MICROSECONDS; - case parquet::TIMESTAMP_MILLIS: + case UINT_8: return type_id::UINT8; + case INT_8: return type_id::INT8; + case UINT_16: return type_id::UINT16; + case INT_16: return type_id::INT16; + case UINT_32: return type_id::UINT32; + case UINT_64: return type_id::UINT64; + case DATE: return type_id::TIMESTAMP_DAYS; + case TIME_MILLIS: return type_id::DURATION_MILLISECONDS; + case TIME_MICROS: return type_id::DURATION_MICROSECONDS; + case TIMESTAMP_MILLIS: return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id : type_id::TIMESTAMP_MILLISECONDS; - case parquet::TIMESTAMP_MICROS: + case TIMESTAMP_MICROS: return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id : type_id::TIMESTAMP_MICROSECONDS; - case parquet::DECIMAL: - if (physical == parquet::INT32) { return type_id::DECIMAL32; } - if (physical == parquet::INT64) { return type_id::DECIMAL64; } - if (physical == parquet::FIXED_LEN_BYTE_ARRAY) { + case DECIMAL: + if (physical == INT32) { return type_id::DECIMAL32; } + if (physical == INT64) { return type_id::DECIMAL64; } + if (physical == FIXED_LEN_BYTE_ARRAY) { if (schema.type_length <= static_cast(sizeof(int32_t))) { return type_id::DECIMAL32; } @@ -119,7 +119,7 @@ type_id to_type_id(SchemaElement const& schema, return type_id::DECIMAL128; } } - if (physical == parquet::BYTE_ARRAY) { + if (physical == BYTE_ARRAY) { CUDF_EXPECTS(decimal_precision <= MAX_DECIMAL128_PRECISION, "Invalid decimal precision"); if (decimal_precision <= MAX_DECIMAL32_PRECISION) { return type_id::DECIMAL32; @@ -133,20 +133,20 @@ type_id to_type_id(SchemaElement const& schema, break; // maps are just List>. - case parquet::MAP: - case parquet::LIST: return type_id::LIST; - case parquet::NA: return type_id::STRING; + case MAP: + case LIST: return type_id::LIST; + case NA: return type_id::STRING; // return type_id::EMPTY; //TODO(kn): enable after Null/Empty column support default: break; } - if (inferred_converted_type == parquet::UNKNOWN and physical == parquet::INT64 and + if (inferred_converted_type == UNKNOWN and physical == INT64 and logical_type.TIMESTAMP.unit.isset.NANOS) { return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id : type_id::TIMESTAMP_NANOSECONDS; } - if (inferred_converted_type == parquet::UNKNOWN and physical == parquet::INT64 and + if (inferred_converted_type == UNKNOWN and physical == INT64 and logical_type.TIME.unit.isset.NANOS) { return type_id::DURATION_NANOSECONDS; } @@ -157,16 +157,16 @@ type_id to_type_id(SchemaElement const& schema, // Physical storage type supported by Parquet; controls the on-disk storage // format in combination with the encoding type. 
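// Stand-alone sketch of the precedence implemented here, using stand-in enums
// rather than this file's real types: a logical/converted annotation decides
// the cudf type first, and only an un-annotated column falls through to the
// physical-type switch that follows.
enum class annotation { NONE, UTF8, DECIMAL };
enum class physical_t { INT32, INT64, BYTE_ARRAY };
enum class cudf_t { STRING, DECIMAL32, INT32, INT64 };

inline cudf_t pick_type(annotation a, physical_t p)
{
  switch (a) {  // the annotation wins whenever one is present
    case annotation::UTF8: return cudf_t::STRING;
    case annotation::DECIMAL: return cudf_t::DECIMAL32;  // precision picks width in the real code
    default: break;
  }
  switch (p) {  // fallback: map the raw storage type
    case physical_t::INT32: return cudf_t::INT32;
    case physical_t::INT64: return cudf_t::INT64;
    case physical_t::BYTE_ARRAY: return cudf_t::STRING;
  }
  return cudf_t::INT32;  // unreachable; keeps -Wreturn-type quiet
}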
switch (physical) { - case parquet::BOOLEAN: return type_id::BOOL8; - case parquet::INT32: return type_id::INT32; - case parquet::INT64: return type_id::INT64; - case parquet::FLOAT: return type_id::FLOAT32; - case parquet::DOUBLE: return type_id::FLOAT64; - case parquet::BYTE_ARRAY: - case parquet::FIXED_LEN_BYTE_ARRAY: + case BOOLEAN: return type_id::BOOL8; + case INT32: return type_id::INT32; + case INT64: return type_id::INT64; + case FLOAT: return type_id::FLOAT32; + case DOUBLE: return type_id::FLOAT64; + case BYTE_ARRAY: + case FIXED_LEN_BYTE_ARRAY: // Can be mapped to INT32 (32-bit hash) or STRING return strings_to_categorical ? type_id::INT32 : type_id::STRING; - case parquet::INT96: + case INT96: return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id : type_id::TIMESTAMP_NANOSECONDS; default: break; @@ -344,7 +344,7 @@ std::vector aggregate_reader_metadata::get_pandas_index_names() con return names; } -std::tuple> +std::tuple> aggregate_reader_metadata::select_row_groups( host_span const> row_group_indices, int64_t skip_rows_opt, @@ -362,7 +362,7 @@ aggregate_reader_metadata::select_row_groups( host_span const>(filtered_row_group_indices.value()); } } - std::vector selection; + std::vector selection; auto [rows_to_skip, rows_to_read] = [&]() { if (not row_group_indices.empty()) { return std::pair{}; } auto const from_opts = cudf::io::detail::skip_rows_num_rows_from_options( @@ -402,7 +402,7 @@ aggregate_reader_metadata::select_row_groups( } std::tuple, - std::vector, + std::vector, std::vector> aggregate_reader_metadata::select_columns(std::optional> const& use_names, bool include_index, @@ -420,17 +420,18 @@ aggregate_reader_metadata::select_columns(std::optional : -1; }; - std::vector output_columns; + std::vector output_columns; std::vector input_columns; std::vector nesting; // Return true if column path is valid. e.g. if the path is {"struct1", "child1"}, then it is // valid if "struct1.child1" exists in this file's schema. 
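Concretely (and continued in the comment just below): resolving {"struct1", "child1"} walks one schema level per path component, and any missing intermediate level fails the whole path. A toy sketch of that walk, independent of the cudf schema types:

```cpp
#include <map>
#include <string>
#include <vector>

// Toy schema: node i maps child name -> child node index; node 0 is the root.
using schema_node = std::map<std::string, int>;

// True only if every component of `path` resolves, starting at the root.
bool path_exists(std::vector<schema_node> const& schema,
                 std::vector<std::string> const& path)
{
  int idx = 0;
  for (auto const& part : path) {
    auto const it = schema[idx].find(part);
    if (it == schema[idx].end()) { return false; }
    idx = it->second;
  }
  return true;
}
```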
If "struct1" exists but "child1" is // not a child of "struct1" then the function will return false for "struct1" - std::function&, bool)> + std::function&, bool)> build_column = [&](column_name_info const* col_name_info, int schema_idx, - std::vector& out_col_array, + std::vector& out_col_array, bool has_list_parent) { if (schema_idx < 0) { return false; } auto const& schema_elem = get_schema(schema_idx); @@ -451,7 +452,8 @@ aggregate_reader_metadata::select_columns(std::optional : to_type_id(schema_elem, strings_to_categorical, timestamp_type_id); auto const dtype = to_data_type(col_type, schema_elem); - inline_column_buffer output_col(dtype, schema_elem.repetition_type == OPTIONAL); + cudf::io::detail::inline_column_buffer output_col(dtype, + schema_elem.repetition_type == OPTIONAL); if (has_list_parent) { output_col.user_data |= PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT; } // store the index of this element if inserted in out_col_array nesting.push_back(static_cast(out_col_array.size())); @@ -491,7 +493,8 @@ aggregate_reader_metadata::select_columns(std::optional to_type_id(schema_elem, strings_to_categorical, timestamp_type_id); auto const element_dtype = to_data_type(element_type, schema_elem); - inline_column_buffer element_col(element_dtype, schema_elem.repetition_type == OPTIONAL); + cudf::io::detail::inline_column_buffer element_col( + element_dtype, schema_elem.repetition_type == OPTIONAL); if (has_list_parent || col_type == type_id::LIST) { element_col.user_data |= PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT; } @@ -656,4 +659,4 @@ aggregate_reader_metadata::select_columns(std::optional std::move(input_columns), std::move(output_columns), std::move(output_column_schemas)); } -} // namespace cudf::io::detail::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl_helpers.hpp b/cpp/src/io/parquet/reader_impl_helpers.hpp index 61e4f94df0f..2ff18bfbe7e 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.hpp +++ b/cpp/src/io/parquet/reader_impl_helpers.hpp @@ -32,9 +32,7 @@ #include #include -namespace cudf::io::detail::parquet { - -using namespace cudf::io::parquet; +namespace cudf::io::parquet::detail { /** * @brief Function that translates Parquet datatype to cuDF type enum @@ -181,7 +179,7 @@ class aggregate_reader_metadata { * @return A tuple of corrected row_start, row_count and list of row group indexes and its * starting row */ - [[nodiscard]] std::tuple> select_row_groups( + [[nodiscard]] std::tuple> select_row_groups( host_span const> row_group_indices, int64_t row_start, std::optional const& row_count, @@ -201,12 +199,13 @@ class aggregate_reader_metadata { * @return input column information, output column information, list of output column schema * indices */ - [[nodiscard]] std:: - tuple, std::vector, std::vector> - select_columns(std::optional> const& use_names, - bool include_index, - bool strings_to_categorical, - type_id timestamp_type_id) const; + [[nodiscard]] std::tuple, + std::vector, + std::vector> + select_columns(std::optional> const& use_names, + bool include_index, + bool strings_to_categorical, + type_id timestamp_type_id) const; }; /** @@ -275,4 +274,4 @@ class named_to_reference_converter : public ast::detail::expression_transformer std::list _operators; }; -} // namespace cudf::io::detail::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index c731c467f2c..4bc6bb6f43b 100644 --- 
a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -43,7 +43,8 @@ #include -namespace cudf::io::detail::parquet { +namespace cudf::io::parquet::detail { + namespace { /** @@ -185,11 +186,11 @@ template */ [[nodiscard]] std::tuple conversion_info(type_id column_type_id, type_id timestamp_type_id, - parquet::Type physical, + Type physical, int8_t converted, int32_t length) { - int32_t type_width = (physical == parquet::FIXED_LEN_BYTE_ARRAY) ? length : 0; + int32_t type_width = (physical == FIXED_LEN_BYTE_ARRAY) ? length : 0; int32_t clock_rate = 0; if (column_type_id == type_id::INT8 or column_type_id == type_id::UINT8) { type_width = 1; // I32 -> I8 @@ -202,9 +203,9 @@ template } int8_t converted_type = converted; - if (converted_type == parquet::DECIMAL && column_type_id != type_id::FLOAT64 && + if (converted_type == DECIMAL && column_type_id != type_id::FLOAT64 && not cudf::is_fixed_point(data_type{column_type_id})) { - converted_type = parquet::UNKNOWN; // Not converting to float64 or decimal + converted_type = UNKNOWN; // Not converting to float64 or decimal } return std::make_tuple(type_width, clock_rate, converted_type); } @@ -226,7 +227,7 @@ template [[nodiscard]] std::future read_column_chunks_async( std::vector> const& sources, std::vector>& page_data, - cudf::detail::hostdevice_vector& chunks, + cudf::detail::hostdevice_vector& chunks, size_t begin_chunk, size_t end_chunk, std::vector const& column_chunk_offsets, @@ -239,11 +240,10 @@ template size_t const io_offset = column_chunk_offsets[chunk]; size_t io_size = chunks[chunk].compressed_size; size_t next_chunk = chunk + 1; - bool const is_compressed = (chunks[chunk].codec != parquet::Compression::UNCOMPRESSED); + bool const is_compressed = (chunks[chunk].codec != Compression::UNCOMPRESSED); while (next_chunk < end_chunk) { - size_t const next_offset = column_chunk_offsets[next_chunk]; - bool const is_next_compressed = - (chunks[next_chunk].codec != parquet::Compression::UNCOMPRESSED); + size_t const next_offset = column_chunk_offsets[next_chunk]; + bool const is_next_compressed = (chunks[next_chunk].codec != Compression::UNCOMPRESSED); if (next_offset != io_offset + io_size || is_next_compressed != is_compressed || chunk_source_map[chunk] != chunk_source_map[next_chunk]) { // Can't merge if not contiguous or mixing compressed and uncompressed @@ -300,13 +300,13 @@ template * * @return The total number of pages */ -[[nodiscard]] size_t count_page_headers( - cudf::detail::hostdevice_vector& chunks, rmm::cuda_stream_view stream) +[[nodiscard]] size_t count_page_headers(cudf::detail::hostdevice_vector& chunks, + rmm::cuda_stream_view stream) { size_t total_pages = 0; chunks.host_to_device_async(stream); - gpu::DecodePageHeaders(chunks.device_ptr(), chunks.size(), stream); + DecodePageHeaders(chunks.device_ptr(), chunks.size(), stream); chunks.device_to_host_sync(stream); for (size_t c = 0; c < chunks.size(); c++) { @@ -337,8 +337,8 @@ constexpr bool is_supported_encoding(Encoding enc) * @param stream CUDA stream used for device memory operations and kernel launches * @returns The size in bytes of level type data required */ -int decode_page_headers(cudf::detail::hostdevice_vector& chunks, - cudf::detail::hostdevice_vector& pages, +int decode_page_headers(cudf::detail::hostdevice_vector& chunks, + cudf::detail::hostdevice_vector& pages, rmm::cuda_stream_view stream) { // IMPORTANT : if you change how pages are stored within a chunk (dist pages, then data pages), @@ -350,14 +350,14 @@ 
int decode_page_headers(cudf::detail::hostdevice_vector& c } chunks.host_to_device_async(stream); - gpu::DecodePageHeaders(chunks.device_ptr(), chunks.size(), stream); + DecodePageHeaders(chunks.device_ptr(), chunks.size(), stream); // compute max bytes needed for level data auto level_bit_size = cudf::detail::make_counting_transform_iterator(0, [chunks = chunks.begin()] __device__(int i) { auto c = chunks[i]; return static_cast( - max(c.level_bits[gpu::level_type::REPETITION], c.level_bits[gpu::level_type::DEFINITION])); + max(c.level_bits[level_type::REPETITION], c.level_bits[level_type::DEFINITION])); }); // max level data bit size. int const max_level_bits = thrust::reduce(rmm::exec_policy(stream), @@ -388,11 +388,11 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c * @return Device buffer to decompressed page data */ [[nodiscard]] rmm::device_buffer decompress_page_data( - cudf::detail::hostdevice_vector& chunks, - cudf::detail::hostdevice_vector& pages, + cudf::detail::hostdevice_vector& chunks, + cudf::detail::hostdevice_vector& pages, rmm::cuda_stream_view stream) { - auto for_each_codec_page = [&](parquet::Compression codec, std::function const& f) { + auto for_each_codec_page = [&](Compression codec, std::function const& f) { for (size_t c = 0, page_count = 0; c < chunks.size(); c++) { const auto page_stride = chunks[c].max_num_pages; if (chunks[c].codec == codec) { @@ -412,19 +412,16 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c size_t total_decomp_size = 0; struct codec_stats { - parquet::Compression compression_type = UNCOMPRESSED; - size_t num_pages = 0; - int32_t max_decompressed_size = 0; - size_t total_decomp_size = 0; + Compression compression_type = UNCOMPRESSED; + size_t num_pages = 0; + int32_t max_decompressed_size = 0; + size_t total_decomp_size = 0; }; - std::array codecs{codec_stats{parquet::GZIP}, - codec_stats{parquet::SNAPPY}, - codec_stats{parquet::BROTLI}, - codec_stats{parquet::ZSTD}}; + std::array codecs{codec_stats{GZIP}, codec_stats{SNAPPY}, codec_stats{BROTLI}, codec_stats{ZSTD}}; auto is_codec_supported = [&codecs](int8_t codec) { - if (codec == parquet::UNCOMPRESSED) return true; + if (codec == UNCOMPRESSED) return true; return std::find_if(codecs.begin(), codecs.end(), [codec](auto& cstats) { return codec == cstats.compression_type; }) != codecs.end(); @@ -445,7 +442,7 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c codec.num_pages++; num_comp_pages++; }); - if (codec.compression_type == parquet::BROTLI && codec.num_pages > 0) { + if (codec.compression_type == BROTLI && codec.num_pages > 0) { debrotli_scratch.resize(get_gpu_debrotli_scratch_size(codec.num_pages), stream); } } @@ -482,7 +479,7 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c auto& page = pages[page_idx]; // offset will only be non-zero for V2 pages auto const offset = - page.lvl_bytes[gpu::level_type::DEFINITION] + page.lvl_bytes[gpu::level_type::REPETITION]; + page.lvl_bytes[level_type::DEFINITION] + page.lvl_bytes[level_type::REPETITION]; // for V2 need to copy def and rep level info into place, and then offset the // input and output buffers. otherwise we'd have to keep both the compressed // and decompressed data. 
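The codec_stats bookkeeping above feeds the per-codec dispatch in the next hunk: pages are grouped by compression type so that each decompressor gets a single batched launch and one appropriately sized scratch/output region. A host-only sketch of that first tallying pass (the types here are illustrative stand-ins, not the device structs):

```cpp
#include <cstddef>
#include <map>
#include <vector>

enum class codec { UNCOMPRESSED, GZIP, SNAPPY, BROTLI, ZSTD };

struct page_desc { codec compression; std::size_t decompressed_size; };

struct codec_totals { std::size_t num_pages = 0; std::size_t total_decomp_size = 0; };

// First pass of the scheme above: tally page counts and output bytes per
// codec, so each codec can get one scratch buffer and one batched launch.
std::map<codec, codec_totals> tally_by_codec(std::vector<page_desc> const& pages)
{
  std::map<codec, codec_totals> totals;
  for (auto const& p : pages) {
    if (p.compression == codec::UNCOMPRESSED) { continue; }  // nothing to do
    auto& t = totals[p.compression];
    t.num_pages++;
    t.total_decomp_size += p.decompressed_size;
  }
  return totals;
}
```

The real path then sizes one device output buffer from the summed totals and hands each codec its batch of pages, as the switch in the next hunk shows.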
@@ -509,11 +506,11 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c device_span d_comp_res_view(comp_res.data() + start_pos, codec.num_pages); switch (codec.compression_type) { - case parquet::GZIP: + case GZIP: gpuinflate(d_comp_in, d_comp_out, d_comp_res_view, gzip_header_included::YES, stream); break; - case parquet::SNAPPY: - if (nvcomp_integration::is_stable_enabled()) { + case SNAPPY: + if (cudf::io::detail::nvcomp_integration::is_stable_enabled()) { nvcomp::batched_decompress(nvcomp::compression_type::SNAPPY, d_comp_in, d_comp_out, @@ -525,7 +522,7 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c gpu_unsnap(d_comp_in, d_comp_out, d_comp_res_view, stream); } break; - case parquet::ZSTD: + case ZSTD: nvcomp::batched_decompress(nvcomp::compression_type::ZSTD, d_comp_in, d_comp_out, @@ -534,7 +531,7 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c codec.total_decomp_size, stream); break; - case parquet::BROTLI: + case BROTLI: gpu_debrotli(d_comp_in, d_comp_out, d_comp_res_view, @@ -594,9 +591,9 @@ void reader::impl::allocate_nesting_info() }); page_nesting_info = - cudf::detail::hostdevice_vector{total_page_nesting_infos, _stream}; + cudf::detail::hostdevice_vector{total_page_nesting_infos, _stream}; page_nesting_decode_info = - cudf::detail::hostdevice_vector{total_page_nesting_infos, _stream}; + cudf::detail::hostdevice_vector{total_page_nesting_infos, _stream}; // update pointers in the PageInfos int target_page_index = 0; @@ -653,10 +650,10 @@ void reader::impl::allocate_nesting_info() if (!cur_schema.is_stub()) { // initialize each page within the chunk for (int p_idx = 0; p_idx < chunks[idx].num_data_pages; p_idx++) { - gpu::PageNestingInfo* pni = + PageNestingInfo* pni = &page_nesting_info[nesting_info_index + (p_idx * per_page_nesting_info_size)]; - gpu::PageNestingDecodeInfo* nesting_info = + PageNestingDecodeInfo* nesting_info = &page_nesting_decode_info[nesting_info_index + (p_idx * per_page_nesting_info_size)]; // if we have lists, set our start and end depth remappings @@ -717,9 +714,9 @@ void reader::impl::allocate_level_decode_space() for (size_t idx = 0; idx < pages.size(); idx++) { auto& p = pages[idx]; - p.lvl_decode_buf[gpu::level_type::DEFINITION] = buf; + p.lvl_decode_buf[level_type::DEFINITION] = buf; buf += (LEVEL_DECODE_BUF_SIZE * _pass_itm_data->level_type_size); - p.lvl_decode_buf[gpu::level_type::REPETITION] = buf; + p.lvl_decode_buf[level_type::REPETITION] = buf; buf += (LEVEL_DECODE_BUF_SIZE * _pass_itm_data->level_type_size); } } @@ -824,25 +821,25 @@ void reader::impl::load_global_chunk_info() schema.converted_type, schema.type_length); - chunks.push_back(gpu::ColumnChunkDesc(col_meta.total_compressed_size, - nullptr, - col_meta.num_values, - schema.type, - type_width, - row_group_start, - row_group_rows, - schema.max_definition_level, - schema.max_repetition_level, - _metadata->get_output_nesting_depth(col.schema_idx), - required_bits(schema.max_definition_level), - required_bits(schema.max_repetition_level), - col_meta.codec, - converted_type, - schema.logical_type, - schema.decimal_precision, - clock_rate, - i, - col.schema_idx)); + chunks.push_back(ColumnChunkDesc(col_meta.total_compressed_size, + nullptr, + col_meta.num_values, + schema.type, + type_width, + row_group_start, + row_group_rows, + schema.max_definition_level, + schema.max_repetition_level, + _metadata->get_output_nesting_depth(col.schema_idx), + required_bits(schema.max_definition_level), + required_bits(schema.max_repetition_level), + 
col_meta.codec, + converted_type, + schema.logical_type, + schema.decimal_precision, + clock_rate, + i, + col.schema_idx)); } remaining_rows -= row_group_rows; @@ -909,7 +906,7 @@ void reader::impl::compute_input_pass_row_group_info() void reader::impl::setup_pass() { // this will also cause the previous pass information to be deleted - _pass_itm_data = std::make_unique(); + _pass_itm_data = std::make_unique(); // setup row groups to be loaded for this pass auto const row_group_start = _input_pass_row_group_offsets[_current_input_pass]; @@ -929,8 +926,7 @@ void reader::impl::setup_pass() auto chunk_start = _file_itm_data.chunks.begin() + (row_group_start * chunks_per_rowgroup); auto chunk_end = _file_itm_data.chunks.begin() + (row_group_end * chunks_per_rowgroup); - _pass_itm_data->chunks = - cudf::detail::hostdevice_vector(num_chunks, _stream); + _pass_itm_data->chunks = cudf::detail::hostdevice_vector(num_chunks, _stream); std::copy(chunk_start, chunk_end, _pass_itm_data->chunks.begin()); // adjust skip_rows and num_rows by what's available in the row groups we are processing @@ -970,7 +966,7 @@ void reader::impl::load_and_decompress_data() // Process dataset chunk pages into output columns auto const total_pages = count_page_headers(chunks, _stream); if (total_pages <= 0) { return; } - pages = cudf::detail::hostdevice_vector(total_pages, total_pages, _stream); + pages = cudf::detail::hostdevice_vector(total_pages, total_pages, _stream); // decoding of column/page information _pass_itm_data->level_type_size = decode_page_headers(chunks, pages, _stream); @@ -978,7 +974,7 @@ void reader::impl::load_and_decompress_data() decomp_page_data = decompress_page_data(chunks, pages, _stream); // Free compressed data for (size_t c = 0; c < chunks.size(); c++) { - if (chunks[c].codec != parquet::Compression::UNCOMPRESSED) { raw_page_data[c].reset(); } + if (chunks[c].codec != Compression::UNCOMPRESSED) { raw_page_data[c].reset(); } } } @@ -1019,14 +1015,13 @@ struct cumulative_row_info { }; #if defined(PREPROCESS_DEBUG) -void print_pages(cudf::detail::hostdevice_vector& pages, - rmm::cuda_stream_view _stream) +void print_pages(cudf::detail::hostdevice_vector& pages, rmm::cuda_stream_view _stream) { pages.device_to_host_sync(_stream); for (size_t idx = 0; idx < pages.size(); idx++) { auto const& p = pages[idx]; // skip dictionary pages - if (p.flags & gpu::PAGEINFO_FLAGS_DICTIONARY) { continue; } + if (p.flags & PAGEINFO_FLAGS_DICTIONARY) { continue; } printf( "P(%lu, s:%d): chunk_row(%d), num_rows(%d), skipped_values(%d), skipped_leaf_values(%d), " "str_bytes(%d)\n", @@ -1040,7 +1035,7 @@ void print_pages(cudf::detail::hostdevice_vector& pages, } } -void print_cumulative_page_info(cudf::detail::hostdevice_vector& pages, +void print_cumulative_page_info(cudf::detail::hostdevice_vector& pages, rmm::device_uvector const& page_index, rmm::device_uvector const& c_info, rmm::cuda_stream_view stream) @@ -1067,7 +1062,7 @@ void print_cumulative_page_info(cudf::detail::hostdevice_vector& printf("Schema %d\n", schemas[idx]); for (size_t pidx = 0; pidx < pages.size(); pidx++) { auto const& page = pages[h_page_index[pidx]]; - if (page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY || page.src_col_schema != schemas[idx]) { + if (page.flags & PAGEINFO_FLAGS_DICTIONARY || page.src_col_schema != schemas[idx]) { continue; } printf("\tP: {%lu, %lu}\n", h_cinfo[pidx].row_count, h_cinfo[pidx].size_bytes); @@ -1075,10 +1070,9 @@ void print_cumulative_page_info(cudf::detail::hostdevice_vector& } } -void print_cumulative_row_info( 
- host_span sizes, - std::string const& label, - std::optional> splits = std::nullopt) +void print_cumulative_row_info(host_span sizes, + std::string const& label, + std::optional> splits = std::nullopt) { if (splits.has_value()) { printf("------------\nSplits\n"); @@ -1093,7 +1087,7 @@ void print_cumulative_row_info( if (splits.has_value()) { // if we have a split at this row count and this is the last instance of this row count auto start = thrust::make_transform_iterator( - splits->begin(), [](gpu::chunk_read_info const& i) { return i.skip_rows; }); + splits->begin(), [](chunk_read_info const& i) { return i.skip_rows; }); auto end = start + splits->size(); auto split = std::find(start, end, sizes[idx].row_count); auto const split_index = [&]() -> int { @@ -1180,12 +1174,12 @@ __device__ size_t row_size_functor::operator()(size_t num_rows, boo * Sums across all nesting levels. */ struct get_cumulative_row_info { - gpu::PageInfo const* const pages; + PageInfo const* const pages; __device__ cumulative_row_info operator()(size_type index) { auto const& page = pages[index]; - if (page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY) { + if (page.flags & PAGEINFO_FLAGS_DICTIONARY) { return cumulative_row_info{0, 0, page.src_col_schema}; } @@ -1250,15 +1244,15 @@ struct row_total_size { * @param num_rows Total number of rows to read * @param chunk_read_limit Limit on total number of bytes to be returned per read, for all columns */ -std::vector find_splits(std::vector const& sizes, - size_t num_rows, - size_t chunk_read_limit) +std::vector find_splits(std::vector const& sizes, + size_t num_rows, + size_t chunk_read_limit) { // now we have an array of {row_count, real output bytes}. just walk through it and generate // splits. // TODO: come up with a clever way to do this entirely in parallel. 
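In the meantime the sequential walk is simple to state: scan the cumulative {row_count, size_bytes} pairs and cut a new chunk whenever the bytes accumulated since the last cut reach the limit. A host-only sketch of that cut rule (names are illustrative; the real routine additionally snaps each cut to the most row-dense candidate under the limit):

```cpp
#include <cstddef>
#include <vector>

struct cumulative_info { std::size_t row_count; std::size_t size_bytes; };
struct split_info { std::size_t skip_rows; std::size_t num_rows; };

// Greedy pass over cumulative page sizes: emit a split each time the bytes
// accumulated since the last cut would reach `limit`. Simplified: the last
// element always closes out the remaining rows.
std::vector<split_info> find_splits_sketch(std::vector<cumulative_info> const& sizes,
                                           std::size_t total_rows,
                                           std::size_t limit)
{
  std::vector<split_info> out;
  std::size_t start_row = 0, base_bytes = 0;
  for (std::size_t i = 0; i < sizes.size(); ++i) {
    bool const last = (i + 1 == sizes.size());
    if (sizes[i].size_bytes - base_bytes >= limit || last) {
      std::size_t const end_row = last ? total_rows : sizes[i].row_count;
      out.push_back({start_row, end_row - start_row});
      start_row  = end_row;
      base_bytes = sizes[i].size_bytes;
    }
  }
  return out;
}
```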
For now, as long as batch // sizes are reasonably large, this shouldn't iterate too many times - std::vector splits; + std::vector splits; { size_t cur_pos = 0; size_t cur_cumulative_size = 0; @@ -1290,7 +1284,7 @@ std::vector find_splits(std::vector c auto const start_row = cur_row_count; cur_row_count = sizes[split_pos].row_count; - splits.push_back(gpu::chunk_read_info{start_row, cur_row_count - start_row}); + splits.push_back(chunk_read_info{start_row, cur_row_count - start_row}); cur_pos = split_pos; cur_cumulative_size = sizes[split_pos].size_bytes; } @@ -1311,12 +1305,11 @@ std::vector find_splits(std::vector c * @param chunk_read_limit Limit on total number of bytes to be returned per read, for all columns * @param stream CUDA stream to use */ -std::vector compute_splits( - cudf::detail::hostdevice_vector& pages, - gpu::pass_intermediate_data const& id, - size_t num_rows, - size_t chunk_read_limit, - rmm::cuda_stream_view stream) +std::vector compute_splits(cudf::detail::hostdevice_vector& pages, + pass_intermediate_data const& id, + size_t num_rows, + size_t chunk_read_limit, + rmm::cuda_stream_view stream) { auto const& page_keys = id.page_keys; auto const& page_index = id.page_index; @@ -1395,16 +1388,16 @@ std::vector compute_splits( } struct get_page_chunk_idx { - __device__ size_type operator()(gpu::PageInfo const& page) { return page.chunk_idx; } + __device__ size_type operator()(PageInfo const& page) { return page.chunk_idx; } }; struct get_page_num_rows { - __device__ size_type operator()(gpu::PageInfo const& page) { return page.num_rows; } + __device__ size_type operator()(PageInfo const& page) { return page.num_rows; } }; struct get_page_column_index { - gpu::ColumnChunkDesc const* chunks; - __device__ size_type operator()(gpu::PageInfo const& page) + ColumnChunkDesc const* chunks; + __device__ size_type operator()(PageInfo const& page) { return chunks[page.chunk_idx].src_col_index; } @@ -1441,7 +1434,7 @@ struct get_page_nesting_size { input_col_info const* const input_cols; size_type const max_depth; size_t const num_pages; - gpu::PageInfo const* const pages; + PageInfo const* const pages; int const* page_indices; __device__ size_type operator()(size_t index) const @@ -1450,7 +1443,7 @@ struct get_page_nesting_size { auto const& page = pages[page_indices[indices.page_idx]]; if (page.src_col_schema != input_cols[indices.col_idx].schema_idx || - page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY || + page.flags & PAGEINFO_FLAGS_DICTIONARY || indices.depth_idx >= input_cols[indices.col_idx].nesting_depth) { return 0; } @@ -1468,7 +1461,7 @@ struct get_reduction_key { * @brief Writes to the chunk_row field of the PageInfo struct. */ struct chunk_row_output_iter { - gpu::PageInfo* p; + PageInfo* p; using value_type = size_type; using difference_type = size_type; using pointer = size_type*; @@ -1490,7 +1483,7 @@ struct chunk_row_output_iter { * @brief Writes to the page_start_value field of the PageNestingInfo struct, keyed by schema. 
*/ struct start_offset_output_iterator { - gpu::PageInfo const* pages; + PageInfo const* pages; int const* page_indices; size_t cur_index; input_col_info const* input_cols; @@ -1529,9 +1522,9 @@ struct start_offset_output_iterator { { auto const indices = reduction_indices{index, max_depth, num_pages}; - gpu::PageInfo const& p = pages[page_indices[indices.page_idx]]; + PageInfo const& p = pages[page_indices[indices.page_idx]]; if (p.src_col_schema != input_cols[indices.col_idx].schema_idx || - p.flags & gpu::PAGEINFO_FLAGS_DICTIONARY || + p.flags & PAGEINFO_FLAGS_DICTIONARY || indices.depth_idx >= input_cols[indices.col_idx].nesting_depth) { return empty; } @@ -1540,15 +1533,15 @@ struct start_offset_output_iterator { }; struct flat_column_num_rows { - gpu::PageInfo const* pages; - gpu::ColumnChunkDesc const* chunks; + PageInfo const* pages; + ColumnChunkDesc const* chunks; __device__ size_type operator()(size_type pindex) const { - gpu::PageInfo const& page = pages[pindex]; + PageInfo const& page = pages[pindex]; // ignore dictionary pages and pages belonging to any column containing repetition (lists) - if ((page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY) || - (chunks[page.chunk_idx].max_level[gpu::level_type::REPETITION] > 0)) { + if ((page.flags & PAGEINFO_FLAGS_DICTIONARY) || + (chunks[page.chunk_idx].max_level[level_type::REPETITION] > 0)) { return 0; } return page.num_rows; @@ -1581,8 +1574,8 @@ struct row_counts_different { * @param expected_row_count Expected row count, if applicable * @param stream CUDA stream used for device memory operations and kernel launches */ -void detect_malformed_pages(cudf::detail::hostdevice_vector& pages, - cudf::detail::hostdevice_vector const& chunks, +void detect_malformed_pages(cudf::detail::hostdevice_vector& pages, + cudf::detail::hostdevice_vector const& chunks, device_span page_keys, device_span page_index, std::optional expected_row_count, @@ -1631,23 +1624,21 @@ void detect_malformed_pages(cudf::detail::hostdevice_vector& page } struct page_to_string_size { - gpu::PageInfo* pages; - gpu::ColumnChunkDesc const* chunks; + PageInfo* pages; + ColumnChunkDesc const* chunks; __device__ size_t operator()(size_type page_idx) const { auto const page = pages[page_idx]; auto const chunk = chunks[page.chunk_idx]; - if (not is_string_col(chunk) || (page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY) != 0) { - return 0; - } + if (not is_string_col(chunk) || (page.flags & PAGEINFO_FLAGS_DICTIONARY) != 0) { return 0; } return pages[page_idx].str_bytes; } }; struct page_offset_output_iter { - gpu::PageInfo* p; + PageInfo* p; size_type const* index; using value_type = size_type; @@ -1738,7 +1729,7 @@ void reader::impl::preprocess_pages(bool uses_custom_row_bounds, size_t chunk_re cols = &out_buf.children; // if this has a list parent, we have to get column sizes from the - // data computed during gpu::ComputePageSizes + // data computed during ComputePageSizes if (out_buf.user_data & PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT) { has_lists = true; break; @@ -1749,7 +1740,7 @@ void reader::impl::preprocess_pages(bool uses_custom_row_bounds, size_t chunk_re // generate string dict indices if necessary { - auto is_dict_chunk = [](gpu::ColumnChunkDesc const& chunk) { + auto is_dict_chunk = [](ColumnChunkDesc const& chunk) { return (chunk.data_type & 0x7) == BYTE_ARRAY && chunk.num_dict_pages > 0; }; @@ -1785,7 +1776,7 @@ void reader::impl::preprocess_pages(bool uses_custom_row_bounds, size_t chunk_re if (total_str_dict_indexes > 0) { chunks.host_to_device_async(_stream); - 
gpu::BuildStringDictionaryIndex(chunks.device_ptr(), chunks.size(), _stream); + BuildStringDictionaryIndex(chunks.device_ptr(), chunks.size(), _stream); } } @@ -1800,14 +1791,14 @@ void reader::impl::preprocess_pages(bool uses_custom_row_bounds, size_t chunk_re // if: // - user has passed custom row bounds // - we will be doing a chunked read - gpu::ComputePageSizes(pages, - chunks, - 0, // 0-max size_t. process all possible rows - std::numeric_limits::max(), - true, // compute num_rows - chunk_read_limit > 0, // compute string sizes - _pass_itm_data->level_type_size, - _stream); + ComputePageSizes(pages, + chunks, + 0, // 0-max size_t. process all possible rows + std::numeric_limits::max(), + true, // compute num_rows + chunk_read_limit > 0, // compute string sizes + _pass_itm_data->level_type_size, + _stream); // computes: // PageInfo::chunk_row (the absolute start row index) for all pages @@ -1836,7 +1827,7 @@ void reader::impl::preprocess_pages(bool uses_custom_row_bounds, size_t chunk_re _pass_itm_data->output_chunk_read_info = _output_chunk_read_limit > 0 ? compute_splits(pages, *_pass_itm_data, num_rows, chunk_read_limit, _stream) - : std::vector{{skip_rows, num_rows}}; + : std::vector{{skip_rows, num_rows}}; } void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses_custom_row_bounds) @@ -1853,14 +1844,14 @@ void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses // respect the user bounds. It is only necessary to do this second pass if uses_custom_row_bounds // is set (if the user has specified artificial bounds). if (uses_custom_row_bounds) { - gpu::ComputePageSizes(pages, - chunks, - skip_rows, - num_rows, - false, // num_rows is already computed - false, // no need to compute string sizes - _pass_itm_data->level_type_size, - _stream); + ComputePageSizes(pages, + chunks, + skip_rows, + num_rows, + false, // num_rows is already computed + false, // no need to compute string sizes + _pass_itm_data->level_type_size, + _stream); // print_pages(pages, _stream); } @@ -1879,7 +1870,7 @@ void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses cols = &out_buf.children; // if this has a list parent, we have to get column sizes from the - // data computed during gpu::ComputePageSizes + // data computed during ComputePageSizes if (out_buf.user_data & PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT) { has_lists = true; } @@ -2014,4 +2005,4 @@ std::vector reader::impl::calculate_page_string_offsets() return col_sizes; } -} // namespace cudf::io::detail::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/rle_stream.cuh b/cpp/src/io/parquet/rle_stream.cuh index 2545a074a38..799d6d9fd64 100644 --- a/cpp/src/io/parquet/rle_stream.cuh +++ b/cpp/src/io/parquet/rle_stream.cuh @@ -20,7 +20,7 @@ #include #include -namespace cudf::io::parquet::gpu { +namespace cudf::io::parquet::detail { template constexpr int rle_stream_required_run_buffer_size() @@ -362,4 +362,4 @@ struct rle_stream { } }; -} // namespace cudf::io::parquet::gpu +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index a124f352ee4..a021aa89714 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -56,10 +56,10 @@ namespace cudf { namespace io { -namespace detail { namespace parquet { -using namespace cudf::io::parquet; -using namespace cudf::io; +namespace detail { + +using namespace cudf::io::detail; struct 
aggregate_writer_metadata { aggregate_writer_metadata(host_span partitions, @@ -185,13 +185,13 @@ namespace { * @param compression The compression type * @return The supported Parquet compression */ -parquet::Compression to_parquet_compression(compression_type compression) +Compression to_parquet_compression(compression_type compression) { switch (compression) { case compression_type::AUTO: - case compression_type::SNAPPY: return parquet::Compression::SNAPPY; - case compression_type::ZSTD: return parquet::Compression::ZSTD; - case compression_type::NONE: return parquet::Compression::UNCOMPRESSED; + case compression_type::SNAPPY: return Compression::SNAPPY; + case compression_type::ZSTD: return Compression::ZSTD; + case compression_type::NONE: return Compression::UNCOMPRESSED; default: CUDF_FAIL("Unsupported compression type"); } } @@ -206,7 +206,7 @@ void update_chunk_encodings(std::vector& encodings, uint32_t enc_mask) { for (uint8_t enc = 0; enc < static_cast(Encoding::NUM_ENCODINGS); enc++) { auto const enc_enum = static_cast(enc); - if ((enc_mask & gpu::encoding_to_mask(enc_enum)) != 0) { encodings.push_back(enc_enum); } + if ((enc_mask & encoding_to_mask(enc_enum)) != 0) { encodings.push_back(enc_enum); } } } @@ -761,11 +761,11 @@ struct parquet_column_view { std::vector const& schema_tree, rmm::cuda_stream_view stream); - [[nodiscard]] gpu::parquet_column_device_view get_device_view(rmm::cuda_stream_view stream) const; + [[nodiscard]] parquet_column_device_view get_device_view(rmm::cuda_stream_view stream) const; [[nodiscard]] column_view cudf_column_view() const { return cudf_col; } - [[nodiscard]] parquet::Type physical_type() const { return schema_node.type; } - [[nodiscard]] parquet::ConvertedType converted_type() const { return schema_node.converted_type; } + [[nodiscard]] Type physical_type() const { return schema_node.type; } + [[nodiscard]] ConvertedType converted_type() const { return schema_node.converted_type; } std::vector const& get_path_in_schema() { return path_in_schema; } @@ -846,11 +846,11 @@ parquet_column_view::parquet_column_view(schema_tree_node const& schema_node, uint16_t max_rep_level = 0; curr_schema_node = schema_node; while (curr_schema_node.parent_idx != -1) { - if (curr_schema_node.repetition_type == parquet::REPEATED or - curr_schema_node.repetition_type == parquet::OPTIONAL) { + if (curr_schema_node.repetition_type == REPEATED or + curr_schema_node.repetition_type == OPTIONAL) { ++max_def_level; } - if (curr_schema_node.repetition_type == parquet::REPEATED) { ++max_rep_level; } + if (curr_schema_node.repetition_type == REPEATED) { ++max_rep_level; } curr_schema_node = schema_tree[curr_schema_node.parent_idx]; } CUDF_EXPECTS(max_def_level < 256, "Definition levels above 255 are not supported"); @@ -897,9 +897,9 @@ parquet_column_view::parquet_column_view(schema_tree_node const& schema_node, } } -gpu::parquet_column_device_view parquet_column_view::get_device_view(rmm::cuda_stream_view) const +parquet_column_device_view parquet_column_view::get_device_view(rmm::cuda_stream_view) const { - auto desc = gpu::parquet_column_device_view{}; // Zero out all fields + auto desc = parquet_column_device_view{}; // Zero out all fields desc.stats_dtype = schema_node.stats_dtype; desc.ts_scale = schema_node.ts_scale; @@ -931,8 +931,8 @@ gpu::parquet_column_device_view parquet_column_view::get_device_view(rmm::cuda_s * @param fragment_size Number of rows per fragment * @param stream CUDA stream used for device memory operations and kernel launches */ -void 
init_row_group_fragments(cudf::detail::hostdevice_2dvector& frag, - device_span col_desc, +void init_row_group_fragments(cudf::detail::hostdevice_2dvector& frag, + device_span col_desc, host_span partitions, device_span part_frag_offset, uint32_t fragment_size, @@ -940,7 +940,7 @@ void init_row_group_fragments(cudf::detail::hostdevice_2dvector frag, +void calculate_page_fragments(device_span frag, host_span frag_sizes, rmm::cuda_stream_view stream) { auto d_frag_sz = cudf::detail::make_device_uvector_async( frag_sizes, stream, rmm::mr::get_current_device_resource()); - gpu::CalculatePageFragments(frag, d_frag_sz, stream); + CalculatePageFragments(frag, d_frag_sz, stream); } /** @@ -972,13 +972,13 @@ void calculate_page_fragments(device_span frag, * @param stream CUDA stream used for device memory operations and kernel launches */ void gather_fragment_statistics(device_span frag_stats, - device_span frags, + device_span frags, bool int96_timestamps, rmm::cuda_stream_view stream) { rmm::device_uvector frag_stats_group(frag_stats.size(), stream); - gpu::InitFragmentStatistics(frag_stats_group, frags, stream); + InitFragmentStatistics(frag_stats_group, frags, stream); detail::calculate_group_statistics( frag_stats.data(), frag_stats_group.data(), frag_stats.size(), stream, int96_timestamps); stream.synchronize(); @@ -1008,8 +1008,8 @@ size_t max_compression_output_size(Compression codec, uint32_t compression_block return compress_max_output_chunk_size(to_nvcomp_compression_type(codec), compression_blocksize); } -auto init_page_sizes(hostdevice_2dvector& chunks, - device_span col_desc, +auto init_page_sizes(hostdevice_2dvector& chunks, + device_span col_desc, uint32_t num_columns, size_t max_page_size_bytes, size_type max_page_size_rows, @@ -1021,19 +1021,19 @@ auto init_page_sizes(hostdevice_2dvector& chunks, chunks.host_to_device_async(stream); // Calculate number of pages and store in respective chunks - gpu::InitEncoderPages(chunks, - {}, - {}, - {}, - col_desc, - num_columns, - max_page_size_bytes, - max_page_size_rows, - page_alignment(compression_codec), - write_v2_headers, - nullptr, - nullptr, - stream); + InitEncoderPages(chunks, + {}, + {}, + {}, + col_desc, + num_columns, + max_page_size_bytes, + max_page_size_rows, + page_alignment(compression_codec), + write_v2_headers, + nullptr, + nullptr, + stream); chunks.device_to_host_sync(stream); int num_pages = 0; @@ -1046,19 +1046,19 @@ auto init_page_sizes(hostdevice_2dvector& chunks, // Now that we know the number of pages, allocate an array to hold per page size and get it // populated cudf::detail::hostdevice_vector page_sizes(num_pages, stream); - gpu::InitEncoderPages(chunks, - {}, - page_sizes, - {}, - col_desc, - num_columns, - max_page_size_bytes, - max_page_size_rows, - page_alignment(compression_codec), - write_v2_headers, - nullptr, - nullptr, - stream); + InitEncoderPages(chunks, + {}, + page_sizes, + {}, + col_desc, + num_columns, + max_page_size_bytes, + max_page_size_rows, + page_alignment(compression_codec), + write_v2_headers, + nullptr, + nullptr, + stream); page_sizes.device_to_host_sync(stream); // Get per-page max compressed size @@ -1072,26 +1072,26 @@ auto init_page_sizes(hostdevice_2dvector& chunks, comp_page_sizes.host_to_device_async(stream); // Use per-page max compressed size to calculate chunk.compressed_size - gpu::InitEncoderPages(chunks, - {}, - {}, - comp_page_sizes, - col_desc, - num_columns, - max_page_size_bytes, - max_page_size_rows, - page_alignment(compression_codec), - write_v2_headers, - nullptr, 
- nullptr, - stream); + InitEncoderPages(chunks, + {}, + {}, + comp_page_sizes, + col_desc, + num_columns, + max_page_size_bytes, + max_page_size_rows, + page_alignment(compression_codec), + write_v2_headers, + nullptr, + nullptr, + stream); chunks.device_to_host_sync(stream); return comp_page_sizes; } size_t max_page_bytes(Compression compression, size_t max_page_size_bytes) { - if (compression == parquet::Compression::UNCOMPRESSED) { return max_page_size_bytes; } + if (compression == Compression::UNCOMPRESSED) { return max_page_size_bytes; } auto const ncomp_type = to_nvcomp_compression_type(compression); auto const nvcomp_limit = nvcomp::is_compression_disabled(ncomp_type) @@ -1104,9 +1104,9 @@ size_t max_page_bytes(Compression compression, size_t max_page_size_bytes) } std::pair>, std::vector>> -build_chunk_dictionaries(hostdevice_2dvector& chunks, - host_span col_desc, - device_2dspan frags, +build_chunk_dictionaries(hostdevice_2dvector& chunks, + host_span col_desc, + device_2dspan frags, Compression compression, dictionary_policy dict_policy, size_t max_dict_size, @@ -1130,7 +1130,7 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, } // Allocate slots for each chunk - std::vector> hash_maps_storage; + std::vector> hash_maps_storage; hash_maps_storage.reserve(h_chunks.size()); for (auto& chunk : h_chunks) { if (col_desc[chunk.col_desc_id].physical_type == Type::BOOLEAN || @@ -1149,8 +1149,8 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, chunks.host_to_device_async(stream); - gpu::initialize_chunk_hash_maps(chunks.device_view().flat_view(), stream); - gpu::populate_chunk_hash_maps(frags, stream); + initialize_chunk_hash_maps(chunks.device_view().flat_view(), stream); + populate_chunk_hash_maps(frags, stream); chunks.device_to_host_sync(stream); @@ -1197,8 +1197,8 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, chunk.dict_index = inserted_dict_index.data(); } chunks.host_to_device_async(stream); - gpu::collect_map_entries(chunks.device_view().flat_view(), stream); - gpu::get_dictionary_indices(frags, stream); + collect_map_entries(chunks.device_view().flat_view(), stream); + get_dictionary_indices(frags, stream); return std::pair(std::move(dict_data), std::move(dict_index)); } @@ -1221,9 +1221,9 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, * @param write_v2_headers True if version 2 page headers are to be written * @param stream CUDA stream used for device memory operations and kernel launches */ -void init_encoder_pages(hostdevice_2dvector& chunks, - device_span col_desc, - device_span pages, +void init_encoder_pages(hostdevice_2dvector& chunks, + device_span col_desc, + device_span pages, cudf::detail::hostdevice_vector& comp_page_sizes, statistics_chunk* page_stats, statistics_chunk* frag_stats, @@ -1286,8 +1286,8 @@ void init_encoder_pages(hostdevice_2dvector& chunks, * @param write_v2_headers True if V2 page headers should be written * @param stream CUDA stream used for device memory operations and kernel launches */ -void encode_pages(hostdevice_2dvector& chunks, - device_span pages, +void encode_pages(hostdevice_2dvector& chunks, + device_span pages, uint32_t pages_in_batch, uint32_t first_page_in_batch, uint32_t rowgroups_in_batch, @@ -1308,8 +1308,7 @@ void encode_pages(hostdevice_2dvector& chunks, ? device_span(page_stats + first_page_in_batch, pages_in_batch) : device_span(); - uint32_t max_comp_pages = - (compression != parquet::Compression::UNCOMPRESSED) ? 
pages_in_batch : 0; + uint32_t max_comp_pages = (compression != Compression::UNCOMPRESSED) ? pages_in_batch : 0; rmm::device_uvector> comp_in(max_comp_pages, stream); rmm::device_uvector> comp_out(max_comp_pages, stream); @@ -1319,9 +1318,9 @@ void encode_pages(hostdevice_2dvector& chunks, comp_res.end(), compression_result{0, compression_status::FAILURE}); - gpu::EncodePages(batch_pages, write_v2_headers, comp_in, comp_out, comp_res, stream); + EncodePages(batch_pages, write_v2_headers, comp_in, comp_out, comp_res, stream); switch (compression) { - case parquet::Compression::SNAPPY: + case Compression::SNAPPY: if (nvcomp::is_compression_disabled(nvcomp::compression_type::SNAPPY)) { gpu_snap(comp_in, comp_out, comp_res, stream); } else { @@ -1329,7 +1328,7 @@ void encode_pages(hostdevice_2dvector& chunks, nvcomp::compression_type::SNAPPY, comp_in, comp_out, comp_res, stream); } break; - case parquet::Compression::ZSTD: { + case Compression::ZSTD: { if (auto const reason = nvcomp::is_compression_disabled(nvcomp::compression_type::ZSTD); reason) { CUDF_FAIL("Compression error: " + reason.value()); @@ -1338,7 +1337,7 @@ void encode_pages(hostdevice_2dvector& chunks, break; } - case parquet::Compression::UNCOMPRESSED: break; + case Compression::UNCOMPRESSED: break; default: CUDF_FAIL("invalid compression type"); } @@ -1378,7 +1377,7 @@ void encode_pages(hostdevice_2dvector& chunks, * @param column_index_truncate_length maximum length of min or max values in column index, in bytes * @return Computed buffer size needed to encode the column index */ -size_t column_index_buffer_size(gpu::EncColumnChunk* ck, int32_t column_index_truncate_length) +size_t column_index_buffer_size(EncColumnChunk* ck, int32_t column_index_truncate_length) { // encoding the column index for a given chunk requires: // each list (4 of them) requires 6 bytes of overhead @@ -1499,8 +1498,8 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, std::vector this_table_schema(schema_tree.begin(), schema_tree.end()); // Initialize column description - cudf::detail::hostdevice_vector col_desc(parquet_columns.size(), - stream); + cudf::detail::hostdevice_vector col_desc(parquet_columns.size(), + stream); std::transform( parquet_columns.begin(), parquet_columns.end(), col_desc.host_ptr(), [&](auto const& pcol) { return pcol.get_device_view(stream); @@ -1576,7 +1575,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, auto d_part_frag_offset = cudf::detail::make_device_uvector_async( part_frag_offset, stream, rmm::mr::get_current_device_resource()); - cudf::detail::hostdevice_2dvector row_group_fragments( + cudf::detail::hostdevice_2dvector row_group_fragments( num_columns, num_fragments, stream); // Create table_device_view so that corresponding column_device_view data @@ -1588,7 +1587,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, if (num_fragments != 0) { // Move column info to device col_desc.host_to_device_async(stream); - leaf_column_views = create_leaf_column_device_views( + leaf_column_views = create_leaf_column_device_views( col_desc, *parent_column_table_device_view, stream); init_row_group_fragments(row_group_fragments, @@ -1662,7 +1661,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, // Initialize row groups and column chunks auto const num_chunks = num_rowgroups * num_columns; - hostdevice_2dvector chunks(num_rowgroups, num_columns, stream); + hostdevice_2dvector chunks(num_rowgroups, num_columns, stream); // total fragments per 
column (in case they are non-uniform) std::vector frags_per_column(num_columns, 0); @@ -1678,7 +1677,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, row_group.total_byte_size = 0; row_group.columns.resize(num_columns); for (int c = 0; c < num_columns; c++) { - gpu::EncColumnChunk& ck = chunks[r + first_rg_in_part[p]][c]; + EncColumnChunk& ck = chunks[r + first_rg_in_part[p]][c]; ck = {}; ck.col_desc = col_desc.device_ptr() + c; @@ -1700,7 +1699,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, return l + r.num_values; }); ck.plain_data_size = std::accumulate( - chunk_fragments.begin(), chunk_fragments.end(), 0, [](int sum, gpu::PageFragment frag) { + chunk_fragments.begin(), chunk_fragments.end(), 0, [](int sum, PageFragment frag) { return sum + frag.fragment_data_size; }); auto& column_chunk_meta = row_group.columns[c].meta_data; @@ -1731,7 +1730,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, frags_per_column.empty() ? 0 : frag_offsets.back() + frags_per_column.back(); rmm::device_uvector frag_stats(0, stream); - cudf::detail::hostdevice_vector page_fragments(total_frags, stream); + cudf::detail::hostdevice_vector page_fragments(total_frags, stream); // update fragments and/or prepare for fragment statistics calculation if necessary if (total_frags != 0) { @@ -1749,9 +1748,9 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, auto const& row_group = agg_meta->file(p).row_groups[global_r]; uint32_t const fragments_in_chunk = util::div_rounding_up_unsafe(row_group.num_rows, frag_size); - gpu::EncColumnChunk& ck = chunks[r + first_rg_in_part[p]][c]; - ck.fragments = page_fragments.device_ptr(frag_offset); - ck.first_fragment = frag_offset; + EncColumnChunk& ck = chunks[r + first_rg_in_part[p]][c]; + ck.fragments = page_fragments.device_ptr(frag_offset); + ck.first_fragment = frag_offset; // update the chunk pointer here for each fragment in chunk.fragments for (uint32_t i = 0; i < fragments_in_chunk; i++) { @@ -1817,8 +1816,8 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, size_t comp_rowgroup_size = 0; if (r < num_rowgroups) { for (int i = 0; i < num_columns; i++) { - gpu::EncColumnChunk* ck = &chunks[r][i]; - ck->first_page = num_pages; + EncColumnChunk* ck = &chunks[r][i]; + ck->first_page = num_pages; num_pages += ck->num_pages; pages_in_batch += ck->num_pages; rowgroup_size += ck->bfr_size; @@ -1850,7 +1849,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, } // Clear compressed buffer size if compression has been turned off - if (compression == parquet::Compression::UNCOMPRESSED) { max_comp_bfr_size = 0; } + if (compression == Compression::UNCOMPRESSED) { max_comp_bfr_size = 0; } // Initialize data pointers in batch uint32_t const num_stats_bfr = @@ -1864,7 +1863,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, stream); rmm::device_buffer col_idx_bfr(column_index_bfr_size, stream); - rmm::device_uvector pages(num_pages, stream); + rmm::device_uvector pages(num_pages, stream); // This contains stats for both the pages and the rowgroups. TODO: make them separate. 
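An aside on the batch machinery above: batches exist so the uncompressed and compressed staging buffers need only be as large as the largest batch rather than the whole table. A hedged sketch of the grouping rule (max_bytes stands in for the internal ceiling; the real code also tracks compressed sizes and page counts):

```cpp
#include <cstddef>
#include <vector>

// Group row groups into batches whose summed staging size stays under
// `max_bytes`; each entry of the result is the number of row groups in one
// batch. A batch always holds at least one row group.
std::vector<std::size_t> build_batches(std::vector<std::size_t> const& rowgroup_bytes,
                                       std::size_t max_bytes)
{
  std::vector<std::size_t> batches;
  std::size_t count = 0, bytes = 0;
  for (auto const rg : rowgroup_bytes) {
    if (count > 0 && bytes + rg > max_bytes) {
      batches.push_back(count);
      count = 0;
      bytes = 0;
    }
    ++count;
    bytes += rg;
  }
  if (count > 0) { batches.push_back(count); }
  return batches;
}
```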
rmm::device_uvector page_stats(num_stats_bfr, stream); @@ -1874,10 +1873,10 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, auto bfr_c = static_cast(comp_bfr.data()); for (auto j = 0; j < batch_list[b]; j++, r++) { for (auto i = 0; i < num_columns; i++) { - gpu::EncColumnChunk& ck = chunks[r][i]; - ck.uncompressed_bfr = bfr; - ck.compressed_bfr = bfr_c; - ck.column_index_blob = bfr_i; + EncColumnChunk& ck = chunks[r][i]; + ck.uncompressed_bfr = bfr; + ck.compressed_bfr = bfr_c; + ck.column_index_blob = bfr_i; bfr += ck.bfr_size; bfr_c += ck.compressed_size; if (stats_granularity == statistics_freq::STATISTICS_COLUMN) { @@ -1960,7 +1959,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, if (ck.ck_stat_size != 0) { std::vector const stats_blob = cudf::detail::make_std_vector_sync( device_span(dev_bfr, ck.ck_stat_size), stream); - cudf::io::parquet::CompactProtocolReader cp(stats_blob.data(), stats_blob.size()); + CompactProtocolReader cp(stats_blob.data(), stats_blob.size()); cp.read(&column_chunk_meta.statistics); need_sync = true; } @@ -2142,8 +2141,8 @@ void writer::impl::write(table_view const& input, std::vector co void writer::impl::write_parquet_data_to_sink( std::unique_ptr& updated_agg_meta, - device_span pages, - host_2dspan chunks, + device_span pages, + host_2dspan chunks, host_span global_rowgroup_base, host_span first_rg_in_part, host_span batch_list, @@ -2209,7 +2208,7 @@ void writer::impl::write_parquet_data_to_sink( int const global_r = global_rowgroup_base[p] + r - first_rg_in_part[p]; auto const& row_group = _agg_meta->file(p).row_groups[global_r]; for (std::size_t i = 0; i < num_columns; i++) { - gpu::EncColumnChunk const& ck = chunks[r][i]; + EncColumnChunk const& ck = chunks[r][i]; auto const& column_chunk_meta = row_group.columns[i].meta_data; // start transfer of the column index @@ -2392,7 +2391,7 @@ std::unique_ptr> writer::merge_row_group_metadata( return std::make_unique>(std::move(output)); } -} // namespace parquet } // namespace detail +} // namespace parquet } // namespace io } // namespace cudf diff --git a/cpp/src/io/parquet/writer_impl.hpp b/cpp/src/io/parquet/writer_impl.hpp index 89ef85ba2bd..e0f38ed362c 100644 --- a/cpp/src/io/parquet/writer_impl.hpp +++ b/cpp/src/io/parquet/writer_impl.hpp @@ -40,13 +40,12 @@ namespace cudf { namespace io { -namespace detail { namespace parquet { +namespace detail { + // Forward internal classes struct aggregate_writer_metadata; -using namespace cudf::io::parquet; -using namespace cudf::io; using cudf::detail::device_2dspan; using cudf::detail::host_2dspan; using cudf::detail::hostdevice_2dvector; @@ -66,7 +65,7 @@ class writer::impl { */ explicit impl(std::vector> sinks, parquet_writer_options const& options, - single_write_mode mode, + cudf::io::detail::single_write_mode mode, rmm::cuda_stream_view stream); /** @@ -79,7 +78,7 @@ class writer::impl { */ explicit impl(std::vector> sinks, chunked_parquet_writer_options const& options, - single_write_mode mode, + cudf::io::detail::single_write_mode mode, rmm::cuda_stream_view stream); /** @@ -139,8 +138,8 @@ class writer::impl { * @param[out] bounce_buffer Temporary host output buffer */ void write_parquet_data_to_sink(std::unique_ptr& updated_agg_meta, - device_span pages, - host_2dspan chunks, + device_span pages, + host_2dspan chunks, host_span global_rowgroup_base, host_span first_rg_in_part, host_span batch_list, @@ -164,9 +163,10 @@ class writer::impl { bool const _write_v2_headers; int32_t const 
_column_index_truncate_length; std::vector> const _kv_meta; // Optional user metadata. - single_write_mode const _single_write_mode; // Special parameter only used by `write()` to - // indicate that we are guaranteeing a single table - // write. This enables some internal optimizations. + cudf::io::detail::single_write_mode const + _single_write_mode; // Special parameter only used by `write()` to + // indicate that we are guaranteeing a single table + // write. This enables some internal optimizations. std::vector> const _out_sink; // Internal states, filled during `write()` and written to sink during `write` and `close()`. @@ -180,7 +180,7 @@ class writer::impl { bool _closed = false; // To track if the output has been written to sink. }; -} // namespace parquet } // namespace detail +} // namespace parquet } // namespace io } // namespace cudf diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp index f3a43cbc63c..dd049d401cf 100644 --- a/cpp/src/io/utilities/column_buffer.cpp +++ b/cpp/src/io/utilities/column_buffer.cpp @@ -51,19 +51,21 @@ std::unique_ptr gather_column_buffer::make_string_column_impl(rmm::cuda_ return make_strings_column(*_strings, stream, _mr); } -void inline_column_buffer::allocate_strings_data(rmm::cuda_stream_view stream) +void cudf::io::detail::inline_column_buffer::allocate_strings_data(rmm::cuda_stream_view stream) { CUDF_EXPECTS(type.id() == type_id::STRING, "allocate_strings_data called for non-string column"); // size + 1 for final offset. _string_data will be initialized later. _data = create_data(data_type{type_id::INT32}, size + 1, stream, _mr); } -void inline_column_buffer::create_string_data(size_t num_bytes, rmm::cuda_stream_view stream) +void cudf::io::detail::inline_column_buffer::create_string_data(size_t num_bytes, + rmm::cuda_stream_view stream) { _string_data = rmm::device_buffer(num_bytes, stream, _mr); } -std::unique_ptr inline_column_buffer::make_string_column_impl(rmm::cuda_stream_view stream) +std::unique_ptr cudf::io::detail::inline_column_buffer::make_string_column_impl( + rmm::cuda_stream_view stream) { // no need for copies, just transfer ownership of the data_buffers to the columns auto const state = mask_state::UNALLOCATED; @@ -324,7 +326,7 @@ std::unique_ptr empty_like(column_buffer_base& buffer, } using pointer_type = gather_column_buffer; -using string_type = inline_column_buffer; +using string_type = cudf::io::detail::inline_column_buffer; using pointer_column_buffer = column_buffer_base; using string_column_buffer = column_buffer_base; diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index 81e0e12eeb9..217bb891a2b 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -200,29 +200,30 @@ std::unique_ptr make_parquet_list_list_col( // of the file to populate the FileMetaData pointed to by file_meta_data. // throws cudf::logic_error if the file or metadata is invalid. 
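The helper that follows leans on the fixed Parquet file layout: a 4-byte "PAR1" magic at the start, and an 8-byte ender at the back holding the little-endian footer length plus the magic again. A standalone sketch of parsing just that ender from an in-memory file:

```cpp
#include <cstdint>
#include <cstring>
#include <stdexcept>
#include <vector>

// Parse the fixed 8-byte Parquet ender: a little-endian uint32 footer length
// followed by the 4-byte magic "PAR1". Returns the footer length.
uint32_t parse_parquet_ender(std::vector<uint8_t> const& file)
{
  if (file.size() < 12) { throw std::runtime_error("too small to be a Parquet file"); }
  uint8_t const* end = file.data() + file.size();
  if (std::memcmp(end - 4, "PAR1", 4) != 0) { throw std::runtime_error("bad magic"); }
  uint32_t footer_len = 0;
  std::memcpy(&footer_len, end - 8, sizeof(footer_len));  // assumes a little-endian host
  return footer_len;
}
```

read_footer below then seeks back footer_len + 8 bytes from the end of the file and hands that window to the thrift reader.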
void read_footer(std::unique_ptr const& source, - cudf::io::parquet::FileMetaData* file_meta_data) + cudf::io::parquet::detail::FileMetaData* file_meta_data) { - constexpr auto header_len = sizeof(cudf::io::parquet::file_header_s); - constexpr auto ender_len = sizeof(cudf::io::parquet::file_ender_s); + constexpr auto header_len = sizeof(cudf::io::parquet::detail::file_header_s); + constexpr auto ender_len = sizeof(cudf::io::parquet::detail::file_ender_s); auto const len = source->size(); auto const header_buffer = source->host_read(0, header_len); auto const header = - reinterpret_cast(header_buffer->data()); + reinterpret_cast(header_buffer->data()); auto const ender_buffer = source->host_read(len - ender_len, ender_len); - auto const ender = reinterpret_cast(ender_buffer->data()); + auto const ender = + reinterpret_cast(ender_buffer->data()); // checks for valid header, footer, and file length ASSERT_GT(len, header_len + ender_len); - ASSERT_TRUE(header->magic == cudf::io::parquet::parquet_magic && - ender->magic == cudf::io::parquet::parquet_magic); + ASSERT_TRUE(header->magic == cudf::io::parquet::detail::parquet_magic && + ender->magic == cudf::io::parquet::detail::parquet_magic); ASSERT_TRUE(ender->footer_len != 0 && ender->footer_len <= (len - header_len - ender_len)); // parquet files end with 4-byte footer_length and 4-byte magic == "PAR1" // seek backwards from the end of the file (footer_length + 8 bytes of ender) auto const footer_buffer = source->host_read(len - ender->footer_len - ender_len, ender->footer_len); - cudf::io::parquet::CompactProtocolReader cp(footer_buffer->data(), ender->footer_len); + cudf::io::parquet::detail::CompactProtocolReader cp(footer_buffer->data(), ender->footer_len); // returns true on success bool res = cp.read(file_meta_data); @@ -233,14 +234,14 @@ void read_footer(std::unique_ptr const& source, // this assumes the data is uncompressed. // throws cudf::logic_error if the page_loc data is invalid. int read_dict_bits(std::unique_ptr const& source, - cudf::io::parquet::PageLocation const& page_loc) + cudf::io::parquet::detail::PageLocation const& page_loc) { CUDF_EXPECTS(page_loc.offset > 0, "Cannot find page header"); CUDF_EXPECTS(page_loc.compressed_page_size > 0, "Invalid page header length"); - cudf::io::parquet::PageHeader page_hdr; + cudf::io::parquet::detail::PageHeader page_hdr; auto const page_buf = source->host_read(page_loc.offset, page_loc.compressed_page_size); - cudf::io::parquet::CompactProtocolReader cp(page_buf->data(), page_buf->size()); + cudf::io::parquet::detail::CompactProtocolReader cp(page_buf->data(), page_buf->size()); bool res = cp.read(&page_hdr); CUDF_EXPECTS(res, "Cannot parse page header"); @@ -252,15 +253,16 @@ int read_dict_bits(std::unique_ptr const& source, // read column index from datasource at location indicated by chunk, // parse and return as a ColumnIndex struct. // throws cudf::logic_error if the chunk data is invalid. 
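The index readers that follow repeat one shape already used by read_dict_bits above: slice (offset, length) out of the datasource, wrap the bytes in a CompactProtocolReader, and deserialize into a thrift-generated struct. A generic sketch of that shape, assuming the test file's existing includes (read_thrift_at is illustrative, not a cudf API):

```cpp
// Illustrative only: T is any struct CompactProtocolReader::read() accepts.
template <typename T>
T read_thrift_at(std::unique_ptr<cudf::io::datasource> const& source,
                 size_t offset,
                 size_t len)
{
  CUDF_EXPECTS(offset > 0, "Cannot find metadata block");
  CUDF_EXPECTS(len > 0, "Invalid metadata length");
  auto const buf = source->host_read(offset, len);
  cudf::io::parquet::detail::CompactProtocolReader cp(buf->data(), buf->size());
  T out;
  CUDF_EXPECTS(cp.read(&out), "Cannot parse metadata block");
  return out;
}
```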
-cudf::io::parquet::ColumnIndex read_column_index( - std::unique_ptr const& source, cudf::io::parquet::ColumnChunk const& chunk) +cudf::io::parquet::detail::ColumnIndex read_column_index( + std::unique_ptr const& source, + cudf::io::parquet::detail::ColumnChunk const& chunk) { CUDF_EXPECTS(chunk.column_index_offset > 0, "Cannot find column index"); CUDF_EXPECTS(chunk.column_index_length > 0, "Invalid column index length"); - cudf::io::parquet::ColumnIndex colidx; + cudf::io::parquet::detail::ColumnIndex colidx; auto const ci_buf = source->host_read(chunk.column_index_offset, chunk.column_index_length); - cudf::io::parquet::CompactProtocolReader cp(ci_buf->data(), ci_buf->size()); + cudf::io::parquet::detail::CompactProtocolReader cp(ci_buf->data(), ci_buf->size()); bool res = cp.read(&colidx); CUDF_EXPECTS(res, "Cannot parse column index"); return colidx; @@ -269,22 +271,24 @@ cudf::io::parquet::ColumnIndex read_column_index( // read offset index from datasource at location indicated by chunk, // parse and return as an OffsetIndex struct. // throws cudf::logic_error if the chunk data is invalid. -cudf::io::parquet::OffsetIndex read_offset_index( - std::unique_ptr const& source, cudf::io::parquet::ColumnChunk const& chunk) +cudf::io::parquet::detail::OffsetIndex read_offset_index( + std::unique_ptr const& source, + cudf::io::parquet::detail::ColumnChunk const& chunk) { CUDF_EXPECTS(chunk.offset_index_offset > 0, "Cannot find offset index"); CUDF_EXPECTS(chunk.offset_index_length > 0, "Invalid offset index length"); - cudf::io::parquet::OffsetIndex offidx; + cudf::io::parquet::detail::OffsetIndex offidx; auto const oi_buf = source->host_read(chunk.offset_index_offset, chunk.offset_index_length); - cudf::io::parquet::CompactProtocolReader cp(oi_buf->data(), oi_buf->size()); + cudf::io::parquet::detail::CompactProtocolReader cp(oi_buf->data(), oi_buf->size()); bool res = cp.read(&offidx); CUDF_EXPECTS(res, "Cannot parse offset index"); return offidx; } // Return as a Statistics from the column chunk -cudf::io::parquet::Statistics const& get_statistics(cudf::io::parquet::ColumnChunk const& chunk) +cudf::io::parquet::detail::Statistics const& get_statistics( + cudf::io::parquet::detail::ColumnChunk const& chunk) { return chunk.meta_data.statistics; } @@ -292,15 +296,16 @@ cudf::io::parquet::Statistics const& get_statistics(cudf::io::parquet::ColumnChu // read page header from datasource at location indicated by page_loc, // parse and return as a PageHeader struct. // throws cudf::logic_error if the page_loc data is invalid. 
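// Note: callers below pass {offset, sizeof(PageHeader), 0} as the page
// location; the thrift-encoded header is assumed to be no larger than the
// in-memory struct, so sizeof(PageHeader) serves as a conservative read length.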
-cudf::io::parquet::PageHeader read_page_header(std::unique_ptr const& source, - cudf::io::parquet::PageLocation const& page_loc) +cudf::io::parquet::detail::PageHeader read_page_header( + std::unique_ptr const& source, + cudf::io::parquet::detail::PageLocation const& page_loc) { CUDF_EXPECTS(page_loc.offset > 0, "Cannot find page header"); CUDF_EXPECTS(page_loc.compressed_page_size > 0, "Invalid page header length"); - cudf::io::parquet::PageHeader page_hdr; + cudf::io::parquet::detail::PageHeader page_hdr; auto const page_buf = source->host_read(page_loc.offset, page_loc.compressed_page_size); - cudf::io::parquet::CompactProtocolReader cp(page_buf->data(), page_buf->size()); + cudf::io::parquet::detail::CompactProtocolReader cp(page_buf->data(), page_buf->size()); bool res = cp.read(&page_hdr); CUDF_EXPECTS(res, "Cannot parse page header"); return page_hdr; @@ -3686,7 +3691,7 @@ TEST_F(ParquetWriterTest, CheckPageRows) // check first page header and make sure it has only page_rows values auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); ASSERT_GT(fmd.row_groups.size(), 0); @@ -3697,7 +3702,7 @@ TEST_F(ParquetWriterTest, CheckPageRows) // read first data page header. sizeof(PageHeader) is not exact, but the thrift encoded // version should be smaller than size of the struct. auto const ph = read_page_header( - source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::PageHeader), 0}); + source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::detail::PageHeader), 0}); EXPECT_EQ(ph.data_page_header.num_values, page_rows); } @@ -3722,7 +3727,7 @@ TEST_F(ParquetWriterTest, CheckPageRowsAdjusted) // check first page header and make sure it has only page_rows values auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); ASSERT_GT(fmd.row_groups.size(), 0); @@ -3733,7 +3738,7 @@ TEST_F(ParquetWriterTest, CheckPageRowsAdjusted) // read first data page header. sizeof(PageHeader) is not exact, but the thrift encoded // version should be smaller than size of the struct. auto const ph = read_page_header( - source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::PageHeader), 0}); + source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::detail::PageHeader), 0}); EXPECT_LE(ph.data_page_header.num_values, rows_per_page); } @@ -3759,7 +3764,7 @@ TEST_F(ParquetWriterTest, CheckPageRowsTooSmall) // check that file is written correctly when rows/page < fragment size auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); ASSERT_TRUE(fmd.row_groups.size() > 0); @@ -3770,7 +3775,7 @@ TEST_F(ParquetWriterTest, CheckPageRowsTooSmall) // read first data page header. sizeof(PageHeader) is not exact, but the thrift encoded // version should be smaller than size of the struct. 
auto const ph = read_page_header( - source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::PageHeader), 0}); + source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::detail::PageHeader), 0}); // there should be only one page since the fragment size is larger than rows_per_page EXPECT_EQ(ph.data_page_header.num_values, num_rows); @@ -3798,7 +3803,7 @@ TEST_F(ParquetWriterTest, Decimal128Stats) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4031,7 +4036,7 @@ TYPED_TEST(ParquetWriterComparableTypeTest, ThreeColumnSorted) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); ASSERT_GT(fmd.row_groups.size(), 0); @@ -4041,10 +4046,10 @@ TYPED_TEST(ParquetWriterComparableTypeTest, ThreeColumnSorted) // now check that the boundary order for chunk 1 is ascending, // chunk 2 is descending, and chunk 3 is unordered - cudf::io::parquet::BoundaryOrder expected_orders[] = { - cudf::io::parquet::BoundaryOrder::ASCENDING, - cudf::io::parquet::BoundaryOrder::DESCENDING, - cudf::io::parquet::BoundaryOrder::UNORDERED}; + cudf::io::parquet::detail::BoundaryOrder expected_orders[] = { + cudf::io::parquet::detail::BoundaryOrder::ASCENDING, + cudf::io::parquet::detail::BoundaryOrder::DESCENDING, + cudf::io::parquet::detail::BoundaryOrder::UNORDERED}; for (std::size_t i = 0; i < columns.size(); i++) { auto const ci = read_column_index(source, columns[i]); @@ -4067,15 +4072,15 @@ int32_t compare(T& v1, T& v2) // 1 if v1 > v2. int32_t compare_binary(std::vector const& v1, std::vector const& v2, - cudf::io::parquet::Type ptype, - cudf::io::parquet::ConvertedType ctype) + cudf::io::parquet::detail::Type ptype, + cudf::io::parquet::detail::ConvertedType ctype) { switch (ptype) { - case cudf::io::parquet::INT32: + case cudf::io::parquet::detail::INT32: switch (ctype) { - case cudf::io::parquet::UINT_8: - case cudf::io::parquet::UINT_16: - case cudf::io::parquet::UINT_32: + case cudf::io::parquet::detail::UINT_8: + case cudf::io::parquet::detail::UINT_16: + case cudf::io::parquet::detail::UINT_32: return compare(*(reinterpret_cast(v1.data())), *(reinterpret_cast(v2.data()))); default: @@ -4083,23 +4088,23 @@ int32_t compare_binary(std::vector const& v1, *(reinterpret_cast(v2.data()))); } - case cudf::io::parquet::INT64: - if (ctype == cudf::io::parquet::UINT_64) { + case cudf::io::parquet::detail::INT64: + if (ctype == cudf::io::parquet::detail::UINT_64) { return compare(*(reinterpret_cast(v1.data())), *(reinterpret_cast(v2.data()))); } return compare(*(reinterpret_cast(v1.data())), *(reinterpret_cast(v2.data()))); - case cudf::io::parquet::FLOAT: + case cudf::io::parquet::detail::FLOAT: return compare(*(reinterpret_cast(v1.data())), *(reinterpret_cast(v2.data()))); - case cudf::io::parquet::DOUBLE: + case cudf::io::parquet::detail::DOUBLE: return compare(*(reinterpret_cast(v1.data())), *(reinterpret_cast(v2.data()))); - case cudf::io::parquet::BYTE_ARRAY: { + case cudf::io::parquet::detail::BYTE_ARRAY: { int32_t v1sz = v1.size(); int32_t v2sz = v2.size(); int32_t ret = memcmp(v1.data(), v2.data(), std::min(v1sz, v2sz)); @@ -4142,7 +4147,7 @@ TEST_P(ParquetV2Test, LargeColumnIndex) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - 
cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4164,10 +4169,10 @@ TEST_P(ParquetV2Test, LargeColumnIndex) TEST_P(ParquetV2Test, CheckColumnOffsetIndex) { - constexpr auto num_rows = 100000; - auto const is_v2 = GetParam(); - auto const expected_hdr_type = - is_v2 ? cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE; + constexpr auto num_rows = 100000; + auto const is_v2 = GetParam(); + auto const expected_hdr_type = is_v2 ? cudf::io::parquet::detail::PageType::DATA_PAGE_V2 + : cudf::io::parquet::detail::PageType::DATA_PAGE; // fixed length strings auto str1_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { @@ -4210,7 +4215,7 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndex) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4255,10 +4260,10 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndex) TEST_P(ParquetV2Test, CheckColumnOffsetIndexNulls) { - constexpr auto num_rows = 100000; - auto const is_v2 = GetParam(); - auto const expected_hdr_type = - is_v2 ? cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE; + constexpr auto num_rows = 100000; + auto const is_v2 = GetParam(); + auto const expected_hdr_type = is_v2 ? cudf::io::parquet::detail::PageType::DATA_PAGE_V2 + : cudf::io::parquet::detail::PageType::DATA_PAGE; // fixed length strings auto str1_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { @@ -4311,7 +4316,7 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNulls) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4362,10 +4367,10 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNulls) TEST_P(ParquetV2Test, CheckColumnOffsetIndexNullColumn) { - constexpr auto num_rows = 100000; - auto const is_v2 = GetParam(); - auto const expected_hdr_type = - is_v2 ? cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE; + constexpr auto num_rows = 100000; + auto const is_v2 = GetParam(); + auto const expected_hdr_type = is_v2 ? cudf::io::parquet::detail::PageType::DATA_PAGE_V2 + : cudf::io::parquet::detail::PageType::DATA_PAGE; // fixed length strings auto str1_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { @@ -4403,7 +4408,7 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNullColumn) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4458,9 +4463,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNullColumn) TEST_P(ParquetV2Test, CheckColumnOffsetIndexStruct) { - auto const is_v2 = GetParam(); - auto const expected_hdr_type = - is_v2 ? cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE; + auto const is_v2 = GetParam(); + auto const expected_hdr_type = is_v2 ? 
cudf::io::parquet::detail::PageType::DATA_PAGE_V2 + : cudf::io::parquet::detail::PageType::DATA_PAGE; auto c0 = testdata::ascending(); @@ -4495,7 +4500,7 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexStruct) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4542,9 +4547,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexStruct) TEST_P(ParquetV2Test, CheckColumnOffsetIndexStructNulls) { - auto const is_v2 = GetParam(); - auto const expected_hdr_type = - is_v2 ? cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE; + auto const is_v2 = GetParam(); + auto const expected_hdr_type = is_v2 ? cudf::io::parquet::detail::PageType::DATA_PAGE_V2 + : cudf::io::parquet::detail::PageType::DATA_PAGE; auto validity2 = cudf::detail::make_counting_transform_iterator(0, [](cudf::size_type i) { return i % 2; }); @@ -4586,7 +4591,7 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexStructNulls) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4616,9 +4621,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexStructNulls) TEST_P(ParquetV2Test, CheckColumnIndexListWithNulls) { - auto const is_v2 = GetParam(); - auto const expected_hdr_type = - is_v2 ? cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE; + auto const is_v2 = GetParam(); + auto const expected_hdr_type = is_v2 ? cudf::io::parquet::detail::PageType::DATA_PAGE_V2 + : cudf::io::parquet::detail::PageType::DATA_PAGE; using cudf::test::iterators::null_at; using cudf::test::iterators::nulls_at; @@ -4711,7 +4716,7 @@ TEST_P(ParquetV2Test, CheckColumnIndexListWithNulls) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4812,7 +4817,7 @@ TEST_F(ParquetWriterTest, CheckColumnIndexTruncation) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4870,7 +4875,7 @@ TEST_F(ParquetWriterTest, BinaryColumnIndexTruncation) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -5030,10 +5035,10 @@ TEST_F(ParquetReaderTest, NestedByteArray) cudf::io::write_parquet(out_opts); auto source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); - EXPECT_EQ(fmd.schema[5].type, cudf::io::parquet::Type::BYTE_ARRAY); + EXPECT_EQ(fmd.schema[5].type, cudf::io::parquet::detail::Type::BYTE_ARRAY); std::vector md{ {}, @@ -5081,12 +5086,12 @@ TEST_F(ParquetWriterTest, ByteArrayStats) auto result = cudf::io::read_parquet(in_opts); auto source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); - EXPECT_EQ(fmd.schema[1].type, cudf::io::parquet::Type::BYTE_ARRAY); - EXPECT_EQ(fmd.schema[2].type, cudf::io::parquet::Type::BYTE_ARRAY); + 
EXPECT_EQ(fmd.schema[1].type, cudf::io::parquet::detail::Type::BYTE_ARRAY); + EXPECT_EQ(fmd.schema[2].type, cudf::io::parquet::detail::Type::BYTE_ARRAY); auto const stats0 = get_statistics(fmd.row_groups[0].columns[0]); auto const stats1 = get_statistics(fmd.row_groups[0].columns[1]); @@ -5137,9 +5142,9 @@ TEST_F(ParquetReaderTest, StructByteArray) TEST_F(ParquetReaderTest, NestingOptimizationTest) { - // test nesting levels > cudf::io::parquet::gpu::max_cacheable_nesting_decode_info deep. + // test nesting levels > cudf::io::parquet::detail::max_cacheable_nesting_decode_info deep. constexpr cudf::size_type num_nesting_levels = 16; - static_assert(num_nesting_levels > cudf::io::parquet::gpu::max_cacheable_nesting_decode_info); + static_assert(num_nesting_levels > cudf::io::parquet::detail::max_cacheable_nesting_decode_info); constexpr cudf::size_type rows_per_level = 2; constexpr cudf::size_type num_values = (1 << num_nesting_levels) * rows_per_level; @@ -5206,13 +5211,13 @@ TEST_F(ParquetWriterTest, SingleValueDictionaryTest) // make sure dictionary was used auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); auto used_dict = [&fmd]() { for (auto enc : fmd.row_groups[0].columns[0].meta_data.encodings) { - if (enc == cudf::io::parquet::Encoding::PLAIN_DICTIONARY or - enc == cudf::io::parquet::Encoding::RLE_DICTIONARY) { + if (enc == cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY or + enc == cudf::io::parquet::detail::Encoding::RLE_DICTIONARY) { return true; } } @@ -5252,13 +5257,13 @@ TEST_F(ParquetWriterTest, DictionaryNeverTest) // make sure dictionary was not used auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); auto used_dict = [&fmd]() { for (auto enc : fmd.row_groups[0].columns[0].meta_data.encodings) { - if (enc == cudf::io::parquet::Encoding::PLAIN_DICTIONARY or - enc == cudf::io::parquet::Encoding::RLE_DICTIONARY) { + if (enc == cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY or + enc == cudf::io::parquet::detail::Encoding::RLE_DICTIONARY) { return true; } } @@ -5303,13 +5308,13 @@ TEST_F(ParquetWriterTest, DictionaryAdaptiveTest) // make sure dictionary was used as expected. col0 should use one, // col1 should not. 
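  // (dictionary use is detected from the chunk's advertised encodings: a chunk
  // that used a dictionary lists PLAIN_DICTIONARY or RLE_DICTIONARY, which is
  // exactly what the used_dict lambda below checks)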
auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); auto used_dict = [&fmd](int col) { for (auto enc : fmd.row_groups[0].columns[col].meta_data.encodings) { - if (enc == cudf::io::parquet::Encoding::PLAIN_DICTIONARY or - enc == cudf::io::parquet::Encoding::RLE_DICTIONARY) { + if (enc == cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY or + enc == cudf::io::parquet::detail::Encoding::RLE_DICTIONARY) { return true; } } @@ -5354,13 +5359,13 @@ TEST_F(ParquetWriterTest, DictionaryAlwaysTest) // make sure dictionary was used for both columns auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); auto used_dict = [&fmd](int col) { for (auto enc : fmd.row_groups[0].columns[col].meta_data.encodings) { - if (enc == cudf::io::parquet::Encoding::PLAIN_DICTIONARY or - enc == cudf::io::parquet::Encoding::RLE_DICTIONARY) { + if (enc == cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY or + enc == cudf::io::parquet::detail::Encoding::RLE_DICTIONARY) { return true; } } @@ -5438,13 +5443,13 @@ TEST_P(ParquetSizedTest, DictionaryTest) // make sure dictionary was used auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); auto used_dict = [&fmd]() { for (auto enc : fmd.row_groups[0].columns[0].meta_data.encodings) { - if (enc == cudf::io::parquet::Encoding::PLAIN_DICTIONARY or - enc == cudf::io::parquet::Encoding::RLE_DICTIONARY) { + if (enc == cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY or + enc == cudf::io::parquet::detail::Encoding::RLE_DICTIONARY) { return true; } } @@ -6664,7 +6669,7 @@ TEST_F(ParquetWriterTest, PreserveNullability) TEST_P(ParquetV2Test, CheckEncodings) { - using cudf::io::parquet::Encoding; + using cudf::io::parquet::detail::Encoding; constexpr auto num_rows = 100'000; auto const is_v2 = GetParam(); @@ -6697,7 +6702,7 @@ TEST_P(ParquetV2Test, CheckEncodings) }; auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); auto const& chunk0_enc = fmd.row_groups[0].columns[0].meta_data.encodings; From cb74b7e1c9307eb262ece26a626763d504a9d577 Mon Sep 17 00:00:00 2001 From: db Date: Fri, 6 Oct 2023 11:16:52 -0500 Subject: [PATCH 02/49] Remove reader_impl_chunking.cu, which was accidentally included. --- cpp/src/io/parquet/reader_impl_chunking.cu | 597 --------------------- 1 file changed, 597 deletions(-) delete mode 100644 cpp/src/io/parquet/reader_impl_chunking.cu diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu deleted file mode 100644 index 2c1521e46db..00000000000 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ /dev/null @@ -1,597 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "reader_impl.hpp" - -#include -#include - -#include - -#include - -#include -#include -#include -#include - -namespace cudf::io::parquet::detail { - -namespace { - -struct cumulative_row_info { - size_t row_count; // cumulative row count - size_t size_bytes; // cumulative size in bytes - int key; // schema index -}; - -#if defined(PREPROCESS_DEBUG) -void print_cumulative_page_info(cudf::detail::hostdevice_vector& pages, - rmm::device_uvector const& page_index, - rmm::device_uvector const& c_info, - rmm::cuda_stream_view stream) -{ - pages.device_to_host_sync(stream); - - printf("------------\nCumulative sizes by page\n"); - - std::vector schemas(pages.size()); - std::vector h_page_index(pages.size()); - CUDF_CUDA_TRY(cudaMemcpy( - h_page_index.data(), page_index.data(), sizeof(int) * pages.size(), cudaMemcpyDefault)); - std::vector h_cinfo(pages.size()); - CUDF_CUDA_TRY(cudaMemcpy( - h_cinfo.data(), c_info.data(), sizeof(cumulative_row_info) * pages.size(), cudaMemcpyDefault)); - auto schema_iter = cudf::detail::make_counting_transform_iterator( - 0, [&](size_type i) { return pages[h_page_index[i]].src_col_schema; }); - thrust::copy(thrust::seq, schema_iter, schema_iter + pages.size(), schemas.begin()); - auto last = thrust::unique(thrust::seq, schemas.begin(), schemas.end()); - schemas.resize(last - schemas.begin()); - printf("Num schemas: %lu\n", schemas.size()); - - for (size_t idx = 0; idx < schemas.size(); idx++) { - printf("Schema %d\n", schemas[idx]); - for (size_t pidx = 0; pidx < pages.size(); pidx++) { - auto const& page = pages[h_page_index[pidx]]; - if (page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY || page.src_col_schema != schemas[idx]) { - continue; - } - printf("\tP: {%lu, %lu}\n", h_cinfo[pidx].row_count, h_cinfo[pidx].size_bytes); - } - } -} - -void print_cumulative_row_info( - host_span sizes, - std::string const& label, - std::optional> splits = std::nullopt) -{ - if (splits.has_value()) { - printf("------------\nSplits\n"); - for (size_t idx = 0; idx < splits->size(); idx++) { - printf("{%lu, %lu}\n", splits.value()[idx].skip_rows, splits.value()[idx].num_rows); - } - } - - printf("------------\nCumulative sizes %s\n", label.c_str()); - for (size_t idx = 0; idx < sizes.size(); idx++) { - printf("{%lu, %lu, %d}", sizes[idx].row_count, sizes[idx].size_bytes, sizes[idx].key); - if (splits.has_value()) { - // if we have a split at this row count and this is the last instance of this row count - auto start = thrust::make_transform_iterator( - splits->begin(), [](gpu::chunk_read_info const& i) { return i.skip_rows; }); - auto end = start + splits->size(); - auto split = std::find(start, end, sizes[idx].row_count); - auto const split_index = [&]() -> int { - if (split != end && - ((idx == sizes.size() - 1) || (sizes[idx + 1].row_count > sizes[idx].row_count))) { - return static_cast(std::distance(start, split)); - } - return idx == 0 ? 0 : -1; - }(); - if (split_index >= 0) { - printf(" <-- split {%lu, %lu}", - splits.value()[split_index].skip_rows, - splits.value()[split_index].num_rows); - } - } - printf("\n"); - } -} -#endif // PREPROCESS_DEBUG - -/** - * @brief Functor which reduces two cumulative_row_info structs of the same key. 
- */ -struct cumulative_row_sum { - cumulative_row_info operator() - __device__(cumulative_row_info const& a, cumulative_row_info const& b) const - { - return cumulative_row_info{a.row_count + b.row_count, a.size_bytes + b.size_bytes, a.key}; - } -}; - -/** - * @brief Functor which computes the total data size for a given type of cudf column. - * - * In the case of strings, the return size does not include the chars themselves. That - * information is tracked separately (see PageInfo::str_bytes). - */ -struct row_size_functor { - __device__ size_t validity_size(size_t num_rows, bool nullable) - { - return nullable ? (cudf::util::div_rounding_up_safe(num_rows, size_t{32}) * 4) : 0; - } - - template - __device__ size_t operator()(size_t num_rows, bool nullable) - { - auto const element_size = sizeof(device_storage_type_t); - return (element_size * num_rows) + validity_size(num_rows, nullable); - } -}; - -template <> -__device__ size_t row_size_functor::operator()(size_t num_rows, bool nullable) -{ - auto const offset_size = sizeof(size_type); - // NOTE: Adding the + 1 offset here isn't strictly correct. There will only be 1 extra offset - // for the entire column, whereas this is adding an extra offset per page. So we will get a - // small over-estimate of the real size of the order : # of pages * 4 bytes. It seems better - // to overestimate size somewhat than to underestimate it and potentially generate chunks - // that are too large. - return (offset_size * (num_rows + 1)) + validity_size(num_rows, nullable); -} - -template <> -__device__ size_t row_size_functor::operator()(size_t num_rows, bool nullable) -{ - return validity_size(num_rows, nullable); -} - -template <> -__device__ size_t row_size_functor::operator()(size_t num_rows, bool nullable) -{ - // only returns the size of offsets and validity. the size of the actual string chars - // is tracked separately. - auto const offset_size = sizeof(size_type); - // see note about offsets in the list_view template. - return (offset_size * (num_rows + 1)) + validity_size(num_rows, nullable); -} - -/** - * @brief Functor which computes the total output cudf data size for all of - * the data in this page. - * - * Sums across all nesting levels. - */ -struct get_cumulative_row_info { - gpu::PageInfo const* const pages; - - __device__ cumulative_row_info operator()(size_type index) - { - auto const& page = pages[index]; - if (page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY) { - return cumulative_row_info{0, 0, page.src_col_schema}; - } - - // total nested size, not counting string data - auto iter = - cudf::detail::make_counting_transform_iterator(0, [page, index] __device__(size_type i) { - auto const& pni = page.nesting[i]; - return cudf::type_dispatcher( - data_type{pni.type}, row_size_functor{}, pni.size, pni.nullable); - }); - - size_t const row_count = static_cast(page.nesting[0].size); - return { - row_count, - thrust::reduce(thrust::seq, iter, iter + page.num_output_nesting_levels) + page.str_bytes, - page.src_col_schema}; - } -}; - -/** - * @brief Functor which computes the effective size of all input columns by page. - * - * For a given row, we want to find the cost of all pages for all columns involved - * in loading up to that row. The complication here is that not all pages are the - * same size between columns. 
Example: - * - * page row counts - * Column A: 0 <----> 100 <----> 200 - * Column B: 0 <---------------> 200 <--------> 400 - | - * if we decide to split at row 100, we don't really know the actual amount of bytes in column B - * at that point. So we have to proceed as if we are taking the bytes from all 200 rows of that - * page. Essentially, a conservative over-estimate of the real size. - */ -struct row_total_size { - cumulative_row_info const* c_info; - size_type const* key_offsets; - size_t num_keys; - - __device__ cumulative_row_info operator()(cumulative_row_info const& i) - { - // sum sizes for each input column at this row - size_t sum = 0; - for (int idx = 0; idx < num_keys; idx++) { - auto const start = key_offsets[idx]; - auto const end = key_offsets[idx + 1]; - auto iter = cudf::detail::make_counting_transform_iterator( - 0, [&] __device__(size_type i) { return c_info[i].row_count; }); - auto const page_index = - thrust::lower_bound(thrust::seq, iter + start, iter + end, i.row_count) - iter; - sum += c_info[page_index].size_bytes; - } - return {i.row_count, sum, i.key}; - } -}; - -/** - * @brief Given a vector of cumulative {row_count, byte_size} pairs and a chunk read - * limit, determine the set of splits. - * - * @param sizes Vector of cumulative {row_count, byte_size} pairs - * @param num_rows Total number of rows to read - * @param chunk_read_limit Limit on total number of bytes to be returned per read, for all columns - */ -std::vector find_splits(std::vector const& sizes, - size_t num_rows, - size_t chunk_read_limit) -{ - // now we have an array of {row_count, real output bytes}. just walk through it and generate - // splits. - // TODO: come up with a clever way to do this entirely in parallel. For now, as long as batch - // sizes are reasonably large, this shouldn't iterate too many times - std::vector splits; - { - size_t cur_pos = 0; - size_t cur_cumulative_size = 0; - size_t cur_row_count = 0; - auto start = thrust::make_transform_iterator(sizes.begin(), [&](cumulative_row_info const& i) { - return i.size_bytes - cur_cumulative_size; - }); - auto end = start + sizes.size(); - while (cur_row_count < num_rows) { - int64_t split_pos = - thrust::lower_bound(thrust::seq, start + cur_pos, end, chunk_read_limit) - start; - - // if we're past the end, or if the returned bucket is > than the chunk_read_limit, move back - // one. - if (static_cast(split_pos) >= sizes.size() || - (sizes[split_pos].size_bytes - cur_cumulative_size > chunk_read_limit)) { - split_pos--; - } - - // best-try. if we can't find something that'll fit, we have to go bigger. we're doing this in - // a loop because all of the cumulative sizes for all the pages are sorted into one big list. - // so if we had two columns, both of which had an entry {1000, 10000}, that entry would be in - // the list twice. so we have to iterate until we skip past all of them. The idea is that we - // either do this, or we have to call unique() on the input first. - while (split_pos < (static_cast(sizes.size()) - 1) && - (split_pos < 0 || sizes[split_pos].row_count == cur_row_count)) { - split_pos++; - } - - auto const start_row = cur_row_count; - cur_row_count = sizes[split_pos].row_count; - splits.push_back(gpu::chunk_read_info{start_row, cur_row_count - start_row}); - cur_pos = split_pos; - cur_cumulative_size = sizes[split_pos].size_bytes; - } - } - // print_cumulative_row_info(sizes, "adjusted", splits); - - return splits; -} - -/** - * @brief Converts cuDF units to Parquet units. 
- * - * @return A tuple of Parquet type width, Parquet clock rate and Parquet decimal type. - */ -[[nodiscard]] std::tuple conversion_info(type_id column_type_id, - type_id timestamp_type_id, - parquet::Type physical, - int8_t converted, - int32_t length) -{ - int32_t type_width = (physical == parquet::FIXED_LEN_BYTE_ARRAY) ? length : 0; - int32_t clock_rate = 0; - if (column_type_id == type_id::INT8 or column_type_id == type_id::UINT8) { - type_width = 1; // I32 -> I8 - } else if (column_type_id == type_id::INT16 or column_type_id == type_id::UINT16) { - type_width = 2; // I32 -> I16 - } else if (column_type_id == type_id::INT32) { - type_width = 4; // str -> hash32 - } else if (is_chrono(data_type{column_type_id})) { - clock_rate = to_clockrate(timestamp_type_id); - } - - int8_t converted_type = converted; - if (converted_type == parquet::DECIMAL && column_type_id != type_id::FLOAT64 && - not cudf::is_fixed_point(data_type{column_type_id})) { - converted_type = parquet::UNKNOWN; // Not converting to float64 or decimal - } - return std::make_tuple(type_width, clock_rate, converted_type); -} - -/** - * @brief Return the required number of bits to store a value. - */ -template -[[nodiscard]] T required_bits(uint32_t max_level) -{ - return static_cast(CompactProtocolReader::NumRequiredBits(max_level)); -} - -struct row_count_compare { - __device__ bool operator()(cumulative_row_info const& a, cumulative_row_info const& b) - { - return a.row_count < b.row_count; - } -}; - -} // anonymous namespace - -void reader::impl::create_global_chunk_info() -{ - auto const num_rows = _file_itm_data.global_num_rows; - auto const& row_groups_info = _file_itm_data.row_groups; - auto& chunks = _file_itm_data.chunks; - - // Descriptors for all the chunks that make up the selected columns - auto const num_input_columns = _input_columns.size(); - auto const num_chunks = row_groups_info.size() * num_input_columns; - - // Initialize column chunk information - auto remaining_rows = num_rows; - for (auto const& rg : row_groups_info) { - auto const& row_group = _metadata->get_row_group(rg.index, rg.source_index); - auto const row_group_start = rg.start_row; - auto const row_group_rows = std::min(remaining_rows, row_group.num_rows); - - // generate ColumnChunkDesc objects for everything to be decoded (all input columns) - for (size_t i = 0; i < num_input_columns; ++i) { - auto col = _input_columns[i]; - // look up metadata - auto& col_meta = _metadata->get_column_metadata(rg.index, rg.source_index, col.schema_idx); - auto& schema = _metadata->get_schema(col.schema_idx); - - auto [type_width, clock_rate, converted_type] = - conversion_info(to_type_id(schema, _strings_to_categorical, _timestamp_type.id()), - _timestamp_type.id(), - schema.type, - schema.converted_type, - schema.type_length); - - chunks.push_back(gpu::ColumnChunkDesc(col_meta.total_compressed_size, - nullptr, - col_meta.num_values, - schema.type, - type_width, - row_group_start, - row_group_rows, - schema.max_definition_level, - schema.max_repetition_level, - _metadata->get_output_nesting_depth(col.schema_idx), - required_bits(schema.max_definition_level), - required_bits(schema.max_repetition_level), - col_meta.codec, - converted_type, - schema.logical_type, - schema.decimal_precision, - clock_rate, - i, - col.schema_idx)); - } - - remaining_rows -= row_group_rows; - } -} - -void reader::impl::compute_input_passes() -{ - // at this point, row_groups has already been filtered down to just the row groups we need to - // handle optional skip_rows/num_rows 
parameters. - auto const& row_groups_info = _file_itm_data.row_groups; - - // if the user hasn't specified an input size limit, read everything in a single pass. - if (_input_pass_read_limit == 0) { - _file_itm_data.input_pass_row_group_offsets.push_back(0); - _file_itm_data.input_pass_row_group_offsets.push_back(row_groups_info.size()); - return; - } - - // generate passes. make sure to account for the case where a single row group doesn't fit within - // - std::size_t const read_limit = - _input_pass_read_limit > 0 ? _input_pass_read_limit : std::numeric_limits::max(); - std::size_t cur_pass_byte_size = 0; - std::size_t cur_rg_start = 0; - std::size_t cur_row_count = 0; - _file_itm_data.input_pass_row_group_offsets.push_back(0); - _file_itm_data.input_pass_row_count.push_back(0); - - for (size_t cur_rg_index = 0; cur_rg_index < row_groups_info.size(); cur_rg_index++) { - auto const& rgi = row_groups_info[cur_rg_index]; - auto const& row_group = _metadata->get_row_group(rgi.index, rgi.source_index); - - // can we add this row group - if (cur_pass_byte_size + row_group.total_byte_size >= read_limit) { - // A single row group (the current one) is larger than the read limit: - // We always need to include at least one row group, so end the pass at the end of the current - // row group - if (cur_rg_start == cur_rg_index) { - _file_itm_data.input_pass_row_group_offsets.push_back(cur_rg_index + 1); - _file_itm_data.input_pass_row_count.push_back(cur_row_count + row_group.num_rows); - cur_rg_start = cur_rg_index + 1; - cur_pass_byte_size = 0; - } - // End the pass at the end of the previous row group - else { - _file_itm_data.input_pass_row_group_offsets.push_back(cur_rg_index); - _file_itm_data.input_pass_row_count.push_back(cur_row_count); - cur_rg_start = cur_rg_index; - cur_pass_byte_size = row_group.total_byte_size; - } - } else { - cur_pass_byte_size += row_group.total_byte_size; - } - cur_row_count += row_group.num_rows; - } - // add the last pass if necessary - if (_file_itm_data.input_pass_row_group_offsets.back() != row_groups_info.size()) { - _file_itm_data.input_pass_row_group_offsets.push_back(row_groups_info.size()); - _file_itm_data.input_pass_row_count.push_back(cur_row_count); - } -} - -void reader::impl::setup_next_pass() -{ - // this will also cause the previous pass information to be deleted - _pass_itm_data = std::make_unique(); - - // setup row groups to be loaded for this pass - auto const row_group_start = _file_itm_data.input_pass_row_group_offsets[_current_input_pass]; - auto const row_group_end = _file_itm_data.input_pass_row_group_offsets[_current_input_pass + 1]; - auto const num_row_groups = row_group_end - row_group_start; - _pass_itm_data->row_groups.resize(num_row_groups); - std::copy(_file_itm_data.row_groups.begin() + row_group_start, - _file_itm_data.row_groups.begin() + row_group_end, - _pass_itm_data->row_groups.begin()); - - auto const num_passes = _file_itm_data.input_pass_row_group_offsets.size() - 1; - CUDF_EXPECTS(_current_input_pass < num_passes, "Encountered an invalid read pass index"); - - auto const chunks_per_rowgroup = _input_columns.size(); - auto const num_chunks = chunks_per_rowgroup * num_row_groups; - - auto chunk_start = _file_itm_data.chunks.begin() + (row_group_start * chunks_per_rowgroup); - auto chunk_end = _file_itm_data.chunks.begin() + (row_group_end * chunks_per_rowgroup); - - _pass_itm_data->chunks = - cudf::detail::hostdevice_vector(num_chunks, _stream); - std::copy(chunk_start, chunk_end, _pass_itm_data->chunks.begin()); - - // 
adjust skip_rows and num_rows by what's available in the row groups we are processing - if (num_passes == 1) { - _pass_itm_data->skip_rows = _file_itm_data.global_skip_rows; - _pass_itm_data->num_rows = _file_itm_data.global_num_rows; - } else { - auto const global_start_row = _file_itm_data.global_skip_rows; - auto const global_end_row = global_start_row + _file_itm_data.global_num_rows; - auto const start_row = std::max(_file_itm_data.input_pass_row_count[_current_input_pass], global_start_row); - auto const end_row = std::min(_file_itm_data.input_pass_row_count[_current_input_pass + 1], global_end_row); - - // skip_rows is always global in the sense that it is relative to the first row of - // everything we will be reading, regardless of what pass we are on. - // num_rows is how many rows we are reading this pass. - _pass_itm_data->skip_rows = global_start_row + _file_itm_data.input_pass_row_count[_current_input_pass]; - _pass_itm_data->num_rows = end_row - start_row; - } -} - -void reader::impl::compute_splits_for_pass() -{ - auto const skip_rows = _pass_itm_data->skip_rows; - auto const num_rows = _pass_itm_data->num_rows; - - // simple case : no chunk size, no splits - if(_output_chunk_read_limit <= 0){ - _pass_itm_data->output_chunk_read_info = std::vector{{skip_rows, num_rows}}; - return; - } - - auto& pages = _pass_itm_data->pages_info; - - auto const& page_keys = _pass_itm_data->page_keys; - auto const& page_index = _pass_itm_data->page_index; - - // generate cumulative row counts and sizes - rmm::device_uvector c_info(page_keys.size(), _stream); - // convert PageInfo to cumulative_row_info - auto page_input = thrust::make_transform_iterator(page_index.begin(), - get_cumulative_row_info{pages.device_ptr()}); - thrust::inclusive_scan_by_key(rmm::exec_policy(_stream), - page_keys.begin(), - page_keys.end(), - page_input, - c_info.begin(), - thrust::equal_to{}, - cumulative_row_sum{}); - // print_cumulative_page_info(pages, page_index, c_info, stream); - - // sort by row count - rmm::device_uvector c_info_sorted{c_info, _stream}; - thrust::sort(rmm::exec_policy(_stream), - c_info_sorted.begin(), - c_info_sorted.end(), - row_count_compare{}); - - // std::vector h_c_info_sorted(c_info_sorted.size()); - // CUDF_CUDA_TRY(cudaMemcpy(h_c_info_sorted.data(), - // c_info_sorted.data(), - // sizeof(cumulative_row_info) * c_info_sorted.size(), - // cudaMemcpyDefault)); - // print_cumulative_row_info(h_c_info_sorted, "raw"); - - // generate key offsets (offsets to the start of each partition of keys). worst case is 1 page per - // key - rmm::device_uvector key_offsets(page_keys.size() + 1, _stream); - auto const key_offsets_end = thrust::reduce_by_key(rmm::exec_policy(_stream), - page_keys.begin(), - page_keys.end(), - thrust::make_constant_iterator(1), - thrust::make_discard_iterator(), - key_offsets.begin()) - .second; - size_t const num_unique_keys = key_offsets_end - key_offsets.begin(); - thrust::exclusive_scan( - rmm::exec_policy(_stream), key_offsets.begin(), key_offsets.end(), key_offsets.begin()); - - // adjust the cumulative info such that for each row count, the size includes any pages that span - // that row count. this is so that if we have this case: - // page row counts - // Column A: 0 <----> 100 <----> 200 - // Column B: 0 <---------------> 200 <--------> 400 - // | - // if we decide to split at row 100, we don't really know the actual amount of bytes in column B - // at that point. So we have to proceed as if we are taking the bytes from all 200 rows of that - // page. 
- // - rmm::device_uvector aggregated_info(c_info.size(), _stream); - thrust::transform(rmm::exec_policy(_stream), - c_info_sorted.begin(), - c_info_sorted.end(), - aggregated_info.begin(), - row_total_size{c_info.data(), key_offsets.data(), num_unique_keys}); - - // bring back to the cpu - std::vector h_aggregated_info(aggregated_info.size()); - CUDF_CUDA_TRY(cudaMemcpyAsync(h_aggregated_info.data(), - aggregated_info.data(), - sizeof(cumulative_row_info) * c_info.size(), - cudaMemcpyDefault, - _stream.value())); - _stream.synchronize(); - - // generate the actual splits - _pass_itm_data->output_chunk_read_info = find_splits(h_aggregated_info, num_rows, _output_chunk_read_limit); -} - -} // namespace cudf::io::parquet::detail From 227e1f08533ac0bab256d3a5f26a42e9fc0db11f Mon Sep 17 00:00:00 2001 From: db Date: Fri, 6 Oct 2023 16:37:03 -0500 Subject: [PATCH 03/49] Centralize all pass/chunk related code into reader_impl_chunking.cu --- cpp/CMakeLists.txt | 1 + cpp/src/io/parquet/parquet_gpu.hpp | 73 --- cpp/src/io/parquet/reader_impl.cpp | 12 +- cpp/src/io/parquet/reader_impl.hpp | 61 +- cpp/src/io/parquet/reader_impl_chunking.cu | 598 +++++++++++++++++++ cpp/src/io/parquet/reader_impl_chunking.hpp | 87 +++ cpp/src/io/parquet/reader_impl_helpers.hpp | 17 + cpp/src/io/parquet/reader_impl_preprocess.cu | 558 +---------------- 8 files changed, 755 insertions(+), 652 deletions(-) create mode 100644 cpp/src/io/parquet/reader_impl_chunking.cu create mode 100644 cpp/src/io/parquet/reader_impl_chunking.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 000f80065ab..f8b9762f1d4 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -401,6 +401,7 @@ add_library( src/io/parquet/predicate_pushdown.cpp src/io/parquet/reader.cpp src/io/parquet/reader_impl.cpp + src/io/parquet/reader_impl_chunking.cu src/io/parquet/reader_impl_helpers.cpp src/io/parquet/reader_impl_preprocess.cu src/io/parquet/writer_impl.cu diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 767668cc65e..6a93fec0c46 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -318,79 +318,6 @@ struct ColumnChunkDesc { int32_t src_col_schema{}; // my schema index in the file }; -/** - * @brief The row_group_info class - */ -struct row_group_info { - size_type index; // row group index within a file. aggregate_reader_metadata::get_row_group() is - // called with index and source_index - size_t start_row; - size_type source_index; // file index. - - row_group_info() = default; - - row_group_info(size_type index, size_t start_row, size_type source_index) - : index{index}, start_row{start_row}, source_index{source_index} - { - } -}; - -/** - * @brief Struct to store file-level data that remains constant for - * all passes/chunks for the file. - */ -struct file_intermediate_data { - // all row groups to read - std::vector row_groups{}; - - // all chunks from the selected row groups. We may end up reading these chunks progressively - // instead of all at once - std::vector chunks{}; - - // skip_rows/num_rows values for the entire file. these need to be adjusted per-pass because we - // may not be visiting every row group that contains these bounds - size_t global_skip_rows; - size_t global_num_rows; -}; - -/** - * @brief Structs to identify the reading row range for each chunk of rows in chunked reading. 
- */ -struct chunk_read_info { - size_t skip_rows; - size_t num_rows; -}; - -/** - * @brief Struct to store pass-level data that remains constant for a single pass. - */ -struct pass_intermediate_data { - std::vector> raw_page_data; - rmm::device_buffer decomp_page_data; - - // rowgroup, chunk and page information for the current pass. - std::vector row_groups{}; - cudf::detail::hostdevice_vector chunks{}; - cudf::detail::hostdevice_vector pages_info{}; - cudf::detail::hostdevice_vector page_nesting_info{}; - cudf::detail::hostdevice_vector page_nesting_decode_info{}; - - rmm::device_uvector page_keys{0, rmm::cuda_stream_default}; - rmm::device_uvector page_index{0, rmm::cuda_stream_default}; - rmm::device_uvector str_dict_index{0, rmm::cuda_stream_default}; - - std::vector output_chunk_read_info; - std::size_t current_output_chunk{0}; - - rmm::device_buffer level_decode_data{}; - int level_type_size{0}; - - // skip_rows and num_rows values for this particular pass. these may be adjusted values from the - // global values stored in file_intermediate_data. - size_t skip_rows; - size_t num_rows; -}; - /** * @brief Struct describing an encoder column */ diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 34aa4f2201f..44f9c160c25 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -349,14 +349,14 @@ void reader::impl::prepare_data(int64_t skip_rows, not _input_columns.empty()) { // fills in chunk information without physically loading or decompressing // the associated data - load_global_chunk_info(); + create_global_chunk_info(); // compute schedule of input reads. Each rowgroup contains 1 chunk per column. For now // we will read an entire row group at a time. However, it is possible to do // sub-rowgroup reads if we made some estimates on individual chunk sizes (tricky) and // changed the high level structure such that we weren't always reading an entire table's // worth of columns at once. - compute_input_pass_row_group_info(); + compute_input_passes(); } _file_preprocessed = true; @@ -364,7 +364,7 @@ void reader::impl::prepare_data(int64_t skip_rows, // if we have to start a new pass, do that now if (!_pass_preprocessed) { - auto const num_passes = _input_pass_row_group_offsets.size() - 1; + auto const num_passes = _file_itm_data.input_pass_row_group_offsets.size() - 1; // always create the pass struct, even if we end up with no passes. // this will also cause the previous pass information to be deleted @@ -373,7 +373,7 @@ void reader::impl::prepare_data(int64_t skip_rows, if (_file_itm_data.global_num_rows > 0 && not _file_itm_data.row_groups.empty() && not _input_columns.empty() && _current_input_pass < num_passes) { // setup the pass_intermediate_info for this pass. - setup_pass(); + setup_next_pass(); load_and_decompress_data(); preprocess_pages(uses_custom_row_bounds, _output_chunk_read_limit); @@ -541,8 +541,8 @@ bool reader::impl::has_next() {} /*row_group_indices, empty means read all row groups*/, std::nullopt /*filter*/); - auto const num_input_passes = - _input_pass_row_group_offsets.size() == 0 ? 
0 : _input_pass_row_group_offsets.size() - 1;
+  size_t const num_input_passes = std::max(
+    int64_t{0}, static_cast(_file_itm_data.input_pass_row_group_offsets.size()) - 1);
   return (_pass_itm_data->current_output_chunk < _pass_itm_data->output_chunk_read_info.size()) ||
          (_current_input_pass < num_input_passes);
 }
diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp
index 03990f1a1f3..22217b55411 100644
--- a/cpp/src/io/parquet/reader_impl.hpp
+++ b/cpp/src/io/parquet/reader_impl.hpp
@@ -22,6 +22,7 @@
 #pragma once
 
 #include "parquet_gpu.hpp"
+#include "reader_impl_chunking.hpp"
 #include "reader_impl_helpers.hpp"
 
 #include 
@@ -136,10 +137,6 @@ class reader::impl {
                     host_span const> row_group_indices,
                     std::optional> filter);
 
-  void load_global_chunk_info();
-  void compute_input_pass_row_group_info();
-  void setup_pass();
-
   /**
    * @brief Create chunk information and start file reads
    *
@@ -250,6 +247,39 @@ class reader::impl {
    */
   void decode_page_data(size_t skip_rows, size_t num_rows);
 
+  /*
+   *
+   *
+     Functions related to computing chunks and passes (reader_impl_chunking.cu)
+   *
+   *
+   */
+
+  /**
+   * @brief Creates file-wide parquet chunk information.
+   *
+   * Creates information about all chunks in the file, storing it in
+   * the file-wide _file_itm_data structure.
+   */
+  void create_global_chunk_info();
+
+  /**
+   * @brief Computes all of the passes we will perform over the file.
+   */
+  void compute_input_passes();
+
+  /**
+   * @brief Close out the existing pass (if any) and prepare for the next pass.
+   */
+  void setup_next_pass();
+
+  /**
+   * @brief Given a set of pages that have had their sizes computed by nesting level and
+   * a limit on total read size, generate a set of {skip_rows, num_rows} pairs representing
+   * a set of reads that will generate output columns of total size <= `chunk_read_limit` bytes.
+   */
+  void compute_splits_for_pass();
+
  private:
   rmm::cuda_stream_view _stream;
   rmm::mr::device_memory_resource* _mr = nullptr;
@@ -278,27 +308,24 @@ class reader::impl {
 
   // chunked reading happens in 2 parts:
   //
-  // At the top level there is the "pass" in which we try and limit the
+  // At the top level, the entire file is divided up into "passes" in which we try and limit the
   // total amount of temporary memory (compressed data, decompressed data) in use
   // via _input_pass_read_limit.
   //
   // Within a pass, we produce one or more chunks of output, whose maximum total
   // byte size is controlled by _output_chunk_read_limit.
-  cudf::io::parquet::detail::file_intermediate_data _file_itm_data;
-  std::unique_ptr _pass_itm_data;
-
-  // an array of offsets into _file_itm_data::global_chunks. Each pair of offsets represents
-  // the start/end of the chunks to be loaded for a given pass. 
- std::vector _input_pass_row_group_offsets{}; - std::vector _input_pass_row_count{}; - std::size_t _current_input_pass{0}; - std::size_t _chunk_count{0}; + file_intermediate_data _file_itm_data; + bool _file_preprocessed{false}; - std::size_t _output_chunk_read_limit{0}; - std::size_t _input_pass_read_limit{0}; + std::unique_ptr _pass_itm_data; bool _pass_preprocessed{false}; - bool _file_preprocessed{false}; + + std::size_t _output_chunk_read_limit{0}; // output chunk size limit in bytes + std::size_t _input_pass_read_limit{0}; // input pass memory usage limit in bytes + + std::size_t _current_input_pass{0}; // current input pass index + std::size_t _chunk_count{0}; // how many output chunks we have produced }; } // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu new file mode 100644 index 00000000000..5d5b152e1aa --- /dev/null +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -0,0 +1,598 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "reader_impl.hpp" +#include "reader_impl_chunking.hpp" + +#include +#include + +#include + +#include + +#include +#include +#include +#include + +namespace cudf::io::parquet::detail { + +namespace { + +struct cumulative_row_info { + size_t row_count; // cumulative row count + size_t size_bytes; // cumulative size in bytes + int key; // schema index +}; + +#if defined(CHUNKING_DEBUG) +void print_cumulative_page_info(cudf::detail::hostdevice_vector& pages, + rmm::device_uvector const& page_index, + rmm::device_uvector const& c_info, + rmm::cuda_stream_view stream) +{ + pages.device_to_host_sync(stream); + + printf("------------\nCumulative sizes by page\n"); + + std::vector schemas(pages.size()); + std::vector h_page_index(pages.size()); + CUDF_CUDA_TRY(cudaMemcpy( + h_page_index.data(), page_index.data(), sizeof(int) * pages.size(), cudaMemcpyDefault)); + std::vector h_cinfo(pages.size()); + CUDF_CUDA_TRY(cudaMemcpy( + h_cinfo.data(), c_info.data(), sizeof(cumulative_row_info) * pages.size(), cudaMemcpyDefault)); + auto schema_iter = cudf::detail::make_counting_transform_iterator( + 0, [&](size_type i) { return pages[h_page_index[i]].src_col_schema; }); + thrust::copy(thrust::seq, schema_iter, schema_iter + pages.size(), schemas.begin()); + auto last = thrust::unique(thrust::seq, schemas.begin(), schemas.end()); + schemas.resize(last - schemas.begin()); + printf("Num schemas: %lu\n", schemas.size()); + + for (size_t idx = 0; idx < schemas.size(); idx++) { + printf("Schema %d\n", schemas[idx]); + for (size_t pidx = 0; pidx < pages.size(); pidx++) { + auto const& page = pages[h_page_index[pidx]]; + if (page.flags & PAGEINFO_FLAGS_DICTIONARY || page.src_col_schema != schemas[idx]) { + continue; + } + printf("\tP: {%lu, %lu}\n", h_cinfo[pidx].row_count, h_cinfo[pidx].size_bytes); + } + } +} + +void print_cumulative_row_info( + host_span sizes, + std::string const& label, + std::optional> splits = 
std::nullopt) +{ + if (splits.has_value()) { + printf("------------\nSplits\n"); + for (size_t idx = 0; idx < splits->size(); idx++) { + printf("{%lu, %lu}\n", splits.value()[idx].skip_rows, splits.value()[idx].num_rows); + } + } + + printf("------------\nCumulative sizes %s\n", label.c_str()); + for (size_t idx = 0; idx < sizes.size(); idx++) { + printf("{%lu, %lu, %d}", sizes[idx].row_count, sizes[idx].size_bytes, sizes[idx].key); + if (splits.has_value()) { + // if we have a split at this row count and this is the last instance of this row count + auto start = thrust::make_transform_iterator( + splits->begin(), [](chunk_read_info const& i) { return i.skip_rows; }); + auto end = start + splits->size(); + auto split = std::find(start, end, sizes[idx].row_count); + auto const split_index = [&]() -> int { + if (split != end && + ((idx == sizes.size() - 1) || (sizes[idx + 1].row_count > sizes[idx].row_count))) { + return static_cast(std::distance(start, split)); + } + return idx == 0 ? 0 : -1; + }(); + if (split_index >= 0) { + printf(" <-- split {%lu, %lu}", + splits.value()[split_index].skip_rows, + splits.value()[split_index].num_rows); + } + } + printf("\n"); + } +} +#endif // CHUNKING_DEBUG + +/** + * @brief Functor which reduces two cumulative_row_info structs of the same key. + */ +struct cumulative_row_sum { + cumulative_row_info operator() + __device__(cumulative_row_info const& a, cumulative_row_info const& b) const + { + return cumulative_row_info{a.row_count + b.row_count, a.size_bytes + b.size_bytes, a.key}; + } +}; + +/** + * @brief Functor which computes the total data size for a given type of cudf column. + * + * In the case of strings, the return size does not include the chars themselves. That + * information is tracked separately (see PageInfo::str_bytes). + */ +struct row_size_functor { + __device__ size_t validity_size(size_t num_rows, bool nullable) + { + return nullable ? (cudf::util::div_rounding_up_safe(num_rows, size_t{32}) * 4) : 0; + } + + template + __device__ size_t operator()(size_t num_rows, bool nullable) + { + auto const element_size = sizeof(device_storage_type_t); + return (element_size * num_rows) + validity_size(num_rows, nullable); + } +}; + +template <> +__device__ size_t row_size_functor::operator()(size_t num_rows, bool nullable) +{ + auto const offset_size = sizeof(size_type); + // NOTE: Adding the + 1 offset here isn't strictly correct. There will only be 1 extra offset + // for the entire column, whereas this is adding an extra offset per page. So we will get a + // small over-estimate of the real size of the order : # of pages * 4 bytes. It seems better + // to overestimate size somewhat than to underestimate it and potentially generate chunks + // that are too large. + return (offset_size * (num_rows + 1)) + validity_size(num_rows, nullable); +} + +template <> +__device__ size_t row_size_functor::operator()(size_t num_rows, bool nullable) +{ + return validity_size(num_rows, nullable); +} + +template <> +__device__ size_t row_size_functor::operator()(size_t num_rows, bool nullable) +{ + // only returns the size of offsets and validity. the size of the actual string chars + // is tracked separately. + auto const offset_size = sizeof(size_type); + // see note about offsets in the list_view template. + return (offset_size * (num_rows + 1)) + validity_size(num_rows, nullable); +} + +/** + * @brief Functor which computes the total output cudf data size for all of + * the data in this page. + * + * Sums across all nesting levels. 
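+ *
+ * For example (hypothetical numbers): a page whose level 0 holds 100 nullable
+ * INT32 rows would contribute (4 * 100) data bytes plus ceil(100 / 32) * 4 = 16
+ * validity bytes, 416 bytes in total; string character data is accounted for
+ * separately via page.str_bytes.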
+ */ +struct get_cumulative_row_info { + PageInfo const* const pages; + + __device__ cumulative_row_info operator()(size_type index) + { + auto const& page = pages[index]; + if (page.flags & PAGEINFO_FLAGS_DICTIONARY) { + return cumulative_row_info{0, 0, page.src_col_schema}; + } + + // total nested size, not counting string data + auto iter = + cudf::detail::make_counting_transform_iterator(0, [page, index] __device__(size_type i) { + auto const& pni = page.nesting[i]; + return cudf::type_dispatcher( + data_type{pni.type}, row_size_functor{}, pni.size, pni.nullable); + }); + + size_t const row_count = static_cast(page.nesting[0].size); + return { + row_count, + thrust::reduce(thrust::seq, iter, iter + page.num_output_nesting_levels) + page.str_bytes, + page.src_col_schema}; + } +}; + +/** + * @brief Functor which computes the effective size of all input columns by page. + * + * For a given row, we want to find the cost of all pages for all columns involved + * in loading up to that row. The complication here is that not all pages are the + * same size between columns. Example: + * + * page row counts + * Column A: 0 <----> 100 <----> 200 + * Column B: 0 <---------------> 200 <--------> 400 + | + * if we decide to split at row 100, we don't really know the actual amount of bytes in column B + * at that point. So we have to proceed as if we are taking the bytes from all 200 rows of that + * page. Essentially, a conservative over-estimate of the real size. + */ +struct row_total_size { + cumulative_row_info const* c_info; + size_type const* key_offsets; + size_t num_keys; + + __device__ cumulative_row_info operator()(cumulative_row_info const& i) + { + // sum sizes for each input column at this row + size_t sum = 0; + for (int idx = 0; idx < num_keys; idx++) { + auto const start = key_offsets[idx]; + auto const end = key_offsets[idx + 1]; + auto iter = cudf::detail::make_counting_transform_iterator( + 0, [&] __device__(size_type i) { return c_info[i].row_count; }); + auto const page_index = + thrust::lower_bound(thrust::seq, iter + start, iter + end, i.row_count) - iter; + sum += c_info[page_index].size_bytes; + } + return {i.row_count, sum, i.key}; + } +}; + +/** + * @brief Given a vector of cumulative {row_count, byte_size} pairs and a chunk read + * limit, determine the set of splits. + * + * @param sizes Vector of cumulative {row_count, byte_size} pairs + * @param num_rows Total number of rows to read + * @param chunk_read_limit Limit on total number of bytes to be returned per read, for all columns + */ +std::vector find_splits(std::vector const& sizes, + size_t num_rows, + size_t chunk_read_limit) +{ + // now we have an array of {row_count, real output bytes}. just walk through it and generate + // splits. + // TODO: come up with a clever way to do this entirely in parallel. For now, as long as batch + // sizes are reasonably large, this shouldn't iterate too many times + std::vector splits; + { + size_t cur_pos = 0; + size_t cur_cumulative_size = 0; + size_t cur_row_count = 0; + auto start = thrust::make_transform_iterator(sizes.begin(), [&](cumulative_row_info const& i) { + return i.size_bytes - cur_cumulative_size; + }); + auto end = start + sizes.size(); + while (cur_row_count < num_rows) { + int64_t split_pos = + thrust::lower_bound(thrust::seq, start + cur_pos, end, chunk_read_limit) - start; + + // if we're past the end, or if the returned bucket is > than the chunk_read_limit, move back + // one. 
+ if (static_cast(split_pos) >= sizes.size() || + (sizes[split_pos].size_bytes - cur_cumulative_size > chunk_read_limit)) { + split_pos--; + } + + // best-try. if we can't find something that'll fit, we have to go bigger. we're doing this in + // a loop because all of the cumulative sizes for all the pages are sorted into one big list. + // so if we had two columns, both of which had an entry {1000, 10000}, that entry would be in + // the list twice. so we have to iterate until we skip past all of them. The idea is that we + // either do this, or we have to call unique() on the input first. + while (split_pos < (static_cast(sizes.size()) - 1) && + (split_pos < 0 || sizes[split_pos].row_count == cur_row_count)) { + split_pos++; + } + + auto const start_row = cur_row_count; + cur_row_count = sizes[split_pos].row_count; + splits.push_back(chunk_read_info{start_row, cur_row_count - start_row}); + cur_pos = split_pos; + cur_cumulative_size = sizes[split_pos].size_bytes; + } + } + // print_cumulative_row_info(sizes, "adjusted", splits); + + return splits; +} + +/** + * @brief Converts cuDF units to Parquet units. + * + * @return A tuple of Parquet type width, Parquet clock rate and Parquet decimal type. + */ +[[nodiscard]] std::tuple conversion_info(type_id column_type_id, + type_id timestamp_type_id, + Type physical, + int8_t converted, + int32_t length) +{ + int32_t type_width = (physical == FIXED_LEN_BYTE_ARRAY) ? length : 0; + int32_t clock_rate = 0; + if (column_type_id == type_id::INT8 or column_type_id == type_id::UINT8) { + type_width = 1; // I32 -> I8 + } else if (column_type_id == type_id::INT16 or column_type_id == type_id::UINT16) { + type_width = 2; // I32 -> I16 + } else if (column_type_id == type_id::INT32) { + type_width = 4; // str -> hash32 + } else if (is_chrono(data_type{column_type_id})) { + clock_rate = to_clockrate(timestamp_type_id); + } + + int8_t converted_type = converted; + if (converted_type == DECIMAL && column_type_id != type_id::FLOAT64 && + not cudf::is_fixed_point(data_type{column_type_id})) { + converted_type = UNKNOWN; // Not converting to float64 or decimal + } + return std::make_tuple(type_width, clock_rate, converted_type); +} + +/** + * @brief Return the required number of bits to store a value. 
+ */
+template <typename T = uint8_t>
+[[nodiscard]] T required_bits(uint32_t max_level)
+{
+  return static_cast<T>(CompactProtocolReader::NumRequiredBits(max_level));
+}
+
+struct row_count_compare {
+  __device__ bool operator()(cumulative_row_info const& a, cumulative_row_info const& b)
+  {
+    return a.row_count < b.row_count;
+  }
+};
+
+}  // anonymous namespace
+
+void reader::impl::create_global_chunk_info()
+{
+  auto const num_rows         = _file_itm_data.global_num_rows;
+  auto const& row_groups_info = _file_itm_data.row_groups;
+  auto& chunks                = _file_itm_data.chunks;
+
+  // Descriptors for all the chunks that make up the selected columns
+  auto const num_input_columns = _input_columns.size();
+  auto const num_chunks        = row_groups_info.size() * num_input_columns;
+
+  // Initialize column chunk information
+  auto remaining_rows = num_rows;
+  for (auto const& rg : row_groups_info) {
+    auto const& row_group      = _metadata->get_row_group(rg.index, rg.source_index);
+    auto const row_group_start = rg.start_row;
+    auto const row_group_rows  = std::min(remaining_rows, row_group.num_rows);
+
+    // generate ColumnChunkDesc objects for everything to be decoded (all input columns)
+    for (size_t i = 0; i < num_input_columns; ++i) {
+      auto col = _input_columns[i];
+      // look up metadata
+      auto& col_meta = _metadata->get_column_metadata(rg.index, rg.source_index, col.schema_idx);
+      auto& schema   = _metadata->get_schema(col.schema_idx);
+
+      auto [type_width, clock_rate, converted_type] =
+        conversion_info(to_type_id(schema, _strings_to_categorical, _timestamp_type.id()),
+                        _timestamp_type.id(),
+                        schema.type,
+                        schema.converted_type,
+                        schema.type_length);
+
+      chunks.push_back(ColumnChunkDesc(col_meta.total_compressed_size,
+                                       nullptr,
+                                       col_meta.num_values,
+                                       schema.type,
+                                       type_width,
+                                       row_group_start,
+                                       row_group_rows,
+                                       schema.max_definition_level,
+                                       schema.max_repetition_level,
+                                       _metadata->get_output_nesting_depth(col.schema_idx),
+                                       required_bits(schema.max_definition_level),
+                                       required_bits(schema.max_repetition_level),
+                                       col_meta.codec,
+                                       converted_type,
+                                       schema.logical_type,
+                                       schema.decimal_precision,
+                                       clock_rate,
+                                       i,
+                                       col.schema_idx));
+    }
+
+    remaining_rows -= row_group_rows;
+  }
+}
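(For intuition, the pass-partitioning idea that compute_input_passes() below implements can be reduced to a short host-only sketch. This is illustrative only, not the reader's implementation: row-group byte sizes are accumulated until adding the next group would exceed the limit, and a single oversized row group still forms a pass of its own. All names here are hypothetical.)

#include <cstddef>
#include <vector>

// Returns pass boundaries as offsets into the row-group list:
// pass i covers row groups [offsets[i], offsets[i + 1]).
std::vector<std::size_t> partition_into_passes(std::vector<std::size_t> const& rg_byte_sizes,
                                               std::size_t read_limit)
{
  std::vector<std::size_t> pass_offsets{0};
  std::size_t cur_bytes = 0;
  for (std::size_t i = 0; i < rg_byte_sizes.size(); ++i) {
    if (cur_bytes + rg_byte_sizes[i] >= read_limit) {
      if (pass_offsets.back() == i) {
        // this row group alone exceeds the limit: it becomes its own pass
        pass_offsets.push_back(i + 1);
        cur_bytes = 0;
      } else {
        // close the pass before this row group and start a new one with it
        pass_offsets.push_back(i);
        cur_bytes = rg_byte_sizes[i];
      }
    } else {
      cur_bytes += rg_byte_sizes[i];
    }
  }
  // close the final pass if anything is still pending
  if (pass_offsets.back() != rg_byte_sizes.size()) { pass_offsets.push_back(rg_byte_sizes.size()); }
  return pass_offsets;
}

The greedy accumulation errs on the side of smaller passes, which is the safe direction when the limit is a memory-usage bound.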
+
+void reader::impl::compute_input_passes()
+{
+  // at this point, row_groups has already been filtered down to just the row groups we need to
+  // handle optional skip_rows/num_rows parameters.
+  auto const& row_groups_info = _file_itm_data.row_groups;
+
+  // if the user hasn't specified an input size limit, read everything in a single pass.
+  if (_input_pass_read_limit == 0) {
+    _file_itm_data.input_pass_row_group_offsets.push_back(0);
+    _file_itm_data.input_pass_row_group_offsets.push_back(row_groups_info.size());
+    return;
+  }
+
+  // generate passes. make sure to account for the case where a single row group doesn't fit
+  // within the read limit.
+  std::size_t const read_limit =
+    _input_pass_read_limit > 0 ? _input_pass_read_limit : std::numeric_limits<std::size_t>::max();
+  std::size_t cur_pass_byte_size = 0;
+  std::size_t cur_rg_start       = 0;
+  std::size_t cur_row_count      = 0;
+  _file_itm_data.input_pass_row_group_offsets.push_back(0);
+  _file_itm_data.input_pass_row_count.push_back(0);
+
+  for (size_t cur_rg_index = 0; cur_rg_index < row_groups_info.size(); cur_rg_index++) {
+    auto const& rgi       = row_groups_info[cur_rg_index];
+    auto const& row_group = _metadata->get_row_group(rgi.index, rgi.source_index);
+
+    // can we add this row group?
+    if (cur_pass_byte_size + row_group.total_byte_size >= read_limit) {
+      // A single row group (the current one) is larger than the read limit:
+      // We always need to include at least one row group, so end the pass at the end of the
+      // current row group
+      if (cur_rg_start == cur_rg_index) {
+        _file_itm_data.input_pass_row_group_offsets.push_back(cur_rg_index + 1);
+        _file_itm_data.input_pass_row_count.push_back(cur_row_count + row_group.num_rows);
+        cur_rg_start       = cur_rg_index + 1;
+        cur_pass_byte_size = 0;
+      }
+      // End the pass at the end of the previous row group
+      else {
+        _file_itm_data.input_pass_row_group_offsets.push_back(cur_rg_index);
+        _file_itm_data.input_pass_row_count.push_back(cur_row_count);
+        cur_rg_start       = cur_rg_index;
+        cur_pass_byte_size = row_group.total_byte_size;
+      }
+    } else {
+      cur_pass_byte_size += row_group.total_byte_size;
+    }
+    cur_row_count += row_group.num_rows;
+  }
+  // add the last pass if necessary
+  if (_file_itm_data.input_pass_row_group_offsets.back() != row_groups_info.size()) {
+    _file_itm_data.input_pass_row_group_offsets.push_back(row_groups_info.size());
+    _file_itm_data.input_pass_row_count.push_back(cur_row_count);
+  }
+}
+
+void reader::impl::setup_next_pass()
+{
+  // this will also cause the previous pass information to be deleted
+  _pass_itm_data = std::make_unique<pass_intermediate_data>();
+
+  // setup row groups to be loaded for this pass
+  auto const row_group_start = _file_itm_data.input_pass_row_group_offsets[_current_input_pass];
+  auto const row_group_end = _file_itm_data.input_pass_row_group_offsets[_current_input_pass + 1];
+  auto const num_row_groups  = row_group_end - row_group_start;
+  _pass_itm_data->row_groups.resize(num_row_groups);
+  std::copy(_file_itm_data.row_groups.begin() + row_group_start,
+            _file_itm_data.row_groups.begin() + row_group_end,
+            _pass_itm_data->row_groups.begin());
+
+  auto const num_passes = _file_itm_data.input_pass_row_group_offsets.size() - 1;
+  CUDF_EXPECTS(_current_input_pass < num_passes, "Encountered an invalid read pass index");
+
+  auto const chunks_per_rowgroup = _input_columns.size();
+  auto const num_chunks          = chunks_per_rowgroup * num_row_groups;
+
+  auto chunk_start = _file_itm_data.chunks.begin() + (row_group_start * chunks_per_rowgroup);
+  auto chunk_end   = _file_itm_data.chunks.begin() + (row_group_end * chunks_per_rowgroup);
+
+  _pass_itm_data->chunks =
+    cudf::detail::hostdevice_vector<ColumnChunkDesc>(num_chunks, _stream);
+  std::copy(chunk_start, chunk_end, _pass_itm_data->chunks.begin());
+
+  // adjust skip_rows and num_rows by what's available in the row groups we are processing
+  if (num_passes == 1) {
+    _pass_itm_data->skip_rows = _file_itm_data.global_skip_rows;
+    _pass_itm_data->num_rows  = _file_itm_data.global_num_rows;
+  } else {
+    auto const global_start_row = _file_itm_data.global_skip_rows;
+    auto const global_end_row   = global_start_row + _file_itm_data.global_num_rows;
+    auto const start_row = std::max(_file_itm_data.input_pass_row_count[_current_input_pass], global_start_row);
+    auto const end_row =
std::min(_file_itm_data.input_pass_row_count[_current_input_pass + 1], global_end_row); + + // skip_rows is always global in the sense that it is relative to the first row of + // everything we will be reading, regardless of what pass we are on. + // num_rows is how many rows we are reading this pass. + _pass_itm_data->skip_rows = global_start_row + _file_itm_data.input_pass_row_count[_current_input_pass]; + _pass_itm_data->num_rows = end_row - start_row; + } +} + +void reader::impl::compute_splits_for_pass() +{ + auto const skip_rows = _pass_itm_data->skip_rows; + auto const num_rows = _pass_itm_data->num_rows; + + // simple case : no chunk size, no splits + if(_output_chunk_read_limit <= 0){ + _pass_itm_data->output_chunk_read_info = std::vector{{skip_rows, num_rows}}; + return; + } + + auto& pages = _pass_itm_data->pages_info; + + auto const& page_keys = _pass_itm_data->page_keys; + auto const& page_index = _pass_itm_data->page_index; + + // generate cumulative row counts and sizes + rmm::device_uvector c_info(page_keys.size(), _stream); + // convert PageInfo to cumulative_row_info + auto page_input = thrust::make_transform_iterator(page_index.begin(), + get_cumulative_row_info{pages.device_ptr()}); + thrust::inclusive_scan_by_key(rmm::exec_policy(_stream), + page_keys.begin(), + page_keys.end(), + page_input, + c_info.begin(), + thrust::equal_to{}, + cumulative_row_sum{}); + // print_cumulative_page_info(pages, page_index, c_info, stream); + + // sort by row count + rmm::device_uvector c_info_sorted{c_info, _stream}; + thrust::sort(rmm::exec_policy(_stream), + c_info_sorted.begin(), + c_info_sorted.end(), + row_count_compare{}); + + // std::vector h_c_info_sorted(c_info_sorted.size()); + // CUDF_CUDA_TRY(cudaMemcpy(h_c_info_sorted.data(), + // c_info_sorted.data(), + // sizeof(cumulative_row_info) * c_info_sorted.size(), + // cudaMemcpyDefault)); + // print_cumulative_row_info(h_c_info_sorted, "raw"); + + // generate key offsets (offsets to the start of each partition of keys). worst case is 1 page per + // key + rmm::device_uvector key_offsets(page_keys.size() + 1, _stream); + auto const key_offsets_end = thrust::reduce_by_key(rmm::exec_policy(_stream), + page_keys.begin(), + page_keys.end(), + thrust::make_constant_iterator(1), + thrust::make_discard_iterator(), + key_offsets.begin()) + .second; + size_t const num_unique_keys = key_offsets_end - key_offsets.begin(); + thrust::exclusive_scan( + rmm::exec_policy(_stream), key_offsets.begin(), key_offsets.end(), key_offsets.begin()); + + // adjust the cumulative info such that for each row count, the size includes any pages that span + // that row count. this is so that if we have this case: + // page row counts + // Column A: 0 <----> 100 <----> 200 + // Column B: 0 <---------------> 200 <--------> 400 + // | + // if we decide to split at row 100, we don't really know the actual amount of bytes in column B + // at that point. So we have to proceed as if we are taking the bytes from all 200 rows of that + // page. 
+ // + rmm::device_uvector aggregated_info(c_info.size(), _stream); + thrust::transform(rmm::exec_policy(_stream), + c_info_sorted.begin(), + c_info_sorted.end(), + aggregated_info.begin(), + row_total_size{c_info.data(), key_offsets.data(), num_unique_keys}); + + // bring back to the cpu + std::vector h_aggregated_info(aggregated_info.size()); + CUDF_CUDA_TRY(cudaMemcpyAsync(h_aggregated_info.data(), + aggregated_info.data(), + sizeof(cumulative_row_info) * c_info.size(), + cudaMemcpyDefault, + _stream.value())); + _stream.synchronize(); + + // generate the actual splits + _pass_itm_data->output_chunk_read_info = find_splits(h_aggregated_info, num_rows, _output_chunk_read_limit); +} + +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl_chunking.hpp b/cpp/src/io/parquet/reader_impl_chunking.hpp new file mode 100644 index 00000000000..f3c595a9a2b --- /dev/null +++ b/cpp/src/io/parquet/reader_impl_chunking.hpp @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include "reader_impl_helpers.hpp" + +namespace cudf::io::parquet::detail { + +/** + * @brief Struct to store file-level data that remains constant for + * all passes/chunks in the file. + */ +struct file_intermediate_data { + // all row groups to read + std::vector row_groups{}; + + // all chunks from the selected row groups. We may end up reading these chunks progressively + // instead of all at once + std::vector chunks{}; + + // an array of offsets into _file_itm_data::global_chunks. Each pair of offsets represents + // the start/end of the chunks to be loaded for a given pass. + std::vector input_pass_row_group_offsets{}; + // row counts per input-pass + std::vector input_pass_row_count{}; + + // skip_rows/num_rows values for the entire file. these need to be adjusted per-pass because we + // may not be visiting every row group that contains these bounds + size_t global_skip_rows; + size_t global_num_rows; +}; + +/** + * @brief Struct to identify the range for each chunk of rows during a chunked reading pass. + */ +struct chunk_read_info { + size_t skip_rows; + size_t num_rows; +}; + +/** + * @brief Struct to store pass-level data that remains constant for a single pass. + */ +struct pass_intermediate_data { + std::vector> raw_page_data; + rmm::device_buffer decomp_page_data; + + // rowgroup, chunk and page information for the current pass. 
+ std::vector row_groups{}; + cudf::detail::hostdevice_vector chunks{}; + cudf::detail::hostdevice_vector pages_info{}; + cudf::detail::hostdevice_vector page_nesting_info{}; + cudf::detail::hostdevice_vector page_nesting_decode_info{}; + + rmm::device_uvector page_keys{0, rmm::cuda_stream_default}; + rmm::device_uvector page_index{0, rmm::cuda_stream_default}; + rmm::device_uvector str_dict_index{0, rmm::cuda_stream_default}; + + std::vector output_chunk_read_info; + std::size_t current_output_chunk{0}; + + rmm::device_buffer level_decode_data{}; + int level_type_size{0}; + + // skip_rows and num_rows values for this particular pass. these may be adjusted values from the + // global values stored in file_intermediate_data. + size_t skip_rows; + size_t num_rows; +}; + +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl_helpers.hpp b/cpp/src/io/parquet/reader_impl_helpers.hpp index 2ff18bfbe7e..7a4fcc72dce 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.hpp +++ b/cpp/src/io/parquet/reader_impl_helpers.hpp @@ -34,6 +34,23 @@ namespace cudf::io::parquet::detail { +/** + * @brief The row_group_info class + */ +struct row_group_info { + size_type index; // row group index within a file. aggregate_reader_metadata::get_row_group() is + // called with index and source_index + size_t start_row; + size_type source_index; // file index. + + row_group_info() = default; + + row_group_info(size_type index, size_t start_row, size_type source_index) + : index{index}, start_row{start_row}, source_index{source_index} + { + } +}; + /** * @brief Function that translates Parquet datatype to cuDF type enum */ diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index 4bc6bb6f43b..ce45f709ee1 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -18,7 +18,6 @@ #include #include -#include #include #include @@ -44,7 +43,6 @@ #include namespace cudf::io::parquet::detail { - namespace { /** @@ -170,46 +168,6 @@ void generate_depth_remappings(std::map, std::ve } } -/** - * @brief Return the required number of bits to store a value. - */ -template -[[nodiscard]] T required_bits(uint32_t max_level) -{ - return static_cast(CompactProtocolReader::NumRequiredBits(max_level)); -} - -/** - * @brief Converts cuDF units to Parquet units. - * - * @return A tuple of Parquet type width, Parquet clock rate and Parquet decimal type. - */ -[[nodiscard]] std::tuple conversion_info(type_id column_type_id, - type_id timestamp_type_id, - Type physical, - int8_t converted, - int32_t length) -{ - int32_t type_width = (physical == FIXED_LEN_BYTE_ARRAY) ? length : 0; - int32_t clock_rate = 0; - if (column_type_id == type_id::INT8 or column_type_id == type_id::UINT8) { - type_width = 1; // I32 -> I8 - } else if (column_type_id == type_id::INT16 or column_type_id == type_id::UINT16) { - type_width = 2; // I32 -> I16 - } else if (column_type_id == type_id::INT32) { - type_width = 4; // str -> hash32 - } else if (is_chrono(data_type{column_type_id})) { - clock_rate = to_clockrate(timestamp_type_id); - } - - int8_t converted_type = converted; - if (converted_type == DECIMAL && column_type_id != type_id::FLOAT64 && - not cudf::is_fixed_point(data_type{column_type_id})) { - converted_type = UNKNOWN; // Not converting to float64 or decimal - } - return std::make_tuple(type_width, clock_rate, converted_type); -} - /** * @brief Reads compressed page data to device memory. 
* @@ -790,163 +748,6 @@ std::pair>> reader::impl::read_and_decompres return {total_decompressed_size > 0, std::move(read_chunk_tasks)}; } -void reader::impl::load_global_chunk_info() -{ - auto const num_rows = _file_itm_data.global_num_rows; - auto const& row_groups_info = _file_itm_data.row_groups; - auto& chunks = _file_itm_data.chunks; - - // Descriptors for all the chunks that make up the selected columns - auto const num_input_columns = _input_columns.size(); - auto const num_chunks = row_groups_info.size() * num_input_columns; - - // Initialize column chunk information - auto remaining_rows = num_rows; - for (auto const& rg : row_groups_info) { - auto const& row_group = _metadata->get_row_group(rg.index, rg.source_index); - auto const row_group_start = rg.start_row; - auto const row_group_rows = std::min(remaining_rows, row_group.num_rows); - - // generate ColumnChunkDesc objects for everything to be decoded (all input columns) - for (size_t i = 0; i < num_input_columns; ++i) { - auto col = _input_columns[i]; - // look up metadata - auto& col_meta = _metadata->get_column_metadata(rg.index, rg.source_index, col.schema_idx); - auto& schema = _metadata->get_schema(col.schema_idx); - - auto [type_width, clock_rate, converted_type] = - conversion_info(to_type_id(schema, _strings_to_categorical, _timestamp_type.id()), - _timestamp_type.id(), - schema.type, - schema.converted_type, - schema.type_length); - - chunks.push_back(ColumnChunkDesc(col_meta.total_compressed_size, - nullptr, - col_meta.num_values, - schema.type, - type_width, - row_group_start, - row_group_rows, - schema.max_definition_level, - schema.max_repetition_level, - _metadata->get_output_nesting_depth(col.schema_idx), - required_bits(schema.max_definition_level), - required_bits(schema.max_repetition_level), - col_meta.codec, - converted_type, - schema.logical_type, - schema.decimal_precision, - clock_rate, - i, - col.schema_idx)); - } - - remaining_rows -= row_group_rows; - } -} - -void reader::impl::compute_input_pass_row_group_info() -{ - // at this point, row_groups has already been filtered down to just the row groups we need to - // handle optional skip_rows/num_rows parameters. - auto const& row_groups_info = _file_itm_data.row_groups; - - // if the user hasn't specified an input size limit, read everything in a single pass. - if (_input_pass_read_limit == 0) { - _input_pass_row_group_offsets.push_back(0); - _input_pass_row_group_offsets.push_back(row_groups_info.size()); - return; - } - - // generate passes. make sure to account for the case where a single row group doesn't fit within - // - std::size_t const read_limit = - _input_pass_read_limit > 0 ? 
_input_pass_read_limit : std::numeric_limits::max(); - std::size_t cur_pass_byte_size = 0; - std::size_t cur_rg_start = 0; - std::size_t cur_row_count = 0; - _input_pass_row_group_offsets.push_back(0); - _input_pass_row_count.push_back(0); - - for (size_t cur_rg_index = 0; cur_rg_index < row_groups_info.size(); cur_rg_index++) { - auto const& rgi = row_groups_info[cur_rg_index]; - auto const& row_group = _metadata->get_row_group(rgi.index, rgi.source_index); - - // can we add this row group - if (cur_pass_byte_size + row_group.total_byte_size >= read_limit) { - // A single row group (the current one) is larger than the read limit: - // We always need to include at least one row group, so end the pass at the end of the current - // row group - if (cur_rg_start == cur_rg_index) { - _input_pass_row_group_offsets.push_back(cur_rg_index + 1); - _input_pass_row_count.push_back(cur_row_count + row_group.num_rows); - cur_rg_start = cur_rg_index + 1; - cur_pass_byte_size = 0; - } - // End the pass at the end of the previous row group - else { - _input_pass_row_group_offsets.push_back(cur_rg_index); - _input_pass_row_count.push_back(cur_row_count); - cur_rg_start = cur_rg_index; - cur_pass_byte_size = row_group.total_byte_size; - } - } else { - cur_pass_byte_size += row_group.total_byte_size; - } - cur_row_count += row_group.num_rows; - } - // add the last pass if necessary - if (_input_pass_row_group_offsets.back() != row_groups_info.size()) { - _input_pass_row_group_offsets.push_back(row_groups_info.size()); - _input_pass_row_count.push_back(cur_row_count); - } -} - -void reader::impl::setup_pass() -{ - // this will also cause the previous pass information to be deleted - _pass_itm_data = std::make_unique(); - - // setup row groups to be loaded for this pass - auto const row_group_start = _input_pass_row_group_offsets[_current_input_pass]; - auto const row_group_end = _input_pass_row_group_offsets[_current_input_pass + 1]; - auto const num_row_groups = row_group_end - row_group_start; - _pass_itm_data->row_groups.resize(num_row_groups); - std::copy(_file_itm_data.row_groups.begin() + row_group_start, - _file_itm_data.row_groups.begin() + row_group_end, - _pass_itm_data->row_groups.begin()); - - auto const num_passes = _input_pass_row_group_offsets.size() - 1; - CUDF_EXPECTS(_current_input_pass < num_passes, "Encountered an invalid read pass index"); - - auto const chunks_per_rowgroup = _input_columns.size(); - auto const num_chunks = chunks_per_rowgroup * num_row_groups; - - auto chunk_start = _file_itm_data.chunks.begin() + (row_group_start * chunks_per_rowgroup); - auto chunk_end = _file_itm_data.chunks.begin() + (row_group_end * chunks_per_rowgroup); - - _pass_itm_data->chunks = cudf::detail::hostdevice_vector(num_chunks, _stream); - std::copy(chunk_start, chunk_end, _pass_itm_data->chunks.begin()); - - // adjust skip_rows and num_rows by what's available in the row groups we are processing - if (num_passes == 1) { - _pass_itm_data->skip_rows = _file_itm_data.global_skip_rows; - _pass_itm_data->num_rows = _file_itm_data.global_num_rows; - } else { - auto const global_start_row = _file_itm_data.global_skip_rows; - auto const global_end_row = global_start_row + _file_itm_data.global_num_rows; - auto const start_row = std::max(_input_pass_row_count[_current_input_pass], global_start_row); - auto const end_row = std::min(_input_pass_row_count[_current_input_pass + 1], global_end_row); - - // skip_rows is always global in the sense that it is relative to the first row of - // everything we will be 
reading, regardless of what pass we are on. - // num_rows is how many rows we are reading this pass. - _pass_itm_data->skip_rows = global_start_row + _input_pass_row_count[_current_input_pass]; - _pass_itm_data->num_rows = end_row - start_row; - } -} - void reader::impl::load_and_decompress_data() { // This function should never be called if `num_rows == 0`. @@ -1034,359 +835,8 @@ void print_pages(cudf::detail::hostdevice_vector& pages, rmm::cuda_str p.str_bytes); } } - -void print_cumulative_page_info(cudf::detail::hostdevice_vector& pages, - rmm::device_uvector const& page_index, - rmm::device_uvector const& c_info, - rmm::cuda_stream_view stream) -{ - pages.device_to_host_sync(stream); - - printf("------------\nCumulative sizes by page\n"); - - std::vector schemas(pages.size()); - std::vector h_page_index(pages.size()); - CUDF_CUDA_TRY(cudaMemcpy( - h_page_index.data(), page_index.data(), sizeof(int) * pages.size(), cudaMemcpyDefault)); - std::vector h_cinfo(pages.size()); - CUDF_CUDA_TRY(cudaMemcpy( - h_cinfo.data(), c_info.data(), sizeof(cumulative_row_info) * pages.size(), cudaMemcpyDefault)); - auto schema_iter = cudf::detail::make_counting_transform_iterator( - 0, [&](size_type i) { return pages[h_page_index[i]].src_col_schema; }); - thrust::copy(thrust::seq, schema_iter, schema_iter + pages.size(), schemas.begin()); - auto last = thrust::unique(thrust::seq, schemas.begin(), schemas.end()); - schemas.resize(last - schemas.begin()); - printf("Num schemas: %lu\n", schemas.size()); - - for (size_t idx = 0; idx < schemas.size(); idx++) { - printf("Schema %d\n", schemas[idx]); - for (size_t pidx = 0; pidx < pages.size(); pidx++) { - auto const& page = pages[h_page_index[pidx]]; - if (page.flags & PAGEINFO_FLAGS_DICTIONARY || page.src_col_schema != schemas[idx]) { - continue; - } - printf("\tP: {%lu, %lu}\n", h_cinfo[pidx].row_count, h_cinfo[pidx].size_bytes); - } - } -} - -void print_cumulative_row_info(host_span sizes, - std::string const& label, - std::optional> splits = std::nullopt) -{ - if (splits.has_value()) { - printf("------------\nSplits\n"); - for (size_t idx = 0; idx < splits->size(); idx++) { - printf("{%lu, %lu}\n", splits.value()[idx].skip_rows, splits.value()[idx].num_rows); - } - } - - printf("------------\nCumulative sizes %s\n", label.c_str()); - for (size_t idx = 0; idx < sizes.size(); idx++) { - printf("{%lu, %lu, %d}", sizes[idx].row_count, sizes[idx].size_bytes, sizes[idx].key); - if (splits.has_value()) { - // if we have a split at this row count and this is the last instance of this row count - auto start = thrust::make_transform_iterator( - splits->begin(), [](chunk_read_info const& i) { return i.skip_rows; }); - auto end = start + splits->size(); - auto split = std::find(start, end, sizes[idx].row_count); - auto const split_index = [&]() -> int { - if (split != end && - ((idx == sizes.size() - 1) || (sizes[idx + 1].row_count > sizes[idx].row_count))) { - return static_cast(std::distance(start, split)); - } - return idx == 0 ? 0 : -1; - }(); - if (split_index >= 0) { - printf(" <-- split {%lu, %lu}", - splits.value()[split_index].skip_rows, - splits.value()[split_index].num_rows); - } - } - printf("\n"); - } -} #endif // PREPROCESS_DEBUG -/** - * @brief Functor which reduces two cumulative_row_info structs of the same key. 
- */ -struct cumulative_row_sum { - cumulative_row_info operator() - __device__(cumulative_row_info const& a, cumulative_row_info const& b) const - { - return cumulative_row_info{a.row_count + b.row_count, a.size_bytes + b.size_bytes, a.key}; - } -}; - -/** - * @brief Functor which computes the total data size for a given type of cudf column. - * - * In the case of strings, the return size does not include the chars themselves. That - * information is tracked separately (see PageInfo::str_bytes). - */ -struct row_size_functor { - __device__ size_t validity_size(size_t num_rows, bool nullable) - { - return nullable ? (cudf::util::div_rounding_up_safe(num_rows, size_t{32}) * 4) : 0; - } - - template - __device__ size_t operator()(size_t num_rows, bool nullable) - { - auto const element_size = sizeof(device_storage_type_t); - return (element_size * num_rows) + validity_size(num_rows, nullable); - } -}; - -template <> -__device__ size_t row_size_functor::operator()(size_t num_rows, bool nullable) -{ - auto const offset_size = sizeof(size_type); - // NOTE: Adding the + 1 offset here isn't strictly correct. There will only be 1 extra offset - // for the entire column, whereas this is adding an extra offset per page. So we will get a - // small over-estimate of the real size of the order : # of pages * 4 bytes. It seems better - // to overestimate size somewhat than to underestimate it and potentially generate chunks - // that are too large. - return (offset_size * (num_rows + 1)) + validity_size(num_rows, nullable); -} - -template <> -__device__ size_t row_size_functor::operator()(size_t num_rows, bool nullable) -{ - return validity_size(num_rows, nullable); -} - -template <> -__device__ size_t row_size_functor::operator()(size_t num_rows, bool nullable) -{ - // only returns the size of offsets and validity. the size of the actual string chars - // is tracked separately. - auto const offset_size = sizeof(size_type); - // see note about offsets in the list_view template. - return (offset_size * (num_rows + 1)) + validity_size(num_rows, nullable); -} - -/** - * @brief Functor which computes the total output cudf data size for all of - * the data in this page. - * - * Sums across all nesting levels. - */ -struct get_cumulative_row_info { - PageInfo const* const pages; - - __device__ cumulative_row_info operator()(size_type index) - { - auto const& page = pages[index]; - if (page.flags & PAGEINFO_FLAGS_DICTIONARY) { - return cumulative_row_info{0, 0, page.src_col_schema}; - } - - // total nested size, not counting string data - auto iter = - cudf::detail::make_counting_transform_iterator(0, [page, index] __device__(size_type i) { - auto const& pni = page.nesting[i]; - return cudf::type_dispatcher( - data_type{pni.type}, row_size_functor{}, pni.size, pni.nullable); - }); - - size_t const row_count = static_cast(page.nesting[0].size); - return { - row_count, - thrust::reduce(thrust::seq, iter, iter + page.num_output_nesting_levels) + page.str_bytes, - page.src_col_schema}; - } -}; - -/** - * @brief Functor which computes the effective size of all input columns by page. - * - * For a given row, we want to find the cost of all pages for all columns involved - * in loading up to that row. The complication here is that not all pages are the - * same size between columns. 
Example: - * - * page row counts - * Column A: 0 <----> 100 <----> 200 - * Column B: 0 <---------------> 200 <--------> 400 - | - * if we decide to split at row 100, we don't really know the actual amount of bytes in column B - * at that point. So we have to proceed as if we are taking the bytes from all 200 rows of that - * page. Essentially, a conservative over-estimate of the real size. - */ -struct row_total_size { - cumulative_row_info const* c_info; - size_type const* key_offsets; - size_t num_keys; - - __device__ cumulative_row_info operator()(cumulative_row_info const& i) - { - // sum sizes for each input column at this row - size_t sum = 0; - for (int idx = 0; idx < num_keys; idx++) { - auto const start = key_offsets[idx]; - auto const end = key_offsets[idx + 1]; - auto iter = cudf::detail::make_counting_transform_iterator( - 0, [&] __device__(size_type i) { return c_info[i].row_count; }); - auto const page_index = - thrust::lower_bound(thrust::seq, iter + start, iter + end, i.row_count) - iter; - sum += c_info[page_index].size_bytes; - } - return {i.row_count, sum, i.key}; - } -}; - -/** - * @brief Given a vector of cumulative {row_count, byte_size} pairs and a chunk read - * limit, determine the set of splits. - * - * @param sizes Vector of cumulative {row_count, byte_size} pairs - * @param num_rows Total number of rows to read - * @param chunk_read_limit Limit on total number of bytes to be returned per read, for all columns - */ -std::vector find_splits(std::vector const& sizes, - size_t num_rows, - size_t chunk_read_limit) -{ - // now we have an array of {row_count, real output bytes}. just walk through it and generate - // splits. - // TODO: come up with a clever way to do this entirely in parallel. For now, as long as batch - // sizes are reasonably large, this shouldn't iterate too many times - std::vector splits; - { - size_t cur_pos = 0; - size_t cur_cumulative_size = 0; - size_t cur_row_count = 0; - auto start = thrust::make_transform_iterator(sizes.begin(), [&](cumulative_row_info const& i) { - return i.size_bytes - cur_cumulative_size; - }); - auto end = start + sizes.size(); - while (cur_row_count < num_rows) { - int64_t split_pos = - thrust::lower_bound(thrust::seq, start + cur_pos, end, chunk_read_limit) - start; - - // if we're past the end, or if the returned bucket is > than the chunk_read_limit, move back - // one. - if (static_cast(split_pos) >= sizes.size() || - (sizes[split_pos].size_bytes - cur_cumulative_size > chunk_read_limit)) { - split_pos--; - } - - // best-try. if we can't find something that'll fit, we have to go bigger. we're doing this in - // a loop because all of the cumulative sizes for all the pages are sorted into one big list. - // so if we had two columns, both of which had an entry {1000, 10000}, that entry would be in - // the list twice. so we have to iterate until we skip past all of them. The idea is that we - // either do this, or we have to call unique() on the input first. 
- while (split_pos < (static_cast(sizes.size()) - 1) && - (split_pos < 0 || sizes[split_pos].row_count == cur_row_count)) { - split_pos++; - } - - auto const start_row = cur_row_count; - cur_row_count = sizes[split_pos].row_count; - splits.push_back(chunk_read_info{start_row, cur_row_count - start_row}); - cur_pos = split_pos; - cur_cumulative_size = sizes[split_pos].size_bytes; - } - } - // print_cumulative_row_info(sizes, "adjusted", splits); - - return splits; -} - -/** - * @brief Given a set of pages that have had their sizes computed by nesting level and - * a limit on total read size, generate a set of {skip_rows, num_rows} pairs representing - * a set of reads that will generate output columns of total size <= `chunk_read_limit` bytes. - * - * @param pages All pages in the file - * @param id Additional intermediate information required to process the pages - * @param num_rows Total number of rows to read - * @param chunk_read_limit Limit on total number of bytes to be returned per read, for all columns - * @param stream CUDA stream to use - */ -std::vector compute_splits(cudf::detail::hostdevice_vector& pages, - pass_intermediate_data const& id, - size_t num_rows, - size_t chunk_read_limit, - rmm::cuda_stream_view stream) -{ - auto const& page_keys = id.page_keys; - auto const& page_index = id.page_index; - - // generate cumulative row counts and sizes - rmm::device_uvector c_info(page_keys.size(), stream); - // convert PageInfo to cumulative_row_info - auto page_input = thrust::make_transform_iterator(page_index.begin(), - get_cumulative_row_info{pages.device_ptr()}); - thrust::inclusive_scan_by_key(rmm::exec_policy(stream), - page_keys.begin(), - page_keys.end(), - page_input, - c_info.begin(), - thrust::equal_to{}, - cumulative_row_sum{}); - // print_cumulative_page_info(pages, page_index, c_info, stream); - - // sort by row count - rmm::device_uvector c_info_sorted{c_info, stream}; - thrust::sort(rmm::exec_policy(stream), - c_info_sorted.begin(), - c_info_sorted.end(), - [] __device__(cumulative_row_info const& a, cumulative_row_info const& b) { - return a.row_count < b.row_count; - }); - - // std::vector h_c_info_sorted(c_info_sorted.size()); - // CUDF_CUDA_TRY(cudaMemcpy(h_c_info_sorted.data(), - // c_info_sorted.data(), - // sizeof(cumulative_row_info) * c_info_sorted.size(), - // cudaMemcpyDefault)); - // print_cumulative_row_info(h_c_info_sorted, "raw"); - - // generate key offsets (offsets to the start of each partition of keys). worst case is 1 page per - // key - rmm::device_uvector key_offsets(page_keys.size() + 1, stream); - auto const key_offsets_end = thrust::reduce_by_key(rmm::exec_policy(stream), - page_keys.begin(), - page_keys.end(), - thrust::make_constant_iterator(1), - thrust::make_discard_iterator(), - key_offsets.begin()) - .second; - size_t const num_unique_keys = key_offsets_end - key_offsets.begin(); - thrust::exclusive_scan( - rmm::exec_policy(stream), key_offsets.begin(), key_offsets.end(), key_offsets.begin()); - - // adjust the cumulative info such that for each row count, the size includes any pages that span - // that row count. this is so that if we have this case: - // page row counts - // Column A: 0 <----> 100 <----> 200 - // Column B: 0 <---------------> 200 <--------> 400 - // | - // if we decide to split at row 100, we don't really know the actual amount of bytes in column B - // at that point. So we have to proceed as if we are taking the bytes from all 200 rows of that - // page. 
- // - rmm::device_uvector aggregated_info(c_info.size(), stream); - thrust::transform(rmm::exec_policy(stream), - c_info_sorted.begin(), - c_info_sorted.end(), - aggregated_info.begin(), - row_total_size{c_info.data(), key_offsets.data(), num_unique_keys}); - - // bring back to the cpu - std::vector h_aggregated_info(aggregated_info.size()); - CUDF_CUDA_TRY(cudaMemcpyAsync(h_aggregated_info.data(), - aggregated_info.data(), - sizeof(cumulative_row_info) * c_info.size(), - cudaMemcpyDefault, - stream.value())); - stream.synchronize(); - - return find_splits(h_aggregated_info, num_rows, chunk_read_limit); -} - struct get_page_chunk_idx { __device__ size_type operator()(PageInfo const& page) { return page.chunk_idx; } }; @@ -1822,12 +1272,8 @@ void reader::impl::preprocess_pages(bool uses_custom_row_bounds, size_t chunk_re _pass_itm_data->page_keys = std::move(page_keys); _pass_itm_data->page_index = std::move(page_index); - // compute splits if necessary. otherwise return a single split representing - // the whole file. - _pass_itm_data->output_chunk_read_info = - _output_chunk_read_limit > 0 - ? compute_splits(pages, *_pass_itm_data, num_rows, chunk_read_limit, _stream) - : std::vector{{skip_rows, num_rows}}; + // compute splits for the pass + compute_splits_for_pass(); } void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses_custom_row_bounds) From f1378e5b6543d846c4dcbfcdc3fa9ecc256fda60 Mon Sep 17 00:00:00 2001 From: db Date: Mon, 9 Oct 2023 11:07:57 -0500 Subject: [PATCH 04/49] Formatting. --- cpp/src/io/parquet/reader_impl_chunking.cu | 78 ++++++++++----------- cpp/src/io/parquet/reader_impl_chunking.hpp | 2 +- 2 files changed, 40 insertions(+), 40 deletions(-) diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index 5d5b152e1aa..ad52a7dfcc1 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -75,10 +75,9 @@ void print_cumulative_page_info(cudf::detail::hostdevice_vector& pages } } -void print_cumulative_row_info( - host_span sizes, - std::string const& label, - std::optional> splits = std::nullopt) +void print_cumulative_row_info(host_span sizes, + std::string const& label, + std::optional> splits = std::nullopt) { if (splits.has_value()) { printf("------------\nSplits\n"); @@ -251,8 +250,8 @@ struct row_total_size { * @param chunk_read_limit Limit on total number of bytes to be returned per read, for all columns */ std::vector find_splits(std::vector const& sizes, - size_t num_rows, - size_t chunk_read_limit) + size_t num_rows, + size_t chunk_read_limit) { // now we have an array of {row_count, real output bytes}. just walk through it and generate // splits. 
@@ -341,7 +340,7 @@ template } struct row_count_compare { - __device__ bool operator()(cumulative_row_info const& a, cumulative_row_info const& b) + __device__ bool operator()(cumulative_row_info const& a, cumulative_row_info const& b) { return a.row_count < b.row_count; } @@ -381,24 +380,24 @@ void reader::impl::create_global_chunk_info() schema.type_length); chunks.push_back(ColumnChunkDesc(col_meta.total_compressed_size, - nullptr, - col_meta.num_values, - schema.type, - type_width, - row_group_start, - row_group_rows, - schema.max_definition_level, - schema.max_repetition_level, - _metadata->get_output_nesting_depth(col.schema_idx), - required_bits(schema.max_definition_level), - required_bits(schema.max_repetition_level), - col_meta.codec, - converted_type, - schema.logical_type, - schema.decimal_precision, - clock_rate, - i, - col.schema_idx)); + nullptr, + col_meta.num_values, + schema.type, + type_width, + row_group_start, + row_group_rows, + schema.max_definition_level, + schema.max_repetition_level, + _metadata->get_output_nesting_depth(col.schema_idx), + required_bits(schema.max_definition_level), + required_bits(schema.max_repetition_level), + col_meta.codec, + converted_type, + schema.logical_type, + schema.decimal_precision, + clock_rate, + i, + col.schema_idx)); } remaining_rows -= row_group_rows; @@ -485,8 +484,7 @@ void reader::impl::setup_next_pass() auto chunk_start = _file_itm_data.chunks.begin() + (row_group_start * chunks_per_rowgroup); auto chunk_end = _file_itm_data.chunks.begin() + (row_group_end * chunks_per_rowgroup); - _pass_itm_data->chunks = - cudf::detail::hostdevice_vector(num_chunks, _stream); + _pass_itm_data->chunks = cudf::detail::hostdevice_vector(num_chunks, _stream); std::copy(chunk_start, chunk_end, _pass_itm_data->chunks.begin()); // adjust skip_rows and num_rows by what's available in the row groups we are processing @@ -496,29 +494,32 @@ void reader::impl::setup_next_pass() } else { auto const global_start_row = _file_itm_data.global_skip_rows; auto const global_end_row = global_start_row + _file_itm_data.global_num_rows; - auto const start_row = std::max(_file_itm_data.input_pass_row_count[_current_input_pass], global_start_row); - auto const end_row = std::min(_file_itm_data.input_pass_row_count[_current_input_pass + 1], global_end_row); + auto const start_row = + std::max(_file_itm_data.input_pass_row_count[_current_input_pass], global_start_row); + auto const end_row = + std::min(_file_itm_data.input_pass_row_count[_current_input_pass + 1], global_end_row); // skip_rows is always global in the sense that it is relative to the first row of // everything we will be reading, regardless of what pass we are on. // num_rows is how many rows we are reading this pass. 
- _pass_itm_data->skip_rows = global_start_row + _file_itm_data.input_pass_row_count[_current_input_pass]; - _pass_itm_data->num_rows = end_row - start_row; + _pass_itm_data->skip_rows = + global_start_row + _file_itm_data.input_pass_row_count[_current_input_pass]; + _pass_itm_data->num_rows = end_row - start_row; } } void reader::impl::compute_splits_for_pass() { auto const skip_rows = _pass_itm_data->skip_rows; - auto const num_rows = _pass_itm_data->num_rows; + auto const num_rows = _pass_itm_data->num_rows; // simple case : no chunk size, no splits - if(_output_chunk_read_limit <= 0){ + if (_output_chunk_read_limit <= 0) { _pass_itm_data->output_chunk_read_info = std::vector{{skip_rows, num_rows}}; return; } - auto& pages = _pass_itm_data->pages_info; + auto& pages = _pass_itm_data->pages_info; auto const& page_keys = _pass_itm_data->page_keys; auto const& page_index = _pass_itm_data->page_index; @@ -539,10 +540,8 @@ void reader::impl::compute_splits_for_pass() // sort by row count rmm::device_uvector c_info_sorted{c_info, _stream}; - thrust::sort(rmm::exec_policy(_stream), - c_info_sorted.begin(), - c_info_sorted.end(), - row_count_compare{}); + thrust::sort( + rmm::exec_policy(_stream), c_info_sorted.begin(), c_info_sorted.end(), row_count_compare{}); // std::vector h_c_info_sorted(c_info_sorted.size()); // CUDF_CUDA_TRY(cudaMemcpy(h_c_info_sorted.data(), @@ -592,7 +591,8 @@ void reader::impl::compute_splits_for_pass() _stream.synchronize(); // generate the actual splits - _pass_itm_data->output_chunk_read_info = find_splits(h_aggregated_info, num_rows, _output_chunk_read_limit); + _pass_itm_data->output_chunk_read_info = + find_splits(h_aggregated_info, num_rows, _output_chunk_read_limit); } } // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl_chunking.hpp b/cpp/src/io/parquet/reader_impl_chunking.hpp index f3c595a9a2b..29a91c4cb00 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.hpp +++ b/cpp/src/io/parquet/reader_impl_chunking.hpp @@ -84,4 +84,4 @@ struct pass_intermediate_data { size_t num_rows; }; -} // namespace cudf::io::parquet::detail +} // namespace cudf::io::parquet::detail From 79ae066888c597bab034df511044e1ed2f654be0 Mon Sep 17 00:00:00 2001 From: db Date: Mon, 9 Oct 2023 16:50:47 -0500 Subject: [PATCH 05/49] Remove unnecessary comment block. --- cpp/src/io/parquet/reader_impl.hpp | 8 -------- 1 file changed, 8 deletions(-) diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index 22217b55411..cea4ba35606 100644 --- a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -247,14 +247,6 @@ class reader::impl { */ void decode_page_data(size_t skip_rows, size_t num_rows); - /* - * - * - Functions related to computing chunks and passes (reader_impl_chunking.cu) - * - * - */ - /** * @brief Creates file-wide parquet chunk information. 
 *

From 85b1e839eaf2719673cf5d1fda6bb092c6d56ae8 Mon Sep 17 00:00:00 2001
From: db
Date: Tue, 10 Oct 2023 10:04:07 -0500
Subject: [PATCH 06/49] Change include file ordering

---
 cpp/src/io/parquet/reader_impl_chunking.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/io/parquet/reader_impl_chunking.hpp b/cpp/src/io/parquet/reader_impl_chunking.hpp
index 29a91c4cb00..dfc239d8451 100644
--- a/cpp/src/io/parquet/reader_impl_chunking.hpp
+++ b/cpp/src/io/parquet/reader_impl_chunking.hpp
@@ -16,10 +16,10 @@

 #pragma once

-#include
-
 #include "reader_impl_helpers.hpp"

+#include
+
 namespace cudf::io::parquet::detail {

 /**

From 08ce7709127b91f959981abd271a21c5ed26b825 Mon Sep 17 00:00:00 2001
From: db
Date: Fri, 3 Nov 2023 16:43:46 -0500
Subject: [PATCH 07/49] First pass at sub-rowgroup reading. Basic tests show
 proof of concept; more work remains.

---
 cpp/src/io/parquet/page_decode.cuh           |    6 +-
 cpp/src/io/parquet/page_hdr.cu               |   21 +-
 cpp/src/io/parquet/page_string_decode.cu     |    6 +-
 cpp/src/io/parquet/parquet_gpu.hpp           |   35 +-
 cpp/src/io/parquet/reader_impl.cpp           |  151 +--
 cpp/src/io/parquet/reader_impl.hpp           |   55 +-
 cpp/src/io/parquet/reader_impl_chunking.cu   | 1009 ++++++++++++++----
 cpp/src/io/parquet/reader_impl_chunking.hpp  |   73 +-
 cpp/src/io/parquet/reader_impl_preprocess.cu |  814 +++++---------
 9 files changed, 1306 insertions(+), 864 deletions(-)

diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh
index 7c866fd8b9e..b4f7b29a870 100644
--- a/cpp/src/io/parquet/page_decode.cuh
+++ b/cpp/src/io/parquet/page_decode.cuh
@@ -1279,11 +1279,11 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s,
     if (((s->col.data_type & 7) == BYTE_ARRAY) && (s->col.str_dict_index)) {
       // String dictionary: use index
       s->dict_base = reinterpret_cast<uint8_t const*>(s->col.str_dict_index);
-      s->dict_size = s->col.page_info[0].num_input_values * sizeof(string_index_pair);
+      s->dict_size = s->col.dict_page->num_input_values * sizeof(string_index_pair);
     } else {
       s->dict_base =
-        s->col.page_info[0].page_data;  // dictionary is always stored in the first page
-      s->dict_size = s->col.page_info[0].uncompressed_page_size;
+        s->col.dict_page->page_data;  // dictionary is always stored in the first page
+      s->dict_size = s->col.dict_page->uncompressed_page_size;
     }
     s->dict_run = 0;
     s->dict_val = 0;
diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu
index eae8e05e61e..49a714eaab5 100644
--- a/cpp/src/io/parquet/page_hdr.cu
+++ b/cpp/src/io/parquet/page_hdr.cu
@@ -356,7 +356,7 @@ struct gpuParsePageHeader {
  */
 // blockDim {128,1,1}
 __global__ void __launch_bounds__(128)
-  gpuDecodePageHeaders(ColumnChunkDesc* chunks, int32_t num_chunks)
+  gpuDecodePageHeaders(ColumnChunkDesc* chunks, chunk_page_info* chunk_pages, int32_t num_chunks)
 {
   gpuParsePageHeader parse_page_header;
   __shared__ byte_stream_s bs_g[4];
@@ -365,7 +365,9 @@ __global__ void __launch_bounds__(128)
   int chunk = (blockIdx.x * 4) + (threadIdx.x / 32);
   byte_stream_s* const bs = &bs_g[threadIdx.x / 32];

-  if (chunk < num_chunks and lane_id == 0) bs->ck = chunks[chunk];
+  if (chunk < num_chunks and lane_id == 0) {
+    bs->ck = chunks[chunk];
+  }
   __syncthreads();

   if (chunk < num_chunks) {
@@ -392,8 +394,7 @@ __global__ void __launch_bounds__(128)
       bs->page.kernel_mask = 0;
     }
     num_values = bs->ck.num_values;
-    page_info = bs->ck.page_info;
-    num_dict_pages = bs->ck.num_dict_pages;
+    page_info = chunk_pages ? chunk_pages[chunk].pages : nullptr;
     max_num_pages = (page_info) ? bs->ck.max_num_pages : 0;
     values_found = 0;
     __syncwarp();
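(For context on how the new chunk_pages argument is meant to be fed, here is a host-side sketch. It is illustrative only and assumes the surrounding cudf internal types (ColumnChunkDesc, PageInfo, chunk_page_info, hostdevice_vector) and the DecodePageHeaders entry point as changed by this patch; the wrapper name and the assumption that `pages` was sized to the sum of max_num_pages are hypothetical.)

// Give each chunk a pointer into one flat PageInfo array, sized by each
// chunk's page count, then decode all page headers in a single launch.
void decode_page_headers_sketch(cudf::detail::hostdevice_vector<ColumnChunkDesc>& chunks,
                                cudf::detail::hostdevice_vector<PageInfo>& pages,
                                cudf::detail::hostdevice_vector<chunk_page_info>& chunk_pages,
                                rmm::cuda_stream_view stream)
{
  size_t page_offset = 0;
  for (size_t c = 0; c < chunks.size(); c++) {
    // each chunk writes its decoded page headers into its own slice
    chunk_pages[c].pages = pages.device_ptr(page_offset);
    page_offset += chunks[c].max_num_pages;
  }
  chunks.host_to_device_async(stream);
  chunk_pages.host_to_device_async(stream);
  DecodePageHeaders(chunks.device_ptr(),
                    chunk_pages.device_ptr(),
                    static_cast<int32_t>(chunks.size()),
                    stream);
}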
@@ -446,8 +447,9 @@ __global__ void __launch_bounds__(128)
         }
       }
       index_out = shuffle(index_out);
-      if (index_out >= 0 && index_out < max_num_pages && lane_id == 0)
+      if (index_out >= 0 && index_out < max_num_pages && lane_id == 0) {
        page_info[index_out] = bs->page;
+      }
       num_values = shuffle(num_values);
       __syncwarp();
     }
@@ -485,9 +487,9 @@ __global__ void __launch_bounds__(128)
   if (!lane_id && ck->num_dict_pages > 0 && ck->str_dict_index) {
     // Data type to describe a string
     string_index_pair* dict_index = ck->str_dict_index;
-    uint8_t const* dict = ck->page_info[0].page_data;
-    int dict_size = ck->page_info[0].uncompressed_page_size;
-    int num_entries = ck->page_info[0].num_input_values;
+    uint8_t const* dict = ck->dict_page->page_data;
+    int dict_size = ck->dict_page->uncompressed_page_size;
+    int num_entries = ck->dict_page->num_input_values;
     int pos = 0, cur = 0;
     for (int i = 0; i < num_entries; i++) {
       int len = 0;
@@ -508,12 +510,13 @@ __global__ void __launch_bounds__(128)
 }

 void __host__ DecodePageHeaders(ColumnChunkDesc* chunks,
+                                chunk_page_info* chunk_pages,
                                 int32_t num_chunks,
                                 rmm::cuda_stream_view stream)
 {
   dim3 dim_block(128, 1);
   dim3 dim_grid((num_chunks + 3) >> 2, 1);  // 1 chunk per warp, 4 warps per block
-  gpuDecodePageHeaders<<<dim_grid, dim_block, 0, stream.value()>>>(chunks, num_chunks);
+  gpuDecodePageHeaders<<<dim_grid, dim_block, 0, stream.value()>>>(chunks, chunk_pages, num_chunks);
 }

 void __host__ BuildStringDictionaryIndex(ColumnChunkDesc* chunks,
diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu
index 4d79770ec34..6f45b307dd7 100644
--- a/cpp/src/io/parquet/page_string_decode.cu
+++ b/cpp/src/io/parquet/page_string_decode.cu
@@ -538,10 +538,10 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz
   if (col.str_dict_index) {
     // String dictionary: use index
     dict_base = reinterpret_cast<uint8_t const*>(col.str_dict_index);
-    dict_size = col.page_info[0].num_input_values * sizeof(string_index_pair);
+    dict_size = col.dict_page->num_input_values * sizeof(string_index_pair);
   } else {
-    dict_base = col.page_info[0].page_data;  // dictionary is always stored in the first page
-    dict_size = col.page_info[0].uncompressed_page_size;
+    dict_base = col.dict_page->page_data;  // dictionary is always stored in the first page
+    dict_size = col.dict_page->uncompressed_page_size;
   }

   // FIXME: need to return an error condition...this won't actually do anything
diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp
index 6a93fec0c46..ff365e3792a 100644
--- a/cpp/src/io/parquet/parquet_gpu.hpp
+++ b/cpp/src/io/parquet/parquet_gpu.hpp
@@ -237,6 +237,24 @@ struct PageInfo {
   uint32_t kernel_mask;
 };

+/**
+ * @brief Return the column schema id as the key for a PageInfo struct.
+ */
+struct get_page_key {
+  __device__ int32_t operator()(PageInfo const& page) const
+  {
+    return page.src_col_schema;
+  }
+};
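(A brief usage sketch for this key extractor. Illustrative only: the function name is hypothetical, the cudf internal types come from this header and rmm, and it assumes pages are sorted so that pages with the same schema index are adjacent, which is what the chunked reader arranges before its keyed scans in reader_impl_chunking.cu.)

#include <rmm/exec_policy.hpp>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/reduce.h>

// Count how many pages belong to each column schema by reducing a constant 1
// over the per-page keys. `page_counts` must hold one slot per distinct schema.
void count_pages_per_schema(cudf::detail::hostdevice_vector<PageInfo> const& pages,
                            rmm::device_uvector<size_t>& page_counts,
                            rmm::cuda_stream_view stream)
{
  auto keys = thrust::make_transform_iterator(pages.d_begin(), get_page_key{});
  thrust::reduce_by_key(rmm::exec_policy(stream),
                        keys,
                        keys + pages.size(),
                        thrust::make_constant_iterator(size_t{1}),
                        thrust::make_discard_iterator(),  // drop the key output
                        page_counts.begin());
}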
+
+/**
+ * @brief Return an iterator that returns the keys for a vector of pages.
+ */
+inline auto make_page_key_iterator(cudf::detail::hostdevice_vector<PageInfo> const& pages)
+{
+  return thrust::make_transform_iterator(pages.d_begin(), get_page_key{});
+}
+
 /**
  * @brief Struct describing a particular chunk of column data
  */
@@ -273,7 +291,7 @@ struct ColumnChunkDesc {
       num_data_pages(0),
       num_dict_pages(0),
       max_num_pages(0),
-      page_info(nullptr),
+      dict_page(nullptr),
       str_dict_index(nullptr),
       valid_map_base{nullptr},
       column_data_base{nullptr},
@@ -291,7 +309,7 @@ struct ColumnChunkDesc {
   uint8_t const* compressed_data{};  // pointer to compressed column chunk data
   size_t compressed_size{};          // total compressed data size for this chunk
   size_t num_values{};               // total number of values in this column
-  size_t start_row{};                // starting row of this chunk
+  size_t start_row{};                // file-wide, absolute starting row of this chunk
   uint32_t num_rows{};               // number of rows in this chunk
   int16_t max_level[level_type::NUM_LEVEL_TYPES]{};  // max definition/repetition level
   int16_t max_nesting_depth{};                       // max nesting depth of the output
@@ -302,8 +320,7 @@ struct ColumnChunkDesc {
   int32_t num_data_pages{};  // number of data pages
   int32_t num_dict_pages{};  // number of dictionary pages
   int32_t max_num_pages{};   // size of page_info array
-  PageInfo* page_info{};     // output page info for up to num_dict_pages +
-                             // num_data_pages (dictionary pages first)
+  PageInfo* dict_page{};     // this chunk's dictionary page, if present
   string_index_pair* str_dict_index{};  // index for string dictionary
   bitmask_type** valid_map_base{};      // base pointers of valid bit map for this column
   void** column_data_base{};            // base pointers of column data
@@ -318,6 +335,13 @@ struct ColumnChunkDesc {
   int32_t src_col_schema{};  // my schema index in the file
 };

+/**
+ * @brief A utility structure for use in decoding page headers.
+ */
+struct chunk_page_info {
+  PageInfo* pages;  // pointer to this chunk's array of page headers
+};
+
 /**
  * @brief Struct describing an encoder column
  */
@@ -451,10 +475,11 @@ constexpr bool is_string_col(ColumnChunkDesc const& chunk)
  * @brief Launches kernel for parsing the page headers in the column chunks
  *
  * @param[in] chunks List of column chunks
+ * @param[in] chunk_pages List of chunk_page_info structs, one per chunk, pointing to the output
+ *                        page headers for that chunk
  * @param[in] num_chunks Number of column chunks
  * @param[in] stream CUDA stream to use
  */
-void DecodePageHeaders(ColumnChunkDesc* chunks, int32_t num_chunks, rmm::cuda_stream_view stream);
+void DecodePageHeaders(ColumnChunkDesc* chunks,
+                       chunk_page_info* chunk_pages,
+                       int32_t num_chunks,
+                       rmm::cuda_stream_view stream);

 /**
  * @brief Launches kernel for building the dictionary index for the column
diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp
index db81222157a..22e5038099c 100644
--- a/cpp/src/io/parquet/reader_impl.cpp
+++ b/cpp/src/io/parquet/reader_impl.cpp
@@ -29,21 +29,22 @@ namespace cudf::io::parquet::detail {

 void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows)
 {
-  auto& chunks              = _pass_itm_data->chunks;
-  auto& pages               = _pass_itm_data->pages_info;
-  auto& page_nesting        = _pass_itm_data->page_nesting_info;
-  auto& page_nesting_decode = _pass_itm_data->page_nesting_decode_info;
+  auto& pass    = *_pass_itm_data;
+  auto& subpass = *pass.subpass;
+
+  auto& page_nesting        = subpass.page_nesting_info;
+  auto& page_nesting_decode = subpass.page_nesting_decode_info;

   // Should not reach here if there is no page data.
- CUDF_EXPECTS(pages.size() > 0, "There is no page to decode"); + CUDF_EXPECTS(subpass.pages.size() > 0, "There are no pages to decode"); size_t const sum_max_depths = std::accumulate( - chunks.begin(), chunks.end(), 0, [&](size_t cursum, ColumnChunkDesc const& chunk) { + pass.chunks.begin(), pass.chunks.end(), 0, [&](size_t cursum, ColumnChunkDesc const& chunk) { return cursum + _metadata->get_output_nesting_depth(chunk.src_col_schema); }); // figure out which kernels to run - auto const kernel_mask = GetAggregatedDecodeKernelMask(pages, _stream); + auto const kernel_mask = GetAggregatedDecodeKernelMask(subpass.pages, _stream); // Check to see if there are any string columns present. If so, then we need to get size info // for each string page. This size info will be used to pre-allocate memory for the column, @@ -55,7 +56,7 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) std::vector col_sizes(_input_columns.size(), 0L); if (has_strings) { ComputePageStringSizes( - pages, chunks, skip_rows, num_rows, _pass_itm_data->level_type_size, _stream); + subpass.pages, pass.chunks, skip_rows, num_rows, pass.level_type_size, _stream); col_sizes = calculate_page_string_offsets(); @@ -78,26 +79,26 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) cudf::detail::hostdevice_vector(has_strings ? sum_max_depths : 0, _stream); // Update chunks with pointers to column data. - for (size_t c = 0, page_count = 0, chunk_off = 0; c < chunks.size(); c++) { - input_column_info const& input_col = _input_columns[chunks[c].src_col_index]; - CUDF_EXPECTS(input_col.schema_idx == chunks[c].src_col_schema, + for (size_t c = 0, page_count = 0, chunk_off = 0; c < pass.chunks.size(); c++) { + input_column_info const& input_col = _input_columns[pass.chunks[c].src_col_index]; + CUDF_EXPECTS(input_col.schema_idx == pass.chunks[c].src_col_schema, "Column/page schema index mismatch"); - size_t max_depth = _metadata->get_output_nesting_depth(chunks[c].src_col_schema); + size_t max_depth = _metadata->get_output_nesting_depth(pass.chunks[c].src_col_schema); chunk_offsets.push_back(chunk_off); // get a slice of size `nesting depth` from `chunk_nested_valids` to store an array of pointers // to validity data auto valids = chunk_nested_valids.host_ptr(chunk_off); - chunks[c].valid_map_base = chunk_nested_valids.device_ptr(chunk_off); + pass.chunks[c].valid_map_base = chunk_nested_valids.device_ptr(chunk_off); // get a slice of size `nesting depth` from `chunk_nested_data` to store an array of pointers to // out data auto data = chunk_nested_data.host_ptr(chunk_off); - chunks[c].column_data_base = chunk_nested_data.device_ptr(chunk_off); + pass.chunks[c].column_data_base = chunk_nested_data.device_ptr(chunk_off); auto str_data = has_strings ? chunk_nested_str_data.host_ptr(chunk_off) : nullptr; - chunks[c].column_string_base = + pass.chunks[c].column_string_base = has_strings ? 
chunk_nested_str_data.device_ptr(chunk_off) : nullptr; chunk_off += max_depth; @@ -143,8 +144,8 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) valids[idx] = out_buf.null_mask(); data[idx] = out_buf.data(); // only do string buffer for leaf - if (out_buf.string_size() == 0 && col_sizes[chunks[c].src_col_index] > 0) { - out_buf.create_string_data(col_sizes[chunks[c].src_col_index], _stream); + if (out_buf.string_size() == 0 && col_sizes[pass.chunks[c].src_col_index] > 0) { + out_buf.create_string_data(col_sizes[pass.chunks[c].src_col_index], _stream); } if (has_strings) { str_data[idx] = out_buf.string_data(); } out_buf.user_data |= @@ -156,10 +157,10 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) } // column_data_base will always point to leaf data, even for nested types. - page_count += chunks[c].max_num_pages; + page_count += subpass.chunk_page_count[c]; } - chunks.host_to_device_async(_stream); + pass.chunks.host_to_device_async(_stream); chunk_nested_valids.host_to_device_async(_stream); chunk_nested_data.host_to_device_async(_stream); @@ -177,25 +178,25 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) auto& stream = streams[s_idx++]; chunk_nested_str_data.host_to_device_async(stream); DecodeStringPageData( - pages, chunks, num_rows, skip_rows, level_type_size, error_code.data(), stream); + subpass.pages, pass.chunks, num_rows, skip_rows, level_type_size, error_code.data(), stream); } // launch delta binary decoder if ((kernel_mask & KERNEL_MASK_DELTA_BINARY) != 0) { DecodeDeltaBinary( - pages, chunks, num_rows, skip_rows, level_type_size, error_code.data(), streams[s_idx++]); + subpass.pages, pass.chunks, num_rows, skip_rows, level_type_size, error_code.data(), streams[s_idx++]); } // launch the catch-all page decoder if ((kernel_mask & KERNEL_MASK_GENERAL) != 0) { DecodePageData( - pages, chunks, num_rows, skip_rows, level_type_size, error_code.data(), streams[s_idx++]); + subpass.pages, pass.chunks, num_rows, skip_rows, level_type_size, error_code.data(), streams[s_idx++]); } // synchronize the streams cudf::detail::join_streams(streams, _stream); - pages.device_to_host_async(_stream); + subpass.pages.device_to_host_async(_stream); page_nesting.device_to_host_async(_stream); page_nesting_decode.device_to_host_async(_stream); @@ -247,10 +248,10 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) } // update null counts in the final column buffers - for (size_t idx = 0; idx < pages.size(); idx++) { - PageInfo* pi = &pages[idx]; + for (size_t idx = 0; idx < subpass.pages.size(); idx++) { + PageInfo* pi = &subpass.pages[idx]; if (pi->flags & PAGEINFO_FLAGS_DICTIONARY) { continue; } - ColumnChunkDesc* col = &chunks[pi->chunk_idx]; + ColumnChunkDesc* col = &pass.chunks[pi->chunk_idx]; input_column_info const& input_col = _input_columns[col->src_col_index]; int index = pi->nesting_decode - page_nesting_decode.device_ptr(); @@ -332,60 +333,16 @@ void reader::impl::prepare_data(int64_t skip_rows, { // if we have not preprocessed at the whole-file level, do that now if (!_file_preprocessed) { - // if filter is not empty, then create output types as vector and pass for filtering. 
- std::vector output_types; - if (filter.has_value()) { - std::transform(_output_buffers.cbegin(), - _output_buffers.cend(), - std::back_inserter(output_types), - [](auto const& col) { return col.type; }); - } - std::tie( - _file_itm_data.global_skip_rows, _file_itm_data.global_num_rows, _file_itm_data.row_groups) = - _metadata->select_row_groups( - row_group_indices, skip_rows, num_rows, output_types, filter, _stream); - - if (_file_itm_data.global_num_rows > 0 && not _file_itm_data.row_groups.empty() && - not _input_columns.empty()) { - // fills in chunk information without physically loading or decompressing - // the associated data - create_global_chunk_info(); - - // compute schedule of input reads. Each rowgroup contains 1 chunk per column. For now - // we will read an entire row group at a time. However, it is possible to do - // sub-rowgroup reads if we made some estimates on individual chunk sizes (tricky) and - // changed the high level structure such that we weren't always reading an entire table's - // worth of columns at once. - compute_input_passes(); - } - - _file_preprocessed = true; + // setup file level information + // - read row group information + // - setup information on (parquet) chunks + // - compute schedule of input passes + preprocess_file(skip_rows, num_rows, row_group_indices, filter); } - // if we have to start a new pass, do that now - if (!_pass_preprocessed) { - auto const num_passes = _file_itm_data.input_pass_row_group_offsets.size() - 1; - - // always create the pass struct, even if we end up with no passes. - // this will also cause the previous pass information to be deleted - _pass_itm_data = std::make_unique(); - - if (_file_itm_data.global_num_rows > 0 && not _file_itm_data.row_groups.empty() && - not _input_columns.empty() && _current_input_pass < num_passes) { - // setup the pass_intermediate_info for this pass. - setup_next_pass(); - - load_and_decompress_data(); - preprocess_pages(uses_custom_row_bounds, _output_chunk_read_limit); - - if (_output_chunk_read_limit == 0) { // read the whole file at once - CUDF_EXPECTS(_pass_itm_data->output_chunk_read_info.size() == 1, - "Reading the whole file should yield only one chunk."); - } - } - - _pass_preprocessed = true; - } + // handle any chunking work (ratcheting through the subpasses and chunks within + // our current pass) + handle_chunking(uses_custom_row_bounds); } void reader::impl::populate_metadata(table_metadata& out_metadata) @@ -415,12 +372,15 @@ table_with_metadata reader::impl::read_chunk_internal( auto out_columns = std::vector>{}; out_columns.reserve(_output_buffers.size()); - if (!has_next() || _pass_itm_data->output_chunk_read_info.empty()) { + #if 0 + if (!has_next()/* || _pass_itm_data->output_chunk_read_info.empty()*/) { return finalize_output(out_metadata, out_columns, filter); } + #endif - auto const& read_info = - _pass_itm_data->output_chunk_read_info[_pass_itm_data->current_output_chunk]; + auto& pass = *_pass_itm_data; + auto& subpass = *pass.subpass; + auto const& read_info = subpass.output_chunk_read_info[subpass.current_output_chunk]; // Allocate memory buffers for the output columns. 
allocate_columns(read_info.skip_rows, read_info.num_rows, uses_custom_row_bounds); @@ -473,16 +433,12 @@ table_with_metadata reader::impl::finalize_output( _output_metadata = std::make_unique(out_metadata); } - // advance chunks/passes as necessary - _pass_itm_data->current_output_chunk++; - _chunk_count++; - if (_pass_itm_data->current_output_chunk >= _pass_itm_data->output_chunk_read_info.size()) { - _pass_itm_data->current_output_chunk = 0; - _pass_itm_data->output_chunk_read_info.clear(); - - _current_input_pass++; - _pass_preprocessed = false; - } + // advance output chunk/subpass/pass info + auto& pass = *_pass_itm_data; + auto& subpass = *pass.subpass; + subpass.current_output_chunk++; + pass.processed_rows += subpass.num_rows; + _file_itm_data._output_chunk_count++; if (filter.has_value()) { auto read_table = std::make_unique(std::move(out_columns)); @@ -518,7 +474,7 @@ table_with_metadata reader::impl::read_chunk() { // Reset the output buffers to their original states (right after reader construction). // Don't need to do it if we read the file all at once. - if (_chunk_count > 0) { + if (_file_itm_data._output_chunk_count > 0) { _output_buffers.resize(0); for (auto const& buff : _output_buffers_template) { _output_buffers.emplace_back(cudf::io::detail::inline_column_buffer::empty_like(buff)); @@ -540,11 +496,12 @@ bool reader::impl::has_next() true /*uses_custom_row_bounds*/, {} /*row_group_indices, empty means read all row groups*/, std::nullopt /*filter*/); - - size_t const num_input_passes = std::max( - int64_t{0}, static_cast(_file_itm_data.input_pass_row_group_offsets.size()) - 1); - return (_pass_itm_data->current_output_chunk < _pass_itm_data->output_chunk_read_info.size()) || - (_current_input_pass < num_input_passes); + + // current_input_pass will only be incremented to be == num_passes after + // the last chunk in the last subpass in the last pass has been returned + auto const num_passes = _file_itm_data.input_pass_row_group_offsets.size() - 1; + bool const more_work = _file_itm_data._current_input_pass < num_passes; + return more_work; } namespace { diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index cea4ba35606..c6d711666b8 100644 --- a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -138,20 +138,57 @@ class reader::impl { std::optional> filter); /** - * @brief Create chunk information and start file reads + * @brief Read the set of column chunks to be processed for this pass. + * + * Does not decompress the chunk data. * * @return pair of boolean indicating if compressed chunks were found and a vector of futures for * read completion */ - std::pair>> read_and_decompress_column_chunks(); + std::pair>> read_column_chunks(); /** - * @brief Load and decompress the input file(s) into memory. + * @brief Load compressed data and page information for the current pass. */ - void load_and_decompress_data(); + void load_compressed_data(); /** - * @brief Perform some preprocessing for page data and also compute the split locations + * @brief Preprocess step for the entire file. + * + * Only ever called once. This function reads in rowgroup and associated chunk + * information and computes the schedule of top level passes (see `pass_intermediate_data`). 
+ *
+ * @param skip_rows The number of rows to skip in the requested set of rowgroups to be read
+ * @param num_rows The total number of rows to read out of the selected rowgroups
+ * @param row_group_indices Lists of row groups to read, one per source
+ * @param filter Optional AST expression to filter output rows
+ */
+ void preprocess_file(int64_t skip_rows,
+ std::optional<size_type> const& num_rows,
+ host_span<std::vector<size_type> const> row_group_indices,
+ std::optional<std::reference_wrapper<ast::expression const>> filter);
+
+ /**
+ * @brief Preprocess step for the next input read pass.
+ *
+ * A 'pass' is defined as a subset of row groups read out of the globally
+ * requested set of all row groups.
+ */
+ void preprocess_next_pass();
+
+ /**
+ * @brief Ratchet the pass/subpass/chunk process forward.
+ *
+ * @param uses_custom_row_bounds Whether or not num_rows and skip_rows represent
+ * user-specific bounds
+ */
+ void handle_chunking(bool uses_custom_row_bounds);
+
+ /**
+ * @brief Build string dictionary indices for a pass.
+ */
+ void build_string_dict_indices();
+
+ /**
+ * @brief Perform some preprocessing for subpass page data and also compute the split locations
* {skip_rows, num_rows} for chunked reading.
*
* There are several pieces of information we can't compute directly from row counts in
@@ -166,7 +203,7 @@ class reader::impl {
* @param chunk_read_limit Limit on total number of bytes to be returned per read,
* or `0` if there is no limit
*/
- void preprocess_pages(bool uses_custom_row_bounds, size_t chunk_read_limit);
+ void preprocess_subpass_pages(bool uses_custom_row_bounds, size_t chunk_read_limit);

/**
* @brief Allocate nesting information storage for all pages and set pointers to it.
@@ -270,7 +307,7 @@ class reader::impl {
* a limit on total read size, generate a set of {skip_rows, num_rows} pairs representing
* a set of reads that will generate output columns of total size <= `chunk_read_limit` bytes.
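* For example (illustrative numbers only): with `chunk_read_limit` = 100 MB and
* cumulative output column sizes totalling 340 MB, this produces four
* {skip_rows, num_rows} pairs, the first three capped near 100 MB of output and
* the last holding the remainder.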
*/ - void compute_splits_for_pass(); + void compute_chunks_for_subpass(); private: rmm::cuda_stream_view _stream; @@ -311,13 +348,9 @@ class reader::impl { bool _file_preprocessed{false}; std::unique_ptr _pass_itm_data; - bool _pass_preprocessed{false}; std::size_t _output_chunk_read_limit{0}; // output chunk size limit in bytes std::size_t _input_pass_read_limit{0}; // input pass memory usage limit in bytes - - std::size_t _current_input_pass{0}; // current input pass index - std::size_t _chunk_count{0}; // how many output chunks we have produced }; } // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index ad52a7dfcc1..b902c68dcd8 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -19,30 +19,42 @@ #include #include +#include + +#include #include +#include #include #include #include #include +#include #include +#include + +#include namespace cudf::io::parquet::detail { namespace { -struct cumulative_row_info { +struct cumulative_page_info { size_t row_count; // cumulative row count size_t size_bytes; // cumulative size in bytes int key; // schema index }; +struct split_info { + row_range rows; + int64_t split_pos; +}; + #if defined(CHUNKING_DEBUG) void print_cumulative_page_info(cudf::detail::hostdevice_vector& pages, - rmm::device_uvector const& page_index, - rmm::device_uvector const& c_info, + rmm::device_uvector const& c_info, rmm::cuda_stream_view stream) { pages.device_to_host_sync(stream); @@ -50,14 +62,11 @@ void print_cumulative_page_info(cudf::detail::hostdevice_vector& pages printf("------------\nCumulative sizes by page\n"); std::vector schemas(pages.size()); - std::vector h_page_index(pages.size()); + std::vector h_cinfo(pages.size()); CUDF_CUDA_TRY(cudaMemcpy( - h_page_index.data(), page_index.data(), sizeof(int) * pages.size(), cudaMemcpyDefault)); - std::vector h_cinfo(pages.size()); - CUDF_CUDA_TRY(cudaMemcpy( - h_cinfo.data(), c_info.data(), sizeof(cumulative_row_info) * pages.size(), cudaMemcpyDefault)); + h_cinfo.data(), c_info.data(), sizeof(cumulative_page_info) * pages.size(), cudaMemcpyDefault)); auto schema_iter = cudf::detail::make_counting_transform_iterator( - 0, [&](size_type i) { return pages[h_page_index[i]].src_col_schema; }); + 0, [&](size_type i) { return pages[i].src_col_schema; }); thrust::copy(thrust::seq, schema_iter, schema_iter + pages.size(), schemas.begin()); auto last = thrust::unique(thrust::seq, schemas.begin(), schemas.end()); schemas.resize(last - schemas.begin()); @@ -66,7 +75,7 @@ void print_cumulative_page_info(cudf::detail::hostdevice_vector& pages for (size_t idx = 0; idx < schemas.size(); idx++) { printf("Schema %d\n", schemas[idx]); for (size_t pidx = 0; pidx < pages.size(); pidx++) { - auto const& page = pages[h_page_index[pidx]]; + auto const& page = pages[pidx]; if (page.flags & PAGEINFO_FLAGS_DICTIONARY || page.src_col_schema != schemas[idx]) { continue; } @@ -75,24 +84,24 @@ void print_cumulative_page_info(cudf::detail::hostdevice_vector& pages } } -void print_cumulative_row_info(host_span sizes, +void print_cumulative_row_info(host_span sizes, std::string const& label, - std::optional> splits = std::nullopt) + std::optional> splits = std::nullopt) { if (splits.has_value()) { - printf("------------\nSplits\n"); + printf("------------\nSplits (skip_rows, num_rows)\n"); for (size_t idx = 0; idx < splits->size(); idx++) { - printf("{%lu, %lu}\n", splits.value()[idx].skip_rows, 
splits.value()[idx].num_rows); + printf("{%lu, %lu}\n", splits.value()[idx].rows.skip_rows, splits.value()[idx].rows.num_rows); } } - printf("------------\nCumulative sizes %s\n", label.c_str()); + printf("------------\nCumulative sizes %s (row_count, size_bytes, page_key)\n", label.c_str()); for (size_t idx = 0; idx < sizes.size(); idx++) { printf("{%lu, %lu, %d}", sizes[idx].row_count, sizes[idx].size_bytes, sizes[idx].key); if (splits.has_value()) { // if we have a split at this row count and this is the last instance of this row count auto start = thrust::make_transform_iterator( - splits->begin(), [](chunk_read_info const& i) { return i.skip_rows; }); + splits->begin(), [](split_info const& i) { return i.rows.skip_rows; }); auto end = start + splits->size(); auto split = std::find(start, end, sizes[idx].row_count); auto const split_index = [&]() -> int { @@ -103,9 +112,10 @@ void print_cumulative_row_info(host_span sizes, return idx == 0 ? 0 : -1; }(); if (split_index >= 0) { - printf(" <-- split {%lu, %lu}", - splits.value()[split_index].skip_rows, - splits.value()[split_index].num_rows); + printf(" <-- split {%lu, %lu, %lu}", + splits.value()[split_index].rows.skip_rows, + splits.value()[split_index].rows.num_rows, + splits.value()[split_index].split_pos); } } printf("\n"); @@ -113,14 +123,15 @@ void print_cumulative_row_info(host_span sizes, } #endif // CHUNKING_DEBUG + /** - * @brief Functor which reduces two cumulative_row_info structs of the same key. + * @brief Functor which reduces two cumulative_page_info structs of the same key. */ -struct cumulative_row_sum { - cumulative_row_info operator() - __device__(cumulative_row_info const& a, cumulative_row_info const& b) const +struct cumulative_page_sum { + cumulative_page_info operator() + __device__(cumulative_page_info const& a, cumulative_page_info const& b) const { - return cumulative_row_info{a.row_count + b.row_count, a.size_bytes + b.size_bytes, a.key}; + return cumulative_page_info{a.row_count + b.row_count, a.size_bytes + b.size_bytes, a.key}; } }; @@ -178,19 +189,16 @@ __device__ size_t row_size_functor::operator()(size_t num_rows, boo * * Sums across all nesting levels. */ -struct get_cumulative_row_info { - PageInfo const* const pages; - - __device__ cumulative_row_info operator()(size_type index) - { - auto const& page = pages[index]; +struct get_cumulative_page_info { + __device__ cumulative_page_info operator()(PageInfo const& page) + { if (page.flags & PAGEINFO_FLAGS_DICTIONARY) { - return cumulative_row_info{0, 0, page.src_col_schema}; + return cumulative_page_info{0, 0, page.src_col_schema}; } // total nested size, not counting string data auto iter = - cudf::detail::make_counting_transform_iterator(0, [page, index] __device__(size_type i) { + cudf::detail::make_counting_transform_iterator(0, [page] __device__(size_type i) { auto const& pni = page.nesting[i]; return cudf::type_dispatcher( data_type{pni.type}, row_size_functor{}, pni.size, pni.nullable); @@ -204,6 +212,28 @@ struct get_cumulative_row_info { } }; +/** + * @brief Functor which computes the (uncompressed) size of a page. + */ +struct get_page_size { + device_span chunks; + + __device__ cumulative_page_info operator()(PageInfo const& page) + { + if (page.flags & PAGEINFO_FLAGS_DICTIONARY) { + return cumulative_page_info{0, 0, page.src_col_schema}; + } + // TODO: this is not accurate for lists. it might make sense to make a guess + // based on total-rowgroup-size / # of rows in the rowgroup for an average of + // rows-per-byte. 
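+ // As an illustrative sketch only: a 1 MB row group holding 100,000 rows
+ // averages ~10 bytes per row, so a list page whose row count is unknown before
+ // decode could be estimated at (page byte size / 10) rows (hypothetical numbers).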
+ size_t const row_count = page.num_rows; + return { + row_count, + static_cast(page.uncompressed_page_size), + page.src_col_schema}; + } +}; + /** * @brief Functor which computes the effective size of all input columns by page. * @@ -219,12 +249,12 @@ struct get_cumulative_row_info { * at that point. So we have to proceed as if we are taking the bytes from all 200 rows of that * page. Essentially, a conservative over-estimate of the real size. */ -struct row_total_size { - cumulative_row_info const* c_info; +struct page_total_size { + cumulative_page_info const* c_info; size_type const* key_offsets; size_t num_keys; - __device__ cumulative_row_info operator()(cumulative_row_info const& i) + __device__ cumulative_page_info operator()(cumulative_page_info const& i) { // sum sizes for each input column at this row size_t sum = 0; @@ -241,57 +271,67 @@ struct row_total_size { } }; +int64_t find_next_split(int64_t cur_pos, + size_t cur_row_count, + std::vector const& sizes, + size_t chunk_read_limit) +{ + size_t cur_cumulative_size = cur_pos == 0 ? 0 : sizes[cur_pos-1].size_bytes; + + auto start = thrust::make_transform_iterator(sizes.begin(), [&](cumulative_page_info const& i) { + return i.size_bytes - cur_cumulative_size; + }); + auto end = start + sizes.size(); + + int64_t split_pos = + thrust::lower_bound(thrust::seq, start + cur_pos, end, chunk_read_limit) - start; + + // if we're past the end, or if the returned bucket is > than the chunk_read_limit, move back + // one. + if (static_cast(split_pos) >= sizes.size() || + (sizes[split_pos].size_bytes - cur_cumulative_size > chunk_read_limit)) { + split_pos--; + } + + // best-try. if we can't find something that'll fit, we have to go bigger. we're doing this in + // a loop because all of the cumulative sizes for all the pages are sorted into one big list. + // so if we had two columns, both of which had an entry {1000, 10000}, that entry would be in + // the list twice. so we have to iterate until we skip past all of them. The idea is that we + // either do this, or we have to call unique() on the input first. + while (split_pos < (static_cast(sizes.size()) - 1) && + (split_pos < 0 || sizes[split_pos].row_count == cur_row_count)) { + split_pos++; + } + + return split_pos; +} + /** * @brief Given a vector of cumulative {row_count, byte_size} pairs and a chunk read * limit, determine the set of splits. * * @param sizes Vector of cumulative {row_count, byte_size} pairs - * @param num_rows Total number of rows to read * @param chunk_read_limit Limit on total number of bytes to be returned per read, for all columns */ -std::vector find_splits(std::vector const& sizes, - size_t num_rows, - size_t chunk_read_limit) +std::vector find_splits(std::vector const& sizes, + size_t chunk_read_limit) { // now we have an array of {row_count, real output bytes}. just walk through it and generate // splits. // TODO: come up with a clever way to do this entirely in parallel. 
For now, as long as batch
// sizes are reasonably large, this shouldn't iterate too many times
- std::vector<chunk_read_info> splits;
+ std::vector<split_info> splits;
{
size_t cur_pos = 0;
- size_t cur_cumulative_size = 0;
size_t cur_row_count = 0;
- auto start = thrust::make_transform_iterator(sizes.begin(), [&](cumulative_row_info const& i) {
- return i.size_bytes - cur_cumulative_size;
- });
- auto end = start + sizes.size();
+ auto const num_rows = sizes.back().row_count;
while (cur_row_count < num_rows) {
- int64_t split_pos =
- thrust::lower_bound(thrust::seq, start + cur_pos, end, chunk_read_limit) - start;
-
- // if we're past the end, or if the returned bucket is > than the chunk_read_limit, move back
- // one.
- if (static_cast<size_t>(split_pos) >= sizes.size() ||
- (sizes[split_pos].size_bytes - cur_cumulative_size > chunk_read_limit)) {
- split_pos--;
- }
-
- // best-try. if we can't find something that'll fit, we have to go bigger. we're doing this in
- // a loop because all of the cumulative sizes for all the pages are sorted into one big list.
- // so if we had two columns, both of which had an entry {1000, 10000}, that entry would be in
- // the list twice. so we have to iterate until we skip past all of them. The idea is that we
- // either do this, or we have to call unique() on the input first.
- while (split_pos < (static_cast<int64_t>(sizes.size()) - 1) &&
- (split_pos < 0 || sizes[split_pos].row_count == cur_row_count)) {
- split_pos++;
- }
-
+ auto const split_pos = find_next_split(cur_pos, cur_row_count, sizes, chunk_read_limit);
+ auto const start_row = cur_row_count;
cur_row_count = sizes[split_pos].row_count;
- splits.push_back(chunk_read_info{start_row, cur_row_count - start_row});
+ splits.push_back(split_info{row_range{start_row, cur_row_count - start_row},
+ static_cast<int64_t>(cur_pos == 0 ? 0 : cur_pos + 1)});
cur_pos = split_pos;
- cur_cumulative_size = sizes[split_pos].size_bytes;
}
}
// print_cumulative_row_info(sizes, "adjusted", splits);
@@ -340,12 +380,467 @@ template
}

struct row_count_compare {
- __device__ bool operator()(cumulative_row_info const& a, cumulative_row_info const& b)
+ __device__ bool operator()(cumulative_page_info const& a, cumulative_page_info const& b)
{
return a.row_count < b.row_count;
}
};

+std::pair<size_t, size_t> get_row_group_size(RowGroup const& rg)
+{
+ auto compressed_size_iter = thrust::make_transform_iterator(rg.columns.begin(), [](ColumnChunk const& c) {
+ return c.meta_data.total_compressed_size;
+ });
+
+ // total_byte_size is the uncompressed size; while a pass is being decompressed, both the
+ // compressed and uncompressed copies are resident, so report both sizes
+ auto const compressed_size = std::reduce(compressed_size_iter, compressed_size_iter + rg.columns.size());
+ auto const total_size = compressed_size + rg.total_byte_size;
+ return {compressed_size, total_size};
+}
+
+std::pair<rmm::device_uvector<cumulative_page_info>, rmm::device_uvector<int32_t>>
+adjust_cumulative_sizes(rmm::device_uvector<cumulative_page_info> const& c_info,
+ cudf::detail::hostdevice_vector<PageInfo> const& pages,
+ rmm::cuda_stream_view stream)
+{
+ // sort by row count
+ rmm::device_uvector<cumulative_page_info> c_info_sorted{c_info, stream};
+ thrust::sort(
+ rmm::exec_policy(stream), c_info_sorted.begin(), c_info_sorted.end(), row_count_compare{});
+
+ // page keys grouped by split.
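+ // e.g. with two schemas whose pages interleave once sorted by cumulative row
+ // count, the key sequence might look like {1, 2, 1, 2, ...}; callers use it to
+ // map each split back to the per-column pages it covers (illustrative sketch).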
+ rmm::device_uvector page_keys_by_split{c_info.size(), stream}; + thrust::transform(rmm::exec_policy(stream), c_info_sorted.begin(), c_info_sorted.end(), page_keys_by_split.begin(), [] __device__ (cumulative_page_info const& c){ + return c.key; + }); + + std::vector h_c_info_sorted(c_info_sorted.size()); + CUDF_CUDA_TRY(cudaMemcpy(h_c_info_sorted.data(), + c_info_sorted.data(), + sizeof(cumulative_page_info) * c_info_sorted.size(), + cudaMemcpyDefault)); + // print_cumulative_row_info(h_c_info_sorted, "raw"); + + // generate key offsets (offsets to the start of each partition of keys). worst case is 1 page per + // key + rmm::device_uvector key_offsets(pages.size() + 1, stream); + auto page_keys = make_page_key_iterator(pages); + auto const key_offsets_end = thrust::reduce_by_key(rmm::exec_policy(stream), + page_keys, + page_keys + pages.size(), + thrust::make_constant_iterator(1), + thrust::make_discard_iterator(), + key_offsets.begin()).second; + size_t const num_unique_keys = key_offsets_end - key_offsets.begin(); + thrust::exclusive_scan( + rmm::exec_policy(stream), key_offsets.begin(), key_offsets.end(), key_offsets.begin()); + + // adjust the cumulative info such that for each row count, the size includes any pages that span + // that row count. this is so that if we have this case: + // page row counts + // Column A: 0 <----> 100 <----> 200 + // Column B: 0 <---------------> 200 <--------> 400 + // | + // if we decide to split at row 100, we don't really know the actual amount of bytes in column B + // at that point. So we have to proceed as if we are taking the bytes from all 200 rows of that + // page. + // + rmm::device_uvector aggregated_info(c_info.size(), stream); + thrust::transform(rmm::exec_policy(stream), + c_info_sorted.begin(), + c_info_sorted.end(), + aggregated_info.begin(), + page_total_size{c_info.data(), key_offsets.data(), num_unique_keys}); + return {std::move(aggregated_info), std::move(page_keys_by_split)}; +} + +struct page_span { + size_t start, end; +}; +std::pair, size_t> +compute_next_subpass(rmm::device_uvector const& c_info, + cudf::detail::hostdevice_vector const& pages, + cudf::detail::hostdevice_vector const& page_offsets, + size_t min_row, + size_t size_limit, + size_t num_columns, + rmm::cuda_stream_view stream) +{ + auto [aggregated_info, page_keys_by_split] = adjust_cumulative_sizes(c_info, pages, stream); + + // bring back to the cpu + std::vector h_aggregated_info(aggregated_info.size()); + CUDF_CUDA_TRY(cudaMemcpyAsync(h_aggregated_info.data(), + aggregated_info.data(), + sizeof(cumulative_page_info) * c_info.size(), + cudaMemcpyDefault, + stream.value())); + stream.synchronize(); + + // print_cumulative_row_info(h_aggregated_info, "adjusted"); + + // first, find the min row + auto start = thrust::make_transform_iterator(h_aggregated_info.begin(), [&](cumulative_page_info const& i){ + return i.row_count; + }); + auto const start_index = thrust::upper_bound(thrust::host, start, start + h_aggregated_info.size(), min_row) - start; + + // find the next split + auto const end_index = find_next_split(start_index, + min_row, + // 0, + h_aggregated_info, + size_limit) + 1; // the split index returned is inclusive + + // get the number of pages for each column/schema + auto get_page_counts = [num_columns, stream](rmm::device_uvector const& aggregated_info, int start_index, int end_index){ + std::vector h_page_counts(num_columns); + + auto const num_pages = end_index - start_index; + if(num_pages == 0){ + std::fill(h_page_counts.begin(), 
h_page_counts.end(), 0); + return h_page_counts; + } + + rmm::device_uvector page_keys(num_pages, stream); + thrust::transform(rmm::exec_policy(stream), + aggregated_info.begin() + start_index, + aggregated_info.begin() + end_index, + page_keys.begin(), + [] __device__ (cumulative_page_info const& i){ + return i.key; + }); + thrust::sort(rmm::exec_policy(stream), page_keys.begin(), page_keys.end()); + rmm::device_uvector page_counts(num_pages, stream); + auto page_counts_end = thrust::reduce_by_key(rmm::exec_policy(stream), + page_keys.begin(), + page_keys.end(), + thrust::make_constant_iterator(1), + thrust::make_discard_iterator(), + page_counts.begin()).second; + auto const num_page_counts = page_counts_end - page_counts.begin(); + CUDF_EXPECTS(static_cast(num_page_counts) == num_columns, "Encountered a mismatch in column/schema counts while computing subpass split"); + + cudaMemcpyAsync(h_page_counts.data(), page_counts.data(), sizeof(size_t) * num_columns, cudaMemcpyDeviceToHost); + stream.synchronize(); + return h_page_counts; + }; + + // get count of pages before this split and in this split. + auto last_counts = get_page_counts(aggregated_info, 0, start_index); + auto this_counts = get_page_counts(aggregated_info, start_index, end_index); + + // convert to page spans + std::vector out(num_columns); + size_t total_pages = 0; + for(size_t c_idx=0; c_idx, rmm::device_uvector> +compute_page_splits_by_row(rmm::device_uvector const& c_info, + cudf::detail::hostdevice_vector const& pages, + size_t size_limit, + rmm::cuda_stream_view stream) +{ + auto [aggregated_info, page_keys_by_split] = adjust_cumulative_sizes(c_info, pages, stream); + + // bring back to the cpu + std::vector h_aggregated_info(aggregated_info.size()); + CUDF_CUDA_TRY(cudaMemcpyAsync(h_aggregated_info.data(), + aggregated_info.data(), + sizeof(cumulative_page_info) * c_info.size(), + cudaMemcpyDefault, + stream.value())); + stream.synchronize(); + + // generate the actual splits + return {find_splits(h_aggregated_info, size_limit), std::move(page_keys_by_split)}; +} + +/** + * @brief Decompresses the page data, at page granularity. + * + * This function handles the case where `pages` is only a subset of all available + * pages in `chunks`. + * + * @param chunks List of column chunk descriptors + * @param pages List of page information + * @param stream CUDA stream used for device memory operations and kernel launches + * + * @return Device buffer to decompressed page data + */ +[[nodiscard]] rmm::device_buffer decompress_page_data( + cudf::detail::hostdevice_vector const& chunks, + cudf::detail::hostdevice_vector& pages, + rmm::cuda_stream_view stream) +{ + auto for_each_codec_page = [&](Compression codec, std::function const& f) { + for(size_t p = 0; p 0) { + debrotli_scratch.resize(get_gpu_debrotli_scratch_size(codec.num_pages), stream); + } + } + + // Dispatch batches of pages to decompress for each codec. + // Buffer needs to be padded, required by `gpuDecodePageData`. 
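+ // e.g. assuming a padding multiple of 8 bytes, a total_decomp_size of 1001
+ // would be rounded up to 1008 (illustrative numbers; BUFFER_PADDING_MULTIPLE
+ // defines the real alignment).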
+ rmm::device_buffer decomp_pages( + cudf::util::round_up_safe(total_decomp_size, BUFFER_PADDING_MULTIPLE), stream); + + std::vector> comp_in; + comp_in.reserve(num_comp_pages); + std::vector> comp_out; + comp_out.reserve(num_comp_pages); + + // vectors to save v2 def and rep level data, if any + std::vector> copy_in; + copy_in.reserve(num_comp_pages); + std::vector> copy_out; + copy_out.reserve(num_comp_pages); + + rmm::device_uvector comp_res(num_comp_pages, stream); + thrust::fill(rmm::exec_policy(stream), + comp_res.begin(), + comp_res.end(), + compression_result{0, compression_status::FAILURE}); + + size_t decomp_offset = 0; + int32_t start_pos = 0; + for (auto const& codec : codecs) { + if (codec.num_pages == 0) { continue; } + + for_each_codec_page(codec.compression_type, [&](size_t page_idx) { + auto const dst_base = static_cast(decomp_pages.data()) + decomp_offset; + auto& page = pages[page_idx]; + // offset will only be non-zero for V2 pages + auto const offset = + page.lvl_bytes[level_type::DEFINITION] + page.lvl_bytes[level_type::REPETITION]; + // for V2 need to copy def and rep level info into place, and then offset the + // input and output buffers. otherwise we'd have to keep both the compressed + // and decompressed data. + if (offset != 0) { + copy_in.emplace_back(page.page_data, offset); + copy_out.emplace_back(dst_base, offset); + } + comp_in.emplace_back(page.page_data + offset, + static_cast(page.compressed_page_size - offset)); + comp_out.emplace_back(dst_base + offset, + static_cast(page.uncompressed_page_size - offset)); + page.page_data = dst_base; + decomp_offset += page.uncompressed_page_size; + }); + + host_span const> comp_in_view{comp_in.data() + start_pos, + codec.num_pages}; + auto const d_comp_in = cudf::detail::make_device_uvector_async( + comp_in_view, stream, rmm::mr::get_current_device_resource()); + host_span const> comp_out_view(comp_out.data() + start_pos, + codec.num_pages); + auto const d_comp_out = cudf::detail::make_device_uvector_async( + comp_out_view, stream, rmm::mr::get_current_device_resource()); + device_span d_comp_res_view(comp_res.data() + start_pos, codec.num_pages); + + switch (codec.compression_type) { + case GZIP: + gpuinflate(d_comp_in, d_comp_out, d_comp_res_view, gzip_header_included::YES, stream); + break; + case SNAPPY: + if (cudf::io::detail::nvcomp_integration::is_stable_enabled()) { + nvcomp::batched_decompress(nvcomp::compression_type::SNAPPY, + d_comp_in, + d_comp_out, + d_comp_res_view, + codec.max_decompressed_size, + codec.total_decomp_size, + stream); + } else { + gpu_unsnap(d_comp_in, d_comp_out, d_comp_res_view, stream); + } + break; + case ZSTD: + nvcomp::batched_decompress(nvcomp::compression_type::ZSTD, + d_comp_in, + d_comp_out, + d_comp_res_view, + codec.max_decompressed_size, + codec.total_decomp_size, + stream); + break; + case BROTLI: + gpu_debrotli(d_comp_in, + d_comp_out, + d_comp_res_view, + debrotli_scratch.data(), + debrotli_scratch.size(), + stream); + break; + default: CUDF_FAIL("Unexpected decompression dispatch"); break; + } + start_pos += codec.num_pages; + } + + CUDF_EXPECTS(thrust::all_of(rmm::exec_policy(stream), + comp_res.begin(), + comp_res.end(), + [] __device__(auto const& res) { + return res.status == compression_status::SUCCESS; + }), + "Error during decompression"); + + // now copy the uncompressed V2 def and rep level data + if (not copy_in.empty()) { + auto const d_copy_in = cudf::detail::make_device_uvector_async( + copy_in, stream, rmm::mr::get_current_device_resource()); + auto const 
d_copy_out = cudf::detail::make_device_uvector_async( + copy_out, stream, rmm::mr::get_current_device_resource()); + + gpu_copy_uncompressed_blocks(d_copy_in, d_copy_out, stream); + stream.synchronize(); + } + + // Update the page information in device memory with the updated value of + // page_data; it now points to the uncompressed data buffer + pages.host_to_device_async(stream); + + return decomp_pages; +} + +struct flat_column_num_rows { + ColumnChunkDesc const* chunks; + + __device__ size_type operator()(PageInfo const& page) const + { + // ignore dictionary pages and pages belonging to any column containing repetition (lists) + if ((page.flags & PAGEINFO_FLAGS_DICTIONARY) || + (chunks[page.chunk_idx].max_level[level_type::REPETITION] > 0)) { + return 0; + } + return page.num_rows; + } +}; + +struct row_counts_nonzero { + __device__ bool operator()(size_type count) const { return count > 0; } +}; + +struct row_counts_different { + size_type const expected; + __device__ bool operator()(size_type count) const { return (count != 0) && (count != expected); } +}; + +/** + * @brief Detect malformed parquet input data. + * + * We have seen cases where parquet files can be oddly malformed. This function specifically + * detects one case in particular: + * + * - When you have a file containing N rows + * - For some reason, the sum total of the number of rows over all pages for a given column + * is != N + * + * @param pages All pages to be decoded + * @param chunks Chunk data + * @param expected_row_count Expected row count, if applicable + * @param stream CUDA stream used for device memory operations and kernel launches + */ +void detect_malformed_pages(cudf::detail::hostdevice_vector const& pages, + cudf::detail::hostdevice_vector const& chunks, + std::optional expected_row_count, + rmm::cuda_stream_view stream) +{ + // sum row counts for all non-dictionary, non-list columns. 
other columns will be indicated as 0 + rmm::device_uvector row_counts(pages.size(), + stream); // worst case: num keys == num pages + auto const size_iter = thrust::make_transform_iterator(pages.d_begin(), flat_column_num_rows{chunks.device_ptr()}); + auto const row_counts_begin = row_counts.begin(); + auto page_keys = make_page_key_iterator(pages); + auto const row_counts_end = thrust::reduce_by_key(rmm::exec_policy(stream), + page_keys, + page_keys + pages.size(), + size_iter, + thrust::make_discard_iterator(), + row_counts_begin) + .second; + + // make sure all non-zero row counts are the same + rmm::device_uvector compacted_row_counts(pages.size(), stream); + auto const compacted_row_counts_begin = compacted_row_counts.begin(); + auto const compacted_row_counts_end = thrust::copy_if(rmm::exec_policy(stream), + row_counts_begin, + row_counts_end, + compacted_row_counts_begin, + row_counts_nonzero{}); + if (compacted_row_counts_end != compacted_row_counts_begin) { + size_t const found_row_count = static_cast(compacted_row_counts.element(0, stream)); + + // if we somehow don't match the expected row count from the row groups themselves + if (expected_row_count.has_value()) { + CUDF_EXPECTS(expected_row_count.value() == found_row_count, + "Encountered malformed parquet page data (unexpected row count in page data)"); + } + + // all non-zero row counts must be the same + auto const chk = + thrust::count_if(rmm::exec_policy(stream), + compacted_row_counts_begin, + compacted_row_counts_end, + row_counts_different{static_cast(found_row_count)}); + CUDF_EXPECTS(chk == 0, + "Encountered malformed parquet page data (row count mismatch in page data)"); + } +} + } // anonymous namespace void reader::impl::create_global_chunk_info() @@ -431,8 +926,11 @@ void reader::impl::compute_input_passes() auto const& rgi = row_groups_info[cur_rg_index]; auto const& row_group = _metadata->get_row_group(rgi.index, rgi.source_index); + // total compressed size and total size (compressed + uncompressed) for + auto const [compressed_rg_size, _/*compressed + uncompressed*/] = get_row_group_size(row_group); + // can we add this row group - if (cur_pass_byte_size + row_group.total_byte_size >= read_limit) { + if (cur_pass_byte_size + compressed_rg_size >= read_limit) { // A single row group (the current one) is larger than the read limit: // We always need to include at least one row group, so end the pass at the end of the current // row group @@ -447,13 +945,14 @@ void reader::impl::compute_input_passes() _file_itm_data.input_pass_row_group_offsets.push_back(cur_rg_index); _file_itm_data.input_pass_row_count.push_back(cur_row_count); cur_rg_start = cur_rg_index; - cur_pass_byte_size = row_group.total_byte_size; + cur_pass_byte_size = compressed_rg_size; } } else { - cur_pass_byte_size += row_group.total_byte_size; + cur_pass_byte_size += compressed_rg_size; } cur_row_count += row_group.num_rows; } + // add the last pass if necessary if (_file_itm_data.input_pass_row_group_offsets.back() != row_groups_info.size()) { _file_itm_data.input_pass_row_group_offsets.push_back(row_groups_info.size()); @@ -461,138 +960,274 @@ void reader::impl::compute_input_passes() } } -void reader::impl::setup_next_pass() +void reader::impl::compute_chunks_for_subpass() { - // this will also cause the previous pass information to be deleted - _pass_itm_data = std::make_unique(); - - // setup row groups to be loaded for this pass - auto const row_group_start = _file_itm_data.input_pass_row_group_offsets[_current_input_pass]; - auto const 
row_group_end = _file_itm_data.input_pass_row_group_offsets[_current_input_pass + 1]; - auto const num_row_groups = row_group_end - row_group_start; - _pass_itm_data->row_groups.resize(num_row_groups); - std::copy(_file_itm_data.row_groups.begin() + row_group_start, - _file_itm_data.row_groups.begin() + row_group_end, - _pass_itm_data->row_groups.begin()); - - auto const num_passes = _file_itm_data.input_pass_row_group_offsets.size() - 1; - CUDF_EXPECTS(_current_input_pass < num_passes, "Encountered an invalid read pass index"); - - auto const chunks_per_rowgroup = _input_columns.size(); - auto const num_chunks = chunks_per_rowgroup * num_row_groups; - - auto chunk_start = _file_itm_data.chunks.begin() + (row_group_start * chunks_per_rowgroup); - auto chunk_end = _file_itm_data.chunks.begin() + (row_group_end * chunks_per_rowgroup); - - _pass_itm_data->chunks = cudf::detail::hostdevice_vector(num_chunks, _stream); - std::copy(chunk_start, chunk_end, _pass_itm_data->chunks.begin()); - - // adjust skip_rows and num_rows by what's available in the row groups we are processing - if (num_passes == 1) { - _pass_itm_data->skip_rows = _file_itm_data.global_skip_rows; - _pass_itm_data->num_rows = _file_itm_data.global_num_rows; - } else { - auto const global_start_row = _file_itm_data.global_skip_rows; - auto const global_end_row = global_start_row + _file_itm_data.global_num_rows; - auto const start_row = - std::max(_file_itm_data.input_pass_row_count[_current_input_pass], global_start_row); - auto const end_row = - std::min(_file_itm_data.input_pass_row_count[_current_input_pass + 1], global_end_row); - - // skip_rows is always global in the sense that it is relative to the first row of - // everything we will be reading, regardless of what pass we are on. - // num_rows is how many rows we are reading this pass. 
- _pass_itm_data->skip_rows = - global_start_row + _file_itm_data.input_pass_row_count[_current_input_pass]; - _pass_itm_data->num_rows = end_row - start_row; - } -} - -void reader::impl::compute_splits_for_pass() -{ - auto const skip_rows = _pass_itm_data->skip_rows; - auto const num_rows = _pass_itm_data->num_rows; - + auto& pass = *_pass_itm_data; + auto& subpass = *pass.subpass; + // simple case : no chunk size, no splits if (_output_chunk_read_limit <= 0) { - _pass_itm_data->output_chunk_read_info = std::vector{{skip_rows, num_rows}}; + subpass.output_chunk_read_info.push_back({subpass.skip_rows, subpass.num_rows}); return; } - - auto& pages = _pass_itm_data->pages_info; - - auto const& page_keys = _pass_itm_data->page_keys; - auto const& page_index = _pass_itm_data->page_index; - + // generate cumulative row counts and sizes - rmm::device_uvector c_info(page_keys.size(), _stream); - // convert PageInfo to cumulative_row_info - auto page_input = thrust::make_transform_iterator(page_index.begin(), - get_cumulative_row_info{pages.device_ptr()}); + rmm::device_uvector c_info(subpass.pages.size(), _stream); + // convert PageInfo to cumulative_page_info + auto page_input = thrust::make_transform_iterator(subpass.pages.d_begin(), get_cumulative_page_info{}); + auto page_keys = make_page_key_iterator(subpass.pages); thrust::inclusive_scan_by_key(rmm::exec_policy(_stream), - page_keys.begin(), - page_keys.end(), + page_keys, + page_keys + subpass.pages.size(), page_input, c_info.begin(), thrust::equal_to{}, - cumulative_row_sum{}); - // print_cumulative_page_info(pages, page_index, c_info, stream); + cumulative_page_sum{}); + // print_cumulative_page_info(subpass.pages, c_info, _stream); + + // compute the splits + auto [splits, _] = compute_page_splits_by_row(c_info, subpass.pages, _output_chunk_read_limit, _stream); + subpass.output_chunk_read_info.reserve(splits.size()); + + // apply skip_rows from the subpass + std::transform(splits.begin(), splits.end(), std::back_inserter(subpass.output_chunk_read_info), [&subpass](split_info const &s){ + row_range r = s.rows; + r.skip_rows += subpass.skip_rows; + return r; + }); +} - // sort by row count - rmm::device_uvector c_info_sorted{c_info, _stream}; - thrust::sort( - rmm::exec_policy(_stream), c_info_sorted.begin(), c_info_sorted.end(), row_count_compare{}); +void reader::impl::preprocess_next_pass() +{ + auto const num_passes = _file_itm_data.input_pass_row_group_offsets.size() - 1; - // std::vector h_c_info_sorted(c_info_sorted.size()); - // CUDF_CUDA_TRY(cudaMemcpy(h_c_info_sorted.data(), - // c_info_sorted.data(), - // sizeof(cumulative_row_info) * c_info_sorted.size(), - // cudaMemcpyDefault)); - // print_cumulative_row_info(h_c_info_sorted, "raw"); + // always create the pass struct, even if we end up with no work. + // this will also cause the previous pass information to be deleted + _pass_itm_data = std::make_unique(); - // generate key offsets (offsets to the start of each partition of keys). 
worst case is 1 page per - // key - rmm::device_uvector key_offsets(page_keys.size() + 1, _stream); - auto const key_offsets_end = thrust::reduce_by_key(rmm::exec_policy(_stream), - page_keys.begin(), - page_keys.end(), - thrust::make_constant_iterator(1), - thrust::make_discard_iterator(), - key_offsets.begin()) - .second; - size_t const num_unique_keys = key_offsets_end - key_offsets.begin(); - thrust::exclusive_scan( - rmm::exec_policy(_stream), key_offsets.begin(), key_offsets.end(), key_offsets.begin()); + if (_file_itm_data.global_num_rows > 0 && not _file_itm_data.row_groups.empty() && + not _input_columns.empty() && _file_itm_data._current_input_pass < num_passes) { - // adjust the cumulative info such that for each row count, the size includes any pages that span - // that row count. this is so that if we have this case: - // page row counts - // Column A: 0 <----> 100 <----> 200 - // Column B: 0 <---------------> 200 <--------> 400 - // | - // if we decide to split at row 100, we don't really know the actual amount of bytes in column B - // at that point. So we have to proceed as if we are taking the bytes from all 200 rows of that - // page. - // - rmm::device_uvector aggregated_info(c_info.size(), _stream); - thrust::transform(rmm::exec_policy(_stream), - c_info_sorted.begin(), - c_info_sorted.end(), - aggregated_info.begin(), - row_total_size{c_info.data(), key_offsets.data(), num_unique_keys}); + auto& pass = *_pass_itm_data; - // bring back to the cpu - std::vector h_aggregated_info(aggregated_info.size()); - CUDF_CUDA_TRY(cudaMemcpyAsync(h_aggregated_info.data(), - aggregated_info.data(), - sizeof(cumulative_row_info) * c_info.size(), - cudaMemcpyDefault, - _stream.value())); - _stream.synchronize(); + // setup row groups to be loaded for this pass + auto const row_group_start = _file_itm_data.input_pass_row_group_offsets[_file_itm_data._current_input_pass]; + auto const row_group_end = _file_itm_data.input_pass_row_group_offsets[_file_itm_data._current_input_pass + 1]; + auto const num_row_groups = row_group_end - row_group_start; + pass.row_groups.resize(num_row_groups); + std::copy(_file_itm_data.row_groups.begin() + row_group_start, + _file_itm_data.row_groups.begin() + row_group_end, + pass.row_groups.begin()); - // generate the actual splits - _pass_itm_data->output_chunk_read_info = - find_splits(h_aggregated_info, num_rows, _output_chunk_read_limit); + auto const num_passes = _file_itm_data.input_pass_row_group_offsets.size() - 1; + CUDF_EXPECTS(_file_itm_data._current_input_pass < num_passes, "Encountered an invalid read pass index"); + + auto const chunks_per_rowgroup = _input_columns.size(); + auto const num_chunks = chunks_per_rowgroup * num_row_groups; + + auto chunk_start = _file_itm_data.chunks.begin() + (row_group_start * chunks_per_rowgroup); + auto chunk_end = _file_itm_data.chunks.begin() + (row_group_end * chunks_per_rowgroup); + + pass.chunks = cudf::detail::hostdevice_vector(num_chunks, _stream); + std::copy(chunk_start, chunk_end, pass.chunks.begin()); + + // compute skip_rows / num_rows for this pass. 
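+ // Illustrative example (hypothetical values): with
+ // input_pass_row_count = {0, 1000, 2500}, global_skip_rows = 200 and
+ // global_num_rows = 2000, pass 0 clamps to rows [200, 1000) (num_rows == 800)
+ // and pass 1 clamps to rows [1000, 2200) (num_rows == 1200).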
+ if (num_passes == 1) { + pass.skip_rows = _file_itm_data.global_skip_rows; + pass.num_rows = _file_itm_data.global_num_rows; + } else { + auto const global_start_row = _file_itm_data.global_skip_rows; + auto const global_end_row = global_start_row + _file_itm_data.global_num_rows; + auto const start_row = + std::max(_file_itm_data.input_pass_row_count[_file_itm_data._current_input_pass], global_start_row); + auto const end_row = + std::min(_file_itm_data.input_pass_row_count[_file_itm_data._current_input_pass + 1], global_end_row); + + // skip_rows is always global in the sense that it is relative to the first row of + // everything we will be reading, regardless of what pass we are on. + // num_rows is how many rows we are reading this pass. + pass.skip_rows = + global_start_row + _file_itm_data.input_pass_row_count[_file_itm_data._current_input_pass]; + pass.num_rows = end_row - start_row; + } + + // load page information for the chunk. this retrieves the compressed bytes for all the + // pages, and their headers (which we can access without decompressing) + load_compressed_data(); + + // detect malformed columns. + // - we have seen some cases in the wild where we have a row group containing N + // rows, but the total number of rows in the pages for column X is != N. while it + // is possible to load this by just capping the number of rows read, we cannot tell + // which rows are invalid so we may be returning bad data. in addition, this mismatch + // confuses the chunked reader + detect_malformed_pages(pass.pages, + pass.chunks, + pass.num_rows, + _stream); + + // since there is only ever 1 dictionary per chunk (the 0th path), do it at the + // pass level. + build_string_dict_indices(); + + // compute offsets to each group of input pages. this also gives us the number of unique + // columns in the input + // page_keys: 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3 + // + // result: 0, 4, 8 + rmm::device_uvector page_counts(pass.pages.size() + 1, _stream); + auto page_keys = make_page_key_iterator(pass.pages); + auto const page_counts_end = thrust::reduce_by_key(rmm::exec_policy(_stream), + page_keys, + page_keys + pass.pages.size(), + thrust::make_constant_iterator(1), + thrust::make_discard_iterator(), + page_counts.begin()).second; + auto const num_page_counts = page_counts_end - page_counts.begin(); + pass.page_offsets = cudf::detail::hostdevice_vector(num_page_counts + 1, _stream); + thrust::exclusive_scan( + rmm::exec_policy(_stream), page_counts.begin(), page_counts.begin() + num_page_counts + 1, pass.page_offsets.d_begin()); + pass.page_offsets.device_to_host_async(_stream); + + pass.page_processed_counts = std::vector(num_page_counts); + std::fill(pass.page_processed_counts.begin(), pass.page_processed_counts.end(), 0); + + // compute subpasses for this pass using the page information we now have. + // compute_subpasses(); + /* + if (_output_chunk_read_limit == 0) { // read the whole file at once + CUDF_EXPECTS(_pass_itm_data->output_chunk_read_info.size() == 1, + "Reading the whole file should yield only one chunk."); + } + */ + + _stream.synchronize(); + } +} + +void reader::impl::handle_chunking(bool uses_custom_row_bounds) +{ + // if this is our first time in here, setup the first pass. + if(!_pass_itm_data){ + // preprocess the next pass + preprocess_next_pass(); + } + + auto& pass = *_pass_itm_data; + + // if we already have a subpass in flight. 
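+ // The ratchet advances three nested counters: output chunks within a subpass,
+ // subpasses within a pass, and passes within the file. Each call moves the
+ // smallest counter that still has work remaining.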
+ if(pass.subpass != nullptr){ + // if it still has more chunks in flight, there's nothing more to do + if(pass.subpass->current_output_chunk < pass.subpass->output_chunk_read_info.size()){ + return; + } + + // release the old subpass (will free memory) + pass.subpass.reset(); + + // otherwise we are done with the pass entirely + if(pass.processed_rows == pass.num_rows){ + // release the old pass + _pass_itm_data.reset(); + + _file_itm_data._current_input_pass++; + auto const num_passes = _file_itm_data.input_pass_row_group_offsets.size() - 1; + // no more passes. we are absolutely done with this file. + if(_file_itm_data._current_input_pass == num_passes){ + return; + } + + // preprocess the next pass + preprocess_next_pass(); + } + } + + // next pass + pass.subpass = std::make_unique(); + auto& subpass = *pass.subpass; + + auto const num_columns = pass.page_offsets.size() - 1; + + auto [page_indices, total_pages] = [&]() -> std::pair, size_t> { + // special case: if we contain no compressed data, or if we have no input limit, we can always just do 1 subpass since + // what we already have loaded is all the temporary memory we will ever use. + if(!pass.has_compressed_data || _input_pass_read_limit == 0){ + std::vector page_indices; + page_indices.reserve(num_columns); + auto iter = thrust::make_counting_iterator(0); + std::transform(iter, iter + num_columns, std::back_inserter(page_indices), [&](size_t i) -> page_span { + return {static_cast(pass.page_offsets[i]), static_cast(pass.page_offsets[i+1])}; + }); + return {page_indices, pass.pages.size()}; + } + // otherwise we have to look forward and choose a batch of pages + + // generate cumulative page sizes. + rmm::device_uvector c_info(pass.pages.size(), _stream); + auto page_keys = make_page_key_iterator(pass.pages); + auto page_size = thrust::make_transform_iterator(pass.pages.d_begin(), get_page_size{pass.chunks}); + thrust::inclusive_scan_by_key(rmm::exec_policy(_stream), + page_keys, + page_keys + pass.pages.size(), + page_size, + c_info.begin(), + thrust::equal_to{}, + cumulative_page_sum{}); + // print_cumulative_page_info(pass.pages, c_info, _stream); + + // get the next batch of pages + return compute_next_subpass(c_info, pass.pages, pass.page_offsets, pass.processed_rows, _input_pass_read_limit, num_columns, _stream); + }(); + + // fill out the subpass struct + subpass.pages = cudf::detail::hostdevice_vector(0, total_pages, _stream); + subpass.page_src_index = cudf::detail::hostdevice_vector(total_pages, total_pages, _stream); + // copy the appropriate subset of pages from each column + size_t page_count = 0; + for(size_t c_idx=0; c_idx row_groups{}; - // all chunks from the selected row groups. We may end up reading these chunks progressively - // instead of all at once + // all chunks from the selected row groups. std::vector chunks{}; // an array of offsets into _file_itm_data::global_chunks. Each pair of offsets represents // the start/end of the chunks to be loaded for a given pass. std::vector input_pass_row_group_offsets{}; + // row counts per input-pass std::vector input_pass_row_count{}; + std::size_t _current_input_pass{0}; // current input pass index + std::size_t _output_chunk_count{0}; // how many output chunks we have produced + // skip_rows/num_rows values for the entire file. 
 /**
- * @brief Struct to identify the range for each chunk of rows during a chunked reading pass.
+ * @brief Struct to identify a range of rows.
  */
-struct chunk_read_info {
+struct row_range {
+  size_t skip_rows;
+  size_t num_rows;
+};
+
+/**
+ * @brief Passes are broken down into subpasses based on temporary memory constraints.
+ */
+struct subpass_intermediate_data {
+  rmm::device_buffer decomp_page_data;
+
+  rmm::device_buffer level_decode_data{};
+  cudf::detail::hostdevice_vector<PageInfo> pages{};
+  // for each page in the subpass, the index of our source page in the pass
+  cudf::detail::hostdevice_vector<size_t> page_src_index{};
+  // for each chunk in the subpass, the number of associated pages for this
+  // subpass
+  std::vector<size_t> chunk_page_count;
+  cudf::detail::hostdevice_vector<PageNestingInfo> page_nesting_info{};
+  cudf::detail::hostdevice_vector<PageNestingDecodeInfo> page_nesting_decode_info{};
+
+  std::vector<row_range> output_chunk_read_info;
+  std::size_t current_output_chunk{0};
+
+  // skip_rows and num_rows values for this particular subpass.
   size_t skip_rows;
   size_t num_rows;
 };
 
 /**
  * @brief Struct to store pass-level data that remains constant for a single pass.
+ *
+ * A pass is defined as a subset of row groups read out of the globally requested set of all
+ * row groups.
  */
 struct pass_intermediate_data {
   std::vector<std::unique_ptr<datasource::buffer>> raw_page_data;
-  rmm::device_buffer decomp_page_data;
 
   // rowgroup, chunk and page information for the current pass.
+  bool has_compressed_data{false};
   std::vector<row_group_info> row_groups{};
   cudf::detail::hostdevice_vector<ColumnChunkDesc> chunks{};
-  cudf::detail::hostdevice_vector<PageInfo> pages_info{};
-  cudf::detail::hostdevice_vector<PageNestingInfo> page_nesting_info{};
-  cudf::detail::hostdevice_vector<PageNestingDecodeInfo> page_nesting_decode_info{};
-
-  rmm::device_uvector<int32_t> page_keys{0, rmm::cuda_stream_default};
-  rmm::device_uvector<int32_t> page_index{0, rmm::cuda_stream_default};
+  cudf::detail::hostdevice_vector<PageInfo> pages{};
+
+  // offsets to each group of input pages (by column/schema)
+  // so if we had 2 columns/schemas, with page keys
+  //
+  // 1 1 1 1 1 2 2 2
+  //
+  // page_offsets would be 0, 5, 8
+  cudf::detail::hostdevice_vector<size_type> page_offsets{};
+  // for each group of input pages (by column, schema), the count
+  // of how many pages we have processed so far
+  std::vector<size_t> page_processed_counts{};
 
   rmm::device_uvector<string_index_pair> str_dict_index{0, rmm::cuda_stream_default};
 
-  std::vector<chunk_read_info> output_chunk_read_info;
-  std::size_t current_output_chunk{0};
-
-  rmm::device_buffer level_decode_data{};
   int level_type_size{0};
 
-  // skip_rows and num_rows values for this particular pass. these may be adjusted values from the
-  // global values stored in file_intermediate_data.
+  // skip_rows / num_rows for this pass.
+  // NOTE: skip_rows is the absolute row index in the file.
   size_t skip_rows;
-  size_t num_rows;
+  size_t num_rows;
+  // number of rows we have processed so far (out of num_rows)
+  size_t processed_rows{0};
+
+  // currently active subpass
+  std::unique_ptr<subpass_intermediate_data> subpass{};
 };
 
 }  // namespace cudf::io::parquet::detail
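A toy model of the row accounting these structs imply may help: pass.skip_rows is absolute in the file, processed_rows advances as subpasses are retired, and each subpass derives its own absolute skip_rows from the two. All row counts here are invented:

#include <algorithm>
#include <cstddef>
#include <cstdio>

int main()
{
  std::size_t pass_skip_rows = 10'000;  // absolute file row where this pass starts
  std::size_t pass_num_rows  = 6'000;   // rows owned by this pass
  std::size_t processed_rows = 0;       // rows already handed out by earlier subpasses

  while (processed_rows < pass_num_rows) {
    std::size_t subpass_rows = std::min<std::size_t>(2'500, pass_num_rows - processed_rows);
    std::size_t subpass_skip = pass_skip_rows + processed_rows;  // absolute start of subpass
    printf("subpass: skip_rows=%zu num_rows=%zu\n", subpass_skip, subpass_rows);
    processed_rows += subpass_rows;
  }
  return 0;
}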
diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu
index ce45f709ee1..cc336792b48 100644
--- a/cpp/src/io/parquet/reader_impl_preprocess.cu
+++ b/cpp/src/io/parquet/reader_impl_preprocess.cu
@@ -16,9 +16,6 @@
 
 #include "reader_impl.hpp"
 
-#include
-#include
-
 #include
 #include
 #include
@@ -45,6 +42,29 @@ namespace cudf::io::parquet::detail {
 
 namespace {
 
+#if defined(PREPROCESS_DEBUG)
+void print_pages(cudf::detail::hostdevice_vector<PageInfo>& pages, rmm::cuda_stream_view _stream)
+{
+  pages.device_to_host_sync(_stream);
+  for (size_t idx = 0; idx < pages.size(); idx++) {
+    auto const& p = pages[idx];
+    // skip dictionary pages
+    if (p.flags & PAGEINFO_FLAGS_DICTIONARY) { continue; }
+    printf(
+      "P(%lu, s:%d): chunk_row(%d), num_rows(%d), skipped_values(%d), skipped_leaf_values(%d), "
+      "str_bytes(%d)\n",
+      idx,
+      p.src_col_schema,
+      p.chunk_row,
+      p.num_rows,
+      p.skipped_values,
+      p.skipped_leaf_values,
+      p.str_bytes);
+  }
+}
+#endif  // PREPROCESS_DEBUG
+
 /**
  * @brief Generate depth remappings for repetition and definition levels.
  *
@@ -264,7 +284,7 @@
   size_t total_pages = 0;
 
   chunks.host_to_device_async(stream);
-  DecodePageHeaders(chunks.device_ptr(), chunks.size(), stream);
+  DecodePageHeaders(chunks.device_ptr(), nullptr, chunks.size(), stream);
   chunks.device_to_host_sync(stream);
 
   for (size_t c = 0; c < chunks.size(); c++) {
@@ -299,20 +319,23 @@
 int decode_page_headers(cudf::detail::hostdevice_vector<ColumnChunkDesc>& chunks,
                         cudf::detail::hostdevice_vector<PageInfo>& pages,
                         rmm::cuda_stream_view stream)
 {
+  cudf::detail::hostdevice_vector<chunk_page_info> chunk_page_info(chunks.size(), stream);
+
   // IMPORTANT : if you change how pages are stored within a chunk (dict pages, then data pages),
   // please update preprocess_nested_columns to reflect this.
   for (size_t c = 0, page_count = 0; c < chunks.size(); c++) {
     chunks[c].max_num_pages = chunks[c].num_data_pages + chunks[c].num_dict_pages;
-    chunks[c].page_info     = pages.device_ptr(page_count);
+    chunk_page_info[c].pages = pages.device_ptr(page_count);
     page_count += chunks[c].max_num_pages;
   }
 
   chunks.host_to_device_async(stream);
+  chunk_page_info.host_to_device_async(stream);
 
-  DecodePageHeaders(chunks.device_ptr(), chunks.size(), stream);
+  DecodePageHeaders(chunks.device_ptr(), chunk_page_info.device_ptr(), chunks.size(), stream);
 
   // compute max bytes needed for level data
   auto level_bit_size =
-    cudf::detail::make_counting_transform_iterator(0, [chunks = chunks.begin()] __device__(int i) {
+    cudf::detail::make_counting_transform_iterator(0, [chunks = chunks.d_begin()] __device__(int i) {
      auto c = chunks[i];
      return static_cast<uint8_t>(
        max(c.level_bits[level_type::REPETITION], c.level_bits[level_type::DEFINITION]));
@@ -325,6 +348,41 @@
                                           thrust::maximum());
   auto const level_type_size = std::max(1, cudf::util::div_rounding_up_safe(max_level_bits, 8));
 
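To make the level_type_size computation concrete: it is simply the widest definition/repetition level across all chunks, rounded up to whole bytes (1 or 2 in practice). A host-side sketch with invented bit widths:

#include <algorithm>
#include <cstdint>
#include <cstdio>

int main()
{
  uint8_t level_bits[]  = {3, 5, 9};  // max(def, rep) bits per chunk, hypothetical values
  int max_level_bits    = *std::max_element(std::begin(level_bits), std::end(level_bits));
  int level_type_size   = std::max(1, (max_level_bits + 7) / 8);  // div_rounding_up_safe
  printf("%d\n", level_type_size);  // 9 bits -> 2 bytes
  return 0;
}

+  // sort the pages in schema order.
+  //
+  // ordering of pages is by input column schema, repeated across row groups. so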
+  // if we had 3 columns, each with 2 pages, and 1 row group, our schema values might look like
+  //
+  // 1, 1, 2, 2, 3, 3
+  //
+  // However, if we had more than one row group, the pattern would be
+  //
+  // 1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3
+  // ^ row group 0   |
+  //                 ^ row group 1
+  //
+  // To process pages by key (exclusive_scan_by_key, reduce_by_key, etc), the ordering we actually
+  // want is
+  //
+  // 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3
+  //
+  // We also need to preserve key-relative page ordering, so we need to use a stable sort.
+  {
+    rmm::device_uvector<int32_t> page_keys{pages.size(), stream};
+    thrust::transform(rmm::exec_policy(stream),
+                      pages.d_begin(),
+                      pages.d_begin() + pages.size(),
+                      page_keys.begin(),
+                      [] __device__(PageInfo const& page) { return page.src_col_schema; });
+    thrust::stable_sort_by_key(rmm::exec_policy(stream),
+                               page_keys.begin(),
+                               page_keys.end(),
+                               pages.d_begin(),
+                               thrust::less());
+  }
+
   pages.device_to_host_sync(stream);
 
   // validate page encodings
@@ -336,216 +394,74 @@
   return level_type_size;
 }
 
-/**
- * @brief Decompresses the page data, at page granularity.
- *
- * @param chunks List of column chunk descriptors
- * @param pages List of page information
- * @param stream CUDA stream used for device memory operations and kernel launches
- *
- * @return Device buffer to decompressed page data
- */
-[[nodiscard]] rmm::device_buffer decompress_page_data(
-  cudf::detail::hostdevice_vector<ColumnChunkDesc>& chunks,
-  cudf::detail::hostdevice_vector<PageInfo>& pages,
-  rmm::cuda_stream_view stream)
-{
-  auto for_each_codec_page = [&](Compression codec, std::function<void(size_t)> const& f) {
-    for (size_t c = 0, page_count = 0; c < chunks.size(); c++) {
-      const auto page_stride = chunks[c].max_num_pages;
-      if (chunks[c].codec == codec) {
-        for (int k = 0; k < page_stride; k++) {
-          f(page_count + k);
-        }
-      }
-      page_count += page_stride;
-    }
-  };
-
-  // Brotli scratch memory for decompressing
-  rmm::device_buffer debrotli_scratch;
+}  // namespace
 
-  // Count the exact number of compressed pages
-  size_t num_comp_pages    = 0;
-  size_t total_decomp_size = 0;
+void reader::impl::build_string_dict_indices()
+{
+  auto& pass = *_pass_itm_data;
 
-  struct codec_stats {
-    Compression compression_type  = UNCOMPRESSED;
-    size_t num_pages              = 0;
-    int32_t max_decompressed_size = 0;
-    size_t total_decomp_size      = 0;
+  auto is_dict_chunk = [](ColumnChunkDesc const& chunk) {
+    return (chunk.data_type & 0x7) == BYTE_ARRAY && chunk.num_dict_pages > 0;
   };
-  std::array codecs{codec_stats{GZIP}, codec_stats{SNAPPY}, codec_stats{BROTLI}, codec_stats{ZSTD}};
-
-  auto is_codec_supported = [&codecs](int8_t codec) {
-    if (codec == UNCOMPRESSED) return true;
-    return std::find_if(codecs.begin(), codecs.end(), [codec](auto& cstats) {
-             return codec == cstats.compression_type;
-           }) != codecs.end();
-  };
-  CUDF_EXPECTS(std::all_of(chunks.begin(),
-                           chunks.end(),
-                           [&is_codec_supported](auto const& chunk) {
-                             return is_codec_supported(chunk.codec);
-                           }),
-               "Unsupported compression type");
-
-  for (auto& codec : codecs) {
-    for_each_codec_page(codec.compression_type, [&](size_t page) {
-      auto page_uncomp_size = pages[page].uncompressed_page_size;
-      total_decomp_size += page_uncomp_size;
-      codec.total_decomp_size += page_uncomp_size;
-      codec.max_decompressed_size = std::max(codec.max_decompressed_size, page_uncomp_size);
-      codec.num_pages++;
-      num_comp_pages++;
-    });
-    if (codec.compression_type == BROTLI && codec.num_pages > 0) {
debrotli_scratch.resize(get_gpu_debrotli_scratch_size(codec.num_pages), stream); + // Count the number of string dictionary entries + // NOTE: Assumes first page in the chunk is always the dictionary page + size_t total_str_dict_indexes = 0; + for (size_t c = 0, page_count = 0; c < pass.chunks.size(); c++) { + if (is_dict_chunk(pass.chunks[c])) { + total_str_dict_indexes += pass.pages[page_count].num_input_values; } + page_count += pass.chunks[c].max_num_pages; } - // Dispatch batches of pages to decompress for each codec. - // Buffer needs to be padded, required by `gpuDecodePageData`. - rmm::device_buffer decomp_pages( - cudf::util::round_up_safe(total_decomp_size, BUFFER_PADDING_MULTIPLE), stream); - - std::vector> comp_in; - comp_in.reserve(num_comp_pages); - std::vector> comp_out; - comp_out.reserve(num_comp_pages); - - // vectors to save v2 def and rep level data, if any - std::vector> copy_in; - copy_in.reserve(num_comp_pages); - std::vector> copy_out; - copy_out.reserve(num_comp_pages); - - rmm::device_uvector comp_res(num_comp_pages, stream); - thrust::fill(rmm::exec_policy(stream), - comp_res.begin(), - comp_res.end(), - compression_result{0, compression_status::FAILURE}); - - size_t decomp_offset = 0; - int32_t start_pos = 0; - for (auto const& codec : codecs) { - if (codec.num_pages == 0) { continue; } - - for_each_codec_page(codec.compression_type, [&](size_t page_idx) { - auto const dst_base = static_cast(decomp_pages.data()) + decomp_offset; - auto& page = pages[page_idx]; - // offset will only be non-zero for V2 pages - auto const offset = - page.lvl_bytes[level_type::DEFINITION] + page.lvl_bytes[level_type::REPETITION]; - // for V2 need to copy def and rep level info into place, and then offset the - // input and output buffers. otherwise we'd have to keep both the compressed - // and decompressed data. 
- if (offset != 0) { - copy_in.emplace_back(page.page_data, offset); - copy_out.emplace_back(dst_base, offset); - } - comp_in.emplace_back(page.page_data + offset, - static_cast(page.compressed_page_size - offset)); - comp_out.emplace_back(dst_base + offset, - static_cast(page.uncompressed_page_size - offset)); - page.page_data = dst_base; - decomp_offset += page.uncompressed_page_size; - }); - - host_span const> comp_in_view{comp_in.data() + start_pos, - codec.num_pages}; - auto const d_comp_in = cudf::detail::make_device_uvector_async( - comp_in_view, stream, rmm::mr::get_current_device_resource()); - host_span const> comp_out_view(comp_out.data() + start_pos, - codec.num_pages); - auto const d_comp_out = cudf::detail::make_device_uvector_async( - comp_out_view, stream, rmm::mr::get_current_device_resource()); - device_span d_comp_res_view(comp_res.data() + start_pos, codec.num_pages); - - switch (codec.compression_type) { - case GZIP: - gpuinflate(d_comp_in, d_comp_out, d_comp_res_view, gzip_header_included::YES, stream); - break; - case SNAPPY: - if (cudf::io::detail::nvcomp_integration::is_stable_enabled()) { - nvcomp::batched_decompress(nvcomp::compression_type::SNAPPY, - d_comp_in, - d_comp_out, - d_comp_res_view, - codec.max_decompressed_size, - codec.total_decomp_size, - stream); - } else { - gpu_unsnap(d_comp_in, d_comp_out, d_comp_res_view, stream); - } - break; - case ZSTD: - nvcomp::batched_decompress(nvcomp::compression_type::ZSTD, - d_comp_in, - d_comp_out, - d_comp_res_view, - codec.max_decompressed_size, - codec.total_decomp_size, - stream); - break; - case BROTLI: - gpu_debrotli(d_comp_in, - d_comp_out, - d_comp_res_view, - debrotli_scratch.data(), - debrotli_scratch.size(), - stream); - break; - default: CUDF_FAIL("Unexpected decompression dispatch"); break; + // Build index for string dictionaries since they can't be indexed + // directly due to variable-sized elements + pass.str_dict_index = + cudf::detail::make_zeroed_device_uvector_async( + total_str_dict_indexes, _stream, rmm::mr::get_current_device_resource()); + + // Update chunks with pointers to string dict indices + for (size_t c = 0, page_count = 0, str_ofs = 0; c < pass.chunks.size(); c++) { + input_column_info const& input_col = _input_columns[pass.chunks[c].src_col_index]; + CUDF_EXPECTS(input_col.schema_idx == pass.chunks[c].src_col_schema, + "Column/page schema index mismatch"); + if (is_dict_chunk(pass.chunks[c])) { + pass.chunks[c].str_dict_index = pass.str_dict_index.data() + str_ofs; + str_ofs += pass.pages[page_count].num_input_values; } - start_pos += codec.num_pages; - } - CUDF_EXPECTS(thrust::all_of(rmm::exec_policy(stream), - comp_res.begin(), - comp_res.end(), - [] __device__(auto const& res) { - return res.status == compression_status::SUCCESS; - }), - "Error during decompression"); - - // now copy the uncompressed V2 def and rep level data - if (not copy_in.empty()) { - auto const d_copy_in = cudf::detail::make_device_uvector_async( - copy_in, stream, rmm::mr::get_current_device_resource()); - auto const d_copy_out = cudf::detail::make_device_uvector_async( - copy_out, stream, rmm::mr::get_current_device_resource()); - - gpu_copy_uncompressed_blocks(d_copy_in, d_copy_out, stream); - stream.synchronize(); + // column_data_base will always point to leaf data, even for nested types. 
+ page_count += pass.chunks[c].max_num_pages; } - // Update the page information in device memory with the updated value of - // page_data; it now points to the uncompressed data buffer - pages.host_to_device_async(stream); - - return decomp_pages; + if (total_str_dict_indexes > 0) { + pass.chunks.host_to_device_async(_stream); + BuildStringDictionaryIndex(pass.chunks.device_ptr(), pass.chunks.size(), _stream); + } } -} // namespace - void reader::impl::allocate_nesting_info() { - auto const& chunks = _pass_itm_data->chunks; - auto& pages = _pass_itm_data->pages_info; - auto& page_nesting_info = _pass_itm_data->page_nesting_info; - auto& page_nesting_decode_info = _pass_itm_data->page_nesting_decode_info; + auto& pass = *_pass_itm_data; + auto& subpass = *pass.subpass; + + auto const& chunks = pass.chunks; + auto& pages = subpass.pages; + auto& page_nesting_info = subpass.page_nesting_info; + auto& page_nesting_decode_info = subpass.page_nesting_decode_info; // compute total # of page_nesting infos needed and allocate space. doing this in one // buffer to keep it to a single gpu allocation + auto counting_iter = thrust::make_counting_iterator(size_t{0}); size_t const total_page_nesting_infos = std::accumulate( - chunks.host_ptr(), chunks.host_ptr() + chunks.size(), 0, [&](int total, auto& chunk) { + counting_iter, counting_iter + chunks.size(), 0, [&](int total, size_t index) { + auto const& chunk = chunks[index]; + // the schema of the input column auto const& schema = _metadata->get_schema(chunk.src_col_schema); auto const per_page_nesting_info_size = max( schema.max_definition_level + 1, _metadata->get_output_nesting_depth(chunk.src_col_schema)); - return total + (per_page_nesting_info_size * chunk.num_data_pages); + return total + (per_page_nesting_info_size * subpass.chunk_page_count[index]); }); page_nesting_info = @@ -563,8 +479,9 @@ void reader::impl::allocate_nesting_info() schema.max_definition_level + 1, _metadata->get_output_nesting_depth(src_col_schema)); // skip my dict pages + CUDF_EXPECTS(chunks[idx].num_dict_pages <= 1, "Unexpected dictionary page count for chunk"); target_page_index += chunks[idx].num_dict_pages; - for (int p_idx = 0; p_idx < chunks[idx].num_data_pages; p_idx++) { + for (size_t p_idx = 0; p_idx < subpass.chunk_page_count[idx]; p_idx++) { pages[target_page_index + p_idx].nesting = page_nesting_info.device_ptr() + src_info_index; pages[target_page_index + p_idx].nesting_decode = page_nesting_decode_info.device_ptr() + src_info_index; @@ -575,7 +492,7 @@ void reader::impl::allocate_nesting_info() src_info_index += per_page_nesting_info_size; } - target_page_index += chunks[idx].num_data_pages; + target_page_index += subpass.chunk_page_count[idx]; } // fill in @@ -607,7 +524,7 @@ void reader::impl::allocate_nesting_info() // we can ignore them for the purposes of output nesting info if (!cur_schema.is_stub()) { // initialize each page within the chunk - for (int p_idx = 0; p_idx < chunks[idx].num_data_pages; p_idx++) { + for (size_t p_idx = 0; p_idx < subpass.chunk_page_count[idx]; p_idx++) { PageNestingInfo* pni = &page_nesting_info[nesting_info_index + (p_idx * per_page_nesting_info_size)]; @@ -647,7 +564,7 @@ void reader::impl::allocate_nesting_info() cur_schema = _metadata->get_schema(schema_idx); } - nesting_info_index += (per_page_nesting_info_size * chunks[idx].num_data_pages); + nesting_info_index += (per_page_nesting_info_size * subpass.chunk_page_count[idx]); } // copy nesting info to the device @@ -657,32 +574,34 @@ void 
reader::impl::allocate_nesting_info() void reader::impl::allocate_level_decode_space() { - auto& pages = _pass_itm_data->pages_info; + auto& pass = *_pass_itm_data; + auto& subpass = *pass.subpass; + + auto& pages = subpass.pages; // TODO: this could be made smaller if we ignored dictionary pages and pages with no // repetition data. size_t const per_page_decode_buf_size = - LEVEL_DECODE_BUF_SIZE * 2 * _pass_itm_data->level_type_size; + LEVEL_DECODE_BUF_SIZE * 2 * pass.level_type_size; auto const decode_buf_size = per_page_decode_buf_size * pages.size(); - _pass_itm_data->level_decode_data = + subpass.level_decode_data = rmm::device_buffer(decode_buf_size, _stream, rmm::mr::get_current_device_resource()); // distribute the buffers - uint8_t* buf = static_cast(_pass_itm_data->level_decode_data.data()); + uint8_t* buf = static_cast(subpass.level_decode_data.data()); for (size_t idx = 0; idx < pages.size(); idx++) { auto& p = pages[idx]; p.lvl_decode_buf[level_type::DEFINITION] = buf; - buf += (LEVEL_DECODE_BUF_SIZE * _pass_itm_data->level_type_size); + buf += (LEVEL_DECODE_BUF_SIZE * pass.level_type_size); p.lvl_decode_buf[level_type::REPETITION] = buf; - buf += (LEVEL_DECODE_BUF_SIZE * _pass_itm_data->level_type_size); + buf += (LEVEL_DECODE_BUF_SIZE * pass.level_type_size); } } -std::pair>> reader::impl::read_and_decompress_column_chunks() +std::pair>> reader::impl::read_column_chunks() { auto const& row_groups_info = _pass_itm_data->row_groups; - auto const num_rows = _pass_itm_data->num_rows; auto& raw_page_data = _pass_itm_data->raw_page_data; auto& chunks = _pass_itm_data->chunks; @@ -702,13 +621,14 @@ std::pair>> reader::impl::read_and_decompres // Initialize column chunk information size_t total_decompressed_size = 0; - auto remaining_rows = num_rows; + // TODO: make this respect the pass-wide skip_rows/num_rows instead of the file-wide skip_rows/num_rows + //auto remaining_rows = num_rows; std::vector> read_chunk_tasks; size_type chunk_count = 0; for (auto const& rg : row_groups_info) { auto const& row_group = _metadata->get_row_group(rg.index, rg.source_index); auto const row_group_source = rg.source_index; - auto const row_group_rows = std::min(remaining_rows, row_group.num_rows); + // auto const row_group_rows = std::min(remaining_rows, row_group.num_rows); // generate ColumnChunkDesc objects for everything to be decoded (all input columns) for (size_t i = 0; i < num_input_columns; ++i) { @@ -730,7 +650,7 @@ std::pair>> reader::impl::read_and_decompres chunk_count++; } - remaining_rows -= row_group_rows; + //remaining_rows -= row_group_rows; } // Read compressed chunk data to device memory @@ -743,22 +663,23 @@ std::pair>> reader::impl::read_and_decompres chunk_source_map, _stream)); - CUDF_EXPECTS(remaining_rows == 0, "All rows data must be read."); + //CUDF_EXPECTS(remaining_rows == 0, "All rows data must be read."); return {total_decompressed_size > 0, std::move(read_chunk_tasks)}; } -void reader::impl::load_and_decompress_data() +void reader::impl::load_compressed_data() { - // This function should never be called if `num_rows == 0`. - CUDF_EXPECTS(_pass_itm_data->num_rows > 0, "Number of reading rows must not be zero."); + auto& pass = *_pass_itm_data; - auto& raw_page_data = _pass_itm_data->raw_page_data; - auto& decomp_page_data = _pass_itm_data->decomp_page_data; - auto& chunks = _pass_itm_data->chunks; - auto& pages = _pass_itm_data->pages_info; + // This function should never be called if `num_rows == 0`. 
+ // CUDF_EXPECTS(_pass_itm_data->num_rows > 0, "Number of reading rows must not be zero."); + + auto& chunks = pass.chunks; + auto& pages = pass.pages; - auto const [has_compressed_data, read_chunks_tasks] = read_and_decompress_column_chunks(); + auto const [has_compressed_data, read_chunks_tasks] = read_column_chunks(); + pass.has_compressed_data = has_compressed_data; for (auto& task : read_chunks_tasks) { task.wait(); @@ -770,41 +691,7 @@ void reader::impl::load_and_decompress_data() pages = cudf::detail::hostdevice_vector(total_pages, total_pages, _stream); // decoding of column/page information - _pass_itm_data->level_type_size = decode_page_headers(chunks, pages, _stream); - if (has_compressed_data) { - decomp_page_data = decompress_page_data(chunks, pages, _stream); - // Free compressed data - for (size_t c = 0; c < chunks.size(); c++) { - if (chunks[c].codec != Compression::UNCOMPRESSED) { raw_page_data[c].reset(); } - } - } - - // build output column info - // walk the schema, building out_buffers that mirror what our final cudf columns will look - // like. important : there is not necessarily a 1:1 mapping between input columns and output - // columns. For example, parquet does not explicitly store a ColumnChunkDesc for struct - // columns. The "structiness" is simply implied by the schema. For example, this schema: - // required group field_id=1 name { - // required binary field_id=2 firstname (String); - // required binary field_id=3 middlename (String); - // required binary field_id=4 lastname (String); - // } - // will only contain 3 columns of data (firstname, middlename, lastname). But of course - // "name" is a struct column that we want to return, so we have to make sure that we - // create it ourselves. - // std::vector output_info = build_output_column_info(); - - // the following two allocate functions modify the page data - pages.device_to_host_sync(_stream); - { - // nesting information (sizes, etc) stored -per page- - // note : even for flat schemas, we allocate 1 level of "nesting" info - allocate_nesting_info(); - - // level decode space - allocate_level_decode_space(); - } - pages.host_to_device_async(_stream); + pass.level_type_size = decode_page_headers(chunks, pages, _stream); } namespace { @@ -815,28 +702,6 @@ struct cumulative_row_info { int key; // schema index }; -#if defined(PREPROCESS_DEBUG) -void print_pages(cudf::detail::hostdevice_vector& pages, rmm::cuda_stream_view _stream) -{ - pages.device_to_host_sync(_stream); - for (size_t idx = 0; idx < pages.size(); idx++) { - auto const& p = pages[idx]; - // skip dictionary pages - if (p.flags & PAGEINFO_FLAGS_DICTIONARY) { continue; } - printf( - "P(%lu, s:%d): chunk_row(%d), num_rows(%d), skipped_values(%d), skipped_leaf_values(%d), " - "str_bytes(%d)\n", - idx, - p.src_col_schema, - p.chunk_row, - p.num_rows, - p.skipped_values, - p.skipped_leaf_values, - p.str_bytes); - } -} -#endif // PREPROCESS_DEBUG - struct get_page_chunk_idx { __device__ size_type operator()(PageInfo const& page) { return page.chunk_idx; } }; @@ -845,14 +710,6 @@ struct get_page_num_rows { __device__ size_type operator()(PageInfo const& page) { return page.num_rows; } }; -struct get_page_column_index { - ColumnChunkDesc const* chunks; - __device__ size_type operator()(PageInfo const& page) - { - return chunks[page.chunk_idx].src_col_index; - } -}; - struct input_col_info { int const schema_idx; size_type const nesting_depth; @@ -885,13 +742,12 @@ struct get_page_nesting_size { size_type const max_depth; size_t const num_pages; 
PageInfo const* const pages; - int const* page_indices; __device__ size_type operator()(size_t index) const { auto const indices = reduction_indices{index, max_depth, num_pages}; - auto const& page = pages[page_indices[indices.page_idx]]; + auto const& page = pages[indices.page_idx]; if (page.src_col_schema != input_cols[indices.col_idx].schema_idx || page.flags & PAGEINFO_FLAGS_DICTIONARY || indices.depth_idx >= input_cols[indices.col_idx].nesting_depth) { @@ -934,7 +790,6 @@ struct chunk_row_output_iter { */ struct start_offset_output_iterator { PageInfo const* pages; - int const* page_indices; size_t cur_index; input_col_info const* input_cols; size_type max_depth; @@ -949,7 +804,6 @@ struct start_offset_output_iterator { constexpr void operator=(start_offset_output_iterator const& other) { pages = other.pages; - page_indices = other.page_indices; cur_index = other.cur_index; input_cols = other.input_cols; max_depth = other.max_depth; @@ -959,7 +813,7 @@ struct start_offset_output_iterator { constexpr start_offset_output_iterator operator+(size_t i) { return start_offset_output_iterator{ - pages, page_indices, cur_index + i, input_cols, max_depth, num_pages}; + pages, cur_index + i, input_cols, max_depth, num_pages}; } constexpr void operator++() { cur_index++; } @@ -972,7 +826,7 @@ struct start_offset_output_iterator { { auto const indices = reduction_indices{index, max_depth, num_pages}; - PageInfo const& p = pages[page_indices[indices.page_idx]]; + PageInfo const& p = pages[indices.page_idx]; if (p.src_col_schema != input_cols[indices.col_idx].schema_idx || p.flags & PAGEINFO_FLAGS_DICTIONARY || indices.depth_idx >= input_cols[indices.col_idx].nesting_depth) { @@ -982,114 +836,20 @@ struct start_offset_output_iterator { } }; -struct flat_column_num_rows { - PageInfo const* pages; +struct page_to_string_size { ColumnChunkDesc const* chunks; - __device__ size_type operator()(size_type pindex) const + __device__ size_t operator()(PageInfo const& page) const { - PageInfo const& page = pages[pindex]; - // ignore dictionary pages and pages belonging to any column containing repetition (lists) - if ((page.flags & PAGEINFO_FLAGS_DICTIONARY) || - (chunks[page.chunk_idx].max_level[level_type::REPETITION] > 0)) { - return 0; - } - return page.num_rows; - } -}; - -struct row_counts_nonzero { - __device__ bool operator()(size_type count) const { return count > 0; } -}; - -struct row_counts_different { - size_type const expected; - __device__ bool operator()(size_type count) const { return (count != 0) && (count != expected); } -}; - -/** - * @brief Detect malformed parquet input data. - * - * We have seen cases where parquet files can be oddly malformed. 
This function specifically - * detects one case in particular: - * - * - When you have a file containing N rows - * - For some reason, the sum total of the number of rows over all pages for a given column - * is != N - * - * @param pages All pages to be decoded - * @param chunks Chunk data - * @param page_keys Keys (schema id) associated with each page, sorted by column - * @param page_index Page indices for iteration, sorted by column - * @param expected_row_count Expected row count, if applicable - * @param stream CUDA stream used for device memory operations and kernel launches - */ -void detect_malformed_pages(cudf::detail::hostdevice_vector& pages, - cudf::detail::hostdevice_vector const& chunks, - device_span page_keys, - device_span page_index, - std::optional expected_row_count, - rmm::cuda_stream_view stream) -{ - // sum row counts for all non-dictionary, non-list columns. other columns will be indicated as 0 - rmm::device_uvector row_counts(pages.size(), - stream); // worst case: num keys == num pages - auto const size_iter = thrust::make_transform_iterator( - page_index.begin(), flat_column_num_rows{pages.device_ptr(), chunks.device_ptr()}); - auto const row_counts_begin = row_counts.begin(); - auto const row_counts_end = thrust::reduce_by_key(rmm::exec_policy(stream), - page_keys.begin(), - page_keys.end(), - size_iter, - thrust::make_discard_iterator(), - row_counts_begin) - .second; - - // make sure all non-zero row counts are the same - rmm::device_uvector compacted_row_counts(pages.size(), stream); - auto const compacted_row_counts_begin = compacted_row_counts.begin(); - auto const compacted_row_counts_end = thrust::copy_if(rmm::exec_policy(stream), - row_counts_begin, - row_counts_end, - compacted_row_counts_begin, - row_counts_nonzero{}); - if (compacted_row_counts_end != compacted_row_counts_begin) { - size_t const found_row_count = static_cast(compacted_row_counts.element(0, stream)); - - // if we somehow don't match the expected row count from the row groups themselves - if (expected_row_count.has_value()) { - CUDF_EXPECTS(expected_row_count.value() == found_row_count, - "Encountered malformed parquet page data (unexpected row count in page data)"); - } - - // all non-zero row counts must be the same - auto const chk = - thrust::count_if(rmm::exec_policy(stream), - compacted_row_counts_begin, - compacted_row_counts_end, - row_counts_different{static_cast(found_row_count)}); - CUDF_EXPECTS(chk == 0, - "Encountered malformed parquet page data (row count mismatch in page data)"); - } -} - -struct page_to_string_size { - PageInfo* pages; - ColumnChunkDesc const* chunks; - - __device__ size_t operator()(size_type page_idx) const - { - auto const page = pages[page_idx]; auto const chunk = chunks[page.chunk_idx]; if (not is_string_col(chunk) || (page.flags & PAGEINFO_FLAGS_DICTIONARY) != 0) { return 0; } - return pages[page_idx].str_bytes; + return page.str_bytes; } }; struct page_offset_output_iter { PageInfo* p; - size_type const* index; using value_type = size_type; using difference_type = size_type; @@ -1099,75 +859,83 @@ struct page_offset_output_iter { __host__ __device__ page_offset_output_iter operator+(int i) { - return page_offset_output_iter{p, index + i}; + return page_offset_output_iter{p + i}; } - __host__ __device__ void operator++() { index++; } + __host__ __device__ void operator++() { p++; } - __device__ reference operator[](int i) { return p[index[i]].str_offset; } - __device__ reference operator*() { return p[*index].str_offset; } + __device__ reference 
operator[](int i) { return p[i].str_offset; }
-  __device__ reference operator*() { return p[*index].str_offset; }
+  __device__ reference operator*() { return p->str_offset; }
 };
 
 }  // anonymous namespace
 
-void reader::impl::preprocess_pages(bool uses_custom_row_bounds, size_t chunk_read_limit)
+void reader::impl::preprocess_file(
+  int64_t skip_rows,
+  std::optional<size_type> const& num_rows,
+  host_span<std::vector<size_type> const> row_group_indices,
+  std::optional<std::reference_wrapper<ast::expression const>> filter)
 {
-  auto const skip_rows = _pass_itm_data->skip_rows;
-  auto const num_rows  = _pass_itm_data->num_rows;
-  auto& chunks         = _pass_itm_data->chunks;
-  auto& pages          = _pass_itm_data->pages_info;
+  CUDF_EXPECTS(!_file_preprocessed, "Attempted to preprocess file more than once");
+
+  // if filter is not empty, then create output types as vector and pass for filtering.
+  std::vector<data_type> output_types;
+  if (filter.has_value()) {
+    std::transform(_output_buffers.cbegin(),
+                   _output_buffers.cend(),
+                   std::back_inserter(output_types),
+                   [](auto const& col) { return col.type; });
+  }
+  std::tie(
+    _file_itm_data.global_skip_rows, _file_itm_data.global_num_rows, _file_itm_data.row_groups) =
+    _metadata->select_row_groups(
+      row_group_indices, skip_rows, num_rows, output_types, filter, _stream);
+
+  if (_file_itm_data.global_num_rows > 0 && not _file_itm_data.row_groups.empty() &&
+      not _input_columns.empty()) {
+    // fills in chunk information without physically loading or decompressing
+    // the associated data
+    create_global_chunk_info();
+
+    // compute schedule of input reads.
+    compute_input_passes();
+  }
 
-  // compute page ordering.
-  //
-  // ordering of pages is by input column schema, repeated across row groups. so
-  // if we had 3 columns, each with 2 pages, and 1 row group, our schema values might look like
-  //
-  // 1, 1, 2, 2, 3, 3
-  //
-  // However, if we had more than one row group, the pattern would be
-  //
-  // 1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3
-  // ^ row group 0   |
-  //                 ^ row group 1
-  //
-  // To process pages by key (exclusive_scan_by_key, reduce_by_key, etc), the ordering we actually
-  // want is
-  //
-  // 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3
-  //
-  // We also need to preserve key-relative page ordering, so we need to use a stable sort.
-  rmm::device_uvector<int32_t> page_keys(pages.size(), _stream);
-  rmm::device_uvector<int32_t> page_index(pages.size(), _stream);
-  {
-    thrust::transform(rmm::exec_policy(_stream),
-                      pages.device_ptr(),
-                      pages.device_ptr() + pages.size(),
-                      page_keys.begin(),
-                      get_page_column_index{chunks.device_ptr()});
-
-    thrust::sequence(rmm::exec_policy(_stream), page_index.begin(), page_index.end());
-    thrust::stable_sort_by_key(rmm::exec_policy(_stream),
-                               page_keys.begin(),
-                               page_keys.end(),
-                               page_index.begin(),
-                               thrust::less());
-  }
+  _file_preprocessed = true;
+}
+
+// update the chunk_row field in the subpass page from the pass page
+struct update_subpass_chunk_row {
+  device_span<PageInfo> pass_pages;
+  device_span<PageInfo> subpass_pages;
+  device_span<size_t> page_src_index;
+
+  __device__ void operator()(size_t i)
+  {
+    subpass_pages[i].chunk_row = pass_pages[page_src_index[i]].chunk_row;
+  }
+};
+
+// update the num_rows field in the pass page from the subpass page
+struct update_pass_num_rows {
+  device_span<PageInfo> pass_pages;
+  device_span<PageInfo> subpass_pages;
+  device_span<size_t> page_src_index;
+
+  __device__ void operator()(size_t i)
+  {
+    pass_pages[page_src_index[i]].num_rows = subpass_pages[i].num_rows;
+  }
+};
+
+void reader::impl::preprocess_subpass_pages(bool uses_custom_row_bounds, size_t chunk_read_limit)
+{
+  auto& pass    = *_pass_itm_data;
+  auto& subpass = *pass.subpass;
 
-  // detect malformed columns.
- // - we have seen some cases in the wild where we have a row group containing N - // rows, but the total number of rows in the pages for column X is != N. while it - // is possible to load this by just capping the number of rows read, we cannot tell - // which rows are invalid so we may be returning bad data. in addition, this mismatch - // confuses the chunked reader - detect_malformed_pages(pages, - chunks, - page_keys, - page_index, - uses_custom_row_bounds ? std::nullopt : std::make_optional(num_rows), - _stream); - - // iterate over all input columns and determine if they contain lists so we can further - // preprocess them. + // iterate over all input columns and determine if they contain lists. + // TODO: we could do this once at the file level instead of every time we get in here. the set of + // columns we are processing does not change over multiple passes/subpasses/output chunks. bool has_lists = false; for (size_t idx = 0; idx < _input_columns.size(); idx++) { auto const& input_col = _input_columns[idx]; @@ -1188,49 +956,9 @@ void reader::impl::preprocess_pages(bool uses_custom_row_bounds, size_t chunk_re if (has_lists) { break; } } - // generate string dict indices if necessary - { - auto is_dict_chunk = [](ColumnChunkDesc const& chunk) { - return (chunk.data_type & 0x7) == BYTE_ARRAY && chunk.num_dict_pages > 0; - }; - - // Count the number of string dictionary entries - // NOTE: Assumes first page in the chunk is always the dictionary page - size_t total_str_dict_indexes = 0; - for (size_t c = 0, page_count = 0; c < chunks.size(); c++) { - if (is_dict_chunk(chunks[c])) { - total_str_dict_indexes += pages[page_count].num_input_values; - } - page_count += chunks[c].max_num_pages; - } - - // Build index for string dictionaries since they can't be indexed - // directly due to variable-sized elements - _pass_itm_data->str_dict_index = - cudf::detail::make_zeroed_device_uvector_async( - total_str_dict_indexes, _stream, rmm::mr::get_current_device_resource()); - - // Update chunks with pointers to string dict indices - for (size_t c = 0, page_count = 0, str_ofs = 0; c < chunks.size(); c++) { - input_column_info const& input_col = _input_columns[chunks[c].src_col_index]; - CUDF_EXPECTS(input_col.schema_idx == chunks[c].src_col_schema, - "Column/page schema index mismatch"); - if (is_dict_chunk(chunks[c])) { - chunks[c].str_dict_index = _pass_itm_data->str_dict_index.data() + str_ofs; - str_ofs += pages[page_count].num_input_values; - } - - // column_data_base will always point to leaf data, even for nested types. - page_count += chunks[c].max_num_pages; - } - - if (total_str_dict_indexes > 0) { - chunks.host_to_device_async(_stream); - BuildStringDictionaryIndex(chunks.device_ptr(), chunks.size(), _stream); - } - } - - // intermediate data we will need for further chunked reads + // in some cases we will need to do further preprocessing of pages. + // - if we have lists, the num_rows field in PageInfo will be incorrect coming out of the file + // - if we are doing a chunked read, we need to compute the size of all string data if (has_lists || chunk_read_limit > 0) { // computes: // PageNestingInfo::num_rows for each page. the true number of rows (taking repetition into @@ -1241,48 +969,74 @@ void reader::impl::preprocess_pages(bool uses_custom_row_bounds, size_t chunk_re // if: // - user has passed custom row bounds // - we will be doing a chunked read - ComputePageSizes(pages, - chunks, + ComputePageSizes(subpass.pages, + pass.chunks, 0, // 0-max size_t. 
process all possible rows
+                    std::numeric_limits<size_t>::max(),
+                    true,                  // compute num_rows
+                    chunk_read_limit > 0,  // compute string sizes
                     _pass_itm_data->level_type_size,
                     _stream);
+  }
 
-    // computes:
-    // PageInfo::chunk_row (the absolute start row index) for all pages
-    // Note: this is doing some redundant work for pages in flat hierarchies. chunk_row has already
-    // been computed during header decoding. the overall amount of work here is very small though.
-    auto key_input  = thrust::make_transform_iterator(pages.device_ptr(), get_page_chunk_idx{});
-    auto page_input = thrust::make_transform_iterator(pages.device_ptr(), get_page_num_rows{});
-    thrust::exclusive_scan_by_key(rmm::exec_policy(_stream),
-                                  key_input,
-                                  key_input + pages.size(),
-                                  page_input,
-                                  chunk_row_output_iter{pages.device_ptr()});
-
-    // retrieve pages back
-    pages.device_to_host_sync(_stream);
-
-    // print_pages(pages, _stream);
+  // copy our now-correct row counts back to the base pages stored in the pass.
+  auto iter = thrust::make_counting_iterator(0);
+  thrust::for_each(rmm::exec_policy(_stream),
+                   iter,
+                   iter + subpass.pages.size(),
+                   update_pass_num_rows{pass.pages, subpass.pages, subpass.page_src_index});
 
+  // computes:
+  // PageInfo::chunk_row (the chunk-relative row index) for all pages in the pass. The start_row
+  // field in ColumnChunkDesc is the absolute row index for the whole file. chunk_row in PageInfo
+  // is relative to the beginning of the chunk. so in the kernels, chunk.start_row + page.chunk_row
+  // gives us the absolute row index.
+  // NOTE: this is recomputing chunk_row for -all- pages in the pass, not just the pages in the
+  // current subpass. the reason we do this is that we may visit the same page multiple times over
+  // multiple subpasses (if we didn't process all rows in a given subpass). this greatly simplifies
+  // the logic.
+  auto key_input  = thrust::make_transform_iterator(pass.pages.d_begin(), get_page_chunk_idx{});
+  auto page_input = thrust::make_transform_iterator(pass.pages.d_begin(), get_page_num_rows{});
+  thrust::exclusive_scan_by_key(rmm::exec_policy(_stream),
+                                key_input,
+                                key_input + pass.pages.size(),
+                                page_input,
+                                chunk_row_output_iter{pass.pages.device_ptr()});
+
+  // finally, copy chunk row into the subpass.
+  thrust::for_each(rmm::exec_policy(_stream),
+                   iter,
+                   iter + subpass.pages.size(),
+                   update_subpass_chunk_row{pass.pages, subpass.pages, subpass.page_src_index});
+
+  // retrieve pages back
+  subpass.pages.device_to_host_sync(_stream);
+
+  // at this point we have an accurate row count, so we can compute how many rows we will actually
+  // be able to decode for this subpass. we will have selected a set of pages for each column in
+  // the subpass, but not every page will have the same number of rows. so, we can only read as
+  // many rows as the smallest batch (by column) we have decompressed, as sketched below.
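The loop that follows implements this "smallest batch wins" rule. As a host-side illustration with invented pass-relative end rows for three columns:

#include <algorithm>
#include <cstddef>
#include <cstdio>

int main()
{
  // hypothetical end row of the last page selected for each column, pass-relative
  std::size_t col_end_row[]  = {4'000, 3'500, 5'000};
  std::size_t max_row        = *std::min_element(std::begin(col_end_row), std::end(col_end_row));
  std::size_t processed_rows = 1'000;  // rows already emitted by earlier subpasses
  printf("subpass num_rows = %zu\n", max_row - processed_rows);  // 2500
  return 0;
}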
+ size_t page_index = 0; + size_t max_row = std::numeric_limits::max(); + for(size_t idx=0; idx(last_page.chunk_row + last_page.num_rows)); + page_index += subpass.chunk_page_count[idx]; } + CUDF_EXPECTS(max_row > pass.processed_rows, "Encountered invalid row read count"); + subpass.skip_rows = pass.skip_rows + pass.processed_rows; + subpass.num_rows = max_row - pass.processed_rows; - // preserve page ordering data for string decoder - _pass_itm_data->page_keys = std::move(page_keys); - _pass_itm_data->page_index = std::move(page_index); - - // compute splits for the pass - compute_splits_for_pass(); + // now split up the output into chunks as necessary + compute_chunks_for_subpass(); } void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses_custom_row_bounds) { - auto const& chunks = _pass_itm_data->chunks; - auto& pages = _pass_itm_data->pages_info; + auto& pass = *_pass_itm_data; + auto& subpass = *pass.subpass; // Should not reach here if there is no page data. - CUDF_EXPECTS(pages.size() > 0, "There is no page to parse"); + CUDF_EXPECTS(subpass.pages.size() > 0, "There are no pages present in the subpass"); // computes: // PageNestingInfo::batch_size for each level of nesting, for each page, taking row bounds into @@ -1290,13 +1044,13 @@ void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses // respect the user bounds. It is only necessary to do this second pass if uses_custom_row_bounds // is set (if the user has specified artificial bounds). if (uses_custom_row_bounds) { - ComputePageSizes(pages, - chunks, + ComputePageSizes(subpass.pages, + pass.chunks, skip_rows, num_rows, false, // num_rows is already computed false, // no need to compute string sizes - _pass_itm_data->level_type_size, + pass.level_type_size, _stream); // print_pages(pages, _stream); @@ -1333,8 +1087,6 @@ void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses // compute output column sizes by examining the pages of the -input- columns if (has_lists) { - auto& page_index = _pass_itm_data->page_index; - std::vector h_cols_info; h_cols_info.reserve(_input_columns.size()); std::transform(_input_columns.cbegin(), @@ -1353,7 +1105,7 @@ void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses auto const d_cols_info = cudf::detail::make_device_uvector_async( h_cols_info, _stream, rmm::mr::get_current_device_resource()); - auto const num_keys = _input_columns.size() * max_depth * pages.size(); + auto const num_keys = _input_columns.size() * max_depth * subpass.pages.size(); // size iterator. 
indexes pages by sorted order rmm::device_uvector size_input{num_keys, _stream}; thrust::transform( @@ -1362,9 +1114,9 @@ void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses thrust::make_counting_iterator(num_keys), size_input.begin(), get_page_nesting_size{ - d_cols_info.data(), max_depth, pages.size(), pages.device_ptr(), page_index.begin()}); + d_cols_info.data(), max_depth, subpass.pages.size(), subpass.pages.d_begin()}); auto const reduction_keys = - cudf::detail::make_counting_transform_iterator(0, get_reduction_key{pages.size()}); + cudf::detail::make_counting_transform_iterator(0, get_reduction_key{subpass.pages.size()}); cudf::detail::hostdevice_vector sizes{_input_columns.size() * max_depth, _stream}; // find the size of each column @@ -1382,7 +1134,7 @@ void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses reduction_keys + num_keys, size_input.cbegin(), start_offset_output_iterator{ - pages.device_ptr(), page_index.begin(), 0, d_cols_info.data(), max_depth, pages.size()}); + subpass.pages.d_begin(), 0, d_cols_info.data(), max_depth, subpass.pages.size()}); sizes.device_to_host_sync(_stream); for (size_type idx = 0; idx < static_cast(_input_columns.size()); idx++) { @@ -1413,30 +1165,30 @@ void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses std::vector reader::impl::calculate_page_string_offsets() { - auto& chunks = _pass_itm_data->chunks; - auto& pages = _pass_itm_data->pages_info; - auto const& page_keys = _pass_itm_data->page_keys; - auto const& page_index = _pass_itm_data->page_index; + auto& pass = *_pass_itm_data; + auto& subpass = *pass.subpass; + + auto page_keys = make_page_key_iterator(subpass.pages); std::vector col_sizes(_input_columns.size(), 0L); rmm::device_uvector d_col_sizes(col_sizes.size(), _stream); // use page_index to fetch page string sizes in the proper order - auto val_iter = thrust::make_transform_iterator( - page_index.begin(), page_to_string_size{pages.device_ptr(), chunks.device_ptr()}); + auto val_iter = thrust::make_transform_iterator(subpass.pages.d_begin(), + page_to_string_size{pass.chunks.d_begin()}); // do scan by key to calculate string offsets for each page thrust::exclusive_scan_by_key(rmm::exec_policy(_stream), - page_keys.begin(), - page_keys.end(), + page_keys, + page_keys + subpass.pages.size(), val_iter, - page_offset_output_iter{pages.device_ptr(), page_index.data()}); + page_offset_output_iter{subpass.pages.device_ptr()}); // now sum up page sizes rmm::device_uvector reduce_keys(col_sizes.size(), _stream); thrust::reduce_by_key(rmm::exec_policy(_stream), - page_keys.begin(), - page_keys.end(), + page_keys, + page_keys + subpass.pages.size(), val_iter, reduce_keys.begin(), d_col_sizes.begin()); From 4d2326d667b189f08059741eda9683434d2ff9b8 Mon Sep 17 00:00:00 2001 From: db Date: Mon, 6 Nov 2023 11:39:48 -0600 Subject: [PATCH 08/49] Formatting. 
--- cpp/src/io/parquet/page_hdr.cu | 19 +- cpp/src/io/parquet/parquet_gpu.hpp | 33 +- cpp/src/io/parquet/reader_impl.cpp | 42 ++- cpp/src/io/parquet/reader_impl.hpp | 8 +- cpp/src/io/parquet/reader_impl_chunking.cu | 319 ++++++++++--------- cpp/src/io/parquet/reader_impl_chunking.hpp | 14 +- cpp/src/io/parquet/reader_impl_preprocess.cu | 145 ++++----- 7 files changed, 302 insertions(+), 278 deletions(-) diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu index 5aa73f6aaea..6ac7eb1982e 100644 --- a/cpp/src/io/parquet/page_hdr.cu +++ b/cpp/src/io/parquet/page_hdr.cu @@ -342,8 +342,10 @@ struct gpuParsePageHeader { * @param[in] num_chunks Number of column chunks */ // blockDim {128,1,1} -__global__ void __launch_bounds__(128) - gpuDecodePageHeaders(ColumnChunkDesc* chunks, chunk_page_info *chunk_pages, int32_t num_chunks, int32_t* error_code) +__global__ void __launch_bounds__(128) gpuDecodePageHeaders(ColumnChunkDesc* chunks, + chunk_page_info* chunk_pages, + int32_t num_chunks, + int32_t* error_code) { using cudf::detail::warp_size; gpuParsePageHeader parse_page_header; @@ -382,10 +384,10 @@ __global__ void __launch_bounds__(128) bs->page.str_bytes = 0; bs->page.kernel_mask = 0; } - num_values = bs->ck.num_values; - page_info = chunk_pages ? chunk_pages[chunk].pages : nullptr; - max_num_pages = (page_info) ? bs->ck.max_num_pages : 0; - values_found = 0; + num_values = bs->ck.num_values; + page_info = chunk_pages ? chunk_pages[chunk].pages : nullptr; + max_num_pages = (page_info) ? bs->ck.max_num_pages : 0; + values_found = 0; __syncwarp(); while (values_found < num_values && bs->cur < bs->end) { int index_out = -1; @@ -442,7 +444,7 @@ __global__ void __launch_bounds__(128) } } index_out = shuffle(index_out); - if (index_out >= 0 && index_out < max_num_pages && lane_id == 0){ + if (index_out >= 0 && index_out < max_num_pages && lane_id == 0) { page_info[index_out] = bs->page; } num_values = shuffle(num_values); @@ -513,7 +515,8 @@ void __host__ DecodePageHeaders(ColumnChunkDesc* chunks, { dim3 dim_block(128, 1); dim3 dim_grid((num_chunks + 3) >> 2, 1); // 1 chunk per warp, 4 warps per block - gpuDecodePageHeaders<<>>(chunks, chunk_pages, num_chunks, error_code); + gpuDecodePageHeaders<<>>( + chunks, chunk_pages, num_chunks, error_code); } void __host__ BuildStringDictionaryIndex(ColumnChunkDesc* chunks, diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index a07b5ce4830..ea46387be9c 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -298,10 +298,7 @@ struct PageInfo { * @brief Return the column schema id as the key for a PageInfo struct. 
*/ struct get_page_key { - __device__ int32_t operator()(PageInfo const& page) const - { - return page.src_col_schema; - } + __device__ int32_t operator()(PageInfo const& page) const { return page.src_col_schema; } }; /** @@ -363,11 +360,11 @@ struct ColumnChunkDesc { { } - uint8_t const* compressed_data{}; // pointer to compressed column chunk data - size_t compressed_size{}; // total compressed data size for this chunk - size_t num_values{}; // total number of values in this column - size_t start_row{}; // file-wide, absolute starting row of this chunk - uint32_t num_rows{}; // number of rows in this chunk + uint8_t const* compressed_data{}; // pointer to compressed column chunk data + size_t compressed_size{}; // total compressed data size for this chunk + size_t num_values{}; // total number of values in this column + size_t start_row{}; // file-wide, absolute starting row of this chunk + uint32_t num_rows{}; // number of rows in this chunk int16_t max_level[level_type::NUM_LEVEL_TYPES]{}; // max definition/repetition level int16_t max_nesting_depth{}; // max nesting depth of the output uint16_t data_type{}; // basic column data type, ((type_length << 3) | @@ -378,14 +375,14 @@ struct ColumnChunkDesc { int32_t num_dict_pages{}; // number of dictionary pages int32_t max_num_pages{}; // size of page_info array PageInfo* dict_page{}; - string_index_pair* str_dict_index{}; // index for string dictionary - bitmask_type** valid_map_base{}; // base pointers of valid bit map for this column - void** column_data_base{}; // base pointers of column data - void** column_string_base{}; // base pointers of column string data - int8_t codec{}; // compressed codec enum - int8_t converted_type{}; // converted type enum - thrust::optional logical_type{}; // logical type - int8_t decimal_precision{}; // Decimal precision + string_index_pair* str_dict_index{}; // index for string dictionary + bitmask_type** valid_map_base{}; // base pointers of valid bit map for this column + void** column_data_base{}; // base pointers of column data + void** column_string_base{}; // base pointers of column string data + int8_t codec{}; // compressed codec enum + int8_t converted_type{}; // converted type enum + thrust::optional logical_type{}; // logical type + int8_t decimal_precision{}; // Decimal precision int32_t ts_clock_rate{}; // output timestamp clock frequency (0=default, 1000=ms, 1000000000=ns) int32_t src_col_index{}; // my input column index @@ -396,7 +393,7 @@ struct ColumnChunkDesc { * @brief A utility structure for use in decoding page headers. 
*/ struct chunk_page_info { - PageInfo *pages; + PageInfo* pages; }; /** diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 7b42a034bfe..e7ec419e470 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -30,9 +30,9 @@ namespace cudf::io::parquet::detail { void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) { - auto& pass = *_pass_itm_data; + auto& pass = *_pass_itm_data; auto& subpass = *pass.subpass; - + auto& page_nesting = subpass.page_nesting_info; auto& page_nesting_decode = subpass.page_nesting_decode_info; @@ -90,12 +90,12 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) // get a slice of size `nesting depth` from `chunk_nested_valids` to store an array of pointers // to validity data - auto valids = chunk_nested_valids.host_ptr(chunk_off); + auto valids = chunk_nested_valids.host_ptr(chunk_off); pass.chunks[c].valid_map_base = chunk_nested_valids.device_ptr(chunk_off); // get a slice of size `nesting depth` from `chunk_nested_data` to store an array of pointers to // out data - auto data = chunk_nested_data.host_ptr(chunk_off); + auto data = chunk_nested_data.host_ptr(chunk_off); pass.chunks[c].column_data_base = chunk_nested_data.device_ptr(chunk_off); auto str_data = has_strings ? chunk_nested_str_data.host_ptr(chunk_off) : nullptr; @@ -185,14 +185,24 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) // launch delta binary decoder if ((kernel_mask & KERNEL_MASK_DELTA_BINARY) != 0) { - DecodeDeltaBinary( - subpass.pages, pass.chunks, num_rows, skip_rows, level_type_size, error_code.data(), streams[s_idx++]); + DecodeDeltaBinary(subpass.pages, + pass.chunks, + num_rows, + skip_rows, + level_type_size, + error_code.data(), + streams[s_idx++]); } // launch the catch-all page decoder if ((kernel_mask & KERNEL_MASK_GENERAL) != 0) { - DecodePageData( - subpass.pages, pass.chunks, num_rows, skip_rows, level_type_size, error_code.data(), streams[s_idx++]); + DecodePageData(subpass.pages, + pass.chunks, + num_rows, + skip_rows, + level_type_size, + error_code.data(), + streams[s_idx++]); } // synchronize the streams @@ -371,14 +381,14 @@ table_with_metadata reader::impl::read_chunk_internal( auto out_columns = std::vector>{}; out_columns.reserve(_output_buffers.size()); - #if 0 +#if 0 if (!has_next()/* || _pass_itm_data->output_chunk_read_info.empty()*/) { return finalize_output(out_metadata, out_columns, filter); } - #endif +#endif - auto& pass = *_pass_itm_data; - auto& subpass = *pass.subpass; + auto& pass = *_pass_itm_data; + auto& subpass = *pass.subpass; auto const& read_info = subpass.output_chunk_read_info[subpass.current_output_chunk]; // Allocate memory buffers for the output columns. 
@@ -433,7 +443,7 @@ table_with_metadata reader::impl::finalize_output( } // advance output chunk/subpass/pass info - auto& pass = *_pass_itm_data; + auto& pass = *_pass_itm_data; auto& subpass = *pass.subpass; subpass.current_output_chunk++; pass.processed_rows += subpass.num_rows; @@ -495,11 +505,11 @@ bool reader::impl::has_next() true /*uses_custom_row_bounds*/, {} /*row_group_indices, empty means read all row groups*/, std::nullopt /*filter*/); - + // current_input_pass will only be incremented to be == num_passes after // the last chunk in the last subpass in the last pass has been returned - auto const num_passes = _file_itm_data.input_pass_row_group_offsets.size() - 1; - bool const more_work = _file_itm_data._current_input_pass < num_passes; + auto const num_passes = _file_itm_data.input_pass_row_group_offsets.size() - 1; + bool const more_work = _file_itm_data._current_input_pass < num_passes; return more_work; } diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index c6d711666b8..95d2e8ae1bb 100644 --- a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -139,7 +139,7 @@ class reader::impl { /** * @brief Read the set of column chunks to be processed for this pass. - * + * * Does not decompress the chunk data. * * @return pair of boolean indicating if compressed chunks were found and a vector of futures for @@ -154,10 +154,10 @@ class reader::impl { /** * @brief Preprocess step for the entire file. - * + * * Only ever called once. This function reads in rowgroup and associated chunk * information and computes the schedule of top level passes (see `pass_intermediate_data`). - * + * * @param skip_rows The number of rows to skip in the requested set of rowgroups to be read * @param num_rows The total number of rows to read out of the selected rowgroups * @param row_group_indices Lists of row groups to read, one per source @@ -170,7 +170,7 @@ class reader::impl { /** * @brief Preprocess step for the next input read pass. - * + * * A 'pass' is defined as a subset of row groups read out of the globally * requested set of all row groups. */ diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index 38124bfbf2c..393b9a47a14 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -23,8 +23,8 @@ #include -#include #include +#include #include @@ -123,7 +123,6 @@ void print_cumulative_row_info(host_span sizes, } #endif // CHUNKING_DEBUG - /** * @brief Functor which reduces two cumulative_page_info structs of the same key. 
*/ @@ -191,18 +190,16 @@ __device__ size_t row_size_functor::operator()(size_t num_rows, boo */ struct get_cumulative_page_info { __device__ cumulative_page_info operator()(PageInfo const& page) - { + { if (page.flags & PAGEINFO_FLAGS_DICTIONARY) { return cumulative_page_info{0, 0, page.src_col_schema}; } // total nested size, not counting string data - auto iter = - cudf::detail::make_counting_transform_iterator(0, [page] __device__(size_type i) { - auto const& pni = page.nesting[i]; - return cudf::type_dispatcher( - data_type{pni.type}, row_size_functor{}, pni.size, pni.nullable); - }); + auto iter = cudf::detail::make_counting_transform_iterator(0, [page] __device__(size_type i) { + auto const& pni = page.nesting[i]; + return cudf::type_dispatcher(data_type{pni.type}, row_size_functor{}, pni.size, pni.nullable); + }); size_t const row_count = static_cast(page.nesting[0].size); return { @@ -225,12 +222,9 @@ struct get_page_size { } // TODO: this is not accurate for lists. it might make sense to make a guess // based on total-rowgroup-size / # of rows in the rowgroup for an average of - // rows-per-byte. + // rows-per-byte. size_t const row_count = page.num_rows; - return { - row_count, - static_cast(page.uncompressed_page_size), - page.src_col_schema}; + return {row_count, static_cast(page.uncompressed_page_size), page.src_col_schema}; } }; @@ -276,13 +270,13 @@ int64_t find_next_split(int64_t cur_pos, std::vector const& sizes, size_t chunk_read_limit) { - size_t cur_cumulative_size = cur_pos == 0 ? 0 : sizes[cur_pos-1].size_bytes; + size_t cur_cumulative_size = cur_pos == 0 ? 0 : sizes[cur_pos - 1].size_bytes; auto start = thrust::make_transform_iterator(sizes.begin(), [&](cumulative_page_info const& i) { return i.size_bytes - cur_cumulative_size; }); auto end = start + sizes.size(); - + int64_t split_pos = thrust::lower_bound(thrust::seq, start + cur_pos, end, chunk_read_limit) - start; @@ -299,7 +293,7 @@ int64_t find_next_split(int64_t cur_pos, // the list twice. so we have to iterate until we skip past all of them. The idea is that we // either do this, or we have to call unique() on the input first. while (split_pos < (static_cast(sizes.size()) - 1) && - (split_pos < 0 || sizes[split_pos].row_count == cur_row_count)) { + (split_pos < 0 || sizes[split_pos].row_count == cur_row_count)) { split_pos++; } @@ -314,7 +308,7 @@ int64_t find_next_split(int64_t cur_pos, * @param chunk_read_limit Limit on total number of bytes to be returned per read, for all columns */ std::vector find_splits(std::vector const& sizes, - size_t chunk_read_limit) + size_t chunk_read_limit) { // now we have an array of {row_count, real output bytes}. just walk through it and generate // splits. @@ -322,16 +316,17 @@ std::vector find_splits(std::vector const& siz // sizes are reasonably large, this shouldn't iterate too many times std::vector splits; { - size_t cur_pos = 0; - size_t cur_row_count = 0; - auto const num_rows = sizes.back().row_count; + size_t cur_pos = 0; + size_t cur_row_count = 0; + auto const num_rows = sizes.back().row_count; while (cur_row_count < num_rows) { auto const split_pos = find_next_split(cur_pos, cur_row_count, sizes, chunk_read_limit); - + auto const start_row = cur_row_count; cur_row_count = sizes[split_pos].row_count; - splits.push_back(split_info{row_range{start_row, cur_row_count - start_row}, static_cast(cur_pos == 0 ? 0 : cur_pos + 1)}); - cur_pos = split_pos; + splits.push_back(split_info{row_range{start_row, cur_row_count - start_row}, + static_cast(cur_pos == 0 ? 
0 : cur_pos + 1)}); + cur_pos = split_pos; } } // print_cumulative_row_info(sizes, "adjusted", splits); @@ -389,12 +384,12 @@ struct row_count_compare { std::pair get_row_group_size(RowGroup const& rg) { - auto compressed_size_iter = thrust::make_transform_iterator(rg.columns.begin(), [](ColumnChunk const& c){ - return c.meta_data.total_compressed_size; - }); + auto compressed_size_iter = thrust::make_transform_iterator( + rg.columns.begin(), [](ColumnChunk const& c) { return c.meta_data.total_compressed_size; }); // the trick is that total temp space needed is tricky to know - auto const compressed_size = std::reduce(compressed_size_iter, compressed_size_iter + rg.columns.size()); + auto const compressed_size = + std::reduce(compressed_size_iter, compressed_size_iter + rg.columns.size()); auto const total_size = compressed_size + rg.total_byte_size; return {compressed_size, total_size}; } @@ -405,33 +400,36 @@ adjust_cumulative_sizes(rmm::device_uvector const& c_info, rmm::cuda_stream_view stream) { // sort by row count - rmm::device_uvector c_info_sorted{c_info, stream}; + rmm::device_uvector c_info_sorted{c_info, stream}; thrust::sort( rmm::exec_policy(stream), c_info_sorted.begin(), c_info_sorted.end(), row_count_compare{}); // page keys grouped by split. rmm::device_uvector page_keys_by_split{c_info.size(), stream}; - thrust::transform(rmm::exec_policy(stream), c_info_sorted.begin(), c_info_sorted.end(), page_keys_by_split.begin(), [] __device__ (cumulative_page_info const& c){ - return c.key; - }); + thrust::transform(rmm::exec_policy(stream), + c_info_sorted.begin(), + c_info_sorted.end(), + page_keys_by_split.begin(), + [] __device__(cumulative_page_info const& c) { return c.key; }); std::vector h_c_info_sorted(c_info_sorted.size()); CUDF_CUDA_TRY(cudaMemcpy(h_c_info_sorted.data(), - c_info_sorted.data(), - sizeof(cumulative_page_info) * c_info_sorted.size(), - cudaMemcpyDefault)); + c_info_sorted.data(), + sizeof(cumulative_page_info) * c_info_sorted.size(), + cudaMemcpyDefault)); // print_cumulative_row_info(h_c_info_sorted, "raw"); // generate key offsets (offsets to the start of each partition of keys). 
worst case is 1 page per // key rmm::device_uvector key_offsets(pages.size() + 1, stream); - auto page_keys = make_page_key_iterator(pages); + auto page_keys = make_page_key_iterator(pages); auto const key_offsets_end = thrust::reduce_by_key(rmm::exec_policy(stream), page_keys, page_keys + pages.size(), thrust::make_constant_iterator(1), thrust::make_discard_iterator(), - key_offsets.begin()).second; + key_offsets.begin()) + .second; size_t const num_unique_keys = key_offsets_end - key_offsets.begin(); thrust::exclusive_scan( rmm::exec_policy(stream), key_offsets.begin(), key_offsets.end(), key_offsets.begin()); @@ -458,14 +456,14 @@ adjust_cumulative_sizes(rmm::device_uvector const& c_info, struct page_span { size_t start, end; }; -std::pair, size_t> -compute_next_subpass(rmm::device_uvector const& c_info, - cudf::detail::hostdevice_vector const& pages, - cudf::detail::hostdevice_vector const& page_offsets, - size_t min_row, - size_t size_limit, - size_t num_columns, - rmm::cuda_stream_view stream) +std::pair, size_t> compute_next_subpass( + rmm::device_uvector const& c_info, + cudf::detail::hostdevice_vector const& pages, + cudf::detail::hostdevice_vector const& page_offsets, + size_t min_row, + size_t size_limit, + size_t num_columns, + rmm::cuda_stream_view stream) { auto [aggregated_info, page_keys_by_split] = adjust_cumulative_sizes(c_info, pages, stream); @@ -481,48 +479,55 @@ compute_next_subpass(rmm::device_uvector const& c_info, // print_cumulative_row_info(h_aggregated_info, "adjusted"); // first, find the min row - auto start = thrust::make_transform_iterator(h_aggregated_info.begin(), [&](cumulative_page_info const& i){ - return i.row_count; - }); - auto const start_index = thrust::upper_bound(thrust::host, start, start + h_aggregated_info.size(), min_row) - start; + auto start = thrust::make_transform_iterator( + h_aggregated_info.begin(), [&](cumulative_page_info const& i) { return i.row_count; }); + auto const start_index = + thrust::upper_bound(thrust::host, start, start + h_aggregated_info.size(), min_row) - start; // find the next split auto const end_index = find_next_split(start_index, min_row, // 0, h_aggregated_info, - size_limit) + 1; // the split index returned is inclusive + size_limit) + + 1; // the split index returned is inclusive // get the number of pages for each column/schema - auto get_page_counts = [num_columns, stream](rmm::device_uvector const& aggregated_info, int start_index, int end_index){ + auto get_page_counts = [num_columns, stream]( + rmm::device_uvector const& aggregated_info, + int start_index, + int end_index) { std::vector h_page_counts(num_columns); auto const num_pages = end_index - start_index; - if(num_pages == 0){ + if (num_pages == 0) { std::fill(h_page_counts.begin(), h_page_counts.end(), 0); return h_page_counts; } rmm::device_uvector page_keys(num_pages, stream); - thrust::transform(rmm::exec_policy(stream), - aggregated_info.begin() + start_index, + thrust::transform(rmm::exec_policy(stream), + aggregated_info.begin() + start_index, aggregated_info.begin() + end_index, page_keys.begin(), - [] __device__ (cumulative_page_info const& i){ - return i.key; - }); + [] __device__(cumulative_page_info const& i) { return i.key; }); thrust::sort(rmm::exec_policy(stream), page_keys.begin(), page_keys.end()); rmm::device_uvector page_counts(num_pages, stream); - auto page_counts_end = thrust::reduce_by_key(rmm::exec_policy(stream), - page_keys.begin(), - page_keys.end(), - thrust::make_constant_iterator(1), + auto page_counts_end = 
thrust::reduce_by_key(rmm::exec_policy(stream), + page_keys.begin(), + page_keys.end(), + thrust::make_constant_iterator(1), thrust::make_discard_iterator(), - page_counts.begin()).second; + page_counts.begin()) + .second; auto const num_page_counts = page_counts_end - page_counts.begin(); - CUDF_EXPECTS(static_cast(num_page_counts) == num_columns, "Encountered a mismatch in column/schema counts while computing subpass split"); - - cudaMemcpyAsync(h_page_counts.data(), page_counts.data(), sizeof(size_t) * num_columns, cudaMemcpyDeviceToHost); + CUDF_EXPECTS(static_cast(num_page_counts) == num_columns, + "Encountered a mismatch in column/schema counts while computing subpass split"); + + cudaMemcpyAsync(h_page_counts.data(), + page_counts.data(), + sizeof(size_t) * num_columns, + cudaMemcpyDeviceToHost); stream.synchronize(); return h_page_counts; }; @@ -534,22 +539,21 @@ compute_next_subpass(rmm::device_uvector const& c_info, // convert to page spans std::vector out(num_columns); size_t total_pages = 0; - for(size_t c_idx=0; c_idx, rmm::device_uvector> -compute_page_splits_by_row(rmm::device_uvector const& c_info, - cudf::detail::hostdevice_vector const& pages, - size_t size_limit, - rmm::cuda_stream_view stream) +std::pair, rmm::device_uvector> compute_page_splits_by_row( + rmm::device_uvector const& c_info, + cudf::detail::hostdevice_vector const& pages, + size_t size_limit, + rmm::cuda_stream_view stream) { auto [aggregated_info, page_keys_by_split] = adjust_cumulative_sizes(c_info, pages, stream); @@ -568,7 +572,7 @@ compute_page_splits_by_row(rmm::device_uvector const& c_in /** * @brief Decompresses the page data, at page granularity. - * + * * This function handles the case where `pages` is only a subset of all available * pages in `chunks`. * @@ -584,10 +588,8 @@ compute_page_splits_by_row(rmm::device_uvector const& c_in rmm::cuda_stream_view stream) { auto for_each_codec_page = [&](Compression codec, std::function const& f) { - for(size_t p = 0; p const& pag // sum row counts for all non-dictionary, non-list columns. 
other columns will be indicated as 0 rmm::device_uvector row_counts(pages.size(), stream); // worst case: num keys == num pages - auto const size_iter = thrust::make_transform_iterator(pages.d_begin(), flat_column_num_rows{chunks.device_ptr()}); + auto const size_iter = + thrust::make_transform_iterator(pages.d_begin(), flat_column_num_rows{chunks.device_ptr()}); auto const row_counts_begin = row_counts.begin(); - auto page_keys = make_page_key_iterator(pages); + auto page_keys = make_page_key_iterator(pages); auto const row_counts_end = thrust::reduce_by_key(rmm::exec_policy(stream), - page_keys, - page_keys + pages.size(), - size_iter, - thrust::make_discard_iterator(), - row_counts_begin) + page_keys, + page_keys + pages.size(), + size_iter, + thrust::make_discard_iterator(), + row_counts_begin) .second; // make sure all non-zero row counts are the same @@ -927,8 +930,9 @@ void reader::impl::compute_input_passes() auto const& rgi = row_groups_info[cur_rg_index]; auto const& row_group = _metadata->get_row_group(rgi.index, rgi.source_index); - // total compressed size and total size (compressed + uncompressed) for - auto const [compressed_rg_size, _/*compressed + uncompressed*/] = get_row_group_size(row_group); + // total compressed size and total size (compressed + uncompressed) for + auto const [compressed_rg_size, _ /*compressed + uncompressed*/] = + get_row_group_size(row_group); // can we add this row group if (cur_pass_byte_size + compressed_rg_size >= read_limit) { @@ -963,19 +967,20 @@ void reader::impl::compute_input_passes() void reader::impl::compute_chunks_for_subpass() { - auto& pass = *_pass_itm_data; + auto& pass = *_pass_itm_data; auto& subpass = *pass.subpass; - + // simple case : no chunk size, no splits if (_output_chunk_read_limit <= 0) { subpass.output_chunk_read_info.push_back({subpass.skip_rows, subpass.num_rows}); return; } - + // generate cumulative row counts and sizes rmm::device_uvector c_info(subpass.pages.size(), _stream); // convert PageInfo to cumulative_page_info - auto page_input = thrust::make_transform_iterator(subpass.pages.d_begin(), get_cumulative_page_info{}); + auto page_input = + thrust::make_transform_iterator(subpass.pages.d_begin(), get_cumulative_page_info{}); auto page_keys = make_page_key_iterator(subpass.pages); thrust::inclusive_scan_by_key(rmm::exec_policy(_stream), page_keys, @@ -985,17 +990,21 @@ void reader::impl::compute_chunks_for_subpass() thrust::equal_to{}, cumulative_page_sum{}); // print_cumulative_page_info(subpass.pages, c_info, _stream); - + // compute the splits - auto [splits, _] = compute_page_splits_by_row(c_info, subpass.pages, _output_chunk_read_limit, _stream); + auto [splits, _] = + compute_page_splits_by_row(c_info, subpass.pages, _output_chunk_read_limit, _stream); subpass.output_chunk_read_info.reserve(splits.size()); // apply skip_rows from the subpass - std::transform(splits.begin(), splits.end(), std::back_inserter(subpass.output_chunk_read_info), [&subpass](split_info const &s){ - row_range r = s.rows; - r.skip_rows += subpass.skip_rows; - return r; - }); + std::transform(splits.begin(), + splits.end(), + std::back_inserter(subpass.output_chunk_read_info), + [&subpass](split_info const& s) { + row_range r = s.rows; + r.skip_rows += subpass.skip_rows; + return r; + }); } void reader::impl::preprocess_next_pass() @@ -1008,20 +1017,22 @@ void reader::impl::preprocess_next_pass() if (_file_itm_data.global_num_rows > 0 && not _file_itm_data.row_groups.empty() && not _input_columns.empty() && 
_file_itm_data._current_input_pass < num_passes) { - auto& pass = *_pass_itm_data; // setup row groups to be loaded for this pass - auto const row_group_start = _file_itm_data.input_pass_row_group_offsets[_file_itm_data._current_input_pass]; - auto const row_group_end = _file_itm_data.input_pass_row_group_offsets[_file_itm_data._current_input_pass + 1]; - auto const num_row_groups = row_group_end - row_group_start; + auto const row_group_start = + _file_itm_data.input_pass_row_group_offsets[_file_itm_data._current_input_pass]; + auto const row_group_end = + _file_itm_data.input_pass_row_group_offsets[_file_itm_data._current_input_pass + 1]; + auto const num_row_groups = row_group_end - row_group_start; pass.row_groups.resize(num_row_groups); std::copy(_file_itm_data.row_groups.begin() + row_group_start, _file_itm_data.row_groups.begin() + row_group_end, pass.row_groups.begin()); auto const num_passes = _file_itm_data.input_pass_row_group_offsets.size() - 1; - CUDF_EXPECTS(_file_itm_data._current_input_pass < num_passes, "Encountered an invalid read pass index"); + CUDF_EXPECTS(_file_itm_data._current_input_pass < num_passes, + "Encountered an invalid read pass index"); auto const chunks_per_rowgroup = _input_columns.size(); auto const num_chunks = chunks_per_rowgroup * num_row_groups; @@ -1039,10 +1050,11 @@ void reader::impl::preprocess_next_pass() } else { auto const global_start_row = _file_itm_data.global_skip_rows; auto const global_end_row = global_start_row + _file_itm_data.global_num_rows; - auto const start_row = - std::max(_file_itm_data.input_pass_row_count[_file_itm_data._current_input_pass], global_start_row); + auto const start_row = std::max( + _file_itm_data.input_pass_row_count[_file_itm_data._current_input_pass], global_start_row); auto const end_row = - std::min(_file_itm_data.input_pass_row_count[_file_itm_data._current_input_pass + 1], global_end_row); + std::min(_file_itm_data.input_pass_row_count[_file_itm_data._current_input_pass + 1], + global_end_row); // skip_rows is always global in the sense that it is relative to the first row of // everything we will be reading, regardless of what pass we are on. @@ -1062,12 +1074,9 @@ void reader::impl::preprocess_next_pass() // is possible to load this by just capping the number of rows read, we cannot tell // which rows are invalid so we may be returning bad data. in addition, this mismatch // confuses the chunked reader - detect_malformed_pages(pass.pages, - pass.chunks, - pass.num_rows, - _stream); + detect_malformed_pages(pass.pages, pass.chunks, pass.num_rows, _stream); - // since there is only ever 1 dictionary per chunk (the 0th path), do it at the + // since there is only ever 1 dictionary per chunk (the 0th path), do it at the // pass level. 
build_string_dict_indices(); @@ -1075,19 +1084,22 @@ void reader::impl::preprocess_next_pass() // columns in the input // page_keys: 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3 // - // result: 0, 4, 8 + // result: 0, 4, 8 rmm::device_uvector page_counts(pass.pages.size() + 1, _stream); - auto page_keys = make_page_key_iterator(pass.pages); + auto page_keys = make_page_key_iterator(pass.pages); auto const page_counts_end = thrust::reduce_by_key(rmm::exec_policy(_stream), - page_keys, - page_keys + pass.pages.size(), - thrust::make_constant_iterator(1), - thrust::make_discard_iterator(), - page_counts.begin()).second; + page_keys, + page_keys + pass.pages.size(), + thrust::make_constant_iterator(1), + thrust::make_discard_iterator(), + page_counts.begin()) + .second; auto const num_page_counts = page_counts_end - page_counts.begin(); pass.page_offsets = cudf::detail::hostdevice_vector(num_page_counts + 1, _stream); - thrust::exclusive_scan( - rmm::exec_policy(_stream), page_counts.begin(), page_counts.begin() + num_page_counts + 1, pass.page_offsets.d_begin()); + thrust::exclusive_scan(rmm::exec_policy(_stream), + page_counts.begin(), + page_counts.begin() + num_page_counts + 1, + pass.page_offsets.d_begin()); pass.page_offsets.device_to_host_async(_stream); pass.page_processed_counts = std::vector(num_page_counts); @@ -1107,9 +1119,9 @@ void reader::impl::preprocess_next_pass() } void reader::impl::handle_chunking(bool uses_custom_row_bounds) -{ +{ // if this is our first time in here, setup the first pass. - if(!_pass_itm_data){ + if (!_pass_itm_data) { // preprocess the next pass preprocess_next_pass(); } @@ -1117,56 +1129,58 @@ void reader::impl::handle_chunking(bool uses_custom_row_bounds) auto& pass = *_pass_itm_data; // if we already have a subpass in flight. - if(pass.subpass != nullptr){ + if (pass.subpass != nullptr) { // if it still has more chunks in flight, there's nothing more to do - if(pass.subpass->current_output_chunk < pass.subpass->output_chunk_read_info.size()){ + if (pass.subpass->current_output_chunk < pass.subpass->output_chunk_read_info.size()) { return; - } + } // release the old subpass (will free memory) pass.subpass.reset(); // otherwise we are done with the pass entirely - if(pass.processed_rows == pass.num_rows){ + if (pass.processed_rows == pass.num_rows) { // release the old pass _pass_itm_data.reset(); _file_itm_data._current_input_pass++; auto const num_passes = _file_itm_data.input_pass_row_group_offsets.size() - 1; // no more passes. we are absolutely done with this file. - if(_file_itm_data._current_input_pass == num_passes){ - return; - } + if (_file_itm_data._current_input_pass == num_passes) { return; } // preprocess the next pass preprocess_next_pass(); } - } - + } + // next pass - pass.subpass = std::make_unique(); + pass.subpass = std::make_unique(); auto& subpass = *pass.subpass; auto const num_columns = pass.page_offsets.size() - 1; - - auto [page_indices, total_pages] = [&]() -> std::pair, size_t> { - // special case: if we contain no compressed data, or if we have no input limit, we can always just do 1 subpass since - // what we already have loaded is all the temporary memory we will ever use. - if(!pass.has_compressed_data || _input_pass_read_limit == 0){ + + auto [page_indices, total_pages] = [&]() -> std::pair, size_t> { + // special case: if we contain no compressed data, or if we have no input limit, we can always + // just do 1 subpass since what we already have loaded is all the temporary memory we will ever + // use. 
+ if (!pass.has_compressed_data || _input_pass_read_limit == 0) { std::vector page_indices; page_indices.reserve(num_columns); auto iter = thrust::make_counting_iterator(0); - std::transform(iter, iter + num_columns, std::back_inserter(page_indices), [&](size_t i) -> page_span { - return {static_cast(pass.page_offsets[i]), static_cast(pass.page_offsets[i+1])}; - }); + std::transform( + iter, iter + num_columns, std::back_inserter(page_indices), [&](size_t i) -> page_span { + return {static_cast(pass.page_offsets[i]), + static_cast(pass.page_offsets[i + 1])}; + }); return {page_indices, pass.pages.size()}; - } + } // otherwise we have to look forward and choose a batch of pages // generate cumulative page sizes. rmm::device_uvector c_info(pass.pages.size(), _stream); auto page_keys = make_page_key_iterator(pass.pages); - auto page_size = thrust::make_transform_iterator(pass.pages.d_begin(), get_page_size{pass.chunks}); + auto page_size = + thrust::make_transform_iterator(pass.pages.d_begin(), get_page_size{pass.chunks}); thrust::inclusive_scan_by_key(rmm::exec_policy(_stream), page_keys, page_keys + pass.pages.size(), @@ -1177,21 +1191,28 @@ void reader::impl::handle_chunking(bool uses_custom_row_bounds) // print_cumulative_page_info(pass.pages, c_info, _stream); // get the next batch of pages - return compute_next_subpass(c_info, pass.pages, pass.page_offsets, pass.processed_rows, _input_pass_read_limit, num_columns, _stream); + return compute_next_subpass(c_info, + pass.pages, + pass.page_offsets, + pass.processed_rows, + _input_pass_read_limit, + num_columns, + _stream); }(); - - // fill out the subpass struct + + // fill out the subpass struct subpass.pages = cudf::detail::hostdevice_vector(0, total_pages, _stream); - subpass.page_src_index = cudf::detail::hostdevice_vector(total_pages, total_pages, _stream); + subpass.page_src_index = + cudf::detail::hostdevice_vector(total_pages, total_pages, _stream); // copy the appropriate subset of pages from each column size_t page_count = 0; - for(size_t c_idx=0; c_idx input_pass_row_group_offsets{}; - + // row counts per input-pass std::vector input_pass_row_count{}; @@ -95,18 +95,18 @@ struct pass_intermediate_data { std::vector row_groups{}; cudf::detail::hostdevice_vector chunks{}; cudf::detail::hostdevice_vector pages{}; - + // offsets to each group of input pages (by column/schema) // so if we had 2 columns/schemas, with page keys // // 1 1 1 1 1 2 2 2 - // + // // page_offsets would be 0, 5, 8 cudf::detail::hostdevice_vector page_offsets{}; // for each group of input pages (by column, schema), the count // of how many pages we have processed so far - std::vector page_processed_counts{}; - + std::vector page_processed_counts{}; + rmm::device_uvector str_dict_index{0, rmm::cuda_stream_default}; int level_type_size{0}; @@ -114,9 +114,9 @@ struct pass_intermediate_data { // skip_rows / num_rows for this pass. // NOTE: skip_rows is the absolute row index in the file. 
size_t skip_rows; - size_t num_rows; + size_t num_rows; // number of rows we have processed so far (out of num_rows) - size_t processed_rows{0}; + size_t processed_rows{0}; // currently active subpass std::unique_ptr subpass{}; diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index a2e93de00bb..531b416023c 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -65,7 +65,6 @@ void print_pages(cudf::detail::hostdevice_vector& pages, rmm::cuda_str } #endif // PREPROCESS_DEBUG - /** * @brief Generate depth remappings for repetition and definition levels. * @@ -317,15 +316,16 @@ int decode_page_headers(cudf::detail::hostdevice_vector& chunks // IMPORTANT : if you change how pages are stored within a chunk (dist pages, then data pages), // please update preprocess_nested_columns to reflect this. for (size_t c = 0, page_count = 0; c < chunks.size(); c++) { - chunks[c].max_num_pages = chunks[c].num_data_pages + chunks[c].num_dict_pages; - chunk_page_info[c].pages = pages.device_ptr(page_count); + chunks[c].max_num_pages = chunks[c].num_data_pages + chunks[c].num_dict_pages; + chunk_page_info[c].pages = pages.device_ptr(page_count); page_count += chunks[c].max_num_pages; } kernel_error error_code(stream); chunks.host_to_device_async(stream); chunk_page_info.host_to_device_async(stream); - DecodePageHeaders(chunks.device_ptr(), chunk_page_info.device_ptr(), chunks.size(), error_code.data(), stream); + DecodePageHeaders( + chunks.device_ptr(), chunk_page_info.device_ptr(), chunks.size(), error_code.data(), stream); if (error_code.value() != 0) { // TODO(ets): if an unsupported encoding was detected, do extra work to figure out which one @@ -371,9 +371,7 @@ int decode_page_headers(cudf::detail::hostdevice_vector& chunks pages.d_begin(), pages.d_begin() + pages.size(), page_keys.begin(), - [] __device__ (PageInfo const& page){ - return page.src_col_schema; - }); + [] __device__(PageInfo const& page) { return page.src_col_schema; }); thrust::stable_sort_by_key(rmm::exec_policy(stream), page_keys.begin(), page_keys.end(), @@ -414,15 +412,14 @@ void reader::impl::build_string_dict_indices() // Build index for string dictionaries since they can't be indexed // directly due to variable-sized elements - pass.str_dict_index = - cudf::detail::make_zeroed_device_uvector_async( - total_str_dict_indexes, _stream, rmm::mr::get_current_device_resource()); + pass.str_dict_index = cudf::detail::make_zeroed_device_uvector_async( + total_str_dict_indexes, _stream, rmm::mr::get_current_device_resource()); // Update chunks with pointers to string dict indices for (size_t c = 0, page_count = 0, str_ofs = 0; c < pass.chunks.size(); c++) { input_column_info const& input_col = _input_columns[pass.chunks[c].src_col_index]; CUDF_EXPECTS(input_col.schema_idx == pass.chunks[c].src_col_schema, - "Column/page schema index mismatch"); + "Column/page schema index mismatch"); if (is_dict_chunk(pass.chunks[c])) { pass.chunks[c].str_dict_index = pass.str_dict_index.data() + str_ofs; str_ofs += pass.pages[page_count].num_input_values; @@ -440,7 +437,7 @@ void reader::impl::build_string_dict_indices() void reader::impl::allocate_nesting_info() { - auto& pass = *_pass_itm_data; + auto& pass = *_pass_itm_data; auto& subpass = *pass.subpass; auto const& chunks = pass.chunks; @@ -451,8 +448,8 @@ void reader::impl::allocate_nesting_info() // compute total # of page_nesting infos needed and allocate space. 
doing this in one // buffer to keep it to a single gpu allocation auto counting_iter = thrust::make_counting_iterator(size_t{0}); - size_t const total_page_nesting_infos = std::accumulate( - counting_iter, counting_iter + chunks.size(), 0, [&](int total, size_t index) { + size_t const total_page_nesting_infos = + std::accumulate(counting_iter, counting_iter + chunks.size(), 0, [&](int total, size_t index) { auto const& chunk = chunks[index]; // the schema of the input column @@ -572,16 +569,15 @@ void reader::impl::allocate_nesting_info() void reader::impl::allocate_level_decode_space() { - auto& pass = *_pass_itm_data; + auto& pass = *_pass_itm_data; auto& subpass = *pass.subpass; auto& pages = subpass.pages; // TODO: this could be made smaller if we ignored dictionary pages and pages with no // repetition data. - size_t const per_page_decode_buf_size = - LEVEL_DECODE_BUF_SIZE * 2 * pass.level_type_size; - auto const decode_buf_size = per_page_decode_buf_size * pages.size(); + size_t const per_page_decode_buf_size = LEVEL_DECODE_BUF_SIZE * 2 * pass.level_type_size; + auto const decode_buf_size = per_page_decode_buf_size * pages.size(); subpass.level_decode_data = rmm::device_buffer(decode_buf_size, _stream, rmm::mr::get_current_device_resource()); @@ -619,8 +615,9 @@ std::pair>> reader::impl::read_column_chunks // Initialize column chunk information size_t total_decompressed_size = 0; - // TODO: make this respect the pass-wide skip_rows/num_rows instead of the file-wide skip_rows/num_rows - //auto remaining_rows = num_rows; + // TODO: make this respect the pass-wide skip_rows/num_rows instead of the file-wide + // skip_rows/num_rows + // auto remaining_rows = num_rows; std::vector> read_chunk_tasks; size_type chunk_count = 0; for (auto const& rg : row_groups_info) { @@ -648,7 +645,7 @@ std::pair>> reader::impl::read_column_chunks chunk_count++; } - //remaining_rows -= row_group_rows; + // remaining_rows -= row_group_rows; } // Read compressed chunk data to device memory @@ -661,7 +658,7 @@ std::pair>> reader::impl::read_column_chunks chunk_source_map, _stream)); - //CUDF_EXPECTS(remaining_rows == 0, "All rows data must be read."); + // CUDF_EXPECTS(remaining_rows == 0, "All rows data must be read."); return {total_decompressed_size > 0, std::move(read_chunk_tasks)}; } @@ -672,12 +669,12 @@ void reader::impl::load_compressed_data() // This function should never be called if `num_rows == 0`. 
// CUDF_EXPECTS(_pass_itm_data->num_rows > 0, "Number of reading rows must not be zero."); - - auto& chunks = pass.chunks; - auto& pages = pass.pages; - auto const [has_compressed_data, read_chunks_tasks] = read_column_chunks(); - pass.has_compressed_data = has_compressed_data; + auto& chunks = pass.chunks; + auto& pages = pass.pages; + + auto const [has_compressed_data, read_chunks_tasks] = read_column_chunks(); + pass.has_compressed_data = has_compressed_data; for (auto& task : read_chunks_tasks) { task.wait(); @@ -801,17 +798,16 @@ struct start_offset_output_iterator { constexpr void operator=(start_offset_output_iterator const& other) { - pages = other.pages; - cur_index = other.cur_index; - input_cols = other.input_cols; - max_depth = other.max_depth; - num_pages = other.num_pages; + pages = other.pages; + cur_index = other.cur_index; + input_cols = other.input_cols; + max_depth = other.max_depth; + num_pages = other.num_pages; } constexpr start_offset_output_iterator operator+(size_t i) { - return start_offset_output_iterator{ - pages, cur_index + i, input_cols, max_depth, num_pages}; + return start_offset_output_iterator{pages, cur_index + i, input_cols, max_depth, num_pages}; } constexpr void operator++() { cur_index++; } @@ -834,7 +830,7 @@ struct start_offset_output_iterator { } }; -struct page_to_string_size { +struct page_to_string_size { ColumnChunkDesc const* chunks; __device__ size_t operator()(PageInfo const& page) const @@ -868,20 +864,21 @@ struct page_offset_output_iter { } // anonymous namespace -void reader::impl::preprocess_file(int64_t skip_rows, - std::optional const& num_rows, - host_span const> row_group_indices, - std::optional> filter) +void reader::impl::preprocess_file( + int64_t skip_rows, + std::optional const& num_rows, + host_span const> row_group_indices, + std::optional> filter) { CUDF_EXPECTS(!_file_preprocessed, "Attempted to preprocess file more than once"); - // if filter is not empty, then create output types as vector and pass for filtering. + // if filter is not empty, then create output types as vector and pass for filtering. std::vector output_types; if (filter.has_value()) { std::transform(_output_buffers.cbegin(), - _output_buffers.cend(), - std::back_inserter(output_types), - [](auto const& col) { return col.type; }); + _output_buffers.cend(), + std::back_inserter(output_types), + [](auto const& col) { return col.type; }); } std::tie( _file_itm_data.global_skip_rows, _file_itm_data.global_num_rows, _file_itm_data.row_groups) = @@ -890,7 +887,6 @@ void reader::impl::preprocess_file(int64_t skip_rows, if (_file_itm_data.global_num_rows > 0 && not _file_itm_data.row_groups.empty() && not _input_columns.empty()) { - // fills in chunk information without physically loading or decompressing // the associated data create_global_chunk_info(); @@ -920,15 +916,12 @@ struct update_pass_num_rows { device_span subpass_pages; device_span page_src_index; - void operator()(size_t i) - { - pass_pages[page_src_index[i]].num_rows = subpass_pages[i].num_rows; - } + void operator()(size_t i) { pass_pages[page_src_index[i]].num_rows = subpass_pages[i].num_rows; } }; void reader::impl::preprocess_subpass_pages(bool uses_custom_row_bounds, size_t chunk_read_limit) { - auto& pass = *_pass_itm_data; + auto& pass = *_pass_itm_data; auto& subpass = *pass.subpass; // iterate over all input columns and determine if they contain lists. 
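// The update_pass_num_rows functor above is launched on the device via
// thrust::for_each below; a minimal serial equivalent of the scatter it
// performs (illustrative only) is:
//
//   for (size_t i = 0; i < subpass_pages.size(); i++) {
//     pass_pages[page_src_index[i]].num_rows = subpass_pages[i].num_rows;
//   }
//
// Each subpass page writes its corrected row count back to the pass-level page
// it was copied from, so a page revisited by a later subpass carries an
// accurate count.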
@@ -978,20 +971,20 @@ void reader::impl::preprocess_subpass_pages(bool uses_custom_row_bounds, size_t } // copy our now-correct row counts back to the base pages stored in the pass. - auto iter = thrust::make_counting_iterator(0); - thrust::for_each(rmm::exec_policy(_stream), - iter, - iter + subpass.pages.size(), - update_pass_num_rows{pass.pages, subpass.pages, subpass.page_src_index}); + auto iter = thrust::make_counting_iterator(0); + thrust::for_each(rmm::exec_policy(_stream), + iter, + iter + subpass.pages.size(), + update_pass_num_rows{pass.pages, subpass.pages, subpass.page_src_index}); // computes: - // PageInfo::chunk_row (the chunk-relative row index) for all pages in the pass. The start_row field in - // ColumnChunkDesc is the absolute row index for the whole file. chunk_row in PageInfo is relative - // to the beginning of the chunk. so in the kernels, chunk.start_row + page.chunk_row gives us the - // absolute row index. - // NOTE: this is recomputing chunk_row for -all- pages in the pass, not just the pages in the current - // subpass. the reason we do this is that we may visit the same page multiple times over multiple - // subpasses (if we didn't process all rows in a given subpass). this greatly simplifies the logic. + // PageInfo::chunk_row (the chunk-relative row index) for all pages in the pass. The start_row + // field in ColumnChunkDesc is the absolute row index for the whole file. chunk_row in PageInfo is + // relative to the beginning of the chunk. so in the kernels, chunk.start_row + page.chunk_row + // gives us the absolute row index. NOTE: this is recomputing chunk_row for -all- pages in the + // pass, not just the pages in the current subpass. the reason we do this is that we may visit + // the same page multiple times over multiple subpasses (if we didn't process all rows in a given + // subpass). this greatly simplifies the logic. auto key_input = thrust::make_transform_iterator(pass.pages.d_begin(), get_page_chunk_idx{}); auto page_input = thrust::make_transform_iterator(pass.pages.d_begin(), get_page_num_rows{}); thrust::exclusive_scan_by_key(rmm::exec_policy(_stream), @@ -1002,27 +995,27 @@ void reader::impl::preprocess_subpass_pages(bool uses_custom_row_bounds, size_t // finally, copy chunk row into the subpass. thrust::for_each(rmm::exec_policy(_stream), - iter, - iter + subpass.pages.size(), - update_subpass_chunk_row{pass.pages, subpass.pages, subpass.page_src_index}); + iter, + iter + subpass.pages.size(), + update_subpass_chunk_row{pass.pages, subpass.pages, subpass.page_src_index}); // retrieve pages back subpass.pages.device_to_host_sync(_stream); // at this point we have an accurate row count so we can compute how many rows we will actually be - // able to decode for this pass. we will have selected a set of pages for each column in the chunk, - // but not every page will have the same number of rows. so, we can only read as many rows as - // the smallest batch (by column) we have decompressed. + // able to decode for this pass. we will have selected a set of pages for each column in the + // chunk, but not every page will have the same number of rows. so, we can only read as many rows + // as the smallest batch (by column) we have decompressed. 
size_t page_index = 0; - size_t max_row = std::numeric_limits::max(); - for(size_t idx=0; idx::max(); + for (size_t idx = 0; idx < subpass.chunk_page_count.size(); idx++) { auto const& last_page = subpass.pages[page_index + (subpass.chunk_page_count[idx] - 1)]; max_row = min(max_row, static_cast(last_page.chunk_row + last_page.num_rows)); page_index += subpass.chunk_page_count[idx]; } - CUDF_EXPECTS(max_row > pass.processed_rows, "Encountered invalid row read count"); + CUDF_EXPECTS(max_row > pass.processed_rows, "Encountered invalid row read count"); subpass.skip_rows = pass.skip_rows + pass.processed_rows; - subpass.num_rows = max_row - pass.processed_rows; + subpass.num_rows = max_row - pass.processed_rows; // now split up the output into chunks as necessary compute_chunks_for_subpass(); @@ -1030,7 +1023,7 @@ void reader::impl::preprocess_subpass_pages(bool uses_custom_row_bounds, size_t void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses_custom_row_bounds) { - auto& pass = *_pass_itm_data; + auto& pass = *_pass_itm_data; auto& subpass = *pass.subpass; // Should not reach here if there is no page data. @@ -1163,17 +1156,17 @@ void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses std::vector reader::impl::calculate_page_string_offsets() { - auto& pass = *_pass_itm_data; + auto& pass = *_pass_itm_data; auto& subpass = *pass.subpass; - + auto page_keys = make_page_key_iterator(subpass.pages); std::vector col_sizes(_input_columns.size(), 0L); rmm::device_uvector d_col_sizes(col_sizes.size(), _stream); // use page_index to fetch page string sizes in the proper order - auto val_iter = thrust::make_transform_iterator(subpass.pages.d_begin(), - page_to_string_size{pass.chunks.d_begin()}); + auto val_iter = thrust::make_transform_iterator(subpass.pages.d_begin(), + page_to_string_size{pass.chunks.d_begin()}); // do scan by key to calculate string offsets for each page thrust::exclusive_scan_by_key(rmm::exec_policy(_stream), From 8d4e3f9f6330c1b5dc5ed36389cd632841817cb9 Mon Sep 17 00:00:00 2001 From: db Date: Tue, 7 Nov 2023 15:59:41 -0600 Subject: [PATCH 09/49] Set up dictionary pages properly at the pass level. Fixed issues with uncompressed data. Added a couple of simple tests. --- cpp/src/io/parquet/reader_impl.hpp | 89 ++-- cpp/src/io/parquet/reader_impl_chunking.cu | 488 ++++++++++--------- cpp/src/io/parquet/reader_impl_chunking.hpp | 1 + cpp/src/io/parquet/reader_impl_preprocess.cu | 89 ++-- cpp/tests/io/parquet_chunked_reader_test.cpp | 81 +++ 5 files changed, 459 insertions(+), 289 deletions(-) diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index 95d2e8ae1bb..c8190d60dc9 100644 --- a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -120,6 +120,8 @@ class reader::impl { */ table_with_metadata read_chunk(); + // top level functions involved with ratcheting through the passes, subpasses + // and output chunks of the read process private: /** * @brief Perform the necessary data preprocessing for parsing file later on. @@ -137,21 +139,6 @@ class reader::impl { host_span const> row_group_indices, std::optional> filter); - /** - * @brief Read the set of column chunks to be processed for this pass. - * - * Does not decompress the chunk data.
- * - * @return pair of boolean indicating if compressed chunks were found and a vector of futures for - * read completion - */ - std::pair>> read_column_chunks(); - - /** - * @brief Load compressed data and page information for the current pass. - */ - void load_compressed_data(); - /** * @brief Preprocess step for the entire file. * * Only ever called once. This function reads in rowgroup and associated chunk * information and computes the schedule of top level passes (see `pass_intermediate_data`). * * @param skip_rows The number of rows to skip in the requested set of rowgroups to be read * @param num_rows The total number of rows to read out of the selected rowgroups * @param row_group_indices Lists of row groups to read, one per source * @param filter Optional AST expression to filter output rows */ void preprocess_file(int64_t skip_rows, std::optional const& num_rows, host_span const> row_group_indices, std::optional> filter); /** - * @brief Preprocess step for the next input read pass. + * @brief Ratchet the pass/subpass/chunk process forward. + */ + void handle_chunking(bool uses_custom_row_bounds); + + /** + * @brief Setup step for the next input read pass. * * A 'pass' is defined as a subset of row groups read out of the globally * requested set of all row groups. */ - void preprocess_next_pass(); + void setup_next_pass(); + + /** + * @brief Setup step for the next decompression subpass. + * + * A 'subpass' is defined as a subset of pages within a pass that are + * decompressed and decoded as a batch. Subpasses may be further subdivided + * into output chunks. + */ + void setup_next_subpass(bool uses_custom_row_bounds); + + /** + * @brief Read a chunk of data and return an output table. + * + * This function is called internally and expects all preprocessing steps have already been done. + * + * @param uses_custom_row_bounds Whether or not num_rows and skip_rows represent user-specific + * bounds + * @param filter Optional AST expression to filter output rows + * @return The output table along with columns' metadata + */ + table_with_metadata read_chunk_internal( + bool uses_custom_row_bounds, + std::optional> filter); + + // utility functions + private: + /** + * @brief Read the set of column chunks to be processed for this pass. + * + * Does not decompress the chunk data. + * + * @return pair of boolean indicating if compressed chunks were found and a vector of futures for + * read completion + */ + std::pair>> read_column_chunks(); + + /** + * @brief Load compressed data and page information for the current pass. + */ + void load_compressed_data(); + + /** + * @brief Decompress dictionary data pages for a pass. + */ + void decompress_dict_data(); /** * @brief Build string dictionary indices for a pass. @@ -231,20 +263,6 @@ class reader::impl { */ void populate_metadata(table_metadata& out_metadata); - /** - * @brief Read a chunk of data and return an output table. - * - * This function is called internally and expects all preprocessing steps have already been done. - * - * @param uses_custom_row_bounds Whether or not num_rows and skip_rows represents user-specific - * bounds - * @param filter Optional AST expression to filter output rows - * @return The output table along with columns' metadata - */ - table_with_metadata read_chunk_internal( - bool uses_custom_row_bounds, - std::optional> filter); - /** * @brief Finalize the output table by adding empty columns for the non-selected columns in * schema. @@ -297,11 +315,6 @@ class reader::impl { */ void compute_input_passes(); - /** - * @brief Close out the existing pass (if any) and prepare for the next pass.
- */ - void setup_next_pass(); - /** * @brief Given a set of pages that have had their sizes computed by nesting level and * a limit on total read size, generate a set of {skip_rows, num_rows} pairs representing diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index 393b9a47a14..b050022aa92 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -217,6 +217,8 @@ struct get_page_size { __device__ cumulative_page_info operator()(PageInfo const& page) { + // we treat dictionary page sizes as 0 for subpasses because we have already paid the price for + // them at the pass level. if (page.flags & PAGEINFO_FLAGS_DICTIONARY) { return cumulative_page_info{0, 0, page.src_col_schema}; } @@ -475,7 +477,6 @@ std::pair, size_t> compute_next_subpass( cudaMemcpyDefault, stream.value())); stream.synchronize(); - // print_cumulative_row_info(h_aggregated_info, "adjusted"); // first, find the min row @@ -487,11 +488,18 @@ std::pair, size_t> compute_next_subpass( // find the next split auto const end_index = find_next_split(start_index, min_row, - // 0, h_aggregated_info, size_limit) + 1; // the split index returned is inclusive + /* + printf("Split: row(%lu -> %lu), size(%lu -> %lu)\n", + h_aggregated_info[start_index].row_count, + h_aggregated_info[end_index].row_count, + h_aggregated_info[start_index].size_bytes, + h_aggregated_info[end_index].size_bytes); + */ + // get the number of pages for each column/schema auto get_page_counts = [num_columns, stream]( rmm::device_uvector const& aggregated_info, @@ -571,7 +579,7 @@ std::pair, rmm::device_uvector> compute_page_sp } /** - * @brief Decompresses the page data, at page granularity. + * @brief Decompresses a set of pages contained in the set of chunks. * * This function handles the case where `pages` is only a subset of all available * pages in `chunks`. 
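// For context, the consumer-side loop this pass/subpass/chunk machinery serves
// looks roughly like the following (a sketch only; the byte limits are
// hypothetical values, and 0 means "unlimited"):
//
//   auto options = cudf::io::parquet_reader_options::builder(src_info).build();
//   cudf::io::chunked_parquet_reader reader(
//     output_chunk_limit, input_pass_limit, options);
//   while (reader.has_next()) {
//     auto chunk = reader.read_chunk();  // one output chunk per call
//     // ... consume chunk.tbl
//   }
//
// Internally, each read_chunk() call runs handle_chunking(), which advances
// output chunk -> subpass -> pass in that order, releasing each level's memory
// as it is exhausted.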
@@ -752,10 +760,6 @@ std::pair, rmm::device_uvector> compute_page_sp stream.synchronize(); } - // Update the page information in device memory with the updated value of - // page_data; it now points to the uncompressed data buffer - pages.host_to_device_async(stream); - return decomp_pages; } @@ -845,169 +849,68 @@ void detect_malformed_pages(cudf::detail::hostdevice_vector const& pag } } -} // anonymous namespace - -void reader::impl::create_global_chunk_info() -{ - auto const num_rows = _file_itm_data.global_num_rows; - auto const& row_groups_info = _file_itm_data.row_groups; - auto& chunks = _file_itm_data.chunks; - - // Descriptors for all the chunks that make up the selected columns - auto const num_input_columns = _input_columns.size(); - auto const num_chunks = row_groups_info.size() * num_input_columns; - - // Initialize column chunk information - auto remaining_rows = num_rows; - for (auto const& rg : row_groups_info) { - auto const& row_group = _metadata->get_row_group(rg.index, rg.source_index); - auto const row_group_start = rg.start_row; - auto const row_group_rows = std::min(remaining_rows, row_group.num_rows); - - // generate ColumnChunkDesc objects for everything to be decoded (all input columns) - for (size_t i = 0; i < num_input_columns; ++i) { - auto col = _input_columns[i]; - // look up metadata - auto& col_meta = _metadata->get_column_metadata(rg.index, rg.source_index, col.schema_idx); - auto& schema = _metadata->get_schema(col.schema_idx); - - auto [type_width, clock_rate, converted_type] = - conversion_info(to_type_id(schema, _strings_to_categorical, _timestamp_type.id()), - _timestamp_type.id(), - schema.type, - schema.converted_type, - schema.type_length); +struct is_dict_page { + device_span pages; + bool __device__ operator()(size_t i) { return pages[i].flags & PAGEINFO_FLAGS_DICTIONARY; } +}; - chunks.push_back(ColumnChunkDesc(col_meta.total_compressed_size, - nullptr, - col_meta.num_values, - schema.type, - type_width, - row_group_start, - row_group_rows, - schema.max_definition_level, - schema.max_repetition_level, - _metadata->get_output_nesting_depth(col.schema_idx), - required_bits(schema.max_definition_level), - required_bits(schema.max_repetition_level), - col_meta.codec, - converted_type, - schema.logical_type, - schema.decimal_precision, - clock_rate, - i, - col.schema_idx)); - } +struct get_page_by_index { + device_span pages; + PageInfo __device__ operator()(size_t i) { return pages[i]; } +}; - remaining_rows -= row_group_rows; +struct copy_data_ptr { + device_span pass_pages; + device_span dict_pages; + device_span dict_page_indices; + void __device__ operator()(size_t i) + { + pass_pages[dict_page_indices[i]].page_data = dict_pages[i].page_data; } -} +}; -void reader::impl::compute_input_passes() -{ - // at this point, row_groups has already been filtered down to just the row groups we need to - // handle optional skip_rows/num_rows parameters. - auto const& row_groups_info = _file_itm_data.row_groups; +} // anonymous namespace - // if the user hasn't specified an input size limit, read everything in a single pass. - if (_input_pass_read_limit == 0) { - _file_itm_data.input_pass_row_group_offsets.push_back(0); - _file_itm_data.input_pass_row_group_offsets.push_back(row_groups_info.size()); - return; +void reader::impl::handle_chunking(bool uses_custom_row_bounds) +{ + // if this is our first time in here, setup the first pass. + if (!_pass_itm_data) { + // setup the next pass + setup_next_pass(); } - // generate passes. 
make sure to account for the case where a single row group doesn't fit within - // - std::size_t const read_limit = - _input_pass_read_limit > 0 ? _input_pass_read_limit : std::numeric_limits::max(); - std::size_t cur_pass_byte_size = 0; - std::size_t cur_rg_start = 0; - std::size_t cur_row_count = 0; - _file_itm_data.input_pass_row_group_offsets.push_back(0); - _file_itm_data.input_pass_row_count.push_back(0); - - for (size_t cur_rg_index = 0; cur_rg_index < row_groups_info.size(); cur_rg_index++) { - auto const& rgi = row_groups_info[cur_rg_index]; - auto const& row_group = _metadata->get_row_group(rgi.index, rgi.source_index); - - // total compressed size and total size (compressed + uncompressed) for - auto const [compressed_rg_size, _ /*compressed + uncompressed*/] = - get_row_group_size(row_group); + auto& pass = *_pass_itm_data; - // can we add this row group - if (cur_pass_byte_size + compressed_rg_size >= read_limit) { - // A single row group (the current one) is larger than the read limit: - // We always need to include at least one row group, so end the pass at the end of the current - // row group - if (cur_rg_start == cur_rg_index) { - _file_itm_data.input_pass_row_group_offsets.push_back(cur_rg_index + 1); - _file_itm_data.input_pass_row_count.push_back(cur_row_count + row_group.num_rows); - cur_rg_start = cur_rg_index + 1; - cur_pass_byte_size = 0; - } - // End the pass at the end of the previous row group - else { - _file_itm_data.input_pass_row_group_offsets.push_back(cur_rg_index); - _file_itm_data.input_pass_row_count.push_back(cur_row_count); - cur_rg_start = cur_rg_index; - cur_pass_byte_size = compressed_rg_size; - } - } else { - cur_pass_byte_size += compressed_rg_size; + // if we already have a subpass in flight. + if (pass.subpass != nullptr) { + // if it still has more chunks in flight, there's nothing more to do + if (pass.subpass->current_output_chunk < pass.subpass->output_chunk_read_info.size()) { + return; } - cur_row_count += row_group.num_rows; - } - - // add the last pass if necessary - if (_file_itm_data.input_pass_row_group_offsets.back() != row_groups_info.size()) { - _file_itm_data.input_pass_row_group_offsets.push_back(row_groups_info.size()); - _file_itm_data.input_pass_row_count.push_back(cur_row_count); - } -} -void reader::impl::compute_chunks_for_subpass() -{ - auto& pass = *_pass_itm_data; - auto& subpass = *pass.subpass; + // release the old subpass (will free memory) + pass.subpass.reset(); - // simple case : no chunk size, no splits - if (_output_chunk_read_limit <= 0) { - subpass.output_chunk_read_info.push_back({subpass.skip_rows, subpass.num_rows}); - return; - } + // otherwise we are done with the pass entirely + if (pass.processed_rows == pass.num_rows) { + // release the old pass + _pass_itm_data.reset(); - // generate cumulative row counts and sizes - rmm::device_uvector c_info(subpass.pages.size(), _stream); - // convert PageInfo to cumulative_page_info - auto page_input = - thrust::make_transform_iterator(subpass.pages.d_begin(), get_cumulative_page_info{}); - auto page_keys = make_page_key_iterator(subpass.pages); - thrust::inclusive_scan_by_key(rmm::exec_policy(_stream), - page_keys, - page_keys + subpass.pages.size(), - page_input, - c_info.begin(), - thrust::equal_to{}, - cumulative_page_sum{}); - // print_cumulative_page_info(subpass.pages, c_info, _stream); + _file_itm_data._current_input_pass++; + auto const num_passes = _file_itm_data.input_pass_row_group_offsets.size() - 1; + // no more passes. 
we are absolutely done with this file. + if (_file_itm_data._current_input_pass == num_passes) { return; } - // compute the splits - auto [splits, _] = - compute_page_splits_by_row(c_info, subpass.pages, _output_chunk_read_limit, _stream); - subpass.output_chunk_read_info.reserve(splits.size()); + // setup the next pass + setup_next_pass(); + } + } - // apply skip_rows from the subpass - std::transform(splits.begin(), - splits.end(), - std::back_inserter(subpass.output_chunk_read_info), - [&subpass](split_info const& s) { - row_range r = s.rows; - r.skip_rows += subpass.skip_rows; - return r; - }); + // setup the next sub pass + setup_next_subpass(uses_custom_row_bounds); } -void reader::impl::preprocess_next_pass() +void reader::impl::setup_next_pass() { auto const num_passes = _file_itm_data.input_pass_row_group_offsets.size() - 1; @@ -1076,33 +979,14 @@ void reader::impl::preprocess_next_pass() // confuses the chunked reader detect_malformed_pages(pass.pages, pass.chunks, pass.num_rows, _stream); + // decompress dictionary data if applicable. + if (pass.has_compressed_data) { decompress_dict_data(); } + // since there is only ever 1 dictionary per chunk (the 0th path), do it at the // pass level. build_string_dict_indices(); - // compute offsets to each group of input pages. this also gives us the number of unique - // columns in the input - // page_keys: 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3 - // - // result: 0, 4, 8 - rmm::device_uvector page_counts(pass.pages.size() + 1, _stream); - auto page_keys = make_page_key_iterator(pass.pages); - auto const page_counts_end = thrust::reduce_by_key(rmm::exec_policy(_stream), - page_keys, - page_keys + pass.pages.size(), - thrust::make_constant_iterator(1), - thrust::make_discard_iterator(), - page_counts.begin()) - .second; - auto const num_page_counts = page_counts_end - page_counts.begin(); - pass.page_offsets = cudf::detail::hostdevice_vector(num_page_counts + 1, _stream); - thrust::exclusive_scan(rmm::exec_policy(_stream), - page_counts.begin(), - page_counts.begin() + num_page_counts + 1, - pass.page_offsets.d_begin()); - pass.page_offsets.device_to_host_async(_stream); - - pass.page_processed_counts = std::vector(num_page_counts); + pass.page_processed_counts = std::vector(pass.page_offsets.size() - 1); std::fill(pass.page_processed_counts.begin(), pass.page_processed_counts.end(), 0); // compute subpasses for this pass using the page information we now have. @@ -1118,42 +1002,9 @@ void reader::impl::preprocess_next_pass() } } -void reader::impl::handle_chunking(bool uses_custom_row_bounds) +void reader::impl::setup_next_subpass(bool uses_custom_row_bounds) { - // if this is our first time in here, setup the first pass. - if (!_pass_itm_data) { - // preprocess the next pass - preprocess_next_pass(); - } - - auto& pass = *_pass_itm_data; - - // if we already have a subpass in flight. - if (pass.subpass != nullptr) { - // if it still has more chunks in flight, there's nothing more to do - if (pass.subpass->current_output_chunk < pass.subpass->output_chunk_read_info.size()) { - return; - } - - // release the old subpass (will free memory) - pass.subpass.reset(); - - // otherwise we are done with the pass entirely - if (pass.processed_rows == pass.num_rows) { - // release the old pass - _pass_itm_data.reset(); - - _file_itm_data._current_input_pass++; - auto const num_passes = _file_itm_data.input_pass_row_group_offsets.size() - 1; - // no more passes. we are absolutely done with this file. 
- if (_file_itm_data._current_input_pass == num_passes) { return; } - - // preprocess the next pass - preprocess_next_pass(); - } - } - - // next pass + auto& pass = *_pass_itm_data; pass.subpass = std::make_unique(); auto& subpass = *pass.subpass; @@ -1220,23 +1071,18 @@ void reader::impl::handle_chunking(bool uses_custom_row_bounds) page_indices[c_idx].start); page_count += num_column_pages; } - subpass.pages.host_to_device_async(_stream); - subpass.page_src_index.host_to_device_async(_stream); - // print_hostdevice_vector(subpass.page_src_index); - // decompress the pages + // decompress the data for the pages in this subpass. if (pass.has_compressed_data) { subpass.decomp_page_data = decompress_page_data(pass.chunks, subpass.pages, _stream); - /* - // Free compressed data - for (size_t c = 0; c < chunks.size(); c++) { - if (chunks[c].codec != Compression::UNCOMPRESSED) { raw_page_data[c].reset(); } - } - */ } + + subpass.pages.host_to_device_async(_stream); + subpass.page_src_index.host_to_device_async(_stream); + _stream.synchronize(); + // buffers needed by the decode kernels - subpass.pages.device_to_host_sync(_stream); { // nesting information (sizes, etc) stored -per page- // note : even for flat schemas, we allocate 1 level of "nesting" info @@ -1252,4 +1098,200 @@ void reader::impl::handle_chunking(bool uses_custom_row_bounds) preprocess_subpass_pages(uses_custom_row_bounds, _output_chunk_read_limit); } +void reader::impl::create_global_chunk_info() +{ + auto const num_rows = _file_itm_data.global_num_rows; + auto const& row_groups_info = _file_itm_data.row_groups; + auto& chunks = _file_itm_data.chunks; + + // Descriptors for all the chunks that make up the selected columns + auto const num_input_columns = _input_columns.size(); + auto const num_chunks = row_groups_info.size() * num_input_columns; + + // Initialize column chunk information + auto remaining_rows = num_rows; + for (auto const& rg : row_groups_info) { + auto const& row_group = _metadata->get_row_group(rg.index, rg.source_index); + auto const row_group_start = rg.start_row; + auto const row_group_rows = std::min(remaining_rows, row_group.num_rows); + + // generate ColumnChunkDesc objects for everything to be decoded (all input columns) + for (size_t i = 0; i < num_input_columns; ++i) { + auto col = _input_columns[i]; + // look up metadata + auto& col_meta = _metadata->get_column_metadata(rg.index, rg.source_index, col.schema_idx); + auto& schema = _metadata->get_schema(col.schema_idx); + + auto [type_width, clock_rate, converted_type] = + conversion_info(to_type_id(schema, _strings_to_categorical, _timestamp_type.id()), + _timestamp_type.id(), + schema.type, + schema.converted_type, + schema.type_length); + + chunks.push_back(ColumnChunkDesc(col_meta.total_compressed_size, + nullptr, + col_meta.num_values, + schema.type, + type_width, + row_group_start, + row_group_rows, + schema.max_definition_level, + schema.max_repetition_level, + _metadata->get_output_nesting_depth(col.schema_idx), + required_bits(schema.max_definition_level), + required_bits(schema.max_repetition_level), + col_meta.codec, + converted_type, + schema.logical_type, + schema.decimal_precision, + clock_rate, + i, + col.schema_idx)); + } + + remaining_rows -= row_group_rows; + } +} + +void reader::impl::compute_input_passes() +{ + // at this point, row_groups has already been filtered down to just the row groups we need to + // handle optional skip_rows/num_rows parameters. 
+  auto const& row_groups_info = _file_itm_data.row_groups;
+
+  // if the user hasn't specified an input size limit, read everything in a single pass.
+  if (_input_pass_read_limit == 0) {
+    _file_itm_data.input_pass_row_group_offsets.push_back(0);
+    _file_itm_data.input_pass_row_group_offsets.push_back(row_groups_info.size());
+    return;
+  }
+
+  // generate passes. make sure to account for the case where a single row group doesn't fit
+  // within the read limit by itself.
+  std::size_t const read_limit =
+    _input_pass_read_limit > 0 ? _input_pass_read_limit : std::numeric_limits<std::size_t>::max();
+  std::size_t cur_pass_byte_size = 0;
+  std::size_t cur_rg_start       = 0;
+  std::size_t cur_row_count      = 0;
+  _file_itm_data.input_pass_row_group_offsets.push_back(0);
+  _file_itm_data.input_pass_row_count.push_back(0);
+
+  for (size_t cur_rg_index = 0; cur_rg_index < row_groups_info.size(); cur_rg_index++) {
+    auto const& rgi       = row_groups_info[cur_rg_index];
+    auto const& row_group = _metadata->get_row_group(rgi.index, rgi.source_index);
+
+    // total compressed size and total size (compressed + uncompressed) for this row group
+    auto const [compressed_rg_size, _ /*compressed + uncompressed*/] =
+      get_row_group_size(row_group);
+
+    // can we add this row group to the current pass?
+    if (cur_pass_byte_size + compressed_rg_size >= read_limit) {
+      // A single row group (the current one) is larger than the read limit:
+      // We always need to include at least one row group, so end the pass at the end of the current
+      // row group
+      if (cur_rg_start == cur_rg_index) {
+        _file_itm_data.input_pass_row_group_offsets.push_back(cur_rg_index + 1);
+        _file_itm_data.input_pass_row_count.push_back(cur_row_count + row_group.num_rows);
+        cur_rg_start       = cur_rg_index + 1;
+        cur_pass_byte_size = 0;
+      }
+      // End the pass at the end of the previous row group
+      else {
+        _file_itm_data.input_pass_row_group_offsets.push_back(cur_rg_index);
+        _file_itm_data.input_pass_row_count.push_back(cur_row_count);
+        cur_rg_start       = cur_rg_index;
+        cur_pass_byte_size = compressed_rg_size;
+      }
+    } else {
+      cur_pass_byte_size += compressed_rg_size;
+    }
+    cur_row_count += row_group.num_rows;
+  }
+
+  // add the last pass if necessary
+  if (_file_itm_data.input_pass_row_group_offsets.back() != row_groups_info.size()) {
+    _file_itm_data.input_pass_row_group_offsets.push_back(row_groups_info.size());
+    _file_itm_data.input_pass_row_count.push_back(cur_row_count);
+  }
+}
+
+void reader::impl::decompress_dict_data()
+{
+  auto& pass = *_pass_itm_data;
+
+  // collect all dictionary pages in the pass
+  rmm::device_uvector<size_t> dict_page_indices(pass.pages.size(), _stream);
+  auto iter       = thrust::make_counting_iterator(0);
+  auto last_index = thrust::copy_if(rmm::exec_policy(_stream),
+                                    iter,
+                                    iter + pass.pages.size(),
+                                    dict_page_indices.begin(),
+                                    is_dict_page{pass.pages});
+
+  // print_vector(dict_page_indices);
+
+  cudf::detail::hostdevice_vector<PageInfo> dict_pages(last_index - dict_page_indices.begin(),
+                                                       _stream);
+  auto last_page = thrust::transform(rmm::exec_policy(_stream),
+                                     dict_page_indices.begin(),
+                                     dict_page_indices.begin() + dict_pages.size(),
+                                     dict_pages.d_begin(),
+                                     get_page_by_index{pass.pages});
+
+  // decompress
+  pass.decomp_dict_data = decompress_page_data(pass.chunks, dict_pages, _stream);
+
+  // copy the data pointers back to the pages in the pass. chunk.dict_page always
+  // references these.
+ thrust::for_each(rmm::exec_policy(_stream), + iter, + iter + dict_pages.size(), + copy_data_ptr{pass.pages, dict_pages, dict_page_indices}); + + pass.pages.device_to_host_sync(_stream); +} + +void reader::impl::compute_chunks_for_subpass() +{ + auto& pass = *_pass_itm_data; + auto& subpass = *pass.subpass; + + // simple case : no chunk size, no splits + if (_output_chunk_read_limit <= 0) { + subpass.output_chunk_read_info.push_back({subpass.skip_rows, subpass.num_rows}); + return; + } + + // generate cumulative row counts and sizes + rmm::device_uvector c_info(subpass.pages.size(), _stream); + // convert PageInfo to cumulative_page_info + auto page_input = + thrust::make_transform_iterator(subpass.pages.d_begin(), get_cumulative_page_info{}); + auto page_keys = make_page_key_iterator(subpass.pages); + thrust::inclusive_scan_by_key(rmm::exec_policy(_stream), + page_keys, + page_keys + subpass.pages.size(), + page_input, + c_info.begin(), + thrust::equal_to{}, + cumulative_page_sum{}); + // print_cumulative_page_info(subpass.pages, c_info, _stream); + + // compute the splits + auto [splits, _] = + compute_page_splits_by_row(c_info, subpass.pages, _output_chunk_read_limit, _stream); + subpass.output_chunk_read_info.reserve(splits.size()); + + // apply skip_rows from the subpass + std::transform(splits.begin(), + splits.end(), + std::back_inserter(subpass.output_chunk_read_info), + [&subpass](split_info const& s) { + row_range r = s.rows; + r.skip_rows += subpass.skip_rows; + return r; + }); +} + } // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl_chunking.hpp b/cpp/src/io/parquet/reader_impl_chunking.hpp index 74ec0f24f7d..9469fe656e6 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.hpp +++ b/cpp/src/io/parquet/reader_impl_chunking.hpp @@ -107,6 +107,7 @@ struct pass_intermediate_data { // of how many pages we have processed so far std::vector page_processed_counts{}; + rmm::device_buffer decomp_dict_data{0, rmm::cuda_stream_default}; rmm::device_uvector str_dict_index{0, rmm::cuda_stream_default}; int level_type_size{0}; diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index 531b416023c..f24bcc88fe5 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -300,32 +300,31 @@ void generate_depth_remappings(std::map, std::ve } /** - * @brief Decode the page information from the given column chunks. + * @brief Decode the page information for a given pass. + * + * @param pass_intermediate_data The struct containing pass information * - * @param chunks List of column chunk descriptors - * @param pages List of page information - * @param stream CUDA stream used for device memory operations and kernel launches - * @returns The size in bytes of level type data required */ -int decode_page_headers(cudf::detail::hostdevice_vector& chunks, - cudf::detail::hostdevice_vector& pages, - rmm::cuda_stream_view stream) +void decode_page_headers(pass_intermediate_data& pass, rmm::cuda_stream_view stream) { - cudf::detail::hostdevice_vector chunk_page_info(chunks.size(), stream); + cudf::detail::hostdevice_vector chunk_page_info(pass.chunks.size(), stream); // IMPORTANT : if you change how pages are stored within a chunk (dist pages, then data pages), // please update preprocess_nested_columns to reflect this. 
- for (size_t c = 0, page_count = 0; c < chunks.size(); c++) { - chunks[c].max_num_pages = chunks[c].num_data_pages + chunks[c].num_dict_pages; - chunk_page_info[c].pages = pages.device_ptr(page_count); - page_count += chunks[c].max_num_pages; + for (size_t c = 0, page_count = 0; c < pass.chunks.size(); c++) { + pass.chunks[c].max_num_pages = pass.chunks[c].num_data_pages + pass.chunks[c].num_dict_pages; + chunk_page_info[c].pages = pass.pages.device_ptr(page_count); + page_count += pass.chunks[c].max_num_pages; } kernel_error error_code(stream); - chunks.host_to_device_async(stream); + pass.chunks.host_to_device_async(stream); chunk_page_info.host_to_device_async(stream); - DecodePageHeaders( - chunks.device_ptr(), chunk_page_info.device_ptr(), chunks.size(), error_code.data(), stream); + DecodePageHeaders(pass.chunks.device_ptr(), + chunk_page_info.device_ptr(), + pass.chunks.size(), + error_code.data(), + stream); if (error_code.value() != 0) { // TODO(ets): if an unsupported encoding was detected, do extra work to figure out which one @@ -334,7 +333,7 @@ int decode_page_headers(cudf::detail::hostdevice_vector& chunks // compute max bytes needed for level data auto level_bit_size = cudf::detail::make_counting_transform_iterator( - 0, [chunks = chunks.d_begin()] __device__(int i) { + 0, [chunks = pass.chunks.d_begin()] __device__(int i) { auto c = chunks[i]; return static_cast( max(c.level_bits[level_type::REPETITION], c.level_bits[level_type::DEFINITION])); @@ -342,9 +341,10 @@ int decode_page_headers(cudf::detail::hostdevice_vector& chunks // max level data bit size. int const max_level_bits = thrust::reduce(rmm::exec_policy(stream), level_bit_size, - level_bit_size + chunks.size(), + level_bit_size + pass.chunks.size(), 0, thrust::maximum()); + pass.level_type_size = std::max(1, cudf::util::div_rounding_up_safe(max_level_bits, 8)); // sort the pages in schema order. // @@ -366,28 +366,61 @@ int decode_page_headers(cudf::detail::hostdevice_vector& chunks // // We also need to preserve key-relative page ordering, so we need to use a stable sort. { - rmm::device_uvector page_keys{pages.size(), stream}; + rmm::device_uvector page_keys{pass.pages.size(), stream}; thrust::transform(rmm::exec_policy(stream), - pages.d_begin(), - pages.d_begin() + pages.size(), + pass.pages.d_begin(), + pass.pages.d_begin() + pass.pages.size(), page_keys.begin(), [] __device__(PageInfo const& page) { return page.src_col_schema; }); thrust::stable_sort_by_key(rmm::exec_policy(stream), page_keys.begin(), page_keys.end(), - pages.d_begin(), + pass.pages.d_begin(), thrust::less()); } - pages.device_to_host_sync(stream); + // compute offsets to each group of input pages. 
+ // page_keys: 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3 + // + // result: 0, 4, 8 + rmm::device_uvector page_counts(pass.pages.size() + 1, stream); + auto page_keys = make_page_key_iterator(pass.pages); + auto const page_counts_end = thrust::reduce_by_key(rmm::exec_policy(stream), + page_keys, + page_keys + pass.pages.size(), + thrust::make_constant_iterator(1), + thrust::make_discard_iterator(), + page_counts.begin()) + .second; + auto const num_page_counts = page_counts_end - page_counts.begin(); + pass.page_offsets = cudf::detail::hostdevice_vector(num_page_counts + 1, stream); + thrust::exclusive_scan(rmm::exec_policy(stream), + page_counts.begin(), + page_counts.begin() + num_page_counts + 1, + pass.page_offsets.d_begin()); + + // setup dict_page for each chunk if necessary + auto iter = thrust::make_counting_iterator(0); + thrust::for_each(rmm::exec_policy(stream), + iter, + iter + pass.chunks.size(), + [chunks = pass.chunks.d_begin(), + pages = pass.pages.d_begin(), + page_offsets = pass.page_offsets.d_begin()] __device__(size_t i) { + auto& chunk = chunks[i]; + if (chunk.num_dict_pages > 0) { chunk.dict_page = &pages[page_offsets[i]]; } + }); + + pass.page_offsets.device_to_host_async(stream); + pass.pages.device_to_host_async(stream); + pass.chunks.device_to_host_async(stream); + stream.synchronize(); // validate page encodings - CUDF_EXPECTS(std::all_of(pages.begin(), - pages.end(), + CUDF_EXPECTS(std::all_of(pass.pages.begin(), + pass.pages.end(), [](auto const& page) { return is_supported_encoding(page.encoding); }), "Unsupported page encoding detected"); - - return std::max(1, cudf::util::div_rounding_up_safe(max_level_bits, 8)); } } // namespace @@ -686,7 +719,7 @@ void reader::impl::load_compressed_data() pages = cudf::detail::hostdevice_vector(total_pages, total_pages, _stream); // decoding of column/page information - pass.level_type_size = decode_page_headers(chunks, pages, _stream); + decode_page_headers(pass, _stream); } namespace { diff --git a/cpp/tests/io/parquet_chunked_reader_test.cpp b/cpp/tests/io/parquet_chunked_reader_test.cpp index 05fb9a3ec48..f8d2a5e98e1 100644 --- a/cpp/tests/io/parquet_chunked_reader_test.cpp +++ b/cpp/tests/io/parquet_chunked_reader_test.cpp @@ -1014,3 +1014,84 @@ TEST_F(ParquetChunkedReaderTest, InputLimitSimple) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->get_column(0)); } } + +struct ParquetChunkedSubRowgroupReaderTest : public cudf::test::BaseFixture {}; + +void sub_rowgroup_test(std::string const& filepath, + cudf::table_view const& t, + size_t output_limit, + size_t input_limit) +{ + // uncompressed, no dictionary + { + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, t) + .compression(cudf::io::compression_type::NONE) + .dictionary_policy(cudf::io::dictionary_policy::NEVER); + cudf::io::write_parquet(out_opts); + + auto result = chunked_read(filepath, output_limit, input_limit); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result.first, t); + } + + // compressed, no dictionary + { + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, t) + .compression(cudf::io::compression_type::SNAPPY) + .dictionary_policy(cudf::io::dictionary_policy::NEVER); + cudf::io::write_parquet(out_opts); + + auto result = chunked_read(filepath, output_limit, input_limit); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result.first, t); + } + + // uncompressed, dictionary + { + cudf::io::parquet_writer_options out_opts = + 
cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, t) + .compression(cudf::io::compression_type::NONE) + .dictionary_policy(cudf::io::dictionary_policy::ALWAYS); + cudf::io::write_parquet(out_opts); + + auto result = chunked_read(filepath, output_limit, input_limit); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result.first, t); + } + + // compressed, dictionary + { + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, t) + .compression(cudf::io::compression_type::SNAPPY) + .dictionary_policy(cudf::io::dictionary_policy::ALWAYS); + cudf::io::write_parquet(out_opts); + + auto result = chunked_read(filepath, output_limit, input_limit); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result.first, t); + } +} + +TEST_F(ParquetChunkedSubRowgroupReaderTest, SingleFixedWidthColumnNoSplits) +{ + auto filepath = std::string("table_with_dict.parquet"); + constexpr auto num_rows = 100; + auto iter1 = cudf::detail::make_counting_transform_iterator(0, [](int i) { return 15; }); + cudf::test::fixed_width_column_wrapper col1(iter1, iter1 + num_rows); + auto tbl = cudf::table_view{{col1}}; + sub_rowgroup_test(filepath, tbl, 0, 100 * 1024 * 1024); +} + +TEST_F(ParquetChunkedSubRowgroupReaderTest, MultipleFixedWidthColumns) +{ + auto filepath = std::string("multiple_col_fixed_width.parquet"); + constexpr auto num_rows = 200000; + + auto iter1 = thrust::make_counting_iterator(0); + cudf::test::fixed_width_column_wrapper col1(iter1, iter1 + num_rows); + + auto iter2 = thrust::make_counting_iterator(0); + cudf::test::fixed_width_column_wrapper col2(iter2, iter2 + num_rows); + + auto tbl = cudf::table_view{{col1, col2}}; + sub_rowgroup_test(filepath, tbl, 0, 1 * 1024 * 1024); +} From df67c5c8cd3a6f02446374543481b9bff3d53bd8 Mon Sep 17 00:00:00 2001 From: db Date: Wed, 8 Nov 2023 16:23:04 -0600 Subject: [PATCH 10/49] Fixed an issue with double-decompression of dictionaries. Greatly simplified the logic on dictionary vs. non-dictionary page decompression steps. --- cpp/src/io/parquet/reader_impl.hpp | 5 -- cpp/src/io/parquet/reader_impl_chunking.cu | 76 +++++----------------- 2 files changed, 16 insertions(+), 65 deletions(-) diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index c8190d60dc9..1f1fb5301c8 100644 --- a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -208,11 +208,6 @@ class reader::impl { */ void load_compressed_data(); - /** - * @brief Decompress dictionary data pages for a pass - */ - void decompress_dict_data(); - /** * @brief Build string dictionary indices for a pass. * diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index b050022aa92..5107097a0c2 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -586,6 +586,8 @@ std::pair, rmm::device_uvector> compute_page_sp * * @param chunks List of column chunk descriptors * @param pages List of page information + * @param dict_pages If true, decompress dictionary pages only. Otherwise decompress non-dictionary + * pages only. 
* @param stream CUDA stream used for device memory operations and kernel launches * * @return Device buffer to decompressed page data @@ -593,11 +595,16 @@ std::pair, rmm::device_uvector> compute_page_sp [[nodiscard]] rmm::device_buffer decompress_page_data( cudf::detail::hostdevice_vector const& chunks, cudf::detail::hostdevice_vector& pages, + bool dict_pages, rmm::cuda_stream_view stream) { auto for_each_codec_page = [&](Compression codec, std::function const& f) { for (size_t p = 0; p < pages.size(); p++) { - if (chunks[pages[p].chunk_idx].codec == codec) { f(p); } + if (chunks[pages[p].chunk_idx].codec == codec && + ((dict_pages && (pages[p].flags & PAGEINFO_FLAGS_DICTIONARY)) || + (!dict_pages && !(pages[p].flags & PAGEINFO_FLAGS_DICTIONARY)))) { + f(p); + } } }; @@ -757,9 +764,12 @@ std::pair, rmm::device_uvector> compute_page_sp copy_out, stream, rmm::mr::get_current_device_resource()); gpu_copy_uncompressed_blocks(d_copy_in, d_copy_out, stream); - stream.synchronize(); } + pages.host_to_device_async(stream); + + stream.synchronize(); + return decomp_pages; } @@ -849,26 +859,6 @@ void detect_malformed_pages(cudf::detail::hostdevice_vector const& pag } } -struct is_dict_page { - device_span pages; - bool __device__ operator()(size_t i) { return pages[i].flags & PAGEINFO_FLAGS_DICTIONARY; } -}; - -struct get_page_by_index { - device_span pages; - PageInfo __device__ operator()(size_t i) { return pages[i]; } -}; - -struct copy_data_ptr { - device_span pass_pages; - device_span dict_pages; - device_span dict_page_indices; - void __device__ operator()(size_t i) - { - pass_pages[dict_page_indices[i]].page_data = dict_pages[i].page_data; - } -}; - } // anonymous namespace void reader::impl::handle_chunking(bool uses_custom_row_bounds) @@ -980,7 +970,9 @@ void reader::impl::setup_next_pass() detect_malformed_pages(pass.pages, pass.chunks, pass.num_rows, _stream); // decompress dictionary data if applicable. - if (pass.has_compressed_data) { decompress_dict_data(); } + if (pass.has_compressed_data) { + pass.decomp_dict_data = decompress_page_data(pass.chunks, pass.pages, true, _stream); + } // since there is only ever 1 dictionary per chunk (the 0th path), do it at the // pass level. @@ -1075,7 +1067,7 @@ void reader::impl::setup_next_subpass(bool uses_custom_row_bounds) // decompress the data for the pages in this subpass. 
if (pass.has_compressed_data) { - subpass.decomp_page_data = decompress_page_data(pass.chunks, subpass.pages, _stream); + subpass.decomp_page_data = decompress_page_data(pass.chunks, subpass.pages, false, _stream); } subpass.pages.host_to_device_async(_stream); @@ -1216,42 +1208,6 @@ void reader::impl::compute_input_passes() } } -void reader::impl::decompress_dict_data() -{ - auto& pass = *_pass_itm_data; - - // collect all dictionary pages in the pass - rmm::device_uvector dict_page_indices(pass.pages.size(), _stream); - auto iter = thrust::make_counting_iterator(0); - auto last_index = thrust::copy_if(rmm::exec_policy(_stream), - iter, - iter + pass.pages.size(), - dict_page_indices.begin(), - is_dict_page{pass.pages}); - - // print_vector(dict_page_indices); - - cudf::detail::hostdevice_vector dict_pages(last_index - dict_page_indices.begin(), - _stream); - auto last_page = thrust::transform(rmm::exec_policy(_stream), - dict_page_indices.begin(), - dict_page_indices.begin() + dict_pages.size(), - dict_pages.d_begin(), - get_page_by_index{pass.pages}); - - // decompress - pass.decomp_dict_data = decompress_page_data(pass.chunks, dict_pages, _stream); - - // copy the data pointers back to the pages in the pass. chunk.dict_page always - // references these. - thrust::for_each(rmm::exec_policy(_stream), - iter, - iter + dict_pages.size(), - copy_data_ptr{pass.pages, dict_pages, dict_page_indices}); - - pass.pages.device_to_host_sync(_stream); -} - void reader::impl::compute_chunks_for_subpass() { auto& pass = *_pass_itm_data; From c13eb58883cec3d9c4d07e8d6731dd3dd9d33ed4 Mon Sep 17 00:00:00 2001 From: db Date: Wed, 8 Nov 2023 17:30:46 -0600 Subject: [PATCH 11/49] Fixed an issue with mis-allocation of nesting info structs for list columns. --- cpp/src/io/parquet/reader_impl_preprocess.cu | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index f24bcc88fe5..5d4cb0d2921 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -506,9 +506,6 @@ void reader::impl::allocate_nesting_info() auto const per_page_nesting_info_size = std::max( schema.max_definition_level + 1, _metadata->get_output_nesting_depth(src_col_schema)); - // skip my dict pages - CUDF_EXPECTS(chunks[idx].num_dict_pages <= 1, "Unexpected dictionary page count for chunk"); - target_page_index += chunks[idx].num_dict_pages; for (size_t p_idx = 0; p_idx < subpass.chunk_page_count[idx]; p_idx++) { pages[target_page_index + p_idx].nesting = page_nesting_info.device_ptr() + src_info_index; pages[target_page_index + p_idx].nesting_decode = @@ -949,7 +946,10 @@ struct update_pass_num_rows { device_span subpass_pages; device_span page_src_index; - void operator()(size_t i) { pass_pages[page_src_index[i]].num_rows = subpass_pages[i].num_rows; } + __device__ void operator()(size_t i) + { + pass_pages[page_src_index[i]].num_rows = subpass_pages[i].num_rows; + } }; void reader::impl::preprocess_subpass_pages(bool uses_custom_row_bounds, size_t chunk_read_limit) From 0a3a9183cb5779dee3204b15e3b16aca171436b1 Mon Sep 17 00:00:00 2001 From: db Date: Wed, 8 Nov 2023 19:08:24 -0600 Subject: [PATCH 12/49] Fix an issue with setting up nesting information for lists. Fixed incorrect row counting. 
Found and fixed a long-standing (but benign) bug where we were recomputing column
information multiple times - once for each rowgroup-chunk in the file instead of
just once for the whole file.
---
 cpp/src/io/parquet/reader_impl.cpp           |  5 +--
 cpp/src/io/parquet/reader_impl_chunking.cu   |  7 +--
 cpp/src/io/parquet/reader_impl_chunking.hpp  | 15 +++---
 cpp/src/io/parquet/reader_impl_preprocess.cu | 48 +++++++++++---------
 4 files changed, 35 insertions(+), 40 deletions(-)

diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp
index e7ec419e470..092ffceb88f 100644
--- a/cpp/src/io/parquet/reader_impl.cpp
+++ b/cpp/src/io/parquet/reader_impl.cpp
@@ -80,7 +80,7 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows)
     cudf::detail::hostdevice_vector(has_strings ? sum_max_depths : 0, _stream);
 
   // Update chunks with pointers to column data.
-  for (size_t c = 0, page_count = 0, chunk_off = 0; c < pass.chunks.size(); c++) {
+  for (size_t c = 0, chunk_off = 0; c < pass.chunks.size(); c++) {
     input_column_info const& input_col = _input_columns[pass.chunks[c].src_col_index];
     CUDF_EXPECTS(input_col.schema_idx == pass.chunks[c].src_col_schema,
                  "Column/page schema index mismatch");
@@ -156,9 +156,6 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows)
         data[idx] = nullptr;
       }
     }
-
-    // column_data_base will always point to leaf data, even for nested types.
-    page_count += subpass.chunk_page_count[c];
   }
 
   pass.chunks.host_to_device_async(_stream);
diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu
index 5107097a0c2..656e44b0e35 100644
--- a/cpp/src/io/parquet/reader_impl_chunking.cu
+++ b/cpp/src/io/parquet/reader_impl_chunking.cu
@@ -978,9 +978,6 @@ void reader::impl::setup_next_pass()
   // pass level.
   build_string_dict_indices();
 
-  pass.page_processed_counts = std::vector<size_t>(pass.page_offsets.size() - 1);
-  std::fill(pass.page_processed_counts.begin(), pass.page_processed_counts.end(), 0);
-
   // compute subpasses for this pass using the page information we now have.
// compute_subpasses(); /* @@ -1000,7 +997,7 @@ void reader::impl::setup_next_subpass(bool uses_custom_row_bounds) pass.subpass = std::make_unique(); auto& subpass = *pass.subpass; - auto const num_columns = pass.page_offsets.size() - 1; + auto const num_columns = _input_columns.size(); auto [page_indices, total_pages] = [&]() -> std::pair, size_t> { // special case: if we contain no compressed data, or if we have no input limit, we can always @@ -1051,7 +1048,7 @@ void reader::impl::setup_next_subpass(bool uses_custom_row_bounds) size_t page_count = 0; for (size_t c_idx = 0; c_idx < num_columns; c_idx++) { auto const num_column_pages = page_indices[c_idx].end - page_indices[c_idx].start; - subpass.chunk_page_count.push_back(num_column_pages); + subpass.column_page_count.push_back(num_column_pages); std::copy(pass.pages.begin() + page_indices[c_idx].start, pass.pages.begin() + page_indices[c_idx].end, std::back_inserter(subpass.pages)); diff --git a/cpp/src/io/parquet/reader_impl_chunking.hpp b/cpp/src/io/parquet/reader_impl_chunking.hpp index 9469fe656e6..fc07c793ac4 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.hpp +++ b/cpp/src/io/parquet/reader_impl_chunking.hpp @@ -40,8 +40,8 @@ struct file_intermediate_data { // row counts per input-pass std::vector input_pass_row_count{}; - std::size_t _current_input_pass{0}; // current input pass index - std::size_t _output_chunk_count{0}; // how many output chunks we have produced + size_t _current_input_pass{0}; // current input pass index + size_t _output_chunk_count{0}; // how many output chunks we have produced // skip_rows/num_rows values for the entire file. these need to be adjusted per-pass because we // may not be visiting every row group that contains these bounds @@ -67,9 +67,9 @@ struct subpass_intermediate_data { cudf::detail::hostdevice_vector pages{}; // for each page in the subpass, the index of our source page in the pass cudf::detail::hostdevice_vector page_src_index{}; - // for each chunk in the subpass, the number of associated pages for this - // subpass - std::vector chunk_page_count; + // for each column in the file (indexed by _input_columns.size()) + // the number of associated pages for this subpass + std::vector column_page_count; cudf::detail::hostdevice_vector page_nesting_info{}; cudf::detail::hostdevice_vector page_nesting_decode_info{}; @@ -96,16 +96,13 @@ struct pass_intermediate_data { cudf::detail::hostdevice_vector chunks{}; cudf::detail::hostdevice_vector pages{}; - // offsets to each group of input pages (by column/schema) + // offsets to each group of input pages (by column/schema, indexed by _input_columns.size()) // so if we had 2 columns/schemas, with page keys // // 1 1 1 1 1 2 2 2 // // page_offsets would be 0, 5, 8 cudf::detail::hostdevice_vector page_offsets{}; - // for each group of input pages (by column, schema), the count - // of how many pages we have processed so far - std::vector page_processed_counts{}; rmm::device_buffer decomp_dict_data{0, rmm::cuda_stream_default}; rmm::device_uvector str_dict_index{0, rmm::cuda_stream_default}; diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index 5d4cb0d2921..ef509afd48d 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -473,7 +473,8 @@ void reader::impl::allocate_nesting_info() auto& pass = *_pass_itm_data; auto& subpass = *pass.subpass; - auto const& chunks = pass.chunks; + // auto const& chunks = pass.chunks; + auto const 
num_columns = _input_columns.size(); auto& pages = subpass.pages; auto& page_nesting_info = subpass.page_nesting_info; auto& page_nesting_decode_info = subpass.page_nesting_decode_info; @@ -482,14 +483,13 @@ void reader::impl::allocate_nesting_info() // buffer to keep it to a single gpu allocation auto counting_iter = thrust::make_counting_iterator(size_t{0}); size_t const total_page_nesting_infos = - std::accumulate(counting_iter, counting_iter + chunks.size(), 0, [&](int total, size_t index) { - auto const& chunk = chunks[index]; - + std::accumulate(counting_iter, counting_iter + num_columns, 0, [&](int total, size_t index) { // the schema of the input column - auto const& schema = _metadata->get_schema(chunk.src_col_schema); - auto const per_page_nesting_info_size = max( - schema.max_definition_level + 1, _metadata->get_output_nesting_depth(chunk.src_col_schema)); - return total + (per_page_nesting_info_size * subpass.chunk_page_count[index]); + auto const schema_idx = _input_columns[index].schema_idx; + auto const& schema = _metadata->get_schema(schema_idx); + auto const per_page_nesting_info_size = + max(schema.max_definition_level + 1, _metadata->get_output_nesting_depth(schema_idx)); + return total + (per_page_nesting_info_size * subpass.column_page_count[index]); }); page_nesting_info = @@ -500,13 +500,13 @@ void reader::impl::allocate_nesting_info() // update pointers in the PageInfos int target_page_index = 0; int src_info_index = 0; - for (size_t idx = 0; idx < chunks.size(); idx++) { - int src_col_schema = chunks[idx].src_col_schema; + for (size_t idx = 0; idx < _input_columns.size(); idx++) { + auto const src_col_schema = _input_columns[idx].schema_idx; auto& schema = _metadata->get_schema(src_col_schema); auto const per_page_nesting_info_size = std::max( schema.max_definition_level + 1, _metadata->get_output_nesting_depth(src_col_schema)); - for (size_t p_idx = 0; p_idx < subpass.chunk_page_count[idx]; p_idx++) { + for (size_t p_idx = 0; p_idx < subpass.column_page_count[idx]; p_idx++) { pages[target_page_index + p_idx].nesting = page_nesting_info.device_ptr() + src_info_index; pages[target_page_index + p_idx].nesting_decode = page_nesting_decode_info.device_ptr() + src_info_index; @@ -517,14 +517,14 @@ void reader::impl::allocate_nesting_info() src_info_index += per_page_nesting_info_size; } - target_page_index += subpass.chunk_page_count[idx]; + target_page_index += subpass.column_page_count[idx]; } // fill in int nesting_info_index = 0; std::map, std::vector>> depth_remapping; - for (size_t idx = 0; idx < chunks.size(); idx++) { - int src_col_schema = chunks[idx].src_col_schema; + for (size_t idx = 0; idx < _input_columns.size(); idx++) { + auto const src_col_schema = _input_columns[idx].schema_idx; // schema of the input column auto& schema = _metadata->get_schema(src_col_schema); @@ -549,7 +549,7 @@ void reader::impl::allocate_nesting_info() // we can ignore them for the purposes of output nesting info if (!cur_schema.is_stub()) { // initialize each page within the chunk - for (size_t p_idx = 0; p_idx < subpass.chunk_page_count[idx]; p_idx++) { + for (size_t p_idx = 0; p_idx < subpass.column_page_count[idx]; p_idx++) { PageNestingInfo* pni = &page_nesting_info[nesting_info_index + (p_idx * per_page_nesting_info_size)]; @@ -589,7 +589,7 @@ void reader::impl::allocate_nesting_info() cur_schema = _metadata->get_schema(schema_idx); } - nesting_info_index += (per_page_nesting_info_size * subpass.chunk_page_count[idx]); + nesting_info_index += (per_page_nesting_info_size * 
subpass.column_page_count[idx]); } // copy nesting info to the device @@ -717,6 +717,8 @@ void reader::impl::load_compressed_data() // decoding of column/page information decode_page_headers(pass, _stream); + CUDF_EXPECTS(pass.page_offsets.size() - 1 == static_cast(_input_columns.size()), + "Encountered page_offsets / num_columns mismatch"); } namespace { @@ -1037,14 +1039,16 @@ void reader::impl::preprocess_subpass_pages(bool uses_custom_row_bounds, size_t // at this point we have an accurate row count so we can compute how many rows we will actually be // able to decode for this pass. we will have selected a set of pages for each column in the - // chunk, but not every page will have the same number of rows. so, we can only read as many rows - // as the smallest batch (by column) we have decompressed. + // row group, but not every page will have the same number of rows. so, we can only read as many + // rows as the smallest batch (by column) we have decompressed. size_t page_index = 0; size_t max_row = std::numeric_limits::max(); - for (size_t idx = 0; idx < subpass.chunk_page_count.size(); idx++) { - auto const& last_page = subpass.pages[page_index + (subpass.chunk_page_count[idx] - 1)]; - max_row = min(max_row, static_cast(last_page.chunk_row + last_page.num_rows)); - page_index += subpass.chunk_page_count[idx]; + for (size_t idx = 0; idx < subpass.column_page_count.size(); idx++) { + auto const& last_page = subpass.pages[page_index + (subpass.column_page_count[idx] - 1)]; + auto const& chunk = pass.chunks[last_page.chunk_idx]; + max_row = + min(max_row, static_cast(chunk.start_row + last_page.chunk_row + last_page.num_rows)); + page_index += subpass.column_page_count[idx]; } CUDF_EXPECTS(max_row > pass.processed_rows, "Encountered invalid row read count"); subpass.skip_rows = pass.skip_rows + pass.processed_rows; From 10009784b6999f6d4f7265610b00681f1050fcd1 Mon Sep 17 00:00:00 2001 From: db Date: Thu, 9 Nov 2023 14:10:15 -0600 Subject: [PATCH 13/49] Fix another dict_page setup issue. Many more tests passing. 
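
The underlying change: instead of locating each chunk's dictionary page via precomputed
page offsets, every page now wires itself to its owning chunk on the device. As a
simplified host-side sketch of the same idea (illustrative only, not part of the diff
below):

    // each dictionary-flagged page points its owning chunk back at itself; a chunk
    // has at most one dictionary page, so no two pages ever write the same slot
    for (PageInfo const& p : pages) {
      if (p.flags & PAGEINFO_FLAGS_DICTIONARY) { chunks[p.chunk_idx].dict_page = &p; }
    }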
---
 cpp/src/io/parquet/parquet_gpu.hpp           |  2 +-
 cpp/src/io/parquet/reader_impl_preprocess.cu | 14 ++++++--------
 2 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp
index ea46387be9c..448d868d354 100644
--- a/cpp/src/io/parquet/parquet_gpu.hpp
+++ b/cpp/src/io/parquet/parquet_gpu.hpp
@@ -374,7 +374,7 @@ struct ColumnChunkDesc {
   int32_t num_data_pages{};  // number of data pages
   int32_t num_dict_pages{};  // number of dictionary pages
   int32_t max_num_pages{};   // size of page_info array
-  PageInfo* dict_page{};
+  PageInfo const* dict_page{};
   string_index_pair* str_dict_index{};  // index for string dictionary
   bitmask_type** valid_map_base{};      // base pointers of valid bit map for this column
   void** column_data_base{};            // base pointers of column data
diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu
index ef509afd48d..424f7b52a92 100644
--- a/cpp/src/io/parquet/reader_impl_preprocess.cu
+++ b/cpp/src/io/parquet/reader_impl_preprocess.cu
@@ -400,15 +400,13 @@ void decode_page_headers(pass_intermediate_data& pass, rmm::cuda_stream_view str
                          pass.page_offsets.d_begin());
 
   // setup dict_page for each chunk if necessary
-  auto iter = thrust::make_counting_iterator(0);
   thrust::for_each(rmm::exec_policy(stream),
-                   iter,
-                   iter + pass.chunks.size(),
-                   [chunks       = pass.chunks.d_begin(),
-                    pages        = pass.pages.d_begin(),
-                    page_offsets = pass.page_offsets.d_begin()] __device__(size_t i) {
-                     auto& chunk = chunks[i];
-                     if (chunk.num_dict_pages > 0) { chunk.dict_page = &pages[page_offsets[i]]; }
+                   pass.pages.d_begin(),
+                   pass.pages.d_end(),
+                   [chunks = pass.chunks.d_begin()] __device__(PageInfo const& p) {
+                     if (p.flags & PAGEINFO_FLAGS_DICTIONARY) {
+                       chunks[p.chunk_idx].dict_page = &p;
+                     }
                    });
 
   pass.page_offsets.device_to_host_async(stream);

From cea9bc5ebe7010c8f29fa39ab6aaa923d2763188 Mon Sep 17 00:00:00 2001
From: db
Date: Sun, 12 Nov 2023 15:01:44 -0600
Subject: [PATCH 14/49] Fixed edge cases for skip_rows/num_rows. Added a
 missing __device__ tag to a lambda that was quietly failing, causing issues
 with list tests downstream.
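
A worked example of the corrected row clamping at the end of
preprocess_subpass_pages (all numbers hypothetical):

    // pass.skip_rows       = 100   -> pass covers absolute rows [100, 300)
    // pass.num_rows        = 200
    // pass.processed_rows  = 150   -> rows emitted by earlier subpasses
    // max_row (from pages) = 350   -> last loaded page extends past the pass
    //
    // subpass.skip_rows = 100 + 150     = 250
    // pass_end          = 100 + 200     = 300
    // max_row           = min(350, 300) = 300   <- the new clamp
    // subpass.num_rows  = 300 - 250     = 50    -> never reads past the pass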
--- cpp/src/io/parquet/reader_impl.cpp | 27 ++++++++++---------- cpp/src/io/parquet/reader_impl.hpp | 19 +++++++++++++- cpp/src/io/parquet/reader_impl_chunking.cu | 19 ++++++++------ cpp/src/io/parquet/reader_impl_chunking.hpp | 5 ++++ cpp/src/io/parquet/reader_impl_preprocess.cu | 10 +++++--- 5 files changed, 53 insertions(+), 27 deletions(-) diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 092ffceb88f..3bb2c594545 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -253,6 +253,8 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) } } + _stream.synchronize(); + // update null counts in the final column buffers for (size_t idx = 0; idx < subpass.pages.size(); idx++) { PageInfo* pi = &subpass.pages[idx]; @@ -348,7 +350,7 @@ void reader::impl::prepare_data(int64_t skip_rows, // handle any chunking work (ratcheting through the subpasses and chunks within // our current pass) - handle_chunking(uses_custom_row_bounds); + if (_file_itm_data.num_passes() > 0) { handle_chunking(uses_custom_row_bounds); } } void reader::impl::populate_metadata(table_metadata& out_metadata) @@ -378,11 +380,8 @@ table_with_metadata reader::impl::read_chunk_internal( auto out_columns = std::vector>{}; out_columns.reserve(_output_buffers.size()); -#if 0 - if (!has_next()/* || _pass_itm_data->output_chunk_read_info.empty()*/) { - return finalize_output(out_metadata, out_columns, filter); - } -#endif + // no work to do (this can happen on the first pass if we have no rows to read) + if (!has_more_work()) { return finalize_output(out_metadata, out_columns, filter); } auto& pass = *_pass_itm_data; auto& subpass = *pass.subpass; @@ -440,11 +439,13 @@ table_with_metadata reader::impl::finalize_output( } // advance output chunk/subpass/pass info - auto& pass = *_pass_itm_data; - auto& subpass = *pass.subpass; - subpass.current_output_chunk++; - pass.processed_rows += subpass.num_rows; - _file_itm_data._output_chunk_count++; + if (_file_itm_data.num_passes() > 0) { + auto& pass = *_pass_itm_data; + auto& subpass = *pass.subpass; + subpass.current_output_chunk++; + pass.processed_rows += subpass.num_rows; + _file_itm_data._output_chunk_count++; + } if (filter.has_value()) { auto read_table = std::make_unique
(std::move(out_columns)); @@ -505,9 +506,7 @@ bool reader::impl::has_next() // current_input_pass will only be incremented to be == num_passes after // the last chunk in the last subpass in the last pass has been returned - auto const num_passes = _file_itm_data.input_pass_row_group_offsets.size() - 1; - bool const more_work = _file_itm_data._current_input_pass < num_passes; - return more_work; + return has_more_work(); } namespace { diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index 1f1fb5301c8..244c8b2ecfd 100644 --- a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -157,6 +157,9 @@ class reader::impl { /** * @brief Ratchet the pass/subpass/chunk process forward. + * + * @param uses_custom_row_bounds Whether or not num_rows and skip_rows represents user-specific + * bounds */ void handle_chunking(bool uses_custom_row_bounds); @@ -165,12 +168,18 @@ class reader::impl { * * A 'pass' is defined as a subset of row groups read out of the globally * requested set of all row groups. + * + * @param uses_custom_row_bounds Whether or not num_rows and skip_rows represents user-specific + * bounds */ - void setup_next_pass(); + void setup_next_pass(bool uses_custom_row_bounds); /** * @brief Setup step for the next decompression subpass. * + * @param uses_custom_row_bounds Whether or not num_rows and skip_rows represents user-specific + * bounds + * * A 'subpass' is defined as a subset of pages within a pass that are * decompressed a decoded as a batch. Subpasses may be further subdivided * into output chunks. @@ -317,6 +326,14 @@ class reader::impl { */ void compute_chunks_for_subpass(); + bool has_more_work() + { + // no work to do (this can happen on the first pass if we have no rows to read) + auto const num_passes = _file_itm_data.num_passes(); + bool const more_work = num_passes > 0 && _file_itm_data._current_input_pass < num_passes; + return more_work; + } + private: rmm::cuda_stream_view _stream; rmm::mr::device_memory_resource* _mr = nullptr; diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index 656e44b0e35..3022f2430c3 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -866,7 +866,7 @@ void reader::impl::handle_chunking(bool uses_custom_row_bounds) // if this is our first time in here, setup the first pass. if (!_pass_itm_data) { // setup the next pass - setup_next_pass(); + setup_next_pass(uses_custom_row_bounds); } auto& pass = *_pass_itm_data; @@ -887,12 +887,11 @@ void reader::impl::handle_chunking(bool uses_custom_row_bounds) _pass_itm_data.reset(); _file_itm_data._current_input_pass++; - auto const num_passes = _file_itm_data.input_pass_row_group_offsets.size() - 1; // no more passes. we are absolutely done with this file. - if (_file_itm_data._current_input_pass == num_passes) { return; } + if (_file_itm_data._current_input_pass == _file_itm_data.num_passes()) { return; } // setup the next pass - setup_next_pass(); + setup_next_pass(uses_custom_row_bounds); } } @@ -900,9 +899,9 @@ void reader::impl::handle_chunking(bool uses_custom_row_bounds) setup_next_subpass(uses_custom_row_bounds); } -void reader::impl::setup_next_pass() +void reader::impl::setup_next_pass(bool uses_custom_row_bounds) { - auto const num_passes = _file_itm_data.input_pass_row_group_offsets.size() - 1; + auto const num_passes = _file_itm_data.num_passes(); // always create the pass struct, even if we end up with no work. 
// this will also cause the previous pass information to be deleted @@ -923,7 +922,7 @@ void reader::impl::setup_next_pass() _file_itm_data.row_groups.begin() + row_group_end, pass.row_groups.begin()); - auto const num_passes = _file_itm_data.input_pass_row_group_offsets.size() - 1; + auto const num_passes = _file_itm_data.num_passes(); CUDF_EXPECTS(_file_itm_data._current_input_pass < num_passes, "Encountered an invalid read pass index"); @@ -967,7 +966,11 @@ void reader::impl::setup_next_pass() // is possible to load this by just capping the number of rows read, we cannot tell // which rows are invalid so we may be returning bad data. in addition, this mismatch // confuses the chunked reader - detect_malformed_pages(pass.pages, pass.chunks, pass.num_rows, _stream); + detect_malformed_pages( + pass.pages, + pass.chunks, + uses_custom_row_bounds ? std::nullopt : std::make_optional(pass.num_rows), + _stream); // decompress dictionary data if applicable. if (pass.has_compressed_data) { diff --git a/cpp/src/io/parquet/reader_impl_chunking.hpp b/cpp/src/io/parquet/reader_impl_chunking.hpp index fc07c793ac4..3e3ab0ba8b9 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.hpp +++ b/cpp/src/io/parquet/reader_impl_chunking.hpp @@ -47,6 +47,11 @@ struct file_intermediate_data { // may not be visiting every row group that contains these bounds size_t global_skip_rows; size_t global_num_rows; + + size_t num_passes() + { + return input_pass_row_group_offsets.size() == 0 ? 0 : input_pass_row_group_offsets.size() - 1; + } }; /** diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index 424f7b52a92..b57f6451e8e 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -543,7 +543,7 @@ void reader::impl::allocate_nesting_info() auto cur_schema = _metadata->get_schema(schema_idx); int cur_depth = max_depth - 1; while (schema_idx > 0) { - // stub columns (basically the inner field of a list scheme element) are not real columns. + // stub columns (basically the inner field of a list schema element) are not real columns. // we can ignore them for the purposes of output nesting info if (!cur_schema.is_stub()) { // initialize each page within the chunk @@ -934,7 +934,7 @@ struct update_subpass_chunk_row { device_span subpass_pages; device_span page_src_index; - void operator()(size_t i) + __device__ void operator()(size_t i) { subpass_pages[i].chunk_row = pass_pages[page_src_index[i]].chunk_row; } @@ -1049,8 +1049,10 @@ void reader::impl::preprocess_subpass_pages(bool uses_custom_row_bounds, size_t page_index += subpass.column_page_count[idx]; } CUDF_EXPECTS(max_row > pass.processed_rows, "Encountered invalid row read count"); - subpass.skip_rows = pass.skip_rows + pass.processed_rows; - subpass.num_rows = max_row - pass.processed_rows; + subpass.skip_rows = pass.skip_rows + pass.processed_rows; + auto const pass_end = pass.skip_rows + pass.num_rows; + max_row = min(max_row, pass_end); + subpass.num_rows = max_row - subpass.skip_rows; // now split up the output into chunks as necessary compute_chunks_for_subpass(); From 259ca1cd212d9e1bbcc3eaba632c42e3863b4231 Mon Sep 17 00:00:00 2001 From: db Date: Sun, 12 Nov 2023 15:54:43 -0600 Subject: [PATCH 15/49] Sort pass pages by input schema index instead of input schema value to handle user output column reordering. 
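
To see why the sort key matters, suppose (hypothetical values) the user requests
column "b" and then column "a":

    //                       src_col_schema   src_col_index
    //   column "b" (first)               2               0
    //   column "a" (second)              1               1
    //
    // keyed on src_col_schema -> all of "a"'s pages sort first, out of step
    //                            with _input_columns
    // keyed on src_col_index  -> "b"'s pages sort first, matching the
    //                            requested output order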
--- cpp/src/io/parquet/reader_impl_preprocess.cu | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index b57f6451e8e..1cdfae40f9f 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -346,7 +346,9 @@ void decode_page_headers(pass_intermediate_data& pass, rmm::cuda_stream_view str thrust::maximum()); pass.level_type_size = std::max(1, cudf::util::div_rounding_up_safe(max_level_bits, 8)); - // sort the pages in schema order. + // sort the pages in chunk/schema order. we use chunk.src_col_index instead of + // chunk.src_col_schema because the user may have reordered them (reading columns, "a" and "b" but + // returning them as "b" and "a") // // ordering of pages is by input column schema, repeated across row groups. so // if we had 3 columns, each with 2 pages, and 1 row group, our schema values might look like @@ -371,7 +373,9 @@ void decode_page_headers(pass_intermediate_data& pass, rmm::cuda_stream_view str pass.pages.d_begin(), pass.pages.d_begin() + pass.pages.size(), page_keys.begin(), - [] __device__(PageInfo const& page) { return page.src_col_schema; }); + [chunks = pass.chunks.d_begin()] __device__(PageInfo const& page) { + return chunks[page.chunk_idx].src_col_index; + }); thrust::stable_sort_by_key(rmm::exec_policy(stream), page_keys.begin(), page_keys.end(), From 875857413511198130dbcda747f34de46fda74ee Mon Sep 17 00:00:00 2001 From: db Date: Mon, 13 Nov 2023 11:54:21 -0600 Subject: [PATCH 16/49] Fixed several edge cases in input and output chunking. Fixed an odd test that looked bogus. --- cpp/src/io/parquet/reader_impl.cpp | 1 - cpp/src/io/parquet/reader_impl_chunking.cu | 25 ++++++++++++-------- cpp/src/io/parquet/reader_impl_chunking.hpp | 4 +++- cpp/src/io/parquet/reader_impl_preprocess.cu | 1 - cpp/tests/io/parquet_test.cpp | 3 ++- 5 files changed, 20 insertions(+), 14 deletions(-) diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 3bb2c594545..f230447f6c9 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -443,7 +443,6 @@ table_with_metadata reader::impl::finalize_output( auto& pass = *_pass_itm_data; auto& subpass = *pass.subpass; subpass.current_output_chunk++; - pass.processed_rows += subpass.num_rows; _file_itm_data._output_chunk_count++; } diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index 3022f2430c3..254e5f0c248 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -269,11 +269,10 @@ struct page_total_size { int64_t find_next_split(int64_t cur_pos, size_t cur_row_count, + size_t cur_cumulative_size, std::vector const& sizes, size_t chunk_read_limit) { - size_t cur_cumulative_size = cur_pos == 0 ? 
0 : sizes[cur_pos - 1].size_bytes; - auto start = thrust::make_transform_iterator(sizes.begin(), [&](cumulative_page_info const& i) { return i.size_bytes - cur_cumulative_size; }); @@ -318,20 +317,22 @@ std::vector find_splits(std::vector const& siz // sizes are reasonably large, this shouldn't iterate too many times std::vector splits; { - size_t cur_pos = 0; - size_t cur_row_count = 0; - auto const num_rows = sizes.back().row_count; + size_t cur_pos = 0; + size_t cur_row_count = 0; + size_t cur_cumulative_size = 0; + auto const num_rows = sizes.back().row_count; while (cur_row_count < num_rows) { - auto const split_pos = find_next_split(cur_pos, cur_row_count, sizes, chunk_read_limit); + auto const split_pos = + find_next_split(cur_pos, cur_row_count, cur_cumulative_size, sizes, chunk_read_limit); auto const start_row = cur_row_count; cur_row_count = sizes[split_pos].row_count; - splits.push_back(split_info{row_range{start_row, cur_row_count - start_row}, - static_cast(cur_pos == 0 ? 0 : cur_pos + 1)}); - cur_pos = split_pos; + splits.push_back(split_info{row_range{start_row, cur_row_count - start_row}, split_pos}); + cur_pos = split_pos; + cur_cumulative_size = sizes[split_pos].size_bytes; } } - // print_cumulative_row_info(sizes, "adjusted", splits); + // print_cumulative_row_info(sizes, "adjusted w/splits", splits); return splits; } @@ -488,6 +489,7 @@ std::pair, size_t> compute_next_subpass( // find the next split auto const end_index = find_next_split(start_index, min_row, + 0, h_aggregated_info, size_limit) + 1; // the split index returned is inclusive @@ -878,6 +880,9 @@ void reader::impl::handle_chunking(bool uses_custom_row_bounds) return; } + // increment rows processed + pass.processed_rows += pass.subpass->num_rows; + // release the old subpass (will free memory) pass.subpass.reset(); diff --git a/cpp/src/io/parquet/reader_impl_chunking.hpp b/cpp/src/io/parquet/reader_impl_chunking.hpp index 3e3ab0ba8b9..3e654b086bf 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.hpp +++ b/cpp/src/io/parquet/reader_impl_chunking.hpp @@ -118,7 +118,9 @@ struct pass_intermediate_data { // NOTE: skip_rows is the absolute row index in the file. size_t skip_rows; size_t num_rows; - // number of rows we have processed so far (out of num_rows) + // number of rows we have processed so far (out of num_rows). note that this + // only includes the number of rows we have processed before starting the current + // subpass. it does not get updated as a subpass iterates through output chunks. 
size_t processed_rows{0}; // currently active subpass diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index 1cdfae40f9f..d744e75f0ac 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -1052,7 +1052,6 @@ void reader::impl::preprocess_subpass_pages(bool uses_custom_row_bounds, size_t min(max_row, static_cast(chunk.start_row + last_page.chunk_row + last_page.num_rows)); page_index += subpass.column_page_count[idx]; } - CUDF_EXPECTS(max_row > pass.processed_rows, "Encountered invalid row read count"); subpass.skip_rows = pass.skip_rows + pass.processed_rows; auto const pass_end = pass.skip_rows + pass.num_rows; max_row = min(max_row, pass_end); diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index fece83f891b..8af50fdc538 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -5656,8 +5656,9 @@ TEST_F(ParquetReaderTest, ChunkedSingleLevelLists) cudf::io::parquet_reader_options::builder( cudf::io::source_info{reinterpret_cast(list_bytes), sizeof(list_bytes)})); int iterations = 0; - while (reader.has_next() && iterations < 10) { + while (reader.has_next()) { auto chunk = reader.read_chunk(); + iterations++; } EXPECT_TRUE(iterations < 10); } From 7cd3b8335e65a5a8d9fac7eb46d07e25d4ca9f99 Mon Sep 17 00:00:00 2001 From: db Date: Mon, 13 Nov 2023 15:43:35 -0600 Subject: [PATCH 17/49] Fixed an issue with subpass computation. --- cpp/src/io/parquet/reader_impl_chunking.cu | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index 254e5f0c248..84e50c18291 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -487,21 +487,14 @@ std::pair, size_t> compute_next_subpass( thrust::upper_bound(thrust::host, start, start + h_aggregated_info.size(), min_row) - start; // find the next split - auto const end_index = find_next_split(start_index, + auto const cumulative_size = start_index == 0 ? 0 : h_aggregated_info[start_index - 1].size_bytes; + auto const end_index = find_next_split(start_index, min_row, - 0, + cumulative_size, h_aggregated_info, size_limit) + 1; // the split index returned is inclusive - /* - printf("Split: row(%lu -> %lu), size(%lu -> %lu)\n", - h_aggregated_info[start_index].row_count, - h_aggregated_info[end_index].row_count, - h_aggregated_info[start_index].size_bytes, - h_aggregated_info[end_index].size_bytes); - */ - // get the number of pages for each column/schema auto get_page_counts = [num_columns, stream]( rmm::device_uvector const& aggregated_info, From 2dcc10b33ea85b183023c78ae7a8da24fe11d8d6 Mon Sep 17 00:00:00 2001 From: db Date: Thu, 16 Nov 2023 15:50:14 -0600 Subject: [PATCH 18/49] Fixed an issue with string dict index computation stemming from how pages are sorted in a different order now. 
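
The replacement computes the dictionary-index layout in three device-side steps:
count, scan, assign. A host-side sketch of the same pattern (illustrative only;
is_byte_array() stands in for the (chunk.data_type & 0x7) == BYTE_ARRAY test used
below, base for pass.str_dict_index.data(); requires <numeric> and <vector>):

    std::vector<size_t> counts(chunks.size(), 0);
    for (auto const& p : pages) {  // order-independent: no "dict page comes first" assumption
      if ((p.flags & PAGEINFO_FLAGS_DICTIONARY) && is_byte_array(chunks[p.chunk_idx])) {
        counts[p.chunk_idx] = p.num_input_values;  // at most one dictionary page per chunk
      }
    }
    std::vector<size_t> offsets(chunks.size());
    std::exclusive_scan(counts.begin(), counts.end(), offsets.begin(), size_t{0});
    for (size_t c = 0; c < chunks.size(); ++c) {
      if (counts[c] > 0) { chunks[c].str_dict_index = base + offsets[c]; }
    }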
--- cpp/src/io/parquet/reader_impl_preprocess.cu | 95 +++++++++++++------- 1 file changed, 61 insertions(+), 34 deletions(-) diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index d744e75f0ac..69abbb19123 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -425,49 +425,76 @@ void decode_page_headers(pass_intermediate_data& pass, rmm::cuda_stream_view str "Unsupported page encoding detected"); } -} // namespace +struct set_str_dict_index_count { + device_span str_dict_index_count; + device_span chunks; -void reader::impl::build_string_dict_indices() -{ - auto& pass = *_pass_itm_data; + __device__ void operator()(PageInfo const& page) + { + auto const& chunk = chunks[page.chunk_idx]; + if ((page.flags & PAGEINFO_FLAGS_DICTIONARY) && (chunk.data_type & 0x7) == BYTE_ARRAY && + (chunk.num_dict_pages > 0)) { + // there is only ever one dictionary page per chunk, so this is safe to do in parallel. + str_dict_index_count[page.chunk_idx] = page.num_input_values; + } + } +}; - auto is_dict_chunk = [](ColumnChunkDesc const& chunk) { - return (chunk.data_type & 0x7) == BYTE_ARRAY && chunk.num_dict_pages > 0; - }; +struct set_str_dict_index_ptr { + string_index_pair* const base; + device_span str_dict_index_offsets; + device_span chunks; - // Count the number of string dictionary entries - // NOTE: Assumes first page in the chunk is always the dictionary page - size_t total_str_dict_indexes = 0; - for (size_t c = 0, page_count = 0; c < pass.chunks.size(); c++) { - if (is_dict_chunk(pass.chunks[c])) { - total_str_dict_indexes += pass.pages[page_count].num_input_values; + __device__ void operator()(size_t i) + { + auto& chunk = chunks[i]; + if ((chunk.data_type & 0x7) == BYTE_ARRAY && (chunk.num_dict_pages > 0)) { + chunk.str_dict_index = base + str_dict_index_offsets[i]; } - page_count += pass.chunks[c].max_num_pages; } +}; - // Build index for string dictionaries since they can't be indexed - // directly due to variable-sized elements - pass.str_dict_index = cudf::detail::make_zeroed_device_uvector_async( - total_str_dict_indexes, _stream, rmm::mr::get_current_device_resource()); +} // namespace - // Update chunks with pointers to string dict indices - for (size_t c = 0, page_count = 0, str_ofs = 0; c < pass.chunks.size(); c++) { - input_column_info const& input_col = _input_columns[pass.chunks[c].src_col_index]; - CUDF_EXPECTS(input_col.schema_idx == pass.chunks[c].src_col_schema, - "Column/page schema index mismatch"); - if (is_dict_chunk(pass.chunks[c])) { - pass.chunks[c].str_dict_index = pass.str_dict_index.data() + str_ofs; - str_ofs += pass.pages[page_count].num_input_values; - } +void reader::impl::build_string_dict_indices() +{ + auto& pass = *_pass_itm_data; - // column_data_base will always point to leaf data, even for nested types. 
-    page_count += pass.chunks[c].max_num_pages;
-  }
-
-  if (total_str_dict_indexes > 0) {
-    pass.chunks.host_to_device_async(_stream);
-    BuildStringDictionaryIndex(pass.chunks.device_ptr(), pass.chunks.size(), _stream);
-  }
+  auto iter = thrust::make_counting_iterator(0);
+  thrust::for_each(
+    rmm::exec_policy(_stream),
+    iter,
+    iter + pass.chunks.size(),
+    set_str_dict_index_ptr{pass.str_dict_index.data(), str_dict_index_offsets, pass.chunks});
+
+  // compute the indices
+  BuildStringDictionaryIndex(pass.chunks.device_ptr(), pass.chunks.size(), _stream);
+  pass.chunks.device_to_host_sync(_stream);
 }

From a055bbb2cad7f448d2237bbaaeccbbeefac02f24 Mon Sep 17 00:00:00 2001
From: db
Date: Fri, 17 Nov 2023 11:42:43 -0600
Subject: [PATCH 19/49] Fixed an old bug that caused an issue with the
 subrowgroup reader. Dictionary pages that had no data in them were still
 incorrectly getting their data pointer set to something non-null (the data
 in the subsequent page). The new code leaves them null. There were checks in
 the kernels verifying that the data pointer was not null, which were
 erroneously failing because of this.
---
 cpp/src/io/parquet/page_decode.cuh       | 2 +-
 cpp/src/io/parquet/page_string_decode.cu | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh
index a5e253fbe54..be0e3d75f9b 100644
--- a/cpp/src/io/parquet/page_decode.cuh
+++ b/cpp/src/io/parquet/page_decode.cuh
@@ -1289,7 +1289,7 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s,
         s->dict_run  = 0;
         s->dict_val  = 0;
         s->dict_bits = (cur < end) ?
*cur++ : 0; - if (s->dict_bits > 32 || !s->dict_base) { + if (s->dict_bits > 32 || (!s->dict_base && s->col.dict_page->num_input_values > 0)) { s->set_error_code(decode_error::INVALID_DICT_WIDTH); } break; diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index 9a6a62a64d2..77593d33f44 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -545,7 +545,9 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz } // FIXME: need to return an error condition...this won't actually do anything - if (s->dict_bits > 32 || !dict_base) { CUDF_UNREACHABLE("invalid dictionary bit size"); } + if (s->dict_bits > 32 || (!dict_base && col.dict_page->num_input_values > 0)) { + CUDF_UNREACHABLE("invalid dictionary bit size"); + } str_bytes = totalDictEntriesSize( data, dict_base, s->dict_bits, dict_size, (end - data), start_value, end_value); From 1227e723e5a04f3562f5c1b8f44ad77d35bbe418 Mon Sep 17 00:00:00 2001 From: db Date: Sun, 3 Dec 2023 14:09:55 -0600 Subject: [PATCH 20/49] Handling for list columns with rows that span page boundaries. Better row count estimation for undecoded list columns. Take compressed memory usage into account when choosing subpass splits and attempt to gracefully handle the case where we do not have much memory remaining for decompression. Switch to using absolute row indices when computing subpass splits and output chunk splits, which makes working with skip_rows/num_rows considerably easier to think about. Lots of name cleanup and doc improvements. Added support to column_buffer for retrieving properly nested column type names. --- cpp/src/io/parquet/parquet_gpu.hpp | 9 +- cpp/src/io/parquet/reader_impl.hpp | 13 +- cpp/src/io/parquet/reader_impl_chunking.cu | 339 ++++++++++++------- cpp/src/io/parquet/reader_impl_chunking.hpp | 21 +- cpp/src/io/parquet/reader_impl_preprocess.cu | 149 ++++++-- cpp/src/io/utilities/column_buffer.cpp | 33 ++ cpp/src/io/utilities/column_buffer.hpp | 4 + 7 files changed, 408 insertions(+), 160 deletions(-) diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 448d868d354..3041ded7423 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -252,6 +252,7 @@ struct PageInfo { int32_t num_input_values; int32_t chunk_row; // starting row of this page relative to the start of the chunk int32_t num_rows; // number of rows in this page + // in the case of list columns, rows can span page boundaries, so even though row N may begin in this page, it may not end in it // the next two are calculated in gpuComputePageStringSizes int32_t num_nulls; // number of null values (V2 header), but recalculated for string cols int32_t num_valids; // number of non-null values, taking into account skip_rows/num_rows @@ -332,7 +333,8 @@ struct ColumnChunkDesc { int8_t decimal_precision_, int32_t ts_clock_rate_, int32_t src_col_index_, - int32_t src_col_schema_) + int32_t src_col_schema_, + float list_bytes_per_row_est_) : compressed_data(compressed_data_), compressed_size(compressed_size_), num_values(num_values_), @@ -356,7 +358,8 @@ struct ColumnChunkDesc { decimal_precision(decimal_precision_), ts_clock_rate(ts_clock_rate_), src_col_index(src_col_index_), - src_col_schema(src_col_schema_) + src_col_schema(src_col_schema_), + list_bytes_per_row_est(list_bytes_per_row_est_) { } @@ -387,6 +390,8 @@ struct ColumnChunkDesc { int32_t src_col_index{}; // my input column index int32_t src_col_schema{}; // my schema index in the
file + + float list_bytes_per_row_est{}; // for LIST columns, an estimate of the number of bytes per row }; /** diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index 244c8b2ecfd..a070b0bf3e2 100644 --- a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -223,6 +223,17 @@ class reader::impl { */ void build_string_dict_indices(); + /** + * @brief For list columns, generate estimated row counts for pages in the current pass. + * + * The row counts in the pages that come out of the file only reflect the number of values in + * all of the rows in the page, not the number of rows themselves. In order to do subpass reading + * more accurately, we need a better guess of the real number of rows per + * page. + * + */ + void generate_list_column_row_count_estimates(); + /** * @brief Perform some preprocessing for subpass page data and also compute the split locations * {skip_rows, num_rows} for chunked reading. @@ -324,7 +335,7 @@ class reader::impl { * a limit on total read size, generate a set of {skip_rows, num_rows} pairs representing * a set of reads that will generate output columns of total size <= `chunk_read_limit` bytes. */ - void compute_chunks_for_subpass(); + void compute_output_chunks_for_subpass(); bool has_more_work() { diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index 84e50c18291..acb56adafcd 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -41,17 +41,17 @@ namespace cudf::io::parquet::detail { namespace { -struct cumulative_page_info { - size_t row_count; // cumulative row count - size_t size_bytes; // cumulative size in bytes - int key; // schema index -}; - struct split_info { row_range rows; int64_t split_pos; }; +// the minimum amount of memory we can safely expect to be enough to +// do a subpass decode. if the difference between the user specified limit and
+// the actual memory used for compressed/temp data is less than this value, we will still use
+// at least this many additional bytes.
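+// (illustrative numbers, not taken from the code: with an input pass limit of 512 MiB and
+// 400 MiB already held by compressed pages and decompressed dictionaries, the nominal
+// 112 MiB remainder is below this floor, so the subpass still gets the full 200 MiB)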
+constexpr size_t minimum_subpass_expected_size = 200 * 1024 * 1024; + #if defined(CHUNKING_DEBUG) void print_cumulative_page_info(cudf::detail::hostdevice_vector& pages, rmm::device_uvector const& c_info, @@ -79,43 +79,43 @@ void print_cumulative_page_info(cudf::detail::hostdevice_vector& pages if (page.flags & PAGEINFO_FLAGS_DICTIONARY || page.src_col_schema != schemas[idx]) { continue; } - printf("\tP: {%lu, %lu}\n", h_cinfo[pidx].row_count, h_cinfo[pidx].size_bytes); + printf("\tP: {%lu, %lu, %lu}\n", pidx, h_cinfo[pidx].row_index, h_cinfo[pidx].size_bytes); } } } void print_cumulative_row_info(host_span sizes, std::string const& label, - std::optional> splits = std::nullopt) + std::optional> splits = std::nullopt) { if (splits.has_value()) { printf("------------\nSplits (skip_rows, num_rows)\n"); for (size_t idx = 0; idx < splits->size(); idx++) { - printf("{%lu, %lu}\n", splits.value()[idx].rows.skip_rows, splits.value()[idx].rows.num_rows); + printf("{%lu, %lu}\n", splits.value()[idx].skip_rows, splits.value()[idx].num_rows); } } - printf("------------\nCumulative sizes %s (row_count, size_bytes, page_key)\n", label.c_str()); + printf("------------\nCumulative sizes %s (index, row_index, size_bytes, page_key)\n", + label.c_str()); for (size_t idx = 0; idx < sizes.size(); idx++) { - printf("{%lu, %lu, %d}", sizes[idx].row_count, sizes[idx].size_bytes, sizes[idx].key); + printf("{%lu, %lu, %lu, %d}", idx, sizes[idx].row_index, sizes[idx].size_bytes, sizes[idx].key); if (splits.has_value()) { // if we have a split at this row count and this is the last instance of this row count - auto start = thrust::make_transform_iterator( - splits->begin(), [](split_info const& i) { return i.rows.skip_rows; }); + auto start = thrust::make_transform_iterator(splits->begin(), + [](row_range const& i) { return i.skip_rows; }); auto end = start + splits->size(); - auto split = std::find(start, end, sizes[idx].row_count); + auto split = std::find(start, end, sizes[idx].row_index); auto const split_index = [&]() -> int { if (split != end && - ((idx == sizes.size() - 1) || (sizes[idx + 1].row_count > sizes[idx].row_count))) { + ((idx == sizes.size() - 1) || (sizes[idx + 1].row_index > sizes[idx].row_index))) { return static_cast(std::distance(start, split)); } return idx == 0 ? 0 : -1; }(); if (split_index >= 0) { - printf(" <-- split {%lu, %lu, %lu}", - splits.value()[split_index].rows.skip_rows, - splits.value()[split_index].rows.num_rows, - splits.value()[split_index].split_pos); + printf(" <-- split {%lu, %lu}", + splits.value()[split_index].skip_rows, + splits.value()[split_index].num_rows); } } printf("\n"); @@ -130,7 +130,7 @@ struct cumulative_page_sum { cumulative_page_info operator() __device__(cumulative_page_info const& a, cumulative_page_info const& b) const { - return cumulative_page_info{a.row_count + b.row_count, a.size_bytes + b.size_bytes, a.key}; + return cumulative_page_info{0, a.size_bytes + b.size_bytes, a.key}; } }; @@ -188,7 +188,7 @@ __device__ size_t row_size_functor::operator()(size_t num_rows, boo * * Sums across all nesting levels. 
*/ -struct get_cumulative_page_info { +struct get_page_output_size { __device__ cumulative_page_info operator()(PageInfo const& page) { if (page.flags & PAGEINFO_FLAGS_DICTIONARY) { @@ -200,19 +200,17 @@ struct get_cumulative_page_info { auto const& pni = page.nesting[i]; return cudf::type_dispatcher(data_type{pni.type}, row_size_functor{}, pni.size, pni.nullable); }); - - size_t const row_count = static_cast(page.nesting[0].size); return { - row_count, + 0, thrust::reduce(thrust::seq, iter, iter + page.num_output_nesting_levels) + page.str_bytes, page.src_col_schema}; } }; /** - * @brief Functor which computes the (uncompressed) size of a page. + * @brief Functor which sets the (uncompressed) size of a page. */ -struct get_page_size { +struct get_page_input_size { device_span chunks; __device__ cumulative_page_info operator()(PageInfo const& page) @@ -222,11 +220,24 @@ struct get_page_size { if (page.flags & PAGEINFO_FLAGS_DICTIONARY) { return cumulative_page_info{0, 0, page.src_col_schema}; } - // TODO: this is not accurate for lists. it might make sense to make a guess - // based on total-rowgroup-size / # of rows in the rowgroup for an average of - // rows-per-byte. - size_t const row_count = page.num_rows; - return {row_count, static_cast(page.uncompressed_page_size), page.src_col_schema}; + return {0, static_cast(page.uncompressed_page_size), page.src_col_schema}; + } +}; + +/** + * @brief Functor which sets the absolute row index of a page in a cumulative_page_info struct + */ +struct set_row_index { + device_span chunks; + device_span pages; + device_span c_info; + + __device__ void operator()(size_t i) + { + auto const& page = pages[i]; + auto const& chunk = chunks[page.chunk_idx]; + size_t const page_start_row = chunk.start_row + page.chunk_row + page.num_rows; + c_info[i].row_index = page_start_row; } }; @@ -258,17 +269,24 @@ struct page_total_size { auto const start = key_offsets[idx]; auto const end = key_offsets[idx + 1]; auto iter = cudf::detail::make_counting_transform_iterator( - 0, [&] __device__(size_type i) { return c_info[i].row_count; }); + 0, [&] __device__(size_type i) { return c_info[i].row_index; }); auto const page_index = - thrust::lower_bound(thrust::seq, iter + start, iter + end, i.row_count) - iter; + thrust::lower_bound(thrust::seq, iter + start, iter + end, i.row_index) - iter; sum += c_info[page_index].size_bytes; } - return {i.row_count, sum, i.key}; + return {i.row_index, sum, i.key}; } }; +/** + * @brief Functor which returns the compressed data size for a chunk + */ +struct get_chunk_compressed_size { + __device__ size_t operator()(ColumnChunkDesc const& chunk) { return chunk.compressed_size; } +}; + int64_t find_next_split(int64_t cur_pos, - size_t cur_row_count, + size_t cur_row_index, size_t cur_cumulative_size, std::vector const& sizes, size_t chunk_read_limit) @@ -294,49 +312,13 @@ int64_t find_next_split(int64_t cur_pos, // the list twice. so we have to iterate until we skip past all of them. The idea is that we // either do this, or we have to call unique() on the input first. while (split_pos < (static_cast(sizes.size()) - 1) && - (split_pos < 0 || sizes[split_pos].row_count == cur_row_count)) { + (split_pos < 0 || sizes[split_pos].row_index == cur_row_index)) { split_pos++; } return split_pos; } -/** - * @brief Given a vector of cumulative {row_count, byte_size} pairs and a chunk read - * limit, determine the set of splits. 
- * - * @param sizes Vector of cumulative {row_count, byte_size} pairs - * @param chunk_read_limit Limit on total number of bytes to be returned per read, for all columns - */ -std::vector find_splits(std::vector const& sizes, - size_t chunk_read_limit) -{ - // now we have an array of {row_count, real output bytes}. just walk through it and generate - // splits. - // TODO: come up with a clever way to do this entirely in parallel. For now, as long as batch - // sizes are reasonably large, this shouldn't iterate too many times - std::vector splits; - { - size_t cur_pos = 0; - size_t cur_row_count = 0; - size_t cur_cumulative_size = 0; - auto const num_rows = sizes.back().row_count; - while (cur_row_count < num_rows) { - auto const split_pos = - find_next_split(cur_pos, cur_row_count, cur_cumulative_size, sizes, chunk_read_limit); - - auto const start_row = cur_row_count; - cur_row_count = sizes[split_pos].row_count; - splits.push_back(split_info{row_range{start_row, cur_row_count - start_row}, split_pos}); - cur_pos = split_pos; - cur_cumulative_size = sizes[split_pos].size_bytes; - } - } - // print_cumulative_row_info(sizes, "adjusted w/splits", splits); - - return splits; -} - /** * @brief Converts cuDF units to Parquet units. * @@ -381,10 +363,14 @@ template struct row_count_compare { __device__ bool operator()(cumulative_page_info const& a, cumulative_page_info const& b) { - return a.row_count < b.row_count; + return a.row_index < b.row_index; } }; +/** + * @brief return compressed and total size of the data in a row group + * + */ std::pair get_row_group_size(RowGroup const& rg) { auto compressed_size_iter = thrust::make_transform_iterator( @@ -397,6 +383,14 @@ std::pair get_row_group_size(RowGroup const& rg) return {compressed_size, total_size}; } +/** + * @brief For a set of cumulative_page_info data, adjust the size_bytes field + * such that it reflects the worst case for all pages that span the same rows. + * + * By doing this, we can now look at row X and know the total + * byte cost for all pages that span row X, not just the cost up to row X itself. 
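+ * (illustrative: if a page from column A spanning rows 0-100 costs 1 MB and a page from
+ * column B spanning rows 50-150 costs 2 MB, the adjusted size at any row in 50-100 is the
+ * full 3 MB, since a split there would keep both pages live)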
+ * + */ std::pair, rmm::device_uvector> adjust_cumulative_sizes(rmm::device_uvector const& c_info, cudf::detail::hostdevice_vector const& pages, @@ -456,10 +450,24 @@ adjust_cumulative_sizes(rmm::device_uvector const& c_info, return {std::move(aggregated_info), std::move(page_keys_by_split)}; } +/** + * @brief Find the first entry in the aggregated_info that corresponds to the specified row + * + */ +size_t find_start_index(std::vector const& aggregated_info, size_t min_row) +{ + auto start = thrust::make_transform_iterator( + aggregated_info.begin(), [&](cumulative_page_info const& i) { return i.row_index; }); + auto start_index = + thrust::lower_bound(thrust::host, start, start + aggregated_info.size(), min_row) - start; + if (aggregated_info[start_index].row_index == min_row) { start_index++; } + return start_index; +} + struct page_span { size_t start, end; }; -std::pair, size_t> compute_next_subpass( +std::tuple, size_t> compute_next_subpass( rmm::device_uvector const& c_info, cudf::detail::hostdevice_vector const& pages, cudf::detail::hostdevice_vector const& page_offsets, @@ -480,13 +488,12 @@ std::pair, size_t> compute_next_subpass( stream.synchronize(); // print_cumulative_row_info(h_aggregated_info, "adjusted"); - // first, find the min row - auto start = thrust::make_transform_iterator( - h_aggregated_info.begin(), [&](cumulative_page_info const& i) { return i.row_count; }); - auto const start_index = - thrust::upper_bound(thrust::host, start, start + h_aggregated_info.size(), min_row) - start; + // TODO: if the user has explicitly specified skip_rows/num_rows we could be more intelligent + // about skipping subpasses/pages that do not fall within the range of values, but only if the + // data does not contain lists (because our row counts are only estimates in that case) // find the next split + auto const start_index = find_start_index(h_aggregated_info, min_row); auto const cumulative_size = start_index == 0 ?
0 : h_aggregated_info[start_index - 1].size_bytes; auto const end_index = find_next_split(start_index, min_row, @@ -552,9 +559,11 @@ std::pair, size_t> compute_next_subpass( return {out, total_pages}; } -std::pair, rmm::device_uvector> compute_page_splits_by_row( +std::vector compute_page_splits_by_row( rmm::device_uvector const& c_info, cudf::detail::hostdevice_vector const& pages, + size_t skip_rows, + size_t num_rows, size_t size_limit, rmm::cuda_stream_view stream) { @@ -568,9 +577,28 @@ std::pair, rmm::device_uvector> compute_page_sp cudaMemcpyDefault, stream.value())); stream.synchronize(); + // print_cumulative_row_info(h_aggregated_info, "adjusted"); - // generate the actual splits - return {find_splits(h_aggregated_info, size_limit), std::move(page_keys_by_split)}; + std::vector splits; + // note: we are working with absolute row indices so skip_rows represents the absolute min row + // index we care about + size_t cur_pos = find_start_index(h_aggregated_info, skip_rows); + size_t cur_row_index = h_aggregated_info[cur_pos].row_index; + size_t cur_cumulative_size = 0; + auto const max_row = min(skip_rows + num_rows, h_aggregated_info.back().row_index); + while (cur_row_index < max_row) { + auto const split_pos = + find_next_split(cur_pos, cur_row_index, cur_cumulative_size, h_aggregated_info, size_limit); + + auto const start_row = cur_row_index; + cur_row_index = min(max_row, h_aggregated_info[split_pos].row_index); + splits.push_back({start_row, cur_row_index - start_row}); + cur_pos = split_pos; + cur_cumulative_size = h_aggregated_info[split_pos].size_bytes; + } + // print_cumulative_row_info(h_aggregated_info, "adjusted w/splits", splits); + + return splits; } /** @@ -920,7 +948,6 @@ void reader::impl::setup_next_pass(bool uses_custom_row_bounds) _file_itm_data.row_groups.begin() + row_group_end, pass.row_groups.begin()); - auto const num_passes = _file_itm_data.num_passes(); CUDF_EXPECTS(_file_itm_data._current_input_pass < num_passes, "Encountered an invalid read pass index"); @@ -940,17 +967,19 @@ void reader::impl::setup_next_pass(bool uses_custom_row_bounds) } else { auto const global_start_row = _file_itm_data.global_skip_rows; auto const global_end_row = global_start_row + _file_itm_data.global_num_rows; - auto const start_row = std::max( - _file_itm_data.input_pass_row_count[_file_itm_data._current_input_pass], global_start_row); + auto const start_row = + std::max(_file_itm_data.input_pass_start_row_count[_file_itm_data._current_input_pass], + global_start_row); auto const end_row = - std::min(_file_itm_data.input_pass_row_count[_file_itm_data._current_input_pass + 1], + std::min(_file_itm_data.input_pass_start_row_count[_file_itm_data._current_input_pass + 1], global_end_row); // skip_rows is always global in the sense that it is relative to the first row of // everything we will be reading, regardless of what pass we are on. // num_rows is how many rows we are reading this pass. pass.skip_rows = - global_start_row + _file_itm_data.input_pass_row_count[_file_itm_data._current_input_pass]; + global_start_row + + _file_itm_data.input_pass_start_row_count[_file_itm_data._current_input_pass]; pass.num_rows = end_row - start_row; } @@ -975,18 +1004,38 @@ void reader::impl::setup_next_pass(bool uses_custom_row_bounds) pass.decomp_dict_data = decompress_page_data(pass.chunks, pass.pages, true, _stream); } - // since there is only ever 1 dictionary per chunk (the 0th path), do it at the + // store off how much memory we've used so far. 
This includes the compressed page data and the + // decompressed dictionary data. we will subtract this from the available total memory for the + // subpasses + auto chunk_iter = + thrust::make_transform_iterator(pass.chunks.d_begin(), get_chunk_compressed_size{}); + pass.base_mem_size = + pass.decomp_dict_data.size() + + thrust::reduce(rmm::exec_policy(_stream), chunk_iter, chunk_iter + pass.chunks.size()); + + // since there is only ever 1 dictionary per chunk (the first page), do it at the // pass level. build_string_dict_indices(); - // compute subpasses for this pass using the page information we now have. - // compute_subpasses(); - /* - if (_output_chunk_read_limit == 0) { // read the whole file at once - CUDF_EXPECTS(_pass_itm_data->output_chunk_read_info.size() == 1, - "Reading the whole file should yield only one chunk."); + // if we are doing subpass reading, generate more accurate num_row estimates for list columns. + // this helps us to generate more accurate subpass splits. + if (_input_pass_read_limit != 0) { generate_list_column_row_count_estimates(); } + +#if defined(PARQUET_CHUNK_LOGGING) + printf("Pass: row_groups(%'lu), chunks(%'lu), pages(%'lu)\n", + pass.row_groups.size(), + pass.chunks.size(), + pass.pages.size()); + printf("\tskip_rows: %'lu\n", pass.skip_rows); + printf("\tnum_rows: %'lu\n", pass.num_rows); + printf("\tbase mem usage: %'lu\n", pass.base_mem_size); + auto const num_columns = _input_columns.size(); + for (size_t c_idx = 0; c_idx < num_columns; c_idx++) { + printf("\t\tColumn %'lu: num_pages(%'d)\n", + c_idx, + pass.page_offsets[c_idx + 1] - pass.page_offsets[c_idx]); } - */ +#endif _stream.synchronize(); } @@ -1000,7 +1049,17 @@ void reader::impl::setup_next_subpass(bool uses_custom_row_bounds) auto const num_columns = _input_columns.size(); - auto [page_indices, total_pages] = [&]() -> std::pair, size_t> { + // what do we do if the base memory size (the compressed data) itself is approaching or larger + // than the overall read limit? we are still going to be decompressing in subpasses, but we have + // to assume some reasonable minimum size needed to safely decompress a single subpass. so always + // reserve at least that much space. + size_t const remaining_read_limit = + _input_pass_read_limit == 0 ? 0 + : pass.base_mem_size + minimum_subpass_expected_size >= _input_pass_read_limit + ? minimum_subpass_expected_size + : _input_pass_read_limit - pass.base_mem_size; + + auto [page_indices, total_pages] = [&]() -> std::tuple, size_t> { // special case: if we contain no compressed data, or if we have no input limit, we can always // just do 1 subpass since what we already have loaded is all the temporary memory we will ever // use. @@ -1017,11 +1076,13 @@ void reader::impl::setup_next_subpass(bool uses_custom_row_bounds) } // otherwise we have to look forward and choose a batch of pages - // generate cumulative page sizes. 
+ // as subpasses get decoded, the initial estimates we have for list row counts + // get updated with accurate data, so regenerate cumulative size info and row + // indices rmm::device_uvector c_info(pass.pages.size(), _stream); auto page_keys = make_page_key_iterator(pass.pages); auto page_size = - thrust::make_transform_iterator(pass.pages.d_begin(), get_page_size{pass.chunks}); + thrust::make_transform_iterator(pass.pages.d_begin(), get_page_input_size{pass.chunks}); thrust::inclusive_scan_by_key(rmm::exec_policy(_stream), page_keys, page_keys + pass.pages.size(), @@ -1029,6 +1090,11 @@ void reader::impl::setup_next_subpass(bool uses_custom_row_bounds) c_info.begin(), thrust::equal_to{}, cumulative_page_sum{}); + auto iter = thrust::make_counting_iterator(0); + thrust::for_each(rmm::exec_policy(_stream), + iter, + iter + pass.pages.size(), + set_row_index{pass.chunks, pass.pages, c_info}); // print_cumulative_page_info(pass.pages, c_info, _stream); // get the next batch of pages @@ -1036,7 +1102,7 @@ void reader::impl::setup_next_subpass(bool uses_custom_row_bounds) pass.pages, pass.page_offsets, pass.processed_rows, - _input_pass_read_limit, + remaining_read_limit, num_columns, _stream); }(); @@ -1086,6 +1152,28 @@ void reader::impl::setup_next_subpass(bool uses_custom_row_bounds) // preprocess pages (computes row counts for lists, computes output chunks and computes // the actual row counts we will be able load out of this subpass) preprocess_subpass_pages(uses_custom_row_bounds, _output_chunk_read_limit); + +#if defined(PARQUET_CHUNK_LOGGING) + printf("\tSubpass: skip_rows(%'lu), num_rows(%'lu), remaining read limit(%'lu)\n", + subpass.skip_rows, + subpass.num_rows, + remaining_read_limit); + printf("\t\tDecompressed size: %'lu\n", subpass.decomp_page_data.size()); + printf("\t\tTotal expected usage: %'lu\n", subpass.decomp_page_data.size() + pass.base_mem_size); + for (size_t c_idx = 0; c_idx < num_columns; c_idx++) { + printf("\t\tColumn %'lu: pages(%'lu - %'lu)\n", + c_idx, + page_indices[c_idx].start, + page_indices[c_idx].end); + } + printf("\t\tOutput chunks:\n"); + for (size_t idx = 0; idx < subpass.output_chunk_read_info.size(); idx++) { + printf("\t\t\t%'lu: skip_rows(%'lu) num_rows(%'lu)\n", + idx, + subpass.output_chunk_read_info[idx].skip_rows, + subpass.output_chunk_read_info[idx].num_rows); + } +#endif } void reader::impl::create_global_chunk_info() @@ -1119,6 +1207,13 @@ void reader::impl::create_global_chunk_info() schema.converted_type, schema.type_length); + // for lists, estimate the number of bytes per row. this is used by the subpass reader to + // determine where to split the decompression boundaries + float const list_bytes_per_row_est = + schema.max_repetition_level > 0 ? 
static_cast(col_meta.total_uncompressed_size) / + static_cast(row_group.num_rows) + : 0.0f; + chunks.push_back(ColumnChunkDesc(col_meta.total_compressed_size, nullptr, col_meta.num_values, @@ -1137,7 +1232,8 @@ void reader::impl::create_global_chunk_info() schema.decimal_precision, clock_rate, i, - col.schema_idx)); + col.schema_idx, + list_bytes_per_row_est)); } remaining_rows -= row_group_rows; @@ -1154,6 +1250,14 @@ void reader::impl::compute_input_passes() if (_input_pass_read_limit == 0) { _file_itm_data.input_pass_row_group_offsets.push_back(0); _file_itm_data.input_pass_row_group_offsets.push_back(row_groups_info.size()); + _file_itm_data.input_pass_start_row_count.push_back(0); + auto rg_row_count = cudf::detail::make_counting_transform_iterator(0, [&](size_t i) { + auto const& rgi = row_groups_info[i]; + auto const& row_group = _metadata->get_row_group(rgi.index, rgi.source_index); + return row_group.num_rows; + }); + _file_itm_data.input_pass_start_row_count.push_back( + std::reduce(rg_row_count, rg_row_count + row_groups_info.size())); return; } @@ -1165,7 +1269,7 @@ void reader::impl::compute_input_passes() std::size_t cur_rg_start = 0; std::size_t cur_row_count = 0; _file_itm_data.input_pass_row_group_offsets.push_back(0); - _file_itm_data.input_pass_row_count.push_back(0); + _file_itm_data.input_pass_start_row_count.push_back(0); for (size_t cur_rg_index = 0; cur_rg_index < row_groups_info.size(); cur_rg_index++) { auto const& rgi = row_groups_info[cur_rg_index]; @@ -1182,14 +1286,14 @@ void reader::impl::compute_input_passes() // row group if (cur_rg_start == cur_rg_index) { _file_itm_data.input_pass_row_group_offsets.push_back(cur_rg_index + 1); - _file_itm_data.input_pass_row_count.push_back(cur_row_count + row_group.num_rows); + _file_itm_data.input_pass_start_row_count.push_back(cur_row_count + row_group.num_rows); cur_rg_start = cur_rg_index + 1; cur_pass_byte_size = 0; } // End the pass at the end of the previous row group else { _file_itm_data.input_pass_row_group_offsets.push_back(cur_rg_index); - _file_itm_data.input_pass_row_count.push_back(cur_row_count); + _file_itm_data.input_pass_start_row_count.push_back(cur_row_count); cur_rg_start = cur_rg_index; cur_pass_byte_size = compressed_rg_size; } @@ -1202,11 +1306,11 @@ void reader::impl::compute_input_passes() // add the last pass if necessary if (_file_itm_data.input_pass_row_group_offsets.back() != row_groups_info.size()) { _file_itm_data.input_pass_row_group_offsets.push_back(row_groups_info.size()); - _file_itm_data.input_pass_row_count.push_back(cur_row_count); + _file_itm_data.input_pass_start_row_count.push_back(cur_row_count); } } -void reader::impl::compute_chunks_for_subpass() +void reader::impl::compute_output_chunks_for_subpass() { auto& pass = *_pass_itm_data; auto& subpass = *pass.subpass; @@ -1217,11 +1321,10 @@ void reader::impl::compute_chunks_for_subpass() return; } - // generate cumulative row counts and sizes + // generate row_indices and cumulative output sizes for all pages rmm::device_uvector c_info(subpass.pages.size(), _stream); - // convert PageInfo to cumulative_page_info auto page_input = - thrust::make_transform_iterator(subpass.pages.d_begin(), get_cumulative_page_info{}); + thrust::make_transform_iterator(subpass.pages.d_begin(), get_page_output_size{}); auto page_keys = make_page_key_iterator(subpass.pages); thrust::inclusive_scan_by_key(rmm::exec_policy(_stream), page_keys, @@ -1230,22 +1333,16 @@ void reader::impl::compute_chunks_for_subpass() c_info.begin(), thrust::equal_to{}, 
cumulative_page_sum{}); + auto iter = thrust::make_counting_iterator(0); + thrust::for_each(rmm::exec_policy(_stream), + iter, + iter + subpass.pages.size(), + set_row_index{pass.chunks, subpass.pages, c_info}); // print_cumulative_page_info(subpass.pages, c_info, _stream); // compute the splits - auto [splits, _] = - compute_page_splits_by_row(c_info, subpass.pages, _output_chunk_read_limit, _stream); - subpass.output_chunk_read_info.reserve(splits.size()); - - // apply skip_rows from the subpass - std::transform(splits.begin(), - splits.end(), - std::back_inserter(subpass.output_chunk_read_info), - [&subpass](split_info const& s) { - row_range r = s.rows; - r.skip_rows += subpass.skip_rows; - return r; - }); + subpass.output_chunk_read_info = compute_page_splits_by_row( + c_info, subpass.pages, subpass.skip_rows, subpass.num_rows, _output_chunk_read_limit, _stream); } } // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl_chunking.hpp b/cpp/src/io/parquet/reader_impl_chunking.hpp index 3e654b086bf..4f0027098e6 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.hpp +++ b/cpp/src/io/parquet/reader_impl_chunking.hpp @@ -37,14 +37,14 @@ struct file_intermediate_data { // the start/end of the chunks to be loaded for a given pass. std::vector input_pass_row_group_offsets{}; - // row counts per input-pass - std::vector input_pass_row_count{}; + // start row counts per input-pass. this includes all rows in the row groups of the pass and + // is not capped by global_skip_rows and global_num_rows. + std::vector input_pass_start_row_count{}; size_t _current_input_pass{0}; // current input pass index size_t _output_chunk_count{0}; // how many output chunks we have produced - // skip_rows/num_rows values for the entire file. these need to be adjusted per-pass because we - // may not be visiting every row group that contains these bounds + // skip_rows/num_rows values for the entire file. size_t global_skip_rows; size_t global_num_rows; @@ -81,11 +81,18 @@ struct subpass_intermediate_data { std::vector output_chunk_read_info; std::size_t current_output_chunk{0}; - // skip_rows and num_rows values for this particular subpass. + // skip_rows and num_rows values for this particular subpass. in absolute + // row indices. size_t skip_rows; size_t num_rows; }; +struct cumulative_page_info { + size_t row_index; // row index + size_t size_bytes; // cumulative size in bytes + int key; // schema index +}; + /** * @brief Struct to store pass-level data that remains constant for a single pass. * @@ -101,6 +108,10 @@ struct pass_intermediate_data { cudf::detail::hostdevice_vector chunks{}; cudf::detail::hostdevice_vector pages{}; + // base memory used for the pass itself (compressed data in the loaded chunks and any + // decompressed dictionary pages) + size_t base_mem_size{0}; + // offsets to each group of input pages (by column/schema, indexed by _input_columns.size()) // so if we had 2 columns/schemas, with page keys // diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index 69abbb19123..4c17aee5916 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -454,7 +454,35 @@ struct set_str_dict_index_ptr { } }; -} // namespace +/** + * @brief Functor which computes an estimated row count for list pages. 
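+ * (illustrative: a 1 MiB page in a chunk estimated at ~100 bytes per row is treated as
+ * roughly 10,000 rows until the page is actually decoded)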
+ * + */ +struct set_list_row_count_estimate { + device_span chunks; + + __device__ void operator()(PageInfo& page) + { + if (page.flags & PAGEINFO_FLAGS_DICTIONARY) { return; } + auto const& chunk = chunks[page.chunk_idx]; + auto const is_list = chunk.max_level[level_type::REPETITION] > 0; + if (!is_list) { return; } + + // For LIST pages that we have not yet decoded, page.num_rows is not an accurate number. + // so we instead estimate the number of rows as follows: + // - each chunk stores an estimated number of bytes per row E + // - estimate number of rows in a page = page.uncompressed_page_size / E + // + // it is not required that this number is accurate. we just want it to be somewhat close so that + // we get reasonable results as we choose subpass splits. + // + // all other columns can use page.num_rows directly as it will be accurate. + page.num_rows = static_cast(static_cast(page.uncompressed_page_size) / + chunk.list_bytes_per_row_est); + } +}; + +} // anonymous namespace void reader::impl::build_string_dict_indices() { @@ -923,6 +951,30 @@ struct page_offset_output_iter { __device__ reference operator*() { return p->str_offset; } }; +// update chunk_row field in subpass page from pass page +struct update_subpass_chunk_row { + device_span pass_pages; + device_span subpass_pages; + device_span page_src_index; + + __device__ void operator()(size_t i) + { + subpass_pages[i].chunk_row = pass_pages[page_src_index[i]].chunk_row; + } +}; + +// update num_rows field from pass page to subpass page +struct update_pass_num_rows { + device_span pass_pages; + device_span subpass_pages; + device_span page_src_index; + + __device__ void operator()(size_t i) + { + pass_pages[page_src_index[i]].num_rows = subpass_pages[i].num_rows; + } +}; + } // anonymous namespace void reader::impl::preprocess_file( @@ -956,32 +1008,55 @@ void reader::impl::preprocess_file( compute_input_passes(); } +#if defined(PARQUET_CHUNK_LOGGING) + printf("==============================================\n"); + setlocale(LC_NUMERIC, ""); + printf("File: skip_rows(%'lu), num_rows(%'lu), input_read_limit(%'lu), output_read_limit(%'lu)\n", + _file_itm_data.global_skip_rows, + _file_itm_data.global_num_rows, + _input_pass_read_limit, + _output_chunk_read_limit); + printf("# Row groups: %'lu\n", _file_itm_data.row_groups.size()); + printf("# Input passes: %'lu\n", _file_itm_data.num_passes()); + printf("# Input columns: %'lu\n", _input_columns.size()); + for (size_t idx = 0; idx < _input_columns.size(); idx++) { + auto const& schema = _metadata->get_schema(_input_columns[idx].schema_idx); + auto const type_id = to_type_id(schema, _strings_to_categorical, _timestamp_type.id()); + printf("\tC(%'lu, %s): %s\n", + idx, + _input_columns[idx].name.c_str(), + cudf::type_to_name(cudf::data_type{type_id}).c_str()); + } + printf("# Output columns: %'lu\n", _output_buffers.size()); + for (size_t idx = 0; idx < _output_buffers.size(); idx++) { + printf("\tC(%'lu): %s\n", idx, cudf::io::detail::type_to_name(_output_buffers[idx]).c_str()); + } +#endif + _file_preprocessed = true; } -// update chunk_row field in subpass page from pass page -struct update_subpass_chunk_row { - device_span pass_pages; - device_span subpass_pages; - device_span page_src_index; - - __device__ void operator()(size_t i) - { - subpass_pages[i].chunk_row = pass_pages[page_src_index[i]].chunk_row; - } -}; - -// update num_rows field from pass page to subpass page -struct update_pass_num_rows { - device_span pass_pages; - device_span subpass_pages; - device_span 
page_src_index; +void reader::impl::generate_list_column_row_count_estimates() +{ + auto& pass = *_pass_itm_data; + thrust::for_each(rmm::exec_policy(_stream), + pass.pages.d_begin(), + pass.pages.d_end(), + set_list_row_count_estimate{pass.chunks}); - __device__ void operator()(size_t i) - { - pass_pages[page_src_index[i]].num_rows = subpass_pages[i].num_rows; - } -}; + // computes: + // PageInfo::chunk_row (the chunk-relative row index) for all pages in the pass. The start_row + // field in ColumnChunkDesc is the absolute row index for the whole file. chunk_row in PageInfo is + // relative to the beginning of the chunk. so in the kernels, chunk.start_row + page.chunk_row + // gives us the absolute row index + auto key_input = thrust::make_transform_iterator(pass.pages.d_begin(), get_page_chunk_idx{}); + auto page_input = thrust::make_transform_iterator(pass.pages.d_begin(), get_page_num_rows{}); + thrust::exclusive_scan_by_key(rmm::exec_policy(_stream), + key_input, + key_input + pass.pages.size(), + page_input, + chunk_row_output_iter{pass.pages.device_ptr()}); +} void reader::impl::preprocess_subpass_pages(bool uses_custom_row_bounds, size_t chunk_read_limit) { @@ -1045,10 +1120,7 @@ void reader::impl::preprocess_subpass_pages(bool uses_custom_row_bounds, size_t // PageInfo::chunk_row (the chunk-relative row index) for all pages in the pass. The start_row // field in ColumnChunkDesc is the absolute row index for the whole file. chunk_row in PageInfo is // relative to the beginning of the chunk. so in the kernels, chunk.start_row + page.chunk_row - // gives us the absolute row index. NOTE: this is recomputing chunk_row for -all- pages in the - // pass, not just the pages in the current subpass. the reason we do this is that we may visit - // the same page multiple times over multiple subpasses (if we didn't process all rows in a given - // subpass). this greatly simplifies the logic. + // gives us the absolute row index auto key_input = thrust::make_transform_iterator(pass.pages.d_begin(), get_page_chunk_idx{}); auto page_input = thrust::make_transform_iterator(pass.pages.d_begin(), get_page_num_rows{}); thrust::exclusive_scan_by_key(rmm::exec_policy(_stream), @@ -1057,7 +1129,7 @@ void reader::impl::preprocess_subpass_pages(bool uses_custom_row_bounds, size_t page_input, chunk_row_output_iter{pass.pages.device_ptr()}); - // finally, copy chunk row into the subpass. + // copy chunk row into the subpass pages thrust::for_each(rmm::exec_policy(_stream), iter, iter + subpass.pages.size(), @@ -1072,11 +1144,26 @@ void reader::impl::preprocess_subpass_pages(bool uses_custom_row_bounds, size_t // rows as the smallest batch (by column) we have decompressed. size_t page_index = 0; size_t max_row = std::numeric_limits::max(); + auto const last_pass_row = + _file_itm_data.input_pass_start_row_count[_file_itm_data._current_input_pass + 1]; for (size_t idx = 0; idx < subpass.column_page_count.size(); idx++) { auto const& last_page = subpass.pages[page_index + (subpass.column_page_count[idx] - 1)]; auto const& chunk = pass.chunks[last_page.chunk_idx]; - max_row = - min(max_row, static_cast(chunk.start_row + last_page.chunk_row + last_page.num_rows)); + + size_t max_page_row = + static_cast(chunk.start_row + last_page.chunk_row + last_page.num_rows); + // special case. list rows can span page boundaries, but we can't tell if that is happening + // here because we have not yet decoded the pages. the very last row starting in the page may + // not terminate in the page. 
to handle this, only decode up to the second to last row in the + // page, since we know that row will safely complete. + bool const is_list = chunk.max_level[level_type::REPETITION] > 0; + if (is_list && max_page_row < last_pass_row) { + CUDF_EXPECTS(last_page.num_rows > 1, "Unexpected short list page"); + max_page_row--; + } + + max_row = min(max_row, max_page_row); + page_index += subpass.column_page_count[idx]; } subpass.skip_rows = pass.skip_rows + pass.processed_rows; @@ -1085,7 +1172,7 @@ void reader::impl::preprocess_subpass_pages(bool uses_custom_row_bounds, size_t subpass.num_rows = max_row - subpass.skip_rows; // now split up the output into chunks as necessary - compute_chunks_for_subpass(); + compute_output_chunks_for_subpass(); } void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses_custom_row_bounds) diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp index dd049d401cf..cbdf6f52d62 100644 --- a/cpp/src/io/utilities/column_buffer.cpp +++ b/cpp/src/io/utilities/column_buffer.cpp @@ -26,6 +26,9 @@ #include +#include +#include + namespace cudf::io::detail { void gather_column_buffer::allocate_strings_data(rmm::cuda_stream_view stream) @@ -145,6 +148,30 @@ string_policy column_buffer_base::empty_like(string_policy const& return new_buff; } +template +std::string type_to_name(column_buffer_base const& buffer, bool include_nesting) +{ + if (buffer.type.id() == cudf::type_id::LIST) { + return "List<" + (type_to_name(buffer.children[0], true)) + ">"; + } + + if (buffer.type.id() == cudf::type_id::STRUCT) { + std::ostringstream out; + + out << "Struct<"; + auto iter = thrust::make_counting_iterator(0); + std::transform( + iter, + iter + buffer.children.size(), + std::ostream_iterator(out, ","), + [&buffer](size_type i) { return type_to_name(buffer.children[i], true); }); + out << ">"; + return out.str(); + } + + return cudf::type_to_name(buffer.type); +} + template std::unique_ptr make_column(column_buffer_base& buffer, column_name_info* schema_info, @@ -353,6 +380,12 @@ template std::unique_ptr empty_like(pointer_column_buffer& rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); +template std::string type_to_name(string_column_buffer const& buffer, + bool include_nesting); +template std::string type_to_name(pointer_column_buffer const& buffer, + bool include_nesting); + template class column_buffer_base; template class column_buffer_base; + } // namespace cudf::io::detail diff --git a/cpp/src/io/utilities/column_buffer.hpp b/cpp/src/io/utilities/column_buffer.hpp index 2ee7c17e480..b54e2714ef9 100644 --- a/cpp/src/io/utilities/column_buffer.hpp +++ b/cpp/src/io/utilities/column_buffer.hpp @@ -253,6 +253,10 @@ std::unique_ptr empty_like(column_buffer_base& buffer, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); +template +std::string type_to_name(column_buffer_base const& buffer, + bool include_nesting = true); + } // namespace detail } // namespace io } // namespace cudf From 953e539f46edf64a52e4b8fe7ab127f2dd2f453a Mon Sep 17 00:00:00 2001 From: db Date: Sun, 3 Dec 2023 14:58:02 -0600 Subject: [PATCH 21/49] Fix a small bug in output chunk computation.
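To make the walk concrete, here is a minimal host-side sketch of the corrected loop. The types and names (page_cumulative, split_by_size) are simplified stand-ins for the reader's cumulative_page_info machinery, not the actual implementation:

#include <algorithm>
#include <cstddef>
#include <vector>

struct page_cumulative {
  std::size_t row_index;   // absolute row index at which this page's rows end
  std::size_t size_bytes;  // cumulative output size up to and including this page
};

struct row_range {
  std::size_t skip_rows, num_rows;
};

// assumes a non-empty, row-ordered page list
std::vector<row_range> split_by_size(std::vector<page_cumulative> const& pages,
                                     std::size_t skip_rows,
                                     std::size_t num_rows,
                                     std::size_t size_limit)
{
  std::vector<row_range> splits;
  // the fix: start the walk at skip_rows itself, not at the row index of the
  // aggregated entry containing it
  std::size_t cur_row  = skip_rows;
  std::size_t cur_size = 0;
  std::size_t pos      = 0;
  std::size_t const max_row = std::min(skip_rows + num_rows, pages.back().row_index);
  while (cur_row < max_row) {
    // greedily take pages while the accumulated output size stays within the limit
    std::size_t end = pos;
    while (end + 1 < pages.size() && pages[end + 1].size_bytes - cur_size <= size_limit) { ++end; }
    // pages ending on the same row cannot form a split point; keep moving forward
    while (end + 1 < pages.size() && pages[end].row_index <= cur_row) { ++end; }
    std::size_t const next_row = std::min(max_row, pages[end].row_index);
    splits.push_back({cur_row, next_row - cur_row});
    cur_row  = next_row;
    cur_size = pages[end].size_bytes;
    pos      = end + 1;
  }
  return splits;
}

The essential change is the initialization of cur_row: starting it at the containing entry's row index rather than at skip_rows could drop the leading rows of the first output chunk whenever skip_rows fell mid-page.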
--- cpp/src/io/parquet/reader_impl_chunking.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index acb56adafcd..eb4cdb723fd 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -583,7 +583,7 @@ std::vector compute_page_splits_by_row( // note: we are working with absolute row indices so skip_rows represents the absolute min row // index we care about size_t cur_pos = find_start_index(h_aggregated_info, skip_rows); - size_t cur_row_index = h_aggregated_info[cur_pos].row_index; + size_t cur_row_index = skip_rows; size_t cur_cumulative_size = 0; auto const max_row = min(skip_rows + num_rows, h_aggregated_info.back().row_index); while (cur_row_index < max_row) { From 8ef72a655e3aaf90055011876887c6938e900e2f Mon Sep 17 00:00:00 2001 From: db Date: Sun, 3 Dec 2023 21:46:46 -0600 Subject: [PATCH 22/49] Fixed a missing stream in a cudaMemcpyAsync call. Switched some tests to use ZSTD instead of SNAPPY as I think I've found an nvcomp bug. --- cpp/src/io/parquet/reader_impl_chunking.cu | 5 +++-- cpp/tests/io/parquet_chunked_reader_test.cpp | 8 ++++---- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index eb4cdb723fd..cf1c4754ada 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -484,7 +484,7 @@ std::tuple, size_t> compute_next_subpass( aggregated_info.data(), sizeof(cumulative_page_info) * c_info.size(), cudaMemcpyDefault, - stream.value())); + stream)); stream.synchronize(); @@ -537,7 +537,8 @@ std::tuple, size_t> compute_next_subpass( cudaMemcpyAsync(h_page_counts.data(), page_counts.data(), sizeof(size_t) * num_columns, - cudaMemcpyDeviceToHost); + cudaMemcpyDeviceToHost, + stream); stream.synchronize(); return h_page_counts; }; diff --git a/cpp/tests/io/parquet_chunked_reader_test.cpp b/cpp/tests/io/parquet_chunked_reader_test.cpp index f8d2a5e98e1..41151d1e9ef 100644 --- a/cpp/tests/io/parquet_chunked_reader_test.cpp +++ b/cpp/tests/io/parquet_chunked_reader_test.cpp @@ -1038,7 +1038,7 @@ void sub_rowgroup_test(std::string const& filepath, { cudf::io::parquet_writer_options out_opts = cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, t) - .compression(cudf::io::compression_type::SNAPPY) + .compression(cudf::io::compression_type::ZSTD) .dictionary_policy(cudf::io::dictionary_policy::NEVER); cudf::io::write_parquet(out_opts); @@ -1062,7 +1062,7 @@ void sub_rowgroup_test(std::string const& filepath, { cudf::io::parquet_writer_options out_opts = cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, t) - .compression(cudf::io::compression_type::SNAPPY) + .compression(cudf::io::compression_type::ZSTD) .dictionary_policy(cudf::io::dictionary_policy::ALWAYS); cudf::io::write_parquet(out_opts); @@ -1073,7 +1073,7 @@
TEST_F(ParquetChunkedSubRowgroupReaderTest, SingleFixedWidthColumnNoSplits) { - auto filepath = std::string("table_with_dict.parquet"); + auto filepath = temp_env->get_temp_filepath("table_with_dict.parquet"); constexpr auto num_rows = 100; auto iter1 = cudf::detail::make_counting_transform_iterator(0, [](int i) { return 15; }); cudf::test::fixed_width_column_wrapper col1(iter1, iter1 + num_rows); @@ -1083,7 +1083,7 @@ TEST_F(ParquetChunkedSubRowgroupReaderTest, SingleFixedWidthColumnNoSplits) TEST_F(ParquetChunkedSubRowgroupReaderTest, MultipleFixedWidthColumns) { - auto filepath = std::string("multiple_col_fixed_width.parquet"); + auto filepath = temp_env->get_temp_filepath("multiple_col_fixed_width.parquet"); constexpr auto num_rows = 200000; auto iter1 = thrust::make_counting_iterator(0); From 54a8c0255cc6b3cd8113f1ec76788171d7da4078 Mon Sep 17 00:00:00 2001 From: db Date: Thu, 7 Dec 2023 00:41:08 -0600 Subject: [PATCH 23/49] Changed the mechanism with which we collect pages after determining subpass splits - the old method had some broken edge cases with columns of highly variable page counts. --- cpp/src/io/parquet/reader_impl_chunking.cu | 212 ++++++++++--------- cpp/src/io/parquet/reader_impl_preprocess.cu | 44 +++- 2 files changed, 158 insertions(+), 98 deletions(-) diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index cf1c4754ada..7e1862cc189 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -54,6 +54,7 @@ constexpr size_t minimum_subpass_expected_size = 200 * 1024 * 1024; #if defined(CHUNKING_DEBUG) void print_cumulative_page_info(cudf::detail::hostdevice_vector& pages, + cudf::detail::hostdevice_vector& chunks, rmm::device_uvector const& c_info, rmm::cuda_stream_view stream) { @@ -79,7 +80,12 @@ void print_cumulative_page_info(cudf::detail::hostdevice_vector& pages if (page.flags & PAGEINFO_FLAGS_DICTIONARY || page.src_col_schema != schemas[idx]) { continue; } - printf("\tP: {%lu, %lu, %lu}\n", pidx, h_cinfo[pidx].row_index, h_cinfo[pidx].size_bytes); + bool const is_list = chunks[page.chunk_idx].max_level[level_type::REPETITION] > 0; + printf("\tP %s: {%lu, %lu, %lu}\n", + is_list ? "(L)" : "", + pidx, + h_cinfo[pidx].row_index, + h_cinfo[pidx].size_bytes); } } } @@ -285,32 +291,54 @@ struct get_chunk_compressed_size { __device__ size_t operator()(ColumnChunkDesc const& chunk) { return chunk.compressed_size; } }; +/** + * @brief Find the first entry in the aggregated_info that corresponds to the specified row + * + */ +size_t find_start_index(std::vector const& aggregated_info, size_t start_row) +{ + auto start = thrust::make_transform_iterator( + aggregated_info.begin(), [&](cumulative_page_info const& i) { return i.row_index; }); + auto start_index = + thrust::lower_bound(thrust::host, start, start + aggregated_info.size(), start_row) - start; + + // cumulative_page_info.row_index is the -end- of the rows of a given page.
so move forward until + // we find the next group of pages + while (start_index < (static_cast(aggregated_info.size()) - 1) && + (start_index < 0 || aggregated_info[start_index].row_index == start_row)) { + start_index++; + } + + return start_index; +} + +/** + * @brief Given a current position and row index, find the next split based on the + * specified size limit + * + */ int64_t find_next_split(int64_t cur_pos, size_t cur_row_index, size_t cur_cumulative_size, std::vector const& sizes, - size_t chunk_read_limit) + size_t size_limit) { auto start = thrust::make_transform_iterator(sizes.begin(), [&](cumulative_page_info const& i) { return i.size_bytes - cur_cumulative_size; }); auto end = start + sizes.size(); - int64_t split_pos = - thrust::lower_bound(thrust::seq, start + cur_pos, end, chunk_read_limit) - start; + int64_t split_pos = thrust::lower_bound(thrust::seq, start + cur_pos, end, size_limit) - start; // if we're past the end, or if the returned bucket is > than the chunk_read_limit, move back // one. if (static_cast(split_pos) >= sizes.size() || - (sizes[split_pos].size_bytes - cur_cumulative_size > chunk_read_limit)) { + (sizes[split_pos].size_bytes - cur_cumulative_size > size_limit)) { split_pos--; } - // best-try. if we can't find something that'll fit, we have to go bigger. we're doing this in - // a loop because all of the cumulative sizes for all the pages are sorted into one big list. - // so if we had two columns, both of which had an entry {1000, 10000}, that entry would be in - // the list twice. so we have to iterate until we skip past all of them. The idea is that we - // either do this, or we have to call unique() on the input first. + // cumulative_page_info.row_index is the -end- of the rows of a given page. so move forward until + // we find the next group of pages while (split_pos < (static_cast(sizes.size()) - 1) && (split_pos < 0 || sizes[split_pos].row_index == cur_row_index)) { split_pos++; @@ -409,13 +437,6 @@ adjust_cumulative_sizes(rmm::device_uvector const& c_info, page_keys_by_split.begin(), [] __device__(cumulative_page_info const& c) { return c.key; }); - std::vector h_c_info_sorted(c_info_sorted.size()); - CUDF_CUDA_TRY(cudaMemcpy(h_c_info_sorted.data(), - c_info_sorted.data(), - sizeof(cumulative_page_info) * c_info_sorted.size(), - cudaMemcpyDefault)); - // print_cumulative_row_info(h_c_info_sorted, "raw"); - // generate key offsets (offsets to the start of each partition of keys). 
worst case is 1 page per // key rmm::device_uvector key_offsets(pages.size() + 1, stream); @@ -450,28 +471,61 @@ adjust_cumulative_sizes(rmm::device_uvector const& c_info, return {std::move(aggregated_info), std::move(page_keys_by_split)}; } +struct page_span { + size_t start, end; +}; + +struct get_page_row_index { + device_span c_info; + + __device__ size_t operator()(size_t i) { return c_info[i].row_index; } +}; + /** - * @brief Find the first entry in the aggreggated_info that corresponds to the specified row + * @brief Return the span of page indices for a given column index that spans start_row and end_row * */ -size_t find_start_index(std::vector const& aggregated_info, size_t min_row) -{ - auto start = thrust::make_transform_iterator( - aggregated_info.begin(), [&](cumulative_page_info const& i) { return i.row_index; }); - auto start_index = - thrust::lower_bound(thrust::host, start, start + aggregated_info.size(), min_row) - start; - if (aggregated_info[start_index].row_index == min_row) { start_index++; } - return start_index; -} +template +struct get_page_span { + device_span page_offsets; + RowIndexIter page_row_index; + size_t const start_row; + size_t const end_row; + + get_page_span(device_span _page_offsets, + RowIndexIter _page_row_index, + size_t _start_row, + size_t _end_row) + : page_offsets(_page_offsets), + page_row_index(_page_row_index), + start_row(_start_row), + end_row(_end_row) + { + } -struct page_span { - size_t start, end; + __device__ page_span operator()(size_t column_index) + { + auto const column_page_start = page_row_index + page_offsets[column_index]; + auto const column_page_end = page_row_index + page_offsets[column_index + 1]; + auto const num_pages = column_page_end - column_page_start; + auto start_page = + thrust::lower_bound(thrust::seq, column_page_start, column_page_end, start_row) - + column_page_start; + if (page_row_index[start_page] == start_row) { start_page++; } + auto end_page = thrust::lower_bound(thrust::seq, column_page_start, column_page_end, end_row) - + column_page_start; + if (end_page < num_pages) { end_page++; } + + return {static_cast(start_page + page_offsets[column_index]), + static_cast(end_page + page_offsets[column_index])}; + } }; + std::tuple, size_t> compute_next_subpass( rmm::device_uvector const& c_info, cudf::detail::hostdevice_vector const& pages, cudf::detail::hostdevice_vector const& page_offsets, - size_t min_row, + size_t start_row, size_t size_limit, size_t num_columns, rmm::cuda_stream_view stream) @@ -483,81 +537,46 @@ std::tuple, size_t> compute_next_subpass( CUDF_CUDA_TRY(cudaMemcpyAsync(h_aggregated_info.data(), aggregated_info.data(), sizeof(cumulative_page_info) * c_info.size(), - cudaMemcpyDefault, + cudaMemcpyDeviceToHost, stream)); stream.synchronize(); // print_cumulative_row_info(h_aggregated_info, "adjusted"); // TODO: if the user has explicitly specified skip_rows/num_rows we could be more intelligent // about skipping subpasses/pages that do not fall within the range of values, but only if the - // data does not contain lists (because our row counts are only estimates in that case)/ + // data does not contain lists (because our row counts are only estimates in that case) // find the next split - auto const start_index = find_start_index(h_aggregated_info, min_row); - auto const cumulative_size = start_index == 0 ? 
0 : h_aggregated_info[start_index - 1].size_bytes; - auto const end_index = find_next_split(start_index, - min_row, - cumulative_size, - h_aggregated_info, - size_limit) + - 1; // the split index returned is inclusive - - // get the number of pages for each column/schema - auto get_page_counts = [num_columns, stream]( - rmm::device_uvector const& aggregated_info, - int start_index, - int end_index) { - std::vector h_page_counts(num_columns); - - auto const num_pages = end_index - start_index; - if (num_pages == 0) { - std::fill(h_page_counts.begin(), h_page_counts.end(), 0); - return h_page_counts; - } - - rmm::device_uvector page_keys(num_pages, stream); - thrust::transform(rmm::exec_policy(stream), - aggregated_info.begin() + start_index, - aggregated_info.begin() + end_index, - page_keys.begin(), - [] __device__(cumulative_page_info const& i) { return i.key; }); - thrust::sort(rmm::exec_policy(stream), page_keys.begin(), page_keys.end()); - rmm::device_uvector page_counts(num_pages, stream); - auto page_counts_end = thrust::reduce_by_key(rmm::exec_policy(stream), - page_keys.begin(), - page_keys.end(), - thrust::make_constant_iterator(1), - thrust::make_discard_iterator(), - page_counts.begin()) - .second; - auto const num_page_counts = page_counts_end - page_counts.begin(); - CUDF_EXPECTS(static_cast(num_page_counts) == num_columns, - "Encountered a mismatch in column/schema counts while computing subpass split"); - - cudaMemcpyAsync(h_page_counts.data(), - page_counts.data(), - sizeof(size_t) * num_columns, - cudaMemcpyDeviceToHost, - stream); - stream.synchronize(); - return h_page_counts; - }; - - // get count of pages before this split and in this split. - auto last_counts = get_page_counts(aggregated_info, 0, start_index); - auto this_counts = get_page_counts(aggregated_info, start_index, end_index); + auto const start_index = find_start_index(h_aggregated_info, start_row); + auto const cumulative_size = start_row == 0 ? 
0 : h_aggregated_info[start_index].size_bytes; + auto const end_index = + find_next_split(start_index, start_row, cumulative_size, h_aggregated_info, size_limit); + auto const end_row = h_aggregated_info[end_index].row_index; + + // for each column, collect the set of pages that spans start_row / end_row + rmm::device_uvector page_bounds(num_columns, stream); + auto iter = thrust::make_counting_iterator(size_t{0}); + auto page_row_index = + cudf::detail::make_counting_transform_iterator(0, get_page_row_index{c_info}); + thrust::transform(rmm::exec_policy(stream), + iter, + iter + num_columns, + page_bounds.begin(), + get_page_span{page_offsets, page_row_index, start_row, end_row}); + std::vector h_page_bounds(num_columns); + cudaMemcpyAsync(h_page_bounds.data(), + page_bounds.data(), + sizeof(page_span) * num_columns, + cudaMemcpyDeviceToHost, + stream); + stream.synchronize(); - // convert to page spans - std::vector out(num_columns); - size_t total_pages = 0; - for (size_t c_idx = 0; c_idx < num_columns; c_idx++) { - // add page_offsets to get proper indices into the pages array - out[c_idx].start = (last_counts[c_idx]) + page_offsets[c_idx]; - out[c_idx].end = (last_counts[c_idx] + this_counts[c_idx]) + page_offsets[c_idx]; - total_pages += this_counts[c_idx]; - } + // total page count over all columns + auto page_count_iter = thrust::make_transform_iterator( + h_page_bounds.begin(), [](page_span const& s) { return s.end - s.start; }); + size_t const total_pages = std::reduce(page_count_iter, page_count_iter + num_columns); - return {out, total_pages}; + return {h_page_bounds, total_pages}; } std::vector compute_page_splits_by_row( @@ -1091,12 +1110,13 @@ void reader::impl::setup_next_subpass(bool uses_custom_row_bounds) c_info.begin(), thrust::equal_to{}, cumulative_page_sum{}); + auto iter = thrust::make_counting_iterator(0); thrust::for_each(rmm::exec_policy(_stream), iter, iter + pass.pages.size(), set_row_index{pass.chunks, pass.pages, c_info}); - // print_cumulative_page_info(pass.pages, c_info, _stream); + // print_cumulative_page_info(pass.pages, pass.chunks, c_info, _stream); // get the next batch of pages return compute_next_subpass(c_info, diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index 958c484b70b..08583bfe3e7 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -498,6 +498,26 @@ struct set_list_row_count_estimate { } }; +/** + * @brief Set the expected row count on the final page for all columns. + * + */ +struct set_final_row_count { + device_span pages; + device_span chunks; + device_span page_offsets; + size_t const max_row; + + __device__ void operator()(size_t i) + { + auto const last_page_index = page_offsets[i + 1] - 1; + auto const& page = pages[last_page_index]; + auto const& chunk = chunks[page.chunk_idx]; + size_t const page_start_row = chunk.start_row + page.chunk_row; + pages[last_page_index].num_rows = max_row - page_start_row; + } +}; + } // anonymous namespace void reader::impl::build_string_dict_indices() @@ -886,6 +906,9 @@ struct chunk_row_output_iter { __device__ reference operator*() { return p->chunk_row; } }; +/** + * @brief Writes to the page_start_value field of the PageNestingInfo struct, keyed by schema. + */ /** * @brief Writes to the page_start_value field of the PageNestingInfo struct, keyed by schema. 
@@ -972,7 +995,6 @@ struct page_offset_output_iter {
   __device__ reference operator[](int i) { return p[i].str_offset; }
   __device__ reference operator*() { return p->str_offset; }
 };
-
 // update chunk_row field in subpass page from pass page
 struct update_subpass_chunk_row {
   device_span pass_pages;
@@ -1078,6 +1100,22 @@ void reader::impl::generate_list_column_row_count_estimates()
     key_input + pass.pages.size(),
     page_input,
     chunk_row_output_iter{pass.pages.device_ptr()});
+
+  // finally, fudge the last page for each column such that it ends on the real known row count
+  // for the pass. this is so that as we march through the subpasses, we will find that every column
+  // cleanly ends at the expected row count at the row group boundary.
+  auto const& last_chunk = pass.chunks[pass.chunks.size() - 1];
+  auto const num_columns = _input_columns.size();
+  size_t const max_row   = last_chunk.start_row + last_chunk.num_rows;
+  auto iter              = thrust::make_counting_iterator(0);
+  thrust::for_each(rmm::exec_policy(_stream),
+                   iter,
+                   iter + num_columns,
+                   set_final_row_count{pass.pages, pass.chunks, pass.page_offsets, max_row});
+
+  pass.chunks.device_to_host_async(_stream);
+  pass.pages.device_to_host_async(_stream);
+  _stream.synchronize();
 }
 
 void reader::impl::preprocess_subpass_pages(bool uses_custom_row_bounds, size_t chunk_read_limit)
@@ -1158,7 +1196,9 @@ void reader::impl::preprocess_subpass_pages(bool uses_custom_row_bounds, size_t
                    update_subpass_chunk_row{pass.pages, subpass.pages, subpass.page_src_index});
 
   // retrieve pages back
-  subpass.pages.device_to_host_sync(_stream);
+  pass.pages.device_to_host_async(_stream);
+  subpass.pages.device_to_host_async(_stream);
+  _stream.synchronize();
 
   // at this point we have an accurate row count so we can compute how many rows we will actually be
   // able to decode for this pass. we will have selected a set of pages for each column in the

From 1bff089fdbdcb7f35ba9d33b61b27c27c57adce9 Mon Sep 17 00:00:00 2001
From: db
Date: Thu, 7 Dec 2023 15:48:32 -0600
Subject: [PATCH 24/49] Fixed an indexing issue (not using column-relative
 indices) in the page collection code during subpass computation.
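
The root cause: within get_page_span, thrust::lower_bound searches only the
current column's slice of the global page row-index sequence, so the offset it
yields is column-relative. It must be shifted by the column's first page index
(page_offsets[column_index]) before it can be used to index the flattened pages
array shared by all columns. A minimal sketch of the corrected pattern
(illustrative only; row_index stands in for the page_row_index iterator used in
the diff below):

    // pages for all columns live in one flat array; page_offsets[c] is the
    // index of column c's first page within that array
    int const first = page_offsets[c];
    int const last  = page_offsets[c + 1];
    // lower_bound over just this column's rows yields a COLUMN-RELATIVE offset...
    int const relative =
      thrust::lower_bound(thrust::seq, row_index + first, row_index + last, start_row) -
      (row_index + first);
    // ...which must be shifted back into the global page index space before use
    int const absolute = relative + first;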
--- cpp/src/io/parquet/reader_impl_chunking.cu | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index 7e1862cc189..3336ac8e388 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -505,19 +505,23 @@ struct get_page_span { __device__ page_span operator()(size_t column_index) { - auto const column_page_start = page_row_index + page_offsets[column_index]; + auto const first_page_index = page_offsets[column_index]; + auto const column_page_start = page_row_index + first_page_index; auto const column_page_end = page_row_index + page_offsets[column_index + 1]; auto const num_pages = column_page_end - column_page_start; + auto start_page = - thrust::lower_bound(thrust::seq, column_page_start, column_page_end, start_row) - - column_page_start; + (thrust::lower_bound(thrust::seq, column_page_start, column_page_end, start_row) - + column_page_start) + + first_page_index; if (page_row_index[start_page] == start_row) { start_page++; } - auto end_page = thrust::lower_bound(thrust::seq, column_page_start, column_page_end, end_row) - - column_page_start; - if (end_page < num_pages) { end_page++; } - return {static_cast(start_page + page_offsets[column_index]), - static_cast(end_page + page_offsets[column_index])}; + auto end_page = (thrust::lower_bound(thrust::seq, column_page_start, column_page_end, end_row) - + column_page_start) + + first_page_index; + if (end_page < (first_page_index + num_pages)) { end_page++; } + + return {static_cast(start_page), static_cast(end_page)}; } }; From b3539f903960e480bc35db8b502da32c805a2602 Mon Sep 17 00:00:00 2001 From: db Date: Tue, 12 Dec 2023 10:36:41 -0600 Subject: [PATCH 25/49] Wave of PR review comment fixes. --- cpp/src/io/parquet/page_decode.cuh | 3 +-- cpp/src/io/parquet/page_hdr.cu | 2 +- cpp/src/io/parquet/page_string_decode.cu | 2 +- cpp/src/io/parquet/parquet_gpu.hpp | 6 +++--- cpp/src/io/parquet/reader_impl.hpp | 10 ++++------ cpp/src/io/parquet/reader_impl_chunking.hpp | 2 +- cpp/src/io/parquet/reader_impl_preprocess.cu | 14 ++++---------- 7 files changed, 15 insertions(+), 24 deletions(-) diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index a96051854b4..1dc7a81721a 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -1287,8 +1287,7 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, s->dict_base = reinterpret_cast(s->col.str_dict_index); s->dict_size = s->col.dict_page->num_input_values * sizeof(string_index_pair); } else { - s->dict_base = - s->col.dict_page->page_data; // dictionary is always stored in the first page + s->dict_base = s->col.dict_page->page_data; s->dict_size = s->col.dict_page->uncompressed_page_size; } s->dict_run = 0; diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu index 6d31aea966f..d711bef34ab 100644 --- a/cpp/src/io/parquet/page_hdr.cu +++ b/cpp/src/io/parquet/page_hdr.cu @@ -393,7 +393,7 @@ __global__ void __launch_bounds__(128) gpuDecodePageHeaders(ColumnChunkDesc* chu } num_values = bs->ck.num_values; page_info = chunk_pages ? chunk_pages[chunk].pages : nullptr; - max_num_pages = (page_info) ? bs->ck.max_num_pages : 0; + max_num_pages = page_info ? 
bs->ck.max_num_pages : 0;
      values_found  = 0;
      __syncwarp();
      while (values_found < num_values && bs->cur < bs->end) {
diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu
index bb1001bf455..0166478b78c 100644
--- a/cpp/src/io/parquet/page_string_decode.cu
+++ b/cpp/src/io/parquet/page_string_decode.cu
@@ -752,7 +752,7 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz
       dict_base = reinterpret_cast(col.str_dict_index);
       dict_size = col.dict_page->num_input_values * sizeof(string_index_pair);
     } else {
-      dict_base = col.dict_page->page_data;  // dictionary is always stored in the first page
+      dict_base = col.dict_page->page_data;
       dict_size = col.dict_page->uncompressed_page_size;
     }
 
diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp
index 9bcdc1c95b8..de4d74b8c3f 100644
--- a/cpp/src/io/parquet/parquet_gpu.hpp
+++ b/cpp/src/io/parquet/parquet_gpu.hpp
@@ -341,11 +341,11 @@ struct get_page_key {
 };
 
 /**
- * @brief Return and iterator that returns they keys for a vector of pages.
+ * @brief Return an iterator that returns the keys for a vector of pages.
 */
-inline auto make_page_key_iterator(cudf::detail::hostdevice_vector const& pages)
+inline auto make_page_key_iterator(device_span pages)
 {
-  return thrust::make_transform_iterator(pages.d_begin(), get_page_key{});
+  return thrust::make_transform_iterator(pages.begin(), get_page_key{});
 }
 
 /**
diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp
index a070b0bf3e2..31bddaf8bda 100644
--- a/cpp/src/io/parquet/reader_impl.hpp
+++ b/cpp/src/io/parquet/reader_impl.hpp
@@ -158,7 +158,7 @@ class reader::impl {
   /**
    * @brief Ratchet the pass/subpass/chunk process forward.
    *
-   * @param uses_custom_row_bounds Whether or not num_rows and skip_rows represents user-specific
+   * @param uses_custom_row_bounds Whether or not num_rows and skip_rows represent user-specified
    * bounds
    */
  void handle_chunking(bool uses_custom_row_bounds);
@@ -337,12 +337,10 @@ class reader::impl {
   */
  void compute_output_chunks_for_subpass();
 
-  bool has_more_work()
+  [[nodiscard]] bool has_more_work() const
  {
-    // no work to do (this can happen on the first pass if we have no rows to read)
-    auto const num_passes = _file_itm_data.num_passes();
-    bool const more_work  = num_passes > 0 && _file_itm_data._current_input_pass < num_passes;
-    return more_work;
+    return _file_itm_data.num_passes() > 0 &&
+           _file_itm_data._current_input_pass < _file_itm_data.num_passes();
  }
 
  private:
diff --git a/cpp/src/io/parquet/reader_impl_chunking.hpp b/cpp/src/io/parquet/reader_impl_chunking.hpp
index 4f0027098e6..7ff346a4311 100644
--- a/cpp/src/io/parquet/reader_impl_chunking.hpp
+++ b/cpp/src/io/parquet/reader_impl_chunking.hpp
@@ -48,7 +48,7 @@ struct file_intermediate_data {
   size_t global_skip_rows;
   size_t global_num_rows;
 
-  size_t num_passes()
+  size_t num_passes() const
  {
    return input_pass_row_group_offsets.size() == 0 ?
0 : input_pass_row_group_offsets.size() - 1; } diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index 08583bfe3e7..64ae988ed0d 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -433,12 +433,6 @@ void decode_page_headers(pass_intermediate_data& pass, pass.pages.device_to_host_async(stream); pass.chunks.device_to_host_async(stream); stream.synchronize(); - - // validate page encodings - CUDF_EXPECTS(std::all_of(pass.pages.begin(), - pass.pages.end(), - [](auto const& page) { return is_supported_encoding(page.encoding); }), - "Unsupported page encoding detected"); } struct set_str_dict_index_count { @@ -791,7 +785,7 @@ void reader::impl::load_compressed_data() auto& pass = *_pass_itm_data; // This function should never be called if `num_rows == 0`. - // CUDF_EXPECTS(_pass_itm_data->num_rows > 0, "Number of reading rows must not be zero."); + CUDF_EXPECTS(_pass_itm_data->num_rows > 0, "Number of reading rows must not be zero."); auto& chunks = pass.chunks; @@ -1171,7 +1165,7 @@ void reader::impl::preprocess_subpass_pages(bool uses_custom_row_bounds, size_t // copy our now-correct row counts back to the base pages stored in the pass. auto iter = thrust::make_counting_iterator(0); - thrust::for_each(rmm::exec_policy(_stream), + thrust::for_each(rmm::exec_policy_nosync(_stream), iter, iter + subpass.pages.size(), update_pass_num_rows{pass.pages, subpass.pages, subpass.page_src_index}); @@ -1183,14 +1177,14 @@ void reader::impl::preprocess_subpass_pages(bool uses_custom_row_bounds, size_t // gives us the absolute row index auto key_input = thrust::make_transform_iterator(pass.pages.d_begin(), get_page_chunk_idx{}); auto page_input = thrust::make_transform_iterator(pass.pages.d_begin(), get_page_num_rows{}); - thrust::exclusive_scan_by_key(rmm::exec_policy(_stream), + thrust::exclusive_scan_by_key(rmm::exec_policy_nosync(_stream), key_input, key_input + pass.pages.size(), page_input, chunk_row_output_iter{pass.pages.device_ptr()}); // copy chunk row into the subpass pages - thrust::for_each(rmm::exec_policy(_stream), + thrust::for_each(rmm::exec_policy_nosync(_stream), iter, iter + subpass.pages.size(), update_subpass_chunk_row{pass.pages, subpass.pages, subpass.page_src_index}); From ae452f90a1461e43eaa6e1a26290044030c48348 Mon Sep 17 00:00:00 2001 From: db Date: Tue, 12 Dec 2023 13:56:00 -0600 Subject: [PATCH 26/49] Second wave of PR review feedback. --- cpp/src/io/parquet/reader_impl.hpp | 4 +- cpp/src/io/parquet/reader_impl_chunking.cu | 2 +- cpp/src/io/parquet/reader_impl_chunking.hpp | 2 +- cpp/src/io/parquet/reader_impl_preprocess.cu | 76 ++++++++++++-------- 4 files changed, 51 insertions(+), 33 deletions(-) diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index 31bddaf8bda..d74218ff24b 100644 --- a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -213,9 +213,9 @@ class reader::impl { std::pair>> read_column_chunks(); /** - * @brief Load compressed data and page information for the current pass. + * @brief Read compressed data and page information for the current pass. */ - void load_compressed_data(); + void read_compressed_data(); /** * @brief Build string dictionary indices for a pass. 
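
A note on the num_passes() accessor, made const in the previous patch and
[[nodiscard]] below: input_pass_row_group_offsets is treated as N+1 fence-post
offsets delimiting N passes. A small illustration of that convention
(hypothetical values, not taken from this patch):

    // two reading passes over three row groups:
    //   pass 0 covers row groups [0, 2), pass 1 covers [2, 3)
    std::vector<std::size_t> input_pass_row_group_offsets{0, 2, 3};
    std::size_t const num_passes =
      input_pass_row_group_offsets.empty() ? 0 : input_pass_row_group_offsets.size() - 1;  // == 2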
diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index 3336ac8e388..fb8c15999ba 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -1009,7 +1009,7 @@ void reader::impl::setup_next_pass(bool uses_custom_row_bounds) // load page information for the chunk. this retrieves the compressed bytes for all the // pages, and their headers (which we can access without decompressing) - load_compressed_data(); + read_compressed_data(); // detect malformed columns. // - we have seen some cases in the wild where we have a row group containing N diff --git a/cpp/src/io/parquet/reader_impl_chunking.hpp b/cpp/src/io/parquet/reader_impl_chunking.hpp index 7ff346a4311..4e8d8de73c3 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.hpp +++ b/cpp/src/io/parquet/reader_impl_chunking.hpp @@ -48,7 +48,7 @@ struct file_intermediate_data { size_t global_skip_rows; size_t global_num_rows; - size_t num_passes() const + [[nodiscard]] size_t num_passes() const { return input_pass_row_group_offsets.size() == 0 ? 0 : input_pass_row_group_offsets.size() - 1; } diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index 64ae988ed0d..80cb22089c8 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -371,7 +371,7 @@ void decode_page_headers(pass_intermediate_data& pass, // We also need to preserve key-relative page ordering, so we need to use a stable sort. { rmm::device_uvector page_keys{unsorted_pages.size(), stream}; - thrust::transform(rmm::exec_policy(stream), + thrust::transform(rmm::exec_policy_nosync(stream), unsorted_pages.begin(), unsorted_pages.end(), page_keys.begin(), @@ -382,15 +382,15 @@ void decode_page_headers(pass_intermediate_data& pass, // started generating kernels using too much shared memory when trying to sort the pages // directly. 
rmm::device_uvector sort_indices(unsorted_pages.size(), stream); - thrust::sequence(rmm::exec_policy(stream), sort_indices.begin(), sort_indices.end(), 0); - thrust::stable_sort_by_key(rmm::exec_policy(stream), + thrust::sequence(rmm::exec_policy_nosync(stream), sort_indices.begin(), sort_indices.end(), 0); + thrust::stable_sort_by_key(rmm::exec_policy_nosync(stream), page_keys.begin(), page_keys.end(), sort_indices.begin(), thrust::less()); pass.pages = cudf::detail::hostdevice_vector( unsorted_pages.size(), unsorted_pages.size(), stream); - thrust::transform(rmm::exec_policy(stream), + thrust::transform(rmm::exec_policy_nosync(stream), sort_indices.begin(), sort_indices.end(), pass.pages.d_begin(), @@ -414,13 +414,13 @@ void decode_page_headers(pass_intermediate_data& pass, .second; auto const num_page_counts = page_counts_end - page_counts.begin(); pass.page_offsets = cudf::detail::hostdevice_vector(num_page_counts + 1, stream); - thrust::exclusive_scan(rmm::exec_policy(stream), + thrust::exclusive_scan(rmm::exec_policy_nosync(stream), page_counts.begin(), page_counts.begin() + num_page_counts + 1, pass.page_offsets.d_begin()); // setup dict_page for each chunk if necessary - thrust::for_each(rmm::exec_policy(stream), + thrust::for_each(rmm::exec_policy_nosync(stream), pass.pages.d_begin(), pass.pages.d_end(), [chunks = pass.chunks.d_begin()] __device__(PageInfo const& p) { @@ -560,23 +560,26 @@ void reader::impl::allocate_nesting_info() auto& pass = *_pass_itm_data; auto& subpass = *pass.subpass; - // auto const& chunks = pass.chunks; auto const num_columns = _input_columns.size(); auto& pages = subpass.pages; auto& page_nesting_info = subpass.page_nesting_info; auto& page_nesting_decode_info = subpass.page_nesting_decode_info; + // generate the number of nesting info structs needed per-page, by column + std::vector per_page_nesting_info_size(num_columns); + auto iter = thrust::make_counting_iterator(size_type{0}); + std::transform(iter, iter + num_columns, per_page_nesting_info_size.begin(), [&](size_type i) { + auto const schema_idx = _input_columns[i].schema_idx; + auto const& schema = _metadata->get_schema(schema_idx); + return max(schema.max_definition_level + 1, _metadata->get_output_nesting_depth(schema_idx)); + }); + // compute total # of page_nesting infos needed and allocate space. 
doing this in one // buffer to keep it to a single gpu allocation auto counting_iter = thrust::make_counting_iterator(size_t{0}); size_t const total_page_nesting_infos = std::accumulate(counting_iter, counting_iter + num_columns, 0, [&](int total, size_t index) { - // the schema of the input column - auto const schema_idx = _input_columns[index].schema_idx; - auto const& schema = _metadata->get_schema(schema_idx); - auto const per_page_nesting_info_size = - max(schema.max_definition_level + 1, _metadata->get_output_nesting_depth(schema_idx)); - return total + (per_page_nesting_info_size * subpass.column_page_count[index]); + return total + (per_page_nesting_info_size[index] * subpass.column_page_count[index]); }); page_nesting_info = @@ -588,25 +591,27 @@ void reader::impl::allocate_nesting_info() int target_page_index = 0; int src_info_index = 0; for (size_t idx = 0; idx < _input_columns.size(); idx++) { - auto const src_col_schema = _input_columns[idx].schema_idx; - auto& schema = _metadata->get_schema(src_col_schema); - auto const per_page_nesting_info_size = std::max( - schema.max_definition_level + 1, _metadata->get_output_nesting_depth(src_col_schema)); + auto const src_col_schema = _input_columns[idx].schema_idx; for (size_t p_idx = 0; p_idx < subpass.column_page_count[idx]; p_idx++) { pages[target_page_index + p_idx].nesting = page_nesting_info.device_ptr() + src_info_index; pages[target_page_index + p_idx].nesting_decode = page_nesting_decode_info.device_ptr() + src_info_index; - pages[target_page_index + p_idx].nesting_info_size = per_page_nesting_info_size; + pages[target_page_index + p_idx].nesting_info_size = per_page_nesting_info_size[idx]; pages[target_page_index + p_idx].num_output_nesting_levels = _metadata->get_output_nesting_depth(src_col_schema); - src_info_index += per_page_nesting_info_size; + src_info_index += per_page_nesting_info_size[idx]; } target_page_index += subpass.column_page_count[idx]; } + // set type to invalid for all page_nesting_infos so we can verify we've initialized everything. + std::for_each(page_nesting_info.begin(), page_nesting_info.end(), [](PageNestingInfo& pni) { + pni.type = cudf::type_id::NUM_TYPE_IDS; + }); + // fill in int nesting_info_index = 0; std::map, std::vector>> depth_remapping; @@ -616,10 +621,7 @@ void reader::impl::allocate_nesting_info() // schema of the input column auto& schema = _metadata->get_schema(src_col_schema); // real depth of the output cudf column hierarchy (1 == no nesting, 2 == 1 level, etc) - int max_depth = _metadata->get_output_nesting_depth(src_col_schema); - - // # of nesting infos stored per page for this column - auto const per_page_nesting_info_size = std::max(schema.max_definition_level + 1, max_depth); + int const max_output_depth = _metadata->get_output_nesting_depth(src_col_schema); // if this column has lists, generate depth remapping std::map, std::vector>> depth_remapping; @@ -627,10 +629,19 @@ void reader::impl::allocate_nesting_info() generate_depth_remappings(depth_remapping, src_col_schema, *_metadata); } + // PageNestingInfo structs above max_output_depth are unused. set them to empty. 
+ for (size_t p_idx = 0; p_idx < subpass.column_page_count[idx]; p_idx++) { + auto const nesting_size = per_page_nesting_info_size[idx]; + for (int e_idx = max_output_depth - 1; e_idx < nesting_size; e_idx++) { + page_nesting_info[nesting_info_index + (p_idx * nesting_size) + e_idx].type = + cudf::type_id::EMPTY; + } + } + // fill in host-side nesting info int schema_idx = src_col_schema; auto cur_schema = _metadata->get_schema(schema_idx); - int cur_depth = max_depth - 1; + int cur_depth = max_output_depth - 1; while (schema_idx > 0) { // stub columns (basically the inner field of a list schema element) are not real columns. // we can ignore them for the purposes of output nesting info @@ -638,10 +649,11 @@ void reader::impl::allocate_nesting_info() // initialize each page within the chunk for (size_t p_idx = 0; p_idx < subpass.column_page_count[idx]; p_idx++) { PageNestingInfo* pni = - &page_nesting_info[nesting_info_index + (p_idx * per_page_nesting_info_size)]; + &page_nesting_info[nesting_info_index + (p_idx * per_page_nesting_info_size[idx])]; PageNestingDecodeInfo* nesting_info = - &page_nesting_decode_info[nesting_info_index + (p_idx * per_page_nesting_info_size)]; + &page_nesting_decode_info[nesting_info_index + + (p_idx * per_page_nesting_info_size[idx])]; // if we have lists, set our start and end depth remappings if (schema.max_repetition_level > 0) { @@ -676,9 +688,16 @@ void reader::impl::allocate_nesting_info() cur_schema = _metadata->get_schema(schema_idx); } - nesting_info_index += (per_page_nesting_info_size * subpass.column_page_count[idx]); + nesting_info_index += (per_page_nesting_info_size[idx] * subpass.column_page_count[idx]); } + // verify all of the page_nesting_info structs have been initialized + CUDF_EXPECTS( + std::all_of(page_nesting_info.begin(), + page_nesting_info.end(), + [](PageNestingInfo const& pni) { return pni.type != cudf::type_id::NUM_TYPE_IDS; }), + "Encountered uninitialized PageNestingInfo structs"); + // copy nesting info to the device page_nesting_info.host_to_device_async(_stream); page_nesting_decode_info.host_to_device_async(_stream); @@ -780,7 +799,7 @@ std::pair>> reader::impl::read_column_chunks return {total_decompressed_size > 0, std::move(read_chunk_tasks)}; } -void reader::impl::load_compressed_data() +void reader::impl::read_compressed_data() { auto& pass = *_pass_itm_data; @@ -800,7 +819,6 @@ void reader::impl::load_compressed_data() auto const total_pages = count_page_headers(chunks, _stream); if (total_pages <= 0) { return; } rmm::device_uvector unsorted_pages(total_pages, _stream); - // pages = cudf::detail::hostdevice_vector(total_pages, total_pages, _stream); // decoding of column/page information decode_page_headers(pass, unsorted_pages, _stream); From 491f99147414d971d745473e111bf6399b7e6296 Mon Sep 17 00:00:00 2001 From: db Date: Tue, 12 Dec 2023 15:53:57 -0600 Subject: [PATCH 27/49] More PR review feedback. --- cpp/src/io/parquet/reader_impl_chunking.cu | 70 +++++++++++--------- cpp/src/io/parquet/reader_impl_preprocess.cu | 4 +- 2 files changed, 41 insertions(+), 33 deletions(-) diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index fb8c15999ba..3b05c5ab3d5 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -217,8 +217,6 @@ struct get_page_output_size { * @brief Functor which sets the (uncompressed) size of a page. 
*/
 struct get_page_input_size {
-  device_span chunks;
-
   __device__ cumulative_page_info operator()(PageInfo const& page)
   {
     // we treat dictionary page sizes as 0 for subpasses because we have already paid the price for
@@ -316,6 +314,8 @@ size_t find_start_index(std::vector const& aggregated_info
 * @brief Given a current position and row index, find the next split based on the
 * specified size limit
 *
+ * @returns The inclusive index within `sizes` where the next split should happen
+ *
 */
 int64_t find_next_split(int64_t cur_pos,
                         size_t cur_row_index,
@@ -418,6 +418,9 @@ std::pair get_row_group_size(RowGroup const& rg)
 * By doing this, we can now look at row X and know the total
 * byte cost for all pages that span row X, not just the cost up to row X itself.
 *
+ * This function is asynchronous. Call stream.synchronize() before using the
+ * results.
+ *
 */
 std::pair, rmm::device_uvector>
 adjust_cumulative_sizes(rmm::device_uvector const& c_info,
@@ -426,12 +429,14 @@ adjust_cumulative_sizes(rmm::device_uvector const& c_info,
 {
   // sort by row count
   rmm::device_uvector c_info_sorted{c_info, stream};
-  thrust::sort(
-    rmm::exec_policy(stream), c_info_sorted.begin(), c_info_sorted.end(), row_count_compare{});
+  thrust::sort(rmm::exec_policy_nosync(stream),
+               c_info_sorted.begin(),
+               c_info_sorted.end(),
+               row_count_compare{});
 
   // page keys grouped by split.
   rmm::device_uvector page_keys_by_split{c_info.size(), stream};
-  thrust::transform(rmm::exec_policy(stream),
+  thrust::transform(rmm::exec_policy_nosync(stream),
                     c_info_sorted.begin(),
                     c_info_sorted.end(),
                     page_keys_by_split.begin(),
@@ -450,7 +455,7 @@ adjust_cumulative_sizes(rmm::device_uvector const& c_info,
                           .second;
   size_t const num_unique_keys = key_offsets_end - key_offsets.begin();
   thrust::exclusive_scan(
-    rmm::exec_policy(stream), key_offsets.begin(), key_offsets.end(), key_offsets.begin());
+    rmm::exec_policy_nosync(stream), key_offsets.begin(), key_offsets.end(), key_offsets.begin());
 
   // adjust the cumulative info such that for each row count, the size includes any pages that span
   // that row count. this is so that if we have this case:
@@ -463,7 +468,7 @@ adjust_cumulative_sizes(rmm::device_uvector const& c_info,
   // page.
   //
   rmm::device_uvector aggregated_info(c_info.size(), stream);
-  thrust::transform(rmm::exec_policy(stream),
+  thrust::transform(rmm::exec_policy_nosync(stream),
                     c_info_sorted.begin(),
                     c_info_sorted.end(),
                     aggregated_info.begin(),
@@ -525,6 +530,27 @@ struct get_page_span {
   }
 };
 
+/**
+ * @brief Computes the next subpass within the current pass.
+ *
+ * A subpass is a subset of the pages within the parent pass that is decompressed
+ * as a batch and decoded. Subpasses are the level at which we control intermediate
+ * memory usage. A pass consists of >= 1 subpass. We cannot compute all subpasses in one
+ * shot because we do not know how many rows we actually have in the pages of list columns.
+ * So we have to make an educated guess that fits within the memory limits, and then adjust
+ * for subsequent subpasses when we see how many rows we actually receive.
+ * + * @param c_info The cumulative page size information (row count and byte size) per column + * @param pages All of the pages in the pass + * @param page_offsets Offsets into the pages array representing the first page for each column + * @param start_row The row to start the subpass at + * @param size_limit The size limit in bytes of the subpass + * @param num_columns The number of columns + * @param stream The stream to execute cuda operations on + * @returns A vector of page_span structs indicating the page indices to include for each column + * to be processed, and the total number of pages over all columns + * + */ std::tuple, size_t> compute_next_subpass( rmm::device_uvector const& c_info, cudf::detail::hostdevice_vector const& pages, @@ -537,13 +563,7 @@ std::tuple, size_t> compute_next_subpass( auto [aggregated_info, page_keys_by_split] = adjust_cumulative_sizes(c_info, pages, stream); // bring back to the cpu - std::vector h_aggregated_info(aggregated_info.size()); - CUDF_CUDA_TRY(cudaMemcpyAsync(h_aggregated_info.data(), - aggregated_info.data(), - sizeof(cumulative_page_info) * c_info.size(), - cudaMemcpyDeviceToHost, - stream)); - stream.synchronize(); + auto const h_aggregated_info = cudf::detail::make_std_vector_sync(aggregated_info, stream); // print_cumulative_row_info(h_aggregated_info, "adjusted"); // TODO: if the user has explicitly specified skip_rows/num_rows we could be more intelligent @@ -562,18 +582,12 @@ std::tuple, size_t> compute_next_subpass( auto iter = thrust::make_counting_iterator(size_t{0}); auto page_row_index = cudf::detail::make_counting_transform_iterator(0, get_page_row_index{c_info}); - thrust::transform(rmm::exec_policy(stream), + thrust::transform(rmm::exec_policy_nosync(stream), iter, iter + num_columns, page_bounds.begin(), get_page_span{page_offsets, page_row_index, start_row, end_row}); - std::vector h_page_bounds(num_columns); - cudaMemcpyAsync(h_page_bounds.data(), - page_bounds.data(), - sizeof(page_span) * num_columns, - cudaMemcpyDeviceToHost, - stream); - stream.synchronize(); + auto h_page_bounds = cudf::detail::make_std_vector_sync(page_bounds, stream); // total page count over all columns auto page_count_iter = thrust::make_transform_iterator( @@ -594,13 +608,8 @@ std::vector compute_page_splits_by_row( auto [aggregated_info, page_keys_by_split] = adjust_cumulative_sizes(c_info, pages, stream); // bring back to the cpu - std::vector h_aggregated_info(aggregated_info.size()); - CUDF_CUDA_TRY(cudaMemcpyAsync(h_aggregated_info.data(), - aggregated_info.data(), - sizeof(cumulative_page_info) * c_info.size(), - cudaMemcpyDefault, - stream.value())); - stream.synchronize(); + std::vector h_aggregated_info = + cudf::detail::make_std_vector_sync(aggregated_info, stream); // print_cumulative_row_info(h_aggregated_info, "adjusted"); std::vector splits; @@ -1105,8 +1114,7 @@ void reader::impl::setup_next_subpass(bool uses_custom_row_bounds) // indices rmm::device_uvector c_info(pass.pages.size(), _stream); auto page_keys = make_page_key_iterator(pass.pages); - auto page_size = - thrust::make_transform_iterator(pass.pages.d_begin(), get_page_input_size{pass.chunks}); + auto page_size = thrust::make_transform_iterator(pass.pages.d_begin(), get_page_input_size{}); thrust::inclusive_scan_by_key(rmm::exec_policy(_stream), page_keys, page_keys + pass.pages.size(), diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index 83792865d15..6f833d8ab63 100644 --- 
a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -1109,7 +1109,7 @@ void reader::impl::generate_list_column_row_count_estimates() // gives us the absolute row index auto key_input = thrust::make_transform_iterator(pass.pages.d_begin(), get_page_chunk_idx{}); auto page_input = thrust::make_transform_iterator(pass.pages.d_begin(), get_page_num_rows{}); - thrust::exclusive_scan_by_key(rmm::exec_policy(_stream), + thrust::exclusive_scan_by_key(rmm::exec_policy_nosync(_stream), key_input, key_input + pass.pages.size(), page_input, @@ -1122,7 +1122,7 @@ void reader::impl::generate_list_column_row_count_estimates() auto const num_columns = _input_columns.size(); size_t const max_row = last_chunk.start_row + last_chunk.num_rows; auto iter = thrust::make_counting_iterator(0); - thrust::for_each(rmm::exec_policy(_stream), + thrust::for_each(rmm::exec_policy_nosync(_stream), iter, iter + num_columns, set_final_row_count{pass.pages, pass.chunks, pass.page_offsets, max_row}); From f4f043894a0fb8a9d8ed83c613d28518eddce5d3 Mon Sep 17 00:00:00 2001 From: db Date: Thu, 14 Dec 2023 16:10:26 -0600 Subject: [PATCH 28/49] Remove the code that checks for uninitialized PageNestingInfo structs. It exposed a latent underlying bug which will get fixed in a separate PR. --- cpp/src/io/parquet/reader_impl_preprocess.cu | 21 -------------------- 1 file changed, 21 deletions(-) diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index 6f833d8ab63..e42dc50c28f 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -609,11 +609,6 @@ void reader::impl::allocate_nesting_info() target_page_index += subpass.column_page_count[idx]; } - // set type to invalid for all page_nesting_infos so we can verify we've initialized everything. - std::for_each(page_nesting_info.begin(), page_nesting_info.end(), [](PageNestingInfo& pni) { - pni.type = cudf::type_id::NUM_TYPE_IDS; - }); - // fill in int nesting_info_index = 0; std::map, std::vector>> depth_remapping; @@ -631,15 +626,6 @@ void reader::impl::allocate_nesting_info() generate_depth_remappings(depth_remapping, src_col_schema, *_metadata); } - // PageNestingInfo structs above max_output_depth are unused. set them to empty. 
- for (size_t p_idx = 0; p_idx < subpass.column_page_count[idx]; p_idx++) { - auto const nesting_size = per_page_nesting_info_size[idx]; - for (int e_idx = max_output_depth - 1; e_idx < nesting_size; e_idx++) { - page_nesting_info[nesting_info_index + (p_idx * nesting_size) + e_idx].type = - cudf::type_id::EMPTY; - } - } - // fill in host-side nesting info int schema_idx = src_col_schema; auto cur_schema = _metadata->get_schema(schema_idx); @@ -693,13 +679,6 @@ void reader::impl::allocate_nesting_info() nesting_info_index += (per_page_nesting_info_size[idx] * subpass.column_page_count[idx]); } - // verify all of the page_nesting_info structs have been initialized - CUDF_EXPECTS( - std::all_of(page_nesting_info.begin(), - page_nesting_info.end(), - [](PageNestingInfo const& pni) { return pni.type != cudf::type_id::NUM_TYPE_IDS; }), - "Encountered uninitialized PageNestingInfo structs"); - // copy nesting info to the device page_nesting_info.host_to_device_async(_stream); page_nesting_decode_info.host_to_device_async(_stream); From b131e7a3d35d6af687c6992505c1f788457e01a9 Mon Sep 17 00:00:00 2001 From: db Date: Tue, 19 Dec 2023 13:44:44 -0600 Subject: [PATCH 29/49] Include nvcomp scratch space needed in chunking computation. Fixed an issue with incorrect starting row counts in multi-pass, multi-subpass conditions. --- cpp/src/io/comp/nvcomp_adapter.cpp | 1 + cpp/src/io/comp/nvcomp_adapter.hpp | 7 +- cpp/src/io/parquet/reader_impl_chunking.cu | 143 +++++++++++++++++++-- 3 files changed, 140 insertions(+), 11 deletions(-) diff --git a/cpp/src/io/comp/nvcomp_adapter.cpp b/cpp/src/io/comp/nvcomp_adapter.cpp index 1a2c90eb52e..37a71e8fa85 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cpp +++ b/cpp/src/io/comp/nvcomp_adapter.cpp @@ -133,6 +133,7 @@ std::string compression_type_name(compression_type compression) case compression_type::SNAPPY: return "Snappy"; case compression_type::ZSTD: return "Zstandard"; case compression_type::DEFLATE: return "Deflate"; + case compression_type::INVALID: CUDF_FAIL("Invalid nvcomp compression type"); } return "compression_type(" + std::to_string(static_cast(compression)) + ")"; } diff --git a/cpp/src/io/comp/nvcomp_adapter.hpp b/cpp/src/io/comp/nvcomp_adapter.hpp index 1393b70f058..a2890b0968b 100644 --- a/cpp/src/io/comp/nvcomp_adapter.hpp +++ b/cpp/src/io/comp/nvcomp_adapter.hpp @@ -29,7 +29,7 @@ namespace cudf::io::nvcomp { -enum class compression_type { SNAPPY, ZSTD, DEFLATE }; +enum class compression_type { SNAPPY, ZSTD, DEFLATE, INVALID }; /** * @brief Set of parameters that impact whether the use nvCOMP features is enabled. @@ -111,6 +111,11 @@ void batched_decompress(compression_type compression, size_t max_total_uncomp_size, rmm::cuda_stream_view stream); +size_t batched_decompress_temp_size(compression_type compression, + size_t num_chunks, + size_t max_uncomp_chunk_size, + size_t max_total_uncomp_size); + /** * @brief Gets the maximum size any chunk could compress to in the batch. 
* diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index 3b05c5ab3d5..e72c405f37f 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -53,19 +53,18 @@ struct split_info { constexpr size_t minimum_subpass_expected_size = 200 * 1024 * 1024; #if defined(CHUNKING_DEBUG) -void print_cumulative_page_info(cudf::detail::hostdevice_vector& pages, - cudf::detail::hostdevice_vector& chunks, - rmm::device_uvector const& c_info, +void print_cumulative_page_info(device_span d_pages, + device_span d_chunks, + device_span d_c_info, rmm::cuda_stream_view stream) { - pages.device_to_host_sync(stream); + std::vector pages = cudf::detail::make_std_vector_sync(d_pages, stream); + std::vector chunks = cudf::detail::make_std_vector_sync(d_chunks, stream); + std::vector c_info = cudf::detail::make_std_vector_sync(d_c_info, stream); printf("------------\nCumulative sizes by page\n"); std::vector schemas(pages.size()); - std::vector h_cinfo(pages.size()); - CUDF_CUDA_TRY(cudaMemcpy( - h_cinfo.data(), c_info.data(), sizeof(cumulative_page_info) * pages.size(), cudaMemcpyDefault)); auto schema_iter = cudf::detail::make_counting_transform_iterator( 0, [&](size_type i) { return pages[i].src_col_schema; }); thrust::copy(thrust::seq, schema_iter, schema_iter + pages.size(), schemas.begin()); @@ -84,8 +83,8 @@ void print_cumulative_page_info(cudf::detail::hostdevice_vector& pages printf("\tP %s: {%lu, %lu, %lu}\n", is_list ? "(L)" : "", pidx, - h_cinfo[pidx].row_index, - h_cinfo[pidx].size_bytes); + c_info[pidx].row_index, + c_info[pidx].size_bytes); } } } @@ -915,6 +914,126 @@ void detect_malformed_pages(cudf::detail::hostdevice_vector const& pag } } +struct decompression_info { + Compression codec; + size_t num_pages; + size_t max_page_decompressed_size; + size_t total_decompressed_size; +}; + +/** + * @brief Functor which retrieves per-page decompression information. + * + */ +struct get_decomp_info { + device_span chunks; + + __device__ decompression_info operator()(PageInfo const& p) + { + return {static_cast(chunks[p.chunk_idx].codec), + 1, + static_cast(p.uncompressed_page_size), + static_cast(p.uncompressed_page_size)}; + } +}; + +/** + * @brief Functor which accumulates per-page decompression information. + * + */ +struct decomp_sum { + __device__ decompression_info operator()(decompression_info const& a, decompression_info const& b) + { + return {a.codec, + a.num_pages + b.num_pages, + std::max(a.max_page_decompressed_size, b.max_page_decompressed_size), + a.total_decompressed_size + b.total_decompressed_size}; + } +}; + +/** + * @brief Functor which returns total scratch space required based on computed decompression_info + * data. 
+ * + */ +struct get_decomp_scratch { + size_t operator()(decompression_info const& di) + { + cudf::io::nvcomp::compression_type nvcomp_codec = cudf::io::nvcomp::compression_type::INVALID; + + switch (di.codec) { + case UNCOMPRESSED: + case GZIP: return 0; + + case BROTLI: return get_gpu_debrotli_scratch_size(di.num_pages); + + case SNAPPY: + if (cudf::io::detail::nvcomp_integration::is_stable_enabled()) { + nvcomp_codec = cudf::io::nvcomp::compression_type::SNAPPY; + } else { + return 0; + } + break; + case ZSTD: nvcomp_codec = cudf::io::nvcomp::compression_type::ZSTD; break; + + default: CUDF_FAIL("Invalid compression codec for parquet decompression"); + } + + CUDF_EXPECTS(nvcomp_codec != cudf::io::nvcomp::compression_type::INVALID, + "Invalid nvcomp codec encountered"); + return cudf::io::nvcomp::batched_decompress_temp_size( + nvcomp_codec, di.num_pages, di.max_page_decompressed_size, di.total_decompressed_size); + } +}; + +/** + * @brief Add the cost of decompression codec scratch space to the per-page cumulative + * size information. + * + */ +void include_decompression_scratch_size(device_span chunks, + device_span pages, + device_span c_info, + rmm::cuda_stream_view stream) +{ + CUDF_EXPECTS(pages.size() == c_info.size(), + "Encountered page/cumulative_page_info size mismatch"); + + auto page_keys = make_page_key_iterator(pages); + + // per-codec page counts and decompression sizes + rmm::device_uvector decomp_info(pages.size(), stream); + auto decomp_iter = thrust::make_transform_iterator(pages.begin(), get_decomp_info{chunks}); + thrust::inclusive_scan_by_key(rmm::exec_policy(stream), + page_keys, + page_keys + pages.size(), + decomp_iter, + decomp_info.begin(), + thrust::equal_to{}, + decomp_sum{}); + + // retrieve to host so we can call nvcomp to get compression scratch sizes + std::vector h_decomp_info = + cudf::detail::make_std_vector_sync(decomp_info, stream); + std::vector temp_cost(pages.size()); + thrust::transform(thrust::host, + h_decomp_info.begin(), + h_decomp_info.end(), + temp_cost.begin(), + get_decomp_scratch{}); + + // add to the cumulative_page_info data + rmm::device_uvector d_temp_cost = cudf::detail::make_device_uvector_async( + temp_cost, stream, rmm::mr::get_current_device_resource()); + auto iter = thrust::make_counting_iterator(size_t{0}); + thrust::for_each(rmm::exec_policy(stream), + iter, + iter + pages.size(), + [temp_cost = d_temp_cost.begin(), c_info = c_info.begin()] __device__(size_t i) { + c_info[i].size_bytes += temp_cost[i]; + }); +} + } // anonymous namespace void reader::impl::handle_chunking(bool uses_custom_row_bounds) @@ -1123,6 +1242,10 @@ void reader::impl::setup_next_subpass(bool uses_custom_row_bounds) thrust::equal_to{}, cumulative_page_sum{}); + // include scratch space needed for decompression. for certain codecs (eg ZSTD) this + // can be considerable. + include_decompression_scratch_size(pass.chunks, pass.pages, c_info, _stream); + auto iter = thrust::make_counting_iterator(0); thrust::for_each(rmm::exec_policy(_stream), iter, @@ -1134,7 +1257,7 @@ void reader::impl::setup_next_subpass(bool uses_custom_row_bounds) return compute_next_subpass(c_info, pass.pages, pass.page_offsets, - pass.processed_rows, + pass.processed_rows + pass.skip_rows, remaining_read_limit, num_columns, _stream); From 266ad872c653b9f05a19fb6e4e9104234e57d5a5 Mon Sep 17 00:00:00 2001 From: db Date: Thu, 18 Jan 2024 16:58:34 -0600 Subject: [PATCH 30/49] Added tweakable parameter for controlling ratio of compressed/decompressed memory use. 
Fixed an issue with an incorrect exception related to list handling. Fixed an issue with cumulative size computation in some cases. Many new tests. --- cpp/src/io/parquet/reader_impl_chunking.cu | 32 +- cpp/src/io/parquet/reader_impl_preprocess.cu | 13 +- cpp/tests/CMakeLists.txt | 2 +- ...est.cpp => parquet_chunked_reader_test.cu} | 331 +++++++++++------- cpp/tests/io/parquet_test.cpp | 2 +- 5 files changed, 239 insertions(+), 141 deletions(-) rename cpp/tests/io/{parquet_chunked_reader_test.cpp => parquet_chunked_reader_test.cu} (75%) diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index e72c405f37f..0249e813c84 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -52,6 +52,10 @@ struct split_info { // at least this many additional bytes. constexpr size_t minimum_subpass_expected_size = 200 * 1024 * 1024; +// percentage of the total available input read limit that should be reserved for compressed +// data vs uncompressed data. +constexpr float input_limit_compression_reserve = 0.3f; + #if defined(CHUNKING_DEBUG) void print_cumulative_page_info(device_span d_pages, device_span d_chunks, @@ -546,11 +550,11 @@ struct get_page_span { * @param size_limit The size limit in bytes of the subpass * @param num_columns The number of columns * @param stream The stream to execute cuda operations on - * @returns A vector of page_span structs indicating the page indices to include for each column - * to be processed, and the total number of pages over all columns + * @returns A tuple containing a vector of page_span structs indicating the page indices to include for each column + * to be processed, the total number of pages over all columns, and the total expected memory usage (including scratch space) * */ -std::tuple, size_t> compute_next_subpass( +std::tuple, size_t, size_t> compute_next_subpass( rmm::device_uvector const& c_info, cudf::detail::hostdevice_vector const& pages, cudf::detail::hostdevice_vector const& page_offsets, @@ -571,7 +575,7 @@ std::tuple, size_t> compute_next_subpass( // find the next split auto const start_index = find_start_index(h_aggregated_info, start_row); - auto const cumulative_size = start_row == 0 ? 0 : h_aggregated_info[start_index].size_bytes; + auto const cumulative_size = start_row == 0 || start_index == 0 ? 0 : h_aggregated_info[start_index - 1].size_bytes; auto const end_index = find_next_split(start_index, start_row, cumulative_size, h_aggregated_info, size_limit); auto const end_row = h_aggregated_info[end_index].row_index; @@ -593,7 +597,7 @@ std::tuple, size_t> compute_next_subpass( h_page_bounds.begin(), [](page_span const& s) { return s.end - s.start; }); size_t const total_pages = std::reduce(page_count_iter, page_count_iter + num_columns); - return {h_page_bounds, total_pages}; + return {h_page_bounds, total_pages, h_aggregated_info[end_index].size_bytes - cumulative_size}; } std::vector compute_page_splits_by_row( @@ -1201,17 +1205,21 @@ void reader::impl::setup_next_subpass(bool uses_custom_row_bounds) auto const num_columns = _input_columns.size(); + // if the user has passed a very small value (under the hardcoded minimum_subpass_expected_size), respect it. + auto const min_subpass_size = _input_pass_read_limit < minimum_subpass_expected_size ? _input_pass_read_limit : minimum_subpass_expected_size; + // what do we do if the base memory size (the compressed data) itself is approaching or larger // than the overall read limit? 
we are still going to be decompressing in subpasses, but we have
  // to assume some reasonable minimum size needed to safely decompress a single subpass. so always
-  // reserve at least that much space.
+  // reserve at least that much space. this can result in using up to 2x the specified user limit but
+  // should only ever happen with unrealistically low numbers.
  size_t const remaining_read_limit =
    _input_pass_read_limit == 0 ? 0
-    : pass.base_mem_size + minimum_subpass_expected_size >= _input_pass_read_limit
-      ? minimum_subpass_expected_size
+    : pass.base_mem_size + min_subpass_size >= _input_pass_read_limit
+      ? min_subpass_size
      : _input_pass_read_limit - pass.base_mem_size;
 
-  auto [page_indices, total_pages] = [&]() -> std::tuple, size_t> {
+  auto [page_indices, total_pages, total_expected_size] = [&]() -> std::tuple, size_t, size_t> {
    // special case: if we contain no compressed data, or if we have no input limit, we can always
    // just do 1 subpass since what we already have loaded is all the temporary memory we will ever
    // use.
@@ -1224,7 +1232,7 @@ void reader::impl::setup_next_subpass(bool uses_custom_row_bounds)
      return {static_cast(pass.page_offsets[i]),
              static_cast(pass.page_offsets[i + 1])};
    });
-    return {page_indices, pass.pages.size()};
+    return {page_indices, pass.pages.size(), 0};
  }
 
  // otherwise we have to look forward and choose a batch of pages
@@ -1315,7 +1323,7 @@ void reader::impl::setup_next_subpass(bool uses_custom_row_bounds)
         subpass.num_rows,
         remaining_read_limit);
  printf("\t\tDecompressed size: %'lu\n", subpass.decomp_page_data.size());
-  printf("\t\tTotal expected usage: %'lu\n", subpass.decomp_page_data.size() + pass.base_mem_size);
+  printf("\t\tTotal expected usage: %'lu\n", total_expected_size == 0 ? subpass.decomp_page_data.size() + pass.base_mem_size : total_expected_size + pass.base_mem_size);
  for (size_t c_idx = 0; c_idx < num_columns; c_idx++) {
    printf("\t\tColumn %'lu: pages(%'lu - %'lu)\n",
           c_idx,
@@ -1420,7 +1428,7 @@ void reader::impl::compute_input_passes()
  // generate passes. make sure to account for the case where a single row group doesn't fit within
  //
  std::size_t const read_limit =
-    _input_pass_read_limit > 0 ? _input_pass_read_limit : std::numeric_limits::max();
+    _input_pass_read_limit > 0 ? static_cast(static_cast(_input_pass_read_limit) * input_limit_compression_reserve) : std::numeric_limits::max();
  std::size_t cur_pass_byte_size = 0;
  std::size_t cur_rg_start       = 0;
  std::size_t cur_row_count      = 0;
diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu
index 9aed93fc20b..a19e877d8e7 100644
--- a/cpp/src/io/parquet/reader_impl_preprocess.cu
+++ b/cpp/src/io/parquet/reader_impl_preprocess.cu
@@ -1277,19 +1277,20 @@ void reader::impl::preprocess_subpass_pages(bool uses_custom_row_bounds, size_t
    auto const& last_page = subpass.pages[page_index + (subpass.column_page_count[idx] - 1)];
    auto const& chunk     = pass.chunks[last_page.chunk_idx];
 
-    size_t max_page_row =
+    size_t max_col_row =
      static_cast(chunk.start_row + last_page.chunk_row + last_page.num_rows);
    // special case. list rows can span page boundaries, but we can't tell if that is happening
    // here because we have not yet decoded the pages. the very last row starting in the page may
    // not terminate in the page. to handle this, only decode up to the second to last row in the
-    // page since we know that will safely completed.
+    // subpass since we know that will safely complete.
bool const is_list = chunk.max_level[level_type::REPETITION] > 0; - if (is_list && max_page_row < last_pass_row) { - CUDF_EXPECTS(last_page.num_rows > 1, "Unexpected short list page"); - max_page_row--; + if (is_list && max_col_row < last_pass_row) { + size_t const min_col_row = static_cast(chunk.start_row + last_page.chunk_row); + CUDF_EXPECTS((max_col_row - min_col_row) > 1, "Unexpected short list page"); + max_col_row--; } - max_row = min(max_row, max_page_row); + max_row = min(max_row, max_col_row); page_index += subpass.column_page_count[idx]; } diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 48bc4ac6fc1..37d57ce052f 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -288,7 +288,7 @@ ConfigureTest( ConfigureTest( PARQUET_TEST io/parquet_test.cpp - io/parquet_chunked_reader_test.cpp + io/parquet_chunked_reader_test.cu io/parquet_chunked_writer_test.cpp io/parquet_common.cpp io/parquet_misc_test.cpp diff --git a/cpp/tests/io/parquet_chunked_reader_test.cpp b/cpp/tests/io/parquet_chunked_reader_test.cu similarity index 75% rename from cpp/tests/io/parquet_chunked_reader_test.cpp rename to cpp/tests/io/parquet_chunked_reader_test.cu index 41151d1e9ef..a68d42d208d 100644 --- a/cpp/tests/io/parquet_chunked_reader_test.cpp +++ b/cpp/tests/io/parquet_chunked_reader_test.cu @@ -14,6 +14,8 @@ * limitations under the License. */ +#include "parquet_common.hpp" + #include #include #include @@ -44,14 +46,12 @@ #include #include +#include #include #include namespace { -// Global environment for temporary files -auto const temp_env = static_cast( - ::testing::AddGlobalTestEnvironment(new cudf::test::TempDirTestEnvironment)); using int32s_col = cudf::test::fixed_width_column_wrapper; using int64s_col = cudf::test::fixed_width_column_wrapper; @@ -953,138 +953,70 @@ TEST_F(ParquetChunkedReaderTest, TestChunkedReadNullCount) } while (reader.has_next()); } -TEST_F(ParquetChunkedReaderTest, InputLimitSimple) +void input_limit_test_write_one(std::string const& filepath, + cudf::table_view const& t, + cudf::io::compression_type compression, + cudf::io::dictionary_policy dict_policy) { - auto const filepath = temp_env->get_temp_filepath("input_limit_10_rowgroups.parquet"); - - // This results in 10 grow groups, at 4001150 bytes per row group - constexpr int num_rows = 25'000'000; - auto value_iter = cudf::detail::make_counting_transform_iterator(0, [](int i) { return i; }); - cudf::test::fixed_width_column_wrapper expected(value_iter, value_iter + num_rows); - cudf::io::parquet_writer_options opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, - cudf::table_view{{expected}}) - // note: it is unnecessary to force compression to NONE here because the size we are using in - // the row group is the uncompressed data size. But forcing the dictionary policy to - // dictionary_policy::NEVER is necessary to prevent changes in the - // decompressed-but-not-yet-decoded data. 
- .dictionary_policy(cudf::io::dictionary_policy::NEVER); - - cudf::io::write_parquet(opts); - - { - // no chunking - auto const [result, num_chunks] = chunked_read(filepath, 0, 0); - EXPECT_EQ(num_chunks, 1); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->get_column(0)); - } - - { - // 25 chunks of 100k rows each - auto const [result, num_chunks] = chunked_read(filepath, 0, 1); - EXPECT_EQ(num_chunks, 25); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->get_column(0)); - } - - { - // 25 chunks of 100k rows each - auto const [result, num_chunks] = chunked_read(filepath, 0, 4000000); - EXPECT_EQ(num_chunks, 25); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->get_column(0)); - } - - { - // 25 chunks of 100k rows each - auto const [result, num_chunks] = chunked_read(filepath, 0, 4100000); - EXPECT_EQ(num_chunks, 25); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->get_column(0)); - } - - { - // 12 chunks of 200k rows each, plus 1 final chunk of 100k rows. - auto const [result, num_chunks] = chunked_read(filepath, 0, 8002301); - EXPECT_EQ(num_chunks, 13); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->get_column(0)); - } - - { - // 1 big chunk - auto const [result, num_chunks] = chunked_read(filepath, 0, size_t{1} * 1024 * 1024 * 1024); - EXPECT_EQ(num_chunks, 1); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->get_column(0)); - } + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, t) + .compression(compression) + .dictionary_policy(dict_policy); + cudf::io::write_parquet(out_opts); } -struct ParquetChunkedSubRowgroupReaderTest : public cudf::test::BaseFixture {}; - -void sub_rowgroup_test(std::string const& filepath, - cudf::table_view const& t, - size_t output_limit, - size_t input_limit) +void input_limit_test_write(std::string const& base_path, + cudf::table_view const& t) { - // uncompressed, no dictionary - { - cudf::io::parquet_writer_options out_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, t) - .compression(cudf::io::compression_type::NONE) - .dictionary_policy(cudf::io::dictionary_policy::NEVER); - cudf::io::write_parquet(out_opts); - - auto result = chunked_read(filepath, output_limit, input_limit); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result.first, t); - } - - // compressed, no dictionary - { - cudf::io::parquet_writer_options out_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, t) - .compression(cudf::io::compression_type::ZSTD) - .dictionary_policy(cudf::io::dictionary_policy::NEVER); - cudf::io::write_parquet(out_opts); - - auto result = chunked_read(filepath, output_limit, input_limit); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result.first, t); - } - - // uncompressed, dictionary - { - cudf::io::parquet_writer_options out_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, t) - .compression(cudf::io::compression_type::NONE) - .dictionary_policy(cudf::io::dictionary_policy::ALWAYS); - cudf::io::write_parquet(out_opts); - - auto result = chunked_read(filepath, output_limit, input_limit); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result.first, t); - } + // no compression + input_limit_test_write_one(base_path + "_a.parquet", t, cudf::io::compression_type::NONE, cudf::io::dictionary_policy::NEVER); + // compression with a codec that uses a lot of scratch space at decode time (2.5x the total decompressed buffer size) + input_limit_test_write_one(base_path + "_b.parquet", t, 
cudf::io::compression_type::ZSTD, cudf::io::dictionary_policy::NEVER);
+  // compression with a codec that uses no scratch space at decode time
+  input_limit_test_write_one(base_path + "_c.parquet", t, cudf::io::compression_type::SNAPPY, cudf::io::dictionary_policy::NEVER);
+  input_limit_test_write_one(base_path + "_d.parquet", t, cudf::io::compression_type::SNAPPY, cudf::io::dictionary_policy::ALWAYS);
+}
 
-  // compressed, dictionary
-  {
-    cudf::io::parquet_writer_options out_opts =
-      cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, t)
-        .compression(cudf::io::compression_type::ZSTD)
-        .dictionary_policy(cudf::io::dictionary_policy::ALWAYS);
-    cudf::io::write_parquet(out_opts);
+void input_limit_test_read(std::string const& base_path,
+                           cudf::table_view const& t,
+                           size_t output_limit,
+                           size_t input_limit,
+                           int const expected_chunk_counts[4])
+{
+  std::vector<std::string> file_suffix{"_a.parquet", "_b.parquet", "_c.parquet", "_d.parquet"};
+  CUDF_EXPECTS(file_suffix.size() == 4, "Unexpected mismatch between number of test cases and result count");
 
-    auto result = chunked_read(filepath, output_limit, input_limit);
-    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result.first, t);
-  }
+  for(size_t idx=0; idx<file_suffix.size(); idx++) {
+    auto result = chunked_read(base_path + file_suffix[idx], output_limit, input_limit);
+    CUDF_EXPECTS(result.second == expected_chunk_counts[idx], "Unexpected number of chunks produced in chunk read");
+    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result.first, t);
+  }
 }
 
-TEST_F(ParquetChunkedSubRowgroupReaderTest, SingleFixedWidthColumn)
+struct ParquetChunkedReaderInputLimitConstrainedTest : public cudf::test::BaseFixture {};
+
+TEST_F(ParquetChunkedReaderInputLimitConstrainedTest, SingleFixedWidthColumn)
 {
-  auto filepath = temp_env->get_temp_filepath("table_with_dict.parquet");
-  constexpr auto num_rows = 100;
+  auto base_path = temp_env->get_temp_filepath("single_col_fixed_width");
+  constexpr auto num_rows = 1'000'000;
   auto iter1 = cudf::detail::make_counting_transform_iterator(0, [](int i) { return 15; });
-  cudf::test::fixed_width_column_wrapper<int> col1(iter1, iter1 + num_rows);
+  cudf::test::fixed_width_column_wrapper<int> col1(iter1, iter1 + num_rows);
   auto tbl = cudf::table_view{{col1}};
-  sub_rowgroup_test(filepath, tbl, 0, 100 * 1024 * 1024);
+
+  input_limit_test_write(base_path, tbl);
+
+  // semi-reasonable limit
+  constexpr int expected_a[] = {1, 17, 4, 1};
+  input_limit_test_read(base_path, tbl, 0, 2 * 1024 * 1024, expected_a);
+  // an unreasonable limit
+  constexpr int expected_b[] = {1, 50, 50, 1};
+  input_limit_test_read(base_path, tbl, 0, 1, expected_b);
 }
 
-TEST_F(ParquetChunkedSubRowgroupReaderTest, MultipleFixedWidthColumns)
+TEST_F(ParquetChunkedReaderInputLimitConstrainedTest, MixedColumns)
 {
-  auto filepath = temp_env->get_temp_filepath("multiple_col_fixed_width.parquet");
-  constexpr auto num_rows = 200000;
+  auto base_path = temp_env->get_temp_filepath("mixed_columns");
+  constexpr auto num_rows = 1'000'000;
 
   auto iter1 = thrust::make_counting_iterator(0);
   cudf::test::fixed_width_column_wrapper<int> col1(iter1, iter1 + num_rows);
@@ -1092,6 +1024,163 @@ TEST_F(ParquetChunkedSubRowgroupReaderTest, MultipleFixedWidthColumns)
   auto iter2 = thrust::make_counting_iterator(0);
   cudf::test::fixed_width_column_wrapper<double> col2(iter2, iter2 + num_rows);
 
-  auto tbl = cudf::table_view{{col1, col2}};
-  sub_rowgroup_test(filepath, tbl, 0, 1 * 1024 * 1024);
+  auto const strings = std::vector<std::string>{"abc", "de", "fghi"};
+  auto const str_iter = cudf::detail::make_counting_transform_iterator(0, [&](int32_t i) {
+    if (i < 250000) { return strings[0]; }
+    if (i < 750000) { return strings[1]; }
+    return strings[2];
+  });
+  auto col3 = strings_col(str_iter, str_iter + num_rows);
+
+  auto tbl = cudf::table_view{{col1, col2, col3}};
+
+  input_limit_test_write(base_path, tbl);
+
+  constexpr int expected_a[] = {1, 50, 10, 7};
+  input_limit_test_read(base_path, tbl, 0, 2 * 1024 * 1024, expected_a);
+  constexpr int expected_b[] = {1, 50, 50, 50};
+  input_limit_test_read(base_path, tbl, 0, 1, expected_b);
+}
+
+struct ParquetChunkedReaderInputLimitTest : public cudf::test::BaseFixture {};
+
+struct offset_gen {
+  int const group_size;
+  __device__ int operator()(int i)
+  {
+    return i * group_size;
+  }
+};
+
+template <typename T>
+struct value_gen {
+  __device__ T operator()(int i)
+  {
+    return i % 1024;
+  }
+};
+
+TEST_F(ParquetChunkedReaderInputLimitTest, List)
+{
+  auto base_path = temp_env->get_temp_filepath("list");
+  constexpr int num_rows = 50'000'000;
+  constexpr int list_size = 4;
+
+  auto const stream = cudf::get_default_stream();
+
+  auto offset_iter = cudf::detail::make_counting_transform_iterator(0, offset_gen{list_size});
+  auto offset_col = cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT32}, num_rows + 1, cudf::mask_state::UNALLOCATED);
+  thrust::copy(rmm::exec_policy(stream), offset_iter, offset_iter + num_rows + 1, offset_col->mutable_view().begin<int>());
+
+  // list
+  constexpr int num_ints = num_rows * list_size;
+  auto value_iter = cudf::detail::make_counting_transform_iterator(0, value_gen<int>{});
+  auto value_col = cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT32}, num_ints, cudf::mask_state::UNALLOCATED);
+  thrust::copy(rmm::exec_policy(stream), value_iter, value_iter + num_ints, value_col->mutable_view().begin<int>());
+  auto col1 = cudf::make_lists_column(num_rows,
+                                      std::move(offset_col),
+                                      std::move(value_col),
+                                      0,
+                                      cudf::create_null_mask(num_rows, cudf::mask_state::UNALLOCATED),
+                                      stream);
+
+  auto tbl = cudf::table_view{{*col1}};
+
+  input_limit_test_write(base_path, tbl);
+
+  // even though we have a very large limit here, there are two cases where we actually produce splits.
+  // - uncompressed data (with no dict). This happens because the code has to make a guess at how much
+  //   space to reserve for compressed/uncompressed data prior to reading. It does not know that everything
+  //   it will be reading in this case is uncompressed already, so this guess ends up causing it to generate two top
+  //   level passes. in practice, this shouldn't matter because we never really see uncompressed data in the wild.
+  //
+  // - ZSTD (with no dict). In this case, ZSTD simply requires a huge amount of temporary space: 2.5x the total
+  //   size of the decompressed data. so 2 GB is actually not enough to hold the whole thing at once.
+  //
+  // Note that in the dictionary cases, both of these revert down to 1 chunk because the dictionaries dramatically
+  // shrink the size of the uncompressed data.
+  constexpr int expected_a[] = {2, 2, 1, 1};
+  input_limit_test_read(base_path, tbl, 0, size_t{2} * 1024 * 1024 * 1024, expected_a);
+  // smaller limit
+  constexpr int expected_b[] = {6, 6, 2, 1};
+  input_limit_test_read(base_path, tbl, 0, 512 * 1024 * 1024, expected_b);
+  // include output chunking as well
+  constexpr int expected_c[] = {11, 11, 9, 8};
+  input_limit_test_read(base_path, tbl, 128 * 1024 * 1024, 512 * 1024 * 1024, expected_c);
+}
+
+struct char_values {
+  __device__ int8_t operator()(int i)
+  {
+    int const index = (i/2) % 3;
+    // generate repeating 3-runs of 2 values each. aabbcc
+    return index == 0 ? 'a' : (index == 1 ? 'b' : 'c');
+  }
+};
+TEST_F(ParquetChunkedReaderInputLimitTest, Mixed)
+{
+  auto base_path = temp_env->get_temp_filepath("mixed_types");
+  constexpr int num_rows = 50'000'000;
+  constexpr int list_size = 4;
+  constexpr int str_size = 3;
+
+  auto const stream = cudf::get_default_stream();
+
+  auto offset_iter = cudf::detail::make_counting_transform_iterator(0, offset_gen{list_size});
+  auto offset_col = cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT32}, num_rows + 1, cudf::mask_state::UNALLOCATED);
+  thrust::copy(rmm::exec_policy(stream), offset_iter, offset_iter + num_rows + 1, offset_col->mutable_view().begin<int>());
+
+  // list
+  constexpr int num_ints = num_rows * list_size;
+  auto value_iter = cudf::detail::make_counting_transform_iterator(0, value_gen<int>{});
+  auto value_col = cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT32}, num_ints, cudf::mask_state::UNALLOCATED);
+  thrust::copy(rmm::exec_policy(stream), value_iter, value_iter + num_ints, value_col->mutable_view().begin<int>());
+  auto col1 = cudf::make_lists_column(num_rows,
+                                      std::move(offset_col),
+                                      std::move(value_col),
+                                      0,
+                                      cudf::create_null_mask(num_rows, cudf::mask_state::UNALLOCATED),
+                                      stream);
+
+  // strings
+  constexpr int num_chars = num_rows * str_size;
+  auto str_offset_iter = cudf::detail::make_counting_transform_iterator(0, offset_gen{str_size});
+  auto str_offset_col = cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT32}, num_rows + 1, cudf::mask_state::UNALLOCATED);
+  thrust::copy(rmm::exec_policy(stream), str_offset_iter, str_offset_iter + num_rows + 1, str_offset_col->mutable_view().begin<int>());
+  auto str_iter = cudf::detail::make_counting_transform_iterator(0, char_values{});
+  auto str_value_col = cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT8}, num_chars, cudf::mask_state::UNALLOCATED);
+  thrust::copy(rmm::exec_policy(stream), str_iter, str_iter + num_chars, str_value_col->mutable_view().begin<int8_t>());
+  auto col2 = cudf::make_strings_column(num_rows,
+                                        std::move(str_offset_col),
+                                        std::move(str_value_col),
+                                        0,
+                                        cudf::create_null_mask(num_rows, cudf::mask_state::UNALLOCATED));
+
+  // doubles
+  auto double_iter = cudf::detail::make_counting_transform_iterator(0, value_gen<double>{});
+  auto col3 = cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::FLOAT64}, num_rows, cudf::mask_state::UNALLOCATED);
+  thrust::copy(rmm::exec_policy(stream), double_iter, double_iter + num_rows, col3->mutable_view().begin<double>());
+
+  auto tbl = cudf::table_view{{*col1, *col2, *col3}};
+
+  input_limit_test_write(base_path, tbl);
+
+  // even though we have a very large limit here, there are two cases where we actually produce splits.
+  // - uncompressed data (with no dict). This happens because the code has to make a guess at how much
+  //   space to reserve for compressed/uncompressed data prior to reading. It does not know that everything
+  //   it will be reading in this case is uncompressed already, so this guess ends up causing it to generate two top
+  //   level passes. in practice, this shouldn't matter because we never really see uncompressed data in the wild.
+  //
+  // - ZSTD (with no dict). In this case, ZSTD simply requires a huge amount of temporary space: 2.5x the total
+  //   size of the decompressed data. so 2 GB is actually not enough to hold the whole thing at once.
+  //
+  // Note that in the dictionary cases, both of these revert down to 1 chunk because the dictionaries dramatically
+  // shrink the size of the uncompressed data.
+ constexpr int expected_a[] = {3, 3, 1, 1}; + input_limit_test_read(base_path, tbl, 0, size_t{2} * 1024 * 1024 * 1024, expected_a); + // smaller limit + constexpr int expected_b[] = {10, 11, 4, 1}; + input_limit_test_read(base_path, tbl, 0, 512 * 1024 * 1024, expected_b); + // include output chunking as well + constexpr int expected_c[] = {20, 21, 15, 14}; + input_limit_test_read(base_path, tbl, 128 * 1024 * 1024, 512 * 1024 * 1024, expected_c); } diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index be2ecd56424..5657698dfdb 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -23,4 +23,4 @@ // // Do not add any test definitions to this file. -CUDF_TEST_PROGRAM_MAIN() +CUDF_TEST_PROGRAM_MAIN() \ No newline at end of file From 21c16366a1f78cd2d6f5d4aa17c036edb5774d95 Mon Sep 17 00:00:00 2001 From: db Date: Thu, 18 Jan 2024 17:17:01 -0600 Subject: [PATCH 31/49] Fixed exception message. --- cpp/src/io/parquet/reader_impl_preprocess.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index a19e877d8e7..615d0a7540f 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -1286,7 +1286,7 @@ void reader::impl::preprocess_subpass_pages(bool uses_custom_row_bounds, size_t bool const is_list = chunk.max_level[level_type::REPETITION] > 0; if (is_list && max_col_row < last_pass_row) { size_t const min_col_row = static_cast(chunk.start_row + last_page.chunk_row); - CUDF_EXPECTS((max_col_row - min_col_row) > 1, "Unexpected short list page"); + CUDF_EXPECTS((max_col_row - min_col_row) > 1, "Unexpected short subpass"); max_col_row--; } From aa99bb888f8f5fa59aeca2fcc57b550ba00924d1 Mon Sep 17 00:00:00 2001 From: db Date: Thu, 18 Jan 2024 17:22:17 -0600 Subject: [PATCH 32/49] Formatting. --- cpp/src/io/comp/nvcomp_adapter.hpp | 2 +- cpp/src/io/parquet/page_string_decode.cu | 2 +- cpp/src/io/parquet/parquet_gpu.hpp | 2 +- cpp/src/io/parquet/reader_impl.cpp | 2 +- cpp/src/io/parquet/reader_impl.hpp | 2 +- cpp/src/io/parquet/reader_impl_chunking.cu | 2 +- cpp/src/io/parquet/reader_impl_chunking.hpp | 2 +- cpp/src/io/parquet/reader_impl_preprocess.cu | 2 +- cpp/src/io/utilities/column_buffer.cpp | 2 +- cpp/src/io/utilities/column_buffer.hpp | 2 +- cpp/tests/io/parquet_chunked_reader_test.cu | 2 +- cpp/tests/io/parquet_test.cpp | 2 +- 12 files changed, 12 insertions(+), 12 deletions(-) diff --git a/cpp/src/io/comp/nvcomp_adapter.hpp b/cpp/src/io/comp/nvcomp_adapter.hpp index a2890b0968b..6953b598ac6 100644 --- a/cpp/src/io/comp/nvcomp_adapter.hpp +++ b/cpp/src/io/comp/nvcomp_adapter.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index 86051a0dfc0..71035e4088e 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * Copyright (c) 2018-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
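As context for the input-limit expectations exercised in the tests above: the `chunked_read(filepath, output_limit, input_limit)` test helper presumably forwards both limits to the public chunked reader. A minimal sketch of that usage, assuming the two-limit `chunked_parquet_reader` constructor (the file name and limit values are illustrative, not from the tests):

    #include <cudf/io/parquet.hpp>

    // output_limit bounds the size of each returned table chunk; input_limit bounds
    // the temporary memory used per pass for compressed + decompressed page data.
    std::size_t const output_limit = 128 * 1024 * 1024;
    std::size_t const input_limit  = 512 * 1024 * 1024;
    auto const opts =
      cudf::io::parquet_reader_options::builder(cudf::io::source_info{"example.parquet"}).build();
    cudf::io::chunked_parquet_reader reader(output_limit, input_limit, opts);

    int num_chunks = 0;
    while (reader.has_next()) {
      auto const chunk = reader.read_chunk();  // next slice of rows as table_with_metadata
      num_chunks++;
    }
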
diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 6b14bfce367..7f6c32f11ad 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * Copyright (c) 2018-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index ce3e22a5b02..02bda56437e 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index d74218ff24b..e41323ecb2e 100644 --- a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index 0249e813c84..05f306de9c3 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/io/parquet/reader_impl_chunking.hpp b/cpp/src/io/parquet/reader_impl_chunking.hpp index 4e8d8de73c3..8109159d38d 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.hpp +++ b/cpp/src/io/parquet/reader_impl_chunking.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index 615d0a7540f..b60c7b4b4dc 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp index 996f55747e4..3228612ed34 100644 --- a/cpp/src/io/utilities/column_buffer.cpp +++ b/cpp/src/io/utilities/column_buffer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/cpp/src/io/utilities/column_buffer.hpp b/cpp/src/io/utilities/column_buffer.hpp index b54e2714ef9..cc39f79ba61 100644 --- a/cpp/src/io/utilities/column_buffer.hpp +++ b/cpp/src/io/utilities/column_buffer.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/tests/io/parquet_chunked_reader_test.cu b/cpp/tests/io/parquet_chunked_reader_test.cu index a68d42d208d..f36fd956c63 100644 --- a/cpp/tests/io/parquet_chunked_reader_test.cu +++ b/cpp/tests/io/parquet_chunked_reader_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index 5657698dfdb..be2ecd56424 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -23,4 +23,4 @@ // // Do not add any test definitions to this file. -CUDF_TEST_PROGRAM_MAIN() \ No newline at end of file +CUDF_TEST_PROGRAM_MAIN() From 479251f986df76c76678da5423438e3efd6845ba Mon Sep 17 00:00:00 2001 From: db Date: Thu, 18 Jan 2024 17:24:52 -0600 Subject: [PATCH 33/49] More formatting. For some reason pre-commit didn't catch everything last time. --- cpp/src/io/parquet/reader_impl.cpp | 9 +- cpp/src/io/parquet/reader_impl_chunking.cu | 34 ++- cpp/tests/io/parquet_chunked_reader_test.cu | 247 ++++++++++++-------- 3 files changed, 177 insertions(+), 113 deletions(-) diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 02bda56437e..bfa374bf78e 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -206,8 +206,13 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) // launch delta length byte array decoder if (BitAnd(kernel_mask, decode_kernel_mask::DELTA_LENGTH_BA) != 0) { - DecodeDeltaLengthByteArray( - subpass.pages, pass.chunks, num_rows, skip_rows, level_type_size, error_code.data(), streams[s_idx++]); + DecodeDeltaLengthByteArray(subpass.pages, + pass.chunks, + num_rows, + skip_rows, + level_type_size, + error_code.data(), + streams[s_idx++]); } // launch delta binary decoder diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index 05f306de9c3..c2947d62e1e 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -53,7 +53,7 @@ struct split_info { constexpr size_t minimum_subpass_expected_size = 200 * 1024 * 1024; // percentage of the total available input read limit that should be reserved for compressed -// data vs uncompressed data. +// data vs uncompressed data. 
constexpr float input_limit_compression_reserve = 0.3f;
 
 #if defined(CHUNKING_DEBUG)
@@ -550,8 +550,9 @@ struct get_page_span {
  * @param size_limit The size limit in bytes of the subpass
  * @param num_columns The number of columns
  * @param stream The stream to execute cuda operations on
- * @returns A tuple containing a vector of page_span structs indicating the page indices to include for each column
- * to be processed, the total number of pages over all columns, and the total expected memory usage (including scratch space)
+ * @returns A tuple containing a vector of page_span structs indicating the page indices to include
+ * for each column to be processed, the total number of pages over all columns, and the total
+ * expected memory usage (including scratch space)
  *
  */
 std::tuple<rmm::device_uvector<page_span>, size_t, size_t> compute_next_subpass(
@@ -574,8 +575,9 @@ std::tuple<rmm::device_uvector<page_span>, size_t, size_t> compute_next_subpass(
   // data does not contain lists (because our row counts are only estimates in that case)
 
   // find the next split
-  auto const start_index = find_start_index(h_aggregated_info, start_row);
-  auto const cumulative_size = start_row == 0 || start_index == 0 ? 0 : h_aggregated_info[start_index - 1].size_bytes;
+  auto const start_index = find_start_index(h_aggregated_info, start_row);
+  auto const cumulative_size =
+    start_row == 0 || start_index == 0 ? 0 : h_aggregated_info[start_index - 1].size_bytes;
   auto const end_index =
     find_next_split(start_index, start_row, cumulative_size, h_aggregated_info, size_limit);
   auto const end_row = h_aggregated_info[end_index].row_index;
@@ -1205,21 +1207,25 @@ void reader::impl::setup_next_subpass(bool uses_custom_row_bounds)
 
   auto const num_columns = _input_columns.size();
 
-  // if the user has passed a very small value (under the hardcoded minimum_subpass_expected_size), respect it.
-  auto const min_subpass_size = _input_pass_read_limit < minimum_subpass_expected_size ? _input_pass_read_limit : minimum_subpass_expected_size;
+  // if the user has passed a very small value (under the hardcoded minimum_subpass_expected_size),
+  // respect it.
+  auto const min_subpass_size = _input_pass_read_limit < minimum_subpass_expected_size
+                                  ? _input_pass_read_limit
+                                  : minimum_subpass_expected_size;
 
   // what do we do if the base memory size (the compressed data) itself is approaching or larger
   // than the overall read limit? we are still going to be decompressing in subpasses, but we have
   // to assume some reasonable minimum size needed to safely decompress a single subpass. so always
-  // reserve at least that much space. this can result in using up to 2x the specified user limit but
-  // should only ever happen with unrealistically low numbers.
+  // reserve at least that much space. this can result in using up to 2x the specified user limit
+  // but should only ever happen with unrealistically low numbers.
   size_t const remaining_read_limit =
     _input_pass_read_limit == 0 ? 0
    : pass.base_mem_size + min_subpass_size >= _input_pass_read_limit
       ? min_subpass_size
      : _input_pass_read_limit - pass.base_mem_size;
 
-  auto [page_indices, total_pages, total_expected_size] = [&]() -> std::tuple<rmm::device_uvector<page_span>, size_t, size_t> {
+  auto [page_indices, total_pages, total_expected_size] =
+    [&]() -> std::tuple<rmm::device_uvector<page_span>, size_t, size_t> {
     // special case: if we contain no compressed data, or if we have no input limit, we can always
     // just do 1 subpass since what we already have loaded is all the temporary memory we will ever
    // use.
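A worked example of the reservation logic above, using hypothetical numbers (the 200 MB floor is the `minimum_subpass_expected_size` constant defined earlier in this file):

    // Suppose _input_pass_read_limit = 1 GB and pass.base_mem_size = 900 MB of
    // compressed data. Then base_mem_size + min_subpass_size (200 MB) >= 1 GB,
    // so rather than leaving a uselessly small (or negative) budget, the subpass
    // gets the full floor:
    //   remaining_read_limit = min_subpass_size = 200 MB  // may exceed the user limit
    // With a roomier limit of 4 GB and the same 900 MB base:
    //   remaining_read_limit = 4 GB - 900 MB, roughly 3.1 GB
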
@@ -1323,7 +1329,9 @@ void reader::impl::setup_next_subpass(bool uses_custom_row_bounds)
            subpass.num_rows,
            remaining_read_limit);
     printf("\t\tDecompressed size: %'lu\n", subpass.decomp_page_data.size());
-    printf("\t\tTotal expected usage: %'lu\n", total_expected_size == 0 ? subpass.decomp_page_data.size() + pass.base_mem_size : total_expected_size + pass.base_mem_size);
+    printf("\t\tTotal expected usage: %'lu\n",
+           total_expected_size == 0 ? subpass.decomp_page_data.size() + pass.base_mem_size
+                                    : total_expected_size + pass.base_mem_size);
     for (size_t c_idx = 0; c_idx < num_columns; c_idx++) {
       printf("\t\tColumn %'lu: pages(%'lu - %'lu)\n",
              c_idx,
@@ -1428,7 +1436,9 @@ void reader::impl::compute_input_passes()
   // generate passes. make sure to account for the case where a single row group doesn't fit within
   //
   std::size_t const read_limit =
-    _input_pass_read_limit > 0 ? static_cast<std::size_t>(static_cast<double>(_input_pass_read_limit) * input_limit_compression_reserve) : std::numeric_limits<std::size_t>::max();
+    _input_pass_read_limit > 0 ? static_cast<std::size_t>(static_cast<double>(_input_pass_read_limit) *
                                                          input_limit_compression_reserve)
+                               : std::numeric_limits<std::size_t>::max();
   std::size_t cur_pass_byte_size = 0;
   std::size_t cur_rg_start = 0;
   std::size_t cur_row_count = 0;
diff --git a/cpp/tests/io/parquet_chunked_reader_test.cu b/cpp/tests/io/parquet_chunked_reader_test.cu
index f36fd956c63..b0232ee8292 100644
--- a/cpp/tests/io/parquet_chunked_reader_test.cu
+++ b/cpp/tests/io/parquet_chunked_reader_test.cu
@@ -954,9 +954,9 @@ TEST_F(ParquetChunkedReaderTest, TestChunkedReadNullCount)
 }
 
 void input_limit_test_write_one(std::string const& filepath,
-                               cudf::table_view const& t,
-                               cudf::io::compression_type compression,
-                               cudf::io::dictionary_policy dict_policy)
+                                cudf::table_view const& t,
+                                cudf::io::compression_type compression,
+                                cudf::io::dictionary_policy dict_policy)
 {
   cudf::io::parquet_writer_options out_opts =
     cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, t)
@@ -965,30 +965,44 @@ void input_limit_test_write_one(std::string const& filepath,
   cudf::io::write_parquet(out_opts);
 }
 
-void input_limit_test_write(std::string const& base_path,
-                            cudf::table_view const& t)
+void input_limit_test_write(std::string const& base_path, cudf::table_view const& t)
 {
   // no compression
-  input_limit_test_write_one(base_path + "_a.parquet", t, cudf::io::compression_type::NONE, cudf::io::dictionary_policy::NEVER);
-  // compression with a codec that uses a lot of scratch space at decode time (2.5x the total decompressed buffer size)
-  input_limit_test_write_one(base_path + "_b.parquet", t, cudf::io::compression_type::ZSTD, cudf::io::dictionary_policy::NEVER);
+  input_limit_test_write_one(base_path + "_a.parquet",
+                             t,
+                             cudf::io::compression_type::NONE,
+                             cudf::io::dictionary_policy::NEVER);
+  // compression with a codec that uses a lot of scratch space at decode time (2.5x the total
+  // decompressed buffer size)
+  input_limit_test_write_one(base_path + "_b.parquet",
+                             t,
+                             cudf::io::compression_type::ZSTD,
+                             cudf::io::dictionary_policy::NEVER);
   // compression with a codec that uses no scratch space at decode time
-  input_limit_test_write_one(base_path + "_c.parquet", t, cudf::io::compression_type::SNAPPY, cudf::io::dictionary_policy::NEVER);
-  input_limit_test_write_one(base_path + "_d.parquet", t, cudf::io::compression_type::SNAPPY, cudf::io::dictionary_policy::ALWAYS);
+  input_limit_test_write_one(base_path + "_c.parquet",
+                             t,
+                             cudf::io::compression_type::SNAPPY,
+                             cudf::io::dictionary_policy::NEVER);
+  
input_limit_test_write_one(base_path + "_d.parquet",
+                             t,
+                             cudf::io::compression_type::SNAPPY,
+                             cudf::io::dictionary_policy::ALWAYS);
 }
 
 void input_limit_test_read(std::string const& base_path,
-                           cudf::table_view const& t,
-                           size_t output_limit,
-                           size_t input_limit,
-                           int const expected_chunk_counts[4])
+                           cudf::table_view const& t,
+                           size_t output_limit,
+                           size_t input_limit,
+                           int const expected_chunk_counts[4])
 {
   std::vector<std::string> file_suffix{"_a.parquet", "_b.parquet", "_c.parquet", "_d.parquet"};
-  CUDF_EXPECTS(file_suffix.size() == 4, "Unexpected mismatch between number of test cases and result count");
+  CUDF_EXPECTS(file_suffix.size() == 4,
+               "Unexpected mismatch between number of test cases and result count");
 
-  for(size_t idx=0; idx<file_suffix.size(); idx++) {
-    auto result = chunked_read(base_path + file_suffix[idx], output_limit, input_limit);
-    CUDF_EXPECTS(result.second == expected_chunk_counts[idx], "Unexpected number of chunks produced in chunk read");
+  for (size_t idx = 0; idx < file_suffix.size(); idx++) {
+    auto result = chunked_read(base_path + file_suffix[idx], output_limit, input_limit);
+    CUDF_EXPECTS(result.second == expected_chunk_counts[idx],
+                 "Unexpected number of chunks produced in chunk read");
     CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result.first, t);
   }
 }
 
 struct ParquetChunkedReaderInputLimitConstrainedTest : public cudf::test::BaseFixture {};
 
 TEST_F(ParquetChunkedReaderInputLimitConstrainedTest, SingleFixedWidthColumn)
 {
-  auto base_path = temp_env->get_temp_filepath("single_col_fixed_width");
+  auto base_path = temp_env->get_temp_filepath("single_col_fixed_width");
   constexpr auto num_rows = 1'000'000;
   auto iter1 = cudf::detail::make_counting_transform_iterator(0, [](int i) { return 15; });
   cudf::test::fixed_width_column_wrapper<int> col1(iter1, iter1 + num_rows);
@@ -1005,7 +1019,7 @@ TEST_F(ParquetChunkedReaderInputLimitConstrainedTest, SingleFixedWidthColumn)
 
   input_limit_test_write(base_path, tbl);
 
-  // semi-reasonable limit
+  // semi-reasonable limit
   constexpr int expected_a[] = {1, 17, 4, 1};
   input_limit_test_read(base_path, tbl, 0, 2 * 1024 * 1024, expected_a);
   // an unreasonable limit
@@ -1015,7 +1029,7 @@ TEST_F(ParquetChunkedReaderInputLimitConstrainedTest, SingleFixedWidthColumn)
 
 TEST_F(ParquetChunkedReaderInputLimitConstrainedTest, MixedColumns)
 {
-  auto base_path = temp_env->get_temp_filepath("mixed_columns");
+  auto base_path          = temp_env->get_temp_filepath("mixed_columns");
   constexpr auto num_rows = 1'000'000;
 
   auto iter1 = thrust::make_counting_iterator(0);
@@ -1030,12 +1044,12 @@ TEST_F(ParquetChunkedReaderInputLimitConstrainedTest, MixedColumns)
     if (i < 750000) { return strings[1]; }
     return strings[2];
   });
-  auto col3 = strings_col(str_iter, str_iter + num_rows);
+  auto col3 = strings_col(str_iter, str_iter + num_rows);
 
   auto tbl = cudf::table_view{{col1, col2, col3}};
 
   input_limit_test_write(base_path, tbl);
-  
+
   constexpr int expected_a[] = {1, 50, 10, 7};
   input_limit_test_read(base_path, tbl, 0, 2 * 1024 * 1024, expected_a);
   constexpr int expected_b[] = {1, 50, 50, 50};
@@ -1046,58 +1060,66 @@ struct ParquetChunkedReaderInputLimitTest : public cudf::test::BaseFixture {};
 
 struct offset_gen {
   int const group_size;
-  __device__ int operator()(int i)
-  {
-    return i * group_size;
-  }
+  __device__ int operator()(int i) { return i * group_size; }
 };
 
-template <typename T>
+template <typename T>
 struct value_gen {
-  __device__ T operator()(int i)
-  {
-    return i % 1024;
-  }
+  __device__ T operator()(int i) { return i % 1024; }
 };
 TEST_F(ParquetChunkedReaderInputLimitTest, List)
 {
-  auto base_path = temp_env->get_temp_filepath("list");
-  constexpr int num_rows = 50'000'000;
+  auto base_path          = temp_env->get_temp_filepath("list");
+  constexpr int num_rows  = 50'000'000;
   constexpr int list_size = 4;
 
   auto const stream = cudf::get_default_stream();
 
   auto offset_iter = cudf::detail::make_counting_transform_iterator(0, offset_gen{list_size});
-  auto offset_col = cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT32}, num_rows + 1, cudf::mask_state::UNALLOCATED);
-  thrust::copy(rmm::exec_policy(stream), offset_iter, offset_iter + num_rows + 1, offset_col->mutable_view().begin<int>());
-
+  auto offset_col  = cudf::make_fixed_width_column(
+    cudf::data_type{cudf::type_id::INT32}, num_rows + 1, cudf::mask_state::UNALLOCATED);
+  thrust::copy(rmm::exec_policy(stream),
+               offset_iter,
+               offset_iter + num_rows + 1,
+               offset_col->mutable_view().begin<int>());
+  
   // list
   constexpr int num_ints = num_rows * list_size;
-  auto value_iter = cudf::detail::make_counting_transform_iterator(0, value_gen<int>{});
-  auto value_col = cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT32}, num_ints, cudf::mask_state::UNALLOCATED);
-  thrust::copy(rmm::exec_policy(stream), value_iter, value_iter + num_ints, value_col->mutable_view().begin<int>());
-  auto col1 = cudf::make_lists_column(num_rows,
-                                      std::move(offset_col),
-                                      std::move(value_col),
-                                      0,
-                                      cudf::create_null_mask(num_rows, cudf::mask_state::UNALLOCATED),
-                                      stream);
+  auto value_iter        = cudf::detail::make_counting_transform_iterator(0, value_gen<int>{});
+  auto value_col         = cudf::make_fixed_width_column(
    cudf::data_type{cudf::type_id::INT32}, num_ints, cudf::mask_state::UNALLOCATED);
+  thrust::copy(rmm::exec_policy(stream),
+               value_iter,
+               value_iter + num_ints,
+               value_col->mutable_view().begin<int>());
+  auto col1 =
+    cudf::make_lists_column(num_rows,
+                            std::move(offset_col),
+                            std::move(value_col),
+                            0,
+                            cudf::create_null_mask(num_rows, cudf::mask_state::UNALLOCATED),
+                            stream);
 
   auto tbl = cudf::table_view{{*col1}};
-  
+
   input_limit_test_write(base_path, tbl);
-  
-  // even though we have a very large limit here, there are two cases where we actually produce splits.
-  // - uncompressed data (with no dict). This happens because the code has to make a guess at how much
-  //   space to reserve for compressed/uncompressed data prior to reading. It does not know that everything
-  //   it will be reading in this case is uncompressed already, so this guess ends up causing it to generate two top
-  //   level passes. in practice, this shouldn't matter because we never really see uncompressed data in the wild.
+
+  // even though we have a very large limit here, there are two cases where we actually produce
+  // splits.
+  // - uncompressed data (with no dict). This happens because the code has to make a guess at how
+  // much
+  //   space to reserve for compressed/uncompressed data prior to reading. It does not know that
+  //   everything it will be reading in this case is uncompressed already, so this guess ends up
+  //   causing it to generate two top level passes. in practice, this shouldn't matter because we
+  //   never really see uncompressed data in the wild.
+  //
+  // - ZSTD (with no dict). In this case, ZSTD simply requires a huge amount of temporary
+  // space: 2.5x the total
+  //   size of the decompressed data. so 2 GB is actually not enough to hold the whole thing at
+  //   once.
   //
-  // - ZSTD (with no dict). In this case, ZSTD simply requires a huge amount of temporary space: 2.5x the total
-  //   size of the decompressed data. so 2 GB is actually not enough to hold the whole thing at once.
-  //
-  // Note that in the dictionary cases, both of these revert down to 1 chunk because the dictionaries dramatically
-  // shrink the size of the uncompressed data.
+  // Note that in the dictionary cases, both of these revert down to 1 chunk because the
+  // dictionaries dramatically shrink the size of the uncompressed data.
   constexpr int expected_a[] = {2, 2, 1, 1};
   input_limit_test_read(base_path, tbl, 0, size_t{2} * 1024 * 1024 * 1024, expected_a);
   // smaller limit
@@ -1111,70 +1133,97 @@ TEST_F(ParquetChunkedReaderInputLimitTest, List)
 struct char_values {
   __device__ int8_t operator()(int i)
   {
-    int const index = (i/2) % 3;
+    int const index = (i / 2) % 3;
     // generate repeating 3-runs of 2 values each. aabbcc
     return index == 0 ? 'a' : (index == 1 ? 'b' : 'c');
   }
 };
 TEST_F(ParquetChunkedReaderInputLimitTest, Mixed)
 {
-  auto base_path = temp_env->get_temp_filepath("mixed_types");
-  constexpr int num_rows = 50'000'000;
+  auto base_path          = temp_env->get_temp_filepath("mixed_types");
+  constexpr int num_rows  = 50'000'000;
   constexpr int list_size = 4;
-  constexpr int str_size = 3;
+  constexpr int str_size  = 3;
 
   auto const stream = cudf::get_default_stream();
 
   auto offset_iter = cudf::detail::make_counting_transform_iterator(0, offset_gen{list_size});
-  auto offset_col = cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT32}, num_rows + 1, cudf::mask_state::UNALLOCATED);
-  thrust::copy(rmm::exec_policy(stream), offset_iter, offset_iter + num_rows + 1, offset_col->mutable_view().begin<int>());
-
+  auto offset_col  = cudf::make_fixed_width_column(
    cudf::data_type{cudf::type_id::INT32}, num_rows + 1, cudf::mask_state::UNALLOCATED);
+  thrust::copy(rmm::exec_policy(stream),
+               offset_iter,
+               offset_iter + num_rows + 1,
+               offset_col->mutable_view().begin<int>());
+  
   // list
   constexpr int num_ints = num_rows * list_size;
-  auto value_iter = cudf::detail::make_counting_transform_iterator(0, value_gen<int>{});
-  auto value_col = cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT32}, num_ints, cudf::mask_state::UNALLOCATED);
-  thrust::copy(rmm::exec_policy(stream), value_iter, value_iter + num_ints, value_col->mutable_view().begin<int>());
-  auto col1 = cudf::make_lists_column(num_rows,
-                                      std::move(offset_col),
-                                      std::move(value_col),
-                                      0,
-                                      cudf::create_null_mask(num_rows, cudf::mask_state::UNALLOCATED),
-                                      stream);
+  auto value_iter        = cudf::detail::make_counting_transform_iterator(0, value_gen<int>{});
+  auto value_col         = cudf::make_fixed_width_column(
    cudf::data_type{cudf::type_id::INT32}, num_ints, cudf::mask_state::UNALLOCATED);
+  thrust::copy(rmm::exec_policy(stream),
+               value_iter,
+               value_iter + num_ints,
+               value_col->mutable_view().begin<int>());
+  auto col1 =
+    cudf::make_lists_column(num_rows,
+                            std::move(offset_col),
+                            std::move(value_col),
+                            0,
+                            cudf::create_null_mask(num_rows, cudf::mask_state::UNALLOCATED),
+                            stream);
 
   // strings
   constexpr int num_chars = num_rows * str_size;
-  auto str_offset_iter = cudf::detail::make_counting_transform_iterator(0, offset_gen{str_size});
-  auto str_offset_col = cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT32}, num_rows + 1, cudf::mask_state::UNALLOCATED);
-  thrust::copy(rmm::exec_policy(stream), str_offset_iter, str_offset_iter + num_rows + 1, str_offset_col->mutable_view().begin<int>());
-  auto str_iter = cudf::detail::make_counting_transform_iterator(0, char_values{});
-  auto str_value_col = cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT8}, num_chars, cudf::mask_state::UNALLOCATED);
-  thrust::copy(rmm::exec_policy(stream), str_iter, str_iter + num_chars, str_value_col->mutable_view().begin<int8_t>());
-  auto col2 = cudf::make_strings_column(num_rows,
-                                        std::move(str_offset_col),
-                                        std::move(str_value_col),
-                                        0,
-                                        cudf::create_null_mask(num_rows, cudf::mask_state::UNALLOCATED));
+  auto str_offset_iter    = cudf::detail::make_counting_transform_iterator(0, offset_gen{str_size});
+  auto str_offset_col     = cudf::make_fixed_width_column(
    cudf::data_type{cudf::type_id::INT32}, num_rows + 1, cudf::mask_state::UNALLOCATED);
+  thrust::copy(rmm::exec_policy(stream),
+               str_offset_iter,
+               str_offset_iter + num_rows + 1,
+               str_offset_col->mutable_view().begin<int>());
+  auto str_iter = cudf::detail::make_counting_transform_iterator(0, char_values{});
+  auto str_value_col = cudf::make_fixed_width_column(
    cudf::data_type{cudf::type_id::INT8}, num_chars, cudf::mask_state::UNALLOCATED);
+  thrust::copy(rmm::exec_policy(stream),
+               str_iter,
+               str_iter + num_chars,
+               str_value_col->mutable_view().begin<int8_t>());
+  auto col2 =
+    cudf::make_strings_column(num_rows,
+                              std::move(str_offset_col),
+                              std::move(str_value_col),
+                              0,
+                              cudf::create_null_mask(num_rows, cudf::mask_state::UNALLOCATED));
 
   // doubles
   auto double_iter = cudf::detail::make_counting_transform_iterator(0, value_gen<double>{});
-  auto col3 = cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::FLOAT64}, num_rows, cudf::mask_state::UNALLOCATED);
-  thrust::copy(rmm::exec_policy(stream), double_iter, double_iter + num_rows, col3->mutable_view().begin<double>());
+  auto col3        = cudf::make_fixed_width_column(
    cudf::data_type{cudf::type_id::FLOAT64}, num_rows, cudf::mask_state::UNALLOCATED);
+  thrust::copy(rmm::exec_policy(stream),
+               double_iter,
+               double_iter + num_rows,
+               col3->mutable_view().begin<double>());
 
   auto tbl = cudf::table_view{{*col1, *col2, *col3}};
-  
+
   input_limit_test_write(base_path, tbl);
-  
-  // even though we have a very large limit here, there are two cases where we actually produce splits.
-  // - uncompressed data (with no dict). This happens because the code has to make a guess at how much
-  //   space to reserve for compressed/uncompressed data prior to reading. It does not know that everything
-  //   it will be reading in this case is uncompressed already, so this guess ends up causing it to generate two top
-  //   level passes. in practice, this shouldn't matter because we never really see uncompressed data in the wild.
+
+  // even though we have a very large limit here, there are two cases where we actually produce
+  // splits.
+  // - uncompressed data (with no dict). This happens because the code has to make a guess at how
+  // much
+  //   space to reserve for compressed/uncompressed data prior to reading. It does not know that
+  //   everything it will be reading in this case is uncompressed already, so this guess ends up
+  //   causing it to generate two top level passes. in practice, this shouldn't matter because we
+  //   never really see uncompressed data in the wild.
+  //
+  // - ZSTD (with no dict). In this case, ZSTD simply requires a huge amount of temporary
+  // space: 2.5x the total
+  //   size of the decompressed data. so 2 GB is actually not enough to hold the whole thing at
+  //   once.
   //
-  // - ZSTD (with no dict). In this case, ZSTD simply requires a huge amount of temporary space: 2.5x the total
-  //   size of the decompressed data. so 2 GB is actually not enough to hold the whole thing at once.
-  //
-  // Note that in the dictionary cases, both of these revert down to 1 chunk because the dictionaries dramatically
-  // shrink the size of the uncompressed data.
+  // Note that in the dictionary cases, both of these revert down to 1 chunk because the
+  // dictionaries dramatically shrink the size of the uncompressed data.
   constexpr int expected_a[] = {3, 3, 1, 1};
   input_limit_test_read(base_path, tbl, 0, size_t{2} * 1024 * 1024 * 1024, expected_a);
   // smaller limit
   constexpr int expected_b[] = {10, 11, 4, 1};
   input_limit_test_read(base_path, tbl, 0, 512 * 1024 * 1024, expected_b);
   // include output chunking as well
   constexpr int expected_c[] = {20, 21, 15, 14};
   input_limit_test_read(base_path, tbl, 128 * 1024 * 1024, 512 * 1024 * 1024, expected_c);
 }

From ec819288ca94b7c99472e8a0024fd5453baf48ed Mon Sep 17 00:00:00 2001
From: db
Date: Fri, 19 Jan 2024 15:06:24 -0600
Subject: [PATCH 34/49] Review feedback changes.
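(One of the review items below is wrapping device lambdas in `cuda::proclaim_return_type`. NVCC cannot always deduce the return type of an extended `__device__` lambda when it is consumed from host-compiled thrust code, so libcu++ provides a wrapper that states the type explicitly. A minimal standalone illustration of the pattern, independent of the cudf code:)

    #include <cuda/functional>  // cuda::proclaim_return_type

    // Wrap the lambda so algorithms see a declared return type instead of
    // relying on extended-lambda return type deduction:
    auto square = cuda::proclaim_return_type<int>([] __device__(int x) { return x * x; });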
---
 cpp/src/io/parquet/reader_impl.cpp         |  7 ++--
 cpp/src/io/parquet/reader_impl.hpp         |  2 +-
 cpp/src/io/parquet/reader_impl_chunking.cu | 42 +++++++++++-----------
 cpp/src/io/utilities/column_buffer.cpp     | 12 +++----
 cpp/src/io/utilities/column_buffer.hpp     | 11 ++++--
 5 files changed, 41 insertions(+), 33 deletions(-)

diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp
index bfa374bf78e..45458d43a99 100644
--- a/cpp/src/io/parquet/reader_impl.cpp
+++ b/cpp/src/io/parquet/reader_impl.cpp
@@ -66,7 +66,7 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows)
     delta_temp_buf,
     skip_rows,
     num_rows,
-    pass.level_type_size,
+    level_type_size,
     kernel_mask,
     _stream);
 
@@ -247,6 +247,9 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows)
   if (error_code.value() != 0) {
     CUDF_FAIL("Parquet data decode failed with code(s) " + error_code.str());
   }
+  // error_code.value() does a synchronize, but I think we should leave this here as a more explicit
+  // reminder in the code.
+  _stream.synchronize();
 
   // for list columns, add the final offset to every offset buffer.
   // TODO : make this happen in more efficiently. Maybe use thrust::for_each
@@ -288,8 +291,6 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows)
     }
   }
 
-  _stream.synchronize();
-
   // update null counts in the final column buffers
   for (size_t idx = 0; idx < subpass.pages.size(); idx++) {
     PageInfo* pi = &subpass.pages[idx];
diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp
index e41323ecb2e..b92665c167f 100644
--- a/cpp/src/io/parquet/reader_impl.hpp
+++ b/cpp/src/io/parquet/reader_impl.hpp
@@ -181,7 +181,7 @@ class reader::impl {
    * bounds
    *
    * A 'subpass' is defined as a subset of pages within a pass that are
-   * decompressed a decoded as a batch. Subpasses may be further subdivided
+   * decompressed and decoded as a batch. Subpasses may be further subdivided
    * into output chunks.
    */
   void setup_next_subpass(bool uses_custom_row_bounds);
diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu
index c2947d62e1e..70a3be7e194 100644
--- a/cpp/src/io/parquet/reader_impl_chunking.cu
+++ b/cpp/src/io/parquet/reader_impl_chunking.cu
@@ -35,6 +35,8 @@
 #include
 #include
 
+#include <cuda/functional>
+
 #include
 
 namespace cudf::io::parquet::detail {
@@ -198,17 +200,18 @@ __device__ size_t row_size_functor::operator()(size_t num_rows, bool nullable)
  * Sums across all nesting levels.
  */
 struct get_page_output_size {
-  __device__ cumulative_page_info operator()(PageInfo const& page)
+  __device__ cumulative_page_info operator()(PageInfo const& page) const
   {
     if (page.flags & PAGEINFO_FLAGS_DICTIONARY) {
       return cumulative_page_info{0, 0, page.src_col_schema};
     }
 
     // total nested size, not counting string data
-    auto iter = cudf::detail::make_counting_transform_iterator(0, [page] __device__(size_type i) {
-      auto const& pni = page.nesting[i];
-      return cudf::type_dispatcher(data_type{pni.type}, row_size_functor{}, pni.size, pni.nullable);
-    });
+    auto iter = cudf::detail::make_counting_transform_iterator(0,
+      cuda::proclaim_return_type<size_t>([page]__device__(size_type i) {
+        auto const& pni = page.nesting[i];
+        return cudf::type_dispatcher(data_type{pni.type}, row_size_functor{}, pni.size, pni.nullable);
+      }));
     return {
       0,
      thrust::reduce(thrust::seq, iter, iter + page.num_output_nesting_levels) + page.str_bytes,
@@ -220,7 +223,7 @@ struct get_page_output_size {
 * @brief Functor which sets the (uncompressed) size of a page.
*/
 struct get_page_input_size {
-  __device__ cumulative_page_info operator()(PageInfo const& page)
+  __device__ cumulative_page_info operator()(PageInfo const& page) const
   {
     // we treat dictionary page sizes as 0 for subpasses because we have already paid the price for
     // them at the pass level.
@@ -235,8 +238,8 @@ struct get_page_input_size {
  * @brief Functor which sets the absolute row index of a page in a cumulative_page_info struct
  */
 struct set_row_index {
-  device_span<ColumnChunkDesc> chunks;
-  device_span<PageInfo> pages;
+  device_span<ColumnChunkDesc const> chunks;
+  device_span<PageInfo const> pages;
   device_span<cumulative_page_info> c_info;
 
   __device__ void operator()(size_t i)
@@ -268,7 +271,7 @@ struct page_total_size {
   size_type const* key_offsets;
   size_t num_keys;
 
-  __device__ cumulative_page_info operator()(cumulative_page_info const& i)
+  __device__ cumulative_page_info operator()(cumulative_page_info const& i) const
   {
     // sum sizes for each input column at this row
     size_t sum = 0;
@@ -276,7 +279,7 @@ struct page_total_size {
       auto const start = key_offsets[idx];
       auto const end   = key_offsets[idx + 1];
       auto iter        = cudf::detail::make_counting_transform_iterator(
-        0, [&] __device__(size_type i) { return c_info[i].row_index; });
+        0, cuda::proclaim_return_type<size_t>([&] __device__(size_type i) { return c_info[i].row_index; }));
       auto const page_index =
         thrust::lower_bound(thrust::seq, iter + start, iter + end, i.row_index) - iter;
       sum += c_info[page_index].size_bytes;
@@ -289,14 +292,14 @@ struct page_total_size {
 * @brief Functor which returns the compressed data size for a chunk
 */
 struct get_chunk_compressed_size {
-  __device__ size_t operator()(ColumnChunkDesc const& chunk) { return chunk.compressed_size; }
+  __device__ size_t operator()(ColumnChunkDesc const& chunk) const { return chunk.compressed_size; }
 };
 
 /**
 * @brief Find the first entry in the aggregated_info that corresponds to the specified row
 *
 */
-size_t find_start_index(std::vector<cumulative_page_info> const& aggregated_info, size_t start_row)
+size_t find_start_index(cudf::host_span<cumulative_page_info const> aggregated_info, size_t start_row)
 {
   auto start = thrust::make_transform_iterator(
     aggregated_info.begin(), [&](cumulative_page_info const& i) { return i.row_index; });
@@ -323,13 +326,13 @@ size_t find_start_index(std::vector<cumulative_page_info> const& aggregated_info, size_t start_row)
 int64_t find_next_split(int64_t cur_pos,
                         size_t cur_row_index,
                         size_t cur_cumulative_size,
-                        std::vector<cumulative_page_info> const& sizes,
+                        cudf::host_span<cumulative_page_info const> sizes,
                         size_t size_limit)
 {
-  auto start = thrust::make_transform_iterator(sizes.begin(), [&](cumulative_page_info const& i) {
+  auto const start = thrust::make_transform_iterator(sizes.begin(), [&](cumulative_page_info const& i) {
     return i.size_bytes - cur_cumulative_size;
   });
-  auto end = start + sizes.size();
+  auto const end = start + sizes.size();
 
   int64_t split_pos = thrust::lower_bound(thrust::seq, start + cur_pos, end, size_limit) - start;
 
@@ -392,7 +395,7 @@ template
 }
 
 struct row_count_compare {
-  __device__ bool operator()(cumulative_page_info const& a, cumulative_page_info const& b)
+  __device__ bool operator()(cumulative_page_info const& a, cumulative_page_info const& b) const
   {
     return a.row_index < b.row_index;
   }
@@ -423,7 +426,6 @@ std::pair<size_t, size_t> get_row_group_size(RowGroup const& rg)
 *
 * This function is asynchronous. Call stream.synchronize() before using the
 * results.
- *
 */
 std::pair<rmm::device_uvector<cumulative_page_info>, rmm::device_uvector<int32_t>>
 adjust_cumulative_sizes(rmm::device_uvector<cumulative_page_info> const& c_info,
@@ -443,7 +445,7 @@ adjust_cumulative_sizes(rmm::device_uvector<cumulative_page_info> const& c_info,
     c_info_sorted.begin(),
     c_info_sorted.end(),
     page_keys_by_split.begin(),
-    [] __device__(cumulative_page_info const& c) { return c.key; });
+    cuda::proclaim_return_type<int>([] __device__(cumulative_page_info const& c) { return c.key; }));
 
   // generate key offsets (offsets to the start of each partition of keys). worst case is 1 page per
   // key
@@ -486,7 +488,7 @@ struct page_span {
 
 struct get_page_row_index {
   device_span<cumulative_page_info const> c_info;
 
-  __device__ size_t operator()(size_t i) { return c_info[i].row_index; }
+  __device__ size_t operator()(size_t i) const { return c_info[i].row_index; }
 };
 
 /**
@@ -511,7 +513,7 @@ struct get_page_span {
   {
   }
 
-  __device__ page_span operator()(size_t column_index)
+  __device__ page_span operator()(size_t column_index) const
   {
     auto const first_page_index  = page_offsets[column_index];
     auto const column_page_start = page_row_index + first_page_index;
diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp
index 4eec0efd1a6..f4324a17683 100644
--- a/cpp/src/io/utilities/column_buffer.cpp
+++ b/cpp/src/io/utilities/column_buffer.cpp
@@ -149,10 +149,10 @@ string_policy column_buffer_base<string_policy>::empty_like(string_policy const&
 }
 
 template <class string_policy>
-std::string type_to_name(column_buffer_base<string_policy> const& buffer, bool include_nesting)
+std::string type_to_name(column_buffer_base<string_policy> const& buffer)
 {
   if (buffer.type.id() == cudf::type_id::LIST) {
-    return "List<" + (type_to_name(buffer.children[0], true)) + ">";
+    return "List<" + (type_to_name(buffer.children[0])) + ">";
   }
 
   if (buffer.type.id() == cudf::type_id::STRUCT) {
@@ -164,7 +164,7 @@ std::string type_to_name(column_buffer_base<string_policy> const& buffer, bool i
       iter,
       iter + buffer.children.size(),
       std::ostream_iterator<std::string>(out, ","),
-      [&buffer](size_type i) { return type_to_name(buffer.children[i], true); });
+      [&buffer](size_type i) { return type_to_name(buffer.children[i]); });
     out << ">";
     return out.str();
   }
@@ -379,10 +379,8 @@ template std::unique_ptr<column> empty_like(pointer_column_buffer& buffer,
                                             rmm::cuda_stream_view stream,
                                             rmm::mr::device_memory_resource* mr);
 
-template std::string type_to_name(string_column_buffer const& buffer,
-                                  bool include_nesting);
-template std::string type_to_name(pointer_column_buffer const& buffer,
-                                  bool include_nesting);
+template std::string type_to_name(string_column_buffer const& buffer);
+template std::string type_to_name(pointer_column_buffer const& buffer);
 
 template class column_buffer_base;
 template class column_buffer_base;
diff --git a/cpp/src/io/utilities/column_buffer.hpp b/cpp/src/io/utilities/column_buffer.hpp
index cc39f79ba61..2bb4a9322c7 100644
--- a/cpp/src/io/utilities/column_buffer.hpp
+++ b/cpp/src/io/utilities/column_buffer.hpp
@@ -253,9 +253,16 @@ std::unique_ptr<column> empty_like(column_buffer_base<string_policy>& buffer,
                                    rmm::cuda_stream_view stream,
                                    rmm::mr::device_memory_resource* mr);
 
+
+/**
+ * @brief Given a column_buffer, produce a formatted name string describing the type.
+ *
+ * @param buffer The column buffer
+ *
+ * @return A string describing the type of the buffer suitable for printing
+ */
 template <class string_policy>
-std::string type_to_name(column_buffer_base<string_policy> const& buffer,
-                         bool include_nesting = true);
+std::string type_to_name(column_buffer_base<string_policy> const& buffer);
 
 }  // namespace detail
 }  // namespace io

From b01b06e4a17be55e009e1eabde09c07f767bacba Mon Sep 17 00:00:00 2001
From: db
Date: Fri, 19 Jan 2024 15:15:14 -0600
Subject: [PATCH 35/49] Formatting.

---
 cpp/src/io/parquet/reader_impl_chunking.cu | 25 +++++++++++++---------
 cpp/src/io/utilities/column_buffer.hpp     |  5 ++---
 2 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu
index 70a3be7e194..9138742e5c6 100644
--- a/cpp/src/io/parquet/reader_impl_chunking.cu
+++ b/cpp/src/io/parquet/reader_impl_chunking.cu
@@ -207,10 +207,11 @@ struct get_page_output_size {
   }
 
   // total nested size, not counting string data
-  auto iter = cudf::detail::make_counting_transform_iterator(0,
-    cuda::proclaim_return_type<size_t>([page]__device__(size_type i) {
+  auto iter = cudf::detail::make_counting_transform_iterator(
+    0, cuda::proclaim_return_type<size_t>([page] __device__(size_type i) {
       auto const& pni = page.nesting[i];
-      return cudf::type_dispatcher(data_type{pni.type}, row_size_functor{}, pni.size, pni.nullable);
+      return cudf::type_dispatcher(
+        data_type{pni.type}, row_size_functor{}, pni.size, pni.nullable);
     }));
   return {
     0,
@@ -279,7 +280,9 @@ struct page_total_size {
       auto const start = key_offsets[idx];
       auto const end   = key_offsets[idx + 1];
       auto iter        = cudf::detail::make_counting_transform_iterator(
-        0, cuda::proclaim_return_type<size_t>([&] __device__(size_type i) { return c_info[i].row_index; }));
+        0, cuda::proclaim_return_type<size_t>([&] __device__(size_type i) {
+          return c_info[i].row_index;
+        }));
       auto const page_index =
         thrust::lower_bound(thrust::seq, iter + start, iter + end, i.row_index) - iter;
       sum += c_info[page_index].size_bytes;
@@ -299,7 +302,8 @@ struct get_chunk_compressed_size {
 * @brief Find the first entry in the aggregated_info that corresponds to the specified row
 *
 */
-size_t find_start_index(cudf::host_span<cumulative_page_info const> aggregated_info, size_t start_row)
+size_t find_start_index(cudf::host_span<cumulative_page_info const> aggregated_info,
+                        size_t start_row)
 {
   auto start = thrust::make_transform_iterator(
     aggregated_info.begin(), [&](cumulative_page_info const& i) { return i.row_index; });
@@ -329,10 +333,10 @@ int64_t find_next_split(int64_t cur_pos,
                         cudf::host_span<cumulative_page_info const> sizes,
                         size_t size_limit)
 {
-  auto const start = thrust::make_transform_iterator(sizes.begin(), [&](cumulative_page_info const& i) {
-    return i.size_bytes - cur_cumulative_size;
-  });
-  auto const end = start + sizes.size();
+  auto const start = thrust::make_transform_iterator(
+    sizes.begin(),
+    [&](cumulative_page_info const& i) { return i.size_bytes - cur_cumulative_size; });
+  auto const end   = start + sizes.size();
 
   int64_t split_pos = thrust::lower_bound(thrust::seq, start + cur_pos, end, size_limit) - start;
 
@@ -445,7 +449,8 @@ adjust_cumulative_sizes(rmm::device_uvector<cumulative_page_info> const& c_info,
     c_info_sorted.begin(),
     c_info_sorted.end(),
     page_keys_by_split.begin(),
-    cuda::proclaim_return_type<int>([] __device__(cumulative_page_info const& c) { return c.key; }));
+    cuda::proclaim_return_type<int>(
+      [] __device__(cumulative_page_info const& c) { return c.key; }));
 
   // generate key offsets (offsets to the start of each partition of keys). worst case is 1 page per
   // key
diff --git a/cpp/src/io/utilities/column_buffer.hpp b/cpp/src/io/utilities/column_buffer.hpp
index 2bb4a9322c7..57ee1043ee9 100644
--- a/cpp/src/io/utilities/column_buffer.hpp
+++ b/cpp/src/io/utilities/column_buffer.hpp
@@ -253,12 +253,11 @@ std::unique_ptr<column> empty_like(column_buffer_base<string_policy>& buffer,
                                    rmm::cuda_stream_view stream,
                                    rmm::mr::device_memory_resource* mr);
 
-
 /**
- * @brief Given a column_buffer, produce a formatted name string describing the type. 
+ * @brief Given a column_buffer, produce a formatted name string describing the type.
  *
  * @param buffer The column buffer
- * 
+ *
  * @return A string describing the type of the buffer suitable for printing
 */
 template <class string_policy>

From ce1f94ee5b4d42fdc0a258fd05dd11560cc6d75b Mon Sep 17 00:00:00 2001
From: db
Date: Fri, 19 Jan 2024 15:57:42 -0600
Subject: [PATCH 36/49] Add a missing proclaim_return_type.

---
 cpp/src/io/parquet/reader_impl_chunking.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu
index 9138742e5c6..dbc45f024d5 100644
--- a/cpp/src/io/parquet/reader_impl_chunking.cu
+++ b/cpp/src/io/parquet/reader_impl_chunking.cu
@@ -819,9 +819,9 @@ std::vector compute_page_splits_by_row(
   CUDF_EXPECTS(thrust::all_of(rmm::exec_policy(stream),
                               comp_res.begin(),
                               comp_res.end(),
-                              [] __device__(auto const& res) {
+                              cuda::proclaim_return_type<bool>([] __device__(auto const& res) {
                                 return res.status == compression_status::SUCCESS;
-                              }),
+                              })),
                "Error during decompression");
 
   // now copy the uncompressed V2 def and rep level data

From 484d63ccb6f126cdf89cc75809001e4edf45f7c2 Mon Sep 17 00:00:00 2001
From: db
Date: Mon, 22 Jan 2024 16:28:31 -0600
Subject: [PATCH 37/49] PR review feedback. Remove unnecessary INVALID nvcomp
 wrapper enum value.

---
 cpp/src/io/comp/nvcomp_adapter.cpp          |  1 -
 cpp/src/io/comp/nvcomp_adapter.hpp          | 18 +++++++++++++---
 cpp/src/io/parquet/reader_impl_chunking.cu  | 15 ++++++-------
 cpp/tests/io/parquet_chunked_reader_test.cu |  2 +-
 4 files changed, 22 insertions(+), 14 deletions(-)

diff --git a/cpp/src/io/comp/nvcomp_adapter.cpp b/cpp/src/io/comp/nvcomp_adapter.cpp
index b908ac5ac3a..40ed7677603 100644
--- a/cpp/src/io/comp/nvcomp_adapter.cpp
+++ b/cpp/src/io/comp/nvcomp_adapter.cpp
@@ -128,7 +128,6 @@ std::string compression_type_name(compression_type compression)
     case compression_type::SNAPPY: return "Snappy";
     case compression_type::ZSTD: return "Zstandard";
     case compression_type::DEFLATE: return "Deflate";
-    case compression_type::INVALID: CUDF_FAIL("Invalid nvcomp compression type");
   }
   return "compression_type(" + std::to_string(static_cast<int>(compression)) + ")";
 }
diff --git a/cpp/src/io/comp/nvcomp_adapter.hpp b/cpp/src/io/comp/nvcomp_adapter.hpp
index 6953b598ac6..3b28e38bf56 100644
--- a/cpp/src/io/comp/nvcomp_adapter.hpp
+++ b/cpp/src/io/comp/nvcomp_adapter.hpp
@@ -29,7 +29,7 @@
 
 namespace cudf::io::nvcomp {
 
-enum class compression_type { SNAPPY, ZSTD, DEFLATE, INVALID };
+enum class compression_type { SNAPPY, ZSTD, DEFLATE };
 
 /**
 * @brief Set of parameters that impact whether the use of nvCOMP features is enabled.
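(The scratch-size query used alongside this enum cleanup is what makes ZSTD's decode-time memory cost visible to the chunked reader. A rough sketch of how a caller might budget for it; the page counts and sizes below are illustrative, not taken from the code:)

    // ZSTD needs roughly 2.5x the decompressed size as temporary space.
    // batched_decompress_temp_size reports the exact requirement:
    size_t const num_pages     = 1024;               // pages in this subpass (example)
    size_t const max_page_size = 1 * 1024 * 1024;    // largest decompressed page (example)
    size_t const total_size    = 512 * 1024 * 1024;  // total decompressed bytes (example)
    size_t const scratch       = cudf::io::nvcomp::batched_decompress_temp_size(
      cudf::io::nvcomp::compression_type::ZSTD, num_pages, max_page_size, total_size);
    // subpass memory is approximately: compressed bytes + decompressed bytes + scratch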
@@ -99,8 +99,8 @@ inline bool operator==(feature_status_parameters const& lhs, feature_status_para * @param[in] inputs List of input buffers * @param[out] outputs List of output buffers * @param[out] results List of output status structures - * @param[in] max_uncomp_chunk_size maximum size of uncompressed chunk - * @param[in] max_total_uncomp_size maximum total size of uncompressed data + * @param[in] max_uncomp_chunk_size Maximum size of any single uncompressed chunk + * @param[in] max_total_uncomp_size Maximum total size of uncompressed data * @param[in] stream CUDA stream to use */ void batched_decompress(compression_type compression, @@ -111,6 +111,18 @@ void batched_decompress(compression_type compression, size_t max_total_uncomp_size, rmm::cuda_stream_view stream); +/** + * @brief Return the amount of temporary space required in bytes for a given decompression + * operation. + * + * The size returned reflects the size of the scratch buffer to be passed to `batched_decompress_async` + * + * @param[in] compression Compression type + * @param[in] num_chunks The number of decompression chunks to be processed + * @param[in] max_uncomp_chunk_size Maximum size of any single uncompressed chunk + * @param[in] max_total_uncomp_size Maximum total size of uncompressed data + * @returns The total required size in bytes + */ size_t batched_decompress_temp_size(compression_type compression, size_t num_chunks, size_t max_uncomp_chunk_size, diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index dbc45f024d5..1bb687410bb 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -972,8 +972,6 @@ struct decomp_sum { struct get_decomp_scratch { size_t operator()(decompression_info const& di) { - cudf::io::nvcomp::compression_type nvcomp_codec = cudf::io::nvcomp::compression_type::INVALID; - switch (di.codec) { case UNCOMPRESSED: case GZIP: return 0; @@ -982,20 +980,19 @@ struct get_decomp_scratch { case SNAPPY: if (cudf::io::detail::nvcomp_integration::is_stable_enabled()) { - nvcomp_codec = cudf::io::nvcomp::compression_type::SNAPPY; + return cudf::io::nvcomp::batched_decompress_temp_size( + cudf::io::nvcomp::compression_type::SNAPPY, di.num_pages, di.max_page_decompressed_size, di.total_decompressed_size); } else { return 0; } break; - case ZSTD: nvcomp_codec = cudf::io::nvcomp::compression_type::ZSTD; break; + + case ZSTD: + return cudf::io::nvcomp::batched_decompress_temp_size( + cudf::io::nvcomp::compression_type::ZSTD, di.num_pages, di.max_page_decompressed_size, di.total_decompressed_size); default: CUDF_FAIL("Invalid compression codec for parquet decompression"); } - - CUDF_EXPECTS(nvcomp_codec != cudf::io::nvcomp::compression_type::INVALID, - "Invalid nvcomp codec encountered"); - return cudf::io::nvcomp::batched_decompress_temp_size( - nvcomp_codec, di.num_pages, di.max_page_decompressed_size, di.total_decompressed_size); } }; diff --git a/cpp/tests/io/parquet_chunked_reader_test.cu b/cpp/tests/io/parquet_chunked_reader_test.cu index b0232ee8292..a5973724ae7 100644 --- a/cpp/tests/io/parquet_chunked_reader_test.cu +++ b/cpp/tests/io/parquet_chunked_reader_test.cu @@ -1013,7 +1013,7 @@ TEST_F(ParquetChunkedReaderInputLimitConstrainedTest, SingleFixedWidthColumn) { auto base_path = temp_env->get_temp_filepath("single_col_fixed_width"); constexpr auto num_rows = 1'000'000; - auto iter1 = cudf::detail::make_counting_transform_iterator(0, [](int i) { return 15; }); + auto iter1 = 
thrust::make_constant_iterator(15);
   cudf::test::fixed_width_column_wrapper col1(iter1, iter1 + num_rows);
   auto tbl = cudf::table_view{{col1}};

From 781526b3b3f0554ff3368f847d5f6bd4fc11175d Mon Sep 17 00:00:00 2001
From: db
Date: Mon, 22 Jan 2024 16:32:56 -0600
Subject: [PATCH 38/49] Formatting.

---
 cpp/src/io/comp/nvcomp_adapter.hpp          |  7 ++++---
 cpp/src/io/parquet/reader_impl_chunking.cu  | 10 ++++++++--
 cpp/tests/io/parquet_chunked_reader_test.cu |  2 +-
 3 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/cpp/src/io/comp/nvcomp_adapter.hpp b/cpp/src/io/comp/nvcomp_adapter.hpp
index 3b28e38bf56..69a278757ce 100644
--- a/cpp/src/io/comp/nvcomp_adapter.hpp
+++ b/cpp/src/io/comp/nvcomp_adapter.hpp
@@ -114,11 +114,12 @@ void batched_decompress(compression_type compression,
 /**
  * @brief Return the amount of temporary space required in bytes for a given decompression
  * operation.
- *
- * The size returned reflects the size of the scratch buffer to be passed to `batched_decompress_async`
+ *
+ * The size returned reflects the size of the scratch buffer to be passed to
+ * `batched_decompress_async`
  *
  * @param[in] compression Compression type
- * @param[in] num_chunks The number of decompression chunks to be processed 
+ * @param[in] num_chunks The number of decompression chunks to be processed
  * @param[in] max_uncomp_chunk_size Maximum size of any single uncompressed chunk
  * @param[in] max_total_uncomp_size Maximum total size of uncompressed data
  * @returns The total required size in bytes
  */
 size_t batched_decompress_temp_size(compression_type compression,
                                     size_t num_chunks,
                                     size_t max_uncomp_chunk_size,
diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu
index 1bb687410bb..6cbc32b717e 100644
--- a/cpp/src/io/parquet/reader_impl_chunking.cu
+++ b/cpp/src/io/parquet/reader_impl_chunking.cu
@@ -981,7 +981,10 @@ struct get_decomp_scratch {
       case SNAPPY:
         if (cudf::io::detail::nvcomp_integration::is_stable_enabled()) {
           return cudf::io::nvcomp::batched_decompress_temp_size(
-            cudf::io::nvcomp::compression_type::SNAPPY, di.num_pages, di.max_page_decompressed_size, di.total_decompressed_size);
+            cudf::io::nvcomp::compression_type::SNAPPY,
+            di.num_pages,
+            di.max_page_decompressed_size,
+            di.total_decompressed_size);
         } else {
           return 0;
         }
@@ -989,7 +992,10 @@ struct get_decomp_scratch {
 
       case ZSTD:
         return cudf::io::nvcomp::batched_decompress_temp_size(
-          cudf::io::nvcomp::compression_type::ZSTD, di.num_pages, di.max_page_decompressed_size, di.total_decompressed_size);
+          cudf::io::nvcomp::compression_type::ZSTD,
+          di.num_pages,
+          di.max_page_decompressed_size,
+          di.total_decompressed_size);
       default: CUDF_FAIL("Invalid compression codec for parquet decompression");
     }
diff --git a/cpp/tests/io/parquet_chunked_reader_test.cu b/cpp/tests/io/parquet_chunked_reader_test.cu
index a5973724ae7..6f64306ed3c 100644
--- a/cpp/tests/io/parquet_chunked_reader_test.cu
+++ b/cpp/tests/io/parquet_chunked_reader_test.cu
@@ -1013,7 +1013,7 @@ TEST_F(ParquetChunkedReaderInputLimitConstrainedTest, SingleFixedWidthColumn)
 {
   auto base_path = temp_env->get_temp_filepath("single_col_fixed_width");
   constexpr auto num_rows = 1'000'000;
-  auto iter1     = thrust::make_constant_iterator(15);
+  auto iter1 = thrust::make_constant_iterator(15);
   cudf::test::fixed_width_column_wrapper col1(iter1, iter1 + num_rows);
   auto tbl = cudf::table_view{{col1}};

From fe995bba7b0b12da39dac51b2fcacc651c7c3ee0 Mon Sep 17 00:00:00 2001
From: db
Date: Mon, 22 Jan 2024 17:25:53 -0600
Subject: [PATCH 39/49] PR review feedback. Remove use of deprecated
 make_strings_column() interface.
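
The deprecated overload took the string chars as a child INT8 column; the
replacement overload takes ownership of a raw rmm::device_buffer instead. A
rough sketch of the new-style call used in the test changes below (num_rows,
num_chars, offsets and stream are placeholder names, not from this diff):

  // chars now travel as a plain device buffer rather than as a column
  rmm::device_buffer chars(num_chars, stream);
  auto str_col = cudf::make_strings_column(num_rows,
                                           std::move(offsets),      // INT32 offsets column
                                           std::move(chars),        // raw chars buffer
                                           0,                       // null count
                                           rmm::device_buffer{});   // null mask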
--- cpp/tests/io/parquet_chunked_reader_test.cu | 78 +++++++++++++-------- 1 file changed, 48 insertions(+), 30 deletions(-) diff --git a/cpp/tests/io/parquet_chunked_reader_test.cu b/cpp/tests/io/parquet_chunked_reader_test.cu index 6f64306ed3c..8446c313e07 100644 --- a/cpp/tests/io/parquet_chunked_reader_test.cu +++ b/cpp/tests/io/parquet_chunked_reader_test.cu @@ -953,6 +953,16 @@ TEST_F(ParquetChunkedReaderTest, TestChunkedReadNullCount) } while (reader.has_next()); } +constexpr size_t input_limit_expected_file_count = 4; + +std::vector input_limit_get_test_names(std::string const& base_filename) +{ + return { base_filename + "_a.parquet", + base_filename + "_b.parquet", + base_filename + "_c.parquet", + base_filename + "_d.parquet" }; +} + void input_limit_test_write_one(std::string const& filepath, cudf::table_view const& t, cudf::io::compression_type compression, @@ -965,42 +975,43 @@ void input_limit_test_write_one(std::string const& filepath, cudf::io::write_parquet(out_opts); } -void input_limit_test_write(std::string const& base_path, cudf::table_view const& t) +void input_limit_test_write(std::vector const& test_filenames, cudf::table_view const& t) { + CUDF_EXPECTS(test_filenames.size() == 4, "Unexpected count of test filenames"); + CUDF_EXPECTS(test_filenames.size() == input_limit_expected_file_count, "Unexpected count of test filenames"); + // no compression - input_limit_test_write_one(base_path + "_a.parquet", + input_limit_test_write_one(test_filenames[0], t, cudf::io::compression_type::NONE, cudf::io::dictionary_policy::NEVER); // compression with a codec that uses a lot of scratch space at decode time (2.5x the total // decompressed buffer size) - input_limit_test_write_one(base_path + "_b.parquet", + input_limit_test_write_one(test_filenames[1], t, cudf::io::compression_type::ZSTD, cudf::io::dictionary_policy::NEVER); // compression with a codec that uses no scratch space at decode time - input_limit_test_write_one(base_path + "_c.parquet", + input_limit_test_write_one(test_filenames[2], t, cudf::io::compression_type::SNAPPY, cudf::io::dictionary_policy::NEVER); - input_limit_test_write_one(base_path + "_d.parquet", + input_limit_test_write_one(test_filenames[3], t, cudf::io::compression_type::SNAPPY, cudf::io::dictionary_policy::ALWAYS); } -void input_limit_test_read(std::string const& base_path, +void input_limit_test_read(std::vector const& test_filenames, cudf::table_view const& t, size_t output_limit, size_t input_limit, - int const expected_chunk_counts[4]) + int const expected_chunk_counts[input_limit_expected_file_count]) { - std::vector file_suffix{"_a.parquet", "_b.parquet", "_c.parquet", "_d.parquet"}; - CUDF_EXPECTS(file_suffix.size() == 4, - "Unexpected mismatch between number of test cases and result count"); + CUDF_EXPECTS(test_filenames.size() == input_limit_expected_file_count, "Unexpected count of test filenames"); - for (size_t idx = 0; idx < file_suffix.size(); idx++) { - auto result = chunked_read(base_path + file_suffix[idx], output_limit, input_limit); + for (size_t idx = 0; idx < test_filenames.size(); idx++) { + auto result = chunked_read(test_filenames[idx], output_limit, input_limit); CUDF_EXPECTS(result.second == expected_chunk_counts[idx], "Unexpected number of chunks produced in chunk read"); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result.first, t); @@ -1012,24 +1023,28 @@ struct ParquetChunkedReaderInputLimitConstrainedTest : public cudf::test::BaseFi TEST_F(ParquetChunkedReaderInputLimitConstrainedTest, SingleFixedWidthColumn) { auto 
base_path = temp_env->get_temp_filepath("single_col_fixed_width"); + auto test_filenames = input_limit_get_test_names(base_path); + constexpr auto num_rows = 1'000'000; auto iter1 = thrust::make_constant_iterator(15); cudf::test::fixed_width_column_wrapper col1(iter1, iter1 + num_rows); auto tbl = cudf::table_view{{col1}}; - input_limit_test_write(base_path, tbl); + input_limit_test_write(test_filenames, tbl); // semi-reasonable limit constexpr int expected_a[] = {1, 17, 4, 1}; - input_limit_test_read(base_path, tbl, 0, 2 * 1024 * 1024, expected_a); + input_limit_test_read(test_filenames, tbl, 0, 2 * 1024 * 1024, expected_a); // an unreasonable limit constexpr int expected_b[] = {1, 50, 50, 1}; - input_limit_test_read(base_path, tbl, 0, 1, expected_b); + input_limit_test_read(test_filenames, tbl, 0, 1, expected_b); } TEST_F(ParquetChunkedReaderInputLimitConstrainedTest, MixedColumns) { auto base_path = temp_env->get_temp_filepath("mixed_columns"); + auto test_filenames = input_limit_get_test_names(base_path); + constexpr auto num_rows = 1'000'000; auto iter1 = thrust::make_counting_iterator(0); @@ -1048,12 +1063,12 @@ TEST_F(ParquetChunkedReaderInputLimitConstrainedTest, MixedColumns) auto tbl = cudf::table_view{{col1, col2, col3}}; - input_limit_test_write(base_path, tbl); + input_limit_test_write(test_filenames, tbl); constexpr int expected_a[] = {1, 50, 10, 7}; - input_limit_test_read(base_path, tbl, 0, 2 * 1024 * 1024, expected_a); + input_limit_test_read(test_filenames, tbl, 0, 2 * 1024 * 1024, expected_a); constexpr int expected_b[] = {1, 50, 50, 50}; - input_limit_test_read(base_path, tbl, 0, 1, expected_b); + input_limit_test_read(test_filenames, tbl, 0, 1, expected_b); } struct ParquetChunkedReaderInputLimitTest : public cudf::test::BaseFixture {}; @@ -1070,6 +1085,8 @@ struct value_gen { TEST_F(ParquetChunkedReaderInputLimitTest, List) { auto base_path = temp_env->get_temp_filepath("list"); + auto test_filenames = input_limit_get_test_names(base_path); + constexpr int num_rows = 50'000'000; constexpr int list_size = 4; @@ -1102,7 +1119,7 @@ TEST_F(ParquetChunkedReaderInputLimitTest, List) auto tbl = cudf::table_view{{*col1}}; - input_limit_test_write(base_path, tbl); + input_limit_test_write(test_filenames, tbl); // even though we have a very large limit here, there are two cases where we actually produce // splits. @@ -1121,13 +1138,13 @@ TEST_F(ParquetChunkedReaderInputLimitTest, List) // Note that in the dictionary cases, both of these revert down to 1 chunk because the // dictionaries dramatically shrink the size of the uncompressed data. 
constexpr int expected_a[] = {2, 2, 1, 1}; - input_limit_test_read(base_path, tbl, 0, size_t{2} * 1024 * 1024 * 1024, expected_a); + input_limit_test_read(test_filenames, tbl, 0, size_t{2} * 1024 * 1024 * 1024, expected_a); // smaller limit constexpr int expected_b[] = {6, 6, 2, 1}; - input_limit_test_read(base_path, tbl, 0, 512 * 1024 * 1024, expected_b); + input_limit_test_read(test_filenames, tbl, 0, 512 * 1024 * 1024, expected_b); // include output chunking as well constexpr int expected_c[] = {11, 11, 9, 8}; - input_limit_test_read(base_path, tbl, 128 * 1024 * 1024, 512 * 1024 * 1024, expected_c); + input_limit_test_read(test_filenames, tbl, 128 * 1024 * 1024, 512 * 1024 * 1024, expected_c); } struct char_values { @@ -1141,6 +1158,8 @@ struct char_values { TEST_F(ParquetChunkedReaderInputLimitTest, Mixed) { auto base_path = temp_env->get_temp_filepath("mixed_types"); + auto test_filenames = input_limit_get_test_names(base_path); + constexpr int num_rows = 50'000'000; constexpr int list_size = 4; constexpr int str_size = 3; @@ -1182,16 +1201,15 @@ TEST_F(ParquetChunkedReaderInputLimitTest, Mixed) str_offset_iter + num_rows + 1, str_offset_col->mutable_view().begin()); auto str_iter = cudf::detail::make_counting_transform_iterator(0, char_values{}); - auto str_value_col = cudf::make_fixed_width_column( - cudf::data_type{cudf::type_id::INT8}, num_chars, cudf::mask_state::UNALLOCATED); + rmm::device_buffer str_chars(num_chars, stream); thrust::copy(rmm::exec_policy(stream), str_iter, str_iter + num_chars, - str_value_col->mutable_view().begin()); + static_cast(str_chars.data())); auto col2 = cudf::make_strings_column(num_rows, std::move(str_offset_col), - std::move(str_value_col), + std::move(str_chars), 0, cudf::create_null_mask(num_rows, cudf::mask_state::UNALLOCATED)); @@ -1206,7 +1224,7 @@ TEST_F(ParquetChunkedReaderInputLimitTest, Mixed) auto tbl = cudf::table_view{{*col1, *col2, *col3}}; - input_limit_test_write(base_path, tbl); + input_limit_test_write(test_filenames, tbl); // even though we have a very large limit here, there are two cases where we actually produce // splits. @@ -1225,11 +1243,11 @@ TEST_F(ParquetChunkedReaderInputLimitTest, Mixed) // Note that in the dictionary cases, both of these revert down to 1 chunk because the // dictionaries dramatically shrink the size of the uncompressed data. constexpr int expected_a[] = {3, 3, 1, 1}; - input_limit_test_read(base_path, tbl, 0, size_t{2} * 1024 * 1024 * 1024, expected_a); + input_limit_test_read(test_filenames, tbl, 0, size_t{2} * 1024 * 1024 * 1024, expected_a); // smaller limit constexpr int expected_b[] = {10, 11, 4, 1}; - input_limit_test_read(base_path, tbl, 0, 512 * 1024 * 1024, expected_b); + input_limit_test_read(test_filenames, tbl, 0, 512 * 1024 * 1024, expected_b); // include output chunking as well constexpr int expected_c[] = {20, 21, 15, 14}; - input_limit_test_read(base_path, tbl, 128 * 1024 * 1024, 512 * 1024 * 1024, expected_c); + input_limit_test_read(test_filenames, tbl, 128 * 1024 * 1024, 512 * 1024 * 1024, expected_c); } From 97df57a7d6f4bafd150773d490cd83c2f60dc62d Mon Sep 17 00:00:00 2001 From: db Date: Mon, 22 Jan 2024 17:34:54 -0600 Subject: [PATCH 40/49] Formatting. 
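
clang-format only; no functional changes. For reviewers tracing the
input-limit tests being reflowed here: the chunked_read helper these tests
call drives the reader roughly as follows (a sketch under assumed names; the
real helper is defined earlier in this test file and may differ in detail):

  auto const options =
    cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}).build();
  // output_limit caps the size of each returned table chunk; input_limit caps
  // the temporary/decompression memory used while producing it
  auto reader = cudf::io::chunked_parquet_reader(output_limit, input_limit, options);
  int num_chunks = 0;
  while (reader.has_next()) {
    auto chunk = reader.read_chunk();  // table_with_metadata for this chunk
    ++num_chunks;
  }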
--- cpp/tests/io/parquet_chunked_reader_test.cu | 51 ++++++++++----------- 1 file changed, 23 insertions(+), 28 deletions(-) diff --git a/cpp/tests/io/parquet_chunked_reader_test.cu b/cpp/tests/io/parquet_chunked_reader_test.cu index 8446c313e07..dea44f0e7c3 100644 --- a/cpp/tests/io/parquet_chunked_reader_test.cu +++ b/cpp/tests/io/parquet_chunked_reader_test.cu @@ -957,10 +957,10 @@ constexpr size_t input_limit_expected_file_count = 4; std::vector input_limit_get_test_names(std::string const& base_filename) { - return { base_filename + "_a.parquet", - base_filename + "_b.parquet", - base_filename + "_c.parquet", - base_filename + "_d.parquet" }; + return {base_filename + "_a.parquet", + base_filename + "_b.parquet", + base_filename + "_c.parquet", + base_filename + "_d.parquet"}; } void input_limit_test_write_one(std::string const& filepath, @@ -975,31 +975,25 @@ void input_limit_test_write_one(std::string const& filepath, cudf::io::write_parquet(out_opts); } -void input_limit_test_write(std::vector const& test_filenames, cudf::table_view const& t) +void input_limit_test_write(std::vector const& test_filenames, + cudf::table_view const& t) { CUDF_EXPECTS(test_filenames.size() == 4, "Unexpected count of test filenames"); - CUDF_EXPECTS(test_filenames.size() == input_limit_expected_file_count, "Unexpected count of test filenames"); + CUDF_EXPECTS(test_filenames.size() == input_limit_expected_file_count, + "Unexpected count of test filenames"); // no compression - input_limit_test_write_one(test_filenames[0], - t, - cudf::io::compression_type::NONE, - cudf::io::dictionary_policy::NEVER); + input_limit_test_write_one( + test_filenames[0], t, cudf::io::compression_type::NONE, cudf::io::dictionary_policy::NEVER); // compression with a codec that uses a lot of scratch space at decode time (2.5x the total // decompressed buffer size) - input_limit_test_write_one(test_filenames[1], - t, - cudf::io::compression_type::ZSTD, - cudf::io::dictionary_policy::NEVER); + input_limit_test_write_one( + test_filenames[1], t, cudf::io::compression_type::ZSTD, cudf::io::dictionary_policy::NEVER); // compression with a codec that uses no scratch space at decode time - input_limit_test_write_one(test_filenames[2], - t, - cudf::io::compression_type::SNAPPY, - cudf::io::dictionary_policy::NEVER); - input_limit_test_write_one(test_filenames[3], - t, - cudf::io::compression_type::SNAPPY, - cudf::io::dictionary_policy::ALWAYS); + input_limit_test_write_one( + test_filenames[2], t, cudf::io::compression_type::SNAPPY, cudf::io::dictionary_policy::NEVER); + input_limit_test_write_one( + test_filenames[3], t, cudf::io::compression_type::SNAPPY, cudf::io::dictionary_policy::ALWAYS); } void input_limit_test_read(std::vector const& test_filenames, @@ -1008,7 +1002,8 @@ void input_limit_test_read(std::vector const& test_filenames, size_t input_limit, int const expected_chunk_counts[input_limit_expected_file_count]) { - CUDF_EXPECTS(test_filenames.size() == input_limit_expected_file_count, "Unexpected count of test filenames"); + CUDF_EXPECTS(test_filenames.size() == input_limit_expected_file_count, + "Unexpected count of test filenames"); for (size_t idx = 0; idx < test_filenames.size(); idx++) { auto result = chunked_read(test_filenames[idx], output_limit, input_limit); @@ -1022,7 +1017,7 @@ struct ParquetChunkedReaderInputLimitConstrainedTest : public cudf::test::BaseFi TEST_F(ParquetChunkedReaderInputLimitConstrainedTest, SingleFixedWidthColumn) { - auto base_path = temp_env->get_temp_filepath("single_col_fixed_width"); + 
auto base_path = temp_env->get_temp_filepath("single_col_fixed_width"); auto test_filenames = input_limit_get_test_names(base_path); constexpr auto num_rows = 1'000'000; @@ -1042,7 +1037,7 @@ TEST_F(ParquetChunkedReaderInputLimitConstrainedTest, SingleFixedWidthColumn) TEST_F(ParquetChunkedReaderInputLimitConstrainedTest, MixedColumns) { - auto base_path = temp_env->get_temp_filepath("mixed_columns"); + auto base_path = temp_env->get_temp_filepath("mixed_columns"); auto test_filenames = input_limit_get_test_names(base_path); constexpr auto num_rows = 1'000'000; @@ -1084,7 +1079,7 @@ struct value_gen { }; TEST_F(ParquetChunkedReaderInputLimitTest, List) { - auto base_path = temp_env->get_temp_filepath("list"); + auto base_path = temp_env->get_temp_filepath("list"); auto test_filenames = input_limit_get_test_names(base_path); constexpr int num_rows = 50'000'000; @@ -1157,7 +1152,7 @@ struct char_values { }; TEST_F(ParquetChunkedReaderInputLimitTest, Mixed) { - auto base_path = temp_env->get_temp_filepath("mixed_types"); + auto base_path = temp_env->get_temp_filepath("mixed_types"); auto test_filenames = input_limit_get_test_names(base_path); constexpr int num_rows = 50'000'000; @@ -1200,7 +1195,7 @@ TEST_F(ParquetChunkedReaderInputLimitTest, Mixed) str_offset_iter, str_offset_iter + num_rows + 1, str_offset_col->mutable_view().begin()); - auto str_iter = cudf::detail::make_counting_transform_iterator(0, char_values{}); + auto str_iter = cudf::detail::make_counting_transform_iterator(0, char_values{}); rmm::device_buffer str_chars(num_chars, stream); thrust::copy(rmm::exec_policy(stream), str_iter, From 9097dfd2a7e88581ee0cf3af1785f3eb01fb6b6e Mon Sep 17 00:00:00 2001 From: db Date: Tue, 23 Jan 2024 13:05:03 -0600 Subject: [PATCH 41/49] PR review feedback. --- cpp/src/io/parquet/parquet_gpu.hpp | 2 +- cpp/src/io/parquet/reader_impl.hpp | 1 - cpp/src/io/parquet/reader_impl_chunking.cu | 10 +++++++--- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 7f6c32f11ad..c717d7c247e 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -349,7 +349,7 @@ struct get_page_key { /** * @brief Return an iterator that returns they keys for a vector of pages. */ -inline auto make_page_key_iterator(device_span pages) +inline auto make_page_key_iterator(device_span pages) { return thrust::make_transform_iterator(pages.begin(), get_page_key{}); } diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index b92665c167f..67c56c9c2d7 100644 --- a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -230,7 +230,6 @@ class reader::impl { * all of the rows in the page, not the number of rows themselves. In order to do subpass reading * more accurately, we would like to have a more accurate guess of the real number of rows per * page. - * */ void generate_list_column_row_count_estimates(); diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index 6cbc32b717e..0e6d3ae59bb 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -52,6 +52,12 @@ struct split_info { // do a subpass decode. if the difference between the user specified limit and // the actual memory used for compressed/temp data is > than this value, we will still use // at least this many additional bytes. 
+// Example: +// - user has specified 1 GB limit +// - we have read in 900 MB of compressed data +// - that leaves us 100 MB of space for decompression batches +// - to keep the gpu busy, we really don't want to do less than 200 MB at a time so we're just going to use 200 MB of space +// even if that goes past the user-specified limit. constexpr size_t minimum_subpass_expected_size = 200 * 1024 * 1024; // percentage of the total available input read limit that should be reserved for compressed @@ -1219,9 +1225,7 @@ void reader::impl::setup_next_subpass(bool uses_custom_row_bounds) // if the user has passed a very small value (under the hardcoded minimum_subpass_expected_size), // respect it. - auto const min_subpass_size = _input_pass_read_limit < minimum_subpass_expected_size - ? _input_pass_read_limit - : minimum_subpass_expected_size; + auto const min_subpass_size = std::min(_input_pass_read_limit, minimum_subpass_expected_size); // what do we do if the base memory size (the compressed data) itself is approaching or larger // than the overall read limit? we are still going to be decompressing in subpasses, but we have From 44f365a5806cb4d1ed3c38fa3e1bf6bda4775880 Mon Sep 17 00:00:00 2001 From: db Date: Tue, 23 Jan 2024 13:13:29 -0600 Subject: [PATCH 42/49] Formatting --- cpp/src/io/parquet/reader_impl_chunking.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index 0e6d3ae59bb..36898ec3f6c 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -56,7 +56,8 @@ struct split_info { // - user has specified 1 GB limit // - we have read in 900 MB of compressed data // - that leaves us 100 MB of space for decompression batches -// - to keep the gpu busy, we really don't want to do less than 200 MB at a time so we're just going to use 200 MB of space +// - to keep the gpu busy, we really don't want to do less than 200 MB at a time so we're just going +// to use 200 MB of space // even if that goes past the user-specified limit. constexpr size_t minimum_subpass_expected_size = 200 * 1024 * 1024; From 7ff6459cd75cdf9bdfbb11a0bb2bc04524b591d3 Mon Sep 17 00:00:00 2001 From: db Date: Tue, 23 Jan 2024 17:42:41 -0600 Subject: [PATCH 43/49] More PR review feedback. --- cpp/src/io/parquet/reader_impl_chunking.cu | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index 36898ec3f6c..58c83a02a10 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -547,6 +547,10 @@ struct get_page_span { } }; +struct get_span_size { + __device__ size_t operator()(page_span const& s) const { return s.end - s.start; } +}; + /** * @brief Computes the next subpass within the current pass. 
* @@ -606,13 +610,13 @@ std::tuple, size_t, size_t> compute_next_subpass( iter + num_columns, page_bounds.begin(), get_page_span{page_offsets, page_row_index, start_row, end_row}); - auto h_page_bounds = cudf::detail::make_std_vector_sync(page_bounds, stream); - + // total page count over all columns auto page_count_iter = thrust::make_transform_iterator( - h_page_bounds.begin(), [](page_span const& s) { return s.end - s.start; }); - size_t const total_pages = std::reduce(page_count_iter, page_count_iter + num_columns); - + page_bounds.begin(), get_span_size{}); + size_t const total_pages = thrust::reduce(rmm::exec_policy(stream), page_count_iter, page_count_iter + num_columns); + + auto h_page_bounds = cudf::detail::make_std_vector_sync(page_bounds, stream); return {h_page_bounds, total_pages, h_aggregated_info[end_index].size_bytes - cumulative_size}; } @@ -839,12 +843,12 @@ std::vector compute_page_splits_by_row( copy_out, stream, rmm::mr::get_current_device_resource()); gpu_copy_uncompressed_blocks(d_copy_in, d_copy_out, stream); + stream.synchronize(); } pages.host_to_device_async(stream); stream.synchronize(); - return decomp_pages; } @@ -948,7 +952,7 @@ struct decompression_info { struct get_decomp_info { device_span chunks; - __device__ decompression_info operator()(PageInfo const& p) + __device__ decompression_info operator()(PageInfo const& p) const { return {static_cast(chunks[p.chunk_idx].codec), 1, @@ -962,7 +966,7 @@ struct get_decomp_info { * */ struct decomp_sum { - __device__ decompression_info operator()(decompression_info const& a, decompression_info const& b) + __device__ decompression_info operator()(decompression_info const& a, decompression_info const& b) const { return {a.codec, a.num_pages + b.num_pages, @@ -977,7 +981,7 @@ struct decomp_sum { * */ struct get_decomp_scratch { - size_t operator()(decompression_info const& di) + size_t operator()(decompression_info const& di) const { switch (di.codec) { case UNCOMPRESSED: From 8ab4f8043ef323fbb56ed0c921eab711ec210c48 Mon Sep 17 00:00:00 2001 From: db Date: Tue, 23 Jan 2024 17:45:57 -0600 Subject: [PATCH 44/49] Formatting. 
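
Formatting only. For context, the hunk below reflows the device-side page
count reduction introduced in the previous commit; the underlying pattern is a
transform iterator feeding thrust::reduce, roughly like this (standalone
sketch with simplified types, not code from this change):

  struct page_span { size_t start, end; };
  struct get_span_size {
    __device__ size_t operator()(page_span const& s) const { return s.end - s.start; }
  };

  // sums (end - start) over all spans on the device; only the scalar comes back
  auto iter  = thrust::make_transform_iterator(page_bounds.begin(), get_span_size{});
  auto total = thrust::reduce(rmm::exec_policy(stream), iter, iter + num_columns);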
--- cpp/src/io/parquet/reader_impl_chunking.cu | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index 58c83a02a10..362155edbb6 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -610,12 +610,12 @@ std::tuple, size_t, size_t> compute_next_subpass( iter + num_columns, page_bounds.begin(), get_page_span{page_offsets, page_row_index, start_row, end_row}); - + // total page count over all columns - auto page_count_iter = thrust::make_transform_iterator( - page_bounds.begin(), get_span_size{}); - size_t const total_pages = thrust::reduce(rmm::exec_policy(stream), page_count_iter, page_count_iter + num_columns); - + auto page_count_iter = thrust::make_transform_iterator(page_bounds.begin(), get_span_size{}); + size_t const total_pages = + thrust::reduce(rmm::exec_policy(stream), page_count_iter, page_count_iter + num_columns); + auto h_page_bounds = cudf::detail::make_std_vector_sync(page_bounds, stream); return {h_page_bounds, total_pages, h_aggregated_info[end_index].size_bytes - cumulative_size}; } @@ -966,7 +966,8 @@ struct get_decomp_info { * */ struct decomp_sum { - __device__ decompression_info operator()(decompression_info const& a, decompression_info const& b) const + __device__ decompression_info operator()(decompression_info const& a, + decompression_info const& b) const { return {a.codec, a.num_pages + b.num_pages, From 3ed0351c9e35f24687ad4ed6886c4ef17d9a9b95 Mon Sep 17 00:00:00 2001 From: db Date: Wed, 24 Jan 2024 10:14:16 -0600 Subject: [PATCH 45/49] Wave of PR review feedback. --- cpp/src/io/parquet/parquet_gpu.hpp | 3 +- cpp/src/io/parquet/reader_impl.cpp | 3 +- cpp/src/io/parquet/reader_impl_chunking.cu | 42 ++++++++++++--------- cpp/src/io/parquet/reader_impl_chunking.hpp | 9 +---- 4 files changed, 27 insertions(+), 30 deletions(-) diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index c717d7c247e..d58c7f95389 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -414,8 +414,7 @@ struct ColumnChunkDesc { uint32_t num_rows{}; // number of rows in this chunk int16_t max_level[level_type::NUM_LEVEL_TYPES]{}; // max definition/repetition level int16_t max_nesting_depth{}; // max nesting depth of the output - uint16_t data_type{}; // basic column data type, ((type_length << 3) | - // parquet::Type) + uint16_t data_type{}; // basic column data type, ((type_length << 3) | // parquet::Type) uint8_t level_bits[level_type::NUM_LEVEL_TYPES]{}; // bits to encode max definition/repetition levels int32_t num_data_pages{}; // number of data pages diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 45458d43a99..f76c90faf82 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -247,8 +247,7 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) if (error_code.value() != 0) { CUDF_FAIL("Parquet data decode failed with code(s) " + error_code.str()); } - // error_code.value() does a synchronize, but I think we should leave this here as a more explicit - // reminder in the code. + // error_code.value() is synchronous; explictly sync here for better visibility _stream.synchronize(); // for list columns, add the final offset to every offset buffer. 
diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index 362155edbb6..66e4cedacc9 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -48,6 +48,12 @@ struct split_info { int64_t split_pos; }; +struct cumulative_page_info { + size_t row_index; // row index + size_t size_bytes; // cumulative size in bytes + int key; // schema index +}; + // the minimum amount of memory we can safely expect to be enough to // do a subpass decode. if the difference between the user specified limit and // the actual memory used for compressed/temp data is > than this value, we will still use @@ -66,9 +72,9 @@ constexpr size_t minimum_subpass_expected_size = 200 * 1024 * 1024; constexpr float input_limit_compression_reserve = 0.3f; #if defined(CHUNKING_DEBUG) -void print_cumulative_page_info(device_span d_pages, - device_span d_chunks, - device_span d_c_info, +void print_cumulative_page_info(device_span d_pages, + device_span d_chunks, + device_span d_c_info, rmm::cuda_stream_view stream) { std::vector pages = cudf::detail::make_std_vector_sync(d_pages, stream); @@ -236,7 +242,7 @@ struct get_page_input_size { // we treat dictionary page sizes as 0 for subpasses because we have already paid the price for // them at the pass level. if (page.flags & PAGEINFO_FLAGS_DICTIONARY) { - return cumulative_page_info{0, 0, page.src_col_schema}; + return {0, 0, page.src_col_schema}; } return {0, static_cast(page.uncompressed_page_size), page.src_col_schema}; } @@ -439,12 +445,12 @@ std::pair get_row_group_size(RowGroup const& rg) * results. */ std::pair, rmm::device_uvector> -adjust_cumulative_sizes(rmm::device_uvector const& c_info, - cudf::detail::hostdevice_vector const& pages, +adjust_cumulative_sizes(device_span c_info, + device_span pages, rmm::cuda_stream_view stream) { // sort by row count - rmm::device_uvector c_info_sorted{c_info, stream}; + rmm::device_uvector c_info_sorted = make_device_uvector_async(c_info, stream, rmm::mr::get_current_device_resource()); thrust::sort(rmm::exec_policy_nosync(stream), c_info_sorted.begin(), c_info_sorted.end(), @@ -574,9 +580,9 @@ struct get_span_size { * */ std::tuple, size_t, size_t> compute_next_subpass( - rmm::device_uvector const& c_info, - cudf::detail::hostdevice_vector const& pages, - cudf::detail::hostdevice_vector const& page_offsets, + device_span c_info, + device_span pages, + device_span page_offsets, size_t start_row, size_t size_limit, size_t num_columns, @@ -621,8 +627,8 @@ std::tuple, size_t, size_t> compute_next_subpass( } std::vector compute_page_splits_by_row( - rmm::device_uvector const& c_info, - cudf::detail::hostdevice_vector const& pages, + device_span c_info, + device_span pages, size_t skip_rows, size_t num_rows, size_t size_limit, @@ -1032,7 +1038,7 @@ void include_decompression_scratch_size(device_span chunk // per-codec page counts and decompression sizes rmm::device_uvector decomp_info(pages.size(), stream); auto decomp_iter = thrust::make_transform_iterator(pages.begin(), get_decomp_info{chunks}); - thrust::inclusive_scan_by_key(rmm::exec_policy(stream), + thrust::inclusive_scan_by_key(rmm::exec_policy_nosync(stream), page_keys, page_keys + pages.size(), decomp_iter, @@ -1281,7 +1287,7 @@ void reader::impl::setup_next_subpass(bool uses_custom_row_bounds) include_decompression_scratch_size(pass.chunks, pass.pages, c_info, _stream); auto iter = thrust::make_counting_iterator(0); - thrust::for_each(rmm::exec_policy(_stream), + 
thrust::for_each(rmm::exec_policy_nosync(_stream), iter, iter + pass.pages.size(), set_row_index{pass.chunks, pass.pages, c_info}); @@ -1455,7 +1461,7 @@ void reader::impl::compute_input_passes() // generate passes. make sure to account for the case where a single row group doesn't fit within // - std::size_t const read_limit = + std::size_t const comp_read_limit = _input_pass_read_limit > 0 ? static_cast(static_cast(_input_pass_read_limit) * input_limit_compression_reserve) : std::numeric_limits::max(); @@ -1474,7 +1480,7 @@ void reader::impl::compute_input_passes() get_row_group_size(row_group); // can we add this row group - if (cur_pass_byte_size + compressed_rg_size >= read_limit) { + if (cur_pass_byte_size + compressed_rg_size >= comp_read_limit) { // A single row group (the current one) is larger than the read limit: // We always need to include at least one row group, so end the pass at the end of the current // row group @@ -1520,7 +1526,7 @@ void reader::impl::compute_output_chunks_for_subpass() auto page_input = thrust::make_transform_iterator(subpass.pages.d_begin(), get_page_output_size{}); auto page_keys = make_page_key_iterator(subpass.pages); - thrust::inclusive_scan_by_key(rmm::exec_policy(_stream), + thrust::inclusive_scan_by_key(rmm::exec_policy_nosync(_stream), page_keys, page_keys + subpass.pages.size(), page_input, @@ -1528,7 +1534,7 @@ void reader::impl::compute_output_chunks_for_subpass() thrust::equal_to{}, cumulative_page_sum{}); auto iter = thrust::make_counting_iterator(0); - thrust::for_each(rmm::exec_policy(_stream), + thrust::for_each(rmm::exec_policy_nosync(_stream), iter, iter + subpass.pages.size(), set_row_index{pass.chunks, subpass.pages, c_info}); diff --git a/cpp/src/io/parquet/reader_impl_chunking.hpp b/cpp/src/io/parquet/reader_impl_chunking.hpp index 8109159d38d..a9cf0e94ec8 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.hpp +++ b/cpp/src/io/parquet/reader_impl_chunking.hpp @@ -81,18 +81,11 @@ struct subpass_intermediate_data { std::vector output_chunk_read_info; std::size_t current_output_chunk{0}; - // skip_rows and num_rows values for this particular subpass. in absolute - // row indices. + // skip_rows and num_rows values for this particular subpass. in absolute row indices. size_t skip_rows; size_t num_rows; }; -struct cumulative_page_info { - size_t row_index; // row index - size_t size_bytes; // cumulative size in bytes - int key; // schema index -}; - /** * @brief Struct to store pass-level data that remains constant for a single pass. * From 6a5b17f3d846584a7da2d8d4651dc0d868ce9249 Mon Sep 17 00:00:00 2001 From: db Date: Wed, 24 Jan 2024 10:43:07 -0600 Subject: [PATCH 46/49] More PR feedback. Whole lot of consting going on. 
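
Const-correctness pass: read-only kernel parameters become device_span-of-const
and device functors get const call operators. The latter matters because
algorithms such as thrust::sort may invoke a comparator through a const
object, e.g. (minimal sketch using a simplified copy of the comparator renamed
in this change):

  struct cumulative_page_info { size_t row_index; size_t size_bytes; int key; };

  struct row_count_less {
    __device__ bool operator()(cumulative_page_info const& a,
                               cumulative_page_info const& b) const  // const-qualified
    {
      return a.row_index < b.row_index;
    }
  };

  // usage: sort cumulative page info by absolute row index
  thrust::sort(rmm::exec_policy_nosync(stream), c_info.begin(), c_info.end(), row_count_less{});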
--- cpp/src/io/parquet/reader_impl_chunking.cu | 30 +++++++++++----------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index 66e4cedacc9..ca0b2da6d42 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -411,7 +411,7 @@ template return static_cast(CompactProtocolReader::NumRequiredBits(max_level)); } -struct row_count_compare { +struct row_count_less { __device__ bool operator()(cumulative_page_info const& a, cumulative_page_info const& b) const { return a.row_index < b.row_index; @@ -454,7 +454,7 @@ adjust_cumulative_sizes(device_span c_info, thrust::sort(rmm::exec_policy_nosync(stream), c_info_sorted.begin(), c_info_sorted.end(), - row_count_compare{}); + row_count_less{}); // page keys grouped by split. rmm::device_uvector page_keys_by_split{c_info.size(), stream}; @@ -504,7 +504,7 @@ struct page_span { }; struct get_page_row_index { - device_span c_info; + device_span c_info; __device__ size_t operator()(size_t i) const { return c_info[i].row_index; } }; @@ -515,12 +515,12 @@ struct get_page_row_index { */ template struct get_page_span { - device_span page_offsets; + device_span page_offsets; RowIndexIter page_row_index; size_t const start_row; size_t const end_row; - get_page_span(device_span _page_offsets, + get_page_span(device_span _page_offsets, RowIndexIter _page_row_index, size_t _start_row, size_t _end_row) @@ -896,8 +896,8 @@ struct row_counts_different { * @param expected_row_count Expected row count, if applicable * @param stream CUDA stream used for device memory operations and kernel launches */ -void detect_malformed_pages(cudf::detail::hostdevice_vector const& pages, - cudf::detail::hostdevice_vector const& chunks, +void detect_malformed_pages(device_span pages, + device_span chunks, std::optional expected_row_count, rmm::cuda_stream_view stream) { @@ -905,7 +905,7 @@ void detect_malformed_pages(cudf::detail::hostdevice_vector const& pag rmm::device_uvector row_counts(pages.size(), stream); // worst case: num keys == num pages auto const size_iter = - thrust::make_transform_iterator(pages.d_begin(), flat_column_num_rows{chunks.device_ptr()}); + thrust::make_transform_iterator(pages.begin(), flat_column_num_rows{chunks.data()}); auto const row_counts_begin = row_counts.begin(); auto page_keys = make_page_key_iterator(pages); auto const row_counts_end = thrust::reduce_by_key(rmm::exec_policy(stream), @@ -1025,8 +1025,8 @@ struct get_decomp_scratch { * size information. * */ -void include_decompression_scratch_size(device_span chunks, - device_span pages, +void include_decompression_scratch_size(device_span chunks, + device_span pages, device_span c_info, rmm::cuda_stream_view stream) { @@ -1274,7 +1274,7 @@ void reader::impl::setup_next_subpass(bool uses_custom_row_bounds) rmm::device_uvector c_info(pass.pages.size(), _stream); auto page_keys = make_page_key_iterator(pass.pages); auto page_size = thrust::make_transform_iterator(pass.pages.d_begin(), get_page_input_size{}); - thrust::inclusive_scan_by_key(rmm::exec_policy(_stream), + thrust::inclusive_scan_by_key(rmm::exec_policy_nosync(_stream), page_keys, page_keys + pass.pages.size(), page_size, @@ -1408,9 +1408,9 @@ void reader::impl::create_global_chunk_info() // for lists, estimate the number of bytes per row. 
this is used by the subpass reader to // determine where to split the decompression boundaries float const list_bytes_per_row_est = - schema.max_repetition_level > 0 ? static_cast(col_meta.total_uncompressed_size) / - static_cast(row_group.num_rows) - : 0.0f; + schema.max_repetition_level > 0 && row_group.num_rows > 0 ? static_cast(col_meta.total_uncompressed_size) / + static_cast(row_group.num_rows) + : 0.0f; chunks.push_back(ColumnChunkDesc(col_meta.total_compressed_size, nullptr, @@ -1462,7 +1462,7 @@ void reader::impl::compute_input_passes() // generate passes. make sure to account for the case where a single row group doesn't fit within // std::size_t const comp_read_limit = - _input_pass_read_limit > 0 ? static_cast(static_cast(_input_pass_read_limit) * + _input_pass_read_limit > 0 ? static_cast(_input_pass_read_limit * input_limit_compression_reserve) : std::numeric_limits::max(); std::size_t cur_pass_byte_size = 0; From 0083639151724c32963f129a4c4e8b4713080e86 Mon Sep 17 00:00:00 2001 From: db Date: Wed, 24 Jan 2024 10:53:35 -0600 Subject: [PATCH 47/49] Formatting. --- cpp/src/io/parquet/reader_impl.cpp | 2 +- cpp/src/io/parquet/reader_impl_chunking.cu | 39 ++++++++++------------ 2 files changed, 19 insertions(+), 22 deletions(-) diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index f76c90faf82..24d46d91dbb 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -247,7 +247,7 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) if (error_code.value() != 0) { CUDF_FAIL("Parquet data decode failed with code(s) " + error_code.str()); } - // error_code.value() is synchronous; explictly sync here for better visibility + // error_code.value() is synchronous; explicitly sync here for better visibility _stream.synchronize(); // for list columns, add the final offset to every offset buffer. diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index ca0b2da6d42..555f531f7c0 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -241,9 +241,7 @@ struct get_page_input_size { { // we treat dictionary page sizes as 0 for subpasses because we have already paid the price for // them at the pass level. - if (page.flags & PAGEINFO_FLAGS_DICTIONARY) { - return {0, 0, page.src_col_schema}; - } + if (page.flags & PAGEINFO_FLAGS_DICTIONARY) { return {0, 0, page.src_col_schema}; } return {0, static_cast(page.uncompressed_page_size), page.src_col_schema}; } }; @@ -450,11 +448,10 @@ adjust_cumulative_sizes(device_span c_info, rmm::cuda_stream_view stream) { // sort by row count - rmm::device_uvector c_info_sorted = make_device_uvector_async(c_info, stream, rmm::mr::get_current_device_resource()); - thrust::sort(rmm::exec_policy_nosync(stream), - c_info_sorted.begin(), - c_info_sorted.end(), - row_count_less{}); + rmm::device_uvector c_info_sorted = + make_device_uvector_async(c_info, stream, rmm::mr::get_current_device_resource()); + thrust::sort( + rmm::exec_policy_nosync(stream), c_info_sorted.begin(), c_info_sorted.end(), row_count_less{}); // page keys grouped by split. 
rmm::device_uvector page_keys_by_split{c_info.size(), stream}; @@ -626,13 +623,12 @@ std::tuple, size_t, size_t> compute_next_subpass( return {h_page_bounds, total_pages, h_aggregated_info[end_index].size_bytes - cumulative_size}; } -std::vector compute_page_splits_by_row( - device_span c_info, - device_span pages, - size_t skip_rows, - size_t num_rows, - size_t size_limit, - rmm::cuda_stream_view stream) +std::vector compute_page_splits_by_row(device_span c_info, + device_span pages, + size_t skip_rows, + size_t num_rows, + size_t size_limit, + rmm::cuda_stream_view stream) { auto [aggregated_info, page_keys_by_split] = adjust_cumulative_sizes(c_info, pages, stream); @@ -1408,9 +1404,10 @@ void reader::impl::create_global_chunk_info() // for lists, estimate the number of bytes per row. this is used by the subpass reader to // determine where to split the decompression boundaries float const list_bytes_per_row_est = - schema.max_repetition_level > 0 && row_group.num_rows > 0 ? static_cast(col_meta.total_uncompressed_size) / - static_cast(row_group.num_rows) - : 0.0f; + schema.max_repetition_level > 0 && row_group.num_rows > 0 + ? static_cast(col_meta.total_uncompressed_size) / + static_cast(row_group.num_rows) + : 0.0f; chunks.push_back(ColumnChunkDesc(col_meta.total_compressed_size, nullptr, @@ -1462,9 +1459,9 @@ void reader::impl::compute_input_passes() // generate passes. make sure to account for the case where a single row group doesn't fit within // std::size_t const comp_read_limit = - _input_pass_read_limit > 0 ? static_cast(_input_pass_read_limit * - input_limit_compression_reserve) - : std::numeric_limits::max(); + _input_pass_read_limit > 0 + ? static_cast(_input_pass_read_limit * input_limit_compression_reserve) + : std::numeric_limits::max(); std::size_t cur_pass_byte_size = 0; std::size_t cur_rg_start = 0; std::size_t cur_row_count = 0; From b0002d77c2cf2cd8befdc6ca587cf70cd650dbfd Mon Sep 17 00:00:00 2001 From: db Date: Wed, 24 Jan 2024 16:43:34 -0600 Subject: [PATCH 48/49] Added a missing CUDF_KERNEL tag to gpuDecodePageHeaders. Misc review feedback changes. 
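
CUDF_KERNEL is libcudf's annotation macro for kernel definitions; at this
point in the tree it expands (to my understanding) to `static __global__`,
giving each kernel internal linkage so identically named kernels in different
translation units cannot collide. Usage, on a hypothetical kernel for
illustration only:

  CUDF_KERNEL void scale_kernel(int* data, int n)
  {
    int const i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) { data[i] *= 2; }
  }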
--- cpp/src/io/parquet/page_hdr.cu | 9 +++++---- cpp/src/io/parquet/reader_impl_chunking.cu | 8 ++++---- cpp/src/io/parquet/reader_impl_preprocess.cu | 12 ++++-------- 3 files changed, 13 insertions(+), 16 deletions(-) diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu index e617ce82f19..888d9452612 100644 --- a/cpp/src/io/parquet/page_hdr.cu +++ b/cpp/src/io/parquet/page_hdr.cu @@ -348,10 +348,11 @@ struct gpuParsePageHeader { * @param[in] num_chunks Number of column chunks */ // blockDim {128,1,1} -__global__ void __launch_bounds__(128) gpuDecodePageHeaders(ColumnChunkDesc* chunks, - chunk_page_info* chunk_pages, - int32_t num_chunks, - kernel_error::pointer error_code) +CUDF_KERNEL +void __launch_bounds__(128) gpuDecodePageHeaders(ColumnChunkDesc* chunks, + chunk_page_info* chunk_pages, + int32_t num_chunks, + kernel_error::pointer error_code) { using cudf::detail::warp_size; gpuParsePageHeader parse_page_header; diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index 555f531f7c0..366bc29226e 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -619,8 +619,7 @@ std::tuple, size_t, size_t> compute_next_subpass( size_t const total_pages = thrust::reduce(rmm::exec_policy(stream), page_count_iter, page_count_iter + num_columns); - auto h_page_bounds = cudf::detail::make_std_vector_sync(page_bounds, stream); - return {h_page_bounds, total_pages, h_aggregated_info[end_index].size_bytes - cumulative_size}; + return {cudf::detail::make_std_vector_sync(page_bounds, stream), total_pages, h_aggregated_info[end_index].size_bytes - cumulative_size}; } std::vector compute_page_splits_by_row(device_span c_info, @@ -749,7 +748,7 @@ std::vector compute_page_splits_by_row(device_span comp_res(num_comp_pages, stream); - thrust::fill(rmm::exec_policy(stream), + thrust::fill(rmm::exec_policy_nosync(stream), comp_res.begin(), comp_res.end(), compression_result{0, compression_status::FAILURE}); @@ -1056,12 +1055,13 @@ void include_decompression_scratch_size(device_span chunk rmm::device_uvector d_temp_cost = cudf::detail::make_device_uvector_async( temp_cost, stream, rmm::mr::get_current_device_resource()); auto iter = thrust::make_counting_iterator(size_t{0}); - thrust::for_each(rmm::exec_policy(stream), + thrust::for_each(rmm::exec_policy_nosync(stream), iter, iter + pages.size(), [temp_cost = d_temp_cost.begin(), c_info = c_info.begin()] __device__(size_t i) { c_info[i].size_bytes += temp_cost[i]; }); + stream.synchronize(); } } // anonymous namespace diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index b60c7b4b4dc..ee3b1c466e0 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -595,8 +595,8 @@ void reader::impl::build_string_dict_indices() // compute number of indices per chunk and a summed total rmm::device_uvector str_dict_index_count(pass.chunks.size() + 1, _stream); thrust::fill( - rmm::exec_policy(_stream), str_dict_index_count.begin(), str_dict_index_count.end(), 0); - thrust::for_each(rmm::exec_policy(_stream), + rmm::exec_policy_nosync(_stream), str_dict_index_count.begin(), str_dict_index_count.end(), 0); + thrust::for_each(rmm::exec_policy_nosync(_stream), pass.pages.begin(), pass.pages.end(), set_str_dict_index_count{str_dict_index_count, pass.chunks}); @@ -607,7 +607,7 @@ void reader::impl::build_string_dict_indices() // convert to offsets 
rmm::device_uvector& str_dict_index_offsets = str_dict_index_count; - thrust::exclusive_scan(rmm::exec_policy(_stream), + thrust::exclusive_scan(rmm::exec_policy_nosync(_stream), str_dict_index_offsets.begin(), str_dict_index_offsets.end(), str_dict_index_offsets.begin(), @@ -619,7 +619,7 @@ void reader::impl::build_string_dict_indices() auto iter = thrust::make_counting_iterator(0); thrust::for_each( - rmm::exec_policy(_stream), + rmm::exec_policy_nosync(_stream), iter, iter + pass.chunks.size(), set_str_dict_index_ptr{pass.str_dict_index.data(), str_dict_index_offsets, pass.chunks}); @@ -812,7 +812,6 @@ std::pair>> reader::impl::read_column_chunks for (auto const& rg : row_groups_info) { auto const& row_group = _metadata->get_row_group(rg.index, rg.source_index); auto const row_group_source = rg.source_index; - // auto const row_group_rows = std::min(remaining_rows, row_group.num_rows); // generate ColumnChunkDesc objects for everything to be decoded (all input columns) for (size_t i = 0; i < num_input_columns; ++i) { @@ -834,7 +833,6 @@ std::pair>> reader::impl::read_column_chunks chunk_count++; } - // remaining_rows -= row_group_rows; } // Read compressed chunk data to device memory @@ -847,8 +845,6 @@ std::pair>> reader::impl::read_column_chunks chunk_source_map, _stream)); - // CUDF_EXPECTS(remaining_rows == 0, "All rows data must be read."); - return {total_decompressed_size > 0, std::move(read_chunk_tasks)}; } From a48c8e8a0e2c70b7a309cdd78251c6d04da62cef Mon Sep 17 00:00:00 2001 From: db Date: Wed, 24 Jan 2024 16:47:02 -0600 Subject: [PATCH 49/49] Formatting. --- cpp/src/io/parquet/reader_impl_chunking.cu | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index 366bc29226e..1bfe5745b9e 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -619,7 +619,9 @@ std::tuple, size_t, size_t> compute_next_subpass( size_t const total_pages = thrust::reduce(rmm::exec_policy(stream), page_count_iter, page_count_iter + num_columns); - return {cudf::detail::make_std_vector_sync(page_bounds, stream), total_pages, h_aggregated_info[end_index].size_bytes - cumulative_size}; + return {cudf::detail::make_std_vector_sync(page_bounds, stream), + total_pages, + h_aggregated_info[end_index].size_bytes - cumulative_size}; } std::vector compute_page_splits_by_row(device_span c_info,