rapidsai · shrshi · Jan 9, 2025 · Jan 9, 2025 · Jan 10, 2025 · Jan 10, 2025
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -389,10 +389,58 @@ table_with_metadata read_json_impl(host_span<std::unique_ptr<datasource>> source
 
   std::vector<cudf::io::table_with_metadata> partial_tables;
   json_reader_options batched_reader_opts{reader_opts};
+
+  // Recursive helper that builds a schema_element tree from a set of columns and their
+  // name metadata. Here, we assume that the table from the first batch contains all the
+  // columns in the concatenated table, and that the partial tables from all following
+  // batches contain the same set of columns.
+  //
+  // Parameters:
+  //   children       - columns found at the current nesting level
+  //   children_props - name info for `children`; must have the same length
+  //   schema         - schema node for the current level; its `column_order` and
+  //                    `child_types` are filled in place
+  // Returns a copy of the populated `schema`.
+  // Fails (via CUDF_EXPECTS) if `children` and `children_props` differ in size.
+  std::function<schema_element(cudf::host_span<column const> cols,
+                               cudf::host_span<column_name_info const> names,
+                               schema_element & schema)>
+    construct_schema;
+  schema_element schema{data_type{cudf::type_id::STRUCT}};
+  construct_schema = [&construct_schema](cudf::host_span<column const> children,
+                                         cudf::host_span<column_name_info const> children_props,
+                                         schema_element& schema) -> schema_element {
+    CUDF_EXPECTS(children.size() == children_props.size(),
+                 "Mismatch between number of child columns and number of child column names");
+
+    // The node type does not change while its children are visited, so test it once.
+    bool const is_list = schema.type == data_type{cudf::type_id::LIST};
+
+    std::vector<std::string> col_order;
+    col_order.reserve(children.size());
+    for (std::size_t i = 0; i < children.size(); i++) {
+      auto const& child_name = children_props[i].name;
+      // A LIST node's child named "offsets" is left out of both the column order and the
+      // child type map (presumably because it is not a user-visible JSON field).
+      if (is_list && child_name == "offsets") continue;
+      col_order.push_back(child_name);
+
+      schema_element child_schema{children[i].type()};
+      // NOTE(review): emplace_back copy-constructs each cudf::column here; this presumably
+      // deep-copies the column data only to inspect types -- consider switching the helper
+      // to column_view together with its call site.
+      std::vector<column> grandchildren_cols;
+      for (size_type j = 0; j < children[i].num_children(); j++)
+        grandchildren_cols.emplace_back(children[i].child(j));
+      schema.child_types[child_name] =
+        construct_schema(grandchildren_cols, children_props[i].children, child_schema);
+    }
+    schema.column_order = std::move(col_order);
+
+    return schema;
+  };
   // Dispatch individual batches to read_batch and push the resulting table into
   // partial_tables array. Note that the reader options need to be updated for each
   // batch to adjust byte range offset and byte range size.
-  for (std::size_t i = 0; i < batch_offsets.size() - 1; i++) {
+  batched_reader_opts.set_byte_range_offset(batch_offsets[0]);
+  batched_reader_opts.set_byte_range_size(batch_offsets[1] - batch_offsets[0]);
+  partial_tables.emplace_back(
+    read_batch(sources, batched_reader_opts, stream, cudf::get_current_device_resource_ref()));
+
+  auto& tbl = partial_tables.back().tbl;
+  std::vector<column> children;
+  for (size_type j = 0; j < tbl->num_columns(); j++)
+    children.emplace_back(tbl->get_column(j));
+  batched_reader_opts.set_dtypes(
+    construct_schema(children, partial_tables.back().metadata.schema_info, schema));
+  batched_reader_opts.enable_prune_columns(true);
+
+  for (std::size_t i = 1; i < batch_offsets.size() - 1; i++) {
     batched_reader_opts.set_byte_range_offset(batch_offsets[i]);
     batched_reader_opts.set_byte_range_size(batch_offsets[i + 1] - batch_offsets[i]);
     partial_tables.emplace_back(

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -131,3 +131,72 @@ TEST_P(JsonLargeReaderTest, MultiBatch)
   // go back to normal batch_size
   unsetenv("LIBCUDF_JSON_BATCH_SIZE");
 }
+
+// Reads JSON lines input in which some records of the later chunks omit keys that every
+// record of the first chunk provides, and checks only that the read does not throw.
+TEST_P(JsonLargeReaderTest, MultiBatchWithNulls)
+{
+  cudf::io::compression_type const comptype = GetParam();
+
+  // First chunk: every record carries "a", "b" and "c".
+  std::string json_string_b1 = R"(
+    { "a": { "y" : 6}, "b" : [1, 2, 3], "c": 11 }
+    { "a": { "y" : 6}, "b" : [4, 5   ], "c": 12 }
+    { "a": { "y" : 6}, "b" : [6      ], "c": 13 }
+    { "a": { "y" : 6}, "b" : [7      ], "c": 14 })";
+  // Second chunk: the first record omits "b".
+  std::string json_string_b2 = R"(
+    { "a": { "y" : 6}, "c": 11 }
+    { "a": { "y" : 6}, "b" : [4, 5   ], "c": 12 }
+    { "a": { "y" : 6}, "b" : [6      ], "c": 13 }
+    { "a": { "y" : 6}, "b" : [7      ], "c": 14 })";
+  // Third chunk: the first record omits both "b" and "c".
+  std::string json_string_b3 = R"(
+    { "a": { "y" : 6}}
+    { "a": { "y" : 6}, "b" : [4, 5   ], "c": 12 }
+    { "a": { "y" : 6}, "b" : [6      ], "c": 13 }
+    { "a": { "y" : 6}, "b" : [7      ], "c": 14 })";
+
+  // Use the size of the first chunk as the batch size (presumably so that the
+  // concatenated input is split across several batches -- confirm against the reader).
+  std::size_t const batch_size_upper_bound = json_string_b1.size();
+  // set smaller batch_size to reduce file size and execution time
+  setenv("LIBCUDF_JSON_BATCH_SIZE", std::to_string(batch_size_upper_bound).c_str(), 1);
+
+  auto json_string = json_string_b1 + json_string_b2 + json_string_b3;
+
+  // Bytes handed to the reader: compressed when the test parameter requests a
+  // compression type, otherwise a plain copy of the JSON text.
+  std::vector<std::uint8_t> cdata;
+  if (comptype != cudf::io::compression_type::NONE) {
+    cdata = cudf::io::detail::compress(
+      comptype,
+      cudf::host_span<uint8_t const>(reinterpret_cast<uint8_t const*>(json_string.data()),
+                                     json_string.size()),
+      cudf::get_default_stream());
+  } else {
+    auto const* json_bytes = reinterpret_cast<uint8_t const*>(json_string.data());
+    cdata                  = std::vector<uint8_t>(json_bytes, json_bytes + json_string.size());
+  }
+
+  // The same buffer is supplied as two separate sources.
+  constexpr int num_sources = 2;
+  std::vector<cudf::host_span<std::byte>> chostbufs(
+    num_sources,
+    cudf::host_span<std::byte>(reinterpret_cast<std::byte*>(cdata.data()), cdata.size()));
+
+  // Initialize parsing options (reading json lines)
+  cudf::io::json_reader_options cjson_lines_options =
+    cudf::io::json_reader_options::builder(
+      cudf::io::source_info{
+        cudf::host_span<cudf::host_span<std::byte>>(chostbufs.data(), chostbufs.size())})
+      .lines(true)
+      .compression(comptype)
+      .recovery_mode(cudf::io::json_recovery_mode_t::FAIL);
+
+  // The read must complete without throwing; the resulting table is not inspected.
+  CUDF_EXPECT_NO_THROW(cudf::io::read_json(cjson_lines_options));
+
+  // go back to normal batch_size
+  unsetenv("LIBCUDF_JSON_BATCH_SIZE");
+}