Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle first row blanks, parse booleans correctly, "" in varchars should be NULL #48

Merged
merged 5 commits into from
Dec 30, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
set(TARGET_NAME gsheets)

# Find OpenSSL package
find_package(OpenSSL REQUIRED)
find_package(OpenSSL REQUIRED CONFIG)

set(EXTENSION_NAME ${TARGET_NAME}_extension)
set(LOADABLE_EXTENSION_NAME ${TARGET_NAME}_loadable_extension)
Expand Down
2 changes: 1 addition & 1 deletion docs/pages/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ This token will periodically expire - you can re-run the above command again to

## Limitations / Known Issues

- Google Sheets has a limit of 1,000,000 cells per spreadsheet.
- Google Sheets has a limit of 10,000,000 cells per spreadsheet.
- Reading sheets where data does not start in A1 is not yet supported.
- Writing data to a sheet starting from a cell other than A1 is not yet supported.
- Sheets must already exist to COPY TO them.
Expand Down
86 changes: 52 additions & 34 deletions src/gsheets_read.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ void ReadSheetFunction(ClientContext &context, TableFunctionInput &data_p, DataC
if (bind_data.finished) {
return;
}

json cleanJson = parseJson(bind_data.response);
SheetData sheet_data = getSheetData(cleanJson);

Expand All @@ -48,26 +48,12 @@ void ReadSheetFunction(ClientContext &context, TableFunctionInput &data_p, DataC
// Adjust starting index based on whether we're using the header
idx_t start_index = bind_data.header ? bind_data.row_index + 1 : bind_data.row_index;

// Determine column types
vector<LogicalType> column_types(column_count, LogicalType::VARCHAR);
if (start_index < sheet_data.values.size()) {
const auto& first_data_row = sheet_data.values[start_index];
for (idx_t col = 0; col < column_count && col < first_data_row.size(); col++) {
const string& value = first_data_row[col];
if (value == "true" || value == "false") {
column_types[col] = LogicalType::BOOLEAN;
} else if (IsValidNumber(value)) {
column_types[col] = LogicalType::DOUBLE;
}
}
}

for (idx_t i = start_index; i < sheet_data.values.size() && row_count < STANDARD_VECTOR_SIZE; i++) {
const auto& row = sheet_data.values[i];
for (idx_t col = 0; col < column_count; col++) {
if (col < row.size()) {
const string& value = row[col];
switch (column_types[col].id()) {
switch (bind_data.return_types[col].id()) {
case LogicalTypeId::BOOLEAN:
if (value.empty()) {
output.SetValue(col, row_count, Value(LogicalType::BOOLEAN));
Expand All @@ -83,7 +69,12 @@ void ReadSheetFunction(ClientContext &context, TableFunctionInput &data_p, DataC
}
break;
default:
output.SetValue(col, row_count, Value(value));
// Empty strings should be converted to NULL
if (value.empty()) {
output.SetValue(col, row_count, Value(LogicalType::VARCHAR));
} else {
output.SetValue(col, row_count, Value(value));
}
break;
}
} else {
Expand Down Expand Up @@ -160,25 +151,52 @@ unique_ptr<FunctionData> ReadSheetBind(ClientContext &context, TableFunctionBind
json cleanJson = parseJson(bind_data->response);
SheetData sheet_data = getSheetData(cleanJson);

if (!sheet_data.values.empty()) {
idx_t start_index = header ? 1 : 0;
if (start_index < sheet_data.values.size()) {
const auto& first_data_row = sheet_data.values[start_index];
for (size_t i = 0; i < first_data_row.size(); i++) {
string column_name = header ? sheet_data.values[0][i] : "column" + std::to_string(i + 1);
names.push_back(column_name);

const string& value = first_data_row[i];
if (value == "true" || value == "false") {
return_types.push_back(LogicalType::BOOLEAN);
} else if (IsValidNumber(value)) {
return_types.push_back(LogicalType::DOUBLE);
} else {
return_types.push_back(LogicalType::VARCHAR);
}
}
// Preferring early return style to reduce nesting
if (sheet_data.values.empty()) {
return bind_data;
}
idx_t start_index = header ? 1 : 0;
if (start_index >= sheet_data.values.size()) {
return bind_data;
}

const auto& first_data_row = sheet_data.values[start_index];
// If we have a header, we want the width of the result to be the max of:
// the width of the header row
// or the width of the first row of data
int result_width = first_data_row.size();
if (header) {
int header_width = sheet_data.values[0].size();
if (header_width > result_width) {
result_width = header_width;
}
}

for (size_t i = 0; i < result_width; i++) {
// Assign default column_name, but rename to header value if using a header and header cell exists
string column_name = "column" + std::to_string(i + 1);
if (header && (i < sheet_data.values[0].size())) {
column_name = sheet_data.values[0][i];
}
names.push_back(column_name);

// If the first row has blanks, assume varchar for now
if (i >= first_data_row.size()) {
return_types.push_back(LogicalType::VARCHAR);
continue;
}
const string& value = first_data_row[i];
if (value == "TRUE" || value == "FALSE") {
return_types.push_back(LogicalType::BOOLEAN);
} else if (IsValidNumber(value)) {
return_types.push_back(LogicalType::DOUBLE);
} else {
return_types.push_back(LogicalType::VARCHAR);
}
}

bind_data->names = names;
bind_data->return_types = return_types;

return bind_data;
}
Expand Down
2 changes: 2 additions & 0 deletions src/include/gsheets_read.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ struct ReadSheetBindData : public TableFunctionData {
string response;
bool header;
string sheet_name;
vector<LogicalType> return_types;
vector<string> names;

ReadSheetBindData(string spreadsheet_id, string token, bool header, string sheet_name);
};
Expand Down
22 changes: 22 additions & 0 deletions test/sql/read_gsheet.test
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,28 @@ FROM read_gsheet('https://docs.google.com/spreadsheets/d/11QdEasMWbETbFVxry-SsD8
3.0 value3 blabla3
NULL value4 blabla4

# Issue 47: Blanks in the first row should not prevent all columns from returning
query IIII
FROM read_gsheet('https://docs.google.com/spreadsheets/d/11QdEasMWbETbFVxry-SsD8jVcdYIT1zBQszcF84MdE8/edit?gid=1746330494#gid=1746330494');
----
woot blah NULL NULL
more wooting more blah NULL should get this!

# Issue 47: Read despite missing cells (and test booleans and doubles)
query IIIIIII
FROM read_gsheet('https://docs.google.com/spreadsheets/d/11QdEasMWbETbFVxry-SsD8jVcdYIT1zBQszcF84MdE8/edit?gid=1961167280#gid=1961167280');
----
woot blah NULL NULL true 123.0 should get this!
more wooting more blah should handle blank to the right NULL NULL NULL NULL
more wooting more blah NULL NULL false 456.789 should get this!

# Issue 47: Read despite missing column headers
query IIII
FROM read_gsheet('https://docs.google.com/spreadsheets/d/11QdEasMWbETbFVxry-SsD8jVcdYIT1zBQszcF84MdE8/edit?gid=1108445818#gid=1108445818');
----
woot blah NULL should get this!
more wooting more blah NULL should get this!

# Drop the secret
statement ok
drop secret test_secret;
Loading