Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add the implementation for the model selection example #1112

Merged
merged 1 commit into from
Oct 24, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,10 @@ LIST(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Thirdparty)
#string(REGEX REPLACE "^[0-9]+\\.[0-9]+\\.([0-9]+).*" "\\1" VERSION_PATCH "${VERSION}")


SET(PACKAGE_VERSION 4.0.0) # ${VERSION})
SET(VERSION 4.0.0)
SET(PACKAGE_VERSION 4.1.0) # ${VERSION})
SET(VERSION 4.1.0)
SET(SINGA_MAJOR_VERSION 4)
SET(SINGA_MINOR_VERSION 0)
SET(SINGA_MINOR_VERSION 1)
SET(SINGA_PATCH_VERSION 0)
#SET(SINGA_MAJOR_VERSION ${VERSION_MAJOR}) # 0 -
#SET(SINGA_MINOR_VERSION ${VERSION_MINOR}) # 0 - 9
Expand Down
56 changes: 56 additions & 0 deletions LICENSE
Original file line number Diff line number Diff line change
Expand Up @@ -559,3 +559,59 @@ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

===============================================================================
SINGA bundles the following under MIT License:
examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/pg_extension/*

MIT License

Portions Copyright 2019-2021 ZomboDB, LLC.
Portions Copyright 2021-2023 Technology Concepts & Design, Inc.
Portions Copyright 2023 PgCentral Foundation, Inc.

All rights reserved.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

===============================================================================
SINGA bundles the following under The PostgreSQL License:
examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/pg_extension/*

The PostgreSQL License

Portions Copyright (c) 1996-2023, The PostgreSQL Global Development Group

Portions Copyright (c) 1994, The Regents of the University of California

Permission to use, copy, modify, and distribute this software and its documentation for any
purpose, without fee, and without a written agreement is hereby granted, provided that the above
copyright notice and this paragraph and the following two paragraphs appear in all copies.

IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT,
SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING
OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF
THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING,
BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS,
AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT,
UPDATES, ENHANCEMENTS, OR MODIFICATIONS.

Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# Image for the TRAILS database-native model selection example.
# It provides Python, the Rust toolchain with cargo-pgrx (to build and run the
# PostgreSQL extension), the Python requirements of the model selection code,
# and a PostgreSQL client.
FROM ubuntu:20.04

# Keep apt from prompting during the image build
ENV DEBIAN_FRONTEND=noninteractive

# Install Python, Vim, and necessary libraries
RUN apt-get update && \
    apt-get install -y software-properties-common wget gnupg2 lsb-release git && \
    add-apt-repository ppa:deadsnakes/ppa && \
    apt-get install -y python3.6 python3-pip vim && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Install necessary dependencies for PostgreSQL and Rust
RUN apt-get update && \
    apt-get install -y pkg-config libssl-dev libpq-dev libclang-dev curl && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Install necessary dependencies for pgrx
RUN apt-get update && \
    apt-get install -y bison flex libreadline-dev && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Create the postgres user and give it ownership of /project
USER root
RUN adduser --disabled-password --gecos "" postgres && \
    mkdir /project && \
    adduser postgres sudo && \
    chown -R postgres:postgres /project

# Switch to the postgres user, install Rust, and initialise cargo-pgrx
USER postgres
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y && \
    echo 'source $HOME/.cargo/env' >> $HOME/.bashrc && \
    /bin/bash -c "source $HOME/.cargo/env && cargo install cargo-pgrx --version '0.9.7' --locked" && \
    /bin/bash -c "source $HOME/.cargo/env && cargo pgrx init"

# Set environment variables for Rust and Python.
# rustup ran as the postgres user above, so the toolchain lives under that
# user's home directory rather than /root.
ENV PATH="/home/postgres/.cargo/bin:${PATH}"
ENV PYTHONPATH="${PYTHONPATH}:/project/TRAILS/internal/ml/model_selection"

# Install the Python dependencies of the model selection code
WORKDIR /project
COPY ./internal/ml/model_selection/requirement.txt ./requirement.txt
RUN pip install -r requirement.txt

# Install the prebuilt SINGA wheel (CPython 3.8, manylinux2014 x86_64)
RUN pip install https://www.comp.nus.edu.sg/~zhaojing/files/singa-3.1.0-cp38-cp38-manylinux2014_x86_64.whl

# appendix: PostgreSQL client tools (needs root for apt)
USER root
RUN apt-get update && apt-get install -y \
    postgresql-client && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Drop back to the unprivileged user for the running container
USER postgres

# Keep the container alive so it can be entered with `docker exec`
CMD ["tail", "-f", "/dev/null"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
# Database-Native Model Selection

​ -- based on Singa



![image-20231020174425377](documents/image-20231020174425377.png)

## Build Docker Image

```bash
git clone https://github.com/apache/singa.git
cd singa/examples/model_selection/TRAILS-Database-Native-Model-Selection/
docker build -t trails-singa .
```

## Run Docker Image
Download `exp_data.zip` from https://www.dropbox.com/scl/fi/xz4teosklwmfc5j4x2ug6/exp_data.zip?rlkey=5fk2ttib0zt49suyppcjhsrn2&dl=0
and unzip it so that the `exp_data/` folder sits in a directory of your choice (referred to below as `path_to_exp_data_folder`).
```bash
docker run -d --name trails-singa \
--network="host" \
-v path_to_exp_data_folder:/project/exp_data \
trails-singa
```

## Start PostgreSQL Instance

```bash
# 1. Run docker container
docker exec -it trails-singa bash
# 2. Clone the code
cd ~
git clone https://github.com/apache/singa.git
cd singa/examples/model_selection/TRAILS-Database-Native-Model-Selection/
# 3. Export PYTHONPATH
export PYTHONPATH=$PYTHONPATH:./internal/ml/model_selection
# 4. Start the RDBMS and then exit
cd internal/pg_extension
cargo pgrx run
exit
cd ../..
# 5. Load data into RDBMS
bash internal/ml/model_selection/scripts/database/load_data_to_db.sh /project/exp_data/data/structure_data/frappe frappe
# 6. Run database server
cd internal/pg_extension
cargo pgrx run

```


## Register Stored Procedure

```sql
-- model_selection_sp: samples a random batch of rows from a table and passes
-- it, serialised as JSON text, to the filtering_phase UDF. The value returned
-- by the UDF is reported through RAISE NOTICE; the procedure returns nothing.
CREATE OR REPLACE
PROCEDURE model_selection_sp(
dataset TEXT, -- name of the table to read rows from
selected_columns TEXT[], -- columns to select from that table
N INTEGER, -- number of models to evaluate
batch_size INTEGER, -- number of rows sampled for the batch (used as LIMIT)
config_file TEXT -- config file path
)
LANGUAGE plpgsql
AS $$
DECLARE
-- global inputs/outputs
result_status TEXT;
column_list TEXT;
BEGIN
-- combine the columns into a string
column_list := array_to_string(selected_columns, ', ');

-- 4. Run filtering phase to get top K models.
-- format() arguments, in order: column_list (%s), dataset (%I, quoted as an
-- identifier), batch_size (%s), N (%s), the literal 1 (%s, third argument of
-- filtering_phase) and config_file (%L, quoted as a literal).
-- NOTE(review): column_list is interpolated with %s, i.e. without identifier
-- quoting, so selected_columns must come from a trusted caller.
EXECUTE format('
WITH batch_rows AS (
SELECT %s
FROM %I
ORDER BY RANDOM()
LIMIT %s OFFSET 0
)
SELECT filtering_phase(
json_agg(row_to_json(t))::text, %s, %s, %L
)
FROM batch_rows AS t', column_list, dataset, batch_size, N, 1, config_file) INTO result_status;
RAISE NOTICE '4. run filtering phase, k models = %', result_status;

END; $$;
```

## Compile the UDF

```sql
-- Try to compile the UDF (run inside the psql session started by `cargo pgrx run`)
DROP EXTENSION IF EXISTS pg_extension;
CREATE EXTENSION pg_extension;
```

If the above fails, open another terminal and enter the container with `docker exec -it trails-singa bash`.
Then run the following:
```bash
rm /home/postgres/.pgrx/14.9/pgrx-install/share/extension/pg_extension--0.1.0.sql
vi /home/postgres/.pgrx/14.9/pgrx-install/share/extension/pg_extension--0.1.0.sql
# Copy the following to the /home/postgres/.pgrx/14.9/pgrx-install/share/extension/pg_extension--0.1.0.sql
-- src/lib.rs:66
-- pg_extension::filtering_phase
CREATE FUNCTION "filtering_phase"(
"mini_batch" TEXT, /* alloc::string::String */
"n" INT, /* i32 */
"k" INT, /* i32 */
"config_file" TEXT /* alloc::string::String */
) RETURNS TEXT /* alloc::string::String */
IMMUTABLE STRICT PARALLEL SAFE
LANGUAGE c /* Rust */
AS 'MODULE_PATHNAME', 'filtering_phase_wrapper';
```

Go back to the first terminal and run the following in the database session again:
```sql
-- Try to compile the UDF
DROP EXTENSION IF EXISTS pg_extension;
CREATE EXTENSION pg_extension;
```

## Run Model Selection

```sql
-- Template for calling 'model_selection_sp' stored procedure
CALL model_selection_sp(
<TABLE_NAME>, -- The name of the table or dataset from which data should be retrieved.
<COLUMN_NAMES_ARRAY>, -- An array of column names to be considered in the model selection process.
<PARAMETER_1>, -- Number of models to explore
<PARAMETER_2>, -- Batch size
<CONFIG_FILE_PATH> -- The file path to a configuration file needed for the process.
);


-- For example
CALL model_selection_sp(
'frappe_train',
ARRAY['col1', 'col2', 'col3', 'label'],
10,
32,
'/home/postgres/singa/examples/model_selection/TRAILS-Database-Native-Model-Selection/internal/ml/model_selection/config.ini');
```

## Example Result

![image-20231020174945226](documents/image-20231020174945226.png)
Loading
Loading