From ec213583c0dbc8d6cb48b1c1efce40ce27c2ce98 Mon Sep 17 00:00:00 2001
From: lemonviv <lemonwyc@gmail.com>
Date: Tue, 19 Dec 2023 19:17:54 +0800
Subject: [PATCH] Add implementation for model and data processing for the hfl
 example

---
 examples/hfl/bank.py |  78 +++++++++++++++++++++++++
 examples/hfl/mlp.py  | 134 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 212 insertions(+)
 create mode 100644 examples/hfl/bank.py
 create mode 100644 examples/hfl/mlp.py

diff --git a/examples/hfl/bank.py b/examples/hfl/bank.py
new file mode 100644
index 000000000..b7131723f
--- /dev/null
+++ b/examples/hfl/bank.py
@@ -0,0 +1,78 @@
+# https://github.com/zhengzangw/Fed-SINGA/blob/main/src/client/data/bank.py
+
+import pandas as pd
+import numpy as np
+import sys
+from pandas.api.types import is_numeric_dtype
+from sklearn.model_selection import train_test_split
+from sklearn.utils import shuffle
+
+
+def encode(df):
+    """Return df with every non-numeric column expanded into one-hot columns."""
+    parts = [
+        df[col] if is_numeric_dtype(df[col]) else pd.get_dummies(df[col], prefix=col)
+        for col in df.columns.values
+    ]
+    if not parts:
+        return pd.DataFrame()
+    return pd.concat(parts, axis=1)  # one concat instead of re-copying per column
+
+
+def load(device_id):
+    """Read the per-device train/test CSVs and return scaled feature/label arrays.
+
+    Returns (train_x, train_y, val_x, val_y, num_classes). Features are cast to
+    float32 and passed through normalize(); labels come from the 'y' column as int32.
+    """
+    suffix = str(device_id) + ".csv"
+    train = pd.read_csv("data/bank_train_" + suffix, sep=',')
+    test = pd.read_csv("data/bank_test_" + suffix, sep=',')
+
+    # Features are every column except the label column 'y'.
+    train_x = np.array(train.drop(['y'], axis=1), dtype=np.float32)
+    train_y = np.array(train['y'], dtype=np.int32)
+    val_x = np.array(test.drop(['y'], axis=1), dtype=np.float32)
+    val_y = np.array(test['y'], dtype=np.int32)
+
+    # normalize() fits its scaler on the training features only.
+    train_x, val_x = normalize(train_x, val_x)
+
+    num_classes = 2
+    return train_x, train_y, val_x, val_y, num_classes
+
+
+def normalize(X_train, X_test):
+    """Min-max scale both arrays with a scaler fitted on X_train only."""
+    from sklearn.preprocessing import MinMaxScaler
+
+    fitted = MinMaxScaler().fit(X_train)
+    return fitted.transform(X_train), fitted.transform(X_test)
+
+
+def split(num):
+    """Encode the bank dataset, split it 80/20, and write num equal-sized client CSVs to data/."""
+    import os
+    if num < 1:
+        raise ValueError("num must be a positive integer, got %r" % (num,))
+    df = pd.read_csv("../data/bank-additional-full.csv", sep=';')
+    df['y'] = (df['y'] == 'yes').astype(int)
+    train, test = train_test_split(shuffle(encode(df)), test_size=0.2)
+
+    os.makedirs("data", exist_ok=True)  # to_csv does not create missing directories
+    train.to_csv("data/bank_train_.csv", index=False)
+    test.to_csv("data/bank_test_.csv", index=False)
+    train_per_client = len(train) // num  # floor: leftover rows go to no client file
+    test_per_client = len(test) // num
+    print("train_per_client:", train_per_client)
+    print("test_per_client:", test_per_client)
+    for i in range(num):
+        sub_train = train[i * train_per_client:(i + 1) * train_per_client]
+        sub_test = test[i * test_per_client:(i + 1) * test_per_client]
+        sub_train.to_csv("data/bank_train_" + str(i) + ".csv", index=False)
+        sub_test.to_csv("data/bank_test_" + str(i) + ".csv", index=False)
+
+
+if __name__ == "__main__":
+    split(int(sys.argv[1]))  # argv[1]: number of per-client train/test files to write
+
diff --git a/examples/hfl/mlp.py b/examples/hfl/mlp.py
new file mode 100644
index 000000000..294db6c60
--- /dev/null
+++ b/examples/hfl/mlp.py
@@ -0,0 +1,134 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+import argparse
+
+import numpy as np
+from singa import device, layer, model, opt, tensor
+
+np_dtype = {"float16": np.float16, "float32": np.float32}
+# Same precision names mapped to numpy dtypes (above) and singa tensor dtypes (below).
+singa_dtype = {"float16": tensor.float16, "float32": tensor.float32}
+
+
+class MLP(model.Model):
+    """Linear -> ReLU -> Linear classifier trained with softmax cross-entropy."""
+    def __init__(self, data_size=10, perceptron_size=100, num_classes=10):
+        super(MLP, self).__init__()
+        self.num_classes = num_classes
+        self.dimension = 2
+
+        self.relu = layer.ReLU()
+        self.linear1 = layer.Linear(perceptron_size)
+        self.linear2 = layer.Linear(num_classes)
+        self.softmax_cross_entropy = layer.SoftMaxCrossEntropy()
+
+    def forward(self, inputs):
+        """Return the output of linear2 applied to relu(linear1(inputs))."""
+        hidden = self.relu(self.linear1(inputs))
+        return self.linear2(hidden)
+
+    def train_one_batch(self, x, y, dist_option, spars):
+        """Run forward and loss on (x, y), then update per dist_option; return (out, loss)."""
+        out = self.forward(x)
+        loss = self.softmax_cross_entropy(out, y)
+        # A dist_option matching none of the names below triggers no optimizer call.
+        if dist_option == "plain":
+            self.optimizer(loss)
+        elif dist_option == "half":
+            self.optimizer.backward_and_update_half(loss)
+        elif dist_option == "partialUpdate":
+            self.optimizer.backward_and_partial_update(loss)
+        elif dist_option in ("sparseTopK", "sparseThreshold"):
+            top_k = dist_option == "sparseTopK"
+            self.optimizer.backward_and_sparse_update(loss, topK=top_k, spars=spars)
+        return out, loss
+
+    def set_optimizer(self, optimizer):
+        self.optimizer = optimizer
+
+
+def create_model(pretrained=False, **kwargs):
+    """Construct an MLP model.
+
+    Args:
+        pretrained (bool): Accepted but not used; no pre-trained weights are loaded.
+        **kwargs: Forwarded unchanged to the MLP constructor.
+
+    Returns:
+        The newly created MLP instance.
+    """
+    return MLP(**kwargs)
+
+
+__all__ = ["MLP", "create_model"]
+
+if __name__ == "__main__":
+    np.random.seed(0)
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-p", choices=["float32", "float16"], default="float32", dest="precision")
+    parser.add_argument(
+        "-g",
+        "--disable-graph",
+        default=True,
+        action="store_false",
+        help="disable graph",
+        dest="graph",
+    )
+    parser.add_argument(
+        "-m", "--max-epoch", default=1001, type=int, help="maximum epochs", dest="max_epoch"
+    )
+    args = parser.parse_args()
+
+    # generate the boundary
+    f = lambda x: (5 * x + 1)
+    bd_x = np.linspace(-1.0, 1, 200)
+    bd_y = f(bd_x)
+
+    # generate the training data
+    x = np.random.uniform(-1, 1, 400)
+    y = f(x) + 2 * np.random.randn(len(x))
+
+    # choose one precision
+    precision = singa_dtype[args.precision]
+    np_precision = np_dtype[args.precision]
+
+    # convert training data to 2d space; label is 1 where the point lies below the boundary
+    label = (f(x) > y).astype(np.int32)
+    data = np.stack([x, y], axis=1).astype(np_precision)
+
+    dev = device.create_cuda_gpu_on(0)
+    sgd = opt.SGD(0.1, 0.9, 1e-5, dtype=precision)
+    tx = tensor.Tensor(data.shape, dev, precision)
+    ty = tensor.Tensor(label.shape, dev, tensor.int32)
+    net = MLP(data_size=2, perceptron_size=3, num_classes=2)  # avoid rebinding singa's `model`
+
+    # attach model to graph
+    net.set_optimizer(sgd)
+    net.compile([tx], is_train=True, use_graph=args.graph, sequential=True)
+    net.train()
+
+    for i in range(args.max_epoch):
+        tx.copy_from_numpy(data)
+        ty.copy_from_numpy(label)
+        # "plain" selects the self.optimizer(loss) branch of train_one_batch
+        out, loss = net(tx, ty, "plain", spars=None)
+        if i % 100 == 0:
+            print("training loss = ", tensor.to_numpy(loss)[0])