Today I will share how to build a deep neural network from scratch using NumPy, a task commonly asked about in machine learning interviews. The implementation covers:

  • Initialization
  • Activation Function
  • Single Layer Forward and Full Layer Forward
  • Cost Function and Metric Evaluation
  • Single Layer Backward and Full Layer Backward
  • Training and Test

Code Implementation

import numpy as np
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

# Initialization
def init_layers(nn_structure, seed=42):
    np.random.seed(seed)
    param_values = {}
    for i, layer in enumerate(nn_structure):
        layer_idx = i+1
        layer_input_size = layer["input_dim"]
        layer_output_size = layer["output_dim"]

        param_values["W_" + str(layer_idx)] = np.random.randn(layer_output_size, layer_input_size)*0.1
        param_values["b_" + str(layer_idx)] = np.random.randn(layer_output_size, 1)*0.1
    
    return param_values
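
# Quick shape check (illustrative only; the toy one-layer structure below is
# hypothetical and separate from the network trained later): a layer mapping
# 2 inputs to 3 outputs should get W_1 of shape (3, 2) and b_1 of shape (3, 1).
_demo_params = init_layers([{"input_dim": 2, "output_dim": 3, "activation": "relu"}])
assert _demo_params["W_1"].shape == (3, 2)
assert _demo_params["b_1"].shape == (3, 1)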

# Activation Function
def sigmoid(Z):
    return 1/(1+np.exp(-Z))

def relu(Z):
    return np.maximum(0,Z)

def sigmoid_backward(dA, Z):
    sig = sigmoid(Z)
    return dA * sig * (1 - sig)

def relu_backward(dA, Z):
    dZ = np.array(dA, copy=True)
    dZ[Z <= 0] = 0
    return dZ
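
# Worked example: relu_backward passes gradients through only where Z > 0.
# With Z = [[-1., 2.]] and upstream gradient dA = [[5., 5.]], the result is [[0., 5.]].
_Z_relu = np.array([[-1., 2.]])
_dA_relu = np.array([[5., 5.]])
assert np.array_equal(relu_backward(_dA_relu, _Z_relu), np.array([[0., 5.]]))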

# Single Layer Forward and Full Layer Forward
def single_layer_forward(A_prev, W, b, activation):
    Z = np.dot(W, A_prev) + b
    if activation == "sigmoid":
        A = sigmoid(Z)
    elif activation == "relu":
        A = relu(Z)
    else:
        raise ValueError("Unsupported activation function: " + activation)
    return A, Z
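
# Shape check with hypothetical sizes: W of shape (3, 2), b of shape (3, 1) and a
# batch A_prev of shape (2, 4) (4 samples) yield A and Z of shape (3, 4).
_A_chk, _Z_chk = single_layer_forward(np.zeros((2, 4)), np.zeros((3, 2)), np.zeros((3, 1)), "relu")
assert _A_chk.shape == (3, 4) and _Z_chk.shape == (3, 4)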

def full_layer_forward(X, param_values, nn_structure):
    cache = {}
    A = X
    for i, layer in enumerate(nn_structure):
        layer_idx = i+1
        A_prev = A
        activation = layer["activation"]
        W = param_values["W_" + str(layer_idx)]
        b = param_values["b_" + str(layer_idx)]
        A, Z = single_layer_forward(A_prev, W, b, activation)
        cache["A_"+ str(i)] = A_prev
        cache["Z_" + str(layer_idx)] = Z
    
    return A, cache
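
# End-to-end shape check on a hypothetical two-layer structure: data is passed in
# as (input_dim, m), so a batch of 4 samples produces an output of shape (1, 4),
# and the cache stores each layer's input A_{i-1} and pre-activation Z_i.
_toy_structure = [
    {"input_dim": 2, "output_dim": 3, "activation": "relu"},
    {"input_dim": 3, "output_dim": 1, "activation": "sigmoid"},
]
_toy_params = init_layers(_toy_structure)
_toy_out, _toy_cache = full_layer_forward(np.random.randn(2, 4), _toy_params, _toy_structure)
assert _toy_out.shape == (1, 4)
assert set(_toy_cache) == {"A_0", "Z_1", "A_1", "Z_2"}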

# Cost Function and Metric Evaluation
def cost_func(Y_hat, Y):
    m = Y_hat.shape[1]
    cost = -(np.dot(Y, np.log(Y_hat).T) + np.dot(1-Y, np.log(1-Y_hat).T))/m
    return np.squeeze(cost)

def acc_func(Y_hat, Y, threshold=0.5):
    probs = np.copy(Y_hat)
    probs[probs > threshold] = 1
    probs[probs <= threshold] = 0
    return (probs == Y).all(axis=0).mean()
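
# Worked example: for Y_hat = [[0.9, 0.2]] and Y = [[1, 0]], the binary cross-entropy
# is -(log(0.9) + log(0.8)) / 2, roughly 0.1643, and both predictions are correct.
_Y_hat_chk = np.array([[0.9, 0.2]])
_Y_chk = np.array([[1, 0]])
assert np.isclose(cost_func(_Y_hat_chk, _Y_chk), 0.1643, atol=1e-3)
assert acc_func(_Y_hat_chk, _Y_chk) == 1.0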

# Single Layer Backward and Full Layer Backward
def single_layer_backward(dA, W, b, Z, A_prev, activation):
    m = A_prev.shape[1]
    if activation == "sigmoid":
        dZ = sigmoid_backward(dA, Z)
    elif activation == "relu":
        dZ = relu_backward(dA, Z)
    else:
        raise ValueError("Unsupported activation function: " + activation)
    dW = np.dot(dZ, A_prev.T)/m
    db = np.sum(dZ, axis=1, keepdims=True)/m
    dA_prev = np.dot(W.T, dZ)

    return dA_prev, dW, db

def full_layer_backward(Y_hat, Y, cache, param_values, nn_structure):
    grads_values = {}
    Y = Y.reshape(Y_hat.shape)
    dA_prev = - (np.divide(Y, Y_hat) - np.divide(1 - Y, 1 - Y_hat))

    for layer_idx_prev in range(len(nn_structure)-1, -1, -1):
        layer_idx = layer_idx_prev+1
        layer = nn_structure[layer_idx_prev]
        activation = layer["activation"]
        dA = dA_prev
        A_prev = cache["A_" + str(layer_idx_prev)]
        Z = cache["Z_" + str(layer_idx)]
        W = param_values["W_" + str(layer_idx)]
        b = param_values["b_" + str(layer_idx)]
        dA_prev, dW, db = single_layer_backward(dA, W, b, Z, A_prev, activation)
        grads_values["dW_" + str(layer_idx)] = dW
        grads_values["db_" + str(layer_idx)] = db
    
    return grads_values
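
# Optional gradient check (a minimal sketch, not used by the training loop below):
# compare one backprop gradient entry against a central-difference estimate of
# d(cost)/dW_1[0, 0]; the two values should agree closely.
def numerical_grad_check(X, Y, nn_structure, eps=1e-6):
    params = init_layers(nn_structure)
    Y_hat, cache = full_layer_forward(X, params, nn_structure)
    analytic = full_layer_backward(Y_hat, Y, cache, params, nn_structure)["dW_1"][0, 0]
    params["W_1"][0, 0] += eps
    cost_plus = cost_func(full_layer_forward(X, params, nn_structure)[0], Y)
    params["W_1"][0, 0] -= 2 * eps
    cost_minus = cost_func(full_layer_forward(X, params, nn_structure)[0], Y)
    params["W_1"][0, 0] += eps  # restore the original weight
    numeric = (cost_plus - cost_minus) / (2 * eps)
    return analytic, numeric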

# Training and Test
def update(param_values, grads_values, nn_structure, learning_rate):
    for i in range(1, len(nn_structure) + 1):  # layers are indexed 1..L, so include the last layer
        param_values["W_" + str(i)] -= learning_rate*grads_values["dW_" + str(i)]
        param_values["b_" + str(i)] -= learning_rate*grads_values["db_" + str(i)]
    
    return param_values

def train(X, Y, nn_structure, epochs, learning_rate):
    param_values = init_layers(nn_structure)
    cost_history = []
    acc_history = []
    
    for i in range(epochs):
        Y_hat, cache = full_layer_forward(X, param_values, nn_structure)
        cost = cost_func(Y_hat, Y)
        acc = acc_func(Y_hat, Y)
        if i % 1000 == 0:
            print("epoch %d - cost: %f, accuracy: %f" % (i, cost, acc))
        cost_history.append(cost)
        acc_history.append(acc)
        grads_values = full_layer_backward(Y_hat, Y, cache, param_values, nn_structure)
        param_values = update(param_values, grads_values, nn_structure, learning_rate)

    return param_values

# number of samples in the data set
N_SAMPLES = 1000
# ratio between training and test sets
TEST_SIZE = 0.2
# NN structure
NN_STRUCTURE = [
    {"input_dim": 2, "output_dim": 25, "activation": "relu"},
    {"input_dim": 25, "output_dim": 50, "activation": "relu"},
    {"input_dim": 50, "output_dim": 50, "activation": "relu"},
    {"input_dim": 50, "output_dim": 25, "activation": "relu"},
    {"input_dim": 25, "output_dim": 1, "activation": "sigmoid"},
]

X, y = make_moons(n_samples=N_SAMPLES, noise=0.2, random_state=100)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=42)

# Training
params_values = train(np.transpose(X_train), np.transpose(y_train.reshape((y_train.shape[0], 1))), NN_STRUCTURE, 10000, 0.01)

# Prediction 
Y_test_hat, _ = full_layer_forward(np.transpose(X_test), params_values, NN_STRUCTURE)

# Accuracy achieved on the test set
acc_test = acc_func(Y_test_hat, np.transpose(y_test.reshape((y_test.shape[0], 1))))
print("Test set accuracy: {:.2f}".format(acc_test))
