Today I will share how to build a deep neural network from scratch using NumPy, a task commonly asked about in machine learning interviews. The implementation covers:

  • Initialization
  • Activation Function
  • Single Layer Forward and Full Layer Forward
  • Cost Function and Metric Evaluation
  • Single Layer Backward and Full Layer Backward
  • Training and Test

Code Implementation

import numpy as np
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

# Initialization
def init_layers(nn_structure, seed=42):
    np.random.seed(seed)
    param_values = {}
    for i, layer in enumerate(nn_structure):
        layer_idx = i+1
        layer_input_size = layer["input_dim"]
        layer_output_size = layer["output_dim"]

        param_values["W_" + str(layer_idx)] = np.random.randn(layer_output_size, layer_input_size)*0.1
        param_values["b_" + str(layer_idx)] = np.random.randn(layer_output_size, 1)*0.1
    
    return param_values
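
# Quick shape check (illustrative only; the toy one-layer structure below is
# hypothetical and separate from the network trained later): a layer mapping
# 2 inputs to 3 outputs should get W_1 of shape (3, 2) and b_1 of shape (3, 1).
_demo_params = init_layers([{"input_dim": 2, "output_dim": 3, "activation": "relu"}])
assert _demo_params["W_1"].shape == (3, 2)
assert _demo_params["b_1"].shape == (3, 1)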

# Activation Function
def sigmoid(Z):
    return 1/(1+np.exp(-Z))

def relu(Z):
    return np.maximum(0,Z)

def sigmoid_backward(dA, Z):
    sig = sigmoid(Z)
    return dA * sig * (1 - sig)

def relu_backward(dA, Z):
    dZ = np.array(dA, copy=True)
    dZ[Z <= 0] = 0
    return dZ
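
# Worked example: relu_backward passes gradients through only where Z > 0.
# With Z = [[-1., 2.]] and upstream gradient dA = [[5., 5.]], the result is [[0., 5.]].
_Z_relu = np.array([[-1., 2.]])
_dA_relu = np.array([[5., 5.]])
assert np.array_equal(relu_backward(_dA_relu, _Z_relu), np.array([[0., 5.]]))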

# Single Layer Forward and Full Layer Forward
def single_layer_forward(A_prev, W, b, activation):
    Z = np.dot(W, A_prev) + b
    if activation == "sigmoid":
        A = sigmoid(Z)
    elif activation == "relu":
        A = relu(Z)
    else:
        raise ValueError("Unsupported activation function: " + activation)
    return A, Z
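
# Shape check with hypothetical sizes: W of shape (3, 2), b of shape (3, 1) and a
# batch A_prev of shape (2, 4) (4 samples) yield A and Z of shape (3, 4).
_A_chk, _Z_chk = single_layer_forward(np.zeros((2, 4)), np.zeros((3, 2)), np.zeros((3, 1)), "relu")
assert _A_chk.shape == (3, 4) and _Z_chk.shape == (3, 4)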

def full_layer_forward(X, param_values, nn_structure):
    cache = {}
    A = X
    for i, layer in enumerate(nn_structure):
        layer_idx = i+1
        A_prev = A
        activation = layer["activation"]
        W = param_values["W_" + str(layer_idx)]
        b = param_values["b_" + str(layer_idx)]
        A, Z = single_layer_forward(A_prev, W, b, activation)
        cache["A_"+ str(i)] = A_prev
        cache["Z_" + str(layer_idx)] = Z
    
    return A, cache
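
# End-to-end shape check on a hypothetical two-layer structure: data is passed in
# as (input_dim, m), so a batch of 4 samples produces an output of shape (1, 4),
# and the cache stores each layer's input A_{i-1} and pre-activation Z_i.
_toy_structure = [
    {"input_dim": 2, "output_dim": 3, "activation": "relu"},
    {"input_dim": 3, "output_dim": 1, "activation": "sigmoid"},
]
_toy_params = init_layers(_toy_structure)
_toy_out, _toy_cache = full_layer_forward(np.random.randn(2, 4), _toy_params, _toy_structure)
assert _toy_out.shape == (1, 4)
assert set(_toy_cache) == {"A_0", "Z_1", "A_1", "Z_2"}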

# Cost Function and Metric Evaluation
def cost_func(Y_hat, Y):
    m = Y_hat.shape[1]
    cost = -(np.dot(Y, np.log(Y_hat).T) + np.dot(1-Y, np.log(1-Y_hat).T))/m
    return np.squeeze(cost)

def acc_func(Y_hat, Y, threshold=0.5):
    probs = np.copy(Y_hat)
    probs[probs > threshold] = 1
    probs[probs <= threshold] = 0
    return (probs == Y).all(axis=0).mean()
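
# Worked example: for Y_hat = [[0.9, 0.2]] and Y = [[1, 0]], the binary cross-entropy
# is -(log(0.9) + log(0.8)) / 2, roughly 0.1643, and both predictions are correct.
_Y_hat_chk = np.array([[0.9, 0.2]])
_Y_chk = np.array([[1, 0]])
assert np.isclose(cost_func(_Y_hat_chk, _Y_chk), 0.1643, atol=1e-3)
assert acc_func(_Y_hat_chk, _Y_chk) == 1.0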

# Single Layer Backward and Full Layer Backward
def single_layer_backward(dA, W, b, Z, A_prev, activation):
    m = A_prev.shape[1]
    if activation == "sigmoid":
        dZ = sigmoid_backward(dA, Z)
    elif activation == "relu":
        dZ = relu_backward(dA, Z)
    else:
        raise ValueError("Unsupported activation function: " + activation)
    dW = np.dot(dZ, A_prev.T)/m
    db = np.sum(dZ, axis=1, keepdims=True)/m
    dA_prev = np.dot(W.T, dZ)

    return dA_prev, dW, db

def full_layer_backward(Y_hat, Y, cache, param_values, nn_structure):
    grads_values = {}
    Y = Y.reshape(Y_hat.shape)
    dA_prev = - (np.divide(Y, Y_hat) - np.divide(1 - Y, 1 - Y_hat))

    for layer_idx_prev in range(len(nn_structure)-1, -1, -1):
        layer_idx = layer_idx_prev+1
        layer = nn_structure[layer_idx_prev]
        activation = layer["activation"]
        dA = dA_prev
        A_prev = cache["A_" + str(layer_idx_prev)]
        Z = cache["Z_" + str(layer_idx)]
        W = param_values["W_" + str(layer_idx)]
        b = param_values["b_" + str(layer_idx)]
        dA_prev, dW, db = single_layer_backward(dA, W, b, Z, A_prev, activation)
        grads_values["dW_" + str(layer_idx)] = dW
        grads_values["db_" + str(layer_idx)] = db
    
    return grads_values
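
# Optional gradient check (a minimal sketch, not used by the training loop below):
# compare one backprop gradient entry against a central-difference estimate of
# d(cost)/dW_1[0, 0]; the two values should agree closely.
def numerical_grad_check(X, Y, nn_structure, eps=1e-6):
    params = init_layers(nn_structure)
    Y_hat, cache = full_layer_forward(X, params, nn_structure)
    analytic = full_layer_backward(Y_hat, Y, cache, params, nn_structure)["dW_1"][0, 0]
    params["W_1"][0, 0] += eps
    cost_plus = cost_func(full_layer_forward(X, params, nn_structure)[0], Y)
    params["W_1"][0, 0] -= 2 * eps
    cost_minus = cost_func(full_layer_forward(X, params, nn_structure)[0], Y)
    params["W_1"][0, 0] += eps  # restore the original weight
    numeric = (cost_plus - cost_minus) / (2 * eps)
    return analytic, numeric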

# Training and Test
def update(param_values, grads_values, nn_structure, learning_rate):
    for i in range(1, len(nn_structure) + 1):  # layers are indexed 1..L, so include the last layer
        param_values["W_" + str(i)] -= learning_rate*grads_values["dW_" + str(i)]
        param_values["b_" + str(i)] -= learning_rate*grads_values["db_" + str(i)]
    
    return param_values

def train(X, Y, nn_structure, epochs, learning_rate):
    param_values = init_layers(nn_structure)
    cost_history = []
    acc_history = []
    
    for i in range(epochs):
        Y_hat, cache = full_layer_forward(X, param_values, nn_structure)
        cost = cost_func(Y_hat, Y)
        acc = acc_func(Y_hat, Y)
        if i % 1000 == 0:
            print("epoch %d - cost: %f, accuracy: %f" % (i, cost, acc))
        cost_history.append(cost)
        acc_history.append(acc)
        grads_values = full_layer_backward(Y_hat, Y, cache, param_values, nn_structure)
        param_values = update(param_values, grads_values, nn_structure, learning_rate)

    return param_values

# number of samples in the data set
N_SAMPLES = 1000
# ratio between training and test sets
TEST_SIZE = 0.2
# NN structure
NN_STRUCTURE = [
    {"input_dim": 2, "output_dim": 25, "activation": "relu"},
    {"input_dim": 25, "output_dim": 50, "activation": "relu"},
    {"input_dim": 50, "output_dim": 50, "activation": "relu"},
    {"input_dim": 50, "output_dim": 25, "activation": "relu"},
    {"input_dim": 25, "output_dim": 1, "activation": "sigmoid"},
]

X, y = make_moons(n_samples=N_SAMPLES, noise=0.2, random_state=100)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=42)

# Training
params_values = train(np.transpose(X_train), np.transpose(y_train.reshape((y_train.shape[0], 1))), NN_STRUCTURE, 10000, 0.01)

# Prediction 
Y_test_hat, _ = full_layer_forward(np.transpose(X_test), params_values, NN_STRUCTURE)

# Accuracy achieved on the test set
acc_test = acc_func(Y_test_hat, np.transpose(y_test.reshape((y_test.shape[0], 1))))
print("Test set accuracy: {:.2f}".format(acc_test))
