Written by MMA
Function to create batches of data
This is a basic function that splits a data set into mini-batches.
import math
import numpy as np

def miniBatch(x, y, batchSize):
    """Split the arrays x and y into mini-batches of batchSize rows each."""
    numObs = x.shape[0]
    batches = []
    batchNum = math.floor(numObs / batchSize)
    # Full batches of exactly batchSize observations
    for i in range(batchNum):
        xBatch = x[i * batchSize:(i + 1) * batchSize, :]
        yBatch = y[i * batchSize:(i + 1) * batchSize, :]
        batches.append((xBatch, yBatch))
    # If numObs is not divisible by batchSize, keep the leftover
    # observations as a final, smaller batch instead of dropping them
    if numObs % batchSize != 0:
        xBatch = x[batchNum * batchSize:, :]
        yBatch = y[batchNum * batchSize:, :]
        batches.append((xBatch, yBatch))
    return batches
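As a quick sanity check on a small made-up array (10 observations, batch size 4), we should get two full batches of 4 rows and one leftover batch of 2 rows:

xSmall = np.arange(20).reshape(10, 2)  # 10 observations, 2 features
ySmall = np.arange(10).reshape(10, 1)  # matching labels as a column vector
for xb, yb in miniBatch(xSmall, ySmall, batchSize=4):
    print(xb.shape, yb.shape)
# (4, 2) (4, 1)
# (4, 2) (4, 1)
# (2, 2) (2, 1)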
Here $x$ and $y$ should be NumPy arrays, with observations along the first axis. Let’s create an example dataset and see how the function works:
from sklearn.datasets import make_classification
from sklearn.preprocessing import OneHotEncoder

# Define dummy training data: N examples, 32 features, 8 output classes
N = 1000
n_features = 32
n_classes = 8

# Create a simulated feature matrix and integer output vector with N samples
X_train, output = make_classification(n_samples=N,
                                      n_features=n_features,
                                      n_classes=n_classes,
                                      n_informative=4,
                                      n_redundant=2,
                                      # 'weights' sets the proportion of samples
                                      # per class; None makes balanced classes
                                      weights=None)

# One-hot encode the integer labels so y_train has one column per class
# (in scikit-learn >= 1.2, use sparse_output=False instead of sparse=False)
onehot_encoder = OneHotEncoder(sparse=False)
integer_encoded = output.reshape(len(output), 1)
y_train = onehot_encoder.fit_transform(integer_encoded)
X_train.shape
#(1000, 32)
y_train.shape
#(1000, 8)
X_train
# array([[ 0.45396437, 1.51656331, 1.3629891 , ..., 0.14717609,
# -0.15119727, 1.01995008],
# [-0.81950039, 0.26249003, -0.51083559, ..., 0.36629063,
# 1.13654148, -0.19591568],
# [ 1.31671553, -1.30269225, -0.56231267, ..., 0.40714641,
# -1.05379444, 2.03267767],
# ...,
# [-0.31350026, 0.65172311, -1.21891585, ..., 0.64381309,
# 0.18544175, 1.74250682],
# [-0.84452574, 0.12441828, -1.73681735, ..., 0.50854846,
# -0.04233258, 0.51991385],
# [-0.33021028, -0.32264822, -0.27926319, ..., -1.01789405,
# -0.55645379, 2.15772776]])
y_train
# array([[0., 0., 0., ..., 0., 0., 0.],
# [0., 1., 0., ..., 0., 0., 0.],
# [0., 0., 1., ..., 0., 0., 0.],
# ...,
# [0., 0., 0., ..., 0., 0., 0.],
# [0., 1., 0., ..., 0., 0., 0.],
# [0., 1., 0., ..., 0., 0., 0.]])
Note that when the number of observations in the dataset is not divisible by the batch size, the last batch is kept even though it has fewer than batchSize elements, rather than being dropped. With $N = 1000$ and a batch size of 64, we therefore expect $\lfloor 1000 / 64 \rfloor = 15$ full batches plus a final batch of $1000 - 15 \times 64 = 40$ observations, i.e. 16 batches in total.
miniBatches = miniBatch(X_train, y_train, batchSize=64)
total_batch = len(miniBatches)
# total_batch is 16: 15 full batches of 64 plus one final batch of 40

# Loop over all batches
for batch in miniBatches:
    xBatch = batch[0]
    yBatch = batch[1]
    print(xBatch.shape)
# (64, 32)
# (64, 32)
# (64, 32)
# (64, 32)
# (64, 32)
# (64, 32)
# (64, 32)
# (64, 32)
# (64, 32)
# (64, 32)
# (64, 32)
# (64, 32)
# (64, 32)
# (64, 32)
# (64, 32)
# (40, 32)
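During training it is common to reshuffle the rows before forming the batches at the start of each epoch, so that every epoch sees different batch compositions. Here is a minimal sketch of such a wrapper around miniBatch, assuming NumPy >= 1.17 for np.random.default_rng (the name shuffledMiniBatch and the seed argument are our own additions, not part of the function above):

def shuffledMiniBatch(x, y, batchSize, seed=None):
    # Permute the row indices, then reuse miniBatch on the shuffled arrays
    rng = np.random.default_rng(seed)
    perm = rng.permutation(x.shape[0])
    return miniBatch(x[perm, :], y[perm, :], batchSize)

# One epoch over freshly shuffled batches
for xBatch, yBatch in shuffledMiniBatch(X_train, y_train, batchSize=64, seed=0):
    pass  # run one optimization step on (xBatch, yBatch) here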