@@ -0,0 +1,262 @@
from __future__ import print_function | |||||
from builtins import range | |||||
from six.moves import cPickle as pickle | |||||
import numpy as np | |||||
import os | |||||
from imageio import imread | |||||
import platform | |||||
def load_pickle(f): | |||||
version = platform.python_version_tuple() | |||||
if version[0] == '2': | |||||
return pickle.load(f) | |||||
elif version[0] == '3': | |||||
return pickle.load(f, encoding='latin1') | |||||
raise ValueError("invalid python version: {}".format(version)) | |||||
def load_CIFAR_batch(filename): | |||||
""" load single batch of cifar """ | |||||
with open(filename, 'rb') as f: | |||||
datadict = load_pickle(f) | |||||
X = datadict['data'] | |||||
Y = datadict['labels'] | |||||
X = X.reshape(10000, 3, 32, 32).transpose(0,2,3,1).astype("float") | |||||
Y = np.array(Y) | |||||
return X, Y | |||||
def load_CIFAR10(ROOT): | |||||
""" load all of cifar """ | |||||
xs = [] | |||||
ys = [] | |||||
for b in range(1,6): | |||||
f = os.path.join(ROOT, 'data_batch_%d' % (b, )) | |||||
X, Y = load_CIFAR_batch(f) | |||||
xs.append(X) | |||||
ys.append(Y) | |||||
Xtr = np.concatenate(xs) | |||||
Ytr = np.concatenate(ys) | |||||
del X, Y | |||||
Xte, Yte = load_CIFAR_batch(os.path.join(ROOT, 'test_batch')) | |||||
return Xtr, Ytr, Xte, Yte | |||||
def get_CIFAR10_data(num_training=49000, num_validation=1000, num_test=1000, | |||||
subtract_mean=True): | |||||
""" | |||||
Load the CIFAR-10 dataset from disk and perform preprocessing to prepare | |||||
it for classifiers. These are the same steps as we used for the SVM, but | |||||
condensed to a single function. | |||||
""" | |||||
# Load the raw CIFAR-10 data | |||||
cifar10_dir = 'daseCV/datasets/cifar-10-batches-py' | |||||
X_train, y_train, X_test, y_test = load_CIFAR10(cifar10_dir) | |||||
# Subsample the data | |||||
mask = list(range(num_training, num_training + num_validation)) | |||||
X_val = X_train[mask] | |||||
y_val = y_train[mask] | |||||
mask = list(range(num_training)) | |||||
X_train = X_train[mask] | |||||
y_train = y_train[mask] | |||||
mask = list(range(num_test)) | |||||
X_test = X_test[mask] | |||||
y_test = y_test[mask] | |||||
# Normalize the data: subtract the mean image | |||||
if subtract_mean: | |||||
mean_image = np.mean(X_train, axis=0) | |||||
X_train -= mean_image | |||||
X_val -= mean_image | |||||
X_test -= mean_image | |||||
# Transpose so that channels come first | |||||
X_train = X_train.transpose(0, 3, 1, 2).copy() | |||||
X_val = X_val.transpose(0, 3, 1, 2).copy() | |||||
X_test = X_test.transpose(0, 3, 1, 2).copy() | |||||
# Package data into a dictionary | |||||
return { | |||||
'X_train': X_train, 'y_train': y_train, | |||||
'X_val': X_val, 'y_val': y_val, | |||||
'X_test': X_test, 'y_test': y_test, | |||||
} | |||||
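# Usage sketch (illustrative, not part of the original pipeline): assuming the
# CIFAR-10 batches have been downloaded to daseCV/datasets/cifar-10-batches-py,
# this prints the shapes of the preprocessed splits returned above.
def _demo_get_CIFAR10_data():
    data = get_CIFAR10_data()
    for k, v in sorted(data.items()):
        # Expected with the defaults: X_train (49000, 3, 32, 32), y_train (49000,),
        # X_val (1000, 3, 32, 32), X_test (1000, 3, 32, 32), etc.
        print('%s: %s' % (k, v.shape))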
def load_tiny_imagenet(path, dtype=np.float32, subtract_mean=True): | |||||
""" | |||||
Load TinyImageNet. Each of TinyImageNet-100-A, TinyImageNet-100-B, and | |||||
TinyImageNet-200 have the same directory structure, so this can be used | |||||
to load any of them. | |||||
Inputs: | |||||
- path: String giving path to the directory to load. | |||||
- dtype: numpy datatype used to load the data. | |||||
- subtract_mean: Whether to subtract the mean training image. | |||||
Returns: A dictionary with the following entries: | |||||
- class_names: A list where class_names[i] is a list of strings giving the | |||||
WordNet names for class i in the loaded dataset. | |||||
- X_train: (N_tr, 3, 64, 64) array of training images | |||||
- y_train: (N_tr,) array of training labels | |||||
- X_val: (N_val, 3, 64, 64) array of validation images | |||||
- y_val: (N_val,) array of validation labels | |||||
- X_test: (N_test, 3, 64, 64) array of testing images. | |||||
- y_test: (N_test,) array of test labels; if test labels are not available | |||||
(such as in student code) then y_test will be None. | |||||
- mean_image: (3, 64, 64) array giving mean training image | |||||
""" | |||||
# First load wnids | |||||
with open(os.path.join(path, 'wnids.txt'), 'r') as f: | |||||
wnids = [x.strip() for x in f] | |||||
# Map wnids to integer labels | |||||
wnid_to_label = {wnid: i for i, wnid in enumerate(wnids)} | |||||
# Use words.txt to get names for each class | |||||
with open(os.path.join(path, 'words.txt'), 'r') as f: | |||||
wnid_to_words = dict(line.split('\t') for line in f) | |||||
for wnid, words in wnid_to_words.items(): | |||||
wnid_to_words[wnid] = [w.strip() for w in words.split(',')] | |||||
class_names = [wnid_to_words[wnid] for wnid in wnids] | |||||
# Next load training data. | |||||
X_train = [] | |||||
y_train = [] | |||||
for i, wnid in enumerate(wnids): | |||||
if (i + 1) % 20 == 0: | |||||
print('loading training data for synset %d / %d' | |||||
% (i + 1, len(wnids))) | |||||
# To figure out the filenames we need to open the boxes file | |||||
boxes_file = os.path.join(path, 'train', wnid, '%s_boxes.txt' % wnid) | |||||
with open(boxes_file, 'r') as f: | |||||
filenames = [x.split('\t')[0] for x in f] | |||||
num_images = len(filenames) | |||||
X_train_block = np.zeros((num_images, 3, 64, 64), dtype=dtype) | |||||
y_train_block = wnid_to_label[wnid] * \ | |||||
np.ones(num_images, dtype=np.int64) | |||||
for j, img_file in enumerate(filenames): | |||||
img_file = os.path.join(path, 'train', wnid, 'images', img_file) | |||||
img = imread(img_file) | |||||
if img.ndim == 2: | |||||
## grayscale file | |||||
img.shape = (64, 64, 1) | |||||
X_train_block[j] = img.transpose(2, 0, 1) | |||||
X_train.append(X_train_block) | |||||
y_train.append(y_train_block) | |||||
# We need to concatenate all training data | |||||
X_train = np.concatenate(X_train, axis=0) | |||||
y_train = np.concatenate(y_train, axis=0) | |||||
# Next load validation data | |||||
with open(os.path.join(path, 'val', 'val_annotations.txt'), 'r') as f: | |||||
img_files = [] | |||||
val_wnids = [] | |||||
for line in f: | |||||
img_file, wnid = line.split('\t')[:2] | |||||
img_files.append(img_file) | |||||
val_wnids.append(wnid) | |||||
num_val = len(img_files) | |||||
y_val = np.array([wnid_to_label[wnid] for wnid in val_wnids]) | |||||
X_val = np.zeros((num_val, 3, 64, 64), dtype=dtype) | |||||
for i, img_file in enumerate(img_files): | |||||
img_file = os.path.join(path, 'val', 'images', img_file) | |||||
img = imread(img_file) | |||||
if img.ndim == 2: | |||||
img.shape = (64, 64, 1) | |||||
X_val[i] = img.transpose(2, 0, 1) | |||||
# Next load test images | |||||
# Students won't have test labels, so we need to iterate over files in the | |||||
# images directory. | |||||
img_files = os.listdir(os.path.join(path, 'test', 'images')) | |||||
X_test = np.zeros((len(img_files), 3, 64, 64), dtype=dtype) | |||||
for i, img_file in enumerate(img_files): | |||||
img_file = os.path.join(path, 'test', 'images', img_file) | |||||
img = imread(img_file) | |||||
if img.ndim == 2: | |||||
img.shape = (64, 64, 1) | |||||
X_test[i] = img.transpose(2, 0, 1) | |||||
y_test = None | |||||
y_test_file = os.path.join(path, 'test', 'test_annotations.txt') | |||||
if os.path.isfile(y_test_file): | |||||
with open(y_test_file, 'r') as f: | |||||
img_file_to_wnid = {} | |||||
for line in f: | |||||
line = line.split('\t') | |||||
img_file_to_wnid[line[0]] = line[1] | |||||
y_test = [wnid_to_label[img_file_to_wnid[img_file]] | |||||
for img_file in img_files] | |||||
y_test = np.array(y_test) | |||||
mean_image = X_train.mean(axis=0) | |||||
if subtract_mean: | |||||
X_train -= mean_image[None] | |||||
X_val -= mean_image[None] | |||||
X_test -= mean_image[None] | |||||
return { | |||||
'class_names': class_names, | |||||
'X_train': X_train, | |||||
'y_train': y_train, | |||||
'X_val': X_val, | |||||
'y_val': y_val, | |||||
'X_test': X_test, | |||||
'y_test': y_test, | |||||
'mean_image': mean_image, | |||||
} | |||||
def load_models(models_dir): | |||||
""" | |||||
Load saved models from disk. This will attempt to unpickle all files in a | |||||
directory; any files that give errors on unpickling (such as README.txt) | |||||
will be skipped. | |||||
Inputs: | |||||
- models_dir: String giving the path to a directory containing model files. | |||||
Each model file is a pickled dictionary with a 'model' field. | |||||
Returns: | |||||
A dictionary mapping model file names to models. | |||||
""" | |||||
models = {} | |||||
for model_file in os.listdir(models_dir): | |||||
with open(os.path.join(models_dir, model_file), 'rb') as f: | |||||
try: | |||||
models[model_file] = load_pickle(f)['model'] | |||||
except pickle.UnpicklingError: | |||||
continue | |||||
return models | |||||
def load_imagenet_val(num=None): | |||||
"""Load a handful of validation images from ImageNet. | |||||
Inputs: | |||||
- num: Number of images to load (max of 25) | |||||
Returns: | |||||
- X: numpy array with shape [num, 224, 224, 3] | |||||
- y: numpy array of integer image labels, shape [num] | |||||
- class_names: dict mapping integer label to class name | |||||
""" | |||||
imagenet_fn = 'daseCV/datasets/imagenet_val_25.npz' | |||||
if not os.path.isfile(imagenet_fn): | |||||
print('file %s not found' % imagenet_fn) | |||||
print('Run the following:') | |||||
print('cd daseCV/datasets') | |||||
print('bash get_imagenet_val.sh') | |||||
assert False, 'Need to download imagenet_val_25.npz' | |||||
f = np.load(imagenet_fn, allow_pickle=True)  # label_map is stored as a Python object
X = f['X'] | |||||
y = f['y'] | |||||
class_names = f['label_map'].item() | |||||
if num is not None: | |||||
X = X[:num] | |||||
y = y[:num] | |||||
return X, y, class_names |
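# Usage sketch (illustrative): assumes imagenet_val_25.npz has been fetched with
# daseCV/datasets/get_imagenet_val.sh, as instructed in the error message above.
def _demo_load_imagenet_val():
    X, y, class_names = load_imagenet_val(num=5)
    print(X.shape, y.shape)        # expected: (5, 224, 224, 3) (5,)
    print(class_names[int(y[0])])  # human-readable name of the first label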
@@ -0,0 +1,293 @@
from __future__ import print_function | |||||
import numpy as np | |||||
import torch | |||||
import torch.nn as nn | |||||
try: | |||||
from daseCV.im2col_cython import col2im_cython, im2col_cython | |||||
from daseCV.im2col_cython import col2im_6d_cython | |||||
except ImportError: | |||||
print('run the following from the daseCV directory and try again:') | |||||
print('python setup.py build_ext --inplace') | |||||
print('You may also need to restart your IPython kernel')
from daseCV.im2col import * | |||||
def conv_forward_im2col(x, w, b, conv_param): | |||||
""" | |||||
A fast implementation of the forward pass for a convolutional layer | |||||
based on im2col and col2im. | |||||
""" | |||||
N, C, H, W = x.shape | |||||
num_filters, _, filter_height, filter_width = w.shape | |||||
stride, pad = conv_param['stride'], conv_param['pad'] | |||||
# Check dimensions | |||||
assert (W + 2 * pad - filter_width) % stride == 0, 'width does not work' | |||||
assert (H + 2 * pad - filter_height) % stride == 0, 'height does not work' | |||||
# Create output | |||||
out_height = (H + 2 * pad - filter_height) // stride + 1 | |||||
out_width = (W + 2 * pad - filter_width) // stride + 1 | |||||
out = np.zeros((N, num_filters, out_height, out_width), dtype=x.dtype) | |||||
# x_cols = im2col_indices(x, w.shape[2], w.shape[3], pad, stride) | |||||
x_cols = im2col_cython(x, w.shape[2], w.shape[3], pad, stride) | |||||
res = w.reshape((w.shape[0], -1)).dot(x_cols) + b.reshape(-1, 1) | |||||
out = res.reshape(w.shape[0], out.shape[2], out.shape[3], x.shape[0]) | |||||
out = out.transpose(3, 0, 1, 2) | |||||
cache = (x, w, b, conv_param, x_cols) | |||||
return out, cache | |||||
def conv_forward_pytorch(x, w, b, conv_param): | |||||
N, C, H, W = x.shape | |||||
F, _, HH, WW = w.shape | |||||
stride, pad = conv_param['stride'], conv_param['pad'] | |||||
layer = nn.Conv2d(C, F, (HH, WW), stride=stride, padding=pad) | |||||
layer.weight = nn.Parameter(torch.tensor(w)) | |||||
layer.bias = nn.Parameter(torch.tensor(b)) | |||||
tx = torch.tensor(x, requires_grad=True) | |||||
out = layer(tx) | |||||
cache = (x, w, b, conv_param, tx, out, layer) | |||||
return out, cache | |||||
def conv_backward_pytorch(dout, cache): | |||||
x, _, _, _, tx, out, layer = cache | |||||
out.backward(torch.tensor(dout)) | |||||
dx = tx.grad.detach().numpy() | |||||
dw = layer.weight.grad.detach().numpy() | |||||
db = layer.bias.grad.detach().numpy() | |||||
return dx, dw, db | |||||
def conv_forward_strides(x, w, b, conv_param): | |||||
N, C, H, W = x.shape | |||||
F, _, HH, WW = w.shape | |||||
stride, pad = conv_param['stride'], conv_param['pad'] | |||||
# Check dimensions | |||||
#assert (W + 2 * pad - WW) % stride == 0, 'width does not work' | |||||
#assert (H + 2 * pad - HH) % stride == 0, 'height does not work' | |||||
# Pad the input | |||||
p = pad | |||||
x_padded = np.pad(x, ((0, 0), (0, 0), (p, p), (p, p)), mode='constant') | |||||
# Figure out output dimensions | |||||
H += 2 * pad | |||||
W += 2 * pad | |||||
out_h = (H - HH) // stride + 1 | |||||
out_w = (W - WW) // stride + 1 | |||||
# Perform an im2col operation by picking clever strides | |||||
shape = (C, HH, WW, N, out_h, out_w) | |||||
strides = (H * W, W, 1, C * H * W, stride * W, stride) | |||||
strides = x.itemsize * np.array(strides) | |||||
x_stride = np.lib.stride_tricks.as_strided(x_padded, | |||||
shape=shape, strides=strides) | |||||
x_cols = np.ascontiguousarray(x_stride) | |||||
x_cols.shape = (C * HH * WW, N * out_h * out_w) | |||||
# Now all our convolutions are a big matrix multiply | |||||
res = w.reshape(F, -1).dot(x_cols) + b.reshape(-1, 1) | |||||
# Reshape the output | |||||
res.shape = (F, N, out_h, out_w) | |||||
out = res.transpose(1, 0, 2, 3) | |||||
# Be nice and return a contiguous array | |||||
# The old version of conv_forward_fast doesn't do this, so for a fair | |||||
# comparison we won't either | |||||
out = np.ascontiguousarray(out) | |||||
cache = (x, w, b, conv_param, x_cols) | |||||
return out, cache | |||||
def conv_backward_strides(dout, cache): | |||||
x, w, b, conv_param, x_cols = cache | |||||
stride, pad = conv_param['stride'], conv_param['pad'] | |||||
N, C, H, W = x.shape | |||||
F, _, HH, WW = w.shape | |||||
_, _, out_h, out_w = dout.shape | |||||
db = np.sum(dout, axis=(0, 2, 3)) | |||||
dout_reshaped = dout.transpose(1, 0, 2, 3).reshape(F, -1) | |||||
dw = dout_reshaped.dot(x_cols.T).reshape(w.shape) | |||||
dx_cols = w.reshape(F, -1).T.dot(dout_reshaped) | |||||
dx_cols.shape = (C, HH, WW, N, out_h, out_w) | |||||
dx = col2im_6d_cython(dx_cols, N, C, H, W, HH, WW, pad, stride) | |||||
return dx, dw, db | |||||
def conv_backward_im2col(dout, cache): | |||||
""" | |||||
A fast implementation of the backward pass for a convolutional layer | |||||
based on im2col and col2im. | |||||
""" | |||||
x, w, b, conv_param, x_cols = cache | |||||
stride, pad = conv_param['stride'], conv_param['pad'] | |||||
db = np.sum(dout, axis=(0, 2, 3)) | |||||
num_filters, _, filter_height, filter_width = w.shape | |||||
dout_reshaped = dout.transpose(1, 2, 3, 0).reshape(num_filters, -1) | |||||
dw = dout_reshaped.dot(x_cols.T).reshape(w.shape) | |||||
dx_cols = w.reshape(num_filters, -1).T.dot(dout_reshaped) | |||||
# dx = col2im_indices(dx_cols, x.shape, filter_height, filter_width, pad, stride) | |||||
dx = col2im_cython(dx_cols, x.shape[0], x.shape[1], x.shape[2], x.shape[3], | |||||
filter_height, filter_width, pad, stride) | |||||
return dx, dw, db | |||||
conv_forward_fast = conv_forward_strides | |||||
conv_backward_fast = conv_backward_strides | |||||
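# Correctness sketch (illustrative, not part of the original module): gradient-check
# conv_forward_fast / conv_backward_fast on a tiny random problem. This assumes the
# im2col_cython extension has been built (see the message printed above on ImportError).
def _demo_check_conv_fast():
    from daseCV.gradient_check import eval_numerical_gradient_array
    x = np.random.randn(2, 3, 8, 8)
    w = np.random.randn(4, 3, 3, 3)
    b = np.random.randn(4)
    conv_param = {'stride': 1, 'pad': 1}
    out, cache = conv_forward_fast(x, w, b, conv_param)
    dout = np.random.randn(*out.shape)
    dx, dw, db = conv_backward_fast(dout, cache)
    dx_num = eval_numerical_gradient_array(
        lambda x: conv_forward_fast(x, w, b, conv_param)[0], x, dout)
    # The relative error should be very small (on the order of 1e-8 or less).
    print(np.max(np.abs(dx - dx_num) / (np.abs(dx) + np.abs(dx_num) + 1e-12)))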
def max_pool_forward_fast(x, pool_param): | |||||
""" | |||||
A fast implementation of the forward pass for a max pooling layer. | |||||
This chooses between the reshape method and the im2col method. If the pooling | |||||
regions are square and tile the input image, then we can use the reshape | |||||
method which is very fast. Otherwise we fall back on the im2col method, which | |||||
is not much faster than the naive method. | |||||
""" | |||||
N, C, H, W = x.shape | |||||
pool_height, pool_width = pool_param['pool_height'], pool_param['pool_width'] | |||||
stride = pool_param['stride'] | |||||
same_size = pool_height == pool_width == stride | |||||
tiles = H % pool_height == 0 and W % pool_width == 0 | |||||
if same_size and tiles: | |||||
out, reshape_cache = max_pool_forward_reshape(x, pool_param) | |||||
cache = ('reshape', reshape_cache) | |||||
else: | |||||
out, im2col_cache = max_pool_forward_im2col(x, pool_param) | |||||
cache = ('im2col', im2col_cache) | |||||
return out, cache | |||||
def max_pool_backward_fast(dout, cache): | |||||
""" | |||||
A fast implementation of the backward pass for a max pooling layer. | |||||
This switches between the reshape method and the im2col method depending on
which method was used to generate the cache. | |||||
""" | |||||
method, real_cache = cache | |||||
if method == 'reshape': | |||||
return max_pool_backward_reshape(dout, real_cache) | |||||
elif method == 'im2col': | |||||
return max_pool_backward_im2col(dout, real_cache) | |||||
else: | |||||
raise ValueError('Unrecognized method "%s"' % method) | |||||
def max_pool_forward_reshape(x, pool_param): | |||||
""" | |||||
A fast implementation of the forward pass for the max pooling layer that uses | |||||
some clever reshaping. | |||||
This can only be used for square pooling regions that tile the input. | |||||
""" | |||||
N, C, H, W = x.shape | |||||
pool_height, pool_width = pool_param['pool_height'], pool_param['pool_width'] | |||||
stride = pool_param['stride'] | |||||
assert pool_height == pool_width == stride, 'Invalid pool params' | |||||
assert H % pool_height == 0 | |||||
assert W % pool_width == 0
x_reshaped = x.reshape(N, C, H // pool_height, pool_height, | |||||
W // pool_width, pool_width) | |||||
out = x_reshaped.max(axis=3).max(axis=4) | |||||
cache = (x, x_reshaped, out) | |||||
return out, cache | |||||
def max_pool_backward_reshape(dout, cache): | |||||
""" | |||||
A fast implementation of the backward pass for the max pooling layer that | |||||
uses some clever broadcasting and reshaping. | |||||
This can only be used if the forward pass was computed using | |||||
max_pool_forward_reshape. | |||||
NOTE: If a pooling window contains multiple argmaxes, this implementation
splits the upstream gradient equally among all tied elements (the division by
np.sum(mask, ...) below), which yields a valid subgradient. Ties are rare in
practice; if you remove that division, every tied element receives the full
upstream gradient, which is technically incorrect but roughly 40% faster.
""" | |||||
x, x_reshaped, out = cache | |||||
dx_reshaped = np.zeros_like(x_reshaped) | |||||
out_newaxis = out[:, :, :, np.newaxis, :, np.newaxis] | |||||
mask = (x_reshaped == out_newaxis) | |||||
dout_newaxis = dout[:, :, :, np.newaxis, :, np.newaxis] | |||||
dout_broadcast, _ = np.broadcast_arrays(dout_newaxis, dx_reshaped) | |||||
dx_reshaped[mask] = dout_broadcast[mask] | |||||
dx_reshaped /= np.sum(mask, axis=(3, 5), keepdims=True) | |||||
dx = dx_reshaped.reshape(x.shape) | |||||
return dx | |||||
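# Illustration of the tie-handling described in the docstring above (assumed toy
# input): a single 2x2 window with two tied maxima receives half of the upstream
# gradient at each tied position.
def _demo_pool_reshape_ties():
    x = np.array([[[[1., 1.],
                    [0., 0.]]]])   # shape (1, 1, 2, 2), tied maximum in one window
    pool_param = {'pool_height': 2, 'pool_width': 2, 'stride': 2}
    out, cache = max_pool_forward_reshape(x, pool_param)
    dx = max_pool_backward_reshape(np.ones_like(out), cache)
    print(out)   # [[[[1.]]]]
    print(dx)    # 0.5 at each tied maximum, 0 elsewhere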
def max_pool_forward_im2col(x, pool_param): | |||||
""" | |||||
An implementation of the forward pass for max pooling based on im2col. | |||||
This isn't much faster than the naive version, so it should be avoided if | |||||
possible. | |||||
""" | |||||
N, C, H, W = x.shape | |||||
pool_height, pool_width = pool_param['pool_height'], pool_param['pool_width'] | |||||
stride = pool_param['stride'] | |||||
assert (H - pool_height) % stride == 0, 'Invalid height' | |||||
assert (W - pool_width) % stride == 0, 'Invalid width' | |||||
out_height = (H - pool_height) // stride + 1 | |||||
out_width = (W - pool_width) // stride + 1 | |||||
x_split = x.reshape(N * C, 1, H, W) | |||||
x_cols = im2col_indices(x_split, pool_height, pool_width, padding=0, stride=stride)
x_cols_argmax = np.argmax(x_cols, axis=0) | |||||
x_cols_max = x_cols[x_cols_argmax, np.arange(x_cols.shape[1])] | |||||
out = x_cols_max.reshape(out_height, out_width, N, C).transpose(2, 3, 0, 1) | |||||
cache = (x, x_cols, x_cols_argmax, pool_param) | |||||
return out, cache | |||||
def max_pool_backward_im2col(dout, cache): | |||||
""" | |||||
An implementation of the backward pass for max pooling based on im2col. | |||||
This isn't much faster than the naive version, so it should be avoided if | |||||
possible. | |||||
""" | |||||
x, x_cols, x_cols_argmax, pool_param = cache | |||||
N, C, H, W = x.shape | |||||
pool_height, pool_width = pool_param['pool_height'], pool_param['pool_width'] | |||||
stride = pool_param['stride'] | |||||
dout_reshaped = dout.transpose(2, 3, 0, 1).flatten() | |||||
dx_cols = np.zeros_like(x_cols) | |||||
dx_cols[x_cols_argmax, np.arange(dx_cols.shape[1])] = dout_reshaped | |||||
dx = col2im_indices(dx_cols, (N * C, 1, H, W), pool_height, pool_width, | |||||
padding=0, stride=stride) | |||||
dx = dx.reshape(x.shape) | |||||
return dx |
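# Usage sketch (illustrative): exercise both pooling code paths on the same input.
# A square pooling region that tiles the input takes the reshape path; any other
# configuration falls back to the im2col path.
def _demo_pool_paths():
    x = np.random.randn(2, 3, 8, 8)
    out_r, cache_r = max_pool_forward_fast(
        x, {'pool_height': 2, 'pool_width': 2, 'stride': 2})
    out_i, cache_i = max_pool_forward_fast(
        x, {'pool_height': 3, 'pool_width': 3, 'stride': 1})
    print(cache_r[0], out_r.shape)   # reshape (2, 3, 4, 4)
    print(cache_i[0], out_i.shape)   # im2col (2, 3, 6, 6)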
@@ -0,0 +1,129 @@
from __future__ import print_function | |||||
from builtins import range | |||||
from past.builtins import xrange | |||||
import numpy as np | |||||
from random import randrange | |||||
def eval_numerical_gradient(f, x, verbose=True, h=0.00001): | |||||
""" | |||||
a naive implementation of numerical gradient of f at x | |||||
- f should be a function that takes a single argument | |||||
- x is the point (numpy array) to evaluate the gradient at | |||||
""" | |||||
fx = f(x) # evaluate function value at original point | |||||
grad = np.zeros_like(x) | |||||
# iterate over all indexes in x | |||||
it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite']) | |||||
while not it.finished: | |||||
# evaluate function at x+h | |||||
ix = it.multi_index | |||||
oldval = x[ix] | |||||
x[ix] = oldval + h # increment by h | |||||
fxph = f(x) # evaluate f(x + h)
x[ix] = oldval - h | |||||
fxmh = f(x) # evaluate f(x - h) | |||||
x[ix] = oldval # restore | |||||
# compute the partial derivative with centered formula | |||||
grad[ix] = (fxph - fxmh) / (2 * h) # the slope | |||||
if verbose: | |||||
print(ix, grad[ix]) | |||||
it.iternext() # step to next dimension | |||||
return grad | |||||
def eval_numerical_gradient_array(f, x, df, h=1e-5): | |||||
""" | |||||
Evaluate a numeric gradient for a function that accepts a numpy | |||||
array and returns a numpy array. | |||||
""" | |||||
grad = np.zeros_like(x) | |||||
it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite']) | |||||
while not it.finished: | |||||
ix = it.multi_index | |||||
oldval = x[ix] | |||||
x[ix] = oldval + h | |||||
pos = f(x).copy() | |||||
x[ix] = oldval - h | |||||
neg = f(x).copy() | |||||
x[ix] = oldval | |||||
grad[ix] = np.sum((pos - neg) * df) / (2 * h) | |||||
it.iternext() | |||||
return grad | |||||
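# Usage sketch (illustrative): numerically differentiate an array-valued function,
# here f(x) = x**2 with analytic gradient 2 * x * df, and compare the two results.
def _demo_eval_numerical_gradient_array():
    x = np.random.randn(3, 4)
    df = np.random.randn(3, 4)
    dx_num = eval_numerical_gradient_array(lambda x: x ** 2, x, df)
    print(np.max(np.abs(dx_num - 2 * x * df)))   # expected: ~1e-10 or smaller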
def eval_numerical_gradient_blobs(f, inputs, output, h=1e-5): | |||||
""" | |||||
Compute numeric gradients for a function that operates on input | |||||
and output blobs. | |||||
We assume that f accepts several input blobs as arguments, followed by a | |||||
blob where outputs will be written. For example, f might be called like: | |||||
f(x, w, out) | |||||
where x and w are input Blobs, and the result of f will be written to out. | |||||
Inputs: | |||||
- f: function | |||||
- inputs: tuple of input blobs | |||||
- output: output blob | |||||
- h: step size | |||||
""" | |||||
numeric_diffs = [] | |||||
for input_blob in inputs: | |||||
diff = np.zeros_like(input_blob.diffs) | |||||
it = np.nditer(input_blob.vals, flags=['multi_index'], | |||||
op_flags=['readwrite']) | |||||
while not it.finished: | |||||
idx = it.multi_index | |||||
orig = input_blob.vals[idx] | |||||
input_blob.vals[idx] = orig + h | |||||
f(*(inputs + (output,))) | |||||
pos = np.copy(output.vals) | |||||
input_blob.vals[idx] = orig - h | |||||
f(*(inputs + (output,))) | |||||
neg = np.copy(output.vals) | |||||
input_blob.vals[idx] = orig | |||||
diff[idx] = np.sum((pos - neg) * output.diffs) / (2.0 * h) | |||||
it.iternext() | |||||
numeric_diffs.append(diff) | |||||
return numeric_diffs | |||||
def eval_numerical_gradient_net(net, inputs, output, h=1e-5): | |||||
return eval_numerical_gradient_blobs(lambda *args: net.forward(), | |||||
inputs, output, h=h) | |||||
def grad_check_sparse(f, x, analytic_grad, num_checks=10, h=1e-5): | |||||
""" | |||||
Sample a few random elements of x and compare the numerical gradient against
the analytic gradient only in those dimensions.
""" | |||||
for i in range(num_checks): | |||||
ix = tuple([randrange(m) for m in x.shape]) | |||||
oldval = x[ix] | |||||
x[ix] = oldval + h # increment by h | |||||
fxph = f(x) # evaluate f(x + h) | |||||
x[ix] = oldval - h # decrement by h
fxmh = f(x) # evaluate f(x - h) | |||||
x[ix] = oldval # reset | |||||
grad_numerical = (fxph - fxmh) / (2 * h) | |||||
grad_analytic = analytic_grad[ix] | |||||
rel_error = (abs(grad_numerical - grad_analytic) / | |||||
(abs(grad_numerical) + abs(grad_analytic))) | |||||
print('numerical: %f analytic: %f, relative error: %e' | |||||
%(grad_numerical, grad_analytic, rel_error)) |
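# Usage sketch (illustrative): check the analytic gradient of f(x) = sum(x**2),
# which is 2 * x, with the helpers above on a small random input.
def _demo_gradient_checks():
    x = np.random.randn(4, 5)
    f = lambda x: np.sum(x ** 2)
    grad_num = eval_numerical_gradient(f, x, verbose=False)
    print(np.max(np.abs(grad_num - 2 * x)))        # expected: ~1e-8 or smaller
    grad_check_sparse(f, x, 2 * x, num_checks=3)   # relative errors should be tiny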
@@ -0,0 +1,54 @@
from builtins import range | |||||
import numpy as np | |||||
def get_im2col_indices(x_shape, field_height, field_width, padding=1, stride=1): | |||||
# First figure out what the size of the output should be | |||||
N, C, H, W = x_shape | |||||
assert (H + 2 * padding - field_height) % stride == 0 | |||||
assert (W + 2 * padding - field_width) % stride == 0
out_height = (H + 2 * padding - field_height) // stride + 1
out_width = (W + 2 * padding - field_width) // stride + 1
i0 = np.repeat(np.arange(field_height), field_width) | |||||
i0 = np.tile(i0, C) | |||||
i1 = stride * np.repeat(np.arange(out_height), out_width) | |||||
j0 = np.tile(np.arange(field_width), field_height * C) | |||||
j1 = stride * np.tile(np.arange(out_width), out_height) | |||||
i = i0.reshape(-1, 1) + i1.reshape(1, -1) | |||||
j = j0.reshape(-1, 1) + j1.reshape(1, -1) | |||||
k = np.repeat(np.arange(C), field_height * field_width).reshape(-1, 1) | |||||
return (k, i, j) | |||||
def im2col_indices(x, field_height, field_width, padding=1, stride=1): | |||||
""" An implementation of im2col based on some fancy indexing """ | |||||
# Zero-pad the input | |||||
p = padding | |||||
x_padded = np.pad(x, ((0, 0), (0, 0), (p, p), (p, p)), mode='constant') | |||||
k, i, j = get_im2col_indices(x.shape, field_height, field_width, padding, | |||||
stride) | |||||
cols = x_padded[:, k, i, j] | |||||
C = x.shape[1] | |||||
cols = cols.transpose(1, 2, 0).reshape(field_height * field_width * C, -1) | |||||
return cols | |||||
def col2im_indices(cols, x_shape, field_height=3, field_width=3, padding=1, | |||||
stride=1): | |||||
""" An implementation of col2im based on fancy indexing and np.add.at """ | |||||
N, C, H, W = x_shape | |||||
H_padded, W_padded = H + 2 * padding, W + 2 * padding | |||||
x_padded = np.zeros((N, C, H_padded, W_padded), dtype=cols.dtype) | |||||
k, i, j = get_im2col_indices(x_shape, field_height, field_width, padding, | |||||
stride) | |||||
cols_reshaped = cols.reshape(C * field_height * field_width, -1, N) | |||||
cols_reshaped = cols_reshaped.transpose(2, 0, 1) | |||||
np.add.at(x_padded, (slice(None), k, i, j), cols_reshaped) | |||||
if padding == 0: | |||||
return x_padded | |||||
return x_padded[:, :, padding:-padding, padding:-padding] |
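# Sanity-check sketch (illustrative): with non-overlapping patches (stride equal to
# the field size and no padding) col2im_indices exactly inverts im2col_indices;
# with overlapping patches the contributions of overlapping windows are summed.
def _demo_im2col_roundtrip():
    x = np.random.randn(2, 3, 4, 4)
    cols = im2col_indices(x, 2, 2, padding=0, stride=2)
    print(cols.shape)   # (C * 2 * 2, N * 2 * 2) = (12, 8)
    x_rec = col2im_indices(cols, x.shape, 2, 2, padding=0, stride=2)
    print(np.allclose(x, x_rec))   # True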
@@ -0,0 +1,121 @@
import numpy as np | |||||
cimport numpy as np | |||||
cimport cython | |||||
# DTYPE = np.float64 | |||||
# ctypedef np.float64_t DTYPE_t | |||||
ctypedef fused DTYPE_t: | |||||
np.float32_t | |||||
np.float64_t | |||||
def im2col_cython(np.ndarray[DTYPE_t, ndim=4] x, int field_height, | |||||
int field_width, int padding, int stride): | |||||
cdef int N = x.shape[0] | |||||
cdef int C = x.shape[1] | |||||
cdef int H = x.shape[2] | |||||
cdef int W = x.shape[3] | |||||
cdef int HH = (H + 2 * padding - field_height) / stride + 1 | |||||
cdef int WW = (W + 2 * padding - field_width) / stride + 1 | |||||
cdef int p = padding | |||||
cdef np.ndarray[DTYPE_t, ndim=4] x_padded = np.pad(x, | |||||
((0, 0), (0, 0), (p, p), (p, p)), mode='constant') | |||||
cdef np.ndarray[DTYPE_t, ndim=2] cols = np.zeros( | |||||
(C * field_height * field_width, N * HH * WW), | |||||
dtype=x.dtype) | |||||
# Moving the inner loop to a C function with no bounds checking works, but does | |||||
# not seem to help performance in any measurable way. | |||||
im2col_cython_inner(cols, x_padded, N, C, H, W, HH, WW, | |||||
field_height, field_width, padding, stride) | |||||
return cols | |||||
@cython.boundscheck(False) | |||||
cdef int im2col_cython_inner(np.ndarray[DTYPE_t, ndim=2] cols, | |||||
np.ndarray[DTYPE_t, ndim=4] x_padded, | |||||
int N, int C, int H, int W, int HH, int WW, | |||||
int field_height, int field_width, int padding, int stride) except? -1: | |||||
cdef int c, ii, jj, row, yy, xx, i, col | |||||
for c in range(C): | |||||
for yy in range(HH): | |||||
for xx in range(WW): | |||||
for ii in range(field_height): | |||||
for jj in range(field_width): | |||||
row = c * field_width * field_height + ii * field_width + jj  # matches w.reshape(F, -1) ordering
for i in range(N): | |||||
col = yy * WW * N + xx * N + i | |||||
cols[row, col] = x_padded[i, c, stride * yy + ii, stride * xx + jj] | |||||
def col2im_cython(np.ndarray[DTYPE_t, ndim=2] cols, int N, int C, int H, int W, | |||||
int field_height, int field_width, int padding, int stride): | |||||
cdef np.ndarray x = np.empty((N, C, H, W), dtype=cols.dtype) | |||||
cdef int HH = (H + 2 * padding - field_height) / stride + 1 | |||||
cdef int WW = (W + 2 * padding - field_width) / stride + 1 | |||||
cdef np.ndarray[DTYPE_t, ndim=4] x_padded = np.zeros((N, C, H + 2 * padding, W + 2 * padding), | |||||
dtype=cols.dtype) | |||||
# Moving the inner loop to a C-function with no bounds checking improves | |||||
# performance quite a bit for col2im. | |||||
col2im_cython_inner(cols, x_padded, N, C, H, W, HH, WW, | |||||
field_height, field_width, padding, stride) | |||||
if padding > 0: | |||||
return x_padded[:, :, padding:-padding, padding:-padding] | |||||
return x_padded | |||||
@cython.boundscheck(False) | |||||
cdef int col2im_cython_inner(np.ndarray[DTYPE_t, ndim=2] cols, | |||||
np.ndarray[DTYPE_t, ndim=4] x_padded, | |||||
int N, int C, int H, int W, int HH, int WW, | |||||
int field_height, int field_width, int padding, int stride) except? -1: | |||||
cdef int c, ii, jj, row, yy, xx, i, col | |||||
for c in range(C): | |||||
for ii in range(field_height): | |||||
for jj in range(field_width): | |||||
row = c * field_width * field_height + ii * field_width + jj  # same row ordering as im2col_cython_inner
for yy in range(HH): | |||||
for xx in range(WW): | |||||
for i in range(N): | |||||
col = yy * WW * N + xx * N + i | |||||
x_padded[i, c, stride * yy + ii, stride * xx + jj] += cols[row, col] | |||||
@cython.boundscheck(False) | |||||
@cython.wraparound(False) | |||||
cdef col2im_6d_cython_inner(np.ndarray[DTYPE_t, ndim=6] cols, | |||||
np.ndarray[DTYPE_t, ndim=4] x_padded, | |||||
int N, int C, int H, int W, int HH, int WW, | |||||
int out_h, int out_w, int pad, int stride): | |||||
cdef int c, hh, ww, n, h, w | |||||
for n in range(N): | |||||
for c in range(C): | |||||
for hh in range(HH): | |||||
for ww in range(WW): | |||||
for h in range(out_h): | |||||
for w in range(out_w): | |||||
x_padded[n, c, stride * h + hh, stride * w + ww] += cols[c, hh, ww, n, h, w] | |||||
def col2im_6d_cython(np.ndarray[DTYPE_t, ndim=6] cols, int N, int C, int H, int W, | |||||
int HH, int WW, int pad, int stride): | |||||
cdef np.ndarray x = np.empty((N, C, H, W), dtype=cols.dtype) | |||||
cdef int out_h = (H + 2 * pad - HH) / stride + 1 | |||||
cdef int out_w = (W + 2 * pad - WW) / stride + 1 | |||||
cdef np.ndarray[DTYPE_t, ndim=4] x_padded = np.zeros((N, C, H + 2 * pad, W + 2 * pad), | |||||
dtype=cols.dtype) | |||||
col2im_6d_cython_inner(cols, x_padded, N, C, H, W, HH, WW, out_h, out_w, pad, stride) | |||||
if pad > 0: | |||||
return x_padded[:, :, pad:-pad, pad:-pad] | |||||
return x_padded |
@@ -0,0 +1,105 @@
from daseCV.layers import * | |||||
from daseCV.fast_layers import * | |||||
def affine_relu_forward(x, w, b): | |||||
""" | |||||
Convenience layer that performs an affine transform followed by a ReLU
Inputs: | |||||
- x: Input to the affine layer | |||||
- w, b: Weights for the affine layer | |||||
Returns a tuple of: | |||||
- out: Output from the ReLU | |||||
- cache: Object to give to the backward pass | |||||
""" | |||||
a, fc_cache = affine_forward(x, w, b) | |||||
out, relu_cache = relu_forward(a) | |||||
cache = (fc_cache, relu_cache) | |||||
return out, cache | |||||
def affine_relu_backward(dout, cache): | |||||
""" | |||||
Backward pass for the affine-relu convenience layer | |||||
""" | |||||
fc_cache, relu_cache = cache | |||||
da = relu_backward(dout, relu_cache) | |||||
dx, dw, db = affine_backward(da, fc_cache) | |||||
return dx, dw, db | |||||
def conv_relu_forward(x, w, b, conv_param): | |||||
""" | |||||
A convenience layer that performs a convolution followed by a ReLU. | |||||
Inputs: | |||||
- x: Input to the convolutional layer | |||||
- w, b, conv_param: Weights and parameters for the convolutional layer | |||||
Returns a tuple of: | |||||
- out: Output from the ReLU | |||||
- cache: Object to give to the backward pass | |||||
""" | |||||
a, conv_cache = conv_forward_fast(x, w, b, conv_param) | |||||
out, relu_cache = relu_forward(a) | |||||
cache = (conv_cache, relu_cache) | |||||
return out, cache | |||||
def conv_relu_backward(dout, cache): | |||||
""" | |||||
Backward pass for the conv-relu convenience layer. | |||||
""" | |||||
conv_cache, relu_cache = cache | |||||
da = relu_backward(dout, relu_cache) | |||||
dx, dw, db = conv_backward_fast(da, conv_cache) | |||||
return dx, dw, db | |||||
def conv_bn_relu_forward(x, w, b, gamma, beta, conv_param, bn_param): | |||||
a, conv_cache = conv_forward_fast(x, w, b, conv_param) | |||||
an, bn_cache = spatial_batchnorm_forward(a, gamma, beta, bn_param) | |||||
out, relu_cache = relu_forward(an) | |||||
cache = (conv_cache, bn_cache, relu_cache) | |||||
return out, cache | |||||
def conv_bn_relu_backward(dout, cache): | |||||
conv_cache, bn_cache, relu_cache = cache | |||||
dan = relu_backward(dout, relu_cache) | |||||
da, dgamma, dbeta = spatial_batchnorm_backward(dan, bn_cache) | |||||
dx, dw, db = conv_backward_fast(da, conv_cache) | |||||
return dx, dw, db, dgamma, dbeta | |||||
def conv_relu_pool_forward(x, w, b, conv_param, pool_param): | |||||
""" | |||||
Convenience layer that performs a convolution, a ReLU, and a pool. | |||||
Inputs: | |||||
- x: Input to the convolutional layer | |||||
- w, b, conv_param: Weights and parameters for the convolutional layer | |||||
- pool_param: Parameters for the pooling layer | |||||
Returns a tuple of: | |||||
- out: Output from the pooling layer | |||||
- cache: Object to give to the backward pass | |||||
""" | |||||
a, conv_cache = conv_forward_fast(x, w, b, conv_param) | |||||
s, relu_cache = relu_forward(a) | |||||
out, pool_cache = max_pool_forward_fast(s, pool_param) | |||||
cache = (conv_cache, relu_cache, pool_cache) | |||||
return out, cache | |||||
def conv_relu_pool_backward(dout, cache): | |||||
""" | |||||
Backward pass for the conv-relu-pool convenience layer | |||||
""" | |||||
conv_cache, relu_cache, pool_cache = cache | |||||
ds = max_pool_backward_fast(dout, pool_cache) | |||||
da = relu_backward(ds, relu_cache) | |||||
dx, dw, db = conv_backward_fast(da, conv_cache) | |||||
return dx, dw, db |
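# Usage sketch (illustrative): gradient-check the conv-relu-pool sandwich once the
# ReLU stubs in daseCV/layers.py are implemented and the im2col_cython extension is
# built; neither is guaranteed in this skeleton.
def _demo_check_conv_relu_pool():
    import numpy as np
    from daseCV.gradient_check import eval_numerical_gradient_array
    x = np.random.randn(2, 3, 8, 8)
    w = np.random.randn(4, 3, 3, 3)
    b = np.random.randn(4)
    conv_param = {'stride': 1, 'pad': 1}
    pool_param = {'pool_height': 2, 'pool_width': 2, 'stride': 2}
    out, cache = conv_relu_pool_forward(x, w, b, conv_param, pool_param)
    dout = np.random.randn(*out.shape)
    dx, dw, db = conv_relu_pool_backward(dout, cache)
    dw_num = eval_numerical_gradient_array(
        lambda w: conv_relu_pool_forward(x, w, b, conv_param, pool_param)[0], w, dout)
    print(np.max(np.abs(dw - dw_num)))   # should be very small (~1e-7 or less)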
@@ -0,0 +1,793 @@
from builtins import range | |||||
import numpy as np | |||||
def affine_forward(x, w, b): | |||||
""" | |||||
Computes the forward pass for an affine (fully-connected) layer.
The input x has shape (N, d_1, ..., d_k) and contains a minibatch of N
examples, where each example x[i] has shape (d_1, ..., d_k). We reshape
each input into a vector of dimension D = d_1 * ... * d_k and then
transform it into an output vector of dimension M.
Inputs: | |||||
- x: A numpy array containing input data, of shape (N, d_1, ..., d_k) | |||||
- w: A numpy array of weights, of shape (D, M) | |||||
- b: A numpy array of biases, of shape (M,) | |||||
Returns a tuple of: | |||||
- out: output, of shape (N, M) | |||||
- cache: (x, w, b) | |||||
""" | |||||
out = None | |||||
########################################################################### | |||||
# TODO: Implement the affine forward pass. Store the result in out. You # | |||||
# will need to reshape the input into rows. # | |||||
########################################################################### | |||||
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** | |||||
pass | |||||
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** | |||||
########################################################################### | |||||
# END OF YOUR CODE # | |||||
########################################################################### | |||||
cache = (x, w, b) | |||||
return out, cache | |||||
def affine_backward(dout, cache): | |||||
""" | |||||
Computes the backward pass for an affine (fully-connected) layer.
Inputs: | |||||
- dout: Upstream derivative, of shape (N, M) | |||||
- cache: Tuple of: | |||||
- x: Input data, of shape (N, d_1, ... d_k) | |||||
- w: Weights, of shape (D, M) | |||||
- b: Biases, of shape (M,) | |||||
Returns a tuple of: | |||||
- dx: Gradient with respect to x, of shape (N, d1, ..., d_k) | |||||
- dw: Gradient with respect to w, of shape (D, M) | |||||
- db: Gradient with respect to b, of shape (M,) | |||||
""" | |||||
x, w, b = cache | |||||
dx, dw, db = None, None, None | |||||
########################################################################### | |||||
# TODO: Implement the affine backward pass. # | |||||
########################################################################### | |||||
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** | |||||
pass | |||||
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** | |||||
########################################################################### | |||||
# END OF YOUR CODE # | |||||
########################################################################### | |||||
return dx, dw, db | |||||
def relu_forward(x): | |||||
""" | |||||
Computes the forward pass for a layer of rectified linear units (ReLUs).
Input: | |||||
- x: Inputs, of any shape | |||||
Returns a tuple of: | |||||
- out: Output, of the same shape as x | |||||
- cache: x | |||||
""" | |||||
out = None | |||||
########################################################################### | |||||
# TODO: Implement the ReLU forward pass. # | |||||
########################################################################### | |||||
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** | |||||
pass | |||||
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** | |||||
########################################################################### | |||||
# END OF YOUR CODE # | |||||
########################################################################### | |||||
cache = x | |||||
return out, cache | |||||
def relu_backward(dout, cache): | |||||
""" | |||||
Computes the backward pass for a layer of rectified linear units (ReLUs). | |||||
Input: | |||||
- dout: Upstream derivatives, of any shape | |||||
- cache: Input x, of same shape as dout | |||||
Returns: | |||||
- dx: Gradient with respect to x | |||||
""" | |||||
dx, x = None, cache | |||||
########################################################################### | |||||
# TODO: Implement the ReLU backward pass. # | |||||
########################################################################### | |||||
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** | |||||
pass | |||||
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** | |||||
########################################################################### | |||||
# END OF YOUR CODE # | |||||
########################################################################### | |||||
return dx | |||||
def batchnorm_forward(x, gamma, beta, bn_param): | |||||
""" | |||||
Forward pass for batch normalization. | |||||
During training the sample mean and (uncorrected) sample variance are | |||||
computed from minibatch statistics and used to normalize the incoming data. | |||||
During training we also keep an exponentially decaying running mean of the | |||||
mean and variance of each feature, and these averages are used to normalize | |||||
data at test-time. | |||||
At each timestep we update the running averages for mean and variance using | |||||
an exponential decay based on the momentum parameter: | |||||
running_mean = momentum * running_mean + (1 - momentum) * sample_mean | |||||
running_var = momentum * running_var + (1 - momentum) * sample_var | |||||
Note that the batch normalization paper suggests a different test-time | |||||
behavior: they compute sample mean and variance for each feature using a | |||||
large number of training images rather than using a running average. For | |||||
this implementation we have chosen to use running averages instead since | |||||
they do not require an additional estimation step; the torch7 | |||||
implementation of batch normalization also uses running averages. | |||||
Input: | |||||
- x: Data of shape (N, D) | |||||
- gamma: Scale parameter of shape (D,) | |||||
- beta: Shift parameter of shape (D,)
- bn_param: Dictionary with the following keys: | |||||
- mode: 'train' or 'test'; required | |||||
- eps: Constant for numeric stability | |||||
- momentum: Constant for running mean / variance. | |||||
- running_mean: Array of shape (D,) giving running mean of features | |||||
- running_var: Array of shape (D,) giving running variance of features
Returns a tuple of: | |||||
- out: of shape (N, D) | |||||
- cache: A tuple of values needed in the backward pass | |||||
""" | |||||
mode = bn_param['mode'] | |||||
eps = bn_param.get('eps', 1e-5) | |||||
momentum = bn_param.get('momentum', 0.9) | |||||
N, D = x.shape | |||||
running_mean = bn_param.get('running_mean', np.zeros(D, dtype=x.dtype)) | |||||
running_var = bn_param.get('running_var', np.zeros(D, dtype=x.dtype)) | |||||
out, cache = None, None | |||||
if mode == 'train': | |||||
####################################################################### | |||||
# TODO: Implement the training-time forward pass for batch norm. # | |||||
# Use minibatch statistics to compute the mean and variance, use # | |||||
# these statistics to normalize the incoming data, and scale and # | |||||
# shift the normalized data using gamma and beta. # | |||||
# # | |||||
# You should store the output in the variable out. Any intermediates # | |||||
# that you need for the backward pass should be stored in the cache # | |||||
# variable. # | |||||
# # | |||||
# You should also use your computed sample mean and variance together # | |||||
# with the momentum variable to update the running mean and running # | |||||
# variance, storing your result in the running_mean and running_var # | |||||
# variables. # | |||||
# # | |||||
# Note that though you should be keeping track of the running # | |||||
# variance, you should normalize the data based on the standard # | |||||
# deviation (square root of variance) instead! # | |||||
# Referencing the original paper (https://arxiv.org/abs/1502.03167) # | |||||
# might prove to be helpful. # | |||||
####################################################################### | |||||
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** | |||||
pass | |||||
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** | |||||
####################################################################### | |||||
# END OF YOUR CODE # | |||||
####################################################################### | |||||
elif mode == 'test': | |||||
####################################################################### | |||||
# TODO: Implement the test-time forward pass for batch normalization. # | |||||
# Use the running mean and variance to normalize the incoming data, # | |||||
# then scale and shift the normalized data using gamma and beta. # | |||||
# Store the result in the out variable. # | |||||
####################################################################### | |||||
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** | |||||
pass | |||||
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** | |||||
####################################################################### | |||||
# END OF YOUR CODE # | |||||
####################################################################### | |||||
else: | |||||
raise ValueError('Invalid forward batchnorm mode "%s"' % mode) | |||||
# Store the updated running means back into bn_param | |||||
bn_param['running_mean'] = running_mean | |||||
bn_param['running_var'] = running_var | |||||
return out, cache | |||||
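# Worked numeric example of the running-average update quoted in the docstring above
# (values are illustrative): momentum = 0.9, running_mean = 0.0, sample_mean = 1.0.
def _demo_running_average_update():
    momentum, running_mean, sample_mean = 0.9, 0.0, 1.0
    running_mean = momentum * running_mean + (1 - momentum) * sample_mean
    print(running_mean)   # 0.1 (up to float rounding)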
def batchnorm_backward(dout, cache): | |||||
""" | |||||
Backward pass for batch normalization. | |||||
For this implementation, you should write out a computation graph for | |||||
batch normalization on paper and propagate gradients backward through | |||||
intermediate nodes. | |||||
Inputs: | |||||
- dout: Upstream derivatives, of shape (N, D) | |||||
- cache: Variable of intermediates from batchnorm_forward. | |||||
Returns a tuple of: | |||||
- dx: Gradient with respect to inputs x, of shape (N, D) | |||||
- dgamma: Gradient with respect to scale parameter gamma, of shape (D,) | |||||
- dbeta: Gradient with respect to shift parameter beta, of shape (D,) | |||||
""" | |||||
dx, dgamma, dbeta = None, None, None | |||||
########################################################################### | |||||
# TODO: Implement the backward pass for batch normalization. Store the # | |||||
# results in the dx, dgamma, and dbeta variables. # | |||||
# Referencing the original paper (https://arxiv.org/abs/1502.03167) # | |||||
# might prove to be helpful. # | |||||
########################################################################### | |||||
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** | |||||
pass | |||||
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** | |||||
########################################################################### | |||||
# END OF YOUR CODE # | |||||
########################################################################### | |||||
return dx, dgamma, dbeta | |||||
def batchnorm_backward_alt(dout, cache): | |||||
""" | |||||
Alternative backward pass for batch normalization. | |||||
For this implementation you should work out the derivatives for the batch | |||||
normalizaton backward pass on paper and simplify as much as possible. You | |||||
should be able to derive a simple expression for the backward pass. | |||||
See the jupyter notebook for more hints. | |||||
Note: This implementation should expect to receive the same cache variable | |||||
as batchnorm_backward, but might not use all of the values in the cache. | |||||
Inputs / outputs: Same as batchnorm_backward | |||||
""" | |||||
dx, dgamma, dbeta = None, None, None | |||||
########################################################################### | |||||
# TODO: Implement the backward pass for batch normalization. Store the # | |||||
# results in the dx, dgamma, and dbeta variables. # | |||||
# # | |||||
# After computing the gradient with respect to the centered inputs, you # | |||||
# should be able to compute gradients with respect to the inputs in a # | |||||
# single statement; our implementation fits on a single 80-character line.# | |||||
########################################################################### | |||||
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** | |||||
pass | |||||
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** | |||||
########################################################################### | |||||
# END OF YOUR CODE # | |||||
########################################################################### | |||||
return dx, dgamma, dbeta | |||||
def layernorm_forward(x, gamma, beta, ln_param): | |||||
""" | |||||
Forward pass for layer normalization. | |||||
During both training and test-time, the incoming data is normalized per data-point, | |||||
before being scaled by gamma and beta parameters identical to that of batch normalization. | |||||
Note that in contrast to batch normalization, the behavior during train and test-time for | |||||
layer normalization are identical, and we do not need to keep track of running averages | |||||
of any sort. | |||||
Input: | |||||
- x: Data of shape (N, D) | |||||
- gamma: Scale parameter of shape (D,) | |||||
- beta: Shift parameter of shape (D,)
- ln_param: Dictionary with the following keys: | |||||
- eps: Constant for numeric stability | |||||
Returns a tuple of: | |||||
- out: of shape (N, D) | |||||
- cache: A tuple of values needed in the backward pass | |||||
""" | |||||
out, cache = None, None | |||||
eps = ln_param.get('eps', 1e-5) | |||||
########################################################################### | |||||
# TODO: Implement the training-time forward pass for layer norm. # | |||||
# Normalize the incoming data, and scale and shift the normalized data # | |||||
# using gamma and beta. # | |||||
# HINT: this can be done by slightly modifying your training-time # | |||||
# implementation of batch normalization, and inserting a line or two of # | |||||
# well-placed code. In particular, can you think of any matrix # | |||||
# transformations you could perform, that would enable you to copy over # | |||||
# the batch norm code and leave it almost unchanged? # | |||||
########################################################################### | |||||
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** | |||||
pass | |||||
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** | |||||
########################################################################### | |||||
# END OF YOUR CODE # | |||||
########################################################################### | |||||
return out, cache | |||||
def layernorm_backward(dout, cache): | |||||
""" | |||||
Backward pass for layer normalization. | |||||
For this implementation, you can heavily rely on the work you've done already | |||||
for batch normalization. | |||||
Inputs: | |||||
- dout: Upstream derivatives, of shape (N, D) | |||||
- cache: Variable of intermediates from layernorm_forward. | |||||
Returns a tuple of: | |||||
- dx: Gradient with respect to inputs x, of shape (N, D) | |||||
- dgamma: Gradient with respect to scale parameter gamma, of shape (D,) | |||||
- dbeta: Gradient with respect to shift parameter beta, of shape (D,) | |||||
""" | |||||
dx, dgamma, dbeta = None, None, None | |||||
########################################################################### | |||||
# TODO: Implement the backward pass for layer norm. # | |||||
# # | |||||
# HINT: this can be done by slightly modifying your training-time # | |||||
# implementation of batch normalization. The hints to the forward pass # | |||||
# still apply! # | |||||
########################################################################### | |||||
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** | |||||
pass | |||||
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** | |||||
########################################################################### | |||||
# END OF YOUR CODE # | |||||
########################################################################### | |||||
return dx, dgamma, dbeta | |||||
def dropout_forward(x, dropout_param): | |||||
""" | |||||
Performs the forward pass for (inverted) dropout. | |||||
Inputs: | |||||
- x: Input data, of any shape | |||||
- dropout_param: A dictionary with the following keys: | |||||
- p: Dropout parameter. We keep each neuron output with probability p. | |||||
- mode: 'test' or 'train'. If the mode is train, then perform dropout; | |||||
if the mode is test, then just return the input. | |||||
- seed: Seed for the random number generator. Passing seed makes this | |||||
function deterministic, which is needed for gradient checking but not | |||||
in real networks. | |||||
Outputs: | |||||
- out: Array of the same shape as x. | |||||
- cache: tuple (dropout_param, mask). In training mode, mask is the dropout | |||||
mask that was used to multiply the input; in test mode, mask is None. | |||||
NOTE: Please implement **inverted** dropout, not the vanilla version of dropout. | |||||
See http://cs231n.github.io/neural-networks-2/#reg for more details. | |||||
NOTE 2: Keep in mind that p is the probability of **keeping** a neuron
output; some other sources instead define p as the probability of dropping
a neuron output.
""" | |||||
p, mode = dropout_param['p'], dropout_param['mode'] | |||||
if 'seed' in dropout_param: | |||||
np.random.seed(dropout_param['seed']) | |||||
mask = None | |||||
out = None | |||||
if mode == 'train': | |||||
####################################################################### | |||||
# TODO: Implement the training-phase forward pass for inverted dropout.
# Store the dropout mask in the mask variable.
####################################################################### | |||||
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** | |||||
pass | |||||
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** | |||||
####################################################################### | |||||
# END OF YOUR CODE # | |||||
####################################################################### | |||||
elif mode == 'test': | |||||
####################################################################### | |||||
# TODO: Implement the test-phase forward pass for inverted dropout.
####################################################################### | |||||
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** | |||||
pass | |||||
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** | |||||
####################################################################### | |||||
# END OF YOUR CODE # | |||||
####################################################################### | |||||
cache = (dropout_param, mask) | |||||
out = out.astype(x.dtype, copy=False) | |||||
return out, cache | |||||
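# Numeric illustration of the inverted-dropout scaling described in the NOTEs above
# (a property check, not the solution to the TODO): with keep probability p,
# surviving activations are scaled by 1/p during training so that their expected
# value matches the unscaled test-time output.
def _demo_inverted_dropout_scaling():
    np.random.seed(0)
    p = 0.5
    x = np.ones(100000)
    mask = (np.random.rand(*x.shape) < p) / p
    print(np.mean(mask * x))   # close to 1.0, the test-time value of x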
def dropout_backward(dout, cache): | |||||
""" | |||||
Perform the backward pass for (inverted) dropout. | |||||
Inputs: | |||||
- dout: Upstream derivatives, of any shape | |||||
- cache: (dropout_param, mask) from dropout_forward. | |||||
""" | |||||
dropout_param, mask = cache | |||||
mode = dropout_param['mode'] | |||||
dx = None | |||||
if mode == 'train': | |||||
####################################################################### | |||||
# TODO: Implement the training-phase backward pass for inverted dropout.
####################################################################### | |||||
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** | |||||
pass | |||||
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** | |||||
####################################################################### | |||||
# END OF YOUR CODE # | |||||
####################################################################### | |||||
elif mode == 'test': | |||||
dx = dout | |||||
return dx | |||||
def conv_forward_naive(x, w, b, conv_param): | |||||
""" | |||||
A naive implementation of the forward pass for a convolutional layer. | |||||
The input consists of N data points, each with C channels, height H and | |||||
width W. We convolve each input with F different filters, where each filter | |||||
spans all C channels and has height HH and width WW. | |||||
Input: | |||||
- x: Input data of shape (N, C, H, W) | |||||
- w: Filter weights of shape (F, C, HH, WW) | |||||
- b: Biases, of shape (F,) | |||||
- conv_param: A dictionary with the following keys: | |||||
- 'stride': The number of pixels between adjacent receptive fields in the | |||||
horizontal and vertical directions. | |||||
- 'pad': The number of pixels that will be used to zero-pad the input. | |||||
During padding, 'pad' zeros should be placed symmetrically (i.e. equally on both sides)
along the height and width axes of the input. Be careful not to modify the original
input x directly. | |||||
Returns a tuple of: | |||||
- out: Output data, of shape (N, F, H', W') where H' and W' are given by | |||||
H' = 1 + (H + 2 * pad - HH) / stride | |||||
W' = 1 + (W + 2 * pad - WW) / stride | |||||
- cache: (x, w, b, conv_param) | |||||
""" | |||||
out = None | |||||
########################################################################### | |||||
# TODO: Implement the convolutional forward pass.
# Hint: you can use np.pad to zero-pad the input.
########################################################################### | |||||
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** | |||||
pass | |||||
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** | |||||
########################################################################### | |||||
# END OF YOUR CODE # | |||||
########################################################################### | |||||
cache = (x, w, b, conv_param) | |||||
return out, cache | |||||
def conv_backward_naive(dout, cache): | |||||
""" | |||||
A naive implementation of the backward pass for a convolutional layer. | |||||
Inputs: | |||||
- dout: Upstream derivatives. | |||||
- cache: A tuple of (x, w, b, conv_param) as in conv_forward_naive | |||||
Returns a tuple of: | |||||
- dx: Gradient with respect to x | |||||
- dw: Gradient with respect to w | |||||
- db: Gradient with respect to b | |||||
""" | |||||
dx, dw, db = None, None, None | |||||
########################################################################### | |||||
# TODO: Implement the convolutional backward pass.
########################################################################### | |||||
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** | |||||
pass | |||||
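# One possible naive sketch that mirrors the forward loops: gradients are
# accumulated into a padded buffer and the padding is cropped at the end.
x, w, b, conv_param = cache
stride, pad = conv_param['stride'], conv_param['pad']
N, C, H, W = x.shape
F, _, HH, WW = w.shape
_, _, H_out, W_out = dout.shape
x_pad = np.pad(x, ((0, 0), (0, 0), (pad, pad), (pad, pad)), mode='constant')
dx_pad = np.zeros_like(x_pad)
dw = np.zeros_like(w)
db = dout.sum(axis=(0, 2, 3))
for n in range(N):
    for f in range(F):
        for i in range(H_out):
            for j in range(W_out):
                hs, ws = i * stride, j * stride
                window = x_pad[n, :, hs:hs + HH, ws:ws + WW]
                dw[f] += window * dout[n, f, i, j]
                dx_pad[n, :, hs:hs + HH, ws:ws + WW] += w[f] * dout[n, f, i, j]
dx = dx_pad[:, :, pad:pad + H, pad:pad + W]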
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** | |||||
########################################################################### | |||||
# END OF YOUR CODE # | |||||
########################################################################### | |||||
return dx, dw, db | |||||
def max_pool_forward_naive(x, pool_param): | |||||
""" | |||||
A naive implementation of the forward pass for a max-pooling layer. | |||||
Inputs: | |||||
- x: Input data, of shape (N, C, H, W) | |||||
- pool_param: dictionary with the following keys: | |||||
- 'pool_height': The height of each pooling region | |||||
- 'pool_width': The width of each pooling region | |||||
- 'stride': The distance between adjacent pooling regions | |||||
No padding is necessary here; the output size is given below.
Returns a tuple of: | |||||
- out: Output data, of shape (N, C, H', W') where H' and W' are given by | |||||
H' = 1 + (H - pool_height) / stride | |||||
W' = 1 + (W - pool_width) / stride | |||||
- cache: (x, pool_param) | |||||
""" | |||||
out = None | |||||
########################################################################### | |||||
# TODO: Implement the max-pooling forward pass.
########################################################################### | |||||
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** | |||||
pass | |||||
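# One possible naive sketch: take the maximum over each pooling window.
N, C, H, W = x.shape
pool_h, pool_w = pool_param['pool_height'], pool_param['pool_width']
stride = pool_param['stride']
H_out = 1 + (H - pool_h) // stride
W_out = 1 + (W - pool_w) // stride
out = np.zeros((N, C, H_out, W_out))
for i in range(H_out):
    for j in range(W_out):
        window = x[:, :, i * stride:i * stride + pool_h,
                   j * stride:j * stride + pool_w]
        out[:, :, i, j] = window.max(axis=(2, 3))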
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** | |||||
########################################################################### | |||||
# END OF YOUR CODE # | |||||
########################################################################### | |||||
cache = (x, pool_param) | |||||
return out, cache | |||||
def max_pool_backward_naive(dout, cache): | |||||
""" | |||||
A naive implementation of the backward pass for a max-pooling layer. | |||||
Inputs: | |||||
- dout: Upstream derivatives | |||||
- cache: A tuple of (x, pool_param) as in the forward pass. | |||||
Returns: | |||||
- dx: Gradient with respect to x | |||||
""" | |||||
dx = None | |||||
########################################################################### | |||||
# TODO: Implement the max-pooling backward pass.
########################################################################### | |||||
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** | |||||
pass | |||||
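# One possible naive sketch: route each upstream gradient back to the
# location of the maximum within its pooling window.
x, pool_param = cache
N, C, H, W = x.shape
pool_h, pool_w = pool_param['pool_height'], pool_param['pool_width']
stride = pool_param['stride']
_, _, H_out, W_out = dout.shape
dx = np.zeros_like(x)
for n in range(N):
    for c in range(C):
        for i in range(H_out):
            for j in range(W_out):
                window = x[n, c, i * stride:i * stride + pool_h,
                           j * stride:j * stride + pool_w]
                r, col = np.unravel_index(np.argmax(window), window.shape)
                dx[n, c, i * stride + r, j * stride + col] += dout[n, c, i, j]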
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** | |||||
########################################################################### | |||||
# END OF YOUR CODE # | |||||
########################################################################### | |||||
return dx | |||||
def spatial_batchnorm_forward(x, gamma, beta, bn_param): | |||||
""" | |||||
Computes the forward pass for spatial batch normalization. | |||||
Inputs: | |||||
- x: Input data of shape (N, C, H, W) | |||||
- gamma: Scale parameter, of shape (C,) | |||||
- beta: Shift parameter, of shape (C,) | |||||
- bn_param: Dictionary with the following keys: | |||||
- mode: 'train' or 'test'; required | |||||
- eps: Constant for numeric stability | |||||
- momentum: Constant for running mean / variance. momentum=0 means that | |||||
old information is discarded completely at every time step, while | |||||
momentum=1 means that new information is never incorporated. The | |||||
default of momentum=0.9 should work well in most situations. | |||||
- running_mean: Array of shape (C,) giving running mean of features
- running_var: Array of shape (C,) giving running variance of features
Returns a tuple of: | |||||
- out: Output data, of shape (N, C, H, W) | |||||
- cache: Values needed for the backward pass | |||||
""" | |||||
out, cache = None, None | |||||
########################################################################### | |||||
# TODO: Implement the forward pass for spatial batch normalization.
#
# HINT: You can implement spatial batch normalization by calling the
# vanilla version of batch normalization you implemented above. Your
# implementation should be very short; ours is less than five lines.
########################################################################### | |||||
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** | |||||
pass | |||||
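# A short sketch assuming the batchnorm_forward defined earlier in this file
# (as the hint suggests): fold N, H, W into one axis so each channel is
# normalized over all examples and spatial positions.
N, C, H, W = x.shape
x_flat = x.transpose(0, 2, 3, 1).reshape(-1, C)
out_flat, cache = batchnorm_forward(x_flat, gamma, beta, bn_param)
out = out_flat.reshape(N, H, W, C).transpose(0, 3, 1, 2)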
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** | |||||
########################################################################### | |||||
# END OF YOUR CODE # | |||||
########################################################################### | |||||
return out, cache | |||||
def spatial_batchnorm_backward(dout, cache): | |||||
""" | |||||
Computes the backward pass for spatial batch normalization. | |||||
Inputs: | |||||
- dout: Upstream derivatives, of shape (N, C, H, W) | |||||
- cache: Values from the forward pass | |||||
Returns a tuple of: | |||||
- dx: Gradient with respect to inputs, of shape (N, C, H, W) | |||||
- dgamma: Gradient with respect to scale parameter, of shape (C,) | |||||
- dbeta: Gradient with respect to shift parameter, of shape (C,) | |||||
""" | |||||
dx, dgamma, dbeta = None, None, None | |||||
########################################################################### | |||||
# TODO: Implement the backward pass for spatial batch normalization.
#
# HINT: You can implement spatial batch normalization by calling the
# vanilla version of batch normalization you implemented above. Your
# implementation should be very short; ours is less than five lines.
########################################################################### | |||||
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** | |||||
pass | |||||
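# A short sketch assuming the batchnorm_backward defined earlier in this file
# and the (N*H*W, C) reshaping used in the forward sketch above.
N, C, H, W = dout.shape
dout_flat = dout.transpose(0, 2, 3, 1).reshape(-1, C)
dx_flat, dgamma, dbeta = batchnorm_backward(dout_flat, cache)
dx = dx_flat.reshape(N, H, W, C).transpose(0, 3, 1, 2)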
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** | |||||
########################################################################### | |||||
# END OF YOUR CODE # | |||||
########################################################################### | |||||
return dx, dgamma, dbeta | |||||
def spatial_groupnorm_forward(x, gamma, beta, G, gn_param): | |||||
""" | |||||
Computes the forward pass for spatial group normalization. | |||||
In contrast to layer normalization, group normalization splits each entry | |||||
in the data into G contiguous pieces, which it then normalizes independently. | |||||
Per-feature shifting and scaling are then applied to the data, in a manner
identical to that of batch normalization and layer normalization.
Inputs: | |||||
- x: Input data of shape (N, C, H, W) | |||||
- gamma: Scale parameter, of shape (C,) | |||||
- beta: Shift parameter, of shape (C,) | |||||
- G: Integer number of groups to split into, should be a divisor of C
- gn_param: Dictionary with the following keys: | |||||
- eps: Constant for numeric stability | |||||
Returns a tuple of: | |||||
- out: Output data, of shape (N, C, H, W) | |||||
- cache: Values needed for the backward pass | |||||
""" | |||||
out, cache = None, None | |||||
eps = gn_param.get('eps',1e-5) | |||||
########################################################################### | |||||
# TODO: Implement the forward pass for spatial group normalization.
# This will be extremely similar to the layer normalization implementation.
# In particular, think about how you can transform the matrix so that the
# bulk of the code is similar to both train-time batch normalization and
# layer normalization!
########################################################################### | |||||
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** | |||||
pass | |||||
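# One possible sketch: normalize each of the G channel groups over its
# channels and spatial positions, then apply the per-channel gamma/beta.
# The cache layout below is only what the backward sketch further down
# expects; it is not prescribed by the assignment.
N, C, H, W = x.shape
x_group = x.reshape(N, G, C // G, H, W)
mean = x_group.mean(axis=(2, 3, 4), keepdims=True)
var = x_group.var(axis=(2, 3, 4), keepdims=True)
std = np.sqrt(var + eps)
x_hat = ((x_group - mean) / std).reshape(N, C, H, W)
gamma_r = gamma.reshape(1, C, 1, 1)
out = gamma_r * x_hat + beta.reshape(1, C, 1, 1)
cache = (x_hat, gamma_r, std, G)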
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** | |||||
########################################################################### | |||||
# END OF YOUR CODE # | |||||
########################################################################### | |||||
return out, cache | |||||
def spatial_groupnorm_backward(dout, cache): | |||||
""" | |||||
Computes the backward pass for spatial group normalization. | |||||
Inputs: | |||||
- dout: Upstream derivatives, of shape (N, C, H, W) | |||||
- cache: Values from the forward pass | |||||
Returns a tuple of: | |||||
- dx: Gradient with respect to inputs, of shape (N, C, H, W) | |||||
- dgamma: Gradient with respect to scale parameter, of shape (C,) | |||||
- dbeta: Gradient with respect to shift parameter, of shape (C,) | |||||
""" | |||||
dx, dgamma, dbeta = None, None, None | |||||
########################################################################### | |||||
# TODO: Implement the backward pass for spatial group normalization.
# This will be extremely similar to the layer normalization implementation.
########################################################################### | |||||
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** | |||||
pass | |||||
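# One possible sketch that pairs with the forward sketch above, reusing the
# layer-norm style backward formula within each group; dgamma/dbeta follow
# the (C,) shapes given in the docstring.
x_hat, gamma_r, std, G = cache
N, C, H, W = dout.shape
dgamma = (dout * x_hat).sum(axis=(0, 2, 3))
dbeta = dout.sum(axis=(0, 2, 3))
dxhat = (dout * gamma_r).reshape(N, G, C // G, H, W)
xhat_g = x_hat.reshape(N, G, C // G, H, W)
dx = (dxhat - dxhat.mean(axis=(2, 3, 4), keepdims=True)
      - xhat_g * (dxhat * xhat_g).mean(axis=(2, 3, 4), keepdims=True)) / std
dx = dx.reshape(N, C, H, W)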
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** | |||||
########################################################################### | |||||
# END OF YOUR CODE # | |||||
########################################################################### | |||||
return dx, dgamma, dbeta | |||||
def svm_loss(x, y): | |||||
""" | |||||
Computes the loss and gradient for multiclass SVM classification.
Inputs: | |||||
- x: Input data, of shape (N, C) where x[i, j] is the score for the jth | |||||
class for the ith input. | |||||
- y: Vector of labels, of shape (N,) where y[i] is the label for x[i] and | |||||
0 <= y[i] < C | |||||
Returns a tuple of: | |||||
- loss: Scalar giving the loss | |||||
- dx: Gradient of the loss with respect to x | |||||
""" | |||||
N = x.shape[0] | |||||
correct_class_scores = x[np.arange(N), y] | |||||
margins = np.maximum(0, x - correct_class_scores[:, np.newaxis] + 1.0) | |||||
margins[np.arange(N), y] = 0 | |||||
loss = np.sum(margins) / N | |||||
num_pos = np.sum(margins > 0, axis=1) | |||||
dx = np.zeros_like(x) | |||||
dx[margins > 0] = 1 | |||||
dx[np.arange(N), y] -= num_pos | |||||
dx /= N | |||||
return loss, dx | |||||
def softmax_loss(x, y): | |||||
""" | |||||
Computes the loss and gradient for softmax classification. | |||||
Inputs: | |||||
- x: Input data, of shape (N, C) where x[i, j] is the score for the jth | |||||
class for the ith input. | |||||
- y: Vector of labels, of shape (N,) where y[i] is the label for x[i] and | |||||
0 <= y[i] < C | |||||
Returns a tuple of: | |||||
- loss: Scalar giving the loss | |||||
- dx: Gradient of the loss with respect to x | |||||
""" | |||||
shifted_logits = x - np.max(x, axis=1, keepdims=True) | |||||
Z = np.sum(np.exp(shifted_logits), axis=1, keepdims=True) | |||||
log_probs = shifted_logits - np.log(Z) | |||||
probs = np.exp(log_probs) | |||||
N = x.shape[0] | |||||
loss = -np.sum(log_probs[np.arange(N), y]) / N | |||||
dx = probs.copy() | |||||
dx[np.arange(N), y] -= 1 | |||||
dx /= N | |||||
return loss, dx |
@ -0,0 +1,159 @@ | |||||
import numpy as np | |||||
""" | |||||
This file implements various first-order update rules that are commonly used | |||||
for training neural networks. Each update rule accepts current weights and the | |||||
gradient of the loss with respect to those weights and produces the next set of | |||||
weights. Each update rule has the same interface: | |||||
def update(w, dw, config=None): | |||||
Inputs: | |||||
- w: A numpy array giving the current weights. | |||||
- dw: A numpy array of the same shape as w giving the gradient of the | |||||
loss with respect to w. | |||||
- config: A dictionary containing hyperparameter values such as learning | |||||
rate, momentum, etc. If the update rule requires caching values over many | |||||
iterations, then config will also hold these cached values. | |||||
Returns: | |||||
- next_w: The next point after the update. | |||||
- config: The config dictionary to be passed to the next iteration of the | |||||
update rule. | |||||
NOTE: For most update rules, the default learning rate will probably not | |||||
perform well; however the default values of the other hyperparameters should | |||||
work well for a variety of different problems. | |||||
For efficiency, update rules may perform in-place updates, mutating w and | |||||
setting next_w equal to w. | |||||
""" | |||||
def sgd(w, dw, config=None): | |||||
""" | |||||
Performs vanilla stochastic gradient descent. | |||||
config format: | |||||
- learning_rate: Scalar learning rate. | |||||
""" | |||||
if config is None: config = {} | |||||
config.setdefault('learning_rate', 1e-2) | |||||
w -= config['learning_rate'] * dw | |||||
return w, config | |||||
def sgd_momentum(w, dw, config=None): | |||||
""" | |||||
Performs stochastic gradient descent with momentum. | |||||
config format: | |||||
- learning_rate: Scalar learning rate. | |||||
- momentum: Scalar between 0 and 1 giving the momentum value. | |||||
Setting momentum = 0 reduces to sgd. | |||||
- velocity: A numpy array of the same shape as w and dw used to store a | |||||
moving average of the gradients. | |||||
""" | |||||
if config is None: config = {} | |||||
config.setdefault('learning_rate', 1e-2) | |||||
config.setdefault('momentum', 0.9) | |||||
v = config.get('velocity', np.zeros_like(w)) | |||||
next_w = None | |||||
########################################################################### | |||||
# TODO: Implement the momentum update formula. Store the updated value in # | |||||
# the next_w variable. You should also use and update the velocity v. # | |||||
########################################################################### | |||||
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** | |||||
pass | |||||
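# A minimal sketch of the classic momentum update.
v = config['momentum'] * v - config['learning_rate'] * dw
next_w = w + v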
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** | |||||
########################################################################### | |||||
# END OF YOUR CODE # | |||||
########################################################################### | |||||
config['velocity'] = v | |||||
return next_w, config | |||||
def rmsprop(w, dw, config=None): | |||||
""" | |||||
Uses the RMSProp update rule, which uses a moving average of squared | |||||
gradient values to set adaptive per-parameter learning rates. | |||||
config format: | |||||
- learning_rate: Scalar learning rate. | |||||
- decay_rate: Scalar between 0 and 1 giving the decay rate for the squared | |||||
gradient cache. | |||||
- epsilon: Small scalar used for smoothing to avoid dividing by zero. | |||||
- cache: Moving average of second moments of gradients. | |||||
""" | |||||
if config is None: config = {} | |||||
config.setdefault('learning_rate', 1e-2) | |||||
config.setdefault('decay_rate', 0.99) | |||||
config.setdefault('epsilon', 1e-8) | |||||
config.setdefault('cache', np.zeros_like(w)) | |||||
next_w = None | |||||
########################################################################### | |||||
# TODO: Implement the RMSprop update formula, storing the next value of w # | |||||
# in the next_w variable. Don't forget to update cache value stored in # | |||||
# config['cache']. # | |||||
########################################################################### | |||||
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** | |||||
pass | |||||
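# A minimal sketch of the RMSProp update.
decay, eps = config['decay_rate'], config['epsilon']
cache = decay * config['cache'] + (1 - decay) * dw ** 2
config['cache'] = cache
next_w = w - config['learning_rate'] * dw / (np.sqrt(cache) + eps)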
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** | |||||
########################################################################### | |||||
# END OF YOUR CODE # | |||||
########################################################################### | |||||
return next_w, config | |||||
def adam(w, dw, config=None): | |||||
""" | |||||
Uses the Adam update rule, which incorporates moving averages of both the | |||||
gradient and its square and a bias correction term. | |||||
config format: | |||||
- learning_rate: Scalar learning rate. | |||||
- beta1: Decay rate for moving average of first moment of gradient. | |||||
- beta2: Decay rate for moving average of second moment of gradient. | |||||
- epsilon: Small scalar used for smoothing to avoid dividing by zero. | |||||
- m: Moving average of gradient. | |||||
- v: Moving average of squared gradient. | |||||
- t: Iteration number. | |||||
""" | |||||
if config is None: config = {} | |||||
config.setdefault('learning_rate', 1e-3) | |||||
config.setdefault('beta1', 0.9) | |||||
config.setdefault('beta2', 0.999) | |||||
config.setdefault('epsilon', 1e-8) | |||||
config.setdefault('m', np.zeros_like(w)) | |||||
config.setdefault('v', np.zeros_like(w)) | |||||
config.setdefault('t', 0) | |||||
next_w = None | |||||
########################################################################### | |||||
# TODO: Implement the Adam update formula, storing the next value of w in # | |||||
# the next_w variable. Don't forget to update the m, v, and t variables # | |||||
# stored in config. # | |||||
# # | |||||
# NOTE: In order to match the reference output, please modify t _before_ # | |||||
# using it in any calculations. # | |||||
########################################################################### | |||||
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** | |||||
pass | |||||
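# A minimal sketch of the Adam update; t is incremented before use, as the
# note above requires.
beta1, beta2, eps = config['beta1'], config['beta2'], config['epsilon']
config['t'] += 1
t = config['t']
m = beta1 * config['m'] + (1 - beta1) * dw
v = beta2 * config['v'] + (1 - beta2) * dw ** 2
config['m'], config['v'] = m, v
m_hat = m / (1 - beta1 ** t)
v_hat = v / (1 - beta2 ** t)
next_w = w - config['learning_rate'] * m_hat / (np.sqrt(v_hat) + eps)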
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)***** | |||||
########################################################################### | |||||
# END OF YOUR CODE # | |||||
########################################################################### | |||||
return next_w, config |
@ -0,0 +1,14 @@ | |||||
from distutils.core import setup | |||||
from distutils.extension import Extension | |||||
from Cython.Build import cythonize | |||||
import numpy | |||||
extensions = [ | |||||
Extension('im2col_cython', ['im2col_cython.pyx'], | |||||
include_dirs = [numpy.get_include()] | |||||
), | |||||
] | |||||
setup( | |||||
ext_modules = cythonize(extensions), | |||||
) |
@ -0,0 +1,299 @@ | |||||
from __future__ import print_function, division | |||||
from future import standard_library | |||||
standard_library.install_aliases() | |||||
from builtins import range | |||||
from builtins import object | |||||
import os | |||||
import pickle as pickle | |||||
import numpy as np | |||||
from daseCV import optim | |||||
class Solver(object): | |||||
""" | |||||
A Solver encapsulates all the logic necessary for training models.
The Solver performs stochastic gradient descent using the different update
rules defined in optim.py.
The solver accepts both training and validation data and labels, so it can
periodically check classification accuracy on the training and validation
data to watch out for overfitting.
To train a model, you will first construct a Solver instance, passing the
model, the dataset, and hyperparameters (learning rate, batch size, etc.)
to the constructor. You will then call the train() method to train the model.
After training, the model parameters that performed best on the validation
set are stored in model.params. In addition, solver.loss_history holds the
history of training losses, while solver.train_acc_history and
solver.val_acc_history hold the model's accuracy on the training and
validation sets at each epoch.
Example usage might look something like this:
data = { | |||||
'X_train': # training data | |||||
'y_train': # training labels | |||||
'X_val': # validation data | |||||
'y_val': # validation labels | |||||
} | |||||
model = MyAwesomeModel(hidden_size=100, reg=10) | |||||
solver = Solver(model, data, | |||||
update_rule='sgd', | |||||
optim_config={ | |||||
'learning_rate': 1e-3, | |||||
}, | |||||
lr_decay=0.95, | |||||
num_epochs=10, batch_size=100, | |||||
print_every=100) | |||||
solver.train() | |||||
A Solver works on a model object that must conform to the following API: | |||||
- model.params must be a dictionary mapping string parameter names to numpy | |||||
arrays containing parameter values. | |||||
- model.loss(X, y) must be a function that computes training-time loss and | |||||
gradients, and test-time classification scores, with the following inputs | |||||
and outputs: | |||||
Inputs: | |||||
- X: Array giving a minibatch of input data of shape (N, d_1, ..., d_k) | |||||
- y: Array of labels, of shape (N,) giving labels for X where y[i] is the | |||||
label for X[i]. | |||||
Returns: | |||||
If y is None, run a test-time forward pass and return: | |||||
- scores: Array of shape (N, C) giving classification scores for X where | |||||
scores[i, c] gives the score of class c for X[i]. | |||||
If y is not None, run a training time forward and backward pass and | |||||
return a tuple of: | |||||
- loss: Scalar giving the loss | |||||
- grads: Dictionary with the same keys as self.params mapping parameter | |||||
names to gradients of the loss with respect to those parameters. | |||||
""" | |||||
def __init__(self, model, data, **kwargs): | |||||
""" | |||||
Construct a new Solver instance. | |||||
Required arguments: | |||||
- model: A model object conforming to the API described above | |||||
- data: A dictionary of training and validation data containing: | |||||
'X_train': Array, shape (N_train, d_1, ..., d_k) of training images | |||||
'X_val': Array, shape (N_val, d_1, ..., d_k) of validation images | |||||
'y_train': Array, shape (N_train,) of labels for training images | |||||
'y_val': Array, shape (N_val,) of labels for validation images | |||||
Optional arguments: | |||||
- update_rule: A string giving the name of an update rule in optim.py. | |||||
Default is 'sgd'. | |||||
- optim_config: A dictionary containing hyperparameters that will be | |||||
passed to the chosen update rule. Each update rule requires different | |||||
hyperparameters (see optim.py) but all update rules require a | |||||
'learning_rate' parameter so that should always be present. | |||||
- lr_decay: A scalar for learning rate decay; after each epoch the | |||||
learning rate is multiplied by this value. | |||||
- batch_size: Size of minibatches used to compute loss and gradient | |||||
during training. | |||||
- num_epochs: The number of epochs to run for during training. | |||||
- print_every: Integer; training losses will be printed every | |||||
print_every iterations. | |||||
- verbose: Boolean; if set to false then no output will be printed | |||||
during training. | |||||
- num_train_samples: Number of training samples used to check training | |||||
accuracy; default is 1000; set to None to use entire training set. | |||||
- num_val_samples: Number of validation samples to use to check val | |||||
accuracy; default is None, which uses the entire validation set. | |||||
- checkpoint_name: If not None, then save model checkpoints here every | |||||
epoch. | |||||
""" | |||||
self.model = model | |||||
self.X_train = data['X_train'] | |||||
self.y_train = data['y_train'] | |||||
self.X_val = data['X_val'] | |||||
self.y_val = data['y_val'] | |||||
# Unpack keyword arguments | |||||
self.update_rule = kwargs.pop('update_rule', 'sgd') | |||||
self.optim_config = kwargs.pop('optim_config', {}) | |||||
self.lr_decay = kwargs.pop('lr_decay', 1.0) | |||||
self.batch_size = kwargs.pop('batch_size', 100) | |||||
self.num_epochs = kwargs.pop('num_epochs', 10) | |||||
self.num_train_samples = kwargs.pop('num_train_samples', 1000) | |||||
self.num_val_samples = kwargs.pop('num_val_samples', None) | |||||
self.checkpoint_name = kwargs.pop('checkpoint_name', None) | |||||
self.print_every = kwargs.pop('print_every', 10) | |||||
self.verbose = kwargs.pop('verbose', True) | |||||
# Throw an error if there are extra keyword arguments | |||||
if len(kwargs) > 0: | |||||
extra = ', '.join('"%s"' % k for k in list(kwargs.keys())) | |||||
raise ValueError('Unrecognized arguments %s' % extra) | |||||
# Make sure the update rule exists, then replace the string | |||||
# name with the actual function | |||||
if not hasattr(optim, self.update_rule): | |||||
raise ValueError('Invalid update_rule "%s"' % self.update_rule) | |||||
self.update_rule = getattr(optim, self.update_rule) | |||||
self._reset() | |||||
def _reset(self): | |||||
""" | |||||
Set up some book-keeping variables for optimization. Don't call this | |||||
manually. | |||||
""" | |||||
# Set up some variables for book-keeping | |||||
self.epoch = 0 | |||||
self.best_val_acc = 0 | |||||
self.best_params = {} | |||||
self.loss_history = [] | |||||
self.train_acc_history = [] | |||||
self.val_acc_history = [] | |||||
# Make a deep copy of the optim_config for each parameter | |||||
self.optim_configs = {} | |||||
for p in self.model.params: | |||||
d = {k: v for k, v in self.optim_config.items()} | |||||
self.optim_configs[p] = d | |||||
def _step(self): | |||||
""" | |||||
Make a single gradient update. This is called by train() and should not | |||||
be called manually. | |||||
""" | |||||
# Make a minibatch of training data | |||||
num_train = self.X_train.shape[0] | |||||
batch_mask = np.random.choice(num_train, self.batch_size) | |||||
X_batch = self.X_train[batch_mask] | |||||
y_batch = self.y_train[batch_mask] | |||||
# Compute loss and gradient | |||||
loss, grads = self.model.loss(X_batch, y_batch) | |||||
self.loss_history.append(loss) | |||||
# Perform a parameter update | |||||
for p, w in self.model.params.items(): | |||||
dw = grads[p] | |||||
config = self.optim_configs[p] | |||||
next_w, next_config = self.update_rule(w, dw, config) | |||||
self.model.params[p] = next_w | |||||
self.optim_configs[p] = next_config | |||||
def _save_checkpoint(self): | |||||
if self.checkpoint_name is None: return | |||||
checkpoint = { | |||||
'model': self.model, | |||||
'update_rule': self.update_rule, | |||||
'lr_decay': self.lr_decay, | |||||
'optim_config': self.optim_config, | |||||
'batch_size': self.batch_size, | |||||
'num_train_samples': self.num_train_samples, | |||||
'num_val_samples': self.num_val_samples, | |||||
'epoch': self.epoch, | |||||
'loss_history': self.loss_history, | |||||
'train_acc_history': self.train_acc_history, | |||||
'val_acc_history': self.val_acc_history, | |||||
} | |||||
filename = '%s_epoch_%d.pkl' % (self.checkpoint_name, self.epoch) | |||||
if self.verbose: | |||||
print('Saving checkpoint to "%s"' % filename) | |||||
with open(filename, 'wb') as f: | |||||
pickle.dump(checkpoint, f) | |||||
def check_accuracy(self, X, y, num_samples=None, batch_size=100): | |||||
""" | |||||
Check accuracy of the model on the provided data. | |||||
Inputs: | |||||
- X: Array of data, of shape (N, d_1, ..., d_k) | |||||
- y: Array of labels, of shape (N,) | |||||
- num_samples: If not None, subsample the data and only test the model | |||||
on num_samples datapoints. | |||||
- batch_size: Split X and y into batches of this size to avoid using | |||||
too much memory. | |||||
Returns: | |||||
- acc: Scalar giving the fraction of instances that were correctly | |||||
classified by the model. | |||||
""" | |||||
# Maybe subsample the data | |||||
N = X.shape[0] | |||||
if num_samples is not None and N > num_samples: | |||||
mask = np.random.choice(N, num_samples) | |||||
N = num_samples | |||||
X = X[mask] | |||||
y = y[mask] | |||||
# Compute predictions in batches | |||||
num_batches = N // batch_size | |||||
if N % batch_size != 0: | |||||
num_batches += 1 | |||||
y_pred = [] | |||||
for i in range(num_batches): | |||||
start = i * batch_size | |||||
end = (i + 1) * batch_size | |||||
scores = self.model.loss(X[start:end]) | |||||
y_pred.append(np.argmax(scores, axis=1)) | |||||
y_pred = np.hstack(y_pred) | |||||
acc = np.mean(y_pred == y) | |||||
return acc | |||||
def train(self): | |||||
""" | |||||
Run optimization to train the model. | |||||
""" | |||||
num_train = self.X_train.shape[0] | |||||
iterations_per_epoch = max(num_train // self.batch_size, 1) | |||||
num_iterations = self.num_epochs * iterations_per_epoch | |||||
for t in range(num_iterations): | |||||
self._step() | |||||
# Maybe print training loss | |||||
if self.verbose and t % self.print_every == 0: | |||||
print('(Iteration %d / %d) loss: %f' % ( | |||||
t + 1, num_iterations, self.loss_history[-1])) | |||||
# At the end of every epoch, increment the epoch counter and decay | |||||
# the learning rate. | |||||
epoch_end = (t + 1) % iterations_per_epoch == 0 | |||||
if epoch_end: | |||||
self.epoch += 1 | |||||
for k in self.optim_configs: | |||||
self.optim_configs[k]['learning_rate'] *= self.lr_decay | |||||
# Check train and val accuracy on the first iteration, the last | |||||
# iteration, and at the end of each epoch. | |||||
first_it = (t == 0) | |||||
last_it = (t == num_iterations - 1) | |||||
if first_it or last_it or epoch_end: | |||||
train_acc = self.check_accuracy(self.X_train, self.y_train, | |||||
num_samples=self.num_train_samples) | |||||
val_acc = self.check_accuracy(self.X_val, self.y_val, | |||||
num_samples=self.num_val_samples) | |||||
self.train_acc_history.append(train_acc) | |||||
self.val_acc_history.append(val_acc) | |||||
self._save_checkpoint() | |||||
if self.verbose: | |||||
print('(Epoch %d / %d) train acc: %f; val_acc: %f' % ( | |||||
self.epoch, self.num_epochs, train_acc, val_acc)) | |||||
# Keep track of the best model | |||||
if val_acc > self.best_val_acc: | |||||
self.best_val_acc = val_acc | |||||
self.best_params = {} | |||||
for k, v in self.model.params.items(): | |||||
self.best_params[k] = v.copy() | |||||
# At the end of training swap the best params into the model | |||||
self.model.params = self.best_params |
@ -0,0 +1,73 @@ | |||||
from builtins import range | |||||
from past.builtins import xrange | |||||
from math import sqrt, ceil | |||||
import numpy as np | |||||
def visualize_grid(Xs, ubound=255.0, padding=1): | |||||
""" | |||||
Reshape a 4D tensor of image data to a grid for easy visualization. | |||||
Inputs: | |||||
- Xs: Data of shape (N, H, W, C) | |||||
- ubound: Output grid will have values scaled to the range [0, ubound] | |||||
- padding: The number of blank pixels between elements of the grid | |||||
""" | |||||
(N, H, W, C) = Xs.shape | |||||
grid_size = int(ceil(sqrt(N))) | |||||
grid_height = H * grid_size + padding * (grid_size - 1) | |||||
grid_width = W * grid_size + padding * (grid_size - 1) | |||||
grid = np.zeros((grid_height, grid_width, C)) | |||||
next_idx = 0 | |||||
y0, y1 = 0, H | |||||
for y in range(grid_size): | |||||
x0, x1 = 0, W | |||||
for x in range(grid_size): | |||||
if next_idx < N: | |||||
img = Xs[next_idx] | |||||
low, high = np.min(img), np.max(img) | |||||
grid[y0:y1, x0:x1] = ubound * (img - low) / (high - low) | |||||
# grid[y0:y1, x0:x1] = Xs[next_idx] | |||||
next_idx += 1 | |||||
x0 += W + padding | |||||
x1 += W + padding | |||||
y0 += H + padding | |||||
y1 += H + padding | |||||
# grid_max = np.max(grid) | |||||
# grid_min = np.min(grid) | |||||
# grid = ubound * (grid - grid_min) / (grid_max - grid_min) | |||||
return grid | |||||
def vis_grid(Xs): | |||||
""" visualize a grid of images """ | |||||
(N, H, W, C) = Xs.shape | |||||
A = int(ceil(sqrt(N))) | |||||
G = np.ones((A*H+A, A*W+A, C), Xs.dtype) | |||||
G *= np.min(Xs) | |||||
n = 0 | |||||
for y in range(A): | |||||
for x in range(A): | |||||
if n < N: | |||||
G[y*H+y:(y+1)*H+y, x*W+x:(x+1)*W+x, :] = Xs[n,:,:,:] | |||||
n += 1 | |||||
# normalize to [0,1] | |||||
maxg = G.max() | |||||
ming = G.min() | |||||
G = (G - ming)/(maxg-ming) | |||||
return G | |||||
def vis_nn(rows): | |||||
""" visualize array of arrays of images """ | |||||
N = len(rows) | |||||
D = len(rows[0]) | |||||
H,W,C = rows[0][0].shape | |||||
Xs = rows[0][0] | |||||
G = np.ones((N*H+N, D*W+D, C), Xs.dtype) | |||||
for y in range(N): | |||||
for x in range(D): | |||||
G[y*H+y:(y+1)*H+y, x*W+x:(x+1)*W+x, :] = rows[y][x] | |||||
# normalize to [0,1] | |||||
maxg = G.max() | |||||
ming = G.min() | |||||
G = (G - ming)/(maxg-ming) | |||||
return G |