import tensorflow as tf
def semi_conv_2d(inputs, **conv_kwargs):
    """Semi-convolutional operator (https://arxiv.org/abs/1807.10712).

    Applies a 1x1 convolution and adds normalized pixel coordinates to the
    result, so the embedding of a pixel depends on its spatial position.

    Args:
        inputs (Tensor): B x H x W x C feature tensor.
        **conv_kwargs: forwarded to tf.layers.conv2d; must include `filters`.
            NOTE(review): the coordinate offset has 2 channels, so the conv
            output is assumed to have 2 filters (or be broadcast-compatible)
            — confirm against callers.

    Returns:
        Tensor shaped like the convolution output, with normalized
        (row, col) coordinates added.
    """
    with tf.variable_scope('semi_conv_2d'):
        inputs = tf.layers.conv2d(inputs, kernel_size=1, **conv_kwargs)
        shape = tf.shape(inputs)
        height, width = shape[1], shape[2]
        # tf.arange does not exist and a Tensor cannot be iterated in a
        # Python loop; build the two spatial coordinate vectors explicitly,
        # normalized to [0, 1) as the original `/shape` intended
        rows = tf.cast(tf.range(height), inputs.dtype) / tf.cast(height, inputs.dtype)
        cols = tf.cast(tf.range(width), inputs.dtype) / tf.cast(width, inputs.dtype)
        grid_rows, grid_cols = tf.meshgrid(rows, cols, indexing='ij')
        # 1 x H x W x 2, broadcast over the batch dimension
        delta = tf.stack([grid_rows, grid_cols], axis=-1)[tf.newaxis]
        return inputs + delta
# Scratch cell: check the shape tf.one_hot produces when the depth is fed at
# run time and the one-hot axis is placed first.
a = tf.placeholder(dtype=tf.int32, shape=[])
b = tf.one_hot([[1, 9, 1, 0]], depth=a, axis=0)
with tf.Session() as sess:
    result = sess.run(b, feed_dict={a: 5})
    print(result.shape)
import numpy as np
# Scratch cell: pair each of the 9 value volumes with each of the 7 boolean
# masks — (1, 7, 4, 5, 6) * (9, 1, 4, 5, 6) broadcasts to (9, 7, 4, 5, 6).
a = np.random.rand(7, 4, 5, 6) > 0.5
b = np.random.rand(9, 4, 5, 6)
(a[np.newaxis] * b[:, np.newaxis]).shape
The loss is defined for the set of instances $\mathcal{S}$ within an image, but could also be extended to the set of instances within a mini-batch.
$$\mathcal{L}(\Psi|\mathbf{x}, \mathcal{S}) = \sum_{S \in \mathcal{S}} \frac{1}{\lvert S \rvert} \sum_{u \in S}\left\lVert\Psi_u(\mathbf{x}) - \frac{1}{\lvert S \rvert}\sum_{v \in S}\Psi_v(\mathbf{x})\right\rVert$$
Specifically for each instance $S$, the loss is the mean of the Euclidean distances between the embeddings for each pixel $u \in S$ and the mean embedding over all the pixels in that instance.
def semi_conv_loss(y_true, y_pred):
    """
    Implements equation 5 from https://arxiv.org/abs/1807.10712 for a mini-batch of images.

    Args:
        y_true (Tensor): sparse label tensor of shape batch_size x height x width,
            with a separate number for each instance present in the image.
            Requires that the values are consecutive integers starting from 0.
        y_pred (Tensor): prediction tensor of shape
            batch_size x height x width x channels.

    Returns:
        semi-convolutional loss (scalar Tensor).

    For each instance $S$ the loss is the mean Euclidean distance between
    the embedding of each pixel $u \\in S$ and the mean embedding over $S$.
    One-hot encoding pads every image to the same number of instances, so
    empty "instances" appear and dividing by $|S|$ must be guarded.  Using

        || u - v/q || = (1/q) || q*u - v ||

    the loss is rewritten as

        L = sum_S (1/|S|^2) sum_{u in S} || |S| * psi_u - sum_{v in S} psi_v ||

    which divides only once, and only for instances with $|S| > 0$.
    """
    # number of instance labels in the batch: labels are consecutive ints
    # from 0, so the count is max label + 1 (tf.max does not exist, and
    # depth=tf.reduce_max(y_true) would silently drop the last instance)
    n_inst_max = tf.reduce_max(y_true) + 1
    # batch_size x height x width -> n_inst_max x batch_size x height x width
    y_true_one_hot = tf.one_hot(y_true, depth=n_inst_max, axis=0)
    # (I, B, H, W, 1) * (1, B, H, W, C) -> (I, B, H, W, C): every pixel's
    # embedding, masked per instance (the axes must line up this way —
    # expanding y_true_one_hot in front does not broadcast against y_pred)
    y_pred_dense = y_true_one_hot[..., tf.newaxis] * y_pred[tf.newaxis]
    # collapse the instance and batch dims: (I*B) x H x W x C
    y_pred_dense = tf.reshape(
        y_pred_dense, tf.concat([[-1], tf.shape(y_pred_dense)[2:]], axis=0))
    # (I, B, H, W) -> (I*B) x H x W
    y_true_one_hot = tf.reshape(
        y_true_one_hot, tf.concat([[-1], tf.shape(y_true_one_hot)[2:]], axis=0))
    # pixels per instance |S|: (I*B)
    n_inst_pixels = tf.reduce_sum(y_true_one_hot, axis=[1, 2])
    # sum of embeddings per instance: (I*B) x 1 x 1 x C, kept for broadcasting
    embeds_sum = tf.reduce_sum(y_pred_dense, axis=[1, 2], keep_dims=True)
    # || |S| * psi_u - sum psi ||: reshape the pixel counts to (I*B, 1, 1, 1)
    # so they broadcast over height, width and channels (a bare (I*B,) vector
    # would wrongly align with the channel axis)
    counts = tf.reshape(n_inst_pixels, [-1, 1, 1, 1])
    dist = tf.norm(y_pred_dense * counts - embeds_sum, axis=-1)
    # keep only the distances for pixels that belong to the instance
    dist_masked = dist * y_true_one_hot
    # sum the distances within each instance: (I*B)
    dist_sum = tf.reduce_sum(dist_masked, axis=[1, 2])
    # divide by |S|^2 only where |S| > 0, skipping the padded empty instances
    has_inst_mask = tf.greater(n_inst_pixels, 0)
    losses = (tf.boolean_mask(dist_sum, has_inst_mask) /
              tf.boolean_mask(n_inst_pixels, has_inst_mask)**2)
    return tf.reduce_sum(losses)
Implements the evaluation metric described at https://www.kaggle.com/c/airbus-ship-detection#evaluation.
def iou_score(y_true, y_pred):
    """
    Implements the evaluation metric described at
    https://www.kaggle.com/c/airbus-ship-detection#evaluation.

    Args:
        y_true (Tensor): sparse ground-truth labels, batch_size x height x width,
            instances numbered with consecutive integers starting from 0.
            NOTE(review): if 0 denotes background it is counted as an instance
            here — confirm the labelling convention with callers.
        y_pred (Tensor): sparse predicted labels, same shape and convention.

    Returns:
        mean F2 score over the IoU thresholds 0.5, 0.55, ..., 0.95.
    """
    thresholds = tf.constant([0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95])
    beta = 2
    # label counts are max label + 1 (tf.max does not exist; the original
    # depth would also drop the highest label)
    n_inst_max = tf.reduce_max(y_true) + 1
    n_pred_max = tf.reduce_max(y_pred) + 1
    # B x H x W x I x 1
    y_true_one_hot = tf.one_hot(y_true, depth=n_inst_max, axis=-1)[..., tf.newaxis]
    # B x H x W x 1 x P
    y_pred_one_hot = tf.one_hot(y_pred, depth=n_pred_max, axis=-1)[..., tf.newaxis, :]
    # B x I x P -- the original multiplied the raw label maps instead of the
    # one-hot masks it had just built, which does not compute intersections
    intersection = tf.reduce_sum(y_true_one_hot * y_pred_one_hot, axis=[1, 2])
    # per-instance / per-prediction pixel areas: B x I x 1 and B x 1 x P
    true_area = tf.reduce_sum(y_true_one_hot, axis=[1, 2])
    pred_area = tf.reduce_sum(y_pred_one_hot, axis=[1, 2])
    # B x I x P
    union = true_area + pred_area - intersection
    # guard the 0/0 case for empty instance/prediction pairs
    iou_masked = intersection / tf.where(tf.greater(union, 0), union, tf.ones_like(union))
    # B x I x P x T
    match = tf.greater(iou_masked[..., tf.newaxis], thresholds)
    # B x I x T (tf.to_float32 does not exist; tf.to_float casts to float32)
    inst_match_at_thresh = tf.to_float(tf.reduce_any(match, axis=[-2]))
    # B x P x T
    pred_match_at_thresh = tf.to_float(tf.reduce_any(match, axis=[-3]))
    # validity masks (B x I, B x P) marking non-empty instances/predictions,
    # so one-hot padding does not inflate the FN and FP counts
    inst_valid = tf.to_float(tf.greater(true_area[..., 0], 0))
    pred_valid = tf.to_float(tf.greater(pred_area[:, 0], 0))
    # T
    tp_at_thresh = tf.reduce_sum(
        inst_match_at_thresh * inst_valid[..., tf.newaxis], axis=[0, 1])
    fn_at_thresh = tf.reduce_sum(
        (1 - inst_match_at_thresh) * inst_valid[..., tf.newaxis], axis=[0, 1])
    fp_at_thresh = tf.reduce_sum(
        (1 - pred_match_at_thresh) * pred_valid[..., tf.newaxis], axis=[0, 1])
    # F-beta with beta=2 weights recall over precision
    f2_numerator = (1 + beta**2) * tp_at_thresh
    f2_score = f2_numerator / (f2_numerator + (beta**2) * fn_at_thresh + fp_at_thresh)
    # average over the thresholds
    return tf.reduce_mean(f2_score, axis=0)