Hessian Computation using TensorFlow

TensorFlow has a function called tf.gradients() that computes gradients. In the past, I tried to compute the Hessian of a neural network objective function in Torch7 using torch-autograd, but it was somewhat cumbersome; there was no easy way to store and reshape parameters, because Lua uses tables for everything. Today, I'd like to do the same thing in TensorFlow. It should be much easier than in Torch7 thanks to symbolic differentiation.
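
Before diving in, here is a minimal warm-up showing what tf.gradients() does for a scalar function. The snippet is my own illustration, not part of the examples below, and uses the same 0.x-era API as the rest of this post:

import tensorflow as tf

x = tf.Variable(3.0)
y = x * x                      # y = x^2
dy_dx = tf.gradients(y, x)[0]  # symbolic derivative of y with respect to x, i.e. 2x
with tf.Session() as sess:
    sess.run(tf.initialize_all_variables())
    print(sess.run(dy_dx))     # prints 6.0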

Example 1: Quadratic function

We are going to use $f(x) = \frac{1}{2} x^T A x + b^T x + c$ as our first example for computing the Hessian. When $A$ is a symmetric matrix, the Hessian of $f$ should be equal to $A$.
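
A quick check on paper: differentiating once and twice gives
$$\nabla f(x) = \frac{1}{2}\left(A + A^T\right)x + b = Ax + b, \qquad \nabla^2 f(x) = \frac{1}{2}\left(A + A^T\right) = A,$$
where both simplifications use the symmetry of $A$.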

For simplicity, let us start with:
$$A = \left[ \begin{array}{rrr} 2 & 2 & 2 \\ 2 & 2 & 2 \\ 2 & 2 & 2 \end{array} \right] \quad b = \left[ \begin{array}{r} 3 \\ 3 \\ 3 \end{array} \right] \quad c = 1$$

The code below calculates the Hessian of $f(x)$.

import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import math
def getHessian(dim):
    # Each time getHessian is called, we create a new graph so that the default graph (which exists a priori) won't be filled with old ops.
    g = tf.Graph()
    with g.as_default():
        # First create placeholders for inputs: A, b, and c.
        A = tf.placeholder(tf.float32, shape=[dim, dim])
        b = tf.placeholder(tf.float32, shape=[dim, 1])
        c = tf.placeholder(tf.float32, shape=[1])
        # Define our variable
        x = tf.Variable(np.float32(np.repeat(1, dim).reshape(dim, 1)))
        # Construct the computational graph for the quadratic function: f(x) = 1/2 * x^T A x + b^T x + c
        fx = 0.5 * tf.matmul(tf.matmul(tf.transpose(x), A), x) + tf.matmul(tf.transpose(b), x) + c
        # Get the gradient of fx with respect to x
        dfx = tf.gradients(fx, x)[0]
        # Compute the Hessian
        for i in range(dim):
            # Take the i-th entry of the gradient vector dfx
            # tf.slice: https://www.tensorflow.org/versions/0.6.0/api_docs/python/array_ops.html#slice
            dfx_i = tf.slice(dfx, begin=[i, 0], size=[1, 1])
            # Feed it to tf.gradients to compute the second derivatives.
            # Since x is a vector and dfx_i is a scalar, this returns a vector: [d(dfx_i)/dx_1, ..., d(dfx_i)/dx_n]
            ddfx_i = tf.gradients(dfx_i, x)[0]  # whenever we use tf.gradients, make sure to get the actual tensor by putting [0] at the end
            if i == 0: hess = ddfx_i
            else: hess = tf.concat(1, [hess, ddfx_i])
            ## Instead of doing this, you can just append each element to a list, and then do tf.pack(list_object) to get the Hessian matrix.
            ## I'll use this alternative in the second example.
        # Before we execute the graph, we need to initialize all the variables we defined
        init_op = tf.initialize_all_variables()
        with tf.Session() as sess:
            sess.run(init_op)
            # We need to feed actual values into the computational graph that we created above.
            feed_dict = {A: np.float32(np.repeat(2, dim*dim).reshape(dim, dim)), b: np.float32(np.repeat(3, dim).reshape(dim, 1)), c: [1]}
            # sess.run() executes the graph. Here, "hess" will be calculated with the values in "feed_dict".
            print(sess.run(hess, feed_dict))
getHessian(3)
[[ 2.  2.  2.]
 [ 2.  2.  2.]
 [ 2.  2.  2.]]

We can see that the result of sess.run(hess, feed_dict) is indeed the desired value, $A$.
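
As an aside, later TensorFlow releases added a tf.hessians() helper that performs essentially this same element-by-element gradient loop for you. A minimal sketch, assuming a version that ships it (the 0.x release used here does not), dropped inside the graph built by getHessian above:

# Assumes a TensorFlow version that provides tf.hessians (not available in 0.x).
hess = tf.hessians(fx, x)[0]         # shape [dim, 1, dim, 1], since x has shape [dim, 1]
hess = tf.reshape(hess, [dim, dim])  # reshape into the usual [dim, dim] Hessian matrix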

Example 2: Multilayer Perceptron

Next, we'll try a small neural network model: a multilayer perceptron. We need to modify our getHessian function a little bit: we store all the parameters in one long vector and then slice it up according to the model architecture. Otherwise tf.gradients() would return a separate gradient per variable, and we couldn't assemble the full Hessian matrix. With n_input=3, n_hidden=4, and n_output=3 as below, the parameter vector has 3·4 + 4 + 4·3 + 3 = 31 entries, so we expect a 31 × 31 Hessian.

def getHessianMLP(n_input, n_hidden, n_output):
    batch_size = 1
    # Each time getHessianMLP is called, we create a new graph so that the default graph (which exists a priori) won't be filled with old ops.
    g = tf.Graph()
    with g.as_default():
        # First create placeholders for inputs and targets: x_input, y_target
        x_input = tf.placeholder(tf.float32, shape=[batch_size, n_input])
        y_target = tf.placeholder(tf.float32, shape=[batch_size, n_output])
        # Start constructing a computational graph for the multilayer perceptron
        ### Since we want to store the parameters as one long vector, we first define the whole vector below and then
        ### reshape slices of it later according to each layer's specification.
        parameters = tf.Variable(tf.concat(0, [tf.truncated_normal([n_input * n_hidden, 1]), tf.zeros([n_hidden, 1]),
                                               tf.truncated_normal([n_hidden * n_output, 1]), tf.zeros([n_output, 1])]))
        with tf.name_scope("hidden") as scope:
            idx_from = 0
            weights = tf.reshape(tf.slice(parameters, begin=[idx_from, 0], size=[n_input*n_hidden, 1]), [n_input, n_hidden])
            idx_from = idx_from + n_input*n_hidden
            biases = tf.reshape(tf.slice(parameters, begin=[idx_from, 0], size=[n_hidden, 1]), [n_hidden])  # tf.Variable(tf.truncated_normal([n_hidden]))
            hidden = tf.matmul(x_input, weights) + biases
        with tf.name_scope("linear") as scope:
            idx_from = idx_from + n_hidden
            weights = tf.reshape(tf.slice(parameters, begin=[idx_from, 0], size=[n_hidden*n_output, 1]), [n_hidden, n_output])
            idx_from = idx_from + n_hidden*n_output
            biases = tf.reshape(tf.slice(parameters, begin=[idx_from, 0], size=[n_output, 1]), [n_output])
            output = tf.nn.softmax(tf.matmul(hidden, weights) + biases)
        # Define the cross-entropy loss
        loss = -tf.reduce_sum(y_target * tf.log(output))
        ### Note: We can call tf.trainable_variables to get GraphKeys.TRAINABLE_VARIABLES
        ### because we are using g as our default graph inside the "with" scope.
        # Get trainable variables
        tvars = tf.trainable_variables()
        # Get the gradient of loss with respect to the parameters
        dloss_dw = tf.gradients(loss, tvars)[0]
        dim, _ = dloss_dw.get_shape()
        hess = []
        for i in range(dim):
            # tf.slice: https://www.tensorflow.org/versions/0.6.0/api_docs/python/array_ops.html#slice
            dfx_i = tf.slice(dloss_dw, begin=[i, 0], size=[1, 1])
            ddfx_i = tf.gradients(dfx_i, parameters)[0]  # whenever we use tf.gradients, make sure to get the actual tensor by putting [0] at the end
            hess.append(ddfx_i)
        hess = tf.squeeze(hess)
        init_op = tf.initialize_all_variables()
        with tf.Session() as sess:
            sess.run(init_op)
            feed_dict = {x_input: np.random.random([batch_size, n_input]), y_target: np.random.random([batch_size, n_output])}
            # print(sess.run(loss, feed_dict))
            print(hess.get_shape())
            print(sess.run(hess, feed_dict))
getHessianMLP(n_input=3,n_hidden=4,n_output=3)
(31, 31)
[[  2.19931314e-03  -1.42659002e-03   1.30957202e-03  -8.70158256e-04
    8.50890204e-03  -5.51932165e-03   5.06659178e-03  -3.36654740e-03
    7.25943921e-03  -4.70885402e-03   4.32260381e-03  -2.87219742e-03
    1.87662840e-02  -1.21727986e-02   1.11743081e-02  -7.42488075e-03
   -5.90046346e-02   8.59218910e-02  -2.69172415e-02  -1.75508182e-03
    3.87416431e-03  -2.11908249e-03  -5.98554593e-03   1.32124824e-02
   -7.22693698e-03   3.56099289e-03  -7.86052924e-03   4.29953635e-03
   -1.33406427e-02   2.94481106e-02  -1.61074679e-02]
 [ -1.42659002e-03   2.15499196e-03   2.23391340e-03   5.85207134e-04
   -5.51932165e-03   8.33742879e-03   8.64276756e-03   2.26410269e-03
   -4.70885402e-03   7.11314566e-03   7.37364776e-03   1.93163764e-03
   -1.21727986e-02   1.83881018e-02   1.90615226e-02   4.99345176e-03
   -2.40529864e-03  -1.63234770e-02   1.87287778e-02  -5.11422716e-02
    6.40287027e-02  -1.28864162e-02  -1.72071008e-03  -1.16775408e-02
    1.33982506e-02   1.02370558e-03   6.94734277e-03  -7.97104836e-03
   -3.83513537e-03  -2.60270163e-02   2.98621524e-02] ... [omitted] ]