# Newton’s Method vs. Gradient descent

## Newton = Gradient + Hessian

See https://medium.com/@ranjeettate/optimal-learning-rate-from-the-hessian-examples-e89f8d1af977 for some background. See https://en.wikipedia.org/wiki/Newton%27s_method_in_optimization for a complete derivation of Newton’s Method and how it uses the second derivative (the Hessian) to improve Gradient descent.

Post in progress, but meanwhile, here are the pictures:

Here is the key part of the code for hybrid descent. Note that it includes both Gradient descent and Newton’s Method (or what I was calling Hessian descent), so I haven’t included the code for either one separately.

def hybrid_descent(foo, initial_point, iterations = 5, damping = 0.25):
    """Minimize ``foo`` with a hybrid of Newton's Method and gradient descent.

    At each iteration, the damped Newton (Hessian) step is taken when the
    Hessian "norm" of the gradient is positive — i.e. the local Hessian
    looks positive definite, so the Newton step is a descent direction —
    otherwise a damped plain-gradient step is taken.

    Parameters
    ----------
    foo : callable
        Scalar objective function of a point (a sequence of floats).
    initial_point : sequence of float
        Starting point; its length fixes the problem dimension.
    iterations : int
        Number of descent steps to perform (default 5).
    damping : float
        Damping factor for the gradient step (default 0.25); the Newton
        step uses 2 * damping.

    Returns
    -------
    pandas.DataFrame
        One row per iteration: iteration index, function value, point,
        gradient, method used ('Hessian' or 'Gradient'), the step taken,
        and the Hessian "norm" of the gradient.
    """
    # Use a concrete list so the point is reusable across iterations
    # (a Py2-style map object would be exhausted after one pass).
    point = list(initial_point)
    dim = len(point)

    # Gradient of foo (numdifftools callable), built once outside the loop.
    dfoo = nd.Gradient(foo)

    iterL = []
    for step in range(iterations):  # 'step', not 'iter', to avoid shadowing the builtin
        function_value = foo(point)

        # Gradient of foo at the current iteration point, as a column vector.
        gradient = tf.reshape(dfoo(point), shape=[dim, 1])

        # Practical gradient change in the variables; note the default damping = 0.25.
        delta_grad = -gradient * damping

        # Hessian (matrix of mixed partial derivatives) of foo at the current point.
        # BUG FIX: the original called nd.Hessian(f); 'f' is undefined here — it must be foo.
        hessian = tf.constant(nd.Hessian(foo)(point))

        # Inverse of the Hessian.
        inv_hessian = tf.matrix_inverse(hessian, adjoint=False)

        # Optimal (full Newton) vector change in the variables; note the '-'.
        optimal_delta = -tf.matmul(inv_hessian, gradient)

        # Practical Hessian change; note the Hessian damping = 2 * damping for gradient.
        delta_hess = optimal_delta * damping * 2

        # Hessian "norm" of the (negative) gradient:
        #   optimal_delta . (-gradient) = gradient^T Hessian^(-1) gradient, expected > 0
        # exactly when the (inverse) Hessian is positive definite there.
        hess_grad_sq = -tf.matmul(tf.reshape(optimal_delta, shape=[1, dim]), gradient)

        # Flatten everything back to 1-D vectors / a scalar for running and recording.
        gradient = tf.reshape(gradient, shape=[dim, ])
        delta_grad = tf.reshape(delta_grad, shape=[dim, ])
        delta_hess = tf.reshape(delta_hess, shape=[dim, ])
        hess_grad_sq = tf.reshape(hess_grad_sq, shape=[1, ])[0]

        with tf.Session() as sess:
            # sess.run(tf.global_variables_initializer())
            gradient = sess.run(gradient)
            hess_grad_sq = sess.run(hess_grad_sq)
            if hess_grad_sq > 0:
                # Hessian is usable here: take the (damped) Newton step.
                delta = sess.run(delta_hess)
                method = 'Hessian'
            else:
                # Fall back to plain damped gradient descent.
                delta = sess.run(delta_grad)
                method = 'Gradient'

        # List comprehensions, not map(): the records must hold concrete,
        # printable lists under Python 3.
        iterD = {"iteration": step,
                 "function_value": function_value,
                 "point": [round(x, 3) for x in point],
                 "gradient": [round(x, 3) for x in gradient],
                 "method": method,
                 "delta": [round(x, 3) for x in delta],
                 "Hessian_Gradient_Sq": hess_grad_sq}
        iterL.append(iterD)

        # Move to the next point.
        point = [point[i] + delta[i] for i in range(dim)]

    hybrid_descentDF = pd.DataFrame(iterL)
    return hybrid_descentDF