$$ \newcommand\ddf[2]{\dfrac{\mathrm{d} #1}{\mathrm{d} #2}} \newcommand\ppf[2]{\dfrac{\partial #1}{\partial #2}} \newcommand{\bE}{\mathbb{E}} \newcommand{\cE}{\mathcal{E}} \newcommand{\cF}{\mathcal{F}} \newcommand{\cJ}{\mathcal{J}} \newcommand{\cL}{\mathcal{L}} \newcommand{\cM}{\mathcal{M}} \newcommand{\cN}{\mathcal{N}} \newcommand{\cR}{\mathcal{R}} \newcommand{\cU}{\mathcal{U}} \newcommand{\vA}{\mathbf{A}} \newcommand{\vI}{\mathbf{I}} \newcommand{\vK}{\mathbf{K}} \newcommand{\vM}{\mathbf{M}} \newcommand{\vQ}{\mathbf{Q}} \newcommand{\vR}{\mathbf{R}} \newcommand{\vU}{\mathbf{U}} \newcommand{\vb}{\mathbf{b}} \newcommand{\vc}{\mathbf{c}} \newcommand{\vd}{\mathbf{d}} \newcommand{\vf}{\mathbf{f}} \newcommand{\vg}{\mathbf{g}} \newcommand{\vm}{\mathbf{m}} \newcommand{\vu}{\mathbf{u}} \newcommand{\vw}{\mathbf{w}} \newcommand{\vtt}{\boldsymbol{\theta}} \newcommand{\vtvt}{\boldsymbol{\vartheta}} \newcommand{\vtl}{\boldsymbol{\lambda}} \newcommand{\vtf}{\boldsymbol{\phi}} \newcommand{\vty}{\boldsymbol{\psi}} $$
Motivation¶
Previously ...
a Newton solve for $x$, $f(x,b)=x^8-x^2-b=0$, given $b$
- Implicit, because we cannot write $x$ as an analytical function of $b$.
an argmax function, e.g., $x=\arg\max f(x)$
- Non-differentiable, because an infinitesimal change in $f$ can result in a large, finite change in $x$.
How to backprop these functions?
But before that, when do we need to deal with these?
Example 1: All models are wrong, but some are useful.¶
- In traditional computational analysis, the physics cannot be precisely captured
- e.g., turbulence closure, constitutive relation modeling
- Uncertainty in prediction/design $\rightarrow$ Large safety factor and performance penalty
- ML can be embedded into traditional analysis for better predictive capability.
(Figure from Holland, Baeder, et al. 2019)
Example 2: Don't learn what you already know¶
- Some ML tasks involve steps that can be done by traditional analysis
- e.g., path planning and trajectory tracking in robotics
- Replacing these steps with pure NNs takes extra effort and usually does not extrapolate well
- Traditional analysis can be embedded into ML for better generalizability and robustness.
[1] Romero et al. Actor-Critic Model Predictive Control: Differentiable Optimization meets Reinforcement Learning. 2023.
Posing the Problem¶
From an NN perspective, a model problem is
$$ \boxed{\text{Given }\vtt}\rightarrow \boxed{\text{Solve }\min_{\vu} \cE(\vu;\vtt)\text{ or }\arg\min_{\vu} \cE(\vu;\vtt)}\rightarrow \boxed{\text{Compute }\cJ(\vu;\vtt)} $$
- $\vtt$ is the learnable parameter and $\cJ$ is the loss
- The middle step is an optimization of an implicit function $\cE$
- Let's see some examples next
Example 1: Model Calibration¶
Consider a solid mechanics problem. The solution is found by minimizing energy $$ \cE(\vu) = \vu^\top\vK\vu + \vf^\top\vu $$ where $\vu$ is displacement and $\vf$ is load.
- But now the stiffness matrix $\vK$ is not fully known, so we assume $\vK=\vK_0+\delta\vK_{NN}(\vtt)$ and denote $\cE(\vu;\vtt)$.
- Given data set $\{(\vf_i,\vu_i)\}_{i=1}^N$ we want to learn $\delta\vK_{NN}$.
Formally we can write it as a nested optimization problem, $$ \begin{align} \min_{\vtt} &\quad \cJ(\hat{\vu};\vtt) = \sum_{i=1}^N \lVert\vu_i-\hat{\vu}_i\rVert^2 \\ \mathrm{s.t.} &\quad \hat{\vu}_i = \arg\min_{\vu}\ \cE(\vu;\vtt) = \vu^\top(\vK_0+\delta\vK_{NN}(\vtt))\vu + \vf_i^\top\vu,\quad i=1,2,\cdots,N \end{align} $$ ... and the computation graph is what we have shown earlier.
The same idea applies to many other scenarios, e.g.,
- Fluid dynamics:
- $\min\cE(\vu)$ solves the Navier-Stokes equations, $\vu$ - density, momentum, energy
- $\cJ$ to minimize difference in simulation vs. experiment for, e.g., pressure data on an airfoil
- $\min\cE(\vu;\vtt)$ with correction in turbulence production term in the fluid solver
- System dynamics:
- $\min\cE(\vu)$ solves the flight dynamics equations, $\vu$ - linear & angular displacement/velocity
- $\cJ$ to minimize difference in sim. vs. exp. for, e.g., dynamical response
- $\min\cE(\vu;\vtt)$ with correction in unknown external forcing term
Example 2: Imitation Learning¶
Consider a model predictive controller (MPC). $$ \begin{align} \min_{\vu,\vf} &\quad \cE(\vu;\vtt) = \sum_{i=1}^N (\vu_i-\vu_{r,i})^\top\vQ(\vu_i-\vu_{r,i}) + \vf_i^\top\vR\vf_i \\ \mathrm{s.t.} &\quad \vu_{i+1}=\vg(\vu_i,\vf_i),\quad i=1,2,\cdots,N-1 \end{align} $$ with states $\vu$, inputs $\vf$, and dynamics $\vg$; $\vu_r$ is the trajectory to track.
- In practice, the choice of $\vQ$ and $\vR$ may require extensive tuning.
- Given data set of "expert demonstrations" (e.g., by human) for optimal inputs $\{\vf_i\}_{i=1}^N$ and states $\{\vu_i\}_{i=1}^N$.
- How to learn $\vtt=[\vQ,\vR]$ from the dataset?
This is again a nested optimization problem,
$$ \begin{align} \min_{\vtt} &\quad \cJ(\hat{\vu},\hat{\vf};\vtt) = \sum_{i=1}^N \lVert\vu_i-\hat{\vu}_i\rVert^2 + \lVert\vf_i-\hat{\vf}_i\rVert^2 \\ \mathrm{s.t.} &\quad \min_{\hat{\vu},\hat{\vf}} \ \cE(\hat{\vu};\vtt) = \sum_{i=1}^N (\hat{\vu}_i-\vu_{r,i})^\top\vQ(\hat{\vu}_i-\vu_{r,i}) + \hat{\vf}_i^\top\vR\hat{\vf}_i \\ &\quad \ \mathrm{s.t.}\quad \hat{\vu}_{i+1}=\vg(\hat{\vu}_i,\hat{\vf}_i),\quad i=1,2,\cdots,N-1 \end{align} $$
... except that now the inner problem also has constraints.
General Form of Problem¶
The above examples suggest the following general form to tackle: $$ \begin{align} \min_{\vtt} &\quad \cJ(\vu;\vtt) \\ \mathrm{s.t.} &\quad \min_{\vu} \cE(\vu;\vtt) \\ &\quad \mathrm{s.t.} \quad \vc(\vu;\vtt) = 0 \end{align} $$
- $\cJ$: The outer objective - typically the loss for learning
- $\cE$ and $\vc$: The inner objective and constraints
- The inner optimization problem
- Implicitly defines $\vu$ as a function of $\vtt$
- Inequality constraints can be added too
For the purpose of ML, we still want $\ddf{\cJ}{\vtt}$.
Strategies for BP¶
Two cases for the inner problem:
- Non-differentiable, or a black box
- e.g., sampling-based, or an external solver not in a learning framework
- The rest (easier) case
- This typically means the evaluation of $\cE$ and $\vc$ can be backpropagated
Easier Case First¶
Since all the terms are differentiable, consider the equivalent form via Lagrange multipliers, $$ L_i(\vu,\vtl;\vtt) = \cE(\vu;\vtt) + \vtl^\top \vc(\vu;\vtt) $$ The necessary condition for optimality is $$ \ppf{L_i}{\vu} = 0,\quad \ppf{L_i}{\vtl} = 0 $$ So solving the inner problem is equivalent to solving the above nonlinear algebraic equations. Denote them as $$ \cR(\vu;\vtt) = 0 $$ (dropping $\vtl$ for conciseness)
The inner problem is then solved by, e.g., Newton iterations (see the sketch after this list)
- Starting from initial guess $$ \vu^{(0)},\quad \cR^{(0)} = \cR(\vu^{(0)};\vtt) $$
- Iterate $$ \vu^{(k+1)} = \vu^{(k)} - (\nabla_{\vu}\cR^{(k)})^{-1} \cR^{(k)} $$
- Until convergence, e.g., $$ \lVert \vu^{(n)}-\vu^{(n-1)} \rVert \leq \epsilon $$
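Below is a minimal NumPy sketch of such a Newton solve; the residual used here is illustrative (the stationarity condition of a nonlinear-spring energy), not one of the examples above.

import numpy as np

def newton_solve(residual, jacobian, u0, tol=1e-10, max_iter=50):
    # Solve R(u) = 0 by Newton iterations starting from u0
    u = u0.copy()
    for _ in range(max_iter):
        du = np.linalg.solve(jacobian(u), -residual(u))  # Newton step
        u = u + du
        if np.linalg.norm(du) <= tol:                    # convergence check
            break
    return u

# Illustrative residual: R(u; theta) = K u + theta * u^3 - f, the stationarity
# condition of E(u) = 0.5 u^T K u + (theta/4) sum(u^4) - f^T u
K = np.array([[2.0, -1.0], [-1.0, 2.0]])
f = np.array([1.0, 0.5])
theta = 0.3
R = lambda u: K @ u + theta * u**3 - f
dRdu = lambda u: K + np.diag(3.0 * theta * u**2)
u_star = newton_solve(R, dRdu, np.zeros(2))
print(u_star, np.linalg.norm(R(u_star)))  # residual norm should be near zero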
Brute force: Unrolling¶
While the inner problem is an implicit function, the iterations are explicit. $$ \boxed{\vu^{(0)}; \vtt} \rightarrow \boxed{\cR^{(0)}} \rightarrow \boxed{\vu^{(1)} = \vu^{(0)} - (\nabla_{\vu}\cR^{(0)})^{-1} \cR^{(0)}} \rightarrow \boxed{\vu^{(2)} = \vu^{(1)} - ...} \rightarrow \boxed{...} \rightarrow \boxed{\vu=\vu^{(n)}} $$ We can unroll and BP the entire iteration process - each iteration is effectively one layer.
- Pros: Easy to implement - as long as the iterations are implemented in a ML package, the package can do BP for you
- Cons: Slow - need to BP through all iterations (similar to LSTM)
Unrolling - a fix¶
The initial guess is free to choose, and what if it is exactly the solution? How about:
First a regular solve, but do not retain gradients $$ \boxed{\vu^{(0)}; \vtt} \rightarrow \boxed{\cR^{(0)}} \rightarrow \boxed{\vu^{(1)} = \vu^{(0)} - (\nabla_{\vu}\cR^{(0)})^{-1} \cR^{(0)}} \rightarrow \boxed{\vu^{(2)} = \vu^{(1)} - ...} \rightarrow \boxed{...} \rightarrow \boxed{\vu=\vu^{(n)}} $$ Then use $\vu^{(n)}$ as the initial guess and "pretend" to solve the problem in one iteration; retain gradients here $$ \boxed{\vu^{(n)}; \vtt} \rightarrow \boxed{\cR^{(n)}} \rightarrow \boxed{\vu^{(n+1)} = \vu^{(n)} - (\nabla_{\vu}\cR^{(n)})^{-1} \cR^{(n)}} \rightarrow \boxed{\vu=\vu^{(n+1)}} $$ BP the second round only
- Pros: Barely any more effort to implement, and much faster
- Cons: Could still be slow, as the BP does not leverage problem structure
- e.g., BP through a linear solve with $\nabla_{\vu}\cR^{(n)}$, which can be expensive
(Possibly) Better solution: Adjoint¶
To be discussed in a minute.
- Pros: Fast and accurate
- Cons: Takes time to implement
Non-differentiable Case¶
Essentially, one can only do forward evaluation for the inner problem, i.e.,
- A function that computes $\vu$ given $\vtt$. Denote $\vu=\cF_{ND}(\vtt)$
- Rewrite the nested problem as $$ \min_\vtt \cJ(\cF_{ND}(\vtt),\vtt) $$
Whenever possible, avoid getting trapped in this scenario. But if you have to deal with it...
Brute force: Penalty/Surrogate¶
Define a surrogate NN $\cF_{NN}$ to replace $\cF_{ND}$ for estimating $\vu$ $$ \hat{\vu} = \cF_{NN}(\vtt;\vartheta) $$ Learn the original model and the surrogate simultaneously with two losses $$ \min_{\vtt,\vartheta} \cJ(\hat{\vu},\vtt) + \lVert \cF_{NN}(\vtt;\vartheta) - \cF_{ND}(\vtt) \rVert^2 $$ $\cF_{ND}(\vtt)$ is used as if it were a dataset generated on the fly
- Pros: Easy to implement; (typically) fast to evaluate in prediction
- Cons: Additional computational cost; $\cF_{NN}$ can be hard to train; only weak enforcement of $\cF_{ND}$ in prediction
(Possibly) Better solution: Probabilistic formulation¶
To be discussed in a minute.
- Pros: Opposite of the previous cons
- Cons: Effective only for low-dim $\vtt$
Recap¶
Next we will discuss two methods for BP implicit functions:
- Adjoint method for differentiable cases
- Probabilistic method for non-differentiable cases
Adjoint Method for Differentiable Cases¶
Consider the optimization problem in general, $$ \begin{align} \min_{\vtt} &\quad \cJ(\vu;\vtt) \\ \mathrm{s.t.} &\quad \cR(\vu;\vtt) = 0 \end{align} $$ where $\cR(\vu;\vtt)=0$ represents the inner problem.
Formulation by adjoint variables¶
For optimization, one wants the full derivative of $\cJ(\vu;\vtt)$ w.r.t. $\vtt$, i.e. $$ \ddf{\cJ(\vu;\vtt)}{\vtt} = \ppf{\cJ}{\vu}\ppf{\vu}{\vtt} + \ppf{\cJ}{\vtt} $$ where $\ppf{\vu}{\vtt}$ is non-trivial to compute.
From the constraint, one knows, $$ \ddf{\cR(\vu;\vtt)}{\vtt} = \ppf{\cR}{\vu}\ppf{\vu}{\vtt} + \ppf{\cR}{\vtt} = 0 $$ or $$ \ppf{\vu}{\vtt} = \left(-\ppf{\cR}{\vu}\right)^{-1}\ppf{\cR}{\vtt} $$
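As a quick sanity check of this formula, take the scalar example from the motivation, $f(x,b)=x^8-x^2-b=0$: implicit differentiation gives $\mathrm{d}x/\mathrm{d}b = -(\partial f/\partial x)^{-1}\partial f/\partial b = 1/(8x^7-2x)$. A small sketch comparing it against finite differences (the bracketing interval below is an assumption made for this illustration):

import numpy as np
from scipy.optimize import brentq

def solve_x(b):
    # Root of f(x, b) = x^8 - x^2 - b = 0, bracketed in [1, 2] for b = O(1)
    return brentq(lambda x: x**8 - x**2 - b, 1.0, 2.0)

b = 1.0
x = solve_x(b)
dxdb_implicit = 1.0 / (8.0 * x**7 - 2.0 * x)     # -(df/dx)^{-1} (df/db), with df/db = -1
eps = 1e-6
dxdb_fd = (solve_x(b + eps) - solve_x(b)) / eps  # finite-difference reference
print(dxdb_implicit, dxdb_fd)                    # should agree to several digits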
Therefore, $$ \ddf{\cJ(\vu;\vtt)}{\vtt} = \ppf{\cJ}{\vu}\left(-\ppf{\cR}{\vu}\right)^{-1}\ppf{\cR}{\vtt} + \ppf{\cJ}{\vtt} $$
Now let $$ \vtf^T = -\ppf{\cJ}{\vu}\left(\ppf{\cR}{\vu}\right)^{-1} $$ or $$ \left(\ppf{\cR}{\vu}\right)^T\vtf = -\left(\ppf{\cJ}{\vu}\right)^T $$
This last equation is called the Adjoint Equation, and $\vtf$ is called the Adjoint Variable.
As a result, the full derivative of $\cJ(\vu;\vtt)$ w.r.t. $\vtt$ is computed as follows (a code sketch follows this list),
- Given $\vtt$, solve $\cR(\vu;\vtt)=0$ to find $\vu$
- In this process one obtains $\ppf{\cR}{\vu}$.
- Given $\vtt$ and $\vu$, find $\ppf{\cJ}{\vu}$, $\ppf{\cJ}{\vtt}$, and $\ppf{\cR}{\vtt}$.
- Solve the adjoint equation and find $\vtf$. $$ \left(\ppf{\cR}{\vu}\right)^T\vtf = -\left(\ppf{\cJ}{\vu}\right)^T $$
- The gradient of $\cJ$ $$ \ddf{\cJ(\vu;\vtt)}{\vtt} = \vtf^T\ppf{\cR}{\vtt} + \ppf{\cJ}{\vtt} $$
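The four steps can be wrapped into a few lines of NumPy. The sketch below uses an illustrative linear inner problem, $\cR(\vu;\vtt)=\vA\vu-\vtt=0$ with $\cJ=\lVert\vu-\vu_d\rVert^2$ (names and the toy problem are made up for illustration), and checks the adjoint gradient against finite differences.

import numpy as np
from scipy.optimize import approx_fprime

# Illustrative inner problem: R(u; theta) = A u - theta = 0, J(u) = ||u - u_d||^2
A = np.array([[3.0, 1.0], [1.0, 2.0]])
u_d = np.array([1.0, -1.0])

def adjoint_gradient(theta):
    u = np.linalg.solve(A, theta)          # step 1: forward solve R(u; theta) = 0
    dRdu = A                               #         Jacobian dR/du
    dJdu = 2.0 * (u - u_d)                 # step 2: (dJ/du)^T; here dJ/dtheta = 0
    dRdth = -np.eye(len(theta))            #         dR/dtheta
    phi = np.linalg.solve(dRdu.T, -dJdu)   # step 3: adjoint solve (dR/du)^T phi = -(dJ/du)^T
    return phi @ dRdth                     # step 4: dJ/dtheta = phi^T dR/dtheta + dJ/dtheta

theta0 = np.array([0.5, 2.0])
g_adjoint = adjoint_gradient(theta0)
g_fd = approx_fprime(theta0, lambda th: np.sum((np.linalg.solve(A, th) - u_d)**2))
print(g_adjoint, g_fd)                     # the two should match closely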
Alternative: Formulation by Lagrange multipliers¶
Introduce a vector of Lagrange multipliers $\vtf$ $$ \cL(\vu;\vtt) = \cJ(\vu;\vtt) + \vtf^T \cR(\vu;\vtt) $$
Now one wants the full derivative of $\cL(\vu;\vtt)$ w.r.t. $\vtt$, i.e. $$ \ddf{\cL(\vu;\vtt)}{\vtt} = \ppf{\cJ}{\vu}\ppf{\vu}{\vtt} + \ppf{\cJ}{\vtt} + \vtf^T\left( \ppf{\cR}{\vu}\ppf{\vu}{\vtt} + \ppf{\cR}{\vtt} \right) $$
The RHS is $$ \ppf{\cJ}{\vtt} + \vtf^T\ppf{\cR}{\vtt} + \left(\ppf{\cJ}{\vu} + \vtf^T\ppf{\cR}{\vu}\right)\ppf{\vu}{\vtt} $$
One can eliminate $\ppf{\vu}{\vtt}$ by setting the bracket to zero, $$ \ppf{\cJ}{\vu} + \vtf^T\ppf{\cR}{\vu} = 0 $$ or the adjoint equation $$ \left(\ppf{\cR}{\vu}\right)^T\vtf = -\left(\ppf{\cJ}{\vu}\right)^T $$
Extra: Multiple inner problems¶
It is not uncommon if a ML model involves multiple implicit functions. The optimization problem becomes $$ \begin{align} \min_{\vtt} &\quad \cJ(\vU;\vtt) \\ \mathrm{s.t.} &\quad \cR_j(\vU;\vtt) = 0,\quad j=1,\cdots,m \end{align} $$ where $m$ inner problems are involved, with unknowns $\vU=\{\vu_1,\cdots,\vu_m\}$.
Introduce a set of Lagrange multipliers, or the adjoint variables, $$ \cL(\vU;\vtt) = \cJ(\vU;\vtt) + \sum_{j=1}^m \vtf_j^T \cR_j(\vU;\vtt) $$
Take the full derivative, $$ \begin{align} \ddf{\cL(\vU;\vtt)}{\vtt} &= \ppf{\cJ}{\vU}\ppf{\vU}{\vtt} + \ppf{\cJ}{\vtt} + \sum_{j=1}^m \vtf_j^T\left( \ppf{\cR_j}{\vU}\ppf{\vU}{\vtt} + \ppf{\cR_j}{\vtt} \right) \\ &= \ppf{\cJ}{\vtt} + \sum_{j=1}^m \vtf_j^T\ppf{\cR_j}{\vtt} + \left(\ppf{\cJ}{\vU} + \sum_{j=1}^m \vtf_j^T\ppf{\cR_j}{\vU}\right)\ppf{\vU}{\vtt} \end{align} $$
Setting the bracket to zero yields a set of $m$ coupled adjoint equations, $$ \sum_{j=1}^m \left(\ppf{\cR_j}{\vu_i}\right)^T\vtf_j = -\left(\ppf{\cJ}{\vu_i}\right)^T,\quad i=1,\cdots,m $$
Numerical Example¶
We solve a 1D advection-diffusion equation for $x\in [0,1]$ $$ p u' - u'' = f(x;\vtt),\quad u(0)=u_0,\ u'(1)=n_1 $$ where $p$ is a constant, representing wave speed.
The RHS contains some unknown parameters $$ f(x;\vtt) = x^2 + \theta_1 x + \theta_2 $$
Suppose we know a solution $u^*$ found at the true parameters $\theta^*$. To infer $\theta^*$ from $u^*$, we solve $$ \begin{align*} \min_{\vtt} &\quad J(u;\vtt) \equiv \int_0^1 (u-u^*)^2 dx \\ \mathrm{s.t.} &\quad p u' - u'' = f(x;\vtt),\quad u(0)=u_0,\ u'(1)=n_1 \end{align*} $$
xx = np.linspace(0,1,41)
f = plt.figure(figsize=(6,4))
plt.plot(xx, fu(xx));  # plot the known solution u* (fu is defined in setup code not shown here)
System Discretization¶
We use a generic Galerkin method to discretize the differential equation and the objective.
Specifically, we pick $N$ Legendre polynomials as basis functions, and represent the solution as $$ u(x) = \sum_{i=1}^N u_i\psi_i(x) \equiv \vty^T(x) \vu $$
f, ax = plt.subplots(ncols=2, figsize=(8,4))
for _i in range(0,5):
ax[0].plot(xx, eval_leg0(_i, xx), label=f"Order={_i}")
ax[1].plot(xx, eval_leg1(_i, xx))
ax[0].legend()
ax[0].set_title("Legendre polynomials")
ax[1].set_title("Derivatives");
The objective is converted to a quadratic form, $$ \cJ(\vu;\vtt) = \int_0^1 \left( \vty^T\vu - u^* \right)^2 dx \equiv \vu^T \vM \vu - 2\vu^T \vm + \text{const} $$ where $$ \vM = \int_0^1 \vty\vty^T dx,\quad \vm = \int_0^1 \vty u^* dx $$
The equation is converted to a linear system, $$ \cR(\vu;\vtt) = \vA\vu - \vb(\vtt) = 0 $$ where the formulation of $\vA$ is given in the Differential Equations slides of the Math review, and $$ \vb(\vtt) = \int_0^1 \vty f(x;\vtt) dx $$ (A quadrature sketch for assembling $\vb(\vtt)$ follows below.)
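Here is a minimal sketch of how $\vb(\vtt)$ (and similarly $\vM$ and $\vm$) could be assembled with Gauss-Legendre quadrature; it assumes the basis $\psi_i$ are Legendre polynomials shifted to $[0,1]$ and is only an illustration, not the `galerkin` routine used below.

import numpy as np
from numpy.polynomial.legendre import leggauss, legval

Nb = 6                                     # number of basis functions (illustrative)
xg, wg = leggauss(20)                      # Gauss-Legendre nodes/weights on [-1, 1]
x01 = 0.5 * (xg + 1.0)                     # map nodes to [0, 1]
w01 = 0.5 * wg                             # rescale weights accordingly

def basis(x):
    # psi_i(x) = Legendre polynomial P_i shifted to [0, 1]; returns shape (Nb, len(x))
    return np.array([legval(2.0 * x - 1.0, np.eye(Nb)[i]) for i in range(Nb)])

def assemble_b(theta):
    # b(theta) = int_0^1 psi(x) f(x; theta) dx, with f = x^2 + theta_1 x + theta_2
    fvals = x01**2 + theta[0] * x01 + theta[1]
    return basis(x01) @ (w01 * fvals)

print(assemble_b(np.array([10.0, 10.0])))  # b at the initial guess used below
# db/dtheta follows the same pattern, with columns int psi(x) x dx and int psi(x) dx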
Adjoint-based Optimization¶
This brings us to the standard form: $$ \begin{align*} \min_{\vtt} &\quad \cJ(\vu;\vtt) \\ \mathrm{s.t.} &\quad \cR(\vu;\vtt) = 0 \end{align*} $$ Now suppose we start with an initial guess, say $\vtt^{(0)}=[10.0, 10.0]$.
We then use the adjoint formulation to obtain the gradient for optimization, $\ddf{\cL(\vu;\vtt)}{\vtt}$ at $\vtt^{(0)}$:
First, we solve for $\vu^{(0)}$ at $\vtt^{(0)}$. Clearly the resulting solution is far off.
theta0 = [10.0, 10.0]
sol, mats = galerkin(M, N, p, ub, nb, theta0)
print(obj(sol)[0])
compare_sol(sol, fu, 41);
0.6717431265076316
Then, we solve the adjoint problem $$ \left(\ppf{\cR}{\vu}\right)^T\vtf = -\left(\ppf{\cJ}{\vu}\right)^T $$ where $\ppf{\cR}{\vu}=\vA$, $\ppf{\cJ}{\vu}=(2\vM\vu-2\vm)^T$
And evaluate the gradients $$ \ddf{\cL(\vu;\vtt)}{\vtt} = \ppf{\cJ}{\vtt} + \vtf^T \ppf{\cR}{\vtt} $$ where $\ppf{\cJ}{\vtt}=0$ and $\ppf{\cR}{\vtt}=-\ppf{\vb}{\vtt}$
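For reference, here is a sketch of what an adjoint routine like `galerkin_da` might compute for this problem, assuming the assembled $\vA$, $\vM$, $\vm$, and $\partial\vb/\partial\vtt$ are available; this is a hypothetical reconstruction, not the actual implementation used in the cells below.

import numpy as np

def adjoint_grad_sketch(u, A, M, m, dbdth):
    # Objective value and its gradient via the adjoint equation
    J = u @ M @ u - 2.0 * u @ m          # J(u) = u^T M u - 2 u^T m (constant dropped)
    dJdu = 2.0 * M @ u - 2.0 * m         # (dJ/du)^T
    phi = np.linalg.solve(A.T, -dJdu)    # adjoint solve: A^T phi = -(dJ/du)^T
    grad = phi @ (-dbdth)                # dJ/dtheta = phi^T dR/dtheta, with dR/dtheta = -db/dtheta
    return J, grad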
Adjoint-based gradients, verified by finite difference:
# Adjoint
sol, mats = galerkin(M, N, p, ub, nb, theta0) # Forward solve
_, gda = galerkin_da(sol, mats, obj) # Adjoint/Backward solve
# Finite difference, using a SciPy function
def Jfd(qs):
sol, _ = galerkin(M, N, p, ub, nb, qs)
d, _ = obj(sol)
return d
gfd = approx_fprime(theta0, Jfd)
print(f"Adjoint: {gda}, FD: {gfd}")
Adjoint: [0.04238442 0.09196361], FD: [0.04238444 0.0919636 ]
Now let's use a vanilla optimizer from SciPy to solve the problem (i.e., train the model)
def Jda(qs):
sol, mats = galerkin(M, N, p, ub, nb, qs)
fun, grd = galerkin_da(sol, mats, obj)
return fun, grd
res = minimize(Jda, theta0, method="l-bfgs-b", jac=True)
print(res)
message: CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL success: True status: 0 fun: 7.840073109219852e-09 x: [ 3.404e-04 -2.283e-04] nit: 4 jac: [-2.017e-08 -4.432e-08] nfev: 8 njev: 8 hess_inv: <2x2 LbfgsInvHessProduct with dtype=float64>
# Check the converged solutions, i.e., "learned" model
sol, mats = galerkin(M, N, p, ub, nb, res.x)
compare_sol(sol, fu, 41);
# Note that much more cost would be incurred if we optimized
# by brute force (i.e., finite difference)
res = minimize(Jfd, theta0, method="l-bfgs-b", jac=False)
print(res) # Look at `nfev`: function evaluation
message: CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL success: True status: 0 fun: 7.84006204764976e-09 x: [ 4.063e-04 -2.550e-04] nit: 4 jac: [-7.499e-09 -2.315e-08] nfev: 24 njev: 8 hess_inv: <2x2 LbfgsInvHessProduct with dtype=float64>
Key take-away: Adjoint formulation enables machine learning with large scale simulations.
[Left] Neustock et al. Nature 2019; [Right] Amos et al. NeurIPS 2018
Probabilistic Method for Non-Differentiable Cases¶
Recall now the problem has to be written as $$ \min_\vtt \cJ(\cF_{ND}(\vtt),\vtt) $$ where $\cF_{ND}$ might not be differentiable.
We assume scalar $\cF$ here.
Next we present two routes for finding the equivalent or approximate quantities to $$ \ppf{\cF_{ND}(\vtt)}{\vtt}\text{, or in short, }\nabla_\vtt\cF $$ so that we can work with non-differentiable functions in a differentiable framework.
Route 1: Zeroth-order Estimate¶
For differentiable cases, one can use finite difference to estimate gradients $$ (\nabla_\vtt \cF)\vw = \frac{\cF(\vtt+\epsilon\vw)-\cF(\vtt)}{\epsilon} + O(\epsilon),\quad \text{for sufficiently small }\epsilon $$ Looping over unit vectors $\vw=[1,0,\cdots,0],[0,1,\cdots,0]$, etc. gives the columns of $\nabla_\vtt \cF$
Now define a probabilistic generalization of the finite difference $$ \tilde{\nabla}_\vtt\cF = \bE_\vw\left[ \frac{\cF(\vtt+\epsilon\vw)-\cF(\vtt)}{\epsilon}\vw^\top \right],\quad \text{for small }\epsilon,\ \vw \sim \cN(0,\vI) $$
Does not require $\cF$ to be differentiable
Practically, we sample, say, $N$ $\vw_i$ from $\cN(0,\vI)$, and estimate $$ \tilde{\nabla}_\vtt\cF = \frac{1}{N} \sum_{i=1}^N \frac{\cF(\vtt+\epsilon\vw_i)-\cF(\vtt)}{\epsilon}\vw_i^\top $$
If $\cF$ were differentiable, $$ \cF(\vtt+\epsilon\vw) \approx \cF(\vtt) + \epsilon(\nabla_\vtt\cF)\vw $$ Then we recover finite difference when $\epsilon\rightarrow 0$ $$ \tilde{\nabla}_\vtt\cF = \bE_\vw\left[ \frac{\cF(\vtt+\epsilon\vw)-\cF(\vtt)}{\epsilon}\vw^\top \right] = \bE_\vw\left[ (\nabla_\vtt\cF)\vw\vw^\top \right] = (\nabla_\vtt\cF) \underbrace{\bE_\vw\left[ \vw\vw^\top \right]}_{=\vI} = \nabla_\vtt\cF $$
A discussion on variance¶
You might notice the probabilistic version could probably be simplified $$ \tilde{\nabla}_\vtt\cF = \bE_\vw\left[ \frac{\cF(\vtt+\epsilon\vw)-\cF(\vtt)}{\epsilon}\vw^\top \right] = \frac{1}{\epsilon}\bE_\vw\left[ \cF(\vtt+\epsilon\vw)\vw^\top \right] - \frac{1}{\epsilon}\bE_\vw\left[\cF(\vtt)\vw^\top \right] = \frac{1}{\epsilon}\bE_\vw\left[ \cF(\vtt+\epsilon\vw)\vw^\top \right] $$ because, in the second $\bE_\vw$, $\cF(\vtt)$ is independent of $\vw$ and $\bE[\vw]=0$ by definition.
Why do people not use this simpler version?
To quantify the quality of an estimation, we consider
- Bias: Average deviation of the mean from truth
- Non-zero bias gives erroneous gradients - bad for optimization
- Variance: Spread of deviation
- Higher variance gives larger randomness of gradients - bad for optimization
- Ideally we want zero bias and low variance
Both probabilistic versions already have zero bias; how about the variance?
Consider a general case - find $b$ to minimize the variance of $$ \vg(b) = \frac{\cF(\vtt+\epsilon\vw)-b}{\epsilon}\vw $$ Since $\bE_\vw[\vg(b)]$ does not depend on $b$ (the $b$ term averages to zero), minimizing the variance amounts to $$ \min_b \lVert\bE_{\vw}[\vg\vg^\top]\rVert,\text{ or loosely: } \min_b \bE_{\vw}[\lVert\vg\rVert^2] $$
Note that the objective is simply a quadratic function of $b$ $$ \bE_{\vw}[\lVert\vg\rVert^2] = \bE_{\vw}\left[ \left(\frac{\cF(\vtt+\epsilon\vw)-b}{\epsilon}\right)^2 \lVert\vw\rVert^2 \right] = \frac{1}{\epsilon^2}\bE_{\vw}[\lVert\vw\rVert^2] b^2 + [blah] b + [blah] $$ so the minimizer can be found analytically $$ b^* = \frac{\bE_{\vw}[\cF(\vtt+\epsilon\vw)\lVert\vw\rVert^2]}{\bE_{\vw}[\lVert\vw\rVert^2]} $$ One can either estimate $b^*$ at each iteration, or simply choose $b^*\approx \cF(\vtt)$
- This brings back our initial probabilistic formulation!
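A quick numerical illustration of the baseline's effect, using an illustrative smooth scalar $\cF$ (not one of the lecture's problems): both choices of $b$ give the same mean, but $b\approx\cF(\vtt)$ shrinks the spread by orders of magnitude.

import numpy as np

rng = np.random.default_rng(0)
F = lambda th: np.sum(th**2) + 5.0           # illustrative smooth scalar function
theta = np.array([1.0, -2.0])
eps, Ns = 1e-2, 200

def zo_estimate(baseline):
    # One Ns-sample zeroth-order gradient estimate with the given baseline b
    ws = rng.normal(size=(Ns, len(theta)))
    return np.mean([(F(theta + eps * w) - baseline) / eps * w for w in ws], axis=0)

reps_b0 = np.array([zo_estimate(0.0) for _ in range(200)])       # b = 0
reps_bF = np.array([zo_estimate(F(theta)) for _ in range(200)])  # b = F(theta)
print("b=0        mean:", reps_b0.mean(axis=0), " std:", reps_b0.std(axis=0))
print("b=F(theta) mean:", reps_bF.mean(axis=0), " std:", reps_bF.std(axis=0))
print("true gradient:  ", 2.0 * theta)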
Side Note¶
Along a similar vein, but a different route, one would get "Evolution Strategy" (ES) type algorithms for optimization.
- ES algorithms have decades of history of development. One popular variant is CMA-ES.
One notable instance of ES in ML is the following work from OpenAI:
- Salimans et al. Evolution Strategies as a Scalable Alternative to Reinforcement Learning. ArXiv 2017.
Route 2: Policy Gradient¶
Let's consider a special case, $$ \cF(\vtt) = \arg\max_\vu p(\vu;\vtt) $$ i.e., finding the $\vu$ that is "the most probable" in a probability distribution.
- Key idea: $\cF$ is non-differentiable, but $p$ is.
- This setup is common in reinforcement learning - $p$ is the "policy".
- We will return to the general case in the next route.
This route involves several transformations - First an approximate objective (using $\max$ for consistency) $$ \max_\vtt \cJ(\cF(\vtt))=\cJ(\arg\max_\vu p(\vu;\vtt)) \Rightarrow \max_\vtt \int_\vu \cJ(\vu)p(\vu;\vtt) d\vu $$
The new objective is differentiable $$ \nabla_\vtt \left( \int_\vu \cJ(\vu)p(\vu;\vtt) d\vu \right) = \int_\vu \cJ(\vu) \nabla_\vtt p(\vu;\vtt) d\vu $$
But it requires an expensive integral over $\vu$ - So we do the log-trick $$ RHS = \int_\vu \cJ(\vu) \left(\frac{\nabla_\vtt p(\vu;\vtt)}{p(\vu;\vtt)}\right)p(\vu;\vtt) d\vu = \int_\vu \cJ(\vu) \nabla_\vtt \log p(\vu;\vtt) p(\vu;\vtt) d\vu $$
Now the gradient can be written as an expectation - which can be estimated from $N$ samples $$ \nabla_\vtt \cJ \approx \bE_\vu[\cJ(\vu) \nabla_\vtt \log p(\vu;\vtt)] \approx \frac{1}{N}\sum_{i=1}^N \cJ(\vu_i) \nabla_\vtt \log p(\vu_i;\vtt) $$
Variance¶
Just like the previous route, here we can adjust the gradient estimation to reduce the variance.
First note that subtracting a constant does not change the estimation $$ \bE_\vu[(\cJ(\vu)-b) \nabla_\vtt \log p(\vu;\vtt)] = \bE_\vu[\cJ(\vu) \nabla_\vtt \log p(\vu;\vtt)] - b \underbrace{\bE_\vu[\nabla_\vtt \log p(\vu;\vtt)]}_{=0} $$ The last term is 0 because $$ \bE_\vu[\nabla_\vtt \log p(\vu;\vtt)] = \int_\vu \nabla_\vtt p(\vu;\vtt) d\vu = \nabla_\vtt \int_\vu p(\vu;\vtt) d\vu = \nabla_\vtt 1 = 0 $$
Then, we choose the best $b$ to minimize the variance $$ \min_b \bE_\vu[(\cJ(\vu)-b)^2 \lVert\nabla_\vtt \log p(\vu;\vtt)\rVert^2] $$ and the solution is similar as before $$ b^* = \frac{\bE_\vu[\cJ(\vu)\lVert\cdot\rVert^2]}{\bE_\vu[\lVert\cdot\rVert^2]} \approx \bE_\vu[\cJ(\vu)] $$
So a low-variance gradient estimate would be $$ \nabla_\vtt \cJ(\cF(\vtt)) \approx \frac{1}{N}\sum_{i=1}^N (\cJ(\vu_i)-\bar{\cJ}) \nabla_\vtt \log p(\vu_i;\vtt),\quad \bar{\cJ} = \frac{1}{N} \sum_{i=1}^N \cJ(\vu_i) $$
The technical term for $\bar{\cJ}$ is "baseline".
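A minimal sketch of this estimator for an illustrative Gaussian "policy" $p(\vu;\vtt)=\cN(\vtt,\sigma^2\vI)$, for which $\nabla_\vtt\log p(\vu;\vtt)=(\vu-\vtt)/\sigma^2$; the reward $\cJ$ and all names here are made up for illustration.

import numpy as np

rng = np.random.default_rng(0)
sigma = 0.5
J = lambda u: -np.sum((u - np.array([1.0, 2.0]))**2)       # illustrative reward to maximize

def policy_gradient(theta, Ns=5000):
    # Score-function (policy-gradient) estimate with the mean-reward baseline
    u = theta + sigma * rng.normal(size=(Ns, len(theta)))  # samples u_i ~ N(theta, sigma^2 I)
    scores = (u - theta) / sigma**2                        # grad_theta log p(u_i; theta)
    Jvals = np.array([J(ui) for ui in u])
    baseline = Jvals.mean()                                # \bar{J}
    return np.mean((Jvals - baseline)[:, None] * scores, axis=0)

theta = np.array([0.0, 0.0])
print(policy_gradient(theta))    # ascent direction, pointing toward the maximizer [1, 2]
# Analytic check: grad_theta E_u[J(u)] = -2 (theta - [1, 2]) = [2, 4] at theta = [0, 0]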
Numerical Example - Cont'd¶
Recall we were looking at a differential equation with unknown parameters $$ p u' - u'' = f(x;\vtt) = x^2 + \theta_1 x + \theta_2,\quad u(0)=u_0,\ u'(1)=n_1 $$ and have discretized and formulated the problem as $$ \begin{align*} \min_{\vtt} &\quad \cJ(\vu;\vtt) = \vu^T \vM \vu - 2\vu^T \vm \\ \mathrm{s.t.} &\quad \cR(\vu;\vtt) = \vA\vu - \vb(\vtt) = 0 \end{align*} $$
Apply zeroth-order optimization¶
Since the formulation was for scalar output, here we reformulate the problem as $$ \min_\vtt\quad \cJ(\cF(\vtt)),\quad \cF(\vtt)=\vA^{-1}\vb(\vtt) $$ and directly work with $\cJ$.
We estimate the gradient as $$ \ppf{\cJ}{\vtt} \approx \frac{1}{N}\sum_{i=1}^N \frac{\cJ(\vtt+\epsilon\vw_i)-\cJ(\vtt)}{\epsilon}\vw_i^\top,\quad \text{for small }\epsilon,\ \vw \sim \cN(0,\vI) $$
First, try the original continuous problem, so that the adjoint gradient is available as a benchmark.
def zoo(func, qs, Nq, eps=1e-3): # Zeroth-Order Optimization (gradient estimate)
    ws = np.random.randn(Nq, len(qs))  # Nq random directions w_i ~ N(0, I)
    J0 = func(qs)                      # baseline, b ~= F(theta)
    tmp = 0
    for _i in range(Nq):
        J = func(qs + eps*ws[_i])
        tmp += (J-J0) / eps * ws[_i]   # directional difference times w_i
    return tmp / Nq
print(f"Adjoint: {gda}, Zeroth: {zoo(Jfd, theta0, 100)}")
Adjoint: [0.04238442 0.09196361], Zeroth: [0.03181694 0.09244476]
Ns = [20, 50, 100, 200, 500, 1000, 10000] # Check convergence
for _n in Ns:
_g = zoo(Jfd, theta0, _n, eps=1e-3)
print(f"{_n:<5}: {_g}, Error: {np.linalg.norm(gda-_g) / np.linalg.norm(gda)}")
20 : [0.03850679 0.10418842], Error: 0.12665369540944582 50 : [0.03966067 0.10514643], Error: 0.13293654133796545 100 : [0.07316111 0.11569813], Error: 0.38381656015138965 200 : [0.04563858 0.0786218 ], Error: 0.13561943280468086 500 : [0.04716712 0.09594641], Error: 0.06146403440542132 1000 : [0.04682252 0.10215667], Error: 0.10978914707587059 10000: [0.04242091 0.09247902], Error: 0.005102673084574341
# Zeroth-order optimization - Note # of iterations (nit)
res_zo = minimize(Jfd, theta0, method="l-bfgs-b", jac=lambda q: zoo(Jfd, q, 100))
print(res_zo)
message: CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL success: True status: 0 fun: 2.427189391111174e-08 x: [-2.630e-02 1.212e-02] nit: 10 jac: [-1.306e-07 2.061e-06] nfev: 16 njev: 16 hess_inv: <2x2 LbfgsInvHessProduct with dtype=float64>
# Adjoint solution as reference
res_da = minimize(Jda, theta0, method="l-bfgs-b", jac=True)
print(res_da)
message: CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL success: True status: 0 fun: 7.840073109219852e-09 x: [ 3.404e-04 -2.283e-04] nit: 4 jac: [-2.017e-08 -4.432e-08] nfev: 8 njev: 8 hess_inv: <2x2 LbfgsInvHessProduct with dtype=float64>
# Check the converged solutions, i.e., "learned" model
sol, _ = galerkin(M, N, p, ub, nb, res_zo.x)
compare_sol(sol, fu, 41);
Next, modify the parameters to $\tilde{\vtt}=[\tilde{\theta}_1,\tilde{\theta}_2]$, where $\tilde{\theta}_2$ is quantized to steps of $0.01$, so that the objective becomes a staircase (piecewise constant) in $\theta_2$: $$ \tilde{\theta}_1=\theta_1,\quad \tilde{\theta}_2=\frac{1}{100}\mathrm{int}(100\,\theta_2) $$
# Non-differentiable version; no useful gradient for theta_2!
def Jnd(qs):
q0 = qs[0]
q1 = int(qs[1]*100)/100
sol, _ = galerkin(M, N, p, ub, nb, np.array([q0, q1]))
d, _ = obj(sol)
return d
gnd = approx_fprime(theta0, Jnd)
print(f"Diff ver.: {gda}, ND: {gnd}")
Diff ver.: [0.04238442 0.09196361], ND: [0.04238444 0. ]
# Finite difference: No longer works
res_fd = minimize(Jnd, theta0, method="l-bfgs-b", jac=False)
print(res_fd)
message: CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL success: True status: 0 fun: 0.010686658899366581 x: [-2.119e+01 1.000e+01] nit: 2 jac: [ 5.829e-08 0.000e+00] nfev: 12 njev: 4 hess_inv: <2x2 LbfgsInvHessProduct with dtype=float64>
# Zeroth-order optimization - Note the larger eps
res_zo = minimize(Jnd, theta0, method="l-bfgs-b",
jac=lambda q: zoo(Jnd, q, 100, eps=0.01))
print(res_zo)
message: CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL success: True status: 0 fun: 1.6598598437503237e-07 x: [-7.315e-02 3.889e-02] nit: 17 jac: [ 7.673e-07 1.196e-06] nfev: 43 njev: 43 hess_inv: <2x2 LbfgsInvHessProduct with dtype=float64>
# Check the converged solutions, i.e., "learned" model
sol, _ = galerkin(M, N, p, ub, nb, res_zo.x)
compare_sol(sol, fu, 41);