The main idea is to use torch.autograd.grad, or a suitably arranged call to backward.
Higher order derivative
We solve for $\frac{\partial^n f}{\partial x^n}$ element-wise, without mixed derivatives. The trick relies on $\frac{\partial \sum_i x_i}{\partial x_j} = 1$, the chain rule, and recursion: after each differentiation we sum the gradient into a new scalar and differentiate again, since with no cross terms $\frac{\partial}{\partial x_j}\sum_i \frac{\partial f}{\partial x_i} = \frac{\partial^2 f}{\partial x_j^2}$.
import torch
from torch.autograd import grad

def nth_derivative(f, wrt, n):
    for i in range(n):
        grads = grad(f, wrt, create_graph=True)[0]
        # reduce to a scalar so the next grad() call is well-defined
        f = grads.sum()
    return grads

# a float dtype is required for requires_grad; torch.arange(4, ...) would be int64
x = torch.arange(4.0, requires_grad=True).reshape(2, 2)
loss = (x ** 4).sum()
print(nth_derivative(f=loss, wrt=x, n=3))
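As a quick sanity check (my addition, not part of the original snippet): the analytic third derivative of x**4 is 24*x, so the result can be compared against it directly.
# sanity check: d^3(x^4)/dx^3 = 24*x
expected = 24 * x.detach()
print(torch.allclose(nth_derivative(f=(x ** 4).sum(), wrt=x, n=3), expected))  # True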
Jacobian matrix
You could loop over the output components, but the trick below is better. Repeat x once per output, so every row of y holds the same function values but is computed from a different copy of x. Backpropagating an identity matrix through y then leaves the gradient of $y_i$ on the i-th copy of x, i.e. row i of x.grad is the i-th row of the Jacobian.
def get_jacobian(fun, x, noutputs):
    x_size = list(x.size())
    # one copy of x per output component, so each copy can receive its own gradient
    x = x.unsqueeze(0).repeat(noutputs, *([1] * len(x_size))).detach().requires_grad_(True)
    y = fun(x)
    # backprop the identity matrix: row i of x.grad becomes dy_i/dx
    y.backward(torch.eye(noutputs))
    return x.grad.view(noutputs, *x_size)

def fun(x):
    return torch.stack([(x ** 3).sum(1), (x ** 2).sum(1)], -1)

x = torch.tensor([1., 2., 3., 4., 5., 6.], requires_grad=True)
print(get_jacobian(fun, x, 2))
'''Output
tensor([[  3.,  12.,  27.,  48.,  75., 108.],
        [  2.,   4.,   6.,   8.,  10.,  12.]])
'''
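For comparison (my addition): recent PyTorch versions also ship torch.autograd.functional.jacobian, which can cross-check the result. It expects a function of the unbatched input, so the batched fun above is rewritten as a 1-D variant here (fun_1d is a name introduced just for this example).
from torch.autograd.functional import jacobian

def fun_1d(x):
    # same function as fun above, but for a single unbatched x
    return torch.stack([(x ** 3).sum(), (x ** 2).sum()])

print(jacobian(fun_1d, torch.tensor([1., 2., 3., 4., 5., 6.])))  # same 2x6 matrix as above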
Hessian matrix
def Hessian_matrix(fun, x):
    # fun returns a scalar per copy of x; the Hessian is the Jacobian of its gradient
    def get_grad(xx):
        y = fun(xx)
        grad, = torch.autograd.grad(y, xx, create_graph=True,
                                    grad_outputs=torch.ones_like(y))
        return grad

    x_size = x.numel()
    return get_jacobian(get_grad, x, x_size)

weight = torch.rand(3, 2)

def fun(x):
    return (x @ weight).pow(2).sum(-1)

x = torch.tensor([1., 2., 3.], requires_grad=True)
print(Hessian_matrix(fun, x))
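Because fun here is the quadratic form $\|xW\|^2$ with $W$ = weight, its Hessian is the constant matrix $2WW^T$, which gives an easy check (this verification is my addition):
# analytic Hessian of (x @ weight).pow(2).sum(-1) is 2 * weight @ weight.T
print(torch.allclose(Hessian_matrix(fun, x), 2 * weight @ weight.t()))  # True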
Hessian Vector Product
This one comes up often, typically when a neural network's second-order derivative (the Hessian with respect to the parameters) has to be multiplied by some vector.
Since $\frac{\partial^2 f}{\partial x^2} v = \frac{\partial\left((\partial f/\partial x)^{T} v\right)}{\partial x}$, there is no need to form the Hessian matrix explicitly; the outer differentiation can also be replaced by a finite difference (second snippet below).
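As a minimal illustration of this identity (my addition, using an input-space example with throwaway names x0 and v0; the parameter-space version used with a network follows below):
x0 = torch.tensor([1., 2., 3.], requires_grad=True)
v0 = torch.tensor([1., 0., 2.])
f = (x0 ** 3).sum()
g, = torch.autograd.grad(f, x0, create_graph=True)   # df/dx = 3*x0**2
hvp, = torch.autograd.grad((g * v0).sum(), x0)        # d((df/dx)^T v)/dx = H v
print(hvp)  # Hessian of sum(x**3) is diag(6*x), so this prints 6*x0*v0 = [6., 0., 36.]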
The pow(2) after the linear layer is there to make the model nonlinear; with a purely linear model the gradient of y would not depend on the parameters, and the second backward pass would fail.
import torch.nn as nn

class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.linear = nn.Linear(2, 1)

    def forward(self, x):
        # pow(2) keeps the model nonlinear (see note above)
        return self.linear(x).pow(2)

def Hessian_vec_prod(model, x, vecs):
    y = model(x)
    # first-order gradients w.r.t. the parameters, kept in the graph
    grads = torch.autograd.grad(y, model.parameters(), create_graph=True)
    # form (df/dtheta)^T v, then differentiate once more via backward()
    prod = sum([(g * v).sum() for g, v in zip(grads, vecs)])
    prod.backward()
    return [p.grad.detach() for p in model.parameters()]

model = Model()
x = torch.Tensor([1., 2.])
vec = [torch.Tensor([[1., 5.]]), torch.Tensor([2.])]
print(Hessian_vec_prod(model, x, vec))
from copy import deepcopy

def Hessian_vec_prod_diff(model, x, vecs, r=1e-7):
    # central difference: (g(theta + r*v) - g(theta - r*v)) / (2r) approximates H v
    # (with float32 parameters a larger step such as r=1e-3 is usually less noisy)
    def get_grad(model, xx):
        y = model(xx)
        grads = torch.autograd.grad(y, model.parameters())
        return grads

    def add(model, vec, r):
        # perturb a deep copy of the parameters by r * vec
        model = deepcopy(model)
        for par, v in zip(model.parameters(), vec):
            par.data = par.data + r * v
        return model

    model1 = add(model, vecs, r)
    model2 = add(model, vecs, -r)
    g_lefts = get_grad(model1, x)
    g_rights = get_grad(model2, x)
    return [(g_left - g_right) / (2 * r) for g_left, g_right in zip(g_lefts, g_rights)]
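A quick cross-check of the two versions (my addition; it reuses Model, x and vec defined above, and passes a larger step than the default because the parameters are float32):
model = Model()
hvp_exact = Hessian_vec_prod(model, x, vec)
hvp_approx = Hessian_vec_prod_diff(model, x, vec, r=1e-3)
for a, b in zip(hvp_exact, hvp_approx):
    print(torch.allclose(a, b, atol=1e-2))  # True for both parameter tensors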