```
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
dtype = torch.float32
X = torch.tensor([[1, 2, 3, 4, 5, 6]], dtype=dtype)
Y = torch.tensor([[1, 4, 9, 16, 25, 36]], dtype=dtype)
class Model(nn.Module):
def __init__(self):
super(Model, self).__init__()
torch.manual_seed(3)
self.base_l1 = torch.nn.Linear(6, 6, bias=True)
self.base_l2 = torch.nn.Linear(6, 6, bias=True)
self.l3 = torch.nn.Linear(6, 6, bias=True)
self.l4 = torch.nn.Linear(6, 6, bias=True)
def forward(self, x):
x1 = self.base_l1(x)
x1 = F.relu(x1)
x1 = self.base_l2(x1)
x2 = x1#.detach()
x2 = F.relu(x2)
x2 = self.l3(x2)
x2 = F.relu(x2)
x2 = self.l4(x2)
return x2, x1
model = Model()
my_list = ['base']
base_params = list(filter(lambda kv: my_list[0] in kv[0], model.named_parameters()))
params = list(filter(lambda kv: my_list[0] not in kv[0], model.named_parameters()))
prms = []
base_prms = []
for i in params:
prms.append(i[1])
for i in base_params:
base_prms.append(i[1])
# these 2 optimizers does not have any common parametrs
optimizer1 = optim.SGD(base_prms, lr=.05, momentum=0.9)
optimizer2 = optim.SGD(prms, lr=.05, momentum=0.9)
Loss = nn.MSELoss()
l1 = []
l2 = []
n_iters = 300
for epoch in range(n_iters):
print(epoch)
optimizer1.zero_grad()
optimizer2.zero_grad()
y_pred2, y_pred1 = model(X)
loss2 = Loss(y_pred2, Y)
l2.append(loss2)
loss2.backward()
optimizer2.step()
# a trick to avoid the RuntimeError: Trying to backward through the graph a second
# time ...
y_pred1 = torch.tensor(y_pred1, requires_grad=True)
loss1 = Loss(y_pred1, Y)
l1.append(loss1)
loss1.backward()
optimizer1.step()
```

**

by running this version we observe a very poor performance and we could even argue that y_pred1 part is not getting any closer to the target Y. However, if we rerun the same version by assigning to x2= x1.detach() in the forward function and cancelling the mentioned trick of restarting the y_pred1 tensor we get a very satisfying accuracy.

I could also join the gradient graph of this model you could easily notice that l1_base and l2_base layers can be learned without any help from l3 and l4 and vice versa if we backpropagate separately from the green rectangles presenting y_pred2 (the lower one) and y_pred1 (the upper one)