In [1]:
import torch
import torchvision
import torch.nn as nn
import numpy as np
import torchvision.transforms as transforms
# (torchvision, numpy, and transforms are not needed in this example;
#  they are imported here for use in later sections.)
In [2]:
# Create tensors of shape (10, 3) and (10, 2)
x = torch.randn(10, 3)
y = torch.randn(10, 2)
In [3]:
# Build a fully connected layer
linear = nn.Linear(3, 2)
print('w: ', linear.weight)
print('b: ', linear.bias)
w:  Parameter containing:
tensor([[-0.4826,  0.1809, -0.5195],
        [ 0.0754,  0.3776,  0.2431]], requires_grad=True)
b:  Parameter containing:
tensor([ 0.4482, -0.2941], requires_grad=True)
In [4]:
# Loss function and optimizer
criterion = nn.MSELoss()
optimizer = torch.optim.SGD(linear.parameters(), lr=0.01)
In [5]:
# Forward pass
pred = linear(x)
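Under the hood, nn.Linear computes x @ W^T + b. As a quick sanity check (a minimal sketch, not part of the original notebook), the prediction can be reproduced by hand from the weight and bias printed above:

pred_manual = x.matmul(linear.weight.t()) + linear.bias
print(torch.allclose(pred, pred_manual))  # True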
In [6]:
# Compute loss
loss = criterion(pred, y)
print('loss: ', loss.item())
loss: 0.8285614252090454
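nn.MSELoss with its default reduction='mean' averages the squared error over all 10 x 2 = 20 elements, so the same value can be computed directly (a quick check using the tensors above):

manual_loss = ((pred - y) ** 2).mean()
print(manual_loss.item())  # matches loss.item() above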
In [7]:
# Backward pass
loss.backward()
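Note that .backward() accumulates gradients into the .grad attributes rather than overwriting them, so in a full training loop the gradients must be cleared before each backward pass. The usual pattern (shown as a sketch, not meant to be re-run on this already-consumed graph):

optimizer.zero_grad()  # reset accumulated gradients to zero
loss.backward()        # then compute fresh gradients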
In [8]:
# Print out the gradients
print('dL/dw: ', linear.weight.grad)
print('dL/db: ', linear.bias.grad)
dL/dw:  tensor([[-0.0552, -0.0734, -0.3314],
        [ 0.4246,  0.3199,  0.0844]])
dL/db:  tensor([-0.3375,  0.1542])
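These gradients can also be derived analytically: for L = mean((pred - y)^2) with pred = x W^T + b, we have dL/dW = (2/N) (pred - y)^T x and dL/db = (2/N) sum_i (pred_i - y_i), where N = 20 is the total element count. A sketch verifying this against autograd (not part of the original notebook):

with torch.no_grad():
    n = pred.numel()                          # 10 * 2 = 20 elements
    dW = 2.0 / n * (pred - y).t().matmul(x)   # shape (2, 3)
    db = 2.0 / n * (pred - y).sum(dim=0)      # shape (2,)
    print(torch.allclose(dW, linear.weight.grad))  # True
    print(torch.allclose(db, linear.bias.grad))    # True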
In [9]:
# 1-step gradient descent
optimizer.step()
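optimizer.step() applies the SGD update w <- w - lr * dL/dw to every registered parameter. An equivalent low-level version, shown only for illustration (running it in addition to optimizer.step() would apply the update twice):

with torch.no_grad():
    for param in linear.parameters():
        param -= 0.01 * param.grad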
In [10]:
# Print out the loss after 1-step gradient descent.
pred = linear(x)
loss = criterion(pred, y)
print('loss after 1 step optimization: ', loss.item())
loss after 1 step optimization: 0.8231405019760132
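Putting the pieces together, repeating the forward/backward/update cycle drives the loss down further. A minimal training-loop sketch reusing the objects defined above (the step count of 100 is arbitrary):

for step in range(100):
    optimizer.zero_grad()      # clear accumulated gradients
    pred = linear(x)           # forward pass
    loss = criterion(pred, y)  # compute loss
    loss.backward()            # backward pass
    optimizer.step()           # 1-step gradient descent
print('loss after 100 steps: ', loss.item())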