pytorch简单总结

模型构造方法

继承Module类
使用继承自Module的类：Sequential、ModuleList、ModuleDict

继承Module类，这种方式比较灵活

import torch
from torch import nn

class MLP(nn.Module):
    """Multilayer perceptron: Linear(784, 256) -> ReLU -> Linear(256, 10)."""

    def __init__(self, **kwargs):
        # Run nn.Module's own initialisation first; any extra keyword
        # arguments are passed straight through to it.
        super().__init__(**kwargs)
        self.hidden = nn.Linear(784, 256)  # hidden layer
        self.act = nn.ReLU()
        self.output = nn.Linear(256, 10)  # output layer

    def forward(self, x):
        """Forward computation: map the input ``x`` to the model output."""
        hidden_out = self.hidden(x)
        activated = self.act(hidden_out)
        return self.output(activated)

# Demo: build the MLP defined above and run it on a random batch.
X = torch.rand(2, 784)  # 2 samples with 784 features each
net = MLP()
print(net)  # print the module structure
net(X)  # calling the module invokes forward(); the result is not stored

使用Sequential，方便的搭建顺序网络，不需要手动实现forward

# 自己继承Module来实现一个Sequential类

from collections import OrderedDict


class MySeq(nn.Module):
    """Minimal re-implementation of ``nn.Sequential`` on top of ``nn.Module``.

    Args:
        *args: Either a single ``OrderedDict`` mapping names to modules, or
            any number of modules given positionally. Positional modules are
            registered under the string form of their index ("0", "1", ...).
    """

    def __init__(self, *args):
        super(MySeq, self).__init__()
        if len(args) == 1 and isinstance(args[0], OrderedDict):
            # An OrderedDict was passed: keep the caller's names.
            for key, module in args[0].items():
                # add_module stores the module in self._modules.
                self.add_module(key, module)
        else:
            # Plain modules were passed: name them by position.
            for idx, module in enumerate(args):
                self.add_module(str(idx), module)

    def forward(self, input):
        """Feed ``input`` through every registered module in insertion order."""
        # self._modules is iterated in the order the members were added.
        for module in self._modules.values():
            input = module(input)
        return input

# Demo: build the same 784 -> 256 -> 10 network with MySeq by passing the
# layers positionally.
net = MySeq(
        nn.Linear(784, 256),
        nn.ReLU(),
        nn.Linear(256, 10),
        )

print(net)  # print the module structure
net(X)  # X is expected to be a batch with 784 features per sample

使用ModuleList

Sequential与ModuleList的区别在于：Sequential中的模块按添加顺序依次执行，因此相邻模块的输入输出大小必须匹配，它在内部已经帮我们实现了forward；而ModuleList只是一个存放模块的列表，模块之间没有预先定义的联系和执行顺序，需要自己实现forward。也就是说，ModuleList中的模块不一定位于不同的层，也可能出现在同一层。并且加入到ModuleList中的模块的参数会自动添加到网络中，一般的python list则不会。

class MyModule(nn.Module):
    """Ten ``Linear(10, 10)`` layers stored in an ``nn.ModuleList``.

    The forward pass is hand-written because a ModuleList does not define
    how its members are connected.
    """

    def __init__(self):
        super().__init__()
        # Wrapping the layers in a ModuleList makes their parameters show up
        # in the module; with a plain Python list they would not be printed.
        layers = [nn.Linear(10, 10) for _ in range(10)]
        self.linears = nn.ModuleList(layers)

    def forward(self, x):
        """Combine each layer with the layer at half its index."""
        # A ModuleList can be iterated over and indexed with integers, so
        # the wiring between layers is entirely up to this method.
        for idx, layer in enumerate(self.linears):
            paired = self.linears[idx // 2]
            x = paired(x) + layer(x)
        return x

# Demo: list the shape of every parameter registered on MyModule.
net1 = MyModule()
for p in net1.parameters():
    print(p.size())

ModuleDict

它的特性与ModuleList很像：模块之间没有预先定义的执行顺序、模块的参数会自动添加到网络中、需要手动实现forward。

# Demo: nn.ModuleDict stores sub-modules under string keys.
net = nn.ModuleDict({
    'linear': nn.Linear(784, 256),
    'act': nn.ReLU(),
})
net['output'] = nn.Linear(256, 10) # add a module under a new key
print(net['linear']) # access by key
print(net.output)  # access as an attribute
print(net)
# net(torch.zeros(1, 784)) # would raise NotImplementedError

复杂模型

复杂模型一般继承前面的Module类来自己写。

class FancyMLP(nn.Module):
    """MLP that mixes a constant weight, a reused layer and Python control flow.

    The forward pass returns a scalar tensor (the sum of the final
    activations).
    """

    def __init__(self, **kwargs):
        super(FancyMLP, self).__init__(**kwargs)

        # Constant (non-trainable) weight. It is registered as a buffer rather
        # than stored as a plain tensor attribute so that it follows the module
        # through .to()/.cuda()/.double() and is included in state_dict();
        # buffers are not returned by parameters(), so it is never optimised.
        self.register_buffer("rand_weight", torch.rand(20, 20))
        self.linear = nn.Linear(20, 20)

    def forward(self, x):
        """Return a scalar computed from ``x``.

        ``x`` is multiplied by a 20x20 matrix, so it must be 2-D with 20
        columns.
        """
        x = self.linear(x)
        # Use the constant weight together with relu and mm.
        x = nn.functional.relu(torch.mm(x, self.rand_weight) + 1)

        # Reuse the same fully connected layer; this is equivalent to two
        # layers sharing one set of parameters.
        x = self.linear(x)
        # Control flow: item() turns the norm into a Python scalar so it can
        # be used in an ordinary comparison.
        while x.norm().item() > 1:
            x /= 2
        if x.norm().item() < 0.8:
            x *= 10
        return x.sum()

# Demo: run FancyMLP on a random batch of 2 samples with 20 features.
X = torch.rand(2, 20)
net = FancyMLP()
print(net)  # print the module structure
net(X)  # result is not stored

嵌套调用

这些类都是继承自Module类的子类，因此可以嵌套调用它们。

class NestMLP(nn.Module):
    """Module that wraps an inner ``nn.Sequential`` (Linear(40, 30) -> ReLU)."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        inner_layers = (nn.Linear(40, 30), nn.ReLU())
        self.net = nn.Sequential(*inner_layers)

    def forward(self, x):
        """Delegate the forward pass to the wrapped Sequential."""
        out = self.net(x)
        return out


# Build a network by nesting modules: a custom module, a plain layer and
# another custom module are chained inside one nn.Sequential.
net = nn.Sequential(NestMLP(), nn.Linear(30, 20), FancyMLP())

X = torch.rand(2, 40)  # 40 features to match the first Linear in NestMLP
print(net)
net(X)  # result is not stored

输出

Sequential(
  (0): NestMLP(
    (net): Sequential(
      (0): Linear(in_features=40, out_features=30, bias=True)
      (1): ReLU()
    )
  )
  (1): Linear(in_features=30, out_features=20, bias=True)
  (2): FancyMLP(
    (linear): Linear(in_features=20, out_features=20, bias=True)
  )
)
tensor(14.4908, grad_fn=<SumBackward0>)

参数的访问、初始化和共享

访问

import torch
from torch import nn
from torch.nn import init

# Demo: build a small network and inspect its parameters.
net = nn.Sequential(
    nn.Linear(4, 3),
    nn.ReLU(),
    nn.Linear(3, 1)
)                            # PyTorch initializes the parameters by default


for p in net.parameters():   # access every parameter
    print(p)
    
print(net)
X = torch.rand(3, 4)  # 3 samples with 4 features each
Y = net(X).sum()  # reduce the output to a scalar
print(Y)



class MyModel(nn.Module):
    """Contrast between an ``nn.Parameter`` attribute and a plain tensor attribute.

    ``weight1`` is wrapped in ``nn.Parameter`` and therefore appears in
    ``named_parameters()``; ``weight2`` is a plain tensor and does not.
    """

    def __init__(self, **kwargs):
        super(MyModel, self).__init__(**kwargs)
        # Parameter is a subclass of Tensor, but it is automatically added to
        # the module's parameters.
        self.weight1 = nn.Parameter(torch.rand(20, 20))

        # Plain tensor attribute: deliberately NOT registered as a parameter.
        self.weight2 = torch.rand(20, 20)

    def backward(self):
        """Placeholder that does nothing and returns ``None``.

        Takes ``self`` so that it can be called on an instance.
        """
        pass

# Demo: list the registered parameters of MyModel and their gradients.
n = MyModel()
for name, param in n.named_parameters():
    print(name)
    print(param.grad)  # no backward pass has been run in this snippet
    
>>> weight1
>>> None

初始化

# Initialise parameters by name: weights are drawn from a normal distribution
# (mean 0, std 0.01) and biases are set to the constant 0.
for name, param in net.named_parameters():
    if 'weight' in name:
        init.normal_(param, mean=0, std=0.01)
        print(name, param.data)
    elif 'bias' in name:
        init.constant_(param, val=0)
        print(name, param.data)

        
## 这是torch中的normal_方法，实际就是一个改变值且不记录梯度的函数
# def normal_(tensor, mean=0, std=1):
#     with torch.no_grad():
#         return tensor.normal_(mean, std)
# 仿造这个方法可以自定义初始化函数
def init_weight_(tensor):
    """Custom in-place initialiser.

    Fills ``tensor`` with uniform samples between -10 and 10, then zeroes
    every entry whose absolute value is below 5. Gradient tracking is
    disabled while the values are rewritten. Returns ``None``.
    """
    with torch.no_grad():
        tensor.uniform_(-10, 10)
        # 1.0 where |value| >= 5, otherwise 0.0.
        keep_mask = (tensor.abs() >= 5).float()
        tensor.mul_(keep_mask)

# Apply the custom initialiser to every weight of the network.
for name, param in net.named_parameters():
    if 'weight' in name:
        init_weight_(param)
        print(name, param.data)

# Parameters can also be rewritten through .data so that the change does not
# affect the gradient.
for name, param in net.named_parameters():
    if 'bias' in name:
        param.data += 1
        print(name, param.data)

共享

除了前面提到的在forward中多次调用同一个层时参数是共享的之外，如果传入Sequential的多个模块是同一个Module实例，它们的参数也是共享的。

# Demo: passing the same Module instance twice shares its parameters.
linear = nn.Linear(1, 1, bias=False)  # note: this is a single Module instance
net = nn.Sequential(linear, linear) 
print(net)
for name, param in net.named_parameters():
    init.constant_(param, val=3)
    print(name, param.data)

x = torch.ones(1, 1)
y = net(x).sum()
print(y)
y.backward()
print(net[0].weight.grad) # each use contributes a gradient of 3; used twice, so 6

# y = 1 * 3 * 3 = 9. Both positions hold the same weight (3); gradients of a
# shared parameter are accumulated, so the gradient is 6.

自定义层

定义一个简单层，不含参数，仅计算输入数据减去均值后的值

import torch
from torch import nn

class CenteredLayer(nn.Module):
    """Parameter-free layer that subtracts the mean of its input."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def forward(self, x):
        """Return ``x`` minus the mean taken over all of its elements."""
        mean_value = x.mean()
        return x - mean_value
    
# Demo: apply the centering layer to a small float tensor.
layer = CenteredLayer()
layer(torch.tensor([1, 2, 3, 4, 5], dtype=torch.float))


# A custom layer can be added to a network like any other module.
net = nn.Sequential(nn.Linear(8, 128), CenteredLayer())

y = net(torch.rand(4, 8))
y.mean().item()  # mean of the centered output as a Python number

对于含模型参数的层，除了前面用Parameter一个一个添加外，还可以使用ParameterList和ParameterDict

# ParameterList

class MyListDense(nn.Module):
    """Layer whose weights live in an ``nn.ParameterList``.

    Holds three 4x4 matrices followed by one 4x1 matrix; the forward pass
    multiplies the input by each of them in turn.
    """

    def __init__(self):
        super(MyListDense, self).__init__()
        self.params = nn.ParameterList([nn.Parameter(torch.randn(4, 4)) for i in range(3)])
        self.params.append(nn.Parameter(torch.randn(4, 1)))  # append adds one more parameter

    def forward(self, x):
        """Return ``x`` matrix-multiplied by every stored parameter in order."""
        # Iterate the ParameterList directly instead of indexing by position.
        for weight in self.params:
            x = torch.mm(x, weight)
        return x


# Instantiate the class defined above (the name must match the class name).
net = MyListDense()
print(net)

# ParameterDict

class MyDictDense(nn.Module):
    """Layer whose weights live in an ``nn.ParameterDict`` and are chosen by key."""

    def __init__(self):
        super().__init__()
        initial = {
            'linear1': nn.Parameter(torch.randn(4, 4)),
            'linear2': nn.Parameter(torch.randn(4, 1)),
        }
        self.params = nn.ParameterDict(initial)
        # update() adds further entries after construction.
        extra = {'linear3': nn.Parameter(torch.randn(4, 2))}
        self.params.update(extra)

    def forward(self, x, choice='linear1'):
        """Multiply ``x`` by the weight stored under ``choice``."""
        # The key selects which weight is used for this call.
        weight = self.params[choice]
        return torch.mm(x, weight)
    

# Demo: select the weight used by MyDictDense through its key.
net = MyDictDense()
print(net)

x = torch.ones(1, 4)
print(net(x, 'linear1'))   # pass a key to choose which weight is applied
print(net(x, 'linear2'))
print(net(x, 'linear3'))

net = nn.Sequential(    # custom layers can also be used to build a network
    MyDictDense(),
    MyListDense(),
)
print(net)
print(net(x))

读写与保存

读写Tensor

可以直接使用save函数和load函数分别存储和读取Tensor。save使用Python的pickle工具将对象序列化，然后将序列化后的对象保存到磁盘；使用save可以保存各种对象，包括模型、张量和字典等。而load使用pickle的反序列化(unpickle)功能，将保存的对象文件反序列化到内存中。

import torch
from torch import nn

# Demo: round-trip tensors through torch.save / torch.load.
x = torch.ones(3)
torch.save(x, 'x.pt')             # store a single Tensor
x2 = torch.load('x.pt')
print(x2)

y = torch.zeros(4)
torch.save([x, y], 'xy.pt')       # store a list of Tensors
xy_list = torch.load('xy.pt')
print(xy_list)

torch.save({'x': x, 'y': y}, 'xy_dict.pt') # store a dict of Tensors
xy = torch.load('xy_dict.pt')
print(xy)

模型的保存与加载

PyTorch中，Module的可学习参数(即权重和偏差)包含在模型的参数中(通过model.parameters()访问)。state_dict是一个从参数名称映射到参数Tensor的字典对象。

class MLP(nn.Module):
    """Small perceptron: Linear(3, 2) -> ReLU -> Linear(2, 1)."""

    def __init__(self):
        super().__init__()
        self.hidden = nn.Linear(3, 2)
        self.act = nn.ReLU()
        self.output = nn.Linear(2, 1)

    def forward(self, x):
        """Compute the model output for input ``x``."""
        hidden_out = self.hidden(x)
        activated = self.act(hidden_out)
        return self.output(activated)

# Demo: inspect the state_dict of a model and of an optimizer.
net = MLP()
net.state_dict()
# Only layers with learnable parameters (convolutional layers, linear layers,
# etc.) have entries in state_dict. An optimizer (optim) also has a state_dict,
# which contains information about the optimizer state and the hyperparameters
# in use.
optimizer = torch.optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
optimizer.state_dict()

模型保存和加载一般有两种：

仅保存和加载模型参数(state_dict) 。（推荐）
保存和加载整个模型。

# NOTE(review): model, TheModelClass, args and kwargs are not defined in this
# snippet - it reads as a template rather than runnable code.

# Approach 1: save and load only the parameters (state_dict)
# Save
torch.save(model.state_dict(), PATH)       # recommended file suffix is pt or pth
# Load
model = TheModelClass(*args, **kwargs)
model.load_state_dict(torch.load(PATH))

# Approach 2: save and load the whole model
# Save
torch.save(model, PATH)
# Load
model = torch.load(PATH)                   # load the entire model directly

以方法一为例：

# Demo of approach 1: save the parameters, load them into a fresh instance and
# compare the outputs of both models on the same input.
X = torch.randn(2, 3)
Y = net(X)

PATH = "./net.pt"
torch.save(net.state_dict(), PATH)     # save

net2 = MLP()                           # create a new instance
net2.load_state_dict(torch.load(PATH)) # load the saved parameters into it
Y2 = net2(X)
Y2 == Y  # element-wise comparison of the two outputs

GPU

查看GPU信息

import torch
from torch import nn

torch.cuda.is_available() # whether a GPU is available

torch.cuda.device_count() # number of GPUs

torch.cuda.current_device() # index of the current GPU, starting from 0

torch.cuda.get_device_name(0) # name of the GPU with the given index

数据转化为GPU数据

# Demo: move a tensor to the GPU.
x = torch.tensor([1, 2, 3])
x = x.cuda()  # NOTE(review): presumably requires an available GPU - confirm
print(x)
print(x.device)

# The device can also be specified when the tensor is created.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

x = torch.tensor([1, 2, 3], device=device)
# or
x = torch.tensor([1, 2, 3]).to(device)
x

# Data stored on different devices cannot be used together in a computation
# directly; this also applies to data on different GPUs.

模型转移到GPU

# Demo: check which device a model's parameters live on, before and after
# moving the model.
net = nn.Linear(3, 1)
list(net.parameters())[0].device
# A model is on the CPU by default; cuda() moves it to the GPU. Note that the
# input data and the parameters must be on the same device.

net.cuda()
list(net.parameters())[0].device