Loading Pretrained Vectors

It can be extremely useful to give a model an advantageous starting point by initializing its word embeddings with pretrained vectors, such as GloVe.

To do this, we can fill in the values of the embedding matrix from a file of pretrained vectors.

import torch
import torch.nn as nn
from tqdm import tqdm_notebook


def get_pretrained_embeddings(filename, dim_size, token_vocab):
    # One row per vocabulary entry; rows start as zeros and are filled in below.
    embedding_matrix = torch.zeros(len(token_vocab), dim_size)
    # Track which vocabulary words have not yet received a pretrained vector.
    all_words = set(token_vocab.keys())

    with open(filename) as fp:
        for line in tqdm_notebook(fp.readlines(), leave=False):
            line = line.split(" ")
            word = line[0]
            if word not in token_vocab:
                continue
            all_words.remove(word)
            row_index = token_vocab[word]
            embedding_matrix[row_index] = torch.FloatTensor([float(x) for x in line[1:]])

    # Words missing from the pretrained file get a randomly initialized row instead.
    for remaining_word in all_words:
        row_index = token_vocab[remaining_word]
        embedding_matrix[row_index] = torch.nn.init.kaiming_normal_(torch.zeros(1, dim_size))

    return embedding_matrix

Then, we can use this function to build the embedding matrix for our vocabulary:

load_pretrained = True
pretrained_embeddings = None

if load_pretrained:
    pretrained_embeddings = get_pretrained_embeddings("../data/glove.6B.100d.txt",
                                                      dim_size=100,
                                                      token_vocab=dataset.vectorizer.token_vocab)
    embedding_size = pretrained_embeddings.shape[1]
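
As a quick sanity check (a minimal sketch; the 100 here simply matches the 100-dimensional GloVe file loaded above), the matrix should contain one row per vocabulary token:

# Sanity check: one 100-dimensional row per vocabulary token (GloVe 6B, 100d).
vocab_size = len(dataset.vectorizer.token_vocab)
assert pretrained_embeddings.shape == (vocab_size, 100)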

And we can use it in an embedding layer:

emb = nn.Embedding.from_pretrained(embeddings=pretrained_embeddings, freeze=False, padding_idx=0)
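
Because freeze=False, the pretrained weights stay trainable and will be fine-tuned along with the rest of the model. As a quick usage sketch (the token indices below are made up for illustration; in practice they come from the vectorizer), the layer maps a batch of token indices to their pretrained vectors:

# Illustrative input: one sequence of four token indices (0 is the padding index set above).
token_indices = torch.tensor([[4, 17, 9, 0]])
vectors = emb(token_indices)
print(vectors.shape)  # torch.Size([1, 4, 100]) -- (batch, sequence length, embedding_size)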