I'm trying to load GloVe vectors with the following code:
en_model = gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=False)
and I unexpectedly get the following error:
File "/home/k/Desktop/Work/Vector explorer/word2vec-explorer/vec_test_loader.py", line 55, in make_model
en_model = KeyedVectors.load_word2vec_format(model_path, binary=is_bin)
File "/home/k/.local/lib/python3.5/site-packages/gensim/models/keyedvectors.py", line 1119, in load_word2vec_format
limit=limit, datatype=datatype)
File "/home/k/.local/lib/python3.5/site-packages/gensim/models/utils_any2vec.py", line 175, in _load_word2vec_format
vocab_size, vector_size = (int(x) for x in header.split()) # throws for invalid file format
File "/home/k/.local/lib/python3.5/site-packages/gensim/models/utils_any2vec.py", line 175, in <genexpr>
vocab_size, vector_size = (int(x) for x in header.split()) # throws for invalid file format
ValueError: invalid literal for int() with base 10: 'the'
Can someone help?
Gensim needs more information about the file at model_path than a raw GloVe file provides.
We have to prepend a header line containing two numbers: the first is the number of words in the vocabulary and the second is the dimensionality of the word embeddings. The file should then look like this:
101 300
the 1.0 2.1 -1.3 ...
I 1.1 0.2 -0.3 ...
.
.
.
You can do this conversion with a single command:
python -m gensim.scripts.glove2word2vec --input glove.840B.300d.txt --output glove.840B.300d.w2vformat.txt
Or you can use my code below as a reference:
import gensim
import os
import shutil
import hashlib
from sys import platform
def getFileLineNums(filename):
    """Return the number of lines in the text file *filename*.

    The file is opened in text mode and iterated line by line, so it is
    never read into memory as a whole.

    Args:
        filename: Path of the file whose lines are counted.

    Returns:
        The number of lines as an int (0 for an empty file).
    """
    # The context manager guarantees the handle is closed, even if reading
    # raises part-way through, so no file descriptor is leaked.
    with open(filename, 'r') as f:
        return sum(1 for _ in f)
def prepend_line(infile, outfile, line):
    """Create *outfile* as *line* followed by the full contents of *infile*.

    Args:
        infile: Path of the existing text file to copy from.
        outfile: Path of the file to create (overwritten if it exists).
        line: Header to put first; it is converted with ``str()`` and a
            newline is added after it.
    """
    with open(infile, 'r') as src, open(outfile, 'w') as dst:
        header = str(line) + "\n"
        dst.write(header)
        # Stream the remainder across in chunks instead of line by line.
        shutil.copyfileobj(src, dst)
def prepend_slow(infile, outfile, line):
    """Create *outfile* as *line* followed by every line of *infile*.

    Produces the same output as ``prepend_line`` but copies the input one
    line at a time instead of using ``shutil.copyfileobj``.

    Args:
        infile: Path of the existing text file to copy from.
        outfile: Path of the file to create (overwritten if it exists).
        line: Header to put first; it is converted with ``str()`` and a
            newline is added after it.
    """
    with open(infile, 'r') as fin, open(outfile, 'w') as fout:
        # str() keeps this in step with prepend_line, so a non-string
        # header (e.g. an int) is accepted rather than raising TypeError.
        fout.write(str(line) + "\n")
        # writelines consumes the file iterator lazily and avoids rebinding
        # the `line` parameter as a loop variable.
        fout.writelines(fin)
def load(filename, vector_size=300, gensim_file='glove_model.txt'):
    """Load a headerless GloVe text file through gensim's word2vec loader.

    A copy of *filename* is written to *gensim_file* with a first line of
    the form ``"<line count> <vector_size>"``, and that copy is passed to
    ``gensim.models.KeyedVectors.load_word2vec_format``.

    Args:
        filename: Path of the GloVe vectors file without a header line.
        vector_size: Dimensionality of the embeddings in *filename*. It
            must match the file (e.g. 50, 100, 200 or 300); the default
            of 300 keeps the original behaviour.
        gensim_file: Path of the converted file to create; it is
            overwritten if it already exists.

    Returns:
        The object returned by ``load_word2vec_format`` for the converted
        file.
    """
    # NOTE(review): the line count is used as the vocabulary size, which
    # assumes exactly one word vector per line and no blank lines.
    num_lines = getFileLineNums(filename)
    gensim_first_line = "{} {}".format(num_lines, vector_size)
    # prepend_line relies only on shutil.copyfileobj, which is available on
    # every platform, so no OS-specific dispatch is needed.
    prepend_line(filename, gensim_file, gensim_first_line)
    return gensim.models.KeyedVectors.load_word2vec_format(gensim_file)
# Example usage: `your_model_path` is a placeholder for the path to the GloVe
# vectors file that lacks the header line.
model = load(your_model_path)