In [1]:
import gensim
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.test.utils import datapath, get_tmpfile
import numpy as np
In [3]:
# Function for loading the glove word embedding matrix values
def load_glove_vectors(glove_file):
    """Load GloVe embeddings from a plain-text file.

    Each line of the file has the form ``<word> <v1> <v2> ... <vN>``.

    Arguments:
        glove_file: path to the GloVe .txt file (UTF-8 encoded)

    Returns:
        (words, word_to_vec): the set of vocabulary words, and a dict
        mapping each word to its embedding as a float64 numpy array.
    """
    vocabulary = set()
    embeddings = {}
    with open(glove_file, 'r', encoding="utf-8") as handle:
        for raw_line in handle:
            tokens = raw_line.strip().split()
            # first token is the word itself; the rest are its features
            word = tokens[0]
            vocabulary.add(word)
            embeddings[word] = np.array(tokens[1:], dtype=np.float64)
    return vocabulary, embeddings
In [5]:
# Load the pretrained 50-dimensional GloVe embeddings.
# NOTE: glove.6B.50d.txt must be present in the working directory
# (download from the Stanford GloVe page) or this cell will fail.
words, word_to_vec = load_glove_vectors("glove.6B.50d.txt")
Cosine Similarity¶
We will be using cosine similarity for finding the suitable word. We will $e_b - e_a$ and $e_d - e_c$ as the two vectors to find their cosine, where $e_d$ is searched from all the other words in the vocabulary.
Given two vectors $u$ and $v$, cosine similarity is defined as follows:
$\text{Cosine Similarity}(u, v) = \frac{u \cdot v}{||u||_2 \, ||v||_2} = \cos(\theta)$
where $u \cdot v$ is the dot product of the two vectors, $||u||_2$ is the norm of the vector $u$, and $\theta$ is the angle between $u$ and $v$.
This similarity depends on the angle between $u$ and $v$.
If $u$ and $v$ are very similar, their cosine similarity will be close to $1$
If they are dissimilar, the cosine similarity will take a smaller value.
In [8]:
# finds the cosine similarity between u and v
def find_cosine_similarity(u, v):
    """Compute the cosine similarity between two vectors.

    Arguments:
        u: np.ndarray of shape (n,) -- word embedding vector
        v: np.ndarray of shape (n,) -- word embedding vector

    Returns:
        cosine_sim: float -- u . v / (||u||_2 * ||v||_2).
        As in the original, no zero-norm guard is applied, so a
        zero vector yields nan/inf via numpy's usual warnings.
    """
    # dot product of u and v
    dot = np.dot(u, v)
    # L2 norms via numpy's standard helper
    norm_u = np.linalg.norm(u)
    norm_v = np.linalg.norm(v)
    cosine_sim = dot / (norm_u * norm_v)
    return cosine_sim
In [10]:
# sample words
# Pull a few embedding vectors out of the vocabulary to sanity-check
# the similarity function on related / analogous word pairs.
father = word_to_vec["father"]
mother = word_to_vec["mother"]
king = word_to_vec["king"]
queen = word_to_vec["queen"]
bat = word_to_vec["bat"]
crow = word_to_vec["crow"]
india = word_to_vec["india"]
italy = word_to_vec["italy"]
delhi = word_to_vec["delhi"]
rome = word_to_vec["rome"]
love = word_to_vec["love"]
like = word_to_vec["like"]
hate = word_to_vec["hate"]
# Related word pairs should score close to 1; difference vectors of
# parallel analogies (king-queen vs father-mother) should also align.
print("cosine_similarity(king, queen) = ", find_cosine_similarity(king, queen))
print("cosine_similarity(father, mother) = ", find_cosine_similarity(father, mother))
print("cosine_similarity(king - queen, father - mother) = ",find_cosine_similarity(king - queen, father - mother))
print("cosine_similarity(bat, crow) = ",find_cosine_similarity(bat, crow))
# note the vector order: (india - delhi) vs (rome - italy) points in
# roughly opposite directions, hence the negative similarity below
print("cosine_similarity(india - delhi, rome - italy) = ",find_cosine_similarity(india - delhi, rome - italy))
print("cosine_similarity(love, like) = ", find_cosine_similarity(love, like))
cosine_similarity(king, queen) = 0.7839043010964117 cosine_similarity(father, mother) = 0.8909038442893615 cosine_similarity(king - queen, father - mother) = 0.661889473579435 cosine_similarity(bat, crow) = 0.41574518317394416 cosine_similarity(india - delhi, rome - italy) = -0.6363974204130605 cosine_similarity(love, like) = 0.7682945294633257
In [12]:
# Word analogy task: a is to b as c is to ____
def find_analogy(word_a, word_b, word_c, word_to_vec):
    """Complete the analogy  a : b :: c : ?  by vector arithmetic.

    Scans every word d in the vocabulary (excluding a, b, c) and returns
    the one whose offset (e_d - e_c) is most cosine-similar to (e_b - e_a).

    Arguments:
        word_a, word_b, word_c: input words (case-insensitive)
        word_to_vec: dict mapping word -> embedding vector

    Returns:
        best_word: the vocabulary word that best completes the analogy.

    Raises:
        KeyError: if any of the three input words is not in word_to_vec.
    """
    # normalize case before lookup
    word_a, word_b, word_c = word_a.lower(), word_b.lower(), word_c.lower()
    e_a, e_b, e_c = word_to_vec[word_a], word_to_vec[word_b], word_to_vec[word_c]
    # the relation vector we want candidate offsets to match
    target = e_b - e_a
    excluded = (word_a, word_b, word_c)
    best_word = None
    best_score = -999
    # exhaustive search over the vocabulary
    for candidate in word_to_vec.keys():
        # the inputs themselves are never valid answers
        if candidate in excluded:
            continue
        score = find_cosine_similarity(target, word_to_vec[candidate] - e_c)
        if score > best_score:
            best_score = score
            best_word = candidate
    return best_word
In [14]:
# (a, b, c) triples for the analogy task: solve a -> b :: c -> ?
examples = [('france', 'paris', 'japan'), ('tall', 'taller', 'small'), ('morning', 'breakfast', 'evening')]
for example in examples:
    print ('{} -> {} :: {} -> {}'.format( *example, find_analogy(*example, word_to_vec)))
france -> paris :: japan -> tokyo tall -> taller :: small -> outnumber morning -> breakfast :: evening -> dinners
In [16]:
# for taking input from the user and doing word analogy task on that
def take_input():
    """Prompt the user for three words and print the analogy completion.

    Reads "a b c" from stdin and prints d such that a : b :: c : d,
    using find_analogy over the globally loaded word_to_vec.
    """
    print('a --> b :: c --> d')
    print('Enter a, b, c words separated by space')
    # split() with no argument collapses runs of whitespace, so extra
    # spaces between words no longer produce empty tokens (bug in the
    # original split(' ')).
    words = input().split()
    # guard against wrong arity before unpacking into find_analogy
    if len(words) != 3:
        print('Please enter exactly three words.')
        return
    best_pick = find_analogy(*words, word_to_vec)
    print ('{} -> {} :: {} -> {}'.format( *words, best_pick))
    print('Best pick: ' + best_pick)
In [18]:
# Run the interactive analogy prompt (reads three words from stdin)
take_input()
a --> b :: c --> d Enter a, b, c words separated by space
king -> queen :: boy -> girl Best pick: girl
In [ ]: