Understanding and Visualizing Word Embeddings¶
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')
# Text preprocessing: tokenisation, stop-word removal, and stemming
def preprocess_text(text):
    # Tokenize the text
    tokens = word_tokenize(text.lower())
    # Remove stop words and non-alphanumeric tokens (punctuation)
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token.isalnum() and token not in stop_words]
    # Reduce each remaining token to its stem
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]
    return stemmed_tokens
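For example, calling the helper on a short sentence shows all three steps at work (the exact stems depend on the Porter stemmer):
preprocess_text("I love neural networks and deep learning!")
# e.g. ['love', 'neural', 'network', 'deep', 'learn'] - stop words and punctuation dropped, suffixes stemmed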
GloVe¶
GloVe, short for “Global Vectors,” is a word embedding technique that builds word vectors from co-occurrence statistics gathered over a large text corpus. It captures both global and local information by examining word pairs and how often they occur together across the corpus.
Think of GloVe like a treasure map. By examining the relationships between words in a text corpus, it uncovers hidden patterns and creates word embeddings that capture the meaning of each word.
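To make the co-occurrence idea concrete, the short sketch below counts how often word pairs appear within a small context window of each other; GloVe is trained so that dot products of word vectors approximate the (log) co-occurrence counts gathered this way. The window size and toy sentence here are illustrative choices, not part of GloVe itself.
from collections import Counter

def cooccurrence_counts(tokens, window=2):
    # Count how often each pair of words appears within `window` positions of each other
    counts = Counter()
    for i, word in enumerate(tokens):
        for j in range(max(0, i - window), min(len(tokens), i + window + 1)):
            if i != j:
                counts[(word, tokens[j])] += 1
    return counts

toy_tokens = "deep learning models learn from data and deep models need data".split()
cooccurrence_counts(toy_tokens)[("deep", "learning")]  # 1: "deep" and "learning" are adjacent once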
Word2Vec¶
Word2Vec is another popular word embedding technique that comes in two flavors: Continuous Bag-of-Words (CBOW) and Skip-gram. CBOW predicts a target word based on its context, while Skip-gram predicts the context words given a target word.
Imagine Word2Vec as a jigsaw puzzle. It learns word embeddings by trying to fit words together based on their context, creating a high-dimensional representation that captures the meaning of each word.
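In gensim (version 4 or later, where the parameter is vector_size rather than size), the two variants are selected with the sg flag: sg=0 for CBOW (the default) and sg=1 for Skip-gram. A minimal sketch on a toy corpus:
from gensim.models import Word2Vec

toy_sentences = [["neural", "networks", "learn", "from", "data"],
                 ["deep", "learning", "uses", "neural", "networks"]]

# CBOW: predict the centre word from its surrounding context words
cbow_model = Word2Vec(toy_sentences, sg=0, vector_size=50, window=2, min_count=1)

# Skip-gram: predict the surrounding context words from the centre word
skipgram_model = Word2Vec(toy_sentences, sg=1, vector_size=50, window=2, min_count=1)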
import numpy as np
from gensim.models import Word2Vec
raw_text = """
I love neural networks, machine learning, and deep learning. It is awesome! Language models are wonderful to learn.
Neural networks and deep learning play a critical role in pushing the boundaries of what AI can achieve,
making them indispensable for various industries and applications. Their ability to learn and adapt from
data has revolutionised many fields and opened up new opportunities for solving complex problems.
This course provides an introduction to and deep exploration of neural networks and deep learning
principles and practice
"""
preprocessed_tokens = preprocess_text(raw_text)
# Load GloVe embeddings
glove_embeddings = {}
with open("glove.6B.50d.txt", "r", encoding="utf-8") as file:
    for line in file:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype="float32")
        glove_embeddings[word] = vector
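As a quick sanity check (assuming the 50-dimensional glove.6B.50d.txt file has been downloaded and both words are in its vocabulary), we can confirm the vector dimensionality and compare two related words with cosine similarity:
def cosine_similarity(a, b):
    # Cosine of the angle between two vectors; values closer to 1 mean more similar
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

print(glove_embeddings["learning"].shape)  # (50,)
print(cosine_similarity(glove_embeddings["learning"], glove_embeddings["neural"]))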
# Train Word2Vec embeddings
model = Word2Vec([preprocessed_tokens], min_count=1, vector_size=50, workers=4)
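With so little training text the learned vectors are mostly noise, but the model can still be queried. Note that its vocabulary contains the stemmed tokens produced above (for example 'learn' rather than 'learning'):
print(model.wv.most_similar("learn", topn=3))  # nearest stemmed tokens by cosine similarity
print(model.wv["neural"].shape)                # (50,)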
Visualizing Word Embeddings using t-SNE¶
t-SNE (t-distributed Stochastic Neighbor Embedding) is a dimensionality reduction technique that allows us to visualize high-dimensional data in a 2D space.
In our case, we’ll use t-SNE to visualize the word embeddings generated by GloVe and Word2Vec models.
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
def visualize_embeddings(embeddings, words):
    tsne = TSNE(n_components=2, random_state=0, perplexity=len(words) - 1)
    embedding_vectors = np.array([embeddings[word] for word in words])
    two_d_embeddings = tsne.fit_transform(embedding_vectors)

    plt.figure(figsize=(8, 8))
    for i, word in enumerate(words):
        x, y = two_d_embeddings[i, :]
        plt.scatter(x, y)
        plt.annotate(word, (x, y), xytext=(5, 2), textcoords="offset points", ha="right", va="bottom")
    plt.show()
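Note that scikit-learn requires the t-SNE perplexity to be strictly smaller than the number of samples, which is why it is set to len(words) - 1 above. With only a dozen or so words, the resulting 2D layout is best read as a rough sketch of relative similarity rather than a faithful map of the embedding space.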
# For GloVe (keep only tokens that have a GloVe vector, deduplicated so each word is plotted once)
glove_words = [word for word in dict.fromkeys(preprocessed_tokens) if word in glove_embeddings]
visualize_embeddings(glove_embeddings, glove_words)
# For Word2Vec
word2vec_words = model.wv.index_to_key
visualize_embeddings(model.wv, word2vec_words)