# Build the 5x5 identity matrix and display it.
I = np.identity(5)
print(I)

[[1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1.]]

# Column 0 of the identity matrix.
e0 = I[:, 0]
print(e0)

[1. 0. 0. 0. 0.]

# Row 0 of the identity matrix (rebinds e0).
e0 = I[0, :]
print(e0)

[1. 0. 0. 0. 0.]

# Check that inv(D) undoes D. `@` is the matrix product; `*` on ndarrays is
# element-wise, which gives the identity for a diagonal D but not for a
# general matrix.
np.linalg.inv(D) @ D

array([[1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.]])

# 45 degrees expressed in radians.
theta = 45*np.pi/180

# 2x2 matrix built from the cosine and sine of theta.
cos_t, sin_t = np.cos(theta), np.sin(theta)
R = np.array([
    [cos_t, sin_t],
    [-sin_t, cos_t],
])
print(R)

[[ 0.70710678  0.70710678]
 [-0.70710678  0.70710678]]

# Apply R to the unit vector along the second axis.
R @ np.array([0, 1])

array([0.70710678, 0.70710678])

# R times its transpose; the displayed result is the identity up to
# floating-point round-off.
R @ R.T

array([[ 1.00000000e+00, -4.26642159e-17],
       [-4.26642159e-17,  1.00000000e+00]])

from sklearn.manifold import TSNE
#help(TSNE)

import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Toy dataset: three tight Gaussian clusters in 2-D (50 points each, std 0.1).
# The draws are made in the order A, B, C, W so the seeded stream is reproducible.
rng = np.random.default_rng(0)
A = rng.normal(loc=[0, 0], scale=0.1, size=(50, 2))
B = rng.normal(loc=[3, 0], scale=0.1, size=(50, 2))
C = rng.normal(loc=[0, 3], scale=0.1, size=(50, 2))
X2 = np.vstack((A, B, C))

# Lift the 2-D points into 10-D with a random linear map.
W = rng.normal(size=(2, 10))
X10 = X2 @ W

# Project back down to 2-D with t-SNE.
tsne = TSNE(
    n_components=2,
    perplexity=20,
    learning_rate=200,
    init="random",
    random_state=0,
)
Y = tsne.fit_transform(X10)

# Colour each point by the cluster it came from (0, 1, 2 in stacking order).
cluster_ids = np.repeat([0, 1, 2], 50)
plt.figure(figsize=(3, 3))
plt.scatter(Y[:, 0], Y[:, 1], c=cluster_ids, s=20)
plt.title("Toy t-SNE example")
plt.show()

# pip install sentence-transformers

from sentence_transformers import SentenceTransformer, util

# Load the all-MiniLM-L6-v2 sentence-embedding model.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

s1 = "The patient has diabetic retinopathy."
s2 = "The eye shows signs of retinal damage."

# Encode each sentence as its own one-element batch.
v1, v2 = (model.encode([sentence]) for sentence in (s1, s2))
v1.shape

(1, 384)

# Overlay the two embedding vectors as line plots for a visual comparison.
plt.figure(figsize=(5, 3))
for vec in (v1, v2):
    plt.plot(vec.flatten())

# Cosine similarity between the embeddings of s1 and s2; .item() extracts
# the single value for printing.
sim = util.cos_sim(v1, v2)
print(sim.item())

0.5058673024177551

# A third sentence on a different subject, compared against s1 in the same way.
s3 = "I ate pasta for lunch."
v3 = model.encode([s3])
# Rebinds `sim`; the previous s1/s2 value is overwritten.
sim = util.cos_sim(v1, v3)
print(sim.item())

0.02934318035840988

import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity

# CHOOSE ONE checkpoint name; the tokenizer and the model below are both
# loaded from it.
MODEL_NAME = "bert-base-uncased"
# MODEL_NAME = "roberta-base"
# MODEL_NAME = "distilbert-base-uncased"
# MODEL_NAME = "microsoft/deberta-v3-base"

# Load the tokenizer and the encoder for the selected checkpoint.
tok = AutoTokenizer.from_pretrained(MODEL_NAME)
# NOTE: this rebinds the module-level name `model` assigned earlier in the file.
model = AutoModel.from_pretrained(MODEL_NAME)
# Put the module in evaluation mode.
model.eval()

def embed_sentence(text: str, use_cls: bool = False) -> torch.Tensor:
    """Return a unit-length embedding of ``text``.

    The text is tokenized with the module-level ``tok`` and run through the
    module-level ``model`` with gradients disabled; both names are looked up
    at call time.

    Args:
        text: Sentence to embed.
        use_cls: If true, use the hidden state of the first token. Otherwise
            average the hidden states, weighted by the attention mask.

    Returns:
        The pooled vector divided by its L2 norm, with the leading batch
        axis squeezed away.
    """
    encoded = tok(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        token_states = model(**encoded).last_hidden_state  # [1, T, H]

    if not use_cls:
        # Mean pooling: zero out masked positions, then divide by the mask count.
        attn = encoded["attention_mask"].unsqueeze(-1)  # [1, T, 1]
        pooled = (token_states * attn).sum(dim=1) / attn.sum(dim=1)
    else:
        # CLS pooling: keep only the first token's state.
        pooled = token_states[:, 0, :]  # [1, H]

    unit = pooled / pooled.norm(p=2, dim=-1, keepdim=True)
    return unit.squeeze(0)

s1 = "The patient has diabetic retinopathy."
s2 = "The eye shows signs of retinal damage."
s3 = "I ate pasta for lunch."

# Mean-pooled embeddings of the three sentences, converted to NumPy arrays.
e1, e2, e3 = (
    embed_sentence(sentence, use_cls=False).numpy() for sentence in (s1, s2, s3)
)

# Each vector is wrapped in a list to form a one-row matrix; [0, 0] reads the
# single entry of the resulting similarity matrix.
print("sim(s1, s2) =", cosine_similarity([e1], [e2])[0, 0])
print("sim(s1, s3) =", cosine_similarity([e1], [e3])[0, 0])

sim(s1, s2) = 0.75558174
sim(s1, s3) = 0.40877026

from matplotlib import pyplot as plt
from sklearn.datasets import load_sample_image
# Load the two sample photographs.
flower = load_sample_image('flower.jpg')
china = load_sample_image('china.jpg')

# Show each image in its own figure; the trailing semicolon keeps the
# notebook from echoing the return value of the last call.
plt.imshow(china)
plt.figure()
plt.imshow(flower);

china.shape

(427, 640, 3)

import numpy as np
import torch
from transformers import AutoImageProcessor, AutoModel

# HuggingFace CNN ResNet-50: the image processor and the model are both
# loaded from the same checkpoint name.
model_name = "microsoft/resnet-50"
processor = AutoImageProcessor.from_pretrained(model_name)
# NOTE: this rebinds the module-level `model`, which embed_sentence reads at
# call time; re-run the text-model cell before calling embed_sentence again.
model = AutoModel.from_pretrained(model_name)
# Put the module in evaluation mode.
model.eval()

def get_embedding(np_image: np.ndarray) -> np.ndarray:
    """Embed one image as a unit-length feature vector.

    The image is passed through the module-level ``processor`` and ``model``
    with gradients disabled. The resulting feature map is averaged over its
    last two axes and the pooled vector is divided by its L2 norm.

    Args:
        np_image: H x W x 3 (RGB) array, uint8 or float32.

    Returns:
        1-D NumPy array of shape (C,).
    """
    batch = processor(images=np_image, return_tensors="pt")
    with torch.no_grad():
        feature_map = model(**batch).last_hidden_state  # [1, C, H', W']
        pooled = feature_map.mean(dim=(2, 3))            # global average pooling -> [1, C]
        unit = pooled / pooled.norm(p=2, dim=-1, keepdim=True)
    return unit.squeeze(0).cpu().numpy()

# Compute embeddings
e_flower1  = get_embedding(flower)
e_flower2  = get_embedding(np.flipud(flower))
e_china    = get_embedding(china)

flower.shape, np.prod(flower.shape), e_flower1.shape

((427, 640, 3), np.int64(819840), (2048,))

def cossim(v1, v2):
    """Return the cosine similarity of two 1-D vectors as a Python float.

    Both inputs are promoted to float64 before any arithmetic. Without the
    promotion, fixed-width integer inputs such as uint8 image pixels make
    ``np.dot`` accumulate in the input dtype and wrap around, which yields a
    meaningless similarity.

    Args:
        v1: First vector (array-like of numbers).
        v2: Second vector, same length as ``v1``.

    Returns:
        dot(v1, v2) / (||v1|| * ||v2||). The result is NaN if either vector
        has zero norm.
    """
    a = np.asarray(v1, dtype=np.float64)
    b = np.asarray(v2, dtype=np.float64)
    norm_a = np.sqrt(np.dot(a, a))
    norm_b = np.sqrt(np.dot(b, b))
    return float(np.dot(a, b) / norm_a / norm_b)

# Flower vs. its flipped copy: similarity on raw pixels, then on embeddings.
print(cossim(flower.flatten(), np.flipud(flower).flatten()))
print(cossim(e_flower1, e_flower2))

0.04443359375
0.9064571857452393

# Flower vs. china: similarity on raw pixels, then on embeddings.
print(cossim(flower.flatten(), china.flatten()))
print(cossim(e_flower1, e_china))

0.513671875
0.2582722306251526

BDS 761: Data Science and Machine Learning I

Topic 8: Embeddings

This topic:

Reading:

Embedding intro

ELI5: Embedding

Data embedding

Famous Matrices: Diagonal matrix

Famous Matrices: Diagonal matrix - Applications

Famous Matrices: Orthogonal matrix

Famous Matrices: Orthogonal matrix - Applications

Eigendecomposition

Diagonalization

Famous Matrices: Normal matrix

Famous Matrices: Normal matrix - value

2D Normal Distribution

Exercise:

Dimensionality Reduction

Dimensionality Reduction - Motivation

Features for dim reduction

Feature extraction: Linear Methods

Principal Component Analysis

Principal Component Analysis

Scree graph

Exercise:

Graph Embedding

Network Embedding versus "Data Embedding"

Aside: Mean-removal matrix $H$

PCA

Classical MDS

Spectral Graph theory

Spectral clustering

Isomap

Diffusion Maps

Spectral Clustering (Normalized Cut)

Laplacian Eigenmaps

Scikit-Learn Manifold Learning methods

t-SNE

KL Divergence

Quick Information Theory review

K-L Divergence

K-L divergence notes

Example

t-SNE

t-SNE Algorithm

Text embedding methods

Topic Modeling

Matrix Factorization

Latent Semantic Indexing

Application: Recommender System

Word embeddings

Word embedding - Motivation

Distributional hypothesis

Word embedding

Vector semantic models

Term-document matrix

Document vectors

Word vectors

Dense Embeddings

Wide range of Embedding methods

GloVe - Global Vectors for Word Representation

Neural word embeddings

Embedding via linear layer

Exercise - Embedding Layer

word2vec

word2vec contains two algorithms:

The classifier

"skip-gram"

Examples:

Semantic Properties of Embeddings

Neural Representations

Deep Learning Models for Representation

Motivation: Vector databases

Indexing methods - basic, efficient, scalable

Examples

Deep Learning for Embedded Representations

Sentence Transformers Framework

Manually using a Model

Image Embedding example