Example of using embeddings computed with a (small) model from Hugging Face

Hands-on example — Psicostat

1. Import and visualize data

# import packages
import numpy as np
import pandas as pd

# Folder containing the toy dataset of questionnaire items
input_datapath = "C:/Users/enric/Documents/PSICOSTAT/AI meeting 2024_05_22/"

# Read the comma-separated file into a dataframe and display it
csv_path = input_datapath + "Toy_dataset.csv"
df = pd.read_csv(csv_path, sep=',')
df
Item Testo Scale Color DotShape
0 X1 I make plans to achieve my goals conscientiousness blue o
1 X2 I organize my daily tasks conscientiousness blue o
2 X3 I am punctual and reliable conscientiousness blue o
3 X4 I approach tasks with careful attention conscientiousness blue o
4 X5 I demonstrate perseverance and dedication conscientiousness blue o
5 X6 I need to keep every aspect of my life under s... conscientiousness blue o
6 X7 I worry about many aspects of life neuroticism red X
7 X8 I often feel distressed neuroticism red X
8 X9 I dwell on negative experiences neuroticism red X
9 X10 If faced with uncertainty, I feel anxious neuroticism red X
10 X11 I struggle to regain control of my emotions neuroticism red X
11 X12 When confronted with situations that deviate f... neuroticism red X

2. Get embeddings

Now the core part!

# import packages/functions
from sentence_transformers import SentenceTransformer

# load a (small) sentence-embedding model available on Hugging Face
model = SentenceTransformer("avsolatorio/GIST-small-Embedding-v0")

# Compute embeddings on item texts in dataframe.
# .tolist() passes a plain list of strings — the input type encode() documents —
# instead of a pandas Series; convert_to_tensor returns one (n_items, dim) torch tensor.
embeddings = model.encode(df["Testo"].tolist(), convert_to_tensor=True)

3. Visualize embeddings

# print embedding values
print(embeddings)

# see dimensions of the embeddings; use print() so the shape also appears
# when this runs as a plain script (a bare expression only renders in a notebook)
print(embeddings.shape)
tensor([[-0.0295, -0.0231,  0.0577,  ..., -0.0057,  0.0255, -0.0241],
        [-0.0007, -0.0224,  0.0492,  ...,  0.0048, -0.0038, -0.0229],
        [-0.0446, -0.0111,  0.0369,  ...,  0.0064,  0.0310, -0.0156],
        ...,
        [ 0.0057, -0.0013, -0.0019,  ..., -0.0162, -0.0387, -0.0351],
        [ 0.0064, -0.0087,  0.0691,  ...,  0.0157, -0.0343, -0.0004],
        [ 0.0039, -0.0091,  0.0104,  ...,  0.0210, -0.0556, -0.0624]])
torch.Size([12, 384])

4. Compute cosine similarities

# import packages/functions
import torch.nn.functional as F
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity of every item with every other item:
# broadcasting (n, 1, d) against (1, n, d) yields an (n, n) matrix.
rows = embeddings.unsqueeze(1)
cols = embeddings.unsqueeze(0)
scores = F.cosine_similarity(rows, cols, dim=-1)

# visualize matrix of similarities
print(scores.cpu().numpy().round(decimals=3))
[[1.    0.802 0.705 0.77  0.785 0.757 0.688 0.601 0.66  0.663 0.641 0.763]
 [0.802 1.    0.748 0.818 0.736 0.762 0.708 0.651 0.672 0.625 0.656 0.712]
 [0.705 0.748 1.    0.747 0.773 0.718 0.636 0.646 0.649 0.66  0.615 0.645]
 [0.77  0.818 0.747 1.    0.768 0.733 0.748 0.691 0.716 0.713 0.683 0.758]
 [0.785 0.736 0.773 0.768 1.    0.72  0.687 0.668 0.723 0.667 0.676 0.645]
 [0.757 0.762 0.718 0.733 0.72  1.    0.817 0.7   0.701 0.701 0.751 0.739]
 [0.688 0.708 0.636 0.748 0.687 0.817 1.    0.806 0.769 0.84  0.712 0.778]
 [0.601 0.651 0.646 0.691 0.668 0.7   0.806 1.    0.796 0.782 0.775 0.744]
 [0.66  0.672 0.649 0.716 0.723 0.701 0.769 0.796 1.    0.731 0.73  0.743]
 [0.663 0.625 0.66  0.713 0.667 0.701 0.84  0.782 0.731 1.    0.702 0.824]
 [0.641 0.656 0.615 0.683 0.676 0.751 0.712 0.775 0.73  0.702 1.    0.74 ]
 [0.763 0.712 0.645 0.758 0.645 0.739 0.778 0.744 0.743 0.824 0.74  1.   ]]

5. Lastly, plot items in 2D obtained via t-SNE dimensionality reduction

# import packages/functions
import matplotlib.pyplot as plt
import adjustText
from sklearn.manifold import TSNE

# Reduce the (n_items, n_items) similarity matrix to 2D with t-SNE
# (note: this is t-SNE, not classical multidimensional scaling)
tsne = TSNE(n_components=2, perplexity=5, random_state=42, init='random', learning_rate=200)
# .cpu().numpy() works whether scores lives on CPU or GPU
# (np.array() on a CUDA tensor would raise)
vis_dims = tsne.fit_transform(scores.cpu().numpy())

# extract x, y coordinates for all data points
x = vis_dims[:, 0]
y = vis_dims[:, 1]

# define color, dot shape, and text label for each item
colors = df["Color"]
marker = df["DotShape"]
scalename = df["Scale"]
text = df["Item"] + " :" + df["Testo"]

# plot basic graph: one scatter call per item so each keeps its own marker
font = {'size': 10}
plt.figure(figsize=(9, 7))
plt.rcParams['figure.dpi'] = 600
for _marker, _color, _name, _x, _y in zip(marker, colors, scalename, x, y):
    plt.scatter(_x, _y, s=80, alpha=0.7, marker=_marker, c=_color, label=_name)
# add legend, deduplicating labels so each scale appears only once
handles, labels = plt.gca().get_legend_handles_labels()
by_label = dict(zip(labels, handles))
plt.legend(by_label.values(), by_label.keys(), loc='upper center', bbox_to_anchor=(1, 1), fontsize=8)
# add item-text labels and let adjustText nudge them apart to avoid overlap
texts = [plt.text(x[i], y[i], text[i], ha='center', va='center', **font) for i in range(len(text))]
adjustText.adjust_text(texts, expand_text=(0.9, 0.9))
# show explicitly so the figure also appears when run as a plain script
plt.show()
500