forked from manashmandal/FastTextModelLoader
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtsne_plot.py
More file actions
57 lines (43 loc) · 1.48 KB
/
tsne_plot.py
File metadata and controls
57 lines (43 loc) · 1.48 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
from __future__ import print_function
from gensim.models import KeyedVectors
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
# Loading the vectors
# [Warning] Takes a lot of time
en_model = KeyedVectors.load_word2vec_format('wiki.en/wiki.en.vec')

# Limit number of tokens to be visualized
limit = 500
# Expected length of every word vector; the reshape below enforces it
vector_dim = 300

# gensim >= 4 exposes the token mapping as ``key_to_index``; older releases
# call it ``vocab``. Only fall back when the new attribute is missing.
vocabulary = getattr(en_model, 'key_to_index', None)
if vocabulary is None:
    vocabulary = en_model.vocab

# Getting tokens: take the first ``limit`` entries in the mapping's own order
words = []
for word in vocabulary:
    # Break the loop once the limit is reached
    if len(words) == limit:
        break
    words.append(word)

# Getting vectors: build the matrix in one allocation instead of growing a
# flat array with np.append, which copies the whole buffer on every step.
# float64 is requested explicitly so the dtype handed to t-SNE does not
# depend on how the model stores its vectors.
embedding = np.array([en_model[word] for word in words], dtype=np.float64)

# Reshaping by the number of tokens actually collected keeps this valid when
# the vocabulary holds fewer than ``limit`` entries, while still raising if
# the vectors are not ``vector_dim`` wide.
embedding = embedding.reshape(len(words), vector_dim)
def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
    """Scatter-plot 2-D points, annotate each with its label, and save the figure.

    Args:
        low_dim_embs: Array indexed as ``low_dim_embs[i, :]``; each row must
            unpack into an ``(x, y)`` pair, and there must be at least
            ``len(labels)`` rows.
        labels: Sequence of annotation texts; the i-th label is drawn at the
            i-th row of ``low_dim_embs``.
        filename: Destination passed to ``plt.savefig``.

    Raises:
        AssertionError: If there are more labels than embedding rows.
    """
    assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
    fig = plt.figure(figsize=(18, 18))  # in inches
    try:
        for i, label in enumerate(labels):
            x, y = low_dim_embs[i, :]
            plt.scatter(x, y)
            plt.annotate(label,
                         xy=(x, y),
                         xytext=(5, 2),
                         textcoords='offset points',
                         ha='right',
                         va='bottom')
        plt.savefig(filename)
    finally:
        # pyplot keeps a figure alive until it is closed explicitly, so
        # release it here even if drawing or saving failed.
        plt.close(fig)
# Creating the tsne plot [Warning: will take time]
tsne_options = dict(perplexity=30.0, n_components=2, init='pca')
try:
    # Newer scikit-learn releases name the iteration cap ``max_iter``
    tsne = TSNE(max_iter=5000, **tsne_options)
except TypeError:
    # Older releases reject ``max_iter`` and only accept ``n_iter``
    tsne = TSNE(n_iter=5000, **tsne_options)
low_dim_embedding = tsne.fit_transform(embedding)

# Finally plotting and saving the fig
plot_with_labels(low_dim_embedding, words)