Source code for mowl.visualization.base

from sklearn.manifold import TSNE as SKTSNE
import matplotlib.pyplot as plt
import numpy as np
from gensim.models.keyedvectors import KeyedVectors
import logging
import warnings
logging.basicConfig(level=logging.INFO)


class Visualizer():
    """Base class for embedding visualizers.

    Subclasses are expected to override :meth:`show` and :meth:`savefig`;
    the implementations here only raise :class:`NotImplementedError`.
    """

    def __init__(self):
        # Nothing to initialize at this level. The method was previously
        # misspelled ``__init_`` and therefore never acted as a constructor.
        return

    def show(self):
        """Display the visualization.

        :raises NotImplementedError: Always; must be overridden by subclasses.
        """
        raise NotImplementedError()

    def savefig(self, outfile):
        """Save the visualization to a file.

        :param outfile: Path to the output file
        :type outfile: str
        :raises NotImplementedError: Always; must be overridden by subclasses.
        """
        raise NotImplementedError()


class TSNE(Visualizer):
    """
    Wrapper for :class:`sklearn.manifold.TSNE`

    :param embeddings: Embeddings dictionary
    :type embeddings: dict or :class:`gensim.models.keyedvectors.KeyedVectors`
    :param labels: Dictionary containing label information of the entities
    :type labels: dict of {str: str}
    :param entities: List of entities to consider for computing the TSNE. If `None`, then all
        the entities in the embeddings dictionary will be considered.
    :type entities: list of str
    :raises TypeError: If ``embeddings`` is neither a dict nor a
        :class:`gensim.models.keyedvectors.KeyedVectors`.
    """

    def __init__(self, embeddings, labels, entities=None):
        super().__init__()
        self.total_embeddings = len(embeddings)
        self.labels = labels

        if isinstance(embeddings, KeyedVectors):
            names = embeddings.index_to_key
        elif isinstance(embeddings, dict):
            names = embeddings.keys()
        else:
            raise TypeError(
                f"Embeddings type {type(embeddings)} not recognized. Expected types "
                "are dict or gensim.models.keyedvectors.KeyedVectors"
            )

        # Build the membership set once so each lookup below is O(1) instead
        # of a linear scan of the ``entities`` list per embedding.
        wanted = None if entities is None else set(entities)

        # Keep only entities that were requested (if a filter was given) and
        # that have a label; both container types support ``embeddings[name]``.
        self.embeddings = {
            name: embeddings[name]
            for name in names
            if (wanted is None or name in wanted) and name in self.labels
        }

        # Number of skipped embeddings, computed the same way for both input
        # types (previously it stayed at 0 for dict input).
        self.not_to_process = self.total_embeddings - len(self.embeddings)

        logging.info("Found %d embedding vectors. Processing only %d.",
                     self.total_embeddings, len(self.embeddings))

        # Row index of each entity in the matrix built by generate_points().
        self.embedding_idx_dict = {name: idx for idx, name in enumerate(self.embeddings)}

        self.classes = set(self.labels.values())
        colors = plt.cm.rainbow(np.linspace(0, 1, len(self.classes)))
        # Iterate the classes in sorted order so that the class -> color
        # assignment does not depend on set iteration order.
        self.class_color_dict = dict(zip(sorted(self.classes, key=str), colors))

    def generate_points(self, epochs, workers=1, verbose=0):
        """This method will call the :meth:`sklearn.manifold.TSNE.fit_transform` method to
        generate the points for the plot.

        :param epochs: Number of epochs to run the TSNE algorithm
        :type epochs: int
        :param workers: Number of workers to use for parallel processing. Defaults to 1.
        :type workers: int, optional
        :param verbose: Verbosity level. Defaults to 0.
        """
        points = np.array(list(self.embeddings.values()))

        if np.iscomplexobj(points):
            if verbose:
                warnings.warn("Complex numpy array detected. Only real part will be considered",
                              UserWarning)
            points = points.real

        tsne_kwargs = dict(n_components=2, verbose=verbose, n_jobs=workers)
        try:
            # Recent scikit-learn releases name the iteration count ``max_iter``.
            model = SKTSNE(max_iter=epochs, **tsne_kwargs)
        except TypeError:
            # Older releases only know the former name ``n_iter``.
            model = SKTSNE(n_iter=epochs, **tsne_kwargs)

        self.points = model.fit_transform(points)

        # Group the 2D coordinates by label: {label: ([xs], [ys])}.
        self.plot_data = {}
        for name, idx in self.embedding_idx_dict.items():
            x, y = tuple(self.points[idx])
            xs, ys = self.plot_data.setdefault(self.labels[name], ([], []))
            xs.append(x)
            ys.append(y)

    def _plot(self):
        """Draw one scatter series per label and return the figure."""
        if not hasattr(self, "plot_data"):
            raise AttributeError(
                "No points to plot: call generate_points() before show() or savefig()"
            )

        fig, ax = plt.subplots(figsize=(20, 20))
        for label, (xs, ys) in self.plot_data.items():
            ax.scatter(xs, ys, color=self.class_color_dict[label], label=label)
        ax.legend()
        ax.grid(True)
        return fig

    def show(self):
        """
        This method will call the :meth:`matplotlib.pyplot.show` method to show the plot.

        :raises AttributeError: If :meth:`generate_points` has not been called yet.
        """
        self._plot()
        plt.show()

    def savefig(self, outfile):
        """
        This method will save the plot to a file and then close the figure.

        :param outfile: Path to the output file
        :type outfile: str
        :raises AttributeError: If :meth:`generate_points` has not been called yet.
        """
        fig = self._plot()
        # Save and close through the explicit figure handle rather than
        # relying on pyplot's implicit "current figure".
        fig.savefig(outfile)
        plt.close(fig)