Dimensionality reduction
operations, starting from simple PCA and MDS up to t-SNE and the more advanced UMAP, with the corresponding dimensionality-reduction plots. Theory included.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import matplotlib.colors as mcolors
import umap
import umap.plot
from sklearn.manifold import TSNE
from sklearn.manifold import MDS
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
from bioinfokit.visuz import cluster
from sklearn import datasets
from scipy.spatial import ConvexHull
large = 22; med = 16; small = 12
params = {'legend.fontsize': med,
          'figure.figsize': (14, 5),
          'axes.labelsize': med,
          'axes.titlesize': med,
          'xtick.labelsize': med,
          'ytick.labelsize': med,
          'figure.titlesize': large}
plt.rcParams.update(params)
luce = pd.read_csv("datasets/Leukemia.csv")
colors = luce.iloc[:,-1] # categorical variable to color the points
luce = luce.iloc[:,1:-1] # numerical variables
Standard Scale
The scaling step ensures equal contribution from each variable, which is especially important for datasets containing heterogeneous features with highly variable ranges or distinct units, e.g., patient clinical data or environmental factors data.
scaler = StandardScaler()
scaler.fit(luce)
luce = scaler.transform(luce)
Uniform Manifold Approximation and Projection - UMAP¶
PCA factorizes the data matrix, while UMAP builds a neighbor graph in the original space and then tries to find a similar graph in a lower dimension. The construction of the high-dimensional graph is what makes UMAP unique and different from other graph-based algorithms such as t-SNE.
The main intuition relies on the nerve theorem. Basically, each data point is a 0-simplex, and the nerve theorem states that the shape of the data can be approximated by joining these 0-simplices with their neighbors, forming 1-simplices, 2-simplices, and so on.
UMAP extends a radius around each point and creates connections where the radii intersect. In reality a fixed radius is problematic: in low-density regions we may not be able to build the graph because no neighbor is reached within the radius. The smarter solution adopted in UMAP is a flexible, per-point radius, which is mathematically justified and allows the graph to be built in every possible scenario. Each connection in the graph is weighted according to the radial distance of the neighbor, and the low-dimensional graph is then constructed according to the edge weights: points connected by a heavy edge are more likely to be connected in the low-dimensional space.
Key Differences Between tSNE and UMAP: First, UMAP is based on solid mathematical principles and is therefore very different from tSNE, which is a purely semi-empirical machine-learning algorithm.
1) UMAP uses an exponential probability distribution in high dimensions and, unlike tSNE, is not tied to Euclidean distances: any distance metric can be plugged in. In addition, the probabilities are not normalized:
$$p_{i \mid j} = e^{-\frac{d(x_{i}, x_{j}) - \rho_{i}}{\sigma_{i}}}$$

Here ρ is an important parameter that represents the distance from each i-th data point to its first nearest neighbor. It ensures the local connectivity of the manifold: it gives a locally adaptive exponential kernel for each data point, so the distance metric effectively varies from point to point.
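To make this concrete, here is a minimal numpy sketch of such a locally adaptive kernel (illustrative only, not the library's internals; the function name and the fixed `sigma_i` are assumptions made for the example):

import numpy as np
from scipy.spatial.distance import cdist

def umap_like_similarities(X, i, sigma_i=1.0):
    """Unnormalized, locally adaptive exponential similarities p_{i|j} for point i.
    In UMAP itself sigma_i is tuned per point (binary search) so that the
    similarities sum to log2(n_neighbors); here it is a fixed toy value."""
    d = cdist(X[i:i + 1], X).ravel()   # distances from x_i to every other point
    d[i] = np.inf                      # exclude the point itself
    rho_i = d.min()                    # rho_i: distance to the first nearest neighbor
    p = np.exp(-np.maximum(d - rho_i, 0.0) / sigma_i)
    p[i] = 0.0
    return p                           # note: no normalization step

X = np.random.RandomState(0).normal(size=(100, 5))
p = umap_like_similarities(X, i=0)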
2) UMAP does not apply normalization to either the high- or the low-dimensional probabilities, and it turns out that this absence of normalization dramatically reduces the time needed to compute the high-dimensional graph, since summation or integration is a computationally expensive procedure.
3) UMAP uses the number of nearest neighbors instead of perplexity. UMAP defines the number of nearest neighbors k without the log2 function, i.e. as follows:

$$k = 2^{\sum_{i} p_{ij}}$$

4) UMAP uses a slightly different symmetrization of the high-dimensional probabilities.
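For reference (this comes from the UMAP paper, not the text above), the two directed similarities are combined with a fuzzy-set union (probabilistic t-conorm),

$$p_{ij} = p_{i \mid j} + p_{j \mid i} - p_{i \mid j}\, p_{j \mid i}$$

whereas tSNE simply averages them, $p_{ij} = \frac{p_{i \mid j} + p_{j \mid i}}{2n}$.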
5) UMAP uses binary cross-entropy (CE) as a cost function instead of the KL-divergence like tSNE does.
6) The additional (second) term in the CE cost function makes UMAP capable of capturing the global data structure, in contrast to tSNE, which can only model the local structure at moderate perplexity values.
$$CE(X,Y)= \sum_{i}\sum_{j}\left[p_{ij}(X)\log\left(\frac{p_{ij}(X)}{q_{ij}(Y)}\right) + (1 - p_{ij}(X)) \log\left(\frac{1-p_{ij}(X)}{1-q_{ij}(Y)}\right)\right]$$

7) UMAP assigns the initial low-dimensional coordinates using the Graph Laplacian, in contrast to the random normal initialization used by tSNE. This should have only a minor effect on the final low-dimensional representation (at least, that was the case for tSNE), but it should make UMAP vary less from run to run, since the initialization is no longer random.
8) Finally, UMAP uses the Stochastic Gradient Descent (SGD) instead of the regular Gradient Descent (GD) like tSNE / FItSNE, this both speeds up the computations and consumes less memory.
So the KL-divergence makes it impossible for tSNE to preserve global distances when performing dimensionality reduction. By simply plotting the cost functions of tSNE (KL-divergence) and UMAP (cross-entropy, CE) against each other we can observe that at large distances X between points in high dimensions, the distances Y between points in low dimensions are not guaranteed to be large in the sense of the KL penalty. As long as tSNE uses the KL-divergence as its cost function, it cannot compete with UMAP in global distance preservation.
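A quick way to see this is to plot the two point-wise penalties over a grid of high-dimensional distances X and low-dimensional distances Y. The kernels below are illustrative stand-ins (a Gaussian-like p(X) and a Student-t-like q(Y)), not the exact quantities either algorithm computes:

X_grid, Y_grid = np.meshgrid(np.linspace(0, 3, 200), np.linspace(0, 3, 200))
p = np.exp(-X_grid ** 2)                  # toy high-dimensional similarity as a function of X
q = 1.0 / (1.0 + Y_grid ** 2)             # toy low-dimensional similarity as a function of Y
eps = 1e-12                               # avoid log(0)
kl = p * np.log((p + eps) / (q + eps))                             # tSNE-style penalty
ce = kl + (1 - p) * np.log((1 - p + eps) / (1 - q + eps))          # UMAP-style penalty
fig, axes_ = plt.subplots(1, 2, figsize=(14, 5))
for ax, z, title in zip(axes_, [kl, ce], ['KL-divergence (tSNE)', 'Cross-entropy (UMAP)']):
    im = ax.contourf(X_grid, Y_grid, z, levels=30, cmap='viridis')
    ax.set_xlabel('high-dimensional distance X')
    ax.set_ylabel('low-dimensional distance Y')
    ax.set_title(title)
    fig.colorbar(im, ax=ax)
plt.show()

At large X the KL surface is essentially flat in Y (p ≈ 0 kills the penalty), while the second CE term keeps penalizing small Y; that is the intuition behind UMAP's better preservation of global structure.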
The math behind the two methods is therefore very different. While tSNE is purely a visualization tool, UMAP is much more than that; probably the most important practical consequence is that it is fine to cluster on UMAP components, while this is not the case for tSNE.
- Parameters:
n_neighbors: determines the number of neighboring points used in the local approximation of the manifold structure. Larger values preserve more of the global structure at the cost of detailed local structure (typical range 5-50).
min_dist: controls how tightly the embedding is allowed to compress points together. Larger values ensure embedded points are more evenly distributed, while smaller values let the algorithm optimize the local structure more accurately (typical range 0.0001-0.5).
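As a quick illustration of both parameters (a sketch, assuming the scaled `luce` matrix and the `colors` labels loaded above; the grid of values is an arbitrary choice), one might sweep them side by side:

fig, axes = plt.subplots(2, 2, figsize=(14, 12))
for ax, (k, dist) in zip(axes.ravel(), [(5, 0.1), (5, 0.5), (50, 0.1), (50, 0.5)]):
    emb = umap.UMAP(random_state=42, n_neighbors=k, min_dist=dist,
                    metric='euclidean', n_components=2).fit_transform(luce)
    sns.scatterplot(x=emb[:, 0], y=emb[:, 1], hue=colors, legend=False, ax=ax)
    ax.set_title(f"n_neighbors={k}, min_dist={dist}")
plt.show()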
df = datasets.load_digits()
x = pd.DataFrame(df.data, columns=[ f"ft_{i}" for i in range(64)])
y = df.target
reducer = umap.UMAP(random_state=42,
n_neighbors= 6 ,
metric='euclidean',
n_components=2)
reducer.fit(x.values)
embedding = reducer.transform(x.values)
fig, axes= plt.subplots(1,1, figsize=(10,10))
umap.plot.points(reducer,
labels=y,
theme='fire',
ax =axes,
show_legend=True)
axes.set_title('UMAP.plot packages')
plt.show()
ax = umap.plot.connectivity(reducer,
theme='fire',
width=800,
height=500)
ax.set_title("Graph oriented")
plt.show()
T-SNE¶
PCA is a linear dimension reduction technique that seeks to maximize variance and preserves large pairwise distances. In other words, things that are different end up far apart. This can lead to poor visualization, especially when dealing with non-linear manifold structures. Think of a manifold structure as any geometric shape like a cylinder, ball, curve, etc. t-SNE differs from PCA by preserving only small pairwise distances or local similarities, whereas PCA is concerned with preserving large pairwise distances to maximize variance. PCA preserves large pairwise distances in the map, but cannot catch the structure of the whole data.
T-SNE is an unsupervised, randomized algorithm used only for visualization. It applies a non-linear dimensionality reduction technique where the focus is on keeping very similar data points close together in the lower-dimensional space; it preserves the local structure of the data using a Student t-distribution to compute the similarity between two points in the lower-dimensional space, and lastly outliers do not impact t-SNE.
“t-Distributed stochastic neighbor embedding (t-SNE) minimizes the divergence between two distributions: a distribution that measures pairwise similarities of the input objects and a distribution that measures pairwise similarities of the corresponding low-dimensional points in the embedding”. Essentially what this means is that it looks at the original data that is entered into the algorithm and looks at how to best represent this data using less dimensions by matching both distributions.
- Algorithm:
Step 1: Find the pairwise similarity between nearby points in a high dimensional space. t-SNE converts the high-dimensional Euclidean distances between datapoints xᵢ and xⱼ into conditional probabilities P(j|i). The probability density of a pair of points is proportional to their similarity. For nearby data points, p(j|i) will be relatively high, and for points widely separated, p(j|i) will be minuscule. Think of a bunch of data points scattered on a 2D space. For each data point (xi) we'll center a Gaussian distribution over that point. Then we measure the density of all points (xj) under that Gaussian distribution, and renormalize for all points. This gives us a set of probabilities (Pij) for all points. Those probabilities are proportional to the similarities. All that means is: if data points x1 and x2 have equal values under this Gaussian circle, then their proportions and similarities are equal, and hence you have local similarities in the structure of this high-dimensional space.
Step 2: Step 2 is similar to step 1, but instead of using a Gaussian distribution you use a Student t-distribution with one degree of freedom. We map each point in high dimensional space to a low dimensional map based on the pairwise similarity of points in the high dimensional space. This gives us a second set of probabilities (Qij) in the low dimensional space.
Step 3: Find a low-dimensional data representation that minimizes the mismatch between Pᵢⱼ (original space) and qᵢⱼ (low dimensional space) using gradient descent based on Kullback-Leibler divergence(KL Divergence):
$$ {\displaystyle D_{\text{KL}}(P\parallel Q)=-\sum _{x\in {\mathcal {X}}}P(x)\log \left({\frac {Q(x)}{P(x)}}\right)} $$

The KL divergence is a measure of how different one probability distribution is from a second.
Step 4: Use Student-t distribution to compute the similarity between two points in the low-dimensional space. t-SNE uses a heavy-tailed Student-t distribution with one degree of freedom to compute the similarity between two points in the low-dimensional space rather than a Gaussian distribution.
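The four steps can be condensed into a small numpy sketch (illustrative only: a single global sigma replaces the per-point σᵢ that real t-SNE calibrates from the perplexity, and Y is just a random, unoptimized map):

import numpy as np
from scipy.spatial.distance import pdist, squareform

def tsne_objective_sketch(X, Y, sigma=1.0):
    """KL(P || Q) between Gaussian similarities in high dimensions and Student-t similarities in the map."""
    n = X.shape[0]
    D2 = squareform(pdist(X, 'sqeuclidean'))
    P = np.exp(-D2 / (2 * sigma ** 2))             # Step 1: Gaussian kernel (one global sigma here)
    np.fill_diagonal(P, 0.0)
    P = P / P.sum(axis=1, keepdims=True)           # conditional p_{j|i}
    P = (P + P.T) / (2 * n)                        # symmetrized joint p_{ij}
    D2_low = squareform(pdist(Y, 'sqeuclidean'))
    Q = 1.0 / (1.0 + D2_low)                       # Steps 2 and 4: Student-t kernel with 1 d.o.f.
    np.fill_diagonal(Q, 0.0)
    Q = Q / Q.sum()
    eps = 1e-12
    return np.sum(P * np.log((P + eps) / (Q + eps)))   # Step 3: KL divergence to be minimized

rng = np.random.RandomState(0)
print(tsne_objective_sketch(rng.normal(size=(50, 10)), rng.normal(size=(50, 2))))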
- Optimisation step:
The Kullback-Leibler divergence is not symmetric, so different types of error in the pairwise distances in the low-dimensional map are not weighted equally. The solution is found using gradient descent. Strongly repelled dissimilar data points do not fly off to infinity thanks to early compression, which forces map points to stay close together at the start of the optimization; it is implemented by adding an additional L2 penalty to the cost function, proportional to the sum of squared distances of the map points from the origin.
- Perplexity:
The perplexity is related to the number of nearest neighbors used in other manifold learning algorithms. Larger datasets usually require a larger perplexity. Consider selecting a value between 5 and 50; different values can result in significantly different results. Perplexity balances the attention t-SNE gives to local and global aspects of the data and can have large effects on the resulting plot. One can interpret this as follows: at large σ, tSNE does account for long-range interactions between the data points.
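Concretely, the perplexity of one conditional distribution Pᵢ is 2 raised to its Shannon entropy (in bits), roughly the effective number of neighbors a point "attends" to; a tiny illustrative helper:

import numpy as np

def perplexity(p_row):
    """Perplexity of one conditional distribution P_i: 2**(Shannon entropy in bits)."""
    p = p_row[p_row > 0]
    return 2.0 ** (-np.sum(p * np.log2(p)))

print(perplexity(np.array([0.2, 0.2, 0.2, 0.2, 0.2])))   # uniform over 5 neighbors -> 5.0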

- Learning_rate:
The learning rate for t-SNE is usually in the range [10.0, 1000.0]. If the learning rate is too high, the data may look like a ‘ball’ with any point approximately equidistant from its nearest neighbours. If the learning rate is too low, most points may look compressed in a dense cloud with few outliers. If the cost function gets stuck in a bad local minimum increasing the learning rate may help.
- n_iter:
Maximum number of iterations for the optimization; it should be at least 250. How much of a difference do those values make? In our experience, the most important thing is to iterate until reaching a stable configuration, so prefer a low learning rate with a high n_iter.
- Drawbacks:
It is very hard to interpret, the choice of the hyperparameters is critical, and it does not preserve the global structure of the data: cluster sizes and inter-cluster distances are not meaningful, and artifacts may appear in the plot. Lastly, it is computationally expensive, so it is slow on large datasets.
Cluster sizes in a t-SNE plot mean nothing: The t-SNE algorithm adapts its notion of “distance” to regional density variations in the data set. As a result, it naturally expands dense clusters, and contracts sparse ones, evening out cluster sizes. Again distances between well-separated clusters in a t-SNE plot may mean nothing.
tSNE does not scale well for rapidly increasing sample sizes. tSNE does not preserve global data structure, meaning that only within-cluster distances are meaningful while between-cluster similarities are not guaranteed; therefore it is widely acknowledged that clustering on tSNE output is not a very good idea. tSNE performs a non-parametric mapping from high to low dimensions, meaning that it does not leverage features (aka PCA loadings) that drive the observed clustering. tSNE cannot work with high-dimensional data directly; an autoencoder or PCA is often used to perform a pre-dimensionality reduction before plugging the data into tSNE. Finally, tSNE consumes a lot of memory, which becomes especially obvious when using a large perplexity, since the initial k-nearest-neighbors step becomes less efficient.
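A common workflow for that last point, sketched under the assumption that the scaled `luce` matrix has enough samples for 50 components (50 is an arbitrary illustrative choice):

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

X50 = PCA(n_components=50, random_state=42).fit_transform(luce)                # compress first
emb = TSNE(n_components=2, perplexity=30, random_state=42).fit_transform(X50)  # then embed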
def tsnes(data, p= [2,5,25,50], component=2):
res = []
for ps in p:
tsne = TSNE(random_state=42,
n_components=component,
verbose=0,
perplexity=ps,
n_iter=1000)
emb = tsne.fit_transform(data)
df = pd.DataFrame(emb, columns = [f'PC{i+1}' for i in range(component)])
res.append(df)
return res
def encircle(x,y, ax=None, **kw):
if not ax: ax=plt.gca()
p = np.c_[x,y]
hull = ConvexHull(p)
poly = plt.Polygon(p[hull.vertices,:], **kw)
ax.add_patch(poly)
df = pd.read_csv("datasets/US_road_conditions.csv")
tmp = []
for state in np.unique(df.State):
row=df[df.State == state].groupby('System').sum().transpose()
row['State']=state
tmp.append(row)
df = pd.concat(tmp)
df.reset_index(inplace=True, drop=True)
# or
df = pd.read_csv("datasets/US_road_conditions.csv")
df_p=pd.get_dummies(df.iloc[:,1:])
y = df.State
df_n = df.iloc[:,1:-1]
colors__ = df.iloc[:,-1]
perplexities = [2,10,20,30]
list_tsnes = tsnes(df_p, perplexities)
fig, axes= plt.subplots(2,2, figsize=(15,20))
g1 = sns.scatterplot(ax = axes[0][0],
data=list_tsnes[0],
x='PC1',
y = "PC2",
hue= colors__, palette='Reds', legend=False)
g2 = sns.scatterplot(ax = axes[0][1],
data=list_tsnes[1],
x='PC1',
y = "PC2",
hue= colors__, palette='Reds', legend=False)
g3 = sns.scatterplot(ax = axes[1][0],
data=list_tsnes[2],
x='PC1',
y = "PC2",
hue= colors__, palette='Reds', legend=False)
g4 = sns.scatterplot(ax = axes[1][1],
data=list_tsnes[3],
x='PC1',
y = "PC2",
hue= colors__, palette='Reds', legend=False)
col = ['orange','green','firebrick','blue','red','yellow','violet','khaki','white','snow',"brown"]
clusters_ = 8
clusters = AgglomerativeClustering(n_clusters=clusters_)
results = clusters.fit(list_tsnes[0])
for group in np.unique(results.labels_):
sns.scatterplot(ax = axes[0][0], data = list_tsnes[0].loc[results.labels_ == group,:],
x= 'PC1', y='PC2', label = str(group), legend=False)
encircle(x= list_tsnes[0].loc[results.labels_ == group, 'PC1'].values.tolist(),
y= list_tsnes[0].loc[results.labels_ == group, 'PC2'].values.tolist(),
alpha=0.4, linewidth=1, fc=col[group], ax = axes[0][0])
clusters = AgglomerativeClustering(n_clusters=clusters_)
results = clusters.fit(list_tsnes[1])
for group in np.unique(results.labels_):
sns.scatterplot(ax = axes[0][1], data = list_tsnes[1].loc[results.labels_ == group,:],
x= 'PC1', y='PC2', label = str(group), legend=False)
encircle(x= list_tsnes[1].loc[results.labels_ == group, 'PC1'].values.tolist(),
y= list_tsnes[1].loc[results.labels_ == group, 'PC2'].values.tolist(),
alpha=0.4, linewidth=1, fc=col[group], ax = axes[0][1])
clusters = AgglomerativeClustering(n_clusters=clusters_)
results = clusters.fit(list_tsnes[2])
for group in np.unique(results.labels_):
sns.scatterplot(ax = axes[1][0], data = list_tsnes[2].loc[results.labels_ == group,:],
x= 'PC1', y='PC2', label = str(group), legend=False)
encircle(x= list_tsnes[2].loc[results.labels_ == group, 'PC1'].values.tolist(),
y= list_tsnes[2].loc[results.labels_ == group, 'PC2'].values.tolist(),
alpha=0.4, linewidth=1, fc=col[group], ax = axes[1][0])
clusters = AgglomerativeClustering(n_clusters=clusters_)
results = clusters.fit(list_tsnes[3].values)
for group in np.unique(results.labels_):
sns.scatterplot(ax = axes[1][1], data = list_tsnes[3].loc[results.labels_ == group,:],
x= 'PC1', y='PC2', label = str(group), legend=False)
encircle(x= list_tsnes[3].loc[results.labels_ == group, 'PC1'].values.tolist(),
y= list_tsnes[3].loc[results.labels_ == group, 'PC2'].values.tolist(),
alpha=0.4, linewidth=1, fc=col[group], ax = axes[1][1])
plt.show()
PCA¶
PCA orthogonally projects a dataset of p-dimensional points into an r-dimensional space, with r lower than p. The new coordinates are called principal components; each component is defined by being orthogonal to the others and having the highest possible variance. This is done by first centering the data, then computing the covariance matrix, extracting the eigenvectors and computing the proportions of variance.
PCA is an unsupervised, deterministic algorithm used for feature extraction as well as visualization; it applies a linear dimensionality reduction technique where the focus is on keeping dissimilar points far apart in a lower-dimensional space. So PCA transforms the original data into new coordinates that preserve the variance in the data, using eigenvalues (which makes it sensitive to outliers).
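The steps just listed can be written out directly (a minimal sketch, assuming the scaled `luce` array from above; the SVD of the centered matrix is equivalent to the eigendecomposition of its covariance matrix, which is also what scikit-learn uses internally):

Xc = luce - luce.mean(axis=0)                      # 1) center the data
U, S, Vt = np.linalg.svd(Xc, full_matrices=False)  # 2-3) rows of Vt are the eigenvectors of the covariance matrix
explained = (S ** 2) / np.sum(S ** 2)              # 4) proportion of variance per component
scores = Xc @ Vt[:2].T                             # coordinates on the first two principal components
print(np.round(explained[:2], 3))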
def explained_v(pca):
print(f"""Variance Explained: {[ round (i,3) for i in pca.explained_variance_ratio_.tolist()[0:10]]}\ntotal components {len(pca.explained_variance_ratio_.tolist())}""")
components = 10
pca = PCA(n_components=components)
pca_emb = pca.fit_transform(luce)
pca_df = pd.DataFrame(pca_emb, columns = [f'PC{i+1}' for i in range(components)])
explained_v(pca)
Variance Explained: [0.149, 0.094, 0.047, 0.04, 0.034, 0.032, 0.027, 0.024, 0.022, 0.021] total components 10
def show_values_on_bars(axs, h_v="v", space_x=0.12,space_y=0.1, fontdict=None):
def _show_on_single_plot(ax):
if h_v == "v":
for p in ax.patches:
_x = p.get_x() + p.get_width() / 2 + space_x
_y = p.get_y() + p.get_height() + (space_y)
if not np.isnan(p.get_width()):
value = round(p.get_height(),2)
if fontdict: ax.text(_x, _y, value, ha="left", fontdict=fontdict)
else: ax.text(_x, _y, value, ha="left")
elif h_v == "h":
for p in ax.patches:
try:
_x = p.get_x() + p.get_width() + space_x
_y = p.get_y() + p.get_height() + space_y
if not np.isnan(p.get_width()):
value = round(p.get_width(),2)
if value < 0: _x-=0.27
if fontdict: ax.text(_x, _y, value, ha="left", fontdict=fontdict)
else: ax.text(_x, _y, value, ha="left")
except: print(f'Error while preparing {str(p)}')
if isinstance(axs, np.ndarray):
for idx, ax in np.ndenumerate(axs):
_show_on_single_plot(ax)
else:
_show_on_single_plot(axs)
fig, axes= plt.subplots(2,1, figsize=(10,15))
sns.set_style('whitegrid')
############### Explained Variance
_y= np.cumsum(pca.explained_variance_ratio_).tolist()
_x = [i for i in range(len(_y))]
sns.barplot(ax = axes[0],x=_x, y=_y, color='firebrick')
annotations_dict = dict(size=20, style='italic', weight='bold')
show_values_on_bars(axes[0], space_x=-0.2, space_y =0.01, fontdict=annotations_dict)
axes[0].set_ylim(0,1)
axes[0].set_title('Cumulative explained variance (PCA)')
################ heatmap
sns.heatmap(ax = axes[1],data=pca_df.sort_values(by='PC1'), annot=True, cmap='Spectral')
axes[1].set_title('HeatMap')
plt.show()
################ Biplot
loadings = pca.components_
cluster.biplot(cscore=pca_emb, loadings=loadings,
var1=round(pca.explained_variance_ratio_[0]*100, 2),
var2=round(pca.explained_variance_ratio_[1]*100, 2),
labels = ["" for i in range(7222)],
show=True,
dim=(15,10)) # 15% PC1 - 10% PC2
plt.show()
MDS - Multidimensional scaling¶
Multidimensional scaling finds an embedding of n objects into an r-dimensional Euclidean space so as to preserve, as well as possible, the distances between the original objects. It can be performed in several different ways: in classical MDS the objective function is called strain and directly involves the original distances between objects; in metric MDS the objective function is called stress and involves a function of the original distances; and lastly in non-metric MDS the original distances are dissimilarities, so the stress function finds a non-parametric monotonic relationship between the dissimilarities in the item-item matrix and the Euclidean distances between items, and defines the location of each item.
Classical multidimensional scaling is also known as Principal Coordinates Analysis (PCoA). The data used for multidimensional scaling (MDS) are dissimilarities between pairs of objects. The main objective of MDS is to represent these dissimilarities as distances between points in a low-dimensional space such that the distances correspond as closely as possible to the dissimilarities. So MDS, unlike other techniques, does not use the matrix X but the matrix D (a pairwise dissimilarity matrix). Using D, MDS tries to reconstruct X in a lower dimension while keeping the original distances.
PCoA and NMDS use pairwise dissimilarities between data points to find an embedding in Euclidean space that provides the best approximation to the supplied distances. Whereas cMDS is a matrix decomposition method akin to PCA, NMDS is an optimization technique that strives to retain only the ordering of the dissimilarities. The latter approach is more applicable when you have low confidence in the values of the input distances.
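To make the "D, not X" point concrete, here is a sketch that builds a dissimilarity matrix with an arbitrary metric and feeds it to scikit-learn's MDS as a precomputed input (the cityblock metric is just an illustrative choice; `luce` is the scaled matrix from above):

from scipy.spatial.distance import pdist, squareform
from sklearn.manifold import MDS

D = squareform(pdist(luce, metric='cityblock'))               # any (dis)similarity measure works here
mds_pre = MDS(n_components=2, dissimilarity='precomputed', random_state=42)
emb_pre = mds_pre.fit_transform(D)                            # MDS only ever sees D, never X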
- Classic MDS:
It is also known as Principal Coordinates Analysis (PCoA). It takes an input matrix giving dissimilarities between pairs of items and outputs a coordinate matrix whose configuration minimizes a loss function called strain. It is conceptually similar to principal components analysis (PCA) and correspondence analysis (CA), which preserve Euclidean and χ² (chi-squared) distances between objects, respectively; however, PCoA can preserve distances generated from any (dis)similarity measure, allowing more flexible handling of complex ecological data.
- Metric MDS:
It is a superset of classical MDS that generalizes the optimization procedure to a variety of loss functions and input matrices of known distances with weights and so on (called metric MDS if dissimilarities dij are quantitative).
A useful loss function in this context is called stress, which is often minimized using a procedure called stress majorization. The (usual) metric MDS: Given a (low) dimension p and a monotone function f , metric MDS seeks to find an optimal configuration that gives:
$$f(d_{ij}) = \hat{d}_{ij} = \left \| x_{i} - x_{j} \right \|_{2}$$

as close as possible.
Metric MDS minimizes the cost function called “Stress” which is a residual sum of squares. ‘As close as possible’ is now explicitly stated by square loss:
$$stress = \left(\sum_{i<j} (\hat{d}_{ij} - f(d_{ij}))^{2} \Big/ \sum_{i<j} d_{ij}^{2}\right)^{\frac{1}{2}}$$

1) Sammon's MDS
Sammon mapping is a generalization of the usual metric MDS. Sammon’s stress (to be minimized) is:
$$Sammon's\ stress\ (\hat{d}_{ij}) = \frac{1}{\sum_{i<k} d_{ik}} \sum_{i<j} \frac{(\hat{d}_{ij}-d_{ij})^{2}}{d_{ij}}$$

This weighting system normalizes the squared errors in pairwise distances by using the distance in the original space. As a result, Sammon mapping preserves the small dᵢⱼ, giving them a greater degree of importance in the fitting procedure than larger values of dᵢⱼ. Sammon mapping better preserves inter-distances for smaller dissimilarities, while proportionally squeezing the inter-distances for larger dissimilarities.
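Sammon's stress is simple enough to compute directly; a small helper (a sketch: it assumes condensed distance vectors such as those returned by scipy's `pdist`, and no zero distances in the original space):

import numpy as np
from scipy.spatial.distance import pdist

def sammon_stress(d_orig, d_map):
    """Sammon's stress between original distances d_orig and map distances d_map."""
    return np.sum((d_map - d_orig) ** 2 / d_orig) / np.sum(d_orig)

# e.g., once `mds_emb` is computed further below:
# sammon_stress(pdist(luce), pdist(mds_emb))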
- Non-Metric MDS:
In contrast to metric MDS, non-metric MDS finds both a non-parametric monotonic relationship between the dissimilarities in the item-item matrix and the Euclidean distances between items, and the location of each item in the low-dimensional space. Non-metric multidimensional scaling (NMDS) is an indirect gradient analysis approach which produces an ordination based on a distance or dissimilarity matrix. Unlike methods which attempt to maximise the variance or correspondence between objects in an ordination, NMDS attempts to represent, as closely as possible, the pairwise dissimilarity between objects in a low-dimensional space. Any dissimilarity coefficient or distance measure may be used to build the distance matrix used as input.
NMDS is a rank-based approach. This means that the original distance data are substituted with ranks. The relationship is typically found using isotonic regression. Unlike in metric MDS, here f is much more general and is only implicitly defined. We typically use Kruskal's non-metric MDS, which minimizes the stress-1:
$$Kruskal\ stress-1(\hat{d}_{ij}, d^{*}_{ij}) = \left(\frac{\sum_{i<j} (\hat{d}_{ij}-d_{ij}^{*})^{2}}{\sum_{i<j} \hat{d}_{ij}^{2}}\right)^{\frac{1}{2}}$$

- General Algorithm:
Step 1 Compute the Gram Matrix: $$B= -\frac{1}{2} C_{n}D^{2}C_{n}$$
Step 2 Find eigenvalues and eigenvectors of B
Step 3 Use the p eigenvectors corresponding to the p largest eigenvalues of B to construct Xₚ
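The three steps are easy to write out for classical MDS / PCoA (a minimal sketch on the scaled `luce` array, with p = 2; the centering matrix and the Gram matrix follow the formula in Step 1):

from scipy.spatial.distance import pdist, squareform

D2 = squareform(pdist(luce, metric='euclidean')) ** 2      # squared pairwise distances
n = D2.shape[0]
Cn = np.eye(n) - np.ones((n, n)) / n                       # centering matrix C_n
B = -0.5 * Cn @ D2 @ Cn                                    # Step 1: Gram matrix
eigval, eigvec = np.linalg.eigh(B)                         # Step 2: eigenvalues / eigenvectors
order = np.argsort(eigval)[::-1]                           # sort by decreasing eigenvalue
eigval, eigvec = eigval[order], eigvec[:, order]
Xp = eigvec[:, :2] * np.sqrt(np.maximum(eigval[:2], 0))    # Step 3: coordinates from the top p eigenpairs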
- MDS vs PCA (apart from dissimilarity):
Unlike other ordination methods, MDS makes few assumptions about the nature of the data. For example, principal components analysis assumes linear relationships and reciprocal averaging assumes modal relationships. MDS makes neither of these assumptions, so it is well suited for a wide variety of data. MDS also allows the use of any distance measure between the samples, unlike other methods which specify particular measures, such as covariance or correlation in PCA or the implied chi-squared measure in detrended correspondence analysis.
- Drawback:
It is said that it is easy to observe empirically that many of the manifold learning methods (like MDS) are not good for data visualization. The reason is that they have been designed to find a d-dimensional manifold if the inherent dimensionality of data is d. For visualization, the display needs to have d = 2 or d = 3; that is, the dimensionality may need to be reduced beyond the inherent dimensionality of data.
components=2
mds = MDS(random_state=42,
n_components=components,
verbose=1)
mds_emb = mds.fit_transform(luce)
mds_df = pd.DataFrame(mds_emb, columns = [f'PC{i+1}' for i in range(components)])
mds_df.head(2)
|   | PC1 | PC2 |
|---|---|---|
| 0 | 52.919393 | 69.977881 |
| 1 | 79.787537 | 50.920510 |
Multiple Panel Plots¶
def umaps(data, k=None, dist=None):
if k: reducer = umap.UMAP(random_state=42,
n_neighbors= k ,
metric='euclidean',
n_components=2)
elif dist: reducer = umap.UMAP(random_state=42,
min_dist = dist,
metric='euclidean',
n_components=2)
reducer.fit(data)
embedding = reducer.transform(data)
return embedding
k = [5,50]
multiple_emb = [umaps(luce, k=i) for i in k]
umap_df = pd.DataFrame(multiple_emb[0], columns = [f'PC{i+1}' for i in range(2)])
perplexities = [2,10,50,100]
multiple_emb = tsnes(luce, p=perplexities)
tsne_df = pd.DataFrame(multiple_emb[2], columns = [f'PC{i+1}' for i in range(2)]) # perplexity = 50, matching the panel title below
fig, axes = plt.subplots(2,2, figsize=(20,20))
style_title= dict(size=20, weight='bold', style='italic')
style_labels = dict(size=15, style='italic')
sns.scatterplot(ax = axes[0][0], data = pca_df, x = 'PC1', y= 'PC2',
hue=colors, legend=False)
axes[0][0].set_title("Principal Components analysis\n", fontdict=style_title)
axes[0][0].set_ylabel("Component_2", fontdict=style_labels)
axes[0][0].set_xlabel("Component_1", fontdict=style_labels)
sns.scatterplot(ax = axes[0][1], data = mds_df, x = 'PC1', y= 'PC2',
hue=colors, legend=False)
axes[0][1].set_title("Multidimensional Scaling\n", fontdict=style_title)
axes[0][1].set_ylabel("Component_2", fontdict=style_labels)
axes[0][1].set_xlabel("Component_1", fontdict=style_labels)
sns.scatterplot(ax = axes[1][0], data = tsne_df, x = 'PC1', y= 'PC2',
hue=colors, legend=False)
axes[1][0].set_title("T-SNE with perplexity = 50\n", fontdict=style_title)
axes[1][0].set_ylabel("Component_2", fontdict=style_labels)
axes[1][0].set_xlabel("Component_1", fontdict=style_labels)
sns.scatterplot(ax = axes[1][1], data = umap_df, x = 'PC1', y= 'PC2',
hue=colors, legend=False)
axes[1][1].set_title("UMAP\n", fontdict=style_title)
axes[1][1].set_ylabel("Component_2", fontdict=style_labels)
axes[1][1].set_xlabel("Component_1", fontdict=style_labels)
plt.show()