Data Viz Basic
2021, May 06
Data visualization basics: exploring matplotlib, pandas, and seaborn.
In [1]:
import matplotlib.patches as patches
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
In [2]:
# Font sizes (in points) shared by the rcParams below.
large = 22
med = 16
small = 12

# Notebook-wide matplotlib defaults.
# The original dict listed 'axes.titlesize' twice (first `large`, then `med`).
# Python keeps only the last duplicate key, so `med` was the value actually
# applied; the dead first entry is removed and the effective value kept.
params = {
    'figure.figsize': (14, 5),
    'figure.titlesize': large,
    'axes.titlesize': med,
    'axes.labelsize': med,
    'xtick.labelsize': med,
    'ytick.labelsize': med,
    'legend.fontsize': med,
}
plt.rcParams.update(params)
Subplots¶
In [3]:
# Two toy series of different lengths: the odd numbers 1..19 and the
# integers counting down from 55 to 41. Each series is plotted against itself.
x1, x2 = list(range(1, 20, 2)), list(range(55, 40, -1))
y1, y2 = list(range(1, 20, 2)), list(range(55, 40, -1))

fig, axes = plt.subplots(1, 3)

# With a single row `axes` is one-dimensional, so one index selects a panel
# (a grid with more than one row and column needs two indices).
sns.lineplot(x=x1, y=y1, ax=axes[0])
sns.lineplot(x=x2, y=y2, ax=axes[1])

# The third panel overlays both series with distinct line styles and labels.
g1 = sns.lineplot(x=x1, y=y1, ax=axes[2], label="multy_1",
                  linestyle='dashed', marker='s')
g2 = sns.lineplot(x=x2, y=y2, ax=axes[2], label="multy_2",
                  linestyle='dashdot')
axes[2].axvline(x=30, color='black', linestyle='dotted')  # vertical marker at x = 30

# Same x-label everywhere, one title per panel.
for panel, heading in zip(axes, ("First", "Second", "Third")):
    panel.set_title(heading)
    panel.set_xlabel("yep")

axes[2].legend(loc="lower center")
# On an Axes object the limit setters are set_ylim/set_xlim; the pyplot
# equivalents are plt.ylim/plt.xlim.
axes[2].set_ylim(-50, 70)
axes[2].set_xlim(-5, 70)

# Available style names are listed in plt.style.available.
# NOTE(review): the style is switched after this figure was built, so it
# presumably affects mainly the figures created by later cells - confirm
# that this ordering is intended.
plt.style.use("bmh")
plt.show()
Multiple Lines, Vertical/Horizontal Reference Lines and Text Annotation¶
In [4]:
plt.figure(figsize=(10, 7), dpi=80)

# 1000 evenly spaced sample points between -3 and 3*pi.
x = np.linspace(-3, 3 * np.pi, 1000)

# x= and y= are passed by keyword to avoid seaborn's warning about
# positional data arguments.
sine = sns.lineplot(x=x, y=np.sin(x), label='sine')
cosine = sns.lineplot(x=x, y=np.cos(x), label='cosine')

# Dotted reference lines through the origin; being labelled, they show up
# in the legend as well.
plt.axhline(y=0.0, color='black', linestyle='dotted', label='H')
plt.axvline(x=0.0, color='black', linestyle='dotted', label='V')

plt.title("Sine-Cosine")
plt.xlabel("Time")
plt.ylabel("Value")
plt.xlim(-5, 10)
plt.ylim(-2, 2)

# Legend appearance.
legend_options = dict(
    frameon=True,       # draw the frame around the legend
    framealpha=1,       # opacity of the legend background (1 = opaque)
    ncol=2,             # lay the entries out in two columns
    shadow=True,        # drop shadow behind the legend
    borderpad=0.5,      # padding between the entries and the frame
    title='Sines and Cosines',
)
plt.legend(**legend_options)

# Boxed label with an arrow: `xy` is the point the arrow targets,
# `xytext` is where the label itself is placed.
plt.annotate(
    'Peaks',
    xy=(1.6, 1.0),
    xytext=(1.6, 1.3),
    bbox=dict(boxstyle='square', fc='green', linewidth=0.1),
    arrowprops=dict(facecolor='green', shrink=0.01, width=0.1),
    fontsize=12,
    color='white',
    horizontalalignment='center',
)

# Free-standing text; no transform is given, so x and y use the default
# coordinate system of plt.text.
plt.text(x=2.5, y=-1.3, s="2.5\ndegrees", horizontalalignment='center', color='red')
plt.show()
In [5]:
## Customize subplot layout
In [6]:
fig = plt.figure()

# All panels are carved out of the same 3x3 grid; the second argument is the
# (row, col) of a panel's top-left cell.
grid_shape = (3, 3)
ax1 = plt.subplot2grid(grid_shape, (0, 0), rowspan=2, colspan=2)  # top-left 2x2 block
ax3 = plt.subplot2grid(grid_shape, (0, 2), rowspan=3)             # full-height right column
ax4 = plt.subplot2grid(grid_shape, (2, 0))                        # bottom-left cell
ax5 = plt.subplot2grid(grid_shape, (2, 1))                        # bottom-middle cell

# Plain matplotlib plots are drawn by calling methods on the target Axes...
ax1.plot([1, 2, 3], [2, 3, 2])
ax3.plot([1, 2, 3], [0, 1, 2])
ax4.plot([1, 2, 3], [0, 1, 0])
# ...whereas a seaborn plot must be told where to draw through ax=.
sns.lineplot(x=[1, 2, 3], y=[0, 1, 0], ax=ax5)

fig.tight_layout()  # let matplotlib adjust the spacing between the panels
plt.show()
Inner Plot¶
In [7]:
fig, axes = plt.subplots()

# Main plot: a dashed horizontal segment, viewed through a -5..5 window.
# Limits on an Axes are set with set_xlim / set_ylim.
view = (-5, 5)
axes.plot([1, 3, 4], [0, 0, 0],
          color='firebrick', linestyle='dashed', label='main')
axes.set_xlim(*view)
axes.set_ylim(*view)
axes.legend(loc='best')

# Inset axes placed on top of the main plot; the rectangle is
# [left, bottom, width, height] as passed to fig.add_axes.
inset_rect = [0.2, 0.3, 0.2, 0.3]
inner = fig.add_axes(inset_rect)
inner.plot([3, 2, 1], [4, 2, 1],
           color="royalblue", linestyle='dotted', label='inner')
inner.legend(loc='best')

plt.show()
Basics Plot¶
ScatterPlot¶
In [8]:
midwest = pd.read_csv("https://raw.githubusercontent.com/selva86/datasets/master/midwest_filter.csv")

fig = plt.figure(figsize=(14, 7), dpi=200)

# Because data=midwest is given, the string arguments below are looked up
# as column names of that frame.
bubble_style = dict(
    s='dot_size',        # marker size (area in points**2), one value per row
    c='popdensity',      # values that drive the marker colour
    cmap='icefire',      # colormap (palette) used to translate `c` into colours
    edgecolors='black',
    linewidths=1,
)
plt.scatter(x='area', y='poptotal', data=midwest, **bubble_style)

plt.colorbar()  # colour scale on the right-hand side
plt.title(
    "Bubble Plot of PopTotal vs Area\n(color: 'popdensity' & size: 'dot_size' - both are numeric columns in midwest)",
    fontsize=16,
)
plt.show()
Seaborn HUE options¶
In [9]:
plt.figure(figsize=(10, 10))

# Colour by `category` and scale the markers by `dot_size`.
# seaborn chooses hue colours through `palette`; the original passed
# cmap="Reds", which seaborn does not use for the hue mapping, so the
# requested "Reds" colours never appeared.
sns.scatterplot(
    data=midwest,
    x='area',
    y='poptotal',
    hue='category',
    size='dot_size',
    palette="Reds",
)
plt.title("Hue category in SNS")
plt.show()
In [10]:
## KDE, Kernel density distribution
In [11]:
iris = sns.load_dataset("iris")

# One sub-frame per species to be compared.
is_setosa = iris["species"] == "setosa"
is_virginica = iris["species"] == "virginica"
setosa = iris.loc[is_setosa]
virginica = iris.loc[is_virginica]

# Bivariate density (sepal width vs. sepal length) for each species, drawn
# on the same axes with its own colormap and 20 contour levels.
# NOTE(review): contour plots may not register `label` as a legend entry -
# confirm that plt.legend() really shows both species.
for flowers, colormap, tag in (
    (setosa, "husl", 'Setosa'),
    (virginica, "inferno", 'Virginica'),
):
    sns.kdeplot(data=flowers, x="sepal_width", y="sepal_length",
                cmap=colormap, n_levels=20, label=tag)

plt.legend()
plt.title("Multiple KDE")
plt.show()
Histogram and Displot¶
In [12]:
fig, axes = plt.subplots(1, 2)

# histplot draws the distribution of a single variable, passed as x (or y).
# Left panel: one histogram with a KDE curve on top.
first_hist = sns.histplot(
    data=midwest,
    x='popwhite',
    ax=axes[0],
    kde=True,            # overlay the kernel density estimate
    alpha=0.3,           # bar transparency
    color="firebrick",
)
first_hist.set_title('First')  # seaborn returns the Axes, so titles are set on it

# Right panel: one histogram per state, selected through hue.
second_hist = sns.histplot(
    data=midwest,
    x='popblack',
    ax=axes[1],
    hue='state',
    kde=True,
    alpha=.3,
)
second_hist.set_title('Second')

plt.show()
Barplot¶
In [13]:
fig, axes = plt.subplots(1, 2)

# Left: countplot of the `state` column.
count_ax = sns.countplot(data=midwest, x="state", palette="RdYlGn", ax=axes[0])
count_ax.set_title('Count plot')

# Right: barplot of `popwhite` per state; capsize sets the width of the
# caps on the error bars.
bar_ax = sns.barplot(data=midwest, x="state", y="popwhite",
                     palette="spring", capsize=0.2, ax=axes[1])
bar_ax.set_title("Bar plot")

plt.show()
Bar Chart, tick rotation¶
In [14]:
def show_values_on_bars(axs, h_v="v", space_x=0.12, space_y=0.1, fontdict=None):
    """Write the numeric value of every bar next to the bar.

    Parameters
    ----------
    axs : Axes or numpy.ndarray of Axes
        A single axes object, or an array of them (every element is
        annotated). Each must expose ``patches`` and ``text``.
    h_v : {"v", "h"}
        ``"v"`` labels vertical bars with their height, ``"h"`` labels
        horizontal bars with their width. Any other value annotates nothing.
    space_x, space_y : float
        Offsets added to the label position, in the same units as the bar
        geometry.
    fontdict : dict, optional
        Passed to ``ax.text`` as ``fontdict``.

    Notes
    -----
    Values are rounded to two decimals. Bars whose width or height is NaN
    are skipped, so no label is written at a NaN position. In ``"h"`` mode a
    failure on one bar is reported with ``print`` and the remaining bars are
    still processed; only ``Exception`` subclasses are caught, so
    ``KeyboardInterrupt`` is no longer swallowed.
    """

    def _annotate(ax, p):
        # Place the label for one bar patch.
        width, height = p.get_width(), p.get_height()
        if np.isnan(width) or np.isnan(height):
            return
        if h_v == "v":
            _x = p.get_x() + width / 2 + space_x
            _y = p.get_y() + height + space_y
            value = round(height, 2)
        else:
            _x = p.get_x() + width + space_x
            _y = p.get_y() + height + space_y
            value = round(width, 2)
            if value < 0:
                # Shift the label left for bars with a negative value.
                _x -= 0.27
        # fontdict=None is accepted by ax.text, so one call covers both cases.
        ax.text(_x, _y, value, ha="left", fontdict=fontdict)

    def _show_on_single_plot(ax):
        # Annotate every bar patch of one axes.
        if h_v == "v":
            for p in ax.patches:
                _annotate(ax, p)
        elif h_v == "h":
            for p in ax.patches:
                try:
                    _annotate(ax, p)
                except Exception:
                    print(f'Error while preparing {str(p)}')

    if isinstance(axs, np.ndarray):
        for ax in axs.flat:
            _show_on_single_plot(ax)
    else:
        _show_on_single_plot(axs)
In [15]:
df_raw = pd.read_csv("https://github.com/selva86/datasets/raw/master/mpg_ggplot2.csv")

# Mean city mileage per manufacturer, sorted ascending.
# Only the numeric 'cty' column is aggregated: the original applied
# x.mean() to each group frame, which still held the string 'manufacturer'
# column and raises a TypeError on recent pandas. Chaining also removes the
# inplace=True mutations, so re-running the cell is idempotent.
df = (
    df_raw
    .groupby('manufacturer', as_index=False)['cty']
    .mean()
    .sort_values(by='cty')
)

fig = plt.figure(figsize=(15, 5))
g1 = sns.barplot(data=df, x='manufacturer', y='cty', color='firebrick')
g1.set_ylim(0, 30)

# Print every bar's value; the offsets position the label relative to the bar.
show_values_on_bars(g1, 'v', fontdict=dict(size=15), space_x=-0.4, space_y=1)

# Rotate the manufacturer names on the x axis.
for item in g1.get_xticklabels():
    item.set_rotation(45)

# Translucent coloured boxes behind the x tick labels. Coordinates and sizes
# are interpreted through fig.transFigure rather than data coordinates.
# `patches` is matplotlib.patches, imported in the notebook's import cell.
p1 = patches.Rectangle((.57, -0.005), width=.33, height=.13, alpha=.1,
                       facecolor='green', transform=fig.transFigure)
p2 = patches.Rectangle((.124, -0.005), width=.446, height=.13, alpha=.1,
                       facecolor='red', transform=fig.transFigure)
fig.add_artist(p1)
fig.add_artist(p2)
plt.show()
LinePlot¶
In [16]:
# Population density against area for Illinois and Indiana only.
two_states = midwest[(midwest.state == 'IL') | (midwest.state == 'IN')]

# `ci` is the confidence-interval size expressed in percent, so an 80 %
# interval is ci=80. The original ci=0.8 requested a 0.8 % interval, which
# collapses the band onto the line.
# NOTE(review): newer seaborn releases replace `ci` with
# errorbar=("ci", 80) - switch if the installed version warns.
sns.lineplot(
    data=two_states,
    x="area",
    y="popdensity",
    hue="state",
    palette="dark",
    ci=80,
).set_title("Line Plot")
plt.show()
Box, Box + Strip & jitter, Box + Swarm, Violin Plot, Text inside boxplot¶
In [17]:
fig, axes = plt.subplots(4, 1, figsize=(15, 20))

# All four panels share the same horizontal layout: one row per state,
# adult poverty percentage along x. Switching to orient="v" would also
# require swapping x and y.
poverty_by_state = dict(data=midwest, y="state", x="percadultpoverty", orient="h")

# --- Box plot, with the number of observations written inside each box ---
# (adding hue=<column> would split every box by a further category)
sns.boxplot(ax=axes[0], palette="husl", **poverty_by_state)
# Rows are numbered in the order of midwest.state.unique(); this is assumed
# to match the order in which seaborn draws the boxes - TODO confirm.
for row, state in enumerate(midwest.state.unique()):
    state_poverty = midwest.percadultpoverty[midwest.state == state]
    axes[0].text(
        x=state_poverty.median(),       # centred on the median line
        y=row,
        s=str(len(state_poverty)),      # observation count for this state
        fontdict=dict(size=15, color='white'),
        rotation=270,
    )

# --- Violin plot; `inner` accepts "box", "quartile", "point" or "stick" ---
sns.violinplot(ax=axes[1], palette="Spectral", inner='quartile', **poverty_by_state)

# --- Box plot + jittered strip plot on the same axes ---
sns.boxplot(ax=axes[2], palette="magma", **poverty_by_state)
sns.stripplot(ax=axes[2], palette="magma", jitter=0.5, **poverty_by_state)

# --- Box plot + swarm plot on the same axes ---
sns.boxplot(ax=axes[3], palette="Reds", **poverty_by_state)
sns.swarmplot(ax=axes[3], palette="Reds", **poverty_by_state)

plt.show()
LMPLOT, scatter with best fit line¶
In [18]:
# Illinois and Michigan only.
in_il_or_mi = (midwest.state == 'IL') | (midwest.state == 'MI')

# Scatter plot with a fitted regression line, one facet per state.
# To draw both states in a single panel, use hue='state' instead of col.
marker_style = dict(s=60, linewidths=.7, edgecolors='black')
sns.lmplot(
    data=midwest[in_il_or_mi],
    x='popdensity',
    y='popblack',
    col='state',
    height=4,
    aspect=1.6,
    palette='tab10',
    scatter_kws=marker_style,
    legend=False,
)
plt.show()
Stripplot with Jitter¶
In [19]:
sns.set_style("dark")

# One strip per category, coloured by state.
sns.stripplot(
    data=midwest,
    x="category",
    y="popdensity",
    hue="state",
    dodge=True,      # separate the hue levels side by side within a category
    palette="Reds",
    marker="d",
    size=10,
    linewidth=1,
    alpha=0.6,
)
plt.show()
Marginal Plots¶
In [20]:
midwest = pd.read_csv("https://raw.githubusercontent.com/selva86/datasets/master/midwest_filter.csv")

# JointGrid creates its own figure, so the size is requested through
# `height` (side length of the square figure, in inches). The original
# called plt.figure(figsize=(10, 10)) first, which only produced an extra
# empty figure and left the grid at its default size.
marginal = sns.JointGrid(data=midwest, x="popwhite", y="popdensity",
                         hue='state', height=10)

# Central panel: scatter plot of the two variables.
marginal.plot_joint(sns.scatterplot, color="slateblue")

# Marginal panels: box, swarm and rug plots layered on top of each other.
marginal.plot_marginals(sns.boxplot, color="mediumseagreen")
marginal.plot_marginals(sns.swarmplot, color="orange", s=2)
marginal.plot_marginals(sns.rugplot, color="coral", height=-.15)
plt.show()
<Figure size 720x720 with 0 Axes>
Pairwise Plot¶
In [21]:
iris = sns.load_dataset("iris")

# PairGrid builds its own figure; the original plt.figure(figsize=(10, 10))
# call only left an extra empty figure in the output and has been removed.
# Each facet is `height` inches tall (2.5 is seaborn's documented default).
grid = sns.PairGrid(data=iris, hue="species",
                    palette="dark",
                    height=2.5,
                    hue_kws={"marker": ["d", "s", "+"]})

grid.map_diag(sns.histplot)       # diagonal: distribution of each variable
grid.map_upper(sns.scatterplot)   # above the diagonal: scatter plots
grid.map_lower(sns.kdeplot)       # below the diagonal: density contours
#grid.add_legend()
plt.show()
<Figure size 720x720 with 0 Axes>
Categorical Plot¶
In [22]:
titanic = sns.load_dataset("titanic")

# Keep only the passengers whose deck is known.
deck_missing = pd.isna(titanic.deck)
titanic = titanic[~deck_missing]

# One count plot of `alive` per deck, wrapped after four panels per row.
sns.catplot(
    data=titanic,
    x='alive',
    col='deck',
    col_wrap=4,
    kind='count',
)
plt.show()