%matplotlib inline

# General packages

import itertools

import pandas as pd

import numpy as np

# Packages for visualizing

import matplotlib.pyplot as plt

import seaborn as sns

import geopandas as gpd

# Packages for clustering

from sklearn.preprocessing import StandardScaler

from sklearn.cluster import (KMeans,

AgglomerativeClustering)

import scipy.cluster.hierarchy as sch

from sklearn.decomposition import PCA

import bioinfokit.visuz

R code

library(tidyverse)

library(glue)

library(maps)

library(factoextra)

Python code

url="https://cssbook.net/d/eurobarom_nov_2017.csv"

d2=pd.read_csv(url)

print("Shape of my filtered data =", d2.shape)

print("Variables:", d2.columns)

R code

url="https://cssbook.net/d/eurobarom_nov_2017.csv"

d2= read_csv(url, col_names = TRUE)

glue("{nrow(d2)} row x {ncol(d2)} columns")

colnames(d2)

Example 7.2

Python code

print(d2["gender"].value_counts())

print(d2["gender"].value_counts(normalize=True))

R code

d2 %>%

group_by(gender) %>%

summarise(frequency = n()) %>%

mutate(rel_freq = frequency / sum(frequency))

Python code

print(d2["support_refugees"].value_counts())

print(d2["support_refugees"].value_counts(

normalize=True,dropna=False))

R code

d2 %>%

group_by(support_refugees) %>%

summarise(frequency = n()) %>%

mutate(rel_freq = frequency / sum(frequency))

Example 7.3

Python code

n_miss = d2["support_refugees"].isna().sum()

print(f"# of missing values: {n_miss}")

d2 = d2.dropna()

print(f"Shape after dropping NAs: {d2.shape}")

R code

n_miss = sum(is.na(d2$support_refugees))

print(glue("# of missing values: {n_miss}"))

d2 = d2 %>% drop_na()

print(glue("Rows after dropping NAs: {nrow(d2)}"))

Example 7.4

Python code

print("Crosstab gender and support_refugees:")

print(pd.crosstab(d2["support_refugees"],

d2["gender"]))

print("Summary statistics for group of cases:")

print(d2.groupby(["support_refugees", "gender"])

["age"].mean())

R code

print("Crosstab gender and support_refugees:")

d2 %>%

group_by(gender, support_refugees)%>%

summarise(n=n())%>%

pivot_wider(values_from="n",names_from="gender")

print("Summary statistics for group of cases:")

d2 %>%

group_by(support_refugees, gender)%>%

summarise(mean_age=mean(age, na.rm = TRUE))

Example 7.5

Python code

d2["support_refugees"].value_counts().plot(

kind="bar")

plt.show()

R code

ggplot(data=d2) +

geom_bar(mapping = aes(x= support_refugees))

Example 7.6

Python code

d2.hist(column="age", bins=15)

plt.show()

R code

ggplot(data=d2) +

geom_histogram(mapping = aes(x= age), bins = 15)

Example 7.7

Python code

d2 = d2.sort_values(by ="country" )

plt.figure(figsize=(8,8))

sns.boxplot(x="age", y="country", data=d2)

plt.show()

R code

ggplot(d2, aes(y=fct_rev(country), x=age))+

geom_boxplot()

Example 7.8

Python code

support_refugees = (d2.groupby(["date_n"])

["support_refugees_n"].mean())

support_refugees = support_refugees.to_frame()

plt.plot(support_refugees.index,

support_refugees["support_refugees_n"])

plt.xlabel("Day")

plt.ylabel("Support for refugees")

plt.show()

R code

support_refugees = d2 %>%

group_by(date_n) %>%

summarise(support=mean(support_refugees_n,

na.rm = TRUE))

ggplot(support_refugees,aes(x=date_n, y=support))+

geom_line() +

xlab("Day") +

ylab("Support for refugees")

Example 7.9

Python code

# Combine data

support_combined = d2.groupby(["date_n"]).agg(

refugees = ("support_refugees_n", "mean"),

migrants = ("support_migrants_n", "mean"))

#plot

sns.lineplot(x="date_n", y="refugees",

data=support_combined, color="blue")

sns.lineplot(x="date_n", y="migrants",

data=support_combined, color="red")

plt.xlabel("Day")

plt.ylabel("Level of support")

plt.title("Support of refugees and migrants")

plt.show()

R code

# Combine data

support_combined = d2 %>% group_by(date_n) %>%

summarise(

refugees=mean(support_refugees_n, na.rm = TRUE),

migrants=mean(support_migrants_n, na.rm = TRUE))

# Pivot to long format and plot

support_long = support_combined %>%

pivot_longer(-date_n, names_to="group",

values_to="support")

ggplot(support_long,

aes(x=date_n, y=support, colour=group)) +

geom_line(size = 1.5) +

labs(title="Support for refugees and migrants",

x="Day", y="Level of Support")

Example 7.10

Python code

f, axes = plt.subplots(2,1)

sns.lineplot(x="date_n", y="refugees",

data=support_combined, ax=axes[0])

sns.lineplot(x="date_n", y="migrants",

data=support_combined, ax=axes[1])

sns.lineplot(x="date_n", y="support_refugees_n",

data=d2, ci=0, ax=axes[0])

sns.lineplot(x="date_n", y="support_migrants_n",

data=d2, ci=0, ax=axes[1])

plt.show()

R code

ggplot(support_long, aes(x=date_n, y=support)) +

geom_line() + facet_grid(rows=vars(group)) +

xlab("Day") + ylab("Support")

Example 7.11

Python code

sns.scatterplot(data=support_combined,

x="refugees", y="migrants")

R code

ggplot(support_combined,

aes(x=refugees, y=migrants))+

geom_point()

Example 7.12

Python code

sns.lmplot(data=support_combined,

x="refugees", y="migrants")

plt.show()

R code

ggplot(support_combined,

aes(x=refugees, y= migrants))+

geom_point()+

geom_smooth(method = lm)

Example 7.13

Python code

print(support_combined["refugees"]

.corr(support_combined["migrants"],

method="pearson"))

R code

cor.test(support_combined$refugees,

support_combined$migrants,

method = "pearson")

Example 7.14

Python code

pivot_data = pd.pivot_table(d2,

values="support_refugees_n",

index=["country"], columns="gender")

R code

pivot_data= d2 %>%

select(gender, country, support_refugees_n) %>%

group_by(country, gender) %>%

summarise(score = mean(support_refugees_n))

Example 7.15

Python code

plt.figure(figsize=(10,6))

sns.heatmap(pivot_data, cmap="Blues",

cbar_kws={"label": "support_refugees_n"})

plt.show()

R code

ggplot(pivot_data, aes(x = gender,

y = fct_rev(country), fill = score)) +

geom_tile()+

scale_fill_gradient2(low="white", high="blue")

Example 7.16

Python code

sns.lineplot(x="date_n", y="support_refugees_n",

data=d2, color="blue", ci=100, label="Refugees")

sns.lineplot(x="date_n", y="support_migrants_n",

data=d2, color="red", ci=100, label="Migrants")

plt.xlabel("Day")

plt.ylabel("Level of support")

plt.title("Support for refugees and migrants")

plt.show()

R code

ggplot(support_long,

aes(x=date_n, y=support, color=group)) +

geom_line(size=1.5) +

geom_ribbon(aes(fill=group, ymin=support-0.15,

ymax=support+0.15),

alpha=.1, lty=0) +

ggtitle("Support for refugees and migrants")

Example 7.17

Python code

supports_country = (d2.groupby(["country"])

["support_refugees_n"].mean()

.to_frame().reset_index())

#Load a world map and plot it

wmap = gpd.read_file(

gpd.datasets.get_path("naturalearth_lowres"))

wmap = wmap.rename(columns={"name": "country"})

wmap.plot();

R code

supports_country = d2 %>%

group_by(country) %>%

summarise(m=mean(support_refugees_n,na.rm=TRUE))

#Load a world map and plot it

wmap = map_data("world")

ggplot(wmap, aes(x=long,y=lat,group=group)) +

geom_polygon(fill="lightgray", colour = "white")

Example 7.18

Python code

countries = [

"Portugal", "Spain", "France", "Germany",

"Austria", "Belgium", "Netherlands", "Ireland",

"Denmark", "Poland", "UK", "Latvia", "Cyprus",

"Croatia", "Slovenia", "Hungary", "Slovakia",

"Czech republic", "Greece", "Finland", "Italy",

"Luxemburg", "Sweden", "Sweden", "Bulgaria",

"Estonia", "Lithuania", "Malta", "Romania"]

m = wmap.loc[

wmap["country"].isin(countries)]

m = pd.merge(supports_country, m, on="country")

R code

countries = c(

"Portugal", "Spain", "France", "Germany",

"Austria", "Belgium", "Netherlands", "Ireland",

"Denmark", "Poland", "UK", "Latvia", "Cyprus",

"Croatia", "Slovenia", "Hungary", "Slovakia",

"Czech republic", "Greece", "Finland", "Italy",

"Luxemburg", "Sweden", "Sweden", "Bulgaria",

"Estonia", "Lithuania", "Malta", "Romania")

m = wmap %>% rename(country=region) %>%

filter(country %in% countries) %>%

left_join(supports_country, by="country")

Example 7.19

Python code

m = gpd.GeoDataFrame(m, geometry=m["geometry"])

m.plot(column="support_refugees_n",

legend=True, cmap="OrRd",

legend_kwds={"label": "Level of suppport"}

).set_title("Support of refugees by country")

R code

ggplot(m, aes(long, lat, group=group))+

geom_polygon(aes(fill = m), color="white")+

scale_fill_viridis_c(option="B")+

labs(title="Support for refugees by country",

fill="Level of support")

Example 7.20

Python code

# Average variables by country and scale

d3 = d2.groupby(["country"])[[

"support_refugees_n",

"support_migrants_n",

"age",

"educational_n"]].mean()

scaler = StandardScaler()

d3_s = scaler.fit_transform(d3)

# Store sum of squares for 1..15 clusters

wss = []

for i in range(1, 15):

km_out = KMeans(n_clusters=i, n_init=20)

km_out.fit(d3_s)

wss.append(km_out.inertia_)

plt.plot(range(1, 15), wss, marker="o")

plt.xlabel("Number of clusters")

plt.ylabel("Within groups sum of squares")

plt.show()

R code

# Average variables by country and scale

d3_s = d2%>%

group_by(country)%>%

summarise(

m_refugees=mean(support_refugees_n, na.rm=T),

m_migrants=mean(support_migrants_n, na.rm=T),

m_age=mean(age, na.rm=T),

m_edu=mean(educational_n, na.rm=T)) %>%

column_to_rownames(var="country") %>%

scale()

# Store sum of squares for 1..15 clusters

wss = list()

for (i in 1:15) {

km.out = kmeans(d3_s, centers=i, nstart=25)

wss[[i]] = tibble(k=i, ss=km.out$tot.withinss)

}

wss = bind_rows(wss)

ggplot(wss, aes(x=k, y=ss)) +

geom_line() + geom_point() +

xlab("Number of Clusters") +

ylab("Within groups sum of squares")

Example 7.21

Python code

# Compute k-means with k = 3

km_res = KMeans(n_clusters=3, n_init=25).fit(d3_s)

print(km_res)

print("K-means cluste sizes:",

np.bincount(km_res.labels_[km_res.labels_>=0]))

print(f"Cluster means: {km_res.cluster_centers_}")

print("Clustering vector:")

print(np.column_stack((d3.index, km_res.labels_)))

print("Within cluster sum of squares:")

print(km_res.inertia_)

R code

set.seed(123)

km.res = kmeans(d3_s, 3, nstart=25)

print(km.res)

Example 7.22

Python code

for cluster in range(km_res.n_clusters):

plt.scatter(d3_s[km_res.labels_ == cluster, 0],

d3_s[km_res.labels_ == cluster, 1])

plt.scatter(km_res.cluster_centers_[:, 0],

km_res.cluster_centers_[:, 1],

s=250, marker="*")

plt.legend(scatterpoints=1)

plt.show()

R code

fviz_cluster(km.res, d3_s, ellipse.type="norm")

Example 7.23

Python code

hc_res = AgglomerativeClustering(

affinity = "euclidean", linkage = "complete")

hc_res.fit_predict(d3_s)

print(hc_res)

R code

hc.res <- hcut(d3_s, hc_method="complete")

summary(hc.res)

Example 7.24

Python code

dendrogram = sch.dendrogram(

sch.linkage(d3_s, method="complete"),

labels=list(d3.index), leaf_rotation=90)

R code

plot(hc.res, cex=0.5)

Example 7.25

Python code

hc_res = AgglomerativeClustering(n_clusters=3,

affinity = "euclidean", linkage = "ward")

hc_res.fit_predict(d3_s)

print(hc_res)

R code

hc.res = hcut(d3_s, k=3, hc_method="complete")

summary(hc.res)

Example 7.26

Python code

for cluster in range(hc_res.n_clusters):

plt.scatter(d3_s[hc_res.labels_==cluster, 0],

d3_s[hc_res.labels_==cluster, 1])

plt.legend(scatterpoints=1)

plt.show()

R code

fviz_cluster(hc.res, d3_s, ellipse.type="convex")

Example 7.27

Python code

pca_m = PCA()

pca = pca_m.fit(d3_s)

pca_n = PCA()

pca = pca_n.fit_transform(d3_s)

pca_df = pd.DataFrame(data=pca,

columns=["PC1", "PC2", "PC3", "PC4"])

pca_df.index = d3.index

print(pca_df.head())

pca_df_2 = pd.DataFrame(data=pca_n.components_.T,

columns=["PC1", "PC2", "PC3", "PC4"])

pca_df_2.index = d3.columns

print(pca_df_2)

R code

pca = prcomp(d3_s, scale=TRUE)

head(pca$x)

pca$rotation

Example 7.28

Python code

var1 = round(pca_n.explained_variance_ratio_[0],2)

var2 = round(pca_n.explained_variance_ratio_[1],2)

bioinfokit.visuz.cluster.biplot(cscore=pca,

loadings=pca_n.components_,

labels=pca_df_2.index.values,

var1=var1, var2=var2, show=True)

R code

biplot(x = pca, scale = 0, cex = 0.6,

col = c("blue4", "brown3"))

Example 7.29

Python code

print("Proportion of variance explained:")

print(pca_n.explained_variance_ratio_)

R code

print("Proportion of variance explained:")

prop_var = tibble(pc=1:4,

var=pca$sdev^2 / sum(pca$sdev^2))

prop_var

Example 7.30

Python code

plt.bar([1,2,3,4],pca_n.explained_variance_ratio_)

plt.ylabel("Proportion of variance explained")

plt.xlabel("Principal component")

plt.xticks([1,2,3,4])

plt.show()

R code

ggplot(prop_var, aes(x=pc, y=var)) +

geom_col() +

scale_y_continuous(limits = c(0,1)) +

xlab("Principal component") +

ylab("Proportion of variance explained")

Example 7.31

Python code

cvar = np.cumsum(pca_n.explained_variance_ratio_)

cvar

R code

cvar = cumsum(prop_var)

cvar

Example 7.32

Python code

plt.plot(cvar)

plt.xlabel("number of components")

plt.xticks(np.arange(len(cvar)),

np.arange(1, len(cvar)+1))

plt.ylabel("cumulative explained variance")

plt.show()

R code

ggplot(cvar, aes(x=pc, y=var)) +

geom_point() +

geom_line() +

theme_bw() +

xlab("Principal component") +

ylab("Cumulative explained variance")

Example 7.33

Python code

#Generate a new dataset with first components

d5 = pca[:,0:2]

d5[0:5]

#Get optimal number of clusters

wss = []

for i in range(1, 15):

km_out = KMeans(n_clusters=i, n_init=20)

km_out.fit(d5)

wss.append(km_out.inertia_)

# Plot sum of squares vs. number of clusters

plt.plot(range(1, 15), wss, marker="o")

plt.xlabel("Number of clusters")

plt.ylabel("Within groups sum of squares")

plt.show()

# Compute again with k = 3 and visualize

km_res_5 = KMeans(n_clusters=3, n_init=25).fit(d5)

for cluster in range(km_res_5.n_clusters):

plt.scatter(d3_s[km_res_5.labels_==cluster, 0],

d3_s[km_res_5.labels_==cluster, 1])

plt.scatter(km_res_5.cluster_centers_[:, 0],

km_res_5.cluster_centers_[:, 1],

s=250, marker="*")

plt.legend(scatterpoints=1)

plt.show()

R code

#Generate a new dataset with first components

d5 = pca$x[, c("PC1", "PC2")]

head(d5)

#Get optimal number of clusters

wss = list()

for (i in 1:15) {

km.out = kmeans(d5, centers = i, nstart = 20)

wss[[i]] = tibble(k=i, ss=km.out$tot.withinss)

}

wss = bind_rows(wss)

# Plot sum of squares vs. number of clusters

ggplot(wss, aes(x=k, y=ss)) + geom_line() +

xlab("Number of Clusters") +

ylab("Within groups sum of squares")

# Compute again with k = 3 and visualize

set.seed(123)

km.res_5 <- kmeans(d5, 3, nstart = 25)

fviz_cluster(km.res_5, d5, ellipse.type = "norm")