Example 7.1

Python code

!pip3 install pandas matplotlib seaborn geopandas 
!pip3 install scikit-learn scipy bioinfokit 
!pip3 install descartes

R code

#install.packages(c("tidyverse""glue""maps"
#                   "factoextra"))

Python code

%matplotlib inline
# General packages
import itertools
import pandas as pd
import numpy as np
# Packages for visualizing
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
# Packages for clustering
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import (KMeans, 
    AgglomerativeClustering)
import scipy.cluster.hierarchy as sch
from sklearn.decomposition import PCA
import bioinfokit.visuz

R code

library(tidyverse)
library(glue)
library(maps)
library(factoextra)

Python code

url="https://cssbook.net/d/eurobarom_nov_2017.csv"
d2=pd.read_csv(url)
print("Shape of my filtered data =", d2.shape)
print("Variables:", d2.columns)

R code

url="https://cssbook.net/d/eurobarom_nov_2017.csv"
d2= read_csv(url, col_names = TRUE)
glue("{nrow(d2)} row x {ncol(d2)} columns")
colnames(d2)

Example 7.2

Python code

print(d2["gender"].value_counts())
print(d2["gender"].value_counts(normalize=True))

R code

d2 %>%
  group_by(gender) %>%
  summarise(frequency = n()) %>%
  mutate(rel_freq = frequency / sum(frequency))   

Python code

print(d2["support_refugees"].value_counts())
print(d2["support_refugees"].value_counts(
    normalize=True,dropna=False))

R code

d2 %>%
  group_by(support_refugees) %>%
  summarise(frequency = n()) %>%
  mutate(rel_freq = frequency / sum(frequency)) 

Example 7.3

Python code

n_miss = d2["support_refugees"].isna().sum()
print(f"# of missing values: {n_miss}")

d2 = d2.dropna()
print(f"Shape after dropping NAs: {d2.shape}")

R code

n_miss = sum(is.na(d2$support_refugees))
print(glue("# of missing values: {n_miss}"))

d2 = d2 %>% drop_na()
print(glue("Rows after dropping NAs: {nrow(d2)}"))

Example 7.4

Python code

print("Crosstab gender and support_refugees:")
print(pd.crosstab(d2["support_refugees"], 
                  d2["gender"]))

print("Summary statistics for group of cases:")
print(d2.groupby(["support_refugees""gender"])
      ["age"].mean())

R code

print("Crosstab gender and support_refugees:")
d2 %>%
  group_by(gender, support_refugees)%>%
  summarise(n=n())%>%
  pivot_wider(values_from="n",names_from="gender")

print("Summary statistics for group of cases:")
d2 %>%
  group_by(support_refugees, gender)%>%
  summarise(mean_age=mean(age, na.rm = TRUE))

Example 7.5

Python code

d2["support_refugees"].value_counts().plot(
    kind="bar")
plt.show()

R code

ggplot(data=d2) +
  geom_bar(mapping = aes(x= support_refugees))

Example 7.6

Python code

d2.hist(column="age", bins=15)
plt.show()

R code

ggplot(data=d2) +
  geom_histogram(mapping = aes(x= age), bins = 15)

Example 7.7

Python code

d2 = d2.sort_values(by ="country" )
plt.figure(figsize=(8,8))
sns.boxplot(x="age", y="country", data=d2)
plt.show()

R code

ggplot(d2, aes(y=fct_rev(country), x=age))+
  geom_boxplot()

Example 7.8

Python code

support_refugees = (d2.groupby(["date_n"])
                    ["support_refugees_n"].mean())
support_refugees = support_refugees.to_frame()

plt.plot(support_refugees.index, 
         support_refugees["support_refugees_n"])
plt.xlabel("Day")
plt.ylabel("Support for refugees")
plt.show()

R code

support_refugees = d2 %>%
  group_by(date_n) %>%
  summarise(support=mean(support_refugees_n, 
                         na.rm = TRUE))
ggplot(support_refugees,aes(x=date_n, y=support))+
  geom_line() + 
  xlab("Day") + 
  ylab("Support for refugees")

Example 7.9

Python code

# Combine data
support_combined = d2.groupby(["date_n"]).agg(
    refugees = ("support_refugees_n""mean"),
    migrants = ("support_migrants_n""mean"))


#plot
sns.lineplot(x="date_n", y="refugees"
             data=support_combined, color="blue")
sns.lineplot(x="date_n", y="migrants"
             data=support_combined, color="red")
plt.xlabel("Day")
plt.ylabel("Level of support")
plt.title("Support of refugees and migrants"
plt.show()

R code

# Combine data
support_combined = d2 %>% group_by(date_n) %>%
 summarise(
  refugees=mean(support_refugees_n, na.rm = TRUE),
  migrants=mean(support_migrants_n, na.rm = TRUE))

# Pivot to long format and plot 
support_long = support_combined %>% 
  pivot_longer(-date_n, names_to="group"
               values_to="support")
ggplot(support_long, 
       aes(x=date_n, y=support, colour=group)) +
  geom_line(size = 1.5) +
  labs(title="Support for refugees and migrants"
       x="Day", y="Level of Support"

Example 7.10

Python code

f, axes = plt.subplots(2,1)
sns.lineplot(x="date_n", y="refugees"
             data=support_combined, ax=axes[0])
sns.lineplot(x="date_n", y="migrants"
             data=support_combined, ax=axes[1])

sns.lineplot(x="date_n", y="support_refugees_n"
             data=d2, ci=0, ax=axes[0])
sns.lineplot(x="date_n", y="support_migrants_n"
             data=d2, ci=0, ax=axes[1])
plt.show()

R code

ggplot(support_long, aes(x=date_n, y=support)) +  
  geom_line() + facet_grid(rows=vars(group)) +
  xlab("Day") + ylab("Support")

Example 7.11

Python code

sns.scatterplot(data=support_combined, 
                x="refugees", y="migrants")

R code

ggplot(support_combined, 
       aes(x=refugees, y=migrants))+
  geom_point()

Example 7.12

Python code

sns.lmplot(data=support_combined, 
           x="refugees", y="migrants")
plt.show()

R code

ggplot(support_combined,
       aes(x=refugees, y= migrants))+
  geom_point()+
  geom_smooth(method = lm)

Example 7.13

Python code

print(support_combined["refugees"]
      .corr(support_combined["migrants"], 
            method="pearson"))

R code

cor.test(support_combined$refugees, 
         support_combined$migrants, 
         method = "pearson")

Example 7.14

Python code

pivot_data = pd.pivot_table(d2, 
  values="support_refugees_n"
  index=["country"], columns="gender")

R code

pivot_data= d2 %>% 
  select(gender, country, support_refugees_n) %>%
  group_by(country, gender) %>%
  summarise(score = mean(support_refugees_n))

Example 7.15

Python code

plt.figure(figsize=(10,6))
sns.heatmap(pivot_data, cmap="Blues"
  cbar_kws={"label""support_refugees_n"}) 
plt.show()

R code

ggplot(pivot_data, aes(x = gender, 
    y = fct_rev(country), fill = score)) + 
  geom_tile()+
  scale_fill_gradient2(low="white", high="blue")

Example 7.16

Python code

sns.lineplot(x="date_n", y="support_refugees_n"
  data=d2, color="blue", ci=100, label="Refugees")
sns.lineplot(x="date_n", y="support_migrants_n"
  data=d2, color="red", ci=100, label="Migrants")
plt.xlabel("Day")
plt.ylabel("Level of support")
plt.title("Support for refugees and migrants"
plt.show()

R code

ggplot(support_long, 
       aes(x=date_n, y=support, color=group)) + 
  geom_line(size=1.5) + 
  geom_ribbon(aes(fill=group, ymin=support-0.15,
                  ymax=support+0.15),
              alpha=.1, lty=0) +
  ggtitle("Support for refugees and migrants")

Example 7.17

Python code

supports_country = (d2.groupby(["country"])
  ["support_refugees_n"].mean()
  .to_frame().reset_index())

#Load a world map and plot it
wmap = gpd.read_file(
    gpd.datasets.get_path("naturalearth_lowres"))
wmap = wmap.rename(columns={"name""country"})
wmap.plot();

R code

supports_country = d2 %>%
  group_by(country) %>%
  summarise(m=mean(support_refugees_n,na.rm=TRUE))

#Load a world map and plot it
wmap = map_data("world")
ggplot(wmap, aes(x=long,y=lat,group=group)) +
  geom_polygon(fill="lightgray", colour = "white")

Example 7.18

Python code

countries = [
  "Portugal""Spain""France""Germany",
  "Austria""Belgium""Netherlands""Ireland",
  "Denmark""Poland""UK""Latvia""Cyprus",
  "Croatia""Slovenia""Hungary""Slovakia",
  "Czech republic""Greece""Finland""Italy",
  "Luxemburg""Sweden""Sweden""Bulgaria"
  "Estonia""Lithuania""Malta""Romania"]
m = wmap.loc[
    wmap["country"].isin(countries)]
m = pd.merge(supports_country, m, on="country")

R code

countries = c(
  "Portugal""Spain""France""Germany",
  "Austria""Belgium""Netherlands""Ireland",
  "Denmark""Poland""UK""Latvia""Cyprus",
  "Croatia""Slovenia""Hungary""Slovakia",
  "Czech republic""Greece""Finland""Italy",
  "Luxemburg""Sweden""Sweden""Bulgaria"
  "Estonia""Lithuania""Malta""Romania")
m = wmap %>% rename(country=region) %>% 
  filter(country %in% countries) %>%
  left_join(supports_country, by="country")

Example 7.19

Python code

m = gpd.GeoDataFrame(m, geometry=m["geometry"])
m.plot(column="support_refugees_n"
  legend=True, cmap="OrRd",
  legend_kwds={"label""Level of suppport"}
).set_title("Support of refugees by country")

R code

ggplot(m, aes(long, lat, group=group))+
  geom_polygon(aes(fill = m), color="white")+
  scale_fill_viridis_c(option="B")+
  labs(title="Support for refugees by country"
       fill="Level of support")

Example 7.20

Python code

# Average variables by country and scale
d3 = d2.groupby(["country"])[[
    "support_refugees_n"
    "support_migrants_n"
    "age"
    "educational_n"]].mean()

scaler = StandardScaler()
d3_s = scaler.fit_transform(d3) 

# Store sum of squares for 1..15 clusters
wss = []
for i in range(1, 15):
    km_out = KMeans(n_clusters=i, n_init=20)
    km_out.fit(d3_s)
    wss.append(km_out.inertia_)
  
plt.plot(range(1, 15), wss, marker="o")
plt.xlabel("Number of clusters")
plt.ylabel("Within groups sum of squares")
plt.show()

R code

# Average variables by country and scale
d3_s = d2%>%
  group_by(country)%>%
  summarise(
    m_refugees=mean(support_refugees_n, na.rm=T), 
    m_migrants=mean(support_migrants_n, na.rm=T),
    m_age=mean(age, na.rm=T),
    m_edu=mean(educational_n, na.rm=T)) %>%
  column_to_rownames(var="country"%>%
  scale()
# Store sum of squares for 1..15 clusters
wss = list()
for (i in 1:15) {
  km.out = kmeans(d3_s, centers=i, nstart=25)
  wss[[i]] = tibble(k=i, ss=km.out$tot.withinss)
}
wss = bind_rows(wss)
ggplot(wss, aes(x=k, y=ss)) + 
  geom_line() + geom_point() + 
  xlab("Number of Clusters") + 
  ylab("Within groups sum of squares")

Example 7.21

Python code

# Compute k-means with k = 3
km_res = KMeans(n_clusters=3, n_init=25).fit(d3_s)
print(km_res)
print("K-means cluste sizes:",  
  np.bincount(km_res.labels_[km_res.labels_>=0]))
print(f"Cluster means: {km_res.cluster_centers_}")
print("Clustering vector:")
print(np.column_stack((d3.index, km_res.labels_)))
print("Within cluster sum of squares:")
print(km_res.inertia_)

R code

set.seed(123)
km.res = kmeans(d3_s, 3, nstart=25)
print(km.res)

Example 7.22

Python code

for cluster in range(km_res.n_clusters):
  plt.scatter(d3_s[km_res.labels_ == cluster, 0], 
              d3_s[km_res.labels_ == cluster, 1])
plt.scatter(km_res.cluster_centers_[:, 0], 
            km_res.cluster_centers_[:, 1], 
            s=250, marker="*")
plt.legend(scatterpoints=1)
plt.show()

R code

fviz_cluster(km.res, d3_s, ellipse.type="norm")

Example 7.23

Python code

hc_res = AgglomerativeClustering(
    affinity = "euclidean", linkage = "complete"
hc_res.fit_predict(d3_s)
print(hc_res)

R code

hc.res <- hcut(d3_s, hc_method="complete")
summary(hc.res)

Example 7.24

Python code

dendrogram = sch.dendrogram(
    sch.linkage(d3_s, method="complete"), 
    labels=list(d3.index), leaf_rotation=90) 

R code

plot(hc.res, cex=0.5)

Example 7.25

Python code

hc_res = AgglomerativeClustering(n_clusters=3, 
  affinity = "euclidean", linkage = "ward")
hc_res.fit_predict(d3_s)
print(hc_res)

R code

hc.res = hcut(d3_s, k=3, hc_method="complete"
summary(hc.res)

Example 7.26

Python code

for cluster in range(hc_res.n_clusters):
    plt.scatter(d3_s[hc_res.labels_==cluster, 0], 
                d3_s[hc_res.labels_==cluster, 1])
plt.legend(scatterpoints=1)
plt.show()

R code

fviz_cluster(hc.res, d3_s, ellipse.type="convex")

Example 7.27

Python code

pca_m = PCA()
pca = pca_m.fit(d3_s)
pca_n = PCA()
pca = pca_n.fit_transform(d3_s)
pca_df = pd.DataFrame(data=pca, 
  columns=["PC1""PC2""PC3""PC4"])
pca_df.index = d3.index
print(pca_df.head())

pca_df_2 = pd.DataFrame(data=pca_n.components_.T, 
  columns=["PC1""PC2""PC3""PC4"])
pca_df_2.index = d3.columns
print(pca_df_2)

R code

pca = prcomp(d3_s, scale=TRUE)
head(pca$x)
pca$rotation

Example 7.28

Python code

var1 = round(pca_n.explained_variance_ratio_[0],2)
var2 = round(pca_n.explained_variance_ratio_[1],2)
bioinfokit.visuz.cluster.biplot(cscore=pca, 
  loadings=pca_n.components_, 
  labels=pca_df_2.index.values, 
  var1=var1, var2=var2, show=True)

R code

biplot(x = pca, scale = 0, cex = 0.6, 
       col = c("blue4""brown3"))

Example 7.29

Python code

print("Proportion of variance explained:")
print(pca_n.explained_variance_ratio_)

R code

print("Proportion of variance explained:")
prop_var = tibble(pc=1:4,
    var=pca$sdev^2 / sum(pca$sdev^2))
prop_var

Example 7.30

Python code

plt.bar([1,2,3,4],pca_n.explained_variance_ratio_)
plt.ylabel("Proportion of variance explained")
plt.xlabel("Principal component")
plt.xticks([1,2,3,4])
plt.show()

R code

ggplot(prop_var, aes(x=pc, y=var)) +
  geom_col() +
  scale_y_continuous(limits = c(0,1)) +
  xlab("Principal component") + 
  ylab("Proportion of variance explained")

Example 7.31

Python code

cvar = np.cumsum(pca_n.explained_variance_ratio_)
cvar 

R code

cvar = cumsum(prop_var)
cvar

Example 7.32

Python code

plt.plot(cvar)
plt.xlabel("number of components")
plt.xticks(np.arange(len(cvar)), 
           np.arange(1, len(cvar)+1))
plt.ylabel("cumulative explained variance"
plt.show()

R code

ggplot(cvar, aes(x=pc, y=var)) +
  geom_point() +
  geom_line() +
  theme_bw() +
  xlab("Principal component") +
  ylab("Cumulative explained variance")

Example 7.33

Python code

#Generate a new dataset with first components
d5 = pca[:,0:2]
d5[0:5]

#Get optimal number of clusters
wss = []
for i in range(1, 15):
    km_out = KMeans(n_clusters=i, n_init=20)
    km_out.fit(d5)
    wss.append(km_out.inertia_)

    
# Plot sum of squares vs. number of clusters
plt.plot(range(1, 15), wss, marker="o")
plt.xlabel("Number of clusters")
plt.ylabel("Within groups sum of squares")
plt.show()

# Compute again with k = 3 and visualize
km_res_5 = KMeans(n_clusters=3, n_init=25).fit(d5)
for cluster in range(km_res_5.n_clusters):
  plt.scatter(d3_s[km_res_5.labels_==cluster, 0], 
              d3_s[km_res_5.labels_==cluster, 1])
plt.scatter(km_res_5.cluster_centers_[:, 0], 
            km_res_5.cluster_centers_[:, 1], 
            s=250, marker="*")
plt.legend(scatterpoints=1)
plt.show()

R code

#Generate a new dataset with first components
d5 = pca$x[, c("PC1""PC2")]
head(d5)

#Get optimal number of clusters
wss = list()
for (i in 1:15) {
  km.out = kmeans(d5, centers = i, nstart = 20)
  wss[[i]] = tibble(k=i, ss=km.out$tot.withinss)
}
wss = bind_rows(wss)

# Plot sum of squares vs. number of clusters
ggplot(wss, aes(x=k, y=ss)) + geom_line() + 
     xlab("Number of Clusters") + 
     ylab("Within groups sum of squares")


# Compute again with k = 3 and visualize
set.seed(123)
km.res_5 <- kmeans(d5, 3, nstart = 25)
fviz_cluster(km.res_5, d5, ellipse.type = "norm")