!pip3 install pandas matplotlib seaborn geopandas
!pip3 install scikit-learn scipy bioinfokit
!pip3 install descartes
%matplotlib inline
import itertools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import (KMeans,
AgglomerativeClustering)
import scipy.cluster.hierarchy as sch
from sklearn.decomposition import PCA
import bioinfokit.visuz
library(tidyverse)
library(glue)
library(maps)
library(factoextra)
url="https://cssbook.net/d/eurobarom_nov_2017.csv"
d2=pd.read_csv(url)
print("Shape of my filtered data =", d2.shape)
print("Variables:", d2.columns)
url="https://cssbook.net/d/eurobarom_nov_2017.csv"
d2= read_csv(url, col_names = TRUE)
glue("{nrow(d2)} row x {ncol(d2)} columns")
colnames(d2)
print(d2["gender"].value_counts())
print(d2["gender"].value_counts(normalize=True))
d2 %>%
group_by(gender) %>%
summarise(frequency = n()) %>%
mutate(rel_freq = frequency / sum(frequency))
print(d2["support_refugees"].value_counts())
print(d2["support_refugees"].value_counts(
normalize=True,dropna=False))
d2 %>%
group_by(support_refugees) %>%
summarise(frequency = n()) %>%
mutate(rel_freq = frequency / sum(frequency))
n_miss = d2["support_refugees"].isna().sum()
print(f"# of missing values: {n_miss}")
d2 = d2.dropna()
print(f"Shape after dropping NAs: {d2.shape}")
n_miss = sum(is.na(d2$support_refugees))
print(glue("# of missing values: {n_miss}"))
d2 = d2 %>% drop_na()
print(glue("Rows after dropping NAs: {nrow(d2)}"))
print("Crosstab gender and support_refugees:")
print(pd.crosstab(d2["support_refugees"],
d2["gender"]))
print("Summary statistics for group of cases:")
print(d2.groupby(["support_refugees", "gender"])
["age"].mean())
print("Crosstab gender and support_refugees:")
d2 %>%
group_by(gender, support_refugees)%>%
summarise(n=n())%>%
pivot_wider(values_from="n",names_from="gender")
print("Summary statistics for group of cases:")
d2 %>%
group_by(support_refugees, gender)%>%
summarise(mean_age=mean(age, na.rm = TRUE))
d2["support_refugees"].value_counts().plot(
kind="bar")
plt.show()
ggplot(data=d2) +
geom_bar(mapping = aes(x= support_refugees))
d2.hist(column="age", bins=15)
plt.show()
ggplot(data=d2) +
geom_histogram(mapping = aes(x= age), bins = 15)
d2 = d2.sort_values(by ="country" )
plt.figure(figsize=(8,8))
sns.boxplot(x="age", y="country", data=d2)
plt.show()
ggplot(d2, aes(y=fct_rev(country), x=age))+
geom_boxplot()
support_refugees = (d2.groupby(["date_n"])
["support_refugees_n"].mean())
support_refugees = support_refugees.to_frame()
plt.plot(support_refugees.index,
support_refugees["support_refugees_n"])
plt.xlabel("Day")
plt.ylabel("Support for refugees")
plt.show()
support_refugees = d2 %>%
group_by(date_n) %>%
summarise(support=mean(support_refugees_n,
na.rm = TRUE))
ggplot(support_refugees,aes(x=date_n, y=support))+
geom_line() +
xlab("Day") +
ylab("Support for refugees")
support_combined = d2.groupby(["date_n"]).agg(
refugees = ("support_refugees_n", "mean"),
migrants = ("support_migrants_n", "mean"))
sns.lineplot(x="date_n", y="refugees",
data=support_combined, color="blue")
sns.lineplot(x="date_n", y="migrants",
data=support_combined, color="red")
plt.xlabel("Day")
plt.ylabel("Level of support")
plt.title("Support of refugees and migrants")
plt.show()
support_combined = d2 %>% group_by(date_n) %>%
summarise(
refugees=mean(support_refugees_n, na.rm = TRUE),
migrants=mean(support_migrants_n, na.rm = TRUE))
support_long = support_combined %>%
pivot_longer(-date_n, names_to="group",
values_to="support")
ggplot(support_long,
aes(x=date_n, y=support, colour=group)) +
geom_line(size = 1.5) +
labs(title="Support for refugees and migrants",
x="Day", y="Level of Support")
f, axes = plt.subplots(2,1)
sns.lineplot(x="date_n", y="refugees",
data=support_combined, ax=axes[0])
sns.lineplot(x="date_n", y="migrants",
data=support_combined, ax=axes[1])
sns.lineplot(x="date_n", y="support_refugees_n",
data=d2, ci=0, ax=axes[0])
sns.lineplot(x="date_n", y="support_migrants_n",
data=d2, ci=0, ax=axes[1])
plt.show()
ggplot(support_long, aes(x=date_n, y=support)) +
geom_line() + facet_grid(rows=vars(group)) +
xlab("Day") + ylab("Support")
sns.scatterplot(data=support_combined,
x="refugees", y="migrants")
ggplot(support_combined,
aes(x=refugees, y=migrants))+
geom_point()
sns.lmplot(data=support_combined,
x="refugees", y="migrants")
plt.show()
ggplot(support_combined,
aes(x=refugees, y= migrants))+
geom_point()+
geom_smooth(method = lm)
print(support_combined["refugees"]
.corr(support_combined["migrants"],
method="pearson"))
cor.test(support_combined$refugees,
support_combined$migrants,
method = "pearson")
pivot_data = pd.pivot_table(d2,
values="support_refugees_n",
index=["country"], columns="gender")
pivot_data= d2 %>%
select(gender, country, support_refugees_n) %>%
group_by(country, gender) %>%
summarise(score = mean(support_refugees_n))
plt.figure(figsize=(10,6))
sns.heatmap(pivot_data, cmap="Blues",
cbar_kws={"label": "support_refugees_n"})
plt.show()
ggplot(pivot_data, aes(x = gender,
y = fct_rev(country), fill = score)) +
geom_tile()+
scale_fill_gradient2(low="white", high="blue")
sns.lineplot(x="date_n", y="support_refugees_n",
data=d2, color="blue", ci=100, label="Refugees")
sns.lineplot(x="date_n", y="support_migrants_n",
data=d2, color="red", ci=100, label="Migrants")
plt.xlabel("Day")
plt.ylabel("Level of support")
plt.title("Support for refugees and migrants")
plt.show()
ggplot(support_long,
aes(x=date_n, y=support, color=group)) +
geom_line(size=1.5) +
geom_ribbon(aes(fill=group, ymin=support-0.15,
ymax=support+0.15),
alpha=.1, lty=0) +
ggtitle("Support for refugees and migrants")
supports_country = (d2.groupby(["country"])
["support_refugees_n"].mean()
.to_frame().reset_index())
wmap = gpd.read_file(
gpd.datasets.get_path("naturalearth_lowres"))
wmap = wmap.rename(columns={"name": "country"})
wmap.plot();
supports_country = d2 %>%
group_by(country) %>%
summarise(m=mean(support_refugees_n,na.rm=TRUE))
wmap = map_data("world")
ggplot(wmap, aes(x=long,y=lat,group=group)) +
geom_polygon(fill="lightgray", colour = "white")
countries = [
"Portugal", "Spain", "France", "Germany",
"Austria", "Belgium", "Netherlands", "Ireland",
"Denmark", "Poland", "UK", "Latvia", "Cyprus",
"Croatia", "Slovenia", "Hungary", "Slovakia",
"Czech republic", "Greece", "Finland", "Italy",
"Luxemburg", "Sweden", "Sweden", "Bulgaria",
"Estonia", "Lithuania", "Malta", "Romania"]
m = wmap.loc[
wmap["country"].isin(countries)]
m = pd.merge(supports_country, m, on="country")
countries = c(
"Portugal", "Spain", "France", "Germany",
"Austria", "Belgium", "Netherlands", "Ireland",
"Denmark", "Poland", "UK", "Latvia", "Cyprus",
"Croatia", "Slovenia", "Hungary", "Slovakia",
"Czech republic", "Greece", "Finland", "Italy",
"Luxemburg", "Sweden", "Sweden", "Bulgaria",
"Estonia", "Lithuania", "Malta", "Romania")
m = wmap %>% rename(country=region) %>%
filter(country %in% countries) %>%
left_join(supports_country, by="country")
m = gpd.GeoDataFrame(m, geometry=m["geometry"])
m.plot(column="support_refugees_n",
legend=True, cmap="OrRd",
legend_kwds={"label": "Level of suppport"}
).set_title("Support of refugees by country")
ggplot(m, aes(long, lat, group=group))+
geom_polygon(aes(fill = m), color="white")+
scale_fill_viridis_c(option="B")+
labs(title="Support for refugees by country",
fill="Level of support")
d3 = d2.groupby(["country"])[[
"support_refugees_n",
"support_migrants_n",
"age",
"educational_n"]].mean()
scaler = StandardScaler()
d3_s = scaler.fit_transform(d3)
wss = []
for i in range(1, 15):
km_out = KMeans(n_clusters=i, n_init=20)
km_out.fit(d3_s)
wss.append(km_out.inertia_)
plt.plot(range(1, 15), wss, marker="o")
plt.xlabel("Number of clusters")
plt.ylabel("Within groups sum of squares")
plt.show()
d3_s = d2%>%
group_by(country)%>%
summarise(
m_refugees=mean(support_refugees_n, na.rm=T),
m_migrants=mean(support_migrants_n, na.rm=T),
m_age=mean(age, na.rm=T),
m_edu=mean(educational_n, na.rm=T)) %>%
column_to_rownames(var="country") %>%
scale()
wss = list()
for (i in 1:15) {
km.out = kmeans(d3_s, centers=i, nstart=25)
wss[[i]] = tibble(k=i, ss=km.out$tot.withinss)
}
wss = bind_rows(wss)
ggplot(wss, aes(x=k, y=ss)) +
geom_line() + geom_point() +
xlab("Number of Clusters") +
ylab("Within groups sum of squares")
km_res = KMeans(n_clusters=3, n_init=25).fit(d3_s)
print(km_res)
print("K-means cluste sizes:",
np.bincount(km_res.labels_[km_res.labels_>=0]))
print(f"Cluster means: {km_res.cluster_centers_}")
print("Clustering vector:")
print(np.column_stack((d3.index, km_res.labels_)))
print("Within cluster sum of squares:")
print(km_res.inertia_)
set.seed(123)
km.res = kmeans(d3_s, 3, nstart=25)
print(km.res)
for cluster in range(km_res.n_clusters):
plt.scatter(d3_s[km_res.labels_ == cluster, 0],
d3_s[km_res.labels_ == cluster, 1])
plt.scatter(km_res.cluster_centers_[:, 0],
km_res.cluster_centers_[:, 1],
s=250, marker="*")
plt.legend(scatterpoints=1)
plt.show()
fviz_cluster(km.res, d3_s, ellipse.type="norm")
hc_res = AgglomerativeClustering(
affinity = "euclidean", linkage = "complete")
hc_res.fit_predict(d3_s)
print(hc_res)
hc.res <- hcut(d3_s, hc_method="complete")
summary(hc.res)
dendrogram = sch.dendrogram(
sch.linkage(d3_s, method="complete"),
labels=list(d3.index), leaf_rotation=90)
hc_res = AgglomerativeClustering(n_clusters=3,
affinity = "euclidean", linkage = "ward")
hc_res.fit_predict(d3_s)
print(hc_res)
hc.res = hcut(d3_s, k=3, hc_method="complete")
summary(hc.res)
for cluster in range(hc_res.n_clusters):
plt.scatter(d3_s[hc_res.labels_==cluster, 0],
d3_s[hc_res.labels_==cluster, 1])
plt.legend(scatterpoints=1)
plt.show()
fviz_cluster(hc.res, d3_s, ellipse.type="convex")
pca_m = PCA()
pca = pca_m.fit(d3_s)
pca_n = PCA()
pca = pca_n.fit_transform(d3_s)
pca_df = pd.DataFrame(data=pca,
columns=["PC1", "PC2", "PC3", "PC4"])
pca_df.index = d3.index
print(pca_df.head())
pca_df_2 = pd.DataFrame(data=pca_n.components_.T,
columns=["PC1", "PC2", "PC3", "PC4"])
pca_df_2.index = d3.columns
print(pca_df_2)
pca = prcomp(d3_s, scale=TRUE)
head(pca$x)
pca$rotation
var1 = round(pca_n.explained_variance_ratio_[0],2)
var2 = round(pca_n.explained_variance_ratio_[1],2)
bioinfokit.visuz.cluster.biplot(cscore=pca,
loadings=pca_n.components_,
labels=pca_df_2.index.values,
var1=var1, var2=var2, show=True)
biplot(x = pca, scale = 0, cex = 0.6,
col = c("blue4", "brown3"))
print("Proportion of variance explained:")
print(pca_n.explained_variance_ratio_)
print("Proportion of variance explained:")
prop_var = tibble(pc=1:4,
var=pca$sdev^2 / sum(pca$sdev^2))
prop_var
plt.bar([1,2,3,4],pca_n.explained_variance_ratio_)
plt.ylabel("Proportion of variance explained")
plt.xlabel("Principal component")
plt.xticks([1,2,3,4])
plt.show()
ggplot(prop_var, aes(x=pc, y=var)) +
geom_col() +
scale_y_continuous(limits = c(0,1)) +
xlab("Principal component") +
ylab("Proportion of variance explained")
cvar = np.cumsum(pca_n.explained_variance_ratio_)
cvar
cvar = cumsum(prop_var)
cvar
plt.plot(cvar)
plt.xlabel("number of components")
plt.xticks(np.arange(len(cvar)),
np.arange(1, len(cvar)+1))
plt.ylabel("cumulative explained variance")
plt.show()
ggplot(cvar, aes(x=pc, y=var)) +
geom_point() +
geom_line() +
theme_bw() +
xlab("Principal component") +
ylab("Cumulative explained variance")
d5 = pca[:,0:2]
d5[0:5]
wss = []
for i in range(1, 15):
km_out = KMeans(n_clusters=i, n_init=20)
km_out.fit(d5)
wss.append(km_out.inertia_)
plt.plot(range(1, 15), wss, marker="o")
plt.xlabel("Number of clusters")
plt.ylabel("Within groups sum of squares")
plt.show()
km_res_5 = KMeans(n_clusters=3, n_init=25).fit(d5)
for cluster in range(km_res_5.n_clusters):
plt.scatter(d3_s[km_res_5.labels_==cluster, 0],
d3_s[km_res_5.labels_==cluster, 1])
plt.scatter(km_res_5.cluster_centers_[:, 0],
km_res_5.cluster_centers_[:, 1],
s=250, marker="*")
plt.legend(scatterpoints=1)
plt.show()
d5 = pca$x[, c("PC1", "PC2")]
head(d5)
wss = list()
for (i in 1:15) {
km.out = kmeans(d5, centers = i, nstart = 20)
wss[[i]] = tibble(k=i, ss=km.out$tot.withinss)
}
wss = bind_rows(wss)
ggplot(wss, aes(x=k, y=ss)) + geom_line() +
xlab("Number of Clusters") +
ylab("Within groups sum of squares")
set.seed(123)
km.res_5 <- kmeans(d5, 3, nstart = 25)
fviz_cluster(km.res_5, d5, ellipse.type = "norm")