## Bibliotheken importieren

# Datenimport
import os

# Datenmanipulation und Deskriptive Statistik
import numpy as np
import pandas as pd

# Datenvisualisierung
import matplotlib.pyplot as plt
import seaborn as sns

#Statistische Modelle
import statsmodels.formula.api as sm
import warnings

## General settings
# Install a blanket "ignore" filter so that no warnings are shown from here on
warnings.filterwarnings(action="ignore")


## Read the data
# Build the folder path with os.path.join instead of the literal ".\Data":
# "\D" is an invalid escape sequence in a normal string literal, and the
# hard-coded backslash only works as a path separator on Windows.
source_path = os.path.join(".", "Data")
source_filename = "WHR2019_Chapter2OnlineData_Europe.csv"
# NOTE(review): chdir changes the working directory of the whole process, so
# re-running this cell resolves the relative path against the new location.
os.chdir(source_path)
# read_csv defaults to using the first row as column names (header=0) and to
# using no column as the index; the latter is changed here (first column).
df = pd.read_csv(source_filename, index_col=0)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 355 entries, 71 to 1603
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Country name                 355 non-null    object 
 1   Country group                355 non-null    object 
 2   Year                         355 non-null    int64  
 3   Subjective Wellbeing         355 non-null    float64
 4   Positive affect              355 non-null    float64
 5   Life expectancy (quartiles)  355 non-null    object 
 6   Log GDP per capita           353 non-null    float64
 7   Democratic Quality           326 non-null    float64
dtypes: float64(4), int64(1), object(3)
memory usage: 25.0+ KB


# Distinct "Democratic Quality" values among the rows for the year 2018
df.loc[df["Year"] == 2018, "Democratic Quality"].unique()

array([nan])


# Show the first rows of the raw data
df.head()


# Rows for Austria, restricted to country, year and subjective wellbeing
df.loc[
    df["Country name"] == "Austria",
    ["Country name", "Year", "Subjective Wellbeing"],
]


# Overview of all variables (include="all" also covers non-numeric columns)
df.describe(include="all")


# Number of rows per country group
df["Country group"].value_counts()

Western Europe    210
Eastern Europe    145
Name: Country group, dtype: int64


# Shape of the array of distinct country names in the Western Europe group
df.loc[df["Country group"] == "Western Europe", "Country name"].unique().shape

(18,)


# Shape of the array of distinct country names in the Eastern Europe group
df.loc[df["Country group"] == "Eastern Europe", "Country name"].unique().shape

(13,)


# Number of rows per life-expectancy quartile label
df["Life expectancy (quartiles)"].value_counts()

3. Quartile    92
2. Quartile    89
1. Quartile    89
4. Quartile    85
Name: Life expectancy (quartiles), dtype: int64


# Box plot of subjective wellbeing, one box per country group
df.boxplot(column="Subjective Wellbeing", by="Country group")

<AxesSubplot:title={'center':'Subjective Wellbeing'}, xlabel='Country group'>


# Group by country and count the available years per country, largest first
df_countries_grp = df.groupby(by=["Country name"])
(
    df_countries_grp["Year"]
    .count()
    .sort_values(ascending=False)
)

Country name
United Kingdom            13
Sweden                    13
Spain                     13
Denmark                   13
Lithuania                 13
France                    13
Germany                   13
Italy                     13
Ireland                   12
Romania                   12
Netherlands               12
Belgium                   12
Latvia                    12
Greece                    12
Estonia                   12
Hungary                   11
Bosnia and Herzegovina    11
Croatia                   11
Slovenia                  11
Serbia                    11
Cyprus                    11
Portugal                  11
Poland                    11
Czech Republic            11
Finland                   11
Austria                   11
Slovakia                  10
Malta                     10
Luxembourg                10
Norway                     8
Switzerland                8
Name: Year, dtype: int64


# Several aggregation functions for a single column
(
    df_countries_grp["Subjective Wellbeing"]
    .agg(["mean", "std"])
    .sort_values(by="mean", ascending=False)
)


# Several aggregation functions for a single column with self-chosen column
# names (named aggregation: keyword = output column, value = function)
(
    df_countries_grp["Subjective Wellbeing"]
    .agg(Durchschnitt="mean", Abweichung="std")
    .sort_values(by="Durchschnitt", ascending=False)
)


# One row per country (and its group) holding the mean of each measure;
# as_index=False keeps the grouping keys as regular columns
df_countries_grp = df.groupby(by=["Country name", "Country group"], as_index=False)
df_countries = df_countries_grp[
    ["Subjective Wellbeing", "Log GDP per capita", "Democratic Quality"]
].mean()
df_countries.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31 entries, 0 to 30
Data columns (total 5 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Country name          31 non-null     object 
 1   Country group         31 non-null     object 
 2   Subjective Wellbeing  31 non-null     float64
 3   Log GDP per capita    31 non-null     float64
 4   Democratic Quality    31 non-null     float64
dtypes: float64(3), object(2)
memory usage: 1.5+ KB


# Show the first rows of the per-country averages
df_countries.head()


# Scatter plot of average wellbeing against average log GDP per capita
df_countries.plot.scatter(x="Log GDP per capita", y="Subjective Wellbeing")

<AxesSubplot:xlabel='Log GDP per capita', ylabel='Subjective Wellbeing'>


# Pairwise plots of the per-country averages
sns.pairplot(data=df_countries)

<seaborn.axisgrid.PairGrid at 0x1a50db0beb0>


# Same pairwise plots, with kernel density estimates on the diagonal
sns.pairplot(data=df_countries, diag_kind="kde")

<seaborn.axisgrid.PairGrid at 0x1a50df61fd0>


# Scatter plot with a fitted regression line: wellbeing vs. democratic quality
sns.regplot(data=df_countries, x="Democratic Quality", y="Subjective Wellbeing")
plt.show()


# 'hue' maps the colour to a categorical variable (here: the country group)
sns.lmplot(
    x="Democratic Quality",
    y="Subjective Wellbeing",
    data=df_countries,
    hue="Country group",
    fit_reg=True,
    legend=False,
)

# The automatic legend is switched off above so that it can be placed manually
plt.legend(loc='lower right')

# show must be called; the bare name `plt.show` only evaluates to the
# function object and does not render the figure
plt.show()

<function matplotlib.pyplot.show(close=None, block=None)>


# Switch to short column names without spaces so that they can be used as
# identifiers in a model formula
df_countries = df_countries.rename(
    columns={
        "Country name": "Country",
        "Country group": "Region",
        "Subjective Wellbeing": "Wellbeing",
        "Log GDP per capita": "Log_Wealth",
        "Democratic Quality": "Democracy",
    }
)
df_countries.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31 entries, 0 to 30
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Country     31 non-null     object 
 1   Region      31 non-null     object 
 2   Wellbeing   31 non-null     float64
 3   Log_Wealth  31 non-null     float64
 4   Democracy   31 non-null     float64
dtypes: float64(3), object(2)
memory usage: 1.5+ KB


# Ordinary least squares: wellbeing explained by democracy, log wealth and region
reg_result = sm.ols(
    data=df_countries,
    formula="Wellbeing ~ Democracy + Log_Wealth + Region",
).fit()
reg_result.summary()

	Country name	Country group	Year	Subjective Wellbeing	Positive affect	Life expectancy (quartiles)	Log GDP per capita	Democratic Quality
71	Austria	Western Europe	2006	7.122211	0.823105	2. Quartile	10.657212	1.224309
72	Austria	Western Europe	2008	7.180954	0.832170	2. Quartile	10.701936	1.348835
73	Austria	Western Europe	2010	7.302679	0.814719	3. Quartile	10.676744	1.291470
74	Austria	Western Europe	2011	7.470513	0.789471	3. Quartile	10.702182	1.297983
75	Austria	Western Europe	2012	7.400689	0.822248	3. Quartile	10.704404	1.394670

	Country name	Year	Subjective Wellbeing
71	Austria	2006	7.122211
72	Austria	2008	7.180954
73	Austria	2010	7.302679
74	Austria	2011	7.470513
75	Austria	2012	7.400689
76	Austria	2013	7.498803
77	Austria	2014	6.950000
78	Austria	2015	7.076447
79	Austria	2016	7.048072
80	Austria	2017	7.293728
81	Austria	2018	7.396002

	Country name	Country group	Year	Subjective Wellbeing	Positive affect	Life expectancy (quartiles)	Log GDP per capita	Democratic Quality
count	355	355	355.000000	355.000000	355.000000	355	353.000000	326.000000
unique	31	2	NaN	NaN	NaN	4	NaN	NaN
top	United Kingdom	Western Europe	NaN	NaN	NaN	3. Quartile	NaN	NaN
freq	13	210	NaN	NaN	NaN	92	NaN	NaN
mean	NaN	NaN	2012.411268	6.356516	0.721648	NaN	10.374421	0.893796
std	NaN	NaN	3.732570	0.879001	0.093599	NaN	0.431043	0.408455
min	NaN	NaN	2005.000000	4.380312	0.473150	NaN	9.135507	-0.489775
25%	NaN	NaN	2010.000000	5.694887	0.652065	NaN	10.129182	0.692288
50%	NaN	NaN	2013.000000	6.357625	0.737924	NaN	10.397203	0.940049
75%	NaN	NaN	2016.000000	7.099326	0.795470	NaN	10.672445	1.187322
max	NaN	NaN	2018.000000	8.018934	0.878256	NaN	11.460800	1.529229

	mean	std
Country name
Denmark	7.692072	0.172191
Finland	7.554010	0.179375
Norway	7.549186	0.099576
Switzerland	7.534953	0.103910
Netherlands	7.466531	0.089703
Sweden	7.371575	0.106308
Austria	7.249100	0.184722
Ireland	7.054912	0.206791
Luxembourg	7.011553	0.150256
Belgium	7.010866	0.143701
United Kingdom	6.908503	0.175071
Germany	6.792368	0.228729
France	6.664774	0.246413
Czech Republic	6.563952	0.235159
Spain	6.514385	0.384701
Malta	6.384076	0.344295
Italy	6.257383	0.342071
Cyprus	6.087859	0.470613
Slovakia	6.000420	0.296805
Slovenia	5.960895	0.180298
Poland	5.865575	0.195837
Lithuania	5.764080	0.353107
Croatia	5.548247	0.257856
Estonia	5.531269	0.267461
Greece	5.496580	0.564360
Romania	5.475363	0.448037
Portugal	5.352193	0.311528
Latvia	5.315125	0.532209
Hungary	5.120173	0.396097
Bosnia and Herzegovina	5.086144	0.317867
Serbia	5.082295	0.479855

	Durchschnitt	Abweichung
Country name
Denmark	7.692072	0.172191
Finland	7.554010	0.179375
Norway	7.549186	0.099576
Switzerland	7.534953	0.103910
Netherlands	7.466531	0.089703
Sweden	7.371575	0.106308
Austria	7.249100	0.184722
Ireland	7.054912	0.206791
Luxembourg	7.011553	0.150256
Belgium	7.010866	0.143701
United Kingdom	6.908503	0.175071
Germany	6.792368	0.228729
France	6.664774	0.246413
Czech Republic	6.563952	0.235159
Spain	6.514385	0.384701
Malta	6.384076	0.344295
Italy	6.257383	0.342071
Cyprus	6.087859	0.470613
Slovakia	6.000420	0.296805
Slovenia	5.960895	0.180298
Poland	5.865575	0.195837
Lithuania	5.764080	0.353107
Croatia	5.548247	0.257856
Estonia	5.531269	0.267461
Greece	5.496580	0.564360
Romania	5.475363	0.448037
Portugal	5.352193	0.311528
Latvia	5.315125	0.532209
Hungary	5.120173	0.396097
Bosnia and Herzegovina	5.086144	0.317867
Serbia	5.082295	0.479855

Explorative Datenanalyse (EDA) - Teil 2

Inhalt

Einleitung

Bibliotheken laden und allgemeine Einstellungen

Daten einlesen und Überblick gewinnen

Auswertung pro Land (Durchschnittswerte)

Bi-Variate Zusammenhänge: Datenvisualisierung

Regressionsanalyse

Dep. Variable:	Wellbeing	R-squared:	0.782
Model:	OLS	Adj. R-squared:	0.758
Method:	Least Squares	F-statistic:	32.35
Date:	Tue, 02 Mar 2021	Prob (F-statistic):	4.37e-09
Time:	16:46:24	Log-Likelihood:	-14.865
No. Observations:	31	AIC:	37.73
Df Residuals:	27	BIC:	43.47
Df Model:	3
Covariance Type:	nonrobust

	coef	std err	t	P>|t|	[0.025	0.975]
Intercept	-0.5705	4.579	-0.125	0.902	-9.966	8.825
Region[T.Western Europe]	0.4945	0.225	2.196	0.037	0.033	0.956
Democracy	0.8680	0.428	2.026	0.053	-0.011	1.747
Log_Wealth	0.5644	0.482	1.171	0.252	-0.425	1.554

Omnibus:	7.523	Durbin-Watson:	1.544
Prob(Omnibus):	0.023	Jarque-Bera (JB):	5.838
Skew:	-0.897	Prob(JB):	0.0540
Kurtosis:	4.140	Cond. No.	645.