#Cloninng my project repositary, reading data and importing different libraries:
!git clone https://github.com/mandeepkrk/CMPSFinalProject
%cd /content/CMPSFinalProject
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

Cloning into 'CMPSFinalProject'...
remote: Enumerating objects: 17, done.
remote: Counting objects: 100% (17/17), done.
remote: Compressing objects: 100% (15/15), done.
remote: Total 17 (delta 3), reused 0 (delta 0), pack-reused 0
Receiving objects: 100% (17/17), 2.07 MiB | 5.71 MiB/s, done.
Resolving deltas: 100% (3/3), done.
/content/CMPSFinalProject


#creating a dataframe with all the variables in our dataset:
df_gini = pd.read_excel("./gdp_gini_final.xlsx")
df_gini.head()


#Here, I create a new variable called 'country_year' combining country code and year. This serves as the unique identifier for each row of data.

df_gini.loc[:, "country_year"] = df_gini["country_code"].astype(str) + df_gini["year"].astype(str)


#Checking datatype for variables in df_gini_subset

print(df_gini.dtypes)

year               int64
country           object
country_code      object
gdp               object
gdp_gr            object
gdp_pc            object
gini              object
gni               object
gni_pc            object
infl              object
life_exp         float64
popn_gr          float64
tax_gdp_prcnt     object
unemp             object
country_year      object
dtype: object


# Drop tax_gdp_prcnt and gni as I an not using them for my analysis

df_gini = df_gini.drop(['tax_gdp_prcnt','gni'], axis=1)


#I want to change the variables to the appropriate datatypes. First, I check for the columns that have string value and shouldn't be string.
for col in ['gdp_pc', 'gdp','gdp_gr','gni_pc','infl','unemp']:
    if df_gini[col].dtype == 'object':
        print(f"Column '{col}' contains string values:")
        print(df_gini[col].unique())

Column 'gdp_pc' contains string values:
[5730.7238098842345 6815.329329698251 6957.417498892505 ...
 1495.752138410211 1475.1998833853477 1268.1209405624106]
Column 'gdp' contains string values:
[189719984268.48453 228778917308.16986 236741715015.015 ...
 25873601260.835304 26311507273.67354 23308667781.225754]
Column 'gdp_gr' contains string values:
[9.133110567389608 7.937291556430765 8.206979072212278 ...
 3.5043360955868224 4.034493896671648 1.44130602603785]
Column 'gni_pc' contains string values:
[3860 6060 7110 ... 3060 390 1330]
Column 'infl' contains string values:
[140.50237866101241 16.071993535694546 -3.561095575765748 ...
 10.095729868345842 7.411570948432569 7.6334704807674285]
Column 'unemp' contains string values:
[5.44 6.36 10.1 ... 8.137 8.431 5.54]


#Then I drop the rows with missing values and change the variables to float.
df_gini= df_gini.dropna(subset=['gdp_pc', 'gdp','gdp_gr', 'infl', 'life_exp','popn_gr','gni_pc','unemp'])
df_gini[['gdp_pc', 'gdp','gdp_gr', 'infl', 'life_exp','popn_gr','gni_pc','unemp']] = df_gini[['gdp_pc', 'gdp','gdp_gr', 'infl', 'life_exp','popn_gr','gni_pc','unemp']].replace('..', np.nan)


#Creating new varaible income level based on country's gni per capita as defined by the World Bank:

df_gini['income_lvl'] = np.select(
    [df_gini['gni_pc'] < 1085,
     (df_gini['gni_pc'] >= 1086) & (df_gini['gni_pc'] < 4095),
     (df_gini['gni_pc'] >= 4095) & (df_gini['gni_pc'] < 13205),
     df_gini['gni_pc'] >= 13205],
    [1, 2, 3, 4],
    default=np.nan)
df_gini['income_lvl'] = df_gini['income_lvl'].astype('Int64')
df_gini.head()


# change income_lvl to float for statistical analysis later

df_gini['income_lvl'] = df_gini['income_lvl'].astype(float)


#Confirming all the variables have been changed to appropriate datatype:
print(df_gini.dtypes)

year              int64
country          object
country_code     object
gdp             float64
gdp_gr          float64
gdp_pc          float64
gini             object
gni_pc          float64
infl            float64
life_exp        float64
popn_gr         float64
unemp           float64
country_year     object
income_lvl      float64
dtype: object


#changing the missing values in Gini to NaN:

df_gini['gini'] = df_gini['gini'].replace('..', np.nan)


# Check how many missing values there are in the column Gini

print(df_gini.gini.isnull().sum())

315


#perform linear interpolation for missing values in gini variable based on each country

# Loop through each country and perform linear interpolation for missing values in the Gini variable.

for country_code in df_gini['country_code'].unique():
  # Subset the data for the current country.
  country_data = df_gini[df_gini['country_code'] == country_code]

  # Perform linear interpolation for missing values in the Gini variable.
  country_data['gini'] = country_data['gini'].interpolate(method='linear')

  # Update the original dataframe with the interpolated values.
  df_gini.loc[country_data.index, 'gini'] = country_data['gini']


df_gini['gini'].describe()

count    1667.000000
mean       38.074625
std         9.108657
min        20.700000
25%        30.800000
50%        35.700000
75%        45.175000
max        61.600000
Name: gini, dtype: float64


#Check how many missing valuesb there still are as if the the first value for a country is missing, it would still remain missing:
print(df_gini.gini.isnull().sum())

0


#Confirming that dataype for all variables is now correct:
print(df_gini.dtypes)

year              int64
country          object
country_code     object
gdp             float64
gdp_gr          float64
gdp_pc          float64
gini            float64
gni_pc          float64
infl            float64
life_exp        float64
popn_gr         float64
unemp           float64
country_year     object
income_lvl      float64
dtype: object


#The first 20 rows of the final dataframe:
df_gini.head(20)


#Countries in our sample:

unique_countries = df_gini['country'].unique()

for i, country in enumerate(unique_countries):
    if i % 5 == 0:
        print()
    print(f"{country:20}", end="")

Argentina           Armenia             Australia           Austria             Belarus             
Belgium             Bolivia             Brazil              Bulgaria            Canada              
Chile               China               Colombia            Costa Rica          Croatia             
Cyprus              Czechia             Denmark             Dominican Republic  Ecuador             
El Salvador         Estonia             Finland             France              Georgia             
Germany             Greece              Honduras            Hungary             Iceland             
Indonesia           Iran, Islamic Rep.  Ireland             Israel              Italy               
Kazakhstan          Kosovo              Kyrgyz Republic     Latvia              Lithuania           
Luxembourg          Madagascar          Malaysia            Malta               Mexico              
Moldova             Netherlands         North Macedonia     Norway              Pakistan            
Panama              Paraguay            Peru                Philippines         Poland              
Portugal            Romania             Russian Federation  Slovak Republic     Slovenia            
Spain               Sweden              Switzerland         Tajikistan          Thailand            
Uganda              Ukraine             United Kingdom      United States       Uruguay             
Venezuela, RB       Viet Nam            Zambia


# Summary statistics for Gini by country:
gini_stats = df_gini.groupby('country')['gini'].describe()
print(gini_stats.to_string())

                    count       mean       std   min        25%        50%        75%   max
country                                                                                    
Argentina            29.0  46.191379  3.797510  41.1  42.700000  45.900000  49.100000  53.8
Armenia              21.0  32.419048  2.795285  28.0  30.000000  32.400000  34.800000  37.5
Australia            25.0  33.910000  0.758494  32.6  33.350000  34.000000  34.475000  35.4
Austria              26.0  30.100000  0.749637  28.7  29.525000  30.250000  30.750000  31.5
Belarus              22.0  27.954545  2.166350  25.2  26.500000  27.650000  29.400000  32.0
Belgium              17.0  28.182353  0.857493  27.2  27.600000  28.100000  28.400000  30.5
Bolivia              21.0  51.176190  6.158178  41.6  46.600000  49.200000  57.150000  61.6
Brazil               28.0  55.898214  2.898546  51.9  53.375000  55.250000  58.775000  60.1
Bulgaria             14.0  37.171429  2.640638  33.6  35.700000  36.350000  39.875000  41.3
Canada               29.0  33.055172  0.844557  31.3  32.700000  33.300000  33.700000  34.1
Chile                28.0  50.016071  4.010894  45.3  46.837500  48.333333  54.312500  56.4
China                27.0  39.542593  2.832573  33.9  38.350000  39.700000  41.816667  43.7
Colombia             29.0  54.103448  2.566707  49.7  52.600000  54.200000  55.550000  58.7
Costa Rica           29.0  48.089655  1.627519  45.6  46.800000  48.300000  48.800000  51.8
Croatia              11.0  31.354545  1.255678  28.9  30.650000  32.000000  32.350000  32.6
Cyprus               16.0  32.475000  1.910846  30.1  31.175000  31.900000  33.175000  37.0
Czechia              28.0  25.991071  1.186086  20.7  25.900000  26.200000  26.525000  27.5
Denmark              28.0  25.789286  1.931132  23.0  24.050000  25.400000  27.725000  28.7
Dominican Republic   28.0  48.264286  2.877369  41.9  47.000000  48.900000  50.100000  52.1
Ecuador              26.0  50.142308  4.049072  44.7  46.025000  49.966667  53.400000  58.6
El Salvador          29.0  47.034483  5.219253  38.0  42.300000  47.800000  51.500000  54.5
Estonia              17.0  32.641176  1.835776  30.3  31.200000  32.500000  33.600000  37.2
Finland              20.0  27.495000  0.362621  26.8  27.200000  27.516667  27.700000  28.3
France               24.0  32.054167  0.991696  29.7  31.675000  32.350000  32.625000  33.7
Georgia              24.0  38.170833  1.566908  35.9  36.850000  38.000000  39.525000  41.3
Germany              29.0  30.165517  1.230353  28.0  29.000000  30.300000  31.100000  31.9
Greece               20.0  34.350000  1.087220  32.8  33.600000  34.150000  35.025000  36.3
Honduras             29.0  53.644828  3.164150  48.2  51.300000  53.500000  55.700000  59.5
Hungary              16.0  29.875000  1.823367  27.0  28.975000  29.950000  30.650000  34.7
Iceland              17.0  27.605882  1.725714  25.4  26.200000  26.800000  28.700000  31.8
Indonesia            27.0  35.811111  3.293739  29.5  33.550000  35.600000  38.600000  40.8
Iran, Islamic Rep.   15.0  38.366667  3.950166  34.0  35.050000  36.700000  42.550000  44.8
Ireland              26.0  33.078846  1.646643  30.6  32.000000  32.850000  33.637500  37.0
Israel               23.0  40.100000  1.542430  38.1  38.650000  39.800000  41.450000  42.6
Italy                29.0  34.736207  1.121932  31.1  34.300000  35.000000  35.200000  36.7
Kazakhstan           19.0  29.931579  3.646010  26.8  27.650000  28.200000  31.000000  39.8
Kosovo               17.0  29.317647  2.000698  26.3  27.800000  29.000000  30.800000  33.3
Kyrgyz Republic      20.0  30.085000  2.827548  26.8  27.775000  29.800000  31.125000  37.4
Latvia               16.0  35.750000  1.267544  34.2  35.075000  35.550000  36.100000  39.0
Lithuania            16.0  35.793750  1.602693  32.5  35.025000  35.500000  37.225000  38.4
Luxembourg           29.0  30.858621  2.158623  27.0  30.000000  30.900000  32.000000  35.4
Madagascar           27.0  42.266667  1.935163  38.6  41.175000  42.600000  42.600000  47.4
Malaysia             28.0  45.225000  2.859410  41.1  42.275000  45.416667  47.804167  49.1
Malta                14.0  29.221429  0.696341  28.0  29.000000  29.100000  29.350000  31.0
Mexico               28.0  50.269643  2.352357  46.0  48.837500  50.375000  52.387500  53.4
Moldova              23.0  32.647826  4.924786  25.7  27.750000  34.400000  36.050000  42.6
Netherlands          16.0  28.606250  0.775000  27.6  28.050000  28.350000  29.225000  30.0
North Macedonia      11.0  36.609091  3.120082  33.0  34.350000  35.600000  38.750000  42.8
Norway               29.0  26.962069  1.427817  25.2  26.000000  26.840000  27.500000  31.6
Pakistan             24.0  30.129167  1.045271  28.7  29.483333  29.933333  30.775000  33.1
Panama               29.0  54.220690  3.156442  49.2  51.500000  54.600000  57.500000  58.2
Paraguay             25.0  51.804000  3.573890  45.7  48.500000  52.300000  54.600000  58.2
Peru                 23.0  47.882609  4.376762  41.5  43.750000  47.500000  50.900000  55.1
Philippines          20.0  45.795000  1.674552  42.3  45.075000  46.466667  46.916667  47.7
Poland               16.0  32.900000  2.282688  28.8  31.650000  33.150000  33.625000  38.0
Portugal             17.0  36.076471  1.795317  32.8  35.200000  36.000000  36.800000  38.9
Romania              14.0  36.200000  1.250846  34.4  35.650000  35.950000  36.475000  39.6
Russian Federation   23.0  39.091304  1.738315  36.8  37.450000  39.500000  40.500000  42.3
Slovak Republic      16.0  26.081250  1.614608  23.2  25.150000  26.100000  27.125000  29.3
Slovenia             16.0  24.837500  0.627030  23.7  24.400000  24.800000  25.025000  26.2
Spain                27.0  34.637037  1.249182  31.8  33.950000  34.700000  35.700000  36.5
Sweden               20.0  27.690000  1.304204  25.3  26.725000  27.600000  28.800000  30.0
Switzerland          20.0  32.847500  0.773317  31.6  32.450000  32.750000  33.362500  34.3
Tajikistan           21.0  32.447619  1.339883  29.5  31.500000  32.666667  33.600000  34.0
Thailand             28.0  40.607143  3.143916  34.9  37.725000  41.650000  42.575000  47.9
Uganda               28.0  42.433929  1.545515  39.0  41.437500  42.750000  43.579167  45.2
Ukraine              28.0  28.710714  3.862855  24.0  25.450000  28.800000  30.291667  39.3
United Kingdom       29.0  34.803448  1.502969  32.6  33.300000  35.000000  35.500000  38.8
United States        29.0  40.506897  0.804865  38.0  40.100000  40.600000  41.000000  41.5
Uruguay              14.0  42.057143  2.755055  39.5  39.750000  40.300000  44.950000  46.4
Venezuela, RB        28.0  45.921429  1.789549  42.1  44.800000  44.800000  47.500000  49.5
Viet Nam             23.0  36.145652  1.012009  34.8  35.550000  35.720000  36.740000  39.3
Zambia               29.0  52.217241  4.185394  42.1  49.100000  53.300000  55.120000  60.5


# The highest and lowest Gini values in our dataset

#the countries with the minimum and maximum Gini values
min_gini_country = gini_stats['min'].idxmin()
max_gini_country = gini_stats['max'].idxmax()

# The years with the minimum and maximum Gini values for each country
min_gini_year = df_gini[df_gini['country'] == min_gini_country].sort_values(by='gini')['year'].iloc[0]
max_gini_year = df_gini[df_gini['country'] == max_gini_country].sort_values(by='gini', ascending=False)['year'].iloc[0]

# Print the results
print(f"Country with minimum Gini: {min_gini_country}, Year: {min_gini_year}, Gini: {gini_stats.loc[min_gini_country, 'min']}")
print(f"Country with maximum Gini: {max_gini_country}, Year: {max_gini_year}, Gini: {gini_stats.loc[max_gini_country, 'max']}")

Country with minimum Gini: Czechia, Year: 1992, Gini: 20.7
Country with maximum Gini: Bolivia, Year: 2000, Gini: 61.6


# Let's see how the summary statistics for Gini varies by income level:
gini_stats_by_income_level = df_gini.groupby('income_lvl')['gini'].describe()
print(gini_stats_by_income_level.to_string())

            count       mean       std   min        25%        50%      75%   max
income_lvl                                                                       
1.0         201.0  39.818035  8.230691  27.4  33.133333  39.050000  43.2250  61.6
2.0         392.0  43.884418  9.851186  20.7  36.075000  46.333333  51.9625  60.1
3.0         383.0  41.873325  8.488807  25.2  35.700000  42.800000  48.7500  59.9
4.0         673.0  32.074443  4.704032  23.0  28.300000  31.900000  34.8000  49.9


#Let's see summary stats on infation based on income levels.
#It is important for our investigation as inflation affects rich and poor countries and the rich and the poor people within a country differently.

inflation_stats_by_income_level = df_gini.groupby('income_lvl')['infl'].describe()
print(inflation_stats_by_income_level.to_string())

            count       mean         std        min       25%       50%        75%          max
income_lvl                                                                                     
1.0         201.0  21.477256   74.643823  -3.169556  5.452302  8.837864  17.607724   952.995953
2.0         392.0  37.767287  247.145958 -26.299993  3.519471  6.654897  13.429700  3333.585422
3.0         383.0   7.772207    9.542675  -5.992202  2.578473  4.785570   8.799837    75.277369
4.0         673.0   2.190532    2.190667  -9.653676  1.003867  1.880282   2.985620    15.333310


# Let's see the summary statistics on population growth by income level:
popn_gr_stats_by_income_level = df_gini.groupby('income_lvl')['popn_gr'].describe()
print(popn_gr_stats_by_income_level.to_string())

            count      mean       std       min       25%       50%       75%       max
income_lvl                                                                             
1.0         201.0  1.751652  1.436717 -3.629546  1.048445  2.141922  2.906232  3.532921
2.0         392.0  1.163340  1.024360 -1.757004  0.496976  1.394561  1.873829  3.571097
3.0         383.0  0.717204  0.915528 -2.096943 -0.006653  1.015471  1.398723  2.760033
4.0         673.0  0.657537  0.744143 -2.258464  0.238457  0.540461  1.082907  3.931356


# Lets' see what the trend of Gini Index has been like for some of the major World powers in this period:


# Filter the data for United States, United Kingdom, France, Germany, and Australia and creating the plot:
countries = ['United States', 'United Kingdom', 'France', 'Germany', 'Australia']
df_filtered = df_gini[df_gini['country'].isin(countries)]

plt.figure(figsize=(10, 6))
for country in countries:
    df_country = df_filtered[df_filtered['country'] == country]
    plt.plot(df_country['year'], df_country['gini'], label=country)

plt.xlabel('Year')
plt.ylabel('Gini Coefficient')
plt.title('Gini Trend for Major World Powers')

plt.legend()
plt.show()


#Scatterplots for gdp per capita and gini for countries with different income levels:

import matplotlib.pyplot as plt
import numpy as np


# Define colors for the plots
colors = {
    1: '#1F77B4',
    2: '#FF7F0E',
    3: '#2CA02C',
    4: '#D62728'
}

# Create a figure with four subplots
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 8))

# Loop through income levels
for i, income_lvl in enumerate(range(1, 5)):
    # Filter data for the current income level
    df_subset = df_gini[df_gini['income_lvl'] == income_lvl]

    # Plot the scatterplot
    axes[i // 2, i % 2].scatter(df_subset['gdp_pc'], df_subset['gini'], color=colors[income_lvl])

    # Calculate and plot the trendline
    slope, intercept = np.polyfit(df_subset['gdp_pc'], df_subset['gini'], 1)
    axes[i // 2, i % 2].plot(df_subset['gdp_pc'], slope * df_subset['gdp_pc'] + intercept, color='black', linestyle='--')

    # Set the title and labels
    axes[i // 2, i % 2].set_title(f'Income Level {income_lvl}')
    axes[i // 2, i % 2].set_xlabel('GDP per Capita')
    axes[i // 2, i % 2].set_ylabel('Gini Coefficient')

# Adjust the spacing between subplots
plt.tight_layout()

# Show the plot
plt.show()


# Scatter plot with a trendline for life expectancy and Gini coefficient:
import matplotlib.pyplot as plt
import numpy as np


plt.figure(figsize=(10, 6))

# Scatter plot
plt.scatter(df_gini['life_exp'], df_gini['gini'])

# Trendline
z = np.polyfit(df_gini['life_exp'], df_gini['gini'], 1)
p = np.poly1d(z)
plt.plot(df_gini['life_exp'], p(df_gini['life_exp']), color='red')

# Labels and title
plt.xlabel('Life Expectancy')
plt.ylabel('Gini Coefficient')
plt.title('Life Expectancy vs. Gini Coefficient')

# Show the plot
plt.show()


# gini vs popn_gr scatterplot for countries with different income levels:

import matplotlib.pyplot as plt
import numpy as np

# Define colors for the plots
colors = {
    1: '#1F77B4',
    2: '#FF7F0E',
    3: '#2CA02C',
    4: '#D62728'
}

# Create a figure with four subplots
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 8))

# Loop through income levels
for i, income_lvl in enumerate(range(1, 5)):
    # Filter data for the current income level
    df_subset = df_gini[df_gini['income_lvl'] == income_lvl]

    # Plot the scatterplot
    axes[i // 2, i % 2].scatter(df_subset['popn_gr'], df_subset['gini'], color=colors[income_lvl])

    # Calculate and plot the trendline
    slope, intercept = np.polyfit(df_subset['popn_gr'], df_subset['gini'], 1)
    axes[i // 2, i % 2].plot(df_subset['popn_gr'], slope * df_subset['popn_gr'] + intercept, color='black', linestyle='--')

    # Set the title and labels
    axes[i // 2, i % 2].set_title(f'Income Level {income_lvl}')
    axes[i // 2, i % 2].set_xlabel('Population Growth Rate')
    axes[i // 2, i % 2].set_ylabel('Gini Coefficient')

# Adjust the spacing between subplots
plt.tight_layout()

# Show the plot
plt.show()


# scatterplot infl and gini for different income levels (outliers have been taken off)

import matplotlib.pyplot as plt
import numpy as np

# Define colors for the plots
colors = {
    1: '#1F77B4',
    2: '#FF7F0E',
    3: '#2CA02C',
    4: '#D62728'
}

# Create a figure with four subplots
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 8))

# Loop through income levels
for i, income_lvl in enumerate(range(1, 5)):
    # Filter data for the current income level and inflation values between -10 and 40
    df_subset = df_gini[(df_gini['income_lvl'] == income_lvl) & (df_gini['infl'] < 40) & (df_gini['infl'] > -10)]

    # Plot the scatterplot
    axes[i // 2, i % 2].scatter(df_subset['infl'], df_subset['gini'], color=colors[income_lvl])

    # Calculate and plot the trendline
    slope, intercept = np.polyfit(df_subset['infl'], df_subset['gini'], 1)
    axes[i // 2, i % 2].plot(df_subset['infl'], slope * df_subset['infl'] + intercept, color='black', linestyle='--')

    # Set the title and labels
    axes[i // 2, i % 2].set_title(f'Income Level {income_lvl}')
    axes[i // 2, i % 2].set_xlabel('Inflation')
    axes[i // 2, i % 2].set_ylabel('Gini Coefficient')

# Adjust the spacing between subplots
plt.tight_layout()

# Show the plot
plt.show()


#Installing required linear model packages for our statistical analysis
!pip install linearmodels
from linearmodels import PanelOLS
from linearmodels import RandomEffects


from linearmodels.panel import PooledOLS, PanelOLS, RandomEffects

# Assuming df_gini is your DataFrame with columns: gini, gdp_pc, infl, life_exp, popn_gr, unemp, income_lvl
# Drop any missing values
df_gini.dropna(inplace=True)
# Create a MultiIndex DataFrame with country and time dimensions
df_gini.set_index(['country', 'year'], inplace=True)

# Define dependent and independent variables
exog_vars = ['gdp_pc', 'infl', 'life_exp', 'popn_gr', 'unemp', 'income_lvl']
exog = df_gini[exog_vars]
endog = df_gini['gini']

# Pooled OLS model
pooled_ols_model = PooledOLS(endog, exog)
pooled_ols_results = pooled_ols_model.fit()


# Print model summaries
print("Pooled OLS:")
print(pooled_ols_results)

Pooled OLS:
                          PooledOLS Estimation Summary                          
================================================================================
Dep. Variable:                   gini   R-squared:                        0.9685
Estimator:                  PooledOLS   R-squared (Between):              0.9726
No. Observations:                1639   R-squared (Within):              -0.4438
Date:                Fri, May 03 2024   R-squared (Overall):              0.9685
Time:                        04:28:58   Log-likelihood                   -5506.1
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      8371.8
Entities:                          72   P-value                           0.0000
Avg Obs:                       22.764   Distribution:                  F(6,1633)
Min Obs:                       11.000                                           
Max Obs:                       29.000   F-statistic (robust):             8371.8
                                        P-value                           0.0000
Time periods:                      29   Distribution:                  F(6,1633)
Avg Obs:                       56.517                                           
Min Obs:                       14.000                                           
Max Obs:                       72.000                                           
                                                                                
                             Parameter Estimates                              
==============================================================================
            Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
gdp_pc        -0.0002  1.184e-05    -18.057     0.0000     -0.0002     -0.0002
infl           0.0025     0.0014     1.7775     0.0757     -0.0003      0.0052
life_exp       0.5553     0.0117     47.527     0.0000      0.5324      0.5783
popn_gr        4.8373     0.1809     26.740     0.0000      4.4825      5.1921
unemp          0.3307     0.0418     7.9146     0.0000      0.2488      0.4127
income_lvl    -2.0961     0.2896    -7.2376     0.0000     -2.6642     -1.5281
==============================================================================


# Fixed effects model
fixed_effects_model = PanelOLS(endog, exog, entity_effects=True)
fixed_effects_results = fixed_effects_model.fit()

print("\nFixed Effects:")
print(fixed_effects_results)

Fixed Effects:
                          PanelOLS Estimation Summary                           
================================================================================
Dep. Variable:                   gini   R-squared:                        0.1737
Estimator:                   PanelOLS   R-squared (Between):             -0.4042
No. Observations:                1639   R-squared (Within):               0.1737
Date:                Fri, May 03 2024   R-squared (Overall):             -0.3823
Time:                        04:28:59   Log-likelihood                   -3672.2
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      54.681
Entities:                          72   P-value                           0.0000
Avg Obs:                       22.764   Distribution:                  F(6,1561)
Min Obs:                       11.000                                           
Max Obs:                       29.000   F-statistic (robust):             54.681
                                        P-value                           0.0000
Time periods:                      29   Distribution:                  F(6,1561)
Avg Obs:                       56.517                                           
Min Obs:                       14.000                                           
Max Obs:                       72.000                                           
                                                                                
                             Parameter Estimates                              
==============================================================================
            Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
gdp_pc      1.365e-05  8.134e-06     1.6783     0.0935  -2.303e-06    2.96e-05
infl           0.0012     0.0005     2.3911     0.0169      0.0002      0.0022
life_exp      -0.0619     0.0364    -1.7001     0.0893     -0.1334      0.0095
popn_gr        0.8427     0.1621     5.2002     0.0000      0.5248      1.1606
unemp          0.2317     0.0261     8.8766     0.0000      0.1805      0.2829
income_lvl    -1.8328     0.1758    -10.426     0.0000     -2.1776     -1.4880
==============================================================================

F-test for Poolability: 179.73
P-value: 0.0000
Distribution: F(71,1561)

Included effects: Entity


# Random effects model
random_effects_model = RandomEffects(endog, exog)
random_effects_results = random_effects_model.fit()

print("\nRandom Effects:")
print(random_effects_results)

Random Effects:
                        RandomEffects Estimation Summary                        
================================================================================
Dep. Variable:                   gini   R-squared:                        0.5557
Estimator:              RandomEffects   R-squared (Between):              0.9503
No. Observations:                1639   R-squared (Within):               0.0312
Date:                Fri, May 03 2024   R-squared (Overall):              0.9424
Time:                        04:28:59   Log-likelihood                   -3856.7
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      340.35
Entities:                          72   P-value                           0.0000
Avg Obs:                       22.764   Distribution:                  F(6,1633)
Min Obs:                       11.000                                           
Max Obs:                       29.000   F-statistic (robust):             340.35
                                        P-value                           0.0000
Time periods:                      29   Distribution:                  F(6,1633)
Avg Obs:                       56.517                                           
Min Obs:                       14.000                                           
Max Obs:                       72.000                                           
                                                                                
                             Parameter Estimates                              
==============================================================================
            Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
gdp_pc     -4.898e-05  8.017e-06    -6.1097     0.0000   -6.47e-05  -3.326e-05
infl           0.0020     0.0005     3.6861     0.0002      0.0009      0.0030
life_exp       0.5226     0.0143     36.517     0.0000      0.4945      0.5506
popn_gr        1.5994     0.1696     9.4298     0.0000      1.2667      1.9320
unemp          0.3435     0.0274     12.555     0.0000      0.2899      0.3972
income_lvl    -2.8567     0.1805    -15.823     0.0000     -3.2108     -2.5025
==============================================================================


#Performing Hausman test to choose between fixed and random effects model.
#The numm hypothesis is random effects and alternative hypothesis is fixed effects in Hausman test

from scipy.stats import chi2

# Hausman test for comparing fixed effects and random effects
b_fe = fixed_effects_results.params
b_re = random_effects_results.params
cov_fe = fixed_effects_results.cov
cov_re = random_effects_results.cov

# Calculate the Hausman test statistic
d = b_fe - b_re
d_cov = cov_fe - cov_re
hausman_stat = d.T @ (d_cov @ d.T)
p_value = 1 - chi2.cdf(hausman_stat, df=d.shape[0])

# Print Hausman test statistic and p-value
print("Hausman Test:")
print("Test Statistic:", hausman_stat)
print("P-value:", p_value)

Hausman Test:
Test Statistic: 0.004634724637361317
P-value: 0.9999999979294973


# Breusch-Pagan Lagrange Multiplier test for heteroskedasticity to choose between random effects model and pooled OLS,
#The Null hypotheisis is Pooled OLs Model and Alternative Hypothesis is Random Effects in this test.

# Calculate the residuals from the pooled OLS and fixed effects models
pooled_ols_residuals = pooled_ols_results.resids
fixed_effects_residuals = fixed_effects_results.resids

# Calculate the squared residuals
pooled_ols_squared_residuals = pooled_ols_residuals ** 2
fixed_effects_squared_residuals = fixed_effects_residuals ** 2

# Calculate the cross-sectional averages of the squared residuals for the fixed effects model
fe_residuals_mean = fixed_effects_residuals.groupby(level='country').mean()

# Calculate the Breusch-Pagan test statistic
num = (pooled_ols_squared_residuals - fe_residuals_mean).sum()
den = pooled_ols_squared_residuals.sum() - (pooled_ols_squared_residuals.mean() ** 2) * pooled_ols_residuals.shape[0]
lm_stat = num / den

# Calculate the p-value using a chi-squared distribution with degrees of freedom equal to the number of countries - 1
p_value = 1 - chi2.cdf(lm_stat, df=fixed_effects_residuals.groupby(level='country').count().shape[0] - 1)

# Print the Breusch-Pagan test statistic and p-value
print("\nBreusch-Pagan Lagrange Multiplier Test:")
print("Test Statistic:", lm_stat)
print("P-value:", p_value)

Breusch-Pagan Lagrange Multiplier Test:
Test Statistic: -0.021064060212915268
P-value: 1.0


#Final Model
print("Pooled OLS:")
print(pooled_ols_results)

Pooled OLS:
                          PooledOLS Estimation Summary                          
================================================================================
Dep. Variable:                   gini   R-squared:                        0.9685
Estimator:                  PooledOLS   R-squared (Between):              0.9726
No. Observations:                1639   R-squared (Within):              -0.4438
Date:                Fri, May 03 2024   R-squared (Overall):              0.9685
Time:                        04:28:58   Log-likelihood                   -5506.1
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      8371.8
Entities:                          72   P-value                           0.0000
Avg Obs:                       22.764   Distribution:                  F(6,1633)
Min Obs:                       11.000                                           
Max Obs:                       29.000   F-statistic (robust):             8371.8
                                        P-value                           0.0000
Time periods:                      29   Distribution:                  F(6,1633)
Avg Obs:                       56.517                                           
Min Obs:                       14.000                                           
Max Obs:                       72.000                                           
                                                                                
                             Parameter Estimates                              
==============================================================================
            Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
gdp_pc        -0.0002  1.184e-05    -18.057     0.0000     -0.0002     -0.0002
infl           0.0025     0.0014     1.7775     0.0757     -0.0003      0.0052
life_exp       0.5553     0.0117     47.527     0.0000      0.5324      0.5783
popn_gr        4.8373     0.1809     26.740     0.0000      4.4825      5.1921
unemp          0.3307     0.0418     7.9146     0.0000      0.2488      0.4127
income_lvl    -2.0961     0.2896    -7.2376     0.0000     -2.6642     -1.5281
==============================================================================

	year	country	country_code	gdp	gdp_gr	gdp_pc	gini	gni	gni_pc	infl	life_exp	popn_gr	tax_gdp_prcnt	unemp
0	1991	Argentina	ARG	189719984268.484528	9.133111	5730.72381	46.8	183857364656.528564	3860	140.502379	72.319	1.424063	5.008237	5.44
1	1992	Argentina	ARG	228778917308.169861	7.937292	6815.32933	45.5	224521060685.898621	6060	16.071994	72.430	1.387435	5.547922	6.36
2	1993	Argentina	ARG	236741715015.015015	8.206979	6957.417499	44.9	233743662862.862854	7110	-3.561096	72.565	1.357966	8.015138	10.1
3	1994	Argentina	ARG	257440000000	5.836201	7464.474737	45.9	253743263100	7600	2.84934	73.172	1.347024	8.126437	11.76
4	1995	Argentina	ARG	258031750000	-2.84521	7383.70451	48.9	253362890700	7340	3.165123	73.133	1.317554	8.054358	18.8

	year	country	country_code	gdp	gdp_gr	gdp_pc	gini	gni_pc	infl	life_exp	popn_gr	unemp	country_year	income_lvl
0	1991	Argentina	ARG	1.897200e+11	9.133111	5730.723810	46.8	3860.0	140.502379	72.319	1.424063	5.44	ARG1991	2
1	1992	Argentina	ARG	2.287789e+11	7.937292	6815.329330	45.5	6060.0	16.071994	72.430	1.387435	6.36	ARG1992	3
2	1993	Argentina	ARG	2.367417e+11	8.206979	6957.417499	44.9	7110.0	-3.561096	72.565	1.357966	10.10	ARG1993	3
3	1994	Argentina	ARG	2.574400e+11	5.836201	7464.474737	45.9	7600.0	2.849340	73.172	1.347024	11.76	ARG1994	3
4	1995	Argentina	ARG	2.580318e+11	-2.845210	7383.704510	48.9	7340.0	3.165123	73.133	1.317554	18.80	ARG1995	3

	year	country	country_code	gdp	gdp_gr	gdp_pc	gini	gni_pc	infl	life_exp	popn_gr	unemp	country_year	income_lvl
0	1991	Argentina	ARG	1.897200e+11	9.133111	5730.723810	46.8	3860.0	140.502379	72.319	1.424063	5.44	ARG1991	2.0
1	1992	Argentina	ARG	2.287789e+11	7.937292	6815.329330	45.5	6060.0	16.071994	72.430	1.387435	6.36	ARG1992	3.0
2	1993	Argentina	ARG	2.367417e+11	8.206979	6957.417499	44.9	7110.0	-3.561096	72.565	1.357966	10.10	ARG1993	3.0
3	1994	Argentina	ARG	2.574400e+11	5.836201	7464.474737	45.9	7600.0	2.849340	73.172	1.347024	11.76	ARG1994	3.0
4	1995	Argentina	ARG	2.580318e+11	-2.845210	7383.704510	48.9	7340.0	3.165123	73.133	1.317554	18.80	ARG1995	3.0
5	1996	Argentina	ARG	2.721498e+11	5.526690	7690.157003	49.5	7700.0	-0.052375	73.307	1.260411	17.11	ARG1996	3.0
6	1997	Argentina	ARG	2.928590e+11	8.111047	8176.771195	49.1	8110.0	-0.463913	73.090	1.198264	14.82	ARG1997	3.0
7	1998	Argentina	ARG	2.989482e+11	3.850179	8250.673174	50.7	7990.0	-1.705280	73.474	1.158178	12.65	ARG1998	3.0
8	1999	Argentina	ARG	2.835230e+11	-3.385457	7735.322080	49.8	7540.0	-1.836558	73.722	1.152044	14.05	ARG1999	3.0
9	2000	Argentina	ARG	2.842038e+11	-0.788999	7666.517834	51.1	7430.0	1.037287	73.926	1.133277	15.00	ARG2000	3.0
10	2001	Argentina	ARG	2.686968e+11	-4.408840	7168.975872	53.3	6960.0	-1.095768	74.186	1.099171	17.32	ARG2001	3.0
11	2002	Argentina	ARG	9.772400e+10	-10.894485	2579.488769	53.8	4020.0	30.555204	74.408	1.073538	19.59	ARG2002	2.0
12	2003	Argentina	ARG	1.275870e+11	8.837041	3333.152904	51.0	3640.0	10.495703	74.080	1.032361	15.36	ARG2003	2.0
13	2004	Argentina	ARG	1.646579e+11	9.029573	4258.160261	48.5	3360.0	18.363354	74.855	1.015337	13.52	ARG2004	2.0
14	2005	Argentina	ARG	1.987371e+11	8.851660	5086.627761	47.8	4240.0	10.317511	75.139	1.033476	11.51	ARG2005	3.0
15	2006	Argentina	ARG	2.325573e+11	8.047152	5890.978002	46.4	5460.0	13.741052	75.433	1.034672	10.08	ARG2006	3.0
16	2007	Argentina	ARG	2.875305e+11	9.007651	7210.595548	46.3	6480.0	14.939925	75.006	1.006297	8.47	ARG2007	3.0
17	2008	Argentina	ARG	3.615580e+11	4.057233	8977.506851	45.0	7630.0	23.171165	75.641	0.992294	7.84	ARG2008	3.0
18	2009	Argentina	ARG	3.329765e+11	-5.918525	8184.389889	43.8	7760.0	15.377649	75.936	1.014284	8.65	ARG2009	3.0
19	2010	Argentina	ARG	4.236274e+11	10.125398	10385.964432	43.7	9270.0	20.915124	75.721	0.255582	7.71	ARG2010	3.0

How does Economic Growth Affect Income Inequality?¶

Mandeep Karki¶

Project Goals¶

Dataset for the Project¶

Question and Model Decision¶

ELT (Extraction, Load, and Transform)¶

Summary Stats and Visualizations¶

Model and Results¶

Final Model, Results, and Conclusion¶