# Import the important Libraries

import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import linregress

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_absolute_percentage_error, mean_squared_error


# Never truncate DataFrame output: show every row and every column.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

# Mount Google Drive so files under /content/drive can be read (Colab runtime only).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Read the sustainable-energy CSV from the mounted Drive into a DataFrame
Energy_df = pd.read_csv(
    "/content/drive/MyDrive/Data_Science/project/Global/"
    "global-data-on-sustainable-energy.csv"
)
# Peek at the first and last rows (only rendered when run as notebook cells)
Energy_df.head()
Energy_df.tail()


# Countries treated as "North America" in this analysis
north_america_countries = ['Antigua and Barbuda','Bahamas','Barbados','Belize','Canada','Costa Rica','Cuba', 'Dominica',
    'Dominican Republic','El Salvador','Grenada','Guatemala','Haiti','Honduras','Jamaica','Mexico','Nicaragua',
    'Panama','Saint Kitts and Nevis','Saint Lucia','Saint Vincent and the Grenadines','Trinidad and Tobago',
    'United States']

# Keep only rows whose 'Entity' is in the list; .copy() detaches the subset from Energy_df
north_america_df = Energy_df.loc[Energy_df['Entity'].isin(north_america_countries)].copy()

# Save the subset and reload it from the SAME path. The original wrote to the
# working directory but read from a hard-coded "/content/..." path, which only
# works when the working directory happens to be /content.
north_america_csv = "north_america_data.csv"
north_america_df.to_csv(north_america_csv, index=False)

North_america_df = pd.read_csv(north_america_csv)
North_america_df.head()


# Show the dtype of every column (display() is provided by IPython/Colab)
display(North_america_df.dtypes)

Entity                                                               object
Year                                                                  int64
Access to electricity (% of population)                             float64
Access to clean fuels for cooking                                   float64
Renewable-electricity-generating-capacity-per-capita                float64
Financial flows to developing countries (US $)                      float64
Renewable energy share in the total final energy consumption (%)    float64
Electricity from fossil fuels (TWh)                                 float64
Electricity from nuclear (TWh)                                      float64
Electricity from renewables (TWh)                                   float64
Low-carbon electricity (% electricity)                              float64
Primary energy consumption per capita (kWh/person)                  float64
Energy intensity level of primary energy (MJ/$2017 PPP GDP)         float64
Value_co2_emissions_kt_by_country                                   float64
Renewables (% equivalent primary energy)                            float64
gdp_growth                                                          float64
gdp_per_capita                                                      float64
Density\n(P/Km2)                                                      int64
Land Area(Km2)                                                      float64
Latitude                                                            float64
Longitude                                                           float64
dtype: object


# Index the frame by (country, year)
North_america_df = North_america_df.set_index(['Entity', 'Year'])
North_america_df.head()


# Replace every missing value, in every column, with 0.
# NOTE(review): this also zero-fills 'Value_co2_emissions_kt_by_country', which is
# used further down as the regression target -- confirm that 0 is an acceptable
# stand-in for "missing" there rather than dropping those rows.
North_america_df = North_america_df.fillna(0)


# Per-column count of remaining missing values
North_america_df.isnull().sum(axis=0)

Access to electricity (% of population)                             0
Access to clean fuels for cooking                                   0
Renewable-electricity-generating-capacity-per-capita                0
Financial flows to developing countries (US $)                      0
Renewable energy share in the total final energy consumption (%)    0
Electricity from fossil fuels (TWh)                                 0
Electricity from nuclear (TWh)                                      0
Electricity from renewables (TWh)                                   0
Low-carbon electricity (% electricity)                              0
Primary energy consumption per capita (kWh/person)                  0
Energy intensity level of primary energy (MJ/$2017 PPP GDP)         0
Value_co2_emissions_kt_by_country                                   0
Renewables (% equivalent primary energy)                            0
gdp_growth                                                          0
gdp_per_capita                                                      0
Density\n(P/Km2)                                                    0
Land Area(Km2)                                                      0
Latitude                                                            0
Longitude                                                           0
dtype: int64


# Per-row count of missing values (result is not used further)
North_america_df.isnull().sum(axis=1)


# Summary statistics for every numeric column
North_america_df.describe()


# Histogram with a KDE curve overlaid for primary energy consumption per capita
plt.figure(figsize=(16, 8))
sns.histplot(
    data=North_america_df,
    x='Primary energy consumption per capita (kWh/person)',
    bins=20,
    kde=True,
    color='blue',
    edgecolor='red',
)
plt.title('Distribution of Primary energy consumption per capita')
plt.xlabel('Primary energy consumption per capita (kWh/person)')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()


# Series of interest plus the country label of every row
gdp_growth = North_america_df['gdp_growth']
energy_consumption_per_capita = North_america_df['Primary energy consumption per capita (kWh/person)']
countries = North_america_df.index.get_level_values('Entity')

# Assign each country a marker and a colour; both sequences wrap around when
# there are more countries than entries
unique_countries = countries.unique()
markers = ['o', 's', '^', 'D', 'v', '+', 'x', '*', '.', 'p', 'h', 'H']        # Different marker styles
colors = plt.cm.tab20.colors            # tab20 colormap: 20 distinct colours
country_marker_map = {
    name: markers[pos % len(markers)] for pos, name in enumerate(unique_countries)
}
country_color_map = {
    name: colors[pos % len(colors)] for pos, name in enumerate(unique_countries)
}

# One scatter series per country so each gets its own legend entry
plt.figure(figsize=(20, 8))
for country in unique_countries:
    country_data = North_america_df.loc[country]
    plt.scatter(
        country_data['gdp_growth'],
        country_data['Primary energy consumption per capita (kWh/person)'],
        marker=country_marker_map[country],
        color=country_color_map[country],
        alpha=0.8,
        label=country,
    )
plt.grid(True)
plt.xlabel('GDP Growth (%)')
plt.ylabel('Primary Energy Consumption per Capita (kWh/person)')
plt.title('Correlation between GDP Growth and Primary Energy Consumption per Capita')

# Place the legend to the right of the axes, outside the plot area
plt.legend(title='Country', bbox_to_anchor=(1.05, 1), loc='upper left')

plt.show()


access_to_electricity = North_america_df['Access to electricity (% of population)']
# Unique countries, in order of first appearance in the index
countries = access_to_electricity.index.get_level_values('Entity').unique()

# Derive the grid from the number of countries so that none is silently dropped
# (the original fixed a 5x5 grid and sliced the country list to fit it).
num_cols = 5
num_rows = max(1, -(-len(countries) // num_cols))  # ceiling division, at least one row
num_plots = num_rows * num_cols
# squeeze=False keeps `axes` two-dimensional even for a single row or column
fig, axes = plt.subplots(num_rows, num_cols, figsize=(20, 2 * num_rows), squeeze=False)
flat_axes = axes.ravel()

# One line chart per country, filling the grid row by row
for ax, country in zip(flat_axes, countries):
    country_data = access_to_electricity.loc[country]
    country_data.plot(kind='line', ax=ax, title=f'Access to electricity - {country}')

# Hide the trailing panels that have no country assigned
for ax in flat_axes[len(countries):]:
    ax.set_visible(False)

# Adjust layout
plt.tight_layout()
plt.show()


# Box plot of clean-cooking-fuel access per country.
# 'Entity' is an index level at this point, not a column; reset_index() turns it
# back into a column so the plot does not depend on the seaborn version being
# able to resolve index-level names passed as x=.
plt.figure(figsize=(16, 8))
sns.boxplot(data=North_america_df.reset_index(), x='Entity', y='Access to clean fuels for cooking')
plt.title('Access to Clean Cooking Fuels Across Countries')
plt.xlabel('Country')
plt.ylabel('Access to Clean Cooking Fuels')
plt.xticks(rotation=45)  # country names are long; tilt them to reduce overlap
plt.show()


# Select relevant columns for CO2 emissions and electricity generation from different sources
co2_emissions = North_america_df['Value_co2_emissions_kt_by_country']
electricity_fossil_fuels = North_america_df['Electricity from fossil fuels (TWh)']
electricity_nuclear = North_america_df['Electricity from nuclear (TWh)']
electricity_renewables = North_america_df['Electricity from renewables (TWh)']


# Take the country list from this frame directly instead of relying on whatever
# an earlier cell last left in `countries`; every entry is then guaranteed to exist.
countries = North_america_df.index.get_level_values('Entity').unique()

# Size the grid from the number of countries so none is silently dropped
num_cols = 5
num_rows = max(1, -(-len(countries) // num_cols))  # ceiling division, at least one row
num_plots = num_rows * num_cols
# squeeze=False keeps `axes` two-dimensional so axes[0, 0] below always works
fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, 2 * num_rows), squeeze=False)
flat_axes = axes.ravel()

# One stacked area chart per country
for ax, country in zip(flat_axes, countries):
    country_data = North_america_df.loc[country]
    years = country_data.index.get_level_values('Year')
    # Rows of stacked_data are the three sources, columns are the years
    stacked_data = np.vstack([electricity_fossil_fuels.loc[country],
                              electricity_nuclear.loc[country],
                              electricity_renewables.loc[country]])
    ax.stackplot(years, stacked_data, labels=['Fossil Fuels (TWh)', 'Nuclear (TWh)', 'Renewables(TWh)'])
    ax.set_title(f'{country}')
    ax.set_xlabel('Year')
    ax.set_ylabel('Value')
    ax.grid(True)

# Hide the trailing panels that have no country assigned
for ax in flat_axes[len(countries):]:
    ax.set_visible(False)

# Create a single legend for all the subplots (every panel uses the same labels)
handles, labels = axes[0, 0].get_legend_handles_labels()
fig.legend(handles, labels, loc='upper left', bbox_to_anchor=(1.05, 1), borderaxespad=0.)

# Adjust layout
plt.tight_layout()
plt.show()


# Countries with at least one year of nuclear generation above 10 TWh
# (the threshold is 10, not merely "non-zero")
countries_with_nuclear = North_america_df[North_america_df['Electricity from nuclear (TWh)'] > 10].index.get_level_values('Entity').unique()

# One line per selected country
plt.figure(figsize=(10, 6))
for country in countries_with_nuclear:
    country_data = North_america_df.loc[country]
    years = country_data.index.get_level_values('Year')
    # Use a loop-local name so the full `electricity_nuclear` series defined
    # earlier is not overwritten by a single country's values.
    country_nuclear = country_data['Electricity from nuclear (TWh)']
    plt.plot(years, country_nuclear, label=country, marker="*")

# Adding labels and legend
plt.title('Electricity from Nuclear by Country')
plt.xlabel('Year')
plt.ylabel('Electricity from Nuclear (TWh)')
plt.legend()
plt.grid(True)

# Show plot
plt.show()


# Columns to correlate: generation by source, financial flows, GDP measures and CO2
relevant_data = North_america_df.loc[:, [
    'Electricity from fossil fuels (TWh)',
    'Electricity from nuclear (TWh)',
    'Electricity from renewables (TWh)',
    'Financial flows to developing countries (US $)',
    'gdp_growth',
    'gdp_per_capita',
    'Value_co2_emissions_kt_by_country',
]]


# Pairwise correlation between the selected columns
correlation_matrix = relevant_data.corr()

# Annotated heatmap of the correlation matrix
plt.figure(figsize=(20, 10))
sns.heatmap(data=correlation_matrix, cmap='coolwarm', annot=True, fmt=".2f", linewidths=.5)
plt.title('Correlation Matrix of Electricity Generation and CO2 Emissions')
plt.show()


# Narrower selection: the three generation sources and CO2 emissions only
relevant_data = North_america_df.loc[:, [
    'Electricity from fossil fuels (TWh)',
    'Electricity from nuclear (TWh)',
    'Electricity from renewables (TWh)',
    'Value_co2_emissions_kt_by_country',
]]


# Pairwise correlation between the selected columns
correlation_matrix = relevant_data.corr()

# Annotated heatmap of the correlation matrix
plt.figure(figsize=(16, 10))
sns.heatmap(data=correlation_matrix, cmap='RdYlBu', annot=True, fmt=".2f", linewidths=.7)
plt.title('Correlation Matrix of Electricity Generation and CO2 Emissions')
plt.show()


# Three side-by-side panels, one per generation source
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 5))

# Predictors plotted against CO2 emissions
predictor_vars = ['Electricity from fossil fuels (TWh)', 'Electricity from nuclear (TWh)', 'Electricity from renewables (TWh)']

for ax, var in zip(axes, predictor_vars):
    # Scatter of the raw points with a fitted regression line drawn in red
    sns.regplot(
        data=North_america_df,
        x=var,
        y='Value_co2_emissions_kt_by_country',
        ax=ax,
        line_kws={'color': 'red'},
    )

    # Axis labels and panel title
    ax.set_xlabel(var)
    ax.set_ylabel('CO2 Emissions (kt)')
    ax.set_title(f'Regression Plot: {var} vs. CO2 Emissions')

# Widen the horizontal gap between the panels
plt.subplots_adjust(wspace=0.3)

plt.show()


# Series used for the x axis, y axis and point colour respectively
renewable_capacity = North_america_df['Renewable-electricity-generating-capacity-per-capita']
renewable_share = North_america_df['Renewable energy share in the total final energy consumption (%)']
co2_emissions = North_america_df['Value_co2_emissions_kt_by_country']

# Ordinary least-squares fit of share against capacity
slope, intercept, r_value, p_value, std_err = linregress(renewable_capacity, renewable_share)

# Scatter coloured by CO2 emissions
plt.figure(figsize=(20, 8))
plt.scatter(renewable_capacity, renewable_share, c=co2_emissions, cmap='coolwarm', alpha=0.8)
plt.colorbar(label='CO2 Emissions (kt)')
plt.title('Relationship Between Renewable Energy Capacity, Share, and CO2 Emissions')
plt.xlabel('Renewable Energy Capacity per Capita')
plt.ylabel('Renewable Energy Share (%)')

# Draw the fitted line between the smallest and largest observed capacity
x_vals = np.array([renewable_capacity.min(), renewable_capacity.max()])
y_vals = slope * x_vals + intercept
plt.plot(x_vals, y_vals, 'r-', label=f'Linear regression\ny = {intercept:.2f} + {slope:.2f}x')
plt.legend()

plt.grid(True)
plt.show()


# Same three series as the fitted version of this chart
renewable_capacity = North_america_df['Renewable-electricity-generating-capacity-per-capita']
renewable_share = North_america_df['Renewable energy share in the total final energy consumption (%)']
co2_emissions = North_america_df['Value_co2_emissions_kt_by_country']

# Scatter coloured by CO2 emissions, without a regression line
plt.figure(figsize=(20, 8))
plt.scatter(
    renewable_capacity,
    renewable_share,
    c=co2_emissions,
    cmap='coolwarm',
    alpha=0.8,
)
plt.colorbar(label='CO2 Emissions (kt)')
plt.title('Relationship Between Renewable Energy Capacity, Share, and CO2 Emissions')
plt.xlabel('Renewable Energy Capacity per Capita')
plt.ylabel('Renewable Energy Share (%)')
plt.grid(True)
plt.show()


# Features: generation by source. Target: CO2 emissions.
X = North_america_df.loc[:, [
    'Electricity from fossil fuels (TWh)',
    'Electricity from nuclear (TWh)',
    'Electricity from renewables (TWh)',
]]
y = North_america_df.loc[:, 'Value_co2_emissions_kt_by_country']

# 60/40 train/test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# Fit a multiple linear regression on the training portion
model = LinearRegression().fit(X_train, y_train)

# Predict the held-out portion
y_pred = model.predict(X_test)

# Report the fitted parameters (one coefficient per feature, in column order)
print('Model Coefficients:', model.coef_)
print('Model Intercept:', model.intercept_)

Model Coefficients: [ 2632.64653286 -2921.61136083   197.0527656 ]
Model Intercept: 10222.61293404107


# Mean squared error and R-squared of the predictions on the test set
mse = mean_squared_error(y_test, y_pred)
r2 =  r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

# Root mean squared error, taken as the square root of the MSE computed above.
# The `squared=False` argument of mean_squared_error is deprecated and removed
# in recent scikit-learn releases, so it is not used here.
rmse = np.sqrt(mse)
print("Root Mean Squared Error (RMSE):", rmse)

Mean Squared Error: 9015591873.10823
R-squared: 0.9935387149973578
Root Mean Squared Error (RMSE): 94950.4706313151


# Scatter of predicted against actual test-set emissions
plt.figure(figsize=(16, 10))
plt.scatter(y_test, y_pred, alpha=0.7)
# Dashed black diagonal marking where predicted == actual, spanning the range of y
plt.plot([y.min(), y.max()], [y.min(), y.max()], color='k', linestyle='--', linewidth=4)
plt.title('Actual vs Predicted CO2 Emissions')
plt.xlabel('Actual CO2 Emissions (kt)')
plt.ylabel('Predicted CO2 Emissions (kt)')
plt.show()


# Expand the features to degree-2 polynomial terms; the expansion is fitted on
# the training set and then applied unchanged to the test set
poly = PolynomialFeatures(degree=2)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Linear regression on the expanded features
model = LinearRegression().fit(X_train_poly, y_train)

# Predict the held-out portion
y_pred = model.predict(X_test_poly)

# Test-set error metrics
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print('Mean Squared Error:', mse)
print('R-squared:', r2)

# Predicted against actual, with a dashed diagonal marking perfect predictions
plt.figure(figsize=(16, 10))
plt.scatter(y_test, y_pred)
plt.plot([y.min(), y.max()], [y.min(), y.max()], color='k', linestyle='--', linewidth=4)
plt.title('Actual vs. Predicted Values')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.show()

Mean Squared Error: 17607714067.40731
R-squared: 0.9873809218034925


# Decision tree regressor with a fixed seed, trained on the current split
dt_regressor = DecisionTreeRegressor(random_state=42)
dt_regressor.fit(X_train, y_train)
# Predictions for the held-out portion
dt_predictions = dt_regressor.predict(X_test)
# Mean squared error on the test set
dt_mse = mean_squared_error(y_test, dt_predictions)
print(" Mean Squared Error:", dt_mse)


# R-squared
r2 = r2_score(y_test, dt_predictions)
print("R-squared (R²):", r2)
# Mean absolute error
mae = mean_absolute_error(y_test, dt_predictions)
print("Mean Absolute Error (MAE):", mae)
# Root mean squared error, derived from the MSE above; the deprecated
# `squared=False` argument of mean_squared_error is avoided.
rmse = np.sqrt(dt_mse)
print("Root Mean Squared Error (RMSE):", rmse)

 Mean Squared Error: 5163479906.235304
R-squared (R²): 0.9962994425935454
Mean Absolute Error (MAE): 16191.712643913226
Root Mean Squared Error (RMSE): 71857.35805215291


# Random forest regressor with a fixed seed, trained on the current split
rf_regressor = RandomForestRegressor(random_state=42)
rf_regressor.fit(X_train, y_train)
# Predictions for the held-out portion
rf_predictions = rf_regressor.predict(X_test)


# Mean squared error on the test set
rf_mse = mean_squared_error(y_test, rf_predictions)
print("Random Forest Mean Squared Error:", rf_mse)
# R-squared
r2 = r2_score(y_test, rf_predictions)
print("R-squared (R²):", r2)
# Mean absolute error
mae = mean_absolute_error(y_test, rf_predictions)
print("Mean Absolute Error (MAE):", mae)
# Root mean squared error, derived from the MSE above; the deprecated
# `squared=False` argument of mean_squared_error is avoided.
rmse = np.sqrt(rf_mse)
print("Root Mean Squared Error (RMSE):", rmse)

Random Forest Mean Squared Error: 4170248057.053951
R-squared (R²): 0.997011270961731
Mean Absolute Error (MAE): 15999.066497858534
Root Mean Squared Error (RMSE): 64577.457808851155


# Countries treated as "Asia" in this analysis

asian_countries = ['Afghanistan', 'Armenia', 'Azerbaijan', 'Bahrain', 'Bangladesh', 'Bhutan', 'Brunei', 'Cambodia',
                   'China', 'Cyprus', 'Georgia', 'India', 'Indonesia', 'Iran', 'Iraq', 'Israel', 'Japan', 'Jordan',
                   'Kazakhstan', 'Kuwait', 'Kyrgyzstan', 'Laos', 'Lebanon', 'Malaysia', 'Maldives', 'Mongolia',
                   'Myanmar', 'Nepal',  'Oman', 'Pakistan', 'Palestine', 'Philippines', 'Qatar',
                   'Saudi Arabia', 'Singapore','Sri Lanka', 'Syria', 'Taiwan', 'Tajikistan',
                   'Thailand', 'Turkey', 'Turkmenistan', 'United Arab Emirates', 'Uzbekistan', 'Yemen']



# Keep only rows whose 'Entity' is in the list; .copy() detaches the subset from Energy_df
asia_df = Energy_df[Energy_df['Entity'].isin(asian_countries)].copy()

# Save the subset and reload it from the SAME path. The original wrote to the
# working directory but read from a hard-coded "/content/..." path.
asia_csv = 'asia_data.csv'
asia_df.to_csv(asia_csv, index=False)


Asia_df = pd.read_csv(asia_csv)
Asia_df.head()


# Index the frame by (country, year)
Asia_df.set_index(['Entity', 'Year'], inplace=True)
Asia_df.head()


# Replace every missing value, in every column, with 0.
# (The original followed this with `Asia_df.dropna(axis = 1);`, whose result was
# discarded and which had nothing left to drop; that no-op has been removed.)
Asia_df.fillna(0, inplace=True)

# Per-column count of remaining missing values
Asia_df.isnull().sum()

Access to electricity (% of population)                             0
Access to clean fuels for cooking                                   0
Renewable-electricity-generating-capacity-per-capita                0
Financial flows to developing countries (US $)                      0
Renewable energy share in the total final energy consumption (%)    0
Electricity from fossil fuels (TWh)                                 0
Electricity from nuclear (TWh)                                      0
Electricity from renewables (TWh)                                   0
Low-carbon electricity (% electricity)                              0
Primary energy consumption per capita (kWh/person)                  0
Energy intensity level of primary energy (MJ/$2017 PPP GDP)         0
Value_co2_emissions_kt_by_country                                   0
Renewables (% equivalent primary energy)                            0
gdp_growth                                                          0
gdp_per_capita                                                      0
Density\n(P/Km2)                                                    0
Land Area(Km2)                                                      0
Latitude                                                            0
Longitude                                                           0
dtype: int64


access_to_electricity = Asia_df['Access to electricity (% of population)']
# Unique countries, in order of first appearance in the index
countries = access_to_electricity.index.get_level_values('Entity').unique()

# Derive the grid from the number of countries: the original fixed 6x6 grid
# silently dropped every country beyond the 36th.
num_cols = 6
num_rows = max(1, -(-len(countries) // num_cols))  # ceiling division, at least one row
num_plots = num_rows * num_cols
# squeeze=False keeps `axes` two-dimensional even for a single row or column
fig, axes = plt.subplots(num_rows, num_cols, figsize=(20, 2.5 * num_rows), squeeze=False)
flat_axes = axes.ravel()

# One line chart per country, filling the grid row by row
for ax, country in zip(flat_axes, countries):
    country_data = access_to_electricity.loc[country]
    country_data.plot(kind='line', ax=ax, title=f'Access to electricity - {country}')

# Hide the trailing panels that have no country assigned
for ax in flat_axes[len(countries):]:
    ax.set_visible(False)

# Adjust layout
plt.tight_layout()
plt.show()


# Select relevant columns for CO2 emissions and electricity generation from different sources
co2_emissions = Asia_df['Value_co2_emissions_kt_by_country']
electricity_fossil_fuels = Asia_df['Electricity from fossil fuels (TWh)']
electricity_nuclear = Asia_df['Electricity from nuclear (TWh)']
electricity_renewables = Asia_df['Electricity from renewables (TWh)']


# Take the country list from this frame directly instead of relying on whatever
# an earlier cell last left in `countries`; every entry is then guaranteed to exist.
countries = Asia_df.index.get_level_values('Entity').unique()

# Size the grid from the number of countries so none is silently dropped
num_cols = 6
num_rows = max(1, -(-len(countries) // num_cols))  # ceiling division, at least one row
num_plots = num_rows * num_cols
# squeeze=False keeps `axes` two-dimensional so axes[0, 0] below always works
fig, axes = plt.subplots(num_rows, num_cols, figsize=(20, 2.5 * num_rows), squeeze=False)
flat_axes = axes.ravel()

# One stacked area chart per country
for ax, country in zip(flat_axes, countries):
    country_data = Asia_df.loc[country]
    years = country_data.index.get_level_values('Year')
    # Rows of stacked_data are the three sources, columns are the years
    stacked_data = np.vstack([electricity_fossil_fuels.loc[country],
                              electricity_nuclear.loc[country],
                              electricity_renewables.loc[country]])
    ax.stackplot(years, stacked_data, labels=['Fossil Fuels (TWh)', 'Nuclear (TWh)', 'Renewables(TWh)'])
    ax.set_title(f'{country}')
    ax.set_xlabel('Year')
    ax.set_ylabel('Value')
    ax.grid(True)

# Hide the trailing panels that have no country assigned
for ax in flat_axes[len(countries):]:
    ax.set_visible(False)

# Create a single legend for all the subplots (every panel uses the same labels)
handles, labels = axes[0, 0].get_legend_handles_labels()
fig.legend(handles, labels, loc='upper left', bbox_to_anchor=(1.05, 1), borderaxespad=0.)

# Adjust layout
plt.tight_layout()
plt.show()


# Columns to correlate: generation by source, financial flows, GDP measures and CO2
relevant_data = Asia_df.loc[:, [
    'Electricity from fossil fuels (TWh)',
    'Electricity from nuclear (TWh)',
    'Electricity from renewables (TWh)',
    'Financial flows to developing countries (US $)',
    'gdp_growth',
    'gdp_per_capita',
    'Value_co2_emissions_kt_by_country',
]]

# Pairwise correlation between the selected columns
correlation_matrix = relevant_data.corr()

# Annotated heatmap of the correlation matrix
plt.figure(figsize=(16, 10))
sns.heatmap(data=correlation_matrix, cmap='coolwarm', annot=True, fmt=".2f", linewidths=.5)
plt.title('Correlation Matrix of Electricity Generation and CO2 Emissions')
plt.show()


# Features: generation by source. Target: CO2 emissions.
X_data = Asia_df.loc[:, [
    'Electricity from fossil fuels (TWh)',
    'Electricity from nuclear (TWh)',
    'Electricity from renewables (TWh)',
]]
y_data = Asia_df.loc[:, 'Value_co2_emissions_kt_by_country']

# 60/40 train/test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.4, random_state=42)

# Fit a multiple linear regression on the training portion
model = LinearRegression().fit(X_train, y_train)

# Predict the held-out portion
y_pred = model.predict(X_test)

# Report the fitted parameters (one coefficient per feature, in column order)
print('Model Coefficients for Asian data :', model.coef_)
print('Model Intercept Asian data:', model.intercept_)

Model Coefficients for Asian data : [ 2704.54573939 -1783.64009849 -1064.33341351]
Model Intercept Asian data: -45481.16916626948


# Mean squared error and R-squared of the predictions on the test set
mse = mean_squared_error(y_test, y_pred)
r2 =  r2_score(y_test, y_pred)

print(f"Mean Squared Error Asian data: {mse}")
print(f"R-squared: {r2}")


# Root mean squared error, taken as the square root of the MSE computed above.
# The `squared=False` argument of mean_squared_error is deprecated and removed
# in recent scikit-learn releases, so it is not used here.
rmse = np.sqrt(mse)
print("Root Mean Squared Error (RMSE) Asian data:", rmse)

Mean Squared Error Asian data: 396984990458.2904
R-squared: 0.7223382481582331
Root Mean Squared Error (RMSE) Asian data: 630067.4491340513


# Random forest regressor with a fixed seed, trained on the current split
rf_regressor = RandomForestRegressor(random_state=42)
rf_regressor.fit(X_train, y_train)
# Predictions for the held-out portion
rf_predictions = rf_regressor.predict(X_test)


# Mean squared error on the test set
rf_mse = mean_squared_error(y_test, rf_predictions)
print("Random Forest Mean Squared Error:", rf_mse)
# R-squared
r2 = r2_score(y_test, rf_predictions)
print("R-squared (R²):", r2)
# Mean absolute error
mae = mean_absolute_error(y_test, rf_predictions)
print("Mean Absolute Error (MAE):", mae)
# Root mean squared error, derived from the MSE above; the deprecated
# `squared=False` argument of mean_squared_error is avoided.
rmse = np.sqrt(rf_mse)
print("Root Mean Squared Error (RMSE):", rmse)

Random Forest Mean Squared Error: 341564756263.001
R-squared (R²): 0.7611006187364775
Mean Absolute Error (MAE): 67164.90525945261
Root Mean Squared Error (RMSE): 584435.4166740761


# Countries treated as "Europe" in this analysis

european_countries = ['Albania', 'Austria', 'Belarus', 'Belgium', 'Bosnia and Herzegovina', "Bulgaria",
'Croatia','Czech Republic','Denmark','Estonia', 'Finland', 'France', 'Germany','Greece','Hungary',
'Iceland', 'Ireland', 'Italy','Latvia','Lithuania','Luxembourg','Malta',
'Montenegro','Netherlands','North Macedonia','Norway','Poland','Portugal','Romania','Russia',
'Serbia','Slovakia','Slovenia','Spain','Sweden','Switzerland','Ukraine','United Kingdom']


# Keep only rows whose 'Entity' is in the list; .copy() detaches the subset from Energy_df
europe_df = Energy_df[Energy_df['Entity'].isin(european_countries)].copy()

# Save the subset and reload it from the SAME path. The original wrote to the
# working directory but read from a hard-coded "/content/..." path.
europe_csv = 'europe_data.csv'
europe_df.to_csv(europe_csv, index=False)


Europe_df = pd.read_csv(europe_csv)
Europe_df.head()


# Index the frame by (country, year)
Europe_df.set_index(['Entity', 'Year'], inplace=True)
Europe_df.head()


# Replace every missing value, in every column, with 0.
# (The original followed this with `Europe_df.dropna(axis = 1);`, whose result was
# discarded and which had nothing left to drop; that no-op has been removed.)
Europe_df.fillna(0, inplace=True)

# Per-column count of remaining missing values
Europe_df.isnull().sum()

Access to electricity (% of population)                             0
Access to clean fuels for cooking                                   0
Renewable-electricity-generating-capacity-per-capita                0
Financial flows to developing countries (US $)                      0
Renewable energy share in the total final energy consumption (%)    0
Electricity from fossil fuels (TWh)                                 0
Electricity from nuclear (TWh)                                      0
Electricity from renewables (TWh)                                   0
Low-carbon electricity (% electricity)                              0
Primary energy consumption per capita (kWh/person)                  0
Energy intensity level of primary energy (MJ/$2017 PPP GDP)         0
Value_co2_emissions_kt_by_country                                   0
Renewables (% equivalent primary energy)                            0
gdp_growth                                                          0
gdp_per_capita                                                      0
Density\n(P/Km2)                                                    0
Land Area(Km2)                                                      0
Latitude                                                            0
Longitude                                                           0
dtype: int64


access_to_electricity = Europe_df['Access to electricity (% of population)']
# Unique countries, in order of first appearance in the index
countries = access_to_electricity.index.get_level_values('Entity').unique()

# Derive the grid from the number of countries: the original fixed 6x6 grid
# silently dropped every country beyond the 36th.
num_cols = 6
num_rows = max(1, -(-len(countries) // num_cols))  # ceiling division, at least one row
num_plots = num_rows * num_cols
# squeeze=False keeps `axes` two-dimensional even for a single row or column
fig, axes = plt.subplots(num_rows, num_cols, figsize=(20, 2.5 * num_rows), squeeze=False)
flat_axes = axes.ravel()

# One line chart per country, filling the grid row by row
for ax, country in zip(flat_axes, countries):
    country_data = access_to_electricity.loc[country]
    country_data.plot(kind='line', ax=ax, title=f'Access to electricity - {country}')

# Hide the trailing panels that have no country assigned
for ax in flat_axes[len(countries):]:
    ax.set_visible(False)

# Adjust layout
plt.tight_layout()
plt.show()


# Select relevant columns for CO2 emissions and electricity generation from different sources
co2_emissions = Europe_df['Value_co2_emissions_kt_by_country']
electricity_fossil_fuels = Europe_df['Electricity from fossil fuels (TWh)']
electricity_nuclear = Europe_df['Electricity from nuclear (TWh)']
electricity_renewables = Europe_df['Electricity from renewables (TWh)']


# Take the country list from this frame directly instead of relying on whatever
# an earlier cell last left in `countries`; every entry is then guaranteed to exist.
countries = Europe_df.index.get_level_values('Entity').unique()

# Size the grid from the number of countries so none is silently dropped
num_cols = 6
num_rows = max(1, -(-len(countries) // num_cols))  # ceiling division, at least one row
num_plots = num_rows * num_cols
# squeeze=False keeps `axes` two-dimensional so axes[0, 0] below always works
fig, axes = plt.subplots(num_rows, num_cols, figsize=(20, 2.5 * num_rows), squeeze=False)
flat_axes = axes.ravel()

# One stacked area chart per country
for ax, country in zip(flat_axes, countries):
    country_data = Europe_df.loc[country]
    years = country_data.index.get_level_values('Year')
    # Rows of stacked_data are the three sources, columns are the years
    stacked_data = np.vstack([electricity_fossil_fuels.loc[country],
                              electricity_nuclear.loc[country],
                              electricity_renewables.loc[country]])
    ax.stackplot(years, stacked_data, labels=['Fossil Fuels (TWh)', 'Nuclear (TWh)', 'Renewables(TWh)'])
    ax.set_title(f'{country}')
    ax.set_xlabel('Year')
    ax.set_ylabel('Value')
    ax.grid(True)

# Hide the trailing panels that have no country assigned
for ax in flat_axes[len(countries):]:
    ax.set_visible(False)

# Create a single legend for all the subplots (every panel uses the same labels)
handles, labels = axes[0, 0].get_legend_handles_labels()
fig.legend(handles, labels, loc='upper left', bbox_to_anchor=(1.05, 1), borderaxespad=0.)

# Adjust layout
plt.tight_layout()
plt.show()


# Columns entering the correlation analysis: generation by source,
# financial flows, GDP indicators and CO2 emissions
correlation_columns = [
    'Electricity from fossil fuels (TWh)',
    'Electricity from nuclear (TWh)',
    'Electricity from renewables (TWh)',
    'Financial flows to developing countries (US $)',
    'gdp_growth',
    'gdp_per_capita',
    'Value_co2_emissions_kt_by_country',
]
relevant_data = Europe_df[correlation_columns]


# Pairwise correlation between the selected columns
correlation_matrix = relevant_data.corr()

# Annotated heatmap of the correlation matrix
plt.figure(figsize=(16, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    fmt=".2f",
    linewidths=.5,
)
plt.title('Correlation Matrix of Electricity Generation and CO2 Emissions')
plt.show()


# Predictors: electricity generated per source; target: CO2 emissions
feature_columns = [
    'Electricity from fossil fuels (TWh)',
    'Electricity from nuclear (TWh)',
    'Electricity from renewables (TWh)',
]
target_column = 'Value_co2_emissions_kt_by_country'

X_data = Europe_df[feature_columns]
y_data = Europe_df[target_column]

# Hold out 40% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_data,
    test_size=0.4,
    random_state=42,
)

# Fit the Multiple Linear Regression model on the training portion
model = LinearRegression().fit(X_train, y_train)

# Predictions for the held-out rows
y_pred = model.predict(X_test)

# Report the fitted parameters: one coefficient per feature, in the order of
# feature_columns, plus the intercept
print('Model Coefficients for Europe data :', model.coef_)
print('Model Intercept Europe data:', model.intercept_)

Model Coefficients for Europe data : [1799.96574346  561.84445698 -177.37769508]
Model Intercept Europe data: 7385.190645998897


# Scatter plot of actual vs predicted CO2 emissions on the test set
plt.figure(figsize=(16, 10))
plt.scatter(y_test, y_pred, alpha=0.7)
plt.xlabel('Actual CO2 Emissions (kt)')
plt.ylabel('Predicted CO2 Emissions (kt)')
plt.title('Actual vs Predicted CO2 Emissions')

# Reference diagonal (predicted == actual). Its end points are taken from the
# values being plotted (y_test / y_pred) so the line always spans this
# scatter and does not depend on any other variable in the notebook.
line_min = min(y_test.min(), y_pred.min())
line_max = max(y_test.max(), y_pred.max())
plt.plot([line_min, line_max], [line_min, line_max], 'k--', lw=4)  # Line showing perfect predictions
plt.show()


# Mean Squared Error and R-squared of the linear model on the test set.
# y_test / y_pred come from the regression fitted on Europe_df, so the
# printed labels name Europe.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error Europe data: {mse}")
print(f"R-squared: {r2}")


# RMSE is the square root of the MSE computed above. Taking the root directly
# avoids the `squared=False` keyword, which newer scikit-learn releases no
# longer accept.
rmse = np.sqrt(mse)
print("Root Mean Squared Error (RMSE) Europe data:", rmse)

Mean Squared Error Asian data: 902622519.3142585
R-squared: 0.9688854265701723
Root Mean Squared Error (RMSE) Asian data: 30043.6768607682


# Random Forest Regressor with a fixed seed so results are reproducible
rf_regressor = RandomForestRegressor(random_state=42)
# Train on the same train split used for the linear model
rf_regressor.fit(X_train, y_train)
# Predict on the held-out test set
rf_predictions = rf_regressor.predict(X_test)


# Mean Squared Error
rf_mse = mean_squared_error(y_test, rf_predictions)
print("Random Forest Mean Squared Error:", rf_mse)
# R-squared (R²)
# NOTE: `r2` and `rmse` below rebind the names that held the linear model's scores
r2 = r2_score(y_test, rf_predictions)
print("R-squared (R²):", r2)
# Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, rf_predictions)
print("Mean Absolute Error (MAE):", mae)
# Root Mean Squared Error (RMSE): square root of the MSE above. Computed
# directly because newer scikit-learn releases no longer accept the
# `squared=False` keyword of mean_squared_error.
rmse = np.sqrt(rf_mse)
print("Root Mean Squared Error (RMSE):", rmse)

Random Forest Mean Squared Error: 1218390929.9647906
R-squared (R²): 0.9580004783334828
Mean Absolute Error (MAE): 11358.713379156494
Root Mean Squared Error (RMSE): 34905.457022717674

	Entity	Year	Access to electricity (% of population)	Access to clean fuels for cooking	Renewable-electricity-generating-capacity-per-capita	Financial flows to developing countries (US $)	Renewable energy share in the total final energy consumption (%)	Electricity from fossil fuels (TWh)	Electricity from renewables (TWh)	Low-carbon electricity (% electricity)	Primary energy consumption per capita (kWh/person)	Energy intensity level of primary energy (MJ/$2017 PPP GDP)	Value_co2_emissions_kt_by_country	Renewables (% equivalent primary energy)	gdp_growth	gdp_per_capita	Density\n(P/Km2)	Land Area(Km2)	Latitude	Longitude
3644	Zimbabwe	2016	42.561730	29.8	62.88	30000.0	81.90	3.50	3.32	48.680350	3227.6802	10.00	11020.00046	NaN	0.755869	1464.588957	38	390757.0	-19.015438	29.154857
3645	Zimbabwe	2017	44.178635	29.8	62.33	5570000.0	82.46	3.05	4.30	58.503407	3068.0115	9.51	10340.00015	NaN	4.709492	1235.189032	38	390757.0	-19.015438	29.154857
3646	Zimbabwe	2018	45.572647	29.9	82.53	10000.0	80.23	3.73	5.46	59.412407	3441.9858	9.83	12380.00011	NaN	4.824211	1254.642265	38	390757.0	-19.015438	29.154857
3647	Zimbabwe	2019	46.781475	30.1	81.40	250000.0	81.50	3.66	4.58	55.582527	3003.6553	10.47	11760.00023	NaN	-6.144236	1316.740657	38	390757.0	-19.015438	29.154857
3648	Zimbabwe	2020	52.747670	30.4	80.61	30000.0	81.90	3.40	4.19	55.204216	2680.1318	10.00	NaN	NaN	-6.248748	1214.509820	38	390757.0	-19.015438	29.154857

	Entity	Year	Access to electricity (% of population)	Access to clean fuels for cooking	Financial flows to developing countries (US $)	Electricity from fossil fuels (TWh)	Primary energy consumption per capita (kWh/person)	Energy intensity level of primary energy (MJ/$2017 PPP GDP)	Value_co2_emissions_kt_by_country	Renewables (% equivalent primary energy)	gdp_growth	gdp_per_capita	Density\n(P/Km2)	Land Area(Km2)	Latitude	Longitude
0	Antigua and Barbuda	2000	97.689260	100.0	NaN	0.14	28457.223	3.13	330.000000	NaN	6.203431	10872.29295	223	443.0	17.060816	-61.796428
1	Antigua and Barbuda	2001	97.785255	100.0	NaN	0.16	28025.290	3.31	350.000000	NaN	-4.548003	10367.31961	223	443.0	17.060816	-61.796428
2	Antigua and Barbuda	2002	100.000000	100.0	NaN	0.18	29660.344	3.59	389.999986	NaN	1.027442	10401.05088	223	443.0	17.060816	-61.796428
3	Antigua and Barbuda	2003	97.956825	100.0	NaN	0.20	30824.252	3.63	409.999996	NaN	6.076544	10797.95106	223	443.0	17.060816	-61.796428
4	Antigua and Barbuda	2004	98.037100	100.0	NaN	0.21	31622.059	3.57	400.000006	NaN	5.767029	11446.96914	223	443.0	17.060816	-61.796428

		Access to electricity (% of population)	Access to clean fuels for cooking	Renewable-electricity-generating-capacity-per-capita	Financial flows to developing countries (US $)	Renewable energy share in the total final energy consumption (%)	Electricity from fossil fuels (TWh)	Electricity from nuclear (TWh)	Electricity from renewables (TWh)	Low-carbon electricity (% electricity)	Primary energy consumption per capita (kWh/person)	Energy intensity level of primary energy (MJ/$2017 PPP GDP)	Value_co2_emissions_kt_by_country	Renewables (% equivalent primary energy)	gdp_growth	gdp_per_capita	Density\n(P/Km2)	Land Area(Km2)	Latitude	Longitude
Entity	Year
Antigua and Barbuda	2000	97.689260	100.0	0.0	NaN	0.0	0.14	0.0	0.0	0.0	28457.223	3.13	330.000000	NaN	6.203431	10872.29295	223	443.0	17.060816	-61.796428
	2001	97.785255	100.0	0.0	NaN	0.0	0.16	0.0	0.0	0.0	28025.290	3.31	350.000000	NaN	-4.548003	10367.31961	223	443.0	17.060816	-61.796428
	2002	100.000000	100.0	0.0	NaN	0.0	0.18	0.0	0.0	0.0	29660.344	3.59	389.999986	NaN	1.027442	10401.05088	223	443.0	17.060816	-61.796428
	2003	97.956825	100.0	0.0	NaN	0.0	0.20	0.0	0.0	0.0	30824.252	3.63	409.999996	NaN	6.076544	10797.95106	223	443.0	17.060816	-61.796428
	2004	98.037100	100.0	0.0	NaN	0.0	0.21	0.0	0.0	0.0	31622.059	3.57	400.000006	NaN	5.767029	11446.96914	223	443.0	17.060816	-61.796428

	Access to electricity (% of population)	Access to clean fuels for cooking	Renewable-electricity-generating-capacity-per-capita	Financial flows to developing countries (US $)	Renewable energy share in the total final energy consumption (%)	Electricity from fossil fuels (TWh)	Electricity from nuclear (TWh)	Electricity from renewables (TWh)	Low-carbon electricity (% electricity)	Primary energy consumption per capita (kWh/person)	Energy intensity level of primary energy (MJ/$2017 PPP GDP)	Value_co2_emissions_kt_by_country	Renewables (% equivalent primary energy)	gdp_growth	gdp_per_capita	Density\n(P/Km2)	Land Area(Km2)	Latitude	Longitude
count	483.000000	483.000000	483.000000	4.830000e+02	483.000000	483.000000	483.000000	483.000000	483.000000	483.000000	483.000000	4.830000e+02	483.000000	483.000000	483.000000	483.000000	4.830000e+02	483.000000	483.000000
mean	91.237958	81.656004	86.770683	1.903126e+07	21.572712	136.656812	38.618137	41.593934	29.428154	26587.691064	4.304741	2.671328e+05	1.781932	1.900436	8961.279285	188.913043	9.793458e+05	18.481245	-77.149814
std	13.664655	24.773086	121.423265	6.717739e+07	21.447954	559.528519	161.532173	128.185234	29.014980	35566.004222	3.732953	1.064432e+06	6.002859	4.070652	12683.050130	154.419152	2.786825e+06	9.975725	14.103692
min	31.542381	2.600000	0.000000	0.000000e+00	0.000000	0.030000	0.000000	0.000000	0.000000	836.080400	0.000000	0.000000e+00	0.000000	-20.192371	0.000000	4.000000	2.610000e+02	8.537981	-106.346771
25%	88.013810	82.450000	2.375000	0.000000e+00	5.360000	0.265000	0.000000	0.010000	0.490316	8666.838500	2.560000	1.950000e+02	0.000000	0.000000	1489.301724	58.000000	6.160000e+02	12.984305	-88.497650
50%	96.802520	89.600000	52.620000	0.000000e+00	12.560000	1.890000	0.000000	0.450000	20.952381	11274.687000	3.540000	5.240000e+03	0.000000	2.202086	4718.436581	167.000000	2.296600e+04	15.783471	-77.781167
75%	100.000000	100.000000	110.170000	1.415000e+06	32.775000	6.515000	0.000000	4.120000	52.462604	26481.503500	4.900000	1.577500e+04	0.000000	4.109209	10330.594430	284.000000	1.108600e+05	18.971187	-61.679000
max	100.000000	100.000000	622.840000	5.963700e+08	82.190000	2988.240000	809.410000	821.400000	99.121710	165831.280000	21.600000	5.775810e+06	30.542150	14.440990	65279.529030	668.000000	9.984670e+06	56.130366	-59.543198

	Entity	Year	Access to electricity (% of population)	Access to clean fuels for cooking	Renewable-electricity-generating-capacity-per-capita	Financial flows to developing countries (US $)	Renewable energy share in the total final energy consumption (%)	Electricity from fossil fuels (TWh)	Electricity from renewables (TWh)	Low-carbon electricity (% electricity)	Primary energy consumption per capita (kWh/person)	Energy intensity level of primary energy (MJ/$2017 PPP GDP)	Value_co2_emissions_kt_by_country	Renewables (% equivalent primary energy)	gdp_growth	gdp_per_capita	Density\n(P/Km2)	Land Area(Km2)	Latitude	Longitude
0	Afghanistan	2000	1.613591	6.2	9.22	20000.0	44.99	0.16	0.31	65.957440	302.59482	1.64	760.000000	NaN	NaN	NaN	60	652230.0	33.93911	67.709953
1	Afghanistan	2001	4.074574	7.2	8.86	130000.0	45.60	0.09	0.50	84.745766	236.89185	1.74	730.000000	NaN	NaN	NaN	60	652230.0	33.93911	67.709953
2	Afghanistan	2002	9.409158	8.2	8.47	3950000.0	37.83	0.13	0.56	81.159424	210.86215	1.40	1029.999971	NaN	NaN	179.426579	60	652230.0	33.93911	67.709953
3	Afghanistan	2003	14.738506	9.5	8.09	25970000.0	36.66	0.31	0.63	67.021280	229.96822	1.40	1220.000029	NaN	8.832278	190.683814	60	652230.0	33.93911	67.709953
4	Afghanistan	2004	20.064968	10.9	7.75	NaN	44.24	0.33	0.56	62.921350	204.23125	1.20	1029.999971	NaN	1.414118	211.382074	60	652230.0	33.93911	67.709953

Trends in Renewable Energy Adoption

ETL (Extract, Transform, and Load)

	Entity	Year	Access to electricity (% of population)	Access to clean fuels for cooking	Renewable-electricity-generating-capacity-per-capita	Financial flows to developing countries (US $)	Renewable energy share in the total final energy consumption (%)	Electricity from fossil fuels (TWh)	Electricity from renewables (TWh)	Low-carbon electricity (% electricity)	Primary energy consumption per capita (kWh/person)	Energy intensity level of primary energy (MJ/$2017 PPP GDP)	Value_co2_emissions_kt_by_country	Renewables (% equivalent primary energy)	gdp_growth	gdp_per_capita	Density\n(P/Km2)	Land Area(Km2)	Latitude	Longitude
0	Albania	2000	100.0	38.2	NaN	NaN	NaN	0.14	4.55	97.014930	9029.4375	4.13	3170.000000	NaN	6.946217	1126.683340	105	28748.0	41.153332	20.168331
1	Albania	2001	100.0	40.5	NaN	NaN	NaN	0.13	3.52	96.438354	8635.5320	3.89	3230.000000	NaN	8.293313	1281.659826	105	28748.0	41.153332	20.168331
2	Albania	2002	100.0	43.2	NaN	NaN	NaN	0.16	3.48	95.604390	9443.5550	4.10	3759.999990	NaN	4.536524	1425.124219	105	28748.0	41.153332	20.168331
3	Albania	2003	100.0	46.4	NaN	NaN	NaN	0.10	5.12	98.084300	10756.6120	3.80	4070.000172	NaN	5.528637	1846.120121	105	28748.0	41.153332	20.168331
4	Albania	2004	100.0	49.0	NaN	NaN	NaN	0.13	5.41	97.653430	11586.9510	3.96	4250.000000	NaN	5.514668	2373.581292	105	28748.0	41.153332	20.168331