# -*- coding: utf-8 -*-
"""income_disparity.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1upuHuQ3gWDkpbvkvHl2uTQlSv20JZnf2
"""


#!pip install pandas-datareader
import wbdata
import datetime
import numpy as np
import pandas as pd
from pandas_datareader import wb
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression as lr
from matplotlib.pyplot import MultipleLocator


# =============================================================================
# # Part 1: API Integration
# =============================================================================

# =============================================================================
# # API method 1: using wbdata module
# =============================================================================

# #searching for countries index using names
# print(wbdata.search_countries('United Kingdom'))

# Country codes requested from the World Bank API
countries = [
    "USA", "BEL", "BRA", "COL", "FRA", "DEU",
    "GRC", "IDN", "IRL", "MEX", "NLD", "RUS",
]

# (start, end) pair bounding the observation window
dates = (datetime.datetime(2008, 1, 1), datetime.datetime(2018, 1, 1))

# Indicator code -> human-readable label
indicators = {
    'SI.DST.05TH.20': 'Income share held by highest 20%',
    'SI.DST.FRST.20': 'Income share held by lowest 20%',
    'SL.EMP.TOTL.SP.FE.NE.ZS': 'Employment to population ratio, 15+, female (%) (national estimate)',
    'SL.EMP.TOTL.SP.MA.NE.ZS': 'Employment to population ratio, 15+, male (%) (national estimate)',
}

# Fetch all indicators for the listed countries over the window
raw_data = wbdata.get_dataframe(
    indicators,
    country=countries,
    data_date=dates,
    convert_date=True,
)

# Move the outermost index level into the columns
raw_unstacked_data = raw_data.unstack(level=0)

# printing our data object
# print(raw_data)
# print(raw_unstacked_data)

# =============================================================================
# # API method 2: using from pandas.datareader import wb, convert the data object to a DataFrame 
# =============================================================================

# Cap how many columns/rows pandas shows when a DataFrame is printed
pd.set_option('display.max_columns', 15)
pd.set_option('display.max_rows', 15)

# Download the indicators for the listed countries, 2008 through 2018
df1 = wb.download(indicator=indicators, country=countries, start=2008, end=2018)
# Years 2008..2018 inclusive; reused when the per-country dictionaries are built
date_period = list(range(2008, 2019))
print(df1)

# df2 is a renamed copy, so df1 keeps the raw indicator codes and raw values.
# The `indicators` mapping (code -> label) is reused instead of repeating it.
df2 = df1.rename(columns=indicators)

# Data manipulation: replace each missing value with its column mean.
# NOTE: the mean is taken over every row of the frame (all countries and
# years), unlike country_DataFrame_to_list, which uses a per-country mean.
df2.fillna(df2.mean(), inplace=True)
print(df2)

# Overview of the edited DataFrame: basic descriptive statistics
print(df2.describe())


# =============================================================================
# # Part 2: Data structure set up
# =============================================================================

# =============================================================================
# # creating our Data Structure type I
# =============================================================================

# step I: convert DataFrame to a list in correct order from 2008 to 2018
def country_DataFrame_to_list(country, target_data):
  """Download one indicator for one country and return it as a list.

  The data for 2008 through 2018 is fetched with ``wb.download``, missing
  values are replaced by the column mean of that download, and the first
  column is rounded to two decimals.

  The list is returned in reverse row order; this presumably turns a
  newest-first download into chronological order -- TODO confirm against
  the row order ``wb.download`` actually returns.
  """
  frame = wb.download(indicator=target_data, country=country, start=2008, end=2018)
  frame.fillna(frame.mean(), inplace=True)
  first_column = frame.columns[0]
  values = frame[first_column].tolist()
  values.reverse()
  return [round(value, 2) for value in values]

# step II: make a list of tuple, which is a good way to save our data
def country_tuples(country_list, time):
  """Pair the two sequences element by element into a list of 2-tuples.

  Pairing stops at the end of the shorter sequence.

  Note: this file calls it as ``country_tuples(date_period, values)``, so in
  practice ``country_list`` carries the years and ``time`` the data values.
  """
  return [(first, second) for first, second in zip(country_list, time)]

# additional gap calculation for calculating the gap between two list
def gap_between(toplist, lowlist):
  """Return ``toplist[i] - lowlist[i]`` for every index, rounded to 2 decimals.

  The result has one entry per element of ``toplist``; ``lowlist`` is read
  by index, so it must be at least as long (extra elements are ignored).
  """
  return [round(top - lowlist[idx], 2) for idx, top in enumerate(toplist)]


# step IV: Make a dictionary of list of tuple, which is one of our data structure of this project,
# named as Data Structure type I.
def object_Dictionary(country_list, object_target, date_period):
  """Build Data Structure type I: ``{country: [(year, value), ...]}``.

  For each country in ``country_list`` the indicator ``object_target`` is
  fetched through ``country_DataFrame_to_list`` and paired with
  ``date_period`` through ``country_tuples``.
  """
  return {
      code: country_tuples(date_period, country_DataFrame_to_list(code, object_target))
      for code in country_list
  }

# step V: start to build: 
    
    
# Income share held by highest 20%
Top_20_df = object_Dictionary(
    country_list=countries, object_target='SI.DST.05TH.20', date_period=date_period)

# Income share held by lowest 20%
Low_20_df = object_Dictionary(
    country_list=countries, object_target='SI.DST.FRST.20', date_period=date_period)

# Employment to population ratio, 15+, female (%) (national estimate)
female_employ_df = object_Dictionary(
    country_list=countries, object_target='SL.EMP.TOTL.SP.FE.NE.ZS', date_period=date_period)

# Employment to population ratio, 15+, male (%) (national estimate)
male_employ_df = object_Dictionary(
    country_list=countries, object_target='SL.EMP.TOTL.SP.MA.NE.ZS', date_period=date_period)


# =============================================================================
# # creating our Data Structure type II: convert our Data Structure type I to type II
# =============================================================================
# step 1: write a function that can unpack dictionary of tuple to a new dictionary of simple list, and calculate the gap
def no_tuple_dic(object_Dictionary1, object_Dictionary2):
  """Return per-country lists of value gaps between two Type I dictionaries.

  Both arguments map a country to a list of ``(year, value)`` tuples. For
  every country in ``object_Dictionary1`` the values are subtracted position
  by position (first minus second) and rounded to two decimals. The years
  are dropped on purpose, so the result is ``{country: [gap, ...]}``.

  The countries and the series length come from the inputs themselves
  rather than from a module-level country list and a fixed count of 11, so
  any date range works. Entries are paired by position, not by year, and
  pairing stops at the shorter of the two series.

  Raises:
    KeyError: if a country of ``object_Dictionary1`` is missing from
      ``object_Dictionary2``.
  """
  new_dict = {}
  for country, series1 in object_Dictionary1.items():
    series2 = object_Dictionary2[country]
    new_dict[country] = [
        round(entry1[1] - entry2[1], 2)
        for entry1, entry2 in zip(series1, series2)
    ]
  return new_dict

# step 2: income gap per country = highest-20% share minus lowest-20% share
income_gap_dict = no_tuple_dic(Top_20_df, Low_20_df)

# step 3: Data Structure type II -- one DataFrame column per country
income_gap_dict_df = pd.DataFrame(income_gap_dict, columns=countries)

# step 4: descriptive statistics of the income gap, rounded to 2 decimals
print(income_gap_dict_df.describe().round(2))

# Same steps for the employment gap: male ratio minus female ratio
gender_gap_dict = no_tuple_dic(male_employ_df, female_employ_df)

gender_gap_dict_df = pd.DataFrame(gender_gap_dict, columns=countries)
print(gender_gap_dict_df.describe().round(2))


# Data Structure function application

# This function is to calculate the difference of the gap between income share held by highest 20% and income share held by lowest 20%
def gap_income_Dataframe(country):
  """Return ``{year: income gap}`` for one country.

  The gap is the ``Top_20_df`` value minus the ``Low_20_df`` value at the
  same list position, rounded to two decimals. A position is skipped when
  the two entries carry different years.
  """
  gap = {}
  for idx, (top_year, top_value) in enumerate(Top_20_df[country]):
    low_year, low_value = Low_20_df[country][idx]
    if top_year != low_year:
      continue
    gap[top_year] = round(top_value - low_value, 2)
  return gap

# This function is to calculate the difference of the gap between male employment population and female employment population
def gap_gender_Dataframe(country):
  """Return ``{year: employment gap}`` for one country.

  The gap is the ``male_employ_df`` value minus the ``female_employ_df``
  value at the same list position, rounded to two decimals. A position is
  skipped when the two entries carry different years.

  The loop is driven by the male employment series itself; it previously
  took its length from the unrelated income data in ``Top_20_df``.
  """
  gap = {}
  for idx, (male_year, male_value) in enumerate(male_employ_df[country]):
    female_year, female_value = female_employ_df[country][idx]
    if male_year == female_year:
      gap[male_year] = round(male_value - female_value, 2)
  return gap

# This function searches a data dictionary for a specific country and year
def searching_data(object_Dictionary, country, year):
  """Look up the entries of one country for one year.

  Args:
    object_Dictionary: mapping ``{country: [(year, value), ...]}``.
    country: key to look up in ``object_Dictionary``.
    year: year to match against the first element of each tuple.

  Returns:
    A list of the matching ``(year, value)`` tuples. The list is empty when
    the country is not in the dictionary or has no entry for that year.

  The lookup relies only on the dictionary passed in, not on the
  module-level country list or a fixed series length of 11.
  """
  return [item for item in object_Dictionary.get(country, []) if item[0] == year]


# =============================================================================
# # Part 3: Plotting the data set
# =============================================================================


# =============================================================================
# #plot 1: Income gap from 2008 to 2018
# =============================================================================

from matplotlib.pyplot import MultipleLocator
plt.title('Income gap from 2008 to 2018')
plt.xlabel('Year')
plt.ylabel('Income gap%')

# One {year: gap} dictionary per country, kept for the average line below
all_data_i = []
for c in countries:
  gap_i = gap_income_Dataframe(c)
  # Turn the dict views into plain lists before handing them to matplotlib
  x_i = list(gap_i.keys())
  y_i = list(gap_i.values())
  all_data_i.append(gap_i)
  plt.scatter(x_i, y_i, marker='+', label=c)

x_major_locator = MultipleLocator(1)   # one x tick per year
y_major_locator = MultipleLocator(2)   # one y tick every 2 units
ax = plt.gca()
ax.xaxis.set_major_locator(x_major_locator)
ax.yaxis.set_major_locator(y_major_locator)
plt.xlim(2007, 2019)   # one year of padding on each side of 2008-2018
plt.ylim(25, 60)       # y axis runs from 25 to 60

# Cross-country average per year. Each year must be present for every
# country, otherwise the lookup raises KeyError.
xr_i = list(range(2008, 2019))
yr_i = [sum(gap[year] for gap in all_data_i) / len(countries) for year in xr_i]
plt.plot(xr_i, yr_i, "r-", label='average')

# Build the legend once, after every labelled artist has been added
plt.legend(loc=2, bbox_to_anchor=(1.05, 1.0), borderaxespad=0.)
plt.savefig('Income gap.pdf')
plt.show()

# =============================================================================
# #plot 2: Gender Employment rate gap from 2008 to 2018
# =============================================================================

plt.title('Gender Employment rate gap from 2008 to 2018')
plt.xlabel('Year')
plt.ylabel('Gender Employment Gap %')

# One {year: gap} dictionary per country, kept for the average line below
all_data_j = []
for c in countries:
  gap_j = gap_gender_Dataframe(c)
  # Turn the dict views into plain lists before handing them to matplotlib
  x_j = list(gap_j.keys())
  y_j = list(gap_j.values())
  all_data_j.append(gap_j)
  plt.scatter(x_j, y_j, marker='+', label=c)

x_major_locator = MultipleLocator(1)   # one x tick per year
y_major_locator = MultipleLocator(2)   # one y tick every 2 units
ax = plt.gca()
ax.xaxis.set_major_locator(x_major_locator)
ax.yaxis.set_major_locator(y_major_locator)
plt.xlim(2007, 2019)   # one year of padding on each side of 2008-2018
plt.ylim(6, 38)        # y axis runs from 6 to 38

# Cross-country average per year. Each year must be present for every
# country, otherwise the lookup raises KeyError.
xr_j = list(range(2008, 2019))
yr_j = [sum(gap[year] for gap in all_data_j) / len(countries) for year in xr_j]
plt.plot(xr_j, yr_j, "r-", label='average')

# Build the legend once, after every labelled artist has been added
plt.legend(loc=2, bbox_to_anchor=(1.05, 1.0), borderaxespad=0.)
plt.show()

# =============================================================================
# #boxplot 1 income gap
# =============================================================================

plt.figure(figsize=(9, 6), dpi=60)

# Parallel lists: country codes and each country's list of yearly income gaps.
# (The previous zip-based unpacking was overwritten straight away, so only
# this form is kept.)
labels = list(income_gap_dict.keys())
data = list(income_gap_dict.values())

plt.title('Income Gap from 2008 to 2018')
plt.xlabel('Country')
plt.ylabel('Income Gap %')
plt.boxplot(data)
# Boxes sit at positions 1..len(labels); label each with its country code
plt.xticks(range(1, len(labels) + 1), labels)
plt.show()

# =============================================================================
# #boxplot 2 gender employment gap
# =============================================================================

plt.figure(figsize=(9, 6), dpi=60)

# Parallel lists: country codes and each country's list of yearly employment
# gaps. (The previous zip-based unpacking was overwritten straight away, so
# only this form is kept.)
labels = list(gender_gap_dict.keys())
data = list(gender_gap_dict.values())

plt.title('Gender Employment Gap')
plt.xlabel('Country')
plt.ylabel('Gender Employment Gap %')
plt.boxplot(data)
# Boxes sit at positions 1..len(labels); label each with its country code
plt.xticks(range(1, len(labels) + 1), labels)
plt.show()

# =============================================================================
# #Part 4: linear regression
# =============================================================================

import numpy as np
from sklearn.linear_model import LinearRegression

# Convert the original data frame to list
def convert_to_target_data_dict(country_list):
    """Collect the yearly gap values of each country as plain lists.

    Returns ``{country: {"income": [...], "gender": [...]}}``. Both lists
    follow the year order of the income-gap dictionary, so every year found
    there must also exist in the gender-gap dictionary (KeyError otherwise).
    """
    converted_dict = {}

    for country_name in country_list:
        income_by_year = gap_income_Dataframe(country_name)
        gender_by_year = gap_gender_Dataframe(country_name)
        years = list(income_by_year)

        converted_dict[country_name] = {
            "income": [income_by_year[year] for year in years],
            "gender": [gender_by_year[year] for year in years],
        }

    return converted_dict


# Work out the x-coordinates for linear regression
def x_coordinate():
    """Return the regression x-coordinates: the eleven years 2008..2018."""
    first_year = 2008
    return list(range(first_year, first_year + 11))


# Work out the linear regression for single country
def linear_regression(contry_name, coordinate_dict, data_type, predict_time):
    """Fit a straight line to one country's series and predict one year.

    Args:
        contry_name: key of the country in ``coordinate_dict``.
        coordinate_dict: ``{country: {data_type: [values...]}}``.
        data_type: which series of that country to fit.
        predict_time: the x value (year) to predict for.

    Returns:
        The first (only) element of the model's prediction for ``predict_time``.

    The x values come from ``x_coordinate()``, so the series must have the
    same number of points as that list.
    """
    observed = np.array(coordinate_dict[contry_name][data_type])
    # scikit-learn expects a 2-D feature matrix: one column, one row per year
    years = np.array(x_coordinate()).reshape((-1, 1))

    fitted = LinearRegression().fit(years, observed)

    target = np.array([predict_time]).reshape((-1, 1))
    return fitted.predict(target)[0]


# Work out the final predicted result for the income and gender gap of 2030
def total_linear_regression_result(y_coordinate_dict):
    """Predict the 2030 income and gender gap for every country.

    Returns ``{country: {"income": prediction, "gender": prediction}}`` with
    one entry per key of ``y_coordinate_dict``.
    """
    return {
        country: {
            "income": linear_regression(country, y_coordinate_dict, "income", 2030),
            "gender": linear_regression(country, y_coordinate_dict, "gender", 2030),
        }
        for country in y_coordinate_dict
    }


# Calculate the average income & gender gap of 2030
def calculate_average_gap(result_dict, country_list):
    """Average the predicted income and gender gaps.

    The "income" and "gender" values of every entry in ``result_dict`` are
    summed, and each sum is divided by ``len(country_list)``.

    Returns:
        ``{"average_income_gap": ..., "average_gender_gap": ...}``
    """
    predictions = result_dict.values()
    income_total = sum(entry["income"] for entry in predictions)
    gender_total = sum(entry["gender"] for entry in predictions)
    country_count = len(country_list)

    return {
        "average_income_gap": income_total / country_count,
        "average_gender_gap": gender_total / country_count,
    }


# Compare the average value with our liner regression result
# print the list of countries which higher or lower than our average prediction, or even equal
def compare_with_the_average(average_dict, result_dict):
    """Sort countries by how their predictions compare with the averages.

    Args:
        average_dict: holds "average_income_gap" and "average_gender_gap".
        result_dict: ``{country: {"income": value, "gender": value}}``.

    Returns:
        A dictionary of six lists of country keys: higher than, lower than
        and equal to the average, for income and then for gender. A value
        that is neither greater, smaller nor equal (e.g. NaN) is not listed.
    """
    compare_result_dict = {
        "higher_than_income_average": [],
        "lower_than_income_average": [],
        "equal_to_income_average": [],
        "higher_than_gender_average": [],
        "lower_than_gender_average": [],
        "equal_to_gender_average": [],
    }

    # (value key, average key, bucket if higher, bucket if lower, bucket if equal)
    metrics = (
        ("income", "average_income_gap",
         "higher_than_income_average", "lower_than_income_average", "equal_to_income_average"),
        ("gender", "average_gender_gap",
         "higher_than_gender_average", "lower_than_gender_average", "equal_to_gender_average"),
    )

    for country, prediction in result_dict.items():
        for value_key, average_key, higher, lower, equal in metrics:
            value = prediction[value_key]
            average = average_dict[average_key]
            if value > average:
                compare_result_dict[higher].append(country)
            elif value < average:
                compare_result_dict[lower].append(country)
            elif value == average:
                compare_result_dict[equal].append(country)

    return compare_result_dict


def main():
    """Run the regression pipeline for ``countries`` and print its results.

    Returns a 3-tuple: the per-country 2030 predictions, the average gaps,
    and the comparison of each country against those averages.
    """
    # Per-country series -> per-country 2030 predictions
    predictions = total_linear_regression_result(convert_to_target_data_dict(countries))

    # Average predicted income & gender gap
    averages = calculate_average_gap(predictions, countries)

    # Which countries end up above / below / at the average
    comparison = compare_with_the_average(averages, predictions)

    # Print the three results separated by a blank line
    print(predictions, averages, comparison, sep="\n\n")
    return predictions, averages, comparison


# Run the pipeline unconditionally. The statements below and the prediction
# plots further down read these three names at module level, so binding them
# only under an `if __name__ == "__main__":` guard made the file fail with
# NameError whenever it was imported instead of run.
linear_regression_result_dict, average_gap_result, compare_with_average = main()


# Overview of our linear regression result
print()
print(linear_regression_result_dict)


# =============================================================================
# #part 5: plot the figure with our prediction with comparison
# =============================================================================

# Commented out IPython magic to ensure Python compatibility.
# =============================================================================
# #plot 1 for income gap with prediction in 2030
# =============================================================================
# %matplotlib inline
from matplotlib.pyplot import MultipleLocator
plt.figure(figsize=(12, 6), dpi=60)
plt.title('Prediction of Income Gap in 2030')
plt.xlabel('Year')
plt.ylabel('Income gap%')

# x positions: the observed years 2008..2018 plus the predicted year 2030
xr_x = list(range(2008, 2019))
xr_x.append(2030)

# One {year: gap} dictionary per country, including the 2030 prediction
all_data_i = []
for c in countries:
  gap_i = gap_income_Dataframe(c)
  # File the prediction under the year it is plotted at (it used to be
  # stored under 2019 while being drawn at 2030).
  gap_i[2030] = linear_regression_result_dict[c]["income"]
  all_data_i.append(gap_i)
  y_i = [gap_i[year] for year in xr_x]
  plt.scatter(xr_x, y_i, marker='+', label=c)

x_major_locator = MultipleLocator(1)   # one x tick per year
y_major_locator = MultipleLocator(2)   # one y tick every 2 units
ax = plt.gca()
ax.xaxis.set_major_locator(x_major_locator)
ax.yaxis.set_major_locator(y_major_locator)
plt.xlim(2007, 2031)   # one year of padding on each side of 2008-2030
plt.ylim(25, 60)       # y axis runs from 25 to 60

# Cross-country average for every plotted year, prediction included
yr_i = [sum(gap[year] for gap in all_data_i) / len(countries) for year in xr_x]

plt.plot(xr_x, yr_i, "r-", label='average')
# Build the legend once, after every labelled artist has been added
plt.legend(loc=2, bbox_to_anchor=(1.05, 1.0), borderaxespad=0.)
# NOTE(review): the earlier income-gap scatter plot saves to this same file
# name, so this call overwrites that figure -- confirm that is intended.
plt.savefig('Income gap.pdf')
plt.show()


# =============================================================================
# #plot 2 for gender gap with prediction in 2030
# =============================================================================
plt.figure(figsize=(12, 6), dpi=60)
plt.title('Prediction of Gender Employment Gap in 2030')
plt.xlabel('Year')
plt.ylabel('Gender Employment Gap %')

# x positions: the observed years 2008..2018 plus the predicted year 2030
xr_x = list(range(2008, 2019))
xr_x.append(2030)

# One {year: gap} dictionary per country, including the 2030 prediction
all_data_j = []
for c in countries:
  gap_j = gap_gender_Dataframe(c)
  # File the prediction under the year it is plotted at (it used to be
  # stored under 2019 while being drawn at 2030).
  gap_j[2030] = linear_regression_result_dict[c]["gender"]
  all_data_j.append(gap_j)
  y_j = [gap_j[year] for year in xr_x]
  plt.scatter(xr_x, y_j, marker='+', label=c)

x_major_locator = MultipleLocator(1)   # one x tick per year
y_major_locator = MultipleLocator(2)   # one y tick every 2 units
ax = plt.gca()
ax.xaxis.set_major_locator(x_major_locator)
ax.yaxis.set_major_locator(y_major_locator)
plt.xlim(2007, 2031)   # one year of padding on each side of 2008-2030
plt.ylim(2, 38)        # y axis runs from 2 to 38

# Cross-country average for every plotted year, prediction included
yr_j = [sum(gap[year] for gap in all_data_j) / len(countries) for year in xr_x]

plt.plot(xr_x, yr_j, "r-", label='average')
# Build the legend once, after every labelled artist has been added
plt.legend(loc=2, bbox_to_anchor=(1.05, 1.0), borderaxespad=0.)
plt.show()