income_disparity_final_version_2.py 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580
  1. # -*- coding: utf-8 -*-
  2. """income_disparity.ipynb
  3. Automatically generated by Colaboratory.
  4. Original file is located at
  5. https://colab.research.google.com/drive/1upuHuQ3gWDkpbvkvHl2uTQlSv20JZnf2
  6. """
  7. #!pip install pandas-datareader
  8. import wbdata
  9. import datetime
  10. import numpy as np
  11. import pandas as pd
  12. from pandas_datareader import wb
  13. import matplotlib.pyplot as plt
  14. from sklearn.linear_model import LinearRegression as lr
  15. from matplotlib.pyplot import MultipleLocator
  16. # =============================================================================
  17. # # Part 1: API Integration
  18. # =============================================================================
  19. # =============================================================================
  20. # # API method 1: using wbdata module
  21. # =============================================================================
  22. # #searching for countries index using names
  23. # print(wbdata.search_countries('United Kingdom'))
  24. # list of countries
  25. countries = ["USA", "BEL", "BRA", "COL", "FRA", "DEU", "GRC", "IDN", "IRL", "MEX", "NLD", "RUS"]
  26. # date period
  27. dates = datetime.datetime(2008, 1, 1), datetime.datetime(2018, 1, 1)
  28. # data object
  29. indicators = {'SI.DST.05TH.20':'Income share held by highest 20%', 'SI.DST.FRST.20': 'Income share held by lowest 20%', \
  30. 'SL.EMP.TOTL.SP.FE.NE.ZS': 'Employment to population ratio, 15+, female (%) (national estimate)',\
  31. 'SL.EMP.TOTL.SP.MA.NE.ZS': 'Employment to population ratio, 15+, male (%) (national estimate)'}
  32. # getting data from these countries
  33. raw_data = wbdata.get_dataframe(indicators, country=countries, data_date=dates, convert_date=True)
  34. raw_unstacked_data = raw_data.unstack(level=0)
  35. # printing our data object
  36. # print(raw_data)
  37. # print(raw_unstacked_data)
  38. # =============================================================================
  39. # # API method 2: using from pandas.datareader import wb, convert the data object to a DataFrame
  40. # =============================================================================
  41. # view all data
  42. pd.set_option('display.max_columns', 15)
  43. pd.set_option('display.max_rows', 15)
  44. df1 = wb.download(indicator = indicators, country = countries, start = 2008, end = 2018)
  45. date_period = [i for i in range(2008, 2019)]
  46. print(df1)
  47. # create a new DataFrame df2 for later use, not change origin values from df1 if we do some calculations for our dataframe df2
  48. # rename the columns name
  49. df2 = df1.rename(columns = {'SI.DST.05TH.20':'Income share held by highest 20%', 'SI.DST.FRST.20': 'Income share held by lowest 20%', \
  50. 'SL.EMP.TOTL.SP.FE.NE.ZS': 'Employment to population ratio, 15+, female (%) (national estimate)',\
  51. 'SL.EMP.TOTL.SP.MA.NE.ZS': 'Employment to population ratio, 15+, male (%) (national estimate)'}, inplace = False)
  52. # overview our data object DataFrame
  53. # Data manipulation: dealing with the missing value, replace them as mean(), which has less impact on our data sets
  54. df2.mean()
  55. df2.fillna(df2.mean(), inplace = True)
  56. print(df2)
  57. # Overview our new edited DataFram and get basic info of statistics
  58. print(df2.describe())
  59. # =============================================================================
  60. # # Part 2: Data structure set up
  61. # =============================================================================
  62. # =============================================================================
  63. # # creating our Data Structure type I
  64. # =============================================================================
  65. # step I: convert DataFrame to a list in correct order from 2008 to 2018
  66. def country_DataFrame_to_list(country, target_data):
  67. df = wb.download(indicator = target_data, country = country, start = 2008, end = 2018)
  68. df.fillna(df.mean(), inplace = True)
  69. df_list =df[df.columns[0]].tolist()
  70. round_list = [round(i, 2) for i in df_list ]
  71. return round_list[::-1]
  72. # step II: make a list of tuple, which is a good way to save our data
  73. def country_tuples(country_list, time):
  74. return list(zip(country_list, time))
  75. # additional gap calculation for calculating the gap between two list
  76. def gap_between(toplist, lowlist):
  77. gap_list = []
  78. for i in range(len(toplist)):
  79. gap_list.append(round((toplist[i]- lowlist[i]), 2))
  80. return gap_list
  81. # step IV: Make a dictionary of list of tuple, which is one of our data structure of this project,
  82. # named as Data Structure type I.
  83. def object_Dictionary(country_list, object_target, date_period):
  84. object_df = {}
  85. for country in country_list:
  86. object_df[country] = country_tuples(date_period, country_DataFrame_to_list(country, object_target))
  87. return object_df
  88. # step V: start to build:
  89. # This data set is for storing data of Income share held by highest 20%
  90. Top_20_df = object_Dictionary(countries, 'SI.DST.05TH.20', date_period)
  91. # This data set is for storing data of Income share held by lowest 20%
  92. Low_20_df = object_Dictionary(countries, 'SI.DST.FRST.20', date_period)
  93. # This data set is for storing data of 'Employment to population ratio, 15+, female (%) (national estimate)'
  94. female_employ_df = object_Dictionary(countries, 'SL.EMP.TOTL.SP.FE.NE.ZS', date_period)
  95. # This data set is for storing data of 'Employment to population ratio, 15+, male (%) (national estimate)'
  96. male_employ_df = object_Dictionary(countries, 'SL.EMP.TOTL.SP.MA.NE.ZS', date_period)
  97. # =============================================================================
  98. # # creating our Data Structure type II: convert our Data Structure type I to type II
  99. # =============================================================================
  100. # step 1: write a function that can unpack dictionary of tuple to a new dictionary of simple list, and calculate the gap
  101. def no_tuple_dic(object_Dictionary1, object_Dictionary2):
  102. new_dict = {}
  103. for i in countries:
  104. new_list = []
  105. for j in range(11):
  106. # The reason why I didn't use the difference function is because I don't want my new dictionary has year
  107. new_list.append(round((object_Dictionary1[i][j][1]- object_Dictionary2[i][j][1]), 2))
  108. new_dict[i] = new_list
  109. return new_dict
  110. # step 2: getting the income gap dictionary of list between income share held by highest 20% and income share held by lowest 20%
  111. income_gap_dict = no_tuple_dic(Top_20_df, Low_20_df)
  112. # step 3: create our Data structure type II, DataFrame
  113. income_gap_dict_df = pd.DataFrame(income_gap_dict, columns = countries)
  114. # step 4: show the basic statistic info of our income gap DataFrame
  115. print(round(income_gap_dict_df.describe(),2))
  116. # same step as above, to get our Data Structure type II, between male employment population and female employment population
  117. gender_gap_dict = no_tuple_dic(male_employ_df, female_employ_df)
  118. gender_gap_dict_df = pd.DataFrame(gender_gap_dict, columns = countries)
  119. print(round(gender_gap_dict_df.describe(),2))
  120. # Data Structure function application
  121. # This function is to calculate the difference of the gap between income share held by highest 20% and income share held by lowest 20%
  122. def gap_income_Dataframe(country):
  123. gap = {}
  124. for i in range(len(Top_20_df[country])):
  125. year1, data1 = Top_20_df[country][i]
  126. year2, data2 = Low_20_df[country][i]
  127. if year1 == year2:
  128. gap[year1] = round(data1-data2, 2)
  129. return gap
  130. # This function is to calculate the difference of the gap between male employment population and female employment population
  131. def gap_gender_Dataframe(country):
  132. gap = {}
  133. for i in range(len(Top_20_df[country])):
  134. year1, data1 = male_employ_df[country][i]
  135. year2, data2 = female_employ_df[country][i]
  136. if year1 == year2:
  137. gap[year1] = round(data1-data2, 2)
  138. return gap
  139. # This function searches the data of a specific country for a specific year
  140. def searching_data(object_Dictionary, country, year):
  141. country_list = []
  142. if country in countries:
  143. for i in range(11):
  144. country_list.append(object_Dictionary[country][i])
  145. output = [item for item in country_list if item[0] == year]
  146. #return empty list if data not found, return a tuple if country and year is valid
  147. return output
  148. # =============================================================================
  149. # # Part 3: Ploting the data set
  150. # =============================================================================
  151. # =============================================================================
  152. # #plot 1: Income gap from 2008 to 2018
  153. # =============================================================================
  154. from matplotlib.pyplot import MultipleLocator
  155. plt.title('Income gap from 2008 to 2018')
  156. plt.xlabel('Year')
  157. plt.ylabel('Income gap%')
  158. all_data_i = []
  159. for c in countries:
  160. gap_i = gap_income_Dataframe(c)
  161. x_i = gap_i.keys()
  162. y_i = gap_i.values()
  163. all_data_i.append(gap_i)
  164. plt.scatter(x_i,y_i,marker='+',label=c)
  165. plt.legend(loc=2,bbox_to_anchor=(1.05,1.0),borderaxespad = 0.)
  166. x_major_locator=MultipleLocator(1) #set the x interval as 1
  167. y_major_locator=MultipleLocator(2) #set the y interval as 2
  168. ax=plt.gca()
  169. ax.xaxis.set_major_locator(x_major_locator) #Set the major scale of the x-axis to a multiple of 1
  170. ax.yaxis.set_major_locator(y_major_locator) #Set the major scale of the y-axis to a multiple of 2
  171. plt.xlim(2007,2019) #Set the x scale range of the x-axis from 2008 to 2018, the reason why I use 2019 is because we can see clearly t
  172. plt.ylim(25,60) #Set the y scale range of the y-axis from 25 to 60
  173. N = 10000
  174. xr_i = list(range(2008,2019))
  175. yr_i = []
  176. for i in xr_i:
  177. temp = 0
  178. for j in all_data_i:
  179. temp += j[i]
  180. temp /= len(countries)
  181. yr_i.append(temp)
  182. plt.plot(xr_i,yr_i,"r-",label='average')
  183. plt.legend(loc=2,bbox_to_anchor=(1.05,1.0),borderaxespad = 0.)
  184. plt.savefig('Income gap.pdf')
  185. plt.show()
  186. # =============================================================================
  187. # #plot 2: Gender Employment rate gap from 2008 to 2018
  188. # =============================================================================
  189. plt.title('Gender Employment rate gap from 2008 to 2018')
  190. plt.xlabel('Year')
  191. plt.ylabel('Gender Employment Gap %')
  192. all_data_j = []
  193. for c in countries:
  194. gap_j = gap_gender_Dataframe(c)
  195. x_j = gap_j.keys()
  196. y_j = gap_j.values()
  197. all_data_j.append(gap_j)
  198. plt.scatter(x_j,y_j,marker='+',label=c)
  199. plt.legend(loc=2,bbox_to_anchor=(1.05,1.0),borderaxespad = 0.)
  200. x_major_locator=MultipleLocator(1) #set the x interval as 1
  201. y_major_locator=MultipleLocator(2) #set the y interval as 2
  202. ax=plt.gca()
  203. ax.xaxis.set_major_locator(x_major_locator) #Set the major scale of the x-axis to a multiple of 1
  204. ax.yaxis.set_major_locator(y_major_locator) #Set the major scale of the y-axis to a multiple of 0.02
  205. plt.xlim(2007,2019) #Set the scale range of the x-axis from 2008 to 2018
  206. plt.ylim(6,38) #Set the scale range of the y-axis from 25 to 60
  207. N = 10000
  208. xr_j = list(range(2008,2019))
  209. yr_j = []
  210. for i in xr_j:
  211. temp = 0
  212. for j in all_data_j:
  213. temp += j[i]
  214. temp /= len(countries)
  215. yr_j.append(temp)
  216. plt.plot(xr_j,yr_j,"r-",label='average')
  217. plt.legend(loc=2,bbox_to_anchor=(1.05,1.0),borderaxespad = 0.)
  218. plt.show()
  219. # =============================================================================
  220. # #boxplot 1 income gap
  221. # =============================================================================
  222. plt.figure(figsize=(9,6),dpi=60)
  223. labels, data = [*zip(*income_gap_dict.items())] # 'transpose' items to parallel key, value lists
  224. # or backwards compatable
  225. labels, data = income_gap_dict.keys(), income_gap_dict.values()
  226. plt.title('Income Gap from 2008 to 2018')
  227. plt.xlabel('Country')
  228. plt.ylabel('Income Gap %')
  229. plt.boxplot(data)
  230. plt.xticks(range(1, len(labels) + 1), labels)
  231. plt.show()
  232. # =============================================================================
  233. # #boxplot 2 gender employment gap
  234. # =============================================================================
  235. plt.figure(figsize=(9,6),dpi=60)
  236. labels, data = [*zip(*gender_gap_dict.items())] # 'transpose' items to parallel key, value lists
  237. # or backwards compatable
  238. labels, data = gender_gap_dict.keys(), gender_gap_dict.values()
  239. plt.title('Gender Employment Gap')
  240. plt.xlabel('Country')
  241. plt.ylabel('Gender Employment Gap %')
  242. plt.boxplot(data)
  243. plt.xticks(range(1, len(labels) + 1), labels)
  244. plt.show()
  245. # =============================================================================
  246. # #Part 4: linear regression
  247. # =============================================================================
  248. import numpy as np
  249. from sklearn.linear_model import LinearRegression
  250. # Convert the original data frame to list
  251. def convert_to_target_data_dict(country_list):
  252. converted_dict = {}
  253. for i in range(len(country_list)):
  254. country_name = country_list[i]
  255. converted_dict[country_name] = {}
  256. gap_income_dict = gap_income_Dataframe(country_name)
  257. gap_gender_dict = gap_gender_Dataframe(country_name)
  258. converted_gap_income_list = []
  259. converted_gap_gender_list = []
  260. for k in gap_income_dict:
  261. converted_gap_income_list.append(gap_income_dict[k])
  262. converted_gap_gender_list.append(gap_gender_dict[k])
  263. converted_dict[country_name]["income"] = converted_gap_income_list
  264. converted_dict[country_name]["gender"] = converted_gap_gender_list
  265. return converted_dict
  266. # Work out the x-coordinates for linear regression
  267. def x_coordinate():
  268. x_list = []
  269. x_coordinate = 2008
  270. for i in range(11):
  271. x_list.append(x_coordinate)
  272. x_coordinate = x_coordinate + 1
  273. return x_list
  274. # Work out the linear regression for single country
  275. def linear_regression(contry_name, coordinate_dict, data_type, predict_time):
  276. y_list = coordinate_dict[contry_name][data_type]
  277. x_list = x_coordinate()
  278. x = np.array(x_list).reshape((-1, 1))
  279. y = np.array(y_list)
  280. linear_model = LinearRegression().fit(x, y)
  281. predict_year = np.array([predict_time]).reshape((-1, 1))
  282. ten_year_prediction = linear_model.predict(predict_year)
  283. return ten_year_prediction[0]
  284. # Work out the final predicted result for the income and gender gap of 2030
  285. def total_linear_regression_result(y_coordinate_dict):
  286. linear_regression_result_dict = {}
  287. for k in y_coordinate_dict:
  288. linear_regression_result_dict[k] = {}
  289. predict_income_gap_2030 = linear_regression(k, y_coordinate_dict, "income", 2030)
  290. predict_gender_gap_2030 = linear_regression(k, y_coordinate_dict, "gender", 2030)
  291. linear_regression_result_dict[k]["income"] = predict_income_gap_2030
  292. linear_regression_result_dict[k]["gender"] = predict_gender_gap_2030
  293. return linear_regression_result_dict
  294. # Calculate the average income & gender gap of 2030
  295. def calculate_average_gap(result_dict, country_list):
  296. average_result_dict = {}
  297. sum_income_gap = 0
  298. sum_gender_gap = 0
  299. for k in result_dict:
  300. sum_income_gap = sum_income_gap + result_dict[k]["income"]
  301. sum_gender_gap = sum_gender_gap + result_dict[k]["gender"]
  302. average_income_gap = sum_income_gap / len(country_list)
  303. average_gender_gap = sum_gender_gap / len(country_list)
  304. average_result_dict["average_income_gap"] = average_income_gap
  305. average_result_dict["average_gender_gap"] = average_gender_gap
  306. return average_result_dict
  307. # Compare the average value with our liner regression result
  308. # print the list of countries which higher or lower than our average prediction, or even equal
  309. def compare_with_the_average(average_dict, result_dict):
  310. compare_result_dict = {}
  311. higher_than_income_average = []
  312. lower_than_income_average = []
  313. equal_to_income_average = []
  314. higher_than_gender_average = []
  315. lower_than_gender_average = []
  316. equal_to_gender_average = []
  317. for k in result_dict:
  318. if result_dict[k]["income"] > average_dict["average_income_gap"]:
  319. higher_than_income_average.append(k)
  320. elif result_dict[k]["income"] < average_dict["average_income_gap"]:
  321. lower_than_income_average.append(k)
  322. elif result_dict[k]["income"] == average_dict["average_income_gap"]:
  323. equal_to_income_average.append(k)
  324. if result_dict[k]["gender"] > average_dict["average_gender_gap"]:
  325. higher_than_gender_average.append(k)
  326. elif result_dict[k]["gender"] < average_dict["average_gender_gap"]:
  327. lower_than_gender_average.append(k)
  328. elif result_dict[k]["gender"] == average_dict["average_gender_gap"]:
  329. equal_to_gender_average.append(k)
  330. compare_result_dict["higher_than_income_average"] = higher_than_income_average
  331. compare_result_dict["lower_than_income_average"] = lower_than_income_average
  332. compare_result_dict["equal_to_income_average"] = equal_to_income_average
  333. compare_result_dict["higher_than_gender_average"] = higher_than_gender_average
  334. compare_result_dict["lower_than_gender_average"] = lower_than_gender_average
  335. compare_result_dict["equal_to_gender_average"] = equal_to_gender_average
  336. return compare_result_dict
  337. def main():
  338. # Work out the linear regression result for the 'countries' list
  339. y_dict = convert_to_target_data_dict(countries)
  340. linear_regression_result_dict = total_linear_regression_result(y_dict)
  341. # Work out the average income & gender gap
  342. average_gap_result = calculate_average_gap(linear_regression_result_dict, countries)
  343. # Compare the average gap with the gap for each country
  344. compare_with_average = compare_with_the_average(average_gap_result, linear_regression_result_dict)
  345. # Print the results
  346. print(linear_regression_result_dict)
  347. print()
  348. print(average_gap_result)
  349. print()
  350. print(compare_with_average)
  351. return linear_regression_result_dict,average_gap_result,compare_with_average
  352. if __name__ == "__main__":
  353. linear_regression_result_dict,average_gap_result,compare_with_average = main()
  354. # over view our linear regression result
  355. print()
  356. print(linear_regression_result_dict)
  357. # =============================================================================
  358. # #part 5: plot the figure with our prediction with comparison
  359. # =============================================================================
  360. # Commented out IPython magic to ensure Python compatibility.
  361. # =============================================================================
  362. # #plot 1 for income gap with prediction in 2030
  363. # =============================================================================
  364. # %matplotlib inline
  365. from matplotlib.pyplot import MultipleLocator
  366. plt.figure(figsize=(12,6),dpi=60)
  367. plt.title('Prediction of Income Gap in 2030')
  368. plt.xlabel('Year')
  369. plt.ylabel('Income gap%')
  370. all_data_i = []
  371. xr_x = list(range(2008,2019))
  372. xr_x.append(2030)
  373. # xr_x = list(map(lambda x:str(x),xr_x))
  374. for c in countries:
  375. gap_i = gap_income_Dataframe(c)
  376. x_i = list(gap_i.keys())
  377. y_i = list(gap_i.values())
  378. tmp = linear_regression_result_dict[c]
  379. x_i.append(2019)
  380. y_i.append(tmp["income"])
  381. gap_i[2019] = tmp["income"]
  382. all_data_i.append(gap_i)
  383. plt.scatter(xr_x,y_i,marker='+',label=c)
  384. plt.legend(loc=2,bbox_to_anchor=(1.05,1.0),borderaxespad = 0.)
  385. x_major_locator=MultipleLocator(1) #set the x interval as 1
  386. y_major_locator=MultipleLocator(2) #set the y interval as 2
  387. ax=plt.gca()
  388. ax.xaxis.set_major_locator(x_major_locator) #Set the major scale of the x-axis to a multiple of 1
  389. ax.yaxis.set_major_locator(y_major_locator) #Set the major scale of the y-axis to a multiple of 2
  390. plt.xlim(2007,2031) #Set the scale range of the x-axis from 2008 to 2018
  391. plt.ylim(25,60) #Set the scale range of the y-axis from 25 to 60
  392. xr_i = list(range(2008,2019))
  393. xr_i.append(2019)
  394. yr_i = []
  395. for i in xr_i:
  396. temp = 0
  397. for j in all_data_i:
  398. temp += j[i]
  399. temp /= len(countries)
  400. yr_i.append(temp)
  401. plt.plot(xr_x,yr_i,"r-",label='average')
  402. plt.legend(loc=2,bbox_to_anchor=(1.05,1.0),borderaxespad = 0.)
  403. plt.savefig('Income gap.pdf')
  404. plt.show()
  405. # =============================================================================
  406. # #plot 2 for gender gap with prediction in 2030
  407. # =============================================================================
  408. plt.figure(figsize=(12,6),dpi=60)
  409. plt.title('Prediction of Gender Employment Gap in 2030')
  410. plt.xlabel('Year')
  411. plt.ylabel('Gender Employment Gap %')
  412. all_data_j = []
  413. xr_x = list(range(2008,2019))
  414. xr_x.append(2030)
  415. for c in countries:
  416. gap_j = gap_gender_Dataframe(c)
  417. x_j = list(gap_j.keys())
  418. y_j = list(gap_j.values())
  419. tmp = linear_regression_result_dict[c]
  420. x_j.append(2019)
  421. y_j.append(tmp["gender"])
  422. gap_j[2019] = tmp["gender"]
  423. all_data_j.append(gap_j)
  424. plt.scatter(xr_x,y_j,marker='+',label=c)
  425. plt.legend(loc=2,bbox_to_anchor=(1.05,1.0),borderaxespad = 0.)
  426. x_major_locator=MultipleLocator(1) #set the x interval as 1
  427. y_major_locator=MultipleLocator(2) #set the y interval as 2
  428. ax=plt.gca()
  429. ax.xaxis.set_major_locator(x_major_locator) #Set the major scale of the x-axis to a multiple of 1
  430. ax.yaxis.set_major_locator(y_major_locator) #Set the major scale of the y-axis to a multiple of 0.02
  431. plt.xlim(2007,2031) #Set the scale range of the x-axis from 2008 to 2018
  432. plt.ylim(2,38) #Set the scale range of the y-axis from 25 to 60
  433. xr_j = list(range(2008,2019))
  434. xr_j.append(2019)
  435. yr_j = []
  436. for i in xr_j:
  437. temp = 0
  438. for j in all_data_j:
  439. temp += j[i]
  440. temp /= len(countries)
  441. yr_j.append(temp)
  442. plt.plot(xr_x,yr_j,"r-",label='average')
  443. plt.legend(loc=2,bbox_to_anchor=(1.05,1.0),borderaxespad = 0.)
  444. plt.show()