# bi_main.py — BI-challenge ETL + EDA pipeline (exported from a Colab notebook)
  1. """
  2. This OOP is to do the BI Challenge
  3. """
  4. from warnings import simplefilter
  5. simplefilter(action='ignore', category=FutureWarning)
  6. # %matplotlib inline
  7. from google.colab import files
  8. import pandas as pd
  9. import numpy as np
  10. # %reload_ext sql
  11. import sqlite3
  12. import seaborn as sns
  13. import matplotlib.pyplot as plt
  14. from plotly.offline import iplot
  15. import plotly.express as px
  16. pd.options.display.float_format = '{:.2f}'.format # uppress scientific notation
  17. # Declare your Github Repository address
  18. A_url='https://raw.githubusercontent.com/haensel-ams/recruitment_challenge/master/BI_201805/table_A_conversions.csv'
  19. B_url='https://raw.githubusercontent.com/haensel-ams/recruitment_challenge/master/BI_201805/table_B_attribution.csv'
  20. # The Extract class is to extract data from your Gihub Repos address
  21. class Extract():
  22. def __init__(self,A_url,B_url):
  23. print('\033[1m'+'Please, wait! I am extracting data from your Github Repository'+'\033[0m'+'\n...')
  24. self.A_url=A_url
  25. self.table_A_conversions=self.load_data(self.A_url)
  26. self.B_url=B_url
  27. self.table_B_attribution=self.load_data(self.B_url)
  28. print('Data was successfully extracted!')
  29. def load_data(self,url):
  30. self.data=pd.read_csv(url)
  31. #display(self.data.head(3))
  32. return self.data
  33. # The Transform class is to combine two different extracted datasets and do the data cleansing
  34. # Also, to know the generanl informantion about KPIs
  35. class Transform():
  36. def __init__(self,extract):
  37. print('\033[1m'+'I am transforming the extracted data'+'\033[0m'+'\n...')
  38. self.table_A_conversions=extract.table_A_conversions
  39. self.table_B_attribution=extract.table_B_attribution
  40. self.joined_tabs = self.combine_tab(self.table_A_conversions, self.table_B_attribution,'Conv_ID')
  41. self.time_tab=self.cleaning_data(self.joined_tabs)
  42. # self.infor_Data=self.get_infor(self.time_tab)
  43. self.get_missing=self.check_missing(self.time_tab)
  44. self.cleaned_tab=self.time_tab.dropna()
  45. display(self.cleaned_tab.head(5))
  46. self.infor_Data=self.get_infor(self.cleaned_tab)
  47. self.more_infor=self.deep_infor(self.cleaned_tab)
  48. def deep_infor(self,data):
  49. print('Total annual revenue: %d'%data['Revenue'].sum())
  50. def combine_tab(self,tab_1,tab_2,common_col):
  51. print('I am combining two data into one and coverting the time format\n...')
  52. self.data=pd.merge(tab_1, tab_2, on=common_col, how='outer')
  53. # display(self.data.head(5))
  54. return self.data
  55. def cleaning_data(self,data):
  56. data['Conv_Date']= pd.to_datetime(data['Conv_Date'])
  57. self.data=data
  58. print('Data was completely transformed!')
  59. return self.data
  60. def get_infor(self,data):
  61. print('\033[1m'+'General information:'+'\033[0m')
  62. self.information=data.info()
  63. print('\033[1m'+'Descriptive Statistics:'+'\033[0m')
  64. # print(data.describe())
  65. return self.information
  66. def check_missing(self,data):
  67. print('\033[1m'+ 'The number of missing values:'+'\033[0m')
  68. self.miss_data=data.isnull().sum()
  69. self.miss_rate=100*data.isnull().sum()/len(data)
  70. self.mis_infor=pd.concat([self.miss_data, self.miss_rate], axis=1).reset_index()
  71. self.mis_infor=self.mis_infor.rename(columns={0: 'Amounts', 1: 'Percentage'})
  72. # print(self.mis_infor)
  73. return self.miss_data
  74. # The Load class is to load the tranformed data to the database
  75. class Load():
  76. def __init__(self,transform):
  77. print('\033[1m'+'I am loading the transformed data to my database'+'\033[0m'+'\n...')
  78. self.data=transform.cleaned_tab
  79. self.connect=self.connect_database()
  80. self.insert=self.insert_data(self.data)
  81. def connect_database(self):
  82. print('I am trying to connect to my SQL database\n....')
  83. self.connect= "%sql sqlite:///phuong_database.db"
  84. print(self.connect,'connection is success!',sep='\n')
  85. return self.connect
  86. def insert_data(self,data):
  87. print('I am loading the transformed data to my SQL Database\n....')
  88. self.check ="%sql DROP TABLE IF EXISTS data"
  89. self.insert="%sql PERSIST data"
  90. self.list_table="%sql SELECT name FROM sqlite_master WHERE type='table'"
  91. print(self.list_table)
  92. self.data="%sql SELECT * FROM data LIMIT 3"
  93. print(self.data)
  94. print('Data was completely inserted into my SQL Database!')
  95. return self.insert
  96. # The EDA_Overview_KPI class is to generate a preliminary overview on the KPI
  97. class EDA_Overview_KPI():
  98. def __init__(self,transform):
  99. print('\033[1m'+'I am doing the Explanatory Data Analysis (EDA) process for Revenue KPIs'+'\033[0m'+'\n...')
  100. self.data=transform.cleaned_tab[['Conv_Date','Revenue','User_ID']]
  101. self.by_kpi=self.group_data(self.data,'Conv_Date','Revenue','User_ID')
  102. # display(self.by_kpi.head(3))
  103. self.kpi_fig=self.plot_kpi(self.by_kpi)
  104. self.sum_stat=self.get_infor(self.by_kpi,'Conv_Date','Revenue','User_ID')
  105. def group_data(self,data,target,exp_1,exp_2):
  106. self.num_target=len(data[target].unique())
  107. print('The number of '+target+': %d'%self.num_target)
  108. self.data=data.groupby([target]).agg({exp_1:'sum',exp_2:'count'})
  109. return self.data
  110. def plot_kpi(self,data):
  111. self.name_column=self.data.columns
  112. plt.figure(figsize=(15, 9))
  113. for i,col in enumerate(self.name_column):
  114. plt.subplot(2,1,i+1)
  115. plt.plot(self.data[col],label=col)
  116. plt.title('The changes in of the daily '+col +' over the time period',fontweight='bold',fontsize='12')
  117. plt.legend()
  118. plt.autoscale(enable=True, axis='both',tight=True)
  119. plt.savefig('Overview_KPI.png')
  120. files.download('Overview_KPI.png')
  121. return self.name_column
  122. def get_infor(self,data,target,exp_1,exp_2):
  123. self.infor=display(self.data.head(8).T)
  124. print('\033[1m'+'Desriptive Statistics of the Daily KPIs by '+ target +'\033[0m', self.data.describe(),sep='\n')
  125. print('Date with the highest revenue:', self.data[exp_1].idxmax(axis = 0) )
  126. print('Date with the lowest revenue:', self.data[exp_1].idxmin(axis = 0) )
  127. print('Date with the highest number of users:', self.data[exp_2].idxmax(axis = 0) )
  128. print('Date with the lowest number of users:', self.data[exp_2].idxmin(axis = 0) )
  129. return self.infor
  130. # The EDA_KPI_Return class is to generate a preliminary overview on the return customer
  131. class EDA_KPI_Return():
  132. def __init__(self,transform):
  133. print('\033[1m'+'I am doing the Explanatory Data Analysis (EDA) process for User KPIs'+'\033[0m'+'\n...')
  134. self.data=transform.cleaned_tab[['Conv_Date','User_ID']]
  135. self.infor_user=self.get_infor(self.data,'User_ID')
  136. self.by_user=self.group_data(self.data,'User_ID','Conv_Date')
  137. display(self.by_user.head(8).T)
  138. self.user_plot=self.plot_user(self.by_user,'Conv_Date')
  139. def get_infor(self,data,exp):
  140. self.num_user=data[exp].unique()
  141. print('The number of users: %d'%len(self.num_user))
  142. return self.num_user
  143. def group_data(self,data,target,exp):
  144. self.num_target=len(data[target].unique())
  145. print('The number of '+target+': %d'%self.num_target)
  146. self.data=data.groupby([target]).agg({exp:'count'})
  147. # display(self.data.head(8).T)
  148. print('\033[1m'+'Desriptive Statistics of the Daily KPIs by '+ target +'\033[0m', self.data.describe(),sep='\n')
  149. return self.data
  150. def plot_user(self,data,exp):
  151. self.data=data.rename(columns={exp: 'The number of returns'})
  152. self.ax=self.data.plot.hist(figsize=(15, 9),bins=1500,xlim=(1,20),color='#86bf91'
  153. ,title='The Frequence of return customer',grid=True)
  154. self.ax.set_xlabel('The number of days')
  155. plt.savefig('Customer_return.png')
  156. files.download('Customer_return.png')
  157. return self.ax
  158. # The EDA_Static_Ren class is to explore the information about the total revenue per year
  159. class EDA_Static_Ren():
  160. def __init__(self,transform):
  161. print('\033[1m'+'I am doing the EDA on Conversion'+'\033[0m'+'\n...')
  162. self.data=transform.cleaned_tab[['Channel','Revenue']]
  163. display(self.data.head(3))
  164. # self.infor_conver=self.get_infor(self.data)
  165. self.by_ChanelRen=self.group_data(self.data,'Channel')
  166. self.pie_ChanelRen=self.plot_pie(self.by_ChanelRen,'Revenue')
  167. def plot_pie(self,data,target):
  168. self.data=data
  169. self.data['Total Conver'] = self.data.sum(axis=1)
  170. self.data['Total Top Five'] = self.data[['A','G','H','I','B']].sum(axis=1)
  171. self.data['The Rest'] = self.data['Total Conver']-self.data['Total Top Five']
  172. self.ax=self.data[['A','G','H','I','B','The Rest']].T.plot.pie(y=target,figsize=(12, 7),autopct='%1.1f%%',)
  173. plt.savefig('channel_Static_Ren.jpg')
  174. files.download('channel_Static_Ren.jpg')
  175. return self.data
  176. def get_infor(self,data):
  177. self.conver_uni=self.data.User_ID.unique()
  178. print('The number of conversions: %d'%len(self.conver_uni))
  179. return self.conver_uni
  180. def group_data(self,data,target):
  181. print('I am grouping data by '+ target + '\n...')
  182. self.data=data.groupby([target]).agg({'Revenue':'sum'})
  183. self.data=self.data.T
  184. display(self.data)
  185. print('I am done! ')
  186. return self.data
  187. # The EDA_Static_User class is to generate information about the total annual number of visits
  188. class EDA_Static_User():
  189. def __init__(self,transform):
  190. print('\033[1m'+'I am doing the EDA on Conversion'+'\033[0m'+'\n...')
  191. self.data=transform.cleaned_tab[['Channel','User_ID']] #'Conv_Date',
  192. display(self.data.head(3))
  193. # self.infor_conver=self.get_infor(self.data)
  194. self.by_ChanelConv=self.group_data(self.data,'Channel')
  195. self.pie_channelConv=self.plot_pie(self.by_ChanelConv,'User_ID')
  196. def plot_pie(self,data,target):
  197. self.data=data
  198. self.data['Total Conver'] = self.data.sum(axis=1)
  199. self.data['Total Top Five'] = self.data[['A','G','H','I','B']].sum(axis=1)
  200. self.data['The Rest'] = self.data['Total Conver']-self.data['Total Top Five']
  201. self.ax=self.data[['A','G','H','I','B','The Rest']].T.plot.pie(y=target,figsize=(12, 7),autopct='%1.1f%%',)
  202. plt.savefig('channel_Static_User.jpg')
  203. files.download('channel_Static_User.jpg')
  204. return self.data
  205. def get_infor(self,data):
  206. self.conver_uni=self.data.User_ID.unique()
  207. print('The number of conversions: %d'%len(self.conver_uni))
  208. return self.conver_uni
  209. def group_data(self,data,target):
  210. print('I am grouping data by '+ target + '\n...')
  211. self.data=data.groupby([target]).agg({'User_ID':'count'})
  212. self.data=self.data.T
  213. display(self.data)
  214. print('I am done! ')
  215. return self.data
  216. # The EDA_Static_Conversion is to generate the information about the total annual number of conversion
  217. class EDA_Static_Conversion():
  218. def __init__(self,transform):
  219. print('\033[1m'+'I am doing the EDA on Conversion'+'\033[0m'+'\n...')
  220. self.data=transform.cleaned_tab[['Channel','Conv_ID','IHC_Conv']] #'Conv_Date',
  221. display(self.data.head(3))
  222. self.infor_conver=self.get_infor(self.data)
  223. self.by_ChanelConv=self.group_data(self.data,'Channel','Conv_ID')
  224. self.pie_channelConv=self.plot_pie(self.by_ChanelConv,'Conv_ID')
  225. def get_infor(self,data):
  226. self.conver_uni=self.data.Conv_ID.unique()
  227. print('The number of conversions: %d'%len(self.conver_uni))
  228. return self.conver_uni
  229. def group_data(self,data,target,exp):
  230. print('I am grouping data by '+ target + '\n...')
  231. if data[exp].dtype=='object':
  232. self.data=data.groupby([target]).agg({exp:'count'})
  233. else:
  234. self.data=data.groupby([target]).agg({exp:'sum'})
  235. self.data=self.data.T
  236. display(self.data)
  237. print('I am done! ')
  238. return self.data
  239. def plot_pie(self,data,target):
  240. self.data=data
  241. self.data['Total Conver'] = self.data.sum(axis=1)
  242. self.data['Total Top Five'] = self.data[['A','G','H','I','B']].sum(axis=1)
  243. self.data['The Rest'] = self.data['Total Conver']-self.data['Total Top Five']
  244. self.ax=self.data[['A','G','H','I','B','The Rest']].T.plot.pie(y=target,figsize=(12, 7),autopct='%1.1f%%',)
  245. plt.savefig('channel_Conver.png')
  246. files.download('channel_Conver.png')
  247. return self.data
  248. # The EDA_Channel_Revenue class is to analyze the impacts of the online marketing channels on
  249. # the daily Revenue
  250. class EDA_Channel_Revenue():
  251. def __init__(self,transform):
  252. print('\033[1m'+'I am analyzing the influences of the online marketing channels on the daily revenue'+'\033[0m'+'\n...')
  253. self.data=transform.cleaned_tab[['Conv_Date','Channel','Revenue']]
  254. self.by_DateChannel=self.group_data(self.data,'Conv_Date','Channel')
  255. self.unstaked_data=self.unstack_data(self.by_DateChannel,'Revenue','bar')
  256. self.plotted_data=self.plot_data(self.unstaked_data)
  257. self.exported_data=self.export_data(self.unstaked_data,'channel_revenue')
  258. def group_data(self,data,target_1,target_2):
  259. print('I am grouping data by '+ target_1 +' and '+ target_2 + '\n...')
  260. self.data=data.groupby([target_1,target_2])#.agg({exp:'count'})
  261. print('I am done! ')
  262. return self.data
  263. def unstack_data(self,data,exp,kind):
  264. print('I am unstacking data \n...')
  265. data=data.sum()[exp].unstack(level=-1)
  266. self.data=data
  267. display(self.data.head(3))
  268. print('Data were unstacked completely\n...')
  269. return self.data
  270. def plot_data(self,data):
  271. self.data=data
  272. print('I am visualizing the contribution of Top 5 Channels to the Daily Revenue\n...')
  273. self.data['The Total'] = self.data.sum(axis=1)
  274. self.data['The Rest']= self.data['The Total']-self.data[['A','G','H','I','B']].sum(axis=1)
  275. self.xlim=('2017-03-01','2018-03-24')
  276. self.ax =self.data[['A','G','H','I','B','The Rest']].plot.area(xlim=self.xlim, figsize=(12,8))
  277. self.ax.set_xlabel('Date')
  278. self.ax.set_ylabel('Revenue')
  279. print(self.data['The Rest'].describe())
  280. plt.savefig('channel_ren.png')
  281. files.download('channel_ren.png')
  282. return self.data
  283. def export_data(self,data,title):
  284. print('I am exporting data to the excel and csv files\n...')
  285. data.to_excel(title+'.xlsx')
  286. self.excel=files.download(title+'.xlsx')
  287. data.to_csv(title+'.csv')
  288. self.csv=files.download(title+'.csv')
  289. return self.excel
  290. # The EDA_Channel_User class is to analyze the impacts of the online marketing channels on
  291. # the daily number of users
  292. class EDA_Channel_User():
  293. def __init__(self,transform):
  294. print('\033[1m'+'I am analyzing the influences of the online marketing channels on the daily number of users'+'\033[0m'+'\n...')
  295. self.data=transform.cleaned_tab[['Conv_Date','Channel','User_ID']]
  296. self.by_DateUser=self.group_data(self.data,'Conv_Date','Channel','User_ID')
  297. self.unstaked_data=self.unstack_data(self.by_DateUser,'User_ID','bar')
  298. #display(self.unstaked_data.head(3))
  299. self.plotted_data=self.plot_data(self.unstaked_data)
  300. # self.exported_data=self.export_data(self.unstaked_data,'channel_num_user')
  301. def group_data(self,data,target_1,target_2,exp):
  302. print('I am grouping data by '+ target_1 +' and '+ target_2 + '\n...')
  303. self.data=data.groupby([target_1,target_2])#.agg({exp:'count'})
  304. print('I am done! ')
  305. return self.data
  306. def unstack_data(self,data,exp,kind):
  307. print('I am unstacking data \n...')
  308. data=data.count()[exp].unstack(level=-1)
  309. self.data=data
  310. print('Data were unstacked completely\n...')
  311. return self.data
  312. def plot_data(self,data):
  313. self.data=data
  314. print('I am visualizing the contribution of Top 5 Channels to the Daily Number of Users\n...')
  315. self.data['The Total'] = self.data.sum(axis=1)
  316. self.data['The Rest'] = self.data['The Total'] - self.data[['A','G','H','I','B']].sum(axis=1)
  317. self.xlim=('2017-03-01','2018-03-24')
  318. self.ax =self.data[['A','G','H','I','B','The Rest']].plot.area(xlim=self.xlim, figsize=(12,8))
  319. self.ax.set_xlabel('Date')
  320. self.ax.set_ylabel('The number of Users')
  321. plt.savefig('channel_user.png')
  322. files.download('channel_user.png')
  323. return self.data
  324. def export_data(self,data,title):
  325. print('I am exporting data to the excel and csv files\n...')
  326. data.to_excel(title+'.xlsx')
  327. self.excel=files.download(title+'.xlsx')
  328. data.to_csv(title+'.csv')
  329. self.csv=files.download(title+'.csv')
  330. return self.excel
  331. # The EDA_channel_IHC class is to generate the changes in the daily IHC of Channels
  332. class EDA_channel_IHC():
  333. def __init__(self,transform):
  334. print('\033[1m'+'I am doing the EDA on Conversion'+'\033[0m'+'\n...')
  335. self.data=transform.cleaned_tab[['Conv_Date','Channel','IHC_Conv']] #'Conv_Date',
  336. self.by_TimeChannel=self.group_data(self.data,'Conv_Date','Channel','IHC_Conv')
  337. self.unstacked_data=self.unstack_data(self.by_TimeChannel,'IHC_Conv')
  338. self.change_plot=self.plot_data(self.unstacked_data)
  339. def plot_data(self,data):
  340. self.data=data
  341. # self.data['The Rest'] = self.data.sum(axis=1)
  342. self.xlim=('2017-03-01','2018-03-24')
  343. self.ylim=('0','550')
  344. self.ax =self.data[['A','G','H','I','B']].plot.line(xlim=self.xlim,figsize=(12,8))
  345. self.ax.set_xlabel('Date')
  346. self.ax.set_ylabel('IHC_Conv')
  347. plt.savefig('channel_IHC.png')
  348. files.download('channel_IHC.png')
  349. return self.data
  350. def group_data(self,data,target_1,target_2,exp):
  351. print('I am grouping data by '+ target_1 +' and '+ target_2 + '\n...')
  352. self.data=data.groupby([target_1,target_2])#.agg({exp:'sum'})
  353. print('I am done! ')
  354. return self.data
  355. def unstack_data(self,data,exp):
  356. print('I am unstacking data \n...')
  357. data=data.sum()[exp].unstack(level=-1)
  358. self.data=data
  359. print('Data were unstacked completely\n...')
  360. return self.data
  361. class main():
  362. Extract=Extract(A_url,B_url)
  363. Transform=Transform(Extract)
  364. Load=Load(Transform)
  365. EDA_Overview_KPI=EDA_Overview_KPI(Transform)
  366. EDA_Static_Ren=EDA_Static_Ren(Transform)
  367. EDA_KPI_Return=EDA_KPI_Return(Transform)
  368. EDA_Static_User=EDA_Static_User(Transform)
  369. EDA_Static_Conversion=EDA_Static_Conversion(Transform)
  370. EDA_Channel_Revenue=EDA_Channel_Revenue(Transform)
  371. EDA_Channel_User=EDA_Channel_User(Transform)
  372. EDA_channel_IHC=EDA_channel_IHC(Transform)
  373. if __name__=='__main__':
  374. main()