macro_analysis-backup.py 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338
  1. import pandas as pd
  2. from bokeh.plotting import figure, save, show,output_file, ColumnDataSource
  3. from bokeh.models import HoverTool
  4. import matplotlib.pyplot as plt
  5. class DataFrameAnalysis:
  6. """Arms Macro-Analysis capability to a dataframe"""
  7. def __init__(self, frame):
  8. self.df = frame # dataframe object
  9. def avg_discount_rate(self):
  10. """Calculates average discount rate of all orders."""
  11. # You should calculate the average and gross discount rate.
  12. pd.to_numeric(self.df['Discount_Amount'])
  13. pd.to_numeric(self.df['Order_Total_Amount'])
  14. total_sales_amount = self.df['Order_Total_Amount'].sum()
  15. total_discount_amount = self.df['Discount_Amount'].sum()
  16. total_discount_avg = int((total_discount_amount / (total_discount_amount+total_sales_amount))*100)
  17. return print(f'Customer Discount Avg: {total_discount_avg}%')
  18. def customer_role_breakdown(self):
  19. """Calculates proportion of retail/wholesale as a function of sales."""
  20. retail = 0
  21. wholesale = 0
  22. sum_count =int(len(self.df.index))
  23. sum_sales = self.df['Order_Total_Amount'].sum()
  24. retail_customer_count = round((len(self.df.loc[self.df['Customer_Role']=='Customer'].index)/sum_count)*100)
  25. wholesale_customer_count = round((len(self.df.loc[self.df['Customer_Role']=='Wholesale Customer'].index)/sum_count)*100)
  26. retail_sales = round((self.df['Order_Total_Amount'].loc[self.df['Customer_Role']=='Customer'].sum()/sum_sales)*100)
  27. wholesale_sales = round((self.df['Order_Total_Amount'].loc[self.df['Customer_Role']=='Wholesale Customer'].sum()/sum_sales)*100)
  28. grid = [[retail_customer_count,wholesale_customer_count],[retail_sales,wholesale_sales]]
  29. crb_df = pd.DataFrame(data=grid, columns=['Retail','Wholesale'], index=['Proportional Order Counts', 'Proportional Sales'])
  30. plt.style.use('seaborn-deep')
  31. fig, ax = plt.subplots(figsize=(10, 10))
  32. crb_df.plot.bar(title='Customer Role Breakdown', xlabel='Customer Role', ylabel='Proportion (%)',
  33. cmap='winter', ax=ax)
  34. plt.figsave('Customer_Role_Breakdown.png')
  35. print(crb_df.head(3))
  36. def geographical_breakdown(self):
  37. """ Displays a scatterplot of Sales/Revenue weights for different States."""
  38. self.df = self.df[self.df.Country_Name_Shipping== 'United States (US)']
  39. counts = self.df["State_Name_Shipping"].value_counts().to_dict()
  40. States = list(counts.keys())
  41. Count = list(counts.values())
  42. geo = pd.DataFrame({'States': States, 'Counts': Count})
  43. geo_dataframe = pd.DataFrame(geo)
  44. geo_dataframe.insert(loc=2, column="Sales_Total", value=0)
  45. geo_dataframe.insert(loc=3, column="Avg_Purchase_Revenue", value=0)
  46. for i, row in self.df.iterrows():
  47. state = row.loc['State_Name_Shipping']
  48. total = row.loc['Order_Total_Amount']
  49. idx = geo_dataframe[geo_dataframe["States"] == state].index.item()
  50. av = int(geo_dataframe.at[idx, 'Sales_Total']) / int(geo_dataframe.at[idx, 'Counts'])
  51. geo_dataframe.at[idx, 'Sales_Total'] += total
  52. geo_dataframe.at[idx, 'Avg_Purchase_Revenue'] = av
  53. # data visualization
  54. cds = ColumnDataSource(geo_dataframe)
  55. cds.data.keys()
  56. visual = figure(tools='box_zoom, pan, reset',
  57. width=700, height=700,
  58. title='Geographical Sales Breakdown',
  59. y_axis_label='Order Quantity', x_axis_label='Revenue')
  60. visual.circle('Sales_Total', 'Counts', size=7, source=cds, name= 'States')
  61. visual.add_tools(HoverTool(tooltips=[("State", "@States"),
  62. ("Average Purchase Revenue", "@Avg_Purchase_Revenue")
  63. ]))
  64. output_file('geographical_breakdown.html')
  65. save(visual)
  66. show(visual)
  67. return print(geo_dataframe)
  68. class ProductAnalysis:
  69. """Arms product analysis capability to a dataframe"""
  70. def __init__(self, frame):
  71. self.df = frame # dataframe object
  72. self.analysis_frame = self.monthly_product_frame()
  73. self.time_span = self.serve_time_span() # list of tuples: x[0] == year, x[1] == month for x in self.time_span
  74. def monthly_product_frame(self):
  75. """Analyzes the order lines in the CSV_Files folder and
  76. Returns a pandas Dataframe with monthly product statistics."""
  77. from datetime import datetime
  78. import information_repository as ir
  79. frame = self.df
  80. frame = frame[['Order_Date', 'Product_Name', 'Quantity', 'Item_Cost']]
  81. dict_list = []
  82. for i, row in frame.iterrows():
  83. row_date = row['Order_Date']
  84. row_date = datetime.strptime(row_date, "%Y-%m-%d %H:%M")
  85. row_date_month = row_date.month
  86. row_date_year = row_date.year
  87. raw_products = row['Product_Name'].replace('\r', '').split('\n')
  88. raw_quantities = row['Quantity'].replace('\r', '').split('\n')
  89. raw_cost = row['Item_Cost'].replace('\r', '').split('\n')
  90. for key in range(len(raw_products)):
  91. product = [i for i in ir.p_list if i in raw_products[key]][0]
  92. quantity = int(raw_quantities[key])
  93. revenue = float(raw_cost[key])
  94. dict_object = [product, quantity, revenue, row_date_month, row_date_year]
  95. matched_dictionary = [i for i in dict_list if
  96. i['name'] == dict_object[0] and i['month'] == dict_object[3]
  97. and i['year'] == dict_object[4]]
  98. if len(matched_dictionary) == 1:
  99. matched_dictionary[0]['count'] += dict_object[1]
  100. matched_dictionary[0]['revenue'] += dict_object[2]
  101. else:
  102. dict_list.append({'name': dict_object[0], 'count': dict_object[1],
  103. 'revenue': dict_object[2], 'month': dict_object[3], 'year': dict_object[4]})
  104. self.analysis_frame = pd.DataFrame(columns=['year', 'month', 'count', 'revenue', 'change_over_month', 'product'])
  105. time_span = []
  106. for product in ir.p_list:
  107. product_dictionaries = sorted(
  108. sorted([i for i in dict_list if i['name'] == product], key= x['month']
  109. ), key= x['year'])
  110. data_list = []
  111. year_list = []
  112. month_list = []
  113. for key in range(len(product_dictionaries)):
  114. if key > 0:
  115. try:
  116. change_over_month = (100 - round(
  117. ((product_dictionaries[key]['revenue'] / product_dictionaries[key]['count'])
  118. / (product_dictionaries[key - 1]['revenue'] / product_dictionaries[key - 1][
  119. 'count'])) * 100))
  120. except IndexError:
  121. print('change_list calls need to be refined')
  122. else:
  123. change_over_month = 0
  124. row_list = [product_dictionaries[key]['year'], product_dictionaries[key]['month'],
  125. product_dictionaries[key]['count'], product_dictionaries[key]['revenue'], change_over_month,
  126. product_dictionaries[key]['name']]
  127. data_list.append(row_list)
  128. if product == 'Blue Moon':
  129. month_list.append(product_dictionaries[key]['month'])
  130. year_list.append(product_dictionaries[key]['year'])
  131. if product == 'Blue Moon':
  132. time_span = [*zip(year_list, month_list)]
  133. append_frame = pd.DataFrame(data=data_list,
  134. columns=['year', 'month', 'count', 'revenue', 'change_over_month', 'product'])
  135. self.analysis_frame = pd.concat([self.analysis_frame, append_frame], ignore_index=True)
  136. self.time_span = time_span
  137. return self.analysis_frame
  138. def highest_positive_product_change_over_month_analysis(self):
  139. """Analyzes the monthly_product_frame and returns the 5 products whose sales level increased the most"""
  140. year = int(input('Type the year you would like to query in yyyy format: '))
  141. month = int(input('Type the month you would like to query: '))
  142. data_slice = self.analysis_frame.loc[self.analysis_frame['month'] == month].loc[self.analysis_frame['year'] == year].loc[self.analysis_frame['revenue']>500]
  143. data_slice.sort_values(by='change_over_month', inplace=True, ascending=False)
  144. return print(data_slice.head(5))
  145. def highest_negative_product_change_over_month_analysis(self):
  146. """Analyzes the monthly_product_frame and returns the 5 products whose sales level decreased the most"""
  147. year = int(input('Type the year you would like to query in yyyy format: '))
  148. month = int(input('Type the month you would like to query: '))
  149. data_slice = self.analysis_frame.loc[self.analysis_frame['month'] == month].loc[self.analysis_frame['year'] == year].loc[self.analysis_frame['revenue']>500]
  150. data_slice.sort_values(by='change_over_month', inplace=True, ascending=True)
  151. return data_slice
  152. def product_line_change_over_month_analysis(self, year, month):
  153. """Analyzes the monthly_product_frame by product line and returns a dataframe with
  154. product line change over month data."""
  155. import information_repository as ir
  156. #year = int(input('Type the year you would like to query in yyyy format: '))
  157. #month = int(input('Type the month you would like to query: '))
  158. product_line_list_of_lists = [ir.tea_product_list, ir.capsule_product_list, ir.smokeable_product_list,
  159. ir.skincare_product_list, ir.superfood_product_list, ir.honey_product_list,
  160. ir.tincture_product_list]
  161. product_line_strings = ['Tea', 'Capsules', 'Smokeables', 'Skincare', 'Superfood', 'Honey', 'Tinctures']
  162. product_line_append_list = []
  163. line_index_counter = 0
  164. for product_line in product_line_list_of_lists:
  165. line_list = []
  166. line_list.append(year)
  167. line_list.append(month)
  168. data_slice = self.analysis_frame.loc[self.analysis_frame['month'] == month].loc[
  169. self.analysis_frame['year'] == year].loc[self.analysis_frame['product'].isin(product_line)]
  170. if month > 1:
  171. last_month_frame = self.analysis_frame.loc[self.analysis_frame['month'] == (month - 1)].loc[
  172. self.analysis_frame['year'] == year].loc[self.analysis_frame['product'].isin(product_line)]
  173. else:
  174. last_month_frame = self.analysis_frame.loc[self.analysis_frame['month'] == 12].loc[
  175. self.analysis_frame['year'] == (year - 1)].loc[self.analysis_frame['product'].isin(product_line)]
  176. last_month_revenue = last_month_frame['revenue'].sum()
  177. this_month_revenue = data_slice['revenue'].sum()
  178. avg_change_over_month = (this_month_revenue / last_month_revenue) * 100
  179. line_list.append(avg_change_over_month)
  180. product_line = product_line_strings[line_index_counter]
  181. line_index_counter += 1
  182. line_list.append(product_line)
  183. product_line_append_list.append(line_list)
  184. product_line_analysis_frame = pd.DataFrame(data=product_line_append_list,
  185. columns=['year', 'month', 'avg_change_over_month',
  186. 'product_line'])
  187. product_line_analysis_frame.to_csv('product_line_csv_2021.csv')
  188. return product_line_analysis_frame
  189. def serve_time_span(self):
  190. """Returns a list of tuples of unique (year, month) pairs in chronological order based on the
  191. monthly_product_frame."""
  192. return sorted(sorted(list(set([*zip(self.analysis_frame['year'],self.analysis_frame['month'])])),
  193. key=x[1]), key=x[0])
  194. def product_line_change_over_month_graph(self):
  195. """Using the product_line_change_over_month_analysis frame, it outputs a graph of the changes over time for
  196. the top product lines."""
  197. line_change_frame_data = []
  198. for i in self.time_span:
  199. month_frame = self.product_line_change_over_month_analysis(i[0], i[1])
  200. change_list = month_frame['avg_change_over_month']
  201. line_change_frame_data.append(change_list)
  202. treated_line_change_frame_data = []
  203. for i in range(len(line_change_frame_data)): #index of time period/segment
  204. if i ==0:
  205. treated_line_change_frame_data.append([self.time_span[i][0], self.time_span[i][1],
  206. 0,0,0,0,0,0,0]) #insert base amounts for the first month
  207. else: #function as intended
  208. month_cumulative_change_list = []
  209. month_cumulative_change_list.append(self.time_span[i][0])
  210. month_cumulative_change_list.append(self.time_span[i][1])# append year and month
  211. for x in range(len(line_change_frame_data[0])):
  212. prior_change_list = [i[x] for i in line_change_frame_data]
  213. product_cumulative_change = (100+treated_line_change_frame_data[i-1][x+2]) * ((prior_change_list[i]/100))-100
  214. #i-1 for previous time period and x+2 for offset due to year and month category
  215. month_cumulative_change_list.append(product_cumulative_change)
  216. treated_line_change_frame_data.append(month_cumulative_change_list)
  217. graph_frame = pd.DataFrame(data=treated_line_change_frame_data, columns=['Year', 'Month', 'Tea', 'Capsules', 'Smokeables','Skincare',
  218. 'Superfood', 'Honey', 'Tinctures'])
  219. print(graph_frame.head(7))
  220. x = [str(i) for i in graph_frame['Month']]
  221. y1 = graph_frame['Tea']
  222. y2 = graph_frame['Capsules']
  223. y3 = graph_frame['Superfood']
  224. y4 = graph_frame['Honey']
  225. y5 = graph_frame['Smokeables']
  226. graph = figure(x_range=x,title='Cumulative Percentage Change of Product Lines',x_axis_label='Month', y_axis_label='Percentage Change')
  227. graph.line(x, y1, legend_label ='Tea', color='red', line_width=3)
  228. graph.line(x, y2, legend_label ='Capsules', color='blue', line_width=3)
  229. graph.line(x, y3, legend_label ='Superfood', color='orange', line_width=3)
  230. graph.line(x, y4, legend_label ='Honey', color='yellow', line_width=3)
  231. graph.line(x, y5, legend_label ='Smokeables', color='green', line_width=3)
  232. output_file('product_line_change_over_month.html')
  233. save(graph)
  234. return show(graph)
  235. class InventoryPredictor:
  236. """Inventory volume prediction using a product sales csv as the raw data."""
  237. def __init__(self):
  238. import information_repository as ir
  239. self.unit_counts = self.sales_unit_count_dictionaries()
  240. self.ingredients = self.ingredient_dictionary()
  241. self.recipes = ir.unit_recipes
  242. print('initiating')
  243. pass
  244. def sales_unit_count_dictionaries(self):
  245. """Creates a set of dictionaries for each product and the cumulative quantity of units across all SKUs."""
  246. import information_repository as ir
  247. product_sales_frame = pd.read_csv('product_sales.csv')
  248. product_sales_frame = product_sales_frame.where(pd.notnull(product_sales_frame), 'None')
  249. product_unit_amounts = []
  250. for i in ir.p_list:
  251. product_dict = dict(name=i, quantity=0)
  252. for x, row in product_sales_frame.iterrows():
  253. if i in row['Product Name']:
  254. if i in ir.tea_product_list:
  255. if '1' in row['Variation Attributes']:
  256. product_dict['quantity'] += row['Quantity Sold']
  257. elif '3' in row['Variation Attributes']:
  258. product_dict['quantity'] += row['Quantity Sold'] * 3
  259. elif '20' in row['Variation Attributes']:
  260. product_dict['quantity'] += row['Quantity Sold'] * 20
  261. else:
  262. pass
  263. # print('Something unexpected occured', row['Product Name'], row['Variation Attributes'])
  264. elif i in ir.superfood_product_list:
  265. if '3' in row['Variation Attributes']:
  266. product_dict['quantity'] += row['Quantity Sold']
  267. elif '9' in row['Variation Attributes']:
  268. product_dict['quantity'] += row['Quantity Sold'] * 3
  269. else:
  270. product_dict['quantity'] += 1
  271. elif i in ir.capsule_product_list:
  272. if '1' in row['Variation Attributes']:
  273. product_dict['quantity'] += row['Quantity Sold']
  274. if '4' in row['Variation Attributes']:
  275. product_dict['quantity'] += row['Quantity Sold'] * 4
  276. elif i in ir.smokeable_product_list:
  277. if '7' in row['Variation Attributes']:
  278. product_dict['quantity'] += row['Quantity Sold'] * 7
  279. elif 'prerolls' in row['Variation Attributes']:
  280. product_dict['quantity'] += row['Quantity Sold'] * 2
  281. else:
  282. product_dict['quantity'] += row['Quantity Sold'] * 4
  283. elif i in ir.honey_product_list:
  284. if '3' in row['Variation Attributes']:
  285. product_dict['quantity'] += row['Quantity Sold'] * 3
  286. elif '5' in row['Variation Attributes']:
  287. product_dict['quantity'] += row['Quantity Sold'] * 5
  288. elif '2' in row['Variation Attributes']:
  289. pass
  290. # print('Reminder that packet honeys and jars need to separate')
  291. else:
  292. product_dict['quantity'] += row['Quantity Sold']
  293. product_unit_amounts.append(product_dict)
  294. return product_unit_amounts
  295. def ingredient_dictionary(self):
  296. """Creates a ingredient dictionary with all ingredients as keys and the cumulative volume across all
  297. products as values."""
  298. inventory = pd.read_csv('craftybase-export-material.csv')
  299. ingredient_dictionary = {}
  300. for i in list(inventory['name']):
  301. ingredient_dictionary[i]=0
  302. return ingredient_dictionary
  303. def ingredient_volume_table(self):
  304. """Creates a csv with ingredients and the cumulative volume used across a time span."""
  305. for x in self.unit_counts:
  306. for y in self.recipes:
  307. if x['name'] == y['name']:
  308. for k, v in y.items():
  309. if k != 'name':
  310. self.ingredients[k] += v * x['quantity']
  311. sorted_ingredient_volumes = sorted(self.ingredients.items(), key= x[1], reverse=True)
  312. output_frame = pd.DataFrame(data = sorted_ingredient_volumes, columns= ['Ingredient', 'Volume (gram or oz)'])
  313. output_frame = output_frame[output_frame['Volume (gram or oz)'] !=0]
  314. output_frame.to_csv('ingredient_volume_table.csv')