LiuFan
/
PrivacyScanData


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338
							import pandas as pd
from bokeh.plotting import figure, save, show,output_file, ColumnDataSource
from bokeh.models import HoverTool
import matplotlib.pyplot as plt

class DataFrameAnalysis:
    """Arms Macro-Analysis capability to a dataframe"""
    def __init__(self, frame):
        self.df = frame  # dataframe object

    def avg_discount_rate(self):
        """Calculates average discount rate of all orders."""
        # You should calculate the average and gross discount rate.
        pd.to_numeric(self.df['Discount_Amount'])
        pd.to_numeric(self.df['Order_Total_Amount'])
        total_sales_amount = self.df['Order_Total_Amount'].sum()
        total_discount_amount = self.df['Discount_Amount'].sum()
        total_discount_avg = int((total_discount_amount / (total_discount_amount+total_sales_amount))*100)
        return print(f'Customer Discount Avg: {total_discount_avg}%')


    def customer_role_breakdown(self):
        """Calculates proportion of retail/wholesale as a function of sales."""
        retail = 0
        wholesale = 0
        sum_count =int(len(self.df.index))
        sum_sales = self.df['Order_Total_Amount'].sum()
        retail_customer_count = round((len(self.df.loc[self.df['Customer_Role']=='Customer'].index)/sum_count)*100)
        wholesale_customer_count = round((len(self.df.loc[self.df['Customer_Role']=='Wholesale Customer'].index)/sum_count)*100)
        retail_sales = round((self.df['Order_Total_Amount'].loc[self.df['Customer_Role']=='Customer'].sum()/sum_sales)*100)
        wholesale_sales = round((self.df['Order_Total_Amount'].loc[self.df['Customer_Role']=='Wholesale Customer'].sum()/sum_sales)*100)
        grid = [[retail_customer_count,wholesale_customer_count],[retail_sales,wholesale_sales]]
        crb_df = pd.DataFrame(data=grid, columns=['Retail','Wholesale'], index=['Proportional Order Counts', 'Proportional Sales'])
        plt.style.use('seaborn-deep')
        fig, ax = plt.subplots(figsize=(10, 10))
        crb_df.plot.bar(title='Customer Role Breakdown', xlabel='Customer Role', ylabel='Proportion (%)',
                        cmap='winter', ax=ax)
        plt.figsave('Customer_Role_Breakdown.png')
        print(crb_df.head(3))

    def geographical_breakdown(self):
        """ Displays a scatterplot of Sales/Revenue weights for different States."""
        self.df = self.df[self.df.Country_Name_Shipping== 'United States (US)']
        counts = self.df["State_Name_Shipping"].value_counts().to_dict()
        States = list(counts.keys())
        Count = list(counts.values())
        geo = pd.DataFrame({'States': States, 'Counts': Count})
        geo_dataframe = pd.DataFrame(geo)
        geo_dataframe.insert(loc=2, column="Sales_Total", value=0)
        geo_dataframe.insert(loc=3, column="Avg_Purchase_Revenue", value=0)
        for i, row in self.df.iterrows():
            state = row.loc['State_Name_Shipping']
            total = row.loc['Order_Total_Amount']
            idx = geo_dataframe[geo_dataframe["States"] == state].index.item()
            av = int(geo_dataframe.at[idx, 'Sales_Total']) / int(geo_dataframe.at[idx, 'Counts'])
            geo_dataframe.at[idx, 'Sales_Total'] += total
            geo_dataframe.at[idx, 'Avg_Purchase_Revenue'] = av
        # data visualization
        cds = ColumnDataSource(geo_dataframe)
        cds.data.keys()
        visual = figure(tools='box_zoom, pan, reset',
                        width=700, height=700,
                        title='Geographical Sales Breakdown',
                        y_axis_label='Order Quantity', x_axis_label='Revenue')
        visual.circle('Sales_Total', 'Counts', size=7, source=cds, name= 'States')
        visual.add_tools(HoverTool(tooltips=[("State", "@States"),
                                             ("Average Purchase Revenue", "@Avg_Purchase_Revenue")
                                             ]))
        output_file('geographical_breakdown.html')
        save(visual)
        show(visual)
        return print(geo_dataframe)


class ProductAnalysis:
    """Arms product analysis capability to a dataframe"""

    def __init__(self, frame):
        self.df = frame  # dataframe object
        self.analysis_frame = self.monthly_product_frame()
        self.time_span = self.serve_time_span()  # list of tuples: x[0] == year, x[1] == month for x in self.time_span

    def monthly_product_frame(self):
        """Analyzes the order lines in the CSV_Files folder and
        Returns a pandas Dataframe with monthly product statistics."""
        from datetime import datetime
        import information_repository as ir
        frame = self.df
        frame = frame[['Order_Date', 'Product_Name', 'Quantity', 'Item_Cost']]
        dict_list = []
        for i, row in frame.iterrows():
            row_date = row['Order_Date']
            row_date = datetime.strptime(row_date, "%Y-%m-%d %H:%M")
            row_date_month = row_date.month
            row_date_year = row_date.year
            raw_products = row['Product_Name'].replace('\r', '').split('\n')
            raw_quantities = row['Quantity'].replace('\r', '').split('\n')
            raw_cost = row['Item_Cost'].replace('\r', '').split('\n')
            for key in range(len(raw_products)):
                product = [i for i in ir.p_list if i in raw_products[key]][0]
                quantity = int(raw_quantities[key])
                revenue = float(raw_cost[key])
                dict_object = [product, quantity, revenue, row_date_month, row_date_year]
                matched_dictionary = [i for i in dict_list if
                                      i['name'] == dict_object[0] and i['month'] == dict_object[3]
                                      and i['year'] == dict_object[4]]
                if len(matched_dictionary) == 1:
                    matched_dictionary[0]['count'] += dict_object[1]
                    matched_dictionary[0]['revenue'] += dict_object[2]
                else:
                    dict_list.append({'name': dict_object[0], 'count': dict_object[1],
                                      'revenue': dict_object[2], 'month': dict_object[3], 'year': dict_object[4]})
        self.analysis_frame = pd.DataFrame(columns=['year', 'month', 'count', 'revenue', 'change_over_month', 'product'])
        time_span = []
        for product in ir.p_list:
            product_dictionaries = sorted(
                sorted([i for i in dict_list if i['name'] == product], key= x['month']
                       ), key= x['year'])
            data_list = []
            year_list = []
            month_list = []
            for key in range(len(product_dictionaries)):
                if key > 0:
                    try:
                        change_over_month = (100 - round(
                            ((product_dictionaries[key]['revenue'] / product_dictionaries[key]['count'])
                             / (product_dictionaries[key - 1]['revenue'] / product_dictionaries[key - 1][
                                        'count'])) * 100))

                    except IndexError:
                        print('change_list calls need to be refined')
                else:
                    change_over_month = 0

                row_list = [product_dictionaries[key]['year'], product_dictionaries[key]['month'],
                            product_dictionaries[key]['count'], product_dictionaries[key]['revenue'], change_over_month,
                            product_dictionaries[key]['name']]
                data_list.append(row_list)
                if product == 'Blue Moon':
                    month_list.append(product_dictionaries[key]['month'])
                    year_list.append(product_dictionaries[key]['year'])

            if product == 'Blue Moon':
                time_span = [*zip(year_list, month_list)]
            append_frame = pd.DataFrame(data=data_list,
                                        columns=['year', 'month', 'count', 'revenue', 'change_over_month', 'product'])
            self.analysis_frame = pd.concat([self.analysis_frame, append_frame], ignore_index=True)
        self.time_span = time_span
        return self.analysis_frame

    def highest_positive_product_change_over_month_analysis(self):
        """Analyzes the monthly_product_frame and returns the 5 products whose sales level increased the most"""
        year = int(input('Type the year you would like to query in yyyy format:  '))
        month = int(input('Type the month you would like to query:  '))
        data_slice = self.analysis_frame.loc[self.analysis_frame['month'] == month].loc[self.analysis_frame['year'] == year].loc[self.analysis_frame['revenue']>500]
        data_slice.sort_values(by='change_over_month', inplace=True, ascending=False)
        return print(data_slice.head(5))

    def highest_negative_product_change_over_month_analysis(self):
        """Analyzes the monthly_product_frame and returns the 5 products whose sales level decreased the most"""
        year = int(input('Type the year you would like to query in yyyy format:  '))
        month = int(input('Type the month you would like to query:  '))
        data_slice = self.analysis_frame.loc[self.analysis_frame['month'] == month].loc[self.analysis_frame['year'] == year].loc[self.analysis_frame['revenue']>500]
        data_slice.sort_values(by='change_over_month', inplace=True, ascending=True)
        return data_slice

    def product_line_change_over_month_analysis(self, year, month):
        """Analyzes the monthly_product_frame by product line and returns a dataframe with
        product line change over month data."""
        import information_repository as ir
        #year = int(input('Type the year you would like to query in yyyy format:  '))
        #month = int(input('Type the month you would like to query:  '))
        product_line_list_of_lists = [ir.tea_product_list, ir.capsule_product_list, ir.smokeable_product_list,
                             ir.skincare_product_list, ir.superfood_product_list, ir.honey_product_list,
                             ir.tincture_product_list]
        product_line_strings = ['Tea', 'Capsules', 'Smokeables', 'Skincare', 'Superfood', 'Honey', 'Tinctures']
        product_line_append_list = []
        line_index_counter = 0
        for product_line in product_line_list_of_lists:
            line_list = []
            line_list.append(year)
            line_list.append(month)
            data_slice = self.analysis_frame.loc[self.analysis_frame['month'] == month].loc[
                self.analysis_frame['year'] == year].loc[self.analysis_frame['product'].isin(product_line)]
            if month > 1:
                last_month_frame = self.analysis_frame.loc[self.analysis_frame['month'] == (month - 1)].loc[
                    self.analysis_frame['year'] == year].loc[self.analysis_frame['product'].isin(product_line)]
            else:
                last_month_frame = self.analysis_frame.loc[self.analysis_frame['month'] == 12].loc[
                    self.analysis_frame['year'] == (year - 1)].loc[self.analysis_frame['product'].isin(product_line)]
            last_month_revenue = last_month_frame['revenue'].sum()
            this_month_revenue = data_slice['revenue'].sum()
            avg_change_over_month = (this_month_revenue / last_month_revenue) * 100
            line_list.append(avg_change_over_month)
            product_line = product_line_strings[line_index_counter]
            line_index_counter += 1
            line_list.append(product_line)
            product_line_append_list.append(line_list)
        product_line_analysis_frame = pd.DataFrame(data=product_line_append_list,
                                                   columns=['year', 'month', 'avg_change_over_month',
                                                            'product_line'])
        product_line_analysis_frame.to_csv('product_line_csv_2021.csv')
        return product_line_analysis_frame

    def serve_time_span(self):
        """Returns a list of tuples of unique (year, month) pairs in chronological order based on the
         monthly_product_frame."""
        return sorted(sorted(list(set([*zip(self.analysis_frame['year'],self.analysis_frame['month'])])),
                            key=x[1]), key=x[0])

    def product_line_change_over_month_graph(self):
        """Using the product_line_change_over_month_analysis frame, it outputs a graph of the changes over time for
        the top product lines."""
        line_change_frame_data = []
        for i in self.time_span:
            month_frame = self.product_line_change_over_month_analysis(i[0], i[1])
            change_list = month_frame['avg_change_over_month']
            line_change_frame_data.append(change_list)
        treated_line_change_frame_data = []
        for i in range(len(line_change_frame_data)): #index of time period/segment
            if i ==0:
                treated_line_change_frame_data.append([self.time_span[i][0], self.time_span[i][1],
                                                       0,0,0,0,0,0,0]) #insert base amounts for the first month
            else: #function as intended
                month_cumulative_change_list = []
                month_cumulative_change_list.append(self.time_span[i][0])
                month_cumulative_change_list.append(self.time_span[i][1])# append year and month
                for x in range(len(line_change_frame_data[0])):
                    prior_change_list = [i[x] for i in line_change_frame_data]
                    product_cumulative_change = (100+treated_line_change_frame_data[i-1][x+2]) * ((prior_change_list[i]/100))-100
                    #i-1 for previous time period and x+2 for offset due to year and month category
                    month_cumulative_change_list.append(product_cumulative_change)
                treated_line_change_frame_data.append(month_cumulative_change_list)
        graph_frame = pd.DataFrame(data=treated_line_change_frame_data, columns=['Year', 'Month', 'Tea', 'Capsules', 'Smokeables','Skincare',
                                                                           'Superfood', 'Honey', 'Tinctures'])
        print(graph_frame.head(7))
        x = [str(i) for i in graph_frame['Month']]
        y1 = graph_frame['Tea']
        y2 = graph_frame['Capsules']
        y3 = graph_frame['Superfood']
        y4 = graph_frame['Honey']
        y5 = graph_frame['Smokeables']
        graph = figure(x_range=x,title='Cumulative Percentage Change of Product Lines',x_axis_label='Month', y_axis_label='Percentage Change')
        graph.line(x, y1, legend_label ='Tea', color='red', line_width=3)
        graph.line(x, y2, legend_label ='Capsules', color='blue', line_width=3)
        graph.line(x, y3, legend_label ='Superfood', color='orange', line_width=3)
        graph.line(x, y4, legend_label ='Honey', color='yellow', line_width=3)
        graph.line(x, y5, legend_label ='Smokeables', color='green', line_width=3)
        output_file('product_line_change_over_month.html')
        save(graph)
        return show(graph)


class InventoryPredictor:
    """Inventory volume prediction using a product sales csv as the raw data."""
    def __init__(self):
        import information_repository as ir
        self.unit_counts = self.sales_unit_count_dictionaries()
        self.ingredients = self.ingredient_dictionary()
        self.recipes = ir.unit_recipes

        print('initiating')
        pass

    def sales_unit_count_dictionaries(self):
        """Creates a set of dictionaries for each product and the cumulative quantity of units across all SKUs."""
        import information_repository as ir
        product_sales_frame = pd.read_csv('product_sales.csv')
        product_sales_frame = product_sales_frame.where(pd.notnull(product_sales_frame), 'None')
        product_unit_amounts = []
        for i in ir.p_list:
            product_dict = dict(name=i, quantity=0)
            for x, row in product_sales_frame.iterrows():
                if i in row['Product Name']:
                    if i in ir.tea_product_list:
                        if '1' in row['Variation Attributes']:
                            product_dict['quantity'] += row['Quantity Sold']
                        elif '3' in row['Variation Attributes']:
                            product_dict['quantity'] += row['Quantity Sold'] * 3
                        elif '20' in row['Variation Attributes']:
                            product_dict['quantity'] += row['Quantity Sold'] * 20
                        else:
                            pass
                            # print('Something unexpected occured', row['Product Name'], row['Variation Attributes'])
                    elif i in ir.superfood_product_list:
                        if '3' in row['Variation Attributes']:
                            product_dict['quantity'] += row['Quantity Sold']
                        elif '9' in row['Variation Attributes']:
                            product_dict['quantity'] += row['Quantity Sold'] * 3
                        else:
                            product_dict['quantity'] += 1
                    elif i in ir.capsule_product_list:
                        if '1' in row['Variation Attributes']:
                            product_dict['quantity'] += row['Quantity Sold']
                        if '4' in row['Variation Attributes']:
                            product_dict['quantity'] += row['Quantity Sold'] * 4
                    elif i in ir.smokeable_product_list:
                        if '7' in row['Variation Attributes']:
                            product_dict['quantity'] += row['Quantity Sold'] * 7
                        elif 'prerolls' in row['Variation Attributes']:
                            product_dict['quantity'] += row['Quantity Sold'] * 2
                        else:
                            product_dict['quantity'] += row['Quantity Sold'] * 4
                    elif i in ir.honey_product_list:
                        if '3' in row['Variation Attributes']:
                            product_dict['quantity'] += row['Quantity Sold'] * 3
                        elif '5' in row['Variation Attributes']:
                            product_dict['quantity'] += row['Quantity Sold'] * 5
                        elif '2' in row['Variation Attributes']:
                            pass
                            # print('Reminder that packet honeys and jars need to separate')
                    else:
                        product_dict['quantity'] += row['Quantity Sold']
            product_unit_amounts.append(product_dict)
        return product_unit_amounts

    def ingredient_dictionary(self):
        """Creates a ingredient dictionary with all ingredients as keys and the cumulative volume across all
        products as values."""
        inventory = pd.read_csv('craftybase-export-material.csv')
        ingredient_dictionary = {}
        for i in list(inventory['name']):
            ingredient_dictionary[i]=0
        return ingredient_dictionary

    def ingredient_volume_table(self):
        """Creates a csv with ingredients and the cumulative volume used across a time span."""
        for x in self.unit_counts:
            for y in self.recipes:
                if x['name'] == y['name']:
                    for k, v in y.items():
                        if k != 'name':
                            self.ingredients[k] += v * x['quantity']
        sorted_ingredient_volumes = sorted(self.ingredients.items(), key= x[1], reverse=True)
        output_frame = pd.DataFrame(data = sorted_ingredient_volumes, columns= ['Ingredient', 'Volume (gram or oz)'])
        output_frame = output_frame[output_frame['Volume (gram or oz)'] !=0]
        output_frame.to_csv('ingredient_volume_table.csv')