123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163 |
- ##########
- # IMPORT #
- ##########
- # for Heroku
- import os
- from flask import Flask, render_template, request, jsonify
- import numpy as np
- from scipy import sparse
- import json
- import pandas as pd
- ########
- # DATA #
- ########
# Table of comics, one row per comic; its row count is reported to the
# template as ``num_comics`` and its rows are serialised into the home page.
comic_data_df = pd.read_csv("final_data/comic_data.csv")

# Sparse TF-IDF matrix: rows are comics, columns are features (words).
# NOTE(review): row i is presumably the comic with serial number i + 1, in the
# same order as comic_data_df -- confirm against the data-preparation script.
tfidf_vectors = sparse.load_npz("final_data/tfidf_vectors.npz")

# Feature names are stored as a JSON list; column j of tfidf_vectors is
# looked up as tfidf_feature_names[j].
with open("final_data/tfidf_feature_names.txt", 'r', encoding="utf-8") as filehandle:
    tfidf_feature_names = json.load(filehandle)

# Number of comics in which each feature has a non-zero weight.
# bincount with minlength keeps one slot per column, so a feature that never
# occurs gets a count of 0 instead of being dropped (np.unique would skip it
# and shift every later count onto the wrong feature name).
_, _nonzero_cols = tfidf_vectors.nonzero()
_comics_per_feature = np.bincount(_nonzero_cols, minlength=tfidf_vectors.shape[1])

# Display labels of the form "name (count)", one per feature.
feature_counts = ["{} ({})".format(name, count)
                  for name, count in zip(tfidf_feature_names, _comics_per_feature)]

# Total TF-IDF weight of every feature over all comics, as a flat 1-D array.
# Precomputed once because it is reused by every bar-chart request.
word_data_all_summed = np.asarray(tfidf_vectors.sum(axis=0)).ravel()

#######
# APP #
#######
app = Flask(__name__)
@app.route('/')
@app.route('/index')
@app.route('/home')
def homepage():
    """Render the landing page with the comic table and feature labels.

    The comic records and the feature labels are each serialised to a JSON
    string and wrapped in a single-key dict before being handed to the
    ``index.html`` template, together with the number of comics and the
    number of features.
    """
    comic_records = comic_data_df.to_dict(orient='records')

    template_context = {
        'return_comic_data': {
            'comic_data': json.dumps(comic_records, indent=2),
        },
        'num_comics': comic_data_df.shape[0],
        'return_feature_names': {
            'feature_names': json.dumps(feature_counts),
        },
        'num_features': len(feature_counts),
    }
    return render_template('index.html', **template_context)
@app.route('/barchart-data', methods=['POST'])
def barchart_data():
    """Return bar-chart data for one picked comic and a set of selected comics.

    Expects a JSON body with:
        picked_sn:   serial number of the picked comic.
        selected_sn: list of serial numbers of the selected comics.

    Serial numbers are shifted down by one to obtain row indices into the
    TF-IDF matrix. The response is the JSON string produced by
    ``calc_barchart_data``, returned as-is.

    The route is registered for POST only, so no method check is needed here.

    NOTE(review): a serial number of 0 becomes index -1, which
    ``get_summed_tfidf`` interprets as "all comics" when it is the first
    entry -- confirm the client never sends 0.
    """
    payload = request.json
    picked_idx = [payload['picked_sn'] - 1]
    selected_idx = [sn - 1 for sn in payload['selected_sn']]
    return calc_barchart_data(picked_idx, selected_idx)
@app.route('/feature-data', methods=['POST'])
def feature_data():
    """Return which comics contain the requested features.

    Expects a JSON body with:
        feature_idx_list: list of feature (column) indices.

    The response is the JSON string produced by
    ``calc_feature_distribution``, returned as-is.

    The route is registered for POST only, so no method check is needed here.
    """
    feature_idx_list = request.json['feature_idx_list']
    return calc_feature_distribution(feature_idx_list)
- ####################
- # HELPER FUNCTIONS #
- ####################
def calc_feature_distribution(feature_idx_list):
    """Split comics by how many of the selected features they contain.

    Args:
        feature_idx_list: iterable of feature (column) indices; each entry is
            converted with ``int`` so numeric strings are accepted. Repeated
            indices are counted once.

    Returns:
        A JSON string encoding a one-element list holding a dict with
            "single": serial numbers (as strings) of comics containing
                      exactly one of the selected features,
            "both":   serial numbers (as strings) of comics containing two
                      or more of the selected features.
        Serial numbers are row index + 1, in ascending order.
    """
    # Deduplicate: selecting the same column twice would count one feature
    # twice and wrongly push a comic into the "both" bucket.
    feature_cols = sorted({int(i) for i in feature_idx_list})
    if not feature_cols:
        return json.dumps([dict(single=[], both=[])])

    # Rows (comics) with a non-zero weight in any of the selected columns;
    # a row appears once per selected feature it contains.
    comic_rows, _ = tfidf_vectors[:, feature_cols].nonzero()
    comic_rows, hits_per_comic = np.unique(comic_rows, return_counts=True)

    # Row indices are 0-based, serial numbers are 1-based.
    serial_numbers = comic_rows + 1
    has_several = hits_per_comic > 1

    single = [str(sn) for sn in serial_numbers[~has_several].tolist()]
    both = [str(sn) for sn in serial_numbers[has_several].tolist()]

    return json.dumps([dict(single=single, both=both)])
def calc_barchart_data(picked_idx, selected_idx, len_output=30):
    """Build bar-chart data for the top words of the selected comics.

    The summed TF-IDF weights of the selected comics decide which words are
    shown; for each of those words the picked comic's weight and the weight
    over all comics are reported alongside.

    Args:
        picked_idx: list of row indices of the picked comic(s).
        selected_idx: list of row indices of the selected comics.
        len_output: maximum number of words to return (default 30). It is
            capped at the length of the selected-comics weight vector.

    Returns:
        A JSON string encoding a one-element list that holds a list of
        ``{"name": word, "value": [picked, selected, all]}`` entries, ordered
        by ascending selected weight.
    """
    word_data_picked = get_summed_tfidf(picked_idx, len_output)
    word_data_selected = get_summed_tfidf(selected_idx, len_output)
    word_data_all = get_summed_tfidf([-1], len_output)

    # argpartition rejects a k larger than the array, so never ask for more
    # words than there are weights to rank.
    top_n = max(0, min(len_output, len(word_data_selected)))
    if top_n > 0:
        # Pick the top_n largest weights without sorting the whole vector,
        # then sort only those by ascending weight.
        top_word_idxs = np.argpartition(word_data_selected, -top_n)[-top_n:]
        top_word_idxs = top_word_idxs[np.argsort(word_data_selected[top_word_idxs])]
    else:
        top_word_idxs = np.array([], dtype=int)

    top_vals_selected = word_data_selected[top_word_idxs]
    top_vals_picked = word_data_picked[top_word_idxs]
    top_vals_all = word_data_all[top_word_idxs]
    top_words = [tfidf_feature_names[i] for i in top_word_idxs]

    # Plain floats keep the payload serialisable whatever the matrix dtype.
    tfidf_rows = [
        {"name": word, "value": [float(picked), float(selected), float(overall)]}
        for word, picked, selected, overall
        in zip(top_words, top_vals_picked, top_vals_selected, top_vals_all)
    ]
    return json.dumps([tfidf_rows])
def get_summed_tfidf(idx_list, len_output):
    """Return the per-feature TF-IDF weight summed over the given comics.

    Args:
        idx_list: list of row indices into ``tfidf_vectors``. An empty list
            yields all zeros; a list whose first entry is -1 is a sentinel
            meaning "all comics".
        len_output: unused; kept so existing callers keep working.

    Returns:
        A 1-D array with one entry per feature. For the -1 sentinel this is
        the shared precomputed ``word_data_all_summed`` array itself, not a
        copy, so callers should not modify it in place.
    """
    if len(idx_list) == 0:
        # One zero per feature, so the result can be indexed with the same
        # feature indices as the vectors returned by the other branches.
        return np.zeros(tfidf_vectors.shape[1])

    if idx_list[0] == -1:
        return word_data_all_summed

    summed = tfidf_vectors[idx_list, :].sum(axis=0)
    # The sparse sum comes back 2-D (1 x n_features); flatten it to 1-D.
    return np.asarray(summed).ravel()
- ########
- # MAIN #
- ########
if __name__ == "__main__":
    # Debugging aids kept from the original script:
    # print("comic_data_df: ", comic_data_df.shape)
    # print("tfidf_vectors: ", tfidf_vectors.shape)
    # print("tfidf_feature_names: ", len(tfidf_feature_names))
    # app.run(debug=True)

    # On Heroku the PORT environment variable is set: bind to every
    # interface and use that port. Otherwise serve on localhost:5000.
    if 'PORT' in os.environ:
        HOST = '0.0.0.0'
    else:
        HOST = '127.0.0.1'
    PORT = int(os.environ.get('PORT', 5000))

    app.run(host=HOST, port=PORT)
|