application.py 5.0 KB

##########
# IMPORT #
##########
# for Heroku
import os
from flask import Flask, render_template, request, jsonify
import numpy as np
from scipy import sparse
import json
import pandas as pd
########
# DATA #
########
comic_data_df = pd.read_csv("final_data/comic_data.csv")
tfidf_vectors = sparse.load_npz("final_data/tfidf_vectors.npz")
with open("final_data/tfidf_feature_names.txt", 'r') as filehandle:
    tfidf_feature_names = json.load(filehandle)
# column indices of every nonzero entry, i.e. which features occur in which comics
_, feature_idx = tfidf_vectors.nonzero()
# count how many comics each feature occurs in (assumes every feature occurs at least once)
_, feature_counts = np.unique(feature_idx, return_counts=True)
feature_counts = ["{} ({})".format(name, count) for name, count in zip(tfidf_feature_names, feature_counts)]
word_data_all_summed = tfidf_vectors.sum(axis=0)
word_data_all_summed = np.squeeze(np.asarray(word_data_all_summed))
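# Loaded / precomputed at import time:
#   comic_data_df        - one row of metadata per comic
#   tfidf_vectors        - sparse tf-idf matrix, comics in rows and vocabulary features in columns
#   tfidf_feature_names  - feature (word) names, aligned with the columns of tfidf_vectors
#   word_data_all_summed - total tf-idf value of each word over all comics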
#######
# APP #
#######
app = Flask(__name__)
@app.route('/')
@app.route('/index')
@app.route('/home')
def homepage():
    comic_data = comic_data_df.to_dict(orient='records')
    comic_data = json.dumps(comic_data, indent=2)
    return_comic_data = {'comic_data': comic_data}
    feature_names = json.dumps(feature_counts)
    return_feature_names = {'feature_names': feature_names}
    return render_template('index.html',
                           return_comic_data=return_comic_data,
                           num_comics=comic_data_df.shape[0],
                           return_feature_names=return_feature_names,
                           num_features=len(feature_counts))
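# Example request body for /barchart-data (inferred from the handler below;
# comic serial numbers are 1-based):
#   {"picked_sn": 3, "selected_sn": [1, 2, 3]}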
@app.route('/barchart-data', methods=['POST'])
def barchart_data():
    if request.method == 'POST':
        # convert 1-based comic serial numbers from the client to 0-based row indices
        picked_idx = [request.json['picked_sn'] - 1]
        selected_idx = request.json['selected_sn']
        selected_idx = [num - 1 for num in selected_idx]
        barchart_data = calc_barchart_data(picked_idx, selected_idx)
        return barchart_data
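# Example request body for /feature-data (inferred from the handler below;
# feature indices are 0-based columns of tfidf_vectors):
#   {"feature_idx_list": [0, 42]}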
@app.route('/feature-data', methods=['POST'])
def feature_data():
    if request.method == 'POST':
        feature_idx_list = request.json['feature_idx_list']
        feature_data = calc_feature_distribution(feature_idx_list)
        return feature_data
####################
# HELPER FUNCTIONS #
####################
def calc_feature_distribution(feature_idx_list):
    """
    For the selected features, split the comics that contain them into those
    where only one selected feature occurs ('single') and those where more
    than one occurs ('both'), returned as 1-based comic serial numbers.
    """
    # get the columns of the selected features
    feature_idx_list = [int(i) for i in feature_idx_list]
    selected_features = tfidf_vectors[:, feature_idx_list]
    # get the comics (rows) where the selected features are nonzero
    nonzero_comics, _ = selected_features.nonzero()
    nonzero_comics, comic_counts = np.unique(nonzero_comics, return_counts=True)
    # back to 1-based comic serial numbers
    nonzero_comics = nonzero_comics + 1
    single = []
    both = []
    for idx, comic_sn in enumerate(nonzero_comics):
        if comic_counts[idx] > 1:
            both.append(str(comic_sn))
        else:
            single.append(str(comic_sn))
    feature_dict = dict(single=single, both=both)
    feature_data = json.dumps([feature_dict])
    return feature_data
def calc_barchart_data(picked_idx, selected_idx):
    """
    Get the top words and their tf-idf values for the selected comics,
    then look up the values of those same words for the picked comic and
    for all comics.
    Each word_data starts as a 'scipy.sparse.csr.csr_matrix'; summing over
    comics gives the total tf-idf value per word as a 'numpy.matrix', which
    is then converted to a 'numpy.ndarray'.
    """
    len_output = 30
    word_data_picked = get_summed_tfidf(picked_idx, len_output)
    word_data_selected = get_summed_tfidf(selected_idx, len_output)
    word_data_all = get_summed_tfidf([-1], len_output)
    # indices of the top len_output words for the selected comics, sorted ascending by value
    top_word_idxs_selected = np.argpartition(word_data_selected, -len_output)[-len_output:]
    top_word_idxs_selected = top_word_idxs_selected[np.argsort(word_data_selected[top_word_idxs_selected])]
    top_word_vals_selected = word_data_selected[top_word_idxs_selected]
    top_word_vals_picked = word_data_picked[top_word_idxs_selected]
    top_word_vals_all = word_data_all[top_word_idxs_selected]
    top_words_selected = [tfidf_feature_names[i] for i in top_word_idxs_selected]
    # labels = ["word", "tfidf"]
    labels = ["name", "value"]
    # each "value" is a (picked, selected, all) triple for one word
    top_word_vals = zip(top_word_vals_picked, top_word_vals_selected, top_word_vals_all)
    tfidf_zipped = zip(top_words_selected, top_word_vals)
    tfidf_dict = [dict(zip(labels, row)) for row in tfidf_zipped]
    barchart_data = json.dumps([tfidf_dict])
    return barchart_data
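# get_summed_tfidf: total tf-idf value per word over a set of comic rows.
#   idx_list == []   -> a zero vector of length len_output
#   idx_list == [-1] -> the precomputed sum over all comics (word_data_all_summed)
#   otherwise        -> the sum of the selected rows of tfidf_vectors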
def get_summed_tfidf(idx_list, len_output):
    if len(idx_list) == 0:
        return np.zeros(len_output)
    elif idx_list[0] == -1:
        return word_data_all_summed
    else:
        word_data = tfidf_vectors[idx_list, :]
        word_data = word_data.sum(axis=0)
        word_data = np.squeeze(np.asarray(word_data))
        return word_data
########
# MAIN #
########
if __name__ == "__main__":
    # print("comic_data_df: ", comic_data_df.shape)
    # print("tfidf_vectors: ", tfidf_vectors.shape)
    # print("tfidf_feature_names: ", len(tfidf_feature_names))
    # app.run(debug=True)
    # on Heroku, bind to 0.0.0.0 and read the port number from an environment variable
    HOST = '0.0.0.0' if 'PORT' in os.environ else '127.0.0.1'
    PORT = int(os.environ.get('PORT', 5000))
    app.run(host=HOST, port=PORT)
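# To run locally (assuming the flask / numpy / scipy / pandas dependencies are installed):
#   python application.py
# Heroku sets PORT in the environment, so the app binds to 0.0.0.0 there instead.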