LiuFan
/
PrivacyScanData


			
				
					
						
						
							1234567891011121314151617181920212223242526272829303132
							def calc_barchart_data(picked_idx, selected_idx, len_output=30):
							    """
							    Build the JSON payload for the top-words bar chart.

							    The top ``len_output`` words are chosen by their summed tf-idf value
							    for ``selected_idx`` and ordered ascending by that value. For each of
							    those words the summed tf-idf values for ``picked_idx`` and for the
							    ``[-1]`` selection are looked up as well.

							    The summed vectors come from ``get_summed_tfidf``; they are assumed
							    to be 1-D ``numpy.ndarray`` objects indexed by feature position
							    (TODO confirm against ``get_summed_tfidf``).

							    :param picked_idx: index selection forwarded to ``get_summed_tfidf``.
							    :param selected_idx: index selection forwarded to
							        ``get_summed_tfidf``; its values decide which words are returned.
							    :param len_output: maximum number of words to return (default 30).
							        Fewer are returned when fewer features exist.
							    :return: JSON string encoding a one-element list that holds a list of
							        ``{"name": word, "value": [picked, selected, all]}`` records.
							    """
							    word_data_picked = get_summed_tfidf(picked_idx, len_output)
							    word_data_selected = get_summed_tfidf(selected_idx, len_output)
							    # NOTE(review): [-1] presumably requests the sum over every document;
							    # confirm against get_summed_tfidf.
							    word_data_all = get_summed_tfidf([-1], len_output)

							    # np.argpartition raises ValueError when kth is out of bounds, so
							    # never ask for more words than there are features.
							    n_top = max(0, min(len_output, len(word_data_selected)))
							    if n_top:
							        top_word_idxs = np.argpartition(word_data_selected, -n_top)[-n_top:]
							        # argpartition leaves the top slice unordered; sort it ascending
							        # by the selected tf-idf value.
							        order = np.argsort(word_data_selected[top_word_idxs])
							        top_word_idxs = top_word_idxs[order]
							    else:
							        top_word_idxs = np.array([], dtype=int)

							    # .tolist() turns NumPy scalars into plain Python numbers so that
							    # json.dumps accepts them whatever the array dtype is.
							    vals_picked = word_data_picked[top_word_idxs].tolist()
							    vals_selected = word_data_selected[top_word_idxs].tolist()
							    vals_all = word_data_all[top_word_idxs].tolist()
							    words = [tfidf_feature_names[i] for i in top_word_idxs]

							    records = [
							        {"name": word, "value": [picked, selected, overall]}
							        for word, picked, selected, overall in zip(
							            words, vals_picked, vals_selected, vals_all
							        )
							    ]

							    # The consumer expects the record list wrapped in an outer list.
							    return json.dumps([records])