import numpy as np
import time
import sys
from sklearn.preprocessing import OneHotEncoder
from sklearn.datasets import make_moons, make_blobs
from .activelearner import *
from .gbssl import *
import mlflow

BMODELNAMES = ['gr', 'log', 'probitnorm']
MMODELNAMES = ['mgr', 'ce']
OTHERMODELNAMES = ['rkhs', 'hf']
ACQS = ['mc', 'uncertainty', 'rand', 'vopt', 'sopt', 'mbr', 'mcgreedy', 'mcavg', 'mcavgf', 'mcf']

def create_checkerboard2(N):
    '''
    Sample N points uniformly from the unit square and label them with a
    binary 4x4 checkerboard pattern (cells of width 0.25, classes alternating).
    '''
    X = np.random.rand(N, 2)
    labels = []
    for x in X:
        # index of the 0.25-wide cell each coordinate falls in
        i, j = 0, 0
        if 0.25 <= x[0] < 0.5:
            i = 1
        elif 0.5 <= x[0] < 0.75:
            i = 2
        elif 0.75 <= x[0]:
            i = 3
        if 0.25 <= x[1] < 0.5:
            j = 1
        elif 0.5 <= x[1] < 0.75:
            j = 2
        elif 0.75 <= x[1]:
            j = 3
        labels.append((i + j) % 2)
    return X, np.array(labels)

def create_binary_clusters():
    '''
    Create a 2D binary toy dataset: two moons, five Gaussian blobs with
    alternating classes, two larger Gaussian clusters, and two noisy line
    segments. Returns the stacked points X and their binary labels.
    '''
    np.random.seed(4)
    Xm, labelsm = make_moons(200, shuffle=False, noise=0.12)
    X1, labels1 = make_blobs([50, 60, 40, 30, 40], 2, shuffle=False,
                             centers=[[1.6, -1.3], [1.3, 1.7], [0.5, 2.4], [0.2, -1.], [-1.7, 2.2]],
                             cluster_std=[.26, .23, .23, .26, .23])
    labels1 = labels1 % 2
    X2 = np.random.randn(100, 2) @ np.array([[.4, 0.], [0., .3]]) + np.array([-1.5, -.8])
    X3 = np.random.randn(70, 2) @ np.array([[.4, 0.], [0., .3]]) + np.array([2.5, 2.8])
    x11, x12 = np.array([-2., 0.8])[np.newaxis, :], np.array([-.2, 2.])[np.newaxis, :]
    l1 = (x11 + np.linspace(0, 1, 80)[:, np.newaxis] @ (x12 - x11)) + np.random.randn(80, 2) * 0.18
    x21, x22 = np.array([2.5, -1.5])[np.newaxis, :], np.array([2.5, 2.])[np.newaxis, :]
    l2 = (x21 + np.linspace(0, 1, 90)[:, np.newaxis] @ (x22 - x21)) + np.random.randn(90, 2) * 0.2
    X = np.concatenate((Xm, X1, X2, X3, l1, l2))
    labels = np.concatenate((labelsm, labels1, np.zeros(100), np.ones(70), np.ones(80), np.zeros(90)))
    return X, labels

def create_checkerboard3(N):
    '''
    Sample N points uniformly from the unit square and label them with a
    3-class checkerboard pattern on a 3x3 grid (cells of width 1/3); the nine
    cell indices are remapped so that each class covers three cells.
    '''
    X = np.random.rand(N, 2)
    labels = []
    for x in X:
        # index of the (1/3)-wide cell each coordinate falls in
        i, j = 0, 0
        if 0.33333 <= x[0] < 0.66666:
            i = 1
        elif 0.66666 <= x[0]:
            i = 2
        if 0.33333 <= x[1] < 0.66666:
            j = 1
        elif 0.66666 <= x[1]:
            j = 2
        labels.append(3 * j + i)
    labels = np.array(labels)
    # collapse the nine cell indices down to three classes
    labels[labels == 4] = 0
    labels[labels == 8] = 0
    labels[labels == 5] = 1
    labels[labels == 6] = 1
    labels[labels == 3] = 2
    labels[labels == 7] = 2
    return X, labels
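
# Illustrative usage sketch (not part of the original module): generate each of
# the toy datasets above and report their sizes and class balances. Only numpy
# (imported above) is assumed; the helper name `_demo_toy_datasets` is ours.
def _demo_toy_datasets(N=1200):
    datasets = {
        'checkerboard2': create_checkerboard2(N),
        'binary_clusters': create_binary_clusters(),
        'checkerboard3': create_checkerboard3(N),
    }
    for name, (X, y) in datasets.items():
        counts = np.bincount(np.asarray(y, dtype=int))
        print("%s: X %s, class counts %s" % (name, str(X.shape), str(counts)))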

def run_binary(w, v, tau, gamma, oracle, init_labeled, num_al_iters, B_per_al_iter, modelname='gr', acq='mc',
               cand='rand', select_method='top', full=False,
               verbose=False):
    '''
    Run binary graph-based SSL active learning.

    Inputs:
      w = eigenvalue numpy array
      v = eigenvector numpy array (eigenvectors as columns)
      tau, gamma = model hyperparameters
      oracle = ground-truth labels numpy array, in {0, 1} or {-1, 1}
      init_labeled = list of indices that are initially labeled, per the ordering in oracle and the rows of v
      num_al_iters = total number of active learning iterations to perform
      B_per_al_iter = batch size B used on each iteration
      modelname = binary model to use (one of BMODELNAMES)
      acq = acquisition function to use in this experiment (one of ACQS)

    Outputs:
      Nothing is returned. The initial accuracy is logged to mlflow, and the
      per-iteration choices, accuracies, and timings are saved to
      'tmp/iter_stats.npz' and logged as an mlflow artifact.
    '''
    if modelname not in BMODELNAMES:
        raise ValueError("modelname %s not in list of possible modelnames : \n%s" % (
            modelname, str(BMODELNAMES)))
    if acq not in ACQS:
        raise ValueError(
            "acq = %s is not a valid acquisition function currently implemented:\n\t%s" % (acq, str(ACQS)))

    N, M = v.shape
    truncated = M < N  # fewer eigenvectors than points means a truncated spectrum

    # map {0, 1} labels to {-1, 1}, as expected by the binary models
    if -1 not in np.unique(oracle):
        oracle[oracle == 0] = -1

    if truncated and not full:
        print("Binary %s Reduced Model -- i.e. not storing full C covariance matrix" % modelname)
        model = BinaryGraphBasedSSLModelReduced(
            modelname, gamma, tau, w=w, v=v)
    elif truncated and full:
        print("Binary %s FULL Model, but Truncated eigenvalues" % modelname)
        model = BinaryGraphBasedSSLModel(modelname, gamma, tau, w=w, v=v)
    else:
        print("Binary %s FULL Model, with ALL eigenvalues" % modelname)
        model = BinaryGraphBasedSSLModel(modelname, gamma, tau, w=w, v=v)

    # train the initial model, record accuracy
    model.calculate_model(labeled=init_labeled[:], y=list(oracle[init_labeled]))
    acc = get_acc(model.m, oracle, unlabeled=model.unlabeled)[1]
    mlflow.log_metric('init_acc', acc)

    # instantiate ActiveLearner object
    print("ActiveLearner Settings:\n\tacq = \t%s\n\tcand = \t%s" % (acq, cand))
    print("\tselect_method = %s, B = %d" % (select_method, B_per_al_iter))
    AL = ActiveLearner(acquisition=acq, candidate=cand)

    iter_acc = []
    iter_time = []
    al_choices = []
    for al_iter in range(num_al_iters):
        if verbose or (al_iter % 10 == 0):
            print("AL Iteration %d, acc=%1.6f" % (al_iter + 1, acc))

        # select query points via active learning
        tic = time.perf_counter()
        Q = AL.select_query_points(
            model, B_per_al_iter, method=select_method, verbose=verbose)
        toc = time.perf_counter()

        # query oracle
        yQ = list(oracle[Q])

        # update model, and calculate updated model's accuracy
        model.update_model(Q, yQ)
        acc = get_acc(model.m, oracle, unlabeled=model.unlabeled)[1]
        iter_acc.append(acc)
        iter_time.append(toc - tic)
        al_choices.append(Q)

    np.savez('tmp/iter_stats.npz', al_choices=np.array(al_choices),
             iter_acc=np.array(iter_acc), iter_time=np.array(iter_time))
    mlflow.log_artifact('tmp/iter_stats.npz')
    return
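
# Hypothetical end-to-end sketch (not from the original code): build a k-NN
# graph Laplacian for the binary clusters toy data, take a truncated
# eigendecomposition, and call run_binary with the 'gr' model. The graph
# construction and the hyperparameter values (k=10 neighbors, 50 eigenpairs,
# tau=gamma=0.1) are illustrative assumptions, not the original experiment's settings.
def _example_run_binary(num_al_iters=10, B_per_al_iter=5):
    import os
    from sklearn.neighbors import kneighbors_graph
    from scipy.sparse.csgraph import laplacian
    from scipy.sparse.linalg import eigsh

    X, labels = create_binary_clusters()
    oracle = labels.astype(int)
    # symmetrized k-NN adjacency and normalized graph Laplacian
    W = kneighbors_graph(X, n_neighbors=10, mode='connectivity')
    W = 0.5 * (W + W.T)
    L = laplacian(W, normed=True)
    # truncated spectrum (M < N), so run_binary uses the Reduced model
    w, v = eigsh(L, k=50, which='SM')
    # one labeled seed per class
    init_labeled = [int(np.where(oracle == c)[0][0]) for c in (0, 1)]
    os.makedirs('tmp', exist_ok=True)  # run_binary writes tmp/iter_stats.npz
    run_binary(w, v, tau=0.1, gamma=0.1, oracle=oracle, init_labeled=init_labeled,
               num_al_iters=num_al_iters, B_per_al_iter=B_per_al_iter,
               modelname='gr', acq='uncertainty')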

def run_rkhs_hf(oracle, init_labeled, num_al_iters, B_per_al_iter, modelname='rkhs', h=0.1, delta=0.1, X=None, L=None,
                cand='rand', select_method='top', acq='db', verbose=False):
    '''
    Run active learning with the RKHS classifier or the harmonic functions (HF) model.

    Inputs:
      oracle = ground-truth labels numpy array, in {0, 1, ..., n_c - 1} or {-1, 1}
      init_labeled = list of indices that are initially labeled, per the ordering in oracle
      num_al_iters = total number of active learning iterations to perform
      B_per_al_iter = batch size B used on each iteration
      modelname = 'rkhs' (requires X) or 'hf' (requires L)
      h = kernel bandwidth for the RKHS classifier
      delta = parameter for the HF graph-based model
      X = dataset (required when modelname == 'rkhs')
      L = graph Laplacian (required otherwise)

    Outputs:
      Nothing is returned. The initial accuracy is logged to mlflow, and the
      per-iteration choices, accuracies, and timings are saved to
      'tmp/iter_stats.npz' and logged as an mlflow artifact.
    '''
    if modelname == 'rkhs':
        assert X is not None
        model = RKHSClassifier(X, sigma=h)  # bandwidth from Karzand paper
    else:
        assert L is not None
        model = HFGraphBasedSSLModel(delta, L)

    # train the initial model, record accuracy
    if len(np.unique(oracle)) > 2:
        # calculate one-hot labels for oracle
        enc = OneHotEncoder()
        enc.fit(oracle.reshape((-1, 1)))
        oracle_onehot = enc.transform(oracle.reshape((-1, 1))).todense()
        y_init = oracle_onehot[init_labeled]
    else:
        # binary case: map {0, 1} labels to {-1, 1}
        if -1 not in np.unique(oracle):
            oracle[oracle == 0] = -1
        y_init = list(oracle[init_labeled])

    model.calculate_model(labeled=init_labeled[:], y=y_init)
    if model.nc > 2:
        acc = get_acc_multi(np.argmax(model.f, axis=1),
                            oracle, unlabeled=model.unlabeled)[1]
    else:
        acc = get_acc(model.f, oracle, unlabeled=model.unlabeled)[1]
    mlflow.log_metric('init_acc', acc)

    # instantiate ActiveLearner object
    print("ActiveLearner Settings:\n\t{} {}".format(modelname.upper(), acq.upper()))
    print("\tselect_method = %s, B = %d" % (select_method, B_per_al_iter))
    AL = ActiveLearner(acquisition=acq, candidate=cand)

    iter_acc = []
    iter_time = []
    al_choices = []
    for al_iter in range(num_al_iters):
        if verbose or (al_iter % 10 == 0):
            print("AL Iteration %d, acc=%1.6f" % (al_iter + 1, acc))

        # select query points via active learning
        tic = time.perf_counter()
        Q = AL.select_query_points(
            model, B_per_al_iter, method=select_method, verbose=verbose)
        toc = time.perf_counter()

        # query oracle
        if model.nc > 2:
            yQ = oracle_onehot[Q]
        else:
            yQ = list(oracle[Q])

        # update model, and calculate updated model's accuracy
        model.update_model(Q, yQ)
        if model.nc > 2:
            acc = get_acc_multi(np.argmax(model.f, axis=1),
                                oracle, unlabeled=model.unlabeled)[1]
        else:
            acc = get_acc(model.f, oracle, unlabeled=model.unlabeled)[1]
        iter_acc.append(acc)
        iter_time.append(toc - tic)
        al_choices.append(Q)

    np.savez('tmp/iter_stats.npz', al_choices=np.array(al_choices),
             iter_acc=np.array(iter_acc), iter_time=np.array(iter_time))
    mlflow.log_artifact('tmp/iter_stats.npz')
    return
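
# Hypothetical sketch (not from the original code): RKHS active learning with
# the 'db' acquisition on the binary clusters toy data. The bandwidth h=0.1 and
# the batch sizes below are illustrative assumptions.
def _example_run_rkhs(num_al_iters=10, B_per_al_iter=5):
    import os
    X, labels = create_binary_clusters()
    oracle = labels.astype(int)
    # one labeled seed per class
    init_labeled = [int(np.where(oracle == c)[0][0]) for c in (0, 1)]
    os.makedirs('tmp', exist_ok=True)  # run_rkhs_hf writes tmp/iter_stats.npz
    run_rkhs_hf(oracle, init_labeled, num_al_iters, B_per_al_iter,
                modelname='rkhs', h=0.1, X=X, acq='db')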

def run_multi(w, v, tau, gamma, oracle, init_labeled, num_al_iters, B_per_al_iter,
              modelname='mgr', acq='mc', cand='rand', select_method='top', full=False,
              verbose=False):
    '''
    Run multiclass graph-based SSL active learning.

    Inputs:
      w = eigenvalue numpy array
      v = eigenvector numpy array (eigenvectors as columns)
      tau, gamma = model hyperparameters
      oracle = ground-truth labels numpy array, in {0, 1, ..., n_c - 1}
      init_labeled = list of indices that are initially labeled, per the ordering in oracle and the rows of v
      num_al_iters = total number of active learning iterations to perform
      B_per_al_iter = batch size B used on each iteration
      modelname = multiclass model to use (one of MMODELNAMES)
      acq = acquisition function to use in this experiment (one of ACQS)

    Outputs:
      Nothing is returned. The initial accuracy is logged to mlflow, and the
      per-iteration choices, accuracies, and timings are saved to
      'tmp/iter_stats.npz' and logged as an mlflow artifact.
    '''
    if modelname not in MMODELNAMES:
        raise ValueError("modelname %s not in list of possible modelnames : \n%s" % (
            modelname, str(MMODELNAMES)))
    if acq not in ACQS:
        raise ValueError(
            "acq = %s is not a valid acquisition function currently implemented:\n\t%s" % (acq, str(ACQS)))

    N, M = v.shape
    truncated = M < N  # fewer eigenvectors than points means a truncated spectrum

    if modelname == 'mgr':  # multiclass GR is implemented in the Binary model since it requires the same storage structure
        if truncated and not full:
            print(
                "Multi %s Reduced Model -- i.e. not storing full C covariance matrix" % modelname)
            model = BinaryGraphBasedSSLModelReduced(
                modelname, gamma, tau, w=w, v=v)
        elif truncated and full:
            print("Multi %s FULL Model, but Truncated eigenvalues" % modelname)
            model = BinaryGraphBasedSSLModel(modelname, gamma, tau, w=w, v=v)
        else:
            print("Multi %s FULL Model, with ALL eigenvalues" % modelname)
            model = BinaryGraphBasedSSLModel(modelname, gamma, tau, w=w, v=v)
    else:
        print("Multi %s Reduced Model -- i.e. not storing full C covariance matrix" % modelname)
        model = CrossEntropyGraphBasedSSLModelReduced(gamma, tau, w=w, v=v)

    # calculate one-hot labels for oracle
    enc = OneHotEncoder()
    enc.fit(oracle.reshape((-1, 1)))
    oracle_onehot = enc.transform(oracle.reshape((-1, 1))).todense()

    # train the initial model, record accuracy
    model.calculate_model(
        labeled=init_labeled[:], y=oracle_onehot[init_labeled])
    acc = get_acc_multi(np.argmax(model.m, axis=1),
                        oracle, unlabeled=model.unlabeled)[1]
    mlflow.log_metric('init_acc', acc)

    # instantiate ActiveLearner object
    print("ActiveLearner Settings:\n\tacq = \t%s\n\tcand = \t%s" % (acq, cand))
    print("\tselect_method = %s, B = %d" % (select_method, B_per_al_iter))
    AL = ActiveLearner(acquisition=acq, candidate=cand)

    iter_acc = []
    iter_time = []
    al_choices = []
    beta = 0.
    for al_iter in range(num_al_iters):
        # print every iteration (the binary runners print only every 10th)
        print("AL Iteration %d, acc=%1.6f" % (al_iter + 1, acc))

        if acq in ['mcavg', 'mcavgf']:
            # beta decays linearly from 1 to 0 over the course of the run;
            # alternative schedules left from experimentation:
            #   beta = 1. / (1. + al_iter // 10)
            #   beta = 1.0 if al_iter < 8 else 0.0
            beta = max(0.0, 1. - (al_iter / float(num_al_iters)))
            print("\tbeta = {:.3f}".format(beta))

        # select query points via active learning
        tic = time.perf_counter()
        Q = AL.select_query_points(
            model, B_per_al_iter, method=select_method, verbose=verbose, mcavg_beta=beta)
        toc = time.perf_counter()

        # query oracle
        yQ = oracle_onehot[Q]

        # update model, and calculate updated model's accuracy
        model.update_model(Q, yQ)
        acc = get_acc_multi(np.argmax(model.m, axis=1),
                            oracle, unlabeled=model.unlabeled)[1]
        iter_acc.append(acc)
        iter_time.append(toc - tic)
        al_choices.append(Q)

    np.savez('tmp/iter_stats.npz', al_choices=np.array(al_choices),
             iter_acc=np.array(iter_acc), iter_time=np.array(iter_time))
    mlflow.log_artifact('tmp/iter_stats.npz')
    return
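
# Hypothetical sketch (not from the original code): multiclass active learning
# on the 3-class checkerboard with the cross-entropy ('ce') reduced model. As
# above, the graph construction and hyperparameters are illustrative
# assumptions only.
def _example_run_multi(num_al_iters=10, B_per_al_iter=5):
    import os
    from sklearn.neighbors import kneighbors_graph
    from scipy.sparse.csgraph import laplacian
    from scipy.sparse.linalg import eigsh

    X, oracle = create_checkerboard3(1500)
    W = kneighbors_graph(X, n_neighbors=10, mode='connectivity')
    W = 0.5 * (W + W.T)
    L = laplacian(W, normed=True)
    w, v = eigsh(L, k=100, which='SM')  # truncated spectrum
    # one labeled seed per class
    init_labeled = [int(np.where(oracle == c)[0][0]) for c in range(3)]
    os.makedirs('tmp', exist_ok=True)  # run_multi writes tmp/iter_stats.npz
    run_multi(w, v, tau=0.1, gamma=0.1, oracle=oracle, init_labeled=init_labeled,
              num_al_iters=num_al_iters, B_per_al_iter=B_per_al_iter,
              modelname='ce', acq='mc')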

#
# def run_test(oracle, init_labeled, num_al_iters, B_per_al_iter, modelname='gr', acq='mc',
#              cand='rand', select_method='top', w=None, v=None, tau=0.1, gamma=0.1,
#              X=None, L=None, h=0.1, delta=0.1, full=False, verbose=False):
#
#     # if modelname not in BMODELNAMES:
#     #     raise ValueError("modelname %s not in list of possible modelnames : \n%s" % (
#     #         modelname, str(BMODELNAMES)))
#     # if acq not in ACQS:
#     #     raise ValueError(
#     #         "acq = %s is not a valid acquisition function currently implemented:\n\t%s" % (acq, str(ACQS)))
#
#     if v is not None:
#         N, M = v.shape
#         if M < N:
#             truncated = True
#         else:
#             truncated = False
#
#     if modelname in BMODELNAMES:
#         assert v is not None
#         assert w is not None
#         if -1 not in np.unique(oracle):
#             oracle[oracle == 0] = -1
#         if truncated and not full:
#             print("Binary %s Reduced Model -- i.e. not storing full C covariance matrix" % modelname)
#             model = BinaryGraphBasedSSLModelReduced(
#                 modelname, gamma, tau, w=w, v=v)
#         elif truncated and full:
#             print("Binary %s FULL Model, but Truncated eigenvalues" % modelname)
#             model = BinaryGraphBasedSSLModel(modelname, gamma, tau, w=w, v=v)
#         else:
#             print("Binary %s FULL Model, with ALL eigenvalues" % modelname)
#             model = BinaryGraphBasedSSLModel(modelname, gamma, tau, w=w, v=v)
#
#         ylab = list(oracle[init_labeled])
#
#     elif modelname in MMODELNAMES:
#         assert v is not None
#         assert w is not None
#         if modelname == 'mgr':  # GR is implemented in the Binary model since it requires same storage structure
#             if truncated and not full:
#                 print(
#                     "Multi %s Reduced Model -- i.e. not storing full C covariance matrix" % modelname)
#                 model = BinaryGraphBasedSSLModelReduced(
#                     modelname, gamma, tau, w=w, v=v)
#             elif truncated and full:
#                 print("Multi %s FULL Model, but Truncated eigenvalues" % modelname)
#                 model = BinaryGraphBasedSSLModel(modelname, gamma, tau, w=w, v=v)
#             else:
#                 print("Multi %s FULL Model, with ALL eigenvalues" % modelname)
#                 model = BinaryGraphBasedSSLModel(modelname, gamma, tau, w=w, v=v)
#         else:
#             print("Multi %s Reduced Model -- i.e. not storing full C covariance matrix" % modelname)
#             model = CrossEntropyGraphBasedSSLModelReduced(gamma, tau, w=w, v=v)
#
#         enc = OneHotEncoder()
#         enc.fit(oracle.reshape((-1, 1)))
#         oracle_onehot = enc.transform(oracle.reshape((-1, 1))).todense()
#         ylab = oracle_onehot[init_labeled]
#
#     elif modelname in OTHERMODELNAMES:
#         if modelname == 'rkhs':
#             assert X is not None
#             assert acq == 'db'
#             model = RKHSClassifier(X, sigma=h)  # bandwidth from Karzand paper
#         else:
#             assert L is not None
#             assert acq in ['vopt', 'sopt']
#             model = HFGraphBasedSSLModel(delta, L)
#
#         ylab = list(oracle[init_labeled])
#     else:
#         raise ValueError("{} is not a valid model name")
#
#     # train the initial model, record accuracy
#     model.calculate_model(labeled=init_labeled[:], y=ylab[:])
#     acc = get_acc(model.m, oracle, unlabeled=model.unlabeled)[1]
#     mlflow.log_metric('init_acc', acc)
#
#     # instantiate ActiveLearner object
#     print("ActiveLearner Settings:\n\tacq = \t%s\n\tcand = \t%s" % (acq, cand))
#     print("\tselect_method = %s, B = %d" % (select_method, B_per_al_iter))
#     AL = ActiveLearner(acquisition=acq, candidate=cand)
#
#     iter_acc = []
#     iter_time = []
#     al_choices = []
#     for al_iter in range(num_al_iters):
#         if verbose or (al_iter % 10 == 0):
#             print("AL Iteration %d, acc=%1.6f" % (al_iter + 1, acc))
#         # select query points via active learning
#         tic = time.perf_counter()
#         Q = AL.select_query_points(
#             model, B_per_al_iter, method=select_method, verbose=verbose)
#         toc = time.perf_counter()
#
#         # query oracle
#         yQ = list(oracle[Q])
#
#         # update model, and calculate updated model's accuracy
#         model.update_model(Q, yQ)
#         acc = get_acc(model.m, oracle, unlabeled=model.unlabeled)[1]
#         iter_acc.append(acc)
#         iter_time.append(toc - tic)
#         al_choices.append(Q)
#
#     np.savez('tmp/iter_stats.npz', al_choices=np.array(al_choices), iter_acc=np.array(iter_acc), iter_time=np.array(iter_time))
#     mlflow.log_artifact('tmp/iter_stats.npz')
#
#     return
#
#
# def get_data_from_runs(acq, modelname, M, tau, gamma, cand, select_method, B, num_al_iters, runs=[1], root_filename='./'):
#     parent_filename = root_filename + "%s-%s-%d-%s-%s/" % (acq, modelname, M, str(tau), str(gamma))
#     if not os.path.exists(parent_filename):
#         raise ValueError("data at %s does not exist..." % parent_filename)
#     RUNS = {}
#     for run in runs:
#         experiment_name = "%s-%s-%d-%d-%d.txt" % (cand, select_method, B, num_al_iters, run)
#         if not os.path.exists(parent_filename + experiment_name):
#             print('Run #%d that you requested does not exist at %s, skipping' % (run, parent_filename + experiment_name))
#         else:
#             with open(parent_filename + experiment_name, 'r') as f:
#                 for i, line in enumerate(f.readlines()):
#                     # read in init_labeled, and initial accuracy
#                     if i == 0:
#                         line = line.split(',')
#                         RUNS[run] = {'init_labeled': [int(x) for x in line[:-2]], 'acc': [float(line[-1])], 'times': [], 'choices': []}
#                     else:
#                         line = line.split(',')
#                         RUNS[run]['acc'].append(float(line[-1]))
#                         RUNS[run]['choices'].extend(int(x) for x in line[:-2])
#                         RUNS[run]['times'].append(float(line[-2]))
#
#     return RUNS
#
#
# def get_avg_acc_from_runs_dict(RUNS, runs=[1]):
#     count = len(runs)
#     accs = []
#     for run in runs:
#         if run not in RUNS:
#             print("Run #%d not in RUNS dictionary given, skipping..." % run)
#         else:
#             accs.append(RUNS[run]['acc'])
#     if len(accs) == 0:
#         print("No valid runs found, returning None")
#         return
#     accs = np.array(accs)
#     return np.average(accs, axis=0), np.std(accs, axis=0)