- package com.mooctest.cluster;
- import java.util.ArrayList;
- import java.util.List;
- import java.util.ListIterator;
- import java.util.Random;
- public class Cluster<K> implements Comparable<Cluster<K>>
- {
- List<Document<K>> documents_; ///< documents
- SparseVector composite_; ///< a composite SparseVector
- SparseVector centroid_; ///< a centroid SparseVector
- List<Cluster<K>> sectioned_clusters_; ///< sectioned clusters
- double sectioned_gain_; ///< a sectioned gain
- Random random;
- public Cluster()
- {
- this(new ArrayList<Document<K>>());
- }
- public Cluster(List<Document<K>> documents)
- {
- this.documents_ = documents;
- composite_ = new SparseVector();
- random = new Random();
- }
- /**
- * Add the feature vectors of all documents to the composite vector.
- */
- void set_composite_vector()
- {
- composite_.clear();
- for (Document<K> document : documents_)
- {
- composite_.add_vector(document.feature());
- }
- }
- /**
- * Clear the internal state of this cluster.
- */
- void clear()
- {
- documents_.clear();
- composite_.clear();
- if (centroid_ != null)
- centroid_.clear();
- if (sectioned_clusters_ != null)
- sectioned_clusters_.clear();
- sectioned_gain_ = 0.0;
- }
- /**
- * Get the size.
- *
- * @return the size of this cluster
- */
- int size()
- {
- return documents_.size();
- }
- /**
- * Get the centroid vector (the normalized composite vector).
- *
- * @return the centroid vector of this cluster
- */
- SparseVector centroid_vector()
- {
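- // lazily rebuild the composite vector if necessary, then normalize a copy of it to obtain the centroid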
- if (documents_.size() > 0 && composite_.size() == 0)
- set_composite_vector();
- centroid_ = (SparseVector) composite_vector().clone();
- centroid_.normalize();
- return centroid_;
- }
- /**
- * Get the composite vector.
- *
- * @return the composite vector of this cluster
- */
- SparseVector composite_vector()
- {
- return composite_;
- }
- /**
- * Get documents in this cluster.
- *
- * @return documents in this cluster
- */
- List<Document<K>> documents()
- {
- return documents_;
- }
- /**
- * Add a document. Its feature vector is normalized and merged into the composite vector.
- *
- * @param doc the document to add
- */
- void add_document(Document<K> doc)
- {
- doc.feature().normalize();
- documents_.add(doc);
- composite_.add_vector(doc.feature());
- }
- /**
- * Remove a document from this cluster.
- *
- * @param index the index of the document in the internal document list
- */
- void remove_document(int index)
- {
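- // replace the entry with null instead of removing it immediately; refresh() purges the null slots later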
- ListIterator<Document<K>> listIterator = documents_.listIterator(index);
- Document<K> document = listIterator.next();
- listIterator.set(null);
- composite_.sub_vector(document.feature());
- }
- /**
- * Remove a document from this cluster.
- *
- * @param doc the document to remove
- */
- void remove_document(Document<K> doc)
- {
- for (int i = 0; i < documents_.size(); i++)
- {
- Document<K> document = documents_.get(i);
- if (document != null && document.equals(doc))
- {
- remove_document(i);
- return;
- }
- }
- }
- /**
- * Delete removed documents from the internal container.
- */
- void refresh()
- {
- ListIterator<Document<K>> listIterator = documents_.listIterator();
- while (listIterator.hasNext())
- {
- if (listIterator.next() == null)
- listIterator.remove();
- }
- }
- /**
- * Get the gain obtained when this cluster was sectioned.
- *
- * @return the sectioned gain
- */
- double sectioned_gain()
- {
- return sectioned_gain_;
- }
- /**
- * Compute and store the gain obtained by sectioning this cluster.
- */
- void set_sectioned_gain()
- {
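- // the gain of a split is the sum of the children's composite-vector norms minus this cluster's composite-vector norm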
- double gain = 0.0;
- if (sectioned_gain_ == 0 && sectioned_clusters_.size() > 1)
- {
- for (Cluster<K> cluster : sectioned_clusters_)
- {
- gain += cluster.composite_vector().norm();
- }
- gain -= composite_.norm();
- }
- sectioned_gain_ = gain;
- }
- /**
- * Get sectioned clusters.
- *
- * @return sectioned clusters
- */
- List<Cluster<K>> sectioned_clusters()
- {
- return sectioned_clusters_;
- }
- // /**
- // * Choose documents randomly (superseded by choose_smartly()).
- // */
- // void choose_randomly(int ndocs, List<Document<K>> docs)
- // {
- // int siz = size();
- // if (siz < ndocs)
- // ndocs = siz;
- // boolean[] chosen = new boolean[siz];
- // int count = 0;
- // while (count < ndocs)
- // {
- // int index = random.nextInt(siz);
- // if (!chosen[index])
- // {
- // chosen[index] = true;
- // docs.add(documents_.get(index));
- // ++count;
- // }
- // }
- // }
- /**
- * Choose the initial centroids (k-means++-style seeding).
- *
- * @param ndocs the number of centroids to choose
- * @param docs the list the chosen documents are appended to
- */
- void choose_smartly(int ndocs, List<Document<K>> docs)
- {
- int siz = size();
- double[] closest = new double[siz];
- if (siz < ndocs)
- ndocs = siz;
- int index, count = 0;
- index = random.nextInt(siz); // initial center
- docs.add(documents_.get(index));
- ++count;
- double potential = 0.0;
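- // distance to the first center is 1 minus the inner product of the (normalized) features; "potential" is the sum of these distances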
- for (int i = 0; i < documents_.size(); i++)
- {
- double dist = 1.0 - SparseVector.inner_product(documents_.get(i).feature(), documents_.get(index).feature());
- potential += dist;
- closest[i] = dist;
- }
- // choose each center
- while (count < ndocs)
- {
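- // sample the next center with probability proportional to its distance from the nearest chosen center (k-means++-style seeding)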
- double randval = random.nextDouble() * potential;
- for (index = 0; index < documents_.size(); index++)
- {
- double dist = closest[index];
- if (randval <= dist)
- break;
- randval -= dist;
- }
- if (index == documents_.size())
- index--;
- docs.add(documents_.get(index));
- ++count;
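- // update each document's distance to its nearest chosen center and recompute the total potential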
- double new_potential = 0.0;
- for (int i = 0; i < documents_.size(); i++)
- {
- double dist = 1.0 - SparseVector.inner_product(documents_.get(i).feature(), documents_.get(index).feature());
- double min = closest[i];
- if (dist < min)
- {
- closest[i] = dist;
- min = dist;
- }
- new_potential += min;
- }
- potential = new_potential;
- }
- }
- /**
- * Section this cluster into nclusters sub-clusters.
- *
- * @param nclusters the number of sub-clusters
- */
- void section(int nclusters)
- {
- if (size() < nclusters)
- return;
- sectioned_clusters_ = new ArrayList<Cluster<K>>(nclusters);
- List<Document<K>> centroids = new ArrayList<Document<K>>(nclusters);
- // choose_randomly(nclusters, centroids);
- choose_smartly(nclusters, centroids);
- for (int i = 0; i < centroids.size(); i++)
- {
- Cluster<K> cluster = new Cluster<K>();
- sectioned_clusters_.add(cluster);
- }
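- // assign each document to the sub-cluster whose seed document it is most similar to (largest inner product)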
- for (Document<K> d : documents_)
- {
- double max_similarity = -1.0;
- int max_index = 0;
- for (int j = 0; j < centroids.size(); j++)
- {
- double similarity = SparseVector.inner_product(d.feature(), centroids.get(j).feature());
- if (max_similarity < similarity)
- {
- max_similarity = similarity;
- max_index = j;
- }
- }
- sectioned_clusters_.get(max_index).add_document(d);
- }
- }
- @Override
- public int compareTo(Cluster<K> o)
- {
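- // order clusters by sectioned gain in descending order (larger gain first)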
- return Double.compare(o.sectioned_gain(), sectioned_gain());
- }
- }
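- // Usage sketch (not part of the original class; assumes Document and SparseVector behave as used above,
- // and `corpus` is a hypothetical list of already-built Document<String> objects):
- //
- // Cluster<String> cluster = new Cluster<String>();
- // for (Document<String> doc : corpus)
- // cluster.add_document(doc);
- // cluster.section(2); // split into two sub-clusters via the k-means++-style seeding
- // cluster.set_sectioned_gain(); // record the gain obtained by the split
- // List<Cluster<String>> parts = cluster.sectioned_clusters();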