- package com.mooctest.cluster;
- import java.util.ArrayList;
- import java.util.List;
- import java.util.ListIterator;
- import java.util.Random;
- public class Cluster<K> implements Comparable<Cluster<K>>
- {
- List<Document<K>> documents_; ///< documents
- SparseVector composite_; ///< a composite SparseVector
- SparseVector centroid_; ///< a centroid SparseVector
- List<Cluster<K>> sectioned_clusters_; ///< sectioned clusters
- double sectioned_gain_; ///< a sectioned gain
- Random random;
- public Cluster()
- {
- this(new ArrayList<Document<K>>());
- }
- public Cluster(List<Document<K>> documents)
- {
- this.documents_ = documents;
- composite_ = new SparseVector();
- random = new Random();
- }
- /**
- * Add the feature vectors of all documents to the composite vector.
- */
- void set_composite_vector()
- {
- composite_.clear();
- for (Document<K> document : documents_)
- {
- composite_.add_vector(document.feature());
- }
- }
- /**
- * Clear the internal state of this cluster.
- */
- void clear()
- {
- documents_.clear();
- composite_.clear();
- if (centroid_ != null)
- centroid_.clear();
- if (sectioned_clusters_ != null)
- sectioned_clusters_.clear();
- sectioned_gain_ = 0.0;
- }
- /**
- * Get the size.
- *
- * @return the size of this cluster
- */
- int size()
- {
- return documents_.size();
- }
- /**
- * Get the centroid vector (the normalized composite vector).
- *
- * @return the centroid vector of this cluster
- */
- SparseVector centroid_vector()
- {
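- // lazily rebuild the composite vector if necessary, then normalize a copy of it to obtain the centroid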
- if (documents_.size() > 0 && composite_.size() == 0)
- set_composite_vector();
- centroid_ = (SparseVector) composite_vector().clone();
- centroid_.normalize();
- return centroid_;
- }
- /**
- * Get the composite vector.
- *
- * @return the composite vector of this cluster
- */
- SparseVector composite_vector()
- {
- return composite_;
- }
- /**
- * Get documents in this cluster.
- *
- * @return documents in this cluster
- */
- List<Document<K>> documents()
- {
- return documents_;
- }
- /**
- * Add a document. Its feature vector is normalized and merged into the composite vector.
- *
- * @param doc the document to add
- */
- void add_document(Document<K> doc)
- {
- doc.feature().normalize();
- documents_.add(doc);
- composite_.add_vector(doc.feature());
- }
- /**
- * Remove a document from this cluster.
- *
- * @param index the index of the document in the internal document list
- */
- void remove_document(int index)
- {
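- // replace the entry with null instead of removing it immediately; refresh() purges the null slots later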
- ListIterator<Document<K>> listIterator = documents_.listIterator(index);
- Document<K> document = listIterator.next();
- listIterator.set(null);
- composite_.sub_vector(document.feature());
- }
- /**
- * Remove a document from this cluster.
- *
- * @param doc the document to remove
- */
- void remove_document(Document<K> doc)
- {
- for (int i = 0; i < documents_.size(); i++)
- {
- Document<K> document = documents_.get(i);
- if (document != null && document.equals(doc))
- {
- remove_document(i);
- return;
- }
- }
- }
- /**
- * Delete removed documents from the internal container.
- */
- void refresh()
- {
- ListIterator<Document<K>> listIterator = documents_.listIterator();
- while (listIterator.hasNext())
- {
- if (listIterator.next() == null)
- listIterator.remove();
- }
- }
- /**
- * Get the gain obtained when this cluster was sectioned.
- *
- * @return the sectioned gain
- */
- double sectioned_gain()
- {
- return sectioned_gain_;
- }
- /**
- * Compute and store the gain obtained by sectioning this cluster.
- */
- void set_sectioned_gain()
- {
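- // the gain of a split is the sum of the children's composite-vector norms minus this cluster's composite-vector norm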
- double gain = 0.0;
- if (sectioned_gain_ == 0 && sectioned_clusters_.size() > 1)
- {
- for (Cluster<K> cluster : sectioned_clusters_)
- {
- gain += cluster.composite_vector().norm();
- }
- gain -= composite_.norm();
- }
- sectioned_gain_ = gain;
- }
- /**
- * Get sectioned clusters.
- *
- * @return sectioned clusters
- */
- List<Cluster<K>> sectioned_clusters()
- {
- return sectioned_clusters_;
- }
- // /**
- // * Choose documents randomly (superseded by choose_smartly()).
- // */
- // void choose_randomly(int ndocs, List<Document<K>> docs)
- // {
- // int siz = size();
- // if (siz < ndocs)
- // ndocs = siz;
- // boolean[] chosen = new boolean[siz];
- // int count = 0;
- // while (count < ndocs)
- // {
- // int index = random.nextInt(siz);
- // if (!chosen[index])
- // {
- // chosen[index] = true;
- // docs.add(documents_.get(index));
- // ++count;
- // }
- // }
- // }
- /**
- * Choose the initial centroids (k-means++-style seeding).
- *
- * @param ndocs the number of centroids to choose
- * @param docs the list the chosen documents are appended to
- */
- void choose_smartly(int ndocs, List<Document<K>> docs)
- {
- int siz = size();
- double[] closest = new double[siz];
- if (siz < ndocs)
- ndocs = siz;
- int index, count = 0;
- index = random.nextInt(siz); // initial center
- docs.add(documents_.get(index));
- ++count;
- double potential = 0.0;
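- // distance to the first center is 1 minus the inner product of the (normalized) features; "potential" is the sum of these distances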
- for (int i = 0; i < documents_.size(); i++)
- {
- double dist = 1.0 - SparseVector.inner_product(documents_.get(i).feature(), documents_.get(index).feature());
- potential += dist;
- closest[i] = dist;
- }
- // choose each center
- while (count < ndocs)
- {
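- // sample the next center with probability proportional to its distance from the nearest chosen center (k-means++-style seeding)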
- double randval = random.nextDouble() * potential;
- for (index = 0; index < documents_.size(); index++)
- {
- double dist = closest[index];
- if (randval <= dist)
- break;
- randval -= dist;
- }
- if (index == documents_.size())
- index--;
- docs.add(documents_.get(index));
- ++count;
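- // update each document's distance to its nearest chosen center and recompute the total potential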
- double new_potential = 0.0;
- for (int i = 0; i < documents_.size(); i++)
- {
- double dist = 1.0 - SparseVector.inner_product(documents_.get(i).feature(), documents_.get(index).feature());
- double min = closest[i];
- if (dist < min)
- {
- closest[i] = dist;
- min = dist;
- }
- new_potential += min;
- }
- potential = new_potential;
- }
- }
- /**
- * Section this cluster into nclusters sub-clusters.
- *
- * @param nclusters the number of sub-clusters
- */
- void section(int nclusters)
- {
- if (size() < nclusters)
- return;
- sectioned_clusters_ = new ArrayList<Cluster<K>>(nclusters);
- List<Document<K>> centroids = new ArrayList<Document<K>>(nclusters);
- // choose_randomly(nclusters, centroids);
- choose_smartly(nclusters, centroids);
- for (int i = 0; i < centroids.size(); i++)
- {
- Cluster<K> cluster = new Cluster<K>();
- sectioned_clusters_.add(cluster);
- }
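- // assign each document to the sub-cluster whose seed document it is most similar to (largest inner product)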
- for (Document<K> d : documents_)
- {
- double max_similarity = -1.0;
- int max_index = 0;
- for (int j = 0; j < centroids.size(); j++)
- {
- double similarity = SparseVector.inner_product(d.feature(), centroids.get(j).feature());
- if (max_similarity < similarity)
- {
- max_similarity = similarity;
- max_index = j;
- }
- }
- sectioned_clusters_.get(max_index).add_document(d);
- }
- }
- @Override
- public int compareTo(Cluster<K> o)
- {
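- // order clusters by sectioned gain in descending order (larger gain first)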
- return Double.compare(o.sectioned_gain(), sectioned_gain());
- }
- }
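- // Usage sketch (not part of the original class; assumes Document and SparseVector behave as used above,
- // and `corpus` is a hypothetical list of already-built Document<String> objects):
- //
- // Cluster<String> cluster = new Cluster<String>();
- // for (Document<String> doc : corpus)
- // cluster.add_document(doc);
- // cluster.section(2); // split into two sub-clusters via the k-means++-style seeding
- // cluster.set_sectioned_gain(); // record the gain obtained by the split
- // List<Cluster<String>> parts = cluster.sectioned_clusters();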