|
@@ -1,110 +1,110 @@
|
|
|
-package com.mooctest.cluster;
|
|
|
-
|
|
|
-import com.hankcs.hanlp.mining.word2vec.DocVectorModel;
|
|
|
-import com.hankcs.hanlp.mining.word2vec.WordVectorModel;
|
|
|
-import com.mooctest.data.BugDTO;
|
|
|
-import lombok.Getter;
|
|
|
-import smile.clustering.GMeans;
|
|
|
-import smile.clustering.HierarchicalClustering;
|
|
|
-import smile.clustering.MEC;
|
|
|
-import smile.clustering.SIB;
|
|
|
-import smile.clustering.linkage.WardLinkage;
|
|
|
-import smile.math.distance.EuclideanDistance;
|
|
|
-
|
|
|
-import java.io.IOException;
|
|
|
-import java.util.*;
|
|
|
-import java.util.stream.Collectors;
|
|
|
-
|
|
|
-@Getter
|
|
|
-public class ClusterAnalyzer<K> {
|
|
|
-
|
|
|
- protected Map<K, Doc<K>> documents;
|
|
|
- protected List<K> idList;
|
|
|
-
|
|
|
- public ClusterAnalyzer() {
|
|
|
- documents = new HashMap<>();
|
|
|
- idList = new ArrayList<>();
|
|
|
- }
|
|
|
- public List<Set<K>> HAC(double[][] distMatrix, List<K> ids, double h) {
|
|
|
- idList = ids;
|
|
|
- HierarchicalClustering hc = new HierarchicalClustering(new WardLinkage(distMatrix));
|
|
|
- int[] labels = hc.partition(h);
|
|
|
- return genClusters(labels);
|
|
|
- }
|
|
|
-
|
|
|
- public List<Set<K>> gmeans(List<BugDTO> bugs) throws IOException {
|
|
|
- DocVectorModel docVectorModel2 = new DocVectorModel(new WordVectorModel("/Users/major/Downloads/sgns.wiki.word"));
|
|
|
- double[][] data = new double[bugs.size()][];
|
|
|
- for (int i = 0; i < bugs.size(); i++) {
|
|
|
- float[] vec = docVectorModel2.query(bugs.get(i).getDescription()).getElementArray();
|
|
|
- data[i] = convertFloatsToDoubles(vec);
|
|
|
- }
|
|
|
-
|
|
|
- GMeans gMeans = new GMeans(data, bugs.size() / 2);
|
|
|
- int[] labels = gMeans.getClusterLabel();
|
|
|
- return genClusters(labels);
|
|
|
- }
|
|
|
-
|
|
|
- public List<Set<K>> sib(List<BugDTO> bugs) throws IOException {
|
|
|
- DocVectorModel docVectorModel2 = new DocVectorModel(new WordVectorModel("/Users/major/Downloads/sgns.wiki.word"));
|
|
|
- double[][] data = new double[bugs.size()][];
|
|
|
- for (int i = 0; i < bugs.size(); i++) {
|
|
|
- float[] vec = docVectorModel2.query(bugs.get(i).getDescription()).getElementArray();
|
|
|
- data[i] = convertFloatsToDoubles(vec);
|
|
|
- }
|
|
|
-
|
|
|
- SIB sib = new SIB(data, 65);
|
|
|
- int[] labels = sib.getClusterLabel();
|
|
|
- return genClusters(labels);
|
|
|
- }
|
|
|
-
|
|
|
- public List<Set<K>> mec(List<BugDTO> bugs) throws IOException {
|
|
|
- DocVectorModel docVectorModel2 = new DocVectorModel(new WordVectorModel("/Users/major/Downloads/sgns.wiki.word"));
|
|
|
- double[][] data = new double[bugs.size()][];
|
|
|
- for (int i = 0; i < bugs.size(); i++) {
|
|
|
- float[] vec = docVectorModel2.query(bugs.get(i).getDescription()).getElementArray();
|
|
|
- data[i] = convertFloatsToDoubles(vec);
|
|
|
- }
|
|
|
-
|
|
|
- MEC mec = new MEC(data, new EuclideanDistance(),65, 1.0);
|
|
|
- int[] labels = mec.getClusterLabel();
|
|
|
- return genClusters(labels);
|
|
|
- }
|
|
|
-
|
|
|
- public static double[] convertFloatsToDoubles(float[] input)
|
|
|
- {
|
|
|
- if (input == null)
|
|
|
- {
|
|
|
- return null; // Or throw an exception - your choice
|
|
|
- }
|
|
|
- double[] output = new double[input.length];
|
|
|
- for (int i = 0; i < input.length; i++)
|
|
|
- {
|
|
|
- output[i] = input[i];
|
|
|
- }
|
|
|
- return output;
|
|
|
- }
|
|
|
-
|
|
|
- private List<Set<K>> genClusters(int[] labels) {
|
|
|
- Map<Integer, Set<K>> groupSet = new HashMap<>();
|
|
|
- for (int i = 0; i < labels.length; i++) {
|
|
|
- int clusterId = labels[i];
|
|
|
- if (groupSet.get(clusterId) == null) {
|
|
|
- Set<K> cluster = new HashSet<>();
|
|
|
- cluster.add(idList.get(i));
|
|
|
- groupSet.put(clusterId, cluster);
|
|
|
- } else {
|
|
|
- groupSet.get(clusterId).add(idList.get(i));
|
|
|
- }
|
|
|
- }
|
|
|
- return groupSet.entrySet().stream()
|
|
|
- .map(entry -> entry.getValue())
|
|
|
- .collect(Collectors.toList());
|
|
|
- }
|
|
|
-
|
|
|
- public void addDocument(K id, String document) {
|
|
|
- Doc<K> doc= new Doc<>(id, document);
|
|
|
- documents.put(id, doc);
|
|
|
- idList.add(id);
|
|
|
- }
|
|
|
-}
|
|
|
+package com.mooctest.cluster;
|
|
|
+
|
|
|
+import com.hankcs.hanlp.mining.word2vec.DocVectorModel;
|
|
|
+import com.hankcs.hanlp.mining.word2vec.WordVectorModel;
|
|
|
+import com.mooctest.data.BugDTO;
|
|
|
+import lombok.Getter;
|
|
|
+import smile.clustering.GMeans;
|
|
|
+import smile.clustering.HierarchicalClustering;
|
|
|
+import smile.clustering.MEC;
|
|
|
+import smile.clustering.SIB;
|
|
|
+import smile.clustering.linkage.WardLinkage;
|
|
|
+import smile.math.distance.EuclideanDistance;
|
|
|
+
|
|
|
+import java.io.IOException;
|
|
|
+import java.util.*;
|
|
|
+import java.util.stream.Collectors;
|
|
|
+
|
|
|
+@Getter
|
|
|
+public class ClusterAnalyzer<K> {
|
|
|
+
|
|
|
+ protected Map<K, Doc<K>> documents;
|
|
|
+ protected List<K> idList;
|
|
|
+
|
|
|
+ public ClusterAnalyzer() {
|
|
|
+ documents = new HashMap<>();
|
|
|
+ idList = new ArrayList<>();
|
|
|
+ }
|
|
|
+ public List<Set<K>> HAC(double[][] distMatrix, List<K> ids, double h) {
|
|
|
+ idList = ids;
|
|
|
+ HierarchicalClustering hc = new HierarchicalClustering(new WardLinkage(distMatrix));
|
|
|
+ int[] labels = hc.partition(h);
|
|
|
+ return genClusters(labels);
|
|
|
+ }
|
|
|
+
|
|
|
+ public List<Set<K>> gmeans(List<BugDTO> bugs) throws IOException {
|
|
|
+ DocVectorModel docVectorModel2 = new DocVectorModel(new WordVectorModel("/project/sgns.wiki.word"));
|
|
|
+ double[][] data = new double[bugs.size()][];
|
|
|
+ for (int i = 0; i < bugs.size(); i++) {
|
|
|
+ float[] vec = docVectorModel2.query(bugs.get(i).getDescription()).getElementArray();
|
|
|
+ data[i] = convertFloatsToDoubles(vec);
|
|
|
+ }
|
|
|
+
|
|
|
+ GMeans gMeans = new GMeans(data, bugs.size() / 2);
|
|
|
+ int[] labels = gMeans.getClusterLabel();
|
|
|
+ return genClusters(labels);
|
|
|
+ }
|
|
|
+
|
|
|
+ public List<Set<K>> sib(List<BugDTO> bugs) throws IOException {
|
|
|
+ DocVectorModel docVectorModel2 = new DocVectorModel(new WordVectorModel("/project/sgns.wiki.word"));
|
|
|
+ double[][] data = new double[bugs.size()][];
|
|
|
+ for (int i = 0; i < bugs.size(); i++) {
|
|
|
+ float[] vec = docVectorModel2.query(bugs.get(i).getDescription()).getElementArray();
|
|
|
+ data[i] = convertFloatsToDoubles(vec);
|
|
|
+ }
|
|
|
+
|
|
|
+ SIB sib = new SIB(data, 65);
|
|
|
+ int[] labels = sib.getClusterLabel();
|
|
|
+ return genClusters(labels);
|
|
|
+ }
|
|
|
+
|
|
|
+ public List<Set<K>> mec(List<BugDTO> bugs) throws IOException {
|
|
|
+ DocVectorModel docVectorModel2 = new DocVectorModel(new WordVectorModel("/project/sgns.wiki.word"));
|
|
|
+ double[][] data = new double[bugs.size()][];
|
|
|
+ for (int i = 0; i < bugs.size(); i++) {
|
|
|
+ float[] vec = docVectorModel2.query(bugs.get(i).getDescription()).getElementArray();
|
|
|
+ data[i] = convertFloatsToDoubles(vec);
|
|
|
+ }
|
|
|
+
|
|
|
+ MEC mec = new MEC(data, new EuclideanDistance(),65, 1.0);
|
|
|
+ int[] labels = mec.getClusterLabel();
|
|
|
+ return genClusters(labels);
|
|
|
+ }
|
|
|
+
|
|
|
+ public static double[] convertFloatsToDoubles(float[] input)
|
|
|
+ {
|
|
|
+ if (input == null)
|
|
|
+ {
|
|
|
+ return null; // Or throw an exception - your choice
|
|
|
+ }
|
|
|
+ double[] output = new double[input.length];
|
|
|
+ for (int i = 0; i < input.length; i++)
|
|
|
+ {
|
|
|
+ output[i] = input[i];
|
|
|
+ }
|
|
|
+ return output;
|
|
|
+ }
|
|
|
+
|
|
|
+ private List<Set<K>> genClusters(int[] labels) {
|
|
|
+ Map<Integer, Set<K>> groupSet = new HashMap<>();
|
|
|
+ for (int i = 0; i < labels.length; i++) {
|
|
|
+ int clusterId = labels[i];
|
|
|
+ if (groupSet.get(clusterId) == null) {
|
|
|
+ Set<K> cluster = new HashSet<>();
|
|
|
+ cluster.add(idList.get(i));
|
|
|
+ groupSet.put(clusterId, cluster);
|
|
|
+ } else {
|
|
|
+ groupSet.get(clusterId).add(idList.get(i));
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return groupSet.entrySet().stream()
|
|
|
+ .map(entry -> entry.getValue())
|
|
|
+ .collect(Collectors.toList());
|
|
|
+ }
|
|
|
+
|
|
|
+ public void addDocument(K id, String document) {
|
|
|
+ Doc<K> doc= new Doc<>(id, document);
|
|
|
+ documents.put(id, doc);
|
|
|
+ idList.add(id);
|
|
|
+ }
|
|
|
+}
|