
Change the sgns.wiki.word path

郭超 4 years ago
parent
commit
a892dda08f
2 files changed, with 132 additions and 133 deletions
  1. src/main/java/com/mooctest/cluster/ClusterAnalyzer.java (+110 −110)
  2. src/test/java/com/mooctest/demo/DemoTrain.java (+22 −23)

+ 110 - 110
src/main/java/com/mooctest/cluster/ClusterAnalyzer.java

@@ -1,110 +1,110 @@
-package com.mooctest.cluster;
-
-import com.hankcs.hanlp.mining.word2vec.DocVectorModel;
-import com.hankcs.hanlp.mining.word2vec.WordVectorModel;
-import com.mooctest.data.BugDTO;
-import lombok.Getter;
-import smile.clustering.GMeans;
-import smile.clustering.HierarchicalClustering;
-import smile.clustering.MEC;
-import smile.clustering.SIB;
-import smile.clustering.linkage.WardLinkage;
-import smile.math.distance.EuclideanDistance;
-
-import java.io.IOException;
-import java.util.*;
-import java.util.stream.Collectors;
-
-@Getter
-public class ClusterAnalyzer<K> {
-
-    protected Map<K, Doc<K>> documents;
-    protected List<K> idList;
-
-    public ClusterAnalyzer() {
-        documents = new HashMap<>();
-        idList = new ArrayList<>();
-    }
-    public List<Set<K>> HAC(double[][] distMatrix, List<K> ids, double h) {
-        idList = ids;
-        HierarchicalClustering hc = new HierarchicalClustering(new WardLinkage(distMatrix));
-        int[] labels = hc.partition(h);
-        return genClusters(labels);
-    }
-
-    public List<Set<K>> gmeans(List<BugDTO> bugs) throws IOException {
-        DocVectorModel docVectorModel2 = new DocVectorModel(new WordVectorModel("/Users/major/Downloads/sgns.wiki.word"));
-        double[][] data = new double[bugs.size()][];
-        for (int i = 0; i < bugs.size(); i++) {
-            float[] vec = docVectorModel2.query(bugs.get(i).getDescription()).getElementArray();
-            data[i] = convertFloatsToDoubles(vec);
-        }
-
-        GMeans gMeans = new GMeans(data, bugs.size() / 2);
-        int[] labels = gMeans.getClusterLabel();
-        return genClusters(labels);
-    }
-
-    public List<Set<K>> sib(List<BugDTO> bugs) throws IOException {
-        DocVectorModel docVectorModel2 = new DocVectorModel(new WordVectorModel("/Users/major/Downloads/sgns.wiki.word"));
-        double[][] data = new double[bugs.size()][];
-        for (int i = 0; i < bugs.size(); i++) {
-            float[] vec = docVectorModel2.query(bugs.get(i).getDescription()).getElementArray();
-            data[i] = convertFloatsToDoubles(vec);
-        }
-
-        SIB sib = new SIB(data, 65);
-        int[] labels = sib.getClusterLabel();
-        return genClusters(labels);
-    }
-
-    public List<Set<K>> mec(List<BugDTO> bugs) throws IOException {
-        DocVectorModel docVectorModel2 = new DocVectorModel(new WordVectorModel("/Users/major/Downloads/sgns.wiki.word"));
-        double[][] data = new double[bugs.size()][];
-        for (int i = 0; i < bugs.size(); i++) {
-            float[] vec = docVectorModel2.query(bugs.get(i).getDescription()).getElementArray();
-            data[i] = convertFloatsToDoubles(vec);
-        }
-
-        MEC mec = new MEC(data, new EuclideanDistance(),65, 1.0);
-        int[] labels = mec.getClusterLabel();
-        return genClusters(labels);
-    }
-
-    public static double[] convertFloatsToDoubles(float[] input)
-    {
-        if (input == null)
-        {
-            return null; // Or throw an exception - your choice
-        }
-        double[] output = new double[input.length];
-        for (int i = 0; i < input.length; i++)
-        {
-            output[i] = input[i];
-        }
-        return output;
-    }
-
-    private List<Set<K>> genClusters(int[] labels) {
-        Map<Integer, Set<K>> groupSet = new HashMap<>();
-        for (int i = 0; i < labels.length; i++) {
-            int clusterId = labels[i];
-            if (groupSet.get(clusterId) == null) {
-                Set<K> cluster = new HashSet<>();
-                cluster.add(idList.get(i));
-                groupSet.put(clusterId, cluster);
-            } else {
-                groupSet.get(clusterId).add(idList.get(i));
-            }
-        }
-        return groupSet.entrySet().stream()
-                .map(entry -> entry.getValue())
-                .collect(Collectors.toList());
-    }
-
-    public void addDocument(K id, String document) {
-        Doc<K> doc= new Doc<>(id, document);
-        documents.put(id, doc);
-        idList.add(id);
-    }
-}
+package com.mooctest.cluster;
+
+import com.hankcs.hanlp.mining.word2vec.DocVectorModel;
+import com.hankcs.hanlp.mining.word2vec.WordVectorModel;
+import com.mooctest.data.BugDTO;
+import lombok.Getter;
+import smile.clustering.GMeans;
+import smile.clustering.HierarchicalClustering;
+import smile.clustering.MEC;
+import smile.clustering.SIB;
+import smile.clustering.linkage.WardLinkage;
+import smile.math.distance.EuclideanDistance;
+
+import java.io.IOException;
+import java.util.*;
+import java.util.stream.Collectors;
+
+@Getter
+public class ClusterAnalyzer<K> {
+
+    protected Map<K, Doc<K>> documents;
+    protected List<K> idList;
+
+    public ClusterAnalyzer() {
+        documents = new HashMap<>();
+        idList = new ArrayList<>();
+    }
+    public List<Set<K>> HAC(double[][] distMatrix, List<K> ids, double h) {
+        idList = ids;
+        HierarchicalClustering hc = new HierarchicalClustering(new WardLinkage(distMatrix));
+        int[] labels = hc.partition(h);
+        return genClusters(labels);
+    }
+
+    public List<Set<K>> gmeans(List<BugDTO> bugs) throws IOException {
+        DocVectorModel docVectorModel2 = new DocVectorModel(new WordVectorModel("/project/sgns.wiki.word"));
+        double[][] data = new double[bugs.size()][];
+        for (int i = 0; i < bugs.size(); i++) {
+            float[] vec = docVectorModel2.query(bugs.get(i).getDescription()).getElementArray();
+            data[i] = convertFloatsToDoubles(vec);
+        }
+
+        GMeans gMeans = new GMeans(data, bugs.size() / 2);
+        int[] labels = gMeans.getClusterLabel();
+        return genClusters(labels);
+    }
+
+    public List<Set<K>> sib(List<BugDTO> bugs) throws IOException {
+        DocVectorModel docVectorModel2 = new DocVectorModel(new WordVectorModel("/project/sgns.wiki.word"));
+        double[][] data = new double[bugs.size()][];
+        for (int i = 0; i < bugs.size(); i++) {
+            float[] vec = docVectorModel2.query(bugs.get(i).getDescription()).getElementArray();
+            data[i] = convertFloatsToDoubles(vec);
+        }
+
+        SIB sib = new SIB(data, 65);
+        int[] labels = sib.getClusterLabel();
+        return genClusters(labels);
+    }
+
+    public List<Set<K>> mec(List<BugDTO> bugs) throws IOException {
+        DocVectorModel docVectorModel2 = new DocVectorModel(new WordVectorModel("/project/sgns.wiki.word"));
+        double[][] data = new double[bugs.size()][];
+        for (int i = 0; i < bugs.size(); i++) {
+            float[] vec = docVectorModel2.query(bugs.get(i).getDescription()).getElementArray();
+            data[i] = convertFloatsToDoubles(vec);
+        }
+
+        MEC mec = new MEC(data, new EuclideanDistance(),65, 1.0);
+        int[] labels = mec.getClusterLabel();
+        return genClusters(labels);
+    }
+
+    public static double[] convertFloatsToDoubles(float[] input)
+    {
+        if (input == null)
+        {
+            return null; // Or throw an exception - your choice
+        }
+        double[] output = new double[input.length];
+        for (int i = 0; i < input.length; i++)
+        {
+            output[i] = input[i];
+        }
+        return output;
+    }
+
+    private List<Set<K>> genClusters(int[] labels) {
+        Map<Integer, Set<K>> groupSet = new HashMap<>();
+        for (int i = 0; i < labels.length; i++) {
+            int clusterId = labels[i];
+            if (groupSet.get(clusterId) == null) {
+                Set<K> cluster = new HashSet<>();
+                cluster.add(idList.get(i));
+                groupSet.put(clusterId, cluster);
+            } else {
+                groupSet.get(clusterId).add(idList.get(i));
+            }
+        }
+        return groupSet.entrySet().stream()
+                .map(entry -> entry.getValue())
+                .collect(Collectors.toList());
+    }
+
+    public void addDocument(K id, String document) {
+        Doc<K> doc= new Doc<>(id, document);
+        documents.put(id, doc);
+        idList.add(id);
+    }
+}
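
Note: the commit swaps the hardcoded local path /Users/major/Downloads/sgns.wiki.word for /project/sgns.wiki.word, but the path is still duplicated and the word-vector model is still loaded on every call to gmeans, sib, and mec. A minimal sketch of an alternative, not part of this commit: the property name word2vec.model.path, the DocVectorModelHolder class, and the lazy singleton are all assumptions introduced here for illustration.

// Sketch only: assumes a system property "word2vec.model.path" and a lazily
// initialized shared model; none of this exists in the commit itself.
package com.mooctest.cluster;

import com.hankcs.hanlp.mining.word2vec.DocVectorModel;
import com.hankcs.hanlp.mining.word2vec.WordVectorModel;

import java.io.IOException;

final class DocVectorModelHolder {

    // Default mirrors the path introduced by this commit; override with
    // -Dword2vec.model.path=/some/other/sgns.wiki.word
    private static final String MODEL_PATH =
            System.getProperty("word2vec.model.path", "/project/sgns.wiki.word");

    private static DocVectorModel instance;

    private DocVectorModelHolder() {
    }

    // Load the word vectors once and reuse them across gmeans/sib/mec calls.
    static synchronized DocVectorModel get() throws IOException {
        if (instance == null) {
            instance = new DocVectorModel(new WordVectorModel(MODEL_PATH));
        }
        return instance;
    }
}

With something like this, each of gmeans, sib, and mec could build its data matrix from DocVectorModelHolder.get() instead of constructing a new WordVectorModel per call.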

+ 22 - 23
src/test/java/com/mooctest/demo/DemoTrain.java

@@ -1,23 +1,22 @@
-package com.mooctest.demo;
-
-import com.hankcs.hanlp.mining.word2vec.DocVectorModel;
-import com.hankcs.hanlp.mining.word2vec.Word2VecTrainer;
-import com.hankcs.hanlp.mining.word2vec.WordVectorModel;
-
-import java.io.IOException;
-
-public class DemoTrain {
-    public static void main(String[] args) throws IOException {
-//        Word2VecTrainer trainerBuilder = new Word2VecTrainer();
-//        WordVectorModel wordVectorModel = trainerBuilder.train("/Users/major/Downloads/hanlp-wiki-vec-zh/hanlp-wiki-vec-zh.txt", "/Users/major/Downloads/hanlp-wiki-vec-zh/msr_vectors.txt");
-//        wordVectorModel.nearest("中国");
-
-//        WordVectorModel wordVectorModel = new WordVectorModel("/Users/major/Downloads/hanlp-wiki-vec-zh/hanlp-wiki-vec-zh.txt");
-//        System.out.println(wordVectorModel.nearest("奖励"));
-
-//        DocVectorModel docVectorModel1 = new DocVectorModel(new WordVectorModel("/Users/major/Downloads/hanlp-wiki-vec-zh/hanlp-wiki-vec-zh.txt"));
-        DocVectorModel docVectorModel2 = new DocVectorModel(new WordVectorModel("/Users/major/Downloads/sgns.wiki.word"));
-//        System.out.println(docVectorModel1.similarity("山西副省长贪污腐败开庭", "陕西村干部受贿违纪"));
-        System.out.println(docVectorModel2.similarity("“全部”界面中可供选择的完整路径太少,只有部分路径存在(周边,亲子游,城市,1-3天,101-500,4-6月),绝大部分路径都不存在。", "在热门搜索-品类-全部筛选-滑到下面进行“自定义预计花费”,在该输入框中一单输入数字,页面立刻跳到上面,给用户输入造成困难,我用的雷电模拟器"));
-    }
-}
+package com.mooctest.demo;
+
+import com.hankcs.hanlp.mining.word2vec.DocVectorModel;
+import com.hankcs.hanlp.mining.word2vec.WordVectorModel;
+
+import java.io.IOException;
+
+public class DemoTrain {
+    public static void main(String[] args) throws IOException {
+//        Word2VecTrainer trainerBuilder = new Word2VecTrainer();
+//        WordVectorModel wordVectorModel = trainerBuilder.train("/Users/major/Downloads/hanlp-wiki-vec-zh/hanlp-wiki-vec-zh.txt", "/Users/major/Downloads/hanlp-wiki-vec-zh/msr_vectors.txt");
+//        wordVectorModel.nearest("中国");
+
+//        WordVectorModel wordVectorModel = new WordVectorModel("/Users/major/Downloads/hanlp-wiki-vec-zh/hanlp-wiki-vec-zh.txt");
+//        System.out.println(wordVectorModel.nearest("奖励"));
+
+//        DocVectorModel docVectorModel1 = new DocVectorModel(new WordVectorModel("/Users/major/Downloads/hanlp-wiki-vec-zh/hanlp-wiki-vec-zh.txt"));
+        DocVectorModel docVectorModel2 = new DocVectorModel(new WordVectorModel("/project/sgns.wiki.word"));
+//        System.out.println(docVectorModel1.similarity("山西副省长贪污腐败开庭", "陕西村干部受贿违纪"));
+        System.out.println(docVectorModel2.similarity("“全部”界面中可供选择的完整路径太少,只有部分路径存在(周边,亲子游,城市,1-3天,101-500,4-6月),绝大部分路径都不存在。", "在热门搜索-品类-全部筛选-滑到下面进行“自定义预计花费”,在该输入框中一单输入数字,页面立刻跳到上面,给用户输入造成困难,我用的雷电模拟器"));
+    }
+}
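
For context, a usage sketch of the HAC entry point that this commit leaves untouched. It follows the signatures visible in ClusterAnalyzer above (HAC, convertFloatsToDoubles, query(...).getElementArray()), but the class name DemoHacUsage, the sample texts, and the cut height 1.0 are placeholders, and query() may return null for out-of-vocabulary input.

// Usage sketch only (not part of this commit): clusters a few bug-report
// texts via HAC over a hand-built Euclidean distance matrix.
package com.mooctest.demo;

import com.hankcs.hanlp.mining.word2vec.DocVectorModel;
import com.hankcs.hanlp.mining.word2vec.WordVectorModel;
import com.mooctest.cluster.ClusterAnalyzer;
import smile.math.distance.EuclideanDistance;

import java.io.IOException;
import java.util.*;

public class DemoHacUsage {
    public static void main(String[] args) throws IOException {
        DocVectorModel model = new DocVectorModel(new WordVectorModel("/project/sgns.wiki.word"));
        List<String> texts = Arrays.asList("页面加载缓慢", "界面跳转卡顿", "支付页面无法提交");
        List<Integer> ids = Arrays.asList(0, 1, 2);

        // Document vectors, converted to double[] the same way ClusterAnalyzer does.
        double[][] vectors = new double[texts.size()][];
        for (int i = 0; i < texts.size(); i++) {
            vectors[i] = ClusterAnalyzer.convertFloatsToDoubles(
                    model.query(texts.get(i)).getElementArray());
        }

        // Pairwise Euclidean distance matrix for Ward linkage.
        double[][] dist = new double[texts.size()][texts.size()];
        EuclideanDistance euclidean = new EuclideanDistance();
        for (int i = 0; i < texts.size(); i++) {
            for (int j = 0; j < texts.size(); j++) {
                dist[i][j] = euclidean.d(vectors[i], vectors[j]);
            }
        }

        ClusterAnalyzer<Integer> analyzer = new ClusterAnalyzer<>();
        List<Set<Integer>> clusters = analyzer.HAC(dist, ids, 1.0);  // 1.0 is an arbitrary cut height
        System.out.println(clusters);
    }
}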