余弦相似性算法的具体介绍参考:http://www.ruanyifeng.com/blog/2013/03/cosine_similarity.html
下面是我根据上文的介绍,用 Java 语言实现的代码:
import java.io.IOException; import java.io.StringReader; import java.util.Collections; import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.wltea.analyzer.lucene.IKAnalyzer; import com.wjb.util.common.WjbTuple2; public class CosineTextSimilarity { public static Map<String,Integer> makeTermFrequency(String text) throws IOException { Analyzer analyzer = new IKAnalyzer(true); StringReader reader = new StringReader(text); TokenStream ts = analyzer.tokenStream("",reader); CharTermAttribute term=ts.getAttribute(CharTermAttribute.class); Map<String,Integer> tf = new HashMap<String,Integer>(); while(ts.incrementToken()){ String t = term.toString(); Integer count = tf.get(t); if(count == null) { tf.put(t,1); }else{ tf.put(t,count + 1); } } analyzer.close(); reader.close(); return tf; } /** * 根据key的长度进行过滤,只有key的长度不小于 length 时,这个key才会保留 * @param map * @param length * @return * @throws IOException */ public static Map<String,Integer> filterByKeyLength(Map<String,Integer> map,int length) throws IOException { Map<String,Integer> m = new HashMap<String,Integer>(); for(String key : map.keySet()) { if(key == null || key.trim().length() >= length) { m.put(key,map.get(key)); } } return m; } public static WjbTuple2<int[],int[]> makeVector(Map<String,Integer> first,Map<String,Integer> second){ Set<String> keys = new HashSet<String>(); keys.addAll(first.keySet()); keys.addAll(second.keySet()); int[] vector1 = new int[keys.size()]; int[] vector2 = new int[keys.size()]; int i = 0; for(String key : keys) { Integer count1 = first.get(key); if(count1 != null) { vector1[i] = count1; } Integer count2 = second.get(key); if(count2 != null) { vector2[i] = count2; } i++; } return 
new WjbTuple2<int[],int[]>(vector1,vector2); } public static double cosine(WjbTuple2<int[],int[]> tuple) { int[] vector1 = tuple._1; int[] vector2 = tuple._2; double sum1 = 0; double sum21 = 0; double sum22 = 0; for (int i = 0; i < vector1.length; i++) { sum1 += vector1[i] * vector2[i]; sum21 += vector1[i] * vector1[i]; sum22 += vector2[i] * vector2[i]; } return sum1/(Math.sqrt(sum21 * sum22 )); } public static List<Entry> sort(Map unsortMap) { // Convert Map to List List<Map.Entry> list = new LinkedList<Map.Entry>(unsortMap.entrySet()); // Sort list with comparator,to compare the Map values Collections.sort(list,new Comparator<Map.Entry>() { public int compare(Map.Entry o1,Map.Entry o2) { String d1 = o1.getValue().toString(); String d2 = o2.getValue().toString(); String k1 = o1.getKey().toString(); String k2 = o2.getKey().toString(); if(o1.getValue() instanceof Integer) { Integer nd1 = Integer.parseInt(d1); Integer nd2 = Integer.parseInt(d2); if( nd2 - nd1 != 0 ) return nd2 - nd1; else{ return k2.compareTo(k1); } }else return d2.compareTo(d1); } }); return list; } }
import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Map.Entry; import com.wjb.util.common.WjbFileUtil; import com.wjb.util.common.WjbTuple2; public class Main { public static void main(String[] args) throws Exception { String text1 = WjbFileUtil.fromFile("d:/1.txt"); String text2 = WjbFileUtil.fromFile("d:/2.txt",WjbFileUtil.GBK); System.out.println(text2); long begin = System.currentTimeMillis(); Map<String,Integer> map1 = CosineTextSimilarity.makeTermFrequency(text1); Map<String,Integer> map2 = CosineTextSimilarity.makeTermFrequency(text2); // map1 = CosineTextSimilarity.filterByKeyLength(map1,2); // map2 = CosineTextSimilarity.filterByKeyLength(map2,2); List<Entry> list1 = CosineTextSimilarity.sort(map1); System.out.println(list1); list1 = list1.subList(0,list1.size() > 20 ? 20 : list1.size()); List<Entry> list2 = CosineTextSimilarity.sort(map2); System.out.println(list2); list2 = list2.subList(0,list2.size() > 20 ? 20 : list2.size()); map1 = list2Map(list1); map2 = list2Map(list2); WjbTuple2<int[],int[]> tuple = CosineTextSimilarity.makeVector(map1,map2); double cos = CosineTextSimilarity.cosine(tuple); long end = System.currentTimeMillis(); System.out.println(end - begin); System.out.println(cos); } public static Map<String,Integer> list2Map(List<Entry> list) { Map<String,Integer> map = new HashMap<String,Integer>(); for(Entry e : list) { map.put(e.getKey().toString(),(Integer)e.getValue()); } return map; } }