`
cooliufang
  • 浏览: 130224 次
社区版块
存档分类
最新评论

【Similarity calculation】Jaro Winkler distance

阅读更多
based on
http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance


import java.util.Arrays;

public class JaroDistance {

	public static double jaroDistance(String source, String target) {
		int slen = source.length();
		int tlen = target.length();
		int i, j;
		
		// the match window
		int matchWindow = Math.max(slen, tlen)/2 - 1;
		
		//the number of matching characters
		int m = 0;
		
		//half the number of transposition
		int t = 0;
		
		boolean[] smatched = new boolean[slen];
		boolean[] tmatched = new boolean[tlen];
		
		Arrays.fill(smatched, false);
		Arrays.fill(tmatched, false);
		
		
		if (slen == 0) {
			return tlen == 0 ? 1.0 : 0.0;
		}		
		
		for (i = 0; i < slen; ++i) {
			int start = Math.max(0, i-matchWindow);
			int end = Math.min(i+matchWindow, tlen);
			for (j = start; j < end; j++) {
				if (tmatched[j]) continue;
				if (source.charAt(i) != target.charAt(j))
					continue;
				smatched[i] = true;
				tmatched[j] = true;
				++m;
				break;
			}
		}
		
		if (m == 0) return 0.0;
		
		j = 0;		
		for (i = 0; i < slen; ++i) {
			if (!smatched[i]) continue;
			while (!tmatched[j]) ++j;
			if (source.charAt(i) != target.charAt(j)) 
				++t;
			++j;
		}		
		t = t / 2;
//		System.out.println(m + " " + t);
		
		return ((double)m/slen + (double)m/tlen + (double)(m-t)/m) / 3.0;		
	}
	
	public static double jaroWinklerDistance(String source, String target) {
		int max = Math.min(4, Math.min(source.length(), target.length()));
		int len = 0;
		for (int i = 0; i < max; ++i) {
			if (source.charAt(len) == target.charAt(len)) 
				++len;
		}

		double jaro = jaroDistance(source, target);
		return jaro + 0.1 * len * (1.0 - jaro);
	}
	public static void main(String[] args) {
		String source = "MARTHA";
		String target = "MARHTA";
		System.out.println("Dj = " + jaroDistance(source, target));
		System.out.println("Dw = " + jaroWinklerDistance(source, target));
		System.out.println();
		
		source = "DICKSONX";
		target = "DIXON";
		System.out.println("Dj = " + jaroDistance(source, target));
		System.out.println("Dw = " + jaroWinklerDistance(source, target));
		System.out.println();
	}

}



Output:
Dj = 0.9444444444444445
Dw = 0.9611111111111111

Dj = 0.7666666666666666
Dw = 0.8133333333333332

分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics