计算字符串相似度

orange.lpai

浏览: 92719 次
性别:
来自: 上海

最近访客更多访客>>

fanan_666

asp188

persourle

zzzzlm

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

算法 J#C C++C#

转载自http://blog.csdn.net/liwenjia1981/archive/2010/07/13/5731040.aspx
编程之美3.3

看完题后，毫无头绪

书上的解题思路很好，首先两个字符串的距离肯定是个可知数，必须小于两字符串之和。

可以通过删除操作将两个串都变成空串。

书上所示的递归方法，代码敲出来了，有点点不同

view plaincopy to clipboardprint?

#include <stdlib.h>  
#include <stdio.h>  
#include <string.h>  
  
void calDistance(char *a, int aBegin, int aEnd, char * b, int bBegin, int bEnd, int& dis);  
int min(int a, int b, int c);  
  
int main()  
{  
    char a[50];  
    char b[50];  
    int dis = 0;  
    printf("First String :\n");  
    scanf("%s", a);  
    printf("Second String : \n");  
    scanf("%s", b);  
    int aLen = strlen(a);  
    int bLen = strlen(b);  
    calDistance(a,0,aLen-1, b, 0, bLen-1, dis);  
    printf("The two string distance is :%d\n", dis);  
    return 0;  
}  
  
int min(int a, int b, int c)  
{  
    if(a < b && a < c)  
        return a;  
    if(b <a && b < c)  
        return b;  
    return c;  
}  
  
void calDistance(char *a, int aBegin, int aEnd, char * b, int bBegin, int bEnd, int& dis)  
{  
    if(aBegin > aEnd)  
    {  
        if(bBegin > bEnd)  
            return ;  
        else  
        {  
            dis = dis + bEnd - bBegin + 1;  
            return ;  
        }  
    }  
    else if(bBegin > bEnd)  
    {  
        dis = dis + aEnd - aBegin + 1;  
        return ;  
    }  
  
    if(a[aBegin] == b[bBegin])  
        calDistance(a, aBegin + 1, aEnd, b, bBegin + 1, bEnd, dis);  
    else  
    {  
        int num1, num2, num3;  
        num1 = num2 = num3 = dis + 1;  
        calDistance(a, aBegin + 1, aEnd, b, bBegin, bEnd, num1); //给b[bBegin]前加a[aBegin]  
        calDistance(a, aBegin, aEnd, b, bBegin + 1, bEnd, num2); //给a[aBegin]前加b[bBegin]  
        calDistance(a, aBegin + 1, aEnd, b, bBegin + 1, bEnd, num3); //将a[aBegin]置换为b[bBegin]，或者将b[bBegin]置换为a[aBegin]  
        dis = min(num1, num2, num3);  
    }  
}

因为上述的代码存在重复子问题，可以优化，利用将子问题计算后的结构暂存，再次调用时可以直接查结果。

涉及到重复子问题，联想到动态规划的备忘录...具体实现，嗯，需要参考资料了...

参考文章:

http://tech.ddvip.com/2009-05/1243159182120785_3.html

先转DP解法代码

动态规划算法总是充分利用重叠子问题，即通过每个子问题只解一次，把解保存在一个需要时就可以查看的表中，而每次查表的时间为常数。

动态规划算法是有底向上的。

view plaincopy to clipboardprint?

/* 
* A loop method using dynamic programming. 
* Calculate from bottom to top. 
*/  
int calculateStringDistance(string strA, string strB)  
{  
　　　　 int lenA = (int)strA.length();  
　　　　 int lenB = (int)strB.length();  
　　　　 int c[lenA+1][lenB+1]; // Record the distance of all begin points of each string  
   
  　　　 // i: begin point of strA  
　　　　 // j: begin point of strB  
　　　　 for(int i = 0; i < lenA; i++) c[i][lenB] = lenA - i;  
　　　　 for(int j = 0; j < lenB; j++) c[lenA][j] = lenB - j;  
　　　　 c[lenA][lenB] = 0;  
   
　　　　 for(int i = lenA-1; i >= 0; i--)  
　　　　　　　　 for(int j = lenB-1; j >= 0; j--)  
　　　　　　　　 {  
　　　　　　　　　　　　 if(strB[j] == strA[i])  
　　　　　　　　　　　　　　　　 c[i][j] = c[i+1][j+1];  
　　　　　　　　　　　　 else  
　　　　　　　　　　　　　　　　 c[i][j] = minValue(c[i][j+1], c[i+1][j], c[i+1][j+1]) + 1;  
　　　　　　　　 }  
   
　　　　 return c[0][0];  
}

再转备忘录做法：

备忘录是动态规划的一种变形，它既具有通常的动态规划方法的效率，又采用了一种自顶向下的策略。

加了备忘的递归算法为每一个子问题的解在表中记录一个表项。开始时，每个表项最初都包含一个特殊的值，以表示该表项有待填入。当在递归算法的执行中第一次遇到一个子问题时，就计算它的解并填入表中。以后每次遇到该子问题时，只要查看并返回先前填入的值即可。

下面是原文递归算法的做备忘录版本，并通过布尔变量memoize来控制是否使用备忘录，以及布尔变量debug来控制是否打印调用过程。有兴趣的读都可以通过这两个布尔变量的控制来对比一下备忘录版本与非备忘录版本的复杂度。

view plaincopy to clipboardprint?

#include <iostream>  
#define M 100  
  
using namespace std;  
   
const bool debug = false; // Whether to print debug info  
const bool memoize = true; // Whether to use memoization  
unsigned int cnt = 0; // Line number for the debug info  
  
int memoizedDistance[M][M]; // Matrix for memoiztion  
int minValue(int a, int b, int c)  
{  
　　　　 if(a < b && a < c) return a;  
　　　　 else if(b < a && b < c) return b;  
　　　　 else return c;  
}  
   
/* 
　 * A recursive method which can be decorated by memoization. 
　 * Calculate from top to bottom. 
　 */  
int calculateStringDistance(string strA, int pABegin, int pAEnd, string strB, int pBBegin, int pBEnd)  
{  
　　　　 if(memoize && memoizedDistance[pABegin][pBBegin] >= 0)  
　　　　　　　　 return memoizedDistance[pABegin][pBBegin];  
   
　　　　 if(pABegin > pAEnd)  
　　　　 {  
　　　　　　　　 if(pBBegin > pBEnd)  
　　　　　　　　 {  
　　　　　　　　　　　　 if(memoize)  
　　　　　　　　　　　　　　　　 memoizedDistance[pABegin][pBBegin] = 0;  
　　　　　　　　　　　　 if(debug)  
　　　　　　　　　　　　　　　　 cout << cnt++ << ": m(" << pABegin << "," << pBBegin << ")=0" << endl;  
　　　　　　　　　　　　 return 0;  
　　　　　　　　 }  
　　　　　　　　 else  
　　　　　　　　 {  
　　　　　　　　　　　　 int temp = pBEnd - pBBegin + 1;  
　　　　　　　　　　　　 if(memoize)  
　　　　　　　　　　　　　　　　 memoizedDistance[pABegin][pBBegin] = temp;  
　　　　　　　　　　　　 if(debug)  
　　　　　　　　　　　　　　　　 cout << cnt++ << ": m(" << pABegin << "," << pBBegin << ")=" << temp << endl;  
　　　　　　　　　　　　 return temp;  
　　　　　　　　 }  
　　　　 }  
   
　　　　 if(pBBegin > pBEnd)  
　　　　 {  
　　　　　　　　 if(pABegin > pAEnd)  
　　　　　　　　 {  
　　　　　　　　　　　　 if(memoize)  
　　　　　　　　　　　　　　　　 memoizedDistance[pABegin][pBBegin] = 0;  
　　　　　　　　　　　　 if(debug)  
　　　　　　　　　　　　　　　　 cout << cnt++ << ": m(" << pABegin << "," << pBBegin << ")=0" << endl;  
　　　　　　　　　　　　 return 0;  
　　　　　　　　 }  
　　　　　　　　 else  
　　　　　　　　 {  
　　　　　　　　　　　　 int temp = pAEnd - pABegin + 1;  
　　　　　　　　　　　　 if(memoize)  
　　　　　　　　　　　　　　　　 memoizedDistance[pABegin][pBBegin] = temp;  
　　　　　　　　　　　　 if(debug)  
　　　　　　　　　　　　　　　　 cout << cnt++ << ": m(" << pABegin << "," << pBBegin << ")=" << temp << endl;  
　　　　　　　　　　　　 return temp;  
　　　　　　　　 }  
　　　　 }  
   
　　　　 if(strA[pABegin] == strB[pBBegin])  
　　　　 {  
　　　　　　　　 int temp = calculateStringDistance(strA, pABegin+1, pAEnd, strB, pBBegin+1, pBEnd);  
　　　　　　　　 if(memoize)  
　　　　　　　　　　　　 memoizedDistance[pABegin][pBBegin] = temp;  
　　　　　　　　　 if(debug)  
　　　　　　　　　　　　 cout << cnt++ << ": m(" << pABegin << "," << pBBegin << ")=" << temp << endl;  
　　　　　　　　 return temp;  
　　　　 }  
　　　　 else  
　　　　 {  
　　　　　　　　 int t1 = calculateStringDistance(strA, pABegin, pAEnd, strB, pBBegin+1, pBEnd);  
　　　　　　　　 int t2 = calculateStringDistance(strA, pABegin+1, pAEnd, strB, pBBegin, pBEnd);  
　　　　　　　　 int t3 = calculateStringDistance(strA, pABegin+1, pAEnd, strB, pBBegin+1, pBEnd);  
　　　　　　　　 int temp = minValue(t1, t2, t3) + 1;  
　　　　　　　　 if(memoize)  
　　　　　　　　　　　　 memoizedDistance[pABegin][pBBegin] = temp;  
　　　　　　　　 if(debug)  
　　　　　　　　　　　　 cout << cnt++ << ": m(" << pABegin << "," << pBBegin << ")=" << temp << endl;  
　　　　　　　　 return temp;  
　　　　 }  
}  
  
int main()  
{  
　　　　 if(memoize)  
　　　　 {  
　　　　　　　　 // initialize the matrix : memoizedDistance[][]  
　　　　　　　　 for(int i = 0; i < M; i++)  
　　　　　　　　　　　　 for(int j = 0; j < M; j++)  
　　　　　　　　　　　　　　　　 memoizedDistance[i][j] = -1; // -1 means unfilled cell yet  
　　　　 }  
   
　　　　 string strA = "abcdfef";  
　　　　 string strB = "a";  
   
　　　　 cout << endl << "Similarity = "  
　　　　　　　　　　　　 << 1.0 / (1 + calculateStringDistance(strA, 0, (int)strA.length()-1, strB, 0, (int)strB.length()-1))  
　　　　　　　　　　　　 << endl;  
   
　　　　 return 0;  
 }

总结：

可以计算出，如果不用动态规划或是做备忘录，最坏情况下复杂度约为：lenA！*lenB！。使用动态规划的复杂度为O(（lenA+1）*（lenB+1）)。递归并做备忘录的方法最坏情况下复杂度为O(（lenA+1）*（lenB+1）)。

在实际应用中，如果所有的子问题都至少要被计算一次，则一个自底向上的动态规划算法通常要比一个自顶向下的做备忘录算法好出一个常数因子，因为前者无需递归的代价，而且维护表格的开销也小些。

此外，在有些问题中，还可以用动态规划算法中的表存取模式来进一步减少时间或空间上的需求。或者，如果子问题空间中的某些子问题根本没有必要求解，做备忘录方法有着只解那些肯定要求解的子问题的优点，对于本问题就是这样。

http://blog.csdn.net/jeiwt/archive/2009/12/10/4981876.aspx

--------------------------------------------------------------------------------------------------------

两字符串相似度计算方法有好多，现对基于编距的算法的相似度计算自己总结下。

    简单介绍下Levenshtein Distance(LD)：LD 可能衡量两字符串的相似性。它们的距离就是一个字符串转换成那一个字符串过程中的添加、删除、修改数值。

    举例：

如果str1="test"，str2="test"，那么LD(str1,str2) = 0。没有经过转换。
如果str1="test"，str2="tent"，那么LD(str1,str2) = 1。str1的"s"转换"n"，转换了一个字符，所以是1。
如果它们的距离越大，说明它们越是不同。

     Levenshtein distance最先是由俄国科学家Vladimir Levenshtein在1965年发明，用他的名字命名。不会拼读，可以叫它edit distance（编辑距离）。

    Levenshtein distance可以用来：

Spell checking(拼写检查)
Speech recognition(语句识别)
DNA analysis(DNA分析)
Plagiarism detection(抄袭检测)
LD用m*n的矩阵存储距离值。算法大概过程：

str1或str2的长度为0返回另一个字符串的长度。
初始化(n+1)*(m+1)的矩阵d，并让第一行和列的值从0开始增长。
扫描两字符串（n*m级的），如果：str1[i] == str2[j]，用temp记录它，为0。否则temp记为1。然后在矩阵d[i][j]赋于d[i-1][j]+1 、d[i][j-1]+1、d[i-1][j-1]+temp三者的最小值。
扫描完后，返回矩阵的最后一个值即d[n][m]
最后返回的是它们的距离。怎么根据这个距离求出相似度呢？因为它们的最大距离就是两字符串长度的最大值。对字符串不是很敏感。现我把相似度计算公式定为1-它们的距离/字符串长度最大值。

    源码：

package  com.chenlb.algorithm;

/** 
 * 编辑距离的两字符串相似度
 * 
 *  @author  chenlb 2008-6-24 下午06:41:55
  */ 
public   class  Similarity {

     private   int  min( int  one,  int  two,  int  three) {
         int  min  =  one;
         if (two  <  min) {
            min  =  two;
        }
         if (three  <  min) {
            min  =  three;
        }
         return  min;
    }
    
     public   int  ld(String str1, String str2) {
         int  d[][];     // 矩阵 
         int  n  =  str1.length();
         int  m  =  str2.length();
         int  i;     // 遍历str1的 
         int  j;     // 遍历str2的 
         char  ch1;     // str1的 
         char  ch2;     // str2的 
         int  temp;     // 记录相同字符,在某个矩阵位置值的增量,不是0就是1 
         if (n  ==   0 ) {
             return  m;
        }
         if (m  ==   0 ) {
             return  n;
        }
        d  =   new   int [n + 1 ][m + 1 ];
         for (i = 0 ; i <= n; i ++ ) {     // 初始化第一列 
            d[i][ 0 ]  =  i;
        }
         for (j = 0 ; j <= m; j ++ ) {     // 初始化第一行 
            d[ 0 ][j]  =  j;
        }
         for (i = 1 ; i <= n; i ++ ) {     // 遍历str1 
            ch1  =  str1.charAt(i - 1 );
             // 去匹配str2 
             for (j = 1 ; j <= m; j ++ ) {
                ch2  =  str2.charAt(j - 1 );
                 if (ch1  ==  ch2) {
                    temp  =   0 ;
                }  else  {
                    temp  =   1 ;
                }
                 // 左边+1,上边+1, 左上角+temp取最小 
                d[i][j]  =  min(d[i - 1 ][j] + 1 , d[i][j - 1 ] + 1 , d[i - 1 ][j - 1 ] + temp);
            }
        }
         return  d[n][m];
    }
    
     public   double  sim(String str1, String str2) {
         int  ld  =  ld(str1, str2);
         return   1   -  ( double ) ld  /  Math.max(str1.length(), str2.length()); 
    }
    
     public   static   void  main(String[] args) {
        Similarity s  =   new  Similarity();
        String str1  =   " chenlb.blogjava.net " ;
        String str2  =   " chenlb.iteye.com " ;
        System.out.println( " ld= " + s.ld(str1, str2));
        System.out.println( " sim= " + s.sim(str1, str2));
    }
}

不知sim方法中的公式是合理，个人认为差强人意思，^_^

分享到：