xiaobian

浏览: 590855 次
来自: 北京

最近访客更多访客>>

suweihuaok

weimingtom

yixiaof

wq611403

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

字符串相似度算法

博客分类：

其他

算法 J#C C++C#

字符串相似度算法介绍(整理)收藏

新一篇: 添加了计数器，时钟，日历，天气预报和背景音乐 | 旧一篇: 试用Web-Harvest

最近在做这方面的应用，把我找到的资料贴出来，有需要的人可以参考参考。
1．编辑距离（Levenshtein Distance）
编辑距离就是用来计算从原串（s）转换到目标串(t)所需要的最少的插入，删除和替换
的数目，在NLP中应用比较广泛，如一些评测方法中就用到了（wer,mWer等），同时也常用来计算你对原文本所作的改动数。编辑距离的算法是首先由俄国科学家Levenshtein提出的，故又叫Levenshtein Distance。
Levenshtein Distance算法可以看作动态规划。它的思路就是从两个字符串的左边开始比较,记录已经比较过的子串相似度(实际上叫做距离),然后进一步得到下一个字符位置时的相似度。用下面的例子: GUMBO和GAMBOL。当算到矩阵D[3,3]位置时,也就是当比较到GUM和GAM时,要从已经比较过的3对子串GU-GAM, GUM-GA和GU-GA之中选一个差别最小的来当它的值. 所以要从左上到右下构造矩阵。
编辑距离的伪算法：
整数 Levenshtein距离(字符 str1[1..lenStr1], 字符 str2[1..lenStr2])
   宣告 int d[0..lenStr1, 0..lenStr2]
   宣告 int i, j, cost

   对于 i 等于由 0 至 lenStr1
       d[i, 0] := i
   对于 j 等于由 0 至 lenStr2
       d[0, j] := j
   对于 i 等于由 1 至 lenStr1
       对于 j 等于由 1 至 lenStr2
           若 str1[i] = str2[j] 则 cost := 0
                                否则 cost := 1
           d[i, j] := 最小值(
                                d[i-1, j ] + 1,     // 删除
                                d[i , j-1] + 1,     // 插入
                                d[i-1, j-1] + cost   // 替换
                            )
返回 d[lenStr1, lenStr2]

double Minimum(double a, double b, double c)
{
double mi;

mi = a;
if (b < mi) {
mi = b;
}
if (c < mi) {
mi = c;
}
return mi;
}

int* GetCellPointer(int *pOrigin, int col, int row, int nCols)
{
return pOrigin + col + (row * (nCols + 1));
}

int GetAt(int *pOrigin, int col, int row, int nCols)
{
int *pCell;

pCell = GetCellPointer (pOrigin, col, row, nCols);
return *pCell;
}

void PutAt(int *pOrigin, int col, int row, int nCols, double x)
{
int *pCell;
pCell = GetCellPointer (pOrigin, col, row, nCols);
*pCell = x;
}

//编辑距离
LD(const char *s, const char *t)
{
int *d; // pointer to matrix
int n; // length of s
int m; // length of t
int i; // iterates through s
int j; // iterates through t
char s_i1; // ith character of s
char s_i2; // ith character of s
char t_j1; // jth character of t
char t_j2; // jth character of t
int *cost; // cost代价矩阵
int result; // result
int cell; // contents of target cell
int above; // contents of cell immediately above
int left; // contents of cell immediately to left
int diag; // contents of cell immediately above and to left
int sz; // number of cells in matrix

// Step 1

n = strlen (s);
m = strlen (t);
if (n == 0)
{
return m;
}
if (m == 0)
{
return n;
}
sz = (n+1) * (m+1) * sizeof (int);
d = (int *) malloc (sz);
cost = (int *) malloc (sz);

// Step 2

for (i = 0; i <= n; i++)
{
PutAt (d, i, 0, n, i);
}

for (j = 0; j <= m; j++)
{
PutAt (d, 0, j, n, j);
}
for (int g=0;g<=m;g++)//把代价距离矩阵全部初始化为同一个值，以后可根据此值判断相应的方格是否被赋过值
{
for(int h=0;h<=n;h++)
{
PutAt(cost,h,g,n,2);
}
}
// Step 3

for (i = 1; i <= n; i++)
{

s_i1 = s[i-1];
s_i2 = s[i];
bool sbd=false;
bool tbd=false;
if(s_i1>=' '&&s_i1<='@'||s_i1>='A'&&s_i1<='~')
{//s为标点符号或其他非中文符号和数字
sbd=true;
}
// Step 4

for (j = 1; j <= m; j++)
{

   tbd=false;
   t_j1 = t[j-1];
   t_j2 = t[j];
   // Step 5
   if(t_j1>=' '&&t_j1<='@'||t_j1>='A'&&t_j1<='~')
   {//t也为标点符号
    tbd=true;
   }
   if(!sbd)
   {//s为汉字
    if(!tbd)
    {//t也为汉字
     if (s_i1 == t_j1&&s_i2 == t_j2)
     {
      bool tt=false;
      int temp=GetAt(cost,i,j,n);
      if(temp==2)
      {
       PutAt(cost,i,j,n,0);
       tt=true;
      }
      if(tt)
      {//因为st全市汉字，所以把代价矩阵他相邻的未赋过值的三个格赋值
       int temp1=GetAt(cost,i+1,j,n);
       if(temp1==2)
       {
        PutAt(cost,i+1,j,n,0);
       }
       int temp2=GetAt(cost,i,j+1,n);
       if(temp2==2)
       {
        PutAt(cost,i,j+1,n,0);
       }
       int temp3=GetAt(cost,i+1,j+1,n);
       if(temp3==2)
       {
        PutAt(cost,i+1,j+1,n,0);
       }
      }
     }
     else
     {
      bool tt=false;
      int temp=GetAt(cost,i,j,n);
      if(temp==2)
      {
       PutAt(cost,i,j,n,1);
       tt=true;
      }
      if(tt)
      {
       int temp1=GetAt(cost,i+1,j,n);
       if(temp1==2)
       {
        PutAt(cost,i+1,j,n,1);
       }
       int temp2=GetAt(cost,i,j+1,n);
       if(temp2==2)
       {
        PutAt(cost,i,j+1,n,1);
       }
       int temp3=GetAt(cost,i+1,j+1,n);
       if(temp3==2)
       {
        PutAt(cost,i+1,j+1,n,1);
       }
      }
     }
    }
    else
    {//t为符号
     bool tt=false;
     int temp=GetAt(cost,i,j,n);
     if(temp==2)
     {
      PutAt(cost,i,j,n,1);
      tt=true;
     }
     if(tt)
     {
      int temp1=GetAt(cost,i+1,j,n);
      if(temp1==2)
      {
       PutAt(cost,i+1,j,n,1);
      }
     }

    }

   }
   else
   {//s为符号
    if(!tbd)
    {//t为汉字
     bool tt=false;
     int temp=GetAt(cost,i,j,n);
     if(temp==2)
     {
      PutAt(cost,i,j,n,1);
      tt=true;
     }
     if(tt)
     {
      int temp1=GetAt(cost,i,j+1,n);
      if(temp1==2)
      {
       PutAt(cost,i,j+1,n,1);
      }
     }
    }
    else
    {
     if(s_i1==t_j1)
     {
      int temp=GetAt(cost,i,j,n);
      if(temp==2)
      {
       PutAt(cost,i,j,n,0);
      }
     }
     else
     {
      int temp=GetAt(cost,i,j,n);
      if(temp==2)
      {
       PutAt(cost,i,j,n,1);
      }
     }
    }

}

// Step 6

   above = GetAt (d,i-1,j, n);
   left = GetAt (d,i, j-1, n);
   diag = GetAt (d, i-1,j-1, n);
   int curcost=GetAt(cost,i,j,n);
   cell = Minimum (above + 1, left + 1, diag + curcost);
   PutAt (d, i, j, n, cell);
}
}

// Step 7

result = GetAt (d, n, m, n);
free (d);
return result;

}

2．最长公共子串 (LCS)
LCS问题就是求两个字符串最长公共子串的问题。解法就是用一个矩阵来记录两个字符
串中所有位置的两个字符之间的匹配情况，若是匹配则为1，否则为0。然后求出对角线最长的1序列，其对应的位置就是最长匹配子串的位置.
下面是字符串21232523311324和字符串312123223445的匹配矩阵，前者为X方向的，
后者为Y方向的。不难找到，红色部分是最长的匹配子串。通过查找位置我们得到最长的匹配子串为：21232
0 0 0 1 0 0 0 1 1 0 0 1 0 0 0
　　0 1 0 0 0 0 0 0 0 1 1 0 0 0 0
　　1 0 1 0 1 0 1 0 0 0 0 0 1 0 0
　　0 1 0 0 0 0 0 0 0 1 1 0 0 0 0
　　1 0 1 0 1 0 1 0 0 0 0 0 1 0 0
　　0 0 0 1 0 0 0 1 1 0 0 1 0 0 0
　　1 0 1 0 1 0 1 0 0 0 0 0 1 0 0
　　1 0 1 0 1 0 1 0 0 0 0 0 1 0 0
　　0 0 0 1 0 0 0 1 1 0 0 1 0 0 0
　　0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
　　0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
　　0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
　　0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
但是在0和1的矩阵中找最长的1对角线序列又要花去一定的时间。通过改进矩阵的生成方式和设置标记变量，可以省去这部分时间。下面是新的矩阵生成方式：
0 0 0 1 0 0 0 1 1 0 0 1 0 0 0
　　0 1 0 0 0 0 0 0 0 2 1 0 0 0 0
　　1 0 2 0 1 0 1 0 0 0 0 0 1 0 0
　　0 2 0 0 0 0 0 0 0 1 1 0 0 0 0
　　1 0 3 0 1 0 1 0 0 0 0 0 1 0 0
　　0 0 0 4 0 0 0 2 1 0 0 1 0 0 0
　　1 0 1 0 5 0 1 0 0 0 0 0 2 0 0
　　1 0 1 0 1 0 1 0 0 0 0 0 1 0 0
　　0 0 0 2 0 0 0 2 1 0 0 1 0 0 0
　　0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
　　0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
　　0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
　　0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
当字符匹配的时候，我们并不是简单的给相应元素赋上1，而是赋上其左上角元素的值加一。我们用两个标记变量来标记矩阵中值最大的元素的位置，在矩阵生成的过程中来判断当前生成的元素的值是不是最大的，据此来改变标记变量的值，那么到矩阵完成的时候，最长匹配子串的位置和长度就已经出来了。

//最长公共子串
char* LCS(char*left,char* right){
int lenLeft,lenRight;
lenLeft = strlen(left);
lenRight = strlen(right);
int *c = new int[lenRight];
int start,end,len;
end = len = 0;
for(int i = 0; i < lenLeft; i++){
for(int j = lenRight-1; j >= 0; j--){
   if(left[i] == right[j]){
    if(i == 0 || j == 0)
     c[j] = 1;
    else
     c[j] = c[j-1]+1;
   }
   else
    c[j] = 0;
   if(c[j] > len){
    len = c[j];
    end = j;
   }
}
}
char *p = new char[len+1];
start = end - len + 1;
for(i = start; i <= end; i++)
p[i - start] = right[i];
p[len] = '\0';
return p;
}
3. 余弦定理 (向量空间算法)
余弦定理古老而广泛的数学概念，在各个学科及实践中都得到了大量的应用，这里简单的介绍下其在判断两个字符串相似度的应用。
在余弦定理中基本的公式为：

假如字符串s1与s2，比较两个字符串的相似度，sim(s1,s2)，假设s1,s2中含有n个不同的字符，其分别为c1,c2,...cn，判断字符串的相似度转换为两个字符串对应的向量v1,v2之间夹角大小的判断，余弦值越大其向量之间的夹角越小，s1与S2的相似度越大。
向量空间算法的介绍：
在向量空间模型中，文本泛指各种机器可读的记录。用D（Document）表示，特征项（Term，用t表示）是指出现在文档D中且能够代表该文档内容的基本语言单位，主要是由词或者短语构成，文本可以用特征项集表示为D(T1，T2，…，Tn)，其中Tk是特征项，1<=k<=N。例如一篇文档中有a、b、c、d四个特征项，那么这篇文档就可以表示为D(a，b，c，d)。对含有n个特征项的文本而言，通常会给每个特征项赋予一定的权重表示其重要程度。即D＝D(T1，W1；T2，W2；…，Tn，Wn)，简记为D＝D(W1，W2，…，Wn)，我们把它叫做文本D的向量表示。其中Wk是Tk 的权重，1<=k<=N。在上面那个例子中，假设a、b、c、d的权重分别为30，20，20，10，那么该文本的向量表示为 D(30，20，20，10)。在向量空间模型中，两个文本D1和D2之间的内容相关度Sim(D1，D2)常用向量之间夹角的余弦值表示，公式为：

其中，W1k、W2k分别表示文本D1和D2第K个特征项的权值，1<=k<=N。我们可以利用类似的方法来计算两个字符串的相关度。
这个算法网上没找到，虽然我写过，但是没什么通用性，就不贴出来。很简单的，有兴趣的可以自己写一个。

http://www.cppblog.com/whncpp/archive/2008/09/21/62378.html

字符串相似度算法（ Levenshtein Distance算法）

昨天论坛看到的，简单写了一下
题目：一个字符串可以通过增加一个字符，删除一个字符，替换一个字符得到另外一个字符串，假设，我们把从字符串A转换成字符串B，前面3种操作所执行的最少次数称为AB相似度
如  abc adc  度为 1
   ababababa babababab 度为 2
   abcd acdb 度为2

字符串相似度算法可以使用 Levenshtein Distance算法(中文翻译：编辑距离算法) 这算法是由俄国科学家Levenshtein提出的。其步骤

StepDescription

1	Set n to be the length of s. Set m to be the length of t. If n = 0, return m and exit. If m = 0, return n and exit. Construct a matrix containing 0..m rows and 0..n columns.
2	Initialize the first row to 0..n. Initialize the first column to 0..m.
3	Examine each character of s (i from 1 to n).
4	Examine each character of t (j from 1 to m).
5	If s[i] equals t[j], the cost is 0. If s[i] doesn't equal t[j], the cost is 1.
6	Set cell d[i,j] of the matrix equal to the minimum of: a. The cell immediately above plus 1: d[i-1,j] + 1. b. The cell immediately to the left plus 1: d[i,j-1] + 1. c. The cell diagonally above and to the left plus the cost: d[i-1,j-1] + cost.
7	After the iteration steps (3, 4, 5, 6) are complete, the distance is found in cell d[n,m].

C++实现如下

#include <iostream>

#include <vector>

#include <string>

using namespace std;

//算法

int ldistance(const string source,const string target)

{

//step 1

int n=source.length();

int m=target.length();

if (m==0) return n;

if (n==0) return m;

//Construct a matrix

typedef vector< vector<int> > Tmatrix;

Tmatrix matrix(n+1);

for(int i=0; i<=n; i++) matrix[i].resize(m+1);

//step 2 Initialize

for(int i=1;i<=n;i++) matrix[i][0]=i;

for(int i=1;i<=m;i++) matrix[0][i]=i;

//step 3

for(int i=1;i<=n;i++)

{

const char si=source[i-1];

//step 4

for(int j=1;j<=m;j++)

{

const char dj=target[j-1];

//step 5

int cost;

if(si==dj){

cost=0;

}

else{

cost=1;

}

//step 6

const int above=matrix[i-1][j]+1;

const int left=matrix[i][j-1]+1;

const int diag=matrix[i-1][j-1]+cost;

matrix[i][j]=min(above,min(left,diag));

}

}//step7

return matrix[n][m];

}

int main(){

string s;

string d;

cout<<"source=";

cin>>s;

cout<<"diag=";

cin>>d;

int dist=ldistance(s,d);

cout<<"dist="<<dist<<endl;

}

#include <iostream>

#include <vector>

#include <string>

using namespace std;

//算法

int ldistance(const string source,const string target)

{

//step 1

int n=source.length();

int m=target.length();

if (m==0) return n;

if (n==0) return m;

//Construct a matrix

typedef vector< vector<int> > Tmatrix;

Tmatrix matrix(n+1);

for(int i=0; i<=n; i++) matrix[i].resize(m+1);

//step 2 Initialize

for(int i=1;i<=n;i++) matrix[i][0]=i;

for(int i=1;i<=m;i++) matrix[0][i]=i;

//step 3

for(int i=1;i<=n;i++)

{

const char si=source[i-1];

//step 4

for(int j=1;j<=m;j++)

{

const char dj=target[j-1];

//step 5

int cost;

if(si==dj){

cost=0;

}

else{

cost=1;

}

//step 6

const int above=matrix[i-1][j]+1;

const int left=matrix[i][j-1]+1;

const int diag=matrix[i-1][j-1]+cost;

matrix[i][j]=min(above,min(left,diag));

}

}//step7

return matrix[n][m];

}

int main(){

string s;

string d;

cout<<"source=";

cin>>s;

cout<<"diag=";

cin>>d;

int dist=ldistance(s,d);

cout<<"dist="<<dist<<endl;

}

分享到：

Java多线程介绍 | 字符串相似度算法Python实现

2009-02-05 17:02
浏览 2973
评论(0)
查看更多

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

字符串相似度算法

字符串相似度算法介绍(整理)收藏

新一篇: 添加了计数器，时钟，日历，天气预报和背景音乐 | 旧一篇: 试用Web-Harvest

字符串相似度算法（ Levenshtein Distance算法）

评论

发表评论

相关推荐

最近访客 更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

字符串相似度算法

字符串相似度算法介绍(整理)收藏

新一篇: 添加了计数器，时钟，日历，天气预报和背景音乐 | 旧一篇: 试用Web-Harvest

字符串相似度算法（ Levenshtein Distance算法）

评论

发表评论

相关推荐

做事遵循一个好的习惯

使用FireFox更加安全的访问网站

程序员的5中层次，你属于哪一种呢？

论坛列表

压力值测试工具

CSS选择器

LaTeX排版系统介绍

第三方电子支付平台

金融IT供应商

金融资讯

精彩PuTTY 中文教程（解决乱码、X窗口、自动登陆等问题） 收藏

圣斗士星矢

最近访客更多访客>>

精彩PuTTY 中文教程（解决乱码、X窗口、自动登陆等问题）收藏