一段没有空格的中英文分词的n-gram算法实现

hermitte

浏览: 30507 次

最近访客更多访客>>

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

商业智能和数据挖掘

算法 J#数据挖掘 C C++

我刚写过一个C#的实现，用的N-Gram算法，很简单。它也能解决楼上朋友的问题：对第一个单词和往后数8个单词的排列组合取最大概率值，把排在第一位的单词作为分词的结果，然后分词窗口后移，继续下一步。我是用堆栈做的，等下我给你找找。算法部分直接就可以在Java下面Ctrl+C了。我开发项目用Java，做数据挖掘和商业算法研究用C#。

c# 代码

using System;
using System.Collections.Generic;
using System.Text;
using System.Collections;
using System.IO;
namespace HNOZ
{
    /// <summary>
    /// Console demo of a dictionary-driven segmenter for text written without spaces.
    /// A word/score table is read from "11.csv"; the text is then cut one word at a time,
    /// each cut being the first word of the best-scoring run of dictionary words that
    /// starts at the current position.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Score reserved for out-of-vocabulary words. The search does not consult it;
        /// unknown text is handled by the fallbacks described on <see cref="Split"/>.
        /// </summary>
        private static readonly double UNKNOWN = 0.05F;

        /// <summary>Exclusive bound on words per examined path (a path holds at most PRE_LENGTH - 1 words).</summary>
        private static readonly int PRE_LENGTH = 8;

        /// <summary>Exclusive bound on candidate word length (a word has at most FL_LENGTH - 1 characters).</summary>
        private static readonly int FL_LENGTH = 8;

        /// <summary>Word-to-score table filled by <see cref="LoadDictionary"/>.</summary>
        private static readonly Dictionary<string, double> dict = new Dictionary<string, double>();

        /// <summary>Best candidates seen so far during one <see cref="Split"/> search.</summary>
        private sealed class SearchResult
        {
            /// <summary>Best score of a path that used all its word slots; a path must beat 0 strictly.</summary>
            public double CompleteScore;

            /// <summary>Length of the first word of that path, or 0 when none was accepted.</summary>
            public int CompleteFirstLength;

            /// <summary>Best score of a path that dead-ended on text with no dictionary match.</summary>
            public double PartialScore = double.NegativeInfinity;

            /// <summary>Length of the first word of that dead-ended path, or 0 when there is none.</summary>
            public int PartialFirstLength;
        }

        /// <summary>Placeholder kept from the original program; it performs no work.</summary>
        static void Init()
        {
        }

        /// <summary>
        /// Loads the dictionary from "11.csv" in the working directory, prints the
        /// segmentation of a built-in sample, then of one line read from standard input,
        /// then of the sample again.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            string sample = "goodmorningbetterhello";

            // Disposing the reader also closes the underlying file handle.
            using (StreamReader reader = new StreamReader("11.csv"))
            {
                LoadDictionary(reader);
            }

            Console.WriteLine(analyse(sample, 0));

            // ReadLine returns null at end of input (e.g. redirected stdin); skip that run.
            string input = Console.ReadLine();
            if (input != null)
            {
                Console.WriteLine(analyse(input, 0));
            }

            Console.WriteLine(analyse(sample, 0));
        }

        /// <summary>
        /// Reads "word,score" lines into the dictionary. The text before the first comma is
        /// the word and the rest is its score, parsed with the invariant culture so that
        /// "0.05" means the same on every machine.
        /// </summary>
        /// <param name="reader">Source of dictionary lines; the caller owns and disposes it.</param>
        /// <exception cref="ArgumentNullException"><paramref name="reader"/> is null.</exception>
        static void LoadDictionary(TextReader reader)
        {
            if (reader == null)
            {
                throw new ArgumentNullException("reader");
            }

            string line;
            int lineNumber = 0;
            while ((line = reader.ReadLine()) != null)
            {
                lineNumber++;
                if (line.Trim().Length == 0)
                {
                    continue; // blank lines (typically a trailing newline) are not an error
                }

                int comma = line.IndexOf(',');
                string word = comma > 0 ? line.Substring(0, comma) : "";
                if (word == "?")
                {
                    continue; // "?" rows are ignored, as in the original loader
                }

                double score;
                if (word.Length == 0
                    || !double.TryParse(
                        line.Substring(comma + 1),
                        System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands,
                        System.Globalization.CultureInfo.InvariantCulture,
                        out score))
                {
                    // Report and skip instead of aborting the whole load on one bad row.
                    Console.Error.WriteLine("Skipping malformed dictionary line " + lineNumber + ": " + line);
                    continue;
                }

                // Indexer assignment: a repeated word overwrites the earlier score instead of throwing.
                dict[word] = score;
            }
        }

        /// <summary>
        /// Segments <paramref name="sentence"/> from index <paramref name="start"/> to its end.
        /// </summary>
        /// <param name="sentence">Text without word separators.</param>
        /// <param name="start">Index of the first character to segment.</param>
        /// <returns>The words in order, each followed by '/'; empty when nothing is left to segment.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="sentence"/> is null.</exception>
        static string analyse(string sentence, int start)
        {
            if (sentence == null)
            {
                throw new ArgumentNullException("sentence");
            }

            StringBuilder output = new StringBuilder();
            while (start < sentence.Length)
            {
                // Split returns at least one character for non-empty input, so start always advances.
                string word = Split(sentence.Substring(start));
                output.Append(word).Append('/');
                start += word.Length;
            }

            return output.ToString();
        }

        /// <summary>
        /// Returns the first word of <paramref name="sentence"/>.
        /// Every run of up to PRE_LENGTH - 1 consecutive dictionary words (each at most
        /// FL_LENGTH - 1 characters) starting at index 0 is scored by the sum of its words'
        /// dictionary values, and the first word of the best run scoring above 0 is returned.
        /// When no run qualifies, the first word of the best run that dead-ended on
        /// unmatched text is used, and failing that the first character alone.
        /// </summary>
        /// <param name="sentence">Text whose leading word is wanted.</param>
        /// <returns>A non-empty prefix of <paramref name="sentence"/>, or "" for empty input.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="sentence"/> is null.</exception>
        static string Split(string sentence)
        {
            if (sentence == null)
            {
                throw new ArgumentNullException("sentence");
            }

            if (sentence.Length == 0)
            {
                return "";
            }

            // Short inputs shrink both limits so no candidate is longer than the text itself.
            int wordsPerPath = Math.Min(sentence.Length, PRE_LENGTH - 1);
            int maxWordLength = Math.Min(sentence.Length, FL_LENGTH - 1);

            SearchResult best = new SearchResult();
            Explore(sentence, 0, 1, wordsPerPath, maxWordLength, 0.0, 0, best);

            int firstLength;
            if (best.CompleteFirstLength > 0)
            {
                firstLength = best.CompleteFirstLength;
            }
            else if (best.PartialFirstLength > 0)
            {
                firstLength = best.PartialFirstLength;
            }
            else
            {
                firstLength = 1; // nothing matched at all: emit one character so callers make progress
            }

            return sentence.Substring(0, firstLength);
        }

        /// <summary>
        /// Depth-first search step: tries every candidate word at <paramref name="position"/>,
        /// shortest first, and records finished paths in <paramref name="best"/>.
        /// The running score travels as an argument, so abandoning a branch cannot leak its
        /// score into sibling branches.
        /// </summary>
        private static void Explore(
            string sentence,
            int position,
            int wordIndex,
            int wordsPerPath,
            int maxWordLength,
            double scoreSoFar,
            int firstWordLength,
            SearchResult best)
        {
            bool advanced = false;

            for (int length = 1; length <= maxWordLength; length++)
            {
                if (position + length > sentence.Length)
                {
                    // The candidate runs past the end of the text. Every remaining slot is then
                    // zero-score padding, so the path is finished with its current score, and
                    // longer candidates at this level would only repeat the same result.
                    advanced = true;
                    if (scoreSoFar > best.CompleteScore)
                    {
                        best.CompleteScore = scoreSoFar;
                        best.CompleteFirstLength = firstWordLength;
                    }

                    break;
                }

                double score;
                if (!dict.TryGetValue(sentence.Substring(position, length), out score))
                {
                    continue;
                }

                advanced = true;
                int first = wordIndex == 1 ? length : firstWordLength;

                if (wordIndex >= wordsPerPath)
                {
                    // Last slot of the path; strict '>' keeps the earliest path on ties.
                    if (score + scoreSoFar > best.CompleteScore)
                    {
                        best.CompleteScore = score + scoreSoFar;
                        best.CompleteFirstLength = first;
                    }
                }
                else
                {
                    Explore(sentence, position + length, wordIndex + 1, wordsPerPath,
                            maxWordLength, scoreSoFar + score, first, best);
                }
            }

            // Dead end after at least one word: remember it for the case where no path finishes.
            if (!advanced && wordIndex > 1 && scoreSoFar > best.PartialScore)
            {
                best.PartialScore = scoreSoFar;
                best.PartialFirstLength = firstWordLength;
            }
        }
    }
}

分享到：

永别了javaeye | Apriori 购物栏挖掘算法的C#实现。原创代码

2007-02-01 23:11
浏览 6010
评论(0)
查看更多

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

一段没有空格的中英文分词的n-gram算法实现

评论

发表评论

相关推荐

最近访客 更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

一段没有空格的中英文分词的n-gram算法实现

评论

发表评论

相关推荐

Apriori 购物栏挖掘算法的C#实现。原创代码

得到一个集合的所有子集的算法,非常巧,用二进制移位操作实现的

最近访客更多访客>>