package text_category;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.Reader;
import java.io.StringReader;
import com.xjt.nlp.word.ICTCLAS;
import edu.udo.cs.wvtool.generic.tokenizer.WVTTokenizer;
import edu.udo.cs.wvtool.main.WVTDocumentInfo;
import edu.udo.cs.wvtool.util.TokenEnumeration;
/**
 * Tokenizer that runs a document through the ICTCLAS word segmenter and then
 * enumerates the resulting words.
 *
 * <p>{@link #tokenize(Reader, WVTDocumentInfo)} reads the whole document into
 * memory, passes it to {@link #ICTCLASCutWord(String)} and keeps the
 * space-separated result as the character stream from which tokens are read.
 * A token is a maximal run of characters for which
 * {@link Character#isLetter(char)} is true.
 *
 * <p>The tokenizer is its own {@link TokenEnumeration}: {@code tokenize}
 * returns {@code this}, so calling it again replaces the state of any
 * enumeration that is still in progress on the same instance.
 */
public class ChineseTokenizer implements WVTTokenizer, TokenEnumeration {

    /** The underlying character stream of the currently tokenized document (the segmented text). */
    private Reader input;

    /**
     * The token which is currently provided. This buffer is necessary to
     * implement the semantics of TokenEnumeration (look-ahead of one token).
     */
    private String currentToken;

    /** Creates a tokenizer with no document attached; {@link #hasMoreTokens()} is false until {@code tokenize} succeeds. */
    public ChineseTokenizer() {
        input = null;
        currentToken = null;
    }

    /**
     * Reads the complete document, segments it with ICTCLAS and positions the
     * enumeration on the first token.
     *
     * <p>Lines are concatenated without any separator, exactly as read by
     * {@link BufferedReader#readLine()}. The reader is not closed here.
     *
     * @param source the document text; may be {@code null}
     * @param d document information; not used by this tokenizer
     * @return this tokenizer as the token enumeration, or {@code null} if
     *         {@code source} is {@code null}, reading fails, or segmentation fails
     * @see edu.udo.cs.wvtool.generic.tokenizer.WVTTokenizer#tokenize(Reader, WVTDocumentInfo)
     */
    public TokenEnumeration tokenize(Reader source, WVTDocumentInfo d) {
        if (source == null) {
            return null;
        }
        String segmented;
        try {
            BufferedReader br = new BufferedReader(source);
            StringBuilder text = new StringBuilder();
            String line;
            while ((line = br.readLine()) != null) {
                text.append(line);
            }
            segmented = ICTCLASCutWord(text.toString());
        } catch (Exception e) {
            // Best-effort contract of this method: any failure while reading
            // the document is reported to the caller as "no enumeration".
            return null;
        }
        if (segmented == null) {
            // ICTCLASCutWord signals failure with null; handing that to
            // StringReader would throw a NullPointerException.
            return null;
        }
        input = new StringReader(segmented);
        readNextToken();
        return this;
    }

    /**
     * Reads a token from the character stream and stores it into
     * {@code currentToken}. If there are no more tokens left (or no document
     * has been attached yet), a {@code null} value is stored.
     */
    public void readNextToken() {
        if (input == null) {
            currentToken = null;
            return;
        }
        StringBuilder buf = new StringBuilder();
        try {
            // Skip everything up to the first letter.
            int in = input.read();
            while (in != -1 && !Character.isLetter((char) in)) {
                in = input.read();
            }
            // Collect letters until a non-letter or the end of the stream.
            while (in != -1 && Character.isLetter((char) in)) {
                buf.append((char) in);
                in = input.read();
            }
        } catch (IOException e) {
            // Treated like the end of the stream: whatever was collected so
            // far becomes the last token.
        }
        // A non-empty buffer is always a token; an empty one can only occur
        // when the stream ended (or failed) before another letter was found.
        currentToken = buf.length() > 0 ? buf.toString() : null;
    }

    /**
     * @return {@code true} if a document is attached and at least the current
     *         token is still available
     * @see edu.udo.cs.wvtool.util.TokenEnumeration#hasMoreTokens()
     */
    public boolean hasMoreTokens() {
        return input != null && currentToken != null;
    }

    /**
     * Returns the current token and reads the next one from the stream.
     *
     * @return the current token, or {@code null} if there is none left
     * @see edu.udo.cs.wvtool.util.TokenEnumeration#nextToken()
     */
    public String nextToken() {
        String result = currentToken;
        if (result != null) {
            readNextToken();
        }
        return result;
    }

    /**
     * Segments a text with ICTCLAS and returns the words separated by single
     * spaces (each word, including the last, is followed by one space).
     *
     * <p>Before segmentation the characters {@code "}, {@code '}, {@code /},
     * space, {@code >}, {@code <} and the sequence {@code ((} are removed
     * from the input. Each space-separated item of the segmenter output is
     * then cut at its last {@code '/'}; items without a slash are kept whole.
     * NOTE(review): the suffix after the slash is presumably the
     * part-of-speech tag appended by ICTCLAS, which would also explain why
     * slashes are stripped from the input first; confirm against the library.
     *
     * @param inputstring the raw text; may be {@code null}
     * @return the segmented text, or {@code null} if {@code inputstring} is
     *         {@code null} or the segmenter fails for any reason
     */
    public static String ICTCLASCutWord(String inputstring) {
        if (inputstring == null) {
            return null;
        }
        try {
            ICTCLAS splitword = ICTCLAS.getInstance();
            String cleaned = inputstring
                    .replace("\"", "")
                    .replace("'", "")
                    .replace("((", "")
                    .replace("/", "")
                    .replace(" ", "")
                    .replace(">", "")
                    .replace("<", "");
            String tagged = splitword.paragraphProcess(cleaned);
            String[] items = tagged.split(" ");
            StringBuilder sb = new StringBuilder();
            for (int i = 0; i < items.length; i++) {
                String item = items[i];
                if (item.length() == 0) {
                    continue;
                }
                int end = item.lastIndexOf('/');
                sb.append(end < 0 ? item : item.substring(0, end)).append(' ');
            }
            return sb.toString();
        } catch (Exception e) {
            // Callers rely on null as the failure signal.
            return null;
        }
    }
}
分享到:
相关推荐
### DELPHI 中的 implements 指示符详解 #### 一、概述 在 Delphi 编程语言中,`implements` 指示符是一个非常有用且强大的特性,它主要用于实现接口的方法委托。通过使用 `implements` 关键字,程序员可以指定一...
Java 中 extends 与 implements 的区别 Java 中的继承和实现接口是两个基本概念,extends 和 implements 是两个关键字,它们之间的区别是 Java 编程语言中最重要的基础知识。 extends 的作用 在 Java 中,extends...
"implements Runnable"是Java编程语言中的一个重要概念,它与多线程编程紧密相关。在Java中,线程是程序执行的最小单位,而创建线程主要有两种方式:继承Thread类和实现Runnable接口。本项目是一个Java小游戏,核心...
public class findmin extends Applet implements ActionListener Java Applet 程序,在文本框里输入三个数,显示最小数
Java extends 与 implements 的区别 在 Java 中,extends 和 implements 是两个关键字,都是用来建立类与类或类与接口之间的关系的,但它们的使用场景和实现机制却有着很大的不同。 extends extends 关键字用于...
implements是用于检查Object符合给定接口的实用程序模块。 例子 var impl = require ( 'implements' ) ; var instance = [ ] ; impl ( instance , [ 'some' , 'every' ] ) ; // true 安装 节点 要在Node应用程序中...
在JavaScript中,当我们讨论Class属性Extends和Implements的区别时,我们通常是在讨论在某个特定的JavaScript框架或者库中的类的继承方式,比如在Prototype框架或Mootools框架中。 首先,我们来详细探讨一下Extends...
通过 `implements` 关键字,我们可以确保一个类遵循特定的契约,即实现了接口中定义的所有方法。这种设计模式在多态和组件开发中尤其有用,因为它允许不同类之间共享相同的接口,从而实现通信和互操作性。 首先,...
This module implements the Requests API.
a Go library that implements E
标题中的"PyPI 官网下载 | protocol_implements_decorator-0.3.1.tar.gz"表明这是一个在Python Package Index(PyPI)上发布的软件包,名为`protocol_implements_decorator`,版本为0.3.1,且以tar.gz格式打包。...
ate final String[] COMMAND={"Backspace","CE","C"}; private final String[] M={" ","MC","MR","MS","M+"}; private JButton keys[]=new JButton[KEYS.length]; private JButton commands[]=new JButton[COMMAND....
在Java编程语言中,`extends` 和 `implements` 关键字分别用于类的继承和接口的实现,它们是面向对象编程的重要特性。接下来我们将详细探讨这两个关键字的区别和使用场景。 1. **`extends` 关键字**: - `extends`...
java学习-java中的interface和implements关键字
在Java编程语言中,`Comparator`接口是一个非常重要的工具,它允许我们自定义对象的排序规则。`Comparator`可以用于任何实现了`Comparable`接口的类,或者当我们想要对不支持自然排序的对象进行排序时。...
标题中的"JS带图片标题自动播放的幻灯片效果.zip_implements_out"指的是一个使用JavaScript实现的图片标题幻灯片效果,其中包含了“滑入滑出”的动画效果,类似于早期Flash技术中的幻灯片展示。这个项目可能是一个...
Alibaba Java Coding Guidelines pmd implements and IDE plugin