package com.chipmunk.analyzer;

import java.util.Map;

import com.sun.jna.Library;
import com.sun.jna.Native;

/**
 * JNA binding for the native NLPIR word-segmentation library.
 *
 * <p>Each method declared here is resolved by name against the native library
 * loaded into {@link #Instance}. According to the original notes, most calls
 * only work after {@link #NLPIR_Init(String, int, String)} has succeeded.
 */
public interface NLPIRLibrary extends Library {

    /**
     * Shared binding instance; evaluating this initializer loads the native library.
     *
     * <p>{@code NLPIRPath.PATH_LIB} may be an absolute or a relative path. Per the
     * original note, only the library file name is given, without its extension.
     */
    NLPIRLibrary Instance = (NLPIRLibrary) Native.loadLibrary(
            NLPIRPath.PATH_LIB, NLPIRLibrary.class);

    /**
     * Initializes the library.
     *
     * @param sDataPath initial directory path, where Configure.xml and the Data
     *        directory are stored; the native default of 0 means the current
     *        working directory
     * @param encoding encoding of input strings: GBK_CODE (native default),
     *        UTF8_CODE or BIG5_CODE
     * @param sLicenceCode license code, used only by some commercial users;
     *        other users can ignore it
     * @return 1 on success, 0 on failure
     */
    int NLPIR_Init(String sDataPath, int encoding, String sLicenceCode);

    /**
     * Segments a paragraph into words.
     *
     * <p>Works properly only if {@code NLPIR_Init} succeeded.
     *
     * @param sSrc the source paragraph
     * @param bPOSTagged whether POS tagging is wanted: 0 for no tags, 1 for
     *        tagging (native default: 1)
     * @return the segmented text
     */
    String NLPIR_ParagraphProcess(String sSrc, int bPOSTagged);

    /**
     * Extracts keywords from a text.
     *
     * @param sLine the input text
     * @param nMaxKeyLimit the maximum number of keywords
     * @param bWeightOut whether keyword weights are included in the output
     * @return the keyword list if execution succeeds, otherwise NULL
     */
    String NLPIR_GetKeyWords(String sLine, int nMaxKeyLimit, boolean bWeightOut);

    /**
     * Extracts keywords from a text file.
     *
     * @param sLine the input text file name (despite the parameter's name, this
     *        is a file name, not text; the original documentation calls it
     *        {@code sTextFile})
     * @param nMaxKeyLimit the maximum number of keywords
     * @param bWeightOut whether keyword weights are included in the output
     * @return the keyword list if execution succeeds, otherwise NULL
     */
    String NLPIR_GetFileKeyWords(String sLine, int nMaxKeyLimit, boolean bWeightOut);

    /**
     * Adds a word to the user dictionary.
     *
     * @param sWord the word to add
     * @return 1 if the word was added, otherwise 0
     */
    int NLPIR_AddUserWord(String sWord); // add by qp 2008.11.10

    /**
     * Deletes a word from the user dictionary.
     *
     * @param sWord the word to delete
     * @return -1 if the word does not exist in the user dictionary; otherwise
     *         the handle of the deleted word
     */
    int NLPIR_DelUsrWord(String sWord); // add by qp 2008.11.10

    /**
     * Returns the library's last error message.
     *
     * @return the error message
     */
    String NLPIR_GetLastErrorMsg();

    /**
     * Shuts the library down: frees all resources and destroys all working
     * buffers used by NLPIR.
     */
    void NLPIR_Exit();

    /**
     * Segments a whole file.
     *
     * <p>Works properly only if {@code NLPIR_Init} succeeded.
     *
     * <p>NOTE(review): the original documentation says the native function
     * returns the processing speed on success and "false" otherwise, while this
     * binding declares {@code boolean}; confirm the native return type.
     *
     * @param utf8File the source file name to be analysed
     *        ({@code sSourceFilename} in the original documentation)
     * @param utf8FileResult the file name in which to store the results
     *        ({@code sResultFilename} in the original documentation)
     * @param i whether POS tagging is wanted: 0 for no tags, 1 for tagging
     *        (native default: 1; {@code bPOStagged} in the original documentation)
     * @return see the review note above
     */
    boolean NLPIR_FileProcess(String utf8File, String utf8FileResult, int i);

    // -------------------------------- plus ---------------------------------------- //

    /**
     * Imports a user-defined dictionary from a text file.
     *
     * @param sFilename text file name of the user dictionary
     * @param bOverwrite true (native default) to overwrite the existing
     *        dictionary, false to add to it
     * @return not documented in the original source
     */
    int NLPIR_ImportUserDict(String sFilename, boolean bOverwrite);

    /*
     * Native result structure referred to by NLPIR_ParagraphProcessA and
     * NLPIR_ParagraphProcessAW:
     *
     * struct result_t {
     *     int start;           // start position of the word in the input sentence
     *     int length;          // length of the word
     *     char sPOS[POS_SIZE]; // word type: POS ID value, for fast POS-table lookup
     *     int iPOS;            // part of speech
     *     int word_ID;         // for out-of-vocabulary words, set to -1
     *                          // (the original comment is partially garbled)
     *     int word_type;       // user-dictionary flag: 1 = word is in the user
     *                          // dictionary (the other value is missing from the
     *                          // original comment)
     *     int weight;          // word weight
     * };
     */

    /**
     * Segments a paragraph and returns the result vector.
     *
     * <p>The returned vector is managed by the system; the caller must not
     * allocate or free it.
     *
     * <p>NOTE(review): the native function returns a pointer to a
     * {@code result_t} vector and takes a pointer to the result size, whereas
     * this binding declares {@code Map<String, Object>} and a plain {@code int}.
     * {@code java.util.Map} is not among JNA's documented type mappings, so this
     * declaration probably cannot be invoked as-is; confirm before use. The
     * earlier declaration was:
     * {@code public result_t NLPIR_ParagraphProcessA(String sParagraph, int pResultCount, boolean bUserDict);}
     *
     * @param sParagraph the source paragraph
     * @param pResultCount pointer to the result vector size
     * @param bUserDict whether to use the user dictionary
     * @return the pointer to the result vector
     */
    Map<String, Object> NLPIR_ParagraphProcessA(String sParagraph, int pResultCount, boolean bUserDict);

    /**
     * Returns the word count for a paragraph. The original note adds that the
     * output format is customized in the NLPIR configuration.
     *
     * @param sParagraph the source paragraph
     * @return not documented in the original source; judging by the name, the
     *         paragraph's word count
     */
    int NLPIR_GetParagraphProcessAWordCount(String sParagraph);

    /**
     * Stores segmentation results into a caller-supplied structure.
     *
     * <p>NOTE(review): {@code result} is a pointer to a {@code result_t}
     * structure on the native side but is declared as {@code Map<String, Object>}
     * here, which JNA probably cannot marshal; confirm before use. The earlier
     * declaration was:
     * {@code public void NLPIR_ParagraphProcessAW(int nCount, result_t result);}
     *
     * @param nCount the paragraph word count
     * @param result pointer to the structure that receives the results
     */
    void NLPIR_ParagraphProcessAW(int nCount, Map<String, Object> result);

    /**
     * Saves the user dictionary to disk.
     *
     * @return 1 if the save succeeded, otherwise 0
     */
    int NLPIR_SaveTheUsrDic();

    /**
     * Imports a keyword blacklist from a text file. Blacklisted words are never
     * output as keywords.
     *
     * @param sFilename text file name of the blacklist dictionary
     * @return the number of lexical entries imported successfully
     */
    int NLPIR_ImportKeyBlackList(String sFilename);

    /**
     * Extracts new words from a paragraph.
     *
     * @param sLine the input text
     * @param nMaxKeyLimit the maximum number of words
     * @param bWeightOut whether word weights are included in the output
     * @return not documented in the original source
     */
    String NLPIR_GetNewWords(String sLine, int nMaxKeyLimit, boolean bWeightOut);

    /**
     * Extracts new words from a text file.
     *
     * @param sTextFile the input text file name
     * @param nMaxKeyLimit the maximum number of words
     * @param bWeightOut whether word weights are included in the output
     * @return the word list if execution succeeds, otherwise NULL
     */
    String NLPIR_GetFileNewWords(String sTextFile, int nMaxKeyLimit, boolean bWeightOut);

    /**
     * Extracts a fingerprint from a paragraph.
     *
     * @param sLine the input text
     * @return 0 on failure; otherwise the fingerprint of the content
     */
    long NLPIR_FingerPrint(String sLine);

    /**
     * Selects which POS map will be used.
     *
     * @param nPOSmap the POS map selector
     * @return not documented in the original source
     */
    int NLPIR_SetPOSmap(int nPOSmap);

    /**
     * Starts a new-word identification (NWI) session.
     *
     * @return true on success, false on failure
     */
    boolean NLPIR_NWI_Start();

    /**
     * Adds a file to the NWI session. Only valid after {@link #NLPIR_NWI_Start()}.
     *
     * @param sFilename the file to add
     * @return not documented in the original source
     */
    int NLPIR_NWI_AddFile(String sFilename);

    /**
     * Adds an in-memory text to the NWI session. Only valid after
     * {@link #NLPIR_NWI_Start()}.
     *
     * @param sText the text in which new words should be identified
     * @return not documented in the original source
     */
    boolean NLPIR_NWI_AddMem(String sText);

    /**
     * Marks the end of NWI input. Only valid after {@link #NLPIR_NWI_Start()}.
     *
     * @return not documented in the original source
     */
    boolean NLPIR_NWI_Complete();

    /**
     * Returns the NWI result. Only valid after {@link #NLPIR_NWI_Complete()}.
     *
     * @param bWeightOut whether the weight of each new word is included
     * @return the new-word identification result
     */
    String NLPIR_NWI_GetResult(boolean bWeightOut);

    /**
     * Imports the NWI result into the user dictionary. Only valid after
     * {@link #NLPIR_NWI_Complete()}. To keep the new words permanently, the
     * original note recommends calling {@link #NLPIR_SaveTheUsrDic()} afterwards.
     *
     * @return the number of new words in the result
     */
    int NLPIR_NWI_Result2UserDict();

    /*
     * Sample usage of the native NWI API (C code from the original source):
     *
     * NLPIR_NWI_Start();               // start new-word discovery
     * NLPIR_NWI_AddFile(sInputFile);   // add a training file; may be called repeatedly
     * NLPIR_NWI_Complete();            // finished adding files / training content
     * const char *pNewWordlist = NLPIR_NWI_GetResult(); // fetch the NWI result
     * printf("New words found: %s\n", pNewWordlist);
     *
     * strcpy(sResultFile, sInputFile);
     * strcat(sResultFile, "_result1.txt");
     * NLPIR_FileProcess(sInputFile, sResultFile);
     *
     * NLPIR_NWI_Result2UserDict();     // import the NWI result into the user dictionary
     * strcpy(sResultFile, sInputFile);
     * strcat(sResultFile, "_result2.txt");
     * NLPIR_FileProcess(sInputFile, sResultFile);
     *
     * NLPIR_Exit();
     */

    /**
     * Produces a finer-grained segmentation of an over-long segment.
     *
     * <p>Intended for results that are too coarse, such as a long proper noun
     * returned as one word. The finest granularity is three Chinese characters.
     *
     * @param sLine the segment to split further
     * @return the finer-grained segmentation, or the empty string {@code ""} if
     *         the input cannot be split further
     */
    String NLPIR_FinerSegment(String sLine);

    /**
     * Returns the base form of an English word, taking past participles,
     * plurals and similar inflections into account; for example
     * driven -&gt; drive, drives -&gt; drive, drove -&gt; drive.
     *
     * @param sWord the English word
     * @return the base form of the word
     */
    String NLPIR_GetEngWordOrign(String sWord);

    /**
     * Computes word / POS / frequency statistics for a text, sorted by frequency.
     *
     * @param sText the input text
     * @return the statistics as {@code word/pos/count} entries joined by
     *         {@code #}, e.g. {@code 张华平/nr/10#博士/n/9#分词/n/8}
     */
    String NLPIR_WordFreqStat(String sText);

    /**
     * Computes word / POS / frequency statistics for a text file, sorted by
     * frequency.
     *
     * @param sFilename full path of the text file
     * @return the statistics as {@code word/pos/count} entries joined by
     *         {@code #}, e.g. {@code 张华平/nr/10#博士/n/9#分词/n/8}
     */
    String NLPIR_FileWordFreqStat(String sFilename);
}
public static void main(String[] args) { String argu = NLPIRPath.PATH_DATA; int charset_type = 1; int init_flag = NLPIRLibrary.Instance.NLPIR_Init(argu, charset_type,"0"); if (init_flag==1) { System.out.println("init success!"); }else if (init_flag==0) { String message = NLPIRLibrary.Instance.NLPIR_GetLastErrorMsg(); System.out.println("init fail!"+message); } String word = NLPIRLibrary.Instance.NLPIR_GetEngWordOrign("wanted"); System.out.println(word); NLPIRLibrary.Instance.NLPIR_AddUserWord("强降雨 n"); String word2 = NLPIRLibrary.Instance.NLPIR_WordFreqStat("南方多地出现强降雨的同时,高温天气也在南方蔓延。据中国天气网21日消息,在副热带高压控制下,19日开始,华南一带出现高温天气,影响范围逐步扩大。预计,未来10天,高温继续蔓延,江南中南部、华南将出现日最高气温为35-38℃的持续高温晴热天气。"); System.out.println(word2); String keywords =NLPIRLibrary.Instance.NLPIR_GetFileKeyWords("E:/temp/abc.txt", 10, true); System.out.println(keywords); // NLPIRLibrary.Instance.NLPIR_NWI_Start();//新词识别开始 // NLPIRLibrary.Instance.NLPIR_NWI_AddFile("E:/temp/def.txt");//批量增加输入文件,可以不断循环调用NLPIR_NWI_AddFile或者NLPIR_NWI_AddMem // // NLPIRLibrary.Instance.NLPIR_NWI_Complete();//新词识别导入文件结束 // // String t= NLPIRLibrary.Instance.NLPIR_NWI_GetResult(false);//获取本次批量导入文本文件中识别的新词结果 // System.out.println("新词识别结果 " + t);//打印输出新词识别结果 // System.out.println("============"); try { String aaa = NLPIRLibrary.Instance.NLPIR_ParagraphProcess(FileUtil.read(new File("E:/temp/def.txt"), "UTF-8"), 0); System.out.println(aaa.replaceAll(" ", ",")); } catch (Exception e) { e.printStackTrace(); } String abc =NLPIRLibrary.Instance.NLPIR_GetFileNewWords("E:/temp/def.txt", 10, false); for (String a : abc.split("#")) { NLPIRLibrary.Instance.NLPIR_AddUserWord(a+" n"); System.out.println(a); } System.out.println("abc:"+abc); NLPIRLibrary.Instance.NLPIR_AddUserWord("暴雨黄色预警 n"); try { String ddd = NLPIRLibrary.Instance.NLPIR_ParagraphProcess(FileUtil.read(new File("E:/temp/def.txt"), "UTF-8"), 0); System.out.println(ddd.replaceAll(" ", ",")); } catch (Exception e) { e.printStackTrace(); } NLPIRLibrary.Instance.NLPIR_Exit(); }
