论坛首页 Java企业应用论坛

Java程序解析搜狗词库scel文件格式

浏览 3463 次
精华帖 (0) :: 良好帖 (0) :: 新手帖 (0) :: 隐藏帖 (0)
作者 正文
   发表时间:2014-07-04  
在做一个电商网站的初期,我们常常面临词库的问题,因为我们自己并没有比较好的词库。这时候可以从网上下载一些别人已有的词库,例如淘宝的、搜狗的。搜狗词库的分类比较细,我们可以下载与自己行业比较相关的词库,但这些词库一般都是scel格式的,无法直接用JAVA解析。如果遇到这种情况,可以用散仙下面的这个类来解析,经测试无乱码现象,解析完整度还不错。

源码如下:

<pre name="code" class="java">package com.qin.parse.scel;



import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

public class SougouScelReader {

    public SougouScelMdel read(File file) throws IOException {
        return read(new FileInputStream(file));
    }

    public SougouScelMdel read(URL url) throws IOException {
        return read(url.openStream());
    }

    protected ByteArrayOutputStream output=new ByteArrayOutputStream();

    protected String readString(DataInputStream input,int pos,int[] reads) throws IOException {
        int read=reads[0];
        input.skip(pos-read);
        read=pos;
        output.reset();
        while(true) {
            int c1 = input.read();
            int c2 = input.read();
            read+=2;
            if(c1==0 &amp;&amp; c2==0) {
                break;
            } else {
                output.write(c1);
                output.write(c2);
            }
        }
        reads[0]=read;
        return new String(output.toByteArray(),encoding);
    }

    protected static String encoding = "UTF-16LE";

    public SougouScelMdel read(InputStream in) throws IOException {
        SougouScelMdel model = new SougouScelMdel();
        DataInputStream input = new DataInputStream(in);
        int read;
        try {
            byte[] bytes = new byte[4];
            input.readFully(bytes);
            assert (bytes[0] == 0x40 &amp;&amp; bytes[1] == 0x15 &amp;&amp; bytes[2] == 0 &amp;&amp; bytes[3] == 0);
            input.readFully(bytes);
            int flag1 = bytes[0];
            assert (bytes[1] == 0x43 &amp;&amp; bytes[2] == 0x53 &amp;&amp; bytes[3] == 0x01);
            int[] reads=new int[]{8};
            model.setName(readString(input,0x130,reads));
            model.setType(readString(input,0x338,reads));
            model.setDescription(readString(input,0x540,reads));
            model.setSample(readString(input,0xd40,reads));
            read = reads[0];
            input.skip(0x1540 - read);
            read=0x1540;
            input.readFully(bytes);
            read += 4;
            assert (bytes[0] == (byte) 0x9D &amp;&amp; bytes[1] == 0x01 &amp;&amp; bytes[2] == 0 &amp;&amp; bytes[3] == 0);
            bytes = new byte[128];
            Map&lt;Integer, String&gt; pyMap = new LinkedHashMap&lt;Integer, String&gt;();
            while (true) {
                int mark = readUnsignedShort(input);
                int size = input.readUnsignedByte();
                input.skip(1);
                read += 4;
                assert (size &gt; 0 &amp;&amp; (size % 2) == 0);
                input.readFully(bytes, 0, size);
                read += size;
                String py = new String(bytes, 0, size, encoding);
                //System.out.println(py);
                pyMap.put(mark, py);
                if ("zuo".equals(py)) {
                    break;
                }
            }
            if (flag1 == 0x44) {
                input.skip(0x2628 - read);
            } else if (flag1 == 0x45) {
                input.skip(0x26C4 - read);
            } else {
                throw new RuntimeException("出现意外,联系作者");
            }
            StringBuffer buffer = new StringBuffer();
            Map&lt;String, List&lt;String&gt;&gt; wordMap = new LinkedHashMap&lt;String, List&lt;String&gt;&gt;();
            while (true) {
                int size = readUnsignedShort(input);
                if (size &lt; 0) {
                    break;
                }
                int count = readUnsignedShort(input);
                int len = count / 2;
                assert (len * 2 == count);
                buffer.setLength(0);
                for (int i = 0; i &lt; len; i++) {
                    int key = readUnsignedShort(input);
                    buffer.append(pyMap.get(key)).append("'");
                }
                buffer.setLength(buffer.length() - 1);
                String py = buffer.toString();
                List&lt;String&gt; list = wordMap.get(py);
                if (list == null) {
                    list = new ArrayList&lt;String&gt;();
                    wordMap.put(py, list);
                }
                for (int i = 0; i &lt; size; i++) {
                    count = readUnsignedShort(input);
                    if (count &gt; bytes.length) {
                        bytes = new byte[count];
                    }
                    input.readFully(bytes, 0, count);
                    String word = new String(bytes, 0, count, encoding);
                    //接下来12个字节可能是词频或者类似信息
                    input.skip(12);
                    list.add(word);
                }
            }
            //System.out.println(wordMap.size());
            model.setWordMap(wordMap);
            return model;
        } finally {
            in.close();
        }
    }

    protected final int readUnsignedShort(InputStream in) throws IOException {
        int ch1 = in.read();
        int ch2 = in.read();
        if ((ch1 | ch2) &lt; 0) {
            return Integer.MIN_VALUE;
        }
        return (ch2 &lt;&lt; + (ch1 &lt;&lt; 0);
    }

}

// Move this class into its own file and make it a public class.
/**
 * Plain data holder for the contents of one parsed {@code .scel} dictionary:
 * four descriptive header strings and the pinyin-to-words table.
 */
class SougouScelMdel {

    /** Words grouped by their pinyin key. */
    private Map<String, List<String>> wordMap;

    private String name;
    private String type;
    private String description;
    private String sample;

    /**
     * @return the pinyin-to-words table, or {@code null} if none has been set
     */
    public Map<String, List<String>> getWordMap() {
        return wordMap;
    }

    /**
     * Stores the given table by reference (no copy is made).
     *
     * @param wordMap pinyin-to-words table
     */
    void setWordMap(Map<String, List<String>> wordMap) {
        this.wordMap = wordMap;
    }

    /** @return the dictionary type string */
    public String getType() {
        return type;
    }

    /** @param type the dictionary type string */
    public void setType(String type) {
        this.type = type;
    }

    /** @return the dictionary description */
    public String getDescription() {
        return description;
    }

    /** @param description the dictionary description */
    public void setDescription(String description) {
        this.description = description;
    }

    /** @return the sample text stored in the dictionary header */
    public String getSample() {
        return sample;
    }

    /** @param sample the sample text stored in the dictionary header */
    public void setSample(String sample) {
        this.sample = sample;
    }

    /** @return the dictionary name */
    public String getName() {
        return name;
    }

    /** @param name the dictionary name */
    public void setName(String name) {
        this.name = name;
    }

}
</pre>



<pre name="code" class="java">package com.qin.parse.scel;

import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.file.Files;
import java.nio.file.LinkOption;
import java.nio.file.Paths;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Map.Entry;

/**
* 解析sogo词库工具类
*
*
* **/
public class ParseSogo {

public static void main(String[] args)throws Exception {

   sogou("D:\\词库\\dianshang.scel","D:\\词库\\goods1.txt",false);
}
  
/**
* 读取scel的词库文件
* 生成txt格式的文件
* @param inputPath 输入路径
* @param outputPath 输出路径
* @param isAppend  是否拼接追加词库内容
* true 代表追加,false代表重建
*
* **/
   private static void sogou(String inputPath,String outputPath,boolean isAppend) throws IOException{ 
       File file=new File(inputPath); 
       if(!isAppend){
       if(Files.exists(Paths.get(outputPath),LinkOption.values())){
       System.out.println("存储此文件已经删除");
       Files.deleteIfExists(Paths.get(outputPath));
      
       }
       }
       RandomAccessFile raf=new RandomAccessFile(outputPath, "rw");
     
       int count=0;
       SougouScelMdel model = new SougouScelReader().read(file); 
       Map&lt;String,List&lt;String&gt;&gt; words = model.getWordMap(); //词&lt;拼音,词&gt; 
       Set&lt;Entry&lt;String,List&lt;String&gt;&gt;&gt; set = words.entrySet(); 
       Iterator&lt;Entry&lt;String,List&lt;String&gt;&gt;&gt; iter = set.iterator(); 
       while(iter.hasNext()){ 
           Entry&lt;String,List&lt;String&gt;&gt; entry = iter.next(); 
           List&lt;String&gt; list = entry.getValue(); 
           int size = list.size(); 
           for(int i = 0; i &lt; size; i++){ 
               String word = list.get(i); 
              
               //System.out.println(word);
               raf.seek(raf.getFilePointer());
               raf.write((word+"\n").getBytes());//写入txt文件
               count++;
              
              
           } 
       } 
       raf.close();
       System.out.println("生成txt成功!,总计写入: "+count+" 条数据!");
   } 

}
</pre>
论坛首页 Java企业应用版

跳转论坛:
Global site tag (gtag.js) - Google Analytics