java解析搜狗词库scel文件到txt



SougouScelReader 读取词库文件类
import java.io.*;
import java.net.URL;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
/**
 * 读取搜索词库
 *
 * @author dengjh
 * @create 2016-11-03 9:39
 **/
public class SougouScelReader {

    public SougouScelMdel read(File file) throws IOException {
        return read(new FileInputStream(file));
    }

    public SougouScelMdel read(URL url) throws IOException {
        return read(url.openStream());
    }

    protected ByteArrayOutputStream output=new ByteArrayOutputStream();

    protected String readString(DataInputStream input,int pos,int[] reads) throws IOException {
        int read=reads[0];
        input.skip(pos-read);
        read=pos;
        output.reset();
        while(true) {
            int c1 = input.read();
            int c2 = input.read();
            read+=2;
            if(c1==0 && c2==0) {
                break;
            } else {
                output.write(c1);
                output.write(c2);
            }
        }
        reads[0]=read;
        return new String(output.toByteArray(),encoding);
    }

    protected static String encoding = "UTF-16LE";

    public SougouScelMdel read(InputStream in) throws IOException {
        SougouScelMdel model = new SougouScelMdel();
        DataInputStream input = new DataInputStream(in);
        int read;
        try {
            byte[] bytes = new byte[4];
            input.readFully(bytes);
            assert (bytes[0] == 0x40 && bytes[1] == 0x15 && bytes[2] == 0 && bytes[3] == 0);
            input.readFully(bytes);
            int flag1 = bytes[0];
            assert (bytes[1] == 0x43 && bytes[2] == 0x53 && bytes[3] == 0x01);
            int[] reads=new int[]{8};
            model.setName(readString(input,0x130,reads));
            model.setType(readString(input,0x338,reads));
            model.setDescription(readString(input,0x540,reads));
            model.setSample(readString(input,0xd40,reads));
            read = reads[0];
            input.skip(0x1540 - read);
            read=0x1540;
            input.readFully(bytes);
            read += 4;
            assert (bytes[0] == (byte) 0x9D && bytes[1] == 0x01 && bytes[2] == 0 && bytes[3] == 0);
            bytes = new byte[128];
            Map<Integer, String> pyMap = new LinkedHashMap<Integer, String>();
            while (true) {
                int mark = readUnsignedShort(input);
                int size = input.readUnsignedByte();
                input.skip(1);
                read += 4;
                assert (size > 0 && (size % 2) == 0);
                input.readFully(bytes, 0, size);
                read += size;
                String py = new String(bytes, 0, size, encoding);
                //System.out.println(py);
                pyMap.put(mark, py);
                if ("zuo".equals(py)) {
                    break;
                }
            }
            if (flag1 == 0x44) {
                input.skip(0x2628 - read);
            } else if (flag1 == 0x45) {
                input.skip(0x26C4 - read);
            } else {
                throw new RuntimeException("出现意外,联系作者");
            }
            StringBuffer buffer = new StringBuffer();
            Map<String, List<String>> wordMap = new LinkedHashMap<String, List<String>>();
            while (true) {
                int size = readUnsignedShort(input);
                if (size < 0) {
                    break;
                }
                int count = readUnsignedShort(input);
                int len = count / 2;
                assert (len * 2 == count);
                buffer.setLength(0);
                for (int i = 0; i < len; i++) {
                    int key = readUnsignedShort(input);
                    buffer.append(pyMap.get(key)).append("'");
                }
                buffer.setLength(buffer.length() - 1);
                String py = buffer.toString();
                List<String> list = wordMap.get(py);
                if (list == null) {
                    list = new ArrayList<String>();
                    wordMap.put(py, list);
                }
                for (int i = 0; i < size; i++) {
                    count = readUnsignedShort(input);
                    if (count > bytes.length) {
                        bytes = new byte[count];
                    }
                    input.readFully(bytes, 0, count);
                    String word = new String(bytes, 0, count, encoding);
                    //接下来12个字节可能是词频或者类似信息
                    input.skip(12);
                    list.add(word);
                }
            }
            //System.out.println(wordMap.size());
            model.setWordMap(wordMap);
            return model;
        } finally {
            in.close();
        }
    }

    protected final int readUnsignedShort(InputStream in) throws IOException {
        int ch1 = in.read();
        int ch2 = in.read();
        if ((ch1 | ch2) < 0) {
            return Integer.MIN_VALUE;
        }
        return (ch2 << 8) + (ch1 << 0);
    }
}


SougouScelMdel.java 
import java.util.List;
import java.util.Map;

/**
 * @author dengjh
 * @create 2016-11-03 9:40
 **/
public class SougouScelMdel {

    private Map<String, List<String>> wordMap;

    private String name;
    private String type;
    private String description;
    private String sample;

    public Map<String, List<String>> getWordMap() {
        return wordMap;
    }

    void setWordMap(Map<String, List<String>> wordMap) {
        this.wordMap = wordMap;
    }

    public String getType() {
        return type;
    }

    public void setType(String type) {
        this.type = type;
    }

    public String getDescription() {
        return description;
    }

    public void setDescription(String description) {
        this.description = description;
    }

    public String getSample() {
        return sample;
    }

    public void setSample(String sample) {
        this.sample = sample;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

}

ParseSogo.java 解析词库文件类
 
/**
 * 解析搜狗词库文件
 *
 * @author dengjh
 * @create 2016-11-03 9:44
 **/
import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.file.Files;
import java.nio.file.LinkOption;
import java.nio.file.Paths;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Map.Entry;
public class ParseSogo {

    public static void main(String[] args)throws Exception {

        sogou("D:\\scel\\goods.scel","D:\\scel\\goods.txt",true);
    }

    /**
     * 读取scel的词库文件
     * 生成txt格式的文件
     * @param inputPath 输入路径
     * @param outputPath 输出路径
     * @param isAppend  是否拼接追加词库内容 true 代表追加,false代表重建
     *
     * **/
    private static void sogou(String inputPath,String outputPath,boolean isAppend) throws IOException{
        File file=new File(inputPath);
        if(!isAppend){
            if(Files.exists(Paths.get(outputPath),LinkOption.values())){
                System.out.println("存储此文件已经删除");
                Files.deleteIfExists(Paths.get(outputPath));

            }
        }
        RandomAccessFile raf=new RandomAccessFile(outputPath, "rw");

        int count=0;
        SougouScelMdel model = new SougouScelReader().read(file);
        Map<String,List<String>> words = model.getWordMap(); //词<拼音,词>
        Set<Entry<String,List<String>>> set = words.entrySet();
        Iterator<Entry<String,List<String>>> iter = set.iterator();
        while(iter.hasNext()){
            Entry<String,List<String>> entry = iter.next();
            List<String> list = entry.getValue();
            int size = list.size();
            for(int i = 0; i < size; i++){
                String word = list.get(i);

                //System.out.println(word);
                raf.seek(raf.getFilePointer());
                raf.write((word+"\n").getBytes());//写入txt文件
                count++;


            }
        }
        raf.close();
        System.out.println("生成txt成功!,总计写入: "+count+" 条数据!");
    }

}
评论 3
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值