Chinese Dictionary Conversion Program

Converts Chinese word lists (UTF-8 text files, one word per line) into a serialized HashMap: every prefix of every word gets an Item entry recording its frequency, whether it is a complete word, and whether a longer word extends it.
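For illustration, a word-list file under the input directory might contain (hypothetical sample, one word per line):

中
中国
中国人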

Item.java holds one entry of the map: the character sequence, its frequency, whether it is a complete word, and whether any longer word extends it.

package test;

import java.io.Serializable;

public class Item implements Serializable {

    private static final long serialVersionUID = 1L;

    private String content;        // the character sequence itself
    private int count;             // number of times the sequence was seen
    private boolean isWord;        // true if the sequence is a complete word
    private boolean haveSuccessor; // true if a longer word starts with this sequence

    public String getContent() {
        return content;
    }

    public void setContent(String content) {
        this.content = content;
    }

    public int getCount() {
        return count;
    }

    public void setCount(int count) {
        this.count = count;
    }

    public boolean isWord() {
        return isWord;
    }

    public void setWord(boolean isWord) {
        this.isWord = isWord;
    }

    public boolean isHaveSuccessor() {
        return haveSuccessor;
    }

    public void setHaveSuccessor(boolean haveSuccessor) {
        this.haveSuccessor = haveSuccessor;
    }
}

Main.java drives the conversion: it loads the serialized map if one exists (otherwise starts an empty one), feeds every file under the word directory through scan, and writes the map back to disk.

package test;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.util.HashMap;

public class Main {

    public static void main(String[] args) throws FileNotFoundException,
            IOException {

        String file = "f:/test/results.obj"; // serialized dictionary
        String dir = "f:/word";              // directory of UTF-8 word-list files

        HashMap<String, Item> map = null;

        File filer = new File(file);
        if (!filer.exists()) {
            map = new HashMap<String, Item>();
            Main.save(file, map);
        } else {
            map = Main.load(file);
        }

        String[] results = Main.list(dir);
        if (results != null) {
            for (int i = 0; i < results.length; i++) {
                try {
                    Main.read(dir + "/" + results[i], map);
                } catch (Exception e) {
                    e.printStackTrace(); // skip unreadable files, keep going
                }
            }
        }

        Main.save(file, map);
    }

    public static void read(String file, HashMap<String, Item> map) throws IOException {
        // Word lists are UTF-8 text, one word per line.
        InputStreamReader reader = new InputStreamReader(new FileInputStream(file), "UTF-8");
        BufferedReader br = new BufferedReader(reader);
        String s1 = null;
        while ((s1 = Main.read(br)) != null) {
            System.out.println(s1);
            Main.scan(s1, map);
        }
        br.close();
        reader.close();
    }

    public static String read(BufferedReader reader) {
        String result = null;

        try {
            result = reader.readLine();
        } catch (IOException e) {
            e.printStackTrace();
        }

        return result; // null at end of file or on error
    }

    public static String[] list(String path) {
        File file = new File(path);
        String[] result = null;

        if (file.exists()) {
            result = file.list();
        }
        return result;
    }

    @SuppressWarnings("unchecked")
    public static HashMap<String, Item> load(String file) throws FileNotFoundException,
            IOException {
        HashMap<String, Item> map = null;

        ObjectInputStream in = null;
        try {
            in = new ObjectInputStream(new FileInputStream(new File(file)));
            map = (HashMap<String, Item>) in.readObject();
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        } finally {
            // Close the stream in all cases.
            if (in != null) {
                try {
                    in.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }

        return map;
    }

    public static void save(String file, HashMap<String, Item> map) {
        ObjectOutputStream out = null;
        try {
            out = new ObjectOutputStream(new FileOutputStream(new File(file)));
            out.writeObject(map);
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Close the stream in all cases.
            if (out != null) {
                try {
                    out.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    public static void scan(String content, HashMap<String, Item> map) {
        String temp = null;
        Item item = null;
        int s = content.length();
        for (int i = 0; i < s; i++) {
            if (i < s - 1) {
                // Proper prefix of length i + 1: not a word by itself here,
                // but some longer word starts with it.
                temp = content.substring(0, i + 1);
                item = map.get(temp);
                if (item != null) {
                    item.setCount(item.getCount() + 1);
                    item.setHaveSuccessor(true);
                } else {
                    item = new Item();
                    item.setContent(temp);
                    item.setCount(1);
                    item.setWord(false);
                    item.setHaveSuccessor(true);
                }
            } else {
                // The complete word itself (also covers one-character words).
                temp = content;
                item = map.get(temp);
                if (item != null) {
                    item.setCount(item.getCount() + 1);
                    item.setWord(true);
                } else {
                    item = new Item();
                    item.setContent(temp);
                    item.setCount(1);
                    item.setWord(true);
                    item.setHaveSuccessor(false);
                }
            }
            map.put(temp, item);
        }
    }
}
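As a quick illustration (a minimal sketch, not from the original post), scanning the single entry 中国人 puts three entries into the map: the proper prefixes 中 and 中国, flagged as non-words that have successors, and the full word 中国人:

    HashMap<String, Item> map = new HashMap<String, Item>();
    Main.scan("中国人", map);
    // map now holds:
    //   "中"     : count=1, isWord=false, haveSuccessor=true
    //   "中国"   : count=1, isWord=false, haveSuccessor=true
    //   "中国人" : count=1, isWord=true,  haveSuccessor=false
    System.out.println(map.size()); // prints 3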

Test.java reloads the serialized map and prints every entry so the result can be inspected:

package test;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.util.HashMap;

public class Test {

    public static void main(String[] args) throws FileNotFoundException, IOException {

        HashMap<String, Item> map = Test.load("f:/test/results.obj");

        for (String str : map.keySet()) {
            Item item = map.get(str);
            System.out.println(str + " : " + item.getCount() + " , "
                    + item.isWord() + " , " + item.isHaveSuccessor());
        }

        System.out.println(map.size());
    }

    @SuppressWarnings("unchecked")
    public static HashMap<String, Item> load(String file) throws FileNotFoundException,
            IOException {
        HashMap<String, Item> map = null;

        ObjectInputStream in = null;
        try {
            in = new ObjectInputStream(new FileInputStream(new File(file)));
            map = (HashMap<String, Item>) in.readObject();
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        } finally {
            // Close the stream in all cases.
            if (in != null) {
                try {
                    in.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }

        return map;
    }

}
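The two flags are exactly what a forward-maximum-matching segmenter needs: keep extending a candidate while haveSuccessor is true, and remember the longest prefix whose isWord flag is set. The original post stops at building the map; the method below is a minimal sketch of such a segmenter on top of it (the name segment and the space-joined output are assumptions, not part of the post):

    // Hypothetical helper: forward maximum matching over the loaded map.
    public static String segment(String text, HashMap<String, Item> map) {
        StringBuilder out = new StringBuilder();
        int i = 0;
        while (i < text.length()) {
            int best = i + 1; // fall back to a single character
            for (int j = i + 1; j <= text.length(); j++) {
                Item item = map.get(text.substring(i, j));
                if (item == null) {
                    break; // unknown sequence: no longer match starts here
                }
                if (item.isWord()) {
                    best = j; // longest complete word so far
                }
                if (!item.isHaveSuccessor()) {
                    break; // no longer word extends this prefix
                }
            }
            out.append(text.substring(i, best)).append(' ');
            i = best;
        }
        return out.toString().trim();
    }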

The following is an example program that trains a Chinese word-segmentation model using Python and TensorFlow:

```python
import tensorflow as tf
import numpy as np
import os

# Load the dataset
def load_data(data_dir):
    train_data = []
    train_label = []
    test_data = []
    test_label = []
    with open(os.path.join(data_dir, 'train.txt'), 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            segs = line.split('\t')
            train_data.append(segs[0])
            train_label.append(segs[1])
    with open(os.path.join(data_dir, 'test.txt'), 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            segs = line.split('\t')
            test_data.append(segs[0])
            test_label.append(segs[1])
    return train_data, train_label, test_data, test_label

# Build the vocabulary
def build_vocab(data):
    vocab = set()
    for sentence in data:
        for word in sentence:
            vocab.add(word)
    return sorted(list(vocab))

# Generate word vectors
def generate_word_vectors(word_index, embedding_size=128):
    vocab_size = len(word_index)
    word_vectors = np.random.uniform(-0.25, 0.25, (vocab_size, embedding_size))
    word_vectors[0] = np.zeros((embedding_size,))
    return word_vectors

# Convert text to a sequence of integer indices
def text_to_sequence(text, word_index):
    seq = []
    for word in text:
        if word in word_index:
            seq.append(word_index[word])
        else:
            seq.append(0)
    return seq

# Build the model
def build_model(vocab_size, embedding_size, hidden_size, num_classes):
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, embedding_size, mask_zero=True),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(hidden_size)),
        tf.keras.layers.Dense(num_classes, activation='softmax')
    ])
    return model

if __name__ == '__main__':
    data_dir = 'data'    # directory containing the dataset
    embedding_size = 128 # word-vector dimension
    hidden_size = 64     # LSTM hidden-layer size
    batch_size = 64      # batch size
    epochs = 5           # number of training epochs

    # Load the dataset
    train_data, train_label, test_data, test_label = load_data(data_dir)

    # Labels are read as '0'/'1' strings; convert them to integer arrays
    train_label = np.array(train_label, dtype=np.int32)
    test_label = np.array(test_label, dtype=np.int32)

    # Build the vocabulary
    vocab = build_vocab(train_data)

    # Generate word vectors
    word_index = {word: i for i, word in enumerate(vocab)}
    word_vectors = generate_word_vectors(word_index)

    # Convert the texts to integer sequences
    train_data = [text_to_sequence(text, word_index) for text in train_data]
    test_data = [text_to_sequence(text, word_index) for text in test_data]

    # Pad the sequences
    train_data = tf.keras.preprocessing.sequence.pad_sequences(train_data, padding='post')
    test_data = tf.keras.preprocessing.sequence.pad_sequences(test_data, padding='post')

    # Build the model
    model = build_model(len(vocab), embedding_size, hidden_size, 2)
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    # Train the model
    model.fit(train_data, train_label,
              batch_size=batch_size, epochs=epochs,
              validation_data=(test_data, test_label))

    # Save the model
    model.save('model.h5')
```

The program uses a bidirectional LSTM as the main building block of the model, with randomly initialized word vectors as input. Both the training and the test set pair a text sequence with a label of 0 or 1, indicating whether the current position should be split. After training, the model is saved to the file "model.h5" for later prediction.