#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Created on Mon Oct 29 09:36:34 2018
@author: Robert
https://pyltp.readthedocs.io/zh_CN/latest/api.html#id2
注意和准备事项
1. 中文文本编码必须为utf-8。
2. 准备好ltp_data_v3.4.0数据包。
"""
# Paths to the user's Chinese corpora and the LTP model package.
import os

input_file = r"D:/nltk_data/mycorpora/三毛/撒哈拉的故事.txt"
input_file2 = r"D:/MoYan/蛙.txt"

# Root directory of the ltp_data_v3.4.0 model package.
LTP_DATA_DIR = r"D:/NLTK_data/mycorpora/ltp_data_v3.4.0"

# Model files shipped with LTP:
#   cws.model    - word segmentation
#   pos.model    - part-of-speech tagging
#   parser.model - dependency parsing
seg_model_path, pos_model_path, par_model_path = (
    os.path.join(LTP_DATA_DIR, model_name)
    for model_name in ('cws.model', 'pos.model', 'parser.model')
)
# --- Sentence splitting ---
from pyltp import SentenceSplitter

# Split a plain string into sentences.
sents = SentenceSplitter.split("元芳你怎么看?我就趴窗口上看呗!")
print("\n".join(sents))

# Split the contents of a corpus file. Context managers close the file
# handles (the original opened them and never closed them).
with open(input_file, encoding="utf-8") as f:
    raw = f.read()
sents = SentenceSplitter.split(raw)
print("\n".join(sents))

with open(input_file2, encoding="utf-8") as f:
    raw = f.read()
sents = SentenceSplitter.split(raw)
print("\n".join(sents))

# Write the last split result to a file, one sentence per line.
with open(r"D:/MyResult.txt", "w", encoding="utf-8") as out:
    out.write("\n".join(sents))
# --- Word segmentation ---
from pyltp import Segmentor

segmentor = Segmentor()         # create the segmenter instance
segmentor.load(seg_model_path)  # load the segmentation model

# Segment a plain string.
words = segmentor.segment("元芳你怎么看")
print("\n".join(words))

# Segment the contents of a file. The context manager closes the handle
# (the original leaked the open file object).
with open(input_file, encoding='utf-8') as f:
    raw = f.read()
words = segmentor.segment(raw)
print("\n".join(words))

# Write the space-separated segmentation to the result file.
with open(r"D:/MyResult.txt", "w", encoding="utf-8") as out:
    out.write(" ".join(words))

segmentor.release()             # free the model
# --- Part-of-speech tagging ---
from pyltp import Segmentor
from pyltp import Postagger

segmentor = Segmentor()         # create instances
postagger = Postagger()
segmentor.load(seg_model_path)  # load the models
postagger.load(pos_model_path)

# Tag an already-segmented word list.
words = ['元芳', '你', '怎么', '看']  # segmentation result
postags = postagger.postag(words)     # POS tagging
print("\t".join(postags))

# Segment and tag the contents of a file. Context manager closes the
# handle (the original left the file open).
with open(input_file2, encoding='utf-8') as f:
    raw = f.read()
words = segmentor.segment(raw)
postags = postagger.postag(words)     # POS tagging

# Pair each word with its tag as "word/pos"; a comprehension replaces
# the original manual zip-and-append loop.
result_list = [word + "/" + pos for word, pos in zip(words, postags)]

# Write the tagged tokens to the result file.
with open(r"D:/MyResult.txt", "w", encoding="utf-8") as out:
    out.write(" ".join(result_list))

segmentor.release()             # free the models
postagger.release()
# --- Dependency parsing ---
# (Setup is repeated here so this section can run on its own.)
import os
input_file = r"D:/nltk_data/mycorpora/三毛/撒哈拉的故事.txt"
input_file2 = r"D:/MoYan/蛙.txt"
LTP_DATA_DIR = r"D:/NLTK_data/mycorpora/ltp_data_v3.4.0"  # LTP model directory
seg_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')      # segmentation model
pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')      # POS-tagging model
par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')   # dependency-parser model
from pyltp import SentenceSplitter
from pyltp import Segmentor
from pyltp import Postagger
from pyltp import Parser

segmentor = Segmentor()         # create instances
postagger = Postagger()
parser = Parser()
segmentor.load(seg_model_path)  # load the models
postagger.load(pos_model_path)
parser.load(par_model_path)

# Parse an already-segmented, already-tagged sentence.
words = ['元芳', '你', '怎么', '看']
postags = ['nh', 'r', 'r', 'v']
arcs = parser.parse(words, postags)  # dependency parse
print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))

# Read the corpus; context manager closes the handle (the original
# leaked the open file object).
with open(input_file2, encoding='utf-8') as f:
    raw = f.read()
sents = SentenceSplitter.split(raw)  # must split into sentences first, otherwise parsing fails

# Collect the per-sentence report as chunks and join once at the end;
# the original `outStr = outStr + ...` concatenation is quadratic on
# large corpora.
chunks = []
for sent in sents:
    words = segmentor.segment(sent)       # segment one sentence
    postags = postagger.postag(words)     # POS tagging
    arcs = parser.parse(words, postags)   # dependency parse
    chunks.append(sent + "\n")
    chunks.append("\t".join(words) + "\n")
    chunks.append("\t".join(postags) + "\n")
    chunks.append("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs) + "\n\n")
outStr = "".join(chunks)

# Write the full report to the result file.
with open(r"D:/MyResult.txt", "w", encoding="utf-8") as out:
    out.write(outStr)

segmentor.release()             # free the models
postagger.release()
parser.release()