在 Colab 中使用 Stanford CoreNLP 进行分词和词性标注

最新推荐文章于 2023-02-14 17:20:28 发布

zhuozhuomin

最新推荐文章于 2023-02-14 17:20:28 发布

阅读量408

点赞数

本文链接：https://blog.csdn.net/zhuozhuomin/article/details/107501768

版权

安装 stanfordcorenlp（Stanford CoreNLP 的 Python 封装库）

# Install the `stanfordcorenlp` package that the later cells import.
!pip install stanfordcorenlp

下载 Stanford CoreNLP 压缩包

# Download the CoreNLP distribution. The version is pinned to 4.0.0 because the
# later cells hard-code the '/content/stanford-corenlp-4.0.0' directory; the
# moving "-latest" archive unpacks to a differently named directory once a newer
# release is published. -O keeps the local file name the unzip step expects.
!wget -O stanford-corenlp-latest.zip https://nlp.stanford.edu/software/stanford-corenlp-4.0.0.zip

解压

# Extract the downloaded archive into /content.
!unzip '/content/stanford-corenlp-latest.zip' -d '/content'

改目录

import os
os.chdir('/content/stanford-corenlp-4.0.0')
!pwd

样例一

from stanfordcorenlp import StanfordCoreNLP
import nltk
from nltk.tree import Tree as nltkTree

# Point the wrapper at the directory that holds the unpacked CoreNLP files.
nlp = StanfordCoreNLP('/content/stanford-corenlp-4.0.0')

# Sentence to annotate.
sentence = 'Video becomes a new way of communication between Internet users with the proliferation of sensor-rich mobile devices.'

try:
    print('Part of Speech:', nlp.pos_tag(sentence))
    # Label corrected: this line prints the dependency parse, not POS tags.
    print('Dependency Parsing:', nlp.dependency_parse(sentence))
    print(nlp.word_tokenize(sentence))
    print(nlp.ner(sentence))
    print(nlp.parse(sentence))
finally:
    # Release the wrapper's backend even if an annotation call fails; the
    # next example creates its own instance, so leaving this one open would
    # keep a second backend alive for no reason.
    nlp.close()

样例二：将词性标注结果写入 JSON 文件

# POS-tag the first 2000 captions and dump the result as JSON.
#
# Precondition: `annotations` must already be defined (it is not created in
# this cell). From the way it is indexed below it is expected to be a mapping
# with an 'annotations' list whose items carry 'caption' and 'image_id'.
import json  # needed for json.dump below; the original cell never imported it

from stanfordcorenlp import StanfordCoreNLP
import nltk
from nltk.tree import Tree as nltkTree

# Point the wrapper at the directory that holds the unpacked CoreNLP files.
nlp = StanfordCoreNLP('/content/stanford-corenlp-4.0.0')

# Not used in this cell; kept so any later cell that refers to them still works.
all_cap_pos = []
all_img_id = []

# One entry per caption: {'word': [...], 'pos': [...], 'image_id': ...}.
examples = []

try:
    for annot in annotations['annotations'][:2000]:
        # pos_tag yields (token, tag) pairs for the caption.
        cap_pos = nlp.pos_tag(annot['caption'])
        # Build fresh lists per caption instead of sharing and manually
        # resetting module-level accumulators.
        examples.append({
            'word': [token for token, _ in cap_pos],
            'pos': [tag for _, tag in cap_pos],
            'image_id': annot['image_id'],
        })
finally:
    # Always release the wrapper's backend, even if tagging fails part-way.
    nlp.close()

with open('/content/cap_pos.json', 'w', encoding='utf-8') as f:
    json.dump(examples, f)