具体处理方法如下:
处理后的语料下载地址
from time import strptime
import csv
import numpy
article_components = ['DATE', 'PLACES', 'DATELINE', 'TOPICS', 'PEOPLE', 'ORGS', 'EXCHANGES',
'COMPANIES', 'TITLE', 'BODY','HOUR']
files = ['reut2-000.sgm', 'reut2-001.sgm', 'reut2-002.sgm', 'reut2-003.sgm',
'reut2-004.sgm', 'reut2-005.sgm', 'reut2-006.sgm', 'reut2-007.sgm', 'reut2-008.sgm',
'reut2-009.sgm', 'reut2-010.sgm', 'reut2-011.sgm', 'reut2-012.sgm', 'reut2-013.sgm',
'reut2-014.sgm', 'reut2-015.sgm', 'reut2-016.sgm', 'reut2-017.sgm', 'reut2-018.sgm',
'reut2-019.sgm', &#