下载数据集
数据集地址: sentence-compression.tsv.gz
数据集论文: Overcoming the Lack of Parallel Data in Sentence Compression(Filippova & Altun,EMNLP 2013)
数据集内容:10 个训练数据集,每个包括 20000 组数据;1 个测试数据集,包括 10000 组数据。下面是一组数据的示例。我们只取其中的 “sentence” 和 “headline” 两个字段,作为同义转换的训练数据对。
{
"graph": {
"id": "0",
"sentence": "Five people have been taken to hospital with minor injuries following a crash on the A17 near Sleaford this morning.",
"node": [ {
"form": "ROOT",
"word": [ {
"id": -1,
"form": "ROOT",
"stem": "ROOT",
"tag": "ROOT"
} ],
"gender": 0,
"head_word_index": 0
}, {
"form": "Five people",
"word": [ {
"id": 13,
"form": "Five",
"stem": "five",
"tag": "CD"
}, {
"id": 14,
"form": "people",
"stem": "person",
"tag": "NNS"
} ],
"gender": 0,
"head_word_index": 1
}, {
"form": "have been taken",
"word": [ {
"id": 15,
"form": "have",
"stem": "have",
"tag": "VBP"
}, {
"id": 16,
"form": "been",
"stem": "be",
"tag": "VBN"
}, {
"id": 17,
"form": "taken",
"stem": "take",
"tag": "VBN"
} ],
"gender": 0,
"head_word_index": 2
}, {
"form": "to hospital",
"word": [ {
"id": 18,
"form": "to",
"stem": "to",
"tag": "IN"
}, {
"id": 19,
"form": "hospital",
"stem": "hospital",
"tag": "NN"
} ],
"gender": 0,
"head_word_index": 1
}, {
"form": "minor",
"word": [ {
"id": 21,
"form": "minor",
"stem": "minor",
"tag": "JJ"
} ],
"gender": 0,
"head_word_index": 0
}, {
"form": "with injuries",
"word": [ {
"id": 20,
"form": "with",
"stem": "with",
"tag": "IN"
}, {
"id": 22,
"form": "injuries",
"stem": "injury",
"tag": "NNS"
} ],
"gender": 0,
"head_word_index": 1
}, {
"form": "following a crash",
"word": [ {
"id": 23,
"form": "following",
"stem": "follow",
"tag": "VBG"
}, {
"id": 24,
"form": "a",
"stem": "a",
"tag": "DT"
}, {
"id": 25,
"form": "crash",
"stem": "crash",
"tag": "NN"
} ],
"gender": 0,
"head_word_index": 2
}, {
"form": "on the A17",
"type": "LOC",
"mid": "/m/08tthd",
"word": [ {
"id": 26,
"form": "on",
"stem": "on",
"tag": "IN"
}, {
"id": 27,
"form": "the",
"stem": "the",
"tag": "DT"
}, {
"id": 28,
"form": "A17",
"stem": "A17",
"tag": "NNP"
} ],
"gender": 0,
"head_word_index": 2
}, {
"form": "near Sleaford",
"type": "LOC",
"mid": "/m/01cfbw",
"word": [ {
"id": 29,
"form": "near",
"stem": "near",
"tag": "IN"
}, {
"id&#