下面是代码:
在这里插入代码片
import hashlib
import os
import json
import datasets
# Typographic closing quotes: U+2019 (right single quotation mark) and
# U+201D (right double quotation mark).
DM_SINGLE_CLOSE_QUOTE = "\u2019"
DM_DOUBLE_CLOSE_QUOTE = "\u201d"

# Tokens accepted as a valid way to end a sentence.
END_TOKENS = [
    ".",
    "!",
    "?",
    "...",
    "'",
    "`",
    '"',
    DM_SINGLE_CLOSE_QUOTE,
    DM_DOUBLE_CLOSE_QUOTE,
    ")",
]
def _read_text_file_path(path):
    """Read a UTF-8 text file and return its lines with whitespace stripped.

    Args:
        path: Path of the text file to read.

    Returns:
        A list with one string per line of the file, in file order. Leading
        and trailing whitespace (including the newline) is removed from each
        line, so blank lines are returned as empty strings.
    """
    with open(path, "r", encoding="utf-8") as f:
        return [line.strip() for line in f]
# def _read_text_file(file):
# return [line.decode("utf-8").strip() for line in file]
def _read_text_file(text_file):
    """Read a UTF-8 text file and return its lines with whitespace stripped.

    Args:
        text_file: Path of the text file to read.

    Returns:
        A list with one string per line of the file, in file order. Each line
        has leading and trailing whitespace (including the newline) removed,
        so blank lines come back as empty strings.
    """
    with open(text_file, "r", encoding="utf-8") as f:
        return [line.strip() for line in f]
def _version_tuple(version):
    """Convert a dotted version string such as ``"3.0.0"`` into ``(3, 0, 0)``.

    Only the leading ASCII digits of each dot-separated component are used (a
    component without leading digits counts as 0). The result is padded with
    zeros to at least three components so that ``"2.0"`` compares equal to
    ``"2.0.0"``.
    """
    numbers = []
    for component in str(version).split("."):
        digits = ""
        for char in component:
            if char not in "0123456789":
                break
            digits += char
        numbers.append(int(digits) if digits else 0)
    numbers.extend([0] * (3 - len(numbers)))
    return tuple(numbers)


def _get_art_abs(story_file, tfds_version):
    """Get abstract (highlights) and article from a story file path.

    Args:
        story_file: Path of the story file, read via ``_read_text_file``.
        tfds_version: Dotted version string (e.g. ``"3.0.0"``). For versions
            at or above 2.0.0 the highlights are joined with newlines,
            otherwise with single spaces.

    Returns:
        A ``(article, abstract)`` tuple of strings. ``article`` is every
        non-empty line before the first ``@highlight`` marker joined with
        spaces; ``abstract`` is every non-empty, non-marker line from the
        first marker onwards.
    """
    # Based on https://github.com/abisee/cnn-dailymail/blob/master/
    # make_datafiles.py
    lines = _read_text_file(story_file)

    # The GitHub code lowercases the text; that step was removed in 3.0.0.

    # Put periods on the ends of lines that are missing them (this is a
    # problem in the dataset because many image captions don't end in
    # periods; consequently they end up in the body of the article as run-on
    # sentences).
    def fix_missing_period(line):
        """Adds a period to a line that is missing a period."""
        if "@highlight" in line:
            return line
        if not line:
            return line
        if line[-1] in END_TOKENS:
            return line
        return line + " ."

    lines = [fix_missing_period(line) for line in lines]

    # Separate out article and abstract sentences. The flag is never reset,
    # so every non-empty line after the first marker counts as a highlight.
    article_lines = []
    highlights = []
    next_is_highlight = False
    for line in lines:
        if not line:
            continue  # empty line
        if line.startswith("@highlight"):
            next_is_highlight = True
        elif next_is_highlight:
            highlights.append(line)
        else:
            article_lines.append(line)

    # Make article into a single string
    article = " ".join(article_lines)

    # Compare versions numerically: comparing the raw strings is
    # lexicographic and would treat "10.0.0" as older than "2.0.0".
    if _version_tuple(tfds_version) >= (2, 0, 0):
        abstract = "\n".join(highlights)
    else:
        abstract = " ".join(highlights)
    return article, abstract
def _generate_examples(file):
    """Build one instruction/input/output record from a story file.

    The story is parsed with ``_get_art_abs`` using version ``'3.0.0'``.

    Args:
        file: Path of the story file to convert.

    Returns:
        A dict with the keys ``"instruction"``, ``"input"`` (the article) and
        ``"output"`` (the highlights), or ``None`` when either the article or
        the highlights are empty (in which case ``no`` is printed).
    """
    article, highlights = _get_art_abs(file, '3.0.0')
    if article and highlights:
        return {
            "instruction": 'Please help me to summary this article.',
            "input": article,
            "output": highlights
        }
    # Incomplete story: report it and hand back None.
    print('no')
    return None
def main():
    """Convert every story under ``./cnn/stories`` into ``./result.json``.

    Each directory entry is passed to ``_generate_examples``; the resulting
    records are collected into a list and written as a single JSON array.
    Stories for which ``_generate_examples`` returns ``None`` are skipped so
    the output contains no ``null`` entries.
    """
    stories_dir = "./cnn/stories"
    json_file_path = './result.json'

    total_list = []
    # Sort the listing so the output order is reproducible across runs
    # (os.listdir returns entries in arbitrary order).
    for file in sorted(os.listdir(stories_dir)):
        example = _generate_examples(os.path.join(stories_dir, file))
        if example is not None:
            total_list.append(example)

    # Open the output only once the data is ready, and let the context
    # manager flush and close the handle.
    with open(json_file_path, mode='w', encoding='utf-8') as json_file:
        json.dump(total_list, json_file)
# Run the conversion only when executed as a script, not on import.
if __name__ == '__main__':
    main()