读取cnn_dl

最新推荐文章于 2024-07-28 20:53:06 发布

畫聿

最新推荐文章于 2024-07-28 20:53:06 发布

阅读量26

点赞数 1

文章标签： cnn 人工智能神经网络

本文链接：https://blog.csdn.net/qq_44951348/article/details/132094894

版权

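下面是代码：

该脚本读取 ./cnn/stories 目录下的 CNN story 文件，把每个文件拆分为正文和 @highlight 摘要，并将结果以指令微调格式（instruction / input / output）写入 result.json。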

import hashlib
import os
import json
import datasets

# Typographic (curly) closing quotation marks, written as Unicode escapes.
DM_SINGLE_CLOSE_QUOTE = "\u2019"  # right single quotation mark
DM_DOUBLE_CLOSE_QUOTE = "\u201d"  # right double quotation mark

# Tokens accepted as a valid way to end a sentence.
END_TOKENS = [
    ".",
    "!",
    "?",
    "...",
    "'",
    "`",
    '"',
    DM_SINGLE_CLOSE_QUOTE,
    DM_DOUBLE_CLOSE_QUOTE,
    ")",
]


def _read_text_file_path(path):
    """Return the lines of the UTF-8 text file at ``path``.

    Every line has its leading and trailing whitespace (including the
    newline) stripped. Blank lines are kept as empty strings.
    """
    with open(path, mode="r", encoding="utf-8") as handle:
        return list(map(str.strip, handle))


# def _read_text_file(file):
#     return [line.decode("utf-8").strip() for line in file]

def _read_text_file(text_file):
    """Read the UTF-8 file at path ``text_file`` into a list of stripped lines.

    Surrounding whitespace (including the trailing newline) is removed from
    each line; empty lines are preserved as ``""`` entries.
    """
    with open(text_file, "r", encoding='utf-8') as stream:
        return [raw_line.strip() for raw_line in stream]


def _version_at_least(version, minimum):
    """Return True when ``version`` is numerically >= ``minimum``.

    Both values are converted with ``str()`` and compared component-wise as
    integers, so ``"10.0.0"`` is correctly newer than ``"2.0.0"`` (a plain
    string comparison is lexicographic and gets this wrong). Shorter versions
    are zero-padded, so ``"2"`` equals ``"2.0.0"``. If either value is not a
    dotted sequence of integers, the original ``>=`` comparison is used.
    """
    def as_numbers(value):
        return tuple(int(piece) for piece in str(value).split("."))

    try:
        have = as_numbers(version)
        want = as_numbers(minimum)
    except ValueError:
        # Not a plain "X.Y.Z" version; defer to the objects' own ordering.
        return version >= minimum

    width = max(len(have), len(want))
    have += (0,) * (width - len(have))
    want += (0,) * (width - len(want))
    return have >= want


def _get_art_abs(story_file, tfds_version):
    """Get abstract (highlights) and article from a story file path.

    Based on https://github.com/abisee/cnn-dailymail/blob/master/
    make_datafiles.py. The github code lowercases the text; that step was
    removed in 3.0.0, so the text is kept in its original case here.

    Args:
        story_file: Path of the story file, read via ``_read_text_file``.
        tfds_version: Dataset version, e.g. ``"3.0.0"``. From 2.0.0 on the
            highlights are joined with newlines, before that with spaces.
            Presumably a version object whose ``str()`` is ``"X.Y.Z"`` also
            works; verify against the caller.

    Returns:
        A ``(article, abstract)`` tuple of strings. ``article`` is every
        non-empty line before the first ``@highlight`` marker joined with
        spaces; ``abstract`` is every non-empty, non-marker line after it.
    """
    lines = _read_text_file(story_file)

    # Put periods on the ends of lines that are missing them
    # (this is a problem in the dataset because many image captions don't end in
    # periods; consequently they end up in the body of the article as run-on
    # sentences)
    def fix_missing_period(line):
        """Adds a period to a line that is missing a period."""
        if "@highlight" in line:
            return line
        if not line:
            return line
        if line[-1] in END_TOKENS:
            return line
        return line + " ."

    lines = [fix_missing_period(line) for line in lines]

    # Separate out article and abstract sentences
    article_lines = []
    highlights = []
    # Never reset once set: everything after the first marker is a highlight.
    next_is_highlight = False
    for line in lines:
        if not line:
            continue  # empty line
        elif line.startswith("@highlight"):
            next_is_highlight = True
        elif next_is_highlight:
            highlights.append(line)
        else:
            article_lines.append(line)

    # Make article into a single string
    article = " ".join(article_lines)

    # Compare numerically: as plain strings "10.0.0" would sort before "2.0.0".
    if _version_at_least(tfds_version, "2.0.0"):
        abstract = "\n".join(highlights)
    else:
        abstract = " ".join(highlights)

    return article, abstract


def _generate_examples(file):
    """Build one instruction-tuning record from the story file at ``file``.

    The story is split with ``_get_art_abs`` using version ``'3.0.0'``.

    Returns:
        A dict with the keys ``"instruction"``, ``"input"`` (the article) and
        ``"output"`` (the highlights). If either the article or the
        highlights are empty, ``'no'`` is printed and ``None`` is returned.
    """
    article, highlights = _get_art_abs(file, '3.0.0')
    if article and highlights:
        record = {
            "instruction": 'Please help me to summary this article.',
            "input": article,
            "output": highlights
        }
        return record

    # Unusable story: report it and hand None back to the caller.
    print('no')
    return None


def main(stories_dir="./cnn/stories", json_file_path="./result.json"):
    """Convert every story file in ``stories_dir`` into one JSON list.

    Each directory entry is passed to ``_generate_examples``. Stories for
    which it returns ``None`` (empty article or highlights) are skipped, so
    the output contains no ``null`` entries.

    Args:
        stories_dir: Directory containing the story files.
        json_file_path: Path of the JSON file to write (overwritten).
    """
    records = []
    # os.listdir returns entries in arbitrary order; sort so that repeated
    # runs over the same directory produce the same output file.
    for file_name in sorted(os.listdir(stories_dir)):
        example = _generate_examples(os.path.join(stories_dir, file_name))
        if example is not None:
            records.append(example)

    # The context manager guarantees the file is flushed and closed.
    with open(json_file_path, mode="w", encoding="utf-8") as json_file:
        json.dump(records, json_file)


if __name__ == '__main__':
    main()