tarfile.ReadError: not a gzip file / download_20newsgroups fails to fetch the dataset

Workaround:

This error usually means the 20news-bydate.tar.gz archive that scikit-learn tries to download was truncated or corrupted (for example because the download was interrupted or blocked), so the cached file cannot be opened as a gzip archive. The workaround is to download the archive by hand, place it at ./data/20news-bydate.tar.gz, and load it locally instead of letting scikit-learn fetch it.

Change:

dataset = fetch_20newsgroups(categories=categories)

to:

dataset = _fetch_20newsgroups(categories=categories)

and add the following helper functions:

import os
import shutil
import tarfile

import numpy as np
from sklearn.datasets import load_files
# On scikit-learn >= 0.22 these helpers live in sklearn.datasets._twenty_newsgroups instead
from sklearn.datasets.twenty_newsgroups import (strip_newsgroup_header,
                                                strip_newsgroup_footer,
                                                strip_newsgroup_quoting)
from sklearn.utils import check_random_state

TRAIN_FOLDER = "20news-bydate-train"
TEST_FOLDER = "20news-bydate-test"

categories = ['comp.graphics', 'rec.sport.baseball', 'talk.politics.guns']

def download_20newsgroups(target_dir):
    """Extract the manually downloaded 20 newsgroups archive and load it with load_files."""
    train_path = os.path.join(target_dir, TRAIN_FOLDER)
    test_path = os.path.join(target_dir, TEST_FOLDER)

    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    # Path to the archive downloaded by hand
    archive_path = "./data/20news-bydate.tar.gz"

    with tarfile.open(archive_path, "r:gz") as archive:
        archive.extractall(path=target_dir)

    # Load the extracted train/test folders into Bunch objects
    cache = dict(train=load_files(train_path, encoding='latin1'),
                 test=load_files(test_path, encoding='latin1'))

    # The extracted files are no longer needed once loaded into memory
    shutil.rmtree(target_dir)
    return cache

def _fetch_20newsgroups(data_home=None, subset='train', categories=None,
                        shuffle=True, random_state=42,
                        remove=(),
                        download_if_missing=True):
    """Local replacement for fetch_20newsgroups that reads a manually downloaded
    archive instead of downloading it; data_home and download_if_missing are kept
    only for signature compatibility and are ignored."""
    # Extract the local archive into a temporary folder and load it
    twenty_home = os.path.join("./", "20news_home")
    cache = download_20newsgroups(target_dir=twenty_home)

    if subset in ('train', 'test'):
        data = cache[subset]
    elif subset == 'all':
        data_lst = list()
        target = list()
        filenames = list()
        for subset in ('train', 'test'):
            data = cache[subset]
            data_lst.extend(data.data)
            target.extend(data.target)
            filenames.extend(data.filenames)

        data.data = data_lst
        data.target = np.array(target)
        data.filenames = np.array(filenames)
    else:
        raise ValueError(
            "subset can only be 'train', 'test' or 'all', got '%s'" % subset)

    data.description = 'the 20 newsgroups by date dataset'

    if 'headers' in remove:
        data.data = [strip_newsgroup_header(text) for text in data.data]
    if 'footers' in remove:
        data.data = [strip_newsgroup_footer(text) for text in data.data]
    if 'quotes' in remove:
        data.data = [strip_newsgroup_quoting(text) for text in data.data]

    if categories is not None:
        labels = [(data.target_names.index(cat), cat) for cat in categories]
        # Sort the categories to have the ordering of the labels
        labels.sort()
        labels, categories = zip(*labels)
        mask = np.in1d(data.target, labels)
        data.filenames = data.filenames[mask]
        data.target = data.target[mask]
        # searchsorted to have continuous labels
        data.target = np.searchsorted(labels, data.target)
        data.target_names = list(categories)
        # Use an object array to shuffle: avoids memory copy
        data_lst = np.array(data.data, dtype=object)
        data_lst = data_lst[mask]
        data.data = data_lst.tolist()

    if shuffle:
        random_state = check_random_state(random_state)
        indices = np.arange(data.target.shape[0])
        random_state.shuffle(indices)
        data.filenames = data.filenames[indices]
        data.target = data.target[indices]
        # Use an object array to shuffle: avoids memory copy
        data_lst = np.array(data.data, dtype=object)
        data_lst = data_lst[indices]
        data.data = data_lst.tolist()

    return data
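
For reference, a minimal usage sketch (the TF-IDF step is only an illustration; it assumes the archive has already been placed at ./data/20news-bydate.tar.gz as described above):

from sklearn.feature_extraction.text import TfidfVectorizer

# Load only the three categories defined above, stripping headers, footers and quoted replies
dataset = _fetch_20newsgroups(subset='train', categories=categories,
                              remove=('headers', 'footers', 'quotes'))
print(len(dataset.data), "documents in", len(dataset.target_names), "categories")

# Turn the raw texts into a TF-IDF matrix for downstream use (clustering, classification, ...)
vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')
X = vectorizer.fit_transform(dataset.data)
print(X.shape)

The returned object behaves like the Bunch from scikit-learn's own fetch_20newsgroups, with .data, .target, .target_names and .filenames attributes.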

Reader comment (07-25):

[/public/home/pengjy/anaconda3] >>> PREFIX=/public/home/pengjy/anaconda3
WARNING: md5sum mismatch of tar archive
expected: 8a581514493c9e0a1cbd425bc1c7dd90
got: 614f6284c34f91affd38a1be2e4be076
- Unpacking payload ...
Traceback (most recent call last):
  File "entry_point.py", line 76, in <module>
  File "tarfile.py", line 2024, in extractall
  File "tarfile.py", line 2065, in extract
  File "tarfile.py", line 2137, in _extract_member
  File "tarfile.py", line 2186, in makefile
  File "tarfile.py", line 249, in copyfileobj
tarfile.ReadError: unexpected end of data
[210095] Failed to execute script entry_point
concurrent.futures.process._RemoteTraceback:
'''
Traceback (most recent call last):
  File "concurrent/futures/process.py", line 368, in _queue_management_worker
  File "multiprocessing/connection.py", line 251, in recv
TypeError: __init__() missing 1 required positional argument: 'msg'
'''
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
  File "entry_point.py", line 69, in <module>
  File "concurrent/futures/process.py", line 484, in _chain_from_iterable_of_lists
  File "concurrent/futures/_base.py", line 611, in result_iterator
  File "concurrent/futures/_base.py", line 439, in result
  File "concurrent/futures/_base.py", line 388, in __get_result
concurrent.futures.process.BrokenProcessPool: A process in the process pool was terminated abruptly while the future was running or pending.
[210105] Failed to execute script entry_point

What is causing this, and how can it be fixed?
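
A note on the question above: that traceback comes from the Anaconda installer itself, not from the 20 newsgroups code. The "md5sum mismatch of tar archive" warning means the downloaded installer file is incomplete or corrupted, so its embedded tar payload ends prematurely ("unexpected end of data") and the installer's worker processes then crash (BrokenProcessPool). Re-downloading the installer and comparing its checksum against the value published on the Anaconda download page usually fixes it. A small sketch for checking the hash locally (the installer filename below is only an example):

import hashlib

def file_md5(path, chunk_size=1 << 20):
    """Compute the MD5 of a local file without loading it all into memory."""
    digest = hashlib.md5()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()

# Compare the printed value with the official hash for your installer version
print(file_md5("Anaconda3-2023.03-Linux-x86_64.sh"))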
