2023年1月 Python处理数据集的路径工具

前言

在对代码类型的数据集进行预处理的时候,通常一组文件需要经过多次处理,同时还需要保持目录结构不变。
下面以flask的源码作为数据集举例:

raw-data目录:

flask
│  app.py
│  blueprints.py
│  cli.py
│  config.py
│  ctx.py
│  debughelpers.py
│  globals.py
│  helpers.py
│  logging.py
│  scaffold.py
│  sessions.py
│  signals.py
│  templating.py
│  testing.py
│  typing.py
│  views.py
│  wrappers.py
│  __init__.py
│  __main__.py
│
└─json
        provider.py
        tag.py
        __init__.py

clean-data目录:

flask
│  app.py
│  blueprints.py
│  cli.py
│  config.py
│  ctx.py
│  debughelpers.py
│  globals.py
│  helpers.py
│  logging.py
│  scaffold.py
│  sessions.py
│  signals.py
│  templating.py
│  testing.py
│  typing.py
│  views.py
│  wrappers.py
│  __init__.py
│  __main__.py
│
└─json
        provider.py
        tag.py
        __init__.py

json-data目录:

    proj_depends.json
    proj_token_count.json
    proj_token_no.json

encoded-data目录:

│  flask_feature_data.csv
│  flask_feature_label.csv
│
└─flask
        app.jsonl
        blueprints.jsonl
        cli.jsonl
        __init__.jsonl
        __main__.jsonl

logs目录:

    flask.log

这些文件之间存在潜在的对应关系,代码中也经常需要把一个阶段的路径转换成另一个阶段的路径。如果每次都手动转换,不便于管理;而且有时仅凭文件名很难判断文件处于哪一步处理阶段。因此想到用变量的类型来标明文件所处的阶段,于是把与路径相关的部分抽象出来,重构出了path_utils.py。

完整代码

# _*_ coding: utf-8 _*_
# @Time    :   2023/01/05 11:17:17
# @FileName:   path_utils.py
# @Author  :   
# @Software:   VSCode
import logging
from pathlib import Path, WindowsPath, PosixPath
# from config import config
import os
from typing import Generator, ClassVar, TypeVar

# Concrete pathlib class for the running platform; the custom path classes
# below derive from it.
if os.name == 'nt':
    _cls = WindowsPath
else:
    _cls = PosixPath

# Loggers that have already been configured, keyed by the relative path of
# their log file.
loggers = {}

# Root directory of each processing stage (plus the log directory).
config = {
    "datasets": {
        "raw-data": "datasets/raw-data",
        "clean-data": "datasets/clean-data",
        "json-data": "datasets/json-data",
        "encoded-data": "datasets/encoded-data",
        "log": "logs",
    },
}


# raw-data、clean-data、*tmp.py、json-data、*.jsonl、flask.jsonl、proj_*.json、feature_*.csv

class _PrefixPath(_cls):
    """Base class for paths that always live under a fixed, per-class prefix.

    Abstract: do not instantiate it directly; use a subclass that sets
    ``prefix``. According to the original author's note, ``joinpath``,
    ``rglob`` and ``with_suffix`` already return the custom type, so they are
    not overridden here.

    NOTE(review): the prefix is injected by passing extra segments to
    pathlib's ``__new__``; this appears to rely on the pre-3.12 pathlib
    construction protocol - confirm before upgrading Python.
    """
    # Directory prepended to every path of this class; subclasses override it.
    prefix: ClassVar[str] = ""

    def __new__(cls, *args, **kwargs):
        """Create a path located under ``cls.prefix``.

        If the first positional argument is itself a ``_PrefixPath``, it is
        converted with ``change_to_path`` (its own prefix is replaced by
        ``cls.prefix``) and any further positional segments are appended.
        Otherwise ``cls.prefix`` is prepended to the given segments.
        """
        if args and isinstance(args[0], _PrefixPath):
            # Forward the remaining segments as well; previously they were
            # silently dropped. Keyword arguments are still not forwarded,
            # because change_to_path supplies its own ``prefix=`` keyword.
            return args[0].change_to_path(cls, *args[1:])
        return super().__new__(cls, cls.prefix, *args, **kwargs)

    def __getnewargs__(self):
        """Return the prefix-free parts, i.e. the arguments that rebuild this path."""
        return self.get_relative_path().parts

    def __reduce__(self):
        """Pickle as ``(type, relative parts)``.

        ``__new__`` prepends the prefix again on unpickling, so storing the
        relative parts keeps the prefix from being duplicated.
        """
        return type(self), self.__getnewargs__()

    def change_to_path(self, path_type: type, *args, **kwargs):
        """Return the same relative location as an instance of ``path_type``.

        Args:
            path_type: target ``_PrefixPath`` subclass.
            *args: extra path segments appended after the relative path.
            **kwargs: forwarded unchanged to the ``path_type`` constructor.
        """
        # NOTE(review): the ``prefix=`` keyword looks redundant - the prefix
        # actually used is ``path_type.prefix`` inside ``__new__`` - but it is
        # kept in case a subclass constructor reads it.
        return path_type(self.get_relative_path(), *args,
                         prefix=path_type.prefix, **kwargs)

    def make_parent_dirs(self, mode: int = 0o777, parents: bool = True,
                         exist_ok: bool = True) -> None:
        """Create the parent directory of this path.

        The arguments are passed straight to ``Path.mkdir``; per the original
        author's note, ``0o777`` is also pathlib's own default for ``mode``.
        """
        return self.parent.mkdir(mode=mode, parents=parents, exist_ok=exist_ok)

    def get_relative_path(self):
        """Return this path relative to ``self.prefix`` as a plain ``Path``.

        ``relative_to`` raises ``ValueError`` when the path does not lie under
        the prefix.
        """
        return Path(self.relative_to(self.prefix))

    def get_logger(self) -> logging.Logger:
        """Return the logger of the ``LogFilePath`` that corresponds to this path."""
        return self.change_to_path(LogFilePath).logger


# Type variable for "some _PrefixPath subclass". The name passed to TypeVar
# must match the variable it is assigned to, otherwise type checkers reject it.
_PrefixPathType = TypeVar('_PrefixPathType', bound=_PrefixPath)


class _SuffixPath(_PrefixPath):
    """Base class for prefixed paths whose file name ends in a fixed suffix.

    Abstract: do not instantiate it directly; use a subclass that sets
    ``suffix``.
    """
    # Ending applied to the file name. NOTE: on this class and its subclasses
    # the attribute shadows pathlib's ``suffix`` property.
    suffix: ClassVar[str] = ""

    def __new__(cls: type, *args, **kwargs):
        """Create a path under ``cls.prefix`` whose name ends in ``cls.suffix``.

        A leading ``_PrefixPath`` argument is first reduced to its relative
        path so its old prefix is not nested under the new one. The class
        suffix is applied only when the name does not already end with it,
        which keeps re-wrapping and pickle round-trips from duplicating it.
        """
        if args and isinstance(args[0], _PrefixPath):
            args = (args[0].get_relative_path(), *args[1:])
        # Build a plain platform path first: no prefix is added at this stage.
        obj = super(_PrefixPath, cls).__new__(_cls, *args, **kwargs)
        if not (cls.suffix and obj.name.endswith(cls.suffix)):
            obj = _SuffixPath.with_suffix(obj, cls.suffix)
        return super().__new__(cls, obj)  # the prefix is added here

    def with_suffix(self, suffix):
        """Return a new path with the file suffix changed.  If the path
        has no suffix, add given suffix.  If the given suffix is an empty
        string, remove the suffix from the path.

        Unlike the code it is modelled on, this version does not require the
        suffix to start with a dot, so endings such as ``_module_apis.json``
        are accepted.

        Raises:
            ValueError: if ``suffix`` contains a path separator, is exactly
                ``'.'``, or the path has an empty name.
        """
        # NOTE(review): uses private pathlib attributes (_flavour, _drv,
        # _root, _parts, _from_parsed_parts); these appear to have been
        # reworked in Python 3.12 - confirm before upgrading.
        f = self._flavour
        if f.sep in suffix or f.altsep and f.altsep in suffix:
            raise ValueError("Invalid suffix %r" % (suffix,))
        if suffix == '.':
            raise ValueError("Invalid suffix %r" % (suffix,))
        name = self.name
        if not name:
            raise ValueError("%r has an empty name" % (self,))
        # For a _SuffixPath instance this is the class-level suffix; for the
        # plain path that __new__ passes in it is pathlib's own property.
        old_suffix = self.suffix
        if not old_suffix:
            name = name + suffix
        else:
            name = name[:-len(old_suffix)] + suffix
        return self._from_parsed_parts(self._drv, self._root,
                                       self._parts[:-1] + [name])


class RawFilePath(_PrefixPath):
    """Path under the raw-data root, ``config["datasets"]["raw-data"]``."""
    prefix = config["datasets"]["raw-data"]
    # prefix = "datasets/raw-data" # used for testing with a relative path


class CleanFilePath(_PrefixPath):
    """Path under the clean-data root, ``config["datasets"]["clean-data"]``."""
    prefix = config["datasets"]["clean-data"]


class JsonFilePath(_PrefixPath):
    """Path under the json-data root, ``config["datasets"]["json-data"]``."""
    prefix = config["datasets"]["json-data"]


class EncodedFilePath(_PrefixPath):
    """Path under the encoded-data root, ``config["datasets"]["encoded-data"]``."""
    prefix = config["datasets"]["encoded-data"]


class _LogFilePath(_PrefixPath):
    """Path under the log directory, ``config["datasets"]["log"]``."""
    prefix = config["datasets"]["log"]


class ModuleApisJsonFilePath(_SuffixPath, JsonFilePath):
    """Json-data path whose file name ends in ``_module_apis.json``."""
    suffix = "_module_apis.json"


class DefApisJsonFilePath(_SuffixPath, JsonFilePath):
    """Json-data path whose file name ends in ``_def_apis.json``."""
    suffix = "_def_apis.json"


class JsonListEncodedFilePath(_SuffixPath, EncodedFilePath):
    """Encoded-data path whose file name ends in ``.jsonl``."""
    suffix = ".jsonl"


class LogFilePath(_SuffixPath, _LogFilePath):
    """Log-file path (``.log``) that carries a ready-to-use ``logger`` attribute.

    Creating an instance makes the parent directory of the log file and
    attaches a logger. Loggers are cached in the module-level ``loggers``
    dict, keyed by the relative path, so equal paths share one logger.
    """
    suffix = ".log"

    def __new__(cls, *args, **kwargs):
        """Build the path, then attach its logger before returning it."""
        path = super().__new__(cls, *args, **kwargs)
        return cls.__attach_logger(path)

    # Delegating unknown attributes to the logger through __getattr__ was
    # abandoned: per the original author's note, pathlib's own logic catches
    # AttributeError, so that approach cannot be used.

    @staticmethod
    def __create_logger(obj, name):
        """Configure and return the logger ``name`` writing to the file ``obj``."""
        log = logging.getLogger(name)
        log.setLevel(level=logging.DEBUG)

        # Console handler: INFO and above.
        stream_handler = logging.StreamHandler()
        stream_handler.setLevel(level=logging.INFO)
        log.addHandler(stream_handler)

        # File handler: DEBUG and above (i.e. everything); the file is opened
        # with mode "w".
        file_handler = logging.FileHandler(str(obj), encoding="utf-8", mode="w")
        file_handler.setLevel(level=logging.DEBUG)
        file_handler.setFormatter(logging.Formatter(
            '[%(asctime)s | %(filename)s | line %(lineno)d] - %(levelname)s: %(message)s'))
        log.addHandler(file_handler)

        log.debug('-' * 100)
        log.debug('Start print log')
        return log

    @staticmethod
    def __attach_logger(obj):
        """Ensure the log directory exists and set ``obj.logger`` (cached per path)."""
        obj.make_parent_dirs()
        key = str(obj.get_relative_path())
        cached = loggers.get(key)
        if cached is None:
            cached = loggers[key] = LogFilePath.__create_logger(obj, key)
        obj.logger = cached
        return obj


if __name__ == "__main__":
    import pickle

    raw_file_path = RawFilePath("flask")

    # Show each path together with its concrete type.
    for derived in (
        raw_file_path,
        raw_file_path.joinpath("123"),
        raw_file_path.with_suffix(".456"),
    ):
        print(derived)
        print(type(derived))
    # for i in raw_file_path.rglob("*"):
    #     print(i, type(i))

    # Convert between the different path classes.
    module_path = ModuleApisJsonFilePath("101")
    for target in (CleanFilePath, ModuleApisJsonFilePath, JsonListEncodedFilePath):
        print(raw_file_path.change_to_path(target))
    for target in (RawFilePath, JsonFilePath):
        print(module_path.change_to_path(target))

    print(raw_file_path.glob("*"))

    # A path built without segments, and the prefix-free views.
    clean = CleanFilePath()
    print(clean)
    for prefixed in (clean, raw_file_path):
        print(prefixed.get_relative_path())

    # Passing an existing prefixed path to a constructor converts it.
    for path_type in (RawFilePath, EncodedFilePath):
        print(path_type(raw_file_path))

    # After a pickle round-trip the path is the same as before, so the
    # classes can be used with multiprocessing.
    raw_pickle = pickle.dumps(raw_file_path)
    print(raw_pickle)
    raw_load = pickle.loads(raw_pickle)
    print(raw_load)

    # The matching log path exposes its logger.
    log_path = raw_file_path.change_to_path(LogFilePath)
    print(log_path)
    log_path.logger.debug("3333")

输出

datasets\raw-data\flask
<class '__main__.RawFilePath'>
datasets\raw-data\flask\123
<class '__main__.RawFilePath'>
datasets\raw-data\flask.456
<class '__main__.RawFilePath'>
datasets\clean-data\flask
datasets\json-data\flask_module_apis.json
datasets\encoded-data\flask.jsonl
datasets\raw-data\101_module_apis.json
datasets\json-data\101_module_apis.json
<generator object Path.glob at 0x000001BFFC59B340>
datasets\clean-data
.
flask
datasets\raw-data\flask
datasets\encoded-data\flask
b'\x80\x04\x95(\x00\x00\x00\x00\x00\x00\x00\x8c\x08__main__\x94\x8c\x0bRawFilePath\x94\x93\x94\x8c\x05flask\x94\x85\x94R\x94.'
datasets\raw-data\flask
logs\flask.log
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值