Python快速生成100M大小TXT文本，超三千万汉字

最新推荐文章于 2025-11-19 09:42:17 发布

原创 · 最新推荐文章于 2025-11-19 09:42:17 发布 · 365 阅读

3 ·

CC 4.0 BY-SA版权

文章标签：

#python #开发语言 #文本生成

代码如下

import random
import os

# Base vocabulary pools. Each list holds ten candidate strings; the generator
# code in this script picks entries from them with random.choice.
# surnames / names: family-name and given-name characters that are concatenated
# into a character name.
surnames = ['张', '李', '王', '赵', '陈', '周', '徐', '孙', '马', '朱']
names = ['伟', '芳', '强', '敏', '磊', '婷', '军', '杰', '琳', '浩']
# cities: place names used as the stem of a location phrase.
cities = ['京城', '幽州', '临安', '洛阳', '长安', '金陵', '成都', '襄阳', '姑苏', '扬州']
# nouns: scenery / building nouns used for the {place} placeholder.
nouns = ['山脉', '河流', '森林', '宫殿', '市场', '客栈', '寺庙', '村庄', '庭院', '地窖']
# verbs: actions used for the {action} and {action2} placeholders.
verbs = ['行走', '观察', '发现', '讨论', '思考', '寻找', '建造', '战斗', '阅读', '谈判']
# adjectives: note that every entry already ends with the particle '的'.
adjectives = ['古老的', '神秘的', '繁华的', '宁静的', '险峻的', '宏伟的', '破败的', '热闹的', '幽暗的', '神圣的']
# events / emotions: fillers for the {event} and {emotion} placeholders.
events = ['庆典', '阴谋', '邂逅', '探索', '战斗', '比武', '祭祀', '交易', '叛乱', '结盟']
emotions = ['惊讶', '恐惧', '喜悦', '疑惑', '愤怒', '悲伤', '期待', '失望', '平静', '兴奋']

# Element generators
def generate_character():
    """Return a random character name: one surname followed by one given name."""
    # Draw the surname first, then the given name, so the sequence of random
    # draws is fixed.
    family = random.choice(surnames)
    given = random.choice(names)
    return family + given

def generate_location():
    """Return a random city name with a random positional suffix appended."""
    suffixes = ['的郊外', '的中心', '附近', '一角', '深处']
    # City is drawn before the suffix.
    city = random.choice(cities)
    return city + random.choice(suffixes)

# Sentence templates (several variants to increase variety). The {placeholders}
# are filled in with str.format by generate_sentence().
# NOTE: every entry in `adjectives` already ends with '的', so a template must
# not put another '的' directly after {adjective}; doing so produced text such
# as "在古老的的山脉".
templates = [
    "{character}来到{location}，{action}。",
    "在{adjective}{place}，众人{action}。",
    "突然，{character}感到{emotion}，{action}。",
    "传说中{place}隐藏着{adjective}秘密。",
    "{character}与{character2}{action}，引发了{event}。",
    "经过漫长的{action}，{character}终于{action2}。",
    "{time}的{place}笼罩在{adjective}氛围中。",
    "{character}翻开{adjective}典籍，{action}。",
    "当{event}发生时，{character}{action}。",
    "{character}凝视着{adjective}{place}，{action}。"
]

# Sentence generator
def generate_sentence():
    """Return one sentence built from a randomly chosen template.

    A value is drawn for every placeholder that any template may use;
    str.format simply ignores the ones the chosen template does not reference.
    If the template names a placeholder that is not supplied, the resulting
    KeyError is caught and a fresh sentence is generated by calling this
    function again.
    """
    chosen = random.choice(templates)
    try:
        # Dict entries are evaluated top to bottom, which fixes the order of
        # the random draws.
        fillers = {
            'character': generate_character(),
            'character2': generate_character(),
            'location': generate_location(),
            'adjective': random.choice(adjectives),
            'place': random.choice(nouns),
            'event': random.choice(events),
            'emotion': random.choice(emotions),
            'action': random.choice(verbs),
            'action2': random.choice(verbs),
            'time': random.choice(['清晨', '正午', '黄昏', '深夜']),
        }
        return chosen.format(**fillers)
    except KeyError:
        # Guard against a template/placeholder mismatch: try again.
        return generate_sentence()

def generate_paragraph(min_sentences=4, max_sentences=8):
    """Return one paragraph of random sentences.

    The number of sentences is drawn uniformly from
    [min_sentences, max_sentences]. The paragraph starts with two full-width
    spaces, its sentences are separated by a single ASCII space, and it ends
    with a blank line.
    """
    how_many = random.randint(min_sentences, max_sentences)
    body = ' '.join(generate_sentence() for _ in range(how_many))
    return '　　' + body + '\n\n'

# Chapter title generation
chapter_themes = ['初遇', '风波', '征程', '谜团', '决战', '暗涌', '往昔', '奇缘', '困局', '新生']
chapter_modifiers = ['暗夜的', '黎明的', '血色', '黄金', '沉默的', '最后的', '被遗忘的', '禁忌']

def generate_chapter_title(chapter_num):
    """Return a heading of the form "第<chapter_num>章　<modifier><theme>".

    The modifier and the theme are picked at random (modifier first) from
    chapter_modifiers and chapter_themes.
    """
    modifier = random.choice(chapter_modifiers)
    theme = random.choice(chapter_themes)
    return "第{}章　{}{}".format(chapter_num, modifier, theme)

# Main text-generation routine
def generate_large_text(target_size_mb=100, filename="test_text.txt", progress_interval_mb=10):
    """Write randomly generated chapters to *filename* until a target size is reached.

    Args:
        target_size_mb: Target size in MiB. Size is tracked as the number of
            UTF-8 encoded bytes of the text passed to write().
        filename: Output path; the file is opened in 'w' mode, so existing
            content is replaced.
        progress_interval_mb: A progress line is printed each time this many
            additional MiB have been written. A value <= 0 disables progress
            output.

    The size check runs only after a whole paragraph has been written, so the
    result can overshoot the target slightly. When finished, the on-disk size
    reported by os.path.getsize is printed.
    """
    mib = 1024 * 1024
    target_size = target_size_mb * mib
    progress_step = progress_interval_mb * mib
    # Size at which the next progress line is due. The previous modulo-based
    # check was true for every chapter while the size stayed inside the same
    # MiB bucket, which printed the same progress line hundreds of times.
    next_report = progress_step
    current_size = 0
    chapter_num = 1

    with open(filename, 'w', encoding='utf-8') as f:
        while current_size < target_size:
            # Chapter heading followed by a blank line
            title_block = f"{generate_chapter_title(chapter_num)}\n\n"
            f.write(title_block)
            current_size += len(title_block.encode('utf-8'))

            # Chapter body: 3 to 8 paragraphs
            for _ in range(random.randint(3, 8)):
                paragraph = generate_paragraph()
                f.write(paragraph)
                current_size += len(paragraph.encode('utf-8'))

                # Stop mid-chapter once the target has been reached
                if current_size >= target_size:
                    break

            # Report progress once per crossed threshold
            if progress_step > 0 and current_size >= next_report:
                print(f"已生成 {current_size // mib} MB...")
                # Jump past every threshold already crossed
                next_report = (current_size // progress_step + 1) * progress_step

            chapter_num += 1
            # The while condition ends the loop once the target is reached.

    print(f"文件生成完成，最终大小：{os.path.getsize(filename)//1024//1024} MB")

# Run the generator when executed as a script
# (note: producing a 100 MB file takes a fairly long time).
if __name__ == "__main__":
    generate_large_text(target_size_mb=100)

统计字数

import re
import os

def count_file_words(file_path, mode='all'):
    """Count the characters of a UTF-8 text file according to *mode*.

    Args:
        file_path: Path of the file to scan.
        mode: Counting mode (case-insensitive):
            'all'     - every character, including punctuation, spaces and
                        newline characters
            'content' - non-whitespace characters only (spaces, newlines and
                        tabs are excluded)
            'cjk'     - characters in U+4E00-U+9FFF, U+3000-U+303F and
                        U+FF00-U+FFEF (ideographs plus CJK / full-width
                        punctuation)
            'strict'  - characters in U+4E00-U+9FFF only

    Returns:
        The number of matching characters, or -1 (after printing a message)
        if the file does not exist, the mode is unknown, or any other error
        occurs while reading.
    """
    patterns = {
        # '.' does not match '\n' by default, which silently dropped one
        # character per line; [\s\S] matches every character.
        'all': r'[\s\S]',
        'content': r'[^\s]',
        'cjk': r'[\u4E00-\u9FFF\u3000-\u303F\uFF00-\uFFEF]',
        'strict': r'[\u4E00-\u9FFF]'
    }

    try:
        # Compile once instead of looking the pattern up for every line.
        pattern = re.compile(patterns[mode.lower()])
        with open(file_path, 'r', encoding='utf-8') as f:
            # Stream line by line so large files are never fully loaded.
            return sum(len(pattern.findall(line)) for line in f)
    except FileNotFoundError:
        print(f"错误：文件 {file_path} 不存在")
        return -1
    except Exception as e:
        # Best-effort contract: unknown mode, decode errors etc. are reported
        # and signalled with -1 rather than raised.
        print(f"发生错误：{str(e)}")
        return -1

# Usage example
if __name__ == "__main__":
    file_path = "test_text.txt"  # path of the file to analyse

    # Show basic file information
    size_in_mb = os.path.getsize(file_path) / 1024 / 1024
    print(f"文件信息：{file_path}")
    print(f"文件大小：{size_in_mb:.2f} MB\n")

    # Counting modes mapped to their display labels, in output order
    labels = {
        'all': "总字符数（包含所有字符）",
        'content': "内容字符数（排除空白符）",
        'cjk': "中日韩文字字符（含中文标点）",
        'strict': "严格中文汉字数",
    }

    for mode, description in labels.items():
        word_count = count_file_words(file_path, mode)
        # A negative result means count_file_words already reported an error
        if word_count < 0:
            continue
        print(f"{description}：{word_count:,} 字")