生成测试文本的代码如下:
import random
import os
# Base vocabulary pools sampled by the generators below.

# Family names and given names; one of each is concatenated into a full name.
surnames = [
    '张', '李', '王', '赵', '陈',
    '周', '徐', '孙', '马', '朱',
]
names = [
    '伟', '芳', '强', '敏', '磊',
    '婷', '军', '杰', '琳', '浩',
]

# City names used as the base of a location.
cities = [
    '京城', '幽州', '临安', '洛阳', '长安',
    '金陵', '成都', '襄阳', '姑苏', '扬州',
]

# Places, actions and descriptive words that fill the sentence templates.
nouns = [
    '山脉', '河流', '森林', '宫殿', '市场',
    '客栈', '寺庙', '村庄', '庭院', '地窖',
]
verbs = [
    '行走', '观察', '发现', '讨论', '思考',
    '寻找', '建造', '战斗', '阅读', '谈判',
]
# Every adjective already ends with the particle '的'.
adjectives = [
    '古老的', '神秘的', '繁华的', '宁静的', '险峻的',
    '宏伟的', '破败的', '热闹的', '幽暗的', '神圣的',
]

# Plot events and emotions.
events = [
    '庆典', '阴谋', '邂逅', '探索', '战斗',
    '比武', '祭祀', '交易', '叛乱', '结盟',
]
emotions = [
    '惊讶', '恐惧', '喜悦', '疑惑', '愤怒',
    '悲伤', '期待', '失望', '平静', '兴奋',
]
# Element generators
def generate_character():
    """Return a random character name: one surname followed by one given name."""
    return random.choice(surnames) + random.choice(names)
def generate_location():
    """Return a random location: a city name followed by a positional suffix."""
    suffixes = ['的郊外', '的中心', '附近', '一角', '深处']
    return random.choice(cities) + random.choice(suffixes)
# Sentence templates (more templates give more varied output).
# Placeholders are filled by name through str.format().
# NOTE: every entry of `adjectives` already ends with the particle '的', so a
# template must not add another '的' after {adjective}; doing so produced
# text such as "古老的的山脉".
templates = [
    "{character}来到{location},{action}。",
    "在{adjective}{place},众人{action}。",
    "突然,{character}感到{emotion},{action}。",
    "传说中{place}隐藏着{adjective}秘密。",
    "{character}与{character2}{action},引发了{event}。",
    "经过漫长的{action},{character}终于{action2}。",
    "{time}的{place}笼罩在{adjective}氛围中。",
    "{character}翻开{adjective}典籍,{action}。",
    "当{event}发生时,{character}{action}。",
    "{character}凝视着{adjective}{place},{action}。",
]
# Sentence and paragraph generation
def generate_sentence():
    """Return one sentence built from a randomly chosen template.

    A value is drawn for every placeholder any template may use, so each
    template simply picks the fields it needs. A template that refers to
    an unknown placeholder is skipped and another template is used instead.

    Raises:
        ValueError: If none of the templates can be formatted with the
            available fields.
    """
    fields = {
        'character': generate_character(),
        'character2': generate_character(),
        'location': generate_location(),
        'adjective': random.choice(adjectives),
        'place': random.choice(nouns),
        'event': random.choice(events),
        'emotion': random.choice(emotions),
        'action': random.choice(verbs),
        'action2': random.choice(verbs),
        'time': random.choice(['清晨', '正午', '黄昏', '深夜']),
    }

    # Fast path: a single uniform pick, which succeeds for every valid template.
    try:
        return random.choice(templates).format(**fields)
    except (KeyError, IndexError):
        # KeyError: unknown named placeholder; IndexError: positional
        # placeholder such as "{}" for which no argument is supplied.
        pass

    # Fallback: try every template once in random order. This always
    # terminates, unlike retrying through recursion, which ended in a
    # RecursionError when no template was usable.
    for template in random.sample(templates, len(templates)):
        try:
            return template.format(**fields)
        except (KeyError, IndexError):
            continue
    raise ValueError("no sentence template could be formatted")
def generate_paragraph(min_sentences=4, max_sentences=8):
    """Return one paragraph of randomly generated sentences.

    Args:
        min_sentences: Smallest number of sentences in the paragraph.
        max_sentences: Largest number of sentences (inclusive).

    Returns:
        A leading space, the sentences joined by single spaces, and a
        trailing blank line ("\\n\\n").
    """
    sentence_count = random.randint(min_sentences, max_sentences)
    sentences = [generate_sentence() for _ in range(sentence_count)]
    return ' ' + ' '.join(sentences) + '\n\n'
# Chapter title generation
chapter_themes = ['初遇', '风波', '征程', '谜团', '决战', '暗涌', '往昔', '奇缘', '困局', '新生']
chapter_modifiers = ['暗夜的', '黎明的', '血色', '黄金', '沉默的', '最后的', '被遗忘的', '禁忌']


def generate_chapter_title(chapter_num):
    """Return a title of the form "第<chapter_num>章 <modifier><theme>".

    The modifier and the theme are drawn independently at random from
    ``chapter_modifiers`` and ``chapter_themes``.
    """
    modifier = random.choice(chapter_modifiers)
    theme = random.choice(chapter_themes)
    return f"第{chapter_num}章 {modifier}{theme}"
# Main text generation routine
def generate_large_text(target_size_mb=100, filename="test_text.txt"):
    """Write randomly generated chapters to *filename* until a target size is reached.

    Each chapter is a title block followed by 3-8 paragraphs. Generation
    stops as soon as the number of bytes written reaches the target, so the
    last chapter may be cut short after any paragraph.

    Args:
        target_size_mb: Target size in units of 1024 * 1024 bytes.
        filename: Path of the file to create or overwrite (UTF-8 encoded).
    """
    bytes_per_mb = 1024 * 1024
    target_size = target_size_mb * bytes_per_mb
    progress_step = 10 * bytes_per_mb  # report progress every 10 MB
    next_report = progress_step
    chapter_num = 1
    current_size = 0

    # Binary mode keeps the counted bytes identical to the bytes on disk.
    # In text mode every '\n' becomes '\r\n' on Windows, so the file grew
    # past the target while the counter lagged behind.
    with open(filename, 'wb') as f:
        while current_size < target_size:
            # Chapter title
            title = generate_chapter_title(chapter_num)
            title_block = f"{title}\n\n".encode('utf-8')
            f.write(title_block)
            current_size += len(title_block)

            # Chapter body (3-8 paragraphs)
            for _ in range(random.randint(3, 8)):
                paragraph = generate_paragraph().encode('utf-8')
                f.write(paragraph)
                current_size += len(paragraph)

                if current_size >= target_size:
                    break

                # Report once per crossed threshold. The former modulo test
                # was true for every paragraph of the first MB and of every
                # tenth MB, flooding the console.
                if current_size >= next_report:
                    print(f"已生成 {current_size//1024//1024} MB...")
                    next_report = (current_size // progress_step + 1) * progress_step
            chapter_num += 1

    # The file is closed here, so the reported size reflects everything written.
    print(f"文件生成完成,最终大小:{os.path.getsize(filename)//1024//1024} MB")
# Run the generator (note: producing a 100 MB file takes a long time)
if __name__ == "__main__":
    generate_large_text(target_size_mb=100)
统计字数的代码如下:
import re
import os
def count_file_words(file_path, mode='all'):
    """Count the characters of a UTF-8 text file according to *mode*.

    Args:
        file_path: Path of the file to scan.
        mode: Counting mode (case-insensitive):
            'all'     - every character, including punctuation, spaces and
                        newlines.
            'content' - non-whitespace characters (spaces, newlines and
                        tabs are excluded).
            'cjk'     - characters in U+4E00-U+9FFF, U+3000-U+303F and
                        U+FF00-U+FFEF (ideographs plus CJK and full-width
                        punctuation).
            'strict'  - characters in U+4E00-U+9FFF only.

    Returns:
        The number of matching characters, or -1 after printing a message
        when the mode is unknown or the file cannot be read or decoded.

    The file is read in text mode, so each line ending is counted as a
    single character.
    """
    patterns = {
        'all': r'.',
        'content': r'[^\s]',
        'cjk': r'[\u4E00-\u9FFF\u3000-\u303F\uFF00-\uFFEF]',
        'strict': r'[\u4E00-\u9FFF]'
    }

    # Validate the mode before touching the file; previously an unknown mode
    # went unnoticed (result 0) whenever the file had no lines.
    mode_key = mode.lower() if isinstance(mode, str) else None
    if mode_key not in patterns:
        print(f"发生错误:不支持的统计模式 {mode!r}")
        return -1

    # Compile once, outside the line loop. DOTALL makes '.' match '\n' as
    # well, so 'all' really counts newlines; the flag does not affect the
    # character-class patterns.
    matcher = re.compile(patterns[mode_key], re.DOTALL)

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return sum(len(matcher.findall(line)) for line in f)
    except FileNotFoundError:
        print(f"错误:文件 {file_path} 不存在")
        return -1
    except (OSError, UnicodeDecodeError) as e:
        print(f"发生错误:{str(e)}")
        return -1
# Usage example
if __name__ == "__main__":
    file_path = "test_text.txt"  # path of the file to analyse

    # Show basic file information. A missing file is reported with the same
    # message count_file_words() prints, instead of crashing with a traceback.
    try:
        file_size = os.path.getsize(file_path) / 1024 / 1024
    except FileNotFoundError:
        print(f"错误:文件 {file_path} 不存在")
        raise SystemExit(1) from None
    print(f"文件信息:{file_path}")
    print(f"文件大小:{file_size:.2f} MB\n")

    # Run every counting mode in turn
    modes = [
        ('all', "总字符数(包含所有字符)"),
        ('content', "内容字符数(排除空白符)"),
        ('cjk', "中日韩文字字符(含中文标点)"),
        ('strict', "严格中文汉字数")
    ]
    for mode, description in modes:
        word_count = count_file_words(file_path, mode)
        # A negative result means the count failed and was already reported.
        if word_count >= 0:
            print(f"{description}:{word_count:,} 字")
运行结果:
文件信息:test_text.txt
文件大小:100.90 MB
总字符数(包含所有字符):36,205,436 字
内容字符数(排除空白符):33,333,241 字
中日韩文字字符(含中文标点):33,853,407 字
严格中文汉字数:28,421,911 字
892

被折叠的 条评论
为什么被折叠?



