import os
import random
def collect_code_files(root_dir, extensions):
    """Collect the paths of all files under ``root_dir`` with a matching suffix.

    Args:
        root_dir: Directory that is walked recursively with ``os.walk``.
        extensions: A single suffix string (e.g. ``'.py'``) or an iterable of
            suffix strings. Matching is case-sensitive.

    Returns:
        A list with ``os.path.join(subdir, name)`` for every file whose name
        ends with one of the suffixes, in the order ``os.walk`` yields them.
    """
    # A bare string would otherwise be iterated character by character and
    # match any file ending in one of its letters.
    if isinstance(extensions, str):
        extensions = (extensions,)
    # Materialise once: str.endswith accepts a tuple of suffixes, and a
    # one-shot iterable (e.g. a generator) must survive past the first file.
    suffixes = tuple(extensions)

    code_files = []
    for subdir, _, files in os.walk(root_dir):
        code_files.extend(
            os.path.join(subdir, filename)
            for filename in files
            if filename.endswith(suffixes)
        )
    return code_files
def read_all_lines(code_files):
    """Return the lines of every file in ``code_files`` as one flat list.

    Files are read in the order given, decoded as UTF-8 with undecodable
    bytes dropped (``errors='ignore'``). Each entry is a line exactly as the
    text-mode file object yields it, so line endings are kept and the final
    line of a file may lack one.
    """
    collected = []
    for path in code_files:
        with open(path, 'r', encoding='utf-8', errors='ignore') as handle:
            # Iterating a text file yields the same items as readlines().
            collected += list(handle)
    return collected
def get_random_lines(all_lines, num_lines):
    """Return ``num_lines`` entries sampled from ``all_lines`` without replacement.

    Raises:
        ValueError: Propagated from ``random.sample`` when ``num_lines`` is
            negative or larger than ``len(all_lines)``.
    """
    return random.sample(all_lines, num_lines)


def main(
    root_dir='path/to/your/project',
    extensions=('.java', '.py', '.cpp', '.c', '.cs', '.h'),
    num_lines=3500,
    output_path='random_3500_lines.txt',
):
    """Sample random source lines from a project and write them to a file.

    Args:
        root_dir: Root directory of the project to scan.
        extensions: File suffixes that identify code files.
        num_lines: Number of lines to sample.
        output_path: File the sampled lines are written to (UTF-8).

    If the project holds fewer than ``num_lines`` lines, a message is printed
    and nothing is written.
    """
    # Collect all code files in the project
    code_files = collect_code_files(root_dir, extensions)

    # Read all lines from the code files
    all_lines = read_all_lines(code_files)

    # Check if we have enough lines
    if len(all_lines) < num_lines:
        print(f"Not enough lines in the project. Only found {len(all_lines)} lines.")
        return

    random_lines = get_random_lines(all_lines, num_lines)

    # The last line of a file may have no trailing newline; terminate it so
    # it is not fused with whichever sampled line happens to follow it.
    with open(output_path, 'w', encoding='utf-8') as output_file:
        output_file.writelines(
            line if line.endswith('\n') else line + '\n'
            for line in random_lines
        )
    print(f"{num_lines} random lines have been written to {output_path}")


if __name__ == "__main__":
    main()
随机从项目中取3500行代码, 尽量连续, 连续不超过100行
import os
import random
def collect_code_files(root_dir, extensions):
    """Collect the paths of all files under ``root_dir`` with a matching suffix.

    Args:
        root_dir: Directory that is walked recursively with ``os.walk``.
        extensions: A single suffix string (e.g. ``'.py'``) or an iterable of
            suffix strings. Matching is case-sensitive.

    Returns:
        A list with ``os.path.join(subdir, name)`` for every file whose name
        ends with one of the suffixes, in the order ``os.walk`` yields them.
    """
    # A bare string would otherwise be iterated character by character and
    # match any file ending in one of its letters.
    if isinstance(extensions, str):
        extensions = (extensions,)
    # Materialise once: str.endswith accepts a tuple of suffixes, and a
    # one-shot iterable (e.g. a generator) must survive past the first file.
    suffixes = tuple(extensions)

    code_files = []
    for subdir, _, files in os.walk(root_dir):
        code_files.extend(
            os.path.join(subdir, filename)
            for filename in files
            if filename.endswith(suffixes)
        )
    return code_files
def read_and_chunk_file(file_path, max_chunk_size=100):
    """Read a file and split its lines into consecutive chunks.

    Args:
        file_path: Path of the file to read (decoded as UTF-8, undecodable
            bytes are dropped).
        max_chunk_size: Maximum number of lines per chunk; must be at least 1.

    Returns:
        A list of chunks, each a list of consecutive lines in file order.
        Every chunk holds ``max_chunk_size`` lines except possibly the last;
        an empty file yields an empty list.

    Raises:
        ValueError: If ``max_chunk_size`` is less than 1.
    """
    # A zero step makes range() fail with an unrelated message and a negative
    # step silently produces no chunks at all, so reject both up front.
    if max_chunk_size < 1:
        raise ValueError(
            f"max_chunk_size must be at least 1, got {max_chunk_size}"
        )

    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        lines = f.readlines()

    return [
        lines[start:start + max_chunk_size]
        for start in range(0, len(lines), max_chunk_size)
    ]
def main(
    root_dir='path/to/your/project',
    extensions=('.java', '.py', '.cpp', '.c', '.cs', '.h'),
    max_chunk_size=100,
    total_lines_needed=3500,
    output_path='random_3500_lines.txt',
):
    """Write randomly chosen runs of consecutive source lines to a file.

    Every code file is split into chunks of at most ``max_chunk_size``
    consecutive lines, the chunks are shuffled, and chunks are taken until
    ``total_lines_needed`` lines are gathered (the last chunk is truncated
    if it would overshoot).

    Args:
        root_dir: Root directory of the project to scan.
        extensions: File suffixes that identify code files.
        max_chunk_size: Upper bound on the length of one consecutive run.
        total_lines_needed: Number of lines to write.
        output_path: File the selected lines are written to (UTF-8).

    If the project holds fewer lines than requested, everything available is
    written and the shortfall is reported.
    """
    # Collect all code files in the project
    code_files = collect_code_files(root_dir, extensions)

    # Read and chunk all code files
    all_chunks = []
    for path in code_files:
        all_chunks.extend(read_and_chunk_file(path, max_chunk_size))

    # Shuffle all chunks to ensure randomness
    random.shuffle(all_chunks)

    # Select chunks until we have enough lines
    selected_lines = []
    for chunk in all_chunks:
        remaining = total_lines_needed - len(selected_lines)
        # Stop before slicing: a non-positive bound must select nothing,
        # whereas chunk[:negative] would still return lines.
        if remaining <= 0:
            break
        selected_lines.extend(chunk[:remaining])

    # A chunk that ends a file may lack a trailing newline; terminate it so
    # it is not fused with the first line of the following chunk.
    with open(output_path, 'w', encoding='utf-8') as output_file:
        output_file.writelines(
            line if line.endswith('\n') else line + '\n'
            for line in selected_lines
        )

    # Report what was actually written rather than what was requested.
    print(f"{len(selected_lines)} random lines have been written to {output_path}")
    if len(selected_lines) < total_lines_needed:
        print(
            f"Not enough lines in the project. Only found {len(selected_lines)} lines."
        )


if __name__ == "__main__":
    main()