import heapq
import multiprocessing
from collections import Counter, OrderedDict
from itertools import islice
def analyze_frequency(data):
    """Count how often each symbol occurs in ``data``.

    Args:
        data: Iterable of hashable symbols (for example a string).

    Returns:
        A list of ``(symbol, count)`` tuples ordered by descending count.
        Symbols with equal counts keep the order in which they were first
        seen in ``data``. An empty input yields an empty list.
    """
    # iter() stops Counter from treating a mapping argument as ready-made
    # counts; every input is tallied element by element.
    return Counter(iter(data)).most_common()
def build_huffman_tree(frequency):
    """Derive Huffman code words from symbol weights.

    Args:
        frequency: Iterable of ``(symbol, weight)`` pairs. Symbols must be
            mutually comparable, because equal weights are ordered by
            comparing the symbols themselves.

    Returns:
        A list of ``[symbol, code]`` lists, where ``code`` is a string of
        ``'0'``/``'1'`` characters, sorted by code length and then by the
        pair itself. An empty ``frequency`` yields an empty list. A single
        symbol receives the empty code ``""`` because no merge takes place.
    """
    # Each heap entry has the shape [total_weight, [symbol, code], ...].
    heap = [[weight, [symbol, ""]] for symbol, weight in frequency]
    if not heap:
        # Nothing to encode; popping from an empty heap would raise.
        return []

    heapq.heapify(heap)
    while len(heap) > 1:
        lo = heapq.heappop(heap)
        hi = heapq.heappop(heap)
        # Merging two subtrees pushes every contained symbol one level
        # deeper: the lighter subtree takes branch '0', the heavier '1'.
        for pair in lo[1:]:
            pair[1] = "0" + pair[1]
        for pair in hi[1:]:
            pair[1] = "1" + pair[1]
        heapq.heappush(heap, [lo[0] + hi[0]] + lo[1:] + hi[1:])

    root = heap[0]
    return sorted(root[1:], key=lambda pair: (len(pair[-1]), pair))
def find_patterns(data, k):
    """Find the length-``k`` substrings that occur at least twice in ``data``.

    Occurrences are counted over every window position, so overlapping
    occurrences are counted too.

    Args:
        data: Sliceable sequence whose slices are hashable and sortable
            (for example a string).
        k: Window length; must be at least 1.

    Returns:
        An ``OrderedDict`` mapping each repeated pattern to its index in the
        sorted list of repeated patterns. It is empty when ``data`` is
        shorter than ``k`` or nothing repeats.

    Raises:
        ValueError: If ``k`` is smaller than 1. A zero or negative window
            length does not describe a pattern and would otherwise produce
            empty or arbitrary slices.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")

    window_counts = Counter(
        data[start:start + k] for start in range(len(data) - k + 1)
    )
    repeated = sorted(
        pattern for pattern, count in window_counts.items() if count >= 2
    )
    return OrderedDict((pattern, index) for index, pattern in enumerate(repeated))
def parallel_compress(data, k):
    """Compress ``data`` with Huffman-coded literals and pattern pointers.

    The output is a string of ``'0'``/``'1'`` characters made of records:

    * ``'1'`` followed by an 8-bit index into the repeated-pattern table,
      standing for ``k`` input symbols;
    * ``'0'`` followed by the Huffman code of a single symbol.

    Despite its name, this function runs sequentially.

    Args:
        data: The text to compress.
        k: Length of the repeated patterns to substitute; must be at least 1.

    Returns:
        A tuple ``(bitstring, encoding_table, frequent_patterns)`` where
        ``encoding_table`` maps each symbol to its Huffman code and
        ``frequent_patterns`` maps each repeated pattern to its index.

    Raises:
        ValueError: If ``k`` is smaller than 1 (a zero-length pattern would
            never advance through the input).
    """
    if k < 1:
        raise ValueError("k must be a positive integer")
    if not data:
        # No symbols means no code table can be built; the empty stream
        # decompresses back to empty input.
        return "", {}, OrderedDict()

    pointer_bits = 8
    # Only indexes that fit the fixed-width pointer field can be emitted;
    # a wider value would desynchronise a reader of 9-character records.
    pointer_limit = 1 << pointer_bits

    frequency = analyze_frequency(data)
    huffman_codes = build_huffman_tree(frequency)
    encoding_table = {sym: code for sym, code in huffman_codes}
    frequent_patterns = find_patterns(data, k)

    compressed = []
    total = len(data)
    i = 0
    while i < total:
        window = data[i:i + k]
        pointer_index = None
        if len(window) == k:
            pointer_index = frequent_patterns.get(window)

        if pointer_index is not None and pointer_index < pointer_limit:
            compressed.append("1" + format(pointer_index, "0{}b".format(pointer_bits)))
            i += k
        else:
            # Not a repeated pattern, or its index does not fit the pointer
            # field: fall back to a single Huffman-coded literal.
            compressed.append("0" + encoding_table[data[i]])
            i += 1

    return "".join(compressed), encoding_table, frequent_patterns
def parallel_decompress(compressed_data, encoding_table, frequent_patterns):
    """Rebuild the original text from a compressed bit string.

    The stream is read record by record: a leading ``'1'`` introduces an
    8-bit index into the pattern table, any other leading character
    introduces one Huffman-coded symbol. Despite its name, this function
    runs sequentially.

    Args:
        compressed_data: String of ``'0'``/``'1'`` characters.
        encoding_table: Mapping of symbol to Huffman code.
        frequent_patterns: Mapping whose keys, in iteration order, are the
            patterns addressed by pointer records.

    Returns:
        The decompressed string.

    Raises:
        ValueError: If a pointer record is cut short, contains characters
            that are not binary digits, or a literal record does not match
            any Huffman code.
        IndexError: If a pointer refers to a pattern index that does not
            exist in ``frequent_patterns``.
    """
    pointer_bits = 8
    reverse_encoding = {code: symbol for symbol, code in encoding_table.items()}
    # No valid code is longer than this, so a literal scan can stop early.
    longest_code = max(map(len, reverse_encoding), default=0)
    pattern_list = list(frequent_patterns)

    decompressed = []
    total = len(compressed_data)
    i = 0
    while i < total:
        if compressed_data[i] == "1":
            field = compressed_data[i + 1:i + 1 + pointer_bits]
            if len(field) < pointer_bits:
                # A short field would otherwise be parsed as a smaller
                # number and silently decode to the wrong pattern.
                raise ValueError("Truncated pattern pointer in compressed data")
            decompressed.append(pattern_list[int(field, 2)])
            i += 1 + pointer_bits
        else:
            start = i + 1
            end = start
            # Grow the candidate code one bit at a time until it matches.
            # It starts empty because a one-symbol table uses the empty code.
            while compressed_data[start:end] not in reverse_encoding:
                if end >= total or end - start >= longest_code:
                    raise ValueError("Invalid Huffman code in compressed data")
                end += 1
            decompressed.append(reverse_encoding[compressed_data[start:end]])
            i = end

    return "".join(decompressed)
# Test case: round-trip a short sample string through compression and
# decompression, printing the intermediate bit string and the final result.
data = "abracadabra"
# k=4 is the pattern length searched for in the sample input.
compressed_data, encoding_table, frequent_patterns = parallel_compress(data, k=4)
print("Compressed data:", compressed_data)
# Decompression needs the same code table and pattern table that
# compression returned.
decompressed_data = parallel_decompress(compressed_data, encoding_table, frequent_patterns)
print("Decompressed data:", decompressed_data)
# Prints True when the round trip reproduced the input exactly.
print("Original matches decompressed:", data == decompressed_data)