>>> from collections import defaultdict
>>> import glob
# pos maps a position (int) to a dict of {sample number: letter}.
pos = defaultdict(dict)
# glob returns matches in arbitrary order; sorting them makes the sample
# number (the 1-based rank of the file name) the same on every run.
# Note the sort is lexicographic, so 'sample10' comes before 'sample2'.
for index, infile in enumerate(
        sorted(glob.glob('D:\\DATA\\FP12210\\My Documents\\Temp\\Python\\sample*.vcf')), 1):
    # 'with' closes the file even when a line fails to parse.
    with open(infile) as vcf:
        for line in vcf:
            fields = line.split()
            # Skip blank lines and '#'-prefixed lines (VCF meta/header
            # lines), which do not carry a position/letter record.
            if not fields or fields[0].startswith('#'):
                continue
            # Second field is the position (converted to int right away),
            # fourth field is the letter stored for this sample.
            val, letter = int(fields[1]), fields[3]
            pos[val][index] = letter
def print_pos(pos):
    """Print pos as a table: one column per position, one row per sample.

    pos maps an integer position to a dict of {sample number: letter}.
    The header lists every integer from the smallest to the largest key of
    pos. A sample that has no letter at a position shows 'NaN' there.
    Each row is printed on its own line; nothing is printed if pos is empty.
    """
    if not pos:
        # No keys means there is no first/last position to build a range from.
        return

    values = sorted(pos.keys())
    columns = range(values[0], values[-1] + 1)
    # One width for every cell so header and rows line up; at least 5
    # characters, wider only when a position needs more digits.
    width = max(5, len(str(values[0])), len(str(values[-1])))

    # pos has keys according to position; create pos2 with keys = sample #,
    # so each output row can be read from a single dict.
    pos2 = defaultdict(dict)
    for val, d in pos.items():
        for index, letter in d.items():
            pos2[index][val] = letter

    label_fmt = ' sample{0:2} '
    # Blank header cell as wide as a row label, keeping columns aligned.
    header = [' ' * len(label_fmt.format(0))]
    header.extend('{0:{1}}'.format(val, width) for val in columns)
    # A single-argument print(...) behaves the same on Python 2 and 3 and
    # ends the line, which the trailing-comma prints never did.
    print(' '.join(header))

    for index in sorted(pos2):
        row = [label_fmt.format(index)]
        for val in columns:
            # Centre the letter (or the 'NaN' filler) in the cell.
            row.append('{0:^{1}}'.format(pos2[index].get(val, 'NaN'), width))
        print(' '.join(row))
>>> print_pos(pos)
2025 2026 2027 2028 2029 2030 2031 2032
sample 1 A NaN C T NaN NaN NaN NaN
sample 2 G A NaN NaN NaN NaN NaN T
>>>
我用 pos 收集数据,同时用 pos2 保存同样的数据,原因如下:
- pos 以位置值为键,便于确定位置值的范围(用于打印表头);
- pos2 以样本编号为键,便于按样本逐行取出各位置的值。
为了避免范围过大,我使用了以下值:
-sample1.vcf:
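1 2025 blah A . blah PASS AC=0 GT:DP 0/0:61
2 2027 blah C . blah PASS AC=0 GT:DP 0/0:61
3 2028 blah T . blah PASS AC=0 GT:DP 0/0:61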
-sample2.vcf:
1 2025 blah G . blah PASS AC=0 GT:DP 0/0:61
2 2026 blah A . blah blah AC=0 GT:DP 0/0:61
3 2032 blah T . blah PASS AC=0 GT:DP 0/0:61