细胞测序/质控+基因组比对+合并日志文件+featureCounts（笔记）

最新推荐文章于 2024-08-19 17:07:48 发布

路颜可

最新推荐文章于 2024-08-19 17:07:48 发布

阅读量223

点赞数

本文链接：https://blog.csdn.net/qq_55905518/article/details/129853682

版权

该文描述了对CLL细胞的测序数据进行质控、比对和计数的过程。使用fastp进行FastQ格式的数据质控，STAR进行基因组比对，featureCounts计算基因表达量，并用Python处理和合并日志文件，创建Excel报告。

摘要由CSDN通过智能技术生成

使用fastp对FASTQ数据进行质控

CLL细胞质控(putty)

# Quality-control every paired-end CLL sample (S5..S48) with fastp.
# Each option is on a backslash-continued line so the whole invocation is a
# single command; without the continuations the --html/--json report paths
# would be executed as separate commands.
# The cleaned reads are written to CLL_deal_seq, the directory the STAR
# alignment step reads from.
# The input globs are left unquoted on purpose so the shell expands them.
for i in {5..48}; do
    ./fastp \
        --in1 /home/xxxxxxx/tools/fastp/CLL_cell_combine_fastp_file_format/*S${i}_R1_001.fastq* \
        --in2 /home/xxxxxxx/tools/fastp/CLL_cell_combine_fastp_file_format/*S${i}_R2_001.fastq* \
        --out1 /home/xxxxxxx/tools/fastp/CLL_deal_seq/S${i}_R1.fastq.gz \
        --out2 /home/xxxxxxx/tools/fastp/CLL_deal_seq/S${i}_R2.fastq.gz \
        --html /home/xxxxxxx/tools/fastp/html/S${i}.html \
        --json /home/xxxxxxx/tools/fastp/json/S${i}.json
done

STAR进行基因组比对

CLL比对(putty)

# Align the fastp-cleaned read pairs of samples S5..S48 against the hg19
# STAR index, producing one coordinate-sorted BAM per sample.
for i in {5..48}
do
    /work/tools/STAR/STAR-2.7.10a/bin/Linux_x86_64_static/STAR \
        --genomeDir /work/genomes/UCSC/hg19 \
        --runThreadN 15 \
        --readFilesCommand zcat \
        --readFilesIn \
            "/home/xxxxxxx/tools/fastp/CLL_deal_seq/S${i}_R1.fastq.gz" \
            "/home/xxxxxxx/tools/fastp/CLL_deal_seq/S${i}_R2.fastq.gz" \
        --outSAMtype BAM SortedByCoordinate \
        --outFileNamePrefix "/home/xxxxxxx/tools/fastp/CLL_comparison/CLL-cell_S${i}"
done

合并Log.final.out日志文件

CLL_cell日志文件的合并(python)

import re
import sys
import os
import pandas as pd
from matplotlib import pyplot as plt
import math
import xlwt
import xlsxwriter
import xlrd
# Example input: file="路径\CLL-cell_S5Log.final.out"
######### Create the Excel workbook
# Module-level state: `new_excel` / `new_sheet` are the workbook and sheet
# that read_log() and FindFile() write into.
plt.rcParams['font.sans-serif']=['SimHei']# font setting so Chinese labels render correctly
plt.rcParams['axes.unicode_minus']=False# so the minus sign renders correctly
#path = '路径\data.xls'
new_excel = xlwt.Workbook(encoding='utf-8', style_compression=0) # create a new workbook
new_sheet = new_excel.add_sheet('sheet') # add a worksheet named 'sheet'
# Write the workbook to disk once up front, before any cell has been written.
new_excel.save("/home/luyanke/tools/fastp/CLL_comparison/sum_excel.xls")

########写入内容

def read_log(file):
    """Write the header row of the summary sheet from one ``Log.final.out``.

    Cell (0, 0) is labelled ``cell_name``.  Every following column is named
    after the text to the left of the ``|`` separator on each
    ``name | value`` line of *file*, in file order.  Lines without a ``|``
    are ignored.  Each header column is given a width of ``500 * 20``.

    The headers are written to the module-level ``new_sheet`` and the
    module-level ``new_excel`` workbook is saved once afterwards.

    Args:
        file: Path of the log file whose field names define the columns.

    Raises:
        OSError: If *file* cannot be opened.
    """
    headers = ["cell_name"]
    with open(file) as f:
        for raw in f:
            name, sep, _value = raw.rstrip("\n").partition("|")
            if sep:
                # Field names are padded with whitespace in the log.
                headers.append(name.strip())

    for col, title in enumerate(headers):
        new_sheet.write(0, col, title)
        new_sheet.col(col).width = 500 * 20

    # Save once, after all headers are in place; nothing needs to be read
    # back from the file while the sheet is being filled.
    # NOTE(review): '路径' ("path") looks like a placeholder - confirm the
    # real output location.
    new_excel.save('/路径/CLL_comparison/sum_excel.xls')

def FindFile(path):
    """Add one data row per ``*.final.out`` log found directly in *path*.

    For every regular file in *path* whose name ends in ``.final.out`` a row
    is written to the module-level ``new_sheet`` (rows start at 1; row 0 is
    left for the header): column 0 holds the file name, and each following
    column holds the text to the right of the first ``|`` on each
    ``name | value`` line of that file, in file order, with surrounding
    whitespace (including the tab that follows ``|``) removed.

    Files are processed in sorted name order so the row order is
    reproducible.  Subdirectories are not searched.  The module-level
    ``new_excel`` workbook is saved once at the end, and the number of log
    files processed is printed.

    Args:
        path: Directory containing the log files.

    Raises:
        OSError: If *path* cannot be listed or a log file cannot be opened.
    """
    # Dots are escaped and the pattern is anchored to the file name, so only
    # names that really end in ".final.out" are accepted.
    log_name = re.compile(r'\w+\.final\.out$')

    row = 1
    for ipath in sorted(os.listdir(path)):
        fulldir = os.path.join(path, ipath)
        if not (log_name.search(ipath) and os.path.isfile(fulldir)):
            continue

        values = [ipath]
        with open(fulldir) as f:
            for raw in f:
                _name, sep, value = raw.rstrip("\n").partition("|")
                if sep:
                    values.append(value.strip())

        for col, value in enumerate(values):
            new_sheet.write(row, col, value)
        row += 1

    line_count = row - 1
    # NOTE(review): '路径' ("path") looks like a placeholder - confirm the
    # real output location.
    new_excel.save('路径/CLL_comparison/sum_excel.xls')
    print(line_count)
####################
####################
#################### Write data
# The string literal below holds a disabled draft of a per-file writer
# (write_log). It is a bare expression statement, so none of it is executed.
'''
def write_log(file1):
j = 1
k = 0
with open(file1) as f:
"""使用while循环每次只读取一行,读到最后一行的时候结束"""
while True:
lines = f.readline()
if not lines:
break
line = lines.split("\n")
# print(line[0])#Started job on | Dec 08 17:33:35
if line[0].find('|') != -1:
column_value = line[0].split("|")[1]
# 记得赋值
column_value = column_value.strip(" ")
# 读取Excel
#print(column_value)
df = pd.read_excel("路径\log\Excel表.xls")
# 定义新列
# 第0行
new_sheet.write(j, k, column_value)
#value_col = new_sheet.col(k) # xlwt中是行和列都是从0开始计算的
k = k + 1

new_excel.save('路径/CLL_comparison\sum_Excel.xls')
'''

# df.to_excel('路径\log\Excel表.xls')

if __name__ == '__main__':
    # The header row is taken from one representative log; the data rows
    # come from every log found in the comparison directory.
    header_source = "/路径/CLL_comparison/CLL-cell_S5Log.final.out"
    logs_dir = "路径/CLL_comparison"

    read_log(header_source)
    FindFile(logs_dir)

featureCounts(putty)

# Summarise reads per gene with featureCounts: exon features (-t) grouped by
# gene_id (-g) from the GENCODE v41lift37 annotation, 16 threads (-T), one
# count table written to count.txt.
# The BAM glob is relative, so run this from the directory that holds the
# *.sortedByCoord.out.bam files.
# NOTE(review): the reads are paired-end (R1/R2 in the fastp and STAR steps);
# confirm whether -p (count fragments instead of reads) is wanted here.
/work/tools/featureCounts/subread-2.0.1-Linux-x86_64/bin/featureCounts -T 16 \
-t exon \
-g gene_id \
-a /work/genomes/UCSC/hg19/gencode.v41lift37.annotation.gtf \
-o /home/xxxxxxx/tools/fastp/all_cell_counts/count.txt \
*.sortedByCoord.out.bam