关闭

生物信息学常见的数据下载,包括基因组,gtf,bed,注释

1400人阅读 评论(0) 收藏 举报
# Download the UCSC hg19 per-chromosome FASTA archive and merge it into a
# single hg19.fa under ~/reference/genome/hg19.
#
# The download runs in the foreground (still under nohup, so a hangup does not
# kill it). Previously wget was started with `&` and tar ran immediately, so
# tar saw a missing or partially written archive. Steps are chained with &&
# so a failed step stops everything that depends on it.
# Only chr*.fa is merged, so a leftover hg19.fa from an earlier run is never
# fed back into itself.
mkdir -p ~/reference/genome/hg19 && cd ~/reference/genome/hg19 &&
  nohup wget -c http://hgdownload.cse.ucsc.edu/goldenPath/hg19/bigZips/chromFa.tar.gz &&
  tar -xzvf chromFa.tar.gz &&
  cat chr*.fa > hg19.fa &&
  rm chr*.fa
 
 
# Download the UCSC hg38 genome into ~/reference/genome/hg38 and decompress it.
#
# The index-building commands further down read genome/hg38/hg38.fa, but only
# hg38.fa.gz was ever fetched. The download therefore runs in the foreground
# and is followed by decompression. `gunzip -c` keeps the original .gz next to
# the uncompressed FASTA.
mkdir -p ~/reference/genome/hg38 && cd ~/reference/genome/hg38 &&
  nohup wget -c http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz &&
  gunzip -c hg38.fa.gz > hg38.fa
 
# Download the UCSC mm10 per-chromosome FASTA archive and merge it into a
# single mm10.fa under ~/reference/genome/mm10.
#
# The download runs in the foreground (still under nohup). Previously wget was
# backgrounded with `&` and tar ran immediately against an archive that was
# not there yet. Steps are chained with && so a failure stops the dependent
# steps. Only chr*.fa is merged, so an existing mm10.fa is never used as its
# own input.
mkdir -p ~/reference/genome/mm10 && cd ~/reference/genome/mm10 &&
  nohup wget -c http://hgdownload.cse.ucsc.edu/goldenPath/mm10/bigZips/chromFa.tar.gz &&
  tar -xzvf chromFa.tar.gz &&
  cat chr*.fa > mm10.fa &&
  rm chr*.fa
 
 
# Fetch the RNA-SeQC data files hosted by the Broad Institute into the
# RNA-SeQC directory, one wget call per file.
cd ~/biosoft/RNA-SeQC
rnaseqc_url=http://www.broadinstitute.org/cancer/cga/sites/default/files/data/tools/rnaseqc
for rnaseqc_file in \
  ThousandReads.bam \
  gencode.v7.annotation_goodContig.gtf.gz \
  Homo_sapiens_assembly19.fasta.gz \
  Homo_sapiens_assembly19.other.tar.gz \
  gencode.v7.gc.txt \
  rRNA.tar.gz; do
  wget "${rnaseqc_url}/${rnaseqc_file}"
done
 
# Build bowtie2 indexes for hg19, hg38 and mm10. Each build is launched as a
# separate nohup background job, timed, with stdout and stderr captured in
# <build>.bowtie_index.log inside ~/reference/index/bowtie.
cd ~/reference
mkdir -p index/bowtie && cd index/bowtie
for build in hg19 hg38 mm10; do
  nohup time ~/biosoft/bowtie/bowtie2-2.2.9/bowtie2-build \
    ~/reference/genome/"${build}"/"${build}".fa \
    ~/reference/index/bowtie/"${build}" \
    1>"${build}.bowtie_index.log" 2>&1 &
done
  
# Build bwa indexes (bwtsw algorithm) for hg19, hg38 and mm10. Each build is
# launched as a separate nohup background job, timed, with stdout and stderr
# captured in <build>.bwa_index.log inside ~/reference/index/bwa.
cd ~/reference
mkdir -p index/bwa && cd index/bwa
for build in hg19 hg38 mm10; do
  nohup time ~/biosoft/bwa/bwa-0.7.15/bwa index -a bwtsw \
    -p ~/reference/index/bwa/"${build}" \
    ~/reference/genome/"${build}"/"${build}".fa \
    1>"${build}.bwa_index.log" 2>&1 &
done
  
# Download the prebuilt HISAT2 index archives (hg19, hg38, grcm38, mm10) into
# ~/reference/index/hisat and unpack each one.
#
# Each archive is downloaded in the foreground (under nohup) and extracted
# only after its own download succeeded. Previously all four downloads were
# backgrounded with `&` and the tar commands ran immediately, before any
# archive existed on disk.
mkdir -p ~/reference/index/hisat && cd ~/reference/index/hisat &&
  for hisat_build in hg19 hg38 grcm38 mm10; do
    nohup wget -c "ftp://ftp.ccb.jhu.edu/pub/infphilo/hisat2/data/${hisat_build}.tar.gz" &&
      tar -zxvf "${hisat_build}.tar.gz"
  done
  
  
# ExAC release files: the sites VCF (fetched in the background), its tabix
# index, and two CNV files.
## Project site:   http://exac.broadinstitute.org/
## Release folder: ftp://ftp.broadinstitute.org/pub/ExAC_release/current
exac_dir=~/annotation/variation/human/ExAC
mkdir -p "$exac_dir"
cd "$exac_dir"
exac_ftp=ftp://ftp.broadinstitute.org/pub/ExAC_release/current
wget "${exac_ftp}/ExAC.r0.3.1.sites.vep.vcf.gz.tbi"
nohup wget "${exac_ftp}/ExAC.r0.3.1.sites.vep.vcf.gz" &
for exac_cnv in \
  exac-final-cnv.gene.scores071316 \
  exac-final.autosome-1pct-sq60-qc-prot-coding.cnv.bed; do
  wget "${exac_ftp}/cnv/${exac_cnv}"
done
 
 
# dbSNP build 147 on GRCh37p13: the full VCF (fetched in the background) and
# its tabix index.
## Project site: https://www.ncbi.nlm.nih.gov/projects/SNP/
## GRCh38 folder: ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606_b147_GRCh38p2/
## GRCh37 folder: ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606_b147_GRCh37p13/
dbsnp_dir=~/annotation/variation/human/dbSNP
mkdir -p "$dbsnp_dir"
cd "$dbsnp_dir"
dbsnp_vcf=ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606_b147_GRCh37p13/VCF/All_20160601.vcf.gz
nohup wget "${dbsnp_vcf}" &
wget "${dbsnp_vcf}.tbi"
 
 
# Mirror the 1000 Genomes 20130502 release folder into a flat directory,
# running in the background. Long option names are used for readability.
## ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/
kg_dir=~/annotation/variation/human/1000genomes
mkdir -p "$kg_dir"
cd "$kg_dir"
nohup wget --continue --recursive --no-directories --no-parent \
  --convert-links --relative --page-requisites \
  ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502 &
 
# COSMIC: only the directory is prepared here.
## Registration is required before the COSMIC file can be downloaded.
cosmic_dir=~/annotation/variation/human/cosmic
mkdir -p "$cosmic_dir"
cd "$cosmic_dir"

# ESP6500: bulk VCF archive, fetched in the background.
# http://evs.gs.washington.edu/EVS/
esp_dir=~/annotation/variation/human/ESP6500
mkdir -p "$esp_dir"
cd "$esp_dir"
nohup wget http://evs.gs.washington.edu/evs_bulk_data/ESP6500SI-V2-SSA137.GRCh38-liftover.snps_indels.vcf.tar.gz &
 
# UK10K cohort sites VCF, fetched in the background.
# http://www.uk10k.org/
uk10k_dir=~/annotation/variation/human/UK10K
mkdir -p "$uk10k_dir"
cd "$uk10k_dir"
nohup wget ftp://ngs.sanger.ac.uk/production/uk10k/UK10K_COHORT/REL-2012-06-02/UK10K_COHORT.20160215.sites.vcf.gz &
 
# Mirror the GoNL release5 variants folder into a flat directory, running in
# the background. Long option names are used for readability.
## http://www.nlgenome.nl/search/
## https://molgenis26.target.rug.nl/downloads/gonl_public/variants/release5/
gonl_dir=~/annotation/variation/human/gonl
mkdir -p "$gonl_dir"
cd "$gonl_dir"
nohup wget --continue --recursive --no-directories --no-parent \
  --convert-links --relative --page-requisites \
  https://molgenis26.target.rug.nl/downloads/gonl_public/variants/release5 &
 
# Placeholder directories: nothing is downloaded for these two resources.
# Each directory is created and entered in turn, so the shell ends up in GWAS.
for placeholder_db in omin GWAS; do
  mkdir -p ~/annotation/variation/human/"${placeholder_db}"
  cd ~/annotation/variation/human/"${placeholder_db}"
done
 
# HapMap phase 3: the relationships table, a background recursive fetch of the
# .gz files under hapmap3_reformatted, and two BCM ENCODE3 files.
hapmap_dir=~/annotation/variation/human/hapmap
mkdir -p "$hapmap_dir"
cd "$hapmap_dir"
# ftp://ftp.ncbi.nlm.nih.gov/hapmap/
hapmap_phase3=ftp://ftp.ncbi.nlm.nih.gov/hapmap/phase_3
wget "${hapmap_phase3}/relationships_w_pops_051208.txt"
nohup wget --continue --recursive --no-parent --convert-links --relative \
  --page-requisites --no-directories --accept=.gz \
  "${hapmap_phase3}/hapmap3_reformatted" &
# ftp://ftp.hgsc.bcm.tmc.edu/pub/data/HapMap3-ENCODE/ENCODE3/ENCODE3v1/
bcm_encode3=ftp://ftp.hgsc.bcm.tmc.edu/pub/data/HapMap3-ENCODE/ENCODE3/ENCODE3v1
for bcm_file in bcm-encode3-QC.txt bcm-encode3-submission.txt.gz; do
  wget "${bcm_encode3}/${bcm_file}"
done
 
 
 
 
## Reference links only (nothing is downloaded for this resource):
## 1 million single nucleotide polymorphisms (SNPs) for DNA samples from each
## of the three ethnic groups in Singapore – Chinese, Malays and Indians.
## Platforms: the Affymetrix Genome-Wide Human SNP Array 6.0 and the Illumina
## Human1M single BeadChip.
##   http://www.statgen.nus.edu.sg/~SGVP/
##   http://www.statgen.nus.edu.sg/~SGVP/singhap/files-website/samples-information.txt
##   http://www.statgen.nus.edu.sg/~SGVP/singhap/files-website/genotypes/2009-01-30/QC/

## Directories for the two Singapore sequencing projects. Only the directories
## are created; the VCF locations are listed for reference.
##   SSMP = Singapore Sequencing Malay Project
##     http://www.statgen.nus.edu.sg/~SSMP/
##     http://www.statgen.nus.edu.sg/~SSMP/download/vcf/2012_05
##   SSIP = Singapore Sequencing Indian Project
##     http://www.statgen.nus.edu.sg/~SSIP/
##     http://www.statgen.nus.edu.sg/~SSIP/download/vcf/dataFreeze_Feb2013
for sg_project in SSMP SSIP; do
  mkdir -p ~/annotation/variation/human/"${sg_project}"
  cd ~/annotation/variation/human/"${sg_project}"
done
 
 
 
# Ensembl human GTF annotations: release 75 (GRCh37) and release 86 (GRCh38).
# NOTE(review): no cd precedes these downloads, so both files are written to
# whatever the current directory is at this point — confirm that is intended.
ensembl_pub=ftp://ftp.ensembl.org/pub
wget "${ensembl_pub}/release-75/gtf/homo_sapiens/Homo_sapiens.GRCh37.75.gtf.gz"
wget "${ensembl_pub}/release-86/gtf/homo_sapiens/Homo_sapiens.GRCh38.86.chr.gtf.gz"
 
# GENCODE human release 25: the GRCh38 GTF files, followed by the files from
# the GRCh37_mapping folder (lifted annotation GTF plus metadata tables).
gencode_dir=~/reference/gtf/gencode
mkdir -p "$gencode_dir"
cd "$gencode_dir"
gencode_r25=ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_25
## https://www.gencodegenes.org/releases/current.html
for gencode_part in 2wayconspseudos long_noncoding_RNAs polyAs annotation; do
  wget "${gencode_r25}/gencode.v25.${gencode_part}.gtf.gz"
done
## https://www.gencodegenes.org/releases/25lift37.html
for lift37_part in \
  annotation.gtf.gz \
  metadata.HGNC.gz \
  metadata.EntrezGene.gz \
  metadata.RefSeq.gz; do
  wget "${gencode_r25}/GRCh37_mapping/gencode.v25lift37.${lift37_part}"
done
 
 
# Directory for Ensembl release 86 human annotation. Only the directory is
# created here; no download follows.
## http://asia.ensembl.org/info/data/ftp/index.html
ensembl86_dir=~/reference/gtf/ensembl/homo_sapiens_86
mkdir -p "$ensembl86_dir"
cd "$ensembl86_dir"
 
 
 
# Fetch the 1000 Genomes human_g1k_v37 reference FASTA, its .fai index and
# README into ~/reference/genome/human_g1k_v37, then build the Picard
# sequence dictionary.
# Source folder: http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/
#
# The FASTA download runs in the foreground (under nohup). Previously it was
# backgrounded with `&` and gunzip ran immediately on a file that was missing
# or still being written.
g1k_url=http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference
mkdir -p ~/reference/genome/human_g1k_v37 && cd ~/reference/genome/human_g1k_v37 && {
  nohup wget -c "${g1k_url}/human_g1k_v37.fasta.gz"
  gunzip human_g1k_v37.fasta.gz
  wget "${g1k_url}/human_g1k_v37.fasta.fai"
  wget "${g1k_url}/README.human_g1k_v37.fasta.txt"
  # gzip signals warnings with a non-zero exit status, so the dictionary step
  # is gated on the FASTA actually existing rather than on gunzip's status.
  if [ -s human_g1k_v37.fasta ]; then
    java -jar ~/biosoft/picardtools/picard-tools-1.119/CreateSequenceDictionary.jar R=human_g1k_v37.fasta O=human_g1k_v37.dict
  fi
}
 
# Download the GATK b37 resource bundle files and decompress the phase-1
# indels VCF together with its index.
## ftp://ftp.broadinstitute.org/bundle/b37/
#
# The directory that is created is now the same one that is entered.
# Previously `mkdir -p ~/annotation/GATK` was followed by
# `cd ~/annotation/variation/GATK`, so the cd failed on a fresh machine and
# every file was written into the previous working directory. The cd target
# is kept as the destination; the downloads only run if it can be entered.
gatk_dir=~/annotation/variation/GATK
mkdir -p "$gatk_dir" && cd "$gatk_dir" && {
  gatk_bundle=ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/b37
  for gatk_file in \
    1000G_phase1.snps.high_confidence.b37.vcf.gz \
    dbsnp_138.b37.vcf.gz \
    human_g1k_v37.fasta.gz \
    NA12878.HiSeq.WGS.bwa.cleaned.raw.subset.b37.sites.vcf.gz \
    Mills_and_1000G_gold_standard.indels.b37.vcf.gz \
    hapmap_3.3.b37.vcf.gz \
    1000G_phase1.indels.b37.vcf.gz \
    1000G_phase1.indels.b37.vcf.idx.gz; do
    wget "${gatk_bundle}/${gatk_file}"
  done
  gunzip 1000G_phase1.indels.b37.vcf.idx.gz
  gunzip 1000G_phase1.indels.b37.vcf.gz
}
  
  
# Ensembl release 87 chromosome-level GTF files for human, mouse and
# zebrafish, stored under ~/institute/ENSEMBL/gtf.
ensembl_gtf_dir=~/institute/ENSEMBL/gtf
mkdir -p "$ensembl_gtf_dir"
cd "$ensembl_gtf_dir"
ensembl_r87=ftp://ftp.ensembl.org/pub/release-87/gtf
wget "${ensembl_r87}/homo_sapiens/Homo_sapiens.GRCh38.87.chr.gtf.gz"
wget "${ensembl_r87}/mus_musculus/Mus_musculus.GRCm38.87.chr.gtf.gz"
wget "${ensembl_r87}/danio_rerio/Danio_rerio.GRCz10.87.chr.gtf.gz"
 
 
 
 
 
# Download four Firehose (stddata 2016_01_28) archives for the ACC cohort and
# save them under short local names: two SNP6 segmentation archives and two
# mutation-call archives.
## https://gdac.broadinstitute.org/
#
# The target directory is created before it is entered, as every other data
# directory in this script is. Previously a bare `cd` was used, which fails
# when ~/institute/TCGA/firehose does not exist yet, leaving the downloads in
# whatever directory was current. The downloads only run if the cd succeeded.
mkdir -p ~/institute/TCGA/firehose && cd ~/institute/TCGA/firehose && {
  firehose_acc=http://gdac.broadinstitute.org/runs/stddata__2016_01_28/data/ACC/20160128
  wget "${firehose_acc}/gdac.broadinstitute.org_ACC.Merge_snp__genome_wide_snp_6__broad_mit_edu__Level_3__segmented_scna_minus_germline_cnv_hg19__seg.Level_3.2016012800.0.0.tar.gz" -O ACC.gistic.seg.tar.gz
  wget "${firehose_acc}/gdac.broadinstitute.org_ACC.Merge_snp__genome_wide_snp_6__broad_mit_edu__Level_3__segmented_scna_hg19__seg.Level_3.2016012800.0.0.tar.gz" -O ACC.raw.seg.tar.gz
  wget "${firehose_acc}/gdac.broadinstitute.org_ACC.Mutation_Packager_Calls.Level_3.2016012800.0.0.tar.gz" -O ACC.maf.tar.gz
  wget "${firehose_acc}/gdac.broadinstitute.org_ACC.Mutation_Packager_Oncotated_Calls.Level_3.2016012800.0.0.tar.gz" -O ACC.maf.anno.tar.gz
}
0
0
查看评论

GTF基因注释文件详解

GFF和GTF是两种最常用的数据库注释格式,在信息分析中建库时除了需要fasta文件一般还会需要这两种文件,提取需要的信息进行注释。 Cufflinks/Tophat 软件需要 GTF文件作为基因注释文件。  GFF全称为general feature format,这种格式主要是用来注...
  • sinat_38163598
  • sinat_38163598
  • 2017-06-03 12:33
  • 2299

BED文件格式

BED 文件格式        BED 文件格式提供了一种灵活的方式来定义的数据行,以用来描述注释的信息。BED行有3个必须的列和9个额外可选的列。 每行的数据格式要求一致。 必须包含的3列: chr...
  • biubiuv
  • biubiuv
  • 2014-10-21 17:19
  • 1695

生信:1:vcf格式文件解读

第二章:生物信息分析第一节:解读vcf格式文件1,vcf格式说明VCF格式:Variant Call Format,用于记录variants (SNP / InDel)的文件格式,关于其说明,详见:http://gatkforums.broadinstitute.org/discussion/126...
  • genome_denovo
  • genome_denovo
  • 2017-12-02 19:53
  • 1081

创建gtf下载

搞错俩, 麻利的改回来 cat id.v4_xloc  | perl -ne '@t = split(/\t/, $_); chomp @t;  $nId = "NONHSAG" . substr("000000"...
  • NeoBilly
  • NeoBilly
  • 2013-12-31 15:01
  • 734

生物基因数据文件——vcf格式详解

转载于:1.什么是vcf文件VCF是用于描述SNP,INDEL和SV结果的文本文件。在GATK软件中得到最好的支持,当然SAMtools得到的结果也是VCF格式,和GATK的VCF格式有点差别。2.VCF的主体结构##fileformat=VCFv4.0 ##FILTER= ##FORMAT= #...
  • u012150360
  • u012150360
  • 2017-04-24 22:18
  • 5950

Linux命令之time - 测定一个命令的运行时间!

time命令常用于测量一个命令的运行时间,注意不是用来显示和修改系统时间的(这是date命令干的事情)。包括实际使用时间(realtime)、用户态使用时间(the process spent in user mode)、内核态使用时间(the process spent in kernel mod...
  • zq9017197
  • zq9017197
  • 2012-07-04 17:07
  • 4397

nohup使用

nohup命令的原理:您启动作业的终端被称为这个作业的控制终端。当您注销时,一些 shell(缺省情况下不是 bash)将向这些后台作业传送 SIGHUP 信号,从而导致这些进程退出。为了保护进程以免产生这种行为,当您启动进程时,请使用 nohup。Unix/Linux下一般比如想让某个程序在后台运...
  • zhangxinrun
  • zhangxinrun
  • 2010-05-27 17:55
  • 4111

从基因组注释说起

N年前测序还是问题,基因组的解读排在后边,现如今,测序已然不是问题, 成百上千的基因组被测序,这么多的基因组需要解读还真不是件容易的事。以前高大上的工作,注定要飞入寻常百姓家。开发出易用且准确度高的注释工具就很迫切了。   首先来说说编码蛋白基因的注释。真核生物的基因往往具有内含子,不像原核生物那...
  • msw521sg
  • msw521sg
  • 2016-09-30 23:34
  • 1134

ANNOVAR 注释软件

ANNOVAR简介 ANNOVAR是由王凯编写的一个注释软件,可以对SNP和indel进行注释,也可以进行变异的过滤筛选。 ANNOVAR能够利用最新的数据来分析各种基因组中的遗传变异。主要包含三种不同的注释方法,Gene-based Annotation(基于基因的注释)、Region...
  • herokoking
  • herokoking
  • 2017-12-13 13:03
  • 446

Aspera从NCBI下载基因组数据

1.下载/安装Aspera 下载地址:http://downloads.asperasoft.com/en/downloads/8?list 选择对应的版本,我用的是centos7_x64服务器,下载是一个aspera-connect-3.7.2.141527-linux-64.sh 使用普通...
  • jiangpeng59
  • jiangpeng59
  • 2017-06-07 17:33
  • 2178
    个人资料
    • 访问:33869次
    • 积分:552
    • 等级:
    • 排名:千里之外
    • 原创:5篇
    • 转载:77篇
    • 译文:2篇
    • 评论:2条
    最新评论