[ngs] Instructions: working woth interval data

Diego Mazzotti mazzottidr at gmail.com
Fri Jun 14 19:34:31 PDT 2013


Dear all,
Find attached the script from today's exercise on interval data.

Please test it step by step and feel free to modify or correct it (please
send us the updated version if there are any corrections to be made!)

Best!


Diego R. Mazzotti, Ph.D.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.idyll.org/pipermail/ngs-2013/attachments/20130614/4b298a8d/attachment.htm>
-------------- next part --------------
 # download and install software
 cd /root
 wget -O dropbox.tar.gz "http://www.dropbox.com/download/?plat=lnx.x86_64"
 tar -xvzf dropbox.tar.gz
 ~/.dropbox-dist/dropboxd &
 mkdir Dropbox/intervals
 cd /mnt
 wget https://bedtools.googlecode.com/files/BEDTools.v2.17.0.tar.gz
 tar zxvf BEDTools.v2.17.0.tar.gz
 cd bedtools-2.17.0/
 make
 cp bin/bedtools /usr/local/bin/
 cd /mnt
 curl -O -L http://sourceforge.net/projects/bio-bwa/files/bwa-0.7.4.tar.bz2
 tar xvfj bwa-0.7.4.tar.bz2
 cd bwa-0.7.4
 make
 cp bwa /usr/local/bin
 cd /root
 curl -O -L http://sourceforge.net/projects/samtools/files/samtools/0.1.19/samtools-0.1.19.tar.bz2
 tar xvfj samtools-0.1.19.tar.bz2
 cd samtools-0.1.19
 make
 cp samtools /usr/local/bin
 cd misc/
 cp *.pl maq2sam-long maq2sam-short md5fa md5sum-lite wgsim /usr/local/bin/
 cd /Dropbox/intervals
 
 #download teachme data
 wget http://bcc.bx.psu.edu/data/teachme.zip
 unzip teachme.zip
 cd teachme/
 
 #look at genes.gff and sc.fa
 less genes.gff
 cd refs/
 less sc.fa
 
 #clone and install wgsim (mutation simulator)
 git clone https://github.com/lh3/wgsim
 cd wgsim/
 gcc -g -O2 -Wall -o wgsim wgsim.c -lz -lm
 cp wgsim /usr/bin/
 
 # go to refs folder and run wgsim on sc_fa to create NGS reads that contains mutations based on the sc.fa reference genome
 cd ..
 wgsim -N 100000 sc.fa sc_mut1.fq sc_mut2.fq > sc_mut_screen.txt
 
 #look at mutation summary and mutated read (just sc_mut1.fq)
 less sc_mut_screen.txt
 less sc_mut1.fq
 
 #make work directory and copy genes.gff and mutated reads
 cd ..
 mkdir work
 mv genes.gff work/
 mv refs/sc_mut* work/
 cd work/
 
 #index reference genome for alignment
 bwa index ../refs/sc.fa
 
 #perform bwa alignment
 bwa aln ../refs/sc.fa sc_mut1.fq > sc_mut1.sai
 
 #create SAM files and take a look at it
 bwa samse ../refs/sc.fa sc_mut1.sai sc_mut1.fq > sc_mut1.sam
 less sc_mut1.sam
 
 #create BAM files using samtools, create a temp bam file, sort it and index it
 samtools view -Sb sc_mut1.sam > tmp.bam
 samtools sort tmp.bam sc_mut1
 samtools index sc_mut1.bam
 
 #create BED files (containing interval information of the alingments)
  bedtools bamtobed -ed -i sc_mut1_bwa.bam > sc_mut1_bwa.bed
 
 #create genome.txt file from sc_mut1.sam header (first column: chr names; second column: chr size (without LN:) - use text editor)
 #save genomes.txt into Dropbox/intervals/teachme/work on your local computer
 
 #'Extend' genes.gff intervals to 1000 bases upstream positive strand reads / 1000 downstream negative strand reads - this is to include a hypothetical 1000 bases promoter
 bedtools slop -l 1000 -r 0 -s -i genes.gff -g genome.txt > slop.gff
 
 #check if it worked
 head -3 genes.gff
 head -3 slop.gff
 
 #subtract the rest of the genes.gff file ("genome") from the slop.gff file to keep only the promoter regions
 bedtools subtract -a slop.gff -b genes.gff -s > promoter.gff
 
 #check if it worked
 head -3 genes.gff
 head -3 promoter.gff
 
 #intersect the promoter.gff file (which contains annotation and interval information of the promoters) with the BED file created from the aligment to get the reads that mapped to the promoter regions
 bedtools intersect -a promoter.gff -b sc_mut1_bwa.bed -wo > answer.txt
 
 


More information about the ngs-2013 mailing list