Code Monkey home page Code Monkey logo

cd tutorial

sudo docker run -i -t -v $PWD:/tutorial -w /tutorial ceciliaklein/teaching:uvic

export PATH=$PATH:/tutorial/teaching-utils/;

######################################################################################

#1.Perform differential expression analysis between brain and liver using the EdgeR. #Present results using a heatmap with hierarchical clustering in rows and columns and #colored classification of differentially expressed genes (DEGs), i.e. overexpressed in #brain versus liver and the other way around.

cd analysis

#edgeR analysis

edgeR.analysis.R --input_matrix ../quantifications/encode.mouse.gene.expected_count.idr_NA.tsv
--metadata /tutorial/data/gene.quantifications.index.tsv
--fields tissue
--coefficient 3
--output brain_X_liver

#Write a list of the genes overexpressed in brain compared to liver, according to edgeR analysis:

awk '$NF<0.01 && $2<-10{print $1"\tover_brain_X_liver"}' edgeR.cpm1.n2.brain_X_liver.tsv > edgeR.0.01.over_brain_X_liver.txt

#Write a list of the genes overexperessed in liver compared to brain, according to edgeR analysis:

awk '$NF<0.01 && $2>10 {print $1"\tover_liver_X_brain"}' edgeR.cpm1.n2.brain_X_liver.tsv > edgeR.0.01.over_liver_X_brain.txt

#Extra (is just to make easy the treatment of the files without interferring the ones of the hands-on)

cp edgeR.0.01.over_brain_X_liver.txt delivery/

cp edgeR.0.01.over_liver_X_brain.txt delivery/

#Heatmap

awk '$3=="gene"{ match($0, /gene_id "([^"]+).+gene_type "([^"]+)/, var); print var[1],var[2] }' OFS="\t" /tutorial/refs/gencode.vM4.gtf
| join.py --file1 stdin
--file2 <(cat delivery/edgeR.0.01.over*.txt)
| sed '1igene\tedgeR\tgene_type' > gene.edgeR.tsv

cut -f1 gene.edgeR.tsv
| tail -n+2
| selectMatrixRows.sh - ../quantifications/encode.mouse.gene.TPM.idr_NA.tsv
| ggheatmap.R --width 5
--height 8
--col_metadata /tutorial/data/gene.quantifications.index.tsv
--colSide_by tissue
--col_labels labExpId
--row_metadata gene.edgeR.tsv
--merge_row_mdata_on gene
--rowSide_by edgeR,gene_type
--row_labels none
--log
--pseudocount 0.1
--col_dendro
--row_dendro
--matrix_palette /tutorial/palettes/palDiverging.txt
--colSide_palette /tutorial/palettes/palTissue.txt
--output heatmap.brain_X_liver.pdf

######################################################################################

#2. Perform gene ontology enrichment analysis of the two sets of DEGs using the command #line wrapper of GOstats R package for biological processes. Plot results using any #graphical representation and discuss results

#GO enrichment

#already have universe.txt

###overexpressed in brain #BP awk '{split($1,a,"."); print a[1]}' edgeR.0.01.over_brain_X_liver.txt
| GO_enrichment.R --universe universe.txt
--genes stdin
--categ BP
--output edgeR.over_brain_X_liver
--species mouse

#REViGO input awk 'NR==1{$1="% "$1}{print $1,$2}' edgeR.over_brain_X_liver.BP.tsv

###overexpressed in liver #BP awk '{split($1,a,"."); print a[1]}' edgeR.0.01.over_liver_X_brain.txt
| GO_enrichment.R --universe universe.txt
--genes stdin
--categ BP
--output edgeR.over_liver_X_brain
--species mouse

#REViGO input awk 'NR==1{$1="% "$1}{print $1,$2}' edgeR.over_liver_X_brain.BP.tsv

######################################################################################

#3. Analyze differential splicing using SUPPA between brain and liver for skipping exon, #intron retention, mutually exclusive exon and alternative first exon. Plot top results using #heatmaps. Different thresholds may be chosen for each event type.

cd .. cd splicing

#List of transcript ids awk '$3=="transcript" && $0~/gene_type "protein_coding"/{ match($0, /transcript_id "([^"]+)/, id); print id[1] }' /tutorial/refs/gencode.vM4.gtf |sort -u > protein_coding_transcript_IDs.txt

#Genome annotation restricted to exon features and filtered by transcript type cat /tutorial/refs/gencode.vM4.gtf |awk '$3=="exon"' |grep -Ff protein_coding_transcript_IDs.txt > exon-annot.gtf

#Filter transcript TPM matrix selectMatrixRows.sh protein_coding_transcript_IDs.txt /tutorial/quantifications/encode.mouse.transcript.TPM.idr_NA.tsv > pc-tx.tsv

#Individual transcript expression matrices for tissue in Brain Liver; do selectMatrixColumns.sh PRNAembryo${tissue}1:PRNAembryo${tissue}2 pc-tx.tsv > expr.${tissue}.tsv done

#Generate ALternative Splicing events: SE, FL, MX and RI suppa.py generateEvents -i exon-annot.gtf -e SE FL MX RI -o localEvents -f ioe

#Compute PSI calue for local event and tissue event=SE; suppa.py psiPerEvent --total-filter 10 --ioe-file localEvents_${event}_strict.ioe --expression-file pc-tx.tsv -o PSI-${event}

event=SE; for tissue in Brain Heart Liver ;do selectMatrixColumns.sh PRNAembryo${tissue}1:PRNAembryo${tissue}2 PSI-${event}.psi > ${tissue}.${event}.psi;done

event=MX; suppa.py psiPerEvent --total-filter 10 --ioe-file localEvents_${event}_strict.ioe --expression-file pc-tx.tsv -o PSI-${event}

event=MX; for tissue in Brain Liver ;do selectMatrixColumns.sh PRNAembryo${tissue}1:PRNAembryo${tissue}2 PSI-${event}.psi > ${tissue}.${event}.psi;done

event=AF; suppa.py psiPerEvent --total-filter 10 --ioe-file localEvents_${event}_strict.ioe --expression-file pc-tx.tsv -o PSI-${event}

event=AF; for tissue in Brain Liver ;do selectMatrixColumns.sh PRNAembryo${tissue}1:PRNAembryo${tissue}2 PSI-${event}.psi > ${tissue}.${event}.psi;done

event=RI; suppa.py psiPerEvent --total-filter 10 --ioe-file localEvents_${event}_strict.ioe --expression-file pc-tx.tsv -o PSI-${event}

event=RI; for tissue in Brain Liver ;do selectMatrixColumns.sh PRNAembryo${tissue}1:PRNAembryo${tissue}2 PSI-${event}.psi > ${tissue}.${event}.psi;done

#Generate differential splicing events event=SE; suppa.py diffSplice --method empirical --input localEvents_${event}_strict.ioe --psi Brain.${event}.psi Liver.${event}.psi --tpm expr.Brain.tsv expr.Liver.tsv -c -gc -o DS.${event}

event=MX; suppa.py diffSplice --method empirical --input localEvents_${event}_strict.ioe --psi Brain.${event}.psi Liver.${event}.psi --tpm expr.Brain.tsv expr.Liver.tsv -c -gc -o DS.${event}

event=RI; suppa.py diffSplice --method empirical --input localEvents_${event}_strict.ioe --psi Brain.${event}.psi Liver.${event}.psi --tpm expr.Brain.tsv expr.Liver.tsv -c -gc -o DS.${event}

event=AF; suppa.py diffSplice --method empirical --input localEvents_${event}_strict.ioe --psi Brain.${event}.psi Liver.${event}.psi --tpm expr.Brain.tsv expr.Liver.tsv -c -gc -o DS.${event}

#plot SE #prepare input for heatmap event=SE; awk 'BEGIN{FS=OFS="\t"}NR>1 && $2!="nan" && ($2>0.5 || $2<-0.5) && $3&lt;0.05{print}' DS.${event}.dpsi|cut -f1 > top-examples-SE.txt selectMatrixRows.sh top-examples-SE.txt DS.SE.psivec > matrix.top-examples-SE.tsv

#heatmap SE ggheatmap.R -i matrix.top-examples-SE.tsv -o heatmap_top-examples-SE.pdf --matrix_palette /tutorial/palettes/palSequential.txt --row_dendro --matrix_fill_limits "0,1" -B 8

#plot AF #prepare input for heatmap event=AF; awk 'BEGIN{FS=OFS="\t"}NR>1 && $2!="nan" && ($2>0.5 || $2<-0.5) && $3&lt;0.05{print}' DS.${event}.dpsi|cut -f1 > top-examples-AF.txt selectMatrixRows.sh top-examples-AF.txt DS.AF.psivec > matrix.top-examples-AF.tsv

#heatmap alternative first exons top examples ggheatmap.R -i matrix.top-examples-AF.tsv -o heatmap_top-examples-AF.pdf --matrix_palette /tutorial/palettes/palSequential.txt --row_dendro --matrix_fill_limits "0,1" -B 8

#plot MX #prepare input for heatmap event=MX; awk 'BEGIN{FS=OFS="\t"}NR>1 && $2!="nan" && ($2>0.3 || $2<-0.3) && $3&lt;0.05{print}' DS.${event}.dpsi|cut -f1 > top-examples-MX.txt selectMatrixRows.sh top-examples-MX.txt DS.MX.psivec > matrix.top-examples-MX.tsv

#heatmap alternative first exons top examples ggheatmap.R -i matrix.top-examples-MX.tsv -o heatmap_top-examples-MX.pdf --matrix_palette /tutorial/palettes/palSequential.txt --row_dendro --matrix_fill_limits "0,1" -B 8

#plot RI #prepare input for heatmap event=RI; awk 'BEGIN{FS=OFS="\t"}NR>1 && $2!="nan" && ($2>0.3 || $2<-0.3) && $3&lt;0.05{print}' DS.${event}.dpsi|cut -f1 > top-examples-RI.txt selectMatrixRows.sh top-examples-RI.txt DS.RI.psivec > matrix.top-examples-RI.tsv

#heatmap alternative first exons top examples ggheatmap.R -i matrix.top-examples-RI.tsv -o heatmap_top-examples-RI.pdf --matrix_palette /tutorial/palettes/palSequential.txt --row_dendro --matrix_fill_limits "0,1" -B 8

######################################################################################

#4. Find H3K4me3 peaks shared by brain and liver and the ones exclusively found in each #tissue using the narrow peaks found in /tutorial/results using bedtools intersect. Show #results using a bar plot colored by the color code used during the hands-on. Palette is #availables at /tutorial/palettes/palTissue.txt. Any color may be chosen for shared peaks.

cd ..

cd chip-analysis

#ChIP-seq #common peaks brain & liver --> id (.tsv) bedtools intersect -a /tutorial/results/CHIPembryoBrain.narrowPeak -b /tutorial/results/CHIPembryoLiver.narrowPeak -wao|awk 'BEGIN{FS=OFS="\t"; print "Brain_coordinates","Brain_peak","Liver_coordinates","Liver_peak","intersection"}$NF!=0{print $1":"$2"-"$3,$4,$11":"$12"-"$13,$14,$NF}' > common-peaks-Brain-Livers.tsv

#common peaks brain & liver --> coordinate peaks (.bed) bedtools intersect -a /tutorial/results/CHIPembryoBrain.narrowPeak -b /tutorial/results/CHIPembryoLiver.narrowPeak -wao|awk 'BEGIN{FS=OFS="\t"; print "Brain_coordinates","Brain_peak","Liver_coordinates","Liver_peak","intersection"}$NF!=0{print $1,$2,$3,$4";"$14}' > common-peaks-Brain-Livers.bed

#brain specific peaks --> id (.tsv) bedtools intersect -a /tutorial/results/CHIPembryoBrain.narrowPeak -b /tutorial/results/CHIPembryoLiver.narrowPeak -wao|awk 'BEGIN{FS=OFS="\t"; print "Brain_coordinates","Brain_peak"}$NF==0{print $1":"$2"-"$3,$4}' > Brain-specific-peaks.tsv

#brain specific peaks --> coordinates (.bed) bedtools intersect -a /tutorial/results/CHIPembryoBrain.narrowPeak -b /tutorial/results/CHIPembryoLiver.narrowPeak -wao|awk 'BEGIN{FS=OFS="\t"; print "Brain_coordinates","Brain_peak"}$NF==0{print $1,$2,$3,$4}' > Brain-specific-peaks.bed

#liver specific peaks --> id (.tsv) bedtools intersect -a /tutorial/results/CHIPembryoLiver.narrowPeak -b /tutorial/results/CHIPembryoBrain.narrowPeak -wao|awk 'BEGIN{FS=OFS="\t"; print "Liver_coordinates","Liver_peak"}$NF==0{print $1":"$2"-"$3,$4}' > Liver-specific-peaks.tsv

#liver specific peaks --> coordinates (.bed) bedtools intersect -a /tutorial/results/CHIPembryoLiver.narrowPeak -b /tutorial/results/CHIPembryoBrain.narrowPeak -wao|awk 'BEGIN{FS=OFS="\t"; print "Liver_coordinates","Liver_peak"}$NF==0{print $1,$2,$3,$4}' > Liver-specific-peaks.bed

#palette cp ../palettes/palTissue.txt /tutorial/chip-analysis/palPeaks.txt nano palPeaks.txt

#take the number of peaks of each set wc -l *s.tsv|awk '$2=="total"{print $2"\t"$1}$2!="total"{split($2,a,"-");print a[1]"\t"$1}'|sed '1iTissue\tNumber'> input.tsv

#barplot ggbarplot.R -i input.tsv -o number_of_peaks.pdf --title "Nº of peaks" --y_title "Number of Peaks" --x_title "Tissue/Set" --palette_fill palPeaks.txt --fill_by 1 --header

######################################################################################

#5. Create a BED file of 200bp up/downstream TSS of genes and overlap DEGs (step 1) #with the 3 sets of H3K4me3 peaks classified in the previous step (4). Show three #examples in the UCSC genome browser, including RNA-seq, ChIP-seq and ATAC-seq #tracks. Ideally, one example of each peak set (i.e. shared peak, peak exclusively called #in brain and peak exclusively called in liver). Discuss the integration of the three datasets #in the TSS of the selected cases.

#TSS OF ALL PROTEIN CODING GENES awk 'BEGIN{FS=OFS="\t"}$3=="gene" && $0~/gene_type "protein_coding"/ && $7=="+"{ match($0, /transcript_id "([^"]+)/, id); print $1,$4-200,$4+200,$7,id[1] }$3=="gene" && $0~/gene_type "protein_coding"/ && $7=="-"{ match($0, /transcript_id "([^"]+)/, id); print $1,$5-200,$5+200,id[1],".",$7 }' /tutorial/refs/gencode.vM4.gtf > protein-coding-genes-200up_downTSS.bed

#TSS OF GENES OVER IN BRAIN cut -f1 /tutorial/analysis/delivery/edgeR.0.01.over_brain_X_liver.txt | sort -u | grep -Ff- protein-coding-genes-200up_downTSS.bed | cut -f1,2,3 > brain_X_liver.bed

#TSS OFE GENES OVER IN LIVER cut -f1 /tutorial/analysis/delivery/edgeR.0.01.over_liver_X_brain.txt | sort -u | grep -Ff- protein-coding-genes-200up_downTSS.bed | cut -f1,2,3 > liver_X_brain.bed

#remove header from common_peaks to avoid error nano common-peaks-Brain-Livers.bed

#remove header from common_peaks to avoid error nano Brain-specific-peaks.bed

#remove header from common_peaks to avoid error nano Liver-specific-peaks.bed

#####BRAIN PROCEDURE

#obtain interesection between common peaks and TSS of overexpressed genes in brain cut -f1,2,3 common-peaks-Brain-Livers.bed | bedtools intersect -a stdin -b brain_X_liver.bed -wa > common_peaks_brain.bed

#obtain interesection between brain specific peaks and TSS of overexpressed genes in brain cut -f1,2,3 Brain-specific-peaks.bed | bedtools intersect -a stdin -b brain_X_liver.bed -wa > brain_peaks_brain.bed

#obtain interesection between liver specific peaks and TSS of overexpressed genes in brain cut -f1,2,3 Liver-specific-peaks.bed | bedtools intersect -a stdin -b brain_X_liver.bed -wa > liver_peaks_brain.bed

#####LIVER PROCEDURE

#obtain interesection between common peaks and TSS of overexpressed genes in liver cut -f1,2,3 common-peaks-Brain-Livers.bed | bedtools intersect -a stdin -b liver_X_brain.bed -wa > common_peaks_liver.bed

#obtain interesection between brain specific peaks and TSS of overexpressed genes in liver cut -f1,2,3 Brain-specific-peaks.bed | bedtools intersect -a stdin -b liver_X_brain.bed -wa > brain_peaks_liver.bed

#obtain interesection between liver specific peaks and TSS of overexpressed genes in liver cut -f1,2,3 Liver-specific-peaks.bed | bedtools intersect -a stdin -b liver_X_brain.bed -wa > liver_peaks_liver.bed

Ariadna Cilleros Portet's Projects

Ariadna Cilleros Portet doesn’t have any public repositories yet.

Recommend Projects

  • React photo React

    A declarative, efficient, and flexible JavaScript library for building user interfaces.

  • Vue.js photo Vue.js

    🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.

  • Typescript photo Typescript

    TypeScript is a superset of JavaScript that compiles to clean JavaScript output.

  • TensorFlow photo TensorFlow

    An Open Source Machine Learning Framework for Everyone

  • Django photo Django

    The Web framework for perfectionists with deadlines.

  • D3 photo D3

    Bring data to life with SVG, Canvas and HTML. 📊📈🎉

Recommend Topics

  • javascript

    JavaScript (JS) is a lightweight interpreted programming language with first-class functions.

  • web

    Some thing interesting about web. New door for the world.

  • server

    A server is a program made to process requests and deliver data to clients.

  • Machine learning

    Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.

  • Game

    Some thing interesting about game, make everyone happy.

Recommend Org

  • Facebook photo Facebook

    We are working to build community through open source technology. NB: members must have two-factor auth.

  • Microsoft photo Microsoft

    Open source projects and samples from Microsoft.

  • Google photo Google

    Google ❤️ Open Source for everyone.

  • D3 photo D3

    Data-Driven Documents codes.