Create a tx2gene file that later will be use for over representation analyses
entap<-read.table("../annotation_m1_m3_a/final_results/entap_results.tsv",header=TRUE, sep="\t", quote="", comment.char="")
entap_ann <- entap[!is.na(entap$Description), ] # remove sequences with no annotation
entap_ann <- entap_ann %>%
mutate(gene = str_remove(.[[1]], "_i\\d+$"))
# Summarize by gene
final_tbl <- entap_ann %>%
group_by(gene) %>%
summarise(
isoforms = n(), # number of isoforms
annotation = paste(unique(Description), collapse = "; "), # unique annotations
.groups = "drop"
)
# Count number of annotations per gene
final_tbl$num_anno <- sapply(strsplit(final_tbl$annotation, ";"), length)
# Check number of "genes" with a single annotation
table(final_tbl$num_anno)##
## 1 2 3 4 5 6 7 8
## 14857 3417 1042 309 95 35 9 2
# check the number of "genes" that where having redundant isoforms
final_tbl[final_tbl$isoforms > 1 & final_tbl$num_anno == 1, ] %>% nrow()## [1] 3329
df_ann <- entap_ann %>%
mutate(
transcript = .[[1]],
base_gene = str_remove(transcript, "_i\\d+$")
)
# Step 2: For each gene, check whether multiple annotations exist
rosetta_stone <- df_ann %>%
group_by(base_gene) %>%
mutate(
anno_unique = n_distinct(Subject.Sequence),
gene = ifelse(anno_unique == 1, base_gene, transcript)
) %>%
ungroup() %>%
select(transcript, gene)
rosetta_stoneWe will read a table containing our gene annotations from EnTAP
ann <- read.table("../annotation_m1_m3_a/final_results/entap_results.tsv",header=TRUE, sep="\t",quote="", comment.char="")match GOmap file
GOmap <- read.table("../annotation_m1_m3_a/final_results/annotated_gene_ontology_terms.tsv",sep="\t",quote="",comment.char="",header=TRUE) %>%
mutate(gene_id=str_remove(query_sequence, ".p[0-9]+$"))
shared_transcripts <- df_ann %>%
group_by(Subject.Sequence) %>%
filter(n() > 1) %>%
pull(transcript)
pos<-which(GOmap$query_sequence %in% shared_transcripts)
GOmap$gene_id[pos] <- GOmap$gene_id[pos] %>% str_remove("_i\\d+$")Read in information about the predicted proteins from transdecoder
bed <- read.table("../annotation_m1_m3_a/nonredundant_M1_M3_A.fasta.transdecoder.bed", sep="\t", quote="", comment.char="", skip=1) %>%
mutate(
pepID=str_extract(V4, "(?<=ID=)[^;]+"),
score=str_extract(V4, "(?<=score=)[^,]+"))Get transcript length form this file
## Rows: 335702 Columns: 2
## -- Column specification --------------------------------------------------------
## Delimiter: " "
## chr (1): X1
## dbl (1): X2
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
Annotations were created for the predicted proteins using TransDecoder. This implies:
*Some transcripts might be associated with more than one predicted protein (most of which are likely artifacts).
*Transcript names now have .pX added to them.
We will address this further below:
bed <- select(bed, V1,pepID,score) %>%
group_by(V1) %>%
summarize(Transcript.ID=V1[which.max(score)],Peptide.ID=pepID[which.max(score)])Merge transcript/peptide IDs into annotation
output_dir<-"salmon_m1_m3_a/"
list_dir <- list.files(output_dir, pattern = "quant.sf$", recursive = TRUE, full.names = TRUE)
#sample information
sample_file<-read.table("all_reads.txt")
sample_file<-sample_file[order(sample_file$V2),] #to match the directory order
sample_names<-sample_file$V3
stage<-sample_file$V1 %>% gsub(pattern="\\_\\w+$", replacement="")
colour<-sample_file$V1 %>% gsub(pattern="^\\w+\\_", replacement="")
egg<-gsub(pattern ="\\_3\\w+|\\_A\\w+|\\_1\\w+" ,replacement = "" ,x=sample_names)
coldata<-data.frame(sample_names,stage,colour,egg) # All the sample data in a single dataframe
#import all the result with tximport
txi <- tximport(list_dir, type = "salmon",txOut=TRUE,ignoreTxVersion = TRUE)## reading in files with read_tsv
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37
Merge the counts data frame with the annotations
We will modify the EnTAP results to deal with transcripts and lineage contamination
Let´s summarize total expression per sample by annotation status
Visualize the annotation status of each sample (No database hit, annotated, uninformative hit)
## Summarize expression per sample by contaminant status
Summarize the results in a single data frame
bycontam <- group_by(tab,Contaminant) %>%
summarize(across(V1:V37,sum)) %>% # get the summaries
pivot_longer(!Contaminant,names_to="Sample",values_to="Count") %>%
mutate(Contaminant = fct_relevel(Contaminant,c("Unknown","Yes","No"))) %>%
as.data.frame()Visualise the results
ggplot(bycontam,aes(fill=Contaminant,y=Count,x=Sample)) +
geom_bar(position="stack",stat="identity")All the samples have low levels of contamination
Here we will explore how accurate our annotation was. We expect to find that most of our transcripts correspond to arachnid proteins.
Using the column Taxonomic.Lineage, let’s get a fast summary
## .
## vertebrata chordata deuterostomia trichonephila clavipes
## [1,] 453 464 527 547
## .
## trichonephila inaurata trichonephila inaurata madagascariensis
## [1,] 680 680
## .
## parasteatoda parasteatoda tepidariorum theridiidae endopterygota
## [1,] 849 849 852 901
## .
## trichonephila clavata stegodyphus dumicola nephila pilipes nephila
## [1,] 923 1032 1132 1136
## .
## neoptera dicondylia insecta pterygota hexapoda eresidae eresoidea
## [1,] 1309 1341 1341 1341 1349 1389 1389
## .
## stegodyphus pancrustacea mandibulata trichonephila nephilidae
## [1,] 1389 1533 1539 2150 3286
## .
## araneus ventricosus araneus argiope bruennichi argiope araneidae
## [1,] 6965 6976 18803 18815 25792
## .
## araneoidea orbiculariae entelegynae araneomorphae araneae arachnida
## [1,] 29930 30305 31704 31709 31717 32346
## .
## chelicerata arthropoda panarthropoda ecdysozoa protostomia bilateria
## [1,] 32515 34054 34056 34159 34507 35034
## .
## eumetazoa metazoa opisthokonta eukaryota cellular organisms root
## [1,] 35114 35123 35139 35192 35205 35286
## .
## Unknown
## [1,] 300416
## .
## cellular organisms viruses
## 35205 81
## .
## archaea bacteria eukaryota
## 3 10 35192
## .
## hydra vulgaris lasius niger patella vulgata penaeus vannamei
## [1,] 14 14 14 14
## .
## vespa mandarinia bombyx mori bos taurus orbicella faveolata ladona fulva
## [1,] 14 16 16 16 17
## .
## cherax quadricarinatus graphocephala atropunctata
## [1,] 18 18
## .
## mytilus galloprovincialis dermacentor silvarum lygus hesperus
## [1,] 18 19 19
## .
## poeciliopsis prolifica dreissena polymorpha harpegnathos saltator
## [1,] 19 21 21
## .
## ixodes ricinus lingula anatina ruditapes philippinarum
## [1,] 21 21 22
## .
## haemaphysalis longicornis homalodisca liturata cuerna arida
## [1,] 24 24 25
## .
## lytechinus pictus ornithodoros moubata saccostrea cucullata
## [1,] 25 25 25
## .
## homarus americanus ornithodoros erraticus diaphorina citri
## [1,] 29 29 30
## .
## photinus pyralis drosophila melanogaster rhipicephalus microplus
## [1,] 35 39 40
## .
## ixodes scapularis mus musculus cryptotermes secundus homo sapiens
## [1,] 45 48 49 55
## .
## rhipicephalus sanguineus limulus polyphemus centruroides sculpturatus
## [1,] 58 164 206
## .
## stegodyphus mimosarum uloborus diversus trichonephila clavipes
## [1,] 357 375 547
## .
## trichonephila inaurata madagascariensis parasteatoda tepidariorum
## [1,] 680 849
## .
## trichonephila clavata stegodyphus dumicola nephila pilipes
## [1,] 923 1032 1132
## .
## araneus ventricosus argiope bruennichi Unknown
## [1,] 6965 18803 300416
According to this summary, it seems that we are on the right path. Let’s explore the results per sample, in case there are some samples that we need to remove form further analyses.
For this I will use some categorize of interest
tab <- mutate(tab,
TaxGroup=case_when(
str_detect(Taxonomic.Lineage, "archaea") ~ "archaea",
str_detect(Taxonomic.Lineage, "bacteria") ~ "bacteria",
str_detect(Taxonomic.Lineage, "fungi") ~ "fungi",
str_detect(Taxonomic.Lineage, "mammalia") ~ "mamifero",
str_detect(Taxonomic.Lineage, "araneomorphae") ~ "araneomorphae",
str_detect(Taxonomic.Lineage, "insecta") ~ "insecta",
str_detect(Taxonomic.Lineage, "arthropoda") ~ "arthropoda",
str_detect(Taxonomic.Lineage, "Unknown") ~ "Unknown",
.default = "Other"
)
)
tab <- tab[!is.na(tab$Description),]
tab$gene_id<-rosetta_stone$gene
tab_ref<-tabOrganize length file
#missing_lengths <- map_dfr(missed, function(mi) {
# matches <- len %>%
# filter(str_detect(transcript, fixed(mi))) %>% # grep equivalent
# filter(!transcript %in% tab$gene_id)
# tibble(
# transcript = mi,
# length = mean(matches$length)
# )
#})
missing_lengths<-read_csv("missing_len.csv")## Rows: 19587 Columns: 2
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (1): transcript
## dbl (1): length
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
get a table summarizing how many RNA fragments map by the defined taxonomic groups
bytax <- group_by(tab,TaxGroup) %>%
summarize(across(V1:V37,sum)) %>%
pivot_longer(!TaxGroup,names_to="Sample",values_to="Count") %>%
data.frame()Let’s visualize the results per sample
Summarize annotation by taxonomic group
bytax2 <- bytax %>%
group_by(Sample) %>%
mutate(
total_reads = sum(Count, na.rm = TRUE),
percentage = 100 * Count / total_reads
) %>%
ungroup()
ci_data <- bytax2 %>%
group_by(TaxGroup) %>%
summarise(
mean_percentage = mean(percentage, na.rm = TRUE),
sd_percentage = sd(percentage, na.rm = TRUE),
n = n(),
se = sd_percentage / sqrt(n),
lower_CI = mean_percentage - 1.96 * se,
upper_CI = mean_percentage + 1.96 * se
)
ci_dataOur annotation was good! The majority of transcripts in all our samples correspond to Araneomorphae
To reduce possible contaminants in our analyses, we first will limit our analyses to transcripts which taxonomical association correspond to Araneomorphae, Arthropoda, or Insecta. With these transcripts, we will build three different models in Deseq2.
Data filtering
Deseq2 model to test using developmental stage as predictor
tx2gene<-rosetta_stone
#remove txOut=TRUE to make gene level sum
txi <- tximport(list_dir, type = "salmon",ignoreTxVersion = TRUE,tx2gene = tx2gene)
dds <- DESeqDataSetFromTximport(txi, coldata, ~stage)
colnames(dds)<-sample_names
dds<-dds[rownames(dds) %in% m$gene_id,]
dds_stage <- DESeq(dds,minReplicatesForReplace=Inf)Model result visualization
PCA will help use to remove outlier samples fro the dataset that might bias the analysis
vsdata <- vst(dds_stage, blind = FALSE) ## use instead of vst because of sample size
pcaData <- plotPCA(vsdata, intgroup=c("stage"),ntop=10000, returnData=TRUE)
pcaData$colour<-coldata$colour
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, shape=stage,col=colour)) +
#geom_text(label=coldata$sample_names)+
geom_point(size=5) +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance")) +
theme_minimal()+
scale_colour_manual(values = c("black", "brown", "gray"))
It seems that there are five samples at the bottom of the plot that are
outliers. We need to remove this samples to properly generate some
results.
samples_rm<-c("G14_3_W6","G13_3_W1","G14_3_B4","G14_3_B5","G14_3_B1")
coldata$stage<-factor(coldata$stage,levels = c("molt1","molt3","adult"))
dds <- DESeqDataSetFromTximport(txi, coldata, ~stage)## using counts and average transcript lengths from tximport
colnames(dds)<-sample_names
dds<-dds[,!colnames(dds) %in% samples_rm]
dds<-dds[rownames(dds) %in% m$gene_id,]Re run the model
## estimating size factors
## using 'avgTxLength' from assays(dds), correcting for library size
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
Model result visualization
Let´s see if the visualization of the samples improve
vsdata <- vst(dds_stage, blind = FALSE) ## use instead of vst because of sample size
pcaData <- plotPCA(vsdata, intgroup=c("stage"),ntop=10000, returnData=TRUE)
pcaData$colour<-dds@colData$colour
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, shape=stage, col=colour)) +
stat_ellipse(aes(group=factor(stage)), type="norm", level=0.95) +
geom_point(size=5) +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance")) +
theme_minimal() +
scale_colour_manual(values = c("black", "brown", "gray"))## Warning: The following aesthetics were dropped during statistical transformation:
## colour.
## i This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## i Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
We successfully recover the three developmental stages of our
experimental design. A similar plot we get when we explore everything on
a heatmap
## Warning: package 'RColorBrewer' was built under R version 4.1.3
## Warning: package 'pheatmap' was built under R version 4.1.3
vsd<-vsdata
sampleDists <- dist(t(assay(vsd)))
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- paste(vsd$colour,vsd$stage, sep="-")
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
clustering_distance_rows=sampleDists,
clustering_distance_cols=sampleDists,
col=colors)let´s filter the dataset to include only juveniles
txi <- tximport(list_dir, type = "salmon",ignoreTxVersion = TRUE,tx2gene = tx2gene)
m <- filter(tab, str_detect(Taxonomic.Lineage, "araneomorphae|arthropoda|insecta"))[,c(1:38)]
#colnames(m)<-c(sample_names,"gene_id","Subject.Sequence")
samples_rm<-c("G14_3_W6","G13_3_W1","G14_3_B4","G14_3_B5","G14_3_B1")
coldata$stage<-as.factor(coldata$stage)
m3<-which(coldata$stage == "molt3")
txi_m3 <- lapply(txi, function(x) if(is.matrix(x)) return(x[,m3]) else return(x))
dds <- DESeqDataSetFromTximport(txi_m3, coldata[m3,], ~colour)
colnames(dds)<-sample_names[m3]
dds<-dds[,!colnames(dds) %in% samples_rm]
dds<-dds[rownames(dds) %in% m$gene_id,]Run the model
Let´s explore the model
vsdata <- vst(dds_juveniles, blind = FALSE) ## use instead of vst because of sample size
pcaData <- plotPCA(vsdata, intgroup=c("colour"),ntop=1000, returnData=TRUE)
#pcaData$stage<-dds@colData$stage
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2,col=colour)) +
#geom_text(label=dds_juveniles$sample_names)+
geom_point(shape = 15,size=5) +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance")) +
theme_minimal()+
scale_colour_manual(values = c("black", "gray"))
We successfully recover the three developmental stages of our
experimental design. A similar plot we get when we explore evrything on
a heatmap
library("RColorBrewer")
library(pheatmap)
vsd<-vsdata
sampleDists <- dist(t(assay(vsd)))
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- dds_juveniles$sample_names#paste(vsd$colour,vsd$stage, sep="-")
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
clustering_distance_rows=sampleDists,
clustering_distance_cols=sampleDists,
col=colors)Extract results
get shrunken log fold changes, specifying the coefficient
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
## Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
## https://doi.org/10.1093/biostatistics/kxw041
Plot the differential expressed transcripts with volcano plot. This visualization help to identify patterns of differential expression
express_values<-as_tibble(res_shrink)
express_values$gene_id<-rownames(res_shrink)
ggplot(data= express_values, aes(x= log2FoldChange, y= -log10(padj))) +
geom_point(colour= 'grey80',size=3) +
geom_point(data=express_values[abs(express_values$log2FoldChange) >= 1 & express_values$padj <= 0.01,],aes(x=log2FoldChange, y=(-log10(padj))),colour= '#0A3F55',size=3)+
geom_vline(xintercept= c(-1, 1), colour= 'black', linetype= 'dashed') +
geom_hline(yintercept= 2, colour= 'black', linetype= 'dashed') +
xlab('log2 fold-change') +
ylab('-log10(Padj)') +
theme_classic()## Warning: Removed 1799 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 40 rows containing missing values or values outside the scale range
## (`geom_point()`).
It seems that there are not many differential expressed transcripts
This plot shows the log2 fold changes over the mean of normalized counts for all the samples
It seems that there are not many differential expressed transcripts
res_sig005<-res_shrink[which(res_shrink$pvalue<0.05 & res_shrink$padj < 0.01),]
res_sig001<-res_shrink[which(res_shrink$pvalue<0.01 & res_shrink$padj < 0.01),]
nrow(res_sig005)## [1] 10
## [1] 10
There are 5 DE transcripts independent on the P-value threshold
Let’s visualize in a heatmap of the top 50 DE transcripts to check if there is any clear pattern of differential expression
rld_m3 <- rlog(dds_juveniles, blind=FALSE)
# get top 50 log fold change genes (excluding cook's cutoff outliers)
top50_m3 <- data.frame(res_sig005) %>%
filter(!is.na(padj)) %>%
arrange(-abs(log2FoldChange)) %>%
rownames() %>%
head(.,n=10)
df <- data.frame(colData(dds_juveniles)[,"colour"])
rownames(df) <- colnames(dds_juveniles)
colnames(df) <- "colour"
pheatmap(
assay(rld_m3)[top50_m3,],
cluster_rows=TRUE,
show_rownames=TRUE,
cluster_cols=FALSE,
annotation_col=df,
scale = 'row'
)
This plot confirms that there is not clear pattern of differtial
expression betwen black-and-whit and black juveniles
Let’s get the DE transcripts using an alpha of 0.01
res_sig<-res_shrink
de_export<-res_shrink[which(res_shrink$pvalue<0.05 & res_shrink$padj<0.01),]
de_export<-as.data.frame(de_export)
de_export$gene_id<-rownames(de_export)
write_csv(de_export,"sigDE_juveniles.txt")
res2 <- cbind(gene_id=rownames(res_sig),res_sig) %>%
data.frame() %>%
filter(!is.na(padj))
res2 <- inner_join(res2,tab,by="gene_id")Get the length of the transcripts for the correction with goseq
# vector of transcript lengths
tran_length<-len
names(tran_length)<-c("gene_id","len")
res2 <- inner_join(res2,tran_length,by="gene_id")## Warning in inner_join(res2, tran_length, by = "gene_id"): Detected an unexpected many-to-many relationship between `x` and `y`.
## i Row 6 of `x` matches multiple rows in `y`.
## i Row 349404 of `y` matches multiple rows in `x`.
## i If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
res2_unique <- res2[!duplicated(res2$gene_id), ]
de <- res2_unique$pvalue<0.05 & res2_unique$padj < 0.01
names(de) <- res2_unique$gene_idmapping of transcript IDs to category IDs
#GOmap <- read.table("../annotation_m1_m3_a/final_results/annotated_gene_ontology_terms.tsv",sep="\t",quote="",comment.char="",header=TRUE) %>%
# mutate(gene_id=str_remove(query_sequence, ".p[0-9]+$"))match transcripts with GO terms
With this object we can start the analysis
Account for transcript length bias by calculating the probability of being DE based purely on gene length
False discovery rate correction on p-values using Benjamini-Hochberg
GO.wall <- cbind(
GO.wall,
padj_overrepresented=p.adjust(GO.wall$over_represented_pvalue, method="BH"),
padj_underrepresented=p.adjust(GO.wall$under_represented_pvalue, method="BH")
)Names of GO terms
sig_go<-GO.wall[which(GO.wall$padj_overrepresented < 0.001),c("category","term","over_represented_pvalue","padj_overrepresented")]
write_excel_csv(sig_go,"sig_go_m3.csv")Let’s explore the
GO.wall %>%
top_n(10, wt=-over_represented_pvalue) %>%
mutate(hitsPerc=numDEInCat*100/numInCat) %>%
ggplot(aes(x=hitsPerc,
y=term,
colour=over_represented_pvalue,
size=numDEInCat)) +
geom_point() +
expand_limits(x=0) +
labs(x="Hits (%)", y="GO term", colour="p value", size="Count")let´s filter the dataset to include only juveniles
txi <- tximport(list_dir, type = "salmon",ignoreTxVersion = TRUE,tx2gene = tx2gene)
m <- filter(tab, str_detect(Taxonomic.Lineage, "araneomorphae|arthropoda|insecta"))[,c(1:38)]
#colnames(m)<-c(sample_names,"gene_id","Subject.Sequence")
coldata$stage<-as.factor(coldata$stage)
adult<-which(coldata$stage == "adult")
txi_adult <- lapply(txi, function(x) if(is.matrix(x)) return(x[,adult]) else return(x))
dds <- DESeqDataSetFromTximport(txi_adult, coldata[adult,], ~colour)
colnames(dds)<-sample_names[adult]
#dds<-dds[,!colnames(dds) %in% samples_rm]
dds<-dds[rownames(dds) %in% m$gene_id,]run the model
Let´s explore the model
vsdata <- vst(dds_adults, blind = FALSE) ## use instead of vst because of sample size
pcaData <- plotPCA(vsdata, intgroup=c("colour"),ntop=10000, returnData=TRUE)
#pcaData$stage<-dds@colData$stage
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2,col=colour)) +
#geom_text(label=dds_adults@colData$sample_names)+
geom_point(size=5) +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance")) +
theme_minimal()+
scale_colour_manual(values = c("black", "gray"))
It seems that there are two samples G25_A_B7 and
G26_A_W1 that are outliers. Let’s remove them for further
analyses.
let´s filter the dataset to include only juveniles
txi <- tximport(list_dir, type = "salmon",ignoreTxVersion = TRUE,tx2gene = tx2gene)
m <- filter(tab, str_detect(Taxonomic.Lineage, "araneomorphae|arthropoda|insecta"))[,c(1:38)]
#colnames(m)<-c(sample_names,"gene_id","Subject.Sequence")
samples_rm<-c("G25_A_B7","G26_A_W1")
coldata$stage<-as.factor(coldata$stage)
adult<-which(coldata$stage == "adult")
txi_adult <- lapply(txi, function(x) if(is.matrix(x)) return(x[,adult]) else return(x))
dds <- DESeqDataSetFromTximport(txi_adult, coldata[adult,], ~colour)
colnames(dds)<-sample_names[adult]
dds<-dds[,!colnames(dds) %in% samples_rm]
dds<-dds[rownames(dds) %in% m$gene_id,]Re run the model
Let´s explore the model
vsdata <- vst(dds_adults, blind = FALSE) ## use instead of vst because of sample size
pcaData <- plotPCA(vsdata, intgroup=c("colour"),ntop=10000, returnData=TRUE)
#pcaData$stage<-dds@colData$stage
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2,col=colour)) +
#geom_text(label=dds_adults@colData$sample_names)+
geom_point(size=5) +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance")) +
theme_minimal()+
scale_colour_manual(values = c("black", "gray"))Here the differentiation on the PC1 between black-and-white and black is clear!
library("RColorBrewer")
library(pheatmap)
vsd<-vsdata
sampleDists <- dist(t(assay(vsd)))
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- paste(vsd$colour,vsd$stage, sep="-")
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
clustering_distance_rows=sampleDists,
clustering_distance_cols=sampleDists,
col=colors)Extract results
get shrunken log fold changes, specifying the coefficient
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
## Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
## https://doi.org/10.1093/biostatistics/kxw041
Plot the differential expressed transcripts with volcano plot. This visualization help to indetify patterns of differential expression
express_values<-as_tibble(res_shrink)
express_values$gene_id<-rownames(res_shrink)
ggplot(data= express_values, aes(x= log2FoldChange, y= -log10(padj))) +
geom_point(colour= 'grey80',size=3) +
geom_point(data=express_values[abs(express_values$log2FoldChange) >= 1 & express_values$padj <= 0.01,],aes(x=log2FoldChange, y=(-log10(padj))),colour= '#0A3F55',size=3,alpha=0.5)+
geom_vline(xintercept= c(-1, 1), colour= 'black', linetype= 'dashed') +
geom_hline(yintercept= 2, colour= 'black', linetype= 'dashed') +
xlab('log2 fold-change') +
ylab('-log10(Padj)') +
theme_classic()## Warning: Removed 6501 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 83 rows containing missing values or values outside the scale range
## (`geom_point()`).
It seems that there are not many differential expressed transcripts
This plot shows the log2 fold changes over the mean of normalized counts for all the samples
It seems that there are many expressed transcripts
res_sig005<-res_shrink[which(res_shrink$padj < 0.01),]
res_sig001<-res_shrink[which(res_shrink$padj < 0.01),]
nrow(res_sig005)## [1] 814
## [1] 814
There are 814 DE transcripts depending on the P-value threshold
Let’s visualize in a heatmap of the top 50 DE transcripts to check if there is any clear pattern of differential expression
rld_adults <- rlog(dds_adults, blind=FALSE)
# get top 100 log fold change genes (excluding cook's cutoff outliers)
top100_adults <- data.frame(res_sig005) %>%
filter(!is.na(padj)) %>%
arrange(-abs(log2FoldChange)) %>%
rownames() %>%
head(.,n=100)
df <- data.frame(colData(dds_adults)[,"colour"])
rownames(df) <- colnames(dds_adults)
colnames(df) <- "colour"
pheatmap(
assay(rld_adults)[top100_adults,],
cluster_rows=TRUE,
show_rownames=TRUE,
cluster_cols=FALSE,
annotation_col=df,
scale='row'
#,labels_row=ann[ann$Query.Sequence %in% top50_adults,]$Description
)
This plot shows a clear pattern where the black-and-white
individuals express more certain transcripts compared to the black
individuals
Because we only observed significant differences when comparing adults of different colour morphs, we will explore the biological function of this transcripts.
Let’s get the DE transcripts using an alpha of 0.01
res_sig<-res_shrink
de_export<-res_shrink[which(res_shrink$padj<0.01),]
#de_export<-de_export[which(de_export$log2FoldChange<=(-1) | de_export$log2FoldChange>=1),]
de_export<-as.data.frame(de_export)
de_export$gene_id<-rownames(de_export)
write_csv(de_export,"sigDE_adults.txt")
res2 <- cbind(gene_id=rownames(res_sig),res_sig) %>%
data.frame() %>%
filter(!is.na(padj))
res2 <- inner_join(res2,tab,by="gene_id")There 814 differential expressed transcripts
Get the length of the transcripts for the correction with goseq
# vector of transcript lengths
tran_length<-len
names(tran_length)<-c("gene_id","len")
res2 <- inner_join(res2,tran_length,by="gene_id")
res2_unique <- res2[!duplicated(res2$gene_id), ]
de <- res2_unique$pvalue<0.05 & res2_unique$padj < 0.01
names(de) <- res2_unique$gene_idmapping of transcript IDs to category IDs
GOmap <- read.table("../annotation_m1_m3_a/final_results/annotated_gene_ontology_terms.tsv",sep="\t",quote="",comment.char="",header=TRUE) %>%
mutate(gene_id=query_sequence)
#number of DE with GO annottion
length(which(de_export$gene_id %in% GOmap$gene_id))## [1] 287
#modify to include GO terms in file considering are summary in annotation
length(which(!de_export$gene_id %in% rosetta_stone$transcript))## [1] 423
#there are 423 expressed "genes" that wont be represented in the GOmap, let's fix that
genes<-de_export$gene_id[which(!de_export$gene_id %in% rosetta_stone$transcript)]
transcripts<-rosetta_stone$transcript[which(rosetta_stone$gene %in% genes)]
GOmap_extra<-GOmap %>%
filter(gene_id %in% transcripts) %>%
mutate(gene_id = str_remove(gene_id, "_i\\d+$"))
GOmap<-rbind(GOmap,GOmap_extra)
#confirm that the transcript with annotation is higher, it went from 287 to 568
length(which(de_export$gene_id %in% GOmap$gene_id))## [1] 568
match transcripts with GO terms
With this object we can start the analysis
Account for transcript length bias by calculating the probability of being DE based purely on gene length
False discovery rate correction on p-values using Benjamini-Hochberg
GO.wall <- cbind(
GO.wall,
padj_overrepresented=p.adjust(GO.wall$over_represented_pvalue, method="BH"),
padj_underrepresented=p.adjust(GO.wall$under_represented_pvalue, method="BH")
)Names of GO terms
sig_go<-GO.wall[which(GO.wall$padj_overrepresented < 0.001),c("category","term","over_represented_pvalue","padj_overrepresented")]
write_excel_csv(sig_go,"sig_go.csv")Let’s explore the
GO.wall %>%
top_n(15, wt=-over_represented_pvalue) %>%
mutate(hitsPerc=numDEInCat*100/numInCat) %>%
ggplot(aes(x=hitsPerc,
y=term,
colour=over_represented_pvalue,
size=numDEInCat)) +
geom_point() +
expand_limits(x=0) +
labs(x="Hits (%)", y="GO term", colour="p value", size="Count")
### GO terms of interest
top10_go<-GO.wall %>% top_n(10, wt=-over_represented_pvalue)
de_go_transcripts<-tibble(gene_id=character(),go=character(),term=character())
for(go_term in 1:nrow(top10_go)){
gene_id<-go[which(go$go_id == top10_go$category[go_term]),"gene_id"]
de_go<-gene_id[which(gene_id %in% de_export$gene_id)]
tmp<-tibble(gene_id=de_go,go=top10_go$category[go_term],term=top10_go$term[go_term])
de_go_transcripts<-rbind(de_go_transcripts,tmp)
}
de_go_unique<-unique(de_go_transcripts)
pheatmap(
assay(rld_adults)[rownames(rld_adults) %in% de_go_unique$gene_id,],
cluster_rows=TRUE,
show_rownames=TRUE,
cluster_cols=FALSE,
annotation_col=df,
scale='row'
#labels_row=ann[ann$Query.Sequence %in% top50_adults,]$Description
)fold change top10 goterms
de_go_transcripts<- de_go_transcripts %>% mutate(log2FoldChange= as.numeric(NA))
de_go_transcripts<- de_go_transcripts %>% mutate(padj= as.numeric(NA))
for(tt in 1:nrow(de_go_transcripts)){
pos<-which(de_export$gene_id %in% de_go_transcripts$gene_id[tt])
#pos<-grep(de_go_transcripts$gene_id[tt],de_export$gene_id)
de_go_transcripts$log2FoldChange[tt]<-de_export[pos,"log2FoldChange"]
de_go_transcripts$padj[tt]<-de_export[pos,"padj"]
}
de_go_transcripts$term<-factor(de_go_transcripts$term,levels = rev(c("chitin metabolic process","chitin binding","structural constituent of cuticle", "glucosamine-containing compound metabolic process","aminoglycan metabolic process","amino sugar metabolic process","extracellular region","carbohydrate derivative binding","myofibril","contractile fiber")))
na.omit(de_go_transcripts) %>%
ggplot(aes(y = term, x = log2FoldChange, fill = term)) +
ggridges::geom_density_ridges() +
theme_classic() +
geom_vline(xintercept = 0, linetype = "dashed", col = "black", size = 0.4) +
labs(x = "log2FoldChange", y = "GO term") +
scale_fill_manual(values = colorRampPalette(c("#deebf7", "#3182bd", "#08519c"))(length(unique(de_go_transcripts$term)))) +
guides(fill = "none") ## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## i Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Picking joint bandwidth of 0.85
Chitin and cuticle transcripts
cuticle_chitin<-GO.wall %>% top_n(15, wt=-over_represented_pvalue) %>% filter(term %in% c("chitin metabolic process","chitin binding","structural constituent of cuticle", "glucosamine-containing compound metabolic process","aminoglycan metabolic process", "structural constituent of chitin-based cuticle")) #GO:0005214
de_go<-go[go$gene_id %in% de_export$gene_id,]
de_go[de_go$go_id %in% cuticle_chitin$category,] %>% dim()## [1] 213 2
de_go_transcripts<-tibble(gene_id=character(),go=character())
cuticle_transcripts<-de_go[de_go$go_id %in% cuticle_chitin$category,]
for(go_term in 1:nrow(cuticle_chitin)){
gene_id<-go[which(go$go_id == cuticle_chitin$category[go_term]),"gene_id"]
de_go<-gene_id[which(gene_id %in% de_export$gene_id)]
tmp<-tibble(gene_id=de_go,go=cuticle_chitin$category[go_term])
de_go_transcripts<-rbind(de_go_transcripts,tmp)
}
##identify transcripts associated with cuticle/chitin, but with not ontology identification
non_identify<-de_export$transcript_id[which(de_export$gene_id %in% gene_id == FALSE)]
for(tran in non_identify){
cond<-grep("cutic|chitin",ann[grep(tran,ann$Query.Sequence),"Description"])
if(length(cond)>0){
tmp<-tibble(transcript_id=tran,go="blast_match")
de_go_transcripts<-rbind(de_go_transcripts,tmp)
} else {next}
}
##unique transcripts
de_go_transcripts_unique <- de_go_transcripts %>%
group_by(gene_id) %>%
summarize(
transcript_id = paste(unique(gene_id), collapse = " "),
go = paste(unique(go), collapse = " "),
.groups = "drop"
)
pheatmap(
assay(rld_adults)[rownames(rld_adults) %in% de_go_transcripts_unique$gene_id,],
cluster_rows=TRUE,
show_rownames=TRUE,
cluster_cols=FALSE,
annotation_col=df,
scale='row'
#labels_row=ann[ann$Query.Sequence %in% top50_adults,]$Description
)build own volcano plot with custom colours
express_values<-as_tibble(res_shrink)
express_values$gene_id<-rownames(res_shrink)
##specify colours
express_values<-express_values %>% mutate(colour=if_else(gene_id %in% de_go_transcripts_unique$gene_id, "red", NA_character_))
ggplot(data= express_values, aes(x= log2FoldChange, y= -log10(padj))) +
geom_point(colour= 'grey80',size=3) +
geom_point(data=express_values[abs(express_values$log2FoldChange) >= 1 & express_values$padj <= 0.01,],aes(x=log2FoldChange, y=(-log10(padj))),colour= '#0A3F55',size=3,alpha=0.5)+
geom_point(data= express_values[!is.na(express_values$colour) & abs(express_values$log2FoldChange) > 0 & express_values$padj <= 0.01,],aes(x=log2FoldChange, y=(-log10(padj))),colour= '#E93024',size=4) +
geom_vline(xintercept= 0, colour= 'black', linetype= 'dashed') +
geom_hline(yintercept= 2, colour= 'black', linetype= 'dashed') +
xlab('log2 fold-change') +
ylab('-log10(Padj)') +
theme_classic()## Warning: Removed 6501 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 83 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Rows: 10 Columns: 6
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (1): gene_id
## dbl (5): baseMean, log2FoldChange, lfcSE, pvalue, padj
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
transcript_go_melanin<-go[grep("GO:0006582",ann$EggNOG.GO.Biological),"gene_id"]
#transcript_go_melanin<-go[grep("GO:0006582",ann$EggNOG.GO.Biological),"gene_id"]
sig_melanin<-transcript_go_melanin[which(transcript_go_melanin %in% de_juveniles$gene_id)]
length(sig_melanin)## [1] 0
## [1] 1
plots <- list() # To store all plots
for (tt in rownames(sig_melanin)) {
data <- plotCounts(dds_adults, gene = tt, intgroup = c("colour"), returnData = TRUE)
p <- ggplot(data, aes(x=colour, y=count, col=colour)) +
geom_boxplot() +
geom_point() +
scale_colour_manual(values = c("black", "gray")) +
theme_classic()+
ggtitle(paste("Transcript:", tt))
plots[[tt]] <- p # Add the plot to the list
}
# Display two plots at a time in a grid
do.call(grid.arrange, c(plots, nrow = 2))## Rows: 10 Columns: 6
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (1): gene_id
## dbl (5): baseMean, log2FoldChange, lfcSE, pvalue, padj
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
transcript_go_ommochrome<-go[grep("GO:0046152",ann$EggNOG.GO.Biological),"gene_id"]
sig_ommo_juv<-transcript_go_ommochrome[which(transcript_go_ommochrome %in% de_juveniles$gene_id)]
length(sig_ommo_juv)## [1] 1
melanin_transcripts<-c("TRINITY_DN11796_c0_g1_i4","TRINITY_DN447_c0_g2_i1","TRINITY_DN1136_c0_g1_i1","TRINITY_DN119_c0_g1_i6","TRINITY_DN2896_c0_g2_i2","TRINITY_DN23627_c0_g1_i2","TRINITY_DN47018_c0_g1_i1")
melano_ids <- tibble(
original = melanin_transcripts,
stripped = str_remove(melanin_transcripts, "_i\\d+$")
)
sig_melanin <- tibble(
bind_cols(as_tibble(de_juveniles)) %>% # add DE table columns
filter(gene_id %in% melano_ids$original |
gene_id %in% melano_ids$stripped))
nrow(sig_melanin)## [1] 0
melanin_annotated<-tx2gene$gene[tx2gene$transcript %in% melanin_transcripts]
plots <- list() # To store all plots
for (tt in melanin_annotated) {
data <- plotCounts(dds_juveniles, gene = tt, intgroup = c("colour"), returnData = TRUE)
p <- ggplot(data, aes(x=colour, y=count, col=colour)) +
geom_boxplot() +
geom_point() +
scale_colour_manual(values = c("black", "gray")) +
theme_classic()+
ggtitle(paste("Transcript:", tt))
plots[[tt]] <- p # Add the plot to the list
}
# Display two plots at a time in a grid
do.call(grid.arrange, c(plots, nrow = 2))melanin_transcripts<-c("TRINITY_DN11796_c0_g1_i4","TRINITY_DN447_c0_g2_i1","TRINITY_DN1136_c0_g1_i1","TRINITY_DN119_c0_g1_i6","TRINITY_DN2896_c0_g2_i2","TRINITY_DN23627_c0_g1_i2","TRINITY_DN47018_c0_g1_i1")
melano_ids <- tibble(
original = melanin_transcripts,
stripped = str_remove(melanin_transcripts, "_i\\d+$")
)
sig_melanin <- tibble(
bind_cols(as_tibble(de_export)) %>% # add DE table columns
filter(gene_id %in% melano_ids$original |
gene_id %in% melano_ids$stripped))
nrow(sig_melanin)## [1] 1
melanin_annotated<-tx2gene$gene[tx2gene$transcript %in% melanin_transcripts]
plots <- list() # To store all plots
for (tt in melanin_annotated) {
data <- plotCounts(dds_adults, gene = tt, intgroup = c("colour"), returnData = TRUE)
p <- ggplot(data, aes(x=colour, y=count, col=colour)) +
geom_boxplot() +
geom_point() +
scale_colour_manual(values = c("black", "gray")) +
theme_classic()+
ggtitle(paste("Transcript:", tt))
plots[[tt]] <- p # Add the plot to the list
}
# Display two plots at a time in a grid
do.call(grid.arrange, c(plots, nrow = 2))common_transcripts<-rownames(diff_m3_res_sig005)[which(rownames(diff_m3_res_sig005) %in% rownames(res_sig005))]
tt<-tx2gene$transcript[which(tx2gene$gene==common_transcripts)]
info_common_transcripts<-GOmap[which(GOmap$gene_id %in% tt),]
#TRINITY_DN12867_c0_g1_i36=protein turtle-like isoform X1 [Argiope bruennichi]
#TRINITY_DN6814_c0_g1=metalloprotease TIKI1-like [Argiope bruennichi]
#TRINITY_DN7609_c0_g1_i3=40S ribosomal protein S17-like [Argiope bruennichi]There are three common transcripts
ommo_transcripts<-c("TRINITY_DN18310_c0_g1_i3", "TRINITY_DN7113_c0_g1_i13", "TRINITY_DN119_c0_g1_i6", "TRINITY_DN12974_c0_g2_i1", "TRINITY_DN12012_c1_g1_i1")
ommo_ids <- tibble(
original = ommo_transcripts,
stripped = str_remove(ommo_transcripts, "_i\\d+$")
)
sig_ommo_juv <- tibble(
bind_cols(as_tibble(de_juveniles)) %>% # add DE table columns
filter(gene_id %in% ommo_ids$original |
gene_id %in% ommo_ids$stripped))
nrow(sig_ommo_juv)## [1] 0
ommo_annotated<-tx2gene$gene[tx2gene$transcript %in% ommo_transcripts]
plots <- list() # To store all plots
for (tt in ommo_annotated) {
data <- plotCounts(dds_juveniles, gene = tt, intgroup = c("colour"), returnData = TRUE)
p <- ggplot(data, aes(x=colour, y=count, col=colour)) +
geom_boxplot() +
geom_point() +
scale_colour_manual(values = c("black", "gray")) +
theme_classic()+
ggtitle(paste("Transcript:", tt))
plots[[tt]] <- p # Add the plot to the list
}
# Display two plots at a time in a grid
do.call(grid.arrange, c(plots, nrow = 2))ommo_transcripts<-c("TRINITY_DN18310_c0_g1_i3", "TRINITY_DN7113_c0_g1_i13", "TRINITY_DN119_c0_g1_i6", "TRINITY_DN12974_c0_g2_i1", "TRINITY_DN12012_c1_g1_i1")
ommo_ids <- tibble(
original = ommo_transcripts,
stripped = str_remove(ommo_transcripts, "_i\\d+$")
)
sig_ommo <- tibble(
bind_cols(as_tibble(de_export)) %>% # add DE table columns
filter(gene_id %in% ommo_ids$original |
gene_id %in% ommo_ids$stripped))
nrow(sig_ommo)## [1] 2
ommo_annotated<-tx2gene$gene[tx2gene$transcript %in% ommo_transcripts]
plots <- list() # To store all plots
for (tt in ommo_annotated) {
data <- plotCounts(dds_adults, gene = tt, intgroup = c("colour"), returnData = TRUE)
p <- ggplot(data, aes(x=colour, y=count, col=colour)) +
geom_boxplot() +
geom_point() +
scale_colour_manual(values = c("black", "gray")) +
theme_classic()+
ggtitle(paste("Transcript:", tt))
plots[[tt]] <- p # Add the plot to the list
}
# Display two plots at a time in a grid
do.call(grid.arrange, c(plots, nrow = 2))Let’s consolidate all the information about melanin and ommochromes in heatmaps
export transcripts for blast
#use the transcript code instead of "gene"
#adults
DE_transcripts_adults<-de_export$gene_id[which(de_export$gene_id %in% rosetta_stone$transcript)]
DE_miss_genes<-de_export$gene_id[which(!de_export$gene_id %in% rosetta_stone$transcript)]
miss_transcripts<-rosetta_stone$transcript[which(rosetta_stone$gene %in% DE_miss_genes)]
DE_transcripts_adults<-c(DE_transcripts_adults,miss_transcripts)
write_csv(tibble(transcript_id=DE_transcripts_adults),file="DE_adults.csv")
#juveniles
de_juv<-read_csv("sigDE_juveniles.txt")## Rows: 10 Columns: 6
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (1): gene_id
## dbl (5): baseMean, log2FoldChange, lfcSE, pvalue, padj
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
DE_transcripts_juv<-de_juv$gene_id[which(de_juv$gene_id %in% rosetta_stone$transcript)]
DE_miss_genes<-de_juv$gene_id[which(!de_juv$gene_id %in% rosetta_stone$transcript)]
miss_transcripts<-rosetta_stone$transcript[which(rosetta_stone$gene %in% DE_miss_genes)]
DE_transcripts_juv<-c(DE_transcripts_juv,miss_transcripts)
write_csv(tibble(transcript_id=DE_transcripts_juv),file="DE_juv.csv")Add transcripts from ommochromes and melanin with cuticle on volcano plot
## Volcano plot highlighting the transcript for easy visualization
express_values<-as_tibble(res_shrink)
express_values$gene_id<-rownames(res_shrink)
##specify colours
express_values<-express_values %>%
mutate(colour=if_else(gene_id %in% transcript_go_melanin, "go_melanin",
if_else(gene_id %in% melano_ids$original | gene_id %in% melano_ids$stripped, "melanin",
if_else(gene_id %in% transcript_go_ommochrome, "go_ommo",
if_else(gene_id %in% de_go_transcripts_unique$gene_id, "cuticle",
if_else(gene_id %in% ommo_ids$original | gene_id %in% ommo_ids$stripped, "ommo",
NA_character_))))))
ggplot(data= express_values, aes(x= log2FoldChange, y= -log10(padj))) +
#geom_point(colour= 'grey80',size=3) +
geom_point(data=express_values[express_values$padj <= 0.01,],aes(x=log2FoldChange, y=(-log10(padj))),colour= 'grey80',size=3,alpha=0.5)+
geom_point(data= express_values[express_values$padj <= 0.01 & express_values$colour=="cuticle",],aes(x=log2FoldChange, y=(-log10(padj))),colour= 'red',size=3, shape=19) +
geom_point(data= express_values[express_values$padj <= 0.01 & express_values$colour=="go_melanin",],aes(x=log2FoldChange, y=(-log10(padj))),colour= 'black',size=3, shape=17) +
geom_point(data= express_values[express_values$padj <= 0.01 & express_values$colour=="melanin",],aes(x=log2FoldChange, y=(-log10(padj))),colour= 'black',size=3, shape=19) +
geom_point(data= express_values[express_values$padj <= 0.01 & express_values$colour=="go_ommo",],aes(x=log2FoldChange, y=(-log10(padj))),colour= 'orange',size=3, shape=17) +
geom_point(data= express_values[express_values$padj <= 0.01 & express_values$colour=="ommo",],aes(x=log2FoldChange, y=(-log10(padj))),colour= 'orange',size=3, shape=19) +
geom_vline(xintercept= 0, colour= 'black', linetype= 'dashed') +
geom_hline(yintercept= 2, colour= 'black', linetype= 'dashed') +
xlab('log2 fold-change') +
ylab('-log10(Padj)') +
theme_classic()## Warning: Removed 6501 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 7247 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 7247 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 7247 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 7247 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 7247 rows containing missing values or values outside the scale range
## (`geom_point()`).