1 Load packages and inputs

1.1 R packages

library(tidyverse)
library(DESeq2)
library(tximport)
library(goseq)
library(EnhancedVolcano)
library(gridExtra)

1.2 Set working directory

#home
#setwd("C:/Users/Fabian C. Salgado/Dropbox/PhD_FabianSalgado/research_proposal/aim4/analyses/expression")
#laptop
setwd("C:/Users/facas/Dropbox/PhD_FabianSalgado/research_proposal/aim4/analyses/expression")

1.3 Account for overrepresentation

Create a tx2gene file that later will be use for over representation analyses

entap<-read.table("../annotation_m1_m3_a/final_results/entap_results.tsv",header=TRUE, sep="\t", quote="", comment.char="")

entap_ann <- entap[!is.na(entap$Description), ] # remove sequences with no annotation

entap_ann <- entap_ann %>%
  mutate(gene = str_remove(.[[1]], "_i\\d+$"))

# Summarize by gene
final_tbl <- entap_ann %>%
  group_by(gene) %>%
  summarise(
    isoforms = n(),                               # number of isoforms
    annotation = paste(unique(Description), collapse = "; "),  # unique annotations
    .groups = "drop"
  )

# Count number of annotations per gene
final_tbl$num_anno <- sapply(strsplit(final_tbl$annotation, ";"), length)

# Check number of "genes" with a single annotation
table(final_tbl$num_anno)

## 
##     1     2     3     4     5     6     7     8 
## 14857  3417  1042   309    95    35     9     2

# check the number of "genes" that where having redundant isoforms
final_tbl[final_tbl$isoforms > 1 & final_tbl$num_anno == 1, ] %>% nrow()

## [1] 3329

df_ann <- entap_ann %>%
  mutate(
    transcript = .[[1]],
    base_gene = str_remove(transcript, "_i\\d+$")
  )

# Step 2: For each gene, check whether multiple annotations exist
rosetta_stone <- df_ann %>%
  group_by(base_gene) %>%
  mutate(
    anno_unique = n_distinct(Subject.Sequence),
    gene = ifelse(anno_unique == 1, base_gene, transcript)
  ) %>%
  ungroup() %>%
  select(transcript, gene)

rosetta_stone

1.4 Annotation file

We will read a table containing our gene annotations from EnTAP

ann <- read.table("../annotation_m1_m3_a/final_results/entap_results.tsv",header=TRUE, sep="\t",quote="", comment.char="")

match GOmap file

GOmap <- read.table("../annotation_m1_m3_a/final_results/annotated_gene_ontology_terms.tsv",sep="\t",quote="",comment.char="",header=TRUE) %>%
  mutate(gene_id=str_remove(query_sequence, ".p[0-9]+$"))

shared_transcripts <- df_ann %>%
  group_by(Subject.Sequence) %>%
  filter(n() > 1) %>%
  pull(transcript)

pos<-which(GOmap$query_sequence %in% shared_transcripts)

GOmap$gene_id[pos] <- GOmap$gene_id[pos] %>% str_remove("_i\\d+$")

1.5 Predicted proteins

Read in information about the predicted proteins from transdecoder

bed <- read.table("../annotation_m1_m3_a/nonredundant_M1_M3_A.fasta.transdecoder.bed", sep="\t", quote="", comment.char="", skip=1) %>%
  mutate(
    pepID=str_extract(V4, "(?<=ID=)[^;]+"),
    score=str_extract(V4, "(?<=score=)[^,]+"))

Get transcript length form this file

len <- read_delim("../assembly_M1_M3_A/trans_len.txt",col_names = FALSE, delim = " ")

## Rows: 335702 Columns: 2
## -- Column specification --------------------------------------------------------
## Delimiter: " "
## chr (1): X1
## dbl (1): X2
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

names(len)<-c("transcript","length")

Annotations were created for the predicted proteins using TransDecoder. This implies:

*Some transcripts might be associated with more than one predicted protein (most of which are likely artifacts).

*Transcript names now have .pX added to them.

We will address this further below:

bed <- select(bed, V1,pepID,score) %>%
  group_by(V1) %>%
  summarize(Transcript.ID=V1[which.max(score)],Peptide.ID=pepID[which.max(score)])

Merge transcript/peptide IDs into annotation

tab <- merge(select(bed,Peptide.ID,Transcript.ID),ann,by.x="Transcript.ID",by.y="Query.Sequence")

1.6 Import raw count data from Salmon

output_dir<-"salmon_m1_m3_a/"

list_dir <- list.files(output_dir, pattern = "quant.sf$", recursive = TRUE, full.names = TRUE)

#sample information
sample_file<-read.table("all_reads.txt")

sample_file<-sample_file[order(sample_file$V2),] #to match the directory order

sample_names<-sample_file$V3

stage<-sample_file$V1 %>% gsub(pattern="\\_\\w+$", replacement="")  
  
colour<-sample_file$V1 %>% gsub(pattern="^\\w+\\_", replacement="")

egg<-gsub(pattern ="\\_3\\w+|\\_A\\w+|\\_1\\w+" ,replacement = "" ,x=sample_names)

coldata<-data.frame(sample_names,stage,colour,egg) # All the sample data in a single dataframe

#import all the result with tximport
txi <- tximport(list_dir, type = "salmon",txOut=TRUE,ignoreTxVersion = TRUE)

## reading in files with read_tsv

## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37

Merge the counts data frame with the annotations

We will modify the EnTAP results to deal with transcripts and lineage contamination

tab <- mutate(
  tab,
  Contaminant = replace_na(Contaminant, "Unknown"),
  Taxonomic.Lineage = replace_na(Taxonomic.Lineage, "Unknown"),
  Prevalence = rowSums(tab[,1:37]>0)
  )

2 Annotation quality

Let´s summarize total expression per sample by annotation status

2.1 Annotation status per samples

Visualize the annotation status of each sample (No database hit, annotated, uninformative hit)

ggplot(byann,aes(fill=AnnStatus,y=Count,x=Sample)) +
  geom_bar(position="stack",stat="identity")

## Summarize expression per sample by contaminant status

Summarize the results in a single data frame

bycontam <- group_by(tab,Contaminant) %>% 
  summarize(across(V1:V37,sum)) %>% # get the summaries
  pivot_longer(!Contaminant,names_to="Sample",values_to="Count") %>%
  mutate(Contaminant = fct_relevel(Contaminant,c("Unknown","Yes","No"))) %>%
  as.data.frame()

Visualise the results

ggplot(bycontam,aes(fill=Contaminant,y=Count,x=Sample)) +
  geom_bar(position="stack",stat="identity")

All the samples have low levels of contamination

2.2 Summarize total expression per sample by taxonomy

Here we will explore how accurate our annotation was. We expect to find that most of our transcripts correspond to arachnid proteins.

Using the column Taxonomic.Lineage, let’s get a fast summary

str_split(tab$Taxonomic.Lineage,";") %>% unlist() %>% table() %>% sort() %>% tail(n=50) %>% t()

##       .
##        vertebrata chordata deuterostomia trichonephila clavipes
##   [1,]        453      464           527                    547
##       .
##        trichonephila inaurata trichonephila inaurata madagascariensis
##   [1,]                    680                                     680
##       .
##        parasteatoda parasteatoda tepidariorum theridiidae endopterygota
##   [1,]          849                       849         852           901
##       .
##        trichonephila clavata stegodyphus dumicola nephila pilipes nephila
##   [1,]                   923                 1032            1132    1136
##       .
##        neoptera dicondylia insecta pterygota hexapoda eresidae eresoidea
##   [1,]     1309       1341    1341      1341     1349     1389      1389
##       .
##        stegodyphus pancrustacea mandibulata trichonephila nephilidae
##   [1,]        1389         1533        1539          2150       3286
##       .
##        araneus ventricosus araneus argiope bruennichi argiope araneidae
##   [1,]                6965    6976              18803   18815     25792
##       .
##        araneoidea orbiculariae entelegynae araneomorphae araneae arachnida
##   [1,]      29930        30305       31704         31709   31717     32346
##       .
##        chelicerata arthropoda panarthropoda ecdysozoa protostomia bilateria
##   [1,]       32515      34054         34056     34159       34507     35034
##       .
##        eumetazoa metazoa opisthokonta eukaryota cellular organisms   root
##   [1,]     35114   35123        35139     35192              35205  35286
##       .
##        Unknown
##   [1,]  300416

str_extract(tab$Taxonomic.Lineage, "[^;]+(?=;root)") %>% table()

## .
## cellular organisms            viruses 
##              35205                 81

str_extract(tab$Taxonomic.Lineage, "[^;]+(?=;cellular organisms)") %>% table()

## .
##   archaea  bacteria eukaryota 
##         3        10     35192

str_extract(tab$Taxonomic.Lineage, "^[^;]+") %>% table() %>% sort() %>% tail(n=50)  %>% t()

##       .
##        hydra vulgaris lasius niger patella vulgata penaeus vannamei
##   [1,]             14           14              14               14
##       .
##        vespa mandarinia bombyx mori bos taurus orbicella faveolata ladona fulva
##   [1,]               14          16         16                  16           17
##       .
##        cherax quadricarinatus graphocephala atropunctata
##   [1,]                     18                         18
##       .
##        mytilus galloprovincialis dermacentor silvarum lygus hesperus
##   [1,]                        18                   19             19
##       .
##        poeciliopsis prolifica dreissena polymorpha harpegnathos saltator
##   [1,]                     19                   21                    21
##       .
##        ixodes ricinus lingula anatina ruditapes philippinarum
##   [1,]             21              21                      22
##       .
##        haemaphysalis longicornis homalodisca liturata cuerna arida
##   [1,]                        24                   24           25
##       .
##        lytechinus pictus ornithodoros moubata saccostrea cucullata
##   [1,]                25                   25                   25
##       .
##        homarus americanus ornithodoros erraticus diaphorina citri
##   [1,]                 29                     29               30
##       .
##        photinus pyralis drosophila melanogaster rhipicephalus microplus
##   [1,]               35                      39                      40
##       .
##        ixodes scapularis mus musculus cryptotermes secundus homo sapiens
##   [1,]                45           48                    49           55
##       .
##        rhipicephalus sanguineus limulus polyphemus centruroides sculpturatus
##   [1,]                       58                164                       206
##       .
##        stegodyphus mimosarum uloborus diversus trichonephila clavipes
##   [1,]                   357               375                    547
##       .
##        trichonephila inaurata madagascariensis parasteatoda tepidariorum
##   [1,]                                     680                       849
##       .
##        trichonephila clavata stegodyphus dumicola nephila pilipes
##   [1,]                   923                 1032            1132
##       .
##        araneus ventricosus argiope bruennichi Unknown
##   [1,]                6965              18803  300416

According to this summary, it seems that we are on the right path. Let’s explore the results per sample, in case there are some samples that we need to remove form further analyses.

For this I will use some categorize of interest

tab <- mutate(tab,
              TaxGroup=case_when(
                str_detect(Taxonomic.Lineage, "archaea") ~ "archaea",
                str_detect(Taxonomic.Lineage, "bacteria") ~ "bacteria",
                str_detect(Taxonomic.Lineage, "fungi") ~ "fungi",
                str_detect(Taxonomic.Lineage, "mammalia") ~ "mamifero",
                str_detect(Taxonomic.Lineage, "araneomorphae") ~ "araneomorphae",
                str_detect(Taxonomic.Lineage, "insecta") ~ "insecta",
                str_detect(Taxonomic.Lineage, "arthropoda") ~ "arthropoda",
                str_detect(Taxonomic.Lineage, "Unknown") ~ "Unknown",
                .default = "Other"
              )
            )

tab <- tab[!is.na(tab$Description),]

tab$gene_id<-rosetta_stone$gene

tab_ref<-tab

Organize length file

#missing_lengths <- map_dfr(missed, function(mi) {
#  matches <- len %>%
#    filter(str_detect(transcript, fixed(mi))) %>%    # grep equivalent
#    filter(!transcript %in% tab$gene_id)

#  tibble(
#    transcript = mi,
#    length = mean(matches$length)
#  )
#})

missing_lengths<-read_csv("missing_len.csv")

## Rows: 19587 Columns: 2
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (1): transcript
## dbl (1): length
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

len <- bind_rows(len, missing_lengths)

get a table summarizing how many RNA fragments map by the defined taxonomic groups

bytax <- group_by(tab,TaxGroup) %>% 
  summarize(across(V1:V37,sum)) %>% 
  pivot_longer(!TaxGroup,names_to="Sample",values_to="Count") %>%
  data.frame()

Let’s visualize the results per sample

ggplot(bytax,aes(fill=TaxGroup,y=Count,x=Sample)) +
  geom_bar(position="stack",stat="identity")

Summarize annotation by taxonomic group

bytax2 <- bytax %>%
  group_by(Sample) %>%
  mutate(
    total_reads = sum(Count, na.rm = TRUE),
    percentage  = 100 * Count / total_reads
  ) %>%
  ungroup()

ci_data <- bytax2 %>%
  group_by(TaxGroup) %>%
  summarise(
    mean_percentage = mean(percentage, na.rm = TRUE),
    sd_percentage   = sd(percentage, na.rm = TRUE),
    n               = n(),
    se              = sd_percentage / sqrt(n),
    lower_CI        = mean_percentage - 1.96 * se,
    upper_CI        = mean_percentage + 1.96 * se
  )

ci_data

Our annotation was good! The majority of transcripts in all our samples correspond to Araneomorphae

3 Differential Expression Analyses

To reduce possible contaminants in our analyses, we first will limit our analyses to transcripts which taxonomical association correspond to Araneomorphae, Arthropoda, or Insecta. With these transcripts, we will build three different models in Deseq2.

The first model will include all samples using developmental stage as a fixed term.
The second model will explore differences between the black-and-white and black colour morphs in juveniles.
The third model will examine differences between the black-and-white and black colour morphs in adults.

3.1 All samples categorize by developmental stage

Data filtering

m <- filter(tab, str_detect(Taxonomic.Lineage, "araneomorphae|arthropoda|insecta"))[,c(1:38)]

Deseq2 model to test using developmental stage as predictor

tx2gene<-rosetta_stone

#remove txOut=TRUE to make gene level sum

txi <- tximport(list_dir, type = "salmon",ignoreTxVersion = TRUE,tx2gene = tx2gene)

dds <- DESeqDataSetFromTximport(txi, coldata, ~stage)

colnames(dds)<-sample_names

dds<-dds[rownames(dds) %in% m$gene_id,]

dds_stage <- DESeq(dds,minReplicatesForReplace=Inf)

Model result visualization

PCA will help use to remove outlier samples fro the dataset that might bias the analysis

vsdata <- vst(dds_stage, blind = FALSE) ## use instead of vst because of sample size
pcaData <- plotPCA(vsdata, intgroup=c("stage"),ntop=10000, returnData=TRUE)
pcaData$colour<-coldata$colour
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, shape=stage,col=colour)) +
  #geom_text(label=coldata$sample_names)+
  geom_point(size=5) +
  xlab(paste0("PC1: ",percentVar[1],"% variance")) +
  ylab(paste0("PC2: ",percentVar[2],"% variance")) + 
  theme_minimal()+
  scale_colour_manual(values = c("black", "brown", "gray"))

It seems that there are five samples at the bottom of the plot that are outliers. We need to remove this samples to properly generate some results.

samples_rm<-c("G14_3_W6","G13_3_W1","G14_3_B4","G14_3_B5","G14_3_B1")

coldata$stage<-factor(coldata$stage,levels = c("molt1","molt3","adult"))

dds <- DESeqDataSetFromTximport(txi, coldata, ~stage)

## using counts and average transcript lengths from tximport

colnames(dds)<-sample_names

dds<-dds[,!colnames(dds) %in% samples_rm]

dds<-dds[rownames(dds) %in% m$gene_id,]

Re run the model

dds_stage <- DESeq(dds,minReplicatesForReplace=Inf)

## estimating size factors

## using 'avgTxLength' from assays(dds), correcting for library size

## estimating dispersions

## gene-wise dispersion estimates

## mean-dispersion relationship

## final dispersion estimates

## fitting model and testing

Model result visualization

Let´s see if the visualization of the samples improve

vsdata <- vst(dds_stage, blind = FALSE) ## use instead of vst because of sample size
pcaData <- plotPCA(vsdata, intgroup=c("stage"),ntop=10000, returnData=TRUE)
pcaData$colour<-dds@colData$colour
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, shape=stage, col=colour)) +
  stat_ellipse(aes(group=factor(stage)), type="norm", level=0.95) +
  geom_point(size=5) +
  xlab(paste0("PC1: ",percentVar[1],"% variance")) +
  ylab(paste0("PC2: ",percentVar[2],"% variance")) + 
  theme_minimal() +
  scale_colour_manual(values = c("black", "brown", "gray"))

## Warning: The following aesthetics were dropped during statistical transformation:
## colour.
## i This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## i Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

We successfully recover the three developmental stages of our experimental design. A similar plot we get when we explore everything on a heatmap

library("RColorBrewer")

## Warning: package 'RColorBrewer' was built under R version 4.1.3

library(pheatmap)

## Warning: package 'pheatmap' was built under R version 4.1.3

vsd<-vsdata
sampleDists <- dist(t(assay(vsd)))
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- paste(vsd$colour,vsd$stage, sep="-")
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
         clustering_distance_rows=sampleDists,
         clustering_distance_cols=sampleDists,
         col=colors)

3.2 Colour morph differential expression juveniles

3.2.1 data filtering

let´s filter the dataset to include only juveniles

txi <- tximport(list_dir, type = "salmon",ignoreTxVersion = TRUE,tx2gene = tx2gene)

m <- filter(tab, str_detect(Taxonomic.Lineage, "araneomorphae|arthropoda|insecta"))[,c(1:38)]

#colnames(m)<-c(sample_names,"gene_id","Subject.Sequence")

samples_rm<-c("G14_3_W6","G13_3_W1","G14_3_B4","G14_3_B5","G14_3_B1")

coldata$stage<-as.factor(coldata$stage)

m3<-which(coldata$stage == "molt3")

txi_m3 <- lapply(txi, function(x) if(is.matrix(x)) return(x[,m3]) else return(x))

dds <- DESeqDataSetFromTximport(txi_m3, coldata[m3,], ~colour)

colnames(dds)<-sample_names[m3]

dds<-dds[,!colnames(dds) %in% samples_rm]

dds<-dds[rownames(dds) %in% m$gene_id,]

Run the model

dds_juveniles <- DESeq(dds,minReplicatesForReplace=Inf)

3.2.2 Model result visualization

Let´s explore the model

vsdata <- vst(dds_juveniles, blind = FALSE) ## use instead of vst because of sample size
pcaData <- plotPCA(vsdata, intgroup=c("colour"),ntop=1000, returnData=TRUE)
#pcaData$stage<-dds@colData$stage
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2,col=colour)) +
  #geom_text(label=dds_juveniles$sample_names)+
  geom_point(shape = 15,size=5) +
  xlab(paste0("PC1: ",percentVar[1],"% variance")) +
  ylab(paste0("PC2: ",percentVar[2],"% variance")) + 
  theme_minimal()+
  scale_colour_manual(values = c("black", "gray"))

We successfully recover the three developmental stages of our experimental design. A similar plot we get when we explore evrything on a heatmap

library("RColorBrewer")
library(pheatmap)
vsd<-vsdata
sampleDists <- dist(t(assay(vsd)))
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- dds_juveniles$sample_names#paste(vsd$colour,vsd$stage, sep="-")
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
         clustering_distance_rows=sampleDists,
         clustering_distance_cols=sampleDists,
         col=colors)

3.2.3 shrunken log2 fold changes

Extract results

res <- results(dds_juveniles, contrast = c('colour', 'black', 'white'))

get shrunken log fold changes, specifying the coefficient

res_shrink <- lfcShrink(dds_juveniles, coef="colour_white_vs_black", type="ashr")

## using 'ashr' for LFC shrinkage. If used in published research, please cite:
##     Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
##     https://doi.org/10.1093/biostatistics/kxw041

3.2.4 Volcano plot

Plot the differential expressed transcripts with volcano plot. This visualization help to identify patterns of differential expression

express_values<-as_tibble(res_shrink)
express_values$gene_id<-rownames(res_shrink)

ggplot(data= express_values, aes(x= log2FoldChange, y= -log10(padj))) +
    geom_point(colour= 'grey80',size=3) +
    geom_point(data=express_values[abs(express_values$log2FoldChange) >= 1 & express_values$padj <= 0.01,],aes(x=log2FoldChange, y=(-log10(padj))),colour= '#0A3F55',size=3)+
    geom_vline(xintercept= c(-1, 1), colour= 'black', linetype= 'dashed') +
    geom_hline(yintercept= 2, colour= 'black', linetype= 'dashed') +
    xlab('log2 fold-change') +
    ylab('-log10(Padj)') +
    theme_classic()

## Warning: Removed 1799 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 40 rows containing missing values or values outside the scale range
## (`geom_point()`).

It seems that there are not many differential expressed transcripts

3.2.5 MA plot

This plot shows the log2 fold changes over the mean of normalized counts for all the samples

plotMA(res_shrink, ylim=c(-4,4))

It seems that there are not many differential expressed transcripts

3.2.6 Significant expressed transcripts

res_sig005<-res_shrink[which(res_shrink$pvalue<0.05 & res_shrink$padj < 0.01),]
res_sig001<-res_shrink[which(res_shrink$pvalue<0.01 & res_shrink$padj < 0.01),]
nrow(res_sig005)

## [1] 10

nrow(res_sig001)

## [1] 10

diff_m3_res_sig005<-res_sig005
diff_m3_res_sig001<-res_sig001

There are 5 DE transcripts independent on the P-value threshold

Let’s visualize in a heatmap of the top 50 DE transcripts to check if there is any clear pattern of differential expression

rld_m3 <- rlog(dds_juveniles, blind=FALSE)

# get top 50 log fold change genes (excluding cook's cutoff outliers)
top50_m3 <- data.frame(res_sig005) %>%
  filter(!is.na(padj)) %>% 
  arrange(-abs(log2FoldChange)) %>% 
  rownames() %>% 
  head(.,n=10) 

df <- data.frame(colData(dds_juveniles)[,"colour"])
rownames(df) <- colnames(dds_juveniles)
colnames(df) <- "colour"

pheatmap(
  assay(rld_m3)[top50_m3,], 
  cluster_rows=TRUE, 
  show_rownames=TRUE,
  cluster_cols=FALSE,
  annotation_col=df,
  scale = 'row'
)

This plot confirms that there is not clear pattern of differtial expression betwen black-and-whit and black juveniles

Let’s get the DE transcripts using an alpha of 0.01

res_sig<-res_shrink

de_export<-res_shrink[which(res_shrink$pvalue<0.05 & res_shrink$padj<0.01),]

de_export<-as.data.frame(de_export)
de_export$gene_id<-rownames(de_export)

write_csv(de_export,"sigDE_juveniles.txt")

res2 <- cbind(gene_id=rownames(res_sig),res_sig) %>% 
  data.frame() %>%
  filter(!is.na(padj))

res2 <- inner_join(res2,tab,by="gene_id")

Get the length of the transcripts for the correction with goseq

# vector of transcript lengths
tran_length<-len
names(tran_length)<-c("gene_id","len")

res2 <- inner_join(res2,tran_length,by="gene_id")

## Warning in inner_join(res2, tran_length, by = "gene_id"): Detected an unexpected many-to-many relationship between `x` and `y`.
## i Row 6 of `x` matches multiple rows in `y`.
## i Row 349404 of `y` matches multiple rows in `x`.
## i If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

res2_unique <- res2[!duplicated(res2$gene_id), ]

de <- res2_unique$pvalue<0.05 & res2_unique$padj < 0.01
names(de) <- res2_unique$gene_id

mapping of transcript IDs to category IDs

#GOmap <- read.table("../annotation_m1_m3_a/final_results/annotated_gene_ontology_terms.tsv",sep="\t",quote="",comment.char="",header=TRUE) %>%
#  mutate(gene_id=str_remove(query_sequence, ".p[0-9]+$"))

match transcripts with GO terms

go <- data.frame(GOmap[,c(5,2)]) %>% filter(gene_id %in% res2_unique$gene_id)

With this object we can start the analysis

3.2.7 analysis

Account for transcript length bias by calculating the probability of being DE based purely on gene length

de_subset<-res2_unique[which(de==TRUE),]
mypwf <- nullp(DEgenes=de,bias.data=res2_unique$len)

GO.wall <- goseq(pwf=mypwf,gene2cat=go)

False discovery rate correction on p-values using Benjamini-Hochberg

GO.wall <- cbind(
  GO.wall,
  padj_overrepresented=p.adjust(GO.wall$over_represented_pvalue, method="BH"),
  padj_underrepresented=p.adjust(GO.wall$under_represented_pvalue, method="BH")
)

Names of GO terms

sig_go<-GO.wall[which(GO.wall$padj_overrepresented < 0.001),c("category","term","over_represented_pvalue","padj_overrepresented")]

write_excel_csv(sig_go,"sig_go_m3.csv")

Let’s explore the

GO.wall %>% 
  top_n(10, wt=-over_represented_pvalue) %>% 
  mutate(hitsPerc=numDEInCat*100/numInCat) %>% 
  ggplot(aes(x=hitsPerc, 
             y=term, 
             colour=over_represented_pvalue, 
             size=numDEInCat)) +
  geom_point() +
  expand_limits(x=0) +
  labs(x="Hits (%)", y="GO term", colour="p value", size="Count")

3.3 Colour morph differential expression adults

3.3.1 data filtering

let´s filter the dataset to include only juveniles

txi <- tximport(list_dir, type = "salmon",ignoreTxVersion = TRUE,tx2gene = tx2gene)

m <- filter(tab, str_detect(Taxonomic.Lineage, "araneomorphae|arthropoda|insecta"))[,c(1:38)]

#colnames(m)<-c(sample_names,"gene_id","Subject.Sequence")

coldata$stage<-as.factor(coldata$stage)

adult<-which(coldata$stage == "adult")

txi_adult <- lapply(txi, function(x) if(is.matrix(x)) return(x[,adult]) else return(x))

dds <- DESeqDataSetFromTximport(txi_adult, coldata[adult,], ~colour)

colnames(dds)<-sample_names[adult]

#dds<-dds[,!colnames(dds) %in% samples_rm]

dds<-dds[rownames(dds) %in% m$gene_id,]

run the model

dds_adults <- DESeq(dds,minReplicatesForReplace=Inf)

3.3.2 Model result visualization

Let´s explore the model

vsdata <- vst(dds_adults, blind = FALSE) ## use instead of vst because of sample size
pcaData <- plotPCA(vsdata, intgroup=c("colour"),ntop=10000, returnData=TRUE)
#pcaData$stage<-dds@colData$stage
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2,col=colour)) +
  #geom_text(label=dds_adults@colData$sample_names)+
  geom_point(size=5) +
  xlab(paste0("PC1: ",percentVar[1],"% variance")) +
  ylab(paste0("PC2: ",percentVar[2],"% variance")) + 
  theme_minimal()+
  scale_colour_manual(values = c("black", "gray"))

It seems that there are two samples G25_A_B7 and G26_A_W1 that are outliers. Let’s remove them for further analyses.

3.3.3 Remove outlier samples

let´s filter the dataset to include only juveniles

txi <- tximport(list_dir, type = "salmon",ignoreTxVersion = TRUE,tx2gene = tx2gene)

m <- filter(tab, str_detect(Taxonomic.Lineage, "araneomorphae|arthropoda|insecta"))[,c(1:38)]

#colnames(m)<-c(sample_names,"gene_id","Subject.Sequence")

samples_rm<-c("G25_A_B7","G26_A_W1")

coldata$stage<-as.factor(coldata$stage)

adult<-which(coldata$stage == "adult")

txi_adult <- lapply(txi, function(x) if(is.matrix(x)) return(x[,adult]) else return(x))

dds <- DESeqDataSetFromTximport(txi_adult, coldata[adult,], ~colour)

colnames(dds)<-sample_names[adult]

dds<-dds[,!colnames(dds) %in% samples_rm]

dds<-dds[rownames(dds) %in% m$gene_id,]

Re run the model

dds_adults <- DESeq(dds,minReplicatesForReplace=Inf)

3.3.4 Model result visualization

Let´s explore the model

vsdata <- vst(dds_adults, blind = FALSE) ## use instead of vst because of sample size
pcaData <- plotPCA(vsdata, intgroup=c("colour"),ntop=10000, returnData=TRUE)
#pcaData$stage<-dds@colData$stage
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2,col=colour)) +
  #geom_text(label=dds_adults@colData$sample_names)+
  geom_point(size=5) +
  xlab(paste0("PC1: ",percentVar[1],"% variance")) +
  ylab(paste0("PC2: ",percentVar[2],"% variance")) + 
  theme_minimal()+
  scale_colour_manual(values = c("black", "gray"))

Here the differentiation on the PC1 between black-and-white and black is clear!

library("RColorBrewer")
library(pheatmap)
vsd<-vsdata
sampleDists <- dist(t(assay(vsd)))
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- paste(vsd$colour,vsd$stage, sep="-")
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
         clustering_distance_rows=sampleDists,
         clustering_distance_cols=sampleDists,
         col=colors)

3.3.5 shrunken log2 fold changes

Extract results

res <- results(dds_adults, contrast = c('colour', 'black', 'white'))

get shrunken log fold changes, specifying the coefficient

res_shrink <- lfcShrink(dds_adults, coef="colour_white_vs_black", type="ashr")

## using 'ashr' for LFC shrinkage. If used in published research, please cite:
##     Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
##     https://doi.org/10.1093/biostatistics/kxw041

3.3.6 Volcano plot

Plot the differential expressed transcripts with volcano plot. This visualization help to indetify patterns of differential expression

express_values<-as_tibble(res_shrink)
express_values$gene_id<-rownames(res_shrink)

ggplot(data= express_values, aes(x= log2FoldChange, y= -log10(padj))) +
    geom_point(colour= 'grey80',size=3) +
    geom_point(data=express_values[abs(express_values$log2FoldChange) >= 1 & express_values$padj <= 0.01,],aes(x=log2FoldChange, y=(-log10(padj))),colour= '#0A3F55',size=3,alpha=0.5)+
    geom_vline(xintercept= c(-1, 1), colour= 'black', linetype= 'dashed') +
    geom_hline(yintercept= 2, colour= 'black', linetype= 'dashed') +
    xlab('log2 fold-change') +
    ylab('-log10(Padj)') +
    theme_classic()

## Warning: Removed 6501 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 83 rows containing missing values or values outside the scale range
## (`geom_point()`).

It seems that there are not many differential expressed transcripts

3.3.7 MA plot

This plot shows the log2 fold changes over the mean of normalized counts for all the samples

plotMA(res_shrink, ylim=c(-4,4))

It seems that there are many expressed transcripts

3.3.8 Significant expressed transcripts

res_sig005<-res_shrink[which(res_shrink$padj < 0.01),]
res_sig001<-res_shrink[which(res_shrink$padj < 0.01),]
nrow(res_sig005)

## [1] 814

nrow(res_sig001)

## [1] 814

There are 814 DE transcripts depending on the P-value threshold

Let’s visualize in a heatmap of the top 50 DE transcripts to check if there is any clear pattern of differential expression

rld_adults <- rlog(dds_adults, blind=FALSE)

# get top 100 log fold change genes (excluding cook's cutoff outliers)
top100_adults <- data.frame(res_sig005) %>%
  filter(!is.na(padj)) %>% 
  arrange(-abs(log2FoldChange)) %>% 
  rownames() %>% 
  head(.,n=100) 

df <- data.frame(colData(dds_adults)[,"colour"])
rownames(df) <- colnames(dds_adults)
colnames(df) <- "colour"

pheatmap(
  assay(rld_adults)[top100_adults,], 
  cluster_rows=TRUE, 
  show_rownames=TRUE,
  cluster_cols=FALSE,
  annotation_col=df,
  scale='row'
  #,labels_row=ann[ann$Query.Sequence %in% top50_adults,]$Description
)

This plot shows a clear pattern where the black-and-white individuals express more certain transcripts compared to the black individuals

3.4 Gene ontology adults

Because we only observed significant differences when comparing adults of different colour morphs, we will explore the biological function of this transcripts.

3.4.1 input objects

Let’s get the DE transcripts using an alpha of 0.01

res_sig<-res_shrink

de_export<-res_shrink[which(res_shrink$padj<0.01),]
#de_export<-de_export[which(de_export$log2FoldChange<=(-1) | de_export$log2FoldChange>=1),]
de_export<-as.data.frame(de_export)
de_export$gene_id<-rownames(de_export)

write_csv(de_export,"sigDE_adults.txt")

res2 <- cbind(gene_id=rownames(res_sig),res_sig) %>% 
  data.frame() %>%
  filter(!is.na(padj))

res2 <- inner_join(res2,tab,by="gene_id")

There 814 differential expressed transcripts

Get the length of the transcripts for the correction with goseq

# vector of transcript lengths
tran_length<-len
names(tran_length)<-c("gene_id","len")

res2 <- inner_join(res2,tran_length,by="gene_id")

res2_unique <- res2[!duplicated(res2$gene_id), ]

de <- res2_unique$pvalue<0.05 & res2_unique$padj < 0.01
names(de) <- res2_unique$gene_id

mapping of transcript IDs to category IDs

GOmap <- read.table("../annotation_m1_m3_a/final_results/annotated_gene_ontology_terms.tsv",sep="\t",quote="",comment.char="",header=TRUE) %>%
 mutate(gene_id=query_sequence)

#number of DE with GO annottion
length(which(de_export$gene_id %in% GOmap$gene_id))

## [1] 287

#modify to include GO terms in file considering are summary in annotation
length(which(!de_export$gene_id %in% rosetta_stone$transcript))

## [1] 423

#there are 423 expressed "genes" that wont be represented in the GOmap, let's fix that
genes<-de_export$gene_id[which(!de_export$gene_id %in% rosetta_stone$transcript)]

transcripts<-rosetta_stone$transcript[which(rosetta_stone$gene %in% genes)]

GOmap_extra<-GOmap %>%
  filter(gene_id %in% transcripts) %>%
  mutate(gene_id = str_remove(gene_id, "_i\\d+$"))

GOmap<-rbind(GOmap,GOmap_extra)

#confirm that the transcript with annotation is higher, it went from 287 to 568
length(which(de_export$gene_id %in% GOmap$gene_id))

## [1] 568

match transcripts with GO terms

go <- data.frame(GOmap[,c(5,2)]) %>% filter(gene_id %in% res2_unique$gene_id)

With this object we can start the analysis

3.4.2 analysis

Account for transcript length bias by calculating the probability of being DE based purely on gene length

mypwf <- nullp(DEgenes=de,bias.data=res2_unique$len)

GO.wall <- goseq(pwf=mypwf,gene2cat=go)

False discovery rate correction on p-values using Benjamini-Hochberg

GO.wall <- cbind(
  GO.wall,
  padj_overrepresented=p.adjust(GO.wall$over_represented_pvalue, method="BH"),
  padj_underrepresented=p.adjust(GO.wall$under_represented_pvalue, method="BH")
)

Names of GO terms

sig_go<-GO.wall[which(GO.wall$padj_overrepresented < 0.001),c("category","term","over_represented_pvalue","padj_overrepresented")]

write_excel_csv(sig_go,"sig_go.csv")

Let’s explore the

GO.wall %>% 
  top_n(15, wt=-over_represented_pvalue) %>% 
  mutate(hitsPerc=numDEInCat*100/numInCat) %>% 
  ggplot(aes(x=hitsPerc, 
             y=term, 
             colour=over_represented_pvalue, 
             size=numDEInCat)) +
  geom_point() +
  expand_limits(x=0) +
  labs(x="Hits (%)", y="GO term", colour="p value", size="Count")

### GO terms of interest

top10_go<-GO.wall %>% top_n(10, wt=-over_represented_pvalue)

de_go_transcripts<-tibble(gene_id=character(),go=character(),term=character())

for(go_term in 1:nrow(top10_go)){
  
  gene_id<-go[which(go$go_id == top10_go$category[go_term]),"gene_id"]
  
  de_go<-gene_id[which(gene_id %in% de_export$gene_id)]
  
  tmp<-tibble(gene_id=de_go,go=top10_go$category[go_term],term=top10_go$term[go_term])
  
  de_go_transcripts<-rbind(de_go_transcripts,tmp)
}

de_go_unique<-unique(de_go_transcripts)

pheatmap(
  assay(rld_adults)[rownames(rld_adults) %in% de_go_unique$gene_id,], 
  cluster_rows=TRUE, 
  show_rownames=TRUE,
  cluster_cols=FALSE,
  annotation_col=df,
  scale='row'
  #labels_row=ann[ann$Query.Sequence %in% top50_adults,]$Description
)

fold change top10 goterms

de_go_transcripts<- de_go_transcripts %>% mutate(log2FoldChange= as.numeric(NA))

de_go_transcripts<- de_go_transcripts %>% mutate(padj= as.numeric(NA))

for(tt in 1:nrow(de_go_transcripts)){
  pos<-which(de_export$gene_id %in% de_go_transcripts$gene_id[tt])
  #pos<-grep(de_go_transcripts$gene_id[tt],de_export$gene_id)
  de_go_transcripts$log2FoldChange[tt]<-de_export[pos,"log2FoldChange"]
  de_go_transcripts$padj[tt]<-de_export[pos,"padj"]
}

de_go_transcripts$term<-factor(de_go_transcripts$term,levels = rev(c("chitin metabolic process","chitin binding","structural constituent of cuticle", "glucosamine-containing compound metabolic process","aminoglycan metabolic process","amino sugar metabolic process","extracellular region","carbohydrate derivative binding","myofibril","contractile fiber")))

na.omit(de_go_transcripts) %>% 
  ggplot(aes(y = term, x = log2FoldChange, fill = term)) +
  ggridges::geom_density_ridges() +
  theme_classic() +
  geom_vline(xintercept = 0, linetype = "dashed", col = "black", size = 0.4) +
  labs(x = "log2FoldChange", y = "GO term") +
  scale_fill_manual(values = colorRampPalette(c("#deebf7", "#3182bd", "#08519c"))(length(unique(de_go_transcripts$term)))) +
  guides(fill = "none")

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## i Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Picking joint bandwidth of 0.85

Chitin and cuticle transcripts

cuticle_chitin<-GO.wall %>% top_n(15, wt=-over_represented_pvalue) %>% filter(term %in% c("chitin metabolic process","chitin binding","structural constituent of cuticle", "glucosamine-containing compound metabolic process","aminoglycan metabolic process", "structural constituent of chitin-based cuticle")) #GO:0005214  

de_go<-go[go$gene_id %in% de_export$gene_id,]

de_go[de_go$go_id %in% cuticle_chitin$category,] %>% dim()

## [1] 213   2

de_go_transcripts<-tibble(gene_id=character(),go=character())

cuticle_transcripts<-de_go[de_go$go_id %in% cuticle_chitin$category,]

for(go_term in 1:nrow(cuticle_chitin)){

gene_id<-go[which(go$go_id == cuticle_chitin$category[go_term]),"gene_id"]

de_go<-gene_id[which(gene_id %in% de_export$gene_id)]

tmp<-tibble(gene_id=de_go,go=cuticle_chitin$category[go_term])

de_go_transcripts<-rbind(de_go_transcripts,tmp)
}


##identify transcripts associated with cuticle/chitin, but with not ontology identification
non_identify<-de_export$transcript_id[which(de_export$gene_id %in% gene_id == FALSE)]

for(tran in non_identify){
  cond<-grep("cutic|chitin",ann[grep(tran,ann$Query.Sequence),"Description"])
  if(length(cond)>0){
    tmp<-tibble(transcript_id=tran,go="blast_match")
    de_go_transcripts<-rbind(de_go_transcripts,tmp)
  } else {next}
}


##unique transcripts
de_go_transcripts_unique <- de_go_transcripts %>%
  group_by(gene_id) %>%
  summarize(
    transcript_id = paste(unique(gene_id), collapse = " "),
    go = paste(unique(go), collapse = " "),
    .groups = "drop"
  )

pheatmap(
  assay(rld_adults)[rownames(rld_adults) %in% de_go_transcripts_unique$gene_id,], 
  cluster_rows=TRUE, 
  show_rownames=TRUE,
  cluster_cols=FALSE,
  annotation_col=df,
  scale='row'
  #labels_row=ann[ann$Query.Sequence %in% top50_adults,]$Description
)

build own volcano plot with custom colours

express_values<-as_tibble(res_shrink)
express_values$gene_id<-rownames(res_shrink)
##specify colours
express_values<-express_values %>% mutate(colour=if_else(gene_id %in% de_go_transcripts_unique$gene_id, "red", NA_character_))


ggplot(data= express_values, aes(x= log2FoldChange, y= -log10(padj))) +
    geom_point(colour= 'grey80',size=3) +
    geom_point(data=express_values[abs(express_values$log2FoldChange) >= 1 & express_values$padj <= 0.01,],aes(x=log2FoldChange, y=(-log10(padj))),colour= '#0A3F55',size=3,alpha=0.5)+
    geom_point(data= express_values[!is.na(express_values$colour) & abs(express_values$log2FoldChange) > 0 & express_values$padj <= 0.01,],aes(x=log2FoldChange, y=(-log10(padj))),colour= '#E93024',size=4) +
    geom_vline(xintercept= 0, colour= 'black', linetype= 'dashed') +
    geom_hline(yintercept= 2, colour= 'black', linetype= 'dashed') +
    xlab('log2 fold-change') +
    ylab('-log10(Padj)') +
    theme_classic()

## Warning: Removed 6501 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 83 rows containing missing values or values outside the scale range
## (`geom_point()`).

3.4.2.1 Melanin

3.4.2.1.1 Juveniles

de_juveniles<-read_csv("sigDE_juveniles.txt")

## Rows: 10 Columns: 6
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (1): gene_id
## dbl (5): baseMean, log2FoldChange, lfcSE, pvalue, padj
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

transcript_go_melanin<-go[grep("GO:0006582",ann$EggNOG.GO.Biological),"gene_id"]

#transcript_go_melanin<-go[grep("GO:0006582",ann$EggNOG.GO.Biological),"gene_id"]

sig_melanin<-transcript_go_melanin[which(transcript_go_melanin %in% de_juveniles$gene_id)]

length(sig_melanin)

## [1] 0

3.4.2.1.2 Adults

sig_melanin<-de_export[which(de_export$gene_id %in% transcript_go_melanin),]

nrow(sig_melanin)

## [1] 1

plots <- list()  # To store all plots
for (tt in rownames(sig_melanin)) {
  data <- plotCounts(dds_adults, gene = tt, intgroup = c("colour"), returnData = TRUE)
  p <- ggplot(data, aes(x=colour, y=count, col=colour)) +
    geom_boxplot() +
    geom_point() +
    scale_colour_manual(values = c("black", "gray")) +
    theme_classic()+
    ggtitle(paste("Transcript:", tt)) 
  plots[[tt]] <- p  # Add the plot to the list
}

# Display two plots at a time in a grid
do.call(grid.arrange, c(plots, nrow = 2))

#TRINITY_DN10_c0_g1_i4=XP_055946152.1 cation-independent mannose-6-phosphate receptor-like

3.4.2.2 Ommochromes

3.4.2.2.1 Juveniles

de_juveniles<-read_csv("sigDE_juveniles.txt")

## Rows: 10 Columns: 6
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (1): gene_id
## dbl (5): baseMean, log2FoldChange, lfcSE, pvalue, padj
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

transcript_go_ommochrome<-go[grep("GO:0046152",ann$EggNOG.GO.Biological),"gene_id"]

sig_ommo_juv<-transcript_go_ommochrome[which(transcript_go_ommochrome %in% de_juveniles$gene_id)]

length(sig_ommo_juv)

## [1] 1

#"TRINITY_DN14166_c0_g1_i4"=polyadenylate-binding protein 2-like isoform X2 [Argiope bruennichi]

3.4.2.2.2 Adults

sig_ommo_adults<-de_export[which(rownames(de_export) %in% transcript_go_ommochrome),]

nrow(sig_ommo_adults)

## [1] 1

#identity: TRINITY_DN32391_c0_g1_i16=Serine/threonine-protein kinase

3.5 Normalized counts DE transcripts

3.6 Normalized counts melanin associated transcripts

3.6.1 Juveniles

melanin_transcripts<-c("TRINITY_DN11796_c0_g1_i4","TRINITY_DN447_c0_g2_i1","TRINITY_DN1136_c0_g1_i1","TRINITY_DN119_c0_g1_i6","TRINITY_DN2896_c0_g2_i2","TRINITY_DN23627_c0_g1_i2","TRINITY_DN47018_c0_g1_i1")

melano_ids <- tibble(
  original = melanin_transcripts,
  stripped = str_remove(melanin_transcripts, "_i\\d+$")
)

sig_melanin <- tibble(
  bind_cols(as_tibble(de_juveniles)) %>%        # add DE table columns
  filter(gene_id %in% melano_ids$original |
         gene_id %in% melano_ids$stripped))

nrow(sig_melanin)

## [1] 0

melanin_annotated<-tx2gene$gene[tx2gene$transcript %in% melanin_transcripts]

plots <- list()  # To store all plots
for (tt in melanin_annotated) {
  data <- plotCounts(dds_juveniles, gene = tt, intgroup = c("colour"), returnData = TRUE)
  p <- ggplot(data, aes(x=colour, y=count, col=colour)) +
    geom_boxplot() +
    geom_point() +
    scale_colour_manual(values = c("black", "gray")) +
    theme_classic()+
    ggtitle(paste("Transcript:", tt)) 
  plots[[tt]] <- p  # Add the plot to the list
}




# Display two plots at a time in a grid
do.call(grid.arrange, c(plots, nrow = 2))

3.6.2 Adults

melanin_transcripts<-c("TRINITY_DN11796_c0_g1_i4","TRINITY_DN447_c0_g2_i1","TRINITY_DN1136_c0_g1_i1","TRINITY_DN119_c0_g1_i6","TRINITY_DN2896_c0_g2_i2","TRINITY_DN23627_c0_g1_i2","TRINITY_DN47018_c0_g1_i1")

melano_ids <- tibble(
  original = melanin_transcripts,
  stripped = str_remove(melanin_transcripts, "_i\\d+$")
)

sig_melanin <- tibble(
  bind_cols(as_tibble(de_export)) %>%        # add DE table columns
  filter(gene_id %in% melano_ids$original |
         gene_id %in% melano_ids$stripped))

nrow(sig_melanin)

## [1] 1

melanin_annotated<-tx2gene$gene[tx2gene$transcript %in% melanin_transcripts]

plots <- list()  # To store all plots
for (tt in melanin_annotated) {
  data <- plotCounts(dds_adults, gene = tt, intgroup = c("colour"), returnData = TRUE)
  p <- ggplot(data, aes(x=colour, y=count, col=colour)) +
    geom_boxplot() +
    geom_point() +
    scale_colour_manual(values = c("black", "gray")) +
    theme_classic()+
    ggtitle(paste("Transcript:", tt)) 
  plots[[tt]] <- p  # Add the plot to the list
}

# Display two plots at a time in a grid
do.call(grid.arrange, c(plots, nrow = 2))

3.7 Common transcripts adults and juveniles

common_transcripts<-rownames(diff_m3_res_sig005)[which(rownames(diff_m3_res_sig005) %in% rownames(res_sig005))]

tt<-tx2gene$transcript[which(tx2gene$gene==common_transcripts)]

info_common_transcripts<-GOmap[which(GOmap$gene_id %in% tt),]

#TRINITY_DN12867_c0_g1_i36=protein turtle-like isoform X1 [Argiope bruennichi]
#TRINITY_DN6814_c0_g1=metalloprotease TIKI1-like [Argiope bruennichi]
#TRINITY_DN7609_c0_g1_i3=40S ribosomal protein S17-like [Argiope bruennichi]

There are three common transcripts

3.8 Normalized counts Ommochrome associated transcripts

3.8.1 Juveniles

ommo_transcripts<-c("TRINITY_DN18310_c0_g1_i3", "TRINITY_DN7113_c0_g1_i13", "TRINITY_DN119_c0_g1_i6", "TRINITY_DN12974_c0_g2_i1", "TRINITY_DN12012_c1_g1_i1")

ommo_ids <- tibble(
  original = ommo_transcripts,
  stripped = str_remove(ommo_transcripts, "_i\\d+$")
)

sig_ommo_juv <- tibble(
  bind_cols(as_tibble(de_juveniles)) %>%        # add DE table columns
  filter(gene_id %in% ommo_ids$original |
         gene_id %in% ommo_ids$stripped))

nrow(sig_ommo_juv)

## [1] 0

ommo_annotated<-tx2gene$gene[tx2gene$transcript %in% ommo_transcripts]

plots <- list()  # To store all plots
for (tt in ommo_annotated) {
  data <- plotCounts(dds_juveniles, gene = tt, intgroup = c("colour"), returnData = TRUE)
  p <- ggplot(data, aes(x=colour, y=count, col=colour)) +
    geom_boxplot() +
    geom_point() +
    scale_colour_manual(values = c("black", "gray")) +
    theme_classic()+
    ggtitle(paste("Transcript:", tt)) 
  plots[[tt]] <- p  # Add the plot to the list
}

# Display two plots at a time in a grid
do.call(grid.arrange, c(plots, nrow = 2))

3.8.2 Adults

ommo_transcripts<-c("TRINITY_DN18310_c0_g1_i3", "TRINITY_DN7113_c0_g1_i13", "TRINITY_DN119_c0_g1_i6", "TRINITY_DN12974_c0_g2_i1", "TRINITY_DN12012_c1_g1_i1")

ommo_ids <- tibble(
  original = ommo_transcripts,
  stripped = str_remove(ommo_transcripts, "_i\\d+$")
)

sig_ommo <- tibble(
  bind_cols(as_tibble(de_export)) %>%        # add DE table columns
  filter(gene_id %in% ommo_ids$original |
         gene_id %in% ommo_ids$stripped))

nrow(sig_ommo)

## [1] 2

ommo_annotated<-tx2gene$gene[tx2gene$transcript %in% ommo_transcripts]

plots <- list()  # To store all plots
for (tt in ommo_annotated) {
  data <- plotCounts(dds_adults, gene = tt, intgroup = c("colour"), returnData = TRUE)
  p <- ggplot(data, aes(x=colour, y=count, col=colour)) +
    geom_boxplot() +
    geom_point() +
    scale_colour_manual(values = c("black", "gray")) +
    theme_classic()+
    ggtitle(paste("Transcript:", tt)) 
  plots[[tt]] <- p  # Add the plot to the list
}

# Display two plots at a time in a grid
do.call(grid.arrange, c(plots, nrow = 2))

#TRINITY_DN119_c0_g1=peroxidase-like [Argiope bruennichi]
#TRINITY_DN18310_c0_g1_i3=tryptophan 2,3-dioxygenase-like isoform X1

3.8.3 plots

Let’s consolidate all the information about melanin and ommochromes in heatmaps

export transcripts for blast

#use the transcript code instead of "gene"

#adults
DE_transcripts_adults<-de_export$gene_id[which(de_export$gene_id %in% rosetta_stone$transcript)]

DE_miss_genes<-de_export$gene_id[which(!de_export$gene_id %in% rosetta_stone$transcript)]

miss_transcripts<-rosetta_stone$transcript[which(rosetta_stone$gene %in% DE_miss_genes)]

DE_transcripts_adults<-c(DE_transcripts_adults,miss_transcripts)

write_csv(tibble(transcript_id=DE_transcripts_adults),file="DE_adults.csv")

#juveniles
de_juv<-read_csv("sigDE_juveniles.txt")

## Rows: 10 Columns: 6
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (1): gene_id
## dbl (5): baseMean, log2FoldChange, lfcSE, pvalue, padj
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

DE_transcripts_juv<-de_juv$gene_id[which(de_juv$gene_id %in% rosetta_stone$transcript)]

DE_miss_genes<-de_juv$gene_id[which(!de_juv$gene_id %in% rosetta_stone$transcript)]

miss_transcripts<-rosetta_stone$transcript[which(rosetta_stone$gene %in% DE_miss_genes)]

DE_transcripts_juv<-c(DE_transcripts_juv,miss_transcripts)

write_csv(tibble(transcript_id=DE_transcripts_juv),file="DE_juv.csv")

Add transcripts from ommochromes and melanin with cuticle on volcano plot

## Volcano plot highlighting the transcript for easy visualization 

express_values<-as_tibble(res_shrink)
express_values$gene_id<-rownames(res_shrink)
##specify colours
express_values<-express_values %>% 
  mutate(colour=if_else(gene_id %in% transcript_go_melanin, "go_melanin",
                if_else(gene_id %in% melano_ids$original | gene_id %in% melano_ids$stripped, "melanin",
                if_else(gene_id %in% transcript_go_ommochrome, "go_ommo",
                if_else(gene_id %in% de_go_transcripts_unique$gene_id, "cuticle",
                if_else(gene_id %in% ommo_ids$original | gene_id %in% ommo_ids$stripped, "ommo",
                        NA_character_))))))


ggplot(data= express_values, aes(x= log2FoldChange, y= -log10(padj))) +
    #geom_point(colour= 'grey80',size=3) +
  geom_point(data=express_values[express_values$padj <= 0.01,],aes(x=log2FoldChange, y=(-log10(padj))),colour= 'grey80',size=3,alpha=0.5)+
  geom_point(data= express_values[express_values$padj <= 0.01 & express_values$colour=="cuticle",],aes(x=log2FoldChange, y=(-log10(padj))),colour= 'red',size=3, shape=19) +
    geom_point(data= express_values[express_values$padj <= 0.01 & express_values$colour=="go_melanin",],aes(x=log2FoldChange, y=(-log10(padj))),colour= 'black',size=3, shape=17) +
      geom_point(data= express_values[express_values$padj <= 0.01 & express_values$colour=="melanin",],aes(x=log2FoldChange, y=(-log10(padj))),colour= 'black',size=3, shape=19) +
      geom_point(data= express_values[express_values$padj <= 0.01 & express_values$colour=="go_ommo",],aes(x=log2FoldChange, y=(-log10(padj))),colour= 'orange',size=3, shape=17) +
      geom_point(data= express_values[express_values$padj <= 0.01 & express_values$colour=="ommo",],aes(x=log2FoldChange, y=(-log10(padj))),colour= 'orange',size=3, shape=19) +
    geom_vline(xintercept= 0, colour= 'black', linetype= 'dashed') +
    geom_hline(yintercept= 2, colour= 'black', linetype= 'dashed') +
    xlab('log2 fold-change') +
    ylab('-log10(Padj)') +
    theme_classic()

## Warning: Removed 6501 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 7247 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 7247 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 7247 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 7247 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 7247 rows containing missing values or values outside the scale range
## (`geom_point()`).

#ommo=tryptophan 2,3-dioxygenase-like isoform X1
#go_ommo=uncharacterized protein LOC129971514 [Argiope bruennichi] | Serine/threonine-protein kinase prp4 like protein [Argiope bruennichi]

Differential_expression_Amix

Fabian C. Salgado-Roa

2026-01-12

1 Load packages and inputs

1.1 R packages

1.2 Set working directory

1.3 Account for overrepresentation

1.4 Annotation file

1.5 Predicted proteins

1.6 Import raw count data from Salmon

2 Annotation quality

2.1 Annotation status per samples

2.2 Summarize total expression per sample by taxonomy

3 Differential Expression Analyses

3.1 All samples categorize by developmental stage

3.2 Colour morph differential expression juveniles

3.2.1 data filtering

3.2.2 Model result visualization

3.2.3 shrunken log2 fold changes

3.2.4 Volcano plot

3.2.5 MA plot

3.2.6 Significant expressed transcripts

3.2.7 analysis

3.3 Colour morph differential expression adults

3.3.1 data filtering

3.3.2 Model result visualization

3.3.3 Remove outlier samples

3.3.4 Model result visualization

3.3.5 shrunken log2 fold changes

3.3.6 Volcano plot

3.3.7 MA plot

3.3.8 Significant expressed transcripts

3.4 Gene ontology adults

3.4.1 input objects

3.4.2 analysis

3.4.2.1 Melanin

3.4.2.1.1 Juveniles

3.4.2.1.2 Adults

3.4.2.2 Ommochromes

3.4.2.2.1 Juveniles

3.4.2.2.2 Adults

3.5 Normalized counts DE transcripts

3.6 Normalized counts melanin associated transcripts

3.6.1 Juveniles

3.6.2 Adults

3.7 Common transcripts adults and juveniles

3.8 Normalized counts Ommochrome associated transcripts

3.8.1 Juveniles

3.8.2 Adults

3.8.3 plots