library(readr)
library(dplyr)
library(ggplot2)
library(tidyverse)
library(DESeq2)
mycounts <- read.csv("C:/Users/~/Desktop/datacounts.csv")
metadata <- read.csv("C:/Users/~/Desktop/datametadata.csv")
countData <- mycounts[!(duplicated(mycounts[[1]]) | duplicated(mycounts[[1]], fromLast=TRUE)), ]
rownames(mycounts) = make.names(mycounts$gene, unique=TRUE)
mycounts <- as.data.frame(mycounts)
metadata <- as.data.frame(metadata)
head(mycounts)
head(metadata)
class(mycounts)
class(metadata)
dds <- DESeqDataSetFromMatrix(countData=mycounts,
colData=metadata,
design=~1,
tidy=TRUE)
dds
sizeFactors(dds)
dispersions(dds)
dds <- DESeq(dds)
res = results(dds, contrast=c("Group", "CNL", "Target"), tidy=TRUE)
res <- as_tibble(res)
res
res %>%
filter(padj<0.05) %>%
write_csv("sigresults.csv")
#smallestGroupSize <- 3
#keep <- rowSums(counts(dds) >= 10) >= smallestGroupSize
#dds <- dds[keep,]
vsdata <- vst(dds, blind=FALSE)
plotPCA(vsdata, intgroup="Group") + geom_label(aes(label = colnames(vsdata))) + labs(title=" ")
df <- as.data.frame(colData(vsdata)[,c("Group","R","NR")])
H <- c("A","B","C")])
mat <- assay(vsdata)[ H, ]
mat <- as.data.frame(mat)
mat1 <- mat %>% arrange(desc(TD))
pheatmap(mat1, annotation_col=df, cluster_rows=F, cluster_cols=F,main = "")
plotCounts(dds, gene="MYC", intgroup="Group", returnData=TRUE) %>%
ggplot(aes(Group, count)) + geom_boxplot(aes(fill=Group)) + scale_y_log10() + ggtitle("MYC")
vsdata <- varianceStabilizingTransformation(dds) (less row)
Volcanoplot
p <- ggplot(data=res, aes(x=log2FoldChange, y=-log10(pvalue))) + geom_point()
# Add more simple "theme"
p <- ggplot(data=res, aes(x=log2FoldChange, y=-log10(pvalue))) + geom_point() + theme_minimal()
# Add vertical lines for log2FoldChange thresholds, and one horizontal line for the p-value threshold
p2 <- p + geom_vline(xintercept=c(-0.6, 0.6), col="red") +
geom_hline(yintercept=-log10(0.05), col="red")
# The significantly differentially expressed genes are the ones found in the upper-left and upper-right corners.
# Add a column to the data frame to specify if they are UP- or DOWN- regulated (log2FoldChange respectively positive or negative)
# add a column of NAs
res$diffexpressed <- "NO"
# if log2Foldchange > 0.6 and pvalue < 0.05, set as "UP"
res$diffexpressed[res$log2FoldChange > 0.6 & res$pvalue < 0.05] <- "UP"
# if log2Foldchange < -0.6 and pvalue < 0.05, set as "DOWN"
res$diffexpressed[res$log2FoldChange < -0.6 & res$pvalue < 0.05] <- "DOWN"
# Re-plot but this time color the points with "diffexpressed"
p <- ggplot(data=res, aes(x=log2FoldChange, y=-log10(pvalue), col=diffexpressed)) + geom_point() + theme_minimal()
# Add lines as before...
p2 <- p + geom_vline(xintercept=c(-0.6, 0.6), col="red") +
geom_hline(yintercept=-log10(0.05), col="red")
## Change point color
# 1. by resfault, it is assigned to the categories in an alphabetical orresr):
p3 <- p2 + scale_color_manual(values=c("blue", "black", "red"))
# 2. to automate a bit: ceate a named vector: the values are the colors to be used, the names are the categories they will be assigned to:
mycolors <- c("blue", "red", "black")
names(mycolors) <- c("DOWN", "UP", "NO")
p3 <- p2 + scale_colour_manual(values = mycolors)
# Now write down the name of genes besires the points...
# Create a new column "reslabel" to res, that will contain the name of genes differentially expressed (NA in case they are not)
res$delabel <- NA
res$delabel[res$diffexpressed != "NO"] <- res$row[res$diffexpressed != "NO"]
ggplot(data=res, aes(x=log2FoldChange, y=-log10(pvalue), col=diffexpressed, label=delabel)) +
geom_point() +
theme_minimal() +
geom_text()
library(ggrepel)
# plot adding up all layers we have seen so far
ggplot(data=res, aes(x=log2FoldChange, y=-log10(pvalue), col=diffexpressed, label=delabel)) +
geom_point() +
theme_minimal() +
geom_text_repel() +
scale_color_manual(values=c("blue", "black", "red")) +
geom_vline(xintercept=c(-0.6, 0.6), col="red") +
geom_hline(yintercept=-log10(0.05), col="red")
plotCounts(dds, gene="FSTL5", intgroup="Treatment", returnData=TRUE) %>%
ggplot(aes(Treatment, count)) + geom_boxplot(aes(fill=Treatment)) + scale_y_log10() + ggtitle("FSTL5")
P + scale_x_discrete(name ="Group", limits=c("CNL","Target"))
library("genefilter")
topVarGenes <- head(order(-rowVars(assay(rld))),30)
The heatmap becomes more interesting if we do not look at absolute expression strength but rather at the amount by which each gene deviates in a specific sample from the gene’s average across all samples. Hence, we center each genes’ values across samples, and plot a heatmap. We provide the column side colors to help identify the treated samples (in blue) from the untreated samples (in grey).
vsdata <- vst(dds, blind=FALSE)
plotPCA(vsdata, intgroup="Group")
plotPCA(vsdata, intgroup="Group") + geom_label(aes(label = colnames(vsdata)))
topVarGenes <- head(order(-rowVars(assay(vsdata))),50)
mat <- assay(vsdata)[ topVarGenes, ]
mat <- mat - rowMeans(mat)
df <- as.data.frame(colData(vsdata)[,c("ID","R","D","M","Treatment")])
pheatmap(mat, annotation_col=df, cluster_rows=FALSE, cluster_cols=FALSE)
res1 <- res %>% filter(padj<0.05) %>% arrange(padj)
res2 <- res1[1:100,] %>% arrange(desc(log2FoldChange))
Sig <- res2$row
mat <- assay(vsdata)[ Sig, ]
mat <- mat - rowMeans(mat)
df <- as.data.frame(colData(vsdata)[,c("ID","Treatment","HPV","PIK3CAmutation","Response.Primary","Responder")])
df <- as.data.frame(colData(vsdata)[,c("ID","Group")])
pheatmap(mat, annotation_col=df, cluster_rows=F, cluster_cols=F)
mat <- assay(vsdata)[ G, ]
mat1 <- as.data.frame(mat)
mat1 <- mat1 %>% arrange(desc(TD3))
pheatmap(mat1, annotation_col=df, cluster_rows=F, cluster_cols=F)
zmat <- scale(tmat1, center = TRUE, scale = TRUE) upon column
sacPlsda <- opls(vsdata, "Treatment", orthoI = 1)
PLS-DA
GSEA
library(org.Hs.eg.db)
ens2symbol <- AnnotationDbi::select(org.Hs.eg.db,
key=res$row,
columns="SYMBOL",
keytype="SYMBOL")
ens2symbol <- as_tibble(ens2symbol)
ens2symbol
res2 <- res %>%
dplyr::select(SYMBOL, stat) %>%
na.omit() %>%
distinct() %>%
group_by(SYMBOL) %>%
summarize(stat=mean(stat))
res2
library(fgsea)
ranks <- deframe(res2)
head(ranks, 20)
pathways.hallmark <- gmtPathways("C:/Users/~/Desktop/GSEA dataset/ h.all.v2023.1.Hs.symbols.gmt")
fgseaRes <- fgsea(pathways=pathways.hallmark, stats=ranks, nperm=1000)
fgseaRes <- fgsea(pathways=pathways.hallmark, stats=ranks, eps = 0.0,
minSize = 15,
maxSize = 500)
fgseaResTidy <- fgseaRes %>%
as_tibble() %>%
arrange(desc(NES))
# Show in a nice table:
fgseaResTidy %>%
dplyr::select(-leadingEdge, -ES, -nMoreExtreme) %>%
arrange(padj)
ggplot(fgseaResTidy, aes(reorder(pathway, NES), NES)) +
geom_col(aes(fill=padj<0.05)) +
coord_flip() +
labs(x="Pathway", y="Normalized Enrichment Score",
title="Hallmark pathways NES from GSEA") +
theme_minimal()
plotEnrichment(pathways.hallmark[["HALLMARK_APOPTOSIS"]],
ranks) + labs(title=" HALLMARK_APOPTOSIS")
fgseaRes <- fgsea(pathways=pathways.hallmark, stats=ranks, nperm=1000)
go <- fgsea(pathways=gmtPathways("C:/Users/~/Desktop/GSEA dataset/c5.all.v7.2.symbols.gmt"), ranks, nperm=1000) %>% as_tibble()
go <- as.matrix(go)
write.csv(go, "go.csv")
pathway | pval | padj | NES | size | |
---|---|---|---|---|---|
1 | HALLMARK_E2F_TARGETS | 0.003125 | 0.016 | 1.50255883270908 | 196 |
2 | HALLMARK_P53_PATHWAY | 0.00307692307692308 | 0.016 | 1.46976609146818 | 193 |
ggplot(fgseaResTidy, aes(reorder(pathway, NES), NES)) +
geom_col(aes(fill=padj<0.05)) +
coord_flip() +
labs(x="Pathway", y="Normalized Enrichment Score",
title="Hallmark pathways NES from GSEA") +
theme_minimal()
fgseaRes <- fgsea(pathways=pathways.hallmark, stats=ranks, nperm=1000)
go <- fgsea(pathways=gmtPathways("C:/Users/~/Desktop/GSEA dataset/c5.all.v7.2.symbols.gmt"), ranks, nperm=1000) %>% as_tibble()
Re <- fgsea(pathways=gmtPathways("C:/Users/~/Downloads/c2.cp.reactome.v2023.1.Hs.symbols.gmt"), ranks, nperm=1000) %>%
as_tibble()
go <- as.matrix(go)
write.csv(go, "go.csv")
Re <- as.matrix(Re)
write.csv(Re, "Re.csv")
hallmark <- fgsea(pathways=gmtPathways("C:/Users/~/Desktop/GSEA dataset/h.all.v7.2.symbols.gmt "), ranks, nperm=1000) %>% as_tibble()
hallmark <- as.matrix(hallmark)
write.csv(hallmark, "hallmark Bulk PRE HPV+ R vs NR.csv")
plotEnrichment(pathways.hallmark[["HALLMARK_INTERFERON_GAMMA_RESPONSE"]],
ranks) + labs(title="HALLMARK_INTERFERON_GAMMA_RESPONSE")