longteng

隨筆記錄, 工作日記.
正文

storehouse1

(2024-11-22 19:09:54) 下一個

cat IN1_S1_L001_R1_001.fastq IN1_S1_L002_R1_001.fastq > IN1_S1_concatenated_R1_001.fastq

cat IN1_S1_L001_R2_001.fastq IN1_S1_L002_R2_001.fastq > IN1_S1_concatenated_R2_001.fastq

https://bioinformatics-core-shared-training.github.io/RNAseq-R/align-and-count.nb.html

library(readr)

library(dplyr)

library(ggplot2)

library(tidyverse)

library(DESeq2)

mycounts <- read.csv("C:/Users/?/Desktop/mycounts.csv")

metadata <-  read.csv("C:/Users/?/Desktop/metadata.csv")

#countData <- mycounts[!(duplicated(mycounts[[1]]) | duplicated(mycounts[[1]], fromLast=TRUE)), ]

table(duplicated(mycounts$symbol))

mycounts<- mycounts[duplicated(mycounts$symbol) == FALSE,]

#rownames(mycounts) = make.names(mycounts$gene, unique=TRUE)

mycounts <- as.data.frame(mycounts)

metadata <- as.data.frame(metadata)

head(mycounts)

head(metadata)

class(mycounts)

class(metadata)

dds <- DESeqDataSetFromMatrix(countData=mycounts, 

                              colData=metadata, 

                              design=~Group, 

                              tidy=TRUE)

dds

keep <- rowSums(counts(dds)) >= 10

dds <- dds[keep,]

#smallestGroupSize <- 3

#keep <- rowSums(counts(dds) >= 10) >= smallestGroupSize

#dds <- dds[keep,]

#keep <- rowSums(counts(dds) >= 15) >= 34 #remove all genes with counts<15 in more than 75% samples(45*0.75=33.75)

#dds <- dds[keep,]

 

sizeFactors(dds)

dispersions(dds)

dds <- DESeq(dds)

res <- results(dds, tidy=TRUE)

res = results(dds, contrast=c("Group", "Treat", "CNL"), tidy=TRUE)

res <- as_tibble(res)

res

res %>% 

    filter(padj<0.05) %>% 

    write_csv("sigresults.csv")

vsdata <- vst(dds, blind=FALSE)

plotPCA(vsdata, intgroup="Group") + geom_label(aes(label = colnames(vsdata))) + labs(title=" PCA")

df <- as.data.frame(colData(vsdata)[,c("Treat","Group","D","R"," ")])

H <- c("P53","MYC")

mat <- assay(vsdata)[ H, ]

mat <- as.data.frame(mat)

mat1 <- mat %>% arrange(desc(T20))

pheatmap(mat1, annotation_col=df, cluster_rows=F, cluster_cols=F,main = "heatmap")

plotCounts(dds, gene="EN", intgroup="Group", returnData=TRUE) %>% 

  ggplot(aes(Group, count)) + geom_boxplot(aes(fill=Group)) + scale_y_log10() + ggtitle("")

vsdata <- varianceStabilizingTransformation(dds) (less row)

#Volcano

p <- ggplot(data=res, aes(x=log2FoldChange, y=-log10(pvalue))) + geom_point()

# Add more simple "theme"

p <- ggplot(data=res, aes(x=log2FoldChange, y=-log10(pvalue))) + geom_point() + theme_minimal()

# Add vertical lines for log2FoldChange thresholds, and one horizontal line for the p-value threshold 

p2 <- p + geom_vline(xintercept=c(-0.6, 0.6), col="red") +

    geom_hline(yintercept=-log10(0.05), col="red")

# The significantly differentially expressed genes are the ones found in the upper-left and upper-right corners.

# Add a column to the data frame to specify if they are UP- or DOWN- regulated (log2FoldChange respectively positive or negative)

# add a column of NAs

res$diffexpressed <- "NO"

# if log2Foldchange > 0.6 and pvalue < 0.05, set as "UP" 

res$diffexpressed[res$log2FoldChange > 0.6 & res$pvalue < 0.05] <- "UP"

# if log2Foldchange < -0.6 and pvalue < 0.05, set as "DOWN"

res$diffexpressed[res$log2FoldChange < -0.6 & res$pvalue < 0.05] <- "DOWN"

# Re-plot but this time color the points with "diffexpressed"

p <- ggplot(data=res, aes(x=log2FoldChange, y=-log10(pvalue), col=diffexpressed)) + geom_point() + theme_minimal()

# Add lines as before...

p2 <- p + geom_vline(xintercept=c(-0.6, 0.6), col="red") +

        geom_hline(yintercept=-log10(0.05), col="red")

## Change point color 

# 1. by resfault, it is assigned to the categories in an alphabetical orresr):

p3 <- p2 + scale_color_manual(values=c("blue", "black", "red"))

# 2. to automate a bit: ceate a named vector: the values are the colors to be used, the names are the categories they will be assigned to:

mycolors <- c("blue", "red", "black")

names(mycolors) <- c("DOWN", "UP", "NO")

p3 <- p2 + scale_colour_manual(values = mycolors)

# Now write down the name of genes besires the points...

# Create a new column "reslabel" to res, that will contain the name of genes differentially expressed (NA in case they are not)

res$delabel <- NA

res$delabel[res$diffexpressed != "NO"] <- res$row[res$diffexpressed != "NO"]

ggplot(data=res, aes(x=log2FoldChange, y=-log10(pvalue), col=diffexpressed, label=delabel)) + 

    geom_point() + 

    theme_minimal() +

    geom_text()

library(ggrepel)

# plot adding up all layers we have seen so far

ggplot(data=res, aes(x=log2FoldChange, y=-log10(pvalue), col=diffexpressed, label=delabel)) +

        geom_point() + 

        theme_minimal() +

        geom_text_repel() +

        scale_color_manual(values=c("blue", "black", "red")) +

        geom_vline(xintercept=c(-0.6, 0.6), col="red") +

        geom_hline(yintercept=-log10(0.05), col="red")

 

plotCounts(dds, gene="", intgroup="Group", returnData=TRUE) %>% ggplot(aes(Group, count)) + geom_boxplot(aes(fill=roup)) + scale_y_log10() + ggtitle("") + scale_x_discrete(name ="Group", limits=c("CNL","miR"))

plotCounts(dds, gene="BTF3", intgroup="Morphologic.Classification", returnData=TRUE) %>% ggplot(aes(Morphologic.Classification, count)) + geom_boxplot(aes(fill=Morphologic.Classification)) + scale_y_log10() + ggtitle(" ") +  theme(plot.title = element_text(size = 30)) + scale_x_discrete(name ="Morphologic.Classification", limits=c("m0  ","m1","m2","m3","m4","m5"))

https://www.data-to-viz.com/caveat/boxplot.html

ggplot(data = dat, aes(x=Treatment, y=cell., fill=Treatment)) + scale_x_discrete(name ="Treatment", limits=c("UT","B","A","B+A")) + geom_violin(width=0.8) + geom_boxplot(width=0.8, color="black", alpha=0.8) + theme( legend.position="none", plot.title = element_text(size=15)) + ggtitle("Treatment") + theme(axis.text = element_text(size=12)) + theme(panel.background = element_rect(fill = "white", colour = "grey50"))

 

library("genefilter")

vsdata <- vst(dds, blind=FALSE)

sum( rowMeans( counts(dds, normalized=TRUE)) > 5 )

plotPCA(vsdata, intgroup="Group")

plotPCA(vsdata, intgroup="Group") + geom_label(aes(label = colnames(vsdata)))

topVarGenes <- head(order(-rowVars(assay(vsdata))),50)

mat <- assay(vsdata)[ topVarGenes, ]

mat <- mat - rowMeans(mat)

df <- as.data.frame(colData(vsdata)[,c("ID","C","B")])

pheatmap(mat, annotation_col=df, cluster_rows=FALSE, cluster_cols=FALSE)

 

res1 <-  res %>% filter(padj<0.05) %>% arrange(padj)

res2 <- res1[1:100,] %>% arrange(desc(log2FoldChange))

Sig <- res2$row

mat <- assay(vsdata)[ Sig, ]

mat <- mat - rowMeans(mat)

df <- as.data.frame(colData(vsdata)[,c("ID","Group")])

pheatmap(mat, annotation_col=df, cluster_rows=F, cluster_cols=F)

 

zmat <- scale(tmat1, center = TRUE, scale = TRUE) # upon column

 

sacPlsda <- opls(vsdata, "Treatment", orthoI = 1)

PLS-DA

library(org.Hs.eg.db)

ens2symbol <- AnnotationDbi::select(org.Hs.eg.db,

                                    key=res$row, 

                                    columns="SYMBOL",

                                    keytype="SYMBOL")

ens2symbol <- as_tibble(ens2symbol)

ens2symbol

res2 <- res %>% 

  dplyr::select(SYMBOL, stat) %>% 

  na.omit() %>% 

  distinct() %>% 

  group_by(SYMBOL) %>% 

  summarize(stat=mean(stat))

res2

 

library(fgsea)

ranks <- deframe(res2)

head(ranks, 20)

 

pathways.hallmark <- gmtPathways("C:/Users/?/Desktop/GSEA dataset/h.all.v2023.1.Hs.symbols.gmt")

fgseaRes <- fgsea(pathways=pathways.hallmark, stats=ranks, nperm=1000)

fgseaResTidy <- fgseaRes %>%

  as_tibble() %>%

  arrange(desc(NES))

# Show in a nice table:

hallmark <- fgseaResTidy %>% 

  dplyr::select(-leadingEdge, -ES, -nMoreExtreme) %>% 

  arrange(padj)

hallmark <- as.matrix(hallmark)

write.csv(hallmark, " -NRhallmark.csv")

pathways.hallmark <- gmtPathways("C:/Users/?/Desktop/GSEA dataset/ h.all.v2023.1.Hs.symbols.gmt")

fgseaRes <- fgsea(pathways=pathways.hallmark, stats=ranks, nperm=1000)

fgseaRes <- fgsea(pathways=pathways.hallmark, stats=ranks, eps      = 0.0,

                  minSize  = 15,

                  maxSize  = 500)

fgseaResTidy <- fgseaRes %>%

  as_tibble() %>%

  arrange(desc(NES))

# Show in a nice table:

fgseaResTidy %>% 

  dplyr::select(-leadingEdge, -ES, -nMoreExtreme) %>% 

  arrange(padj) %>% 

  DT::datatable()

ggplot(fgseaResTidy, aes(reorder(pathway, NES), NES)) +

  geom_col(aes(fill=padj<0.05)) +

  coord_flip() +

  labs(x="Pathway", y="Normalized Enrichment Score",

       title="Hallmark pathways NES from GSEA") + 

  theme_minimal()

fgseaRes <- fgsea(pathways=pathways.hallmark, stats=ranks, nperm=1000)

go <- fgsea(pathways=gmtPathways("C:/Users/?/Desktop/GSEA dataset/c5.all.v7.2.symbols.gmt"), ranks, nperm=1000) %>%  as_tibble()

Re <- fgsea(pathways=gmtPathways("C:/Users/?/Downloads/c2.cp.reactome.v2023.1.Hs.symbols.gmt"), ranks, nperm=1000) %>% 

   as_tibble()

go <- as.matrix(go)

write.csv(go, "go.csv")

Re <- as.matrix(Re)

write.csv(Re, "Re.csv")

hallmark <- fgsea(pathways=gmtPathways("C:/Users/?/Desktop/GSEA dataset/h.all.v7.2.symbols.gmt "), ranks, nperm=1000) %>%   as_tibble()

hallmark <- as.matrix(hallmark)

write.csv(hallmark, "hallmark B PRE  + R vs NR.csv")

plotEnrichment(pathways.hallmark[["HALLMARK_INTERFERON_GAMMA_RESPONSE"]], ranks) + labs(title="HALLMARK_INTERFERON_GAMMA_RESPONSE")

plotEnrichment(pathways.hallmark[["HALLMARK_APOPTOSIS"]], ranks) + labs(title=" HALLMARK_APOPTOSIS")

pathways.immunmark <-gmtPathways("C:/Users/?/Dowloads/c7.all.v2022.1.Hs.symbols.gmt")

cell type signature gene sets

pathways.cellmark <- gmtPathways("C:/Users/?/Downloads/c8.all.v2022.1.Hs.symbols.gmt")

ggplot(Hal, aes(NES, NAME))+ geom_col(aes(fill=NES<0)) + scale_y_discrete(name ="NAME", limits=c("MITOTIC_SPINDLE",  "IL6_JAK_STAT3_SIGNALING",  "EPITHELIAL_MESENCHYMAL_TRANSITION")) + labs(x="Pathway", y="Normalized Enrichment Score", title="Hallmark pathways NES from GSEA") + theme_minimal()

 

pathway

pval

padj

NES

size

1

HALLMARK_E2F_TARGETS

0.003125

0.016

1.50255883270908

196

2

HALLMARK_P53_PATHWAY

0.00307692307692308

0.016

1.46976609146818

193

 

ggplot(fgseaResTidy, aes(reorder(pathway, NES), NES)) +

  geom_col(aes(fill=padj<0.05)) +

  coord_flip() +

  labs(x="Pathway", y="Normalized Enrichment Score",

       title="Hallmark pathways NES from GSEA") + 

  theme_minimal()

 

fgseaRes <- fgsea(pathways=pathways.hallmark, stats=ranks, nperm=1000)

go <- fgsea(pathways=gmtPathways("C:/Users/?/Desktop/GSEA dataset/c5.all.v7.2.symbols.gmt"), ranks, nperm=1000) %>%  as_tibble()

Re <- fgsea(pathways=gmtPathways("C:/Users/?/Downloads/c2.cp.reactome.v2023.1.Hs.symbols.gmt"), ranks, nperm=1000) %>% 

   as_tibble()

go <- as.matrix(go)

write.csv(go, "go.csv")

Re <- as.matrix(Re)

write.csv(Re, "Re.csv")

hallmark <- fgsea(pathways=gmtPathways("C:/Users/?/Desktop/GSEA dataset/h.all.v7.2.symbols.gmt "), ranks, nperm=1000) %>%   as_tibble()

hallmark <- as.matrix(hallmark)

write.csv(hallmark, "hallmark B PRE  + R vs NR.csv")

plotEnrichment(pathways.hallmark[["HALLMARK_INTERFERON_GAMMA_RESPONSE"]],

                            ranks) + labs(title="HALLMARK_INTERFERON_GAMMA_RESPONSE")

plotEnrichment(pathways.hallmark[["HALLMARK_APOPTOSIS"]],

                            ranks) + labs(title=" HALLMARK_APOPTOSIS")

 

 

pathways.immunmark <- gmtPathways("C:/Users/?/Dowloads/c7.all.v2022.1.Hs.symbols.gmt")

cell type signature gene sets

pathways.cellmark <- gmtPathways("C:/Users/?/Downloads/c8.all.v2022.1.Hs.symbols.gmt")

ggplot(Hal, aes(NES, NAME))+ geom_col(aes(fill=NES<0)) + scale_y_discrete(name ="NAME", limits=c("MITOTIC_SPINDLE",  "IL6_JAK_STAT3_SIGNALING",  "EPITHELIAL_MESENCHYMAL_TRANSITION",  "APICAL_SURFACE",  "COAGULATION",  "ANDROGEN_RESPONSE",  "UV_RESPONSE_UP",  "HEDGEHOG_SIGNALING",  "ADIPOGENESIS",  "APOPTOSIS",  "APICAL_JUNCTION",  "INTERFERON_ALPHA_RESPONSE",  "XENOBIOTIC_METABOLISM",  "KRAS_SIGNALING_DN",  "IL2_STAT5_SIGNALING",  "HEME_METABOLISM",  "UV_RESPONSE_DN",  "INFLAMMATORY_RESPONSE",  "MYOGENESIS",  "ESTROGEN_RESPONSE_LATE",  "KRAS_SIGNALING_UP",  "PEROXISOME",  "ESTROGEN_RESPONSE_EARLY",  "INTERFERON_GAMMA_RESPONSE",  "ALLOGRAFT_REJECTION",  "GLYCOLYSIS",  "HYPOXIA")) + 

    labs(x="Pathway", y="Normalized Enrichment Score",

         title="Hallmark pathways NES from GSEA") + 

    theme_minimal()

212      HAY_BONE_MARROW_CD8_T_CELL      0.001317523    0.006216336    -0.548956711   -1.973916357   0          71        c("PARP8", "PATL2", "TTC16", "PTPRC", "MIAT", "KLRG1", "KIF19", "GBP5", "FYN", "LINC00987", "TRGC2", "NR4A2", "ATP8A1", "TRG-AS1", "RASAL3", "PPP2R5C", "PTPN22", "DUSP2", "CD8B", "HERPUD2", "PTPN7", "SLAMF6", "F2R", "SLA", "SH2D1A", "GZMK", "CD8A", "SYNE2", "ATG2A", "GZMH", "A2M-AS1", "SON", "CXCR4", "ZNF683", "LINC01871", "HLA-B", "JMJD6")

                                                                                                                                                            =CHAR(34)&A1&CHAR(34)

mat <- assay(vsdata)[ T, ]

mat <- mat - rowMeans(mat)

df <- as.data.frame(colData(vsdata)[,c("ID","Response.Primary","LN.Responder","Responder"," ","Group","Treatment","T.Stage", "N.Stage")])

pheatmap(mat, annotation_col=df, cluster_rows=F, cluster_cols=F)                                                        

 

Removing NA or NaN values

There are two ways to remove missing values: 

Extracting values except for NA or NaN values:

Example 1: 

·       R

x <- c(1, 2, NA, 3, NA, 4)

d <- is.na(x)

x[! d]

Output: 

[1] 1 2 3 4

Example 2: 

·       R

x <- c(1, 2, 0 / 0, 3, NA, 4, 0 / 0)

x

x[! is.na(x)]

Output: 

[1]   1   2 NaN   3  NA   4 NaN

[1] 1 2 3 4

            

d <- filter(mycounts, X %in%  c("ABCB6", "ADORA2B", "AGL"))    

 

xcell

Install

devtools::install_github('dviraran/xCell')

Note: if the installation fails, try setting options(unzip = "internal") before calling the install function (thanks @hermidalc).

Usage

library(xCell)

exprMatrix = read.table(expr_file,header=TRUE,row.names=1, as.is=TRUE)

xCellAnalysis(exprMatrix)

This function performs all three steps in xCell, which can be performed seperately as well:

  1. rawEnrichmentAnalysis
  2. transformScores
  3. spillOver

exprMatrix = read.table(expr_file,header=TRUE,row.names=1, as.is=TRUE)

exprMatrix = read.table(file='C:\Users\?\Desktop\10-18-22HNSC \Xcell.txt', header=TRUE, row.names=1, as.is=TRUE)

https://github.com/dviraran/xCell#install

The best thing to do might actually be to perform GSEA both ways (log2FC and signed pvalue), then load both sets of results into EnrichmentMap (https://apps.cytoscape.org/apps/enrichmentmap), and identify sets selected as significant by both metrics. EnrichmentMap supports a mode where two datasets from GSEA can be loaded in simultaneously and co-displayed. That would probably be the most rigorous way to do it.

 

34        HALLMARK_OXIDATIVE_PHOSPHORYLATION      0.006535948    0.016263336    -0.494556993   -2.378736178   0          200      c("COX17", "ATP5PO", "MRPL35", "COX6C", "VDAC3", "NDUFA4", "TIMM8B", "MDH1", "ATP5MC1", "COX7A2", "NDUFA2", "SUCLA2", "ATP5PF", "NDUFS4", "PRDX3", "UQCR11", "MRPL15", "ECHS1", "NDUFA8", "MRPS11", "DLD", "NDUFB8", "SLC25A4", "NDUFB6", "NDUFA5", "NDUFA6", "NDUFC2", "ATP5F1C", "COX7B", "CYB5A", "POLR2F", "NDUFB7", "MRPL11", "NDUFB1", "ATP5ME", "NDUFAB1", "NDUFC1", "ATP5PD", "NDUFB3", "BDH2", "NDUFB4", "ACADM", "TIMM10", "UQCRB", "NDUFS3", "SDHD", "SURF1", "TOMM70", "ATP6V0B", "COX15", "MDH2", "CYCS", 

9          HALLMARK_G2M_CHECKPOINT 0.006451613    0.016263336    -0.320157329   -1.535067914   0          196      c("MEIS2", "MEIS1", "MAD2L1", "CKS2", "CENPA", "RAD21", "CCNA2", "HMMR", "FBXO5", "BRCA2", "SRSF1", "EGF", "KPNA2", "AURKA", "PURA", "CDK1", "XPO1", "EZH2", "MARCKS", "NDC80", "TFDP1", "ESPL1", "ORC5", "CCNB2", "KIF23", "SNRPD1", "ORC6", "BIRC5", "CDC6", "PBK", "TOP2A", "SQLE", "MKI67", "DBF4", "TTK", "CUL4A", "CKS1B", "DTYMK", "CDC27", "CENPE", "HSPA8", "CDC45", "PTTG1", "CDC20", "KIF2C", "CBX1", "HOXC10", "BUB3", "RAD54L", "NEK2", "KNL1", "SRSF10", "BUB1", "SMC2", "CDKN3", "UBE2C", "PLK1", "RASAL2",  GINS2, "AURKB", "SLC12A2", "HIRA", "INCENP", "CDC7")

 

 

 

An example of a rank metric is the product of the sign of the ‘direction’ in the expression change (i.e. ‘up-regulation’ is positive and ‘down-regulation’ is negative) and the p-value (P).

 

si=s(Pi)=sign(fold change gene i)⋅−log10(Pi)

Under this rank metric, up-regulated genes with relatively small p-values appear at the top of the list and down-regulated genes with small p-values at the bottom.                                                                                                                                                                        

 

 

dds2 <- DESeqDataSetFromMatrix(countData=mycounts, 

                               colData=metadata, 

                            design=~Group, 

                              tidy=TRUE)

fpm2 <- fpm(dds2)                                                                                                                                                                                                                                                                    

pheatmap                                                                                                                                        

dat <- read.csv("C:/Users/?/Desktop/xcell/B .csv")

dat[is.na(dat)] = 0

dat1 <- as.matrix(dat[, 3:14])

rownames(dat1) <- dat$ID

dat2 <- scale(dat1)

pheatmap(dat2,cluster_cols = F,cluster_rows = F,main = "")

annot_row <- data.frame(row.names = data$ID, Group = data$Group)

pheatmap(data2,annotation_row=annot_row,cluster_cols = F,cluster_rows = F,main = "B P  + R vs NR")

A <- c("KDR", "JAG1", "COL3A1", "COL5A2", "PDGFB", "CD34", "LAMA2", "CHI3L1", "ITGAV", "HSPG2", "PECAM1", "NID1", "TIMP1", "ANGPT2", "CCL2", "IGF1", "TEK", "LEP", "MMP2", "EFNB2", "EDN1", "CXCL12", "FGF7", "FGFR1", "TGFB2", "TGFB3")

mat <- assay(vsdata)[ A, ]

 tmat <- t(mat)

 ztmat <- scale(tmat, center = TRUE, scale = TRUE)

 tztmat <- t(ztmat)

 library(pheatmap)

df <- as.data.frame(colData(vsdata)[,c("ID","Group")])

pheatmap(tztmat, annotation_col=df, cluster_rows=F, cluster_cols=F)

Join R Functions of dplyr Package Overview            ggplot(

  data = penguins,

  mapping = aes(x = flipper_length_mm, y = body_mass_g, color = species)

) +

  geom_point() +

  geom_smooth(method = "lm")

 

ggplot(

  data = penguins,

  mapping = aes(x = flipper_length_mm, y = body_mass_g)

) +

  geom_point(mapping = aes(color = species)) +

  geom_smooth(method = "lm")

ggplot(

  data = penguins,

  mapping = aes(x = flipper_length_mm, y = body_mass_g)

) +

  geom_point(mapping = aes(color = species, shape = species)) +

  geom_smooth(method = "lm")

ggplot(

  data = penguins,

  mapping = aes(x = flipper_length_mm, y = body_mass_g)

) +

  geom_point(aes(color = species, shape = species)) +

  geom_smooth(method = "lm") +

  labs(

    title = "Body mass and flipper length",

    subtitle = "Dimensions for Adelie, Chinstrap, and Gentoo Penguins",

    x = "Flipper length (mm)", y = "Body mass (g)",

    color = "Species", shape = "Species"

  ) +

  scale_color_colorblind()   

ggplot(penguins, aes(x = fct_infreq(species))) +

  geom_bar()          

A bar chart of frequencies of species of penguins, where the bars are ordered in decreasing order of their heights (frequencies): Adelie (approximately 150), Gentoo (approximately 125), Chinstrap (approximately 90).

 

 

 

ggplot(penguins, aes(x = body_mass_g)) +

  geom_histogram(binwidth = 200)

ggplot(penguins, aes(x = body_mass_g, color = species)) +

  geom_density(linewidth = 0.75)  

ggplot(penguins, aes(x = body_mass_g, color = species, fill = species)) +

  geom_density(alpha = 0.5)

A density plot of body masses of penguins by species of penguins. Each species (Adelie, Chinstrap, and Gentoo) is represented in different colored outlines for the density curves. The density curves are also filled with the same colors, with some transparency added. ggplot(penguins, aes(x = island, fill = species))+

  geom_bar()

Bar plots of penguin species by island (Biscoe, Dream, and Torgersen)            ggplot(penguins, aes(x = island, fill =species)) +

  geom_bar(position = "fill")        

Bar plots of penguin species by island (Biscoe, Dream, and Torgersen) the bars are scaled to the same height, making it a relative frequencies plot

ggplot(penguins, aes(x = flipper_length_mm, y = body_mass_g)) +

  geom_point(aes(color = species, shape = island))

A scatterplot of body mass vs. flipper length of penguins. The plot displays a positive, linear, relatively strong relationship between these two variables. The points are colored based on the species of the penguins and the shapes of the points represent islands (round points are Biscoe island, triangles are Dream island, and squared are Torgersen island). The plot is very busy and it's difficult to distinguish the shapes of the points.

ggplot(penguins, aes(x = flipper_length_mm, y = body_mass_g)) +

  geom_point(aes(color = species, shape = species)) +

  facet_wrap(~island)

ggsave(filename = "penguin-plot.png")

 

https://r4ds.hadley.nz/layers.html

 

library("RColorBrewer")

display.brewer.all()

 

library("RColorBrewer")

> display.brewer.all()

> pheatmap(dat2,annotation_row=annot_row,cluster_cols = F,cluster_rows = F, color = colorRampPalette(rev(brewer.pal(n = 7, name ="RdYlBu")))(100))

> pheatmap(dat2,annotation_row=annot_row,cluster_cols = F,cluster_rows = F, color = colorRampPalette(rev(brewer.pal(n = 7, name ="Spectral")))(100))

> pheatmap(dat2,annotation_row=annot_row,cluster_cols = F,cluster_rows = F, color = colorRampPalette(rev(brewer.pal(n = 7, name ="RdYlGn")))(100))

> pheatmap(dat2,annotation_row=annot_row,cluster_cols = F,cluster_rows = F, color = colorRampPalette(rev(brewer.pal(n = 7, name ="Set1")))(100))

pheatmap(dat2,annotation_row=annot_row,cluster_cols = F,cluster_rows = F, color = colorRampPalette(c("blue", "pink", "red"))(50))

 Correlations were plotted using the corrplot package

pheatmap(test, color = colorRampPalette(c("navy", "white", "firebrick3"))(50))

 

if (!require(devtools)) install.packages("devtools")

devtools::install_github("yanlinlin82/ggvenn")

a <- list('A - R PvsPre, down' = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27),

           'A - R PvsPre, up' = c(28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54),

           'A - NR PvsPre, up' = c(28, 1, 29, 4, 3, 34, 6, 36, 37, 38, 11, 42, 43, 44, 15, 45, 16, 17, 18, 22, 48, 49, 51, 52, 26), 

           'A - NR PvsPre, down' = c(30, 31, 2, 3, 32, 5, 35, 7, 8, 9, 10, 12, 39, 40, 13, 41, 14, 46, 47, 19, 20, 21, 23, 24, 50, 25, 53, 54, 27))

ggvenn(a)

 

dmm <- dd1 %>% group_by(Gene.signature) %>% summarise_if(is.numeric, mean, na.rm = TRUE)

 

HNSC2 <- HNSC1[ , grepl( "01A" , names(HNSC1) ) ]

 

# this gives log2(n + 1)

ntd <- normTransform(dds)

library("vsn")

meanSdPlot(assay(ntd))

 

Remove columns from dataframe where ALL values are NA

x  y  z

1 1  1 NA

2 2  2 NA

3 3 NA NA

4 4  4 NA

5 5  5 NA

 

 

 

!apply (is.na(df), 2, all)

 var1  var2  var3 

 TRUE  TRUE FALSE 

 

> df[, !apply(is.na(df), 2, all)]

  var1 var2

1    1    1

2    2    2

3    3    1

4    4    3

5    5    4

6    6   NA

7    7   NA

8   NA    9

3. Row which contains all column values that are missing

Suppose if you want to remove all column values contains NA then following codes will be handy.

Method 1:Using  is.na(), rowSums() & ncol() Functions

df=data.frame(Col1=c(NA,"B","C","D",

"P1","P2","P3")

,Col2=c(NA,8,NA,9,10,8,9)

,Col3=c(NA,7,6,8,NA,7,8)

,Col4=c(NA,NA,7,7,NA,7,7))

df[rowSums(is.na(df)) != ncol(df), ]

Col1 Col2 Col3 Col4

2 B 8 7 NA

3 C NA 6 7

4 D 9 8 7

5 P1 10 NA NA

6 P2 8 7 7

7 P3 9 8 7

Method 2:Using  Using filter() Function

df=data.frame(Col1=c(NA,"B","C","D",

"P1","P2","P3")

,Col2=c(NA,8,NA,9,10,8,9)

,Col3=c(NA,7,6,8,NA,7,8)

,Col4=c(NA,NA,7,7,NA,7,7))

library("dplyr")

filter(df, rowSums(is.na(df)) != ncol(df))

Col1 Col2 Col3 Col4

1 B 8 7 NA

2 C NA 6 7

3 D 9 8 7

4 P1 10 NA NA

5 P2 8 7 7

6 P3 9 8 7

 

Replace NA with 0

SurDRRA[is.na(SurDRRA)] <- 0

b <- list(`MacKO:HetC_DIET,down` = 9:33, `MacKO:HetC_DIET,up` = 1:8, `KO:WT_DIET,up` = c(1:9, 13, 14, 18, 28:30, 32, 34:42), `KO:WT_DIET,down` = c(21, 23))

ggvenn(b,  set_name_size = 3,  text_size = 4, set_name_color = c("blue", "yellow4", "green4", "red")) + coord_cartesian(xlim = c(-3, 3), ylim = c(-3, 3))

ggvenn(a, set_name_size = 3, set_name_color = c("blue", "yellow4", "green4", "red"), text_size = 4, stroke_size = 0.5) + coord_cartesian(xlim = c(-3, 3), ylim = c(-3, 3)) + labs(title = "Stroma hallmark pathway")

ggvenn(a, fill_color = c("#F8766D", "#00BFC4"), text_size = 6, set_name_color = c("#F8766D", "#00BFC4"), set_name_size = 8)

heatmap

d <- data.table::fread("C:/Users/?/Desktop/D +vs-R TCIA.txt.gz", data.table = F)

data <- d %>% column_to_rownames("celltype")

ddd <- as.matrix(data)

heatmap(ddd, cexRow = 1, cexCol = 1, labRow =  d$celltype, margins = c(8, 10))

pheatmap(ddd)

 

data <- d %>% column_to_rownames("celltype")

ddd <- as.matrix(data)

pheatmap(ddd,cluster_cols = F,cluster_rows = F,main = "ADM safeTME",

         color=colorRampPalette(c("navy", "white", "red"))(50),

         cutree_cols=2)

pheatmap(ddd,cluster_cols = F,cluster_rows = F,main = "ADM safeTME",

         color=colorRampPalette(c("#00BFC4", "white", "#F8766D"))(50),

         cutree_cols=2)

Spatial Decon

temp = replace(restils$prop_of_nontumor, is.na(restils$prop_of_nontumor), 0)

  o = hclust(dist(temp))$order

 layout(mat = (matrix(c(1, 2), 1)), widths = c(7, 3))

TIL_barplot(ddd, draw_legend = TRUE, cex.names = 0.5)

 

d <- data.table::fread("C:/Users/?/Desktop/Til/DTil/DTilDM1difference.txt.gz", data.table = F)

data <- d %>% column_to_rownames("celltype")

ddd <- as.matrix(data)

temp = replace(restils$prop_of_nontumor, is.na(restils$prop_of_nontumor), 0)

o = hclust(dist(temp))$order

layout(mat = (matrix(c(1, 2), 1)), widths = c(7, 3))

TIL_barplot(ddd, draw_legend = TRUE, cex.names = 0.5, main = "D DM P vs Pre TME cell type difference")

 

 

To access any other kind of file with compression, simply use gzfile("") around the file name:

write.table(tst.df,gzfile("test.dat.gz")) # write a compressed file read.table(gzfile("test.dat.gz"),row.names=1)# read it back in

results <- cibersort(HNSC, q_Norm, perm = 100, QN=TRUE)

pheatmap

Error in cut.default(x, breaks = breaks, include.lowest = T) : 

  'x' must be numeric

D5 <- apply(as.matrix(d4), 2, as.numeric)

Labels <- c(rep("R", 6),rep("P", 5),rep("N", 5),rep("N_P", 5))

names(Labels) <- rownames(d5)

 Pos_DM <- as.data.frame(Labels)

p1 <- one.step.pigengene(Data=d1,saveDir='pigengene', bnNum=0, verbose=1,

seed=1, Labels=Labels, toCompact=FALSE, doHeat=FALSE)

 

h1 <- pheatmap.type(d5[,1:20],annRow= Pos_DM,show_rownames=FALSE)

 

ggplot(b, aes(fill=Cell.state, z.scores, x=celltype), size = 12) + 

     geom_bar(position="stack", stat="identity") + + theme(axis.text.x = element_text(angle = 90, hjust = 1))

Cell State          _Negative        _Neg Cell type Abundance       _Positive         _Pos Cell type Abundance

S01       _Neg_B cells_NR_P      0.00150677      _ Pos_B cells_NR_P      0.171670673

S02       _Neg_B cells_NR_P      0.088171129    _ Pos_B cells_NR_P      0.088419299

S03       _Neg_B cells_NR_P      0.057669946    _ Pos_B cells_NR_P      0.107745723

S04       _Neg_B cells_NR_P      0.032971537    _ Pos_B cells_NR_P      0.072909367

S05       _Neg_B cells_NR_P      0.047689777    _ Pos_B cells_NR_P      0.133199207

S01       _Neg_B cells_NR_Pre   0.019111569    _ Pos_B cells_NR_Pre  0.22481643

S02       _Neg_B cells_NR_Pre   0.061897723    _ Pos_B cells_NR_Pre  0.0780146

S03       _Neg_B cells_NR_Pre   0.091447498    _ Pos_B cells_NR_Pre  0.05800381

S04       _Neg_B cells_NR_Pre   0.07043731      _ Pos_B cells_NR_Pre  0.091037166

S05       _Neg_B cells_NR_Pre   0.061042366    _ Pos_B cells_NR_Pre  0.131287177

S01       _Neg_B cells_R_P        4.90E-05           _ Pos_B cells_R_P        0.072245259

S02       _Neg_B cells_R_P        0.113785657    _ Pos_B cells_R_P        0.084129808

S03       _Neg_B cells_R_P        0.102182471    _ Pos_B cells_R_P        0.144895115

S04       _Neg_B cells_R_P        0.044155086    _ Pos_B cells_R_P        0.114542694

S05       _Neg_B cells_R_P        0.119914577    _ Pos_B cells_R_P        0.058223155

 

library(circlize)

x = rnorm(2600)

factors = sample(letters, 2600, replace = TRUE)

circos.initialize(factors = factors, x = x)

circos.trackHist(factors = factors, x = x, track.height = 0.1, col = "#999999", border = "#999999")

circos.trackHist(factors = factors, x = x, force.ylim = FALSE, bin.size = 0.1, track.height = 0.1, col = "#999999", border = "#999999")

circos.trackHist(factors = factors, x = x, draw.density = TRUE, track.height = 0.1, col = "#999999", border = "#999999")

circos.trackHist(factors = factors, x = x, draw.density = TRUE, force.ylim = FALSE, track.height = 0.1, col = "#999999", border = "#999999")

https://jokergoo.github.io/circlize_examples/example/intro-14-hist.jpg

par(mfrow = c(1, 2))

circos.initialize(letters[1:4], xlim = c(0, 10))

circos.track(ylim = c(0, 1), panel.fun = function(x, y) {

    value = runif(10)

    circos.barplot(value, 1:10 - 0.5, col = 1:10)

})

circos.track(ylim = c(-1, 1), panel.fun = function(x, y) {

    value = runif(10, min = -1, max = 1)

    circos.barplot(value, 1:10 - 0.5, col = ifelse(value > 0, 2, 3))

})

circos.clear()

 

circos.initialize(letters[1:4], xlim = c(0, 10))

circos.track(ylim = c(0, 4), panel.fun = function(x, y) {

    value = matrix(runif(10*4), ncol = 4)

    circos.barplot(value, 1:10 - 0.5, col = 2:5)

})

https://jokergoo.github.io/circlize_book/book/circos-heatmap.html#circos-heatmap-input-data

 

https://bioinformatics.ccr.cancer.gov/docs/data-visualization-with-r/Lesson5_intro_to_ggplot/

 

#data import from excel

data<-readxl::read_xlsx("./data/RNASeq_totalcounts_vs_totaltrans.xlsx",

                        1,.name_repair = "universal")

Readxl’s default is .name_repair = "unique"

 

scatter_plot<-ggplot(data=data) + 

  geom_point(aes(x=Number.of.Transcripts, y = Total.Counts,

                 color=Treatment))

scatter_plot +

  scale_color_manual(values=c("red","black"),

                     labels=c('treated','untreated'))

https://bioinformatics.ccr.cancer.gov/docs/data-visualization-with-r/Lesson2_intro_to_ggplot_files/figure-html/unnamed-chunk-13-1.png

 

 

 

scatter_plot +

  scale_color_grey()

https://bioinformatics.ccr.cancer.gov/docs/data-visualization-with-r/Lesson2_intro_to_ggplot_files/figure-html/unnamed-chunk-13-2.png

To apply scale_fill to shape, we will have to alter the shapes, as only some shapes take a fill argument.

Image from https://ggplot2.tidyverse.org/articles/ggplot2-specs.html:

R Shapes

 

ggplot(data=data) + 

  geom_point(aes(x=Number.of.Transcripts, y = Total.Counts,fill=Treatment),

             shape=21,size=2) + #increase size and change points

  scale_fill_manual(values=c("purple", "yellow"))

https://bioinformatics.ccr.cancer.gov/docs/data-visualization-with-r/Lesson2_intro_to_ggplot_files/figure-html/unnamed-chunk-14-1.png

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

##   Class    Sex   Age Survived Freq

## 1   1st   Male Child       No    0

## 2   2nd   Male Child       No    0

## 3   3rd   Male Child       No   35

## 4  Crew   Male Child       No    0

## 5   1st Female Child       No    0

## 6   2nd Female Child       No    0

ggplot() + geom_col(data = Titanic, aes(x = Class, y = Freq, fill = Survived), position = "dodge") +

    facet_grid(Sex ~ Age)

https://bioinformatics.ccr.cancer.gov/docs/data-visualization-with-r/Lesson2_intro_to_ggplot_files/figure-html/unnamed-chunk-19-1.png

 

 

 

 

 

 

 

 

 

ggplot(data=data) + 

  geom_point(aes(x=Number.of.Transcripts, y = Total.Counts,

                 fill=Treatment),

             shape=21,size=2) + 

  scale_fill_manual(values=c("purple", "yellow"), 

                    labels=c('treated','untreated'))+ 

  #can change labels of fill levels along with colors

  xlab("Recovered transcripts per sample") + #add x label

  ylab("Total sequences per sample") +#add y label

  guides(fill = guide_legend(title="Treatment")) + #label the legend

  scale_y_continuous(trans="log10") + #log transform the y axis

  theme_bw()

https://bioinformatics.ccr.cancer.gov/docs/data-visualization-with-r/Lesson2_intro_to_ggplot_files/figure-html/unnamed-chunk-21-1.png

ggsave("Plot1.png",width=5.5,height=3.5,units="in",dpi=300)

http://timer.comp-genomics.org/timer/

 

 

 

 

 

performed GSEA with MSigDB sets and used gseaplot2 to visualize the results:

display_geneset <- "IVANOVA_HEMATOPOIESIS_STEM_CELL_LONG_TERM"

p_msig <- gseaplot2(gse_cgp, geneSetID = display_geneset, title = gse_cgp$Description[gse_cgp$ID == display_geneset],

               color = "green", base_size = 5) + theme( aspect.ratio = 1/1.618)

 

p_msig

ggsave("output/clusterProfiler_GSEA_IVANOVA_small.svg", width=6, units = "cm", scale = 1)

 

 

Example 1: S4 Class and Object in R

# create a class "Student_Info" with three member variables

setClass("Employee_Info", slots=list(name="character", age="numeric", role="character"))

 

# create an object of class 

employee1 <- new("Employee_Info", name = "Peter", age = 21, role = "Developer")

 

# call employee1 object 

employee1

 

 

https://rpubs.com/pranali018/enrichmentAnalysis

 

library(enrichplot)

library(ggplot2)

library("org.Hs.eg.db")

library(clusterProfiler)

library(pathview)

require(DOSE)

dotplot(gse, showCategory=3, split=".sign") + facet_grid(.~.sign)

 

#remove any rows where GeneSymbol is not present

num = which(is.na(res$GeneSymbol))

res = res[-num, ]

 

# we want the log2 fold change 

original_gene_list = res$log2FoldChange

 

# name the vector

names(original_gene_list) <- res$GeneSymbol

 

# omit any NA values 

gene_list<-na.omit(original_gene_list)

 

# sort the list in decreasing order (required for clusterProfiler)

gene_list = sort(gene_list, decreasing = TRUE)

 

gse <- gseGO(geneList=gene_list, 

             ont ="ALL", 

             keyType = "SYMBOL",

             minGSSize = 3, 

             maxGSSize = 800, 

             pvalueCutoff = 0.05,

             verbose = TRUE, 

             OrgDb = org.Hs.eg.db, 

             pAdjustMethod = "none")

out = as.matrix(gse@result)

 

dotplot(gse, showCategory=3, split=".sign") + facet_grid(.~.sign)

The dotplot shows the ontology terms which are either activated (up regulated) or suppresed(down regulated) in the dataset. The number of ontology terms per feature (BP, MF, CC) to be displayed is determined by the showCateory function. The color of the dots indicate the pvalues for each of the terms and the size of the dots depicts the number of genes overlapping in each feature

x2 = pairwise_termsim(gse)

emapplot(x2, showCategory = 20)

Enrichment map organizes enriched terms into a network with edges connecting overlapping gene sets. In this way, mutually overlapping gene sets are tend to cluster together, making it easy to identify functional modules.

# Use the `Gene Set` param for the index in the title, and as the value for geneSetId

gseaplot(gse, by = "all", title = gse$Description[1], geneSetID = 1)

Plot of the Running Enrichment Score (green line) for a gene set as the analysis walks down the ranked gene list, including the location of the maximum enrichment score (the red line). The black lines in the Running Enrichment Score show where the members of the gene set appear in the ranked list of genes, indicating the leading edge subset.

The Ranked list metric shows the value of the ranking metric (log2 fold change) as you move down the list of ranked genes. The ranking metric measures a gene’s correlation with a phenotype.

gseaplot(kk2, by = "all", title = kk2$Description[19], geneSetID = 19)

r1 = pathview(gene.data = kegg_gene_list, pathway.id = "hsa00190", species = kegg_organism)

gene_list = sort(ranks, decreasing = TRUE)

hallmark <- GSEA(

    geneList  = gene_list,

    exponent = 1,

    minGSSize = 10,

    maxGSSize = 500,

    eps = 1e-10,

    pvalueCutoff = 0.05,

    pAdjustMethod = "BH",

    gson = NULL,

    TERM2GENE = hall,

    TERM2NAME = NA,

    verbose = TRUE,

    seed = FALSE,

    by = "fgsea")

gseaplot(kk2, by = "all", title = kk2$Description[1], geneSetID = 1)

 

 

 

 

 

DESeqAnalysis

 

if (!requireNamespace("BiocManager", quietly = TRUE)) {

    install.packages("BiocManager")

}

install.packages(

    pkgs = "DESeqAnalysis",

    repos = c(

        "https://r.acidgenomics.com",

        BiocManager::repositories()

    ),

    dependencies = TRUE

)

https://rdrr.io/github/acidgenomics/DESeqAnalysis/man/DESeqAnalysis.html

resultsNames(data)

name <- resultsNames(data)[[2L]]

results <- DESeq2::results(data, name = name)

class(results)

 

lfcShrink <- DESeq2::lfcShrink(dds = data, res = results, coef = 2L)

 

results <- list(results)

names(results) <- name

 

lfcShrink <- list(lfcShrink)

names(lfcShrink) <- name

 

object <- DESeqAnalysis(

    data = data,

    transform = transform,

    results = results,

    lfcShrink = lfcShrink

)

plotDegStackedBar(

    object,

    i = 1L,

    direction = c("both", "up", "down"),

    orderBySize = FALSE,

    label = TRUE,

    flip = TRUE

)

 

assay(object@transform)

assay(object@data)

results(object, i = 1L)

sampleData(object)

genes <- head(rownames(object), 50L)

head(genes)

samples <- head(colnames(object), 2L)

head(samples)

 

x <- object[genes, samples]

 

 

mai <- assay(vsdata)[HY, ]

mai <- as.data.frame(mai)

ma <- res %>% filter((GeneSymbol %in% HY) & pvalue < 0.05)

mai1 <- mai %>% mutate(rownames(mai))

names(mai1)[11] <- "GeneSymbol"

mai2 <- merge(mai1, ma)

mai3 <- mai2 %>% arrange(desc(TD23_2))

mai4 <- apply(as.matrix(mai3[, 1:11]), 2, as.numeric)

rownames(mai4) <- mai3$GeneSymbol

df <- as.data.frame(colData(vsdata)[, c("ID", "Group")])

pheatmap(mai4[,-1], annotation_col=df, cluster_rows=F, cluster_cols=F, main = "HALLMARK_HYPOXIA")

 

mai <- assay(vsdata)[IN, ]

mai <- as.data.frame(mai)

ma <- res %>% filter((GeneSymbol %in% IN) & pvalue < 0.05)

mai1 <- mai %>% mutate(rownames(mai))

names(mai1)[10] <- "GeneSymbol"

mai2 <- merge(mai1, ma)

mai3 <- mai2 %>% arrange(desc(TD21_2))

mai4 <- apply(as.matrix(mai3[, 1:10]), 2, as.numeric)

rownames(mai4) <- mai3$GeneSymbol

df <- as.data.frame(colData(vsdata)[, c("ID", "Group")])

pheatmap(mai4[,-1], annotation_col=df, cluster_rows=F, cluster_cols=F, main = "  -NR HALLMARK_TNFA_SIGNALING_VIA_NFKB ")

 

mai <- assay(vsdata)[IN, ]

mai <- as.data.frame(mai)

ma <- res %>% filter((GeneSymbol %in% IN) & pvalue < 0.05)

mai1 <- mai %>% mutate(rownames(mai))

names(mai1)[24] <- "GeneSymbol"

mai2 <- merge(mai1, ma)

mai3 <- mai2 %>% arrange(desc(CK .R33.RM29P))

mai4 <- apply(as.matrix(mai3[, 1:24]), 2, as.numeric)

rownames(mai4) <- mai3$GeneSymbol

df <- as.data.frame(colData(vsdata)[, c("ID", " Treatment")])

pheatmap(mai4[,-1], annotation_col=df, cluster_rows=F, cluster_cols=F, main = " NFKB ")

gseaplot2(

    gse,

    geneSetID = 876,

    title = gse$Description[876],

    color = "green",

    base_size = 11,

    rel_heights = c(1.5, 0.5, 1),

    subplots = 1:3,

    pvalue_table = FALSE,

    ES_geom = "line"

)

 

Point label

https://bookdown.org/ndphillips/YaRrr/low-level-plotting-functions.html

Connecting points with segments().

 

before <- c(2.1, 3.5, 1.8, 4.2, 2.4, 3.9, 2.1, 4.4)

after <- c(7.5, 5.1, 6.9, 3.6, 7.5, 5.2, 6.1, 7.3)

 

# Create plotting space and before scores

plot(x = rep(1, length(before)), 

     y = before, 

     xlim = c(.5, 2.5), 

     ylim = c(0, 11),

     ylab = "Score", 

     xlab = "Time",

     main = "Using segments() to connect points", 

     xaxt = "n")

 

# Add after scores

points(x = rep(2, length(after)), y = after)

 

# Add connections with segments()

segments(x0 = rep(1, length(before)), 

         y0 = before, 

         x1 = rep(2, length(after)), 

         y1 = after, 

         col = gray(0, .5))

 

# Add labels

mtext(text = c("Before", "After"), 

      side = 1, at = c(1, 2), line = 1)

Connecting points with segments().

Figure 11.10: Connecting points with segments().

plot(x = pirates$height,

     y = pirates$weight,

     pch = 16,

     col = transparent("skyblue", .7),

     main = "Adding a regression line to a scatterplot()")

 

# Add the regression line

abline(lm(weight ~ height, data = pirates), 

       lty = 2)

 

plot(x = height, 

     y = weight, 

     xlim = c(155, 180),

     ylim = c(65, 80),

     pch = 16,

     col = yarrr::piratepal("xmen"))

 

# Add id labels

text(x = height, 

     y = weight,

     labels = id, 

     pos = 3)            # Put labels above the points

Adding labels to points with text()

# Create the plot

plot(x = ChickWeight$Time,

     y = ChickWeight$weight, 

     col = gray(.3, .5), 

     pch = 16,

     main = "Combining text with numeric scalers using paste()")

 

# Add reference line

abline(h = mean(ChickWeight$weight),

       lty = 2)

 

# Add text

text(x = 3, 

     y = mean(ChickWeight$weight),

     labels = paste("Mean weight =", 

                    round(mean(ChickWeight$weight), 2)),

     pos = 3)

https://bookdown.org/ndphillips/YaRrr/YaRrr_files/figure-html/unnamed-chunk-302-1.png

layout(mat = matrix(c(2, 1, 0, 3),

                        nrow = 2, 

                        ncol = 2),

       heights = c(1, 2),    # Heights of the two rows

       widths = c(2, 1))     # Widths of the two columns

 

# Plot 1: Scatterplot

par(mar = c(5, 4, 0, 0))

plot(x = pirates$height, 

     y = pirates$weight,

     xlab = "height", 

     ylab = "weight", 

     pch = 16, 

     col = yarrr::piratepal("pony", trans = .7))

 

# Plot 2: Top (height) boxplot

par(mar = c(0, 4, 0, 0))

boxplot(pirates$height, xaxt = "n",

        yaxt = "n", bty = "n", yaxt = "n",

        col = "white", frame = FALSE, horizontal = TRUE)

 

# Plot 3: Right (weight) boxplot

par(mar = c(5, 0, 0, 0))

boxplot(pirates$weight, xaxt = "n",

        yaxt = "n", bty = "n", yaxt = "n",

        col = "white", frame = F)

Adding boxplots to margins of a scatterplot with layout().

cor.test(formula = ~ Zyx + Zzz3, data = b2)

 

         Pearson's product-moment correlation

 

data:  Zyx and Zzz3

t = 1.265, df = 15, p-value = 0.2252

alternative hypothesis: true correlation is not equal to 0

95 percent confidence interval:

 -0.2000037  0.6884001

sample estimates:

      cor 

0.3104886

 

 

d$ID[duplicated(d$ID)]

[1] "SLC35E2"

> which(duplicated(d$ID))

[1] 16273

> which(d$ID == "SLC35E2")

[1] 16272 16273

 

subset(x = ToothGrowth,

    subset = len > 30 & supp == "VC",

    select = c(len, dose))

 

 

Table 9.1: Functions for managing your workspace, working directory, and writing data from R as .txt or .RData files, and reading files into R

Code Description

ls()    Display all objects in the current workspace

rm(a, b, ..)   Removes the objects a, b… from your workspace

rm(list = ls())        Removes all objects in your workspace

getwd()       Returns the current working directory

setwd(file = "dir)  Changes the working directory to a specified file location

list.files()    Returns the names of all files in the working directory

write.table(x, file = "mydata.txt", sep)      writes the object x to a text file called mydata.txt. Define how the columns will be separated with sep (e.g.; sep = "," for a comma–separated file, and sep = t" for a tab–separated file).

save(a, b, .., file = "myimage.RData)        Saves objects a, b, … to myimage.RData

save.image(file = "myimage.RData")        Saves all objects in your workspace to myimage.RData

load(file = "myimage.RData") Loads objects in the file myimage.RData

read.table(file = "mydata.txt", sep, header)       Reads a text file called mydata.txt, define how columns are separated with sep (e.g. sep = "," for comma-delimited files, and sep = "t" for tab-delimited files), and whether there is a header column with header = TRUE

save(a, b, c, file = "myobjects.RData")

save.image(file = "data/projectimage.RData")

 

study1.df <- data.frame(id = 1:5, 

                        sex = c("m", "m", "f", "f", "m"), 

                        score = c(51, 20, 67, 52, 42))

 

score.by.sex <- aggregate(score ~ sex, 

                          FUN = mean, 

                          data = study1.df)

 

study1.htest <- t.test(score ~ sex, 

                       data = study1.df)

 

mydata <- read.table(file = 'data/mydata.txt',    # file is in a data folder in my working directory

                     sep = 't',                  # file is tab--delimited

                     header = TRUE,               # the first row of the data is a header row

                     stringsAsFactors = FALSE)    # do NOT convert strings to factors!!

# Read a text file from the web

fromweb <- read.table(file = 'http://goo.gl/jTNf6P',

                      sep = 't',

                      header = TRUE)

pirates <- pirates[order(pirates$sex, pirates$height),]

combined.survey <- merge(x = risk.survey,

                         y = happiness.survey,

                         by = "participant",

                         all = TRUE)

The symbol types associated with the pch plotting parameter.

https://bookdown.org/ndphillips/YaRrr/YaRrr_files/figure-html/unnamed-chunk-284-1.png

hist(x = ChickWeight$weight[ChickWeight$Diet == 1],

     main = "Two Histograms in one",

     xlab = "Weight",

     ylab = "Frequency",

     breaks = 20,

     xlim = c(0, 500),

     col = gray(0, .5))

 

hist(x = ChickWeight$weight[ChickWeight$Diet == 2],

     breaks = 30,

     add = TRUE, # Add plot to previous one!

     col = gray(1, .8))

 

 

 

 

barplot(height = 1:5,  # A vector of heights

        names.arg = c("G1", "G2", "G3", "G4", "G5"), # A vector of names

        main = "Example Barplot", 

        xlab = "Group", 

        ylab = "Height")

swim.data

##          Naked Clothed

## No Shark   2.1     1.5

## Shark      3.0     3.0

Now, when I enter this matrix as the height = swim.data argument to barplot(), I’ll get multiple bars.

barplot(height = swim.data,

        beside = TRUE,                        # Put the bars next to each other

        legend.text = TRUE,                   # Add a legend

        col = c(transparent("green", .2), 

                transparent("red", .2)),

        main = "Swimming Speed Experiment",

        ylab = "Speed (in meters / second)",

        xlab = "Clothing Condition",

        ylim = c(0, 4))

https://bookdown.org/ndphillips/YaRrr/YaRrr_files/figure-html/unnamed-chunk-289-1.png

https://cran.r-project.org/web/packages/pals/vignettes/bivariate_choropleths.html

https://rfunctions.blogspot.com/2015/03/bivariate-maps-bivariatemap-function.html

WGCNA

https://fuzzyatelin.github.io/bioanth-stats/module-F21-Group1/module-F21-Group1.html

 

d <- dat1 %>% separate_wider_delim(`X|Y`, delim = "|", names = c("X", "Y"))

https://rdrr.io/cran/DGEobj.utils/man/convertCounts.html

https://alexslemonade.github.io/refinebio-examples/03-rnaseq/pathway-analysis_rnaseq_03_gsva.html

 

library(vtable)

dd <- st(dat, group = 'Condition', group.test = TRUE)

dat1 <- dat[, 2:52] %>%

    gather(key = variable, value = value, - Condition) %>% 

    group_by(Condition, variable) %>% 

    summarise(value = list(value)) %>% 

    spread(Condition, value) %>% 

    group_by(variable) %>% 

    mutate(p_value = t.test(unlist(TT), unlist(NT))$p.value,

           t_value = t.test(unlist(TT), unlist(NT))$statistic)

library(annotables)

grch38 %>% filter(ensgene %in%  )

 

ANGIOGENESIS_lm <- lm(HALLMARK_ANGIOGENESIS ~ MFP +  _status, data = HALL_clinic_TME)

ANGIOGENESIS.II.aov <- car::Anova(ANGIOGENESIS_lm, type = 2)

 

Correlation

https://statsandr.com/blog/correlation-coefficient-and-correlation-test-in-r/

library("ggpubr")

ggscatter(CE9F, x = "CE10", y = "OS", 

          add = "reg.line", conf.int = TRUE, 

          cor.coef = TRUE, cor.method = "pearson",

          xlab = "CE10_F", ylab = "OS")

cor.test(formula = ~ CE9 + OS,

         data =  Pos_CE1,

         subset = MFP == "D")

t.test(formula = CE10 ~ MFP,

                data = CE9IE_F1)

                   

                                CE1             CE2                 CE3            CE4                 

TCGA-BA-5152 0.15368566 0.276724867 0.19438770 0.044953297 

TCGA-BA-5153 0.09342664 0.046838160 0.02334477 0.006748026 

TCGA-BA-5559 0.07732019 0.006907233 0.07898090 0.252107417 

round(cor(dat), 2)

    CE1   CE2   CE3   CE4   CE5   CE6   CE7   CE8   CE9  CE10    OS

CE1   1.00  0.12  0.20  0.07 -0.35  0.15 -0.26 -0.60 -0.26 -0.30 -0.08

CE2   0.12  1.00  0.03 -0.10 -0.34 -0.38  0.13 -0.14 -0.34 -0.52 -0.08

CE3   0.20  0.03  1.00  0.02 -0.35 -0.03 -0.35 -0.57  0.28 -0.01 -0.20

CE4   0.07 -0.10  0.02  1.00 -0.06  0.32 -0.42 -0.21 -0.27 -0.25 -0.07

CE5  -0.35 -0.34 -0.35 -0.06  1.00  0.10 -0.05  0.57 -0.19 -0.09 -0.14

CE6   0.15 -0.38 -0.03  0.32  0.10  1.00 -0.23 -0.16 -0.31  0.00  0.00

CE7  -0.26  0.13 -0.35 -0.42 -0.05 -0.23  1.00  0.48 -0.15 -0.17  0.10

CE8  -0.60 -0.14 -0.57 -0.21  0.57 -0.16  0.48  1.00 -0.19 -0.10  0.12

CE9  -0.26 -0.34  0.28 -0.27 -0.19 -0.31 -0.15 -0.19  1.00  0.54  0.07

CE10 -0.30 -0.52 -0.01 -0.25 -0.09  0.00 -0.17 -0.10  0.54  1.00  0.21

OS   -0.08 -0.08 -0.20 -0.07 -0.14  0.00  0.10  0.12  0.07  0.21  1.00

ggplot(dat) +

  aes(x = CE1, y = CE2) +

  geom_point(colour = "#0c4c8a") +

  theme_minimal()

pairs(dat[, c("CE9", "CE10", "OS")])

library(corrplot)

corrplot(cor(dat),

  method = "number",

  type = "upper" # show only upper side

)

correlation tests for whole dataset

library(Hmisc)

res <- rcorr(as.matrix(dat)) # rcorr() accepts matrices only

 

# display p-values (rounded to 3 decimals)

round(res$P, 3)

 

library(ggstatsplot)

 

ggscatterstats(

  data = dat,

  x = CE1,

  y = CE2,

  bf.message = FALSE,

  marginal = FALSE # remove histograms

)

 

library(correlation)

 

correlation::correlation(dat,

  include_factors = TRUE, method = "auto"

)

[ 打印 ]
閱讀 ()評論 (0)
評論
目前還沒有任何評論
登錄後才可評論.