---
title: "Intro to R"
author: "Artemis Louyakis"
date: "2018.02.21"
output: html_notebook
---

## Each block between the triple grave accents is a chunk
## You can use the are outside of the chunks to make notes about your project

This is a notebook with some basic R commands

## This is where you can setup your notebook environment
## For today, we'll just set a working directory for all our chunks
```{r setup}
library(knitr)
dir.create("output")
setwd("~/Desktop/mcb5472r/output") ## will only set the working directory for this chunk
opts_knit$set(root.dir = '~/Desktop/mcb5472r/output') ## will set all chunks (note this doesn't always work and I have no explanation for why)
```


```{r load libraries}
getwd() ## we'll add this test to make sure our setup script worked
library(dplyr)
library(tidyr)
library(ggplot2)
library(NOISeq)
library(edgeR)

## Super important note - some packages have functions that will cancel out other functions. Your last loaded function will be the one that's used, for example, if two functions are named the same, e.g. rename() in plyr and in dplyr are run differently, so you have to make sure to load the package you want. You can also load and unload a package within your chunk if you run into this problem
```

```{r import your data}
# We'll import a matrix of count data; this is the output from a bowtie2 alignment and salmon counting combined into a matrix
# There are lots of ways to import data 
# Because we're in the directory where the file is located, you can just call the file name. You can also call the full path. Often your file might not be in the directory that you're working in, so it's useful to just use the full path (also helps you find your data later when you lose it...)
count_data <- read.table("~/Desktop/mcb5472r/example.mat", header=T, com='', check.names=F)

# "count_data" is now an object in R that we can work with
# You should see it over in your Gllobal Environment window
# 
```

```{r}
# Let's have a look at our data
# Again, many ways to visualize
names(count_data) # will print out your column headers
head(count_data) # will show you the first few rows

# You can also click the object name in the Global Environment window to open it in a tab
# This will display a reasonable number of rows and you can scroll to see more
# If working with large datasets, this is not convenient
```


```{r modify your data}
# For some analyses, you can't have fractional data. In some instances, it's ok to round

count_data <- round(count_data, 0) # this will round all your data to the nearest whole number

# We're going to assume for this dataset that if a partcular gene is expressed fewer than 10 times across all replicates, that it is an artefact and get rid of it. This cut off is a personal preference and will change depending on your dataset and your hypothesis (you may want more or less sensitivity)
count_data <- count_data[rowSums(count_data) > 10,] 
```

```{r tell R about your data}
# This transcriptome data has 4 samples - a00, a08, b00, and b08
# a and b correspond to replicates a and b; 00 and 08 correspond to time points at 0 hours and 8 hours
# We need to tell R what our replicates are and what are time points are
# We'll do this two ways - first by setting up factors and then by making a list

## factors are made by first making a dataframe with the information you want; this example has way more information than we'll need

myfactors = data.frame("rep" = substr(colnames(count_data), start=1, stop=1),
                       "time" = substr(colnames(count_data), start=2, stop=3),
                       "samp" = substr(colnames(count_data), start=1, stop=3),
                       "heading" = substr(colnames(count_data), start=1, stop=9))
rownames(myfactors) <- colnames(count_data) # this will transform the header from your data (sample ids) to the rownames of your factor file
save(myfactors, file = "myfactors_mate.RData") # this will save your object as RData so you can reload it without having to remake every time

# Now we'll look at the factors by clicking on it in Global Environment
# What kind of object is this? This is important to know when using certain functions to analyze your data
class(myfactors)


# lists are also very useful for a billion reasons
# we'll make a bunch of different lists
samps <- c("a00","a08","b00","b08")
time <- c("00","08")
rep1 <- c("a00_quant","a08_quant")
rep2 <- c("b00_quant","b08_quant")

# What kind of objects are your lists?
class(samps)
```


```{r make a basic plot}
# Now let's make a basic plot. We want to know how nicely our replicates correspond to one another

plot(count_data) # basic r plot of all our data
plot(count_data$a00_quant,count_data$b00_quant) # r plot of hour 0 replicates
plot(count_data$a00_quant,count_data$b00_quant, xlim=c(0, 1000), ylim=c(0, 1000)) # let's zoom in

# now with ggplot2's quickplot function

qplot(a00_quant,b00_quant,data=count_data) + 
          geom_point() + #adds points
          xlim(c(0,1000)) + ylim(c(0,1000)) + #reduces axes lengths to zoom in on majority of points
          labs(x = "rep1") + labs(y = "rep2") + #renames labels from column headers
          theme_bw() + #removes grey background
          theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), aspect.ratio=1) + #removes grid and makes square
          geom_smooth(method='loess', level=0.95, aes(fill = 'confidence'), alpha = 0.5) #adds regression line and 95% ci

ggplot(aes(a00_quant,b00_quant),data=count_data) + 
          geom_point() + #adds points
          xlim(c(0,1000)) + ylim(c(0,1000)) + #reduces axes lengths to zoom in on majority of points
          labs(x = "rep1") + labs(y = "rep2") + #renames labels from column headers
          theme_bw() + #removes grey background
          theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), aspect.ratio=1) + #removes grid and makes square
          geom_smooth(method='loess', level=0.95, aes(fill = 'confidence'), alpha = 0.5) #adds regression line and 95% ci
```



```{r normalizing data and making PCA plots}
# Principal Component Analysis plot - normally used to identify clustering patterns of biological and technical replicates; 
# ran on data without replicates to see if any patterns stood out based on nutrient source or time;
# differences seem to emerge between the two treatments over time
# uses edger and noiseq packages

tpmMatrix = cpm(count_data) # calculate counts per million
class(tpmMatrix) # what type of object is this? we need a matrix
rnaseqMatrix = as.matrix(tpmMatrix) # turn that object into a matrix
rnaseqMatrix = round(rnaseqMatrix) # round to the nearest whole number; for this analysis, it would have made more sense to bring in the raw counts (initial example.mat file) and calculate cpm, then round. Now we've really over manipulated the data
exp_study = DGEList(counts=rnaseqMatrix, group=factor(colnames(rnaseqMatrix))) # create an edger object - edger needs to look at the matrix in a particular format and this will make an object just for that program
exp_study = calcNormFactors(exp_study, method = "TMM") # calculates your normalization factors and stores them in your edger object
exp_study$samples$eff.lib.size = exp_study$samples$lib.size * exp_study$samples$norm.factors # transform your data by multiplying your counts by the normalization factor
tmm_table = cpm(exp_study) # this will put your data back in counts per million; I don't know how valid this step is
mydata2 = readData(tmm_table, myfactors) # using noiseq, bring in your data and your factor table metadata from earlier
myPCA = dat(mydata2, type = "PCA") # create your pca plot object
explo.plot(myPCA, factor = "rep") # plot the object and color by factor "rep"
explo.plot(myPCA, factor = "time") # plot the object and color by factor "time"
explo.plot(myPCA, factor = "samp") # plot the object and color by factor "samp"
```


```{r do some basic stats}
## correlation test for replicates
dir.create("stats")
setwd("~/Desktop/mcb5472r/output/stats")

# pearson correlation coefficient is used to identify how strong of a linear correlation you have and whether it's positive or negative (direction); for transcriptome data replicates, you want a positive linear relationship because gene expression counts in one replicate should be equivalent to the other replicate in ratio, so linear increase at a constant rate (see the plot between replicates that you made earlier)
pear_corr <- cor(count_data, use="all.obs",method="pearson")
pear_corr
write.table(pear_corr, file = "pear_corr.txt", sep = "\t", quote = FALSE)

# spearman correlation coefficient is used to identify both how strong and the direction of a monotonic relationship, so it's nonparametric; in transcriptome data, some genes may increase in expression level in one treatment, but decrease in another, while other genes increase or decrease expression in both treatments, so there may be a linear relationship, but it isn't at a constant rate
spear_corr <- cor(count_data, use="all.obs",method="spearman")
spear_corr
write.table(spear_corr, file = "spear_corr.txt", sep = "\t", quote = FALSE)
```

```{r complete differential expression analysis on all pairs}
# to use noiseq for your differential expression analysis, you must first make an object that noiseq can read; this is just like we did for edger, but makes an object specifically for noiseq
dds = NOISeq::readData(data = count_data, factors = myfactors)
save(dds, file ="mydata_mate.RData") # saving objects as RData will allow you to load them back in later

# i'm going to show you how to do this in a loop because this will be the most useful way to run this
# don't be scared, we'll go through it step by step
for(s1 in time){ # first for loop saying to look at each variable in the list "time" one at a time
  for(s2 in time){ # a second for loop is bringing the same list in as a second variable
    if(s1!=s2){ # we don't want to compare samples to themselves, so an if statement to ignore comparisons that match
      if(s1<s2){ # we also don't want to do a comparison twice (e.g. t00 vs t08 and t08 vs t00)
        print(c(s1,s2)) # print the names of the variables to our screen, so we can be sure we're comparing what we want
        tmm = noiseqbio(dds, norm = "tmm", k = 0.5, factor = "time", conditions = c(s1,s2), 
                             r = 50, filter = 1, nclust = 15, a0per= 0.9, random.seed = 12345, plot = TRUE) # run DE analysis
        tmm_r <- as.data.frame(tmm@results, row.names = NULL) # put the DE results into an object
        save(tmm_r, file = paste0("mate_",s1,"vs",s2,"_tmm.RData")) # save the DE results as RData, so we can bring in later
        write.table(tmm@results, paste0("mate_",s1,"vs",s2,"_tmm.txt"), sep = "\t", row.names = TRUE, col.names = NA, quote = FALSE) # save the results as a table; this could now be examined by using a text editor, MS Excel, etc. 
        tmm_r <- na.omit(tmm_r) # remove all lines that did not contain an expression value for both treatments
        tmm_r$P <- with(tmm_r, 1-prob) # add new column called "P" that uses the column "prob" subtracted from 1
        tmm_r$gene <- row.names(tmm_r) # add a new column "gene" that contains the rownames of the output dataframe
        tmm_r$EFFECTSIZE <- tmm_r$log2FC # add a new column "EFFECTSIZE" that's a copy of the data in the column "log2FC"
        write.table(tmm_r, paste0("mate_",s1,"vs",s2,"_tmm_filtered.txt"), sep = "\t", row.names = TRUE, col.names = NA, quote = FALSE) # write a new table that contains all your filters and edits from the last 4 lines
}}}}
# Note that you are using your variable in your functions and also to write your tables to files; using variables in for loops is a very helpful tool when analyzing multiple datasets that contain similar data or, like this example, comparing multiple things within the dataset
```

```{r volcano plots}
# volcano plots are a common way to visualize differential expression data
# you can run this in conjunction with your de analysis
# i like to run it separately so that i can tweak the plot later for papers or do visualize differently
# again, we'll use a loop even though we only have one pair because you will most likely have multiple pairs
# note that in this script, we're using the variables to read in each of the tables we made in the previous loop
# to use variables like this, you'll have to use paste or paste0 depending on what sort of argument it is

for(s1 in time){
  for(s2 in time){
    if(s1!=s2){
      if(s1<s2){
        print(c(s1,s2))
        tmm_r <- read.delim(paste0("mate_",s1,"vs",s2,"_tmm_filtered.txt"), header=TRUE, row.names = NULL, com='', check.names=F)
        plot(tmm_r$log2FC, -log10(1-tmm_r$prob), pch=20, xlim=c(-7,7), ylim=c(0,16), main = paste(s1,"vs",s2), 
          xlab = "log2FC", ylab = "-log10(p-value)",
            col = ifelse(abs(tmm_r$log2FC)>5 & tmm_r$prob>0.999,"red",
              ifelse(abs(tmm_r$log2FC)>5,"orange",
                ifelse(tmm_r$prob>0.99,"green","black"))))
}}}}
# you can use the help menu to see available options for the plot function;
# in this plot, we did the following:
# select the column log2FC for our x-axis, calculate the -log10 of 1-(the probability) column (could have also used -log10(P) for this), pch is used to change the size of our points, xlim and ylim give the limits for each axis, main is the title of our plot and uses the variables, xlab and ylab label our axes
# the bottom three lines are using a nested ifelse statement to color the points on the plot based on cutoffs;
# the order you place these is important - the first one will be the dominant command (the "if") and the second will happen only if the first criteria isn't met (the "else") and that applies within the ifelse statement and between the nests of statements
```




```{r}
# this code was added in during class and contains an interactive volcano plot using a package called "manhattanly",
# it uses another package called "plotly" to let you input information and scroll over the points to get information about it
# it is very similar to the other volcano plot, but uses the syntax required for this function; the information is the same
# it requires you first make an object with the function "volcanor", then make the plot with function "volcanoly", 
# and finally you'll print the plot to your notebook output
# you can add more columns of metadata to your input file to get more columns of metadata into your interactive hover

library(manhattanly)
for(s1 in time){
  for(s2 in time){
    if(s1!=s2){
      if(s1<s2){
        print(c(s1,s2))
        tmm_r <- read.delim(paste0("mate_",s1,"vs",s2,"_tmm_filtered.txt"), header=TRUE, row.names = NULL, com='', check.names=F)
        volc_obj <- volcanor(tmm_r, p = "P", effect_size = "EFFECTSIZE", gene = "gene")
        volc_plot <- volcanoly(volc_obj, effect_size_line = c(-5,5), effect_size_line_color = "orange", 
                  genomewideline = -log10(1e-3), genomewideline_color = "green", title = paste(s1,"vs",s2))
        print(volc_plot)
}}}}
```






## the end ##