---
title: "Intro to R"
output: html_notebook
author: Artemis Louyakis
date: 2018.02.21
---

## Each block between the triple grave accents is a chunk
## You can use the are outside of the chunks to make notes about your project

This is a notebook with some basic R commands

## This is where you can setup your notebook environment
## For today, we'll just set a working directory for all our chunks
```{r setup}
library(knitr)
dir.create("output")
setwd("~/Desktop/mcb5472r/output") ## will only set the working directory for this chunk
opts_knit$set(root.dir = '~/Desktop/mcb5472r/output') ## will set all chunks (note this doesn't always work and I have no explanation for why)
```


```{r load libraries}
getwd() ## we'll add this test to make sure our setup script worked
library(dplyr)
library(tidyr)
library(ggplot2)
library(NOISeq)

## Super important note - some packages have functions that will cancel out other functions. Your last loaded function will be the one that's used, for example, if two functions are named the same, e.g. rename() in plyr and in dplyr are run differently, so you have to make sure to load the package you want. You can also load and unload a package within your chunk if you run into this problem
```

```{r import your data}
# We'll import a matrix of count data; this is the output from a bowtie2 alignment and salmon counting combined into a matrix
# There are lots of ways to import data 
# Because we're in the directory where the file is located, you can just call the file name. You can also call the full path. Often your file might not be in the directory that you're working in, so it's useful to just use the full path (also helps you find your data later when you lose it...)
count_data <- read.table("~/Desktop/mcb5472r/example.mat", header=T, com='', check.names=F)

# "count_data" is now an object in R that we can work with
# You should see it over in your Gllobal Environment window
# 
```

```{r}
# Let's have a look at our data
# Again, many ways to visualize
names(count_data) # will print out your column headers
head(count_data) # will show you the first few rows

# You can also click the object name in the Global Environment window to open it in a tab
# This will display a reasonable number of rows and you can scroll to see more
# If working with large datasets, this is not convenient
```


```{r modify your data}
# For some analyses, you can't have fractional data. In some instances, it's ok to round

count_data <- round(count_data, 0) # this will round all your data to the nearest whole number

# We're going to assume for this dataset that if a partcular gene is expressed fewer than 10 times across all replicates, that it is an artefact and get rid of it. This cut off is a personal preference and will change depending on your dataset and your hypothesis (you may want more or less sensitivity)
count_data <- count_data[rowSums(count_data) > 10,] 
```

```{r tell R about your data}
# This transcriptome data has 4 samples - a00, a08, b00, and b08
# a and b correspond to replicates a and b; 00 and 08 correspond to time points at 0 hours and 8 hours
# We need to tell R what our replicates are and what are time points are
# We'll do this two ways - first by setting up factors and then by making a list

## factors are made by first making a dataframe with the information you want; this example has way more information than we'll need

myfactors = data.frame("rep" = substr(colnames(count_data), start=1, stop=1),
                       "time" = substr(colnames(count_data), start=2, stop=3),
                       "samp" = substr(colnames(count_data), start=1, stop=3),
                       "heading" = substr(colnames(count_data), start=1, stop=9))
rownames(myfactors) <- colnames(count_data) # this will transform the header from your data (sample ids) to the rownames of your factor file
save(myfactors, file = "myfactors_mate.RData") # this will save your object as RData so you can reload it without having to remake every time

# Now we'll look at the factors by clicking on it in Global Environment
# What kind of object is this? This is important to know when using certain functions to analyze your data
class(myfactors)


# lists are also very useful for a billion reasons
# we'll make a bunch of different lists
samps <- c("a00","a08","b00","b08")
time <- c("00","08")
rep1 <- c("a00_quant","a08_quant")
rep2 <- c("b00_quant","b08_quant")

# What kind of objects are your lists?
class(samps)
```


```{r make a basic plot}
# Now let's make a basic plot. We want to know how nicely our replicates correspond to one another

plot(count_data) # basic r plot of all our data
plot(count_data$a00_quant,count_data$a08_quant) # r plot of hour 0 replicates
plot(count_data$a00_quant,count_data$a08_quant, xlim=c(0, 1000), ylim=c(0, 1000)) # let's zoom in

# now with ggplot2's quickplot function

qplot(a00_quant,b00_quant,data=count_data) #+ 
          geom_point() #+ #adds points
          xlim(c(0,1000)) + ylim(c(0,1000)) #+ #reduces axes lengths to zoom in on majority of points
          labs(x = "rep1") + labs(y = "rep2") #+ #renames labels from column headers
          theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), aspect.ratio=1) #+ #removes grid and makes square
          theme_bw() #+ #removes grey background
          geom_smooth(method='loess', level=0.95, aes(fill = 'confidence'), alpha = 0.5) #adds regression line and 95% ci
```


```{r normalizing data and making PCA plots}
# Principal Component Analysis plot - normally used to identify clustering patterns of biological and technical replicates; 
# ran on data without replicates to see if any patterns stood out based on nutrient source or time;
# differences seem to emerge between the two treatments over time
tpmMatrix = cpm(count_data) 
rnaseqMatrix = as.matrix(tpmMatrix)
rnaseqMatrix = round(rnaseqMatrix)
exp_study = DGEList(counts=rnaseqMatrix, group=factor(colnames(rnaseqMatrix)))
exp_study = calcNormFactors(exp_study, method = "TMM")
exp_study$samples$eff.lib.size = exp_study$samples$lib.size * exp_study$samples$norm.factors
tmm_table = cpm(exp_study)
mydata2 = readData(tmm_table, myfactors)
myPCA = dat(mydata2, type = "PCA")
explo.plot(myPCA, factor = "rep")
explo.plot(myPCA, factor = "time")
explo.plot(myPCA, factor = "samp")
```


```{r do some basic stats}
## correlation test for replicates
dir.create("stats")
setwd("~/Desktop/mcb5472r/output/stats")

pear_corr <- cor(count_data, use="all.obs",method="pearson")
pear_corr
write.table(pear_corr, file = "pear_corr.txt", sep = "\t", quote = FALSE)

spear_corr <- cor(count_data, use="all.obs",method="spearman")
spear_corr
write.table(spear_corr, file = "spear_corr.txt", sep = "\t", quote = FALSE)
```

```{r complete differential expression analysis on all pairs}
# to use noiseq for your differential expression analysis, you must first make an object that noiseq can read
dds = NOISeq::readData(data = count_data, factors = myfactors)
save(dds, file ="mydata_mate.RData")

# i'm going to show you how to do this in a loop because this will be the most useful way to run this
# don't be scared, we'll go through it step by step
for(s1 in time){
  for(s2 in time){
    if(s1!=s2){
      if(s1<s2){
        print(c(s1,s2))
        tmm = noiseqbio(dds, norm = "tmm", k = 0.5, factor = "time", conditions = c(s1,s2), 
                             r = 50, filter = 1, nclust = 15, a0per= 0.9, random.seed = 12345, plot = TRUE)
        tmm_r <- as.data.frame(tmm@results, row.names = NULL)
        save(tmm_r, file = paste0("mate_",s1,"vs",s2,"_tmm.RData"))
        write.table(tmm@results, paste0("mate_",s1,"vs",s2,"_tmm.txt"), sep = "\t", row.names = TRUE, col.names = NA, quote = FALSE)
        tmm_r <- na.omit(tmm_r)
        tmm_r$P <- with(tmm_r, 1-prob)
        tmm_r$gene <- row.names(tmm_r)
        tmm_r$EFFECTSIZE <- tmm_r$M
        write.table(tmm_r, paste0("mate_",s1,"vs",s2,"_tmm_filtered.txt"), sep = "\t", row.names = TRUE, col.names = NA, quote = FALSE)
}}}}
```

```{r volcano plots}
# volcano plots are a common way to visualize differential expression data
# you can run this in conjunction with your de analysis
# i like to run it separately so that i can tweak the plot later for papers or do visualize differently
# again, we'll use a loop even though we only have one pair because you will most likely have multiple pairs

for(s1 in time){
  for(s2 in time){
    if(s1!=s2){
      if(s1<s2){
        print(c(s1,s2))
        tmm_r <- read.delim(paste0("mate_",s1,"vs",s2,"_tmm_filtered.txt"), header=TRUE, row.names = NULL, com='', check.names=F)
        plot(tmm_r$log2FC, -log10(1-tmm_r$prob), pch=20, xlim=c(-7,7), ylim=c(0,16), main = paste(s1,"vs",s2), 
          xlab = "log2FC", ylab = "-log10(p-value)",
            col = ifelse(abs(tmm_r$log2FC)>5 & tmm_r$prob>0.999,"red",
              ifelse(abs(tmm_r$log2FC)>5,"orange",
                ifelse(tmm_r$prob>0.99,"green","black"))))
}}}}
```


## the end ##