R note

My R learning note.

last upate: 2016-04-07

R

update all installed packages:

library(rvcheck)
update_all()

Basic

R −−max−mem−size =1Gb
options(digits =2 )
Data type
- 字符(charactor) 它们常常被引号包围
- 数字(numeric) 实数向量
- 整数(integer) 整数向量
- 逻辑(logical) 逻辑向量( TRUE=T、 FALSE=F)
- 复数(complex) 复数a
- 列表(list) S 对象的向量
- 因子(factor ) 常用于标记样本
1 e10 == 10000000000
x[!is.na(x)]

string

split string

groups <- unlist(strsplit(args$groups, ","))

regular expression capture

> library(stringr)
> s <- "_A_123_B_"
> found <- str_match(s, "A(.+)B")
> found
     [,1]      [,2]   
[1,] "A_123_B" "_123_"
> found[,2]
[1] "_123_"

# str_match_all() to match all groups in a regex

Data Structures

Vector

v <- seq(1, 10, 2) # 1 3 5 7 9
> c(1,2,c(3:5))
[1] 1 2 3 4 5
len <- length(v) # 5
c(v[1:2], 4, v[3:5]) # add an element
y <- vector(length=10) #
for (i in seq(v) ) print(i) # seq(1, length(v))
> rep(1:3, 2)
[1] 1 2 3 1 2 3
> rep(1:3, each=2)
[1] 1 1 2 2 3 3
> any(-1:1 > 0)
[1] TRUE

# filter
> v <- c(1, 2, NA, 3, 4)
> v[v>2]
[1] NA  3  4
> subset(v, v>2)
[1] 3 4
> ifelse(is.na(v), 0, v)
[1] 1 2 0 3 4

all(x==y) # exactly equal

Matrix and Array

nrow, ncol for array. NROW NCOL　for vector
> a = array(1:6, c(3,2))
> a
     [,1] [,2]
[1,]    1    4
[2,]    2    5
[3,]    3    6
> apply(a, 1, mean)
[1] 2.5 3.5 4.5
> apply(a, 2, mean)
[1] 2 5
> a = matrix(1:6, nrow=3)
> a
     [,1] [,2]
[1,]    1    4
[2,]    2    5
[3,]    3    6
> a[,2]
[1] 4 5 6
> a[,2, drop=FALSE]  # avoid dimension reduction
     [,1]
[1,]    4
[2,]    5
[3,]    6
> colnames(a) <- c("a", "b")
> rownames(a) <- c("r1", "r2", "r3")
> a
   a b
r1 1 4
r2 2 5
r3 3 6
> a[,'b']
r1 r2 r3
 4  5  6
#  apply function over each matrix element
> outer(1:nrow(mat), 1:ncol(mat) , FUN="*")
     [,1] [,2] [,3]
[1,]    1    2    3
[2,]    2    4    6
[3,]    3    6    9
[4,]    4    8   12
[5,]    5   10   15

> outer(1:nrow(mat), 1:ncol(mat) , FUN=function(r,c) log(r+c) )
          [,1]     [,2]     [,3]
[1,] 0.6931472 1.098612 1.386294
[2,] 1.0986123 1.386294 1.609438
[3,] 1.3862944 1.609438 1.791759
[4,] 1.6094379 1.791759 1.945910
[5,] 1.7917595 1.945910 2.079442

List

> l <- list("a"=1, "b"=2)
> l
$a
[1] 1

$b
[1] 2

> l["a"]
$a
[1] 1

> l[["a"]]
[1] 1
> l$a
[1] 1
>l$c <- 4 # new element
>unlist(l)
a b c
1 2 4
> lapply(list(1:3, 5:7), mean)
[[1]]
[1] 2

[[2]]
[1] 6
> sapply(list(1:3, 5:7), mean)
[1] 2 6

# filter list
> columns <- columns[sapply(columns, function(x) unlist(strsplit(as.character(x), ".", fix=TRUE))[1] %in% groups)]

Data frame

> df <- data.frame(id=c('a','b','c'),value=c(1,2,3))
> df
id value
1 a 1
2 b 2
3 c 3
> str(df)
'data.frame':   3 obs. of  2 variables:
 $ id   : Factor w/ 3 levels "a","b","c": 1 2 3
 $ value: num  1 2 3
> df[df$value>1,]
  id value
2  b     2
3  c     3
> subset(df, value>1)
  id value
2  b     2
3  c     3
> x <- df[, c("item", "rank")]
> library(dplyr)
> filter(df, value>1)
  id value
1  b     2
2  c     3
df < df[complete.cases(df),] #
merge(d1, d3, by.x="a", by.y="a") # merge

# filter by row sum
df2 <- df2[sapply(1:nrow(df2), function(i){sum(df[i, 2:nrow(df2)])>0}) > 0, ]
subSet <- DF[apply(DF,1,function(x){sum(tail(x,-1) >= 0.01) >= 2}),]
subSet <- DF[rowSums(DF[,2:ncol(DF)] >= 0.01) >= 2,]

sort

x <- x[order(-x$rank),]

fill na

x$rank[is.na(x$rank)] <- 0

replace text in data frame

re <- function(x) {
  if (is.character(x) | is.factor(x)) {
    x <- gsub("dataset_","",x)
    x <- gsub(".fa","",x)
  }
  return(x)
}
df <- as.data.frame(lapply(df, re))

remove columns

Data$genome <- NULL  # single column

Data[1:2] <- list(NULL)  # remove multiple columns
Data[1:2] <- NULL        # does not work!

Data <- subset( Data, select = -a ) # other way
Data <- subset( Data, select = -c(d, b ) )
Data <- subset( Data, select = -c( d : b )

filter by colum names

df[ , grepl( "ABC" , names( df ) ) ]

normalising-data-within-groups

library(plyr)
# do not use dplyr the same time, it will cause runtime error!

df <-
  ddply(
    df, .(group), summarise,
    normalized.value = (value - min(value)) / (max(value) - min(value))
  )

another cool usage

mutate function

# speedup relative to CPU=1
func <- function(app, time){
  i <- app[app=1] # time of CPU==1
  return(round(time[i]/time, 1))
}
df <- df  %>%
  group_by(test, dataset) %>%
  mutate(speedup=func(app, time_mean))

# A.1 -> new gruop A
get_group <- function(sample) {
  return (unlist(strsplit(as.character(sample), ".", fixed = TRUE))[1])
}

remember group_by before mutate and summarise:

df.country.award <- df.best %>%
  group_by(region) %>%
  mutate(sum = length(award2)) %>%
  group_by(region, country, sum) %>%
  summarise(num = length(award2)) %>%
  group_by(region, country) %>%
  mutate(perc = round(num / sum, digits = 2))

Others

list <- seq(1, 10, 0.1)
repeat = rep(list, 5), repeat2 = rep(x, each=5)
y <- x[-1] except for 1
digits <- as.character(z), d <- as.integer(digits)

hashmap

g <- as.list(levels(factor(df$group)))
g.index <- seq(1,length(g))
names(g.index) <- g

Control   GROUP  GROUP2 
      1       2       3

plyr and dplyr

df = data.frame(group=c('g1','g2','g1','g2'), id=c(1, 2, 3, 4), value=c(1, 2, 4, 6))
  group id value
1    g1  1     1
2    g2  2     2
3    g1  3     4
4    g2  4     6

summary of some columns (plyr)

> ddply(df, .(group), summarise, mean=mean(value))
  group mean
1    g1  2.5
2    g2  4.0
> ddply(df, ~ group, summarise, mean=mean(value))
  group mean
1    g1  2.5
2    g2  4.0
> ddply(df, .(group, id), summarise, mean=mean(value), sd=sd(value), N=length(value), se=sd/sqrt(N))
  group id mean
1    g1  1    1
2    g1  3    4
3    g2  2    2
4    g2  4    6

reference column by index

x <- df %>%
  mutate(a = .[[3]], b = .[[5]]) %>%
  group_by(item) %>%
  mutate(rank = get.rank(item, diff, p.corrected, a, b ))

Google’ R Style Guide

variable.name is preferred, variableName is accepted
FunctionName: CalculateAvgClicks Make function names verb
space: if (debug), x[1, ]
Curly Braces R ``` if (is.null(ylim)) { ylim <- c(0, 0.06) } else {

}

- Use <-, not =, for assignment.
- General Layout and Ordering

```R
Copyright statement comment
Author comment
File description comment, including purpose of program, inputs, and outputs
source() and library() statements
Function definitions
Executed statements, if applicable (e.g., print, plot)

The possibilities for creating errors when using attach are numerous. Avoid it.
Use S3 objects and methods unless there is a strong reason to use S4 objects or methods.

Change order of factor

# keep order
df_m$variable <- factor(df_m$variable, levels=unique(df_m$variable), ordered=TRUE)

# Change the order of levels
sizes <- factor(sizes, levels = c("small", "medium", "large"))

# change the order of levels in a factor based on values in the data
iss$spray <- reorder(iss$spray, iss$count, FUN=mean)

# change the names of levels in a factor.
# Can also use quotes -- useful if there are spaces or other strange characters
revalue(sizes, c("small"="S", "medium"="M", "large"="L"))
# mapvalues() lets you use two separate vectors instead of a named vector
mapvalues(sizes, c("small", "medium", "large"), c("S", "M", "L"))

Colors

format your axis tick labels with commas

cycle palette

library(ggthemes)
library(scales)

colors <- stata_pal(scheme = "s2color")(15)
show_col(colors)

n <- 18 # give me 18 colors
show_col(rep(colors, ceiling(n/length(colors)))[1:n])

ggplot

theme

http://docs.ggplot2.org/dev/vignettes/themes.html

axis

10,000

library(scales)
scale_x_continuous(breaks = seq(0, max(df$loc), by = args$x_interval), labels = comma)

20%

library(scales)
scale_y_continuous(formatter = 'percent')

log

library(scales)
p + scale_x_log10(breaks=10^(-1:5),
　　　　　labels=trans_format("log10", math_format(10^.x))) +
　　scale_y_log10(breaks=10^(0:3),
　　　　　labels=trans_format("log10", math_format(10^.x)))

ann_text <- data.frame(strain =c("△mutL", "PAO1::sacB"), 
                       count_mean = c(1000, 15),
                       count_se = c(0, 0),
                       type=c("brown", "brown")
                       )
 geom_text(data = ann_text,label = "ND") +

wrap facet strip text

wrapit <- function(text) {
  wtext <- paste(strwrap(text,width=40),collapse=" \n ")
  return(wtext)
}

data$wrapped_text <- llply(data$text, wrapit)
data$wrapped_text <- unlist(data$wrapped_text)

legend

wrap legend text

library(scales)
i + guides(colour = guide_legend(nrow = 2))

disable some legend

guide(color=FALSE)

plot

plot text in midle of every stacked bar

df.region.medal <- df.region.medal %>%
  group_by(region) %>%
  mutate(cnum = cumsum(num) - num / 2)

p.region.medal <-
  ggplot(df.region.medal, aes(x = region, y = num, fill = medal)) +
  geom_bar(stat = "identity") +
  geom_text(aes(y = cnum, label = lable )

template

#!/usr/bin/env Rscript
library(methods)
library(proto)

library(argparse)
library(dplyr)

library(ggplot2)

library(tidyr) # library(reshape2)

parser <- ArgumentParser(description = "", formatter_class = "argparse.RawTextHelpFormatter")

parser$add_argument("infile", type = "character", help = "")
parser$add_argument(
  "--width", type = "integer", default = 5,
  help = "output image width [20]"
)
parser$add_argument(
  "--ex", metavar = "ex", type = "double", default = 1.5,
  help = "x of equation [1.5]"
)
parser$add_argument("-s", "--gc-skew", action = "store_true",
                       dest = "gc_skew", help = "only plot GC Skew")

args <- parser$parse_args()

if (args$title == "") {
  args$title = NULL
}

#----------------------------------------------------------------

df <- read.csv(args[1], sep = "\t") # check.names=FALSE # avoid adding X

df <- select(df, X.chr, strand, cnt_f0, cnt_f1, cnt_f2)
df <- subset( df, select = c(-ID, -weight) )

df <- na.omit(df)

df$group <- factor(df$group, levels = unique(df$group), ordered = TRUE)


df <- df %>% gather(time, conc, -group, -a, convert = TRUE) # tidyr
# df <- df %>% gather(type, abundance, -1)
# df_m <- melt(df, id.vars = c("X.chr", "strand")) # reshape2 # not recommended


# mean and stdev
df2 <- df %>%
  group_by(type, group) %>%
  summarise(
    se = sd(conc) / sqrt(length(conc)),
    conc = mean(conc)
  )

  
#----------------------------------------------------------------

p <- ggplot(df_m) +

  geom_boxplot(aes(variable, value, fill = strand),
               position = position_dodge(width = 0.75)) +

  geom_bar(stat="identity", position="identity", colour="black", size=0.25) +
  # geom_bar(stat="identity", position="stack")+

  geom_text(aes(label=Weight), vjust=1.5, colour="white", position=position_dodge(.9), size=3) + # label
  geom_text(aes(y = cnum, label = ifelse(
    num > 1, sprintf("%d (%.0f%%)", num, perc * 100), ""
  )) # plot conditionally
  # label = sprintf("%0.2f", round(mean.value, digits = 2))  # sprintf

  geom_line(aes(x=dose, y=length, linetype=supp), linetype="dashed", size=1, colour="blue") +

  geom_point(data = another.df, aes(variable, value, color=strand))+
  geom_point(size=4, shape=21, colour="darkred", fill="pink")
  geom_point(aes(x = group, y = count, colour = group, fill = group),
              position = position_jitterdodge(jitter.width=0.4,
                                              dodge.width=0.1)) +

  geom_vline(xintercept = markx_x2, colour="red")

  geom_hline(data=df2, aes(group = query, yintercept=mean),
              linetype=2, size=0.2)

  facet_grid(. ~ X.chr, scales = "free_y") +
  
  coord_flip() +

  scale_x_discrete(labels = c('0','1','2')) +
  scale_y_continuous(expand=c(0,0)) # delete space between bar and x axis
  scale_x_continuous(limits=c(0, args$x_limit), breaks = seq(0, args$x_limit, by = args$x_break))+
  scale_y_continuous(labels = comma) +  expand_limits(y=0) + # equal to ylim(0, max(BOD$demand))

  scale_fill_manual(values=c("#CCEEFF", "#FFDDDD"), guide=FALSE)

  xlab('Frame') +
  ylab('Counts') +
  ggtitle(args$title) +

  guides(fill = guide_legend(reverse = TRUE)) # reverse legend order, could not use color=**  in aes at the same time
  labs(color="s...", shape="...")

p <- p +
  theme_bw() +
  theme(
    # backgroud
    panel.border = element_blank(),
    panel.background = element_blank(),
    # panel.grid = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),

    # axis
    axis.line.x = element_line(colour = "black", size = 0.8),
    axis.line.y = element_line(colour = "black", size = 0.8),
    axis.text.x = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.x = element_blank(),
    axis.ticks.y = element_blank(),

    # border
    # panel.border = element_rect(colour = "black", size = 1.2, fill="transparent"),
    # axis.line.x = element_blank(),
    # axis.line.y = element_blank()

    # axis.ticks.y = element_blank(),
    # axis.title = element_blank(),
    # axis.text.x = element_text(angle=30, hjust=1, vjust=1)

    # remove y axix
    #     axis.line = element_line(colour = "black"),
    #     axis.line.y = element_line(colour = "white"),
    #     axis.ticks.y = element_blank(),
    #     axis.text.y = element_blank(),

    # legend
    legend.key = element_blank(),  
    legend.key.size = unit(0.3, "cm"),   
    legend.title = element_blank(),
    legend.text=element_text(size=12),  
    # legend.position = "none", # remove legend
    # legend.position = c(0.3, 0.2), # custom relative positon
    # legend.margin = unit(0.1, "cm"),
    legend.key.height=unit(1,"line"),
    legend.key.width=unit(1,"line"),

    # legend.background = element_rect(fill=alpha('blue', 0)), # need library(scale)    
    plot <- plot + theme(legend.position=c(1,1),legend.justification=c(1,1),
                       legend.direction="vertical",
                       legend.box="horizontal",
                       legend.box.just = c("top"),
                       legend.background = element_rect(fill="transparent"))

    # facet_grid
    strip.background = element_rect(
      colour = "white", fill = "white",
      size = 0.2
    )
    # remove strip
    #   strip.background = element_blank(),
    #   strip.text.x = element_blank(),
    #   strip.text.y = element_blank(),
    
    #   strip.text.x = element_text(angle = 0, hjust = 0), # left align
    strip.text.x = element_text(size = 12),
    strip.text.y = element_text(size = 12),

    text=element_text(size=12, family="arial", face="bold"),
    
    
    plot.margin = unit(c(0.1,0.1,0.1,0),"cm"),

    plot.title = element_text(size=16)
    # plot.title = element_text(hjust = 5)
  )

ggsave(p, file = paste(args$datafile, ".elisa.png", sep = ""),  
        width = 8, height = 4)

template 2


library(png)
library(grid)
library(ggplot2)
library(scales)
library(cowplot)
library(ggthemes)
library(ggrepel)


shenwei356.theme <-  theme_bw() +
  theme(
    panel.border = element_blank(),
    panel.background = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.line.x = element_line(colour = "black", size = 0.8),
    axis.line.y = element_line(colour = "black", size = 0.8),
    axis.ticks.x = element_line(size = 0.8),
    axis.ticks.y = element_line(size = 0.8),
    #     axis.text.x = element_text(
    #       angle = 30, hjust = 1, vjust = 1
    #     ),
    
    legend.key = element_blank(),
    legend.title = element_blank(),
    legend.text = element_text(size = 12, face = "plain"),
    legend.background = element_rect(fill = "transparent"),
    strip.background = element_rect(
      colour = "white", fill = "white",
      size = 0.2
    ),
    strip.text.x = element_text(size = 14),
    strip.text.y = element_text(size = 14),
    
    text = element_text(
      size = 14, family = "arial", face = "bold"
    ),
    plot.title = element_text(
      size = 16, family = "arial", face = "bold"
    )
  )

blank_theme <- theme(
  axis.title.x = element_blank(),
  axis.title.y = element_blank(),
  axis.text.x = element_blank(),
  axis.text.y = element_blank(),
  axis.ticks = element_blank(),
  panel.grid = element_blank(),
  axis.line = element_blank(),
  panel.background = element_blank(),
  plot.background = element_blank(),
  text = element_text(
    size = 14, family = "arial", face = "bold"
  ),
  plot.title = element_text(
    size = 18, family = "arial", face = "bold"
  )
)

plot_png <- function (file, title=NULL) {
  g <- rasterGrob(readPNG(file), interpolate = TRUE)
  df <- data.frame(x = c(0,0.5),y = c(0,0.5))
  p<- ggplot(df, aes(x = x, y = y)) +
    xlim(0,1) +
    ylim(0,1) +
    ggtitle(title) +
    annotation_custom(
      g, xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf
    ) +
    blank_theme
}

p <- plot_grid(
  plot_png("t.png"),
  plot_png("t.png"),
　　ggdraw() + draw_label(" "),　# blank
  ncol = 1,
  nrow = 2,
  labels = c("A", "B"),
  rel_heights = c(1, 1)
)
 + draw_label(
    "http://shenwei.me" ,
    angle = 35,
    size = 60,
    alpha = .1
  )  # water mark
ggsave(
  p, file = "Figure 2.png", width = 8, height = 5, dpi = 600
)

misc

recode a continuous variable to another variable

pg$wtclass <- cut(pg$weight, breaks = c(0, 5, 6, Inf))

R