R note

· Read in about 11 min · (2251 Words)

My R learning note.

last upate: 2016-04-07

R

update all installed packages:

library(rvcheck)
update_all()

Basic

  • R −−max−mem−size =1Gb
  • options(digits =2 )
  • Data type
    • 字符(charactor) 它们常常被引号包围
    • 数字(numeric) 实数向量
    • 整数(integer) 整数向量
    • 逻辑(logical) 逻辑向量( TRUE=T、 FALSE=F)
    • 复数(complex) 复数a
    • 列表(list) S 对象的向量
    • 因子(factor ) 常用于标记样本
  • 1 e10 == 10000000000
  • x[!is.na(x)]

string

split string

groups <- unlist(strsplit(args$groups, ","))

regular expression capture

> library(stringr)
> s <- "_A_123_B_"
> found <- str_match(s, "A(.+)B")
> found
     [,1]      [,2]   
[1,] "A_123_B" "_123_"
> found[,2]
[1] "_123_"

# str_match_all() to match all groups in a regex

Data Structures

Vector

v <- seq(1, 10, 2) # 1 3 5 7 9
> c(1,2,c(3:5))
[1] 1 2 3 4 5
len <- length(v) # 5
c(v[1:2], 4, v[3:5]) # add an element
y <- vector(length=10) #
for (i in seq(v) ) print(i) # seq(1, length(v))
> rep(1:3, 2)
[1] 1 2 3 1 2 3
> rep(1:3, each=2)
[1] 1 1 2 2 3 3
> any(-1:1 > 0)
[1] TRUE

# filter
> v <- c(1, 2, NA, 3, 4)
> v[v>2]
[1] NA  3  4
> subset(v, v>2)
[1] 3 4
> ifelse(is.na(v), 0, v)
[1] 1 2 0 3 4

all(x==y) # exactly equal

Matrix and Array

nrow, ncol for array. NROW NCOL for vector
> a = array(1:6, c(3,2))
> a
     [,1] [,2]
[1,]    1    4
[2,]    2    5
[3,]    3    6
> apply(a, 1, mean)
[1] 2.5 3.5 4.5
> apply(a, 2, mean)
[1] 2 5
> a = matrix(1:6, nrow=3)
> a
     [,1] [,2]
[1,]    1    4
[2,]    2    5
[3,]    3    6
> a[,2]
[1] 4 5 6
> a[,2, drop=FALSE]  # avoid dimension reduction
     [,1]
[1,]    4
[2,]    5
[3,]    6
> colnames(a) <- c("a", "b")
> rownames(a) <- c("r1", "r2", "r3")
> a
   a b
r1 1 4
r2 2 5
r3 3 6
> a[,'b']
r1 r2 r3
 4  5  6
#  apply function over each matrix element
> outer(1:nrow(mat), 1:ncol(mat) , FUN="*")
     [,1] [,2] [,3]
[1,]    1    2    3
[2,]    2    4    6
[3,]    3    6    9
[4,]    4    8   12
[5,]    5   10   15

> outer(1:nrow(mat), 1:ncol(mat) , FUN=function(r,c) log(r+c) )
          [,1]     [,2]     [,3]
[1,] 0.6931472 1.098612 1.386294
[2,] 1.0986123 1.386294 1.609438
[3,] 1.3862944 1.609438 1.791759
[4,] 1.6094379 1.791759 1.945910
[5,] 1.7917595 1.945910 2.079442

List

> l <- list("a"=1, "b"=2)
> l
$a
[1] 1

$b
[1] 2

> l["a"]
$a
[1] 1

> l[["a"]]
[1] 1
> l$a
[1] 1
>l$c <- 4 # new element
>unlist(l)
a b c
1 2 4
> lapply(list(1:3, 5:7), mean)
[[1]]
[1] 2

[[2]]
[1] 6
> sapply(list(1:3, 5:7), mean)
[1] 2 6

# filter list
> columns <- columns[sapply(columns, function(x) unlist(strsplit(as.character(x), ".", fix=TRUE))[1] %in% groups)]

Data frame

> df <- data.frame(id=c('a','b','c'),value=c(1,2,3))
> df
id value
1 a 1
2 b 2
3 c 3
> str(df)
'data.frame':   3 obs. of  2 variables:
 $ id   : Factor w/ 3 levels "a","b","c": 1 2 3
 $ value: num  1 2 3
> df[df$value>1,]
  id value
2  b     2
3  c     3
> subset(df, value>1)
  id value
2  b     2
3  c     3
> x <- df[, c("item", "rank")]
> library(dplyr)
> filter(df, value>1)
  id value
1  b     2
2  c     3
df < df[complete.cases(df),] #
merge(d1, d3, by.x="a", by.y="a") # merge

# filter by row sum
df2 <- df2[sapply(1:nrow(df2), function(i){sum(df[i, 2:nrow(df2)])>0}) > 0, ]
subSet <- DF[apply(DF,1,function(x){sum(tail(x,-1) >= 0.01) >= 2}),]
subSet <- DF[rowSums(DF[,2:ncol(DF)] >= 0.01) >= 2,]

sort

x <- x[order(-x$rank),]

fill na

x$rank[is.na(x$rank)] <- 0

replace text in data frame

re <- function(x) {
  if (is.character(x) | is.factor(x)) {
    x <- gsub("dataset_","",x)
    x <- gsub(".fa","",x)
  }
  return(x)
}
df <- as.data.frame(lapply(df, re))

remove columns

Data$genome <- NULL  # single column

Data[1:2] <- list(NULL)  # remove multiple columns
Data[1:2] <- NULL        # does not work!

Data <- subset( Data, select = -a ) # other way
Data <- subset( Data, select = -c(d, b ) )
Data <- subset( Data, select = -c( d : b )

filter by colum names

df[ , grepl( "ABC" , names( df ) ) ]

normalising-data-within-groups

library(plyr)
# do not use dplyr the same time, it will cause runtime error!

df <-
  ddply(
    df, .(group), summarise,
    normalized.value = (value - min(value)) / (max(value) - min(value))
  )

another cool usage

mutate function

# speedup relative to CPU=1
func <- function(app, time){
  i <- app[app=1] # time of CPU==1
  return(round(time[i]/time, 1))
}
df <- df  %>%
  group_by(test, dataset) %>%
  mutate(speedup=func(app, time_mean))

# A.1 -> new gruop A
get_group <- function(sample) {
  return (unlist(strsplit(as.character(sample), ".", fixed = TRUE))[1])
}

remember group_by before mutate and summarise:

df.country.award <- df.best %>%
  group_by(region) %>%
  mutate(sum = length(award2)) %>%
  group_by(region, country, sum) %>%
  summarise(num = length(award2)) %>%
  group_by(region, country) %>%
  mutate(perc = round(num / sum, digits = 2))

Others

  • list <- seq(1, 10, 0.1)
  • repeat = rep(list, 5), repeat2 = rep(x, each=5)
  • y <- x[-1] except for 1
  • digits <- as.character(z), d <- as.integer(digits)

hashmap

g <- as.list(levels(factor(df$group)))
g.index <- seq(1,length(g))
names(g.index) <- g

Control   GROUP  GROUP2 
      1       2       3 

plyr and dplyr

df = data.frame(group=c('g1','g2','g1','g2'), id=c(1, 2, 3, 4), value=c(1, 2, 4, 6))
  group id value
1    g1  1     1
2    g2  2     2
3    g1  3     4
4    g2  4     6

summary of some columns (plyr)

> ddply(df, .(group), summarise, mean=mean(value))
  group mean
1    g1  2.5
2    g2  4.0
> ddply(df, ~ group, summarise, mean=mean(value))
  group mean
1    g1  2.5
2    g2  4.0
> ddply(df, .(group, id), summarise, mean=mean(value), sd=sd(value), N=length(value), se=sd/sqrt(N))
  group id mean
1    g1  1    1
2    g1  3    4
3    g2  2    2
4    g2  4    6

reference column by index

x <- df %>%
  mutate(a = .[[3]], b = .[[5]]) %>%
  group_by(item) %>%
  mutate(rank = get.rank(item, diff, p.corrected, a, b ))

Google’ R Style Guide

  • variable.name is preferred, variableName is accepted
  • FunctionName: CalculateAvgClicks Make function names verb
  • space: if (debug), x[1, ]
  • Curly Braces R ``` if (is.null(ylim)) { ylim <- c(0, 0.06) } else {

}

- Use <-, not =, for assignment.
- General Layout and Ordering

```R
Copyright statement comment
Author comment
File description comment, including purpose of program, inputs, and outputs
source() and library() statements
Function definitions
Executed statements, if applicable (e.g., print, plot)
  • The possibilities for creating errors when using attach are numerous. Avoid it.
  • Use S3 objects and methods unless there is a strong reason to use S4 objects or methods.

Change order of factor

# keep order
df_m$variable <- factor(df_m$variable, levels=unique(df_m$variable), ordered=TRUE)
# Change the order of levels
sizes <- factor(sizes, levels = c("small", "medium", "large"))
# change the order of levels in a factor based on values in the data
iss$spray <- reorder(iss$spray, iss$count, FUN=mean)
# change the names of levels in a factor.
# Can also use quotes -- useful if there are spaces or other strange characters
revalue(sizes, c("small"="S", "medium"="M", "large"="L"))
# mapvalues() lets you use two separate vectors instead of a named vector
mapvalues(sizes, c("small", "medium", "large"), c("S", "M", "L"))

Colors

format your axis tick labels with commas

cycle palette

library(ggthemes)
library(scales)

colors <- stata_pal(scheme = "s2color")(15)
show_col(colors)

n <- 18 # give me 18 colors
show_col(rep(colors, ceiling(n/length(colors)))[1:n])

ggplot

theme

http://docs.ggplot2.org/dev/vignettes/themes.html

axis

10,000

library(scales)
scale_x_continuous(breaks = seq(0, max(df$loc), by = args$x_interval), labels = comma)

20%

library(scales)
scale_y_continuous(formatter = 'percent')

log

library(scales)
p + scale_x_log10(breaks=10^(-1:5),
     labels=trans_format("log10", math_format(10^.x))) +
  scale_y_log10(breaks=10^(0:3),
     labels=trans_format("log10", math_format(10^.x)))

plot differently in facets

ann_text <- data.frame(strain =c("△mutL", "PAO1::sacB"), 
                       count_mean = c(1000, 15),
                       count_se = c(0, 0),
                       type=c("brown", "brown")
                       )
 geom_text(data = ann_text,label = "ND") + 

wrap facet strip text

wrapit <- function(text) {
  wtext <- paste(strwrap(text,width=40),collapse=" \n ")
  return(wtext)
}

data$wrapped_text <- llply(data$text, wrapit)
data$wrapped_text <- unlist(data$wrapped_text)

legend

wrap legend text

library(scales)
i + guides(colour = guide_legend(nrow = 2))

disable some legend

guide(color=FALSE)

plot

plot text in midle of every stacked bar

df.region.medal <- df.region.medal %>%
  group_by(region) %>%
  mutate(cnum = cumsum(num) - num / 2)

p.region.medal <-
  ggplot(df.region.medal, aes(x = region, y = num, fill = medal)) +
  geom_bar(stat = "identity") +
  geom_text(aes(y = cnum, label = lable )

template

#!/usr/bin/env Rscript
library(methods)
library(proto)

library(argparse)
library(dplyr)

library(ggplot2)

library(tidyr) # library(reshape2)

parser <- ArgumentParser(description = "", formatter_class = "argparse.RawTextHelpFormatter")

parser$add_argument("infile", type = "character", help = "")
parser$add_argument(
  "--width", type = "integer", default = 5,
  help = "output image width [20]"
)
parser$add_argument(
  "--ex", metavar = "ex", type = "double", default = 1.5,
  help = "x of equation [1.5]"
)
parser$add_argument("-s", "--gc-skew", action = "store_true",
                       dest = "gc_skew", help = "only plot GC Skew")

args <- parser$parse_args()

if (args$title == "") {
  args$title = NULL
}

#----------------------------------------------------------------

df <- read.csv(args[1], sep = "\t") # check.names=FALSE # avoid adding X

df <- select(df, X.chr, strand, cnt_f0, cnt_f1, cnt_f2)
df <- subset( df, select = c(-ID, -weight) )

df <- na.omit(df)

df$group <- factor(df$group, levels = unique(df$group), ordered = TRUE)


df <- df %>% gather(time, conc, -group, -a, convert = TRUE) # tidyr
# df <- df %>% gather(type, abundance, -1)
# df_m <- melt(df, id.vars = c("X.chr", "strand")) # reshape2 # not recommended


# mean and stdev
df2 <- df %>%
  group_by(type, group) %>%
  summarise(
    se = sd(conc) / sqrt(length(conc)),
    conc = mean(conc)
  )

  
#----------------------------------------------------------------

p <- ggplot(df_m) +

  geom_boxplot(aes(variable, value, fill = strand),
               position = position_dodge(width = 0.75)) +

  geom_bar(stat="identity", position="identity", colour="black", size=0.25) +
  # geom_bar(stat="identity", position="stack")+

  geom_text(aes(label=Weight), vjust=1.5, colour="white", position=position_dodge(.9), size=3) + # label
  geom_text(aes(y = cnum, label = ifelse(
    num > 1, sprintf("%d (%.0f%%)", num, perc * 100), ""
  )) # plot conditionally
  # label = sprintf("%0.2f", round(mean.value, digits = 2))  # sprintf

  geom_line(aes(x=dose, y=length, linetype=supp), linetype="dashed", size=1, colour="blue") +

  geom_point(data = another.df, aes(variable, value, color=strand))+
  geom_point(size=4, shape=21, colour="darkred", fill="pink")
  geom_point(aes(x = group, y = count, colour = group, fill = group),
              position = position_jitterdodge(jitter.width=0.4,
                                              dodge.width=0.1)) +

  geom_vline(xintercept = markx_x2, colour="red")

  geom_hline(data=df2, aes(group = query, yintercept=mean),
              linetype=2, size=0.2)

  facet_grid(. ~ X.chr, scales = "free_y") +
  
  coord_flip() +

  scale_x_discrete(labels = c('0','1','2')) +
  scale_y_continuous(expand=c(0,0)) # delete space between bar and x axis
  scale_x_continuous(limits=c(0, args$x_limit), breaks = seq(0, args$x_limit, by = args$x_break))+
  scale_y_continuous(labels = comma) +  expand_limits(y=0) + # equal to ylim(0, max(BOD$demand))

  scale_fill_manual(values=c("#CCEEFF", "#FFDDDD"), guide=FALSE)

  xlab('Frame') +
  ylab('Counts') +
  ggtitle(args$title) +

  guides(fill = guide_legend(reverse = TRUE)) # reverse legend order, could not use color=**  in aes at the same time
  labs(color="s...", shape="...")

p <- p +
  theme_bw() +
  theme(
    # backgroud
    panel.border = element_blank(),
    panel.background = element_blank(),
    # panel.grid = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),

    # axis
    axis.line.x = element_line(colour = "black", size = 0.8),
    axis.line.y = element_line(colour = "black", size = 0.8),
    axis.text.x = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.x = element_blank(),
    axis.ticks.y = element_blank(),

    # border
    # panel.border = element_rect(colour = "black", size = 1.2, fill="transparent"),
    # axis.line.x = element_blank(),
    # axis.line.y = element_blank()

    # axis.ticks.y = element_blank(),
    # axis.title = element_blank(),
    # axis.text.x = element_text(angle=30, hjust=1, vjust=1)

    # remove y axix
    #     axis.line = element_line(colour = "black"),
    #     axis.line.y = element_line(colour = "white"),
    #     axis.ticks.y = element_blank(),
    #     axis.text.y = element_blank(),

    # legend
    legend.key = element_blank(),  
    legend.key.size = unit(0.3, "cm"),   
    legend.title = element_blank(),
    legend.text=element_text(size=12),  
    # legend.position = "none", # remove legend
    # legend.position = c(0.3, 0.2), # custom relative positon
    # legend.margin = unit(0.1, "cm"),
    legend.key.height=unit(1,"line"),
    legend.key.width=unit(1,"line"),

    # legend.background = element_rect(fill=alpha('blue', 0)), # need library(scale)    
    plot <- plot + theme(legend.position=c(1,1),legend.justification=c(1,1),
                       legend.direction="vertical",
                       legend.box="horizontal",
                       legend.box.just = c("top"),
                       legend.background = element_rect(fill="transparent"))

    # facet_grid
    strip.background = element_rect(
      colour = "white", fill = "white",
      size = 0.2
    )
    # remove strip
    #   strip.background = element_blank(),
    #   strip.text.x = element_blank(),
    #   strip.text.y = element_blank(),
    
    #   strip.text.x = element_text(angle = 0, hjust = 0), # left align
    strip.text.x = element_text(size = 12),
    strip.text.y = element_text(size = 12),

    text=element_text(size=12, family="arial", face="bold"),
    
    
    plot.margin = unit(c(0.1,0.1,0.1,0),"cm"),

    plot.title = element_text(size=16)
    # plot.title = element_text(hjust = 5)
  )

ggsave(p, file = paste(args$datafile, ".elisa.png", sep = ""),  
        width = 8, height = 4)

template 2


library(png)
library(grid)
library(ggplot2)
library(scales)
library(cowplot)
library(ggthemes)
library(ggrepel)


shenwei356.theme <-  theme_bw() +
  theme(
    panel.border = element_blank(),
    panel.background = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.line.x = element_line(colour = "black", size = 0.8),
    axis.line.y = element_line(colour = "black", size = 0.8),
    axis.ticks.x = element_line(size = 0.8),
    axis.ticks.y = element_line(size = 0.8),
    #     axis.text.x = element_text(
    #       angle = 30, hjust = 1, vjust = 1
    #     ),
    
    legend.key = element_blank(),
    legend.title = element_blank(),
    legend.text = element_text(size = 12, face = "plain"),
    legend.background = element_rect(fill = "transparent"),
    strip.background = element_rect(
      colour = "white", fill = "white",
      size = 0.2
    ),
    strip.text.x = element_text(size = 14),
    strip.text.y = element_text(size = 14),
    
    text = element_text(
      size = 14, family = "arial", face = "bold"
    ),
    plot.title = element_text(
      size = 16, family = "arial", face = "bold"
    )
  )

blank_theme <- theme(
  axis.title.x = element_blank(),
  axis.title.y = element_blank(),
  axis.text.x = element_blank(),
  axis.text.y = element_blank(),
  axis.ticks = element_blank(),
  panel.grid = element_blank(),
  axis.line = element_blank(),
  panel.background = element_blank(),
  plot.background = element_blank(),
  text = element_text(
    size = 14, family = "arial", face = "bold"
  ),
  plot.title = element_text(
    size = 18, family = "arial", face = "bold"
  )
)

plot_png <- function (file, title=NULL) {
  g <- rasterGrob(readPNG(file), interpolate = TRUE)
  df <- data.frame(x = c(0,0.5),y = c(0,0.5))
  p<- ggplot(df, aes(x = x, y = y)) +
    xlim(0,1) +
    ylim(0,1) +
    ggtitle(title) +
    annotation_custom(
      g, xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf
    ) +
    blank_theme
}

p <- plot_grid(
  plot_png("t.png"),
  plot_png("t.png"),
  ggdraw() + draw_label(" "), # blank
  ncol = 1,
  nrow = 2,
  labels = c("A", "B"),
  rel_heights = c(1, 1)
)
 + draw_label(
    "http://shenwei.me" ,
    angle = 35,
    size = 60,
    alpha = .1
  )  # water mark
ggsave(
  p, file = "Figure 2.png", width = 8, height = 5, dpi = 600
)


misc

recode a continuous variable to another variable

pg$wtclass <- cut(pg$weight, breaks = c(0, 5, 6, Inf))