# Intro til R, Syntaks fra Owen: the R-guide (2010) # 1. Overview and History dieroll <- c(2,5,1,6,5,5,4,1) dieroll ls() newdieroll <- dieroll/2 # divide every element by two newdieroll ls() rm(newdieroll) # this was a silly variable anyway ls() help(log) ?log log(100) log2(16) # same as log(16,base=2) or just log(16,2) log(1000,base=10) # same as log10(1000) log2(c(1,2,3,4)) # log base 2 of the vector (1,2,3,4) apropos("norm") ??norm # ES: utvidet søk matrix() a <- c(1,2,3,4,5,6,7,8) A <- matrix(a,nrow=2,ncol=4, byrow=FALSE) # a is different from A A A <- matrix(a,2,4) # 2. Basic math 2+3 3/2 2^3 # this also can be written as 2**3 4^2 - 3*2 # this is simply 16 - 6 (56-14)/6 – 4*7*10/(5^2-5) # this is more complicated sqrt(2) abs(2-4) cos(4*pi) log(0) # not defined factorial(6) # 6! choose(52,5) # this is 52!/(47!*5!) x <- c(1,2,3,4) y <- c(5,6,7,8) x*y y/x y-x x^y cos(x*pi) + cos(y*pi) s <- c(1,1,3,4,7,11) length(s) sum(s) # 1+1+3+4+7+11 prod(s) # 1*1*3*4*7*11 cumsum(s) diff(s) # 1-1, 3-1, 4-3, 7-4, 11-7 diff(s, lag = 2) # 3-1, 4-1, 7-3, 11-4 a <- c(1,2,3,4,5,6,7,8,9,10) A <- matrix(a, nrow = 5, ncol = 2) # fill in by column A B <- matrix(a, nrow = 5, ncol = 2, byrow = TRUE) # fill in by row B C <- matrix(a, nrow = 2, ncol = 5, byrow = TRUE) t(C) # this is the same as A!! B%*%C D <- C%*%B det(D) solve(D) # this is D-1 # 3. Getting data into R mykids <- c("Stephen", "Christopher") # put text in quotes mykids 1:9 1.5:10 # you won’t get to 10 here c(1.5:10,10) # we can attach it to the end this way prod(1:8) # same as factorial(8) seq(1,5) # same as 1:5 seq(1,5,by=.5) # increment by 0.5 rep(10,10) # repeat the value 10 ten times rep(c("A","B","C","D"),2) matrix(rep(0,16),nrow=4) # a 4x4 matrix of zeroes x <- scan() # read what is typed into the variable x passengers <- scan() passengers passengers <- scan("passengers.txt") # ligger på NTNU/IntroR new.data <- data.frame() # creates an "empty" data frame new.data <- edit(new.data) # request that changes made are written to data frame fix(new.data) # changes saved automatically seatbelt <- c("Y","N","Y","Y","Y","Y","Y","Y","Y","Y", # return + "N","Y","Y","Y","Y","Y","Y","Y","Y","Y","Y","Y","Y", # return + "Y","Y","N","Y","Y","Y","Y") seatbelt car.dat <- data.frame(passengers,seatbelt) car.dat data() data(trees) trees trees$Height sum(trees$Height) # sum of just these values trees[4,3] # entry at forth row, third column trees[4,] # get the whole forth row attach(trees) Height search() attributes(trees) Height[Height > 75] smith <- read.table(file.choose(), header=T) # hent inn 'Ros.t.11.1.dat' som ligger på NTNU/IntroR # 4. Graphics plot(Height, Volume) # object trees is in the search path curve(sin(x), from = 0, to = 2*pi) par(mfrow = c(2, 2)) # gives a 2 x 2 layout of plots par(lend = 1) # gives "butt" line end caps for line plots2 par(bg = "cornsilk") # plots drawn with this colored background par(xlog = TRUE) # always plot x axis on a logarithmic scale oldpar <- par(no.readonly = TRUE) par(oldpar) # default (original) parameter settings restored # 5. Summarizing Data data(mtcars) # load in dataset attach(mtcars) # add mtcars to search path mtcars mean(hp) var(mpg) quantile(qsec, probs = c(.20, .80)) cor(wt,mpg) table(cyl) table(cyl)/length(cyl) # note: length(cyl) = 32 barplot(table(cyl)/length(cyl)) # use relative frequencies on the y-axis ?hist data(faithful) attach(faithful) hist(eruptions, main = "Old Faithful data", prob = T) hist(eruptions, main = "Old Faithful data", prob = T, breaks=18) stem(waiting) boxplot(faithful) plot(ecdf(x)) qqnorm(x) # creates the NPP for values stored in x qqline(x) # adds a reference line for the NPP # 6. Probability, Distributions, and Simulation x <- rnorm(100) # simulate 100 standard normal RVs, put in x w <- rexp(1000,rate=.1) # simulate 1000 from Exp( theta = 10, rate = 0.1) dbinom(3,size=10,prob=.25) # P(X = 3) for X ~ Bin(n=10, p=.25) dpois(0:2, lambda=4) # P(X=0), P(X=1), P(X=2) for X ~ Poisson pbinom(3,size=10,prob=.25) # P(X . 3) in the above distribution pnorm(12,mean=10,sd=2) # P(X . 12) for X~N(mu = 10, sigma = 2) qnorm(.75,mean=10,sd=2) # 3rd quartile of N(mu = 10,sigma = 2) qchisq(.10,df=8) # 10th percentile of chi-square (8) qt(.95,df=20) # 95th percentile of t(20) u <- runif(1000000, min=0, max=pi/2) pi/2*mean(4*sin(2*u)*exp(-u^2)) # a=0, b=pi/2, so b–a=pi/2 u <- runif(1000000, min=0, max=pi/2) # generate a new uniform vector pi/2*mean(4*sin(2*u)*exp(-u^2)) # a=0, b=pi/2, so b–a=pi/2 x <- 0:10 y <- dbinom(x, size=10, prob=.25) # evaluate probabilities plot(x, y, type = "h", lwd = 30, main = "Binomial Probabilities w/ n = 10, p = .25", col = "gray", lend = 'butt') plot(0:10, dbinom(x, size=10, prob=.25), type = "h", lwd = 30, lend = 'butt', col = 'gray') curve(dnorm(x), from = -3, to = 3) # the standard normal curve curve(pnorm(x, mean=10, sd=2), from = 4, to = 16) # a normal CDF simdata <- rexp(1000, rate=.1) hist(simdata, prob = T, breaks = "FD", main="Exp(theta = 10) RVs") curve(dexp(x, rate=.1), add = T) # overlay the density curve sample(1:100, 1) # choose a number between 1 and 100 sample(1:6, 10, replace = T) # toss a fair die 10 times sample(1:6, 10, T, c(.6,.2,.1,.05,.03,.02)) # not a fair die!! urn <- c(rep("red",8),rep("blue",4),rep("yellow",3)) sample(urn, 6, replace = F) # draw 6 balls from this urn # 7. Statistical Methods ?t.test data(trees) t.test(trees$Height, mu = 70) drug <- c(15, 10, 13, 7, 9, 8, 21, 9, 14, 8) plac <- c(15, 14, 12, 8, 14, 7, 16, 10, 15, 12) t.test(drug, plac, alternative = "less", var.equal = T) add <- c(24.6, 18.9, 27.3, 25.2, 22.0, 30.9) noadd <- c(23.8, 17.7, 26.6, 25.1, 21.6, 29.6) t.test(add, noadd, paired=T, alt = "greater") aov(x ~ a) # one-way ANOVA model aov(x ~ a + b) # two-way ANOVA with no interaction aov(x ~ a + b + a:b) # two-way ANOVA with interaction aov(x ~ a*b) # exactly the same as the above Str <- c(3225,3320,3165,3145,3220,3410,3320,3370,3545,3600,3580,3485) Type <- c(rep("A",4), rep("B",4), rep("C",4)) Type <- factor(Type) # consider these alternative expressions6 Type levels(Type) tapply(Str,Type,mean) tapply(Str,Type,var) boxplot(Str ~ Type, horizontal = T, xlab="Strength", col = "gray") anova.fit <- aov(Str ~ Type) summary(anova.fit) TukeyHSD(anova.fit) # the default is 95% CIs for differences plot(TukeyHSD(anova.fit)) lm(y ~ x) # simple linear regression (SLR) model lm(y ~ x1 + x2) # a regression plane lm(y ~ x1 + x2 + x3) # linear model with three regressors lm(y ~ x – 1) # SLR w/ an intercept of zero lm(y ~ x + I(x^2)) # quadratic regression model lm(y ~ x + I(x^2) + I(x^3)) # cubic model fit <- lm(dist ~ speed) # cars must be loaded and attached attributes(fit) fit summary(fit) anova(fit) confint(fit) plot(speed, dist) # first plot the data abline(fit) # could also type abline(-17.579, 3.932) plot(speed, fit$residuals) abline(0,0) # add a reference line at 0 predict.lm(fit, newdata=data.frame(speed=c(15,22)),int="conf") predict.lm(fit,newdata=data.frame(speed=c(15,22)),int="predict") ?chisq.test counts <- c(43, 49, 56, 45, 66, 41) probs <- rep(1/6, 6) chisq.test(counts, p = probs) color.blind <- matrix(c(442, 514, 38, 6), nrow=2, byrow=T) color.blind dimnames(color.blind) <- list(c("normal","c-b"),c("Male","Female")) chisq.test(color.blind, correct=F) # no correction for this one out <- chisq.test(color.blind, correct=F) attributes(out) out$expected # these are the expected counts