Skip to content

Subsetting Data

Selecting (Keeping) Variables

# Example data set: a data frame with twelve variables (columns) v1 ... v12.
# Selecting by name only works on a named object; indexing an unnamed
# character vector by name would return nothing but NA.
mydata <- as.data.frame(matrix(seq_len(36), nrow = 3))
names(mydata) <- paste0("v", 1:12)

# select variables v1, v2, v3
myvars <- c("v1", "v2", "v3")
newdata <- mydata[myvars]

# another method: build the same column names programmatically
myvars <- paste0("v", 1:3)
newdata <- mydata[myvars]

# select 1st and 5th thru 10th variables
newdata <- mydata[c(1, 5:10)]

Excluding (Dropping) Variables

# Example data set: a data frame with twelve variables (columns) v1 ... v12.
# names() must return the column names for the exclusion mask below to work;
# on an unnamed vector names() is NULL and the mask would be empty.
mydata <- as.data.frame(matrix(seq_len(36), nrow = 3))
names(mydata) <- paste0("v", 1:12)

# exclude variables v1, v2, v3
# myvars is a logical mask: TRUE for every column that should be dropped
myvars <- names(mydata) %in% c("v1", "v2", "v3")
newdata <- mydata[!myvars]

# exclude 3rd and 5th variable (negative indices drop columns by position)
newdata <- mydata[c(-3, -5)]

# delete variables v3 and v5
# assigning NULL to a column removes it from mydata itself
mydata$v3 <- mydata$v5 <- NULL

Selecting Observations

# Example data set: three observations with a gender and an age.
mydata <- data.frame(gender = c("F", "M", "F"), age = c(66, 55, 22))

# first 5 observations
# head() returns every row when fewer than 5 exist, whereas mydata[1:5, ]
# would pad the result with all-NA rows.
newdata <- head(mydata, 5)

# based on variable values
# which() turns the logical mask into row positions and drops any NA
newdata <- mydata[which(mydata$gender == "F" & mydata$age > 65), ]

# or, without repeating the data frame name
# with() evaluates the expression using mydata's columns and, unlike
# attach()/detach(), leaves the search path untouched.
newdata <- with(mydata, mydata[which(gender == "F" & age > 65), ])

Selection Using the subset() Function

# Example data set: four observations with sex, age, an ID, weight and income.
mydata <- data.frame(
  sex    = c("m", "f", "m", "m"),
  age    = c(8, 15, 29, 33),
  ID     = c(1:4),
  weight = c(60, 70, 80, 83),
  income = c(0, 50, 100, 200)
)

# subset(): keep rows whose age is at least 20 or below 10,
# and only the ID and weight columns
newdata <- subset(
  mydata,
  subset = age >= 20 | age < 10,
  select = c(ID, weight)
)

# subset() again: keep rows for sex "m" older than 25,
# and the run of columns from weight through income
newdata <- subset(
  mydata,
  subset = sex == "m" & age > 25,
  select = weight:income
)

Random Samples

# Example data set: 55 observations of a single, explicitly named variable.
mydata <- data.frame(id = 1:55)

# take a random sample of size 50 from a dataset mydata
# sample without replacement
# sample.int() draws row positions directly from the row count;
# drop = FALSE keeps the result a data frame even though mydata has only
# one column (otherwise it would collapse to a bare vector).
mysample <- mydata[sample.int(nrow(mydata), size = 50, replace = FALSE), ,
                   drop = FALSE]