@fanxy
2018-09-18T01:57:24.000000Z
字数 5725
阅读 3156
樊潇彦
复旦大学经济学院
数量软件
# P69 创建leadership数据框
manager <- c(1,2,3,4,5)
date <- c("10/24/08","10/28/08","10/1/08","10/12/08","5/1/09")
gender <- c("M","F","F","M","F")
age <- c(32,45,25,39,99)
q1 <- c(5,3,3,3,2)
q2 <- c(4,5,5,3,2)
q3 <- c(5,2,5,4,1)
q4 <- c(5,5,5,NA,2)
q5 <- c(5,5,2,NA,1)
leadership <- data.frame(manager,date,gender,age,q1,q2,q3,q4,q5,
stringsAsFactors=FALSE)
# P70 创建新变量
mydata<-data.frame(x1 = c(2, 2, 6, 4),
x2 = c(3, 4, 2, 8))
mydata$sumx <- mydata$x1 + mydata$x2
mydata$meanx <- (mydata$x1 + mydata$x2)/2
attach(mydata)
mydata$sumx <- x1 + x2
mydata$meanx <- (x1 + x2)/2
detach(mydata)
mydata <- transform(mydata,
sumx = x1 + x2,
meanx = (x1 + x2)/2)
# P71-72 生成变量类别
leadership$agecat[leadership$age > 75] <- "Elder"
leadership$agecat[leadership$age >= 55 &
leadership$age <= 75] <- "Middle Aged"
leadership$agecat[leadership$age < 55] <- "Young"
leadership <- within(leadership,{
agecat <- NA
agecat[age > 75] <- "Elder"
agecat[age >= 55 & age <= 75] <- "Middle Aged"
agecat[age < 55] <- "Young" })
# P72-73 调用plyr包做变量重命名
names(leadership)
names(leadership)[2] <- "testDate"
leadership
library(plyr)
leadership <- rename(leadership,
c(manager="managerID", date="testDate"))
# P74-75 处理缺失值
is.na(leadership[, 6:10]) # 用 is.na() 判断缺失值
leadership[age == 99, "age"] <- NA # 将错误值改为NA
leadership
x <- c(1, 2, NA, 3)
y <- x[1] + x[2] + x[3] + x[4]
z <- sum(x) # 含NA的sum
x <- c(1, 2, NA, 3)
y <- sum(x, na.rm=TRUE) # 排除NA的sum
leadership
newdata <- na.omit(leadership) # 删除有缺失值的样本
newdata
# P76-77 日期变量的处理
mydates <- as.Date(c("2007-06-22", "2004-02-13"))
strDates <- c("01/05/1965", "08/16/1975")
dates <- as.Date(strDates, "%m/%d/%Y")
today <- Sys.Date()
format(today, format="%B %d %Y")
format(today, format="%A")
startdate <- as.Date("2004-02-13")
enddate <- as.Date("2009-06-22")
enddate - startdate
today <- Sys.Date()
dob <- as.Date("1956-10-12")
difftime(today, dob, units="weeks")
# P78 类型转换
a <- c(1,2,3)
a
is.numeric(a)
is.vector(a)
a <- as.character(a)
a
is.numeric(a)
is.vector(a)
is.character(a)
# P79 数据排序
newdata <- leadership[order(leadership$age),]
attach(leadership)
newdata <- leadership[order(gender, age),]
detach(leadership)
attach(leadership)
newdata <-leadership[order(gender, -age),]
detach(leadership)
# P79-80 数据合并
total <- merge(dataframeA, dataframeB, by="ID")
total <- merge(dataframeA, dataframeB, by=c("ID","Country"))
total <- rbind(dataframeA, dataframeB)
# P80 数据取子集
newdata <- leadership[, c(6:10)]
myvars <- paste("q", 1:5, sep="") # 相当于 myvars <- c("q1", "q2", "q3", "q4", "q5")
newdata <-leadership[myvars]
myvars <- names(leadership) %in% c("q3", "q4")
leadership[!myvars]
# P80-82 数据选取
newdata <- leadership[1:3,]
newdata <- leadership[leadership$gender=="M" & leadership$age > 30,]
attach(leadership)
newdata <- leadership[gender=='M' & age > 30,]
detach(leadership)
startdate <- as.Date("2009-01-01")
enddate <- as.Date("2009-10-31")
newdata <- leadership[which(leadership$date >= startdate & leadership$date <= enddate),]
newdata <- subset(leadership, age >= 35 | age < 24, select=c(q1, q2, q3, q4))
newdata <- subset(leadership, gender=="M" & age > 25, select=gender:q4)
# P83 用SQL操作
library(sqldf)
newdf <- sqldf("select * from mtcars where carb=1 order by mpg",
row.names=TRUE)
newdf
sqldf("select avg(mpg) as avg_mpg, avg(disp) as avg_disp, gear
from mtcars where cyl in (4, 6) group by gear")
# P86 成绩数据
Student <- c("John Davis","Angela Williams","Bullwinkle Moose",
"David Jones","Janice Markhammer",
"Cheryl Cushing","Reuven Ytzrhak",
"Greg Knox","Joel England","Mary Rayburn")
math <- c(502, 600, 412, 358, 495, 512, 410, 625, 573, 522)
science <- c(95, 99, 80, 82, 75, 85, 80, 95, 89, 86)
english <- c(25, 22, 18, 15, 20, 28, 15, 30, 27, 18)
roster <- data.frame(Student, math, science, english,
stringsAsFactors=FALSE)
# P89 计算均值和标准差
x <- c(1, 2, 3, 4, 5, 6, 7, 8)
mean(x)
sd(x)
n <- length(x)
meanx <- sum(x)/n
css <- sum((x - meanx)**2)
sdx <- sqrt(css / (n-1))
meanx
sdx
# P91 生成服从正态分布的伪随机数
runif(5) # 均匀分布 (uniform distribution)
set.seed(1234)
runif(5)
set.seed(1234)
runif(5)
# P92 生成服从多元正态分布的数据
library(MASS)
mean <- c(230.7, 146.7, 3.6)
sigma <- matrix( c(15360.8, 6721.2, -47.1,
6721.2, 4700.9, -16.5,
-47.1, -16.5, 0.3), nrow=3, ncol=3)
set.seed(1234)
mydata <- mvrnorm(500, mean, sigma) # 多元正态分布,一元正态分布为 rnorm(N,mean,sigma)
mydata <- as.data.frame(mydata)
names(mydata) <- c("y", "x1", "x2")
dim(mydata)
head(mydata, n=10)
# P95 将函数应用于数据对象
a <- 5
sqrt(a)
b <- c(1.243, 5.654, 2.99)
round(b)
c <- matrix(runif(12), nrow=3)
c
log(c)
mean(c)
# P95 将一个函数应用到矩阵的所有行(列)
mydata <- matrix(rnorm(30), nrow=6)
mydata
apply(mydata, 1, mean)
apply(mydata, 2, mean)
apply(mydata, 2, mean, trim=.4)
例题:将学生的各科考试成绩组合为单一的成绩衡量指标,基于相对名次(前20%、下20%、等等)给出从A到F的评分,根据学生姓氏和名字的首字母对花名册进行排序。
# P96 解决方案示例
options(digits=2)
Student <- c("John Davis", "Angela Williams", "Bullwinkle Moose",
"David Jones", "Janice Markhammer", "Cheryl Cushing",
"Reuven Ytzrhak", "Greg Knox", "Joel England",
"Mary Rayburn")
Math <- c(502, 600, 412, 358, 495, 512, 410, 625, 573, 522)
Science <- c(95, 99, 80, 82, 75, 85, 80, 95, 89, 86)
English <- c(25, 22, 18, 15, 20, 28, 15, 30, 27, 18)
roster <- data.frame(Student, Math, Science, English,
stringsAsFactors=FALSE)
z <- scale(roster[,2:4])
score <- apply(z, 1, mean)
roster <- cbind(roster, score)
y <- quantile(score, c(.8,.6,.4,.2))
roster$grade[score >= y[1]] <- "A"
roster$grade[score < y[1] & score >= y[2]] <- "B"
roster$grade[score < y[2] & score >= y[3]] <- "C"
roster$grade[score < y[3] & score >= y[4]] <- "D"
roster$grade[score < y[4]] <- "F"
name <- strsplit((roster$Student), " ")
Lastname <- sapply(name, "[", 2)
Firstname <- sapply(name, "[", 1)
roster <- cbind(Firstname,Lastname, roster[,-1])
roster <- roster[order(Lastname,Firstname),]
roster
# P104 数据转置
cars <- mtcars[1:5, 1:4]
cars
t(cars)
# P105 整合数据
options(digits=3)
attach(mtcars)
aggdata <-aggregate(mtcars, by=list(cyl,gear),
FUN=mean, na.rm=TRUE)
aggdata
library(reshape2) # 调用包
mydata <- read.table(header=TRUE, sep=" ", text=" # 读入数据
ID Time X1 X2
1 1 5 6
1 2 3 5
2 1 6 1
2 2 2 4
")
md <- melt(mydata, id=c("ID", "Time")) # 数据合并
# reshaping with aggregation
dcast(md, ID~variable, mean)
dcast(md, Time~variable, mean)
dcast(md, ID~Time, mean)
# reshaping without aggregation
dcast(md, ID+Time~variable)
dcast(md, ID+variable~Time)
dcast(md, ID~variable+Time)