@fanxy 2018-09-18T01:57:24.000000Z 字数 5725 阅读 3156

# 第二讲 数据处理

樊潇彦 复旦大学经济学院 数量软件

# 1. 基本数据管理

## 1.1 基本数学和逻辑运算

# P69: build the `leadership` data frame, one row per manager.
# Each column is first created as a stand-alone vector of length 5 and the
# vectors are then bound together column-wise.
manager <- c(1, 2, 3, 4, 5)
date <- c("10/24/08", "10/28/08", "10/1/08", "10/12/08", "5/1/09")
gender <- c("M", "F", "F", "M", "F")
age <- c(32, 45, 25, 39, 99)

# q1..q5 hold numeric answers; q4 and q5 are missing (NA) for manager 4.
q1 <- c(5, 3, 3, 3, 2)
q2 <- c(4, 5, 5, 3, 2)
q3 <- c(5, 2, 5, 4, 1)
q4 <- c(5, 5, 5, NA, 2)
q5 <- c(5, 5, 2, NA, 1)

# stringsAsFactors = FALSE keeps `date` and `gender` as character vectors
# (this was not the default before R 4.0).
leadership <- data.frame(
  manager, date, gender, age, q1, q2, q3, q4, q5,
  stringsAsFactors = FALSE
)

# 2. 高级数据管理

## 2.1 数学函数

# P86: exam score data, one row per student with three subject scores.
Student <- c(
  "John Davis", "Angela Williams", "Bullwinkle Moose",
  "David Jones", "Janice Markhammer",
  "Cheryl Cushing", "Reuven Ytzrhak",
  "Greg Knox", "Joel England", "Mary Rayburn"
)
math <- c(502, 600, 412, 358, 495, 512, 410, 625, 573, 522)
science <- c(95, 99, 80, 82, 75, 85, 80, 95, 89, 86)
english <- c(25, 22, 18, 15, 20, 28, 15, 30, 27, 18)
roster <- data.frame(
  Student, math, science, english,
  stringsAsFactors = FALSE
)

# P89: mean and standard deviation, first with the built-in functions ...
x <- c(1, 2, 3, 4, 5, 6, 7, 8)
mean(x)
sd(x)

# ... then by hand, to show what the built-ins compute.
n <- length(x)
meanx <- sum(x) / n
css <- sum((x - meanx)^2)   # corrected (mean-centred) sum of squares
sdx <- sqrt(css / (n - 1))  # n - 1 in the denominator matches sd()
meanx
sdx

## 2.2 统计函数与概率分布

# P91: pseudo-random numbers drawn from the uniform distribution.
runif(5)        # no seed set here, so this draw is not pinned to a seed
set.seed(1234)
runif(5)
set.seed(1234)  # resetting the same seed ...
runif(5)        # ... reproduces the previous five draws exactly

# P92: data drawn from a multivariate normal distribution.
library(MASS)
# Named `mu` rather than `mean` so that the base function mean() is not
# masked by a numeric vector whenever `mean` is passed around as a value.
mu <- c(230.7, 146.7, 3.6)
# 3 x 3 matrix passed to mvrnorm() as the covariance matrix.
sigma <- matrix(
  c(15360.8, 6721.2, -47.1,
     6721.2, 4700.9, -16.5,
      -47.1,  -16.5,   0.3),
  nrow = 3, ncol = 3
)
set.seed(1234)
# Multivariate normal draw; the univariate analogue is rnorm(n, mean, sd).
mydata <- mvrnorm(500, mu, sigma)
mydata <- as.data.frame(mydata)
names(mydata) <- c("y", "x1", "x2")
dim(mydata)
head(mydata, n = 10)

# P95: applying functions to a scalar, a vector and a matrix.
a <- 5
sqrt(a)
b <- c(1.243, 5.654, 2.99)
round(b)
# Named `m` rather than `c` so that the base function c() is not masked.
m <- matrix(runif(12), nrow = 3)
m
log(m)
mean(m)   # mean() of a matrix returns one number over all cells

# P95: apply a function over every row (MARGIN = 1) or column (MARGIN = 2)
# of a matrix. Note that `mydata` is re-bound to a new 6 x 5 matrix here.
mydata <- matrix(rnorm(30), nrow = 6)
mydata
apply(mydata, 1, mean)
apply(mydata, 2, mean)
apply(mydata, 2, mean, trim = 0.4)  # extra arguments are forwarded to mean()

## 2.3 字符处理、其他函数与例题

# P96: worked example - combine three exam scores into one performance
# indicator, grade each student A-F by quintile, and sort by name.
options(digits = 2)  # print fewer significant digits (session-wide option)

Student <- c(
  "John Davis", "Angela Williams", "Bullwinkle Moose",
  "David Jones", "Janice Markhammer", "Cheryl Cushing",
  "Reuven Ytzrhak", "Greg Knox", "Joel England",
  "Mary Rayburn"
)
Math <- c(502, 600, 412, 358, 495, 512, 410, 625, 573, 522)
Science <- c(95, 99, 80, 82, 75, 85, 80, 95, 89, 86)
English <- c(25, 22, 18, 15, 20, 28, 15, 30, 27, 18)
roster <- data.frame(
  Student, Math, Science, English,
  stringsAsFactors = FALSE
)

# The three subjects are on different scales, so standardise each column
# before averaging them into a single score per student.
z <- scale(roster[, 2:4])
score <- apply(z, 1, mean)
roster <- cbind(roster, score)

# Cut points at the 80th, 60th, 40th and 20th percentiles of `score`.
y <- quantile(score, c(.8, .6, .4, .2))
roster$grade[score >= y[1]] <- "A"
roster$grade[score < y[1] & score >= y[2]] <- "B"
roster$grade[score < y[2] & score >= y[3]] <- "C"
roster$grade[score < y[3] & score >= y[4]] <- "D"
roster$grade[score < y[4]] <- "F"

# Split "First Last" on the space. vapply() guarantees a character vector
# with exactly one element per student, unlike sapply().
name <- strsplit(roster$Student, " ")
Lastname <- vapply(name, "[", character(1), 2)
Firstname <- vapply(name, "[", character(1), 1)

# Replace the combined Student column by the two name columns, then sort
# by last name and, within that, first name.
roster <- cbind(Firstname, Lastname, roster[, -1])
roster <- roster[order(Lastname, Firstname), ]
roster

## 2.4 整合与重构

# P104: transposing a data set.
cars <- mtcars[1:5, 1:4]
cars
t(cars)

# P105: aggregating data - the mean of every column for each combination
# of cylinder count and gear count.
options(digits = 3)
# The grouping columns are referenced explicitly instead of attach()-ing
# mtcars, which would leave the data set on the search path. The grouping
# list is unnamed, so the result labels the groups Group.1 and Group.2.
aggdata <- aggregate(
  mtcars,
  by = list(mtcars$cyl, mtcars$gear),
  FUN = mean, na.rm = TRUE
)
aggdata

# Reshaping with reshape2.
library(reshape2)

# Read a small data set from inline text: a header row plus four data rows.
# Explanatory comments are kept outside the string literal so that they
# never become part of the data being parsed.
mydata <- read.table(header = TRUE, sep = " ", text = "
ID Time X1 X2
1 1 5 6
1 2 3 5
2 1 6 1
2 2 2 4
")

# Melt to long format: one row per (ID, Time, variable) combination.
md <- melt(mydata, id = c("ID", "Time"))

# Reshaping with aggregation: cells that collapse together are averaged.
dcast(md, ID ~ variable, mean)
dcast(md, Time ~ variable, mean)
dcast(md, ID ~ Time, mean)

# Reshaping without aggregation: each cell maps to exactly one value.
dcast(md, ID + Time ~ variable)
dcast(md, ID + variable ~ Time)
dcast(md, ID ~ variable + Time)

• 私有
• 公开
• 删除