[关闭]
@fanxy 2018-09-18T01:57:24.000000Z 字数 5725 阅读 3156

第二讲 数据处理

樊潇彦 复旦大学经济学院 数量软件


1. 基本数据管理

1.1 基本数学和逻辑运算

  1. # P69 创建leadership数据框
  2. manager <- c(1,2,3,4,5)
  3. date <- c("10/24/08","10/28/08","10/1/08","10/12/08","5/1/09")
  4. gender <- c("M","F","F","M","F")
  5. age <- c(32,45,25,39,99)
  6. q1 <- c(5,3,3,3,2)
  7. q2 <- c(4,5,5,3,2)
  8. q3 <- c(5,2,5,4,1)
  9. q4 <- c(5,5,5,NA,2)
  10. q5 <- c(5,5,2,NA,1)
  11. leadership <- data.frame(manager,date,gender,age,q1,q2,q3,q4,q5,
  12. stringsAsFactors=FALSE)

image.png-31.7kB
image.png-32.4kB

  1. # P70 创建新变量
  2. mydata<-data.frame(x1 = c(2, 2, 6, 4),
  3. x2 = c(3, 4, 2, 8))
  4. mydata$sumx <- mydata$x1 + mydata$x2
  5. mydata$meanx <- (mydata$x1 + mydata$x2)/2
  6. attach(mydata)
  7. mydata$sumx <- x1 + x2
  8. mydata$meanx <- (x1 + x2)/2
  9. detach(mydata)
  10. mydata <- transform(mydata,
  11. sumx = x1 + x2,
  12. meanx = (x1 + x2)/2)
  13. # P71-72 生成变量类别
  14. leadership$agecat[leadership$age > 75] <- "Elder"
  15. leadership$agecat[leadership$age >= 55 &
  16. leadership$age <= 75] <- "Middle Aged"
  17. leadership$agecat[leadership$age < 55] <- "Young"
  18. leadership <- within(leadership,{
  19. agecat <- NA
  20. agecat[age > 75] <- "Elder"
  21. agecat[age >= 55 & age <= 75] <- "Middle Aged"
  22. agecat[age < 55] <- "Young" })
  23. # P72-73 调用plyr包做变量重命名
  24. names(leadership)
  25. names(leadership)[2] <- "testDate"
  26. leadership
  27. library(plyr)
  28. leadership <- rename(leadership,
  29. c(manager="managerID", date="testDate"))
  30. # P74-75 处理缺失值
  31. is.na(leadership[, 6:10]) # 用 is.na() 判断缺失值
  32. leadership[age == 99, "age"] <- NA # 将错误值改为NA
  33. leadership
  34. x <- c(1, 2, NA, 3)
  35. y <- x[1] + x[2] + x[3] + x[4]
  36. z <- sum(x) # 含NA的sum
  37. x <- c(1, 2, NA, 3)
  38. y <- sum(x, na.rm=TRUE) # 排除NA的sum
  39. leadership
  40. newdata <- na.omit(leadership) # 删除有缺失值的样本
  41. newdata

1.2 日期变量与类型转换

image.png-55.8kB

  1. # P76-77 日期变量的处理
  2. mydates <- as.Date(c("2007-06-22", "2004-02-13"))
  3. strDates <- c("01/05/1965", "08/16/1975")
  4. dates <- as.Date(strDates, "%m/%d/%Y")
  5. today <- Sys.Date()
  6. format(today, format="%B %d %Y")
  7. format(today, format="%A")
  8. startdate <- as.Date("2004-02-13")
  9. enddate <- as.Date("2009-06-22")
  10. enddate - startdate
  11. today <- Sys.Date()
  12. dob <- as.Date("1956-10-12")
  13. difftime(today, dob, units="weeks")

image.png-37.9kB

  1. # P78 类型转换
  2. a <- c(1,2,3)
  3. a
  4. is.numeric(a)
  5. is.vector(a)
  6. a <- as.character(a)
  7. a
  8. is.numeric(a)
  9. is.vector(a)
  10. is.character(a)

1.3 排序、合并与选取

  1. # P79 数据排序
  2. newdata <- leadership[order(leadership$age),]
  3. attach(leadership)
  4. newdata <- leadership[order(gender, age),]
  5. detach(leadership)
  6. attach(leadership)
  7. newdata <-leadership[order(gender, -age),]
  8. detach(leadership)
  9. # P79-80 数据合并
  10. total <- merge(dataframeA, dataframeB, by="ID")
  11. total <- merge(dataframeA, dataframeB, by=c("ID","Country"))
  12. total <- rbind(dataframeA, dataframeB)
  13. # P80 数据取子集
  14. newdata <- leadership[, c(6:10)]
  15. myvars <- paste("q", 1:5, sep="") # 相当于 myvars <- c("q1", "q2", "q3", "q4", "q5")
  16. newdata <-leadership[myvars]
  17. myvars <- names(leadership) %in% c("q3", "q4")
  18. leadership[!myvars]
  19. # P80-82 数据选取
  20. newdata <- leadership[1:3,]
  21. newdata <- leadership[leadership$gender=="M" & leadership$age > 30,]
  22. attach(leadership)
  23. newdata <- leadership[gender=='M' & age > 30,]
  24. detach(leadership)
  25. startdate <- as.Date("2009-01-01")
  26. enddate <- as.Date("2009-10-31")
  27. newdata <- leadership[which(leadership$date >= startdate & leadership$date <= enddate),]
  28. newdata <- subset(leadership, age >= 35 | age < 24, select=c(q1, q2, q3, q4))
  29. newdata <- subset(leadership, gender=="M" & age > 25, select=gender:q4)
  30. # P83 用SQL操作
  31. library(sqldf)
  32. newdf <- sqldf("select * from mtcars where carb=1 order by mpg",
  33. row.names=TRUE)
  34. newdf
  35. sqldf("select avg(mpg) as avg_mpg, avg(disp) as avg_disp, gear
  36. from mtcars where cyl in (4, 6) group by gear")

2. 高级数据管理

2.1 数学函数

image.png-212.2kB

  1. # P86 成绩数据
  2. Student <- c("John Davis","Angela Williams","Bullwinkle Moose",
  3. "David Jones","Janice Markhammer",
  4. "Cheryl Cushing","Reuven Ytzrhak",
  5. "Greg Knox","Joel England","Mary Rayburn")
  6. math <- c(502, 600, 412, 358, 495, 512, 410, 625, 573, 522)
  7. science <- c(95, 99, 80, 82, 75, 85, 80, 95, 89, 86)
  8. english <- c(25, 22, 18, 15, 20, 28, 15, 30, 27, 18)
  9. roster <- data.frame(Student, math, science, english,
  10. stringsAsFactors=FALSE)
  11. # P89 计算均值和标准差
  12. x <- c(1, 2, 3, 4, 5, 6, 7, 8)
  13. mean(x)
  14. sd(x)
  15. n <- length(x)
  16. meanx <- sum(x)/n
  17. css <- sum((x - meanx)**2)
  18. sdx <- sqrt(css / (n-1))
  19. meanx
  20. sdx

2.2 统计函数与概率分布

image.png-197.9kB
image.png-83.7kB

  1. # P91 生成服从正态分布的伪随机数
  2. runif(5) # 均匀分布 (uniform distribution)
  3. set.seed(1234)
  4. runif(5)
  5. set.seed(1234)
  6. runif(5)
  7. # P92 生成服从多元正态分布的数据
  8. library(MASS)
  9. mean <- c(230.7, 146.7, 3.6)
  10. sigma <- matrix( c(15360.8, 6721.2, -47.1,
  11. 6721.2, 4700.9, -16.5,
  12. -47.1, -16.5, 0.3), nrow=3, ncol=3)
  13. set.seed(1234)
  14. mydata <- mvrnorm(500, mean, sigma) # 多元正态分布,一元正态分布为 rnorm(N,mean,sigma)
  15. mydata <- as.data.frame(mydata)
  16. names(mydata) <- c("y", "x1", "x2")
  17. dim(mydata)
  18. head(mydata, n=10)
  19. # P95 将函数应用于数据对象
  20. a <- 5
  21. sqrt(a)
  22. b <- c(1.243, 5.654, 2.99)
  23. round(b)
  24. c <- matrix(runif(12), nrow=3)
  25. c
  26. log(c)
  27. mean(c)
  28. # P95 将一个函数应用到矩阵的所有行(列)
  29. mydata <- matrix(rnorm(30), nrow=6)
  30. mydata
  31. apply(mydata, 1, mean)
  32. apply(mydata, 2, mean)
  33. apply(mydata, 2, mean, trim=.4)

2.3 字符处理、其他函数与例题

image.png-294.3kB
image.png-132kB

例题:将学生的各科考试成绩组合为单一的成绩衡量指标,基于相对名次(前20%、下20%、等等)给出从A到F的评分,根据学生姓氏和名字的首字母对花名册进行排序。

  1. # P96 解决方案示例
  2. options(digits=2)
  3. Student <- c("John Davis", "Angela Williams", "Bullwinkle Moose",
  4. "David Jones", "Janice Markhammer", "Cheryl Cushing",
  5. "Reuven Ytzrhak", "Greg Knox", "Joel England",
  6. "Mary Rayburn")
  7. Math <- c(502, 600, 412, 358, 495, 512, 410, 625, 573, 522)
  8. Science <- c(95, 99, 80, 82, 75, 85, 80, 95, 89, 86)
  9. English <- c(25, 22, 18, 15, 20, 28, 15, 30, 27, 18)
  10. roster <- data.frame(Student, Math, Science, English,
  11. stringsAsFactors=FALSE)
  12. z <- scale(roster[,2:4])
  13. score <- apply(z, 1, mean)
  14. roster <- cbind(roster, score)
  15. y <- quantile(score, c(.8,.6,.4,.2))
  16. roster$grade[score >= y[1]] <- "A"
  17. roster$grade[score < y[1] & score >= y[2]] <- "B"
  18. roster$grade[score < y[2] & score >= y[3]] <- "C"
  19. roster$grade[score < y[3] & score >= y[4]] <- "D"
  20. roster$grade[score < y[4]] <- "F"
  21. name <- strsplit((roster$Student), " ")
  22. Lastname <- sapply(name, "[", 2)
  23. Firstname <- sapply(name, "[", 1)
  24. roster <- cbind(Firstname,Lastname, roster[,-1])
  25. roster <- roster[order(Lastname,Firstname),]
  26. roster

2.4 整合与重构

  1. # P104 数据转置
  2. cars <- mtcars[1:5, 1:4]
  3. cars
  4. t(cars)
  5. # P105 整合数据
  6. options(digits=3)
  7. attach(mtcars)
  8. aggdata <-aggregate(mtcars, by=list(cyl,gear),
  9. FUN=mean, na.rm=TRUE)
  10. aggdata
  11. library(reshape2) # 调用包
  12. mydata <- read.table(header=TRUE, sep=" ", text=" # 读入数据
  13. ID Time X1 X2
  14. 1 1 5 6
  15. 1 2 3 5
  16. 2 1 6 1
  17. 2 2 2 4
  18. ")
  19. md <- melt(mydata, id=c("ID", "Time")) # 数据合并
  20. # reshaping with aggregation
  21. dcast(md, ID~variable, mean)
  22. dcast(md, Time~variable, mean)
  23. dcast(md, ID~Time, mean)
  24. # reshaping without aggregation
  25. dcast(md, ID+Time~variable)
  26. dcast(md, ID+variable~Time)
  27. dcast(md, ID~variable+Time)

3. 补充阅读:用 tidyr 和 dyplr 处理数据

添加新批注
在作者公开此批注前,只有你和作者可见。
回复批注