@fanxy
2021-09-15T04:17:21.000000Z
字数 9132
阅读 9402
樊潇彦 复旦大学经济学院 金融数据
数据下载:Ch02.rar
setwd("D:\\...\\Ch02")## tidyverse 包是一个综合包,内含数据读取与处理(readr,tidyr,dplyr)、作图(ggplot2)、特定数据类型处理(tibble,stringr,forcats),以及函数化编程(purrr)等常用包。install.packages("tidyverse")install.packages(c("readstata13","haven","readxl")) # 读写 stata、excel 等其他格式的数据install.packages(c("babynames","hflights")) # 含姓名和航班数据样本的包## 调用library(tidyverse)library(readstata13)library(haven)library(readxl)library(babynames)library(hflights)
Wiki: Data are values of qualitative or quantitative variables, belonging to a set of items.
.Rdata、.csv、.txt、.xlsx、.dta等)。
# 存放和读取.Rdatalibrary(quantmod)getSymbols("^GSPC",from="2015-1-1")tail(GSPC)save(GSPC,file="GSPC.Rdata") # 将保存历史记录的数据框直接写入本地Rdata文件rm(GSPC) # 删除变量GSPC之后,内存中无内容,无法访问load("GSPC.RDATA") # 装载save()函数保存的数据tail(GSPC)# .csv文件class(GSPC) # 原数据为时序类型GSPC_df=data.frame(Date=as.Date(index(GSPC)), # 日期coredata(GSPC)) # 数据class(GSPC_df) # 类型变为 data.framestr(GSPC_df) # 查看数据结构write.csv(GSPC,file="GSPC_csv.csv", row.names=F) # 将数据写入.csv文件,row.names=F不会保留行号GSPC <- read.csv("GSPC_csv.csv",header=T,skip=0) # read.csv为系统自带命令;含表头,即第一行为变量名class(GSPC) # data.frame类型,但没有日期# .txt文件write.table(GSPC_df, "GSPC_txt.txt") # 将.csv数据写为.txt格式GSPC <- read.table("GSPC_txt.txt",header = T, # read.table为系统自带命令stringsAsFactors =F) # 字符不要转成因子,第一列 Date 作为字符处理str(GSPC)# .dta文件library(readstata13)save.dta13(GSPC_df,"GSPC.dta",version=14)GSPC_dta=read.dta13("GSPC.dta")str(GSPC_dta)library(haven)GSPC=GSPC_dfcolnames(GSPC)=gsub("GSPC.","",colnames(GSPC)) # 原变量名中含有"GSPC.",去掉后保存,否则报错write_dta(GSPC, "GSPC.dta", version=14)GSPC_dta=read_dta("GSPC.dta")str(GSPC_dta)# .xls 或 .xlsx文件library(readxl)loan=read_excel("loan.xls")str(loan) # 发现部分变量名有误,在练习 loan 中介绍如何处理
readr 包读写和处理1. read_*, write_*:读写不同格式数据文件
library(readr)library(readxl)# 以 ',' 为分割符的 .csv 文件read_csv(system.file("extdata/mtcars.csv", package = "readr"))read_csv(system.file("extdata/mtcars.csv.zip", package = "readr"))read_csv(system.file("extdata/mtcars.csv.bz2", package = "readr"))read_csv("https://github.com/hadley/readr/raw/master/inst/extdata/mtcars.csv")read_csv("x,y\n1,2\n3,4")# 其他常用分隔符data=c("a;b\tc d|e\nf;g\th|i j")read_csv2(data, col_names=F) # ;read_tsv(data, col_names=F) # \tread_table(data, col_names=F) # " "read_lines(data) # \nread_delim(data, delim = "|", col_names=F) # 通用,如"|"# 读入列为固定长度的文件read_fwf(data, fwf_empty(data)) # 空格分割read_fwf(data, fwf_widths(c(2, 7))) # 规定每一列字符数read_fwf(data, fwf_positions(c(1,3), c(2,8))) # 规定起止字符位置# 作为字符读入read_file(data)# 读入log文件read_log(system.file("extdata/example.log", package = "readr"))# 将系统自带的 iris 数据存为.csv, .txt等write_csv(iris, "iris.csv") # ,write_tsv(iris, "iris.txt") # \twrite_delim(iris, "iris.csv", delim=",") # delim 可以设不同分割符
2. cols: 设置列向量的类型
? = guess-_/- to skip the column.
# 读入数据,最后一列为 characteriris=read_csv("iris.csv")str(iris)# 改为 factoriris=read_csv("iris.csv", col_types = cols(Species = col_factor(c("setosa", "versicolor", "virginica"))))str(iris)# 只读入一列 cols_onlyiris=read_csv("iris.csv", col_types = cols_only(Species = col_factor(c("setosa", "versicolor", "virginica"))))head(iris)# type_convert:直接转换为最有可能的格式df <- data.frame(x = as.character(runif(10)),y = as.character(sample(10)),stringsAsFactors = FALSE)str(df)str(type_convert(df))df <- data.frame(x = c("NA", 10), stringsAsFactors = FALSE)str(type_convert(df))
3. parse_*: 字符向量转换
parse_integer(c("1", "2", "3"))parse_double(c("1", "2", "3.123"))parse_factor(c("a", "b"), letters)parse_number("$1,123,456.00")# 空值处理x <- c("1", "2", "3", "-")parse_double(x, na = "-")
4. 其他
# 设置中国时间格式、数字分隔符和时区myloc <- locale(date_names ="zh", decimal_mark =",", tz="PRC" )parse_datetime(x=Sys.time(), locale=myloc)# 报错y <- read_csv("x\n1\n2\nb", col_types = list(col_double()))yproblems(y)
rm(list=ls())# 查看数据情况data(iris)dim(iris) # 几行几列nrow(iris); ncol(iris)colnames(iris) # 指标名称attributes(iris) # 数据性质str(iris) # 数据结构summary(iris)# 剔除缺失值和异常值样本set.seed(1)a=rnorm(100); b=a+rnorm(100) # 生成模拟数据a[c(2, 3)]="NA" # a有4个缺失值 NA 和 NULLa[c(4, 5)]="NULL"b[50]=10*max(b) # b有异常值simdata=data.frame(id=seq(1,100),a=a,b=b)head(simdata)str(simdata) # 发现a是因子变量simdata$a=as.numeric(as.character(a)) # 把a转换成数值变量na.fail(simdata) # 检测向量、矩阵、数据框中是否包含NA数值,如果包含返回错误信息simdata=na.omit(simdata) # 去掉有缺失值的样本varplot=c("a","b") # 需要作图的指标par(mfrow=c(1,length(varplot))) # 设置窗口for (i in varplot) boxplot(simdata[,i], main=i) # 做箱线图,发现b有异常值outlier=abs(simdata$b-mean(simdata$b))>3*sd(simdata$b) # 异常值所在行的逻辑向量simdata=simdata[-which(outlier),] # 去除异常值summary(simdata)
tidyr 和 dplyr 包处理数据1. 为什么用 tidyr 和 dplyr?
- Classroom data are like teddy bears and real data are like a grizzley bear with salmon blood dripping out its mouth.(Jenny Bryan)
- Up to 80% of data analysis is spent on the process of cleaning and preparing data. (Wickham, 2014 and Dasu and Johnson, 2003)
- `tidyr` & `dplyr` provide fundamental functions for cleaning, processing, & manipulating data, making 95% of your data wrangling tasks much easier!(Brad Boehmke, 2015)
2. 基本思想:化整为零、逐步处理
library(babynames)library(tidyr)library(dplyr, warn.conflicts = FALSE)a=babynames %>%filter(name == "Hadley") %>%group_by(year) %>%summarise(total = sum(n)) %>%arrange(desc(year))b0 <- babynamesb1 <- filter(b0, name == "Hadley")b2 <- group_by(b1, year)b3 <- summarise(b2, total = sum(n))b4 <- arrange(b3, desc(year))# 该命令还可以用于函数,如计算x1和x2之间的欧氏距离x1 <- 1:5; x2 <- 2:6; sqrt(sum((x1-x2)^2))(x1-x2)^2 %>% sum() %>% sqrt()

3. tidyr
tb <- read_csv("tb.csv")class(tb)head(tb)## 把 m04 (男-4岁)等列指标转换为 demographic = 指标名, n = 指标值的行样本tb2 <- tb %>%gather(demographic, n, m04:fu, na.rm = TRUE)tb2## 把 demographic 分为 sex 和 age 两个指标tb3 <- tb2 %>%separate(demographic, c("sex", "age"), 1)tb3## 重命名、排序tb4 <- tb3 %>%rename(country = iso2) %>%arrange(country, year, sex, age)tb4
4. dplyr
library(dplyr)library(hflights)data(hflights) # 调用航班数据flights <- tbl_df(hflights) # 生成本地数据glimpse(flights) # 查看数据情况
filter(flights, UniqueCarrier %in% c("AA", "UA"))filter(flights, Month==1, DayofMonth==1)filter(flights, Month==1 | DayofMonth==1)
select(flights, DepTime, ArrTime, FlightNum)select(flights, Year:DayofMonth, contains("Taxi"), contains("Delay"))
flights %>%select(UniqueCarrier, DepDelay) %>%arrange(desc(DepDelay)) # DepDelay指标的降序,升序为arrange(DepDelay)
flights <- flights %>% mutate(Speed = Distance/AirTime*60)
# 按Dest分组,计算平均ArrDelayflights %>%group_by(Dest) %>%summarise(avg_delay = mean(ArrDelay, na.rm=TRUE))# 计算组内多个指标的情况,n()为组内样本量,n_distinct(TailNum)为counts the number of unique items in TailNumflights %>%group_by(Dest) %>%summarise(flight_count = n(), plane_count = n_distinct(TailNum))# 分组对多个变量Cancelled, Diverted,计算统计指标meanflights %>%group_by(UniqueCarrier) %>%summarise_each(funs(mean), Cancelled, Diverted)# 分组对一个变量Delay计算多个指标min、maxflights %>%group_by(UniqueCarrier) %>%summarise_each(funs(min(., na.rm=TRUE), max(., na.rm=TRUE)), matches("Delay"))# 分组观察一个变量的统计表(number of cancelled and not cancelled flights)flights %>%group_by(Dest) %>%select(Cancelled) %>%table()
mean)的n-input, 1-output不同,Window function是n-input, n-output的统计函数,如min_rank、lead、lag、cummean(累积加总)等。
# 每架飞机在1年中的哪两天延误时间最长 top_n(2) 等同于filter(min_rank(desc(DepDelay)) <= 2)flights %>%group_by(UniqueCarrier) %>%select(Month, DayofMonth, DepDelay) %>%top_n(2) %>%arrange(UniqueCarrier, desc(DepDelay))# 计算每个月飞行的航班数和变动情况flights %>%group_by(Month) %>%summarise(flight_count = n()) %>%mutate(change = flight_count - lag(flight_count))
flights %>% sample_n(5) # 抽5个样本flights %>% sample_frac(0.25, replace=TRUE) # 抽25%样本
rm(list=ls())# 读取银行贷款数据loan=read_excel("loan.xls")str(loan) # 发现变量名有误:(1)先打开数据,选中要复制的表头;(2)运行以下命令将表头复制到timevectimevec=read.table("clipboard", sep = "\t", header = T) # 从内存剪贴板中读取数据colnames(loan)=c("var","freq",colnames(timevec)) # 更改变量名# 用 `tidyr` 和 `dplyr` 包的命令整理loan_year= loan %>%filter(freq == "年") %>% select(-freq) %>%separate(var,c("bank","var"))%>% # 将原来的 "银行名:贷款总额" 拆为两个新变量select(-var)%>% # 用 table(loan_year$var) 查看后删除 vargather(time, loan_vol, -bank, na.rm =T) %>%mutate(time=gsub("X","",time))%>% # 将原来的 "X2000.12" 中的 X 去掉separate(time,c("year","month"))%>% # 将 "2000.12" 拆为年和月mutate(year=as.numeric(year))%>% # 字符改为数值select(-month)%>% # 用 table(loan_year$month) 查看后删除 montharrange(bank,year) # 按银行名和年份排序str(loan_year)loan_quar=loan %>%filter(freq == "季") %>% select(-freq) %>%separate(var,c("bank","var"))%>%select(-var)%>%gather(time, loan_vol, -bank, na.rm =T) %>%mutate(time=gsub("X","",time))%>%separate(time,c("year","month")) %>%mutate(year=as.numeric(year)) %>%mutate(quar=as.numeric(month)/3) %>% # 用 table(loan_year$quar) 查看是否正确select(bank,year,quar,loan_vol)%>% # 按顺序选择变量arrange(bank,year,quar) # 按银行名、年份和季度排序str(loan_quar)