library(tidyverse)
library(lubridate)
答疑汇总
1 正则表达式
1.1 零宽断言
- 提取
P
前面的字符:
<- c("H11A1PNIO", "H1A1PNIO")
a str_extract(a, ".*(?=P)")
[1] "H11A1" "H1A1"
- 提取
Melbourne
:
<- "Stamford Plaza, 111 Little Conllins St, Melbourne Vic 3000"
x %>%
x str_extract("(?<=,)[^,]*$") %>%
str_extract("(?<= )\\w+")
[1] "Melbourne"
- 零宽断言符号存在重复时提取:
str_extract("sdfs--", ".*?(?=-)")
[1] "sdfs"
1.2 常用
# 提取字母与数字
tibble(test = c("MG1", "MGW1")) %>%
mutate(words = str_extract(test, "[:alpha:]+"),
digits = str_extract(test, "\\d+"))
# A tibble: 2 × 3
test words digits
<chr> <chr> <chr>
1 MG1 MG 1
2 MGW1 MGW 1
2 Quarto
2.1 中文 PDF 输出
在 YAML 中加上:
include-in-header:
text:
\usepackage{ctex}
3 tidyverse
3.1 dplyr + tidyr
3.1.1 separate_rows()
- 拆分行为多列
tibble(x = 1:2,
y = c("a, b, c", "a, d")) %>%
separate_rows(y, sep = ", ")
# A tibble: 5 × 2
x y
<int> <chr>
1 1 a
2 1 b
3 1 c
4 2 a
5 2 d
3.1.2 separate()
- 仅拆分第一个空格
<- tibble(x = c("a ds vc", "bs dsg sfa gs"))
df
%>%
df separate(x, c("first", "second"),
sep = " ", extra = "merge")
# A tibble: 2 × 2
first second
<chr> <chr>
1 a ds vc
2 bs dsg sfa gs
3.1.3 rowwise()
- 计算每行
,
前的数的和除以,
后的数的和:
<- tribble(~A , ~B, ~C,
data "12,0", "21,2", "36,4",
"4,2", "3,0", "2,0",
"0,2", "0,0", "1,0")
%>%
data mutate(across(1:3, ~ str_split(.x, ",")),
across(1:3, ~ map(.x, as.numeric))) %>%
rowwise() %>%
mutate(res = sum(A[1], B[1], C[1])/ sum(A[2], B[2], C[2]))
# A tibble: 3 × 4
# Rowwise:
A B C res
<list> <list> <list> <dbl>
1 <dbl [2]> <dbl [2]> <dbl [2]> 11.5
2 <dbl [2]> <dbl [2]> <dbl [2]> 4.5
3 <dbl [2]> <dbl [2]> <dbl [2]> 0.5
- 对不同列含有的数值组合后去重:
<- tribble(
df ~V1, ~V2, ~V3,
58, 0, 2,
171, 1, 3,
0, 58, 2,
1, 171, 3
)
%>%
df rowwise() %>%
mutate(total = list(c(V1, V2, V3))) %>%
as_tibble() %>%
mutate(total = map(total, sort)) %>%
distinct(total, .keep_all = TRUE)
# A tibble: 2 × 4
V1 V2 V3 total
<dbl> <dbl> <dbl> <list>
1 58 0 2 <dbl [3]>
2 171 1 3 <dbl [3]>
%>%
df mutate(total = pmap(., ~ sort(c(...)))) %>%
distinct(total, .keep_all = TRUE)
# A tibble: 4 × 4
V1 V2 V3 total
<dbl> <dbl> <dbl> <list>
1 58 0 2 <dbl [3]>
2 171 1 3 <dbl [3]>
3 0 58 2 <dbl [3]>
4 1 171 3 <dbl [3]>
3.1.4 content
包含一系列如 cur_data()
、cur_group_id()
等返回当前分组和变量信息的函数。
<- tibble(
df group = c("A", "A", rep("B", 4), "C", "C")
)
%>%
df group_by(group) %>%
mutate(a = cur_group_id(), # 返回分组的组号
b = row_number())
# A tibble: 8 × 3
# Groups: group [3]
group a b
<chr> <int> <int>
1 A 1 1
2 A 1 2
3 B 2 1
4 B 2 2
5 B 2 3
6 B 2 4
7 C 3 1
8 C 3 2
3.1.5 fill()
根据小时分组对缺失值进行填充:
1 号 01:00 的缺失值用 2 号 01:00 的值补,若没有则用 3 号 01:00 时的值,以此类推。
<- tibble(
df time = ymd_hms("2022-11-01-01-00-00") + hours(c(0:3, 24:27, 48:51)),
value = c(rep(NA, 5), 1:7))
df
# A tibble: 12 × 2
time value
<dttm> <int>
1 2022-11-01 01:00:00 NA
2 2022-11-01 02:00:00 NA
3 2022-11-01 03:00:00 NA
4 2022-11-01 04:00:00 NA
5 2022-11-02 01:00:00 NA
6 2022-11-02 02:00:00 1
7 2022-11-02 03:00:00 2
8 2022-11-02 04:00:00 3
9 2022-11-03 01:00:00 4
10 2022-11-03 02:00:00 5
11 2022-11-03 03:00:00 6
12 2022-11-03 04:00:00 7
%>%
df arrange(time) %>% # 对时间进行排序
group_by(hour(time)) %>% # 按照小时对时间进行分组
fill(value, .direction = "up") # 每个组内从下到上填充缺失值
# A tibble: 12 × 3
# Groups: hour(time) [4]
time value `hour(time)`
<dttm> <int> <int>
1 2022-11-08 01:00:00 4 1
2 2022-11-08 02:00:00 1 2
3 2022-11-08 03:00:00 2 3
4 2022-11-08 04:00:00 3 4
5 2022-11-09 01:00:00 4 1
6 2022-11-09 02:00:00 1 2
7 2022-11-09 03:00:00 2 3
8 2022-11-09 04:00:00 3 4
9 2022-11-10 01:00:00 4 1
10 2022-11-10 02:00:00 5 2
11 2022-11-10 03:00:00 6 3
12 2022-11-10 04:00:00 7 4
3.2 ggplot2
3.2.1 主题
theme_set(theme_bw())
ggplot(mpg, aes(cty, displ)) +
geom_point()
3.2.2 组合图
利用字符串拼接成所需执行代码:
library(patchwork)
<- ggplot(mpg) +
p1 geom_point(aes(x = displ, y = hwy))
<- ggplot(mpg) +
p2 geom_bar(aes(x = as.character(year), fill = drv), position = "dodge") +
labs(x = "year")
<- ggplot(mpg) +
p3 geom_density(aes(x = hwy, fill = drv), colour = NA) +
facet_grid(rows = vars(drv))
<- ggplot(mpg) +
p4 stat_summary(aes(x = drv, y = hwy, fill = drv), geom = "col", fun.data = mean_se) +
stat_summary(aes(x = drv, y = hwy), geom = "errorbar", fun.data = mean_se, width = 0.5)
<- list(p1, p2, p3, p4)
my_list
cat(paste0("my_list[[", 1:length(my_list), "]]"),
sep = " / ", file = "my_list") # 生成一个 connection
eval(parse(file = "my_list")) # 运行该 connection
3.2.3 列名传递
列名太长,通过列号传递映射:
<- function(x_i, y_i) {
plot_xy <- names(mpg)
names <- names[x_i]
x <- names[y_i]
y
%>%
mpg ggplot(aes(.data[[x]], .data[[y]])) +
geom_point()
}
plot_xy(7, 8)
3.3 purrr
3.3.1 对列名迭代
利用 tidyverse 与 base R 混合编程会出现奇奇怪怪的问题,主要是和 Tidy evaluation 有关系。如果需要用到 tidyverse 的函数,建议直接在 tidyverse 的框架下完成。大致流程就是:
- 写一个函数;
- 用
map()
系列的函数迭代。
<- names(mpg)[1:2]
name
<- function(cols) {
fn %>%
mpg group_by(.data[[cols]]) %>%
summarise(freq = n())
}
map(name, fn)
[[1]]
# A tibble: 15 × 2
manufacturer freq
<chr> <int>
1 audi 18
2 chevrolet 19
3 dodge 37
4 ford 25
5 honda 9
6 hyundai 14
7 jeep 8
8 land rover 4
9 lincoln 3
10 mercury 4
11 nissan 13
12 pontiac 5
13 subaru 14
14 toyota 34
15 volkswagen 27
[[2]]
# A tibble: 38 × 2
model freq
<chr> <int>
1 4runner 4wd 6
2 a4 7
3 a4 quattro 8
4 a6 quattro 3
5 altima 6
6 c1500 suburban 2wd 5
7 camry 7
8 camry solara 7
9 caravan 2wd 11
10 civic 9
# … with 28 more rows
4 杂七杂八 | 奇淫技巧
4.1 批量建模
<- paste0(c("mpg", "disp", "hp"), c(" ~ wt"))
y map(y, ~lm(.x, data = mtcars))
[[1]]
Call:
lm(formula = .x, data = mtcars)
Coefficients:
(Intercept) wt
37.285 -5.344
[[2]]
Call:
lm(formula = .x, data = mtcars)
Coefficients:
(Intercept) wt
-131.1 112.5
[[3]]
Call:
lm(formula = .x, data = mtcars)
Coefficients:
(Intercept) wt
-1.821 46.160
4.2 修改源码
library(rstatix)
trace(rstatix:::as_tidy_cor, edit = T) # 修改后保存
::cor_test(iris, Sepal.Length, Petal.Length) # 可以把调整 cor 输出为 4 位 rstatix
4.3 执行字符串
= "c(1:10)"
text eval(parse(text = text))
[1] 1 2 3 4 5 6 7 8 9 10
eval(parse(text = "1/2"))
[1] 0.5
4.4 缺失值统计
<- tibble(x = c(rep(letters[1:3], 2), NA, NA))
data data
# A tibble: 8 × 1
x
<chr>
1 a
2 b
3 c
4 a
5 b
6 c
7 <NA>
8 <NA>
$x == "a",] # 筛选时缺失值留在表格中 data[data
# A tibble: 4 × 1
x
<chr>
1 a
2 a
3 <NA>
4 <NA>
table(data$x == "a") # 缺失值没有被统计进去
FALSE TRUE
4 2
5 其他包
5.1 psych
描述性统计:
library(psych)
::describe(iris, quant = c(.25, .75)) # 不支持 dplyr 的 group_by() psych
vars n mean sd median trimmed mad min max range skew
Sepal.Length 1 150 5.84 0.83 5.80 5.81 1.04 4.3 7.9 3.6 0.31
Sepal.Width 2 150 3.06 0.44 3.00 3.04 0.44 2.0 4.4 2.4 0.31
Petal.Length 3 150 3.76 1.77 4.35 3.76 1.85 1.0 6.9 5.9 -0.27
Petal.Width 4 150 1.20 0.76 1.30 1.18 1.04 0.1 2.5 2.4 -0.10
Species* 5 150 2.00 0.82 2.00 2.00 1.48 1.0 3.0 2.0 0.00
kurtosis se Q0.25 Q0.75
Sepal.Length -0.61 0.07 5.1 6.4
Sepal.Width 0.14 0.04 2.8 3.3
Petal.Length -1.42 0.14 1.6 5.1
Petal.Width -1.36 0.06 0.3 1.8
Species* -1.52 0.07 1.0 3.0
# 分组统计
::describeBy(iris, group = iris$Species, quant = c(.25, .75)) psych