x_dt <- data.table::as.data.table(x)
y_dt <- data.table::as.data.table(y)data.table
library(data.table)
dt <- mtcars
data.table::setDT(dt)dplyr
library(dplyr)
library(tidyr)
library(magrittr)
df <- mtcarsbaseR <- aggregate(. ~ cyl, data = df, FUN = length)
baseR <- baseR[c("cyl", "mpg")] # Adjust column names if needed
print(baseR) cyl mpg
1 4 11
2 6 7
3 8 14
group by
dt[, .(n=.N), by= .(cyl)] cyl n
<num> <int>
1: 6 7
2: 4 11
3: 8 14
df %>% group_by(cyl) %>%
summarize(n=n())# A tibble: 3 × 2
cyl n
<dbl> <int>
1 4 11
2 6 7
3 8 14
dt[, .(avg= mean(mpg)), by= .(cyl)] cyl avg
<num> <num>
1: 6 19.74286
2: 4 26.66364
3: 8 15.10000
df %>% group_by(cyl) %>%
summarize(avg=mean(mpg))# A tibble: 3 × 2
cyl avg
<dbl> <dbl>
1 4 26.7
2 6 19.7
3 8 15.1
dt[, .(avg= mean(mpg)), by= .(cyl,gear)] cyl gear avg
<num> <num> <num>
1: 6 4 19.750
2: 4 4 26.925
3: 6 3 19.750
4: 8 3 15.050
5: 4 3 21.500
6: 4 5 28.200
7: 8 5 15.400
8: 6 5 19.700
df %>% group_by(cyl,gear) %>%
summarize(avg=mean(mpg))# A tibble: 8 × 3
# Groups: cyl [3]
cyl gear avg
<dbl> <dbl> <dbl>
1 4 3 21.5
2 4 4 26.9
3 4 5 28.2
4 6 3 19.8
5 6 4 19.8
6 6 5 19.7
7 8 3 15.0
8 8 5 15.4
pivot
# wide dataframe
wide_dt <- data.frame(player=c('A', 'B', 'C', 'D'),
year1=c(12, 15, 19, 19),
year2=c(22, 29, 18, 12))
data.table::setDT(wide_dt)
head(wide_dt) player year1 year2
<char> <num> <num>
1: A 12 22
2: B 15 29
3: C 19 18
4: D 19 12
wide_df <- data.frame(player=c('A', 'B', 'C', 'D'),
year1=c(12, 15, 19, 19),
year2=c(22, 29, 18, 12))
wide_df player year1 year2
1 A 12 22
2 B 15 29
3 C 19 18
4 D 19 12
# wide dataframe to long dataframe
wide_to_long <- data.table::melt(wide_dt,
measure.vars=c('year1','year2'),
variable.name= 'year',
value.name = 'points')
wide_to_long player year points
<char> <fctr> <num>
1: A year1 12
2: B year1 15
3: C year1 19
4: D year1 19
5: A year2 22
6: B year2 29
7: C year2 18
8: D year2 12
long_df <- tidyr::pivot_longer(wide_df,
cols=c('year1', 'year2'),
names_to='year',
values_to='points')
long_df# A tibble: 8 × 3
player year points
<chr> <chr> <dbl>
1 A year1 12
2 A year2 22
3 B year1 15
4 B year2 29
5 C year1 19
6 C year2 18
7 D year1 19
8 D year2 12
# long dataframe to wide dataframe
long_to_wide <- data.table::dcast(wide_to_long,
player ~ year,
value.var = 'points')
long_to_wideKey: <player>
player year1 year2
<char> <num> <num>
1: A 12 22
2: B 15 29
3: C 19 18
4: D 19 12
long_to_wide_df <- tidyr::pivot_wider(long_df,
names_from = 'year',
values_from = 'points')
long_to_wide_df# A tibble: 4 × 3
player year1 year2
<chr> <dbl> <dbl>
1 A 12 22
2 B 15 29
3 C 19 18
4 D 19 12
pivot in base R with reshape()
long to wide
# base R, reshape function
base_wide <- data.frame(player=c('A', 'B', 'C', 'D'),
year1=c(12, 15, 19, 19),
year2=c(22, 29, 18, 12))
base_wide player year1 year2
1 A 12 22
2 B 15 29
3 C 19 18
4 D 19 12
# wide to long
long_df <- reshape(
base_wide,
direction = "long",
varying = list(c("year1", "year2")),
v.names = "points",
idvar = "player",
times = c("year1", "year2"),
timevar = "year"
)
long_df player year points
A.year1 A year1 12
B.year1 B year1 15
C.year1 C year1 19
D.year1 D year1 19
A.year2 A year2 22
B.year2 B year2 29
C.year2 C year2 18
D.year2 D year2 12
long to wide
# long data
head(wide_to_long) player year points
<char> <fctr> <num>
1: A year1 12
2: B year1 15
3: C year1 19
4: D year1 19
5: A year2 22
6: B year2 29
# from long to wide
reshape(wide_to_long, direction = 'wide',idvar = 'player',
timevar = 'year', v.names = 'points', sep = '_') player points_year1 points_year2
<char> <num> <num>
1: A 12 22
2: B 15 29
3: C 19 18
4: D 19 12
Join
x <- data.table(Id = c("A", "B", "C", "C"),
X1 = c(1L, 3L, 5L, 7L),
XY = c("x2", "x4", "x6", "x8"),
key = "Id")
xKey: <Id>
Id X1 XY
<char> <int> <char>
1: A 1 x2
2: B 3 x4
3: C 5 x6
4: C 7 x8
y <- data.table(Id = c("A", "B", "B", "D"),
Y1 = c(1L, 3L, 5L, 7L),
XY = c("y1", "y3", "y5", "y7"),
key = "Id")
yKey: <Id>
Id Y1 XY
<char> <int> <char>
1: A 1 y1
2: B 3 y3
3: B 5 y5
4: D 7 y7
y[x, on = "Id"]Key: <Id>
Id Y1 XY X1 i.XY
<char> <int> <char> <int> <char>
1: A 1 y1 1 x2
2: B 3 y3 3 x4
3: B 5 y5 3 x4
4: C NA <NA> 5 x6
5: C NA <NA> 7 x8
# y[x, on = .(Id_x=Id_y)] # if different column name
# merge(x, y, all.x = TRUE, by = "Id")
# merge(x, y, all.x = TRUE, by.x = "x_column", by.y = "y_column")
# if column name differentleft_join(x, y, by = "Id")Key: <Id>
Id X1 XY.x Y1 XY.y
<char> <int> <char> <int> <char>
1: A 1 x2 1 y1
2: B 3 x4 3 y3
3: B 3 x4 5 y5
4: C 5 x6 NA <NA>
5: C 7 x8 NA <NA>
# left_join(x, y, by.x = "x_column", by.y= "y_column") # if column
#name differentx[y, on = "Id"]Key: <Id>
Id X1 XY Y1 i.XY
<char> <int> <char> <int> <char>
1: A 1 x2 1 y1
2: B 3 x4 3 y3
3: B 3 x4 5 y5
4: D NA <NA> 7 y7
# merge(x, y, all.y = TRUE, by = "Id")right_join(x, y, by = "Id")Key: <Id>
Id X1 XY.x Y1 XY.y
<char> <int> <char> <int> <char>
1: A 1 x2 1 y1
2: B 3 x4 3 y3
3: B 3 x4 5 y5
4: D NA <NA> 7 y7
matching rows from both x and y
x[y, on = "Id", nomatch = 0]Key: <Id>
Id X1 XY Y1 i.XY
<char> <int> <char> <int> <char>
1: A 1 x2 1 y1
2: B 3 x4 3 y3
3: B 3 x4 5 y5
# merge(x, y)inner_join(x, y, by = "Id")Key: <Id>
Id X1 XY.x Y1 XY.y
<char> <int> <char> <int> <char>
1: A 1 x2 1 y1
2: B 3 x4 3 y3
3: B 3 x4 5 y5
all the rows from x and y
merge(x, y, all = TRUE, by = "Id")Key: <Id>
Id X1 XY.x Y1 XY.y
<char> <int> <char> <int> <char>
1: A 1 x2 1 y1
2: B 3 x4 3 y3
3: B 3 x4 5 y5
4: C 5 x6 NA <NA>
5: C 7 x8 NA <NA>
6: D NA <NA> 7 y7
full_join(x, y, by = "Id")Key: <Id>
Id X1 XY.x Y1 XY.y
<char> <int> <char> <int> <char>
1: A 1 x2 1 y1
2: B 3 x4 3 y3
3: B 3 x4 5 y5
4: C 5 x6 NA <NA>
5: C 7 x8 NA <NA>
6: D NA <NA> 7 y7
Return rows from x matching y
unique(x[y$Id, on = "Id", nomatch = 0])Key: <Id>
Id X1 XY
<char> <int> <char>
1: A 1 x2
2: B 3 x4
semi_join(x, y, by = "Id")Key: <Id>
Id X1 XY
<char> <int> <char>
1: A 1 x2
2: B 3 x4
Return rows from x not matching y
x[!y, on = "Id"]Key: <Id>
Id X1 XY
<char> <int> <char>
1: C 5 x6
2: C 7 x8
anti_join(x, y, by = "Id")Key: <Id>
Id X1 XY
<char> <int> <char>
1: C 5 x6
2: C 7 x8
bind
x <- mtcars[1:5, 1:4]
y <- mtcars[6:10,1:4]
x mpg cyl disp hp
<num> <num> <num> <num>
1: 21.0 6 160 110
2: 21.0 6 160 110
3: 22.8 4 108 93
4: 21.4 6 258 110
5: 18.7 8 360 175
y mpg cyl disp hp
<num> <num> <num> <num>
1: 18.1 6 225.0 105
2: 14.3 8 360.0 245
3: 24.4 4 146.7 62
4: 22.8 4 140.8 95
5: 19.2 6 167.6 123
data.table::rbindlist(list(x_dt,y_dt)) mpg cyl disp hp
<num> <num> <num> <num>
1: 21.0 6 160.0 110
2: 21.0 6 160.0 110
3: 22.8 4 108.0 93
4: 21.4 6 258.0 110
5: 18.7 8 360.0 175
6: 18.1 6 225.0 105
7: 14.3 8 360.0 245
8: 24.4 4 146.7 62
9: 22.8 4 140.8 95
10: 19.2 6 167.6 123
# data.table::rbindlist(list(x_dt,y_dt),
# use.names = FALSE,
# fill = FALSE,
# idcol = 'id')dplyr::bind_rows(x,y) mpg cyl disp hp
<num> <num> <num> <num>
1: 21.0 6 160.0 110
2: 21.0 6 160.0 110
3: 22.8 4 108.0 93
4: 21.4 6 258.0 110
5: 18.7 8 360.0 175
6: 18.1 6 225.0 105
7: 14.3 8 360.0 245
8: 24.4 4 146.7 62
9: 22.8 4 140.8 95
10: 19.2 6 167.6 123
library(reticulate)
df_py <- reticulate::r_to_py(df)import pandas as pddf_python = r.df_py
# Print the Python dataframe
print(df_python) mpg cyl disp hp drat wt qsec vs am gear carb
0 21.0 6.0 160.0 110.0 3.90 2.620 16.46 0.0 1.0 4.0 4.0
1 21.0 6.0 160.0 110.0 3.90 2.875 17.02 0.0 1.0 4.0 4.0
2 22.8 4.0 108.0 93.0 3.85 2.320 18.61 1.0 1.0 4.0 1.0
3 21.4 6.0 258.0 110.0 3.08 3.215 19.44 1.0 0.0 3.0 1.0
4 18.7 8.0 360.0 175.0 3.15 3.440 17.02 0.0 0.0 3.0 2.0
5 18.1 6.0 225.0 105.0 2.76 3.460 20.22 1.0 0.0 3.0 1.0
6 14.3 8.0 360.0 245.0 3.21 3.570 15.84 0.0 0.0 3.0 4.0
7 24.4 4.0 146.7 62.0 3.69 3.190 20.00 1.0 0.0 4.0 2.0
8 22.8 4.0 140.8 95.0 3.92 3.150 22.90 1.0 0.0 4.0 2.0
9 19.2 6.0 167.6 123.0 3.92 3.440 18.30 1.0 0.0 4.0 4.0
10 17.8 6.0 167.6 123.0 3.92 3.440 18.90 1.0 0.0 4.0 4.0
11 16.4 8.0 275.8 180.0 3.07 4.070 17.40 0.0 0.0 3.0 3.0
12 17.3 8.0 275.8 180.0 3.07 3.730 17.60 0.0 0.0 3.0 3.0
13 15.2 8.0 275.8 180.0 3.07 3.780 18.00 0.0 0.0 3.0 3.0
14 10.4 8.0 472.0 205.0 2.93 5.250 17.98 0.0 0.0 3.0 4.0
15 10.4 8.0 460.0 215.0 3.00 5.424 17.82 0.0 0.0 3.0 4.0
16 14.7 8.0 440.0 230.0 3.23 5.345 17.42 0.0 0.0 3.0 4.0
17 32.4 4.0 78.7 66.0 4.08 2.200 19.47 1.0 1.0 4.0 1.0
18 30.4 4.0 75.7 52.0 4.93 1.615 18.52 1.0 1.0 4.0 2.0
19 33.9 4.0 71.1 65.0 4.22 1.835 19.90 1.0 1.0 4.0 1.0
20 21.5 4.0 120.1 97.0 3.70 2.465 20.01 1.0 0.0 3.0 1.0
21 15.5 8.0 318.0 150.0 2.76 3.520 16.87 0.0 0.0 3.0 2.0
22 15.2 8.0 304.0 150.0 3.15 3.435 17.30 0.0 0.0 3.0 2.0
23 13.3 8.0 350.0 245.0 3.73 3.840 15.41 0.0 0.0 3.0 4.0
24 19.2 8.0 400.0 175.0 3.08 3.845 17.05 0.0 0.0 3.0 2.0
25 27.3 4.0 79.0 66.0 4.08 1.935 18.90 1.0 1.0 4.0 1.0
26 26.0 4.0 120.3 91.0 4.43 2.140 16.70 0.0 1.0 5.0 2.0
27 30.4 4.0 95.1 113.0 3.77 1.513 16.90 1.0 1.0 5.0 2.0
28 15.8 8.0 351.0 264.0 4.22 3.170 14.50 0.0 1.0 5.0 4.0
29 19.7 6.0 145.0 175.0 3.62 2.770 15.50 0.0 1.0 5.0 6.0
30 15.0 8.0 301.0 335.0 3.54 3.570 14.60 0.0 1.0 5.0 8.0
31 21.4 4.0 121.0 109.0 4.11 2.780 18.60 1.0 1.0 4.0 2.0
result_pandas = df_python.groupby('cyl').size().reset_index(name='n')
print(result_pandas) cyl n
0 4.0 11
1 6.0 7
2 8.0 14