<- data.table::as.data.table(x)
x_dt <- data.table::as.data.table(y) y_dt
data.table
library(data.table)
<- mtcars
dt ::setDT(dt) data.table
dplyr
library(dplyr)
library(tidyr)
library(magrittr)
<- mtcars df
<- aggregate(. ~ cyl, data = df, FUN = length)
baseR <- baseR[c("cyl", "mpg")] # Adjust column names if needed
baseR print(baseR)
cyl mpg
1 4 11
2 6 7
3 8 14
group by
n=.N), by= .(cyl)] dt[, .(
cyl n
<num> <int>
1: 6 7
2: 4 11
3: 8 14
%>% group_by(cyl) %>%
df summarize(n=n())
# A tibble: 3 × 2
cyl n
<dbl> <int>
1 4 11
2 6 7
3 8 14
avg= mean(mpg)), by= .(cyl)] dt[, .(
cyl avg
<num> <num>
1: 6 19.74286
2: 4 26.66364
3: 8 15.10000
%>% group_by(cyl) %>%
df summarize(avg=mean(mpg))
# A tibble: 3 × 2
cyl avg
<dbl> <dbl>
1 4 26.7
2 6 19.7
3 8 15.1
avg= mean(mpg)), by= .(cyl,gear)] dt[, .(
cyl gear avg
<num> <num> <num>
1: 6 4 19.750
2: 4 4 26.925
3: 6 3 19.750
4: 8 3 15.050
5: 4 3 21.500
6: 4 5 28.200
7: 8 5 15.400
8: 6 5 19.700
%>% group_by(cyl,gear) %>%
df summarize(avg=mean(mpg))
# A tibble: 8 × 3
# Groups: cyl [3]
cyl gear avg
<dbl> <dbl> <dbl>
1 4 3 21.5
2 4 4 26.9
3 4 5 28.2
4 6 3 19.8
5 6 4 19.8
6 6 5 19.7
7 8 3 15.0
8 8 5 15.4
pivot
# wide dataframe
<- data.frame(player=c('A', 'B', 'C', 'D'),
wide_dt year1=c(12, 15, 19, 19),
year2=c(22, 29, 18, 12))
::setDT(wide_dt)
data.tablehead(wide_dt)
player year1 year2
<char> <num> <num>
1: A 12 22
2: B 15 29
3: C 19 18
4: D 19 12
<- data.frame(player=c('A', 'B', 'C', 'D'),
wide_df year1=c(12, 15, 19, 19),
year2=c(22, 29, 18, 12))
wide_df
player year1 year2
1 A 12 22
2 B 15 29
3 C 19 18
4 D 19 12
# wide dataframe to long dataframe
<- data.table::melt(wide_dt,
wide_to_long measure.vars=c('year1','year2'),
variable.name= 'year',
value.name = 'points')
wide_to_long
player year points
<char> <fctr> <num>
1: A year1 12
2: B year1 15
3: C year1 19
4: D year1 19
5: A year2 22
6: B year2 29
7: C year2 18
8: D year2 12
<- tidyr::pivot_longer(wide_df,
long_df cols=c('year1', 'year2'),
names_to='year',
values_to='points')
long_df
# A tibble: 8 × 3
player year points
<chr> <chr> <dbl>
1 A year1 12
2 A year2 22
3 B year1 15
4 B year2 29
5 C year1 19
6 C year2 18
7 D year1 19
8 D year2 12
# long dataframe to wide dataframe
<- data.table::dcast(wide_to_long,
long_to_wide ~ year,
player value.var = 'points')
long_to_wide
Key: <player>
player year1 year2
<char> <num> <num>
1: A 12 22
2: B 15 29
3: C 19 18
4: D 19 12
<- tidyr::pivot_wider(long_df,
long_to_wide_df names_from = 'year',
values_from = 'points')
long_to_wide_df
# A tibble: 4 × 3
player year1 year2
<chr> <dbl> <dbl>
1 A 12 22
2 B 15 29
3 C 19 18
4 D 19 12
pivot in base R with reshape()
long to wide
# base R, reshape function
<- data.frame(player=c('A', 'B', 'C', 'D'),
base_wide year1=c(12, 15, 19, 19),
year2=c(22, 29, 18, 12))
base_wide
player year1 year2
1 A 12 22
2 B 15 29
3 C 19 18
4 D 19 12
# wide to long
<- reshape(
long_df
base_wide,direction = "long",
varying = list(c("year1", "year2")),
v.names = "points",
idvar = "player",
times = c("year1", "year2"),
timevar = "year"
) long_df
player year points
A.year1 A year1 12
B.year1 B year1 15
C.year1 C year1 19
D.year1 D year1 19
A.year2 A year2 22
B.year2 B year2 29
C.year2 C year2 18
D.year2 D year2 12
long to wide
# long data
head(wide_to_long)
player year points
<char> <fctr> <num>
1: A year1 12
2: B year1 15
3: C year1 19
4: D year1 19
5: A year2 22
6: B year2 29
# from long to wide
reshape(wide_to_long, direction = 'wide',idvar = 'player',
timevar = 'year', v.names = 'points', sep = '_')
player points_year1 points_year2
<char> <num> <num>
1: A 12 22
2: B 15 29
3: C 19 18
4: D 19 12
Join
<- data.table(Id = c("A", "B", "C", "C"),
x X1 = c(1L, 3L, 5L, 7L),
XY = c("x2", "x4", "x6", "x8"),
key = "Id")
x
Key: <Id>
Id X1 XY
<char> <int> <char>
1: A 1 x2
2: B 3 x4
3: C 5 x6
4: C 7 x8
<- data.table(Id = c("A", "B", "B", "D"),
y Y1 = c(1L, 3L, 5L, 7L),
XY = c("y1", "y3", "y5", "y7"),
key = "Id")
y
Key: <Id>
Id Y1 XY
<char> <int> <char>
1: A 1 y1
2: B 3 y3
3: B 5 y5
4: D 7 y7
= "Id"] y[x, on
Key: <Id>
Id Y1 XY X1 i.XY
<char> <int> <char> <int> <char>
1: A 1 y1 1 x2
2: B 3 y3 3 x4
3: B 5 y5 3 x4
4: C NA <NA> 5 x6
5: C NA <NA> 7 x8
# y[x, on = .(Id_x=Id_y)] # if different column name
# merge(x, y, all.x = TRUE, by = "Id")
# merge(x, y, all.x = TRUE, by.x = "x_column", by.y = "y_column")
# if column name different
left_join(x, y, by = "Id")
Key: <Id>
Id X1 XY.x Y1 XY.y
<char> <int> <char> <int> <char>
1: A 1 x2 1 y1
2: B 3 x4 3 y3
3: B 3 x4 5 y5
4: C 5 x6 NA <NA>
5: C 7 x8 NA <NA>
# left_join(x, y, by.x = "x_column", by.y= "y_column") # if column
#name different
= "Id"] x[y, on
Key: <Id>
Id X1 XY Y1 i.XY
<char> <int> <char> <int> <char>
1: A 1 x2 1 y1
2: B 3 x4 3 y3
3: B 3 x4 5 y5
4: D NA <NA> 7 y7
# merge(x, y, all.y = TRUE, by = "Id")
right_join(x, y, by = "Id")
Key: <Id>
Id X1 XY.x Y1 XY.y
<char> <int> <char> <int> <char>
1: A 1 x2 1 y1
2: B 3 x4 3 y3
3: B 3 x4 5 y5
4: D NA <NA> 7 y7
matching rows from both x and y
= "Id", nomatch = 0] x[y, on
Key: <Id>
Id X1 XY Y1 i.XY
<char> <int> <char> <int> <char>
1: A 1 x2 1 y1
2: B 3 x4 3 y3
3: B 3 x4 5 y5
# merge(x, y)
inner_join(x, y, by = "Id")
Key: <Id>
Id X1 XY.x Y1 XY.y
<char> <int> <char> <int> <char>
1: A 1 x2 1 y1
2: B 3 x4 3 y3
3: B 3 x4 5 y5
all the rows from x and y
merge(x, y, all = TRUE, by = "Id")
Key: <Id>
Id X1 XY.x Y1 XY.y
<char> <int> <char> <int> <char>
1: A 1 x2 1 y1
2: B 3 x4 3 y3
3: B 3 x4 5 y5
4: C 5 x6 NA <NA>
5: C 7 x8 NA <NA>
6: D NA <NA> 7 y7
full_join(x, y, by = "Id")
Key: <Id>
Id X1 XY.x Y1 XY.y
<char> <int> <char> <int> <char>
1: A 1 x2 1 y1
2: B 3 x4 3 y3
3: B 3 x4 5 y5
4: C 5 x6 NA <NA>
5: C 7 x8 NA <NA>
6: D NA <NA> 7 y7
Return rows from x matching y
unique(x[y$Id, on = "Id", nomatch = 0])
Key: <Id>
Id X1 XY
<char> <int> <char>
1: A 1 x2
2: B 3 x4
semi_join(x, y, by = "Id")
Key: <Id>
Id X1 XY
<char> <int> <char>
1: A 1 x2
2: B 3 x4
Return rows from x not matching y
!y, on = "Id"] x[
Key: <Id>
Id X1 XY
<char> <int> <char>
1: C 5 x6
2: C 7 x8
anti_join(x, y, by = "Id")
Key: <Id>
Id X1 XY
<char> <int> <char>
1: C 5 x6
2: C 7 x8
bind
<- mtcars[1:5, 1:4]
x <- mtcars[6:10,1:4]
y x
mpg cyl disp hp
<num> <num> <num> <num>
1: 21.0 6 160 110
2: 21.0 6 160 110
3: 22.8 4 108 93
4: 21.4 6 258 110
5: 18.7 8 360 175
y
mpg cyl disp hp
<num> <num> <num> <num>
1: 18.1 6 225.0 105
2: 14.3 8 360.0 245
3: 24.4 4 146.7 62
4: 22.8 4 140.8 95
5: 19.2 6 167.6 123
::rbindlist(list(x_dt,y_dt)) data.table
mpg cyl disp hp
<num> <num> <num> <num>
1: 21.0 6 160.0 110
2: 21.0 6 160.0 110
3: 22.8 4 108.0 93
4: 21.4 6 258.0 110
5: 18.7 8 360.0 175
6: 18.1 6 225.0 105
7: 14.3 8 360.0 245
8: 24.4 4 146.7 62
9: 22.8 4 140.8 95
10: 19.2 6 167.6 123
# data.table::rbindlist(list(x_dt,y_dt),
# use.names = FALSE,
# fill = FALSE,
# idcol = 'id')
::bind_rows(x,y) dplyr
mpg cyl disp hp
<num> <num> <num> <num>
1: 21.0 6 160.0 110
2: 21.0 6 160.0 110
3: 22.8 4 108.0 93
4: 21.4 6 258.0 110
5: 18.7 8 360.0 175
6: 18.1 6 225.0 105
7: 14.3 8 360.0 245
8: 24.4 4 146.7 62
9: 22.8 4 140.8 95
10: 19.2 6 167.6 123
library(reticulate)
<- reticulate::r_to_py(df) df_py
import pandas as pd
= r.df_py
df_python
# Print the Python dataframe
print(df_python)
mpg cyl disp hp drat wt qsec vs am gear carb
0 21.0 6.0 160.0 110.0 3.90 2.620 16.46 0.0 1.0 4.0 4.0
1 21.0 6.0 160.0 110.0 3.90 2.875 17.02 0.0 1.0 4.0 4.0
2 22.8 4.0 108.0 93.0 3.85 2.320 18.61 1.0 1.0 4.0 1.0
3 21.4 6.0 258.0 110.0 3.08 3.215 19.44 1.0 0.0 3.0 1.0
4 18.7 8.0 360.0 175.0 3.15 3.440 17.02 0.0 0.0 3.0 2.0
5 18.1 6.0 225.0 105.0 2.76 3.460 20.22 1.0 0.0 3.0 1.0
6 14.3 8.0 360.0 245.0 3.21 3.570 15.84 0.0 0.0 3.0 4.0
7 24.4 4.0 146.7 62.0 3.69 3.190 20.00 1.0 0.0 4.0 2.0
8 22.8 4.0 140.8 95.0 3.92 3.150 22.90 1.0 0.0 4.0 2.0
9 19.2 6.0 167.6 123.0 3.92 3.440 18.30 1.0 0.0 4.0 4.0
10 17.8 6.0 167.6 123.0 3.92 3.440 18.90 1.0 0.0 4.0 4.0
11 16.4 8.0 275.8 180.0 3.07 4.070 17.40 0.0 0.0 3.0 3.0
12 17.3 8.0 275.8 180.0 3.07 3.730 17.60 0.0 0.0 3.0 3.0
13 15.2 8.0 275.8 180.0 3.07 3.780 18.00 0.0 0.0 3.0 3.0
14 10.4 8.0 472.0 205.0 2.93 5.250 17.98 0.0 0.0 3.0 4.0
15 10.4 8.0 460.0 215.0 3.00 5.424 17.82 0.0 0.0 3.0 4.0
16 14.7 8.0 440.0 230.0 3.23 5.345 17.42 0.0 0.0 3.0 4.0
17 32.4 4.0 78.7 66.0 4.08 2.200 19.47 1.0 1.0 4.0 1.0
18 30.4 4.0 75.7 52.0 4.93 1.615 18.52 1.0 1.0 4.0 2.0
19 33.9 4.0 71.1 65.0 4.22 1.835 19.90 1.0 1.0 4.0 1.0
20 21.5 4.0 120.1 97.0 3.70 2.465 20.01 1.0 0.0 3.0 1.0
21 15.5 8.0 318.0 150.0 2.76 3.520 16.87 0.0 0.0 3.0 2.0
22 15.2 8.0 304.0 150.0 3.15 3.435 17.30 0.0 0.0 3.0 2.0
23 13.3 8.0 350.0 245.0 3.73 3.840 15.41 0.0 0.0 3.0 4.0
24 19.2 8.0 400.0 175.0 3.08 3.845 17.05 0.0 0.0 3.0 2.0
25 27.3 4.0 79.0 66.0 4.08 1.935 18.90 1.0 1.0 4.0 1.0
26 26.0 4.0 120.3 91.0 4.43 2.140 16.70 0.0 1.0 5.0 2.0
27 30.4 4.0 95.1 113.0 3.77 1.513 16.90 1.0 1.0 5.0 2.0
28 15.8 8.0 351.0 264.0 4.22 3.170 14.50 0.0 1.0 5.0 4.0
29 19.7 6.0 145.0 175.0 3.62 2.770 15.50 0.0 1.0 5.0 6.0
30 15.0 8.0 301.0 335.0 3.54 3.570 14.60 0.0 1.0 5.0 8.0
31 21.4 4.0 121.0 109.0 4.11 2.780 18.60 1.0 1.0 4.0 2.0
= df_python.groupby('cyl').size().reset_index(name='n')
result_pandas print(result_pandas)
cyl n
0 4.0 11
1 6.0 7
2 8.0 14