R Basic with dplyr and data.table

code
R
data.table
dplyr
Author

Yousuf Ali

Published

May 29, 2024

data.table

library(data.table)
dt <- mtcars
data.table::setDT(dt)

dplyr

library(dplyr)
library(tidyr)
library(magrittr)
df <- mtcars
baseR <- aggregate(. ~ cyl, data = df, FUN = length)
baseR <- baseR[c("cyl", "mpg")]  # Adjust column names if needed
print(baseR)
  cyl mpg
1   4  11
2   6   7
3   8  14

group by

dt[, .(n=.N), by= .(cyl)]
     cyl     n
   <num> <int>
1:     6     7
2:     4    11
3:     8    14
df %>% group_by(cyl) %>% 
  summarize(n=n())
# A tibble: 3 × 2
    cyl     n
  <dbl> <int>
1     4    11
2     6     7
3     8    14
dt[, .(avg= mean(mpg)), by= .(cyl)]
     cyl      avg
   <num>    <num>
1:     6 19.74286
2:     4 26.66364
3:     8 15.10000
df %>% group_by(cyl) %>% 
  summarize(avg=mean(mpg))
# A tibble: 3 × 2
    cyl   avg
  <dbl> <dbl>
1     4  26.7
2     6  19.7
3     8  15.1
dt[, .(avg= mean(mpg)), by= .(cyl,gear)]
     cyl  gear    avg
   <num> <num>  <num>
1:     6     4 19.750
2:     4     4 26.925
3:     6     3 19.750
4:     8     3 15.050
5:     4     3 21.500
6:     4     5 28.200
7:     8     5 15.400
8:     6     5 19.700
df %>% group_by(cyl,gear) %>% 
  summarize(avg=mean(mpg))
# A tibble: 8 × 3
# Groups:   cyl [3]
    cyl  gear   avg
  <dbl> <dbl> <dbl>
1     4     3  21.5
2     4     4  26.9
3     4     5  28.2
4     6     3  19.8
5     6     4  19.8
6     6     5  19.7
7     8     3  15.0
8     8     5  15.4

pivot

# wide dataframe
wide_dt <- data.frame(player=c('A', 'B', 'C', 'D'),
                 year1=c(12, 15, 19, 19),
                 year2=c(22, 29, 18, 12))
data.table::setDT(wide_dt)
head(wide_dt)
   player year1 year2
   <char> <num> <num>
1:      A    12    22
2:      B    15    29
3:      C    19    18
4:      D    19    12
wide_df <- data.frame(player=c('A', 'B', 'C', 'D'),
                 year1=c(12, 15, 19, 19),
                 year2=c(22, 29, 18, 12))
wide_df
  player year1 year2
1      A    12    22
2      B    15    29
3      C    19    18
4      D    19    12
# wide dataframe to long dataframe
wide_to_long <- data.table::melt(wide_dt,
                                 measure.vars=c('year1','year2'),
                                 variable.name= 'year',
                                 value.name = 'points')
wide_to_long
   player   year points
   <char> <fctr>  <num>
1:      A  year1     12
2:      B  year1     15
3:      C  year1     19
4:      D  year1     19
5:      A  year2     22
6:      B  year2     29
7:      C  year2     18
8:      D  year2     12
long_df <- tidyr::pivot_longer(wide_df,
                    cols=c('year1', 'year2'),
                    names_to='year',
                    values_to='points')

long_df
# A tibble: 8 × 3
  player year  points
  <chr>  <chr>  <dbl>
1 A      year1     12
2 A      year2     22
3 B      year1     15
4 B      year2     29
5 C      year1     19
6 C      year2     18
7 D      year1     19
8 D      year2     12
# long dataframe to wide dataframe
long_to_wide <- data.table::dcast(wide_to_long, 
                                  player ~ year,
                                  value.var = 'points')
long_to_wide
Key: <player>
   player year1 year2
   <char> <num> <num>
1:      A    12    22
2:      B    15    29
3:      C    19    18
4:      D    19    12
long_to_wide_df <- tidyr::pivot_wider(long_df,
                              names_from = 'year',
                              values_from = 'points')
long_to_wide_df
# A tibble: 4 × 3
  player year1 year2
  <chr>  <dbl> <dbl>
1 A         12    22
2 B         15    29
3 C         19    18
4 D         19    12

pivot in base R with reshape()

long to wide
# base R, reshape function
base_wide <- data.frame(player=c('A', 'B', 'C', 'D'),
                 year1=c(12, 15, 19, 19),
                 year2=c(22, 29, 18, 12))
base_wide
  player year1 year2
1      A    12    22
2      B    15    29
3      C    19    18
4      D    19    12
# wide to long
long_df <- reshape(
  base_wide,
  direction = "long",
  varying = list(c("year1", "year2")),
  v.names = "points",
  idvar = "player",
  times = c("year1", "year2"),
  timevar = "year"
)
long_df
        player  year points
A.year1      A year1     12
B.year1      B year1     15
C.year1      C year1     19
D.year1      D year1     19
A.year2      A year2     22
B.year2      B year2     29
C.year2      C year2     18
D.year2      D year2     12

long to wide

# long data
head(wide_to_long)
   player   year points
   <char> <fctr>  <num>
1:      A  year1     12
2:      B  year1     15
3:      C  year1     19
4:      D  year1     19
5:      A  year2     22
6:      B  year2     29
# from long to wide
reshape(wide_to_long, direction = 'wide',idvar = 'player',
        timevar = 'year', v.names = 'points', sep = '_')
   player points_year1 points_year2
   <char>        <num>        <num>
1:      A           12           22
2:      B           15           29
3:      C           19           18
4:      D           19           12

Join

x <- data.table(Id  = c("A", "B", "C", "C"),
                X1  = c(1L, 3L, 5L, 7L),
                XY  = c("x2", "x4", "x6", "x8"),
                key = "Id")
x
Key: <Id>
       Id    X1     XY
   <char> <int> <char>
1:      A     1     x2
2:      B     3     x4
3:      C     5     x6
4:      C     7     x8
y <- data.table(Id  = c("A", "B", "B", "D"),
                Y1  = c(1L, 3L, 5L, 7L),
                XY  = c("y1", "y3", "y5", "y7"),
                key = "Id")

y
Key: <Id>
       Id    Y1     XY
   <char> <int> <char>
1:      A     1     y1
2:      B     3     y3
3:      B     5     y5
4:      D     7     y7
y[x, on = "Id"]
Key: <Id>
       Id    Y1     XY    X1   i.XY
   <char> <int> <char> <int> <char>
1:      A     1     y1     1     x2
2:      B     3     y3     3     x4
3:      B     5     y5     3     x4
4:      C    NA   <NA>     5     x6
5:      C    NA   <NA>     7     x8
# y[x, on = .(Id_x=Id_y)] # if different column name
# merge(x, y, all.x = TRUE, by = "Id")
# merge(x, y, all.x = TRUE, by.x = "x_column", by.y = "y_column") 
# if column name different
left_join(x, y, by = "Id")
Key: <Id>
       Id    X1   XY.x    Y1   XY.y
   <char> <int> <char> <int> <char>
1:      A     1     x2     1     y1
2:      B     3     x4     3     y3
3:      B     3     x4     5     y5
4:      C     5     x6    NA   <NA>
5:      C     7     x8    NA   <NA>
# left_join(x, y, by.x = "x_column", by.y= "y_column") # if column
#name different
x[y, on = "Id"]
Key: <Id>
       Id    X1     XY    Y1   i.XY
   <char> <int> <char> <int> <char>
1:      A     1     x2     1     y1
2:      B     3     x4     3     y3
3:      B     3     x4     5     y5
4:      D    NA   <NA>     7     y7
# merge(x, y, all.y = TRUE, by = "Id")
right_join(x, y, by = "Id")
Key: <Id>
       Id    X1   XY.x    Y1   XY.y
   <char> <int> <char> <int> <char>
1:      A     1     x2     1     y1
2:      B     3     x4     3     y3
3:      B     3     x4     5     y5
4:      D    NA   <NA>     7     y7

matching rows from both x and y

x[y, on = "Id", nomatch = 0]
Key: <Id>
       Id    X1     XY    Y1   i.XY
   <char> <int> <char> <int> <char>
1:      A     1     x2     1     y1
2:      B     3     x4     3     y3
3:      B     3     x4     5     y5
# merge(x, y)
inner_join(x, y, by = "Id")
Key: <Id>
       Id    X1   XY.x    Y1   XY.y
   <char> <int> <char> <int> <char>
1:      A     1     x2     1     y1
2:      B     3     x4     3     y3
3:      B     3     x4     5     y5

all the rows from x and y

merge(x, y, all = TRUE, by = "Id")
Key: <Id>
       Id    X1   XY.x    Y1   XY.y
   <char> <int> <char> <int> <char>
1:      A     1     x2     1     y1
2:      B     3     x4     3     y3
3:      B     3     x4     5     y5
4:      C     5     x6    NA   <NA>
5:      C     7     x8    NA   <NA>
6:      D    NA   <NA>     7     y7
full_join(x, y, by = "Id")
Key: <Id>
       Id    X1   XY.x    Y1   XY.y
   <char> <int> <char> <int> <char>
1:      A     1     x2     1     y1
2:      B     3     x4     3     y3
3:      B     3     x4     5     y5
4:      C     5     x6    NA   <NA>
5:      C     7     x8    NA   <NA>
6:      D    NA   <NA>     7     y7

Return rows from x matching y

unique(x[y$Id, on = "Id", nomatch = 0])
Key: <Id>
       Id    X1     XY
   <char> <int> <char>
1:      A     1     x2
2:      B     3     x4
semi_join(x, y, by = "Id")
Key: <Id>
       Id    X1     XY
   <char> <int> <char>
1:      A     1     x2
2:      B     3     x4

Return rows from x not matching y

x[!y, on = "Id"]
Key: <Id>
       Id    X1     XY
   <char> <int> <char>
1:      C     5     x6
2:      C     7     x8
anti_join(x, y, by = "Id")
Key: <Id>
       Id    X1     XY
   <char> <int> <char>
1:      C     5     x6
2:      C     7     x8

bind

x <- mtcars[1:5, 1:4]
y <- mtcars[6:10,1:4]
x
     mpg   cyl  disp    hp
   <num> <num> <num> <num>
1:  21.0     6   160   110
2:  21.0     6   160   110
3:  22.8     4   108    93
4:  21.4     6   258   110
5:  18.7     8   360   175
y
     mpg   cyl  disp    hp
   <num> <num> <num> <num>
1:  18.1     6 225.0   105
2:  14.3     8 360.0   245
3:  24.4     4 146.7    62
4:  22.8     4 140.8    95
5:  19.2     6 167.6   123
x_dt <- data.table::as.data.table(x)
y_dt <- data.table::as.data.table(y)
data.table::rbindlist(list(x_dt,y_dt))
      mpg   cyl  disp    hp
    <num> <num> <num> <num>
 1:  21.0     6 160.0   110
 2:  21.0     6 160.0   110
 3:  22.8     4 108.0    93
 4:  21.4     6 258.0   110
 5:  18.7     8 360.0   175
 6:  18.1     6 225.0   105
 7:  14.3     8 360.0   245
 8:  24.4     4 146.7    62
 9:  22.8     4 140.8    95
10:  19.2     6 167.6   123
# data.table::rbindlist(list(x_dt,y_dt), 
                        # use.names = FALSE,
                          # fill = FALSE,
#                       idcol = 'id')
dplyr::bind_rows(x,y)
      mpg   cyl  disp    hp
    <num> <num> <num> <num>
 1:  21.0     6 160.0   110
 2:  21.0     6 160.0   110
 3:  22.8     4 108.0    93
 4:  21.4     6 258.0   110
 5:  18.7     8 360.0   175
 6:  18.1     6 225.0   105
 7:  14.3     8 360.0   245
 8:  24.4     4 146.7    62
 9:  22.8     4 140.8    95
10:  19.2     6 167.6   123
library(reticulate)

df_py <- reticulate::r_to_py(df)
import pandas as pd
df_python = r.df_py

# Print the Python dataframe
print(df_python)
     mpg  cyl   disp     hp  drat     wt   qsec   vs   am  gear  carb
0   21.0  6.0  160.0  110.0  3.90  2.620  16.46  0.0  1.0   4.0   4.0
1   21.0  6.0  160.0  110.0  3.90  2.875  17.02  0.0  1.0   4.0   4.0
2   22.8  4.0  108.0   93.0  3.85  2.320  18.61  1.0  1.0   4.0   1.0
3   21.4  6.0  258.0  110.0  3.08  3.215  19.44  1.0  0.0   3.0   1.0
4   18.7  8.0  360.0  175.0  3.15  3.440  17.02  0.0  0.0   3.0   2.0
5   18.1  6.0  225.0  105.0  2.76  3.460  20.22  1.0  0.0   3.0   1.0
6   14.3  8.0  360.0  245.0  3.21  3.570  15.84  0.0  0.0   3.0   4.0
7   24.4  4.0  146.7   62.0  3.69  3.190  20.00  1.0  0.0   4.0   2.0
8   22.8  4.0  140.8   95.0  3.92  3.150  22.90  1.0  0.0   4.0   2.0
9   19.2  6.0  167.6  123.0  3.92  3.440  18.30  1.0  0.0   4.0   4.0
10  17.8  6.0  167.6  123.0  3.92  3.440  18.90  1.0  0.0   4.0   4.0
11  16.4  8.0  275.8  180.0  3.07  4.070  17.40  0.0  0.0   3.0   3.0
12  17.3  8.0  275.8  180.0  3.07  3.730  17.60  0.0  0.0   3.0   3.0
13  15.2  8.0  275.8  180.0  3.07  3.780  18.00  0.0  0.0   3.0   3.0
14  10.4  8.0  472.0  205.0  2.93  5.250  17.98  0.0  0.0   3.0   4.0
15  10.4  8.0  460.0  215.0  3.00  5.424  17.82  0.0  0.0   3.0   4.0
16  14.7  8.0  440.0  230.0  3.23  5.345  17.42  0.0  0.0   3.0   4.0
17  32.4  4.0   78.7   66.0  4.08  2.200  19.47  1.0  1.0   4.0   1.0
18  30.4  4.0   75.7   52.0  4.93  1.615  18.52  1.0  1.0   4.0   2.0
19  33.9  4.0   71.1   65.0  4.22  1.835  19.90  1.0  1.0   4.0   1.0
20  21.5  4.0  120.1   97.0  3.70  2.465  20.01  1.0  0.0   3.0   1.0
21  15.5  8.0  318.0  150.0  2.76  3.520  16.87  0.0  0.0   3.0   2.0
22  15.2  8.0  304.0  150.0  3.15  3.435  17.30  0.0  0.0   3.0   2.0
23  13.3  8.0  350.0  245.0  3.73  3.840  15.41  0.0  0.0   3.0   4.0
24  19.2  8.0  400.0  175.0  3.08  3.845  17.05  0.0  0.0   3.0   2.0
25  27.3  4.0   79.0   66.0  4.08  1.935  18.90  1.0  1.0   4.0   1.0
26  26.0  4.0  120.3   91.0  4.43  2.140  16.70  0.0  1.0   5.0   2.0
27  30.4  4.0   95.1  113.0  3.77  1.513  16.90  1.0  1.0   5.0   2.0
28  15.8  8.0  351.0  264.0  4.22  3.170  14.50  0.0  1.0   5.0   4.0
29  19.7  6.0  145.0  175.0  3.62  2.770  15.50  0.0  1.0   5.0   6.0
30  15.0  8.0  301.0  335.0  3.54  3.570  14.60  0.0  1.0   5.0   8.0
31  21.4  4.0  121.0  109.0  4.11  2.780  18.60  1.0  1.0   4.0   2.0
result_pandas = df_python.groupby('cyl').size().reset_index(name='n')
print(result_pandas)
   cyl   n
0  4.0  11
1  6.0   7
2  8.0  14