Merging rows

Jeff Stevens

2023-03-03

Introduction

The problem

What’s different between these data sets?

What is needed to create data3 from data1 and data2?

data1

# A tibble: 12 × 2
      id   resp
   <int>  <dbl>
 1     1 0.429 
 2     2 0.663 
 3     3 0.539 
 4     4 0.0143
 5     5 0.753 
 6     6 0.234 
 7     7 0.778 
 8     8 0.625 
 9     9 0.570 
10    10 0.316 
11    11 0.192 
12    12 0.657

data2

# A tibble: 12 × 2
      id  cond
   <int> <int>
 1     1     1
 2     2     2
 3     3     3
 4     4     1
 5     5     2
 6     6     3
 7     7     1
 8     8     2
 9     9     3
10    10     1
11    11     2
12    12     3

data3

# A tibble: 4 × 2
     id   resp
  <int>  <dbl>
1     1 0.429 
2     4 0.0143
3     7 0.778 
4    10 0.316

Set-up

library(dplyr)
library(nycflights13)
(flights2 <- select(flights, year:dep_time, carrier, tailnum))

# A tibble: 336,776 × 6
    year month   day dep_time carrier tailnum
   <int> <int> <int>    <int> <chr>   <chr>  
 1  2013     1     1      517 UA      N14228 
 2  2013     1     1      533 UA      N24211 
 3  2013     1     1      542 AA      N619AA 
 4  2013     1     1      544 B6      N804JB 
 5  2013     1     1      554 DL      N668DN 
 6  2013     1     1      554 UA      N39463 
 7  2013     1     1      555 B6      N516JB 
 8  2013     1     1      557 EV      N829AS 
 9  2013     1     1      557 B6      N593JB 
10  2013     1     1      558 AA      N3ALAA 
# … with 336,766 more rows

Set-up

set.seed(1)
(airlines2 <- slice_sample(airlines, prop = 0.5))

# A tibble: 8 × 2
  carrier name                  
  <chr>   <chr>                 
1 HA      Hawaiian Airlines Inc.
2 B6      JetBlue Airways       
3 F9      Frontier Airlines Inc.
4 9E      Endeavor Air Inc.     
5 AA      American Airlines Inc.
6 VX      Virgin America        
7 UA      United Air Lines Inc. 
8 AS      Alaska Airlines Inc.

(airlines3 <- rename(airlines2, airline = carrier))

# A tibble: 8 × 2
  airline name                  
  <chr>   <chr>                 
1 HA      Hawaiian Airlines Inc.
2 B6      JetBlue Airways       
3 F9      Frontier Airlines Inc.
4 9E      Endeavor Air Inc.     
5 AA      American Airlines Inc.
6 VX      Virgin America        
7 UA      United Air Lines Inc. 
8 AS      Alaska Airlines Inc.

Joining with {dplyr}

library(dplyr)

Filtering joins

Affect rows

Semi joins

Keep only matching observations

When is this useful?

Semi joins

# A tibble: 3 × 2
    key val_x
  <dbl> <chr>
1     1 x1   
2     2 x2   
3     3 x3

# A tibble: 3 × 2
    key val_y
  <dbl> <chr>
1     1 y1   
2     2 y2   
3     4 y3

semi_join(x, y, by = "key")

# A tibble: 2 × 2
    key val_x
  <dbl> <chr>
1     1 x1   
2     2 x2

Semi joins

airlines2

# A tibble: 8 × 2
  carrier name                  
  <chr>   <chr>                 
1 HA      Hawaiian Airlines Inc.
2 B6      JetBlue Airways       
3 F9      Frontier Airlines Inc.
4 9E      Endeavor Air Inc.     
5 AA      American Airlines Inc.
6 VX      Virgin America        
7 UA      United Air Lines Inc. 
8 AS      Alaska Airlines Inc.

Semi joins

flights2 |>
  semi_join(airlines2, by = "carrier")

# A tibble: 171,392 × 6
    year month   day dep_time carrier tailnum
   <int> <int> <int>    <int> <chr>   <chr>  
 1  2013     1     1      517 UA      N14228 
 2  2013     1     1      533 UA      N24211 
 3  2013     1     1      542 AA      N619AA 
 4  2013     1     1      544 B6      N804JB 
 5  2013     1     1      554 UA      N39463 
 6  2013     1     1      555 B6      N516JB 
 7  2013     1     1      557 B6      N593JB 
 8  2013     1     1      558 AA      N3ALAA 
 9  2013     1     1      558 B6      N793JB 
10  2013     1     1      558 B6      N657JB 
# … with 171,382 more rows

Anti joins

Keep only non-matching observations

When is this useful?

Anti joins

# A tibble: 3 × 2
    key val_x
  <dbl> <chr>
1     1 x1   
2     2 x2   
3     3 x3

# A tibble: 3 × 2
    key val_y
  <dbl> <chr>
1     1 y1   
2     2 y2   
3     4 y3

anti_join(x, y, by = "key")

# A tibble: 1 × 2
    key val_x
  <dbl> <chr>
1     3 x3

Anti joins

flights2 |>
  anti_join(airlines2, by = "carrier")

# A tibble: 165,384 × 6
    year month   day dep_time carrier tailnum
   <int> <int> <int>    <int> <chr>   <chr>  
 1  2013     1     1      554 DL      N668DN 
 2  2013     1     1      557 EV      N829AS 
 3  2013     1     1      600 MQ      N542MQ 
 4  2013     1     1      602 DL      N971DL 
 5  2013     1     1      602 MQ      N730MQ 
 6  2013     1     1      606 DL      N3739P 
 7  2013     1     1      608 MQ      N9EAMQ 
 8  2013     1     1      615 DL      N326NB 
 9  2013     1     1      622 US      N807AW 
10  2013     1     1      624 EV      N11107 
# … with 165,374 more rows

Joining with different key names

flights2

# A tibble: 336,776 × 6
    year month   day dep_time carrier tailnum
   <int> <int> <int>    <int> <chr>   <chr>  
 1  2013     1     1      517 UA      N14228 
 2  2013     1     1      533 UA      N24211 
 3  2013     1     1      542 AA      N619AA 
 4  2013     1     1      544 B6      N804JB 
 5  2013     1     1      554 DL      N668DN 
 6  2013     1     1      554 UA      N39463 
 7  2013     1     1      555 B6      N516JB 
 8  2013     1     1      557 EV      N829AS 
 9  2013     1     1      557 B6      N593JB 
10  2013     1     1      558 AA      N3ALAA 
# … with 336,766 more rows

airlines3

# A tibble: 8 × 2
  airline name                  
  <chr>   <chr>                 
1 HA      Hawaiian Airlines Inc.
2 B6      JetBlue Airways       
3 F9      Frontier Airlines Inc.
4 9E      Endeavor Air Inc.     
5 AA      American Airlines Inc.
6 VX      Virgin America        
7 UA      United Air Lines Inc. 
8 AS      Alaska Airlines Inc.

Joining with different key names

flights2 |>
  anti_join(airlines3, by = join_by(carrier == airline))

# A tibble: 165,384 × 6
    year month   day dep_time carrier tailnum
   <int> <int> <int>    <int> <chr>   <chr>  
 1  2013     1     1      554 DL      N668DN 
 2  2013     1     1      557 EV      N829AS 
 3  2013     1     1      600 MQ      N542MQ 
 4  2013     1     1      602 DL      N971DL 
 5  2013     1     1      602 MQ      N730MQ 
 6  2013     1     1      606 DL      N3739P 
 7  2013     1     1      608 MQ      N9EAMQ 
 8  2013     1     1      615 DL      N326NB 
 9  2013     1     1      622 US      N807AW 
10  2013     1     1      624 EV      N11107 
# … with 165,374 more rows

Binding rows and columns

Adding rows

tibble::add_row()

(df <- tibble(x = 1:3, y = 3:1))

# A tibble: 3 × 2
      x     y
  <int> <int>
1     1     3
2     2     2
3     3     1

df %>% add_row(x = 4, y = 0)

# A tibble: 4 × 2
      x     y
  <dbl> <dbl>
1     1     3
2     2     2
3     3     1
4     4     0

df %>% add_row(x = 4, y = 0, 
               .before = 2)

# A tibble: 4 × 2
      x     y
  <dbl> <dbl>
1     1     3
2     4     0
3     2     2
4     3     1

df %>% add_row(x = 4:5, y = 0:-1)

# A tibble: 5 × 2
      x     y
  <int> <int>
1     1     3
2     2     2
3     3     1
4     4     0
5     5    -1

Add rows

dplyr::bind_rows()

(df2 <- tibble(x = 4:5, y = 5:4))

# A tibble: 2 × 2
      x     y
  <int> <int>
1     4     5
2     5     4

bind_rows(df, df2)

# A tibble: 5 × 2
      x     y
  <int> <int>
1     1     3
2     2     2
3     3     1
4     4     5
5     5     4

(df3 <- tibble(x = 6:7, y = 7:6, 
              z = c("A", "B")))

# A tibble: 2 × 3
      x     y z    
  <int> <int> <chr>
1     6     7 A    
2     7     6 B

bind_rows(df, df3)

# A tibble: 5 × 3
      x     y z    
  <int> <int> <chr>
1     1     3 <NA> 
2     2     2 <NA> 
3     3     1 <NA> 
4     6     7 A    
5     7     6 B

Add columns

dplyr::bind_cols()

(df4 <- tibble(z = c("A", "B" , "C"), 
              zz = c("Z", "Y", "X")))

# A tibble: 3 × 2
  z     zz   
  <chr> <chr>
1 A     Z    
2 B     Y    
3 C     X

bind_cols(df, df4)

# A tibble: 3 × 4
      x     y z     zz   
  <int> <int> <chr> <chr>
1     1     3 A     Z    
2     2     2 B     Y    
3     3     1 C     X

bind_cols(df, new_col = df4$z)

# A tibble: 3 × 3
      x     y new_col
  <int> <int> <chr>  
1     1     3 A      
2     2     2 B      
3     3     1 C

But why is this dangerous? What is a better solution?

Set operations

For finding overlap, differences, and combinations of datasets

Intersect

Common rows in both x and y, keeping just unique rows

Set difference

All rows from x which are not also rows in y, keeping just unique rows

Union

All unique rows from x and y

Union all

All rows from x and y, keeping duplicates

SQL

Congratulations—you just learned SQL databases!

Solving the problem

What code combines data1 and data2 into data3?

data1

# A tibble: 12 × 2
      id   resp
   <int>  <dbl>
 1     1 0.429 
 2     2 0.663 
 3     3 0.539 
 4     4 0.0143
 5     5 0.753 
 6     6 0.234 
 7     7 0.778 
 8     8 0.625 
 9     9 0.570 
10    10 0.316 
11    11 0.192 
12    12 0.657

data2

# A tibble: 12 × 2
      id  cond
   <int> <int>
 1     1     1
 2     2     2
 3     3     3
 4     4     1
 5     5     2
 6     6     3
 7     7     1
 8     8     2
 9     9     3
10    10     1
11    11     2
12    12     3

data3

# A tibble: 4 × 2
     id   resp
  <int>  <dbl>
1     1 0.429 
2     4 0.0143
3     7 0.778 
4    10 0.316

Let’s code!

Merging rows [Rmd]