2023-03-03
What’s different between these data sets?
What is needed to create data3
from data1
and data2
?
data1
# A tibble: 12 × 2
id resp
<int> <dbl>
1 1 0.429
2 2 0.663
3 3 0.539
4 4 0.0143
5 5 0.753
6 6 0.234
7 7 0.778
8 8 0.625
9 9 0.570
10 10 0.316
11 11 0.192
12 12 0.657
data2
# A tibble: 12 × 2
id cond
<int> <int>
1 1 1
2 2 2
3 3 3
4 4 1
5 5 2
6 6 3
7 7 1
8 8 2
9 9 3
10 10 1
11 11 2
12 12 3
data3
# A tibble: 4 × 2
id resp
<int> <dbl>
1 1 0.429
2 4 0.0143
3 7 0.778
4 10 0.316
library(dplyr)
library(nycflights13)
(flights2 <- select(flights, year:dep_time, carrier, tailnum))
# A tibble: 336,776 × 6
year month day dep_time carrier tailnum
<int> <int> <int> <int> <chr> <chr>
1 2013 1 1 517 UA N14228
2 2013 1 1 533 UA N24211
3 2013 1 1 542 AA N619AA
4 2013 1 1 544 B6 N804JB
5 2013 1 1 554 DL N668DN
6 2013 1 1 554 UA N39463
7 2013 1 1 555 B6 N516JB
8 2013 1 1 557 EV N829AS
9 2013 1 1 557 B6 N593JB
10 2013 1 1 558 AA N3ALAA
# … with 336,766 more rows
set.seed(1)
(airlines2 <- slice_sample(airlines, prop = 0.5))
# A tibble: 8 × 2
carrier name
<chr> <chr>
1 HA Hawaiian Airlines Inc.
2 B6 JetBlue Airways
3 F9 Frontier Airlines Inc.
4 9E Endeavor Air Inc.
5 AA American Airlines Inc.
6 VX Virgin America
7 UA United Air Lines Inc.
8 AS Alaska Airlines Inc.
(airlines3 <- rename(airlines2, airline = carrier))
# A tibble: 8 × 2
airline name
<chr> <chr>
1 HA Hawaiian Airlines Inc.
2 B6 JetBlue Airways
3 F9 Frontier Airlines Inc.
4 9E Endeavor Air Inc.
5 AA American Airlines Inc.
6 VX Virgin America
7 UA United Air Lines Inc.
8 AS Alaska Airlines Inc.
When is this useful?
x
# A tibble: 3 × 2
key val_x
<dbl> <chr>
1 1 x1
2 2 x2
3 3 x3
y
# A tibble: 3 × 2
key val_y
<dbl> <chr>
1 1 y1
2 2 y2
3 4 y3
semi_join(x, y, by = "key")
# A tibble: 2 × 2
key val_x
<dbl> <chr>
1 1 x1
2 2 x2
airlines2
# A tibble: 8 × 2
carrier name
<chr> <chr>
1 HA Hawaiian Airlines Inc.
2 B6 JetBlue Airways
3 F9 Frontier Airlines Inc.
4 9E Endeavor Air Inc.
5 AA American Airlines Inc.
6 VX Virgin America
7 UA United Air Lines Inc.
8 AS Alaska Airlines Inc.
flights2 |>
semi_join(airlines2, by = "carrier")
# A tibble: 171,392 × 6
year month day dep_time carrier tailnum
<int> <int> <int> <int> <chr> <chr>
1 2013 1 1 517 UA N14228
2 2013 1 1 533 UA N24211
3 2013 1 1 542 AA N619AA
4 2013 1 1 544 B6 N804JB
5 2013 1 1 554 UA N39463
6 2013 1 1 555 B6 N516JB
7 2013 1 1 557 B6 N593JB
8 2013 1 1 558 AA N3ALAA
9 2013 1 1 558 B6 N793JB
10 2013 1 1 558 B6 N657JB
# … with 171,382 more rows
When is this useful?
x
# A tibble: 3 × 2
key val_x
<dbl> <chr>
1 1 x1
2 2 x2
3 3 x3
y
# A tibble: 3 × 2
key val_y
<dbl> <chr>
1 1 y1
2 2 y2
3 4 y3
anti_join(x, y, by = "key")
# A tibble: 1 × 2
key val_x
<dbl> <chr>
1 3 x3
flights2 |>
anti_join(airlines2, by = "carrier")
# A tibble: 165,384 × 6
year month day dep_time carrier tailnum
<int> <int> <int> <int> <chr> <chr>
1 2013 1 1 554 DL N668DN
2 2013 1 1 557 EV N829AS
3 2013 1 1 600 MQ N542MQ
4 2013 1 1 602 DL N971DL
5 2013 1 1 602 MQ N730MQ
6 2013 1 1 606 DL N3739P
7 2013 1 1 608 MQ N9EAMQ
8 2013 1 1 615 DL N326NB
9 2013 1 1 622 US N807AW
10 2013 1 1 624 EV N11107
# … with 165,374 more rows
flights2
# A tibble: 336,776 × 6
year month day dep_time carrier tailnum
<int> <int> <int> <int> <chr> <chr>
1 2013 1 1 517 UA N14228
2 2013 1 1 533 UA N24211
3 2013 1 1 542 AA N619AA
4 2013 1 1 544 B6 N804JB
5 2013 1 1 554 DL N668DN
6 2013 1 1 554 UA N39463
7 2013 1 1 555 B6 N516JB
8 2013 1 1 557 EV N829AS
9 2013 1 1 557 B6 N593JB
10 2013 1 1 558 AA N3ALAA
# … with 336,766 more rows
airlines3
# A tibble: 8 × 2
airline name
<chr> <chr>
1 HA Hawaiian Airlines Inc.
2 B6 JetBlue Airways
3 F9 Frontier Airlines Inc.
4 9E Endeavor Air Inc.
5 AA American Airlines Inc.
6 VX Virgin America
7 UA United Air Lines Inc.
8 AS Alaska Airlines Inc.
# A tibble: 165,384 × 6
year month day dep_time carrier tailnum
<int> <int> <int> <int> <chr> <chr>
1 2013 1 1 554 DL N668DN
2 2013 1 1 557 EV N829AS
3 2013 1 1 600 MQ N542MQ
4 2013 1 1 602 DL N971DL
5 2013 1 1 602 MQ N730MQ
6 2013 1 1 606 DL N3739P
7 2013 1 1 608 MQ N9EAMQ
8 2013 1 1 615 DL N326NB
9 2013 1 1 622 US N807AW
10 2013 1 1 624 EV N11107
# … with 165,374 more rows
(df <- tibble(x = 1:3, y = 3:1))
# A tibble: 3 × 2
x y
<int> <int>
1 1 3
2 2 2
3 3 1
# A tibble: 3 × 2
z zz
<chr> <chr>
1 A Z
2 B Y
3 C X
bind_cols(df, df4)
# A tibble: 3 × 4
x y z zz
<int> <int> <chr> <chr>
1 1 3 A Z
2 2 2 B Y
3 3 1 C X
bind_cols(df, new_col = df4$z)
# A tibble: 3 × 3
x y new_col
<int> <int> <chr>
1 1 3 A
2 2 2 B
3 3 1 C
But why is this dangerous? What is a better solution?
Common rows in both x
and y
, keeping just unique rows
All rows from x
which are not also rows in y
, keeping just unique rows
All unique rows from x
and y
All rows from x
and y
, keeping duplicates
Congratulations—you just learned SQL databases!
What code combines data1
and data2
into data3
?
data1
# A tibble: 12 × 2
id resp
<int> <dbl>
1 1 0.429
2 2 0.663
3 3 0.539
4 4 0.0143
5 5 0.753
6 6 0.234
7 7 0.778
8 8 0.625
9 9 0.570
10 10 0.316
11 11 0.192
12 12 0.657
data2
# A tibble: 12 × 2
id cond
<int> <int>
1 1 1
2 2 2
3 3 3
4 4 1
5 5 2
6 6 3
7 7 1
8 8 2
9 9 3
10 10 1
11 11 2
12 12 3
data3
# A tibble: 4 × 2
id resp
<int> <dbl>
1 1 0.429
2 4 0.0143
3 7 0.778
4 10 0.316