library(tidyverse)
# library(ggplot2) in tidyverse
# library(dplyr) in tidyverse
# library(stringr) in tidyverse
# library(readr) in tidyverse
library(ggthemes)
library(ggridges)
library(readxl)
library(googlesheets4)
library(sf) #standardized way to encode spatial vector data
library(gt) #Presentation-Ready Display Tables
library(gtsummary) #publication-ready analytical and summary tables using the R programming language
library(knitr) #general-purpose tool for dynamic report generation in R using Literate Programming techniques.
library(DT) #Data objects in R can be rendered as HTML tables using the JavaScript library 'DataTables'
library(lubridate) #tools to make dates parsing and manipulation easier
# datasets: ------------------------------------
library(gapminder)
library(palmerpenguins)
library(rnaturalearth)
library(nycflights13)
Cheatsheet for R
FS24 - Research Beyond the Lab
Useful stuff
HELP: ?<func_name>
Generic YAML header:
---
: "Title"
title: "Subtitle"
subtitle: today
date:
author- name: "Sven Prinz"
: 0009-0000-4485-1235
orcid: sprinz@ethz.ch
email:
affiliation- name: ETH Zurich
: https://ethz.ch/de.html
url:
abstract"Abstract."
: "CC BY"
license:
citation-title: Research Beyond the Lab
container# url: https://www.example.com # insert website link here
# bibliography: references.bib
# csl: apa.csl # other csls https://github.com/citation-style-language/styles
:
format# docx: default
:
html: true
toc-depth: 4
toc-expand: true
toc-resources: true
embed# fig-width: 6
# fig-asp: 0.618 # the golden ratio, only adjust fig-asp in chunks where needed
# fig-align: center
# out-width: "70%"
# global code chunk options are defined:
:
execute: true
warning: false # regularily clear out by knitr::clean_cache()
cache
: visual
editor:
editor_options: console
chunk_output_type---
Code chunk options: (full list)
#| label: (name chunk)
#| eval: false (code not evaluated, no results)
#| include: false (runs code, does not show code or results in doc)
#| echo: false (results shown, code not)
#| message: false (prevents message appearance)
#| warning: false (prevents warning appearance)
#| results: hide (hide printed output)
#| fig-show: hide (hide plots)
#| error: true (render continues despite error)
#| code-fold: true (collapse the code chunk)
Shortcuts
List of shortcut: Alt + Shift + K
Quarto Shortcuts
Pipe operator: Ctrl + Shift + M
Assignment operator: Alt + -
Run code chunk: Ctrl + Shift + Enter
Script Shortcuts
Sectioning comment headers: Ctrl + Shift + R
Run all chunks: Ctrl + Alt + R
Execute complete script: Ctrl + Shift + S
Restart R: Ctrl + Shift + 0/F10
Setup (load packages)
Read data
Read CSV
When reading data, the data type can be controlled by specifying col_types = "<types for each col>"
<- read_csv(here::here("data/jmp_wld_sanitation_long.csv"))
sanitation
<- read_csv("https://pos.it/r4ds-students-csv", # import directly from URL
students na = c("N/A", ""), # define what counts as NA
col_types = "ncccc" # specify the data type for each column
)
read_csv( # skip metadata at beginning of csv file
"The first line of metadata
The second line of metadata
1,2,3
4,5,6",
col_names = c("x", "y", "z"),
skip = 2
)
# A tibble: 2 × 3
x y z
<dbl> <dbl> <dbl>
1 1 2 3
2 4 5 6
read_csv("
x
10
.
20
30",
col_types = list(x = col_double()) # https://r4ds.hadley.nz/data-import#column-types
|>
) problems() # --> read_csv(<...>, na = ".")
# A tibble: 1 × 5
row col expected actual file
<int> <int> <chr> <chr> <chr>
1 3 1 a double . /tmp/RtmpojtjtK/file86659031dcb
Read RDS
<- read_rds(here::here("data/processed/waste-city-level-sml.rds"))
waste_gt <- read_rds(here::here("data/processed/waste-survey-processed_rds")) # here:here --> https://rbtl-fs24.github.io/website/assignments/md-05/am-05-1-survey-data.html
survey <- read_rds(here::here("data/msw-generation-and-composition-by-income-long-mean.rds")) waste_data_long_mean
Read spreadsheets
#read google sheet
<- read_sheet("https://docs.google.com/spreadsheets/d/12YhM_aLZkg2q383SP8PwfyRpPWA5X2Q2VkhzYYyF1dQ/edit#gid=0")
waste_data
#read excel file
<- read_xlsx(here::here("data/raw/faecal-sludge-analysis.xlsx")) # read_xls / read excel() (guesses if xls or xlsx) sludge
Explore
head(gapminder)
# A tibble: 6 × 6
country continent year lifeExp pop gdpPercap
<fct> <fct> <int> <dbl> <int> <dbl>
1 Afghanistan Asia 1952 28.8 8425333 779.
2 Afghanistan Asia 1957 30.3 9240934 821.
3 Afghanistan Asia 1962 32.0 10267083 853.
4 Afghanistan Asia 1967 34.0 11537966 836.
5 Afghanistan Asia 1972 36.1 13079460 740.
6 Afghanistan Asia 1977 38.4 14880372 786.
tail(gapminder)
# A tibble: 6 × 6
country continent year lifeExp pop gdpPercap
<fct> <fct> <int> <dbl> <int> <dbl>
1 Zimbabwe Africa 1982 60.4 7636524 789.
2 Zimbabwe Africa 1987 62.4 9216418 706.
3 Zimbabwe Africa 1992 60.4 10704340 693.
4 Zimbabwe Africa 1997 46.8 11404948 792.
5 Zimbabwe Africa 2002 40.0 11926563 672.
6 Zimbabwe Africa 2007 43.5 12311143 470.
glimpse(gapminder)
Rows: 1,704
Columns: 6
$ country <fct> "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", …
$ continent <fct> Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, …
$ year <int> 1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997, …
$ lifeExp <dbl> 28.801, 30.332, 31.997, 34.020, 36.088, 38.438, 39.854, 40.8…
$ pop <int> 8425333, 9240934, 10267083, 11537966, 13079460, 14880372, 12…
$ gdpPercap <dbl> 779.4453, 820.8530, 853.1007, 836.1971, 739.9811, 786.1134, …
str(gapminder)
tibble [1,704 × 6] (S3: tbl_df/tbl/data.frame)
$ country : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
$ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
$ year : int [1:1704] 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
$ lifeExp : num [1:1704] 28.8 30.3 32 34 36.1 ...
$ pop : int [1:1704] 8425333 9240934 10267083 11537966 13079460 14880372 12881816 13867957 16317921 22227415 ...
$ gdpPercap: num [1:1704] 779 821 853 836 740 ...
nrow(gapminder)
[1] 1704
ncol(gapminder)
[1] 6
distinct(gapminder, country) # keep unique rows
# A tibble: 142 × 1
country
<fct>
1 Afghanistan
2 Albania
3 Algeria
4 Angola
5 Argentina
6 Australia
7 Austria
8 Bahrain
9 Bangladesh
10 Belgium
# ℹ 132 more rows
Data transformation with dplyr
The dplyr
R Package aims to provide a function for each basic verb of data manipulation. These verbs can be organised into three categories based on the component of the dataset that they work with:
- Rows
- Columns
- Groups of rows
( across()
: Apply a function (or functions) across multiple columns: https://dplyr.tidyverse.org/reference/across.html)
Rows
filter()
The function filter()
chooses rows based on column values. To use filtering effectively, you have to know how to select the observations that you want using the comparison operators. R provides the standard suite: >
, >=
, <
, <=
, !=
(not equal), and ==
(equal).
Use &
and |
to filter for multiple values.
|>
sanitation filter(residence == "national")
# A tibble: 24,570 × 8
name iso3 year region_sdg varname_short varname_long residence percent
<chr> <chr> <dbl> <chr> <chr> <chr> <chr> <dbl>
1 Afghanis… AFG 2000 Central a… san_bas basic sanit… national 21.9
2 Afghanis… AFG 2000 Central a… san_lim limited san… national 5.65
3 Afghanis… AFG 2000 Central a… san_unimp unimproved … national 46.7
4 Afghanis… AFG 2000 Central a… san_od no sanitati… national 25.8
5 Afghanis… AFG 2000 Central a… san_sm safely mana… national NA
6 Afghanis… AFG 2001 Central a… san_bas basic sanit… national 21.9
7 Afghanis… AFG 2001 Central a… san_lim limited san… national 5.66
8 Afghanis… AFG 2001 Central a… san_unimp unimproved … national 46.7
9 Afghanis… AFG 2001 Central a… san_od no sanitati… national 25.8
10 Afghanis… AFG 2001 Central a… san_sm safely mana… national NA
# ℹ 24,560 more rows
|>
sanitation filter(residence != "national")
# A tibble: 49,140 × 8
name iso3 year region_sdg varname_short varname_long residence percent
<chr> <chr> <dbl> <chr> <chr> <chr> <chr> <dbl>
1 Afghanis… AFG 2000 Central a… san_bas basic sanit… rural 19.3
2 Afghanis… AFG 2000 Central a… san_bas basic sanit… urban 30.9
3 Afghanis… AFG 2000 Central a… san_lim limited san… rural 3.14
4 Afghanis… AFG 2000 Central a… san_lim limited san… urban 14.5
5 Afghanis… AFG 2000 Central a… san_unimp unimproved … rural 46.3
6 Afghanis… AFG 2000 Central a… san_unimp unimproved … urban 48.1
7 Afghanis… AFG 2000 Central a… san_od no sanitati… rural 31.3
8 Afghanis… AFG 2000 Central a… san_od no sanitati… urban 6.51
9 Afghanis… AFG 2000 Central a… san_sm safely mana… rural NA
10 Afghanis… AFG 2000 Central a… san_sm safely mana… urban NA
# ℹ 49,130 more rows
|>
sanitation filter(name == "Nigeria", (residence == "rural" | residence == "urban"))
# A tibble: 210 × 8
name iso3 year region_sdg varname_short varname_long residence percent
<chr> <chr> <dbl> <chr> <chr> <chr> <chr> <dbl>
1 Nigeria NGA 2000 Sub-Saharan… san_bas basic sanit… rural 28.0
2 Nigeria NGA 2000 Sub-Saharan… san_bas basic sanit… urban 29.8
3 Nigeria NGA 2000 Sub-Saharan… san_lim limited san… rural 15.1
4 Nigeria NGA 2000 Sub-Saharan… san_lim limited san… urban 40.1
5 Nigeria NGA 2000 Sub-Saharan… san_unimp unimproved … rural 23.4
6 Nigeria NGA 2000 Sub-Saharan… san_unimp unimproved … urban 16.6
7 Nigeria NGA 2000 Sub-Saharan… san_od no sanitati… rural 33.4
8 Nigeria NGA 2000 Sub-Saharan… san_od no sanitati… urban 13.5
9 Nigeria NGA 2000 Sub-Saharan… san_sm safely mana… rural 21.9
10 Nigeria NGA 2000 Sub-Saharan… san_sm safely mana… urban 20.1
# ℹ 200 more rows
|>
sanitation filter(iso3 == "UGA" | iso3 == "PER" | iso3 == "CHE")
# A tibble: 945 × 8
name iso3 year region_sdg varname_short varname_long residence percent
<chr> <chr> <dbl> <chr> <chr> <chr> <chr> <dbl>
1 Peru PER 2000 Latin America… san_bas basic sanit… national 63.3
2 Peru PER 2000 Latin America… san_bas basic sanit… rural 29.4
3 Peru PER 2000 Latin America… san_bas basic sanit… urban 75.7
4 Peru PER 2000 Latin America… san_lim limited san… national 7.88
5 Peru PER 2000 Latin America… san_lim limited san… rural 2.14
6 Peru PER 2000 Latin America… san_lim limited san… urban 10.0
7 Peru PER 2000 Latin America… san_unimp unimproved … national 10.3
8 Peru PER 2000 Latin America… san_unimp unimproved … rural 18.5
9 Peru PER 2000 Latin America… san_unimp unimproved … urban 7.29
10 Peru PER 2000 Latin America… san_od no sanitati… national 18.6
# ℹ 935 more rows
|>
sanitation filter(iso3 %in% c("UGA", "PER", "CHE"))
# A tibble: 945 × 8
name iso3 year region_sdg varname_short varname_long residence percent
<chr> <chr> <dbl> <chr> <chr> <chr> <chr> <dbl>
1 Peru PER 2000 Latin America… san_bas basic sanit… national 63.3
2 Peru PER 2000 Latin America… san_bas basic sanit… rural 29.4
3 Peru PER 2000 Latin America… san_bas basic sanit… urban 75.7
4 Peru PER 2000 Latin America… san_lim limited san… national 7.88
5 Peru PER 2000 Latin America… san_lim limited san… rural 2.14
6 Peru PER 2000 Latin America… san_lim limited san… urban 10.0
7 Peru PER 2000 Latin America… san_unimp unimproved … national 10.3
8 Peru PER 2000 Latin America… san_unimp unimproved … rural 18.5
9 Peru PER 2000 Latin America… san_unimp unimproved … urban 7.29
10 Peru PER 2000 Latin America… san_od no sanitati… national 18.6
# ℹ 935 more rows
|>
sanitation filter(percent > 80)
# A tibble: 8,314 × 8
name iso3 year region_sdg varname_short varname_long residence percent
<chr> <chr> <dbl> <chr> <chr> <chr> <chr> <dbl>
1 Albania ALB 2000 Northern Am… san_bas basic sanit… national 89.5
2 Albania ALB 2000 Northern Am… san_bas basic sanit… rural 84.2
3 Albania ALB 2000 Northern Am… san_bas basic sanit… urban 96.9
4 Albania ALB 2001 Northern Am… san_bas basic sanit… national 90.0
5 Albania ALB 2001 Northern Am… san_bas basic sanit… rural 84.9
6 Albania ALB 2001 Northern Am… san_bas basic sanit… urban 97.0
7 Albania ALB 2002 Northern Am… san_bas basic sanit… national 90.6
8 Albania ALB 2002 Northern Am… san_bas basic sanit… rural 85.7
9 Albania ALB 2002 Northern Am… san_bas basic sanit… urban 97.0
10 Albania ALB 2003 Northern Am… san_bas basic sanit… national 91.2
# ℹ 8,304 more rows
|>
sanitation filter(percent <= 5)
# A tibble: 21,424 × 8
name iso3 year region_sdg varname_short varname_long residence percent
<chr> <chr> <dbl> <chr> <chr> <chr> <chr> <dbl>
1 Afghanis… AFG 2000 Central a… san_lim limited san… rural 3.14
2 Afghanis… AFG 2001 Central a… san_lim limited san… rural 3.14
3 Afghanis… AFG 2002 Central a… san_lim limited san… rural 3.35
4 Afghanis… AFG 2003 Central a… san_lim limited san… rural 3.57
5 Afghanis… AFG 2004 Central a… san_lim limited san… rural 3.79
6 Afghanis… AFG 2005 Central a… san_lim limited san… rural 4.01
7 Afghanis… AFG 2005 Central a… san_od no sanitati… urban 4.86
8 Afghanis… AFG 2006 Central a… san_lim limited san… rural 4.22
9 Afghanis… AFG 2006 Central a… san_od no sanitati… urban 4.45
10 Afghanis… AFG 2007 Central a… san_lim limited san… rural 4.44
# ℹ 21,414 more rows
|>
sanitation filter(!is.na(percent))
# A tibble: 53,967 × 8
name iso3 year region_sdg varname_short varname_long residence percent
<chr> <chr> <dbl> <chr> <chr> <chr> <chr> <dbl>
1 Afghanis… AFG 2000 Central a… san_bas basic sanit… national 21.9
2 Afghanis… AFG 2000 Central a… san_bas basic sanit… rural 19.3
3 Afghanis… AFG 2000 Central a… san_bas basic sanit… urban 30.9
4 Afghanis… AFG 2000 Central a… san_lim limited san… national 5.65
5 Afghanis… AFG 2000 Central a… san_lim limited san… rural 3.14
6 Afghanis… AFG 2000 Central a… san_lim limited san… urban 14.5
7 Afghanis… AFG 2000 Central a… san_unimp unimproved … national 46.7
8 Afghanis… AFG 2000 Central a… san_unimp unimproved … rural 46.3
9 Afghanis… AFG 2000 Central a… san_unimp unimproved … urban 48.1
10 Afghanis… AFG 2000 Central a… san_od no sanitati… national 25.8
# ℹ 53,957 more rows
arrange()
The function arrange()
changes the order of the rows.
|>
sanitation arrange(desc(percent))
# A tibble: 73,710 × 8
name iso3 year region_sdg varname_short varname_long residence percent
<chr> <chr> <dbl> <chr> <chr> <chr> <chr> <dbl>
1 Andorra AND 2004 Northern … san_bas basic sanit… national 100.
2 Falkland… FLK 2013 Latin Ame… san_bas basic sanit… national 100.
3 Falkland… FLK 2015 Latin Ame… san_bas basic sanit… national 100.
4 Falkland… FLK 2002 Latin Ame… san_bas basic sanit… national 100.
5 Israel ISR 2004 Western A… san_bas basic sanit… national 100.
6 Andorra AND 2019 Northern … san_bas basic sanit… national 100.
7 Andorra AND 2019 Northern … san_sm safely mana… national 100.
8 Andorra AND 2020 Northern … san_bas basic sanit… national 100.
9 Andorra AND 2020 Northern … san_sm safely mana… national 100.
10 Andorra AND 2012 Northern … san_bas basic sanit… national 100.
# ℹ 73,700 more rows
distinct()
Keep only unique/distinct rows from a data frame.
Columns
select()
The select()
function chooses columns based on their names.
|>
sanitation select(name, percent)
# A tibble: 73,710 × 2
name percent
<chr> <dbl>
1 Afghanistan 21.9
2 Afghanistan 19.3
3 Afghanistan 30.9
4 Afghanistan 5.65
5 Afghanistan 3.14
6 Afghanistan 14.5
7 Afghanistan 46.7
8 Afghanistan 46.3
9 Afghanistan 48.1
10 Afghanistan 25.8
# ℹ 73,700 more rows
|>
sanitation select(-varname_short) #delete/remove a column
# A tibble: 73,710 × 7
name iso3 year region_sdg varname_long residence percent
<chr> <chr> <dbl> <chr> <chr> <chr> <dbl>
1 Afghanistan AFG 2000 Central and Southern … basic sanit… national 21.9
2 Afghanistan AFG 2000 Central and Southern … basic sanit… rural 19.3
3 Afghanistan AFG 2000 Central and Southern … basic sanit… urban 30.9
4 Afghanistan AFG 2000 Central and Southern … limited san… national 5.65
5 Afghanistan AFG 2000 Central and Southern … limited san… rural 3.14
6 Afghanistan AFG 2000 Central and Southern … limited san… urban 14.5
7 Afghanistan AFG 2000 Central and Southern … unimproved … national 46.7
8 Afghanistan AFG 2000 Central and Southern … unimproved … rural 46.3
9 Afghanistan AFG 2000 Central and Southern … unimproved … urban 48.1
10 Afghanistan AFG 2000 Central and Southern … no sanitati… national 25.8
# ℹ 73,700 more rows
|>
sanitation select(name:region_sdg, percent)
# A tibble: 73,710 × 5
name iso3 year region_sdg percent
<chr> <chr> <dbl> <chr> <dbl>
1 Afghanistan AFG 2000 Central and Southern Asia 21.9
2 Afghanistan AFG 2000 Central and Southern Asia 19.3
3 Afghanistan AFG 2000 Central and Southern Asia 30.9
4 Afghanistan AFG 2000 Central and Southern Asia 5.65
5 Afghanistan AFG 2000 Central and Southern Asia 3.14
6 Afghanistan AFG 2000 Central and Southern Asia 14.5
7 Afghanistan AFG 2000 Central and Southern Asia 46.7
8 Afghanistan AFG 2000 Central and Southern Asia 46.3
9 Afghanistan AFG 2000 Central and Southern Asia 48.1
10 Afghanistan AFG 2000 Central and Southern Asia 25.8
# ℹ 73,700 more rows
rename()
The rename()
function changes the names of variables.
|>
sanitation rename(country = name) # use `name of ...` if the name includes special characters/spaces
# A tibble: 73,710 × 8
country iso3 year region_sdg varname_short varname_long residence percent
<chr> <chr> <dbl> <chr> <chr> <chr> <chr> <dbl>
1 Afghanis… AFG 2000 Central a… san_bas basic sanit… national 21.9
2 Afghanis… AFG 2000 Central a… san_bas basic sanit… rural 19.3
3 Afghanis… AFG 2000 Central a… san_bas basic sanit… urban 30.9
4 Afghanis… AFG 2000 Central a… san_lim limited san… national 5.65
5 Afghanis… AFG 2000 Central a… san_lim limited san… rural 3.14
6 Afghanis… AFG 2000 Central a… san_lim limited san… urban 14.5
7 Afghanis… AFG 2000 Central a… san_unimp unimproved … national 46.7
8 Afghanis… AFG 2000 Central a… san_unimp unimproved … rural 46.3
9 Afghanis… AFG 2000 Central a… san_unimp unimproved … urban 48.1
10 Afghanis… AFG 2000 Central a… san_od no sanitati… national 25.8
# ℹ 73,700 more rows
rename(`income category` = income_cat)
rename_all(~ c(*all the new names*)) # https://www.perplexity.ai/search/How-do-you-RsHoALUfRsWscmTmbzU39A
|> janitor::clean_names() # <-- automated clean names students
mutate()
The mutate()
function adds new variables based on existing variables or external data.
|>
sanitation select(-varname_long) |>
mutate(prop = percent / 100)
# A tibble: 73,710 × 8
name iso3 year region_sdg varname_short residence percent prop
<chr> <chr> <dbl> <chr> <chr> <chr> <dbl> <dbl>
1 Afghanistan AFG 2000 Central and S… san_bas national 21.9 0.219
2 Afghanistan AFG 2000 Central and S… san_bas rural 19.3 0.193
3 Afghanistan AFG 2000 Central and S… san_bas urban 30.9 0.309
4 Afghanistan AFG 2000 Central and S… san_lim national 5.65 0.0565
5 Afghanistan AFG 2000 Central and S… san_lim rural 3.14 0.0314
6 Afghanistan AFG 2000 Central and S… san_lim urban 14.5 0.145
7 Afghanistan AFG 2000 Central and S… san_unimp national 46.7 0.467
8 Afghanistan AFG 2000 Central and S… san_unimp rural 46.3 0.463
9 Afghanistan AFG 2000 Central and S… san_unimp urban 48.1 0.481
10 Afghanistan AFG 2000 Central and S… san_od national 25.8 0.258
# ℹ 73,700 more rows
|>
sanitation mutate(id = seq(1:n()),
.before = name)
# A tibble: 73,710 × 9
id name iso3 year region_sdg varname_short varname_long residence
<int> <chr> <chr> <dbl> <chr> <chr> <chr> <chr>
1 1 Afghanistan AFG 2000 Central a… san_bas basic sanit… national
2 2 Afghanistan AFG 2000 Central a… san_bas basic sanit… rural
3 3 Afghanistan AFG 2000 Central a… san_bas basic sanit… urban
4 4 Afghanistan AFG 2000 Central a… san_lim limited san… national
5 5 Afghanistan AFG 2000 Central a… san_lim limited san… rural
6 6 Afghanistan AFG 2000 Central a… san_lim limited san… urban
7 7 Afghanistan AFG 2000 Central a… san_unimp unimproved … national
8 8 Afghanistan AFG 2000 Central a… san_unimp unimproved … rural
9 9 Afghanistan AFG 2000 Central a… san_unimp unimproved … urban
10 10 Afghanistan AFG 2000 Central a… san_od no sanitati… national
# ℹ 73,700 more rows
# ℹ 1 more variable: percent <dbl>
|>
waste_data mutate(mass_total_kg = NULL) |> # removes column
mutate(bin_id = paste0("B-", bin_id)) # add string to values
# A tibble: 20 × 9
bin_id collection_date location mass_paper_kg mass_plastic_kg mass_glass_kg
<chr> <chr> <chr> <dbl> <dbl> <dbl>
1 B-1 Sunday, 31st Mar… Baecker… 5 3 4
2 B-2 Sunday, 31st Mar… Baecker… 11 2 2
3 B-3 Sunday, 31st Mar… Baecker… 2 6 3
4 B-4 Sunday, 31st Mar… Baecker… 9 4 1
5 B-5 Sunday, 31st Mar… Baecker… 4 0 7
6 B-6 Sunday, 31st Mar… Landwie… 4 6 3
7 B-7 Sunday, 31st Mar… Landwie… 3 1 9
8 B-8 Sunday, 31st Mar… Landwie… 8 4 2
9 B-9 Sunday, 31st Mar… Landwie… 1 14 3
10 B-10 Sunday, 31st Mar… Landwie… 2 5 2
11 B-11 Thursday, 4th Ap… Baecker… 7 3 5
12 B-12 Thursday, 4th Ap… Baecker… 3 7 8
13 B-13 Thursday, 4th Ap… Baecker… 4 8 0
14 B-14 Thursday, 4th Ap… Baecker… 9 3 8
15 B-15 Thursday, 4th Ap… Baecker… 3 2 7
16 B-16 Thursday, 4th Ap… Landwie… 10 2 3
17 B-17 Thursday, 4th Ap… Landwie… 3 6 5
18 B-18 Thursday, 4th Ap… Landwie… 6 4 5
19 B-19 Thursday, 4th Ap… Landwie… 3 8 4
20 B-20 Thursday, 4th Ap… Landwie… 4 8 6
# ℹ 3 more variables: mass_metal_kg <dbl>, mass_organic_kg <dbl>,
# mass_other_kg <dbl>
|>
students mutate(age = parse_number(if_else(AGE == "five", "5", AGE)))
# A tibble: 6 × 6
`Student ID` `Full Name` favourite.food mealPlan AGE age
<dbl> <chr> <chr> <chr> <chr> <dbl>
1 1 Sunil Huffmann Strawberry yoghurt Lunch only 4 4
2 2 Barclay Lynn French fries Lunch only 5 5
3 3 Jayendra Lyne <NA> Breakfast and lu… 7 7
4 4 Leon Rossini Anchovies Lunch only <NA> NA
5 5 Chidiegwu Dunkel Pizza Breakfast and lu… five 5
6 6 Güvenç Attila Ice cream Lunch only 6 6
: mutate(..., .keep = "used") # <-- only used columns are kept argument
relocate()
|>
sanitation mutate(id = seq(1:n())) |>
relocate(id)
# A tibble: 73,710 × 9
id name iso3 year region_sdg varname_short varname_long residence
<int> <chr> <chr> <dbl> <chr> <chr> <chr> <chr>
1 1 Afghanistan AFG 2000 Central a… san_bas basic sanit… national
2 2 Afghanistan AFG 2000 Central a… san_bas basic sanit… rural
3 3 Afghanistan AFG 2000 Central a… san_bas basic sanit… urban
4 4 Afghanistan AFG 2000 Central a… san_lim limited san… national
5 5 Afghanistan AFG 2000 Central a… san_lim limited san… rural
6 6 Afghanistan AFG 2000 Central a… san_lim limited san… urban
7 7 Afghanistan AFG 2000 Central a… san_unimp unimproved … national
8 8 Afghanistan AFG 2000 Central a… san_unimp unimproved … rural
9 9 Afghanistan AFG 2000 Central a… san_unimp unimproved … urban
10 10 Afghanistan AFG 2000 Central a… san_od no sanitati… national
# ℹ 73,700 more rows
# ℹ 1 more variable: percent <dbl>
|>
sanitation mutate(id = seq(1:n())) |>
relocate(id, .after = name)
# A tibble: 73,710 × 9
name id iso3 year region_sdg varname_short varname_long residence
<chr> <int> <chr> <dbl> <chr> <chr> <chr> <chr>
1 Afghanistan 1 AFG 2000 Central a… san_bas basic sanit… national
2 Afghanistan 2 AFG 2000 Central a… san_bas basic sanit… rural
3 Afghanistan 3 AFG 2000 Central a… san_bas basic sanit… urban
4 Afghanistan 4 AFG 2000 Central a… san_lim limited san… national
5 Afghanistan 5 AFG 2000 Central a… san_lim limited san… rural
6 Afghanistan 6 AFG 2000 Central a… san_lim limited san… urban
7 Afghanistan 7 AFG 2000 Central a… san_unimp unimproved … national
8 Afghanistan 8 AFG 2000 Central a… san_unimp unimproved … rural
9 Afghanistan 9 AFG 2000 Central a… san_unimp unimproved … urban
10 Afghanistan 10 AFG 2000 Central a… san_od no sanitati… national
# ℹ 73,700 more rows
# ℹ 1 more variable: percent <dbl>
Groups
summarize()
The summarize()
function reduces multiple values down to a single summary.
filter(sanitation,
== "national",
residence == 2020,
year == "san_sm") |>
varname_short filter(!is.na(percent)) |>
summarize(mean_percent = mean(percent), #, na.rm = TRUE),
sd_percent = sd(percent),
n = n())
# A tibble: 1 × 3
mean_percent sd_percent n
<dbl> <dbl> <int>
1 60.3 29.9 120
<- waste_gt |>
waste_tbl_income filter(!is.na(generation_kg_capita)) |>
group_by(income_cat) |>
summarise(
count = n(),
mean = mean(generation_kg_capita),
sd = sd(generation_kg_capita),
median = median(generation_kg_capita),
min = min(generation_kg_capita),
max = max(generation_kg_capita)
)print(waste_tbl_income)
# A tibble: 4 × 7
income_cat count mean sd median min max
<fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 high income 71 477. 214. 421. 116. 1142.
2 upper-middle income 72 381. 133. 378. 130. 828.
3 lower-middle income 116 275. 179. 219. 62.1 1109.
4 low income 67 215. 130. 182. 6.86 694.
|>
penguins #filter(!is.na(bill_depth_mm),!is.na(bill_length_mm)) |>
group_by(species, year) |>
summarize(bill_depth_mean = mean(bill_depth_mm, na.rm = TRUE),
bill_length_mean = mean(bill_length_mm, na.rm = TRUE),
n = n()) # <-- returns number of rows in each group
# A tibble: 9 × 5
# Groups: species [3]
species year bill_depth_mean bill_length_mean n
<fct> <int> <dbl> <dbl> <int>
1 Adelie 2007 18.8 38.8 50
2 Adelie 2008 18.2 38.6 50
3 Adelie 2009 18.1 39.0 52
4 Chinstrap 2007 18.5 48.7 26
5 Chinstrap 2008 18.4 48.7 18
6 Chinstrap 2009 18.3 49.1 24
7 Gentoo 2007 14.7 47.0 34
8 Gentoo 2008 14.9 46.9 46
9 Gentoo 2009 15.3 48.5 44
group_by() [reverse: ungroup()]
The group_by()
function is used to group the data by one or more variables.
filter(sanitation,
== "national",
residence == 2020,
year == "san_sm") |>
varname_short group_by(region_sdg) |>
filter(!is.na(percent)) |>
summarize(n = n(),
mean_percent = mean(percent),
sd_percent = sd(percent))
# A tibble: 8 × 4
region_sdg n mean_percent sd_percent
<chr> <int> <dbl> <dbl>
1 Australia and New Zealand 2 78.2 5.61
2 Central and Southern Asia 5 58.2 21.5
3 Eastern and South-Eastern Asia 11 69.8 21.4
4 Latin America and the Caribbean 14 43.4 16.8
5 Northern America and Europe 44 81.9 19.9
6 Oceania 3 36.1 10.7
7 Sub-Saharan Africa 21 21.4 10.9
8 Western Asia and Northern Africa 20 62.7 29.5
Alternative: use .by = c(..., ...)
as argument in other function to group within a single operation
slice_
|> slice_head(n = 1) # takes the first row from each group. sludge
# A tibble: 1 × 6
id date_sample system location users ts
<dbl> <dttm> <chr> <chr> <dbl> <dbl>
1 1 2023-11-01 00:00:00 pit latrine household 5 136.
|> slice_tail(n = 1) # takes the last row in each group. sludge
# A tibble: 1 × 6
id date_sample system location users ts
<dbl> <dttm> <chr> <chr> <dbl> <dbl>
1 20 2023-11-04 00:00:00 septic tank public toilet 59 15.6
|> slice_min(ts, n = 1) # takes the row with the smallest value of column ts. sludge
# A tibble: 1 × 6
id date_sample system location users ts
<dbl> <dttm> <chr> <chr> <dbl> <dbl>
1 16 2023-11-04 00:00:00 septic tank public toilet 26 0.72
|> slice_max(ts, n = 1) # takes the row with the largest value of column ts. sludge
# A tibble: 1 × 6
id date_sample system location users ts
<dbl> <dttm> <chr> <chr> <dbl> <dbl>
1 1 2023-11-01 00:00:00 pit latrine household 5 136.
|> slice_sample(n = 1) # takes one random row. sludge
# A tibble: 1 × 6
id date_sample system location users ts
<dbl> <dttm> <chr> <chr> <dbl> <dbl>
1 7 2023-11-02 00:00:00 septic tank household 14 15.2
You can vary n
to select more than one row, or instead of n =
, you can use prop = 0.1
to select (e.g.) 10% of the rows in each group.
Note that slice_min()
and slice_max()
keep tied values so n = 1
means give us all rows with the highest value. If you want exactly one row per group you can set with_ties = FALSE
.
count()
The count()
function is a convenient wrapper for group_by()
and summarise(n = n())
. You can prepare frequency tables with count()
.
|>
sanitation count(region_sdg)
# A tibble: 8 × 2
region_sdg n
<chr> <int>
1 Australia and New Zealand 630
2 Central and Southern Asia 4410
3 Eastern and South-Eastern Asia 5670
4 Latin America and the Caribbean 15750
5 Northern America and Europe 16695
6 Oceania 6615
7 Sub-Saharan Africa 16065
8 Western Asia and Northern Africa 7875
#same as
|>
sanitation group_by(region_sdg) |>
summarize(s = n())
# A tibble: 8 × 2
region_sdg s
<chr> <int>
1 Australia and New Zealand 630
2 Central and Southern Asia 4410
3 Eastern and South-Eastern Asia 5670
4 Latin America and the Caribbean 15750
5 Northern America and Europe 16695
6 Oceania 6615
7 Sub-Saharan Africa 16065
8 Western Asia and Northern Africa 7875
|>
sanitation count(varname_short, varname_long)
# A tibble: 5 × 3
varname_short varname_long n
<chr> <chr> <int>
1 san_bas basic sanitation services 14742
2 san_lim limited sanitation services 14742
3 san_od no sanitation facilities 14742
4 san_sm safely managed sanitation services 14742
5 san_unimp unimproved sanitation facilities 14742
|>
sanitation count(is.na(percent))
# A tibble: 2 × 2
`is.na(percent)` n
<lgl> <int>
1 FALSE 53967
2 TRUE 19743
When writing the count output to an object –> use count(WHATEVER)$n
to only take the numbers
Pivoting
<- survey |>
survey_long select(id, sep_paper:sep_electro) |>
pivot_longer(cols = -id, # all cols but id
names_to = "waste_type",
values_to = "answer",
values_drop_na = TRUE) |> # gets rid of all NAs in the created values column
mutate(waste_type = str_remove(waste_type, pattern = "sep_"))
print(survey_long)
# A tibble: 112 × 3
id waste_type answer
<int> <chr> <chr>
1 1 paper Separated
2 1 cardboard Separated
3 1 plastic Separated
4 1 glass Separated
5 1 metal Separated
6 1 organic Separated
7 1 vegioil Not separated
8 1 electro Separated
9 2 paper Separated
10 2 cardboard Separated
# ℹ 102 more rows
<- survey_long |>
survey_counted count(waste_type, answer, sort = TRUE)
print(survey_counted)
# A tibble: 24 × 3
waste_type answer n
<chr> <chr> <int>
1 glass Separated 12
2 metal Separated 11
3 plastic Separated 11
4 cardboard Separated 10
5 electro Separated 10
6 organic Separated 10
7 paper Separated 10
8 vegioil Not separated 8
9 vegioil Separated 4
10 cardboard Not separated 3
# ℹ 14 more rows
|>
survey_countedpivot_wider(
# id_cols = starts_with("<chr>"), # values that uniquely identify each row (if more than one id col) - https://r4ds.hadley.nz/data-tidy
names_from = answer,
values_from = n
# values_fill = 0 # <-- what should be filled in for missing value
)
# A tibble: 8 × 4
waste_type Separated `Not separated` `No answer`
<chr> <int> <int> <int>
1 glass 12 1 1
2 metal 11 2 1
3 plastic 11 1 2
4 cardboard 10 3 1
5 electro 10 1 3
6 organic 10 3 1
7 paper 10 2 2
8 vegioil 4 8 2
If the column names (wide format) contain multiple pieces of info (e.g., sp_m_014
) create multiple columns with names_sep
.
Example:
|>
who2 pivot_longer(
cols = !(country:year),
names_to = c("diagnosis", "gender", "age"),
names_sep = "_",
values_to = "count"
)
Data types & (atomic) vectors
Atomic vectors (six types): logical, integer, double, character, complex, and raw.
Integer and double vectors are collectively known as numeric vectors.
- lgl: logical (only TRUE FALSE or NA as values)
- int: integer
- dbl: double
- chr: character
<- c(TRUE, FALSE, NA)
vector_lgl typeof(vector_lgl)
[1] "logical"
sum(vector_lgl, na.rm = TRUE)
[1] 1
as.numeric(vector_lgl) # convert to dbl
[1] 1 0 NA
typeof(c(1, 3, 6)) #double as default (takes up less space than integer)
[1] "double"
seq(1,6,2) #type: integer
[1] 1 3 5
<- c(1L, 3L, 6L)
vector_int typeof(vector_int)
[1] "integer"
<- c(1293, 5.15, 90.500)
vector_dbl typeof(vector_dbl)
[1] "double"
<- c("large", "small", "medium")
vector_chr typeof(vector_chr)
[1] "character"
Logical vectors
> 150 vector_dbl
[1] TRUE FALSE FALSE
"large" == vector_chr
[1] TRUE FALSE FALSE
str_detect(vector_chr, "lar")
[1] TRUE FALSE FALSE
c(1L, 0L, 1L) == vector_lgl
[1] TRUE TRUE NA
<- rnorm(200, 50, 5)
norm_dist
mean(norm_dist)
[1] 50.0745
>= 50 norm_dist
[1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE TRUE TRUE
[13] FALSE TRUE TRUE TRUE TRUE FALSE FALSE TRUE FALSE TRUE TRUE FALSE
[25] TRUE FALSE TRUE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[37] TRUE FALSE FALSE TRUE TRUE TRUE TRUE FALSE FALSE TRUE FALSE FALSE
[49] FALSE TRUE TRUE FALSE FALSE FALSE TRUE TRUE FALSE FALSE TRUE FALSE
[61] FALSE FALSE TRUE TRUE TRUE FALSE FALSE TRUE FALSE FALSE TRUE FALSE
[73] FALSE TRUE TRUE TRUE FALSE FALSE TRUE TRUE FALSE TRUE TRUE FALSE
[85] TRUE FALSE FALSE TRUE TRUE TRUE TRUE TRUE FALSE FALSE TRUE FALSE
[97] TRUE FALSE TRUE FALSE TRUE FALSE TRUE TRUE FALSE FALSE TRUE TRUE
[109] FALSE FALSE TRUE FALSE FALSE FALSE FALSE TRUE TRUE TRUE FALSE TRUE
[121] FALSE FALSE FALSE FALSE TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
[133] FALSE TRUE FALSE FALSE TRUE TRUE FALSE TRUE FALSE FALSE TRUE TRUE
[145] TRUE FALSE TRUE TRUE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
[157] FALSE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE TRUE FALSE FALSE
[169] TRUE TRUE TRUE TRUE FALSE TRUE FALSE FALSE FALSE TRUE TRUE TRUE
[181] FALSE TRUE FALSE FALSE TRUE FALSE FALSE FALSE TRUE TRUE TRUE FALSE
[193] FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
sum(norm_dist >= 50)
[1] 99
mean(norm_dist >= 50)
[1] 0.495
Numeric sequences
seq(1, 100, 1)
[1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
[19] 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
[37] 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
[55] 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
[73] 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
[91] 91 92 93 94 95 96 97 98 99 100
seq(1, 100, 50)
[1] 1 51
sample(1:100, 100, replace = FALSE) # integer sampling
[1] 63 26 29 77 71 52 11 75 65 13 21 4 62 3 14 57 98 18
[19] 33 72 22 12 67 6 86 97 92 49 100 28 51 55 78 17 70 45
[37] 79 93 68 54 96 25 37 53 20 91 16 36 31 10 40 81 43 39
[55] 90 9 27 19 64 60 15 1 47 46 73 56 80 32 84 83 8 23
[73] 94 7 61 89 34 30 69 87 42 35 74 44 24 85 88 2 59 48
[91] 99 76 82 95 66 5 50 41 38 58
# along character vector:
seq_along(letters) # type == integer
[1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
[26] 26
month.name
[1] "January" "February" "March" "April" "May" "June"
[7] "July" "August" "September" "October" "November" "December"
seq_along(month.name)
[1] 1 2 3 4 5 6 7 8 9 10 11 12
# random number dist:
runif(n = 1000, min = 1, max = 100) |> hist() # uniform distr sampling (continuous)
rnorm(n = 1000, mean = 500, sd = 150) |> hist() # normal distr sampling
Explicit vector coercion & augmented vectors
Vectors can also contain arbitrary additional metadata in the form of attributes. These attributes are used to create augmented vectors which build on additional behavior. For example, factors are built on top of integer vectors.
<- factor(vector_chr, levels = c("small", "medium", "large"))
vector_fct
typeof(vector_fct) # factor turns chr into integer vector
[1] "integer"
attributes(vector_fct)
$levels
[1] "small" "medium" "large"
$class
[1] "factor"
as.integer(vector_fct)
[1] 3 1 2
Create tibble / dataframe
<- tibble(name = c("bin_id", "collection_date",
data_dictionary "weekday", "location", "mass_VAR_kg"),
description = c("unique identifier of waste bin",
"calendar date of waste collection",
"corresponding weekday of collection",
"location of waste bin in Zurich",
"measured mass of waste of type VAR in kg (types: paper, plastic, glass, metal, organic,other)"))
<- tibble(
tib_data
vector_lgl,
vector_int,
vector_dbl,
vector_chr,
vector_fct,date = Sys.Date() # adjusts length to other columns (has to be repeatable)
)print(tib_data)
# A tibble: 3 × 6
vector_lgl vector_int vector_dbl vector_chr vector_fct date
<lgl> <int> <dbl> <chr> <fct> <date>
1 TRUE 1 1293 large large 2024-06-12
2 FALSE 3 5.15 small small 2024-06-12
3 NA 6 90.5 medium medium 2024-06-12
<- tribble( # Create (small) tibbles using a row-by-row layout
tribble ~id, ~bp1, ~bp2,
"A", 100, 120,
"B", 140, 115,
"C", 120, 125
)print(tribble)
# A tibble: 3 × 3
id bp1 bp2
<chr> <dbl> <dbl>
1 A 100 120
2 B 140 115
3 C 120 125
Accessing a vector from a dataframe:
|>
tib_data pull(vector_fct) # selects actual vector
[1] large small medium
Levels: small medium large
$vector_fct # selects actual vector tib_data
[1] large small medium
Levels: small medium large
5] # using numeric index, returns actual column; access col 5 tib_data[
# A tibble: 3 × 1
vector_fct
<fct>
1 large
2 small
3 medium
5]] # selects actual vector; access values in col 5 tib_data[[
[1] large small medium
Levels: small medium large
Joining data frames
<something>_join(x, y)
left_join()
: all rows from xright_join()
: all rows from yfull_join()
: all rows from both x and y
<- tribble(
x ~id, ~value_x,
1, "x1",
2, "x2",
3, "x3"
)
<- tribble(
y ~id, ~value_y,
1, "y1",
2, "y2",
4, "y4"
)
left_join(x, y)
# A tibble: 3 × 3
id value_x value_y
<dbl> <chr> <chr>
1 1 x1 y1
2 2 x2 y2
3 3 x3 <NA>
right_join(y, x)
# A tibble: 3 × 3
id value_y value_x
<dbl> <chr> <chr>
1 1 y1 x1
2 2 y2 x2
3 3 <NA> x3
right_join(x, y)
# A tibble: 3 × 3
id value_x value_y
<dbl> <chr> <chr>
1 1 x1 y1
2 2 x2 y2
3 4 <NA> y4
full_join(x, y)
# A tibble: 4 × 3
id value_x value_y
<dbl> <chr> <chr>
1 1 x1 y1
2 2 x2 y2
3 3 x3 <NA>
4 4 <NA> y4
full_join(y, x)
# A tibble: 4 × 3
id value_y value_x
<dbl> <chr> <chr>
1 1 y1 x1
2 2 y2 x2
3 4 y4 <NA>
4 3 <NA> x3
Conditions
|>
waste_data mutate(collection_date = if_else(collection_date == "Sunday, 31st March 2024",
"2024-03-31",
"2024-04-04"))
# A tibble: 20 × 10
bin_id collection_date location mass_total_kg mass_paper_kg mass_plastic_kg
<dbl> <chr> <chr> <dbl> <dbl> <dbl>
1 1 2024-03-31 Baeckeran… 30 5 3
2 2 2024-03-31 Baeckeran… 30 11 2
3 3 2024-03-31 Baeckeran… 30 2 6
4 4 2024-03-31 Baeckeran… 30 9 4
5 5 2024-03-31 Baeckeran… 30 4 0
6 6 2024-03-31 Landwiese 30 4 6
7 7 2024-03-31 Landwiese 30 3 1
8 8 2024-03-31 Landwiese 30 8 4
9 9 2024-03-31 Landwiese 30 1 14
10 10 2024-03-31 Landwiese 30 2 5
11 11 2024-04-04 Baeckeran… 30 7 3
12 12 2024-04-04 Baeckeran… 30 3 7
13 13 2024-04-04 Baeckeran… 30 4 8
14 14 2024-04-04 Baeckeran… 30 9 3
15 15 2024-04-04 Baeckeran… 30 3 2
16 16 2024-04-04 Landwiese 30 10 2
17 17 2024-04-04 Landwiese 30 3 6
18 18 2024-04-04 Landwiese 30 6 4
19 19 2024-04-04 Landwiese 30 3 8
20 20 2024-04-04 Landwiese 30 4 8
# ℹ 4 more variables: mass_glass_kg <dbl>, mass_metal_kg <dbl>,
# mass_organic_kg <dbl>, mass_other_kg <dbl>
|>
waste_gt mutate(income_cat = case_when(
== "HIC" ~ "high income",
income_id == "UMC" ~ "upper-middle income",
income_id == "LMC" ~ "lower-middle income",
income_id .default = income_id #old way of writing: TRUE ~ col_name
))
|>
sludge mutate(system = case_when(id == 6 ~ "septic tank",
.default = system))
|>
survey mutate(across(starts_with("sep_"), ~ case_when(
== "Separated" ~ "s", # dot is a placeholder for all selected cols
. == "Not separated" ~ "ns",
. == "No answer" ~ "noa"
. # https://www.perplexity.ai/search/In-an-R-zSpzF9tWTUOpT8NO0PdLAA )))
Programming with R
For loops
<- tib_data$vector_fct
size
for (s in size) {
<- paste(
msg "------", s, "------"
)print(msg)
}
[1] "------ large ------"
[1] "------ small ------"
[1] "------ medium ------"
If statements
<- c("bat", "cat", "dog", "bird", "horse")
pet
for(p in pet) {
if(p == "dog") {
<- paste("A", p, "is the best!")
msg else {
} <- paste("A", p, "is okay I guess.")
msg
}print(msg)
}
[1] "A bat is okay I guess."
[1] "A cat is okay I guess."
[1] "A dog is the best!"
[1] "A bird is okay I guess."
[1] "A horse is okay I guess."
<- c(NA, "meow", "woof", "chirp", "neigh")
sounds
<- list() #list is more versatile, can use index; alt: vector(length = 5)
message
for (i in seq_along(pet)) {
if (pet[i] == "dog") {
<- paste("The", pet[i], "goes", sounds[i])
message[i] else {
} <- paste("The", pet[i], "says", sounds[i])
message[i]
}
}print(message)
[[1]]
[1] "The bat says NA"
[[2]]
[1] "The cat says meow"
[[3]]
[1] "The dog goes woof"
[[4]]
[1] "The bird says chirp"
[[5]]
[1] "The horse says neigh"
Other functions
Factors
|>
sanitation mutate(varname_short = factor(varname_short, levels = c("san_sm","san_bas",
"san_lim", "san_unimp",
"san_od"))) |>
count(varname_short, varname_long)
# A tibble: 5 × 3
varname_short varname_long n
<fct> <chr> <int>
1 san_sm safely managed sanitation services 14742
2 san_bas basic sanitation services 14742
3 san_lim limited sanitation services 14742
4 san_unimp unimproved sanitation facilities 14742
5 san_od no sanitation facilities 14742
Any values not in the level will be silently converted to NA. (Might want to use forcats::fct()
instead.)
Access the set of valid levels directly with levels()
. You can also create a factor when reading your data with readr with col_factor()
:
read_csv(csv, col_types = cols(month = col_factor(month_levels)))
To reverse the order of the fct, s.t. the “first” item is on top of the y-axis on a figure use fct_rev()
(https://www.perplexity.ai/search/When-using-ggplot-.9PkPLwERA2BJyVcRre66g)
|>
fig_avail_response ggplot(aes(x = avail_resp_short, y = fct_rev(waste_type))) +
geom_tile(aes(fill = n)) +
labs(x = "Availability",
y = "Waste type",
fill = "# students")
fct_reorder()
takes three arguments:
.f
, the factor whose levels you want to modify..x
, a numeric vector that you want to use to reorder the levels.Optionally,
.fun
, a function that’s used if there are multiple values of.x
for each value of.f
. The default value ismedian
.
fct_relevel()
takes a factor, .f
, and then any number of levels that you want to move to the front of the line.
Further functions: https://r4ds.hadley.nz/factors
fct_reorder2(.f, .x, .y)
, fct_infreq()
(+fct_rev()
), fct_recode()
,fct_lump_*()
, ordered()
Dates
In R and other programming languages, dates are stored as numbers. The number of days since 1970-01-01 is the ISO 8601 standard.
In Excel, dates are stored as numbers of days since 1900-01-01. In Excel, the date number 1 corresponds to “1900-01-01,” but this system incorrectly considers 1900 as a leap year, which it is not. As a result, to correctly interpret date numbers that originate from systems like Excel, the origin “1899-12-30” is used to account for this discrepancy
<- read_excel(here::here("data/raw/date-formats.xlsx"))
dates
head(dates)
# A tibble: 1 × 5
date_iso date_us date_eu date_num date_time
<chr> <chr> <chr> <dbl> <dttm>
1 2023-11-01 11/01/2023 01/11/2023 45231 2023-11-01 00:00:00
<- dates |>
dates_class mutate(date_iso = as_date(date_iso)) |>
mutate(date_us = mdy(date_us)) |> #mdy: month, day, year
mutate(date_eu = dmy(date_eu)) |>
mutate(date_num = as_date(date_num, origin = "1899-12-30")) |>
mutate(date = as_date(date_time)) |>
mutate(date_time_tz = with_tz(date_time, tzone = "Africa/Kampala")) |> #forece_tz to apply local time zone
mutate(today = today())
print(dates_class)
# A tibble: 1 × 8
date_iso date_us date_eu date_num date_time date
<date> <date> <date> <date> <dttm> <date>
1 2023-11-01 2023-11-01 2023-11-01 2023-11-01 2023-11-01 00:00:00 2023-11-01
# ℹ 2 more variables: date_time_tz <dttm>, today <date>
as.numeric(today())
[1] 19886
as_date(1)
[1] "1970-01-02"
|>
dates_class select(today) |>
mutate(year = year(today)) |>
#mutate(month = month(today, label = TRUE, abbr = FALSE, locale = "fr_FR")) |>
mutate(quarter = quarter(today)) |>
mutate(week = week(today)) |>
mutate(day = day(today)) |>
#mutate(day_of_week = wday(today, label = TRUE, abbr = FALSE, locale = "fr_FR")) |>
mutate(day_of_year = yday(today)) |>
mutate(week_of_year = week(today))
# A tibble: 1 × 7
today year quarter week day day_of_year week_of_year
<date> <dbl> <int> <dbl> <int> <dbl> <dbl>
1 2024-06-12 2024 2 24 12 164 24
OlsonNames()
: List of tz database time zones
Others
Integer division %/%
, modulo %%
# objects
letters
[1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j" "k" "l" "m" "n" "o" "p" "q" "r" "s"
[20] "t" "u" "v" "w" "x" "y" "z"
LETTERS
[1] "A" "B" "C" "D" "E" "F" "G" "H" "I" "J" "K" "L" "M" "N" "O" "P" "Q" "R" "S"
[20] "T" "U" "V" "W" "X" "Y" "Z"
Selection / Detection
select()
and others https://dplyr.tidyverse.org/reference/select.html
starts_with("<e.g. a string>")
https://tidyselect.r-lib.org/reference/starts_with.html
str_detect()
Difference btw. str_detect and contains
# str_detect()
|>
survey mutate(gender_new = case_when(
== "Male" ~ "M",
gender str_detect(gender, pattern = "ema") == TRUE ~ "F",
str_detect(gender, pattern = "bin") == TRUE ~ NA_character_,
TRUE ~ gender
.before = gender) ),
# A tibble: 15 × 20
id time_stamp consent age gender_new gender people_household
<int> <dttm> <chr> <fct> <chr> <chr> <fct>
1 1 2024-03-20 08:37:09 Given >29 M Male 4 people
2 2 2024-03-21 15:16:53 Given 22-25 F Female 2 people
3 3 2024-03-21 15:24:27 Given 26-29 M Male 2 people
4 4 2024-03-21 15:34:44 Given 22-25 M Male 5 people
5 5 2024-03-21 15:35:08 Given >29 M Male 2 people
6 6 2024-03-21 17:48:17 Denied <NA> <NA> <NA> <NA>
7 7 2024-03-21 17:50:23 Given <18 <NA> Non-bina… 1 person
8 8 2024-03-21 18:14:25 Given 22-25 F Female 6 or more people
9 9 2024-03-21 18:47:23 Given >29 <NA> Non-bina… 6 or more people
10 10 2024-03-21 18:46:21 Given 22-25 F Female 2 people
11 11 2024-03-21 18:48:34 Given 18-21 M Male 2 people
12 12 2024-03-21 18:49:17 Given 26-29 F Female 4 people
13 13 2024-03-22 13:49:34 Given 26-29 M Male 4 people
14 14 2024-03-22 15:07:28 Given 22-25 F Female 2 people
15 15 2024-03-24 14:22:23 Given 18-21 M Male 6 or more people
# ℹ 13 more variables: degree <chr>, field_of_study <chr>, sep_paper <chr>,
# sep_cardboard <chr>, sep_plastic <chr>, sep_glass <chr>, sep_metal <chr>,
# sep_organic <chr>, sep_vegioil <chr>, sep_electro <chr>,
# grocery_waste <dbl>, study_to_waste <dbl>, suggest_improve <chr>
# long names can be saved to a variable
<- "field_of_study"
long_col_name
count(survey, !!sym(long_col_name)) # !!sym() converts the variable long_col_name into symbol usable in str_detect() function https://www.perplexity.ai/search/Using-the-filter-EamBK_pYRmucZrXnpBhYDQ
# A tibble: 7 × 2
field_of_study n
<chr> <int>
1 Architecture and Civil Engineering 1
2 Engineering Sciences 7
3 Humanities, Political and Social Sciences 1
4 Natural Sciences and Mathematics 3
5 Others 1
6 Prefer not to say. 1
7 <NA> 1
Visualization - Graphs
Save figures to files in code: ggsave()
–> when searching for figure, the code is also returned!
Aesthetic mappings
ggplot(data = penguins,
mapping = aes(x = flipper_length_mm,
y = body_mass_g,
color = species,
shape = species)) +
geom_point() # na.rm = TRUE can be included in the geom settings
Settings
ggplot(data = penguins,
mapping = aes(x = flipper_length_mm,
y = body_mass_g,
color = species,
shape = species)) +
geom_point(size = 5, alpha = 0.7)
Color scales
ggplot(data = penguins,
mapping = aes(x = flipper_length_mm,
y = body_mass_g,
color = species,
shape = species)) +
geom_point(size = 5, alpha = 0.7) +
scale_color_colorblind()
# similar for other mappings; e.g., scale_fill_manual(values = c("red", "orange", "grey")
ggplot(data = penguins,
mapping = aes(x = flipper_length_mm,
y = body_mass_g,
color = species,
shape = species)) +
geom_point(size = 5, alpha = 0.7) +
scale_color_manual(values = c("red", "blue", "green"))
Facets
ggplot(data = penguins,
mapping = aes(x = flipper_length_mm,
y = body_mass_g)) +
geom_point() +
facet_grid(island ~ species)
ggplot(data = filter(gapminder, year == 2007),
mapping = aes(x = gdpPercap,
y = lifeExp,
size = pop,
color = country)) +
geom_point(show.legend = FALSE) +
facet_wrap(~continent)
Themes
ggplot(data = penguins,
mapping = aes(x = flipper_length_mm,
y = body_mass_g,
color = species,
shape = species)) +
geom_point(size = 5, alpha = 0.7) +
scale_color_colorblind() +
theme_minimal()
Visualization Distribution
categorical & numerical variables:
geom_freqpoly()
orgeom_boxplot()
two categorical variables:
geom_tiles()
orgeom_count()
two numerical variables:
geom_point()
with transparencyalpha = 1/...
for large datasets; for very large datasets usegeom_bin2d()
orgeom_hex()
; or bin one continuous variable s.t. it acts as categorical
Categorical variables
ggplot(data = penguins,
mapping = aes(x = species)) +
geom_bar()
ggplot(data = penguins,
mapping = aes(x = species,
fill = island)) +
geom_bar() +
scale_fill_manual(values = c("red", "orange", "grey"))
ggplot(data = summarize(group_by(filter(gapminder, year == 2007), continent),
count = n(),
lifeExp = median(lifeExp)),
mapping = aes(x = continent,
y = count)) +
geom_col()
Visualizing covariation of two categorical variables
|>
diamonds count(color, cut) |>
ggplot(aes(x = color, y = cut)) +
geom_tile(aes(fill = n))
Numerical variables
ggplot(data = filter(gapminder, year == 2007),
mapping = aes(x = lifeExp,
fill = continent)) +
geom_histogram(col = "grey60", breaks = seq(35, 90, 1))
ggplot(data = penguins,
mapping = aes(x = body_mass_g,
fill = species)) +
geom_histogram() +
scale_fill_brewer(type = "qual")
ggplot(data = penguins,
mapping = aes(x = body_mass_g,
fill = species)) +
geom_density()
ggplot(data = penguins,
mapping = aes(x = body_mass_g,
y = species,
fill = species)) +
geom_density_ridges() +
scale_fill_colorblind()
ggplot(data = filter(gapminder, year == 2007),
mapping = aes(x = continent,
y = lifeExp,
fill = continent)) +
geom_boxplot(outlier.shape = NA)
ggplot(data = summarize(group_by(gapminder, continent, year),
lifeExp = median(lifeExp)),
mapping = aes(x = year,
y = lifeExp,
color = continent)) +
geom_line() +
geom_point()
ggplot(filter(diamonds, carat < 3), aes(x = carat, y = price)) +
geom_bin2d()
# install.packages("hexbin")
ggplot(filter(diamonds, carat < 3), aes(x = carat, y = price)) +
geom_hex()
ggplot(filter(diamonds, carat < 3), aes(x = carat, y = price)) +
geom_boxplot(aes(group = cut_width(carat, 0.2)),
varwidth = TRUE) # visualize nr of observations
Examples
ggplot(data = filter(gapminder,
== 2007),
year mapping = aes(x = continent,
y = lifeExp)) +
geom_boxplot() +
geom_jitter(width = 0.1, alpha = 1/4, size = 3) +
labs(x = NULL,
y = "Life Expectancy (years)") +
theme_minimal() # + geom_smooth(method = "METHOD")
<- ne_countries(scale = "small", returnclass = "sf")
world
|>
world mutate(income_grp = factor(income_grp, ordered = T)) |>
ggplot(aes(fill = income_grp)) +
geom_sf() +
theme_void() +
theme(legend.position = "top") +
labs(fill = "Income Group:") +
guides(fill = guide_legend(nrow = 2, byrow = TRUE))
ggplot(filter(sanitation,
== "Nigeria", (residence == "rural" | residence == "urban")),
name aes(year,
percent,group = varname_short,
color = varname_short)) +
geom_point() +
geom_line() +
facet_wrap(~residence) +
scale_color_colorblind()
ggplot(filter(sanitation,
== "Hungary",
name %in% c(2000, 2020),
year != "san_sm"),
varname_short aes(residence, percent, fill = varname_long)) +
geom_col()+
scale_fill_colorblind() +
facet_wrap(~year) +
geom_text(aes(label = round(percent, 1)),
position = position_stack(vjust = 0.5),
size = 3,
color = "white")
ggplot(data = filter(sanitation, year == 2020),
mapping = aes(x = percent, fill = varname_short)) +
geom_histogram() +
facet_grid(varname_short ~ residence, scales = "free_y") +
scale_fill_colorblind() +
theme(legend.position = "none")
ggplot(data = waste_data_long_mean,
mapping = aes(x = mean_percent,
y = waste_category,
fill = income_cat)) +
geom_col(position = position_dodge()) +
labs(title = "Waste Composition",
subtitle = "Mean percentages of nine waste categories displayed by income categories",
x = "mean (percent)",
y = "waste category",
fill = "Income category",
caption = "Data from: https://datacatalog.worldbank.org/search/dataset/0039597") +
scale_x_continuous(breaks = seq(0, 50, 5)) +
scale_fill_brewer(type = "qual", palette = 3) +
theme_minimal() +
theme(panel.grid.minor = element_blank(),
panel.grid.major.y = element_blank())
|>
sanitation filter(iso3 %in% c("AFG", "AGO"),
== "san_bas",
varname_short %in% c(2000, 2001, 2002),
year == "national") |>
residence ggplot(aes(x = year, y = percent)) +
geom_line(aes(group = name), color = "grey50") +
geom_point(aes(color = name, shape = name)) +
scale_x_continuous(breaks = c(2000, 2002)) # x-axis breaks at 1999 and 2000
Tables
waste_tbl_income
# A tibble: 4 × 7
income_cat count mean sd median min max
<fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 high income 71 477. 214. 421. 116. 1142.
2 upper-middle income 72 381. 133. 378. 130. 828.
3 lower-middle income 116 275. 179. 219. 62.1 1109.
4 low income 67 215. 130. 182. 6.86 694.
|>
waste_tbl_income gt() |>
tab_header(title = "Waste generation per capita (kg/year) by income group",
subtitle = "Data from 326 cities") |>
fmt_number(columns = count:max, decimals = 0) |> #format numeric values
cols_label(income_cat = "income category") |>
opt_stylize(style = 1)
Waste generation per capita (kg/year) by income group | ||||||
---|---|---|---|---|---|---|
Data from 326 cities | ||||||
income category | count | mean | sd | median | min | max |
high income | 71 | 477 | 214 | 421 | 116 | 1,142 |
upper-middle income | 72 | 381 | 133 | 378 | 130 | 828 |
lower-middle income | 116 | 275 | 179 | 219 | 62 | 1,109 |
low income | 67 | 215 | 130 | 182 | 7 | 694 |
|>
penguins filter(!is.na(bill_depth_mm)) |>
group_by(island, species) |>
summarise(n = n(),
mean_bill_depth = mean(bill_depth_mm),
sd_bill_depth = sd(bill_depth_mm)) |>
ungroup() |>
gt() |>
fmt_number(columns = c(mean_bill_depth, sd_bill_depth),
decimals = 1)
island | species | n | mean_bill_depth | sd_bill_depth |
---|---|---|---|---|
Biscoe | Adelie | 44 | 18.4 | 1.2 |
Biscoe | Gentoo | 123 | 15.0 | 1.0 |
Dream | Adelie | 56 | 18.3 | 1.1 |
Dream | Chinstrap | 68 | 18.4 | 1.1 |
Torgersen | Adelie | 51 | 18.4 | 1.3 |
Cross reference in text
Give an ID (label) to your figure or table, starting with fig- or tbl-.
Add a caption to your figure or table.
Refer to it with @fig-… or @tbl-….
Table 1 @tbl-waste-income
highlights … (identical for @fig-...
)
:
chunk options#| label: tbl-waste-income
#| tbl-cap: "Waste generation per capita (kg/year) by income group. Data from 326 cities."
|>
waste_tbl_income kable(col.names = c("Income Category", "Count", "Mean", "Standard deviation",
"Median", "Min", "Max"),
align = 'lcccrrr',
digits = 1)
Income Category | Count | Mean | Standard deviation | Median | Min | Max |
---|---|---|---|---|---|---|
high income | 71 | 477.1 | 213.7 | 421.0 | 116.5 | 1141.8 |
upper-middle income | 72 | 380.7 | 132.8 | 378.3 | 130.4 | 828.1 |
lower-middle income | 116 | 274.8 | 178.8 | 218.6 | 62.1 | 1109.3 |
low income | 67 | 214.6 | 130.4 | 182.5 | 6.9 | 693.5 |
Write data
#write csv
write_csv(data_dictionary, here::here("data/processed/data_dictionary.csv"))
#write rds
write_rds(x = waste_gt, file = here::here("data/processed/waste-city-level-sml-write.rds"))
Alternative to rds: parquet files from arrow package (much faster and usable outside of R).
Markdown Docs
Enter / to invoke shortcut options
Examples for source editor code
Reference sheet: Help > Markdown Quick References
LaTeX equations: ( $$ before and after equation)
\[ x = \frac{a\pm s_{safety}}{\sqrt{bms}} \]
Citations
In Tilley & Kalina (2021) @tilley2021my
, the authors describe how visitors still expect a personal pick-up, despite the availability of taxi services.
Inequality underpins waste management systems, structuring who can or cannot access services (Kalina et al., 2023) [@kalina2023rich]
.
Git config
library(usethis)
use_git_config(user.name = "<your git username>", user.email = "<your email used for git>")
References
Reuse
Citation
@online{prinz2024,
author = {Prinz, Sven},
title = {Cheatsheet for {R}},
date = {2024-06-12},
url = {https://sprinz11.github.io/FS24-rbtl-cheat_sheet_for_R/},
langid = {en},
abstract = {This document includes a bunch of functions that were used
during the lecture “Research Beyond the Lab - Open Science and
Research Methods for a Global Engineer” in the Spring semester 2024
(https://rbtl-fs24.github.io/website/). Feel free to use it for the
exam. No guarantee is given for completeness or correctness.}
}