Data manipulation using `dplyr` and `tidyr`

relative paths

sessionInfo()
#> R version 4.3.3 (2024-02-29)
#> Platform: x86_64-pc-linux-gnu (64-bit)
#> Running under: Ubuntu 22.04.4 LTS
#> 
#> Matrix products: default
#> BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0 
#> LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
#> 
#> locale:
#>  [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
#>  [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
#>  [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
#> [10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   
#> 
#> time zone: UTC
#> tzcode source: system (glibc)
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#>  [1] RSQLite_2.3.1   lubridate_1.9.2 forcats_1.0.0   stringr_1.5.0  
#>  [5] dplyr_1.1.3     purrr_1.0.2     readr_2.1.4     tidyr_1.3.0    
#>  [9] tibble_3.2.1    ggplot2_3.4.3   tidyverse_2.0.0 knitr_1.43     
#> 
#> loaded via a namespace (and not attached):
#>  [1] bit_4.0.5         gtable_0.3.4      compiler_4.3.3    renv_1.0.5       
#>  [5] highr_0.10        tidyselect_1.2.0  blob_1.2.4        scales_1.2.1     
#>  [9] fastmap_1.1.1     yaml_2.3.7        R6_2.5.1          generics_0.1.3   
#> [13] munsell_0.5.0     DBI_1.1.3         pillar_1.9.0      tzdb_0.4.0       
#> [17] rlang_1.1.1       utf8_1.2.3        cachem_1.0.8      stringi_1.7.12   
#> [21] xfun_0.40         bit64_4.0.5       memoise_2.0.1     timechange_0.2.0 
#> [25] cli_3.6.1         withr_2.5.0       magrittr_2.0.3    grid_4.3.3       
#> [29] rstudioapi_0.15.0 hms_1.1.3         lifecycle_1.0.3   vctrs_0.6.3      
#> [33] evaluate_0.21     glue_1.6.2        fansi_1.0.4       colorspace_2.1-0 
#> [37] tools_4.3.3       pkgconfig_2.0.3
# Use R as a calculator: the value of a bare expression is printed.
3 + 5
12 / 7
# `<-` stores a value under a name; the assignment itself prints nothing.
weight_kg <- 55
weight_kg <- 55    # doesn't print anything
(weight_kg <- 55)  # but putting parentheses around the assignment prints the value of `weight_kg`
weight_kg          # and so does typing the name of the object
# Objects can be used in arithmetic and reassigned at any time.
2.2 * weight_kg
weight_kg <- 57.5
2.2 * weight_kg
# `weight_lb` stores the result computed here (2.2 * 57.5). It is NOT
# recalculated when `weight_kg` is reassigned on the following line.
weight_lb <- 2.2 * weight_kg
weight_kg <- 100
# Exercise: work out the value of each object after every line runs.
mass <- 47.5            # mass?
age  <- 122             # age?
mass <- mass * 2.0      # mass?
age  <- age - 20        # age?
mass_index <- mass/age  # mass_index?
# Calling a function: the return value of sqrt() is assigned to `weight_kg`.
weight_kg <- sqrt(10)
round(3.14159)
#> [1] 3
args(round)
#> function (x, digits = 0) 
#> NULL
?round
round(3.14159, digits = 2)
#> [1] 3.14
round(3.14159, 2)
#> [1] 3.14
round(digits = 2, x = 3.14159)
#> [1] 3.14
weight_g <- c(50, 60, 65, 82)
weight_g
animals <- c("mouse", "rat", "dog")
animals
length(weight_g)
length(animals)
class(weight_g)
class(animals)
str(weight_g)
str(animals)
weight_g <- c(weight_g, 90) # add to the end of the vector
weight_g <- c(30, weight_g) # add to the beginning of the vector
weight_g
num_char <- c(1, 2, 3, "a")
num_logical <- c(1, 2, 3, TRUE)
char_logical <- c("a", "b", "c", TRUE)
tricky <- c(1, 2, 3, "4")
combined_logical <- c(num_logical, char_logical)
animals <- c("mouse", "rat", "dog", "cat")
animals[2]
#> [1] "rat"
animals[c(3, 2)]
#> [1] "dog" "rat"
more_animals <- animals[c(1, 2, 3, 2, 1, 4)]
more_animals
#> [1] "mouse" "rat"   "dog"   "rat"   "mouse" "cat"
weight_g <- c(21, 34, 39, 54, 55)
weight_g[c(TRUE, FALSE, FALSE, TRUE, TRUE)]
#> [1] 21 54 55
weight_g > 50    # will return logicals with TRUE for the indices that meet the condition
#> [1] FALSE FALSE FALSE  TRUE  TRUE
## so we can use this to select only the values above 50
weight_g[weight_g > 50]
#> [1] 54 55
weight_g[weight_g > 30 & weight_g < 50]
#> [1] 34 39
weight_g[weight_g <= 30 | weight_g == 55]
#> [1] 21 55
weight_g[weight_g >= 30 & weight_g == 21]
#> numeric(0)
animals <- c("mouse", "rat", "dog", "cat", "cat")

# return both rat and cat
animals[animals == "cat" | animals == "rat"]
#> [1] "rat" "cat" "cat"
# return a logical vector that is TRUE for the elements within animals
# that are found in the character vector and FALSE for those that are not
animals %in% c("rat", "cat", "dog", "duck", "goat", "bird", "fish")
#> [1] FALSE  TRUE  TRUE  TRUE  TRUE
# use the logical vector created by %in% to return elements from animals
# that are found in the character vector
animals[animals %in% c("rat", "cat", "dog", "duck", "goat", "bird", "fish")]
#> [1] "rat" "dog" "cat" "cat"
# A vector containing one missing value.
heights <- c(2, 4, 4, NA, 6)
# NA propagates through aggregates, so both of these return NA ...
mean(heights)
max(heights)
# ... unless missing values are explicitly dropped with `na.rm = TRUE`.
mean(heights, na.rm = TRUE)
max(heights, na.rm = TRUE)
## Extract those elements which are not missing values.
heights[!is.na(heights)]

## Returns the object with incomplete cases removed.
## The returned object is an atomic vector of type "numeric" (or "double");
## it also carries an `na.action` attribute recording the dropped positions.
na.omit(heights)

## Extract those elements which are complete cases.
## The returned object is an atomic vector of type "numeric" (or "double").
heights[complete.cases(heights)]
heights <- c(63, 69, 60, 65, NA, 68, 61, 70, 61, 59, 64, 69, 63, 63, NA, 72, 65, 64, 70, 63, 65)
heights <- c(63, 69, 60, 65, NA, 68, 61, 70, 61, 59, 64, 69, 63, 63, NA, 72, 65, 64, 70, 63, 65)

# 1.
heights_no_na <- heights[!is.na(heights)]
# or
heights_no_na <- na.omit(heights)
# or
heights_no_na <- heights[complete.cases(heights)]

# 2.
median(heights, na.rm = TRUE)

# 3.
heights_above_67 <- heights_no_na[heights_no_na > 67]
length(heights_above_67)
# Download the Portal survey data into data_raw/.
# download.file() does not create missing folders, so make sure the
# destination directory exists first (a no-op when it is already there).
dir.create("data_raw", showWarnings = FALSE)
download.file(url = "https://ndownloader.figshare.com/files/2292169",
              destfile = "data_raw/portal_data_joined.csv")
## load the tidyverse packages, incl. dplyr
library(tidyverse)
surveys <- read_csv("data_raw/portal_data_joined.csv")
#> Rows: 34786 Columns: 13
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: ","
#> chr (6): species_id, sex, genus, species, taxa, plot_type
#> dbl (7): record_id, month, day, year, plot_id, hindfoot_length, weight
#> 
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(surveys)
#> # A tibble: 6 × 13
#>   record_id month   day  year plot_id species_id sex   hindfoot_length weight
#>       <dbl> <dbl> <dbl> <dbl>   <dbl> <chr>      <chr>           <dbl>  <dbl>
#> 1         1     7    16  1977       2 NL         M                  32     NA
#> 2        72     8    19  1977       2 NL         M                  31     NA
#> 3       224     9    13  1977       2 NL         <NA>               NA     NA
#> 4       266    10    16  1977       2 NL         <NA>               NA     NA
#> 5       349    11    12  1977       2 NL         <NA>               NA     NA
#> 6       363    11    12  1977       2 NL         <NA>               NA     NA
#> # ℹ 4 more variables: genus <chr>, species <chr>, taxa <chr>, plot_type <chr>
view(surveys)
str(surveys)
str(surveys)
#> spc_tbl_ [34,786 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
#>  $ record_id      : num [1:34786] 1 72 224 266 349 363 435 506 588 661 ...
#>  $ month          : num [1:34786] 7 8 9 10 11 11 12 1 2 3 ...
#>  $ day            : num [1:34786] 16 19 13 16 12 12 10 8 18 11 ...
#>  $ year           : num [1:34786] 1977 1977 1977 1977 1977 ...
#>  $ plot_id        : num [1:34786] 2 2 2 2 2 2 2 2 2 2 ...
#>  $ species_id     : chr [1:34786] "NL" "NL" "NL" "NL" ...
#>  $ sex            : chr [1:34786] "M" "M" NA NA ...
#>  $ hindfoot_length: num [1:34786] 32 31 NA NA NA NA NA NA NA NA ...
#>  $ weight         : num [1:34786] NA NA NA NA NA NA NA NA 218 NA ...
#>  $ genus          : chr [1:34786] "Neotoma" "Neotoma" "Neotoma" "Neotoma" ...
#>  $ species        : chr [1:34786] "albigula" "albigula" "albigula" "albigula" ...
#>  $ taxa           : chr [1:34786] "Rodent" "Rodent" "Rodent" "Rodent" ...
#>  $ plot_type      : chr [1:34786] "Control" "Control" "Control" "Control" ...
#>  - attr(*, "spec")=
#>   .. cols(
#>   ..   record_id = col_double(),
#>   ..   month = col_double(),
#>   ..   day = col_double(),
#>   ..   year = col_double(),
#>   ..   plot_id = col_double(),
#>   ..   species_id = col_character(),
#>   ..   sex = col_character(),
#>   ..   hindfoot_length = col_double(),
#>   ..   weight = col_double(),
#>   ..   genus = col_character(),
#>   ..   species = col_character(),
#>   ..   taxa = col_character(),
#>   ..   plot_type = col_character()
#>   .. )
#>  - attr(*, "problems")=<externalptr>
## * class: data frame
## * how many rows: 34786,  how many columns: 13
# We can extract specific values by specifying row and column indices
# in the format: 
# data_frame[row_index, column_index]
# For instance, to extract the first row and column from surveys:
surveys[1, 1]

# First row, sixth column:
surveys[1, 6]   

# We can also use shortcuts to select a number of rows or columns at once
# To select all columns, leave the column index blank
# For instance, to select all columns for the first row:
surveys[1, ]

# The same shortcut works for rows --
# To select the first column across all rows:
surveys[, 1]

# An even shorter way to select first column across all rows:
surveys[1] # No comma! 

# To select multiple rows or columns, use vectors!
# To select the first three rows of the 5th and 6th column
surveys[c(1, 2, 3), c(5, 6)] 

# We can use the : operator to create those vectors for us:
surveys[1:3, 5:6] 

# This is equivalent to head_surveys <- head(surveys)
head_surveys <- surveys[1:6, ]

# As we've seen, when working with tibbles 
# subsetting with single square brackets ("[]") always returns a data frame.
# If you want a vector, use double square brackets ("[[]]")

# For instance, to get the first column as a vector:
surveys[[1]]

# To get the first value in our data frame:
surveys[[1, 1]]
surveys[, -1]                 # The whole data frame, except the first column
surveys[-(7:nrow(surveys)), ] # Equivalent to head(surveys)
# As before, using single brackets returns a data frame:
surveys["species_id"]
surveys[, "species_id"]

# Double brackets returns a vector:
surveys[["species_id"]]

# We can also use the $ operator with column names instead of double brackets
# This returns a vector:
surveys$species_id
## 1. Row 200 only
surveys_200 <- surveys[200, ]
## 2. Last row
# Saving `n_rows` to improve readability and reduce duplication
n_rows <- nrow(surveys)
surveys_last <- surveys[n_rows, ]
## 3. Middle row
# `n_rows / 2` is fractional when the row count is odd, and a fractional
# number is not a valid row position (base data frames silently truncate it,
# tibbles reject it). ceiling() gives the same row for even counts and a
# whole-number index for odd ones.
surveys_middle <- surveys[ceiling(n_rows / 2), ]
## 4. Reproduce head(surveys) with negative indexing
# NOTE: relies on `surveys` having more than 7 rows; otherwise `7:n_rows`
# would count downwards.
surveys_head <- surveys[-(7:n_rows), ]
surveys$sex <- factor(surveys$sex)
summary(surveys$sex)
sex <- factor(c("male", "female", "female", "male"))
levels(sex)
nlevels(sex)
sex # current order
#> [1] male   female female male  
#> Levels: female male
sex <- factor(sex, levels = c("male", "female"))
sex # after re-ordering
#> [1] male   female female male  
#> Levels: male female
surveys$taxa <- factor(surveys$taxa)
surveys$genus <- factor(surveys$genus)
summary(surveys)
nlevels(surveys$genus)

## * how many genera: There are 26 unique genera in the `genus` column.
## * how many rabbits: There are 75 rabbits in the `taxa` column.
as.character(sex)
# Converting a factor whose levels look like numbers back to numeric.
# The levels are stored sorted ("1977" "1983" "1990" "1998") and each element
# is held internally as the integer position of its level.
year_fct <- factor(c(1990, 1983, 1977, 1998, 1990))
# Returns the internal level codes (3 2 1 4 3), not the years.
as.numeric(year_fct)               # Wrong! And there is no warning...
# Go through the character labels first, then parse them as numbers.
as.numeric(as.character(year_fct)) # Works...
# Convert only the (few) levels to numeric, then index them by the factor.
as.numeric(levels(year_fct))[year_fct]    # The recommended way.
## bar plot of the number of females and males captured during the experiment:
plot(surveys$sex)
sex <- surveys$sex
levels(sex)
#> [1] "F" "M"
sex <- addNA(sex)
levels(sex)
#> [1] "F" "M" NA
head(sex)
#> [1] M    M    <NA> <NA> <NA> <NA>
#> Levels: F M <NA>
levels(sex)[3] <- "undetermined"
levels(sex)
#> [1] "F"            "M"            "undetermined"
head(sex)
#> [1] M            M            undetermined undetermined undetermined
#> [6] undetermined
#> Levels: F M undetermined
levels(sex)[1:2] <- c("female", "male")
sex <- factor(sex, levels = c("undetermined", "female", "male"))
plot(sex)
# NOTE(review): this snippet does not parse or run as written -- presumably an
# intentional "find the mistakes" exercise; confirm before fixing it.
# Problems visible in the code:
#   * the animal names are not quoted (and `sea cucumber` contains a space),
#   * `feel` has three values while the other columns have four,
#   * a comma is missing between `8` and `1.1` in `weight`.
animal_data <- data.frame(
          animal = c(dog, cat, sea cucumber, sea urchin),
          feel = c("furry", "squishy", "spiny"),
          weight = c(45, 8 1.1, 0.8)
          )
# Several of the c() calls below mix types, so R coerces every element of that
# vector to one common type before data.frame() receives it:
#   temperature         -> character (because of "15")
#   northern_hemisphere -> character (because of "FALSE")
#   has_kangaroo        -> numeric   (each FALSE becomes 0 alongside the 1)
# `country` and `climate` are plain character columns.
country_climate <- data.frame(
       country = c("Canada", "Panama", "South Africa", "Australia"),
       climate = c("cold", "hot", "temperate", "hot/temperate"),
       temperature = c(10, 30, 18, "15"),
       northern_hemisphere = c(TRUE, TRUE, FALSE, "FALSE"),
       has_kangaroo = c(FALSE, FALSE, FALSE, 1)
       )
str(surveys)
library(lubridate)
my_date <- ymd("2015-01-01")
str(my_date)
# sep indicates the character to use to separate each component
my_date <- ymd(paste("2015", "1", "1", sep = "-")) 
str(my_date)
paste(surveys$year, surveys$month, surveys$day, sep = "-")
ymd(paste(surveys$year, surveys$month, surveys$day, sep = "-"))
#> Warning: 129 failed to parse.
surveys$date <- ymd(paste(surveys$year, surveys$month, surveys$day, sep = "-"))
#> Warning: 129 failed to parse.
str(surveys) # notice the new column, with 'date' as the class
summary(surveys$date)
#>         Min.      1st Qu.       Median         Mean      3rd Qu.         Max. 
#> "1977-07-16" "1984-03-12" "1990-07-22" "1990-12-15" "1997-07-29" "2002-12-31" 
#>         NA's 
#>        "129"
missing_dates <- surveys[is.na(surveys$date), c("year", "month", "day")]

head(missing_dates)
#> # A tibble: 6 × 3
#>    year month   day
#>   <dbl> <dbl> <dbl>
#> 1  2000     9    31
#> 2  2000     4    31
#> 3  2000     4    31
#> 4  2000     4    31
#> 5  2000     4    31
#> 6  2000     9    31
surveys <- read_csv("data_raw/portal_data_joined.csv")
#> Rows: 34786 Columns: 13
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: ","
#> chr (6): species_id, sex, genus, species, taxa, plot_type
#> dbl (7): record_id, month, day, year, plot_id, hindfoot_length, weight
#> 
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## inspect the data
str(surveys)
## preview the data
view(surveys)
select(surveys, plot_id, species_id, weight)
select(surveys, -record_id, -species_id)
filter(surveys, year == 1995)
surveys2 <- filter(surveys, weight < 5)
surveys_sml <- select(surveys2, species_id, sex, weight)
surveys_sml <- select(filter(surveys, weight < 5), species_id, sex, weight)
surveys %>%
  filter(weight < 5) %>%
  select(species_id, sex, weight)
surveys_sml <- surveys %>%
  filter(weight < 5) %>%
  select(species_id, sex, weight)

surveys_sml
surveys %>%
    filter(year < 1995) %>%
    select(year, sex, weight)
surveys %>%
  mutate(weight_kg = weight / 1000)
surveys %>%
  mutate(weight_kg = weight / 1000,
         weight_lb = weight_kg * 2.2)
surveys %>%
  mutate(weight_kg = weight / 1000) %>%
  head()
surveys %>%
  filter(!is.na(weight)) %>%
  mutate(weight_kg = weight / 1000) %>%
  head()
surveys_hindfoot_cm <- surveys %>%
    filter(!is.na(hindfoot_length)) %>%
    mutate(hindfoot_cm = hindfoot_length / 10) %>%
    filter(hindfoot_cm < 3) %>%
    select(species_id, hindfoot_cm)
surveys %>%
  group_by(sex) %>%
  summarize(mean_weight = mean(weight, na.rm = TRUE))
surveys %>%
  group_by(sex, species_id) %>%
  summarize(mean_weight = mean(weight, na.rm = TRUE)) %>%
  tail()
#> `summarise()` has grouped output by 'sex'. You can override using the `.groups`
#> argument.
surveys %>%
  filter(!is.na(weight)) %>%
  group_by(sex, species_id) %>%
  summarize(mean_weight = mean(weight))
#> `summarise()` has grouped output by 'sex'. You can override using the `.groups`
#> argument.
surveys %>%
  filter(!is.na(weight)) %>%
  group_by(sex, species_id) %>%
  summarize(mean_weight = mean(weight)) %>%
  print(n = 15)
#> `summarise()` has grouped output by 'sex'. You can override using the `.groups`
#> argument.
surveys %>%
  filter(!is.na(weight)) %>%
  group_by(sex, species_id) %>%
  summarize(mean_weight = mean(weight),
            min_weight = min(weight))
#> `summarise()` has grouped output by 'sex'. You can override using the `.groups`
#> argument.
surveys %>%
  filter(!is.na(weight)) %>%
  group_by(sex, species_id) %>%
  summarize(mean_weight = mean(weight),
            min_weight = min(weight)) %>%
  arrange(min_weight)
#> `summarise()` has grouped output by 'sex'. You can override using the `.groups`
#> argument.
surveys %>%
  filter(!is.na(weight)) %>%
  group_by(sex, species_id) %>%
  summarize(mean_weight = mean(weight),
            min_weight = min(weight)) %>%
  arrange(desc(mean_weight))
#> `summarise()` has grouped output by 'sex'. You can override using the `.groups`
#> argument.
surveys %>%
    count(sex)
surveys %>%
    group_by(sex) %>%
    summarize(count = n())
surveys %>%
    count(sex, sort = TRUE)
surveys %>%
  count(sex, species)
surveys %>%
  count(sex, species) %>%
  arrange(species, desc(n))
surveys %>%
    count(plot_type)
surveys %>%
    filter(!is.na(hindfoot_length)) %>%
    group_by(species_id) %>%
    summarize(
        mean_hindfoot_length = mean(hindfoot_length),
        min_hindfoot_length = min(hindfoot_length),
        max_hindfoot_length = max(hindfoot_length),
        n = n()
    )
surveys %>%
    filter(!is.na(weight)) %>%
    group_by(year) %>%
    filter(weight == max(weight)) %>%
    select(year, genus, species, weight) %>%
    arrange(year)
surveys_gw <- surveys %>%
  filter(!is.na(weight)) %>%
  group_by(plot_id, genus) %>%
  summarize(mean_weight = mean(weight))
#> `summarise()` has grouped output by 'plot_id'. You can override using the
#> `.groups` argument.
str(surveys_gw)
surveys_wide <- surveys_gw %>%
  pivot_wider(names_from = genus, values_from = mean_weight)

str(surveys_wide)
surveys_gw %>%
  pivot_wider(names_from = genus, values_from = mean_weight, values_fill = 0) %>%
  head()
surveys_long <- surveys_wide %>%
  pivot_longer(names_to = "genus", values_to = "mean_weight", cols = -plot_id)

str(surveys_long)
surveys_wide %>%
  pivot_longer(names_to = "genus", values_to = "mean_weight", cols = -plot_id) %>%
  head()
surveys_wide_genera <- surveys %>%
  group_by(plot_id, year) %>%
  summarize(n_genera = n_distinct(genus)) %>%
  pivot_wider(names_from = year, values_from = n_genera)
#> `summarise()` has grouped output by 'plot_id'. You can override using the
#> `.groups` argument.
head(surveys_wide_genera)
surveys_wide_genera %>%
  pivot_longer(names_to = "year", values_to = "n_genera", cols = -plot_id)
surveys_long <- surveys %>%
  pivot_longer(names_to = "measurement", values_to = "value", cols = c(hindfoot_length, weight))
surveys_long %>%
  group_by(year, measurement, plot_type) %>%
  summarize(mean_value = mean(value, na.rm=TRUE)) %>%
  pivot_wider(names_from = measurement, values_from = mean_value)
#> `summarise()` has grouped output by 'year', 'measurement'. You can override
#> using the `.groups` argument.
surveys_complete <- surveys %>%
  filter(!is.na(weight),           # remove missing weight
         !is.na(hindfoot_length),  # remove missing hindfoot_length
         !is.na(sex))                # remove missing sex
## Extract the most common species_id
species_counts <- surveys_complete %>%
    count(species_id) %>%
    filter(n >= 50)

## Only keep the most common species
surveys_complete <- surveys_complete %>%
  filter(species_id %in% species_counts$species_id)
# Save the cleaned data set for the plotting section.
# write_csv() does not create missing folders, so make sure data/ exists
# first (a no-op when it is already there).
dir.create("data", showWarnings = FALSE)
write_csv(surveys_complete, file = "data/surveys_complete.csv")
library(tidyverse)
surveys_complete <- read_csv("data/surveys_complete.csv")
ggplot(data = <DATA>, mapping = aes(<MAPPINGS>)) +  <GEOM_FUNCTION>()
ggplot(data = surveys_complete)
ggplot(data = surveys_complete, mapping = aes(x = weight, y = hindfoot_length))
ggplot(data = surveys_complete, aes(x = weight, y = hindfoot_length)) +
  geom_point()
# Assign plot to a variable
surveys_plot <- ggplot(data = surveys_complete,
                       mapping = aes(x = weight, y = hindfoot_length))

# Draw the plot
surveys_plot +
    geom_point()
# This is the correct syntax for adding layers
surveys_plot +
  geom_point()

# This will not add the new layer and will return an error message
surveys_plot
  + geom_point()
# geom_hex() needs the hexbin package. Install it only when it is missing so
# that re-running the script does not reinstall the package every time.
if (!requireNamespace("hexbin", quietly = TRUE)) {
  install.packages("hexbin")
}
library(hexbin)
# Hexagonal binning: draws the same data as the scatter plot, binned.
surveys_plot +
  geom_hex()
ggplot(data = surveys_complete, aes(x = weight, y = hindfoot_length)) +
    geom_point()
ggplot(data = surveys_complete, aes(x = weight, y = hindfoot_length)) +
    geom_point(alpha = 0.1)
ggplot(data = surveys_complete, mapping = aes(x = weight, y = hindfoot_length)) +
    geom_point(alpha = 0.1, color = "blue")
ggplot(data = surveys_complete, mapping = aes(x = weight, y = hindfoot_length)) +
    geom_point(alpha = 0.1, aes(color = species_id))
ggplot(data = surveys_complete,
       mapping = aes(x = species_id, y = weight)) +
   geom_point(aes(color = plot_type))
ggplot(data = surveys_complete, mapping = aes(x = species_id, y = weight)) +
    geom_boxplot()
ggplot(data = surveys_complete, mapping = aes(x = species_id, y = weight)) +
    geom_boxplot(outlier.shape = NA) +
    geom_jitter(alpha = 0.3, color = "tomato")
ggplot(data = surveys_complete, mapping = aes(x = species_id, y = weight)) +
geom_jitter(alpha = 0.3, color = "tomato") +
geom_violin() 
ggplot(data = surveys_complete, mapping = aes(x = species_id, y = weight)) +
scale_y_log10() +
geom_jitter(alpha = 0.3, color = "tomato") +
geom_boxplot(outlier.shape = NA)
ggplot(data = surveys_complete, mapping = aes(x = species_id, y = hindfoot_length)) +
geom_jitter(alpha = 0.3, color = "tomato") +
geom_boxplot(outlier.shape = NA)
yearly_counts <- surveys_complete %>%
  count(year, genus)
ggplot(data = yearly_counts, aes(x = year, y = n)) +
     geom_line()
ggplot(data = yearly_counts, aes(x = year, y = n, group = genus)) +
    geom_line()
ggplot(data = yearly_counts, aes(x = year, y = n, color = genus)) +
    geom_line()
yearly_counts %>%
    ggplot(mapping = aes(x = year, y = n, color = genus)) +
    geom_line()
yearly_counts_graph <- surveys_complete %>%
    count(year, genus) %>%
    ggplot(mapping = aes(x = year, y = n, color = genus)) +
    geom_line()

yearly_counts_graph
ggplot(data = yearly_counts, aes(x = year, y = n)) +
    geom_line() +
    facet_wrap(facets = vars(genus))
 yearly_sex_counts <- surveys_complete %>%
                      count(year, genus, sex)
ggplot(data = yearly_sex_counts, mapping = aes(x = year, y = n, color = sex)) +
  geom_line() +
  facet_wrap(facets =  vars(genus))
ggplot(data = yearly_sex_counts,
       mapping = aes(x = year, y = n, color = sex)) +
  geom_line() +
  facet_grid(rows = vars(sex), cols =  vars(genus))
# One column, facet by rows
ggplot(data = yearly_sex_counts,
       mapping = aes(x = year, y = n, color = sex)) +
  geom_line() +
  facet_grid(rows = vars(genus))
# One row, facet by column
ggplot(data = yearly_sex_counts,
       mapping = aes(x = year, y = n, color = sex)) +
  geom_line() +
  facet_grid(cols = vars(genus))
 ggplot(data = yearly_sex_counts,
        mapping = aes(x = year, y = n, color = sex)) +
     geom_line() +
     facet_wrap(vars(genus)) +
     theme_bw()
yearly_weight <- surveys_complete %>%
                group_by(year, species_id) %>%
                 summarize(avg_weight = mean(weight))
#> `summarise()` has grouped output by 'year'. You can override using the
#> `.groups` argument.
ggplot(data = yearly_weight, mapping = aes(x=year, y=avg_weight)) +
   geom_line() +
   facet_wrap(vars(species_id)) +
   theme_bw()
ggplot(data = yearly_sex_counts, aes(x = year, y = n, color = sex)) +
    geom_line() +
    facet_wrap(vars(genus)) +
    labs(title = "Observed genera through time",
         x = "Year of observation",
         y = "Number of individuals") +
    theme_bw()
ggplot(data = yearly_sex_counts, mapping = aes(x = year, y = n, color = sex)) +
    geom_line() +
    facet_wrap(vars(genus)) +
    labs(title = "Observed genera through time",
        x = "Year of observation",
        y = "Number of individuals") +
    theme_bw() +
    theme(text=element_text(size = 16))
# Yearly counts per sex, one panel per genus, with customised text:
# rotated x-axis labels, grey axis text, italic panel titles and a larger
# base font size.
ggplot(
  data = yearly_sex_counts,
  mapping = aes(x = year, y = n, color = sex)
) +
  geom_line() +
  facet_wrap(vars(genus)) +
  labs(
    title = "Observed genera through time",
    x = "Year of observation",
    y = "Number of individuals"
  ) +
  theme_bw() +
  theme(
    axis.text.x = element_text(
      colour = "grey20", size = 12, angle = 90, hjust = 0.5, vjust = 0.5
    ),
    axis.text.y = element_text(colour = "grey20", size = 12),
    strip.text = element_text(face = "italic"),
    text = element_text(size = 16)
  )
grey_theme <- theme(axis.text.x = element_text(colour="grey20", size = 12,
                                               angle = 90, hjust = 0.5,
                                               vjust = 0.5),
                    axis.text.y = element_text(colour = "grey20", size = 12),
                    text=element_text(size = 16))

ggplot(surveys_complete, aes(x = species_id, y = hindfoot_length)) +
    geom_boxplot() +
    grey_theme
# patchwork provides the `/` and `+` operators used to combine plots.
# Install it only when it is missing so that re-running the script does not
# reinstall the package every time.
if (!requireNamespace("patchwork", quietly = TRUE)) {
  install.packages("patchwork")
}
library(patchwork)

plot_weight <- ggplot(data = surveys_complete, aes(x = species_id, y = weight)) +
  geom_boxplot() +
  labs(x = "Species", y = expression(log[10](Weight))) +
  scale_y_log10()

plot_count <- ggplot(data = yearly_counts, aes(x = year, y = n, color = genus)) +
  geom_line() +
  labs(x = "Year", y = "Abundance")

plot_weight / plot_count + plot_layout(heights = c(3, 2))
my_plot <- ggplot(data = yearly_sex_counts,
                  aes(x = year, y = n, color = sex)) +
    geom_line() +
    facet_wrap(vars(genus)) +
    labs(title = "Observed genera through time",
        x = "Year of observation",
        y = "Number of individuals") +
    theme_bw() +
    theme(axis.text.x = element_text(colour = "grey20", size = 12, angle = 90,
                                     hjust = 0.5, vjust = 0.5),
          axis.text.y = element_text(colour = "grey20", size = 12),
          text = element_text(size = 16))

ggsave("name_of_file.png", my_plot, width = 15, height = 10)

## This also works for plots combined with patchwork
plot_combined <- plot_weight / plot_count + plot_layout(heights = c(3, 2))
ggsave("plot_combined.png", plot_combined, width = 10, dpi = 300)
# if
if (condition is true) {
  perform action
}

# if ... else
if (condition is true) {
  perform action
} else {  # that is, if the condition is false,
  perform alternative action
}
x <- 8

if (x >= 10) {
  print("x is greater than or equal to 10")
}

x
#> [1] 8
x <- 8

if (x >= 10) {
  print("x is greater than or equal to 10")
} else {
  print("x is less than 10")
}
#> [1] "x is less than 10"
# Three-way branch on the value of `x`.
x <- 8

if (x >= 10) {
  print("x is greater than or equal to 10")
} else if (x > 5) {
  print("x is greater than 5, but less than 10")
} else {
  # This branch is reached whenever x <= 5, so the message must include 5.
  print("x is less than or equal to 5")
}
#> [1] "x is greater than 5, but less than 10"
x  <-  4 == 3
if (x) {
  "4 equals 3"
} else {
  "4 does not equal 3"
}
#> [1] "4 does not equal 3"
x <- 4 == 3
x
#> [1] FALSE
gapminder <- read.csv("data/gapminder_data.csv", header = TRUE)
gapminder[(gapminder$year == 2002),]
rows2002_number <- nrow(gapminder[(gapminder$year == 2002),])
rows2002_number >= 1
if(nrow(gapminder[(gapminder$year == 2002),]) >= 1){
   print("Record(s) for the year 2002 found.")
}
if(any(gapminder$year == 2002)){
   print("Record(s) for the year 2002 found.")
}
#> Error in if (gapminder$year == 2002) {: the condition has length > 1
# ifelse function
ifelse(condition is true, perform action, perform alternative action)
y <- -3
ifelse(y < 0, "y is a negative number", "y is either positive or zero")
#> [1] "y is a negative number"
for (iterator in set of values) {
  do a thing
}
for (i in 1:10) {
  print(i)
}
#> [1] 1
#> [1] 2
#> [1] 3
#> [1] 4
#> [1] 5
#> [1] 6
#> [1] 7
#> [1] 8
#> [1] 9
#> [1] 10
for (i in 1:5) {
  for (j in c('a', 'b', 'c', 'd', 'e')) {
    print(paste(i,j))
  }
}
#> [1] "1 a"
#> [1] "1 b"
#> [1] "1 c"
#> [1] "1 d"
#> [1] "1 e"
#> [1] "2 a"
#> [1] "2 b"
#> [1] "2 c"
#> [1] "2 d"
#> [1] "2 e"
#> [1] "3 a"
#> [1] "3 b"
#> [1] "3 c"
#> [1] "3 d"
#> [1] "3 e"
#> [1] "4 a"
#> [1] "4 b"
#> [1] "4 c"
#> [1] "4 d"
#> [1] "4 e"
#> [1] "5 a"
#> [1] "5 b"
#> [1] "5 c"
#> [1] "5 d"
#> [1] "5 e"
# Collect every "<i> <j>" combination into one character vector.
# The result starts empty (c() is NULL) and is extended on each pass.
output_vector <- c()
for (i in 1:5) {
  for (j in c('a', 'b', 'c', 'd', 'e')) {
    # paste() joins the number and the letter with a space, e.g. "1 a".
    temp_output <- paste(i, j)
    # NOTE: appending with c() copies the whole vector on every iteration.
    # That is fine for 25 elements but scales poorly; for large results,
    # create the container at its final size before the loop.
    output_vector <- c(output_vector, temp_output)
  }
}
output_vector
#>  [1] "1 a" "1 b" "1 c" "1 d" "1 e" "2 a" "2 b" "2 c" "2 d" "2 e" "3 a" "3 b"
#> [13] "3 c" "3 d" "3 e" "4 a" "4 b" "4 c" "4 d" "4 e" "5 a" "5 b" "5 c" "5 d"
#> [25] "5 e"
output_matrix <- matrix(nrow = 5, ncol = 5)
j_vector <- c('a', 'b', 'c', 'd', 'e')
for (i in 1:5) {
  for (j in 1:5) {
    temp_j_value <- j_vector[j]
    temp_output <- paste(i, temp_j_value)
    output_matrix[i, j] <- temp_output
  }
}
output_vector2 <- as.vector(output_matrix)
output_vector2
#>  [1] "1 a" "2 a" "3 a" "4 a" "5 a" "1 b" "2 b" "3 b" "4 b" "5 b" "1 c" "2 c"
#> [13] "3 c" "4 c" "5 c" "1 d" "2 d" "3 d" "4 d" "5 d" "1 e" "2 e" "3 e" "4 e"
#> [25] "5 e"
while(this condition is true){
  do a thing
}
z <- 1
while(z > 0.1){
  z <- runif(1)
  cat(z, "\n")
}
all(output_vector == output_vector2)
all(output_vector %in% output_vector2)
all(output_vector2 %in% output_vector)
output_vector2 <- as.vector(output_matrix)
output_vector2 <- as.vector(t(output_matrix))
output_matrix[i, j] <- temp_output
output_matrix[j, i] <- temp_output
gapminder <- read.csv("data/gapminder_data.csv")
unique(gapminder$continent)
for (iContinent in unique(gapminder$continent)) {
  tmp <- gapminder[gapminder$continent == iContinent, ]
  cat(iContinent, mean(tmp$lifeExp, na.rm = TRUE), "\n")
  rm(tmp)
}
thresholdValue <- 50

for (iContinent in unique(gapminder$continent)) {
   tmp <- mean(gapminder[gapminder$continent == iContinent, "lifeExp"])

   if (tmp < thresholdValue){
       cat("Average Life Expectancy in", iContinent, "is less than", thresholdValue, "\n")
   } else {
       cat("Average Life Expectancy in", iContinent, "is greater than", thresholdValue, "\n")
   } # end if else condition
   rm(tmp)
} # end for loop
 lowerThreshold <- 50
 upperThreshold <- 70

# Report, for every country, which band its mean life expectancy falls in:
# below `lowerThreshold`, between the two thresholds (inclusive), or above
# `upperThreshold`. Both thresholds are defined just above this loop.
for (iCountry in unique(gapminder$country)) {
    tmp <- mean(gapminder[gapminder$country == iCountry, "lifeExp"])

    if (tmp < lowerThreshold) {
        cat("Average Life Expectancy in", iCountry, "is less than", lowerThreshold, "\n")
    } else if (tmp <= upperThreshold) {
        # Reaching this branch already implies tmp >= lowerThreshold, so only
        # the upper bound needs testing. The bounds are inclusive: a mean
        # exactly equal to either threshold is reported as "between" instead
        # of falling through to the "greater than" message.
        cat("Average Life Expectancy in", iCountry, "is between", lowerThreshold, "and", upperThreshold, "\n")
    } else {
        cat("Average Life Expectancy in", iCountry, "is greater than", upperThreshold, "\n")
    }
    rm(tmp)
}
grep("^B", unique(gapminder$country))
grep("^B", unique(gapminder$country), value = TRUE)
thresholdValue <- 50
candidateCountries <- grep("^B", unique(gapminder$country), value = TRUE)

for (iCountry in candidateCountries) {
    tmp <- mean(gapminder[gapminder$country == iCountry, "lifeExp"])

    if (tmp < thresholdValue) {
        cat("Average Life Expectancy in", iCountry, "is less than", thresholdValue, "plotting life expectancy graph... \n")

        with(subset(gapminder, country == iCountry),
                plot(year, lifeExp,
                     type = "o",
                     main = paste("Life Expectancy in", iCountry, "over time"),
                     ylab = "Life Expectancy",
                     xlab = "Year"
                     ) # end plot
             ) # end with
    } # end if
    rm(tmp)
} # end for loop

Column	Description
record_id	Unique id for the observation
month	month of observation
day	day of observation
year	year of observation
plot_id	ID of a particular experimental plot of land
species_id	2-letter code
sex	sex of animal (“M”, “F”)
hindfoot_length	length of the hindfoot in mm
weight	weight of the animal in grams
genus	genus of animal
species	species of animal
taxa	e.g. Rodent, Reptile, Bird, Rabbit
plot_type	type of plot

Overview

Questions

Objectives

What is R? What is RStudio?

Why learn R?

R does not involve lots of pointing and clicking, and that’s a good thing

R code is great for reproducibility

R is interdisciplinary and extensible

R works on data of all shapes and sizes

R produces high-quality graphics

R has a large and welcoming community

Not only is R free, but it is also open-source and cross-platform

Knowing your way around RStudio

Getting set up

Organizing your working directory

The working directory

Interacting with R

Seeking help

Searching function documentation with ? and ??

Automatic code completion

Package vignettes and cheat sheets

Finding more functions and packages

Dealing with error messages

Asking for help

R

OUTPUT

How to learn more after the workshop?

More resources

More about R

How to ask good programming questions?

Key Points

Overview

Questions

Objectives

Creating objects in R

R

R

Objects vs. variables

R

R

R

R

R

Saving your code

Comments

Challenge

R

Functions and their arguments

R

R

OUTPUT

R

OUTPUT

R

R

OUTPUT

R

OUTPUT

R

OUTPUT

Vectors and data types

R

R

R

R

R

R

Challenge

Show me the solution

Challenge (continued)

R

Show me the solution

Challenge (continued)

R

Show me the solution

Challenge (continued)

Show me the solution

Subsetting vectors

R

OUTPUT

Searching function documentation with `?` and `??`