#load required packages
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.0 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(dplyr)
#load in the two required csv files from an html format with read_csv
part1 <- read_csv("https://raw.githubusercontent.com/mbtoomey/Biol_7263/main/Data/assignment6part1.csv")
## Rows: 2 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): ID
## dbl (20): Sample1_Male_Control, Sample2_Male_Control, Sample3_Male_Control, ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
part2 <- read_csv("https://raw.githubusercontent.com/mbtoomey/Biol_7263/main/Data/assignment6part2.csv")
## Rows: 1 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): ID
## dbl (16): Sample16.Treatment, Sample12.Control, Sample3.Control, Sample6.Tre...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
CODE
#Here we are dealing with the first dataset, which I am calling part1. First I want to take the column names and separate them into three different columns.
#(this makes the matrix longer so I use pivot_longer)
#Then, for the sample column, I just wanted the sample number so I remove the word sample and transform these values into integers. I separate the names by
#using the _ between them in the original column name and then I move the data to a column called count. After that I want to make the table wider and
#add body length and age as columns in the matrix.
part1a <- part1 %>% pivot_longer(cols = starts_with("Sample"), names_to = c("Sample", "Gender", "Treatment"),
names_prefix = "Sample", names_transform = list(Sample = as.integer),
names_sep = "_", values_to = "count") %>%
pivot_wider(names_from = ID, values_from = count)
#For the second data set I first take the columns and move them to one column named SampleTreatment. Then I remove the prefix of sample and turn the sample
#numbers into integers. I then move all the values associated with the former columns to a column called count. I pipe this tibble into a separate function
#where I separate the SampleTreatment columns into two separate columns: sample and treatment. I use pivot_wider to make the count column a mass column.
#Finally I get rid of the Treatment column because next I will combine the two tibbles and part1a already has a treatment column.
part2a <- part2 %>% pivot_longer(cols = starts_with("Sample"), names_to = c("SampleTreatment"),
names_prefix = "Sample", names_transform = list(Sample = as.integer),
values_to = "count") %>% separate(SampleTreatment, into = c("Sample", "Treatment"), convert = TRUE) %>%
pivot_wider(names_from = ID, values_from = count) %>% select(-Treatment)
#I use the commany full join to join the two tibbles by the sample column
part1a %>% full_join(part2a, by = "Sample") -> final
#Write my combined tibbles to a csv file
write_csv(final, "Results/experiment_data.csv")
#Here I pipe my final tibble into a transmute function where I create a new tibble with gender and treatment info as well as a calculated residual mass
#column. Then I group the data by Treatment and Gender so that I can use the summarize function to make a tibble that calculates the mean mass and standard
#deviation for each gender within each treatment. I use na.rm to remove NAs before calculating these stats. I name this final tibble to the variable
#summarystats
final %>% transmute(Gender = Gender, Treatment = Treatment, resid_mass = mass / body_length) %>%
group_by(Treatment, Gender) %>%
summarize(mean_mass = mean(resid_mass, na.rm = TRUE), SD_mass = sd(resid_mass, na.rm = TRUE)) -> summarystats
## `summarise()` has grouped output by 'Treatment'. You can override using the
## `.groups` argument.
#I write my final tibble (summarystats) to a csv file
write_csv(final, "Results/summary_stats.csv")