Part 1 - Overall Goal

First, below are all the libraries we potentially might need through the course of this project

library(readr)

## Warning: package 'readr' was built under R version 4.1.3

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.1.3

## -- Attaching packages --------------------------------------- tidyverse 1.3.2 --
## v ggplot2 3.3.6     v dplyr   1.0.9
## v tibble  3.1.8     v stringr 1.4.1
## v tidyr   1.2.0     v forcats 0.5.2
## v purrr   0.3.4

## Warning: package 'ggplot2' was built under R version 4.1.3

## Warning: package 'tibble' was built under R version 4.1.3

## Warning: package 'tidyr' was built under R version 4.1.3

## Warning: package 'purrr' was built under R version 4.1.3

## Warning: package 'dplyr' was built under R version 4.1.3

## Warning: package 'stringr' was built under R version 4.1.3

## Warning: package 'forcats' was built under R version 4.1.3

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

1 Data

From a data perspective, we will use a number of .csv files that contain information from the census bureau(2010). Firstly, we read the data in the below chunk inta a variable called sheet1

sheet1 <- read_csv("https://www4.stat.ncsu.edu/~online/datasets/EDU01a.csv")

## Rows: 3198 Columns: 42
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (22): Area_name, STCOU, EDU010187N1, EDU010187N2, EDU010188N1, EDU010188...
## dbl (20): EDU010187F, EDU010187D, EDU010188F, EDU010188D, EDU010189F, EDU010...
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

sheet1

1.1 In the below part, we select the subset of data as per instructions i.e., the column Area_name and change its name to area_name, the column STCOU, and all the columns that end with the letter “D” into a variable called sheet2

sheet2 <- as_tibble(sheet1) %>%
  select(Area_name, STCOU, ends_with("D")) %>%
  rename(area_name = Area_name)
sheet2

1.2 Now, we convert the data that we have with us to a long format using the pivot_longer function

sheet3 <- sheet2 %>%
  pivot_longer(cols =3:12, names_to = "enrollment", values_to = "enrollment_value")
sheet3

1.3 Next, we extracted information (Measurement type and year) from Enrollment column

sheet3$Year <- as.numeric(format(as.Date(substr(sheet3$enrollment, 8, 9), format="%y"), "%Y"))
sheet3$Value_type=substr(sheet3$enrollment, 1, 7)
sheet4 <- sheet3
sheet4

1.4 Below, we then create 2 data sets as per instructions for county and non county data

sheet_county <- sheet4[c(grep(pattern = ", \\w\\w", sheet4$area_name)),]
sheet_county

sheet_non_county <- sheet4[-c(grep(pattern = ", \\w\\w", sheet4$area_name)),]
sheet_non_county

class(sheet_county) <- c("county", class(sheet_county))
class(sheet_non_county) <- c("non_county", class(sheet_non_county))

1.5 Creating a new column named state for the county data which describes the state the county corresponds to

sheet_county$State <- str_sub(sheet_county$area_name, -2, -1)
sheet5 <- sheet_county
sheet5

1.6 Creating a new column called division corresponding to the state’s classification based on the provided link in the project.

value1 <- list("Connecticut", "Maine", "Massachusetts", "New Hampshire", "Rhode Island", "Vermont")
value2 <- list("New Jersey", "New York", "Pennsylvania")
value3 <- list("Illinois", "Indiana", "Michigan", "Ohio", "Wisconsin")
value4 <- list("Iowa", "Kansas", "Minnesota", "Missouri", "Nebraska", "North Dakota", "South Dakota")
value5 <- list("Delaware","Florida","Georgia","Maryland","North Carolina","South Carolina","Virginia"," Washington,D.C.","West Virginia")
value6 <- list("Alabama", "Kentucky", "Mississippi","Tennessee")
value7 <- list("Arkansas", "Louisiana", "Oklahoma","Texas")
value8 <- list("Arizona", "Colorado", "Idaho","Montana", "Nevada", "New Mexico","Utah","Wyoming")
value9 <- list("Alaska", "California", "Hawaii", "Oregon","Washington")

length <- nrow(sheet_non_county)

sheet_non_county$division <- NaN

for (i in 1:length) {
  if(sheet_non_county$area_name[i] %in% toupper(value1)) {
    sheet_non_county$division[i]="New England"
  }
  else if(sheet_non_county$area_name[i] %in% toupper(value2)) {
    sheet_non_county$division[i]="Mid-Atlantic"
  }
  else if(sheet_non_county$area_name[i] %in% toupper(value3)) {
    sheet_non_county$division[i]="East North Central"
  }
  else if(sheet_non_county$area_name[i] %in% toupper(value4)) {
    sheet_non_county$division[i]="West North Central"
  }
  else if(sheet_non_county$area_name[i] %in% toupper(value5)) {
    sheet_non_county$division[i]="South Atlantic"
  }
  else if(sheet_non_county$area_name[i] %in% toupper(value6)) {
    sheet_non_county$division[i]="East South Central"
  }
  else if(sheet_non_county$area_name[i] %in% toupper(value7)) {
    sheet_non_county$division[i]="West South Central"
  }
  else if(sheet_non_county$area_name[i] %in% toupper(value8)) {
    sheet_non_county$division[i]="Mountain"
  }
  else if(sheet_non_county$area_name[i] %in% toupper(value9)) {
    sheet_non_county$division[i]="Pacific"
  }
  else {
    sheet_non_county$division[i]="ERROR"
  }
}

sheet_non_county

2 Requirements

2.1 Below is the function where we are performing step 1 and 2 from the part 1 of the project, here we are giving an optional argument with a default value that allows the user to specify the name of the column representing the value

function_step_1_and_2 <- function(tib, values_to = "enrollment_value"){
  tib1 <- as_tibble(tib) %>%
    select(Area_name, STCOU, ends_with("D")) %>%
    rename(area_name = Area_name)
  tib2 <- tib1 %>%
    pivot_longer(cols =3:12, names_to = "enrollment", values_to = values_to)
  return(tib2)
}

2.2 Using output from step 2 is being given as input to step 3 in a function below

function_step_3 <- function(tib2){
  tib2$Value_type <- substr(tib2$enrollment, 1, 7)
  tib2$Year <- as.numeric(format(as.Date(substr(tib2$enrollment, 8, 9),format="%y"), "%Y"))
  tib3 <- tib2
  return(tib3)
}

2.3 Here, we create a function which executes step 5 from the previous part and extracts state code from area_name

function_step_5 <- function(tib){
  tib$State <- str_sub(tib$area_name,-2,-1)
  tib5 <- tib
  return (tib5)
}

2.4 Here, we create a function which executes step 6 by creating a column division

function_step_6 <- function(tib){
value1 <- list("Connecticut", "Maine", "Massachusetts", "New Hampshire", "Rhode Island", "Vermont")
value2 <- list("New Jersey", "New York", "Pennsylvania")
value3 <- list("Illinois", "Indiana", "Michigan", "Ohio", "Wisconsin")
value4 <- list("Iowa", "Kansas", "Minnesota", "Missouri", "Nebraska", "North Dakota", "South Dakota")
value5 <- list("Delaware","Florida","Georgia","Maryland","North Carolina","South Carolina","Virginia"," Washington,D.C.","West Virginia")
value6 <- list("Alabama", "Kentucky", "Mississippi","Tennessee")
value7 <- list("Arkansas", "Louisiana", "Oklahoma","Texas")
value8 <- list("Arizona", "Colorado", "Idaho","Montana", "Nevada", "New Mexico","Utah","Wyoming")
value9 <- list("Alaska", "California", "Hawaii", "Oregon","Washington")

length <- nrow(tib)

tib$division <- NaN

for (i in 1:length) {
  if(tib$area_name[i] %in% toupper(value1)) {
    tib$division[i]="New England"
  }
  else if(tib$area_name[i] %in% toupper(value2)) {
    tib$division[i]="Mid-Atlantic"
  }
  else if(tib$area_name[i] %in% toupper(value3)) {
    tib$division[i]="East North Central"
  }
  else if(tib$area_name[i] %in% toupper(value4)) {
    tib$division[i]="West North Central"
  }
  else if(tib$area_name[i] %in% toupper(value5)) {
    tib$division[i]="South Atlantic"
  }
  else if(tib$area_name[i] %in% toupper(value6)) {
    tib$division[i]="East South Central"
  }
  else if(tib$area_name[i] %in% toupper(value7)) {
    tib$division[i]="West South Central"
  }
  else if(tib$area_name[i] %in% toupper(value8)) {
    tib$division[i]="Mountain"
  }
  else if(tib$area_name[i] %in% toupper(value9)) {
    tib$division[i]="Pacific"
  }
  else {
    tib$division[i]="ERROR"
  }
}

tib
}

2.5 The output of function 3 is given as an input to the below function. Thereafter, two mutually exclusive tibbles are created from a single tibble obtained from the output of function 3. Finally, we use the function 5 and function 6 on county and non-county tibbles.

function_step_4 <- function(tib3){
  sheet_county <- tib3[c(grep(pattern = ", \\w\\w", tib3$area_name)),]
  sheet_county
  
  sheet_non_county <- tib3[-c(grep(pattern = ", \\w\\w", tib3$area_name)),]
  sheet_non_county
  
  class(sheet_county) <- c("county", class(sheet_county))
  class(sheet_non_county) <- c("non_county", class(sheet_non_county))
  
  tib_sheet_county <- function_step_5(sheet_county)
  tib_sheet_non_county <- function_step_6(sheet_non_county)
  return(list(tib_sheet_county, tib_sheet_non_county))
}

function_step_6(tib_sheet_non_county)

2.6 Below we are creating a wrapper function which utilizes all the above functions created and provides an output as part of 2 tibbles

my_wrapper <- function(url, default_var_name = "enrollment_value"){
  inp1 <- read_csv(url)
  inp2 <- function_step_1_and_2(inp1)
  inp3 <- function_step_3(inp2)
  inp4 <- function_step_4(inp3)
  return(list(inp4[[1]], inp4[[2]]))
}

2.7 Using the wrapper function to read the 2 csv files

file1 <- my_wrapper("https://www4.stat.ncsu.edu/~online/datasets/EDU01a.csv")

## Rows: 3198 Columns: 42
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (22): Area_name, STCOU, EDU010187N1, EDU010187N2, EDU010188N1, EDU010188...
## dbl (20): EDU010187F, EDU010187D, EDU010188F, EDU010188D, EDU010189F, EDU010...
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

file2 <- my_wrapper("https://www4.stat.ncsu.edu/~online/datasets/EDU01b.csv")

## Rows: 3198 Columns: 42
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (22): Area_name, STCOU, EDU010197N1, EDU010197N2, EDU010198N1, EDU010198...
## dbl (20): EDU010197F, EDU010197D, EDU010198F, EDU010198D, EDU010199F, EDU010...
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

2.8 Creating a function below that creates a combined tibble of two csv files which has the independent county and non county tibble

combined_function <- function(file1, file2){
  
  tib_combined_county <- bind_rows(file1[[1]], file2[[1]])
  tib_combined_non_county <- bind_rows(file1[[2]], file2[[2]])
  
  return(list(tib_combined_county, tib_combined_non_county))
}

2.9 Combining county and non county tibble

combined_county <- combined_function(file1, file2)[[1]]
combined_non_county <- combined_function(file1, file2)[[2]]
class(combined_county) <- c("state", class(combined_non_county))
str(combined_non_county)

## non_county [1,060 x 7] (S3: non_county/tbl_df/tbl/data.frame)
##  $ area_name       : chr [1:1060] "UNITED STATES" "UNITED STATES" "UNITED STATES" "UNITED STATES" ...
##  $ STCOU           : chr [1:1060] "00000" "00000" "00000" "00000" ...
##  $ enrollment      : chr [1:1060] "EDU010187D" "EDU010188D" "EDU010189D" "EDU010190D" ...
##  $ enrollment_value: num [1:1060] 40024299 39967624 40317775 40737600 41385442 ...
##  $ Value_type      : chr [1:1060] "EDU0101" "EDU0101" "EDU0101" "EDU0101" ...
##  $ Year            : num [1:1060] 1987 1988 1989 1990 1991 ...
##  $ division        : chr [1:1060] "ERROR" "ERROR" "ERROR" "ERROR" ...

combined_county_noncounty <- function(x,y){
  df <- bind_rows(x,y)
  return(df)
}
combined_county_noncounty(combined_county, combined_non_county)

2.10 Creating a function below to plot the mean values of statistics across the years for each division.

plot.state <- function(tib, var_name = "enrollment_value"){
  avg_enrol <- tib %>%
    filter(division != "Error") %>%
    group_by(Year, division) %>%
    summarise(Mean = mean(get(var_name), na.rm = TRUE))
  avg_enrol
  ggplot(avg_enrol, aes(x = Year, y = Mean, color = division)) + geom_line() + geom_point()
}

2.11 Creating a function below for class county with a little more flexibility. This will allow the user to

- specify the state of interest, giving a default value if not specified

- determine whether the ‘top’ or ‘bottom’ most counties should be looked at with a default for ‘top’

- instruct how many of the ‘top’ or ‘bottom’ will be investigated with a default value of 5

plot.county <- function(tib, var_name = "enrollment_value", order = "top", state_new = "PA",  count = 5){
  avg_county <- tib %>%
    filter(State == state_new) %>%
    group_by(area_name) %>%
    summarise(Mean = mean(get(var_name), na.rm = TRUE))
  
  if(order == "top"){
    # avg_county <- head(avg_county[order(-mean_county$Mean),],count)
    avg_county <- head(avg_county %>% arrange(desc(Mean)), count)
  }else{
    avg_county <- head(avg_county %>% arrange(Mean), count)
  }
  
  filter_df <- subset(tib, area_name %in% avg_county$area_name)
  
  ggplot(filter_df, aes(x = Year, y=get(var_name), color = area_name)) + geom_point() + geom_line()
}

3 Put it all together

3.1 Running data processing function on the 2 URLs

file1 <- my_wrapper("https://www4.stat.ncsu.edu/~online/datasets/EDU01a.csv")

## Rows: 3198 Columns: 42
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (22): Area_name, STCOU, EDU010187N1, EDU010187N2, EDU010188N1, EDU010188...
## dbl (20): EDU010187F, EDU010187D, EDU010188F, EDU010188D, EDU010189F, EDU010...
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

file2 <- my_wrapper("https://www4.stat.ncsu.edu/~online/datasets/EDU01b.csv")

## Rows: 3198 Columns: 42
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (22): Area_name, STCOU, EDU010197N1, EDU010197N2, EDU010198N1, EDU010198...
## dbl (20): EDU010197F, EDU010197D, EDU010198F, EDU010198D, EDU010199F, EDU010...
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

3.2 Running data combining function

combined_function(file1, file2)[[1]]

combined_function(file1, file2)[[2]]

3.3 Running plot function on the state data set

plot.state(combined_non_county)

## `summarise()` has grouped output by 'Year'. You can override using the
## `.groups` argument.

## 3.4 Running plot function on the county data set using the given attributes

plot.county(combined_county, state_new = "PA", order = "top", count = 7)

plot.county(combined_county, state_new = "PA", order = "bottom", count = 4)

plot.county(combined_county)

plot.county(combined_county, state_new = "MN", order = "top", count = 10)

## 3.5 Running data processing function on four data sets using the given URLs

wrap1 <- my_wrapper("https://www4.stat.ncsu.edu/~online/datasets/PST01a.csv")

## Rows: 3198 Columns: 42
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (22): Area_name, STCOU, PST015171N1, PST015171N2, PST015172N1, PST015172...
## dbl (20): PST015171F, PST015171D, PST015172F, PST015172D, PST015173F, PST015...
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

wrap2 <- my_wrapper("https://www4.stat.ncsu.edu/~online/datasets/PST01b.csv")

## Rows: 3198 Columns: 42
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (22): Area_name, STCOU, PST025182N1, PST025182N2, PST025183N1, PST025183...
## dbl (20): PST025182F, PST025182D, PST025183F, PST025183D, PST025184F, PST025...
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

wrap3 <- my_wrapper("https://www4.stat.ncsu.edu/~online/datasets/PST01c.csv")

## Rows: 3198 Columns: 42
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (22): Area_name, STCOU, PST035191N1, PST035191N2, PST035192N1, PST035192...
## dbl (20): PST035191F, PST035191D, PST035192F, PST035192D, PST035193F, PST035...
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

wrap4 <- my_wrapper("https://www4.stat.ncsu.edu/~online/datasets/PST01d.csv")

## Rows: 3198 Columns: 42
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (22): Area_name, STCOU, PST045200N1, PST045200N2, PST045201N1, PST045201...
## dbl (20): PST045200F, PST045200D, PST045201F, PST045201D, PST045202F, PST045...
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

3.6 Running data comibining functions to put these into one object

comb1 <- combined_function(wrap1, wrap2)
comb2 <- combined_function(wrap3, wrap4)
comb3 <- combined_function(comb1, comb2)
comb3

## [[1]]
## # A tibble: 125,800 x 7
##    area_name   STCOU enrollment enrollment_value Value_type  Year State
##    <chr>       <chr> <chr>                 <dbl> <chr>      <dbl> <chr>
##  1 Autauga, AL 01001 PST015171D            25508 PST0151     1971 AL   
##  2 Autauga, AL 01001 PST015172D            27166 PST0151     1972 AL   
##  3 Autauga, AL 01001 PST015173D            28463 PST0151     1973 AL   
##  4 Autauga, AL 01001 PST015174D            29266 PST0151     1974 AL   
##  5 Autauga, AL 01001 PST015175D            29718 PST0151     1975 AL   
##  6 Autauga, AL 01001 PST015176D            29896 PST0151     1976 AL   
##  7 Autauga, AL 01001 PST015177D            30462 PST0151     1977 AL   
##  8 Autauga, AL 01001 PST015178D            30882 PST0151     1978 AL   
##  9 Autauga, AL 01001 PST015179D            32055 PST0151     1979 AL   
## 10 Autauga, AL 01001 PST025181D            31985 PST0251     1981 AL   
## # ... with 125,790 more rows
## 
## [[2]]
## # A tibble: 2,120 x 7
##    area_name     STCOU enrollment enrollment_value Value_type  Year division
##    <chr>         <chr> <chr>                 <dbl> <chr>      <dbl> <chr>   
##  1 UNITED STATES 00000 PST015171D        206827028 PST0151     1971 ERROR   
##  2 UNITED STATES 00000 PST015172D        209283904 PST0151     1972 ERROR   
##  3 UNITED STATES 00000 PST015173D        211357490 PST0151     1973 ERROR   
##  4 UNITED STATES 00000 PST015174D        213341552 PST0151     1974 ERROR   
##  5 UNITED STATES 00000 PST015175D        215465246 PST0151     1975 ERROR   
##  6 UNITED STATES 00000 PST015176D        217562728 PST0151     1976 ERROR   
##  7 UNITED STATES 00000 PST015177D        219759860 PST0151     1977 ERROR   
##  8 UNITED STATES 00000 PST015178D        222095080 PST0151     1978 ERROR   
##  9 UNITED STATES 00000 PST015179D        224567234 PST0151     1979 ERROR   
## 10 UNITED STATES 00000 PST025181D        229466391 PST0251     1981 ERROR   
## # ... with 2,110 more rows

3.7 Using the plot function on the state data set

plot.state(comb3[[2]])

## `summarise()` has grouped output by 'Year'. You can override using the
## `.groups` argument.

## 3.8 Using the plot function on the county data set using the given attributes

plot.county(comb3[[1]], state_new = "CT", order = "top", count = 6)

plot.county(comb3[[1]], state_new = "NC", order = "bottom", count = 10)

plot.county(comb3[[1]])

plot.county(comb3[[1]], state_new = "MN", order = "top", count = 4)

Naman_Goel_ST558_Project1

Naman Goel

9/08/2022