library(readr)
## Warning: package 'readr' was built under R version 4.1.3
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.3
## -- Attaching packages --------------------------------------- tidyverse 1.3.2 --
## v ggplot2 3.3.6 v dplyr 1.0.9
## v tibble 3.1.8 v stringr 1.4.1
## v tidyr 1.2.0 v forcats 0.5.2
## v purrr 0.3.4
## Warning: package 'ggplot2' was built under R version 4.1.3
## Warning: package 'tibble' was built under R version 4.1.3
## Warning: package 'tidyr' was built under R version 4.1.3
## Warning: package 'purrr' was built under R version 4.1.3
## Warning: package 'dplyr' was built under R version 4.1.3
## Warning: package 'stringr' was built under R version 4.1.3
## Warning: package 'forcats' was built under R version 4.1.3
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
sheet1 <- read_csv("https://www4.stat.ncsu.edu/~online/datasets/EDU01a.csv")
## Rows: 3198 Columns: 42
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (22): Area_name, STCOU, EDU010187N1, EDU010187N2, EDU010188N1, EDU010188...
## dbl (20): EDU010187F, EDU010187D, EDU010188F, EDU010188D, EDU010189F, EDU010...
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
sheet1
sheet2 <- as_tibble(sheet1) %>%
select(Area_name, STCOU, ends_with("D")) %>%
rename(area_name = Area_name)
sheet2
sheet3 <- sheet2 %>%
pivot_longer(cols =3:12, names_to = "enrollment", values_to = "enrollment_value")
sheet3
sheet3$Year <- as.numeric(format(as.Date(substr(sheet3$enrollment, 8, 9), format="%y"), "%Y"))
sheet3$Value_type=substr(sheet3$enrollment, 1, 7)
sheet4 <- sheet3
sheet4
sheet_county <- sheet4[c(grep(pattern = ", \\w\\w", sheet4$area_name)),]
sheet_county
sheet_non_county <- sheet4[-c(grep(pattern = ", \\w\\w", sheet4$area_name)),]
sheet_non_county
class(sheet_county) <- c("county", class(sheet_county))
class(sheet_non_county) <- c("non_county", class(sheet_non_county))
sheet_county$State <- str_sub(sheet_county$area_name, -2, -1)
sheet5 <- sheet_county
sheet5
value1 <- list("Connecticut", "Maine", "Massachusetts", "New Hampshire", "Rhode Island", "Vermont")
value2 <- list("New Jersey", "New York", "Pennsylvania")
value3 <- list("Illinois", "Indiana", "Michigan", "Ohio", "Wisconsin")
value4 <- list("Iowa", "Kansas", "Minnesota", "Missouri", "Nebraska", "North Dakota", "South Dakota")
value5 <- list("Delaware","Florida","Georgia","Maryland","North Carolina","South Carolina","Virginia"," Washington,D.C.","West Virginia")
value6 <- list("Alabama", "Kentucky", "Mississippi","Tennessee")
value7 <- list("Arkansas", "Louisiana", "Oklahoma","Texas")
value8 <- list("Arizona", "Colorado", "Idaho","Montana", "Nevada", "New Mexico","Utah","Wyoming")
value9 <- list("Alaska", "California", "Hawaii", "Oregon","Washington")
length <- nrow(sheet_non_county)
sheet_non_county$division <- NaN
for (i in 1:length) {
if(sheet_non_county$area_name[i] %in% toupper(value1)) {
sheet_non_county$division[i]="New England"
}
else if(sheet_non_county$area_name[i] %in% toupper(value2)) {
sheet_non_county$division[i]="Mid-Atlantic"
}
else if(sheet_non_county$area_name[i] %in% toupper(value3)) {
sheet_non_county$division[i]="East North Central"
}
else if(sheet_non_county$area_name[i] %in% toupper(value4)) {
sheet_non_county$division[i]="West North Central"
}
else if(sheet_non_county$area_name[i] %in% toupper(value5)) {
sheet_non_county$division[i]="South Atlantic"
}
else if(sheet_non_county$area_name[i] %in% toupper(value6)) {
sheet_non_county$division[i]="East South Central"
}
else if(sheet_non_county$area_name[i] %in% toupper(value7)) {
sheet_non_county$division[i]="West South Central"
}
else if(sheet_non_county$area_name[i] %in% toupper(value8)) {
sheet_non_county$division[i]="Mountain"
}
else if(sheet_non_county$area_name[i] %in% toupper(value9)) {
sheet_non_county$division[i]="Pacific"
}
else {
sheet_non_county$division[i]="ERROR"
}
}
sheet_non_county
function_step_1_and_2 <- function(tib, values_to = "enrollment_value"){
tib1 <- as_tibble(tib) %>%
select(Area_name, STCOU, ends_with("D")) %>%
rename(area_name = Area_name)
tib2 <- tib1 %>%
pivot_longer(cols =3:12, names_to = "enrollment", values_to = values_to)
return(tib2)
}
function_step_3 <- function(tib2){
tib2$Value_type <- substr(tib2$enrollment, 1, 7)
tib2$Year <- as.numeric(format(as.Date(substr(tib2$enrollment, 8, 9),format="%y"), "%Y"))
tib3 <- tib2
return(tib3)
}
function_step_5 <- function(tib){
tib$State <- str_sub(tib$area_name,-2,-1)
tib5 <- tib
return (tib5)
}
function_step_6 <- function(tib){
value1 <- list("Connecticut", "Maine", "Massachusetts", "New Hampshire", "Rhode Island", "Vermont")
value2 <- list("New Jersey", "New York", "Pennsylvania")
value3 <- list("Illinois", "Indiana", "Michigan", "Ohio", "Wisconsin")
value4 <- list("Iowa", "Kansas", "Minnesota", "Missouri", "Nebraska", "North Dakota", "South Dakota")
value5 <- list("Delaware","Florida","Georgia","Maryland","North Carolina","South Carolina","Virginia"," Washington,D.C.","West Virginia")
value6 <- list("Alabama", "Kentucky", "Mississippi","Tennessee")
value7 <- list("Arkansas", "Louisiana", "Oklahoma","Texas")
value8 <- list("Arizona", "Colorado", "Idaho","Montana", "Nevada", "New Mexico","Utah","Wyoming")
value9 <- list("Alaska", "California", "Hawaii", "Oregon","Washington")
length <- nrow(tib)
tib$division <- NaN
for (i in 1:length) {
if(tib$area_name[i] %in% toupper(value1)) {
tib$division[i]="New England"
}
else if(tib$area_name[i] %in% toupper(value2)) {
tib$division[i]="Mid-Atlantic"
}
else if(tib$area_name[i] %in% toupper(value3)) {
tib$division[i]="East North Central"
}
else if(tib$area_name[i] %in% toupper(value4)) {
tib$division[i]="West North Central"
}
else if(tib$area_name[i] %in% toupper(value5)) {
tib$division[i]="South Atlantic"
}
else if(tib$area_name[i] %in% toupper(value6)) {
tib$division[i]="East South Central"
}
else if(tib$area_name[i] %in% toupper(value7)) {
tib$division[i]="West South Central"
}
else if(tib$area_name[i] %in% toupper(value8)) {
tib$division[i]="Mountain"
}
else if(tib$area_name[i] %in% toupper(value9)) {
tib$division[i]="Pacific"
}
else {
tib$division[i]="ERROR"
}
}
tib
}
function_step_4 <- function(tib3){
sheet_county <- tib3[c(grep(pattern = ", \\w\\w", tib3$area_name)),]
sheet_county
sheet_non_county <- tib3[-c(grep(pattern = ", \\w\\w", tib3$area_name)),]
sheet_non_county
class(sheet_county) <- c("county", class(sheet_county))
class(sheet_non_county) <- c("non_county", class(sheet_non_county))
tib_sheet_county <- function_step_5(sheet_county)
tib_sheet_non_county <- function_step_6(sheet_non_county)
return(list(tib_sheet_county, tib_sheet_non_county))
}
function_step_6(tib_sheet_non_county)
my_wrapper <- function(url, default_var_name = "enrollment_value"){
inp1 <- read_csv(url)
inp2 <- function_step_1_and_2(inp1)
inp3 <- function_step_3(inp2)
inp4 <- function_step_4(inp3)
return(list(inp4[[1]], inp4[[2]]))
}
file1 <- my_wrapper("https://www4.stat.ncsu.edu/~online/datasets/EDU01a.csv")
## Rows: 3198 Columns: 42
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (22): Area_name, STCOU, EDU010187N1, EDU010187N2, EDU010188N1, EDU010188...
## dbl (20): EDU010187F, EDU010187D, EDU010188F, EDU010188D, EDU010189F, EDU010...
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
file2 <- my_wrapper("https://www4.stat.ncsu.edu/~online/datasets/EDU01b.csv")
## Rows: 3198 Columns: 42
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (22): Area_name, STCOU, EDU010197N1, EDU010197N2, EDU010198N1, EDU010198...
## dbl (20): EDU010197F, EDU010197D, EDU010198F, EDU010198D, EDU010199F, EDU010...
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
combined_function <- function(file1, file2){
tib_combined_county <- bind_rows(file1[[1]], file2[[1]])
tib_combined_non_county <- bind_rows(file1[[2]], file2[[2]])
return(list(tib_combined_county, tib_combined_non_county))
}
combined_county <- combined_function(file1, file2)[[1]]
combined_non_county <- combined_function(file1, file2)[[2]]
class(combined_county) <- c("state", class(combined_non_county))
str(combined_non_county)
## non_county [1,060 x 7] (S3: non_county/tbl_df/tbl/data.frame)
## $ area_name : chr [1:1060] "UNITED STATES" "UNITED STATES" "UNITED STATES" "UNITED STATES" ...
## $ STCOU : chr [1:1060] "00000" "00000" "00000" "00000" ...
## $ enrollment : chr [1:1060] "EDU010187D" "EDU010188D" "EDU010189D" "EDU010190D" ...
## $ enrollment_value: num [1:1060] 40024299 39967624 40317775 40737600 41385442 ...
## $ Value_type : chr [1:1060] "EDU0101" "EDU0101" "EDU0101" "EDU0101" ...
## $ Year : num [1:1060] 1987 1988 1989 1990 1991 ...
## $ division : chr [1:1060] "ERROR" "ERROR" "ERROR" "ERROR" ...
combined_county_noncounty <- function(x,y){
df <- bind_rows(x,y)
return(df)
}
combined_county_noncounty(combined_county, combined_non_county)
plot.state <- function(tib, var_name = "enrollment_value"){
avg_enrol <- tib %>%
filter(division != "Error") %>%
group_by(Year, division) %>%
summarise(Mean = mean(get(var_name), na.rm = TRUE))
avg_enrol
ggplot(avg_enrol, aes(x = Year, y = Mean, color = division)) + geom_line() + geom_point()
}
plot.county <- function(tib, var_name = "enrollment_value", order = "top", state_new = "PA", count = 5){
avg_county <- tib %>%
filter(State == state_new) %>%
group_by(area_name) %>%
summarise(Mean = mean(get(var_name), na.rm = TRUE))
if(order == "top"){
# avg_county <- head(avg_county[order(-mean_county$Mean),],count)
avg_county <- head(avg_county %>% arrange(desc(Mean)), count)
}else{
avg_county <- head(avg_county %>% arrange(Mean), count)
}
filter_df <- subset(tib, area_name %in% avg_county$area_name)
ggplot(filter_df, aes(x = Year, y=get(var_name), color = area_name)) + geom_point() + geom_line()
}
file1 <- my_wrapper("https://www4.stat.ncsu.edu/~online/datasets/EDU01a.csv")
## Rows: 3198 Columns: 42
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (22): Area_name, STCOU, EDU010187N1, EDU010187N2, EDU010188N1, EDU010188...
## dbl (20): EDU010187F, EDU010187D, EDU010188F, EDU010188D, EDU010189F, EDU010...
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
file2 <- my_wrapper("https://www4.stat.ncsu.edu/~online/datasets/EDU01b.csv")
## Rows: 3198 Columns: 42
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (22): Area_name, STCOU, EDU010197N1, EDU010197N2, EDU010198N1, EDU010198...
## dbl (20): EDU010197F, EDU010197D, EDU010198F, EDU010198D, EDU010199F, EDU010...
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
combined_function(file1, file2)[[1]]
combined_function(file1, file2)[[2]]
plot.state(combined_non_county)
## `summarise()` has grouped output by 'Year'. You can override using the
## `.groups` argument.
## 3.4 Running plot function on the county data set using the given attributes
plot.county(combined_county, state_new = "PA", order = "top", count = 7)
plot.county(combined_county, state_new = "PA", order = "bottom", count = 4)
plot.county(combined_county)
plot.county(combined_county, state_new = "MN", order = "top", count = 10)
## 3.5 Running data processing function on four data sets using the given URLs
wrap1 <- my_wrapper("https://www4.stat.ncsu.edu/~online/datasets/PST01a.csv")
## Rows: 3198 Columns: 42
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (22): Area_name, STCOU, PST015171N1, PST015171N2, PST015172N1, PST015172...
## dbl (20): PST015171F, PST015171D, PST015172F, PST015172D, PST015173F, PST015...
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
wrap2 <- my_wrapper("https://www4.stat.ncsu.edu/~online/datasets/PST01b.csv")
## Rows: 3198 Columns: 42
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (22): Area_name, STCOU, PST025182N1, PST025182N2, PST025183N1, PST025183...
## dbl (20): PST025182F, PST025182D, PST025183F, PST025183D, PST025184F, PST025...
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
wrap3 <- my_wrapper("https://www4.stat.ncsu.edu/~online/datasets/PST01c.csv")
## Rows: 3198 Columns: 42
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (22): Area_name, STCOU, PST035191N1, PST035191N2, PST035192N1, PST035192...
## dbl (20): PST035191F, PST035191D, PST035192F, PST035192D, PST035193F, PST035...
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
wrap4 <- my_wrapper("https://www4.stat.ncsu.edu/~online/datasets/PST01d.csv")
## Rows: 3198 Columns: 42
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (22): Area_name, STCOU, PST045200N1, PST045200N2, PST045201N1, PST045201...
## dbl (20): PST045200F, PST045200D, PST045201F, PST045201D, PST045202F, PST045...
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
comb1 <- combined_function(wrap1, wrap2)
comb2 <- combined_function(wrap3, wrap4)
comb3 <- combined_function(comb1, comb2)
comb3
## [[1]]
## # A tibble: 125,800 x 7
## area_name STCOU enrollment enrollment_value Value_type Year State
## <chr> <chr> <chr> <dbl> <chr> <dbl> <chr>
## 1 Autauga, AL 01001 PST015171D 25508 PST0151 1971 AL
## 2 Autauga, AL 01001 PST015172D 27166 PST0151 1972 AL
## 3 Autauga, AL 01001 PST015173D 28463 PST0151 1973 AL
## 4 Autauga, AL 01001 PST015174D 29266 PST0151 1974 AL
## 5 Autauga, AL 01001 PST015175D 29718 PST0151 1975 AL
## 6 Autauga, AL 01001 PST015176D 29896 PST0151 1976 AL
## 7 Autauga, AL 01001 PST015177D 30462 PST0151 1977 AL
## 8 Autauga, AL 01001 PST015178D 30882 PST0151 1978 AL
## 9 Autauga, AL 01001 PST015179D 32055 PST0151 1979 AL
## 10 Autauga, AL 01001 PST025181D 31985 PST0251 1981 AL
## # ... with 125,790 more rows
##
## [[2]]
## # A tibble: 2,120 x 7
## area_name STCOU enrollment enrollment_value Value_type Year division
## <chr> <chr> <chr> <dbl> <chr> <dbl> <chr>
## 1 UNITED STATES 00000 PST015171D 206827028 PST0151 1971 ERROR
## 2 UNITED STATES 00000 PST015172D 209283904 PST0151 1972 ERROR
## 3 UNITED STATES 00000 PST015173D 211357490 PST0151 1973 ERROR
## 4 UNITED STATES 00000 PST015174D 213341552 PST0151 1974 ERROR
## 5 UNITED STATES 00000 PST015175D 215465246 PST0151 1975 ERROR
## 6 UNITED STATES 00000 PST015176D 217562728 PST0151 1976 ERROR
## 7 UNITED STATES 00000 PST015177D 219759860 PST0151 1977 ERROR
## 8 UNITED STATES 00000 PST015178D 222095080 PST0151 1978 ERROR
## 9 UNITED STATES 00000 PST015179D 224567234 PST0151 1979 ERROR
## 10 UNITED STATES 00000 PST025181D 229466391 PST0251 1981 ERROR
## # ... with 2,110 more rows
plot.state(comb3[[2]])
## `summarise()` has grouped output by 'Year'. You can override using the
## `.groups` argument.
## 3.8 Using the plot function on the county data set using the given attributes
plot.county(comb3[[1]], state_new = "CT", order = "top", count = 6)
plot.county(comb3[[1]], state_new = "NC", order = "bottom", count = 10)
plot.county(comb3[[1]])
plot.county(comb3[[1]], state_new = "MN", order = "top", count = 4)