Introduction

Houston is one of the worst places in the country for allergies. Since reasonably good data is available, I thought I would analyze the pollen and mold data with an eye towards prediction - on both short and mid-range time scales.

As with any project like this, step one is reading in and cleaning up the raw data.

The data is available online as artisanal spreadsheets at https://www.houstontx.gov/health/Pollen-Mold/Pollen_Archives/, one file per month, from January 2013 to the present.

Note that the filename format changed from “april_2018_pollen.xlsx” to “201805-pollen-count.xls”, but in a somewhat haphazard way. To keep from building too much logic into the code, I simply construct filenames in both styles, test whether each exists on the server, and move on. There are also a couple of special cases that have to be handled separately.

path <- "http://www.houstontx.gov/health/Pollen-Mold/Pollen_Archives/"

#   Build a list of valid url's

url_list <- tribble(~url)

for (yr in as.character(2013:2019)) {
  for (mon in tolower(month.name)) {
    
    #     In 2013, someone misspelled February
    if (yr == "2013" && mon == "february") {mon <- "febraury"}
    
    # most of the files were saved as .xls files
    url <- paste0(mon, "_", yr, "_pollen.xls")
    
    # but some were saved as .xlsx files
    if (yr == "2018" || yr == "2019" ||
        (yr == "2017" && (mon == "november" || mon == "december"))) {
      url <- paste0(mon, "_", yr, "_pollen.xlsx")
    }
    
    # and lonely June 2018 is an .xls
    if (yr == "2018" && mon == "june") {url <- paste0(mon, "_", yr, "_pollen.xls")}
    
    # March 2018 is there, but the file itself is bad - skip it
    if (yr == "2018" && mon == "march") {next}
    
    # Now check to see if the file is there; if yes, add it to the url list
    if (!url.exists(paste0(path, url))) {
      print(paste(url, "does not exist"))
      next
    }
    url_list <- add_row(url_list, url = url)
  }
}

#   Later files used numeric months in the filename instead of alpha
#   Check for files named with numeric months

for (yr in as.character(2013:2019)) {
  for (mon in sprintf("%02d", 1:12)) {
    url <- paste0(yr, mon, "-pollen-count.xls")
    
    # Feb, April, June, July, and Aug 2019 are .xlsx files
    if (grepl("201902|201904|201906|201907|201908", url)) {url <- paste0(url, "x")}
    
    # Check to see if the file is there; if yes, add it to the url list
    if (!url.exists(paste0(path, url))) {
      print(paste(url, "does not exist"))
      next
    }
    url_list <- add_row(url_list, url = url)
  }
}

Read the files in

In this step I made a regrettable mistake - I used dataframe names that had no meaning. I should have put the month and year in the dataframe names - it would have made tracking down issues much easier.

For this blog, however, I will correct that mistake and give the data frames names that tie them back to the month they represent.

####################################
# now let's read the files and save
####################################

#   First read the urls into local files

for (url in url_list$url) {
  download.file(paste0(path, url), destfile = url, mode = "wb")
}

#   Now parse the excel files into data frames and save to disk

for (url in url_list$url){
  print(url)
  fileout <- paste0(url,".rds")
  df2 <- read_excel(url, col_names=FALSE)  
  saveRDS(df2, fileout)
}
## [1] "january_2013_pollen.xls"
## [1] "febraury_2013_pollen.xls"
## [1] "march_2013_pollen.xls"
## [1] "april_2013_pollen.xls"
## [1] "may_2013_pollen.xls"
## [1] "june_2013_pollen.xls"
## [1] "july_2013_pollen.xls"
## [1] "august_2013_pollen.xls"
## [1] "september_2013_pollen.xls"
## [1] "october_2013_pollen.xls"
## [1] "november_2013_pollen.xls"
## [1] "december_2013_pollen.xls"
## [1] "january_2014_pollen.xls"
## [1] "february_2014_pollen.xls"
## [1] "march_2014_pollen.xls"
## [1] "april_2014_pollen.xls"
## [1] "may_2014_pollen.xls"
## [1] "june_2014_pollen.xls"
## [1] "july_2014_pollen.xls"
## [1] "august_2014_pollen.xls"
## [1] "september_2014_pollen.xls"
## [1] "october_2014_pollen.xls"
## [1] "november_2014_pollen.xls"
## [1] "december_2014_pollen.xls"
## [1] "january_2015_pollen.xls"
## [1] "february_2015_pollen.xls"
## [1] "march_2015_pollen.xls"
## [1] "april_2015_pollen.xls"
## [1] "may_2015_pollen.xls"
## [1] "june_2015_pollen.xls"
## [1] "july_2015_pollen.xls"
## [1] "august_2015_pollen.xls"
## [1] "september_2015_pollen.xls"
## [1] "october_2015_pollen.xls"
## [1] "november_2015_pollen.xls"
## [1] "december_2015_pollen.xls"
## [1] "january_2016_pollen.xls"
## [1] "february_2016_pollen.xls"
## [1] "march_2016_pollen.xls"
## [1] "april_2016_pollen.xls"
## [1] "may_2016_pollen.xls"
## [1] "june_2016_pollen.xls"
## [1] "july_2016_pollen.xls"
## [1] "august_2016_pollen.xls"
## [1] "september_2016_pollen.xls"
## [1] "october_2016_pollen.xls"
## [1] "november_2016_pollen.xls"
## [1] "december_2016_pollen.xls"
## [1] "january_2017_pollen.xls"
## [1] "february_2017_pollen.xls"
## [1] "march_2017_pollen.xls"
## [1] "april_2017_pollen.xls"
## [1] "may_2017_pollen.xls"
## [1] "june_2017_pollen.xls"
## [1] "july_2017_pollen.xls"
## [1] "august_2017_pollen.xls"
## [1] "september_2017_pollen.xls"
## [1] "october_2017_pollen.xls"
## [1] "november_2017_pollen.xlsx"
## [1] "december_2017_pollen.xlsx"
## [1] "january_2018_pollen.xlsx"
## [1] "february_2018_pollen.xlsx"
## [1] "may_2018_pollen.xlsx"
## [1] "june_2018_pollen.xls"
## [1] "201803-pollen-count.xls"
## [1] "201804-pollen-count.xls"
## [1] "201807-pollen-count.xls"
## [1] "201808-pollen-count.xls"
## [1] "201809-pollen-count.xls"
## [1] "201810-pollen-count.xls"
## [1] "201811-pollen-count.xls"
## [1] "201812-pollen-count.xls"
## [1] "201901-pollen-count.xls"
## [1] "201902-pollen-count.xlsx"
## [1] "201903-pollen-count.xls"
## [1] "201904-pollen-count.xlsx"
## [1] "201905-pollen-count.xls"
## [1] "201906-pollen-count.xlsx"
## [1] "201907-pollen-count.xlsx"
## [1] "201908-pollen-count.xlsx"

Final cleanup

Here we will read each data frame in and parse it into a usable form that can be joined into one large dataset. This is made challenging by the artisanal nature of the spreadsheets - while they are all similar, they can differ in important details. Comments in the code describe these issues.

The clean function is largely from Joel Schwartz, who posted it in answer to my plea for help on the RStudio community forum.

In the original version, I grabbed the year and month from cells in the spreadsheet - cells which could move about a little. However, out of the 80 Excel files, I discovered two that had empty year and month entries. Originally I used a data frame editor to put the correct values into those two dataframes by hand, but here I have rewritten things to handle it. Instead of using the dates stored in the files, I now use the name of the dataframe, since it has the date in it. Ultimately this is more reliable: the name is much easier to look at and QC, and it is naturally QC’d by the act of reading in the files. Probably a lesson in there.

#  Create a filename column in url_list mapping each url to a month_year name
url_list$filename <- NA_character_
for (i in seq_len(nrow(url_list))) {
  url <- url_list$url[i]
  if (grepl("febraury", url)){ # misspelled February case
    url_list[i,]$filename <- "february_2013"
  }
  else if (grepl("^20", url)) { # numeric dates
    y <- str_sub(url,1,4) # get year
    m <- str_sub(url,5,6) # get month
    url_list[i,]$filename <- paste0(tolower(month.name)[as.numeric(m)],
                                      "_",
                                      y)
  }
  else { # regular alpha dates
    url_list[i,]$filename <- str_extract(url,"\\w+(?=_pollen)")
  }
}
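
As a quick sanity check, the two oddball cases - the misspelled February file and one of the numeric-date files - should map back to february_2013 and august_2019:

url_list[url_list$url %in%
           c("febraury_2013_pollen.xls", "201908-pollen-count.xlsx"), ]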

# Read in df's from disk
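#   (A sketch of this step, assuming the .rds files written above sit in
#   the working directory: read each one back and assign it to the
#   month_year name built in url_list$filename, e.g. february_2013.)
for (i in seq_len(nrow(url_list))) {
  assign(url_list$filename[i],
         readRDS(paste0(url_list$url[i], ".rds")))
}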



clean = function(data) {

  # Recode column names
  names.row = grep("DATE", data[, 1, drop=TRUE], ignore.case=TRUE)
  data[names.row, which(is.na(data[names.row,]))] <- 
    rep("NULL", sum(is.na(data[names.row,]))) # sometimes a header cell is empty
  recode_vals = translate$to %>% set_names(translate$from)
  old_names = unlist(data[names.row, ][-1]) 
  names(data) = c("Date", recode(old_names, !!!recode_vals))
  
  # Get Month and Year for dates
  names.col = grep("Month:", data, ignore.case=TRUE)
  names.row = grep("Month:", data[, names.col, drop=TRUE], ignore.case=TRUE)
  mon  <- str_remove(data[names.row, names.col, drop=TRUE], "Month:\\s*|MONTH:\\s*")
  names.col = grep("Year:", data, ignore.case=TRUE)
  names.row = grep("Year", data[, names.col, drop=TRUE], ignore.case=TRUE)
  yr  <- str_remove(data[names.row, names.col, drop=TRUE], "YEAR:\\s*|Year:\\s*")

  # Remove Month, Year, Date, POLLEN, and Total rows
  data = data[!grepl("Month|YEAR|DATE|Total|POLLEN", data$Date, ignore.case=TRUE), ]
  data = data[!is.na(data$Date),]
  
  # Change Date column to correct dates
  data$Date = paste(yr, mon, data$Date, sep="-")
  data$Date = lubridate::ymd(data$Date)
  data = data[!is.na(data$Date),] # for things like Feb 31
  
  print(data$Date[1])   # progress check - first date of each month processed
  
  data
}

# Clean up plant names to be used for variable names
translate <- tribble(
  ~from,                        ~to,
"Ashe Juniper / Bald Cypress",  "Ashe_JuniperOrBald_Cypress", 
"Alnus(Alder)",                 "Alnus",
"Black Gum",                    "Black_Gum", 
"Black Walnut",                 "Black_Walnut", 
"Cotton Wood",                  "Cotton_Wood",
"Glandular Mesquite",           "Glandular_Mesquite", 
"Osage Orange",                 "Osage_Orange", 
"Sweet Gum",                    "Sweet_Gum", 
"Gingko Biloba",                "Gingko_Biloba",  
"Burweed / Marshelder",         "BurweedOrMarshelder", 
"Dog Fennel",                   "Dog_Fennel", 
"Lamb's Quarters",              "Lambs_Quarters", 
"Partridge Pea",                "Partridge_Pea", 
"Plum Grannet",                 "Plum_Grannet", 
"WILLOW",                       "Willow", 
"plantago(plantain)",           "Plantago", 
"Plantago(Plantain)",           "Plantago", 
"Plantago(plantain)",           "Plantago", 
"PLANTAGO",                     "Plantago", 
"Walnut(juglans)",              "Walnut", 
"Other weed pollen",            "Other_Weed", 
"Other weed/unidentified",      "Other_Weed", 
"other weed pollen",            "Other_Weed", 
"other weed",                   "Other_Weed", 
"Other Weed",                   "Other_Weed", 
"OTHER WEED",                   "Other_Weed", 
"OTHER TREE",                   "Other_Tree", 
"Other Tree/Unidentified",      "Other_Tree", 
"other tree pollen",            "Other_Tree", 
"OTHER TREE POLLEN",            "Other_Tree", 
"Other tree pollen",            "Other_Tree", 
"Other Tree",                   "Other_Tree", 
"Wild Carrot",                  "Wild_Carrot" 
)
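
For reference, set_names() turns this table into a named vector (c("Ashe Juniper / Bald Cypress" = "Ashe_JuniperOrBald_Cypress", ...)), and the !!! in clean() splices that vector into recode() as individual from = to pairs; names not in the table pass through unchanged. With translate defined, clean() can be sanity-checked on a single month before mapping it over everything - for example, on the february_2013 frame assigned earlier:

head(clean(february_2013))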


df <- map_df(mget(url_list$filename), clean) %>%  
  select(-contains("Total"), -contains("Tech")) # contains() ignores case
## [1] "2019-08-01"
df <- df %>% mutate_if(is.character,as.numeric)

saveRDS(df, paste0(filepath,"MasterPollenData.rds"))
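
The combined dataset can then be read back in later sessions with:

df <- readRDS(paste0(filepath, "MasterPollenData.rds"))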