Houston Pollen Data
Introduction
Houston is one of the worst places in the country for allergies. Since there is reasonably good data available, I thought I should analyze the pollen and mold data with an eye towards prediction - both short and mid range time scales.
As with any project like this, step one is reading in and cleaning up the raw data.
The data is available online as artisanal spreadsheets at https://www.houstontx.gov/health/Pollen-Mold/Pollen_Archives/, one file per month, from January 2013 to the present.
Note that the filename formats changed from “april_2018_pollen.xlsx” to “201805-pollen-count.xls”, but in a somewhat haphazard way. To keep from building too much logic into the code I just look for all files named either way, test to see if they exist, and then move on. There are also a couple of special cases that have to be handled separately.
path <- "http://www.houstontx.gov/health/Pollen-Mold/Pollen_Archives/"
# Build a list of valid url's for the alpha-month filename style,
# e.g. "april_2018_pollen.xlsx"
url_list <- tribble(~url)
for (yr in as.character(2013:2019)) {
  for (mon in tolower(month.name)) {
    # In 2013, someone mis-spelled February in the filename
    if (yr == "2013" && mon == "february") {
      mon <- "febraury"
    }
    # most of the files were saved as .xls files
    url <- paste0(mon, "_", yr, "_pollen.xls")
    # but some were saved as .xlsx files
    if (yr == "2018" || yr == "2019" ||
        (yr == "2017" && (mon == "november" || mon == "december"))) {
      url <- paste0(mon, "_", yr, "_pollen.xlsx")
    }
    # and lonely june 2018 is an .xls
    if (yr == "2018" && mon == "june") {
      url <- paste0(mon, "_", yr, "_pollen.xls")
    }
    # March 2018 file is on the server but bad - skip it entirely
    if (yr == "2018" && mon == "march") {
      next
    }
    # Now check to see if file is there; if not, report and move on
    if (!url.exists(paste0(path, url))) {
      print(paste(url, "does not exist"))
      next
    }
    # add to url_list
    url_list <- add_row(url_list, url = url)
  }
}
# Later files had numeric months instead of alpha,
# e.g. "201805-pollen-count.xls" - read those in too
for (yr in as.character(2013:2019)) {
  for (mon in sprintf("%02d", 1:12)) {
    url <- paste0(yr, mon, "-pollen-count.xls")
    # Feb, April, June, July, and Aug 2019 are .xlsx files
    # (single pattern replaces two back-to-back grepl checks)
    if (grepl("201902|201904|201906|201907|201908", url)) {
      url <- paste0(url, "x")
    }
    # Check to see if file is there; if not, report and move on
    if (!url.exists(paste0(path, url))) {
      print(paste(url, "does not exist"))
      next
    }
    # add to url_list
    url_list <- add_row(url_list, url = url)
  }
}
Read the files in
In this step I made a regrettable mistake - I used dataframe names that had no meaning. I should have had the month and year in the dataframe name - it would have made tracking down issues much easier.
For this blog, however, I will correct that mistake and give the data frames names that tie them back to the month they represent.
####################################
# now let's read the files and save
####################################
# First download each url into a local file of the same name.
# mode="wb" keeps the Excel binaries intact on Windows.
for (url in url_list$url) {
  download.file(paste0(path, url), destfile = url, mode = "wb")
}
# Now parse the excel files into data frames and cache each one to disk
# as an .rds file named after its source url.
# col_names=FALSE because the real headers are buried inside the sheet;
# clean() sorts that out later.
for (url in url_list$url) {
  print(url)
  fileout <- paste0(url, ".rds")
  df2 <- read_excel(url, col_names = FALSE)
  saveRDS(df2, fileout)
}
## [1] "january_2013_pollen.xls"
## [1] "febraury_2013_pollen.xls"
## [1] "march_2013_pollen.xls"
## [1] "april_2013_pollen.xls"
## [1] "may_2013_pollen.xls"
## [1] "june_2013_pollen.xls"
## [1] "july_2013_pollen.xls"
## [1] "august_2013_pollen.xls"
## [1] "september_2013_pollen.xls"
## [1] "october_2013_pollen.xls"
## [1] "november_2013_pollen.xls"
## [1] "december_2013_pollen.xls"
## [1] "january_2014_pollen.xls"
## [1] "february_2014_pollen.xls"
## [1] "march_2014_pollen.xls"
## [1] "april_2014_pollen.xls"
## [1] "may_2014_pollen.xls"
## [1] "june_2014_pollen.xls"
## [1] "july_2014_pollen.xls"
## [1] "august_2014_pollen.xls"
## [1] "september_2014_pollen.xls"
## [1] "october_2014_pollen.xls"
## [1] "november_2014_pollen.xls"
## [1] "december_2014_pollen.xls"
## [1] "january_2015_pollen.xls"
## [1] "february_2015_pollen.xls"
## [1] "march_2015_pollen.xls"
## [1] "april_2015_pollen.xls"
## [1] "may_2015_pollen.xls"
## [1] "june_2015_pollen.xls"
## [1] "july_2015_pollen.xls"
## [1] "august_2015_pollen.xls"
## [1] "september_2015_pollen.xls"
## [1] "october_2015_pollen.xls"
## [1] "november_2015_pollen.xls"
## [1] "december_2015_pollen.xls"
## [1] "january_2016_pollen.xls"
## [1] "february_2016_pollen.xls"
## [1] "march_2016_pollen.xls"
## [1] "april_2016_pollen.xls"
## [1] "may_2016_pollen.xls"
## [1] "june_2016_pollen.xls"
## [1] "july_2016_pollen.xls"
## [1] "august_2016_pollen.xls"
## [1] "september_2016_pollen.xls"
## [1] "october_2016_pollen.xls"
## [1] "november_2016_pollen.xls"
## [1] "december_2016_pollen.xls"
## [1] "january_2017_pollen.xls"
## [1] "february_2017_pollen.xls"
## [1] "march_2017_pollen.xls"
## [1] "april_2017_pollen.xls"
## [1] "may_2017_pollen.xls"
## [1] "june_2017_pollen.xls"
## [1] "july_2017_pollen.xls"
## [1] "august_2017_pollen.xls"
## [1] "september_2017_pollen.xls"
## [1] "october_2017_pollen.xls"
## [1] "november_2017_pollen.xlsx"
## [1] "december_2017_pollen.xlsx"
## [1] "january_2018_pollen.xlsx"
## [1] "february_2018_pollen.xlsx"
## [1] "may_2018_pollen.xlsx"
## [1] "june_2018_pollen.xls"
## [1] "201803-pollen-count.xls"
## [1] "201804-pollen-count.xls"
## [1] "201807-pollen-count.xls"
## [1] "201808-pollen-count.xls"
## [1] "201809-pollen-count.xls"
## [1] "201810-pollen-count.xls"
## [1] "201811-pollen-count.xls"
## [1] "201812-pollen-count.xls"
## [1] "201901-pollen-count.xls"
## [1] "201902-pollen-count.xlsx"
## [1] "201903-pollen-count.xls"
## [1] "201904-pollen-count.xlsx"
## [1] "201905-pollen-count.xls"
## [1] "201906-pollen-count.xlsx"
## [1] "201907-pollen-count.xlsx"
## [1] "201908-pollen-count.xlsx"
Final cleanup
Here we will read each data frame in, and parse it into a useable form that can be joined together into one large dataset. This is made challenging by the artisanal nature of the spreadsheets - while they are all similar, they also can differ in important details. Comments in the code will describe these issues.
The clean function is largely from Joel Schwartz, who posted it in answer to my plea for help on the rstudio forum.
In the original version, I grabbed the year and month from those cells in the spreadsheet - which could move about a little. However, out of the 75 Excel files, I discovered two that had empty year and month entries. Originally, I used a data frame editor to just put the correct values into the two dataframes, but here I have rewritten things to handle that. Instead of using the dates stored in the files, I now use the name of the dataframe, since it has the date in it. Ultimately this is more reliable, since it is much easier to look at and QC, and it is naturally QC'd by the act of reading in the files. Probably a lesson in there.
# Create a filename column in url_list holding a "month_year" label
# (e.g. "february_2013") derived from the url itself, rather than from
# cells inside the spreadsheets (which are sometimes blank - see text).
url_list$filename <- NA
for (i in seq_len(nrow(url_list))) {
  url <- url_list$url[i]
  if (grepl("febraury", url)) {
    # the one misspelled-February file is from February 2013
    url_list$filename[i] <- "february_2013"
  } else if (grepl("^20", url)) {
    # numeric-date style, e.g. "201805-pollen-count.xls"
    y <- str_sub(url, 1, 4) # get year
    m <- str_sub(url, 5, 6) # get month
    url_list$filename[i] <- paste0(tolower(month.name)[as.numeric(m)],
                                   "_",
                                   y)
  } else {
    # regular alpha-date style, e.g. "april_2018_pollen.xlsx" -
    # grab everything before "_pollen"
    url_list$filename[i] <- str_extract(url, "\\w+(?=_pollen)")
  }
}
# Read in df's from disk
# clean(): parse one raw pollen spreadsheet (read with col_names=FALSE, so
# every column is character and unnamed) into a tidy data frame with a real
# Date column and recoded species column names. Relies on the file-level
# `translate` lookup table. Largely from Joel Schwartz (see text above).
clean = function(data) {
# Recode column names: find the header row (the row whose first column
# contains "DATE"), replace any NA header cells with the literal string
# "NULL" so every column gets some name, then map raw species headings to
# clean names via `translate`.
names.row = grep("DATE", data[, 1, drop=TRUE], ignore.case=TRUE)
data[names.row, which(is.na(data[names.row,]))] <-
rep("NULL", sum(is.na(data[names.row,]))) # sometimes the variable is NA
recode_vals = translate$to %>% set_names(translate$from)
old_names = unlist(data[names.row, ][-1])
names(data) = c("Date", recode(old_names, !!!recode_vals))
# Get Month and Year for dates. The "Month:"/"Year:" cells wander from file
# to file, so first grep across the data frame (treated column-by-column) to
# find the column, then grep within that column to find the row.
names.col = grep("Month:", data, ignore.case=TRUE)
names.row = grep("Month:", data[, names.col, drop=TRUE], ignore.case=TRUE)
#mon <- str_remove(data[1,]$Date, "Month:\\s*")
mon <- str_remove(data[names.row, names.col], "Month:\\s*|MONTH:\\s*")
#mon <- str_remove(data[names.row,]$Date, "Month:\\s*|MONTH:\\s*")
#mon <- match(mon, toupper(month.name))
names.col = grep("Year:", data, ignore.case=TRUE)
names.row = grep("Year", data[, names.col, drop=TRUE], ignore.case=TRUE)
yr <- str_remove(data[names.row, names.col], "YEAR:\\s*|Year:\\s*")
#yr <- str_remove(data[2,]$Date, "YEAR:\\s*")
# Remove Month, Year, Date, POLLEN, and Total rows (header/footer clutter),
# then any rows with no date at all
data = data[!grepl("Month|YEAR|DATE|Total|POLLEN", data$Date, ignore.case=TRUE), ]
data = data[!is.na(data$Date),]
# Change Date column to correct dates: build "year-month-day" strings and
# parse with lubridate. NOTE(review): mon may be a month *name* here, not a
# number - this assumes ymd() accepts alpha month names; confirm.
data$Date = paste(yr, mon, data$Date, sep="-")
data$Date = lubridate::ymd(data$Date)
data = data[!is.na(data$Date),] # for things like Feb 31
# Print the first parsed date as a quick sanity check, then return the
# cleaned data frame
print(data$Date[1])
data
}
# Clean up plant names to be used for variable names.
# Lookup table mapping raw spreadsheet column headings (`from`) to valid,
# consistent R variable names (`to`). The many spelling/capitalization
# variants seen across the artisanal spreadsheets each collapse to a single
# canonical name. Consumed by clean() via recode().
translate <- tribble(
~from, ~to,
# multi-word names: replace spaces/slashes so they make valid identifiers
"Ashe Juniper / Bald Cypress", "Ashe_JuniperOrBald_Cypress",
"Alnus(Alder)", "Alnus",
"Black Gum", "Black_Gum",
"Black Walnut", "Black_Walnut",
"Cotton Wood", "Cotton_Wood",
"Glandular Mesquite", "Glandular_Mesquite",
"Osage Orange", "Osage_Orange",
"Sweet Gum", "Sweet_Gum",
"Gingko Biloba", "Gingko_Biloba",
"Burweed / Marshelder", "BurweedOrMarshelder",
"Dog Fennel", "Dog_Fennel",
"Lamb's Quarters", "Lambs_Quarters",
"Partridge Pea", "Partridge_Pea",
"Plum Grannet", "Plum_Grannet",
"WILLOW", "Willow",
# every capitalization of plantago seen in the files
"plantago(plantain)", "Plantago",
"Plantago(Plantain)", "Plantago",
"Plantago(plantain)", "Plantago",
"PLANTAGO", "Plantago",
"Walnut(juglans)", "Walnut",
# catch-all "other weed" variants
"Other weed pollen", "Other_Weed",
"Other weed/unidentified", "Other_Weed",
"other weed pollen", "Other_Weed",
"other weed", "Other_Weed",
"Other Weed", "Other_Weed",
"OTHER WEED", "Other_Weed",
# catch-all "other tree" variants
"OTHER TREE", "Other_Tree",
"Other Tree/Unidentified", "Other_Tree",
"other tree pollen", "Other_Tree",
"OTHER TREE POLLEN", "Other_Tree",
"Other tree pollen", "Other_Tree",
"Other Tree", "Other_Tree",
"Wild Carrot", "Wild_Carrot"
)
# Read each cached data frame back from disk, clean it, and bind everything
# into one master data frame. The original mget(ls(pattern = "df[0-9]"))
# only matched the loop scratch variable df2, so clean() ran on just the
# last file read (note the single date printed above). Driving this from
# url_list instead makes the inputs explicit and names each piece after its
# month, per url_list$filename.
raw_frames <- map(paste0(url_list$url, ".rds"), readRDS) %>%
  set_names(url_list$filename)
df <- map_df(raw_frames, clean) %>%
  # drop per-sheet totals and technician columns; contains() is
  # case-insensitive by default, so one pattern covers Total/TOTAL
  select(-contains("Total"), -contains("Tech"))
# count columns arrive as character - convert them all to numeric
df <- df %>% mutate_if(is.character, as.numeric)
# filepath is defined elsewhere in the project
saveRDS(df, paste0(filepath, "MasterPollenData.rds"))