library(tidyverse)
library(readxl)
library(janitor)
#############
### SETUP ###
#############
# download a state from
www2.census.gov/.../# put that ZIP file in this location:
census2020Dir <- "C:/data/dhc/"
# put the two-character state abbreviation here (for example, "mn" for Minnesota or "va" for Virginia)
st <- "mn"
# now unzip the contents of that ZIP file to some location specified below
# in this example, it would be "C:/data/dhc/mn2020.dhc/"
# and within that directory, there should be 45 data files: mngeo2020.dhc, mn000012020.dhc, mn000022020.dhc, ... mn000452020.dhc
census2020DirState <- paste0(census2020Dir, st, "2020.dhc/")
# depending on what state you're reading and your computing resources, memory can be an issue
# so put the summary level(s) you want here
# (reading in only certain summary levels keeps you from running into memory limitations)
# excluding census blocks (summary level 100) in particular sharply reduces the amount of memory this will use
# this example just keeps counties (050), county subdivisions (060), tracts (140), block groups (150), and places (160)
# if you want all summary levels regardless, just comment out the next line
sumlev <- c("050", "060", "140", "150", "160")
########################
### GET COLUMN NAMES ###
########################
# The data files don't have header rows with the names of the columns, so those need to be derived from the table matrix documentation
# first, download that documentation
download.file("
www2.census.gov/.../2020-dhc-table-matrix.xlsx", destfile = paste0(census2020Dir, "2020-dhc-table-matrix.xlsx"),
mode = "wb",
quiet = TRUE)
# read in the table segment info
# if you have trouble with the auto-download, just paste that link in the address bar of your browser, and manually download the file to the location assigned to `census2020Dir`
segmentInfo <- read_xlsx(paste0(census2020Dir, "2020-dhc-table-matrix.xlsx"),
sheet = "DHC Table_Segments") %>%
clean_names()
# manipulate the info in that file to get the correct cell names
segmentInfoProcessed <- segmentInfo %>%
uncount(weights = total_records, .id = "cellNum") %>%
mutate(TABLE_PREFIX = str_extract(table_id, "[A-Z]+"),
TABLE_NUM = str_extract(table_id, "[0-9]+"),
TABLE_SUFFIX = str_extract(table_id, "[A-Z]+$"),
CELL_NUM = case_when(is.na(TABLE_SUFFIX) ~ str_pad(cellNum, 4, pad = "0"),
str_sub(TABLE_PREFIX, 1, 1) == "P" & str_length(TABLE_SUFFIX) == 1 ~ str_pad(cellNum, 3, pad = "0"),
str_sub(TABLE_PREFIX, 1, 1) == "P" & str_length(TABLE_SUFFIX) == 2 ~ str_pad(cellNum, 2, pad = "0"),
str_sub(TABLE_PREFIX, 1, 1) == "H" ~ str_pad(cellNum, 4, pad = "0")),
COLNAME = str_c(TABLE_PREFIX, str_pad(TABLE_NUM, 3, pad = "0"), coalesce(TABLE_SUFFIX, ""), CELL_NUM))
# split that data into a list, where each element is a vector of column names
columnNameList <- split(segmentInfoProcessed$COLNAME, segmentInfoProcessed$segment_number) %>%
map(function(x) {
paste(c("FILEID", "STUSAB", "CHARITER", "CIFSN", "LOGRECNO", x)) # this adds to each vector of column names the five initial column names that appear in each file (not specified in the table segment documentation)
})
# unfortunately, the geography names are not available in this documentation, so those need to be defined separately
namesGeog <- c("FILEID", "STUSAB", "SUMLEV", "GEOVAR", "GEOCOMP", "CHARITER", "CIFSN", "LOGRECNO", "GEOID", "GEOCODE",
"REGION", "DIVISION", "STATE", "STATENS", "COUNTY", "COUNTYCC", "COUNTYNS", "COUSUB", "COUSUBCC", "COUSUBNS",
"SUBMCD", "SUBMCDCC", "SUBMCDNS", "ESTATE", "ESTATECC", "ESTATENS", "CONCIT", "CONCITCC", "CONCITNS", "PLACE",
"PLACECC", "PLACENS", "TRACT", "BLKGRP", "BLOCK", "AIANHH", "AIHHTLI", "AIANHHFP", "AIANHHCC", "AIANHHNS",
"AITS", "AITSFP", "AITSCC", "AITSNS", "TTRACT", "TBLKGRP", "ANRC", "ANRCCC", "ANRCNS", "CBSA", "MEMI", "CSA",
"METDIV", "NECTA", "NMEMI", "CNECTA", "NECTADIV", "CBSAPCI", "NECTAPCI", "UA", "UATYPE", "UR", "CD116", "CD118",
"CD119", "CD120", "CD121", "SLDU18", "SLDU22", "SLDU24", "SLDU26", "SLDU28", "SLDL18", "SLDL22", "SLDL24",
"SLDL26", "SLDL28", "VTD", "VTDI", "ZCTA", "SDELM", "SDSEC", "SDUNI", "PUMA", "AREALAND", "AREAWATR", "BASENAME",
"NAME", "FUNCSTAT", "GCUNI", "POP100", "HU100 ", "INTPTLAT", "INTPTLON", "LSADC", "PARTFLAG", "UGA")
###############################
### READ THE GEOGRAPHY FILE ###
###############################
# read the geography file
geo <- read_delim(paste0(census2020DirState, st, "geo2020.dhc"),
delim = "|",
col_names = namesGeog,
col_types = cols(.default = "c"))
# %>%
# select(LOGRECNO, GEOID) # make things much quicker by keeping only the essential fields; others can be reconstructed later from GEOID
# if only some summary levels are wanted, eliminate the others
if (exists("sumlev")) {
if (length(sumlev) > 0) {
geo <- geo %>%
filter(str_sub(GEOID, 1, 3) %in% sumlev)
}
}
###########################
### READ THE DATA FILES ###
###########################
# get list of the files that need to be read
dataFileNameList <- list.files(path = census2020DirState,
pattern = "mn[0-9]",
full.names = TRUE)
# define function to read one data segment
readSegment <- function(.data, .colNames) {
# define the default type of each file
# this is to save memory: can read data as integers unless it's segment 13, which contains median ages to one decimal point
if (str_detect(.data, "000132020.dhc")) {
defaultType <- "n" # segment 13 has median ages to one decimal point, so it can't be an integer
} else {
defaultType <- "i" # otherwise, reading as integers saves lots of space
}
read_delim(.data,
delim = "|",
col_names = .colNames,
col_types = cols(.default = defaultType,
FILEID = "c",
STUSAB = "c",
CHARITER = "c",
CIFSN = "c",
LOGRECNO = "c"),
na = c("", ".")) %>%
semi_join(geo, by = "LOGRECNO") %>% # limit to only those records in the geographic file
select(-CHARITER, -CIFSN) # drop unnecessary fields
}
# iterate over the list of datasets to read them
dataList <- map2(dataFileNameList, columnNameList, readSegment)
# combine the list of data files into a single table with everything
dataTibble <- dataList %>%
reduce(left_join, by = c("FILEID", "STUSAB", "LOGRECNO")) # join all the data files together
# finish up by joining with the geographic file
FINAL_DATA <- geo %>%
left_join(dataTibble, by = c("FILEID", "STUSAB", "LOGRECNO"))
# from there, you can write the data to a CSV, or summarize it however you want