htmlrrvesthtml-tableextract

readHTMLTable returns dataframe values as integers


Can someone explain why the marked line below returns numbers instead of setting the column names the strings in that row? How would I get the correct column names if I include the commented out colClasses line when reading the table?

url<-'http://qpublic7.qpublic.net/ga_subdivison.php?county=ga_clarke&searchType=nbhd&numberValue=4025R&nameValue=&sectionValue=&townshipValue=&rangeValue=&startDate=01-1998&endDate=&startPrice=&endPrice=&startArea=&endArea=&startAcreage=&endAcreage=&saleQualification=All&saleVacant=All&propertyType=All&reasonType=All&start=0'
library(XML)
#colClasses = c("character","character","character","character","integer","integer","integer","character","character","integer","character","character"),
data<-readHTMLTable(url,header=F,as.data.frame=T)
View(data)
csv<-as.data.frame(data)
colnames(csv)<-csv[4,] #why does this line return numbers?
colnames(csv)<-gsub(" ","",colnames(csv))
View(head(csv))
csv<-csv[-c(1:4),]
#####
View(csv)

Solution

  • You were bit by stringsAsFactors. Plus, your as.data.frame call was not the right thing to do (look at the data structure with str vs View).

    library(XML)
    
    URL <- 'http://qpublic7.qpublic.net/ga_subdivison.php?county=ga_clarke&searchType=nbhd&numberValue=4025R&nameValue=&sectionValue=&townshipValue=&rangeValue=&startDate=01-1998&endDate=&startPrice=&endPrice=&startArea=&endArea=&startAcreage=&endAcreage=&saleQualification=All&saleVacant=All&propertyType=All&reasonType=All&start=0'
    
    csv <- readHTMLTable(URL, header=FALSE, as.data.frame=TRUE, stringsAsFactors=FALSE)[[2]]
    
    colnames(csv) <- csv[4,]
    colnames(csv) <- gsub(" ", "", colnames(csv))
    
    csv <- csv[-c(1:4),]
    
    dplyr::glimpse(csv)
    
    ## Observations: 52
    ## Variables: 11
    ## $ \/ParcelNumber\/ (chr) "173C2 F023", "173C2 G009", "173C2 G007", "17...
    ## $ PropertyType       (chr) "R", "R", "R", "R", "R", "R", "R", "R", "R"...
    ## $ SaleDate           (chr) "12-2015", "08-2015", "08-2015", "07-2015",...
    ## $ SalePrice          (chr) "200,000", "265,000", "210,000", "188,000",...
    ## $ HeatedSqFt         (chr) "1,538", "1,756", "1,415", "1,125", "1,559"...
    ## $ Acres              (chr) "0.30", "0.37", "0.37", "0.38", "0.32", "0....
    ## $ SaleQual           (chr) "Q", "Q", "Q", "Q", "Q", "Q", "U", "Q", "Q"...
    ## $ Reason             (chr) "FM", "FM", "FM", "FM", "FM", "FM", "B", "F...
    ## $ YearBuilt          (chr) "1952", "1954", "1963", "1963", "1998", "19...
    ## $ LocationAddress    (chr) "155 HARDIN DR", "140 HARDIN DR", "150 HARD...
    ## $ Neighborhood       (chr) "4025R-RIVERDALE", "4025R-RIVERDALE", "4025...
    
    
    # or use the more modern xml2 ---------------------------------------------
    
    library(xml2)
    library(rvest)
    
    pg <- read_html(URL)
    
    csv2 <- html_table(html_nodes(pg, "table")[[2]], fill=TRUE)
    
    colnames(csv2) <- csv[4,]
    colnames(csv2) <- gsub(" ", "", colnames(csv))
    
    csv2 <- csv2[-c(1:4), -c(12:13)]
    
    dplyr::glimpse(csv2)
    
    ## Observations: 52
    ## Variables: 11
    ## $ \/ParcelNumber\/ (chr) "173C2 F023 ", "173C2 G009 ", "173C2 G007 ", ...
    ## $ PropertyType       (chr) "R ", "R ", "R ", "R ", "R ", "R ", "R ", "...
    ## $ SaleDate           (chr) "12-2015 ", "08-2015 ", "08-2015 ", "07-201...
    ## $ SalePrice          (chr) "200,000 ", "265,000 ", "210,000 ", "188,00...
    ## $ HeatedSqFt         (chr) "1,538 ", "1,756 ", "1,415 ", "1,125 ", "1,...
    ## $ Acres              (chr) "0.30 ", "0.37 ", "0.37 ", "0.38 ", "0.32 "...
    ## $ SaleQual           (chr) "Q ", "Q ", "Q ", "Q ", "Q ", "Q ", "U ", "...
    ## $ Reason             (chr) "FM ", "FM ", "FM ", "FM ", "FM ", "FM ", "...
    ## $ YearBuilt          (chr) "1952 ", "1954 ", "1963 ", "1963 ", "1998 "...
    ## $ LocationAddress    (chr) "155 HARDIN DR ", "140 HARDIN DR ", "150 HA...
    ## $ Neighborhood       (chr) "4025R-RIVERDALE ", "4025R-RIVERDALE ", "40...