Can someone explain why the marked line below returns numbers instead of setting the column names the strings in that row? How would I get the correct column names if I include the commented out colClasses
line when reading the table?
url<-'http://qpublic7.qpublic.net/ga_subdivison.php?county=ga_clarke&searchType=nbhd&numberValue=4025R&nameValue=§ionValue=&townshipValue=&rangeValue=&startDate=01-1998&endDate=&startPrice=&endPrice=&startArea=&endArea=&startAcreage=&endAcreage=&saleQualification=All&saleVacant=All&propertyType=All&reasonType=All&start=0'
library(XML)
#colClasses = c("character","character","character","character","integer","integer","integer","character","character","integer","character","character"),
data<-readHTMLTable(url,header=F,as.data.frame=T)
View(data)
csv<-as.data.frame(data)
colnames(csv)<-csv[4,] #why does this line return numbers?
colnames(csv)<-gsub(" ","",colnames(csv))
View(head(csv))
csv<-csv[-c(1:4),]
#####
View(csv)
You were bit by stringsAsFactors
. Plus, your as.data.frame
call was not the right thing to do (look at the data structure with str
vs View
).
library(XML)
URL <- 'http://qpublic7.qpublic.net/ga_subdivison.php?county=ga_clarke&searchType=nbhd&numberValue=4025R&nameValue=§ionValue=&townshipValue=&rangeValue=&startDate=01-1998&endDate=&startPrice=&endPrice=&startArea=&endArea=&startAcreage=&endAcreage=&saleQualification=All&saleVacant=All&propertyType=All&reasonType=All&start=0'
csv <- readHTMLTable(URL, header=FALSE, as.data.frame=TRUE, stringsAsFactors=FALSE)[[2]]
colnames(csv) <- csv[4,]
colnames(csv) <- gsub(" ", "", colnames(csv))
csv <- csv[-c(1:4),]
dplyr::glimpse(csv)
## Observations: 52
## Variables: 11
## $ \/ParcelNumber\/ (chr) "173C2 F023", "173C2 G009", "173C2 G007", "17...
## $ PropertyType (chr) "R", "R", "R", "R", "R", "R", "R", "R", "R"...
## $ SaleDate (chr) "12-2015", "08-2015", "08-2015", "07-2015",...
## $ SalePrice (chr) "200,000", "265,000", "210,000", "188,000",...
## $ HeatedSqFt (chr) "1,538", "1,756", "1,415", "1,125", "1,559"...
## $ Acres (chr) "0.30", "0.37", "0.37", "0.38", "0.32", "0....
## $ SaleQual (chr) "Q", "Q", "Q", "Q", "Q", "Q", "U", "Q", "Q"...
## $ Reason (chr) "FM", "FM", "FM", "FM", "FM", "FM", "B", "F...
## $ YearBuilt (chr) "1952", "1954", "1963", "1963", "1998", "19...
## $ LocationAddress (chr) "155 HARDIN DR", "140 HARDIN DR", "150 HARD...
## $ Neighborhood (chr) "4025R-RIVERDALE", "4025R-RIVERDALE", "4025...
# or use the more modern xml2 ---------------------------------------------
library(xml2)
library(rvest)
pg <- read_html(URL)
csv2 <- html_table(html_nodes(pg, "table")[[2]], fill=TRUE)
colnames(csv2) <- csv[4,]
colnames(csv2) <- gsub(" ", "", colnames(csv))
csv2 <- csv2[-c(1:4), -c(12:13)]
dplyr::glimpse(csv2)
## Observations: 52
## Variables: 11
## $ \/ParcelNumber\/ (chr) "173C2 F023 ", "173C2 G009 ", "173C2 G007 ", ...
## $ PropertyType (chr) "R ", "R ", "R ", "R ", "R ", "R ", "R ", "...
## $ SaleDate (chr) "12-2015 ", "08-2015 ", "08-2015 ", "07-201...
## $ SalePrice (chr) "200,000 ", "265,000 ", "210,000 ", "188,00...
## $ HeatedSqFt (chr) "1,538 ", "1,756 ", "1,415 ", "1,125 ", "1,...
## $ Acres (chr) "0.30 ", "0.37 ", "0.37 ", "0.38 ", "0.32 "...
## $ SaleQual (chr) "Q ", "Q ", "Q ", "Q ", "Q ", "Q ", "U ", "...
## $ Reason (chr) "FM ", "FM ", "FM ", "FM ", "FM ", "FM ", "...
## $ YearBuilt (chr) "1952 ", "1954 ", "1963 ", "1963 ", "1998 "...
## $ LocationAddress (chr) "155 HARDIN DR ", "140 HARDIN DR ", "150 HA...
## $ Neighborhood (chr) "4025R-RIVERDALE ", "4025R-RIVERDALE ", "40...