Introduction

In this notebook we analyse each domain in the Alexa top 1 million domain list, determining the record type returned for www on every domain, and perform basic analysis of the CNAME records as well as the authoritative name servers.

Considerations when interpreting this data

All queries in the supplied data set were captured over approximately two days, between November 12th and 13th 2018, and were performed from a host within AS2818; answers may therefore have been influenced by EDNS Client Subnet.

Record type breakdown

Each one of the domains was queried for a www subdomain, either as A or CNAME, in addition to a query for name servers for the domain.

The NXDOMAIN responses largely arise because the listed domains are themselves subdomains, or are domains not intended for users to type into a browser's address bar (e.g. bp.blogspot.com) but used for serving other content. There are also several that are not intended to serve a web page themselves, but are linked to (e.g. redd.it).

# Build `recordTotals`: one row per outcome of the www lookup, holding the
# outcome label (`record`), the number of domains (`value`) and that number
# as a percentage (`percent`).

# Rows with an empty class are reported as SERVFAIL; rows whose class is
# neither empty nor "IN" are reported as NXDOMAIN.
totalServfail <- recordData %>%
  filter(class == "") %>%
  summarize(count = n())
totalNxDomain <- recordData %>%
  filter(class != "" & class != "IN") %>%
  summarize(count = n())

# Answered lookups (class "IN"), counted per record type.
totals <- recordData %>%
  group_by(recordType) %>%
  filter(class == "IN") %>%
  summarize(count = n())

# Assemble the columns in a single vectorised step. The previous
# `for (i in 1:nrow(totals))` loop iterated over c(1, 0) when `totals` was
# empty (injecting NA rows) and re-copied every vector on each append.
record <- c(as.character(totals$recordType), "NXDOMAIN", "SERVFAIL")
value <- c(
  totals$count,
  as.numeric(totalNxDomain$count),
  as.numeric(totalServfail$count)
)

# count / 10000 == count / 1,000,000 * 100, i.e. a percentage of the
# one-million-domain list.
# NOTE(review): assumes recordData holds exactly 1,000,000 domains - confirm.
percent <- value / 10000

recordTotals <- data.frame(
  record  = record,
  value   = value,
  percent = percent
)
# Base layer: a single stacked bar whose segments are sized by `value` and
# coloured by `record`.
recordTypePlot <- ggplot(
  recordTotals,
  aes(x = "", y = value, fill = record)
) +
  geom_bar(stat = "identity", width = 1)

# Wrap the stacked bar around the y axis to obtain a pie chart, then remove
# the axis titles, ticks, labels, border and grid, and centre the title.
recordTypePlot +
  coord_polar(theta = "y", start = 0) +
  scale_fill_brewer(name = "Record Type", palette = "Paired") +
  theme_minimal() +
  labs(title = "Distribution of record types") +
  theme(
    plot.title   = element_text(hjust = 0.5),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    axis.ticks   = element_blank(),
    axis.text    = element_blank(),
    panel.border = element_blank(),
    panel.grid   = element_blank()
  )

# Printable copy of recordTotals: record type plus a formatted percentage.
recordTable <- data.frame(recordTotals)

# Format the numeric percentages directly. The previous row-wise apply()
# coerced the whole data frame to a character matrix (round-tripping the
# numbers through format()) and addressed the column by position.
recordTable$percent <- sprintf("%.2f%%", recordTable$percent)

# The raw counts are not shown in the table.
recordTable$value <- NULL
colnames(recordTable) <- c("Record Type", "Percentage")
kable(recordTable)
Record Type Percentage
A 51.00%
CNAME 46.74%
DNAME 0.00%
NXDOMAIN 1.71%
SERVFAIL 0.55%

CNAME Usage

# Known CNAME targets that identify a hosting service.
# Each entry is a character vector of three elements:
#   [1] service category (used as `type` when building cnameUsage)
#   [2] provider name    (used as `provider`)
#   [3] domain suffix, including the trailing root dot, that is looked for
#       at the end of a record's `answer`
# Entries that are commented out are currently excluded from the analysis.
# FIXME: this could be done better
cnameList <- list(

  ### CDNs
  c("Content Delivery Network", "Akamai", "edgekey.net."),
  c("Content Delivery Network", "Akamai", "edgesuite.net."),
  c("Content Delivery Network", "Akamai", "akadns.net."),

  c("Content Delivery Network", "Fastly", "fastly.net."),
  c("Content Delivery Network", "Fastly", "fastlylb.net."),
  c("Content Delivery Network", "CloudFlare", "cloudflare.net."),
  c("Content Delivery Network", "Amazon Web Services", "cloudfront.net."),
  #c("Content Delivery Network", "Cedexis", "cedexis.net."),
  #c("Content Delivery Network", "FastCDN", "fastcdn.com."),
  #c("Content Delivery Network", "J-Stream", "stream.ne.jp."),
  c("Content Delivery Network", "Incapsula", "incapdns.net."),
  #c("Content Delivery Network", "Azion", "azioncdn.net."),
  #c("Content Delivery Network", "Fasterized", "fasterized.com."),
  c("Content Delivery Network", "Azure", "azureedge.net."),
  #c("Content Delivery Network", "Distil", "distil.us."),

  ### Hosting Providers
  c("Hosting Provider", "Amazon Web Services", "elb.amazonaws.com."),
  c("Hosting Provider", "Amazon Web Services", "elasticbeanstalk.com."),
  c("Hosting Provider", "Google", "ghs.googlehosted.com."),

  c("Hosting Provider", "Azure", "azurewebsites.net."),
  c("Hosting Provider", "Azure", "cloudapp.net."),

  c("Hosting Provider", "Heroku", "herokussl.com."),
  c("Hosting Provider", "Heroku", "herokudns.com."),

  ### Content Providers
  c("Content Provider", "Google", "googleusercontent.com."),
  c("Content Provider", "Google", "ghs.google.com."),
  c("Content Provider", "Squarespace", "squarespace.com."),
  c("Content Provider", "Shopify", "shopify.com."),
  c("Content Provider", "Shopify", "shopifycloud.com."),
  c("Content Provider", "HubSpot", "hubspot.net.")
)

# Build `cnameUsage`: one row per cnameList entry with its category,
# provider, domain suffix and the number of records whose answer ends in
# that suffix.

# Split the (type, provider, suffix) triples into parallel character vectors.
hostingTypes     <- vapply(cnameList, function(entry) entry[[1]], character(1))
hostingProviders <- vapply(cnameList, function(entry) entry[[2]], character(1))
hostingDomains   <- vapply(cnameList, function(entry) entry[[3]], character(1))

# Count the records whose answer ends with each suffix. The suffix is
# compared literally: interpolating it into a regular expression (as was
# done before) made every "." match any character, so e.g. "fastly.net."
# also matched answers ending in "fastlyXnet!". Rows with a missing answer
# yield NA and are dropped by filter(), as before.
totalHosts <- vapply(
  hostingDomains,
  function(suffix) {
    matched <- recordData %>%
      filter(endsWith(as.character(answer), suffix))
    nrow(matched)
  },
  integer(1),
  USE.NAMES = FALSE
)

cnameUsage <- data.frame(
  type     = hostingTypes,
  provider = hostingProviders,
  domain   = hostingDomains,
  total    = totalHosts
)

# Column sums over the numeric columns of cnameUsage, i.e. the number of
# records matched across all known providers (a named numeric vector).
totalFoundCnames <- colSums(
  Filter(is.numeric, cnameUsage)
)

# Number of CNAME records overall; denominator for both percentages below.
cnameRecordCount <- filter(recordTotals, record == "CNAME")$value

# Share of CNAME records that point at a known provider.
foundCnamesPercentage <- totalFoundCnames[[1]] / cnameRecordCount * 100

# Records whose answer is the domain itself followed by the root dot,
# i.e. www pointing back at the apex.
wwwToApexTotal <- summarize(
  filter(recordData, answer == sprintf("%s.", domain)),
  count = n()
)

# Share of CNAME records that point back at the apex.
wwwToApexPercentage <- wwwToApexTotal[[1]] / cnameRecordCount * 100

For the records that return CNAMEs, 88939 records (19.03% of total CNAME records) point directly at addresses that reveal information about hosting providers such as CDNs, cloud providers, or other Software as a Service offerings. 299924 records (64.17% of total CNAME records) pointed users back to the apex - i.e. www.example.com. responds with a CNAME to example.com.

# Bar chart of matched CNAME records per provider. Providers are ordered
# along the x axis by their summed totals and the bars are coloured by
# service category.
cnamePlot <- ggplot(
  cnameUsage,
  aes(
    x     = reorder(provider, total, FUN = sum),
    y     = total,
    group = provider,
    fill  = type
  )
)

cnamePlot +
  geom_bar(stat = "identity") +
  scale_fill_brewer(name = "", palette = "Paired") +
  labs(title = "CNAME Answers by Service", y = "Number of records") +
  theme(
    plot.title   = element_text(hjust = 0.5),
    axis.title.x = element_blank(),
    # Slant the provider names so they do not overlap.
    axis.text.x  = element_text(angle = 45, hjust = 1)
  )

The largest representation of hosts is from Google, mostly comprising Blogspot addresses as well as a small percentage of Google Code sites.

DNAME Usage

# Number of records answered with a DNAME.
dnameTotal <- recordData %>%
  filter(recordType == "DNAME") %>%
  summarize(count = n())

# The DNAME records themselves, reduced to the two columns shown in the
# table and given display-friendly headings.
dnameRecords <- recordData %>%
  filter(recordType == "DNAME") %>%
  select(Domain = domain, Answer = answer)

Only 17 domains were using DNAME records, most of which were associated with secondary TLDs reserved for education or government use.

# Render the Domain/Answer pairs of the DNAME records as a table.
kable(dnameRecords)
Domain Answer
lancaster.ac.uk lancs.ac.uk.
gobierno.pr pr.gov.
7kanal.co.il israel7.ru.
yonhapnewstv.co.kr news-y.co.kr.
cbnu.ac.kr chungbuk.ac.kr.
dl-one2up.com dname.th7.swiftserve.com.
hochschule-bochum.de hs-bochum.de.
uos.de uni-osnabrueck.de.
multiplay.com multiplay.co.uk.
upb.de uni-paderborn.de.
nationaldrugstrategy.gov.au health.gov.au.
fh-worms.de HS-Worms.DE.
pbefcu.com pbcreditunion.com.
cancerscreening.gov.au health.gov.au.
gidaje.ng sheltered-plum-jrup4durxogpausujdn1h0pq.herokudns.com.
ianyang.ac.kr yeonsung.ac.kr.
alcohol.gov.au health.gov.au.

Nameserver Usage

# Known name-server host name fragments that identify a DNS service.
# Each entry is a character vector of three elements:
#   [1] service category (used as `type` when building nameServerUsage)
#   [2] provider name    (used as `provider`)
#   [3] fragment searched for anywhere within a name-server value
# Entries that are commented out are currently excluded from the analysis.
nameServerList <- list(
  # Unlike CNAMEs, we have to be much looser on the matching as some providers use multiple TLDs
  # List taken from https://en.wikipedia.org/wiki/List_of_managed_DNS_providers

  # DNS Providers
  c("DNS Provider", "Dyn", "dynect."),
  c("DNS Provider", "NS1", "nsone.net."),
  c("DNS Provider", "Verisign", "verisigndns"),
  c("DNS Provider", "UltraDNS", "ultradns."),
  c("DNS Provider", "DNSimple", "dnsimple.com."),
  c("DNS Provider", "ClouDNS", "cloudns.net."),
  #c("DNS Provider", "CloudfloorDNS", "cloudfloordns."),
  c("DNS Provider", "DNS Made Easy", "dnsmadeeasy.com"),
  c("DNS Provider", "EasyDNS", "easydns."),
  #c("DNS Provider", "No-IP", "no-ip.com."),
  #c("DNS Provider", "Constellix", "constellix."),

  # Registrars
  c("Registrar", "Namecheap", "registrar-servers.com."),
  c("Registrar", "Gandi", "gandi.net."),
  c("Registrar", "Godaddy", "domaincontrol.com."),
  c("Registrar", "123-reg", "123-reg.co.uk."),
  c("Registrar", "eNom", "name-services.com."),

  # Hosting Providers
  c("Hosting Provider", "AWS Route53", ".awsdns-"),
  c("Hosting Provider", "Azure", "azure-dns"),
  c("Hosting Provider", "Google", "googledomains.com"),
  #c("Hosting Provider", "Alibaba", "alibabadns.com."),
  c("Hosting Provider", "Linode", "linode.com."),
  c("Hosting Provider", "DigitalOcean", "digitalocean.com."),
  c("Hosting Provider", "Godaddy", "secureserver.net."),
  c("Hosting Provider", "OVH", "ovh.net."),

  # Content Delivery Networks
  c("Content Delivery Network", "CloudFlare", "cloudflare.com."),
  c("Content Delivery Network", "Akamai", "akam.net"),

  # Content Providers
  #c("Content Provider", "Wordpress", "wordpress.com"),
  c("Content Provider", "Google", "google.com."),
  c("Content Provider", "Squarespace", "squarespacedns.com.")
)

# Build `nameServerUsage`: one row per nameServerList entry with its
# category, provider, fragment and the number of records in which at least
# one name-server column contains that fragment.

# Split the (type, provider, fragment) triples into parallel character
# vectors.
nsHostingTypes <- vapply(nameServerList, function(entry) entry[[1]], character(1))
nsProviders    <- vapply(nameServerList, function(entry) entry[[2]], character(1))
nsDomains      <- vapply(nameServerList, function(entry) entry[[3]], character(1))

# Count the records with a matching name server.
#  * `contains("ns")` also selects the `answer` column ("a-ns-wer"), which
#    holds the www answer rather than a name server; it is excluded so that
#    CNAME targets such as "ghs.google.com." are not counted as name servers.
#    NOTE(review): assumes no other non-name-server column has "ns" in its
#    name - confirm against the recordData schema.
#  * fixed() makes the fragment a literal substring; as a regular expression
#    every "." in it would match any character.
nsTotalHosts <- vapply(
  nsDomains,
  function(fragment) {
    matched <- recordData %>%
      filter_at(
        vars(contains("ns"), -answer),
        any_vars(str_detect(., fixed(fragment)))
      )
    nrow(matched)
  },
  integer(1),
  USE.NAMES = FALSE
)

nameServerUsage <- data.frame(
  type     = nsHostingTypes,
  provider = nsProviders,
  domain   = nsDomains,
  total    = nsTotalHosts
)

# Column sums over the numeric columns, i.e. the total number of matches
# across all listed providers (a named numeric vector).
totalFoundNameServers <- colSums(Filter(is.numeric, nameServerUsage))

Out of all the records that did return name servers, 323751 name server references were matched across the dataset.

# Bar chart of matched name-server records per provider. Providers are
# ordered along the x axis by their summed totals and the bars are coloured
# by service category.
nsPlot <- ggplot(
  nameServerUsage,
  aes(
    x     = reorder(provider, total, FUN = sum),
    y     = total,
    group = provider,
    fill  = type
  )
)

nsPlot +
  geom_bar(stat = "identity") +
  scale_fill_brewer(name = "", palette = "Paired") +
  labs(
    title = "Authoritive name servers by service",
    y     = "Number of records"
  ) +
  theme(
    plot.title   = element_text(hjust = 0.5),
    axis.title.x = element_blank(),
    # Slant the provider names so they do not overlap.
    axis.text.x  = element_text(angle = 45, hjust = 1)
  )