osbase/net/dns/kubernetes: add Kubernetes DNS handler
This adds a DNS server handler for Kubernetes DNS service discovery. It
is partially based on the CoreDNS Kubernetes plugin. The query handler,
however, is written completely from scratch. The handler in the CoreDNS
plugin is very weird; it first handles each query type separately, and
generates msg.Service objects, which then need to be converted to DNS
records. The new implementation is much simpler, and also more correct:
It handles ANY queries, and follows the rules for NXDOMAIN (If a name is
NXDOMAIN for one qtype, it is NXDOMAIN for all qtypes, and subdomains of
the name are also NXDOMAIN.)
Change-Id: Id1d498ca5384a3b047587ed73e95e4871d82d499
Reviewed-on: https://review.monogon.dev/c/monogon/+/3259
Reviewed-by: Lorenz Brun <lorenz@monogon.tech>
Tested-by: Jenkins CI
diff --git a/osbase/net/dns/kubernetes/handler.go b/osbase/net/dns/kubernetes/handler.go
new file mode 100644
index 0000000..e6adf84
--- /dev/null
+++ b/osbase/net/dns/kubernetes/handler.go
@@ -0,0 +1,421 @@
+package kubernetes
+
+import (
+ "math/rand/v2"
+ "net"
+ "net/netip"
+
+ "github.com/miekg/dns"
+
+ netDNS "source.monogon.dev/osbase/net/dns"
+ "source.monogon.dev/osbase/net/dns/kubernetes/object"
+)
+
+const (
+ // DNSSchemaVersion is the schema version of the Kubernetes DNS
+ // specification; it is returned in the dns-version TXT record. See
+ // https://github.com/kubernetes/dns/blob/master/docs/specification.md
+ DNSSchemaVersion = "1.1.0"
+ // defaultTTL is the TTL applied to all answers. It is also used as the
+ // Minttl field of the SOA record.
+ defaultTTL = 5
+)
+
+// HandleDNS answers queries for names below k.clusterDomain and for reverse
+// names that fall inside one of k.ipRanges. For both kinds of name it marks
+// the reply authoritative, fills in answer records or the rcode, puts the
+// zone SOA into the authority section when there are no answer records, and
+// sends the reply. A request that matches neither is left untouched and no
+// reply is sent.
+func (k *Kubernetes) HandleDNS(r *netDNS.Request) {
+ if netDNS.IsSubDomain(k.clusterDomain, r.QnameCanonical) {
+ r.SetAuthoritative()
+
+ // Strip the cluster domain suffix, then split off the label directly
+ // in front of it; that label selects the branch below.
+ subdomain := r.QnameCanonical[:len(r.QnameCanonical)-len(k.clusterDomain)]
+ subdomain, last := netDNS.SplitLastLabel(subdomain)
+ if last == "svc" {
+ k.handleService(r, subdomain)
+ } else if last == "" {
+ // No label in front of the cluster domain: this branch serves the
+ // SOA and NS records of the zone.
+ if r.Qtype == dns.TypeSOA || r.Qtype == dns.TypeANY {
+ r.Reply.Answer = append(r.Reply.Answer, k.makeSOA(r.Qname))
+ }
+ if r.Qtype == dns.TypeNS || r.Qtype == dns.TypeANY {
+ r.Reply.Answer = append(r.Reply.Answer, k.makeNS(r.Qname))
+ }
+ } else if last == "dns-version" && subdomain == "" {
+ // dns-version.<clusterDomain> carries the schema version as TXT.
+ if r.Qtype == dns.TypeTXT || r.Qtype == dns.TypeANY {
+ rr := new(dns.TXT)
+ rr.Hdr = dns.RR_Header{Name: r.Qname, Rrtype: dns.TypeTXT, Class: dns.ClassINET, Ttl: defaultTTL}
+ rr.Txt = []string{DNSSchemaVersion}
+ r.Reply.Answer = append(r.Reply.Answer, rr)
+ }
+ } else if last == "dns" && (subdomain == "" || subdomain == "ns.") {
+ // Name exists but has no records.
+ // NOTE(review): presumably k.nsDomain is "ns.dns.<clusterDomain>",
+ // which is why these two names are treated as existing - confirm.
+ } else {
+ r.Reply.Rcode = dns.RcodeNameError
+ }
+
+ // NOTE(review): r.Handled is presumably set when a helper called above
+ // (e.g. r.AddCNAME in handleService) already completed the request -
+ // confirm against netDNS.Request.
+ if r.Handled {
+ return
+ }
+ // Without answer records (empty answer or an error rcode), add the
+ // SOA of the cluster domain to the authority section. The owner name
+ // is sliced from r.Qname rather than taken from k.clusterDomain.
+ if len(r.Reply.Answer) == 0 {
+ zone := r.Qname[len(r.Qname)-len(k.clusterDomain):]
+ r.Reply.Ns = []dns.RR{k.makeSOA(zone)}
+ }
+ r.SendReply()
+ return
+ }
+
+ reverseIP, reverseBits, extra := netDNS.ParseReverse(r.QnameCanonical)
+ if reverseIP.IsValid() {
+ for _, ipRange := range k.ipRanges {
+ // Skip ranges that do not contain the address, and names that
+ // specify fewer bits than the range prefix.
+ if !ipRange.Contains(reverseIP) || reverseBits < ipRange.Bits() {
+ continue
+ }
+
+ r.SetAuthoritative()
+
+ // Round the prefix length up to a multiple of 8 bits for a 32-bit
+ // address, or of 4 bits otherwise. The result is the number of
+ // address bits at which SOA and NS records are served below.
+ zoneBits := 0
+ if reverseIP.BitLen() == 32 {
+ zoneBits = (ipRange.Bits() + 7) & ^7
+ } else {
+ zoneBits = (ipRange.Bits() + 3) & ^3
+ }
+
+ if extra {
+ // Name with extra labels does not exist.
+ r.Reply.Rcode = dns.RcodeNameError
+ } else {
+ // Both conditions below can hold for the same query when the
+ // rounded prefix length equals the full address length.
+ if reverseBits == reverseIP.BitLen() {
+ k.handleReverse(r, reverseIP)
+ }
+ if reverseBits == zoneBits {
+ if r.Qtype == dns.TypeSOA || r.Qtype == dns.TypeANY {
+ r.Reply.Answer = append(r.Reply.Answer, k.makeSOA(r.Qname))
+ }
+ if r.Qtype == dns.TypeNS || r.Qtype == dns.TypeANY {
+ r.Reply.Answer = append(r.Reply.Answer, k.makeNS(r.Qname))
+ }
+ }
+ }
+
+ if len(r.Reply.Answer) == 0 {
+ // Locate the start of the zone name inside r.Qname by walking
+ // backwards and counting dots: 3 plus one per 8 (or 4) zone
+ // bits. If the name runs out first, the whole name is the zone.
+ // NOTE(review): the constant 3 presumably accounts for the
+ // "in-addr.arpa." / "ip6.arpa." suffix, the root dot and the
+ // dot in front of the zone - confirm.
+ zoneDots := 0
+ if reverseIP.BitLen() == 32 {
+ zoneDots = 3 + zoneBits/8
+ } else {
+ zoneDots = 3 + zoneBits/4
+ }
+ zoneStart := len(r.Qname)
+ for zoneStart > 0 {
+ if r.Qname[zoneStart-1] == '.' {
+ zoneDots--
+ if zoneDots == 0 {
+ break
+ }
+ }
+ zoneStart--
+ }
+ zone := r.Qname[zoneStart:]
+ r.Reply.Ns = []dns.RR{k.makeSOA(zone)}
+ }
+ // Only the first matching range is used.
+ r.SendReply()
+ return
+ }
+ }
+}
+
+// handleService fills in the reply for a name below "svc.<clusterDomain>".
+// subdomain is the part of the canonical query name in front of the "svc"
+// label (empty for "svc.<clusterDomain>" itself). The name shapes handled
+// are:
+//
+//	<ns>.svc.
+//	<service>.<ns>.svc.
+//	<hostname>.<service>.<ns>.svc.
+//	_<proto>.<service>.<ns>.svc.
+//	_<port>._<proto>.<service>.<ns>.svc.
+//
+// The function adds records to r.Reply or sets r.Reply.Rcode; it never calls
+// r.SendReply itself. Returning without touching the reply means the name
+// exists but has no records of the requested type.
+func (k *Kubernetes) handleService(r *netDNS.Request, subdomain string) {
+ if subdomain == "" {
+ // Name exists but has no records.
+ return
+ }
+
+ rest, namespace := netDNS.SplitLastLabel(subdomain)
+ if rest == "" {
+ // Name exists if the namespace exists, and has no records.
+ if !k.apiConn.NamespaceExists(namespace) {
+ k.notFound(r)
+ }
+ return
+ }
+
+ // Drop the service label; serviceSub is whatever precedes
+ // "<service>.<ns>." and is empty for a plain service name.
+ serviceSub, _ := netDNS.SplitLastLabel(rest)
+ rest, hostnameOrProto := netDNS.SplitLastLabel(serviceSub)
+
+ var proto string
+ var portName string
+ var hostname string
+ switch hostnameOrProto {
+ case "_tcp", "_udp", "_sctp":
+ proto = hostnameOrProto[1:]
+ rest, portName = netDNS.SplitLastLabel(rest)
+ // A port label must be an underscore followed by at least one
+ // character; any other non-empty label makes the name invalid.
+ if len(portName) >= 2 && portName[0] == '_' {
+ portName = portName[1:]
+ } else if portName != "" {
+ r.Reply.Rcode = dns.RcodeNameError
+ return
+ }
+ // If portName is empty, the name exists if the parent exists,
+ // but has no records.
+ default:
+ hostname = hostnameOrProto
+ }
+
+ if rest != "" {
+ // The query name has too many labels.
+ r.Reply.Rcode = dns.RcodeNameError
+ return
+ }
+
+ // serviceKey is "<service>.<ns>"
+ // It is cut out of subdomain by skipping the serviceSub prefix and
+ // dropping the final character (presumably the trailing dot).
+ serviceKey := subdomain[len(serviceSub) : len(subdomain)-1]
+ service := k.apiConn.GetSvc(serviceKey)
+ if service == nil {
+ k.notFound(r)
+ return
+ }
+
+ // External service
+ if service.ExternalName != "" {
+ if serviceSub != "" {
+ // External services don't have subdomains.
+ r.Reply.Rcode = dns.RcodeNameError
+ return
+ }
+ if service.ExternalName == object.ExternalNameInvalid {
+ // The service has an invalid ExternalName, return an error.
+ r.AddExtendedError(dns.ExtendedErrorCodeInvalidData, "Kubernetes service has invalid externalName")
+ r.Reply.Rcode = dns.RcodeServerFailure
+ return
+ }
+ // We already ensure that ExternalName is valid and fully qualified
+ // when constructing the object.Service.
+ r.AddCNAME(service.ExternalName, defaultTTL)
+ return
+ }
+
+ // Headless service.
+ if service.Headless {
+ // found records whether any endpoint address matched the query name.
+ // haveIP and haveSRV suppress duplicate records. existingAnswer and
+ // existingExtra mark where the records added here begin, so that only
+ // those are shuffled afterwards.
+ found := false
+ haveIP := make(map[string]struct{})
+ haveSRV := make(map[srvItem]struct{})
+ existingAnswer := len(r.Reply.Answer)
+ existingExtra := len(r.Reply.Extra)
+ for _, ep := range k.apiConn.EpIndex(serviceKey) {
+ if portName != "" {
+ // _<port>._<proto>.<service>.<ns>.svc.
+ var portNumber uint16
+ for _, p := range ep.Ports {
+ if p.Name == portName && p.Protocol == proto {
+ portNumber = p.Port
+ break
+ }
+ }
+ // portNumber is still 0 if no port matched; a matching port
+ // numbered 0 is treated the same way.
+ if portNumber == 0 {
+ continue
+ }
+ for _, addr := range ep.Addresses {
+ found = true
+ if r.Qtype == dns.TypeSRV || r.Qtype == dns.TypeANY {
+ // The target is "<hostname>.<service>.<ns>.svc.<zone>":
+ // slicing at len(serviceSub)-1 keeps the dot in front of
+ // the service label. Assumes r.Qname has the same length
+ // as the canonical name - TODO confirm.
+ targetName := addr.Hostname + r.Qname[len(serviceSub)-1:]
+ // One SRV record per (hostname, port) pair.
+ if !isDuplicateSRV(haveSRV, addr.Hostname, "", portNumber) {
+ addSRV(r, portNumber, targetName)
+ }
+ // One additional address record per (hostname, IP) pair.
+ if !isDuplicateSRV(haveSRV, addr.Hostname, addr.IP, 0) {
+ addAddrExtra(r, targetName, net.IP(addr.IP))
+ }
+ }
+ }
+ } else {
+ // <service>.<ns>.svc. or <hostname>.<service>.<ns>.svc.
+ for _, addr := range ep.Addresses {
+ if hostname != "" && hostname != addr.Hostname {
+ continue
+ }
+ found = true
+ if proto != "" {
+ // _<proto>.<service>.<ns>.svc. has no records
+ // and exists if its parent exists.
+ break
+ }
+ if _, ok := haveIP[addr.IP]; !ok {
+ haveIP[addr.IP] = struct{}{}
+ addAddr(r, net.IP(addr.IP))
+ }
+ }
+ }
+ }
+ shuffleRRs(r.Reply.Answer[existingAnswer:])
+ shuffleRRs(r.Reply.Extra[existingExtra:])
+ if !found {
+ k.notFound(r)
+ }
+ return
+ }
+
+ if hostname != "" {
+ // Non-headless services don't have hostname records.
+ r.Reply.Rcode = dns.RcodeNameError
+ return
+ }
+
+ // ClusterIP service
+ if proto == "" {
+ // <service>.<ns>.svc. for ClusterIP service.
+ for _, ip := range service.ClusterIPs {
+ addAddr(r, net.IP(ip))
+ }
+ // The specification does not define what to return if the service has
+ // no (valid) clusterIP. We return an empty response with no error.
+ return
+ }
+
+ if portName == "" {
+ // _<proto>.<service>.<ns>.svc. exists but has no records.
+ return
+ }
+
+ // _<port>._<proto>.<service>.<ns>.svc. for ClusterIP service.
+ var portNumber uint16
+ for _, p := range service.Ports {
+ if p.Name == portName && p.Protocol == proto {
+ portNumber = p.Port
+ break
+ }
+ }
+ if portNumber == 0 {
+ r.Reply.Rcode = dns.RcodeNameError
+ return
+ }
+ if r.Qtype == dns.TypeSRV || r.Qtype == dns.TypeANY {
+ // The SRV target is the service name itself: the query name with the
+ // "_<port>._<proto>." prefix removed.
+ targetName := r.Qname[len(serviceSub):]
+ addSRV(r, portNumber, targetName)
+ for _, ip := range service.ClusterIPs {
+ addAddrExtra(r, targetName, net.IP(ip))
+ }
+ }
+}
+
+// handleReverse fills in the reply for a reverse lookup of ip. It appends one
+// PTR record per service returned by the reverse service index, and one per
+// distinct "<hostname>.<index>.svc.<clusterDomain>" name among the endpoint
+// addresses equal to ip. Records are only added for PTR and ANY queries. If
+// neither index yields a match, the result is decided by notFound.
+func (k *Kubernetes) handleReverse(r *netDNS.Request, ip netip.Addr) {
+	// The indexes are keyed by the raw address bytes held in a string.
+	key := string(ip.AsSlice())
+	wantPTR := r.Qtype == dns.TypePTR || r.Qtype == dns.TypeANY
+
+	// appendPTR adds a PTR record for the query name pointing at target.
+	appendPTR := func(target string) {
+		r.Reply.Answer = append(r.Reply.Answer, &dns.PTR{
+			Hdr: dns.RR_Header{Name: r.Qname, Rrtype: dns.TypePTR, Class: dns.ClassINET, Ttl: defaultTTL},
+			Ptr: target,
+		})
+	}
+
+	matched := false
+	for _, svc := range k.apiConn.SvcIndexReverse(key) {
+		matched = true
+		if wantPTR {
+			appendPTR(svc.Name + "." + svc.Namespace + ".svc." + k.clusterDomain)
+		}
+	}
+
+	// seen prevents emitting the same endpoint name more than once.
+	seen := make(map[string]struct{})
+	for _, ep := range k.apiConn.EpIndexReverse(key) {
+		for _, addr := range ep.Addresses {
+			if addr.IP != key {
+				continue
+			}
+			// The name exists even if the qtype asks for something else.
+			matched = true
+			if !wantPTR {
+				continue
+			}
+			target := addr.Hostname + "." + ep.Index + ".svc." + k.clusterDomain
+			if _, dup := seen[target]; dup {
+				continue
+			}
+			seen[target] = struct{}{}
+			appendPTR(target)
+		}
+	}
+
+	if !matched {
+		k.notFound(r)
+	}
+}
+
+// makeSOA builds the SOA record for zone. The primary name server is
+// k.nsDomain and the serial is k.apiConn.Modified() converted to uint32.
+func (k *Kubernetes) makeSOA(zone string) *dns.SOA {
+	soa := new(dns.SOA)
+	soa.Hdr = dns.RR_Header{Name: zone, Rrtype: dns.TypeSOA, Class: dns.ClassINET, Ttl: defaultTTL}
+	soa.Ns = k.nsDomain
+	soa.Mbox = "nobody.invalid."
+	soa.Serial = uint32(k.apiConn.Modified())
+	soa.Refresh = 7200
+	soa.Retry = 1800
+	soa.Expire = 86400
+	soa.Minttl = defaultTTL
+	return soa
+}
+
+// makeNS builds the NS record for zone, pointing at k.nsDomain.
+func (k *Kubernetes) makeNS(zone string) *dns.NS {
+	return &dns.NS{
+		Hdr: dns.RR_Header{Name: zone, Rrtype: dns.TypeNS, Class: dns.ClassINET, Ttl: defaultTTL},
+		Ns:  k.nsDomain,
+	}
+}
+
+// addAddr appends an address record for the query name to the answer
+// section: an A record for a 4-byte ip when the query asks for A or ANY, an
+// AAAA record for a 16-byte ip when it asks for AAAA or ANY. An ip of any
+// other length, or of a family the query did not ask for, adds nothing.
+func addAddr(r *netDNS.Request, ip net.IP) {
+	wantAny := r.Qtype == dns.TypeANY
+	switch len(ip) {
+	case net.IPv4len:
+		if wantAny || r.Qtype == dns.TypeA {
+			r.Reply.Answer = append(r.Reply.Answer, &dns.A{
+				Hdr: dns.RR_Header{Name: r.Qname, Rrtype: dns.TypeA, Class: dns.ClassINET, Ttl: defaultTTL},
+				A:   ip,
+			})
+		}
+	case net.IPv6len:
+		if wantAny || r.Qtype == dns.TypeAAAA {
+			r.Reply.Answer = append(r.Reply.Answer, &dns.AAAA{
+				Hdr:  dns.RR_Header{Name: r.Qname, Rrtype: dns.TypeAAAA, Class: dns.ClassINET, Ttl: defaultTTL},
+				AAAA: ip,
+			})
+		}
+	}
+}
+
+// addAddrExtra appends an address record owned by name to the additional
+// section: an A record for a 4-byte ip, an AAAA record for a 16-byte ip.
+// Unlike addAddr it does not look at the query type. An ip of any other
+// length adds nothing.
+func addAddrExtra(r *netDNS.Request, name string, ip net.IP) {
+	switch len(ip) {
+	case net.IPv4len:
+		r.Reply.Extra = append(r.Reply.Extra, &dns.A{
+			Hdr: dns.RR_Header{Name: name, Rrtype: dns.TypeA, Class: dns.ClassINET, Ttl: defaultTTL},
+			A:   ip,
+		})
+	case net.IPv6len:
+		r.Reply.Extra = append(r.Reply.Extra, &dns.AAAA{
+			Hdr:  dns.RR_Header{Name: name, Rrtype: dns.TypeAAAA, Class: dns.ClassINET, Ttl: defaultTTL},
+			AAAA: ip,
+		})
+	}
+}
+
+// addSRV appends an SRV record for the query name to the answer section,
+// with the given port and target and with priority and weight both zero.
+// The caller is responsible for checking the query type.
+func addSRV(r *netDNS.Request, portNumber uint16, targetName string) {
+	srv := &dns.SRV{
+		Hdr:      dns.RR_Header{Name: r.Qname, Rrtype: dns.TypeSRV, Class: dns.ClassINET, Ttl: defaultTTL},
+		Priority: 0,
+		Weight:   0,
+		Port:     portNumber,
+		Target:   targetName,
+	}
+	r.Reply.Answer = append(r.Reply.Answer, srv)
+}
+
+// notFound sets the rcode for a name that was not found locally but could
+// exist if there are Kubernetes objects which are not yet available locally.
+// Once the API connection reports it has synced, the name is known not to
+// exist and the rcode is NXDOMAIN. Before that, the rcode is SERVFAIL with a
+// "not ready" extended error.
+func (k *Kubernetes) notFound(r *netDNS.Request) {
+	if k.apiConn.HasSynced() {
+		r.Reply.Rcode = dns.RcodeNameError
+		return
+	}
+	// We don't know if the name exists or not, so return an error.
+	r.AddExtendedError(dns.ExtendedErrorCodeNotReady, "Kubernetes objects not yet synced")
+	r.Reply.Rcode = dns.RcodeServerFailure
+}
+
+// srvItem is the key type of the set that isDuplicateSRV uses to suppress
+// duplicate records. Field order matters: it is also constructed with a
+// positional literal.
+type srvItem struct {
+	name, addr string
+	port       uint16
+}
+
+// isDuplicateSRV reports whether the (name, addr, port) combination is
+// already present in m. If it is not, the combination is inserted, so a
+// second call with the same arguments returns true.
+func isDuplicateSRV(m map[srvItem]struct{}, name, addr string, port uint16) bool {
+	key := srvItem{name: name, addr: addr, port: port}
+	if _, seen := m[key]; seen {
+		return true
+	}
+	m[key] = struct{}{}
+	return false
+}
+
+// shuffleRRs randomly permutes rrs in place, for some load balancing.
+func shuffleRRs(rrs []dns.RR) {
+	swap := func(a, b int) { rrs[a], rrs[b] = rrs[b], rrs[a] }
+	rand.Shuffle(len(rrs), swap)
+}