Linux – BIND SERVFAIL after upgrade to Debian Jessie

binddebiandns-zonedomain-name-systemlinux

I've done absolutely nothing to my BIND configuration, but looks like Debian Jessie upgrade has broke it. Maybe some new options got introduced to it, or the old things now work differently, but I cannot find what's going wrong.

I got SERVFAIL in my /var/log/bind/bind.log all the time.

I've checked my zones with named-checkzone and they are all 'OK'. I've disabled IPv6 system-wide. I recreated rndc key and even created /etc/rndc.conf. Nothing works.

Here are some configs:

/etc/bind/named.conf

include "/etc/bind/named.conf.options";
include "/etc/bind/named.conf.log";
include "/etc/bind/named.conf.local";
//include "/etc/bind/named.conf.default-zones";

acl localhost_acl {
        127.0.0.0/8;
};

acl internal_10_acl {
        192.168.10.0/24;
};

acl internal_150_acl {
        192.168.150.0/24;
};

acl vpn_acl {
        192.168.200.2;
        192.168.200.5;
};

key "rndc-key" {
algorithm hmac-md5;
secret "somesecretkey==";
};

controls {
inet 127.0.0.1 port 953
       allow { 127.0.0.1; } keys { "rndc-key"; };
};

/etc/bind/named.conf.options

options {
        directory "/var/cache/bind";
        dnssec-validation auto;
        auth-nxdomain no;    # conform to RFC1035
        listen-on-v6 { none; };
        listen-on {
                127.0.0.1;
                192.168.10.1;
                192.168.150.1;
                192.168.200.1;
        };
        allow-transfer { none; };
        max-recursion-queries 200;
};

/etc/bind/named.conf.log

logging {

    channel update_debug {

            file "/var/log/bind/update_debug.log" versions 3 size 100k;
            severity debug;
            print-severity  yes;
            print-time      yes;

    };

    channel security_info {

            file "/var/log/bind/security_info.log" versions 1 size 100k;
            severity debug;
            print-severity  yes;
            print-time      yes;

    };

    channel bind_log {

            file "/var/log/bind/bind.log" versions 3 size 1m;
            severity debug;
            print-category  yes;
            print-severity  yes;
            print-time      yes;

    };

    category default { bind_log; };
    category lame-servers { security_info; };
    category update { update_debug; };
    category update-security { update_debug; };
    category security { security_info; };

};

/etc/bind/named.conf.local (this is a long one):

// 1
view "internal_10_view" {

        allow-query-on { 127.0.0.1; 192.168.10.1; };
        allow-query { localhost_acl; internal_10_acl; };
        match-clients { localhost_acl; internal_10_acl; };

        zone "myhost.tld" {
                type master;
                file "/etc/bind/db.myhost.tld_10";
        };

        zone "168.192.in-addr.arpa" {
                type master;
                notify no;
                file "/etc/bind/db.192.168.10";
        };

        // formerly named.conf.default-zones

        zone "." {
                type hint;
                file "/etc/bind/db.root";
        };

        zone "localhost" {
                type master;
                file "/etc/bind/db.local";
        };

        zone "127.in-addr.arpa" {
                type master;
                file "/etc/bind/db.127";
        };

        zone "0.in-addr.arpa" {
                type master;
                file "/etc/bind/db.0";
        };

        zone "255.in-addr.arpa" {
                type master;
                file "/etc/bind/db.255";
        };

        // formerly zones.rfc1918

        zone "10.in-addr.arpa"      { type master; file "/etc/bind/db.empty"; };
        zone "16.172.in-addr.arpa"  { type master; file "/etc/bind/db.empty"; };
        zone "17.172.in-addr.arpa"  { type master; file "/etc/bind/db.empty"; };
        zone "18.172.in-addr.arpa"  { type master; file "/etc/bind/db.empty"; };
        zone "19.172.in-addr.arpa"  { type master; file "/etc/bind/db.empty"; };
        zone "20.172.in-addr.arpa"  { type master; file "/etc/bind/db.empty"; };
        zone "21.172.in-addr.arpa"  { type master; file "/etc/bind/db.empty"; };
        zone "22.172.in-addr.arpa"  { type master; file "/etc/bind/db.empty"; };
        zone "23.172.in-addr.arpa"  { type master; file "/etc/bind/db.empty"; };
        zone "24.172.in-addr.arpa"  { type master; file "/etc/bind/db.empty"; };
        zone "25.172.in-addr.arpa"  { type master; file "/etc/bind/db.empty"; };
        zone "26.172.in-addr.arpa"  { type master; file "/etc/bind/db.empty"; };
        zone "27.172.in-addr.arpa"  { type master; file "/etc/bind/db.empty"; };
        zone "28.172.in-addr.arpa"  { type master; file "/etc/bind/db.empty"; };
        zone "29.172.in-addr.arpa"  { type master; file "/etc/bind/db.empty"; };
        zone "30.172.in-addr.arpa"  { type master; file "/etc/bind/db.empty"; };
        zone "31.172.in-addr.arpa"  { type master; file "/etc/bind/db.empty"; };

};

// 2
view "internal_150_view" {

        allow-query-on { 192.168.150.1; };
        allow-query { internal_150_acl; };
        match-clients { internal_150_acl; };

        zone "myhost.tld" {
                type master;
                file "/etc/bind/db.myhost.tld_150";
        };

        zone "168.192.in-addr.arpa" {
                type master;
                notify no;
                file "/etc/bind/db.192.168.150";
        };

        // formerly named.conf.default-zones

        zone "." {
                type hint;
                file "/etc/bind/db.root";
        };

        zone "localhost" {
                type master;
                file "/etc/bind/db.local";
        };

        zone "127.in-addr.arpa" {
                type master;
                file "/etc/bind/db.127";
        };

        zone "0.in-addr.arpa" {
                type master;
                file "/etc/bind/db.0";
        };

        zone "255.in-addr.arpa" {
                type master;
                file "/etc/bind/db.255";
        };

        // formerly zones.rfc1918

        zone "10.in-addr.arpa"      { type master; file "/etc/bind/db.empty"; };
        zone "16.172.in-addr.arpa"  { type master; file "/etc/bind/db.empty"; };
        zone "17.172.in-addr.arpa"  { type master; file "/etc/bind/db.empty"; };
        zone "18.172.in-addr.arpa"  { type master; file "/etc/bind/db.empty"; };
        zone "19.172.in-addr.arpa"  { type master; file "/etc/bind/db.empty"; };
        zone "20.172.in-addr.arpa"  { type master; file "/etc/bind/db.empty"; };
        zone "21.172.in-addr.arpa"  { type master; file "/etc/bind/db.empty"; };
        zone "22.172.in-addr.arpa"  { type master; file "/etc/bind/db.empty"; };
        zone "23.172.in-addr.arpa"  { type master; file "/etc/bind/db.empty"; };
        zone "24.172.in-addr.arpa"  { type master; file "/etc/bind/db.empty"; };
        zone "25.172.in-addr.arpa"  { type master; file "/etc/bind/db.empty"; };
        zone "26.172.in-addr.arpa"  { type master; file "/etc/bind/db.empty"; };
        zone "27.172.in-addr.arpa"  { type master; file "/etc/bind/db.empty"; };
        zone "28.172.in-addr.arpa"  { type master; file "/etc/bind/db.empty"; };
        zone "29.172.in-addr.arpa"  { type master; file "/etc/bind/db.empty"; };
        zone "30.172.in-addr.arpa"  { type master; file "/etc/bind/db.empty"; };
        zone "31.172.in-addr.arpa"  { type master; file "/etc/bind/db.empty"; };

};

// 3
view "vpn_view" {

        allow-query-on { 192.168.200.1; };
        allow-query { vpn_acl; };
        match-clients { vpn_acl; };

        zone "myhost.tld" {
                type master;
                file "/etc/bind/db.myhost.tld_vpn";
        };

        // formerly named.conf.default-zones

        zone "." {
                type hint;
                file "/etc/bind/db.root";
        };

        zone "localhost" {
                type master;
                file "/etc/bind/db.local";
        };

        zone "127.in-addr.arpa" {
                type master;
                file "/etc/bind/db.127";
        };

        zone "0.in-addr.arpa" {
                type master;
                file "/etc/bind/db.0";
        };

        zone "255.in-addr.arpa" {
                type master;
                file "/etc/bind/db.255";
        };

        // formerly zones.rfc1918

        zone "10.in-addr.arpa"      { type master; file "/etc/bind/db.empty"; };
        zone "16.172.in-addr.arpa"  { type master; file "/etc/bind/db.empty"; };
        zone "17.172.in-addr.arpa"  { type master; file "/etc/bind/db.empty"; };
        zone "18.172.in-addr.arpa"  { type master; file "/etc/bind/db.empty"; };
        zone "19.172.in-addr.arpa"  { type master; file "/etc/bind/db.empty"; };
        zone "20.172.in-addr.arpa"  { type master; file "/etc/bind/db.empty"; };
        zone "21.172.in-addr.arpa"  { type master; file "/etc/bind/db.empty"; };
        zone "22.172.in-addr.arpa"  { type master; file "/etc/bind/db.empty"; };
        zone "23.172.in-addr.arpa"  { type master; file "/etc/bind/db.empty"; };
        zone "24.172.in-addr.arpa"  { type master; file "/etc/bind/db.empty"; };
        zone "25.172.in-addr.arpa"  { type master; file "/etc/bind/db.empty"; };
        zone "26.172.in-addr.arpa"  { type master; file "/etc/bind/db.empty"; };
        zone "27.172.in-addr.arpa"  { type master; file "/etc/bind/db.empty"; };
        zone "28.172.in-addr.arpa"  { type master; file "/etc/bind/db.empty"; };
        zone "29.172.in-addr.arpa"  { type master; file "/etc/bind/db.empty"; };
        zone "30.172.in-addr.arpa"  { type master; file "/etc/bind/db.empty"; };
        zone "32.172.in-addr.arpa"  { type master; file "/etc/bind/db.empty"; };

        // somedomain.tld
        zone "somedomain.tld" {
                type forward;
                forward first;
                forwarders { 192.168.34.110; 192.168.34.100; };
        };

};

/etc/rndc.conf

key "rndc-key" {
        algorithm hmac-md5;
        secret "somesecretkey==";
};

options {
        default-key "rndc-key";
        default-server 127.0.0.1;
        default-port 953;
};

me@jessie:~$ sudo netstat -lnptu | grep "named\W*$"

tcp        0      0 192.168.10.1:53         0.0.0.0:*               LISTEN      1871/named      
tcp        0      0 127.0.0.1:53            0.0.0.0:*               LISTEN      1871/named      
tcp        0      0 127.0.0.1:953           0.0.0.0:*               LISTEN      1871/named      
udp        0      0 192.168.200.1:53        0.0.0.0:*                           1871/named      
udp        0      0 192.168.10.1:53         0.0.0.0:*                           1871/named      
udp        0      0 127.0.0.1:53            0.0.0.0:*                           1871/named 

me@jessie:~$ ps aux | grep named

bind      5843  0.0  1.0 297780 84412 ?        Ssl  00:52   0:16 /usr/sbin/named -f -u bind -4

me@jessie:/etc/bind$ named -V

BIND 9.9.5-9-Debian (Extended Support Version) <id:f9b8a50e> built by make with '--prefix=/usr' '--mandir=/usr/share/man' '--infodir=/usr/share/info' '--sysconfdir=/etc/bind' '--localstatedir=/var' '--enable-threads' '--enable-largefile' '--with-libtool' '--enable-shared' '--enable-static' '--with-openssl=/usr' '--with-gssapi=/usr' '--with-gnu-ld' '--with-geoip=/usr' '--with-atf=no' '--enable-ipv6' '--enable-rrl' '--enable-filter-aaaa' 'CFLAGS=-fno-strict-aliasing -fno-delete-null-pointer-checks -DDIG_SIGCHASE -O2'                                                                                
compiled by GCC 4.9.2                                                                                                   
using OpenSSL version: OpenSSL 1.0.1k 8 Jan 2015                                                                        
using libxml2 version: 2.9.2    

me@jessie's_client:~$ dig @192.168.10.1 launchpad.net

; <<>> DiG 9.9.5-9-Debian <<>> @192.168.10.1 launchpad.net
; (1 server found)
;; global options: +cmd
;; Got answer:
;; ->>HEADER<<- opcode: QUERY, status: SERVFAIL, id: 19673
;; flags: qr rd ra; QUERY: 1, ANSWER: 0, AUTHORITY: 0, ADDITIONAL: 1

;; OPT PSEUDOSECTION:
; EDNS: version: 0, flags:; udp: 4096
;; QUESTION SECTION:
;launchpad.net.                 IN      A

;; Query time: 0 msec
;; SERVER: 192.168.10.1#53(192.168.10.1)
;; WHEN: Thu May 07 23:29:38 MSK 2015
;; MSG SIZE  rcvd: 42

And finally some logs at /var/log/bind/bind.log

07-May-2015 22:52:49.287 resolver: debug 1: createfetch: _xmpp-server._tcp.pandion.im SRV
07-May-2015 22:52:49.287 resolver: debug 1: createfetch: . NS
07-May-2015 22:52:49.954 resolver: debug 1: createfetch: _xmpp-server._tcp.pandion.im SRV
07-May-2015 22:52:50.353 resolver: debug 1: createfetch: launchpad.net A
07-May-2015 22:52:51.288 resolver: debug 1: createfetch: _xmpp-server._tcp.pandion.im SRV
07-May-2015 22:52:51.575 query-errors: debug 1: client 127.0.0.1#47208 (pandion.im): view internal_10_view: query failed (SERVFAIL) for pandion.im/IN/AAAA at query.c:7004
07-May-2015 22:52:53.138 query-errors: debug 1: client 127.0.0.1#55548 (_jabber._tcp.none.su): view internal_10_view: query failed (SERVFAIL) for _jabber._tcp.none.su/IN/SRV at query.c:7004
07-May-2015 22:52:53.955 resolver: debug 1: createfetch: _jabber._tcp.pandion.im SRV
07-May-2015 22:52:54.622 resolver: debug 1: createfetch: _jabber._tcp.pandion.im SRV
07-May-2015 22:52:55.353 query-errors: debug 1: client 192.168.10.2#37375 (launchpad.net): view internal_10_view: query failed (SERVFAIL) for launchpad.net/IN/A at query.c:7004
07-May-2015 22:52:55.354 resolver: debug 1: createfetch: launchpad.net A
07-May-2015 22:52:55.956 resolver: debug 1: createfetch: _jabber._tcp.pandion.im SRV

/var/log/bind/security_info.log

07-May-2015 00:45:26.055 warning: using built-in root key for view vpn_view
07-May-2015 12:31:37.603 warning: using built-in root key for view internal_10_view
07-May-2015 12:31:37.769 warning: using built-in root key for view internal_150_view
07-May-2015 12:31:37.773 warning: using built-in root key for view vpn_view
07-May-2015 12:31:44.859 warning: using built-in root key for view internal_10_view
07-May-2015 12:31:44.865 warning: using built-in root key for view internal_150_view
07-May-2015 12:31:44.871 warning: using built-in root key for view vpn_view
07-May-2015 12:31:46.005 warning: using built-in root key for view internal_10_view
07-May-2015 12:31:46.011 warning: using built-in root key for view internal_150_view
07-May-2015 12:31:46.016 warning: using built-in root key for view vpn_view
07-May-2015 12:31:47.108 warning: using built-in root key for view internal_10_view
07-May-2015 12:31:47.114 warning: using built-in root key for view internal_150_view
07-May-2015 12:31:47.121 warning: using built-in root key for view vpn_view
07-May-2015 12:31:48.946 warning: using built-in root key for view internal_10_view
07-May-2015 12:31:48.951 warning: using built-in root key for view internal_150_view
07-May-2015 12:31:48.957 warning: using built-in root key for view vpn_view
07-May-2015 14:07:39.729 warning: using built-in root key for view internal_10_view
07-May-2015 14:07:39.737 warning: using built-in root key for view internal_150_view
07-May-2015 14:07:39.743 warning: using built-in root key for view vpn_view
07-May-2015 14:12:05.871 warning: using built-in root key for view internal_10_view
07-May-2015 14:12:05.880 warning: using built-in root key for view internal_150_view
07-May-2015 14:12:05.890 warning: using built-in root key for view vpn_view
07-May-2015 14:27:07.630 warning: using built-in root key for view internal_10_view
07-May-2015 14:27:07.638 warning: using built-in root key for view internal_150_view
07-May-2015 14:27:07.644 warning: using built-in root key for view vpn_view

Any suggestions what might be wrong?

Best Answer

This one is a real pain to troubleshoot if you aren't familiar with the new max-recursion-queries option or why it was added.

CVE-2014-8500 was identified in late 2014 as impacting multiple nameserver products, including BIND. The exploit allows a malicious nameservers to craft a chain of referrals that will be followed infinitely, eventually leading to resource exhaustion. ISC's fix for this issue was to add an upper limit on how many levels of recursion the server is willing to perform on behalf of a single query. The ceiling is controlled by a new max-recursion-queries option that defaults to 75.

As it turns out 75 levels of recursion is not very friendly to an empty nameserver cache -- which you will always have after a full process restart. There are many domains that will fail to resolve with this default due to how many levels of referrals end up being traversed between a requested record and . (root). The pandion.im. domain happens to be one of those, and it probably has something to do with the glueless delegation from the TLD. Here's an excerpt from dig +trace +additional pandion.im:

im.                     172800  IN      NS      ns4.ja.net.
im.                     172800  IN      NS      hoppy.iom.com.
im.                     172800  IN      NS      barney.advsys.co.uk.
im.                     172800  IN      NS      pebbles.iom.com.
ns4.ja.net.             172800  IN      A       193.62.157.66
hoppy.iom.com.          172800  IN      A       217.23.163.140
barney.advsys.co.uk.    172800  IN      A       217.23.160.50
pebbles.iom.com.        172800  IN      A       80.168.83.242
ns4.ja.net.             172800  IN      AAAA    2001:630:0:47::42
;; Received 226 bytes from 199.7.83.42#53(199.7.83.42) in 29 ms

pandion.im.             259200  IN      NS      ed.ns.cloudflare.com.
pandion.im.             259200  IN      NS      jill.ns.cloudflare.com.
;; Received 81 bytes from 80.168.83.242#53(80.168.83.242) in 98 ms

The nameservers for im. are delegating pandion.im. to Cloudflare's nameservers without providing IP address glue. On an empty cache, this means that the server has to initiate a separate referral traversal to obtain the IP address of those nameservers, and all of those referrals count against the maximum number of recursions for the original query. At that point the query will only succeed if the server already knows the IP addresses of those nameservers from other queries:

# service named restart && sleep 1 && dig @localhost pandion.im | grep status
Checking named config:
Stopping named:                                        [  OK  ]
Starting named:                                        [  OK  ]
;; ->>HEADER<<- opcode: QUERY, status: SERVFAIL, id: 63173

Trying again, this time with attempts to look up those nameservers before pandion.im.:

# service named restart && sleep 1 && dig @localhost ed.ns.cloudflare.com jill.ns.cloudflare.com pandion.im | grep status
Checking named config:
Stopping named:                                        [  OK  ]
Starting named:                                        [  OK  ]
;; ->>HEADER<<- opcode: QUERY, status: SERVFAIL, id: 26428
;; ->>HEADER<<- opcode: QUERY, status: NOERROR, id: 30491
;; ->>HEADER<<- opcode: QUERY, status: NOERROR, id: 22162

Long story short, this problem is very non-intuitive to identify, especially since it will seem to eventually "go away" over time if the process is left running. One of our partners has recommended a value of 200 based on real world usage scenarios. Start with 200, and season to taste if it's too high for your liking.