<?xml version="1.0" encoding="US-ASCII"?>
<!DOCTYPE rfc SYSTEM "rfc2629.dtd">
<?rfc toc="yes"?>
<?rfc tocompact="yes"?>
<?rfc tocdepth="3"?>
<?rfc tocindent="yes"?>
<?rfc symrefs="yes"?>
<?rfc sortrefs="yes"?>
<?rfc comments="yes"?>
<?rfc inline="yes"?>
<?rfc compact="yes"?>
<?rfc subcompact="no"?>
<rfc category="std"
     docName="draft-chen-lsvr-flood-reduction-01"
     ipr="trust200902">
  <front>
    <title abbrev="Flood Reduction">BGP-SPF Flooding Reduction</title>

     <author initials="H" surname="Chen" fullname="Huaimo Chen">
      <organization>Futurewei</organization>
      <address>
        <postal>
          <street></street>
          <city>Boston, MA</city>
          <region></region>
          <code></code>
          <country>USA</country>
        </postal>
        <email>huaimo.chen@futurewei.com</email>
      </address>
    </author>
<!--
    <author initials="T." fullname="Tony Li" surname="Li">
      <organization>Arista Networks</organization>
      <address>
        <postal>
           <street>5453 Great America Parkway</street>
          <city>Santa Clara</city>
          <code>95054</code>
          <region>California</region>
          <country>USA</country>
        </postal>
        <email>tony.li@tony.li</email>
      </address>
    </author>

    <author fullname="Mike McBride" initials="M" surname="McBride">
      <organization>Futurewei</organization>
      <address>
        <email>michael.mcbride@futurewei.com</email>
      </address>
    </author>

    <author fullname="Ran Chen" initials="R" surname="Chen">
      <organization>ZTE Corporation</organization>
      <address>
        <email>chen.ran@zte.com.cn</email>
      </address>
    </author>

    <author fullname="Robert Raszuk" initials="R" surname="Raszuk">
      <organization>NTT Network Innovations</organization>
      <address>
        <email>robert@raszuk.net</email>
      </address>
    </author>
-->
    <author fullname="Gyan S. Mishra" initials="G" surname="Mishra">
      <organization>Verizon Inc.</organization>
      <address>
        <postal>
          <street>13101 Columbia Pike</street>
          <city>Silver Spring</city>
          <code>MD 20904</code>
          <country>USA</country>
        </postal>
        <phone> 301 502-1347</phone>
        <email>gyan.s.mishra@verizon.com</email>
      </address>
    </author>

     <author initials="A" fullname="Aijun Wang" 
            surname="Wang">
      <organization>China Telecom</organization>
      <address>
        <postal>
          <street>Beiqijia Town, Changping District</street>
          <city>Beijing</city>
          <region> </region>
          <code>102209</code>
          <country>China</country>
        </postal>
        <email>wangaj3@chinatelecom.cn</email>
      </address>
    </author>

     <author initials="Y" fullname="Yisong Liu" 
            surname="Liu">
      <organization>China Mobile</organization>
      <address>
        <email>liuyisong@chinamobile.com</email>
      </address>
    </author>
    <author fullname="Haibo Wang" initials="H. " surname="Wang">
      <organization>Huawei</organization>
      <address>
        <postal>
          <street>Huawei Bld., No.156 Beiqing Rd.</street>
          <city>Beijing</city>
          <code>100095</code>
          <country>China</country>
        </postal>
        <email>rainsword.wang@huawei.com</email>
      </address>
    </author>
   <author initials="Y" fullname="Yanhe Fan" 
            surname="Fan">
      <organization>Casa Systems</organization>
      <address>
        <postal>
          <street></street>
          <city></city>
          <region></region>
          <code></code>
          <country>USA</country>
        </postal>
        <email>yfan@casa-systems.com</email>
      </address>
    </author>

<!--
   <author initials="L" fullname="Lei Liu" 
            surname="Liu">
      <organization>Fujitsu</organization>
      <address>
        <postal>
          <street> </street>
          <city> </city>
          <region></region>
          <code></code>
          <country>USA</country>
        </postal>
        <email>liulei.kddi@gmail.com</email>
      </address>
    </author>

   <author initials="X" fullname="Xufeng Liu" 
            surname="Liu">
      <organization>Volta Networks</organization>
      <address>
        <postal>
          <street> </street>
          <city>McLean</city>
          <region>VA</region>
          <code></code>
          <country>USA</country>
        </postal>
        <email>xufeng.liu.ietf@gmail.com</email>
      </address>
    </author>


    <author initials="M" fullname="Mehmet Toy" 
            surname="Toy">
      <organization> Verizon </organization>
      <address>
        <postal>
          <street></street>
          <city></city>
          <region></region>
          <country>USA</country>
        </postal>
        <email>mehmet.toy@verizon.com</email>
      </address>
     </author>

     <author initials="Z" fullname="Zhenqiang Li" 
            surname="Li">
      <organization>China Mobile</organization>
      <address>
        <postal>
          <street>32 Xuanwumen West Ave, Xicheng District</street>
          <city>Beijing</city>
          <region> </region>
          <code>100053</code>
          <country>China</country>
        </postal>
        <email>lizhengqiang@chinamobile.com</email>
      </address>
    </author>
-->

    <date year="2021"/>

    <abstract>
     <t>This document describes extensions to
      Border Gateway Protocol (BGP) for flooding the link states
      on a topology that is a subgraph of the 
      complete topology of a BGP-SPF domain, so that 
      the amount of flooding traffic in the domain is greatly 
      reduced. 
      This would reduce convergence time with a more stable 
      and optimized routing environment.</t>
    </abstract>

    <note title="Requirements Language">
      <t>The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT",
      "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this
      document are to be interpreted as described in <xref
      target="RFC2119">RFC 2119</xref>.</t>
    </note>
  </front>

  <middle>
    <section title="Introduction">

<t> For some networks such as dense Data Center (DC) networks
with BGP-SPF,
the existing Link State (LS) flooding mechanism 
defined in <xref target="I-D.ietf-lsvr-bgp-spf"/>
for a BGP-SPF domain
may not be efficient and may have some issues. 
The extra LS flooding consumes network bandwidth. 
Processing the extra LS flooding, 
including receiving, buffering and decoding the extra LSs, 
wastes memory space and processor time.
This may cause scalability issues and 
affect the network convergence negatively.
</t>
	

     <t>This document describes extensions to
      Border Gateway Protocol (BGP) for flooding the link states
      on a topology that is a subgraph of the 
      complete topology of a BGP-SPF domain, so that 
      the amount of flooding traffic in the domain is greatly 
      reduced.</t> 

    </section> <!-- Introduction -->


    <section title="Terminologies">
    <t>The following terms are used in this document.
      <list style="hanging" hangIndent="6">
       <t hangText="BGP:">Border Gateway Protocol</t>
       <t hangText="LS:">Link State</t>
       <t hangText="SPF:">Shortest Path First</t>
       <t hangText="RR:">Route Reflector</t>
      </list>
     </t>
    </section> <!-- Terminologies -->


    <section title="Overview of BGP-SPF Link State Flooding">
      <t><xref target="I-D.ietf-lsvr-bgp-spf"/> defines 
         three BGP peering models:

      <list style="symbols">
         <t>BGP Peering in Route-Reflector or Controller Topology 
            (RR or Sparse model for short).</t>
         <t>BGP Single-Hop Peering on Network Node Connections
           (Node Connections model for short), and</t>
         <t>BGP Peering Between Directly-Connected Nodes
            (Directly-Connected Nodes model for short).</t>
      </list>

         This section briefly describes the BGP-SPF Link State Flooding
         in each of these models.</t>

      <section title="Flooding in RR Model">
        <t>In RR model, BGP-SPF speakers/nodes peer solely 
           with one or more Route Reflectors (RRs) or controllers
           over eBGP sessions. 
           A BGP-SPF speaker sends/advertises its BGP-LS-SPF Link NLRI 
           in a BGP update message to the RRs or controllers that 
           the speaker peers with when it discovers that its corresponding 
           link is up. 
           After receiving the Link NLRI, each of the RRs or controllers sends
           the NLRI in a BGP update message to the other BGP-SPF speakers 
           that peer with the RRs or controllers.</t>
 
        <t>For example, <xref target="rr-top-1"/>  
          shows a BGP-SPF domain, which contains two RRs RR1 and RR2, 
          and three network nodes A, B and C. RR1 peers with all three 
          nodes A, B and C in the network. 
          RR2 also peers with all three nodes A, B and C in the network. 
          There is a link between A and B, a link between A and C, and 
          a link between B and C. </t>
<t>           
           <figure anchor="rr-top-1" 
           title="BGP-SPF Domain with two RRs">
  <artwork align="center"> <![CDATA[
              +-------+      +-------+
              |  RR1  |------|  RR2  |
              +-------+      +-------+
             /    \   \  ____/  /    \   
            /      \___\/      /      \    
           /       /\   \___  /        \
          / ______/  \      \/          \
         / /-->       \     /\__________ \
        / /            ( B )            \ \
       / /         ___/     \___         \ \
      / /     ____/             \____     \ \
  ^  / / ____/                       \____ \ \
  | / / /                                 \ \ \  
 / / / /                                   \ \ \
  ( A )-------------------------------------( C )]]></artwork>
</figure>
</t>
        <t>Each of the nodes A, B and C in the network 
           sends/advertises its link NLRIs in BGP update messages 
           to both RR1 and RR2. After receiving a link NLRI in a 
           BGP update message from a node (e.g., node A), each of 
           RR1 and RR2 sends the NLRI in a BGP update message to 
           the other nodes (e.g., nodes B and C). 
           Each of the other nodes receives two copies of the 
           same NLRI, one from RR1 and the other from RR2. 
           One copy is enough; the other redundant copy should be 
           eliminated.</t>
      </section> <!-- Flooding in RR Model -->

      <section title="Flooding in Node Connections Model">
        <t>In Node Connections model, EBGP single-hop sessions are
           established over direct point-to-point links 
           interconnecting the nodes in the BGP-SPF routing domain.
           Once the session has been established and 
           the BGP-LS-SPF AFI/SAFI capability has been exchanged for 
           the corresponding session, then the link is considered up 
           from a BGP-SPF perspective and the corresponding BGP-LS-SPF
           Link NLRI is advertised to all the nodes in the domain 
           through all the BGP sessions over the links. 
           If the session goes down, the corresponding Link NLRI will
           be withdrawn. The withdrawal is done through advertising 
           a BGP update containing the NLRI in MP_UNREACH_NLRI to 
           all the nodes in the domain using all BGP sessions over 
           the links.</t>



        <t>For example, <xref target="parallel-link-top-1"/> shows 
           a BGP-SPF domain, which contains four nodes A, B, C and D. 
           These four nodes are connected by six links. 
           There are two parallel links between A and B, 
           a link between A and C, a link between A and D, 
           a link between B and C and a link between C and D.</t>
<t>
<figure anchor="parallel-link-top-1" 
        title="BGP-SPF Domain with parallel links">
  <artwork align="center"> <![CDATA[
       -->
     _____________________
 ( A )-------------------( B )
 | |\  -->                 | |
 v | \_____                | v
   |  -->  \_______        | 
   |               \_____  | 
   |                     \ | ^
   |                      \| |
 ( D )-------------------( C )
       -->           <--       ]]></artwork>
</figure> 
</t>

         <t>Suppose that the BGP sessions over all the links except
            for the session over the link between A and D have been 
            established and the BGP-LS-SPF AFI/SAFI capability has 
            been exchanged for the corresponding sessions. When the
            BGP session over the link between A and D is established
            and the BGP-LS-SPF AFI/SAFI capability is exchanged for
            the corresponding session, node A considers that the link
            from A to D is up and sends the BGP-LS-SPF Link NLRI for
            the link through its four BGP sessions (i.e., the session 
            between A and B over the first parallel link between A and
            B, the session between A and B over the second parallel 
            link between A and B, the session between A and C over the
            link between A and C, and the session between A and D over
            the link between A and D) to nodes B, C and D. After 
            receiving the NLRI from node A, each of the nodes B, C and
            D sends the NLRI to the other nodes that have BGP sessions
            with the node. Node B sends the NLRI to node C. Node C
            sends the NLRI to nodes B and D. Node D sends the NLRI to
            node C.</t>

         <t>Similarly, when the BGP session over the link between A and
            D is established and the BGP-LS-SPF AFI/SAFI capability is
            exchanged for the corresponding session, node D considers
            that the link from D to A is up and sends the BGP-LS-SPF
            Link NLRI for the link through its two BGP sessions (i.e., the
            session between D and C over the link between D and C, and
            the session between D and A over the link between D and A)
            to nodes C and A. After receiving the NLRI from node D,
            each of the nodes A and C sends the NLRI to the other nodes
            that have BGP sessions with the node. Node C sends the NLRI
            to nodes A and B. Node A sends the NLRI to nodes B and C
            through two parallel BGP sessions to B and the BGP session
            to C. </t>

      </section> <!-- Flooding in Node Connections Model  -->

      <section title="Flooding in Directly-Connected Nodes Model">
        <t>In Directly-Connected Nodes model, BGP-SPF speakers peer 
           with all directly-connected nodes but the sessions may
           be between loopback addresses. 
           Consequently, there will be a single BGP session even if
           there are multiple direct connections between BGP-SPF
           speakers. BGP-LS-SPF Link NLRI is advertised as long as
           a BGP session has been established and the BGP-LS-SPF
           AFI/SAFI capability has been exchanged. Since there are
           BGP sessions between all pairs of directly-connected nodes
           in the BGP-SPF routing domain, there is a reduction
           in BGP sessions, compared to the Node Connections model,
           only when there are parallel links between nodes.</t>
      </section> <!-- Flooding in Directly-Connected Nodes Model -->
    </section> <!-- Overview of BGP-SPF Link State Flooding -->


    <section title="Revised Flooding Procedures">
      <t>This section describes the revised flooding procedures
         to support flooding reduction 
         for different models, including RR Model and 
         Node Connections Model.
         These procedures are backward compatible.
         In a network with some nodes (including RRs) 
         not supporting flooding
         reduction, a link NLRI originated from any node 
         will be distributed to every node in the network.
      </t>

     <section title="Revised Flooding Procedure for RR Model">
      <t>In RR model, the revised flooding procedure is as follows:
        <list style = "symbols">
         <t>Every BGP-SPF speaker/node sends its BGP-LS-SPF Link NLRI
            to the same one or more of the RRs or controllers that the
            speaker peers with when it discovers that its
            corresponding link is up.
         </t>
         <t>After receiving the Link NLRI, the RR or controller 
            sends the NLRI to the other BGP-SPF speakers that peer
            with the RR or controller.</t>
        </list>
      </t>

      <t>For example, for the BGP-SPF domain in 
         <xref target="rr-top-1"/>, using the revised flooding procedure,
         speaker/Node A sends its Link NLRI for link A to B to 
         RR1 when A discovers that link A to B is up. 
         Node A does not send the NLRI to RR2. 
         After receiving the Link NLRI for link A to B from 
         speaker/node A, RR1 sends the NLRI to the other nodes B and C.

         Each of the other nodes receives only one copy of the same NLRI,
         which is from RR1. There is no redundant copy of the same NLRI.
          Compared to the normal flooding in the RR model as illustrated
          in <xref target="rr-top-1"/>, the revised flooding procedure
          reduces the amount of link state flooding by half. 
     </t>
     </section> <!-- Revised Flooding Procedure for RR Model -->

     <section title="Revised Flooding Procedure for Node Connections Model">
      <t>In Node Connections model, the revised flooding procedure is as
         follows:
        <list style = "symbols">
         <t>A BGP-SPF speaker/node has a flooding topology of the
            BGP-SPF domain. In an option, the flooding topology is
            computed in a distributed mode, where every BGP-SPF
            speaker computes a flooding topology for the domain
            using the same algorithm.
            In another option, the flooding topology is computed
            in a centralized mode, where one BGP-SPF speaker
            elected as a leader computes a flooding topology for
            the domain and advertises the flooding topology to
            every BGP-SPF speaker in the domain.
          </t>
         <t>A BGP-SPF speaker/node sends its link NLRI in a BGP update
            message for its link up or down to its peers that 
            are directly connected on the flooding topology, and
            sends its link NLRI in a BGP update message for its link 
            down to all its peers.
            
            When receiving the NLRI in a new BGP
            update message for a link up or down from a peer, the speaker
            sends the NLRI in a BGP update message to its other peers that
            are directly connected on the flooding topology.</t>

          <t>When a BGP-SPF session is down, the BGP-SPF speaker/node
             that was connected to the session will not withdraw the 
             link NLRIs received from the session right away. 
             It keeps the NLRIs for some time.</t>
         </list>
      </t>

      <t>Given a real network topology (RT), a flooding topology (FT) of
         the RT is a sub network topology of the RT and connects all the
         nodes in the RT.</t>

      <t>For example, <xref target="flooding-top-1"/> shows
         a flooding topology of the real topology in
         <xref target="parallel-link-top-1"/>. </t>
<t>
<figure anchor="flooding-top-1" 
        title="A Flooding Topology">
  <artwork align="center"> <![CDATA[            
 ( A )-------------------( B )
   |                       |
   |                       |
   |                       | 
   |                       | 
   |                       |
   |                       |
 ( D )-------------------( C )]]></artwork>
</figure> 
</t>
      <t>The flooding topology in <xref target="flooding-top-1"/>
         is a sub network topology of the RT in
         <xref target="parallel-link-top-1"/> and connects all the
         nodes (i.e., nodes A, B, C and D) in the RT in 
         <xref target="parallel-link-top-1"/>.</t>

      <t><xref target="reduced-flood-flow-1"/> shows a reduced flooding
         flow of a link NLRI in a BGP update message for a link up or
         down in the BGP-SPF domain, which is the same as the one in
         <xref target="parallel-link-top-1"/>.</t>
<t>
<figure anchor="reduced-flood-flow-1" 
        title="A Reduced Link State Flooding Flow">
  <artwork align="center"> <![CDATA[
       -->
     _____________________
 ( A )-------------------( B )
 | |\                      | |
 v | \_____                | v
   |       \_______        | 
   |               \_____  | 
   |                     \ | 
   |                      \| 
 ( D )-------------------( C )
       -->                    ]]></artwork>
</figure> 
</t>

      <t>Speaker/Node A sends the NLRI in a BGP update message 
         for its link to
         its peers B and D. Nodes B and D are peers of node A and 
         are directly connected to A on the
         flooding topology (FT). Node A does not send the NLRI to its peer
         C since C is not directly connected to A on the FT.</t>
      <t>After receiving the
         NLRI in the message from A, node B sends the NLRI in a BGP update
         message to B's other peer C 
         (which is directly connected to B on the FT).
         After receiving the NLRI in a BGP update message from A, node D
         sends the NLRI in a BGP update message to D's other peer C
         (which is directly connected to D on the FT). </t>
     <t>The number of NLRIs in messages flooded in
        <xref target="reduced-flood-flow-1"/> is much less than that in
        <xref target="parallel-link-top-1"/>.
        The performance of network is improved using the revised
        flooding procedure. </t>
     </section> <!-- Revised Flooding Procedure for Node Connections Model -->
    </section> <!-- Revised Flooding Procedures -->


    <section title="BGP Extensions for Flooding Reduction">
      <t>This section specifies BGP extensions for flooding
         reduction in two models: 
         RR model and Node Connections model.
         The extensions for Directly-Connected Node model 
         are included 
         in the extensions for Node Connections model.</t>

      <section title="Extensions for RR Model">
      <t>A single RR for a BGP-SPF domain is elected as a leader RR
         of the domain. The leader RR is the RR with the highest 
         priority to become a leader in the domain. If more than
         one RR has the same highest priority, the RR with the
         highest Node ID among them is the
         leader RR in the domain.
         In a deployment, only the RRs
         advertise their priorities for becoming a leader using the
         Leader Priority TLV defined below.
       </t>

       <t>Two new TLVs are defined for flooding reduction in RR model.
         <list style="symbols">
           <t>Leader Priority TLV: A node uses it to advertise its
              priority for becoming a leader. </t>
           <t>Node Flood TLV: A RR or controller uses it to 
              tell every node the flooding behavior the node
              needs to follow.</t>
         </list>
          The format of Leader Priority TLV is illustrated in
          <xref target = "leader-priority-tlv"/>. 
       </t>
<t>
<figure anchor="leader-priority-tlv" 
        title="Leader Priority TLV">
<artwork align="center"> <![CDATA[
  0                   1                   2                   3
  0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 |           Type = TBD1         |          Length = 4           |
 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 |                      Reserved                 |   Priority    |
 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+]]></artwork>
</figure>
</t>
      <t>
        <list style="hanging" hangIndent="6">
         <t hangText="Type:">It is to be assigned by IANA.</t>
         <t hangText="Length:">4.</t>
         <t hangText="Reserved:">MUST be set to zero in transmission
              and should be ignored on reception.</t>
         <t hangText="Priority:">An unsigned integer from 0 to 255 
              in one octet indicating the priority to become a leader.</t>
        </list>
       </t>

       <t>The format of Node Flood TLV is illustrated in
          <xref target = "node-flood-tlv"/>. 
       </t>
<t>
<figure anchor="node-flood-tlv" 
        title="Node Flood TLV">
<artwork align="center"> <![CDATA[
  0                   1                   2                   3
  0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 |           Type = TBD2         |          Length = 4           |
 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 |                      Reserved                 | Flood-behavior|
 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+]]></artwork>
</figure>
</t>
      <t>
        <list style="hanging" hangIndent="6">
         <t hangText="Type:">It is to be assigned by IANA.</t>
         <t hangText="Length:">4.</t>
         <t hangText="Reserved:">MUST be set to zero in transmission
              and should be ignored on reception.</t>
         <t hangText="Flood-behavior:">The following flooding behaviors
              are defined.</t>
        </list>
      </t>
<t>
<figure>
<artwork> <![CDATA[
        0 - Reserved.
        1 - send link states to the RR with the minimum ID 
        2 - send link states to the RR with the maximum ID
        3 - send link states to 2 RRs with smaller IDs 
        4 - send link states to 2 RRs with larger IDs 
    5-127 - Standardized flooding behaviors for RR Model
  128-254 - Private flooding behaviors for RR Model.
]]></artwork>
</figure>
</t>

       <t>In a deployment, 
          the flooding behavior for every node is configured on a RR
          or controller such as the leader RR and the RR advertises
          the behavior to the other RRs and every node in the network
          through a Node Flood TLV. 
       </t>

       <t>For example, if we want every node in the network to send 
          its link states to only one RR, we configure this behavior on
          a RR and the RR advertises the behavior to every node using 
          a Node Flood TLV with Flood-behavior set to 1, which tells 
          every node to send its link states to the RR with the minimum
          ID. If we want every node in the network to send its link states
          to two RRs for redundancy, we configure this behavior on a RR
          and the RR advertises the behavior to every node using a Node
          Flood TLV with Flood-behavior set to 3, which tells every node
          to send its link states to the two RRs with smaller IDs
          (i.e., the RR with the minimum ID and the RR with the second
          minimum ID). 
        </t>
      </section> <!-- Extensions for RR Model -->


      <section title="Extensions for Node Connections Model">
       <t>There are two modes for the flooding topology computation:
          centralized mode and distributed mode. In a centralized
          mode, one BGP-SPF node is elected as a leader. The leader
          computes a flooding topology for the BGP-SPF domain and
          advertises the flooding topology to every BGP-SPF node in
          the domain. In a distributed mode, every BGP-SPF node
          computes a flooding topology for the BGP-SPF domain using
          the same algorithm. There is no flooding topology
          distribution. 
       </t>

       <t> This section defines the new TLVs for the two modes, 
           describes the flooding topology distribution in centralized
           mode and an algorithm that can be used by every node
           to compute its flooding topology in distributed mode.</t>

      <section title="New TLVs">
       <t>Five new TLVs are defined for flooding reduction 
          in Node Connections model.
         <list style="symbols">
           <t>Node Algorithm TLV: A leader uses this TLV to 
              tell every node the algorithm to be used to compute
              a flooding topology.</t>
           <t>Algorithms Support TLV: A node uses this TLV 
              to indicate the algorithms that it supports for 
              distributed mode.</t>
           <t>Node IDs TLV: A leader uses this TLV  
              to indicate the mapping from nodes to their indices 
              for centralized mode.</t>
           <t>Paths TLV: A leader uses this TLV  
              to advertise a part of flooding topology 
              for centralized mode.</t>
           <t>Connection Used for Flooding TLV: A node uses this TLV  
              to indicate that a connection/link is a part of the 
              flooding topology and used for flooding.</t>
         </list>
       </t>


      <section title="Node Algorithm TLV">
       <t>The format of Node Algorithm TLV is illustrated in
          <xref target = "node-algorithm-tlv"/>. 
       </t>
<t>
<figure anchor="node-algorithm-tlv" 
        title="Node Algorithm TLV">
<artwork align="center"> <![CDATA[
  0                   1                   2                   3
  0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 |           Type = TBD3         |          Length = 4           |
 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 |                      Reserved                 |   Algorithm   |
 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+]]></artwork>
</figure>
</t>
      <t>
        <list style="hanging" hangIndent="6">
         <t hangText="Type:">It is to be assigned by IANA.</t>
         <t hangText="Length:">4.</t>
         <t hangText="Reserved:">MUST be set to zero in transmission
              and should be ignored on reception.</t>
         <t hangText="Algorithm:"> </t>
        </list>
      </t>
<t>
<figure>
<artwork> <![CDATA[
        0 - The leader computes a flooding topology using its own
            algorithm and advertises the flooding topology to every
            node.
    1-127 - Every node computes its flooding topology using this 
            standardized distributed algorithm.
  128-254 - Private distributed algorithms.
]]></artwork>
</figure>
</t>
       <t>A node such as the leader node can use this TLV to direct
          the flooding behavior of every node in the domain.
          When it advertises the TLV with the Algorithm field set
          to zero, it tells every node to use the flooding topology
          computed and distributed by the leader for flooding the
          link states.  When it advertises the TLV with a non-zero
          Algorithm field, it tells every node to compute its own
          flooding topology using the algorithm identified by the
          Algorithm field.
       </t>
      </section> <!-- Node Algorithm TLV -->


      <section title="Algorithms Support TLV">
       <t>The format of Algorithms Support TLV is illustrated in
          <xref target = "algorithms-support-tlv"/>. 
       </t>
<t>
<figure anchor="algorithms-support-tlv" 
        title="Algorithms Support TLV">
<artwork align="center"> <![CDATA[
  0                   1                   2                   3
  0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 |           Type = TBD4         |          Length (variable)    |
 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 |   Algorithm   |   Algorithm   |    . . .
 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+]]></artwork>
</figure>
</t>
      <t>
        <list style="hanging" hangIndent="6">
         <t hangText="Type:">It is to be assigned by IANA.</t>
         <t hangText="Length:">The number of Algorithms in the TLV.</t>
         <t hangText="Algorithm:">A numeric identifier in the range 
              0-255 indicating the algorithm that can be used to
              compute the flooding topology.</t>
        </list>
      </t>
      </section> <!-- Algorithms Support TLV -->


      <section title="Node IDs TLV">
       <t>The format of Node IDs TLV is illustrated in
          <xref target = "node-ids-tlv"/>. 
       </t>
<t>
<figure anchor="node-ids-tlv" 
        title="Node IDs TLV">
<artwork align="center"> <![CDATA[
  0                   1                   2                   3
  0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 |           Type = TBD5         |          Length (variable)    |
 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 |      Reserved               |L|         Starting Index        |
 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 |                            Node ID                            |
 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 ~                          . . . . . .                          ~
 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 |                            Node ID                            |
 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+]]></artwork>
</figure>
</t>
      <t>
        <list style="hanging" hangIndent="6">
         <t hangText="Type:">It is to be assigned by IANA.</t>
         <t hangText="Length:">4 * (number of Node IDs + 1).</t>
         <t hangText="Reserved:">MUST be set to zero in transmission
              and SHOULD be ignored on reception.</t>
         <t hangText="L:">This bit is set to one if the index of the last
              node ID in this TLV is equal to the last index in the
              full list of node IDs for the BGP-SPF domain.</t>
         <t hangText="Starting Index:">
            The index of the first node ID in this TLV is Starting Index;
            the index of the second node ID in this TLV is Starting Index + 1;
            the index of the third node ID in this TLV is Starting Index + 2;
            and so on. </t>
         <t hangText="Node ID:">The BGP identifier of a node in the
            BGP-SPF domain.</t>
        </list>
      </t>
      </section> <!-- Node IDs TLV -->

      <section title="Paths TLV">
       <t>The format of Paths TLV is illustrated in
          <xref target = "paths-tlv"/>. 
          A leader uses this TLV to advertise a part of the flooding
          topology for centralized mode. 
          A path may be described as a sequence of indices: 
          (Index 1, Index 2, Index 3, ...), denoting a connection
          between the node with index 1 and the node with index 2,
          a connection between the node with index 2 and the node
          with index 3, and so on.
          A single link/connection is a simple case of a path that only 
          connects two nodes. 
          A single link path may be encoded in a paths TLV of 8 bytes 
          with two indices.
       </t>
<t>
<figure anchor="paths-tlv" 
        title="Paths TLV">
<artwork align="center"> <![CDATA[
  0                   1                   2                   3
  0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 |           Type = TBD6         |          Length (variable)    |
 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 |             Index 1           |             Index 2           |
 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 ~                          . . . . . .                          ~
 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+]]></artwork>
</figure>
</t>
      <t>
        <list style="hanging" hangIndent="6">
         <t hangText="Type:">It is to be assigned by IANA.</t>
         <t hangText="Length:">2 * (number of indices in the path)
            when the TLV contains the indices for one path.</t>
         <t hangText="Index 1:">
            The index of the first node in the path.</t>
         <t hangText="Index 2:">
            The index of the second (next) node in the path.</t>
        </list>
      </t>

       <t>Multiple paths, say N paths, may be encoded in one paths TLV.
          Each of the multiple paths is represented as a sequence of
          indices of the nodes on the path, and two paths (i.e., two
          sequences of indices for the two paths) are separated by a
          special index value such as 0xFFFF. In this case, there are
          (N - 1) special indices as separators to separate the N paths,
          and the Length field has a value of
          2 * (number of indices in the N paths + N - 1).
       </t>

       <t>When there are N single link paths, using one paths TLV
          to represent them is more efficient than using N paths
          TLVs (i.e., one paths TLV per single link path). Using one
          TLV consumes 4 + 2 * (2*N + N - 1) = 6*N + 2 bytes. Using
          N TLVs occupies N * (4 + 4) = 8*N bytes. The space
          used by the former is about three quarters of
          the space used by the latter for a big N such as 30.
       </t>

      </section> <!-- Paths TLV -->

      <section title="Connection Used for Flooding TLV">
       <t>The format of Connection Used for Flooding TLV is illustrated in
          <xref target = "cuf-tlv"/>. 
       </t>
<t>
<figure anchor="cuf-tlv" 
        title="Connection Used for Flooding TLV">
<artwork align="center"> <![CDATA[
  0                   1                   2                   3
  0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 |           Type = TBD7         |          Length = 8           |
 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 |                        Local Node ID                          |
 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 |                        Remote Node ID                         |
 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+]]></artwork>
</figure>
</t>
      <t>
        <list style="hanging" hangIndent="6">
         <t hangText="Type:">It is to be assigned by IANA.</t>
         <t hangText="Length:">8.</t>
         <t hangText="Local Node ID:">
            The BGP ID of the local node of the session over the connection on
            the flooding topology which is used for flooding link states.</t>
         <t hangText="Remote Node ID:">
            The BGP ID of the remote node of the session over the connection on
            the flooding topology which is used for flooding link states.</t>
        </list>
      </t>

      </section> <!-- Connection Used for Flooding TLV -->
      </section> <!-- New TLVs -->


      <section anchor="FT-Distribution"
             title="Flooding Topology Distribution in Centralized Mode">
        <t>In centralized mode, the leader computes a flooding topology
           for the domain whenever there is a change in the real
           network topology of the domain and advertises the flooding
           topology to every node in the domain. 
        </t>

        <t>After the current leader has failed, a new leader is elected.
           The new leader computes a flooding topology for the domain
           and advertises the flooding topology to every node in the
           domain.</t>

<!--
        <t>In one option, the new leader advertises the first
           flooding topology it computed as a brand new flooding
           topology to every node in the domain. That is that the
           leader advertises the mappings between all the nodes and
           their indices and every connection/link on the flooding
           topology to every node in the domain. 
        </t>

        <t>In another option, the new leader advertises the first
           (new) flooding topology it computed as a updated flooding
           topology as compared to the current flooding topology it has
           received from the old leader to every node in the domain.
           That is that the leader tells every node in the domain to add
           the new connections/links to the current flooding topology and
           remove/withdraw old connections/links from the current flooding
           topology. These new connections/links are on the new flooding
           topology but not on the current flooding topology. Those old
           connections/links are on the current flooding topology but
           not on the new flooding topology. 
        </t>
-->
        <t>When a brand new flooding topology is computed for the
           domain, the leader advertises the whole flooding topology
           to every node in the domain. The leader first advertises
           the mappings between all the node IDs and their indices to
           every node in the domain using a number of node IDs TLVs.
           These node IDs TLVs contain the IDs of all the nodes in the
           domain, indicate the index corresponding to each of the
           node IDs, and are advertised under MP_REACH_NLRI in BGP
           update messages. The leader then advertises the
           connections/links on the flooding topology to every node in
           the domain using a number of paths TLVs. These paths TLVs
           contain all the connections/links on the flooding topology
           and are advertised under MP_REACH_NLRI in BGP update
           messages.
       </t>

        <t>After the leader has advertised a flooding topology,
           called the current flooding topology, to every node in the
           domain, and then computes a new flooding topology for the
           updated real network topology of the domain, the leader
           advertises only the changes in the new flooding topology
           compared to the current flooding topology to every node in
           the domain.  The leader first advertises the changes in the
           mappings between all the node IDs and their indices to
           every node in the domain using node IDs TLVs, and then
           advertises the changes in the flooding topology to every
           node in the domain using paths TLVs.
       </t>

        <t>For the new nodes added into the domain, the leader advertises
           the mappings between the IDs of the new nodes and their indices
           using a node IDs TLV under MP_REACH_NLRI in a BGP update
           message to add the mappings. For the dead nodes removed from the
           domain, the leader advertises the mappings between the IDs of the
           dead nodes and their indices using a node IDs TLV under
           MP_UNREACH_NLRI in a BGP update message to withdraw the mappings.
       </t>

        <t>For the new connections/links added into the current flooding
           topology, the leader advertises the new connections/links using
           a paths TLV under MP_REACH_NLRI in a BGP update message to
           add the new connections/links to the current flooding topology.
           For the old connections/links removed from the current flooding
           topology, the leader advertises the old connections/links using
           a paths TLV under MP_UNREACH_NLRI in a BGP update message to
           withdraw the old connections/links from the current flooding
           topology. 
       </t>

      </section> <!-- Flooding Topology Distribution in Centralized Mode -->

   <section title="An Algorithm for Distributed Mode">
      <t>This section specifies an algorithm that can be used
         by every node to compute its flooding topology.</t>

      <t>The algorithm for computing a flooding topology of a BGP-SPF
         domain (real topology) is described as follows.
         <list style = "symbols">
          <t>Select as R0 the node that has the smallest node ID
             among the nodes whose status does not indicate that
             the node does not support transit;</t>
          <t>Build a tree using R0 as root of the tree (details below);</t>
          <t>And then connect a leaf to the tree to have a flooding
            topology (details follow).</t>   
         </list>
      </t>

      <t>The algorithm starts with 
         <list style = "symbols">
           <t>a variable MaxD with an initial value 3, </t>
           <t>an initial flooding topology FT = {(R0, D=0, PHs={})} 
              with node R0 as root, where R0's Degree D = 0, 
              Previous Hops PHs = { };</t>
           <t>an initial candidate queue Cq = 
              {(R1,D=0, PHs={R0}), (R2,D=0, PHs={R0}), ..., (Rm,D=0, PHs={R0})},
              where each of nodes R1 to Rm is connected to R0, 
              its Degree D = 0 and Previous Hops PHs ={R0}, 
              R1 to Rm are in 
              increasing order by their IDs.</t>
         </list>
      </t>


<t>
<list style="numbers">

<t>
Find and remove the first element with node A from Cq 
that is not on FT and one PH's D in PHs &lt; MaxD, and add
the element with A into FT; Set A's D to one, 
increase A's PH's D by one.
If no element in Cq satisfies the conditions, the algorithm is 
restarted with ++MaxD, the initial FT and Cq.
</t>

<t>If all the nodes are on the FT, then goto step 4;</t>

<t>
Suppose that node Xi (i = 1, 2,..., n) is connected to node A and not on FT, 
and X1, X2,..., Xn are in increasing order
by their IDs (i.e., X1's ID &lt; X2's ID &lt; ... &lt; Xn's ID).
If they are not in this order, then sort them into this order.

If Xi is not in Cq, then add it into the end of Cq with D = 0 and PHs = {A}; 
otherwise (i.e., Xi is in Cq), add A into the end of Xi's PHs; 
Goto step 1.
</t>

<t>
For each node B on FT whose D is one 
(from minimum to maximum node ID), 
find a link L attached to B such that 
L's remote node R can transit traffic and has minimum D and ID
(if there is no node R which can transit traffic, 
then find a link L to node R whose D and ID are minimum), 
add link L between B and R into FT and increase B's D and R's D by one.
Return FT.
</t>
</list>

</t>
      </section> <!-- An Algorithm for Distributed Mode -->
     </section> <!-- Extensions for Node Connections Model -->
    </section> <!-- BGP Extensions for Flooding Reduction -->



    <section anchor="Security" title="Security Considerations">
     <t>TBD</t>
<!--
      <t>Protocol extensions defined in this document do not 
      affect the BGP security other than those as discussed 
      in the Security Considerations section of 
      <xref target="RFC5575"/>.</t>
-->
    </section>

    <section anchor="Acknowledgements" title="Acknowledgements">
     <t>The authors of this document would like to thank 
     Donald E. Eastlake, Acee Lindem and Keyur Patel
     for the comments.</t>
    </section>

    <section anchor="IANA" title="IANA Considerations">
     <t>TBD</t>
<!--
    <section anchor="existing-safi" 
      title="Existing Registry: 
      Subsequent Address Family Identifiers (SAFI) Parameters">
      <t>This document requests assigning a new SAFI in the registry 
         "Subsequent Address Family Identifiers (SAFI) Parameters" as 
         follows:
        <figure>
            <artwork align="center"><![CDATA[
   +=======================+=========================+=============+
   | Code Point            | Description             | Reference   |
   +=======================+=========================+=============+
   | TBD1(179 suggested)   |  BIER-TE Policy SAFI    |This document|
   +=======================+=========================+=============+]]></artwork>
          </figure>
</t>
    </section>

    <section anchor="existing-tunnel-type" 
      title="Existing Registry: 
      BGP Tunnel Encapsulation Attribute Tunnel Types">
      <t>This document requests assigning a new Tunnel-Type in the 
         registry "BGP Tunnel Encapsulation Attribute Tunnel Types"
         as follows:
        <figure>
            <artwork align="center"><![CDATA[
   +=======================+=========================+=============+
   | Code Point            | Description             | Reference   |
   +=======================+=========================+=============+
   |  TBD2(16 suggested)   |  BIER-TE Tunnel/Path    |This document|
   +=======================+=========================+=============+]]></artwork>
          </figure>
</t>
    </section>

    <section anchor="existing-tunnel-type-subtlvs" 
      title="Existing Registry: 
      BGP Tunnel Encapsulation Attribute sub-TLVs">
      <t>This document requests assigning a few of new sub-TLVs 
         in the registry "BGP Tunnel Encapsulation Attribute sub-TLVs"
         as follows:
        <figure>
            <artwork align="center"><![CDATA[
   +=======================+=========================+=============+
   | Code Point            | Description             | Reference   |
   +=======================+=========================+=============+
   |  TBD3(16 suggested)   |  Path BitPositions      |This document|
   +=======================+=========================+=============+
   |  TBD4(17 suggested)   |  Path Name              |This document|
   +=======================+=========================+=============+
   |  TBD5(18 suggested)   |  Service Label          |This document|
   +=======================+=========================+=============+
   |  TBD6(19 suggested)   |  32 Bits Service ID     |This document|
   +=======================+=========================+=============+
   |  TBD7(20 suggested)   |  128 Bits Service ID    |This document|
   +=======================+=========================+=============+
   |  TBD8(21 suggested)   |  IPv4 Multicast Traffic |This document|
   +=======================+=========================+=============+
   |  TBD9(22 suggested)   |  IPv6 Multicast Traffic |This document|
   +=======================+=========================+=============+
]]></artwork>
          </figure>
</t>
    </section>
-->
    </section>


  </middle>

  <back>
    <references title="Normative References">
      <?rfc include="reference.RFC.2119"?>
      <?rfc include="reference.RFC.4271"?>
      <?rfc include="reference.RFC.4760"?>
      <?rfc include="reference.RFC.7938"?>

      <?rfc include="reference.I-D.ietf-lsvr-bgp-spf"?>
    </references>

    <references title="Informative References">
      <?rfc include="reference.RFC.8670"?>
      <?rfc include="reference.I-D.ietf-lsr-dynamic-flooding"?>
      <?rfc include="reference.I-D.ietf-lsr-flooding-topo-min-degree"?>
    </references>

<!-- Appendix -->
 
  </back>

</rfc>
