Linux 路由学习笔记之十路由缓存项的创建

本文链接：https://blog.csdn.net/lickylin/article/details/41556363

本文深入探讨了路由缓存的创建过程及其在数据转发中的作用，包括输入与输出路由缓存的区别、创建步骤、与邻居子系统的关联，以及数据转发流程中路由子系统与邻居子系统的交互。详细解释了路由缓存如何优化数据包的查找速度，减少资源消耗，确保高效的数据传输。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

前面分析了路由以及二层、三层相关的文档，而对于路由缓存的创建，按路由类型的分类可以分为两类:输入路由与输出路由。

所谓输入路由，则为网口接收到数据以后，通过查找路由确定是将数据包转发，还是接收数据包；而输出路由，则是本地三层或者三层以上协议层要发送数据时，通过查找路由，确定使用哪一条路由发送出去。

而当找到路由以后，我们就会创建相应的路由缓存，这样的话，当有持续的数据转发或者发送或者接收时，直接查找相应的路由缓存即可，不必重新查找路由表了。

因此路由缓存也就是在ip_route_input、ip_route_output中会被创建。

路由缓存的创建大概可以分为以下几个步骤:

1.为路由缓存申请空间，并进行相应的初始化

2.将路由缓存与邻居项进行绑定

3.将新创建的路由缓存添加到相应的hash链表中。

以上三步主要涉及两个函数dst_alloc、rt_intern_hash

1.1 dst_alloc

看到这个函数的名称，可能会被迷惑，怎么写的是dst_alloc呢，关于rtable与dst_entry的关系，请看上一节分析文档。

功能:申请一个struct rtable类型的路由缓存

1. 判断已申请的路由缓存的个数是否已经大于路由缓存的阈值(ops->gc_thresh)，若大于，则调用垃圾回收函数，强制进行缓存释放。若强制释放失败，则程序返回失败。

2.调用kmem_cache_zalloc申请路由缓存

3.对rtable->u.dst中的成员进行初始化。

void * dst_alloc(struct dst_ops * ops)

{

struct dst_entry * dst;

if (ops->gc && atomic_read(&ops->entries) > ops->gc_thresh) {

if (ops->gc())

return NULL;

}

ipv4_dst_ops.kmem_cachep =

kmem_cache_create

此处通过函数kmem_cache_zalloc，申请内存大小为ops->kmem_cachep->size的内存，即申请ipv4_dst_ops.kmem_cachep->size大小的内存，而在ip_rt_init中，通过调用函数kmem_cache_create创建缓存时，即将ipv4_dst_ops.kmem_cachep->size的值设置为sizeof(struct rtable)。因此此处即申请了一个struct rtable大小的内存空间。

dst = kmem_cache_zalloc(ops->kmem_cachep, GFP_ATOMIC);

if (!dst)

return NULL;

atomic_set(&dst->__refcnt, 0);

dst->ops = ops;

dst->lastuse = jiffies;

dst->path = dst;

dst->input = dst_discard_in;

dst->output = dst_discard_out;

#if RT_CACHE_DEBUG >= 2

atomic_inc(&dst_total);

#endif

atomic_inc(&ops->entries);

return dst;

}

1.2 rt_intern_hash

路由缓存插入函数

1.查找该路由缓存是否已经在路由缓存的hash链表中，若已存在，则更新

使用时间，并将该缓存项放在hash链表的链首，程序返回

2.在遍历hash表的时候，对于使用计数为0的缓存项，计算其score值，score值最小的缓存项则有可能被释放(当chain_length > ip_rt_gc_elasticity时，即会释放该缓存项占用的内存)

3.调用arp_bind_neighbour进行路由缓存项与邻居项的绑定操作。

4.将该路由缓存项放在hash链的链首。

static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)

{

struct rtable *rth, **rthp;

unsigned long now;

struct rtable *cand, **candp;

u32 min_score;

int chain_length;

int attempts = !in_softirq();

restart:

chain_length = 0;

min_score = ~(u32)0;

cand = NULL;

candp = NULL;

now = jiffies;

/*根据值hash，找到rt_hash_table链表中相应的*/

rthp = &rt_hash_table[hash].chain;

/*调用相应hash表对应的自旋锁，执行上锁操作*/

spin_lock_bh(rt_hash_lock_addr(hash));

while ((rth = *rthp) != NULL) {

#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED

if (!(rth->u.dst.flags & DST_BALANCED) &&

compare_keys(&rth->fl, &rt->fl)) {

#else

if (compare_keys(&rth->fl, &rt->fl)) {

#endif

/* Put it first */

*rthp = rth->u.dst.rt_next;

* Since lookup is lockfree, the deletion

* must be visible to another weakly ordered CPU before

* the insertion at the start of the hash chain.

rcu_assign_pointer(rth->u.dst.rt_next,

rt_hash_table[hash].chain);

* Since lookup is lockfree, the update writes

* must be ordered for consistency on SMP.

rcu_assign_pointer(rt_hash_table[hash].chain, rth);

rth->u.dst.__use++;

dst_hold(&rth->u.dst);

rth->u.dst.lastuse = now;

spin_unlock_bh(rt_hash_lock_addr(hash));

rt_drop(rt);

*rp = rth;

return 0;

}

if (!atomic_read(&rth->u.dst.__refcnt)) {

u32 score = rt_score(rth);

if (score <= min_score) {

cand = rth;

candp = rthp;

min_score = score;

}

chain_length++;

rthp = &rth->u.dst.rt_next;

}

if (cand) {

/* ip_rt_gc_elasticity used to be average length of chain

* length, when exceeded gc becomes really aggressive.

* The second limit is less certain. At the moment it allows

* only 2 entries per bucket. We will see.

if (chain_length > ip_rt_gc_elasticity) {

*candp = cand->u.dst.rt_next;

rt_free(cand);

}

/* Try to bind route to arp only if it is output

route or unicast forwarding path.

if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {

int err = arp_bind_neighbour(&rt->u.dst);

if (err) {

spin_unlock_bh(rt_hash_lock_addr(hash));

/*当出错的原因不是"没有内存时，则释放该路由缓存占用的

内存，程序返回相应的错误*/

if (err != -ENOBUFS) {

rt_drop(rt);

return err;

}

/*当是由于内存不够，导致arp绑定失败时，则需要进行同步

垃圾回收，实现释放内存的愿望，接着则重新执行一次上述的操作*/

/* Neighbour tables are full and nothing

can be released. Try to shrink route cache,

it is most likely it holds some neighbour records.

if (attempts-- > 0) {

int saved_elasticity = ip_rt_gc_elasticity;

int saved_int = ip_rt_gc_min_interval;

ip_rt_gc_elasticity = 1;

ip_rt_gc_min_interval = 0;

rt_garbage_collect();

ip_rt_gc_min_interval = saved_int;

ip_rt_gc_elasticity = saved_elasticity;

goto restart;

}

if (net_ratelimit())

printk(KERN_WARNING "Neighbour table overflow.\n");

rt_drop(rt);

return -ENOBUFS;

}

rt->u.dst.rt_next = rt_hash_table[hash].chain;

#if RT_CACHE_DEBUG >= 2

if (rt->u.dst.rt_next) {

struct rtable *trt;

printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,

NIPQUAD(rt->rt_dst));

for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)

printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));

printk("\n");

}

#endif

rt_hash_table[hash].chain = rt;

spin_unlock_bh(rt_hash_lock_addr(hash));

*rp = rt;

return 0;

}

在上面的函数中，arp_bind_neighbour实现了路由缓存项与邻居项的关联，也实现了路由模块与邻居子系统的关联。

1.2.1 arp_bind_neighbour

功能:实现路由缓存与邻居项的绑定操作。

1.当该路由缓存项没有关联相应的邻居项时，

则根据下一跳ip地址，调用__neigh_lookup_errno查找相应的邻居项

(调用__neigh_lookup_errno后，当邻居项不存在时，则会调用函数neigh_create

创建邻居项；若存在相应的邻居项，则返回该邻居项)

2.将邻居项与路由缓存项进行关联。

/*
 * arp_bind_neighbour - attach a neighbour entry to a route cache entry.
 * @dst: dst_entry embedded in the route cache entry (struct rtable)
 *
 * Does nothing when @dst already has a neighbour.  Otherwise the
 * neighbour for the next hop is obtained with __neigh_lookup_errno()
 * and stored in dst->neighbour.
 *
 * Returns 0 on success, -EINVAL when @dst has no device, or the error
 * encoded in the pointer returned by __neigh_lookup_errno().
 */
int arp_bind_neighbour(struct dst_entry *dst)
{
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;
	__be32 nexthop;

	if (dev == NULL)
		return -EINVAL;

	/* Already bound: nothing left to do. */
	if (dst->neighbour != NULL)
		return 0;

	/*
	 * The lookup key is the gateway address of the route, except on
	 * loopback and point-to-point devices where key 0 is used.
	 */
	nexthop = ((struct rtable*)dst)->rt_gateway;
	if (dev->flags&(IFF_LOOPBACK|IFF_POINTOPOINT))
		nexthop = 0;

	/* ATM devices use the CLIP table when CLIP support is configured. */
	neigh = __neigh_lookup_errno(
#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
		dev->type == ARPHRD_ATM ? clip_tbl_hook :
#endif
		&arp_tbl, &nexthop, dev);
	if (IS_ERR(neigh))
		return PTR_ERR(neigh);

	dst->neighbour = neigh;
	return 0;
}

经过arp_bind_neighbour后，就真正实现了路由子系统与邻居子系统的关联。

等下我们会详细分析下，这个关联会起到什么作用。

前面我们对路由进行分类时，按输入路由与输出路由进行了分析，下面也按输入路由缓存与输出路由缓存进行分析。

2.输入/输出路由缓存项

2.1 输入路由缓存项

对于输入路由缓存项，其接口函数为__mkroute_input，该函数创建的路由缓存项是一个输入转发路由缓存项，即接口接收到一个数据包，通过查找路由确定需要转发时，则会调用该函数创建输入路由缓存项，其input函数设置为ip_forward，其output函数设置为ip_output。

功能:创建一个输入路由缓存项。

1.对路由进行合法性检查

2.调用dst_alloc创建路由缓存项

3.设置路由缓存的输入、输出函数指针

/*
 * __mkroute_input - create the route cache entry for a forwarded packet.
 * @skb:    received packet
 * @res:    result of the routing table lookup
 * @in_dev: inet device the packet arrived on
 * @daddr:  destination address
 * @saddr:  source address
 * @tos:    type of service
 * @result: out parameter, receives the new route cache entry on success
 *
 * The new entry forwards: input is ip_forward, output is ip_output.
 *
 * Returns 0 on success, -EINVAL when the output device has no inet
 * device, source validation fails or the route is not valid for a
 * non-IP packet, and -ENOBUFS when no entry could be allocated.
 */
static inline int __mkroute_input(struct sk_buff *skb,
				  struct fib_result* res,
				  struct in_device *in_dev,
				  __be32 daddr, __be32 saddr, u32 tos,
				  struct rtable **result)
{
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned flags = 0;
	__be32 spec_dst;
	u32 itag;

	/* get a working reference to the output device */
	out_dev = in_dev_get(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		if (net_ratelimit())
			printk(KERN_CRIT "Bug in ip_route_input" \
			       "_slow(). Please, report\n");
		return -EINVAL;
	}

	/*
	 * Route sanity check.  A route for saddr->daddr has already been
	 * found and this function only creates its cache entry, but before
	 * doing so the reverse route daddr->saddr is checked as well.  If
	 * it does not exist the saddr->daddr route is considered faulty
	 * and an error is returned.
	 */
	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, &spec_dst, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		err = -EINVAL;
		goto cleanup;
	}

	if (err)
		flags |= RTCF_DIRECTSRC;

	if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create route, if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 */
		if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	/* Allocate the route cache entry. */
	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	/* Initialise the route cache entry. */
	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags= DST_HOST;
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	if (res->fi->fib_nhs > 1)
		rth->u.dst.flags |= DST_BALANCED;
#endif
	if (in_dev->cnf.no_policy)
		rth->u.dst.flags |= DST_NOPOLICY;
	if (out_dev->cnf.no_xfrm)
		rth->u.dst.flags |= DST_NOXFRM;
	rth->fl.fl4_dst = daddr;
	rth->rt_dst = daddr;
	rth->fl.fl4_tos = tos;
	rth->fl.mark = skb->mark;
	rth->fl.fl4_src = saddr;
	rth->rt_src = saddr;
	rth->rt_gateway = daddr;
	rth->rt_iif =
		rth->fl.iif = in_dev->dev->ifindex;
	rth->u.dst.dev = (out_dev)->dev;
	dev_hold(rth->u.dst.dev);
	rth->idev = in_dev_get(rth->u.dst.dev);
	rth->fl.oif = 0;
	rth->rt_spec_dst= spec_dst;

	/* Install the input and output handlers of a forwarding route. */
	rth->u.dst.input = ip_forward;
	rth->u.dst.output = ip_output;

	/* Set up the next hop. */
	rt_set_nexthop(rth, res, itag);

	rth->rt_flags = flags;

	*result = rth;
	err = 0;
cleanup:
	/* release the working reference to the output device */
	in_dev_put(out_dev);
	return err;
}

2.2 输出路由缓存项

对于输出路由缓存项，其接口函数为__mkroute_output，该函数创建的路由缓存项是一个输出路由缓存项，即本地三层或者三层以上协议层要发送数据时，通过查找路由确定使用哪一条路由发送后，则会调用该函数创建输出路由缓存项，其input函数默认为初始值dst_discard_in，其output函数默认为ip_output。

功能:创建一个输出路由缓存项。

1.对路由项的类型进行判断，并执行相应的处理。

2.调用dst_alloc创建路由缓存项

3.设置路由缓存的输出函数指针

static inline int __mkroute_output(struct rtable **result,

struct fib_result* res,

const struct flowi *fl,

const struct flowi *oldflp,

struct net_device *dev_out,

unsigned flags)

{

struct rtable *rth;

struct in_device *in_dev;

u32 tos = RT_FL_TOS(oldflp);

int err = 0;

if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))

return -EINVAL;

if (fl->fl4_dst == htonl(0xFFFFFFFF))

res->type = RTN_BROADCAST;

else if (MULTICAST(fl->fl4_dst))

res->type = RTN_MULTICAST;

else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))

return -EINVAL;

if (dev_out->flags & IFF_LOOPBACK)

flags |= RTCF_LOCAL;

/* get work reference to inet device */

in_dev = in_dev_get(dev_out);

if (!in_dev)

return -EINVAL;

if (res->type == RTN_BROADCAST) {

flags |= RTCF_BROADCAST | RTCF_LOCAL;

if (res->fi) {

fib_info_put(res->fi);

res->fi = NULL;

}

} else if (res->type == RTN_MULTICAST) {

flags |= RTCF_MULTICAST|RTCF_LOCAL;

if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,

oldflp->proto))

flags &= ~RTCF_LOCAL;

/* If multicast route do not exist use

default one, but do not gateway in this case.

Yes, it is hack.

if (res->fi && res->prefixlen < 4) {

fib_info_put(res->fi);

res->fi = NULL;

}

rth = dst_alloc(&ipv4_dst_ops);

if (!rth) {

err = -ENOBUFS;

goto cleanup;

}

atomic_set(&rth->u.dst.__refcnt, 1);

rth->u.dst.flags= DST_HOST;

#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED

if (res->fi) {

rth->rt_multipath_alg = res->fi->fib_mp_alg;

if (res->fi->fib_nhs > 1)

rth->u.dst.flags |= DST_BALANCED;

}

#endif

if (in_dev->cnf.no_xfrm)

rth->u.dst.flags |= DST_NOXFRM;

if (in_dev->cnf.no_policy)

rth->u.dst.flags |= DST_NOPOLICY;

rth->fl.fl4_dst = oldflp->fl4_dst;

rth->fl.fl4_tos = tos;

rth->fl.fl4_src = oldflp->fl4_src;

rth->fl.oif = oldflp->oif;

rth->fl.mark = oldflp->mark;

rth->rt_dst = fl->fl4_dst;

rth->rt_src = fl->fl4_src;

rth->rt_iif = oldflp->oif ? : dev_out->ifindex;

/* get references to the devices that are to be hold by the routing

cache entry */

rth->u.dst.dev = dev_out;

dev_hold(dev_out);

rth->idev = in_dev_get(dev_out);

rth->rt_gateway = fl->fl4_dst;

rth->rt_spec_dst= fl->fl4_src;

/*对于输出路由缓存项，则只需设置output的指针即可，

其输入指针为初始值(dst_discard_in)*/

rth->u.dst.output=ip_output;

RT_CACHE_STAT_INC(out_slow_tot);

if (flags & RTCF_LOCAL) {

rth->u.dst.input = ip_local_deliver;

rth->rt_spec_dst = fl->fl4_dst;

}

if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {

rth->rt_spec_dst = fl->fl4_src;

if (flags & RTCF_LOCAL &&

!(dev_out->flags & IFF_LOOPBACK)) {

rth->u.dst.output = ip_mc_output;

RT_CACHE_STAT_INC(out_slow_mc);

}

#ifdef CONFIG_IP_MROUTE

if (res->type == RTN_MULTICAST) {

if (IN_DEV_MFORWARD(in_dev) &&

!LOCAL_MCAST(oldflp->fl4_dst)) {

rth->u.dst.input = ip_mr_input;

rth->u.dst.output = ip_mc_output;

}

#endif

}

rt_set_nexthop(rth, res, 0);

rth->rt_flags = flags;

*result = rth;

cleanup:

/* release work reference to inet device */

in_dev_put(in_dev);

return err;

}

以上就是输入路由缓存项与输出路由缓存项相关的创建函数。

3.路由子系统与邻居子系统的关联

路由子系统与邻居子系统是如何关联的呢，在上面1.2.1中讲到了arp_bind_neighbour函数，下面我们就仔细分析下三层数据收发与路由子系统、邻居子系统的关系。

3.1 数据转发

当本地网卡收到需要转发的数据时，其走向如下：

a.调用ip_rcv函数，对三层数据进行处理

b.进入netfilter的prerouting链，进行netfilter的处理（netfilter子系统）

c.netfilter模块准许通过后，则调用ip_rcv_finish继续处理

d.在ip_rcv_finish中，若数据还没有和路由缓存项关联，则调用函数ip_route_input进行路由缓存项的查找。当路由缓存项没有查找到时，则会调用ip_route_input_slow进行路由项的查找，若查找到路由项，则会调用ip_mkroute_input创建路由缓存项：先调用__mkroute_input申请路由缓存项并设置dst的input、output函数，再调用rt_intern_hash，在其中通过arp_bind_neighbour将路由缓存项与邻居项进行绑定，并将skb与路由缓存项进行绑定

e.通过调用dst_input，进入skb->dst->input函数，即2.1中的ip_forward函数。

f.在ip_forward函数中，进行合法性判断后，则会进入netfilter的forward链

g.netfilter通过后，则调用ip_forward_finish，通过dst_output，调用到2.1中的ip_output函数

h.进入netfilter的postrouting链，若准许通过则调用ip_finish_output

i.决定是否进行分段操作，最后调用函数ip_finish_output2

j.在ip_finish_output2里，则会根据数据包关联的路由缓存项，找到缓存项对应的邻居项，并调用neighbour->output，这就进入了邻居子系统了。

k.对于ipv4来说，其output函数为neigh_resolve_output，在该函数里，若判断下一跳地址对应的mac地址还没有解析到，则会调用neigh_event_send更改邻居项的状态，以发送arp request报文，并将该数据包存入队列中，等解析到mac地址以后再发送出去；若下一跳对应的mac地址已经解析到，则会调用neigh->ops->queue_xmit将数据发送出去，对于ipv4来说即是dev_queue_xmit函数，而在该函数里，则会通过dev->hard_start_xmit调用网卡驱动的发送函数，将数据发送出去。